Skip to content
Draft
Show file tree
Hide file tree
Changes from 5 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
41 changes: 41 additions & 0 deletions cmd/thv-operator/controllers/mcpserver_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ import (
"fmt"
"maps"
"os"
"strconv"
"strings"
"time"

Expand Down Expand Up @@ -1151,6 +1152,20 @@ func (r *MCPServerReconciler) deploymentForMCPServer(
// name collision (ResourceOverrides.Env only accepts plain strings).
env = append(env, r.buildRedisPasswordEnvVar(m)...)

// Project the MCPServer generation pod-template annotation into the
// proxyrunner container via the downward API. The proxyrunner uses this
// to override the value read from the live-mounted RunConfig ConfigMap,
// freezing it per pod at creation time. See #5360.
env = append(env, corev1.EnvVar{
Name: kubernetes.EnvVarMCPServerGeneration,
ValueFrom: &corev1.EnvVarSource{
FieldRef: &corev1.ObjectFieldSelector{
FieldPath: fmt.Sprintf("metadata.annotations['%s']",
kubernetes.RunConfigMCPServerGenerationAnnotation),
},
},
})

// Add volume mounts for user-defined volumes
for _, v := range m.Spec.Volumes {
volumeMounts = append(volumeMounts, corev1.VolumeMount{
Expand Down Expand Up @@ -1264,6 +1279,12 @@ func (r *MCPServerReconciler) deploymentForMCPServer(
// Add RunConfig checksum annotation to trigger pod rollout when config changes
deploymentTemplateAnnotations = checksum.AddRunConfigChecksumToPodTemplate(deploymentTemplateAnnotations, runConfigChecksum)

// Stamp the MCPServer generation on the proxy Deployment's pod template so the
// downward-API env var below resolves to a value that is frozen at pod creation
// time, not live-updated like the runconfig.json ConfigMap mount. See #5360.
deploymentTemplateAnnotations[kubernetes.RunConfigMCPServerGenerationAnnotation] =
strconv.FormatInt(m.Generation, 10)

if m.Spec.ResourceOverrides != nil && m.Spec.ResourceOverrides.ProxyDeployment != nil {
if m.Spec.ResourceOverrides.ProxyDeployment.Labels != nil {
deploymentLabels = ctrlutil.MergeLabels(ls, m.Spec.ResourceOverrides.ProxyDeployment.Labels)
Expand Down Expand Up @@ -1759,6 +1780,21 @@ func (r *MCPServerReconciler) deploymentNeedsUpdate(
}
}

// Project the MCPServer generation pod-template annotation into the
// proxyrunner container via the downward API. Position mirrors the
// construction site in deploymentForMCPServer (after Redis password,
// before embedded auth) so DeepEqual against container.Env succeeds.
// See #5360.
expectedProxyEnv = append(expectedProxyEnv, corev1.EnvVar{
Comment thread
ChrisJBurns marked this conversation as resolved.
Name: kubernetes.EnvVarMCPServerGeneration,
ValueFrom: &corev1.EnvVarSource{
FieldRef: &corev1.ObjectFieldSelector{
FieldPath: fmt.Sprintf("metadata.annotations['%s']",
kubernetes.RunConfigMCPServerGenerationAnnotation),
},
},
})

// Add embedded auth server environment variables. AuthServerRef takes precedence;
// externalAuthConfigRef is used as a fallback (legacy path).
if configName := ctrlutil.EmbeddedAuthServerConfigName(
Expand Down Expand Up @@ -1879,6 +1915,11 @@ func (r *MCPServerReconciler) deploymentNeedsUpdate(
// Check if pod template annotations have changed (including runconfig checksum)
expectedPodTemplateAnnotations := make(map[string]string)
expectedPodTemplateAnnotations = checksum.AddRunConfigChecksumToPodTemplate(expectedPodTemplateAnnotations, runConfigChecksum)
// Mirrors deploymentForMCPServer: stamp the MCPServer generation so the
// downward-API env var injected into the proxyrunner container resolves
// to a frozen-per-pod value (#5360).
expectedPodTemplateAnnotations[kubernetes.RunConfigMCPServerGenerationAnnotation] =
strconv.FormatInt(mcpServer.Generation, 10)

if mcpServer.Spec.ResourceOverrides != nil &&
mcpServer.Spec.ResourceOverrides.ProxyDeployment != nil &&
Expand Down
121 changes: 94 additions & 27 deletions cmd/thv-operator/controllers/mcpserver_resource_overrides_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -252,7 +252,8 @@
},
expectedDeploymentAnns: map[string]string{},
expectedPodTemplateAnns: map[string]string{
"toolhive.stacklok.dev/runconfig-checksum": "test-checksum",
"toolhive.stacklok.dev/runconfig-checksum": "test-checksum",
"toolhive.stacklok.dev/mcpserver-generation": "0",
Comment thread
ChrisJBurns marked this conversation as resolved.
},
expectedServiceLabels: map[string]string{
"app": "mcpserver",
Expand Down Expand Up @@ -315,7 +316,8 @@
"monitoring/scrape": "true",
},
expectedPodTemplateAnns: map[string]string{
"toolhive.stacklok.dev/runconfig-checksum": "test-checksum",
"toolhive.stacklok.dev/runconfig-checksum": "test-checksum",
"toolhive.stacklok.dev/mcpserver-generation": "0",
},
expectedServiceLabels: map[string]string{
"app": "mcpserver",
Expand Down Expand Up @@ -376,7 +378,8 @@
},
expectedDeploymentAnns: map[string]string{},
expectedPodTemplateAnns: map[string]string{
"toolhive.stacklok.dev/runconfig-checksum": "test-checksum",
"toolhive.stacklok.dev/runconfig-checksum": "test-checksum",
"toolhive.stacklok.dev/mcpserver-generation": "0",
},
expectedServiceLabels: map[string]string{
"app": "mcpserver",
Expand Down Expand Up @@ -415,7 +418,8 @@
},
expectedDeploymentAnns: map[string]string{},
expectedPodTemplateAnns: map[string]string{
"toolhive.stacklok.dev/runconfig-checksum": "test-checksum",
"toolhive.stacklok.dev/runconfig-checksum": "test-checksum",
"toolhive.stacklok.dev/mcpserver-generation": "0",
},
expectedServiceLabels: map[string]string{
"app": "mcpserver",
Expand Down Expand Up @@ -481,7 +485,8 @@
"version": "v1.2.3",
},
expectedPodTemplateAnns: map[string]string{
"toolhive.stacklok.dev/runconfig-checksum": "test-checksum",
"toolhive.stacklok.dev/runconfig-checksum": "test-checksum",
"toolhive.stacklok.dev/mcpserver-generation": "0",
},
expectedServiceLabels: map[string]string{
"app": "mcpserver",
Expand Down Expand Up @@ -525,9 +530,10 @@
},
expectedDeploymentAnns: map[string]string{},
expectedPodTemplateAnns: map[string]string{
"vault.hashicorp.com/agent-inject": "true",
"vault.hashicorp.com/role": "toolhive-mcp-workloads",
"toolhive.stacklok.dev/runconfig-checksum": "test-checksum",
"vault.hashicorp.com/agent-inject": "true",
"vault.hashicorp.com/role": "toolhive-mcp-workloads",
"toolhive.stacklok.dev/runconfig-checksum": "test-checksum",
"toolhive.stacklok.dev/mcpserver-generation": "0",
},
expectedServiceLabels: map[string]string{
"app": "mcpserver",
Expand Down Expand Up @@ -582,30 +588,33 @@
switch tt.name {
case "with proxy environment variables":
expectedEnvVars = map[string]string{
"HTTP_PROXY": "http://proxy.example.com:8080",
"NO_PROXY": "localhost,127.0.0.1",
"CUSTOM_ENV": "custom-value",
"XDG_CONFIG_HOME": "/tmp",
"HOME": "/tmp",
"TOOLHIVE_RUNTIME": "kubernetes",
"UNSTRUCTURED_LOGS": "false",
"HTTP_PROXY": "http://proxy.example.com:8080",

Check failure on line 591 in cmd/thv-operator/controllers/mcpserver_resource_overrides_test.go

View workflow job for this annotation

GitHub Actions / Linting / Lint Go Code

File is not properly formatted (gci)
"NO_PROXY": "localhost,127.0.0.1",
"CUSTOM_ENV": "custom-value",
"THV_MCPSERVER_GENERATION": "", // downward API; Value is empty, ValueFrom set
"XDG_CONFIG_HOME": "/tmp",
"HOME": "/tmp",
"TOOLHIVE_RUNTIME": "kubernetes",
"UNSTRUCTURED_LOGS": "false",
}
case "with debug logging via TOOLHIVE_DEBUG env var":
expectedEnvVars = map[string]string{
"TOOLHIVE_DEBUG": "true",
"XDG_CONFIG_HOME": "/tmp",
"HOME": "/tmp",
"TOOLHIVE_RUNTIME": "kubernetes",
"UNSTRUCTURED_LOGS": "false",
"TOOLHIVE_DEBUG": "true",
"THV_MCPSERVER_GENERATION": "", // downward API; Value is empty, ValueFrom set
"XDG_CONFIG_HOME": "/tmp",
"HOME": "/tmp",
"TOOLHIVE_RUNTIME": "kubernetes",
"UNSTRUCTURED_LOGS": "false",
}
default:
expectedEnvVars = map[string]string{
"LOG_LEVEL": "debug",
"METRICS_ENABLED": "true",
"XDG_CONFIG_HOME": "/tmp",
"HOME": "/tmp",
"TOOLHIVE_RUNTIME": "kubernetes",
"UNSTRUCTURED_LOGS": "false",
"LOG_LEVEL": "debug",
"METRICS_ENABLED": "true",
"THV_MCPSERVER_GENERATION": "", // downward API; Value is empty, ValueFrom set
"XDG_CONFIG_HOME": "/tmp",
"HOME": "/tmp",
"TOOLHIVE_RUNTIME": "kubernetes",
"UNSTRUCTURED_LOGS": "false",
}
}

Expand Down Expand Up @@ -657,7 +666,10 @@
assert.Equal(t, "value",
deployment.Spec.Template.Annotations["user.example.com/some-key"],
"user override must survive")
assert.Len(t, deployment.Spec.Template.Annotations, 2,
assert.Contains(t, deployment.Spec.Template.Annotations,
kubernetes.RunConfigMCPServerGenerationAnnotation,
"mcpserver-generation must be stamped for the downward-API env var (#5360)")
assert.Len(t, deployment.Spec.Template.Annotations, 3,
"no extra keys should leak into the pod template")
}

Expand Down Expand Up @@ -1140,3 +1152,58 @@
})
}
}

// TestDeploymentForMCPServer_MCPServerGenerationDownwardAPI verifies that the
// proxy Deployment stamps the MCPServer generation as a pod-template annotation
// AND projects that annotation into the proxyrunner container as the
// THV_MCPSERVER_GENERATION env var via the downward API. This is the
// frozen-per-pod path that closes the race described in #5360 — the env var's
// value is bound to the pod's own annotations at creation time, so a restarted
// old-RS pod cannot acquire the new generation by re-reading the live-mounted
// RunConfig ConfigMap.
func TestDeploymentForMCPServer_MCPServerGenerationDownwardAPI(t *testing.T) {
t.Parallel()

scheme := runtime.NewScheme()
require.NoError(t, mcpv1beta1.AddToScheme(scheme))
require.NoError(t, corev1.AddToScheme(scheme))

client := fake.NewClientBuilder().WithScheme(scheme).Build()
r := newTestMCPServerReconciler(client, scheme, kubernetes.PlatformKubernetes)

mcpServer := &mcpv1beta1.MCPServer{
ObjectMeta: metav1.ObjectMeta{
Name: "test-server",
Namespace: "default",
Generation: 7,
},
Spec: mcpv1beta1.MCPServerSpec{
Image: "test-image",
ProxyPort: 8080,
},
}

deployment, err := r.deploymentForMCPServer(t.Context(), mcpServer, "test-checksum")
require.NoError(t, err)
require.NotNil(t, deployment)

assert.Equal(t, "7",
deployment.Spec.Template.Annotations[kubernetes.RunConfigMCPServerGenerationAnnotation],
"pod template must stamp the MCPServer generation so the downward-API env var resolves")

require.Len(t, deployment.Spec.Template.Spec.Containers, 1)
var got *corev1.EnvVar
for i := range deployment.Spec.Template.Spec.Containers[0].Env {
if deployment.Spec.Template.Spec.Containers[0].Env[i].Name == kubernetes.EnvVarMCPServerGeneration {
got = &deployment.Spec.Template.Spec.Containers[0].Env[i]
break
}
}
require.NotNil(t, got, "container must declare the %s env var", kubernetes.EnvVarMCPServerGeneration)
require.NotNil(t, got.ValueFrom, "env var must use ValueFrom (downward API), not a literal Value")
require.NotNil(t, got.ValueFrom.FieldRef)
assert.Equal(t,
"metadata.annotations['"+kubernetes.RunConfigMCPServerGenerationAnnotation+"']",
got.ValueFrom.FieldRef.FieldPath,
"FieldRef must point at the mcpserver-generation pod annotation")
}
Loading
Loading