-
Notifications
You must be signed in to change notification settings - Fork 763
Feat/remove restart policy never if autoscaler v2 and ray 2.55 or greater #4816
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: master
Are you sure you want to change the base?
Changes from 7 commits
7cad015
31cf894
715c260
797624a
5978fd1
0937f50
6be130a
2f8947f
a309e2a
f3458fd
565d9a8
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -164,6 +164,8 @@ func configureGCSFaultTolerance(podTemplate *corev1.PodTemplateSpec, instance ra | |
|
|
||
| // DefaultHeadPodTemplate sets the config values | ||
| func DefaultHeadPodTemplate(ctx context.Context, instance rayv1.RayCluster, headSpec rayv1.HeadGroupSpec, podName string, headPort string) corev1.PodTemplateSpec { | ||
| log := ctrl.LoggerFrom(ctx) | ||
|
|
||
| // TODO (Dmitri) The argument headPort is essentially unused; | ||
| // headPort is passed into setMissingRayStartParams but unused there for the head pod. | ||
| // To mitigate this awkwardness and reduce code redundancy, unify head and worker pod configuration logic. | ||
|
|
@@ -213,9 +215,16 @@ func DefaultHeadPodTemplate(ctx context.Context, instance rayv1.RayCluster, head | |
| mergeAutoscalerOverrides(&autoscalerContainer, instance.Spec.AutoscalerOptions) | ||
| podTemplate.Spec.Containers = append(podTemplate.Spec.Containers, autoscalerContainer) | ||
|
|
||
| // The error is ignored here because the function will return false if there's an error parsing the version. | ||
| // For example, if rayVersion is empty or unparseable, it considers the feature is not valid. | ||
| if utils.IsAutoscalingV2Enabled(&instance.Spec) { | ||
| setAutoscalerV2EnvVars(&podTemplate) | ||
| podTemplate.Spec.RestartPolicy = corev1.RestartPolicyNever | ||
| if autoscalerRestartValid, err := utils.IsRayVersionAtLeast(instance.Spec.RayVersion, utils.MinAutoscalerRestartValidVersion); !autoscalerRestartValid { | ||
| if err != nil { | ||
| log.Info("Parsing ray version failed, fall back the restart policy to Never.", "rayVersion", instance.Spec.RayVersion, "error", err) | ||
| } | ||
| podTemplate.Spec.RestartPolicy = corev1.RestartPolicyNever | ||
| } | ||
| } else if utils.IsAutoscalingV1Enabled(&instance.Spec) { | ||
| setAutoscalerV1EnvVars(&podTemplate) | ||
| } | ||
|
|
@@ -364,6 +373,8 @@ func getEnableProbesInjection() bool { | |
|
|
||
| // DefaultWorkerPodTemplate sets the config values | ||
| func DefaultWorkerPodTemplate(ctx context.Context, instance rayv1.RayCluster, workerSpec rayv1.WorkerGroupSpec, podName string, fqdnRayIP string, headPort string, replicaGrpName string, replicaIndex int, numHostIndex int) corev1.PodTemplateSpec { | ||
| log := ctrl.LoggerFrom(ctx) | ||
|
|
||
| podTemplate := workerSpec.Template | ||
| podTemplate.GenerateName = podName | ||
| // Pods created by RayCluster should be restricted to the namespace of the RayCluster. | ||
|
|
@@ -466,7 +477,14 @@ func DefaultWorkerPodTemplate(ctx context.Context, instance rayv1.RayCluster, wo | |
| podTemplate.Spec.Containers[utils.RayContainerIndex].Ports = append(podTemplate.Spec.Containers[utils.RayContainerIndex].Ports, metricsPort) | ||
| } | ||
|
|
||
| if utils.IsAutoscalingEnabled(&instance.Spec) && utils.IsAutoscalingV2Enabled(&instance.Spec) { | ||
| // Use the RayVersion and autoscaler version to determine whether the RestartPolicy should be Never or not, since the head pod is the one that runs the autoscaler. | ||
| // The error is ignored here because the function will return false if there's an error parsing the version. | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. same here
Collaborator
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. fixed at 565d9a8 along with the above. |
||
| // For example, if rayVersion is empty or unparseable, it considers the feature is not valid. | ||
| autoscalerRestartValid, err := utils.IsRayVersionAtLeast(instance.Spec.RayVersion, utils.MinAutoscalerRestartValidVersion) | ||
| if err != nil { | ||
| log.Info("Parsing ray version failed, fallback the decision of restart policy.", "rayVersion", instance.Spec.RayVersion, "error", err) | ||
| } | ||
|
cursor[bot] marked this conversation as resolved.
Outdated
|
||
| if !autoscalerRestartValid && utils.IsAutoscalingEnabled(&instance.Spec) && utils.IsAutoscalingV2Enabled(&instance.Spec) { | ||
| podTemplate.Spec.RestartPolicy = corev1.RestartPolicyNever | ||
| } | ||
|
|
||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -1175,19 +1175,43 @@ func TestHeadPodTemplate_WithAutoscalingEnabled(t *testing.T) { | |
|
|
||
| func TestDefaultHeadPodTemplate_Autoscaling(t *testing.T) { | ||
| clusterNoAutoscaling := instance.DeepCopy() | ||
|
|
||
| clusterAutoscalingVersionNotSet := instance.DeepCopy() | ||
| clusterAutoscalingVersionNotSet.Spec.EnableInTreeAutoscaling = new(true) | ||
| clusterAutoscalingV1 := instance.DeepCopy() | ||
| clusterAutoscalingV1.Spec.EnableInTreeAutoscaling = new(true) | ||
| clusterAutoscalingV1.Spec.AutoscalerOptions = &rayv1.AutoscalerOptions{ | ||
| Version: ptr.To(rayv1.AutoscalerVersionV1), | ||
| } | ||
|
|
||
| // clusterAutoscalingV2 has no RayVersion set, so IsRayVersionAtLeast | ||
| // returns false: env var is injected AND RestartPolicy is set to Never. | ||
| clusterAutoscalingV2 := instance.DeepCopy() | ||
| clusterAutoscalingV2.Spec.EnableInTreeAutoscaling = new(true) | ||
| clusterAutoscalingV2.Spec.AutoscalerOptions = &rayv1.AutoscalerOptions{ | ||
| Version: ptr.To(rayv1.AutoscalerVersionV2), | ||
| } | ||
|
|
||
| // clusterAutoscalingV2OldRay uses a Ray version strictly below MinAutoscalerRestartValidVersion | ||
| // (2.55.0): env var is injected AND RestartPolicy is set to Never. | ||
| clusterAutoscalingV2OldRay := instance.DeepCopy() | ||
| clusterAutoscalingV2OldRay.Spec.EnableInTreeAutoscaling = new(true) | ||
| clusterAutoscalingV2OldRay.Spec.AutoscalerOptions = &rayv1.AutoscalerOptions{ | ||
| Version: ptr.To(rayv1.AutoscalerVersionV2), | ||
| } | ||
| clusterAutoscalingV2OldRay.Spec.RayVersion = "2.54.0" | ||
|
|
||
| // clusterAutoscalingV2NewRay uses a Ray version >= MinAutoscalerRestartValidVersion (2.55.0). | ||
| // With the updated logic, setAutoscalerV2EnvVars is always called when AutoscalerV2 is enabled, | ||
| // so the env var IS injected. Only RestartPolicy=Never is skipped for newer versions. | ||
| clusterAutoscalingV2NewRay := instance.DeepCopy() | ||
| clusterAutoscalingV2NewRay.Spec.EnableInTreeAutoscaling = new(true) | ||
| clusterAutoscalingV2NewRay.Spec.AutoscalerOptions = &rayv1.AutoscalerOptions{ | ||
| Version: ptr.To(rayv1.AutoscalerVersionV2), | ||
| } | ||
| clusterAutoscalingV2NewRay.Spec.RayVersion = "2.55.0" | ||
| clusterAutoscalingV2NewRay.Spec.HeadGroupSpec.Template.Spec.RestartPolicy = corev1.RestartPolicyOnFailure | ||
|
|
||
| ctx := context.Background() | ||
| podName := strings.ToLower(instance.Name + utils.DashSymbol + string(rayv1.HeadNode) + utils.DashSymbol + utils.FormatInt32(0)) | ||
|
|
||
|
|
@@ -1212,20 +1236,35 @@ func TestDefaultHeadPodTemplate_Autoscaling(t *testing.T) { | |
| expectedAutoscalerV1EnvVar: false, | ||
| expectedRestartPolicy: "", | ||
| }, | ||
| "Pod template with autoscaling v1 enabled should the correct autoscaler v1 fields": { | ||
| "Pod template with autoscaling v1 enabled should have the correct autoscaler v1 fields": { | ||
| cluster: *clusterAutoscalingV1, | ||
| expectedHeadContainers: 2, | ||
| expectedAutoscalerV2EnvVar: false, | ||
| expectedAutoscalerV1EnvVar: true, | ||
| expectedRestartPolicy: "", | ||
| }, | ||
| "Pod template with autoscaling v2 enabled should the correct autoscaler v2 fields": { | ||
| "Pod template with autoscaling v2 enabled and unparseable Ray image should inject env var and set RestartPolicy to Never": { | ||
| cluster: *clusterAutoscalingV2, | ||
| expectedHeadContainers: 2, | ||
| expectedAutoscalerV2EnvVar: true, | ||
| expectedAutoscalerV1EnvVar: false, | ||
| expectedRestartPolicy: corev1.RestartPolicyNever, | ||
| }, | ||
| "Pod template with autoscaling v2 enabled and Ray version below MinAutoscalerRestartValidVersion should inject env var and set RestartPolicy to Never": { | ||
| cluster: *clusterAutoscalingV2OldRay, | ||
| expectedHeadContainers: 2, | ||
| expectedAutoscalerV2EnvVar: true, | ||
| expectedRestartPolicy: corev1.RestartPolicyNever, | ||
| }, | ||
| // setAutoscalerV2EnvVars is now always called when AutoscalerV2 is enabled regardless of | ||
| // the Ray version, so the env var is injected even for new Ray versions. Only | ||
| // RestartPolicy=Never is version-gated (skipped when version >= MinAutoscalerRestartValidVersion). | ||
| "Pod template with autoscaling v2 enabled and Ray version at or above MinAutoscalerRestartValidVersion should inject env var but not set RestartPolicy to Never": { | ||
| cluster: *clusterAutoscalingV2NewRay, | ||
| expectedHeadContainers: 2, | ||
| expectedAutoscalerV2EnvVar: true, | ||
| expectedRestartPolicy: corev1.RestartPolicyOnFailure, | ||
| }, | ||
| } | ||
|
|
||
| for name, tc := range tests { | ||
|
|
@@ -1443,12 +1482,34 @@ func TestDefaultWorkerPodTemplate_Autoscaling(t *testing.T) { | |
| clusterNoAutoscaling := instance.DeepCopy() | ||
| clusterAutoscalingV1 := instance.DeepCopy() | ||
| clusterAutoscalingV1.Spec.EnableInTreeAutoscaling = new(true) | ||
|
|
||
| // clusterAutoscalingV2 has no RayVersion set, so IsRayVersionAtLeast | ||
| // returns false and RestartPolicy should be set to Never. | ||
| clusterAutoscalingV2 := instance.DeepCopy() | ||
| clusterAutoscalingV2.Spec.EnableInTreeAutoscaling = new(true) | ||
| clusterAutoscalingV2.Spec.AutoscalerOptions = &rayv1.AutoscalerOptions{ | ||
| Version: ptr.To(rayv1.AutoscalerVersionV2), | ||
| } | ||
|
|
||
| // clusterAutoscalingV2OldRay uses a Ray version strictly below MinAutoscalerRestartValidVersion | ||
| // (2.55.0), so RestartPolicy should still be set to Never. | ||
| clusterAutoscalingV2OldRay := instance.DeepCopy() | ||
| clusterAutoscalingV2OldRay.Spec.EnableInTreeAutoscaling = new(true) | ||
| clusterAutoscalingV2OldRay.Spec.AutoscalerOptions = &rayv1.AutoscalerOptions{ | ||
| Version: ptr.To(rayv1.AutoscalerVersionV2), | ||
| } | ||
| clusterAutoscalingV2OldRay.Spec.RayVersion = "2.54.0" | ||
|
|
||
| // clusterAutoscalingV2NewRay uses a Ray version >= MinAutoscalerRestartValidVersion (2.55.0), | ||
| // so autoscalerRestartValid is true and RestartPolicy should NOT be set to Never. | ||
| clusterAutoscalingV2NewRay := instance.DeepCopy() | ||
| clusterAutoscalingV2NewRay.Spec.EnableInTreeAutoscaling = new(true) | ||
| clusterAutoscalingV2NewRay.Spec.AutoscalerOptions = &rayv1.AutoscalerOptions{ | ||
| Version: ptr.To(rayv1.AutoscalerVersionV2), | ||
| } | ||
| clusterAutoscalingV2NewRay.Spec.RayVersion = "2.55.0" | ||
| clusterAutoscalingV2NewRay.Spec.WorkerGroupSpecs[0].Template.Spec.RestartPolicy = corev1.RestartPolicyOnFailure | ||
|
|
||
| ctx := context.Background() | ||
| podName := strings.ToLower(instance.Name + utils.DashSymbol + string(rayv1.WorkerNode) + utils.DashSymbol + utils.FormatInt32(0)) | ||
| fqdnRayIP := utils.GenerateFQDNServiceName(ctx, instance, instance.Namespace) | ||
|
|
@@ -1461,14 +1522,22 @@ func TestDefaultWorkerPodTemplate_Autoscaling(t *testing.T) { | |
| cluster: *clusterNoAutoscaling, | ||
| expectedRestartPolicy: "", | ||
| }, | ||
| "Pod template with autoscaling v1 enabled should the correct autoscaler v1 fields": { | ||
| "Pod template with autoscaling v1 enabled should the the correct autoscaler v1 fields": { | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. minor typo: "have"
Collaborator
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. fix at f3458fd. |
||
| cluster: *clusterAutoscalingV1, | ||
| expectedRestartPolicy: "", | ||
| }, | ||
| "Pod template with autoscaling v2 enabled should the correct autoscaler v2 fields": { | ||
| "Pod template with autoscaling v2 enabled and unparseable Ray image should set RestartPolicy to Never": { | ||
| cluster: *clusterAutoscalingV2, | ||
| expectedRestartPolicy: corev1.RestartPolicyNever, | ||
| }, | ||
| "Pod template with autoscaling v2 enabled and Ray version below MinAutoscalerRestartValidVersion should set RestartPolicy to Never": { | ||
| cluster: *clusterAutoscalingV2OldRay, | ||
| expectedRestartPolicy: corev1.RestartPolicyNever, | ||
| }, | ||
| "Pod template with autoscaling v2 enabled and Ray version at or above MinAutoscalerRestartValidVersion should not set RestartPolicy to Never": { | ||
| cluster: *clusterAutoscalingV2NewRay, | ||
| expectedRestartPolicy: corev1.RestartPolicyOnFailure, | ||
| }, | ||
| } | ||
|
|
||
| for name, tc := range tests { | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
since we keep
errhere, i think we should also re-word the comment?