diff --git a/ray-operator/controllers/ray/common/pod.go b/ray-operator/controllers/ray/common/pod.go index 6cce6a2a9d..b9ef800f27 100644 --- a/ray-operator/controllers/ray/common/pod.go +++ b/ray-operator/controllers/ray/common/pod.go @@ -18,6 +18,7 @@ import ( corev1 "k8s.io/api/core/v1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/util/intstr" ctrl "sigs.k8s.io/controller-runtime" ) @@ -248,27 +249,11 @@ func DefaultWorkerPodTemplate(ctx context.Context, instance rayv1.RayCluster, wo } func initLivenessAndReadinessProbe(rayContainer *corev1.Container, rayNodeType rayv1.RayNodeType, creatorCRDType utils.CRDType) { - rayAgentRayletHealthCommand := fmt.Sprintf( - utils.BaseWgetHealthCommand, - utils.DefaultReadinessProbeTimeoutSeconds, - utils.DefaultDashboardAgentListenPort, - utils.RayAgentRayletHealthPath, - ) - rayDashboardGCSHealthCommand := fmt.Sprintf( - utils.BaseWgetHealthCommand, - utils.DefaultReadinessProbeFailureThreshold, - utils.DefaultDashboardPort, - utils.RayDashboardGCSHealthPath, - ) - - // Generally, the liveness and readiness probes perform the same checks. - // For head node => Check GCS and Raylet status. - // For worker node => Check Raylet status. 
- commands := []string{} + healthCheckPath := utils.RayAgentRayletHealthPath + healthCheckPort := intstr.FromInt(utils.DefaultDashboardAgentListenPort) if rayNodeType == rayv1.HeadNode { - commands = append(commands, rayAgentRayletHealthCommand, rayDashboardGCSHealthCommand) - } else { - commands = append(commands, rayAgentRayletHealthCommand) + healthCheckPath = utils.RayDashboardGCSHealthPath + healthCheckPort = intstr.FromInt(utils.DefaultDashboardPort) } if rayContainer.LivenessProbe == nil { @@ -284,7 +269,7 @@ func initLivenessAndReadinessProbe(rayContainer *corev1.Container, rayNodeType r SuccessThreshold: utils.DefaultLivenessProbeSuccessThreshold, FailureThreshold: utils.DefaultLivenessProbeFailureThreshold, } - rayContainer.LivenessProbe.Exec = &corev1.ExecAction{Command: []string{"bash", "-c", strings.Join(commands, " && ")}} + rayContainer.LivenessProbe.HTTPGet = &corev1.HTTPGetAction{Path: healthCheckPath, Port: healthCheckPort} } if rayContainer.ReadinessProbe == nil { @@ -299,21 +284,14 @@ func initLivenessAndReadinessProbe(rayContainer *corev1.Container, rayNodeType r SuccessThreshold: utils.DefaultReadinessProbeSuccessThreshold, FailureThreshold: utils.DefaultReadinessProbeFailureThreshold, } - rayContainer.ReadinessProbe.Exec = &corev1.ExecAction{Command: []string{"bash", "-c", strings.Join(commands, " && ")}} + rayContainer.ReadinessProbe.HTTPGet = &corev1.HTTPGetAction{Path: healthCheckPath, Port: healthCheckPort} // For worker Pods serving traffic, we need to add an additional HTTP proxy health check for the readiness probe. // Note: head Pod checks the HTTP proxy's health at every rayservice controller reconcile instead of using a readiness probe. // See https://github.com/ray-project/kuberay/pull/1808 for reasons. 
if creatorCRDType == utils.RayServiceCRD && rayNodeType == rayv1.WorkerNode { rayContainer.ReadinessProbe.FailureThreshold = utils.ServeReadinessProbeFailureThreshold - rayServeProxyHealthCommand := fmt.Sprintf( - utils.BaseWgetHealthCommand, - utils.DefaultReadinessProbeInitialDelaySeconds, - utils.FindContainerPort(rayContainer, utils.ServingPortName, utils.DefaultServingPort), - utils.RayServeProxyHealthPath, - ) - commands = append(commands, rayServeProxyHealthCommand) - rayContainer.ReadinessProbe.Exec = &corev1.ExecAction{Command: []string{"bash", "-c", strings.Join(commands, " && ")}} + rayContainer.ReadinessProbe.HTTPGet = &corev1.HTTPGetAction{Path: utils.RayServeProxyHealthPath, Port: intstr.FromInt(utils.DefaultServingPort)} } } } diff --git a/ray-operator/controllers/ray/common/pod_test.go b/ray-operator/controllers/ray/common/pod_test.go index b9890f28b6..1749ce2d39 100644 --- a/ray-operator/controllers/ray/common/pod_test.go +++ b/ray-operator/controllers/ray/common/pod_test.go @@ -1168,23 +1168,21 @@ func TestInitLivenessAndReadinessProbe(t *testing.T) { rayContainer := &podTemplateSpec.Spec.Containers[utils.RayContainerIndex] // Test 1: User defines a custom Exec probe. 
- httpGetProbe := corev1.Probe{ + execProbe := corev1.Probe{ ProbeHandler: corev1.ProbeHandler{ - HTTPGet: &corev1.HTTPGetAction{ - // Check Raylet status - Path: fmt.Sprintf("/%s", utils.RayAgentRayletHealthPath), - Port: intstr.FromInt(utils.DefaultDashboardAgentListenPort), + Exec: &corev1.ExecAction{ + Command: []string{"foo", "bar"}, }, }, } - rayContainer.LivenessProbe = &httpGetProbe - rayContainer.ReadinessProbe = &httpGetProbe + rayContainer.LivenessProbe = &execProbe + rayContainer.ReadinessProbe = &execProbe initLivenessAndReadinessProbe(rayContainer, rayv1.HeadNode, "") - assert.NotNil(t, rayContainer.LivenessProbe.HTTPGet) - assert.NotNil(t, rayContainer.ReadinessProbe.HTTPGet) - assert.Nil(t, rayContainer.LivenessProbe.Exec) - assert.Nil(t, rayContainer.ReadinessProbe.Exec) + assert.NotNil(t, rayContainer.LivenessProbe.Exec) + assert.NotNil(t, rayContainer.ReadinessProbe.Exec) + assert.Nil(t, rayContainer.LivenessProbe.HTTPGet) + assert.Nil(t, rayContainer.ReadinessProbe.HTTPGet) // Test 2: User does not define a custom probe. KubeRay will inject an HTTPGet probe for the worker pod. 
// Here we test the case where the Ray Pod originates from RayServiceCRD, @@ -1205,13 +1203,12 @@ func TestInitLivenessAndReadinessProbe(t *testing.T) { rayContainer.LivenessProbe = nil rayContainer.ReadinessProbe = nil initLivenessAndReadinessProbe(rayContainer, rayv1.HeadNode, utils.RayServiceCRD) - assert.NotNil(t, rayContainer.LivenessProbe.Exec) - assert.NotNil(t, rayContainer.ReadinessProbe.Exec) - // head pod should not have Ray Serve proxy health probes - assert.False(t, strings.Contains(strings.Join(rayContainer.LivenessProbe.Exec.Command, " "), utils.RayServeProxyHealthPath)) - assert.False(t, strings.Contains(strings.Join(rayContainer.ReadinessProbe.Exec.Command, " "), utils.RayServeProxyHealthPath)) assert.Equal(t, int32(5), rayContainer.LivenessProbe.TimeoutSeconds) assert.Equal(t, int32(5), rayContainer.ReadinessProbe.TimeoutSeconds) + assert.NotNil(t, rayContainer.LivenessProbe.HTTPGet) + assert.NotNil(t, rayContainer.ReadinessProbe.HTTPGet) + assert.Equal(t, rayContainer.ReadinessProbe.HTTPGet.Path, utils.RayServeProxyHealthPath) + assert.Equal(t, rayContainer.ReadinessProbe.HTTPGet.Port, intstr.FromInt(utils.DefaultServingPort)) } func TestGenerateRayStartCommand(t *testing.T) {