Skip to content

Commit

Permalink
Use HTTP probes for Ray readiness and liveness probes
Browse files Browse the repository at this point in the history
Signed-off-by: Andrew Sy Kim <[email protected]>
  • Loading branch information
andrewsykim committed Oct 31, 2024
1 parent 33ba385 commit e7cdb70
Show file tree
Hide file tree
Showing 2 changed files with 21 additions and 46 deletions.
38 changes: 8 additions & 30 deletions ray-operator/controllers/ray/common/pod.go
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@ import (

corev1 "k8s.io/api/core/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/util/intstr"
ctrl "sigs.k8s.io/controller-runtime"
)

Expand Down Expand Up @@ -248,27 +249,11 @@ func DefaultWorkerPodTemplate(ctx context.Context, instance rayv1.RayCluster, wo
}

func initLivenessAndReadinessProbe(rayContainer *corev1.Container, rayNodeType rayv1.RayNodeType, creatorCRDType utils.CRDType) {
rayAgentRayletHealthCommand := fmt.Sprintf(
utils.BaseWgetHealthCommand,
utils.DefaultReadinessProbeTimeoutSeconds,
utils.DefaultDashboardAgentListenPort,
utils.RayAgentRayletHealthPath,
)
rayDashboardGCSHealthCommand := fmt.Sprintf(
utils.BaseWgetHealthCommand,
utils.DefaultReadinessProbeFailureThreshold,
utils.DefaultDashboardPort,
utils.RayDashboardGCSHealthPath,
)

// Generally, the liveness and readiness probes perform the same checks.
// For head node => Check GCS and Raylet status.
// For worker node => Check Raylet status.
commands := []string{}
healthCheckPath := utils.RayAgentRayletHealthPath
healthCheckPort := intstr.FromInt(utils.DefaultDashboardAgentListenPort)
if rayNodeType == rayv1.HeadNode {
commands = append(commands, rayAgentRayletHealthCommand, rayDashboardGCSHealthCommand)
} else {
commands = append(commands, rayAgentRayletHealthCommand)
healthCheckPath = utils.RayDashboardGCSHealthPath
healthCheckPort = intstr.FromInt(utils.DefaultDashboardPort)
}

if rayContainer.LivenessProbe == nil {
Expand All @@ -284,7 +269,7 @@ func initLivenessAndReadinessProbe(rayContainer *corev1.Container, rayNodeType r
SuccessThreshold: utils.DefaultLivenessProbeSuccessThreshold,
FailureThreshold: utils.DefaultLivenessProbeFailureThreshold,
}
rayContainer.LivenessProbe.Exec = &corev1.ExecAction{Command: []string{"bash", "-c", strings.Join(commands, " && ")}}
rayContainer.LivenessProbe.HTTPGet = &corev1.HTTPGetAction{Path: healthCheckPath, Port: healthCheckPort}
}

if rayContainer.ReadinessProbe == nil {
Expand All @@ -299,21 +284,14 @@ func initLivenessAndReadinessProbe(rayContainer *corev1.Container, rayNodeType r
SuccessThreshold: utils.DefaultReadinessProbeSuccessThreshold,
FailureThreshold: utils.DefaultReadinessProbeFailureThreshold,
}
rayContainer.ReadinessProbe.Exec = &corev1.ExecAction{Command: []string{"bash", "-c", strings.Join(commands, " && ")}}
rayContainer.ReadinessProbe.HTTPGet = &corev1.HTTPGetAction{Path: healthCheckPath, Port: healthCheckPort}

// For worker Pods serving traffic, we need to add an additional HTTP proxy health check for the readiness probe.
// Note: head Pod checks the HTTP proxy's health at every rayservice controller reconcile instead of using a readiness probe.
// See https://github.com/ray-project/kuberay/pull/1808 for reasons.
if creatorCRDType == utils.RayServiceCRD && rayNodeType == rayv1.WorkerNode {
rayContainer.ReadinessProbe.FailureThreshold = utils.ServeReadinessProbeFailureThreshold
rayServeProxyHealthCommand := fmt.Sprintf(
utils.BaseWgetHealthCommand,
utils.DefaultReadinessProbeInitialDelaySeconds,
utils.FindContainerPort(rayContainer, utils.ServingPortName, utils.DefaultServingPort),
utils.RayServeProxyHealthPath,
)
commands = append(commands, rayServeProxyHealthCommand)
rayContainer.ReadinessProbe.Exec = &corev1.ExecAction{Command: []string{"bash", "-c", strings.Join(commands, " && ")}}
rayContainer.ReadinessProbe.HTTPGet = &corev1.HTTPGetAction{Path: utils.RayServeProxyHealthPath, Port: intstr.FromInt(utils.DefaultServingPort)}
}
}
}
Expand Down
29 changes: 13 additions & 16 deletions ray-operator/controllers/ray/common/pod_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -1168,23 +1168,21 @@ func TestInitLivenessAndReadinessProbe(t *testing.T) {
rayContainer := &podTemplateSpec.Spec.Containers[utils.RayContainerIndex]

// Test 1: User defines a custom HTTPGet probe.
httpGetProbe := corev1.Probe{
execProbe := corev1.Probe{
ProbeHandler: corev1.ProbeHandler{
HTTPGet: &corev1.HTTPGetAction{
// Check Raylet status
Path: fmt.Sprintf("/%s", utils.RayAgentRayletHealthPath),
Port: intstr.FromInt(utils.DefaultDashboardAgentListenPort),
Exec: &corev1.ExecAction{
Command: []string{"foo", "bar"},
},
},
}

rayContainer.LivenessProbe = &httpGetProbe
rayContainer.ReadinessProbe = &httpGetProbe
rayContainer.LivenessProbe = &execProbe
rayContainer.ReadinessProbe = &execProbe
initLivenessAndReadinessProbe(rayContainer, rayv1.HeadNode, "")
assert.NotNil(t, rayContainer.LivenessProbe.HTTPGet)
assert.NotNil(t, rayContainer.ReadinessProbe.HTTPGet)
assert.Nil(t, rayContainer.LivenessProbe.Exec)
assert.Nil(t, rayContainer.ReadinessProbe.Exec)
assert.NotNil(t, rayContainer.LivenessProbe.Exec)
assert.NotNil(t, rayContainer.ReadinessProbe.Exec)
assert.Nil(t, rayContainer.LivenessProbe.HTTPGet)
assert.Nil(t, rayContainer.ReadinessProbe.HTTPGet)

// Test 2: User does not define a custom probe. KubeRay will inject Exec probe for worker pod.
// Here we test the case where the Ray Pod originates from RayServiceCRD,
Expand All @@ -1205,13 +1203,12 @@ func TestInitLivenessAndReadinessProbe(t *testing.T) {
rayContainer.LivenessProbe = nil
rayContainer.ReadinessProbe = nil
initLivenessAndReadinessProbe(rayContainer, rayv1.HeadNode, utils.RayServiceCRD)
assert.NotNil(t, rayContainer.LivenessProbe.Exec)
assert.NotNil(t, rayContainer.ReadinessProbe.Exec)
// head pod should not have Ray Serve proxy health probes
assert.False(t, strings.Contains(strings.Join(rayContainer.LivenessProbe.Exec.Command, " "), utils.RayServeProxyHealthPath))
assert.False(t, strings.Contains(strings.Join(rayContainer.ReadinessProbe.Exec.Command, " "), utils.RayServeProxyHealthPath))
assert.Equal(t, int32(5), rayContainer.LivenessProbe.TimeoutSeconds)
assert.Equal(t, int32(5), rayContainer.ReadinessProbe.TimeoutSeconds)
assert.NotNil(t, rayContainer.LivenessProbe.HTTPGet)
assert.NotNil(t, rayContainer.ReadinessProbe.HTTPGet)
assert.Equal(t, rayContainer.ReadinessProbe.HTTPGet.Path, utils.RayServeProxyHealthPath)
assert.Equal(t, rayContainer.ReadinessProbe.HTTPGet.Port, intstr.FromInt(utils.DefaultServingPort))
}

func TestGenerateRayStartCommand(t *testing.T) {
Expand Down

0 comments on commit e7cdb70

Please sign in to comment.