Skip to content

Commit

Permalink
Use HTTP probes for Ray readiness and liveness probes
Browse files Browse the repository at this point in the history
Signed-off-by: Andrew Sy Kim <[email protected]>
  • Loading branch information
andrewsykim committed Oct 31, 2024
1 parent 33ba385 commit d106ab9
Show file tree
Hide file tree
Showing 3 changed files with 60 additions and 76 deletions.
51 changes: 10 additions & 41 deletions ray-operator/controllers/ray/common/pod.go
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@ import (

corev1 "k8s.io/api/core/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/util/intstr"
ctrl "sigs.k8s.io/controller-runtime"
)

Expand Down Expand Up @@ -248,72 +249,40 @@ func DefaultWorkerPodTemplate(ctx context.Context, instance rayv1.RayCluster, wo
}

// initLivenessAndReadinessProbe sets default liveness and readiness probes on rayContainer.
// A probe that is already non-nil on the container is left untouched; only nil probes are filled in.
// rayNodeType selects the health-check path/port, and creatorCRDType combined with rayNodeType
// decides whether the readiness probe is pointed at the Ray Serve proxy health path.
//
// NOTE(review): this listing is a rendered commit diff without +/- markers, so removed
// (Exec/wget command) lines and added (HTTPGet) lines are interleaved and it is not compilable
// as shown. Read duplicated assignments (e.g. the two TimeoutSeconds fields) as old/new pairs.
func initLivenessAndReadinessProbe(rayContainer *corev1.Container, rayNodeType rayv1.RayNodeType, creatorCRDType utils.CRDType) {
// Exec variant: shell command strings formatted from utils.BaseWgetHealthCommand.
rayAgentRayletHealthCommand := fmt.Sprintf(
utils.BaseWgetHealthCommand,
utils.DefaultReadinessProbeTimeoutSeconds,
utils.DefaultDashboardAgentListenPort,
utils.RayAgentRayletHealthPath,
)
// NOTE(review): the first format argument here is a failure threshold, while the Raylet
// command above passes a timeout in the same position — looks unintended; confirm.
rayDashboardGCSHealthCommand := fmt.Sprintf(
utils.BaseWgetHealthCommand,
utils.DefaultReadinessProbeFailureThreshold,
utils.DefaultDashboardPort,
utils.RayDashboardGCSHealthPath,
)

// Generally, the liveness and readiness probes perform the same checks.
// For head node => Check GCS and Raylet status.
// For worker node => Check Raylet status.
// NOTE(review): the "GCS and Raylet" wording matches the combined Exec commands; the HTTPGet
// variant carries a single path/port, so a head node would only hit the GCS path — confirm.
commands := []string{}
// HTTPGet variant: start from the Raylet health path on the dashboard agent port.
// NOTE(review): the pre-change test built its HTTPGet path as "/" + RayAgentRayletHealthPath,
// which suggests these constants carry no leading slash — confirm the path form HTTPGetAction expects.
healthCheckPath := utils.RayAgentRayletHealthPath
healthCheckPort := intstr.FromInt(utils.DefaultDashboardAgentListenPort)
if rayNodeType == rayv1.HeadNode {
commands = append(commands, rayAgentRayletHealthCommand, rayDashboardGCSHealthCommand)
} else {
commands = append(commands, rayAgentRayletHealthCommand)
// NOTE(review): presumably these two overrides belong to the HeadNode branch (the tests in this
// commit expect RayDashboardGCSHealthPath/DefaultDashboardPort for head pods); they only appear
// under "else" because removed and added diff lines are interleaved here.
healthCheckPath = utils.RayDashboardGCSHealthPath
healthCheckPort = intstr.FromInt(utils.DefaultDashboardPort)
}

// Only install a default liveness probe when the container does not already define one.
if rayContainer.LivenessProbe == nil {
probeTimeout := int32(utils.DefaultLivenessProbeTimeoutSeconds)
if rayNodeType == rayv1.HeadNode {
probeTimeout = int32(utils.DefaultHeadLivenessProbeTimeoutSeconds)
}

rayContainer.LivenessProbe = &corev1.Probe{
InitialDelaySeconds: utils.DefaultLivenessProbeInitialDelaySeconds,
TimeoutSeconds: probeTimeout,
TimeoutSeconds: utils.DefaultLivenessProbeTimeoutSeconds,
PeriodSeconds: utils.DefaultLivenessProbePeriodSeconds,
SuccessThreshold: utils.DefaultLivenessProbeSuccessThreshold,
FailureThreshold: utils.DefaultLivenessProbeFailureThreshold,
}
rayContainer.LivenessProbe.Exec = &corev1.ExecAction{Command: []string{"bash", "-c", strings.Join(commands, " && ")}}
rayContainer.LivenessProbe.HTTPGet = &corev1.HTTPGetAction{Path: healthCheckPath, Port: healthCheckPort}
}

// Only install a default readiness probe when the container does not already define one.
if rayContainer.ReadinessProbe == nil {
probeTimeout := int32(utils.DefaultReadinessProbeTimeoutSeconds)
if rayNodeType == rayv1.HeadNode {
probeTimeout = int32(utils.DefaultHeadReadinessProbeTimeoutSeconds)
}
rayContainer.ReadinessProbe = &corev1.Probe{
InitialDelaySeconds: utils.DefaultReadinessProbeInitialDelaySeconds,
TimeoutSeconds: probeTimeout,
TimeoutSeconds: utils.DefaultReadinessProbeTimeoutSeconds,
PeriodSeconds: utils.DefaultReadinessProbePeriodSeconds,
SuccessThreshold: utils.DefaultReadinessProbeSuccessThreshold,
FailureThreshold: utils.DefaultReadinessProbeFailureThreshold,
}
rayContainer.ReadinessProbe.Exec = &corev1.ExecAction{Command: []string{"bash", "-c", strings.Join(commands, " && ")}}
rayContainer.ReadinessProbe.HTTPGet = &corev1.HTTPGetAction{Path: healthCheckPath, Port: healthCheckPort}

// For worker Pods serving traffic, we need to add an additional HTTP proxy health check for the readiness probe.
// Note: head Pod checks the HTTP proxy's health at every rayservice controller reconcile instead of using readiness probe.
// See https://github.com/ray-project/kuberay/pull/1808 for reasons.
if creatorCRDType == utils.RayServiceCRD && rayNodeType == rayv1.WorkerNode {
rayContainer.ReadinessProbe.FailureThreshold = utils.ServeReadinessProbeFailureThreshold
rayServeProxyHealthCommand := fmt.Sprintf(
utils.BaseWgetHealthCommand,
utils.DefaultReadinessProbeInitialDelaySeconds,
utils.FindContainerPort(rayContainer, utils.ServingPortName, utils.DefaultServingPort),
utils.RayServeProxyHealthPath,
)
commands = append(commands, rayServeProxyHealthCommand)
rayContainer.ReadinessProbe.Exec = &corev1.ExecAction{Command: []string{"bash", "-c", strings.Join(commands, " && ")}}
// NOTE(review): the HTTPGet variant replaces the handler (Raylet path is no longer probed) and
// uses utils.DefaultServingPort directly, whereas the Exec variant resolved the port through
// utils.FindContainerPort — confirm a custom serving port on the container is still honored.
rayContainer.ReadinessProbe.HTTPGet = &corev1.HTTPGetAction{Path: utils.RayServeProxyHealthPath, Port: intstr.FromInt(utils.DefaultServingPort)}
}
}
}
Expand Down
67 changes: 43 additions & 24 deletions ray-operator/controllers/ray/common/pod_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -1167,35 +1167,33 @@ func TestInitLivenessAndReadinessProbe(t *testing.T) {
podTemplateSpec := DefaultHeadPodTemplate(context.Background(), *cluster, cluster.Spec.HeadGroupSpec, podName, "6379")
rayContainer := &podTemplateSpec.Spec.Containers[utils.RayContainerIndex]

// Test 1: User defines a custom HTTPGet probe.
httpGetProbe := corev1.Probe{
// Test 1: User defines a custom Exec probe to override default HTTP probe.
execProbe := corev1.Probe{
ProbeHandler: corev1.ProbeHandler{
HTTPGet: &corev1.HTTPGetAction{
// Check Raylet status
Path: fmt.Sprintf("/%s", utils.RayAgentRayletHealthPath),
Port: intstr.FromInt(utils.DefaultDashboardAgentListenPort),
Exec: &corev1.ExecAction{
Command: []string{"foo", "bar"},
},
},
}

rayContainer.LivenessProbe = &httpGetProbe
rayContainer.ReadinessProbe = &httpGetProbe
rayContainer.LivenessProbe = &execProbe
rayContainer.ReadinessProbe = &execProbe
initLivenessAndReadinessProbe(rayContainer, rayv1.HeadNode, "")
assert.NotNil(t, rayContainer.LivenessProbe.HTTPGet)
assert.NotNil(t, rayContainer.ReadinessProbe.HTTPGet)
assert.Nil(t, rayContainer.LivenessProbe.Exec)
assert.Nil(t, rayContainer.ReadinessProbe.Exec)
assert.NotNil(t, rayContainer.LivenessProbe.Exec)
assert.NotNil(t, rayContainer.ReadinessProbe.Exec)
assert.Nil(t, rayContainer.LivenessProbe.HTTPGet)
assert.Nil(t, rayContainer.ReadinessProbe.HTTPGet)

// Test 2: User does not define a custom probe. KubeRay will inject Exec probe for worker pod.
// Test 2: User does not define a custom probe. KubeRay will inject HTTP probe for worker pod.
// Here we test the case where the Ray Pod originates from RayServiceCRD,
// implying that an additional serve health check will be added to the readiness probe.
rayContainer.LivenessProbe = nil
rayContainer.ReadinessProbe = nil
initLivenessAndReadinessProbe(rayContainer, rayv1.WorkerNode, utils.RayServiceCRD)
assert.NotNil(t, rayContainer.LivenessProbe.Exec)
assert.NotNil(t, rayContainer.ReadinessProbe.Exec)
assert.False(t, strings.Contains(strings.Join(rayContainer.LivenessProbe.Exec.Command, " "), utils.RayServeProxyHealthPath))
assert.True(t, strings.Contains(strings.Join(rayContainer.ReadinessProbe.Exec.Command, " "), utils.RayServeProxyHealthPath))
assert.NotNil(t, rayContainer.LivenessProbe.HTTPGet)
assert.NotNil(t, rayContainer.ReadinessProbe.HTTPGet)
assert.Equal(t, rayContainer.ReadinessProbe.HTTPGet.Path, utils.RayServeProxyHealthPath)
assert.Equal(t, rayContainer.ReadinessProbe.HTTPGet.Port, intstr.FromInt(utils.DefaultServingPort))
assert.Equal(t, int32(2), rayContainer.LivenessProbe.TimeoutSeconds)
assert.Equal(t, int32(2), rayContainer.ReadinessProbe.TimeoutSeconds)

Expand All @@ -1205,13 +1203,34 @@ func TestInitLivenessAndReadinessProbe(t *testing.T) {
rayContainer.LivenessProbe = nil
rayContainer.ReadinessProbe = nil
initLivenessAndReadinessProbe(rayContainer, rayv1.HeadNode, utils.RayServiceCRD)
assert.NotNil(t, rayContainer.LivenessProbe.Exec)
assert.NotNil(t, rayContainer.ReadinessProbe.Exec)
// head pod should not have Ray Serve proxy health probes
assert.False(t, strings.Contains(strings.Join(rayContainer.LivenessProbe.Exec.Command, " "), utils.RayServeProxyHealthPath))
assert.False(t, strings.Contains(strings.Join(rayContainer.ReadinessProbe.Exec.Command, " "), utils.RayServeProxyHealthPath))
assert.Equal(t, int32(5), rayContainer.LivenessProbe.TimeoutSeconds)
assert.Equal(t, int32(5), rayContainer.ReadinessProbe.TimeoutSeconds)
assert.NotNil(t, rayContainer.LivenessProbe.HTTPGet)
assert.NotNil(t, rayContainer.ReadinessProbe.HTTPGet)
assert.Equal(t, rayContainer.ReadinessProbe.HTTPGet.Path, utils.RayDashboardGCSHealthPath)
assert.Equal(t, rayContainer.ReadinessProbe.HTTPGet.Port, intstr.FromInt(utils.DefaultDashboardPort))
assert.Equal(t, int32(2), rayContainer.LivenessProbe.TimeoutSeconds)
assert.Equal(t, int32(2), rayContainer.ReadinessProbe.TimeoutSeconds)

// Test 4: User does not define custom probe. Pod is a worker Pod for a RayJob
rayContainer.LivenessProbe = nil
rayContainer.ReadinessProbe = nil
initLivenessAndReadinessProbe(rayContainer, rayv1.WorkerNode, utils.RayJobCRD)
assert.NotNil(t, rayContainer.LivenessProbe.HTTPGet)
assert.NotNil(t, rayContainer.ReadinessProbe.HTTPGet)
assert.Equal(t, rayContainer.ReadinessProbe.HTTPGet.Path, utils.RayAgentRayletHealthPath)
assert.Equal(t, rayContainer.ReadinessProbe.HTTPGet.Port, intstr.FromInt(utils.DefaultDashboardAgentListenPort))
assert.Equal(t, int32(2), rayContainer.LivenessProbe.TimeoutSeconds)
assert.Equal(t, int32(2), rayContainer.ReadinessProbe.TimeoutSeconds)

// Test 5: User does not define custom probe. Pod is a head Pod for a RayJob
rayContainer.LivenessProbe = nil
rayContainer.ReadinessProbe = nil
initLivenessAndReadinessProbe(rayContainer, rayv1.HeadNode, utils.RayJobCRD)
assert.NotNil(t, rayContainer.LivenessProbe.HTTPGet)
assert.NotNil(t, rayContainer.ReadinessProbe.HTTPGet)
assert.Equal(t, rayContainer.ReadinessProbe.HTTPGet.Path, utils.RayDashboardGCSHealthPath)
assert.Equal(t, rayContainer.ReadinessProbe.HTTPGet.Port, intstr.FromInt(utils.DefaultDashboardPort))
assert.Equal(t, int32(2), rayContainer.LivenessProbe.TimeoutSeconds)
assert.Equal(t, int32(2), rayContainer.ReadinessProbe.TimeoutSeconds)
}

func TestGenerateRayStartCommand(t *testing.T) {
Expand Down
18 changes: 7 additions & 11 deletions ray-operator/controllers/ray/utils/constant.go
Original file line number Diff line number Diff line change
Expand Up @@ -151,21 +151,17 @@ const (
// Ray FT default readiness probe values
DefaultReadinessProbeInitialDelaySeconds = 10
DefaultReadinessProbeTimeoutSeconds = 2
// Probe timeout for Head pod needs to be longer as it queries two endpoints (api/local_raylet_healthz & api/gcs_healthz)
DefaultHeadReadinessProbeTimeoutSeconds = 5
DefaultReadinessProbePeriodSeconds = 5
DefaultReadinessProbeSuccessThreshold = 1
DefaultReadinessProbeFailureThreshold = 10
ServeReadinessProbeFailureThreshold = 1
DefaultReadinessProbePeriodSeconds = 5
DefaultReadinessProbeSuccessThreshold = 1
DefaultReadinessProbeFailureThreshold = 10
ServeReadinessProbeFailureThreshold = 1

// Ray FT default liveness probe values
DefaultLivenessProbeInitialDelaySeconds = 30
DefaultLivenessProbeTimeoutSeconds = 2
// Probe timeout for Head pod needs to be longer as it queries two endpoints (api/local_raylet_healthz & api/gcs_healthz)
DefaultHeadLivenessProbeTimeoutSeconds = 5
DefaultLivenessProbePeriodSeconds = 5
DefaultLivenessProbeSuccessThreshold = 1
DefaultLivenessProbeFailureThreshold = 120
DefaultLivenessProbePeriodSeconds = 5
DefaultLivenessProbeSuccessThreshold = 1
DefaultLivenessProbeFailureThreshold = 120

// Ray health check related configurations
// Note: Since the Raylet process and the dashboard agent process are fate-sharing,
Expand Down

0 comments on commit d106ab9

Please sign in to comment.