From aa2b12f740034873e5ce20bca50253543b161c6a Mon Sep 17 00:00:00 2001 From: Ryan O'Leary Date: Wed, 31 Jul 2024 00:45:45 +0000 Subject: [PATCH 1/8] Fake TPU test initial commit Signed-off-by: Ryan O'Leary --- .../e2eautoscaler/create_detached_actor.py | 3 +- .../raycluster_autoscaler_test.go | 78 +++++++++++++++++++ 2 files changed, 80 insertions(+), 1 deletion(-) diff --git a/ray-operator/test/e2eautoscaler/create_detached_actor.py b/ray-operator/test/e2eautoscaler/create_detached_actor.py index 9a5ab96879..490bd2e3ff 100644 --- a/ray-operator/test/e2eautoscaler/create_detached_actor.py +++ b/ray-operator/test/e2eautoscaler/create_detached_actor.py @@ -6,10 +6,11 @@ parser.add_argument('name') parser.add_argument('--num-cpus', type=float, default=1) parser.add_argument('--num-gpus', type=float, default=0) +parser.add_argument('--custom-resource-name', type=str, default="CustomResource") parser.add_argument('--num-custom-resources', type=float, default=0) args = parser.parse_args() -@ray.remote(num_cpus=args.num_cpus, num_gpus=args.num_gpus, resources={"CustomResource": args.num_custom_resources}) +@ray.remote(num_cpus=args.num_cpus, num_gpus=args.num_gpus, resources={args.custom-resource-name: args.num_custom_resources}) class Actor: pass diff --git a/ray-operator/test/e2eautoscaler/raycluster_autoscaler_test.go b/ray-operator/test/e2eautoscaler/raycluster_autoscaler_test.go index 92a250d1e9..8bda86041b 100644 --- a/ray-operator/test/e2eautoscaler/raycluster_autoscaler_test.go +++ b/ray-operator/test/e2eautoscaler/raycluster_autoscaler_test.go @@ -134,6 +134,84 @@ func TestRayClusterAutoscalerWithFakeGPU(t *testing.T) { }) } +func TestRayClusterAutoscalerWithFakeTPU(t *testing.T) { + test := With(t) + + // Create a namespace + namespace := test.NewTestNamespace() + test.StreamKubeRayOperatorLogs() + + // Scripts for creating and terminating detached actors to trigger autoscaling + scriptsAC := newConfigMap(namespace.Name, "scripts", files(test, "create_detached_actor.py", "terminate_detached_actor.py")) + scripts, err := test.Client().Core().CoreV1().ConfigMaps(namespace.Name).Apply(test.Ctx(), scriptsAC, TestApplyOptions) + test.Expect(err).NotTo(HaveOccurred()) + test.T().Logf("Created ConfigMap %s/%s successfully", scripts.Namespace, scripts.Name) + + test.T().Run("Create a RayCluster with autoscaling enabled", func(_ *testing.T) { + rayClusterSpecAC := rayv1ac.RayClusterSpec(). + WithEnableInTreeAutoscaling(true). + WithRayVersion(GetRayVersion()). + WithHeadGroupSpec(rayv1ac.HeadGroupSpec(). + WithRayStartParams(map[string]string{"num-cpus": "0"}). + WithTemplate(headPodTemplateApplyConfiguration())). + WithWorkerGroupSpecs(rayv1ac.WorkerGroupSpec(). + WithReplicas(0). + WithMinReplicas(0). + WithMaxReplicas(3). + WithNumOfHosts(4). + WithGroupName("tpu-group"). + WithRayStartParams(map[string]string{"num-cpus": "1", "resources": `"{\"TPU\": 4}"`}). + WithTemplate(workerPodTemplateApplyConfiguration())) + rayClusterAC := rayv1ac.RayCluster("ray-cluster", namespace.Name). + WithSpec(apply(rayClusterSpecAC, mountConfigMap[rayv1ac.RayClusterSpecApplyConfiguration](scripts, "/home/ray/test_scripts"))) + + rayCluster, err := test.Client().Ray().RayV1().RayClusters(namespace.Name).Apply(test.Ctx(), rayClusterAC, TestApplyOptions) + test.Expect(err).NotTo(HaveOccurred()) + test.T().Logf("Created RayCluster %s/%s successfully", rayCluster.Namespace, rayCluster.Name) + + // Wait for RayCluster to become ready and verify the number of available worker replicas. 
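+ // With replicas and minReplicas both set to 0, no TPU workers should exist until a detached actor requests the custom TPU resource.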
+ test.Eventually(RayCluster(test, rayCluster.Namespace, rayCluster.Name), TestTimeoutMedium). + Should(WithTransform(RayClusterState, Equal(rayv1.Ready))) + rayCluster = GetRayCluster(test, rayCluster.Namespace, rayCluster.Name) + test.Expect(rayCluster.Status.DesiredWorkerReplicas).To(Equal(int32(0))) + + headPod := GetHeadPod(test, rayCluster) + test.T().Logf("Found head pod %s/%s", headPod.Namespace, headPod.Name) + + // Create a detached tpu actor, and 4 workers in the multi-host "tpu-group" should be created. + ExecPodCmd(test, headPod, common.RayHeadContainer, []string{"python", "/home/ray/test_scripts/create_detached_actor.py", "tpu_actor_1", "--custom-resource-name=\"TPU\"", "--num-custom-resources=4"}) + test.Eventually(RayCluster(test, rayCluster.Namespace, rayCluster.Name), TestTimeoutMedium). + Should(WithTransform(RayClusterDesiredWorkerReplicas, Equal(int32(1)))) + // We don't use real TPU resources of Kubernetes here, therefore we can't test the RayClusterDesiredTPU. + // We test the Pods count of the "tpu-group" instead. + // Each TPU multi-host replica should have 4 workers, so we check for 4 pods in 'tpu-group'. + test.Expect(GetGroupPods(test, rayCluster, "tpu-group")).To(HaveLen(4)) + + // Each TPU multi-host worker should have a task or actor scheduled on it, therefore we create 3 more detached actors + // to run on each node in the multi-host TPU worker group. + ExecPodCmd(test, headPod, common.RayHeadContainer, []string{"python", "/home/ray/test_scripts/create_detached_actor.py", "tpu_actor_2", "--custom-resource-name=\"TPU\"", "--num-custom-resources=4"}) + ExecPodCmd(test, headPod, common.RayHeadContainer, []string{"python", "/home/ray/test_scripts/create_detached_actor.py", "tpu_actor_3", "--custom-resource-name=\"TPU\"", "--num-custom-resources=4"}) + ExecPodCmd(test, headPod, common.RayHeadContainer, []string{"python", "/home/ray/test_scripts/create_detached_actor.py", "tpu_actor_4", "--custom-resource-name=\"TPU\"", "--num-custom-resources=4"}) + + // Each new TPU detached actor should get scheduled to an existing scaled-up worker, so we check that there are still 4 pods in 'tpu-group'. + test.Expect(GetGroupPods(test, rayCluster, "tpu-group")).To(HaveLen(4)) + + // Terminating one TPU detached actor will result in the Ray node becoming idle, causing Ray to scale down the entire multi-host + // worker group. A new multi-host worker group will then be scaled back up since the remaining detached actors are running. + ExecPodCmd(test, headPod, common.RayHeadContainer, []string{"python", "/home/ray/test_scripts/terminate_detached_actor.py", "tpu_actor_1"}) + test.Eventually(RayCluster(test, rayCluster.Namespace, rayCluster.Name), TestTimeoutMedium). + Should(WithTransform(RayClusterDesiredWorkerReplicas, Equal(int32(0)))) + + // Terminate the remaining 3 TPU detached actors, and the worker group should be deleted. + ExecPodCmd(test, headPod, common.RayHeadContainer, []string{"python", "/home/ray/test_scripts/terminate_detached_actor.py", "tpu_actor_2"}) + ExecPodCmd(test, headPod, common.RayHeadContainer, []string{"python", "/home/ray/test_scripts/terminate_detached_actor.py", "tpu_actor_3"}) + ExecPodCmd(test, headPod, common.RayHeadContainer, []string{"python", "/home/ray/test_scripts/terminate_detached_actor.py", "tpu_actor_4"}) + test.Eventually(RayCluster(test, rayCluster.Namespace, rayCluster.Name), TestTimeoutMedium). 
+ Should(WithTransform(RayClusterDesiredWorkerReplicas, Equal(int32(0)))) + test.Expect(GetGroupPods(test, rayCluster, "tpu-group")).To(HaveLen(0)) + }) +} + func TestRayClusterAutoscalerWithCustomResource(t *testing.T) { test := With(t) From 9bc92a6b352da7e09481a89dc4bd6d1083717bd6 Mon Sep 17 00:00:00 2001 From: Ryan O'Leary Date: Wed, 31 Jul 2024 02:50:08 +0000 Subject: [PATCH 2/8] Add single host test Signed-off-by: Ryan O'Leary --- .../raycluster_autoscaler_test.go | 61 ++++++++++++++++++- 1 file changed, 60 insertions(+), 1 deletion(-) diff --git a/ray-operator/test/e2eautoscaler/raycluster_autoscaler_test.go b/ray-operator/test/e2eautoscaler/raycluster_autoscaler_test.go index 8bda86041b..f892106310 100644 --- a/ray-operator/test/e2eautoscaler/raycluster_autoscaler_test.go +++ b/ray-operator/test/e2eautoscaler/raycluster_autoscaler_test.go @@ -134,7 +134,66 @@ func TestRayClusterAutoscalerWithFakeGPU(t *testing.T) { }) } -func TestRayClusterAutoscalerWithFakeTPU(t *testing.T) { +func TestRayClusterAutoscalerWithFakeSingleHostTPU(t *testing.T) { + test := With(t) + + // Create a namespace + namespace := test.NewTestNamespace() + test.StreamKubeRayOperatorLogs() + + // Scripts for creating and terminating detached actors to trigger autoscaling + scriptsAC := newConfigMap(namespace.Name, "scripts", files(test, "create_detached_actor.py", "terminate_detached_actor.py")) + scripts, err := test.Client().Core().CoreV1().ConfigMaps(namespace.Name).Apply(test.Ctx(), scriptsAC, TestApplyOptions) + test.Expect(err).NotTo(HaveOccurred()) + test.T().Logf("Created ConfigMap %s/%s successfully", scripts.Namespace, scripts.Name) + + test.T().Run("Create a RayCluster with autoscaling enabled", func(_ *testing.T) { + rayClusterSpecAC := rayv1ac.RayClusterSpec(). + WithEnableInTreeAutoscaling(true). + WithRayVersion(GetRayVersion()). + WithHeadGroupSpec(rayv1ac.HeadGroupSpec(). + WithRayStartParams(map[string]string{"num-cpus": "0"}). + WithTemplate(headPodTemplateApplyConfiguration())). + WithWorkerGroupSpecs(rayv1ac.WorkerGroupSpec(). + WithReplicas(0). + WithMinReplicas(0). + WithMaxReplicas(3). + WithNumOfHosts(1). + WithGroupName("tpu-group"). + WithRayStartParams(map[string]string{"num-cpus": "1", "resources": `"{\"TPU\": 4}"`}). + WithTemplate(workerPodTemplateApplyConfiguration())) + rayClusterAC := rayv1ac.RayCluster("ray-cluster", namespace.Name). + WithSpec(apply(rayClusterSpecAC, mountConfigMap[rayv1ac.RayClusterSpecApplyConfiguration](scripts, "/home/ray/test_scripts"))) + + rayCluster, err := test.Client().Ray().RayV1().RayClusters(namespace.Name).Apply(test.Ctx(), rayClusterAC, TestApplyOptions) + test.Expect(err).NotTo(HaveOccurred()) + test.T().Logf("Created RayCluster %s/%s successfully", rayCluster.Namespace, rayCluster.Name) + + // Wait for RayCluster to become ready and verify the number of available worker replicas. + test.Eventually(RayCluster(test, rayCluster.Namespace, rayCluster.Name), TestTimeoutMedium). + Should(WithTransform(RayClusterState, Equal(rayv1.Ready))) + rayCluster = GetRayCluster(test, rayCluster.Namespace, rayCluster.Name) + test.Expect(rayCluster.Status.DesiredWorkerReplicas).To(Equal(int32(0))) + + headPod := GetHeadPod(test, rayCluster) + test.T().Logf("Found head pod %s/%s", headPod.Namespace, headPod.Name) + + // Create a detached tpu actor, and 1 worker in the multi-host "tpu-group" should be created. 
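+ // The actor requests 4 units of the custom "TPU" resource, matching the per-worker capacity advertised through rayStartParams, so a single worker Pod satisfies the request.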
+ ExecPodCmd(test, headPod, common.RayHeadContainer, []string{"python", "/home/ray/test_scripts/create_detached_actor.py", "tpu_actor", "--custom-resource-name=\"TPU\"", "--num-custom-resources=4"}) + test.Eventually(RayCluster(test, rayCluster.Namespace, rayCluster.Name), TestTimeoutMedium). + Should(WithTransform(RayClusterDesiredWorkerReplicas, Equal(int32(1)))) + + // Each TPU multi-host replica should have 1 workers, so we check for 1 pod in 'tpu-group'. + test.Expect(GetGroupPods(test, rayCluster, "tpu-group")).To(HaveLen(1)) + + // Terminate the TPU detached actos, and the worker group replica should be deleted. + ExecPodCmd(test, headPod, common.RayHeadContainer, []string{"python", "/home/ray/test_scripts/terminate_detached_actor.py", "tpu_actor"}) + test.Eventually(RayCluster(test, rayCluster.Namespace, rayCluster.Name), TestTimeoutMedium). + Should(WithTransform(RayClusterDesiredWorkerReplicas, Equal(int32(0)))) + }) +} + +func TestRayClusterAutoscalerWithFakeMultiHostTPU(t *testing.T) { test := With(t) // Create a namespace From 81fce4cdba38d3e452314e816dda71283a1b3194 Mon Sep 17 00:00:00 2001 From: Ryan O'Leary Date: Wed, 31 Jul 2024 02:50:31 +0000 Subject: [PATCH 3/8] remove comment Signed-off-by: Ryan O'Leary --- ray-operator/test/e2eautoscaler/raycluster_autoscaler_test.go | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/ray-operator/test/e2eautoscaler/raycluster_autoscaler_test.go b/ray-operator/test/e2eautoscaler/raycluster_autoscaler_test.go index f892106310..d7a414b18e 100644 --- a/ray-operator/test/e2eautoscaler/raycluster_autoscaler_test.go +++ b/ray-operator/test/e2eautoscaler/raycluster_autoscaler_test.go @@ -241,8 +241,7 @@ func TestRayClusterAutoscalerWithFakeMultiHostTPU(t *testing.T) { ExecPodCmd(test, headPod, common.RayHeadContainer, []string{"python", "/home/ray/test_scripts/create_detached_actor.py", "tpu_actor_1", "--custom-resource-name=\"TPU\"", "--num-custom-resources=4"}) test.Eventually(RayCluster(test, rayCluster.Namespace, rayCluster.Name), TestTimeoutMedium). Should(WithTransform(RayClusterDesiredWorkerReplicas, Equal(int32(1)))) - // We don't use real TPU resources of Kubernetes here, therefore we can't test the RayClusterDesiredTPU. - // We test the Pods count of the "tpu-group" instead. + // Each TPU multi-host replica should have 4 workers, so we check for 4 pods in 'tpu-group'. test.Expect(GetGroupPods(test, rayCluster, "tpu-group")).To(HaveLen(4)) From 01d5978308aa3d4feb3e98a472fe003d793f9562 Mon Sep 17 00:00:00 2001 From: Ryan O'Leary Date: Wed, 31 Jul 2024 03:00:15 +0000 Subject: [PATCH 4/8] Lint changes Signed-off-by: Ryan O'Leary --- ray-operator/test/e2eautoscaler/raycluster_autoscaler_test.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ray-operator/test/e2eautoscaler/raycluster_autoscaler_test.go b/ray-operator/test/e2eautoscaler/raycluster_autoscaler_test.go index d7a414b18e..8dc9606fb9 100644 --- a/ray-operator/test/e2eautoscaler/raycluster_autoscaler_test.go +++ b/ray-operator/test/e2eautoscaler/raycluster_autoscaler_test.go @@ -182,7 +182,7 @@ func TestRayClusterAutoscalerWithFakeSingleHostTPU(t *testing.T) { ExecPodCmd(test, headPod, common.RayHeadContainer, []string{"python", "/home/ray/test_scripts/create_detached_actor.py", "tpu_actor", "--custom-resource-name=\"TPU\"", "--num-custom-resources=4"}) test.Eventually(RayCluster(test, rayCluster.Namespace, rayCluster.Name), TestTimeoutMedium). 
Should(WithTransform(RayClusterDesiredWorkerReplicas, Equal(int32(1)))) - + // Each TPU multi-host replica should have 1 workers, so we check for 1 pod in 'tpu-group'. test.Expect(GetGroupPods(test, rayCluster, "tpu-group")).To(HaveLen(1)) From dd96c66f20e55397efd25a41a3cc992a91506c03 Mon Sep 17 00:00:00 2001 From: Ryan O'Leary Date: Wed, 31 Jul 2024 04:01:44 +0000 Subject: [PATCH 5/8] Fix build errors Signed-off-by: Ryan O'Leary --- ray-operator/test/e2eautoscaler/create_detached_actor.py | 2 +- ray-operator/test/e2eautoscaler/raycluster_autoscaler_test.go | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/ray-operator/test/e2eautoscaler/create_detached_actor.py b/ray-operator/test/e2eautoscaler/create_detached_actor.py index 490bd2e3ff..3e12add133 100644 --- a/ray-operator/test/e2eautoscaler/create_detached_actor.py +++ b/ray-operator/test/e2eautoscaler/create_detached_actor.py @@ -10,7 +10,7 @@ parser.add_argument('--num-custom-resources', type=float, default=0) args = parser.parse_args() -@ray.remote(num_cpus=args.num_cpus, num_gpus=args.num_gpus, resources={args.custom-resource-name: args.num_custom_resources}) +@ray.remote(num_cpus=args.num_cpus, num_gpus=args.num_gpus, resources={args.custom_resource_name: args.num_custom_resources}) class Actor: pass diff --git a/ray-operator/test/e2eautoscaler/raycluster_autoscaler_test.go b/ray-operator/test/e2eautoscaler/raycluster_autoscaler_test.go index 8dc9606fb9..76f4452cb2 100644 --- a/ray-operator/test/e2eautoscaler/raycluster_autoscaler_test.go +++ b/ray-operator/test/e2eautoscaler/raycluster_autoscaler_test.go @@ -183,10 +183,10 @@ func TestRayClusterAutoscalerWithFakeSingleHostTPU(t *testing.T) { test.Eventually(RayCluster(test, rayCluster.Namespace, rayCluster.Name), TestTimeoutMedium). Should(WithTransform(RayClusterDesiredWorkerReplicas, Equal(int32(1)))) - // Each TPU multi-host replica should have 1 workers, so we check for 1 pod in 'tpu-group'. + // Each TPU multi-host replica should have 1 worker, so we check for 1 pod in 'tpu-group'. test.Expect(GetGroupPods(test, rayCluster, "tpu-group")).To(HaveLen(1)) - // Terminate the TPU detached actos, and the worker group replica should be deleted. + // Terminate the TPU detached actor and the worker group replica should be deleted. ExecPodCmd(test, headPod, common.RayHeadContainer, []string{"python", "/home/ray/test_scripts/terminate_detached_actor.py", "tpu_actor"}) test.Eventually(RayCluster(test, rayCluster.Namespace, rayCluster.Name), TestTimeoutMedium). 
Should(WithTransform(RayClusterDesiredWorkerReplicas, Equal(int32(0)))) From 0bab89202c639aa8836b9387318de90e5d43972d Mon Sep 17 00:00:00 2001 From: Ryan O'Leary Date: Wed, 7 Aug 2024 19:24:45 +0000 Subject: [PATCH 6/8] Fix unparam lint error Signed-off-by: Ryan O'Leary --- ray-operator/test/e2eautoscaler/raycluster_autoscaler_test.go | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/ray-operator/test/e2eautoscaler/raycluster_autoscaler_test.go b/ray-operator/test/e2eautoscaler/raycluster_autoscaler_test.go index 76f4452cb2..d6c1f7b618 100644 --- a/ray-operator/test/e2eautoscaler/raycluster_autoscaler_test.go +++ b/ray-operator/test/e2eautoscaler/raycluster_autoscaler_test.go @@ -142,7 +142,7 @@ func TestRayClusterAutoscalerWithFakeSingleHostTPU(t *testing.T) { test.StreamKubeRayOperatorLogs() // Scripts for creating and terminating detached actors to trigger autoscaling - scriptsAC := newConfigMap(namespace.Name, "scripts", files(test, "create_detached_actor.py", "terminate_detached_actor.py")) + scriptsAC := newConfigMap(namespace.Name, "scripts-tpu", files(test, "create_detached_actor.py", "terminate_detached_actor.py")) scripts, err := test.Client().Core().CoreV1().ConfigMaps(namespace.Name).Apply(test.Ctx(), scriptsAC, TestApplyOptions) test.Expect(err).NotTo(HaveOccurred()) test.T().Logf("Created ConfigMap %s/%s successfully", scripts.Namespace, scripts.Name) @@ -201,7 +201,7 @@ func TestRayClusterAutoscalerWithFakeMultiHostTPU(t *testing.T) { test.StreamKubeRayOperatorLogs() // Scripts for creating and terminating detached actors to trigger autoscaling - scriptsAC := newConfigMap(namespace.Name, "scripts", files(test, "create_detached_actor.py", "terminate_detached_actor.py")) + scriptsAC := newConfigMap(namespace.Name, "scripts-tpu", files(test, "create_detached_actor.py", "terminate_detached_actor.py")) scripts, err := test.Client().Core().CoreV1().ConfigMaps(namespace.Name).Apply(test.Ctx(), scriptsAC, TestApplyOptions) test.Expect(err).NotTo(HaveOccurred()) test.T().Logf("Created ConfigMap %s/%s successfully", scripts.Namespace, scripts.Name) From 0c6bb58ca35b6ed6258b6d6dd421a916c21ca915 Mon Sep 17 00:00:00 2001 From: Ryan O'Leary Date: Fri, 9 Aug 2024 08:29:06 +0000 Subject: [PATCH 7/8] Change to 2x2x2 topology and remove idle node behavior Signed-off-by: Ryan O'Leary --- .../raycluster_autoscaler_test.go | 53 +++++++++---------- 1 file changed, 25 insertions(+), 28 deletions(-) diff --git a/ray-operator/test/e2eautoscaler/raycluster_autoscaler_test.go b/ray-operator/test/e2eautoscaler/raycluster_autoscaler_test.go index d6c1f7b618..16abb507d1 100644 --- a/ray-operator/test/e2eautoscaler/raycluster_autoscaler_test.go +++ b/ray-operator/test/e2eautoscaler/raycluster_autoscaler_test.go @@ -160,7 +160,7 @@ func TestRayClusterAutoscalerWithFakeSingleHostTPU(t *testing.T) { WithMaxReplicas(3). WithNumOfHosts(1). WithGroupName("tpu-group"). - WithRayStartParams(map[string]string{"num-cpus": "1", "resources": `"{\"TPU\": 4}"`}). + WithRayStartParams(map[string]string{"resources": `"{\"TPU\": 4}"`}). WithTemplate(workerPodTemplateApplyConfiguration())) rayClusterAC := rayv1ac.RayCluster("ray-cluster", namespace.Name). 
WithSpec(apply(rayClusterSpecAC, mountConfigMap[rayv1ac.RayClusterSpecApplyConfiguration](scripts, "/home/ray/test_scripts"))) @@ -179,7 +179,7 @@ func TestRayClusterAutoscalerWithFakeSingleHostTPU(t *testing.T) { test.T().Logf("Found head pod %s/%s", headPod.Namespace, headPod.Name) // Create a detached tpu actor, and 1 worker in the multi-host "tpu-group" should be created. - ExecPodCmd(test, headPod, common.RayHeadContainer, []string{"python", "/home/ray/test_scripts/create_detached_actor.py", "tpu_actor", "--custom-resource-name=\"TPU\"", "--num-custom-resources=4"}) + ExecPodCmd(test, headPod, common.RayHeadContainer, []string{"python", "/home/ray/test_scripts/create_detached_actor.py", "tpu_actor", "--custom-resource-name=TPU", "--num-custom-resources=4"}) test.Eventually(RayCluster(test, rayCluster.Namespace, rayCluster.Name), TestTimeoutMedium). Should(WithTransform(RayClusterDesiredWorkerReplicas, Equal(int32(1)))) @@ -206,10 +206,18 @@ func TestRayClusterAutoscalerWithFakeMultiHostTPU(t *testing.T) { test.Expect(err).NotTo(HaveOccurred()) test.T().Logf("Created ConfigMap %s/%s successfully", scripts.Namespace, scripts.Name) + // Set 'replicaIndex' label that would be set by the GKE Ray TPU webhook. This is used to scale + // down entire multi-host replicas atomically. + replicaIndexLabel := map[string]string{ + "replicaIndex": "tpu-group-0", + } + podTemplate := workerPodTemplateApplyConfiguration().WithLabels(replicaIndexLabel) + minRayVersion := "2.32.0" // Multi-host autoscaling support starts in this version. + test.T().Run("Create a RayCluster with autoscaling enabled", func(_ *testing.T) { rayClusterSpecAC := rayv1ac.RayClusterSpec(). WithEnableInTreeAutoscaling(true). - WithRayVersion(GetRayVersion()). + WithRayVersion(minRayVersion). WithHeadGroupSpec(rayv1ac.HeadGroupSpec(). WithRayStartParams(map[string]string{"num-cpus": "0"}). WithTemplate(headPodTemplateApplyConfiguration())). @@ -217,10 +225,10 @@ func TestRayClusterAutoscalerWithFakeMultiHostTPU(t *testing.T) { WithReplicas(0). WithMinReplicas(0). WithMaxReplicas(3). - WithNumOfHosts(4). + WithNumOfHosts(2). WithGroupName("tpu-group"). - WithRayStartParams(map[string]string{"num-cpus": "1", "resources": `"{\"TPU\": 4}"`}). - WithTemplate(workerPodTemplateApplyConfiguration())) + WithRayStartParams(map[string]string{"resources": `"{\"TPU\": 4}"`}). + WithTemplate(podTemplate)) rayClusterAC := rayv1ac.RayCluster("ray-cluster", namespace.Name). WithSpec(apply(rayClusterSpecAC, mountConfigMap[rayv1ac.RayClusterSpecApplyConfiguration](scripts, "/home/ray/test_scripts"))) @@ -237,36 +245,26 @@ func TestRayClusterAutoscalerWithFakeMultiHostTPU(t *testing.T) { headPod := GetHeadPod(test, rayCluster) test.T().Logf("Found head pod %s/%s", headPod.Namespace, headPod.Name) - // Create a detached tpu actor, and 4 workers in the multi-host "tpu-group" should be created. - ExecPodCmd(test, headPod, common.RayHeadContainer, []string{"python", "/home/ray/test_scripts/create_detached_actor.py", "tpu_actor_1", "--custom-resource-name=\"TPU\"", "--num-custom-resources=4"}) + // Create a detached TPU actor, and 1 multi-host replica with 2 TPU workers should be created. + ExecPodCmd(test, headPod, common.RayHeadContainer, []string{"python", "/home/ray/test_scripts/create_detached_actor.py", "tpu_actor_1", "--custom-resource-name=TPU", "--num-custom-resources=4"}) test.Eventually(RayCluster(test, rayCluster.Namespace, rayCluster.Name), TestTimeoutMedium). 
Should(WithTransform(RayClusterDesiredWorkerReplicas, Equal(int32(1)))) - // Each TPU multi-host replica should have 4 workers, so we check for 4 pods in 'tpu-group'. - test.Expect(GetGroupPods(test, rayCluster, "tpu-group")).To(HaveLen(4)) - - // Each TPU multi-host worker should have a task or actor scheduled on it, therefore we create 3 more detached actors - // to run on each node in the multi-host TPU worker group. - ExecPodCmd(test, headPod, common.RayHeadContainer, []string{"python", "/home/ray/test_scripts/create_detached_actor.py", "tpu_actor_2", "--custom-resource-name=\"TPU\"", "--num-custom-resources=4"}) - ExecPodCmd(test, headPod, common.RayHeadContainer, []string{"python", "/home/ray/test_scripts/create_detached_actor.py", "tpu_actor_3", "--custom-resource-name=\"TPU\"", "--num-custom-resources=4"}) - ExecPodCmd(test, headPod, common.RayHeadContainer, []string{"python", "/home/ray/test_scripts/create_detached_actor.py", "tpu_actor_4", "--custom-resource-name=\"TPU\"", "--num-custom-resources=4"}) - - // Each new TPU detached actor should get scheduled to an existing scaled-up worker, so we check that there are still 4 pods in 'tpu-group'. - test.Expect(GetGroupPods(test, rayCluster, "tpu-group")).To(HaveLen(4)) + // Each TPU multi-host replica should have NumOfHosts workers, so we check for 2 pods in 'tpu-group'. + test.Expect(GetGroupPods(test, rayCluster, "tpu-group")).To(HaveLen(2)) - // Terminating one TPU detached actor will result in the Ray node becoming idle, causing Ray to scale down the entire multi-host - // worker group. A new multi-host worker group will then be scaled back up since the remaining detached actors are running. - ExecPodCmd(test, headPod, common.RayHeadContainer, []string{"python", "/home/ray/test_scripts/terminate_detached_actor.py", "tpu_actor_1"}) + // Each TPU multi-host worker should have a task or actor scheduled on it, therefore we create another detached actor + // to run on the second node in the multi-host TPU worker group. + ExecPodCmd(test, headPod, common.RayHeadContainer, []string{"python", "/home/ray/test_scripts/create_detached_actor.py", "tpu_actor_2", "--custom-resource-name=TPU", "--num-custom-resources=4"}) test.Eventually(RayCluster(test, rayCluster.Namespace, rayCluster.Name), TestTimeoutMedium). - Should(WithTransform(RayClusterDesiredWorkerReplicas, Equal(int32(0)))) + Should(WithTransform(RayClusterDesiredWorkerReplicas, Equal(int32(1)))) + test.Expect(GetGroupPods(test, rayCluster, "tpu-group")).To(HaveLen(2)) - // Terminate the remaining 3 TPU detached actors, and the worker group should be deleted. + // Terminate the TPU detached actors, and the multi-host replica should be scaled down. + ExecPodCmd(test, headPod, common.RayHeadContainer, []string{"python", "/home/ray/test_scripts/terminate_detached_actor.py", "tpu_actor_1"}) ExecPodCmd(test, headPod, common.RayHeadContainer, []string{"python", "/home/ray/test_scripts/terminate_detached_actor.py", "tpu_actor_2"}) - ExecPodCmd(test, headPod, common.RayHeadContainer, []string{"python", "/home/ray/test_scripts/terminate_detached_actor.py", "tpu_actor_3"}) - ExecPodCmd(test, headPod, common.RayHeadContainer, []string{"python", "/home/ray/test_scripts/terminate_detached_actor.py", "tpu_actor_4"}) test.Eventually(RayCluster(test, rayCluster.Namespace, rayCluster.Name), TestTimeoutMedium). 
Should(WithTransform(RayClusterDesiredWorkerReplicas, Equal(int32(0)))) - test.Expect(GetGroupPods(test, rayCluster, "tpu-group")).To(HaveLen(0)) }) } @@ -285,7 +283,6 @@ func TestRayClusterAutoscalerWithCustomResource(t *testing.T) { test.T().Run("Create a RayCluster with autoscaling enabled", func(_ *testing.T) { groupName := "custom-resource-group" - rayClusterSpecAC := rayv1ac.RayClusterSpec(). WithEnableInTreeAutoscaling(true). WithRayVersion(GetRayVersion()). From 2649e1f4b7a92d0ac66ffe2162fa665fce902976 Mon Sep 17 00:00:00 2001 From: Ryan O'Leary Date: Fri, 9 Aug 2024 08:40:10 +0000 Subject: [PATCH 8/8] Add back in new line Signed-off-by: Ryan O'Leary --- ray-operator/test/e2eautoscaler/raycluster_autoscaler_test.go | 1 + 1 file changed, 1 insertion(+) diff --git a/ray-operator/test/e2eautoscaler/raycluster_autoscaler_test.go b/ray-operator/test/e2eautoscaler/raycluster_autoscaler_test.go index 16abb507d1..a8cd2420c8 100644 --- a/ray-operator/test/e2eautoscaler/raycluster_autoscaler_test.go +++ b/ray-operator/test/e2eautoscaler/raycluster_autoscaler_test.go @@ -283,6 +283,7 @@ func TestRayClusterAutoscalerWithCustomResource(t *testing.T) { test.T().Run("Create a RayCluster with autoscaling enabled", func(_ *testing.T) { groupName := "custom-resource-group" + rayClusterSpecAC := rayv1ac.RayClusterSpec(). WithEnableInTreeAutoscaling(true). WithRayVersion(GetRayVersion()).
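
Note on the fake TPU resource used by these tests: the worker group advertises a logical Ray resource through rayStartParams (resources: '{"TPU": 4}') rather than a real Kubernetes TPU device, and scale-up is driven purely by detached actors that request that resource. A minimal sketch of that request path, mirroring create_detached_actor.py after the argparse fix in PATCH 5/8 (the num-cpus/num-gpus flags of the real script are trimmed here):

import argparse

import ray

parser = argparse.ArgumentParser()
parser.add_argument('name')
parser.add_argument('--custom-resource-name', type=str, default="CustomResource")
parser.add_argument('--num-custom-resources', type=float, default=0)
args = parser.parse_args()

ray.init()

# argparse exposes "--custom-resource-name" as args.custom_resource_name; the
# hyphenated attribute access in PATCH 1/8 would parse as a subtraction and fail,
# which is what PATCH 5/8 corrects.
@ray.remote(resources={args.custom_resource_name: args.num_custom_resources})
class Actor:
    pass

# A named, detached actor outlives the submitting driver, so its resource demand
# keeps the autoscaled TPU worker(s) alive until terminate_detached_actor.py
# removes the actor by name.
Actor.options(name=args.name, lifetime="detached").remote()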