From 59697466f12d69127173af408703c9f9b8372c7d Mon Sep 17 00:00:00 2001
From: Maskym Vavilov
Date: Tue, 26 Nov 2024 13:16:37 +0000
Subject: [PATCH] reconcile premature on probes nil -> healthy

Signed-off-by: Maskym Vavilov
---
 api/v1alpha1/dnshealthcheckprobe_types.go      |  2 +-
 .../dns-operator.clusterserviceversion.yaml    |  2 +-
 .../kuadrant.io_dnshealthcheckprobes.yaml      |  2 -
 charts/dns-operator/templates/manifests.yaml   |  2 -
 .../kuadrant.io_dnshealthcheckprobes.yaml      |  2 -
 internal/controller/dnsrecord_controller.go    | 42 +++++-----
 .../controller/dnsrecord_healthchecks_test.go  | 81 -------------------
 7 files changed, 24 insertions(+), 109 deletions(-)

diff --git a/api/v1alpha1/dnshealthcheckprobe_types.go b/api/v1alpha1/dnshealthcheckprobe_types.go
index 630f91e..72cf75f 100644
--- a/api/v1alpha1/dnshealthcheckprobe_types.go
+++ b/api/v1alpha1/dnshealthcheckprobe_types.go
@@ -79,7 +79,7 @@ type DNSHealthCheckProbeStatus struct {
     ConsecutiveFailures int    `json:"consecutiveFailures,omitempty"`
     Reason              string `json:"reason,omitempty"`
     Status              int    `json:"status,omitempty"`
-    Healthy             *bool  `json:"healthy"`
+    Healthy             *bool  `json:"healthy,omitempty"`
     ObservedGeneration  int64  `json:"observedGeneration,omitempty"`
 }
 
diff --git a/bundle/manifests/dns-operator.clusterserviceversion.yaml b/bundle/manifests/dns-operator.clusterserviceversion.yaml
index 7cab15e..d522d78 100644
--- a/bundle/manifests/dns-operator.clusterserviceversion.yaml
+++ b/bundle/manifests/dns-operator.clusterserviceversion.yaml
@@ -58,7 +58,7 @@ metadata:
     capabilities: Basic Install
     categories: Integration & Delivery
     containerImage: quay.io/kuadrant/dns-operator:latest
-    createdAt: "2024-11-15T11:06:08Z"
+    createdAt: "2024-11-26T11:14:48Z"
     description: A Kubernetes Operator to manage the lifecycle of DNS resources
     operators.operatorframework.io/builder: operator-sdk-v1.33.0
    operators.operatorframework.io/project_layout: go.kubebuilder.io/v4
diff --git a/bundle/manifests/kuadrant.io_dnshealthcheckprobes.yaml b/bundle/manifests/kuadrant.io_dnshealthcheckprobes.yaml
index b7aef8a..8b64d9f 100644
--- a/bundle/manifests/kuadrant.io_dnshealthcheckprobes.yaml
+++ b/bundle/manifests/kuadrant.io_dnshealthcheckprobes.yaml
@@ -114,8 +114,6 @@ spec:
                 type: string
               status:
                 type: integer
-            required:
-            - healthy
             type: object
         type: object
     served: true
diff --git a/charts/dns-operator/templates/manifests.yaml b/charts/dns-operator/templates/manifests.yaml
index 2be476c..250f620 100644
--- a/charts/dns-operator/templates/manifests.yaml
+++ b/charts/dns-operator/templates/manifests.yaml
@@ -115,8 +115,6 @@ spec:
                 type: string
               status:
                 type: integer
-            required:
-            - healthy
             type: object
         type: object
     served: true
diff --git a/config/crd/bases/kuadrant.io_dnshealthcheckprobes.yaml b/config/crd/bases/kuadrant.io_dnshealthcheckprobes.yaml
index edb7592..c8e0bdf 100644
--- a/config/crd/bases/kuadrant.io_dnshealthcheckprobes.yaml
+++ b/config/crd/bases/kuadrant.io_dnshealthcheckprobes.yaml
@@ -114,8 +114,6 @@ spec:
                 type: string
               status:
                 type: integer
-            required:
-            - healthy
             type: object
         type: object
     served: true
diff --git a/internal/controller/dnsrecord_controller.go b/internal/controller/dnsrecord_controller.go
index 5973834..693a008 100644
--- a/internal/controller/dnsrecord_controller.go
+++ b/internal/controller/dnsrecord_controller.go
@@ -389,36 +389,39 @@ func (r *DNSRecordReconciler) SetupWithManager(mgr ctrl.Manager, maxRequeue, val
                 logger.V(1).Info("unexpected object type", "error", fmt.Sprintf("%T is not a *v1alpha1.DNSHealthCheckProbe", o))
                 return []reconcile.Request{}
             }
-            onCluster := &v1alpha1.DNSHealthCheckProbe{}
-            if err := mgr.GetClient().Get(ctx, client.ObjectKeyFromObject(probe), onCluster); client.IgnoreNotFound(err) != nil {
-                logger.Error(err, "failed to get probe")
+
+            // haven't probed yet or deleting - nothing to do
+            if probe.Status.Healthy == nil {
                 return []reconcile.Request{}
             }
-            probeOwner := &v1alpha1.DNSRecord{
-                ObjectMeta: metav1.ObjectMeta{},
-            }
+            record := &v1alpha1.DNSRecord{ObjectMeta: metav1.ObjectMeta{}}
             for _, ro := range probe.GetOwnerReferences() {
                 if ro.Kind == "DNSRecord" {
-                    probeOwner.Name = ro.Name
-                    probeOwner.Namespace = probe.Namespace
+                    record.Name = ro.Name
+                    record.Namespace = probe.Namespace
                     break
                 }
             }
-            // haven't probed yet or deleting - nothing to do
-            if probe.Status.Healthy == nil {
+            if err := mgr.GetClient().Get(ctx, client.ObjectKeyFromObject(record), record); client.IgnoreNotFound(err) != nil {
+                logger.Error(err, "failed to get record")
                 return []reconcile.Request{}
             }
-            // probe.Status.Healthy is not nil here
-            // we probed for the first time - reconcile
-            // or none are nil and status changed - reconcile
-            if onCluster.Status.Healthy == nil || *probe.Status.Healthy != *onCluster.Status.Healthy {
-                return []reconcile.Request{{NamespacedName: client.ObjectKeyFromObject(probeOwner)}}
+            condition := meta.FindStatusCondition(record.Status.Conditions, string(v1alpha1.ConditionTypeHealthy))
+            // no condition - record is not processed yet
+            if condition == nil {
+                return []reconcile.Request{}
             }
-            // none are nil and status is the same - nothing to do
+            isHealthy := condition.Status == metav1.ConditionTrue
+
+            // record and probe disagree on health - requeue
+            if *probe.Status.Healthy != isHealthy {
+                return []reconcile.Request{{NamespacedName: client.ObjectKeyFromObject(record)}}
+            }
+            // nothing to do
             return []reconcile.Request{}
         })).
         Complete(r)
@@ -490,14 +493,13 @@ func recordReceivedPrematurely(record *v1alpha1.DNSRecord, probes *v1alpha1.DNSH
     // healthy is true only if we have probes and they are all healthy
     isHealthy := healthyCond.Status == metav1.ConditionTrue
 
-    // if at least one not healthy - this will lock in false
-    allProbesHealthy := true
+    // if at least one is healthy - this will lock in true
+    allProbesHealthy := false
     for _, probe := range probes.Items {
         if probe.Status.Healthy != nil {
-            allProbesHealthy = allProbesHealthy && *probe.Status.Healthy
+            allProbesHealthy = allProbesHealthy || *probe.Status.Healthy
         }
     }
-    // prematurely is true here. return false in case we need full reconcile
     return isHealthy == allProbesHealthy, requeueIn
 }
 
diff --git a/internal/controller/dnsrecord_healthchecks_test.go b/internal/controller/dnsrecord_healthchecks_test.go
index 6a16e62..7a65967 100644
--- a/internal/controller/dnsrecord_healthchecks_test.go
+++ b/internal/controller/dnsrecord_healthchecks_test.go
@@ -265,87 +265,6 @@ var _ = Describe("DNSRecordReconciler_HealthChecks", func() {
         }, TestTimeoutMedium, time.Second).Should(Succeed())
     })
 
-    It("Should reconcile prematurely on healthcheck change", func() {
-        // Create a default record
-        Expect(k8sClient.Create(ctx, dnsRecord)).To(Succeed())
-
-        By("Making probes healthy")
-        Eventually(func(g Gomega) {
-            probes := &v1alpha1.DNSHealthCheckProbeList{}
-
-            g.Expect(k8sClient.List(ctx, probes, &client.ListOptions{
-                LabelSelector: labels.SelectorFromSet(map[string]string{
-                    ProbeOwnerLabel: BuildOwnerLabelValue(dnsRecord),
-                }),
-                Namespace: dnsRecord.Namespace,
-            })).To(Succeed())
-            g.Expect(len(probes.Items)).To(Equal(2))
-
-            // make probes healthy
-            for _, probe := range probes.Items {
-                probe.Status.Healthy = ptr.To(true)
-                probe.Status.LastCheckedAt = metav1.Now()
-                probe.Status.ConsecutiveFailures = 0
-                g.Expect(k8sClient.Status().Update(ctx, &probe)).To(Succeed())
-            }
-        }, TestTimeoutMedium, time.Second).Should(Succeed())
-
-        // Override valid for to 1 hour to make sure we are always premature
-        Eventually(func(g Gomega) {
-            g.Expect(k8sClient.Get(ctx, client.ObjectKeyFromObject(dnsRecord), dnsRecord)).To(Succeed())
-            // update only after we are ready
-            g.Expect(dnsRecord.Status.Conditions).To(
-                ContainElement(MatchFields(IgnoreExtras, Fields{
-                    "Type":   Equal(string(v1alpha1.ConditionTypeReady)),
-                    "Status": Equal(metav1.ConditionTrue),
-                })),
-            )
-            dnsRecord.Status.ValidFor = "1h"
-            g.Expect(k8sClient.Status().Update(ctx, dnsRecord)).To(Succeed())
-        }, TestTimeoutMedium, time.Second).Should(Succeed())
-
-        // validate that it is still 1 hour - we hit the premature loop
-        // we purposefully are not updating anything on premature loops
-        // so can't be sure if we have had a short loop or not yet
-        By("Verify premature loop did not occur")
-        Eventually(func(g Gomega) {
-            newRecord := &v1alpha1.DNSRecord{}
-            g.Expect(k8sClient.Get(ctx, client.ObjectKeyFromObject(dnsRecord), newRecord)).To(Succeed())
-            g.Expect(newRecord.Status.ValidFor).To(Equal("1h"))
-        }, TestTimeoutMedium, time.Second).Should(Succeed())
-
-        By("Making probes unhealthy")
-        Eventually(func(g Gomega) {
-            probes := &v1alpha1.DNSHealthCheckProbeList{}
-
-            g.Expect(k8sClient.List(ctx, probes, &client.ListOptions{
-                LabelSelector: labels.SelectorFromSet(map[string]string{
-                    ProbeOwnerLabel: BuildOwnerLabelValue(dnsRecord),
-                }),
-                Namespace: dnsRecord.Namespace,
-            })).To(Succeed())
-            g.Expect(len(probes.Items)).To(Equal(2))
-
-            // make probes healthy
-            for _, probe := range probes.Items {
-                probe.Status.Healthy = ptr.To(false)
-                probe.Status.LastCheckedAt = metav1.Now()
-                probe.Status.ConsecutiveFailures = 5
-                g.Expect(k8sClient.Status().Update(ctx, &probe)).To(Succeed())
-            }
-        }, TestTimeoutMedium, time.Second).Should(Succeed())
-
-        // check that record has valid for not an hour - we did the full reconcile and updated valid for
-        By("Validate premature loop occurred")
-        Eventually(func(g Gomega) {
-            g.Expect(k8sClient.Get(ctx, client.ObjectKeyFromObject(dnsRecord), dnsRecord)).To(Succeed())
-            // on the full loop we will always update ValidFor unless it is max duration.
-            // since we had 1 hour the logic should set it to the max.
-            // meaning that changing probes triggered dnsrecornd reconciler
-            // and we had a fool loop before we were supposed to
-            g.Expect(dnsRecord.Status.ValidFor).To(Equal(RequeueDuration.String()))
-        }, TestTimeoutMedium, time.Second).Should(Succeed())
-    })
     It("Should not publish unhealthy enpoints", func() {
         //Create default test dnsrecord
         Expect(k8sClient.Create(ctx, dnsRecord)).To(Succeed())
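
For readers skimming the diff, the following is a minimal, self-contained sketch of the enqueue decision the new mapping function makes: a probe event maps to a DNSRecord reconcile request only when the probe has actually reported a result (Status.Healthy is non-nil), the record already carries a Healthy condition, and the two disagree. The types and the shouldEnqueue helper are hypothetical stand-ins for illustration only; the real implementation above uses the v1alpha1 API, controller-runtime, and meta.FindStatusCondition.

package main

import "fmt"

// Hypothetical stand-ins for illustration; not the operator's API types.
type probeStatus struct {
    Healthy *bool // nil until the first probe run has completed
}

type recordStatus struct {
    HasHealthyCondition  bool // false until the record has been reconciled at least once
    HealthyConditionTrue bool // mirrors the DNSRecord "Healthy" condition status
}

// shouldEnqueue sketches the mapping logic: skip probes that have not
// reported yet, skip records with no Healthy condition, and requeue only
// when the probe result and the record condition disagree.
func shouldEnqueue(probe probeStatus, record recordStatus) bool {
    if probe.Healthy == nil {
        return false // haven't probed yet or the probe is being deleted
    }
    if !record.HasHealthyCondition {
        return false // record is not processed yet
    }
    return *probe.Healthy != record.HealthyConditionTrue
}

func main() {
    healthy := true
    // Probe reports healthy while the record condition is still false: requeue.
    fmt.Println(shouldEnqueue(probeStatus{Healthy: &healthy}, recordStatus{HasHealthyCondition: true}))
    // Probe has not reported yet: nothing to do.
    fmt.Println(shouldEnqueue(probeStatus{}, recordStatus{HasHealthyCondition: true, HealthyConditionTrue: true}))
}

Requeueing only on disagreement lets the controller react as soon as a probe flips state, while ignoring probe status updates that would not change the record's health.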