reconcile premature on probes nil -> healthy #317

Merged · 1 commit · Nov 26, 2024
2 changes: 1 addition & 1 deletion api/v1alpha1/dnshealthcheckprobe_types.go

```diff
@@ -79,7 +79,7 @@ type DNSHealthCheckProbeStatus struct {
 	ConsecutiveFailures int    `json:"consecutiveFailures,omitempty"`
 	Reason              string `json:"reason,omitempty"`
 	Status              int    `json:"status,omitempty"`
-	Healthy             *bool  `json:"healthy"`
+	Healthy             *bool  `json:"healthy,omitempty"`
 	ObservedGeneration  int64  `json:"observedGeneration,omitempty"`
 }
```
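This tag change is the crux of the PR: with `json:"healthy"` and no `omitempty`, a nil `Healthy` pointer serializes as `"healthy": null`, so a probe that has not yet produced a result is hard to tell apart from one that reported a value. A minimal sketch of the new serialization behaviour; `probeStatus` here is a trimmed-down illustrative stand-in, not the real type:

```go
package main

import (
	"encoding/json"
	"fmt"
)

// probeStatus is a trimmed-down stand-in for DNSHealthCheckProbeStatus,
// kept here only to illustrate the tag change.
type probeStatus struct {
	Healthy *bool `json:"healthy,omitempty"`
}

func main() {
	healthy := true

	// nil pointer + omitempty: the key disappears instead of rendering
	// as "healthy": null, so "never probed" is distinguishable from
	// an explicit result.
	unprobed, _ := json.Marshal(probeStatus{})
	probed, _ := json.Marshal(probeStatus{Healthy: &healthy})

	fmt.Println(string(unprobed)) // {}
	fmt.Println(string(probed))   // {"healthy":true}
}
```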
2 changes: 1 addition & 1 deletion bundle/manifests/dns-operator.clusterserviceversion.yaml

```diff
@@ -58,7 +58,7 @@ metadata:
     capabilities: Basic Install
     categories: Integration & Delivery
     containerImage: quay.io/kuadrant/dns-operator:latest
-    createdAt: "2024-11-15T11:06:08Z"
+    createdAt: "2024-11-26T11:14:48Z"
     description: A Kubernetes Operator to manage the lifecycle of DNS resources
     operators.operatorframework.io/builder: operator-sdk-v1.33.0
     operators.operatorframework.io/project_layout: go.kubebuilder.io/v4
```
2 changes: 0 additions & 2 deletions bundle/manifests/kuadrant.io_dnshealthcheckprobes.yaml

```diff
@@ -114,8 +114,6 @@ spec:
                   type: string
                 status:
                   type: integer
-              required:
-              - healthy
               type: object
           type: object
       served: true
```
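The same two-line deletion is repeated verbatim in the chart and base-CRD copies below; all three YAML files carry the schema derived from the Go type above (presumably via controller-gen, given the kubebuilder layout), so dropping `required: - healthy` keeps them consistent with the new `omitempty` tag.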
2 changes: 0 additions & 2 deletions charts/dns-operator/templates/manifests.yaml

```diff
@@ -115,8 +115,6 @@ spec:
                   type: string
                 status:
                   type: integer
-              required:
-              - healthy
               type: object
           type: object
       served: true
```
2 changes: 0 additions & 2 deletions config/crd/bases/kuadrant.io_dnshealthcheckprobes.yaml

```diff
@@ -114,8 +114,6 @@ spec:
                   type: string
                 status:
                   type: integer
-              required:
-              - healthy
               type: object
          type: object
      served: true
```
42 changes: 22 additions & 20 deletions internal/controller/dnsrecord_controller.go

```diff
@@ -389,36 +389,39 @@ func (r *DNSRecordReconciler) SetupWithManager(mgr ctrl.Manager, maxRequeue, val
 			logger.V(1).Info("unexpected object type", "error", fmt.Sprintf("%T is not a *v1alpha1.DNSHealthCheckProbe", o))
 			return []reconcile.Request{}
 		}
-		onCluster := &v1alpha1.DNSHealthCheckProbe{}
-		if err := mgr.GetClient().Get(ctx, client.ObjectKeyFromObject(probe), onCluster); client.IgnoreNotFound(err) != nil {
-			logger.Error(err, "failed to get probe")
+
+		// haven't probed yet or deleting - nothing to do
+		if probe.Status.Healthy == nil {
 			return []reconcile.Request{}
 		}
 
-		probeOwner := &v1alpha1.DNSRecord{
-			ObjectMeta: metav1.ObjectMeta{},
-		}
+		record := &v1alpha1.DNSRecord{ObjectMeta: metav1.ObjectMeta{}}
 		for _, ro := range probe.GetOwnerReferences() {
 			if ro.Kind == "DNSRecord" {
-				probeOwner.Name = ro.Name
-				probeOwner.Namespace = probe.Namespace
+				record.Name = ro.Name
+				record.Namespace = probe.Namespace
 				break
 			}
 		}
 
-		// haven't probed yet or deleting - nothing to do
-		if probe.Status.Healthy == nil {
+		if err := mgr.GetClient().Get(ctx, client.ObjectKeyFromObject(record), record); client.IgnoreNotFound(err) != nil {
+			logger.Error(err, "failed to get record")
 			return []reconcile.Request{}
 		}
 
-		// probe.Status.Healthy is not nil here
-		// we probed for the first time - reconcile
-		// or none are nil and status changed - reconcile
-		if onCluster.Status.Healthy == nil || *probe.Status.Healthy != *onCluster.Status.Healthy {
-			return []reconcile.Request{{NamespacedName: client.ObjectKeyFromObject(probeOwner)}}
+		condition := meta.FindStatusCondition(record.Status.Conditions, string(v1alpha1.ConditionTypeHealthy))
+		// no condition - record is not processed yet
+		if condition == nil {
+			return []reconcile.Request{}
 		}
 
-		// none are nil and status is the same - nothing to do
+		isHealthy := condition.Status == metav1.ConditionTrue
+
+		// record and probe disagree on health - requeue
+		if *probe.Status.Healthy != isHealthy {
+			return []reconcile.Request{{NamespacedName: client.ObjectKeyFromObject(record)}}
+		}
+		// nothing to do
 		return []reconcile.Request{}
 	})).
 	Complete(r)
```
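The rewritten map function no longer re-fetches the probe and compares it against a possibly stale cached copy of itself; it fetches the owning DNSRecord and compares the probe's result against the record's `Healthy` condition, enqueuing a reconcile only when they disagree. A standalone sketch of that comparison, using the real `k8s.io/apimachinery` helper but a hand-built condition list ("Healthy" and "Ready" strings stand in for the v1alpha1 constants):

```go
package main

import (
	"fmt"

	"k8s.io/apimachinery/pkg/api/meta"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
)

func main() {
	// Hand-built conditions standing in for record.Status.Conditions;
	// the "Healthy" string mirrors v1alpha1.ConditionTypeHealthy.
	conditions := []metav1.Condition{
		{Type: "Ready", Status: metav1.ConditionTrue},
		{Type: "Healthy", Status: metav1.ConditionFalse},
	}

	condition := meta.FindStatusCondition(conditions, "Healthy")
	if condition == nil {
		// no condition - record is not processed yet, nothing to enqueue
		fmt.Println("skip")
		return
	}

	recordHealthy := condition.Status == metav1.ConditionTrue
	probeHealthy := true // what the probe event just reported

	// Enqueue the owning DNSRecord only when the two views disagree.
	fmt.Println("reconcile needed:", probeHealthy != recordHealthy) // reconcile needed: true
}
```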
```diff
@@ -490,14 +493,13 @@ func recordReceivedPrematurely(record *v1alpha1.DNSRecord, probes *v1alpha1.DNSH
 	// healthy is true only if we have probes and they are all healthy
 	isHealthy := healthyCond.Status == metav1.ConditionTrue
 
-	// if at least one not healthy - this will lock in false
-	allProbesHealthy := true
+	// if at least one is healthy - this will lock in true
+	allProbesHealthy := false
 	for _, probe := range probes.Items {
 		if probe.Status.Healthy != nil {
-			allProbesHealthy = allProbesHealthy && *probe.Status.Healthy
+			allProbesHealthy = allProbesHealthy || *probe.Status.Healthy
 		}
 	}
-
 	// prematurely is true here. return false in case we need full reconcile
 	return isHealthy == allProbesHealthy, requeueIn
 }
```
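This second hunk flips the probe aggregation in `recordReceivedPrematurely` from all-healthy to any-healthy, so the premature check agrees with a record whose `Healthy` condition is true while at least one endpoint still passes. A self-contained sketch of the two folds side by side, where `probeResults` stands in for `probes.Items`:

```go
package main

import "fmt"

func main() {
	t, f := true, false
	// probeResults stands in for probes.Items; nil means the probe
	// has not reported yet and is skipped by both folds.
	probeResults := []*bool{&t, &f, nil}

	allHealthy := true  // old fold: one unhealthy probe locks in false
	anyHealthy := false // new fold: one healthy probe locks in true
	for _, healthy := range probeResults {
		if healthy != nil {
			allHealthy = allHealthy && *healthy
			anyHealthy = anyHealthy || *healthy
		}
	}

	fmt.Println(allHealthy, anyHealthy) // false true
}
```

Note that the PR keeps the `allProbesHealthy` name even though the fold is now an OR, so after this change the variable effectively means "at least one probe is healthy".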
81 changes: 0 additions & 81 deletions internal/controller/dnsrecord_healthchecks_test.go

```diff
@@ -265,87 +265,6 @@ var _ = Describe("DNSRecordReconciler_HealthChecks", func() {
 		}, TestTimeoutMedium, time.Second).Should(Succeed())
 	})
 
-	It("Should reconcile prematurely on healthcheck change", func() {
-		// Create a default record
-		Expect(k8sClient.Create(ctx, dnsRecord)).To(Succeed())
-
-		By("Making probes healthy")
-		Eventually(func(g Gomega) {
-			probes := &v1alpha1.DNSHealthCheckProbeList{}
-
-			g.Expect(k8sClient.List(ctx, probes, &client.ListOptions{
-				LabelSelector: labels.SelectorFromSet(map[string]string{
-					ProbeOwnerLabel: BuildOwnerLabelValue(dnsRecord),
-				}),
-				Namespace: dnsRecord.Namespace,
-			})).To(Succeed())
-			g.Expect(len(probes.Items)).To(Equal(2))
-
-			// make probes healthy
-			for _, probe := range probes.Items {
-				probe.Status.Healthy = ptr.To(true)
-				probe.Status.LastCheckedAt = metav1.Now()
-				probe.Status.ConsecutiveFailures = 0
-				g.Expect(k8sClient.Status().Update(ctx, &probe)).To(Succeed())
-			}
-		}, TestTimeoutMedium, time.Second).Should(Succeed())
-
-		// Override valid for to 1 hour to make sure we are always premature
-		Eventually(func(g Gomega) {
-			g.Expect(k8sClient.Get(ctx, client.ObjectKeyFromObject(dnsRecord), dnsRecord)).To(Succeed())
-			// update only after we are ready
-			g.Expect(dnsRecord.Status.Conditions).To(
-				ContainElement(MatchFields(IgnoreExtras, Fields{
-					"Type":   Equal(string(v1alpha1.ConditionTypeReady)),
-					"Status": Equal(metav1.ConditionTrue),
-				})),
-			)
-			dnsRecord.Status.ValidFor = "1h"
-			g.Expect(k8sClient.Status().Update(ctx, dnsRecord)).To(Succeed())
-		}, TestTimeoutMedium, time.Second).Should(Succeed())
-
-		// validate that it is still 1 hour - we hit the premature loop
-		// we purposefully are not updating anything on premature loops
-		// so can't be sure if we have had a short loop or not yet
-		By("Verify premature loop did not occur")
-		Eventually(func(g Gomega) {
-			newRecord := &v1alpha1.DNSRecord{}
-			g.Expect(k8sClient.Get(ctx, client.ObjectKeyFromObject(dnsRecord), newRecord)).To(Succeed())
-			g.Expect(newRecord.Status.ValidFor).To(Equal("1h"))
-		}, TestTimeoutMedium, time.Second).Should(Succeed())
-
-		By("Making probes unhealthy")
-		Eventually(func(g Gomega) {
-			probes := &v1alpha1.DNSHealthCheckProbeList{}
-
-			g.Expect(k8sClient.List(ctx, probes, &client.ListOptions{
-				LabelSelector: labels.SelectorFromSet(map[string]string{
-					ProbeOwnerLabel: BuildOwnerLabelValue(dnsRecord),
-				}),
-				Namespace: dnsRecord.Namespace,
-			})).To(Succeed())
-			g.Expect(len(probes.Items)).To(Equal(2))
-
-			// make probes unhealthy
-			for _, probe := range probes.Items {
-				probe.Status.Healthy = ptr.To(false)
-				probe.Status.LastCheckedAt = metav1.Now()
-				probe.Status.ConsecutiveFailures = 5
-				g.Expect(k8sClient.Status().Update(ctx, &probe)).To(Succeed())
-			}
-		}, TestTimeoutMedium, time.Second).Should(Succeed())
-
-		// check that record's valid for is no longer an hour - we did the full reconcile and updated valid for
-		By("Validate premature loop occurred")
-		Eventually(func(g Gomega) {
-			g.Expect(k8sClient.Get(ctx, client.ObjectKeyFromObject(dnsRecord), dnsRecord)).To(Succeed())
-			// on the full loop we will always update ValidFor unless it is max duration.
-			// since we had 1 hour the logic should set it to the max.
-			// meaning that changing probes triggered the dnsrecord reconciler
-			// and we had a full loop before we were supposed to
-			g.Expect(dnsRecord.Status.ValidFor).To(Equal(RequeueDuration.String()))
-		}, TestTimeoutMedium, time.Second).Should(Succeed())
-	})
 	It("Should not publish unhealthy endpoints", func() {
 		// Create default test dnsrecord
 		Expect(k8sClient.Create(ctx, dnsRecord)).To(Succeed())
```