From 5de275cdad048a24e3ba017a203944fc9290ba03 Mon Sep 17 00:00:00 2001 From: Isa Aguilar Date: Wed, 25 Oct 2023 01:53:25 -0400 Subject: [PATCH] enable retry based on labels Fixes https://github.com/GalleyBytes/terraform-operator/issues/150 WIP still need to handle - retry when already in delete stages - cleanup of status after generation changes - fix typos in comments and/or descriptions --- .../tf.galleybytes.com_terraforms_crd.yaml | 15 +++++++- pkg/apis/tf/v1beta1/terraform_types.go | 17 +++++++++ pkg/controllers/terraform_controller.go | 38 +++++++++++++++++++ 3 files changed, 69 insertions(+), 1 deletion(-) diff --git a/deploy/crds/tf.galleybytes.com_terraforms_crd.yaml b/deploy/crds/tf.galleybytes.com_terraforms_crd.yaml index ab416fc..36d655a 100644 --- a/deploy/crds/tf.galleybytes.com_terraforms_crd.yaml +++ b/deploy/crds/tf.galleybytes.com_terraforms_crd.yaml @@ -3,7 +3,8 @@ apiVersion: apiextensions.k8s.io/v1 kind: CustomResourceDefinition metadata: annotations: - controller-gen.kubebuilder.io/version: v0.12.0 + controller-gen.kubebuilder.io/version: v0.9.2 + creationTimestamp: null name: terraforms.tf.galleybytes.com spec: group: tf.galleybytes.com @@ -2563,6 +2564,18 @@ spec: it, the chance of recycling existing resources is reduced to virtually nil. type: string + retryEventReson: + description: "RetryEventReason writes a copy of the current resource + label for 'kubernetes.io/change-cause'. The value will be written + here as the retry reason. \n When '.setup' is the suffix, + the pipeline will be started from the setup task. \n Example of + starting from setup: \n ```yaml metadata: labels: kubernetes.io/change-cause: + triggered-by-isa_aguilar-20231025T011600.setup ``` \n A default + retry will start from the init task otherwise." 
+ type: string + retryTimestamp: + format: date-time + type: string stage: description: Stage stores information about the current stage properties: diff --git a/pkg/apis/tf/v1beta1/terraform_types.go b/pkg/apis/tf/v1beta1/terraform_types.go index e0e78b3..8aab2c8 100644 --- a/pkg/apis/tf/v1beta1/terraform_types.go +++ b/pkg/apis/tf/v1beta1/terraform_types.go @@ -675,6 +675,23 @@ type TerraformStatus struct { // refreshed each generation. // +optional PluginsStarted []TaskName `json:"pluginsStarted,omitempty"` + + // RetryEventReason writes a copy of the current resource label for 'kubernetes.io/change-cause'. The + // value will be written here as the retry reason. + // + // When '.setup' is the suffix, the pipeline will be started from the setup task. + // + // Example of starting from setup: + // + // ```yaml + // metadata: + // labels: + // kubernetes.io/change-cause: triggered-by-isa_aguilar-20231025T011600.setup + // ``` + // + // A default retry will start from the init task otherwise. 
+ RetryEventReason *string `json:"retryEventReson,omitempty"` + RetryTimestamp *metav1.Time `json:"retryTimestamp,omitempty"` } type Exported string diff --git a/pkg/controllers/terraform_controller.go b/pkg/controllers/terraform_controller.go index cff6fbd..d13c61c 100644 --- a/pkg/controllers/terraform_controller.go +++ b/pkg/controllers/terraform_controller.go @@ -588,6 +588,34 @@ func (r *ReconcileTerraform) Reconcile(ctx context.Context, request reconcile.Re return reconcile.Result{}, err } + if tf.Labels != nil { + if label, found := tf.Labels["kubernetes.io/change-cause"]; found { + retry := false + if tf.Status.RetryEventReason == nil { + retry = true + } else if *tf.Status.RetryEventReason != label { + retry = true + } + + if retry { + now := metav1.Now() + tf.Status.RetryEventReason = &label + tf.Status.RetryTimestamp = &now + taskType := tfv1beta1.RunInit + if strings.HasSuffix(label, ".setup") { + taskType = tfv1beta1.RunSetup + } + stage := newStage(tf, taskType, label, isTaskInterruptable(taskType), tfv1beta1.StateInitializing) + tf.Status.Stage = *stage + err := r.updateStatusWithRetry(ctx, tf, &tf.Status, reqLogger) + if err != nil { + reqLogger.V(1).Info(err.Error()) + } + return reconcile.Result{}, nil + } + } + } + // Final delete by removing finalizers if tf.Status.Phase == tfv1beta1.PhaseDeleted { reqLogger.Info("Remove finalizers") @@ -765,6 +793,16 @@ func (r *ReconcileTerraform) Reconcile(ctx context.Context, request reconcile.Re return reconcile.Result{}, nil } + if tf.Status.RetryTimestamp != nil { + podSlice := []corev1.Pod{} + for _, pod := range pods.Items { + if pod.CreationTimestamp.IsZero() || !pod.CreationTimestamp.Before(tf.Status.RetryTimestamp) { + podSlice = append(podSlice, pod) + } + } + pods.Items = podSlice + } + if len(pods.Items) == 0 && tf.Status.Stage.State == tfv1beta1.StateInProgress { // This condition is generally met when the user deletes the pod. // Force the state to transition away from in-progress and then