enable retry based on labels
Fixes #150

- handle retry when already in delete stages
- account for new generations and resources being deleted
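For illustration (not part of the commit message), a retry is requested by setting the change-cause label on the Terraform resource. The metadata below is a minimal sketch; the label value is hypothetical, following the timestamped convention used in this commit:

```yaml
metadata:
  labels:
    # a value ending in '.setup' retries from the setup task;
    # any other value retries from the init task
    kubernetes.io/change-cause: triggered-by-isa_aguilar-20231025T011600.setup
```

Changing the label to a value the controller has not yet recorded in the status is what arms a new retry.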
isaaguilar committed Oct 25, 2023
1 parent ec174bd commit 20ea3f9
Showing 5 changed files with 115 additions and 15 deletions.
14 changes: 13 additions & 1 deletion deploy/crds/tf.galleybytes.com_terraforms_crd.yaml
@@ -3,7 +3,8 @@ apiVersion: apiextensions.k8s.io/v1
kind: CustomResourceDefinition
metadata:
annotations:
controller-gen.kubebuilder.io/version: v0.12.0
controller-gen.kubebuilder.io/version: v0.9.2
creationTimestamp: null
name: terraforms.tf.galleybytes.com
spec:
group: tf.galleybytes.com
@@ -2563,6 +2564,17 @@ spec:
it, the chance of recycling existing resources is reduced to virtually
nil.
type: string
retryEventReson:
description: "RetryEventReason copies the value of the resource label
for 'kubernetes.io/change-cause'. When '.setup' is the suffix
of the value, the pipeline will retry from the setup task. \n Example
of starting from setup: \n ```yaml metadata: labels: kubernetes.io/change-cause:
triggered-by-isa_aguilar-20231025T011600.setup ``` \n A default
retry will start from the init task otherwise."
type: string
retryTimestamp:
format: date-time
type: string
stage:
description: Stage stores information about the current stage
properties:
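For illustration (not part of the diff), once a retry fires, the two new status fields above would surface on the resource roughly as follows. The values are hypothetical, and the key spelling follows the committed JSON tag retryEventReson:

```yaml
status:
  retryEventReson: triggered-by-isa_aguilar-20231025T011600.setup
  retryTimestamp: "2023-10-25T01:16:00Z"
```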
15 changes: 15 additions & 0 deletions pkg/apis/tf/v1beta1/terraform_types.go
@@ -675,6 +675,21 @@ type TerraformStatus struct {
// refreshed each generation.
// +optional
PluginsStarted []TaskName `json:"pluginsStarted,omitempty"`

// RetryEventReason copies the value of the resource label for 'kubernetes.io/change-cause'.
// When '.setup' is the suffix of the value, the pipeline will retry from the setup task.
//
// Example of starting from setup:
//
// ```yaml
// metadata:
// labels:
// kubernetes.io/change-cause: triggered-by-isa_aguilar-20231025T011600.setup
// ```
//
// A default retry will start from the init task otherwise.
RetryEventReason *string `json:"retryEventReson,omitempty"`
RetryTimestamp *metav1.Time `json:"retryTimestamp,omitempty"`
}

type Exported string
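To make the label-driven flow concrete, here is a standalone sketch (not code from this commit) of the decision the controller changes below implement: fire a retry only when the change-cause label is new, then pick the restart point by suffix. The function names and returned task strings are illustrative stand-ins for the tfv1beta1 constants.

```go
package main

import (
	"fmt"
	"strings"
)

// needsRetry reports whether a retry should fire: the change-cause label
// must be present and must differ from the reason recorded on the last retry.
func needsRetry(labels map[string]string, lastReason *string) (string, bool) {
	label, found := labels["kubernetes.io/change-cause"]
	if !found {
		return "", false
	}
	if lastReason != nil && *lastReason == label {
		return "", false // this change-cause was already handled
	}
	return label, true
}

// retryTask picks the restart point: a '.setup' suffix restarts the whole
// pipeline; anything else restarts from init (or the delete variants when
// the resource is being deleted).
func retryTask(reason string, deleting bool) string {
	fromSetup := strings.HasSuffix(reason, ".setup")
	switch {
	case deleting && fromSetup:
		return "setup-delete"
	case deleting:
		return "init-delete"
	case fromSetup:
		return "setup"
	default:
		return "init"
	}
}

func main() {
	labels := map[string]string{
		"kubernetes.io/change-cause": "triggered-by-isa_aguilar-20231025T011600.setup",
	}
	if reason, ok := needsRetry(labels, nil); ok {
		fmt.Println("retry from:", retryTask(reason, false)) // retry from: setup
	}
}
```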
9 changes: 9 additions & 0 deletions pkg/apis/tf/v1beta1/zz_generated.deepcopy.go

Some generated files are not rendered by default.

14 changes: 13 additions & 1 deletion pkg/apis/tf/v1beta1/zz_generated.openapi.go

Some generated files are not rendered by default.

78 changes: 65 additions & 13 deletions pkg/controllers/terraform_controller.go
@@ -670,14 +670,41 @@ func (r *ReconcileTerraform) Reconcile(ctx context.Context, request reconcile.Re
//
// }
// }
stage := r.checkSetNewStage(ctx, tf)

retry := false
if tf.Labels != nil {
if label, found := tf.Labels["kubernetes.io/change-cause"]; found {

if tf.Status.RetryEventReason == nil {
retry = true
} else if *tf.Status.RetryEventReason != label {
retry = true
}

if retry {
// Once a single retry is triggered via the change-cause label method,
// the retry* status entries will persist for the lifetime of
// the resource. This doesn't affect workflows, but it's a little annoying to see the
// status long after the retry has occurred. In the future, see if there is a way to clean
// up the status.
// As of today, attempting to clean the retry* status when the change-cause label still exists
// causes the controller to skip new generation steps like creating configmaps, secrets, etc.
// TODO clean retry* status
now := metav1.Now()
tf.Status.RetryEventReason = &label // saved via updateStatusWithRetry
tf.Status.RetryTimestamp = &now // saved via updateStatusWithRetry
}
}
}

stage := r.checkSetNewStage(ctx, tf, retry)
if stage != nil {
tf.Status.Stage = *stage
if stage.Reason == "RESTARTED_WORKFLOW" || stage.Reason == "RESTARTED_DELETE_WORKFLOW" {
_ = r.removeOldPlan(tf)
_ = r.removeOldPlan(tf.Namespace, tf.Name, tf.Status.Stage.Reason, tf.Generation)
// TODO what to do if the remove old plan function fails
}
reqLogger.V(2).Info(fmt.Sprintf("Stage moving from '%s' -> '%s'", tf.Status.Stage.TaskType, stage.TaskType))
tf.Status.Stage = *stage
desiredStatus := tf.Status
err := r.updateStatusWithRetry(ctx, tf, &desiredStatus, reqLogger)
if err != nil {
@@ -765,6 +792,16 @@ func (r *ReconcileTerraform) Reconcile(ctx context.Context, request reconcile.Re
return reconcile.Result{}, nil
}

if tf.Status.RetryTimestamp != nil {
podSlice := []corev1.Pod{}
for _, pod := range pods.Items {
if pod.CreationTimestamp.IsZero() || !pod.CreationTimestamp.Before(tf.Status.RetryTimestamp) {
podSlice = append(podSlice, pod)
}
}
pods.Items = podSlice
}

if len(pods.Items) == 0 && tf.Status.Stage.State == tfv1beta1.StateInProgress {
// This condition is generally met when the user deletes the pod.
// Force the state to transition away from in-progress and then
@@ -851,7 +888,7 @@ func (r *ReconcileTerraform) Reconcile(ctx context.Context, request reconcile.Re
reqLogger.V(1).Info(fmt.Sprintf("Setting up the '%s' pod", podType))
err := r.setupAndRun(ctx, tf, runOpts)
if err != nil {
reqLogger.Error(err, "")
reqLogger.Error(err, err.Error())
return reconcile.Result{}, err
}
if tf.Status.Phase == tfv1beta1.PhaseInitializing {
@@ -1030,7 +1067,7 @@ func getConfiguredTasks(taskOptions *[]tfv1beta1.TaskOption) []tfv1beta1.TaskNam
// When a stage has already triggered a pod, the only way for the pod to transition to the next stage is for
// the pod to complete successfully. Any other pod phase will keep the pod in the current stage, or in the
// case of the apply task, the workflow will be restarted.
func (r ReconcileTerraform) checkSetNewStage(ctx context.Context, tf *tfv1beta1.Terraform) *tfv1beta1.Stage {
func (r ReconcileTerraform) checkSetNewStage(ctx context.Context, tf *tfv1beta1.Terraform, isRetry bool) *tfv1beta1.Stage {
var isNewStage bool
var podType tfv1beta1.TaskName
var reason string
@@ -1052,8 +1089,23 @@ func (r ReconcileTerraform) checkSetNewStage(ctx context.Context, tf *tfv1beta1.
currentStageIsRunning := currentStage.State == tfv1beta1.StateInProgress
isNewGeneration := currentStage.Generation != tf.Generation

// resource status
if currentStageCanNotBeInterrupted && currentStageIsRunning {
if isRetry && !isToBeDeletedOrIsDeleting && !isNewGeneration {
isNewStage = true
reason = *tf.Status.RetryEventReason
podType = tfv1beta1.RunInit
if strings.HasSuffix(reason, ".setup") {
podType = tfv1beta1.RunSetup
}
interruptible = isTaskInterruptable(podType)
} else if isRetry && isToBeDeletedOrIsDeleting && !isNewGeneration {
isNewStage = true
reason = *tf.Status.RetryEventReason
podType = tfv1beta1.RunInitDelete
if strings.HasSuffix(reason, ".setup") {
podType = tfv1beta1.RunSetupDelete
}
interruptible = isTaskInterruptable(podType)
} else if currentStageCanNotBeInterrupted && currentStageIsRunning {
// Cannot change to the next stage because the current stage cannot be
// interrupted and is currently running
isNewStage = false
@@ -1125,20 +1177,20 @@ func (r ReconcileTerraform) checkSetNewStage(ctx context.Context, tf *tfv1beta1.

}

func (r ReconcileTerraform) removeOldPlan(tf *tfv1beta1.Terraform) error {
func (r ReconcileTerraform) removeOldPlan(namespace, name, reason string, generation int64) error {
labelSelectors := []string{
fmt.Sprintf("terraforms.tf.galleybytes.com/generation==%d", tf.Generation),
fmt.Sprintf("terraforms.tf.galleybytes.com/resourceName=%s", tf.Name),
fmt.Sprintf("terraforms.tf.galleybytes.com/generation==%d", generation),
fmt.Sprintf("terraforms.tf.galleybytes.com/resourceName=%s", name),
"app.kubernetes.io/instance",
}
if tf.Status.Stage.Reason == "RESTARTED_WORKFLOW" {
if reason == "RESTARTED_WORKFLOW" {
labelSelectors = append(labelSelectors, []string{
fmt.Sprintf("app.kubernetes.io/instance!=%s", tfv1beta1.RunSetup),
fmt.Sprintf("app.kubernetes.io/instance!=%s", tfv1beta1.RunPreInit),
fmt.Sprintf("app.kubernetes.io/instance!=%s", tfv1beta1.RunInit),
fmt.Sprintf("app.kubernetes.io/instance!=%s", tfv1beta1.RunPostInit),
}...)
} else if tf.Status.Stage.Reason == "RESTARTED_DELETE_WORKFLOW" {
} else if reason == "RESTARTED_DELETE_WORKFLOW" {
labelSelectors = append(labelSelectors, []string{
fmt.Sprintf("app.kubernetes.io/instance!=%s", tfv1beta1.RunSetupDelete),
fmt.Sprintf("app.kubernetes.io/instance!=%s", tfv1beta1.RunPreInitDelete),
@@ -1157,7 +1209,7 @@ func (r ReconcileTerraform) removeOldPlan(tf *tfv1beta1.Terraform) error {
err = r.Client.DeleteAllOf(context.TODO(), &corev1.Pod{}, &client.DeleteAllOfOptions{
ListOptions: client.ListOptions{
LabelSelector: labelSelector,
Namespace: tf.Namespace,
Namespace: namespace,
FieldSelector: fieldSelector,
},
})
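As a rough usage sketch (not part of the commit), the selector removeOldPlan now assembles from its plain namespace/name/reason/generation arguments would look like the following for a RESTARTED_WORKFLOW retry. The resource name, generation, and task-name label values are hypothetical:

```go
package main

import (
	"fmt"
	"strings"
)

func main() {
	// Pods from the current generation of this resource are targeted,
	// while the setup/init family of task pods is excluded via
	// 'app.kubernetes.io/instance!=...' clauses.
	selectors := []string{
		"terraforms.tf.galleybytes.com/generation==3",
		"terraforms.tf.galleybytes.com/resourceName=my-workflow",
		"app.kubernetes.io/instance",
		"app.kubernetes.io/instance!=setup",
		"app.kubernetes.io/instance!=preinit",
		"app.kubernetes.io/instance!=init",
		"app.kubernetes.io/instance!=postinit",
	}
	fmt.Println(strings.Join(selectors, ","))
}
```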
