Skip to content

Commit

Permalink
Merge pull request #1460 from flux-iac/feat/exponential-retry-on-failure
Browse files Browse the repository at this point in the history
Added exponential backoff on reconciliation failure
  • Loading branch information
akselleirv authored Oct 14, 2024
2 parents 3bbe9d6 + 69475bd commit 9d30779
Show file tree
Hide file tree
Showing 6 changed files with 232 additions and 4 deletions.
44 changes: 40 additions & 4 deletions api/v1alpha2/terraform_types.go
Original file line number Diff line number Diff line change
Expand Up @@ -19,12 +19,12 @@ package v1alpha2
import (
"bytes"
"fmt"
"math"
"net"
"strings"
"time"
"unicode/utf8"

"github.com/flux-iac/tofu-controller/api/planid"
"github.com/fluxcd/pkg/apis/meta"
sourcev1 "github.com/fluxcd/source-controller/api/v1"
corev1 "k8s.io/api/core/v1"
Expand All @@ -33,6 +33,8 @@ import (
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/runtime"
"k8s.io/apimachinery/pkg/runtime/serializer"

"github.com/flux-iac/tofu-controller/api/planid"
)

const (
Expand Down Expand Up @@ -145,6 +147,21 @@ type TerraformSpec struct {
// +optional
RetryInterval *metav1.Duration `json:"retryInterval,omitempty"`

// The strategy to use when retrying a previously failed reconciliation.
// The default strategy is StaticInterval and the retry interval is based on the RetryInterval value.
// The ExponentialBackoff strategy uses the formula: 2^reconciliationFailures * RetryInterval with a
// maximum requeue duration of MaxRetryInterval.
// +kubebuilder:validation:Enum=StaticInterval;ExponentialBackoff
// +kubebuilder:default:string=StaticInterval
// +optional
RetryStrategy RetryStrategyEnum `json:"retryStrategy,omitempty"`

// The maximum requeue duration after a previously failed reconciliation.
// Only applicable when RetryStrategy is set to ExponentialBackoff.
// The default value is 24 hours when not specified.
// +optional
MaxRetryInterval *metav1.Duration `json:"maxRetryInterval,omitempty"`

// Path to the directory containing Terraform (.tf) files.
// Defaults to 'None', which translates to the root path of the SourceRef.
// +optional
Expand Down Expand Up @@ -521,6 +538,13 @@ const (
ForceUnlockEnumNo ForceUnlockEnum = "no"
)

// RetryStrategyEnum names the strategy used to compute the delay before
// retrying a previously failed reconciliation.
type RetryStrategyEnum string

const (
// StaticInterval retries at the interval given by RetryInterval.
StaticInterval RetryStrategyEnum = "StaticInterval"
// ExponentialBackoff retries after 2^reconciliationFailures * RetryInterval,
// with MaxRetryInterval as the maximum requeue duration.
ExponentialBackoff RetryStrategyEnum = "ExponentialBackoff"
)

const (
TerraformKind = "Terraform"
TerraformFinalizer = "finalizers.tf.contrib.fluxcd.io"
Expand Down Expand Up @@ -892,12 +916,24 @@ func (in Terraform) GetDependsOn() []meta.NamespacedObjectReference {

// GetRetryInterval returns the duration to wait before retrying a previously
// failed reconciliation.
//
// The base interval is Spec.RetryInterval, or 15 seconds when it is not set.
// With the StaticInterval strategy (or when no strategy is set) the base
// interval is returned unchanged. With the ExponentialBackoff strategy the
// result is 2^Status.ReconciliationFailures * base interval, capped at
// Spec.MaxRetryInterval (24 hours when not set).
//
// The exponential term is evaluated in floating point and compared against the
// cap before being converted back to a time.Duration, so a large failure count
// can never overflow int64 and yield a negative or wrapped-around interval.
func (in Terraform) GetRetryInterval() time.Duration {
	// The default retry interval is 15 seconds.
	retryInterval := 15 * time.Second
	if in.Spec.RetryInterval != nil {
		retryInterval = in.Spec.RetryInterval.Duration
	}

	if in.Spec.RetryStrategy != ExponentialBackoff {
		return retryInterval
	}

	// The default maximum retry interval is 24 hours.
	maxRetryInterval := 24 * time.Hour
	if in.Spec.MaxRetryInterval != nil {
		maxRetryInterval = in.Spec.MaxRetryInterval.Duration
	}

	// Only scale a positive interval by a positive failure count; anything else
	// has no meaningful backoff and is passed through to the cap check below.
	if failures := in.Status.ReconciliationFailures; failures > 0 && retryInterval > 0 {
		// math.Pow saturates to +Inf instead of wrapping, so the comparison
		// stays correct for arbitrarily large failure counts.
		backoff := float64(retryInterval) * math.Pow(2, float64(failures))
		if backoff >= float64(maxRetryInterval) {
			return maxRetryInterval
		}
		// backoff is strictly below the cap here, hence within int64 range.
		retryInterval = time.Duration(backoff)
	}

	if retryInterval > maxRetryInterval {
		return maxRetryInterval
	}
	return retryInterval
}

// GetStatusConditions returns a pointer to the Status.Conditions slice.
Expand Down
81 changes: 81 additions & 0 deletions api/v1alpha2/terraform_types_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,81 @@
package v1alpha2

import (
"testing"
"time"

. "github.com/onsi/gomega"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
)

func TestGetRetryInterval(t *testing.T) {
g := NewGomegaWithT(t)

tests := []struct {
name string
terraform Terraform
expectedRetryInterval time.Duration
}{
{
name: "default retry interval",
terraform: Terraform{
Spec: TerraformSpec{},
},
expectedRetryInterval: 15 * time.Second,
},
{
name: "custom retry interval",
terraform: Terraform{
Spec: TerraformSpec{
RetryInterval: &metav1.Duration{Duration: 30 * time.Second},
},
},
expectedRetryInterval: 30 * time.Second,
},
{
name: "exponential backoff with default retry interval",
terraform: Terraform{
Spec: TerraformSpec{
RetryStrategy: ExponentialBackoff,
},
Status: TerraformStatus{
ReconciliationFailures: 2,
},
},
expectedRetryInterval: 60 * time.Second,
},
{
name: "exponential backoff",
terraform: Terraform{
Spec: TerraformSpec{
RetryStrategy: ExponentialBackoff,
RetryInterval: &metav1.Duration{Duration: 60 * time.Second},
},
Status: TerraformStatus{
ReconciliationFailures: 4,
},
},
expectedRetryInterval: 960 * time.Second,
},
{
name: "exponential backoff with max retry interval",
terraform: Terraform{
Spec: TerraformSpec{
RetryStrategy: ExponentialBackoff,
RetryInterval: &metav1.Duration{Duration: 60 * time.Second},
MaxRetryInterval: &metav1.Duration{Duration: 45 * time.Second},
},
Status: TerraformStatus{
ReconciliationFailures: 4,
},
},
expectedRetryInterval: 45 * time.Second,
},
}

for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
g.Expect(tt.terraform.GetRetryInterval()).To(Equal(tt.expectedRetryInterval))
})
}
}
5 changes: 5 additions & 0 deletions api/v1alpha2/zz_generated.deepcopy.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

17 changes: 17 additions & 0 deletions charts/tofu-controller/crds/crds.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -5536,6 +5536,12 @@ spec:
interval:
description: The interval at which to reconcile the Terraform.
type: string
maxRetryInterval:
description: |-
The maximum requeue duration after a previously failed reconciliation.
Only applicable when RetryStrategy is set to ExponentialBackoff.
The default value is 24 hours when not specified.
type: string
parallelism:
default: 0
description: Parallelism limits the number of concurrent operations
Expand Down Expand Up @@ -5587,6 +5593,17 @@ spec:
The interval at which to retry a previously failed reconciliation.
The default value is 15 seconds when not specified.
type: string
retryStrategy:
default: StaticInterval
description: |-
The strategy to use when retrying a previously failed reconciliation.
The default strategy is StaticInterval and the retry interval is based on the RetryInterval value.
The ExponentialBackoff strategy uses the formula: 2^reconciliationFailures * RetryInterval with a
maximum requeue duration of MaxRetryInterval.
enum:
- StaticInterval
- ExponentialBackoff
type: string
runnerPodTemplate:
properties:
metadata:
Expand Down
17 changes: 17 additions & 0 deletions config/crd/bases/infra.contrib.fluxcd.io_terraforms.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -5536,6 +5536,12 @@ spec:
interval:
description: The interval at which to reconcile the Terraform.
type: string
maxRetryInterval:
description: |-
The maximum requeue duration after a previously failed reconciliation.
Only applicable when RetryStrategy is set to ExponentialBackoff.
The default value is 24 hours when not specified.
type: string
parallelism:
default: 0
description: Parallelism limits the number of concurrent operations
Expand Down Expand Up @@ -5587,6 +5593,17 @@ spec:
The interval at which to retry a previously failed reconciliation.
The default value is 15 seconds when not specified.
type: string
retryStrategy:
default: StaticInterval
description: |-
The strategy to use when retrying a previously failed reconciliation.
The default strategy is StaticInterval and the retry interval is based on the RetryInterval value.
The ExponentialBackoff strategy uses the formula: 2^reconciliationFailures * RetryInterval with a
maximum requeue duration of MaxRetryInterval.
enum:
- StaticInterval
- ExponentialBackoff
type: string
runnerPodTemplate:
properties:
metadata:
Expand Down
72 changes: 72 additions & 0 deletions docs/References/terraform.md
Original file line number Diff line number Diff line change
Expand Up @@ -794,6 +794,12 @@ string
</table>
</div>
</div>
<h3 id="infra.contrib.fluxcd.io/v1alpha2.RetryStrategyEnum">RetryStrategyEnum
(<code>string</code> alias)</h3>
<p>
(<em>Appears on:</em>
<a href="#infra.contrib.fluxcd.io/v1alpha2.TerraformSpec">TerraformSpec</a>)
</p>
<h3 id="infra.contrib.fluxcd.io/v1alpha2.RunnerPodMetadata">RunnerPodMetadata
</h3>
<p>
Expand Down Expand Up @@ -1568,6 +1574,39 @@ The default value is 15 when not specified.</p>
</tr>
<tr>
<td>
<code>retryStrategy</code><br>
<em>
<a href="#infra.contrib.fluxcd.io/v1alpha2.RetryStrategyEnum">
RetryStrategyEnum
</a>
</em>
</td>
<td>
<em>(Optional)</em>
<p>The strategy to use when retrying a previously failed reconciliation.
The default strategy is StaticInterval and the retry interval is based on the RetryInterval value.
The ExponentialBackoff strategy uses the formula: 2^reconciliationFailures * RetryInterval with a
maximum requeue duration of MaxRetryInterval.</p>
</td>
</tr>
<tr>
<td>
<code>maxRetryInterval</code><br>
<em>
<a href="https://godoc.org/k8s.io/apimachinery/pkg/apis/meta/v1#Duration">
Kubernetes meta/v1.Duration
</a>
</em>
</td>
<td>
<em>(Optional)</em>
<p>The maximum requeue duration after a previously failed reconciliation.
Only applicable when RetryStrategy is set to ExponentialBackoff.
The default value is 24 hours when not specified.</p>
</td>
</tr>
<tr>
<td>
<code>path</code><br>
<em>
string
Expand Down Expand Up @@ -2131,6 +2170,39 @@ The default value is 15 when not specified.</p>
</tr>
<tr>
<td>
<code>retryStrategy</code><br>
<em>
<a href="#infra.contrib.fluxcd.io/v1alpha2.RetryStrategyEnum">
RetryStrategyEnum
</a>
</em>
</td>
<td>
<em>(Optional)</em>
<p>The strategy to use when retrying a previously failed reconciliation.
The default strategy is StaticInterval and the retry interval is based on the RetryInterval value.
The ExponentialBackoff strategy uses the formula: 2^reconciliationFailures * RetryInterval with a
maximum requeue duration of MaxRetryInterval.</p>
</td>
</tr>
<tr>
<td>
<code>maxRetryInterval</code><br>
<em>
<a href="https://godoc.org/k8s.io/apimachinery/pkg/apis/meta/v1#Duration">
Kubernetes meta/v1.Duration
</a>
</em>
</td>
<td>
<em>(Optional)</em>
<p>The maximum requeue duration after a previously failed reconciliation.
Only applicable when RetryStrategy is set to ExponentialBackoff.
The default value is 24 hours when not specified.</p>
</td>
</tr>
<tr>
<td>
<code>path</code><br>
<em>
string
Expand Down

0 comments on commit 9d30779

Please sign in to comment.