From 11c80b81771b32d6554259f094d2cc1bddea62d1 Mon Sep 17 00:00:00 2001 From: Joe Lanford Date: Sat, 11 Apr 2026 08:28:48 -0400 Subject: [PATCH 1/7] =?UTF-8?q?=F0=9F=90=9B=20fix:=20allow=20reconciliatio?= =?UTF-8?q?n=20of=20deadline-exceeded=20ClusterObjectSets?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Previously, a watch predicate dropped update events for ClusterObjectSets with ProgressDeadlineExceeded, preventing them from being archived or cleaned up. Remove the predicate and replace the inline deadline logic in Reconcile with an enforceProgressDeadline() helper and a progressDeadline() function that computes the absolute deadline from spec and metadata. Add a deadline-aware rate limiter that caps exponential backoff at the deadline and allows one immediate requeue when it passes, without inspecting status conditions. Add an e2e scenario that forces ProgressDeadlineExceeded, archives the COS, and verifies resource cleanup. --- .../clusterobjectset_controller.go | 104 +++++----- .../clusterobjectset_controller_test.go | 2 +- .../controllers/progress_deadline.go | 110 +++++++++++ .../controllers/progress_deadline_test.go | 183 ++++++++++++++++++ test/e2e/features/install.feature | 4 +- test/e2e/features/revision.feature | 69 +++++++ test/e2e/steps/steps.go | 18 ++ 7 files changed, 430 insertions(+), 60 deletions(-) create mode 100644 internal/operator-controller/controllers/progress_deadline.go create mode 100644 internal/operator-controller/controllers/progress_deadline_test.go diff --git a/internal/operator-controller/controllers/clusterobjectset_controller.go b/internal/operator-controller/controllers/clusterobjectset_controller.go index 2d81b71c9..76e389538 100644 --- a/internal/operator-controller/controllers/clusterobjectset_controller.go +++ b/internal/operator-controller/controllers/clusterobjectset_controller.go @@ -23,6 +23,7 @@ import ( "k8s.io/apimachinery/pkg/runtime/schema" "k8s.io/apimachinery/pkg/types" "k8s.io/apimachinery/pkg/util/sets" + "k8s.io/client-go/util/workqueue" "k8s.io/utils/clock" "pkg.package-operator.run/boxcutter" "pkg.package-operator.run/boxcutter/machinery" @@ -32,8 +33,8 @@ import ( ctrl "sigs.k8s.io/controller-runtime" "sigs.k8s.io/controller-runtime/pkg/builder" "sigs.k8s.io/controller-runtime/pkg/client" + "sigs.k8s.io/controller-runtime/pkg/controller" "sigs.k8s.io/controller-runtime/pkg/controller/controllerutil" - "sigs.k8s.io/controller-runtime/pkg/event" "sigs.k8s.io/controller-runtime/pkg/handler" "sigs.k8s.io/controller-runtime/pkg/log" "sigs.k8s.io/controller-runtime/pkg/predicate" @@ -83,29 +84,6 @@ func (c *ClusterObjectSetReconciler) Reconcile(ctx context.Context, req ctrl.Req reconciledRev := existingRev.DeepCopy() res, reconcileErr := c.reconcile(ctx, reconciledRev) - if pd := existingRev.Spec.ProgressDeadlineMinutes; pd > 0 { - cnd := meta.FindStatusCondition(reconciledRev.Status.Conditions, ocv1.ClusterObjectSetTypeProgressing) - isStillProgressing := cnd != nil && cnd.Status == metav1.ConditionTrue && cnd.Reason != ocv1.ReasonSucceeded - succeeded := meta.IsStatusConditionTrue(reconciledRev.Status.Conditions, ocv1.ClusterObjectSetTypeSucceeded) - // check if we reached the progress deadline only if the revision is still progressing and has not succeeded yet - if isStillProgressing && !succeeded { - timeout := time.Duration(pd) * time.Minute - if c.Clock.Since(existingRev.CreationTimestamp.Time) > timeout { - // progress deadline reached, reset any errors and stop reconciling this revision - markAsNotProgressing(reconciledRev, ocv1.ReasonProgressDeadlineExceeded, fmt.Sprintf("Revision has not rolled out for %d minute(s).", pd)) - reconcileErr = nil - res = ctrl.Result{} - } else if reconcileErr == nil { - // We want to requeue so far in the future that the next reconciliation - // can detect if the revision did not progress within the given timeout. - // Thus, we plan the next reconcile slightly after (+2secs) the timeout is passed. - drift := 2 * time.Second - requeueAfter := existingRev.CreationTimestamp.Time.Add(timeout).Add(drift).Sub(c.Clock.Now()).Round(time.Second) - l.Info(fmt.Sprintf("ProgressDeadline not exceeded, requeue after ~%v to check again.", requeueAfter)) - res = ctrl.Result{RequeueAfter: requeueAfter} - } - } - } // Do checks before any Update()s, as Update() may modify the resource structure! updateStatus := !equality.Semantic.DeepEqual(existingRev.Status, reconciledRev.Status) @@ -146,6 +124,9 @@ func (c *ClusterObjectSetReconciler) reconcile(ctx context.Context, cos *ocv1.Cl return c.delete(ctx, cos) } + remaining, hasDeadline := durationUntilDeadline(c.Clock, cos) + isDeadlineExceeded := hasDeadline && remaining <= 0 + if err := c.verifyReferencedSecretsImmutable(ctx, cos); err != nil { l.Error(err, "referenced Secret verification failed, blocking reconciliation") markAsNotProgressing(cos, ocv1.ClusterObjectSetReasonBlocked, err.Error()) @@ -154,7 +135,7 @@ func (c *ClusterObjectSetReconciler) reconcile(ctx context.Context, cos *ocv1.Cl phases, currentPhases, opts, err := c.buildBoxcutterPhases(ctx, cos) if err != nil { - setRetryingConditions(cos, err.Error()) + setRetryingConditions(cos, err.Error(), isDeadlineExceeded) return ctrl.Result{}, fmt.Errorf("converting to boxcutter revision: %v", err) } @@ -168,7 +149,7 @@ func (c *ClusterObjectSetReconciler) reconcile(ctx context.Context, cos *ocv1.Cl revisionEngine, err := c.RevisionEngineFactory.CreateRevisionEngine(ctx, cos) if err != nil { - setRetryingConditions(cos, err.Error()) + setRetryingConditions(cos, err.Error(), isDeadlineExceeded) return ctrl.Result{}, fmt.Errorf("failed to create revision engine: %v", err) } @@ -194,7 +175,7 @@ func (c *ClusterObjectSetReconciler) reconcile(ctx context.Context, cos *ocv1.Cl if err := c.establishWatch(ctx, cos, revision); err != nil { werr := fmt.Errorf("establish watch: %v", err) - setRetryingConditions(cos, werr.Error()) + setRetryingConditions(cos, werr.Error(), isDeadlineExceeded) return ctrl.Result{}, werr } @@ -204,7 +185,7 @@ func (c *ClusterObjectSetReconciler) reconcile(ctx context.Context, cos *ocv1.Cl // Log detailed reconcile reports only in debug mode (V(1)) to reduce verbosity. l.V(1).Info("reconcile report", "report", rres.String()) } - setRetryingConditions(cos, err.Error()) + setRetryingConditions(cos, err.Error(), isDeadlineExceeded) return ctrl.Result{}, fmt.Errorf("revision reconcile: %v", err) } @@ -212,14 +193,14 @@ func (c *ClusterObjectSetReconciler) reconcile(ctx context.Context, cos *ocv1.Cl // TODO: report status, backoff? if verr := rres.GetValidationError(); verr != nil { l.Error(fmt.Errorf("%w", verr), "preflight validation failed, retrying after 10s") - setRetryingConditions(cos, fmt.Sprintf("revision validation error: %s", verr)) + setRetryingConditions(cos, fmt.Sprintf("revision validation error: %s", verr), isDeadlineExceeded) return ctrl.Result{RequeueAfter: 10 * time.Second}, nil } for i, pres := range rres.GetPhases() { if verr := pres.GetValidationError(); verr != nil { l.Error(fmt.Errorf("%w", verr), "phase preflight validation failed, retrying after 10s", "phase", i) - setRetryingConditions(cos, fmt.Sprintf("phase %d validation error: %s", i, verr)) + setRetryingConditions(cos, fmt.Sprintf("phase %d validation error: %s", i, verr), isDeadlineExceeded) return ctrl.Result{RequeueAfter: 10 * time.Second}, nil } @@ -232,14 +213,14 @@ func (c *ClusterObjectSetReconciler) reconcile(ctx context.Context, cos *ocv1.Cl if len(collidingObjs) > 0 { l.Error(fmt.Errorf("object collision detected"), "object collision, retrying after 10s", "phase", i, "collisions", collidingObjs) - setRetryingConditions(cos, fmt.Sprintf("revision object collisions in phase %d\n%s", i, strings.Join(collidingObjs, "\n\n"))) + setRetryingConditions(cos, fmt.Sprintf("revision object collisions in phase %d\n%s", i, strings.Join(collidingObjs, "\n\n")), isDeadlineExceeded) return ctrl.Result{RequeueAfter: 10 * time.Second}, nil } } revVersion := cos.GetAnnotations()[labels.BundleVersionKey] if rres.InTransition() { - markAsProgressing(cos, ocv1.ReasonRollingOut, fmt.Sprintf("Revision %s is rolling out.", revVersion)) + markAsProgressing(cos, ocv1.ReasonRollingOut, fmt.Sprintf("Revision %s is rolling out.", revVersion), isDeadlineExceeded) } //nolint:nestif @@ -259,7 +240,7 @@ func (c *ClusterObjectSetReconciler) reconcile(ctx context.Context, cos *ocv1.Cl } } - markAsProgressing(cos, ocv1.ReasonSucceeded, fmt.Sprintf("Revision %s has rolled out.", revVersion)) + markAsProgressing(cos, ocv1.ReasonSucceeded, fmt.Sprintf("Revision %s has rolled out.", revVersion), isDeadlineExceeded) markAsAvailable(cos, ocv1.ClusterObjectSetReasonProbesSucceeded, "Objects are available and pass all probes.") // We'll probably only want to remove this once we are done updating the ClusterExtension conditions @@ -304,8 +285,9 @@ func (c *ClusterObjectSetReconciler) reconcile(ctx context.Context, cos *ocv1.Cl } else { markAsUnavailable(cos, ocv1.ReasonRollingOut, fmt.Sprintf("Revision %s is rolling out.", revVersion)) } - if meta.FindStatusCondition(cos.Status.Conditions, ocv1.ClusterObjectSetTypeProgressing) == nil { - markAsProgressing(cos, ocv1.ReasonRollingOut, fmt.Sprintf("Revision %s is rolling out.", revVersion)) + markAsProgressing(cos, ocv1.ReasonRollingOut, fmt.Sprintf("Revision %s is rolling out.", revVersion), isDeadlineExceeded) + if hasDeadline && !isDeadlineExceeded { + return ctrl.Result{RequeueAfter: remaining}, nil } } @@ -327,11 +309,11 @@ func (c *ClusterObjectSetReconciler) archive(ctx context.Context, revisionEngine tdres, err := revisionEngine.Teardown(ctx, revision) if err != nil { err = fmt.Errorf("error archiving revision: %v", err) - setRetryingConditions(cos, err.Error()) + setRetryingConditions(cos, err.Error(), false) return ctrl.Result{}, err } if tdres != nil && !tdres.IsComplete() { - setRetryingConditions(cos, "removing revision resources that are not owned by another revision") + setRetryingConditions(cos, "removing revision resources that are not owned by another revision", false) return ctrl.Result{RequeueAfter: 5 * time.Second}, nil } // Ensure conditions are set before removing the finalizer when archiving @@ -349,29 +331,19 @@ type Sourcoser interface { } func (c *ClusterObjectSetReconciler) SetupWithManager(mgr ctrl.Manager) error { - skipProgressDeadlineExceededPredicate := predicate.Funcs{ - UpdateFunc: func(e event.UpdateEvent) bool { - rev, ok := e.ObjectNew.(*ocv1.ClusterObjectSet) - if !ok { - return true - } - // allow deletions to happen - if !rev.DeletionTimestamp.IsZero() { - return true - } - if cnd := meta.FindStatusCondition(rev.Status.Conditions, ocv1.ClusterObjectSetTypeProgressing); cnd != nil && cnd.Status == metav1.ConditionFalse && cnd.Reason == ocv1.ReasonProgressDeadlineExceeded { - return false - } - return true - }, - } c.Clock = clock.RealClock{} return ctrl.NewControllerManagedBy(mgr). + WithOptions(controller.Options{ + RateLimiter: newDeadlineAwareRateLimiter( + workqueue.DefaultTypedControllerRateLimiter[ctrl.Request](), + mgr.GetClient(), + c.Clock, + ), + }). For( &ocv1.ClusterObjectSet{}, builder.WithPredicates( predicate.ResourceVersionChangedPredicate{}, - skipProgressDeadlineExceededPredicate, ), ). WatchesRawSource( @@ -659,14 +631,32 @@ func buildProgressionProbes(progressionProbes []ocv1.ProgressionProbe) (probing. return userProbes, nil } -func setRetryingConditions(cos *ocv1.ClusterObjectSet, message string) { - markAsProgressing(cos, ocv1.ClusterObjectSetReasonRetrying, message) +func setRetryingConditions(cos *ocv1.ClusterObjectSet, message string, isDeadlineExceeded bool) { + markAsProgressing(cos, ocv1.ClusterObjectSetReasonRetrying, message, isDeadlineExceeded) if meta.FindStatusCondition(cos.Status.Conditions, ocv1.ClusterObjectSetTypeAvailable) != nil { markAsAvailableUnknown(cos, ocv1.ClusterObjectSetReasonReconciling, message) } } -func markAsProgressing(cos *ocv1.ClusterObjectSet, reason, message string) { +func markAsProgressing(cos *ocv1.ClusterObjectSet, reason, message string, isDeadlineExceeded bool) { + nonTerminalReasons := map[string]struct{}{ + ocv1.ReasonRollingOut: {}, + ocv1.ClusterObjectSetReasonRetrying: {}, + } + + switch reason { + case ocv1.ReasonSucceeded: + // Terminal — always apply. + default: + if _, known := nonTerminalReasons[reason]; !known { + log.Log.Error(fmt.Errorf("unregistered progressing reason: %q", reason), "treating as non-terminal for deadline enforcement") + } + if isDeadlineExceeded { + markAsNotProgressing(cos, ocv1.ReasonProgressDeadlineExceeded, + fmt.Sprintf("Revision has not rolled out for %d minute(s). Last status: %s", cos.Spec.ProgressDeadlineMinutes, message)) + return + } + } meta.SetStatusCondition(&cos.Status.Conditions, metav1.Condition{ Type: ocv1.ClusterObjectSetTypeProgressing, Status: metav1.ConditionTrue, diff --git a/internal/operator-controller/controllers/clusterobjectset_controller_test.go b/internal/operator-controller/controllers/clusterobjectset_controller_test.go index b3b10f575..b43a608e1 100644 --- a/internal/operator-controller/controllers/clusterobjectset_controller_test.go +++ b/internal/operator-controller/controllers/clusterobjectset_controller_test.go @@ -988,7 +988,7 @@ func Test_ClusterObjectSetReconciler_Reconcile_ProgressDeadline(t *testing.T) { revisionResult: &mockRevisionResult{ inTransition: true, }, - reconcileResult: ctrl.Result{RequeueAfter: 62 * time.Second}, + reconcileResult: ctrl.Result{RequeueAfter: 60 * time.Second}, validate: func(t *testing.T, c client.Client) { rev := &ocv1.ClusterObjectSet{} err := c.Get(t.Context(), client.ObjectKey{ diff --git a/internal/operator-controller/controllers/progress_deadline.go b/internal/operator-controller/controllers/progress_deadline.go new file mode 100644 index 000000000..5bce12307 --- /dev/null +++ b/internal/operator-controller/controllers/progress_deadline.go @@ -0,0 +1,110 @@ +//go:build !standard + +package controllers + +import ( + "context" + "sync" + "time" + + "k8s.io/apimachinery/pkg/api/meta" + "k8s.io/client-go/util/workqueue" + "k8s.io/utils/clock" + ctrl "sigs.k8s.io/controller-runtime" + "sigs.k8s.io/controller-runtime/pkg/client" + + ocv1 "github.com/operator-framework/operator-controller/api/v1" +) + +// deadlineAwareRateLimiter wraps a delegate rate limiter and caps the backoff +// duration to the time remaining until the COS progress deadline, ensuring +// that ProgressDeadlineExceeded is set promptly even during exponential backoff. +// +// After the deadline passes, it allows one immediate requeue (returning 0) so +// the reconciler can set the ProgressDeadlineExceeded condition, then falls +// back to the delegate's normal backoff. This avoids both tight-looping and +// coupling to the COS's status conditions. +type deadlineAwareRateLimiter struct { + delegate workqueue.TypedRateLimiter[ctrl.Request] + client client.Reader + clock clock.Clock + pastDeadline sync.Map +} + +func newDeadlineAwareRateLimiter( + delegate workqueue.TypedRateLimiter[ctrl.Request], + c client.Reader, + clk clock.Clock, +) *deadlineAwareRateLimiter { + return &deadlineAwareRateLimiter{delegate: delegate, client: c, clock: clk} +} + +func (r *deadlineAwareRateLimiter) When(item ctrl.Request) time.Duration { + backoff := r.delegate.When(item) + + cos := &ocv1.ClusterObjectSet{} + if err := r.client.Get(context.Background(), item.NamespacedName, cos); err != nil { + return backoff + } + + remaining, hasDeadline := durationUntilDeadline(r.clock, cos) + if !hasDeadline { + return backoff + } + if remaining > 0 { + if remaining < backoff { + return remaining + } + return backoff + } + + // Deadline has passed — allow one immediate requeue, then delegate. + if _, already := r.pastDeadline.LoadOrStore(item, struct{}{}); !already { + return 0 + } + return backoff +} + +func (r *deadlineAwareRateLimiter) Forget(item ctrl.Request) { + r.delegate.Forget(item) + r.pastDeadline.Delete(item) +} + +func (r *deadlineAwareRateLimiter) NumRequeues(item ctrl.Request) int { + return r.delegate.NumRequeues(item) +} + +// durationUntilDeadline returns how much time remains before the progress deadline +// expires. A negative duration means the deadline has already passed. +// +// It derives the deadline from spec and metadata only, with one exception: +// it checks the Succeeded status condition so that a revision recovering +// from drift is not penalised by the original deadline. +// +// Succeeded is a latch: there is no way to deduce from current cluster state +// alone that a COS succeeded in the past. If Succeeded is removed or set to +// False, this function will return a deadline and the reconciler will set +// ProgressDeadlineExceeded even though the revision previously succeeded. +// +// Returns (0, false) when there is no active deadline: +// - progressDeadlineMinutes is 0 +// - the revision has already succeeded +// - the revision is archived (deadline is irrelevant) +// - the revision is being deleted +func durationUntilDeadline(clk clock.Clock, cos *ocv1.ClusterObjectSet) (time.Duration, bool) { + pd := cos.Spec.ProgressDeadlineMinutes + if pd <= 0 { + return 0, false + } + if meta.IsStatusConditionTrue(cos.Status.Conditions, ocv1.ClusterObjectSetTypeSucceeded) { + return 0, false + } + if cos.Spec.LifecycleState == ocv1.ClusterObjectSetLifecycleStateArchived { + return 0, false + } + if !cos.DeletionTimestamp.IsZero() { + return 0, false + } + deadline := cos.CreationTimestamp.Add(time.Duration(pd) * time.Minute) + return deadline.Sub(clk.Now()), true +} diff --git a/internal/operator-controller/controllers/progress_deadline_test.go b/internal/operator-controller/controllers/progress_deadline_test.go new file mode 100644 index 000000000..05a60adb6 --- /dev/null +++ b/internal/operator-controller/controllers/progress_deadline_test.go @@ -0,0 +1,183 @@ +//go:build !standard + +package controllers + +import ( + "testing" + "time" + + "github.com/stretchr/testify/require" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/runtime" + "k8s.io/apimachinery/pkg/types" + clocktesting "k8s.io/utils/clock/testing" + ctrl "sigs.k8s.io/controller-runtime" + "sigs.k8s.io/controller-runtime/pkg/client/fake" + + ocv1 "github.com/operator-framework/operator-controller/api/v1" +) + +type fixedRateLimiter struct { + duration time.Duration +} + +func (f *fixedRateLimiter) When(_ ctrl.Request) time.Duration { return f.duration } +func (f *fixedRateLimiter) Forget(_ ctrl.Request) {} +func (f *fixedRateLimiter) NumRequeues(_ ctrl.Request) int { return 0 } + +func TestDeadlineAwareRateLimiter(t *testing.T) { + scheme := runtime.NewScheme() + require.NoError(t, ocv1.AddToScheme(scheme)) + + creation := time.Date(2022, 1, 1, 0, 0, 0, 0, time.UTC) + req := ctrl.Request{NamespacedName: types.NamespacedName{Name: "test-cos"}} + + for _, tc := range []struct { + name string + backoff time.Duration + cos *ocv1.ClusterObjectSet + clockTime time.Time + expectDuration time.Duration + }{ + { + name: "no deadline configured — uses delegate backoff", + backoff: 30 * time.Second, + clockTime: creation, + cos: &ocv1.ClusterObjectSet{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test-cos", + CreationTimestamp: metav1.NewTime(creation), + }, + Spec: ocv1.ClusterObjectSetSpec{ + LifecycleState: ocv1.ClusterObjectSetLifecycleStateActive, + }, + }, + expectDuration: 30 * time.Second, + }, + { + name: "deadline not exceeded and backoff is shorter — uses delegate backoff", + backoff: 5 * time.Second, + clockTime: creation, + cos: &ocv1.ClusterObjectSet{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test-cos", + CreationTimestamp: metav1.NewTime(creation), + }, + Spec: ocv1.ClusterObjectSetSpec{ + LifecycleState: ocv1.ClusterObjectSetLifecycleStateActive, + ProgressDeadlineMinutes: 1, + }, + }, + expectDuration: 5 * time.Second, + }, + { + name: "deadline not exceeded and backoff is longer — caps at deadline", + backoff: 5 * time.Minute, + clockTime: creation, + cos: &ocv1.ClusterObjectSet{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test-cos", + CreationTimestamp: metav1.NewTime(creation), + }, + Spec: ocv1.ClusterObjectSetSpec{ + LifecycleState: ocv1.ClusterObjectSetLifecycleStateActive, + ProgressDeadlineMinutes: 1, + }, + }, + expectDuration: 60 * time.Second, + }, + { + name: "deadline exceeded — first call returns immediate requeue", + backoff: 30 * time.Second, + clockTime: creation.Add(61 * time.Second), + cos: &ocv1.ClusterObjectSet{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test-cos", + CreationTimestamp: metav1.NewTime(creation), + }, + Spec: ocv1.ClusterObjectSetSpec{ + LifecycleState: ocv1.ClusterObjectSetLifecycleStateActive, + ProgressDeadlineMinutes: 1, + }, + }, + expectDuration: 0, + }, + { + name: "COS not found — uses delegate backoff", + backoff: 30 * time.Second, + clockTime: creation, + cos: nil, + expectDuration: 30 * time.Second, + }, + } { + t.Run(tc.name, func(t *testing.T) { + builder := fake.NewClientBuilder().WithScheme(scheme) + if tc.cos != nil { + builder = builder.WithObjects(tc.cos) + } + + limiter := newDeadlineAwareRateLimiter( + &fixedRateLimiter{duration: tc.backoff}, + builder.Build(), + clocktesting.NewFakeClock(tc.clockTime), + ) + + testReq := req + if tc.cos == nil { + testReq.Name = "nonexistent" + } + + result := limiter.When(testReq) + require.Equal(t, tc.expectDuration, result) + }) + } + + t.Run("deadline exceeded — second call uses delegate backoff (one-shot)", func(t *testing.T) { + cos := &ocv1.ClusterObjectSet{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test-cos", + CreationTimestamp: metav1.NewTime(creation), + }, + Spec: ocv1.ClusterObjectSetSpec{ + LifecycleState: ocv1.ClusterObjectSetLifecycleStateActive, + ProgressDeadlineMinutes: 1, + }, + } + limiter := newDeadlineAwareRateLimiter( + &fixedRateLimiter{duration: 30 * time.Second}, + fake.NewClientBuilder().WithScheme(scheme).WithObjects(cos).Build(), + clocktesting.NewFakeClock(creation.Add(61*time.Second)), + ) + + first := limiter.When(req) + require.Equal(t, time.Duration(0), first) + + second := limiter.When(req) + require.Equal(t, 30*time.Second, second) + }) + + t.Run("Forget resets the one-shot flag", func(t *testing.T) { + cos := &ocv1.ClusterObjectSet{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test-cos", + CreationTimestamp: metav1.NewTime(creation), + }, + Spec: ocv1.ClusterObjectSetSpec{ + LifecycleState: ocv1.ClusterObjectSetLifecycleStateActive, + ProgressDeadlineMinutes: 1, + }, + } + limiter := newDeadlineAwareRateLimiter( + &fixedRateLimiter{duration: 30 * time.Second}, + fake.NewClientBuilder().WithScheme(scheme).WithObjects(cos).Build(), + clocktesting.NewFakeClock(creation.Add(61*time.Second)), + ) + + require.Equal(t, time.Duration(0), limiter.When(req)) + require.Equal(t, 30*time.Second, limiter.When(req)) + + limiter.Forget(req) + + require.Equal(t, time.Duration(0), limiter.When(req)) + }) +} diff --git a/test/e2e/features/install.feature b/test/e2e/features/install.feature index ceb5e32c2..6265a8eff 100644 --- a/test/e2e/features/install.feature +++ b/test/e2e/features/install.feature @@ -435,7 +435,7 @@ Feature: Install ClusterExtension Then ClusterObjectSet "${NAME}-1" reports Progressing as False with Reason ProgressDeadlineExceeded And ClusterExtension reports Progressing as False with Reason ProgressDeadlineExceeded and Message: """ - Revision has not rolled out for 1 minute(s). + Revision has not rolled out for 1 minute(s). Last status: Revision 1.0.2 is rolling out. """ And ClusterExtension reports Progressing transition between 1 and 2 minutes since its creation @@ -471,7 +471,7 @@ Feature: Install ClusterExtension Then ClusterObjectSet "${NAME}-1" reports Progressing as False with Reason ProgressDeadlineExceeded And ClusterExtension reports Progressing as False with Reason ProgressDeadlineExceeded and Message: """ - Revision has not rolled out for 1 minute(s). + Revision has not rolled out for 1 minute(s). Last status: Revision 1.0.3 is rolling out. """ And ClusterExtension reports Progressing transition between 1 and 2 minutes since its creation diff --git a/test/e2e/features/revision.feature b/test/e2e/features/revision.feature index 38ed16027..32f98de5f 100644 --- a/test/e2e/features/revision.feature +++ b/test/e2e/features/revision.feature @@ -601,3 +601,72 @@ Feature: Install ClusterObjectSet """ And ClusterObjectSet "${COS_NAME}" reconciliation is triggered Then ClusterObjectSet "${COS_NAME}" reports Progressing as True with Reason Succeeded + + + @ProgressDeadline + Scenario: Archiving a COS with ProgressDeadlineExceeded cleans up its resources + Given min value for ClusterObjectSet .spec.progressDeadlineMinutes is set to 1 + And ServiceAccount "olm-sa" with needed permissions is available in test namespace + When ClusterObjectSet is applied + """ + apiVersion: olm.operatorframework.io/v1 + kind: ClusterObjectSet + metadata: + annotations: + olm.operatorframework.io/service-account-name: olm-sa + olm.operatorframework.io/service-account-namespace: ${TEST_NAMESPACE} + name: ${COS_NAME} + spec: + lifecycleState: Active + collisionProtection: Prevent + progressDeadlineMinutes: 1 + progressionProbes: + - selector: + type: GroupKind + groupKind: + group: apps + kind: Deployment + assertions: + - type: ConditionEqual + conditionEqual: + type: Available + status: "True" + phases: + - name: resources + objects: + - object: + apiVersion: v1 + kind: ConfigMap + metadata: + name: test-configmap + namespace: ${TEST_NAMESPACE} + data: + foo: bar + - object: + apiVersion: apps/v1 + kind: Deployment + metadata: + name: test-deployment + namespace: ${TEST_NAMESPACE} + spec: + replicas: 1 + selector: + matchLabels: + app: never-ready + template: + metadata: + labels: + app: never-ready + spec: + containers: + - name: never-ready + image: does-not-exist:latest + revision: 1 + """ + Then resource "configmap/test-configmap" is installed + And resource "deployment/test-deployment" is installed + And ClusterObjectSet "${COS_NAME}" reports Progressing as False with Reason ProgressDeadlineExceeded + When ClusterObjectSet "${COS_NAME}" lifecycle is set to "Archived" + Then ClusterObjectSet "${COS_NAME}" is archived + And resource "configmap/test-configmap" is eventually not found + And resource "deployment/test-deployment" is eventually not found diff --git a/test/e2e/steps/steps.go b/test/e2e/steps/steps.go index 5466e2fc0..a91bfbacb 100644 --- a/test/e2e/steps/steps.go +++ b/test/e2e/steps/steps.go @@ -99,6 +99,7 @@ func RegisterSteps(sc *godog.ScenarioContext) { sc.Step(`^(?i)ClusterExtension is applied(?:\s+.*)?$`, ResourceIsApplied) sc.Step(`^(?i)ClusterExtension is updated to version "([^"]+)"$`, ClusterExtensionVersionUpdate) sc.Step(`^(?i)ClusterExtension is updated(?:\s+.*)?$`, ResourceIsApplied) + sc.Step(`^(?i)ClusterObjectSet "([^"]+)" lifecycle is set to "([^"]+)"$`, ClusterObjectSetLifecycleUpdate) sc.Step(`^(?i)ClusterExtension is available$`, ClusterExtensionIsAvailable) sc.Step(`^(?i)ClusterExtension is rolled out$`, ClusterExtensionIsRolledOut) sc.Step(`^(?i)ClusterExtension resources are created and labeled$`, ClusterExtensionResourcesCreatedAndAreLabeled) @@ -377,6 +378,23 @@ func ClusterExtensionVersionUpdate(ctx context.Context, version string) error { return err } +// ClusterObjectSetLifecycleUpdate patches the ClusterObjectSet's lifecycleState to the specified value. +func ClusterObjectSetLifecycleUpdate(ctx context.Context, cosName, lifecycle string) error { + sc := scenarioCtx(ctx) + cosName = substituteScenarioVars(cosName, sc) + patch := map[string]any{ + "spec": map[string]any{ + "lifecycleState": lifecycle, + }, + } + pb, err := json.Marshal(patch) + if err != nil { + return err + } + _, err = k8sClient("patch", "clusterobjectset", cosName, "--type", "merge", "-p", string(pb)) + return err +} + // ResourceIsApplied applies the provided YAML resource to the cluster and in case of ClusterExtension or ClusterObjectSet it captures // its name in the test context so that it can be referred to in later steps with ${NAME} or ${COS_NAME}, respectively func ResourceIsApplied(ctx context.Context, yamlTemplate *godog.DocString) error { From ca0d7771d5def517b4c6dc7c64392a74fbbce7fd Mon Sep 17 00:00:00 2001 From: Joe Lanford Date: Tue, 28 Apr 2026 10:46:33 -0400 Subject: [PATCH 2/7] test: add unit test for recovery from ProgressDeadlineExceeded to Succeeded --- .../clusterobjectset_controller_test.go | 33 +++++++++++++++++++ 1 file changed, 33 insertions(+) diff --git a/internal/operator-controller/controllers/clusterobjectset_controller_test.go b/internal/operator-controller/controllers/clusterobjectset_controller_test.go index b43a608e1..75e7a2f8e 100644 --- a/internal/operator-controller/controllers/clusterobjectset_controller_test.go +++ b/internal/operator-controller/controllers/clusterobjectset_controller_test.go @@ -1000,6 +1000,39 @@ func Test_ClusterObjectSetReconciler_Reconcile_ProgressDeadline(t *testing.T) { require.Equal(t, ocv1.ReasonRollingOut, cnd.Reason) }, }, + { + name: "recovery from ProgressDeadlineExceeded to Succeeded when revision completes", + existingObjs: func() []client.Object { + ext := newTestClusterExtension() + rev1 := newTestClusterObjectSet(t, clusterObjectSetName, ext, testScheme) + rev1.Spec.ProgressDeadlineMinutes = 1 + rev1.CreationTimestamp = metav1.NewTime(time.Date(2022, 1, 1, 0, 0, 0, 0, time.UTC)) + meta.SetStatusCondition(&rev1.Status.Conditions, metav1.Condition{ + Type: ocv1.ClusterObjectSetTypeProgressing, + Status: metav1.ConditionFalse, + Reason: ocv1.ReasonProgressDeadlineExceeded, + Message: "Revision has not rolled out for 1 minute(s). Last status: Revision 1.0.0 is rolling out.", + ObservedGeneration: rev1.Generation, + }) + return []client.Object{rev1, ext} + }, + clock: clocktesting.NewFakeClock(time.Date(2022, 1, 1, 0, 5, 0, 0, time.UTC)), + revisionResult: &mockRevisionResult{ + isComplete: true, + }, + validate: func(t *testing.T, c client.Client) { + rev := &ocv1.ClusterObjectSet{} + err := c.Get(t.Context(), client.ObjectKey{ + Name: clusterObjectSetName, + }, rev) + require.NoError(t, err) + cnd := meta.FindStatusCondition(rev.Status.Conditions, ocv1.ClusterObjectSetTypeProgressing) + require.NotNil(t, cnd) + require.Equal(t, metav1.ConditionTrue, cnd.Status) + require.Equal(t, ocv1.ReasonSucceeded, cnd.Reason) + require.Equal(t, "Revision 1.0.0 has rolled out.", cnd.Message) + }, + }, { name: "no progression deadline checks on revision recovery", existingObjs: func() []client.Object { From 32a74a16362e28605f609dd5d3e6bede77287603 Mon Sep 17 00:00:00 2001 From: Joe Lanford Date: Tue, 28 Apr 2026 10:47:42 -0400 Subject: [PATCH 3/7] test: add table-driven unit tests for durationUntilDeadline --- .../controllers/progress_deadline_test.go | 83 +++++++++++++++++++ 1 file changed, 83 insertions(+) diff --git a/internal/operator-controller/controllers/progress_deadline_test.go b/internal/operator-controller/controllers/progress_deadline_test.go index 05a60adb6..73102e9ab 100644 --- a/internal/operator-controller/controllers/progress_deadline_test.go +++ b/internal/operator-controller/controllers/progress_deadline_test.go @@ -17,6 +17,89 @@ import ( ocv1 "github.com/operator-framework/operator-controller/api/v1" ) +func TestDurationUntilDeadline(t *testing.T) { + creation := time.Date(2022, 1, 1, 0, 0, 0, 0, time.UTC) + now := creation.Add(30 * time.Second) + clk := clocktesting.NewFakeClock(now) + + for _, tc := range []struct { + name string + cos ocv1.ClusterObjectSet + expectDuration time.Duration + expectHasDeadline bool + }{ + { + name: "progressDeadlineMinutes is 0 — no deadline", + cos: ocv1.ClusterObjectSet{ + ObjectMeta: metav1.ObjectMeta{CreationTimestamp: metav1.NewTime(creation)}, + Spec: ocv1.ClusterObjectSetSpec{ProgressDeadlineMinutes: 0, LifecycleState: ocv1.ClusterObjectSetLifecycleStateActive}, + }, + expectDuration: 0, + expectHasDeadline: false, + }, + { + name: "Succeeded is true — no deadline", + cos: ocv1.ClusterObjectSet{ + ObjectMeta: metav1.ObjectMeta{CreationTimestamp: metav1.NewTime(creation)}, + Spec: ocv1.ClusterObjectSetSpec{ProgressDeadlineMinutes: 1, LifecycleState: ocv1.ClusterObjectSetLifecycleStateActive}, + Status: ocv1.ClusterObjectSetStatus{ + Conditions: []metav1.Condition{{ + Type: ocv1.ClusterObjectSetTypeSucceeded, + Status: metav1.ConditionTrue, + }}, + }, + }, + expectDuration: 0, + expectHasDeadline: false, + }, + { + name: "lifecycleState is Archived — no deadline", + cos: ocv1.ClusterObjectSet{ + ObjectMeta: metav1.ObjectMeta{CreationTimestamp: metav1.NewTime(creation)}, + Spec: ocv1.ClusterObjectSetSpec{ProgressDeadlineMinutes: 1, LifecycleState: ocv1.ClusterObjectSetLifecycleStateArchived}, + }, + expectDuration: 0, + expectHasDeadline: false, + }, + { + name: "DeletionTimestamp is set — no deadline", + cos: ocv1.ClusterObjectSet{ + ObjectMeta: metav1.ObjectMeta{ + CreationTimestamp: metav1.NewTime(creation), + DeletionTimestamp: &metav1.Time{Time: now}, + }, + Spec: ocv1.ClusterObjectSetSpec{ProgressDeadlineMinutes: 1, LifecycleState: ocv1.ClusterObjectSetLifecycleStateActive}, + }, + expectDuration: 0, + expectHasDeadline: false, + }, + { + name: "deadline not yet exceeded — returns positive remaining", + cos: ocv1.ClusterObjectSet{ + ObjectMeta: metav1.ObjectMeta{CreationTimestamp: metav1.NewTime(creation)}, + Spec: ocv1.ClusterObjectSetSpec{ProgressDeadlineMinutes: 1, LifecycleState: ocv1.ClusterObjectSetLifecycleStateActive}, + }, + expectDuration: 30 * time.Second, + expectHasDeadline: true, + }, + { + name: "deadline already exceeded — returns negative remaining", + cos: ocv1.ClusterObjectSet{ + ObjectMeta: metav1.ObjectMeta{CreationTimestamp: metav1.NewTime(creation.Add(-2 * time.Minute))}, + Spec: ocv1.ClusterObjectSetSpec{ProgressDeadlineMinutes: 1, LifecycleState: ocv1.ClusterObjectSetLifecycleStateActive}, + }, + expectDuration: -90 * time.Second, + expectHasDeadline: true, + }, + } { + t.Run(tc.name, func(t *testing.T) { + duration, hasDeadline := durationUntilDeadline(clk, &tc.cos) + require.Equal(t, tc.expectHasDeadline, hasDeadline) + require.Equal(t, tc.expectDuration, duration) + }) + } +} + type fixedRateLimiter struct { duration time.Duration } From 9d522b57d0aef06d883dc64d80e664f34e53a620 Mon Sep 17 00:00:00 2001 From: Joe Lanford Date: Tue, 28 Apr 2026 10:48:41 -0400 Subject: [PATCH 4/7] docs: add comment clarifying Blocked takes precedence over ProgressDeadlineExceeded --- .../controllers/clusterobjectset_controller.go | 1 + 1 file changed, 1 insertion(+) diff --git a/internal/operator-controller/controllers/clusterobjectset_controller.go b/internal/operator-controller/controllers/clusterobjectset_controller.go index 76e389538..bd76374f4 100644 --- a/internal/operator-controller/controllers/clusterobjectset_controller.go +++ b/internal/operator-controller/controllers/clusterobjectset_controller.go @@ -127,6 +127,7 @@ func (c *ClusterObjectSetReconciler) reconcile(ctx context.Context, cos *ocv1.Cl remaining, hasDeadline := durationUntilDeadline(c.Clock, cos) isDeadlineExceeded := hasDeadline && remaining <= 0 + // Blocked takes precedence over ProgressDeadlineExceeded: it is more actionable for the user. if err := c.verifyReferencedSecretsImmutable(ctx, cos); err != nil { l.Error(err, "referenced Secret verification failed, blocking reconciliation") markAsNotProgressing(cos, ocv1.ClusterObjectSetReasonBlocked, err.Error()) From 1bb08e9a6b411a15adf0826b32fbaed791aae234 Mon Sep 17 00:00:00 2001 From: Joe Lanford Date: Tue, 28 Apr 2026 14:30:04 -0400 Subject: [PATCH 5/7] refactor: hoist nonTerminalReasons map to package-level var --- .../controllers/clusterobjectset_controller.go | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/internal/operator-controller/controllers/clusterobjectset_controller.go b/internal/operator-controller/controllers/clusterobjectset_controller.go index bd76374f4..bff8236bf 100644 --- a/internal/operator-controller/controllers/clusterobjectset_controller.go +++ b/internal/operator-controller/controllers/clusterobjectset_controller.go @@ -639,17 +639,17 @@ func setRetryingConditions(cos *ocv1.ClusterObjectSet, message string, isDeadlin } } -func markAsProgressing(cos *ocv1.ClusterObjectSet, reason, message string, isDeadlineExceeded bool) { - nonTerminalReasons := map[string]struct{}{ - ocv1.ReasonRollingOut: {}, - ocv1.ClusterObjectSetReasonRetrying: {}, - } +var nonTerminalProgressingReasons = map[string]struct{}{ + ocv1.ReasonRollingOut: {}, + ocv1.ClusterObjectSetReasonRetrying: {}, +} +func markAsProgressing(cos *ocv1.ClusterObjectSet, reason, message string, isDeadlineExceeded bool) { switch reason { case ocv1.ReasonSucceeded: // Terminal — always apply. default: - if _, known := nonTerminalReasons[reason]; !known { + if _, known := nonTerminalProgressingReasons[reason]; !known { log.Log.Error(fmt.Errorf("unregistered progressing reason: %q", reason), "treating as non-terminal for deadline enforcement") } if isDeadlineExceeded { From 9d3df28700e19261a30c794f69cba2bad485f0d4 Mon Sep 17 00:00:00 2001 From: Joe Lanford Date: Tue, 28 Apr 2026 14:31:02 -0400 Subject: [PATCH 6/7] refactor: pass logger into markAsProgressing instead of using global log.Log --- .../clusterobjectset_controller.go | 34 ++++++++++--------- 1 file changed, 18 insertions(+), 16 deletions(-) diff --git a/internal/operator-controller/controllers/clusterobjectset_controller.go b/internal/operator-controller/controllers/clusterobjectset_controller.go index bff8236bf..bc081b676 100644 --- a/internal/operator-controller/controllers/clusterobjectset_controller.go +++ b/internal/operator-controller/controllers/clusterobjectset_controller.go @@ -14,6 +14,7 @@ import ( "strings" "time" + "github.com/go-logr/logr" corev1 "k8s.io/api/core/v1" "k8s.io/apimachinery/pkg/api/equality" apierrors "k8s.io/apimachinery/pkg/api/errors" @@ -136,7 +137,7 @@ func (c *ClusterObjectSetReconciler) reconcile(ctx context.Context, cos *ocv1.Cl phases, currentPhases, opts, err := c.buildBoxcutterPhases(ctx, cos) if err != nil { - setRetryingConditions(cos, err.Error(), isDeadlineExceeded) + setRetryingConditions(l, cos, err.Error(), isDeadlineExceeded) return ctrl.Result{}, fmt.Errorf("converting to boxcutter revision: %v", err) } @@ -150,7 +151,7 @@ func (c *ClusterObjectSetReconciler) reconcile(ctx context.Context, cos *ocv1.Cl revisionEngine, err := c.RevisionEngineFactory.CreateRevisionEngine(ctx, cos) if err != nil { - setRetryingConditions(cos, err.Error(), isDeadlineExceeded) + setRetryingConditions(l, cos, err.Error(), isDeadlineExceeded) return ctrl.Result{}, fmt.Errorf("failed to create revision engine: %v", err) } @@ -176,7 +177,7 @@ func (c *ClusterObjectSetReconciler) reconcile(ctx context.Context, cos *ocv1.Cl if err := c.establishWatch(ctx, cos, revision); err != nil { werr := fmt.Errorf("establish watch: %v", err) - setRetryingConditions(cos, werr.Error(), isDeadlineExceeded) + setRetryingConditions(l, cos, werr.Error(), isDeadlineExceeded) return ctrl.Result{}, werr } @@ -186,7 +187,7 @@ func (c *ClusterObjectSetReconciler) reconcile(ctx context.Context, cos *ocv1.Cl // Log detailed reconcile reports only in debug mode (V(1)) to reduce verbosity. l.V(1).Info("reconcile report", "report", rres.String()) } - setRetryingConditions(cos, err.Error(), isDeadlineExceeded) + setRetryingConditions(l, cos, err.Error(), isDeadlineExceeded) return ctrl.Result{}, fmt.Errorf("revision reconcile: %v", err) } @@ -194,14 +195,14 @@ func (c *ClusterObjectSetReconciler) reconcile(ctx context.Context, cos *ocv1.Cl // TODO: report status, backoff? if verr := rres.GetValidationError(); verr != nil { l.Error(fmt.Errorf("%w", verr), "preflight validation failed, retrying after 10s") - setRetryingConditions(cos, fmt.Sprintf("revision validation error: %s", verr), isDeadlineExceeded) + setRetryingConditions(l, cos, fmt.Sprintf("revision validation error: %s", verr), isDeadlineExceeded) return ctrl.Result{RequeueAfter: 10 * time.Second}, nil } for i, pres := range rres.GetPhases() { if verr := pres.GetValidationError(); verr != nil { l.Error(fmt.Errorf("%w", verr), "phase preflight validation failed, retrying after 10s", "phase", i) - setRetryingConditions(cos, fmt.Sprintf("phase %d validation error: %s", i, verr), isDeadlineExceeded) + setRetryingConditions(l, cos, fmt.Sprintf("phase %d validation error: %s", i, verr), isDeadlineExceeded) return ctrl.Result{RequeueAfter: 10 * time.Second}, nil } @@ -214,14 +215,14 @@ func (c *ClusterObjectSetReconciler) reconcile(ctx context.Context, cos *ocv1.Cl if len(collidingObjs) > 0 { l.Error(fmt.Errorf("object collision detected"), "object collision, retrying after 10s", "phase", i, "collisions", collidingObjs) - setRetryingConditions(cos, fmt.Sprintf("revision object collisions in phase %d\n%s", i, strings.Join(collidingObjs, "\n\n")), isDeadlineExceeded) + setRetryingConditions(l, cos, fmt.Sprintf("revision object collisions in phase %d\n%s", i, strings.Join(collidingObjs, "\n\n")), isDeadlineExceeded) return ctrl.Result{RequeueAfter: 10 * time.Second}, nil } } revVersion := cos.GetAnnotations()[labels.BundleVersionKey] if rres.InTransition() { - markAsProgressing(cos, ocv1.ReasonRollingOut, fmt.Sprintf("Revision %s is rolling out.", revVersion), isDeadlineExceeded) + markAsProgressing(l, cos, ocv1.ReasonRollingOut, fmt.Sprintf("Revision %s is rolling out.", revVersion), isDeadlineExceeded) } //nolint:nestif @@ -241,7 +242,7 @@ func (c *ClusterObjectSetReconciler) reconcile(ctx context.Context, cos *ocv1.Cl } } - markAsProgressing(cos, ocv1.ReasonSucceeded, fmt.Sprintf("Revision %s has rolled out.", revVersion), isDeadlineExceeded) + markAsProgressing(l, cos, ocv1.ReasonSucceeded, fmt.Sprintf("Revision %s has rolled out.", revVersion), isDeadlineExceeded) markAsAvailable(cos, ocv1.ClusterObjectSetReasonProbesSucceeded, "Objects are available and pass all probes.") // We'll probably only want to remove this once we are done updating the ClusterExtension conditions @@ -286,7 +287,7 @@ func (c *ClusterObjectSetReconciler) reconcile(ctx context.Context, cos *ocv1.Cl } else { markAsUnavailable(cos, ocv1.ReasonRollingOut, fmt.Sprintf("Revision %s is rolling out.", revVersion)) } - markAsProgressing(cos, ocv1.ReasonRollingOut, fmt.Sprintf("Revision %s is rolling out.", revVersion), isDeadlineExceeded) + markAsProgressing(l, cos, ocv1.ReasonRollingOut, fmt.Sprintf("Revision %s is rolling out.", revVersion), isDeadlineExceeded) if hasDeadline && !isDeadlineExceeded { return ctrl.Result{RequeueAfter: remaining}, nil } @@ -307,14 +308,15 @@ func (c *ClusterObjectSetReconciler) delete(ctx context.Context, cos *ocv1.Clust } func (c *ClusterObjectSetReconciler) archive(ctx context.Context, revisionEngine RevisionEngine, cos *ocv1.ClusterObjectSet, revision boxcutter.RevisionBuilder) (ctrl.Result, error) { + l := log.FromContext(ctx) tdres, err := revisionEngine.Teardown(ctx, revision) if err != nil { err = fmt.Errorf("error archiving revision: %v", err) - setRetryingConditions(cos, err.Error(), false) + setRetryingConditions(l, cos, err.Error(), false) return ctrl.Result{}, err } if tdres != nil && !tdres.IsComplete() { - setRetryingConditions(cos, "removing revision resources that are not owned by another revision", false) + setRetryingConditions(l, cos, "removing revision resources that are not owned by another revision", false) return ctrl.Result{RequeueAfter: 5 * time.Second}, nil } // Ensure conditions are set before removing the finalizer when archiving @@ -632,8 +634,8 @@ func buildProgressionProbes(progressionProbes []ocv1.ProgressionProbe) (probing. return userProbes, nil } -func setRetryingConditions(cos *ocv1.ClusterObjectSet, message string, isDeadlineExceeded bool) { - markAsProgressing(cos, ocv1.ClusterObjectSetReasonRetrying, message, isDeadlineExceeded) +func setRetryingConditions(l logr.Logger, cos *ocv1.ClusterObjectSet, message string, isDeadlineExceeded bool) { + markAsProgressing(l, cos, ocv1.ClusterObjectSetReasonRetrying, message, isDeadlineExceeded) if meta.FindStatusCondition(cos.Status.Conditions, ocv1.ClusterObjectSetTypeAvailable) != nil { markAsAvailableUnknown(cos, ocv1.ClusterObjectSetReasonReconciling, message) } @@ -644,13 +646,13 @@ var nonTerminalProgressingReasons = map[string]struct{}{ ocv1.ClusterObjectSetReasonRetrying: {}, } -func markAsProgressing(cos *ocv1.ClusterObjectSet, reason, message string, isDeadlineExceeded bool) { +func markAsProgressing(l logr.Logger, cos *ocv1.ClusterObjectSet, reason, message string, isDeadlineExceeded bool) { switch reason { case ocv1.ReasonSucceeded: // Terminal — always apply. default: if _, known := nonTerminalProgressingReasons[reason]; !known { - log.Log.Error(fmt.Errorf("unregistered progressing reason: %q", reason), "treating as non-terminal for deadline enforcement") + l.Error(fmt.Errorf("unregistered progressing reason: %q", reason), "treating as non-terminal for deadline enforcement") } if isDeadlineExceeded { markAsNotProgressing(cos, ocv1.ReasonProgressDeadlineExceeded, From feedd23ca930fbbd70eceac044dce6de93d18ac6 Mon Sep 17 00:00:00 2001 From: Joe Lanford Date: Tue, 28 Apr 2026 14:33:42 -0400 Subject: [PATCH 7/7] test: add e2e test for recovery from ProgressDeadlineExceeded to Succeeded --- test/e2e/features/revision.feature | 69 ++++++++++++++++++++++++++++++ 1 file changed, 69 insertions(+) diff --git a/test/e2e/features/revision.feature b/test/e2e/features/revision.feature index 32f98de5f..866e8195f 100644 --- a/test/e2e/features/revision.feature +++ b/test/e2e/features/revision.feature @@ -670,3 +670,72 @@ Feature: Install ClusterObjectSet Then ClusterObjectSet "${COS_NAME}" is archived And resource "configmap/test-configmap" is eventually not found And resource "deployment/test-deployment" is eventually not found + + @ProgressDeadline + Scenario: COS recovers from ProgressDeadlineExceeded to Succeeded when probes pass + Given min value for ClusterObjectSet .spec.progressDeadlineMinutes is set to 1 + And ServiceAccount "olm-sa" with needed permissions is available in test namespace + When ClusterObjectSet is applied + """ + apiVersion: olm.operatorframework.io/v1 + kind: ClusterObjectSet + metadata: + annotations: + olm.operatorframework.io/service-account-name: olm-sa + olm.operatorframework.io/service-account-namespace: ${TEST_NAMESPACE} + name: ${COS_NAME} + spec: + lifecycleState: Active + collisionProtection: Prevent + progressDeadlineMinutes: 1 + progressionProbes: + - selector: + type: GroupKind + groupKind: + group: apps + kind: Deployment + assertions: + - type: ConditionEqual + conditionEqual: + type: Available + status: "True" + phases: + - name: resources + objects: + - object: + apiVersion: apps/v1 + kind: Deployment + metadata: + name: test-deployment + namespace: ${TEST_NAMESPACE} + spec: + replicas: 1 + selector: + matchLabels: + app: delayed-ready + template: + metadata: + labels: + app: delayed-ready + spec: + containers: + - name: delayed-ready + image: busybox:1.36 + command: ["sleep", "1000"] + readinessProbe: + exec: + command: ["true"] + initialDelaySeconds: 65 + securityContext: + runAsNonRoot: true + runAsUser: 1000 + allowPrivilegeEscalation: false + capabilities: + drop: + - ALL + seccompProfile: + type: RuntimeDefault + revision: 1 + """ + Then ClusterObjectSet "${COS_NAME}" reports Progressing as False with Reason ProgressDeadlineExceeded + And ClusterObjectSet "${COS_NAME}" reports Progressing as True with Reason Succeeded