From 7c24e9b26c5314233599e754249ef889d7d72416 Mon Sep 17 00:00:00 2001 From: Donnie Adams Date: Mon, 3 Mar 2025 10:32:39 -0500 Subject: [PATCH] fix: ensure proper controller shutdown if also starting Signed-off-by: Donnie Adams --- pkg/leader/leader.go | 37 +++++++------------------------------ pkg/router/router.go | 26 +++++++++++++++++++++++++- 2 files changed, 32 insertions(+), 31 deletions(-) diff --git a/pkg/leader/leader.go b/pkg/leader/leader.go index baafeb0..32f1340 100644 --- a/pkg/leader/leader.go +++ b/pkg/leader/leader.go @@ -4,8 +4,6 @@ import ( "context" "fmt" "os" - "os/signal" - "syscall" "time" "github.com/obot-platform/nah/pkg/log" @@ -52,7 +50,7 @@ func NewElectionConfig(ttl time.Duration, namespace, name, lockType string, cfg } } -func (ec *ElectionConfig) Run(ctx context.Context, id string, onLeader OnLeader, onSwitchLeader OnNewLeader, signalDone chan struct{}) error { +func (ec *ElectionConfig) Run(ctx context.Context, id string, onLeader OnLeader, onSwitchLeader OnNewLeader, signalDone func()) error { if ec == nil { // Don't start leader election if there is no config. return onLeader(ctx) @@ -69,7 +67,7 @@ func (ec *ElectionConfig) Run(ctx context.Context, id string, onLeader OnLeader, return nil } -func (ec *ElectionConfig) run(ctx context.Context, id string, cb OnLeader, onSwitchLeader OnNewLeader, signalDone chan struct{}) error { +func (ec *ElectionConfig) run(ctx context.Context, id string, cb OnLeader, onSwitchLeader OnNewLeader, signalDone func()) error { rl, err := resourcelock.NewFromKubeconfig( ec.ResourceLockType, ec.Namespace, @@ -84,15 +82,6 @@ func (ec *ElectionConfig) run(ctx context.Context, id string, cb OnLeader, onSwi return fmt.Errorf("error creating leader lock for %s: %v", ec.Name, err) } - // Catch these signals to ensure a graceful shutdown and leader election release. - sigCtx, cancel := signal.NotifyContext(ctx, syscall.SIGTERM, syscall.SIGQUIT, syscall.SIGKILL) - defer func() { - if err != nil { - // If we encountered an error, cancel the context because we won't be using it. - cancel() - } - }() - le, err := leaderelection.NewLeaderElector(leaderelection.LeaderElectionConfig{ Lock: rl, LeaseDuration: ec.TTL, @@ -107,23 +96,11 @@ func (ec *ElectionConfig) run(ctx context.Context, id string, cb OnLeader, onSwi OnNewLeader: onSwitchLeader, OnStoppedLeading: func() { select { - case <-sigCtx.Done(): - // Must cancel so that the registered signals are no longer caught. - cancel() - - // The context has been canceled or is otherwise complete. - // This is a request to terminate. Exit 0. - // Exiting cleanly is useful when the context is canceled - // so that Kubernetes doesn't record it exiting in error - // when the exit was requested. For example, the wrangler-cli - // package sets up a context that cancels when SIGTERM is - // sent in. If a node is shut down this is the type of signal - // sent. In that case you want the 0 exit code to mark it as - // complete so that everything comes back up correctly after - // a restart. - // The pattern found here can be found inside the kube-scheduler. + case <-ctx.Done(): log.Infof("requested to terminate, exiting") - close(signalDone) + if signalDone != nil { + signalDone() + } default: log.Fatalf("leader election lost for %s", ec.Name) } @@ -136,7 +113,7 @@ func (ec *ElectionConfig) run(ctx context.Context, id string, cb OnLeader, onSwi } go func() { - le.Run(sigCtx) + le.Run(ctx) }() return nil } diff --git a/pkg/router/router.go b/pkg/router/router.go index d1d4f5b..b776557 100644 --- a/pkg/router/router.go +++ b/pkg/router/router.go @@ -48,6 +48,15 @@ func New(handlerSet *HandlerSet, electionConfig *leader.ElectionConfig, healthzP } func (r *Router) Stopped() <-chan struct{} { + // Hold the start lock to ensure we aren't starting and stopping at the same time. + r.startLock.Lock() + defer r.startLock.Unlock() + + if r.signalStopped == nil { + c := make(chan struct{}) + close(c) + return c + } return r.signalStopped } @@ -220,7 +229,18 @@ func (r *Router) Start(ctx context.Context) error { // Failed to preload caches, panic log.Fatalf("failed to preload caches: %v", err) } - }, r.signalStopped) + }, r.done) +} + +// done is a callback used by leader election to signal that the controllers are shut down. +func (r *Router) done() { + // Hold the start lock to ensure we aren't starting and stopping at the same time. + r.startLock.Lock() + defer r.startLock.Unlock() + + if r.signalStopped != nil { + close(r.signalStopped) + } } // startHandlers gets called when we become the leader or if there is no leader election. @@ -228,6 +248,10 @@ func (r *Router) startHandlers(ctx context.Context) error { r.startLock.Lock() defer r.startLock.Unlock() + if r.signalStopped == nil { + r.signalStopped = make(chan struct{}) + } + var err error // This is the leader now, so not ready until the controller is started and caches are ready. setHealthy(r.name, false)