Skip to content

Commit

Permalink
Cleanup Patch Sidecar Logging (#3973)
Browse files Browse the repository at this point in the history
Missed this little of nuance in code review when moving to Patch with
the SDK Sidecar - in that a Patch operation doesn't return a Conflict
error type/status value if it fails, so every time it does, the Agones
sidecar will log it as an error on production installs.

So this cleaned up a few things:

* renamed "debugError" type to "traceError" type, since we only log it
under Trace, not Debug.
* if `patchGameServer` fails under the error `IsInvalid`, wrap it in a
TraceError so it doesn't show up in prod.
* Wrapped the error with `errors` context, such that if the patch does
fail, it's far easier now to work out where it failed.

Closes #3967
  • Loading branch information
markmandel authored Sep 12, 2024
1 parent 632a866 commit 2b7dd19
Show file tree
Hide file tree
Showing 6 changed files with 31 additions and 25 deletions.
12 changes: 6 additions & 6 deletions pkg/gameservers/controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -878,19 +878,19 @@ func (c *Controller) syncGameServerStartingState(ctx context.Context, gs *agones
if err != nil {
// expected to happen, so don't log it.
if k8serrors.IsNotFound(err) {
return nil, workerqueue.NewDebugError(err)
return nil, workerqueue.NewTraceError(err)
}

// do log if it's something other than NotFound, since that's weird.
return nil, err
}
if pod.Spec.NodeName == "" {
return gs, workerqueue.NewDebugError(errors.Errorf("node not yet populated for Pod %s", pod.ObjectMeta.Name))
return gs, workerqueue.NewTraceError(errors.Errorf("node not yet populated for Pod %s", pod.ObjectMeta.Name))
}

// Ensure the pod IPs are populated
if pod.Status.PodIPs == nil || len(pod.Status.PodIPs) == 0 {
return gs, workerqueue.NewDebugError(errors.Errorf("pod IPs not yet populated for Pod %s", pod.ObjectMeta.Name))
return gs, workerqueue.NewTraceError(errors.Errorf("pod IPs not yet populated for Pod %s", pod.ObjectMeta.Name))
}

node, err := c.nodeLister.Get(pod.Spec.NodeName)
Expand Down Expand Up @@ -943,7 +943,7 @@ func (c *Controller) syncGameServerRequestReadyState(ctx context.Context, gs *ag
if gs.Status.NodeName == "" {
addressPopulated = true
if pod.Spec.NodeName == "" {
return gs, workerqueue.NewDebugError(errors.Errorf("node not yet populated for Pod %s", pod.ObjectMeta.Name))
return gs, workerqueue.NewTraceError(errors.Errorf("node not yet populated for Pod %s", pod.ObjectMeta.Name))
}
node, err := c.nodeLister.Get(pod.Spec.NodeName)
if err != nil {
Expand All @@ -963,7 +963,7 @@ func (c *Controller) syncGameServerRequestReadyState(ctx context.Context, gs *ag
// check to make sure this container is actually running. If there was a recent crash, the cache may
// not yet have the newer, running container.
if cs.State.Running == nil {
return nil, workerqueue.NewDebugError(fmt.Errorf("game server container for GameServer %s in namespace %s is not currently running, try again", gsCopy.ObjectMeta.Name, gsCopy.ObjectMeta.Namespace))
return nil, workerqueue.NewTraceError(fmt.Errorf("game server container for GameServer %s in namespace %s is not currently running, try again", gsCopy.ObjectMeta.Name, gsCopy.ObjectMeta.Namespace))
}
gsCopy.ObjectMeta.Annotations[agonesv1.GameServerReadyContainerIDAnnotation] = cs.ContainerID
}
Expand All @@ -972,7 +972,7 @@ func (c *Controller) syncGameServerRequestReadyState(ctx context.Context, gs *ag
}
// Verify that we found the game server container - we may have a stale cache where pod is missing ContainerStatuses.
if _, ok := gsCopy.ObjectMeta.Annotations[agonesv1.GameServerReadyContainerIDAnnotation]; !ok {
return nil, workerqueue.NewDebugError(fmt.Errorf("game server container for GameServer %s in namespace %s not present in pod status, try again", gsCopy.ObjectMeta.Name, gsCopy.ObjectMeta.Namespace))
return nil, workerqueue.NewTraceError(fmt.Errorf("game server container for GameServer %s in namespace %s not present in pod status, try again", gsCopy.ObjectMeta.Name, gsCopy.ObjectMeta.Namespace))
}

// Also update the pod with the same annotation, so we can check if the Pod data is up-to-date, now and also in the HealthController.
Expand Down
2 changes: 1 addition & 1 deletion pkg/gameservers/health.go
Original file line number Diff line number Diff line change
Expand Up @@ -261,7 +261,7 @@ func (hc *HealthController) skipUnhealthyGameContainer(gs *agonesv1.GameServer,
// in which case, send it back to the queue to try again.
gsReadyContainerID := gs.ObjectMeta.Annotations[agonesv1.GameServerReadyContainerIDAnnotation]
if pod.ObjectMeta.Annotations[agonesv1.GameServerReadyContainerIDAnnotation] != gsReadyContainerID {
return false, workerqueue.NewDebugError(errors.Errorf("pod and gameserver %s data are out of sync, retrying", gs.ObjectMeta.Name))
return false, workerqueue.NewTraceError(errors.Errorf("pod and gameserver %s data are out of sync, retrying", gs.ObjectMeta.Name))
}

if gs.IsBeforeReady() {
Expand Down
2 changes: 1 addition & 1 deletion pkg/gameservers/migration.go
Original file line number Diff line number Diff line change
Expand Up @@ -153,7 +153,7 @@ func (mc *MigrationController) isMigratingGameServerPod(pod *k8sv1.Pod) (*agones
}

if pod.Spec.NodeName == "" {
return nil, nil, false, workerqueue.NewDebugError(errors.Errorf("node not yet populated for Pod %s", pod.ObjectMeta.Name))
return nil, nil, false, workerqueue.NewTraceError(errors.Errorf("node not yet populated for Pod %s", pod.ObjectMeta.Name))
}

node, err := mc.nodeLister.Get(pod.Spec.NodeName)
Expand Down
8 changes: 7 additions & 1 deletion pkg/sdkserver/sdkserver.go
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@ import (
"github.com/sirupsen/logrus"
corev1 "k8s.io/api/core/v1"
apiequality "k8s.io/apimachinery/pkg/api/equality"
k8serrors "k8s.io/apimachinery/pkg/api/errors"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/fields"
"k8s.io/apimachinery/pkg/types"
Expand Down Expand Up @@ -459,7 +460,12 @@ func (s *SDKServer) patchGameServer(ctx context.Context, gs, gsCopy *agonesv1.Ga
return nil, err
}

return s.gameServerGetter.GameServers(s.namespace).Patch(ctx, gs.GetObjectMeta().GetName(), types.JSONPatchType, patch, metav1.PatchOptions{})
gs, err = s.gameServerGetter.GameServers(s.namespace).Patch(ctx, gs.GetObjectMeta().GetName(), types.JSONPatchType, patch, metav1.PatchOptions{})
// if the test operation fails, no reason to error log
if err != nil && k8serrors.IsInvalid(err) {
err = workerqueue.NewTraceError(err)
}
return gs, errors.Wrapf(err, "error attempting to patch gameserver: %s/%s", gsCopy.ObjectMeta.Namespace, gsCopy.ObjectMeta.Name)
}

// updateLabels updates the labels on this GameServer to the ones persisted in SDKServer,
Expand Down
22 changes: 11 additions & 11 deletions pkg/util/workerqueue/workerqueue.go
Original file line number Diff line number Diff line change
Expand Up @@ -37,29 +37,29 @@ const (
workFx = time.Second
)

// debugError is a marker type for errors that that should only be logged at a Debug level.
// traceError is a marker type for errors that that should only be logged at a Trace level.
// Useful if you want a Handler to be retried, but not logged at an Error level.
type debugError struct {
type traceError struct {
err error
}

// NewDebugError returns a debugError wrapper around an error.
func NewDebugError(err error) error {
return &debugError{err: err}
// NewTraceError returns a traceError wrapper around an error.
func NewTraceError(err error) error {
return &traceError{err: err}
}

// Error returns the error string
func (l *debugError) Error() string {
func (l *traceError) Error() string {
if l.err == nil {
return "<nil>"
}
return l.err.Error()
}

// isDebugError returns if the error is a debug error or not
func isDebugError(err error) bool {
// isTraceError returns if the error is a trace error or not
func isTraceError(err error) bool {
cause := errors.Cause(err)
_, ok := cause.(*debugError)
_, ok := cause.(*traceError)
return ok
}

Expand Down Expand Up @@ -181,8 +181,8 @@ func (wq *WorkerQueue) processNextWorkItem(ctx context.Context) bool {

if err := wq.SyncHandler(ctx, key); err != nil {
// Conflicts are expected, so only show them in debug operations.
// Also check is debugError for other expected errors.
if k8serror.IsConflict(errors.Cause(err)) || isDebugError(err) {
// Also check is traceError for other expected errors.
if k8serror.IsConflict(errors.Cause(err)) || isTraceError(err) {
wq.logger.WithField(wq.keyName, obj).Trace(err)
} else {
runtime.HandleError(wq.logger.WithField(wq.keyName, obj), err)
Expand Down
10 changes: 5 additions & 5 deletions pkg/util/workerqueue/workerqueue_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -198,13 +198,13 @@ func TestWorkerQueueEnqueueAfter(t *testing.T) {

func TestDebugError(t *testing.T) {
err := errors.New("not a debug error")
assert.False(t, isDebugError(err))
assert.False(t, isTraceError(err))

err = NewDebugError(err)
assert.True(t, isDebugError(err))
err = NewTraceError(err)
assert.True(t, isTraceError(err))
assert.EqualError(t, err, "not a debug error")

err = NewDebugError(nil)
assert.True(t, isDebugError(err))
err = NewTraceError(nil)
assert.True(t, isTraceError(err))
assert.EqualError(t, err, "<nil>")
}

0 comments on commit 2b7dd19

Please sign in to comment.