Skip to content

Commit

Permalink
Integrates upgrades tests into Cloud Build (googleforgames#4037)
Browse files Browse the repository at this point in the history
* Integrates upgrades tests into Cloud Build

* Templates current build version to pass through to test containers

* Use sed for variable replacement, as envsubst is not available in Cloud Build

* Upgrade tests wait to become the oldest ongoing build before running

* Bumps versions

* Allows the Helm install of the current Agones development build during the upgrade test

* Changes per linter

* Bumps retry timeout by 10s

* Deletes all dangling agones resources

* Updates evictable pods to have the same pod affinity rules as the test game servers

* Removes unnecessary check from the cloudbuild file

* Changes watch events to AddFunc rather than UpdateFunc

The same events are covered by AddFunc as by UpdateFunc with less overhead. Any container crashes are caught by the Game Server Unhealthy watch, and are not needed in this watch.
  • Loading branch information
igooch authored Dec 6, 2024
1 parent 5c64044 commit 171def9
Show file tree
Hide file tree
Showing 12 changed files with 502 additions and 209 deletions.
3 changes: 3 additions & 0 deletions .dockerignore
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,9 @@ bin
/site/public
/test

# Allow upgrade test directory
!/test/upgrade

# Created by .ignore support plugin (hsz.mobi)
### Go template
# Binaries for programs and plugins
Expand Down
169 changes: 163 additions & 6 deletions cloudbuild.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -233,9 +233,18 @@ steps:
# End to end tests
#

# wait for us to be the oldest ongoing build before we run e2es
- name: gcr.io/cloud-builders/gcloud
id: e2e-wait-to-become-leader
# Build and Push upgrade test
- name: make-docker
id: push-upgrade-test
dir: test/upgrade
env: ['REGISTRY=${_REGISTRY}']
args: [push]
waitFor:
- push-images

# Wait for us to be the oldest ongoing build before we run upgrade and e2e tests
- name: gcr.io/google.com/cloudsdktool/cloud-sdk
id: wait-to-become-leader
waitFor: [push-images]
script: |
#!/usr/bin/env bash
Expand All @@ -258,10 +267,157 @@ steps:
- BUILD_ID=$BUILD_ID
- TRIGGER_NAME=$TRIGGER_NAME

# Run the upgrade tests in parallel; fail this step if any of the tests fail
- name: gcr.io/google.com/cloudsdktool/cloud-sdk
id: submit-upgrade-test-cloud-build
dir: test/upgrade
entrypoint: bash
args:
- -c
- |
#!/usr/bin/env bash
# Upgrade test driver.
#
# For every cloud product / Kubernetes version pair this script connects to the
# matching test cluster, removes resources left over from a previous run,
# applies the test manifests, and starts a background `kubectl wait` on the
# upgrade-test-runner job. Once every cluster has been set up it waits on each
# background process and exits non-zero if any job did not report success.
#
# NOTE: variable names are deliberately kept lower/camel case; this script is
# embedded in a Cloud Build step where ${_BASE_VERSION} and $PROJECT_ID are
# filled in by Cloud Build substitutions.
set -e
set -o pipefail
export KUBECONFIG="/root/.kube/config"
mkdir -p /go/src/agones.dev/ /root/.kube/
ln -s /workspace /go/src/agones.dev/agones
cd /go/src/agones.dev/agones/test/upgrade
pids=()
typeset -A waitPids # Associative array for mapping `kubectl wait job` pid -> `kubectl wait job` output log name
tmpdir=$(mktemp -d)

# Single cleanup handler: kills any background `kubectl wait` pipelines that are
# still running and removes the temporary directory.
# A second `trap` for the same signal replaces the first one, so both cleanup
# actions must live in the same handler or the temp directory is never removed.
cleanup() {
  local remaining
  remaining=$(jobs -p)
  # Word splitting of $remaining is intentional: one pid per word.
  # shellcheck disable=SC2086
  echo Cleaning up any remaining running pids: $remaining
  # shellcheck disable=SC2086
  kill $remaining 2> /dev/null || :
  rm -rf -- "$tmpdir"
}
trap cleanup EXIT SIGTERM

# Update image tags to include the current build version.
DevVersion="${_BASE_VERSION}-dev-$(git rev-parse --short=7 HEAD)"
export DevVersion
sed "s/\${DevVersion}/${DevVersion}/" upgradeTest.yaml > "${tmpdir}"/upgradeTest.yaml
sed "s/\${DevVersion}/${DevVersion}/" versionMap.yaml > "${tmpdir}"/versionMap.yaml

cloudProducts=("generic" "gke-autopilot")
declare -A versionsAndRegions=( [1.31]=us-east1 [1.30]=us-central1 [1.29]=us-west1 )
for cloudProduct in "${cloudProducts[@]}"
do
  for version in "${!versionsAndRegions[@]}"
  do
    region=${versionsAndRegions[$version]}
    # Cluster names use dashes in place of the dots in the Kubernetes version.
    if [ "$cloudProduct" = generic ]
    then
      testCluster="standard-upgrade-test-cluster-${version//./-}"
    else
      testCluster="gke-autopilot-upgrade-test-cluster-${version//./-}"
    fi
    testClusterLocation="${region}"
    gcloud container clusters get-credentials "$testCluster" --region="$testClusterLocation" --project="$PROJECT_ID"
    if [ "$cloudProduct" = gke-autopilot ] ; then
      # For autopilot clusters use evictable "balloon" pods to keep a buffer in node pool autoscaling.
      kubectl apply -f evictablePods.yaml
    fi
    # Clean up any existing job / namespace / apiservice from previous run
    echo Checking if resources from a previous build of upgrade-test-runner exist and need to be cleaned up on cluster "${testCluster}".
    if kubectl get jobs | grep upgrade-test-runner ; then
      echo Deleting job from previous run of upgrade-test-runner on cluster "${testCluster}".
      kubectl delete job upgrade-test-runner
      kubectl wait --for=delete pod -l job-name=upgrade-test-runner --timeout=5m
    fi
    # Check if there are any dangling game servers.
    if kubectl get gs | grep ".*"; then
      # Remove any finalizers so that dangling game servers can be manually deleted.
      kubectl get gs -o=custom-columns=:.metadata.name --no-headers | xargs kubectl patch gs -p '{"metadata":{"finalizers":[]}}' --type=merge
      sleep 5
      echo Deleting game servers from previous run of upgrade-test-runner on cluster "${testCluster}".
      kubectl delete gs -l app=sdk-client-test
    fi
    if kubectl get po -l app=sdk-client-test | grep ".*"; then
      echo Deleting pods from previous run of upgrade-test-runner on cluster "${testCluster}".
      kubectl delete po -l app=sdk-client-test
      kubectl wait --for=delete pod -l app=sdk-client-test --timeout=5m
    fi
    # The v1.allocation.agones.dev apiservice does not get removed automatically and will prevent the namespace from terminating.
    if kubectl get apiservice | grep v1.allocation.agones.dev ; then
      echo Deleting v1.allocation.agones.dev from previous run of upgrade-test-runner on cluster "${testCluster}".
      kubectl delete apiservice v1.allocation.agones.dev
    fi
    if kubectl get namespace | grep agones-system ; then
      echo Deleting agones-system namespace from previous run of upgrade-test-runner on cluster "${testCluster}".
      kubectl delete namespace agones-system
      kubectl wait --for=delete ns agones-system --timeout=5m
    fi
    if kubectl get crds | grep agones ; then
      echo Deleting crds from previous run of upgrade-test-runner on cluster "${testCluster}".
      kubectl get crds -o=custom-columns=:.metadata.name | grep agones | xargs kubectl delete crd
    fi
    echo kubectl apply -f permissions.yaml on cluster "${testCluster}"
    kubectl apply -f permissions.yaml
    echo kubectl apply -f versionMap.yaml on cluster "${testCluster}"
    kubectl apply -f "${tmpdir}"/versionMap.yaml
    echo kubectl apply -f gameserverTemplate.yaml on cluster "${testCluster}"
    kubectl apply -f gameserverTemplate.yaml
    echo kubectl apply -f upgradeTest.yaml on cluster "${testCluster}"
    kubectl apply -f "${tmpdir}"/upgradeTest.yaml
    # We need to wait for job pod to be created and ready before we can wait on the job itself.
    # TODO: Once all test clusters are at Kubernetes Version >= 1.31 use `kubectl wait --for=create` instead of sleep.
    # kubectl wait --for=create pod -l job-name=upgrade-test-runner --timeout=1m
    sleep 10s
    kubectl wait --for=condition=ready pod -l job-name=upgrade-test-runner --timeout=5m
    echo Wait for job upgrade-test-runner to complete or fail on cluster "${testCluster}"
    # Runs in the background so that the jobs on all clusters are awaited in parallel.
    # The job's condition type is written to a per-cluster log that is inspected below.
    kubectl wait job/upgrade-test-runner --timeout=20m --for jsonpath='{.status.conditions[*].status}'=True -o jsonpath='{.status.conditions[*].type}' | tee "${tmpdir}"/"${testCluster}".log &
    waitPid=$!
    pids+=( "$waitPid" )
    waitPids[$waitPid]="${tmpdir}"/"${testCluster}".log
  done
done

for pid in "${pids[@]}"; do
  # This block executes when the process exits and pid status==0
  if wait "$pid"; then
    outputLog="${waitPids[$pid]}"
    # The background pipeline has already exited at this point, so the log
    # should be complete. Poll briefly in case the write is still being flushed,
    # but never loop forever on a log that stays empty.
    attempts=0
    until [ -s "$outputLog" ] || [ "$attempts" -ge 30 ]; do
      sleep 1
      attempts=$((attempts + 1))
    done
    if [ ! -s "$outputLog" ]; then
      echo "No job status was written to ${outputLog}; treating the upgrade test as failed." >&2
      exit 1
    fi
    output=$(<"${outputLog}")
    echo "${outputLog}": "${output}"
    # "Complete" is successful job run.
    # Version 1.31 has "SuccessCriteriaMet" as the first completion status returned, or "FailureTarget" in case of failure.
    if [ "$output" == "Complete" ] || [ "$output" == "SuccessCriteriaMet" ] ; then
      continue
    else
      exit 1
    fi
  # This block executes when the process exits and pid status!=0
  else
    # $? still holds the exit status of the `wait` in the condition above.
    status=$?
    outputLog="${waitPids[$pid]}"
    echo "One of the upgrade tests pid $pid from cluster log $outputLog exited with a non-zero status ${status}."
    exit "$status"
  fi
done
echo "End of Upgrade Tests"
waitFor:
- wait-to-become-leader
- push-upgrade-test

# Cancel all orphaned e2e test Cloud Builds; if any of them fails to cancel, this whole build fails
- name: gcr.io/cloud-builders/gcloud
id: cancel-orphan-e2e-tests
waitFor: [e2e-wait-to-become-leader]
waitFor: [wait-to-become-leader]
script: |
#!/usr/bin/env bash
until gcloud builds list --ongoing --filter "tags:'e2e-test'" --format="value(id)" | xargs --no-run-if-empty gcloud builds cancel
Expand Down Expand Up @@ -386,7 +542,7 @@ steps:
#
- name: gcr.io/cloud-builders/gcloud
id: cleanup-services
waitFor: [e2e-wait-to-become-leader]
waitFor: [wait-to-become-leader]
allowFailure: true
entrypoint: bash
args:
Expand All @@ -400,14 +556,15 @@ steps:
done
substitutions:
_BASE_VERSION: 1.46.0
_CACHE_BUCKET: agones-build-cache
_HTMLTEST_CACHE_KEY: htmltest-0.10.1
_CPP_SDK_BUILD_CACHE_KEY: cpp-sdk-build
_CPP_SDK_CONFORMANCE_CACHE_KEY: cpp-sdk-conformance
_RUST_SDK_BUILD_CACHE_KEY: rust-sdk-build
_REGISTRY: us-docker.pkg.dev/${PROJECT_ID}/ci
tags: [ci, 'commit-${COMMIT_SHA}']
timeout: 18000s # 5h: 3h (e2e-wait-to-become-leader) + 1.5h (e2e timeout) + 0.5h (everything else)
timeout: 18000s # 5h: 3h (wait-to-become-leader) + 1.5h (e2e timeout) + 0.5h (everything else)
queueTtl: 259200s # 72h
images:
- ${_REGISTRY}/agones-controller
Expand Down
2 changes: 1 addition & 1 deletion test/sdk/go/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@ project_path := $(dir $(mkfile_path))
root_path = $(realpath $(project_path)/)
# Because go mod init in the Dockerfile installs the most recently released version of Agones, this
# will need to be built and pushed post-release. During DEV it will be built at DEV - 1.
release_version = 1.44.0
release_version = 1.45.0
server_tag := $(REGISTRY)/sdk-client-test:$(release_version)

# _____ _
Expand Down
34 changes: 14 additions & 20 deletions test/upgrade/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -12,50 +12,44 @@
# See the License for the specific language governing permissions and
# limitations under the License.

FROM gcr.io/cloud-builders/gcloud AS builder
FROM golang:1.22.9-alpine AS builder

RUN apt-get update && \
apt-get install -y curl && \
apt-get clean
# install curl
RUN apk update && \
apk upgrade && \
apk --no-cache add curl

WORKDIR /usr/local

# install kubectl
ENV KUBECTL_VER=1.29.7
ENV KUBECTL_VER=1.30.4
RUN curl -LO https://storage.googleapis.com/kubernetes-release/release/v${KUBECTL_VER}/bin/linux/amd64/kubectl && \
chmod go+rx ./kubectl && \
mv ./kubectl /usr/local/bin/kubectl

# install Helm package manager
ENV HELM_VER=3.14.3
ENV HELM_VER=3.16.3
ENV HELM_URL=https://get.helm.sh/helm-v${HELM_VER}-linux-amd64.tar.gz
RUN curl -L ${HELM_URL} > /tmp/helm.tar.gz \
&& tar -zxvf /tmp/helm.tar.gz -C /tmp \
&& mv /tmp/linux-amd64/helm /usr/local/bin/helm \
&& chmod go+rx /usr/local/bin/helm \
&& rm /tmp/helm.tar.gz && rm -rf /tmp/linux-amd64

# Build the Go image from source
FROM golang:1.22.6 AS build-stage

# Copy and build the Go application
WORKDIR /agones.dev

COPY *.go ./

COPY test/upgrade/main.go ./
RUN go mod init agones.dev/agones/test/upgrade/testContainer
RUN go mod tidy
RUN go mod download

RUN CGO_ENABLED=0 GOOS=linux go build -o /upgrade-test

# Copy the above binary into a lean image
FROM gcr.io/distroless/static-debian12:nonroot AS build-release-stage

# Copy the dev build Agones Helm chart
WORKDIR /

COPY --from=build-stage /upgrade-test /upgrade-test
COPY --from=builder /usr/local /usr/local

USER nonroot:nonroot
# Use a non-root user for security best practices
RUN adduser -D -g '' adduser
USER adduser
COPY --chown=adduser install/helm/agones /install/helm

ENTRYPOINT ["/upgrade-test"]
16 changes: 9 additions & 7 deletions test/upgrade/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -24,22 +24,24 @@
#

REGISTRY ?=
mkfile_path := $(abspath $(lastword $(MAKEFILE_LIST)))
project_path := $(dir $(mkfile_path))
root_path = $(realpath $(project_path)/)
dev_version = 1.44.0-dev
server_tag := $(REGISTRY)/upgrade-test-controller:$(dev_version)

base_version = 1.46.0
# VERSION defaults to the base version followed by "-dev-" and the short hash of the latest commit
VERSION ?= $(base_version)-dev-$(shell git rev-parse --short=7 HEAD)
server_tag := $(REGISTRY)/upgrade-test-controller:$(VERSION)
cwd:=$(shell dirname $(realpath $(lastword $(MAKEFILE_LIST))))
# _____ _
# |_ _|_ _ _ __ __ _ ___| |_ ___
# | |/ _` | '__/ _` |/ _ \ __/ __|
# | | (_| | | | (_| | __/ |_\__ \
# |_|\__,_|_| \__, |\___|\__|___/
# |___/

# Using .ONESHELL allows us to `cd` to the parent directory agones. This gives the Dockerfile the
# context of the agones directory, which allows it to COPY files from any child directory.
.ONESHELL:
# Build a docker image for the server, and tag it
build:
cd $(root_path) && docker build -f $(project_path)Dockerfile --tag=$(server_tag) .
cd "$(cwd)/../.." && DOCKER_BUILDKIT=1 docker build -f $(cwd)/Dockerfile --tag=$(server_tag) .

push: build
docker push $(server_tag)
67 changes: 67 additions & 0 deletions test/upgrade/evictablePods.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,67 @@
# Copyright 2024 Google LLC All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Create evictable pods to prevent Autopilot clusters from completely scaling down.
# https://cloud.google.com/kubernetes-engine/docs/how-to/capacity-provisioning
---
# Priority class used by the placeholder pods below: it has a negative
# priority value, is not the cluster default, and is set to never preempt
# other pods (preemptionPolicy: Never).
apiVersion: scheduling.k8s.io/v1
kind: PriorityClass
metadata:
name: low-priority
value: -10
preemptionPolicy: Never
globalDefault: false
description: "Low priority workloads"
---
# Deployment of placeholder pods that do nothing but sleep. They run with the
# low-priority class defined above.
apiVersion: apps/v1
kind: Deployment
metadata:
name: evictable-pods-deployment
spec:
# Number of placeholder pods to keep running.
replicas: 200
selector:
matchLabels:
app: evictable-pods
template:
metadata:
labels:
app: evictable-pods
# Label for use with packed game server pod affinity rules
agones.dev/role: gameserver
spec:
priorityClassName: low-priority
# No grace period, so a placeholder pod is removed immediately when it is terminated.
terminationGracePeriodSeconds: 0
containers:
- name: ubuntu
image: ubuntu
imagePullPolicy: IfNotPresent
# The container only sleeps; it exists to hold the requested resources.
command: ["sleep"]
args: ["infinity"]
# Requests and limits are identical for both memory and CPU.
resources:
requests:
memory: 52Mi
cpu: 30m
limits:
memory: 52Mi
cpu: 30m
# Use same affinity as packed game server pods
affinity:
podAffinity:
# A scheduling preference, not a hard requirement: prefer nodes (by hostname)
# that already run pods labelled agones.dev/role: gameserver.
preferredDuringSchedulingIgnoredDuringExecution:
- podAffinityTerm:
labelSelector:
matchLabels:
agones.dev/role: gameserver
topologyKey: kubernetes.io/hostname
weight: 100
Loading

0 comments on commit 171def9

Please sign in to comment.