Skip to content

Commit

Permalink
ci: add tests for fluxqueue
Browse files Browse the repository at this point in the history
Signed-off-by: vsoch <[email protected]>
  • Loading branch information
vsoch committed Jan 18, 2025
1 parent d6d7535 commit 99cdf74
Show file tree
Hide file tree
Showing 12 changed files with 524 additions and 104 deletions.
102 changes: 102 additions & 0 deletions .github/test.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,102 @@
#!/bin/bash

set -eEu -o pipefail

# Keep track of root directory to return to
here=$(pwd)
# Positional arguments: $1 = container registry, $2 = namespace of the deployment
registry=${1:-ghcr.io/converged-computing}
namespace=${2:-fluxqueue-system}

# These containers should already be loaded into the cluster
echo "Sleeping 20 seconds waiting for images to deploy"
sleep 20
kubectl get pods -n "${namespace}"

# Get pod for controller, scheduler, and postgres.
# Take a single snapshot of the pod list so all three indices refer to the
# same listing, and quote the jq filters so the shell never treats "[0]" as
# a glob pattern. "// empty" turns a missing entry into an empty string
# instead of the literal text "null".
# NOTE(review): the index -> component mapping relies on the order in which
# the API returns the pods; confirm that order is stable for this deployment.
pods_json=$(kubectl get pods -n "${namespace}" -o json)
controller_pod=$(jq -r '.items[0].metadata.name // empty' <<<"${pods_json}")
echo "Found fluxqueue controller pod: ${controller_pod}"
scheduler_pod=$(jq -r '.items[1].metadata.name // empty' <<<"${pods_json}")
echo "Found fluxqueue scheduler pod: ${scheduler_pod}"
postgres_pod=$(jq -r '.items[2].metadata.name // empty' <<<"${pods_json}")
echo "Found fluxqueue postgres pod: ${postgres_pod}"

# Fail early with a clear message when fewer than three pods were listed
if [[ -z "${controller_pod}" || -z "${scheduler_pod}" || -z "${postgres_pod}" ]]; then
  echo "Expected three pods in namespace ${namespace} but could not find all of them" >&2
  exit 1
fi

# Print a command line (prefixed with a star) and then execute it.
# Arguments: the command followed by its arguments, exactly as they should run.
# Outputs:   the echoed command line, then whatever the command itself prints.
# Returns:   the exit status of the executed command.
function echo_run {
  # "$*" is used only for display. Executing "$@" keeps every argument
  # intact, so values containing spaces or glob characters are not re-split
  # or expanded the way a flattened string would be.
  echo "⭐️ $*"
  "$@"
}


# Dump the logs of every component up front so a later failure is easier to debug.
# The controller pod is queried once per container (manager, fluxion).
for container in manager fluxion; do
  echo
  echo_run kubectl logs -n "${namespace}" "${controller_pod}" -c "${container}"
done

# The scheduler and postgres pods are queried without a container selector;
# each dump is preceded by two blank lines.
for log_pod in "${scheduler_pod}" "${postgres_pod}"; do
  echo
  echo
  echo_run kubectl logs -n "${namespace}" "${log_pod}"
done

# Shared function to check output.
# Arguments:
#   $1 - name of the check, reported when the check fails
#   $2 - actual value
#   $3 - expected value
# Outputs:   on mismatch, writes the check name and both values to stderr.
# Returns:   0 when the values are identical; otherwise exits the whole
#            script with status 1.
function check_output {
  local check_name="$1"
  local actual="$2"
  local expected="$3"
  if [[ "${expected}" != "${actual}" ]]; then
    # Name the failing check so the CI log shows which assertion tripped
    echo "Check '${check_name}' failed" >&2
    echo "Expected output is ${expected}" >&2
    echo "Actual output is ${actual}" >&2
    exit 1
  fi
}

# Single pod scenario: create it, give it a moment to start, then list pods
echo_run kubectl apply -f ./examples/pod.yaml
sleep 3
echo_run kubectl get pods

# Read back which scheduler the pod names and which phase the pod reports
scheduled_by=$(kubectl get pod pod -o json | jq -r '.spec.schedulerName')
pod_status=$(kubectl get pods pod --no-headers -o custom-columns=":status.phase")
printf '\n Pod Status: %s\n Scheduled by: %s\n' "${pod_status}" "${scheduled_by}"

# The pod must name FluxionScheduler as its scheduler and be in phase Running
check_output 'check-pod-scheduled-by' "${scheduled_by}" "FluxionScheduler"
check_output 'check-pod-status' "${pod_status}" "Running"

# Remove the pod again and confirm that no pods are left in the current namespace
echo_run kubectl delete -f ./examples/pod.yaml
sleep 2
pods_running=$(kubectl get pods -o json | jq -r '.items | length')
echo " Pods Running: ${pods_running}"
check_output 'check-pod-deleted' "${pods_running}" "0"

# Do the same for a job
echo_run kubectl apply -f ./examples/job.yaml
sleep 3
echo_run kubectl get pods

# Collect the job pod names first. Assigning the command substitution to a
# variable (instead of expanding it directly in the "for" word list) lets
# "set -e -o pipefail" abort the script when kubectl or jq fails.
job_pods=$(kubectl get pods -o json | jq -r '.items[].metadata.name')

# Without this guard an empty pod list would skip the loop below and the
# job test would pass without checking anything.
if [[ -z "${job_pods}" ]]; then
  echo "No job pods were found to check" >&2
  exit 1
fi

# Check every job pod.
# Kubernetes pod names cannot contain whitespace, so word-splitting the
# newline-separated list here is intentional.
for pod in ${job_pods}; do
  echo "Checking job pod ${pod}"
  scheduled_by=$(kubectl get pod "${pod}" -o json | jq -r '.spec.schedulerName')
  pod_status=$(kubectl get pods "${pod}" --no-headers -o custom-columns=":status.phase")
  echo
  echo " Pod Status: ${pod_status}"
  echo " Scheduled by: ${scheduled_by}"
  check_output 'check-pod-scheduled-by' "${scheduled_by}" "FluxionScheduler"
  check_output 'check-pod-status' "${pod_status}" "Running"
done


# Now delete the job - we are done!
echo_run kubectl delete -f ./examples/job.yaml
sleep 2
pods_running=$(kubectl get pods -o json | jq -r '.items | length')
echo " Pods Running: ${pods_running}"
check_output 'check-pod-deleted' "${pods_running}" "0"
115 changes: 115 additions & 0 deletions .github/workflows/build-deploy.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,115 @@
# Workflow that builds the fluxqueue container images and, outside of pull
# requests, pushes them to ghcr.io.
name: fluxqueue build-and deploy

# Triggers: pull requests, published releases, and pushes
# (presumably limited to the main branch; nesting is not visible in this view).
on:
pull_request: {}
release:
types: [published]
push:
branches:
- main

# One job per image; the jobs share the same build / tag / login / push sequence.
jobs:
# Builds the image named in env.container (ghcr.io/converged-computing/fluxqueue).
# The login and push steps are skipped on pull requests, so pull requests only
# verify that the image builds.
# NOTE(review): "packages: write" is presumably what allows the push to GHCR - confirm.
build-fluxqueue:
permissions:
packages: write
env:
container: ghcr.io/converged-computing/fluxqueue
runs-on: ubuntu-latest
name: build fluxqueue controller
steps:
- uses: actions/checkout@v4
- uses: actions/setup-go@v4
with:
go-version: ^1.22

# NOTE(review): the tag step below expects this target to produce
# <container>:latest - confirm against the Makefile.
- name: Build Container
run: make docker-build

# Release events only: add a second tag derived from the git ref
# (GITHUB_REF with the "refs/tags/" prefix stripped).
- name: Tag Release Image
if: (github.event_name == 'release')
run: |
tag=${GITHUB_REF#refs/tags/}
echo "Tagging and releasing ${{ env.container}}:${tag}"
docker tag ${{ env.container }}:latest ${{ env.container }}:${tag}
- name: GHCR Login
if: (github.event_name != 'pull_request')
uses: docker/login-action@v2
with:
registry: ghcr.io
username: ${{ github.actor }}
password: ${{ secrets.GITHUB_TOKEN }}

# Pushes every local tag of the image (--all-tags), i.e. latest plus the
# release tag when one was added above.
- name: Deploy Container
if: (github.event_name != 'pull_request')
run: docker push ${{ env.container }} --all-tags

# Builds the image named in env.container (ghcr.io/converged-computing/fluxqueue-postgres).
# Same sequence as the other build jobs: build, tag on release, then login and
# push for every event except pull requests.
build-postgres:
permissions:
packages: write
env:
container: ghcr.io/converged-computing/fluxqueue-postgres
runs-on: ubuntu-latest
name: build postgres
steps:
- uses: actions/checkout@v4
- uses: actions/setup-go@v4
with:
go-version: ^1.22

# NOTE(review): the tag step below expects this target to produce
# <container>:latest - confirm against the Makefile.
- name: Build Container
run: make build-postgres

# Release events only: add a second tag derived from the git ref
# (GITHUB_REF with the "refs/tags/" prefix stripped).
- name: Tag Release Image
if: (github.event_name == 'release')
run: |
tag=${GITHUB_REF#refs/tags/}
echo "Tagging and releasing ${{ env.container}}:${tag}"
docker tag ${{ env.container }}:latest ${{ env.container }}:${tag}
- name: GHCR Login
if: (github.event_name != 'pull_request')
uses: docker/login-action@v2
with:
registry: ghcr.io
username: ${{ github.actor }}
password: ${{ secrets.GITHUB_TOKEN }}

# Pushes every local tag of the image (--all-tags).
- name: Deploy Container
if: (github.event_name != 'pull_request')
run: docker push ${{ env.container }} --all-tags

# Builds the image named in env.container (ghcr.io/converged-computing/fluxqueue-scheduler).
# Same sequence as the other build jobs: build, tag on release, then login and
# push for every event except pull requests.
build-scheduler:
permissions:
packages: write
env:
container: ghcr.io/converged-computing/fluxqueue-scheduler
runs-on: ubuntu-latest
name: build scheduler
steps:
- uses: actions/checkout@v4
- uses: actions/setup-go@v4
with:
go-version: ^1.22

# NOTE(review): the tag step below expects this target to produce
# <container>:latest - confirm against the Makefile.
- name: Build Container
run: make build-scheduler

# Release events only: add a second tag derived from the git ref
# (GITHUB_REF with the "refs/tags/" prefix stripped).
- name: Tag Release Image
if: (github.event_name == 'release')
run: |
tag=${GITHUB_REF#refs/tags/}
echo "Tagging and releasing ${{ env.container}}:${tag}"
docker tag ${{ env.container }}:latest ${{ env.container }}:${tag}
- name: GHCR Login
if: (github.event_name != 'pull_request')
uses: docker/login-action@v2
with:
registry: ghcr.io
username: ${{ github.actor }}
password: ${{ secrets.GITHUB_TOKEN }}

# Pushes every local tag of the image (--all-tags).
- name: Deploy Container
if: (github.event_name != 'pull_request')
run: docker push ${{ env.container }} --all-tags
43 changes: 43 additions & 0 deletions .github/workflows/e2e-test.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
# End-to-end test workflow: creates a kind cluster, installs cert-manager,
# deploys fluxqueue with hack/quick-build-kind.sh and runs .github/test.sh.
name: fluxqueue test

# Runs on pull requests and on manual dispatch.
on:
pull_request: {}
workflow_dispatch:

# The registry and namespace values are passed as the two positional arguments
# to both the deploy script and the test script.
# NOTE(review): the job display name is "build containers" although the job id
# is test-fluxqueue - confirm this is intended.
# NOTE(review): the kind action reference below appears mangled by e-mail
# obfuscation in this view - confirm the pinned version in the repository.
jobs:
test-fluxqueue:
env:
registry: ghcr.io/converged-computing
namespace: fluxqueue-system

runs-on: ubuntu-latest
name: build containers
steps:
- uses: actions/checkout@v4
- uses: actions/setup-go@v4
with:
go-version: ^1.22

# Deletes the dotnet, android and ghc directories on the runner; going by the
# step name, this is to free disk space for the build.
- name: Make Space For Build
run: |
sudo rm -rf /usr/share/dotnet
sudo rm -rf /usr/local/lib/android
sudo rm -rf /opt/ghc
- name: Create Kind Cluster
uses: helm/[email protected]
with:
cluster_name: kind
kubectl_version: v1.28.2
version: v0.20.0
config: ./examples/kind-config.yaml

# Applies the cert-manager v1.16.2 manifest, then waits a fixed 20 seconds
# (a fixed delay, not a readiness check).
- name: Install Cert Manager
run: |
kubectl apply -f https://github.com/cert-manager/cert-manager/releases/download/v1.16.2/cert-manager.yaml
sleep 20
- name: Deploy Fluxqueue
run: ./hack/quick-build-kind.sh ${{ env.registry }} ${{ env.namespace }}
# Runs the end-to-end script with the same registry and namespace arguments.
- name: Test Fluxqueue
run: /bin/bash ./.github/test.sh ${{ env.registry }} ${{ env.namespace }}
47 changes: 47 additions & 0 deletions .github/workflows/helm.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
# Workflow that builds the Helm chart and, outside of pull requests, packages
# it and pushes it to oci://ghcr.io/converged-computing/fluxqueue-helm.
name: fluxqueue-helm

# Triggers: pull requests, pushes (presumably limited to main; nesting is not
# visible in this view), and manual dispatch.
on:
pull_request: {}
push:
branches:
- main
workflow_dispatch:

# NOTE(review): this job pins actions/setup-go@v3 while the other workflows in
# this commit pin actions/setup-go@v4 - confirm whether the difference is intended.
jobs:
build:
runs-on: ubuntu-latest
name: Prepare chart
steps:
- name: Checkout Repository
uses: actions/checkout@v4
- uses: actions/setup-go@v3
with:
go-version: ^1.22
- name: GHCR Login
if: (github.event_name != 'pull_request')
uses: docker/login-action@v2
with:
registry: ghcr.io
username: ${{ github.actor }}
password: ${{ secrets.GITHUB_TOKEN }}

# Downloads the get-helm-3 install script from the helm repository's main
# branch and pipes it to bash.
- name: Install Helm
run: |
export HELM_EXPERIMENTAL_OCI=1
curl https://raw.githubusercontent.com/helm/helm/main/scripts/get-helm-3 | bash
- name: Build chart
run: make helm

# Skipped on pull requests. Despite its name, this step also packages ./chart
# and pushes the resulting archive; the password is supplied on stdin
# (--password-stdin) rather than as a command-line argument.
- name: Login to Helm
if: (github.event_name != 'pull_request')
env:
registry: ghcr.io
username: ${{ github.actor }}
password: ${{ secrets.GITHUB_TOKEN }}
run: |
echo "${password}" | helm registry login -u ${username} --password-stdin ${registry}
PKG_RESPONSE=$(helm package ./chart)
echo "$PKG_RESPONSE"
CHART_TAR_GZ=$(basename "$PKG_RESPONSE")
helm push "$CHART_TAR_GZ" oci://ghcr.io/converged-computing/fluxqueue-helm
1 change: 1 addition & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -233,6 +233,7 @@ SELECT * from reservations;

### TODO

- [ ] Need to clean up: handle the FluxJob object so it doesn't keep reconciling. Likely we want to delete it at some point.
- [ ] In the case of jobs that are changing (e.g., pods deleting, but we don't want to kill entire job) what should we do?
- we need to use shrink here. And a shrink down to size 0 I assume is a cancel.
- [ ] For cancel, we would issue a cancel for every pod associated with a job. How can we avoid that (or is that OK?)
Expand Down
Loading

0 comments on commit 99cdf74

Please sign in to comment.