Skip to content

Orquesta CI

Orquesta CI #4865

# We run orquesta integration tests as part of a separate workflow.
# Orquesta tests have a lot of race conditions which result in intermediate failures and timeouts.
# Utilizing separate workflow allows us to re-run just this workflow / job on failure instead of
# wasting time and resources by needing to re-run all the jobs.
name: Orquesta CI
on:
push:
branches:
# only on merges to master branch
- master
# and version branches, which only include minor versions (eg: v3.4)
- v[0-9]+.[0-9]+
tags:
# also version tags, which include bugfix releases (eg: v3.4.0)
- v[0-9]+.[0-9]+.[0-9]+
pull_request:
types: [opened, reopened, synchronize]
branches:
# Only for PRs targeting those branches
- master
- v[0-9]+.[0-9]+
schedule:
# run every night at midnight
- cron: '0 0 * * *'
jobs:
# TODO: Fix the required checks!
# When the pre_job triggers and skips builds, it prevents merging the PR because
# the required checks are reported as skipped instead of passed.
# Special job which automatically cancels old runs for the same branch, prevents runs for the
# same file set which has already passed, etc.
pre_job:
name: Skip Duplicate Jobs Pre Job
runs-on: ubuntu-20.04
outputs:
should_skip: ${{ steps.skip_check.outputs.should_skip }}
steps:
- id: skip_check
uses: fkirc/skip-duplicate-actions@4c656bbdb6906310fa6213604828008bc28fe55d # v3.3.0
with:
cancel_others: 'true'
github_token: ${{ github.token }}
integration-tests:
needs: pre_job
# NOTE: We always want to run job on master since we run some additional checks there (code
# coverage, etc)
# if: ${{ needs.pre_job.outputs.should_skip != 'true' || github.ref == 'refs/heads/master' }}
name: '${{ matrix.make.name }} - Python ${{ matrix.python.version-short }}'
runs-on: ubuntu-20.04
strategy:
fail-fast: false
matrix:
# NOTE: We need to use full Python version as part of Python deps cache key otherwise
# setup virtualenv step will fail.
python:
- {version-short: '3.8', version: '3.8.10'}
- {version-short: '3.9', version: '3.9.14'}
- {version-short: '3.10', version: '3.10.15'}
- {version-short: '3.11', version: '3.11.10'}
make:
- name: 'Integration Tests (Orquesta)'
task: 'ci-orquesta'
shard: {k: 0, n: 1}
services:
mongo:
image: mongo:7.0
ports:
- 27017:27017
rabbitmq:
image: rabbitmq:3.8-management
options: >-
--name rabbitmq
ports:
- 5671:5671/tcp # AMQP SSL port
- 5672:5672/tcp # AMQP standard port
- 15672:15672/tcp # Management: HTTP, CLI
redis:
# Docker Hub image
image: redis
# Set health checks to wait until redis has started
options: >-
--name "redis"
--health-cmd "redis-cli ping"
--health-interval 10s
--health-timeout 5s
--health-retries 5
ports:
- 6379:6379/tcp
env:
TASK: '${{ matrix.make.task }}'
NODE_TOTAL: '${{ matrix.make.shard.n }}'
NODE_INDEX: '${{ matrix.make.shard.k }}'
# We need to explicitly specify terminal width otherwise some CLI tests fail on container
# environments where small terminal size is used.
COLUMNS: '120'
# CI st2.conf (with ST2_CI_USER user instead of stanley)
ST2_CONF: 'conf/st2.ci.conf'
# Tell StackStorm that we are indeed in CI mode, previously we hard coded a Travis specific
# environment variable in our test code, making it a PITA when we switch CI providers.
# Now, we simply set this environment varible here in the CI portion of our testing and
# it avoids any CI provider type lock-in.
ST2_CI: 'true'
# Name of the user who is running the CI (on GitHub Actions this is 'runner')
ST2_CI_USER: 'runner'
# GitHub is juggling how to set vars for multiple shells. Protect our PATH assumptions.
PATH: /home/runner/.local/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin
steps:
- name: Checkout repository
uses: actions/checkout@v4
- name: Custom Environment Setup
run: |
./scripts/github/setup-environment.sh
- name: 'Set up Python (${{ matrix.python.version }}) and Cache Deps'
uses: ./.github/actions/setup-python
with:
python-version: '${{ matrix.python.version }}'
- name: Cache and Install APT Dependencies
uses: ./.github/actions/apt-packages
- name: Install virtualenv
run: |
./scripts/github/install-virtualenv.sh
- name: Install requirements
run: |
./scripts/ci/install-requirements.sh
- name: Setup Integration Tests
run: |
# prep a ci-specific dev conf file that uses runner instead of stanley
# this user is the username of the user in GitHub actions, used for SSH, etc during
# integration tests (important)
cp conf/st2.dev.conf "${ST2_CONF}" ; sed -i -e "s/stanley/${ST2_CI_USER}/" "${ST2_CONF}"
sudo -E ./scripts/ci/add-itest-user-key.sh
- name: Permissions Workaround
run: |
echo "$ST2_CI_REPO_PATH"
sudo ST2_CI_REPO_PATH="${ST2_CI_REPO_PATH}" scripts/ci/permissions-workaround.sh
- name: Print versions
run: |
./scripts/ci/print-versions.sh
- name: make
timeout-minutes: 41
env:
MAX_ATTEMPTS: 3
RETRY_DELAY: 5
# use: script -e -c to print colors
run: |
# There is a race in some orequesta integration tests so they tend to fail quite often.
# To avoid needed to re-run whole workflow in such case, we should try to retry this
# specific step. This saves us a bunch of time manually re-running the whole workflow.
# TODO: Try to identify problematic tests (iirc mostly orquesta ones) and only retry /
# re-run those.
set +e
for i in $(seq 1 ${MAX_ATTEMPTS}); do
echo "Attempt: ${i}/${MAX_ATTEMPTS}"
script -e -c "timeout 10m make ${TASK}" && exit 0
exit_code=$?
echo "Command failed / timed out (exit_code=${exit_code}), will retry in ${RETRY_DELAY} seconds..."
sleep ${RETRY_DELAY}
done
set -e
echo "Failed after ${MAX_ATTEMPTS} attempts, failing the job."
exit 1
- name: Compress Service Logs Before upload
if: ${{ failure() }}
run: |
./tools/launchdev.sh stop # stop st2 before collecting logs
tar cvzpf logs.tar.gz logs/*
- name: Upload StackStorm services Logs
if: ${{ failure() }}
uses: actions/upload-artifact@v4
with:
name: logs-py${{ matrix.python.version }}
path: logs.tar.gz
retention-days: 7
slack-notification:
name: Slack notification for failed master builds
if: always()
needs:
- integration-tests
runs-on: ubuntu-20.04
steps:
- name: Workflow conclusion
# this step creates an environment variable WORKFLOW_CONCLUSION and is the most reliable way to check the status of previous jobs
uses: technote-space/workflow-conclusion-action@v2
- name: CI Run Failure Slack Notification
if: ${{ env.WORKFLOW_CONCLUSION == 'failure' && github.ref == 'refs/heads/master' }}
env:
SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }}
uses: voxmedia/github-action-slack-notify-build@v1
with:
channel: development
status: FAILED
color: danger
# HELPER FOR FUTURE DEVELOPERS:
# If your GitHub Actions job is failing and you need to debug it, by default there is
# no way to SSH into the container.
# The step below can be uncommeted and will stop here and allow you to SSH in.
# When this step is reached, simply refresh the GitHub Actions output for this build
# and this SSH command will be printed every 5 seconds to the output.
# Once you are done debugging in your SSH session, simply: touch /continue
# and this will continue the build.
#
# - name: Setup tmate session for debugging failed jobs (allows SSH into the container)
# uses: mxschmitt/action-tmate@v3
# if: "${{ failure() }}"
#