From 9c899dcc0d85c95e267468bd30d006a0bb4f423c Mon Sep 17 00:00:00 2001
From: Oleg S <97077423+RobotSail@users.noreply.github.com>
Date: Thu, 19 Sep 2024 11:44:56 -0400
Subject: [PATCH 1/6] chore: add exit code & tox fix

Currently, the training library does not exit when an error is encountered
within the training loop (invoked through torchrun). This commit updates
that functionality so we correctly return an exit code of 1 on child failure.

Additionally, this commit adds the `make fix` command, which
automatically fixes all trivial issues picked up by ruff.

Signed-off-by: Oleg S <97077423+RobotSail@users.noreply.github.com>
---
 src/instructlab/training/main_ds.py | 15 ++++++++++-----
 tox.ini                             |  1 -
 2 files changed, 10 insertions(+), 6 deletions(-)

diff --git a/src/instructlab/training/main_ds.py b/src/instructlab/training/main_ds.py
index c5cdb2ba..d1ae0e01 100644
--- a/src/instructlab/training/main_ds.py
+++ b/src/instructlab/training/main_ds.py
@@ -761,6 +761,7 @@ def run_training(torch_args: TorchrunArgs, train_args: TrainingArgs) -> None:
     print(f"\033[92mRunning training command as subprocess: {' '.join(command)}\033[0m")
     process = None
     interrupt: KeyboardInterrupt | Exception | None = None
+    failure = False
     try:
         process = StreamablePopen(
             f"{train_args.ckpt_output_dir}/full_logs_global{torch_args.node_rank}.log",
@@ -771,19 +772,20 @@ def run_training(torch_args: TorchrunArgs, train_args: TrainingArgs) -> None:
         print("Training subprocess interrupted by user.")
         interrupt = e
     except Exception as e:
-        print(f"An error occurred: {str(e)}")
+        print("Unexpected exception received during distributed training")
         interrupt = e
     finally:
         if "process" not in locals() or process is None:
             return
-        if process.poll() == 0:
-            print("\033[92mTraining subprocess exited successfully! 🎉\033[0m")
+
+        failure = process.poll() != 0
+        if not failure:
+            print("\033[92mOperation completed successfully! 🎉\033[0m")
         else:
             print(
                 "\033[91mTraining subprocess has not exited yet. Sending SIGTERM.\033[0m"
             )
 
-        print("Sending interrupt signal to Training subprocess.")
         process.terminate()
         try:
             print("Waiting for process to exit, 60s...")
@@ -795,8 +797,11 @@ def run_training(torch_args: TorchrunArgs, train_args: TrainingArgs) -> None:
             process.kill()
 
         if interrupt:
-            print(f"Error caught from training subprocess.: {interrupt}")
             raise interrupt
+        if failure:
+            raise RuntimeError(
+                "Suffered a failure during distributed training. Please see the training logs for more context."
+            )
 
 
 if __name__ == "__main__":
diff --git a/tox.ini b/tox.ini
index b6b625cd..5756b665 100644
--- a/tox.ini
+++ b/tox.ini
@@ -66,7 +66,6 @@ commands =
     sh -c 'git diff --exit-code || (echo "pyproject.toml formatting is incorrect. Please run \"make toml-fmt\" and commit the changes." && exit 1)'
 allowlist_externals = make, sh
 
-
 [testenv:spellcheck]
 description = spell check (needs 'aspell' command)
 basepython = {[testenv:py3]basepython}

From af89deded32582e76663eff4fac3fbccfa14ebe6 Mon Sep 17 00:00:00 2001
From: Nathan Weinberg <nweinber@redhat.com>
Date: Mon, 21 Oct 2024 13:25:39 -0400
Subject: [PATCH 2/6] ci: grant HF_TOKEN access to the medium-size E2E CI job

Signed-off-by: Nathan Weinberg <nweinber@redhat.com>
---
 .github/workflows/e2e-nvidia-a10g-x1.yml | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/.github/workflows/e2e-nvidia-a10g-x1.yml b/.github/workflows/e2e-nvidia-a10g-x1.yml
index 3485ad73..ddc4cb81 100644
--- a/.github/workflows/e2e-nvidia-a10g-x1.yml
+++ b/.github/workflows/e2e-nvidia-a10g-x1.yml
@@ -137,6 +137,8 @@ jobs:
 
       - name: Run e2e test
         working-directory: ./instructlab
+        env:
+          HF_TOKEN: ${{ secrets.HF_TOKEN }}
         run: |
           . venv/bin/activate
           ./scripts/e2e-ci.sh -m
@@ -155,6 +157,7 @@ jobs:
           aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }}
           aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
           aws-region: ${{ secrets.AWS_REGION }}
+
       - name: Stop EC2 runner
         uses: machulav/ec2-github-runner@fcfb31a5760dad1314a64a0e172b78ec6fc8a17e # v2.3.6
         with:

From e49809b3bd12d21610b3f9ebd325e4d6004b9c84 Mon Sep 17 00:00:00 2001
From: Nathan Weinberg <nweinber@redhat.com>
Date: Mon, 21 Oct 2024 13:24:43 -0400
Subject: [PATCH 3/6] ci: add large-size E2E CI job

This commit adds a new workflow to the Training repo.
It will run a nightly cron job to test the current
'main' branch of Training against the current
'main' branch of the CLI (instructlab).

Signed-off-by: Nathan Weinberg <nweinber@redhat.com>
---
 .github/workflows/e2e-nvidia-l40s-x4.yml | 231 +++++++++++++++++++++++
 1 file changed, 231 insertions(+)
 create mode 100644 .github/workflows/e2e-nvidia-l40s-x4.yml

diff --git a/.github/workflows/e2e-nvidia-l40s-x4.yml b/.github/workflows/e2e-nvidia-l40s-x4.yml
new file mode 100644
index 00000000..e7ff227a
--- /dev/null
+++ b/.github/workflows/e2e-nvidia-l40s-x4.yml
@@ -0,0 +1,231 @@
+# SPDX-License-Identifier: Apache-2.0
+
+name: E2E (NVIDIA L40S x4)
+
+on:
+  schedule:
+    - cron: '0 16 * * *' # Runs at 4PM UTC every day
+  workflow_dispatch:
+    inputs:
+      pr_or_branch:
+        description: 'pull request number or branch name'
+        required: true
+        default: 'main'
+
+jobs:
+  start-large-ec2-runner:
+    runs-on: ubuntu-latest
+    outputs:
+      label: ${{ steps.start-ec2-runner.outputs.label }}
+      ec2-instance-id: ${{ steps.start-ec2-runner.outputs.ec2-instance-id }}
+    steps:
+      - name: Configure AWS credentials
+        uses: aws-actions/configure-aws-credentials@e3dd6a429d7300a6a4c196c26e071d42e0343502 # v4.0.2
+        with:
+          aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }}
+          aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
+          aws-region: ${{ secrets.AWS_REGION }}
+
+      - name: Start EC2 runner
+        id: start-ec2-runner
+        uses: machulav/ec2-github-runner@fcfb31a5760dad1314a64a0e172b78ec6fc8a17e # v2.3.6
+        with:
+          mode: start
+          github-token: ${{ secrets.GH_PERSONAL_ACCESS_TOKEN }}
+          ec2-image-id: ami-01a89eee1adde309c
+          ec2-instance-type: g6e.12xlarge
+          subnet-id: subnet-024298cefa3bedd61
+          security-group-id: sg-06300447c4a5fbef3
+          iam-role-name: instructlab-ci-runner
+          aws-resource-tags: >
+            [
+              {"Key": "Name", "Value": "instructlab-ci-github-large-runner"},
+              {"Key": "GitHubRepository", "Value": "${{ github.repository }}"},
+              {"Key": "GitHubRef", "Value": "${{ github.ref }}"},
+              {"Key": "GitHubPR", "Value": "${{ github.event.number }}"}
+            ]
+
+  e2e-large-test:
+    needs:
+      - start-large-ec2-runner
+    runs-on: ${{ needs.start-large-ec2-runner.outputs.label }}
+
+    permissions:
+      pull-requests: write
+
+    steps:
+      - name: Install Packages
+        run: |
+          cat /etc/os-release
+          sudo dnf install -y gcc gcc-c++ make git python3.11 python3.11-devel
+
+      - name: Checkout instructlab/instructlab
+        uses: actions/checkout@eef61447b9ff4aafe5dcd4e0bbf5d482be7e7871 # v4.2.1
+        with:
+          repository: "instructlab/instructlab"
+          path: "instructlab"
+          # https://github.com/actions/checkout/issues/249
+          fetch-depth: 0
+  
+      - name: Checkout instructlab/training
+        uses: actions/checkout@eef61447b9ff4aafe5dcd4e0bbf5d482be7e7871 # v4.2.1
+        with:
+          repository: "instructlab/training"
+          path: "training"
+          # https://github.com/actions/checkout/issues/249
+          fetch-depth: 0
+
+      - name: Determine if pr_or_branch is a PR number
+        id: check_pr
+        run: |
+          PR_OR_BRANCH=${{ github.event.inputs.pr_or_branch || 'main' }} # Default to 'main' if not set
+          if [[ "$PR_OR_BRANCH" =~ ^[0-9]+$ ]]; then
+            echo "is_pr=true" >> "$GITHUB_OUTPUT"
+          else
+            echo "is_pr=false" >> "$GITHUB_OUTPUT"
+          fi
+          echo "pr_or_branch=$PR_OR_BRANCH" >> "$GITHUB_OUTPUT"
+
+      - name: Check if gh cli is installed
+        id: gh_cli
+        run: |
+          if command -v gh &> /dev/null ; then
+            echo "gh_cli_installed=true" >> "$GITHUB_OUTPUT"
+          else
+            echo "gh_cli_installed=false" >> "$GITHUB_OUTPUT"
+          fi
+
+      - name: Install gh CLI
+        if: steps.gh_cli.outputs.gh_cli_installed == 'false'
+        run: |
+          sudo dnf install 'dnf-command(config-manager)' -y
+          sudo dnf config-manager --add-repo https://cli.github.com/packages/rpm/gh-cli.repo
+          sudo dnf install gh --repo gh-cli -y
+
+      - name: test gh CLI
+        run: |
+          gh --version
+
+      - name: set default repo
+        run: |
+          gh repo set-default ${{ github.server_url }}/${{ github.repository }}
+        env:
+          GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+
+      - name: Add comment to PR
+        if: steps.check_pr.outputs.is_pr == 'true'
+        run: |
+          gh pr comment "${{ steps.check_pr.outputs.pr_or_branch }}" -b "${{ github.workflow }} workflow launched on this PR: [View run](${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }})"
+        env:
+          GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+
+      - name: Fetch and checkout PR
+        if: steps.check_pr.outputs.is_pr == 'true'
+        run: |
+          gh pr checkout ${{ steps.check_pr.outputs.pr_or_branch }}
+        env:
+          GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+
+      - name: Checkout branch
+        if: steps.check_pr.outputs.is_pr == 'false'
+        run: |
+          git checkout ${{ steps.check_pr.outputs.pr_or_branch }}
+
+      - name: Install ilab
+        working-directory: ./instructlab
+        run: |
+          export CUDA_HOME="/usr/local/cuda"
+          export LD_LIBRARY_PATH="$LD_LIBRARY_PATH:/usr/local/cuda/lib64:/usr/local/cuda/extras/CUPTI/lib64"
+          export PATH="$PATH:$CUDA_HOME/bin"
+          python3.11 -m venv --upgrade-deps venv
+          . venv/bin/activate
+          nvidia-smi
+          python3.11 -m pip cache remove llama_cpp_python
+
+          CMAKE_ARGS="-DLLAMA_CUDA=on" python3.11 -m pip install .
+
+          # https://github.com/instructlab/instructlab/issues/1821
+          # install with Torch and build dependencies installed
+          python3.11 -m pip install packaging wheel setuptools-scm
+          python3.11 -m pip install .[cuda] -r requirements-vllm-cuda.txt
+
+      - name: Update instructlab-training library
+        working-directory: ./training
+        run: |
+          . ../instructlab/venv/bin/activate
+          pip install .
+          pip install .[cuda]
+
+      - name: Check disk
+        run: |
+          df -h
+
+      - name: Run e2e test
+        working-directory: ./instructlab
+        env:
+          HF_TOKEN: ${{ secrets.HF_TOKEN }}
+        run: |
+          . venv/bin/activate
+          ./scripts/e2e-ci.sh -l
+
+      - name: Add comment to PR if the workflow failed
+        if: failure() && steps.check_pr.outputs.is_pr == 'true'
+        run: |
+          gh pr comment "${{ steps.check_pr.outputs.pr_or_branch }}" -b "e2e workflow failed on this PR: [View run](${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}), please investigate."
+        env:
+          GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+
+      - name: Add comment to PR if the workflow succeeded
+        if: success() && steps.check_pr.outputs.is_pr == 'true'
+        run: |
+          gh pr comment "${{ steps.check_pr.outputs.pr_or_branch }}" -b "e2e workflow succeeded on this PR: [View run](${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}), congrats!"
+        env:
+          GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+
+      - name: Post job results to Slack if the workflow failed
+        if: failure() && steps.check_pr.outputs.is_pr == 'false'
+        id: slack-report-failure
+        uses: slackapi/slack-github-action@37ebaef184d7626c5f204ab8d3baff4262dd30f0 # v1.27.0
+        with:
+          # Slack channel id, channel name, or user id to post message.
+          # See also: https://api.slack.com/methods/chat.postMessage#channels
+          # You can pass in multiple channels to post to by providing a comma-delimited list of channel IDs.
+          channel-id: 'e2e-ci-results'
+          slack-message: "*e2e-nvidia-l40s-x4* job in *${{ github.repository }}* running on branch `${{ steps.check_pr.outputs.pr_or_branch }}` completed *with failures* :meow_sad-rain: | ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}"
+        env:
+          SLACK_BOT_TOKEN: ${{ secrets.SON_OF_JEEVES_TOKEN }}
+
+      - name: Post job results to Slack if the workflow succeeded
+        if: success() && steps.check_pr.outputs.is_pr == 'false'
+        id: slack-report-success
+        uses: slackapi/slack-github-action@37ebaef184d7626c5f204ab8d3baff4262dd30f0 # v1.27.0
+        with:
+          # Slack channel id, channel name, or user id to post message.
+          # See also: https://api.slack.com/methods/chat.postMessage#channels
+          # You can pass in multiple channels to post to by providing a comma-delimited list of channel IDs.
+          channel-id: 'e2e-ci-results'
+          slack-message: "*e2e-nvidia-l40s-x4* job in *${{ github.repository }}* running on branch `${{ steps.check_pr.outputs.pr_or_branch }}` completed *successfully* :meow_party: | ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}"
+        env:
+          SLACK_BOT_TOKEN: ${{ secrets.SON_OF_JEEVES_TOKEN }}
+
+  stop-large-ec2-runner:
+    needs:
+      - start-large-ec2-runner
+      - e2e-large-test
+    runs-on: ubuntu-latest
+    if: ${{ always() }}
+    steps:
+      - name: Configure AWS credentials
+        uses: aws-actions/configure-aws-credentials@e3dd6a429d7300a6a4c196c26e071d42e0343502 # v4.0.2
+        with:
+          aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }}
+          aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
+          aws-region: ${{ secrets.AWS_REGION }}
+
+      - name: Stop EC2 runner
+        uses: machulav/ec2-github-runner@fcfb31a5760dad1314a64a0e172b78ec6fc8a17e # v2.3.6
+        with:
+          mode: stop
+          github-token: ${{ secrets.GH_PERSONAL_ACCESS_TOKEN }}
+          label: ${{ needs.start-large-ec2-runner.outputs.label }}
+          ec2-instance-id: ${{ needs.start-large-ec2-runner.outputs.ec2-instance-id }}

From 51893905bcc43769a2c412835473bcbad3d17dd1 Mon Sep 17 00:00:00 2001
From: Nathan Weinberg <nweinber@redhat.com>
Date: Mon, 21 Oct 2024 17:22:43 -0400
Subject: [PATCH 4/6] chore: minor CI tweaks

Signed-off-by: Nathan Weinberg <nweinber@redhat.com>
---
 .github/mergify.yml                      | 4 ++--
 .github/workflows/e2e-nvidia-a10g-x1.yml | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/.github/mergify.yml b/.github/mergify.yml
index edd79057..404565f2 100644
--- a/.github/mergify.yml
+++ b/.github/mergify.yml
@@ -25,11 +25,11 @@ pull_request_rules:
         - -files~=^\.github/(actions|workflows)/.*\.ya?ml$
         - -files~=^\.github/workflows/actionlint\.
 
-    # e2e workflow
+    # e2e medium workflow
     - or:
       - and:
         # note this should match the triggering criteria in 'e2e-nvidia-a10g-x1.yml'
-        - check-success=e2e-medium-workflow-complete
+        - check-success~=e2e-medium-workflow-complete
         - or:
           - files~=\.py$
           - files=pyproject.toml
diff --git a/.github/workflows/e2e-nvidia-a10g-x1.yml b/.github/workflows/e2e-nvidia-a10g-x1.yml
index 3485ad73..64f1c5d3 100644
--- a/.github/workflows/e2e-nvidia-a10g-x1.yml
+++ b/.github/workflows/e2e-nvidia-a10g-x1.yml
@@ -142,7 +142,6 @@ jobs:
           ./scripts/e2e-ci.sh -m
 
   stop-medium-ec2-runner:
-    name: Stop external EC2 runner
     needs:
       - start-medium-ec2-runner
       - e2e-medium-test
@@ -155,6 +154,7 @@ jobs:
           aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }}
           aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
           aws-region: ${{ secrets.AWS_REGION }}
+
       - name: Stop EC2 runner
         uses: machulav/ec2-github-runner@fcfb31a5760dad1314a64a0e172b78ec6fc8a17e # v2.3.6
         with:

From 3a7b7db5e116d049e263a57c7488dd061dd7a81c Mon Sep 17 00:00:00 2001
From: Nathan Weinberg <nweinber@redhat.com>
Date: Wed, 23 Oct 2024 14:57:16 -0400
Subject: [PATCH 5/6] fix: add working directory config to steps in large E2E
 CI job

Signed-off-by: Nathan Weinberg <nweinber@redhat.com>
---
 .github/workflows/e2e-nvidia-l40s-x4.yml | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/.github/workflows/e2e-nvidia-l40s-x4.yml b/.github/workflows/e2e-nvidia-l40s-x4.yml
index e7ff227a..e8b578f6 100644
--- a/.github/workflows/e2e-nvidia-l40s-x4.yml
+++ b/.github/workflows/e2e-nvidia-l40s-x4.yml
@@ -107,6 +107,7 @@ jobs:
           gh --version
 
       - name: set default repo
+        working-directory: ./training
         run: |
           gh repo set-default ${{ github.server_url }}/${{ github.repository }}
         env:
@@ -114,6 +115,7 @@ jobs:
 
       - name: Add comment to PR
         if: steps.check_pr.outputs.is_pr == 'true'
+        working-directory: ./training
         run: |
           gh pr comment "${{ steps.check_pr.outputs.pr_or_branch }}" -b "${{ github.workflow }} workflow launched on this PR: [View run](${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }})"
         env:
@@ -121,6 +123,7 @@ jobs:
 
       - name: Fetch and checkout PR
         if: steps.check_pr.outputs.is_pr == 'true'
+        working-directory: ./training
         run: |
           gh pr checkout ${{ steps.check_pr.outputs.pr_or_branch }}
         env:
@@ -128,6 +131,7 @@ jobs:
 
       - name: Checkout branch
         if: steps.check_pr.outputs.is_pr == 'false'
+        working-directory: ./training
         run: |
           git checkout ${{ steps.check_pr.outputs.pr_or_branch }}
 

From 890ebd91227a62886204d0efe461a5d24d157c25 Mon Sep 17 00:00:00 2001
From: Nathan Weinberg <nweinber@redhat.com>
Date: Wed, 23 Oct 2024 15:46:55 -0400
Subject: [PATCH 6/6] fix: add remaining missing working directory configs

Signed-off-by: Nathan Weinberg <nweinber@redhat.com>
---
 .github/workflows/e2e-nvidia-l40s-x4.yml | 2 ++
 README.md                                | 4 +++-
 2 files changed, 5 insertions(+), 1 deletion(-)

diff --git a/.github/workflows/e2e-nvidia-l40s-x4.yml b/.github/workflows/e2e-nvidia-l40s-x4.yml
index e8b578f6..b75c8cc5 100644
--- a/.github/workflows/e2e-nvidia-l40s-x4.yml
+++ b/.github/workflows/e2e-nvidia-l40s-x4.yml
@@ -174,6 +174,7 @@ jobs:
 
       - name: Add comment to PR if the workflow failed
         if: failure() && steps.check_pr.outputs.is_pr == 'true'
+        working-directory: ./training
         run: |
           gh pr comment "${{ steps.check_pr.outputs.pr_or_branch }}" -b "e2e workflow failed on this PR: [View run](${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}), please investigate."
         env:
@@ -181,6 +182,7 @@ jobs:
 
       - name: Add comment to PR if the workflow succeeded
         if: success() && steps.check_pr.outputs.is_pr == 'true'
+        working-directory: ./training
         run: |
           gh pr comment "${{ steps.check_pr.outputs.pr_or_branch }}" -b "e2e workflow succeeded on this PR: [View run](${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}), congrats!"
         env:
diff --git a/README.md b/README.md
index e9cf199d..be396a64 100644
--- a/README.md
+++ b/README.md
@@ -1,11 +1,13 @@
 # InstructLab Training Library
 
 ![Lint](https://github.com/instructlab/training/actions/workflows/lint.yml/badge.svg?branch=main)
-![`e2e-nvidia-a10g-x1.yaml` on `main`](https://github.com/instructlab/training/actions/workflows/e2e-nvidia-a10g-x1.yml/badge.svg?branch=main)
 ![Build](https://github.com/instructlab/training/actions/workflows/pypi.yaml/badge.svg?branch=main)
 ![Release](https://img.shields.io/github/v/release/instructlab/training)
 ![License](https://img.shields.io/github/license/instructlab/training)
 
+![`e2e-nvidia-a10g-x1.yml` on `main`](https://github.com/instructlab/training/actions/workflows/e2e-nvidia-a10g-x1.yml/badge.svg?branch=main)
+![`e2e-nvidia-l40s-x4.yml` on `main`](https://github.com/instructlab/training/actions/workflows/e2e-nvidia-l40s-x4.yml/badge.svg?branch=main)
+
 - [Installing](#installing-the-library)
   - [Additional Nvidia packages](#additional-nvidia-packages)
 - [Using the library](#using-the-library)