From 9c899dcc0d85c95e267468bd30d006a0bb4f423c Mon Sep 17 00:00:00 2001 From: Oleg S <97077423+RobotSail@users.noreply.github.com> Date: Thu, 19 Sep 2024 11:44:56 -0400 Subject: [PATCH 1/6] chore: add exit code & tox fix Currently, the training library does not exit when an error is encountered within the training loop (invoked through torchrun). This commit updates that functionality so we correctly return an exit code of 1 on child failure. Additionally, this commit also adds the `make fix` command which automatically fixes all trivial issues picked up on by ruff Signed-off-by: Oleg S <97077423+RobotSail@users.noreply.github.com> --- src/instructlab/training/main_ds.py | 15 ++++++++++----- tox.ini | 1 - 2 files changed, 10 insertions(+), 6 deletions(-) diff --git a/src/instructlab/training/main_ds.py b/src/instructlab/training/main_ds.py index c5cdb2ba..d1ae0e01 100644 --- a/src/instructlab/training/main_ds.py +++ b/src/instructlab/training/main_ds.py @@ -761,6 +761,7 @@ def run_training(torch_args: TorchrunArgs, train_args: TrainingArgs) -> None: print(f"\033[92mRunning training command as subprocess: {' '.join(command)}\033[0m") process = None interrupt: KeyboardInterrupt | Exception | None = None + failure = False try: process = StreamablePopen( f"{train_args.ckpt_output_dir}/full_logs_global{torch_args.node_rank}.log", @@ -771,19 +772,20 @@ def run_training(torch_args: TorchrunArgs, train_args: TrainingArgs) -> None: print("Training subprocess interrupted by user.") interrupt = e except Exception as e: - print(f"An error occurred: {str(e)}") + print("Unexpected exception received during distributed training") interrupt = e finally: if "process" not in locals() or process is None: return - if process.poll() == 0: - print("\033[92mTraining subprocess exited successfully! 🎉\033[0m") + + failure = process.poll() != 0 + if not failure: + print("\033[92mOperation completed successfully! 🎉\033[0m") else: print( "\033[91mTraining subprocess has not exited yet. Sending SIGTERM.\033[0m" ) - print("Sending interrupt signal to Training subprocess.") process.terminate() try: print("Waiting for process to exit, 60s...") @@ -795,8 +797,11 @@ def run_training(torch_args: TorchrunArgs, train_args: TrainingArgs) -> None: process.kill() if interrupt: - print(f"Error caught from training subprocess.: {interrupt}") raise interrupt + if failure: + raise RuntimeError( + "Suffered a failure during distributed training. Please see the training logs for more context." + ) if __name__ == "__main__": diff --git a/tox.ini b/tox.ini index b6b625cd..5756b665 100644 --- a/tox.ini +++ b/tox.ini @@ -66,7 +66,6 @@ commands = sh -c 'git diff --exit-code || (echo "pyproject.toml formatting is incorrect. Please run \"make toml-fmt\" and commit the changes." && exit 1)' allowlist_externals = make, sh - [testenv:spellcheck] description = spell check (needs 'aspell' command) basepython = {[testenv:py3]basepython} From af89deded32582e76663eff4fac3fbccfa14ebe6 Mon Sep 17 00:00:00 2001 From: Nathan Weinberg Date: Mon, 21 Oct 2024 13:25:39 -0400 Subject: [PATCH 2/6] ci: grant HF_TOKEN access to the medium-size E2E CI job Signed-off-by: Nathan Weinberg --- .github/workflows/e2e-nvidia-a10g-x1.yml | 3 +++ 1 file changed, 3 insertions(+) diff --git a/.github/workflows/e2e-nvidia-a10g-x1.yml b/.github/workflows/e2e-nvidia-a10g-x1.yml index 3485ad73..ddc4cb81 100644 --- a/.github/workflows/e2e-nvidia-a10g-x1.yml +++ b/.github/workflows/e2e-nvidia-a10g-x1.yml @@ -137,6 +137,8 @@ jobs: - name: Run e2e test working-directory: ./instructlab + env: + HF_TOKEN: ${{ secrets.HF_TOKEN }} run: | . venv/bin/activate ./scripts/e2e-ci.sh -m @@ -155,6 +157,7 @@ jobs: aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }} aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }} aws-region: ${{ secrets.AWS_REGION }} + - name: Stop EC2 runner uses: machulav/ec2-github-runner@fcfb31a5760dad1314a64a0e172b78ec6fc8a17e # v2.3.6 with: From e49809b3bd12d21610b3f9ebd325e4d6004b9c84 Mon Sep 17 00:00:00 2001 From: Nathan Weinberg Date: Mon, 21 Oct 2024 13:24:43 -0400 Subject: [PATCH 3/6] ci: add large-size E2E CI job this commit adds a new workflow to the Training repo it will run a nightly cron job to test the current 'main' branch of Training against the current 'main' branch of the CLI (instructlab) Signed-off-by: Nathan Weinberg --- .github/workflows/e2e-nvidia-l40s-x4.yml | 231 +++++++++++++++++++++++ 1 file changed, 231 insertions(+) create mode 100644 .github/workflows/e2e-nvidia-l40s-x4.yml diff --git a/.github/workflows/e2e-nvidia-l40s-x4.yml b/.github/workflows/e2e-nvidia-l40s-x4.yml new file mode 100644 index 00000000..e7ff227a --- /dev/null +++ b/.github/workflows/e2e-nvidia-l40s-x4.yml @@ -0,0 +1,231 @@ +# SPDX-License-Identifier: Apache-2.0 + +name: E2E (NVIDIA L40S x4) + +on: + schedule: + - cron: '0 16 * * *' # Runs at 4PM UTC every day + workflow_dispatch: + inputs: + pr_or_branch: + description: 'pull request number or branch name' + required: true + default: 'main' + +jobs: + start-large-ec2-runner: + runs-on: ubuntu-latest + outputs: + label: ${{ steps.start-ec2-runner.outputs.label }} + ec2-instance-id: ${{ steps.start-ec2-runner.outputs.ec2-instance-id }} + steps: + - name: Configure AWS credentials + uses: aws-actions/configure-aws-credentials@e3dd6a429d7300a6a4c196c26e071d42e0343502 # v4.0.2 + with: + aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }} + aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }} + aws-region: ${{ secrets.AWS_REGION }} + + - name: Start EC2 runner + id: start-ec2-runner + uses: machulav/ec2-github-runner@fcfb31a5760dad1314a64a0e172b78ec6fc8a17e # v2.3.6 + with: + mode: start + github-token: ${{ secrets.GH_PERSONAL_ACCESS_TOKEN }} + ec2-image-id: ami-01a89eee1adde309c + ec2-instance-type: g6e.12xlarge + subnet-id: subnet-024298cefa3bedd61 + security-group-id: sg-06300447c4a5fbef3 + iam-role-name: instructlab-ci-runner + aws-resource-tags: > + [ + {"Key": "Name", "Value": "instructlab-ci-github-large-runner"}, + {"Key": "GitHubRepository", "Value": "${{ github.repository }}"}, + {"Key": "GitHubRef", "Value": "${{ github.ref }}"}, + {"Key": "GitHubPR", "Value": "${{ github.event.number }}"} + ] + + e2e-large-test: + needs: + - start-large-ec2-runner + runs-on: ${{ needs.start-large-ec2-runner.outputs.label }} + + permissions: + pull-requests: write + + steps: + - name: Install Packages + run: | + cat /etc/os-release + sudo dnf install -y gcc gcc-c++ make git python3.11 python3.11-devel + + - name: Checkout instructlab/instructlab + uses: actions/checkout@eef61447b9ff4aafe5dcd4e0bbf5d482be7e7871 # v4.2.1 + with: + repository: "instructlab/instructlab" + path: "instructlab" + # https://github.com/actions/checkout/issues/249 + fetch-depth: 0 + + - name: Checkout instructlab/training + uses: actions/checkout@eef61447b9ff4aafe5dcd4e0bbf5d482be7e7871 # v4.2.1 + with: + repository: "instructlab/training" + path: "training" + # https://github.com/actions/checkout/issues/249 + fetch-depth: 0 + + - name: Determine if pr_or_branch is a PR number + id: check_pr + run: | + PR_OR_BRANCH=${{ github.event.inputs.pr_or_branch || 'main' }} # Default to 'main' if not set + if [[ "$PR_OR_BRANCH" =~ ^[0-9]+$ ]]; then + echo "is_pr=true" >> "$GITHUB_OUTPUT" + else + echo "is_pr=false" >> "$GITHUB_OUTPUT" + fi + echo "pr_or_branch=$PR_OR_BRANCH" >> "$GITHUB_OUTPUT" + + - name: Check if gh cli is installed + id: gh_cli + run: | + if command -v gh &> /dev/null ; then + echo "gh_cli_installed=true" >> "$GITHUB_OUTPUT" + else + echo "gh_cli_installed=false" >> "$GITHUB_OUTPUT" + fi + + - name: Install gh CLI + if: steps.gh_cli.outputs.gh_cli_installed == 'false' + run: | + sudo dnf install 'dnf-command(config-manager)' -y + sudo dnf config-manager --add-repo https://cli.github.com/packages/rpm/gh-cli.repo + sudo dnf install gh --repo gh-cli -y + + - name: test gh CLI + run: | + gh --version + + - name: set default repo + run: | + gh repo set-default ${{ github.server_url }}/${{ github.repository }} + env: + GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} + + - name: Add comment to PR + if: steps.check_pr.outputs.is_pr == 'true' + run: | + gh pr comment "${{ steps.check_pr.outputs.pr_or_branch }}" -b "${{ github.workflow }} workflow launched on this PR: [View run](${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }})" + env: + GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} + + - name: Fetch and checkout PR + if: steps.check_pr.outputs.is_pr == 'true' + run: | + gh pr checkout ${{ steps.check_pr.outputs.pr_or_branch }} + env: + GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} + + - name: Checkout branch + if: steps.check_pr.outputs.is_pr == 'false' + run: | + git checkout ${{ steps.check_pr.outputs.pr_or_branch }} + + - name: Install ilab + working-directory: ./instructlab + run: | + export CUDA_HOME="/usr/local/cuda" + export LD_LIBRARY_PATH="$LD_LIBRARY_PATH:/usr/local/cuda/lib64:/usr/local/cuda/extras/CUPTI/lib64" + export PATH="$PATH:$CUDA_HOME/bin" + python3.11 -m venv --upgrade-deps venv + . venv/bin/activate + nvidia-smi + python3.11 -m pip cache remove llama_cpp_python + + CMAKE_ARGS="-DLLAMA_CUDA=on" python3.11 -m pip install . + + # https://github.com/instructlab/instructlab/issues/1821 + # install with Torch and build dependencies installed + python3.11 -m pip install packaging wheel setuptools-scm + python3.11 -m pip install .[cuda] -r requirements-vllm-cuda.txt + + - name: Update instructlab-training library + working-directory: ./training + run: | + . ../instructlab/venv/bin/activate + pip install . + pip install .[cuda] + + - name: Check disk + run: | + df -h + + - name: Run e2e test + working-directory: ./instructlab + env: + HF_TOKEN: ${{ secrets.HF_TOKEN }} + run: | + . venv/bin/activate + ./scripts/e2e-ci.sh -l + + - name: Add comment to PR if the workflow failed + if: failure() && steps.check_pr.outputs.is_pr == 'true' + run: | + gh pr comment "${{ steps.check_pr.outputs.pr_or_branch }}" -b "e2e workflow failed on this PR: [View run](${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}), please investigate." + env: + GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} + + - name: Add comment to PR if the workflow succeeded + if: success() && steps.check_pr.outputs.is_pr == 'true' + run: | + gh pr comment "${{ steps.check_pr.outputs.pr_or_branch }}" -b "e2e workflow succeeded on this PR: [View run](${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}), congrats!" + env: + GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} + + - name: Post job results to Slack if the workflow failed + if: failure() && steps.check_pr.outputs.is_pr == 'false' + id: slack-report-failure + uses: slackapi/slack-github-action@37ebaef184d7626c5f204ab8d3baff4262dd30f0 # v1.27.0 + with: + # Slack channel id, channel name, or user id to post message. + # See also: https://api.slack.com/methods/chat.postMessage#channels + # You can pass in multiple channels to post to by providing a comma-delimited list of channel IDs. + channel-id: 'e2e-ci-results' + slack-message: "*e2e-nvidia-l40s-x4* job in *${{ github.repository }}* running on branch `${{ steps.check_pr.outputs.pr_or_branch }}` completed *with failures* :meow_sad-rain: | ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}" + env: + SLACK_BOT_TOKEN: ${{ secrets.SON_OF_JEEVES_TOKEN }} + + - name: Post job results to Slack if the workflow succeeded + if: success() && steps.check_pr.outputs.is_pr == 'false' + id: slack-report-success + uses: slackapi/slack-github-action@37ebaef184d7626c5f204ab8d3baff4262dd30f0 # v1.27.0 + with: + # Slack channel id, channel name, or user id to post message. + # See also: https://api.slack.com/methods/chat.postMessage#channels + # You can pass in multiple channels to post to by providing a comma-delimited list of channel IDs. + channel-id: 'e2e-ci-results' + slack-message: "*e2e-nvidia-l40s-x4* job in *${{ github.repository }}* running on branch `${{ steps.check_pr.outputs.pr_or_branch }}` completed *successfully* :meow_party: | ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}" + env: + SLACK_BOT_TOKEN: ${{ secrets.SON_OF_JEEVES_TOKEN }} + + stop-large-ec2-runner: + needs: + - start-large-ec2-runner + - e2e-large-test + runs-on: ubuntu-latest + if: ${{ always() }} + steps: + - name: Configure AWS credentials + uses: aws-actions/configure-aws-credentials@e3dd6a429d7300a6a4c196c26e071d42e0343502 # v4.0.2 + with: + aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }} + aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }} + aws-region: ${{ secrets.AWS_REGION }} + + - name: Stop EC2 runner + uses: machulav/ec2-github-runner@fcfb31a5760dad1314a64a0e172b78ec6fc8a17e # v2.3.6 + with: + mode: stop + github-token: ${{ secrets.GH_PERSONAL_ACCESS_TOKEN }} + label: ${{ needs.start-large-ec2-runner.outputs.label }} + ec2-instance-id: ${{ needs.start-large-ec2-runner.outputs.ec2-instance-id }} From 51893905bcc43769a2c412835473bcbad3d17dd1 Mon Sep 17 00:00:00 2001 From: Nathan Weinberg Date: Mon, 21 Oct 2024 17:22:43 -0400 Subject: [PATCH 4/6] chore: minor CI tweaks Signed-off-by: Nathan Weinberg --- .github/mergify.yml | 4 ++-- .github/workflows/e2e-nvidia-a10g-x1.yml | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/mergify.yml b/.github/mergify.yml index edd79057..404565f2 100644 --- a/.github/mergify.yml +++ b/.github/mergify.yml @@ -25,11 +25,11 @@ pull_request_rules: - -files~=^\.github/(actions|workflows)/.*\.ya?ml$ - -files~=^\.github/workflows/actionlint\. - # e2e workflow + # e2e medium workflow - or: - and: # note this should match the triggering criteria in 'e2e-nvidia-a10g-x1.yml' - - check-success=e2e-medium-workflow-complete + - check-success~=e2e-medium-workflow-complete - or: - files~=\.py$ - files=pyproject.toml diff --git a/.github/workflows/e2e-nvidia-a10g-x1.yml b/.github/workflows/e2e-nvidia-a10g-x1.yml index 3485ad73..64f1c5d3 100644 --- a/.github/workflows/e2e-nvidia-a10g-x1.yml +++ b/.github/workflows/e2e-nvidia-a10g-x1.yml @@ -142,7 +142,6 @@ jobs: ./scripts/e2e-ci.sh -m stop-medium-ec2-runner: - name: Stop external EC2 runner needs: - start-medium-ec2-runner - e2e-medium-test @@ -155,6 +154,7 @@ jobs: aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }} aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }} aws-region: ${{ secrets.AWS_REGION }} + - name: Stop EC2 runner uses: machulav/ec2-github-runner@fcfb31a5760dad1314a64a0e172b78ec6fc8a17e # v2.3.6 with: From 3a7b7db5e116d049e263a57c7488dd061dd7a81c Mon Sep 17 00:00:00 2001 From: Nathan Weinberg Date: Wed, 23 Oct 2024 14:57:16 -0400 Subject: [PATCH 5/6] fix: add working directory config to steps in large E2E CI job Signed-off-by: Nathan Weinberg --- .github/workflows/e2e-nvidia-l40s-x4.yml | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/.github/workflows/e2e-nvidia-l40s-x4.yml b/.github/workflows/e2e-nvidia-l40s-x4.yml index e7ff227a..e8b578f6 100644 --- a/.github/workflows/e2e-nvidia-l40s-x4.yml +++ b/.github/workflows/e2e-nvidia-l40s-x4.yml @@ -107,6 +107,7 @@ jobs: gh --version - name: set default repo + working-directory: ./training run: | gh repo set-default ${{ github.server_url }}/${{ github.repository }} env: @@ -114,6 +115,7 @@ jobs: - name: Add comment to PR if: steps.check_pr.outputs.is_pr == 'true' + working-directory: ./training run: | gh pr comment "${{ steps.check_pr.outputs.pr_or_branch }}" -b "${{ github.workflow }} workflow launched on this PR: [View run](${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }})" env: @@ -121,6 +123,7 @@ jobs: - name: Fetch and checkout PR if: steps.check_pr.outputs.is_pr == 'true' + working-directory: ./training run: | gh pr checkout ${{ steps.check_pr.outputs.pr_or_branch }} env: @@ -128,6 +131,7 @@ jobs: - name: Checkout branch if: steps.check_pr.outputs.is_pr == 'false' + working-directory: ./training run: | git checkout ${{ steps.check_pr.outputs.pr_or_branch }} From 890ebd91227a62886204d0efe461a5d24d157c25 Mon Sep 17 00:00:00 2001 From: Nathan Weinberg Date: Wed, 23 Oct 2024 15:46:55 -0400 Subject: [PATCH 6/6] fix: add remaining missing working directory configs Signed-off-by: Nathan Weinberg --- .github/workflows/e2e-nvidia-l40s-x4.yml | 2 ++ README.md | 4 +++- 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/.github/workflows/e2e-nvidia-l40s-x4.yml b/.github/workflows/e2e-nvidia-l40s-x4.yml index e8b578f6..b75c8cc5 100644 --- a/.github/workflows/e2e-nvidia-l40s-x4.yml +++ b/.github/workflows/e2e-nvidia-l40s-x4.yml @@ -174,6 +174,7 @@ jobs: - name: Add comment to PR if the workflow failed if: failure() && steps.check_pr.outputs.is_pr == 'true' + working-directory: ./training run: | gh pr comment "${{ steps.check_pr.outputs.pr_or_branch }}" -b "e2e workflow failed on this PR: [View run](${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}), please investigate." env: @@ -181,6 +182,7 @@ jobs: - name: Add comment to PR if the workflow succeeded if: success() && steps.check_pr.outputs.is_pr == 'true' + working-directory: ./training run: | gh pr comment "${{ steps.check_pr.outputs.pr_or_branch }}" -b "e2e workflow succeeded on this PR: [View run](${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}), congrats!" env: diff --git a/README.md b/README.md index e9cf199d..be396a64 100644 --- a/README.md +++ b/README.md @@ -1,11 +1,13 @@ # InstructLab Training Library ![Lint](https://github.com/instructlab/training/actions/workflows/lint.yml/badge.svg?branch=main) -![`e2e-nvidia-a10g-x1.yaml` on `main`](https://github.com/instructlab/training/actions/workflows/e2e-nvidia-a10g-x1.yml/badge.svg?branch=main) ![Build](https://github.com/instructlab/training/actions/workflows/pypi.yaml/badge.svg?branch=main) ![Release](https://img.shields.io/github/v/release/instructlab/training) ![License](https://img.shields.io/github/license/instructlab/training) +![`e2e-nvidia-a10g-x1.yml` on `main`](https://github.com/instructlab/training/actions/workflows/e2e-nvidia-a10g-x1.yml/badge.svg?branch=main) +![`e2e-nvidia-l40s-x4.yml` on `main`](https://github.com/instructlab/training/actions/workflows/e2e-nvidia-l40s-x4.yml/badge.svg?branch=main) + - [Installing](#installing-the-library) - [Additional Nvidia packages](#additional-nvidia-packages) - [Using the library](#using-the-library)