Add graceful shutdown to all health check failures #1807
Workflow file for this run
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# CI workflow: runs the GPU-dependent Rust test suite for pull requests.
name: Rust GPU Tests

on:
  pull_request:

# Runs are grouped per workflow and PR head label (falling back to the head
# ref, then the ref); a newer run in the same group cancels the one in flight.
concurrency:
  group: >-
    ${{ github.workflow }} @
    ${{ github.event.pull_request.head.label || github.head_ref || github.ref }}
  cancel-in-progress: true
jobs:
  # GPU test job; scheduled on runners that carry the `gpu` label.
  e2e:
    runs-on: gpu
    # Least privilege: the steps below only read the repository contents.
    permissions:
      contents: read
    steps:
      - name: Checkout code
        uses: actions/checkout@v4

      # nvidia-smi exits non-zero when it cannot reach a driver/device, so the
      # job stops here instead of failing later inside the test run.
      - name: Validate presence of GPU devices
        run: nvidia-smi

      # Prints usage for every mounted filesystem; per the step name, the
      # shared-memory mount is the entry of interest in that listing.
      - name: Check shared memory size
        run: df -h

      # NOTE(review): only /usr/bin/gcc is re-pointed to version 11; g++-11 is
      # installed but /usr/bin/g++ is left untouched. Confirm that is intended.
      - name: Update gcc to version 11
        run: |
          sudo apt-get update
          sudo apt-get install -y build-essential manpages-dev software-properties-common
          sudo add-apt-repository -y ppa:ubuntu-toolchain-r/test
          sudo apt-get update
          sudo apt-get install -y gcc-11 g++-11
          sudo ln -sf /usr/bin/gcc-11 /usr/bin/gcc
          gcc --version

      - name: Install OpenSSL && pkg-config && protobuf-compiler
        run: sudo apt-get update && sudo apt-get install -y pkg-config libssl-dev protobuf-compiler

      # Runs unconditionally: no step in this workflow caches the CUDA/NCCL
      # installation, so there is no cache hit to gate on.
      # NOTE(review): the keyring URL is pinned to the ubuntu2004/x86_64 repo;
      # verify that matches the runner image.
      - name: Install CUDA and NCCL dependencies
        env:
          DEBIAN_FRONTEND: noninteractive
        run: |
          wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64/cuda-keyring_1.1-1_all.deb
          sudo dpkg -i cuda-keyring_1.1-1_all.deb
          sudo apt-get update
          sudo apt-get install -y cuda-toolkit-12-2 cuda-command-line-tools-12-2 libnccl2 libnccl-dev

      # Diagnostic only: logs where the NVRTC and CUDA driver libraries live.
      - name: Find libs
        run: find /usr -name "libnvrtc*" && find /usr -name libcuda.so

      # Restores/saves the cargo registry, git checkouts and build output.
      # Exact key is derived from Cargo.lock; the prefix restore key lets a
      # run fall back to the latest cache for the same OS.
      - name: Cache Rust build
        uses: actions/cache@v4
        id: cache-rust
        with:
          path: |
            ~/.cargo/registry
            ~/.cargo/git
            target
          key: rust-build-${{ runner.os }}-${{ hashFiles('**/Cargo.lock') }}
          restore-keys: |
            rust-build-${{ runner.os }}-

      - name: Install Rust nightly
        uses: dtolnay/rust-toolchain@master
        with:
          toolchain: nightly

      # Tests run serially (--test-threads=1) with a 10-minute cap on the step.
      # The NCCL_* variables configure NCCL transports for this runner; values
      # are quoted so they are unambiguously strings.
      - name: GPU Dependent Tests
        timeout-minutes: 10
        run: cargo test --release --features gpu_dependent -- --test-threads=1
        shell: bash
        env:
          NCCL_P2P_LEVEL: LOC
          NCCL_NET: Socket
          NCCL_P2P_DIRECT_DISABLE: "1"
          NCCL_SHM_DISABLE: "1"