# try NCCL_P2P_DIRECT_DISABLE #109
# Workflow file for this run
# This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
# Learn more about bidirectional Unicode characters
# Workflow header: display name, trigger, and run-deduplication policy.
name: Rust GPU Tests

# Run on every push (no branch or path filters).
on:
  push:

# Runs are grouped by workflow name plus the first non-empty of: PR head
# label, head ref, or the pushed ref. With cancel-in-progress enabled, a new
# run in a group cancels the run already in flight for that group.
concurrency:
  group: "${{ github.workflow }} @ ${{ github.event.pull_request.head.label || github.head_ref || github.ref }}"
  cancel-in-progress: true
jobs:
  # End-to-end test job. Scheduled onto whichever runner carries the `gpu`
  # label; the steps below assume an apt-based image with passwordless sudo.
  e2e:
    runs-on: gpu
    steps:
      - name: Checkout code
        uses: actions/checkout@v4

      # nvidia-smi exits non-zero when it cannot reach a driver/device, so a
      # runner without a usable GPU fails here rather than deep inside the tests.
      - name: Validate presence of GPU devices
        run: nvidia-smi

      # Diagnostic only: `df -h` prints every mounted filesystem.
      # NOTE(review): presumably the /dev/shm row is the one of interest.
      - name: Check shared memory size
        run: df -h

      - name: Install OpenSSL && pkg-config
        run: sudo apt-get update && sudo apt-get install -y pkg-config libssl-dev

      # This step previously carried
      #   if: steps.cache-cuda-nccl.outputs.cache-hit != 'true'
      # but no step with id `cache-cuda-nccl` exists in this workflow, so the
      # condition was always true. The dead guard is removed; the install runs
      # unconditionally, exactly as it effectively did before.
      # NOTE(review): the keyring URL is pinned to the ubuntu2004 repo —
      # confirm that matches the runner image.
      - name: Install CUDA and NCCL dependencies
        env:
          DEBIAN_FRONTEND: noninteractive
        run: |
          wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64/cuda-keyring_1.1-1_all.deb
          sudo dpkg -i cuda-keyring_1.1-1_all.deb
          sudo apt-get update
          sudo apt-get install -y cuda-toolkit-12-2 libnccl2 libnccl-dev

      # Diagnostic only: log where the NVRTC and CUDA driver libraries ended up.
      # (An identical second copy of this step used to run after the cache
      # step; that step restores nothing under /usr, so the copy was dropped.)
      - name: Find libs
        run: find /usr -name "libnvrtc*" && find /usr -name libcuda.so

      # Cache cargo's registry/git checkouts and the build directory. The key
      # is derived from the OS and the hash of every Cargo.lock; on a miss the
      # restore-keys prefix allows falling back to an older cache for the same OS.
      - name: Cache Rust build
        uses: actions/cache@v4
        id: cache-rust
        with:
          path: |
            ~/.cargo/registry
            ~/.cargo/git
            target
          key: rust-build-${{ runner.os }}-${{ hashFiles('**/Cargo.lock') }}
          restore-keys: |
            rust-build-${{ runner.os }}-

      - name: Install Rust nightly
        uses: dtolnay/rust-toolchain@master
        with:
          toolchain: nightly

      # Runs the tests whose name matches the filter `e2e`, in release mode.
      - name: E2E Tests
        run: cargo test --release e2e
        shell: bash
        env:
          # Verbose NCCL logging for debugging.
          NCCL_DEBUG: info
          # NOTE(review): per the NCCL docs, LOC disables P2P, Socket selects
          # the socket transport, and P2P_DIRECT_DISABLE=1 turns off direct
          # cross-process buffer access — confirm against the NCCL version
          # installed above.
          NCCL_P2P_LEVEL: LOC
          NCCL_NET: Socket
          # Quoted so the value is unambiguously a string.
          NCCL_P2P_DIRECT_DISABLE: "1"