Skip to content

Commit

Permalink
fc fancy
Browse files Browse the repository at this point in the history
Signed-off-by: Praateek <[email protected]>
  • Loading branch information
praateekmahajan committed Dec 17, 2024
1 parent 9df5d7b commit f3ce8d5
Show file tree
Hide file tree
Showing 2 changed files with 66 additions and 59 deletions.
104 changes: 50 additions & 54 deletions .github/workflows/gpuci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -2,88 +2,84 @@ name: "GPU CI/CD"

on:
push:
branches:
- main
branches: [main]
pull_request:
branches:
# We can run gpuCI on any PR targeting these branches
- 'main'
- '[rv][0-9].[0-9].[0-9]'
- '[rv][0-9].[0-9].[0-9]rc[0-9]'
# PR has to be labeled with "gpuCI" label
# If new commits are added, the "gpuCI" label has to be removed and re-added to rerun gpuCI
types: [ labeled ]
types: [labeled]

concurrency:
group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}
cancel-in-progress: true

# Reusable job templates
jobs:
# First, we build and push a NeMo-Curator container
build-container:
# "build-container" job is run if the "gpuci" label is added to the PR, or when the workflow runs on the main branch (refs/heads/main)
if: ${{ github.event.label.name == 'gpuci' || github.ref == 'refs/heads/main' }}
strategy:
matrix:
include:
- type: stable
image-suffix: ""
- type: nightly
image-suffix: "_nightly"
uses: NVIDIA/NeMo-FW-CI-templates/.github/workflows/[email protected]
with:
image-name: nemo_curator_container
image-name: nemo_curator_container${{ matrix.image-suffix }}
dockerfile: Dockerfile
image-label: nemo-curator
image-label: nemo-curator${{ matrix.image-suffix }}
build-args: |
IMAGE_LABEL=nemo-curator
IMAGE_LABEL=nemo-curator${{ matrix.image-suffix }}
REPO_URL=https://github.com/${{ github.repository }}.git
CURATOR_COMMIT=${{ github.sha }}
BUILD_TYPE=${{ matrix.type }}
prune-filter-timerange: 24h

# Then, we run our PyTests in the container we just built
run-gpu-tests:
needs: build-container
# This is the tag on our Azure runner found in Actions -> Runners -> Self-hosted runners
# It has 2 A100 GPUs
runs-on: self-hosted-azure
# "run-gpu-tests" job is run if the "gpuci" label is added to the PR, or when the workflow runs on the main branch (refs/heads/main)
if: ${{ github.event.label.name == 'gpuci' || github.ref == 'refs/heads/main' }}
strategy:
matrix:
include:
- type: stable
image-suffix: ""
- type: nightly
image-suffix: "_nightly"

steps:
# If something went wrong during the last cleanup, this step ensures any existing container is removed
- name: Remove existing container if it exists
run: |
if [ "$(docker ps -aq -f name=nemo-curator-container)" ]; then
docker rm -f nemo-curator-container
fi
env:
CONTAINER_NAME: nemo-curator-container${{ matrix.image-suffix }}
IMAGE_NAME: nemoci.azurecr.io/nemo_curator_container${{ matrix.image-suffix }}:${{ github.run_id }}

# This runs the container which was pushed by build-container, which we call "nemo-curator-container"
# `--gpus all` ensures that all of the GPUs from our self-hosted-azure runner are available in the container
# We use "github.run_id" to identify the PR with the commits we want to run the PyTests with
# `bash -c "sleep infinity"` keeps the container running indefinitely without exiting
- name: Run Docker container
run: |
docker run --gpus all --name nemo-curator-container -d nemoci.azurecr.io/nemo_curator_container:${{ github.run_id }} bash -c "sleep infinity"
steps:
- name: Cleanup existing container
run: docker rm -f ${{ env.CONTAINER_NAME }} || true

# Expect `whoami` to be "azureuser"
# Expect `nvidia-smi` to show our 2 A100 GPUs
- name: Check GPUs
run: |
whoami
docker exec nemo-curator-container nvidia-smi
- name: Run container
run: |
docker run --gpus all \
--name ${{ env.CONTAINER_NAME }} \
-d ${{ env.IMAGE_NAME }} \
bash -c "sleep infinity"
# In the virtual environment (called "curator") we created in the container,
# list all of our packages. Useful for debugging
- name: Verify installations
run: |
docker exec nemo-curator-container pip list
- name: Verify environment
run: |
echo "Checking system user:"
docker exec ${{ env.CONTAINER_NAME }} whoami
echo "Checking GPU availability:"
docker exec ${{ env.CONTAINER_NAME }} nvidia-smi
echo "Checking installed packages:"
docker exec ${{ env.CONTAINER_NAME }} pip list
# In the virtual environment (called "curator") we created in the container,
# run our PyTests marked with `@pytest.mark.gpu`
# We specify the `rootdir` to help locate the "pyproject.toml" file (which is in the root directory of the repository),
# and then the directory where the PyTests are located
- name: Run PyTests with GPU mark
run: |
docker exec nemo-curator-container pytest -m gpu --rootdir /opt/NeMo-Curator /opt/NeMo-Curator/tests
- name: Run GPU tests
run: |
docker exec ${{ env.CONTAINER_NAME }} \
pytest -m gpu \
--rootdir /opt/NeMo-Curator \
/opt/NeMo-Curator/tests
# After running `docker stop`, the container remains in an exited state
# It is still present on our system and could be restarted with `docker start`
# Thus, we use `docker rm` to permanently remove it from the system
- name: Cleanup
if: always()
run: |
docker stop nemo-curator-container && docker rm nemo-curator-container
- name: Cleanup
if: always()
run: docker rm -f ${{ env.CONTAINER_NAME }} || true
21 changes: 16 additions & 5 deletions Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ ARG PYTHON_VER=3.10
ARG IMAGE_LABEL
ARG REPO_URL
ARG CURATOR_COMMIT
ARG BUILD_TYPE=stable

FROM rapidsai/ci-conda:cuda${CUDA_VER}-${LINUX_VER}-py${PYTHON_VER} as curator-update
# Needed to navigate to and pull the forked repository's changes
Expand All @@ -23,14 +24,16 @@ RUN bash -exu <<EOF
git checkout $CURATOR_COMMIT
EOF


FROM rapidsai/ci-conda:cuda${CUDA_VER}-${LINUX_VER}-py${PYTHON_VER}
LABEL "nemo.library"=${IMAGE_LABEL}
WORKDIR /opt

# Re-declare ARGs after new FROM to make them available in this stage
ARG CUDA_VER
ARG BUILD_TYPE

# Install the minimal libcu* libraries needed by NeMo Curator
ENV _CUDA_VER=${CUDA_VER}
RUN conda create -y --name curator -c nvidia/label/cuda-${_CUDA_VER} -c conda-forge \
RUN conda create -y --name curator -c nvidia/label/cuda-${CUDA_VER} -c conda-forge \
python=3.10 \
cuda-cudart \
libcufft \
Expand All @@ -48,15 +51,23 @@ RUN \
--mount=type=bind,source=/opt/NeMo-Curator/pyproject.toml,target=/opt/NeMo-Curator/pyproject.toml,from=curator-update \
cd /opt/NeMo-Curator && \
source activate curator && \
pip install ".[all]"
if [ "$BUILD_TYPE" = "nightly" ]; then \
pip install ".[all_nightly]"; \
else \
pip install ".[all]"; \
fi

COPY --from=curator-update /opt/NeMo-Curator/ /opt/NeMo-Curator/

# Clone the user's repository, find the relevant commit, and install everything we need
RUN bash -exu <<EOF
source activate curator
cd /opt/NeMo-Curator/
pip install --extra-index-url https://pypi.nvidia.com ".[all]"
if [ "$BUILD_TYPE" = "nightly" ]; then \
pip install --extra-index-url=https://pypi.anaconda.org/rapidsai-wheels-nightly/simple ".[all_nightly]"; \
else \
pip install --extra-index-url https://pypi.nvidia.com ".[all]"; \
fi
EOF

ENV PATH /opt/conda/envs/curator/bin:$PATH

0 comments on commit f3ce8d5

Please sign in to comment.