Skip to content

Commit

Permalink
Add debug flags for HIP runs (#2206)
Browse files Browse the repository at this point in the history
Summary:
- Add debug flags for HIP runs when testing
- Fix FBGEMM_GPU-ROCm building process for ROCm 5.7
- Redo and reduce the workflow definitions for ROCm runs

Pull Request resolved: #2206

Reviewed By: sryap

Differential Revision: D52099327

Pulled By: q10

fbshipit-source-id: 98c6fae53ebfdc54ae9fa6cc9997166651e03454
  • Loading branch information
q10 authored and facebook-github-bot committed Dec 13, 2023
1 parent a029c50 commit 6c5d308
Show file tree
Hide file tree
Showing 7 changed files with 211 additions and 143 deletions.
2 changes: 2 additions & 0 deletions .github/scripts/fbgemm_gpu_build.bash
Original file line number Diff line number Diff line change
Expand Up @@ -93,6 +93,8 @@ __configure_fbgemm_gpu_build_rocm () {
build_args=(
--package_variant=rocm
-DTORCH_USE_HIP_DSA=1
# HIP_ROOT_DIR now required for HIP to be correctly detected by CMake
-DHIP_ROOT_DIR=/opt/rocm
)
}

Expand Down
4 changes: 3 additions & 1 deletion .github/scripts/fbgemm_gpu_test.bash
Original file line number Diff line number Diff line change
Expand Up @@ -71,9 +71,11 @@ run_fbgemm_gpu_tests () {

# Enable ROCM testing if specified
if [ "$fbgemm_variant" == "rocm" ]; then
echo "[TEST] Set environment variable FBGEMM_TEST_WITH_ROCM to enable ROCm tests ..."
echo "[TEST] Set environment variables for ROCm testing ..."
# shellcheck disable=SC2086
print_exec conda env config vars set ${env_prefix} FBGEMM_TEST_WITH_ROCM=1
# shellcheck disable=SC2086
print_exec conda env config vars set ${env_prefix} HIP_LAUNCH_BLOCKING=1
fi

# These are either non-tests or currently-broken tests in both FBGEMM_GPU and FBGEMM_GPU-CPU
Expand Down
1 change: 1 addition & 0 deletions .github/scripts/utils_pytorch.bash
Original file line number Diff line number Diff line change
Expand Up @@ -147,6 +147,7 @@ install_pytorch_pip () {
echo "[INSTALL] Successfully installed PyTorch through PyTorch PIP"
}


################################################################################
# PyTorch Diagnose Functions
################################################################################
Expand Down
2 changes: 2 additions & 0 deletions .github/scripts/utils_rocm.bash
Original file line number Diff line number Diff line change
Expand Up @@ -75,6 +75,8 @@ install_rocm_ubuntu () {
print_exec rm -f "${package_name}"

echo "[INFO] Check ROCM GPU info ..."
# If rocminfo is installed on a machine without GPUs, this will return error
(print_exec rocminfo) || true
print_exec rocm-smi

echo "[INSTALL] Successfully installed ROCm ${rocm_version}"
Expand Down
6 changes: 6 additions & 0 deletions .github/scripts/utils_system.bash
Original file line number Diff line number Diff line change
Expand Up @@ -118,6 +118,12 @@ print_gpu_info () {
return 1
fi
else
if which rocminfo; then
# If rocminfo is installed on a machine without GPUs, this will return error
(print_exec rocminfo) || true
else
echo "[CHECK] rocminfo not found"
fi
if which rocm-smi; then
# If rocm-smi is installed on a machine without GPUs, this will return error
(print_exec rocm-smi) || true
Expand Down
142 changes: 0 additions & 142 deletions .github/workflows/fbgemm_gpu_ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -28,148 +28,6 @@ concurrency:
cancel-in-progress: true

jobs:
build_and_test_amd:
runs-on: ${{ matrix.host-machine.instance }}
container:
image: ${{ matrix.container-image }}
options: --user root
defaults:
run:
shell: bash
env:
PRELUDE: .github/scripts/setup_env.bash
BUILD_ENV: build_binary
strategy:
fail-fast: false
matrix:
host-machine: [
{ arch: x86, instance: "linux.12xlarge" },
]
container-image: [ "ubuntu:20.04" ]
python-version: [ "3.8", "3.9", "3.10" ]
rocm-version: [ "5.7" ]

steps:
- name: Setup Build Container
run: |
apt update -y
apt install -y binutils git pciutils sudo wget
git config --global --add safe.directory '*'
- name: Checkout the Repository
uses: actions/checkout@v4
with:
submodules: true

- name: Display System Info
run: . $PRELUDE; print_system_info

- name: Display GPU Info
run: . $PRELUDE; print_gpu_info

- name: Free Disk Space
run: . $PRELUDE; free_disk_space

- name: Setup Miniconda
run: . $PRELUDE; setup_miniconda $HOME/miniconda

- name: Create Conda Environment
run: . $PRELUDE; create_conda_environment $BUILD_ENV ${{ matrix.python-version }}

- name: Install ROCm
run: . $PRELUDE; install_rocm_ubuntu $BUILD_ENV ${{ matrix.rocm-version }}

- name: Install Build Tools
run: . $PRELUDE; install_build_tools $BUILD_ENV

- name: Install PyTorch-ROCm Nightly
run: . $PRELUDE; install_pytorch_pip $BUILD_ENV nightly rocm ${{ matrix.rocm-version }}

- name: Collect PyTorch Environment Info
if: ${{ success() || failure() }}
run: . $PRELUDE; collect_pytorch_env_info $BUILD_ENV

- name: Prepare FBGEMM_GPU Build
run: . $PRELUDE; cd fbgemm_gpu; prepare_fbgemm_gpu_build $BUILD_ENV

- name: Build FBGEMM_GPU-ROCm Nightly
run: . $PRELUDE; cd fbgemm_gpu; build_fbgemm_gpu_develop $BUILD_ENV rocm gfx90a

- name: Test FBGEMM_GPU-ROCm Nightly Installation
timeout-minutes: 15
run: . $PRELUDE; cd fbgemm_gpu/test; run_fbgemm_gpu_tests $BUILD_ENV rocm


test_amd_gpu:
runs-on: ${{ matrix.host-machine.instance }}
container:
image: "rocm/dev-ubuntu-20.04:${{ matrix.rocm-version }}-complete"
options: --user root --device=/dev/kfd --device=/dev/dri --ipc=host --shm-size 16G --group-add video --cap-add=SYS_PTRACE --security-opt seccomp=unconfined
defaults:
run:
shell: bash
env:
PRELUDE: .github/scripts/setup_env.bash
BUILD_ENV: build_binary
ENFORCE_AMD_GPU: 1
strategy:
fail-fast: false
matrix:
host-machine: [
{ arch: x86, instance: "rocm" },
]
# ROCm machines are limited, so we only test against Python 3.10
python-version: [ "3.10" ]
rocm-version: [ "5.7" ]

steps:
- name: Setup Build Container
run: |
apt update -y
apt install -y git wget
git config --global --add safe.directory '*'
- name: Checkout the Repository
uses: actions/checkout@v4
with:
submodules: true

- name: Display System Info
run: . $PRELUDE; print_system_info

- name: Display GPU Info
run: . $PRELUDE; print_gpu_info

- name: Free Disk Space
run: . $PRELUDE; free_disk_space

- name: Setup Miniconda
run: . $PRELUDE; setup_miniconda $HOME/miniconda

- name: Create Conda Environment
run: . $PRELUDE; create_conda_environment $BUILD_ENV ${{ matrix.python-version }}

- name: Install Build Tools
run: . $PRELUDE; install_build_tools $BUILD_ENV

- name: Install PyTorch-ROCm Nightly
run: . $PRELUDE; install_pytorch_pip $BUILD_ENV nightly rocm ${{ matrix.rocm-version }}

- name: Collect PyTorch Environment Info
if: ${{ success() || failure() }}
run: . $PRELUDE; collect_pytorch_env_info $BUILD_ENV

- name: Prepare FBGEMM_GPU Build
run: . $PRELUDE; cd fbgemm_gpu; prepare_fbgemm_gpu_build $BUILD_ENV

- name: Build FBGEMM_GPU-ROCm Nightly
run: . $PRELUDE; cd fbgemm_gpu; build_fbgemm_gpu_develop $BUILD_ENV rocm

- name: Test FBGEMM_GPU-ROCm Nightly Installation
timeout-minutes: 15
run: . $PRELUDE; cd fbgemm_gpu/test; run_fbgemm_gpu_tests $BUILD_ENV rocm


build_and_test_cpu:
runs-on: ${{ matrix.host-machine.instance }}
container:
Expand Down
197 changes: 197 additions & 0 deletions .github/workflows/fbgemm_gpu_rocm_ci.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,197 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.

# This workflow is used for both CI as well as nightly builds against
# PyTorch-ROCm Nightly.
#
# It is split into two jobs:
#   1. build_artifact            - builds the FBGEMM_GPU-ROCm wheel on a CPU
#                                  host and uploads it as a GHA artifact
#   2. test_and_publish_artifact - downloads the wheel matching its own matrix
#                                  entry and runs the test suite on a ROCm host
name: FBGEMM_GPU-ROCm CI

on:
  # PR Trigger (enabled for regression checks and debugging)
  #
  pull_request:
    branches:
      - main

  # Push Trigger (enable to catch errors coming out of multiple merges)
  #
  push:
    branches:
      - main

  # Cron Trigger (UTC)
  #
  # Based on the Conda page for PyTorch-nightly, the GPU nightly releases appear
  # around 02:30 PST every day (roughly 2 hours after the CPU releases)
  #
  schedule:
    - cron: '45 12 * * *'

  # Manual Trigger
  #
  workflow_dispatch:
    inputs:
      publish_to_pypi:
        description: Publish Artifact to PyPI
        type: boolean
        required: false
        default: false

concurrency:
  # Cancel previous runs in the PR if a new commit is pushed
  group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}
  cancel-in-progress: true

jobs:
  # Build on CPU hosts and upload to GHA
  build_artifact:
    runs-on: ${{ matrix.host-machine.instance }}
    container:
      image: ${{ matrix.container-image }}
      options: --user root
    defaults:
      run:
        shell: bash
    env:
      # Every step sources PRELUDE to bring the helper functions into scope
      PRELUDE: .github/scripts/setup_env.bash
      BUILD_ENV: build_binary
    strategy:
      fail-fast: false
      matrix:
        host-machine: [
          { arch: x86, instance: "linux.24xlarge" },
        ]
        container-image: [ "ubuntu:20.04" ]
        python-version: [ "3.8", "3.9", "3.10", "3.11", "3.12" ]
        rocm-version: [ "5.7" ]

    steps:
      - name: Setup Build Container
        run: |
          apt update -y
          apt install -y binutils git pciutils sudo wget
          git config --global --add safe.directory '*'

      - name: Checkout the Repository
        uses: actions/checkout@v4
        with:
          submodules: true

      - name: Display System Info
        run: . $PRELUDE; print_system_info

      - name: Display GPU Info
        run: . $PRELUDE; print_gpu_info

      - name: Free Disk Space
        run: . $PRELUDE; free_disk_space

      - name: Setup Miniconda
        run: . $PRELUDE; setup_miniconda $HOME/miniconda

      - name: Create Conda Environment
        run: . $PRELUDE; create_conda_environment $BUILD_ENV ${{ matrix.python-version }}

      - name: Install ROCm
        run: . $PRELUDE; install_rocm_ubuntu $BUILD_ENV ${{ matrix.rocm-version }}

      - name: Install C/C++ Compilers
        run: . $PRELUDE; install_cxx_compiler $BUILD_ENV

      - name: Install Build Tools
        run: . $PRELUDE; install_build_tools $BUILD_ENV

      - name: Install PyTorch-ROCm Nightly
        run: . $PRELUDE; install_pytorch_pip $BUILD_ENV nightly rocm ${{ matrix.rocm-version }}

      - name: Collect PyTorch Environment Info
        # Run even if an earlier step failed, so the environment dump is
        # available for debugging
        if: ${{ success() || failure() }}
        run: . $PRELUDE; collect_pytorch_env_info $BUILD_ENV

      - name: Prepare FBGEMM_GPU Build
        run: . $PRELUDE; cd fbgemm_gpu; prepare_fbgemm_gpu_build $BUILD_ENV

      - name: Build FBGEMM_GPU Wheel
        run: . $PRELUDE; cd fbgemm_gpu; build_fbgemm_gpu_package $BUILD_ENV nightly rocm

      - name: Upload Built Wheel as GHA Artifact
        uses: actions/upload-artifact@v3
        with:
          # The artifact name must stay in sync with the download step in
          # test_and_publish_artifact
          name: fbgemm_gpu_nightly_rocm_${{ matrix.host-machine.arch }}_${{ matrix.python-version }}_rocm${{ matrix.rocm-version }}.whl
          path: fbgemm_gpu/dist/fbgemm_gpu_nightly_rocm-*.whl


  # Download the built artifact from GHA, test on GPU, and push to PyPI
  #
  # NOTE(review): despite the job name and the `publish_to_pypi` dispatch
  # input, no publish step is defined among the steps below - confirm whether
  # one is intended.
  test_and_publish_artifact:
    runs-on: ${{ matrix.host-machine.instance }}
    container:
      image: "rocm/dev-ubuntu-20.04:${{ matrix.rocm-version }}-complete"
      options: --user root --device=/dev/kfd --device=/dev/dri --ipc=host --shm-size 16G --group-add video --cap-add=SYS_PTRACE --security-opt seccomp=unconfined
    defaults:
      run:
        shell: bash
    env:
      PRELUDE: .github/scripts/setup_env.bash
      BUILD_ENV: build_binary
      ENFORCE_AMD_GPU: 1
    strategy:
      fail-fast: false
      matrix:
        host-machine: [
          { arch: x86, instance: "rocm" },
        ]
        # ROCm machines are limited, so we only test against Python 3.11
        python-version: [ "3.11" ]
        rocm-version: [ "5.7" ]
    needs: build_artifact

    steps:
      - name: Setup Build Container
        run: |
          apt update -y
          apt install -y git wget
          git config --global --add safe.directory '*'

      - name: Checkout the Repository
        # Same major version as the checkout in build_artifact
        uses: actions/checkout@v4

      - name: Download Wheel Artifact from GHA
        uses: actions/download-artifact@v3
        with:
          name: fbgemm_gpu_nightly_rocm_${{ matrix.host-machine.arch }}_${{ matrix.python-version }}_rocm${{ matrix.rocm-version }}.whl

      - name: Display System Info
        run: . $PRELUDE; print_system_info

      - name: Display GPU Info
        run: . $PRELUDE; print_gpu_info

      - name: Free Disk Space
        run: . $PRELUDE; free_disk_space

      - name: Setup Miniconda
        run: . $PRELUDE; setup_miniconda $HOME/miniconda

      - name: Create Conda Environment
        run: . $PRELUDE; create_conda_environment $BUILD_ENV ${{ matrix.python-version }}

      - name: Install Build Tools
        run: . $PRELUDE; install_build_tools $BUILD_ENV

      - name: Install PyTorch-ROCm Nightly
        run: . $PRELUDE; install_pytorch_pip $BUILD_ENV nightly rocm ${{ matrix.rocm-version }}

      - name: Collect PyTorch Environment Info
        # Run even if an earlier step failed, so the environment dump is
        # available for debugging
        if: ${{ success() || failure() }}
        run: . $PRELUDE; collect_pytorch_env_info $BUILD_ENV

      - name: Prepare FBGEMM_GPU Build
        run: . $PRELUDE; cd fbgemm_gpu; prepare_fbgemm_gpu_build $BUILD_ENV

      - name: Install FBGEMM_GPU Wheel
        run: . $PRELUDE; install_fbgemm_gpu_wheel $BUILD_ENV *.whl

      - name: Test with PyTest
        timeout-minutes: 15
        # The `rocm` variant must be passed explicitly: run_fbgemm_gpu_tests
        # only sets FBGEMM_TEST_WITH_ROCM and HIP_LAUNCH_BLOCKING when its
        # variant argument equals "rocm"
        run: . $PRELUDE; cd fbgemm_gpu/test; run_fbgemm_gpu_tests $BUILD_ENV rocm

0 comments on commit 6c5d308

Please sign in to comment.