Skip to content

Commit

Permalink
[CI] Improve workflows (#23)
Browse files (browse the repository at this point in the history)
* improve workflows
  • Loading branch information
shink authored Jan 23, 2025
1 parent 6d83a0b commit d698170
Show file tree
Hide file tree
Showing 2 changed files with 157 additions and 144 deletions.
216 changes: 111 additions & 105 deletions .github/workflows/deepspeed.yaml
Original file line number | Diff line number | Diff line change
@@ -1,8 +1,5 @@
name: huawei-ascend-npu
name: Unit tests with DeepSpeed on Ascend NPU

defaults:
run:
shell: bash -ieo pipefail {0}
on:
workflow_dispatch:
pull_request:
@@ -16,118 +13,127 @@ concurrency:
group: ${{ github.workflow }}-${{ github.ref }}
cancel-in-progress: true

permissions:
contents: read
issues: write
# By default, the bash shells used by run steps do not load ~/.profile or ~/.bashrc,
# so the shell is declared explicitly as "shell: bash -el {0}". It is set here as the
# default for all run steps so that the ascend-toolkit environment variables are activated.
defaults:
run:
shell: bash -el {0}

jobs:
unit-tests:
deepspeed-ut:
if: ${{ github.repository_owner == 'Ascend' }}
name: Run unit tests with DeepSpeed
runs-on: [self-hosted, ascend, npu]
container:
image: ascendai/cann
ports:
- 80
image: ascendai/cann:latest
volumes:
- /usr/local/dcmi:/usr/local/dcmi
- /usr/local/bin/npu-smi:/usr/local/bin/npu-smi
- /usr/local/bin/npu-smi:/usr/local/bin/npu-smi
- /usr/local/Ascend/driver/lib64/:/usr/local/Ascend/driver/lib64/
- /usr/local/Ascend/driver/version.info:/usr/local/Ascend/driver/version.info
- /etc/ascend_install.info:/etc/ascend_install.info
options: --network host
--name deepspeed_unit-tests
--device /dev/davinci0
--device /dev/davinci_manager
--device /dev/devmm_svm
--device /dev/hisi_hdc
--shm-size "20g"
--entrypoint /bin/bash

- /etc/ascend_install.info:/etc/ascend_install.info
options: >-
--network host
--device /dev/davinci0
--device /dev/davinci_manager
--device /dev/devmm_svm
--device /dev/hisi_hdc
steps:
- uses: actions/checkout@v4

- name: Install pytorch
run: |
npu-smi info
apt-get update
apt-get install sudo
pip config set global.index-url https://pypi.tuna.tsinghua.edu.cn/simple
source /root/.bashrc
pip install torch==2.2.0 torchvision==0.17.0 torch_npu==2.2.0 torchaudio==2.2.0 numpy==1.26.4 cloudpickle tornado ml-dtypes
python << EOF
if __name__ == '__main__':
import torch
import torch_npu
torch_npu.npu.set_device("npu:0")
print(f"Device Name: {torch.npu.get_device_name(0)}")
print(f"Device Count: {torch.npu.device_count()}")
print(f"Device Available: {torch.npu.is_available()}")
EOF
- name: Install transformers
uses: nick-fields/retry@v3
with:
timeout_minutes: 30
max_attempts: 3
retry_on: error
command: |
source /root/.bashrc
echo "y" | apt-get install git
git clone https://github.com/huggingface/transformers
cd transformers
git rev-parse --short HEAD
- name: Show NPU info
run: |
npu-smi info
- name: Config mirrors
run: |
sed -i 's|ports.ubuntu.com|mirrors.tuna.tsinghua.edu.cn|g' /etc/apt/sources.list
pip config set global.index-url https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple
- name: Install system dependencies
run: |
apt-get update
apt-get install -y \
git gcc g++ make cmake ninja-build
- name: Checkout
uses: actions/checkout@v4

- name: Install pytorch
run: |
pip install \
torch==2.2.0 \
torch_npu==2.2.0 \
torchvision==0.17.0 \
torchaudio==2.2.0 \
numpy==1.26.4 \
cloudpickle \
tornado \
ml-dtypes
python << EOF
if __name__ == '__main__':
import torch
import torch_npu
torch_npu.npu.set_device("npu:0")
print(f"Device Name: {torch.npu.get_device_name(0)}")
print(f"Device Count: {torch.npu.device_count()}")
print(f"Device Available: {torch.npu.is_available()}")
EOF
- name: Checkout transformers
uses: actions/checkout@v4
with:
repository: huggingface/transformers
path: transformers

- name: Install transformers
working-directory: transformers
run: |
pip install .
- name: Install deepspeed
uses: nick-fields/retry@v3
with:
timeout_minutes: 30
max_attempts: 3
retry_on: error
command: |
source /root/.bashrc
git clone --depth=1 https://github.com/microsoft/DeepSpeed.git
- name: Checkout deepspeed
uses: actions/checkout@v4
with:
repository: microsoft/DeepSpeed
path: deepspeed

- name: Install deepspeed dependencies
run: |
pip install -r requirements/requirements_deepspeed.txt
cd DeepSpeed
- name: Install deepspeed
working-directory: deepspeed
run: |
pip install .[1bit,autotuning,inf]
ds_report
- name: Python environment
run: |
source /root/.bashrc
pip list
- name: Unit tests
run: |
source /root/.bashrc
unset TORCH_CUDA_ARCH_LIST # only jit compile for current arch
cd DeepSpeed/tests/unit/
pytest --verbose accelerator/*
pytest --verbose autotuning/*
pytest --verbose checkpoint/test_reshape_checkpoint.py
pytest --verbose checkpoint/test_moe_checkpoint.py
pytest --verbose checkpoint/test_shared_weights.py
pytest --verbose launcher/test_ds_arguments.py launcher/test_run.py
pytest --verbose model_parallelism/*
pytest --verbose moe/test_moe_tp.py
pytest --verbose monitor/*
pytest --verbose utils/*
pytest --verbose runtime/test_ds_config_model.py
pytest --verbose runtime/pipe/test_pipe_schedule.py
pytest --verbose runtime/zero/test_zero_config.py
pytest --verbose runtime/zero/test_zero_tiled.py
pytest --verbose runtime/zero/test_zeropp.py
pytest --verbose runtime/test_autocast.py
pytest --verbose runtime/test_data.py
pytest --verbose runtime/test_runtime_utils.py
pytest --verbose runtime/activation_checkpointing/*
pytest --verbose runtime/utils/*
pytest --verbose runtime/zero/test_zero_dynamic_class.py
- name: Show environment info
run: |
pip list
- name: Run unit tests
working-directory: deepspeed/tests/unit
run: |
unset TORCH_CUDA_ARCH_LIST # only jit compile for current arch
pytest --verbose accelerator/*
pytest --verbose autotuning/*
pytest --verbose checkpoint/test_reshape_checkpoint.py
pytest --verbose checkpoint/test_moe_checkpoint.py
pytest --verbose checkpoint/test_shared_weights.py
pytest --verbose launcher/test_ds_arguments.py launcher/test_run.py
pytest --verbose model_parallelism/*
pytest --verbose moe/test_moe_tp.py
pytest --verbose monitor/*
pytest --verbose utils/*
pytest --verbose runtime/test_ds_config_model.py
pytest --verbose runtime/pipe/test_pipe_schedule.py
pytest --verbose runtime/zero/test_zero_config.py
pytest --verbose runtime/zero/test_zero_tiled.py
pytest --verbose runtime/zero/test_zeropp.py
pytest --verbose runtime/test_autocast.py
pytest --verbose runtime/test_data.py
pytest --verbose runtime/test_runtime_utils.py
pytest --verbose runtime/activation_checkpointing/*
pytest --verbose runtime/utils/*
pytest --verbose runtime/zero/test_zero_dynamic_class.py
85 changes: 46 additions & 39 deletions .github/workflows/llamacpp.yaml
Original file line number Diff line number Diff line change
@@ -1,54 +1,61 @@
name: llama.cpp
name: Build llama.cpp in CANN container

defaults:
run:
shell: bash -el {0}
on:
workflow_dispatch:
pull_request:
# paths:
# - '.github/workflows/llamacpp.yaml'
# - 'requirements/**'
paths:
- '.github/workflows/llamacpp.yaml'
- 'requirements/**'
schedule:
- cron: "0 0 * * *"

concurrency:
group: ${{ github.workflow }}-${{ github.ref }}
cancel-in-progress: true

permissions:
contents: read
issues: write
# By default, the bash shells used by run steps do not load ~/.profile or ~/.bashrc,
# so the shell is declared explicitly as "shell: bash -el {0}". It is set here as the
# default for all run steps so that the ascend-toolkit environment variables are activated.
defaults:
run:
shell: bash -el {0}

jobs:
unit-tests:
if: contains(github.event.pull_request.labels.*.name, 'Ascend NPU')

runs-on: ubuntu-latest
strategy:
matrix:
build: ['Release']
cann: ['openeuler-python3.10-cann8.0.rc3.beta1']
device: ['ascend910b3']
container:
image: ascendai/cann:${{ matrix.cann }}
steps:
- uses: actions/checkout@v4
- name: Install llamacpp
uses: nick-fields/retry@v3
openeuler-arm64-test:
if: ${{ github.event_name != 'pull_request' || contains(github.event.pull_request.labels.*.name, 'Ascend NPU') }}
name: Build llama.cpp on OpenEuler for Arm64
runs-on: ubuntu-24.04-arm
strategy:
matrix:
cann:
- '8.0.rc3.beta1-910b-openeuler22.03-py3.10'
device:
- 'ascend910b3'
build:
- 'Release'
container: ascendai/cann:${{ matrix.cann }}
steps:
- name: Install dependencies
run: |
yum update -y
yum install -y git gcc gcc-c++ make cmake
- name: Checkout
uses: actions/checkout@v4

- name: Checkout llama.cpp
uses: actions/checkout@v4
with:
timeout_minutes: 30
max_attempts: 3
retry_on: error
command: |
yum update -y
yum install git cmake gcc gcc-c++ make -y
git clone https://github.com/ggerganov/llama.cpp.git
- name: Build
repository: ggerganov/llama.cpp
path: llama.cpp

- name: Build llama.cpp
working-directory: llama.cpp
run: |
cd llama.cpp
mkdir build
cd build
export LD_LIBRARY_PATH=/usr/local/Ascend/ascend-toolkit/latest/lib64:/usr/local/Ascend/ascend-toolkit/latest/x86_64-linux/devlib/:${LD_LIBRARY_PATH}
cmake .. -DCMAKE_BUILD_TYPE=${{ matrix.build }} -DGGML_CANN=on -DSOC_TYPE=${{ matrix.device }} && cmake --build . -j $(nproc)
export LD_LIBRARY_PATH=${ASCEND_TOOLKIT_HOME}/lib64:${ASCEND_TOOLKIT_HOME}/$(uname -m)-linux/devlib/:${LD_LIBRARY_PATH}
cmake -S . -B build \
-DCMAKE_BUILD_TYPE=${{ matrix.build }} \
-DGGML_CANN=on \
-DSOC_TYPE=${{ matrix.device }}
cmake --build build -j $(nproc)

0 comments on commit d698170

Please sign in to comment.