Extract Full Model Execution Tests into Nightly Tests (#379)
### Ticket
#378

### Problem description
Full model execution tests are only attempted in push/PR-triggered workflows, but not in nightlies. This complicates queries and leaves no single source of truth for the actual full model execution status on tt-torch main.

### What's changed
Duplicate the workflow in run-tests.yml into a self-contained job that can be run in nightly tests. Execution tests are refactored to run in parallel.

### Checklist
- [x] New/Existing tests provide coverage for changes
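Because the new job exposes a `workflow_call` trigger, a scheduled nightly pipeline can reuse it directly. Below is a minimal sketch of such a caller; the caller file name, the cron schedule, and the path to the new workflow file are illustrative assumptions, not part of this commit:

```yaml
# Hypothetical nightly caller (illustration only; file name, schedule, and
# workflow path are assumed, not taken from this commit).
name: Nightly
on:
  schedule:
    - cron: "0 6 * * *"  # assumed nightly time
jobs:
  full-model-execution:
    # Reuses the new job through its workflow_call trigger; secrets such as
    # HF_TOKEN and CODECOV_TOKEN are passed through with `secrets: inherit`.
    uses: ./.github/workflows/run-full-model-execution-tests.yml
    secrets: inherit
```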
1 parent 186ad28, commit 5fbaf8d. Showing 2 changed files with 239 additions and 0 deletions.
@@ -0,0 +1,235 @@
name: Run Full Model Execution Tests

on:
  workflow_dispatch:
  workflow_call:
  workflow_run:
    workflows: [Build] # backref to run-build as dependency
    types: [completed]

jobs:
  tests:
    timeout-minutes: 90 # current onPR time runs ~60m
    strategy:
      fail-fast: false
      matrix:
        build: [
          {
            runs-on: wormhole_b0, name: "autoencoder", tests: "
              tests/models/autoencoder_linear/test_autoencoder_linear.py::test_autoencoder_linear[full-eval]
            "
          },
          {
            runs-on: wormhole_b0, name: "distilbert", tests: "
              tests/models/distilbert/test_distilbert.py::test_distilbert[full-distilbert-base-uncased-eval]
            "
          },
          {
            runs-on: wormhole_b0, name: "mlpmixer", tests: "
              tests/models/mlpmixer/test_mlpmixer.py::test_mlpmixer[full-eval]
            "
          },
          {
            runs-on: wormhole_b0, name: "mnist", tests: "
              tests/models/mnist/test_mnist.py::test_mnist_train[full-eval]
            "
          },
          {
            runs-on: wormhole_b0, name: "mobilenet", tests: "
              tests/models/MobileNetV2/test_MobileNetV2.py::test_MobileNetV2[full-eval]
            "
          },
          {
            runs-on: wormhole_b0, name: "openpose", tests: "
              tests/models/openpose/test_openpose_v2.py::test_openpose_v2[full-eval]
            "
          },
          {
            runs-on: wormhole_b0, name: "perceiver_io", tests: "
              tests/models/perceiver_io/test_perceiver_io.py::test_perceiver_io[full-eval]
            "
          },
          {
            runs-on: wormhole_b0, name: "resnet", tests: "
              tests/models/resnet/test_resnet.py::test_resnet[full-eval]
              tests/models/resnet50/test_resnet50.py::test_resnet[full-eval]
            "
          },
          {
            runs-on: wormhole_b0, name: "yolov3", tests: "
              tests/models/yolov3/test_yolov3.py::test_yolov3[full-eval]
            "
          },
          {
            runs-on: wormhole_b0, name: "albert_masked_lm", tests: "
              tests/models/albert/test_albert_masked_lm.py::test_albert_masked_lm[full-albert/albert-xxlarge-v2-eval]
            "
          },
          {
            runs-on: wormhole_b0, name: "llama_3b", tests: "
              tests/models/llama/test_llama_3b.py::test_llama_3b[full-meta-llama/Llama-3.2-3B-eval]
            "
          },
          # {
          #   runs-on: wormhole_b0, name: "torchvision_image_classification", tests: "
          #     tests/models/torchvision/test_torchvision_image_classification.py::test_torchvision_image_classification[full-eval-mobilenet_v2]
          #     tests/models/torchvision/test_torchvision_image_classification.py::test_torchvision_image_classification[full-eval-mobilenet_v3_small]
          #     tests/models/torchvision/test_torchvision_image_classification.py::test_torchvision_image_classification[full-eval-mobilenet_v3_large]
          #     tests/models/torchvision/test_torchvision_image_classification.py::test_torchvision_image_classification[full-eval-resnet18]
          #     tests/models/torchvision/test_torchvision_image_classification.py::test_torchvision_image_classification[full-eval-resnet34]
          #     tests/models/torchvision/test_torchvision_image_classification.py::test_torchvision_image_classification[full-eval-resnet50]
          #     tests/models/torchvision/test_torchvision_image_classification.py::test_torchvision_image_classification[full-eval-resnet101]
          #     tests/models/torchvision/test_torchvision_image_classification.py::test_torchvision_image_classification[full-eval-resnet152]
          #     tests/models/torchvision/test_torchvision_image_classification.py::test_torchvision_image_classification[full-eval-resnext50_32x4d]
          #     tests/models/torchvision/test_torchvision_image_classification.py::test_torchvision_image_classification[full-eval-resnext101_32x8d]
          #     tests/models/torchvision/test_torchvision_image_classification.py::test_torchvision_image_classification[full-eval-resnext101_64x4d]
          #     tests/models/torchvision/test_torchvision_image_classification.py::test_torchvision_image_classification[full-eval-wide_resnet50_2]
          #     tests/models/torchvision/test_torchvision_image_classification.py::test_torchvision_image_classification[full-eval-wide_resnet101_2]
          #     tests/models/torchvision/test_torchvision_image_classification.py::test_torchvision_image_classification[full-eval-regnet_y_400mf]
          #     tests/models/torchvision/test_torchvision_image_classification.py::test_torchvision_image_classification[full-eval-regnet_y_800mf]
          #     tests/models/torchvision/test_torchvision_image_classification.py::test_torchvision_image_classification[full-eval-regnet_y_1_6gf]
          #     tests/models/torchvision/test_torchvision_image_classification.py::test_torchvision_image_classification[full-eval-regnet_y_3_2gf]
          #     tests/models/torchvision/test_torchvision_image_classification.py::test_torchvision_image_classification[full-eval-regnet_y_8gf]
          #     tests/models/torchvision/test_torchvision_image_classification.py::test_torchvision_image_classification[full-eval-regnet_y_16gf]
          #     tests/models/torchvision/test_torchvision_image_classification.py::test_torchvision_image_classification[full-eval-regnet_y_32gf]
          #     tests/models/torchvision/test_torchvision_image_classification.py::test_torchvision_image_classification[full-eval-regnet_x_400mf]
          #     tests/models/torchvision/test_torchvision_image_classification.py::test_torchvision_image_classification[full-eval-regnet_x_800mf]
          #     tests/models/torchvision/test_torchvision_image_classification.py::test_torchvision_image_classification[full-eval-regnet_x_1_6gf]
          #     tests/models/torchvision/test_torchvision_image_classification.py::test_torchvision_image_classification[full-eval-regnet_x_3_2gf]
          #     tests/models/torchvision/test_torchvision_image_classification.py::test_torchvision_image_classification[full-eval-regnet_x_8gf]
          #     tests/models/torchvision/test_torchvision_image_classification.py::test_torchvision_image_classification[full-eval-regnet_x_16gf]
          #     tests/models/torchvision/test_torchvision_image_classification.py::test_torchvision_image_classification[full-eval-regnet_x_32gf]
          #   "
          # },

        ]
    runs-on:
      - ${{ matrix.build.runs-on }}

    name: "test execution (${{ matrix.build.runs-on }}, ${{ matrix.build.name }})"

    container:
      image: ghcr.io/tenstorrent/tt-torch/tt-torch-ci-ubuntu-22-04:latest
      options: --user root --device /dev/tenstorrent/0 --shm-size=4gb
      volumes:
        - /dev/hugepages:/dev/hugepages
        - /dev/hugepages-1G:/dev/hugepages-1G
        - /etc/udev/rules.d:/etc/udev/rules.d
        - /lib/modules:/lib/modules
        - /opt/tt_metal_infra/provisioning/provisioning_env:/opt/tt_metal_infra/provisioning/provisioning_env
        - /mnt/dockercache:/mnt/dockercache
    steps:
      - uses: actions/checkout@v4
        with:
          submodules: recursive
          lfs: true

      - name: Fetch job id
        id: fetch-job-id
        uses: tenstorrent/tt-github-actions/.github/actions/job_id@main
        with:
          job_name: "test execution (${{ matrix.build.runs-on }}, ${{ matrix.build.name }})" # reference above tests.name

      - name: Set reusable strings
        id: strings
        shell: bash
        env:
          JOB_ID: ${{ steps.fetch-job-id.outputs.job_id }}

        run: |
          echo "work-dir=$(pwd)" >> "$GITHUB_OUTPUT"
          echo "install-dir=$(pwd)/install" >> "$GITHUB_OUTPUT"
          echo "dist-dir=$(pwd)/dist" >> "$GITHUB_OUTPUT"
          echo "test_report_path_models=report_models_$JOB_ID.xml" >> "$GITHUB_OUTPUT"
      - name: Use build artifacts
        uses: actions/download-artifact@v4
        with:
          name: install-artifacts
          path: ${{ steps.strings.outputs.install-dir }}

      - name: 'Untar install directory'
        shell: bash
        working-directory: ${{ steps.strings.outputs.install-dir }}
        run: |
          tar xvf artifact.tar
          mkdir -p ${{ steps.strings.outputs.dist-dir }}
          mv wheels/* ${{ steps.strings.outputs.dist-dir }}
      - name: install tt-torch
        shell: bash
        run: |
          source env/activate
          pip install ${{ steps.strings.outputs.dist-dir }}/*.whl
      - name: Run Full Model Execution Tests
        env:
          HF_HOME: /mnt/dockercache/huggingface
          TORCH_HOME: /mnt/dockercache/torch
          HF_TOKEN: ${{ secrets.HF_TOKEN }}
        shell: bash
        run: |
          # Make sure we don't stop on first failure
          set +e
          tests_list=$(echo "${{ matrix.build.tests }}" | xargs -n1 echo)
          total_tests=$(echo "$tests_list" | wc -l)
          failures=0
          counter=0
          source env/activate
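          # Run each test file in its own pytest invocation so one model's failure
          # does not stop the remaining tests; coverage accumulates via --cov-append.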
          for test_case in $tests_list; do
            counter=$((counter + 1))
            pytest -v "$test_case" \
              --junit-xml=${{ steps.strings.outputs.test_report_path_models }} \
              --cov=tt_torch --cov-report term --cov-report xml:coverage.xml --cov-append
            exit_code=$?
            if [ $exit_code -eq 0 ]; then
              echo "[ $counter / $total_tests ] $test_case PASSED"
            else
              echo "[ $counter / $total_tests ] $test_case FAILED"
              failures=$((failures + 1))
            fi
          done
          # If any test failed, exit nonzero to mark the job as failed
          if [ $failures -ne 0 ]; then
            echo "Total failures: $failures"
            exit 1
          fi
      - name: Upload Test Report Models
        uses: actions/upload-artifact@v4
        if: success() || failure()
        with:
          name: test-reports-models-${{ matrix.build.runs-on }}-${{ matrix.build.name }}-${{ steps.fetch-job-id.outputs.job_id }}
          path: ${{ steps.strings.outputs.test_report_path_models }}

      - name: Show Test Report
        uses: mikepenz/action-junit-report@v5
        if: success() || failure()
        with:
          report_paths: ${{ steps.strings.outputs.test_report_path_models }}
          check_name: TT-Torch Tests
          comment: true
          updateComment: false
          detailed_summary: true
          group_suite: true
      - name: Upload coverage reports to Codecov
        if: success() || failure()
        continue-on-error: true
        uses: codecov/codecov-action@v5
        with:
          files: coverage.info,.coverage,coverage.xml
          # disable_search: true
          token: ${{ secrets.CODECOV_TOKEN }}

      - name: Upload test results to Codecov
        if: success() || failure()
        continue-on-error: true
        uses: codecov/test-results-action@v1
        with:
          files: ${{ steps.strings.outputs.test_report_path_models }}
          disable_search: true
          token: ${{ secrets.CODECOV_TOKEN }}