From a6041bae0f7e9a8c9c928f7b8a92fe9473f327da Mon Sep 17 00:00:00 2001 From: James Lamb Date: Tue, 4 Feb 2025 12:09:48 -0600 Subject: [PATCH 1/9] test gha-tools conda timeout --- .github/workflows/pr.yaml | 142 ++++++++++++++++---------------- ci/build_cpp.sh | 2 + ci/build_docs.sh | 2 + ci/check_style.sh | 2 + ci/test_common.sh | 2 + ci/test_cpp.sh | 2 + ci/test_python.sh | 2 + ci/test_python_distributed.sh | 2 + ci/use_gha_tools_from_branch.sh | 14 ++++ 9 files changed, 99 insertions(+), 71 deletions(-) create mode 100644 ci/use_gha_tools_from_branch.sh diff --git a/.github/workflows/pr.yaml b/.github/workflows/pr.yaml index 1a011908..a3869f15 100644 --- a/.github/workflows/pr.yaml +++ b/.github/workflows/pr.yaml @@ -12,37 +12,37 @@ concurrency: jobs: pr-builder: needs: - - check-nightly-ci + # - check-nightly-ci - changed-files - checks - conda-cpp-build - - devcontainer + # - devcontainer - docs-build - conda-cpp-tests - conda-python-tests - conda-python-distributed-tests - - wheel-build-libucxx - - wheel-build-ucxx - - wheel-tests-ucxx - - wheel-build-distributed-ucxx - - wheel-tests-distributed-ucxx + # - wheel-build-libucxx + # - wheel-build-ucxx + # - wheel-tests-ucxx + # - wheel-build-distributed-ucxx + # - wheel-tests-distributed-ucxx secrets: inherit uses: rapidsai/shared-workflows/.github/workflows/pr-builder.yaml@nvks-runners if: always() with: needs: ${{ toJSON(needs) }} - check-nightly-ci: - # Switch to ubuntu-latest once it defaults to a version of Ubuntu that - # provides at least Python 3.11 (see - # https://docs.python.org/3/library/datetime.html#datetime.date.fromisoformat) - runs-on: ubuntu-24.04 - env: - RAPIDS_GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} - steps: - - name: Check if nightly CI is passing - uses: rapidsai/shared-actions/check_nightly_success/dispatch@main - with: - repo: ucxx + # check-nightly-ci: + # # Switch to ubuntu-latest once it defaults to a version of Ubuntu that + # # provides at least Python 3.11 (see + # # https://docs.python.org/3/library/datetime.html#datetime.date.fromisoformat) + # runs-on: ubuntu-24.04 + # env: + # RAPIDS_GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} + # steps: + # - name: Check if nightly CI is passing + # uses: rapidsai/shared-actions/check_nightly_success/dispatch@main + # with: + # repo: ucxx changed-files: secrets: inherit uses: rapidsai/shared-workflows/.github/workflows/changed-files.yaml@nvks-runners @@ -107,55 +107,55 @@ jobs: build_type: pull-request script: "ci/test_python_distributed.sh" container-options: "--cap-add CAP_SYS_PTRACE --shm-size=8g --ulimit=nofile=1000000:1000000" - wheel-build-libucxx: - needs: checks - secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@nvks-runners - with: - build_type: pull-request - script: ci/build_wheel_libucxx.sh - # build for every combination of arch and CUDA version, but only for the latest Python - matrix_filter: group_by([.ARCH, (.CUDA_VER|split(".")|map(tonumber)|.[0])]) | map(max_by(.PY_VER|split(".")|map(tonumber))) - wheel-build-ucxx: - needs: wheel-build-libucxx - secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@nvks-runners - with: - build_type: pull-request - script: ci/build_wheel_ucxx.sh - wheel-tests-ucxx: - needs: [wheel-build-ucxx, changed-files] - secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@nvks-runners - if: fromJSON(needs.changed-files.outputs.changed_file_groups).test_python - with: - build_type: pull-request - container-options: "--cap-add CAP_SYS_PTRACE --shm-size=8g --ulimit=nofile=1000000:1000000" - script: ci/test_wheel_ucxx.sh - wheel-build-distributed-ucxx: - needs: checks - secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@nvks-runners - with: - build_type: pull-request - script: ci/build_wheel_distributed_ucxx.sh - wheel-tests-distributed-ucxx: - needs: [wheel-build-ucxx, wheel-build-distributed-ucxx, changed-files] - secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@nvks-runners - if: fromJSON(needs.changed-files.outputs.changed_file_groups).test_python - with: - build_type: pull-request - container-options: "--cap-add CAP_SYS_PTRACE --shm-size=8g --ulimit=nofile=1000000:1000000" - script: ci/test_wheel_distributed_ucxx.sh - devcontainer: - secrets: inherit - needs: checks - uses: rapidsai/shared-workflows/.github/workflows/build-in-devcontainer.yaml@branch-25.04 - with: - arch: '["amd64"]' - cuda: '["12.8"]' - build_command: | - sccache -z; - build-all --verbose; - sccache -s; + # wheel-build-libucxx: + # needs: checks + # secrets: inherit + # uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@nvks-runners + # with: + # build_type: pull-request + # script: ci/build_wheel_libucxx.sh + # # build for every combination of arch and CUDA version, but only for the latest Python + # matrix_filter: group_by([.ARCH, (.CUDA_VER|split(".")|map(tonumber)|.[0])]) | map(max_by(.PY_VER|split(".")|map(tonumber))) + # wheel-build-ucxx: + # needs: wheel-build-libucxx + # secrets: inherit + # uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@nvks-runners + # with: + # build_type: pull-request + # script: ci/build_wheel_ucxx.sh + # wheel-tests-ucxx: + # needs: [wheel-build-ucxx, changed-files] + # secrets: inherit + # uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@nvks-runners + # if: fromJSON(needs.changed-files.outputs.changed_file_groups).test_python + # with: + # build_type: pull-request + # container-options: "--cap-add CAP_SYS_PTRACE --shm-size=8g --ulimit=nofile=1000000:1000000" + # script: ci/test_wheel_ucxx.sh + # wheel-build-distributed-ucxx: + # needs: checks + # secrets: inherit + # uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@nvks-runners + # with: + # build_type: pull-request + # script: ci/build_wheel_distributed_ucxx.sh + # wheel-tests-distributed-ucxx: + # needs: [wheel-build-ucxx, wheel-build-distributed-ucxx, changed-files] + # secrets: inherit + # uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@nvks-runners + # if: fromJSON(needs.changed-files.outputs.changed_file_groups).test_python + # with: + # build_type: pull-request + # container-options: "--cap-add CAP_SYS_PTRACE --shm-size=8g --ulimit=nofile=1000000:1000000" + # script: ci/test_wheel_distributed_ucxx.sh + # devcontainer: + # secrets: inherit + # needs: checks + # uses: rapidsai/shared-workflows/.github/workflows/build-in-devcontainer.yaml@branch-25.04 + # with: + # arch: '["amd64"]' + # cuda: '["12.8"]' + # build_command: | + # sccache -z; + # build-all --verbose; + # sccache -s; diff --git a/ci/build_cpp.sh b/ci/build_cpp.sh index 98410678..68993483 100755 --- a/ci/build_cpp.sh +++ b/ci/build_cpp.sh @@ -5,6 +5,8 @@ set -euo pipefail +source ./ci/use_gha_tools_from_branch.sh + rapids-configure-conda-channels source rapids-configure-sccache diff --git a/ci/build_docs.sh b/ci/build_docs.sh index 30f3cd02..03f9a753 100755 --- a/ci/build_docs.sh +++ b/ci/build_docs.sh @@ -3,6 +3,8 @@ set -euo pipefail +source ./ci/use_gha_tools_from_branch.sh + rapids-logger "Create test conda environment" . /opt/conda/etc/profile.d/conda.sh diff --git a/ci/check_style.sh b/ci/check_style.sh index e1240d25..5cbc9218 100755 --- a/ci/check_style.sh +++ b/ci/check_style.sh @@ -5,6 +5,8 @@ set -euo pipefail +source ./ci/use_gha_tools_from_branch.sh + rapids-logger "Create checks conda environment" . /opt/conda/etc/profile.d/conda.sh diff --git a/ci/test_common.sh b/ci/test_common.sh index 314c66bc..58b00c05 100755 --- a/ci/test_common.sh +++ b/ci/test_common.sh @@ -5,6 +5,8 @@ set -euo pipefail +source ./ci/use_gha_tools_from_branch.sh + ################################### Common ##################################### log_command() { diff --git a/ci/test_cpp.sh b/ci/test_cpp.sh index e6178bc6..5b812848 100755 --- a/ci/test_cpp.sh +++ b/ci/test_cpp.sh @@ -5,6 +5,8 @@ set -euo pipefail +source ./ci/use_gha_tools_from_branch.sh + source "$(dirname "$0")/test_common.sh" rapids-logger "Create test conda environment" diff --git a/ci/test_python.sh b/ci/test_python.sh index 6ad39c46..96f7d1bf 100755 --- a/ci/test_python.sh +++ b/ci/test_python.sh @@ -5,6 +5,8 @@ set -euo pipefail +source ./ci/use_gha_tools_from_branch.sh + source "$(dirname "$0")/test_common.sh" rapids-logger "Create test conda environment" diff --git a/ci/test_python_distributed.sh b/ci/test_python_distributed.sh index d7b7e402..d4913d63 100755 --- a/ci/test_python_distributed.sh +++ b/ci/test_python_distributed.sh @@ -5,6 +5,8 @@ set -euo pipefail +source ./ci/use_gha_tools_from_branch.sh + source "$(dirname "$0")/test_common.sh" rapids-logger "Create test conda environment" diff --git a/ci/use_gha_tools_from_branch.sh b/ci/use_gha_tools_from_branch.sh new file mode 100644 index 00000000..22bb64c8 --- /dev/null +++ b/ci/use_gha_tools_from_branch.sh @@ -0,0 +1,14 @@ +#!/bin/bash + +# fill these in +GHA_TOOLS_BRANCH='conda-install-timeout' +GHA_TOOLS_REPO_ORG=jameslamb + +git clone \ + --branch ${GHA_TOOLS_BRANCH} \ + https://github.com/${GHA_TOOLS_REPO_ORG}/gha-tools.git \ + /tmp/gha-tools + +unset GHA_TOOLS_BRANCH GHA_TOOLS_REPO_ORG + +export PATH="/tmp/gha-tools/tools":$PATH From 3e994d4ea014b64d6fd25e1202d082d32296cb58 Mon Sep 17 00:00:00 2001 From: James Lamb Date: Tue, 4 Feb 2025 12:13:21 -0600 Subject: [PATCH 2/9] allow builds to proceed without checks --- .github/workflows/pr.yaml | 1 - 1 file changed, 1 deletion(-) diff --git a/.github/workflows/pr.yaml b/.github/workflows/pr.yaml index a3869f15..480f504c 100644 --- a/.github/workflows/pr.yaml +++ b/.github/workflows/pr.yaml @@ -67,7 +67,6 @@ jobs: with: enable_check_generated_files: false conda-cpp-build: - needs: checks secrets: inherit uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-build.yaml@nvks-runners with: From 578ad61caccefb337ce74c28fc870eee42293130 Mon Sep 17 00:00:00 2001 From: James Lamb Date: Tue, 4 Feb 2025 12:27:30 -0600 Subject: [PATCH 3/9] empty commit to re-trigger CI From f9da7f9e918c6ef46a7d61eccee64ee739762e7c Mon Sep 17 00:00:00 2001 From: James Lamb Date: Tue, 4 Feb 2025 13:00:15 -0600 Subject: [PATCH 4/9] reduce duplication in gha-tools overrides --- ci/test_cpp.sh | 2 -- ci/test_python.sh | 2 -- ci/test_python_distributed.sh | 2 -- 3 files changed, 6 deletions(-) diff --git a/ci/test_cpp.sh b/ci/test_cpp.sh index 5b812848..e6178bc6 100755 --- a/ci/test_cpp.sh +++ b/ci/test_cpp.sh @@ -5,8 +5,6 @@ set -euo pipefail -source ./ci/use_gha_tools_from_branch.sh - source "$(dirname "$0")/test_common.sh" rapids-logger "Create test conda environment" diff --git a/ci/test_python.sh b/ci/test_python.sh index 96f7d1bf..6ad39c46 100755 --- a/ci/test_python.sh +++ b/ci/test_python.sh @@ -5,8 +5,6 @@ set -euo pipefail -source ./ci/use_gha_tools_from_branch.sh - source "$(dirname "$0")/test_common.sh" rapids-logger "Create test conda environment" diff --git a/ci/test_python_distributed.sh b/ci/test_python_distributed.sh index d4913d63..d7b7e402 100755 --- a/ci/test_python_distributed.sh +++ b/ci/test_python_distributed.sh @@ -5,8 +5,6 @@ set -euo pipefail -source ./ci/use_gha_tools_from_branch.sh - source "$(dirname "$0")/test_common.sh" rapids-logger "Create test conda environment" From ca2d189df2161782f7f4e98bff7e4c8ac0b05bdb Mon Sep 17 00:00:00 2001 From: James Lamb Date: Tue, 4 Feb 2025 13:29:35 -0600 Subject: [PATCH 5/9] set RAPIDS_CONDA_RETRY_TIMEOUT env variable --- .github/workflows/pr.yaml | 3 +++ 1 file changed, 3 insertions(+) diff --git a/.github/workflows/pr.yaml b/.github/workflows/pr.yaml index 480f504c..139f74ba 100644 --- a/.github/workflows/pr.yaml +++ b/.github/workflows/pr.yaml @@ -9,6 +9,9 @@ concurrency: group: ${{ github.workflow }}-${{ github.ref }} cancel-in-progress: true +env: + RAPIDS_CONDA_RETRY_TIMEOUT: '13s' + jobs: pr-builder: needs: From 24d1897e9d618c233096d84686a5f7cdb757dc33 Mon Sep 17 00:00:00 2001 From: James Lamb Date: Tue, 4 Feb 2025 13:31:10 -0600 Subject: [PATCH 6/9] test with an integer --- .github/workflows/pr.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/pr.yaml b/.github/workflows/pr.yaml index 139f74ba..ded913c3 100644 --- a/.github/workflows/pr.yaml +++ b/.github/workflows/pr.yaml @@ -10,7 +10,7 @@ concurrency: cancel-in-progress: true env: - RAPIDS_CONDA_RETRY_TIMEOUT: '13s' + RAPIDS_CONDA_RETRY_TIMEOUT: 120 jobs: pr-builder: From f287c0d8d6f65b8fc7efa9e80b794bcc2ce18e18 Mon Sep 17 00:00:00 2001 From: James Lamb Date: Tue, 4 Feb 2025 13:38:54 -0600 Subject: [PATCH 7/9] try setting variable from script --- .github/workflows/pr.yaml | 3 --- ci/use_gha_tools_from_branch.sh | 2 ++ 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/.github/workflows/pr.yaml b/.github/workflows/pr.yaml index ded913c3..480f504c 100644 --- a/.github/workflows/pr.yaml +++ b/.github/workflows/pr.yaml @@ -9,9 +9,6 @@ concurrency: group: ${{ github.workflow }}-${{ github.ref }} cancel-in-progress: true -env: - RAPIDS_CONDA_RETRY_TIMEOUT: 120 - jobs: pr-builder: needs: diff --git a/ci/use_gha_tools_from_branch.sh b/ci/use_gha_tools_from_branch.sh index 22bb64c8..83274058 100644 --- a/ci/use_gha_tools_from_branch.sh +++ b/ci/use_gha_tools_from_branch.sh @@ -1,5 +1,7 @@ #!/bin/bash +export RAPIDS_CONDA_RETRY_TIMEOUT=120 + # fill these in GHA_TOOLS_BRANCH='conda-install-timeout' GHA_TOOLS_REPO_ORG=jameslamb From be86b5fb35d14eec0e6df679230fa4e746ff1dcf Mon Sep 17 00:00:00 2001 From: James Lamb Date: Tue, 4 Feb 2025 13:48:33 -0600 Subject: [PATCH 8/9] empty commit to re-trigger CI From 1e08d306194ae3f8066ed124620a382d8e958617 Mon Sep 17 00:00:00 2001 From: James Lamb Date: Tue, 4 Feb 2025 14:00:31 -0600 Subject: [PATCH 9/9] try RAPIDS_MAMBA_RETRY_TIMEOUT --- ci/use_gha_tools_from_branch.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ci/use_gha_tools_from_branch.sh b/ci/use_gha_tools_from_branch.sh index 83274058..29449884 100644 --- a/ci/use_gha_tools_from_branch.sh +++ b/ci/use_gha_tools_from_branch.sh @@ -1,6 +1,6 @@ #!/bin/bash -export RAPIDS_CONDA_RETRY_TIMEOUT=120 +export RAPIDS_MAMBA_RETRY_TIMEOUT=146 # fill these in GHA_TOOLS_BRANCH='conda-install-timeout'