From bf7f1696791a2b4bb93edabaaa4f6dfc1bcc1f4f Mon Sep 17 00:00:00 2001 From: James Lamb Date: Fri, 20 Sep 2024 15:41:03 -0500 Subject: [PATCH 1/5] try to reproduce nightly failure --- .github/workflows/pr.yaml | 52 +++++++++++++++++++-------------------- 1 file changed, 26 insertions(+), 26 deletions(-) diff --git a/.github/workflows/pr.yaml b/.github/workflows/pr.yaml index 6e5c86c54..b2a333815 100644 --- a/.github/workflows/pr.yaml +++ b/.github/workflows/pr.yaml @@ -37,7 +37,7 @@ jobs: conda-cpp-tests: needs: conda-cpp-build secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-tests.yaml@branch-24.10 + uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-tests.yaml@test-cuda-11.4 with: build_type: pull-request conda-python-build: @@ -49,30 +49,30 @@ jobs: conda-python-tests: needs: conda-python-build secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@branch-24.10 + uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@test-cuda-11.4 with: build_type: pull-request - docs-build: - needs: conda-python-build - secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.10 - with: - build_type: pull-request - arch: "amd64" - container_image: "rapidsai/ci-conda:latest" - run_script: "ci/build_docs.sh" - wheel-build-pylibwholegraph: - needs: checks - secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.10 - with: - build_type: pull-request - script: ci/build_wheel.sh - wheel-test-pylibwholegraph: - needs: wheel-build-pylibwholegraph - secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.10 - with: - build_type: pull-request - script: ci/test_wheel.sh - matrix_filter: map(select(.ARCH == "amd64")) + # docs-build: + # needs: conda-python-build + # secrets: inherit + # uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.10 + # with: + # build_type: pull-request + # arch: "amd64" + # container_image: "rapidsai/ci-conda:latest" + # run_script: "ci/build_docs.sh" + # wheel-build-pylibwholegraph: + # needs: checks + # secrets: inherit + # uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.10 + # with: + # build_type: pull-request + # script: ci/build_wheel.sh + # wheel-test-pylibwholegraph: + # needs: wheel-build-pylibwholegraph + # secrets: inherit + # uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.10 + # with: + # build_type: pull-request + # script: ci/test_wheel.sh + # matrix_filter: map(select(.ARCH == "amd64")) From 984dbd03732791831c43143d1fd5e0a102153441 Mon Sep 17 00:00:00 2001 From: James Lamb Date: Fri, 20 Sep 2024 15:43:12 -0500 Subject: [PATCH 2/5] workflow dependencies --- .github/workflows/pr.yaml | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/.github/workflows/pr.yaml b/.github/workflows/pr.yaml index b2a333815..8747fc83d 100644 --- a/.github/workflows/pr.yaml +++ b/.github/workflows/pr.yaml @@ -12,14 +12,14 @@ concurrency: jobs: pr-builder: needs: - - checks + # - checks - conda-cpp-build - conda-cpp-tests - conda-python-build - conda-python-tests - - docs-build - - wheel-build-pylibwholegraph - - wheel-test-pylibwholegraph + # - docs-build + # - wheel-build-pylibwholegraph + # - wheel-test-pylibwholegraph secrets: inherit uses: rapidsai/shared-workflows/.github/workflows/pr-builder.yaml@branch-24.10 checks: @@ -28,7 +28,7 @@ jobs: with: enable_check_generated_files: false conda-cpp-build: - needs: checks + # needs: checks secrets: inherit uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-build.yaml@branch-24.10 with: From 387422709f467524d9f363e9d9a99e11bee1df00 Mon Sep 17 00:00:00 2001 From: James Lamb Date: Fri, 20 Sep 2024 16:01:15 -0500 Subject: [PATCH 3/5] bump nccl floor to 2.18.1.1 --- .github/workflows/pr.yaml | 10 +++++----- conda/environments/all_cuda-118_arch-x86_64.yaml | 2 +- conda/environments/all_cuda-125_arch-x86_64.yaml | 2 +- conda/recipes/libwholegraph/conda_build_config.yaml | 2 +- dependencies.yaml | 6 +++--- 5 files changed, 11 insertions(+), 11 deletions(-) diff --git a/.github/workflows/pr.yaml b/.github/workflows/pr.yaml index 8747fc83d..58de4c2fd 100644 --- a/.github/workflows/pr.yaml +++ b/.github/workflows/pr.yaml @@ -22,11 +22,11 @@ jobs: # - wheel-test-pylibwholegraph secrets: inherit uses: rapidsai/shared-workflows/.github/workflows/pr-builder.yaml@branch-24.10 - checks: - secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/checks.yaml@branch-24.10 - with: - enable_check_generated_files: false + # checks: + # secrets: inherit + # uses: rapidsai/shared-workflows/.github/workflows/checks.yaml@branch-24.10 + # with: + # enable_check_generated_files: false conda-cpp-build: # needs: checks secrets: inherit diff --git a/conda/environments/all_cuda-118_arch-x86_64.yaml b/conda/environments/all_cuda-118_arch-x86_64.yaml index d989d880e..27e2ee6fa 100644 --- a/conda/environments/all_cuda-118_arch-x86_64.yaml +++ b/conda/environments/all_cuda-118_arch-x86_64.yaml @@ -28,7 +28,7 @@ dependencies: - librmm==24.10.*,>=0.0.0a0 - nanobind>=0.2.0 - nbsphinx -- nccl +- nccl>=2.18.1.1 - ninja - numpy>=1.23,<3.0a0 - numpydoc diff --git a/conda/environments/all_cuda-125_arch-x86_64.yaml b/conda/environments/all_cuda-125_arch-x86_64.yaml index 5b152cd31..5988a9893 100644 --- a/conda/environments/all_cuda-125_arch-x86_64.yaml +++ b/conda/environments/all_cuda-125_arch-x86_64.yaml @@ -30,7 +30,7 @@ dependencies: - librmm==24.10.*,>=0.0.0a0 - nanobind>=0.2.0 - nbsphinx -- nccl +- nccl>=2.18.1.1 - ninja - numpy>=1.23,<3.0a0 - numpydoc diff --git a/conda/recipes/libwholegraph/conda_build_config.yaml b/conda/recipes/libwholegraph/conda_build_config.yaml index 35b1d6b62..8b6dd3439 100644 --- a/conda/recipes/libwholegraph/conda_build_config.yaml +++ b/conda/recipes/libwholegraph/conda_build_config.yaml @@ -17,7 +17,7 @@ doxygen_version: - ">=1.8.11" nccl_version: - - ">=2.9.9" + - ">=2.18.1.1" c_stdlib: - sysroot diff --git a/dependencies.yaml b/dependencies.yaml index 8aaf92cd9..622e82a81 100644 --- a/dependencies.yaml +++ b/dependencies.yaml @@ -87,7 +87,7 @@ dependencies: - libraft-headers==24.10.*,>=0.0.0a0 - librmm==24.10.*,>=0.0.0a0 - nanobind>=0.2.0 - - nccl + - &nccl nccl>=2.18.1.1 specific: - output_types: conda matrices: @@ -216,14 +216,14 @@ dependencies: common: - output_types: [conda] packages: - - nccl + - *nccl test_python: common: - output_types: [conda] packages: - c-compiler - cxx-compiler - - nccl + - *nccl - output_types: [conda, requirements] packages: - ninja From 9169d9eeebde05c00090306c5224333d75529b3f Mon Sep 17 00:00:00 2001 From: James Lamb Date: Fri, 20 Sep 2024 16:32:52 -0500 Subject: [PATCH 4/5] revert testing-only changes --- .github/workflows/pr.yaml | 72 +++++++++++++++++++-------------------- 1 file changed, 36 insertions(+), 36 deletions(-) diff --git a/.github/workflows/pr.yaml b/.github/workflows/pr.yaml index 58de4c2fd..6e5c86c54 100644 --- a/.github/workflows/pr.yaml +++ b/.github/workflows/pr.yaml @@ -12,23 +12,23 @@ concurrency: jobs: pr-builder: needs: - # - checks + - checks - conda-cpp-build - conda-cpp-tests - conda-python-build - conda-python-tests - # - docs-build - # - wheel-build-pylibwholegraph - # - wheel-test-pylibwholegraph + - docs-build + - wheel-build-pylibwholegraph + - wheel-test-pylibwholegraph secrets: inherit uses: rapidsai/shared-workflows/.github/workflows/pr-builder.yaml@branch-24.10 - # checks: - # secrets: inherit - # uses: rapidsai/shared-workflows/.github/workflows/checks.yaml@branch-24.10 - # with: - # enable_check_generated_files: false + checks: + secrets: inherit + uses: rapidsai/shared-workflows/.github/workflows/checks.yaml@branch-24.10 + with: + enable_check_generated_files: false conda-cpp-build: - # needs: checks + needs: checks secrets: inherit uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-build.yaml@branch-24.10 with: @@ -37,7 +37,7 @@ jobs: conda-cpp-tests: needs: conda-cpp-build secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-tests.yaml@test-cuda-11.4 + uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-tests.yaml@branch-24.10 with: build_type: pull-request conda-python-build: @@ -49,30 +49,30 @@ jobs: conda-python-tests: needs: conda-python-build secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@test-cuda-11.4 + uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@branch-24.10 + with: + build_type: pull-request + docs-build: + needs: conda-python-build + secrets: inherit + uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.10 + with: + build_type: pull-request + arch: "amd64" + container_image: "rapidsai/ci-conda:latest" + run_script: "ci/build_docs.sh" + wheel-build-pylibwholegraph: + needs: checks + secrets: inherit + uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.10 + with: + build_type: pull-request + script: ci/build_wheel.sh + wheel-test-pylibwholegraph: + needs: wheel-build-pylibwholegraph + secrets: inherit + uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.10 with: build_type: pull-request - # docs-build: - # needs: conda-python-build - # secrets: inherit - # uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.10 - # with: - # build_type: pull-request - # arch: "amd64" - # container_image: "rapidsai/ci-conda:latest" - # run_script: "ci/build_docs.sh" - # wheel-build-pylibwholegraph: - # needs: checks - # secrets: inherit - # uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.10 - # with: - # build_type: pull-request - # script: ci/build_wheel.sh - # wheel-test-pylibwholegraph: - # needs: wheel-build-pylibwholegraph - # secrets: inherit - # uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.10 - # with: - # build_type: pull-request - # script: ci/test_wheel.sh - # matrix_filter: map(select(.ARCH == "amd64")) + script: ci/test_wheel.sh + matrix_filter: map(select(.ARCH == "amd64")) From ff12da56c549dadc18c2e1268dd7c9fa86eef985 Mon Sep 17 00:00:00 2001 From: James Lamb Date: Mon, 23 Sep 2024 08:12:04 -0500 Subject: [PATCH 5/5] relax pytorch pin --- conda/environments/all_cuda-118_arch-x86_64.yaml | 2 +- dependencies.yaml | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/conda/environments/all_cuda-118_arch-x86_64.yaml b/conda/environments/all_cuda-118_arch-x86_64.yaml index 27e2ee6fa..f20d98977 100644 --- a/conda/environments/all_cuda-118_arch-x86_64.yaml +++ b/conda/environments/all_cuda-118_arch-x86_64.yaml @@ -40,7 +40,7 @@ dependencies: - pytest-xdist - python>=3.10,<3.13 - pytorch-cuda=11.8 -- pytorch=2.0.0 +- pytorch>=2.0,<2.4.0a0 - rapids-build-backend>=0.3.0,<0.4.0.dev0 - recommonmark - scikit-build-core>=0.10.0 diff --git a/dependencies.yaml b/dependencies.yaml index 622e82a81..950e1979a 100644 --- a/dependencies.yaml +++ b/dependencies.yaml @@ -285,13 +285,13 @@ dependencies: # If conda-forge supports the new cuda-* packages for CUDA 11.8 # at some point, then we can fully support/properly specify # this environment. - - pytorch=2.0.0 + - &pytorch pytorch>=2.0,<2.4.0a0 - pytorch-cuda=11.8 - matrix: arch: aarch64 cuda: "11.8" packages: - - pytorch=2.0.0 + - *pytorch - pytorch-cuda=11.8 - matrix: packages: @@ -318,7 +318,7 @@ dependencies: common: - output_types: [conda] packages: - - pytorch=2.0.0 + - *pytorch - cpuonly clang_tools: common: