Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

{ai}[gfbf/2024a] jax v0.4.34, ml_dtypes v0.5.0 w/ CUDA 12.6.0 WIP #21924

Draft
wants to merge 27 commits into
base: develop
Choose a base branch
from

Conversation

ThomasHoffmann77
Copy link
Contributor

@ThomasHoffmann77 ThomasHoffmann77 commented Nov 28, 2024

(created using eb --new-pr)
requires:

TODO:

  • disable Bazel downloads
  • Use Bazel 7.4.1
  • Tests
  • Plugins: python -c "import jax; print(jax.devices())" does not list GPU devices, probably because of the missing jaxlib_cuda12_plugin -> use the additional build parameters --build_gpu_plugin, --gpu_plugin_cuda_version=12, --build_gpu_pjrt_plugin, and --build_gpu_kernel_plugin=cuda
  • fix: E jaxlib.xla_extension.XlaRuntimeError: INTERNAL: libdevice not found at ./libdevice.10.bc
    -> export XLA_FLAGS=--xla_gpu_cuda_data_dir=$CUDA_HOME

… jax-0.4.35_easyblock_compat.patch, jax-0.4.35_fix-pybind11-systemlib_cupti.patch
Copy link

github-actions bot commented Nov 28, 2024

Updated software Bazel-6.5.0-GCCcore-13.3.0.eb

Diff against Bazel-7.4.1-GCCcore-13.3.0.eb

easybuild/easyconfigs/b/Bazel/Bazel-7.4.1-GCCcore-13.3.0.eb

diff --git a/easybuild/easyconfigs/b/Bazel/Bazel-7.4.1-GCCcore-13.3.0.eb b/easybuild/easyconfigs/b/Bazel/Bazel-6.5.0-GCCcore-13.3.0.eb
index 1fc72e512e..c4c77a8a7d 100644
--- a/easybuild/easyconfigs/b/Bazel/Bazel-7.4.1-GCCcore-13.3.0.eb
+++ b/easybuild/easyconfigs/b/Bazel/Bazel-6.5.0-GCCcore-13.3.0.eb
@@ -1,5 +1,5 @@
 name = 'Bazel'
-version = '7.4.1'
+version = '6.5.0'
 
 homepage = 'https://bazel.io/'
 description = """Bazel is a build tool that builds code quickly and reliably.
@@ -9,15 +9,20 @@ toolchain = {'name': 'GCCcore', 'version': '13.3.0'}
 
 source_urls = ['https://github.com/bazelbuild/%(namelower)s/releases/download/%(version)s']
 sources = ['%(namelower)s-%(version)s-dist.zip']
-checksums = ['83386618bc489f4da36266ef2620ec64a526c686cf07041332caff7c953afaf5']
+patches = ['Bazel-6.5.0_py3.12_pytest_assertEqual.patch']
+checksums = [
+    {'bazel-6.5.0-dist.zip': 'fc89da919415289f29e4ff18a5e01270ece9a6fe83cb60967218bac4a3bb3ed2'},
+    {'Bazel-6.5.0_py3.12_pytest_assertEqual.patch': '2670dd5c393970ba20db2c98cf0208df7190ff339ccb66fee9a6d48aaaf3ede6'},
+]
 
 builddependencies = [
     ('binutils', '2.42'),
     ('Python', '3.12.3'),
     ('Zip', '3.0'),
 ]
+
 dependencies = [
-    ('Java', '21.0.2', '', SYSTEM),
+    ('Java', '11.0.20', '', SYSTEM),
 ]
 
 runtest = True
Diff against Bazel-6.1.0-GCCcore-12.3.0.eb

easybuild/easyconfigs/b/Bazel/Bazel-6.1.0-GCCcore-12.3.0.eb

diff --git a/easybuild/easyconfigs/b/Bazel/Bazel-6.1.0-GCCcore-12.3.0.eb b/easybuild/easyconfigs/b/Bazel/Bazel-6.5.0-GCCcore-13.3.0.eb
index 1bacc7b936..c4c77a8a7d 100644
--- a/easybuild/easyconfigs/b/Bazel/Bazel-6.1.0-GCCcore-12.3.0.eb
+++ b/easybuild/easyconfigs/b/Bazel/Bazel-6.5.0-GCCcore-13.3.0.eb
@@ -1,27 +1,29 @@
 name = 'Bazel'
-version = '6.1.0'
+version = '6.5.0'
 
 homepage = 'https://bazel.io/'
 description = """Bazel is a build tool that builds code quickly and reliably.
 It is used to build the majority of Google's software."""
 
-toolchain = {'name': 'GCCcore', 'version': '12.3.0'}
+toolchain = {'name': 'GCCcore', 'version': '13.3.0'}
 
 source_urls = ['https://github.com/bazelbuild/%(namelower)s/releases/download/%(version)s']
 sources = ['%(namelower)s-%(version)s-dist.zip']
-patches = ['Bazel-6.3.1_add-symlinks-in-runfiles.patch']
+patches = ['Bazel-6.5.0_py3.12_pytest_assertEqual.patch']
 checksums = [
-    {'bazel-6.1.0-dist.zip': 'c4b85675541cf66ee7cb71514097fdd6c5fc0e02527243617a4f20ca6b4f2932'},
-    {'Bazel-6.3.1_add-symlinks-in-runfiles.patch': '81db53aa87229557480b6f719c99a0f1af9c69dfec12185451e520b0128c3ae2'},
+    {'bazel-6.5.0-dist.zip': 'fc89da919415289f29e4ff18a5e01270ece9a6fe83cb60967218bac4a3bb3ed2'},
+    {'Bazel-6.5.0_py3.12_pytest_assertEqual.patch': '2670dd5c393970ba20db2c98cf0208df7190ff339ccb66fee9a6d48aaaf3ede6'},
 ]
 
 builddependencies = [
-    ('binutils', '2.40'),
-    ('Python', '3.11.3'),
+    ('binutils', '2.42'),
+    ('Python', '3.12.3'),
     ('Zip', '3.0'),
 ]
 
-dependencies = [('Java', '11', '', SYSTEM)]
+dependencies = [
+    ('Java', '11.0.20', '', SYSTEM),
+]
 
 runtest = True
 testopts = "-- //examples/cpp:hello-success_test //examples/py/... //examples/py_native:test //examples/shell/..."
Diff against Bazel-6.3.1-GCCcore-12.2.0.eb

easybuild/easyconfigs/b/Bazel/Bazel-6.3.1-GCCcore-12.2.0.eb

diff --git a/easybuild/easyconfigs/b/Bazel/Bazel-6.3.1-GCCcore-12.2.0.eb b/easybuild/easyconfigs/b/Bazel/Bazel-6.5.0-GCCcore-13.3.0.eb
index 8c284f50a4..c4c77a8a7d 100644
--- a/easybuild/easyconfigs/b/Bazel/Bazel-6.3.1-GCCcore-12.2.0.eb
+++ b/easybuild/easyconfigs/b/Bazel/Bazel-6.5.0-GCCcore-13.3.0.eb
@@ -1,27 +1,29 @@
 name = 'Bazel'
-version = '6.3.1'
+version = '6.5.0'
 
 homepage = 'https://bazel.io/'
 description = """Bazel is a build tool that builds code quickly and reliably.
 It is used to build the majority of Google's software."""
 
-toolchain = {'name': 'GCCcore', 'version': '12.2.0'}
+toolchain = {'name': 'GCCcore', 'version': '13.3.0'}
 
 source_urls = ['https://github.com/bazelbuild/%(namelower)s/releases/download/%(version)s']
 sources = ['%(namelower)s-%(version)s-dist.zip']
-patches = ['Bazel-6.3.1_add-symlinks-in-runfiles.patch']
+patches = ['Bazel-6.5.0_py3.12_pytest_assertEqual.patch']
 checksums = [
-    {'bazel-6.3.1-dist.zip': '2676319e86c5aeab142dccd42434364a33aa330a091c13562b7de87a10e68775'},
-    {'Bazel-6.3.1_add-symlinks-in-runfiles.patch': '81db53aa87229557480b6f719c99a0f1af9c69dfec12185451e520b0128c3ae2'},
+    {'bazel-6.5.0-dist.zip': 'fc89da919415289f29e4ff18a5e01270ece9a6fe83cb60967218bac4a3bb3ed2'},
+    {'Bazel-6.5.0_py3.12_pytest_assertEqual.patch': '2670dd5c393970ba20db2c98cf0208df7190ff339ccb66fee9a6d48aaaf3ede6'},
 ]
 
 builddependencies = [
-    ('binutils', '2.39'),
-    ('Python', '3.10.8'),
+    ('binutils', '2.42'),
+    ('Python', '3.12.3'),
     ('Zip', '3.0'),
 ]
 
-dependencies = [('Java', '11', '', SYSTEM)]
+dependencies = [
+    ('Java', '11.0.20', '', SYSTEM),
+]
 
 runtest = True
 testopts = "-- //examples/cpp:hello-success_test //examples/py/... //examples/py_native:test //examples/shell/..."

Updated software jax-0.4.34-gfbf-2024a-CUDA-12.6.0.eb

Diff against jax-0.4.25-gfbf-2023a.eb

easybuild/easyconfigs/j/jax/jax-0.4.25-gfbf-2023a.eb

diff --git a/easybuild/easyconfigs/j/jax/jax-0.4.25-gfbf-2023a.eb b/easybuild/easyconfigs/j/jax/jax-0.4.34-gfbf-2024a-CUDA-12.6.0.eb
index a4a86382ad..ba56113deb 100644
--- a/easybuild/easyconfigs/j/jax/jax-0.4.25-gfbf-2023a.eb
+++ b/easybuild/easyconfigs/j/jax/jax-0.4.34-gfbf-2024a-CUDA-12.6.0.eb
@@ -6,54 +6,83 @@
 easyblock = 'PythonBundle'
 
 name = 'jax'
-version = '0.4.25'
+version = '0.4.34'
+versionsuffix = '-CUDA-%(cudaver)s'
 
 homepage = 'https://jax.readthedocs.io/'
 description = """Composable transformations of Python+NumPy programs:
 differentiate, vectorize, JIT to GPU/TPU, and more"""
 
-toolchain = {'name': 'gfbf', 'version': '2023a'}
+toolchain = {'name': 'gfbf', 'version': '2024a'}
+cuda_compute_capabilities = ["5.0", "6.0", "6.1", "7.0", "7.5", "8.0", "8.6", "9.0"]
 
 builddependencies = [
-    ('Bazel', '6.3.1'),
-    ('pytest-xdist', '3.3.1'),
-    ('git', '2.41.0', '-nodocs'),  # bazel uses git to fetch repositories
-    ('matplotlib', '3.7.2'),  # required for tests/lobpcg_test.py
-    ('poetry', '1.5.1'),
-    ('pybind11', '2.11.1'),
+    # ('Bazel', '7.4.1'),  TODO: problems with @@local_config_python//:py3_runtime:
+    # Error in fail: interpreter_path must be an absolute path
+    # Bazel 6.5.0 (download) works.
+    ('pybind11', '2.13.6'),
+    ('pytest-xdist', '3.6.1'),
+    ('git', '2.45.1'),  # bazel uses git to fetch repositories
+    ('matplotlib', '3.9.2'),  # required for tests/lobpcg_test.py
+    ('poetry', '1.8.3'),
+    ('Clang', '18.1.8', versionsuffix)
 ]
 
 dependencies = [
-    ('Python', '3.11.3'),
-    ('SciPy-bundle', '2023.07'),
+    ('CUDA', '12.6.0', '', SYSTEM),  # 12.6.2 ?
+    ('cuDNN', '9.5.0.50', versionsuffix, SYSTEM),
+    ('NCCL', '2.22.3', versionsuffix),
+    ('Python', '3.12.3'),
+    ('SciPy-bundle', '2024.05'),  # 2024.11 ?
     ('absl-py', '2.1.0'),
-    ('flatbuffers-python', '23.5.26'),
-    ('ml_dtypes', '0.3.2'),
-    ('zlib', '1.2.13'),
+    ('flatbuffers-python', '24.3.25'),
+    ('ml_dtypes', '0.5.0'),
+    ('zlib', '1.3.1'),
 ]
 
 # downloading xla and other tarballs to avoid that Bazel downloads it during the build
 local_extract_cmd = 'mkdir -p %(builddir)s/archives && cp %s %(builddir)s/archives'
 # note: following commits *must* be the exact same onces used upstream
 # XLA_COMMIT from jax-jaxlib: third_party/xla/workspace.bzl
-local_xla_commit = '4ccfe33c71665ddcbca5b127fefe8baa3ed632d4'
+local_xla_commit = 'cd6e808c59f53b40a99df1f1b860db9a3e598bff'
 # TFRT_COMMIT from xla: third_party/tsl/third_party/tf_runtime/workspace.bzl
-local_tfrt_commit = '0aeefb1660d7e37964b2bb71b1f518096bda9a25'
+local_tfrt_commit = '0aeefb1660d7e37964b2bb71b1f518096bda9a25'  # TODO: still required?
+# TODO: add other downloads
 
 # Use sources downloaded by EasyBuild
 _jaxlib_buildopts = '--bazel_options="--distdir=%(builddir)s/archives" '
 # Use dependencies from EasyBuild
 _jaxlib_buildopts += '--bazel_options="--action_env=TF_SYSTEM_LIBS=pybind11" '
-_jaxlib_buildopts += '--bazel_options="--action_env=CPATH=$EBROOTPYBIND11/include" '
+_jaxlib_buildopts += '--bazel_options="--action_env=CPATH=$EBROOTPYBIND11/include:$EBROOTCUDA/extras/CUPTI/include" '
 # Avoid warning (treated as error) in upb/table.c
-_jaxlib_buildopts += '--bazel_options="--copt=-Wno-maybe-uninitialized" '
+_jaxlib_buildopts += '--bazel_options="--copt=-Wno-maybe-uninitialized" '  # TODO: still required?
+# _jaxlib_buildopts += '--nouse_clang '  #TODO: avoid clang (?)
+_jaxlib_buildopts += '--cuda_version=%(cudaver)s '
+_jaxlib_buildopts += '--python_bin_path=$EBROOTPYTHON/bin/python3 '
+# Do not use hermetic CUDA/cuDNN/NCCL (requires action_env=CPATH=$EBROOTCUDA/extras/CUPTI/include
+# and a patch of external/xla/xla/tsl/cuda/cupti_stub.cc and jaxlib/gpu/vendor.h (#include <cupti.h>)):
+_jaxlib_buildopts += """--bazel_options=--repo_env=LOCAL_CUDNN_PATH="$EBROOTCUDNN" """
+_jaxlib_buildopts += """--bazel_options=--repo_env=LOCAL_NCCL_PATH="$EBROOTNCCL" """
+_jaxlib_buildopts += """--bazel_options=--repo_env=LOCAL_CUDA_PATH="$EBROOTCUDA" """
+_jaxlib_buildopts += """--bazel_options="--copt=-Ithird_party/gpus/cuda/extras/CUPTI/include" """
+
+_plugins_buildopts = """--enable_cuda """
+_plugins_buildopts += """--build_gpu_plugin """
+# _plugins_buildopts +="""--gpu_plugin_cuda_version=12 """
+_plugins_buildopts += """--build_gpu_pjrt_plugin """
+_plugins_buildopts += """--build_gpu_kernel_plugin=cuda """
+
+# get rid of .devDate versionsuffix:  TODO: find a better way
+# _no_devtag = """ export JAX_RELEASE && export JAXLIB_RELEASE && """  does not work (?)
+_no_devtag = """ sed -i "s/version=__version__/version='%(version)s'/g" setup.py && """
+_jaxlib_buildopts += """--bazel_options="--action_env=JAXLIB_RELEASE=1" """  # required?
 
 components = [
     ('jaxlib', version, {
         'sources': [
             {
                 'source_urls': ['https://github.com/google/jax/archive/'],
-                'filename': '%(name)s-v%(version)s.tar.gz',
+                'filename': 'jax-v%(version)s.tar.gz',
             },
             {
                 'source_urls': ['https://github.com/openxla/xla/archive'],
@@ -68,42 +97,133 @@ components = [
                 'extract_cmd': local_extract_cmd,
             },
         ],
-        'patches': ['jax-0.4.25_fix-pybind11-systemlib.patch'],
+        'patches': [
+            'jax-0.4.35_easyblock_compat.patch',
+            'jax-0.4.35_fix-pybind11-systemlib_cupti.patch',
+            'jax-0.4.35_version.patch',
+        ],
         'checksums': [
-            {'jaxlib-v0.4.25.tar.gz':
-             'fc1197c401924942eb14185a61688d0c476e3e81ff71f9dc95e620b57c06eec8'},
-            {'xla-4ccfe33c.tar.gz':
-             '8a59b9af7d0850059d7043f7043c780066d61538f3af536e8a10d3d717f35089'},
+            {'jax-v0.4.34.tar.gz':
+             'd3a75ad667772309ade81350fa70c4a78028a920028800282e46d8383c0ee6bb'},
+            {'xla-cd6e808c.tar.gz':
+             '65cb6d63ef4083b35775052636cb9c629f86db6947c8b91711923ba31dbdcde8'},
             {'tf_runtime-0aeefb16.tar.gz':
              'a3df827d7896774cb1d80bf4e1c79ab05c268f29bd4d3db1fb5a4b9c2079d8e3'},
-            {'jax-0.4.25_fix-pybind11-systemlib.patch':
-             'daad5b726d1a138431b05eb60ecf4c89c7b5148eb939721800bdf43d804ca033'},
+            {'jax-0.4.35_easyblock_compat.patch':
+             'cbf4ad92b8438c4ce2a975efce1c47c57d4c3b117bceee071ab660f964057223'},
+            {'jax-0.4.35_fix-pybind11-systemlib_cupti.patch':
+             '51369589193be60dc94ec2de1b35d0a9268288578903fb05d41b6d1a8c9df460'},
+            {'jax-0.4.35_version.patch':
+             'cd2139a7802abf14b4b2cecee331aed80fff2ef91e16fa105093aea0795455e8'},
         ],
-        'start_dir': 'jax-jaxlib-v%(version)s',
-        'buildopts': _jaxlib_buildopts
+        'start_dir': 'jax-jax-v%(version)s',
+        'buildopts': _jaxlib_buildopts,
+        'prebuildopts': ' mkdir third_party/gpus/cuda/extras/ -p && ' +
+                        'ln -s $EBROOTCUDA/extras/CUPTI third_party/gpus/cuda/extras --relative &&' +
+                        _no_devtag
     }),
-]
-
-use_pip = True
-
-exts_list = [
-    (name, version, {
+    # build jaxlib first and then the plugins in a 2nd iteration:
+    ('jaxlib', version, {
         'sources': [
             {
                 'source_urls': ['https://github.com/google/jax/archive/'],
-                'filename': '%(name)s-v%(version)s.tar.gz',
+                'filename': 'jax-v%(version)s.tar.gz',
+            },
+            {
+                'source_urls': ['https://github.com/openxla/xla/archive'],
+                'download_filename': '%s.tar.gz' % local_xla_commit,
+                'filename': 'xla-%s.tar.gz' % local_xla_commit[:8],
+                'extract_cmd': local_extract_cmd,
+            },
+            {
+                'source_urls': ['https://github.com/tensorflow/runtime/archive'],
+                'download_filename': '%s.tar.gz' % local_tfrt_commit,
+                'filename': 'tf_runtime-%s.tar.gz' % local_tfrt_commit[:8],
+                'extract_cmd': local_extract_cmd,
             },
         ],
-        'patches': ['jax-0.4.25_fix_env_test_no_log_spam.patch'],
         'checksums': [
-            {'jax-v0.4.25.tar.gz': '8b30af49688c0c13b82c6f5ce992727c00b5fc6d04a4c6962012f4246fa664eb'},
-            {'jax-0.4.25_fix_env_test_no_log_spam.patch':
-             'a18b5f147569d9ad41025124333a0f04fd0d0e0f9e4309658d7f6b9b838e2e2a'},
+            {'jax-v0.4.34.tar.gz':
+             'd3a75ad667772309ade81350fa70c4a78028a920028800282e46d8383c0ee6bb'},
+            {'xla-cd6e808c.tar.gz':
+             '65cb6d63ef4083b35775052636cb9c629f86db6947c8b91711923ba31dbdcde8'},
+            {'tf_runtime-0aeefb16.tar.gz':
+             'a3df827d7896774cb1d80bf4e1c79ab05c268f29bd4d3db1fb5a4b9c2079d8e3'},
         ],
-        'runtest': "pytest -n %(parallel)s tests",
+        'start_dir': 'jax-jax-v%(version)s',
+        'buildopts': _jaxlib_buildopts + _plugins_buildopts,
+        'prebuildopts': _no_devtag
     }),
+
+]
+# failing:
+# tests/lax_test.py::FunctionAccuracyTest::testSuccessOnComplexPlane_expm1_complex128 FAILED [ 98%]
+# tests/lax_test.py::FunctionAccuracyTest::testSuccessOnComplexPlane_expm1_complex64 FAILED [ 98%]
+# tests/lax_test.py::FunctionAccuracyTest::testSuccessOnComplexPlane_tan_complex128 FAILED [ 99%]
+# tests/lax_test.py::FunctionAccuracyTest::testSuccessOnComplexPlane_tan_complex64 FAILED [ 99%]
+# FAILED tests/lax_test.py::FunctionAccuracyTest::testSuccessOnComplexPlane_expm1_complex128 - AssertionError: 
+# FAILED tests/lax_test.py::FunctionAccuracyTest::testSuccessOnComplexPlane_expm1_complex64 - AssertionError: 
+# FAILED tests/lax_test.py::FunctionAccuracyTest::testSuccessOnComplexPlane_tan_complex128 - AssertionError: 
+# FAILED tests/lax_test.py::FunctionAccuracyTest::testSuccessOnComplexPlane_tan_complex64 - AssertionError: 
+# tests/nn_test.py::NNFunctionsTest::testDotProductAttentionMask7 FAILED   [ 10%]
+# FAILED tests/nn_test.py::NNFunctionsTest::testDotProductAttentionMask7 - AssertionError: 
+# 
+
+# Some tests require an isolated run:  TODO: still required?
+local_isolated_tests = [
+    'tests/host_callback_test.py::HostCallbackTapTest::test_tap_scan_custom_jvp',
+    'tests/host_callback_test.py::HostCallbackTapTest::test_tap_transforms_doc',
+    'tests/lax_scipy_special_functions_test.py::LaxScipySpcialFunctionsTest' +
+    '::testScipySpecialFun_gammainc_s_2x1x4_float32_float32',
+]
+# deliberately not testing in parallel, as that results in (additional) failing tests;
+# use XLA_PYTHON_CLIENT_ALLOCATOR=platform to allocate and deallocate GPU memory during testing,
+# see https://github.com/google/jax/issues/7323 and
+# https://github.com/google/jax/blob/main/docs/gpu_memory_allocation.rst;
+# use CUDA_VISIBLE_DEVICES=0 to avoid failing tests on systems with multiple GPUs;
+# use NVIDIA_TF32_OVERRIDE=0 to avoid losing numerical precision by disabling TF32 Tensor Cores;
+local_test_exports = [
+    "NVIDIA_TF32_OVERRIDE=0",
+    "CUDA_VISIBLE_DEVICES=0",
+    "XLA_PYTHON_CLIENT_ALLOCATOR=platform",
+    "JAX_ENABLE_X64=true",
 ]
+local_test = ''.join(['export %s;' % x for x in local_test_exports])
+# run all tests at once except for local_isolated_tests:
+local_test += "pytest -vv tests %s && " % ' '.join(['--deselect %s' % x for x in local_isolated_tests])
+# run remaining local_isolated_tests separately: 
+local_test += ' && '.join(['pytest -vv %s' % x for x in local_isolated_tests])
 
+use_pip = True
+
+exts_list = [
+    (name, version, {
+        'patches': ['jax-0.4.35_version.patch'],
+        'preinstallopts': _no_devtag,
+        'runtest': False,
+        'source_tmpl': '%(name)s-v%(version)s.tar.gz',
+        'source_urls': ['https://github.com/google/jax/archive/'],
+        'checksums': [
+            {'jax-v0.4.34.tar.gz': 'd3a75ad667772309ade81350fa70c4a78028a920028800282e46d8383c0ee6bb'},
+            {'jax-0.4.35_version.patch': 'cd2139a7802abf14b4b2cecee331aed80fff2ef91e16fa105093aea0795455e8'},
+        ],
+    }),
+]
+sanity_check_commands = [
+    """python -c "import jax_cuda"$(echo $EBVERSIONCUDA|awk -F '.' '{print $1}')"_plugin" """
+]
 sanity_pip_check = True
 
+# TODO: patch to set default XLA_FLAGS
+modluafooter = """
+setenv("XLA_FLAGS", "--xla_gpu_cuda_data_dir=" .. os.getenv("CUDA_HOME"));
+"""
+
+modtclfooter = """
+setenv XLA_FLAGS --xla_gpu_cuda_data_dir=$::env(CUDA_HOME)
+"""
+
+# TODO: sanity check paths
+
+
 moduleclass = 'ai'
Diff against jax-0.4.25-gfbf-2023a-CUDA-12.1.1.eb

easybuild/easyconfigs/j/jax/jax-0.4.25-gfbf-2023a-CUDA-12.1.1.eb

diff --git a/easybuild/easyconfigs/j/jax/jax-0.4.25-gfbf-2023a-CUDA-12.1.1.eb b/easybuild/easyconfigs/j/jax/jax-0.4.34-gfbf-2024a-CUDA-12.6.0.eb
index 75355c6c84..ba56113deb 100644
--- a/easybuild/easyconfigs/j/jax/jax-0.4.25-gfbf-2023a-CUDA-12.1.1.eb
+++ b/easybuild/easyconfigs/j/jax/jax-0.4.34-gfbf-2024a-CUDA-12.6.0.eb
@@ -6,59 +6,83 @@
 easyblock = 'PythonBundle'
 
 name = 'jax'
-version = '0.4.25'
+version = '0.4.34'
 versionsuffix = '-CUDA-%(cudaver)s'
 
 homepage = 'https://jax.readthedocs.io/'
 description = """Composable transformations of Python+NumPy programs:
 differentiate, vectorize, JIT to GPU/TPU, and more"""
 
-toolchain = {'name': 'gfbf', 'version': '2023a'}
+toolchain = {'name': 'gfbf', 'version': '2024a'}
 cuda_compute_capabilities = ["5.0", "6.0", "6.1", "7.0", "7.5", "8.0", "8.6", "9.0"]
 
 builddependencies = [
-    ('Bazel', '6.3.1'),
-    ('pytest-xdist', '3.3.1'),
-    ('git', '2.41.0', '-nodocs'),  # bazel uses git to fetch repositories
-    ('matplotlib', '3.7.2'),  # required for tests/lobpcg_test.py
-    ('poetry', '1.5.1'),
-    ('pybind11', '2.11.1'),
+    # ('Bazel', '7.4.1'),  TODO: problems with @@local_config_python//:py3_runtime:
+    # Error in fail: interpreter_path must be an absolute path
+    # Bazel 6.5.0 (download) works.
+    ('pybind11', '2.13.6'),
+    ('pytest-xdist', '3.6.1'),
+    ('git', '2.45.1'),  # bazel uses git to fetch repositories
+    ('matplotlib', '3.9.2'),  # required for tests/lobpcg_test.py
+    ('poetry', '1.8.3'),
+    ('Clang', '18.1.8', versionsuffix)
 ]
 
 dependencies = [
-    ('CUDA', '12.1.1', '', SYSTEM),
-    ('cuDNN', '8.9.2.26', versionsuffix, SYSTEM),
-    ('NCCL', '2.18.3', versionsuffix),
-    ('Python', '3.11.3'),
-    ('SciPy-bundle', '2023.07'),
+    ('CUDA', '12.6.0', '', SYSTEM),  # 12.6.2 ?
+    ('cuDNN', '9.5.0.50', versionsuffix, SYSTEM),
+    ('NCCL', '2.22.3', versionsuffix),
+    ('Python', '3.12.3'),
+    ('SciPy-bundle', '2024.05'),  # 2024.11 ?
     ('absl-py', '2.1.0'),
-    ('flatbuffers-python', '23.5.26'),
-    ('ml_dtypes', '0.3.2'),
-    ('zlib', '1.2.13'),
+    ('flatbuffers-python', '24.3.25'),
+    ('ml_dtypes', '0.5.0'),
+    ('zlib', '1.3.1'),
 ]
 
 # downloading xla and other tarballs to avoid that Bazel downloads it during the build
 local_extract_cmd = 'mkdir -p %(builddir)s/archives && cp %s %(builddir)s/archives'
 # note: following commits *must* be the exact same onces used upstream
 # XLA_COMMIT from jax-jaxlib: third_party/xla/workspace.bzl
-local_xla_commit = '4ccfe33c71665ddcbca5b127fefe8baa3ed632d4'
+local_xla_commit = 'cd6e808c59f53b40a99df1f1b860db9a3e598bff'
 # TFRT_COMMIT from xla: third_party/tsl/third_party/tf_runtime/workspace.bzl
-local_tfrt_commit = '0aeefb1660d7e37964b2bb71b1f518096bda9a25'
+local_tfrt_commit = '0aeefb1660d7e37964b2bb71b1f518096bda9a25'  # TODO: still required?
+# TODO: add other downloads
 
 # Use sources downloaded by EasyBuild
 _jaxlib_buildopts = '--bazel_options="--distdir=%(builddir)s/archives" '
 # Use dependencies from EasyBuild
 _jaxlib_buildopts += '--bazel_options="--action_env=TF_SYSTEM_LIBS=pybind11" '
-_jaxlib_buildopts += '--bazel_options="--action_env=CPATH=$EBROOTPYBIND11/include" '
+_jaxlib_buildopts += '--bazel_options="--action_env=CPATH=$EBROOTPYBIND11/include:$EBROOTCUDA/extras/CUPTI/include" '
 # Avoid warning (treated as error) in upb/table.c
-_jaxlib_buildopts += '--bazel_options="--copt=-Wno-maybe-uninitialized" '
+_jaxlib_buildopts += '--bazel_options="--copt=-Wno-maybe-uninitialized" '  # TODO: still required?
+# _jaxlib_buildopts += '--nouse_clang '  #TODO: avoid clang (?)
+_jaxlib_buildopts += '--cuda_version=%(cudaver)s '
+_jaxlib_buildopts += '--python_bin_path=$EBROOTPYTHON/bin/python3 '
+# Do not use hermetic CUDA/cuDNN/NCCL (requires action_env=CPATH=$EBROOTCUDA/extras/CUPTI/include
+# and a patch of external/xla/xla/tsl/cuda/cupti_stub.cc and jaxlib/gpu/vendor.h (#include <cupti.h>)):
+_jaxlib_buildopts += """--bazel_options=--repo_env=LOCAL_CUDNN_PATH="$EBROOTCUDNN" """
+_jaxlib_buildopts += """--bazel_options=--repo_env=LOCAL_NCCL_PATH="$EBROOTNCCL" """
+_jaxlib_buildopts += """--bazel_options=--repo_env=LOCAL_CUDA_PATH="$EBROOTCUDA" """
+_jaxlib_buildopts += """--bazel_options="--copt=-Ithird_party/gpus/cuda/extras/CUPTI/include" """
+
+_plugins_buildopts = """--enable_cuda """
+_plugins_buildopts += """--build_gpu_plugin """
+# _plugins_buildopts +="""--gpu_plugin_cuda_version=12 """
+_plugins_buildopts += """--build_gpu_pjrt_plugin """
+_plugins_buildopts += """--build_gpu_kernel_plugin=cuda """
+
+# get rid of .devDate versionsuffix:  TODO: find a better way
+# _no_devtag = """ export JAX_RELEASE && export JAXLIB_RELEASE && """  does not work (?)
+_no_devtag = """ sed -i "s/version=__version__/version='%(version)s'/g" setup.py && """
+_jaxlib_buildopts += """--bazel_options="--action_env=JAXLIB_RELEASE=1" """  # required?
 
 components = [
     ('jaxlib', version, {
         'sources': [
             {
                 'source_urls': ['https://github.com/google/jax/archive/'],
-                'filename': '%(name)s-v%(version)s.tar.gz',
+                'filename': 'jax-v%(version)s.tar.gz',
             },
             {
                 'source_urls': ['https://github.com/openxla/xla/archive'],
@@ -73,23 +97,79 @@ components = [
                 'extract_cmd': local_extract_cmd,
             },
         ],
-        'patches': ['jax-0.4.25_fix-pybind11-systemlib.patch'],
+        'patches': [
+            'jax-0.4.35_easyblock_compat.patch',
+            'jax-0.4.35_fix-pybind11-systemlib_cupti.patch',
+            'jax-0.4.35_version.patch',
+        ],
         'checksums': [
-            {'jaxlib-v0.4.25.tar.gz':
-             'fc1197c401924942eb14185a61688d0c476e3e81ff71f9dc95e620b57c06eec8'},
-            {'xla-4ccfe33c.tar.gz':
-             '8a59b9af7d0850059d7043f7043c780066d61538f3af536e8a10d3d717f35089'},
+            {'jax-v0.4.34.tar.gz':
+             'd3a75ad667772309ade81350fa70c4a78028a920028800282e46d8383c0ee6bb'},
+            {'xla-cd6e808c.tar.gz':
+             '65cb6d63ef4083b35775052636cb9c629f86db6947c8b91711923ba31dbdcde8'},
             {'tf_runtime-0aeefb16.tar.gz':
              'a3df827d7896774cb1d80bf4e1c79ab05c268f29bd4d3db1fb5a4b9c2079d8e3'},
-            {'jax-0.4.25_fix-pybind11-systemlib.patch':
-             'daad5b726d1a138431b05eb60ecf4c89c7b5148eb939721800bdf43d804ca033'},
+            {'jax-0.4.35_easyblock_compat.patch':
+             'cbf4ad92b8438c4ce2a975efce1c47c57d4c3b117bceee071ab660f964057223'},
+            {'jax-0.4.35_fix-pybind11-systemlib_cupti.patch':
+             '51369589193be60dc94ec2de1b35d0a9268288578903fb05d41b6d1a8c9df460'},
+            {'jax-0.4.35_version.patch':
+             'cd2139a7802abf14b4b2cecee331aed80fff2ef91e16fa105093aea0795455e8'},
         ],
-        'start_dir': 'jax-jaxlib-v%(version)s',
-        'buildopts': _jaxlib_buildopts
+        'start_dir': 'jax-jax-v%(version)s',
+        'buildopts': _jaxlib_buildopts,
+        'prebuildopts': ' mkdir third_party/gpus/cuda/extras/ -p && ' +
+                        'ln -s $EBROOTCUDA/extras/CUPTI third_party/gpus/cuda/extras --relative &&' +
+                        _no_devtag
     }),
+    # build jaxlib first and then the plugins in a 2nd iteration:
+    ('jaxlib', version, {
+        'sources': [
+            {
+                'source_urls': ['https://github.com/google/jax/archive/'],
+                'filename': 'jax-v%(version)s.tar.gz',
+            },
+            {
+                'source_urls': ['https://github.com/openxla/xla/archive'],
+                'download_filename': '%s.tar.gz' % local_xla_commit,
+                'filename': 'xla-%s.tar.gz' % local_xla_commit[:8],
+                'extract_cmd': local_extract_cmd,
+            },
+            {
+                'source_urls': ['https://github.com/tensorflow/runtime/archive'],
+                'download_filename': '%s.tar.gz' % local_tfrt_commit,
+                'filename': 'tf_runtime-%s.tar.gz' % local_tfrt_commit[:8],
+                'extract_cmd': local_extract_cmd,
+            },
+        ],
+        'checksums': [
+            {'jax-v0.4.34.tar.gz':
+             'd3a75ad667772309ade81350fa70c4a78028a920028800282e46d8383c0ee6bb'},
+            {'xla-cd6e808c.tar.gz':
+             '65cb6d63ef4083b35775052636cb9c629f86db6947c8b91711923ba31dbdcde8'},
+            {'tf_runtime-0aeefb16.tar.gz':
+             'a3df827d7896774cb1d80bf4e1c79ab05c268f29bd4d3db1fb5a4b9c2079d8e3'},
+        ],
+        'start_dir': 'jax-jax-v%(version)s',
+        'buildopts': _jaxlib_buildopts + _plugins_buildopts,
+        'prebuildopts': _no_devtag
+    }),
+
 ]
+# failing:
+# tests/lax_test.py::FunctionAccuracyTest::testSuccessOnComplexPlane_expm1_complex128 FAILED [ 98%]
+# tests/lax_test.py::FunctionAccuracyTest::testSuccessOnComplexPlane_expm1_complex64 FAILED [ 98%]
+# tests/lax_test.py::FunctionAccuracyTest::testSuccessOnComplexPlane_tan_complex128 FAILED [ 99%]
+# tests/lax_test.py::FunctionAccuracyTest::testSuccessOnComplexPlane_tan_complex64 FAILED [ 99%]
+# FAILED tests/lax_test.py::FunctionAccuracyTest::testSuccessOnComplexPlane_expm1_complex128 - AssertionError: 
+# FAILED tests/lax_test.py::FunctionAccuracyTest::testSuccessOnComplexPlane_expm1_complex64 - AssertionError: 
+# FAILED tests/lax_test.py::FunctionAccuracyTest::testSuccessOnComplexPlane_tan_complex128 - AssertionError: 
+# FAILED tests/lax_test.py::FunctionAccuracyTest::testSuccessOnComplexPlane_tan_complex64 - AssertionError: 
+# tests/nn_test.py::NNFunctionsTest::testDotProductAttentionMask7 FAILED   [ 10%]
+# FAILED tests/nn_test.py::NNFunctionsTest::testDotProductAttentionMask7 - AssertionError: 
+# 
 
-# Some tests require an isolated run:
+# Some tests require an isolated run:  TODO: still required?
 local_isolated_tests = [
     'tests/host_callback_test.py::HostCallbackTapTest::test_tap_scan_custom_jvp',
     'tests/host_callback_test.py::HostCallbackTapTest::test_tap_transforms_doc',
@@ -118,18 +198,32 @@ use_pip = True
 
 exts_list = [
     (name, version, {
+        'patches': ['jax-0.4.35_version.patch'],
+        'preinstallopts': _no_devtag,
+        'runtest': False,
         'source_tmpl': '%(name)s-v%(version)s.tar.gz',
         'source_urls': ['https://github.com/google/jax/archive/'],
-        'patches': ['jax-0.4.25_fix_env_test_no_log_spam.patch'],
         'checksums': [
-            {'jax-v0.4.25.tar.gz': '8b30af49688c0c13b82c6f5ce992727c00b5fc6d04a4c6962012f4246fa664eb'},
-            {'jax-0.4.25_fix_env_test_no_log_spam.patch':
-             'a18b5f147569d9ad41025124333a0f04fd0d0e0f9e4309658d7f6b9b838e2e2a'},
+            {'jax-v0.4.34.tar.gz': 'd3a75ad667772309ade81350fa70c4a78028a920028800282e46d8383c0ee6bb'},
+            {'jax-0.4.35_version.patch': 'cd2139a7802abf14b4b2cecee331aed80fff2ef91e16fa105093aea0795455e8'},
         ],
-        'runtest': local_test,
     }),
 ]
-
+sanity_check_commands = [
+    """python -c "import jax_cuda"$(echo $EBVERSIONCUDA|awk -F '.' '{print $1}')"_plugin" """
+]
 sanity_pip_check = True
 
+# TODO: patch to set default XLA_FLAGS
+modluafooter = """
+setenv("XLA_FLAGS", "--xla_gpu_cuda_data_dir=" .. os.getenv("CUDA_HOME"));
+"""
+
+modtclfooter = """
+setenv XLA_FLAGS --xla_gpu_cuda_data_dir=$::env(CUDA_HOME)
+"""
+
+# TODO: sanity check paths
+
+
 moduleclass = 'ai'
Diff against jax-0.3.25-foss-2022a.eb

easybuild/easyconfigs/j/jax/jax-0.3.25-foss-2022a.eb

diff --git a/easybuild/easyconfigs/j/jax/jax-0.3.25-foss-2022a.eb b/easybuild/easyconfigs/j/jax/jax-0.4.34-gfbf-2024a-CUDA-12.6.0.eb
index fb821e58e1..ba56113deb 100644
--- a/easybuild/easyconfigs/j/jax/jax-0.3.25-foss-2022a.eb
+++ b/easybuild/easyconfigs/j/jax/jax-0.4.34-gfbf-2024a-CUDA-12.6.0.eb
@@ -1,114 +1,229 @@
 # This file is an EasyBuild reciPY as per https://github.com/easybuilders/easybuild
 # Author: Denis Kristak
 # Updated by: Alex Domingo (Vrije Universiteit Brussel)
+# Updated by: Pavel Tománek (INUITS)
+# Updated by: Thomas Hoffmann (EMBL Heidelberg)
 easyblock = 'PythonBundle'
 
 name = 'jax'
-version = '0.3.25'
+version = '0.4.34'
+versionsuffix = '-CUDA-%(cudaver)s'
 
-homepage = 'https://pypi.python.org/pypi/jax'
+homepage = 'https://jax.readthedocs.io/'
 description = """Composable transformations of Python+NumPy programs:
 differentiate, vectorize, JIT to GPU/TPU, and more"""
 
-toolchain = {'name': 'foss', 'version': '2022a'}
+toolchain = {'name': 'gfbf', 'version': '2024a'}
+cuda_compute_capabilities = ["5.0", "6.0", "6.1", "7.0", "7.5", "8.0", "8.6", "9.0"]
 
 builddependencies = [
-    ('Bazel', '5.1.1'),
-    ('pytest-xdist', '2.5.0'),
-    # git 2.x required to fetch repository 'io_bazel_rules_docker'
-    ('git', '2.36.0', '-nodocs'),
-    ('matplotlib', '3.5.2'),  # required for tests/lobpcg_test.py
+    # ('Bazel', '7.4.1'),  TODO: problems with @@local_config_python//:py3_runtime:
+    # Error in fail: interpreter_path must be an absolute path
+    # Bazel 6.5.0 (download) works.
+    ('pybind11', '2.13.6'),
+    ('pytest-xdist', '3.6.1'),
+    ('git', '2.45.1'),  # bazel uses git to fetch repositories
+    ('matplotlib', '3.9.2'),  # required for tests/lobpcg_test.py
+    ('poetry', '1.8.3'),
+    ('Clang', '18.1.8', versionsuffix)
 ]
 
 dependencies = [
-    ('Python', '3.10.4'),
-    ('SciPy-bundle', '2022.05'),
-    ('flatbuffers-python', '2.0'),
+    ('CUDA', '12.6.0', '', SYSTEM),  # 12.6.2 ?
+    ('cuDNN', '9.5.0.50', versionsuffix, SYSTEM),
+    ('NCCL', '2.22.3', versionsuffix),
+    ('Python', '3.12.3'),
+    ('SciPy-bundle', '2024.05'),  # 2024.11 ?
+    ('absl-py', '2.1.0'),
+    ('flatbuffers-python', '24.3.25'),
+    ('ml_dtypes', '0.5.0'),
+    ('zlib', '1.3.1'),
 ]
 
-# downloading TensorFlow tarball to avoid that Bazel downloads it during the build
-# note: this *must* be the exact same commit as used in WORKSPACE
-local_tf_commit = 'f0fe8d4c04fab1f157854a1aa3c136377901cdef'
-local_tf_dir = 'tensorflow-%s' % local_tf_commit
-local_tf_builddir = '%(builddir)s/' + local_tf_dir
+# download xla and other tarballs via EasyBuild to prevent Bazel from downloading them during the build
+local_extract_cmd = 'mkdir -p %(builddir)s/archives && cp %s %(builddir)s/archives'
+# note: the following commits *must* be exactly the same ones used upstream
+# XLA_COMMIT from jax-jaxlib: third_party/xla/workspace.bzl
+local_xla_commit = 'cd6e808c59f53b40a99df1f1b860db9a3e598bff'
+# TFRT_COMMIT from xla: third_party/tsl/third_party/tf_runtime/workspace.bzl
+local_tfrt_commit = '0aeefb1660d7e37964b2bb71b1f518096bda9a25'  # TODO: still required?
+# TODO: add other downloads
 
-# replace remote TensorFlow repository with the local one from EB
-local_jax_prebuildopts = "sed -i -f jaxlib_local-tensorflow-repo.sed WORKSPACE && "
-local_jax_prebuildopts += "sed -i 's|EB_TF_REPOPATH|%s|' WORKSPACE && " % local_tf_builddir
+# Use sources downloaded by EasyBuild
+_jaxlib_buildopts = '--bazel_options="--distdir=%(builddir)s/archives" '
+# Use dependencies from EasyBuild
+_jaxlib_buildopts += '--bazel_options="--action_env=TF_SYSTEM_LIBS=pybind11" '
+_jaxlib_buildopts += '--bazel_options="--action_env=CPATH=$EBROOTPYBIND11/include:$EBROOTCUDA/extras/CUPTI/include" '
+# Avoid warning (treated as error) in upb/table.c
+_jaxlib_buildopts += '--bazel_options="--copt=-Wno-maybe-uninitialized" '  # TODO: still required?
+# _jaxlib_buildopts += '--nouse_clang '  #TODO: avoid clang (?)
+_jaxlib_buildopts += '--cuda_version=%(cudaver)s '
+_jaxlib_buildopts += '--python_bin_path=$EBROOTPYTHON/bin/python3 '
+# Do not use hermetic CUDA/cuDNN/NCCL (requires action_env=CPATH=$EBROOTCUDA/extras/CUPTI/include;
+# requires patch of external/xla/xla/tsl/cuda/cupti_stub.cc and jaxlib/gpu/vendor.h (#include <cupti.h>)):
+_jaxlib_buildopts += """--bazel_options=--repo_env=LOCAL_CUDNN_PATH="$EBROOTCUDNN" """
+_jaxlib_buildopts += """--bazel_options=--repo_env=LOCAL_NCCL_PATH="$EBROOTNCCL" """
+_jaxlib_buildopts += """--bazel_options=--repo_env=LOCAL_CUDA_PATH="$EBROOTCUDA" """
+_jaxlib_buildopts += """--bazel_options="--copt=-Ithird_party/gpus/cuda/extras/CUPTI/include" """
 
-use_pip = True
+_plugins_buildopts = """--enable_cuda """
+_plugins_buildopts += """--build_gpu_plugin """
+# _plugins_buildopts +="""--gpu_plugin_cuda_version=12 """
+_plugins_buildopts += """--build_gpu_pjrt_plugin """
+_plugins_buildopts += """--build_gpu_kernel_plugin=cuda """
 
-default_easyblock = 'PythonPackage'
-default_component_specs = {
-    'sources': [SOURCE_TAR_GZ],
-    'source_urls': [PYPI_SOURCE],
-    'start_dir': '%(name)s-%(version)s',
-    'use_pip': True,
-    'sanity_pip_check': True,
-    'download_dep_fail': True,
-}
+# get rid of .devDate versionsuffix:  TODO: find a better way
+# _no_devtag = """ export JAX_RELEASE && export JAXLIB_RELEASE && """  does not work (?)
+_no_devtag = """ sed -i "s/version=__version__/version='%(version)s'/g" setup.py && """
+_jaxlib_buildopts += """--bazel_options="--action_env=JAXLIB_RELEASE=1" """  # required?
 
 components = [
-    ('absl-py', '1.3.0', {
-        'options': {'modulename': 'absl'},
-        'checksums': ['463c38a08d2e4cef6c498b76ba5bd4858e4c6ef51da1a5a1f27139a022e20248'],
-    }),
     ('jaxlib', version, {
         'sources': [
-            '%(name)s-v%(version)s.tar.gz',
             {
-                'download_filename': '%s.tar.gz' % local_tf_commit,
-                'filename': 'tensorflow-%s.tar.gz' % local_tf_commit,
-            }
-        ],
-        'source_urls': [
-            'https://github.com/google/jax/archive/',
-            'https://github.com/tensorflow/tensorflow/archive/'
+                'source_urls': ['https://github.com/google/jax/archive/'],
+                'filename': 'jax-v%(version)s.tar.gz',
+            },
+            {
+                'source_urls': ['https://github.com/openxla/xla/archive'],
+                'download_filename': '%s.tar.gz' % local_xla_commit,
+                'filename': 'xla-%s.tar.gz' % local_xla_commit[:8],
+                'extract_cmd': local_extract_cmd,
+            },
+            {
+                'source_urls': ['https://github.com/tensorflow/runtime/archive'],
+                'download_filename': '%s.tar.gz' % local_tfrt_commit,
+                'filename': 'tf_runtime-%s.tar.gz' % local_tfrt_commit[:8],
+                'extract_cmd': local_extract_cmd,
+            },
         ],
         'patches': [
-            ('jaxlib_local-tensorflow-repo.sed', '.'),
-            ('TensorFlow-2.7.0_cuda-noncanonical-include-paths.patch', '../' + local_tf_dir),
+            'jax-0.4.35_easyblock_compat.patch',
+            'jax-0.4.35_fix-pybind11-systemlib_cupti.patch',
+            'jax-0.4.35_version.patch',
         ],
         'checksums': [
-            # jaxlib-v0.3.25.tar.gz
-            '73ebc7868631cd9d520385557bbd7f08762d748a5a6a1bebef0f3b8d7ba748ef',
-            # tensorflow-f0fe8d4c04fab1f157854a1aa3c136377901cdef.tar.gz
-            '9ebba3031e8a81993682e4b9e43891ebb8480b6287e635df8e7efaa45ab5ede7',
-            # jaxlib_local-tensorflow-repo.sed
-            'abb5c3b97f4e317bce9f22ed3eeea3b9715365818d8b50720d937e2d41d5c4e5',
-            # TensorFlow-2.7.0_cuda-noncanonical-include-paths.patch
-            '0a759010c253d49755955cd5f028e75de4a4c447dcc8f5a0d9f47cce6881a9db',
+            {'jax-v0.4.34.tar.gz':
+             'd3a75ad667772309ade81350fa70c4a78028a920028800282e46d8383c0ee6bb'},
+            {'xla-cd6e808c.tar.gz':
+             '65cb6d63ef4083b35775052636cb9c629f86db6947c8b91711923ba31dbdcde8'},
+            {'tf_runtime-0aeefb16.tar.gz':
+             'a3df827d7896774cb1d80bf4e1c79ab05c268f29bd4d3db1fb5a4b9c2079d8e3'},
+            {'jax-0.4.35_easyblock_compat.patch':
+             'cbf4ad92b8438c4ce2a975efce1c47c57d4c3b117bceee071ab660f964057223'},
+            {'jax-0.4.35_fix-pybind11-systemlib_cupti.patch':
+             '51369589193be60dc94ec2de1b35d0a9268288578903fb05d41b6d1a8c9df460'},
+            {'jax-0.4.35_version.patch':
+             'cd2139a7802abf14b4b2cecee331aed80fff2ef91e16fa105093aea0795455e8'},
         ],
-        'start_dir': 'jax-jaxlib-v%(version)s',
-        'prebuildopts': local_jax_prebuildopts,
+        'start_dir': 'jax-jax-v%(version)s',
+        'buildopts': _jaxlib_buildopts,
+        'prebuildopts': ' mkdir third_party/gpus/cuda/extras/ -p && ' +
+                        'ln -s $EBROOTCUDA/extras/CUPTI third_party/gpus/cuda/extras --relative &&' +
+                        _no_devtag
     }),
+    # build jaxlib first and then the plugins in a 2nd iteration:
+    ('jaxlib', version, {
+        'sources': [
+            {
+                'source_urls': ['https://github.com/google/jax/archive/'],
+                'filename': 'jax-v%(version)s.tar.gz',
+            },
+            {
+                'source_urls': ['https://github.com/openxla/xla/archive'],
+                'download_filename': '%s.tar.gz' % local_xla_commit,
+                'filename': 'xla-%s.tar.gz' % local_xla_commit[:8],
+                'extract_cmd': local_extract_cmd,
+            },
+            {
+                'source_urls': ['https://github.com/tensorflow/runtime/archive'],
+                'download_filename': '%s.tar.gz' % local_tfrt_commit,
+                'filename': 'tf_runtime-%s.tar.gz' % local_tfrt_commit[:8],
+                'extract_cmd': local_extract_cmd,
+            },
+        ],
+        'checksums': [
+            {'jax-v0.4.34.tar.gz':
+             'd3a75ad667772309ade81350fa70c4a78028a920028800282e46d8383c0ee6bb'},
+            {'xla-cd6e808c.tar.gz':
+             '65cb6d63ef4083b35775052636cb9c629f86db6947c8b91711923ba31dbdcde8'},
+            {'tf_runtime-0aeefb16.tar.gz':
+             'a3df827d7896774cb1d80bf4e1c79ab05c268f29bd4d3db1fb5a4b9c2079d8e3'},
+        ],
+        'start_dir': 'jax-jax-v%(version)s',
+        'buildopts': _jaxlib_buildopts + _plugins_buildopts,
+        'prebuildopts': _no_devtag
+    }),
+
 ]
+# failing:
+# tests/lax_test.py::FunctionAccuracyTest::testSuccessOnComplexPlane_expm1_complex128 FAILED [ 98%]
+# tests/lax_test.py::FunctionAccuracyTest::testSuccessOnComplexPlane_expm1_complex64 FAILED [ 98%]
+# tests/lax_test.py::FunctionAccuracyTest::testSuccessOnComplexPlane_tan_complex128 FAILED [ 99%]
+# tests/lax_test.py::FunctionAccuracyTest::testSuccessOnComplexPlane_tan_complex64 FAILED [ 99%]
+# FAILED tests/lax_test.py::FunctionAccuracyTest::testSuccessOnComplexPlane_expm1_complex128 - AssertionError: 
+# FAILED tests/lax_test.py::FunctionAccuracyTest::testSuccessOnComplexPlane_expm1_complex64 - AssertionError: 
+# FAILED tests/lax_test.py::FunctionAccuracyTest::testSuccessOnComplexPlane_tan_complex128 - AssertionError: 
+# FAILED tests/lax_test.py::FunctionAccuracyTest::testSuccessOnComplexPlane_tan_complex64 - AssertionError: 
+# tests/nn_test.py::NNFunctionsTest::testDotProductAttentionMask7 FAILED   [ 10%]
+# FAILED tests/nn_test.py::NNFunctionsTest::testDotProductAttentionMask7 - AssertionError: 
+# 
+
+# Some tests require an isolated run:  TODO: still required?
+local_isolated_tests = [
+    'tests/host_callback_test.py::HostCallbackTapTest::test_tap_scan_custom_jvp',
+    'tests/host_callback_test.py::HostCallbackTapTest::test_tap_transforms_doc',
+    'tests/lax_scipy_special_functions_test.py::LaxScipySpcialFunctionsTest' +
+    '::testScipySpecialFun_gammainc_s_2x1x4_float32_float32',
+]
+# deliberately not testing in parallel, as that results in (additional) failing tests;
+# use XLA_PYTHON_CLIENT_ALLOCATOR=platform to allocate and deallocate GPU memory during testing,
+# see https://github.com/google/jax/issues/7323 and
+# https://github.com/google/jax/blob/main/docs/gpu_memory_allocation.rst;
+# use CUDA_VISIBLE_DEVICES=0 to avoid failing tests on systems with multiple GPUs;
+# use NVIDIA_TF32_OVERRIDE=0 to disable TF32 Tensor Cores and thereby avoid losing numerical precision;
+local_test_exports = [
+    "NVIDIA_TF32_OVERRIDE=0",
+    "CUDA_VISIBLE_DEVICES=0",
+    "XLA_PYTHON_CLIENT_ALLOCATOR=platform",
+    "JAX_ENABLE_X64=true",
+]
+local_test = ''.join(['export %s;' % x for x in local_test_exports])
+# run all tests at once except for local_isolated_tests:
+local_test += "pytest -vv tests %s && " % ' '.join(['--deselect %s' % x for x in local_isolated_tests])
+# run remaining local_isolated_tests separately: 
+local_test += ' && '.join(['pytest -vv %s' % x for x in local_isolated_tests])
+
+use_pip = True
 
 exts_list = [
-    ('opt_einsum', '3.3.0', {
-        'checksums': ['59f6475f77bbc37dcf7cd748519c0ec60722e91e63ca114e68821c0c54a46549'],
-    }),
-    ('etils', '0.8.0', {
-        'checksums': ['d1d5af7bd9c784a273c4e1eccfaa8feaca5e0481a08717b5313fa231da22a903'],
-    }),
     (name, version, {
+        'patches': ['jax-0.4.35_version.patch'],
+        'preinstallopts': _no_devtag,
+        'runtest': False,
         'source_tmpl': '%(name)s-v%(version)s.tar.gz',
         'source_urls': ['https://github.com/google/jax/archive/'],
-        'patches': [
-            'jax-0.3.23_relax-testPoly5-tolerance.patch',
-            'jax-0.3.25_skip-qdwh-test-rank-deficient-deficient.patch',
-        ],
         'checksums': [
-            {'jax-v0.3.25.tar.gz': '49e8ce88ddd7dd0de86116c9d75d98a577a9061377ec423493fbac5ea29f79f0'},
-            {'jax-0.3.23_relax-testPoly5-tolerance.patch':
-             'be64bf36dde4884a97b6c8bb22c6b14ab5b24033cd40bfe7ce18363c55c30e87'},
-            {'jax-0.3.25_skip-qdwh-test-rank-deficient-deficient.patch':
-             '70f16f2dba03ab162ce6e13ea61774524b485e9630209bbd4bec81fd16c8812f'},
+            {'jax-v0.4.34.tar.gz': 'd3a75ad667772309ade81350fa70c4a78028a920028800282e46d8383c0ee6bb'},
+            {'jax-0.4.35_version.patch': 'cd2139a7802abf14b4b2cecee331aed80fff2ef91e16fa105093aea0795455e8'},
         ],
-        'runtest': "pytest -n %(parallel)s tests",
     }),
 ]
-
+sanity_check_commands = [
+    """python -c "import jax_cuda"$(echo $EBVERSIONCUDA|awk -F '.' '{print $1}')"_plugin" """
+]
 sanity_pip_check = True
 
-moduleclass = 'tools'
+# TODO: patch to set default XLA_FLAGS
+modluafooter = """
+setenv("XLA_FLAGS", "--xla_gpu_cuda_data_dir=" .. os.getenv("CUDA_HOME"));
+"""
+
+modtclfooter = """
+setenv XLA_FLAGS --xla_gpu_cuda_data_dir=$::env(CUDA_HOME)
+"""
+
+# TODO: sanity check paths
+
+
+moduleclass = 'ai'

Updated software ml_dtypes-0.5.0-gfbf-2024a.eb

Diff against ml_dtypes-0.3.2-gfbf-2023a.eb

easybuild/easyconfigs/m/ml_dtypes/ml_dtypes-0.3.2-gfbf-2023a.eb

diff --git a/easybuild/easyconfigs/m/ml_dtypes/ml_dtypes-0.3.2-gfbf-2023a.eb b/easybuild/easyconfigs/m/ml_dtypes/ml_dtypes-0.5.0-gfbf-2024a.eb
index 9c3a18bfdb..5a20f1d845 100644
--- a/easybuild/easyconfigs/m/ml_dtypes/ml_dtypes-0.3.2-gfbf-2023a.eb
+++ b/easybuild/easyconfigs/m/ml_dtypes/ml_dtypes-0.5.0-gfbf-2024a.eb
@@ -1,8 +1,8 @@
-# Thomas Hoffmann, EMBL Heidelberg, [email protected], 2024/02
+# Thomas Hoffmann, EMBL Heidelberg, [email protected], 2024/11
 easyblock = 'PythonBundle'
 
 name = 'ml_dtypes'
-version = '0.3.2'
+version = '0.5.0'
 
 homepage = 'https://github.com/jax-ml/ml_dtypes'
 description = """
@@ -18,11 +18,16 @@ float8_e5m2
 float8_e5m2fnuz
 """
 
-toolchain = {'name': 'gfbf', 'version': '2023a'}
+toolchain = {'name': 'gfbf', 'version': '2024a'}
+
+builddependencies = [
+    ('poetry', '1.8.3'),
+]
 
 dependencies = [
-    ('Python', '3.11.3'),
-    ('SciPy-bundle', '2023.07'),
+    ('Python', '3.12.3'),
+    # ('SciPy-bundle', '2024.11'), ?
+    ('SciPy-bundle', '2024.05'),
 ]
 
 
@@ -31,16 +36,16 @@ use_pip = True
 default_easyblock = 'PythonPackage'
 
 exts_list = [
-    ('opt_einsum', '3.3.0', {
-        'checksums': ['59f6475f77bbc37dcf7cd748519c0ec60722e91e63ca114e68821c0c54a46549'],
+    ('opt_einsum', '3.4.0', {
+        'checksums': ['96ca72f1b886d148241348783498194c577fa30a8faac108586b14f1ba4473ac'],
     }),
-    ('etils', '1.6.0', {
-        'checksums': ['c635fbd02a79fed4ad76825d31306b581d22b40671721daa8bc279cf6333e48a'],
+    ('etils', '1.10.0', {
+        'checksums': ['4eaa9d7248fd4eeb75e44d47ca29875a5ccea044cc14a17435794bf8ac116a05'],
     }),
     (name, version, {
         'patches': [('ml_dtypes-0.3.2_EigenAvx512.patch', 1)],
         'checksums': [
-            {'ml_dtypes-0.3.2.tar.gz': '533059bc5f1764fac071ef54598db358c167c51a718f68f5bb55e3dee79d2967'},
+            {'ml_dtypes-0.5.0.tar.gz': '3e7d3a380fe73a63c884f06136f8baa7a5249cc8e9fdec677997dd78549f8128'},
             {'ml_dtypes-0.3.2_EigenAvx512.patch': '197b05b0b7f611749824369f026099f6a172f9e8eab6ebb6504a16573746c892'},
         ],
     }),

@ThomasHoffmann77 ThomasHoffmann77 changed the title {ai}[gfbf/2024a] jax v0.4.35 w/ CUDA 12.6.0 {ai}[gfbf/2024a] jax v0.4.35 w/ CUDA 12.6.0 WIP Nov 28, 2024
@ThomasHoffmann77 ThomasHoffmann77 marked this pull request as draft November 28, 2024 13:43
@ThomasHoffmann77 ThomasHoffmann77 changed the title {ai}[gfbf/2024a] jax v0.4.35 w/ CUDA 12.6.0 WIP {ai}[gfbf/2024a] jax v0.4.35, ml_dtypes v0.5.0 w/ CUDA 12.6.0 WIP Nov 28, 2024
@github-actions github-actions bot added the change label Dec 2, 2024
@ThomasHoffmann77 ThomasHoffmann77 changed the title {ai}[gfbf/2024a] jax v0.4.35, ml_dtypes v0.5.0 w/ CUDA 12.6.0 WIP {ai}[gfbf/2024a] jax v0.4.34, ml_dtypes v0.5.0 w/ CUDA 12.6.0 WIP Dec 20, 2024
@ThomasHoffmann77 ThomasHoffmann77 marked this pull request as draft January 24, 2025 13:59
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Labels
Projects
None yet
Development

Successfully merging this pull request may close these issues.

1 participant