Bump TensorRT-LLM to v0.16.0 (#166)
* misc(trtllm): update to v0.15.0

* chore: quality

* misc(deps): relax transformers dependency

* misc(actions): update dependency on GA

* misc(deps): fu

* fix(test_causal_lm): double parametrization of tp

* misc(deps): bump accelerate to > 0.26

* test(test_causal_lm): ok fixture was "pp" not "tp"

* misc(trtllm): remove lookup_plugin key in export config

* misc(trtllm): ...

* misc(trtllm): reenable sparsity

* misc(trtllm): attempt to automatically push to PyPi release

* misc(trtllm): let's try docker too?

* misc(trtllm): bump to v0.1.0b9 for release

* misc(trtllm): update readme

* misc(trtllm): do not set max_tokens to None

* misc(trtllm): make sure we use the same inputs structure in the output

* misc(trtllm): format

* misc(trtllm): bump to 0.16.0

* misc(trtllm): fix test

* misc(trtllm): fix test

* misc(trtllm): fix test

* misc(trtllm): fix test

* misc(trtllm): quality

* misc(trtllm): quality

* misc(trtllm): attempt to fix test

* update docker image to match 0.16

* update docker image to match 0.16 (bis)

* top_p > 0 as per trtllm requirements

* fix some more breaking changes

* fix some more breaking changes bis

* fix test_misc

* again

* again

* again

* fix: Fix tests to comply with 0.16.0 update.

* again

* finally?

* reintroduce more models

---------

Co-authored-by: Hugo Larcher <[email protected]>
mfuntowicz and Hugoch authored Jan 16, 2025
1 parent 84c7fb8 commit 61f7abf
Showing 15 changed files with 240 additions and 154 deletions.
10 changes: 4 additions & 6 deletions .github/workflows/pr_tests.yml
@@ -23,7 +23,7 @@ jobs:
       matrix:
         config:
           - name: Optimum-Nvidia Test Suite
-            image: nvidia/cuda:12.5.1-devel-ubuntu22.04
+            image: nvidia/cuda:12.6.3-devel-ubuntu24.04
             gpu_target: ["aws-g6-12xlarge-cache", "aws-g5-12xlarge-cache"]
 
     name: ${{ matrix.config.name }}
@@ -56,8 +56,7 @@ jobs:
       - name: Install dependencies
         run: |
           apt update && apt install -y libmpich-dev libopenmpi-dev openmpi-bin git
-          python3 -m pip install --upgrade -e .[quality,tests] --pre --extra-index-url https://pypi.nvidia.com
-          python3 -m pip install --upgrade 'transformers>=4.43.0'
+          python3 -m pip install --upgrade -e .[quality,tests] --extra-index-url https://pypi.nvidia.com
       - name: Run nvidia-smi
         run: |
@@ -77,7+76,7 @@ jobs:
       matrix:
         config:
           - name: Optimum-Nvidia CLI Test Suite
-            image: nvidia/cuda:12.5.1-devel-ubuntu22.04
+            image: nvidia/cuda:12.6.3-devel-ubuntu24.04
             gpu_target: ["aws-g6-12xlarge-cache", "aws-g5-12xlarge-cache"]
 
     name: ${{ matrix.config.name }}
@@ -110,8 +109,7 @@ jobs:
       - name: Install dependencies
         run: |
           apt update && apt install -y openmpi-bin libopenmpi-dev git
-          python3 -m pip install --upgrade -e .[quality,tests] --pre --extra-index-url https://pypi.nvidia.com
-          python3 -m pip install --upgrade 'transformers>=4.43.0'
+          python3 -m pip install --upgrade -e .[quality,tests] --extra-index-url https://pypi.nvidia.com
       - name: Run nvidia-smi
         run: |
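Both install steps now pull `tensorrt-llm == 0.16.0` from the stable NVIDIA index (no more `--pre`) and drop the explicit transformers pin, deferring to the package's own constraint. A quick sanity check of the environment the CI now builds — a stdlib-only sketch; the distribution names are the ones pinned in pyproject.toml below:

```python
# Sketch: print the versions the consolidated install actually resolved.
from importlib.metadata import PackageNotFoundError, version

for dist in ("tensorrt-llm", "transformers", "accelerate", "torch"):
    try:
        print(f"{dist}: {version(dist)}")
    except PackageNotFoundError:
        print(f"{dist}: not installed")
```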
105 changes: 62 additions & 43 deletions .github/workflows/release.yml
@@ -1,10 +1,37 @@
 name: Release
 
 on:
-  release:
-    types: [published]
+  push:
+    tags:
+      - "v*"
 
 jobs:
+  pypi:
+    name: Publish release artifact on PyPi repository
+    runs-on: ubuntu-latest
+
+    steps:
+      - name: Set up Python
+        uses: actions/setup-python@v4
+        with:
+          python-version: 3.10
+
+      - name: Install dependencies
+        run: |
+          pip install --upgrade pip
+          pip install setuptools wheel
+      - run: |
+          python setup.py sdist bdist_wheel
+      - run: |
+          pip install twine
+      - name: Upload to PyPi
+        env:
+          OPTIMUM_NVIDIA_PYPI_TOKEN: ${{ secrets.OPTIMUM_NVIDIA_PYPI_TOKEN }}
+        run: |
+          twine upload dist/* -u __token__ -p "$OPTIMUM_NVIDIA_PYPI_TOKEN"
+
   docker:
     name: Push Docker container to Docker Hub and Github Registry
     runs-on: ubuntu-latest
@@ -14,44 +41,36 @@ jobs:
       attestations: write
       id-token: write
     steps:
-    - name: Check out the repo
-      uses: actions/checkout@v4
-
-    - name: Log in to Docker Hub
-      uses: docker/login-action@f4ef78c080cd8ba55a85445d5b36e214a81df20a
-      with:
-        username: ${{ secrets.DOCKER_USERNAME }}
-        password: ${{ secrets.DOCKER_PASSWORD }}
-
-    - name: Log in to the Container registry
-      uses: docker/login-action@65b78e6e13532edd9afa3aa52ac7964289d1a9c1
-      with:
-        registry: ghcr.io
-        username: ${{ github.actor }}
-        password: ${{ secrets.GITHUB_TOKEN }}
-
-    - name: Extract metadata (tags, labels) for Docker
-      id: meta
-      uses: docker/metadata-action@9ec57ed1fcdbf14dcef7dfbe97b2010124a938b7
-      with:
-        images: |
-          ${{ env.DOCKER_IMAGE_NAME }}
-          ghcr.io/${{ github.repository }}
-    - name: Build and push Docker image
-      id: push
-      uses: docker/build-push-action@3b5e8027fcad23fda98b2e3ac259d8d67585f671
-      with:
-        context: .
-        file: docker/Dockerfile
-        push: true
-        tags: ${{ steps.meta.outputs.tags }}
-        labels: ${{ steps.meta.outputs.labels }}
-
-
-    - name: Generate artifact attestation
-      uses: actions/attest-build-provenance@v1
-      with:
-        subject-name: ${{ env.REGISTRY }}/${{ env.IMAGE_NAME}}
-        subject-digest: ${{ steps.push.outputs.digest }}
-        push-to-registry: true
+      - name: Check out the repo
+        uses: actions/checkout@v4
+
+      - name: Log in to Docker Hub
+        uses: docker/login-action@f4ef78c080cd8ba55a85445d5b36e214a81df20a
+        with:
+          username: ${{ secrets.DOCKER_USERNAME }}
+          password: ${{ secrets.DOCKER_PASSWORD }}
+
+      - name: Extract metadata (tags, labels) for Docker
+        id: meta
+        uses: docker/metadata-action@9ec57ed1fcdbf14dcef7dfbe97b2010124a938b7
+        with:
+          images: |
+            ${{ env.DOCKER_IMAGE_NAME }}
+      - name: Build and push Docker image
+        id: push
+        uses: docker/build-push-action@3b5e8027fcad23fda98b2e3ac259d8d67585f671
+        with:
+          context: .
+          file: docker/Dockerfile
+          push: true
+          tags: ${{ steps.meta.outputs.tags }}
+          labels: ${{ steps.meta.outputs.labels }}
+
+
+      - name: Generate artifact attestation
+        uses: actions/attest-build-provenance@v1
+        with:
+          subject-name: ${{ env.REGISTRY }}/${{ env.IMAGE_NAME}}
+          subject-digest: ${{ steps.push.outputs.digest }}
+          push-to-registry: true
6 changes: 3 additions & 3 deletions README.md
@@ -6,9 +6,9 @@ Optimum-NVIDIA
 <h4> Optimized inference with NVIDIA and Hugging Face </h4>
 
 [![Documentation](https://img.shields.io/badge/docs-latest-brightgreen.svg?style=flat)](https://huggingface.co/docs/optimum/index)
-[![python](https://img.shields.io/badge/python-3.10.12-green)](https://www.python.org/downloads/release/python-31013/)
-[![cuda](https://img.shields.io/badge/cuda-12.5-green)](https://developer.nvidia.com/cuda-downloads)
-[![trt-llm](https://img.shields.io/badge/TensorRT--LLM-0.13.0.dev2024090300-green)](https://github.com/nvidia/tensorrt-llm)
+[![python](https://img.shields.io/badge/python-3.10-green)](https://www.python.org/downloads/release/python-31013/)
+[![cuda](https://img.shields.io/badge/cuda-12.6-green)](https://developer.nvidia.com/cuda-downloads)
+[![trt-llm](https://img.shields.io/badge/TensorRT--LLM-0.15.0-green)](https://github.com/nvidia/tensorrt-llm)
 [![license](https://img.shields.io/badge/license-Apache%202-blue)](./LICENSE)
 
 ---
5 changes: 4 additions & 1 deletion examples/text-generation.py
@@ -80,5 +80,8 @@
         tokens["input_ids"],
     )
 
-generated_text = tokenizer.decode(generated, skip_special_tokens=True)
+if len(generated) and isinstance(generated[0], int):
+    generated_text = tokenizer.decode(generated, skip_special_tokens=True)
+else:
+    generated_text = tokenizer.batch_decode(generated, skip_special_tokens=True)
 print(generated_text)
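The new branch exists because generation may now hand back either a flat list of token ids (one sequence) or one list per sequence (a batch); `decode` handles the former, `batch_decode` the latter. A minimal, self-contained sketch — the `"gpt2"` tokenizer and the token ids are placeholders, not taken from this repository:

```python
# Sketch: the same dispatch as the diff above, on made-up inputs.
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("gpt2")  # placeholder model

flat = [464, 2068, 7586, 21831]          # single sequence -> decode
batched = [[464, 2068], [7586, 21831]]   # batch -> batch_decode

for generated in (flat, batched):
    if len(generated) and isinstance(generated[0], int):
        print(tokenizer.decode(generated, skip_special_tokens=True))
    else:
        print(tokenizer.batch_decode(generated, skip_special_tokens=True))
```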
12 changes: 6 additions & 6 deletions pyproject.toml
@@ -24,19 +24,19 @@ classifiers = [
 
 # List dependencies
 dependencies = [
-    "accelerate == 0.25",
+    "accelerate >= 0.26",
     "datasets >= 2.14.0",
     "huggingface-hub >= 0.24.0",
     "hf-transfer==0.1.6",
     "mpi4py < 4.0.0",
     "mpmath == 1.3.0",
     "numpy >= 1.26.0, < 2.0.0",
-    "onnx >= 1.12.0",
+    "onnx >= 1.17.0",
     "optimum >= 1.21.0",
     "setuptools",
-    "tensorrt-llm == 0.13.0",
-    "torch>=2.4.0a,<=2.5.0a",
-    "transformers >= 4.42.4",
+    "tensorrt-llm == 0.16.0",
+    "torch>=2.4.0a,<=2.6.0a",
+    "transformers >= 4.45.1",
     "pynvml"
 ]
 
@@ -51,7 +51,7 @@ Issues = "https://github.com/huggingface/optimum-nvidia/issues"
 
 # List additional dependencies
 [project.optional-dependencies]
-test = ["mock", "pytest", "pytest-console-scripts", "pytest-xdist", "psutil", "parameterized"]
+tests = ["mock", "pytest", "pytest-console-scripts", "pytest-xdist", "psutil", "parameterized"]
 # quality = ["black", "ruff", "isort", "hf_doc_builder @ git+https://github.com/huggingface/doc-builder.git",]
 
 # Configure build system
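The optional-dependency group is renamed from `test` to `tests`, matching the extras the CI jobs above install via `.[quality,tests]`. A hedged way to confirm what an installed copy advertises — `importlib.metadata` is stdlib, and `"optimum-nvidia"` is the expected distribution name:

```python
# Sketch: list the extras the installed distribution declares.
from importlib.metadata import metadata

extras = metadata("optimum-nvidia").get_all("Provides-Extra") or []
print(sorted(extras))  # expect "tests" (no longer "test") after this change
```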
10 changes: 5 additions & 5 deletions setup.py
@@ -27,18 +27,18 @@
     assert False, "Error: Could not open '%s' due %s\n" % (filepath, error)
 
 INSTALL_REQUIRES = [
-    "accelerate == 0.25",
+    "accelerate >= 0.26",
     "datasets >= 2.14",
     "huggingface-hub >= 0.24",
     "hf-transfer==0.1.6",
     "mpmath == 1.3.0",
     "numpy >= 1.26.0",
-    "onnx >= 1.12.0",
+    "onnx >= 1.17.0",
     "optimum >= 1.21.0",
     "setuptools",
-    "tensorrt-llm == 0.13.0",
-    "torch>=2.3.0a,<=2.5.0a",
-    "transformers >= 4.43.2",
+    "tensorrt-llm == 0.16.0",
+    "torch>=2.3.0a,<=2.6.0a",
+    "transformers >= 4.45.1",
     "pynvml"
 ]
13 changes: 7 additions & 6 deletions src/optimum/nvidia/export/cli.py
@@ -23,15 +23,16 @@ def common_trtllm_export_args(parser: "ArgumentParser"):
         help="Maximum sequence length, in number of tokens, the model supports.",
     )
     required_group.add_argument(
-        "--max-new-tokens",
-        type=int,
-        default=-1,
-        help="Maximum new tokens, "
+        "--max-new-tokens", type=int, default=-1, help="Maximum new tokens, "
     )
 
     multi_gpu_group = parser.add_argument_group("Multi-GPU support arguments")
-    multi_gpu_group.add_argument("--tp", type=int, default=1, help="Tensor Parallel degree")
-    multi_gpu_group.add_argument("--pp", type=int, default=1, help="Pipeline Parallel degree")
+    multi_gpu_group.add_argument(
+        "--tp", type=int, default=1, help="Tensor Parallel degree"
+    )
+    multi_gpu_group.add_argument(
+        "--pp", type=int, default=1, help="Pipeline Parallel degree"
+    )
 
     optional_group = parser.add_argument_group("Optional arguments")
     optional_group.add_argument(
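This reformatting is behavior-preserving: argparse reads the same flags whether an `add_argument` call spans one line or several. A small sketch exercising only the options visible in this hunk — the sample values are made up, and `parse_known_args` is used on the assumption that flags defined outside the hunk are not strictly required:

```python
# Sketch: the reformatted flags parse exactly as before.
from argparse import ArgumentParser

from optimum.nvidia.export.cli import common_trtllm_export_args

parser = ArgumentParser("export-sketch")
common_trtllm_export_args(parser)
args, _ = parser.parse_known_args(["--max-new-tokens", "256", "--tp", "2", "--pp", "2"])
assert (args.max_new_tokens, args.tp, args.pp) == (256, 2, 2)
```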
9 changes: 3 additions & 6 deletions src/optimum/nvidia/export/config.py
@@ -12,9 +12,9 @@
 from transformers import AutoConfig
 
 from optimum.nvidia.lang import DataType
-from optimum.nvidia.utils.nvml import is_post_hopper
 from optimum.utils import NormalizedConfig
 
+from optimum.nvidia.utils.nvml import is_post_hopper
+
 if TYPE_CHECKING:
     from transformers import PretrainedConfig
@@ -83,7 +83,7 @@ def validate(self) -> "ExportConfig":
         if self.max_num_tokens == -1:
             if self.enabled_chunked_context:
                 # Should be N * tokens_per_block (8192 is the default)
-                self.max_num_tokens = 8192 # hardcode for now
+                self.max_num_tokens = 8192  # hardcode for now
                 warn(
                     f"max_num_tokens set to {self.max_num_tokens} with chunked context enabled might not be optimal."
                 )
@@ -105,7 +105,6 @@ def plugin_config(self) -> "PluginConfig":
             config.use_paged_context_fmha = True
 
         if self.sharding.world_size > 1:
-            config.lookup_plugin = "auto"
             config.set_nccl_plugin()
 
         if DataType(self.dtype) == DataType.FLOAT8:
@@ -133,8 +132,7 @@ def to_builder_config(
 
         if qmode.is_weight_only():
             plugin_config.weight_only_groupwise_quant_matmul_plugin = "auto"
-            # weight_sparsity = qmode.sparsity is not None
-            weight_sparsity = False
+            weight_sparsity = qmode.sparsity is not None
         else:
             weight_sparsity = False
 
@@ -144,7 +142,6 @@
             max_batch_size=self.max_batch_size,
             max_beam_width=self.max_beam_width,
             max_num_tokens=self.max_num_tokens,
-            builder_opt=self.optimization_level,
             plugin_config=plugin_config,
             use_fused_mlp=True,
             weight_sparsity=weight_sparsity,
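On the hardcoded `max_num_tokens` default: the comment in `validate()` says it should be N * tokens_per_block. As an illustration only — 64 tokens per block is an assumed common TensorRT-LLM default, not read from the runtime config — 8192 corresponds to 128 KV-cache blocks:

```python
# Illustrative arithmetic only; TOKENS_PER_BLOCK is an assumption, not a
# value imported from TensorRT-LLM.
TOKENS_PER_BLOCK = 64

def chunked_context_max_num_tokens(n_blocks: int) -> int:
    """max_num_tokens should be a whole number of KV-cache blocks."""
    return n_blocks * TOKENS_PER_BLOCK

assert chunked_context_max_num_tokens(128) == 8192  # the hardcoded default above
```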
5 changes: 2 additions & 3 deletions src/optimum/nvidia/hub.py
@@ -241,9 +241,8 @@ def _from_pretrained(
         # Check if we have a local path to a model OR a model_id on the hub
         if local_model_id.exists() and local_model_id.is_dir():
             if any(engine_files := list(folder_list_engines(local_model_id))):
-                engines_folder = engine_files[
-                    0
-                ].parent  # Looking for parent folder not actual specific engine file
+                # Looking for parent folder not actual specific engine file
+                engines_folder = engine_files[0].parent
                 checkpoints_folder = None
             else:
                 checkpoint_files = list(folder_list_checkpoints(local_model_id))
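The rewrite keeps the assignment-expression idiom: `any(files := list(...))` materializes the generator and tests for emptiness in one step, while the walrus keeps the list in scope. A self-contained sketch of the same pattern — the `*.engine` glob is an assumption standing in for the repo's `folder_list_engines` helper:

```python
# Sketch of the pattern used above, independent of optimum-nvidia helpers.
from pathlib import Path
from typing import Optional

def find_engines_folder(local_model_id: Path) -> Optional[Path]:
    # folder_list_engines is approximated here by a recursive *.engine glob.
    if any(engine_files := list(local_model_id.glob("**/*.engine"))):
        # Looking for the parent folder, not the specific engine file
        return engine_files[0].parent
    return None
```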
(Diffs for the remaining 6 changed files are not shown.)
