Bump TensorRT-LLM to v0.16.0 (#166)
* misc(trtllm): update to v0.15.0

* chore: quality

* misc(deps): relax transformers dependency

* misc(actions): update dependency on GA

* misc(deps): fu

* fix(test_causal_lm): double parametrization of tp

* misc(deps): bump accelerate to > 0.26

* test(test_causal_lm): ok fixture was "pp" not "tp"

* misc(trtllm): remove lookup_plugin key in export config

* misc(trtllm): ...

* misc(trtllm): reenable sparsity

* misc(trtllm): attempt to automatically push to PyPi release

* misc(trtllm): let's try docker too?

* misc(trtllm): bump to v0.1.0b9 for release

* misc(trtllm): update readme

* misc(trtllm): do not set max_tokens to None

* misc(trtllm): make sure we use the same inputs structure in the output

* misc(trtllm): format

* misc(trtllm): bump to 0.16.0

* misc(trtllm): fix test

* misc(trtllm): fix test

* misc(trtllm): fix test

* misc(trtllm): fix test

* misc(trtllm): quality

* misc(trtllm): quality

* misc(trtllm): attempt to fix test

* update docker image to match 0.16

* update docker image to match 0.16 (bis)

* top_p > 0 as per trtllm requirements

* fix some more breaking changes

* fix some more breaking changes bis

* fix test_misc

* again

* again

* again

* fix: Fix tests to comply with 0.16.0 update.

* again

* finally?

* reintroduce more models

---------

Co-authored-by: Hugo Larcher <[email protected]>
mfuntowicz and Hugoch authored Jan 16, 2025
1 parent 84c7fb8 commit 61f7abf
Showing 15 changed files with 240 additions and 154 deletions.
10 changes: 4 additions & 6 deletions .github/workflows/pr_tests.yml
@@ -23,7 +23,7 @@ jobs:
       matrix:
         config:
           - name: Optimum-Nvidia Test Suite
-            image: nvidia/cuda:12.5.1-devel-ubuntu22.04
+            image: nvidia/cuda:12.6.3-devel-ubuntu24.04
             gpu_target: ["aws-g6-12xlarge-cache", "aws-g5-12xlarge-cache"]
 
     name: ${{ matrix.config.name }}
@@ -56,8 +56,7 @@ jobs:
       - name: Install dependencies
         run: |
           apt update && apt install -y libmpich-dev libopenmpi-dev openmpi-bin git
-          python3 -m pip install --upgrade -e .[quality,tests] --pre --extra-index-url https://pypi.nvidia.com
-          python3 -m pip install --upgrade 'transformers>=4.43.0'
+          python3 -m pip install --upgrade -e .[quality,tests] --extra-index-url https://pypi.nvidia.com
       - name: Run nvidia-smi
         run: |
@@ -77,7+76,7 @@ jobs:
       matrix:
         config:
           - name: Optimum-Nvidia CLI Test Suite
-            image: nvidia/cuda:12.5.1-devel-ubuntu22.04
+            image: nvidia/cuda:12.6.3-devel-ubuntu24.04
             gpu_target: ["aws-g6-12xlarge-cache", "aws-g5-12xlarge-cache"]
 
     name: ${{ matrix.config.name }}
@@ -110,8 +109,7 @@ jobs:
       - name: Install dependencies
         run: |
           apt update && apt install -y openmpi-bin libopenmpi-dev git
-          python3 -m pip install --upgrade -e .[quality,tests] --pre --extra-index-url https://pypi.nvidia.com
-          python3 -m pip install --upgrade 'transformers>=4.43.0'
+          python3 -m pip install --upgrade -e .[quality,tests] --extra-index-url https://pypi.nvidia.com
       - name: Run nvidia-smi
         run: |
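Both install steps now pull `tensorrt-llm == 0.16.0` from the stable NVIDIA index (no more `--pre`) and drop the explicit transformers pin, deferring to the package's own constraint. A quick sanity check of the environment the CI now builds — a stdlib-only sketch; the distribution names are the ones pinned in pyproject.toml below:

```python
# Sketch: print the versions the consolidated install actually resolved.
from importlib.metadata import PackageNotFoundError, version

for dist in ("tensorrt-llm", "transformers", "accelerate", "torch"):
    try:
        print(f"{dist}: {version(dist)}")
    except PackageNotFoundError:
        print(f"{dist}: not installed")
```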
105 changes: 62 additions & 43 deletions .github/workflows/release.yml
@@ -1,10 +1,37 @@
 name: Release
 
 on:
-  release:
-    types: [published]
+  push:
+    tags:
+      - "v*"
 
 jobs:
+  pypi:
+    name: Publish release artifact on PyPi repository
+    runs-on: ubuntu-latest
+
+    steps:
+      - name: Set up Python
+        uses: actions/setup-python@v4
+        with:
+          python-version: 3.10
+
+      - name: Install dependencies
+        run: |
+          pip install --upgrade pip
+          pip install setuptools wheel
+      - run: |
+          python setup.py sdist bdist_wheel
+      - run: |
+          pip install twine
+      - name: Upload to PyPi
+        env:
+          OPTIMUM_NVIDIA_PYPI_TOKEN: ${{ secrets.OPTIMUM_NVIDIA_PYPI_TOKEN }}
+        run: |
+          twine upload dist/* -u __token__ -p "$OPTIMUM_NVIDIA_PYPI_TOKEN"
+
   docker:
     name: Push Docker container to Docker Hub and Github Registry
     runs-on: ubuntu-latest
@@ -14,44 +41,36 @@ jobs:
       attestations: write
       id-token: write
     steps:
-    - name: Check out the repo
-      uses: actions/checkout@v4
-
-    - name: Log in to Docker Hub
-      uses: docker/login-action@f4ef78c080cd8ba55a85445d5b36e214a81df20a
-      with:
-        username: ${{ secrets.DOCKER_USERNAME }}
-        password: ${{ secrets.DOCKER_PASSWORD }}
-
-    - name: Log in to the Container registry
-      uses: docker/login-action@65b78e6e13532edd9afa3aa52ac7964289d1a9c1
-      with:
-        registry: ghcr.io
-        username: ${{ github.actor }}
-        password: ${{ secrets.GITHUB_TOKEN }}
-
-    - name: Extract metadata (tags, labels) for Docker
-      id: meta
-      uses: docker/metadata-action@9ec57ed1fcdbf14dcef7dfbe97b2010124a938b7
-      with:
-        images: |
-          ${{ env.DOCKER_IMAGE_NAME }}
-          ghcr.io/${{ github.repository }}
-    - name: Build and push Docker image
-      id: push
-      uses: docker/build-push-action@3b5e8027fcad23fda98b2e3ac259d8d67585f671
-      with:
-        context: .
-        file: docker/Dockerfile
-        push: true
-        tags: ${{ steps.meta.outputs.tags }}
-        labels: ${{ steps.meta.outputs.labels }}
-
-
-    - name: Generate artifact attestation
-      uses: actions/attest-build-provenance@v1
-      with:
-        subject-name: ${{ env.REGISTRY }}/${{ env.IMAGE_NAME}}
-        subject-digest: ${{ steps.push.outputs.digest }}
-        push-to-registry: true
+      - name: Check out the repo
+        uses: actions/checkout@v4
+
+      - name: Log in to Docker Hub
+        uses: docker/login-action@f4ef78c080cd8ba55a85445d5b36e214a81df20a
+        with:
+          username: ${{ secrets.DOCKER_USERNAME }}
+          password: ${{ secrets.DOCKER_PASSWORD }}
+
+      - name: Extract metadata (tags, labels) for Docker
+        id: meta
+        uses: docker/metadata-action@9ec57ed1fcdbf14dcef7dfbe97b2010124a938b7
+        with:
+          images: |
+            ${{ env.DOCKER_IMAGE_NAME }}
+      - name: Build and push Docker image
+        id: push
+        uses: docker/build-push-action@3b5e8027fcad23fda98b2e3ac259d8d67585f671
+        with:
+          context: .
+          file: docker/Dockerfile
+          push: true
+          tags: ${{ steps.meta.outputs.tags }}
+          labels: ${{ steps.meta.outputs.labels }}
+
+
+      - name: Generate artifact attestation
+        uses: actions/attest-build-provenance@v1
+        with:
+          subject-name: ${{ env.REGISTRY }}/${{ env.IMAGE_NAME}}
+          subject-digest: ${{ steps.push.outputs.digest }}
+          push-to-registry: true
6 changes: 3 additions & 3 deletions README.md
@@ -6,9 +6,9 @@ Optimum-NVIDIA
 <h4> Optimized inference with NVIDIA and Hugging Face </h4>
 
 [![Documentation](https://img.shields.io/badge/docs-latest-brightgreen.svg?style=flat)](https://huggingface.co/docs/optimum/index)
-[![python](https://img.shields.io/badge/python-3.10.12-green)](https://www.python.org/downloads/release/python-31013/)
-[![cuda](https://img.shields.io/badge/cuda-12.5-green)](https://developer.nvidia.com/cuda-downloads)
-[![trt-llm](https://img.shields.io/badge/TensorRT--LLM-0.13.0.dev2024090300-green)](https://github.com/nvidia/tensorrt-llm)
+[![python](https://img.shields.io/badge/python-3.10-green)](https://www.python.org/downloads/release/python-31013/)
+[![cuda](https://img.shields.io/badge/cuda-12.6-green)](https://developer.nvidia.com/cuda-downloads)
+[![trt-llm](https://img.shields.io/badge/TensorRT--LLM-0.15.0-green)](https://github.com/nvidia/tensorrt-llm)
 [![license](https://img.shields.io/badge/license-Apache%202-blue)](./LICENSE)
 
 ---
5 changes: 4 additions & 1 deletion examples/text-generation.py
@@ -80,5 +80,8 @@
         tokens["input_ids"],
     )
 
-generated_text = tokenizer.decode(generated, skip_special_tokens=True)
+if len(generated) and isinstance(generated[0], int):
+    generated_text = tokenizer.decode(generated, skip_special_tokens=True)
+else:
+    generated_text = tokenizer.batch_decode(generated, skip_special_tokens=True)
 print(generated_text)
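The new branch exists because generation may now hand back either a flat list of token ids (one sequence) or one list per sequence (a batch); `decode` handles the former, `batch_decode` the latter. A minimal, self-contained sketch — the `"gpt2"` tokenizer and the token ids are placeholders, not taken from this repository:

```python
# Sketch: the same dispatch as the diff above, on made-up inputs.
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("gpt2")  # placeholder model

flat = [464, 2068, 7586, 21831]          # single sequence -> decode
batched = [[464, 2068], [7586, 21831]]   # batch -> batch_decode

for generated in (flat, batched):
    if len(generated) and isinstance(generated[0], int):
        print(tokenizer.decode(generated, skip_special_tokens=True))
    else:
        print(tokenizer.batch_decode(generated, skip_special_tokens=True))
```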
12 changes: 6 additions & 6 deletions pyproject.toml
@@ -24,19 +24,19 @@ classifiers = [
 
 # List dependencies
 dependencies = [
-    "accelerate == 0.25",
+    "accelerate >= 0.26",
     "datasets >= 2.14.0",
     "huggingface-hub >= 0.24.0",
     "hf-transfer==0.1.6",
     "mpi4py < 4.0.0",
     "mpmath == 1.3.0",
     "numpy >= 1.26.0, < 2.0.0",
-    "onnx >= 1.12.0",
+    "onnx >= 1.17.0",
     "optimum >= 1.21.0",
     "setuptools",
-    "tensorrt-llm == 0.13.0",
-    "torch>=2.4.0a,<=2.5.0a",
-    "transformers >= 4.42.4",
+    "tensorrt-llm == 0.16.0",
+    "torch>=2.4.0a,<=2.6.0a",
+    "transformers >= 4.45.1",
     "pynvml"
 ]
 
@@ -51,7 +51,7 @@ Issues = "https://github.com/huggingface/optimum-nvidia/issues"
 
 # List additional dependencies
 [project.optional-dependencies]
-test = ["mock", "pytest", "pytest-console-scripts", "pytest-xdist", "psutil", "parameterized"]
+tests = ["mock", "pytest", "pytest-console-scripts", "pytest-xdist", "psutil", "parameterized"]
 # quality = ["black", "ruff", "isort", "hf_doc_builder @ git+https://github.com/huggingface/doc-builder.git",]
 
 # Configure build system
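The optional-dependency group is renamed from `test` to `tests`, matching the extras the CI jobs above install via `.[quality,tests]`. A hedged way to confirm what an installed copy advertises — `importlib.metadata` is stdlib, and `"optimum-nvidia"` is the expected distribution name:

```python
# Sketch: list the extras the installed distribution declares.
from importlib.metadata import metadata

extras = metadata("optimum-nvidia").get_all("Provides-Extra") or []
print(sorted(extras))  # expect "tests" (no longer "test") after this change
```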
10 changes: 5 additions & 5 deletions setup.py
@@ -27,18 +27,18 @@
     assert False, "Error: Could not open '%s' due %s\n" % (filepath, error)
 
 INSTALL_REQUIRES = [
-    "accelerate == 0.25",
+    "accelerate >= 0.26",
     "datasets >= 2.14",
     "huggingface-hub >= 0.24",
     "hf-transfer==0.1.6",
     "mpmath == 1.3.0",
     "numpy >= 1.26.0",
-    "onnx >= 1.12.0",
+    "onnx >= 1.17.0",
     "optimum >= 1.21.0",
     "setuptools",
-    "tensorrt-llm == 0.13.0",
-    "torch>=2.3.0a,<=2.5.0a",
-    "transformers >= 4.43.2",
+    "tensorrt-llm == 0.16.0",
+    "torch>=2.3.0a,<=2.6.0a",
+    "transformers >= 4.45.1",
     "pynvml"
 ]
13 changes: 7 additions & 6 deletions src/optimum/nvidia/export/cli.py
@@ -23,15 +23,16 @@ def common_trtllm_export_args(parser: "ArgumentParser"):
         help="Maximum sequence length, in number of tokens, the model supports.",
     )
     required_group.add_argument(
-        "--max-new-tokens",
-        type=int,
-        default=-1,
-        help="Maximum new tokens, "
+        "--max-new-tokens", type=int, default=-1, help="Maximum new tokens, "
     )
 
     multi_gpu_group = parser.add_argument_group("Multi-GPU support arguments")
-    multi_gpu_group.add_argument("--tp", type=int, default=1, help="Tensor Parallel degree")
-    multi_gpu_group.add_argument("--pp", type=int, default=1, help="Pipeline Parallel degree")
+    multi_gpu_group.add_argument(
+        "--tp", type=int, default=1, help="Tensor Parallel degree"
+    )
+    multi_gpu_group.add_argument(
+        "--pp", type=int, default=1, help="Pipeline Parallel degree"
+    )
 
     optional_group = parser.add_argument_group("Optional arguments")
     optional_group.add_argument(
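This reformatting is behavior-preserving: argparse reads the same flags whether an `add_argument` call spans one line or several. A small sketch exercising only the options visible in this hunk — the sample values are made up, and `parse_known_args` is used on the assumption that flags defined outside the hunk are not strictly required:

```python
# Sketch: the reformatted flags parse exactly as before.
from argparse import ArgumentParser

from optimum.nvidia.export.cli import common_trtllm_export_args

parser = ArgumentParser("export-sketch")
common_trtllm_export_args(parser)
args, _ = parser.parse_known_args(["--max-new-tokens", "256", "--tp", "2", "--pp", "2"])
assert (args.max_new_tokens, args.tp, args.pp) == (256, 2, 2)
```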
9 changes: 3 additions & 6 deletions src/optimum/nvidia/export/config.py
@@ -12,9 +12,9 @@
 from transformers import AutoConfig
 
 from optimum.nvidia.lang import DataType
-from optimum.nvidia.utils.nvml import is_post_hopper
 from optimum.utils import NormalizedConfig
 
+from optimum.nvidia.utils.nvml import is_post_hopper
+
 if TYPE_CHECKING:
     from transformers import PretrainedConfig
@@ -83,7 +83,7 @@ def validate(self) -> "ExportConfig":
         if self.max_num_tokens == -1:
             if self.enabled_chunked_context:
                 # Should be N * tokens_per_block (8192 is the default)
-                self.max_num_tokens = 8192 # hardcode for now
+                self.max_num_tokens = 8192  # hardcode for now
                 warn(
                     f"max_num_tokens set to {self.max_num_tokens} with chunked context enabled might not be optimal."
                 )
@@ -105,7 +105,6 @@ def plugin_config(self) -> "PluginConfig":
             config.use_paged_context_fmha = True
 
         if self.sharding.world_size > 1:
-            config.lookup_plugin = "auto"
             config.set_nccl_plugin()
 
         if DataType(self.dtype) == DataType.FLOAT8:
@@ -133,8 +132,7 @@ def to_builder_config(
 
         if qmode.is_weight_only():
             plugin_config.weight_only_groupwise_quant_matmul_plugin = "auto"
-            # weight_sparsity = qmode.sparsity is not None
-            weight_sparsity = False
+            weight_sparsity = qmode.sparsity is not None
         else:
             weight_sparsity = False
 
@@ -144,7 +142,6 @@
             max_batch_size=self.max_batch_size,
             max_beam_width=self.max_beam_width,
             max_num_tokens=self.max_num_tokens,
-            builder_opt=self.optimization_level,
             plugin_config=plugin_config,
             use_fused_mlp=True,
             weight_sparsity=weight_sparsity,
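On the hardcoded `max_num_tokens` default: the comment in `validate()` says it should be N * tokens_per_block. As an illustration only — 64 tokens per block is an assumed common TensorRT-LLM default, not read from the runtime config — 8192 corresponds to 128 KV-cache blocks:

```python
# Illustrative arithmetic only; TOKENS_PER_BLOCK is an assumption, not a
# value imported from TensorRT-LLM.
TOKENS_PER_BLOCK = 64

def chunked_context_max_num_tokens(n_blocks: int) -> int:
    """max_num_tokens should be a whole number of KV-cache blocks."""
    return n_blocks * TOKENS_PER_BLOCK

assert chunked_context_max_num_tokens(128) == 8192  # the hardcoded default above
```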
5 changes: 2 additions & 3 deletions src/optimum/nvidia/hub.py
@@ -241,9 +241,8 @@ def _from_pretrained(
         # Check if we have a local path to a model OR a model_id on the hub
         if local_model_id.exists() and local_model_id.is_dir():
             if any(engine_files := list(folder_list_engines(local_model_id))):
-                engines_folder = engine_files[
-                    0
-                ].parent  # Looking for parent folder not actual specific engine file
+                # Looking for parent folder not actual specific engine file
+                engines_folder = engine_files[0].parent
                 checkpoints_folder = None
             else:
                 checkpoint_files = list(folder_list_checkpoints(local_model_id))
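The rewrite keeps the assignment-expression idiom: `any(files := list(...))` materializes the generator and tests for emptiness in one step, while the walrus keeps the list in scope. A self-contained sketch of the same pattern — the `*.engine` glob is an assumption standing in for the repo's `folder_list_engines` helper:

```python
# Sketch of the pattern used above, independent of optimum-nvidia helpers.
from pathlib import Path
from typing import Optional

def find_engines_folder(local_model_id: Path) -> Optional[Path]:
    # folder_list_engines is approximated here by a recursive *.engine glob.
    if any(engine_files := list(local_model_id.glob("**/*.engine"))):
        # Looking for the parent folder, not the specific engine file
        return engine_files[0].parent
    return None
```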
(Diffs for the remaining 6 changed files are not shown.)
