Refactor the overall Hugging Face -> TRTLLM export workflow (#133)
* Initial mixtral support

* feat(mixtral): map the correct num_local_experts config key for the MOE config

* feat(mixtral): allow specifying TP/PP configurations when allocating the model

* feat(mixtral): Expose tp/pp in examples/cli

* feat(mixtral): Remove config attributes from the model_kwargs to avoid setting many duplicates

* feat(hub): always do weight layout conversion in CPU memory

* feat(mixtral): enable MOE config conversion from transformers

* feat(parallelism): Enable providing TP/PP/MOE parallelism args

* feat(parallelism): Enable forwarding tp/pp args to trtllm-build

* feat(converter): Introduce base for TRTModelConverter

* Upgrade huggingface-hub dependency to 0.23.0

* feat(hub): Initial refactoring for clear separation of concerns

* feat(hub): Rework the overall separation of concerns for the hub and exporting

* feat(hub): Working for all non-Whisper models

* feat(hub): Disable whisper for now

* feat(trtllm): Update trtllm to 0.10.0

* feat(deps): Pin hf-transfer to 0.1.6

* feat(quant): Rework overall quantization schema

* feat(misc): Fix imports left untouched by a failed name refactoring ...

* feat(hub): Expose device_map="auto"

* feat(chore): quality

* feat(hub): expose device_map to enable auto-parallel

* feat(docker): Use repo variable for image name

* feat(build): Validate new workflow for building engines

* feat(deps): Move to TRTLLM 0.11 pre-release

* feat(deps): Use the new executor api for running LLMs

* feat(kvcache): Use floor when computing the number of tokens to store in the kvcache

* feat(ifb): Enable async generation with in-flight batching support

* feat(misc): Add better return type info for AutoModelForCausalLM

* feat(chore): quality

* feat(misc): Remove padding reference in examples

* Ensure building all the ranks in distributed settings

* Update hub tests
mfuntowicz authored Jul 7, 2024
1 parent 714734f commit 011b5a9
Showing 37 changed files with 1,027 additions and 3,759 deletions.
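
In practice, the bullets above converge on a single new export path: build-time limits move into an ExportConfig object, and device_map="auto" plus an export_config argument replace the old use_fp8/tp/pp keyword arguments. A minimal sketch of that path, mirroring the updated examples/text-generation.py further down (the checkpoint id is only a placeholder, not something this commit prescribes):

from transformers import AutoTokenizer

from optimum.nvidia import AutoModelForCausalLM, ExportConfig

model_id = "meta-llama/Llama-2-7b-chat-hf"  # placeholder checkpoint for illustration

tokenizer = AutoTokenizer.from_pretrained(model_id)
if not tokenizer.pad_token:
    tokenizer.pad_token = tokenizer.eos_token

# Build-time limits now live on an ExportConfig instead of loose kwargs.
export = ExportConfig.from_pretrained(model_id)
export.max_input_len = 1024
export.max_output_len = 256
export.max_num_tokens = 256
export.max_beam_width = 1

# device_map="auto" lets the export spread the engine over the available GPUs.
model = AutoModelForCausalLM.from_pretrained(
    model_id, device_map="auto", export_config=export
)

tokens = tokenizer("What is the latest generation of Nvidia GPUs?", return_tensors="pt")
generated = model.generate(tokens["input_ids"])
print(tokenizer.decode(generated, skip_special_tokens=True))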
88 changes: 88 additions & 0 deletions examples/async-text-generation.py
@@ -0,0 +1,88 @@
# coding=utf-8
# Copyright 2023 The HuggingFace Inc. team. All rights reserved.
# #
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
# #
# http://www.apache.org/licenses/LICENSE-2.0
# #
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import asyncio
from argparse import ArgumentParser
from logging import getLogger
from pathlib import Path

from transformers import AutoTokenizer

from optimum.nvidia import AutoModelForCausalLM, ExportConfig, setup_logging


# Setup logging needs to happen before importing TRT ...
setup_logging(True)

from optimum.nvidia.utils.cli import (
    postprocess_quantization_parameters,
    register_common_model_topology_args,
    register_optimization_profiles_args,
    register_quantization_args,
)


LOGGER = getLogger(__name__)


async def infer():
    tokenizer = AutoTokenizer.from_pretrained(args.model)
    if not tokenizer.pad_token:
        tokenizer.pad_token = tokenizer.eos_token

    export = ExportConfig.from_pretrained(args.model)
    export.max_input_len = 1024
    export.max_output_len = 256
    export.max_num_tokens = 256
    export.max_beam_width = 1

    model = AutoModelForCausalLM.from_pretrained(
        args.model, device_map="auto", export_config=export
    )
    # model.save_pretrained(args.output)

    prompt = "What is the latest generation of Nvidia GPUs?"
    tokens = tokenizer(prompt, return_tensors="pt")
    generated = await model.agenerate(
        tokens["input_ids"],
    )

    generated_text = tokenizer.batch_decode(generated, skip_special_tokens=True)
    print(generated_text)


if __name__ == "__main__":
    parser = ArgumentParser("🤗 Optimum-Nvidia Text-Generation Example")
    parser.add_argument(
        "--hub-token",
        type=str,
        help="Hugging Face Hub Token to retrieve private weights.",
    )
    register_common_model_topology_args(parser)
    register_optimization_profiles_args(parser)
    register_quantization_args(parser)  # Inject params.quantization_config

    parser.add_argument("model", type=str, help="The model's id or path to use.")
    parser.add_argument(
        "output", type=Path, help="Path to store generated TensorRT engine."
    )
    args = parser.parse_args()
    args = postprocess_quantization_parameters(args)

    if args.hub_token is not None:
        from huggingface_hub import login

        login(args.hub_token)

    asyncio.run(infer())
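
The example above issues a single request; the "async generation with in-flight batching" bullet in the commit message suggests several requests can be awaited concurrently. A hedged sketch under that assumption, reusing a model and tokenizer built as in the file above (the helper name and the asyncio.gather scheduling are illustrative, not part of this commit):

import asyncio


async def infer_many(model, tokenizer, prompts):
    # Assumes the executor-backed runtime interleaves concurrent requests
    # (in-flight batching); model and tokenizer are created as in the example above.
    async def one(prompt):
        tokens = tokenizer(prompt, return_tensors="pt")
        generated = await model.agenerate(tokens["input_ids"])
        return tokenizer.batch_decode(generated, skip_special_tokens=True)

    # Schedule all requests at once; the runtime is free to interleave them.
    return await asyncio.gather(*(one(p) for p in prompts))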
29 changes: 14 additions & 15 deletions examples/text-generation.py
@@ -19,7 +19,7 @@
 
 from transformers import AutoTokenizer
 
-from optimum.nvidia import AutoModelForCausalLM, setup_logging
+from optimum.nvidia import AutoModelForCausalLM, ExportConfig, setup_logging
 
 
 # Setup logging needs to happen before importing TRT ...
@@ -59,27 +59,26 @@
 
     login(args.hub_token)
 
-    tokenizer = AutoTokenizer.from_pretrained(args.model, padding_side="left")
+    tokenizer = AutoTokenizer.from_pretrained(args.model)
     if not tokenizer.pad_token:
         tokenizer.pad_token = tokenizer.eos_token
 
-    # Create the model
+    export = ExportConfig.from_pretrained(args.model)
+    export.max_input_len = 1024
+    export.max_output_len = 256
+    export.max_num_tokens = 256
+    export.max_beam_width = 1
+
     model = AutoModelForCausalLM.from_pretrained(
-        args.model, use_fp8=args.fp8, tp=args.tp, pp=args.pp
+        args.model, device_map="auto", export_config=export
     )
-    model.save_pretrained(args.output)
+    # model.save_pretrained(args.output)
 
     prompt = "What is the latest generation of Nvidia GPUs?"
-    tokens = tokenizer(prompt, padding=True, return_tensors="pt")
-    generated, lengths = model.generate(
-        **tokens,
-        top_k=40,
-        top_p=0.95,
-        repetition_penalty=10,
-        pad_token_id=tokenizer.eos_token_id,
-        eos_token_id=tokenizer.eos_token_id,
-        max_new_tokens=args.max_new_tokens,
+    tokens = tokenizer(prompt, return_tensors="pt")
+    generated = model.generate(
+        tokens["input_ids"],
     )
 
-    generated_text = tokenizer.batch_decode(generated, skip_special_tokens=True)
+    generated_text = tokenizer.decode(generated, skip_special_tokens=True)
     print(generated_text)
9 changes: 3 additions & 6 deletions pyproject.toml
@@ -26,26 +26,24 @@ classifiers = [
 dependencies = [
     "accelerate == 0.25",
     "datasets >= 2.14.0",
-    "huggingface-hub >= 0.22.0",
-    "hf-transfer",
+    "huggingface-hub >= 0.23.0",
+    "hf-transfer==0.1.6",
     "mpmath == 1.3.0",
     "numpy >= 1.26.0, < 2.0.0",
     "onnx >= 1.12.0",
     "optimum >= 1.13.0",
     "setuptools",
-    "tensorrt-llm == 0.9.0",
+    "tensorrt-llm > 0.10.0",
     "torch>=2.2.0a,<=2.3.0a",
     "transformers >= 4.38.2",
     "pynvml"
 ]
 
-
 [project.urls]
 Homepage = "https://huggingface.co/hardware/nvidia"
 Repository = "https://github.com/huggingface/optimum-nvidia"
 Issues = "https://github.com/huggingface/optimum-nvidia/issues"
 
-
 # List additional dependencies
 [project.optional-dependencies]
 test = ["mock", "pytest", "pytest-xdist", "psutil", "parameterized", "datasets", "safetensors",]
@@ -86,7 +84,6 @@ skip-magic-trailing-comma = false
 # Like Black, automatically detect the appropriate line ending.
 line-ending = "auto"
 
-
 [tool.pytest.ini_options]
 pythonpath = [
     "src"
6 changes: 3 additions & 3 deletions setup.py
@@ -29,14 +29,14 @@
 INSTALL_REQUIRES = [
     "accelerate == 0.25",
     "datasets >= 2.14",
-    "huggingface-hub >= 0.22.0",
-    "hf-transfer",
+    "huggingface-hub >= 0.23",
+    "hf-transfer==0.1.6",
     "mpmath == 1.3.0",
     "numpy >= 1.26.0",
     "onnx >= 1.12.0",
     "optimum >= 1.13.0",
     "setuptools",
-    "tensorrt-llm == 0.9.0",
+    "tensorrt-llm > 0.10.0",
     "torch>=2.2.0a,<=2.3.0a",
     "transformers >= 4.38.2",
     "pynvml"
9 changes: 7 additions & 2 deletions src/optimum/nvidia/__init__.py
@@ -13,8 +13,13 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-from .config import TensorRTConfig
+LIBRARY_NAME = "trtllm"
+
+
+from .export import ExportConfig
 from .logging import DEFAULT_LOGGING_FMT, setup_logging
 from .models import AutoModelForCausalLM
-from .pipelines import pipeline
+from .optimizations import IntoModelOptQuantizeConfig
+
+# from .pipelines import pipeline
 from .version import VERSION, __version__
17 changes: 0 additions & 17 deletions src/optimum/nvidia/builder/__init__.py

This file was deleted.

