Refactor the overall Hugging Face -> TRTLLM export workflow (#133)
* Initial mixtral support

* feat(mixtral): map the correct num_local_experts config key for the MOE config

* feat(mixtral): allow specifying TP/PP configurations when allocating the model

* feat(mixtral): Expose tp/pp in examples/cli

* feat(mixtral): Remove config attributes from the model_kwargs to avoid setting many duplicates

* feat(hub): always do weight layout conversion in CPU memory

* feat(mixtral): enable MOE config conversion from transformers

* feat(parallelism): Enable providing TP/PP/MOE parallelism args

* feat(parallelism): Enable forwarding tp/pp args to trtllm-build

* feat(converter): Introduce base for TRTModelConverter

* Upgrade huggingface-hub dependency to 0.23.0

* feat(hub): Initial refactoring for clear separation of concerns

* feat(hub): Rework the overall separation of concerns for the hub and exporting

* feat(hub): Working for all non-Whisper models

* feat(hub): Disable whisper for now

* feat(trtllm): Update trtllm to 0.10.0

* feat(deps): Pin hf-transfer to 0.1.6

* feat(quant): Rework overall quantization schema

* feat(misc): Fix imports left untouched by a failed name refactoring ...

* feat(hub): Expose device_map="auto"

* feat(chore): quality

* feat(hub): expose device_map to enable auto-parallel

* feat(docker): Use repo variable for image name

* feat(build): Validate new workflow for building engines

* feat(deps): Move to TRTLLM 0.11 pre-release

* feat(deps): Use the new executor api for running LLMs

* feat(kvcache): Use floor when computing the number of tokens to store in the kvcache

* feat(ifb): Enable async generation with in-flight batching support

* feat(misc): Add better return type info for AutoModelForCausalLM

* feat(chore): quality

* feat(misc): Remove padding reference in examples

* Ensure building all the ranks in distributed settings

* Update hub tests
mfuntowicz authored Jul 7, 2024
1 parent 714734f commit 011b5a9
Showing 37 changed files with 1,027 additions and 3,759 deletions.
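
In practice, the bullets above converge on a single new export path: build-time limits move into an ExportConfig object, and device_map="auto" plus an export_config argument replace the old use_fp8/tp/pp keyword arguments. A minimal sketch of that path, mirroring the updated examples/text-generation.py further down (the checkpoint id is only a placeholder, not something this commit prescribes):

from transformers import AutoTokenizer

from optimum.nvidia import AutoModelForCausalLM, ExportConfig

model_id = "meta-llama/Llama-2-7b-chat-hf"  # placeholder checkpoint for illustration

tokenizer = AutoTokenizer.from_pretrained(model_id)
if not tokenizer.pad_token:
    tokenizer.pad_token = tokenizer.eos_token

# Build-time limits now live on an ExportConfig instead of loose kwargs.
export = ExportConfig.from_pretrained(model_id)
export.max_input_len = 1024
export.max_output_len = 256
export.max_num_tokens = 256
export.max_beam_width = 1

# device_map="auto" lets the export spread the engine over the available GPUs.
model = AutoModelForCausalLM.from_pretrained(
    model_id, device_map="auto", export_config=export
)

tokens = tokenizer("What is the latest generation of Nvidia GPUs?", return_tensors="pt")
generated = model.generate(tokens["input_ids"])
print(tokenizer.decode(generated, skip_special_tokens=True))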
88 changes: 88 additions & 0 deletions examples/async-text-generation.py
@@ -0,0 +1,88 @@
# coding=utf-8
# Copyright 2023 The HuggingFace Inc. team. All rights reserved.
# #
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
# #
# http://www.apache.org/licenses/LICENSE-2.0
# #
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import asyncio
from argparse import ArgumentParser
from logging import getLogger
from pathlib import Path

from transformers import AutoTokenizer

from optimum.nvidia import AutoModelForCausalLM, ExportConfig, setup_logging


# Setup logging needs to happen before importing TRT ...
setup_logging(True)

from optimum.nvidia.utils.cli import (
    postprocess_quantization_parameters,
    register_common_model_topology_args,
    register_optimization_profiles_args,
    register_quantization_args,
)


LOGGER = getLogger(__name__)


async def infer():
    tokenizer = AutoTokenizer.from_pretrained(args.model)
    if not tokenizer.pad_token:
        tokenizer.pad_token = tokenizer.eos_token

    export = ExportConfig.from_pretrained(args.model)
    export.max_input_len = 1024
    export.max_output_len = 256
    export.max_num_tokens = 256
    export.max_beam_width = 1

    model = AutoModelForCausalLM.from_pretrained(
        args.model, device_map="auto", export_config=export
    )
    # model.save_pretrained(args.output)

    prompt = "What is the latest generation of Nvidia GPUs?"
    tokens = tokenizer(prompt, return_tensors="pt")
    generated = await model.agenerate(
        tokens["input_ids"],
    )

    generated_text = tokenizer.batch_decode(generated, skip_special_tokens=True)
    print(generated_text)


if __name__ == "__main__":
    parser = ArgumentParser("🤗 Optimum-Nvidia Text-Generation Example")
    parser.add_argument(
        "--hub-token",
        type=str,
        help="Hugging Face Hub Token to retrieve private weights.",
    )
    register_common_model_topology_args(parser)
    register_optimization_profiles_args(parser)
    register_quantization_args(parser)  # Inject params.quantization_config

    parser.add_argument("model", type=str, help="The model's id or path to use.")
    parser.add_argument(
        "output", type=Path, help="Path to store generated TensorRT engine."
    )
    args = parser.parse_args()
    args = postprocess_quantization_parameters(args)

    if args.hub_token is not None:
        from huggingface_hub import login

        login(args.hub_token)

    asyncio.run(infer())
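
The example above issues a single request; the "async generation with in-flight batching" bullet in the commit message suggests several requests can be awaited concurrently. A hedged sketch under that assumption, reusing a model and tokenizer built as in the file above (the helper name and the asyncio.gather scheduling are illustrative, not part of this commit):

import asyncio


async def infer_many(model, tokenizer, prompts):
    # Assumes the executor-backed runtime interleaves concurrent requests
    # (in-flight batching); model and tokenizer are created as in the example above.
    async def one(prompt):
        tokens = tokenizer(prompt, return_tensors="pt")
        generated = await model.agenerate(tokens["input_ids"])
        return tokenizer.batch_decode(generated, skip_special_tokens=True)

    # Schedule all requests at once; the runtime is free to interleave them.
    return await asyncio.gather(*(one(p) for p in prompts))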
29 changes: 14 additions & 15 deletions examples/text-generation.py
@@ -19,7 +19,7 @@
 
 from transformers import AutoTokenizer
 
-from optimum.nvidia import AutoModelForCausalLM, setup_logging
+from optimum.nvidia import AutoModelForCausalLM, ExportConfig, setup_logging
 
 
 # Setup logging needs to happen before importing TRT ...
@@ -59,27 +59,26 @@
 
     login(args.hub_token)
 
-    tokenizer = AutoTokenizer.from_pretrained(args.model, padding_side="left")
+    tokenizer = AutoTokenizer.from_pretrained(args.model)
     if not tokenizer.pad_token:
         tokenizer.pad_token = tokenizer.eos_token
 
-    # Create the model
+    export = ExportConfig.from_pretrained(args.model)
+    export.max_input_len = 1024
+    export.max_output_len = 256
+    export.max_num_tokens = 256
+    export.max_beam_width = 1
+
     model = AutoModelForCausalLM.from_pretrained(
-        args.model, use_fp8=args.fp8, tp=args.tp, pp=args.pp
+        args.model, device_map="auto", export_config=export
     )
-    model.save_pretrained(args.output)
+    # model.save_pretrained(args.output)
 
     prompt = "What is the latest generation of Nvidia GPUs?"
-    tokens = tokenizer(prompt, padding=True, return_tensors="pt")
-    generated, lengths = model.generate(
-        **tokens,
-        top_k=40,
-        top_p=0.95,
-        repetition_penalty=10,
-        pad_token_id=tokenizer.eos_token_id,
-        eos_token_id=tokenizer.eos_token_id,
-        max_new_tokens=args.max_new_tokens,
+    tokens = tokenizer(prompt, return_tensors="pt")
+    generated = model.generate(
+        tokens["input_ids"],
     )
 
-    generated_text = tokenizer.batch_decode(generated, skip_special_tokens=True)
+    generated_text = tokenizer.decode(generated, skip_special_tokens=True)
     print(generated_text)
9 changes: 3 additions & 6 deletions pyproject.toml
@@ -26,26 +26,24 @@ classifiers = [
 dependencies = [
     "accelerate == 0.25",
     "datasets >= 2.14.0",
-    "huggingface-hub >= 0.22.0",
-    "hf-transfer",
+    "huggingface-hub >= 0.23.0",
+    "hf-transfer==0.1.6",
     "mpmath == 1.3.0",
     "numpy >= 1.26.0, < 2.0.0",
     "onnx >= 1.12.0",
     "optimum >= 1.13.0",
     "setuptools",
-    "tensorrt-llm == 0.9.0",
+    "tensorrt-llm > 0.10.0",
     "torch>=2.2.0a,<=2.3.0a",
     "transformers >= 4.38.2",
     "pynvml"
 ]
 
-
 [project.urls]
 Homepage = "https://huggingface.co/hardware/nvidia"
 Repository = "https://github.com/huggingface/optimum-nvidia"
 Issues = "https://github.com/huggingface/optimum-nvidia/issues"
 
-
 # List additional dependencies
 [project.optional-dependencies]
 test = ["mock", "pytest", "pytest-xdist", "psutil", "parameterized", "datasets", "safetensors",]
@@ -86,7 +84,6 @@ skip-magic-trailing-comma = false
 # Like Black, automatically detect the appropriate line ending.
 line-ending = "auto"
 
-
 [tool.pytest.ini_options]
 pythonpath = [
     "src"
6 changes: 3 additions & 3 deletions setup.py
@@ -29,14 +29,14 @@
 INSTALL_REQUIRES = [
     "accelerate == 0.25",
     "datasets >= 2.14",
-    "huggingface-hub >= 0.22.0",
-    "hf-transfer",
+    "huggingface-hub >= 0.23",
+    "hf-transfer==0.1.6",
     "mpmath == 1.3.0",
     "numpy >= 1.26.0",
     "onnx >= 1.12.0",
     "optimum >= 1.13.0",
     "setuptools",
-    "tensorrt-llm == 0.9.0",
+    "tensorrt-llm > 0.10.0",
     "torch>=2.2.0a,<=2.3.0a",
     "transformers >= 4.38.2",
     "pynvml"
9 changes: 7 additions & 2 deletions src/optimum/nvidia/__init__.py
@@ -13,8 +13,13 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-from .config import TensorRTConfig
+LIBRARY_NAME = "trtllm"
+
+
+from .export import ExportConfig
 from .logging import DEFAULT_LOGGING_FMT, setup_logging
 from .models import AutoModelForCausalLM
-from .pipelines import pipeline
+from .optimizations import IntoModelOptQuantizeConfig
+
+# from .pipelines import pipeline
 from .version import VERSION, __version__
17 changes: 0 additions & 17 deletions src/optimum/nvidia/builder/__init__.py

This file was deleted.

