Add no-code export infrastructure through the CLI #148

Closed
wants to merge 10 commits
12 changes: 8 additions & 4 deletions pyproject.toml
@@ -31,14 +31,18 @@ dependencies = [
"mpmath == 1.3.0",
"numpy >= 1.26.0, < 2.0.0",
"onnx >= 1.12.0",
"optimum >= 1.13.0",
"optimum >= 1.21.0",
"setuptools",
"tensorrt-llm == 0.12.0.dev2024072300",
"torch>=2.3.0a,<=2.4.0a",
"transformers >= 4.38.2",
"tensorrt-llm == 0.13.0.dev2024082700",
"torch>=2.4.0a,<=2.5.0a",
# "transformers >= 4.43.2",
"pynvml"
]

[project.scripts]
optimum-cli="optimum.commands.optimum_cli:main"


[project.urls]
Homepage = "https://huggingface.co/hardware/nvidia"
Repository = "https://github.com/huggingface/optimum-nvidia"
13 changes: 9 additions & 4 deletions setup.py
@@ -34,11 +34,11 @@
"mpmath == 1.3.0",
"numpy >= 1.26.0",
"onnx >= 1.12.0",
"optimum >= 1.13.0",
"optimum >= 1.21.0",
"setuptools",
"tensorrt-llm == 0.12.0.dev2024072300",
"torch>=2.3.0a,<=2.4.0a",
"transformers >= 4.38.2",
"tensorrt-llm == 0.13.0.dev2024082700",
"torch>=2.3.0a,<=2.5.0a",
"transformers >= 4.43.2",
"pynvml"
]

@@ -98,4 +98,9 @@
dependency_links=["https://pypi.nvidia.com"],
include_package_data=True,
zip_safe=False,
entry_points={
"console_scripts": [
"optimum-cli=optimum.commands.optimum_cli:main",
]
},
)
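Note: the entry point declared in both pyproject.toml and setup.py simply exposes `optimum.commands.optimum_cli:main` as the `optimum-cli` executable. A minimal sketch of the equivalent programmatic invocation, assuming `optimum` (and, for the commands added in this PR, `optimum-nvidia` with TensorRT-LLM) is installed; the argument list is a placeholder:

```python
# Equivalent of running the `optimum-cli` console script declared above.
import sys

from optimum.commands.optimum_cli import main

# Placeholder command line; the parser reads sys.argv like any argparse-based CLI.
sys.argv = ["optimum-cli", "env"]
main()
```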
57 changes: 57 additions & 0 deletions src/optimum/commands/env.py
@@ -0,0 +1,57 @@
import platform
import subprocess

import huggingface_hub
from tensorrt import __version__ as trt_version
from tensorrt_llm import __version__ as trtllm_version
from transformers import __version__ as transformers_version
from transformers.utils import is_torch_available

from ..nvidia.version import __version__ as optimum_nvidia_version
from ..version import __version__ as optimum_version
from . import BaseOptimumCLICommand, CommandInfo


class EnvironmentCommand(BaseOptimumCLICommand):
COMMAND = CommandInfo(
name="env", help="Get information about the environment used."
)

@staticmethod
def print_apt_pkgs():
apt = subprocess.Popen(["apt", "list", "--installed"], stdout=subprocess.PIPE)
grep = subprocess.Popen(
["grep", "cuda"], stdin=apt.stdout, stdout=subprocess.PIPE
)
pkgs_list = list(grep.stdout)
for pkg in pkgs_list:
print(pkg.decode("utf-8").split("\n")[0])

def run(self):
pt_version = "not installed"
if is_torch_available():
import torch

pt_version = torch.__version__

platform_info = {
"Platform": platform.platform(),
"Python version": platform.python_version(),
}
info = {
"`optimum-neuron` version": optimum_nvidia_version,
"`tensorrt` version": trt_version,
"`tensorrt-llm` version": trtllm_version,
"`optimum` version": optimum_version,
"`transformers` version": transformers_version,
"`huggingface_hub` version": huggingface_hub.__version__,
"`torch` version": f"{pt_version}",
}

print("\nCopy-and-paste the text below in your GitHub issue:\n")
print("\nPlatform:\n")
print(self.format_dict(platform_info))
print("\nPython packages:\n")
print(self.format_dict(info))
print("\nCUDA system packages:\n")
self.print_apt_pkgs()
41 changes: 41 additions & 0 deletions src/optimum/commands/export/trtllm.py
@@ -0,0 +1,41 @@
import subprocess
import sys
from typing import TYPE_CHECKING, Optional

from ..base import BaseOptimumCLICommand, CommandInfo
from ...nvidia.export.cli import common_trtllm_export_args

if TYPE_CHECKING:
from argparse import ArgumentParser, Namespace, _SubParsersAction



class TrtLlmExportCommand(BaseOptimumCLICommand):
COMMAND = CommandInfo(
name="trtllm", help="Export PyTorch models to TensorRT-LLM compiled engines"
)

def __init__(
self,
subparsers: "_SubParsersAction",
args: Optional["Namespace"] = None,
command: Optional["CommandInfo"] = None,
from_defaults_factory: bool = False,
parser: Optional["ArgumentParser"] = None,
):
super().__init__(
subparsers,
args=args,
command=command,
from_defaults_factory=from_defaults_factory,
parser=parser,
)
self.args_string = " ".join(sys.argv[3:])

@staticmethod
def parse_args(parser: "ArgumentParser"):
return common_trtllm_export_args(parser)

def run(self):
full_command = f"python3 -m optimum.exporters.trtllm {self.args_string}"
subprocess.run(full_command, shell=True, check=True)
13 changes: 13 additions & 0 deletions src/optimum/commands/register/register_export.py
@@ -0,0 +1,13 @@
"""Registers the export command for TRTLLM to the Optimum CLI."""

from ...nvidia.utils.import_utils import is_tensorrt_llm_available
from ..export import ExportCommand


if _tensorrt_llm_export_command_was_imported := is_tensorrt_llm_available():
from ..export.trtllm import TrtLlmExportCommand # noqa: F811

if _tensorrt_llm_export_command_was_imported:
REGISTER_COMMANDS = [(TrtLlmExportCommand, ExportCommand)]
else:
REGISTER_COMMANDS = []
26 changes: 26 additions & 0 deletions src/optimum/exporters/trtllm/main.py
@@ -0,0 +1,26 @@
from argparse import ArgumentParser

from huggingface_hub import login

from optimum.nvidia import AutoModelForCausalLM
from optimum.nvidia.export import ExportConfig
from optimum.nvidia.export.cli import common_trtllm_export_args

from transformers import AutoConfig

if __name__ == '__main__':
parser = ArgumentParser("Hugging Face Optimum TensorRT-LLM exporter")
common_trtllm_export_args(parser)
parser.add_argument("--push-to-hub", type=str, required=False, help="Repository id where to push engines")
parser.add_argument("destination", help="Local path where the generated engines will be saved")
args = parser.parse_args()

config = AutoConfig.from_pretrained(args.model)
export = ExportConfig.from_config(config, args.max_batch_size)
model = AutoModelForCausalLM.from_pretrained(args.model, export_config=export, export_only=True)
model.save_pretrained(args.destination)

if args.push_to_hub:
print(f"Exporting model to the Hugging Face Hub: {args.push_to_hub}")
model.push_to_hub(args.push_to_hub, commit_message=f"Optimum-CLI TensorRT-LLM {args.model} export")
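Note: this module is what the new `optimum-cli export trtllm` command shells out to (see `TrtLlmExportCommand.run`). The sketch below shows the same flow used programmatically; the model id and output directory are placeholders, and reloading the saved engines with `from_pretrained` is an assumption about the existing optimum-nvidia API rather than something added by this PR:

```python
# Minimal sketch of the export flow implemented by optimum/exporters/trtllm/main.py.
from transformers import AutoConfig

from optimum.nvidia import AutoModelForCausalLM
from optimum.nvidia.export import ExportConfig

model_id = "meta-llama/Llama-2-7b-hf"   # placeholder model id
destination = "./llama-2-7b-trtllm"     # placeholder output directory

# Derive the TensorRT-LLM export configuration from the transformers config
# (the second positional argument is the maximum batch size, as in main.py).
config = AutoConfig.from_pretrained(model_id)
export = ExportConfig.from_config(config, 8)

# Build the engines without loading them for inference, then persist them locally.
model = AutoModelForCausalLM.from_pretrained(model_id, export_config=export, export_only=True)
model.save_pretrained(destination)

# Assumption: the exported engines can later be reloaded from the local folder.
# reloaded = AutoModelForCausalLM.from_pretrained(destination)
```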

43 changes: 43 additions & 0 deletions src/optimum/nvidia/export/cli.py
@@ -0,0 +1,43 @@
def common_trtllm_export_args(parser: "ArgumentParser"):
required_group = parser.add_argument_group("Required arguments")
required_group.add_argument(
"-m",
"--model",
type=str,
required=True,
help="Model ID on huggingface.co or path on disk to load model from.",
)
required_group.add_argument(
"--max-input-length",
type=int,
default=1,
help="Maximum sequence length, in number of tokens, the prompt can be. The maximum number of potential tokens "
"generated will be <max-output-length> - <max-input-length>.",
)
required_group.add_argument(
"--max-output-length",
type=int,
default=1,
help="Maximum sequence length, in number of tokens, the model supports.",
)

optional_group = parser.add_argument_group("Optional arguments")
optional_group.add_argument(
"-d",
"--dtype",
type=str,
default="auto",
help="Computational data type used for the model.",
)
optional_group.add_argument(
"--max-batch-size",
type=int,
default=1,
help="Maximum number of concurrent requests the model can process.",
)
optional_group.add_argument(
"--max-beams-width",
type=int,
default=1,
help='Maximum number of sampling paths ("beams") to evaluate when decoding a new token.',
)
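For a quick sanity check of the argument set above, here is a small, self-contained sketch (not part of the PR) that wires `common_trtllm_export_args` into a fresh parser and parses an illustrative command line; all values are placeholders:

```python
# Sketch: exercising common_trtllm_export_args with a standalone ArgumentParser.
from argparse import ArgumentParser

from optimum.nvidia.export.cli import common_trtllm_export_args

parser = ArgumentParser("trtllm-export-args-demo")
common_trtllm_export_args(parser)

# Placeholder values, chosen only to illustrate the resulting namespace.
args = parser.parse_args(
    [
        "--model", "meta-llama/Llama-2-7b-hf",
        "--max-input-length", "1024",
        "--max-output-length", "2048",
        "--dtype", "float16",
        "--max-batch-size", "8",
        "--max-beams-width", "1",
    ]
)

print(args.model, args.max_input_length, args.max_output_length, args.dtype)
```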
20 changes: 14 additions & 6 deletions src/optimum/nvidia/export/config.py
@@ -7,6 +7,7 @@
from tensorrt_llm import BuildConfig
from tensorrt_llm import Mapping as ShardingInfo
from tensorrt_llm.plugin import PluginConfig
from tensorrt_llm.plugin.plugin import ContextFMHAType
from transformers import AutoConfig

from optimum.nvidia.lang import DataType
@@ -94,12 +95,18 @@ def validate(self) -> "ExportConfig":
@property
def plugin_config(self) -> "PluginConfig":
config = PluginConfig()
config.gemm_plugin = self.dtype
config.bert_attention_plugin = self.dtype
config.gpt_attention_plugin = self.dtype
config.nccl_plugin = self.dtype
config.mamba_conv1d_plugin = self.dtype
config.moe_plugin = self.dtype

config.gemm_plugin = "auto"
config.gpt_attention_plugin = "auto"
config.set_context_fmha(ContextFMHAType.enabled)

if self.sharding.world_size > 1:
config.lookup_plugin = "auto"
config.set_nccl_plugin()

if DataType(self.dtype) == DataType.FLOAT8:
config.gemm_swiglu_plugin = True

return config

def to_builder_config(
@@ -115,6 +122,7 @@ def to_builder_config(
max_num_tokens=self.max_num_tokens,
builder_opt=self.optimization_level,
plugin_config=plugin_config or self.plugin_config,
use_fused_mlp=True,
)

def with_sharding(
25 changes: 23 additions & 2 deletions src/optimum/nvidia/export/converter.py
@@ -1,3 +1,4 @@
import shutil
from abc import ABC
from enum import Enum
from logging import getLogger
@@ -66,6 +67,7 @@
model_id: str,
subpart: str = "",
workspace: Optional[Union["Workspace", str, bytes, Path]] = None,
license_path: Optional[Union[str, bytes, Path]] = None,
):
LOGGER.info(f"Creating a model converter for {subpart}")
if not workspace:
@@ -80,11 +82,26 @@
LOGGER.debug(f"Initializing model converter workspace at {workspace.root}")

self._workspace = workspace
self._license_path = license_path

@property
def workspace(self) -> Workspace:
return self._workspace

def save_license(self, licence_filename: str = "LICENSE"):
"""
Save the license if provided and if the license is not already present.
This method doesn't check the content of the license
:param licence_filename: Name of the file containing the license content
"""
if (
not (
dst_licence_file_path := self.workspace.root / licence_filename
).exists()
and self._license_path
):
shutil.copyfile(self._license_path, dst_licence_file_path)

def quantize(self):
raise NotImplementedError()

@@ -108,6 +125,7 @@ def convert(
)
model.save_checkpoint(str(self._workspace.checkpoints_path))

self.save_license()
return TensorRTArtifact.checkpoints(str(self._workspace.checkpoints_path))

def build(
@@ -126,10 +144,13 @@

config = infer_plugin_from_build_config(config)

for rank, model in enumerate(models):
LOGGER.info(f"Building TRTLLM engine for rank {rank}")
for model in models:
LOGGER.info(
f"Building TRTLLM engine for rank {model.config.mapping.rank} ->> {config.to_dict()}"
)

engine = build(model, config)
engine.save(str(self._workspace.engines_path))

self.save_license()
return TensorRTArtifact.engines(str(self._workspace.engines_path))