Add no-code export infrastructure through the CLI #148

Closed
wants to merge 10 commits
12 changes: 8 additions & 4 deletions pyproject.toml
@@ -31,14 +31,18 @@ dependencies = [
"mpmath == 1.3.0",
"numpy >= 1.26.0, < 2.0.0",
"onnx >= 1.12.0",
"optimum >= 1.13.0",
"optimum >= 1.21.0",
"setuptools",
"tensorrt-llm == 0.12.0.dev2024072300",
"torch>=2.3.0a,<=2.4.0a",
"transformers >= 4.38.2",
"tensorrt-llm == 0.13.0.dev2024082700",
"torch>=2.4.0a,<=2.5.0a",
# "transformers >= 4.43.2",
"pynvml"
]

[project.scripts]
optimum-cli="optimum.commands.optimum_cli:main"


[project.urls]
Homepage = "https://huggingface.co/hardware/nvidia"
Repository = "https://github.com/huggingface/optimum-nvidia"
13 changes: 9 additions & 4 deletions setup.py
@@ -34,11 +34,11 @@
"mpmath == 1.3.0",
"numpy >= 1.26.0",
"onnx >= 1.12.0",
"optimum >= 1.13.0",
"optimum >= 1.21.0",
"setuptools",
"tensorrt-llm == 0.12.0.dev2024072300",
"torch>=2.3.0a,<=2.4.0a",
"transformers >= 4.38.2",
"tensorrt-llm == 0.13.0.dev2024082700",
"torch>=2.3.0a,<=2.5.0a",
"transformers >= 4.43.2",
"pynvml"
]

@@ -98,4 +98,9 @@
dependency_links=["https://pypi.nvidia.com"],
include_package_data=True,
zip_safe=False,
entry_points={
"console_scripts": [
"optimum-cli=optimum.commands.optimum_cli:main",
]
},
)
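Note: the entry point declared in both pyproject.toml and setup.py simply exposes `optimum.commands.optimum_cli:main` as the `optimum-cli` executable. A minimal sketch of the equivalent programmatic invocation, assuming `optimum` (and, for the commands added in this PR, `optimum-nvidia` with TensorRT-LLM) is installed; the argument list is a placeholder:

```python
# Equivalent of running the `optimum-cli` console script declared above.
import sys

from optimum.commands.optimum_cli import main

# Placeholder command line; the parser reads sys.argv like any argparse-based CLI.
sys.argv = ["optimum-cli", "env"]
main()
```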
57 changes: 57 additions & 0 deletions src/optimum/commands/env.py
@@ -0,0 +1,57 @@
import platform
import subprocess

import huggingface_hub
from tensorrt import __version__ as trt_version
from tensorrt_llm import __version__ as trtllm_version
from transformers import __version__ as transformers_version
from transformers.utils import is_torch_available

from ..nvidia.version import __version__ as optimum_nvidia_version
from ..version import __version__ as optimum_version
from . import BaseOptimumCLICommand, CommandInfo


class EnvironmentCommand(BaseOptimumCLICommand):
COMMAND = CommandInfo(
name="env", help="Get information about the environment used."
)

@staticmethod
def print_apt_pkgs():
apt = subprocess.Popen(["apt", "list", "--installed"], stdout=subprocess.PIPE)
grep = subprocess.Popen(
["grep", "cuda"], stdin=apt.stdout, stdout=subprocess.PIPE
)
pkgs_list = list(grep.stdout)
for pkg in pkgs_list:
print(pkg.decode("utf-8").split("\n")[0])

def run(self):
pt_version = "not installed"
if is_torch_available():
import torch

pt_version = torch.__version__

platform_info = {
"Platform": platform.platform(),
"Python version": platform.python_version(),
}
info = {
"`optimum-neuron` version": optimum_nvidia_version,
"`tensorrt` version": trt_version,
"`tensorrt-llm` version": trtllm_version,
"`optimum` version": optimum_version,
"`transformers` version": transformers_version,
"`huggingface_hub` version": huggingface_hub.__version__,
"`torch` version": f"{pt_version}",
}

print("\nCopy-and-paste the text below in your GitHub issue:\n")
print("\nPlatform:\n")
print(self.format_dict(platform_info))
print("\nPython packages:\n")
print(self.format_dict(info))
print("\nCUDA system packages:\n")
self.print_apt_pkgs()
41 changes: 41 additions & 0 deletions src/optimum/commands/export/trtllm.py
@@ -0,0 +1,41 @@
import subprocess
import sys
from typing import TYPE_CHECKING, Optional

from ..base import BaseOptimumCLICommand, CommandInfo
from ...nvidia.export.cli import common_trtllm_export_args

if TYPE_CHECKING:
from argparse import ArgumentParser, Namespace, _SubParsersAction



class TrtLlmExportCommand(BaseOptimumCLICommand):
COMMAND = CommandInfo(
name="trtllm", help="Export PyTorch models to TensorRT-LLM compiled engines"
)

def __init__(
self,
subparsers: "_SubParsersAction",
args: Optional["Namespace"] = None,
command: Optional["CommandInfo"] = None,
from_defaults_factory: bool = False,
parser: Optional["ArgumentParser"] = None,
):
super().__init__(
subparsers,
args=args,
command=command,
from_defaults_factory=from_defaults_factory,
parser=parser,
)
self.args_string = " ".join(sys.argv[3:])

@staticmethod
def parse_args(parser: "ArgumentParser"):
return common_trtllm_export_args(parser)

def run(self):
full_command = f"python3 -m optimum.exporters.trtllm {self.args_string}"
subprocess.run(full_command, shell=True, check=True)
13 changes: 13 additions & 0 deletions src/optimum/commands/register/register_export.py
@@ -0,0 +1,13 @@
"""Registers the export command for TRTLLM to the Optimum CLI."""

from ...nvidia.utils.import_utils import is_tensorrt_llm_available
from ..export import ExportCommand


if _tensorrt_llm_export_command_was_imported := is_tensorrt_llm_available():
from ..export.trtllm import TrtLlmExportCommand # noqa: F811

if _tensorrt_llm_export_command_was_imported:
REGISTER_COMMANDS = [(TrtLlmExportCommand, ExportCommand)]
else:
REGISTER_COMMANDS = []
26 changes: 26 additions & 0 deletions src/optimum/exporters/trtllm/main.py
@@ -0,0 +1,26 @@
from argparse import ArgumentParser

from huggingface_hub import login

from optimum.nvidia import AutoModelForCausalLM
from optimum.nvidia.export import ExportConfig
from optimum.nvidia.export.cli import common_trtllm_export_args

from transformers import AutoConfig

if __name__ == '__main__':
parser = ArgumentParser("Hugging Face Optimum TensorRT-LLM exporter")
common_trtllm_export_args(parser)
parser.add_argument("--push-to-hub", type=str, required=False, help="Repository id where to push engines")
parser.add_argument("destination", help="Local path where the generated engines will be saved")
args = parser.parse_args()

config = AutoConfig.from_pretrained(args.model)
export = ExportConfig.from_config(config, args.max_batch_size)
model = AutoModelForCausalLM.from_pretrained(args.model, export_config=export, export_only=True)
model.save_pretrained(args.destination)

if args.push_to_hub:
print(f"Exporting model to the Hugging Face Hub: {args.push_to_hub}")
model.push_to_hub(args.push_to_hub, commit_message=f"Optimum-CLI TensorRT-LLM {args.model} export")
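Note: this module is what the new `optimum-cli export trtllm` command shells out to (see `TrtLlmExportCommand.run`). The sketch below shows the same flow used programmatically; the model id and output directory are placeholders, and reloading the saved engines with `from_pretrained` is an assumption about the existing optimum-nvidia API rather than something added by this PR:

```python
# Minimal sketch of the export flow implemented by optimum/exporters/trtllm/main.py.
from transformers import AutoConfig

from optimum.nvidia import AutoModelForCausalLM
from optimum.nvidia.export import ExportConfig

model_id = "meta-llama/Llama-2-7b-hf"   # placeholder model id
destination = "./llama-2-7b-trtllm"     # placeholder output directory

# Derive the TensorRT-LLM export configuration from the transformers config
# (the second positional argument is the maximum batch size, as in main.py).
config = AutoConfig.from_pretrained(model_id)
export = ExportConfig.from_config(config, 8)

# Build the engines without loading them for inference, then persist them locally.
model = AutoModelForCausalLM.from_pretrained(model_id, export_config=export, export_only=True)
model.save_pretrained(destination)

# Assumption: the exported engines can later be reloaded from the local folder.
# reloaded = AutoModelForCausalLM.from_pretrained(destination)
```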

43 changes: 43 additions & 0 deletions src/optimum/nvidia/export/cli.py
@@ -0,0 +1,43 @@
def common_trtllm_export_args(parser: "ArgumentParser"):
required_group = parser.add_argument_group("Required arguments")
required_group.add_argument(
"-m",
"--model",
type=str,
required=True,
help="Model ID on huggingface.co or path on disk to load model from.",
)
required_group.add_argument(
"--max-input-length",
type=int,
default=1,
help="Maximum sequence length, in number of tokens, the prompt can be. The maximum number of potential tokens "
"generated will be <max-output-length> - <max-input-length>.",
)
required_group.add_argument(
"--max-output-length",
type=int,
default=1,
help="Maximum sequence length, in number of tokens, the model supports.",
)

optional_group = parser.add_argument_group("Optional arguments")
optional_group.add_argument(
"-d",
"--dtype",
type=str,
default="auto",
help="Computational data type used for the model.",
)
optional_group.add_argument(
"--max-batch-size",
type=int,
default=1,
help="Maximum number of concurrent requests the model can process.",
)
optional_group.add_argument(
"--max-beams-width",
type=int,
default=1,
help='Maximum number of sampling paths ("beams") to evaluate when decoding a new token.',
)
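For a quick sanity check of the argument set above, here is a small, self-contained sketch (not part of the PR) that wires `common_trtllm_export_args` into a fresh parser and parses an illustrative command line; all values are placeholders:

```python
# Sketch: exercising common_trtllm_export_args with a standalone ArgumentParser.
from argparse import ArgumentParser

from optimum.nvidia.export.cli import common_trtllm_export_args

parser = ArgumentParser("trtllm-export-args-demo")
common_trtllm_export_args(parser)

# Placeholder values, chosen only to illustrate the resulting namespace.
args = parser.parse_args(
    [
        "--model", "meta-llama/Llama-2-7b-hf",
        "--max-input-length", "1024",
        "--max-output-length", "2048",
        "--dtype", "float16",
        "--max-batch-size", "8",
        "--max-beams-width", "1",
    ]
)

print(args.model, args.max_input_length, args.max_output_length, args.dtype)
```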
20 changes: 14 additions & 6 deletions src/optimum/nvidia/export/config.py
@@ -7,6 +7,7 @@
from tensorrt_llm import BuildConfig
from tensorrt_llm import Mapping as ShardingInfo
from tensorrt_llm.plugin import PluginConfig
from tensorrt_llm.plugin.plugin import ContextFMHAType
from transformers import AutoConfig

from optimum.nvidia.lang import DataType
@@ -94,12 +95,18 @@ def validate(self) -> "ExportConfig":
@property
def plugin_config(self) -> "PluginConfig":
config = PluginConfig()
config.gemm_plugin = self.dtype
config.bert_attention_plugin = self.dtype
config.gpt_attention_plugin = self.dtype
config.nccl_plugin = self.dtype
config.mamba_conv1d_plugin = self.dtype
config.moe_plugin = self.dtype

config.gemm_plugin = "auto"
config.gpt_attention_plugin = "auto"
config.set_context_fmha(ContextFMHAType.enabled)

if self.sharding.world_size > 1:
config.lookup_plugin = "auto"
config.set_nccl_plugin()

if DataType(self.dtype) == DataType.FLOAT8:
config.gemm_swiglu_plugin = True

return config

def to_builder_config(
@@ -115,6 +122,7 @@ def to_builder_config(
max_num_tokens=self.max_num_tokens,
builder_opt=self.optimization_level,
plugin_config=plugin_config or self.plugin_config,
use_fused_mlp=True,
)

def with_sharding(
25 changes: 23 additions & 2 deletions src/optimum/nvidia/export/converter.py
@@ -1,3 +1,4 @@
import shutil
from abc import ABC
from enum import Enum
from logging import getLogger
@@ -66,6 +67,7 @@
model_id: str,
subpart: str = "",
workspace: Optional[Union["Workspace", str, bytes, Path]] = None,
license_path: Optional[Union[str, bytes, Path]] = None,
):
LOGGER.info(f"Creating a model converter for {subpart}")
if not workspace:
@@ -80,11 +82,26 @@
LOGGER.debug(f"Initializing model converter workspace at {workspace.root}")

self._workspace = workspace
self._license_path = license_path

@property
def workspace(self) -> Workspace:
return self._workspace

def save_license(self, licence_filename: str = "LICENSE"):
"""
Save the license if provided and if the license is not already present.
This method doesn't check the content of the license
:param licence_filename: Name of the file containing the license content
"""
if (
not (
dst_licence_file_path := self.workspace.root / licence_filename
).exists()
and self._license_path
):
shutil.copyfile(self._license_path, dst_licence_file_path)

def quantize(self):
raise NotImplementedError()

@@ -108,6 +125,7 @@ def convert(
)
model.save_checkpoint(str(self._workspace.checkpoints_path))

self.save_license()
return TensorRTArtifact.checkpoints(str(self._workspace.checkpoints_path))

def build(
@@ -126,10 +144,13 @@

config = infer_plugin_from_build_config(config)

for rank, model in enumerate(models):
LOGGER.info(f"Building TRTLLM engine for rank {rank}")
for model in models:
LOGGER.info(
f"Building TRTLLM engine for rank {model.config.mapping.rank} ->> {config.to_dict()}"
)

engine = build(model, config)
engine.save(str(self._workspace.engines_path))

self.save_license()
return TensorRTArtifact.engines(str(self._workspace.engines_path))