Commit 822d7c6: add 0.11.2 docs
kevalmorabia97 committed May 8, 2024
Showing 374 changed files with 45,438 additions and 0 deletions.
4 changes: 4 additions & 0 deletions .buildinfo
# Sphinx build info version 1
# This file hashes the configuration used when building these files. When it is not found, a full rebuild will be done.
config: 89ada319c94fcb1610b7f80d777e8b12
tags: 645f666f9bcd5a90fca523b33c5a78b7
Binary file added .doctrees/environment.pickle
Binary file added .doctrees/examples/0_all_examples.doctree
Binary file added .doctrees/getting_started/1_overview.doctree
Binary file added .doctrees/getting_started/3_quantization.doctree
Binary file added .doctrees/getting_started/6_sparsity.doctree
Binary file added .doctrees/guides/1_quantization.doctree
Binary file added .doctrees/guides/5_sparsity.doctree
Binary file added .doctrees/guides/_basic_quantization.doctree
Binary file added .doctrees/guides/_choosing_quant_methods.doctree
Binary file added .doctrees/guides/_onnx_quantization.doctree
Binary file added .doctrees/guides/_pytorch_quantization.doctree
Binary file added .doctrees/index.doctree
Binary file added .doctrees/reference/0_versions.doctree
Binary file added .doctrees/reference/1_modelopt_api.doctree
Binary file added .doctrees/support/1_contact.doctree
Binary file added .doctrees/support/2_faqs.doctree
Empty file added .nojekyll
Empty file.
142 changes: 142 additions & 0 deletions _sources/deployment/1_tensorrt_llm_deployment.rst.txt
==========================
TensorRT-LLM Deployment
==========================

.. note::

Please read the `TensorRT-LLM checkpoint workflow <https://github.com/NVIDIA/TensorRT-LLM/blob/main/docs/source/architecture/checkpoint.md>`_
first before going through this section.


The ModelOpt toolkit supports automatic conversion of ModelOpt-exported LLMs to TensorRT-LLM checkpoints and engines for accelerated inference.

This conversion is achieved by:

#. Converting Hugging Face, NeMo, and ModelOpt-exported checkpoints to the TensorRT-LLM checkpoint format.
#. Building a TensorRT-LLM engine from the TensorRT-LLM checkpoint.


Export Quantized Model
======================

After the model is quantized, it can be exported to the TensorRT-LLM checkpoint format, which consists of:

#. A single JSON file recording the model structure and metadata (``config.json``).
#. A group of safetensors files, each recording the local calibrated model on a single GPU rank (model weights and scaling factors per GPU).

The export API (:meth:`export_tensorrt_llm_checkpoint <modelopt.torch.export.model_config_export.export_tensorrt_llm_checkpoint>`) can be used as follows:

.. code-block:: python

    import torch

    from modelopt.torch.export import export_tensorrt_llm_checkpoint

    with torch.inference_mode():
        export_tensorrt_llm_checkpoint(
            model,  # The quantized model.
            decoder_type,  # The type of the model as a string, e.g. gptj, llama or gptnext.
            dtype,  # The weights data type used to export the unquantized layers.
            export_dir,  # The directory where the exported files will be stored.
            inference_tensor_parallel,  # The number of GPUs used for tensor parallelism at inference time.
            inference_pipeline_parallel,  # The number of GPUs used for pipeline parallelism at inference time.
        )

If the :meth:`export_tensorrt_llm_checkpoint <modelopt.torch.export.model_config_export.export_tensorrt_llm_checkpoint>` call succeeds, the TensorRT-LLM checkpoint is saved. Otherwise, e.g. if the ``decoder_type`` is not supported, a torch ``state_dict`` checkpoint is saved instead.
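
For illustration, a hypothetical invocation with concrete values (the decoder type, paths, and GPU counts below are assumptions; adjust them to your model and hardware):

.. code-block:: python

    with torch.inference_mode():
        export_tensorrt_llm_checkpoint(
            model,  # A quantized LLaMA-style model (assumed).
            decoder_type="llama",  # Assumption; must match your model architecture.
            dtype=torch.float16,  # Export the unquantized layers in FP16.
            export_dir="/tmp/trtllm_ckpt",  # Hypothetical output directory.
            inference_tensor_parallel=2,  # Assume 2-way tensor parallelism at inference time.
            inference_pipeline_parallel=1,  # No pipeline parallelism.
        )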

.. list-table:: Model support matrix for the TensorRT-LLM checkpoint export
:header-rows: 1

* - Model / Quantization
- FP16 / BF16
- FP8
- INT8_SQ
- INT4_AWQ
* - GPT2
- Yes
- Yes
- Yes
- No
* - GPTJ
- Yes
- Yes
- Yes
- Yes
* - LLAMA 2
- Yes
- Yes
- Yes
- Yes
* - LLAMA 3
- Yes
- Yes
- No
- Yes
* - Mistral
- Yes
- Yes
- Yes
- Yes
* - Mixtral 8x7B
- Yes
- Yes
- No
- Yes
* - Falcon 40B, 180B
- Yes
- Yes
- Yes
- Yes
* - Falcon 7B
- Yes
- Yes
- Yes
- No
* - Falcon RW 1B, 7B
- Yes
- Yes
- Yes
- Yes
* - MPT 7B, 30B
- Yes
- Yes
- Yes
- Yes
* - Baichuan 1, 2
- Yes
- Yes
- Yes
- Yes
* - Qwen 7B, 14B
- Yes
- Yes
- Yes
- Yes
* - ChatGLM2, 3 6B
- Yes
- Yes
- Yes
- Yes
* - Bloom
- Yes
- Yes
- Yes
- Yes
* - Phi-1, 2, 3
- Yes
- Yes
- Yes
- Yes
* - Nemotron 8
- Yes
- Yes
- No
- Yes
* - Gemma 2B, 7B
- Yes
- Yes
- No
- Yes

Convert to TensorRT-LLM
=======================

Once the TensorRT-LLM checkpoint is available, please follow the `TensorRT-LLM build API <https://github.com/NVIDIA/TensorRT-LLM/blob/main/docs/source/architecture/workflow.md#build-apis>`_ to build and deploy the quantized LLM.
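
A minimal sketch of this step using the ``trtllm-build`` command-line tool (the exact flags may vary across TensorRT-LLM versions, and the directory paths below are hypothetical; refer to the build API documentation linked above):

.. code-block:: bash

    # Build a TensorRT-LLM engine from the exported checkpoint directory.
    trtllm-build --checkpoint_dir /tmp/trtllm_ckpt --output_dir /tmp/trtllm_engine
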
5 changes: 5 additions & 0 deletions _sources/examples/0_all_examples.rst.txt
All ModelOpt Examples
=====================

Please visit the `TensorRT-Model-Optimizer GitHub repository <https://github.com/NVIDIA/TensorRT-Model-Optimizer>`_
for all ModelOpt examples.
41 changes: 41 additions & 0 deletions _sources/getting_started/1_overview.rst.txt
Overview
########

**NVIDIA TensorRT Model Optimizer**
===================================

Minimizing inference costs presents a significant challenge as generative AI models continue to grow in complexity and size.
The `NVIDIA TensorRT Model Optimizer <https://github.com/NVIDIA/TensorRT-Model-Optimizer>`_ (referred to as Model Optimizer, or ModelOpt)
is a library comprising state-of-the-art model optimization techniques, including quantization and sparsity, to compress models.
It accepts a torch or ONNX model as input and provides Python APIs that let users easily stack different model optimization
techniques to produce a quantized checkpoint. Seamlessly integrated within the NVIDIA AI software ecosystem, the quantized
checkpoint generated from Model Optimizer is ready for deployment in downstream inference frameworks like
`TensorRT-LLM <https://github.com/NVIDIA/TensorRT-LLM/tree/main/examples/quantization>`_ or `TensorRT <https://github.com/NVIDIA/TensorRT>`_.
Further integrations are planned for `NVIDIA NeMo <https://github.com/NVIDIA/NeMo>`_ and `Megatron-LM <https://github.com/NVIDIA/Megatron-LM>`_
for training-in-the-loop optimization techniques. For enterprise users, the 8-bit quantization with Stable Diffusion is also available on
`NVIDIA NIM <https://developer.nvidia.com/blog/nvidia-nim-offers-optimized-inference-microservices-for-deploying-ai-models-at-scale/>`_.

Model Optimizer is available for free for all developers on `NVIDIA PyPI <https://pypi.org/project/nvidia-modelopt/>`_.
Visit `/NVIDIA/TensorRT-Model-Optimizer repository <https://github.com/NVIDIA/TensorRT-Model-Optimizer>`_ for end-to-end
example scripts and recipes optimized for NVIDIA GPUs.

Techniques
----------

Quantization
^^^^^^^^^^^^
Quantization is an effective model optimization technique for large models. Quantization with Model Optimizer can compress
model size by 2x-4x, speeding up inference while preserving model quality. Model Optimizer enables highly performant
quantization formats including FP8, INT8, INT4, etc., and supports advanced algorithms such as SmoothQuant, AWQ, and
Double Quantization with easy-to-use Python APIs. Both post-training quantization (PTQ) and quantization-aware training (QAT)
are supported. Visit the :meth:`Quantization Format page <modelopt.torch.quantization.config>`
for the list of supported formats.
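
A rough QAT sketch, assuming a model and ``forward_loop`` set up as in the Quick Start, with ``train()`` standing in as a hypothetical placeholder for your own training loop:

.. code-block:: python

    import modelopt.torch.quantization as mtq

    # PTQ: insert quantizers and calibrate the model (see the Quick Start for forward_loop).
    model = mtq.quantize(model, mtq.INT8_SMOOTHQUANT_CFG, forward_loop)

    # QAT: fine-tune the quantized model with your regular training loop to recover accuracy.
    # train() is a hypothetical placeholder for your training code.
    train(model, train_dataloader, optimizer)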

Sparsity
^^^^^^^^
Sparsity is a technique to further reduce the memory footprint of deep learning models and accelerate inference.
Model Optimizer provides Python API :meth:`mts.sparsify() <modelopt.torch.sparsity.sparsification.sparsify>` to apply
weight sparsity to a given model. The ``mts.sparsify()`` API supports `NVIDIA 2:4 <https://arxiv.org/pdf/2104.0837>`_
sparsity pattern and various sparsification methods, such as NVIDIA `ASP <https://github.com/NVIDIA/apex/tree/master/apex/contrib/sparsity>`_
and `SparseGPT <https://arxiv.org/abs/2301.00774>`_. It supports both post-training sparsity and sparsity with fine-tuning.
The latter workflow is recommended to minimize accuracy degradation.
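
A minimal post-training sparsification sketch (the mode name below is an assumption based on the methods listed above; see the Sparsity guide for the exact modes and configuration options):

.. code-block:: python

    import modelopt.torch.sparsity as mts

    # Apply 2:4 weight sparsity using a magnitude-based method (post-training sparsity).
    # "sparse_magnitude" is an assumed mode name; consult the Sparsity guide for supported modes.
    model = mts.sparsify(model, mode="sparse_magnitude")
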
110 changes: 110 additions & 0 deletions _sources/getting_started/2_installation.rst.txt
============
Installation
============

System requirements
===================

Model Optimizer (``nvidia-modelopt``) currently has the following system requirements:

+----------------------+-----------------------------+
| OS | Linux, Windows |
+----------------------+-----------------------------+
| Architecture | x86_64, aarch64, win_amd64 |
+----------------------+-----------------------------+
| Python | >=3.8,<3.12 |
+----------------------+-----------------------------+
| PyTorch | >=1.11 |
+----------------------+-----------------------------+
| CUDA | >=11.8 (Recommended) |
+----------------------+-----------------------------+

Install Model Optimizer
=======================

ModelOpt, including its dependencies, can be installed via ``pip``. Please review the
license terms of ModelOpt and its dependencies before use.

.. tab:: Quick install

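The quick-install command below assumes the same package and index as in the detailed instructions that follow; adjust the extras to your needs:

.. code-block:: bash

    pip install "nvidia-modelopt[all]" --no-cache-dir --extra-index-url https://pypi.nvidia.com
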
.. tab:: Detailed instructions

**Setting up a virtual environment**

We recommend setting up a virtual environment if you don't have one already. Run the following
command to set up and activate a ``conda`` virtual environment named ``modelopt`` with Python 3.11:

.. code-block:: bash

    conda create -n modelopt python=3.11 pip

.. code-block:: bash

    conda activate modelopt

(Optional) **Install desired PyTorch version**


By default, the latest PyTorch version (``torch>=1.11``) available on ``pip`` will
be installed. If you want to install a specific PyTorch version for a specific CUDA version, please first
`follow the instructions to install your desired PyTorch version <https://pytorch.org/get-started/locally/>`_.
For example, to install the latest ``torch>=1.11`` with CUDA 11.8, run:

.. code-block:: bash

    pip install torch --extra-index-url https://download.pytorch.org/whl/cu118

**Identify correct partial dependencies**

Note that when ``nvidia-modelopt`` is installed without optional dependencies, only the bare-bones
requirements are installed, and none of the modules will work without their appropriate optional
dependencies (or the ``[all]`` optional dependencies). Below is a list of the optional dependencies
that need to be installed to correctly use the corresponding modules:

.. list-table::
:widths: 30 30
:header-rows: 1

* - Module
- Optional dependencies
* - ``modelopt.deploy``
- ``[deploy]``
* - ``modelopt.onnx``
- ``[onnx]``
* - ``modelopt.torch``
- ``[torch]``
* - ``modelopt.torch._deploy``
- ``[torch, deploy]``

Additionally, we support the following 3rd-party plugins:

.. list-table::
:widths: 30 30
:header-rows: 1

* - Third-party package
- Optional dependencies
* - ``transformers`` (Huggingface)
- ``[hf]``
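
For example, to install only the PyTorch and Hugging Face optional dependencies (a hypothetical selection; pick the extras matching the modules and plugins you plan to use):

.. code-block:: bash

    pip install "nvidia-modelopt[torch,hf]" --extra-index-url https://pypi.nvidia.com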

**Install Model Optimizer** (``nvidia-modelopt``)

.. code-block:: bash

    pip install "nvidia-modelopt[all]" --no-cache-dir --extra-index-url https://pypi.nvidia.com

Check installation
==================

.. tip::

When you use ModelOpt's PyTorch quantization APIs for the first time, ModelOpt will compile fast quantization kernels
using your installed torch and CUDA, if available.
This may take a few minutes but subsequent quantization calls will be much faster.
To invoke the compilation now and check if it is successful, run the following command:

.. code-block:: bash

    python -c "import modelopt.torch.quantization.extensions as ext; print(ext.cuda_ext); print(ext.cuda_ext_fp8)"
70 changes: 70 additions & 0 deletions _sources/getting_started/3_quantization.rst.txt
=========================
Quick Start: Quantization
=========================

Quantization
------------

Quantization is an effective technique to reduce the memory footprint of deep learning models and to
accelerate inference.

ModelOpt's :meth:`mtq.quantize() <modelopt.torch.quantization.model_quant.quantize>` API enables
users to quantize a model with advanced algorithms like SmoothQuant, AWQ etc. ModelOpt supports both
Post Training Quantization (PTQ) and Quantization Aware Training (QAT).

.. tip::

Please refer to :any:`quantization-formats` for details on the ModelOpt supported quantization
formats and their use-cases.

PTQ for PyTorch models
-----------------------------

:meth:`mtq.quantize <modelopt.torch.quantization.model_quant.quantize>` requires the model,
the appropriate quantization configuration and a forward loop as inputs. Here is a quick example of
quantizing a model with int8 SmoothQuant using
:meth:`mtq.quantize <modelopt.torch.quantization.model_quant.quantize>`:

.. code-block:: python

    import modelopt.torch.quantization as mtq

    # Setup the model
    model = get_model()

    # The quantization algorithm requires calibration data. Below we show a rough example of how to
    # set up a calibration data loader with the desired calib_size
    data_loader = get_dataloader(num_samples=calib_size)


    # Define the forward_loop function with the model as input. The data loader should be wrapped
    # inside the function.
    def forward_loop(model):
        for batch in data_loader:
            model(batch)


    # Quantize the model and perform calibration (PTQ)
    model = mtq.quantize(model, mtq.INT8_SMOOTHQUANT_CFG, forward_loop)

Refer to :any:`quantization-configs` for the quantization configurations available from ModelOpt.

Deployment
----------------

The quantized model is just like a regular PyTorch model and is ready for evaluation or deployment.

Hugging Face or NeMo LLM models can be exported to TensorRT-LLM using ModelOpt.
Please see the :doc:`TensorRT-LLM Deployment <../deployment/1_tensorrt_llm_deployment>` guide for more
details.

The model can also be exported to ONNX using
`torch.onnx.export <https://pytorch.org/docs/stable/onnx_torchscript.html#torch.onnx.export>`_.
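
A minimal sketch of such an export (``dummy_input`` below is a hypothetical example input; replace it with a tensor, or tuple of tensors, matching your model's expected input, and choose your own output path and opset version):

.. code-block:: python

    import torch

    # Hypothetical example input; match your model's expected input shape and dtype.
    dummy_input = torch.randn(1, 16)

    torch.onnx.export(
        model,  # The (quantized) PyTorch model.
        dummy_input,  # Example input used to trace the model.
        "quantized_model.onnx",  # Hypothetical output path.
        opset_version=17,  # Adjust to the opset your downstream tooling expects.
    )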

--------------------------------

**Next Steps**

* Learn more about quantization and advanced usage of Model Optimizer quantization in the
  :doc:`Quantization guide <../guides/1_quantization>`.
* Check out the end-to-end examples on GitHub for PTQ and QAT
  `here <https://github.com/NVIDIA/TensorRT-Model-Optimizer?tab=readme-ov-file#examples>`_.