From c89329774c25adae81254074932a26975ab41bac Mon Sep 17 00:00:00 2001
From: Asmita Goswami
Date: Fri, 13 Dec 2024 06:38:22 +0000
Subject: [PATCH 01/12] Migrating HL compile and export to infer APIs

Change-Id: If99f867d0ea0bead87f43fdcdb5537eda72a9db5
Signed-off-by: Asmita Goswami
---
 QEfficient/cloud/export.py                    |  29 ++---
 QEfficient/cloud/infer.py                     | 101 +++++++++---------
 .../transformers/models/modeling_auto.py      |   1 +
 3 files changed, 59 insertions(+), 72 deletions(-)

diff --git a/QEfficient/cloud/export.py b/QEfficient/cloud/export.py
index 53184450e..700532f3c 100644
--- a/QEfficient/cloud/export.py
+++ b/QEfficient/cloud/export.py
@@ -11,8 +11,8 @@
 
 from transformers import PreTrainedTokenizer, PreTrainedTokenizerFast
 
-from QEfficient.exporter.export_hf_to_cloud_ai_100 import qualcomm_efficient_converter
-from QEfficient.utils import check_and_assign_cache_dir, onnx_exists
+from QEfficient.transformers.models.modeling_auto import QEFFAutoModelForCausalLM
+from QEfficient.utils import check_and_assign_cache_dir
 from QEfficient.utils.logging_utils import logger
 
 # Specifically for Docker images.
@@ -39,27 +39,10 @@ def get_onnx_model_path(
         :local_model_dir (str): Path to custom model weights and config files. ``Defaults to None.``
         :full_batch_size (int): Set full batch size to enable continuous batching mode. ``Defaults to None.``
     """
-    onnx_path_exists, onnx_dir_path, onnx_model_path = onnx_exists(model_name, full_batch_size)
-    if onnx_path_exists:
-        logger.info(f"Pre-exported ONNX files found at {onnx_dir_path}! Jumping to Compilation")
-    else:
-        ###################
-        # hf model -> export
-        ####################
-        # Export to the Onnx
-        logger.info(f"Exporting Pytorch {model_name} model to ONNX...")
-        _, onnx_model_path = qualcomm_efficient_converter(
-            model_name=model_name,
-            local_model_dir=local_model_dir,
-            tokenizer=tokenizer,
-            onnx_dir_path=onnx_dir_path,
-            kv=True,
-            form_factor="cloud",
-            hf_token=hf_token,
-            cache_dir=cache_dir,
-            full_batch_size=full_batch_size,
-        )  # type: ignore
-        logger.info(f"Generated onnx_path: {onnx_model_path}, onnx_dir_path: {onnx_dir_path}")
+    logger.info(f"Exporting Pytorch {model_name} model to ONNX...")
+    qeff_model = QEFFAutoModelForCausalLM.from_pretrained(model_name)
+    onnx_model_path = qeff_model.export()
+    logger.info(f"Generated onnx_path: {onnx_model_path}")
     return onnx_model_path
 
 
diff --git a/QEfficient/cloud/infer.py b/QEfficient/cloud/infer.py
index 870005c91..7e2328eaf 100644
--- a/QEfficient/cloud/infer.py
+++ b/QEfficient/cloud/infer.py
@@ -7,13 +7,11 @@
 
 import argparse
 import logging
-import os
+import sys
 from typing import List, Optional
 
-import QEfficient
-from QEfficient.cloud.export import get_onnx_model_path
-from QEfficient.generation.text_generation_inference import cloud_ai_100_exec_kv
-from QEfficient.utils import check_and_assign_cache_dir, get_qpc_dir_path, load_hf_tokenizer, qpc_exists
+from QEfficient.transformers.models.modeling_auto import QEFFAutoModelForCausalLM
+from QEfficient.utils import check_and_assign_cache_dir, load_hf_tokenizer
 from QEfficient.utils.logging_utils import logger
 
 
@@ -36,6 +34,7 @@ def main(
     cache_dir: Optional[str] = None,
     hf_token: Optional[str] = None,
     allow_mxint8_mdp_io: bool = False,
+    **kwargs,
 ) -> None:
     """
     1. Check if compiled qpc for given config already exists, if it does jump to execute, else
@@ -75,52 +74,42 @@ def main(
         hf_token=hf_token,
     )
 
-    qpc_dir_path = get_qpc_dir_path(
-        model_name, num_cores, mos, batch_size, prompt_len, ctx_len, mxfp6, mxint8, device_group, full_batch_size
-    )
-
-    # Handle qpc generation
-    if qpc_exists(qpc_dir_path):
-        logger.info(f"Pre-compiled qpc found at {qpc_dir_path}! Executing with given prompt")
-    else:
-        # Handle onnx model generation
-        onnx_model_path = get_onnx_model_path(
-            model_name, cache_dir, tokenizer, hf_token, local_model_dir, full_batch_size
-        )  # , base_dir_name)
-
-        #########
-        # Compile
-        #########
-        _ = QEfficient.compile(
-            onnx_path=onnx_model_path,
-            qpc_path=os.path.dirname(
-                qpc_dir_path
-            ),  # We need to pass parent directory of qpc_dir_path, as the compile function handles the qpcs directory creation
-            num_cores=num_cores,
-            batch_size=batch_size,
-            prompt_len=prompt_len,
-            ctx_len=ctx_len,
-            mxfp6=mxfp6,
-            mxint8=mxint8,
-            aic_enable_depth_first=aic_enable_depth_first,
-            mos=mos,
-            device_group=device_group,
-            full_batch_size=full_batch_size,
-            allow_mxint8_mdp_io=allow_mxint8_mdp_io,
-        )
+    if '--mxfp6' in sys.argv:
+        if args.mxfp6:
+            logger.warning("mxfp6 is going to be deprecated in a future release, use --mxfp6_matmul instead.")
+    if '--mxint8' in sys.argv:
+        if args.mxint8:
+            logger.warning("mxint8 is going to be deprecated in a future release, use --mxint8_kv_cache instead.")
+
+    qeff_model = QEFFAutoModelForCausalLM.from_pretrained(model_name)
 
     #########
-    # Execute
+    # Compile
     #########
-    cloud_ai_100_exec_kv(
-        tokenizer=tokenizer,
-        qpc_path=qpc_dir_path,
-        device_id=device_group,
-        prompt=prompt,
-        prompts_txt_file_path=prompts_txt_file_path,
-        generation_len=generation_len,
+    _ = qeff_model.compile(
+        prefill_seq_len=prompt_len,
+        ctx_len=ctx_len,
+        num_cores=num_cores,
+        mxfp6_matmul=mxfp6,
+        aic_enable_depth_first=aic_enable_depth_first,
+        batch_size=batch_size,
+        mos=mos,
+        mxint8_kv_cache=mxint8,
+        num_devices=len(device_group),
+        full_batch_size=full_batch_size,
+        allow_mxint8_mdp_io=allow_mxint8_mdp_io,
+        **kwargs,
     )
 
+    #########
+    # Execute
+    #########
+    _ = qeff_model.generate(tokenizer,
+        prompts=prompt,
+        device_id=device_group,
+        prompts_txt_file_path=prompts_txt_file_path,
+        generation_len=generation_len,)
+
 
 if __name__ == "__main__":
     parser = argparse.ArgumentParser(
@@ -146,10 +135,16 @@
     )
     parser.add_argument("--ctx-len", "--ctx_len", default=128, type=int, help="Context length for text generation.")
     parser.add_argument(
-        "--mxfp6", action="store_true", help="Compress constant MatMul weights to MXFP6 E2M3, default is no compression"
+        "--mxfp6",
+        "--mxfp6_matmul",
+        "--mxfp6-matmul",
+        action="store_true",
+        help="Compress constant MatMul weights to MXFP6 E2M3, default is no compression"
    )
     parser.add_argument(
         "--mxint8",
+        "--mxint8_kv_cache",
+        "--mxint8-kv-cache",
         action="store_true",
         help="Compress Present/Past KV to MXINT8 using CustomIO config, default is False",
     )
@@ -207,8 +202,16 @@
         help="If passed, this option allows MXINT8 compression of MDP IO traffic",
     )
 
-    args = parser.parse_args()
+    args, compiler_options = parser.parse_known_args()
+    compiler_options_dict = {}
+    for i in range(0, len(compiler_options)):
+        if (compiler_options[i].startswith('--')):
+            key = compiler_options[i].lstrip('-')
+            value = compiler_options[i+1] if i+1 < len(compiler_options) and not compiler_options[i+1].startswith('-') else True
+            compiler_options_dict[key] = value
+
     if args.verbose:
         logger.setLevel(logging.INFO)
     del args.verbose  # type: ignore
-    main(**args.__dict__)
+
+    main(**args.__dict__, **compiler_options_dict)

diff --git a/QEfficient/transformers/models/modeling_auto.py b/QEfficient/transformers/models/modeling_auto.py
index 83c573f6d..6b7c1443c 100644
--- a/QEfficient/transformers/models/modeling_auto.py
+++ b/QEfficient/transformers/models/modeling_auto.py
@@ -266,6 +266,7 @@ def compile(
             :num_speculative_tokens (int, optional): Number of speculative tokens to take as input for Speculative Decoding Target Language Model.
             :mos (int, optional): Effort level to reduce on-chip memory. Defaults to -1, meaning no effort. ``Defaults to -1``.
             :aic_enable_depth_first (bool, optional): Enables DFS with default memory size. ``Defaults to False``.
+            :allow_mxint8_mdp_io (bool, optional): Allows MXINT8 compression of MDP IO traffic. ``Defaults to False.``
 
         Returns:
             :str: Path of the compiled ``qpc`` package.
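Series note: patch 01 replaces the qpc_exists / QEfficient.compile / cloud_ai_100_exec_kv pipeline in `main` with a single QEFFAutoModelForCausalLM object. Since `main` now calls compile() without an explicit export(), compile() is implied to trigger the ONNX export as needed. Below is a minimal sketch of the equivalent standalone usage, based only on the calls visible in this patch; the model card name, AutoTokenizer stand-in, and argument values are illustrative, not project defaults:

    from transformers import AutoTokenizer  # stand-in for the repo's tokenizer helper
    from QEfficient.transformers.models.modeling_auto import QEFFAutoModelForCausalLM

    tokenizer = AutoTokenizer.from_pretrained("gpt2")  # illustrative model card
    qeff_model = QEFFAutoModelForCausalLM.from_pretrained("gpt2")

    # Optional: infer's main skips this and lets compile() handle the export.
    onnx_path = qeff_model.export()

    # compile() returns the path of the compiled qpc package (per the docstring above).
    qpc_path = qeff_model.compile(
        prefill_seq_len=32,
        ctx_len=128,
        num_cores=16,
        mxfp6_matmul=False,
        mxint8_kv_cache=False,
        num_devices=1,
    )

    # Executes the prompts on Cloud AI 100 through the compiled qpc.
    qeff_model.generate(tokenizer, prompts=["My name is"])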
From 78bb2d7c76f52fc3c0832291612fa825d58f7dc8 Mon Sep 17 00:00:00 2001
From: Asmita Goswami
Date: Fri, 13 Dec 2024 06:38:22 +0000
Subject: [PATCH 02/12] Migrating HL compile and export to infer APIs

Change-Id: If99f867d0ea0bead87f43fdcdb5537eda72a9db5
Signed-off-by: Asmita Goswami
---
 QEfficient/cloud/infer.py  | 28 +++++++++++++++++-----------
 tests/cloud/test_export.py | 10 ----------
 tests/cloud/test_infer.py  | 19 -------------------
 3 files changed, 17 insertions(+), 40 deletions(-)

diff --git a/QEfficient/cloud/infer.py b/QEfficient/cloud/infer.py
index 7e2328eaf..e8690d5de 100644
--- a/QEfficient/cloud/infer.py
+++ b/QEfficient/cloud/infer.py
@@ -74,10 +74,10 @@ def main(
         hf_token=hf_token,
     )
 
-    if '--mxfp6' in sys.argv:
+    if "--mxfp6" in sys.argv:
         if args.mxfp6:
             logger.warning("mxfp6 is going to be deprecated in a future release, use --mxfp6_matmul instead.")
-    if '--mxint8' in sys.argv:
+    if "--mxint8" in sys.argv:
         if args.mxint8:
             logger.warning("mxint8 is going to be deprecated in a future release, use --mxint8_kv_cache instead.")
 
     qeff_model = QEFFAutoModelForCausalLM.from_pretrained(model_name)
@@ -104,11 +104,13 @@ def main(
     #########
     # Execute
     #########
-    _ = qeff_model.generate(tokenizer,
-        prompts=prompt,
-        device_id=device_group,
-        prompts_txt_file_path=prompts_txt_file_path,
-        generation_len=generation_len,)
+    _ = qeff_model.generate(
+        tokenizer,
+        prompts=prompt,
+        device_id=device_group,
+        prompts_txt_file_path=prompts_txt_file_path,
+        generation_len=generation_len,
+    )
 
 
 if __name__ == "__main__":
@@ -139,7 +141,7 @@ def main(
         "--mxfp6_matmul",
         "--mxfp6-matmul",
         action="store_true",
-        help="Compress constant MatMul weights to MXFP6 E2M3, default is no compression"
+        help="Compress constant MatMul weights to MXFP6 E2M3, default is no compression",
     )
     parser.add_argument(
         "--mxint8",
@@ -205,9 +207,13 @@ def main(
     args, compiler_options = parser.parse_known_args()
     compiler_options_dict = {}
     for i in range(0, len(compiler_options)):
-        if (compiler_options[i].startswith('--')):
-            key = compiler_options[i].lstrip('-')
-            value = compiler_options[i+1] if i+1 < len(compiler_options) and not compiler_options[i+1].startswith('-') else True
+        if compiler_options[i].startswith("--"):
+            key = compiler_options[i].lstrip("-")
+            value = (
+                compiler_options[i + 1]
+                if i + 1 < len(compiler_options) and not compiler_options[i + 1].startswith("-")
+                else True
+            )
             compiler_options_dict[key] = value
 
     if args.verbose:

diff --git a/tests/cloud/test_export.py b/tests/cloud/test_export.py
index 4291da23a..a2b717634 100644
--- a/tests/cloud/test_export.py
+++ b/tests/cloud/test_export.py
@@ -5,12 +5,9 @@
 #
 # -----------------------------------------------------------------------------
 
-import os
 
 import pytest
 
-import QEfficient
-import QEfficient.cloud.export
 from QEfficient.cloud.export import main as export
 
 
@@ -25,8 +22,6 @@ def test_export(setup, mocker):
     mocker: mocker is itself a pytest fixture, uses to mock or spy internal functions.
     """
     ms = setup
-    check_and_assign_cache_dir_spy = mocker.spy(QEfficient.cloud.export, "check_and_assign_cache_dir")
-    get_onnx_model_path_spy = mocker.spy(QEfficient.cloud.export, "get_onnx_model_path")
 
     export(
         model_name=ms.model_name,
@@ -34,8 +29,3 @@ def test_export(setup, mocker):
         local_model_dir=ms.local_model_dir,
         full_batch_size=ms.full_batch_size,
     )
-
-    check_and_assign_cache_dir_spy.assert_called_once()
-    get_onnx_model_path_spy.assert_called_once()
-    assert any(os.path.isfile(x) for x in ms.onnx_model_path())
-    assert get_onnx_model_path_spy.spy_return in ms.onnx_model_path()

diff --git a/tests/cloud/test_infer.py b/tests/cloud/test_infer.py
index 8cd61a050..0dacba570 100644
--- a/tests/cloud/test_infer.py
+++ b/tests/cloud/test_infer.py
@@ -5,12 +5,9 @@
 #
 # -----------------------------------------------------------------------------
 
-import os
 
 import pytest
 
-import QEfficient
-import QEfficient.cloud.infer
 from QEfficient.cloud.infer import main as infer
 
 
@@ -30,11 +27,6 @@ def test_infer(setup, mocker):
     Ref: https://pytest-mock.readthedocs.io/en/latest/usage.html
     """
     ms = setup
-    load_hf_tokenizer_spy = mocker.spy(QEfficient.cloud.infer, "load_hf_tokenizer")
-    qpc_exists_spy = mocker.spy(QEfficient.cloud.infer, "qpc_exists")
-    get_onnx_model_path_spy = mocker.spy(QEfficient.cloud.infer, "get_onnx_model_path")
-    compile_spy = mocker.spy(QEfficient, "compile")
-    cloud_ai_100_exec_kv_spy = mocker.spy(QEfficient.cloud.infer, "cloud_ai_100_exec_kv")
 
     infer(
         model_name=ms.model_name,
@@ -53,14 +45,3 @@ def test_infer(setup, mocker):
         full_batch_size=ms.full_batch_size,
     )
-    # tokenizer check
-    load_hf_tokenizer_spy.assert_called_once()
-    # qpc exist check
-    qpc_exists_spy.assert_called_once()
-    if qpc_exists_spy.spy_return is True:
-        assert os.path.isdir(ms.qpc_dir_path())
-    else:
-        get_onnx_model_path_spy.assert_called_once()
-        compile_spy.assert_called_once()
-        assert compile_spy.spy_return == ms.qpc_dir_path()
-    cloud_ai_100_exec_kv_spy.assert_called_once()
From 7229571e1403f974166ab4a0bdf769177a8282be Mon Sep 17 00:00:00 2001
From: Asmita Goswami
Date: Fri, 13 Dec 2024 06:38:22 +0000
Subject: [PATCH 03/12] Migrating HL compile and export to infer APIs

Change-Id: If99f867d0ea0bead87f43fdcdb5537eda72a9db5
Signed-off-by: Asmita Goswami
---
 QEfficient/cloud/infer.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/QEfficient/cloud/infer.py b/QEfficient/cloud/infer.py
index e8690d5de..657cb4a3c 100644
--- a/QEfficient/cloud/infer.py
+++ b/QEfficient/cloud/infer.py
@@ -95,7 +95,7 @@ def main(
         batch_size=batch_size,
         mos=mos,
         mxint8_kv_cache=mxint8,
-        num_devices=len(device_group),
+        num_devices = (0 if device_group is None else len(device_group)),
         full_batch_size=full_batch_size,
         allow_mxint8_mdp_io=allow_mxint8_mdp_io,
         **kwargs,

From c4859a7dd437c2939d190472343489e7549868ed Mon Sep 17 00:00:00 2001
From: Asmita Goswami
Date: Fri, 13 Dec 2024 06:38:22 +0000
Subject: [PATCH 04/12] Migrating HL compile and export to infer APIs

Change-Id: If99f867d0ea0bead87f43fdcdb5537eda72a9db5
Signed-off-by: Asmita Goswami
---
 QEfficient/cloud/infer.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/QEfficient/cloud/infer.py b/QEfficient/cloud/infer.py
index 657cb4a3c..48fbc5af0 100644
--- a/QEfficient/cloud/infer.py
+++ b/QEfficient/cloud/infer.py
@@ -95,7 +95,7 @@ def main(
         batch_size=batch_size,
         mos=mos,
         mxint8_kv_cache=mxint8,
-        num_devices = (0 if device_group is None else len(device_group)),
+        num_devices=(0 if device_group is None else len(device_group)),
         full_batch_size=full_batch_size,
         allow_mxint8_mdp_io=allow_mxint8_mdp_io,
         **kwargs,
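Series note: patches 03 and 04 repair the device-count plumbing: the original `num_devices=len(device_group)` raised a TypeError whenever no device group was passed, because `device_group` defaults to None. A quick illustration of the guarded mapping (values are illustrative):

    device_group = None           # no device group supplied on the CLI
    0 if device_group is None else len(device_group)   # -> 0, presumably letting compile() apply its own default

    device_group = [0, 1, 2, 3]   # a four-device set
    0 if device_group is None else len(device_group)   # -> 4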
From 807574ee7f613df74dfe2fd1967ff8c7b76c6d20 Mon Sep 17 00:00:00 2001
From: shubhagr-quic
Date: Wed, 18 Dec 2024 23:18:34 +0530
Subject: [PATCH 05/12] Migrating HL compile and export to infer APIs

Change-Id: If99f867d0ea0bead87f43fdcdb5537eda72a9db5
Signed-off-by: Asmita Goswami
---
 QEfficient/cloud/export.py                    |  29 +----
 QEfficient/cloud/infer.py                     | 111 +++++++++---------
 .../transformers/models/modeling_auto.py      |   1 +
 3 files changed, 63 insertions(+), 78 deletions(-)

diff --git a/QEfficient/cloud/export.py b/QEfficient/cloud/export.py
index 53184450e..700532f3c 100644
--- a/QEfficient/cloud/export.py
+++ b/QEfficient/cloud/export.py
@@ -11,8 +11,8 @@
 
 from transformers import PreTrainedTokenizer, PreTrainedTokenizerFast
 
-from QEfficient.exporter.export_hf_to_cloud_ai_100 import qualcomm_efficient_converter
-from QEfficient.utils import check_and_assign_cache_dir, onnx_exists
+from QEfficient.transformers.models.modeling_auto import QEFFAutoModelForCausalLM
+from QEfficient.utils import check_and_assign_cache_dir
 from QEfficient.utils.logging_utils import logger
 
 # Specifically for Docker images.
@@ -39,27 +39,10 @@ def get_onnx_model_path(
         :local_model_dir (str): Path to custom model weights and config files. ``Defaults to None.``
         :full_batch_size (int): Set full batch size to enable continuous batching mode. ``Defaults to None.``
     """
-    onnx_path_exists, onnx_dir_path, onnx_model_path = onnx_exists(model_name, full_batch_size)
-    if onnx_path_exists:
-        logger.info(f"Pre-exported ONNX files found at {onnx_dir_path}! Jumping to Compilation")
-    else:
-        ###################
-        # hf model -> export
-        ####################
-        # Export to the Onnx
-        logger.info(f"Exporting Pytorch {model_name} model to ONNX...")
-        _, onnx_model_path = qualcomm_efficient_converter(
-            model_name=model_name,
-            local_model_dir=local_model_dir,
-            tokenizer=tokenizer,
-            onnx_dir_path=onnx_dir_path,
-            kv=True,
-            form_factor="cloud",
-            hf_token=hf_token,
-            cache_dir=cache_dir,
-            full_batch_size=full_batch_size,
-        )  # type: ignore
-        logger.info(f"Generated onnx_path: {onnx_model_path}, onnx_dir_path: {onnx_dir_path}")
+    logger.info(f"Exporting Pytorch {model_name} model to ONNX...")
+    qeff_model = QEFFAutoModelForCausalLM.from_pretrained(model_name)
+    onnx_model_path = qeff_model.export()
+    logger.info(f"Generated onnx_path: {onnx_model_path}")
     return onnx_model_path
 
 
diff --git a/QEfficient/cloud/infer.py b/QEfficient/cloud/infer.py
index 0ba0961e3..ee82303ef 100644
--- a/QEfficient/cloud/infer.py
+++ b/QEfficient/cloud/infer.py
@@ -7,13 +7,11 @@
 
 import argparse
 import logging
-import os
+import sys
 from typing import List, Optional
 
-import QEfficient
-from QEfficient.cloud.export import get_onnx_model_path
-from QEfficient.generation.text_generation_inference import cloud_ai_100_exec_kv
-from QEfficient.utils import check_and_assign_cache_dir, get_qpc_dir_path, load_hf_tokenizer, qpc_exists
+from QEfficient.transformers.models.modeling_auto import QEFFAutoModelForCausalLM
+from QEfficient.utils import check_and_assign_cache_dir, load_hf_tokenizer
 from QEfficient.utils.logging_utils import logger
 
 
@@ -38,6 +36,7 @@ def main(
     allow_mxint8_mdp_io: bool = False,
     enable_qnn: Optional[bool] = False,
     qnn_config: Optional[str] = None,
+    **kwargs,
 ) -> None:
     """
     1. Check if compiled qpc for given config already exists, if it does jump to execute, else
@@ -79,60 +78,44 @@ def main(
         hf_token=hf_token,
     )
 
-    qpc_dir_path = get_qpc_dir_path(
-        model_name,
-        num_cores,
-        mos,
-        batch_size,
-        prompt_len,
-        ctx_len,
-        mxfp6,
-        mxint8,
-        device_group,
-        full_batch_size,
+    if enable_qnn and qnn_config is not None:
+        logger.error("QNN compilation is currently not supported in High Level APIs of QEFFAutoModelForCausalLM.")
+
+    if "--mxfp6" in sys.argv:
+        if args.mxfp6:
+            logger.warning("mxfp6 is going to be deprecated in a future release, use --mxfp6_matmul instead.")
+    if "--mxint8" in sys.argv:
+        if args.mxint8:
+            logger.warning("mxint8 is going to be deprecated in a future release, use --mxint8_kv_cache instead.")
+
+    qeff_model = QEFFAutoModelForCausalLM.from_pretrained(model_name)
+
+    #########
+    # Compile
+    #########
+    _ = qeff_model.compile(
+        prefill_seq_len=prompt_len,
+        ctx_len=ctx_len,
+        num_cores=num_cores,
+        mxfp6_matmul=mxfp6,
+        aic_enable_depth_first=aic_enable_depth_first,
+        batch_size=batch_size,
+        mos=mos,
+        mxint8_kv_cache=mxint8,
+        num_devices=len(device_group),
+        full_batch_size=full_batch_size,
+        allow_mxint8_mdp_io=allow_mxint8_mdp_io,
         enable_qnn=enable_qnn,
+        **kwargs,
     )
 
-    # Handle qpc generation
-    if qpc_exists(qpc_dir_path):
-        logger.info(f"Pre-compiled qpc found at {qpc_dir_path}! Executing with given prompt")
-    else:
-        # Handle onnx model generation
-        onnx_model_path = get_onnx_model_path(
-            model_name, cache_dir, tokenizer, hf_token, local_model_dir, full_batch_size
-        )  # , base_dir_name)
-
-        #########
-        # Compile
-        #########
-        _ = QEfficient.compile(
-            onnx_path=onnx_model_path,
-            qpc_path=os.path.dirname(
-                qpc_dir_path
-            ),  # We need to pass parent directory of qpc_dir_path, as the compile function handles the qpcs directory creation
-            num_cores=num_cores,
-            batch_size=batch_size,
-            prompt_len=prompt_len,
-            ctx_len=ctx_len,
-            mxfp6=mxfp6,
-            mxint8=mxint8,
-            aic_enable_depth_first=aic_enable_depth_first,
-            mos=mos,
-            device_group=device_group,
-            full_batch_size=full_batch_size,
-            allow_mxint8_mdp_io=allow_mxint8_mdp_io,
-            enable_qnn=enable_qnn,
-            qnn_config=qnn_config,
-        )
-
     #########
     # Execute
     #########
-    cloud_ai_100_exec_kv(
-        tokenizer=tokenizer,
-        qpc_path=qpc_dir_path,
+    _ = qeff_model.generate(
+        tokenizer,
+        prompts=prompt,
         device_id=device_group,
-        prompt=prompt,
         prompts_txt_file_path=prompts_txt_file_path,
         generation_len=generation_len,
     )
@@ -162,10 +145,16 @@
     )
     parser.add_argument("--ctx-len", "--ctx_len", default=128, type=int, help="Context length for text generation.")
     parser.add_argument(
-        "--mxfp6", action="store_true", help="Compress constant MatMul weights to MXFP6 E2M3, default is no compression"
+        "--mxfp6",
+        "--mxfp6_matmul",
+        "--mxfp6-matmul",
+        action="store_true",
+        help="Compress constant MatMul weights to MXFP6 E2M3, default is no compression",
    )
     parser.add_argument(
         "--mxint8",
+        "--mxint8_kv_cache",
+        "--mxint8-kv-cache",
         action="store_true",
         help="Compress Present/Past KV to MXINT8 using CustomIO config, default is False",
     )
@@ -237,8 +226,20 @@
         type=str,
     )
 
-    args = parser.parse_args()
+    args, compiler_options = parser.parse_known_args()
+    compiler_options_dict = {}
+    for i in range(0, len(compiler_options)):
+        if compiler_options[i].startswith("--"):
+            key = compiler_options[i].lstrip("-")
+            value = (
+                compiler_options[i + 1]
+                if i + 1 < len(compiler_options) and not compiler_options[i + 1].startswith("-")
+                else True
+            )
+            compiler_options_dict[key] = value
+
     if args.verbose:
         logger.setLevel(logging.INFO)
     del args.verbose  # type: ignore
-    main(**args.__dict__)
+
+    main(**args.__dict__, **compiler_options_dict)

diff --git a/QEfficient/transformers/models/modeling_auto.py b/QEfficient/transformers/models/modeling_auto.py
index 83c573f6d..6b7c1443c 100644
--- a/QEfficient/transformers/models/modeling_auto.py
+++ b/QEfficient/transformers/models/modeling_auto.py
@@ -266,6 +266,7 @@ def compile(
             :num_speculative_tokens (int, optional): Number of speculative tokens to take as input for Speculative Decoding Target Language Model.
             :mos (int, optional): Effort level to reduce on-chip memory. Defaults to -1, meaning no effort. ``Defaults to -1``.
             :aic_enable_depth_first (bool, optional): Enables DFS with default memory size. ``Defaults to False``.
+            :allow_mxint8_mdp_io (bool, optional): Allows MXINT8 compression of MDP IO traffic. ``Defaults to False.``
 
         Returns:
             :str: Path of the compiled ``qpc`` package.
Executing with given prompt") - else: - # Handle onnx model generation - onnx_model_path = get_onnx_model_path( - model_name, cache_dir, tokenizer, hf_token, local_model_dir, full_batch_size - ) # , base_dir_name) - - ######### - # Compile - ######### - _ = QEfficient.compile( - onnx_path=onnx_model_path, - qpc_path=os.path.dirname( - qpc_dir_path - ), # We need to pass parent directory of qpc_dir_path, as the compile function handles the qpcs directory creation - num_cores=num_cores, - batch_size=batch_size, - prompt_len=prompt_len, - ctx_len=ctx_len, - mxfp6=mxfp6, - mxint8=mxint8, - aic_enable_depth_first=aic_enable_depth_first, - mos=mos, - device_group=device_group, - full_batch_size=full_batch_size, - allow_mxint8_mdp_io=allow_mxint8_mdp_io, - enable_qnn=enable_qnn, - qnn_config=qnn_config, - ) - ######### # Execute ######### - cloud_ai_100_exec_kv( - tokenizer=tokenizer, - qpc_path=qpc_dir_path, + _ = qeff_model.generate( + tokenizer, + prompts=prompt, device_id=device_group, - prompt=prompt, prompts_txt_file_path=prompts_txt_file_path, generation_len=generation_len, ) @@ -162,10 +145,16 @@ def main( ) parser.add_argument("--ctx-len", "--ctx_len", default=128, type=int, help="Context length for text generation.") parser.add_argument( - "--mxfp6", action="store_true", help="Compress constant MatMul weights to MXFP6 E2M3, default is no compression" + "--mxfp6", + "--mxfp6_matmul", + "--mxfp6-matmul", + action="store_true", + help="Compress constant MatMul weights to MXFP6 E2M3, default is no compression", ) parser.add_argument( "--mxint8", + "--mxint8_kv_cache", + "--mxint8-kv-cache", action="store_true", help="Compress Present/Past KV to MXINT8 using CustomIO config, default is False", ) @@ -237,8 +226,20 @@ def main( type=str, ) - args = parser.parse_args() + args, compiler_options = parser.parse_known_args() + compiler_options_dict = {} + for i in range(0, len(compiler_options)): + if compiler_options[i].startswith("--"): + key = compiler_options[i].lstrip("-") + value = ( + compiler_options[i + 1] + if i + 1 < len(compiler_options) and not compiler_options[i + 1].startswith("-") + else True + ) + compiler_options_dict[key] = value + if args.verbose: logger.setLevel(logging.INFO) del args.verbose # type: ignore - main(**args.__dict__) + + main(**args.__dict__, **compiler_options_dict) diff --git a/QEfficient/transformers/models/modeling_auto.py b/QEfficient/transformers/models/modeling_auto.py index 83c573f6d..6b7c1443c 100644 --- a/QEfficient/transformers/models/modeling_auto.py +++ b/QEfficient/transformers/models/modeling_auto.py @@ -266,6 +266,7 @@ def compile( :num_speculative_tokens (int, optional): Number of speculative tokens to take as input for Speculative Decoding Target Language Model. :mos (int, optional): Effort level to reduce on-chip memory. Defaults to -1, meaning no effort. ``Defaults to -1``. :aic_enable_depth_first (bool, optional): Enables DFS with default memory size. ``Defaults to False``. + :allow_mxint8_mdp_io (bool, optional): Allows MXINT8 compression of MDP IO traffic. ``Defaults to False.`` Returns: :str: Path of the compiled ``qpc`` package. From 4d5b14c3f6ad5e757f4afcc6fae0a271dc11da80 Mon Sep 17 00:00:00 2001 From: Asmita Goswami Date: Thu, 19 Dec 2024 09:23:20 +0000 Subject: [PATCH 06/12] Revert "Migrating HL compile and export to infer APIs" This reverts commit 78bb2d7c76f52fc3c0832291612fa825d58f7dc8. 
Change-Id: I39efc3c537ab08f354585695c4fd0d42ece48478 --- tests/cloud/test_export.py | 10 ++++++++++ tests/cloud/test_infer.py | 19 +++++++++++++++++++ 2 files changed, 29 insertions(+) diff --git a/tests/cloud/test_export.py b/tests/cloud/test_export.py index a2b717634..4291da23a 100644 --- a/tests/cloud/test_export.py +++ b/tests/cloud/test_export.py @@ -5,9 +5,12 @@ # # ----------------------------------------------------------------------------- +import os import pytest +import QEfficient +import QEfficient.cloud.export from QEfficient.cloud.export import main as export @@ -22,6 +25,8 @@ def test_export(setup, mocker): mocker: mocker is itself a pytest fixture, uses to mock or spy internal functions. """ ms = setup + check_and_assign_cache_dir_spy = mocker.spy(QEfficient.cloud.export, "check_and_assign_cache_dir") + get_onnx_model_path_spy = mocker.spy(QEfficient.cloud.export, "get_onnx_model_path") export( model_name=ms.model_name, @@ -29,3 +34,8 @@ def test_export(setup, mocker): local_model_dir=ms.local_model_dir, full_batch_size=ms.full_batch_size, ) + + check_and_assign_cache_dir_spy.assert_called_once() + get_onnx_model_path_spy.assert_called_once() + assert any(os.path.isfile(x) for x in ms.onnx_model_path()) + assert get_onnx_model_path_spy.spy_return in ms.onnx_model_path() diff --git a/tests/cloud/test_infer.py b/tests/cloud/test_infer.py index 22191b9ce..e28c3a38a 100644 --- a/tests/cloud/test_infer.py +++ b/tests/cloud/test_infer.py @@ -5,9 +5,12 @@ # # ----------------------------------------------------------------------------- +import os import pytest +import QEfficient +import QEfficient.cloud.infer from QEfficient.cloud.infer import main as infer @@ -27,6 +30,11 @@ def test_infer(setup, mocker): Ref: https://pytest-mock.readthedocs.io/en/latest/usage.html """ ms = setup + load_hf_tokenizer_spy = mocker.spy(QEfficient.cloud.infer, "load_hf_tokenizer") + qpc_exists_spy = mocker.spy(QEfficient.cloud.infer, "qpc_exists") + get_onnx_model_path_spy = mocker.spy(QEfficient.cloud.infer, "get_onnx_model_path") + compile_spy = mocker.spy(QEfficient, "compile") + cloud_ai_100_exec_kv_spy = mocker.spy(QEfficient.cloud.infer, "cloud_ai_100_exec_kv") infer( model_name=ms.model_name, @@ -46,3 +54,14 @@ def test_infer(setup, mocker): full_batch_size=ms.full_batch_size, enable_qnn=ms.enable_qnn, ) + # tokenizer check + load_hf_tokenizer_spy.assert_called_once() + # qpc exist check + qpc_exists_spy.assert_called_once() + if qpc_exists_spy.spy_return is True: + assert os.path.isdir(ms.qpc_dir_path()) + else: + get_onnx_model_path_spy.assert_called_once() + compile_spy.assert_called_once() + assert compile_spy.spy_return == ms.qpc_dir_path() + cloud_ai_100_exec_kv_spy.assert_called_once() From 1ee05b33b377aedb5bbda1174211f82062f54398 Mon Sep 17 00:00:00 2001 From: Asmita Goswami Date: Thu, 19 Dec 2024 09:23:20 +0000 Subject: [PATCH 07/12] Migrating HL compile and export to infer APIs Change-Id: I39efc3c537ab08f354585695c4fd0d42ece48478 Signed-off-by: Asmita Goswami --- tests/cloud/test_export.py | 10 ++++++++++ tests/cloud/test_infer.py | 19 +++++++++++++++++++ 2 files changed, 29 insertions(+) diff --git a/tests/cloud/test_export.py b/tests/cloud/test_export.py index a2b717634..4291da23a 100644 --- a/tests/cloud/test_export.py +++ b/tests/cloud/test_export.py @@ -5,9 +5,12 @@ # # ----------------------------------------------------------------------------- +import os import pytest +import QEfficient +import QEfficient.cloud.export from QEfficient.cloud.export import main as export @@ 
-22,6 +25,8 @@ def test_export(setup, mocker): mocker: mocker is itself a pytest fixture, uses to mock or spy internal functions. """ ms = setup + check_and_assign_cache_dir_spy = mocker.spy(QEfficient.cloud.export, "check_and_assign_cache_dir") + get_onnx_model_path_spy = mocker.spy(QEfficient.cloud.export, "get_onnx_model_path") export( model_name=ms.model_name, @@ -29,3 +34,8 @@ def test_export(setup, mocker): local_model_dir=ms.local_model_dir, full_batch_size=ms.full_batch_size, ) + + check_and_assign_cache_dir_spy.assert_called_once() + get_onnx_model_path_spy.assert_called_once() + assert any(os.path.isfile(x) for x in ms.onnx_model_path()) + assert get_onnx_model_path_spy.spy_return in ms.onnx_model_path() diff --git a/tests/cloud/test_infer.py b/tests/cloud/test_infer.py index 22191b9ce..e28c3a38a 100644 --- a/tests/cloud/test_infer.py +++ b/tests/cloud/test_infer.py @@ -5,9 +5,12 @@ # # ----------------------------------------------------------------------------- +import os import pytest +import QEfficient +import QEfficient.cloud.infer from QEfficient.cloud.infer import main as infer @@ -27,6 +30,11 @@ def test_infer(setup, mocker): Ref: https://pytest-mock.readthedocs.io/en/latest/usage.html """ ms = setup + load_hf_tokenizer_spy = mocker.spy(QEfficient.cloud.infer, "load_hf_tokenizer") + qpc_exists_spy = mocker.spy(QEfficient.cloud.infer, "qpc_exists") + get_onnx_model_path_spy = mocker.spy(QEfficient.cloud.infer, "get_onnx_model_path") + compile_spy = mocker.spy(QEfficient, "compile") + cloud_ai_100_exec_kv_spy = mocker.spy(QEfficient.cloud.infer, "cloud_ai_100_exec_kv") infer( model_name=ms.model_name, @@ -46,3 +54,14 @@ def test_infer(setup, mocker): full_batch_size=ms.full_batch_size, enable_qnn=ms.enable_qnn, ) + # tokenizer check + load_hf_tokenizer_spy.assert_called_once() + # qpc exist check + qpc_exists_spy.assert_called_once() + if qpc_exists_spy.spy_return is True: + assert os.path.isdir(ms.qpc_dir_path()) + else: + get_onnx_model_path_spy.assert_called_once() + compile_spy.assert_called_once() + assert compile_spy.spy_return == ms.qpc_dir_path() + cloud_ai_100_exec_kv_spy.assert_called_once() From 0c337cfcab2a47a9697b7e1e432a9a74b6846ead Mon Sep 17 00:00:00 2001 From: Asmita Goswami Date: Fri, 13 Dec 2024 06:38:22 +0000 Subject: [PATCH 08/12] Migrating HL compile and export to infer APIs Change-Id: If99f867d0ea0bead87f43fdcdb5537eda72a9db5 Signed-off-by: Asmita Goswami --- tests/cloud/test_export.py | 10 ---------- tests/cloud/test_infer.py | 19 ------------------- 2 files changed, 29 deletions(-) diff --git a/tests/cloud/test_export.py b/tests/cloud/test_export.py index 4291da23a..a2b717634 100644 --- a/tests/cloud/test_export.py +++ b/tests/cloud/test_export.py @@ -5,12 +5,9 @@ # # ----------------------------------------------------------------------------- -import os import pytest -import QEfficient -import QEfficient.cloud.export from QEfficient.cloud.export import main as export @@ -25,8 +22,6 @@ def test_export(setup, mocker): mocker: mocker is itself a pytest fixture, uses to mock or spy internal functions. 
""" ms = setup - check_and_assign_cache_dir_spy = mocker.spy(QEfficient.cloud.export, "check_and_assign_cache_dir") - get_onnx_model_path_spy = mocker.spy(QEfficient.cloud.export, "get_onnx_model_path") export( model_name=ms.model_name, @@ -34,8 +29,3 @@ def test_export(setup, mocker): local_model_dir=ms.local_model_dir, full_batch_size=ms.full_batch_size, ) - - check_and_assign_cache_dir_spy.assert_called_once() - get_onnx_model_path_spy.assert_called_once() - assert any(os.path.isfile(x) for x in ms.onnx_model_path()) - assert get_onnx_model_path_spy.spy_return in ms.onnx_model_path() diff --git a/tests/cloud/test_infer.py b/tests/cloud/test_infer.py index e28c3a38a..22191b9ce 100644 --- a/tests/cloud/test_infer.py +++ b/tests/cloud/test_infer.py @@ -5,12 +5,9 @@ # # ----------------------------------------------------------------------------- -import os import pytest -import QEfficient -import QEfficient.cloud.infer from QEfficient.cloud.infer import main as infer @@ -30,11 +27,6 @@ def test_infer(setup, mocker): Ref: https://pytest-mock.readthedocs.io/en/latest/usage.html """ ms = setup - load_hf_tokenizer_spy = mocker.spy(QEfficient.cloud.infer, "load_hf_tokenizer") - qpc_exists_spy = mocker.spy(QEfficient.cloud.infer, "qpc_exists") - get_onnx_model_path_spy = mocker.spy(QEfficient.cloud.infer, "get_onnx_model_path") - compile_spy = mocker.spy(QEfficient, "compile") - cloud_ai_100_exec_kv_spy = mocker.spy(QEfficient.cloud.infer, "cloud_ai_100_exec_kv") infer( model_name=ms.model_name, @@ -54,14 +46,3 @@ def test_infer(setup, mocker): full_batch_size=ms.full_batch_size, enable_qnn=ms.enable_qnn, ) - # tokenizer check - load_hf_tokenizer_spy.assert_called_once() - # qpc exist check - qpc_exists_spy.assert_called_once() - if qpc_exists_spy.spy_return is True: - assert os.path.isdir(ms.qpc_dir_path()) - else: - get_onnx_model_path_spy.assert_called_once() - compile_spy.assert_called_once() - assert compile_spy.spy_return == ms.qpc_dir_path() - cloud_ai_100_exec_kv_spy.assert_called_once() From 78a082692298ebe74328a20c182e1a9258585789 Mon Sep 17 00:00:00 2001 From: Asmita Goswami Date: Fri, 13 Dec 2024 06:38:22 +0000 Subject: [PATCH 09/12] Migrating HL compile and export to infer APIs Change-Id: If99f867d0ea0bead87f43fdcdb5537eda72a9db5 Signed-off-by: Asmita Goswami --- QEfficient/cloud/infer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/QEfficient/cloud/infer.py b/QEfficient/cloud/infer.py index ee82303ef..fd7e431d5 100644 --- a/QEfficient/cloud/infer.py +++ b/QEfficient/cloud/infer.py @@ -102,7 +102,7 @@ def main( batch_size=batch_size, mos=mos, mxint8_kv_cache=mxint8, - num_devices=len(device_group), + num_devices = (0 if device_group is None else len(device_group)), full_batch_size=full_batch_size, allow_mxint8_mdp_io=allow_mxint8_mdp_io, enable_qnn=enable_qnn, From d7f83fac176120eecd30b85c6c6355dcfbc94fb0 Mon Sep 17 00:00:00 2001 From: Asmita Goswami Date: Fri, 13 Dec 2024 06:38:22 +0000 Subject: [PATCH 10/12] Migrating HL compile and export to infer APIs Change-Id: If99f867d0ea0bead87f43fdcdb5537eda72a9db5 Signed-off-by: Asmita Goswami --- QEfficient/cloud/infer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/QEfficient/cloud/infer.py b/QEfficient/cloud/infer.py index fd7e431d5..ba0860152 100644 --- a/QEfficient/cloud/infer.py +++ b/QEfficient/cloud/infer.py @@ -102,7 +102,7 @@ def main( batch_size=batch_size, mos=mos, mxint8_kv_cache=mxint8, - num_devices = (0 if device_group is None else len(device_group)), + num_devices=(0 if 
From 4a79f93b0b86907de7678fb2ac54049aa7676c01 Mon Sep 17 00:00:00 2001
From: Asmita Goswami
Date: Thu, 19 Dec 2024 09:23:20 +0000
Subject: [PATCH 11/12] Migrating HL compile and export to infer APIs

Change-Id: I39efc3c537ab08f354585695c4fd0d42ece48478
Signed-off-by: Asmita Goswami
---
 tests/cloud/test_export.py | 10 ++++++++++
 tests/cloud/test_infer.py  | 19 +++++++++++++++++++
 2 files changed, 29 insertions(+)

diff --git a/tests/cloud/test_export.py b/tests/cloud/test_export.py
index a2b717634..4291da23a 100644
--- a/tests/cloud/test_export.py
+++ b/tests/cloud/test_export.py
@@ -5,9 +5,12 @@
 #
 # -----------------------------------------------------------------------------
 
+import os
 
 import pytest
 
+import QEfficient
+import QEfficient.cloud.export
 from QEfficient.cloud.export import main as export
 
 
@@ -22,6 +25,8 @@ def test_export(setup, mocker):
     mocker: mocker is itself a pytest fixture, uses to mock or spy internal functions.
     """
     ms = setup
+    check_and_assign_cache_dir_spy = mocker.spy(QEfficient.cloud.export, "check_and_assign_cache_dir")
+    get_onnx_model_path_spy = mocker.spy(QEfficient.cloud.export, "get_onnx_model_path")
 
     export(
         model_name=ms.model_name,
@@ -29,3 +34,8 @@ def test_export(setup, mocker):
         local_model_dir=ms.local_model_dir,
         full_batch_size=ms.full_batch_size,
     )
+
+    check_and_assign_cache_dir_spy.assert_called_once()
+    get_onnx_model_path_spy.assert_called_once()
+    assert any(os.path.isfile(x) for x in ms.onnx_model_path())
+    assert get_onnx_model_path_spy.spy_return in ms.onnx_model_path()

diff --git a/tests/cloud/test_infer.py b/tests/cloud/test_infer.py
index 22191b9ce..e28c3a38a 100644
--- a/tests/cloud/test_infer.py
+++ b/tests/cloud/test_infer.py
@@ -5,9 +5,12 @@
 #
 # -----------------------------------------------------------------------------
 
+import os
 
 import pytest
 
+import QEfficient
+import QEfficient.cloud.infer
 from QEfficient.cloud.infer import main as infer
 
 
@@ -27,6 +30,11 @@ def test_infer(setup, mocker):
     Ref: https://pytest-mock.readthedocs.io/en/latest/usage.html
     """
     ms = setup
+    load_hf_tokenizer_spy = mocker.spy(QEfficient.cloud.infer, "load_hf_tokenizer")
+    qpc_exists_spy = mocker.spy(QEfficient.cloud.infer, "qpc_exists")
+    get_onnx_model_path_spy = mocker.spy(QEfficient.cloud.infer, "get_onnx_model_path")
+    compile_spy = mocker.spy(QEfficient, "compile")
+    cloud_ai_100_exec_kv_spy = mocker.spy(QEfficient.cloud.infer, "cloud_ai_100_exec_kv")
 
     infer(
         model_name=ms.model_name,
@@ -46,3 +54,14 @@ def test_infer(setup, mocker):
         full_batch_size=ms.full_batch_size,
         enable_qnn=ms.enable_qnn,
     )
+    # tokenizer check
+    load_hf_tokenizer_spy.assert_called_once()
+    # qpc exist check
+    qpc_exists_spy.assert_called_once()
+    if qpc_exists_spy.spy_return is True:
+        assert os.path.isdir(ms.qpc_dir_path())
+    else:
+        get_onnx_model_path_spy.assert_called_once()
+        compile_spy.assert_called_once()
+        assert compile_spy.spy_return == ms.qpc_dir_path()
+    cloud_ai_100_exec_kv_spy.assert_called_once()
From 3edbd7c8f73804e4549891746ba770e103c45e80 Mon Sep 17 00:00:00 2001
From: Asmita Goswami
Date: Thu, 19 Dec 2024 09:44:37 +0000
Subject: [PATCH 12/12] Migrating HL compile and export to infer APIs

Signed-off-by: Asmita Goswami
---
 QEfficient/cloud/infer.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/QEfficient/cloud/infer.py b/QEfficient/cloud/infer.py
index ba0860152..fd7e431d5 100644
--- a/QEfficient/cloud/infer.py
+++ b/QEfficient/cloud/infer.py
@@ -102,7 +102,7 @@ def main(
         batch_size=batch_size,
         mos=mos,
         mxint8_kv_cache=mxint8,
-        num_devices=(0 if device_group is None else len(device_group)),
+        num_devices = (0 if device_group is None else len(device_group)),
         full_batch_size=full_batch_size,
         allow_mxint8_mdp_io=allow_mxint8_mdp_io,
         enable_qnn=enable_qnn,
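Series note: with the series applied end to end, the legacy --mxfp6 and --mxint8 spellings remain accepted alongside the new aliases added above, and unknown trailing flags flow through to compile(). A representative invocation; the model card, device ids, and the trailing flag are illustrative, not defaults:

    python -m QEfficient.cloud.infer \
        --model_name gpt2 \
        --batch_size 1 --prompt_len 32 --ctx_len 128 \
        --num_cores 16 --device_group [0] \
        --prompt "My name is" \
        --mxfp6_matmul --mxint8_kv_cache \
        --custom_flag 42    # unknown to the parser, forwarded to qeff_model.compile(**kwargs)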