From 12da558dc912a3d4a60c3d246bd5c2114c55596e Mon Sep 17 00:00:00 2001 From: Asmita Goswami Date: Wed, 8 Jan 2025 08:01:57 +0000 Subject: [PATCH 1/7] Migrating HL compile and export to infer APIs Change-Id: If27fbc1636ed1fe9b475d07cef7c83ed7dc46ca8 Signed-off-by: Asmita Goswami --- QEfficient/cloud/export.py | 29 +---- QEfficient/cloud/infer.py | 108 +++++++++--------- .../transformers/models/modeling_auto.py | 1 + 3 files changed, 61 insertions(+), 77 deletions(-) diff --git a/QEfficient/cloud/export.py b/QEfficient/cloud/export.py index 53184450e..7e9442411 100644 --- a/QEfficient/cloud/export.py +++ b/QEfficient/cloud/export.py @@ -11,8 +11,8 @@ from transformers import PreTrainedTokenizer, PreTrainedTokenizerFast -from QEfficient.exporter.export_hf_to_cloud_ai_100 import qualcomm_efficient_converter -from QEfficient.utils import check_and_assign_cache_dir, onnx_exists +from QEfficient.transformers.models.modeling_auto import QEFFAutoModelForCausalLM +from QEfficient.utils import check_and_assign_cache_dir from QEfficient.utils.logging_utils import logger # Specifically for Docker images. @@ -39,27 +39,10 @@ def get_onnx_model_path( :local_model_dir (str): Path to custom model weights and config files. ``Defaults to None.`` :full_batch_size (int): Set full batch size to enable continuous batching mode. ``Defaults to None.`` """ - onnx_path_exists, onnx_dir_path, onnx_model_path = onnx_exists(model_name, full_batch_size) - if onnx_path_exists: - logger.info(f"Pre-exported ONNX files found at {onnx_dir_path}! Jumping to Compilation") - else: - ################### - # hf model -> export - #################### - # Export to the Onnx - logger.info(f"Exporting Pytorch {model_name} model to ONNX...") - _, onnx_model_path = qualcomm_efficient_converter( - model_name=model_name, - local_model_dir=local_model_dir, - tokenizer=tokenizer, - onnx_dir_path=onnx_dir_path, - kv=True, - form_factor="cloud", - hf_token=hf_token, - cache_dir=cache_dir, - full_batch_size=full_batch_size, - ) # type: ignore - logger.info(f"Generated onnx_path: {onnx_model_path}, onnx_dir_path: {onnx_dir_path}") + logger.info(f"Exporting Pytorch {model_name} model to ONNX...") + qeff_model = QEFFAutoModelForCausalLM.from_pretrained(model_name, cache_dir) + onnx_model_path = qeff_model.export() + logger.info(f"Generated onnx_path: {onnx_model_path}") return onnx_model_path diff --git a/QEfficient/cloud/infer.py b/QEfficient/cloud/infer.py index 0ba0961e3..1a732db9c 100644 --- a/QEfficient/cloud/infer.py +++ b/QEfficient/cloud/infer.py @@ -7,13 +7,11 @@ import argparse import logging -import os +import sys from typing import List, Optional -import QEfficient -from QEfficient.cloud.export import get_onnx_model_path -from QEfficient.generation.text_generation_inference import cloud_ai_100_exec_kv -from QEfficient.utils import check_and_assign_cache_dir, get_qpc_dir_path, load_hf_tokenizer, qpc_exists +from QEfficient.transformers.models.modeling_auto import QEFFAutoModelForCausalLM +from QEfficient.utils import check_and_assign_cache_dir, load_hf_tokenizer from QEfficient.utils.logging_utils import logger @@ -38,6 +36,7 @@ def main( allow_mxint8_mdp_io: bool = False, enable_qnn: Optional[bool] = False, qnn_config: Optional[str] = None, + **kwargs, ) -> None: """ 1. 
Check if compiled qpc for given config already exists, if it does jump to execute, else @@ -79,58 +78,43 @@ def main( hf_token=hf_token, ) - qpc_dir_path = get_qpc_dir_path( - model_name, - num_cores, - mos, - batch_size, - prompt_len, - ctx_len, - mxfp6, - mxint8, - device_group, - full_batch_size, - enable_qnn=enable_qnn, - ) + if enable_qnn and qnn_config is not None: + logger.error("QNN compilation is currently not supported in High Level APIs of QEFFAutoModelForCausalLM.") - # Handle qpc generation - if qpc_exists(qpc_dir_path): - logger.info(f"Pre-compiled qpc found at {qpc_dir_path}! Executing with given prompt") - else: - # Handle onnx model generation - onnx_model_path = get_onnx_model_path( - model_name, cache_dir, tokenizer, hf_token, local_model_dir, full_batch_size - ) # , base_dir_name) + if "--mxfp6" in sys.argv: + if args.mxfp6: + logger.warning("mxfp6 is going to be deprecated in a future release, use -mxfp6_matmul instead.") + if "--mxint8" in sys.argv: + if args.mxint8: + logger.warning("mxint8 is going to be deprecated in a future release, use -mxint8_kv_cache instead.") - ######### - # Compile - ######### - _ = QEfficient.compile( - onnx_path=onnx_model_path, - qpc_path=os.path.dirname( - qpc_dir_path - ), # We need to pass parent directory of qpc_dir_path, as the compile function handles the qpcs directory creation - num_cores=num_cores, - batch_size=batch_size, - prompt_len=prompt_len, - ctx_len=ctx_len, - mxfp6=mxfp6, - mxint8=mxint8, - aic_enable_depth_first=aic_enable_depth_first, - mos=mos, - device_group=device_group, - full_batch_size=full_batch_size, - allow_mxint8_mdp_io=allow_mxint8_mdp_io, - enable_qnn=enable_qnn, - qnn_config=qnn_config, - ) + qeff_model = QEFFAutoModelForCausalLM.from_pretrained(model_name) + + ######### + # Compile + ######### + _ = qeff_model.compile( + prefill_seq_len=prompt_len, + ctx_len=ctx_len, + num_cores=num_cores, + mxfp6_matmul=mxfp6, + aic_enable_depth_first=aic_enable_depth_first, + batch_size=batch_size, + mos=mos, + mxint8_kv_cache=mxint8, + num_devices=(0 if device_group is None else len(device_group)), + full_batch_size=full_batch_size, + allow_mxint8_mdp_io=allow_mxint8_mdp_io, + enable_qnn=enable_qnn, + **kwargs, + ) ######### # Execute ######### - cloud_ai_100_exec_kv( - tokenizer=tokenizer, - qpc_path=qpc_dir_path, + _ = qeff_model.generate( + tokenizer, + prompts=prompt, device_id=device_group, prompt=prompt, prompts_txt_file_path=prompts_txt_file_path, @@ -162,10 +146,16 @@ def main( ) parser.add_argument("--ctx-len", "--ctx_len", default=128, type=int, help="Context length for text generation.") parser.add_argument( - "--mxfp6", action="store_true", help="Compress constant MatMul weights to MXFP6 E2M3, default is no compression" + "--mxfp6", + "--mxfp6_matmul", + "--mxfp6-matmul", + action="store_true", + help="Compress constant MatMul weights to MXFP6 E2M3, default is no compression", ) parser.add_argument( "--mxint8", + "--mxint8_kv_cache", + "--mxint8-kv-cache", action="store_true", help="Compress Present/Past KV to MXINT8 using CustomIO config, default is False", ) @@ -237,8 +227,18 @@ def main( type=str, ) - args = parser.parse_args() + args, compiler_options = parser.parse_known_args() + compiler_options_dict = {} + for i in range(0, len(compiler_options)): + if compiler_options[i].startswith("--"): + key = compiler_options[i].lstrip("-") + value = ( + compiler_options[i + 1] + if i + 1 < len(compiler_options) and not compiler_options[i + 1].startswith("-") + else True + ) + compiler_options_dict[key] = value 
if args.verbose: logger.setLevel(logging.INFO) del args.verbose # type: ignore - main(**args.__dict__) + main(**args.__dict__, **compiler_options_dict) diff --git a/QEfficient/transformers/models/modeling_auto.py b/QEfficient/transformers/models/modeling_auto.py index c2e3777bc..51e56c83e 100644 --- a/QEfficient/transformers/models/modeling_auto.py +++ b/QEfficient/transformers/models/modeling_auto.py @@ -278,6 +278,7 @@ def compile( :aic_enable_depth_first (bool, optional): Enables DFS with default memory size. ``Defaults to False``. :enable_qnn (bool): Enables QNN Compilation. ``Defaults to False.`` :qnn_config (str): Path of QNN Config parameters file. ``Defaults to None.`` + :allow_mxint8_mdp_io (bool, optional): Allows MXINT8 compression of MDP IO traffic. ``Defaults to False.`` Returns: :str: Path of the compiled ``qpc`` package. From d3a97ca672dc2d86b13e27376c317ed3e4946b5b Mon Sep 17 00:00:00 2001 From: Asmita Goswami Date: Wed, 8 Jan 2025 08:01:57 +0000 Subject: [PATCH 2/7] Migrating HL compile and export to infer APIs Change-Id: If27fbc1636ed1fe9b475d07cef7c83ed7dc46ca8 Signed-off-by: Asmita Goswami --- QEfficient/cloud/export.py | 29 +---- QEfficient/cloud/infer.py | 108 +++++++++--------- .../transformers/models/modeling_auto.py | 1 + 3 files changed, 61 insertions(+), 77 deletions(-) diff --git a/QEfficient/cloud/export.py b/QEfficient/cloud/export.py index 53184450e..7e9442411 100644 --- a/QEfficient/cloud/export.py +++ b/QEfficient/cloud/export.py @@ -11,8 +11,8 @@ from transformers import PreTrainedTokenizer, PreTrainedTokenizerFast -from QEfficient.exporter.export_hf_to_cloud_ai_100 import qualcomm_efficient_converter -from QEfficient.utils import check_and_assign_cache_dir, onnx_exists +from QEfficient.transformers.models.modeling_auto import QEFFAutoModelForCausalLM +from QEfficient.utils import check_and_assign_cache_dir from QEfficient.utils.logging_utils import logger # Specifically for Docker images. @@ -39,27 +39,10 @@ def get_onnx_model_path( :local_model_dir (str): Path to custom model weights and config files. ``Defaults to None.`` :full_batch_size (int): Set full batch size to enable continuous batching mode. ``Defaults to None.`` """ - onnx_path_exists, onnx_dir_path, onnx_model_path = onnx_exists(model_name, full_batch_size) - if onnx_path_exists: - logger.info(f"Pre-exported ONNX files found at {onnx_dir_path}! 
Jumping to Compilation") - else: - ################### - # hf model -> export - #################### - # Export to the Onnx - logger.info(f"Exporting Pytorch {model_name} model to ONNX...") - _, onnx_model_path = qualcomm_efficient_converter( - model_name=model_name, - local_model_dir=local_model_dir, - tokenizer=tokenizer, - onnx_dir_path=onnx_dir_path, - kv=True, - form_factor="cloud", - hf_token=hf_token, - cache_dir=cache_dir, - full_batch_size=full_batch_size, - ) # type: ignore - logger.info(f"Generated onnx_path: {onnx_model_path}, onnx_dir_path: {onnx_dir_path}") + logger.info(f"Exporting Pytorch {model_name} model to ONNX...") + qeff_model = QEFFAutoModelForCausalLM.from_pretrained(model_name, cache_dir) + onnx_model_path = qeff_model.export() + logger.info(f"Generated onnx_path: {onnx_model_path}") return onnx_model_path diff --git a/QEfficient/cloud/infer.py b/QEfficient/cloud/infer.py index 0ba0961e3..1a732db9c 100644 --- a/QEfficient/cloud/infer.py +++ b/QEfficient/cloud/infer.py @@ -7,13 +7,11 @@ import argparse import logging -import os +import sys from typing import List, Optional -import QEfficient -from QEfficient.cloud.export import get_onnx_model_path -from QEfficient.generation.text_generation_inference import cloud_ai_100_exec_kv -from QEfficient.utils import check_and_assign_cache_dir, get_qpc_dir_path, load_hf_tokenizer, qpc_exists +from QEfficient.transformers.models.modeling_auto import QEFFAutoModelForCausalLM +from QEfficient.utils import check_and_assign_cache_dir, load_hf_tokenizer from QEfficient.utils.logging_utils import logger @@ -38,6 +36,7 @@ def main( allow_mxint8_mdp_io: bool = False, enable_qnn: Optional[bool] = False, qnn_config: Optional[str] = None, + **kwargs, ) -> None: """ 1. Check if compiled qpc for given config already exists, if it does jump to execute, else @@ -79,58 +78,43 @@ def main( hf_token=hf_token, ) - qpc_dir_path = get_qpc_dir_path( - model_name, - num_cores, - mos, - batch_size, - prompt_len, - ctx_len, - mxfp6, - mxint8, - device_group, - full_batch_size, - enable_qnn=enable_qnn, - ) + if enable_qnn and qnn_config is not None: + logger.error("QNN compilation is currently not supported in High Level APIs of QEFFAutoModelForCausalLM.") - # Handle qpc generation - if qpc_exists(qpc_dir_path): - logger.info(f"Pre-compiled qpc found at {qpc_dir_path}! 
Executing with given prompt") - else: - # Handle onnx model generation - onnx_model_path = get_onnx_model_path( - model_name, cache_dir, tokenizer, hf_token, local_model_dir, full_batch_size - ) # , base_dir_name) + if "--mxfp6" in sys.argv: + if args.mxfp6: + logger.warning("mxfp6 is going to be deprecated in a future release, use -mxfp6_matmul instead.") + if "--mxint8" in sys.argv: + if args.mxint8: + logger.warning("mxint8 is going to be deprecated in a future release, use -mxint8_kv_cache instead.") - ######### - # Compile - ######### - _ = QEfficient.compile( - onnx_path=onnx_model_path, - qpc_path=os.path.dirname( - qpc_dir_path - ), # We need to pass parent directory of qpc_dir_path, as the compile function handles the qpcs directory creation - num_cores=num_cores, - batch_size=batch_size, - prompt_len=prompt_len, - ctx_len=ctx_len, - mxfp6=mxfp6, - mxint8=mxint8, - aic_enable_depth_first=aic_enable_depth_first, - mos=mos, - device_group=device_group, - full_batch_size=full_batch_size, - allow_mxint8_mdp_io=allow_mxint8_mdp_io, - enable_qnn=enable_qnn, - qnn_config=qnn_config, - ) + qeff_model = QEFFAutoModelForCausalLM.from_pretrained(model_name) + + ######### + # Compile + ######### + _ = qeff_model.compile( + prefill_seq_len=prompt_len, + ctx_len=ctx_len, + num_cores=num_cores, + mxfp6_matmul=mxfp6, + aic_enable_depth_first=aic_enable_depth_first, + batch_size=batch_size, + mos=mos, + mxint8_kv_cache=mxint8, + num_devices=(0 if device_group is None else len(device_group)), + full_batch_size=full_batch_size, + allow_mxint8_mdp_io=allow_mxint8_mdp_io, + enable_qnn=enable_qnn, + **kwargs, + ) ######### # Execute ######### - cloud_ai_100_exec_kv( - tokenizer=tokenizer, - qpc_path=qpc_dir_path, + _ = qeff_model.generate( + tokenizer, + prompts=prompt, device_id=device_group, prompt=prompt, prompts_txt_file_path=prompts_txt_file_path, @@ -162,10 +146,16 @@ def main( ) parser.add_argument("--ctx-len", "--ctx_len", default=128, type=int, help="Context length for text generation.") parser.add_argument( - "--mxfp6", action="store_true", help="Compress constant MatMul weights to MXFP6 E2M3, default is no compression" + "--mxfp6", + "--mxfp6_matmul", + "--mxfp6-matmul", + action="store_true", + help="Compress constant MatMul weights to MXFP6 E2M3, default is no compression", ) parser.add_argument( "--mxint8", + "--mxint8_kv_cache", + "--mxint8-kv-cache", action="store_true", help="Compress Present/Past KV to MXINT8 using CustomIO config, default is False", ) @@ -237,8 +227,18 @@ def main( type=str, ) - args = parser.parse_args() + args, compiler_options = parser.parse_known_args() + compiler_options_dict = {} + for i in range(0, len(compiler_options)): + if compiler_options[i].startswith("--"): + key = compiler_options[i].lstrip("-") + value = ( + compiler_options[i + 1] + if i + 1 < len(compiler_options) and not compiler_options[i + 1].startswith("-") + else True + ) + compiler_options_dict[key] = value if args.verbose: logger.setLevel(logging.INFO) del args.verbose # type: ignore - main(**args.__dict__) + main(**args.__dict__, **compiler_options_dict) diff --git a/QEfficient/transformers/models/modeling_auto.py b/QEfficient/transformers/models/modeling_auto.py index c2e3777bc..51e56c83e 100644 --- a/QEfficient/transformers/models/modeling_auto.py +++ b/QEfficient/transformers/models/modeling_auto.py @@ -278,6 +278,7 @@ def compile( :aic_enable_depth_first (bool, optional): Enables DFS with default memory size. ``Defaults to False``. :enable_qnn (bool): Enables QNN Compilation. 
``Defaults to False.`` :qnn_config (str): Path of QNN Config parameters file. ``Defaults to None.`` + :allow_mxint8_mdp_io (bool, optional): Allows MXINT8 compression of MDP IO traffic. ``Defaults to False.`` Returns: :str: Path of the compiled ``qpc`` package. From 9ed96bea74d568c870afa389be5aad44afe604f4 Mon Sep 17 00:00:00 2001 From: Asmita Goswami Date: Tue, 28 Jan 2025 10:14:58 +0000 Subject: [PATCH 3/7] Made modelling class generic in infer Signed-off-by: Asmita Goswami --- QEfficient/cloud/export.py | 14 +++++++------- QEfficient/cloud/infer.py | 26 +++++++++++++++++++++++++- 2 files changed, 32 insertions(+), 8 deletions(-) diff --git a/QEfficient/cloud/export.py b/QEfficient/cloud/export.py index 7e9442411..bf83798d4 100644 --- a/QEfficient/cloud/export.py +++ b/QEfficient/cloud/export.py @@ -7,9 +7,7 @@ import argparse import os -from typing import Optional, Union - -from transformers import PreTrainedTokenizer, PreTrainedTokenizerFast +from typing import Optional from QEfficient.transformers.models.modeling_auto import QEFFAutoModelForCausalLM from QEfficient.utils import check_and_assign_cache_dir @@ -22,9 +20,7 @@ def get_onnx_model_path( model_name: str, cache_dir: Optional[str] = None, - tokenizer: Optional[Union[PreTrainedTokenizerFast, PreTrainedTokenizer]] = None, hf_token: Optional[str] = None, - local_model_dir: Optional[str] = None, full_batch_size: Optional[int] = None, ): """ @@ -40,7 +36,12 @@ def get_onnx_model_path( :full_batch_size (int): Set full batch size to enable continuous batching mode. ``Defaults to None.`` """ logger.info(f"Exporting Pytorch {model_name} model to ONNX...") - qeff_model = QEFFAutoModelForCausalLM.from_pretrained(model_name, cache_dir) + qeff_model = QEFFAutoModelForCausalLM.from_pretrained( + model_name, + cache_dir, + hf_token=hf_token, + full_batch_size=full_batch_size, + ) onnx_model_path = qeff_model.export() logger.info(f"Generated onnx_path: {onnx_model_path}") return onnx_model_path @@ -75,7 +76,6 @@ def main( model_name=model_name, cache_dir=cache_dir, hf_token=hf_token, - local_model_dir=local_model_dir, full_batch_size=full_batch_size, ) diff --git a/QEfficient/cloud/infer.py b/QEfficient/cloud/infer.py index 1a732db9c..509d5e7f0 100644 --- a/QEfficient/cloud/infer.py +++ b/QEfficient/cloud/infer.py @@ -10,10 +10,26 @@ import sys from typing import List, Optional +from transformers import AutoConfig + from QEfficient.transformers.models.modeling_auto import QEFFAutoModelForCausalLM from QEfficient.utils import check_and_assign_cache_dir, load_hf_tokenizer from QEfficient.utils.logging_utils import logger +# Map model's architecture to class +architecture_mapping = { + "LlamaForCausalLM": QEFFAutoModelForCausalLM, + "GPT2LMHeadModel": QEFFAutoModelForCausalLM, + "MistralForCausalLM": QEFFAutoModelForCausalLM, + "FalconForCausalLM": QEFFAutoModelForCausalLM, + "GPTJForCausalLM": QEFFAutoModelForCausalLM, + "GemmaForCausalLM": QEFFAutoModelForCausalLM, + "Gemma2ForCausalLM": QEFFAutoModelForCausalLM, + "Phi3ForCausalLM": QEFFAutoModelForCausalLM, + "Qwen2ForCausalLM": QEFFAutoModelForCausalLM, + "GPTBigCodeForCausalLM": QEFFAutoModelForCausalLM, +} + def main( model_name: str, @@ -88,7 +104,15 @@ def main( if args.mxint8: logger.warning("mxint8 is going to be deprecated in a future release, use -mxint8_kv_cache instead.") - qeff_model = QEFFAutoModelForCausalLM.from_pretrained(model_name) + config = AutoConfig.from_pretrained(model_name) + architecture = config.architectures[0] if config.architectures else None + + model_class = 
architecture_mapping.get(architecture) + if not model_class: + logger.error(f"Model class for model name {model_name} not found in mapping") + return + + qeff_model = model_class.from_pretrained(model_name) ######### # Compile From 55b753c3dab4b0bd2052077ccfb727254a9e72fd Mon Sep 17 00:00:00 2001 From: Asmita Goswami Date: Wed, 29 Jan 2025 10:21:57 +0000 Subject: [PATCH 4/7] Made modelling class generic Signed-off-by: Asmita Goswami --- QEfficient/cloud/export.py | 21 ++++++++++++++++++--- QEfficient/cloud/infer.py | 27 ++++++++++----------------- 2 files changed, 28 insertions(+), 20 deletions(-) diff --git a/QEfficient/cloud/export.py b/QEfficient/cloud/export.py index bf83798d4..46a50ff6c 100644 --- a/QEfficient/cloud/export.py +++ b/QEfficient/cloud/export.py @@ -9,6 +9,9 @@ import os from typing import Optional +from transformers import AutoConfig +from transformers.models.auto.modeling_auto import MODEL_FOR_CAUSAL_LM_MAPPING_NAMES + from QEfficient.transformers.models.modeling_auto import QEFFAutoModelForCausalLM from QEfficient.utils import check_and_assign_cache_dir from QEfficient.utils.logging_utils import logger @@ -22,6 +25,7 @@ def get_onnx_model_path( cache_dir: Optional[str] = None, hf_token: Optional[str] = None, full_batch_size: Optional[int] = None, + local_model_dir: Optional[str] = None, ): """ exports the model to onnx if pre-exported file is not found and returns onnx_model_path @@ -36,9 +40,19 @@ def get_onnx_model_path( :full_batch_size (int): Set full batch size to enable continuous batching mode. ``Defaults to None.`` """ logger.info(f"Exporting Pytorch {model_name} model to ONNX...") - qeff_model = QEFFAutoModelForCausalLM.from_pretrained( - model_name, - cache_dir, + + config = AutoConfig.from_pretrained(model_name) + architecture = config.architectures[0] if config.architectures else None + + if architecture in MODEL_FOR_CAUSAL_LM_MAPPING_NAMES.values(): + model_class = QEFFAutoModelForCausalLM + else: + logger.error(f"Model class for model name {model_name} not found in mapping") + return + + qeff_model = model_class.from_pretrained( + pretrained_model_name_or_path=(local_model_dir if local_model_dir else model_name), + cache_dir=cache_dir, hf_token=hf_token, full_batch_size=full_batch_size, ) @@ -77,6 +91,7 @@ def main( cache_dir=cache_dir, hf_token=hf_token, full_batch_size=full_batch_size, + local_model_dir=local_model_dir, ) diff --git a/QEfficient/cloud/infer.py b/QEfficient/cloud/infer.py index 509d5e7f0..a79227748 100644 --- a/QEfficient/cloud/infer.py +++ b/QEfficient/cloud/infer.py @@ -11,25 +11,12 @@ from typing import List, Optional from transformers import AutoConfig +from transformers.models.auto.modeling_auto import MODEL_FOR_CAUSAL_LM_MAPPING_NAMES from QEfficient.transformers.models.modeling_auto import QEFFAutoModelForCausalLM from QEfficient.utils import check_and_assign_cache_dir, load_hf_tokenizer from QEfficient.utils.logging_utils import logger -# Map model's architecture to class -architecture_mapping = { - "LlamaForCausalLM": QEFFAutoModelForCausalLM, - "GPT2LMHeadModel": QEFFAutoModelForCausalLM, - "MistralForCausalLM": QEFFAutoModelForCausalLM, - "FalconForCausalLM": QEFFAutoModelForCausalLM, - "GPTJForCausalLM": QEFFAutoModelForCausalLM, - "GemmaForCausalLM": QEFFAutoModelForCausalLM, - "Gemma2ForCausalLM": QEFFAutoModelForCausalLM, - "Phi3ForCausalLM": QEFFAutoModelForCausalLM, - "Qwen2ForCausalLM": QEFFAutoModelForCausalLM, - "GPTBigCodeForCausalLM": QEFFAutoModelForCausalLM, -} - def main( model_name: str, @@ -107,12 +94,18 @@ def 
main( config = AutoConfig.from_pretrained(model_name) architecture = config.architectures[0] if config.architectures else None - model_class = architecture_mapping.get(architecture) - if not model_class: + if architecture in MODEL_FOR_CAUSAL_LM_MAPPING_NAMES.values(): + model_class = QEFFAutoModelForCausalLM + else: logger.error(f"Model class for model name {model_name} not found in mapping") return - qeff_model = model_class.from_pretrained(model_name) + qeff_model = model_class.from_pretrained( + pretrained_model_name_or_path=(local_model_dir if local_model_dir else model_name), + cache_dir=cache_dir, + hf_token=hf_token, + full_batch_size=full_batch_size, + ) ######### # Compile From 292fe7389df70ec1bcad613b610610bb2d9a7141 Mon Sep 17 00:00:00 2001 From: Asmita Goswami Date: Wed, 29 Jan 2025 11:20:00 +0000 Subject: [PATCH 5/7] Added load qeff model Signed-off-by: Asmita Goswami --- QEfficient/cloud/export.py | 26 ++++++--------------- QEfficient/cloud/infer.py | 26 ++++++--------------- QEfficient/utils/__init__.py | 1 + QEfficient/utils/_utils.py | 44 +++++++++++++++++++++++++++++++++++- 4 files changed, 58 insertions(+), 39 deletions(-) diff --git a/QEfficient/cloud/export.py b/QEfficient/cloud/export.py index 46a50ff6c..f3604c0f2 100644 --- a/QEfficient/cloud/export.py +++ b/QEfficient/cloud/export.py @@ -9,11 +9,7 @@ import os from typing import Optional -from transformers import AutoConfig -from transformers.models.auto.modeling_auto import MODEL_FOR_CAUSAL_LM_MAPPING_NAMES - -from QEfficient.transformers.models.modeling_auto import QEFFAutoModelForCausalLM -from QEfficient.utils import check_and_assign_cache_dir +from QEfficient.utils import check_and_assign_cache_dir, load_qeff_model from QEfficient.utils.logging_utils import logger # Specifically for Docker images. 
@@ -41,20 +37,12 @@ def get_onnx_model_path( """ logger.info(f"Exporting Pytorch {model_name} model to ONNX...") - config = AutoConfig.from_pretrained(model_name) - architecture = config.architectures[0] if config.architectures else None - - if architecture in MODEL_FOR_CAUSAL_LM_MAPPING_NAMES.values(): - model_class = QEFFAutoModelForCausalLM - else: - logger.error(f"Model class for model name {model_name} not found in mapping") - return - - qeff_model = model_class.from_pretrained( - pretrained_model_name_or_path=(local_model_dir if local_model_dir else model_name), - cache_dir=cache_dir, - hf_token=hf_token, - full_batch_size=full_batch_size, + qeff_model = load_qeff_model( + model_name, + cache_dir, + hf_token, + full_batch_size, + local_model_dir, ) onnx_model_path = qeff_model.export() logger.info(f"Generated onnx_path: {onnx_model_path}") diff --git a/QEfficient/cloud/infer.py b/QEfficient/cloud/infer.py index a79227748..c30c58ffe 100644 --- a/QEfficient/cloud/infer.py +++ b/QEfficient/cloud/infer.py @@ -10,11 +10,7 @@ import sys from typing import List, Optional -from transformers import AutoConfig -from transformers.models.auto.modeling_auto import MODEL_FOR_CAUSAL_LM_MAPPING_NAMES - -from QEfficient.transformers.models.modeling_auto import QEFFAutoModelForCausalLM -from QEfficient.utils import check_and_assign_cache_dir, load_hf_tokenizer +from QEfficient.utils import check_and_assign_cache_dir, load_hf_tokenizer, load_qeff_model from QEfficient.utils.logging_utils import logger @@ -91,20 +87,12 @@ def main( if args.mxint8: logger.warning("mxint8 is going to be deprecated in a future release, use -mxint8_kv_cache instead.") - config = AutoConfig.from_pretrained(model_name) - architecture = config.architectures[0] if config.architectures else None - - if architecture in MODEL_FOR_CAUSAL_LM_MAPPING_NAMES.values(): - model_class = QEFFAutoModelForCausalLM - else: - logger.error(f"Model class for model name {model_name} not found in mapping") - return - - qeff_model = model_class.from_pretrained( - pretrained_model_name_or_path=(local_model_dir if local_model_dir else model_name), - cache_dir=cache_dir, - hf_token=hf_token, - full_batch_size=full_batch_size, + qeff_model = load_qeff_model( + model_name, + cache_dir, + hf_token, + full_batch_size, + local_model_dir, ) ######### diff --git a/QEfficient/utils/__init__.py b/QEfficient/utils/__init__.py index 2506b9233..86f563e05 100755 --- a/QEfficient/utils/__init__.py +++ b/QEfficient/utils/__init__.py @@ -17,6 +17,7 @@ get_qpc_dir_path, hf_download, load_hf_tokenizer, + load_qeff_model, login_and_download_hf_lm, onnx_exists, padding_check_and_fix, diff --git a/QEfficient/utils/_utils.py b/QEfficient/utils/_utils.py index 2729267d6..ab899a56e 100644 --- a/QEfficient/utils/_utils.py +++ b/QEfficient/utils/_utils.py @@ -5,6 +5,7 @@ # # ----------------------------------------------------------------------------- +import importlib import json import os import subprocess @@ -13,7 +14,8 @@ import requests from huggingface_hub import login, snapshot_download from requests.exceptions import HTTPError -from transformers import AutoTokenizer, PreTrainedTokenizer, PreTrainedTokenizerFast +from transformers import AutoConfig, AutoTokenizer, PreTrainedTokenizer, PreTrainedTokenizerFast +from transformers.models.auto.modeling_auto import MODEL_FOR_CAUSAL_LM_MAPPING_NAMES from QEfficient.utils.constants import QEFF_MODELS_DIR, Constants from QEfficient.utils.logging_utils import logger @@ -394,3 +396,43 @@ def create_json(file_path: str, json_data: 
object): json.dump(json_data, file, indent=4) except Exception as e: print(f"Failed to create JSON File {file_path}: {e}") + + +def load_qeff_model( + model_name: str, + cache_dir: Optional[str] = None, + hf_token: Optional[str] = None, + local_model_dir: Optional[str] = None, + full_batch_size: Optional[int] = None, +): + """ + Loads the model using the QEfficient Modelling Class. + + ``Mandatory`` Args: + :model_name (str): Hugging Face Model Card name, Example: ``gpt2``. + ``Optional`` Args: + :cache_dir (str): Cache dir where downloaded HuggingFace files are stored. ``Defaults to None.`` + :tokenizer (Union[PreTrainedTokenizer, PreTrainedTokenizerFast]): Pass model tokenizer. ``Defaults to None.`` + :hf_token (str): HuggingFace login token to access private repos. ``Defaults to None.`` + :local_model_dir (str): Path to custom model weights and config files. ``Defaults to None.`` + :full_batch_size (int): Set full batch size to enable continuous batching mode. ``Defaults to None.`` + + """ + config = AutoConfig.from_pretrained(model_name) + architecture = config.architectures[0] if config.architectures else None + + module = importlib.import_module("QEfficient") + if architecture in MODEL_FOR_CAUSAL_LM_MAPPING_NAMES.values(): + model_class = getattr(module, "QEFFAutoModelForCausalLM") + else: + raise NotImplementedError( + f"Unknown architecture={architecture}, either use specific auto model class for loading the model or raise an issue for support!" + ) + + qeff_model = model_class.from_pretrained( + pretrained_model_name_or_path=(local_model_dir if local_model_dir else model_name), + cache_dir=cache_dir, + hf_token=hf_token, + full_batch_size=full_batch_size, + ) + return qeff_model From 26ca6a7ff55a6649dfedd43f5a65356c7a925983 Mon Sep 17 00:00:00 2001 From: Onkar Chougule Date: Wed, 5 Feb 2025 09:51:36 +0530 Subject: [PATCH 6/7] clean code Signed-off-by: Onkar Chougule --- QEfficient/base/common.py | 74 +++++++++++------------------------- QEfficient/cloud/export.py | 15 ++++---- QEfficient/cloud/infer.py | 19 +++++---- QEfficient/utils/__init__.py | 1 - QEfficient/utils/_utils.py | 44 +-------------------- 5 files changed, 41 insertions(+), 112 deletions(-) diff --git a/QEfficient/base/common.py b/QEfficient/base/common.py index ce6b1cdc2..6e9b7a6af 100644 --- a/QEfficient/base/common.py +++ b/QEfficient/base/common.py @@ -12,59 +12,20 @@ QEFFAutoModel provides a common interface for loading the HuggingFace models using either the HF card name of local path of downloaded model. """ -import os -from enum import Enum -from typing import Any, Dict, Type +from typing import Any from transformers import AutoConfig from transformers.models.auto.modeling_auto import MODEL_FOR_CAUSAL_LM_MAPPING from QEfficient.base.modeling_qeff import QEFFBaseModel from QEfficient.transformers.models.modeling_auto import QEFFAutoModelForCausalLM -from QEfficient.utils._utils import login_and_download_hf_lm - - -class QEFF_MODEL_TYPE(Enum): - """ - Defines Names of the different varities of transformer models. - """ - - CAUSALLM = "LLM" - DIFFUSION = "DIFFUSION" - - -MODEL_TYPE_TO_QEFF_AUTO_MODEL_MAP: Dict[QEFF_MODEL_TYPE, Type[QEFFBaseModel]] = { - QEFF_MODEL_TYPE.CAUSALLM: QEFFAutoModelForCausalLM -} - -AUTO_MODEL_MAP_TO_MODEL_TYPE_MAP: Dict[Type[QEFFBaseModel], QEFF_MODEL_TYPE] = { - v: k for k, v in MODEL_TYPE_TO_QEFF_AUTO_MODEL_MAP.items() -} - - -def get_hf_model_type(hf_model_path: str) -> QEFF_MODEL_TYPE: - """ - Loads model config file and returns the type of the model (i.e. 
LLMs, SD, quantized etc.) as supported by the library. - """ - if not os.path.isdir(hf_model_path): - raise FileNotFoundError( - "Please pass local dir path where the model is downloaded; use `QEfficient.utils.login_and_download_hf_lm` for downloading hf model" - ) - config, kwargs = AutoConfig.from_pretrained( - hf_model_path, - return_unused_kwargs=True, - ) - - if config.__class__ in MODEL_FOR_CAUSAL_LM_MAPPING: - return QEFF_MODEL_TYPE.CAUSALLM - else: - raise NotImplementedError(f"model type {type(config)} is not yet supported") class QEFFCommonLoader: """ Provides HuggingFace model loading interface same as transformers APIs. Supports loading any model on HuggingFace. + Wrapper on top of Auto Classes """ def __init__(self, *args: Any, **kwds: Any) -> None: @@ -78,14 +39,25 @@ def from_pretrained(cls, pretrained_model_name_or_path: str, *args, **kwargs) -> """ Downloads HuggingFace model if already doesn't exist locally, returns QEFFAutoModel object based on type of model. """ - if not os.path.isdir(pretrained_model_name_or_path): - pretrained_model_name_or_path = login_and_download_hf_lm(pretrained_model_name_or_path, *args, **kwargs) - kwargs.pop("hf_token", None) - model_type = get_hf_model_type(hf_model_path=pretrained_model_name_or_path) - qeff_auto_model_class = MODEL_TYPE_TO_QEFF_AUTO_MODEL_MAP[model_type] - if not issubclass(qeff_auto_model_class, QEFFBaseModel): - raise Exception(f"Expected class that inherits {QEFFBaseModel}, got {type(qeff_auto_model_class)}") - - return qeff_auto_model_class.from_pretrained( - pretrained_model_name_or_path=pretrained_model_name_or_path, **kwargs + config = AutoConfig.from_pretrained(pretrained_model_name_or_path) + architecture = config.architectures[0] if config.architectures else None + + if architecture in MODEL_FOR_CAUSAL_LM_MAPPING: + model_class = QEFFAutoModelForCausalLM + else: + raise NotImplementedError( + f"Unknown architecture={architecture}, either use specific auto model class for loading the model or raise an issue for support!" + ) + + local_model_dir = kwargs.pop("local_model_dir", None) + cache_dir = kwargs.pop("cache_dir", None) + hf_token = kwargs.pop("hf_token", None) + continuous_batching = True if kwargs.pop("full_batch_size", None) else False + + qeff_model = model_class.from_pretrained( + pretrained_model_name_or_path=(local_model_dir if local_model_dir else pretrained_model_name_or_path), + cache_dir=cache_dir, + hf_token=hf_token, + continuous_batching=continuous_batching, ) + return qeff_model diff --git a/QEfficient/cloud/export.py b/QEfficient/cloud/export.py index f3604c0f2..504240b66 100644 --- a/QEfficient/cloud/export.py +++ b/QEfficient/cloud/export.py @@ -9,7 +9,8 @@ import os from typing import Optional -from QEfficient.utils import check_and_assign_cache_dir, load_qeff_model +from QEfficient.base.common import QEFFCommonLoader +from QEfficient.utils import check_and_assign_cache_dir from QEfficient.utils.logging_utils import logger # Specifically for Docker images. 
@@ -37,12 +38,12 @@ def get_onnx_model_path( """ logger.info(f"Exporting Pytorch {model_name} model to ONNX...") - qeff_model = load_qeff_model( - model_name, - cache_dir, - hf_token, - full_batch_size, - local_model_dir, + qeff_model = QEFFCommonLoader.from_pretrained( + pretrained_model_name_or_path=model_name, + cache_dir=cache_dir, + hf_token=hf_token, + full_batch_size=full_batch_size, + local_model_dir=local_model_dir, ) onnx_model_path = qeff_model.export() logger.info(f"Generated onnx_path: {onnx_model_path}") diff --git a/QEfficient/cloud/infer.py b/QEfficient/cloud/infer.py index c30c58ffe..4b43c8ded 100644 --- a/QEfficient/cloud/infer.py +++ b/QEfficient/cloud/infer.py @@ -10,7 +10,8 @@ import sys from typing import List, Optional -from QEfficient.utils import check_and_assign_cache_dir, load_hf_tokenizer, load_qeff_model +from QEfficient.base.common import QEFFCommonLoader +from QEfficient.utils import check_and_assign_cache_dir, load_hf_tokenizer from QEfficient.utils.logging_utils import logger @@ -77,9 +78,6 @@ def main( hf_token=hf_token, ) - if enable_qnn and qnn_config is not None: - logger.error("QNN compilation is currently not supported in High Level APIs of QEFFAutoModelForCausalLM.") - if "--mxfp6" in sys.argv: if args.mxfp6: logger.warning("mxfp6 is going to be deprecated in a future release, use -mxfp6_matmul instead.") @@ -87,12 +85,12 @@ def main( if args.mxint8: logger.warning("mxint8 is going to be deprecated in a future release, use -mxint8_kv_cache instead.") - qeff_model = load_qeff_model( - model_name, - cache_dir, - hf_token, - full_batch_size, - local_model_dir, + qeff_model = QEFFCommonLoader.from_pretrained( + pretrained_model_name_or_path=model_name, + cache_dir=cache_dir, + hf_token=hf_token, + full_batch_size=full_batch_size, + local_model_dir=local_model_dir, ) ######### @@ -111,6 +109,7 @@ def main( full_batch_size=full_batch_size, allow_mxint8_mdp_io=allow_mxint8_mdp_io, enable_qnn=enable_qnn, + qnn_config=qnn_config, **kwargs, ) diff --git a/QEfficient/utils/__init__.py b/QEfficient/utils/__init__.py index 86f563e05..2506b9233 100755 --- a/QEfficient/utils/__init__.py +++ b/QEfficient/utils/__init__.py @@ -17,7 +17,6 @@ get_qpc_dir_path, hf_download, load_hf_tokenizer, - load_qeff_model, login_and_download_hf_lm, onnx_exists, padding_check_and_fix, diff --git a/QEfficient/utils/_utils.py b/QEfficient/utils/_utils.py index ab899a56e..2729267d6 100644 --- a/QEfficient/utils/_utils.py +++ b/QEfficient/utils/_utils.py @@ -5,7 +5,6 @@ # # ----------------------------------------------------------------------------- -import importlib import json import os import subprocess @@ -14,8 +13,7 @@ import requests from huggingface_hub import login, snapshot_download from requests.exceptions import HTTPError -from transformers import AutoConfig, AutoTokenizer, PreTrainedTokenizer, PreTrainedTokenizerFast -from transformers.models.auto.modeling_auto import MODEL_FOR_CAUSAL_LM_MAPPING_NAMES +from transformers import AutoTokenizer, PreTrainedTokenizer, PreTrainedTokenizerFast from QEfficient.utils.constants import QEFF_MODELS_DIR, Constants from QEfficient.utils.logging_utils import logger @@ -396,43 +394,3 @@ def create_json(file_path: str, json_data: object): json.dump(json_data, file, indent=4) except Exception as e: print(f"Failed to create JSON File {file_path}: {e}") - - -def load_qeff_model( - model_name: str, - cache_dir: Optional[str] = None, - hf_token: Optional[str] = None, - local_model_dir: Optional[str] = None, - full_batch_size: Optional[int] = 
None, -): - """ - Loads the model using the QEfficient Modelling Class. - - ``Mandatory`` Args: - :model_name (str): Hugging Face Model Card name, Example: ``gpt2``. - ``Optional`` Args: - :cache_dir (str): Cache dir where downloaded HuggingFace files are stored. ``Defaults to None.`` - :tokenizer (Union[PreTrainedTokenizer, PreTrainedTokenizerFast]): Pass model tokenizer. ``Defaults to None.`` - :hf_token (str): HuggingFace login token to access private repos. ``Defaults to None.`` - :local_model_dir (str): Path to custom model weights and config files. ``Defaults to None.`` - :full_batch_size (int): Set full batch size to enable continuous batching mode. ``Defaults to None.`` - - """ - config = AutoConfig.from_pretrained(model_name) - architecture = config.architectures[0] if config.architectures else None - - module = importlib.import_module("QEfficient") - if architecture in MODEL_FOR_CAUSAL_LM_MAPPING_NAMES.values(): - model_class = getattr(module, "QEFFAutoModelForCausalLM") - else: - raise NotImplementedError( - f"Unknown architecture={architecture}, either use specific auto model class for loading the model or raise an issue for support!" - ) - - qeff_model = model_class.from_pretrained( - pretrained_model_name_or_path=(local_model_dir if local_model_dir else model_name), - cache_dir=cache_dir, - hf_token=hf_token, - full_batch_size=full_batch_size, - ) - return qeff_model From 6ade0fa99c2b97229d1d61776f3b5bd205507173 Mon Sep 17 00:00:00 2001 From: Onkar Chougule Date: Wed, 5 Feb 2025 16:37:19 +0530 Subject: [PATCH 7/7] removed extra code as part of deprecation Signed-off-by: Onkar Chougule --- .../exporter/export_hf_to_cloud_ai_100.py | 28 ++----------------- QEfficient/transformers/transform.py | 8 ++---- 2 files changed, 4 insertions(+), 32 deletions(-) diff --git a/QEfficient/exporter/export_hf_to_cloud_ai_100.py b/QEfficient/exporter/export_hf_to_cloud_ai_100.py index 276faf94c..6b6cbe18a 100644 --- a/QEfficient/exporter/export_hf_to_cloud_ai_100.py +++ b/QEfficient/exporter/export_hf_to_cloud_ai_100.py @@ -13,7 +13,7 @@ import torch from transformers import PreTrainedTokenizer, PreTrainedTokenizerFast -from QEfficient.base.common import AUTO_MODEL_MAP_TO_MODEL_TYPE_MAP, QEFF_MODEL_TYPE, QEFFCommonLoader +from QEfficient.base.common import QEFFCommonLoader from QEfficient.base.modeling_qeff import QEFFBaseModel from QEfficient.exporter.export_utils import export_onnx, fix_onnx_fp16, generate_input_files, run_model_on_ort from QEfficient.transformers.models.modeling_auto import QEFFAutoModelForCausalLM @@ -307,30 +307,6 @@ def export_kvstyle_transformed_model_to_onnx( return model_name -def export_for_cloud( - model_name: str, - qeff_model: QEFFBaseModel, - tokenizer: Union[PreTrainedTokenizer, PreTrainedTokenizerFast], - onnx_dir_path: str, - seq_length: int = Constants.SEQ_LEN, - full_batch_size: Optional[int] = None, -) -> str: - # FIXME: move all this to class instead of here, and just call qeff_model.export here. 
- if AUTO_MODEL_MAP_TO_MODEL_TYPE_MAP.get(qeff_model.__class__, None) == QEFF_MODEL_TYPE.CAUSALLM: # type: ignore - return export_lm_model_for_cloud( - model_name=model_name, - qeff_model=qeff_model, # type: ignore - tokenizer=tokenizer, - onnx_dir_path=onnx_dir_path, - seq_length=seq_length, - full_batch_size=full_batch_size, - ) - else: - raise NotImplementedError( - f"Only model type {QEFFAutoModelForCausalLM.__class__.__name__} is supported for export, got {type(qeff_model)}" - ) - - def export_lm_model_for_cloud( model_name: str, qeff_model: QEFFAutoModelForCausalLM, @@ -434,7 +410,7 @@ def qualcomm_efficient_converter( ) if form_factor == "cloud": - generated_onnx_model_path = export_for_cloud( + generated_onnx_model_path = export_lm_model_for_cloud( model_name=model_name, qeff_model=model_kv, tokenizer=tokenizer, diff --git a/QEfficient/transformers/transform.py b/QEfficient/transformers/transform.py index 8bb084fbf..f4024a1f3 100644 --- a/QEfficient/transformers/transform.py +++ b/QEfficient/transformers/transform.py @@ -10,7 +10,6 @@ import torch.nn as nn import transformers -from QEfficient.base.common import AUTO_MODEL_MAP_TO_MODEL_TYPE_MAP, QEFF_MODEL_TYPE from QEfficient.base.modeling_qeff import QEFFBaseModel from QEfficient.transformers.cache_utils import QEffDynamicCache from QEfficient.transformers.modeling_utils import TransformersToQEffModulesDict @@ -96,8 +95,5 @@ def transform(model: QEFFBaseModel, form_factor="cloud"): if form_factor != "cloud": raise ValueError("Only form_factor='cloud' is supported as of now!") # FIXME: move this to class and use model.transform() - if AUTO_MODEL_MAP_TO_MODEL_TYPE_MAP.get(model.__class__, None) == QEFF_MODEL_TYPE.CAUSALLM: - transform_lm(model.model) # type: ignore - return model - else: - raise NotImplementedError(f"Recieved unsupported class of type {type(model)}") + transform_lm(model.model) # type: ignore + return model
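
Taken together, the series replaces the low-level `qualcomm_efficient_converter` -> `QEfficient.compile` -> `cloud_ai_100_exec_kv` pipeline with the high-level auto classes, and patch 7 leaves `QEFFCommonLoader` as the single entry point for both `QEfficient/cloud/export.py` and `QEfficient/cloud/infer.py`. The sketch below mirrors that end state; the model card, prompt, and compile values are illustrative placeholders, while the calls and keyword arguments are the ones the diffs themselves pass.

```python
# Minimal sketch of the post-migration flow (end state of patch 7).
# Model card, prompt, and compile parameters are illustrative only.
from QEfficient.base.common import QEFFCommonLoader
from QEfficient.utils import load_hf_tokenizer

model_name = "gpt2"  # any architecture present in MODEL_FOR_CAUSAL_LM_MAPPING

# QEFFCommonLoader reads the architecture from AutoConfig and dispatches to
# QEFFAutoModelForCausalLM; unsupported architectures raise NotImplementedError.
qeff_model = QEFFCommonLoader.from_pretrained(
    pretrained_model_name_or_path=model_name,
    full_batch_size=None,  # set an int to enable continuous batching
)

# export.py path: produce the ONNX graph only.
onnx_path = qeff_model.export()

# infer.py path: compile is called directly, with no separate export step,
# exactly as the migrated infer.py does, then generate runs on the QPC.
qeff_model.compile(
    prefill_seq_len=32,
    ctx_len=128,
    num_cores=16,
    mxfp6_matmul=False,
    mxint8_kv_cache=False,
    num_devices=1,
)
tokenizer = load_hf_tokenizer(pretrained_model_name_or_path=model_name)
qeff_model.generate(tokenizer, prompts=["My name is"])
```

On the CLI side, `infer.py` now uses `parse_known_args()`, so any `--flag value` pair the parser does not recognize is collected into `compiler_options_dict` and forwarded to `compile()` as an extra keyword argument; the old `--mxfp6` and `--mxint8` spellings still work but emit deprecation warnings pointing to `--mxfp6-matmul` and `--mxint8-kv-cache`.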