From d267e40a62b6ac6511604ff5b927c4e115bda284 Mon Sep 17 00:00:00 2001
From: Asmita Goswami <quic_asmigosw@quicinc.com>
Date: Fri, 13 Dec 2024 06:38:22 +0000
Subject: [PATCH] Migrate HL compile, export and infer flows to QEFFAutoModelForCausalLM APIs
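
The export and infer CLI flows now go through the high-level
QEFFAutoModelForCausalLM API (from_pretrained -> export -> compile ->
generate) instead of the legacy qualcomm_efficient_converter,
QEfficient.compile and cloud_ai_100_exec_kv helpers; the explicit
onnx_exists/qpc_exists caching checks are dropped from the CLI layer
along with them.

End to end, the new flow looks roughly like this sketch (model name
and option values are illustrative only):

    from QEfficient.transformers.models.modeling_auto import QEFFAutoModelForCausalLM
    from QEfficient.utils import load_hf_tokenizer

    tokenizer = load_hf_tokenizer(pretrained_model_name_or_path="gpt2")
    qeff_model = QEFFAutoModelForCausalLM.from_pretrained("gpt2")
    onnx_path = qeff_model.export()  # PyTorch -> ONNX
    qpc_path = qeff_model.compile(   # ONNX -> qpc for Cloud AI 100
        prefill_seq_len=32,
        ctx_len=128,
        num_cores=16,
    )
    qeff_model.generate(tokenizer, prompts=["My name is"])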

Change-Id: If99f867d0ea0bead87f43fdcdb5537eda72a9db5
Signed-off-by: Asmita Goswami <quic_asmigosw@quicinc.com>
---
 QEfficient/cloud/export.py                    | 37 ++++++---------
 QEfficient/cloud/infer.py                     | 76 ++++++++++--------------
 .../transformers/models/modeling_auto.py      |  1 +
 3 files changed, 46 insertions(+), 68 deletions(-)
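
The legacy QEfficient.compile arguments map onto the new
QEFFAutoModelForCausalLM.compile kwargs as sketched below (CLI
parameter names from infer.py; onnx_path and qpc_path are resolved
internally by the auto class instead of being passed by the caller):

    qpc_path = qeff_model.compile(
        prefill_seq_len=prompt_len,   # was: prompt_len
        ctx_len=ctx_len,
        num_cores=num_cores,
        batch_size=batch_size,
        mxfp6_matmul=mxfp6,           # was: mxfp6
        mxint8_kv_cache=mxint8,       # was: mxint8
        aic_enable_depth_first=aic_enable_depth_first,
        mos=mos,
        num_devices=(1 if device_group is None else len(device_group)),  # was: device_group
        full_batch_size=full_batch_size,
        allow_mxint8_mdp_io=allow_mxint8_mdp_io,
    )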

diff --git a/QEfficient/cloud/export.py b/QEfficient/cloud/export.py
index 53184450e..700532f3c 100644
--- a/QEfficient/cloud/export.py
+++ b/QEfficient/cloud/export.py
@@ -11,8 +11,8 @@
 
 from transformers import PreTrainedTokenizer, PreTrainedTokenizerFast
 
-from QEfficient.exporter.export_hf_to_cloud_ai_100 import qualcomm_efficient_converter
-from QEfficient.utils import check_and_assign_cache_dir, onnx_exists
+from QEfficient.transformers.models.modeling_auto import QEFFAutoModelForCausalLM
+from QEfficient.utils import check_and_assign_cache_dir
 from QEfficient.utils.logging_utils import logger
 
 # Specifically for Docker images.
@@ -39,27 +39,18 @@ def get_onnx_model_path(
         :local_model_dir (str): Path to custom model weights and config files. ``Defaults to None.``
         :full_batch_size (int): Set full batch size to enable continuous batching mode. ``Defaults to None.``
     """
-    onnx_path_exists, onnx_dir_path, onnx_model_path = onnx_exists(model_name, full_batch_size)
-    if onnx_path_exists:
-        logger.info(f"Pre-exported ONNX files found at {onnx_dir_path}! Jumping to Compilation")
-    else:
-        ###################
-        # hf model -> export
-        ####################
-        # Export to the Onnx
-        logger.info(f"Exporting Pytorch {model_name} model to ONNX...")
-        _, onnx_model_path = qualcomm_efficient_converter(
-            model_name=model_name,
-            local_model_dir=local_model_dir,
-            tokenizer=tokenizer,
-            onnx_dir_path=onnx_dir_path,
-            kv=True,
-            form_factor="cloud",
-            hf_token=hf_token,
-            cache_dir=cache_dir,
-            full_batch_size=full_batch_size,
-        )  # type: ignore
-        logger.info(f"Generated onnx_path: {onnx_model_path}, onnx_dir_path: {onnx_dir_path}")
+    logger.info(f"Exporting PyTorch {model_name} model to ONNX...")
+    # Forward the caller's options rather than silently dropping them: token and
+    # cache_dir pass through to the underlying HF from_pretrained call, and a
+    # full_batch_size request implies continuous batching.
+    qeff_model = QEFFAutoModelForCausalLM.from_pretrained(
+        pretrained_model_name_or_path=(local_model_dir if local_model_dir else model_name),
+        token=hf_token,
+        cache_dir=cache_dir,
+        continuous_batching=(full_batch_size is not None),
+    )
+    onnx_model_path = qeff_model.export()
+    logger.info(f"Generated onnx_path: {onnx_model_path}")
     return onnx_model_path
 
 
diff --git a/QEfficient/cloud/infer.py b/QEfficient/cloud/infer.py
index 870005c91..0fadd7226 100644
--- a/QEfficient/cloud/infer.py
+++ b/QEfficient/cloud/infer.py
@@ -7,13 +7,10 @@
 
 import argparse
 import logging
-import os
 from typing import List, Optional
 
-import QEfficient
-from QEfficient.cloud.export import get_onnx_model_path
-from QEfficient.generation.text_generation_inference import cloud_ai_100_exec_kv
-from QEfficient.utils import check_and_assign_cache_dir, get_qpc_dir_path, load_hf_tokenizer, qpc_exists
+from QEfficient.transformers.models.modeling_auto import QEFFAutoModelForCausalLM
+from QEfficient.utils import check_and_assign_cache_dir, load_hf_tokenizer
 from QEfficient.utils.logging_utils import logger
 
 
@@ -75,51 +72,40 @@ def main(
         hf_token=hf_token,
     )
 
-    qpc_dir_path = get_qpc_dir_path(
-        model_name, num_cores, mos, batch_size, prompt_len, ctx_len, mxfp6, mxint8, device_group, full_batch_size
+    # Forward local_model_dir/hf_token/cache_dir instead of dropping them; a
+    # full_batch_size request implies continuous batching.
+    qeff_model = QEFFAutoModelForCausalLM.from_pretrained(
+        pretrained_model_name_or_path=(local_model_dir if local_model_dir else model_name),
+        token=hf_token,
+        cache_dir=cache_dir,
+        continuous_batching=(full_batch_size is not None),
+    )
+    #########
+    # Compile
+    #########
+    _ = qeff_model.compile(
+        prefill_seq_len=prompt_len,
+        ctx_len=ctx_len,
+        num_cores=num_cores,
+        mxfp6_matmul=mxfp6,
+        mxint8_kv_cache=mxint8,
+        aic_enable_depth_first=aic_enable_depth_first,
+        batch_size=batch_size,
+        mos=mos,
+        num_devices=(1 if device_group is None else len(device_group)),
+        full_batch_size=full_batch_size,
+        allow_mxint8_mdp_io=allow_mxint8_mdp_io,
     )
 
-    # Handle qpc generation
-    if qpc_exists(qpc_dir_path):
-        logger.info(f"Pre-compiled qpc found at {qpc_dir_path}! Executing with given prompt")
-    else:
-        # Handle onnx model generation
-        onnx_model_path = get_onnx_model_path(
-            model_name, cache_dir, tokenizer, hf_token, local_model_dir, full_batch_size
-        )  # , base_dir_name)
-
-        #########
-        # Compile
-        #########
-        _ = QEfficient.compile(
-            onnx_path=onnx_model_path,
-            qpc_path=os.path.dirname(
-                qpc_dir_path
-            ),  # We need to pass parent directory of qpc_dir_path, as the compile function handles the qpcs directory creation
-            num_cores=num_cores,
-            batch_size=batch_size,
-            prompt_len=prompt_len,
-            ctx_len=ctx_len,
-            mxfp6=mxfp6,
-            mxint8=mxint8,
-            aic_enable_depth_first=aic_enable_depth_first,
-            mos=mos,
-            device_group=device_group,
-            full_batch_size=full_batch_size,
-            allow_mxint8_mdp_io=allow_mxint8_mdp_io,
-        )
-
     #########
     # Execute
     #########
-    cloud_ai_100_exec_kv(
-        tokenizer=tokenizer,
-        qpc_path=qpc_dir_path,
-        device_id=device_group,
-        prompt=prompt,
-        prompts_txt_file_path=prompts_txt_file_path,
-        generation_len=generation_len,
-    )
+    _ = qeff_model.generate(
+        tokenizer,
+        prompts=prompt,
+        device_id=device_group,
+        generation_len=generation_len,
+    )
 
 
 if __name__ == "__main__":
diff --git a/QEfficient/transformers/models/modeling_auto.py b/QEfficient/transformers/models/modeling_auto.py
index d0bb4285f..3e6c47569 100644
--- a/QEfficient/transformers/models/modeling_auto.py
+++ b/QEfficient/transformers/models/modeling_auto.py
@@ -281,6 +281,7 @@ def compile(
             :num_speculative_tokens (int, optional): Number of speculative tokens to take as input for Speculative Decoding Target Language Model.
             :mos (int, optional): Effort level to reduce on-chip memory. Defaults to -1, meaning no effort. ``Defaults to -1``.
             :aic_enable_depth_first (bool, optional): Enables DFS with default memory size. ``Defaults to False``.
+            :allow_mxint8_mdp_io (bool, optional): Allows MXINT8 compression of MDP IO traffic. ``Defaults to False``.
 
         Returns:
             :str: Path of the compiled ``qpc`` package.