From d267e40a62b6ac6511604ff5b927c4e115bda284 Mon Sep 17 00:00:00 2001 From: Asmita Goswami <quic_asmigosw@quicinc.com> Date: Fri, 13 Dec 2024 06:38:22 +0000 Subject: [PATCH] Migrating HL compile and export to infer APIs Change-Id: If99f867d0ea0bead87f43fdcdb5537eda72a9db5 Signed-off-by: Asmita Goswami <quic_asmigosw@quicinc.com> --- QEfficient/cloud/export.py | 29 ++------- QEfficient/cloud/infer.py | 63 ++++++------------- .../transformers/models/modeling_auto.py | 1 + 3 files changed, 25 insertions(+), 68 deletions(-) diff --git a/QEfficient/cloud/export.py b/QEfficient/cloud/export.py index 53184450e..700532f3c 100644 --- a/QEfficient/cloud/export.py +++ b/QEfficient/cloud/export.py @@ -11,8 +11,8 @@ from transformers import PreTrainedTokenizer, PreTrainedTokenizerFast -from QEfficient.exporter.export_hf_to_cloud_ai_100 import qualcomm_efficient_converter -from QEfficient.utils import check_and_assign_cache_dir, onnx_exists +from QEfficient.transformers.models.modeling_auto import QEFFAutoModelForCausalLM +from QEfficient.utils import check_and_assign_cache_dir from QEfficient.utils.logging_utils import logger # Specifically for Docker images. @@ -39,27 +39,10 @@ def get_onnx_model_path( :local_model_dir (str): Path to custom model weights and config files. ``Defaults to None.`` :full_batch_size (int): Set full batch size to enable continuous batching mode. ``Defaults to None.`` """ - onnx_path_exists, onnx_dir_path, onnx_model_path = onnx_exists(model_name, full_batch_size) - if onnx_path_exists: - logger.info(f"Pre-exported ONNX files found at {onnx_dir_path}! 
Jumping to Compilation") - else: - ################### - # hf model -> export - #################### - # Export to the Onnx - logger.info(f"Exporting Pytorch {model_name} model to ONNX...") - _, onnx_model_path = qualcomm_efficient_converter( - model_name=model_name, - local_model_dir=local_model_dir, - tokenizer=tokenizer, - onnx_dir_path=onnx_dir_path, - kv=True, - form_factor="cloud", - hf_token=hf_token, - cache_dir=cache_dir, - full_batch_size=full_batch_size, - ) # type: ignore - logger.info(f"Generated onnx_path: {onnx_model_path}, onnx_dir_path: {onnx_dir_path}") + logger.info(f"Exporting Pytorch {model_name} model to ONNX...") + qeff_model = QEFFAutoModelForCausalLM.from_pretrained(model_name) + onnx_model_path = qeff_model.export() + logger.info(f"Generated onnx_path: {onnx_model_path}") return onnx_model_path diff --git a/QEfficient/cloud/infer.py b/QEfficient/cloud/infer.py index 870005c91..0fadd7226 100644 --- a/QEfficient/cloud/infer.py +++ b/QEfficient/cloud/infer.py @@ -7,13 +7,10 @@ import argparse import logging -import os from typing import List, Optional -import QEfficient -from QEfficient.cloud.export import get_onnx_model_path -from QEfficient.generation.text_generation_inference import cloud_ai_100_exec_kv -from QEfficient.utils import check_and_assign_cache_dir, get_qpc_dir_path, load_hf_tokenizer, qpc_exists +from QEfficient.transformers.models.modeling_auto import QEFFAutoModelForCausalLM +from QEfficient.utils import check_and_assign_cache_dir, load_hf_tokenizer from QEfficient.utils.logging_utils import logger @@ -75,51 +72,27 @@ def main( hf_token=hf_token, ) - qpc_dir_path = get_qpc_dir_path( - model_name, num_cores, mos, batch_size, prompt_len, ctx_len, mxfp6, mxint8, device_group, full_batch_size + qeff_model = QEFFAutoModelForCausalLM.from_pretrained(model_name) + ######### + # Compile + ######### + _ = qeff_model.compile( + prefill_seq_len=prompt_len, + ctx_len=ctx_len, + num_cores=num_cores, + mxfp6_matmul=mxfp6, + 
aic_enable_depth_first=aic_enable_depth_first, + batch_size=batch_size, + mos=mos, + num_devices=len(device_group), + full_batch_size=full_batch_size, + allow_mxint8_mdp_io=allow_mxint8_mdp_io, ) - # Handle qpc generation - if qpc_exists(qpc_dir_path): - logger.info(f"Pre-compiled qpc found at {qpc_dir_path}! Executing with given prompt") - else: - # Handle onnx model generation - onnx_model_path = get_onnx_model_path( - model_name, cache_dir, tokenizer, hf_token, local_model_dir, full_batch_size - ) # , base_dir_name) - - ######### - # Compile - ######### - _ = QEfficient.compile( - onnx_path=onnx_model_path, - qpc_path=os.path.dirname( - qpc_dir_path - ), # We need to pass parent directory of qpc_dir_path, as the compile function handles the qpcs directory creation - num_cores=num_cores, - batch_size=batch_size, - prompt_len=prompt_len, - ctx_len=ctx_len, - mxfp6=mxfp6, - mxint8=mxint8, - aic_enable_depth_first=aic_enable_depth_first, - mos=mos, - device_group=device_group, - full_batch_size=full_batch_size, - allow_mxint8_mdp_io=allow_mxint8_mdp_io, - ) - ######### # Execute ######### - cloud_ai_100_exec_kv( - tokenizer=tokenizer, - qpc_path=qpc_dir_path, - device_id=device_group, - prompt=prompt, - prompts_txt_file_path=prompts_txt_file_path, - generation_len=generation_len, - ) + _ = qeff_model.generate(tokenizer, prompts=prompt) if __name__ == "__main__": diff --git a/QEfficient/transformers/models/modeling_auto.py b/QEfficient/transformers/models/modeling_auto.py index d0bb4285f..3e6c47569 100644 --- a/QEfficient/transformers/models/modeling_auto.py +++ b/QEfficient/transformers/models/modeling_auto.py @@ -281,6 +281,7 @@ def compile( :num_speculative_tokens (int, optional): Number of speculative tokens to take as input for Speculative Decoding Target Language Model. :mos (int, optional): Effort level to reduce on-chip memory. Defaults to -1, meaning no effort. ``Defaults to -1``. 
:aic_enable_depth_first (bool, optional): Enables DFS with default memory size. ``Defaults to False``. + :allow_mxint8_mdp_io (bool, optional): Allows MXINT8 compression of MDP IO traffic. ``Defaults to False``. Returns: :str: Path of the compiled ``qpc`` package.