From 12da558dc912a3d4a60c3d246bd5c2114c55596e Mon Sep 17 00:00:00 2001 From: Asmita Goswami Date: Wed, 8 Jan 2025 08:01:57 +0000 Subject: [PATCH 1/7] Migrating HL compile and export to infer APIs Change-Id: If27fbc1636ed1fe9b475d07cef7c83ed7dc46ca8 Signed-off-by: Asmita Goswami --- QEfficient/cloud/export.py | 29 +---- QEfficient/cloud/infer.py | 108 +++++++++--------- .../transformers/models/modeling_auto.py | 1 + 3 files changed, 61 insertions(+), 77 deletions(-) diff --git a/QEfficient/cloud/export.py b/QEfficient/cloud/export.py index 53184450e..7e9442411 100644 --- a/QEfficient/cloud/export.py +++ b/QEfficient/cloud/export.py @@ -11,8 +11,8 @@ from transformers import PreTrainedTokenizer, PreTrainedTokenizerFast -from QEfficient.exporter.export_hf_to_cloud_ai_100 import qualcomm_efficient_converter -from QEfficient.utils import check_and_assign_cache_dir, onnx_exists +from QEfficient.transformers.models.modeling_auto import QEFFAutoModelForCausalLM +from QEfficient.utils import check_and_assign_cache_dir from QEfficient.utils.logging_utils import logger # Specifically for Docker images. @@ -39,27 +39,10 @@ def get_onnx_model_path( :local_model_dir (str): Path to custom model weights and config files. ``Defaults to None.`` :full_batch_size (int): Set full batch size to enable continuous batching mode. ``Defaults to None.`` """ - onnx_path_exists, onnx_dir_path, onnx_model_path = onnx_exists(model_name, full_batch_size) - if onnx_path_exists: - logger.info(f"Pre-exported ONNX files found at {onnx_dir_path}! Jumping to Compilation") - else: - ################### - # hf model -> export - #################### - # Export to the Onnx - logger.info(f"Exporting Pytorch {model_name} model to ONNX...") - _, onnx_model_path = qualcomm_efficient_converter( - model_name=model_name, - local_model_dir=local_model_dir, - tokenizer=tokenizer, - onnx_dir_path=onnx_dir_path, - kv=True, - form_factor="cloud", - hf_token=hf_token, - cache_dir=cache_dir, - full_batch_size=full_batch_size, - ) # type: ignore - logger.info(f"Generated onnx_path: {onnx_model_path}, onnx_dir_path: {onnx_dir_path}") + logger.info(f"Exporting Pytorch {model_name} model to ONNX...") + qeff_model = QEFFAutoModelForCausalLM.from_pretrained(model_name, cache_dir) + onnx_model_path = qeff_model.export() + logger.info(f"Generated onnx_path: {onnx_model_path}") return onnx_model_path diff --git a/QEfficient/cloud/infer.py b/QEfficient/cloud/infer.py index 0ba0961e3..1a732db9c 100644 --- a/QEfficient/cloud/infer.py +++ b/QEfficient/cloud/infer.py @@ -7,13 +7,11 @@ import argparse import logging -import os +import sys from typing import List, Optional -import QEfficient -from QEfficient.cloud.export import get_onnx_model_path -from QEfficient.generation.text_generation_inference import cloud_ai_100_exec_kv -from QEfficient.utils import check_and_assign_cache_dir, get_qpc_dir_path, load_hf_tokenizer, qpc_exists +from QEfficient.transformers.models.modeling_auto import QEFFAutoModelForCausalLM +from QEfficient.utils import check_and_assign_cache_dir, load_hf_tokenizer from QEfficient.utils.logging_utils import logger @@ -38,6 +36,7 @@ def main( allow_mxint8_mdp_io: bool = False, enable_qnn: Optional[bool] = False, qnn_config: Optional[str] = None, + **kwargs, ) -> None: """ 1. 
Check if compiled qpc for given config already exists, if it does jump to execute, else @@ -79,58 +78,43 @@ def main( hf_token=hf_token, ) - qpc_dir_path = get_qpc_dir_path( - model_name, - num_cores, - mos, - batch_size, - prompt_len, - ctx_len, - mxfp6, - mxint8, - device_group, - full_batch_size, - enable_qnn=enable_qnn, - ) + if enable_qnn and qnn_config is not None: + logger.error("QNN compilation is currently not supported in High Level APIs of QEFFAutoModelForCausalLM.") - # Handle qpc generation - if qpc_exists(qpc_dir_path): - logger.info(f"Pre-compiled qpc found at {qpc_dir_path}! Executing with given prompt") - else: - # Handle onnx model generation - onnx_model_path = get_onnx_model_path( - model_name, cache_dir, tokenizer, hf_token, local_model_dir, full_batch_size - ) # , base_dir_name) + if "--mxfp6" in sys.argv: + if args.mxfp6: + logger.warning("mxfp6 is going to be deprecated in a future release, use -mxfp6_matmul instead.") + if "--mxint8" in sys.argv: + if args.mxint8: + logger.warning("mxint8 is going to be deprecated in a future release, use -mxint8_kv_cache instead.") - ######### - # Compile - ######### - _ = QEfficient.compile( - onnx_path=onnx_model_path, - qpc_path=os.path.dirname( - qpc_dir_path - ), # We need to pass parent directory of qpc_dir_path, as the compile function handles the qpcs directory creation - num_cores=num_cores, - batch_size=batch_size, - prompt_len=prompt_len, - ctx_len=ctx_len, - mxfp6=mxfp6, - mxint8=mxint8, - aic_enable_depth_first=aic_enable_depth_first, - mos=mos, - device_group=device_group, - full_batch_size=full_batch_size, - allow_mxint8_mdp_io=allow_mxint8_mdp_io, - enable_qnn=enable_qnn, - qnn_config=qnn_config, - ) + qeff_model = QEFFAutoModelForCausalLM.from_pretrained(model_name) + + ######### + # Compile + ######### + _ = qeff_model.compile( + prefill_seq_len=prompt_len, + ctx_len=ctx_len, + num_cores=num_cores, + mxfp6_matmul=mxfp6, + aic_enable_depth_first=aic_enable_depth_first, + batch_size=batch_size, + mos=mos, + mxint8_kv_cache=mxint8, + num_devices=(0 if device_group is None else len(device_group)), + full_batch_size=full_batch_size, + allow_mxint8_mdp_io=allow_mxint8_mdp_io, + enable_qnn=enable_qnn, + **kwargs, + ) ######### # Execute ######### - cloud_ai_100_exec_kv( - tokenizer=tokenizer, - qpc_path=qpc_dir_path, + _ = qeff_model.generate( + tokenizer, + prompts=prompt, device_id=device_group, prompt=prompt, prompts_txt_file_path=prompts_txt_file_path, @@ -162,10 +146,16 @@ def main( ) parser.add_argument("--ctx-len", "--ctx_len", default=128, type=int, help="Context length for text generation.") parser.add_argument( - "--mxfp6", action="store_true", help="Compress constant MatMul weights to MXFP6 E2M3, default is no compression" + "--mxfp6", + "--mxfp6_matmul", + "--mxfp6-matmul", + action="store_true", + help="Compress constant MatMul weights to MXFP6 E2M3, default is no compression", ) parser.add_argument( "--mxint8", + "--mxint8_kv_cache", + "--mxint8-kv-cache", action="store_true", help="Compress Present/Past KV to MXINT8 using CustomIO config, default is False", ) @@ -237,8 +227,18 @@ def main( type=str, ) - args = parser.parse_args() + args, compiler_options = parser.parse_known_args() + compiler_options_dict = {} + for i in range(0, len(compiler_options)): + if compiler_options[i].startswith("--"): + key = compiler_options[i].lstrip("-") + value = ( + compiler_options[i + 1] + if i + 1 < len(compiler_options) and not compiler_options[i + 1].startswith("-") + else True + ) + compiler_options_dict[key] = value 
if args.verbose: logger.setLevel(logging.INFO) del args.verbose # type: ignore - main(**args.__dict__) + main(**args.__dict__, **compiler_options_dict) diff --git a/QEfficient/transformers/models/modeling_auto.py b/QEfficient/transformers/models/modeling_auto.py index c2e3777bc..51e56c83e 100644 --- a/QEfficient/transformers/models/modeling_auto.py +++ b/QEfficient/transformers/models/modeling_auto.py @@ -278,6 +278,7 @@ def compile( :aic_enable_depth_first (bool, optional): Enables DFS with default memory size. ``Defaults to False``. :enable_qnn (bool): Enables QNN Compilation. ``Defaults to False.`` :qnn_config (str): Path of QNN Config parameters file. ``Defaults to None.`` + :allow_mxint8_mdp_io (bool, optional): Allows MXINT8 compression of MDP IO traffic. ``Defaults to False.`` Returns: :str: Path of the compiled ``qpc`` package. From d3a97ca672dc2d86b13e27376c317ed3e4946b5b Mon Sep 17 00:00:00 2001 From: Asmita Goswami Date: Wed, 8 Jan 2025 08:01:57 +0000 Subject: [PATCH 2/7] Migrating HL compile and export to infer APIs Change-Id: If27fbc1636ed1fe9b475d07cef7c83ed7dc46ca8 Signed-off-by: Asmita Goswami --- QEfficient/cloud/export.py | 29 +---- QEfficient/cloud/infer.py | 108 +++++++++--------- .../transformers/models/modeling_auto.py | 1 + 3 files changed, 61 insertions(+), 77 deletions(-) diff --git a/QEfficient/cloud/export.py b/QEfficient/cloud/export.py index 53184450e..7e9442411 100644 --- a/QEfficient/cloud/export.py +++ b/QEfficient/cloud/export.py @@ -11,8 +11,8 @@ from transformers import PreTrainedTokenizer, PreTrainedTokenizerFast -from QEfficient.exporter.export_hf_to_cloud_ai_100 import qualcomm_efficient_converter -from QEfficient.utils import check_and_assign_cache_dir, onnx_exists +from QEfficient.transformers.models.modeling_auto import QEFFAutoModelForCausalLM +from QEfficient.utils import check_and_assign_cache_dir from QEfficient.utils.logging_utils import logger # Specifically for Docker images. @@ -39,27 +39,10 @@ def get_onnx_model_path( :local_model_dir (str): Path to custom model weights and config files. ``Defaults to None.`` :full_batch_size (int): Set full batch size to enable continuous batching mode. ``Defaults to None.`` """ - onnx_path_exists, onnx_dir_path, onnx_model_path = onnx_exists(model_name, full_batch_size) - if onnx_path_exists: - logger.info(f"Pre-exported ONNX files found at {onnx_dir_path}! 
Jumping to Compilation") - else: - ################### - # hf model -> export - #################### - # Export to the Onnx - logger.info(f"Exporting Pytorch {model_name} model to ONNX...") - _, onnx_model_path = qualcomm_efficient_converter( - model_name=model_name, - local_model_dir=local_model_dir, - tokenizer=tokenizer, - onnx_dir_path=onnx_dir_path, - kv=True, - form_factor="cloud", - hf_token=hf_token, - cache_dir=cache_dir, - full_batch_size=full_batch_size, - ) # type: ignore - logger.info(f"Generated onnx_path: {onnx_model_path}, onnx_dir_path: {onnx_dir_path}") + logger.info(f"Exporting Pytorch {model_name} model to ONNX...") + qeff_model = QEFFAutoModelForCausalLM.from_pretrained(model_name, cache_dir) + onnx_model_path = qeff_model.export() + logger.info(f"Generated onnx_path: {onnx_model_path}") return onnx_model_path diff --git a/QEfficient/cloud/infer.py b/QEfficient/cloud/infer.py index 0ba0961e3..1a732db9c 100644 --- a/QEfficient/cloud/infer.py +++ b/QEfficient/cloud/infer.py @@ -7,13 +7,11 @@ import argparse import logging -import os +import sys from typing import List, Optional -import QEfficient -from QEfficient.cloud.export import get_onnx_model_path -from QEfficient.generation.text_generation_inference import cloud_ai_100_exec_kv -from QEfficient.utils import check_and_assign_cache_dir, get_qpc_dir_path, load_hf_tokenizer, qpc_exists +from QEfficient.transformers.models.modeling_auto import QEFFAutoModelForCausalLM +from QEfficient.utils import check_and_assign_cache_dir, load_hf_tokenizer from QEfficient.utils.logging_utils import logger @@ -38,6 +36,7 @@ def main( allow_mxint8_mdp_io: bool = False, enable_qnn: Optional[bool] = False, qnn_config: Optional[str] = None, + **kwargs, ) -> None: """ 1. Check if compiled qpc for given config already exists, if it does jump to execute, else @@ -79,58 +78,43 @@ def main( hf_token=hf_token, ) - qpc_dir_path = get_qpc_dir_path( - model_name, - num_cores, - mos, - batch_size, - prompt_len, - ctx_len, - mxfp6, - mxint8, - device_group, - full_batch_size, - enable_qnn=enable_qnn, - ) + if enable_qnn and qnn_config is not None: + logger.error("QNN compilation is currently not supported in High Level APIs of QEFFAutoModelForCausalLM.") - # Handle qpc generation - if qpc_exists(qpc_dir_path): - logger.info(f"Pre-compiled qpc found at {qpc_dir_path}! 
Executing with given prompt") - else: - # Handle onnx model generation - onnx_model_path = get_onnx_model_path( - model_name, cache_dir, tokenizer, hf_token, local_model_dir, full_batch_size - ) # , base_dir_name) + if "--mxfp6" in sys.argv: + if args.mxfp6: + logger.warning("mxfp6 is going to be deprecated in a future release, use -mxfp6_matmul instead.") + if "--mxint8" in sys.argv: + if args.mxint8: + logger.warning("mxint8 is going to be deprecated in a future release, use -mxint8_kv_cache instead.") - ######### - # Compile - ######### - _ = QEfficient.compile( - onnx_path=onnx_model_path, - qpc_path=os.path.dirname( - qpc_dir_path - ), # We need to pass parent directory of qpc_dir_path, as the compile function handles the qpcs directory creation - num_cores=num_cores, - batch_size=batch_size, - prompt_len=prompt_len, - ctx_len=ctx_len, - mxfp6=mxfp6, - mxint8=mxint8, - aic_enable_depth_first=aic_enable_depth_first, - mos=mos, - device_group=device_group, - full_batch_size=full_batch_size, - allow_mxint8_mdp_io=allow_mxint8_mdp_io, - enable_qnn=enable_qnn, - qnn_config=qnn_config, - ) + qeff_model = QEFFAutoModelForCausalLM.from_pretrained(model_name) + + ######### + # Compile + ######### + _ = qeff_model.compile( + prefill_seq_len=prompt_len, + ctx_len=ctx_len, + num_cores=num_cores, + mxfp6_matmul=mxfp6, + aic_enable_depth_first=aic_enable_depth_first, + batch_size=batch_size, + mos=mos, + mxint8_kv_cache=mxint8, + num_devices=(0 if device_group is None else len(device_group)), + full_batch_size=full_batch_size, + allow_mxint8_mdp_io=allow_mxint8_mdp_io, + enable_qnn=enable_qnn, + **kwargs, + ) ######### # Execute ######### - cloud_ai_100_exec_kv( - tokenizer=tokenizer, - qpc_path=qpc_dir_path, + _ = qeff_model.generate( + tokenizer, + prompts=prompt, device_id=device_group, prompt=prompt, prompts_txt_file_path=prompts_txt_file_path, @@ -162,10 +146,16 @@ def main( ) parser.add_argument("--ctx-len", "--ctx_len", default=128, type=int, help="Context length for text generation.") parser.add_argument( - "--mxfp6", action="store_true", help="Compress constant MatMul weights to MXFP6 E2M3, default is no compression" + "--mxfp6", + "--mxfp6_matmul", + "--mxfp6-matmul", + action="store_true", + help="Compress constant MatMul weights to MXFP6 E2M3, default is no compression", ) parser.add_argument( "--mxint8", + "--mxint8_kv_cache", + "--mxint8-kv-cache", action="store_true", help="Compress Present/Past KV to MXINT8 using CustomIO config, default is False", ) @@ -237,8 +227,18 @@ def main( type=str, ) - args = parser.parse_args() + args, compiler_options = parser.parse_known_args() + compiler_options_dict = {} + for i in range(0, len(compiler_options)): + if compiler_options[i].startswith("--"): + key = compiler_options[i].lstrip("-") + value = ( + compiler_options[i + 1] + if i + 1 < len(compiler_options) and not compiler_options[i + 1].startswith("-") + else True + ) + compiler_options_dict[key] = value if args.verbose: logger.setLevel(logging.INFO) del args.verbose # type: ignore - main(**args.__dict__) + main(**args.__dict__, **compiler_options_dict) diff --git a/QEfficient/transformers/models/modeling_auto.py b/QEfficient/transformers/models/modeling_auto.py index c2e3777bc..51e56c83e 100644 --- a/QEfficient/transformers/models/modeling_auto.py +++ b/QEfficient/transformers/models/modeling_auto.py @@ -278,6 +278,7 @@ def compile( :aic_enable_depth_first (bool, optional): Enables DFS with default memory size. ``Defaults to False``. :enable_qnn (bool): Enables QNN Compilation. 
``Defaults to False.`` :qnn_config (str): Path of QNN Config parameters file. ``Defaults to None.`` + :allow_mxint8_mdp_io (bool, optional): Allows MXINT8 compression of MDP IO traffic. ``Defaults to False.`` Returns: :str: Path of the compiled ``qpc`` package. From 9ed96bea74d568c870afa389be5aad44afe604f4 Mon Sep 17 00:00:00 2001 From: Asmita Goswami Date: Tue, 28 Jan 2025 10:14:58 +0000 Subject: [PATCH 3/7] Made modelling class generic in infer Signed-off-by: Asmita Goswami --- QEfficient/cloud/export.py | 14 +++++++------- QEfficient/cloud/infer.py | 26 +++++++++++++++++++++++++- 2 files changed, 32 insertions(+), 8 deletions(-) diff --git a/QEfficient/cloud/export.py b/QEfficient/cloud/export.py index 7e9442411..bf83798d4 100644 --- a/QEfficient/cloud/export.py +++ b/QEfficient/cloud/export.py @@ -7,9 +7,7 @@ import argparse import os -from typing import Optional, Union - -from transformers import PreTrainedTokenizer, PreTrainedTokenizerFast +from typing import Optional from QEfficient.transformers.models.modeling_auto import QEFFAutoModelForCausalLM from QEfficient.utils import check_and_assign_cache_dir @@ -22,9 +20,7 @@ def get_onnx_model_path( model_name: str, cache_dir: Optional[str] = None, - tokenizer: Optional[Union[PreTrainedTokenizerFast, PreTrainedTokenizer]] = None, hf_token: Optional[str] = None, - local_model_dir: Optional[str] = None, full_batch_size: Optional[int] = None, ): """ @@ -40,7 +36,12 @@ def get_onnx_model_path( :full_batch_size (int): Set full batch size to enable continuous batching mode. ``Defaults to None.`` """ logger.info(f"Exporting Pytorch {model_name} model to ONNX...") - qeff_model = QEFFAutoModelForCausalLM.from_pretrained(model_name, cache_dir) + qeff_model = QEFFAutoModelForCausalLM.from_pretrained( + model_name, + cache_dir, + hf_token=hf_token, + full_batch_size=full_batch_size, + ) onnx_model_path = qeff_model.export() logger.info(f"Generated onnx_path: {onnx_model_path}") return onnx_model_path @@ -75,7 +76,6 @@ def main( model_name=model_name, cache_dir=cache_dir, hf_token=hf_token, - local_model_dir=local_model_dir, full_batch_size=full_batch_size, ) diff --git a/QEfficient/cloud/infer.py b/QEfficient/cloud/infer.py index 1a732db9c..509d5e7f0 100644 --- a/QEfficient/cloud/infer.py +++ b/QEfficient/cloud/infer.py @@ -10,10 +10,26 @@ import sys from typing import List, Optional +from transformers import AutoConfig + from QEfficient.transformers.models.modeling_auto import QEFFAutoModelForCausalLM from QEfficient.utils import check_and_assign_cache_dir, load_hf_tokenizer from QEfficient.utils.logging_utils import logger +# Map model's architecture to class +architecture_mapping = { + "LlamaForCausalLM": QEFFAutoModelForCausalLM, + "GPT2LMHeadModel": QEFFAutoModelForCausalLM, + "MistralForCausalLM": QEFFAutoModelForCausalLM, + "FalconForCausalLM": QEFFAutoModelForCausalLM, + "GPTJForCausalLM": QEFFAutoModelForCausalLM, + "GemmaForCausalLM": QEFFAutoModelForCausalLM, + "Gemma2ForCausalLM": QEFFAutoModelForCausalLM, + "Phi3ForCausalLM": QEFFAutoModelForCausalLM, + "Qwen2ForCausalLM": QEFFAutoModelForCausalLM, + "GPTBigCodeForCausalLM": QEFFAutoModelForCausalLM, +} + def main( model_name: str, @@ -88,7 +104,15 @@ def main( if args.mxint8: logger.warning("mxint8 is going to be deprecated in a future release, use -mxint8_kv_cache instead.") - qeff_model = QEFFAutoModelForCausalLM.from_pretrained(model_name) + config = AutoConfig.from_pretrained(model_name) + architecture = config.architectures[0] if config.architectures else None + + model_class = 
architecture_mapping.get(architecture) + if not model_class: + logger.error(f"Model class for model name {model_name} not found in mapping") + return + + qeff_model = model_class.from_pretrained(model_name) ######### # Compile From 55b753c3dab4b0bd2052077ccfb727254a9e72fd Mon Sep 17 00:00:00 2001 From: Asmita Goswami Date: Wed, 29 Jan 2025 10:21:57 +0000 Subject: [PATCH 4/7] Made modelling class generic Signed-off-by: Asmita Goswami --- QEfficient/cloud/export.py | 21 ++++++++++++++++++--- QEfficient/cloud/infer.py | 27 ++++++++++----------------- 2 files changed, 28 insertions(+), 20 deletions(-) diff --git a/QEfficient/cloud/export.py b/QEfficient/cloud/export.py index bf83798d4..46a50ff6c 100644 --- a/QEfficient/cloud/export.py +++ b/QEfficient/cloud/export.py @@ -9,6 +9,9 @@ import os from typing import Optional +from transformers import AutoConfig +from transformers.models.auto.modeling_auto import MODEL_FOR_CAUSAL_LM_MAPPING_NAMES + from QEfficient.transformers.models.modeling_auto import QEFFAutoModelForCausalLM from QEfficient.utils import check_and_assign_cache_dir from QEfficient.utils.logging_utils import logger @@ -22,6 +25,7 @@ def get_onnx_model_path( cache_dir: Optional[str] = None, hf_token: Optional[str] = None, full_batch_size: Optional[int] = None, + local_model_dir: Optional[str] = None, ): """ exports the model to onnx if pre-exported file is not found and returns onnx_model_path @@ -36,9 +40,19 @@ def get_onnx_model_path( :full_batch_size (int): Set full batch size to enable continuous batching mode. ``Defaults to None.`` """ logger.info(f"Exporting Pytorch {model_name} model to ONNX...") - qeff_model = QEFFAutoModelForCausalLM.from_pretrained( - model_name, - cache_dir, + + config = AutoConfig.from_pretrained(model_name) + architecture = config.architectures[0] if config.architectures else None + + if architecture in MODEL_FOR_CAUSAL_LM_MAPPING_NAMES.values(): + model_class = QEFFAutoModelForCausalLM + else: + logger.error(f"Model class for model name {model_name} not found in mapping") + return + + qeff_model = model_class.from_pretrained( + pretrained_model_name_or_path=(local_model_dir if local_model_dir else model_name), + cache_dir=cache_dir, hf_token=hf_token, full_batch_size=full_batch_size, ) @@ -77,6 +91,7 @@ def main( cache_dir=cache_dir, hf_token=hf_token, full_batch_size=full_batch_size, + local_model_dir=local_model_dir, ) diff --git a/QEfficient/cloud/infer.py b/QEfficient/cloud/infer.py index 509d5e7f0..a79227748 100644 --- a/QEfficient/cloud/infer.py +++ b/QEfficient/cloud/infer.py @@ -11,25 +11,12 @@ from typing import List, Optional from transformers import AutoConfig +from transformers.models.auto.modeling_auto import MODEL_FOR_CAUSAL_LM_MAPPING_NAMES from QEfficient.transformers.models.modeling_auto import QEFFAutoModelForCausalLM from QEfficient.utils import check_and_assign_cache_dir, load_hf_tokenizer from QEfficient.utils.logging_utils import logger -# Map model's architecture to class -architecture_mapping = { - "LlamaForCausalLM": QEFFAutoModelForCausalLM, - "GPT2LMHeadModel": QEFFAutoModelForCausalLM, - "MistralForCausalLM": QEFFAutoModelForCausalLM, - "FalconForCausalLM": QEFFAutoModelForCausalLM, - "GPTJForCausalLM": QEFFAutoModelForCausalLM, - "GemmaForCausalLM": QEFFAutoModelForCausalLM, - "Gemma2ForCausalLM": QEFFAutoModelForCausalLM, - "Phi3ForCausalLM": QEFFAutoModelForCausalLM, - "Qwen2ForCausalLM": QEFFAutoModelForCausalLM, - "GPTBigCodeForCausalLM": QEFFAutoModelForCausalLM, -} - def main( model_name: str, @@ -107,12 +94,18 @@ def 
main( config = AutoConfig.from_pretrained(model_name) architecture = config.architectures[0] if config.architectures else None - model_class = architecture_mapping.get(architecture) - if not model_class: + if architecture in MODEL_FOR_CAUSAL_LM_MAPPING_NAMES.values(): + model_class = QEFFAutoModelForCausalLM + else: logger.error(f"Model class for model name {model_name} not found in mapping") return - qeff_model = model_class.from_pretrained(model_name) + qeff_model = model_class.from_pretrained( + pretrained_model_name_or_path=(local_model_dir if local_model_dir else model_name), + cache_dir=cache_dir, + hf_token=hf_token, + full_batch_size=full_batch_size, + ) ######### # Compile From 292fe7389df70ec1bcad613b610610bb2d9a7141 Mon Sep 17 00:00:00 2001 From: Asmita Goswami Date: Wed, 29 Jan 2025 11:20:00 +0000 Subject: [PATCH 5/7] Added load qeff model Signed-off-by: Asmita Goswami --- QEfficient/cloud/export.py | 26 ++++++--------------- QEfficient/cloud/infer.py | 26 ++++++--------------- QEfficient/utils/__init__.py | 1 + QEfficient/utils/_utils.py | 44 +++++++++++++++++++++++++++++++++++- 4 files changed, 58 insertions(+), 39 deletions(-) diff --git a/QEfficient/cloud/export.py b/QEfficient/cloud/export.py index 46a50ff6c..f3604c0f2 100644 --- a/QEfficient/cloud/export.py +++ b/QEfficient/cloud/export.py @@ -9,11 +9,7 @@ import os from typing import Optional -from transformers import AutoConfig -from transformers.models.auto.modeling_auto import MODEL_FOR_CAUSAL_LM_MAPPING_NAMES - -from QEfficient.transformers.models.modeling_auto import QEFFAutoModelForCausalLM -from QEfficient.utils import check_and_assign_cache_dir +from QEfficient.utils import check_and_assign_cache_dir, load_qeff_model from QEfficient.utils.logging_utils import logger # Specifically for Docker images. 
@@ -41,20 +37,12 @@ def get_onnx_model_path( """ logger.info(f"Exporting Pytorch {model_name} model to ONNX...") - config = AutoConfig.from_pretrained(model_name) - architecture = config.architectures[0] if config.architectures else None - - if architecture in MODEL_FOR_CAUSAL_LM_MAPPING_NAMES.values(): - model_class = QEFFAutoModelForCausalLM - else: - logger.error(f"Model class for model name {model_name} not found in mapping") - return - - qeff_model = model_class.from_pretrained( - pretrained_model_name_or_path=(local_model_dir if local_model_dir else model_name), - cache_dir=cache_dir, - hf_token=hf_token, - full_batch_size=full_batch_size, + qeff_model = load_qeff_model( + model_name, + cache_dir, + hf_token, + full_batch_size, + local_model_dir, ) onnx_model_path = qeff_model.export() logger.info(f"Generated onnx_path: {onnx_model_path}") diff --git a/QEfficient/cloud/infer.py b/QEfficient/cloud/infer.py index a79227748..c30c58ffe 100644 --- a/QEfficient/cloud/infer.py +++ b/QEfficient/cloud/infer.py @@ -10,11 +10,7 @@ import sys from typing import List, Optional -from transformers import AutoConfig -from transformers.models.auto.modeling_auto import MODEL_FOR_CAUSAL_LM_MAPPING_NAMES - -from QEfficient.transformers.models.modeling_auto import QEFFAutoModelForCausalLM -from QEfficient.utils import check_and_assign_cache_dir, load_hf_tokenizer +from QEfficient.utils import check_and_assign_cache_dir, load_hf_tokenizer, load_qeff_model from QEfficient.utils.logging_utils import logger @@ -91,20 +87,12 @@ def main( if args.mxint8: logger.warning("mxint8 is going to be deprecated in a future release, use -mxint8_kv_cache instead.") - config = AutoConfig.from_pretrained(model_name) - architecture = config.architectures[0] if config.architectures else None - - if architecture in MODEL_FOR_CAUSAL_LM_MAPPING_NAMES.values(): - model_class = QEFFAutoModelForCausalLM - else: - logger.error(f"Model class for model name {model_name} not found in mapping") - return - - qeff_model = model_class.from_pretrained( - pretrained_model_name_or_path=(local_model_dir if local_model_dir else model_name), - cache_dir=cache_dir, - hf_token=hf_token, - full_batch_size=full_batch_size, + qeff_model = load_qeff_model( + model_name, + cache_dir, + hf_token, + full_batch_size, + local_model_dir, ) ######### diff --git a/QEfficient/utils/__init__.py b/QEfficient/utils/__init__.py index 2506b9233..86f563e05 100755 --- a/QEfficient/utils/__init__.py +++ b/QEfficient/utils/__init__.py @@ -17,6 +17,7 @@ get_qpc_dir_path, hf_download, load_hf_tokenizer, + load_qeff_model, login_and_download_hf_lm, onnx_exists, padding_check_and_fix, diff --git a/QEfficient/utils/_utils.py b/QEfficient/utils/_utils.py index 2729267d6..ab899a56e 100644 --- a/QEfficient/utils/_utils.py +++ b/QEfficient/utils/_utils.py @@ -5,6 +5,7 @@ # # ----------------------------------------------------------------------------- +import importlib import json import os import subprocess @@ -13,7 +14,8 @@ import requests from huggingface_hub import login, snapshot_download from requests.exceptions import HTTPError -from transformers import AutoTokenizer, PreTrainedTokenizer, PreTrainedTokenizerFast +from transformers import AutoConfig, AutoTokenizer, PreTrainedTokenizer, PreTrainedTokenizerFast +from transformers.models.auto.modeling_auto import MODEL_FOR_CAUSAL_LM_MAPPING_NAMES from QEfficient.utils.constants import QEFF_MODELS_DIR, Constants from QEfficient.utils.logging_utils import logger @@ -394,3 +396,43 @@ def create_json(file_path: str, json_data: 
object): json.dump(json_data, file, indent=4) except Exception as e: print(f"Failed to create JSON File {file_path}: {e}") + + +def load_qeff_model( + model_name: str, + cache_dir: Optional[str] = None, + hf_token: Optional[str] = None, + local_model_dir: Optional[str] = None, + full_batch_size: Optional[int] = None, +): + """ + Loads the model using the QEfficient Modelling Class. + + ``Mandatory`` Args: + :model_name (str): Hugging Face Model Card name, Example: ``gpt2``. + ``Optional`` Args: + :cache_dir (str): Cache dir where downloaded HuggingFace files are stored. ``Defaults to None.`` + :tokenizer (Union[PreTrainedTokenizer, PreTrainedTokenizerFast]): Pass model tokenizer. ``Defaults to None.`` + :hf_token (str): HuggingFace login token to access private repos. ``Defaults to None.`` + :local_model_dir (str): Path to custom model weights and config files. ``Defaults to None.`` + :full_batch_size (int): Set full batch size to enable continuous batching mode. ``Defaults to None.`` + + """ + config = AutoConfig.from_pretrained(model_name) + architecture = config.architectures[0] if config.architectures else None + + module = importlib.import_module("QEfficient") + if architecture in MODEL_FOR_CAUSAL_LM_MAPPING_NAMES.values(): + model_class = getattr(module, "QEFFAutoModelForCausalLM") + else: + raise NotImplementedError( + f"Unknown architecture={architecture}, either use specific auto model class for loading the model or raise an issue for support!" + ) + + qeff_model = model_class.from_pretrained( + pretrained_model_name_or_path=(local_model_dir if local_model_dir else model_name), + cache_dir=cache_dir, + hf_token=hf_token, + full_batch_size=full_batch_size, + ) + return qeff_model From 26ca6a7ff55a6649dfedd43f5a65356c7a925983 Mon Sep 17 00:00:00 2001 From: Onkar Chougule Date: Wed, 5 Feb 2025 09:51:36 +0530 Subject: [PATCH 6/7] clean code Signed-off-by: Onkar Chougule --- QEfficient/base/common.py | 74 +++++++++++------------------------- QEfficient/cloud/export.py | 15 ++++---- QEfficient/cloud/infer.py | 19 +++++---- QEfficient/utils/__init__.py | 1 - QEfficient/utils/_utils.py | 44 +-------------------- 5 files changed, 41 insertions(+), 112 deletions(-) diff --git a/QEfficient/base/common.py b/QEfficient/base/common.py index ce6b1cdc2..6e9b7a6af 100644 --- a/QEfficient/base/common.py +++ b/QEfficient/base/common.py @@ -12,59 +12,20 @@ QEFFAutoModel provides a common interface for loading the HuggingFace models using either the HF card name of local path of downloaded model. """ -import os -from enum import Enum -from typing import Any, Dict, Type +from typing import Any from transformers import AutoConfig from transformers.models.auto.modeling_auto import MODEL_FOR_CAUSAL_LM_MAPPING from QEfficient.base.modeling_qeff import QEFFBaseModel from QEfficient.transformers.models.modeling_auto import QEFFAutoModelForCausalLM -from QEfficient.utils._utils import login_and_download_hf_lm - - -class QEFF_MODEL_TYPE(Enum): - """ - Defines Names of the different varities of transformer models. - """ - - CAUSALLM = "LLM" - DIFFUSION = "DIFFUSION" - - -MODEL_TYPE_TO_QEFF_AUTO_MODEL_MAP: Dict[QEFF_MODEL_TYPE, Type[QEFFBaseModel]] = { - QEFF_MODEL_TYPE.CAUSALLM: QEFFAutoModelForCausalLM -} - -AUTO_MODEL_MAP_TO_MODEL_TYPE_MAP: Dict[Type[QEFFBaseModel], QEFF_MODEL_TYPE] = { - v: k for k, v in MODEL_TYPE_TO_QEFF_AUTO_MODEL_MAP.items() -} - - -def get_hf_model_type(hf_model_path: str) -> QEFF_MODEL_TYPE: - """ - Loads model config file and returns the type of the model (i.e. 
LLMs, SD, quantized etc.) as supported by the library. - """ - if not os.path.isdir(hf_model_path): - raise FileNotFoundError( - "Please pass local dir path where the model is downloaded; use `QEfficient.utils.login_and_download_hf_lm` for downloading hf model" - ) - config, kwargs = AutoConfig.from_pretrained( - hf_model_path, - return_unused_kwargs=True, - ) - - if config.__class__ in MODEL_FOR_CAUSAL_LM_MAPPING: - return QEFF_MODEL_TYPE.CAUSALLM - else: - raise NotImplementedError(f"model type {type(config)} is not yet supported") class QEFFCommonLoader: """ Provides HuggingFace model loading interface same as transformers APIs. Supports loading any model on HuggingFace. + Wrapper on top of Auto Classes """ def __init__(self, *args: Any, **kwds: Any) -> None: @@ -78,14 +39,25 @@ def from_pretrained(cls, pretrained_model_name_or_path: str, *args, **kwargs) -> """ Downloads HuggingFace model if already doesn't exist locally, returns QEFFAutoModel object based on type of model. """ - if not os.path.isdir(pretrained_model_name_or_path): - pretrained_model_name_or_path = login_and_download_hf_lm(pretrained_model_name_or_path, *args, **kwargs) - kwargs.pop("hf_token", None) - model_type = get_hf_model_type(hf_model_path=pretrained_model_name_or_path) - qeff_auto_model_class = MODEL_TYPE_TO_QEFF_AUTO_MODEL_MAP[model_type] - if not issubclass(qeff_auto_model_class, QEFFBaseModel): - raise Exception(f"Expected class that inherits {QEFFBaseModel}, got {type(qeff_auto_model_class)}") - - return qeff_auto_model_class.from_pretrained( - pretrained_model_name_or_path=pretrained_model_name_or_path, **kwargs + config = AutoConfig.from_pretrained(pretrained_model_name_or_path) + architecture = config.architectures[0] if config.architectures else None + + if architecture in MODEL_FOR_CAUSAL_LM_MAPPING: + model_class = QEFFAutoModelForCausalLM + else: + raise NotImplementedError( + f"Unknown architecture={architecture}, either use specific auto model class for loading the model or raise an issue for support!" + ) + + local_model_dir = kwargs.pop("local_model_dir", None) + cache_dir = kwargs.pop("cache_dir", None) + hf_token = kwargs.pop("hf_token", None) + continuous_batching = True if kwargs.pop("full_batch_size", None) else False + + qeff_model = model_class.from_pretrained( + pretrained_model_name_or_path=(local_model_dir if local_model_dir else pretrained_model_name_or_path), + cache_dir=cache_dir, + hf_token=hf_token, + continuous_batching=continuous_batching, ) + return qeff_model diff --git a/QEfficient/cloud/export.py b/QEfficient/cloud/export.py index f3604c0f2..504240b66 100644 --- a/QEfficient/cloud/export.py +++ b/QEfficient/cloud/export.py @@ -9,7 +9,8 @@ import os from typing import Optional -from QEfficient.utils import check_and_assign_cache_dir, load_qeff_model +from QEfficient.base.common import QEFFCommonLoader +from QEfficient.utils import check_and_assign_cache_dir from QEfficient.utils.logging_utils import logger # Specifically for Docker images. 
@@ -37,12 +38,12 @@ def get_onnx_model_path( """ logger.info(f"Exporting Pytorch {model_name} model to ONNX...") - qeff_model = load_qeff_model( - model_name, - cache_dir, - hf_token, - full_batch_size, - local_model_dir, + qeff_model = QEFFCommonLoader.from_pretrained( + pretrained_model_name_or_path=model_name, + cache_dir=cache_dir, + hf_token=hf_token, + full_batch_size=full_batch_size, + local_model_dir=local_model_dir, ) onnx_model_path = qeff_model.export() logger.info(f"Generated onnx_path: {onnx_model_path}") diff --git a/QEfficient/cloud/infer.py b/QEfficient/cloud/infer.py index c30c58ffe..4b43c8ded 100644 --- a/QEfficient/cloud/infer.py +++ b/QEfficient/cloud/infer.py @@ -10,7 +10,8 @@ import sys from typing import List, Optional -from QEfficient.utils import check_and_assign_cache_dir, load_hf_tokenizer, load_qeff_model +from QEfficient.base.common import QEFFCommonLoader +from QEfficient.utils import check_and_assign_cache_dir, load_hf_tokenizer from QEfficient.utils.logging_utils import logger @@ -77,9 +78,6 @@ def main( hf_token=hf_token, ) - if enable_qnn and qnn_config is not None: - logger.error("QNN compilation is currently not supported in High Level APIs of QEFFAutoModelForCausalLM.") - if "--mxfp6" in sys.argv: if args.mxfp6: logger.warning("mxfp6 is going to be deprecated in a future release, use -mxfp6_matmul instead.") @@ -87,12 +85,12 @@ def main( if args.mxint8: logger.warning("mxint8 is going to be deprecated in a future release, use -mxint8_kv_cache instead.") - qeff_model = load_qeff_model( - model_name, - cache_dir, - hf_token, - full_batch_size, - local_model_dir, + qeff_model = QEFFCommonLoader.from_pretrained( + pretrained_model_name_or_path=model_name, + cache_dir=cache_dir, + hf_token=hf_token, + full_batch_size=full_batch_size, + local_model_dir=local_model_dir, ) ######### @@ -111,6 +109,7 @@ def main( full_batch_size=full_batch_size, allow_mxint8_mdp_io=allow_mxint8_mdp_io, enable_qnn=enable_qnn, + qnn_config=qnn_config, **kwargs, ) diff --git a/QEfficient/utils/__init__.py b/QEfficient/utils/__init__.py index 86f563e05..2506b9233 100755 --- a/QEfficient/utils/__init__.py +++ b/QEfficient/utils/__init__.py @@ -17,7 +17,6 @@ get_qpc_dir_path, hf_download, load_hf_tokenizer, - load_qeff_model, login_and_download_hf_lm, onnx_exists, padding_check_and_fix, diff --git a/QEfficient/utils/_utils.py b/QEfficient/utils/_utils.py index ab899a56e..2729267d6 100644 --- a/QEfficient/utils/_utils.py +++ b/QEfficient/utils/_utils.py @@ -5,7 +5,6 @@ # # ----------------------------------------------------------------------------- -import importlib import json import os import subprocess @@ -14,8 +13,7 @@ import requests from huggingface_hub import login, snapshot_download from requests.exceptions import HTTPError -from transformers import AutoConfig, AutoTokenizer, PreTrainedTokenizer, PreTrainedTokenizerFast -from transformers.models.auto.modeling_auto import MODEL_FOR_CAUSAL_LM_MAPPING_NAMES +from transformers import AutoTokenizer, PreTrainedTokenizer, PreTrainedTokenizerFast from QEfficient.utils.constants import QEFF_MODELS_DIR, Constants from QEfficient.utils.logging_utils import logger @@ -396,43 +394,3 @@ def create_json(file_path: str, json_data: object): json.dump(json_data, file, indent=4) except Exception as e: print(f"Failed to create JSON File {file_path}: {e}") - - -def load_qeff_model( - model_name: str, - cache_dir: Optional[str] = None, - hf_token: Optional[str] = None, - local_model_dir: Optional[str] = None, - full_batch_size: Optional[int] = 
None, -): - """ - Loads the model using the QEfficient Modelling Class. - - ``Mandatory`` Args: - :model_name (str): Hugging Face Model Card name, Example: ``gpt2``. - ``Optional`` Args: - :cache_dir (str): Cache dir where downloaded HuggingFace files are stored. ``Defaults to None.`` - :tokenizer (Union[PreTrainedTokenizer, PreTrainedTokenizerFast]): Pass model tokenizer. ``Defaults to None.`` - :hf_token (str): HuggingFace login token to access private repos. ``Defaults to None.`` - :local_model_dir (str): Path to custom model weights and config files. ``Defaults to None.`` - :full_batch_size (int): Set full batch size to enable continuous batching mode. ``Defaults to None.`` - - """ - config = AutoConfig.from_pretrained(model_name) - architecture = config.architectures[0] if config.architectures else None - - module = importlib.import_module("QEfficient") - if architecture in MODEL_FOR_CAUSAL_LM_MAPPING_NAMES.values(): - model_class = getattr(module, "QEFFAutoModelForCausalLM") - else: - raise NotImplementedError( - f"Unknown architecture={architecture}, either use specific auto model class for loading the model or raise an issue for support!" - ) - - qeff_model = model_class.from_pretrained( - pretrained_model_name_or_path=(local_model_dir if local_model_dir else model_name), - cache_dir=cache_dir, - hf_token=hf_token, - full_batch_size=full_batch_size, - ) - return qeff_model From 6ade0fa99c2b97229d1d61776f3b5bd205507173 Mon Sep 17 00:00:00 2001 From: Onkar Chougule Date: Wed, 5 Feb 2025 16:37:19 +0530 Subject: [PATCH 7/7] removed extra code as part of deprecation Signed-off-by: Onkar Chougule --- .../exporter/export_hf_to_cloud_ai_100.py | 28 ++----------------- QEfficient/transformers/transform.py | 8 ++---- 2 files changed, 4 insertions(+), 32 deletions(-) diff --git a/QEfficient/exporter/export_hf_to_cloud_ai_100.py b/QEfficient/exporter/export_hf_to_cloud_ai_100.py index 276faf94c..6b6cbe18a 100644 --- a/QEfficient/exporter/export_hf_to_cloud_ai_100.py +++ b/QEfficient/exporter/export_hf_to_cloud_ai_100.py @@ -13,7 +13,7 @@ import torch from transformers import PreTrainedTokenizer, PreTrainedTokenizerFast -from QEfficient.base.common import AUTO_MODEL_MAP_TO_MODEL_TYPE_MAP, QEFF_MODEL_TYPE, QEFFCommonLoader +from QEfficient.base.common import QEFFCommonLoader from QEfficient.base.modeling_qeff import QEFFBaseModel from QEfficient.exporter.export_utils import export_onnx, fix_onnx_fp16, generate_input_files, run_model_on_ort from QEfficient.transformers.models.modeling_auto import QEFFAutoModelForCausalLM @@ -307,30 +307,6 @@ def export_kvstyle_transformed_model_to_onnx( return model_name -def export_for_cloud( - model_name: str, - qeff_model: QEFFBaseModel, - tokenizer: Union[PreTrainedTokenizer, PreTrainedTokenizerFast], - onnx_dir_path: str, - seq_length: int = Constants.SEQ_LEN, - full_batch_size: Optional[int] = None, -) -> str: - # FIXME: move all this to class instead of here, and just call qeff_model.export here. 
- if AUTO_MODEL_MAP_TO_MODEL_TYPE_MAP.get(qeff_model.__class__, None) == QEFF_MODEL_TYPE.CAUSALLM: # type: ignore - return export_lm_model_for_cloud( - model_name=model_name, - qeff_model=qeff_model, # type: ignore - tokenizer=tokenizer, - onnx_dir_path=onnx_dir_path, - seq_length=seq_length, - full_batch_size=full_batch_size, - ) - else: - raise NotImplementedError( - f"Only model type {QEFFAutoModelForCausalLM.__class__.__name__} is supported for export, got {type(qeff_model)}" - ) - - def export_lm_model_for_cloud( model_name: str, qeff_model: QEFFAutoModelForCausalLM, @@ -434,7 +410,7 @@ def qualcomm_efficient_converter( ) if form_factor == "cloud": - generated_onnx_model_path = export_for_cloud( + generated_onnx_model_path = export_lm_model_for_cloud( model_name=model_name, qeff_model=model_kv, tokenizer=tokenizer, diff --git a/QEfficient/transformers/transform.py b/QEfficient/transformers/transform.py index 8bb084fbf..f4024a1f3 100644 --- a/QEfficient/transformers/transform.py +++ b/QEfficient/transformers/transform.py @@ -10,7 +10,6 @@ import torch.nn as nn import transformers -from QEfficient.base.common import AUTO_MODEL_MAP_TO_MODEL_TYPE_MAP, QEFF_MODEL_TYPE from QEfficient.base.modeling_qeff import QEFFBaseModel from QEfficient.transformers.cache_utils import QEffDynamicCache from QEfficient.transformers.modeling_utils import TransformersToQEffModulesDict @@ -96,8 +95,5 @@ def transform(model: QEFFBaseModel, form_factor="cloud"): if form_factor != "cloud": raise ValueError("Only form_factor='cloud' is supported as of now!") # FIXME: move this to class and use model.transform() - if AUTO_MODEL_MAP_TO_MODEL_TYPE_MAP.get(model.__class__, None) == QEFF_MODEL_TYPE.CAUSALLM: - transform_lm(model.model) # type: ignore - return model - else: - raise NotImplementedError(f"Recieved unsupported class of type {type(model)}") + transform_lm(model.model) # type: ignore + return model
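
Taken together, the series replaces the low-level `qualcomm_efficient_converter` -> `QEfficient.compile` -> `cloud_ai_100_exec_kv` pipeline with the high-level auto classes, and patch 7 leaves `QEFFCommonLoader` as the single entry point for both `QEfficient/cloud/export.py` and `QEfficient/cloud/infer.py`. The sketch below mirrors that end state; the model card, prompt, and compile values are illustrative placeholders, while the calls and keyword arguments are the ones the diffs themselves pass.

```python
# Minimal sketch of the post-migration flow (end state of patch 7).
# Model card, prompt, and compile parameters are illustrative only.
from QEfficient.base.common import QEFFCommonLoader
from QEfficient.utils import load_hf_tokenizer

model_name = "gpt2"  # any architecture present in MODEL_FOR_CAUSAL_LM_MAPPING

# QEFFCommonLoader reads the architecture from AutoConfig and dispatches to
# QEFFAutoModelForCausalLM; unsupported architectures raise NotImplementedError.
qeff_model = QEFFCommonLoader.from_pretrained(
    pretrained_model_name_or_path=model_name,
    full_batch_size=None,  # set an int to enable continuous batching
)

# export.py path: produce the ONNX graph only.
onnx_path = qeff_model.export()

# infer.py path: compile is called directly, with no separate export step,
# exactly as the migrated infer.py does, then generate runs on the QPC.
qeff_model.compile(
    prefill_seq_len=32,
    ctx_len=128,
    num_cores=16,
    mxfp6_matmul=False,
    mxint8_kv_cache=False,
    num_devices=1,
)
tokenizer = load_hf_tokenizer(pretrained_model_name_or_path=model_name)
qeff_model.generate(tokenizer, prompts=["My name is"])
```

On the CLI side, `infer.py` now uses `parse_known_args()`, so any `--flag value` pair the parser does not recognize is collected into `compiler_options_dict` and forwarded to `compile()` as an extra keyword argument; the old `--mxfp6` and `--mxint8` spellings still work but emit deprecation warnings pointing to `--mxfp6-matmul` and `--mxint8-kv-cache`.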