From c89329774c25adae81254074932a26975ab41bac Mon Sep 17 00:00:00 2001
From: Asmita Goswami
Date: Fri, 13 Dec 2024 06:38:22 +0000
Subject: [PATCH 01/12] Migrating HL compile and export to infer APIs

Change-Id: If99f867d0ea0bead87f43fdcdb5537eda72a9db5
Signed-off-by: Asmita Goswami
---
 QEfficient/cloud/export.py                    |  29 ++---
 QEfficient/cloud/infer.py                     | 101 +++++++++---------
 .../transformers/models/modeling_auto.py      |   1 +
 3 files changed, 59 insertions(+), 72 deletions(-)

diff --git a/QEfficient/cloud/export.py b/QEfficient/cloud/export.py
index 53184450e..700532f3c 100644
--- a/QEfficient/cloud/export.py
+++ b/QEfficient/cloud/export.py
@@ -11,8 +11,8 @@
 
 from transformers import PreTrainedTokenizer, PreTrainedTokenizerFast
 
-from QEfficient.exporter.export_hf_to_cloud_ai_100 import qualcomm_efficient_converter
-from QEfficient.utils import check_and_assign_cache_dir, onnx_exists
+from QEfficient.transformers.models.modeling_auto import QEFFAutoModelForCausalLM
+from QEfficient.utils import check_and_assign_cache_dir
 from QEfficient.utils.logging_utils import logger
 
 # Specifically for Docker images.
@@ -39,27 +39,10 @@ def get_onnx_model_path(
         :local_model_dir (str): Path to custom model weights and config files. ``Defaults to None.``
         :full_batch_size (int): Set full batch size to enable continuous batching mode. ``Defaults to None.``
     """
-    onnx_path_exists, onnx_dir_path, onnx_model_path = onnx_exists(model_name, full_batch_size)
-    if onnx_path_exists:
-        logger.info(f"Pre-exported ONNX files found at {onnx_dir_path}! Jumping to Compilation")
-    else:
-        ###################
-        # hf model -> export
-        ####################
-        # Export to the Onnx
-        logger.info(f"Exporting Pytorch {model_name} model to ONNX...")
-        _, onnx_model_path = qualcomm_efficient_converter(
-            model_name=model_name,
-            local_model_dir=local_model_dir,
-            tokenizer=tokenizer,
-            onnx_dir_path=onnx_dir_path,
-            kv=True,
-            form_factor="cloud",
-            hf_token=hf_token,
-            cache_dir=cache_dir,
-            full_batch_size=full_batch_size,
-        )  # type: ignore
-        logger.info(f"Generated onnx_path: {onnx_model_path}, onnx_dir_path: {onnx_dir_path}")
+    logger.info(f"Exporting Pytorch {model_name} model to ONNX...")
+    qeff_model = QEFFAutoModelForCausalLM.from_pretrained(model_name)
+    onnx_model_path = qeff_model.export()
+    logger.info(f"Generated onnx_path: {onnx_model_path}")
     return onnx_model_path
 
 
diff --git a/QEfficient/cloud/infer.py b/QEfficient/cloud/infer.py
index 870005c91..7e2328eaf 100644
--- a/QEfficient/cloud/infer.py
+++ b/QEfficient/cloud/infer.py
@@ -7,13 +7,11 @@
 
 import argparse
 import logging
-import os
+import sys
 from typing import List, Optional
 
-import QEfficient
-from QEfficient.cloud.export import get_onnx_model_path
-from QEfficient.generation.text_generation_inference import cloud_ai_100_exec_kv
-from QEfficient.utils import check_and_assign_cache_dir, get_qpc_dir_path, load_hf_tokenizer, qpc_exists
+from QEfficient.transformers.models.modeling_auto import QEFFAutoModelForCausalLM
+from QEfficient.utils import check_and_assign_cache_dir, load_hf_tokenizer
 from QEfficient.utils.logging_utils import logger
 
 
@@ -36,6 +34,7 @@ def main(
     cache_dir: Optional[str] = None,
     hf_token: Optional[str] = None,
     allow_mxint8_mdp_io: bool = False,
+    **kwargs,
 ) -> None:
     """
     1. Check if compiled qpc for given config already exists, if it does jump to execute, else
@@ -75,52 +74,42 @@ def main(
         hf_token=hf_token,
     )
 
-    qpc_dir_path = get_qpc_dir_path(
-        model_name, num_cores, mos, batch_size, prompt_len, ctx_len, mxfp6, mxint8, device_group, full_batch_size
-    )
-
-    # Handle qpc generation
-    if qpc_exists(qpc_dir_path):
-        logger.info(f"Pre-compiled qpc found at {qpc_dir_path}! Executing with given prompt")
-    else:
-        # Handle onnx model generation
-        onnx_model_path = get_onnx_model_path(
-            model_name, cache_dir, tokenizer, hf_token, local_model_dir, full_batch_size
-        )  # , base_dir_name)
-
-        #########
-        # Compile
-        #########
-        _ = QEfficient.compile(
-            onnx_path=onnx_model_path,
-            qpc_path=os.path.dirname(
-                qpc_dir_path
-            ),  # We need to pass parent directory of qpc_dir_path, as the compile function handles the qpcs directory creation
-            num_cores=num_cores,
-            batch_size=batch_size,
-            prompt_len=prompt_len,
-            ctx_len=ctx_len,
-            mxfp6=mxfp6,
-            mxint8=mxint8,
-            aic_enable_depth_first=aic_enable_depth_first,
-            mos=mos,
-            device_group=device_group,
-            full_batch_size=full_batch_size,
-            allow_mxint8_mdp_io=allow_mxint8_mdp_io,
-        )
+    if '--mxfp6' in sys.argv:
+        if args.mxfp6:
+            logger.warning("mxfp6 is going to be deprecated in a future release, use --mxfp6_matmul instead.")
+    if '--mxint8' in sys.argv:
+        if args.mxint8:
+            logger.warning("mxint8 is going to be deprecated in a future release, use --mxint8_kv_cache instead.")
+
+    qeff_model = QEFFAutoModelForCausalLM.from_pretrained(model_name)
 
     #########
-    # Execute
+    # Compile
     #########
-    cloud_ai_100_exec_kv(
-        tokenizer=tokenizer,
-        qpc_path=qpc_dir_path,
-        device_id=device_group,
-        prompt=prompt,
-        prompts_txt_file_path=prompts_txt_file_path,
-        generation_len=generation_len,
+    _ = qeff_model.compile(
+        prefill_seq_len=prompt_len,
+        ctx_len=ctx_len,
+        num_cores=num_cores,
+        mxfp6_matmul=mxfp6,
+        aic_enable_depth_first=aic_enable_depth_first,
+        batch_size=batch_size,
+        mos=mos,
+        mxint8_kv_cache=mxint8,
+        num_devices=len(device_group),
+        full_batch_size=full_batch_size,
+        allow_mxint8_mdp_io=allow_mxint8_mdp_io,
+        **kwargs,
     )
 
+    #########
+    # Execute
+    #########
+    _ = qeff_model.generate(tokenizer,
+        prompts=prompt,
+        device_id=device_group,
+        prompts_txt_file_path=prompts_txt_file_path,
+        generation_len=generation_len,)
+
 
 if __name__ == "__main__":
     parser = argparse.ArgumentParser(
@@ -146,10 +135,16 @@
     )
     parser.add_argument("--ctx-len", "--ctx_len", default=128, type=int, help="Context length for text generation.")
     parser.add_argument(
-        "--mxfp6", action="store_true", help="Compress constant MatMul weights to MXFP6 E2M3, default is no compression"
+        "--mxfp6",
+        "--mxfp6_matmul",
+        "--mxfp6-matmul",
+        action="store_true",
+        help="Compress constant MatMul weights to MXFP6 E2M3, default is no compression"
    )
     parser.add_argument(
         "--mxint8",
+        "--mxint8_kv_cache",
+        "--mxint8-kv-cache",
         action="store_true",
         help="Compress Present/Past KV to MXINT8 using CustomIO config, default is False",
     )
@@ -207,8 +202,16 @@
         help="If passed, this option allows MXINT8 compression of MDP IO traffic",
     )
 
-    args = parser.parse_args()
+    args, compiler_options = parser.parse_known_args()
+    compiler_options_dict = {}
+    for i in range(0, len(compiler_options)):
+        if (compiler_options[i].startswith('--')):
+            key = compiler_options[i].lstrip('-')
+            value = compiler_options[i+1] if i+1 < len(compiler_options) and not compiler_options[i+1].startswith('-') else True
+            compiler_options_dict[key] = value
+
     if args.verbose:
         logger.setLevel(logging.INFO)
     del args.verbose  # type: ignore
-    main(**args.__dict__)
+
+    main(**args.__dict__, **compiler_options_dict)

diff --git a/QEfficient/transformers/models/modeling_auto.py b/QEfficient/transformers/models/modeling_auto.py
index 83c573f6d..6b7c1443c 100644
--- a/QEfficient/transformers/models/modeling_auto.py
+++ b/QEfficient/transformers/models/modeling_auto.py
@@ -266,6 +266,7 @@ def compile(
             :num_speculative_tokens (int, optional): Number of speculative tokens to take as input for Speculative Decoding Target Language Model.
             :mos (int, optional): Effort level to reduce on-chip memory. Defaults to -1, meaning no effort. ``Defaults to -1``.
             :aic_enable_depth_first (bool, optional): Enables DFS with default memory size. ``Defaults to False``.
+            :allow_mxint8_mdp_io (bool, optional): Allows MXINT8 compression of MDP IO traffic. ``Defaults to False.``
 
         Returns:
             :str: Path of the compiled ``qpc`` package.
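Series note: patch 01 replaces the qpc_exists / QEfficient.compile / cloud_ai_100_exec_kv pipeline in `main` with a single QEFFAutoModelForCausalLM object. Since `main` now calls compile() without an explicit export(), compile() is implied to trigger the ONNX export as needed. Below is a minimal sketch of the equivalent standalone usage, based only on the calls visible in this patch; the model card name, AutoTokenizer stand-in, and argument values are illustrative, not project defaults:

    from transformers import AutoTokenizer  # stand-in for the repo's tokenizer helper
    from QEfficient.transformers.models.modeling_auto import QEFFAutoModelForCausalLM

    tokenizer = AutoTokenizer.from_pretrained("gpt2")  # illustrative model card
    qeff_model = QEFFAutoModelForCausalLM.from_pretrained("gpt2")

    # Optional: infer's main skips this and lets compile() handle the export.
    onnx_path = qeff_model.export()

    # compile() returns the path of the compiled qpc package (per the docstring above).
    qpc_path = qeff_model.compile(
        prefill_seq_len=32,
        ctx_len=128,
        num_cores=16,
        mxfp6_matmul=False,
        mxint8_kv_cache=False,
        num_devices=1,
    )

    # Executes the prompts on Cloud AI 100 through the compiled qpc.
    qeff_model.generate(tokenizer, prompts=["My name is"])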
From 78bb2d7c76f52fc3c0832291612fa825d58f7dc8 Mon Sep 17 00:00:00 2001
From: Asmita Goswami
Date: Fri, 13 Dec 2024 06:38:22 +0000
Subject: [PATCH 02/12] Migrating HL compile and export to infer APIs

Change-Id: If99f867d0ea0bead87f43fdcdb5537eda72a9db5
Signed-off-by: Asmita Goswami
---
 QEfficient/cloud/infer.py  | 28 +++++++++++++++++-----------
 tests/cloud/test_export.py | 10 ----------
 tests/cloud/test_infer.py  | 19 -------------------
 3 files changed, 17 insertions(+), 40 deletions(-)

diff --git a/QEfficient/cloud/infer.py b/QEfficient/cloud/infer.py
index 7e2328eaf..e8690d5de 100644
--- a/QEfficient/cloud/infer.py
+++ b/QEfficient/cloud/infer.py
@@ -74,10 +74,10 @@ def main(
         hf_token=hf_token,
     )
 
-    if '--mxfp6' in sys.argv:
+    if "--mxfp6" in sys.argv:
         if args.mxfp6:
             logger.warning("mxfp6 is going to be deprecated in a future release, use --mxfp6_matmul instead.")
-    if '--mxint8' in sys.argv:
+    if "--mxint8" in sys.argv:
         if args.mxint8:
             logger.warning("mxint8 is going to be deprecated in a future release, use --mxint8_kv_cache instead.")
 
     qeff_model = QEFFAutoModelForCausalLM.from_pretrained(model_name)
@@ -104,11 +104,13 @@ def main(
     #########
     # Execute
     #########
-    _ = qeff_model.generate(tokenizer,
-        prompts=prompt,
-        device_id=device_group,
-        prompts_txt_file_path=prompts_txt_file_path,
-        generation_len=generation_len,)
+    _ = qeff_model.generate(
+        tokenizer,
+        prompts=prompt,
+        device_id=device_group,
+        prompts_txt_file_path=prompts_txt_file_path,
+        generation_len=generation_len,
+    )
 
 
 if __name__ == "__main__":
@@ -139,7 +141,7 @@ def main(
         "--mxfp6_matmul",
         "--mxfp6-matmul",
         action="store_true",
-        help="Compress constant MatMul weights to MXFP6 E2M3, default is no compression"
+        help="Compress constant MatMul weights to MXFP6 E2M3, default is no compression",
     )
     parser.add_argument(
         "--mxint8",
@@ -205,9 +207,13 @@ def main(
     args, compiler_options = parser.parse_known_args()
     compiler_options_dict = {}
     for i in range(0, len(compiler_options)):
-        if (compiler_options[i].startswith('--')):
-            key = compiler_options[i].lstrip('-')
-            value = compiler_options[i+1] if i+1 < len(compiler_options) and not compiler_options[i+1].startswith('-') else True
+        if compiler_options[i].startswith("--"):
+            key = compiler_options[i].lstrip("-")
+            value = (
+                compiler_options[i + 1]
+                if i + 1 < len(compiler_options) and not compiler_options[i + 1].startswith("-")
+                else True
+            )
             compiler_options_dict[key] = value
 
     if args.verbose:

diff --git a/tests/cloud/test_export.py b/tests/cloud/test_export.py
index 4291da23a..a2b717634 100644
--- a/tests/cloud/test_export.py
+++ b/tests/cloud/test_export.py
@@ -5,12 +5,9 @@
 #
 # -----------------------------------------------------------------------------
 
-import os
 
 import pytest
 
-import QEfficient
-import QEfficient.cloud.export
 from QEfficient.cloud.export import main as export
 
 
@@ -25,8 +22,6 @@ def test_export(setup, mocker):
     mocker: mocker is itself a pytest fixture, uses to mock or spy internal functions.
     """
     ms = setup
-    check_and_assign_cache_dir_spy = mocker.spy(QEfficient.cloud.export, "check_and_assign_cache_dir")
-    get_onnx_model_path_spy = mocker.spy(QEfficient.cloud.export, "get_onnx_model_path")
 
     export(
         model_name=ms.model_name,
@@ -34,8 +29,3 @@ def test_export(setup, mocker):
         local_model_dir=ms.local_model_dir,
         full_batch_size=ms.full_batch_size,
     )
-
-    check_and_assign_cache_dir_spy.assert_called_once()
-    get_onnx_model_path_spy.assert_called_once()
-    assert any(os.path.isfile(x) for x in ms.onnx_model_path())
-    assert get_onnx_model_path_spy.spy_return in ms.onnx_model_path()

diff --git a/tests/cloud/test_infer.py b/tests/cloud/test_infer.py
index 8cd61a050..0dacba570 100644
--- a/tests/cloud/test_infer.py
+++ b/tests/cloud/test_infer.py
@@ -5,12 +5,9 @@
 #
 # -----------------------------------------------------------------------------
 
-import os
 
 import pytest
 
-import QEfficient
-import QEfficient.cloud.infer
 from QEfficient.cloud.infer import main as infer
 
 
@@ -30,11 +27,6 @@ def test_infer(setup, mocker):
     Ref: https://pytest-mock.readthedocs.io/en/latest/usage.html
     """
     ms = setup
-    load_hf_tokenizer_spy = mocker.spy(QEfficient.cloud.infer, "load_hf_tokenizer")
-    qpc_exists_spy = mocker.spy(QEfficient.cloud.infer, "qpc_exists")
-    get_onnx_model_path_spy = mocker.spy(QEfficient.cloud.infer, "get_onnx_model_path")
-    compile_spy = mocker.spy(QEfficient, "compile")
-    cloud_ai_100_exec_kv_spy = mocker.spy(QEfficient.cloud.infer, "cloud_ai_100_exec_kv")
 
     infer(
         model_name=ms.model_name,
@@ -53,14 +45,3 @@ def test_infer(setup, mocker):
         full_batch_size=ms.full_batch_size,
     )
-    # tokenizer check
-    load_hf_tokenizer_spy.assert_called_once()
-    # qpc exist check
-    qpc_exists_spy.assert_called_once()
-    if qpc_exists_spy.spy_return is True:
-        assert os.path.isdir(ms.qpc_dir_path())
-    else:
-        get_onnx_model_path_spy.assert_called_once()
-        compile_spy.assert_called_once()
-        assert compile_spy.spy_return == ms.qpc_dir_path()
-    cloud_ai_100_exec_kv_spy.assert_called_once()
From 7229571e1403f974166ab4a0bdf769177a8282be Mon Sep 17 00:00:00 2001
From: Asmita Goswami
Date: Fri, 13 Dec 2024 06:38:22 +0000
Subject: [PATCH 03/12] Migrating HL compile and export to infer APIs

Change-Id: If99f867d0ea0bead87f43fdcdb5537eda72a9db5
Signed-off-by: Asmita Goswami
---
 QEfficient/cloud/infer.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/QEfficient/cloud/infer.py b/QEfficient/cloud/infer.py
index e8690d5de..657cb4a3c 100644
--- a/QEfficient/cloud/infer.py
+++ b/QEfficient/cloud/infer.py
@@ -95,7 +95,7 @@ def main(
         batch_size=batch_size,
         mos=mos,
         mxint8_kv_cache=mxint8,
-        num_devices=len(device_group),
+        num_devices = (0 if device_group is None else len(device_group)),
         full_batch_size=full_batch_size,
         allow_mxint8_mdp_io=allow_mxint8_mdp_io,
         **kwargs,

From c4859a7dd437c2939d190472343489e7549868ed Mon Sep 17 00:00:00 2001
From: Asmita Goswami
Date: Fri, 13 Dec 2024 06:38:22 +0000
Subject: [PATCH 04/12] Migrating HL compile and export to infer APIs

Change-Id: If99f867d0ea0bead87f43fdcdb5537eda72a9db5
Signed-off-by: Asmita Goswami
---
 QEfficient/cloud/infer.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/QEfficient/cloud/infer.py b/QEfficient/cloud/infer.py
index 657cb4a3c..48fbc5af0 100644
--- a/QEfficient/cloud/infer.py
+++ b/QEfficient/cloud/infer.py
@@ -95,7 +95,7 @@ def main(
         batch_size=batch_size,
         mos=mos,
         mxint8_kv_cache=mxint8,
-        num_devices = (0 if device_group is None else len(device_group)),
+        num_devices=(0 if device_group is None else len(device_group)),
         full_batch_size=full_batch_size,
         allow_mxint8_mdp_io=allow_mxint8_mdp_io,
         **kwargs,
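Series note: patches 03 and 04 repair the device-count plumbing: the original `num_devices=len(device_group)` raised a TypeError whenever no device group was passed, because `device_group` defaults to None. A quick illustration of the guarded mapping (values are illustrative):

    device_group = None           # no device group supplied on the CLI
    0 if device_group is None else len(device_group)   # -> 0, presumably letting compile() apply its own default

    device_group = [0, 1, 2, 3]   # a four-device set
    0 if device_group is None else len(device_group)   # -> 4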
From 807574ee7f613df74dfe2fd1967ff8c7b76c6d20 Mon Sep 17 00:00:00 2001
From: shubhagr-quic
Date: Wed, 18 Dec 2024 23:18:34 +0530
Subject: [PATCH 05/12] Migrating HL compile and export to infer APIs

Change-Id: If99f867d0ea0bead87f43fdcdb5537eda72a9db5
Signed-off-by: Asmita Goswami
---
 QEfficient/cloud/export.py                    |  29 +----
 QEfficient/cloud/infer.py                     | 111 +++++++++---------
 .../transformers/models/modeling_auto.py      |   1 +
 3 files changed, 63 insertions(+), 78 deletions(-)

diff --git a/QEfficient/cloud/export.py b/QEfficient/cloud/export.py
index 53184450e..700532f3c 100644
--- a/QEfficient/cloud/export.py
+++ b/QEfficient/cloud/export.py
@@ -11,8 +11,8 @@
 
 from transformers import PreTrainedTokenizer, PreTrainedTokenizerFast
 
-from QEfficient.exporter.export_hf_to_cloud_ai_100 import qualcomm_efficient_converter
-from QEfficient.utils import check_and_assign_cache_dir, onnx_exists
+from QEfficient.transformers.models.modeling_auto import QEFFAutoModelForCausalLM
+from QEfficient.utils import check_and_assign_cache_dir
 from QEfficient.utils.logging_utils import logger
 
 # Specifically for Docker images.
@@ -39,27 +39,10 @@ def get_onnx_model_path(
         :local_model_dir (str): Path to custom model weights and config files. ``Defaults to None.``
         :full_batch_size (int): Set full batch size to enable continuous batching mode. ``Defaults to None.``
     """
-    onnx_path_exists, onnx_dir_path, onnx_model_path = onnx_exists(model_name, full_batch_size)
-    if onnx_path_exists:
-        logger.info(f"Pre-exported ONNX files found at {onnx_dir_path}! Jumping to Compilation")
-    else:
-        ###################
-        # hf model -> export
-        ####################
-        # Export to the Onnx
-        logger.info(f"Exporting Pytorch {model_name} model to ONNX...")
-        _, onnx_model_path = qualcomm_efficient_converter(
-            model_name=model_name,
-            local_model_dir=local_model_dir,
-            tokenizer=tokenizer,
-            onnx_dir_path=onnx_dir_path,
-            kv=True,
-            form_factor="cloud",
-            hf_token=hf_token,
-            cache_dir=cache_dir,
-            full_batch_size=full_batch_size,
-        )  # type: ignore
-        logger.info(f"Generated onnx_path: {onnx_model_path}, onnx_dir_path: {onnx_dir_path}")
+    logger.info(f"Exporting Pytorch {model_name} model to ONNX...")
+    qeff_model = QEFFAutoModelForCausalLM.from_pretrained(model_name)
+    onnx_model_path = qeff_model.export()
+    logger.info(f"Generated onnx_path: {onnx_model_path}")
     return onnx_model_path
 
 
diff --git a/QEfficient/cloud/infer.py b/QEfficient/cloud/infer.py
index 0ba0961e3..ee82303ef 100644
--- a/QEfficient/cloud/infer.py
+++ b/QEfficient/cloud/infer.py
@@ -7,13 +7,11 @@
 
 import argparse
 import logging
-import os
+import sys
 from typing import List, Optional
 
-import QEfficient
-from QEfficient.cloud.export import get_onnx_model_path
-from QEfficient.generation.text_generation_inference import cloud_ai_100_exec_kv
-from QEfficient.utils import check_and_assign_cache_dir, get_qpc_dir_path, load_hf_tokenizer, qpc_exists
+from QEfficient.transformers.models.modeling_auto import QEFFAutoModelForCausalLM
+from QEfficient.utils import check_and_assign_cache_dir, load_hf_tokenizer
 from QEfficient.utils.logging_utils import logger
 
 
@@ -38,6 +36,7 @@ def main(
     allow_mxint8_mdp_io: bool = False,
     enable_qnn: Optional[bool] = False,
     qnn_config: Optional[str] = None,
+    **kwargs,
 ) -> None:
     """
     1. Check if compiled qpc for given config already exists, if it does jump to execute, else
@@ -79,60 +78,44 @@ def main(
         hf_token=hf_token,
     )
 
-    qpc_dir_path = get_qpc_dir_path(
-        model_name,
-        num_cores,
-        mos,
-        batch_size,
-        prompt_len,
-        ctx_len,
-        mxfp6,
-        mxint8,
-        device_group,
-        full_batch_size,
+    if enable_qnn and qnn_config is not None:
+        logger.error("QNN compilation is currently not supported in High Level APIs of QEFFAutoModelForCausalLM.")
+
+    if "--mxfp6" in sys.argv:
+        if args.mxfp6:
+            logger.warning("mxfp6 is going to be deprecated in a future release, use --mxfp6_matmul instead.")
+    if "--mxint8" in sys.argv:
+        if args.mxint8:
+            logger.warning("mxint8 is going to be deprecated in a future release, use --mxint8_kv_cache instead.")
+
+    qeff_model = QEFFAutoModelForCausalLM.from_pretrained(model_name)
+
+    #########
+    # Compile
+    #########
+    _ = qeff_model.compile(
+        prefill_seq_len=prompt_len,
+        ctx_len=ctx_len,
+        num_cores=num_cores,
+        mxfp6_matmul=mxfp6,
+        aic_enable_depth_first=aic_enable_depth_first,
+        batch_size=batch_size,
+        mos=mos,
+        mxint8_kv_cache=mxint8,
+        num_devices=len(device_group),
+        full_batch_size=full_batch_size,
+        allow_mxint8_mdp_io=allow_mxint8_mdp_io,
         enable_qnn=enable_qnn,
+        **kwargs,
     )
 
-    # Handle qpc generation
-    if qpc_exists(qpc_dir_path):
-        logger.info(f"Pre-compiled qpc found at {qpc_dir_path}! Executing with given prompt")
-    else:
-        # Handle onnx model generation
-        onnx_model_path = get_onnx_model_path(
-            model_name, cache_dir, tokenizer, hf_token, local_model_dir, full_batch_size
-        )  # , base_dir_name)
-
-        #########
-        # Compile
-        #########
-        _ = QEfficient.compile(
-            onnx_path=onnx_model_path,
-            qpc_path=os.path.dirname(
-                qpc_dir_path
-            ),  # We need to pass parent directory of qpc_dir_path, as the compile function handles the qpcs directory creation
-            num_cores=num_cores,
-            batch_size=batch_size,
-            prompt_len=prompt_len,
-            ctx_len=ctx_len,
-            mxfp6=mxfp6,
-            mxint8=mxint8,
-            aic_enable_depth_first=aic_enable_depth_first,
-            mos=mos,
-            device_group=device_group,
-            full_batch_size=full_batch_size,
-            allow_mxint8_mdp_io=allow_mxint8_mdp_io,
-            enable_qnn=enable_qnn,
-            qnn_config=qnn_config,
-        )
-
     #########
     # Execute
     #########
-    cloud_ai_100_exec_kv(
-        tokenizer=tokenizer,
-        qpc_path=qpc_dir_path,
+    _ = qeff_model.generate(
+        tokenizer,
+        prompts=prompt,
         device_id=device_group,
-        prompt=prompt,
         prompts_txt_file_path=prompts_txt_file_path,
         generation_len=generation_len,
     )
@@ -162,10 +145,16 @@
     )
     parser.add_argument("--ctx-len", "--ctx_len", default=128, type=int, help="Context length for text generation.")
     parser.add_argument(
-        "--mxfp6", action="store_true", help="Compress constant MatMul weights to MXFP6 E2M3, default is no compression"
+        "--mxfp6",
+        "--mxfp6_matmul",
+        "--mxfp6-matmul",
+        action="store_true",
+        help="Compress constant MatMul weights to MXFP6 E2M3, default is no compression",
    )
     parser.add_argument(
         "--mxint8",
+        "--mxint8_kv_cache",
+        "--mxint8-kv-cache",
         action="store_true",
         help="Compress Present/Past KV to MXINT8 using CustomIO config, default is False",
     )
@@ -237,8 +226,20 @@
         type=str,
     )
 
-    args = parser.parse_args()
+    args, compiler_options = parser.parse_known_args()
+    compiler_options_dict = {}
+    for i in range(0, len(compiler_options)):
+        if compiler_options[i].startswith("--"):
+            key = compiler_options[i].lstrip("-")
+            value = (
+                compiler_options[i + 1]
+                if i + 1 < len(compiler_options) and not compiler_options[i + 1].startswith("-")
+                else True
+            )
+            compiler_options_dict[key] = value
+
     if args.verbose:
         logger.setLevel(logging.INFO)
     del args.verbose  # type: ignore
-    main(**args.__dict__)
+
+    main(**args.__dict__, **compiler_options_dict)

diff --git a/QEfficient/transformers/models/modeling_auto.py b/QEfficient/transformers/models/modeling_auto.py
index 83c573f6d..6b7c1443c 100644
--- a/QEfficient/transformers/models/modeling_auto.py
+++ b/QEfficient/transformers/models/modeling_auto.py
@@ -266,6 +266,7 @@ def compile(
             :num_speculative_tokens (int, optional): Number of speculative tokens to take as input for Speculative Decoding Target Language Model.
             :mos (int, optional): Effort level to reduce on-chip memory. Defaults to -1, meaning no effort. ``Defaults to -1``.
             :aic_enable_depth_first (bool, optional): Enables DFS with default memory size. ``Defaults to False``.
+            :allow_mxint8_mdp_io (bool, optional): Allows MXINT8 compression of MDP IO traffic. ``Defaults to False.``
 
         Returns:
             :str: Path of the compiled ``qpc`` package.
Executing with given prompt") - else: - # Handle onnx model generation - onnx_model_path = get_onnx_model_path( - model_name, cache_dir, tokenizer, hf_token, local_model_dir, full_batch_size - ) # , base_dir_name) - - ######### - # Compile - ######### - _ = QEfficient.compile( - onnx_path=onnx_model_path, - qpc_path=os.path.dirname( - qpc_dir_path - ), # We need to pass parent directory of qpc_dir_path, as the compile function handles the qpcs directory creation - num_cores=num_cores, - batch_size=batch_size, - prompt_len=prompt_len, - ctx_len=ctx_len, - mxfp6=mxfp6, - mxint8=mxint8, - aic_enable_depth_first=aic_enable_depth_first, - mos=mos, - device_group=device_group, - full_batch_size=full_batch_size, - allow_mxint8_mdp_io=allow_mxint8_mdp_io, - enable_qnn=enable_qnn, - qnn_config=qnn_config, - ) - ######### # Execute ######### - cloud_ai_100_exec_kv( - tokenizer=tokenizer, - qpc_path=qpc_dir_path, + _ = qeff_model.generate( + tokenizer, + prompts=prompt, device_id=device_group, - prompt=prompt, prompts_txt_file_path=prompts_txt_file_path, generation_len=generation_len, ) @@ -162,10 +145,16 @@ def main( ) parser.add_argument("--ctx-len", "--ctx_len", default=128, type=int, help="Context length for text generation.") parser.add_argument( - "--mxfp6", action="store_true", help="Compress constant MatMul weights to MXFP6 E2M3, default is no compression" + "--mxfp6", + "--mxfp6_matmul", + "--mxfp6-matmul", + action="store_true", + help="Compress constant MatMul weights to MXFP6 E2M3, default is no compression", ) parser.add_argument( "--mxint8", + "--mxint8_kv_cache", + "--mxint8-kv-cache", action="store_true", help="Compress Present/Past KV to MXINT8 using CustomIO config, default is False", ) @@ -237,8 +226,20 @@ def main( type=str, ) - args = parser.parse_args() + args, compiler_options = parser.parse_known_args() + compiler_options_dict = {} + for i in range(0, len(compiler_options)): + if compiler_options[i].startswith("--"): + key = compiler_options[i].lstrip("-") + value = ( + compiler_options[i + 1] + if i + 1 < len(compiler_options) and not compiler_options[i + 1].startswith("-") + else True + ) + compiler_options_dict[key] = value + if args.verbose: logger.setLevel(logging.INFO) del args.verbose # type: ignore - main(**args.__dict__) + + main(**args.__dict__, **compiler_options_dict) diff --git a/QEfficient/transformers/models/modeling_auto.py b/QEfficient/transformers/models/modeling_auto.py index 83c573f6d..6b7c1443c 100644 --- a/QEfficient/transformers/models/modeling_auto.py +++ b/QEfficient/transformers/models/modeling_auto.py @@ -266,6 +266,7 @@ def compile( :num_speculative_tokens (int, optional): Number of speculative tokens to take as input for Speculative Decoding Target Language Model. :mos (int, optional): Effort level to reduce on-chip memory. Defaults to -1, meaning no effort. ``Defaults to -1``. :aic_enable_depth_first (bool, optional): Enables DFS with default memory size. ``Defaults to False``. + :allow_mxint8_mdp_io (bool, optional): Allows MXINT8 compression of MDP IO traffic. ``Defaults to False.`` Returns: :str: Path of the compiled ``qpc`` package. From 4d5b14c3f6ad5e757f4afcc6fae0a271dc11da80 Mon Sep 17 00:00:00 2001 From: Asmita Goswami Date: Thu, 19 Dec 2024 09:23:20 +0000 Subject: [PATCH 06/12] Revert "Migrating HL compile and export to infer APIs" This reverts commit 78bb2d7c76f52fc3c0832291612fa825d58f7dc8. 
Change-Id: I39efc3c537ab08f354585695c4fd0d42ece48478 --- tests/cloud/test_export.py | 10 ++++++++++ tests/cloud/test_infer.py | 19 +++++++++++++++++++ 2 files changed, 29 insertions(+) diff --git a/tests/cloud/test_export.py b/tests/cloud/test_export.py index a2b717634..4291da23a 100644 --- a/tests/cloud/test_export.py +++ b/tests/cloud/test_export.py @@ -5,9 +5,12 @@ # # ----------------------------------------------------------------------------- +import os import pytest +import QEfficient +import QEfficient.cloud.export from QEfficient.cloud.export import main as export @@ -22,6 +25,8 @@ def test_export(setup, mocker): mocker: mocker is itself a pytest fixture, uses to mock or spy internal functions. """ ms = setup + check_and_assign_cache_dir_spy = mocker.spy(QEfficient.cloud.export, "check_and_assign_cache_dir") + get_onnx_model_path_spy = mocker.spy(QEfficient.cloud.export, "get_onnx_model_path") export( model_name=ms.model_name, @@ -29,3 +34,8 @@ def test_export(setup, mocker): local_model_dir=ms.local_model_dir, full_batch_size=ms.full_batch_size, ) + + check_and_assign_cache_dir_spy.assert_called_once() + get_onnx_model_path_spy.assert_called_once() + assert any(os.path.isfile(x) for x in ms.onnx_model_path()) + assert get_onnx_model_path_spy.spy_return in ms.onnx_model_path() diff --git a/tests/cloud/test_infer.py b/tests/cloud/test_infer.py index 22191b9ce..e28c3a38a 100644 --- a/tests/cloud/test_infer.py +++ b/tests/cloud/test_infer.py @@ -5,9 +5,12 @@ # # ----------------------------------------------------------------------------- +import os import pytest +import QEfficient +import QEfficient.cloud.infer from QEfficient.cloud.infer import main as infer @@ -27,6 +30,11 @@ def test_infer(setup, mocker): Ref: https://pytest-mock.readthedocs.io/en/latest/usage.html """ ms = setup + load_hf_tokenizer_spy = mocker.spy(QEfficient.cloud.infer, "load_hf_tokenizer") + qpc_exists_spy = mocker.spy(QEfficient.cloud.infer, "qpc_exists") + get_onnx_model_path_spy = mocker.spy(QEfficient.cloud.infer, "get_onnx_model_path") + compile_spy = mocker.spy(QEfficient, "compile") + cloud_ai_100_exec_kv_spy = mocker.spy(QEfficient.cloud.infer, "cloud_ai_100_exec_kv") infer( model_name=ms.model_name, @@ -46,3 +54,14 @@ def test_infer(setup, mocker): full_batch_size=ms.full_batch_size, enable_qnn=ms.enable_qnn, ) + # tokenizer check + load_hf_tokenizer_spy.assert_called_once() + # qpc exist check + qpc_exists_spy.assert_called_once() + if qpc_exists_spy.spy_return is True: + assert os.path.isdir(ms.qpc_dir_path()) + else: + get_onnx_model_path_spy.assert_called_once() + compile_spy.assert_called_once() + assert compile_spy.spy_return == ms.qpc_dir_path() + cloud_ai_100_exec_kv_spy.assert_called_once() From 1ee05b33b377aedb5bbda1174211f82062f54398 Mon Sep 17 00:00:00 2001 From: Asmita Goswami Date: Thu, 19 Dec 2024 09:23:20 +0000 Subject: [PATCH 07/12] Migrating HL compile and export to infer APIs Change-Id: I39efc3c537ab08f354585695c4fd0d42ece48478 Signed-off-by: Asmita Goswami --- tests/cloud/test_export.py | 10 ++++++++++ tests/cloud/test_infer.py | 19 +++++++++++++++++++ 2 files changed, 29 insertions(+) diff --git a/tests/cloud/test_export.py b/tests/cloud/test_export.py index a2b717634..4291da23a 100644 --- a/tests/cloud/test_export.py +++ b/tests/cloud/test_export.py @@ -5,9 +5,12 @@ # # ----------------------------------------------------------------------------- +import os import pytest +import QEfficient +import QEfficient.cloud.export from QEfficient.cloud.export import main as export @@ 
-22,6 +25,8 @@ def test_export(setup, mocker): mocker: mocker is itself a pytest fixture, uses to mock or spy internal functions. """ ms = setup + check_and_assign_cache_dir_spy = mocker.spy(QEfficient.cloud.export, "check_and_assign_cache_dir") + get_onnx_model_path_spy = mocker.spy(QEfficient.cloud.export, "get_onnx_model_path") export( model_name=ms.model_name, @@ -29,3 +34,8 @@ def test_export(setup, mocker): local_model_dir=ms.local_model_dir, full_batch_size=ms.full_batch_size, ) + + check_and_assign_cache_dir_spy.assert_called_once() + get_onnx_model_path_spy.assert_called_once() + assert any(os.path.isfile(x) for x in ms.onnx_model_path()) + assert get_onnx_model_path_spy.spy_return in ms.onnx_model_path() diff --git a/tests/cloud/test_infer.py b/tests/cloud/test_infer.py index 22191b9ce..e28c3a38a 100644 --- a/tests/cloud/test_infer.py +++ b/tests/cloud/test_infer.py @@ -5,9 +5,12 @@ # # ----------------------------------------------------------------------------- +import os import pytest +import QEfficient +import QEfficient.cloud.infer from QEfficient.cloud.infer import main as infer @@ -27,6 +30,11 @@ def test_infer(setup, mocker): Ref: https://pytest-mock.readthedocs.io/en/latest/usage.html """ ms = setup + load_hf_tokenizer_spy = mocker.spy(QEfficient.cloud.infer, "load_hf_tokenizer") + qpc_exists_spy = mocker.spy(QEfficient.cloud.infer, "qpc_exists") + get_onnx_model_path_spy = mocker.spy(QEfficient.cloud.infer, "get_onnx_model_path") + compile_spy = mocker.spy(QEfficient, "compile") + cloud_ai_100_exec_kv_spy = mocker.spy(QEfficient.cloud.infer, "cloud_ai_100_exec_kv") infer( model_name=ms.model_name, @@ -46,3 +54,14 @@ def test_infer(setup, mocker): full_batch_size=ms.full_batch_size, enable_qnn=ms.enable_qnn, ) + # tokenizer check + load_hf_tokenizer_spy.assert_called_once() + # qpc exist check + qpc_exists_spy.assert_called_once() + if qpc_exists_spy.spy_return is True: + assert os.path.isdir(ms.qpc_dir_path()) + else: + get_onnx_model_path_spy.assert_called_once() + compile_spy.assert_called_once() + assert compile_spy.spy_return == ms.qpc_dir_path() + cloud_ai_100_exec_kv_spy.assert_called_once() From 0c337cfcab2a47a9697b7e1e432a9a74b6846ead Mon Sep 17 00:00:00 2001 From: Asmita Goswami Date: Fri, 13 Dec 2024 06:38:22 +0000 Subject: [PATCH 08/12] Migrating HL compile and export to infer APIs Change-Id: If99f867d0ea0bead87f43fdcdb5537eda72a9db5 Signed-off-by: Asmita Goswami --- tests/cloud/test_export.py | 10 ---------- tests/cloud/test_infer.py | 19 ------------------- 2 files changed, 29 deletions(-) diff --git a/tests/cloud/test_export.py b/tests/cloud/test_export.py index 4291da23a..a2b717634 100644 --- a/tests/cloud/test_export.py +++ b/tests/cloud/test_export.py @@ -5,12 +5,9 @@ # # ----------------------------------------------------------------------------- -import os import pytest -import QEfficient -import QEfficient.cloud.export from QEfficient.cloud.export import main as export @@ -25,8 +22,6 @@ def test_export(setup, mocker): mocker: mocker is itself a pytest fixture, uses to mock or spy internal functions. 
""" ms = setup - check_and_assign_cache_dir_spy = mocker.spy(QEfficient.cloud.export, "check_and_assign_cache_dir") - get_onnx_model_path_spy = mocker.spy(QEfficient.cloud.export, "get_onnx_model_path") export( model_name=ms.model_name, @@ -34,8 +29,3 @@ def test_export(setup, mocker): local_model_dir=ms.local_model_dir, full_batch_size=ms.full_batch_size, ) - - check_and_assign_cache_dir_spy.assert_called_once() - get_onnx_model_path_spy.assert_called_once() - assert any(os.path.isfile(x) for x in ms.onnx_model_path()) - assert get_onnx_model_path_spy.spy_return in ms.onnx_model_path() diff --git a/tests/cloud/test_infer.py b/tests/cloud/test_infer.py index e28c3a38a..22191b9ce 100644 --- a/tests/cloud/test_infer.py +++ b/tests/cloud/test_infer.py @@ -5,12 +5,9 @@ # # ----------------------------------------------------------------------------- -import os import pytest -import QEfficient -import QEfficient.cloud.infer from QEfficient.cloud.infer import main as infer @@ -30,11 +27,6 @@ def test_infer(setup, mocker): Ref: https://pytest-mock.readthedocs.io/en/latest/usage.html """ ms = setup - load_hf_tokenizer_spy = mocker.spy(QEfficient.cloud.infer, "load_hf_tokenizer") - qpc_exists_spy = mocker.spy(QEfficient.cloud.infer, "qpc_exists") - get_onnx_model_path_spy = mocker.spy(QEfficient.cloud.infer, "get_onnx_model_path") - compile_spy = mocker.spy(QEfficient, "compile") - cloud_ai_100_exec_kv_spy = mocker.spy(QEfficient.cloud.infer, "cloud_ai_100_exec_kv") infer( model_name=ms.model_name, @@ -54,14 +46,3 @@ def test_infer(setup, mocker): full_batch_size=ms.full_batch_size, enable_qnn=ms.enable_qnn, ) - # tokenizer check - load_hf_tokenizer_spy.assert_called_once() - # qpc exist check - qpc_exists_spy.assert_called_once() - if qpc_exists_spy.spy_return is True: - assert os.path.isdir(ms.qpc_dir_path()) - else: - get_onnx_model_path_spy.assert_called_once() - compile_spy.assert_called_once() - assert compile_spy.spy_return == ms.qpc_dir_path() - cloud_ai_100_exec_kv_spy.assert_called_once() From 78a082692298ebe74328a20c182e1a9258585789 Mon Sep 17 00:00:00 2001 From: Asmita Goswami Date: Fri, 13 Dec 2024 06:38:22 +0000 Subject: [PATCH 09/12] Migrating HL compile and export to infer APIs Change-Id: If99f867d0ea0bead87f43fdcdb5537eda72a9db5 Signed-off-by: Asmita Goswami --- QEfficient/cloud/infer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/QEfficient/cloud/infer.py b/QEfficient/cloud/infer.py index ee82303ef..fd7e431d5 100644 --- a/QEfficient/cloud/infer.py +++ b/QEfficient/cloud/infer.py @@ -102,7 +102,7 @@ def main( batch_size=batch_size, mos=mos, mxint8_kv_cache=mxint8, - num_devices=len(device_group), + num_devices = (0 if device_group is None else len(device_group)), full_batch_size=full_batch_size, allow_mxint8_mdp_io=allow_mxint8_mdp_io, enable_qnn=enable_qnn, From d7f83fac176120eecd30b85c6c6355dcfbc94fb0 Mon Sep 17 00:00:00 2001 From: Asmita Goswami Date: Fri, 13 Dec 2024 06:38:22 +0000 Subject: [PATCH 10/12] Migrating HL compile and export to infer APIs Change-Id: If99f867d0ea0bead87f43fdcdb5537eda72a9db5 Signed-off-by: Asmita Goswami --- QEfficient/cloud/infer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/QEfficient/cloud/infer.py b/QEfficient/cloud/infer.py index fd7e431d5..ba0860152 100644 --- a/QEfficient/cloud/infer.py +++ b/QEfficient/cloud/infer.py @@ -102,7 +102,7 @@ def main( batch_size=batch_size, mos=mos, mxint8_kv_cache=mxint8, - num_devices = (0 if device_group is None else len(device_group)), + num_devices=(0 if 
From 4a79f93b0b86907de7678fb2ac54049aa7676c01 Mon Sep 17 00:00:00 2001
From: Asmita Goswami
Date: Thu, 19 Dec 2024 09:23:20 +0000
Subject: [PATCH 11/12] Migrating HL compile and export to infer APIs

Change-Id: I39efc3c537ab08f354585695c4fd0d42ece48478
Signed-off-by: Asmita Goswami
---
 tests/cloud/test_export.py | 10 ++++++++++
 tests/cloud/test_infer.py  | 19 +++++++++++++++++++
 2 files changed, 29 insertions(+)

diff --git a/tests/cloud/test_export.py b/tests/cloud/test_export.py
index a2b717634..4291da23a 100644
--- a/tests/cloud/test_export.py
+++ b/tests/cloud/test_export.py
@@ -5,9 +5,12 @@
 #
 # -----------------------------------------------------------------------------
 
+import os
 
 import pytest
 
+import QEfficient
+import QEfficient.cloud.export
 from QEfficient.cloud.export import main as export
 
 
@@ -22,6 +25,8 @@ def test_export(setup, mocker):
     mocker: mocker is itself a pytest fixture, uses to mock or spy internal functions.
     """
     ms = setup
+    check_and_assign_cache_dir_spy = mocker.spy(QEfficient.cloud.export, "check_and_assign_cache_dir")
+    get_onnx_model_path_spy = mocker.spy(QEfficient.cloud.export, "get_onnx_model_path")
 
     export(
         model_name=ms.model_name,
@@ -29,3 +34,8 @@ def test_export(setup, mocker):
         local_model_dir=ms.local_model_dir,
         full_batch_size=ms.full_batch_size,
     )
+
+    check_and_assign_cache_dir_spy.assert_called_once()
+    get_onnx_model_path_spy.assert_called_once()
+    assert any(os.path.isfile(x) for x in ms.onnx_model_path())
+    assert get_onnx_model_path_spy.spy_return in ms.onnx_model_path()

diff --git a/tests/cloud/test_infer.py b/tests/cloud/test_infer.py
index 22191b9ce..e28c3a38a 100644
--- a/tests/cloud/test_infer.py
+++ b/tests/cloud/test_infer.py
@@ -5,9 +5,12 @@
 #
 # -----------------------------------------------------------------------------
 
+import os
 
 import pytest
 
+import QEfficient
+import QEfficient.cloud.infer
 from QEfficient.cloud.infer import main as infer
 
 
@@ -27,6 +30,11 @@ def test_infer(setup, mocker):
     Ref: https://pytest-mock.readthedocs.io/en/latest/usage.html
     """
     ms = setup
+    load_hf_tokenizer_spy = mocker.spy(QEfficient.cloud.infer, "load_hf_tokenizer")
+    qpc_exists_spy = mocker.spy(QEfficient.cloud.infer, "qpc_exists")
+    get_onnx_model_path_spy = mocker.spy(QEfficient.cloud.infer, "get_onnx_model_path")
+    compile_spy = mocker.spy(QEfficient, "compile")
+    cloud_ai_100_exec_kv_spy = mocker.spy(QEfficient.cloud.infer, "cloud_ai_100_exec_kv")
 
     infer(
         model_name=ms.model_name,
@@ -46,3 +54,14 @@ def test_infer(setup, mocker):
         full_batch_size=ms.full_batch_size,
         enable_qnn=ms.enable_qnn,
     )
+    # tokenizer check
+    load_hf_tokenizer_spy.assert_called_once()
+    # qpc exist check
+    qpc_exists_spy.assert_called_once()
+    if qpc_exists_spy.spy_return is True:
+        assert os.path.isdir(ms.qpc_dir_path())
+    else:
+        get_onnx_model_path_spy.assert_called_once()
+        compile_spy.assert_called_once()
+        assert compile_spy.spy_return == ms.qpc_dir_path()
+    cloud_ai_100_exec_kv_spy.assert_called_once()
From 3edbd7c8f73804e4549891746ba770e103c45e80 Mon Sep 17 00:00:00 2001
From: Asmita Goswami
Date: Thu, 19 Dec 2024 09:44:37 +0000
Subject: [PATCH 12/12] Migrating HL compile and export to infer APIs

Signed-off-by: Asmita Goswami
---
 QEfficient/cloud/infer.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/QEfficient/cloud/infer.py b/QEfficient/cloud/infer.py
index ba0860152..fd7e431d5 100644
--- a/QEfficient/cloud/infer.py
+++ b/QEfficient/cloud/infer.py
@@ -102,7 +102,7 @@ def main(
         batch_size=batch_size,
         mos=mos,
         mxint8_kv_cache=mxint8,
-        num_devices=(0 if device_group is None else len(device_group)),
+        num_devices = (0 if device_group is None else len(device_group)),
         full_batch_size=full_batch_size,
         allow_mxint8_mdp_io=allow_mxint8_mdp_io,
         enable_qnn=enable_qnn,
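Series note: with the series applied end to end, the legacy --mxfp6 and --mxint8 spellings remain accepted alongside the new aliases added above, and unknown trailing flags flow through to compile(). A representative invocation; the model card, device ids, and the trailing flag are illustrative, not defaults:

    python -m QEfficient.cloud.infer \
        --model_name gpt2 \
        --batch_size 1 --prompt_len 32 --ctx_len 128 \
        --num_cores 16 --device_group [0] \
        --prompt "My name is" \
        --mxfp6_matmul --mxint8_kv_cache \
        --custom_flag 42    # unknown to the parser, forwarded to qeff_model.compile(**kwargs)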