Migrating HL compile and export to infer APIs #214

Draft · wants to merge 5 commits into base: main
41 changes: 12 additions & 29 deletions QEfficient/cloud/export.py
@@ -7,12 +7,10 @@

import argparse
import os
from typing import Optional, Union
from typing import Optional

from transformers import PreTrainedTokenizer, PreTrainedTokenizerFast

from QEfficient.exporter.export_hf_to_cloud_ai_100 import qualcomm_efficient_converter
from QEfficient.utils import check_and_assign_cache_dir, onnx_exists
from QEfficient.transformers.models.modeling_auto import QEFFAutoModelForCausalLM
from QEfficient.utils import check_and_assign_cache_dir
from QEfficient.utils.logging_utils import logger

# Specifically for Docker images.
@@ -22,9 +20,7 @@
def get_onnx_model_path(
model_name: str,
cache_dir: Optional[str] = None,
tokenizer: Optional[Union[PreTrainedTokenizerFast, PreTrainedTokenizer]] = None,
hf_token: Optional[str] = None,
local_model_dir: Optional[str] = None,
full_batch_size: Optional[int] = None,
):
"""
@@ -39,27 +35,15 @@ def get_onnx_model_path(
:local_model_dir (str): Path to custom model weights and config files. ``Defaults to None.``
:full_batch_size (int): Set full batch size to enable continuous batching mode. ``Defaults to None.``
"""
onnx_path_exists, onnx_dir_path, onnx_model_path = onnx_exists(model_name, full_batch_size)
if onnx_path_exists:
logger.info(f"Pre-exported ONNX files found at {onnx_dir_path}! Jumping to Compilation")
else:
###################
# hf model -> export
####################
# Export to the Onnx
logger.info(f"Exporting Pytorch {model_name} model to ONNX...")
_, onnx_model_path = qualcomm_efficient_converter(
model_name=model_name,
local_model_dir=local_model_dir,
tokenizer=tokenizer,
onnx_dir_path=onnx_dir_path,
kv=True,
form_factor="cloud",
hf_token=hf_token,
cache_dir=cache_dir,
full_batch_size=full_batch_size,
) # type: ignore
logger.info(f"Generated onnx_path: {onnx_model_path}, onnx_dir_path: {onnx_dir_path}")
logger.info(f"Exporting Pytorch {model_name} model to ONNX...")
qeff_model = QEFFAutoModelForCausalLM.from_pretrained(
model_name,
cache_dir,
hf_token=hf_token,
full_batch_size=full_batch_size,
)
onnx_model_path = qeff_model.export()
logger.info(f"Generated onnx_path: {onnx_model_path}")
return onnx_model_path


@@ -92,7 +76,6 @@ def main(
model_name=model_name,
cache_dir=cache_dir,
hf_token=hf_token,
local_model_dir=local_model_dir,
full_batch_size=full_batch_size,
)

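With this change, the export path in `export.py` goes through the high-level auto class instead of `qualcomm_efficient_converter`. A minimal sketch of the new flow, based only on the calls shown above (the checkpoint name is illustrative and default cache/token settings are assumed):

```python
from QEfficient.transformers.models.modeling_auto import QEFFAutoModelForCausalLM

# "gpt2" is only an example checkpoint; hf_token and full_batch_size are optional,
# mirroring the keyword arguments used in get_onnx_model_path() above.
qeff_model = QEFFAutoModelForCausalLM.from_pretrained("gpt2")

# Export to ONNX; the wrapper chooses the output directory and returns the model path.
onnx_model_path = qeff_model.export()
print(onnx_model_path)
```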
132 changes: 78 additions & 54 deletions QEfficient/cloud/infer.py
@@ -7,15 +7,29 @@

import argparse
import logging
import os
import sys
from typing import List, Optional

import QEfficient
from QEfficient.cloud.export import get_onnx_model_path
from QEfficient.generation.text_generation_inference import cloud_ai_100_exec_kv
from QEfficient.utils import check_and_assign_cache_dir, get_qpc_dir_path, load_hf_tokenizer, qpc_exists
from transformers import AutoConfig

from QEfficient.transformers.models.modeling_auto import QEFFAutoModelForCausalLM
from QEfficient.utils import check_and_assign_cache_dir, load_hf_tokenizer
from QEfficient.utils.logging_utils import logger

# Map model's architecture to class
architecture_mapping = {
"LlamaForCausalLM": QEFFAutoModelForCausalLM,
"GPT2LMHeadModel": QEFFAutoModelForCausalLM,
"MistralForCausalLM": QEFFAutoModelForCausalLM,
"FalconForCausalLM": QEFFAutoModelForCausalLM,
"GPTJForCausalLM": QEFFAutoModelForCausalLM,
"GemmaForCausalLM": QEFFAutoModelForCausalLM,
"Gemma2ForCausalLM": QEFFAutoModelForCausalLM,
"Phi3ForCausalLM": QEFFAutoModelForCausalLM,
"Qwen2ForCausalLM": QEFFAutoModelForCausalLM,
"GPTBigCodeForCausalLM": QEFFAutoModelForCausalLM,
}


def main(
model_name: str,
@@ -38,6 +52,7 @@ def main(
allow_mxint8_mdp_io: bool = False,
enable_qnn: Optional[bool] = False,
qnn_config: Optional[str] = None,
**kwargs,
) -> None:
"""
1. Check if compiled qpc for given config already exists, if it does jump to execute, else
@@ -79,58 +94,51 @@
hf_token=hf_token,
)

qpc_dir_path = get_qpc_dir_path(
model_name,
num_cores,
mos,
batch_size,
prompt_len,
ctx_len,
mxfp6,
mxint8,
device_group,
full_batch_size,
if enable_qnn and qnn_config is not None:
logger.error("QNN compilation is currently not supported in High Level APIs of QEFFAutoModelForCausalLM.")

if "--mxfp6" in sys.argv:
if args.mxfp6:
logger.warning("mxfp6 is going to be deprecated in a future release, use -mxfp6_matmul instead.")
if "--mxint8" in sys.argv:
if args.mxint8:
logger.warning("mxint8 is going to be deprecated in a future release, use -mxint8_kv_cache instead.")

config = AutoConfig.from_pretrained(model_name)
architecture = config.architectures[0] if config.architectures else None

model_class = architecture_mapping.get(architecture)
if not model_class:
logger.error(f"Model class for model name {model_name} not found in mapping")
return

qeff_model = model_class.from_pretrained(model_name)

#########
# Compile
#########
_ = qeff_model.compile(
prefill_seq_len=prompt_len,
ctx_len=ctx_len,
num_cores=num_cores,
mxfp6_matmul=mxfp6,
aic_enable_depth_first=aic_enable_depth_first,
batch_size=batch_size,
mos=mos,
mxint8_kv_cache=mxint8,
num_devices=(0 if device_group is None else len(device_group)),
full_batch_size=full_batch_size,
allow_mxint8_mdp_io=allow_mxint8_mdp_io,
enable_qnn=enable_qnn,
**kwargs,
)

# Handle qpc generation
if qpc_exists(qpc_dir_path):
logger.info(f"Pre-compiled qpc found at {qpc_dir_path}! Executing with given prompt")
else:
# Handle onnx model generation
onnx_model_path = get_onnx_model_path(
model_name, cache_dir, tokenizer, hf_token, local_model_dir, full_batch_size
) # , base_dir_name)

#########
# Compile
#########
_ = QEfficient.compile(
onnx_path=onnx_model_path,
qpc_path=os.path.dirname(
qpc_dir_path
), # We need to pass parent directory of qpc_dir_path, as the compile function handles the qpcs directory creation
num_cores=num_cores,
batch_size=batch_size,
prompt_len=prompt_len,
ctx_len=ctx_len,
mxfp6=mxfp6,
mxint8=mxint8,
aic_enable_depth_first=aic_enable_depth_first,
mos=mos,
device_group=device_group,
full_batch_size=full_batch_size,
allow_mxint8_mdp_io=allow_mxint8_mdp_io,
enable_qnn=enable_qnn,
qnn_config=qnn_config,
)

#########
# Execute
#########
cloud_ai_100_exec_kv(
tokenizer=tokenizer,
qpc_path=qpc_dir_path,
_ = qeff_model.generate(
tokenizer,
prompts=prompt,
device_id=device_group,
prompt=prompt,
prompts_txt_file_path=prompts_txt_file_path,
@@ -162,10 +170,16 @@ def main(
)
parser.add_argument("--ctx-len", "--ctx_len", default=128, type=int, help="Context length for text generation.")
parser.add_argument(
"--mxfp6", action="store_true", help="Compress constant MatMul weights to MXFP6 E2M3, default is no compression"
"--mxfp6",
"--mxfp6_matmul",
"--mxfp6-matmul",
action="store_true",
help="Compress constant MatMul weights to MXFP6 E2M3, default is no compression",
)
parser.add_argument(
"--mxint8",
"--mxint8_kv_cache",
"--mxint8-kv-cache",
action="store_true",
help="Compress Present/Past KV to MXINT8 using CustomIO config, default is False",
)
@@ -237,8 +251,18 @@ def main(
type=str,
)

args = parser.parse_args()
args, compiler_options = parser.parse_known_args()
compiler_options_dict = {}
for i in range(0, len(compiler_options)):
if compiler_options[i].startswith("--"):
key = compiler_options[i].lstrip("-")
value = (
compiler_options[i + 1]
if i + 1 < len(compiler_options) and not compiler_options[i + 1].startswith("-")
else True
)
compiler_options_dict[key] = value
if args.verbose:
logger.setLevel(logging.INFO)
del args.verbose # type: ignore
main(**args.__dict__)
main(**args.__dict__, **compiler_options_dict)
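The `parse_known_args` change above forwards any unrecognized `--key value` pair (or bare `--flag`) to `compile()` as an extra keyword argument. A standalone sketch of that parsing behaviour, with made-up flag names for illustration:

```python
import argparse

parser = argparse.ArgumentParser()
parser.add_argument("--model_name", required=True)

# Known args are parsed normally; everything else lands in `extra`.
args, extra = parser.parse_known_args(
    ["--model_name", "gpt2", "--some_compiler_flag", "2", "--another_flag"]
)

compiler_options = {}
for i, token in enumerate(extra):
    if token.startswith("--"):
        key = token.lstrip("-")
        # Take the next token as the value unless it is another flag; bare flags become True.
        value = extra[i + 1] if i + 1 < len(extra) and not extra[i + 1].startswith("-") else True
        compiler_options[key] = value

print(compiler_options)  # {'some_compiler_flag': '2', 'another_flag': True}
```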
1 change: 1 addition & 0 deletions QEfficient/transformers/models/modeling_auto.py
@@ -278,6 +278,7 @@ def compile(
:aic_enable_depth_first (bool, optional): Enables DFS with default memory size. ``Defaults to False``.
:enable_qnn (bool): Enables QNN Compilation. ``Defaults to False.``
:qnn_config (str): Path of QNN Config parameters file. ``Defaults to None.``
:allow_mxint8_mdp_io (bool, optional): Allows MXINT8 compression of MDP IO traffic. ``Defaults to False.``

Returns:
:str: Path of the compiled ``qpc`` package.
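Putting the pieces together, `infer.py` now drives everything through `QEFFAutoModelForCausalLM`. A rough end-to-end sketch using parameter names taken from the diff (the checkpoint and values are illustrative; actual defaults and device selection may differ):

```python
from transformers import AutoTokenizer

from QEfficient.transformers.models.modeling_auto import QEFFAutoModelForCausalLM

model_name = "gpt2"  # illustrative checkpoint
qeff_model = QEFFAutoModelForCausalLM.from_pretrained(model_name)

# Compile for Cloud AI 100; allow_mxint8_mdp_io is the option whose docstring is added above.
qeff_model.compile(
    prefill_seq_len=32,
    ctx_len=128,
    num_cores=16,
    mxfp6_matmul=False,
    mxint8_kv_cache=False,
    allow_mxint8_mdp_io=False,
)

# Run generation on the compiled qpc.
tokenizer = AutoTokenizer.from_pretrained(model_name)
qeff_model.generate(tokenizer, prompts=["My name is"])
```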