Migrating HL compile and export to infer APIs #200

Closed
wants to merge 17 commits into from
29 changes: 6 additions & 23 deletions QEfficient/cloud/export.py
@@ -11,8 +11,8 @@

from transformers import PreTrainedTokenizer, PreTrainedTokenizerFast

from QEfficient.exporter.export_hf_to_cloud_ai_100 import qualcomm_efficient_converter
from QEfficient.utils import check_and_assign_cache_dir, onnx_exists
from QEfficient.transformers.models.modeling_auto import QEFFAutoModelForCausalLM
from QEfficient.utils import check_and_assign_cache_dir
from QEfficient.utils.logging_utils import logger

# Specifically for Docker images.
@@ -39,27 +39,10 @@ def get_onnx_model_path(
:local_model_dir (str): Path to custom model weights and config files. ``Defaults to None.``
:full_batch_size (int): Set full batch size to enable continuous batching mode. ``Defaults to None.``
"""
onnx_path_exists, onnx_dir_path, onnx_model_path = onnx_exists(model_name, full_batch_size)
if onnx_path_exists:
logger.info(f"Pre-exported ONNX files found at {onnx_dir_path}! Jumping to Compilation")
else:
###################
# hf model -> export
####################
# Export to the Onnx
logger.info(f"Exporting Pytorch {model_name} model to ONNX...")
_, onnx_model_path = qualcomm_efficient_converter(
model_name=model_name,
local_model_dir=local_model_dir,
tokenizer=tokenizer,
onnx_dir_path=onnx_dir_path,
kv=True,
form_factor="cloud",
hf_token=hf_token,
cache_dir=cache_dir,
full_batch_size=full_batch_size,
) # type: ignore
logger.info(f"Generated onnx_path: {onnx_model_path}, onnx_dir_path: {onnx_dir_path}")
logger.info(f"Exporting Pytorch {model_name} model to ONNX...")
qeff_model = QEFFAutoModelForCausalLM.from_pretrained(model_name)
onnx_model_path = qeff_model.export()
logger.info(f"Generated onnx_path: {onnx_model_path}")
return onnx_model_path


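For context on the export.py change above: the old onnx_exists/qualcomm_efficient_converter path is replaced by the high-level QEFFAutoModelForCausalLM API. A minimal sketch of the new flow, assuming a placeholder model name and only the calls shown in this diff:

```python
# Sketch of the new high-level export path, based on the calls shown in this diff.
# "gpt2" is a placeholder model name, not something this PR prescribes.
from QEfficient.transformers.models.modeling_auto import QEFFAutoModelForCausalLM

qeff_model = QEFFAutoModelForCausalLM.from_pretrained("gpt2")  # load HF weights into the QEff wrapper
onnx_model_path = qeff_model.export()                          # export to ONNX; returns the generated model path
print(onnx_model_path)
```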
111 changes: 56 additions & 55 deletions QEfficient/cloud/infer.py
@@ -7,13 +7,11 @@

import argparse
import logging
import os
import sys
from typing import List, Optional

import QEfficient
from QEfficient.cloud.export import get_onnx_model_path
from QEfficient.generation.text_generation_inference import cloud_ai_100_exec_kv
from QEfficient.utils import check_and_assign_cache_dir, get_qpc_dir_path, load_hf_tokenizer, qpc_exists
from QEfficient.transformers.models.modeling_auto import QEFFAutoModelForCausalLM
from QEfficient.utils import check_and_assign_cache_dir, load_hf_tokenizer
from QEfficient.utils.logging_utils import logger


@@ -38,6 +36,7 @@ def main(
allow_mxint8_mdp_io: bool = False,
enable_qnn: Optional[bool] = False,
qnn_config: Optional[str] = None,
**kwargs,
) -> None:
"""
1. Check if compiled qpc for given config already exists, if it does jump to execute, else
@@ -79,60 +78,44 @@
hf_token=hf_token,
)

qpc_dir_path = get_qpc_dir_path(
model_name,
num_cores,
mos,
batch_size,
prompt_len,
ctx_len,
mxfp6,
mxint8,
device_group,
full_batch_size,
if enable_qnn and qnn_config is not None:
logger.error("QNN compilation is currently not supported in High Level APIs of QEFFAutoModelForCausalLM.")

if "--mxfp6" in sys.argv:
if args.mxfp6:
logger.warning("mxfp6 is going to be deprecated in a future release, use -mxfp6_matmul instead.")
if "--mxint8" in sys.argv:
if args.mxint8:
logger.warning("mxint8 is going to be deprecated in a future release, use -mxint8_kv_cache instead.")

qeff_model = QEFFAutoModelForCausalLM.from_pretrained(model_name)

#########
# Compile
#########
_ = qeff_model.compile(
prefill_seq_len=prompt_len,
ctx_len=ctx_len,
num_cores=num_cores,
mxfp6_matmul=mxfp6,
aic_enable_depth_first=aic_enable_depth_first,
batch_size=batch_size,
mos=mos,
mxint8_kv_cache=mxint8,
num_devices = (0 if device_group is None else len(device_group)),
full_batch_size=full_batch_size,
allow_mxint8_mdp_io=allow_mxint8_mdp_io,
enable_qnn=enable_qnn,
**kwargs,
)

# Handle qpc generation
if qpc_exists(qpc_dir_path):
logger.info(f"Pre-compiled qpc found at {qpc_dir_path}! Executing with given prompt")
else:
# Handle onnx model generation
onnx_model_path = get_onnx_model_path(
model_name, cache_dir, tokenizer, hf_token, local_model_dir, full_batch_size
) # , base_dir_name)

#########
# Compile
#########
_ = QEfficient.compile(
onnx_path=onnx_model_path,
qpc_path=os.path.dirname(
qpc_dir_path
), # We need to pass parent directory of qpc_dir_path, as the compile function handles the qpcs directory creation
num_cores=num_cores,
batch_size=batch_size,
prompt_len=prompt_len,
ctx_len=ctx_len,
mxfp6=mxfp6,
mxint8=mxint8,
aic_enable_depth_first=aic_enable_depth_first,
mos=mos,
device_group=device_group,
full_batch_size=full_batch_size,
allow_mxint8_mdp_io=allow_mxint8_mdp_io,
enable_qnn=enable_qnn,
qnn_config=qnn_config,
)

#########
# Execute
#########
cloud_ai_100_exec_kv(
tokenizer=tokenizer,
qpc_path=qpc_dir_path,
_ = qeff_model.generate(
tokenizer,
prompts=prompt,
device_id=device_group,
prompt=prompt,
prompts_txt_file_path=prompts_txt_file_path,
generation_len=generation_len,
)
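Summarizing the new main() body above as a standalone sketch. Argument values are illustrative, and the tokenizer helper is swapped for Hugging Face's AutoTokenizer to keep the snippet self-contained:

```python
from transformers import AutoTokenizer

from QEfficient.transformers.models.modeling_auto import QEFFAutoModelForCausalLM

model_name = "gpt2"  # illustrative; the script takes this from its --model-name argument
tokenizer = AutoTokenizer.from_pretrained(model_name)

qeff_model = QEFFAutoModelForCausalLM.from_pretrained(model_name)
qeff_model.compile(          # returns the qpc path; extra kwargs are forwarded as compiler options
    prefill_seq_len=32,
    ctx_len=128,
    num_cores=16,
    mxfp6_matmul=False,
    num_devices=1,
)
qeff_model.generate(tokenizer, prompts=["My name is"])  # execute on device with the compiled qpc
```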
@@ -162,10 +145,16 @@ def main(
)
parser.add_argument("--ctx-len", "--ctx_len", default=128, type=int, help="Context length for text generation.")
parser.add_argument(
"--mxfp6", action="store_true", help="Compress constant MatMul weights to MXFP6 E2M3, default is no compression"
"--mxfp6",
"--mxfp6_matmul",
"--mxfp6-matmul",
action="store_true",
help="Compress constant MatMul weights to MXFP6 E2M3, default is no compression",
)
parser.add_argument(
"--mxint8",
"--mxint8_kv_cache",
"--mxint8-kv-cache",
action="store_true",
help="Compress Present/Past KV to MXINT8 using CustomIO config, default is False",
)
@@ -237,8 +226,20 @@ def main(
type=str,
)

args = parser.parse_args()
args, compiler_options = parser.parse_known_args()
compiler_options_dict = {}
for i in range(0, len(compiler_options)):
if compiler_options[i].startswith("--"):
key = compiler_options[i].lstrip("-")
value = (
compiler_options[i + 1]
if i + 1 < len(compiler_options) and not compiler_options[i + 1].startswith("-")
else True
)
compiler_options_dict[key] = value

if args.verbose:
logger.setLevel(logging.INFO)
del args.verbose # type: ignore
main(**args.__dict__)

main(**args.__dict__, **compiler_options_dict)
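The parse_known_args() change above means any flags argparse does not recognize are folded into a dict and forwarded to compile() via **kwargs. A self-contained sketch of that folding logic; the flag names in the example are made up purely for illustration:

```python
# Mirrors the leftover-argument handling added to infer.py above.
def to_compiler_options(extra_args):
    options = {}
    for i in range(len(extra_args)):
        if extra_args[i].startswith("--"):
            key = extra_args[i].lstrip("-")
            # A flag followed by a non-flag token keeps that token as its value;
            # a bare flag becomes a boolean True.
            value = (
                extra_args[i + 1]
                if i + 1 < len(extra_args) and not extra_args[i + 1].startswith("-")
                else True
            )
            options[key] = value
    return options

# Hypothetical leftover flags, purely for illustration:
print(to_compiler_options(["--custom-io", "custom_io.yaml", "--enable-debug"]))
# -> {'custom-io': 'custom_io.yaml', 'enable-debug': True}
```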
1 change: 1 addition & 0 deletions QEfficient/transformers/models/modeling_auto.py
@@ -266,6 +266,7 @@ def compile(
:num_speculative_tokens (int, optional): Number of speculative tokens to take as input for Speculative Decoding Target Language Model.
:mos (int, optional): Effort level to reduce on-chip memory. Defaults to -1, meaning no effort. ``Defaults to -1``.
:aic_enable_depth_first (bool, optional): Enables DFS with default memory size. ``Defaults to False``.
:allow_mxint8_mdp_io (bool, optional): Allows MXINT8 compression of MDP IO traffic. ``Defaults to False.``

Returns:
:str: Path of the compiled ``qpc`` package.
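A hedged usage sketch of compile() with the newly documented flag; the model name and other values are illustrative, and remaining parameters keep their defaults:

```python
from QEfficient.transformers.models.modeling_auto import QEFFAutoModelForCausalLM

qeff_model = QEFFAutoModelForCausalLM.from_pretrained("gpt2")  # placeholder checkpoint
qpc_path = qeff_model.compile(
    num_cores=16,
    mxint8_kv_cache=True,      # compress Present/Past KV to MXINT8
    allow_mxint8_mdp_io=True,  # the flag this docstring line documents: MXINT8 compression of MDP IO traffic
)
print(qpc_path)  # compile() returns the path of the compiled qpc package
```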