v0.13.0
See https://github.com/quic/ai-hub-models/releases/v0.13.0 for changelog.

Signed-off-by: QAIHM Team <[email protected]>
qaihm-bot committed Sep 10, 2024
1 parent d661a75 commit 357d57d
Showing 128 changed files with 1,288 additions and 1,290 deletions.
4 changes: 4 additions & 0 deletions .gitignore
@@ -115,3 +115,7 @@ dmypy.json
qai_hub_models/**/README.md
# Hugging Face Model Cards (these are autogenerated)
qai_hub_models/**/HF_MODEL_CARD.md

# Allow following files
# TODO: #12151 wt shared llama binary support in export.py
!qai_hub_models/models/llama_v2_7b_chat_quantized/gen_ondevice_llama/README.md
2 changes: 1 addition & 1 deletion qai_hub_models/_version.py
@@ -2,4 +2,4 @@
# Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved.
# SPDX-License-Identifier: BSD-3-Clause
# ---------------------------------------------------------------------
__version__ = "0.12.2"
__version__ = "0.13.0"
18 changes: 15 additions & 3 deletions qai_hub_models/models/_shared/llama/app.py
@@ -36,12 +36,14 @@ def __init__(
num_past_key_val_heads: int,
model_split_map: Dict[int, Tuple[int, int]],
is_token_generator: bool = False,
is_bundled_kvcache: bool = True,
):
self.num_splits = num_splits
self.is_token_generator = is_token_generator
self.num_past_key_val_heads = num_past_key_val_heads
self.model_split_map = model_split_map
self.model_type = "TokenGenerator" if is_token_generator else "PromptProcessor"
self.is_bundled_kvcache = is_bundled_kvcache

def __call__(
self,
@@ -91,9 +93,10 @@ def forward_tg(
model = self.load_model_part(i)
print(f"Running {self.model_type} {i}/{self.num_splits}")
layer_start, layer_end = self.model_split_map[i]
num_of_key_vals = (
self.num_past_key_val_heads * 2 * (layer_end - layer_start)
num_past_key_val_head = (
1 if self.is_bundled_kvcache else self.num_past_key_val_heads
)
num_of_key_vals = num_past_key_val_head * 2 * (layer_end - layer_start)

end_past_key_offset = start_past_key_offset + num_of_key_vals
past_values = past_key_values[start_past_key_offset:end_past_key_offset]
@@ -157,12 +160,14 @@ def __init__(
num_past_key_val_heads: int,
model_split_map: Dict[int, Tuple[int, int]],
is_token_generator: bool = False,
is_bundled_kvcache: bool = True,
):
super().__init__(
len(hub_model_ids),
num_past_key_val_heads,
model_split_map,
is_token_generator=is_token_generator,
is_bundled_kvcache=is_bundled_kvcache,
)
self.models = []
for i, model_id in enumerate(hub_model_ids):
@@ -202,6 +207,7 @@ def __init__(
num_past_key_val_heads: int,
model_split_map: Dict[int, Tuple[int, int]],
is_token_generator: bool = False,
is_bundled_kvcache: bool = True,
):
self.models = models
self.num_splits = num_splits
@@ -211,6 +217,7 @@ def __init__(
num_past_key_val_heads=num_past_key_val_heads,
model_split_map=model_split_map,
is_token_generator=is_token_generator,
is_bundled_kvcache=is_bundled_kvcache,
)

def load_model_part(self, model_part: int):
@@ -262,7 +269,11 @@ def __init__(
self.num_past_key_val_heads = num_past_key_val_heads

def generate_output_prompt(
self, input_prompt: str, max_seq_len: int, max_output_tokens: int
self,
input_prompt: str,
max_seq_len: int,
max_output_tokens: int,
bundled_kvcache: bool = True,
):
input_prompt_processed = self.get_input_prompt_with_tags(
user_input_prompt=input_prompt
Expand Down Expand Up @@ -302,6 +313,7 @@ def generate_output_prompt(
output[1:],
past_key_start=0,
num_of_past_key_heads=self.num_past_key_val_heads,
bundled_kvcache=bundled_kvcache,
).values()
output_prompt = self.tokenizer.decode(output_token)
print()
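A minimal sketch of what the new `is_bundled_kvcache` flag changes in `forward_tg` above: it controls how many past key/value tensors are sliced out of `past_key_values` for each model split. The head count and split map below are illustrative, not taken from a specific checkpoint.

```python
# Illustrative numbers only; they do not come from an actual Llama split.
num_past_key_val_heads = 32
model_split_map = {1: (0, 8), 2: (8, 16)}  # split index -> (layer_start, layer_end)

def split_cache_size(split: int, is_bundled_kvcache: bool) -> int:
    layer_start, layer_end = model_split_map[split]
    # Bundled layout: one key tensor and one value tensor per layer.
    # Per-head layout: one key and one value tensor per attention head per layer.
    heads = 1 if is_bundled_kvcache else num_past_key_val_heads
    return heads * 2 * (layer_end - layer_start)

print(split_cache_size(1, True))   # 16 tensors consumed by this split
print(split_cache_size(1, False))  # 512 tensors consumed by this split
```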
6 changes: 6 additions & 0 deletions qai_hub_models/models/_shared/llama/demo.py
@@ -46,6 +46,7 @@ def llama_chat_demo(
default_prompt: str = DEFAULT_USER_PROMPT,
is_test: bool = False,
available_target_runtimes: List[TargetRuntime] = [TargetRuntime.QNN],
bundled_kvcache: bool = True,
):
"""
Shared Chat Demo App to generate output for provided input prompt
@@ -125,13 +126,15 @@
num_splits=num_splits,
num_past_key_val_heads=num_key_val_heads,
model_split_map=model_split_map,
is_bundled_kvcache=bundled_kvcache,
)
token_generator = LlamaModelPipeline(
model_cls.from_pretrained(),
num_splits=num_splits,
num_past_key_val_heads=num_key_val_heads,
model_split_map=model_split_map,
is_token_generator=True,
is_bundled_kvcache=bundled_kvcache,
)
else:
hub_model_ids = args.hub_model_id.split(",")
@@ -156,6 +159,7 @@
get_model_class=get_model_class,
num_past_key_val_heads=num_key_val_heads,
model_split_map=model_split_map,
is_bundled_kvcache=bundled_kvcache,
)
token_generator = OnDeviceLlamaModelPipeline(
hub_model_ids[num_splits:],
@@ -165,6 +169,7 @@
num_past_key_val_heads=num_key_val_heads,
model_split_map=model_split_map,
is_token_generator=True,
is_bundled_kvcache=bundled_kvcache,
)

has_model_access(hf_repo_name, hf_repo_url)
@@ -182,4 +187,5 @@ def llama_chat_demo(
args.prompt,
max_seq_len=args.prompt_processor_input_seq_len,
max_output_tokens=args.max_output_tokens,
bundled_kvcache=bundled_kvcache,
)
61 changes: 57 additions & 4 deletions qai_hub_models/models/_shared/llama/model.py
@@ -30,10 +30,24 @@ def get_hidden_layer_range_from_split(split_part: int, model_split_map: dict):


def get_past_key_names(
start: int = 0, end: int = 8, num_of_past_key_heads=32, suffix=""
start: int = 0,
end: int = 8,
num_of_past_key_heads: int = 32,
suffix: str = "",
bundled_kvcache: bool = True,
):
past_key_val_name = []

if bundled_kvcache:
# Key and Values are concatenated on batch dimension
for i in range(start, end):
past_key_val_name += [
f"past_key_{i}{suffix}",
f"past_value_{i}{suffix}",
]
return past_key_val_name

# Key and Values are separate for each head
for i in range(start, end):
cache_names = [
f"past_key_{i}_h{j}{suffix}" for j in range(num_of_past_key_heads)
@@ -118,11 +132,27 @@ def get_past_keyval_with_shift(
past_key_start: int,
num_of_past_key_heads: int = 32,
new_key_suffix: str = "",
bundled_kvcache: bool = True,
):
"""
Clip past key value to feed next iteration
"""
tg_inputs = {}
if bundled_kvcache:
# Key and Values are concatenated on batch dimension
for i in range(0, len(past_key_vals), 2):
l_num = i // 2
past_key_num = l_num + past_key_start
tg_inputs[f"past_key_{past_key_num}{new_key_suffix}"] = past_key_vals[i][
:, :, :, 1:
].detach()

tg_inputs[f"past_value_{past_key_num}{new_key_suffix}"] = past_key_vals[
i + 1
][:, :, 1:, :].detach()
return tg_inputs

# Key and Values separate for each head
total_key_val = num_of_past_key_heads * 2
for i in range(0, len(past_key_vals), total_key_val):
l_num = i // total_key_val
@@ -138,16 +168,34 @@
tg_inputs[f"past_value_{past_key_num}_h{j}{new_key_suffix}"] = val[
:, :, 1:, :
].detach()

return tg_inputs


def make_torch_compatible_past_key_values(
decode_layers, past_key_val_per_layer, *past_values_flattened
decode_layers: int,
past_key_val_per_layer: int,
bundled_kvcache: bool = True,
*past_values_flattened,
):
past_key_values = []
total_past_entries = len(past_values_flattened)

if bundled_kvcache:
# Key and Value are concatenated on batch dimension
if decode_layers * 2 != total_past_entries:
raise RuntimeError(
"Incorrect number of past key-values provided for model."
f"Expecting {decode_layers * 2}, got {total_past_entries}."
)

for i in range(0, total_past_entries, 2):
past_key_values.append(
(past_values_flattened[i], past_values_flattened[i + 1])
)
return tuple(past_key_values)

# Key and Value are separate for each head

# past values consists of
# 1. k decode/hidden layers
# 2. each decode layer has 2 entries: key and value
@@ -248,6 +296,7 @@ def get_output_names(
start: int = 0,
end: int = 8,
past_key_val_heads: int = 32,
bundled_kvcache: bool = True,
output_name: str = "",
) -> List[str]:
# Clipped hidden layers are named same as first part for all parts
@@ -256,7 +305,11 @@

output_list = [output_name if output_name else f"layers_{end - 1}_add_out_0"]
output_list += get_past_key_names(
start, end, num_of_past_key_heads=past_key_val_heads, suffix="_out"
start,
end,
num_of_past_key_heads=past_key_val_heads,
bundled_kvcache=bundled_kvcache,
suffix="_out",
)
return output_list

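To make the two cache layouts selected by `bundled_kvcache` concrete, here is a hedged sketch of the tensor naming in `get_past_key_names` and the one-position shift applied in `get_past_keyval_with_shift`. Layer, head, and shape values are illustrative, not taken from a particular model.

```python
import torch

def past_key_names(start: int, end: int, num_heads: int, bundled: bool, suffix: str = ""):
    # Mirrors the naming logic above: the bundled layout emits one key name and
    # one value name per layer; the per-head layout emits one per head per layer.
    names = []
    for i in range(start, end):
        if bundled:
            names += [f"past_key_{i}{suffix}", f"past_value_{i}{suffix}"]
        else:
            names += [f"past_key_{i}_h{j}{suffix}" for j in range(num_heads)]
            names += [f"past_value_{i}_h{j}{suffix}" for j in range(num_heads)]
    return names

print(past_key_names(0, 2, 4, bundled=True))
# ['past_key_0', 'past_value_0', 'past_key_1', 'past_value_1']
print(past_key_names(0, 1, 2, bundled=False))
# ['past_key_0_h0', 'past_key_0_h1', 'past_value_0_h0', 'past_value_0_h1']

# The shift drops one position along the sequence axis so the token generator can
# append a fresh entry; shapes here are assumed (batch, heads, head_dim, seq) for
# keys and (batch, heads, seq, head_dim) for values, as suggested by the slicing.
key = torch.zeros(1, 1, 64, 1023)
value = torch.zeros(1, 1, 1023, 64)
shifted_key = key[:, :, :, 1:].detach()      # -> (1, 1, 64, 1022)
shifted_value = value[:, :, 1:, :].detach()  # -> (1, 1, 1022, 64)
```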
11 changes: 1 addition & 10 deletions qai_hub_models/models/aotgan/export.py
@@ -172,18 +172,9 @@ def export_model(

# 5. Download the model asset to a local file
if not skip_downloading:
if target_runtime == TargetRuntime.QNN:
target_runtime_extension = "so"
elif target_runtime == TargetRuntime.TFLITE:
target_runtime_extension = "tflite"
elif target_runtime in {TargetRuntime.ONNX, TargetRuntime.PRECOMPILED_QNN_ONNX}:
target_runtime_extension = "onnx"

os.makedirs(output_path, exist_ok=True)
target_model: hub.Model = compile_job.get_target_model() # type: ignore
target_model.download(
str(output_path / f"{model_name}.{target_runtime_extension}")
)
target_model.download(str(output_path / model_name))

# 6. Summarize the results from profiling and inference
if not skip_summary and not skip_profiling:
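The export-script change above (repeated in the convnext_tiny variants below) drops the per-runtime file-extension branch and downloads under the bare model name. A hedged sketch of the before/after path construction; the runtime-to-suffix map mirrors the removed branch, and leaving the suffix to the Hub client is inferred from this change rather than documented here.

```python
from pathlib import Path

# Suffixes previously chosen inside export_model (removed in this release).
RUNTIME_EXTENSIONS = {"qnn": "so", "tflite": "tflite", "onnx": "onnx"}

def old_download_path(output_path: Path, model_name: str, runtime: str) -> Path:
    # v0.12.x behavior: the export script appended a runtime-specific suffix.
    return output_path / f"{model_name}.{RUNTIME_EXTENSIONS[runtime]}"

def new_download_path(output_path: Path, model_name: str) -> Path:
    # v0.13.0 behavior: an extension-less path is handed to hub.Model.download.
    return output_path / model_name

print(old_download_path(Path("build"), "aotgan", "tflite"))  # build/aotgan.tflite
print(new_download_path(Path("build"), "aotgan"))            # build/aotgan
```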
1 change: 0 additions & 1 deletion qai_hub_models/models/baichuan_7b_quantized/info.yaml
@@ -31,7 +31,6 @@ technical_details:
Token generator output: 1 output token + KVCache for next iteration
Decoding length: 1024 (1 output token + 1023 from KVCache)
Use: Initiate conversation with prompt-processor and then token generator for subsequent iterations.
QNN-SDK: "2.19"
applicable_scenarios:
- Dialogue
- Content Generation
1 change: 0 additions & 1 deletion qai_hub_models/models/controlnet_quantized/info.yaml
@@ -18,7 +18,6 @@ source_repo: https://github.com/lllyasviel/ControlNet
technical_details:
Input: Text prompt and input image as a reference
Conditioning Input: Canny-Edge
QNN-SDK: '2.19'
Text Encoder Number of parameters: 340M
UNet Number of parameters: 865M
VAE Decoder Number of parameters: 83M
11 changes: 1 addition & 10 deletions qai_hub_models/models/convnext_tiny/export.py
@@ -172,18 +172,9 @@ def export_model(

# 5. Download the model asset to a local file
if not skip_downloading:
if target_runtime == TargetRuntime.QNN:
target_runtime_extension = "so"
elif target_runtime == TargetRuntime.TFLITE:
target_runtime_extension = "tflite"
elif target_runtime in {TargetRuntime.ONNX, TargetRuntime.PRECOMPILED_QNN_ONNX}:
target_runtime_extension = "onnx"

os.makedirs(output_path, exist_ok=True)
target_model: hub.Model = compile_job.get_target_model() # type: ignore
target_model.download(
str(output_path / f"{model_name}.{target_runtime_extension}")
)
target_model.download(str(output_path / model_name))

# 6. Summarize the results from profiling and inference
if not skip_summary and not skip_profiling:
17 changes: 2 additions & 15 deletions qai_hub_models/models/convnext_tiny_w8a16_quantized/export.py
@@ -119,10 +119,6 @@ def export_model(
source_model = model.convert_to_hub_source_model(
target_runtime, output_path, input_spec
)
if target_runtime == TargetRuntime.TFLITE:
quant_calibration_data = None
else:
quant_calibration_data = model.get_calibration_data(target_runtime, input_spec)

# 2. Compile the model to an on-device asset
model_compile_options = model.get_hub_compile_options(
Expand All @@ -134,7 +130,7 @@ def export_model(
input_specs=input_spec,
device=hub_device,
name=model_name,
calibration_data=quant_calibration_data,
calibration_data=model.get_calibration_data(target_runtime),
options=model_compile_options,
)
compile_job = cast(hub.client.CompileJob, submitted_compile_job)
@@ -177,18 +173,9 @@

# 5. Download the model asset to a local file
if not skip_downloading:
if target_runtime == TargetRuntime.QNN:
target_runtime_extension = "so"
elif target_runtime == TargetRuntime.TFLITE:
target_runtime_extension = "tflite"
elif target_runtime in {TargetRuntime.ONNX, TargetRuntime.PRECOMPILED_QNN_ONNX}:
target_runtime_extension = "onnx"

os.makedirs(output_path, exist_ok=True)
target_model: hub.Model = compile_job.get_target_model() # type: ignore
target_model.download(
str(output_path / f"{model_name}.{target_runtime_extension}")
)
target_model.download(str(output_path / model_name))

# 6. Summarize the results from profiling and inference
if not skip_summary and not skip_profiling:
17 changes: 2 additions & 15 deletions qai_hub_models/models/convnext_tiny_w8a8_quantized/export.py
@@ -119,10 +119,6 @@ def export_model(
source_model = model.convert_to_hub_source_model(
target_runtime, output_path, input_spec
)
if target_runtime == TargetRuntime.TFLITE:
quant_calibration_data = None
else:
quant_calibration_data = model.get_calibration_data(target_runtime, input_spec)

# 2. Compile the model to an on-device asset
model_compile_options = model.get_hub_compile_options(
Expand All @@ -134,7 +130,7 @@ def export_model(
input_specs=input_spec,
device=hub_device,
name=model_name,
calibration_data=quant_calibration_data,
calibration_data=model.get_calibration_data(target_runtime),
options=model_compile_options,
)
compile_job = cast(hub.client.CompileJob, submitted_compile_job)
@@ -177,18 +173,9 @@

# 5. Download the model asset to a local file
if not skip_downloading:
if target_runtime == TargetRuntime.QNN:
target_runtime_extension = "so"
elif target_runtime == TargetRuntime.TFLITE:
target_runtime_extension = "tflite"
elif target_runtime in {TargetRuntime.ONNX, TargetRuntime.PRECOMPILED_QNN_ONNX}:
target_runtime_extension = "onnx"

os.makedirs(output_path, exist_ok=True)
target_model: hub.Model = compile_job.get_target_model() # type: ignore
target_model.download(
str(output_path / f"{model_name}.{target_runtime_extension}")
)
target_model.download(str(output_path / model_name))

# 6. Summarize the results from profiling and inference
if not skip_summary and not skip_profiling:
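The two quantized ConvNeXt exports also stop special-casing TFLite when gathering calibration data: every runtime now calls `get_calibration_data(target_runtime)` without an `input_spec` argument. A small sketch of the old and new call paths; `FakeQuantModel` and the `TargetRuntime` enum here are stand-ins, not classes imported from the repository.

```python
from enum import Enum

class TargetRuntime(Enum):  # stand-in for the repository's TargetRuntime enum
    TFLITE = "tflite"
    QNN = "qnn"

class FakeQuantModel:  # stand-in for a quantized model wrapper
    def get_calibration_data(self, target_runtime, input_spec=None):
        return {"image": [[0.0]]}  # placeholder calibration samples

def calibration_data_old(model, target_runtime, input_spec):
    # v0.12.x: TFLite compiled without calibration data; other runtimes used it.
    if target_runtime == TargetRuntime.TFLITE:
        return None
    return model.get_calibration_data(target_runtime, input_spec)

def calibration_data_new(model, target_runtime):
    # v0.13.0: the same call for every runtime, with no input_spec argument.
    return model.get_calibration_data(target_runtime)

model = FakeQuantModel()
print(calibration_data_old(model, TargetRuntime.TFLITE, None))  # None
print(calibration_data_new(model, TargetRuntime.TFLITE))        # {'image': [[0.0]]}
```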