v0.13.0
See https://github.com/quic/ai-hub-models/releases/v0.13.0 for changelog.

Signed-off-by: QAIHM Team <[email protected]>
qaihm-bot committed Sep 10, 2024
1 parent d661a75 commit 357d57d
Showing 128 changed files with 1,288 additions and 1,290 deletions.
4 changes: 4 additions & 0 deletions .gitignore
@@ -115,3 +115,7 @@ dmypy.json
qai_hub_models/**/README.md
# Hugging Face Model Cards (these are autogenerated)
qai_hub_models/**/HF_MODEL_CARD.md

# Allow following files
# TODO: #12151 wt shared llama binary support in export.py
!qai_hub_models/models/llama_v2_7b_chat_quantized/gen_ondevice_llama/README.md
2 changes: 1 addition & 1 deletion qai_hub_models/_version.py
@@ -2,4 +2,4 @@
# Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved.
# SPDX-License-Identifier: BSD-3-Clause
# ---------------------------------------------------------------------
__version__ = "0.12.2"
__version__ = "0.13.0"
18 changes: 15 additions & 3 deletions qai_hub_models/models/_shared/llama/app.py
@@ -36,12 +36,14 @@ def __init__(
num_past_key_val_heads: int,
model_split_map: Dict[int, Tuple[int, int]],
is_token_generator: bool = False,
is_bundled_kvcache: bool = True,
):
self.num_splits = num_splits
self.is_token_generator = is_token_generator
self.num_past_key_val_heads = num_past_key_val_heads
self.model_split_map = model_split_map
self.model_type = "TokenGenerator" if is_token_generator else "PromptProcessor"
self.is_bundled_kvcache = is_bundled_kvcache

def __call__(
self,
@@ -91,9 +93,10 @@ def forward_tg(
model = self.load_model_part(i)
print(f"Running {self.model_type} {i}/{self.num_splits}")
layer_start, layer_end = self.model_split_map[i]
num_of_key_vals = (
self.num_past_key_val_heads * 2 * (layer_end - layer_start)
num_past_key_val_head = (
1 if self.is_bundled_kvcache else self.num_past_key_val_heads
)
num_of_key_vals = num_past_key_val_head * 2 * (layer_end - layer_start)

end_past_key_offset = start_past_key_offset + num_of_key_vals
past_values = past_key_values[start_past_key_offset:end_past_key_offset]
@@ -157,12 +160,14 @@ def __init__(
num_past_key_val_heads: int,
model_split_map: Dict[int, Tuple[int, int]],
is_token_generator: bool = False,
is_bundled_kvcache: bool = True,
):
super().__init__(
len(hub_model_ids),
num_past_key_val_heads,
model_split_map,
is_token_generator=is_token_generator,
is_bundled_kvcache=is_bundled_kvcache,
)
self.models = []
for i, model_id in enumerate(hub_model_ids):
@@ -202,6 +207,7 @@ def __init__(
num_past_key_val_heads: int,
model_split_map: Dict[int, Tuple[int, int]],
is_token_generator: bool = False,
is_bundled_kvcache: bool = True,
):
self.models = models
self.num_splits = num_splits
@@ -211,6 +217,7 @@ def __init__(
num_past_key_val_heads=num_past_key_val_heads,
model_split_map=model_split_map,
is_token_generator=is_token_generator,
is_bundled_kvcache=is_bundled_kvcache,
)

def load_model_part(self, model_part: int):
@@ -262,7 +269,11 @@ def __init__(
self.num_past_key_val_heads = num_past_key_val_heads

def generate_output_prompt(
self, input_prompt: str, max_seq_len: int, max_output_tokens: int
self,
input_prompt: str,
max_seq_len: int,
max_output_tokens: int,
bundled_kvcache: bool = True,
):
input_prompt_processed = self.get_input_prompt_with_tags(
user_input_prompt=input_prompt
Expand Down Expand Up @@ -302,6 +313,7 @@ def generate_output_prompt(
output[1:],
past_key_start=0,
num_of_past_key_heads=self.num_past_key_val_heads,
bundled_kvcache=bundled_kvcache,
).values()
output_prompt = self.tokenizer.decode(output_token)
print()
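A minimal sketch of what the new `is_bundled_kvcache` flag changes in `forward_tg` above: it controls how many past key/value tensors are sliced out of `past_key_values` for each model split. The head count and split map below are illustrative, not taken from a specific checkpoint.

```python
# Illustrative numbers only; they do not come from an actual Llama split.
num_past_key_val_heads = 32
model_split_map = {1: (0, 8), 2: (8, 16)}  # split index -> (layer_start, layer_end)

def split_cache_size(split: int, is_bundled_kvcache: bool) -> int:
    layer_start, layer_end = model_split_map[split]
    # Bundled layout: one key tensor and one value tensor per layer.
    # Per-head layout: one key and one value tensor per attention head per layer.
    heads = 1 if is_bundled_kvcache else num_past_key_val_heads
    return heads * 2 * (layer_end - layer_start)

print(split_cache_size(1, True))   # 16 tensors consumed by this split
print(split_cache_size(1, False))  # 512 tensors consumed by this split
```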
6 changes: 6 additions & 0 deletions qai_hub_models/models/_shared/llama/demo.py
@@ -46,6 +46,7 @@ def llama_chat_demo(
default_prompt: str = DEFAULT_USER_PROMPT,
is_test: bool = False,
available_target_runtimes: List[TargetRuntime] = [TargetRuntime.QNN],
bundled_kvcache: bool = True,
):
"""
Shared Chat Demo App to generate output for provided input prompt
@@ -125,13 +126,15 @@
num_splits=num_splits,
num_past_key_val_heads=num_key_val_heads,
model_split_map=model_split_map,
is_bundled_kvcache=bundled_kvcache,
)
token_generator = LlamaModelPipeline(
model_cls.from_pretrained(),
num_splits=num_splits,
num_past_key_val_heads=num_key_val_heads,
model_split_map=model_split_map,
is_token_generator=True,
is_bundled_kvcache=bundled_kvcache,
)
else:
hub_model_ids = args.hub_model_id.split(",")
@@ -156,6 +159,7 @@
get_model_class=get_model_class,
num_past_key_val_heads=num_key_val_heads,
model_split_map=model_split_map,
is_bundled_kvcache=bundled_kvcache,
)
token_generator = OnDeviceLlamaModelPipeline(
hub_model_ids[num_splits:],
@@ -165,6 +169,7 @@
num_past_key_val_heads=num_key_val_heads,
model_split_map=model_split_map,
is_token_generator=True,
is_bundled_kvcache=bundled_kvcache,
)

has_model_access(hf_repo_name, hf_repo_url)
@@ -182,4 +187,5 @@ def llama_chat_demo(
args.prompt,
max_seq_len=args.prompt_processor_input_seq_len,
max_output_tokens=args.max_output_tokens,
bundled_kvcache=bundled_kvcache,
)
61 changes: 57 additions & 4 deletions qai_hub_models/models/_shared/llama/model.py
@@ -30,10 +30,24 @@ def get_hidden_layer_range_from_split(split_part: int, model_split_map: dict):


def get_past_key_names(
start: int = 0, end: int = 8, num_of_past_key_heads=32, suffix=""
start: int = 0,
end: int = 8,
num_of_past_key_heads: int = 32,
suffix: str = "",
bundled_kvcache: bool = True,
):
past_key_val_name = []

if bundled_kvcache:
# Key and Values are concatenated on batch dimension
for i in range(start, end):
past_key_val_name += [
f"past_key_{i}{suffix}",
f"past_value_{i}{suffix}",
]
return past_key_val_name

# Key and Values are separate for each head
for i in range(start, end):
cache_names = [
f"past_key_{i}_h{j}{suffix}" for j in range(num_of_past_key_heads)
@@ -118,11 +132,27 @@ def get_past_keyval_with_shift(
past_key_start: int,
num_of_past_key_heads: int = 32,
new_key_suffix: str = "",
bundled_kvcache: bool = True,
):
"""
Clip past key value to feed next iteration
"""
tg_inputs = {}
if bundled_kvcache:
# Key and Values are concatenated on batch dimension
for i in range(0, len(past_key_vals), 2):
l_num = i // 2
past_key_num = l_num + past_key_start
tg_inputs[f"past_key_{past_key_num}{new_key_suffix}"] = past_key_vals[i][
:, :, :, 1:
].detach()

tg_inputs[f"past_value_{past_key_num}{new_key_suffix}"] = past_key_vals[
i + 1
][:, :, 1:, :].detach()
return tg_inputs

# Key and Values separate for each head
total_key_val = num_of_past_key_heads * 2
for i in range(0, len(past_key_vals), total_key_val):
l_num = i // total_key_val
@@ -138,16 +168,34 @@
tg_inputs[f"past_value_{past_key_num}_h{j}{new_key_suffix}"] = val[
:, :, 1:, :
].detach()

return tg_inputs


def make_torch_compatible_past_key_values(
decode_layers, past_key_val_per_layer, *past_values_flattened
decode_layers: int,
past_key_val_per_layer: int,
bundled_kvcache: bool = True,
*past_values_flattened,
):
past_key_values = []
total_past_entries = len(past_values_flattened)

if bundled_kvcache:
# Key and Value are concatenated on batch dimension
if decode_layers * 2 != total_past_entries:
raise RuntimeError(
"Incorrect number of past key-values provided for model."
f"Expecting {decode_layers * 2}, got {total_past_entries}."
)

for i in range(0, total_past_entries, 2):
past_key_values.append(
(past_values_flattened[i], past_values_flattened[i + 1])
)
return tuple(past_key_values)

# Key and Value are separate for each head

# past values consists of
# 1. k decode/hidden layers
# 2. each decode layer has 2 entries: key and value
@@ -248,6 +296,7 @@ def get_output_names(
start: int = 0,
end: int = 8,
past_key_val_heads: int = 32,
bundled_kvcache: bool = True,
output_name: str = "",
) -> List[str]:
# Clipped hidden layers are named same as first part for all parts
@@ -256,7 +305,11 @@

output_list = [output_name if output_name else f"layers_{end - 1}_add_out_0"]
output_list += get_past_key_names(
start, end, num_of_past_key_heads=past_key_val_heads, suffix="_out"
start,
end,
num_of_past_key_heads=past_key_val_heads,
bundled_kvcache=bundled_kvcache,
suffix="_out",
)
return output_list

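To make the two cache layouts selected by `bundled_kvcache` concrete, here is a hedged sketch of the tensor naming in `get_past_key_names` and the one-position shift applied in `get_past_keyval_with_shift`. Layer, head, and shape values are illustrative, not taken from a particular model.

```python
import torch

def past_key_names(start: int, end: int, num_heads: int, bundled: bool, suffix: str = ""):
    # Mirrors the naming logic above: the bundled layout emits one key name and
    # one value name per layer; the per-head layout emits one per head per layer.
    names = []
    for i in range(start, end):
        if bundled:
            names += [f"past_key_{i}{suffix}", f"past_value_{i}{suffix}"]
        else:
            names += [f"past_key_{i}_h{j}{suffix}" for j in range(num_heads)]
            names += [f"past_value_{i}_h{j}{suffix}" for j in range(num_heads)]
    return names

print(past_key_names(0, 2, 4, bundled=True))
# ['past_key_0', 'past_value_0', 'past_key_1', 'past_value_1']
print(past_key_names(0, 1, 2, bundled=False))
# ['past_key_0_h0', 'past_key_0_h1', 'past_value_0_h0', 'past_value_0_h1']

# The shift drops one position along the sequence axis so the token generator can
# append a fresh entry; shapes here are assumed (batch, heads, head_dim, seq) for
# keys and (batch, heads, seq, head_dim) for values, as suggested by the slicing.
key = torch.zeros(1, 1, 64, 1023)
value = torch.zeros(1, 1, 1023, 64)
shifted_key = key[:, :, :, 1:].detach()      # -> (1, 1, 64, 1022)
shifted_value = value[:, :, 1:, :].detach()  # -> (1, 1, 1022, 64)
```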
11 changes: 1 addition & 10 deletions qai_hub_models/models/aotgan/export.py
@@ -172,18 +172,9 @@ def export_model(

# 5. Download the model asset to a local file
if not skip_downloading:
if target_runtime == TargetRuntime.QNN:
target_runtime_extension = "so"
elif target_runtime == TargetRuntime.TFLITE:
target_runtime_extension = "tflite"
elif target_runtime in {TargetRuntime.ONNX, TargetRuntime.PRECOMPILED_QNN_ONNX}:
target_runtime_extension = "onnx"

os.makedirs(output_path, exist_ok=True)
target_model: hub.Model = compile_job.get_target_model() # type: ignore
target_model.download(
str(output_path / f"{model_name}.{target_runtime_extension}")
)
target_model.download(str(output_path / model_name))

# 6. Summarize the results from profiling and inference
if not skip_summary and not skip_profiling:
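The export-script change above (repeated in the convnext_tiny variants below) drops the per-runtime file-extension branch and downloads under the bare model name. A hedged sketch of the before/after path construction; the runtime-to-suffix map mirrors the removed branch, and leaving the suffix to the Hub client is inferred from this change rather than documented here.

```python
from pathlib import Path

# Suffixes previously chosen inside export_model (removed in this release).
RUNTIME_EXTENSIONS = {"qnn": "so", "tflite": "tflite", "onnx": "onnx"}

def old_download_path(output_path: Path, model_name: str, runtime: str) -> Path:
    # v0.12.x behavior: the export script appended a runtime-specific suffix.
    return output_path / f"{model_name}.{RUNTIME_EXTENSIONS[runtime]}"

def new_download_path(output_path: Path, model_name: str) -> Path:
    # v0.13.0 behavior: an extension-less path is handed to hub.Model.download.
    return output_path / model_name

print(old_download_path(Path("build"), "aotgan", "tflite"))  # build/aotgan.tflite
print(new_download_path(Path("build"), "aotgan"))            # build/aotgan
```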
1 change: 0 additions & 1 deletion qai_hub_models/models/baichuan_7b_quantized/info.yaml
@@ -31,7 +31,6 @@ technical_details:
Token generator output: 1 output token + KVCache for next iteration
Decoding length: 1024 (1 output token + 1023 from KVCache)
Use: Initiate conversation with prompt-processor and then token generator for subsequent iterations.
QNN-SDK: "2.19"
applicable_scenarios:
- Dialogue
- Content Generation
1 change: 0 additions & 1 deletion qai_hub_models/models/controlnet_quantized/info.yaml
@@ -18,7 +18,6 @@ source_repo: https://github.com/lllyasviel/ControlNet
technical_details:
Input: Text prompt and input image as a reference
Conditioning Input: Canny-Edge
QNN-SDK: '2.19'
Text Encoder Number of parameters: 340M
UNet Number of parameters: 865M
VAE Decoder Number of parameters: 83M
11 changes: 1 addition & 10 deletions qai_hub_models/models/convnext_tiny/export.py
@@ -172,18 +172,9 @@ def export_model(

# 5. Download the model asset to a local file
if not skip_downloading:
if target_runtime == TargetRuntime.QNN:
target_runtime_extension = "so"
elif target_runtime == TargetRuntime.TFLITE:
target_runtime_extension = "tflite"
elif target_runtime in {TargetRuntime.ONNX, TargetRuntime.PRECOMPILED_QNN_ONNX}:
target_runtime_extension = "onnx"

os.makedirs(output_path, exist_ok=True)
target_model: hub.Model = compile_job.get_target_model() # type: ignore
target_model.download(
str(output_path / f"{model_name}.{target_runtime_extension}")
)
target_model.download(str(output_path / model_name))

# 6. Summarize the results from profiling and inference
if not skip_summary and not skip_profiling:
17 changes: 2 additions & 15 deletions qai_hub_models/models/convnext_tiny_w8a16_quantized/export.py
@@ -119,10 +119,6 @@ def export_model(
source_model = model.convert_to_hub_source_model(
target_runtime, output_path, input_spec
)
if target_runtime == TargetRuntime.TFLITE:
quant_calibration_data = None
else:
quant_calibration_data = model.get_calibration_data(target_runtime, input_spec)

# 2. Compile the model to an on-device asset
model_compile_options = model.get_hub_compile_options(
Expand All @@ -134,7 +130,7 @@ def export_model(
input_specs=input_spec,
device=hub_device,
name=model_name,
calibration_data=quant_calibration_data,
calibration_data=model.get_calibration_data(target_runtime),
options=model_compile_options,
)
compile_job = cast(hub.client.CompileJob, submitted_compile_job)
@@ -177,18 +173,9 @@

# 5. Download the model asset to a local file
if not skip_downloading:
if target_runtime == TargetRuntime.QNN:
target_runtime_extension = "so"
elif target_runtime == TargetRuntime.TFLITE:
target_runtime_extension = "tflite"
elif target_runtime in {TargetRuntime.ONNX, TargetRuntime.PRECOMPILED_QNN_ONNX}:
target_runtime_extension = "onnx"

os.makedirs(output_path, exist_ok=True)
target_model: hub.Model = compile_job.get_target_model() # type: ignore
target_model.download(
str(output_path / f"{model_name}.{target_runtime_extension}")
)
target_model.download(str(output_path / model_name))

# 6. Summarize the results from profiling and inference
if not skip_summary and not skip_profiling:
17 changes: 2 additions & 15 deletions qai_hub_models/models/convnext_tiny_w8a8_quantized/export.py
@@ -119,10 +119,6 @@ def export_model(
source_model = model.convert_to_hub_source_model(
target_runtime, output_path, input_spec
)
if target_runtime == TargetRuntime.TFLITE:
quant_calibration_data = None
else:
quant_calibration_data = model.get_calibration_data(target_runtime, input_spec)

# 2. Compile the model to an on-device asset
model_compile_options = model.get_hub_compile_options(
Expand All @@ -134,7 +130,7 @@ def export_model(
input_specs=input_spec,
device=hub_device,
name=model_name,
calibration_data=quant_calibration_data,
calibration_data=model.get_calibration_data(target_runtime),
options=model_compile_options,
)
compile_job = cast(hub.client.CompileJob, submitted_compile_job)
@@ -177,18 +173,9 @@

# 5. Download the model asset to a local file
if not skip_downloading:
if target_runtime == TargetRuntime.QNN:
target_runtime_extension = "so"
elif target_runtime == TargetRuntime.TFLITE:
target_runtime_extension = "tflite"
elif target_runtime in {TargetRuntime.ONNX, TargetRuntime.PRECOMPILED_QNN_ONNX}:
target_runtime_extension = "onnx"

os.makedirs(output_path, exist_ok=True)
target_model: hub.Model = compile_job.get_target_model() # type: ignore
target_model.download(
str(output_path / f"{model_name}.{target_runtime_extension}")
)
target_model.download(str(output_path / model_name))

# 6. Summarize the results from profiling and inference
if not skip_summary and not skip_profiling:
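The two quantized ConvNeXt exports also stop special-casing TFLite when gathering calibration data: every runtime now calls `get_calibration_data(target_runtime)` without an `input_spec` argument. A small sketch of the old and new call paths; `FakeQuantModel` and the `TargetRuntime` enum here are stand-ins, not classes imported from the repository.

```python
from enum import Enum

class TargetRuntime(Enum):  # stand-in for the repository's TargetRuntime enum
    TFLITE = "tflite"
    QNN = "qnn"

class FakeQuantModel:  # stand-in for a quantized model wrapper
    def get_calibration_data(self, target_runtime, input_spec=None):
        return {"image": [[0.0]]}  # placeholder calibration samples

def calibration_data_old(model, target_runtime, input_spec):
    # v0.12.x: TFLite compiled without calibration data; other runtimes used it.
    if target_runtime == TargetRuntime.TFLITE:
        return None
    return model.get_calibration_data(target_runtime, input_spec)

def calibration_data_new(model, target_runtime):
    # v0.13.0: the same call for every runtime, with no input_spec argument.
    return model.get_calibration_data(target_runtime)

model = FakeQuantModel()
print(calibration_data_old(model, TargetRuntime.TFLITE, None))  # None
print(calibration_data_new(model, TargetRuntime.TFLITE))        # {'image': [[0.0]]}
```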