v0.4.1
See https://github.com/quic/ai-hub-models/releases/v0.4.1 for changelog.

Signed-off-by: QAIHM Team <[email protected]>
qaihm-bot committed Apr 2, 2024
1 parent 953bd55 commit 8d239fa
Showing 124 changed files with 2,425 additions and 2,701 deletions.
1 change: 0 additions & 1 deletion README.md
@@ -349,7 +349,6 @@ Qualcomm® AI Hub Models is licensed under BSD-3. See the [LICENSE file](../LICE
| | | | |
| **Pose Estimation**
| [HRNetPose](https://aihub.qualcomm.com/models/hrnet_pose) | [qai_hub_models.models.hrnet_pose](qai_hub_models/models/hrnet_pose/README.md) | ✔️ | ✔️ | ✔️
| [HRNetPoseQuantized](https://aihub.qualcomm.com/models/hrnet_pose_quantized) | [qai_hub_models.models.hrnet_pose_quantized](qai_hub_models/models/hrnet_pose_quantized/README.md) | ✔️ | ✔️ | ✔️
| [LiteHRNet](https://aihub.qualcomm.com/models/litehrnet) | [qai_hub_models.models.litehrnet](qai_hub_models/models/litehrnet/README.md) | ✔️ | ✔️ | ✔️
| [MediaPipe-Pose-Estimation](https://aihub.qualcomm.com/models/mediapipe_pose) | [qai_hub_models.models.mediapipe_pose](qai_hub_models/models/mediapipe_pose/README.md) | ✔️ | ✔️ | ✔️
| [OpenPose](https://aihub.qualcomm.com/models/openpose) | [qai_hub_models.models.openpose](qai_hub_models/models/openpose/README.md) | ✔️ | ✔️ | ✔️
2 changes: 1 addition & 1 deletion qai_hub_models/_version.py
@@ -2,4 +2,4 @@
# Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved.
# SPDX-License-Identifier: BSD-3-Clause
# ---------------------------------------------------------------------
__version__ = "0.4.0"
__version__ = "0.4.1"
23 changes: 17 additions & 6 deletions qai_hub_models/models/_shared/whisper/app.py
@@ -34,15 +34,17 @@ def __init__(self, whisper: Whisper):
decoder = whisper.decoder.to("cpu")
encoder = whisper.encoder.to("cpu")
self.num_decoder_blocks = whisper.num_decoder_blocks
self.num_decoder_heads = whisper.num_decoder_heads
self.attention_dim = whisper.attention_dim
self.max_decode_len = whisper.max_decode_len

# Wraps torch Module so it takes np ndarray as input and outputs
if isinstance(encoder, torch.nn.Module):
self.encoder = TorchNumpyAdapter(encoder)
else:
self.encoder = encoder
if isinstance(decoder, torch.nn.Module):
self.decoder = TorchNumpyAdapter(decoder)
self.decoder = TorchNumpyAdapter(decoder.eval())
else:
self.decoder = decoder

@@ -67,18 +69,27 @@ def transcribe(self, mel_input: np.ndarray) -> str:
# coreml only takes float tensors
x = np.array([[TOKEN_SOT]])
decoded_tokens = [TOKEN_SOT]
cache_tensor = np.array([], dtype=np.float32).reshape(
(1, 0, self.attention_dim)
)
sample_len = self.max_decode_len # max # of tokens to sample
cache_tensor = np.zeros((1, sample_len, self.attention_dim)).astype(np.float32)
self_attn_cache = [cache_tensor] * 2 * self.num_decoder_blocks

sample_len = 224 # max # of tokens to sample
sum_logprobs = 0
for i in range(sample_len):
decoder_out = self.decoder(x, *cross_attn_cache, *self_attn_cache)
# Using i to index inside the decoder model hurts
# the model performance.
# index - used to get positional embedding correctly.
index = torch.zeros([1, 1], dtype=torch.int32)
index[0, 0] = i
# Use mask to get the k_cache updated with new key
mask = torch.zeros(1, sample_len, self.attention_dim, dtype=torch.bool)
mask[:, i, :] = 1
decoder_out = self.decoder(
x, index, mask, *cross_attn_cache, *self_attn_cache
)
# logit has shape (1, decoded_len, 51864)
logits = decoder_out[0]
self_attn_cache = decoder_out[1:] # type: ignore

# logit has shape (51864,)
logits = logits[0, -1] # consider only the last token

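The updated transcribe loop above trades the previous variable-length self-attention cache for a zero-initialized cache of fixed length `max_decode_len`, plus an `index` tensor (to select the positional embedding for the current step) and a boolean `mask` (to write the new key/value row into the cache). The following is a minimal sketch of that decode pattern, not the app's exact logic; `decoder`, `token_sot`, `token_eot`, and the greedy `argmax` step are stand-ins for the attributes and sampling used by `WhisperApp`:

```python
import numpy as np
import torch

def greedy_decode(decoder, cross_attn_cache, token_sot, token_eot,
                  attention_dim, num_decoder_blocks, sample_len):
    """Sketch of fixed-shape decoding with an index/mask KV cache."""
    x = np.array([[token_sot]])
    decoded = [token_sot]
    # Pre-allocate the self-attention cache at its maximum length so every
    # decode step sees identical tensor shapes.
    cache = np.zeros((1, sample_len, attention_dim), dtype=np.float32)
    self_attn_cache = [cache] * 2 * num_decoder_blocks

    for i in range(sample_len):
        # index picks the positional embedding for step i.
        index = torch.zeros([1, 1], dtype=torch.int32)
        index[0, 0] = i
        # mask writes the new key/value into row i of the cache.
        mask = torch.zeros(1, sample_len, attention_dim, dtype=torch.bool)
        mask[:, i, :] = 1
        out = decoder(x, index, mask, *cross_attn_cache, *self_attn_cache)
        logits, self_attn_cache = out[0], out[1:]
        next_token = int(logits[0, -1].argmax())  # simplified greedy sampling
        decoded.append(next_token)
        if next_token == token_eot:
            break
        x = np.array([[next_token]])
    return decoded
```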
88 changes: 61 additions & 27 deletions qai_hub_models/models/_shared/whisper/model.py
@@ -29,11 +29,14 @@ def __init__(
decoder: Callable[..., Tuple[torch.Tensor, Tuple[torch.Tensor, ...]]],
num_decoder_blocks: int,
attention_dim: int,
num_heads: int,
):
self.encoder = encoder
self.decoder = decoder
self.num_decoder_blocks = num_decoder_blocks
self.attention_dim = attention_dim
self.num_decoder_heads = num_heads
self.max_decode_len = MAX_DECODE_LEN

@classmethod
def from_pretrained(cls, model: str = "tiny.en"):
@@ -46,7 +49,8 @@ def from_source_model(cls, whisper_model: Any):
decoder = WhisperDecoderInf(whisper_model.decoder)
num_decoder_blocks = len(decoder.blocks)
attention_dim = decoder.attention_dim
return cls(encoder, decoder, num_decoder_blocks, attention_dim) # type: ignore
num_heads = decoder.num_heads
return cls(encoder, decoder, num_decoder_blocks, attention_dim, num_heads) # type: ignore


class WhisperEncoderInf(BaseModel):
@@ -120,13 +124,30 @@ def __init__(self, model: whisper.model.TextDecoder):
def attention_dim(self):
return self.blocks[0].attn_ln.weight.shape[0]

def forward(self, x: torch.Tensor, *kv_cache_args, **kv_cache_kwargs):
@property
def num_heads(self):
return self.blocks[0].attn.n_head

def forward(
self,
x: torch.Tensor,
index: torch.Tensor,
mask: torch.Tensor,
*kv_cache_args,
**kv_cache_kwargs,
):
"""
Args:
- x: torch.LongTensor, shape = (batch_size, <= n_ctx)
the text tokens
- index: torch.tensor, shape = (1, 1)
index to get the positional encoding for x.
- mask: torch.tensor, shape = (1, max_sample_length, attn_dim)
Mask helps create kv_cache while keeping the size consistent.
- kv_cache_args: Tuple of length 4 * num_decoder_blocks. Elements are:
b{i}_cross_attn_k: [1, 1500, attn_dim]
@@ -136,8 +157,8 @@ def forward(self, x: torch.Tensor, *kv_cache_args, **kv_cache_kwargs):
followed by
b{i}_self_attn_k: [1, decoded_len, attn_dim]
b{i}_self_attn_v: [1, decoded_len, attn_dim]
b{i}_self_attn_k: [1, max_sample_length, attn_dim]
b{i}_self_attn_v: [1, max_sample_length, attn_dim]
for i = 0, ..., num_blocks
@@ -147,8 +168,10 @@ def forward(self, x: torch.Tensor, *kv_cache_args, **kv_cache_kwargs):
- b0_self_attn_k, b0_self_attn_v, b1_self_attn_k, ...: Updated self attn cache.
2*num_decoder_blocks
"""

if not kv_cache_args:
kv_cache_args = list(kv_cache_kwargs.values())

assert isinstance(self.token_embedding, torch.nn.Module) # for mypy
assert isinstance(self.ln, torch.nn.Module) # for mypy
assert isinstance(self.positional_embedding, torch.nn.Parameter) # for mypy
@@ -163,16 +186,13 @@ def forward(self, x: torch.Tensor, *kv_cache_args, **kv_cache_kwargs):
block.cross_attn.value: kv_cache_args[i * 2 + 1],
}
)
offset = next(iter(kv_cache.values())).shape[1] if kv_cache else 0
x = (
self.token_embedding(x)
+ self.positional_embedding[offset : offset + x.shape[-1]]
)

x = self.token_embedding(x) + self.positional_embedding[index.long()]

# x shape: (1, 1, 384)
kv_cache_new = []
for block in self.blocks:
x, k_cache, v_cache = block(x, kv_cache=kv_cache)
x, k_cache, v_cache = block(x, index, mask, kv_cache=kv_cache)
kv_cache_new.append(k_cache.float())
kv_cache_new.append(v_cache.float())

@@ -188,33 +208,38 @@ def forward(self, x: torch.Tensor, *kv_cache_args, **kv_cache_kwargs):
return (logits,) + tuple(kv_cache_new)

@staticmethod
def get_input_spec(num_blocks: int, attention_dim: int) -> InputSpec:
def get_input_spec(
num_blocks: int, attention_dim: int, num_heads: int
) -> InputSpec:
"""
Returns the input specification (name -> (shape, type). This can be
used to submit profiling job on Qualcomm AI Hub.
"""
specs = dict(x=((1, 1), "int32"))
specs = dict(
x=((1, 1), "int32"),
index=((1, 1), "int32"),
mask=((1, MAX_DECODE_LEN, attention_dim), "int32"),
)
for i in range(num_blocks):
specs[f"b{i}_cross_attn_k"] = ((1, 1500, attention_dim), "float32")
specs[f"b{i}_cross_attn_v"] = ((1, 1500, attention_dim), "float32")

# Use mean length for profiling
mean_decode_len = MAX_DECODE_LEN // 2

for i in range(num_blocks):
specs[f"b{i}_self_attn_k"] = (
(1, mean_decode_len, attention_dim),
(1, MAX_DECODE_LEN, attention_dim),
"float32",
)
specs[f"b{i}_self_attn_v"] = (
(1, mean_decode_len, attention_dim),
(1, MAX_DECODE_LEN, attention_dim),
"float32",
)

return specs

def _get_input_spec_for_instance(self) -> InputSpec:
return self.__class__.get_input_spec(len(self.blocks), self.attention_dim)
return self.__class__.get_input_spec(
len(self.blocks), self.attention_dim, self.num_heads
)

@classmethod
def from_pretrained(cls):
@@ -250,6 +275,8 @@ def __init__(self, model: whisper.model.MultiHeadAttention, attn_type: str):
def forward(
self,
x: torch.Tensor,
index: torch.Tensor,
mask: torch.Tensor,
kv_cache: Dict[torch.nn.Module, torch.Tensor],
) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
"""
@@ -273,18 +300,19 @@ def forward(
assert isinstance(self.value, torch.nn.Module) # for mypy
assert isinstance(self.out, torch.nn.Module) # for mypy
q = self.query(x)

if self.attn_type == "self_attention":
k_cache = kv_cache[self.key]
v_cache = kv_cache[self.value]
k = self.key(x)
v = self.value(x)
k = torch.cat([k_cache, k], dim=1)
v = torch.cat([v_cache, v], dim=1)
k = torch.zeros(k_cache.shape)
v = torch.zeros(v_cache.shape)
k = mask * self.key(x) + k_cache
v = mask * self.value(x) + v_cache
new_index = torch.tensor([index[0, 0] + 1]).long()
wv = qkv_attention(q, k[:, :new_index], v[:, :new_index], self.n_head)
else: # cross_attention
k, v = kv_cache[self.key], kv_cache[self.value]
wv = qkv_attention(q, k, v, self.n_head)

wv = qkv_attention(q, k, v, self.n_head)
# Return updated kv cache
return self.out(wv), k.detach(), v.detach()

@@ -300,14 +328,16 @@ def qkv_attention(
Adapted from whisper.model.MultiHeadAttention.qkv_attention
"""
n_batch, n_ctx, n_state = q.shape

scale = (n_state // n_head) ** -0.25
q = q.view(*q.shape[:2], n_head, -1).permute(0, 2, 1, 3) * scale
k = k.view(*k.shape[:2], n_head, -1).permute(0, 2, 3, 1) * scale
v = v.view(*v.shape[:2], n_head, -1).permute(0, 2, 1, 3)

qk = q @ k
if mask is not None:
qk = qk + mask[:n_ctx, :n_ctx]
qk = qk + mask
# Use negative infinity to mask the zeros when doing the softmax.
qk = qk.float()

w = torch.nn.functional.softmax(qk, dim=-1).to(q.dtype)
@@ -334,6 +364,8 @@ def __init__(self, model: whisper.model.ResidualAttentionBlock):
def forward(
self,
x: torch.Tensor,
index: torch.Tensor,
mask: torch.Tensor,
kv_cache: Dict[torch.nn.Module, torch.Tensor],
):
"""
@@ -347,13 +379,15 @@ def forward(
assert isinstance(self.cross_attn, torch.nn.Module) # for mypy
assert isinstance(self.mlp, torch.nn.Module) # for mypy
assert isinstance(self.mlp_ln, torch.nn.Module) # for mypy
x_attn, k_cache, v_cache = self.attn(self.attn_ln(x), kv_cache=kv_cache)
x_attn, k_cache, v_cache = self.attn(
self.attn_ln(x), index=index, mask=mask, kv_cache=kv_cache
)
x = x + x_attn
if self.cross_attn:
# Ignore cross attn kv cache which is constant (pre-computed in
# `WhisperCrossAttnKVCacheTorch`)
x_cross_attn, _, _ = self.cross_attn(
self.cross_attn_ln(x), kv_cache=kv_cache
self.cross_attn_ln(x), index=index, mask=mask, kv_cache=kv_cache
)
x = x + x_cross_attn
x = x + self.mlp(self.mlp_ln(x))
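The key change in `MultiHeadAttentionWrapper` above is the self-attention branch: instead of concatenating the new key/value onto a growing cache, the new row is written into a pre-allocated cache through the boolean mask, and attention only reads the rows decoded so far. Below is a standalone sketch of that update, assuming plain `torch.nn.Linear` projections in place of the real Whisper modules; it mirrors the shapes described in the docstrings but is not the committed implementation:

```python
import torch

def masked_self_attention(q_proj, k_proj, v_proj, x, index, mask,
                          k_cache, v_cache, n_head):
    """Sketch of the fixed-shape self-attention cache update.

    x:                (1, 1, attn_dim) current-token activations
    k_cache, v_cache: (1, max_decode_len, attn_dim), zero-initialized
    mask:             (1, max_decode_len, attn_dim) bool, True only at row `index`
    """
    q = q_proj(x)
    # Write this step's key/value into row `index` of the cache; no concat,
    # so the cache shape never changes between decode steps.
    k = mask * k_proj(x) + k_cache
    v = mask * v_proj(x) + v_cache

    # Attend only over the rows filled so far.
    valid = int(index[0, 0]) + 1
    n_state = q.shape[-1]
    scale = (n_state // n_head) ** -0.25
    qh = q.view(*q.shape[:2], n_head, -1).permute(0, 2, 1, 3) * scale
    kh = k[:, :valid].view(1, valid, n_head, -1).permute(0, 2, 3, 1) * scale
    vh = v[:, :valid].view(1, valid, n_head, -1).permute(0, 2, 1, 3)
    w = torch.softmax((qh @ kh).float(), dim=-1).to(q.dtype)
    out = (w @ vh).permute(0, 2, 1, 3).flatten(start_dim=2)
    return out, k, v
```

Keeping the cache at a fixed `MAX_DECODE_LEN` is also what lets `get_input_spec` above declare static shapes for the self-attention inputs when submitting a profiling job on Qualcomm AI Hub.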
16 changes: 11 additions & 5 deletions qai_hub_models/models/_shared/whisper/test_utils.py
@@ -13,6 +13,7 @@
)
from qai_hub_models.models._shared.whisper.demo import TEST_AUDIO_PATH
from qai_hub_models.models._shared.whisper.model import (
MAX_DECODE_LEN,
MEL_FILTER_PATH,
Whisper,
WhisperDecoderInf,
@@ -49,11 +50,16 @@ def run_test_wrapper_numerics(whisper_version):
decoder = WhisperDecoderInf(model.decoder)

cross_attn_cache = encoder(mel_input)
cache_tensor = np.array([], dtype=np.float32).reshape((1, 0, decoder.attention_dim))
self_attn_cache = [torch.from_numpy(cache_tensor)] * 2 * decoder.num_blocks

decoder_out = decoder(tokens, *cross_attn_cache, *self_attn_cache)
logits = decoder_out[0].detach().numpy()
sample_len = MAX_DECODE_LEN
cache_tensor = np.zeros([1, sample_len, decoder.attention_dim]).astype(np.float32)
index = torch.zeros([1, 1], dtype=torch.int32)
index[0, 0] = 0
mask = torch.zeros(1, sample_len, decoder.attention_dim, dtype=torch.bool)
mask[:, 0, :] = 1
self_attn_cache = [cache_tensor] * 2 * decoder.num_blocks
with torch.no_grad():
decoder_out = decoder(tokens, index, mask, *cross_attn_cache, *self_attn_cache)
logits = decoder_out[0].detach().numpy()

np.testing.assert_allclose(logits_orig, logits)

33 changes: 32 additions & 1 deletion qai_hub_models/models/_shared/yolo/app.py
@@ -10,6 +10,7 @@
import torch
from PIL.Image import Image

from qai_hub_models.models._shared.yolo.utils import detect_postprocess
from qai_hub_models.utils.bounding_box_processing import batched_nms
from qai_hub_models.utils.draw import draw_box_from_xyxy
from qai_hub_models.utils.image_processing import app_to_net_image_inputs
@@ -38,6 +39,7 @@ def __init__(
],
nms_score_threshold: float = 0.45,
nms_iou_threshold: float = 0.7,
model_includes_postprocessing: bool = True,
):
"""
Initialize a YoloObjectDetectionApp application.
@@ -63,10 +65,14 @@ def __init__(
nms_iou_threshold
Intersection over Union threshold for non maximum suppression.
model_includes_postprocessing
Whether the model includes postprocessing steps beyond the detector.
"""
self.model = model
self.nms_score_threshold = nms_score_threshold
self.nms_iou_threshold = nms_iou_threshold
self.model_includes_postprocessing = model_includes_postprocessing

def check_image_size(self, pixel_values: torch.Tensor) -> None:
"""
@@ -120,7 +126,12 @@ class scores per batch multiplied by confidence: List element shape is [num_pred
self.check_image_size(NCHW_fp32_torch_frames)

# Run prediction
pred_boxes, pred_scores, pred_class_idx = self.model(NCHW_fp32_torch_frames)
if self.model_includes_postprocessing:
pred_boxes, pred_scores, pred_class_idx = self.model(NCHW_fp32_torch_frames)
else:
pred_boxes, pred_scores, pred_class_idx = self.pre_nms_postprocess(
self.model(NCHW_fp32_torch_frames)
)

# Non Maximum Suppression on each batch
pred_boxes, pred_scores, pred_class_idx = batched_nms(
@@ -148,3 +159,23 @@ class scores per batch multiplied by confidence: List element shape is [num_pred
)

return NHWC_int_numpy_frames

def pre_nms_postprocess(
self, prediction: torch.Tensor
) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
"""
Process the output of the YOLO detector for input to NMS.
Parameters:
prediction: torch.Tensor
The output of Yolo detection model. Tensor shape varies by model implementation.
Returns:
boxes: torch.Tensor
Bounding box locations. Shape is [batch, num preds, 4] where 4 == (x1, y1, x2, y2)
scores: torch.Tensor
class scores multiplied by confidence: Shape is [batch, num_preds]
class_idx: torch.tensor
Shape is [batch, num_preds] where the last dim is the index of the most probable class of the prediction.
"""
return detect_postprocess(prediction)
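With the new `model_includes_postprocessing` flag, `YoloObjectDetectionApp` can also wrap detectors that return raw output: when the flag is False, the app runs `pre_nms_postprocess` (a thin wrapper over `detect_postprocess`) before `batched_nms`. A hedged usage sketch follows; `load_raw_detector` is a hypothetical factory standing in for any YOLO-style `torch.nn.Module` exported without fused postprocessing:

```python
import torch

from qai_hub_models.models._shared.yolo.app import YoloObjectDetectionApp

def build_app(detector: torch.nn.Module,
              fused_postprocessing: bool) -> YoloObjectDetectionApp:
    # When fused_postprocessing is False, the app applies detect_postprocess
    # to the raw detector output before non-maximum suppression.
    return YoloObjectDetectionApp(
        detector,
        nms_score_threshold=0.45,
        nms_iou_threshold=0.7,
        model_includes_postprocessing=fused_postprocessing,
    )

# app = build_app(load_raw_detector(), fused_postprocessing=False)  # hypothetical factory
```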