v0.4.1
See https://github.com/quic/ai-hub-models/releases/v0.4.1 for changelog.

Signed-off-by: QAIHM Team <[email protected]>
qaihm-bot committed Apr 2, 2024
1 parent 953bd55 commit 8d239fa
Showing 124 changed files with 2,425 additions and 2,701 deletions.
1 change: 0 additions & 1 deletion README.md
@@ -349,7 +349,6 @@ Qualcomm® AI Hub Models is licensed under BSD-3. See the [LICENSE file](../LICE
| | | | |
| **Pose Estimation**
| [HRNetPose](https://aihub.qualcomm.com/models/hrnet_pose) | [qai_hub_models.models.hrnet_pose](qai_hub_models/models/hrnet_pose/README.md) | ✔️ | ✔️ | ✔️
| [HRNetPoseQuantized](https://aihub.qualcomm.com/models/hrnet_pose_quantized) | [qai_hub_models.models.hrnet_pose_quantized](qai_hub_models/models/hrnet_pose_quantized/README.md) | ✔️ | ✔️ | ✔️
| [LiteHRNet](https://aihub.qualcomm.com/models/litehrnet) | [qai_hub_models.models.litehrnet](qai_hub_models/models/litehrnet/README.md) | ✔️ | ✔️ | ✔️
| [MediaPipe-Pose-Estimation](https://aihub.qualcomm.com/models/mediapipe_pose) | [qai_hub_models.models.mediapipe_pose](qai_hub_models/models/mediapipe_pose/README.md) | ✔️ | ✔️ | ✔️
| [OpenPose](https://aihub.qualcomm.com/models/openpose) | [qai_hub_models.models.openpose](qai_hub_models/models/openpose/README.md) | ✔️ | ✔️ | ✔️
2 changes: 1 addition & 1 deletion qai_hub_models/_version.py
@@ -2,4 +2,4 @@
# Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved.
# SPDX-License-Identifier: BSD-3-Clause
# ---------------------------------------------------------------------
__version__ = "0.4.0"
__version__ = "0.4.1"
23 changes: 17 additions & 6 deletions qai_hub_models/models/_shared/whisper/app.py
@@ -34,15 +34,17 @@ def __init__(self, whisper: Whisper):
decoder = whisper.decoder.to("cpu")
encoder = whisper.encoder.to("cpu")
self.num_decoder_blocks = whisper.num_decoder_blocks
self.num_decoder_heads = whisper.num_decoder_heads
self.attention_dim = whisper.attention_dim
self.max_decode_len = whisper.max_decode_len

# Wraps torch Module so it takes np ndarray as input and outputs
if isinstance(encoder, torch.nn.Module):
self.encoder = TorchNumpyAdapter(encoder)
else:
self.encoder = encoder
if isinstance(decoder, torch.nn.Module):
self.decoder = TorchNumpyAdapter(decoder)
self.decoder = TorchNumpyAdapter(decoder.eval())
else:
self.decoder = decoder

@@ -67,18 +69,27 @@ def transcribe(self, mel_input: np.ndarray) -> str:
# coreml only takes float tensors
x = np.array([[TOKEN_SOT]])
decoded_tokens = [TOKEN_SOT]
cache_tensor = np.array([], dtype=np.float32).reshape(
(1, 0, self.attention_dim)
)
sample_len = self.max_decode_len # max # of tokens to sample
cache_tensor = np.zeros((1, sample_len, self.attention_dim)).astype(np.float32)
self_attn_cache = [cache_tensor] * 2 * self.num_decoder_blocks

sample_len = 224 # max # of tokens to sample
sum_logprobs = 0
for i in range(sample_len):
decoder_out = self.decoder(x, *cross_attn_cache, *self_attn_cache)
# Using i to index inside the decoder model hurts
# the model performance.
# index - used to get positional embedding correctly.
index = torch.zeros([1, 1], dtype=torch.int32)
index[0, 0] = i
# Use mask to get the k_cache updated with new key
mask = torch.zeros(1, sample_len, self.attention_dim, dtype=torch.bool)
mask[:, i, :] = 1
decoder_out = self.decoder(
x, index, mask, *cross_attn_cache, *self_attn_cache
)
# logit has shape (1, decoded_len, 51864)
logits = decoder_out[0]
self_attn_cache = decoder_out[1:] # type: ignore

# logit has shape (51864,)
logits = logits[0, -1] # consider only the last token

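The updated transcribe loop above trades the previous variable-length self-attention cache for a zero-initialized cache of fixed length `max_decode_len`, plus an `index` tensor (to select the positional embedding for the current step) and a boolean `mask` (to write the new key/value row into the cache). The following is a minimal sketch of that decode pattern, not the app's exact logic; `decoder`, `token_sot`, `token_eot`, and the greedy `argmax` step are stand-ins for the attributes and sampling used by `WhisperApp`:

```python
import numpy as np
import torch

def greedy_decode(decoder, cross_attn_cache, token_sot, token_eot,
                  attention_dim, num_decoder_blocks, sample_len):
    """Sketch of fixed-shape decoding with an index/mask KV cache."""
    x = np.array([[token_sot]])
    decoded = [token_sot]
    # Pre-allocate the self-attention cache at its maximum length so every
    # decode step sees identical tensor shapes.
    cache = np.zeros((1, sample_len, attention_dim), dtype=np.float32)
    self_attn_cache = [cache] * 2 * num_decoder_blocks

    for i in range(sample_len):
        # index picks the positional embedding for step i.
        index = torch.zeros([1, 1], dtype=torch.int32)
        index[0, 0] = i
        # mask writes the new key/value into row i of the cache.
        mask = torch.zeros(1, sample_len, attention_dim, dtype=torch.bool)
        mask[:, i, :] = 1
        out = decoder(x, index, mask, *cross_attn_cache, *self_attn_cache)
        logits, self_attn_cache = out[0], out[1:]
        next_token = int(logits[0, -1].argmax())  # simplified greedy sampling
        decoded.append(next_token)
        if next_token == token_eot:
            break
        x = np.array([[next_token]])
    return decoded
```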
88 changes: 61 additions & 27 deletions qai_hub_models/models/_shared/whisper/model.py
@@ -29,11 +29,14 @@ def __init__(
decoder: Callable[..., Tuple[torch.Tensor, Tuple[torch.Tensor, ...]]],
num_decoder_blocks: int,
attention_dim: int,
num_heads: int,
):
self.encoder = encoder
self.decoder = decoder
self.num_decoder_blocks = num_decoder_blocks
self.attention_dim = attention_dim
self.num_decoder_heads = num_heads
self.max_decode_len = MAX_DECODE_LEN

@classmethod
def from_pretrained(cls, model: str = "tiny.en"):
@@ -46,7 +49,8 @@ def from_source_model(cls, whisper_model: Any):
decoder = WhisperDecoderInf(whisper_model.decoder)
num_decoder_blocks = len(decoder.blocks)
attention_dim = decoder.attention_dim
return cls(encoder, decoder, num_decoder_blocks, attention_dim) # type: ignore
num_heads = decoder.num_heads
return cls(encoder, decoder, num_decoder_blocks, attention_dim, num_heads) # type: ignore


class WhisperEncoderInf(BaseModel):
@@ -120,13 +124,30 @@ def __init__(self, model: whisper.model.TextDecoder):
def attention_dim(self):
return self.blocks[0].attn_ln.weight.shape[0]

def forward(self, x: torch.Tensor, *kv_cache_args, **kv_cache_kwargs):
@property
def num_heads(self):
return self.blocks[0].attn.n_head

def forward(
self,
x: torch.Tensor,
index: torch.Tensor,
mask: torch.Tensor,
*kv_cache_args,
**kv_cache_kwargs,
):
"""
Args:
- x: torch.LongTensor, shape = (batch_size, <= n_ctx)
the text tokens
- index: torch.tensor, shape = (1, 1)
index to get the positional encoding for x.
- mask: torch.tensor, shape = (1, max_sample_length, attn_dim)
Mask helps create kv_cache while keeping the size consistent.
- kv_cache_args: Tuple of length 4 * num_decoder_blocks. Elements are:
b{i}_cross_attn_k: [1, 1500, attn_dim]
@@ -136,8 +157,8 @@ def forward(self, x: torch.Tensor, *kv_cache_args, **kv_cache_kwargs):
followed by
b{i}_self_attn_k: [1, decoded_len, attn_dim]
b{i}_self_attn_v: [1, decoded_len, attn_dim]
b{i}_self_attn_k: [1, max_sample_length, attn_dim]
b{i}_self_attn_v: [1, max_sample_length, attn_dim]
for i = 0, ..., num_blocks
@@ -147,8 +168,10 @@ def forward(self, x: torch.Tensor, *kv_cache_args, **kv_cache_kwargs):
- b0_self_attn_k, b0_self_attn_v, b1_self_attn_k, ...: Updated self attn cache.
2*num_decoder_blocks
"""

if not kv_cache_args:
kv_cache_args = list(kv_cache_kwargs.values())

assert isinstance(self.token_embedding, torch.nn.Module) # for mypy
assert isinstance(self.ln, torch.nn.Module) # for mypy
assert isinstance(self.positional_embedding, torch.nn.Parameter) # for mypy
@@ -163,16 +186,13 @@ def forward(self, x: torch.Tensor, *kv_cache_args, **kv_cache_kwargs):
block.cross_attn.value: kv_cache_args[i * 2 + 1],
}
)
offset = next(iter(kv_cache.values())).shape[1] if kv_cache else 0
x = (
self.token_embedding(x)
+ self.positional_embedding[offset : offset + x.shape[-1]]
)

x = self.token_embedding(x) + self.positional_embedding[index.long()]

# x shape: (1, 1, 384)
kv_cache_new = []
for block in self.blocks:
x, k_cache, v_cache = block(x, kv_cache=kv_cache)
x, k_cache, v_cache = block(x, index, mask, kv_cache=kv_cache)
kv_cache_new.append(k_cache.float())
kv_cache_new.append(v_cache.float())

@@ -188,33 +208,38 @@ def forward(self, x: torch.Tensor, *kv_cache_args, **kv_cache_kwargs):
return (logits,) + tuple(kv_cache_new)

@staticmethod
def get_input_spec(num_blocks: int, attention_dim: int) -> InputSpec:
def get_input_spec(
num_blocks: int, attention_dim: int, num_heads: int
) -> InputSpec:
"""
Returns the input specification (name -> (shape, type). This can be
used to submit profiling job on Qualcomm AI Hub.
"""
specs = dict(x=((1, 1), "int32"))
specs = dict(
x=((1, 1), "int32"),
index=((1, 1), "int32"),
mask=((1, MAX_DECODE_LEN, attention_dim), "int32"),
)
for i in range(num_blocks):
specs[f"b{i}_cross_attn_k"] = ((1, 1500, attention_dim), "float32")
specs[f"b{i}_cross_attn_v"] = ((1, 1500, attention_dim), "float32")

# Use mean length for profiling
mean_decode_len = MAX_DECODE_LEN // 2

for i in range(num_blocks):
specs[f"b{i}_self_attn_k"] = (
(1, mean_decode_len, attention_dim),
(1, MAX_DECODE_LEN, attention_dim),
"float32",
)
specs[f"b{i}_self_attn_v"] = (
(1, mean_decode_len, attention_dim),
(1, MAX_DECODE_LEN, attention_dim),
"float32",
)

return specs

def _get_input_spec_for_instance(self) -> InputSpec:
return self.__class__.get_input_spec(len(self.blocks), self.attention_dim)
return self.__class__.get_input_spec(
len(self.blocks), self.attention_dim, self.num_heads
)

@classmethod
def from_pretrained(cls):
@@ -250,6 +275,8 @@ def __init__(self, model: whisper.model.MultiHeadAttention, attn_type: str):
def forward(
self,
x: torch.Tensor,
index: torch.Tensor,
mask: torch.Tensor,
kv_cache: Dict[torch.nn.Module, torch.Tensor],
) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
"""
@@ -273,18 +300,19 @@ def forward(
assert isinstance(self.value, torch.nn.Module) # for mypy
assert isinstance(self.out, torch.nn.Module) # for mypy
q = self.query(x)

if self.attn_type == "self_attention":
k_cache = kv_cache[self.key]
v_cache = kv_cache[self.value]
k = self.key(x)
v = self.value(x)
k = torch.cat([k_cache, k], dim=1)
v = torch.cat([v_cache, v], dim=1)
k = torch.zeros(k_cache.shape)
v = torch.zeros(v_cache.shape)
k = mask * self.key(x) + k_cache
v = mask * self.value(x) + v_cache
new_index = torch.tensor([index[0, 0] + 1]).long()
wv = qkv_attention(q, k[:, :new_index], v[:, :new_index], self.n_head)
else: # cross_attention
k, v = kv_cache[self.key], kv_cache[self.value]
wv = qkv_attention(q, k, v, self.n_head)

wv = qkv_attention(q, k, v, self.n_head)
# Return updated kv cache
return self.out(wv), k.detach(), v.detach()

@@ -300,14 +328,16 @@ def qkv_attention(
Adapted from whisper.model.MultiHeadAttention.qkv_attention
"""
n_batch, n_ctx, n_state = q.shape

scale = (n_state // n_head) ** -0.25
q = q.view(*q.shape[:2], n_head, -1).permute(0, 2, 1, 3) * scale
k = k.view(*k.shape[:2], n_head, -1).permute(0, 2, 3, 1) * scale
v = v.view(*v.shape[:2], n_head, -1).permute(0, 2, 1, 3)

qk = q @ k
if mask is not None:
qk = qk + mask[:n_ctx, :n_ctx]
qk = qk + mask
# Use negative infinity to mask the zeros when doing the softmax.
qk = qk.float()

w = torch.nn.functional.softmax(qk, dim=-1).to(q.dtype)
@@ -334,6 +364,8 @@ def __init__(self, model: whisper.model.ResidualAttentionBlock):
def forward(
self,
x: torch.Tensor,
index: torch.Tensor,
mask: torch.Tensor,
kv_cache: Dict[torch.nn.Module, torch.Tensor],
):
"""
@@ -347,13 +379,15 @@ def forward(
assert isinstance(self.cross_attn, torch.nn.Module) # for mypy
assert isinstance(self.mlp, torch.nn.Module) # for mypy
assert isinstance(self.mlp_ln, torch.nn.Module) # for mypy
x_attn, k_cache, v_cache = self.attn(self.attn_ln(x), kv_cache=kv_cache)
x_attn, k_cache, v_cache = self.attn(
self.attn_ln(x), index=index, mask=mask, kv_cache=kv_cache
)
x = x + x_attn
if self.cross_attn:
# Ignore cross attn kv cache which is constant (pre-computed in
# `WhisperCrossAttnKVCacheTorch`)
x_cross_attn, _, _ = self.cross_attn(
self.cross_attn_ln(x), kv_cache=kv_cache
self.cross_attn_ln(x), index=index, mask=mask, kv_cache=kv_cache
)
x = x + x_cross_attn
x = x + self.mlp(self.mlp_ln(x))
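The key change in `MultiHeadAttentionWrapper` above is the self-attention branch: instead of concatenating the new key/value onto a growing cache, the new row is written into a pre-allocated cache through the boolean mask, and attention only reads the rows decoded so far. Below is a standalone sketch of that update, assuming plain `torch.nn.Linear` projections in place of the real Whisper modules; it mirrors the shapes described in the docstrings but is not the committed implementation:

```python
import torch

def masked_self_attention(q_proj, k_proj, v_proj, x, index, mask,
                          k_cache, v_cache, n_head):
    """Sketch of the fixed-shape self-attention cache update.

    x:                (1, 1, attn_dim) current-token activations
    k_cache, v_cache: (1, max_decode_len, attn_dim), zero-initialized
    mask:             (1, max_decode_len, attn_dim) bool, True only at row `index`
    """
    q = q_proj(x)
    # Write this step's key/value into row `index` of the cache; no concat,
    # so the cache shape never changes between decode steps.
    k = mask * k_proj(x) + k_cache
    v = mask * v_proj(x) + v_cache

    # Attend only over the rows filled so far.
    valid = int(index[0, 0]) + 1
    n_state = q.shape[-1]
    scale = (n_state // n_head) ** -0.25
    qh = q.view(*q.shape[:2], n_head, -1).permute(0, 2, 1, 3) * scale
    kh = k[:, :valid].view(1, valid, n_head, -1).permute(0, 2, 3, 1) * scale
    vh = v[:, :valid].view(1, valid, n_head, -1).permute(0, 2, 1, 3)
    w = torch.softmax((qh @ kh).float(), dim=-1).to(q.dtype)
    out = (w @ vh).permute(0, 2, 1, 3).flatten(start_dim=2)
    return out, k, v
```

Keeping the cache at a fixed `MAX_DECODE_LEN` is also what lets `get_input_spec` above declare static shapes for the self-attention inputs when submitting a profiling job on Qualcomm AI Hub.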
16 changes: 11 additions & 5 deletions qai_hub_models/models/_shared/whisper/test_utils.py
@@ -13,6 +13,7 @@
)
from qai_hub_models.models._shared.whisper.demo import TEST_AUDIO_PATH
from qai_hub_models.models._shared.whisper.model import (
MAX_DECODE_LEN,
MEL_FILTER_PATH,
Whisper,
WhisperDecoderInf,
@@ -49,11 +50,16 @@ def run_test_wrapper_numerics(whisper_version):
decoder = WhisperDecoderInf(model.decoder)

cross_attn_cache = encoder(mel_input)
cache_tensor = np.array([], dtype=np.float32).reshape((1, 0, decoder.attention_dim))
self_attn_cache = [torch.from_numpy(cache_tensor)] * 2 * decoder.num_blocks

decoder_out = decoder(tokens, *cross_attn_cache, *self_attn_cache)
logits = decoder_out[0].detach().numpy()
sample_len = MAX_DECODE_LEN
cache_tensor = np.zeros([1, sample_len, decoder.attention_dim]).astype(np.float32)
index = torch.zeros([1, 1], dtype=torch.int32)
index[0, 0] = 0
mask = torch.zeros(1, sample_len, decoder.attention_dim, dtype=torch.bool)
mask[:, 0, :] = 1
self_attn_cache = [cache_tensor] * 2 * decoder.num_blocks
with torch.no_grad():
decoder_out = decoder(tokens, index, mask, *cross_attn_cache, *self_attn_cache)
logits = decoder_out[0].detach().numpy()

np.testing.assert_allclose(logits_orig, logits)

33 changes: 32 additions & 1 deletion qai_hub_models/models/_shared/yolo/app.py
@@ -10,6 +10,7 @@
import torch
from PIL.Image import Image

from qai_hub_models.models._shared.yolo.utils import detect_postprocess
from qai_hub_models.utils.bounding_box_processing import batched_nms
from qai_hub_models.utils.draw import draw_box_from_xyxy
from qai_hub_models.utils.image_processing import app_to_net_image_inputs
@@ -38,6 +39,7 @@ def __init__(
],
nms_score_threshold: float = 0.45,
nms_iou_threshold: float = 0.7,
model_includes_postprocessing: bool = True,
):
"""
Initialize a YoloObjectDetectionApp application.
@@ -63,10 +65,14 @@ def __init__(
nms_iou_threshold
Intersection over Union threshold for non maximum suppression.
model_includes_postprocessing
Whether the model includes postprocessing steps beyond the detector.
"""
self.model = model
self.nms_score_threshold = nms_score_threshold
self.nms_iou_threshold = nms_iou_threshold
self.model_includes_postprocessing = model_includes_postprocessing

def check_image_size(self, pixel_values: torch.Tensor) -> None:
"""
@@ -120,7 +126,12 @@ class scores per batch multiplied by confidence: List element shape is [num_pred
self.check_image_size(NCHW_fp32_torch_frames)

# Run prediction
pred_boxes, pred_scores, pred_class_idx = self.model(NCHW_fp32_torch_frames)
if self.model_includes_postprocessing:
pred_boxes, pred_scores, pred_class_idx = self.model(NCHW_fp32_torch_frames)
else:
pred_boxes, pred_scores, pred_class_idx = self.pre_nms_postprocess(
self.model(NCHW_fp32_torch_frames)
)

# Non Maximum Suppression on each batch
pred_boxes, pred_scores, pred_class_idx = batched_nms(
@@ -148,3 +159,23 @@ class scores per batch multiplied by confidence: List element shape is [num_pred
)

return NHWC_int_numpy_frames

def pre_nms_postprocess(
self, prediction: torch.Tensor
) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
"""
Process the output of the YOLO detector for input to NMS.
Parameters:
prediction: torch.Tensor
The output of Yolo detection model. Tensor shape varies by model implementation.
Returns:
boxes: torch.Tensor
Bounding box locations. Shape is [batch, num preds, 4] where 4 == (x1, y1, x2, y2)
scores: torch.Tensor
class scores multiplied by confidence: Shape is [batch, num_preds]
class_idx: torch.tensor
Shape is [batch, num_preds] where the last dim is the index of the most probable class of the prediction.
"""
return detect_postprocess(prediction)
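With the new `model_includes_postprocessing` flag, `YoloObjectDetectionApp` can also wrap detectors that return raw output: when the flag is False, the app runs `pre_nms_postprocess` (a thin wrapper over `detect_postprocess`) before `batched_nms`. A hedged usage sketch follows; `load_raw_detector` is a hypothetical factory standing in for any YOLO-style `torch.nn.Module` exported without fused postprocessing:

```python
import torch

from qai_hub_models.models._shared.yolo.app import YoloObjectDetectionApp

def build_app(detector: torch.nn.Module,
              fused_postprocessing: bool) -> YoloObjectDetectionApp:
    # When fused_postprocessing is False, the app applies detect_postprocess
    # to the raw detector output before non-maximum suppression.
    return YoloObjectDetectionApp(
        detector,
        nms_score_threshold=0.45,
        nms_iou_threshold=0.7,
        model_includes_postprocessing=fused_postprocessing,
    )

# app = build_app(load_raw_detector(), fused_postprocessing=False)  # hypothetical factory
```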