Commit: v0.15.0
See https://github.com/quic/ai-hub-models/releases/v0.15.0 for changelog.

Signed-off-by: QAIHM Team <[email protected]>
qaihm-bot committed Oct 10, 2024
1 parent 431126c commit 1c5de3b
Showing 146 changed files with 3,541 additions and 1,008 deletions.
2 changes: 2 additions & 0 deletions README.md
@@ -359,6 +359,7 @@ Qualcomm® AI Hub Models is licensed under BSD-3. See the [LICENSE file](../LICE
| [MediaPipe-Face-Detection](https://aihub.qualcomm.com/models/mediapipe_face) | [qai_hub_models.models.mediapipe_face](qai_hub_models/models/mediapipe_face/README.md) | ✔️ | ✔️ | ✔️
| [MediaPipe-Face-Detection-Quantized](https://aihub.qualcomm.com/models/mediapipe_face_quantized) | [qai_hub_models.models.mediapipe_face_quantized](qai_hub_models/models/mediapipe_face_quantized/README.md) | ✔️ | ✔️ | ✔️
| [MediaPipe-Hand-Detection](https://aihub.qualcomm.com/models/mediapipe_hand) | [qai_hub_models.models.mediapipe_hand](qai_hub_models/models/mediapipe_hand/README.md) | ✔️ | ✔️ | ✔️
| [YOLOv11-Detection](qai_hub_models/models/yolov11_det/README.md) | [qai_hub_models.models.yolov11_det](qai_hub_models/models/yolov11_det/README.md) | ✔️ | ✔️ | ✔️
| [YOLOv8-Detection](https://aihub.qualcomm.com/models/yolov8_det) | [qai_hub_models.models.yolov8_det](qai_hub_models/models/yolov8_det/README.md) | ✔️ | ✔️ | ✔️
| [YOLOv8-Detection-Quantized](https://aihub.qualcomm.com/models/yolov8_det_quantized) | [qai_hub_models.models.yolov8_det_quantized](qai_hub_models/models/yolov8_det_quantized/README.md) | ✔️ | ✔️ | ✔️
| [Yolo-NAS](https://aihub.qualcomm.com/models/yolonas) | [qai_hub_models.models.yolonas](qai_hub_models/models/yolonas/README.md) | ✔️ | ✔️ | ✔️
@@ -368,6 +369,7 @@ Qualcomm® AI Hub Models is licensed under BSD-3. See the [LICENSE file](../LICE
| [Yolo-v7-Quantized](https://aihub.qualcomm.com/models/yolov7_quantized) | [qai_hub_models.models.yolov7_quantized](qai_hub_models/models/yolov7_quantized/README.md) | ✔️ | ✔️ | ✔️
| | | | |
| **Pose Estimation**
| [FaceMap_3DMM](qai_hub_models/models/facemap_3dmm/README.md) | [qai_hub_models.models.facemap_3dmm](qai_hub_models/models/facemap_3dmm/README.md) | ✔️ | ✔️ | ✔️
| [HRNetPose](https://aihub.qualcomm.com/models/hrnet_pose) | [qai_hub_models.models.hrnet_pose](qai_hub_models/models/hrnet_pose/README.md) | ✔️ | ✔️ | ✔️
| [HRNetPoseQuantized](https://aihub.qualcomm.com/models/hrnet_pose_quantized) | [qai_hub_models.models.hrnet_pose_quantized](qai_hub_models/models/hrnet_pose_quantized/README.md) | ✔️ | ✔️ | ✔️
| [LiteHRNet](https://aihub.qualcomm.com/models/litehrnet) | [qai_hub_models.models.litehrnet](qai_hub_models/models/litehrnet/README.md) | ✔️ | ✔️ | ✔️
2 changes: 1 addition & 1 deletion qai_hub_models/_version.py
@@ -2,4 +2,4 @@
# Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved.
# SPDX-License-Identifier: BSD-3-Clause
# ---------------------------------------------------------------------
__version__ = "0.14.0"
__version__ = "0.15.0"
1 change: 1 addition & 0 deletions qai_hub_models/asset_bases.yaml
@@ -11,3 +11,4 @@ example_use: qai_hub_models/models/{model_id}#example--usage
huggingface_path: qualcomm/{model_name}
models_website_url: https://aihub.qualcomm.com
models_website_relative_path: models/{model_id}
email_template: qai_hub_models/scripts/templates/email_template.txt
195 changes: 147 additions & 48 deletions qai_hub_models/models/_shared/stable_diffusion/app.py
@@ -4,13 +4,14 @@
# ---------------------------------------------------------------------
from __future__ import annotations

from typing import Any, Tuple
from typing import Any, Dict, List, Tuple

import diffusers
import torch
from diffusers.models.embeddings import get_timestep_embedding
from transformers import CLIPTokenizer

from qai_hub_models.models.protocols import ExecutableModelProtocol
from qai_hub_models.utils.inference import OnDeviceModel

OUT_H, OUT_W = 512, 512
@@ -34,9 +35,9 @@ class StableDiffusionApp:

def __init__(
self,
text_encoder: OnDeviceModel | torch.nn.Module,
vae_decoder: OnDeviceModel | torch.nn.Module,
unet: OnDeviceModel | torch.nn.Module,
text_encoder: ExecutableModelProtocol,
vae_decoder: ExecutableModelProtocol,
unet: ExecutableModelProtocol,
tokenizer: CLIPTokenizer | Any,
scheduler: diffusers.DPMSolverMultistepScheduler,
time_embedding: diffusers.embeddings.TimeEmbedding,
@@ -68,26 +69,13 @@ def __init__(
"""

self.text_encoder = text_encoder
self.vae_decoder = vae_decoder
self.unet = unet
self.vae_decoder = vae_decoder
self.tokenizer = tokenizer
self.scheduler = scheduler
self.time_embedding = time_embedding
self.channel_last_latent = channel_last_latent

def get_time_embedding(self, timestep):
"""
Since these time embeddings aren't dependent on the prompt, they can be
pre-computed (for a pre-defined set of timesteps) in deployment, skipping
this computation. We include the computation in the demo for clarity.
"""
timestep = torch.tensor([timestep])
# TODO: pull 320 from UNet block output dim
t_emb = get_timestep_embedding(timestep, 320, True, 0)
emb = self.time_embedding(t_emb)

return emb

def _encode_text_prompt(self, prompt: str) -> Tuple[torch.Tensor, torch.Tensor]:
"""
Takes a text prompt and returns a tensor with its text embedding.
@@ -127,13 +115,20 @@ def _encode_text_prompt(self, prompt: str) -> Tuple[torch.Tensor, torch.Tensor]:
# Embed using the text encoder neural network
# Encode input and empty prompt in one go
print(f"\nExtracting embeddings (inference on TextEncoder)\n{'-' * 50}")
embeddings = self.text_encoder(
[
text_input.input_ids.type(torch.int32),
uncond_input.input_ids.type(torch.int32),
]
)
cond_embeddings, uncond_embeddings = torch.split(embeddings, 1, 0)
if isinstance(self.text_encoder, OnDeviceModel):
# Batch data into one inference job
embeddings = self.text_encoder(
[
text_input.input_ids.int(),
uncond_input.input_ids.int(),
]
)
cond_embeddings, uncond_embeddings = torch.split(embeddings, 1, 0)
else:
cond_embeddings = self.text_encoder(text_input.input_ids.type(torch.int32))
uncond_embeddings = self.text_encoder(
uncond_input.input_ids.type(torch.int32)
)
return cond_embeddings, uncond_embeddings

def predict(self, *args, **kwargs):
@@ -188,16 +183,110 @@ def generate_image(

# Encode text prompt
cond_embeddings, uncond_embeddings = self._encode_text_prompt(prompt)
self.scheduler.set_timesteps(num_steps)
self.scheduler.config.prediction_type = "epsilon"

latents = run_diffusion_steps_on_latents(
unet=self.unet,
scheduler=self.scheduler,
time_embedding=self.time_embedding,
cond_embeddings=cond_embeddings,
uncond_embeddings=uncond_embeddings,
num_steps=num_steps,
seed=seed,
guidance_scale=guidance_scale,
channel_last_latent=self.channel_last_latent,
)
image = self.vae_decoder(latents)
return image


def get_time_embedding(
time_embedding: diffusers.embeddings.TimeEmbedding, timestep: int
) -> torch.Tensor:
"""
Since these time embeddings aren't dependent on the prompt, they can be
pre-computed (for a pre-defined set of timesteps) in deployment, skipping
this computation. We include the computation in the demo for clarity.
"""
timestep = torch.tensor([timestep])
# TODO: pull 320 from UNet block output dim
t_emb = get_timestep_embedding(timestep, 320, True, 0)
emb = time_embedding(t_emb)

return emb


def run_diffusion_steps_on_latents(
unet: ExecutableModelProtocol,
scheduler: diffusers.DPMSolverMultistepScheduler,
time_embedding: diffusers.embeddings.TimeEmbedding,
cond_embeddings: torch.Tensor,
uncond_embeddings: torch.Tensor,
num_steps: int = 20,
seed: int = 0,
guidance_scale: float = 7.5,
channel_last_latent: bool = False,
return_all_steps: bool = False,
) -> torch.Tensor | Tuple[torch.Tensor, Dict[str, List[torch.Tensor]]]:
"""
Parameters
----------
cond_embeddings, uncond_embeddings:
Text embeddings for the prompt and for the empty (unconditional) prompt, as produced by the text encoder.
num_steps:
The number of steps to run the diffusion process for. Higher value
may lead to better image quality.
seed:
The seed to use for the random number generator.
guidance_scale:
Classifier-free guidance is a method that allows us to control how
strongly the image generation is guided by the prompt. This is done
by always processing two samples at once: an unconditional (using a
text embedding of an empty prompt) and a conditional (using a text
embedding of the provided prompt). Given the noise prediction of
both of these, we linearly interpolate between them based on the
guidance_scale. A guidance scale of 0 is the same as using an empty
prompt. A guidance scale of 1 turns off classifier-free guidance
and is computationally less expensive since it only processes one
sample at a time. Intuitively you may think the rest of guidance
scales are between 0 and 1, but it is common to use a scale greater
than 1 as a method of amplifying the prompt's influence on the
image, pushing it further away from the unconditional sample.
channel_last_latent:
True if unet outputs latent of shape like (1, 64, 64, 4). False
for (1, 4, 64, 64). channel_last_latent=False for Huggingface's
model.
return_all_steps:
True to return all intermediate latents (shape depending on
channel_last_latent) and time_emb.
Returns
-------
torch.Tensor
Final latent to be converted to RGB image by VAE decoder.
Dict[str, List[torch.Tensor]]
Intermediate UNet inputs, returned only when `return_all_steps` is True. Keys are ["latent", "time_emb"]; each list has length `num_steps`. This is useful for calibration.
Use cases
- Generate calibration data for the UNet (using the Hugging Face model); needs time_emb, etc.
- Run a partial diffusion step (e.g., only the last step).
- Full evaluation (quantsim, tetra inference job, etc.)
"""
with torch.no_grad():
scheduler.set_timesteps(num_steps)
scheduler.config.prediction_type = "epsilon"

# Channel last input
latents_shape = (1, 4, OUT_H // 8, OUT_W // 8)

generator = torch.manual_seed(seed)
latents = torch.randn(latents_shape, generator=generator)

latents = latents * self.scheduler.init_noise_sigma
latents = latents * scheduler.init_noise_sigma

# Helper method to go back and forth from channel-first to channel-last
def _make_channel_last_torch(input_tensor):
@@ -206,31 +295,41 @@ def _make_channel_last_torch(input_tensor):
def _make_channel_first_torch(input_tensor):
return torch.permute(torch.Tensor(input_tensor), [0, 3, 1, 2])

for i, t in enumerate(self.scheduler.timesteps):
# Export for calibration purpose
unet_inputs = dict(latent=[], time_emb=[])

for i, t in enumerate(scheduler.timesteps):
print(f"\nStep: {i + 1}\n{'-' * 10}")
time_emb = self.get_time_embedding(t)
latent_model_input = self.scheduler.scale_model_input(latents, t)
if self.channel_last_latent:
time_emb = get_time_embedding(time_embedding, t)
latent_model_input = scheduler.scale_model_input(latents, t)
if channel_last_latent:
latent_model_input = _make_channel_last_torch(latent_model_input)
unet_inputs["latent"].append(latent_model_input)
unet_inputs["time_emb"].append(time_emb)

print(f"\nDenoising image in latent space (inference on UNet)\n{'-' * 50}")
# Denoise image in latent space
noise = self.unet(
[latent_model_input, latent_model_input],
[time_emb, time_emb],
[cond_embeddings, uncond_embeddings],
)

noise_cond, noise_uncond = torch.split(noise, 1, 0)
if isinstance(unet, OnDeviceModel):
# Batch data into one inference job
noise = unet(
[latent_model_input, latent_model_input],
[time_emb, time_emb],
[cond_embeddings, uncond_embeddings],
)

noise_cond, noise_uncond = torch.split(noise, 1, 0)
else:
noise_cond = unet(latent_model_input, time_emb, cond_embeddings)
noise_uncond = unet(latent_model_input, time_emb, uncond_embeddings)
noise_pred = noise_uncond + guidance_scale * (noise_cond - noise_uncond)

if self.channel_last_latent:
if channel_last_latent:
noise_pred = _make_channel_first_torch(noise_pred)
latents = self.scheduler.step(noise_pred, t, latents).prev_sample
latents = scheduler.step(noise_pred, t, latents).prev_sample

print(f"\nDecoding generated image (inference on VAEDecoder)\n{'-' * 50}")
# Decode generated image from latent space
if self.channel_last_latent:
latents = _make_channel_last_torch(latents)
image = self.vae_decoder(latents)
return image
# Decode generated image from latent space
if channel_last_latent:
latents = _make_channel_last_torch(latents)

if return_all_steps:
return latents, unet_inputs
return latents
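
The refactor above moves the diffusion loop out of StableDiffusionApp into the module-level run_diffusion_steps_on_latents so it can also be used to collect per-step UNet inputs for calibration. The following is a rough usage sketch, not part of this commit: it assumes text_encoder, vae_decoder, unet, tokenizer, scheduler and time_embedding are the corresponding Stable Diffusion wrappers (torch modules or OnDeviceModel instances) used with this app, and the variable names and prompt are purely illustrative.

import torch

# Assumed: the model components are already constructed elsewhere.
app = StableDiffusionApp(
    text_encoder=text_encoder,
    vae_decoder=vae_decoder,
    unet=unet,
    tokenizer=tokenizer,
    scheduler=scheduler,
    time_embedding=time_embedding,
    channel_last_latent=False,  # Hugging Face latent layout (1, 4, 64, 64)
)

# Encode the prompt and the empty (unconditional) prompt.
cond, uncond = app._encode_text_prompt("a photo of an astronaut riding a horse")

# return_all_steps=True additionally returns the per-step UNet inputs
# ({"latent": [...], "time_emb": [...]}), e.g. for calibration data.
latents, unet_inputs = run_diffusion_steps_on_latents(
    unet=unet,
    scheduler=scheduler,
    time_embedding=time_embedding,
    cond_embeddings=cond,
    uncond_embeddings=uncond,
    num_steps=20,
    seed=0,
    guidance_scale=7.5,
    channel_last_latent=False,
    return_all_steps=True,
)

# Decode the final latent into an RGB image, as generate_image() does.
image = app.vae_decoder(latents)
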
60 changes: 60 additions & 0 deletions qai_hub_models/models/_shared/yolo/model.py
@@ -0,0 +1,60 @@
# ---------------------------------------------------------------------
# Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved.
# SPDX-License-Identifier: BSD-3-Clause
# ---------------------------------------------------------------------
import torch

from qai_hub_models.models._shared.yolo.utils import (
box_transform_xywh2xyxy_split_input,
transform_box_layout_xywh2xyxy,
)


def yolo_detect_postprocess(
boxes: torch.Tensor,
scores: torch.Tensor,
use_quantized_postprocessing: bool = False,
):
"""
Post-processing to break the detector output of newer Ultralytics YOLO models (e.g. YOLOv8, YOLOv11) into multiple consumable tensors (e.g. for NMS), such as bounding boxes, scores and classes.
Parameters:
boxes: torch.Tensor
Shape is [batch, 4, num_preds] where 4 == [x_center, y_center, w, h]
scores: torch.Tensor
Shape is [batch, num_classes, num_preds]
Each element represents the probability that a given box is
an instance of a given class.
use_quantized_postprocessing: bool
If post-processing a non-quantized model, need to split the bounding box
processing into multiple smaller tensors due to NPU limitations.
If quantized, the entire processing can be done on a single tensor.
Returns:
boxes: torch.Tensor
Bounding box locations. Shape is [batch, num preds, 4] where 4 == (x1, y1, x2, y2)
scores: torch.Tensor
class scores multiplied by confidence: Shape is [batch, num_preds]
class_idx: torch.tensor
Shape is [batch, num_preds] where the last dim is the index of the most probable class of the prediction.
"""
# Break output into parts
boxes = torch.permute(boxes, [0, 2, 1])
scores = torch.permute(scores, [0, 2, 1])

# Convert boxes to (x1, y1, x2, y2)
# Doing transform in fp16 requires special logic to keep on NPU
if use_quantized_postprocessing:
boxes = box_transform_xywh2xyxy_split_input(boxes[..., 0:2], boxes[..., 2:4])
else:
boxes = transform_box_layout_xywh2xyxy(boxes)

# Get class ID of most likely score.
scores, class_idx = torch.max(scores, -1, keepdim=False)

# Quantized model runtime doesn't like int32 outputs, so cast class idx to uint8.
# This is a no-op for coco models, but for datasets with >255 classes, this
# should be float32 for the unquantized model.
class_dtype = torch.uint8 if use_quantized_postprocessing else torch.float32
return boxes, scores, class_idx.to(class_dtype)
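
The docstring above fixes the expected shapes of the raw detector head outputs, so a small sanity check with random tensors illustrates the contract. This is an illustrative sketch only; the sizes below (8400 predictions, 80 COCO classes) are typical for a 640x640 YOLOv8/YOLOv11 head and are assumptions, not values taken from this commit.

import torch

from qai_hub_models.models._shared.yolo.model import yolo_detect_postprocess

batch, num_preds, num_classes = 1, 8400, 80  # assumed typical COCO head sizes
raw_boxes = torch.rand(batch, 4, num_preds)             # (x_center, y_center, w, h) per prediction
raw_scores = torch.rand(batch, num_classes, num_preds)  # per-class probabilities per prediction

boxes, scores, class_idx = yolo_detect_postprocess(
    raw_boxes, raw_scores, use_quantized_postprocessing=False
)

print(boxes.shape)      # torch.Size([1, 8400, 4]) -> (x1, y1, x2, y2)
print(scores.shape)     # torch.Size([1, 8400])    -> best class score per prediction
print(class_idx.dtype)  # torch.float32 on the unquantized path, torch.uint8 when quantized
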
2 changes: 1 addition & 1 deletion qai_hub_models/models/aotgan/export.py
@@ -195,7 +195,7 @@ def export_model(
)

if not skip_summary:
print_on_target_demo_cmd(compile_job, Path(__file__).parent.resolve(), device)
print_on_target_demo_cmd(compile_job, Path(__file__).parent, hub_device)

return (compile_job, profile_job, inference_job)
