diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 633a8882..cb4e51f1 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -37,6 +37,7 @@ repos: args: [--allow-multiple-documents] - id: trailing-whitespace exclude: '\.diff$' + args: [--markdown-linebreak-ext=md] - id: check-added-large-files args: ['--maxkb=1024'] - id: check-merge-conflict @@ -56,6 +57,7 @@ repos: rev: v0.9.0.6 hooks: - id: shellcheck + exclude: '\.yml$' - repo: https://github.com/pycqa/isort rev: 5.12.0 hooks: diff --git a/README.md b/README.md index eb316a71..716bd959 100644 --- a/README.md +++ b/README.md @@ -12,12 +12,21 @@ memory etc.) and ready to deploy on Qualcomm® devices. * Access the models through [Hugging Face](https://huggingface.co/qualcomm). * [Sign up](https://myaccount.qualcomm.com/signup) to run these models on hosted Qualcomm® devices. +Supported **python package host machine** Operating Systems: +- Linux (x86, ARM) +- Windows (x86) +- Windows (ARM-- ONLY via x86 Python, not ARM Python) +- MacOS (x86, ARM) + Supported runtimes * [TensorFlow Lite](https://www.tensorflow.org/lite) * [Qualcomm AI Engine Direct](https://www.qualcomm.com/developer/artificial-intelligence#overview) +* [ONNX](https://onnxruntime.ai/docs/execution-providers/QNN-ExecutionProvider.html) -Supported operating systems: -* Android 11+ +Models can be deployed on: +* Android +* Windows +* Linux Supported compute units * CPU, GPU, NPU (includes [Hexagon DSP](https://developer.qualcomm.com/software/hexagon-dsp-sdk/dsp-processor), [HTP](https://developer.qualcomm.com/hardware/qualcomm-innovators-development-kit/ai-resources-overview/ai-hardware-cores-accelerators)) @@ -28,12 +37,13 @@ Supported precision Supported chipsets * [Snapdragon 845](https://www.qualcomm.com/products/mobile/snapdragon/smartphones/snapdragon-8-series-mobile-platforms/snapdragon-845-mobile-platform), [Snapdragon 855/855+](https://www.qualcomm.com/products/mobile/snapdragon/smartphones/snapdragon-8-series-mobile-platforms/snapdragon-855-mobile-platform), [Snapdragon 865/865+](https://www.qualcomm.com/products/mobile/snapdragon/smartphones/snapdragon-8-series-mobile-platforms/snapdragon-865-plus-5g-mobile-platform), [Snapdragon 888/888+](https://www.qualcomm.com/products/mobile/snapdragon/smartphones/snapdragon-8-series-mobile-platforms/snapdragon-888-5g-mobile-platform) -* [Snapdragon 8 Gen 1](https://www.qualcomm.com/products/mobile/snapdragon/smartphones/snapdragon-8-series-mobile-platforms/snapdragon-8-gen-1-mobile-platform), [Snapdragon 8 Gen 2](https://www.qualcomm.com/products/mobile/snapdragon/smartphones/snapdragon-8-series-mobile-platforms/snapdragon-8-gen-2-mobile-platform), [Snapdragon 8 Gen 3](https://www.qualcomm.com/products/mobile/snapdragon/smartphones/snapdragon-8-series-mobile-platforms/snapdragon-8-gen-3-mobile-platform) +* [Snapdragon 8 Gen 1](https://www.qualcomm.com/products/mobile/snapdragon/smartphones/snapdragon-8-series-mobile-platforms/snapdragon-8-gen-1-mobile-platform), [Snapdragon 8 Gen 2](https://www.qualcomm.com/products/mobile/snapdragon/smartphones/snapdragon-8-series-mobile-platforms/snapdragon-8-gen-2-mobile-platform), [Snapdragon 8 Gen 3](https://www.qualcomm.com/products/mobile/snapdragon/smartphones/snapdragon-8-series-mobile-platforms/snapdragon-8-gen-3-mobile-platform), [Snapdragon X Elite](https://www.qualcomm.com/products/mobile/snapdragon/pcs-and-tablets/snapdragon-x-elite) Select supported devices * Samsung Galaxy S21 Series, Galaxy S22 Series, Galaxy S23 Series, Galaxy S24 Series * 
Xiaomi 12, 13 * Google Pixel 3, 4, 5 +* Snapdragon X Elite CRD (Compute Reference Device) and many more. @@ -261,6 +271,8 @@ Qualcomm® AI Hub Models is licensed under BSD-3. See the [LICENSE file](../LICE | | | | | | **Image Classification** | [ConvNext-Tiny](https://aihub.qualcomm.com/models/convnext_tiny) | [qai_hub_models.models.convnext_tiny](qai_hub_models/models/convnext_tiny/README.md) | ✔️ | ✔️ | ✔️ +| [ConvNext-Tiny-w8a16-Quantized](qai_hub_models/models/convnext_tiny_w8a16_quantized/README.md) | [qai_hub_models.models.convnext_tiny_w8a16_quantized](qai_hub_models/models/convnext_tiny_w8a16_quantized/README.md) | ✔️ | ✔️ | ✔️ +| [ConvNext-Tiny-w8a8-Quantized](qai_hub_models/models/convnext_tiny_w8a8_quantized/README.md) | [qai_hub_models.models.convnext_tiny_w8a8_quantized](qai_hub_models/models/convnext_tiny_w8a8_quantized/README.md) | ✔️ | ✔️ | ✔️ | [DenseNet-121](https://aihub.qualcomm.com/models/densenet121) | [qai_hub_models.models.densenet121](qai_hub_models/models/densenet121/README.md) | ✔️ | ✔️ | ✔️ | [EfficientNet-B0](https://aihub.qualcomm.com/models/efficientnet_b0) | [qai_hub_models.models.efficientnet_b0](qai_hub_models/models/efficientnet_b0/README.md) | ✔️ | ✔️ | ✔️ | [GoogLeNet](https://aihub.qualcomm.com/models/googlenet) | [qai_hub_models.models.googlenet](qai_hub_models/models/googlenet/README.md) | ✔️ | ✔️ | ✔️ @@ -321,7 +333,8 @@ Qualcomm® AI Hub Models is licensed under BSD-3. See the [LICENSE file](../LICE | [DeepLabV3-Plus-MobileNet](https://aihub.qualcomm.com/models/deeplabv3_plus_mobilenet) | [qai_hub_models.models.deeplabv3_plus_mobilenet](qai_hub_models/models/deeplabv3_plus_mobilenet/README.md) | ✔️ | ✔️ | ✔️ | [DeepLabV3-Plus-MobileNet-Quantized](https://aihub.qualcomm.com/models/deeplabv3_plus_mobilenet_quantized) | [qai_hub_models.models.deeplabv3_plus_mobilenet_quantized](qai_hub_models/models/deeplabv3_plus_mobilenet_quantized/README.md) | ✔️ | ✔️ | ✔️ | [DeepLabV3-ResNet50](https://aihub.qualcomm.com/models/deeplabv3_resnet50) | [qai_hub_models.models.deeplabv3_resnet50](qai_hub_models/models/deeplabv3_resnet50/README.md) | ✔️ | ✔️ | ✔️ -| [FCN_ResNet50](https://aihub.qualcomm.com/models/fcn_resnet50) | [qai_hub_models.models.fcn_resnet50](qai_hub_models/models/fcn_resnet50/README.md) | ✔️ | ✔️ | ✔️ +| [FCN-ResNet50](https://aihub.qualcomm.com/models/fcn_resnet50) | [qai_hub_models.models.fcn_resnet50](qai_hub_models/models/fcn_resnet50/README.md) | ✔️ | ✔️ | ✔️ +| [FCN-ResNet50-Quantized](https://aihub.qualcomm.com/models/fcn_resnet50_quantized) | [qai_hub_models.models.fcn_resnet50_quantized](qai_hub_models/models/fcn_resnet50_quantized/README.md) | ✔️ | ✔️ | ✔️ | [FFNet-122NS-LowRes](https://aihub.qualcomm.com/models/ffnet_122ns_lowres) | [qai_hub_models.models.ffnet_122ns_lowres](qai_hub_models/models/ffnet_122ns_lowres/README.md) | ✔️ | ✔️ | ✔️ | [FFNet-40S](https://aihub.qualcomm.com/models/ffnet_40s) | [qai_hub_models.models.ffnet_40s](qai_hub_models/models/ffnet_40s/README.md) | ✔️ | ✔️ | ✔️ | [FFNet-40S-Quantized](https://aihub.qualcomm.com/models/ffnet_40s_quantized) | [qai_hub_models.models.ffnet_40s_quantized](qai_hub_models/models/ffnet_40s_quantized/README.md) | ✔️ | ✔️ | ✔️ @@ -347,6 +360,8 @@ Qualcomm® AI Hub Models is licensed under BSD-3. 
See the [LICENSE file](../LICE | [MediaPipe-Hand-Detection](https://aihub.qualcomm.com/models/mediapipe_hand) | [qai_hub_models.models.mediapipe_hand](qai_hub_models/models/mediapipe_hand/README.md) | ✔️ | ✔️ | ✔️ | [YOLOv8-Detection](https://aihub.qualcomm.com/models/yolov8_det) | [qai_hub_models.models.yolov8_det](qai_hub_models/models/yolov8_det/README.md) | ✔️ | ✔️ | ✔️ | [YOLOv8-Detection-Quantized](https://aihub.qualcomm.com/models/yolov8_det_quantized) | [qai_hub_models.models.yolov8_det_quantized](qai_hub_models/models/yolov8_det_quantized/README.md) | ✔️ | ✔️ | ✔️ +| [Yolo-NAS](https://aihub.qualcomm.com/models/yolonas) | [qai_hub_models.models.yolonas](qai_hub_models/models/yolonas/README.md) | ✔️ | ✔️ | ✔️ +| [Yolo-NAS-Quantized](https://aihub.qualcomm.com/models/yolonas_quantized) | [qai_hub_models.models.yolonas_quantized](qai_hub_models/models/yolonas_quantized/README.md) | ✔️ | ✔️ | ✔️ | [Yolo-v6](https://aihub.qualcomm.com/models/yolov6) | [qai_hub_models.models.yolov6](qai_hub_models/models/yolov6/README.md) | ✔️ | ✔️ | ✔️ | [Yolo-v7](https://aihub.qualcomm.com/models/yolov7) | [qai_hub_models.models.yolov7](qai_hub_models/models/yolov7/README.md) | ✔️ | ✔️ | ✔️ | [Yolo-v7-Quantized](https://aihub.qualcomm.com/models/yolov7_quantized) | [qai_hub_models.models.yolov7_quantized](qai_hub_models/models/yolov7_quantized/README.md) | ✔️ | ✔️ | ✔️ @@ -356,6 +371,10 @@ Qualcomm® AI Hub Models is licensed under BSD-3. See the [LICENSE file](../LICE | [LiteHRNet](https://aihub.qualcomm.com/models/litehrnet) | [qai_hub_models.models.litehrnet](qai_hub_models/models/litehrnet/README.md) | ✔️ | ✔️ | ✔️ | [MediaPipe-Pose-Estimation](https://aihub.qualcomm.com/models/mediapipe_pose) | [qai_hub_models.models.mediapipe_pose](qai_hub_models/models/mediapipe_pose/README.md) | ✔️ | ✔️ | ✔️ | [OpenPose](https://aihub.qualcomm.com/models/openpose) | [qai_hub_models.models.openpose](qai_hub_models/models/openpose/README.md) | ✔️ | ✔️ | ✔️ +| [Posenet-Mobilenet](qai_hub_models/models/posenet_mobilenet/README.md) | [qai_hub_models.models.posenet_mobilenet](qai_hub_models/models/posenet_mobilenet/README.md) | ✔️ | ✔️ | ✔️ +| | | | | +| **Depth Estimation** +| [Midas-V2](qai_hub_models/models/midas/README.md) | [qai_hub_models.models.midas](qai_hub_models/models/midas/README.md) | ✔️ | ✔️ | ✔️ ### Audio @@ -386,7 +405,9 @@ Qualcomm® AI Hub Models is licensed under BSD-3. 
See the [LICENSE file](../LICE | | | | | | **Image Generation** | [ControlNet](https://aihub.qualcomm.com/models/controlnet_quantized) | [qai_hub_models.models.controlnet_quantized](qai_hub_models/models/controlnet_quantized/README.md) | ✔️ | ✔️ | ✔️ -| [Stable-Diffusion](https://aihub.qualcomm.com/models/stable_diffusion_quantized) | [qai_hub_models.models.stable_diffusion_quantized](qai_hub_models/models/stable_diffusion_quantized/README.md) | ✔️ | ✔️ | ✔️ +| [Riffusion](qai_hub_models/models/riffusion_quantized/README.md) | [qai_hub_models.models.riffusion_quantized](qai_hub_models/models/riffusion_quantized/README.md) | ✔️ | ✔️ | ✔️ +| [Stable-Diffusion-v1.5](https://aihub.qualcomm.com/models/stable_diffusion_v1_5_quantized) | [qai_hub_models.models.stable_diffusion_v1_5_quantized](qai_hub_models/models/stable_diffusion_v1_5_quantized/README.md) | ✔️ | ✔️ | ✔️ +| [Stable-Diffusion-v2.1](qai_hub_models/models/stable_diffusion_v2_1_quantized/README.md) | [qai_hub_models.models.stable_diffusion_v2_1_quantized](qai_hub_models/models/stable_diffusion_v2_1_quantized/README.md) | ✔️ | ✔️ | ✔️ | | | | | | **Text Generation** | [Baichuan-7B](https://aihub.qualcomm.com/models/baichuan_7b_quantized) | [qai_hub_models.models.baichuan_7b_quantized](qai_hub_models/models/baichuan_7b_quantized/README.md) | ✔️ | ✔️ | ✔️ diff --git a/apps/android/ImageClassification/README.md b/apps/android/ImageClassification/README.md index 29945ae2..e5916678 100644 --- a/apps/android/ImageClassification/README.md +++ b/apps/android/ImageClassification/README.md @@ -85,5 +85,5 @@ Also, you can use AI-HUB Model name as mentioned in models directory, to directl You can also select the model provided in the list menu during the execution of build_apk.py without specifying the model name and model path. ``` - python build_apk.py -q "" + python build_apk.py -q "" ``` diff --git a/qai_hub_models/_version.py b/qai_hub_models/_version.py index 21c6afdf..f9e86596 100644 --- a/qai_hub_models/_version.py +++ b/qai_hub_models/_version.py @@ -2,4 +2,4 @@ # Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. 
# SPDX-License-Identifier: BSD-3-Clause # --------------------------------------------------------------------- -__version__ = "0.5.1" +__version__ = "0.6.0" diff --git a/qai_hub_models/datasets/bsd300.py b/qai_hub_models/datasets/bsd300.py index a6d534b7..8a2c9bf5 100644 --- a/qai_hub_models/datasets/bsd300.py +++ b/qai_hub_models/datasets/bsd300.py @@ -32,20 +32,17 @@ class BSD300Dataset(BaseDataset): def __init__(self, scaling_factor=4): self.bsd_path = BSD300_ASSET.path(extracted=True) - self.images_path = os.path.join(self.bsd_path, "images/train") + self.images_path = self.bsd_path / "images" / "train" BaseDataset.__init__(self, self.bsd_path) self.scaling_factor = scaling_factor def _validate_data(self) -> bool: - images_path = os.path.join(self.dataset_path, "images/train") - # Check image path exists - if not os.path.exists(images_path): + if not self.images_path.exists(): return False # Ensure the correct number of images are there - files = os.listdir(images_path) - images = [f for f in files if ".jpg" in f] + images = [f for f in self.images_path.iterdir() if ".jpg" in f.name] if len(images) != DATASET_LENGTH: return False @@ -53,18 +50,18 @@ def _validate_data(self) -> bool: def _prepare_data(self): # Rename images to be more friendly to enumeration - directory = os.path.join(self.dataset_path, "images/train") - files = os.listdir(directory) - for i, filename in enumerate(files): - if filename.endswith(".jpg"): + # directory = os.path.join(self.dataset_path, "images/train") + # files = os.listdir(directory) + for i, filepath in enumerate(self.images_path.iterdir()): + if filepath.name.endswith(".jpg"): # Open the image and convert it to png try: - with Image.open(os.path.join(directory, filename)) as img: - img.save(os.path.join(directory, f"img_{i + 1:03d}_HR.jpg")) + with Image.open(filepath) as img: + img.save(self.images_path / f"img_{i + 1:03d}_HR.jpg") # delete the old image - os.remove(os.path.join(directory, filename)) + os.remove(filepath) except ValueError: - print(f"File {filename} does not exist!") + print(f"File {filepath} does not exist!") def __len__(self): return DATASET_LENGTH diff --git a/qai_hub_models/datasets/common.py b/qai_hub_models/datasets/common.py index ff2bf47b..4579cf6d 100644 --- a/qai_hub_models/datasets/common.py +++ b/qai_hub_models/datasets/common.py @@ -7,6 +7,7 @@ import os import shutil from abc import ABC, abstractmethod +from pathlib import Path from typing import final from torch.utils.data import Dataset @@ -17,17 +18,17 @@ class BaseDataset(Dataset, ABC): Base class to be extended by Datasets used in this repo for quantizing models. """ - def __init__(self, dataset_path: str): - self.dataset_path = dataset_path + def __init__(self, dataset_path: str | Path): + self.dataset_path = Path(dataset_path) self.download_data() @final def download_data(self) -> None: if self._validate_data(): return - if os.path.exists(self.dataset_path): + if self.dataset_path.exists(): # Data is corrupted, delete and re-download - if os.path.isdir(self.dataset_path): + if self.dataset_path.is_dir(): shutil.rmtree(self.dataset_path) else: os.remove(self.dataset_path) @@ -49,4 +50,4 @@ def _validate_data(self) -> bool: """ Validates data downloaded on disk. By default just checks that folder exists. 
""" - return os.path.exists(self.dataset_path) + return self.dataset_path.exists() diff --git a/qai_hub_models/datasets/imagenet.py b/qai_hub_models/datasets/imagenet.py new file mode 100644 index 00000000..96d8ff71 --- /dev/null +++ b/qai_hub_models/datasets/imagenet.py @@ -0,0 +1,94 @@ +# --------------------------------------------------------------------- +# Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. +# SPDX-License-Identifier: BSD-3-Clause +# --------------------------------------------------------------------- +import os +import subprocess + +from torchvision.datasets import ImageNet + +from qai_hub_models.datasets.common import BaseDataset +from qai_hub_models.utils.asset_loaders import CachedWebDatasetAsset +from qai_hub_models.utils.image_processing import IMAGENET_TRANSFORM + +IMAGENET_FOLDER_NAME = "imagenet" +IMAGENET_VERSION = 1 + +IMAGENET_ASSET = CachedWebDatasetAsset( + "https://image-net.org/data/ILSVRC/2012/ILSVRC2012_img_val.tar", + IMAGENET_FOLDER_NAME, + IMAGENET_VERSION, + "ILSVRC2012_img_val.tar", +) +DEVKIT_NAME = "ILSVRC2012_devkit_t12.tar.gz" +DEVKIT_ASSET = CachedWebDatasetAsset( + f"https://image-net.org/data/ILSVRC/2012/{DEVKIT_NAME}", + IMAGENET_FOLDER_NAME, + IMAGENET_VERSION, + DEVKIT_NAME, +) +VAL_PREP_ASSET = CachedWebDatasetAsset( + "https://raw.githubusercontent.com/soumith/imagenetloader.torch/master/valprep.sh", + IMAGENET_FOLDER_NAME, + IMAGENET_VERSION, + "valprep.sh", +) + + +class ImagenetDataset(BaseDataset, ImageNet): + """ + Wrapper class for using the Imagenet validation dataset: https://www.image-net.org/ + """ + + def __init__(self): + """ + A direct download link for the validation set is not available. + Users should download the validation dataset manually and pass the local filepath + as an argument here. After this is done once, it will be symlinked to an + internal location and doesn't need to be passed again. + + input_data_path: Local filepath to imagenet validation set. + """ + BaseDataset.__init__(self, IMAGENET_ASSET.path().parent) + ImageNet.__init__( + self, + root=self.dataset_path, + split="val", + transform=IMAGENET_TRANSFORM, + ) + + def _validate_data(self) -> bool: + val_path = self.dataset_path / "val" + if not (self.dataset_path / DEVKIT_NAME).exists(): + print("Missing Devkit.") + return False + + subdirs = [filepath for filepath in val_path.iterdir() if filepath.is_dir()] + if len(subdirs) != 1000: + print(f"Expected 1000 subdirectories but got {len(subdirs)}") + return False + + total_images = 0 + for subdir in subdirs: + total_images += len(list(subdir.iterdir())) + + if total_images != 50000: + print(f"Expected 50000 images but got {total_images}") + return False + return True + + def _download_data(self) -> None: + val_path = self.dataset_path / "val" + os.makedirs(val_path, exist_ok=True) + + IMAGENET_ASSET.fetch(extract=True) + DEVKIT_ASSET.fetch() + VAL_PREP_ASSET.fetch() + + os.rename(VAL_PREP_ASSET.path(), val_path / VAL_PREP_ASSET.path().name) + for filepath in self.dataset_path.iterdir(): + if filepath.name.endswith(".JPEG"): + os.rename(filepath, val_path / filepath.name) + + print("Moving images to appropriate class folder. 
This may take a few minutes.") + subprocess.call(f"sh {VAL_PREP_ASSET.path().name}", shell=True, cwd=val_path) diff --git a/qai_hub_models/datasets/imagenette.py b/qai_hub_models/datasets/imagenette.py index 31f4d24c..f9b92fa6 100644 --- a/qai_hub_models/datasets/imagenette.py +++ b/qai_hub_models/datasets/imagenette.py @@ -9,14 +9,16 @@ from qai_hub_models.datasets.common import BaseDataset from qai_hub_models.utils.asset_loaders import CachedWebDatasetAsset +from qai_hub_models.utils.image_processing import IMAGENET_TRANSFORM IMAGENETTE_FOLDER_NAME = "imagenette2-320" IMAGENETTE_VERSION = 1 +DEVKIT_NAME = "ILSVRC2012_devkit_t12.tar.gz" DEVKIT_ASSET = CachedWebDatasetAsset( - "https://image-net.org/data/ILSVRC/2012/ILSVRC2012_devkit_t12.tar.gz", + f"https://image-net.org/data/ILSVRC/2012/{DEVKIT_NAME}", IMAGENETTE_FOLDER_NAME, IMAGENETTE_VERSION, - "ILSVRC2012_devkit_t12.tar.gz", + DEVKIT_NAME, ) IMAGENETTE_ASSET = CachedWebDatasetAsset( "https://s3.amazonaws.com/fast-ai-imageclas/imagenette2-320.tgz", @@ -51,11 +53,6 @@ class ImagenetteDataset(BaseDataset, ImageNet): def __init__(self): BaseDataset.__init__(self, str(IMAGENETTE_ASSET.path(extracted=True))) - # Avoid circular import - from qai_hub_models.models._shared.imagenet_classifier.app import ( - IMAGENET_TRANSFORM, - ) - ImageNet.__init__( self, root=IMAGENETTE_ASSET.path(), @@ -77,18 +74,18 @@ def _validate_data(self) -> bool: return False # Check val data exists - val_data_path = os.path.join(self.dataset_path, "val") - if not os.path.exists(val_data_path): + val_data_path = self.dataset_path / "val" + if not val_data_path.exists(): return False # Ensure 10 classes - subdirs = os.listdir(val_data_path) + subdirs = list(val_data_path.iterdir()) if len(subdirs) != 10: return False # Ensure >= 300 samples per classes for subdir in subdirs: - if len(os.listdir(os.path.join(val_data_path, subdir))) < 300: + if len(list(subdir.iterdir())) < 300: return False return True @@ -97,6 +94,6 @@ def _download_data(self) -> None: devkit_path = DEVKIT_ASSET.fetch() devkit_st = os.stat(devkit_path) os.chmod(devkit_path, devkit_st.st_mode | stat.S_IEXEC) - target_path = IMAGENETTE_ASSET.path() / os.path.basename(DEVKIT_ASSET.path()) - if not os.path.exists(target_path): + target_path = IMAGENETTE_ASSET.path() / DEVKIT_NAME + if not target_path.exists(): os.symlink(DEVKIT_ASSET.path(), target_path) diff --git a/qai_hub_models/datasets/pascal_voc.py b/qai_hub_models/datasets/pascal_voc.py index a7f5b9ea..1da92aa9 100644 --- a/qai_hub_models/datasets/pascal_voc.py +++ b/qai_hub_models/datasets/pascal_voc.py @@ -3,7 +3,6 @@ # SPDX-License-Identifier: BSD-3-Clause # --------------------------------------------------------------------- -from pathlib import Path from typing import Tuple import numpy as np @@ -36,7 +35,7 @@ def __init__(self, split: str = "train", image_size: Tuple[int, int] = (224, 224 assert split in ["train", "val", "trainval"] self.split = split - base_path = Path(self.dataset_path) / "VOC2012" + base_path = self.dataset_path / "VOC2012" image_dir = base_path / "JPEGImages" category_dir = base_path / "SegmentationClass" splits_dir = base_path / "ImageSets" / "Segmentation" diff --git a/qai_hub_models/evaluators/image_evaluator.py b/qai_hub_models/evaluators/segmentation_evaluator.py similarity index 91% rename from qai_hub_models/evaluators/image_evaluator.py rename to qai_hub_models/evaluators/segmentation_evaluator.py index a5439a5d..4f2adfb8 100644 --- a/qai_hub_models/evaluators/image_evaluator.py +++ 
b/qai_hub_models/evaluators/segmentation_evaluator.py @@ -10,7 +10,7 @@ class SegmentationOutputEvaluator(BaseEvaluator): - """Evaluator for comparing a batched image output.""" + """Evaluator for comparing segmentation output against ground truth.""" def __init__(self, num_classes): self.num_classes = num_classes @@ -18,6 +18,7 @@ def __init__(self, num_classes): def add_batch(self, output: torch.Tensor, gt: torch.Tensor): # This evaluator supports only 1 output tensor at a time. + output = output.argmax(1).cpu() assert gt.shape == output.shape self.confusion_matrix += self._generate_matrix(gt, output) @@ -62,3 +63,6 @@ def _generate_matrix(self, gt_image, pre_image): count = torch.bincount(label, minlength=self.num_classes**2) confusion_matrix = count.reshape(self.num_classes, self.num_classes) return confusion_matrix + + def get_accuracy_score(self) -> float: + return self.Mean_Intersection_over_Union() diff --git a/qai_hub_models/global_requirements.txt b/qai_hub_models/global_requirements.txt index 15343116..567fddcb 100644 --- a/qai_hub_models/global_requirements.txt +++ b/qai_hub_models/global_requirements.txt @@ -4,18 +4,22 @@ # - Then install this requirements file # That should create an environment that works for every single model. +Deprecated==1.2.11 PySoundFile; sys_platform == 'win32' albumentations==0.5.2 av==10.0.0 basicsr==1.4.2 -click==8.0 +click==8.1.7 +data-gradients==0.3.1 datasets==2.14.5 diffusers[torch]==0.21.4 easydict==1.10 +einops==0.3.2 ffmpeg==1.4 ftfy==6.1.1 hydra-core==1.3.0 imageio[ffmpeg]==2.31.5 +imagesize==1.4.1 kornia==0.5.0 librosa==0.10.1 matplotlib==3.7.4 @@ -26,19 +30,23 @@ object-detection-metrics==0.4.post1 openai-whisper==20230314 pycocotools==2.0.7 pytorch-lightning==1.6.0 +rapidfuzz==3.8.1 regex==2023.12.25 scikit-image==0.21.0 scikit-learn==1.1.3 scipy==1.8.1 seaborn==0.11.0 sentencepiece==0.2.0 +shapely==2.0.3 soundfile==0.12.1 +stringcase==1.2.0 tflite==2.10.0 thop==0.1.1.post2209072238 timm==0.9.11 tensorboard==2.13.0 torchaudio==0.13.1 transformers==4.27.4 +treelib==1.6.1 tucker-conv==1.0.1 ultralytics==8.0.193 webdataset==0.2.86 diff --git a/qai_hub_models/models/_shared/cityscapes_segmentation/evaluator.py b/qai_hub_models/models/_shared/cityscapes_segmentation/evaluator.py index d1eb4a7c..e9c02522 100644 --- a/qai_hub_models/models/_shared/cityscapes_segmentation/evaluator.py +++ b/qai_hub_models/models/_shared/cityscapes_segmentation/evaluator.py @@ -5,7 +5,7 @@ import torch.nn.functional as F from torch import Tensor -from qai_hub_models.evaluators.image_evaluator import SegmentationOutputEvaluator +from qai_hub_models.evaluators.segmentation_evaluator import SegmentationOutputEvaluator class CityscapesSegmentationEvaluator(SegmentationOutputEvaluator): @@ -15,8 +15,4 @@ class CityscapesSegmentationEvaluator(SegmentationOutputEvaluator): def add_batch(self, output: Tensor, gt: Tensor): output_match_size = F.interpolate(output, gt.shape[1:3], mode="bilinear") - output_class = output_match_size.argmax(1).cpu() - return super().add_batch(output_class, gt) - - def get_accuracy_score(self) -> float: - return super().Mean_Intersection_over_Union() + return super().add_batch(output_match_size, gt) diff --git a/qai_hub_models/models/_shared/convnext_tiny_quantized/__init__.py b/qai_hub_models/models/_shared/convnext_tiny_quantized/__init__.py new file mode 100644 index 00000000..21a22b31 --- /dev/null +++ b/qai_hub_models/models/_shared/convnext_tiny_quantized/__init__.py @@ -0,0 +1,4 @@ +# 
--------------------------------------------------------------------- +# Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. +# SPDX-License-Identifier: BSD-3-Clause +# --------------------------------------------------------------------- diff --git a/qai_hub_models/models/_shared/convnext_tiny_quantized/model.py b/qai_hub_models/models/_shared/convnext_tiny_quantized/model.py new file mode 100644 index 00000000..c098f281 --- /dev/null +++ b/qai_hub_models/models/_shared/convnext_tiny_quantized/model.py @@ -0,0 +1,126 @@ +# --------------------------------------------------------------------- +# Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. +# SPDX-License-Identifier: BSD-3-Clause +# --------------------------------------------------------------------- +from __future__ import annotations + +# isort: off +# This verifies aimet is installed, and this must be included first. +from qai_hub_models.utils.quantization_aimet import ( + AIMETQuantizableMixin, +) + +# isort: on + +from abc import abstractmethod +from pathlib import Path + +import torch +import torch.nn as nn +from aimet_torch.cross_layer_equalization import equalize_model +from aimet_torch.model_preparer import prepare_model +from aimet_torch.quantsim import QuantizationSimModel, load_encodings_to_sim +from torchvision.models.convnext import LayerNorm2d as ConvNextLayerNorm2d +from torchvision.ops.misc import Permute + +from qai_hub_models.models._shared.common import replace_module_recursively +from qai_hub_models.models.convnext_tiny.model import DEFAULT_WEIGHTS, ConvNextTiny +from qai_hub_models.utils.aimet.config_loader import get_default_aimet_config +from qai_hub_models.utils.quantization_aimet import ( + constrain_quantized_inputs_to_image_range, +) + + +# The ConvNext LayerNorm uses a functional LayerNorm that is currently not +# automatically handled by AIMET (AIMET-3928). With this fix, the LayerNorms +# will not get quantization observers. +class AIMETLayerNorm2d(nn.Sequential): + def __init__(self, orig_layer_norm: ConvNextLayerNorm2d): + layer_norm = nn.LayerNorm( + orig_layer_norm.normalized_shape, + eps=orig_layer_norm.eps, + elementwise_affine=orig_layer_norm.elementwise_affine, + ) + layer_norm.bias = orig_layer_norm.bias + layer_norm.weight = orig_layer_norm.weight + super().__init__( + Permute([0, 2, 3, 1]), + layer_norm, + Permute([0, 3, 1, 2]), + ) + + +class ConvNextTinyQuantizableBase(AIMETQuantizableMixin, ConvNextTiny): + def __init__( + self, + quant_sim_model: QuantizationSimModel, + ) -> None: + # Input is already normalized by sim_model. Disable it in the wrapper model. + ConvNextTiny.__init__(self, quant_sim_model.model, normalize_input=False) + AIMETQuantizableMixin.__init__( + self, + quant_sim_model, + ) + + @classmethod + @abstractmethod + def _default_aimet_encodings(cls) -> str | Path: + """ + Default AIMET encodings path. + """ + ... + + @classmethod + @abstractmethod + def _output_bw(cls) -> int: + """ + Quantization bitwidth of activations. + """ + ... + + @classmethod + def from_pretrained( + cls, + weights: str = DEFAULT_WEIGHTS, + aimet_encodings: str | None = "DEFAULT", + ) -> "ConvNextTinyQuantizableBase": + """ + Parameters: + weights: + Weights of the model. See Torchvision ConvNext for information of + the format of this object. + aimet_encodings: + if "DEFAULT": Loads the model with aimet encodings calibrated on imagenette. + elif None: Doesn't load any encodings. Used when computing encodings. 
+ else: Interprets as a filepath and loads the encodings stored there. + """ + # Load Model + model = ConvNextTiny.from_pretrained(weights=weights) + + replace_module_recursively( + model, + ConvNextLayerNorm2d, + AIMETLayerNorm2d, + ) + + input_shape = cls.get_input_spec()["image_tensor"][0] + model = prepare_model(model) + equalize_model(model, input_shape) + + sim = QuantizationSimModel( + model, + quant_scheme="tf_enhanced", + default_param_bw=8, + default_output_bw=cls._output_bw(), + config_file=get_default_aimet_config(), + dummy_input=torch.rand(input_shape), + ) + constrain_quantized_inputs_to_image_range(sim) + + if aimet_encodings: + if aimet_encodings == "DEFAULT": + aimet_encodings = cls._default_aimet_encodings() + load_encodings_to_sim(sim, aimet_encodings) + + sim.model.eval() + return cls(sim) diff --git a/qai_hub_models/models/_shared/deeplab/evaluator.py b/qai_hub_models/models/_shared/deeplab/evaluator.py deleted file mode 100644 index 32a836c1..00000000 --- a/qai_hub_models/models/_shared/deeplab/evaluator.py +++ /dev/null @@ -1,24 +0,0 @@ -# --------------------------------------------------------------------- -# Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. -# SPDX-License-Identifier: BSD-3-Clause -# --------------------------------------------------------------------- -from torch import Tensor - -from qai_hub_models.evaluators.image_evaluator import SegmentationOutputEvaluator - - -class DeepLabV3Evaluator(SegmentationOutputEvaluator): - """ - Evaluates the output of DeepLabV3Plus - - Expected data format for this evaluator: - * output has the same shape & meaning as output of any deeplabV3 forward() function. - * gt is argmax'd on the first dimension (see add_batch). - """ - - def add_batch(self, output: Tensor, gt: Tensor): - output = output.argmax(1).cpu() - return super().add_batch(output, gt) - - def get_accuracy_score(self) -> float: - return super().Mean_Intersection_over_Union() diff --git a/qai_hub_models/models/_shared/deeplab/model.py b/qai_hub_models/models/_shared/deeplab/model.py index 75d45bbe..7b7d1351 100644 --- a/qai_hub_models/models/_shared/deeplab/model.py +++ b/qai_hub_models/models/_shared/deeplab/model.py @@ -5,7 +5,7 @@ import torch from qai_hub_models.evaluators.base_evaluators import BaseEvaluator -from qai_hub_models.models._shared.deeplab.evaluator import DeepLabV3Evaluator +from qai_hub_models.evaluators.segmentation_evaluator import SegmentationOutputEvaluator from qai_hub_models.utils.base_model import BaseModel from qai_hub_models.utils.image_processing import normalize_image_torchvision from qai_hub_models.utils.input_spec import InputSpec @@ -24,7 +24,7 @@ def __init__( self.normalize_input = normalize_input def get_evaluator(self) -> BaseEvaluator: - return DeepLabV3Evaluator(NUM_CLASSES) + return SegmentationOutputEvaluator(NUM_CLASSES) def forward(self, image): """ diff --git a/qai_hub_models/models/_shared/fastsam/demo.py b/qai_hub_models/models/_shared/fastsam/demo.py index 59281888..bd6544d1 100644 --- a/qai_hub_models/models/_shared/fastsam/demo.py +++ b/qai_hub_models/models/_shared/fastsam/demo.py @@ -5,7 +5,6 @@ from __future__ import annotations import os -import tempfile from typing import Type from PIL import Image @@ -17,7 +16,11 @@ get_on_device_demo_parser, validate_on_device_demo_args, ) -from qai_hub_models.utils.asset_loaders import CachedWebAsset, load_image +from qai_hub_models.utils.asset_loaders import ( + CachedWebAsset, + load_image, + qaihm_temp_dir, +) from 
qai_hub_models.utils.base_model import BaseModel from qai_hub_models.utils.display import display_or_save_image @@ -46,7 +49,7 @@ def fastsam_demo( image = load_image(args.image) - with tempfile.TemporaryDirectory() as tmpdir: + with qaihm_temp_dir() as tmpdir: image_path = os.path.join(tmpdir, "inp_image.jpg") image.save(image_path) pred, prompt_process = app.segment_image(image_path) diff --git a/qai_hub_models/models/_shared/imagenet_classifier/app.py b/qai_hub_models/models/_shared/imagenet_classifier/app.py index ac0424ad..bee0e780 100644 --- a/qai_hub_models/models/_shared/imagenet_classifier/app.py +++ b/qai_hub_models/models/_shared/imagenet_classifier/app.py @@ -6,18 +6,11 @@ import torch from PIL.Image import Image -from torchvision import transforms -from qai_hub_models.models._shared.imagenet_classifier.model import IMAGENET_DIM from qai_hub_models.models.protocols import ExecutableModelProtocol -from qai_hub_models.utils.image_processing import normalize_image_transform - -IMAGENET_TRANSFORM = transforms.Compose( - [ - transforms.Resize(256), - transforms.CenterCrop(IMAGENET_DIM), - transforms.ToTensor(), - ] +from qai_hub_models.utils.image_processing import ( + IMAGENET_TRANSFORM, + normalize_image_transform, ) diff --git a/qai_hub_models/models/_shared/imagenet_classifier/model.py b/qai_hub_models/models/_shared/imagenet_classifier/model.py index 3e0f904b..ac4e1b4d 100644 --- a/qai_hub_models/models/_shared/imagenet_classifier/model.py +++ b/qai_hub_models/models/_shared/imagenet_classifier/model.py @@ -12,13 +12,15 @@ from qai_hub_models.evaluators.base_evaluators import BaseEvaluator from qai_hub_models.evaluators.classification_evaluator import ClassificationEvaluator from qai_hub_models.utils.base_model import BaseModel -from qai_hub_models.utils.image_processing import normalize_image_torchvision +from qai_hub_models.utils.image_processing import ( + IMAGENET_DIM, + normalize_image_torchvision, +) from qai_hub_models.utils.input_spec import InputSpec from qai_hub_models.utils.quantization import get_image_quantization_samples MODEL_ASSET_VERSION = 1 MODEL_ID = __name__.split(".")[-2] -IMAGENET_DIM = 224 class ImagenetClassifier(BaseModel): diff --git a/qai_hub_models/models/_shared/mediapipe/app.py b/qai_hub_models/models/_shared/mediapipe/app.py index 05ffcb6d..a121d4f9 100644 --- a/qai_hub_models/models/_shared/mediapipe/app.py +++ b/qai_hub_models/models/_shared/mediapipe/app.py @@ -566,7 +566,7 @@ def _draw_box_and_roi( # Draw detector bounding box draw_box_from_xyxy(NHWC_int_numpy_frame, box[0], box[1], (255, 0, 0), 1) # Draw detector keypoints - draw_points(NHWC_int_numpy_frame, kp) + draw_points(NHWC_int_numpy_frame, kp, size=30) # Draw region of interest box computed from the detector box & keypoints # (this is the input to the landmark detector) draw_box_from_corners(NHWC_int_numpy_frame, roi, (0, 255, 0)) diff --git a/qai_hub_models/models/stable_diffusion_quantized/app.py b/qai_hub_models/models/_shared/stable_diffusion/app.py similarity index 78% rename from qai_hub_models/models/stable_diffusion_quantized/app.py rename to qai_hub_models/models/_shared/stable_diffusion/app.py index 48d33849..8246d90a 100644 --- a/qai_hub_models/models/stable_diffusion_quantized/app.py +++ b/qai_hub_models/models/_shared/stable_diffusion/app.py @@ -2,10 +2,16 @@ # Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. 
# SPDX-License-Identifier: BSD-3-Clause # --------------------------------------------------------------------- -from typing import Any, Callable, Tuple +from __future__ import annotations +from typing import Any, Tuple + +import diffusers import torch from diffusers.models.embeddings import get_timestep_embedding +from transformers import CLIPTokenizer + +from qai_hub_models.utils.inference import HubModel OUT_H, OUT_W = 512, 512 @@ -28,12 +34,13 @@ class StableDiffusionApp: def __init__( self, - text_encoder: Callable[..., Tuple[torch.Tensor, ...]], - vae_decoder: Callable[..., Tuple[torch.Tensor, ...]], - unet: Callable[..., Tuple[torch.Tensor, ...]], - tokenizer: Any, - scheduler: Any, - time_embedding: Any, + text_encoder: HubModel | torch.nn.Module, + vae_decoder: HubModel | torch.nn.Module, + unet: HubModel | torch.nn.Module, + tokenizer: CLIPTokenizer | Any, + scheduler: diffusers.DPMSolverMultistepScheduler, + time_embedding: diffusers.embeddings.TimeEmbedding, + channel_last_latent: bool, ): """ Initializes StableDiffusionApp with required neural networks for end-to-end pipeline. @@ -55,6 +62,9 @@ def __init__( Updates latent space during each iteration. time_embedding: Projects time-step into embedding used during denoising in latent space. + channel_last_latent: + True if unet outputs latent of shape like (1, 64, 64, 4). False + for (1, 4, 64, 64) """ self.text_encoder = text_encoder @@ -63,21 +73,39 @@ def __init__( self.tokenizer = tokenizer self.scheduler = scheduler self.time_embedding = time_embedding + self.channel_last_latent = channel_last_latent def get_time_embedding(self, timestep): + """ + Since these time embeddings aren't dependent on prompt, they can be + pre-computed (for a pre-defined set of timesteps) in deployment and + skip these computation. We include them in demo for better clarity. + """ timestep = torch.tensor([timestep]) + # TODO: pull 320 from UNet block output dim t_emb = get_timestep_embedding(timestep, 320, True, 0) emb = self.time_embedding(t_emb) return emb - def _encode_text_prompt(self, prompt: str) -> torch.Tensor: + def _encode_text_prompt(self, prompt: str) -> Tuple[torch.Tensor, torch.Tensor]: """ Takes a text prompt and returns a tensor with its text embedding. Parameters ---------- prompt: The text prompt to encode. + + Returns + ------- + cond_embedding + + uncond_embedding + + Note that uncond_embedding is the same for any prompt (since it's not + conditioned on the prompt). So in deploymenet this should be + cached instead of computed every time. We compute it here for better + clarity. """ # Tokenize input prompt text_input = self.tokenizer( @@ -153,9 +181,9 @@ def generate_image( Returns ------- torch.Tensor - The generated image in RGB scaled in [0, 1] with tensor shape (H, - W, 3). The height and the width may depend on the underlying Stable - Diffusion version, but is typically 512x512. + The generated image in RGB scaled in [0, 1] with tensor shape + (OUT_H, OUT_W, 3). The height and the width may depend on the + underlying Stable Diffusion version, but is typically 512x512. 
""" # Encode text prompt @@ -182,7 +210,8 @@ def _make_channel_first_torch(input_tensor): print(f"\nStep: {i + 1}\n{'-' * 10}") time_emb = self.get_time_embedding(t) latent_model_input = self.scheduler.scale_model_input(latents, t) - latent_model_input = _make_channel_last_torch(latent_model_input) + if self.channel_last_latent: + latent_model_input = _make_channel_last_torch(latent_model_input) print(f"\nDenoising image in latent space (inference on UNet)\n{'-' * 50}") # Denoise image in latent space @@ -195,11 +224,13 @@ def _make_channel_first_torch(input_tensor): noise_cond, noise_uncond = torch.split(noise, 1, 0) noise_pred = noise_uncond + guidance_scale * (noise_cond - noise_uncond) - noise_pred = _make_channel_first_torch(noise_pred) + if self.channel_last_latent: + noise_pred = _make_channel_first_torch(noise_pred) latents = self.scheduler.step(noise_pred, t, latents).prev_sample print(f"\nDecoding generated image (inference on VAEDecoder)\n{'-' * 50}") # Decode generated image from latent space - latents_vae = _make_channel_last_torch(latents) - image = self.vae_decoder(latents_vae) + if self.channel_last_latent: + latents = _make_channel_last_torch(latents) + image = self.vae_decoder(latents) return image diff --git a/qai_hub_models/models/stable_diffusion_quantized/demo.py b/qai_hub_models/models/_shared/stable_diffusion/demo.py similarity index 70% rename from qai_hub_models/models/stable_diffusion_quantized/demo.py rename to qai_hub_models/models/_shared/stable_diffusion/demo.py index d95ec30e..d09f2461 100644 --- a/qai_hub_models/models/stable_diffusion_quantized/demo.py +++ b/qai_hub_models/models/_shared/stable_diffusion/demo.py @@ -2,22 +2,19 @@ # Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. # SPDX-License-Identifier: BSD-3-Clause # --------------------------------------------------------------------- +from __future__ import annotations + import argparse +from typing import Any +import diffusers import numpy as np import qai_hub as hub -from diffusers import DPMSolverMultistepScheduler, UNet2DConditionModel +from diffusers import DPMSolverMultistepScheduler from PIL import Image from transformers import CLIPTokenizer -from qai_hub_models.models.stable_diffusion_quantized.app import StableDiffusionApp -from qai_hub_models.models.stable_diffusion_quantized.model import ( - MODEL_ASSET_VERSION, - MODEL_ID, - ClipVITTextEncoder, - Unet, - VAEDecoder, -) +from qai_hub_models.models._shared.stable_diffusion.app import StableDiffusionApp from qai_hub_models.utils.args import add_output_dir_arg from qai_hub_models.utils.base_model import BasePrecompiledModel from qai_hub_models.utils.display import display_or_save_image @@ -29,11 +26,13 @@ def _get_hub_model( + model_id: str, + model_asset_version: str, input_model: BasePrecompiledModel, model_name: str, ignore_cached_model: bool = False, device_name=DEFAULT_DEVICE_NAME, -): +) -> HubModel: if not can_access_qualcomm_ai_hub(): raise RuntimeError( "Stable-diffusion on-device demo requires access to QAI-Hub.\n" @@ -42,8 +41,8 @@ def _get_hub_model( # Upload model uploaded_model = get_uploaded_precompiled_model( input_model.get_target_model_path(), - MODEL_ID, - MODEL_ASSET_VERSION, + model_id, + model_asset_version, model_name, ignore_cached_model=ignore_cached_model, ) @@ -53,7 +52,25 @@ def _get_hub_model( # Run Stable Diffuison end-to-end on a given prompt. The demo will output an # AI-generated image based on the description in the prompt. 
-def main(is_test: bool = False): +def stable_diffusion_demo( + model_id: str, + model_asset_version: str, + text_encoder: BasePrecompiledModel, + unet: BasePrecompiledModel, + vae_decoder: BasePrecompiledModel, + tokenizer: CLIPTokenizer | Any, + scheduler: DPMSolverMultistepScheduler, + time_embedding: diffusers.embeddings.TimeEmbedding, + channel_last_latent: bool = True, + is_test: bool = False, +): + """ + Generate an image by running text_encoder, unet, vae_decoder via AI Hub + inference job on target physical device, and tokenizer, scheduler, and + time embedding in torch locally. + + See parser arguments for parameters. + """ parser = argparse.ArgumentParser() parser.add_argument( "--prompt", @@ -64,7 +81,7 @@ def main(is_test: bool = False): "--num-steps", default=5, type=int, - help="The number of diffusion iteration steps (higher means better quality).", + help="The number of diffusion steps (higher means better quality).", ) parser.add_argument( "--seed", @@ -110,50 +127,47 @@ def main(is_test: bool = False): print(f"Downloading model assets\n{'-' * 35}") # Load target models - text_encoder = ClipVITTextEncoder.from_precompiled() - unet = Unet.from_precompiled() - vae_decoder = VAEDecoder.from_precompiled() # Create three HubModel instances to prepare for on-device inference. # This is similar to initializing PyTorch model to call forward method later. # Instead of forward, we later submit inference_jobs on QAI-Hub for # on-device evaluation. print(f"Uploading model assets on QAI-Hub\n{'-' * 35}") - text_encoder = _get_hub_model( - text_encoder, "text_encoder", args.ignore_cached_model, args.device_name + hub_text_encoder = _get_hub_model( + model_id, + model_asset_version, + text_encoder, + "text_encoder", + args.ignore_cached_model, + args.device_name, ) - unet = _get_hub_model(unet, "unet", args.ignore_cached_model, args.device_name) - vae_decoder = _get_hub_model( - vae_decoder, "vae_decoder", args.ignore_cached_model, args.device_name + hub_unet = _get_hub_model( + model_id, + model_asset_version, + unet, + "unet", + args.ignore_cached_model, + args.device_name, ) - - # Create tokenizer, scheduler and time_embedding required - # for stable-diffusion pipeline. 
- tokenizer = CLIPTokenizer.from_pretrained( - "stabilityai/stable-diffusion-2-1-base", subfolder="tokenizer", revision="main" + hub_vae_decoder = _get_hub_model( + model_id, + model_asset_version, + vae_decoder, + "vae_decoder", + args.ignore_cached_model, + args.device_name, ) - scheduler = DPMSolverMultistepScheduler( - beta_start=0.00085, - beta_end=0.012, - beta_schedule="scaled_linear", - num_train_timesteps=1000, - ) - - time_embedding = UNet2DConditionModel.from_pretrained( - "runwayml/stable-diffusion-v1-5", subfolder="unet" - ).time_embedding - # Load Application app = StableDiffusionApp( - text_encoder=text_encoder, - vae_decoder=vae_decoder, - unet=unet, + text_encoder=hub_text_encoder, + vae_decoder=hub_vae_decoder, + unet=hub_unet, tokenizer=tokenizer, scheduler=scheduler, time_embedding=time_embedding, + channel_last_latent=channel_last_latent, ) - # Generate image image = app.generate_image( args.prompt, num_steps=args.num_steps, @@ -165,7 +179,3 @@ def main(is_test: bool = False): if not is_test: display_or_save_image(pil_img, args.output_dir) - - -if __name__ == "__main__": - main() diff --git a/qai_hub_models/models/stable_diffusion_quantized/test.py b/qai_hub_models/models/_shared/stable_diffusion/test_utils.py similarity index 54% rename from qai_hub_models/models/stable_diffusion_quantized/test.py rename to qai_hub_models/models/_shared/stable_diffusion/test_utils.py index b0cc4bf5..29e61824 100644 --- a/qai_hub_models/models/stable_diffusion_quantized/test.py +++ b/qai_hub_models/models/_shared/stable_diffusion/test_utils.py @@ -2,29 +2,15 @@ # Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. # SPDX-License-Identifier: BSD-3-Clause # --------------------------------------------------------------------- -import tempfile +from qai_hub_models.utils.asset_loaders import qaihm_temp_dir -import pytest -from qai_hub_models.models.stable_diffusion_quantized.demo import main as demo_main -from qai_hub_models.models.stable_diffusion_quantized.export import export_model -from qai_hub_models.models.stable_diffusion_quantized.model import ( - StableDiffusionQuantized, -) - - -def test_from_precompiled(): - StableDiffusionQuantized.from_precompiled() - - -@pytest.mark.skip("#105 move slow_cloud and slow tests to nightly.") -@pytest.mark.slow_cloud -def test_export(): - with tempfile.TemporaryDirectory() as tmpdir: +def export_for_component(export_model, component_name: str): + with qaihm_temp_dir() as tmpdir: exported_jobs = export_model( # Testing text_encoder as it's smallest model in # Stable-Diffusion pipeline - components=["text_encoder"], + components=[component_name], skip_inferencing=True, skip_downloading=True, skip_summary=True, @@ -38,9 +24,3 @@ def test_export(): profile_job, inference_job = jobs[0], jobs[1] assert profile_job is not None assert inference_job is None - - -@pytest.mark.skip("#105 move slow_cloud and slow tests to nightly.") -@pytest.mark.slow_cloud -def test_demo(): - demo_main(is_test=True) diff --git a/qai_hub_models/models/_shared/video_classifier/demo.py b/qai_hub_models/models/_shared/video_classifier/demo.py index 99ce64a8..2a0974f9 100644 --- a/qai_hub_models/models/_shared/video_classifier/demo.py +++ b/qai_hub_models/models/_shared/video_classifier/demo.py @@ -4,13 +4,12 @@ # --------------------------------------------------------------------- from __future__ import annotations -import tempfile from typing import Type from qai_hub_models.models._shared.video_classifier.app import KineticsClassifierApp from 
qai_hub_models.models._shared.video_classifier.model import KineticsClassifier from qai_hub_models.utils.args import get_model_cli_parser, model_from_cli_args -from qai_hub_models.utils.asset_loaders import CachedWebAsset, load_path +from qai_hub_models.utils.asset_loaders import CachedWebAsset, load_path, qaihm_temp_dir # @@ -35,7 +34,7 @@ def kinetics_classifier_demo( model = model_from_cli_args(model_type, args) app = KineticsClassifierApp(model) print("Model Loaded") - with tempfile.TemporaryDirectory() as tmpdir: + with qaihm_temp_dir() as tmpdir: dst_path = load_path(args.video, tmpdir) predictions = app.predict(path=str(dst_path)) top5_classes = ", ".join(predictions) diff --git a/qai_hub_models/models/_shared/whisper/app.py b/qai_hub_models/models/_shared/whisper/app.py index b99f54ba..27deac3d 100644 --- a/qai_hub_models/models/_shared/whisper/app.py +++ b/qai_hub_models/models/_shared/whisper/app.py @@ -36,7 +36,7 @@ def __init__(self, whisper: Whisper): self.num_decoder_blocks = whisper.num_decoder_blocks self.num_decoder_heads = whisper.num_decoder_heads self.attention_dim = whisper.attention_dim - self.max_decode_len = whisper.max_decode_len + self.mean_decode_len = whisper.mean_decode_len # Wraps torch Module so it takes np ndarray as input and outputs if isinstance(encoder, torch.nn.Module): @@ -64,14 +64,28 @@ def transcribe(self, mel_input: np.ndarray) -> str: - transcribed texts """ - cross_attn_cache = self.encoder(mel_input) + k_cache_cross, v_cache_cross = self.encoder(mel_input) # Start decoding # coreml only takes float tensors x = np.array([[TOKEN_SOT]]) decoded_tokens = [TOKEN_SOT] - sample_len = self.max_decode_len # max # of tokens to sample - cache_tensor = np.zeros((1, sample_len, self.attention_dim)).astype(np.float32) - self_attn_cache = [cache_tensor] * 2 * self.num_decoder_blocks + sample_len = self.mean_decode_len # mean # of tokens to sample + k_cache_self = np.zeros( + ( + self.num_decoder_blocks, + self.num_decoder_heads, + self.attention_dim // self.num_decoder_heads, + sample_len, + ) + ).astype(np.float32) + v_cache_self = np.zeros( + ( + self.num_decoder_blocks, + self.num_decoder_heads, + sample_len, + self.attention_dim // self.num_decoder_heads, + ) + ).astype(np.float32) sum_logprobs = 0 for i in range(sample_len): @@ -80,15 +94,13 @@ def transcribe(self, mel_input: np.ndarray) -> str: # index - used to get positional embedding correctly. index = torch.zeros([1, 1], dtype=torch.int32) index[0, 0] = i - # Use mask to get the k_cache updated with new key - mask = torch.zeros(1, sample_len, self.attention_dim, dtype=torch.bool) - mask[:, i, :] = 1 decoder_out = self.decoder( - x, index, mask, *cross_attn_cache, *self_attn_cache + x, index, k_cache_cross, v_cache_cross, k_cache_self, v_cache_self ) # logit has shape (1, decoded_len, 51864) logits = decoder_out[0] - self_attn_cache = decoder_out[1:] # type: ignore + k_cache_self = decoder_out[1] + v_cache_self = decoder_out[2] # logit has shape (51864,) logits = logits[0, -1] # consider only the last token diff --git a/qai_hub_models/models/_shared/whisper/model.py b/qai_hub_models/models/_shared/whisper/model.py index 06e2a0d3..1ea0acb4 100644 --- a/qai_hub_models/models/_shared/whisper/model.py +++ b/qai_hub_models/models/_shared/whisper/model.py @@ -13,7 +13,14 @@ from qai_hub_models.utils.base_model import BaseModel, CollectionModel, TargetRuntime from qai_hub_models.utils.input_spec import InputSpec -MAX_DECODE_LEN = 224 +# The official default max decoded length is 448. 
We use mean decoded length 224 for benchmarking purpose +MEAN_DECODE_LEN = 224 + +# The number of 20ms audio contexts in 30 seconds of audio +AUDIO_EMB_LEN = 1500 + +# The number of Mel features per audio context +N_MELS = 80 MODEL_ID = "whisper_asr_shared" MODEL_ASSET_VERSION = 1 @@ -36,7 +43,7 @@ def __init__( self.num_decoder_blocks = num_decoder_blocks self.attention_dim = attention_dim self.num_decoder_heads = num_heads - self.max_decode_len = MAX_DECODE_LEN + self.mean_decode_len = MEAN_DECODE_LEN @classmethod def from_pretrained(cls, model: str = "tiny.en"): @@ -63,16 +70,40 @@ class WhisperEncoderInf(BaseModel): def __init__(self, model: whisper.model.Whisper): super().__init__() - self.model = model + self.encoder = model.encoder + dims = model.dims + + states_per_head = dims.n_audio_state // dims.n_audio_head + scale = states_per_head**-0.25 + + self.cross_attn_key_list = torch.nn.ModuleList( + [ + SplitLinear(block.cross_attn.key, dims.n_audio_head, scale) + for block in model.decoder.blocks + ] + ) + self.cross_attn_value_list = torch.nn.ModuleList( + [ + SplitLinear(block.cross_attn.value, dims.n_audio_head) + for block in model.decoder.blocks + ] + ) def forward(self, audio: torch.Tensor) -> List[torch.Tensor]: - # Return 2 * self.num_blocks tensors (k, v for each block) - encoder_out = self.model.encoder(audio) - res = [] - for residual_block in self.model.decoder.blocks: - res.append(residual_block.cross_attn.key(encoder_out)) - res.append(residual_block.cross_attn.value(encoder_out)) - return res + # Return cross attention key and value cache tensors + encoder_out = self.encoder(audio) + k_cache = torch.cat( + [ + key(encoder_out, transpose=True).unsqueeze(0) + for key in self.cross_attn_key_list + ], + dim=0, + ) + v_cache = torch.cat( + [value(encoder_out).unsqueeze(0) for value in self.cross_attn_value_list], + dim=0, + ) + return k_cache, v_cache @staticmethod def get_input_spec() -> InputSpec: @@ -80,7 +111,7 @@ def get_input_spec() -> InputSpec: Returns the input specification (name -> (shape, type). This can be used to submit profiling job on Qualcomm AI Hub. """ - return dict(audio=((1, 80, 3000), "float32")) + return dict(audio=((1, N_MELS, AUDIO_EMB_LEN * 2), "float32")) @classmethod def from_pretrained(cls): @@ -92,7 +123,12 @@ def get_hub_profile_options( profile_options = super().get_hub_profile_options( target_runtime, other_profile_options ) - return profile_options + " --max_profiler_iterations 10" + " --compute_unit gpu" + if ( + target_runtime == TargetRuntime.TFLITE + and "--compute_unit" not in profile_options + ): + profile_options = profile_options + " --compute_unit gpu" + return profile_options + " --max_profiler_iterations 10" class WhisperDecoderInf(BaseModel): @@ -105,20 +141,47 @@ class WhisperDecoderInf(BaseModel): 2. 
kv cache inputs are required, not optional """ - def __init__(self, model: whisper.model.TextDecoder): + def __init__( + self, model: whisper.model.TextDecoder, max_decode_len: int = MEAN_DECODE_LEN + ): super().__init__() assert isinstance(model, whisper.model.TextDecoder) + self.max_decode_len = max_decode_len + # Wraps `ResidualAttentionBlock` in # `ResidualAttentionBlockWrapper` self.blocks = torch.nn.ModuleList( [ResidualAttentionBlockWrapper(b) for b in model.blocks] ) - self.num_blocks = len(self.blocks) + for m in ["token_embedding", "ln"]: self.add_module(m, getattr(model, m)) - for p in ["positional_embedding"]: - self.register_parameter(p, getattr(model, p)) + + # Replace `whisper.model.TextDecoder.positional_embedding` (nn.Parameter) with nn.Embedding for easier lookup + self.positional_embedding = torch.nn.Embedding( + max_decode_len, self.token_embedding.weight.shape[1] + ) + self.positional_embedding.weight = torch.nn.Parameter( + model.positional_embedding[:max_decode_len, :] + ) + + self.logits = torch.nn.Linear( + self.token_embedding.weight.shape[1], + self.token_embedding.weight.shape[0], + bias=False, + ) + self.logits.weight = self.token_embedding.weight + + # Since kv cache is a fixed size, mask out elements + # that correspond to not yet used entries. + # The kv cache for current token is inserted at the last + # index, with the previous cache shifted down by one element. + self.mask = torch.nn.Embedding(max_decode_len, max_decode_len) + mask = torch.zeros([max_decode_len, max_decode_len], dtype=torch.float32) + for c_idx in range(0, max_decode_len): + mask[c_idx, 0 : max_decode_len - c_idx - 1] = -100 + self.mask.weight = torch.nn.Parameter(mask) @property def attention_dim(self): @@ -128,13 +191,18 @@ def attention_dim(self): def num_heads(self): return self.blocks[0].attn.n_head + @property + def num_blocks(self): + return len(self.blocks) + def forward( self, x: torch.Tensor, index: torch.Tensor, - mask: torch.Tensor, - *kv_cache_args, - **kv_cache_kwargs, + k_cache_cross: torch.Tensor, + v_cache_cross: torch.Tensor, + k_cache_self: torch.Tensor, + v_cache_self: torch.Tensor, ): """ Args: @@ -145,56 +213,54 @@ def forward( - index: torch.tensor, shape = (1, 1) index to get the positional encoding for x. - - mask: torch.tensor, shape = (1, max_sample_length, attn_dim) - Mask helps create kv_cache while keeping the size consistent. - - - kv_cache_args: Tuple of length 4 * num_decoder_blocks. Elements are: - - b{i}_cross_attn_k: [1, 1500, attn_dim] - b{i}_cross_attn_v: [1, 1500, attn_dim] + - k_cache_cross: key cache for cross attention: + [num_blocks, num_heads, attn_dim/num_heads, AUDIO_EMB_LEN] - for i = 0, ..., num_blocks + - v_cache_cross: value cache for cross attention: + [num_blocks, num_heads, AUDIO_EMB_LEN, attn_dim/num_heads] - followed by + - k_cache_self: key cache for self attention: + [num_blocks, num_heads, attn_dim/num_heads, self.max_decode_len] + pass zeros for first call (index 0), otherwise pass in + previous decoder output - b{i}_self_attn_k: [1, max_sample_length, attn_dim] - b{i}_self_attn_v: [1, max_sample_length, attn_dim] - - for i = 0, ..., num_blocks + - v_cache_self: value cache for self attention: + [num_blocks, num_heads, self.max_decode_len, attn_dim/num_heads] + pass zeros for first call (index 0), otherwise pass in + previous decoder output Returns: - logits: of shape [1, 1, 51864] - - b0_self_attn_k, b0_self_attn_v, b1_self_attn_k, ...: Updated self attn cache. 
- 2*num_decoder_blocks + - k_cache_self_new: updated key cache for self attention + - v_cache_self_new: updated value cache for self attention """ - if not kv_cache_args: - kv_cache_args = list(kv_cache_kwargs.values()) - assert isinstance(self.token_embedding, torch.nn.Module) # for mypy assert isinstance(self.ln, torch.nn.Module) # for mypy - assert isinstance(self.positional_embedding, torch.nn.Parameter) # for mypy + assert isinstance(self.positional_embedding, torch.nn.Embedding) # for mypy # Set up kv_cache kv_cache = {} # torch.nn.Module -> torch.Tensor for i, block in enumerate(self.blocks): kv_cache.update( { - block.attn.key: kv_cache_args[2 * self.num_blocks + i * 2], - block.attn.value: kv_cache_args[2 * self.num_blocks + i * 2 + 1], - block.cross_attn.key: kv_cache_args[i * 2], - block.cross_attn.value: kv_cache_args[i * 2 + 1], + block.attn.key: k_cache_self[i : i + 1], + block.attn.value: v_cache_self[i : i + 1], + block.cross_attn.key: k_cache_cross[i : i + 1], + block.cross_attn.value: v_cache_cross[i : i + 1], } ) - x = self.token_embedding(x) + self.positional_embedding[index.long()] + x = self.token_embedding(x) + self.positional_embedding(index) + mask = self.mask(index) # x shape: (1, 1, 384) - kv_cache_new = [] - for block in self.blocks: - x, k_cache, v_cache = block(x, index, mask, kv_cache=kv_cache) - kv_cache_new.append(k_cache.float()) - kv_cache_new.append(v_cache.float()) + k_cache_new = [] + v_cache_new = [] + for block_idx in range(self.num_blocks): + x, k_cache, v_cache = self.blocks[block_idx](x, mask, kv_cache=kv_cache) + k_cache_new.append(k_cache.float()) + v_cache_new.append(v_cache.float()) x = self.ln(x) logits = ( @@ -203,9 +269,9 @@ def forward( self.token_embedding.weight.to(x.dtype), 0, 1 # type: ignore ) ).float() + logits = self.logits(x).float() - # shape: [1, 1, 51864] - return (logits,) + tuple(kv_cache_new) + return logits, torch.cat(k_cache_new), torch.cat(v_cache_new) @staticmethod def get_input_spec( @@ -218,21 +284,23 @@ def get_input_spec( specs = dict( x=((1, 1), "int32"), index=((1, 1), "int32"), - mask=((1, MAX_DECODE_LEN, attention_dim), "int32"), - ) - for i in range(num_blocks): - specs[f"b{i}_cross_attn_k"] = ((1, 1500, attention_dim), "float32") - specs[f"b{i}_cross_attn_v"] = ((1, 1500, attention_dim), "float32") - - for i in range(num_blocks): - specs[f"b{i}_self_attn_k"] = ( - (1, MAX_DECODE_LEN, attention_dim), + k_cache_cross=( + (num_blocks, num_heads, attention_dim // num_heads, AUDIO_EMB_LEN), "float32", - ) - specs[f"b{i}_self_attn_v"] = ( - (1, MAX_DECODE_LEN, attention_dim), + ), + v_cache_cross=( + (num_blocks, num_heads, AUDIO_EMB_LEN, attention_dim // num_heads), "float32", - ) + ), + k_cache_self=( + (num_blocks, num_heads, attention_dim // num_heads, MEAN_DECODE_LEN), + "float32", + ), + v_cache_self=( + (num_blocks, num_heads, MEAN_DECODE_LEN, attention_dim // num_heads), + "float32", + ), + ) return specs @@ -246,6 +314,50 @@ def from_pretrained(cls): return Whisper.from_pretrained().decoder +class SplitLinear(torch.nn.Module): + def __init__(self, linear: torch.nn.Module, num_splits: int, scale: float = 1.0): + """ + Split Linear operation into multiple instances + Multi-head cross attention + Uses pre-computed cross kv cache passed as input to the + decoder model + """ + super().__init__() + weight = linear.weight + has_bias = False if linear.bias is None else True + if has_bias: + bias = linear.bias.reshape(num_splits, -1) * scale + split_weight = weight.reshape(num_splits, -1, weight.shape[1]) * scale + 
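+        # One small Linear per split (one per attention head); each instance's
+        # weight (and bias, if present) is overwritten below with its slice of
+        # the original projection, already pre-scaled for the attention computation.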
self.split_linears = torch.nn.ModuleList( + [ + torch.nn.Linear( + split_weight.shape[1], split_weight.shape[2], bias=has_bias + ) + for split_idx in range(num_splits) + ] + ) + for split_idx in range(num_splits): + self.split_linears[split_idx].weight = torch.nn.Parameter( + split_weight[split_idx, :, :] + ) + if has_bias: + self.split_linears[split_idx].bias = torch.nn.Parameter(bias[split_idx]) + + def forward(self, x: torch.Tensor, transpose: bool = False): + """ + produces output with dimension + [num_splits, input rows, output_features / num_splits] + If transpose is True, will transpose last two indices + """ + if transpose: + x = torch.cat( + [spl(x).transpose(-1, -2) for spl in self.split_linears], dim=-3 + ) + else: + x = torch.cat([spl(x) for spl in self.split_linears], dim=-3) + return x + + class MHAWrapper(torch.nn.Module): """ Wrapper around whisper.model.MultiHeadAttention to leverage kv cache for @@ -275,7 +387,6 @@ def __init__(self, model: whisper.model.MultiHeadAttention, attn_type: str): def forward( self, x: torch.Tensor, - index: torch.Tensor, mask: torch.Tensor, kv_cache: Dict[torch.nn.Module, torch.Tensor], ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: @@ -285,30 +396,36 @@ def forward( - x: shape [1, 1, attention_dim]. Input feature. - kv_cache: 4 * num_decoder_blocks entries representing self attention - and cross attention from all attention blocks. Each entry of shape - [1, decoded_len, attention_dim]. We'd only use cache relevant to this - particular attention layer and ignore other entries in the dict. + and cross attention from all attention blocks. Each k entry of shape + [1, num_heads, attention_dim // num_heads, context_len] and + each v entry of shape + [1, num_heads, context_len, attention_dim // num_heads]. + We'd only use cache relevant to this particular attention layer + and ignore other entries in the dict. 
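+        - mask: additive attention mask taken from the decoder's mask embedding.
+          Positions of not-yet-filled self-attention cache entries hold large
+          negative values so they contribute ~0 after softmax. Only applied in
+          the self-attention path.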
Returns: - x_out: attention output - - updated k, v cache: of shape [1, decoded_len+1, attention_dim] + - updated k, v cache: with same shape as input """ assert isinstance(self.query, torch.nn.Module) # for mypy assert isinstance(self.key, torch.nn.Module) # for mypy assert isinstance(self.value, torch.nn.Module) # for mypy assert isinstance(self.out, torch.nn.Module) # for mypy q = self.query(x) + q = q.view(q.shape[0], self.n_head, 1, -1) if self.attn_type == "self_attention": k_cache = kv_cache[self.key] v_cache = kv_cache[self.value] - k = torch.zeros(k_cache.shape) - v = torch.zeros(v_cache.shape) - k = mask * self.key(x) + k_cache - v = mask * self.value(x) + v_cache - new_index = torch.tensor([index[0, 0] + 1]).long() - wv = qkv_attention(q, k[:, :new_index], v[:, :new_index], self.n_head) + k = self.key(x).unsqueeze(3) + k = k.view(k.shape[0], self.n_head, -1, 1) + v = self.value(x).unsqueeze(2) + v = v.view(k.shape[0], self.n_head, 1, -1) + # shift kv cache and insert new k and v entries + k = torch.cat((k_cache[:, :, :, 1:], k), dim=-1) + v = torch.cat((v_cache[:, :, 1:, :], v), dim=-2) + wv = qkv_attention(q, k, v, self.n_head, mask=mask) else: # cross_attention k, v = kv_cache[self.key], kv_cache[self.value] wv = qkv_attention(q, k, v, self.n_head) @@ -327,21 +444,17 @@ def qkv_attention( """ Adapted from whisper.model.MultiHeadAttention.qkv_attention """ - n_batch, n_ctx, n_state = q.shape - - scale = (n_state // n_head) ** -0.25 - q = q.view(*q.shape[:2], n_head, -1).permute(0, 2, 1, 3) * scale - k = k.view(*k.shape[:2], n_head, -1).permute(0, 2, 3, 1) * scale - v = v.view(*v.shape[:2], n_head, -1).permute(0, 2, 1, 3) - - qk = q @ k - if mask is not None: - qk = qk + mask - # Use negative infinity to mask the zeros when doing the softmax. 
- qk = qk.float() - - w = torch.nn.functional.softmax(qk, dim=-1).to(q.dtype) - return (w @ v).permute(0, 2, 1, 3).flatten(start_dim=2) + wv_list = [] + # Split heads in qkv calculation + for h in range(n_head): + qk = q[:, h : h + 1, :, :] @ k[:, h : h + 1, :, :] + if mask is not None: + qk = qk + mask + w = torch.nn.functional.softmax(qk, dim=-1) + wv_list.append(w @ v[:, h : h + 1, :, :]) + wv = torch.cat(wv_list, dim=1) + wv = wv.view(wv.shape[0], 1, -1) + return wv class ResidualAttentionBlockWrapper(torch.nn.Module): @@ -357,14 +470,26 @@ def __init__(self, model: whisper.model.ResidualAttentionBlock): assert isinstance(model, whisper.model.ResidualAttentionBlock) # Wraps `MultiheadAttention` to `MultiheadAttentionWrapper` self.attn = MHAWrapper(model.attn, "self_attention") + + states_per_head = model.attn.query.weight.shape[0] // model.attn.n_head + scale = states_per_head**-0.25 self.cross_attn = MHAWrapper(model.cross_attn, "cross_attention") + + # Apply scale for qkv to parameters + with torch.no_grad(): + self.attn.query.weight *= scale + self.attn.query.bias *= scale + self.attn.key.weight *= scale + self.cross_attn.query.weight *= scale + self.cross_attn.query.bias *= scale + self.cross_attn.key.weight *= scale + for m in ["attn_ln", "cross_attn_ln", "mlp", "mlp_ln"]: self.add_module(m, getattr(model, m)) def forward( self, x: torch.Tensor, - index: torch.Tensor, mask: torch.Tensor, kv_cache: Dict[torch.nn.Module, torch.Tensor], ): @@ -380,14 +505,14 @@ def forward( assert isinstance(self.mlp, torch.nn.Module) # for mypy assert isinstance(self.mlp_ln, torch.nn.Module) # for mypy x_attn, k_cache, v_cache = self.attn( - self.attn_ln(x), index=index, mask=mask, kv_cache=kv_cache + self.attn_ln(x), mask=mask, kv_cache=kv_cache ) x = x + x_attn if self.cross_attn: # Ignore cross attn kv cache which is constant (pre-computed in # `WhisperCrossAttnKVCacheTorch`) x_cross_attn, _, _ = self.cross_attn( - self.cross_attn_ln(x), index=index, mask=mask, kv_cache=kv_cache + self.cross_attn_ln(x), mask=mask, kv_cache=kv_cache ) x = x + x_cross_attn x = x + self.mlp(self.mlp_ln(x)) diff --git a/qai_hub_models/models/_shared/whisper/test_utils.py b/qai_hub_models/models/_shared/whisper/test_utils.py index b3657a76..9b4fb089 100644 --- a/qai_hub_models/models/_shared/whisper/test_utils.py +++ b/qai_hub_models/models/_shared/whisper/test_utils.py @@ -13,7 +13,7 @@ ) from qai_hub_models.models._shared.whisper.demo import TEST_AUDIO_PATH from qai_hub_models.models._shared.whisper.model import ( - MAX_DECODE_LEN, + MEAN_DECODE_LEN, MEL_FILTER_PATH, Whisper, WhisperDecoderInf, @@ -49,19 +49,36 @@ def run_test_wrapper_numerics(whisper_version): encoder = WhisperEncoderInf(model) decoder = WhisperDecoderInf(model.decoder) - cross_attn_cache = encoder(mel_input) - sample_len = MAX_DECODE_LEN - cache_tensor = np.zeros([1, sample_len, decoder.attention_dim]).astype(np.float32) + k_cache_cross, v_cache_cross = encoder(mel_input) + sample_len = MEAN_DECODE_LEN + + k_cache_self = torch.zeros( + ( + decoder.num_blocks, + decoder.num_heads, + decoder.attention_dim // decoder.num_heads, + sample_len, + ), + dtype=torch.float32, + ) + v_cache_self = torch.zeros( + ( + decoder.num_blocks, + decoder.num_heads, + sample_len, + decoder.attention_dim // decoder.num_heads, + ), + dtype=torch.float32, + ) index = torch.zeros([1, 1], dtype=torch.int32) index[0, 0] = 0 - mask = torch.zeros(1, sample_len, decoder.attention_dim, dtype=torch.bool) - mask[:, 0, :] = 1 - self_attn_cache = [cache_tensor] * 2 * 
decoder.num_blocks with torch.no_grad(): - decoder_out = decoder(tokens, index, mask, *cross_attn_cache, *self_attn_cache) + decoder_out = decoder( + tokens, index, k_cache_cross, v_cache_cross, k_cache_self, v_cache_self + ) logits = decoder_out[0].detach().numpy() - np.testing.assert_allclose(logits_orig, logits) + np.testing.assert_allclose(logits_orig, logits, rtol=5e-3) def run_test_transcribe(whisper_version): diff --git a/qai_hub_models/models/_shared/yolo/demo.py b/qai_hub_models/models/_shared/yolo/demo.py index 9da62ba3..046bed7d 100644 --- a/qai_hub_models/models/_shared/yolo/demo.py +++ b/qai_hub_models/models/_shared/yolo/demo.py @@ -29,6 +29,7 @@ def yolo_detection_demo( default_image: str | CachedWebAsset, stride_multiple: int | None = None, is_test: bool = False, + default_score_threshold: float = 0.45, ): # Demo parameters parser = get_model_cli_parser(model_type) @@ -40,7 +41,7 @@ def yolo_detection_demo( parser.add_argument( "--score-threshold", type=float, - default=0.45, + default=default_score_threshold, help="Score threshold for NonMaximumSuppression", ) parser.add_argument( diff --git a/qai_hub_models/models/aotgan/README.md b/qai_hub_models/models/aotgan/README.md index 5e0f84a4..dd02b51a 100644 --- a/qai_hub_models/models/aotgan/README.md +++ b/qai_hub_models/models/aotgan/README.md @@ -14,6 +14,8 @@ accross various devices, can be found [here](https://aihub.qualcomm.com/models/a a hosted Qualcomm® device. + + ## Example & Usage diff --git a/qai_hub_models/models/aotgan/export.py b/qai_hub_models/models/aotgan/export.py index 031cded6..9b628a56 100644 --- a/qai_hub_models/models/aotgan/export.py +++ b/qai_hub_models/models/aotgan/export.py @@ -120,12 +120,17 @@ def export_model( model.eval() source_model = torch.jit.trace(model.to("cpu"), make_torch_inputs(input_spec)) + # Convert outputs from channel last to channel first (preferred I/O format for QNN and TensorFlow Lite) + channel_last_flags = ( + " --force_channel_last_input image,mask" + + " --force_channel_last_output output_0" + if target_runtime != TargetRuntime.ORT + else "" + ) + # 2. 
Compile the model to an on-device asset model_compile_options = model.get_hub_compile_options( - target_runtime, - compile_options - + " --force_channel_last_input image,mask" - + " --force_channel_last_output output_0", + target_runtime, compile_options + channel_last_flags, hub_device ) print(f"Optimizing model {model_name} to run on-device") submitted_compile_job = hub.submit_compile_job( @@ -163,8 +168,12 @@ def export_model( ) sample_inputs = model.sample_inputs(input_spec) # Convert inputs from channel first to channel last - hub_inputs = transpose_channel_first_to_last( - "image,mask", sample_inputs, target_runtime + hub_inputs = ( + sample_inputs + if target_runtime == TargetRuntime.ORT + else transpose_channel_first_to_last( + "image,mask", sample_inputs, target_runtime + ) ) submitted_inference_job = hub.submit_inference_job( model=compile_job.get_target_model(), @@ -192,8 +201,12 @@ def export_model( assert inference_job is not None and inference_job.wait().success inference_result: hub.client.DatasetEntries = inference_job.download_output_data() # type: ignore # Convert outputs from channel last to channel first - inference_result = transpose_channel_last_to_first( - "output_0", inference_result, target_runtime + inference_result = ( + inference_result + if target_runtime == TargetRuntime.ORT + else transpose_channel_last_to_first( + "output_0", inference_result, target_runtime + ) ) print_inference_metrics(inference_job, inference_result, torch_out) diff --git a/qai_hub_models/models/aotgan/perf.yaml b/qai_hub_models/models/aotgan/perf.yaml index a83056a2..7f8e4c42 100644 --- a/qai_hub_models/models/aotgan/perf.yaml +++ b/qai_hub_models/models/aotgan/perf.yaml @@ -22,6 +22,7 @@ aggregated: - Samsung Galaxy S24 Ultra - Samsung Galaxy S24+ - Samsung Galaxy Tab S8 + - Snapdragon X Elite CRD - Xiaomi 12 - Xiaomi 12 Pro supported_chipsets: @@ -30,15 +31,16 @@ aggregated: - Snapdragon® 8 Gen 2 - Snapdragon® 8 Gen 3 - Snapdragon® 888 + - Snapdragon® X Elite models: - name: AOT-GAN performance_metrics: - torchscript_onnx_tflite: - inference_time: 172218.0 - throughput: 5.806593968110186 + inference_time: 164598.0 + throughput: 6.075407963644759 estimated_peak_memory_range: - min: 3301376 - max: 6608312 + min: 4349952 + max: 7789760 primary_compute_unit: NPU precision: fp16 layer_info: @@ -46,14 +48,14 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 235 - job_id: jlpeelxop + job_id: jmg9werlp job_status: Passed torchscript_onnx_qnn: - inference_time: 162527.0 - throughput: 6.15282383850068 + inference_time: 164540.0 + throughput: 6.077549532028686 estimated_peak_memory_range: - min: 4247552 - max: 34036840 + min: 4341760 + max: 36913480 primary_compute_unit: NPU precision: fp16 layer_info: @@ -61,7 +63,7 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 275 - job_id: jz5w21z35 + job_id: jz57x3mlg job_status: Passed torchscript_onnx_ort: inference_time: 'null' @@ -76,7 +78,7 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 0 - job_id: jnp1yv18p + job_id: jegn384r5 job_status: Failed reference_device_info: name: Samsung Galaxy S23 @@ -85,13 +87,13 @@ models: os_name: Android manufacturer: Samsung chipset: Snapdragon® 8 Gen 2 - timestamp: '2024-04-23T18:42:31.302216Z' + timestamp: '2024-05-20T16:35:27.553176Z' - torchscript_onnx_tflite: - inference_time: 126778.0 - throughput: 7.887803877644386 + inference_time: 120809.0 + throughput: 8.277528992045294 estimated_peak_memory_range: - min: 2174976 - max: 256099504 + min: 2879488 + max: 222384800 
primary_compute_unit: NPU precision: fp16 layer_info: @@ -99,14 +101,14 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 235 - job_id: jygzo4yo5 + job_id: jnp1ex92g job_status: Passed torchscript_onnx_qnn: - inference_time: 119306.0 - throughput: 8.381808123648433 + inference_time: 121163.0 + throughput: 8.253344667926678 estimated_peak_memory_range: - min: 3887104 - max: 166111904 + min: 3252224 + max: 144610800 primary_compute_unit: NPU precision: fp16 layer_info: @@ -114,7 +116,7 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 275 - job_id: jmg9jx2w5 + job_id: jqp4v07vp job_status: Passed torchscript_onnx_ort: inference_time: 'null' @@ -129,7 +131,7 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 0 - job_id: jvgdez4r5 + job_id: joprejr95 job_status: Failed reference_device_info: name: Samsung Galaxy S24 @@ -138,13 +140,13 @@ models: os_name: Android manufacturer: Samsung chipset: Snapdragon® 8 Gen 3 - timestamp: '2024-04-23T18:42:31.302592Z' + timestamp: '2024-05-20T16:35:27.684430Z' - torchscript_onnx_tflite: - inference_time: 171670.0 - throughput: 5.825129609133803 + inference_time: 161130.0 + throughput: 6.206168931918326 estimated_peak_memory_range: - min: 3219456 - max: 6614600 + min: 3170304 + max: 13340440 primary_compute_unit: NPU precision: fp16 layer_info: @@ -152,14 +154,14 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 235 - job_id: jopr8dz05 + job_id: jvgdolkep job_status: Passed torchscript_onnx_qnn: - inference_time: 162527.0 - throughput: 6.15282383850068 + inference_time: 164457.0 + throughput: 6.080616817769995 estimated_peak_memory_range: - min: 4337664 - max: 32953256 + min: 4214784 + max: 29715440 primary_compute_unit: NPU precision: fp16 layer_info: @@ -167,7 +169,7 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 275 - job_id: j1p80rlkg + job_id: jo5m3y7wg job_status: Passed reference_device_info: name: QCS8550 (Proxy) @@ -176,4 +178,57 @@ models: os_name: Android manufacturer: Qualcomm chipset: Qcs8550 - timestamp: '2024-04-23T18:42:31.302905Z' + timestamp: '2024-05-20T16:35:27.816239Z' + - torchscript_onnx_qnn: + inference_time: 145454.0 + throughput: 6.87502578134668 + estimated_peak_memory_range: + min: 4202496 + max: 4202496 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 275 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 275 + job_id: j0pxy2q1g + job_status: Passed + torchscript_onnx_ort: + inference_time: 'null' + throughput: 'null' + estimated_peak_memory_range: + min: 0 + max: 0 + primary_compute_unit: 'null' + precision: 'null' + layer_info: + layers_on_npu: 0 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 0 + job_id: jep2ln14g + job_status: Failed + torchscript_onnx_ort_dml_gpu: + inference_time: 'null' + throughput: 'null' + estimated_peak_memory_range: + min: 0 + max: 0 + primary_compute_unit: 'null' + precision: 'null' + layer_info: + layers_on_npu: 0 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 0 + job_id: jqpy60l75 + job_status: Failed + reference_device_info: + name: Snapdragon X Elite CRD + os: '11' + form_factor: Compute + os_name: Windows + manufacturer: Qualcomm + chipset: Snapdragon® X Elite + timestamp: '2024-05-20T16:35:27.949164Z' diff --git a/qai_hub_models/models/baichuan_7b_quantized/README.md b/qai_hub_models/models/baichuan_7b_quantized/README.md index 0e5427aa..cff30e46 100644 --- a/qai_hub_models/models/baichuan_7b_quantized/README.md +++ b/qai_hub_models/models/baichuan_7b_quantized/README.md @@ -15,6 +15,8 @@ a hosted 
Qualcomm® device. + + ## License - The license for the original implementation of Baichuan-7B can be found [here](https://github.com/baichuan-inc/Baichuan-7B/blob/main/LICENSE). @@ -29,3 +31,25 @@ a hosted Qualcomm® device. * For questions or feedback please [reach out to us](mailto:ai-hub-support@qti.qualcomm.com). +## Usage and Limitations + +This model may not be used for or in connection with any of the following applications: + +- Accessing essential private and public services and benefits; +- Administration of justice and democratic processes; +- Assessing or recognizing the emotional state of a person; +- Biometric and biometrics-based systems, including categorization of persons based on sensitive characteristics; +- Education and vocational training; +- Employment and workers management; +- Exploitation of the vulnerabilities of persons resulting in harmful behavior; +- General purpose social scoring; +- Law enforcement; +- Management and operation of critical infrastructure; +- Migration, asylum and border control management; +- Predictive policing; +- Real-time remote biometric identification in public spaces; +- Recommender systems of social media platforms; +- Scraping of facial images (from the internet or otherwise); and/or +- Subliminal manipulation + + diff --git a/qai_hub_models/models/controlnet_quantized/README.md b/qai_hub_models/models/controlnet_quantized/README.md index db6e93dc..2c155773 100644 --- a/qai_hub_models/models/controlnet_quantized/README.md +++ b/qai_hub_models/models/controlnet_quantized/README.md @@ -14,6 +14,8 @@ accross various devices, can be found [here](https://aihub.qualcomm.com/models/c a hosted Qualcomm® device. + + ## Example & Usage Install the package via pip: diff --git a/qai_hub_models/models/controlnet_quantized/info.yaml b/qai_hub_models/models/controlnet_quantized/info.yaml index 97f9fa98..5c000cba 100644 --- a/qai_hub_models/models/controlnet_quantized/info.yaml +++ b/qai_hub_models/models/controlnet_quantized/info.yaml @@ -29,7 +29,7 @@ applicable_scenarios: - Image Editing - Content Creation related_models: - - stable_diffusion_quantized + - stable_diffusion_v1_5_quantized form_factors: - Phone - Tablet diff --git a/qai_hub_models/models/controlnet_quantized/test.py b/qai_hub_models/models/controlnet_quantized/test.py index 18c31392..d7b23999 100644 --- a/qai_hub_models/models/controlnet_quantized/test.py +++ b/qai_hub_models/models/controlnet_quantized/test.py @@ -2,13 +2,12 @@ # Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. 
# SPDX-License-Identifier: BSD-3-Clause # --------------------------------------------------------------------- -import tempfile - import pytest from qai_hub_models.models.controlnet_quantized.demo import main as demo_main from qai_hub_models.models.controlnet_quantized.export import export_model from qai_hub_models.models.controlnet_quantized.model import ControlNetQuantized +from qai_hub_models.utils.asset_loaders import qaihm_temp_dir def test_from_precompiled(): @@ -18,7 +17,7 @@ def test_from_precompiled(): @pytest.mark.skip("#105 move slow_cloud and slow tests to nightly.") @pytest.mark.slow_cloud def test_export(): - with tempfile.TemporaryDirectory() as tmpdir: + with qaihm_temp_dir() as tmpdir: exported_jobs = export_model( # Testing text_encoder as it's smallest model in # ControlNet pipeline diff --git a/qai_hub_models/models/convnext_tiny/README.md b/qai_hub_models/models/convnext_tiny/README.md index bd6e6674..9e71c767 100644 --- a/qai_hub_models/models/convnext_tiny/README.md +++ b/qai_hub_models/models/convnext_tiny/README.md @@ -14,6 +14,8 @@ accross various devices, can be found [here](https://aihub.qualcomm.com/models/c a hosted Qualcomm® device. + + ## Example & Usage diff --git a/qai_hub_models/models/convnext_tiny/export.py b/qai_hub_models/models/convnext_tiny/export.py index f00fe11b..54ff513c 100644 --- a/qai_hub_models/models/convnext_tiny/export.py +++ b/qai_hub_models/models/convnext_tiny/export.py @@ -119,9 +119,16 @@ def export_model( model.eval() source_model = torch.jit.trace(model.to("cpu"), make_torch_inputs(input_spec)) + # Convert outputs from channel last to channel first (preferred I/O format for QNN and TensorFlow Lite) + channel_last_flags = ( + " --force_channel_last_input image_tensor" + if target_runtime != TargetRuntime.ORT + else "" + ) + # 2. 
Compile the model to an on-device asset model_compile_options = model.get_hub_compile_options( - target_runtime, compile_options + " --force_channel_last_input image_tensor" + target_runtime, compile_options + channel_last_flags, hub_device ) print(f"Optimizing model {model_name} to run on-device") submitted_compile_job = hub.submit_compile_job( @@ -159,8 +166,12 @@ def export_model( ) sample_inputs = model.sample_inputs(input_spec) # Convert inputs from channel first to channel last - hub_inputs = transpose_channel_first_to_last( - "image_tensor", sample_inputs, target_runtime + hub_inputs = ( + sample_inputs + if target_runtime == TargetRuntime.ORT + else transpose_channel_first_to_last( + "image_tensor", sample_inputs, target_runtime + ) ) submitted_inference_job = hub.submit_inference_job( model=compile_job.get_target_model(), @@ -197,7 +208,7 @@ def export_model( def main(): warnings.filterwarnings("ignore") - parser = export_parser(model_cls=Model, supports_qnn=False, supports_ort=False) + parser = export_parser(model_cls=Model) args = parser.parse_args() export_model(**vars(args)) diff --git a/qai_hub_models/models/convnext_tiny/perf.yaml b/qai_hub_models/models/convnext_tiny/perf.yaml index f8254acd..9d8dec94 100644 --- a/qai_hub_models/models/convnext_tiny/perf.yaml +++ b/qai_hub_models/models/convnext_tiny/perf.yaml @@ -8,6 +8,7 @@ aggregated: - Google Pixel 4 - Google Pixel 4a - Google Pixel 5a 5G + - QCS8550 (Proxy) - Samsung Galaxy S21 - Samsung Galaxy S21 Ultra - Samsung Galaxy S21+ @@ -21,30 +22,63 @@ aggregated: - Samsung Galaxy S24 Ultra - Samsung Galaxy S24+ - Samsung Galaxy Tab S8 + - Snapdragon X Elite CRD - Xiaomi 12 - Xiaomi 12 Pro supported_chipsets: + - Qcs8550 - Snapdragon® 8 Gen 1 - Snapdragon® 8 Gen 2 - Snapdragon® 8 Gen 3 - Snapdragon® 888 + - Snapdragon® X Elite models: - name: ConvNext-Tiny performance_metrics: - torchscript_onnx_tflite: - inference_time: 11504.0 - throughput: 86.92628650904034 + inference_time: 5710.0 + throughput: 175.13134851138355 + estimated_peak_memory_range: + min: 49152 + max: 2555016 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 328 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 328 + job_id: j2p0l7w6p + job_status: Passed + torchscript_onnx_qnn: + inference_time: 3790.0 + throughput: 263.85224274406335 + estimated_peak_memory_range: + min: 86016 + max: 170428944 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 223 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 223 + job_id: jn5q3on4p + job_status: Passed + torchscript_onnx_ort: + inference_time: 16263.0 + throughput: 61.48927012236365 estimated_peak_memory_range: min: 32768 - max: 2493040 + max: 155815696 primary_compute_unit: NPU precision: fp16 layer_info: - layers_on_npu: 380 + layers_on_npu: 189 layers_on_gpu: 0 layers_on_cpu: 0 - total_layers: 380 - job_id: jlpeoyo7g + total_layers: 189 + job_id: jwgo3qxxg job_status: Passed reference_device_info: name: Samsung Galaxy S23 @@ -53,36 +87,51 @@ models: os_name: Android manufacturer: Samsung chipset: Snapdragon® 8 Gen 2 - timestamp: '2024-04-02T15:30:19.195043Z' + timestamp: '2024-05-20T16:35:27.983972Z' + - torchscript_onnx_tflite: + inference_time: 3967.0 + throughput: 252.07965717166624 + estimated_peak_memory_range: + min: 16384 + max: 210597920 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 328 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 328 + job_id: j1p8zvnxp + job_status: Passed torchscript_onnx_qnn: - 
inference_time: 'null' - throughput: 'null' + inference_time: 2727.0 + throughput: 366.70333700036673 estimated_peak_memory_range: min: 0 - max: 0 - primary_compute_unit: 'null' - precision: 'null' + max: 88673616 + primary_compute_unit: NPU + precision: fp16 layer_info: - layers_on_npu: 0 + layers_on_npu: 223 layers_on_gpu: 0 layers_on_cpu: 0 - total_layers: 0 - job_id: '' - job_status: Skipped - - torchscript_onnx_tflite: - inference_time: 8139.0 - throughput: 122.86521685710775 + total_layers: 223 + job_id: j1gl3rd8g + job_status: Passed + torchscript_onnx_ort: + inference_time: 11672.0 + throughput: 85.67511994516792 estimated_peak_memory_range: - min: 20480 - max: 209217264 + min: 618496 + max: 58874592 primary_compute_unit: NPU precision: fp16 layer_info: - layers_on_npu: 380 + layers_on_npu: 189 layers_on_gpu: 0 layers_on_cpu: 0 - total_layers: 380 - job_id: jygz2n2zg + total_layers: 189 + job_id: j1pvvx8jp job_status: Passed reference_device_info: name: Samsung Galaxy S24 @@ -91,19 +140,95 @@ models: os_name: Android manufacturer: Samsung chipset: Snapdragon® 8 Gen 3 - timestamp: '2024-04-02T15:30:19.195057Z' + timestamp: '2024-05-20T16:35:27.984000Z' + - torchscript_onnx_tflite: + inference_time: 5754.0 + throughput: 173.79214459506431 + estimated_peak_memory_range: + min: 24576 + max: 2610064 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 328 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 328 + job_id: jogk3m125 + job_status: Passed torchscript_onnx_qnn: - inference_time: 'null' - throughput: 'null' + inference_time: 3773.0 + throughput: 265.041081367612 estimated_peak_memory_range: - min: 0 - max: 0 - primary_compute_unit: 'null' - precision: 'null' + min: 90112 + max: 202074560 + primary_compute_unit: NPU + precision: fp16 layer_info: - layers_on_npu: 0 + layers_on_npu: 223 layers_on_gpu: 0 layers_on_cpu: 0 - total_layers: 0 - job_id: '' - job_status: Skipped + total_layers: 223 + job_id: j1p3e2dl5 + job_status: Passed + reference_device_info: + name: QCS8550 (Proxy) + os: '12' + form_factor: Iot + os_name: Android + manufacturer: Qualcomm + chipset: Qcs8550 + timestamp: '2024-05-20T16:35:27.984018Z' + - torchscript_onnx_qnn: + inference_time: 3953.0 + throughput: 252.97242600556538 + estimated_peak_memory_range: + min: 602112 + max: 602112 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 223 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 223 + job_id: jw56nlx0g + job_status: Passed + torchscript_onnx_ort: + inference_time: 17160.0 + throughput: 58.27505827505828 + estimated_peak_memory_range: + min: 441618432 + max: 441618432 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 189 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 189 + job_id: j7gje49x5 + job_status: Passed + torchscript_onnx_ort_dml_gpu: + inference_time: 36452.0 + throughput: 27.4333369911116 + estimated_peak_memory_range: + min: 1425408 + max: 1425408 + primary_compute_unit: CPU + precision: fp32 + layer_info: + layers_on_npu: 0 + layers_on_gpu: 0 + layers_on_cpu: 202 + total_layers: 202 + job_id: jlpek3q1p + job_status: Passed + reference_device_info: + name: Snapdragon X Elite CRD + os: '11' + form_factor: Compute + os_name: Windows + manufacturer: Qualcomm + chipset: Snapdragon® X Elite + timestamp: '2024-05-20T16:35:27.984040Z' diff --git a/qai_hub_models/models/convnext_tiny_w8a16_quantized/README.md b/qai_hub_models/models/convnext_tiny_w8a16_quantized/README.md new file mode 100644 index 
00000000..4266dbef --- /dev/null +++ b/qai_hub_models/models/convnext_tiny_w8a16_quantized/README.md @@ -0,0 +1,56 @@ +[![Qualcomm® AI Hub Models](https://qaihub-public-assets.s3.us-west-2.amazonaws.com/qai-hub-models/quic-logo.jpg)](../../README.md) + + +# [ConvNext-Tiny-w8a16-Quantized: Imagenet classifier and general purpose backbone](#) + +ConvNextTiny is a machine learning model that can classify images from the Imagenet dataset. It can also be used as a backbone in building more complex models for specific use cases. + +This is based on the implementation of ConvNext-Tiny-w8a16-Quantized found +[here](https://github.com/pytorch/vision/blob/main/torchvision/models/convnext.py). This repository contains scripts for optimized on-device +export suitable to run on Qualcomm® devices. More details on model performance +accross various devices, can be found [here](#). + +[Sign up](https://myaccount.qualcomm.com/signup) for early access to run these models on +a hosted Qualcomm® device. + + + + +## Example & Usage + + +Once installed, run the following simple CLI demo: + +```bash +python -m qai_hub_models.models.convnext_tiny_w8a16_quantized.demo +``` +More details on the CLI tool can be found with the `--help` option. See +[demo.py](demo.py) for sample usage of the model including pre/post processing +scripts. Please refer to our [general instructions on using +models](../../../#getting-started) for more usage instructions. + +## Export for on-device deployment + +This repository contains export scripts that produce a model optimized for +on-device deployment. This can be run as follows: + +```bash +python -m qai_hub_models.models.convnext_tiny_w8a16_quantized.export +``` +Additional options are documented with the `--help` option. Note that the above +script requires access to Deployment instructions for Qualcomm® AI Hub. + +## License +- The license for the original implementation of ConvNext-Tiny-w8a16-Quantized can be found + [here](https://github.com/pytorch/vision/blob/main/LICENSE). +- The license for the compiled assets for on-device deployment can be found [here]({deploy_license_url}) + +## References +* [A ConvNet for the 2020s](https://arxiv.org/abs/2201.03545) +* [Source Model Implementation](https://github.com/pytorch/vision/blob/main/torchvision/models/convnext.py) + +## Community +* Join [our AI Hub Slack community](https://qualcomm-ai-hub.slack.com/join/shared_invite/zt-2d5zsmas3-Sj0Q9TzslueCjS31eXG2UA#/shared-invite/email) to collaborate, post questions and learn more about on-device AI. +* For questions or feedback please [reach out to us](mailto:ai-hub-support@qti.qualcomm.com). + + diff --git a/qai_hub_models/models/convnext_tiny_w8a16_quantized/__init__.py b/qai_hub_models/models/convnext_tiny_w8a16_quantized/__init__.py new file mode 100644 index 00000000..599858c4 --- /dev/null +++ b/qai_hub_models/models/convnext_tiny_w8a16_quantized/__init__.py @@ -0,0 +1,10 @@ +# --------------------------------------------------------------------- +# Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. 
+# SPDX-License-Identifier: BSD-3-Clause +# --------------------------------------------------------------------- +from qai_hub_models.models._shared.imagenet_classifier.app import ( # noqa: F401 + ImagenetClassifierApp as App, +) + +from .model import MODEL_ID # noqa: F401 +from .model import ConvNextTinyW8A16Quantizable as Model # noqa: F401 diff --git a/qai_hub_models/models/convnext_tiny_w8a16_quantized/conftest.py b/qai_hub_models/models/convnext_tiny_w8a16_quantized/conftest.py new file mode 100644 index 00000000..1f2c01d5 --- /dev/null +++ b/qai_hub_models/models/convnext_tiny_w8a16_quantized/conftest.py @@ -0,0 +1,39 @@ +# --------------------------------------------------------------------- +# Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. +# SPDX-License-Identifier: BSD-3-Clause +# --------------------------------------------------------------------- +# THIS FILE WAS AUTO-GENERATED. DO NOT EDIT MANUALLY. + +import inspect + +import pytest + +from qai_hub_models.models.convnext_tiny_w8a16_quantized import Model +from qai_hub_models.utils.testing import skip_clone_repo_check + + +# Instantiate the model only once for all tests. +# Mock from_pretrained to always return the initialized model. +# This speeds up tests and limits memory leaks. +@pytest.fixture(scope="module", autouse=True) +def cached_from_pretrained(): + with pytest.MonkeyPatch.context() as mp: + pretrained_cache = {} + from_pretrained = Model.from_pretrained + sig = inspect.signature(from_pretrained) + + @skip_clone_repo_check + def _cached_from_pretrained(*args, **kwargs): + cache_key = str(args) + str(kwargs) + model = pretrained_cache.get(cache_key, None) + if model: + return model + else: + model = from_pretrained(*args, **kwargs) + pretrained_cache[cache_key] = model + return model + + _cached_from_pretrained.__signature__ = sig + + mp.setattr(Model, "from_pretrained", _cached_from_pretrained) + yield mp diff --git a/qai_hub_models/models/convnext_tiny_w8a16_quantized/demo.py b/qai_hub_models/models/convnext_tiny_w8a16_quantized/demo.py new file mode 100644 index 00000000..927a3d6c --- /dev/null +++ b/qai_hub_models/models/convnext_tiny_w8a16_quantized/demo.py @@ -0,0 +1,17 @@ +# --------------------------------------------------------------------- +# Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. +# SPDX-License-Identifier: BSD-3-Clause +# --------------------------------------------------------------------- +from qai_hub_models.models._shared.imagenet_classifier.demo import imagenet_demo +from qai_hub_models.models.convnext_tiny_w8a16_quantized.model import ( + MODEL_ID, + ConvNextTinyW8A16Quantizable, +) + + +def main(is_test: bool = False): + imagenet_demo(ConvNextTinyW8A16Quantizable, MODEL_ID, is_test) + + +if __name__ == "__main__": + main() diff --git a/qai_hub_models/models/convnext_tiny_w8a16_quantized/export.py b/qai_hub_models/models/convnext_tiny_w8a16_quantized/export.py new file mode 100644 index 00000000..7dcb6b96 --- /dev/null +++ b/qai_hub_models/models/convnext_tiny_w8a16_quantized/export.py @@ -0,0 +1,227 @@ +# --------------------------------------------------------------------- +# Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. +# SPDX-License-Identifier: BSD-3-Clause +# --------------------------------------------------------------------- +# THIS FILE WAS AUTO-GENERATED. DO NOT EDIT MANUALLY. 
+ + +from __future__ import annotations + +import os +import warnings +from pathlib import Path +from typing import Any, Dict, List, Optional, Tuple, cast + +import qai_hub as hub + +from qai_hub_models.models.convnext_tiny_w8a16_quantized import Model +from qai_hub_models.utils.args import ( + export_parser, + get_input_spec_kwargs, + get_model_kwargs, +) +from qai_hub_models.utils.base_model import TargetRuntime +from qai_hub_models.utils.compare import torch_inference +from qai_hub_models.utils.printing import ( + print_inference_metrics, + print_on_target_demo_cmd, + print_profile_metrics_from_job, +) +from qai_hub_models.utils.qai_hub_helpers import ( + can_access_qualcomm_ai_hub, + export_without_hub_access, + transpose_channel_first_to_last, +) +from qai_hub_models.utils.qnn_helpers import get_qnn_inputs + + +def export_model( + device: str = "Samsung Galaxy S23", + chipset: Optional[str] = None, + skip_profiling: bool = False, + skip_inferencing: bool = False, + skip_downloading: bool = False, + skip_summary: bool = False, + output_dir: Optional[str] = None, + target_runtime: TargetRuntime = TargetRuntime.TFLITE, + compile_options: str = "", + profile_options: str = "", + **additional_model_kwargs, +) -> Tuple[hub.CompileJob, Optional[hub.ProfileJob], Optional[hub.InferenceJob]] | List[ + str +]: + """ + This function accomplishes 6 main tasks: + + 1. Instantiates a PyTorch model and converts it to a traced TorchScript format. + 2. Compiles the model to an asset that can be run on device. + 3. Profiles the model performance on real devices. + 4. Inferences the model on sample inputs. + 5. Downloads the model asset to the local directory. + 6. Summarizes the results from profiling and inference. + + Each of the last four steps can be optionally skipped using the input options. + + Parameters: + device: Device for which to export the model. + Full list of available devices can be found by running `hub.get_devices()`. + Defaults to DEFAULT_DEVICE if not specified. + chipset: If set, will choose a random device with this chipset. + Overrides the `device` argument. + skip_profiling: If set, skips profiling of compiled model on real devices. + skip_inferencing: If set, skips computing on-device outputs from sample data. + skip_downloading: If set, skips downloading of compiled model. + skip_summary: If set, skips waiting for and summarizing results + from profiling and inference. + output_dir: Directory to store generated assets (e.g. compiled model). + Defaults to `/build/`. + target_runtime: Which on-device runtime to target. Default is TFLite. + compile_options: Additional options to pass when submitting the compile job. + profile_options: Additional options to pass when submitting the profile job. + **additional_model_kwargs: Additional optional kwargs used to customize + `model_cls.from_pretrained` and `model.get_input_spec` + + Returns: + A 3-tuple of: + * A CompileJob object containing metadata about the compile job submitted to hub. + * A ProfileJob containing metadata about the profile job (None if profiling skipped). + * An InferenceJob containing metadata about the inference job (None if inferencing skipped). 
+ """ + model_name = "convnext_tiny_w8a16_quantized" + output_path = Path(output_dir or Path.cwd() / "build" / model_name) + if chipset: + hub_device = hub.Device(attributes=f"chipset:{chipset}") + else: + hub_device = hub.Device(name=device) + if not can_access_qualcomm_ai_hub(): + return export_without_hub_access( + "convnext_tiny_w8a16_quantized", + "ConvNext-Tiny-w8a16-Quantized", + device, + skip_profiling, + skip_inferencing, + skip_downloading, + skip_summary, + output_path, + target_runtime, + compile_options, + profile_options, + ) + + # 1. Initialize PyTorch model + model = Model.from_pretrained(**get_model_kwargs(Model, additional_model_kwargs)) + input_spec = model.get_input_spec( + **get_input_spec_kwargs(model, additional_model_kwargs) + ) + + # Trace the model + source_model = model.convert_to_hub_source_model( + target_runtime, output_path, input_spec + ) + if target_runtime == TargetRuntime.TFLITE: + quant_calibration_data = None + else: + quant_calibration_data = model.get_calibration_data(target_runtime, input_spec) + + # Convert outputs from channel last to channel first (preferred I/O format for QNN and TensorFlow Lite) + channel_last_flags = ( + " --force_channel_last_input image_tensor" + if target_runtime != TargetRuntime.ORT + else "" + ) + + # 2. Compile the model to an on-device asset + model_compile_options = model.get_hub_compile_options( + target_runtime, compile_options + channel_last_flags, hub_device + ) + print(f"Optimizing model {model_name} to run on-device") + submitted_compile_job = hub.submit_compile_job( + model=source_model, + input_specs=input_spec, + device=hub_device, + name=model_name, + calibration_data=quant_calibration_data, + options=model_compile_options, + ) + compile_job = cast(hub.client.CompileJob, submitted_compile_job) + + # 3. Profile the model asset on real devices + profile_job: Optional[hub.client.ProfileJob] = None + if not skip_profiling: + profile_options_all = model.get_hub_profile_options( + target_runtime, profile_options + ) + print(f"Profiling model {model_name} on a hosted device.") + submitted_profile_job = hub.submit_profile_job( + model=compile_job.get_target_model(), + device=hub_device, + name=model_name, + options=profile_options_all, + ) + profile_job = cast(hub.client.ProfileJob, submitted_profile_job) + + # 4. Run inference on-device with sample inputs + inference_job: Optional[hub.client.InferenceJob] = None + if not skip_inferencing: + profile_options_all = model.get_hub_profile_options( + target_runtime, profile_options + ) + print( + f"Running inference for {model_name} on a hosted device with example inputs." + ) + sample_inputs = model.sample_inputs(input_spec) + hub_inputs = sample_inputs + if target_runtime == TargetRuntime.QNN: + hub_inputs = get_qnn_inputs(compile_job, sample_inputs) + # Convert inputs from channel first to channel last + hub_inputs = ( + sample_inputs + if target_runtime == TargetRuntime.ORT + else transpose_channel_first_to_last( + "image_tensor", sample_inputs, target_runtime + ) + ) + submitted_inference_job = hub.submit_inference_job( + model=compile_job.get_target_model(), + inputs=hub_inputs, + device=hub_device, + name=model_name, + options=profile_options_all, + ) + inference_job = cast(hub.client.InferenceJob, submitted_inference_job) + + # 5. 
Download the model asset to a local file + if not skip_downloading: + os.makedirs(output_path, exist_ok=True) + target_model: hub.Model = compile_job.get_target_model() # type: ignore + target_model.download(str(output_path / f"{model_name}.tflite")) + + # 6. Summarize the results from profiling and inference + if not skip_summary and not skip_profiling: + assert profile_job is not None and profile_job.wait().success + profile_data: Dict[str, Any] = profile_job.download_profile() # type: ignore + print_profile_metrics_from_job(profile_job, profile_data) + + if not skip_summary and not skip_inferencing: + torch_out = torch_inference(model, sample_inputs) + assert inference_job is not None and inference_job.wait().success + inference_result: hub.client.DatasetEntries = inference_job.download_output_data() # type: ignore + print_inference_metrics( + inference_job, inference_result, torch_out, metrics="psnr,top1,top5" + ) + + if not skip_summary: + print_on_target_demo_cmd(compile_job, Path(__file__).parent.resolve(), device) + + return (compile_job, profile_job, inference_job) + + +def main(): + warnings.filterwarnings("ignore") + parser = export_parser(model_cls=Model, supports_ort=False) + args = parser.parse_args() + export_model(**vars(args)) + + +if __name__ == "__main__": + main() diff --git a/qai_hub_models/models/convnext_tiny_w8a16_quantized/info.yaml b/qai_hub_models/models/convnext_tiny_w8a16_quantized/info.yaml new file mode 100644 index 00000000..5370c05d --- /dev/null +++ b/qai_hub_models/models/convnext_tiny_w8a16_quantized/info.yaml @@ -0,0 +1,42 @@ +name: ConvNext-Tiny-w8a16-Quantized +# id must match with the model dir name in qai_hub_models +id: convnext_tiny_w8a16_quantized +status: public +headline: Imagenet classifier and general purpose backbone. +domain: Computer Vision +description: ConvNextTiny is a machine learning model that can classify images from + the Imagenet dataset. It can also be used as a backbone in building more complex + models for specific use cases. +use_case: Image Classification +tags: + - quantized +research_paper: https://arxiv.org/abs/2201.03545 +research_paper_title: A ConvNet for the 2020s +license: https://github.com/pytorch/vision/blob/main/LICENSE +deploy_license: https://qaihub-public-assets.s3.us-west-2.amazonaws.com/qai-hub-models/Qualcomm+AI+Hub+Proprietary+License.pdf +source_repo: https://github.com/pytorch/vision/blob/main/torchvision/models/convnext.py +technical_details: + Model checkpoint: Imagenet + Input resolution: 224x224 + Number of parameters: 28.6M + Model size: 28 MB + Precision: w8a16 (8-bit weights, 16-bit activations) +applicable_scenarios: + - Medical Imaging + - Anomaly Detection + - Inventory Management +form_factors: + - Phone + - Tablet + - IoT +related_models: + - mobilenet_v2 + - densenet121 + - googlenet +has_static_banner: yes +has_animated_banner: yes +license_type: bsd-3-clause +deploy_license_type: AI Model Hub License +dataset: + - imagenet-1k + - imagenet-22k diff --git a/qai_hub_models/models/convnext_tiny_w8a16_quantized/model.py b/qai_hub_models/models/convnext_tiny_w8a16_quantized/model.py new file mode 100644 index 00000000..3da97038 --- /dev/null +++ b/qai_hub_models/models/convnext_tiny_w8a16_quantized/model.py @@ -0,0 +1,37 @@ +# --------------------------------------------------------------------- +# Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. 
+# SPDX-License-Identifier: BSD-3-Clause +# --------------------------------------------------------------------- +from __future__ import annotations + +from pathlib import Path + +from aimet_torch.quantsim import QuantizationSimModel + +from qai_hub_models.models._shared.convnext_tiny_quantized.model import ( + ConvNextTinyQuantizableBase, +) +from qai_hub_models.utils.asset_loaders import CachedWebModelAsset + +MODEL_ID = __name__.split(".")[-2] +MODEL_ASSET_VERSION = 1 + +DEFAULT_ENCODINGS = "convnext_tiny_w8a16_quantized_encodings.json" + + +class ConvNextTinyW8A16Quantizable(ConvNextTinyQuantizableBase): + def __init__( + self, + quant_sim_model: QuantizationSimModel, + ) -> None: + ConvNextTinyQuantizableBase.__init__(self, quant_sim_model) + + @classmethod + def _default_aimet_encodings(cls) -> str | Path: + return CachedWebModelAsset.from_asset_store( + MODEL_ID, MODEL_ASSET_VERSION, DEFAULT_ENCODINGS + ).fetch() + + @classmethod + def _output_bw(cls) -> int: + return 16 diff --git a/qai_hub_models/models/convnext_tiny_w8a16_quantized/test.py b/qai_hub_models/models/convnext_tiny_w8a16_quantized/test.py new file mode 100644 index 00000000..2931e003 --- /dev/null +++ b/qai_hub_models/models/convnext_tiny_w8a16_quantized/test.py @@ -0,0 +1,31 @@ +# --------------------------------------------------------------------- +# Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. +# SPDX-License-Identifier: BSD-3-Clause +# --------------------------------------------------------------------- +from qai_hub_models.models._shared.imagenet_classifier.test_utils import ( + run_imagenet_classifier_test, +) +from qai_hub_models.models.convnext_tiny_w8a16_quantized.demo import main as demo_main +from qai_hub_models.models.convnext_tiny_w8a16_quantized.model import ( + MODEL_ASSET_VERSION, + MODEL_ID, + ConvNextTinyW8A16Quantizable, +) +from qai_hub_models.utils.testing import skip_clone_repo_check + + +@skip_clone_repo_check +def test_task(): + run_imagenet_classifier_test( + ConvNextTinyW8A16Quantizable.from_pretrained(), + MODEL_ID, + asset_version=MODEL_ASSET_VERSION, + probability_threshold=0.56, + diff_tol=0.06, + ) + + +@skip_clone_repo_check +def test_demo(): + # Verify demo does not crash + demo_main(is_test=True) diff --git a/qai_hub_models/models/convnext_tiny_w8a8_quantized/README.md b/qai_hub_models/models/convnext_tiny_w8a8_quantized/README.md new file mode 100644 index 00000000..3f86e66c --- /dev/null +++ b/qai_hub_models/models/convnext_tiny_w8a8_quantized/README.md @@ -0,0 +1,56 @@ +[![Qualcomm® AI Hub Models](https://qaihub-public-assets.s3.us-west-2.amazonaws.com/qai-hub-models/quic-logo.jpg)](../../README.md) + + +# [ConvNext-Tiny-w8a8-Quantized: Imagenet classifier and general purpose backbone](#) + +ConvNextTiny is a machine learning model that can classify images from the Imagenet dataset. It can also be used as a backbone in building more complex models for specific use cases. + +This is based on the implementation of ConvNext-Tiny-w8a8-Quantized found +[here](https://github.com/pytorch/vision/blob/main/torchvision/models/convnext.py). This repository contains scripts for optimized on-device +export suitable to run on Qualcomm® devices. More details on model performance +accross various devices, can be found [here](#). + +[Sign up](https://myaccount.qualcomm.com/signup) for early access to run these models on +a hosted Qualcomm® device. 
+ + + + +## Example & Usage + + +Once installed, run the following simple CLI demo: + +```bash +python -m qai_hub_models.models.convnext_tiny_w8a8_quantized.demo +``` +More details on the CLI tool can be found with the `--help` option. See +[demo.py](demo.py) for sample usage of the model including pre/post processing +scripts. Please refer to our [general instructions on using +models](../../../#getting-started) for more usage instructions. + +## Export for on-device deployment + +This repository contains export scripts that produce a model optimized for +on-device deployment. This can be run as follows: + +```bash +python -m qai_hub_models.models.convnext_tiny_w8a8_quantized.export +``` +Additional options are documented with the `--help` option. Note that the above +script requires access to Deployment instructions for Qualcomm® AI Hub. + +## License +- The license for the original implementation of ConvNext-Tiny-w8a8-Quantized can be found + [here](https://github.com/pytorch/vision/blob/main/LICENSE). +- The license for the compiled assets for on-device deployment can be found [here]({deploy_license_url}) + +## References +* [A ConvNet for the 2020s](https://arxiv.org/abs/2201.03545) +* [Source Model Implementation](https://github.com/pytorch/vision/blob/main/torchvision/models/convnext.py) + +## Community +* Join [our AI Hub Slack community](https://qualcomm-ai-hub.slack.com/join/shared_invite/zt-2d5zsmas3-Sj0Q9TzslueCjS31eXG2UA#/shared-invite/email) to collaborate, post questions and learn more about on-device AI. +* For questions or feedback please [reach out to us](mailto:ai-hub-support@qti.qualcomm.com). + + diff --git a/qai_hub_models/models/convnext_tiny_w8a8_quantized/__init__.py b/qai_hub_models/models/convnext_tiny_w8a8_quantized/__init__.py new file mode 100644 index 00000000..13778437 --- /dev/null +++ b/qai_hub_models/models/convnext_tiny_w8a8_quantized/__init__.py @@ -0,0 +1,10 @@ +# --------------------------------------------------------------------- +# Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. +# SPDX-License-Identifier: BSD-3-Clause +# --------------------------------------------------------------------- +from qai_hub_models.models._shared.imagenet_classifier.app import ( # noqa: F401 + ImagenetClassifierApp as App, +) + +from .model import MODEL_ID # noqa: F401 +from .model import ConvNextTinyW8A8Quantizable as Model # noqa: F401 diff --git a/qai_hub_models/models/convnext_tiny_w8a8_quantized/conftest.py b/qai_hub_models/models/convnext_tiny_w8a8_quantized/conftest.py new file mode 100644 index 00000000..e737cdbc --- /dev/null +++ b/qai_hub_models/models/convnext_tiny_w8a8_quantized/conftest.py @@ -0,0 +1,39 @@ +# --------------------------------------------------------------------- +# Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. +# SPDX-License-Identifier: BSD-3-Clause +# --------------------------------------------------------------------- +# THIS FILE WAS AUTO-GENERATED. DO NOT EDIT MANUALLY. + +import inspect + +import pytest + +from qai_hub_models.models.convnext_tiny_w8a8_quantized import Model +from qai_hub_models.utils.testing import skip_clone_repo_check + + +# Instantiate the model only once for all tests. +# Mock from_pretrained to always return the initialized model. +# This speeds up tests and limits memory leaks. 
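+# The cache key is str(args) + str(kwargs), so repeated calls with identical
+# arguments within this test module reuse a single model instance.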
+@pytest.fixture(scope="module", autouse=True) +def cached_from_pretrained(): + with pytest.MonkeyPatch.context() as mp: + pretrained_cache = {} + from_pretrained = Model.from_pretrained + sig = inspect.signature(from_pretrained) + + @skip_clone_repo_check + def _cached_from_pretrained(*args, **kwargs): + cache_key = str(args) + str(kwargs) + model = pretrained_cache.get(cache_key, None) + if model: + return model + else: + model = from_pretrained(*args, **kwargs) + pretrained_cache[cache_key] = model + return model + + _cached_from_pretrained.__signature__ = sig + + mp.setattr(Model, "from_pretrained", _cached_from_pretrained) + yield mp diff --git a/qai_hub_models/models/convnext_tiny_w8a8_quantized/demo.py b/qai_hub_models/models/convnext_tiny_w8a8_quantized/demo.py new file mode 100644 index 00000000..adc48957 --- /dev/null +++ b/qai_hub_models/models/convnext_tiny_w8a8_quantized/demo.py @@ -0,0 +1,17 @@ +# --------------------------------------------------------------------- +# Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. +# SPDX-License-Identifier: BSD-3-Clause +# --------------------------------------------------------------------- +from qai_hub_models.models._shared.imagenet_classifier.demo import imagenet_demo +from qai_hub_models.models.convnext_tiny_w8a8_quantized.model import ( + MODEL_ID, + ConvNextTinyW8A8Quantizable, +) + + +def main(is_test: bool = False): + imagenet_demo(ConvNextTinyW8A8Quantizable, MODEL_ID, is_test) + + +if __name__ == "__main__": + main() diff --git a/qai_hub_models/models/convnext_tiny_w8a8_quantized/export.py b/qai_hub_models/models/convnext_tiny_w8a8_quantized/export.py new file mode 100644 index 00000000..8d9cca73 --- /dev/null +++ b/qai_hub_models/models/convnext_tiny_w8a8_quantized/export.py @@ -0,0 +1,227 @@ +# --------------------------------------------------------------------- +# Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. +# SPDX-License-Identifier: BSD-3-Clause +# --------------------------------------------------------------------- +# THIS FILE WAS AUTO-GENERATED. DO NOT EDIT MANUALLY. + + +from __future__ import annotations + +import os +import warnings +from pathlib import Path +from typing import Any, Dict, List, Optional, Tuple, cast + +import qai_hub as hub + +from qai_hub_models.models.convnext_tiny_w8a8_quantized import Model +from qai_hub_models.utils.args import ( + export_parser, + get_input_spec_kwargs, + get_model_kwargs, +) +from qai_hub_models.utils.base_model import TargetRuntime +from qai_hub_models.utils.compare import torch_inference +from qai_hub_models.utils.printing import ( + print_inference_metrics, + print_on_target_demo_cmd, + print_profile_metrics_from_job, +) +from qai_hub_models.utils.qai_hub_helpers import ( + can_access_qualcomm_ai_hub, + export_without_hub_access, + transpose_channel_first_to_last, +) +from qai_hub_models.utils.qnn_helpers import get_qnn_inputs + + +def export_model( + device: str = "Samsung Galaxy S23", + chipset: Optional[str] = None, + skip_profiling: bool = False, + skip_inferencing: bool = False, + skip_downloading: bool = False, + skip_summary: bool = False, + output_dir: Optional[str] = None, + target_runtime: TargetRuntime = TargetRuntime.TFLITE, + compile_options: str = "", + profile_options: str = "", + **additional_model_kwargs, +) -> Tuple[hub.CompileJob, Optional[hub.ProfileJob], Optional[hub.InferenceJob]] | List[ + str +]: + """ + This function accomplishes 6 main tasks: + + 1. 
Instantiates a PyTorch model and converts it to a traced TorchScript format. + 2. Compiles the model to an asset that can be run on device. + 3. Profiles the model performance on real devices. + 4. Inferences the model on sample inputs. + 5. Downloads the model asset to the local directory. + 6. Summarizes the results from profiling and inference. + + Each of the last four steps can be optionally skipped using the input options. + + Parameters: + device: Device for which to export the model. + Full list of available devices can be found by running `hub.get_devices()`. + Defaults to DEFAULT_DEVICE if not specified. + chipset: If set, will choose a random device with this chipset. + Overrides the `device` argument. + skip_profiling: If set, skips profiling of compiled model on real devices. + skip_inferencing: If set, skips computing on-device outputs from sample data. + skip_downloading: If set, skips downloading of compiled model. + skip_summary: If set, skips waiting for and summarizing results + from profiling and inference. + output_dir: Directory to store generated assets (e.g. compiled model). + Defaults to `/build/`. + target_runtime: Which on-device runtime to target. Default is TFLite. + compile_options: Additional options to pass when submitting the compile job. + profile_options: Additional options to pass when submitting the profile job. + **additional_model_kwargs: Additional optional kwargs used to customize + `model_cls.from_pretrained` and `model.get_input_spec` + + Returns: + A 3-tuple of: + * A CompileJob object containing metadata about the compile job submitted to hub. + * A ProfileJob containing metadata about the profile job (None if profiling skipped). + * An InferenceJob containing metadata about the inference job (None if inferencing skipped). + """ + model_name = "convnext_tiny_w8a8_quantized" + output_path = Path(output_dir or Path.cwd() / "build" / model_name) + if chipset: + hub_device = hub.Device(attributes=f"chipset:{chipset}") + else: + hub_device = hub.Device(name=device) + if not can_access_qualcomm_ai_hub(): + return export_without_hub_access( + "convnext_tiny_w8a8_quantized", + "ConvNext-Tiny-w8a8-Quantized", + device, + skip_profiling, + skip_inferencing, + skip_downloading, + skip_summary, + output_path, + target_runtime, + compile_options, + profile_options, + ) + + # 1. Initialize PyTorch model + model = Model.from_pretrained(**get_model_kwargs(Model, additional_model_kwargs)) + input_spec = model.get_input_spec( + **get_input_spec_kwargs(model, additional_model_kwargs) + ) + + # Trace the model + source_model = model.convert_to_hub_source_model( + target_runtime, output_path, input_spec + ) + if target_runtime == TargetRuntime.TFLITE: + quant_calibration_data = None + else: + quant_calibration_data = model.get_calibration_data(target_runtime, input_spec) + + # Convert outputs from channel last to channel first (preferred I/O format for QNN and TensorFlow Lite) + channel_last_flags = ( + " --force_channel_last_input image_tensor" + if target_runtime != TargetRuntime.ORT + else "" + ) + + # 2. 
Compile the model to an on-device asset + model_compile_options = model.get_hub_compile_options( + target_runtime, compile_options + channel_last_flags, hub_device + ) + print(f"Optimizing model {model_name} to run on-device") + submitted_compile_job = hub.submit_compile_job( + model=source_model, + input_specs=input_spec, + device=hub_device, + name=model_name, + calibration_data=quant_calibration_data, + options=model_compile_options, + ) + compile_job = cast(hub.client.CompileJob, submitted_compile_job) + + # 3. Profile the model asset on real devices + profile_job: Optional[hub.client.ProfileJob] = None + if not skip_profiling: + profile_options_all = model.get_hub_profile_options( + target_runtime, profile_options + ) + print(f"Profiling model {model_name} on a hosted device.") + submitted_profile_job = hub.submit_profile_job( + model=compile_job.get_target_model(), + device=hub_device, + name=model_name, + options=profile_options_all, + ) + profile_job = cast(hub.client.ProfileJob, submitted_profile_job) + + # 4. Run inference on-device with sample inputs + inference_job: Optional[hub.client.InferenceJob] = None + if not skip_inferencing: + profile_options_all = model.get_hub_profile_options( + target_runtime, profile_options + ) + print( + f"Running inference for {model_name} on a hosted device with example inputs." + ) + sample_inputs = model.sample_inputs(input_spec) + hub_inputs = sample_inputs + if target_runtime == TargetRuntime.QNN: + hub_inputs = get_qnn_inputs(compile_job, sample_inputs) + # Convert inputs from channel first to channel last + hub_inputs = ( + sample_inputs + if target_runtime == TargetRuntime.ORT + else transpose_channel_first_to_last( + "image_tensor", sample_inputs, target_runtime + ) + ) + submitted_inference_job = hub.submit_inference_job( + model=compile_job.get_target_model(), + inputs=hub_inputs, + device=hub_device, + name=model_name, + options=profile_options_all, + ) + inference_job = cast(hub.client.InferenceJob, submitted_inference_job) + + # 5. Download the model asset to a local file + if not skip_downloading: + os.makedirs(output_path, exist_ok=True) + target_model: hub.Model = compile_job.get_target_model() # type: ignore + target_model.download(str(output_path / f"{model_name}.tflite")) + + # 6. 
Summarize the results from profiling and inference + if not skip_summary and not skip_profiling: + assert profile_job is not None and profile_job.wait().success + profile_data: Dict[str, Any] = profile_job.download_profile() # type: ignore + print_profile_metrics_from_job(profile_job, profile_data) + + if not skip_summary and not skip_inferencing: + torch_out = torch_inference(model, sample_inputs) + assert inference_job is not None and inference_job.wait().success + inference_result: hub.client.DatasetEntries = inference_job.download_output_data() # type: ignore + print_inference_metrics( + inference_job, inference_result, torch_out, metrics="psnr,top1,top5" + ) + + if not skip_summary: + print_on_target_demo_cmd(compile_job, Path(__file__).parent.resolve(), device) + + return (compile_job, profile_job, inference_job) + + +def main(): + warnings.filterwarnings("ignore") + parser = export_parser(model_cls=Model, supports_ort=False) + args = parser.parse_args() + export_model(**vars(args)) + + +if __name__ == "__main__": + main() diff --git a/qai_hub_models/models/convnext_tiny_w8a8_quantized/info.yaml b/qai_hub_models/models/convnext_tiny_w8a8_quantized/info.yaml new file mode 100644 index 00000000..b3770255 --- /dev/null +++ b/qai_hub_models/models/convnext_tiny_w8a8_quantized/info.yaml @@ -0,0 +1,42 @@ +name: ConvNext-Tiny-w8a8-Quantized +# id must match with the model dir name in qai_hub_models +id: convnext_tiny_w8a8_quantized +status: public +headline: Imagenet classifier and general purpose backbone. +domain: Computer Vision +description: ConvNextTiny is a machine learning model that can classify images from + the Imagenet dataset. It can also be used as a backbone in building more complex + models for specific use cases. +use_case: Image Classification +tags: + - quantized +research_paper: https://arxiv.org/abs/2201.03545 +research_paper_title: A ConvNet for the 2020s +license: https://github.com/pytorch/vision/blob/main/LICENSE +deploy_license: https://qaihub-public-assets.s3.us-west-2.amazonaws.com/qai-hub-models/Qualcomm+AI+Hub+Proprietary+License.pdf +source_repo: https://github.com/pytorch/vision/blob/main/torchvision/models/convnext.py +technical_details: + Model checkpoint: Imagenet + Input resolution: 224x224 + Number of parameters: 28.6M + Model size: 28 MB + Precision: w8a8 (8-bit weights, 8-bit activations) +applicable_scenarios: + - Medical Imaging + - Anomaly Detection + - Inventory Management +form_factors: + - Phone + - Tablet + - IoT +related_models: + - mobilenet_v2 + - densenet121 + - googlenet +has_static_banner: yes +has_animated_banner: yes +license_type: bsd-3-clause +deploy_license_type: AI Model Hub License +dataset: + - imagenet-1k + - imagenet-22k diff --git a/qai_hub_models/models/convnext_tiny_w8a8_quantized/model.py b/qai_hub_models/models/convnext_tiny_w8a8_quantized/model.py new file mode 100644 index 00000000..5e332910 --- /dev/null +++ b/qai_hub_models/models/convnext_tiny_w8a8_quantized/model.py @@ -0,0 +1,37 @@ +# --------------------------------------------------------------------- +# Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. 
+# SPDX-License-Identifier: BSD-3-Clause +# --------------------------------------------------------------------- +from __future__ import annotations + +from pathlib import Path + +from aimet_torch.quantsim import QuantizationSimModel + +from qai_hub_models.models._shared.convnext_tiny_quantized.model import ( + ConvNextTinyQuantizableBase, +) +from qai_hub_models.utils.asset_loaders import CachedWebModelAsset + +MODEL_ID = __name__.split(".")[-2] +MODEL_ASSET_VERSION = 1 + +DEFAULT_ENCODINGS = "convnext_tiny_w8a8_quantized_encodings.json" + + +class ConvNextTinyW8A8Quantizable(ConvNextTinyQuantizableBase): + def __init__( + self, + quant_sim_model: QuantizationSimModel, + ) -> None: + ConvNextTinyQuantizableBase.__init__(self, quant_sim_model) + + @classmethod + def _default_aimet_encodings(cls) -> str | Path: + return CachedWebModelAsset.from_asset_store( + MODEL_ID, MODEL_ASSET_VERSION, DEFAULT_ENCODINGS + ).fetch() + + @classmethod + def _output_bw(cls) -> int: + return 8 diff --git a/qai_hub_models/models/convnext_tiny_w8a8_quantized/test.py b/qai_hub_models/models/convnext_tiny_w8a8_quantized/test.py new file mode 100644 index 00000000..b7fedd53 --- /dev/null +++ b/qai_hub_models/models/convnext_tiny_w8a8_quantized/test.py @@ -0,0 +1,31 @@ +# --------------------------------------------------------------------- +# Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. +# SPDX-License-Identifier: BSD-3-Clause +# --------------------------------------------------------------------- +from qai_hub_models.models._shared.imagenet_classifier.test_utils import ( + run_imagenet_classifier_test, +) +from qai_hub_models.models.convnext_tiny_w8a8_quantized.demo import main as demo_main +from qai_hub_models.models.convnext_tiny_w8a8_quantized.model import ( + MODEL_ASSET_VERSION, + MODEL_ID, + ConvNextTinyW8A8Quantizable, +) +from qai_hub_models.utils.testing import skip_clone_repo_check + + +@skip_clone_repo_check +def test_task(): + run_imagenet_classifier_test( + ConvNextTinyW8A8Quantizable.from_pretrained(), + MODEL_ID, + asset_version=MODEL_ASSET_VERSION, + probability_threshold=0.56, + diff_tol=0.06, + ) + + +@skip_clone_repo_check +def test_demo(): + # Verify demo does not crash + demo_main(is_test=True) diff --git a/qai_hub_models/models/ddrnet23_slim/README.md b/qai_hub_models/models/ddrnet23_slim/README.md index fab4f087..22b47996 100644 --- a/qai_hub_models/models/ddrnet23_slim/README.md +++ b/qai_hub_models/models/ddrnet23_slim/README.md @@ -14,6 +14,8 @@ accross various devices, can be found [here](https://aihub.qualcomm.com/models/d a hosted Qualcomm® device. + + ## Example & Usage diff --git a/qai_hub_models/models/ddrnet23_slim/export.py b/qai_hub_models/models/ddrnet23_slim/export.py index 2e17af1d..da2e4cdc 100644 --- a/qai_hub_models/models/ddrnet23_slim/export.py +++ b/qai_hub_models/models/ddrnet23_slim/export.py @@ -120,12 +120,16 @@ def export_model( model.eval() source_model = torch.jit.trace(model.to("cpu"), make_torch_inputs(input_spec)) + # Convert outputs from channel last to channel first (preferred I/O format for QNN and TensorFlow Lite) + channel_last_flags = ( + " --force_channel_last_input image" + " --force_channel_last_output output_0" + if target_runtime != TargetRuntime.ORT + else "" + ) + # 2. 
Compile the model to an on-device asset model_compile_options = model.get_hub_compile_options( - target_runtime, - compile_options - + " --force_channel_last_input image" - + " --force_channel_last_output output_0", + target_runtime, compile_options + channel_last_flags, hub_device ) print(f"Optimizing model {model_name} to run on-device") submitted_compile_job = hub.submit_compile_job( @@ -163,8 +167,10 @@ def export_model( ) sample_inputs = model.sample_inputs(input_spec) # Convert inputs from channel first to channel last - hub_inputs = transpose_channel_first_to_last( - "image", sample_inputs, target_runtime + hub_inputs = ( + sample_inputs + if target_runtime == TargetRuntime.ORT + else transpose_channel_first_to_last("image", sample_inputs, target_runtime) ) submitted_inference_job = hub.submit_inference_job( model=compile_job.get_target_model(), @@ -192,8 +198,12 @@ def export_model( assert inference_job is not None and inference_job.wait().success inference_result: hub.client.DatasetEntries = inference_job.download_output_data() # type: ignore # Convert outputs from channel last to channel first - inference_result = transpose_channel_last_to_first( - "output_0", inference_result, target_runtime + inference_result = ( + inference_result + if target_runtime == TargetRuntime.ORT + else transpose_channel_last_to_first( + "output_0", inference_result, target_runtime + ) ) print_inference_metrics(inference_job, inference_result, torch_out) @@ -205,7 +215,7 @@ def export_model( def main(): warnings.filterwarnings("ignore") - parser = export_parser(model_cls=Model, supports_qnn=False, supports_ort=False) + parser = export_parser(model_cls=Model, supports_qnn=False) args = parser.parse_args() export_model(**vars(args)) diff --git a/qai_hub_models/models/ddrnet23_slim/perf.yaml b/qai_hub_models/models/ddrnet23_slim/perf.yaml index bd99b239..8c0b1142 100644 --- a/qai_hub_models/models/ddrnet23_slim/perf.yaml +++ b/qai_hub_models/models/ddrnet23_slim/perf.yaml @@ -22,6 +22,7 @@ aggregated: - Samsung Galaxy S24 Ultra - Samsung Galaxy S24+ - Samsung Galaxy Tab S8 + - Snapdragon X Elite CRD - Xiaomi 12 - Xiaomi 12 Pro supported_chipsets: @@ -30,15 +31,16 @@ aggregated: - Snapdragon® 8 Gen 2 - Snapdragon® 8 Gen 3 - Snapdragon® 888 + - Snapdragon® X Elite models: - name: DDRNet23-Slim performance_metrics: - torchscript_onnx_tflite: - inference_time: 6651.0 - throughput: 150.35333032626673 + inference_time: 6617.0 + throughput: 151.1258878645912 estimated_peak_memory_range: - min: 1007616 - max: 2683032 + min: 212992 + max: 2249480 primary_compute_unit: NPU precision: fp16 layer_info: @@ -46,23 +48,23 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 131 - job_id: j0pxndrl5 + job_id: jogk3mj25 job_status: Passed torchscript_onnx_ort: - inference_time: 'null' - throughput: 'null' + inference_time: 9555.0 + throughput: 104.65724751439038 estimated_peak_memory_range: - min: 0 - max: 0 - primary_compute_unit: 'null' - precision: 'null' + min: 11808768 + max: 48661000 + primary_compute_unit: NPU + precision: fp16 layer_info: - layers_on_npu: 0 + layers_on_npu: 155 layers_on_gpu: 0 layers_on_cpu: 0 - total_layers: 0 - job_id: jegnl7qq5 - job_status: Failed + total_layers: 155 + job_id: jw56nlk0g + job_status: Passed reference_device_info: name: Samsung Galaxy S23 os: '13' @@ -70,13 +72,13 @@ models: os_name: Android manufacturer: Samsung chipset: Snapdragon® 8 Gen 2 - timestamp: '2024-04-23T18:42:31.356614Z' + timestamp: '2024-05-20T16:35:28.530240Z' - torchscript_onnx_tflite: - inference_time: 
4569.0 - throughput: 218.8662727073758 + inference_time: 4661.0 + throughput: 214.5462347135808 estimated_peak_memory_range: - min: 16384 - max: 71802832 + min: 40960 + max: 74706752 primary_compute_unit: NPU precision: fp16 layer_info: @@ -84,23 +86,23 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 131 - job_id: jo5mqdk9p + job_id: jn5q3oj4p job_status: Passed torchscript_onnx_ort: - inference_time: 'null' - throughput: 'null' + inference_time: 6060.0 + throughput: 165.01650165016503 estimated_peak_memory_range: - min: 0 - max: 0 - primary_compute_unit: 'null' - precision: 'null' + min: 2203648 + max: 39944288 + primary_compute_unit: NPU + precision: fp16 layer_info: - layers_on_npu: 0 + layers_on_npu: 155 layers_on_gpu: 0 layers_on_cpu: 0 - total_layers: 0 - job_id: jopr8nd75 - job_status: Failed + total_layers: 155 + job_id: j1p3e2yl5 + job_status: Passed reference_device_info: name: Samsung Galaxy S24 os: '14' @@ -108,13 +110,13 @@ models: os_name: Android manufacturer: Samsung chipset: Snapdragon® 8 Gen 3 - timestamp: '2024-04-23T18:42:31.356678Z' + timestamp: '2024-05-20T16:35:28.530261Z' - torchscript_onnx_tflite: - inference_time: 6682.0 - throughput: 149.655791679138 + inference_time: 6700.0 + throughput: 149.2537313432836 estimated_peak_memory_range: - min: 1011712 - max: 3063152 + min: 1019904 + max: 2922360 primary_compute_unit: NPU precision: fp16 layer_info: @@ -122,7 +124,7 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 131 - job_id: jnp1y108p + job_id: j1gl3rj8g job_status: Passed reference_device_info: name: QCS8550 (Proxy) @@ -131,4 +133,42 @@ models: os_name: Android manufacturer: Qualcomm chipset: Qcs8550 - timestamp: '2024-04-23T18:42:31.356703Z' + timestamp: '2024-05-20T16:35:28.530273Z' + - torchscript_onnx_ort: + inference_time: 9528.0 + throughput: 104.95382031905962 + estimated_peak_memory_range: + min: 9854976 + max: 9854976 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 155 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 155 + job_id: jwgo3qjxg + job_status: Passed + torchscript_onnx_ort_dml_gpu: + inference_time: 38162.0 + throughput: 26.20407735443635 + estimated_peak_memory_range: + min: 104890368 + max: 104890368 + primary_compute_unit: CPU + precision: fp32 + layer_info: + layers_on_npu: 0 + layers_on_gpu: 0 + layers_on_cpu: 105 + total_layers: 105 + job_id: j1pvvxjjp + job_status: Passed + reference_device_info: + name: Snapdragon X Elite CRD + os: '11' + form_factor: Compute + os_name: Windows + manufacturer: Qualcomm + chipset: Snapdragon® X Elite + timestamp: '2024-05-20T16:35:28.530292Z' diff --git a/qai_hub_models/models/deeplabv3_plus_mobilenet/README.md b/qai_hub_models/models/deeplabv3_plus_mobilenet/README.md index 175a743c..49ab4d78 100644 --- a/qai_hub_models/models/deeplabv3_plus_mobilenet/README.md +++ b/qai_hub_models/models/deeplabv3_plus_mobilenet/README.md @@ -14,6 +14,8 @@ accross various devices, can be found [here](https://aihub.qualcomm.com/models/d a hosted Qualcomm® device. 
+ + ## Example & Usage diff --git a/qai_hub_models/models/deeplabv3_plus_mobilenet/export.py b/qai_hub_models/models/deeplabv3_plus_mobilenet/export.py index 42105651..0f531913 100644 --- a/qai_hub_models/models/deeplabv3_plus_mobilenet/export.py +++ b/qai_hub_models/models/deeplabv3_plus_mobilenet/export.py @@ -120,12 +120,16 @@ def export_model( model.eval() source_model = torch.jit.trace(model.to("cpu"), make_torch_inputs(input_spec)) + # Convert outputs from channel last to channel first (preferred I/O format for QNN and TensorFlow Lite) + channel_last_flags = ( + " --force_channel_last_input image" + " --force_channel_last_output output_0" + if target_runtime != TargetRuntime.ORT + else "" + ) + # 2. Compile the model to an on-device asset model_compile_options = model.get_hub_compile_options( - target_runtime, - compile_options - + " --force_channel_last_input image" - + " --force_channel_last_output output_0", + target_runtime, compile_options + channel_last_flags, hub_device ) print(f"Optimizing model {model_name} to run on-device") submitted_compile_job = hub.submit_compile_job( @@ -163,8 +167,10 @@ def export_model( ) sample_inputs = model.sample_inputs(input_spec) # Convert inputs from channel first to channel last - hub_inputs = transpose_channel_first_to_last( - "image", sample_inputs, target_runtime + hub_inputs = ( + sample_inputs + if target_runtime == TargetRuntime.ORT + else transpose_channel_first_to_last("image", sample_inputs, target_runtime) ) submitted_inference_job = hub.submit_inference_job( model=compile_job.get_target_model(), @@ -192,8 +198,12 @@ def export_model( assert inference_job is not None and inference_job.wait().success inference_result: hub.client.DatasetEntries = inference_job.download_output_data() # type: ignore # Convert outputs from channel last to channel first - inference_result = transpose_channel_last_to_first( - "output_0", inference_result, target_runtime + inference_result = ( + inference_result + if target_runtime == TargetRuntime.ORT + else transpose_channel_last_to_first( + "output_0", inference_result, target_runtime + ) ) print_inference_metrics(inference_job, inference_result, torch_out) @@ -205,7 +215,7 @@ def export_model( def main(): warnings.filterwarnings("ignore") - parser = export_parser(model_cls=Model, supports_ort=False) + parser = export_parser(model_cls=Model) args = parser.parse_args() export_model(**vars(args)) diff --git a/qai_hub_models/models/deeplabv3_plus_mobilenet/perf.yaml b/qai_hub_models/models/deeplabv3_plus_mobilenet/perf.yaml index b2d5c52e..6a911252 100644 --- a/qai_hub_models/models/deeplabv3_plus_mobilenet/perf.yaml +++ b/qai_hub_models/models/deeplabv3_plus_mobilenet/perf.yaml @@ -22,6 +22,7 @@ aggregated: - Samsung Galaxy S24 Ultra - Samsung Galaxy S24+ - Samsung Galaxy Tab S8 + - Snapdragon X Elite CRD - Xiaomi 12 - Xiaomi 12 Pro supported_chipsets: @@ -30,15 +31,16 @@ aggregated: - Snapdragon® 8 Gen 2 - Snapdragon® 8 Gen 3 - Snapdragon® 888 + - Snapdragon® X Elite models: - name: DeepLabV3-Plus-MobileNet performance_metrics: - torchscript_onnx_tflite: - inference_time: 13206.0 - throughput: 75.72315614114797 + inference_time: 13090.0 + throughput: 76.39419404125286 estimated_peak_memory_range: - min: 21012480 - max: 37177032 + min: 20566016 + max: 22394640 primary_compute_unit: NPU precision: fp16 layer_info: @@ -46,14 +48,14 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 98 - job_id: jz57xlxvg + job_id: j7gje4jx5 job_status: Passed torchscript_onnx_qnn: - inference_time: 12804.0 - 
throughput: 78.10059356451109 + inference_time: 12915.0 + throughput: 77.42934572202864 estimated_peak_memory_range: - min: 1888256 - max: 20259784 + min: 2191360 + max: 18354728 primary_compute_unit: NPU precision: fp16 layer_info: @@ -61,7 +63,22 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 124 - job_id: jo5m363dg + job_id: jz5wqno65 + job_status: Passed + torchscript_onnx_ort: + inference_time: 18188.0 + throughput: 54.98130635583902 + estimated_peak_memory_range: + min: 46182400 + max: 80384080 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 126 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 126 + job_id: jz57x3zlg job_status: Passed reference_device_info: name: Samsung Galaxy S23 @@ -70,13 +87,13 @@ models: os_name: Android manufacturer: Samsung chipset: Snapdragon® 8 Gen 2 - timestamp: '2024-04-30T00:18:21.085559Z' + timestamp: '2024-05-20T16:35:28.552289Z' - torchscript_onnx_tflite: - inference_time: 9587.0 - throughput: 104.30791697089809 + inference_time: 9659.0 + throughput: 103.53038616834041 estimated_peak_memory_range: - min: 49152 - max: 66968480 + min: 45056 + max: 67351648 primary_compute_unit: NPU precision: fp16 layer_info: @@ -84,14 +101,14 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 98 - job_id: jqp4vdv8p + job_id: jlpek3j1p job_status: Passed torchscript_onnx_qnn: - inference_time: 9430.0 - throughput: 106.04453870625663 + inference_time: 9460.0 + throughput: 105.70824524312897 estimated_peak_memory_range: - min: 3174400 - max: 56418592 + min: 3194880 + max: 56272272 primary_compute_unit: NPU precision: fp16 layer_info: @@ -99,7 +116,22 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 124 - job_id: jegn3m3k5 + job_id: jmg9wevlp + job_status: Passed + torchscript_onnx_ort: + inference_time: 14020.0 + throughput: 71.32667617689016 + estimated_peak_memory_range: + min: 51036160 + max: 82578320 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 126 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 126 + job_id: jqp4v0qvp job_status: Passed reference_device_info: name: Samsung Galaxy S24 @@ -108,13 +140,13 @@ models: os_name: Android manufacturer: Samsung chipset: Snapdragon® 8 Gen 3 - timestamp: '2024-04-30T00:18:21.085754Z' + timestamp: '2024-05-20T16:35:28.552316Z' - torchscript_onnx_tflite: - inference_time: 13237.0 - throughput: 75.54581853894386 + inference_time: 13152.0 + throughput: 76.03406326034063 estimated_peak_memory_range: - min: 22167552 - max: 24453856 + min: 19988480 + max: 21886552 primary_compute_unit: NPU precision: fp16 layer_info: @@ -122,14 +154,14 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 98 - job_id: j0pxy6y3g + job_id: jygzrk1k5 job_status: Passed torchscript_onnx_qnn: - inference_time: 12986.0 - throughput: 77.00600646850454 + inference_time: 12898.0 + throughput: 77.53140021708792 estimated_peak_memory_range: - min: 3194880 - max: 26458984 + min: 3207168 + max: 24171704 primary_compute_unit: NPU precision: fp16 layer_info: @@ -137,7 +169,7 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 124 - job_id: jopre2e05 + job_id: jvgdolwep job_status: Passed reference_device_info: name: QCS8550 (Proxy) @@ -146,4 +178,57 @@ models: os_name: Android manufacturer: Qualcomm chipset: Qcs8550 - timestamp: '2024-04-30T00:18:21.085930Z' + timestamp: '2024-05-20T16:35:28.552334Z' + - torchscript_onnx_qnn: + inference_time: 16530.0 + throughput: 60.49606775559589 + estimated_peak_memory_range: + min: 3170304 + max: 3170304 + 
primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 124 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 124 + job_id: jnp1ex02g + job_status: Passed + torchscript_onnx_ort: + inference_time: 16738.0 + throughput: 59.7442944198829 + estimated_peak_memory_range: + min: 107229184 + max: 107229184 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 126 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 126 + job_id: j0pxy2v1g + job_status: Passed + torchscript_onnx_ort_dml_gpu: + inference_time: 192375.0 + throughput: 5.198180636777128 + estimated_peak_memory_range: + min: 387981312 + max: 387981312 + primary_compute_unit: GPU + precision: fp16 + layer_info: + layers_on_npu: 0 + layers_on_gpu: 1 + layers_on_cpu: 0 + total_layers: 1 + job_id: jo5m3yrwg + job_status: Passed + reference_device_info: + name: Snapdragon X Elite CRD + os: '11' + form_factor: Compute + os_name: Windows + manufacturer: Qualcomm + chipset: Snapdragon® X Elite + timestamp: '2024-05-20T16:35:28.552356Z' diff --git a/qai_hub_models/models/deeplabv3_plus_mobilenet_quantized/README.md b/qai_hub_models/models/deeplabv3_plus_mobilenet_quantized/README.md index 82b1c2b3..4595e199 100644 --- a/qai_hub_models/models/deeplabv3_plus_mobilenet_quantized/README.md +++ b/qai_hub_models/models/deeplabv3_plus_mobilenet_quantized/README.md @@ -14,6 +14,8 @@ accross various devices, can be found [here](https://aihub.qualcomm.com/models/d a hosted Qualcomm® device. + + ## Example & Usage diff --git a/qai_hub_models/models/deeplabv3_plus_mobilenet_quantized/export.py b/qai_hub_models/models/deeplabv3_plus_mobilenet_quantized/export.py index 9ba9af15..a4dc4b5c 100644 --- a/qai_hub_models/models/deeplabv3_plus_mobilenet_quantized/export.py +++ b/qai_hub_models/models/deeplabv3_plus_mobilenet_quantized/export.py @@ -124,12 +124,16 @@ def export_model( else: quant_calibration_data = model.get_calibration_data(target_runtime, input_spec) + # Convert outputs from channel last to channel first (preferred I/O format for QNN and TensorFlow Lite) + channel_last_flags = ( + " --force_channel_last_input image" + " --force_channel_last_output output_0" + if target_runtime != TargetRuntime.ORT + else "" + ) + # 2. 
Compile the model to an on-device asset model_compile_options = model.get_hub_compile_options( - target_runtime, - compile_options - + " --force_channel_last_input image" - + " --force_channel_last_output output_0", + target_runtime, compile_options + channel_last_flags, hub_device ) print(f"Optimizing model {model_name} to run on-device") submitted_compile_job = hub.submit_compile_job( @@ -171,8 +175,10 @@ def export_model( if target_runtime == TargetRuntime.QNN: hub_inputs = get_qnn_inputs(compile_job, sample_inputs) # Convert inputs from channel first to channel last - hub_inputs = transpose_channel_first_to_last( - "image", sample_inputs, target_runtime + hub_inputs = ( + sample_inputs + if target_runtime == TargetRuntime.ORT + else transpose_channel_first_to_last("image", sample_inputs, target_runtime) ) submitted_inference_job = hub.submit_inference_job( model=compile_job.get_target_model(), @@ -200,8 +206,12 @@ def export_model( assert inference_job is not None and inference_job.wait().success inference_result: hub.client.DatasetEntries = inference_job.download_output_data() # type: ignore # Convert outputs from channel last to channel first - inference_result = transpose_channel_last_to_first( - "output_0", inference_result, target_runtime + inference_result = ( + inference_result + if target_runtime == TargetRuntime.ORT + else transpose_channel_last_to_first( + "output_0", inference_result, target_runtime + ) ) print_inference_metrics(inference_job, inference_result, torch_out) @@ -213,7 +223,7 @@ def export_model( def main(): warnings.filterwarnings("ignore") - parser = export_parser(model_cls=Model, supports_ort=False) + parser = export_parser(model_cls=Model) args = parser.parse_args() export_model(**vars(args)) diff --git a/qai_hub_models/models/deeplabv3_plus_mobilenet_quantized/perf.yaml b/qai_hub_models/models/deeplabv3_plus_mobilenet_quantized/perf.yaml index 71558a21..37b1f507 100644 --- a/qai_hub_models/models/deeplabv3_plus_mobilenet_quantized/perf.yaml +++ b/qai_hub_models/models/deeplabv3_plus_mobilenet_quantized/perf.yaml @@ -26,6 +26,7 @@ aggregated: - Samsung Galaxy S24 Ultra - Samsung Galaxy S24+ - Samsung Galaxy Tab S8 + - Snapdragon X Elite CRD - Xiaomi 12 - Xiaomi 12 Pro supported_chipsets: @@ -36,15 +37,16 @@ aggregated: - Snapdragon® 8 Gen 2 - Snapdragon® 8 Gen 3 - Snapdragon® 888 + - Snapdragon® X Elite models: - name: DeepLabV3-Plus-MobileNet-Quantized performance_metrics: - torchscript_onnx_tflite: - inference_time: 3523.0 - throughput: 283.84899233607723 + inference_time: 3349.0 + throughput: 298.59659599880564 estimated_peak_memory_range: min: 12288 - max: 2061128 + max: 1753112 primary_compute_unit: NPU precision: int8 layer_info: @@ -52,14 +54,14 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 99 - job_id: j2p0l2l9p + job_id: jegn382r5 job_status: Passed torchscript_onnx_qnn: - inference_time: 5308.0 - throughput: 188.39487565938205 + inference_time: 5370.0 + throughput: 186.21973929236498 estimated_peak_memory_range: min: 806912 - max: 9579664 + max: 8194984 primary_compute_unit: NPU precision: int8 layer_info: @@ -67,7 +69,22 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 100 - job_id: jw56nzn6g + job_id: jqpy60e75 + job_status: Passed + torchscript_onnx_ort: + inference_time: 18506.0 + throughput: 54.03652869339674 + estimated_peak_memory_range: + min: 102789120 + max: 122435512 + primary_compute_unit: NPU + precision: int8 + layer_info: + layers_on_npu: 122 + layers_on_gpu: 0 + layers_on_cpu: 51 + total_layers: 173 + job_id: 
jn5q3o84p job_status: Passed reference_device_info: name: Samsung Galaxy S23 @@ -76,13 +93,13 @@ models: os_name: Android manufacturer: Samsung chipset: Snapdragon® 8 Gen 2 - timestamp: '2024-04-30T00:18:21.431467Z' + timestamp: '2024-05-20T16:35:28.582800Z' - torchscript_onnx_tflite: - inference_time: 2623.0 - throughput: 381.2428516965307 + inference_time: 2567.0 + throughput: 389.5597974289053 estimated_peak_memory_range: min: 12288 - max: 58004960 + max: 57529696 primary_compute_unit: NPU precision: int8 layer_info: @@ -90,14 +107,14 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 99 - job_id: j1p8zmzkp + job_id: joprejk95 job_status: Passed torchscript_onnx_qnn: - inference_time: 3894.0 - throughput: 256.8053415511043 + inference_time: 3971.0 + throughput: 251.82573659027952 estimated_peak_memory_range: - min: 802816 - max: 58260400 + min: 962560 + max: 56354464 primary_compute_unit: NPU precision: int8 layer_info: @@ -105,7 +122,22 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 100 - job_id: j1p3e1e35 + job_id: j2p0l7y6p + job_status: Passed + torchscript_onnx_ort: + inference_time: 13687.0 + throughput: 73.06202966318405 + estimated_peak_memory_range: + min: 80236544 + max: 138756336 + primary_compute_unit: NPU + precision: int8 + layer_info: + layers_on_npu: 122 + layers_on_gpu: 0 + layers_on_cpu: 51 + total_layers: 173 + job_id: j1gl3rn8g job_status: Passed reference_device_info: name: Samsung Galaxy S24 @@ -114,13 +146,13 @@ models: os_name: Android manufacturer: Samsung chipset: Snapdragon® 8 Gen 3 - timestamp: '2024-04-30T00:18:21.431668Z' + timestamp: '2024-05-20T16:35:28.582825Z' - torchscript_onnx_tflite: - inference_time: 15123.0 - throughput: 66.12444620776301 + inference_time: 3337.0 + throughput: 299.6703626011387 estimated_peak_memory_range: - min: 40960 - max: 41498720 + min: 12288 + max: 2058944 primary_compute_unit: NPU precision: int8 layer_info: @@ -128,14 +160,14 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 99 - job_id: jn5q3r3np + job_id: jep2ln84g job_status: Passed torchscript_onnx_qnn: - inference_time: 19868.0 - throughput: 50.33219247030401 + inference_time: 5351.0 + throughput: 186.88095683049897 estimated_peak_memory_range: - min: 802816 - max: 50369568 + min: 0 + max: 6063744 primary_compute_unit: NPU precision: int8 layer_info: @@ -143,75 +175,83 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 100 - job_id: j1pvvrwkp + job_id: jogk3mz25 job_status: Passed reference_device_info: - name: RB3 Gen 2 (Proxy) + name: QCS8550 (Proxy) os: '12' form_factor: Iot os_name: Android manufacturer: Qualcomm - chipset: Qcs6490 - timestamp: '2024-04-30T00:18:21.431848Z' + chipset: Qcs8550 + timestamp: '2024-05-20T16:35:28.582842Z' - torchscript_onnx_tflite: - inference_time: 124831.0 - throughput: 8.010830643029376 + inference_time: 15025.0 + throughput: 66.55574043261231 estimated_peak_memory_range: - min: 11464704 - max: 28637488 + min: 5541888 + max: 47370848 primary_compute_unit: NPU precision: int8 layer_info: - layers_on_npu: 96 - layers_on_gpu: 3 + layers_on_npu: 99 + layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 99 - job_id: j1gl323jg + job_id: jnp1e1wkg job_status: Passed torchscript_onnx_qnn: - inference_time: 'null' - throughput: 'null' + inference_time: 20149.0 + throughput: 49.63025460320611 estimated_peak_memory_range: - min: 0 - max: 0 - primary_compute_unit: 'null' - precision: 'null' + min: 12288 + max: 49872128 + primary_compute_unit: NPU + precision: int8 layer_info: - layers_on_npu: 0 + layers_on_npu: 
100 layers_on_gpu: 0 layers_on_cpu: 0 - total_layers: 0 - job_id: j7gje2lv5 - job_status: Failed + total_layers: 100 + job_id: jegn3q3v5 + job_status: Passed reference_device_info: - name: RB5 (Proxy) + name: RB3 Gen 2 (Proxy) os: '12' form_factor: Iot os_name: Android manufacturer: Qualcomm - chipset: Qcs8250 - timestamp: '2024-04-30T00:18:21.432019Z' + chipset: Qcs6490 + timestamp: '2024-05-20T16:35:28.582857Z' - torchscript_onnx_tflite: - inference_time: 3534.0 - throughput: 282.9654782116582 + inference_time: 125926.0 + throughput: 7.941171799310706 estimated_peak_memory_range: - min: 12288 - max: 17568944 + min: 11571200 + max: 17936624 primary_compute_unit: NPU precision: int8 layer_info: - layers_on_npu: 99 - layers_on_gpu: 0 + layers_on_npu: 96 + layers_on_gpu: 3 layers_on_cpu: 0 total_layers: 99 - job_id: jogk3q3w5 + job_id: jvgdo4qkp job_status: Passed - torchscript_onnx_qnn: - inference_time: 5297.0 - throughput: 188.78610534264678 + reference_device_info: + name: RB5 (Proxy) + os: '12' + form_factor: Iot + os_name: Android + manufacturer: Qualcomm + chipset: Qcs8250 + timestamp: '2024-05-20T16:35:28.582868Z' + - torchscript_onnx_qnn: + inference_time: 5343.0 + throughput: 187.16077110237694 estimated_peak_memory_range: - min: 831488 - max: 14169232 + min: 790528 + max: 790528 primary_compute_unit: NPU precision: int8 layer_info: @@ -219,13 +259,43 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 100 - job_id: jwgo3n3qg + job_id: j1p8zvoxp + job_status: Passed + torchscript_onnx_ort: + inference_time: 50376.0 + throughput: 19.850722566301414 + estimated_peak_memory_range: + min: 130891776 + max: 130891776 + primary_compute_unit: NPU + precision: int8 + layer_info: + layers_on_npu: 122 + layers_on_gpu: 0 + layers_on_cpu: 51 + total_layers: 173 + job_id: jw56nl60g + job_status: Passed + torchscript_onnx_ort_dml_gpu: + inference_time: 472873.0 + throughput: 2.114732708359327 + estimated_peak_memory_range: + min: 248066048 + max: 248066048 + primary_compute_unit: GPU + precision: int8 + layer_info: + layers_on_npu: 0 + layers_on_gpu: 1 + layers_on_cpu: 0 + total_layers: 1 + job_id: j1p3e2kl5 job_status: Passed reference_device_info: - name: QCS8550 (Proxy) - os: '12' - form_factor: Iot - os_name: Android + name: Snapdragon X Elite CRD + os: '11' + form_factor: Compute + os_name: Windows manufacturer: Qualcomm - chipset: Qcs8550 - timestamp: '2024-04-30T00:18:21.432205Z' + chipset: Snapdragon® X Elite + timestamp: '2024-05-20T16:35:28.582890Z' diff --git a/qai_hub_models/models/deeplabv3_plus_mobilenet_quantized/test.py b/qai_hub_models/models/deeplabv3_plus_mobilenet_quantized/test.py index b2203267..999eebf7 100644 --- a/qai_hub_models/models/deeplabv3_plus_mobilenet_quantized/test.py +++ b/qai_hub_models/models/deeplabv3_plus_mobilenet_quantized/test.py @@ -3,7 +3,6 @@ # SPDX-License-Identifier: BSD-3-Clause # --------------------------------------------------------------------- import os -import tempfile import zipfile import torch @@ -23,6 +22,7 @@ CachedWebModelAsset, load_image, load_numpy, + qaihm_temp_dir, ) from qai_hub_models.utils.testing import skip_clone_repo_check @@ -48,7 +48,7 @@ def test_task(): def test_aimet_export(): model = DeepLabV3PlusMobilenetQuantizable.from_pretrained() name = model.__class__.__name__ - with tempfile.TemporaryDirectory() as tmpdir: + with qaihm_temp_dir() as tmpdir: output_zip = model.convert_to_onnx_and_aimet_encodings( tmpdir, ) diff --git a/qai_hub_models/models/deeplabv3_resnet50/README.md 
b/qai_hub_models/models/deeplabv3_resnet50/README.md index ab4e87f0..1fec3ba3 100644 --- a/qai_hub_models/models/deeplabv3_resnet50/README.md +++ b/qai_hub_models/models/deeplabv3_resnet50/README.md @@ -14,6 +14,8 @@ accross various devices, can be found [here](https://aihub.qualcomm.com/models/d a hosted Qualcomm® device. + + ## Example & Usage diff --git a/qai_hub_models/models/deeplabv3_resnet50/export.py b/qai_hub_models/models/deeplabv3_resnet50/export.py index 3f35964c..0e6e2e19 100644 --- a/qai_hub_models/models/deeplabv3_resnet50/export.py +++ b/qai_hub_models/models/deeplabv3_resnet50/export.py @@ -120,12 +120,17 @@ def export_model( model.eval() source_model = torch.jit.trace(model.to("cpu"), make_torch_inputs(input_spec)) + # Convert outputs from channel last to channel first (preferred I/O format for QNN and TensorFlow Lite) + channel_last_flags = ( + " --force_channel_last_input image" + + " --force_channel_last_output output_0,output_1" + if target_runtime != TargetRuntime.ORT + else "" + ) + # 2. Compile the model to an on-device asset model_compile_options = model.get_hub_compile_options( - target_runtime, - compile_options - + " --force_channel_last_input image" - + " --force_channel_last_output output_0,output_1", + target_runtime, compile_options + channel_last_flags, hub_device ) print(f"Optimizing model {model_name} to run on-device") submitted_compile_job = hub.submit_compile_job( @@ -163,8 +168,10 @@ def export_model( ) sample_inputs = model.sample_inputs(input_spec) # Convert inputs from channel first to channel last - hub_inputs = transpose_channel_first_to_last( - "image", sample_inputs, target_runtime + hub_inputs = ( + sample_inputs + if target_runtime == TargetRuntime.ORT + else transpose_channel_first_to_last("image", sample_inputs, target_runtime) ) submitted_inference_job = hub.submit_inference_job( model=compile_job.get_target_model(), @@ -192,8 +199,12 @@ def export_model( assert inference_job is not None and inference_job.wait().success inference_result: hub.client.DatasetEntries = inference_job.download_output_data() # type: ignore # Convert outputs from channel last to channel first - inference_result = transpose_channel_last_to_first( - "output_0,output_1", inference_result, target_runtime + inference_result = ( + inference_result + if target_runtime == TargetRuntime.ORT + else transpose_channel_last_to_first( + "output_0,output_1", inference_result, target_runtime + ) ) print_inference_metrics(inference_job, inference_result, torch_out) @@ -205,7 +216,7 @@ def export_model( def main(): warnings.filterwarnings("ignore") - parser = export_parser(model_cls=Model, supports_ort=False) + parser = export_parser(model_cls=Model, supports_qnn=False, supports_ort=False) args = parser.parse_args() export_model(**vars(args)) diff --git a/qai_hub_models/models/deeplabv3_resnet50/model.py b/qai_hub_models/models/deeplabv3_resnet50/model.py index 190f6e20..9dc8cdb7 100644 --- a/qai_hub_models/models/deeplabv3_resnet50/model.py +++ b/qai_hub_models/models/deeplabv3_resnet50/model.py @@ -4,7 +4,10 @@ # --------------------------------------------------------------------- from __future__ import annotations +from typing import Optional + import torchvision.models as tv_models +from qai_hub.client import Device from qai_hub_models.models._shared.deeplab.model import DeepLabV3Model from qai_hub_models.utils.base_model import TargetRuntime @@ -23,20 +26,36 @@ def from_pretrained(cls, weights: str = DEFAULT_WEIGHTS) -> DeepLabV3_ResNet50: return cls(model) def 
get_hub_compile_options( - self, target_runtime: TargetRuntime, other_compile_options: str = "" + self, + target_runtime: TargetRuntime, + other_compile_options: str = "", + device: Optional[Device] = None, ) -> str: compile_options = super().get_hub_compile_options( - target_runtime, other_compile_options + target_runtime, other_compile_options, device ) - return compile_options + " --compute_unit gpu" + if ( + target_runtime == TargetRuntime.TFLITE + and "--compute_unit" not in compile_options + ): + compile_options = compile_options + " --compute_unit gpu" + return compile_options def get_hub_profile_options( - self, target_runtime: TargetRuntime, other_profile_options: str = "" + self, + target_runtime: TargetRuntime, + other_profile_options: str = "", ) -> str: profile_options = super().get_hub_profile_options( - target_runtime, other_profile_options + target_runtime, + other_profile_options, ) - return profile_options + " --compute_unit gpu" + if ( + target_runtime == TargetRuntime.TFLITE + and "--compute_unit" not in profile_options + ): + profile_options = profile_options + " --compute_unit gpu" + return profile_options def forward(self, image): return super().forward(image)["out"] diff --git a/qai_hub_models/models/deeplabv3_resnet50/perf.yaml b/qai_hub_models/models/deeplabv3_resnet50/perf.yaml index 30be88e6..972d9c32 100644 --- a/qai_hub_models/models/deeplabv3_resnet50/perf.yaml +++ b/qai_hub_models/models/deeplabv3_resnet50/perf.yaml @@ -22,6 +22,7 @@ aggregated: - Samsung Galaxy S24 Ultra - Samsung Galaxy S24+ - Samsung Galaxy Tab S8 + - Snapdragon X Elite CRD - Xiaomi 12 - Xiaomi 12 Pro supported_chipsets: @@ -30,15 +31,16 @@ aggregated: - Snapdragon® 8 Gen 2 - Snapdragon® 8 Gen 3 - Snapdragon® 888 + - Snapdragon® X Elite models: - name: DeepLabV3-ResNet50 performance_metrics: - torchscript_onnx_tflite: - inference_time: 290847.0 - throughput: 3.4382338480369405 + inference_time: 295509.0 + throughput: 3.383991688916412 estimated_peak_memory_range: - min: 32768 - max: 223952912 + min: 12288 + max: 211050624 primary_compute_unit: GPU precision: fp16 layer_info: @@ -46,23 +48,38 @@ models: layers_on_gpu: 95 layers_on_cpu: 0 total_layers: 95 - job_id: jz5wq3935 + job_id: jwgo3qyxg job_status: Passed torchscript_onnx_qnn: - inference_time: 810711.0 - throughput: 1.23348517535842 + inference_time: 'null' + throughput: 'null' estimated_peak_memory_range: - min: 3481600 - max: 11830488 - primary_compute_unit: GPU - precision: fp16 + min: 0 + max: 0 + primary_compute_unit: 'null' + precision: 'null' layer_info: layers_on_npu: 0 - layers_on_gpu: 83 + layers_on_gpu: 0 layers_on_cpu: 0 - total_layers: 83 - job_id: jvgdoqvrp - job_status: Passed + total_layers: 0 + job_id: jlpek391p + job_status: Failed + torchscript_onnx_ort: + inference_time: 'null' + throughput: 'null' + estimated_peak_memory_range: + min: 0 + max: 0 + primary_compute_unit: 'null' + precision: 'null' + layer_info: + layers_on_npu: 0 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 0 + job_id: jmg9we1lp + job_status: Failed reference_device_info: name: Samsung Galaxy S23 os: '13' @@ -70,13 +87,13 @@ models: os_name: Android manufacturer: Samsung chipset: Snapdragon® 8 Gen 2 - timestamp: '2024-04-30T00:18:21.450422Z' + timestamp: '2024-05-20T16:35:28.622222Z' - torchscript_onnx_tflite: - inference_time: 228363.0 - throughput: 4.37899309432789 + inference_time: 227563.0 + throughput: 4.394387488299944 estimated_peak_memory_range: - min: 102400 - max: 31114256 + min: 69632 + max: 30257776 primary_compute_unit: GPU 
precision: fp16 layer_info: @@ -84,23 +101,38 @@ models: layers_on_gpu: 95 layers_on_cpu: 0 total_layers: 95 - job_id: jmg9wy4wp + job_id: j1pvvx3jp job_status: Passed torchscript_onnx_qnn: - inference_time: 588856.0 - throughput: 1.6982080508647275 + inference_time: 'null' + throughput: 'null' estimated_peak_memory_range: - min: 3207168 - max: 37364864 - primary_compute_unit: GPU - precision: fp16 + min: 0 + max: 0 + primary_compute_unit: 'null' + precision: 'null' layer_info: layers_on_npu: 0 - layers_on_gpu: 83 + layers_on_gpu: 0 layers_on_cpu: 0 - total_layers: 83 - job_id: jz57xldvg - job_status: Passed + total_layers: 0 + job_id: jygzrkek5 + job_status: Failed + torchscript_onnx_ort: + inference_time: 'null' + throughput: 'null' + estimated_peak_memory_range: + min: 0 + max: 0 + primary_compute_unit: 'null' + precision: 'null' + layer_info: + layers_on_npu: 0 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 0 + job_id: jnp1exl2g + job_status: Failed reference_device_info: name: Samsung Galaxy S24 os: '14' @@ -108,8 +140,23 @@ models: os_name: Android manufacturer: Samsung chipset: Snapdragon® 8 Gen 3 - timestamp: '2024-04-30T00:18:21.450461Z' + timestamp: '2024-05-20T16:35:28.622250Z' - torchscript_onnx_tflite: + inference_time: 292688.0 + throughput: 3.416607445470945 + estimated_peak_memory_range: + min: 1380352 + max: 149690448 + primary_compute_unit: GPU + precision: fp16 + layer_info: + layers_on_npu: 0 + layers_on_gpu: 95 + layers_on_cpu: 0 + total_layers: 95 + job_id: j7gje4xx5 + job_status: Passed + torchscript_onnx_qnn: inference_time: 'null' throughput: 'null' estimated_peak_memory_range: @@ -122,23 +169,8 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 0 - job_id: jnp1ew88g + job_id: jz5wqnv65 job_status: Failed - torchscript_onnx_qnn: - inference_time: 821173.0 - throughput: 1.217770189716418 - estimated_peak_memory_range: - min: 3436544 - max: 12462344 - primary_compute_unit: GPU - precision: fp16 - layer_info: - layers_on_npu: 0 - layers_on_gpu: 83 - layers_on_cpu: 0 - total_layers: 83 - job_id: jqp4vdw8p - job_status: Passed reference_device_info: name: QCS8550 (Proxy) os: '12' @@ -146,4 +178,12 @@ models: os_name: Android manufacturer: Qualcomm chipset: Qcs8550 - timestamp: '2024-04-30T00:18:21.450490Z' + timestamp: '2024-05-20T16:35:28.622267Z' + - reference_device_info: + name: Snapdragon X Elite CRD + os: '11' + form_factor: Compute + os_name: Windows + manufacturer: Qualcomm + chipset: Snapdragon® X Elite + timestamp: '2024-05-20T16:35:28.622274Z' diff --git a/qai_hub_models/models/densenet121/README.md b/qai_hub_models/models/densenet121/README.md index d0d9ab9e..1f95a118 100644 --- a/qai_hub_models/models/densenet121/README.md +++ b/qai_hub_models/models/densenet121/README.md @@ -14,6 +14,8 @@ accross various devices, can be found [here](https://aihub.qualcomm.com/models/d a hosted Qualcomm® device. + + ## Example & Usage diff --git a/qai_hub_models/models/densenet121/export.py b/qai_hub_models/models/densenet121/export.py index 3fcb6bf7..341660ea 100644 --- a/qai_hub_models/models/densenet121/export.py +++ b/qai_hub_models/models/densenet121/export.py @@ -119,9 +119,16 @@ def export_model( model.eval() source_model = torch.jit.trace(model.to("cpu"), make_torch_inputs(input_spec)) + # Convert outputs from channel last to channel first (preferred I/O format for QNN and TensorFlow Lite) + channel_last_flags = ( + " --force_channel_last_input image_tensor" + if target_runtime != TargetRuntime.ORT + else "" + ) + # 2. 
Compile the model to an on-device asset model_compile_options = model.get_hub_compile_options( - target_runtime, compile_options + " --force_channel_last_input image_tensor" + target_runtime, compile_options + channel_last_flags, hub_device ) print(f"Optimizing model {model_name} to run on-device") submitted_compile_job = hub.submit_compile_job( @@ -159,8 +166,12 @@ def export_model( ) sample_inputs = model.sample_inputs(input_spec) # Convert inputs from channel first to channel last - hub_inputs = transpose_channel_first_to_last( - "image_tensor", sample_inputs, target_runtime + hub_inputs = ( + sample_inputs + if target_runtime == TargetRuntime.ORT + else transpose_channel_first_to_last( + "image_tensor", sample_inputs, target_runtime + ) ) submitted_inference_job = hub.submit_inference_job( model=compile_job.get_target_model(), @@ -197,7 +208,7 @@ def export_model( def main(): warnings.filterwarnings("ignore") - parser = export_parser(model_cls=Model, supports_ort=False) + parser = export_parser(model_cls=Model) args = parser.parse_args() export_model(**vars(args)) diff --git a/qai_hub_models/models/densenet121/perf.yaml b/qai_hub_models/models/densenet121/perf.yaml index 97bd9840..29060c6f 100644 --- a/qai_hub_models/models/densenet121/perf.yaml +++ b/qai_hub_models/models/densenet121/perf.yaml @@ -22,6 +22,7 @@ aggregated: - Samsung Galaxy S24 Ultra - Samsung Galaxy S24+ - Samsung Galaxy Tab S8 + - Snapdragon X Elite CRD - Xiaomi 12 - Xiaomi 12 Pro supported_chipsets: @@ -30,15 +31,16 @@ aggregated: - Snapdragon® 8 Gen 2 - Snapdragon® 8 Gen 3 - Snapdragon® 888 + - Snapdragon® X Elite models: - name: DenseNet-121 performance_metrics: - torchscript_onnx_tflite: - inference_time: 1945.0 - throughput: 514.1388174807198 + inference_time: 1948.0 + throughput: 513.347022587269 estimated_peak_memory_range: min: 16384 - max: 2306688 + max: 2162632 primary_compute_unit: NPU precision: fp16 layer_info: @@ -46,14 +48,14 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 312 - job_id: jvgdezmz5 + job_id: jvgdol9ep job_status: Passed torchscript_onnx_qnn: - inference_time: 2005.0 - throughput: 498.75311720698255 + inference_time: 1981.0 + throughput: 504.79555779909134 estimated_peak_memory_range: min: 12288 - max: 40807680 + max: 18832024 primary_compute_unit: NPU precision: fp16 layer_info: @@ -61,23 +63,23 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 372 - job_id: jqp4k921g + job_id: jnp1exl8g job_status: Passed torchscript_onnx_ort: - inference_time: 'null' - throughput: 'null' + inference_time: 1971.0 + throughput: 507.35667174023337 estimated_peak_memory_range: - min: 0 - max: 0 - primary_compute_unit: 'null' - precision: 'null' + min: 12288 + max: 46477632 + primary_compute_unit: NPU + precision: fp16 layer_info: - layers_on_npu: 0 + layers_on_npu: 374 layers_on_gpu: 0 layers_on_cpu: 0 - total_layers: 0 - job_id: jo5mqdl9p - job_status: Failed + total_layers: 374 + job_id: j0pxy2j3g + job_status: Passed reference_device_info: name: Samsung Galaxy S23 os: '13' @@ -85,13 +87,13 @@ models: os_name: Android manufacturer: Samsung chipset: Snapdragon® 8 Gen 2 - timestamp: '2024-04-23T18:42:31.432252Z' + timestamp: '2024-05-20T16:35:28.650430Z' - torchscript_onnx_tflite: - inference_time: 1282.0 - throughput: 780.0312012480499 + inference_time: 1321.0 + throughput: 757.002271006813 estimated_peak_memory_range: - min: 12288 - max: 95228096 + min: 16384 + max: 95688784 primary_compute_unit: NPU precision: fp16 layer_info: @@ -99,14 +101,14 @@ models: layers_on_gpu: 0 
layers_on_cpu: 0 total_layers: 312 - job_id: jz570789g + job_id: jz5wqnv35 job_status: Passed torchscript_onnx_qnn: - inference_time: 1330.0 - throughput: 751.8796992481203 + inference_time: 1319.0 + throughput: 758.1501137225171 estimated_peak_memory_range: min: 618496 - max: 155690704 + max: 162806592 primary_compute_unit: NPU precision: fp16 layer_info: @@ -114,23 +116,23 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 372 - job_id: j0pxndzl5 + job_id: jvgdol9rp job_status: Passed torchscript_onnx_ort: - inference_time: 'null' - throughput: 'null' + inference_time: 1355.0 + throughput: 738.0073800738007 estimated_peak_memory_range: - min: 0 - max: 0 - primary_compute_unit: 'null' - precision: 'null' + min: 618496 + max: 49577808 + primary_compute_unit: NPU + precision: fp16 layer_info: - layers_on_npu: 0 + layers_on_npu: 374 layers_on_gpu: 0 layers_on_cpu: 0 - total_layers: 0 - job_id: jegnl7wq5 - job_status: Failed + total_layers: 374 + job_id: jo5m3y2dg + job_status: Passed reference_device_info: name: Samsung Galaxy S24 os: '14' @@ -138,13 +140,13 @@ models: os_name: Android manufacturer: Samsung chipset: Snapdragon® 8 Gen 3 - timestamp: '2024-04-23T18:42:31.432349Z' + timestamp: '2024-05-20T16:35:28.650456Z' - torchscript_onnx_tflite: - inference_time: 1944.0 - throughput: 514.40329218107 + inference_time: 1948.0 + throughput: 513.347022587269 estimated_peak_memory_range: - min: 20480 - max: 2194800 + min: 28672 + max: 2603520 primary_compute_unit: NPU precision: fp16 layer_info: @@ -152,14 +154,14 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 312 - job_id: j0pxnrjl5 + job_id: jmg9we1wp job_status: Passed torchscript_onnx_qnn: - inference_time: 2008.0 - throughput: 498.00796812749 + inference_time: 1983.0 + throughput: 504.2864346949067 estimated_peak_memory_range: - min: 12288 - max: 40726728 + min: 622592 + max: 6049752 primary_compute_unit: NPU precision: fp16 layer_info: @@ -167,7 +169,7 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 372 - job_id: jep20d6qg + job_id: jqp4v0o8p job_status: Passed reference_device_info: name: QCS8550 (Proxy) @@ -176,4 +178,57 @@ models: os_name: Android manufacturer: Qualcomm chipset: Qcs8550 - timestamp: '2024-04-23T18:42:31.432464Z' + timestamp: '2024-05-20T16:35:28.650472Z' + - torchscript_onnx_qnn: + inference_time: 2255.0 + throughput: 443.4589800443459 + estimated_peak_memory_range: + min: 602112 + max: 602112 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 372 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 372 + job_id: jz57x3wvg + job_status: Passed + torchscript_onnx_ort: + inference_time: 2014.0 + throughput: 496.52432969215494 + estimated_peak_memory_range: + min: 606208 + max: 606208 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 374 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 374 + job_id: jegn38yk5 + job_status: Passed + torchscript_onnx_ort_dml_gpu: + inference_time: 17596.0 + throughput: 56.83109797681291 + estimated_peak_memory_range: + min: 856064 + max: 856064 + primary_compute_unit: CPU + precision: fp32 + layer_info: + layers_on_npu: 0 + layers_on_gpu: 0 + layers_on_cpu: 311 + total_layers: 311 + job_id: joprejq05 + job_status: Passed + reference_device_info: + name: Snapdragon X Elite CRD + os: '11' + form_factor: Compute + os_name: Windows + manufacturer: Qualcomm + chipset: Snapdragon® X Elite + timestamp: '2024-05-20T16:35:28.650494Z' diff --git a/qai_hub_models/models/detr_resnet101/README.md 
b/qai_hub_models/models/detr_resnet101/README.md index 6ed10970..adaff2fc 100644 --- a/qai_hub_models/models/detr_resnet101/README.md +++ b/qai_hub_models/models/detr_resnet101/README.md @@ -14,6 +14,8 @@ accross various devices, can be found [here](https://aihub.qualcomm.com/models/d a hosted Qualcomm® device. + + ## Example & Usage Install the package via pip: diff --git a/qai_hub_models/models/detr_resnet101/export.py b/qai_hub_models/models/detr_resnet101/export.py index 334955f6..ff0acdce 100644 --- a/qai_hub_models/models/detr_resnet101/export.py +++ b/qai_hub_models/models/detr_resnet101/export.py @@ -119,9 +119,16 @@ def export_model( model.eval() source_model = torch.jit.trace(model.to("cpu"), make_torch_inputs(input_spec)) + # Convert outputs from channel last to channel first (preferred I/O format for QNN and TensorFlow Lite) + channel_last_flags = ( + " --force_channel_last_input image" + if target_runtime != TargetRuntime.ORT + else "" + ) + # 2. Compile the model to an on-device asset model_compile_options = model.get_hub_compile_options( - target_runtime, compile_options + " --force_channel_last_input image" + target_runtime, compile_options + channel_last_flags, hub_device ) print(f"Optimizing model {model_name} to run on-device") submitted_compile_job = hub.submit_compile_job( @@ -159,8 +166,10 @@ def export_model( ) sample_inputs = model.sample_inputs(input_spec) # Convert inputs from channel first to channel last - hub_inputs = transpose_channel_first_to_last( - "image", sample_inputs, target_runtime + hub_inputs = ( + sample_inputs + if target_runtime == TargetRuntime.ORT + else transpose_channel_first_to_last("image", sample_inputs, target_runtime) ) submitted_inference_job = hub.submit_inference_job( model=compile_job.get_target_model(), @@ -197,7 +206,7 @@ def export_model( def main(): warnings.filterwarnings("ignore") - parser = export_parser(model_cls=Model, supports_qnn=False, supports_ort=False) + parser = export_parser(model_cls=Model, supports_qnn=False) args = parser.parse_args() export_model(**vars(args)) diff --git a/qai_hub_models/models/detr_resnet101/perf.yaml b/qai_hub_models/models/detr_resnet101/perf.yaml index 41a76998..5f60a399 100644 --- a/qai_hub_models/models/detr_resnet101/perf.yaml +++ b/qai_hub_models/models/detr_resnet101/perf.yaml @@ -22,6 +22,7 @@ aggregated: - Samsung Galaxy S24 Ultra - Samsung Galaxy S24+ - Samsung Galaxy Tab S8 + - Snapdragon X Elite CRD - Xiaomi 12 - Xiaomi 12 Pro supported_chipsets: @@ -30,38 +31,54 @@ aggregated: - Snapdragon® 8 Gen 2 - Snapdragon® 8 Gen 3 - Snapdragon® 888 + - Snapdragon® X Elite models: - name: DETR-ResNet101 performance_metrics: - torchscript_onnx_tflite: - inference_time: 47978.0 - throughput: 20.842886322897996 + inference_time: 24664.0 + throughput: 40.5449237755433 estimated_peak_memory_range: - min: 94208 - max: 9060976 + min: 438272 + max: 3728248 primary_compute_unit: NPU precision: fp16 layer_info: - layers_on_npu: 910 - layers_on_gpu: 2 + layers_on_npu: 839 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 839 + job_id: jep2ln6rg + job_status: Passed + torchscript_onnx_qnn: + inference_time: 21040.0 + throughput: 47.52851711026616 + estimated_peak_memory_range: + min: 2801664 + max: 31885224 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 1084 + layers_on_gpu: 0 layers_on_cpu: 0 - total_layers: 912 - job_id: jep20vzqg + total_layers: 1084 + job_id: j1p8zv9kp job_status: Passed torchscript_onnx_ort: - inference_time: 26243.0 - throughput: 38.105399535114124 + 
inference_time: 22542.0 + throughput: 44.36163605713779 estimated_peak_memory_range: - min: 0 - max: 299546600 - primary_compute_unit: CPU - precision: fp32 + min: 16384 + max: 296984832 + primary_compute_unit: NPU + precision: fp16 layer_info: - layers_on_npu: 3 + layers_on_npu: 856 layers_on_gpu: 0 - layers_on_cpu: 5 - total_layers: 8 - job_id: j2p03vxnp + layers_on_cpu: 0 + total_layers: 856 + job_id: jw56nlj6g job_status: Passed reference_device_info: name: Samsung Galaxy S23 @@ -70,36 +87,51 @@ models: os_name: Android manufacturer: Samsung chipset: Snapdragon® 8 Gen 2 - timestamp: '2024-04-23T18:42:31.456092Z' + timestamp: '2024-05-20T16:35:28.680620Z' - torchscript_onnx_tflite: - inference_time: 35573.0 - throughput: 28.111207938605123 + inference_time: 17307.0 + throughput: 57.78008898133703 estimated_peak_memory_range: - min: 28672 - max: 261178736 + min: 106496 + max: 282048208 primary_compute_unit: NPU precision: fp16 layer_info: - layers_on_npu: 910 - layers_on_gpu: 2 + layers_on_npu: 839 + layers_on_gpu: 0 layers_on_cpu: 0 - total_layers: 912 - job_id: jqpyr7yl5 + total_layers: 839 + job_id: jqpy60w85 + job_status: Passed + torchscript_onnx_qnn: + inference_time: 15126.0 + throughput: 66.11133148221606 + estimated_peak_memory_range: + min: 2797568 + max: 330730224 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 1084 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 1084 + job_id: jogk3mnw5 job_status: Passed torchscript_onnx_ort: - inference_time: 19779.0 - throughput: 50.558673340411545 + inference_time: 15844.0 + throughput: 63.11537490532694 estimated_peak_memory_range: - min: 3723264 - max: 90043392 - primary_compute_unit: CPU - precision: fp32 + min: 2781184 + max: 113431904 + primary_compute_unit: NPU + precision: fp16 layer_info: - layers_on_npu: 3 + layers_on_npu: 856 layers_on_gpu: 0 - layers_on_cpu: 5 - total_layers: 8 - job_id: j1p804kog + layers_on_cpu: 0 + total_layers: 856 + job_id: j1p3e2335 job_status: Passed reference_device_info: name: Samsung Galaxy S24 @@ -108,21 +140,36 @@ models: os_name: Android manufacturer: Samsung chipset: Snapdragon® 8 Gen 3 - timestamp: '2024-04-23T18:42:31.456228Z' + timestamp: '2024-05-20T16:35:28.680646Z' - torchscript_onnx_tflite: - inference_time: 48057.0 - throughput: 20.80862309340991 + inference_time: 24760.0 + throughput: 40.38772213247173 estimated_peak_memory_range: - min: 1380352 - max: 12433288 + min: 405504 + max: 3265984 primary_compute_unit: NPU precision: fp16 layer_info: - layers_on_npu: 910 - layers_on_gpu: 2 + layers_on_npu: 839 + layers_on_gpu: 0 layers_on_cpu: 0 - total_layers: 912 - job_id: j1gl68rmg + total_layers: 839 + job_id: j2p0l7q9p + job_status: Passed + torchscript_onnx_qnn: + inference_time: 21118.0 + throughput: 47.35296903115825 + estimated_peak_memory_range: + min: 2813952 + max: 31273000 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 1084 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 1084 + job_id: j1gl3rzjg job_status: Passed reference_device_info: name: QCS8550 (Proxy) @@ -131,4 +178,57 @@ models: os_name: Android manufacturer: Qualcomm chipset: Qcs8550 - timestamp: '2024-04-23T18:42:31.456328Z' + timestamp: '2024-05-20T16:35:28.680664Z' + - torchscript_onnx_qnn: + inference_time: 31213.0 + throughput: 32.03793291256848 + estimated_peak_memory_range: + min: 2768896 + max: 2768896 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 1084 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 1084 + 
job_id: jn5q3oknp + job_status: Passed + torchscript_onnx_ort: + inference_time: 23126.0 + throughput: 43.24137334601747 + estimated_peak_memory_range: + min: 117997568 + max: 117997568 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 856 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 856 + job_id: jwgo3q0qg + job_status: Passed + torchscript_onnx_ort_dml_gpu: + inference_time: 1815598.0 + throughput: 0.5507827173195828 + estimated_peak_memory_range: + min: 280969216 + max: 280969216 + primary_compute_unit: CPU + precision: fp32 + layer_info: + layers_on_npu: 0 + layers_on_gpu: 0 + layers_on_cpu: 628 + total_layers: 628 + job_id: j1pvvxxkp + job_status: Passed + reference_device_info: + name: Snapdragon X Elite CRD + os: '11' + form_factor: Compute + os_name: Windows + manufacturer: Qualcomm + chipset: Snapdragon® X Elite + timestamp: '2024-05-20T16:35:28.680687Z' diff --git a/qai_hub_models/models/detr_resnet101_dc5/README.md b/qai_hub_models/models/detr_resnet101_dc5/README.md index 05a1484e..272c64ab 100644 --- a/qai_hub_models/models/detr_resnet101_dc5/README.md +++ b/qai_hub_models/models/detr_resnet101_dc5/README.md @@ -14,6 +14,8 @@ accross various devices, can be found [here](https://aihub.qualcomm.com/models/d a hosted Qualcomm® device. + + ## Example & Usage Install the package via pip: diff --git a/qai_hub_models/models/detr_resnet101_dc5/export.py b/qai_hub_models/models/detr_resnet101_dc5/export.py index 36390c36..2a05937e 100644 --- a/qai_hub_models/models/detr_resnet101_dc5/export.py +++ b/qai_hub_models/models/detr_resnet101_dc5/export.py @@ -119,9 +119,16 @@ def export_model( model.eval() source_model = torch.jit.trace(model.to("cpu"), make_torch_inputs(input_spec)) + # Convert outputs from channel last to channel first (preferred I/O format for QNN and TensorFlow Lite) + channel_last_flags = ( + " --force_channel_last_input image" + if target_runtime != TargetRuntime.ORT + else "" + ) + # 2. 
Compile the model to an on-device asset model_compile_options = model.get_hub_compile_options( - target_runtime, compile_options + " --force_channel_last_input image" + target_runtime, compile_options + channel_last_flags, hub_device ) print(f"Optimizing model {model_name} to run on-device") submitted_compile_job = hub.submit_compile_job( @@ -159,8 +166,10 @@ def export_model( ) sample_inputs = model.sample_inputs(input_spec) # Convert inputs from channel first to channel last - hub_inputs = transpose_channel_first_to_last( - "image", sample_inputs, target_runtime + hub_inputs = ( + sample_inputs + if target_runtime == TargetRuntime.ORT + else transpose_channel_first_to_last("image", sample_inputs, target_runtime) ) submitted_inference_job = hub.submit_inference_job( model=compile_job.get_target_model(), @@ -197,7 +206,7 @@ def export_model( def main(): warnings.filterwarnings("ignore") - parser = export_parser(model_cls=Model, supports_qnn=False, supports_ort=False) + parser = export_parser(model_cls=Model, supports_qnn=False) args = parser.parse_args() export_model(**vars(args)) diff --git a/qai_hub_models/models/detr_resnet101_dc5/perf.yaml b/qai_hub_models/models/detr_resnet101_dc5/perf.yaml index a8021e26..51fb42f3 100644 --- a/qai_hub_models/models/detr_resnet101_dc5/perf.yaml +++ b/qai_hub_models/models/detr_resnet101_dc5/perf.yaml @@ -22,6 +22,7 @@ aggregated: - Samsung Galaxy S24 Ultra - Samsung Galaxy S24+ - Samsung Galaxy Tab S8 + - Snapdragon X Elite CRD - Xiaomi 12 - Xiaomi 12 Pro supported_chipsets: @@ -30,38 +31,54 @@ aggregated: - Snapdragon® 8 Gen 2 - Snapdragon® 8 Gen 3 - Snapdragon® 888 + - Snapdragon® X Elite models: - name: DETR-ResNet101-DC5 performance_metrics: - torchscript_onnx_tflite: - inference_time: 407929.0 - throughput: 2.451406985039063 + inference_time: 146017.0 + throughput: 6.848517638357178 estimated_peak_memory_range: - min: 7622656 - max: 15500416 + min: 1216512 + max: 4088024 primary_compute_unit: NPU precision: fp16 layer_info: - layers_on_npu: 911 - layers_on_gpu: 2 + layers_on_npu: 840 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 840 + job_id: j7gje44v5 + job_status: Passed + torchscript_onnx_qnn: + inference_time: 142673.0 + throughput: 7.009034645658254 + estimated_peak_memory_range: + min: 2891776 + max: 63987360 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 1084 + layers_on_gpu: 0 layers_on_cpu: 0 - total_layers: 913 - job_id: jn5qemdo5 + total_layers: 1084 + job_id: jz5wqnn35 job_status: Passed torchscript_onnx_ort: - inference_time: 179129.0 - throughput: 5.582568986596252 + inference_time: 135442.0 + throughput: 7.383234151887893 estimated_peak_memory_range: - min: 2637824 - max: 309754336 - primary_compute_unit: CPU - precision: fp32 + min: 2297856 + max: 306707784 + primary_compute_unit: NPU + precision: fp16 layer_info: - layers_on_npu: 3 + layers_on_npu: 856 layers_on_gpu: 0 - layers_on_cpu: 5 - total_layers: 8 - job_id: jw56ed9yg + layers_on_cpu: 0 + total_layers: 856 + job_id: jz57x33vg job_status: Passed reference_device_info: name: Samsung Galaxy S23 @@ -70,36 +87,51 @@ models: os_name: Android manufacturer: Samsung chipset: Snapdragon® 8 Gen 2 - timestamp: '2024-04-23T18:42:31.473830Z' + timestamp: '2024-05-20T16:35:28.711241Z' - torchscript_onnx_tflite: - inference_time: 311354.0 - throughput: 3.2117782331365583 + inference_time: 107206.0 + throughput: 9.327836128574893 estimated_peak_memory_range: - min: 90112 - max: 447334464 + min: 790528 + max: 492355520 primary_compute_unit: NPU 
precision: fp16 layer_info: - layers_on_npu: 911 - layers_on_gpu: 2 + layers_on_npu: 840 + layers_on_gpu: 0 layers_on_cpu: 0 - total_layers: 913 - job_id: j1gl619mg + total_layers: 840 + job_id: jlpek33op + job_status: Passed + torchscript_onnx_qnn: + inference_time: 100534.0 + throughput: 9.946883641355164 + estimated_peak_memory_range: + min: 460566528 + max: 811388336 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 1084 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 1084 + job_id: jmg9weewp job_status: Passed torchscript_onnx_ort: - inference_time: 135318.0 - throughput: 7.3899998522000026 + inference_time: 95212.0 + throughput: 10.502877788514052 estimated_peak_memory_range: - min: 10055680 - max: 190681632 - primary_compute_unit: CPU - precision: fp32 + min: 4116480 + max: 168196992 + primary_compute_unit: NPU + precision: fp16 layer_info: - layers_on_npu: 3 + layers_on_npu: 856 layers_on_gpu: 0 - layers_on_cpu: 5 - total_layers: 8 - job_id: j1p3vwlng + layers_on_cpu: 0 + total_layers: 856 + job_id: jqp4v008p job_status: Passed reference_device_info: name: Samsung Galaxy S24 @@ -108,21 +140,36 @@ models: os_name: Android manufacturer: Samsung chipset: Snapdragon® 8 Gen 3 - timestamp: '2024-04-23T18:42:31.473979Z' + timestamp: '2024-05-20T16:35:28.711267Z' - torchscript_onnx_tflite: - inference_time: 405436.0 - throughput: 2.4664805296026993 + inference_time: 141747.0 + throughput: 7.054823029764298 estimated_peak_memory_range: - min: 6467584 - max: 13861952 + min: 184320 + max: 5835464 primary_compute_unit: NPU precision: fp16 layer_info: - layers_on_npu: 911 - layers_on_gpu: 2 + layers_on_npu: 840 + layers_on_gpu: 0 layers_on_cpu: 0 - total_layers: 913 - job_id: jlpeex3vp + total_layers: 840 + job_id: jygzrkko5 + job_status: Passed + torchscript_onnx_qnn: + inference_time: 144502.0 + throughput: 6.92031944194544 + estimated_peak_memory_range: + min: 2871296 + max: 58689696 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 1084 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 1084 + job_id: jvgdollrp job_status: Passed reference_device_info: name: QCS8550 (Proxy) @@ -131,4 +178,57 @@ models: os_name: Android manufacturer: Qualcomm chipset: Qcs8550 - timestamp: '2024-04-23T18:42:31.474090Z' + timestamp: '2024-05-20T16:35:28.711285Z' + - torchscript_onnx_qnn: + inference_time: 172453.0 + throughput: 5.798681379854221 + estimated_peak_memory_range: + min: 2772992 + max: 2772992 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 1084 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 1084 + job_id: jnp1exx8g + job_status: Passed + torchscript_onnx_ort: + inference_time: 125853.0 + throughput: 7.945778010853933 + estimated_peak_memory_range: + min: 119799808 + max: 119799808 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 856 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 856 + job_id: j0pxy223g + job_status: Passed + torchscript_onnx_ort_dml_gpu: + inference_time: 2208720.0 + throughput: 0.4527509145568474 + estimated_peak_memory_range: + min: 280973312 + max: 280973312 + primary_compute_unit: CPU + precision: fp32 + layer_info: + layers_on_npu: 0 + layers_on_gpu: 0 + layers_on_cpu: 628 + total_layers: 628 + job_id: jo5m3yydg + job_status: Passed + reference_device_info: + name: Snapdragon X Elite CRD + os: '11' + form_factor: Compute + os_name: Windows + manufacturer: Qualcomm + chipset: Snapdragon® X Elite + timestamp: '2024-05-20T16:35:28.711307Z' diff 
--git a/qai_hub_models/models/detr_resnet50/README.md b/qai_hub_models/models/detr_resnet50/README.md index 362f9f43..e37e8210 100644 --- a/qai_hub_models/models/detr_resnet50/README.md +++ b/qai_hub_models/models/detr_resnet50/README.md @@ -14,6 +14,8 @@ accross various devices, can be found [here](https://aihub.qualcomm.com/models/d a hosted Qualcomm® device. + + ## Example & Usage Install the package via pip: diff --git a/qai_hub_models/models/detr_resnet50/export.py b/qai_hub_models/models/detr_resnet50/export.py index f775cebf..3d9e505c 100644 --- a/qai_hub_models/models/detr_resnet50/export.py +++ b/qai_hub_models/models/detr_resnet50/export.py @@ -119,9 +119,16 @@ def export_model( model.eval() source_model = torch.jit.trace(model.to("cpu"), make_torch_inputs(input_spec)) + # Convert outputs from channel last to channel first (preferred I/O format for QNN and TensorFlow Lite) + channel_last_flags = ( + " --force_channel_last_input image" + if target_runtime != TargetRuntime.ORT + else "" + ) + # 2. Compile the model to an on-device asset model_compile_options = model.get_hub_compile_options( - target_runtime, compile_options + " --force_channel_last_input image" + target_runtime, compile_options + channel_last_flags, hub_device ) print(f"Optimizing model {model_name} to run on-device") submitted_compile_job = hub.submit_compile_job( @@ -159,8 +166,10 @@ def export_model( ) sample_inputs = model.sample_inputs(input_spec) # Convert inputs from channel first to channel last - hub_inputs = transpose_channel_first_to_last( - "image", sample_inputs, target_runtime + hub_inputs = ( + sample_inputs + if target_runtime == TargetRuntime.ORT + else transpose_channel_first_to_last("image", sample_inputs, target_runtime) ) submitted_inference_job = hub.submit_inference_job( model=compile_job.get_target_model(), @@ -197,7 +206,7 @@ def export_model( def main(): warnings.filterwarnings("ignore") - parser = export_parser(model_cls=Model, supports_qnn=False, supports_ort=False) + parser = export_parser(model_cls=Model, supports_qnn=False) args = parser.parse_args() export_model(**vars(args)) diff --git a/qai_hub_models/models/detr_resnet50/perf.yaml b/qai_hub_models/models/detr_resnet50/perf.yaml index cf9904e7..dacb9380 100644 --- a/qai_hub_models/models/detr_resnet50/perf.yaml +++ b/qai_hub_models/models/detr_resnet50/perf.yaml @@ -22,6 +22,7 @@ aggregated: - Samsung Galaxy S24 Ultra - Samsung Galaxy S24+ - Samsung Galaxy Tab S8 + - Snapdragon X Elite CRD - Xiaomi 12 - Xiaomi 12 Pro supported_chipsets: @@ -30,38 +31,54 @@ aggregated: - Snapdragon® 8 Gen 2 - Snapdragon® 8 Gen 3 - Snapdragon® 888 + - Snapdragon® X Elite models: - name: DETR-ResNet50 performance_metrics: - torchscript_onnx_tflite: - inference_time: 39035.0 - throughput: 25.618035096708084 + inference_time: 20791.0 + throughput: 48.0977345967005 estimated_peak_memory_range: - min: 1327104 - max: 9193440 + min: 57344 + max: 3249616 primary_compute_unit: NPU precision: fp16 layer_info: - layers_on_npu: 842 - layers_on_gpu: 2 + layers_on_npu: 771 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 771 + job_id: jegn388k5 + job_status: Passed + torchscript_onnx_qnn: + inference_time: 19328.0 + throughput: 51.73841059602649 + estimated_peak_memory_range: + min: 2805760 + max: 23254680 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 863 + layers_on_gpu: 0 layers_on_cpu: 0 - total_layers: 844 - job_id: j1pv09yr5 + total_layers: 863 + job_id: jqpy60085 job_status: Passed torchscript_onnx_ort: - inference_time: 
22280.0 - throughput: 44.88330341113106 + inference_time: 16790.0 + throughput: 59.55926146515783 estimated_peak_memory_range: - min: 1789952 - max: 205559344 - primary_compute_unit: CPU - precision: fp32 + min: 536576 + max: 208713080 + primary_compute_unit: NPU + precision: fp16 layer_info: - layers_on_npu: 3 + layers_on_npu: 737 layers_on_gpu: 0 - layers_on_cpu: 5 - total_layers: 8 - job_id: jlpeel0vp + layers_on_cpu: 0 + total_layers: 737 + job_id: jn5q3oonp job_status: Passed reference_device_info: name: Samsung Galaxy S23 @@ -70,36 +87,51 @@ models: os_name: Android manufacturer: Samsung chipset: Snapdragon® 8 Gen 2 - timestamp: '2024-04-23T18:42:31.491676Z' + timestamp: '2024-05-20T16:35:28.741938Z' - torchscript_onnx_tflite: - inference_time: 28469.0 - throughput: 35.12592644631002 + inference_time: 14384.0 + throughput: 69.52169076751946 estimated_peak_memory_range: - min: 1241088 - max: 215942624 + min: 409600 + max: 231124128 primary_compute_unit: NPU precision: fp16 layer_info: - layers_on_npu: 842 - layers_on_gpu: 2 + layers_on_npu: 771 + layers_on_gpu: 0 layers_on_cpu: 0 - total_layers: 844 - job_id: j7gjzw6e5 + total_layers: 771 + job_id: joprejj05 + job_status: Passed + torchscript_onnx_qnn: + inference_time: 13592.0 + throughput: 73.57268981753973 + estimated_peak_memory_range: + min: 2801664 + max: 247117184 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 863 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 863 + job_id: j2p0l779p job_status: Passed torchscript_onnx_ort: - inference_time: 17238.0 - throughput: 58.0113702285648 + inference_time: 11524.0 + throughput: 86.77542519958348 estimated_peak_memory_range: - min: 3723264 - max: 80445392 - primary_compute_unit: CPU - precision: fp32 + min: 4878336 + max: 99183200 + primary_compute_unit: NPU + precision: fp16 layer_info: - layers_on_npu: 3 + layers_on_npu: 737 layers_on_gpu: 0 - layers_on_cpu: 5 - total_layers: 8 - job_id: jygzo4qx5 + layers_on_cpu: 0 + total_layers: 737 + job_id: j1gl3rrjg job_status: Passed reference_device_info: name: Samsung Galaxy S24 @@ -108,21 +140,36 @@ models: os_name: Android manufacturer: Samsung chipset: Snapdragon® 8 Gen 3 - timestamp: '2024-04-23T18:42:31.491805Z' + timestamp: '2024-05-20T16:35:28.741963Z' - torchscript_onnx_tflite: - inference_time: 38866.0 - throughput: 25.729429321257655 + inference_time: 20731.0 + throughput: 48.23693984853601 estimated_peak_memory_range: - min: 1429504 - max: 8463712 + min: 405504 + max: 3824656 primary_compute_unit: NPU precision: fp16 layer_info: - layers_on_npu: 842 - layers_on_gpu: 2 + layers_on_npu: 771 + layers_on_gpu: 0 layers_on_cpu: 0 - total_layers: 844 - job_id: jz570n39g + total_layers: 771 + job_id: jep2lnnrg + job_status: Passed + torchscript_onnx_qnn: + inference_time: 19426.0 + throughput: 51.47740142077628 + estimated_peak_memory_range: + min: 40960 + max: 25594136 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 863 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 863 + job_id: jogk3mmw5 job_status: Passed reference_device_info: name: QCS8550 (Proxy) @@ -131,4 +178,57 @@ models: os_name: Android manufacturer: Qualcomm chipset: Qcs8550 - timestamp: '2024-04-23T18:42:31.491909Z' + timestamp: '2024-05-20T16:35:28.741980Z' + - torchscript_onnx_qnn: + inference_time: 22410.0 + throughput: 44.62293618920125 + estimated_peak_memory_range: + min: 2768896 + max: 2768896 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 863 + layers_on_gpu: 0 + 
layers_on_cpu: 0 + total_layers: 863 + job_id: j1p8zvvkp + job_status: Passed + torchscript_onnx_ort: + inference_time: 17039.0 + throughput: 58.68889019308645 + estimated_peak_memory_range: + min: 33472512 + max: 33472512 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 737 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 737 + job_id: jw56nll6g + job_status: Passed + torchscript_onnx_ort_dml_gpu: + inference_time: 68004.0 + throughput: 14.705017351920475 + estimated_peak_memory_range: + min: 3866624 + max: 3866624 + primary_compute_unit: CPU + precision: fp32 + layer_info: + layers_on_npu: 0 + layers_on_gpu: 0 + layers_on_cpu: 577 + total_layers: 577 + job_id: j1p3e2235 + job_status: Passed + reference_device_info: + name: Snapdragon X Elite CRD + os: '11' + form_factor: Compute + os_name: Windows + manufacturer: Qualcomm + chipset: Snapdragon® X Elite + timestamp: '2024-05-20T16:35:28.742002Z' diff --git a/qai_hub_models/models/detr_resnet50_dc5/README.md b/qai_hub_models/models/detr_resnet50_dc5/README.md index cb3249d0..39efe2a1 100644 --- a/qai_hub_models/models/detr_resnet50_dc5/README.md +++ b/qai_hub_models/models/detr_resnet50_dc5/README.md @@ -14,6 +14,8 @@ accross various devices, can be found [here](https://aihub.qualcomm.com/models/d a hosted Qualcomm® device. + + ## Example & Usage Install the package via pip: diff --git a/qai_hub_models/models/detr_resnet50_dc5/export.py b/qai_hub_models/models/detr_resnet50_dc5/export.py index 4415c02a..ac9e21c6 100644 --- a/qai_hub_models/models/detr_resnet50_dc5/export.py +++ b/qai_hub_models/models/detr_resnet50_dc5/export.py @@ -119,9 +119,16 @@ def export_model( model.eval() source_model = torch.jit.trace(model.to("cpu"), make_torch_inputs(input_spec)) + # Convert outputs from channel last to channel first (preferred I/O format for QNN and TensorFlow Lite) + channel_last_flags = ( + " --force_channel_last_input image" + if target_runtime != TargetRuntime.ORT + else "" + ) + # 2. 
Compile the model to an on-device asset model_compile_options = model.get_hub_compile_options( - target_runtime, compile_options + " --force_channel_last_input image" + target_runtime, compile_options + channel_last_flags, hub_device ) print(f"Optimizing model {model_name} to run on-device") submitted_compile_job = hub.submit_compile_job( @@ -159,8 +166,10 @@ def export_model( ) sample_inputs = model.sample_inputs(input_spec) # Convert inputs from channel first to channel last - hub_inputs = transpose_channel_first_to_last( - "image", sample_inputs, target_runtime + hub_inputs = ( + sample_inputs + if target_runtime == TargetRuntime.ORT + else transpose_channel_first_to_last("image", sample_inputs, target_runtime) ) submitted_inference_job = hub.submit_inference_job( model=compile_job.get_target_model(), @@ -197,7 +206,7 @@ def export_model( def main(): warnings.filterwarnings("ignore") - parser = export_parser(model_cls=Model, supports_qnn=False, supports_ort=False) + parser = export_parser(model_cls=Model, supports_qnn=False) args = parser.parse_args() export_model(**vars(args)) diff --git a/qai_hub_models/models/detr_resnet50_dc5/perf.yaml b/qai_hub_models/models/detr_resnet50_dc5/perf.yaml index 8cf30015..5ee7970c 100644 --- a/qai_hub_models/models/detr_resnet50_dc5/perf.yaml +++ b/qai_hub_models/models/detr_resnet50_dc5/perf.yaml @@ -22,6 +22,7 @@ aggregated: - Samsung Galaxy S24 Ultra - Samsung Galaxy S24+ - Samsung Galaxy Tab S8 + - Snapdragon X Elite CRD - Xiaomi 12 - Xiaomi 12 Pro supported_chipsets: @@ -30,38 +31,54 @@ aggregated: - Snapdragon® 8 Gen 2 - Snapdragon® 8 Gen 3 - Snapdragon® 888 + - Snapdragon® X Elite models: - name: DETR-ResNet50-DC5 performance_metrics: - torchscript_onnx_tflite: - inference_time: 405395.0 - throughput: 2.4667299794028046 + inference_time: 135457.0 + throughput: 7.382416560236828 estimated_peak_memory_range: - min: 339968 - max: 8125832 + min: 1200128 + max: 4621488 primary_compute_unit: NPU precision: fp16 layer_info: - layers_on_npu: 843 - layers_on_gpu: 2 + layers_on_npu: 772 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 772 + job_id: jwgo3qqqg + job_status: Passed + torchscript_onnx_qnn: + inference_time: 121332.0 + throughput: 8.2418488115254 + estimated_peak_memory_range: + min: 65536 + max: 55100088 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 863 + layers_on_gpu: 0 layers_on_cpu: 0 - total_layers: 845 - job_id: jmg9jx785 + total_layers: 863 + job_id: jlpevmmo5 job_status: Passed torchscript_onnx_ort: - inference_time: 174726.0 - throughput: 5.723246683378547 + inference_time: 119137.0 + throughput: 8.39369801153294 estimated_peak_memory_range: - min: 7774208 - max: 210473208 - primary_compute_unit: CPU - precision: fp32 + min: 679936 + max: 229172048 + primary_compute_unit: NPU + precision: fp16 layer_info: - layers_on_npu: 3 + layers_on_npu: 737 layers_on_gpu: 0 - layers_on_cpu: 5 - total_layers: 8 - job_id: jvgdezyz5 + layers_on_cpu: 0 + total_layers: 737 + job_id: jnp18zz8g job_status: Passed reference_device_info: name: Samsung Galaxy S23 @@ -70,36 +87,51 @@ models: os_name: Android manufacturer: Samsung chipset: Snapdragon® 8 Gen 2 - timestamp: '2024-04-23T18:42:31.509285Z' + timestamp: '2024-05-20T16:35:28.772381Z' - torchscript_onnx_tflite: - inference_time: 306266.0 - throughput: 3.26513553577609 + inference_time: 102211.0 + throughput: 9.78368277386974 estimated_peak_memory_range: - min: 16384 - max: 412400848 + min: 1204224 + max: 442913328 primary_compute_unit: NPU precision: fp16 
layer_info: - layers_on_npu: 843 - layers_on_gpu: 2 + layers_on_npu: 772 + layers_on_gpu: 0 layers_on_cpu: 0 - total_layers: 845 - job_id: jnp1yvk7p + total_layers: 772 + job_id: j1pvwkkkg + job_status: Passed + torchscript_onnx_qnn: + inference_time: 92508.0 + throughput: 10.809875902624638 + estimated_peak_memory_range: + min: 2818048 + max: 287246416 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 863 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 863 + job_id: jygz7ddop job_status: Passed torchscript_onnx_ort: - inference_time: 130531.0 - throughput: 7.66101539097992 + inference_time: 90890.0 + throughput: 11.002310485201892 estimated_peak_memory_range: - min: 10014720 - max: 184574640 - primary_compute_unit: CPU - precision: fp32 + min: 4927488 + max: 146881408 + primary_compute_unit: NPU + precision: fp16 layer_info: - layers_on_npu: 3 + layers_on_npu: 737 layers_on_gpu: 0 - layers_on_cpu: 5 - total_layers: 8 - job_id: jz570719g + layers_on_cpu: 0 + total_layers: 737 + job_id: jvgdv11rg job_status: Passed reference_device_info: name: Samsung Galaxy S24 @@ -108,21 +140,36 @@ models: os_name: Android manufacturer: Samsung chipset: Snapdragon® 8 Gen 3 - timestamp: '2024-04-23T18:42:31.509384Z' + timestamp: '2024-05-20T16:35:28.772406Z' - torchscript_onnx_tflite: - inference_time: 400391.0 - throughput: 2.497558636432887 + inference_time: 134542.0 + throughput: 7.432623270056934 estimated_peak_memory_range: - min: 7581696 - max: 16235952 + min: 1204224 + max: 4576992 primary_compute_unit: NPU precision: fp16 layer_info: - layers_on_npu: 843 - layers_on_gpu: 2 + layers_on_npu: 772 + layers_on_gpu: 0 layers_on_cpu: 0 - total_layers: 845 - job_id: jegnlq8q5 + total_layers: 772 + job_id: j7gjlnnvp + job_status: Passed + torchscript_onnx_qnn: + inference_time: 133524.0 + throughput: 7.4892903148497645 + estimated_peak_memory_range: + min: 16384 + max: 52330520 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 863 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 863 + job_id: jmg94nnw5 job_status: Passed reference_device_info: name: QCS8550 (Proxy) @@ -131,4 +178,57 @@ models: os_name: Android manufacturer: Qualcomm chipset: Qcs8550 - timestamp: '2024-04-23T18:42:31.509493Z' + timestamp: '2024-05-20T16:35:28.772424Z' + - torchscript_onnx_qnn: + inference_time: 165859.0 + throughput: 6.029217588433549 + estimated_peak_memory_range: + min: 2772992 + max: 2772992 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 863 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 863 + job_id: jz5w9663p + job_status: Passed + torchscript_onnx_ort: + inference_time: 119044.0 + throughput: 8.40025536776318 + estimated_peak_memory_range: + min: 31268864 + max: 31268864 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 737 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 737 + job_id: jz57drrv5 + job_status: Passed + torchscript_onnx_ort_dml_gpu: + inference_time: 'null' + throughput: 'null' + estimated_peak_memory_range: + min: 0 + max: 0 + primary_compute_unit: 'null' + precision: 'null' + layer_info: + layers_on_npu: 0 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 0 + job_id: jqp4wrr8g + job_status: Failed + reference_device_info: + name: Snapdragon X Elite CRD + os: '11' + form_factor: Compute + os_name: Windows + manufacturer: Qualcomm + chipset: Snapdragon® X Elite + timestamp: '2024-05-20T16:35:28.772448Z' diff --git a/qai_hub_models/models/efficientnet_b0/README.md 
b/qai_hub_models/models/efficientnet_b0/README.md index 197315d7..31379b53 100644 --- a/qai_hub_models/models/efficientnet_b0/README.md +++ b/qai_hub_models/models/efficientnet_b0/README.md @@ -14,6 +14,8 @@ accross various devices, can be found [here](https://aihub.qualcomm.com/models/e a hosted Qualcomm® device. + + ## Example & Usage diff --git a/qai_hub_models/models/efficientnet_b0/export.py b/qai_hub_models/models/efficientnet_b0/export.py index 2bff3de6..d745fda1 100644 --- a/qai_hub_models/models/efficientnet_b0/export.py +++ b/qai_hub_models/models/efficientnet_b0/export.py @@ -119,9 +119,16 @@ def export_model( model.eval() source_model = torch.jit.trace(model.to("cpu"), make_torch_inputs(input_spec)) + # Convert outputs from channel last to channel first (preferred I/O format for QNN and TensorFlow Lite) + channel_last_flags = ( + " --force_channel_last_input image_tensor" + if target_runtime != TargetRuntime.ORT + else "" + ) + # 2. Compile the model to an on-device asset model_compile_options = model.get_hub_compile_options( - target_runtime, compile_options + " --force_channel_last_input image_tensor" + target_runtime, compile_options + channel_last_flags, hub_device ) print(f"Optimizing model {model_name} to run on-device") submitted_compile_job = hub.submit_compile_job( @@ -159,8 +166,12 @@ def export_model( ) sample_inputs = model.sample_inputs(input_spec) # Convert inputs from channel first to channel last - hub_inputs = transpose_channel_first_to_last( - "image_tensor", sample_inputs, target_runtime + hub_inputs = ( + sample_inputs + if target_runtime == TargetRuntime.ORT + else transpose_channel_first_to_last( + "image_tensor", sample_inputs, target_runtime + ) ) submitted_inference_job = hub.submit_inference_job( model=compile_job.get_target_model(), @@ -197,7 +208,7 @@ def export_model( def main(): warnings.filterwarnings("ignore") - parser = export_parser(model_cls=Model, supports_ort=False) + parser = export_parser(model_cls=Model) args = parser.parse_args() export_model(**vars(args)) diff --git a/qai_hub_models/models/efficientnet_b0/perf.yaml b/qai_hub_models/models/efficientnet_b0/perf.yaml index 40fef96a..bdfab403 100644 --- a/qai_hub_models/models/efficientnet_b0/perf.yaml +++ b/qai_hub_models/models/efficientnet_b0/perf.yaml @@ -22,6 +22,7 @@ aggregated: - Samsung Galaxy S24 Ultra - Samsung Galaxy S24+ - Samsung Galaxy Tab S8 + - Snapdragon X Elite CRD - Xiaomi 12 - Xiaomi 12 Pro supported_chipsets: @@ -30,15 +31,16 @@ aggregated: - Snapdragon® 8 Gen 2 - Snapdragon® 8 Gen 3 - Snapdragon® 888 + - Snapdragon® X Elite models: - name: EfficientNet-B0 performance_metrics: - torchscript_onnx_tflite: - inference_time: 1637.0 - throughput: 610.8735491753207 + inference_time: 1623.0 + throughput: 616.1429451632779 estimated_peak_memory_range: min: 24576 - max: 18330576 + max: 2090224 primary_compute_unit: NPU precision: fp16 layer_info: @@ -46,14 +48,14 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 245 - job_id: j0pxnd8l5 + job_id: j0px1oo3g job_status: Passed torchscript_onnx_qnn: - inference_time: 1692.0 - throughput: 591.016548463357 + inference_time: 1678.0 + throughput: 595.9475566150179 estimated_peak_memory_range: - min: 16384 - max: 89136624 + min: 12288 + max: 88022416 primary_compute_unit: NPU precision: fp16 layer_info: @@ -61,22 +63,22 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 243 - job_id: jegnl7dq5 + job_id: jopry330g job_status: Passed torchscript_onnx_ort: - inference_time: 1847.0 - throughput: 541.4185165132648 + 
inference_time: 1575.0 + throughput: 634.9206349206349 estimated_peak_memory_range: min: 12288 - max: 80485720 + max: 80602048 primary_compute_unit: NPU precision: fp16 layer_info: - layers_on_npu: 1 + layers_on_npu: 245 layers_on_gpu: 0 layers_on_cpu: 0 - total_layers: 1 - job_id: jep20vqqg + total_layers: 245 + job_id: j1p87yyk5 job_status: Passed reference_device_info: name: Samsung Galaxy S23 @@ -85,13 +87,13 @@ models: os_name: Android manufacturer: Samsung chipset: Snapdragon® 8 Gen 2 - timestamp: '2024-04-23T18:42:31.527091Z' + timestamp: '2024-05-20T16:35:28.802791Z' - torchscript_onnx_tflite: - inference_time: 1177.0 - throughput: 849.6176720475786 + inference_time: 1162.0 + throughput: 860.5851979345955 estimated_peak_memory_range: min: 16384 - max: 70869408 + max: 71535472 primary_compute_unit: NPU precision: fp16 layer_info: @@ -99,14 +101,14 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 245 - job_id: jo5mqd19p + job_id: jo5mzxxdp job_status: Passed torchscript_onnx_qnn: - inference_time: 1180.0 - throughput: 847.457627118644 + inference_time: 1182.0 + throughput: 846.0236886632825 estimated_peak_memory_range: - min: 0 - max: 70362624 + min: 618496 + max: 69430064 primary_compute_unit: NPU precision: fp16 layer_info: @@ -114,22 +116,22 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 243 - job_id: jopr8nm75 + job_id: jep2myyr5 job_status: Passed torchscript_onnx_ort: - inference_time: 1299.0 - throughput: 769.8229407236336 + inference_time: 1137.0 + throughput: 879.5074758135444 estimated_peak_memory_range: - min: 761856 - max: 28745360 + min: 0 + max: 34872096 primary_compute_unit: NPU precision: fp16 layer_info: - layers_on_npu: 1 + layers_on_npu: 245 layers_on_gpu: 0 layers_on_cpu: 0 - total_layers: 1 - job_id: jqpyr7kl5 + total_layers: 245 + job_id: jogkyxxwp job_status: Passed reference_device_info: name: Samsung Galaxy S24 @@ -138,13 +140,13 @@ models: os_name: Android manufacturer: Samsung chipset: Snapdragon® 8 Gen 3 - timestamp: '2024-04-23T18:42:31.527166Z' + timestamp: '2024-05-20T16:35:28.802817Z' - torchscript_onnx_tflite: - inference_time: 1635.0 - throughput: 611.6207951070336 + inference_time: 1626.0 + throughput: 615.0061500615006 estimated_peak_memory_range: - min: 28672 - max: 2553520 + min: 24576 + max: 2679392 primary_compute_unit: NPU precision: fp16 layer_info: @@ -152,14 +154,14 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 245 - job_id: j1gl6qmmg + job_id: jegnevvkg job_status: Passed torchscript_onnx_qnn: - inference_time: 1694.0 - throughput: 590.318772136954 + inference_time: 1668.0 + throughput: 599.5203836930456 estimated_peak_memory_range: - min: 622592 - max: 68146216 + min: 16384 + max: 14848360 primary_compute_unit: NPU precision: fp16 layer_info: @@ -167,7 +169,7 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 243 - job_id: j1pv0nkr5 + job_id: j2p0r009p job_status: Passed reference_device_info: name: QCS8550 (Proxy) @@ -176,4 +178,57 @@ models: os_name: Android manufacturer: Qualcomm chipset: Qcs8550 - timestamp: '2024-04-23T18:42:31.527250Z' + timestamp: '2024-05-20T16:35:28.802834Z' + - torchscript_onnx_qnn: + inference_time: 1835.0 + throughput: 544.9591280653951 + estimated_peak_memory_range: + min: 602112 + max: 602112 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 243 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 243 + job_id: jqpyd338p + job_status: Passed + torchscript_onnx_ort: + inference_time: 1670.0 + throughput: 598.8023952095808 + 
estimated_peak_memory_range: + min: 34729984 + max: 34729984 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 245 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 245 + job_id: jn5q2qqn5 + job_status: Passed + torchscript_onnx_ort_dml_gpu: + inference_time: 10374.0 + throughput: 96.3948332369385 + estimated_peak_memory_range: + min: 36884480 + max: 36884480 + primary_compute_unit: GPU + precision: fp16 + layer_info: + layers_on_npu: 0 + layers_on_gpu: 1 + layers_on_cpu: 0 + total_layers: 1 + job_id: j1glkmmjp + job_status: Passed + reference_device_info: + name: Snapdragon X Elite CRD + os: '11' + form_factor: Compute + os_name: Windows + manufacturer: Qualcomm + chipset: Snapdragon® X Elite + timestamp: '2024-05-20T16:35:28.802858Z' diff --git a/qai_hub_models/models/esrgan/README.md b/qai_hub_models/models/esrgan/README.md index 71784914..4afc7424 100644 --- a/qai_hub_models/models/esrgan/README.md +++ b/qai_hub_models/models/esrgan/README.md @@ -14,6 +14,8 @@ accross various devices, can be found [here](https://aihub.qualcomm.com/models/e a hosted Qualcomm® device. + + ## Example & Usage diff --git a/qai_hub_models/models/esrgan/export.py b/qai_hub_models/models/esrgan/export.py index 6bc2ec3f..2a7d632d 100644 --- a/qai_hub_models/models/esrgan/export.py +++ b/qai_hub_models/models/esrgan/export.py @@ -119,12 +119,16 @@ def export_model( model.eval() source_model = torch.jit.trace(model.to("cpu"), make_torch_inputs(input_spec)) + # Convert outputs from channel last to channel first (preferred I/O format for QNN and TensorFlow Lite) + channel_last_flags = ( + " --force_channel_last_input image" + " --force_channel_last_output output_0" + if target_runtime != TargetRuntime.ORT + else "" + ) + # 2. Compile the model to an on-device asset model_compile_options = model.get_hub_compile_options( - target_runtime, - compile_options - + " --force_channel_last_input image" - + " --force_channel_last_output output_0", + target_runtime, compile_options + channel_last_flags, hub_device ) print(f"Optimizing model {model_name} to run on-device") submitted_compile_job = hub.submit_compile_job( @@ -162,8 +166,10 @@ def export_model( ) sample_inputs = model.sample_inputs(input_spec) # Convert inputs from channel first to channel last - hub_inputs = transpose_channel_first_to_last( - "image", sample_inputs, target_runtime + hub_inputs = ( + sample_inputs + if target_runtime == TargetRuntime.ORT + else transpose_channel_first_to_last("image", sample_inputs, target_runtime) ) submitted_inference_job = hub.submit_inference_job( model=compile_job.get_target_model(), @@ -191,8 +197,12 @@ def export_model( assert inference_job is not None and inference_job.wait().success inference_result: hub.client.DatasetEntries = inference_job.download_output_data() # type: ignore # Convert outputs from channel last to channel first - inference_result = transpose_channel_last_to_first( - "output_0", inference_result, target_runtime + inference_result = ( + inference_result + if target_runtime == TargetRuntime.ORT + else transpose_channel_last_to_first( + "output_0", inference_result, target_runtime + ) ) print_inference_metrics(inference_job, inference_result, torch_out) @@ -201,7 +211,7 @@ def export_model( def main(): warnings.filterwarnings("ignore") - parser = export_parser(model_cls=Model, supports_ort=False) + parser = export_parser(model_cls=Model) args = parser.parse_args() export_model(**vars(args)) diff --git a/qai_hub_models/models/esrgan/perf.yaml 
b/qai_hub_models/models/esrgan/perf.yaml index 8cbaa145..ae40c9f2 100644 --- a/qai_hub_models/models/esrgan/perf.yaml +++ b/qai_hub_models/models/esrgan/perf.yaml @@ -22,6 +22,7 @@ aggregated: - Samsung Galaxy S24 Ultra - Samsung Galaxy S24+ - Samsung Galaxy Tab S8 + - Snapdragon X Elite CRD - Xiaomi 12 - Xiaomi 12 Pro supported_chipsets: @@ -30,15 +31,16 @@ aggregated: - Snapdragon® 8 Gen 2 - Snapdragon® 8 Gen 3 - Snapdragon® 888 + - Snapdragon® X Elite models: - name: ESRGAN performance_metrics: - torchscript_onnx_tflite: - inference_time: 65051.0 - throughput: 15.372553842369832 + inference_time: 68602.0 + throughput: 14.576834494621147 estimated_peak_memory_range: - min: 3252224 - max: 6824744 + min: 4915200 + max: 8401176 primary_compute_unit: NPU precision: fp16 layer_info: @@ -46,14 +48,14 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 1024 - job_id: j1p804dog + job_id: jw561446p job_status: Passed torchscript_onnx_qnn: - inference_time: 65381.0 - throughput: 15.294963368562732 + inference_time: 67537.0 + throughput: 14.806698550424212 estimated_peak_memory_range: - min: 102400 - max: 104823816 + min: 122880 + max: 105180416 primary_compute_unit: NPU precision: fp16 layer_info: @@ -61,22 +63,22 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 1026 - job_id: jn5qemxo5 + job_id: j1pvwk6kg job_status: Passed torchscript_onnx_ort: - inference_time: 70770.0 - throughput: 14.130281192595733 + inference_time: 70574.0 + throughput: 14.169524187377787 estimated_peak_memory_range: - min: 3174400 - max: 141778696 + min: 6324224 + max: 153237392 primary_compute_unit: NPU precision: fp16 layer_info: - layers_on_npu: 1 + layers_on_npu: 1028 layers_on_gpu: 0 layers_on_cpu: 0 - total_layers: 1 - job_id: jw56edxyg + total_layers: 1028 + job_id: jz5w96e3p job_status: Passed reference_device_info: name: Samsung Galaxy S23 @@ -85,13 +87,13 @@ models: os_name: Android manufacturer: Samsung chipset: Snapdragon® 8 Gen 2 - timestamp: '2024-04-23T18:42:31.551433Z' + timestamp: '2024-05-20T16:35:28.833041Z' - torchscript_onnx_tflite: - inference_time: 51233.0 - throughput: 19.518669607479556 + inference_time: 51332.0 + throughput: 19.48102548118133 estimated_peak_memory_range: - min: 94208 - max: 579142256 + min: 3239936 + max: 585991072 primary_compute_unit: NPU precision: fp16 layer_info: @@ -99,14 +101,14 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 1024 - job_id: jogk79wnp + job_id: j1p3m003g job_status: Passed torchscript_onnx_qnn: - inference_time: 50830.0 - throughput: 19.673421207948063 + inference_time: 50345.0 + throughput: 19.86294567484358 estimated_peak_memory_range: - min: 102400 - max: 255173680 + min: 12288 + max: 260077888 primary_compute_unit: NPU precision: fp16 layer_info: @@ -114,22 +116,22 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 1026 - job_id: j1gl61dmg + job_id: j7gjlnvvp job_status: Passed torchscript_onnx_ort: - inference_time: 51607.0 - throughput: 19.37721626911078 + inference_time: 51390.0 + throughput: 19.45903872348706 estimated_peak_memory_range: - min: 6688768 - max: 197563712 + min: 6324224 + max: 192683632 primary_compute_unit: NPU precision: fp16 layer_info: - layers_on_npu: 1 + layers_on_npu: 1028 layers_on_gpu: 0 layers_on_cpu: 0 - total_layers: 1 - job_id: j1p3vwdng + total_layers: 1028 + job_id: jmg94nlw5 job_status: Passed reference_device_info: name: Samsung Galaxy S24 @@ -138,13 +140,13 @@ models: os_name: Android manufacturer: Samsung chipset: Snapdragon® 8 Gen 3 - timestamp: '2024-04-23T18:42:31.551673Z' + 
timestamp: '2024-05-20T16:35:28.833066Z' - torchscript_onnx_tflite: - inference_time: 71702.0 - throughput: 13.946612367855847 + inference_time: 71946.0 + throughput: 13.899313373919329 estimated_peak_memory_range: - min: 3293184 - max: 6629192 + min: 0 + max: 3606600 primary_compute_unit: NPU precision: fp16 layer_info: @@ -152,14 +154,14 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 1024 - job_id: jmg9jqn85 + job_id: jwgov66q5 job_status: Passed torchscript_onnx_qnn: - inference_time: 68263.0 - throughput: 14.649224323572067 + inference_time: 70208.0 + throughput: 14.243391066545122 estimated_peak_memory_range: - min: 118784 - max: 62391352 + min: 196608 + max: 104068704 primary_compute_unit: NPU precision: fp16 layer_info: @@ -167,7 +169,7 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 1026 - job_id: jqp4k2r1g + job_id: jygz7d3op job_status: Passed reference_device_info: name: QCS8550 (Proxy) @@ -176,4 +178,57 @@ models: os_name: Android manufacturer: Qualcomm chipset: Qcs8550 - timestamp: '2024-04-23T18:42:31.551903Z' + timestamp: '2024-05-20T16:35:28.833084Z' + - torchscript_onnx_qnn: + inference_time: 73168.0 + throughput: 13.667176907937897 + estimated_peak_memory_range: + min: 204800 + max: 204800 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 1026 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 1026 + job_id: jlpevmdo5 + job_status: Passed + torchscript_onnx_ort: + inference_time: 65764.0 + throughput: 15.205887719725078 + estimated_peak_memory_range: + min: 1138688 + max: 1138688 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 1028 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 1028 + job_id: jnp18z48g + job_status: Passed + torchscript_onnx_ort_dml_gpu: + inference_time: 641039.0 + throughput: 1.5599674902775027 + estimated_peak_memory_range: + min: 554172416 + max: 554172416 + primary_compute_unit: GPU + precision: fp16 + layer_info: + layers_on_npu: 0 + layers_on_gpu: 1 + layers_on_cpu: 0 + total_layers: 1 + job_id: jvgdv1xrg + job_status: Passed + reference_device_info: + name: Snapdragon X Elite CRD + os: '11' + form_factor: Compute + os_name: Windows + manufacturer: Qualcomm + chipset: Snapdragon® X Elite + timestamp: '2024-05-20T16:35:28.833106Z' diff --git a/qai_hub_models/models/facebook_denoiser/README.md b/qai_hub_models/models/facebook_denoiser/README.md index 72d5dba1..eaa05e87 100644 --- a/qai_hub_models/models/facebook_denoiser/README.md +++ b/qai_hub_models/models/facebook_denoiser/README.md @@ -14,6 +14,8 @@ accross various devices, can be found [here](https://aihub.qualcomm.com/models/f a hosted Qualcomm® device. 
+ + ## Example & Usage Install the package via pip: diff --git a/qai_hub_models/models/facebook_denoiser/demo.py b/qai_hub_models/models/facebook_denoiser/demo.py index cb8eb9d1..4239c719 100644 --- a/qai_hub_models/models/facebook_denoiser/demo.py +++ b/qai_hub_models/models/facebook_denoiser/demo.py @@ -3,7 +3,6 @@ # SPDX-License-Identifier: BSD-3-Clause # --------------------------------------------------------------------- import os -import tempfile from pathlib import Path from typing import List @@ -23,7 +22,11 @@ get_on_device_demo_parser, validate_on_device_demo_args, ) -from qai_hub_models.utils.asset_loaders import CachedWebModelAsset, load_path +from qai_hub_models.utils.asset_loaders import ( + CachedWebModelAsset, + load_path, + qaihm_temp_dir, +) EXAMPLE_RECORDING = CachedWebModelAsset.from_asset_store( MODEL_ID, ASSET_VERSION, "icsi_meeting_recording.wav" @@ -57,7 +60,7 @@ def main(is_test: bool = False): # Download data audio_files: List[str] = args.audio audio_tensors = [] - with tempfile.TemporaryDirectory() as tmpdir: + with qaihm_temp_dir() as tmpdir: for idx, file in enumerate(audio_files): audio_file = load_path(file, tmpdir) audio, sample_rate = torchaudio.load(audio_file) diff --git a/qai_hub_models/models/facebook_denoiser/export.py b/qai_hub_models/models/facebook_denoiser/export.py index da820ee0..8b898270 100644 --- a/qai_hub_models/models/facebook_denoiser/export.py +++ b/qai_hub_models/models/facebook_denoiser/export.py @@ -120,7 +120,7 @@ def export_model( # 2. Compile the model to an on-device asset model_compile_options = model.get_hub_compile_options( - target_runtime, compile_options + target_runtime, compile_options, hub_device ) print(f"Optimizing model {model_name} to run on-device") submitted_compile_job = hub.submit_compile_job( @@ -192,7 +192,7 @@ def export_model( def main(): warnings.filterwarnings("ignore") - parser = export_parser(model_cls=Model, supports_qnn=False, supports_ort=False) + parser = export_parser(model_cls=Model, supports_qnn=False) args = parser.parse_args() export_model(**vars(args)) diff --git a/qai_hub_models/models/facebook_denoiser/perf.yaml b/qai_hub_models/models/facebook_denoiser/perf.yaml index 98736731..7def156d 100644 --- a/qai_hub_models/models/facebook_denoiser/perf.yaml +++ b/qai_hub_models/models/facebook_denoiser/perf.yaml @@ -22,6 +22,7 @@ aggregated: - Samsung Galaxy S24 Ultra - Samsung Galaxy S24+ - Samsung Galaxy Tab S8 + - Snapdragon X Elite CRD - Xiaomi 12 - Xiaomi 12 Pro supported_chipsets: @@ -30,15 +31,16 @@ aggregated: - Snapdragon® 8 Gen 2 - Snapdragon® 8 Gen 3 - Snapdragon® 888 + - Snapdragon® X Elite models: - name: Facebook-Denoiser performance_metrics: - torchscript_onnx_tflite: - inference_time: 683713.0 - throughput: 1.4626019982068499 + inference_time: 727870.0 + throughput: 1.37387170785992 estimated_peak_memory_range: - min: 380928 - max: 375423608 + min: 45551616 + max: 416715824 primary_compute_unit: CPU precision: fp32 layer_info: @@ -46,22 +48,22 @@ models: layers_on_gpu: 0 layers_on_cpu: 209 total_layers: 209 - job_id: j1pv098r5 + job_id: jz57dryv5 job_status: Passed torchscript_onnx_ort: - inference_time: 14433398.0 - throughput: 0.0692837542483066 + inference_time: 14547237.0 + throughput: 0.06874157614947773 estimated_peak_memory_range: - min: 1519616 - max: 86092704 + min: 143360 + max: 92274744 primary_compute_unit: NPU precision: fp16 layer_info: - layers_on_npu: 4 + layers_on_npu: 175 layers_on_gpu: 0 layers_on_cpu: 3 - total_layers: 7 - job_id: jlpeelqvp + total_layers: 178 + 
job_id: jo5mzxndp job_status: Passed reference_device_info: name: Samsung Galaxy S23 @@ -70,13 +72,13 @@ models: os_name: Android manufacturer: Samsung chipset: Snapdragon® 8 Gen 2 - timestamp: '2024-04-23T18:42:31.576051Z' + timestamp: '2024-05-20T16:35:28.863583Z' - torchscript_onnx_tflite: - inference_time: 677141.0 - throughput: 1.476797299233099 + inference_time: 779484.0 + throughput: 1.2828999697235608 estimated_peak_memory_range: - min: 363802624 - max: 387318224 + min: 430981120 + max: 452244496 primary_compute_unit: CPU precision: fp32 layer_info: @@ -84,22 +86,22 @@ models: layers_on_gpu: 0 layers_on_cpu: 209 total_layers: 209 - job_id: j7gjzw9e5 + job_id: jqp4wrl8g job_status: Passed torchscript_onnx_ort: - inference_time: 10716749.0 - throughput: 0.09331188030997087 + inference_time: 10691874.0 + throughput: 0.09352897349893947 estimated_peak_memory_range: - min: 19521536 - max: 273877616 + min: 17801216 + max: 224185136 primary_compute_unit: NPU precision: fp16 layer_info: - layers_on_npu: 4 + layers_on_npu: 175 layers_on_gpu: 0 layers_on_cpu: 3 - total_layers: 7 - job_id: jygzo46x5 + total_layers: 178 + job_id: jegnev6kg job_status: Passed reference_device_info: name: Samsung Galaxy S24 @@ -108,13 +110,13 @@ models: os_name: Android manufacturer: Samsung chipset: Snapdragon® 8 Gen 3 - timestamp: '2024-04-23T18:42:31.576099Z' + timestamp: '2024-05-20T16:35:28.863604Z' - torchscript_onnx_tflite: - inference_time: 704020.0 - throughput: 1.4204141927786142 + inference_time: 727753.0 + throughput: 1.3740925836100986 estimated_peak_memory_range: - min: 321875968 - max: 538203832 + min: 235909120 + max: 447833184 primary_compute_unit: CPU precision: fp32 layer_info: @@ -122,7 +124,7 @@ models: layers_on_gpu: 0 layers_on_cpu: 209 total_layers: 209 - job_id: j1p80kyog + job_id: j0px1ok3g job_status: Passed reference_device_info: name: QCS8550 (Proxy) @@ -131,4 +133,42 @@ models: os_name: Android manufacturer: Qualcomm chipset: Qcs8550 - timestamp: '2024-04-23T18:42:31.576158Z' + timestamp: '2024-05-20T16:35:28.863616Z' + - torchscript_onnx_ort: + inference_time: 15602048.0 + throughput: 0.06409414969111747 + estimated_peak_memory_range: + min: 450560 + max: 450560 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 175 + layers_on_gpu: 0 + layers_on_cpu: 3 + total_layers: 178 + job_id: jopry3v0g + job_status: Passed + torchscript_onnx_ort_dml_gpu: + inference_time: 138131.0 + throughput: 7.239504528310082 + estimated_peak_memory_range: + min: 139943936 + max: 139943936 + primary_compute_unit: CPU + precision: fp32 + layer_info: + layers_on_npu: 0 + layers_on_gpu: 0 + layers_on_cpu: 107 + total_layers: 107 + job_id: jep2mykr5 + job_status: Passed + reference_device_info: + name: Snapdragon X Elite CRD + os: '11' + form_factor: Compute + os_name: Windows + manufacturer: Qualcomm + chipset: Snapdragon® X Elite + timestamp: '2024-05-20T16:35:28.863636Z' diff --git a/qai_hub_models/models/fastsam_s/README.md b/qai_hub_models/models/fastsam_s/README.md index 6d29272a..717a36e8 100644 --- a/qai_hub_models/models/fastsam_s/README.md +++ b/qai_hub_models/models/fastsam_s/README.md @@ -14,6 +14,8 @@ accross various devices, can be found [here](https://aihub.qualcomm.com/models/f a hosted Qualcomm® device. 
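The export.py hunks throughout this diff (detr_resnet*, efficientnet_b0, esrgan, and the fastsam models below) all repeat the same change: the `--force_channel_last_input` / `--force_channel_last_output` compile flags and the input/output transposes are now applied only when the target runtime is not ORT, since ONNX Runtime keeps channel-first I/O while QNN and TensorFlow Lite prefer channel-last. A minimal, self-contained sketch of that gating pattern is below; the `TargetRuntime` enum here is a stand-in mirroring the one referenced in the diff, and `build_compile_options` plus the NumPy transpose helpers are hypothetical illustrations, not the library's own utilities.

```python
# Sketch of the runtime-gated channel-last handling used in the export.py
# changes in this diff. TargetRuntime mirrors the enum named in the diff;
# build_compile_options and the transpose helpers are illustrative only.
from enum import Enum

import numpy as np


class TargetRuntime(Enum):
    TFLITE = "tflite"
    QNN = "qnn"
    ORT = "ort"


def build_compile_options(base_options: str, target_runtime: TargetRuntime,
                          input_name: str = "image") -> str:
    # QNN and TFLite prefer channel-last (NHWC) I/O, so the flag is appended
    # only for non-ORT targets; ONNX Runtime keeps channel-first (NCHW).
    channel_last_flags = (
        f" --force_channel_last_input {input_name}"
        if target_runtime != TargetRuntime.ORT
        else ""
    )
    return base_options + channel_last_flags


def to_runtime_layout(x: np.ndarray, target_runtime: TargetRuntime) -> np.ndarray:
    # Inputs start channel-first (N, C, H, W); transpose to (N, H, W, C)
    # only when the compiled model expects channel-last input.
    if target_runtime == TargetRuntime.ORT:
        return x
    return np.transpose(x, (0, 2, 3, 1))


if __name__ == "__main__":
    image = np.zeros((1, 3, 224, 224), dtype=np.float32)
    print(build_compile_options("", TargetRuntime.TFLITE))
    print(build_compile_options("", TargetRuntime.ORT))
    assert to_runtime_layout(image, TargetRuntime.QNN).shape == (1, 224, 224, 3)
    assert to_runtime_layout(image, TargetRuntime.ORT).shape == (1, 3, 224, 224)
```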
+ + ## Example & Usage Install the package via pip: diff --git a/qai_hub_models/models/fastsam_s/export.py b/qai_hub_models/models/fastsam_s/export.py index 1f5c1637..e808d3d9 100644 --- a/qai_hub_models/models/fastsam_s/export.py +++ b/qai_hub_models/models/fastsam_s/export.py @@ -122,12 +122,17 @@ def export_model( model.to("cpu"), make_torch_inputs(input_spec), check_trace=False ) + # Convert outputs from channel last to channel first (preferred I/O format for QNN and TensorFlow Lite) + channel_last_flags = ( + " --force_channel_last_input image" + + " --force_channel_last_output output_1,output_2,output_3,output_5" + if target_runtime != TargetRuntime.ORT + else "" + ) + # 2. Compile the model to an on-device asset model_compile_options = model.get_hub_compile_options( - target_runtime, - compile_options - + " --force_channel_last_input image" - + " --force_channel_last_output output_1,output_2,output_3,output_5", + target_runtime, compile_options + channel_last_flags, hub_device ) print(f"Optimizing model {model_name} to run on-device") submitted_compile_job = hub.submit_compile_job( @@ -165,8 +170,10 @@ def export_model( ) sample_inputs = model.sample_inputs(input_spec) # Convert inputs from channel first to channel last - hub_inputs = transpose_channel_first_to_last( - "image", sample_inputs, target_runtime + hub_inputs = ( + sample_inputs + if target_runtime == TargetRuntime.ORT + else transpose_channel_first_to_last("image", sample_inputs, target_runtime) ) submitted_inference_job = hub.submit_inference_job( model=compile_job.get_target_model(), @@ -194,8 +201,12 @@ def export_model( assert inference_job is not None and inference_job.wait().success inference_result: hub.client.DatasetEntries = inference_job.download_output_data() # type: ignore # Convert outputs from channel last to channel first - inference_result = transpose_channel_last_to_first( - "output_1,output_2,output_3,output_5", inference_result, target_runtime + inference_result = ( + inference_result + if target_runtime == TargetRuntime.ORT + else transpose_channel_last_to_first( + "output_1,output_2,output_3,output_5", inference_result, target_runtime + ) ) print_inference_metrics(inference_job, inference_result, torch_out) @@ -207,7 +218,7 @@ def export_model( def main(): warnings.filterwarnings("ignore") - parser = export_parser(model_cls=Model, supports_qnn=False, supports_ort=False) + parser = export_parser(model_cls=Model, supports_qnn=False) args = parser.parse_args() export_model(**vars(args)) diff --git a/qai_hub_models/models/fastsam_s/perf.yaml b/qai_hub_models/models/fastsam_s/perf.yaml index 345dbbcb..91456b39 100644 --- a/qai_hub_models/models/fastsam_s/perf.yaml +++ b/qai_hub_models/models/fastsam_s/perf.yaml @@ -22,6 +22,7 @@ aggregated: - Samsung Galaxy S24 Ultra - Samsung Galaxy S24+ - Samsung Galaxy Tab S8 + - Snapdragon X Elite CRD - Xiaomi 12 - Xiaomi 12 Pro supported_chipsets: @@ -30,15 +31,16 @@ aggregated: - Snapdragon® 8 Gen 2 - Snapdragon® 8 Gen 3 - Snapdragon® 888 + - Snapdragon® X Elite models: - name: FastSam-S performance_metrics: - torchscript_onnx_tflite: - inference_time: 8729.0 - throughput: 114.56065986940085 + inference_time: 8636.0 + throughput: 115.7943492357573 estimated_peak_memory_range: - min: 7823360 - max: 10576056 + min: 8404992 + max: 26145480 primary_compute_unit: NPU precision: fp16 layer_info: @@ -46,22 +48,37 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 288 - job_id: jmg9jxr85 + job_id: jqpyd318p + job_status: Passed + torchscript_onnx_qnn: + 
inference_time: 8361.0 + throughput: 119.60291831120679 + estimated_peak_memory_range: + min: 4947968 + max: 19891312 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 286 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 286 + job_id: jogkyxewp job_status: Passed torchscript_onnx_ort: - inference_time: 10386.0 - throughput: 96.28345850182939 + inference_time: 10837.0 + throughput: 92.27646027498385 estimated_peak_memory_range: - min: 20791296 - max: 84541352 + min: 21467136 + max: 77311024 primary_compute_unit: NPU precision: fp16 layer_info: - layers_on_npu: 1 + layers_on_npu: 289 layers_on_gpu: 0 layers_on_cpu: 0 - total_layers: 1 - job_id: jvgdezkz5 + total_layers: 289 + job_id: j1p3m0j3g job_status: Passed reference_device_info: name: Samsung Galaxy S23 @@ -70,13 +87,13 @@ models: os_name: Android manufacturer: Samsung chipset: Snapdragon® 8 Gen 2 - timestamp: '2024-04-23T18:42:31.594002Z' + timestamp: '2024-05-20T16:35:28.890936Z' - torchscript_onnx_tflite: - inference_time: 6438.0 - throughput: 155.32774153463808 + inference_time: 6531.0 + throughput: 153.11590874291838 estimated_peak_memory_range: - min: 6541312 - max: 77737344 + min: 5767168 + max: 76610048 primary_compute_unit: NPU precision: fp16 layer_info: @@ -84,22 +101,37 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 288 - job_id: jnp1yv97p + job_id: j2p0r0z9p + job_status: Passed + torchscript_onnx_qnn: + inference_time: 6171.0 + throughput: 162.04829039053638 + estimated_peak_memory_range: + min: 4952064 + max: 91897808 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 286 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 286 + job_id: jn5q2q6n5 job_status: Passed torchscript_onnx_ort: - inference_time: 7468.0 - throughput: 133.9046598821639 + inference_time: 7948.0 + throughput: 125.81781580271766 estimated_peak_memory_range: - min: 24322048 - max: 63913008 + min: 28004352 + max: 71806784 primary_compute_unit: NPU precision: fp16 layer_info: - layers_on_npu: 1 + layers_on_npu: 289 layers_on_gpu: 0 layers_on_cpu: 0 - total_layers: 1 - job_id: jz5707m9g + total_layers: 289 + job_id: jwgov62q5 job_status: Passed reference_device_info: name: Samsung Galaxy S24 @@ -108,13 +140,13 @@ models: os_name: Android manufacturer: Samsung chipset: Snapdragon® 8 Gen 3 - timestamp: '2024-04-23T18:42:31.594052Z' + timestamp: '2024-05-20T16:35:28.890962Z' - torchscript_onnx_tflite: - inference_time: 8739.0 - throughput: 114.42956860052638 + inference_time: 8645.0 + throughput: 115.6737998843262 estimated_peak_memory_range: - min: 7802880 - max: 25345168 + min: 7819264 + max: 25353920 primary_compute_unit: NPU precision: fp16 layer_info: @@ -122,7 +154,22 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 288 - job_id: jw56e0yyg + job_id: j1p87yqk5 + job_status: Passed + torchscript_onnx_qnn: + inference_time: 8210.0 + throughput: 121.8026796589525 + estimated_peak_memory_range: + min: 4984832 + max: 19259848 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 286 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 286 + job_id: jw5614y6p job_status: Passed reference_device_info: name: QCS8550 (Proxy) @@ -131,4 +178,57 @@ models: os_name: Android manufacturer: Qualcomm chipset: Qcs8550 - timestamp: '2024-04-23T18:42:31.594089Z' + timestamp: '2024-05-20T16:35:28.890979Z' + - torchscript_onnx_qnn: + inference_time: 9182.0 + throughput: 108.90873448050533 + estimated_peak_memory_range: + min: 4935680 + max: 4935680 + 
primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 286 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 286 + job_id: j1glkmvjp + job_status: Passed + torchscript_onnx_ort: + inference_time: 10779.0 + throughput: 92.77298450691158 + estimated_peak_memory_range: + min: 67710976 + max: 67710976 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 289 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 289 + job_id: j1pvwkqkg + job_status: Passed + torchscript_onnx_ort_dml_gpu: + inference_time: 62697.0 + throughput: 15.949726462191174 + estimated_peak_memory_range: + min: 70156288 + max: 70156288 + primary_compute_unit: CPU + precision: fp32 + layer_info: + layers_on_npu: 0 + layers_on_gpu: 0 + layers_on_cpu: 201 + total_layers: 201 + job_id: j7gjlndvp + job_status: Passed + reference_device_info: + name: Snapdragon X Elite CRD + os: '11' + form_factor: Compute + os_name: Windows + manufacturer: Qualcomm + chipset: Snapdragon® X Elite + timestamp: '2024-05-20T16:35:28.891004Z' diff --git a/qai_hub_models/models/fastsam_x/README.md b/qai_hub_models/models/fastsam_x/README.md index 8c7c2091..b3c84891 100644 --- a/qai_hub_models/models/fastsam_x/README.md +++ b/qai_hub_models/models/fastsam_x/README.md @@ -14,6 +14,8 @@ accross various devices, can be found [here](https://aihub.qualcomm.com/models/f a hosted Qualcomm® device. + + ## Example & Usage Install the package via pip: diff --git a/qai_hub_models/models/fastsam_x/export.py b/qai_hub_models/models/fastsam_x/export.py index 494a3229..b87735a2 100644 --- a/qai_hub_models/models/fastsam_x/export.py +++ b/qai_hub_models/models/fastsam_x/export.py @@ -122,12 +122,17 @@ def export_model( model.to("cpu"), make_torch_inputs(input_spec), check_trace=False ) + # Convert outputs from channel last to channel first (preferred I/O format for QNN and TensorFlow Lite) + channel_last_flags = ( + " --force_channel_last_input image" + + " --force_channel_last_output output_1,output_2,output_3,output_5" + if target_runtime != TargetRuntime.ORT + else "" + ) + # 2. 
Compile the model to an on-device asset model_compile_options = model.get_hub_compile_options( - target_runtime, - compile_options - + " --force_channel_last_input image" - + " --force_channel_last_output output_1,output_2,output_3,output_5", + target_runtime, compile_options + channel_last_flags, hub_device ) print(f"Optimizing model {model_name} to run on-device") submitted_compile_job = hub.submit_compile_job( @@ -165,8 +170,10 @@ def export_model( ) sample_inputs = model.sample_inputs(input_spec) # Convert inputs from channel first to channel last - hub_inputs = transpose_channel_first_to_last( - "image", sample_inputs, target_runtime + hub_inputs = ( + sample_inputs + if target_runtime == TargetRuntime.ORT + else transpose_channel_first_to_last("image", sample_inputs, target_runtime) ) submitted_inference_job = hub.submit_inference_job( model=compile_job.get_target_model(), @@ -194,8 +201,12 @@ def export_model( assert inference_job is not None and inference_job.wait().success inference_result: hub.client.DatasetEntries = inference_job.download_output_data() # type: ignore # Convert outputs from channel last to channel first - inference_result = transpose_channel_last_to_first( - "output_1,output_2,output_3,output_5", inference_result, target_runtime + inference_result = ( + inference_result + if target_runtime == TargetRuntime.ORT + else transpose_channel_last_to_first( + "output_1,output_2,output_3,output_5", inference_result, target_runtime + ) ) print_inference_metrics(inference_job, inference_result, torch_out) @@ -207,7 +218,7 @@ def export_model( def main(): warnings.filterwarnings("ignore") - parser = export_parser(model_cls=Model, supports_qnn=False, supports_ort=False) + parser = export_parser(model_cls=Model, supports_qnn=False) args = parser.parse_args() export_model(**vars(args)) diff --git a/qai_hub_models/models/fastsam_x/perf.yaml b/qai_hub_models/models/fastsam_x/perf.yaml index 4b8e861d..629b0462 100644 --- a/qai_hub_models/models/fastsam_x/perf.yaml +++ b/qai_hub_models/models/fastsam_x/perf.yaml @@ -22,6 +22,7 @@ aggregated: - Samsung Galaxy S24 Ultra - Samsung Galaxy S24+ - Samsung Galaxy Tab S8 + - Snapdragon X Elite CRD - Xiaomi 12 - Xiaomi 12 Pro supported_chipsets: @@ -30,15 +31,16 @@ aggregated: - Snapdragon® 8 Gen 2 - Snapdragon® 8 Gen 3 - Snapdragon® 888 + - Snapdragon® X Elite models: - name: FastSam-X performance_metrics: - torchscript_onnx_tflite: - inference_time: 50012.0 - throughput: 19.995201151723585 + inference_time: 49665.0 + throughput: 20.13490385583409 estimated_peak_memory_range: - min: 9154560 - max: 13813200 + min: 9117696 + max: 14327728 primary_compute_unit: NPU precision: fp16 layer_info: @@ -46,22 +48,37 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 420 - job_id: j0pxndql5 + job_id: jlpevmoo5 + job_status: Passed + torchscript_onnx_qnn: + inference_time: 46166.0 + throughput: 21.66096261317853 + estimated_peak_memory_range: + min: 4935680 + max: 20646312 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 418 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 418 + job_id: jmg94n0w5 job_status: Passed torchscript_onnx_ort: - inference_time: 50171.0 - throughput: 19.93183313069303 + inference_time: 50328.0 + throughput: 19.86965506278811 estimated_peak_memory_range: - min: 24637440 - max: 351124872 + min: 25731072 + max: 346581656 primary_compute_unit: NPU precision: fp16 layer_info: - layers_on_npu: 1 + layers_on_npu: 421 layers_on_gpu: 0 layers_on_cpu: 0 - total_layers: 1 - job_id: jegnl74q5 + 
total_layers: 421 + job_id: jmg94n085 job_status: Passed reference_device_info: name: Samsung Galaxy S23 @@ -70,13 +87,13 @@ models: os_name: Android manufacturer: Samsung chipset: Snapdragon® 8 Gen 2 - timestamp: '2024-04-23T18:42:31.611861Z' + timestamp: '2024-05-20T16:35:28.921155Z' - torchscript_onnx_tflite: - inference_time: 36802.0 - throughput: 27.172436280636923 + inference_time: 36007.0 + throughput: 27.772377593245757 estimated_peak_memory_range: - min: 8462336 - max: 149995872 + min: 73728 + max: 135466464 primary_compute_unit: NPU precision: fp16 layer_info: @@ -84,22 +101,37 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 420 - job_id: jo5mqd79p + job_id: jygz7d2op + job_status: Passed + torchscript_onnx_qnn: + inference_time: 34949.0 + throughput: 28.61312197773899 + estimated_peak_memory_range: + min: 4096000 + max: 127015584 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 418 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 418 + job_id: jnp18z28g job_status: Passed torchscript_onnx_ort: - inference_time: 36880.0 - throughput: 27.114967462039047 + inference_time: 36890.0 + throughput: 27.107617240444565 estimated_peak_memory_range: - min: 26107904 - max: 93739104 + min: 29392896 + max: 93988544 primary_compute_unit: NPU precision: fp16 layer_info: - layers_on_npu: 1 + layers_on_npu: 421 layers_on_gpu: 0 layers_on_cpu: 0 - total_layers: 1 - job_id: jopr8nr75 + total_layers: 421 + job_id: jnp18z27g job_status: Passed reference_device_info: name: Samsung Galaxy S24 @@ -108,13 +140,13 @@ models: os_name: Android manufacturer: Samsung chipset: Snapdragon® 8 Gen 3 - timestamp: '2024-04-23T18:42:31.611951Z' + timestamp: '2024-05-20T16:35:28.921181Z' - torchscript_onnx_tflite: - inference_time: 52081.0 - throughput: 19.200860198536894 + inference_time: 50541.0 + throughput: 19.785916384717357 estimated_peak_memory_range: - min: 9240576 - max: 13789008 + min: 9220096 + max: 14009928 primary_compute_unit: NPU precision: fp16 layer_info: @@ -122,7 +154,22 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 420 - job_id: jmg9jql85 + job_id: jz5w96w3p + job_status: Passed + torchscript_onnx_qnn: + inference_time: 45832.0 + throughput: 21.81881654739047 + estimated_peak_memory_range: + min: 4988928 + max: 21102120 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 418 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 418 + job_id: jz5w96wmp job_status: Passed reference_device_info: name: QCS8550 (Proxy) @@ -131,4 +178,57 @@ models: os_name: Android manufacturer: Qualcomm chipset: Qcs8550 - timestamp: '2024-04-23T18:42:31.612007Z' + timestamp: '2024-05-20T16:35:28.921198Z' + - torchscript_onnx_qnn: + inference_time: 57556.0 + throughput: 17.374383209396065 + estimated_peak_memory_range: + min: 4939776 + max: 4939776 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 418 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 418 + job_id: jvgdv1nrg + job_status: Passed + torchscript_onnx_ort: + inference_time: 49642.0 + throughput: 20.144232706176222 + estimated_peak_memory_range: + min: 36737024 + max: 36737024 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 421 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 421 + job_id: jvgdv1nzg + job_status: Passed + torchscript_onnx_ort_dml_gpu: + inference_time: 2190810.0 + throughput: 0.45645217978738456 + estimated_peak_memory_range: + min: 582156288 + max: 582156288 + primary_compute_unit: GPU + precision: 
fp16 + layer_info: + layers_on_npu: 0 + layers_on_gpu: 1 + layers_on_cpu: 0 + total_layers: 1 + job_id: jz57dr295 + job_status: Passed + reference_device_info: + name: Snapdragon X Elite CRD + os: '11' + form_factor: Compute + os_name: Windows + manufacturer: Qualcomm + chipset: Snapdragon® X Elite + timestamp: '2024-05-20T16:35:28.921219Z' diff --git a/qai_hub_models/models/fcn_resnet50/README.md b/qai_hub_models/models/fcn_resnet50/README.md index 674f6c47..c2af6df1 100644 --- a/qai_hub_models/models/fcn_resnet50/README.md +++ b/qai_hub_models/models/fcn_resnet50/README.md @@ -1,11 +1,11 @@ [![Qualcomm® AI Hub Models](https://qaihub-public-assets.s3.us-west-2.amazonaws.com/qai-hub-models/quic-logo.jpg)](../../README.md) -# [FCN_ResNet50: Fully-convolutional network model for image segmentation](https://aihub.qualcomm.com/models/fcn_resnet50) +# [FCN-ResNet50: Fully-convolutional network model for image segmentation](https://aihub.qualcomm.com/models/fcn_resnet50) FCN_ResNet50 is a machine learning model that can segment images from the COCO dataset. It uses ResNet50 as a backbone. -This is based on the implementation of FCN_ResNet50 found +This is based on the implementation of FCN-ResNet50 found [here](https://github.com/pytorch/vision/blob/main/torchvision/models/segmentation/fcn.py). This repository contains scripts for optimized on-device export suitable to run on Qualcomm® devices. More details on model performance accross various devices, can be found [here](https://aihub.qualcomm.com/models/fcn_resnet50). @@ -14,6 +14,8 @@ accross various devices, can be found [here](https://aihub.qualcomm.com/models/f a hosted Qualcomm® device. + + ## Example & Usage @@ -39,7 +41,7 @@ Additional options are documented with the `--help` option. Note that the above script requires access to Deployment instructions for Qualcomm® AI Hub. ## License -- The license for the original implementation of FCN_ResNet50 can be found +- The license for the original implementation of FCN-ResNet50 can be found [here](https://github.com/pytorch/vision/blob/main/LICENSE). - The license for the compiled assets for on-device deployment can be found [here]({deploy_license_url}) diff --git a/qai_hub_models/models/fcn_resnet50/app.py b/qai_hub_models/models/fcn_resnet50/app.py index f00519d8..08d0329a 100644 --- a/qai_hub_models/models/fcn_resnet50/app.py +++ b/qai_hub_models/models/fcn_resnet50/app.py @@ -15,7 +15,6 @@ from qai_hub_models.models.fcn_resnet50.model import NUM_CLASSES from qai_hub_models.utils.draw import create_color_map -from qai_hub_models.utils.image_processing import normalize_image_transform def preprocess_image(image: Image) -> torch.Tensor: @@ -30,13 +29,7 @@ def preprocess_image(image: Image) -> torch.Tensor: Returns: torch tensor to be directly passed to the model. """ - transform = transforms.Compose( - [ - transforms.ToTensor(), - normalize_image_transform(), - ] - ) - out_tensor: torch.Tensor = transform(image) # type: ignore + out_tensor: torch.Tensor = transforms.ToTensor()(image) # type: ignore return out_tensor.unsqueeze(0) diff --git a/qai_hub_models/models/fcn_resnet50/demo.py b/qai_hub_models/models/fcn_resnet50/demo.py index 6c12063c..2a997b45 100644 --- a/qai_hub_models/models/fcn_resnet50/demo.py +++ b/qai_hub_models/models/fcn_resnet50/demo.py @@ -2,6 +2,8 @@ # Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. 
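The app.py hunk just above strips the mean/std normalization out of `preprocess_image`, and the model.py hunk later in this patch applies `normalize_image_torchvision` inside `forward` instead, so exported assets take raw [0, 1] images. A minimal sketch of the resulting split, assuming the helper applies the standard torchvision ImageNet normalization:

```python
# Hedged sketch of the new preprocessing split (illustrative, not the literal repo code).
import torch
from PIL import Image
from torchvision import transforms


def preprocess_image(image: Image.Image) -> torch.Tensor:
    # App side: only convert to a [0, 1] float tensor and add a batch dimension;
    # no mean/std normalization here any more.
    return transforms.ToTensor()(image).unsqueeze(0)


def segment(model: torch.nn.Module, image: torch.Tensor) -> torch.Tensor:
    # Model side: normalization now happens inside the network's forward pass
    # (assumed ImageNet mean/std, matching torchvision's pretrained weights).
    normalize = transforms.Normalize(
        mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]
    )
    return model(normalize(image))["out"]
```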
# SPDX-License-Identifier: BSD-3-Clause # --------------------------------------------------------------------- +from typing import Type + from qai_hub_models.models.fcn_resnet50.app import FCN_ResNet50App from qai_hub_models.models.fcn_resnet50.model import ( MODEL_ASSET_VERSION, @@ -26,9 +28,9 @@ ) -def main(is_test: bool = False): +def fcn_resnet50_demo(model_cls: Type[FCN_ResNet50], is_test: bool = False): # Demo parameters - parser = get_model_cli_parser(FCN_ResNet50) + parser = get_model_cli_parser(model_cls) parser = get_on_device_demo_parser(parser, add_output_dir=True) parser.add_argument( "--image", @@ -39,12 +41,12 @@ def main(is_test: bool = False): args = parser.parse_args([] if is_test else None) validate_on_device_demo_args(args, MODEL_ID) - model = demo_model_from_cli_args(FCN_ResNet50, MODEL_ID, args) + model = demo_model_from_cli_args(model_cls, MODEL_ID, args) # This FCN ResNet 50 demo comes from # https://pytorch.org/hub/pytorch_vision_fcn_resnet101/ # load image - (_, _, height, width) = FCN_ResNet50.get_input_spec()["image"][0] + (_, _, height, width) = model_cls.get_input_spec()["image"][0] orig_image = load_image(args.image) image, scale, padding = pil_resize_pad(orig_image, (height, width)) input_image = image.convert("RGB") @@ -58,5 +60,9 @@ def main(is_test: bool = False): display_or_save_image(image_annotated, args.output_dir, "fcn_demo_output.png") +def main(is_test: bool = False): + return fcn_resnet50_demo(FCN_ResNet50, is_test=is_test) + + if __name__ == "__main__": main() diff --git a/qai_hub_models/models/fcn_resnet50/export.py b/qai_hub_models/models/fcn_resnet50/export.py index d0a84c70..9b549255 100644 --- a/qai_hub_models/models/fcn_resnet50/export.py +++ b/qai_hub_models/models/fcn_resnet50/export.py @@ -98,7 +98,7 @@ def export_model( if not can_access_qualcomm_ai_hub(): return export_without_hub_access( "fcn_resnet50", - "FCN_ResNet50", + "FCN-ResNet50", device, skip_profiling, skip_inferencing, @@ -120,12 +120,16 @@ def export_model( model.eval() source_model = torch.jit.trace(model.to("cpu"), make_torch_inputs(input_spec)) + # Convert outputs from channel last to channel first (preferred I/O format for QNN and TensorFlow Lite) + channel_last_flags = ( + " --force_channel_last_input image" + " --force_channel_last_output output_0" + if target_runtime != TargetRuntime.ORT + else "" + ) + # 2. 
Compile the model to an on-device asset model_compile_options = model.get_hub_compile_options( - target_runtime, - compile_options - + " --force_channel_last_input image" - + " --force_channel_last_output output_0", + target_runtime, compile_options + channel_last_flags, hub_device ) print(f"Optimizing model {model_name} to run on-device") submitted_compile_job = hub.submit_compile_job( @@ -163,8 +167,10 @@ def export_model( ) sample_inputs = model.sample_inputs(input_spec) # Convert inputs from channel first to channel last - hub_inputs = transpose_channel_first_to_last( - "image", sample_inputs, target_runtime + hub_inputs = ( + sample_inputs + if target_runtime == TargetRuntime.ORT + else transpose_channel_first_to_last("image", sample_inputs, target_runtime) ) submitted_inference_job = hub.submit_inference_job( model=compile_job.get_target_model(), @@ -192,8 +198,12 @@ def export_model( assert inference_job is not None and inference_job.wait().success inference_result: hub.client.DatasetEntries = inference_job.download_output_data() # type: ignore # Convert outputs from channel last to channel first - inference_result = transpose_channel_last_to_first( - "output_0", inference_result, target_runtime + inference_result = ( + inference_result + if target_runtime == TargetRuntime.ORT + else transpose_channel_last_to_first( + "output_0", inference_result, target_runtime + ) ) print_inference_metrics(inference_job, inference_result, torch_out) @@ -205,7 +215,7 @@ def export_model( def main(): warnings.filterwarnings("ignore") - parser = export_parser(model_cls=Model, supports_ort=False) + parser = export_parser(model_cls=Model) args = parser.parse_args() export_model(**vars(args)) diff --git a/qai_hub_models/models/fcn_resnet50/info.yaml b/qai_hub_models/models/fcn_resnet50/info.yaml index 87c73764..fcaeaafa 100644 --- a/qai_hub_models/models/fcn_resnet50/info.yaml +++ b/qai_hub_models/models/fcn_resnet50/info.yaml @@ -1,4 +1,4 @@ -name: FCN_ResNet50 +name: FCN-ResNet50 # id must match with the model dir name in qai_hub_models id: fcn_resnet50 status: public @@ -24,7 +24,7 @@ applicable_scenarios: - Inventory Management related_models: - sam - - unet_segmentation + - deeplabv3_plus_mobilenet - ddrnet23_slim form_factors: - Phone @@ -35,4 +35,5 @@ has_static_banner: yes has_animated_banner: no license_type: bsd-3-clause deploy_license_type: AI Model Hub License -dataset: [] +dataset: + - coco diff --git a/qai_hub_models/models/fcn_resnet50/model.py b/qai_hub_models/models/fcn_resnet50/model.py index 156e63ec..e336cecf 100644 --- a/qai_hub_models/models/fcn_resnet50/model.py +++ b/qai_hub_models/models/fcn_resnet50/model.py @@ -7,7 +7,10 @@ import torch import torchvision.models as tv_models +from qai_hub_models.evaluators.base_evaluators import BaseEvaluator +from qai_hub_models.evaluators.segmentation_evaluator import SegmentationOutputEvaluator from qai_hub_models.utils.base_model import BaseModel +from qai_hub_models.utils.image_processing import normalize_image_torchvision from qai_hub_models.utils.input_spec import InputSpec MODEL_ID = __name__.split(".")[-2] @@ -29,9 +32,13 @@ def __init__( @classmethod def from_pretrained(cls, weights: str = DEFAULT_WEIGHTS) -> FCN_ResNet50: model = tv_models.segmentation.fcn_resnet50(weights=weights).eval() + model.aux_classifier = None return cls(model) - def forward(self, image: torch.Tensor) -> torch.Tensor: + def get_evaluator(self) -> BaseEvaluator: + return SegmentationOutputEvaluator(NUM_CLASSES) + + def forward(self, image): """ Run 
FCN_ResNet50 on `image`, and produce a tensor of classes for segmentation @@ -43,14 +50,14 @@ def forward(self, image: torch.Tensor) -> torch.Tensor: Returns: tensor: 1x21xHxW tensor of class logits per pixel """ - return self.model(image)["out"] + return self.model(normalize_image_torchvision(image))["out"] @staticmethod def get_input_spec( batch_size: int = 1, num_channels: int = 3, - height: int = 224, - width: int = 224, + height: int = 512, + width: int = 512, ) -> InputSpec: # Get the input specification ordered (name -> (shape, type)) pairs for this model. # diff --git a/qai_hub_models/models/fcn_resnet50/perf.yaml b/qai_hub_models/models/fcn_resnet50/perf.yaml index 17a1dc92..23e73bfb 100644 --- a/qai_hub_models/models/fcn_resnet50/perf.yaml +++ b/qai_hub_models/models/fcn_resnet50/perf.yaml @@ -22,6 +22,7 @@ aggregated: - Samsung Galaxy S24 Ultra - Samsung Galaxy S24+ - Samsung Galaxy Tab S8 + - Snapdragon X Elite CRD - Xiaomi 12 - Xiaomi 12 Pro supported_chipsets: @@ -30,53 +31,54 @@ aggregated: - Snapdragon® 8 Gen 2 - Snapdragon® 8 Gen 3 - Snapdragon® 888 + - Snapdragon® X Elite models: -- name: FCN_ResNet50 +- name: FCN-ResNet50 performance_metrics: - torchscript_onnx_tflite: - inference_time: 8481.0 - throughput: 117.91062374719962 + inference_time: 42451.0 + throughput: 23.55657110550988 estimated_peak_memory_range: - min: 4251648 - max: 6673424 + min: 22093824 + max: 24844120 primary_compute_unit: NPU precision: fp16 layer_info: - layers_on_npu: 84 + layers_on_npu: 86 layers_on_gpu: 0 layers_on_cpu: 0 - total_layers: 84 - job_id: jqpyr7ll5 + total_layers: 86 + job_id: jqp4wrn1g job_status: Passed torchscript_onnx_qnn: - inference_time: 7915.0 - throughput: 126.34238787113077 + inference_time: 42160.0 + throughput: 23.719165085388994 estimated_peak_memory_range: - min: 32768 - max: 14371224 + min: 3166208 + max: 20971816 primary_compute_unit: NPU precision: fp16 layer_info: - layers_on_npu: 125 + layers_on_npu: 127 layers_on_gpu: 0 layers_on_cpu: 0 - total_layers: 125 - job_id: j1p804nog + total_layers: 127 + job_id: jegnev0qg job_status: Passed torchscript_onnx_ort: - inference_time: 434382.0 - throughput: 2.3021211744501384 + inference_time: 42833.0 + throughput: 23.346485186655148 estimated_peak_memory_range: - min: 229376 - max: 157385104 + min: 46034944 + max: 200591552 primary_compute_unit: NPU precision: fp16 layer_info: - layers_on_npu: 1 + layers_on_npu: 129 layers_on_gpu: 0 layers_on_cpu: 0 - total_layers: 1 - job_id: jn5qemno5 + total_layers: 129 + job_id: j2p0r04np job_status: Passed reference_device_info: name: Samsung Galaxy S23 @@ -85,51 +87,51 @@ models: os_name: Android manufacturer: Samsung chipset: Snapdragon® 8 Gen 2 - timestamp: '2024-04-23T18:42:31.629765Z' + timestamp: '2024-05-20T16:35:28.951494Z' - torchscript_onnx_tflite: - inference_time: 6385.0 - throughput: 156.61707126076743 + inference_time: 30899.0 + throughput: 32.363506909608724 estimated_peak_memory_range: - min: 4259840 - max: 81999104 + min: 20209664 + max: 155788144 primary_compute_unit: NPU precision: fp16 layer_info: - layers_on_npu: 84 + layers_on_npu: 86 layers_on_gpu: 0 layers_on_cpu: 0 - total_layers: 84 - job_id: j2p03vwnp + total_layers: 86 + job_id: j0px1o9lg job_status: Passed torchscript_onnx_qnn: - inference_time: 5804.0 - throughput: 172.2949689869056 + inference_time: 31911.0 + throughput: 31.337156466422236 estimated_peak_memory_range: - min: 618496 - max: 57524672 + min: 2564096 + max: 76317072 primary_compute_unit: NPU precision: fp16 layer_info: - layers_on_npu: 125 + 
layers_on_npu: 127 layers_on_gpu: 0 layers_on_cpu: 0 - total_layers: 125 - job_id: jogk791np + total_layers: 127 + job_id: jopry367g job_status: Passed torchscript_onnx_ort: - inference_time: 334126.0 - throughput: 2.9928829244057633 + inference_time: 32386.0 + throughput: 30.877539677638485 estimated_peak_memory_range: - min: 3608576 - max: 48710400 + min: 43917312 + max: 112401296 primary_compute_unit: NPU precision: fp16 layer_info: - layers_on_npu: 1 + layers_on_npu: 129 layers_on_gpu: 0 layers_on_cpu: 0 - total_layers: 1 - job_id: j1gl61jmg + total_layers: 129 + job_id: j1p87y2o5 job_status: Passed reference_device_info: name: Samsung Galaxy S24 @@ -138,36 +140,36 @@ models: os_name: Android manufacturer: Samsung chipset: Snapdragon® 8 Gen 3 - timestamp: '2024-04-23T18:42:31.629842Z' + timestamp: '2024-05-20T16:35:28.951519Z' - torchscript_onnx_tflite: - inference_time: 8533.0 - throughput: 117.19207781553968 + inference_time: 42178.0 + throughput: 23.709042628858647 estimated_peak_memory_range: - min: 4243456 - max: 6395552 + min: 18853888 + max: 20525048 primary_compute_unit: NPU precision: fp16 layer_info: - layers_on_npu: 84 + layers_on_npu: 86 layers_on_gpu: 0 layers_on_cpu: 0 - total_layers: 84 - job_id: jvgdemx65 + total_layers: 86 + job_id: jo5mzxe9p job_status: Passed torchscript_onnx_qnn: - inference_time: 7887.0 - throughput: 126.79092177000126 + inference_time: 42067.0 + throughput: 23.77160244372073 estimated_peak_memory_range: - min: 16384 - max: 14326120 + min: 3178496 + max: 20597416 primary_compute_unit: NPU precision: fp16 layer_info: - layers_on_npu: 125 + layers_on_npu: 127 layers_on_gpu: 0 layers_on_cpu: 0 - total_layers: 125 - job_id: jo5mqln7p + total_layers: 127 + job_id: jqpyd3zlp job_status: Passed reference_device_info: name: QCS8550 (Proxy) @@ -176,4 +178,57 @@ models: os_name: Android manufacturer: Qualcomm chipset: Qcs8550 - timestamp: '2024-04-23T18:42:31.629884Z' + timestamp: '2024-05-20T16:35:28.951537Z' + - torchscript_onnx_qnn: + inference_time: 68578.0 + throughput: 14.581935897809792 + estimated_peak_memory_range: + min: 3153920 + max: 3153920 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 127 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 127 + job_id: jep2myxq5 + job_status: Passed + torchscript_onnx_ort: + inference_time: 42426.0 + throughput: 23.57045208127092 + estimated_peak_memory_range: + min: 40243200 + max: 40243200 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 129 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 129 + job_id: jogkyxvnp + job_status: Passed + torchscript_onnx_ort_dml_gpu: + inference_time: 340971.0 + throughput: 2.932800736719545 + estimated_peak_memory_range: + min: 278179840 + max: 278179840 + primary_compute_unit: CPU + precision: fp32 + layer_info: + layers_on_npu: 0 + layers_on_gpu: 0 + layers_on_cpu: 59 + total_layers: 59 + job_id: jn5q2q0o5 + job_status: Passed + reference_device_info: + name: Snapdragon X Elite CRD + os: '11' + form_factor: Compute + os_name: Windows + manufacturer: Qualcomm + chipset: Snapdragon® X Elite + timestamp: '2024-05-20T16:35:28.951559Z' diff --git a/qai_hub_models/models/fcn_resnet50_quantized/README.md b/qai_hub_models/models/fcn_resnet50_quantized/README.md new file mode 100644 index 00000000..72302a12 --- /dev/null +++ b/qai_hub_models/models/fcn_resnet50_quantized/README.md @@ -0,0 +1,56 @@ +[![Qualcomm® AI Hub 
Models](https://qaihub-public-assets.s3.us-west-2.amazonaws.com/qai-hub-models/quic-logo.jpg)](../../README.md) + + +# [FCN-ResNet50-Quantized: Quantized fully-convolutional network model for image segmentation](https://aihub.qualcomm.com/models/fcn_resnet50_quantized) + +FCN_ResNet50 is a quantized machine learning model that can segment images from the COCO dataset. It uses ResNet50 as a backbone. + +This is based on the implementation of FCN-ResNet50-Quantized found +[here](https://github.com/pytorch/vision/blob/main/torchvision/models/segmentation/fcn.py). This repository contains scripts for optimized on-device +export suitable to run on Qualcomm® devices. More details on model performance +accross various devices, can be found [here](https://aihub.qualcomm.com/models/fcn_resnet50_quantized). + +[Sign up](https://myaccount.qualcomm.com/signup) for early access to run these models on +a hosted Qualcomm® device. + + + + +## Example & Usage + + +Once installed, run the following simple CLI demo: + +```bash +python -m qai_hub_models.models.fcn_resnet50_quantized.demo +``` +More details on the CLI tool can be found with the `--help` option. See +[demo.py](demo.py) for sample usage of the model including pre/post processing +scripts. Please refer to our [general instructions on using +models](../../../#getting-started) for more usage instructions. + +## Export for on-device deployment + +This repository contains export scripts that produce a model optimized for +on-device deployment. This can be run as follows: + +```bash +python -m qai_hub_models.models.fcn_resnet50_quantized.export +``` +Additional options are documented with the `--help` option. Note that the above +script requires access to Deployment instructions for Qualcomm® AI Hub. + +## License +- The license for the original implementation of FCN-ResNet50-Quantized can be found + [here](https://github.com/pytorch/vision/blob/main/LICENSE). +- The license for the compiled assets for on-device deployment can be found [here]({deploy_license_url}) + +## References +* [Fully Convolutional Networks for Semantic Segmentation](https://arxiv.org/abs/1411.4038) +* [Source Model Implementation](https://github.com/pytorch/vision/blob/main/torchvision/models/segmentation/fcn.py) + +## Community +* Join [our AI Hub Slack community](https://qualcomm-ai-hub.slack.com/join/shared_invite/zt-2d5zsmas3-Sj0Q9TzslueCjS31eXG2UA#/shared-invite/email) to collaborate, post questions and learn more about on-device AI. +* For questions or feedback please [reach out to us](mailto:ai-hub-support@qti.qualcomm.com). + + diff --git a/qai_hub_models/models/fcn_resnet50_quantized/__init__.py b/qai_hub_models/models/fcn_resnet50_quantized/__init__.py new file mode 100644 index 00000000..6f6e853c --- /dev/null +++ b/qai_hub_models/models/fcn_resnet50_quantized/__init__.py @@ -0,0 +1,8 @@ +# --------------------------------------------------------------------- +# Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. 
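The `__init__.py` introduced here (continued just below) follows the repository convention of re-exporting the concrete classes as `Model` and `App`, so the quantized variant is driven exactly like the FP32 one. A hedged usage sketch, mirroring the calls made in this package's test.py; the image path is hypothetical:

```python
# Hedged usage sketch; predict(image, True) mirrors the package's test.py and is
# assumed to return a per-pixel class mask.
from PIL import Image

from qai_hub_models.models.fcn_resnet50_quantized import App, Model

model = Model.from_pretrained()   # FCN_ResNet50Quantizable (AIMET QuantSim wrapper)
app = App(model)                  # FCN_ResNet50App, shared with the FP32 package
image = Image.open("input.jpg")   # hypothetical local image
mask = app.predict(image, True)
```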
+# SPDX-License-Identifier: BSD-3-Clause +# --------------------------------------------------------------------- +from qai_hub_models.models.fcn_resnet50.app import FCN_ResNet50App as App # noqa: F401 + +from .model import MODEL_ID # noqa: F401 +from .model import FCN_ResNet50Quantizable as Model # noqa: F401 diff --git a/qai_hub_models/models/fcn_resnet50_quantized/conftest.py b/qai_hub_models/models/fcn_resnet50_quantized/conftest.py new file mode 100644 index 00000000..8fd7c424 --- /dev/null +++ b/qai_hub_models/models/fcn_resnet50_quantized/conftest.py @@ -0,0 +1,39 @@ +# --------------------------------------------------------------------- +# Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. +# SPDX-License-Identifier: BSD-3-Clause +# --------------------------------------------------------------------- +# THIS FILE WAS AUTO-GENERATED. DO NOT EDIT MANUALLY. + +import inspect + +import pytest + +from qai_hub_models.models.fcn_resnet50_quantized import Model +from qai_hub_models.utils.testing import skip_clone_repo_check + + +# Instantiate the model only once for all tests. +# Mock from_pretrained to always return the initialized model. +# This speeds up tests and limits memory leaks. +@pytest.fixture(scope="module", autouse=True) +def cached_from_pretrained(): + with pytest.MonkeyPatch.context() as mp: + pretrained_cache = {} + from_pretrained = Model.from_pretrained + sig = inspect.signature(from_pretrained) + + @skip_clone_repo_check + def _cached_from_pretrained(*args, **kwargs): + cache_key = str(args) + str(kwargs) + model = pretrained_cache.get(cache_key, None) + if model: + return model + else: + model = from_pretrained(*args, **kwargs) + pretrained_cache[cache_key] = model + return model + + _cached_from_pretrained.__signature__ = sig + + mp.setattr(Model, "from_pretrained", _cached_from_pretrained) + yield mp diff --git a/qai_hub_models/models/fcn_resnet50_quantized/demo.py b/qai_hub_models/models/fcn_resnet50_quantized/demo.py new file mode 100644 index 00000000..cc6abc16 --- /dev/null +++ b/qai_hub_models/models/fcn_resnet50_quantized/demo.py @@ -0,0 +1,14 @@ +# --------------------------------------------------------------------- +# Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. +# SPDX-License-Identifier: BSD-3-Clause +# --------------------------------------------------------------------- +from qai_hub_models.models.fcn_resnet50.demo import fcn_resnet50_demo +from qai_hub_models.models.fcn_resnet50_quantized.model import FCN_ResNet50Quantizable + + +def main(is_test: bool = False): + fcn_resnet50_demo(FCN_ResNet50Quantizable, is_test) + + +if __name__ == "__main__": + main() diff --git a/qai_hub_models/models/fcn_resnet50_quantized/export.py b/qai_hub_models/models/fcn_resnet50_quantized/export.py new file mode 100644 index 00000000..cf0b371a --- /dev/null +++ b/qai_hub_models/models/fcn_resnet50_quantized/export.py @@ -0,0 +1,232 @@ +# --------------------------------------------------------------------- +# Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. +# SPDX-License-Identifier: BSD-3-Clause +# --------------------------------------------------------------------- +# THIS FILE WAS AUTO-GENERATED. DO NOT EDIT MANUALLY. 
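The auto-generated export script that follows can also be driven programmatically. A hedged sketch of a minimal call, using only parameters from the `export_model` signature below and assuming Qualcomm AI Hub access is configured:

```python
# Hedged sketch: programmatic use of the generated export entry point.
# Without AI Hub credentials the function falls back to export_without_hub_access.
from qai_hub_models.models.fcn_resnet50_quantized.export import export_model
from qai_hub_models.utils.base_model import TargetRuntime

jobs = export_model(
    device="Samsung Galaxy S23",          # default device in the signature below
    target_runtime=TargetRuntime.TFLITE,  # default runtime
    skip_profiling=True,                  # compile and run sample inference only
)
# With hub access, `jobs` is a (CompileJob, ProfileJob | None, InferenceJob | None) tuple.
```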
+ + +from __future__ import annotations + +import os +import warnings +from pathlib import Path +from typing import Any, Dict, List, Optional, Tuple, cast + +import qai_hub as hub + +from qai_hub_models.models.fcn_resnet50_quantized import Model +from qai_hub_models.utils.args import ( + export_parser, + get_input_spec_kwargs, + get_model_kwargs, +) +from qai_hub_models.utils.base_model import TargetRuntime +from qai_hub_models.utils.compare import torch_inference +from qai_hub_models.utils.printing import ( + print_inference_metrics, + print_on_target_demo_cmd, + print_profile_metrics_from_job, +) +from qai_hub_models.utils.qai_hub_helpers import ( + can_access_qualcomm_ai_hub, + export_without_hub_access, + transpose_channel_first_to_last, + transpose_channel_last_to_first, +) +from qai_hub_models.utils.qnn_helpers import get_qnn_inputs + + +def export_model( + device: str = "Samsung Galaxy S23", + chipset: Optional[str] = None, + skip_profiling: bool = False, + skip_inferencing: bool = False, + skip_downloading: bool = False, + skip_summary: bool = False, + output_dir: Optional[str] = None, + target_runtime: TargetRuntime = TargetRuntime.TFLITE, + compile_options: str = "", + profile_options: str = "", + **additional_model_kwargs, +) -> Tuple[hub.CompileJob, Optional[hub.ProfileJob], Optional[hub.InferenceJob]] | List[ + str +]: + """ + This function accomplishes 6 main tasks: + + 1. Instantiates a PyTorch model and converts it to a traced TorchScript format. + 2. Compiles the model to an asset that can be run on device. + 3. Profiles the model performance on real devices. + 4. Inferences the model on sample inputs. + 5. Downloads the model asset to the local directory. + 6. Summarizes the results from profiling and inference. + + Each of the last four steps can be optionally skipped using the input options. + + Parameters: + device: Device for which to export the model. + Full list of available devices can be found by running `hub.get_devices()`. + Defaults to DEFAULT_DEVICE if not specified. + chipset: If set, will choose a random device with this chipset. + Overrides the `device` argument. + skip_profiling: If set, skips profiling of compiled model on real devices. + skip_inferencing: If set, skips computing on-device outputs from sample data. + skip_downloading: If set, skips downloading of compiled model. + skip_summary: If set, skips waiting for and summarizing results + from profiling and inference. + output_dir: Directory to store generated assets (e.g. compiled model). + Defaults to `/build/`. + target_runtime: Which on-device runtime to target. Default is TFLite. + compile_options: Additional options to pass when submitting the compile job. + profile_options: Additional options to pass when submitting the profile job. + **additional_model_kwargs: Additional optional kwargs used to customize + `model_cls.from_pretrained` and `model.get_input_spec` + + Returns: + A 3-tuple of: + * A CompileJob object containing metadata about the compile job submitted to hub. + * A ProfileJob containing metadata about the profile job (None if profiling skipped). + * An InferenceJob containing metadata about the inference job (None if inferencing skipped). 
+ """ + model_name = "fcn_resnet50_quantized" + output_path = Path(output_dir or Path.cwd() / "build" / model_name) + if chipset: + hub_device = hub.Device(attributes=f"chipset:{chipset}") + else: + hub_device = hub.Device(name=device) + if not can_access_qualcomm_ai_hub(): + return export_without_hub_access( + "fcn_resnet50_quantized", + "FCN-ResNet50-Quantized", + device, + skip_profiling, + skip_inferencing, + skip_downloading, + skip_summary, + output_path, + target_runtime, + compile_options, + profile_options, + ) + + # 1. Initialize PyTorch model + model = Model.from_pretrained(**get_model_kwargs(Model, additional_model_kwargs)) + input_spec = model.get_input_spec( + **get_input_spec_kwargs(model, additional_model_kwargs) + ) + + # Trace the model + source_model = model.convert_to_hub_source_model( + target_runtime, output_path, input_spec + ) + if target_runtime == TargetRuntime.TFLITE: + quant_calibration_data = None + else: + quant_calibration_data = model.get_calibration_data(target_runtime, input_spec) + + # Convert outputs from channel last to channel first (preferred I/O format for QNN and TensorFlow Lite) + channel_last_flags = ( + " --force_channel_last_input image" + " --force_channel_last_output output_0" + if target_runtime != TargetRuntime.ORT + else "" + ) + + # 2. Compile the model to an on-device asset + model_compile_options = model.get_hub_compile_options( + target_runtime, compile_options + channel_last_flags, hub_device + ) + print(f"Optimizing model {model_name} to run on-device") + submitted_compile_job = hub.submit_compile_job( + model=source_model, + input_specs=input_spec, + device=hub_device, + name=model_name, + calibration_data=quant_calibration_data, + options=model_compile_options, + ) + compile_job = cast(hub.client.CompileJob, submitted_compile_job) + + # 3. Profile the model asset on real devices + profile_job: Optional[hub.client.ProfileJob] = None + if not skip_profiling: + profile_options_all = model.get_hub_profile_options( + target_runtime, profile_options + ) + print(f"Profiling model {model_name} on a hosted device.") + submitted_profile_job = hub.submit_profile_job( + model=compile_job.get_target_model(), + device=hub_device, + name=model_name, + options=profile_options_all, + ) + profile_job = cast(hub.client.ProfileJob, submitted_profile_job) + + # 4. Run inference on-device with sample inputs + inference_job: Optional[hub.client.InferenceJob] = None + if not skip_inferencing: + profile_options_all = model.get_hub_profile_options( + target_runtime, profile_options + ) + print( + f"Running inference for {model_name} on a hosted device with example inputs." + ) + sample_inputs = model.sample_inputs(input_spec) + hub_inputs = sample_inputs + if target_runtime == TargetRuntime.QNN: + hub_inputs = get_qnn_inputs(compile_job, sample_inputs) + # Convert inputs from channel first to channel last + hub_inputs = ( + sample_inputs + if target_runtime == TargetRuntime.ORT + else transpose_channel_first_to_last("image", sample_inputs, target_runtime) + ) + submitted_inference_job = hub.submit_inference_job( + model=compile_job.get_target_model(), + inputs=hub_inputs, + device=hub_device, + name=model_name, + options=profile_options_all, + ) + inference_job = cast(hub.client.InferenceJob, submitted_inference_job) + + # 5. 
Download the model asset to a local file + if not skip_downloading: + os.makedirs(output_path, exist_ok=True) + target_model: hub.Model = compile_job.get_target_model() # type: ignore + target_model.download(str(output_path / f"{model_name}.tflite")) + + # 6. Summarize the results from profiling and inference + if not skip_summary and not skip_profiling: + assert profile_job is not None and profile_job.wait().success + profile_data: Dict[str, Any] = profile_job.download_profile() # type: ignore + print_profile_metrics_from_job(profile_job, profile_data) + + if not skip_summary and not skip_inferencing: + torch_out = torch_inference(model, sample_inputs) + assert inference_job is not None and inference_job.wait().success + inference_result: hub.client.DatasetEntries = inference_job.download_output_data() # type: ignore + # Convert outputs from channel last to channel first + inference_result = ( + inference_result + if target_runtime == TargetRuntime.ORT + else transpose_channel_last_to_first( + "output_0", inference_result, target_runtime + ) + ) + print_inference_metrics(inference_job, inference_result, torch_out) + + if not skip_summary: + print_on_target_demo_cmd(compile_job, Path(__file__).parent.resolve(), device) + + return (compile_job, profile_job, inference_job) + + +def main(): + warnings.filterwarnings("ignore") + parser = export_parser(model_cls=Model) + args = parser.parse_args() + export_model(**vars(args)) + + +if __name__ == "__main__": + main() diff --git a/qai_hub_models/models/fcn_resnet50_quantized/info.yaml b/qai_hub_models/models/fcn_resnet50_quantized/info.yaml new file mode 100644 index 00000000..21939860 --- /dev/null +++ b/qai_hub_models/models/fcn_resnet50_quantized/info.yaml @@ -0,0 +1,41 @@ +name: FCN-ResNet50-Quantized +# id must match with the model dir name in qai_hub_models +id: fcn_resnet50_quantized +status: public +headline: Quantized fully-convolutional network model for image segmentation. +domain: Computer Vision +use_case: Semantic Segmentation +description: FCN_ResNet50 is a quantized machine learning model that can segment images from + the COCO dataset. It uses ResNet50 as a backbone. +tags: + - quantized +research_paper: https://arxiv.org/abs/1411.4038 +research_paper_title: Fully Convolutional Networks for Semantic Segmentation +license: https://github.com/pytorch/vision/blob/main/LICENSE +deploy_license: + https://qaihub-public-assets.s3.us-west-2.amazonaws.com/qai-hub-models/Qualcomm+AI+Hub+Proprietary+License.pdf +source_repo: + https://github.com/pytorch/vision/blob/main/torchvision/models/segmentation/fcn.py +technical_details: + Model checkpoint: COCO_WITH_VOC_LABELS_V1 + Input resolution: 512x512 + Number of parameters: 33.0M + Model size: 32.2 MB +applicable_scenarios: + - Anomaly Detection + - Inventory Management +related_models: + - sam + - deeplabv3_plus_mobilenet + - ddrnet23_slim +form_factors: + - Phone + - Tablet + - IoT + - XR +has_static_banner: yes +has_animated_banner: no +license_type: bsd-3-clause +deploy_license_type: AI Model Hub License +dataset: + - coco diff --git a/qai_hub_models/models/fcn_resnet50_quantized/model.py b/qai_hub_models/models/fcn_resnet50_quantized/model.py new file mode 100644 index 00000000..affc65ef --- /dev/null +++ b/qai_hub_models/models/fcn_resnet50_quantized/model.py @@ -0,0 +1,87 @@ +# --------------------------------------------------------------------- +# Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. 
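The same pattern recurs in every export script touched by this patch: the channel-last compile flags and the input/output transposes are applied only for non-ORT runtimes, since the ONNX Runtime path keeps the model's native channel-first layout. A condensed, hedged sketch of that control flow, reusing the helper names imported in the diffs above:

```python
# Hedged sketch of the runtime-conditional layout handling (illustrative only).
from qai_hub_models.utils.base_model import TargetRuntime
from qai_hub_models.utils.qai_hub_helpers import (
    transpose_channel_first_to_last,
    transpose_channel_last_to_first,
)


def layout_flags(target_runtime: TargetRuntime) -> str:
    # QNN and TFLite prefer channel-last I/O; ORT keeps channel-first.
    if target_runtime == TargetRuntime.ORT:
        return ""
    return " --force_channel_last_input image --force_channel_last_output output_0"


def to_hub_layout(sample_inputs, target_runtime: TargetRuntime):
    if target_runtime == TargetRuntime.ORT:
        return sample_inputs
    return transpose_channel_first_to_last("image", sample_inputs, target_runtime)


def from_hub_layout(inference_result, target_runtime: TargetRuntime):
    if target_runtime == TargetRuntime.ORT:
        return inference_result
    return transpose_channel_last_to_first("output_0", inference_result, target_runtime)
```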
+# SPDX-License-Identifier: BSD-3-Clause +# --------------------------------------------------------------------- +from __future__ import annotations + +# isort: off +# This verifies aimet is installed, and this must be included first. +from qai_hub_models.utils.quantization_aimet import ( + AIMETQuantizableMixin, +) + +# isort: on + +import torch +from aimet_torch.cross_layer_equalization import equalize_model +from aimet_torch.model_preparer import prepare_model +from aimet_torch.quantsim import QuantizationSimModel, load_encodings_to_sim + +from qai_hub_models.models.fcn_resnet50.model import FCN_ResNet50 +from qai_hub_models.utils.aimet.config_loader import get_default_aimet_config +from qai_hub_models.utils.asset_loaders import CachedWebModelAsset +from qai_hub_models.utils.quantization_aimet import ( + constrain_quantized_inputs_to_image_range, + tie_observers, +) + +MODEL_ID = __name__.split(".")[-2] +MODEL_ASSET_VERSION = 1 +DEFAULT_ENCODINGS = "fcn_resnet50_quantized_encodings.json" + + +class FCN_ResNet50Quantizable(AIMETQuantizableMixin, FCN_ResNet50): + """ + FCN_ResNet50 with post train quantization support. + + Supports only 8 bit weights and activations + """ + + def __init__( + self, + model: QuantizationSimModel, + ) -> None: + FCN_ResNet50.__init__(self, model.model) + AIMETQuantizableMixin.__init__(self, model) + + @classmethod + def from_pretrained( + cls, + aimet_encodings: str | None = "DEFAULT", + ) -> "FCN_ResNet50Quantizable": + # Load Model + fp16_model = FCN_ResNet50.from_pretrained() + input_shape = cls.get_input_spec()["image"][0] + + model = prepare_model(fp16_model) + equalize_model(model, input_shape) + + sim = QuantizationSimModel( + model, + quant_scheme="tf_enhanced", + default_param_bw=8, + default_output_bw=8, + config_file=get_default_aimet_config(), + dummy_input=torch.rand(input_shape), + ) + tie_observers(sim) + constrain_quantized_inputs_to_image_range(sim) + + if aimet_encodings: + if aimet_encodings == "DEFAULT": + aimet_encodings = CachedWebModelAsset.from_asset_store( + MODEL_ID, MODEL_ASSET_VERSION, DEFAULT_ENCODINGS + ).fetch() + load_encodings_to_sim(sim, aimet_encodings) + + sim.model.eval() + final_model = cls(sim) + return final_model + + def forward(self, image: torch.Tensor): + """ + Run FCN_ResNet50Quantizable on `image`, and produce a segmentation mask. + + See FCN_ResNet50 model for details. 
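The perf.yaml added below records `inference_time` in microseconds and `throughput` in inferences per second; assuming those units, the two are reciprocals, which gives a quick sanity check on any row. For example, using the Samsung Galaxy S23 TFLite entry below:

```python
# Sanity check of a perf.yaml row (assumption: inference_time is in microseconds).
inference_time_us = 14056.0                  # torchscript_onnx_tflite on Galaxy S23
throughput_ips = 1_000_000 / inference_time_us
print(f"{throughput_ips:.6f}")               # 71.143995, matching the recorded value
```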
+ """ + return self.model(image) diff --git a/qai_hub_models/models/fcn_resnet50_quantized/perf.yaml b/qai_hub_models/models/fcn_resnet50_quantized/perf.yaml new file mode 100644 index 00000000..dae0447f --- /dev/null +++ b/qai_hub_models/models/fcn_resnet50_quantized/perf.yaml @@ -0,0 +1,301 @@ +aggregated: + supported_oses: + - Android + supported_devices: + - Google Pixel 3 + - Google Pixel 3a + - Google Pixel 3a XL + - Google Pixel 4 + - Google Pixel 4a + - Google Pixel 5a 5G + - QCS6490 (Proxy) + - QCS8250 (Proxy) + - QCS8550 (Proxy) + - RB3 Gen 2 (Proxy) + - RB5 (Proxy) + - Samsung Galaxy S21 + - Samsung Galaxy S21 Ultra + - Samsung Galaxy S21+ + - Samsung Galaxy S22 5G + - Samsung Galaxy S22 Ultra 5G + - Samsung Galaxy S22+ 5G + - Samsung Galaxy S23 + - Samsung Galaxy S23 Ultra + - Samsung Galaxy S23+ + - Samsung Galaxy S24 + - Samsung Galaxy S24 Ultra + - Samsung Galaxy S24+ + - Samsung Galaxy Tab S8 + - Snapdragon X Elite CRD + - Xiaomi 12 + - Xiaomi 12 Pro + supported_chipsets: + - Qcs6490 + - Qcs8250 + - Qcs8550 + - Snapdragon® 8 Gen 1 + - Snapdragon® 8 Gen 2 + - Snapdragon® 8 Gen 3 + - Snapdragon® 888 + - Snapdragon® X Elite +models: +- name: FCN-ResNet50-Quantized + performance_metrics: + - torchscript_onnx_tflite: + inference_time: 14056.0 + throughput: 71.14399544678429 + estimated_peak_memory_range: + min: 5554176 + max: 7613336 + primary_compute_unit: NPU + precision: int8 + layer_info: + layers_on_npu: 87 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 87 + job_id: j1glkm6mp + job_status: Passed + torchscript_onnx_qnn: + inference_time: 15255.0 + throughput: 65.55227794165847 + estimated_peak_memory_range: + min: 16384 + max: 85850320 + primary_compute_unit: NPU + precision: int8 + layer_info: + layers_on_npu: 79 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 79 + job_id: jwgov6kk5 + job_status: Passed + torchscript_onnx_ort: + inference_time: 19290.0 + throughput: 51.84033177812338 + estimated_peak_memory_range: + min: 44077056 + max: 93926136 + primary_compute_unit: NPU + precision: int8 + layer_info: + layers_on_npu: 82 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 82 + job_id: jygz7doxp + job_status: Passed + reference_device_info: + name: Samsung Galaxy S23 + os: '13' + form_factor: Phone + os_name: Android + manufacturer: Samsung + chipset: Snapdragon® 8 Gen 2 + timestamp: '2024-05-20T16:35:28.981987Z' + - torchscript_onnx_tflite: + inference_time: 10013.0 + throughput: 99.87016878058525 + estimated_peak_memory_range: + min: 49152 + max: 82780048 + primary_compute_unit: NPU + precision: int8 + layer_info: + layers_on_npu: 87 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 87 + job_id: jw5614eyp + job_status: Passed + torchscript_onnx_qnn: + inference_time: 11218.0 + throughput: 89.14244963451596 + estimated_peak_memory_range: + min: 802816 + max: 56818672 + primary_compute_unit: NPU + precision: int8 + layer_info: + layers_on_npu: 79 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 79 + job_id: j1pvwk0rg + job_status: Passed + torchscript_onnx_ort: + inference_time: 14506.0 + throughput: 68.93699158968703 + estimated_peak_memory_range: + min: 48697344 + max: 95933808 + primary_compute_unit: NPU + precision: int8 + layer_info: + layers_on_npu: 82 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 82 + job_id: jz5w962mp + job_status: Passed + reference_device_info: + name: Samsung Galaxy S24 + os: '14' + form_factor: Phone + os_name: Android + manufacturer: Samsung + chipset: Snapdragon® 8 Gen 3 + timestamp: 
'2024-05-20T16:35:28.982012Z' + - torchscript_onnx_tflite: + inference_time: 14093.0 + throughput: 70.95721280068119 + estimated_peak_memory_range: + min: 5595136 + max: 7636376 + primary_compute_unit: NPU + precision: int8 + layer_info: + layers_on_npu: 87 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 87 + job_id: j1p3m0vng + job_status: Passed + torchscript_onnx_qnn: + inference_time: 15248.0 + throughput: 65.58237145855195 + estimated_peak_memory_range: + min: 16384 + max: 73538552 + primary_compute_unit: NPU + precision: int8 + layer_info: + layers_on_npu: 79 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 79 + job_id: jlpevmev5 + job_status: Passed + reference_device_info: + name: QCS8550 (Proxy) + os: '12' + form_factor: Iot + os_name: Android + manufacturer: Qualcomm + chipset: Qcs8550 + timestamp: '2024-05-20T16:35:28.982029Z' + - torchscript_onnx_tflite: + inference_time: 90967.0 + throughput: 10.992997460617586 + estimated_peak_memory_range: + min: 274432 + max: 138676000 + primary_compute_unit: NPU + precision: int8 + layer_info: + layers_on_npu: 87 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 87 + job_id: jqpy62q45 + job_status: Passed + torchscript_onnx_qnn: + inference_time: 93267.0 + throughput: 10.721905925997405 + estimated_peak_memory_range: + min: 905216 + max: 129785056 + primary_compute_unit: NPU + precision: int8 + layer_info: + layers_on_npu: 79 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 79 + job_id: jw56nmr7g + job_status: Passed + reference_device_info: + name: RB3 Gen 2 (Proxy) + os: '12' + form_factor: Iot + os_name: Android + manufacturer: Qualcomm + chipset: Qcs6490 + timestamp: '2024-05-20T16:35:28.982045Z' + - torchscript_onnx_tflite: + inference_time: 703201.0 + throughput: 1.422068512416791 + estimated_peak_memory_range: + min: 51548160 + max: 190297032 + primary_compute_unit: NPU + precision: int8 + layer_info: + layers_on_npu: 66 + layers_on_gpu: 9 + layers_on_cpu: 12 + total_layers: 87 + job_id: j2p0l9dep + job_status: Passed + reference_device_info: + name: RB5 (Proxy) + os: '12' + form_factor: Iot + os_name: Android + manufacturer: Qualcomm + chipset: Qcs8250 + timestamp: '2024-05-20T16:35:28.982055Z' + - torchscript_onnx_qnn: + inference_time: 16865.0 + throughput: 59.29439667951379 + estimated_peak_memory_range: + min: 786432 + max: 786432 + primary_compute_unit: NPU + precision: int8 + layer_info: + layers_on_npu: 79 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 79 + job_id: j7gjlnzep + job_status: Passed + torchscript_onnx_ort: + inference_time: 17493.0 + throughput: 57.16572343223004 + estimated_peak_memory_range: + min: 72589312 + max: 72589312 + primary_compute_unit: NPU + precision: int8 + layer_info: + layers_on_npu: 82 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 82 + job_id: jmg94nj85 + job_status: Passed + torchscript_onnx_ort_dml_gpu: + inference_time: 'null' + throughput: 'null' + estimated_peak_memory_range: + min: 0 + max: 0 + primary_compute_unit: 'null' + precision: 'null' + layer_info: + layers_on_npu: 0 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 0 + job_id: jnp18zy7g + job_status: Failed + reference_device_info: + name: Snapdragon X Elite CRD + os: '11' + form_factor: Compute + os_name: Windows + manufacturer: Qualcomm + chipset: Snapdragon® X Elite + timestamp: '2024-05-20T16:35:28.982080Z' diff --git a/qai_hub_models/models/fcn_resnet50_quantized/test.py b/qai_hub_models/models/fcn_resnet50_quantized/test.py new file mode 100644 index 00000000..d5dae110 --- /dev/null 
+++ b/qai_hub_models/models/fcn_resnet50_quantized/test.py @@ -0,0 +1,40 @@ +# --------------------------------------------------------------------- +# Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. +# SPDX-License-Identifier: BSD-3-Clause +# --------------------------------------------------------------------- +import torch + +from qai_hub_models.models.fcn_resnet50.app import FCN_ResNet50App +from qai_hub_models.models.fcn_resnet50.demo import INPUT_IMAGE_ADDRESS +from qai_hub_models.models.fcn_resnet50_quantized.demo import main as demo_main +from qai_hub_models.models.fcn_resnet50_quantized.model import ( + MODEL_ASSET_VERSION, + MODEL_ID, + FCN_ResNet50Quantizable, +) +from qai_hub_models.utils.asset_loaders import ( + CachedWebModelAsset, + load_image, + load_numpy, +) +from qai_hub_models.utils.testing import skip_clone_repo_check + +OUTPUT_IMAGE_MASK = CachedWebModelAsset.from_asset_store( + MODEL_ID, MODEL_ASSET_VERSION, "fcn_resnet50_output_mask.npy" +) + + +@skip_clone_repo_check +def test_task(): + # AIMET Quantization Simulator introduces randomness. Eliminate that for this test. + torch.manual_seed(0) + image = load_image(INPUT_IMAGE_ADDRESS) + app = FCN_ResNet50App(FCN_ResNet50Quantizable.from_pretrained()) + output_mask = app.predict(image, True) + output_mask_gt = load_numpy(OUTPUT_IMAGE_MASK) + assert (output_mask == output_mask_gt).mean() > 0.95 + + +@skip_clone_repo_check +def test_demo(): + demo_main(is_test=True) diff --git a/qai_hub_models/models/ffnet_122ns_lowres/README.md b/qai_hub_models/models/ffnet_122ns_lowres/README.md index e4102369..6d34b21a 100644 --- a/qai_hub_models/models/ffnet_122ns_lowres/README.md +++ b/qai_hub_models/models/ffnet_122ns_lowres/README.md @@ -14,6 +14,8 @@ accross various devices, can be found [here](https://aihub.qualcomm.com/models/f a hosted Qualcomm® device. + + ## Example & Usage Install the package via pip: diff --git a/qai_hub_models/models/ffnet_122ns_lowres/export.py b/qai_hub_models/models/ffnet_122ns_lowres/export.py index 68bd1824..1bb5ca1b 100644 --- a/qai_hub_models/models/ffnet_122ns_lowres/export.py +++ b/qai_hub_models/models/ffnet_122ns_lowres/export.py @@ -119,12 +119,16 @@ def export_model( model.eval() source_model = torch.jit.trace(model.to("cpu"), make_torch_inputs(input_spec)) + # Convert outputs from channel last to channel first (preferred I/O format for QNN and TensorFlow Lite) + channel_last_flags = ( + " --force_channel_last_input image" + " --force_channel_last_output output_0" + if target_runtime != TargetRuntime.ORT + else "" + ) + # 2. 
Compile the model to an on-device asset model_compile_options = model.get_hub_compile_options( - target_runtime, - compile_options - + " --force_channel_last_input image" - + " --force_channel_last_output output_0", + target_runtime, compile_options + channel_last_flags, hub_device ) print(f"Optimizing model {model_name} to run on-device") submitted_compile_job = hub.submit_compile_job( @@ -162,8 +166,10 @@ def export_model( ) sample_inputs = model.sample_inputs(input_spec) # Convert inputs from channel first to channel last - hub_inputs = transpose_channel_first_to_last( - "image", sample_inputs, target_runtime + hub_inputs = ( + sample_inputs + if target_runtime == TargetRuntime.ORT + else transpose_channel_first_to_last("image", sample_inputs, target_runtime) ) submitted_inference_job = hub.submit_inference_job( model=compile_job.get_target_model(), @@ -191,8 +197,12 @@ def export_model( assert inference_job is not None and inference_job.wait().success inference_result: hub.client.DatasetEntries = inference_job.download_output_data() # type: ignore # Convert outputs from channel last to channel first - inference_result = transpose_channel_last_to_first( - "output_0", inference_result, target_runtime + inference_result = ( + inference_result + if target_runtime == TargetRuntime.ORT + else transpose_channel_last_to_first( + "output_0", inference_result, target_runtime + ) ) print_inference_metrics(inference_job, inference_result, torch_out) @@ -201,7 +211,7 @@ def export_model( def main(): warnings.filterwarnings("ignore") - parser = export_parser(model_cls=Model, supports_ort=False) + parser = export_parser(model_cls=Model) args = parser.parse_args() export_model(**vars(args)) diff --git a/qai_hub_models/models/ffnet_122ns_lowres/perf.yaml b/qai_hub_models/models/ffnet_122ns_lowres/perf.yaml index 66734ffc..af26da00 100644 --- a/qai_hub_models/models/ffnet_122ns_lowres/perf.yaml +++ b/qai_hub_models/models/ffnet_122ns_lowres/perf.yaml @@ -22,6 +22,7 @@ aggregated: - Samsung Galaxy S24 Ultra - Samsung Galaxy S24+ - Samsung Galaxy Tab S8 + - Snapdragon X Elite CRD - Xiaomi 12 - Xiaomi 12 Pro supported_chipsets: @@ -30,15 +31,16 @@ aggregated: - Snapdragon® 8 Gen 2 - Snapdragon® 8 Gen 3 - Snapdragon® 888 + - Snapdragon® X Elite models: - name: FFNet-122NS-LowRes performance_metrics: - torchscript_onnx_tflite: - inference_time: 9669.0 - throughput: 103.42331161443789 + inference_time: 9717.0 + throughput: 102.91242152927859 estimated_peak_memory_range: - min: 675840 - max: 2991672 + min: 651264 + max: 3155872 primary_compute_unit: NPU precision: fp16 layer_info: @@ -46,14 +48,14 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 216 - job_id: j1p3vwyng + job_id: jvgdv1ezg job_status: Passed torchscript_onnx_qnn: - inference_time: 10768.0 - throughput: 92.86775631500743 + inference_time: 10869.0 + throughput: 92.00478424878094 estimated_peak_memory_range: - min: 6320128 - max: 41702576 + min: 8364032 + max: 43265120 primary_compute_unit: NPU precision: fp16 layer_info: @@ -61,22 +63,22 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 348 - job_id: j1pv09jr5 + job_id: j0px1onlg job_status: Passed torchscript_onnx_ort: - inference_time: 7374.0 - throughput: 135.61160835367508 + inference_time: 7858.0 + throughput: 127.25884448969204 estimated_peak_memory_range: - min: 1433600 - max: 142206056 + min: 2232320 + max: 141084128 primary_compute_unit: NPU precision: fp16 layer_info: - layers_on_npu: 1 + layers_on_npu: 350 layers_on_gpu: 0 layers_on_cpu: 0 - total_layers: 1 - job_id: 
jlpeeljvp + total_layers: 350 + job_id: jep2my0q5 job_status: Passed reference_device_info: name: Samsung Galaxy S23 @@ -85,13 +87,13 @@ models: os_name: Android manufacturer: Samsung chipset: Snapdragon® 8 Gen 2 - timestamp: '2024-04-23T18:42:31.654178Z' + timestamp: '2024-05-20T16:35:29.021584Z' - torchscript_onnx_tflite: - inference_time: 6839.0 - throughput: 146.22020763269484 + inference_time: 6794.0 + throughput: 147.18869590815424 estimated_peak_memory_range: - min: 569344 - max: 59671696 + min: 303104 + max: 60447344 primary_compute_unit: NPU precision: fp16 layer_info: @@ -99,14 +101,14 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 216 - job_id: jwgok4jkp + job_id: jz57dr095 job_status: Passed torchscript_onnx_qnn: - inference_time: 7605.0 - throughput: 131.49243918474687 + inference_time: 7585.0 + throughput: 131.83915622940015 estimated_peak_memory_range: min: 6307840 - max: 88354272 + max: 88988128 primary_compute_unit: NPU precision: fp16 layer_info: @@ -114,22 +116,22 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 348 - job_id: j7gjzwje5 + job_id: jo5mzxq9p job_status: Passed torchscript_onnx_ort: - inference_time: 5809.0 - throughput: 172.14666896195558 + inference_time: 5761.0 + throughput: 173.58097552508247 estimated_peak_memory_range: - min: 61464576 - max: 106276496 + min: 5238784 + max: 60652944 primary_compute_unit: NPU precision: fp16 layer_info: - layers_on_npu: 1 + layers_on_npu: 350 layers_on_gpu: 0 layers_on_cpu: 0 - total_layers: 1 - job_id: jygzo41x5 + total_layers: 350 + job_id: jqpyd3rlp job_status: Passed reference_device_info: name: Samsung Galaxy S24 @@ -138,13 +140,13 @@ models: os_name: Android manufacturer: Samsung chipset: Snapdragon® 8 Gen 3 - timestamp: '2024-04-23T18:42:31.654276Z' + timestamp: '2024-05-20T16:35:29.021611Z' - torchscript_onnx_tflite: - inference_time: 9658.0 - throughput: 103.54110581901014 + inference_time: 9668.0 + throughput: 103.4340091021928 estimated_peak_memory_range: - min: 0 - max: 4034800 + min: 651264 + max: 2883976 primary_compute_unit: NPU precision: fp16 layer_info: @@ -152,14 +154,14 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 216 - job_id: jqpyry105 + job_id: jqp4wrk1g job_status: Passed torchscript_onnx_qnn: - inference_time: 10822.0 - throughput: 92.40436148586214 + inference_time: 10900.0 + throughput: 91.74311926605505 estimated_peak_memory_range: - min: 6328320 - max: 38539008 + min: 6332416 + max: 40664968 primary_compute_unit: NPU precision: fp16 layer_info: @@ -167,7 +169,7 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 348 - job_id: jn5qed0e5 + job_id: jopry387g job_status: Passed reference_device_info: name: QCS8550 (Proxy) @@ -176,4 +178,57 @@ models: os_name: Android manufacturer: Qualcomm chipset: Qcs8550 - timestamp: '2024-04-23T18:42:31.654354Z' + timestamp: '2024-05-20T16:35:29.021627Z' + - torchscript_onnx_qnn: + inference_time: 17551.0 + throughput: 56.976810438151674 + estimated_peak_memory_range: + min: 6303744 + max: 6303744 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 348 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 348 + job_id: jegnevlqg + job_status: Passed + torchscript_onnx_ort: + inference_time: 7536.0 + throughput: 132.6963906581741 + estimated_peak_memory_range: + min: 6365184 + max: 6365184 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 350 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 350 + job_id: j2p0r03np + job_status: Passed + 
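A quick note for reading the perf.yaml hunks in this diff: `inference_time` is the per-inference latency in microseconds and `throughput` is simply its reciprocal in inferences per second. A minimal sketch of that relationship (the helper name is illustrative, not part of the repository):

```python
# Illustrative only: perf.yaml pairs inference_time (microseconds) with
# throughput (inferences per second); the two fields are reciprocals.
def throughput_from_inference_time_us(inference_time_us: float) -> float:
    return 1_000_000.0 / inference_time_us

# Example: the Snapdragon X Elite torchscript_onnx_qnn entry above reports
# inference_time 17551.0 us, i.e. roughly 56.98 inferences per second.
print(throughput_from_inference_time_us(17551.0))  # 56.976810438151674
```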
torchscript_onnx_ort_dml_gpu: + inference_time: 38423.0 + throughput: 26.026078130286546 + estimated_peak_memory_range: + min: 6307840 + max: 6307840 + primary_compute_unit: CPU + precision: fp32 + layer_info: + layers_on_npu: 0 + layers_on_gpu: 0 + layers_on_cpu: 151 + total_layers: 151 + job_id: j1p87y0o5 + job_status: Passed + reference_device_info: + name: Snapdragon X Elite CRD + os: '11' + form_factor: Compute + os_name: Windows + manufacturer: Qualcomm + chipset: Snapdragon® X Elite + timestamp: '2024-05-20T16:35:29.021650Z' diff --git a/qai_hub_models/models/ffnet_40s/README.md b/qai_hub_models/models/ffnet_40s/README.md index 1e3b56a5..f1911ec4 100644 --- a/qai_hub_models/models/ffnet_40s/README.md +++ b/qai_hub_models/models/ffnet_40s/README.md @@ -14,6 +14,8 @@ accross various devices, can be found [here](https://aihub.qualcomm.com/models/f a hosted Qualcomm® device. + + ## Example & Usage Install the package via pip: diff --git a/qai_hub_models/models/ffnet_40s/export.py b/qai_hub_models/models/ffnet_40s/export.py index 8fe5c587..fd46c18f 100644 --- a/qai_hub_models/models/ffnet_40s/export.py +++ b/qai_hub_models/models/ffnet_40s/export.py @@ -119,12 +119,16 @@ def export_model( model.eval() source_model = torch.jit.trace(model.to("cpu"), make_torch_inputs(input_spec)) + # Convert outputs from channel last to channel first (preferred I/O format for QNN and TensorFlow Lite) + channel_last_flags = ( + " --force_channel_last_input image" + " --force_channel_last_output output_0" + if target_runtime != TargetRuntime.ORT + else "" + ) + # 2. Compile the model to an on-device asset model_compile_options = model.get_hub_compile_options( - target_runtime, - compile_options - + " --force_channel_last_input image" - + " --force_channel_last_output output_0", + target_runtime, compile_options + channel_last_flags, hub_device ) print(f"Optimizing model {model_name} to run on-device") submitted_compile_job = hub.submit_compile_job( @@ -162,8 +166,10 @@ def export_model( ) sample_inputs = model.sample_inputs(input_spec) # Convert inputs from channel first to channel last - hub_inputs = transpose_channel_first_to_last( - "image", sample_inputs, target_runtime + hub_inputs = ( + sample_inputs + if target_runtime == TargetRuntime.ORT + else transpose_channel_first_to_last("image", sample_inputs, target_runtime) ) submitted_inference_job = hub.submit_inference_job( model=compile_job.get_target_model(), @@ -191,8 +197,12 @@ def export_model( assert inference_job is not None and inference_job.wait().success inference_result: hub.client.DatasetEntries = inference_job.download_output_data() # type: ignore # Convert outputs from channel last to channel first - inference_result = transpose_channel_last_to_first( - "output_0", inference_result, target_runtime + inference_result = ( + inference_result + if target_runtime == TargetRuntime.ORT + else transpose_channel_last_to_first( + "output_0", inference_result, target_runtime + ) ) print_inference_metrics(inference_job, inference_result, torch_out) @@ -201,7 +211,7 @@ def export_model( def main(): warnings.filterwarnings("ignore") - parser = export_parser(model_cls=Model, supports_ort=False) + parser = export_parser(model_cls=Model) args = parser.parse_args() export_model(**vars(args)) diff --git a/qai_hub_models/models/ffnet_40s/perf.yaml b/qai_hub_models/models/ffnet_40s/perf.yaml index ee5c5c18..d8dea744 100644 --- a/qai_hub_models/models/ffnet_40s/perf.yaml +++ b/qai_hub_models/models/ffnet_40s/perf.yaml @@ -22,6 +22,7 @@ aggregated: - Samsung 
Galaxy S24 Ultra - Samsung Galaxy S24+ - Samsung Galaxy Tab S8 + - Snapdragon X Elite CRD - Xiaomi 12 - Xiaomi 12 Pro supported_chipsets: @@ -30,15 +31,16 @@ aggregated: - Snapdragon® 8 Gen 2 - Snapdragon® 8 Gen 3 - Snapdragon® 888 + - Snapdragon® X Elite models: - name: FFNet-40S performance_metrics: - torchscript_onnx_tflite: - inference_time: 23048.0 - throughput: 43.38771259979174 + inference_time: 23181.0 + throughput: 43.138777447047154 estimated_peak_memory_range: - min: 0 - max: 30911488 + min: 2527232 + max: 5196976 primary_compute_unit: NPU precision: fp16 layer_info: @@ -46,14 +48,14 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 92 - job_id: jmg9jx685 + job_id: jogkyx7np job_status: Passed torchscript_onnx_qnn: - inference_time: 17363.0 - throughput: 57.59373380176237 + inference_time: 17245.0 + throughput: 57.98782255726297 estimated_peak_memory_range: - min: 25214976 - max: 44166488 + min: 1662976 + max: 17190312 primary_compute_unit: NPU precision: fp16 layer_info: @@ -61,22 +63,22 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 140 - job_id: jvgdezjz5 + job_id: jw5614zyp job_status: Passed torchscript_onnx_ort: - inference_time: 28590.0 - throughput: 34.97726477789437 + inference_time: 27135.0 + throughput: 36.852773171181134 estimated_peak_memory_range: - min: 30191616 - max: 118917360 + min: 33619968 + max: 118794368 primary_compute_unit: NPU precision: fp16 layer_info: - layers_on_npu: 1 + layers_on_npu: 142 layers_on_gpu: 0 layers_on_cpu: 0 - total_layers: 1 - job_id: jmg9jx6m5 + total_layers: 142 + job_id: j7gjln2ep job_status: Passed reference_device_info: name: Samsung Galaxy S23 @@ -85,13 +87,13 @@ models: os_name: Android manufacturer: Samsung chipset: Snapdragon® 8 Gen 2 - timestamp: '2024-04-23T18:42:31.678643Z' + timestamp: '2024-05-20T16:35:29.051849Z' - torchscript_onnx_tflite: - inference_time: 16867.0 - throughput: 59.28736586233474 + inference_time: 16628.0 + throughput: 60.13952369497233 estimated_peak_memory_range: - min: 32768 - max: 105460576 + min: 65536 + max: 96903808 primary_compute_unit: NPU precision: fp16 layer_info: @@ -99,14 +101,14 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 92 - job_id: jnp1yvr7p + job_id: jn5q2qeo5 job_status: Passed torchscript_onnx_qnn: - inference_time: 12552.0 - throughput: 79.66857871255577 + inference_time: 12571.0 + throughput: 79.54816641476414 estimated_peak_memory_range: - min: 25202688 - max: 84533840 + min: 25198592 + max: 80803488 primary_compute_unit: NPU precision: fp16 layer_info: @@ -114,22 +116,22 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 140 - job_id: jz5w21j45 + job_id: j1p3m01ng job_status: Passed torchscript_onnx_ort: - inference_time: 20354.0 - throughput: 49.13039206052864 + inference_time: 19730.0 + throughput: 50.68423720223011 estimated_peak_memory_range: - min: 352256 - max: 45279760 + min: 32903168 + max: 79929760 primary_compute_unit: NPU precision: fp16 layer_info: - layers_on_npu: 1 + layers_on_npu: 142 layers_on_gpu: 0 layers_on_cpu: 0 - total_layers: 1 - job_id: jnp1yvrnp + total_layers: 142 + job_id: jlpevmwv5 job_status: Passed reference_device_info: name: Samsung Galaxy S24 @@ -138,13 +140,13 @@ models: os_name: Android manufacturer: Samsung chipset: Snapdragon® 8 Gen 3 - timestamp: '2024-04-23T18:42:31.678725Z' + timestamp: '2024-05-20T16:35:29.051876Z' - torchscript_onnx_tflite: - inference_time: 22456.0 - throughput: 44.53152832205201 + inference_time: 23514.0 + throughput: 42.527855745513314 estimated_peak_memory_range: - min: 
32768 - max: 1647568 + min: 2555904 + max: 4820560 primary_compute_unit: NPU precision: fp16 layer_info: @@ -152,14 +154,14 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 92 - job_id: jygzo0245 + job_id: j1glkm2mp job_status: Passed torchscript_onnx_qnn: - inference_time: 17241.0 - throughput: 58.00127602807262 + inference_time: 17349.0 + throughput: 57.64020981036371 estimated_peak_memory_range: - min: 25214976 - max: 52246888 + min: 25227264 + max: 46301352 primary_compute_unit: NPU precision: fp16 layer_info: @@ -167,7 +169,7 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 140 - job_id: jvgdemn65 + job_id: j1pvwkrrg job_status: Passed reference_device_info: name: QCS8550 (Proxy) @@ -176,4 +178,57 @@ models: os_name: Android manufacturer: Qualcomm chipset: Qcs8550 - timestamp: '2024-04-23T18:42:31.678770Z' + timestamp: '2024-05-20T16:35:29.051893Z' + - torchscript_onnx_qnn: + inference_time: 23285.0 + throughput: 42.94610264118531 + estimated_peak_memory_range: + min: 25214976 + max: 25214976 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 140 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 140 + job_id: jwgov6nk5 + job_status: Passed + torchscript_onnx_ort: + inference_time: 26395.0 + throughput: 37.885963250615646 + estimated_peak_memory_range: + min: 25223168 + max: 25223168 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 142 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 142 + job_id: jygz7djxp + job_status: Passed + torchscript_onnx_ort_dml_gpu: + inference_time: 143723.0 + throughput: 6.957828600850247 + estimated_peak_memory_range: + min: 208834560 + max: 208834560 + primary_compute_unit: CPU + precision: fp32 + layer_info: + layers_on_npu: 0 + layers_on_gpu: 0 + layers_on_cpu: 67 + total_layers: 67 + job_id: jz5w963mp + job_status: Passed + reference_device_info: + name: Snapdragon X Elite CRD + os: '11' + form_factor: Compute + os_name: Windows + manufacturer: Qualcomm + chipset: Snapdragon® X Elite + timestamp: '2024-05-20T16:35:29.051916Z' diff --git a/qai_hub_models/models/ffnet_40s_quantized/README.md b/qai_hub_models/models/ffnet_40s_quantized/README.md index bafe50ad..7767cf30 100644 --- a/qai_hub_models/models/ffnet_40s_quantized/README.md +++ b/qai_hub_models/models/ffnet_40s_quantized/README.md @@ -14,6 +14,8 @@ accross various devices, can be found [here](https://aihub.qualcomm.com/models/f a hosted Qualcomm® device. + + ## Example & Usage Install the package via pip: diff --git a/qai_hub_models/models/ffnet_40s_quantized/export.py b/qai_hub_models/models/ffnet_40s_quantized/export.py index a40f6ed5..c61ae011 100644 --- a/qai_hub_models/models/ffnet_40s_quantized/export.py +++ b/qai_hub_models/models/ffnet_40s_quantized/export.py @@ -123,12 +123,16 @@ def export_model( else: quant_calibration_data = model.get_calibration_data(target_runtime, input_spec) + # Convert outputs from channel last to channel first (preferred I/O format for QNN and TensorFlow Lite) + channel_last_flags = ( + " --force_channel_last_input image" + " --force_channel_last_output output_0" + if target_runtime != TargetRuntime.ORT + else "" + ) + # 2. 
Compile the model to an on-device asset model_compile_options = model.get_hub_compile_options( - target_runtime, - compile_options - + " --force_channel_last_input image" - + " --force_channel_last_output output_0", + target_runtime, compile_options + channel_last_flags, hub_device ) print(f"Optimizing model {model_name} to run on-device") submitted_compile_job = hub.submit_compile_job( @@ -170,8 +174,10 @@ def export_model( if target_runtime == TargetRuntime.QNN: hub_inputs = get_qnn_inputs(compile_job, sample_inputs) # Convert inputs from channel first to channel last - hub_inputs = transpose_channel_first_to_last( - "image", sample_inputs, target_runtime + hub_inputs = ( + sample_inputs + if target_runtime == TargetRuntime.ORT + else transpose_channel_first_to_last("image", sample_inputs, target_runtime) ) submitted_inference_job = hub.submit_inference_job( model=compile_job.get_target_model(), @@ -199,8 +205,12 @@ def export_model( assert inference_job is not None and inference_job.wait().success inference_result: hub.client.DatasetEntries = inference_job.download_output_data() # type: ignore # Convert outputs from channel last to channel first - inference_result = transpose_channel_last_to_first( - "output_0", inference_result, target_runtime + inference_result = ( + inference_result + if target_runtime == TargetRuntime.ORT + else transpose_channel_last_to_first( + "output_0", inference_result, target_runtime + ) ) print_inference_metrics(inference_job, inference_result, torch_out) @@ -209,7 +219,7 @@ def export_model( def main(): warnings.filterwarnings("ignore") - parser = export_parser(model_cls=Model, supports_qnn=False, supports_ort=False) + parser = export_parser(model_cls=Model, supports_qnn=False) args = parser.parse_args() export_model(**vars(args)) diff --git a/qai_hub_models/models/ffnet_40s_quantized/perf.yaml b/qai_hub_models/models/ffnet_40s_quantized/perf.yaml index a4fab1f8..79fff9d7 100644 --- a/qai_hub_models/models/ffnet_40s_quantized/perf.yaml +++ b/qai_hub_models/models/ffnet_40s_quantized/perf.yaml @@ -26,6 +26,7 @@ aggregated: - Samsung Galaxy S24 Ultra - Samsung Galaxy S24+ - Samsung Galaxy Tab S8 + - Snapdragon X Elite CRD - Xiaomi 12 - Xiaomi 12 Pro supported_chipsets: @@ -36,15 +37,16 @@ aggregated: - Snapdragon® 8 Gen 2 - Snapdragon® 8 Gen 3 - Snapdragon® 888 + - Snapdragon® X Elite models: - name: FFNet-40S-Quantized performance_metrics: - torchscript_onnx_tflite: - inference_time: 6424.0 - throughput: 155.6662515566625 + inference_time: 6448.0 + throughput: 155.08684863523573 estimated_peak_memory_range: - min: 651264 - max: 25140680 + min: 823296 + max: 2440760 primary_compute_unit: NPU precision: int8 layer_info: @@ -52,22 +54,37 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 97 - job_id: jz5707qng + job_id: jmg94ny85 + job_status: Passed + torchscript_onnx_qnn: + inference_time: 4328.0 + throughput: 231.0536044362292 + estimated_peak_memory_range: + min: 6303744 + max: 20703816 + primary_compute_unit: NPU + precision: int8 + layer_info: + layers_on_npu: 89 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 89 + job_id: jz57drl95 job_status: Passed torchscript_onnx_ort: - inference_time: 50173.0 - throughput: 19.93103860642178 + inference_time: 11529.0 + throughput: 86.73779165582444 estimated_peak_memory_range: - min: 29384704 - max: 58656168 + min: 25239552 + max: 52880320 primary_compute_unit: NPU precision: int8 layer_info: - layers_on_npu: 1 + layers_on_npu: 94 layers_on_gpu: 0 layers_on_cpu: 0 - total_layers: 1 - job_id: j0pxndw85 + 
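The export.py hunks above all apply the same change: the `--force_channel_last_input` / `--force_channel_last_output` compile flags, and the matching input/output transposes, are now skipped when the target runtime is ONNX Runtime, since only TFLite and QNN prefer channel-last I/O. A small self-contained sketch of that branching, using a stand-in `TargetRuntime` enum (the real one lives in qai_hub_models; the flag strings are copied from the hunks above):

```python
# Stand-in sketch of the channel_last_flags branching added to each export.py.
from enum import Enum, auto

class TargetRuntime(Enum):  # stand-in for qai_hub_models' TargetRuntime
    TFLITE = auto()
    QNN = auto()
    ORT = auto()

def channel_last_flags(target_runtime: TargetRuntime) -> str:
    # TFLite and QNN prefer channel-last (NHWC) I/O, so the transposes are
    # forced at compile time; ONNX Runtime keeps the channel-first layout.
    return (
        " --force_channel_last_input image"
        " --force_channel_last_output output_0"
        if target_runtime != TargetRuntime.ORT
        else ""
    )

for rt in TargetRuntime:
    print(rt.name, repr(channel_last_flags(rt)))
```

The same `target_runtime == TargetRuntime.ORT` test is why `transpose_channel_first_to_last` and `transpose_channel_last_to_first` are bypassed for the ONNX path later in each file.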
total_layers: 94 + job_id: jegnevmqg job_status: Passed reference_device_info: name: Samsung Galaxy S23 @@ -76,13 +93,13 @@ models: os_name: Android manufacturer: Samsung chipset: Snapdragon® 8 Gen 2 - timestamp: '2024-04-23T18:42:32.012713Z' + timestamp: '2024-05-20T16:35:29.082174Z' - torchscript_onnx_tflite: inference_time: 4623.0 throughput: 216.3097555699762 estimated_peak_memory_range: - min: 20480 - max: 67550048 + min: 36864 + max: 67842848 primary_compute_unit: NPU precision: int8 layer_info: @@ -90,22 +107,37 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 97 - job_id: jqp4k9z2g + job_id: jnp18zw7g + job_status: Passed + torchscript_onnx_qnn: + inference_time: 3154.0 + throughput: 317.0577045022194 + estimated_peak_memory_range: + min: 6311936 + max: 56225968 + primary_compute_unit: NPU + precision: int8 + layer_info: + layers_on_npu: 89 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 89 + job_id: jqp4wrd1g job_status: Passed torchscript_onnx_ort: - inference_time: 31095.0 - throughput: 32.15951117543013 + inference_time: 8449.0 + throughput: 118.35720203574388 estimated_peak_memory_range: - min: 31465472 - max: 65073664 + min: 29212672 + max: 64461296 primary_compute_unit: NPU precision: int8 layer_info: - layers_on_npu: 1 + layers_on_npu: 94 layers_on_gpu: 0 layers_on_cpu: 0 - total_layers: 1 - job_id: jo5mqdj7p + total_layers: 94 + job_id: jopry327g job_status: Passed reference_device_info: name: Samsung Galaxy S24 @@ -114,88 +146,156 @@ models: os_name: Android manufacturer: Samsung chipset: Snapdragon® 8 Gen 3 - timestamp: '2024-04-23T18:42:32.012866Z' + timestamp: '2024-05-20T16:35:29.082201Z' - torchscript_onnx_tflite: - inference_time: 46106.0 - throughput: 21.68915108662647 + inference_time: 6431.0 + throughput: 155.49681231534754 estimated_peak_memory_range: - min: 12288 - max: 52922016 + min: 651264 + max: 2546568 primary_compute_unit: NPU precision: int8 layer_info: - layers_on_npu: 99 + layers_on_npu: 97 layers_on_gpu: 0 layers_on_cpu: 0 - total_layers: 99 - job_id: jegnlw0j5 + total_layers: 97 + job_id: jvgdv1qzg job_status: Passed - torchscript_onnx_ort: - inference_time: 362244.0 - throughput: 2.7605702233853426 + torchscript_onnx_qnn: + inference_time: 4293.0 + throughput: 232.93733985557884 estimated_peak_memory_range: - min: 159432704 - max: 207613904 - primary_compute_unit: CPU - precision: fp32 + min: 6332416 + max: 23257352 + primary_compute_unit: NPU + precision: int8 layer_info: - layers_on_npu: 0 + layers_on_npu: 89 layers_on_gpu: 0 - layers_on_cpu: 92 - total_layers: 92 - job_id: jegnl7jj5 + layers_on_cpu: 0 + total_layers: 89 + job_id: jo5mzx69p job_status: Passed reference_device_info: - name: RB3 Gen 2 (Proxy) + name: QCS8550 (Proxy) os: '12' form_factor: Iot os_name: Android manufacturer: Qualcomm - chipset: Qcs6490 - timestamp: '2024-04-23T18:42:32.013041Z' + chipset: Qcs8550 + timestamp: '2024-05-20T16:35:29.082219Z' - torchscript_onnx_tflite: - inference_time: 206934.0 - throughput: 4.832458658316178 + inference_time: 35053.0 + throughput: 28.528228682281117 + estimated_peak_memory_range: + min: 147456 + max: 42857344 + primary_compute_unit: NPU + precision: int8 + layer_info: + layers_on_npu: 97 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 97 + job_id: j1gl3818g + job_status: Passed + torchscript_onnx_qnn: + inference_time: 28024.0 + throughput: 35.68369968598344 estimated_peak_memory_range: - min: 2678784 - max: 4932640 + min: 6324224 + max: 55642400 primary_compute_unit: NPU precision: int8 layer_info: - layers_on_npu: 
99 + layers_on_npu: 89 layers_on_gpu: 0 layers_on_cpu: 0 - total_layers: 99 - job_id: jep29omxg + total_layers: 89 + job_id: jlpekxl1p job_status: Passed reference_device_info: - name: RB5 (Proxy) + name: RB3 Gen 2 (Proxy) os: '12' form_factor: Iot os_name: Android manufacturer: Qualcomm - chipset: Qcs8250 - timestamp: '2024-04-23T18:42:32.013136Z' + chipset: Qcs6490 + timestamp: '2024-05-20T16:35:29.082235Z' - torchscript_onnx_tflite: - inference_time: 8927.0 - throughput: 112.0197154699227 + inference_time: 186982.0 + throughput: 5.348108374068092 estimated_peak_memory_range: - min: 2711552 - max: 19152008 + min: 774144 + max: 11267904 primary_compute_unit: NPU precision: int8 layer_info: - layers_on_npu: 99 + layers_on_npu: 97 layers_on_gpu: 0 layers_on_cpu: 0 - total_layers: 99 - job_id: jopr876k5 + total_layers: 97 + job_id: jw56nmd0g job_status: Passed reference_device_info: - name: QCS8550 (Proxy) + name: RB5 (Proxy) os: '12' form_factor: Iot os_name: Android manufacturer: Qualcomm - chipset: Qcs8550 - timestamp: '2024-04-23T18:42:32.013229Z' + chipset: Qcs8250 + timestamp: '2024-05-20T16:35:29.082246Z' + - torchscript_onnx_qnn: + inference_time: 5258.0 + throughput: 190.1863826550019 + estimated_peak_memory_range: + min: 6303744 + max: 6303744 + primary_compute_unit: NPU + precision: int8 + layer_info: + layers_on_npu: 89 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 89 + job_id: j0px1o6lg + job_status: Passed + torchscript_onnx_ort: + inference_time: 10886.0 + throughput: 91.86110600771633 + estimated_peak_memory_range: + min: 25223168 + max: 25223168 + primary_compute_unit: NPU + precision: int8 + layer_info: + layers_on_npu: 94 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 94 + job_id: jep2my9q5 + job_status: Passed + torchscript_onnx_ort_dml_gpu: + inference_time: 551323.0 + throughput: 1.813818759601903 + estimated_peak_memory_range: + min: 204230656 + max: 204230656 + primary_compute_unit: GPU + precision: int8 + layer_info: + layers_on_npu: 0 + layers_on_gpu: 1 + layers_on_cpu: 0 + total_layers: 1 + job_id: jqpyd3jlp + job_status: Passed + reference_device_info: + name: Snapdragon X Elite CRD + os: '11' + form_factor: Compute + os_name: Windows + manufacturer: Qualcomm + chipset: Snapdragon® X Elite + timestamp: '2024-05-20T16:35:29.082269Z' diff --git a/qai_hub_models/models/ffnet_54s/README.md b/qai_hub_models/models/ffnet_54s/README.md index b1f072c6..6aea8fe7 100644 --- a/qai_hub_models/models/ffnet_54s/README.md +++ b/qai_hub_models/models/ffnet_54s/README.md @@ -14,6 +14,8 @@ accross various devices, can be found [here](https://aihub.qualcomm.com/models/f a hosted Qualcomm® device. + + ## Example & Usage Install the package via pip: diff --git a/qai_hub_models/models/ffnet_54s/export.py b/qai_hub_models/models/ffnet_54s/export.py index b8f1207c..4acec5b3 100644 --- a/qai_hub_models/models/ffnet_54s/export.py +++ b/qai_hub_models/models/ffnet_54s/export.py @@ -119,12 +119,16 @@ def export_model( model.eval() source_model = torch.jit.trace(model.to("cpu"), make_torch_inputs(input_spec)) + # Convert outputs from channel last to channel first (preferred I/O format for QNN and TensorFlow Lite) + channel_last_flags = ( + " --force_channel_last_input image" + " --force_channel_last_output output_0" + if target_runtime != TargetRuntime.ORT + else "" + ) + # 2. 
Compile the model to an on-device asset model_compile_options = model.get_hub_compile_options( - target_runtime, - compile_options - + " --force_channel_last_input image" - + " --force_channel_last_output output_0", + target_runtime, compile_options + channel_last_flags, hub_device ) print(f"Optimizing model {model_name} to run on-device") submitted_compile_job = hub.submit_compile_job( @@ -162,8 +166,10 @@ def export_model( ) sample_inputs = model.sample_inputs(input_spec) # Convert inputs from channel first to channel last - hub_inputs = transpose_channel_first_to_last( - "image", sample_inputs, target_runtime + hub_inputs = ( + sample_inputs + if target_runtime == TargetRuntime.ORT + else transpose_channel_first_to_last("image", sample_inputs, target_runtime) ) submitted_inference_job = hub.submit_inference_job( model=compile_job.get_target_model(), @@ -191,8 +197,12 @@ def export_model( assert inference_job is not None and inference_job.wait().success inference_result: hub.client.DatasetEntries = inference_job.download_output_data() # type: ignore # Convert outputs from channel last to channel first - inference_result = transpose_channel_last_to_first( - "output_0", inference_result, target_runtime + inference_result = ( + inference_result + if target_runtime == TargetRuntime.ORT + else transpose_channel_last_to_first( + "output_0", inference_result, target_runtime + ) ) print_inference_metrics(inference_job, inference_result, torch_out) @@ -201,7 +211,7 @@ def export_model( def main(): warnings.filterwarnings("ignore") - parser = export_parser(model_cls=Model, supports_ort=False) + parser = export_parser(model_cls=Model) args = parser.parse_args() export_model(**vars(args)) diff --git a/qai_hub_models/models/ffnet_54s/perf.yaml b/qai_hub_models/models/ffnet_54s/perf.yaml index aff4821c..8826e4ad 100644 --- a/qai_hub_models/models/ffnet_54s/perf.yaml +++ b/qai_hub_models/models/ffnet_54s/perf.yaml @@ -22,6 +22,7 @@ aggregated: - Samsung Galaxy S24 Ultra - Samsung Galaxy S24+ - Samsung Galaxy Tab S8 + - Snapdragon X Elite CRD - Xiaomi 12 - Xiaomi 12 Pro supported_chipsets: @@ -30,15 +31,16 @@ aggregated: - Snapdragon® 8 Gen 2 - Snapdragon® 8 Gen 3 - Snapdragon® 888 + - Snapdragon® X Elite models: - name: FFNet-54S performance_metrics: - torchscript_onnx_tflite: - inference_time: 25024.0 - throughput: 39.9616368286445 + inference_time: 25556.0 + throughput: 39.12975426514321 estimated_peak_memory_range: - min: 2580480 - max: 5287928 + min: 2527232 + max: 5075256 primary_compute_unit: NPU precision: fp16 layer_info: @@ -46,14 +48,14 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 113 - job_id: jopr8nzk5 + job_id: j2p0r02np job_status: Passed torchscript_onnx_qnn: - inference_time: 19758.0 - throughput: 50.61241016297196 + inference_time: 20540.0 + throughput: 48.685491723466406 estimated_peak_memory_range: - min: 25214976 - max: 48724312 + min: 25178112 + max: 46235888 primary_compute_unit: NPU precision: fp16 layer_info: @@ -61,22 +63,22 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 175 - job_id: jqpyr7905 + job_id: jn5q2qro5 job_status: Passed torchscript_onnx_ort: - inference_time: 30799.0 - throughput: 32.46858664242345 + inference_time: 30453.0 + throughput: 32.837487275473684 estimated_peak_memory_range: - min: 30203904 - max: 103625272 + min: 33370112 + max: 130933960 primary_compute_unit: NPU precision: fp16 layer_info: - layers_on_npu: 1 + layers_on_npu: 177 layers_on_gpu: 0 layers_on_cpu: 0 - total_layers: 1 - job_id: j1p804oqg + total_layers: 177 + 
job_id: jwgov63k5 job_status: Passed reference_device_info: name: Samsung Galaxy S23 @@ -85,13 +87,13 @@ models: os_name: Android manufacturer: Samsung chipset: Snapdragon® 8 Gen 2 - timestamp: '2024-04-23T18:42:32.053359Z' + timestamp: '2024-05-20T16:35:29.121526Z' - torchscript_onnx_tflite: - inference_time: 18446.0 - throughput: 54.21229534858506 + inference_time: 18475.0 + throughput: 54.12719891745602 estimated_peak_memory_range: - min: 1429504 - max: 120768592 + min: 2248704 + max: 109217248 primary_compute_unit: NPU precision: fp16 layer_info: @@ -99,14 +101,14 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 113 - job_id: jep20v26g + job_id: j1p87ymo5 job_status: Passed torchscript_onnx_qnn: - inference_time: 14552.0 - throughput: 68.71907641561297 + inference_time: 14482.0 + throughput: 69.05123601712471 estimated_peak_memory_range: - min: 180420608 - max: 252953088 + min: 24494080 + max: 90410912 primary_compute_unit: NPU precision: fp16 layer_info: @@ -114,22 +116,22 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 175 - job_id: j2p03vy0p + job_id: j1glkm3mp job_status: Passed torchscript_onnx_ort: - inference_time: 23498.0 - throughput: 42.556813345816664 + inference_time: 23113.0 + throughput: 43.265694630727296 estimated_peak_memory_range: - min: 30953472 - max: 85531952 + min: 29417472 + max: 74020448 primary_compute_unit: NPU precision: fp16 layer_info: - layers_on_npu: 1 + layers_on_npu: 177 layers_on_gpu: 0 layers_on_cpu: 0 - total_layers: 1 - job_id: jogk79zvp + total_layers: 177 + job_id: j1pvwkvrg job_status: Passed reference_device_info: name: Samsung Galaxy S24 @@ -138,13 +140,13 @@ models: os_name: Android manufacturer: Samsung chipset: Snapdragon® 8 Gen 3 - timestamp: '2024-04-23T18:42:32.053423Z' + timestamp: '2024-05-20T16:35:29.121553Z' - torchscript_onnx_tflite: - inference_time: 25045.0 - throughput: 39.92812936713915 + inference_time: 25895.0 + throughput: 38.61749372465727 estimated_peak_memory_range: - min: 2555904 - max: 5156288 + min: 2523136 + max: 5051104 primary_compute_unit: NPU precision: fp16 layer_info: @@ -152,14 +154,14 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 113 - job_id: jn5qedee5 + job_id: jogkyxqnp job_status: Passed torchscript_onnx_qnn: - inference_time: 19986.0 - throughput: 50.035024517162014 + inference_time: 20155.0 + throughput: 49.61548002976929 estimated_peak_memory_range: min: 25214976 - max: 55043864 + max: 43633496 primary_compute_unit: NPU precision: fp16 layer_info: @@ -167,7 +169,7 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 175 - job_id: jwgok9k1p + job_id: j1p3m0eng job_status: Passed reference_device_info: name: QCS8550 (Proxy) @@ -176,4 +178,57 @@ models: os_name: Android manufacturer: Qualcomm chipset: Qcs8550 - timestamp: '2024-04-23T18:42:32.053476Z' + timestamp: '2024-05-20T16:35:29.121570Z' + - torchscript_onnx_qnn: + inference_time: 25810.0 + throughput: 38.74467260751646 + estimated_peak_memory_range: + min: 25219072 + max: 25219072 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 175 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 175 + job_id: jw5614nyp + job_status: Passed + torchscript_onnx_ort: + inference_time: 29548.0 + throughput: 33.84323812102342 + estimated_peak_memory_range: + min: 25219072 + max: 25219072 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 177 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 177 + job_id: j7gjlneep + job_status: Passed + torchscript_onnx_ort_dml_gpu: + 
inference_time: 176490.0 + throughput: 5.666043401892458 + estimated_peak_memory_range: + min: 414695424 + max: 414695424 + primary_compute_unit: CPU + precision: fp32 + layer_info: + layers_on_npu: 0 + layers_on_gpu: 0 + layers_on_cpu: 81 + total_layers: 81 + job_id: jlpevmkv5 + job_status: Passed + reference_device_info: + name: Snapdragon X Elite CRD + os: '11' + form_factor: Compute + os_name: Windows + manufacturer: Qualcomm + chipset: Snapdragon® X Elite + timestamp: '2024-05-20T16:35:29.121592Z' diff --git a/qai_hub_models/models/ffnet_54s_quantized/README.md b/qai_hub_models/models/ffnet_54s_quantized/README.md index ce069620..9f4d0a9c 100644 --- a/qai_hub_models/models/ffnet_54s_quantized/README.md +++ b/qai_hub_models/models/ffnet_54s_quantized/README.md @@ -14,6 +14,8 @@ accross various devices, can be found [here](https://aihub.qualcomm.com/models/f a hosted Qualcomm® device. + + ## Example & Usage Install the package via pip: diff --git a/qai_hub_models/models/ffnet_54s_quantized/export.py b/qai_hub_models/models/ffnet_54s_quantized/export.py index 04980844..5deb7808 100644 --- a/qai_hub_models/models/ffnet_54s_quantized/export.py +++ b/qai_hub_models/models/ffnet_54s_quantized/export.py @@ -123,12 +123,16 @@ def export_model( else: quant_calibration_data = model.get_calibration_data(target_runtime, input_spec) + # Convert outputs from channel last to channel first (preferred I/O format for QNN and TensorFlow Lite) + channel_last_flags = ( + " --force_channel_last_input image" + " --force_channel_last_output output_0" + if target_runtime != TargetRuntime.ORT + else "" + ) + # 2. Compile the model to an on-device asset model_compile_options = model.get_hub_compile_options( - target_runtime, - compile_options - + " --force_channel_last_input image" - + " --force_channel_last_output output_0", + target_runtime, compile_options + channel_last_flags, hub_device ) print(f"Optimizing model {model_name} to run on-device") submitted_compile_job = hub.submit_compile_job( @@ -170,8 +174,10 @@ def export_model( if target_runtime == TargetRuntime.QNN: hub_inputs = get_qnn_inputs(compile_job, sample_inputs) # Convert inputs from channel first to channel last - hub_inputs = transpose_channel_first_to_last( - "image", sample_inputs, target_runtime + hub_inputs = ( + sample_inputs + if target_runtime == TargetRuntime.ORT + else transpose_channel_first_to_last("image", sample_inputs, target_runtime) ) submitted_inference_job = hub.submit_inference_job( model=compile_job.get_target_model(), @@ -199,8 +205,12 @@ def export_model( assert inference_job is not None and inference_job.wait().success inference_result: hub.client.DatasetEntries = inference_job.download_output_data() # type: ignore # Convert outputs from channel last to channel first - inference_result = transpose_channel_last_to_first( - "output_0", inference_result, target_runtime + inference_result = ( + inference_result + if target_runtime == TargetRuntime.ORT + else transpose_channel_last_to_first( + "output_0", inference_result, target_runtime + ) ) print_inference_metrics(inference_job, inference_result, torch_out) @@ -209,7 +219,7 @@ def export_model( def main(): warnings.filterwarnings("ignore") - parser = export_parser(model_cls=Model, supports_qnn=False, supports_ort=False) + parser = export_parser(model_cls=Model, supports_qnn=False) args = parser.parse_args() export_model(**vars(args)) diff --git a/qai_hub_models/models/ffnet_54s_quantized/perf.yaml b/qai_hub_models/models/ffnet_54s_quantized/perf.yaml index 
e1c908a5..1a7b2f06 100644 --- a/qai_hub_models/models/ffnet_54s_quantized/perf.yaml +++ b/qai_hub_models/models/ffnet_54s_quantized/perf.yaml @@ -26,6 +26,7 @@ aggregated: - Samsung Galaxy S24 Ultra - Samsung Galaxy S24+ - Samsung Galaxy Tab S8 + - Snapdragon X Elite CRD - Xiaomi 12 - Xiaomi 12 Pro supported_chipsets: @@ -36,15 +37,16 @@ aggregated: - Snapdragon® 8 Gen 2 - Snapdragon® 8 Gen 3 - Snapdragon® 888 + - Snapdragon® X Elite models: - name: FFNet-54S-Quantized performance_metrics: - torchscript_onnx_tflite: - inference_time: 7125.0 - throughput: 140.35087719298247 + inference_time: 7101.0 + throughput: 140.8252358822701 estimated_peak_memory_range: - min: 647168 - max: 2562192 + min: 692224 + max: 2279272 primary_compute_unit: NPU precision: int8 layer_info: @@ -52,22 +54,37 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 118 - job_id: j1gl61n2g + job_id: jygz7drxp + job_status: Passed + torchscript_onnx_qnn: + inference_time: 4974.0 + throughput: 201.04543626859672 + estimated_peak_memory_range: + min: 6311936 + max: 20048864 + primary_compute_unit: NPU + precision: int8 + layer_info: + layers_on_npu: 110 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 110 + job_id: jnp18ze7g job_status: Passed torchscript_onnx_ort: - inference_time: 51385.0 - throughput: 19.46093217865136 + inference_time: 11814.0 + throughput: 84.64533604198408 estimated_peak_memory_range: - min: 29982720 - max: 70964288 + min: 30167040 + max: 62607768 primary_compute_unit: NPU precision: int8 layer_info: - layers_on_npu: 1 + layers_on_npu: 115 layers_on_gpu: 0 layers_on_cpu: 0 - total_layers: 1 - job_id: j1p3vwkmg + total_layers: 115 + job_id: j0px1oylg job_status: Passed reference_device_info: name: Samsung Galaxy S23 @@ -76,13 +93,13 @@ models: os_name: Android manufacturer: Samsung chipset: Snapdragon® 8 Gen 2 - timestamp: '2024-04-23T18:42:32.077877Z' + timestamp: '2024-05-20T16:35:29.151840Z' - torchscript_onnx_tflite: - inference_time: 5099.0 - throughput: 196.11688566385567 + inference_time: 5164.0 + throughput: 193.64833462432222 estimated_peak_memory_range: min: 16384 - max: 75082320 + max: 74278720 primary_compute_unit: NPU precision: int8 layer_info: @@ -90,22 +107,37 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 118 - job_id: jw56ed6ng + job_id: jz5w96qmp + job_status: Passed + torchscript_onnx_qnn: + inference_time: 3622.0 + throughput: 276.09055770292656 + estimated_peak_memory_range: + min: 6307840 + max: 63588464 + primary_compute_unit: NPU + precision: int8 + layer_info: + layers_on_npu: 110 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 110 + job_id: jvgdv1ozg job_status: Passed torchscript_onnx_ort: - inference_time: 31008.0 - throughput: 32.24974200206398 + inference_time: 9025.0 + throughput: 110.80332409972299 estimated_peak_memory_range: - min: 15433728 - max: 55696624 + min: 675840 + max: 35809952 primary_compute_unit: NPU precision: int8 layer_info: - layers_on_npu: 1 + layers_on_npu: 115 layers_on_gpu: 0 layers_on_cpu: 0 - total_layers: 1 - job_id: jwgok4y1p + total_layers: 115 + job_id: jo5mzx39p job_status: Passed reference_device_info: name: Samsung Galaxy S24 @@ -114,88 +146,156 @@ models: os_name: Android manufacturer: Samsung chipset: Snapdragon® 8 Gen 3 - timestamp: '2024-04-23T18:42:32.077918Z' + timestamp: '2024-05-20T16:35:29.151867Z' - torchscript_onnx_tflite: - inference_time: 49684.0 - throughput: 20.127203928830205 + inference_time: 7134.0 + throughput: 140.17381553125875 estimated_peak_memory_range: - min: 126976 - max: 56138256 + 
min: 643072 + max: 3436240 primary_compute_unit: NPU precision: int8 layer_info: - layers_on_npu: 120 + layers_on_npu: 118 layers_on_gpu: 0 layers_on_cpu: 0 - total_layers: 120 - job_id: jygzo0o45 + total_layers: 118 + job_id: jmg94nw85 job_status: Passed - torchscript_onnx_ort: - inference_time: 420355.0 - throughput: 2.3789416088782103 + torchscript_onnx_qnn: + inference_time: 4965.0 + throughput: 201.4098690835851 estimated_peak_memory_range: - min: 187011072 - max: 248380464 - primary_compute_unit: CPU - precision: fp32 + min: 6307840 + max: 20582560 + primary_compute_unit: NPU + precision: int8 layer_info: - layers_on_npu: 0 + layers_on_npu: 110 layers_on_gpu: 0 - layers_on_cpu: 113 - total_layers: 113 - job_id: j1pv093z5 + layers_on_cpu: 0 + total_layers: 110 + job_id: jqp4wrv1g job_status: Passed reference_device_info: - name: RB3 Gen 2 (Proxy) + name: QCS8550 (Proxy) os: '12' form_factor: Iot os_name: Android manufacturer: Qualcomm - chipset: Qcs6490 - timestamp: '2024-04-23T18:42:32.077964Z' + chipset: Qcs8550 + timestamp: '2024-05-20T16:35:29.151884Z' - torchscript_onnx_tflite: - inference_time: 216291.0 - throughput: 4.623400881220208 + inference_time: 39060.0 + throughput: 25.60163850486431 + estimated_peak_memory_range: + min: 40960 + max: 43989968 + primary_compute_unit: NPU + precision: int8 + layer_info: + layers_on_npu: 118 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 118 + job_id: jn5q31v4p + job_status: Passed + torchscript_onnx_qnn: + inference_time: 31116.0 + throughput: 32.13780691605605 estimated_peak_memory_range: - min: 2650112 - max: 4899184 + min: 6332416 + max: 62882080 primary_compute_unit: NPU precision: int8 layer_info: - layers_on_npu: 120 + layers_on_npu: 110 layers_on_gpu: 0 layers_on_cpu: 0 - total_layers: 120 - job_id: jqpyj8drp + total_layers: 110 + job_id: j7gjeyqx5 job_status: Passed reference_device_info: - name: RB5 (Proxy) + name: RB3 Gen 2 (Proxy) os: '12' form_factor: Iot os_name: Android manufacturer: Qualcomm - chipset: Qcs8250 - timestamp: '2024-04-23T18:42:32.077990Z' + chipset: Qcs6490 + timestamp: '2024-05-20T16:35:29.151900Z' - torchscript_onnx_tflite: - inference_time: 10210.0 - throughput: 97.94319294809011 + inference_time: 200139.0 + throughput: 4.996527413447654 estimated_peak_memory_range: - min: 2527232 - max: 4340680 + min: 765952 + max: 3306112 primary_compute_unit: NPU precision: int8 layer_info: - layers_on_npu: 120 + layers_on_npu: 118 layers_on_gpu: 0 layers_on_cpu: 0 - total_layers: 120 - job_id: jvgdeme65 + total_layers: 118 + job_id: j1gl38l8g job_status: Passed reference_device_info: - name: QCS8550 (Proxy) + name: RB5 (Proxy) os: '12' form_factor: Iot os_name: Android manufacturer: Qualcomm - chipset: Qcs8550 - timestamp: '2024-04-23T18:42:32.078022Z' + chipset: Qcs8250 + timestamp: '2024-05-20T16:35:29.151911Z' + - torchscript_onnx_qnn: + inference_time: 6006.0 + throughput: 166.5001665001665 + estimated_peak_memory_range: + min: 6303744 + max: 6303744 + primary_compute_unit: NPU + precision: int8 + layer_info: + layers_on_npu: 110 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 110 + job_id: jz57drx95 + job_status: Passed + torchscript_onnx_ort: + inference_time: 11424.0 + throughput: 87.53501400560224 + estimated_peak_memory_range: + min: 25227264 + max: 25227264 + primary_compute_unit: NPU + precision: int8 + layer_info: + layers_on_npu: 115 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 115 + job_id: jegnev3qg + job_status: Passed + torchscript_onnx_ort_dml_gpu: + inference_time: 870655.0 + 
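For the new Snapdragon X Elite CRD rows, `torchscript_onnx_ort` presumably exercises the exported ONNX asset through ONNX Runtime's QNN (NPU) execution provider, while `torchscript_onnx_ort_dml_gpu` routes it through DirectML. A hypothetical sketch of running such an asset on a Windows-on-Snapdragon machine; the model path, input resolution, and installed ORT builds (onnxruntime-qnn / onnxruntime-directml) are assumptions, and only the tensor names `image` and `output_0` come from this diff:

```python
# Hypothetical: run an exported FFNet ONNX asset via ONNX Runtime on Windows on Snapdragon.
import numpy as np
import onnxruntime as ort

model_path = "ffnet_54s_quantized.onnx"  # placeholder path for the exported asset

# NPU path (QNN execution provider) and GPU path (DirectML), each with CPU fallback.
npu_session = ort.InferenceSession(
    model_path, providers=["QNNExecutionProvider", "CPUExecutionProvider"]
)
gpu_session = ort.InferenceSession(
    model_path, providers=["DmlExecutionProvider", "CPUExecutionProvider"]
)

image = np.zeros((1, 3, 1024, 2048), dtype=np.float32)  # assumed NCHW input shape
for session in (npu_session, gpu_session):
    (output_0,) = session.run(None, {"image": image})  # single output, per the diff
    print(output_0.shape)
```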
throughput: 1.1485605664700713 + estimated_peak_memory_range: + min: 241315840 + max: 241315840 + primary_compute_unit: CPU + precision: fp32 + layer_info: + layers_on_npu: 0 + layers_on_gpu: 0 + layers_on_cpu: 113 + total_layers: 113 + job_id: jopry3e7g + job_status: Passed + reference_device_info: + name: Snapdragon X Elite CRD + os: '11' + form_factor: Compute + os_name: Windows + manufacturer: Qualcomm + chipset: Snapdragon® X Elite + timestamp: '2024-05-20T16:35:29.151933Z' diff --git a/qai_hub_models/models/ffnet_78s/README.md b/qai_hub_models/models/ffnet_78s/README.md index 9f577cda..c3f6b6dc 100644 --- a/qai_hub_models/models/ffnet_78s/README.md +++ b/qai_hub_models/models/ffnet_78s/README.md @@ -14,6 +14,8 @@ accross various devices, can be found [here](https://aihub.qualcomm.com/models/f a hosted Qualcomm® device. + + ## Example & Usage Install the package via pip: diff --git a/qai_hub_models/models/ffnet_78s/export.py b/qai_hub_models/models/ffnet_78s/export.py index 4084ce61..70bebcd0 100644 --- a/qai_hub_models/models/ffnet_78s/export.py +++ b/qai_hub_models/models/ffnet_78s/export.py @@ -119,12 +119,16 @@ def export_model( model.eval() source_model = torch.jit.trace(model.to("cpu"), make_torch_inputs(input_spec)) + # Convert outputs from channel last to channel first (preferred I/O format for QNN and TensorFlow Lite) + channel_last_flags = ( + " --force_channel_last_input image" + " --force_channel_last_output output_0" + if target_runtime != TargetRuntime.ORT + else "" + ) + # 2. Compile the model to an on-device asset model_compile_options = model.get_hub_compile_options( - target_runtime, - compile_options - + " --force_channel_last_input image" - + " --force_channel_last_output output_0", + target_runtime, compile_options + channel_last_flags, hub_device ) print(f"Optimizing model {model_name} to run on-device") submitted_compile_job = hub.submit_compile_job( @@ -162,8 +166,10 @@ def export_model( ) sample_inputs = model.sample_inputs(input_spec) # Convert inputs from channel first to channel last - hub_inputs = transpose_channel_first_to_last( - "image", sample_inputs, target_runtime + hub_inputs = ( + sample_inputs + if target_runtime == TargetRuntime.ORT + else transpose_channel_first_to_last("image", sample_inputs, target_runtime) ) submitted_inference_job = hub.submit_inference_job( model=compile_job.get_target_model(), @@ -191,8 +197,12 @@ def export_model( assert inference_job is not None and inference_job.wait().success inference_result: hub.client.DatasetEntries = inference_job.download_output_data() # type: ignore # Convert outputs from channel last to channel first - inference_result = transpose_channel_last_to_first( - "output_0", inference_result, target_runtime + inference_result = ( + inference_result + if target_runtime == TargetRuntime.ORT + else transpose_channel_last_to_first( + "output_0", inference_result, target_runtime + ) ) print_inference_metrics(inference_job, inference_result, torch_out) @@ -201,7 +211,7 @@ def export_model( def main(): warnings.filterwarnings("ignore") - parser = export_parser(model_cls=Model, supports_ort=False) + parser = export_parser(model_cls=Model) args = parser.parse_args() export_model(**vars(args)) diff --git a/qai_hub_models/models/ffnet_78s/perf.yaml b/qai_hub_models/models/ffnet_78s/perf.yaml index 44f26730..6808013a 100644 --- a/qai_hub_models/models/ffnet_78s/perf.yaml +++ b/qai_hub_models/models/ffnet_78s/perf.yaml @@ -22,6 +22,7 @@ aggregated: - Samsung Galaxy S24 Ultra - Samsung Galaxy S24+ - Samsung Galaxy 
Tab S8 + - Snapdragon X Elite CRD - Xiaomi 12 - Xiaomi 12 Pro supported_chipsets: @@ -30,15 +31,16 @@ aggregated: - Snapdragon® 8 Gen 2 - Snapdragon® 8 Gen 3 - Snapdragon® 888 + - Snapdragon® X Elite models: - name: FFNet-78S performance_metrics: - torchscript_onnx_tflite: - inference_time: 29177.0 - throughput: 34.27357164890153 + inference_time: 29391.0 + throughput: 34.02402095879691 estimated_peak_memory_range: - min: 2576384 - max: 5205816 + min: 2580480 + max: 4887232 primary_compute_unit: NPU precision: fp16 layer_info: @@ -46,14 +48,14 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 149 - job_id: j7gjzwx15 + job_id: jep2mylq5 job_status: Passed torchscript_onnx_qnn: - inference_time: 23420.0 - throughput: 42.69854824935952 + inference_time: 23544.0 + throughput: 42.473666326877336 estimated_peak_memory_range: - min: 24846336 - max: 48603008 + min: 25210880 + max: 46779104 primary_compute_unit: NPU precision: fp16 layer_info: @@ -61,22 +63,22 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 235 - job_id: jygzo4e45 + job_id: j1p87yzo5 job_status: Passed torchscript_onnx_ort: - inference_time: 35439.0 - throughput: 28.21750049380626 + inference_time: 34349.0 + throughput: 29.1129290517919 estimated_peak_memory_range: - min: 30183424 - max: 150703648 + min: 30216192 + max: 174827344 primary_compute_unit: NPU precision: fp16 layer_info: - layers_on_npu: 1 + layers_on_npu: 237 layers_on_gpu: 0 layers_on_cpu: 0 - total_layers: 1 - job_id: jmg9jxvm5 + total_layers: 237 + job_id: jw56141yp job_status: Passed reference_device_info: name: Samsung Galaxy S23 @@ -85,13 +87,13 @@ models: os_name: Android manufacturer: Samsung chipset: Snapdragon® 8 Gen 2 - timestamp: '2024-04-23T18:42:32.105190Z' + timestamp: '2024-05-20T16:35:29.191393Z' - torchscript_onnx_tflite: - inference_time: 21728.0 - throughput: 46.02356406480118 + inference_time: 21206.0 + throughput: 47.15646515137225 estimated_peak_memory_range: - min: 0 - max: 133794256 + min: 794624 + max: 119306480 primary_compute_unit: NPU precision: fp16 layer_info: @@ -99,14 +101,14 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 149 - job_id: jlpeel98p + job_id: jqpyd36lp job_status: Passed torchscript_onnx_qnn: - inference_time: 17745.0 - throughput: 56.353902507748664 + inference_time: 17482.0 + throughput: 57.201693170117835 estimated_peak_memory_range: - min: 25317376 - max: 101665296 + min: 20983808 + max: 100170336 primary_compute_unit: NPU precision: fp16 layer_info: @@ -114,22 +116,22 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 235 - job_id: jz5w21o45 + job_id: jogkyx3np job_status: Passed torchscript_onnx_ort: - inference_time: 26731.0 - throughput: 37.40974898058434 + inference_time: 26382.0 + throughput: 37.904631946023805 estimated_peak_memory_range: min: 29417472 - max: 90195264 + max: 78554720 primary_compute_unit: NPU precision: fp16 layer_info: - layers_on_npu: 1 + layers_on_npu: 237 layers_on_gpu: 0 layers_on_cpu: 0 - total_layers: 1 - job_id: jnp1yv0np + total_layers: 237 + job_id: j1p3m0mng job_status: Passed reference_device_info: name: Samsung Galaxy S24 @@ -138,13 +140,13 @@ models: os_name: Android manufacturer: Samsung chipset: Snapdragon® 8 Gen 3 - timestamp: '2024-04-23T18:42:32.105263Z' + timestamp: '2024-05-20T16:35:29.191421Z' - torchscript_onnx_tflite: - inference_time: 29631.0 - throughput: 33.748439134690024 + inference_time: 29621.0 + throughput: 33.759832551230545 estimated_peak_memory_range: - min: 499712 - max: 1916448 + min: 2560000 + max: 5156816 
primary_compute_unit: NPU precision: fp16 layer_info: @@ -152,14 +154,14 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 149 - job_id: jegnlwlj5 + job_id: j2p0r0lnp job_status: Passed torchscript_onnx_qnn: - inference_time: 23601.0 - throughput: 42.371085970933436 + inference_time: 23548.0 + throughput: 42.466451503312385 estimated_peak_memory_range: - min: 25165824 - max: 55560608 + min: 25202688 + max: 46491072 primary_compute_unit: NPU precision: fp16 layer_info: @@ -167,7 +169,7 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 235 - job_id: j2p03x20p + job_id: j1glkmkmp job_status: Passed reference_device_info: name: QCS8550 (Proxy) @@ -176,4 +178,57 @@ models: os_name: Android manufacturer: Qualcomm chipset: Qcs8550 - timestamp: '2024-04-23T18:42:32.105324Z' + timestamp: '2024-05-20T16:35:29.191438Z' + - torchscript_onnx_qnn: + inference_time: 32624.0 + throughput: 30.65228052967141 + estimated_peak_memory_range: + min: 25214976 + max: 25214976 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 235 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 235 + job_id: jn5q2q3o5 + job_status: Passed + torchscript_onnx_ort: + inference_time: 33277.0 + throughput: 30.050785828049403 + estimated_peak_memory_range: + min: 26583040 + max: 26583040 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 237 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 237 + job_id: jwgov6vk5 + job_status: Passed + torchscript_onnx_ort_dml_gpu: + inference_time: 207214.0 + throughput: 4.825928749987935 + estimated_peak_memory_range: + min: 139489280 + max: 139489280 + primary_compute_unit: CPU + precision: fp32 + layer_info: + layers_on_npu: 0 + layers_on_gpu: 0 + layers_on_cpu: 105 + total_layers: 105 + job_id: j1pvwkwrg + job_status: Passed + reference_device_info: + name: Snapdragon X Elite CRD + os: '11' + form_factor: Compute + os_name: Windows + manufacturer: Qualcomm + chipset: Snapdragon® X Elite + timestamp: '2024-05-20T16:35:29.191460Z' diff --git a/qai_hub_models/models/ffnet_78s_lowres/README.md b/qai_hub_models/models/ffnet_78s_lowres/README.md index 0139c054..ac546964 100644 --- a/qai_hub_models/models/ffnet_78s_lowres/README.md +++ b/qai_hub_models/models/ffnet_78s_lowres/README.md @@ -14,6 +14,8 @@ accross various devices, can be found [here](https://aihub.qualcomm.com/models/f a hosted Qualcomm® device. + + ## Example & Usage Install the package via pip: diff --git a/qai_hub_models/models/ffnet_78s_lowres/export.py b/qai_hub_models/models/ffnet_78s_lowres/export.py index badf0c59..74ef3914 100644 --- a/qai_hub_models/models/ffnet_78s_lowres/export.py +++ b/qai_hub_models/models/ffnet_78s_lowres/export.py @@ -119,12 +119,16 @@ def export_model( model.eval() source_model = torch.jit.trace(model.to("cpu"), make_torch_inputs(input_spec)) + # Convert outputs from channel last to channel first (preferred I/O format for QNN and TensorFlow Lite) + channel_last_flags = ( + " --force_channel_last_input image" + " --force_channel_last_output output_0" + if target_runtime != TargetRuntime.ORT + else "" + ) + # 2. 
Compile the model to an on-device asset model_compile_options = model.get_hub_compile_options( - target_runtime, - compile_options - + " --force_channel_last_input image" - + " --force_channel_last_output output_0", + target_runtime, compile_options + channel_last_flags, hub_device ) print(f"Optimizing model {model_name} to run on-device") submitted_compile_job = hub.submit_compile_job( @@ -162,8 +166,10 @@ def export_model( ) sample_inputs = model.sample_inputs(input_spec) # Convert inputs from channel first to channel last - hub_inputs = transpose_channel_first_to_last( - "image", sample_inputs, target_runtime + hub_inputs = ( + sample_inputs + if target_runtime == TargetRuntime.ORT + else transpose_channel_first_to_last("image", sample_inputs, target_runtime) ) submitted_inference_job = hub.submit_inference_job( model=compile_job.get_target_model(), @@ -191,8 +197,12 @@ def export_model( assert inference_job is not None and inference_job.wait().success inference_result: hub.client.DatasetEntries = inference_job.download_output_data() # type: ignore # Convert outputs from channel last to channel first - inference_result = transpose_channel_last_to_first( - "output_0", inference_result, target_runtime + inference_result = ( + inference_result + if target_runtime == TargetRuntime.ORT + else transpose_channel_last_to_first( + "output_0", inference_result, target_runtime + ) ) print_inference_metrics(inference_job, inference_result, torch_out) @@ -201,7 +211,7 @@ def export_model( def main(): warnings.filterwarnings("ignore") - parser = export_parser(model_cls=Model, supports_ort=False) + parser = export_parser(model_cls=Model) args = parser.parse_args() export_model(**vars(args)) diff --git a/qai_hub_models/models/ffnet_78s_lowres/perf.yaml b/qai_hub_models/models/ffnet_78s_lowres/perf.yaml index 42001680..5fbc7a1b 100644 --- a/qai_hub_models/models/ffnet_78s_lowres/perf.yaml +++ b/qai_hub_models/models/ffnet_78s_lowres/perf.yaml @@ -22,6 +22,7 @@ aggregated: - Samsung Galaxy S24 Ultra - Samsung Galaxy S24+ - Samsung Galaxy Tab S8 + - Snapdragon X Elite CRD - Xiaomi 12 - Xiaomi 12 Pro supported_chipsets: @@ -30,15 +31,16 @@ aggregated: - Snapdragon® 8 Gen 2 - Snapdragon® 8 Gen 3 - Snapdragon® 888 + - Snapdragon® X Elite models: - name: FFNet-78S-LowRes performance_metrics: - torchscript_onnx_tflite: - inference_time: 10805.0 - throughput: 92.5497454881999 + inference_time: 10832.0 + throughput: 92.31905465288035 estimated_peak_memory_range: min: 667648 - max: 2943392 + max: 2444712 primary_compute_unit: NPU precision: fp16 layer_info: @@ -46,14 +48,14 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 149 - job_id: jz5707zng + job_id: j7gjlnlep job_status: Passed torchscript_onnx_qnn: - inference_time: 11389.0 - throughput: 87.80402142418123 + inference_time: 11360.0 + throughput: 88.02816901408451 estimated_peak_memory_range: - min: 32768 - max: 63143120 + min: 135168 + max: 63213296 primary_compute_unit: NPU precision: fp16 layer_info: @@ -61,22 +63,22 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 236 - job_id: j0pxndv85 + job_id: jz5w969mp job_status: Passed torchscript_onnx_ort: - inference_time: 7820.0 - throughput: 127.8772378516624 + inference_time: 8961.0 + throughput: 111.59468809284678 estimated_peak_memory_range: - min: 2232320 - max: 124968440 + min: 2129920 + max: 131892976 primary_compute_unit: NPU precision: fp16 layer_info: - layers_on_npu: 1 + layers_on_npu: 238 layers_on_gpu: 0 layers_on_cpu: 0 - total_layers: 1 - job_id: jegnl72j5 + total_layers: 238 
+ job_id: jz5w9694p job_status: Passed reference_device_info: name: Samsung Galaxy S23 @@ -85,13 +87,13 @@ models: os_name: Android manufacturer: Samsung chipset: Snapdragon® 8 Gen 2 - timestamp: '2024-04-23T18:42:32.129479Z' + timestamp: '2024-05-20T16:35:29.221675Z' - torchscript_onnx_tflite: - inference_time: 7620.0 - throughput: 131.23359580052494 + inference_time: 7598.0 + throughput: 131.61358252171624 estimated_peak_memory_range: - min: 299008 - max: 53659920 + min: 32768 + max: 51441440 primary_compute_unit: NPU precision: fp16 layer_info: @@ -99,14 +101,14 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 149 - job_id: jqp4k9q2g + job_id: jlpevmvv5 job_status: Passed torchscript_onnx_qnn: - inference_time: 7996.0 - throughput: 125.06253126563281 + inference_time: 7919.0 + throughput: 126.27857052658165 estimated_peak_memory_range: - min: 6324224 - max: 70041552 + min: 6307840 + max: 73605024 primary_compute_unit: NPU precision: fp16 layer_info: @@ -114,22 +116,22 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 236 - job_id: jo5mqdr7p + job_id: jmg94n485 job_status: Passed torchscript_onnx_ort: - inference_time: 5925.0 - throughput: 168.77637130801688 + inference_time: 6622.0 + throughput: 151.01177891875565 estimated_peak_memory_range: - min: 6332416 - max: 48029072 + min: 6012928 + max: 45766784 primary_compute_unit: NPU precision: fp16 layer_info: - layers_on_npu: 1 + layers_on_npu: 238 layers_on_gpu: 0 layers_on_cpu: 0 - total_layers: 1 - job_id: jopr8nkk5 + total_layers: 238 + job_id: jmg94n4m5 job_status: Passed reference_device_info: name: Samsung Galaxy S24 @@ -138,13 +140,13 @@ models: os_name: Android manufacturer: Samsung chipset: Snapdragon® 8 Gen 3 - timestamp: '2024-04-23T18:42:32.129559Z' + timestamp: '2024-05-20T16:35:29.221702Z' - torchscript_onnx_tflite: - inference_time: 10747.0 - throughput: 93.04922303898762 + inference_time: 10817.0 + throughput: 92.44707405010631 estimated_peak_memory_range: - min: 655360 - max: 2972672 + min: 692224 + max: 2481904 primary_compute_unit: NPU precision: fp16 layer_info: @@ -152,14 +154,14 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 149 - job_id: jw56e0zng + job_id: jygz7d7xp job_status: Passed torchscript_onnx_qnn: - inference_time: 11414.0 - throughput: 87.61170492377782 + inference_time: 11402.0 + throughput: 87.70391159445711 estimated_peak_memory_range: - min: 6336512 - max: 38367920 + min: 1359872 + max: 53966200 primary_compute_unit: NPU precision: fp16 layer_info: @@ -167,7 +169,7 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 236 - job_id: j7gjz8215 + job_id: jvgdv1vzg job_status: Passed reference_device_info: name: QCS8550 (Proxy) @@ -176,4 +178,57 @@ models: os_name: Android manufacturer: Qualcomm chipset: Qcs8550 - timestamp: '2024-04-23T18:42:32.129627Z' + timestamp: '2024-05-20T16:35:29.221720Z' + - torchscript_onnx_qnn: + inference_time: 20470.0 + throughput: 48.85197850512946 + estimated_peak_memory_range: + min: 6303744 + max: 6303744 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 236 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 236 + job_id: jnp18z87g + job_status: Passed + torchscript_onnx_ort: + inference_time: 8747.0 + throughput: 114.32491139819366 + estimated_peak_memory_range: + min: 42668032 + max: 42668032 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 238 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 238 + job_id: jnp18z8ng + job_status: Passed + torchscript_onnx_ort_dml_gpu: + 
inference_time: 64289.0 + throughput: 15.554760534461572 + estimated_peak_memory_range: + min: 42369024 + max: 42369024 + primary_compute_unit: CPU + precision: fp32 + layer_info: + layers_on_npu: 0 + layers_on_gpu: 0 + layers_on_cpu: 106 + total_layers: 106 + job_id: jvgdv1v6g + job_status: Passed + reference_device_info: + name: Snapdragon X Elite CRD + os: '11' + form_factor: Compute + os_name: Windows + manufacturer: Qualcomm + chipset: Snapdragon® X Elite + timestamp: '2024-05-20T16:35:29.221743Z' diff --git a/qai_hub_models/models/ffnet_78s_quantized/README.md b/qai_hub_models/models/ffnet_78s_quantized/README.md index c9cd33a7..43dcb2af 100644 --- a/qai_hub_models/models/ffnet_78s_quantized/README.md +++ b/qai_hub_models/models/ffnet_78s_quantized/README.md @@ -14,6 +14,8 @@ accross various devices, can be found [here](https://aihub.qualcomm.com/models/f a hosted Qualcomm® device. + + ## Example & Usage Install the package via pip: diff --git a/qai_hub_models/models/ffnet_78s_quantized/export.py b/qai_hub_models/models/ffnet_78s_quantized/export.py index e53a076e..c2f4f09e 100644 --- a/qai_hub_models/models/ffnet_78s_quantized/export.py +++ b/qai_hub_models/models/ffnet_78s_quantized/export.py @@ -123,12 +123,16 @@ def export_model( else: quant_calibration_data = model.get_calibration_data(target_runtime, input_spec) + # Convert outputs from channel last to channel first (preferred I/O format for QNN and TensorFlow Lite) + channel_last_flags = ( + " --force_channel_last_input image" + " --force_channel_last_output output_0" + if target_runtime != TargetRuntime.ORT + else "" + ) + # 2. Compile the model to an on-device asset model_compile_options = model.get_hub_compile_options( - target_runtime, - compile_options - + " --force_channel_last_input image" - + " --force_channel_last_output output_0", + target_runtime, compile_options + channel_last_flags, hub_device ) print(f"Optimizing model {model_name} to run on-device") submitted_compile_job = hub.submit_compile_job( @@ -170,8 +174,10 @@ def export_model( if target_runtime == TargetRuntime.QNN: hub_inputs = get_qnn_inputs(compile_job, sample_inputs) # Convert inputs from channel first to channel last - hub_inputs = transpose_channel_first_to_last( - "image", sample_inputs, target_runtime + hub_inputs = ( + sample_inputs + if target_runtime == TargetRuntime.ORT + else transpose_channel_first_to_last("image", sample_inputs, target_runtime) ) submitted_inference_job = hub.submit_inference_job( model=compile_job.get_target_model(), @@ -199,8 +205,12 @@ def export_model( assert inference_job is not None and inference_job.wait().success inference_result: hub.client.DatasetEntries = inference_job.download_output_data() # type: ignore # Convert outputs from channel last to channel first - inference_result = transpose_channel_last_to_first( - "output_0", inference_result, target_runtime + inference_result = ( + inference_result + if target_runtime == TargetRuntime.ORT + else transpose_channel_last_to_first( + "output_0", inference_result, target_runtime + ) ) print_inference_metrics(inference_job, inference_result, torch_out) @@ -209,7 +219,7 @@ def export_model( def main(): warnings.filterwarnings("ignore") - parser = export_parser(model_cls=Model, supports_qnn=False, supports_ort=False) + parser = export_parser(model_cls=Model, supports_qnn=False) args = parser.parse_args() export_model(**vars(args)) diff --git a/qai_hub_models/models/ffnet_78s_quantized/perf.yaml b/qai_hub_models/models/ffnet_78s_quantized/perf.yaml index 
76857574..1c52bc46 100644 --- a/qai_hub_models/models/ffnet_78s_quantized/perf.yaml +++ b/qai_hub_models/models/ffnet_78s_quantized/perf.yaml @@ -26,6 +26,7 @@ aggregated: - Samsung Galaxy S24 Ultra - Samsung Galaxy S24+ - Samsung Galaxy Tab S8 + - Snapdragon X Elite CRD - Xiaomi 12 - Xiaomi 12 Pro supported_chipsets: @@ -36,15 +37,16 @@ aggregated: - Snapdragon® 8 Gen 2 - Snapdragon® 8 Gen 3 - Snapdragon® 888 + - Snapdragon® X Elite models: - name: FFNet-78S-Quantized performance_metrics: - torchscript_onnx_tflite: - inference_time: 8382.0 - throughput: 119.30326890956812 + inference_time: 8341.0 + throughput: 119.88970147464333 estimated_peak_memory_range: - min: 688128 - max: 2625256 + min: 684032 + max: 2437040 primary_compute_unit: NPU precision: int8 layer_info: @@ -52,22 +54,37 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 154 - job_id: jqpyr7e05 + job_id: jz57drdn5 + job_status: Passed + torchscript_onnx_qnn: + inference_time: 5952.0 + throughput: 168.01075268817203 + estimated_peak_memory_range: + min: 8372224 + max: 27369456 + primary_compute_unit: NPU + precision: int8 + layer_info: + layers_on_npu: 146 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 146 + job_id: jo5mzxz7p job_status: Passed torchscript_onnx_ort: - inference_time: 53059.0 - throughput: 18.846943968035582 + inference_time: 12352.0 + throughput: 80.95854922279793 estimated_peak_memory_range: - min: 30326784 - max: 75211072 + min: 30101504 + max: 79464896 primary_compute_unit: NPU precision: int8 layer_info: - layers_on_npu: 1 + layers_on_npu: 151 layers_on_gpu: 0 layers_on_cpu: 0 - total_layers: 1 - job_id: j1p8049qg + total_layers: 151 + job_id: jqpyd3d0p job_status: Passed reference_device_info: name: Samsung Galaxy S23 @@ -76,13 +93,13 @@ models: os_name: Android manufacturer: Samsung chipset: Snapdragon® 8 Gen 2 - timestamp: '2024-04-23T18:42:32.154340Z' + timestamp: '2024-05-20T16:35:29.251992Z' - torchscript_onnx_tflite: - inference_time: 5988.0 - throughput: 167.000668002672 + inference_time: 5972.0 + throughput: 167.44809109176154 estimated_peak_memory_range: - min: 20480 - max: 87117952 + min: 12288 + max: 88653408 primary_compute_unit: NPU precision: int8 layer_info: @@ -90,22 +107,37 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 154 - job_id: j2p03vq0p + job_id: jqp4wrw2g + job_status: Passed + torchscript_onnx_qnn: + inference_time: 4317.0 + throughput: 231.6423442205235 + estimated_peak_memory_range: + min: 6307840 + max: 75240272 + primary_compute_unit: NPU + precision: int8 + layer_info: + layers_on_npu: 146 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 146 + job_id: jegnevejg job_status: Passed torchscript_onnx_ort: - inference_time: 31534.0 - throughput: 31.71180313312615 + inference_time: 9441.0 + throughput: 105.92098294672175 estimated_peak_memory_range: - min: 31961088 - max: 77114832 + min: 31965184 + max: 81051088 primary_compute_unit: NPU precision: int8 layer_info: - layers_on_npu: 1 + layers_on_npu: 151 layers_on_gpu: 0 layers_on_cpu: 0 - total_layers: 1 - job_id: jogk79nvp + total_layers: 151 + job_id: j2p0r010p job_status: Passed reference_device_info: name: Samsung Galaxy S24 @@ -114,88 +146,156 @@ models: os_name: Android manufacturer: Samsung chipset: Snapdragon® 8 Gen 3 - timestamp: '2024-04-23T18:42:32.154385Z' + timestamp: '2024-05-20T16:35:29.252019Z' - torchscript_onnx_tflite: - inference_time: 57755.0 - throughput: 17.31451822353043 + inference_time: 8351.0 + throughput: 119.74613818704347 estimated_peak_memory_range: - min: 319488 - 
max: 58248928 + min: 696320 + max: 3185352 primary_compute_unit: NPU precision: int8 layer_info: - layers_on_npu: 156 + layers_on_npu: 154 layers_on_gpu: 0 layers_on_cpu: 0 - total_layers: 156 - job_id: jz5708lng + total_layers: 154 + job_id: j0px1o18g job_status: Passed - torchscript_onnx_ort: - inference_time: 547799.0 - throughput: 1.825487085591613 + torchscript_onnx_qnn: + inference_time: 5974.0 + throughput: 167.39203213927016 estimated_peak_memory_range: - min: 166916096 - max: 242608960 - primary_compute_unit: CPU - precision: fp32 + min: 6336512 + max: 26276408 + primary_compute_unit: NPU + precision: int8 layer_info: - layers_on_npu: 0 + layers_on_npu: 146 layers_on_gpu: 0 - layers_on_cpu: 149 - total_layers: 149 - job_id: jn5qemke5 + layers_on_cpu: 0 + total_layers: 146 + job_id: jep2mym65 job_status: Passed reference_device_info: - name: RB3 Gen 2 (Proxy) + name: QCS8550 (Proxy) os: '12' form_factor: Iot os_name: Android manufacturer: Qualcomm - chipset: Qcs6490 - timestamp: '2024-04-23T18:42:32.154436Z' + chipset: Qcs8550 + timestamp: '2024-05-20T16:35:29.252037Z' - torchscript_onnx_tflite: - inference_time: 235689.0 - throughput: 4.242879387667647 + inference_time: 45673.0 + throughput: 21.89477371751363 + estimated_peak_memory_range: + min: 774144 + max: 49521760 + primary_compute_unit: NPU + precision: int8 + layer_info: + layers_on_npu: 154 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 154 + job_id: jz5wqzl65 + job_status: Passed + torchscript_onnx_qnn: + inference_time: 37262.0 + throughput: 26.83699210992432 estimated_peak_memory_range: - min: 2572288 - max: 5196608 + min: 6307840 + max: 71671376 primary_compute_unit: NPU precision: int8 layer_info: - layers_on_npu: 156 + layers_on_npu: 146 layers_on_gpu: 0 layers_on_cpu: 0 - total_layers: 156 - job_id: j2p02or25 + total_layers: 146 + job_id: j0pxyrl1g job_status: Passed reference_device_info: - name: RB5 (Proxy) + name: RB3 Gen 2 (Proxy) os: '12' form_factor: Iot os_name: Android manufacturer: Qualcomm - chipset: Qcs8250 - timestamp: '2024-04-23T18:42:32.154466Z' + chipset: Qcs6490 + timestamp: '2024-05-20T16:35:29.252054Z' - torchscript_onnx_tflite: - inference_time: 10675.0 - throughput: 93.6768149882904 + inference_time: 218485.0 + throughput: 4.576973247591368 estimated_peak_memory_range: - min: 2576384 - max: 4529144 + min: 770048 + max: 10557616 primary_compute_unit: NPU precision: int8 layer_info: - layers_on_npu: 156 + layers_on_npu: 154 layers_on_gpu: 0 layers_on_cpu: 0 - total_layers: 156 - job_id: jvgdemq65 + total_layers: 154 + job_id: jmg9w2zlp job_status: Passed reference_device_info: - name: QCS8550 (Proxy) + name: RB5 (Proxy) os: '12' form_factor: Iot os_name: Android manufacturer: Qualcomm - chipset: Qcs8550 - timestamp: '2024-04-23T18:42:32.154494Z' + chipset: Qcs8250 + timestamp: '2024-05-20T16:35:29.252065Z' + - torchscript_onnx_qnn: + inference_time: 7096.0 + throughput: 140.92446448703495 + estimated_peak_memory_range: + min: 6303744 + max: 6303744 + primary_compute_unit: NPU + precision: int8 + layer_info: + layers_on_npu: 146 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 146 + job_id: jopry3ykg + job_status: Passed + torchscript_onnx_ort: + inference_time: 13843.0 + throughput: 72.23867658744491 + estimated_peak_memory_range: + min: 34721792 + max: 34721792 + primary_compute_unit: NPU + precision: int8 + layer_info: + layers_on_npu: 151 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 151 + job_id: j1p87y3q5 + job_status: Passed + torchscript_onnx_ort_dml_gpu: + 
inference_time: 801403.0 + throughput: 1.2478116503182544 + estimated_peak_memory_range: + min: 204279808 + max: 204279808 + primary_compute_unit: GPU + precision: int8 + layer_info: + layers_on_npu: 0 + layers_on_gpu: 1 + layers_on_cpu: 0 + total_layers: 1 + job_id: jogkyxlvp + job_status: Passed + reference_device_info: + name: Snapdragon X Elite CRD + os: '11' + form_factor: Compute + os_name: Windows + manufacturer: Qualcomm + chipset: Snapdragon® X Elite + timestamp: '2024-05-20T16:35:29.252087Z' diff --git a/qai_hub_models/models/googlenet/README.md b/qai_hub_models/models/googlenet/README.md index ddf7fdbf..71a8d343 100644 --- a/qai_hub_models/models/googlenet/README.md +++ b/qai_hub_models/models/googlenet/README.md @@ -14,6 +14,8 @@ accross various devices, can be found [here](https://aihub.qualcomm.com/models/g a hosted Qualcomm® device. + + ## Example & Usage diff --git a/qai_hub_models/models/googlenet/export.py b/qai_hub_models/models/googlenet/export.py index 3226da2e..eec00f37 100644 --- a/qai_hub_models/models/googlenet/export.py +++ b/qai_hub_models/models/googlenet/export.py @@ -119,9 +119,16 @@ def export_model( model.eval() source_model = torch.jit.trace(model.to("cpu"), make_torch_inputs(input_spec)) + # Convert outputs from channel last to channel first (preferred I/O format for QNN and TensorFlow Lite) + channel_last_flags = ( + " --force_channel_last_input image_tensor" + if target_runtime != TargetRuntime.ORT + else "" + ) + # 2. Compile the model to an on-device asset model_compile_options = model.get_hub_compile_options( - target_runtime, compile_options + " --force_channel_last_input image_tensor" + target_runtime, compile_options + channel_last_flags, hub_device ) print(f"Optimizing model {model_name} to run on-device") submitted_compile_job = hub.submit_compile_job( @@ -159,8 +166,12 @@ def export_model( ) sample_inputs = model.sample_inputs(input_spec) # Convert inputs from channel first to channel last - hub_inputs = transpose_channel_first_to_last( - "image_tensor", sample_inputs, target_runtime + hub_inputs = ( + sample_inputs + if target_runtime == TargetRuntime.ORT + else transpose_channel_first_to_last( + "image_tensor", sample_inputs, target_runtime + ) ) submitted_inference_job = hub.submit_inference_job( model=compile_job.get_target_model(), @@ -197,7 +208,7 @@ def export_model( def main(): warnings.filterwarnings("ignore") - parser = export_parser(model_cls=Model, supports_ort=False) + parser = export_parser(model_cls=Model) args = parser.parse_args() export_model(**vars(args)) diff --git a/qai_hub_models/models/googlenet/perf.yaml b/qai_hub_models/models/googlenet/perf.yaml index ff2fab34..0f54510e 100644 --- a/qai_hub_models/models/googlenet/perf.yaml +++ b/qai_hub_models/models/googlenet/perf.yaml @@ -22,6 +22,7 @@ aggregated: - Samsung Galaxy S24 Ultra - Samsung Galaxy S24+ - Samsung Galaxy Tab S8 + - Snapdragon X Elite CRD - Xiaomi 12 - Xiaomi 12 Pro supported_chipsets: @@ -30,15 +31,16 @@ aggregated: - Snapdragon® 8 Gen 2 - Snapdragon® 8 Gen 3 - Snapdragon® 888 + - Snapdragon® X Elite models: - name: GoogLeNet performance_metrics: - torchscript_onnx_tflite: - inference_time: 1044.0 - throughput: 957.8544061302682 + inference_time: 1047.0 + throughput: 955.1098376313277 estimated_peak_memory_range: - min: 28672 - max: 2002104 + min: 16384 + max: 1526704 primary_compute_unit: NPU precision: fp16 layer_info: @@ -46,14 +48,14 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 84 - job_id: jnp1yvlnp + job_id: jqp4wrx2g job_status: Passed 
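Editor's note on the regenerated perf.yaml numbers: throughput appears to be derived directly from `inference_time`, consistent with latencies reported in microseconds (the unit is inferred from the numbers, not stated in the diff). A quick sanity check against the GoogLeNet figure just above and the FFNet-78S-LowRes figure earlier in this diff:

```python
def throughput_from_latency(inference_time_us: float) -> float:
    # Inferences per second implied by a single-inference latency in microseconds.
    return 1_000_000.0 / inference_time_us


# Value pairs taken from the updated perf.yaml entries in this diff.
assert abs(throughput_from_latency(1047.0) - 955.1098376313277) < 1e-6   # GoogLeNet, TFLite
assert abs(throughput_from_latency(10832.0) - 92.31905465288035) < 1e-6  # FFNet-78S-LowRes, TFLite
```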
torchscript_onnx_qnn: - inference_time: 1075.0 - throughput: 930.2325581395348 + inference_time: 1089.0 + throughput: 918.2736455463728 estimated_peak_memory_range: - min: 20480 - max: 26621784 + min: 618496 + max: 4593576 primary_compute_unit: NPU precision: fp16 layer_info: @@ -61,22 +63,22 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 143 - job_id: jz5707wng + job_id: jegnev9jg job_status: Passed torchscript_onnx_ort: - inference_time: 1293.0 - throughput: 773.3952049497293 + inference_time: 1227.0 + throughput: 814.9959250203749 estimated_peak_memory_range: - min: 12288 - max: 46074600 + min: 16384 + max: 45472688 primary_compute_unit: NPU precision: fp16 layer_info: - layers_on_npu: 1 + layers_on_npu: 145 layers_on_gpu: 0 layers_on_cpu: 0 - total_layers: 1 - job_id: j0pxndj85 + total_layers: 145 + job_id: j2p0r0e0p job_status: Passed reference_device_info: name: Samsung Galaxy S23 @@ -85,13 +87,13 @@ models: os_name: Android manufacturer: Samsung chipset: Snapdragon® 8 Gen 2 - timestamp: '2024-04-23T18:42:32.213322Z' + timestamp: '2024-05-20T16:35:29.373835Z' - torchscript_onnx_tflite: - inference_time: 650.0 - throughput: 1538.4615384615386 + inference_time: 691.0 + throughput: 1447.178002894356 estimated_peak_memory_range: - min: 16384 - max: 45786064 + min: 12288 + max: 46214624 primary_compute_unit: NPU precision: fp16 layer_info: @@ -99,14 +101,14 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 84 - job_id: jvgdez965 + job_id: j0px1o78g job_status: Passed torchscript_onnx_qnn: - inference_time: 693.0 - throughput: 1443.001443001443 + inference_time: 699.0 + throughput: 1430.615164520744 estimated_peak_memory_range: min: 0 - max: 53494384 + max: 56918592 primary_compute_unit: NPU precision: fp16 layer_info: @@ -114,22 +116,22 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 143 - job_id: jqp4k9o2g + job_id: jopry34kg job_status: Passed torchscript_onnx_ort: - inference_time: 852.0 - throughput: 1173.7089201877934 + inference_time: 898.0 + throughput: 1113.5857461024498 estimated_peak_memory_range: - min: 618496 - max: 24414912 + min: 602112 + max: 25082000 primary_compute_unit: NPU precision: fp16 layer_info: - layers_on_npu: 1 + layers_on_npu: 145 layers_on_gpu: 0 layers_on_cpu: 0 - total_layers: 1 - job_id: jo5mqd27p + total_layers: 145 + job_id: j1p87ywq5 job_status: Passed reference_device_info: name: Samsung Galaxy S24 @@ -138,13 +140,13 @@ models: os_name: Android manufacturer: Samsung chipset: Snapdragon® 8 Gen 3 - timestamp: '2024-04-23T18:42:32.213386Z' + timestamp: '2024-05-20T16:35:29.373860Z' - torchscript_onnx_tflite: - inference_time: 1043.0 - throughput: 958.7727708533077 + inference_time: 1047.0 + throughput: 955.1098376313277 estimated_peak_memory_range: min: 12288 - max: 1850480 + max: 17376784 primary_compute_unit: NPU precision: fp16 layer_info: @@ -152,14 +154,14 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 84 - job_id: jlpeenk8p + job_id: jo5mzxw7p job_status: Passed torchscript_onnx_qnn: - inference_time: 1090.0 - throughput: 917.4311926605504 + inference_time: 1094.0 + throughput: 914.0767824497258 estimated_peak_memory_range: min: 622592 - max: 4955600 + max: 5356744 primary_compute_unit: NPU precision: fp16 layer_info: @@ -167,7 +169,7 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 143 - job_id: jnp1ymenp + job_id: jqpyd340p job_status: Passed reference_device_info: name: QCS8550 (Proxy) @@ -176,4 +178,57 @@ models: os_name: Android manufacturer: Qualcomm chipset: Qcs8550 - timestamp: 
'2024-04-23T18:42:32.213429Z' + timestamp: '2024-05-20T16:35:29.373877Z' + - torchscript_onnx_qnn: + inference_time: 1259.0 + throughput: 794.2811755361398 + estimated_peak_memory_range: + min: 602112 + max: 602112 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 143 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 143 + job_id: jep2my765 + job_status: Passed + torchscript_onnx_ort: + inference_time: 1333.0 + throughput: 750.1875468867216 + estimated_peak_memory_range: + min: 11251712 + max: 11251712 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 145 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 145 + job_id: jogkyxrvp + job_status: Passed + torchscript_onnx_ort_dml_gpu: + inference_time: 5736.0 + throughput: 174.33751743375174 + estimated_peak_memory_range: + min: 11059200 + max: 11059200 + primary_compute_unit: CPU + precision: fp32 + layer_info: + layers_on_npu: 0 + layers_on_gpu: 0 + layers_on_cpu: 84 + total_layers: 84 + job_id: jn5q2q9e5 + job_status: Passed + reference_device_info: + name: Snapdragon X Elite CRD + os: '11' + form_factor: Compute + os_name: Windows + manufacturer: Qualcomm + chipset: Snapdragon® X Elite + timestamp: '2024-05-20T16:35:29.373901Z' diff --git a/qai_hub_models/models/googlenet_quantized/README.md b/qai_hub_models/models/googlenet_quantized/README.md index dfa75bdd..38fef799 100644 --- a/qai_hub_models/models/googlenet_quantized/README.md +++ b/qai_hub_models/models/googlenet_quantized/README.md @@ -14,6 +14,8 @@ accross various devices, can be found [here](https://aihub.qualcomm.com/models/g a hosted Qualcomm® device. + + ## Example & Usage diff --git a/qai_hub_models/models/googlenet_quantized/export.py b/qai_hub_models/models/googlenet_quantized/export.py index 44eb1ac6..df3a2329 100644 --- a/qai_hub_models/models/googlenet_quantized/export.py +++ b/qai_hub_models/models/googlenet_quantized/export.py @@ -123,9 +123,16 @@ def export_model( else: quant_calibration_data = model.get_calibration_data(target_runtime, input_spec) + # Convert outputs from channel last to channel first (preferred I/O format for QNN and TensorFlow Lite) + channel_last_flags = ( + " --force_channel_last_input image_tensor" + if target_runtime != TargetRuntime.ORT + else "" + ) + # 2. 
Compile the model to an on-device asset model_compile_options = model.get_hub_compile_options( - target_runtime, compile_options + " --force_channel_last_input image_tensor" + target_runtime, compile_options + channel_last_flags, hub_device ) print(f"Optimizing model {model_name} to run on-device") submitted_compile_job = hub.submit_compile_job( @@ -167,8 +174,12 @@ def export_model( if target_runtime == TargetRuntime.QNN: hub_inputs = get_qnn_inputs(compile_job, sample_inputs) # Convert inputs from channel first to channel last - hub_inputs = transpose_channel_first_to_last( - "image_tensor", sample_inputs, target_runtime + hub_inputs = ( + sample_inputs + if target_runtime == TargetRuntime.ORT + else transpose_channel_first_to_last( + "image_tensor", sample_inputs, target_runtime + ) ) submitted_inference_job = hub.submit_inference_job( model=compile_job.get_target_model(), @@ -207,7 +218,7 @@ def export_model( def main(): warnings.filterwarnings("ignore") - parser = export_parser(model_cls=Model, supports_ort=False) + parser = export_parser(model_cls=Model) args = parser.parse_args() export_model(**vars(args)) diff --git a/qai_hub_models/models/googlenet_quantized/perf.yaml b/qai_hub_models/models/googlenet_quantized/perf.yaml index 164b7f27..17e41ae4 100644 --- a/qai_hub_models/models/googlenet_quantized/perf.yaml +++ b/qai_hub_models/models/googlenet_quantized/perf.yaml @@ -26,6 +26,7 @@ aggregated: - Samsung Galaxy S24 Ultra - Samsung Galaxy S24+ - Samsung Galaxy Tab S8 + - Snapdragon X Elite CRD - Xiaomi 12 - Xiaomi 12 Pro supported_chipsets: @@ -36,6 +37,7 @@ aggregated: - Snapdragon® 8 Gen 2 - Snapdragon® 8 Gen 3 - Snapdragon® 888 + - Snapdragon® X Elite models: - name: GoogLeNetQuantized performance_metrics: @@ -44,7 +46,7 @@ models: throughput: 3367.003367003367 estimated_peak_memory_range: min: 12288 - max: 1529584 + max: 1659216 primary_compute_unit: NPU precision: int8 layer_info: @@ -52,14 +54,14 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 84 - job_id: jopr8nqk5 + job_id: j1glkme2p job_status: Passed torchscript_onnx_qnn: - inference_time: 346.0 - throughput: 2890.173410404624 + inference_time: 345.0 + throughput: 2898.550724637681 estimated_peak_memory_range: - min: 16384 - max: 139797592 + min: 90112 + max: 4621032 primary_compute_unit: NPU precision: int8 layer_info: @@ -67,22 +69,22 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 86 - job_id: jogk79mvp + job_id: jwgov6e15 job_status: Passed torchscript_onnx_ort: - inference_time: 756.0 - throughput: 1322.7513227513227 + inference_time: 623.0 + throughput: 1605.1364365971108 estimated_peak_memory_range: min: 12288 - max: 22997816 + max: 31466656 primary_compute_unit: NPU precision: int8 layer_info: - layers_on_npu: 1 + layers_on_npu: 94 layers_on_gpu: 0 layers_on_cpu: 0 - total_layers: 1 - job_id: j1gl61r2g + total_layers: 94 + job_id: jygz7dv4p job_status: Passed reference_device_info: name: Samsung Galaxy S23 @@ -91,13 +93,13 @@ models: os_name: Android manufacturer: Samsung chipset: Snapdragon® 8 Gen 2 - timestamp: '2024-04-23T18:42:32.237563Z' + timestamp: '2024-05-20T16:35:29.404186Z' - torchscript_onnx_tflite: - inference_time: 229.0 - throughput: 4366.812227074236 + inference_time: 214.0 + throughput: 4672.897196261682 estimated_peak_memory_range: min: 12288 - max: 32807600 + max: 33138256 primary_compute_unit: NPU precision: int8 layer_info: @@ -105,14 +107,14 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 84 - job_id: jqpyr7w05 + job_id: jw5614qnp job_status: Passed 
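Editor's note: because these perf.yaml hunks are long, a small reader can help when comparing the regenerated numbers across runtimes and devices. A minimal sketch assuming the layout shown in these hunks; PyYAML and the file path are assumptions, and failed entries (which carry the string `'null'`) are skipped via `job_status`:

```python
import yaml  # PyYAML

# Illustrative path; any of the perf.yaml files touched in this diff has the same layout.
with open("qai_hub_models/models/googlenet_quantized/perf.yaml") as f:
    perf = yaml.safe_load(f)

for model in perf["models"]:
    for entry in model["performance_metrics"]:
        device = entry["reference_device_info"]["name"]
        for key, metrics in entry.items():
            if not key.startswith("torchscript_onnx"):
                continue  # skip reference_device_info / timestamp
            if metrics.get("job_status") != "Passed":
                continue  # failed runs report 'null' metrics
            print(f"{model['name']} | {device} | {key}: "
                  f"inference_time={metrics['inference_time']}, "
                  f"throughput={metrics['throughput']}")
```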
torchscript_onnx_qnn: - inference_time: 242.0 - throughput: 4132.231404958678 + inference_time: 250.0 + throughput: 4000.0 estimated_peak_memory_range: - min: 163840 - max: 41416608 + min: 0 + max: 43090384 primary_compute_unit: NPU precision: int8 layer_info: @@ -120,22 +122,22 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 86 - job_id: jn5qemoe5 + job_id: j1pvwkzzg job_status: Passed torchscript_onnx_ort: - inference_time: 547.0 - throughput: 1828.1535648994516 + inference_time: 475.0 + throughput: 2105.2631578947367 estimated_peak_memory_range: - min: 3473408 - max: 30390976 + min: 0 + max: 26393632 primary_compute_unit: NPU precision: int8 layer_info: - layers_on_npu: 1 + layers_on_npu: 94 layers_on_gpu: 0 layers_on_cpu: 0 - total_layers: 1 - job_id: jw56edlng + total_layers: 94 + job_id: jz5w96m4p job_status: Passed reference_device_info: name: Samsung Galaxy S24 @@ -144,13 +146,28 @@ models: os_name: Android manufacturer: Samsung chipset: Snapdragon® 8 Gen 3 - timestamp: '2024-04-23T18:42:32.237614Z' + timestamp: '2024-05-20T16:35:29.404215Z' - torchscript_onnx_tflite: - inference_time: 1013.0 - throughput: 987.1668311944719 + inference_time: 297.0 + throughput: 3367.003367003367 estimated_peak_memory_range: - min: 20480 - max: 16869552 + min: 12288 + max: 1518064 + primary_compute_unit: NPU + precision: int8 + layer_info: + layers_on_npu: 84 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 84 + job_id: j1p3m0qmg + job_status: Passed + torchscript_onnx_qnn: + inference_time: 344.0 + throughput: 2906.9767441860463 + estimated_peak_memory_range: + min: 16384 + max: 100695528 primary_compute_unit: NPU precision: int8 layer_info: @@ -158,37 +175,45 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 86 - job_id: jz5708xqg + job_id: jlpevm485 job_status: Passed - torchscript_onnx_qnn: - inference_time: 'null' - throughput: 'null' + reference_device_info: + name: QCS8550 (Proxy) + os: '12' + form_factor: Iot + os_name: Android + manufacturer: Qualcomm + chipset: Qcs8550 + timestamp: '2024-05-20T16:35:29.404232Z' + - torchscript_onnx_tflite: + inference_time: 950.0 + throughput: 1052.6315789473683 estimated_peak_memory_range: - min: 0 - max: 0 - primary_compute_unit: 'null' - precision: 'null' + min: 12288 + max: 17406016 + primary_compute_unit: NPU + precision: int8 layer_info: - layers_on_npu: 0 + layers_on_npu: 84 layers_on_gpu: 0 layers_on_cpu: 0 - total_layers: 0 - job_id: jegnlwev5 - job_status: Failed - torchscript_onnx_ort: - inference_time: 10247.0 - throughput: 97.58953840148337 + total_layers: 84 + job_id: jygzrylk5 + job_status: Passed + torchscript_onnx_qnn: + inference_time: 1100.0 + throughput: 909.0909090909091 estimated_peak_memory_range: - min: 2646016 - max: 50596416 - primary_compute_unit: CPU - precision: fp32 + min: 163840 + max: 37495168 + primary_compute_unit: NPU + precision: int8 layer_info: - layers_on_npu: 0 + layers_on_npu: 86 layers_on_gpu: 0 - layers_on_cpu: 95 - total_layers: 95 - job_id: j1p3vw2mg + layers_on_cpu: 0 + total_layers: 86 + job_id: jmg9w2owp job_status: Passed reference_device_info: name: RB3 Gen 2 (Proxy) @@ -197,21 +222,21 @@ models: os_name: Android manufacturer: Qualcomm chipset: Qcs6490 - timestamp: '2024-04-23T18:42:32.237671Z' + timestamp: '2024-05-20T16:35:29.404247Z' - torchscript_onnx_tflite: - inference_time: 5919.0 - throughput: 168.94745734076702 + inference_time: 5755.0 + throughput: 173.7619461337967 estimated_peak_memory_range: min: 20480 - max: 6396208 + max: 7049192 primary_compute_unit: NPU precision: 
int8 layer_info: - layers_on_npu: 86 + layers_on_npu: 84 layers_on_gpu: 0 layers_on_cpu: 0 - total_layers: 86 - job_id: j1p8mj7z5 + total_layers: 84 + job_id: jz5wqzy65 job_status: Passed reference_device_info: name: RB5 (Proxy) @@ -220,13 +245,13 @@ models: os_name: Android manufacturer: Qualcomm chipset: Qcs8250 - timestamp: '2024-04-23T18:42:32.237693Z' - - torchscript_onnx_tflite: - inference_time: 322.0 - throughput: 3105.590062111801 + timestamp: '2024-05-20T16:35:29.404257Z' + - torchscript_onnx_qnn: + inference_time: 465.0 + throughput: 2150.537634408602 estimated_peak_memory_range: - min: 12288 - max: 2046792 + min: 540672 + max: 540672 primary_compute_unit: NPU precision: int8 layer_info: @@ -234,28 +259,43 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 86 - job_id: j0pxnzyj5 + job_id: j7gjlnk1p job_status: Passed - torchscript_onnx_qnn: - inference_time: 365.0 - throughput: 2739.72602739726 + torchscript_onnx_ort: + inference_time: 616.0 + throughput: 1623.3766233766235 estimated_peak_memory_range: - min: 634880 - max: 5391328 + min: 19083264 + max: 19083264 primary_compute_unit: NPU precision: int8 layer_info: - layers_on_npu: 88 + layers_on_npu: 94 layers_on_gpu: 0 layers_on_cpu: 0 - total_layers: 88 - job_id: jep20zmxg + total_layers: 94 + job_id: jmg94n9m5 + job_status: Passed + torchscript_onnx_ort_dml_gpu: + inference_time: 2182.0 + throughput: 458.29514207149407 + estimated_peak_memory_range: + min: 1978368 + max: 1978368 + primary_compute_unit: CPU + precision: fp32 + layer_info: + layers_on_npu: 0 + layers_on_gpu: 0 + layers_on_cpu: 94 + total_layers: 94 + job_id: jnp18zqng job_status: Passed reference_device_info: - name: QCS8550 (Proxy) - os: '12' - form_factor: Iot - os_name: Android + name: Snapdragon X Elite CRD + os: '11' + form_factor: Compute + os_name: Windows manufacturer: Qualcomm - chipset: Qcs8550 - timestamp: '2024-04-23T18:42:32.237731Z' + chipset: Snapdragon® X Elite + timestamp: '2024-05-20T16:35:29.404279Z' diff --git a/qai_hub_models/models/hrnet_pose/README.md b/qai_hub_models/models/hrnet_pose/README.md index 60809ac6..1291e266 100644 --- a/qai_hub_models/models/hrnet_pose/README.md +++ b/qai_hub_models/models/hrnet_pose/README.md @@ -14,6 +14,8 @@ accross various devices, can be found [here](https://aihub.qualcomm.com/models/h a hosted Qualcomm® device. 
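Editor's note: the regenerated perf.yaml files (GoogLeNetQuantized just above, and the others in this diff) also gain `torchscript_onnx_ort_dml_gpu` entries measured on the Windows-based Snapdragon X Elite CRD; the label points to ONNX Runtime with the DirectML execution provider. For readers who want to reproduce such a run locally, a minimal sketch — the model file, input shape, and availability of an ONNX Runtime build with DirectML support are assumptions, not part of this diff:

```python
import numpy as np
import onnxruntime as ort

# Assumes an onnxruntime build with DirectML support and an exported ONNX model file.
session = ort.InferenceSession("model.onnx", providers=["DmlExecutionProvider"])
input_name = session.get_inputs()[0].name
dummy = np.zeros((1, 3, 224, 224), dtype=np.float32)  # illustrative NCHW shape; adjust to the model
outputs = session.run(None, {input_name: dummy})
print(session.get_providers(), [o.shape for o in outputs])
```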
+ + ## Example & Usage Install the package via pip: diff --git a/qai_hub_models/models/hrnet_pose/app.py b/qai_hub_models/models/hrnet_pose/app.py index 7f500e96..7f97c3a7 100644 --- a/qai_hub_models/models/hrnet_pose/app.py +++ b/qai_hub_models/models/hrnet_pose/app.py @@ -200,6 +200,6 @@ def predict_pose_keypoints( predicted_images = [] for i, img in enumerate(NHWC_int_numpy_frames): - draw_points(img, keypoints[i], color=(255, 0, 0), size=2) + draw_points(img, keypoints[i], color=(255, 0, 0), size=6) predicted_images.append(fromarray(img)) return predicted_images diff --git a/qai_hub_models/models/hrnet_pose/export.py b/qai_hub_models/models/hrnet_pose/export.py index 7a1669ca..f8c0f803 100644 --- a/qai_hub_models/models/hrnet_pose/export.py +++ b/qai_hub_models/models/hrnet_pose/export.py @@ -120,12 +120,16 @@ def export_model( model.eval() source_model = torch.jit.trace(model.to("cpu"), make_torch_inputs(input_spec)) + # Convert outputs from channel last to channel first (preferred I/O format for QNN and TensorFlow Lite) + channel_last_flags = ( + " --force_channel_last_input image" + " --force_channel_last_output output_0" + if target_runtime != TargetRuntime.ORT + else "" + ) + # 2. Compile the model to an on-device asset model_compile_options = model.get_hub_compile_options( - target_runtime, - compile_options - + " --force_channel_last_input image" - + " --force_channel_last_output output_0", + target_runtime, compile_options + channel_last_flags, hub_device ) print(f"Optimizing model {model_name} to run on-device") submitted_compile_job = hub.submit_compile_job( @@ -163,8 +167,10 @@ def export_model( ) sample_inputs = model.sample_inputs(input_spec) # Convert inputs from channel first to channel last - hub_inputs = transpose_channel_first_to_last( - "image", sample_inputs, target_runtime + hub_inputs = ( + sample_inputs + if target_runtime == TargetRuntime.ORT + else transpose_channel_first_to_last("image", sample_inputs, target_runtime) ) submitted_inference_job = hub.submit_inference_job( model=compile_job.get_target_model(), @@ -192,8 +198,12 @@ def export_model( assert inference_job is not None and inference_job.wait().success inference_result: hub.client.DatasetEntries = inference_job.download_output_data() # type: ignore # Convert outputs from channel last to channel first - inference_result = transpose_channel_last_to_first( - "output_0", inference_result, target_runtime + inference_result = ( + inference_result + if target_runtime == TargetRuntime.ORT + else transpose_channel_last_to_first( + "output_0", inference_result, target_runtime + ) ) print_inference_metrics(inference_job, inference_result, torch_out) @@ -205,7 +215,7 @@ def export_model( def main(): warnings.filterwarnings("ignore") - parser = export_parser(model_cls=Model, supports_ort=False) + parser = export_parser(model_cls=Model) args = parser.parse_args() export_model(**vars(args)) diff --git a/qai_hub_models/models/hrnet_pose/info.yaml b/qai_hub_models/models/hrnet_pose/info.yaml index bf4f1b15..9220f62f 100644 --- a/qai_hub_models/models/hrnet_pose/info.yaml +++ b/qai_hub_models/models/hrnet_pose/info.yaml @@ -16,7 +16,7 @@ source_repo: https://github.com/quic/aimet-model-zoo/tree/develop/aimet_zoo_torch/hrnet_posenet technical_details: Model checkpoint: hrnet_posenet_FP32_state_dict - Input resolution: 192x256 + Input resolution: 256x192 Number of parameters: 28.5M Model size: 109 MB applicable_scenarios: @@ -29,7 +29,7 @@ form_factors: - IoT related_models: [litehrnet, openpose] has_static_banner: yes 
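Editor's note: the hrnet_pose info.yaml hunk above corrects the stated input resolution from 192x256 to 256x192, matching the standard HRNet pose input of 256 (height) by 192 (width). Expressed as an input spec in the style this repo uses elsewhere (the exact dict below is illustrative, not taken from this diff):

```python
# Hypothetical input spec for HRNetPose: NCHW with height 256 and width 192.
input_spec = {"image": ((1, 3, 256, 192), "float32")}
```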
-has_animated_banner: no +has_animated_banner: yes license_type: other deploy_license_type: AI Model Hub License dataset: [] diff --git a/qai_hub_models/models/hrnet_pose/model.py b/qai_hub_models/models/hrnet_pose/model.py index 592ec066..c4d3c102 100644 --- a/qai_hub_models/models/hrnet_pose/model.py +++ b/qai_hub_models/models/hrnet_pose/model.py @@ -21,7 +21,7 @@ from qai_hub_models.utils.input_spec import InputSpec MODEL_ID = __name__.split(".")[-2] -MODEL_ASSET_VERSION = 1 +MODEL_ASSET_VERSION = 2 # This model originally comes from https://github.com/leoxiaobin/deep-high-resolution-net.pytorch # but we'll use the weights from AIMET # Weights and config stored in S3 are sourced from diff --git a/qai_hub_models/models/hrnet_pose/perf.yaml b/qai_hub_models/models/hrnet_pose/perf.yaml index 7e98ec4d..50386875 100644 --- a/qai_hub_models/models/hrnet_pose/perf.yaml +++ b/qai_hub_models/models/hrnet_pose/perf.yaml @@ -22,6 +22,7 @@ aggregated: - Samsung Galaxy S24 Ultra - Samsung Galaxy S24+ - Samsung Galaxy Tab S8 + - Snapdragon X Elite CRD - Xiaomi 12 - Xiaomi 12 Pro supported_chipsets: @@ -30,53 +31,54 @@ aggregated: - Snapdragon® 8 Gen 2 - Snapdragon® 8 Gen 3 - Snapdragon® 888 + - Snapdragon® X Elite models: - name: HRNetPose performance_metrics: - torchscript_onnx_tflite: - inference_time: 2289.0 - throughput: 436.871996505024 + inference_time: 2818.0 + throughput: 354.86160397444996 estimated_peak_memory_range: - min: 16384 - max: 2655344 + min: 28672 + max: 2913312 primary_compute_unit: NPU precision: fp16 layer_info: - layers_on_npu: 514 + layers_on_npu: 516 layers_on_gpu: 0 layers_on_cpu: 0 - total_layers: 514 - job_id: jwgok4q1p + total_layers: 516 + job_id: jvgdv176g job_status: Passed torchscript_onnx_qnn: - inference_time: 2297.0 - throughput: 435.35045711798 + inference_time: 2886.0 + throughput: 346.5003465003465 estimated_peak_memory_range: - min: 12288 - max: 59340792 + min: 16384 + max: 20957856 primary_compute_unit: NPU precision: fp16 layer_info: - layers_on_npu: 745 + layers_on_npu: 747 layers_on_gpu: 0 layers_on_cpu: 0 - total_layers: 745 - job_id: j7gjzw415 + total_layers: 747 + job_id: j0px1oe8g job_status: Passed torchscript_onnx_ort: - inference_time: 3007.0 - throughput: 332.5573661456601 + inference_time: 3134.0 + throughput: 319.0810465858328 estimated_peak_memory_range: min: 0 - max: 148641888 + max: 128298872 primary_compute_unit: NPU precision: fp16 layer_info: - layers_on_npu: 1 + layers_on_npu: 749 layers_on_gpu: 0 layers_on_cpu: 0 - total_layers: 1 - job_id: jygzo4k45 + total_layers: 749 + job_id: jep2my365 job_status: Passed reference_device_info: name: Samsung Galaxy S23 @@ -85,51 +87,51 @@ models: os_name: Android manufacturer: Samsung chipset: Snapdragon® 8 Gen 2 - timestamp: '2024-04-23T18:42:32.272940Z' + timestamp: '2024-05-20T16:35:29.443668Z' - torchscript_onnx_tflite: - inference_time: 1753.0 - throughput: 570.4506560182544 + inference_time: 2065.0 + throughput: 484.26150121065376 estimated_peak_memory_range: - min: 225280 - max: 107290736 + min: 12288 + max: 109086992 primary_compute_unit: NPU precision: fp16 layer_info: - layers_on_npu: 514 + layers_on_npu: 516 layers_on_gpu: 0 layers_on_cpu: 0 - total_layers: 514 - job_id: j1pv09xz5 + total_layers: 516 + job_id: jz57drvn5 job_status: Passed torchscript_onnx_qnn: - inference_time: 1719.0 - throughput: 581.7335660267597 + inference_time: 2134.0 + throughput: 468.6035613870665 estimated_peak_memory_range: - min: 606208 - max: 177224704 + min: 0 + max: 189704832 primary_compute_unit: NPU 
precision: fp16 layer_info: - layers_on_npu: 745 + layers_on_npu: 747 layers_on_gpu: 0 layers_on_cpu: 0 - total_layers: 745 - job_id: jlpeel38p + total_layers: 747 + job_id: jo5mzxv7p job_status: Passed torchscript_onnx_ort: - inference_time: 2250.0 - throughput: 444.44444444444446 + inference_time: 2215.0 + throughput: 451.46726862302484 estimated_peak_memory_range: min: 12288 - max: 81136704 + max: 93863680 primary_compute_unit: NPU precision: fp16 layer_info: - layers_on_npu: 1 + layers_on_npu: 749 layers_on_gpu: 0 layers_on_cpu: 0 - total_layers: 1 - job_id: jz5w21n45 + total_layers: 749 + job_id: jqpyd3v0p job_status: Passed reference_device_info: name: Samsung Galaxy S24 @@ -138,36 +140,36 @@ models: os_name: Android manufacturer: Samsung chipset: Snapdragon® 8 Gen 3 - timestamp: '2024-04-23T18:42:32.273098Z' + timestamp: '2024-05-20T16:35:29.443693Z' - torchscript_onnx_tflite: - inference_time: 2294.0 - throughput: 435.9197907585004 + inference_time: 2881.0 + throughput: 347.1017007983339 estimated_peak_memory_range: - min: 16384 - max: 3533472 + min: 24576 + max: 4152200 primary_compute_unit: NPU precision: fp16 layer_info: - layers_on_npu: 514 + layers_on_npu: 516 layers_on_gpu: 0 layers_on_cpu: 0 - total_layers: 514 - job_id: jogk7kyyp + total_layers: 516 + job_id: jqp4wrj2g job_status: Passed torchscript_onnx_qnn: - inference_time: 2291.0 - throughput: 436.4906154517678 + inference_time: 2909.0 + throughput: 343.7607425232039 estimated_peak_memory_range: - min: 610304 - max: 59474648 + min: 630784 + max: 16131888 primary_compute_unit: NPU precision: fp16 layer_info: - layers_on_npu: 745 + layers_on_npu: 747 layers_on_gpu: 0 layers_on_cpu: 0 - total_layers: 745 - job_id: j1p3vrmxg + total_layers: 747 + job_id: jopry31kg job_status: Passed reference_device_info: name: QCS8550 (Proxy) @@ -176,4 +178,57 @@ models: os_name: Android manufacturer: Qualcomm chipset: Qcs8550 - timestamp: '2024-04-23T18:42:32.273247Z' + timestamp: '2024-05-20T16:35:29.443710Z' + - torchscript_onnx_qnn: + inference_time: 3156.0 + throughput: 316.85678073510775 + estimated_peak_memory_range: + min: 589824 + max: 589824 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 747 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 747 + job_id: jegnevrjg + job_status: Passed + torchscript_onnx_ort: + inference_time: 2975.0 + throughput: 336.1344537815126 + estimated_peak_memory_range: + min: 54882304 + max: 54882304 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 749 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 749 + job_id: j2p0r0k0p + job_status: Passed + torchscript_onnx_ort_dml_gpu: + inference_time: 19453.0 + throughput: 51.40595280933532 + estimated_peak_memory_range: + min: 37265408 + max: 37265408 + primary_compute_unit: GPU + precision: fp16 + layer_info: + layers_on_npu: 0 + layers_on_gpu: 1 + layers_on_cpu: 0 + total_layers: 1 + job_id: j1p87y8q5 + job_status: Passed + reference_device_info: + name: Snapdragon X Elite CRD + os: '11' + form_factor: Compute + os_name: Windows + manufacturer: Qualcomm + chipset: Snapdragon® X Elite + timestamp: '2024-05-20T16:35:29.443733Z' diff --git a/qai_hub_models/models/huggingface_wavlm_base_plus/README.md b/qai_hub_models/models/huggingface_wavlm_base_plus/README.md index 3e9e7062..fc585abc 100644 --- a/qai_hub_models/models/huggingface_wavlm_base_plus/README.md +++ b/qai_hub_models/models/huggingface_wavlm_base_plus/README.md @@ -14,6 +14,8 @@ accross various devices, can be found 
[here](https://aihub.qualcomm.com/models/h a hosted Qualcomm® device. + + ## Example & Usage Install the package via pip: diff --git a/qai_hub_models/models/huggingface_wavlm_base_plus/export.py b/qai_hub_models/models/huggingface_wavlm_base_plus/export.py index 170b15f0..44c79380 100644 --- a/qai_hub_models/models/huggingface_wavlm_base_plus/export.py +++ b/qai_hub_models/models/huggingface_wavlm_base_plus/export.py @@ -119,7 +119,7 @@ def export_model( # 2. Compile the model to an on-device asset model_compile_options = model.get_hub_compile_options( - target_runtime, compile_options + target_runtime, compile_options, hub_device ) print(f"Optimizing model {model_name} to run on-device") submitted_compile_job = hub.submit_compile_job( diff --git a/qai_hub_models/models/huggingface_wavlm_base_plus/model.py b/qai_hub_models/models/huggingface_wavlm_base_plus/model.py index 5074e78e..f476aa98 100644 --- a/qai_hub_models/models/huggingface_wavlm_base_plus/model.py +++ b/qai_hub_models/models/huggingface_wavlm_base_plus/model.py @@ -82,7 +82,12 @@ def get_hub_profile_options( profile_options = super().get_hub_profile_options( target_runtime, other_profile_options ) - return profile_options + " --compute_unit cpu" + if ( + target_runtime == TargetRuntime.TFLITE + and "--compute_unit" not in profile_options + ): + profile_options = profile_options + " --compute_unit gpu" + return profile_options # Modules used to override Huggingface WavLM to be NPU friendly diff --git a/qai_hub_models/models/huggingface_wavlm_base_plus/perf.yaml b/qai_hub_models/models/huggingface_wavlm_base_plus/perf.yaml index ed9b36af..63c58551 100644 --- a/qai_hub_models/models/huggingface_wavlm_base_plus/perf.yaml +++ b/qai_hub_models/models/huggingface_wavlm_base_plus/perf.yaml @@ -22,6 +22,7 @@ aggregated: - Samsung Galaxy S24 Ultra - Samsung Galaxy S24+ - Samsung Galaxy Tab S8 + - Snapdragon X Elite CRD - Xiaomi 12 - Xiaomi 12 Pro supported_chipsets: @@ -30,15 +31,16 @@ aggregated: - Snapdragon® 8 Gen 2 - Snapdragon® 8 Gen 3 - Snapdragon® 888 + - Snapdragon® X Elite models: - name: HuggingFace-WavLM-Base-Plus performance_metrics: - torchscript_onnx_tflite: - inference_time: 884463.0 - throughput: 1.1306295458374178 + inference_time: 938575.0 + throughput: 1.0654449564499373 estimated_peak_memory_range: - min: 149233664 - max: 152668384 + min: 130052096 + max: 143676568 primary_compute_unit: CPU precision: fp32 layer_info: @@ -46,23 +48,38 @@ models: layers_on_gpu: 0 layers_on_cpu: 811 total_layers: 811 - job_id: jo5mqdy7p + job_id: jmg94n8m5 job_status: Passed + torchscript_onnx_qnn: + inference_time: 'null' + throughput: 'null' + estimated_peak_memory_range: + min: 0 + max: 0 + primary_compute_unit: 'null' + precision: 'null' + layer_info: + layers_on_npu: 0 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 0 + job_id: jz57dr6n5 + job_status: Failed torchscript_onnx_ort: - inference_time: 613080.0 - throughput: 1.631108501337509 + inference_time: 'null' + throughput: 'null' estimated_peak_memory_range: - min: 16220160 - max: 44091568 - primary_compute_unit: CPU - precision: fp32 + min: 0 + max: 0 + primary_compute_unit: 'null' + precision: 'null' layer_info: layers_on_npu: 0 layers_on_gpu: 0 - layers_on_cpu: 484 - total_layers: 484 - job_id: jopr8njk5 - job_status: Passed + layers_on_cpu: 0 + total_layers: 0 + job_id: jo5mzx47p + job_status: Failed reference_device_info: name: Samsung Galaxy S23 os: '13' @@ -70,13 +87,13 @@ models: os_name: Android manufacturer: Samsung chipset: Snapdragon® 8 Gen 2 - timestamp: 
'2024-04-23T18:42:32.343742Z' + timestamp: '2024-05-20T16:35:29.513720Z' - torchscript_onnx_tflite: - inference_time: 789013.0 - throughput: 1.2674062404548467 + inference_time: 852446.0 + throughput: 1.173094835332678 estimated_peak_memory_range: - min: 148623360 - max: 174462192 + min: 148041728 + max: 183065760 primary_compute_unit: CPU precision: fp32 layer_info: @@ -84,23 +101,38 @@ models: layers_on_gpu: 0 layers_on_cpu: 811 total_layers: 811 - job_id: jegnl78j5 + job_id: jnp18z3ng job_status: Passed + torchscript_onnx_qnn: + inference_time: 'null' + throughput: 'null' + estimated_peak_memory_range: + min: 0 + max: 0 + primary_compute_unit: 'null' + precision: 'null' + layer_info: + layers_on_npu: 0 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 0 + job_id: jqp4wr82g + job_status: Failed torchscript_onnx_ort: - inference_time: 513891.0 - throughput: 1.9459379518224682 + inference_time: 'null' + throughput: 'null' estimated_peak_memory_range: - min: 995328 - max: 204911264 - primary_compute_unit: CPU - precision: fp32 + min: 0 + max: 0 + primary_compute_unit: 'null' + precision: 'null' layer_info: layers_on_npu: 0 layers_on_gpu: 0 - layers_on_cpu: 484 - total_layers: 484 - job_id: jep20vn6g - job_status: Passed + layers_on_cpu: 0 + total_layers: 0 + job_id: jegnevxjg + job_status: Failed reference_device_info: name: Samsung Galaxy S24 os: '14' @@ -108,13 +140,13 @@ models: os_name: Android manufacturer: Samsung chipset: Snapdragon® 8 Gen 3 - timestamp: '2024-04-23T18:42:32.343896Z' + timestamp: '2024-05-20T16:35:29.513746Z' - torchscript_onnx_tflite: - inference_time: 928773.0 - throughput: 1.0766893525113241 + inference_time: 867664.0 + throughput: 1.1525198694425491 estimated_peak_memory_range: - min: 150151168 - max: 158231104 + min: 149274624 + max: 152991232 primary_compute_unit: CPU precision: fp32 layer_info: @@ -122,8 +154,23 @@ models: layers_on_gpu: 0 layers_on_cpu: 811 total_layers: 811 - job_id: jqp4k2wqg + job_id: jvgdv106g job_status: Passed + torchscript_onnx_qnn: + inference_time: 'null' + throughput: 'null' + estimated_peak_memory_range: + min: 0 + max: 0 + primary_compute_unit: 'null' + precision: 'null' + layer_info: + layers_on_npu: 0 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 0 + job_id: j0px1om8g + job_status: Failed reference_device_info: name: QCS8550 (Proxy) os: '12' @@ -131,4 +178,42 @@ models: os_name: Android manufacturer: Qualcomm chipset: Qcs8550 - timestamp: '2024-04-23T18:42:32.343990Z' + timestamp: '2024-05-20T16:35:29.513762Z' + - torchscript_onnx_ort: + inference_time: 'null' + throughput: 'null' + estimated_peak_memory_range: + min: 0 + max: 0 + primary_compute_unit: 'null' + precision: 'null' + layer_info: + layers_on_npu: 0 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 0 + job_id: jopry39kg + job_status: Failed + torchscript_onnx_ort_dml_gpu: + inference_time: 'null' + throughput: 'null' + estimated_peak_memory_range: + min: 0 + max: 0 + primary_compute_unit: 'null' + precision: 'null' + layer_info: + layers_on_npu: 0 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 0 + job_id: jep2myj65 + job_status: Failed + reference_device_info: + name: Snapdragon X Elite CRD + os: '11' + form_factor: Compute + os_name: Windows + manufacturer: Qualcomm + chipset: Snapdragon® X Elite + timestamp: '2024-05-20T16:35:29.513781Z' diff --git a/qai_hub_models/models/inception_v3/README.md b/qai_hub_models/models/inception_v3/README.md index 33e4e2ca..0b085c5e 100644 --- a/qai_hub_models/models/inception_v3/README.md +++ 
b/qai_hub_models/models/inception_v3/README.md @@ -14,6 +14,8 @@ accross various devices, can be found [here](https://aihub.qualcomm.com/models/i a hosted Qualcomm® device. + + ## Example & Usage diff --git a/qai_hub_models/models/inception_v3/export.py b/qai_hub_models/models/inception_v3/export.py index 94f8800d..e3919318 100644 --- a/qai_hub_models/models/inception_v3/export.py +++ b/qai_hub_models/models/inception_v3/export.py @@ -119,9 +119,16 @@ def export_model( model.eval() source_model = torch.jit.trace(model.to("cpu"), make_torch_inputs(input_spec)) + # Convert outputs from channel last to channel first (preferred I/O format for QNN and TensorFlow Lite) + channel_last_flags = ( + " --force_channel_last_input image_tensor" + if target_runtime != TargetRuntime.ORT + else "" + ) + # 2. Compile the model to an on-device asset model_compile_options = model.get_hub_compile_options( - target_runtime, compile_options + " --force_channel_last_input image_tensor" + target_runtime, compile_options + channel_last_flags, hub_device ) print(f"Optimizing model {model_name} to run on-device") submitted_compile_job = hub.submit_compile_job( @@ -159,8 +166,12 @@ def export_model( ) sample_inputs = model.sample_inputs(input_spec) # Convert inputs from channel first to channel last - hub_inputs = transpose_channel_first_to_last( - "image_tensor", sample_inputs, target_runtime + hub_inputs = ( + sample_inputs + if target_runtime == TargetRuntime.ORT + else transpose_channel_first_to_last( + "image_tensor", sample_inputs, target_runtime + ) ) submitted_inference_job = hub.submit_inference_job( model=compile_job.get_target_model(), @@ -197,7 +208,7 @@ def export_model( def main(): warnings.filterwarnings("ignore") - parser = export_parser(model_cls=Model, supports_ort=False) + parser = export_parser(model_cls=Model) args = parser.parse_args() export_model(**vars(args)) diff --git a/qai_hub_models/models/inception_v3/perf.yaml b/qai_hub_models/models/inception_v3/perf.yaml index e11be443..4d8aab78 100644 --- a/qai_hub_models/models/inception_v3/perf.yaml +++ b/qai_hub_models/models/inception_v3/perf.yaml @@ -22,6 +22,7 @@ aggregated: - Samsung Galaxy S24 Ultra - Samsung Galaxy S24+ - Samsung Galaxy Tab S8 + - Snapdragon X Elite CRD - Xiaomi 12 - Xiaomi 12 Pro supported_chipsets: @@ -30,15 +31,16 @@ aggregated: - Snapdragon® 8 Gen 2 - Snapdragon® 8 Gen 3 - Snapdragon® 888 + - Snapdragon® X Elite models: - name: Inception-v3 performance_metrics: - torchscript_onnx_tflite: - inference_time: 1337.0 - throughput: 747.9431563201197 + inference_time: 1342.0 + throughput: 745.156482861401 estimated_peak_memory_range: - min: 20480 - max: 2064624 + min: 12288 + max: 1685032 primary_compute_unit: NPU precision: fp16 layer_info: @@ -46,14 +48,14 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 129 - job_id: j2p03600p + job_id: jqpyd3n0p job_status: Passed torchscript_onnx_qnn: - inference_time: 1396.0 - throughput: 716.3323782234957 + inference_time: 1414.0 + throughput: 707.2135785007072 estimated_peak_memory_range: min: 16384 - max: 150190256 + max: 149750296 primary_compute_unit: NPU precision: fp16 layer_info: @@ -61,22 +63,22 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 219 - job_id: jogk78xvp + job_id: jogkyxovp job_status: Passed torchscript_onnx_ort: - inference_time: 1728.0 - throughput: 578.7037037037037 + inference_time: 1719.0 + throughput: 581.7335660267597 estimated_peak_memory_range: - min: 57344 - max: 214567960 + min: 12288 + max: 214330432 primary_compute_unit: NPU 
precision: fp16 layer_info: - layers_on_npu: 1 + layers_on_npu: 221 layers_on_gpu: 0 layers_on_cpu: 0 - total_layers: 1 - job_id: j1gl6lm2g + total_layers: 221 + job_id: j1p3m0xmg job_status: Passed reference_device_info: name: Samsung Galaxy S23 @@ -85,13 +87,13 @@ models: os_name: Android manufacturer: Samsung chipset: Snapdragon® 8 Gen 2 - timestamp: '2024-04-23T18:42:32.362074Z' + timestamp: '2024-05-20T16:35:29.537865Z' - torchscript_onnx_tflite: - inference_time: 1019.0 - throughput: 981.3542688910696 + inference_time: 1013.0 + throughput: 987.1668311944719 estimated_peak_memory_range: - min: 12288 - max: 51945968 + min: 16384 + max: 52159904 primary_compute_unit: NPU precision: fp16 layer_info: @@ -99,14 +101,14 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 129 - job_id: j1p801yqg + job_id: j2p0r0d0p job_status: Passed torchscript_onnx_qnn: - inference_time: 1044.0 - throughput: 957.8544061302682 + inference_time: 1043.0 + throughput: 958.7727708533077 estimated_peak_memory_range: - min: 618496 - max: 62186832 + min: 0 + max: 66127216 primary_compute_unit: NPU precision: fp16 layer_info: @@ -114,22 +116,22 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 219 - job_id: jn5qevqe5 + job_id: jn5q2qze5 job_status: Passed torchscript_onnx_ort: - inference_time: 1343.0 - throughput: 744.6016381236038 + inference_time: 1333.0 + throughput: 750.1875468867216 estimated_peak_memory_range: min: 618496 - max: 25688304 + max: 28967744 primary_compute_unit: NPU precision: fp16 layer_info: - layers_on_npu: 1 + layers_on_npu: 221 layers_on_gpu: 0 layers_on_cpu: 0 - total_layers: 1 - job_id: jw56ew4ng + total_layers: 221 + job_id: jwgov6o15 job_status: Passed reference_device_info: name: Samsung Galaxy S24 @@ -138,13 +140,13 @@ models: os_name: Android manufacturer: Samsung chipset: Snapdragon® 8 Gen 3 - timestamp: '2024-04-23T18:42:32.362144Z' + timestamp: '2024-05-20T16:35:29.537891Z' - torchscript_onnx_tflite: - inference_time: 1335.0 - throughput: 749.0636704119851 + inference_time: 1352.0 + throughput: 739.6449704142012 estimated_peak_memory_range: - min: 24576 - max: 1812440 + min: 16384 + max: 2133976 primary_compute_unit: NPU precision: fp16 layer_info: @@ -152,14 +154,14 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 129 - job_id: jogk7klyp + job_id: j1p87y6q5 job_status: Passed torchscript_onnx_qnn: - inference_time: 1397.0 - throughput: 715.8196134574088 + inference_time: 1421.0 + throughput: 703.7297677691766 estimated_peak_memory_range: - min: 36864 - max: 150659520 + min: 20480 + max: 150041024 primary_compute_unit: NPU precision: fp16 layer_info: @@ -167,7 +169,7 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 219 - job_id: j1p3vr4xg + job_id: jw5614rnp job_status: Passed reference_device_info: name: QCS8550 (Proxy) @@ -176,4 +178,57 @@ models: os_name: Android manufacturer: Qualcomm chipset: Qcs8550 - timestamp: '2024-04-23T18:42:32.362196Z' + timestamp: '2024-05-20T16:35:29.537908Z' + - torchscript_onnx_qnn: + inference_time: 1636.0 + throughput: 611.2469437652812 + estimated_peak_memory_range: + min: 602112 + max: 602112 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 219 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 219 + job_id: j1glkmo2p + job_status: Passed + torchscript_onnx_ort: + inference_time: 1677.0 + throughput: 596.3029218843172 + estimated_peak_memory_range: + min: 48324608 + max: 48324608 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 221 + layers_on_gpu: 0 
+ layers_on_cpu: 0 + total_layers: 221 + job_id: j1pvwkezg + job_status: Passed + torchscript_onnx_ort_dml_gpu: + inference_time: 12033.0 + throughput: 83.10479514667996 + estimated_peak_memory_range: + min: 26181632 + max: 26181632 + primary_compute_unit: CPU + precision: fp32 + layer_info: + layers_on_npu: 0 + layers_on_gpu: 0 + layers_on_cpu: 123 + total_layers: 123 + job_id: j7gjlno1p + job_status: Passed + reference_device_info: + name: Snapdragon X Elite CRD + os: '11' + form_factor: Compute + os_name: Windows + manufacturer: Qualcomm + chipset: Snapdragon® X Elite + timestamp: '2024-05-20T16:35:29.537932Z' diff --git a/qai_hub_models/models/inception_v3_quantized/README.md b/qai_hub_models/models/inception_v3_quantized/README.md index ee26f496..a0f99c07 100644 --- a/qai_hub_models/models/inception_v3_quantized/README.md +++ b/qai_hub_models/models/inception_v3_quantized/README.md @@ -3,7 +3,7 @@ # [Inception-v3-Quantized: Quantized Imagenet classifier and general purpose backbone](https://aihub.qualcomm.com/models/inception_v3_quantized) -InceptionNetV3 is a machine learning model that can classify images from the Imagenet dataset. It can also be used as a backbone in building more complex models for specific use cases. This model is post-training quantized to int8 using samples from [Google's open images dataset](https://storage.googleapis.com/openimages/web/index.html). +InceptionNetV3 is a machine learning model that can classify images from the Imagenet dataset. It can also be used as a backbone in building more complex models for specific use cases. This model is post-training quantized to int8 using samples from Google's open images dataset. This is based on the implementation of Inception-v3-Quantized found [here](https://github.com/pytorch/vision/blob/main/torchvision/models/inception.py). This repository contains scripts for optimized on-device @@ -14,6 +14,8 @@ accross various devices, can be found [here](https://aihub.qualcomm.com/models/i a hosted Qualcomm® device. + + ## Example & Usage diff --git a/qai_hub_models/models/inception_v3_quantized/export.py b/qai_hub_models/models/inception_v3_quantized/export.py index c6c03ade..f5eab10b 100644 --- a/qai_hub_models/models/inception_v3_quantized/export.py +++ b/qai_hub_models/models/inception_v3_quantized/export.py @@ -123,9 +123,16 @@ def export_model( else: quant_calibration_data = model.get_calibration_data(target_runtime, input_spec) + # Convert outputs from channel last to channel first (preferred I/O format for QNN and TensorFlow Lite) + channel_last_flags = ( + " --force_channel_last_input image_tensor" + if target_runtime != TargetRuntime.ORT + else "" + ) + # 2. 
Compile the model to an on-device asset model_compile_options = model.get_hub_compile_options( - target_runtime, compile_options + " --force_channel_last_input image_tensor" + target_runtime, compile_options + channel_last_flags, hub_device ) print(f"Optimizing model {model_name} to run on-device") submitted_compile_job = hub.submit_compile_job( @@ -167,8 +174,12 @@ def export_model( if target_runtime == TargetRuntime.QNN: hub_inputs = get_qnn_inputs(compile_job, sample_inputs) # Convert inputs from channel first to channel last - hub_inputs = transpose_channel_first_to_last( - "image_tensor", sample_inputs, target_runtime + hub_inputs = ( + sample_inputs + if target_runtime == TargetRuntime.ORT + else transpose_channel_first_to_last( + "image_tensor", sample_inputs, target_runtime + ) ) submitted_inference_job = hub.submit_inference_job( model=compile_job.get_target_model(), @@ -207,7 +218,7 @@ def export_model( def main(): warnings.filterwarnings("ignore") - parser = export_parser(model_cls=Model, supports_ort=False) + parser = export_parser(model_cls=Model) args = parser.parse_args() export_model(**vars(args)) diff --git a/qai_hub_models/models/inception_v3_quantized/info.yaml b/qai_hub_models/models/inception_v3_quantized/info.yaml index 06f3bf87..c3d40275 100644 --- a/qai_hub_models/models/inception_v3_quantized/info.yaml +++ b/qai_hub_models/models/inception_v3_quantized/info.yaml @@ -7,7 +7,7 @@ domain: Computer Vision description: InceptionNetV3 is a machine learning model that can classify images from the Imagenet dataset. It can also be used as a backbone in building more complex models for specific use cases. This model is post-training quantized to int8 using - samples from [Google's open images dataset](https://storage.googleapis.com/openimages/web/index.html). + samples from Google's open images dataset. 
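An editorial note on the channel-ordering changes above: the updated `export.py` scripts now build the extra compile options conditionally, so channel-last I/O is only forced (and sample inputs only transposed) for QNN and TensorFlow Lite, while ONNX Runtime (ORT) keeps channel-first tensors. Below is a minimal, self-contained sketch of that rule; the class and helper names are illustrative stand-ins that mirror identifiers appearing in the diff (`TargetRuntime.ORT`, `--force_channel_last_input`), not the repository's actual implementation.

```python
from enum import Enum

import numpy as np


class TargetRuntime(Enum):  # placeholder mirroring the TargetRuntime enum used in export.py
    TFLITE = "tflite"
    QNN = "qnn"
    ORT = "ort"


def channel_last_flags(target_runtime: TargetRuntime, input_names: str) -> str:
    """Extra compile options: empty for ORT, channel-last forcing for QNN/TFLite."""
    if target_runtime == TargetRuntime.ORT:
        return ""
    return f" --force_channel_last_input {input_names}"


def prepare_inputs(target_runtime: TargetRuntime, nchw: np.ndarray) -> np.ndarray:
    """Transpose NCHW -> NHWC for QNN/TFLite; pass inputs through unchanged for ORT."""
    if target_runtime == TargetRuntime.ORT:
        return nchw
    return nchw.transpose(0, 2, 3, 1)


image = np.zeros((1, 3, 224, 224), dtype=np.float32)
assert prepare_inputs(TargetRuntime.QNN, image).shape == (1, 224, 224, 3)
assert prepare_inputs(TargetRuntime.ORT, image).shape == (1, 3, 224, 224)
assert channel_last_flags(TargetRuntime.TFLITE, "image_tensor").strip() == "--force_channel_last_input image_tensor"
```

The same conditional pattern repeats later in this patch for models with multiple tensors (for example `image,mask` plus `--force_channel_last_output output_0` in lama_dilated), with ORT always taking the pass-through branch.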
use_case: Image Classification tags: - backbone diff --git a/qai_hub_models/models/inception_v3_quantized/perf.yaml b/qai_hub_models/models/inception_v3_quantized/perf.yaml index 0ab60e20..04c24471 100644 --- a/qai_hub_models/models/inception_v3_quantized/perf.yaml +++ b/qai_hub_models/models/inception_v3_quantized/perf.yaml @@ -26,6 +26,7 @@ aggregated: - Samsung Galaxy S24 Ultra - Samsung Galaxy S24+ - Samsung Galaxy Tab S8 + - Snapdragon X Elite CRD - Xiaomi 12 - Xiaomi 12 Pro supported_chipsets: @@ -36,15 +37,16 @@ aggregated: - Snapdragon® 8 Gen 2 - Snapdragon® 8 Gen 3 - Snapdragon® 888 + - Snapdragon® X Elite models: - name: Inception-v3-Quantized performance_metrics: - torchscript_onnx_tflite: - inference_time: 623.0 - throughput: 1605.1364365971108 + inference_time: 615.0 + throughput: 1626.0162601626016 estimated_peak_memory_range: - min: 40960 - max: 1585824 + min: 20480 + max: 1835968 primary_compute_unit: NPU precision: int8 layer_info: @@ -52,22 +54,37 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 144 - job_id: jwgok861p + job_id: jlpevm885 + job_status: Passed + torchscript_onnx_qnn: + inference_time: 656.0 + throughput: 1524.3902439024391 + estimated_peak_memory_range: + min: 16384 + max: 70614144 + primary_compute_unit: NPU + precision: int8 + layer_info: + layers_on_npu: 134 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 134 + job_id: jmg94nkm5 job_status: Passed torchscript_onnx_ort: - inference_time: 1098.0 - throughput: 910.7468123861566 + inference_time: 934.0 + throughput: 1070.6638115631692 estimated_peak_memory_range: - min: 53248 - max: 53526464 + min: 12288 + max: 63129504 primary_compute_unit: NPU precision: int8 layer_info: - layers_on_npu: 1 + layers_on_npu: 137 layers_on_gpu: 0 layers_on_cpu: 0 - total_layers: 1 - job_id: j7gjzqn15 + total_layers: 137 + job_id: jmg94nkq5 job_status: Passed reference_device_info: name: Samsung Galaxy S23 @@ -76,13 +93,13 @@ models: os_name: Android manufacturer: Samsung chipset: Snapdragon® 8 Gen 2 - timestamp: '2024-04-23T18:42:32.386127Z' + timestamp: '2024-05-20T16:35:29.568165Z' - torchscript_onnx_tflite: - inference_time: 492.0 - throughput: 2032.520325203252 + inference_time: 466.0 + throughput: 2145.922746781116 estimated_peak_memory_range: min: 12288 - max: 64321136 + max: 65030624 primary_compute_unit: NPU precision: int8 layer_info: @@ -90,22 +107,37 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 144 - job_id: j1pv07kz5 + job_id: jygz7d84p + job_status: Passed + torchscript_onnx_qnn: + inference_time: 493.0 + throughput: 2028.3975659229209 + estimated_peak_memory_range: + min: 163840 + max: 49682240 + primary_compute_unit: NPU + precision: int8 + layer_info: + layers_on_npu: 134 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 134 + job_id: jnp18z7ng job_status: Passed torchscript_onnx_ort: - inference_time: 880.0 - throughput: 1136.3636363636363 + inference_time: 708.0 + throughput: 1412.4293785310736 estimated_peak_memory_range: - min: 618496 - max: 36779824 + min: 0 + max: 35132704 primary_compute_unit: NPU precision: int8 layer_info: - layers_on_npu: 1 + layers_on_npu: 137 layers_on_gpu: 0 layers_on_cpu: 0 - total_layers: 1 - job_id: jlpeeym8p + total_layers: 137 + job_id: jnp18z7kg job_status: Passed reference_device_info: name: Samsung Galaxy S24 @@ -114,36 +146,74 @@ models: os_name: Android manufacturer: Samsung chipset: Snapdragon® 8 Gen 3 - timestamp: '2024-04-23T18:42:32.386165Z' + timestamp: '2024-05-20T16:35:29.568191Z' + - torchscript_onnx_tflite: + inference_time: 
627.0 + throughput: 1594.896331738437 + estimated_peak_memory_range: + min: 16384 + max: 2002888 + primary_compute_unit: NPU + precision: int8 + layer_info: + layers_on_npu: 144 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 144 + job_id: jz5w9684p + job_status: Passed + torchscript_onnx_qnn: + inference_time: 645.0 + throughput: 1550.3875968992247 + estimated_peak_memory_range: + min: 24576 + max: 70914568 + primary_compute_unit: NPU + precision: int8 + layer_info: + layers_on_npu: 134 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 134 + job_id: jz5w968zp + job_status: Passed + reference_device_info: + name: QCS8550 (Proxy) + os: '12' + form_factor: Iot + os_name: Android + manufacturer: Qualcomm + chipset: Qcs8550 + timestamp: '2024-05-20T16:35:29.568208Z' - torchscript_onnx_tflite: - inference_time: 2624.0 - throughput: 381.0975609756098 + inference_time: 2476.0 + throughput: 403.8772213247173 estimated_peak_memory_range: min: 12288 - max: 20812688 + max: 21173984 primary_compute_unit: NPU precision: int8 layer_info: - layers_on_npu: 146 + layers_on_npu: 144 layers_on_gpu: 0 layers_on_cpu: 0 - total_layers: 146 - job_id: j7gjz8075 + total_layers: 144 + job_id: jygzry0o5 job_status: Passed - torchscript_onnx_ort: - inference_time: 26460.0 - throughput: 37.79289493575208 + torchscript_onnx_qnn: + inference_time: 2578.0 + throughput: 387.8975950349108 estimated_peak_memory_range: - min: 17575936 - max: 85502320 - primary_compute_unit: CPU - precision: fp32 + min: 163840 + max: 52566912 + primary_compute_unit: NPU + precision: int8 layer_info: - layers_on_npu: 0 + layers_on_npu: 134 layers_on_gpu: 0 - layers_on_cpu: 138 - total_layers: 138 - job_id: jygzond45 + layers_on_cpu: 0 + total_layers: 134 + job_id: jqp4v428p job_status: Passed reference_device_info: name: RB3 Gen 2 (Proxy) @@ -152,21 +222,21 @@ models: os_name: Android manufacturer: Qualcomm chipset: Qcs6490 - timestamp: '2024-04-23T18:42:32.386219Z' + timestamp: '2024-05-20T16:35:29.568224Z' - torchscript_onnx_tflite: - inference_time: 7950.0 - throughput: 125.78616352201257 + inference_time: 7805.0 + throughput: 128.12299807815504 estimated_peak_memory_range: - min: 45056 - max: 4402544 + min: 16384 + max: 7895408 primary_compute_unit: NPU precision: int8 layer_info: - layers_on_npu: 146 + layers_on_npu: 144 layers_on_gpu: 0 layers_on_cpu: 0 - total_layers: 146 - job_id: jn5qr427p + total_layers: 144 + job_id: jz5wqzr35 job_status: Passed reference_device_info: name: RB5 (Proxy) @@ -175,27 +245,57 @@ models: os_name: Android manufacturer: Qualcomm chipset: Qcs8250 - timestamp: '2024-04-23T18:42:32.386245Z' - - torchscript_onnx_tflite: - inference_time: 641.0 - throughput: 1560.0624024960998 + timestamp: '2024-05-20T16:35:29.568235Z' + - torchscript_onnx_qnn: + inference_time: 716.0 + throughput: 1396.6480446927374 estimated_peak_memory_range: - min: 12288 - max: 1923000 + min: 413696 + max: 413696 + primary_compute_unit: NPU + precision: int8 + layer_info: + layers_on_npu: 134 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 134 + job_id: jvgdv186g + job_status: Passed + torchscript_onnx_ort: + inference_time: 910.0 + throughput: 1098.901098901099 + estimated_peak_memory_range: + min: 39702528 + max: 39702528 primary_compute_unit: NPU precision: int8 layer_info: - layers_on_npu: 146 + layers_on_npu: 137 layers_on_gpu: 0 layers_on_cpu: 0 - total_layers: 146 - job_id: jlpeenr7p + total_layers: 137 + job_id: jvgdv18kg + job_status: Passed + torchscript_onnx_ort_dml_gpu: + inference_time: 21412.0 + throughput: 
46.70278348589576 + estimated_peak_memory_range: + min: 20770816 + max: 20770816 + primary_compute_unit: GPU + precision: int8 + layer_info: + layers_on_npu: 0 + layers_on_gpu: 1 + layers_on_cpu: 0 + total_layers: 1 + job_id: jz57drkq5 job_status: Passed reference_device_info: - name: QCS8550 (Proxy) - os: '12' - form_factor: Iot - os_name: Android + name: Snapdragon X Elite CRD + os: '11' + form_factor: Compute + os_name: Windows manufacturer: Qualcomm - chipset: Qcs8550 - timestamp: '2024-04-23T18:42:32.386270Z' + chipset: Snapdragon® X Elite + timestamp: '2024-05-20T16:35:29.568257Z' diff --git a/qai_hub_models/models/lama_dilated/README.md b/qai_hub_models/models/lama_dilated/README.md index 34259a6a..511bdc4a 100644 --- a/qai_hub_models/models/lama_dilated/README.md +++ b/qai_hub_models/models/lama_dilated/README.md @@ -14,6 +14,8 @@ accross various devices, can be found [here](https://aihub.qualcomm.com/models/l a hosted Qualcomm® device. + + ## Example & Usage Install the package via pip: diff --git a/qai_hub_models/models/lama_dilated/export.py b/qai_hub_models/models/lama_dilated/export.py index 64713a41..a3f5ee20 100644 --- a/qai_hub_models/models/lama_dilated/export.py +++ b/qai_hub_models/models/lama_dilated/export.py @@ -120,12 +120,17 @@ def export_model( model.eval() source_model = torch.jit.trace(model.to("cpu"), make_torch_inputs(input_spec)) + # Convert outputs from channel last to channel first (preferred I/O format for QNN and TensorFlow Lite) + channel_last_flags = ( + " --force_channel_last_input image,mask" + + " --force_channel_last_output output_0" + if target_runtime != TargetRuntime.ORT + else "" + ) + # 2. Compile the model to an on-device asset model_compile_options = model.get_hub_compile_options( - target_runtime, - compile_options - + " --force_channel_last_input image,mask" - + " --force_channel_last_output output_0", + target_runtime, compile_options + channel_last_flags, hub_device ) print(f"Optimizing model {model_name} to run on-device") submitted_compile_job = hub.submit_compile_job( @@ -163,8 +168,12 @@ def export_model( ) sample_inputs = model.sample_inputs(input_spec) # Convert inputs from channel first to channel last - hub_inputs = transpose_channel_first_to_last( - "image,mask", sample_inputs, target_runtime + hub_inputs = ( + sample_inputs + if target_runtime == TargetRuntime.ORT + else transpose_channel_first_to_last( + "image,mask", sample_inputs, target_runtime + ) ) submitted_inference_job = hub.submit_inference_job( model=compile_job.get_target_model(), @@ -192,8 +201,12 @@ def export_model( assert inference_job is not None and inference_job.wait().success inference_result: hub.client.DatasetEntries = inference_job.download_output_data() # type: ignore # Convert outputs from channel last to channel first - inference_result = transpose_channel_last_to_first( - "output_0", inference_result, target_runtime + inference_result = ( + inference_result + if target_runtime == TargetRuntime.ORT + else transpose_channel_last_to_first( + "output_0", inference_result, target_runtime + ) ) print_inference_metrics(inference_job, inference_result, torch_out) diff --git a/qai_hub_models/models/lama_dilated/perf.yaml b/qai_hub_models/models/lama_dilated/perf.yaml index 80ce8f88..45244c8f 100644 --- a/qai_hub_models/models/lama_dilated/perf.yaml +++ b/qai_hub_models/models/lama_dilated/perf.yaml @@ -22,6 +22,7 @@ aggregated: - Samsung Galaxy S24 Ultra - Samsung Galaxy S24+ - Samsung Galaxy Tab S8 + - Snapdragon X Elite CRD - Xiaomi 12 - Xiaomi 12 Pro 
supported_chipsets: @@ -30,15 +31,16 @@ aggregated: - Snapdragon® 8 Gen 2 - Snapdragon® 8 Gen 3 - Snapdragon® 888 + - Snapdragon® X Elite models: - name: LaMa-Dilated performance_metrics: - torchscript_onnx_tflite: - inference_time: 87925.0 - throughput: 11.373329542223486 + inference_time: 87247.0 + throughput: 11.46171215056105 estimated_peak_memory_range: - min: 0 - max: 3269648 + min: 2240512 + max: 138049312 primary_compute_unit: NPU precision: fp16 layer_info: @@ -46,22 +48,22 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 347 - job_id: jz5w24645 + job_id: jqp4wrmqg job_status: Passed torchscript_onnx_qnn: - inference_time: 81938.0 - throughput: 12.204349630208206 + inference_time: 81632.0 + throughput: 12.250098000784007 estimated_peak_memory_range: - min: 1654784 - max: 33961664 + min: 4276224 + max: 42687880 primary_compute_unit: NPU precision: fp16 layer_info: - layers_on_npu: 332 + layers_on_npu: 333 layers_on_gpu: 0 layers_on_cpu: 0 - total_layers: 332 - job_id: jnp1y6znp + total_layers: 333 + job_id: jegnev7vg job_status: Passed torchscript_onnx_ort: inference_time: 'null' @@ -76,7 +78,7 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 0 - job_id: jz5w246z5 + job_id: j2p0r0v2p job_status: Failed reference_device_info: name: Samsung Galaxy S23 @@ -85,13 +87,13 @@ models: os_name: Android manufacturer: Samsung chipset: Snapdragon® 8 Gen 2 - timestamp: '2024-04-23T18:42:32.412866Z' + timestamp: '2024-05-20T16:35:29.607433Z' - torchscript_onnx_tflite: - inference_time: 60997.0 - throughput: 16.39424889748676 + inference_time: 59804.0 + throughput: 16.721289545849775 estimated_peak_memory_range: - min: 2707456 - max: 271146544 + min: 2932736 + max: 243608672 primary_compute_unit: NPU precision: fp16 layer_info: @@ -99,22 +101,22 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 347 - job_id: jmg9jdnm5 + job_id: j0px1o3jg job_status: Passed torchscript_onnx_qnn: - inference_time: 57249.0 - throughput: 17.4675540184108 + inference_time: 57736.0 + throughput: 17.32021615629763 estimated_peak_memory_range: - min: 4161536 - max: 189298048 + min: 2392064 + max: 161784064 primary_compute_unit: NPU precision: fp16 layer_info: - layers_on_npu: 332 + layers_on_npu: 333 layers_on_gpu: 0 layers_on_cpu: 0 - total_layers: 332 - job_id: jvgde2165 + total_layers: 333 + job_id: jopry3nvg job_status: Passed torchscript_onnx_ort: inference_time: 'null' @@ -129,7 +131,7 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 0 - job_id: jmg9jdnq5 + job_id: j1p87y4z5 job_status: Failed reference_device_info: name: Samsung Galaxy S24 @@ -138,13 +140,13 @@ models: os_name: Android manufacturer: Samsung chipset: Snapdragon® 8 Gen 3 - timestamp: '2024-04-23T18:42:32.412968Z' + timestamp: '2024-05-20T16:35:29.607460Z' - torchscript_onnx_tflite: - inference_time: 87453.0 - throughput: 11.434713503253176 + inference_time: 85940.0 + throughput: 11.63602513381429 estimated_peak_memory_range: - min: 3260416 - max: 139194808 + min: 3170304 + max: 139550144 primary_compute_unit: NPU precision: fp16 layer_info: @@ -152,22 +154,22 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 347 - job_id: jopr871v5 + job_id: jo5mzxoyp job_status: Passed torchscript_onnx_qnn: - inference_time: 82234.0 - throughput: 12.160420264124328 + inference_time: 80913.0 + throughput: 12.358953443822378 estimated_peak_memory_range: - min: 3178496 - max: 33096560 + min: 3190784 + max: 42527696 primary_compute_unit: NPU precision: fp16 layer_info: - layers_on_npu: 332 + layers_on_npu: 333 
layers_on_gpu: 0 layers_on_cpu: 0 - total_layers: 332 - job_id: j1p80kwzg + total_layers: 333 + job_id: jqpyd37rp job_status: Passed reference_device_info: name: QCS8550 (Proxy) @@ -176,4 +178,57 @@ models: os_name: Android manufacturer: Qualcomm chipset: Qcs8550 - timestamp: '2024-04-23T18:42:32.413058Z' + timestamp: '2024-05-20T16:35:29.607478Z' + - torchscript_onnx_qnn: + inference_time: 92003.0 + throughput: 10.869210786604784 + estimated_peak_memory_range: + min: 4202496 + max: 4202496 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 333 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 333 + job_id: jep2myvx5 + job_status: Passed + torchscript_onnx_ort: + inference_time: 'null' + throughput: 'null' + estimated_peak_memory_range: + min: 0 + max: 0 + primary_compute_unit: 'null' + precision: 'null' + layer_info: + layers_on_npu: 0 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 0 + job_id: jogkyx9yp + job_status: Failed + torchscript_onnx_ort_dml_gpu: + inference_time: 659315.0 + throughput: 1.5167256925748693 + estimated_peak_memory_range: + min: 278200320 + max: 278200320 + primary_compute_unit: CPU + precision: fp32 + layer_info: + layers_on_npu: 0 + layers_on_gpu: 0 + layers_on_cpu: 220 + total_layers: 220 + job_id: jn5q2qm75 + job_status: Passed + reference_device_info: + name: Snapdragon X Elite CRD + os: '11' + form_factor: Compute + os_name: Windows + manufacturer: Qualcomm + chipset: Snapdragon® X Elite + timestamp: '2024-05-20T16:35:29.607503Z' diff --git a/qai_hub_models/models/litehrnet/README.md b/qai_hub_models/models/litehrnet/README.md index a8956a11..13fa47e0 100644 --- a/qai_hub_models/models/litehrnet/README.md +++ b/qai_hub_models/models/litehrnet/README.md @@ -14,6 +14,8 @@ accross various devices, can be found [here](https://aihub.qualcomm.com/models/l a hosted Qualcomm® device. + + ## Example & Usage Install the package via pip: diff --git a/qai_hub_models/models/litehrnet/app.py b/qai_hub_models/models/litehrnet/app.py index f2f714f4..0f3fb398 100644 --- a/qai_hub_models/models/litehrnet/app.py +++ b/qai_hub_models/models/litehrnet/app.py @@ -103,6 +103,6 @@ def predict_pose_keypoints( predicted_images = [] for i, img in enumerate(NHWC_int_numpy_frames): - draw_points(img, keypoints[i], color=(255, 0, 0), size=2) + draw_points(img, keypoints[i], color=(255, 0, 0), size=6) predicted_images.append(fromarray(img)) return predicted_images diff --git a/qai_hub_models/models/litehrnet/export.py b/qai_hub_models/models/litehrnet/export.py index 92d23418..79e94273 100644 --- a/qai_hub_models/models/litehrnet/export.py +++ b/qai_hub_models/models/litehrnet/export.py @@ -120,7 +120,7 @@ def export_model( # 2. 
Compile the model to an on-device asset model_compile_options = model.get_hub_compile_options( - target_runtime, compile_options + target_runtime, compile_options, hub_device ) print(f"Optimizing model {model_name} to run on-device") submitted_compile_job = hub.submit_compile_job( diff --git a/qai_hub_models/models/litehrnet/info.yaml b/qai_hub_models/models/litehrnet/info.yaml index 31da35bd..663ccd8a 100644 --- a/qai_hub_models/models/litehrnet/info.yaml +++ b/qai_hub_models/models/litehrnet/info.yaml @@ -27,7 +27,7 @@ form_factors: - IoT related_models: [openpose, hrnet_pose] has_static_banner: yes -has_animated_banner: no +has_animated_banner: yes license_type: apache-2.0 deploy_license_type: AI Model Hub License dataset: [] diff --git a/qai_hub_models/models/litehrnet/perf.yaml b/qai_hub_models/models/litehrnet/perf.yaml index 4eea5b8e..6ae1b3c7 100644 --- a/qai_hub_models/models/litehrnet/perf.yaml +++ b/qai_hub_models/models/litehrnet/perf.yaml @@ -22,6 +22,7 @@ aggregated: - Samsung Galaxy S24 Ultra - Samsung Galaxy S24+ - Samsung Galaxy Tab S8 + - Snapdragon X Elite CRD - Xiaomi 12 - Xiaomi 12 Pro supported_chipsets: @@ -30,15 +31,16 @@ aggregated: - Snapdragon® 8 Gen 2 - Snapdragon® 8 Gen 3 - Snapdragon® 888 + - Snapdragon® X Elite models: - name: LiteHRNet performance_metrics: - torchscript_onnx_tflite: - inference_time: 15561.0 - throughput: 64.263222157959 + inference_time: 11083.0 + throughput: 90.22827754218171 estimated_peak_memory_range: - min: 6553600 - max: 13181120 + min: 6615040 + max: 31875176 primary_compute_unit: NPU precision: fp16 layer_info: @@ -46,7 +48,7 @@ models: layers_on_gpu: 0 layers_on_cpu: 10 total_layers: 1236 - job_id: jvgde21k5 + job_id: j1glkm1ep job_status: Passed torchscript_onnx_ort: inference_time: 'null' @@ -61,7 +63,7 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 0 - job_id: jqp4k3rqg + job_id: jwgov6445 job_status: Failed reference_device_info: name: Samsung Galaxy S23 @@ -70,13 +72,13 @@ models: os_name: Android manufacturer: Samsung chipset: Snapdragon® 8 Gen 2 - timestamp: '2024-04-23T18:42:32.436998Z' + timestamp: '2024-05-20T16:35:29.637644Z' - torchscript_onnx_tflite: - inference_time: 10344.0 - throughput: 96.67440061871616 + inference_time: 7847.0 + throughput: 127.43723716069836 estimated_peak_memory_range: - min: 20480 - max: 73273328 + min: 16384 + max: 74259408 primary_compute_unit: NPU precision: fp16 layer_info: @@ -84,7 +86,7 @@ models: layers_on_gpu: 0 layers_on_cpu: 10 total_layers: 1236 - job_id: jz5709rqg + job_id: jw5614dvp job_status: Passed torchscript_onnx_ort: inference_time: 'null' @@ -99,7 +101,7 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 0 - job_id: j0pxnxoj5 + job_id: j1pvwk97g job_status: Failed reference_device_info: name: Samsung Galaxy S24 @@ -108,13 +110,13 @@ models: os_name: Android manufacturer: Samsung chipset: Snapdragon® 8 Gen 3 - timestamp: '2024-04-23T18:42:32.437166Z' + timestamp: '2024-05-20T16:35:29.637666Z' - torchscript_onnx_tflite: - inference_time: 15632.0 - throughput: 63.97134083930399 + inference_time: 11125.0 + throughput: 89.88764044943821 estimated_peak_memory_range: - min: 6529024 - max: 10764512 + min: 6553600 + max: 11774200 primary_compute_unit: NPU precision: fp16 layer_info: @@ -122,7 +124,7 @@ models: layers_on_gpu: 0 layers_on_cpu: 10 total_layers: 1236 - job_id: j1gl6qeeg + job_id: j1p3m0wxg job_status: Passed reference_device_info: name: QCS8550 (Proxy) @@ -131,4 +133,42 @@ models: os_name: Android manufacturer: Qualcomm chipset: Qcs8550 - timestamp: 
'2024-04-23T18:42:32.437300Z' + timestamp: '2024-05-20T16:35:29.637678Z' + - torchscript_onnx_ort: + inference_time: 'null' + throughput: 'null' + estimated_peak_memory_range: + min: 0 + max: 0 + primary_compute_unit: 'null' + precision: 'null' + layer_info: + layers_on_npu: 0 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 0 + job_id: j7gjlnw7p + job_status: Failed + torchscript_onnx_ort_dml_gpu: + inference_time: 17746.0 + throughput: 56.35072692437733 + estimated_peak_memory_range: + min: 9547776 + max: 9547776 + primary_compute_unit: CPU + precision: fp32 + layer_info: + layers_on_npu: 0 + layers_on_gpu: 0 + layers_on_cpu: 939 + total_layers: 939 + job_id: jlpevml75 + job_status: Passed + reference_device_info: + name: Snapdragon X Elite CRD + os: '11' + form_factor: Compute + os_name: Windows + manufacturer: Qualcomm + chipset: Snapdragon® X Elite + timestamp: '2024-05-20T16:35:29.637696Z' diff --git a/qai_hub_models/models/llama_v2_7b_chat_quantized/README.md b/qai_hub_models/models/llama_v2_7b_chat_quantized/README.md index ccb5645f..5f1fc83b 100644 --- a/qai_hub_models/models/llama_v2_7b_chat_quantized/README.md +++ b/qai_hub_models/models/llama_v2_7b_chat_quantized/README.md @@ -15,6 +15,8 @@ a hosted Qualcomm® device. + + ## License - The license for the original implementation of Llama-v2-7B-Chat can be found [here](https://github.com/facebookresearch/llama/blob/main/LICENSE). @@ -29,3 +31,25 @@ a hosted Qualcomm® device. * For questions or feedback please [reach out to us](mailto:ai-hub-support@qti.qualcomm.com). +## Usage and Limitations + +This model may not be used for or in connection with any of the following applications: + +- Accessing essential private and public services and benefits; +- Administration of justice and democratic processes; +- Assessing or recognizing the emotional state of a person; +- Biometric and biometrics-based systems, including categorization of persons based on sensitive characteristics; +- Education and vocational training; +- Employment and workers management; +- Exploitation of the vulnerabilities of persons resulting in harmful behavior; +- General purpose social scoring; +- Law enforcement; +- Management and operation of critical infrastructure; +- Migration, asylum and border control management; +- Predictive policing; +- Real-time remote biometric identification in public spaces; +- Recommender systems of social media platforms; +- Scraping of facial images (from the internet or otherwise); and/or +- Subliminal manipulation + + diff --git a/qai_hub_models/models/llama_v2_7b_chat_quantized/info.yaml b/qai_hub_models/models/llama_v2_7b_chat_quantized/info.yaml index 92bb1baf..fc8a7ba1 100644 --- a/qai_hub_models/models/llama_v2_7b_chat_quantized/info.yaml +++ b/qai_hub_models/models/llama_v2_7b_chat_quantized/info.yaml @@ -1,6 +1,6 @@ name: Llama-v2-7B-Chat id: llama_v2_7b_chat_quantized -status: public # Renable when approved by marketing #9577 +status: public headline: State-of-the-art large language model useful on a variety of language understanding and generation tasks. domain: Generative AI diff --git a/qai_hub_models/models/mediapipe_face/README.md b/qai_hub_models/models/mediapipe_face/README.md index 5fa48aab..1a280d17 100644 --- a/qai_hub_models/models/mediapipe_face/README.md +++ b/qai_hub_models/models/mediapipe_face/README.md @@ -14,6 +14,8 @@ accross various devices, can be found [here](https://aihub.qualcomm.com/models/m a hosted Qualcomm® device. 
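A note on the new `torchscript_onnx_ort_dml_gpu` rows added to the perf.yaml files in this patch (litehrnet above, the mediapipe models below): these presumably report ONNX Runtime running through the DirectML execution provider on the Windows-based Snapdragon X Elite CRD, which is consistent with the CPU/GPU compute units recorded for those rows instead of the NPU. A minimal session setup for that provider looks roughly like the sketch below; the model path is a placeholder and the provider names are the standard onnxruntime identifiers.

```python
import onnxruntime as ort

# DirectML (GPU) first, CPU as fallback. Requires the onnxruntime-directml package
# on Windows; "model.onnx" is a placeholder path, not an asset from this repo.
session = ort.InferenceSession(
    "model.onnx",
    providers=["DmlExecutionProvider", "CPUExecutionProvider"],
)
print(session.get_providers())  # confirms which providers were actually loaded
```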
+ + ## Example & Usage diff --git a/qai_hub_models/models/mediapipe_face/export.py b/qai_hub_models/models/mediapipe_face/export.py index 8b99631f..9aca58fe 100644 --- a/qai_hub_models/models/mediapipe_face/export.py +++ b/qai_hub_models/models/mediapipe_face/export.py @@ -134,7 +134,7 @@ def export_model( # 2. Compile the models to an on-device asset model_compile_options = component.get_hub_compile_options( - target_runtime, compile_options + target_runtime, compile_options, hub_device ) print(f"Optimizing model {component_name} to run on-device") submitted_compile_job = hub.submit_compile_job( @@ -226,12 +226,7 @@ def export_model( def main(): warnings.filterwarnings("ignore") - parser = export_parser( - model_cls=Model, - components=ALL_COMPONENTS, - supports_qnn=False, - supports_ort=False, - ) + parser = export_parser(model_cls=Model, components=ALL_COMPONENTS) args = parser.parse_args() export_model(**vars(args)) diff --git a/qai_hub_models/models/mediapipe_face/model.py b/qai_hub_models/models/mediapipe_face/model.py index 29b79435..e7d62e57 100644 --- a/qai_hub_models/models/mediapipe_face/model.py +++ b/qai_hub_models/models/mediapipe_face/model.py @@ -13,7 +13,7 @@ from qai_hub_models.utils.input_spec import InputSpec MODEL_ID = __name__.split(".")[-2] -MODEL_ASSET_VERSION = 1 +MODEL_ASSET_VERSION = 2 # Vertex indices can be found in # https://github.com/google/mediapipe/blob/0.8.1/mediapipe/modules/face_geometry/data/canonical_face_model_uv_visualization.png diff --git a/qai_hub_models/models/mediapipe_face/perf.yaml b/qai_hub_models/models/mediapipe_face/perf.yaml index 030a5d9e..14a2bc5b 100644 --- a/qai_hub_models/models/mediapipe_face/perf.yaml +++ b/qai_hub_models/models/mediapipe_face/perf.yaml @@ -22,6 +22,7 @@ aggregated: - Samsung Galaxy S24 Ultra - Samsung Galaxy S24+ - Samsung Galaxy Tab S8 + - Snapdragon X Elite CRD - Xiaomi 12 - Xiaomi 12 Pro supported_chipsets: @@ -30,15 +31,16 @@ aggregated: - Snapdragon® 8 Gen 2 - Snapdragon® 8 Gen 3 - Snapdragon® 888 + - Snapdragon® X Elite models: - name: MediaPipeFaceDetector performance_metrics: - torchscript_onnx_tflite: - inference_time: 785.0 - throughput: 1273.8853503184714 + inference_time: 815.0 + throughput: 1226.993865030675 estimated_peak_memory_range: - min: 12288 - max: 1533536 + min: 20480 + max: 1627976 primary_compute_unit: NPU precision: fp16 layer_info: @@ -46,14 +48,14 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 112 - job_id: jegnlk6v5 + job_id: jygz7d4zp job_status: Passed torchscript_onnx_qnn: - inference_time: 839.0 - throughput: 1191.8951132300358 + inference_time: 843.0 + throughput: 1186.2396204033214 estimated_peak_memory_range: - min: 815104 - max: 6910200 + min: 806912 + max: 6902688 primary_compute_unit: NPU precision: fp16 layer_info: @@ -61,22 +63,22 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 148 - job_id: j2p036z2p + job_id: jqp4wr9qg job_status: Passed torchscript_onnx_ort: - inference_time: 996.0 - throughput: 1004.0160642570281 + inference_time: 993.0 + throughput: 1007.0493454179255 estimated_peak_memory_range: - min: 806912 - max: 6602536 + min: 802816 + max: 72047760 primary_compute_unit: NPU precision: fp16 layer_info: - layers_on_npu: 1 + layers_on_npu: 147 layers_on_gpu: 0 layers_on_cpu: 0 - total_layers: 1 - job_id: j1gl6lveg + total_layers: 147 + job_id: j1p87y1z5 job_status: Passed reference_device_info: name: Samsung Galaxy S23 @@ -85,13 +87,13 @@ models: os_name: Android manufacturer: Samsung chipset: Snapdragon® 8 Gen 2 - timestamp: 
'2024-04-23T18:42:32.455108Z' + timestamp: '2024-05-20T16:35:29.659622Z' - torchscript_onnx_tflite: - inference_time: 544.0 - throughput: 1838.235294117647 + inference_time: 569.0 + throughput: 1757.469244288225 estimated_peak_memory_range: min: 12288 - max: 28679584 + max: 30017104 primary_compute_unit: NPU precision: fp16 layer_info: @@ -99,14 +101,14 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 112 - job_id: jep20ekxg + job_id: jmg94nxq5 job_status: Passed torchscript_onnx_qnn: - inference_time: 595.0 - throughput: 1680.672268907563 + inference_time: 592.0 + throughput: 1689.1891891891892 estimated_peak_memory_range: - min: 802816 - max: 47837376 + min: 12288 + max: 47426416 primary_compute_unit: NPU precision: fp16 layer_info: @@ -114,22 +116,22 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 148 - job_id: jogk78eyp + job_id: jo5mzxdyp job_status: Passed torchscript_onnx_ort: - inference_time: 706.0 - throughput: 1416.4305949008499 + inference_time: 719.0 + throughput: 1390.8205841446454 estimated_peak_memory_range: min: 12288 - max: 20347024 + max: 22023952 primary_compute_unit: NPU precision: fp16 layer_info: - layers_on_npu: 1 + layers_on_npu: 147 layers_on_gpu: 0 layers_on_cpu: 0 - total_layers: 1 - job_id: j1p3v6jxg + total_layers: 147 + job_id: jn5q2qv75 job_status: Passed reference_device_info: name: Samsung Galaxy S24 @@ -138,13 +140,13 @@ models: os_name: Android manufacturer: Samsung chipset: Snapdragon® 8 Gen 3 - timestamp: '2024-04-23T18:42:32.455176Z' + timestamp: '2024-05-20T16:35:29.659647Z' - torchscript_onnx_tflite: - inference_time: 784.0 - throughput: 1275.5102040816328 + inference_time: 778.0 + throughput: 1285.3470437017995 estimated_peak_memory_range: - min: 24576 - max: 1602632 + min: 12288 + max: 1913768 primary_compute_unit: NPU precision: fp16 layer_info: @@ -152,14 +154,14 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 112 - job_id: jlpeen47p + job_id: jvgdv1zkg job_status: Passed torchscript_onnx_qnn: - inference_time: 840.0 - throughput: 1190.4761904761904 + inference_time: 845.0 + throughput: 1183.4319526627219 estimated_peak_memory_range: - min: 815104 - max: 6172048 + min: 806912 + max: 100815984 primary_compute_unit: NPU precision: fp16 layer_info: @@ -167,7 +169,7 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 148 - job_id: j0pxnzej5 + job_id: jqpyd3mrp job_status: Passed reference_device_info: name: QCS8550 (Proxy) @@ -176,15 +178,68 @@ models: os_name: Android manufacturer: Qualcomm chipset: Qcs8550 - timestamp: '2024-04-23T18:42:32.455221Z' + timestamp: '2024-05-20T16:35:29.659664Z' + - torchscript_onnx_qnn: + inference_time: 928.0 + throughput: 1077.5862068965516 + estimated_peak_memory_range: + min: 786432 + max: 786432 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 147 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 147 + job_id: jopry3wvg + job_status: Passed + torchscript_onnx_ort: + inference_time: 1157.0 + throughput: 864.304235090752 + estimated_peak_memory_range: + min: 3178496 + max: 3178496 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 147 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 147 + job_id: jw5614wvp + job_status: Passed + torchscript_onnx_ort_dml_gpu: + inference_time: 5344.0 + throughput: 187.125748502994 + estimated_peak_memory_range: + min: 9064448 + max: 9064448 + primary_compute_unit: GPU + precision: fp16 + layer_info: + layers_on_npu: 0 + layers_on_gpu: 1 + layers_on_cpu: 0 + total_layers: 1 + job_id: 
jwgov6845 + job_status: Passed + reference_device_info: + name: Snapdragon X Elite CRD + os: '11' + form_factor: Compute + os_name: Windows + manufacturer: Qualcomm + chipset: Snapdragon® X Elite + timestamp: '2024-05-20T16:35:29.659686Z' - name: MediaPipeFaceLandmarkDetector performance_metrics: - torchscript_onnx_tflite: - inference_time: 315.0 - throughput: 3174.6031746031745 + inference_time: 325.0 + throughput: 3076.923076923077 estimated_peak_memory_range: - min: 24576 - max: 1781952 + min: 32768 + max: 4219616 primary_compute_unit: NPU precision: fp16 layer_info: @@ -192,14 +247,14 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 101 - job_id: jopr8wvv5 + job_id: jz5w961zp job_status: Passed torchscript_onnx_qnn: - inference_time: 390.0 - throughput: 2564.102564102564 + inference_time: 400.0 + throughput: 2500.0 estimated_peak_memory_range: - min: 458752 - max: 94680040 + min: 462848 + max: 42261400 primary_compute_unit: NPU precision: fp16 layer_info: @@ -207,22 +262,22 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 107 - job_id: j1p801qzg + job_id: j0px1odjg job_status: Passed torchscript_onnx_ort: - inference_time: 494.0 - throughput: 2024.2914979757086 + inference_time: 506.0 + throughput: 1976.2845849802372 estimated_peak_memory_range: min: 12288 - max: 7623304 + max: 7765592 primary_compute_unit: NPU precision: fp16 layer_info: - layers_on_npu: 1 + layers_on_npu: 106 layers_on_gpu: 0 layers_on_cpu: 0 - total_layers: 1 - job_id: jw56ewyvg + total_layers: 106 + job_id: jogkyx8yp job_status: Passed reference_device_info: name: Samsung Galaxy S23 @@ -231,13 +286,13 @@ models: os_name: Android manufacturer: Samsung chipset: Snapdragon® 8 Gen 2 - timestamp: '2024-04-23T18:42:32.455272Z' + timestamp: '2024-05-20T16:35:29.659708Z' - torchscript_onnx_tflite: - inference_time: 230.0 - throughput: 4347.826086956522 + inference_time: 235.0 + throughput: 4255.31914893617 estimated_peak_memory_range: min: 12288 - max: 25090016 + max: 25797520 primary_compute_unit: NPU precision: fp16 layer_info: @@ -245,14 +300,14 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 101 - job_id: jqpyrm1r5 + job_id: jnp18zvkg job_status: Passed torchscript_onnx_qnn: - inference_time: 285.0 - throughput: 3508.7719298245615 + inference_time: 282.0 + throughput: 3546.099290780142 estimated_peak_memory_range: min: 12288 - max: 33592960 + max: 39404800 primary_compute_unit: NPU precision: fp16 layer_info: @@ -260,22 +315,22 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 107 - job_id: jn5qev675 + job_id: jegnevkvg job_status: Passed torchscript_onnx_ort: - inference_time: 408.0 - throughput: 2450.9803921568628 + inference_time: 395.0 + throughput: 2531.6455696202534 estimated_peak_memory_range: min: 12288 - max: 15898592 + max: 21486416 primary_compute_unit: NPU precision: fp16 layer_info: - layers_on_npu: 1 + layers_on_npu: 106 layers_on_gpu: 0 layers_on_cpu: 0 - total_layers: 1 - job_id: jwgok824p + total_layers: 106 + job_id: j1glkmlep job_status: Passed reference_device_info: name: Samsung Galaxy S24 @@ -284,13 +339,13 @@ models: os_name: Android manufacturer: Samsung chipset: Snapdragon® 8 Gen 3 - timestamp: '2024-04-23T18:42:32.455318Z' + timestamp: '2024-05-20T16:35:29.659728Z' - torchscript_onnx_tflite: - inference_time: 326.0 - throughput: 3067.4846625766872 + inference_time: 306.0 + throughput: 3267.97385620915 estimated_peak_memory_range: - min: 24576 - max: 1871744 + min: 28672 + max: 1867256 primary_compute_unit: NPU precision: fp16 layer_info: @@ -298,14 
+353,14 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 101 - job_id: jygzo0vz5 + job_id: jz57dr7q5 job_status: Passed torchscript_onnx_qnn: - inference_time: 396.0 - throughput: 2525.252525252525 + inference_time: 378.0 + throughput: 2645.5026455026455 estimated_peak_memory_range: - min: 458752 - max: 81438752 + min: 466944 + max: 20140984 primary_compute_unit: NPU precision: fp16 layer_info: @@ -313,7 +368,7 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 107 - job_id: jo5mqlvyp + job_id: j2p0r062p job_status: Passed reference_device_info: name: QCS8550 (Proxy) @@ -322,4 +377,57 @@ models: os_name: Android manufacturer: Qualcomm chipset: Qcs8550 - timestamp: '2024-04-23T18:42:32.455361Z' + timestamp: '2024-05-20T16:35:29.659743Z' + - torchscript_onnx_qnn: + inference_time: 546.0 + throughput: 1831.5018315018315 + estimated_peak_memory_range: + min: 442368 + max: 442368 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 106 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 106 + job_id: jep2myex5 + job_status: Passed + torchscript_onnx_ort: + inference_time: 529.0 + throughput: 1890.359168241966 + estimated_peak_memory_range: + min: 4382720 + max: 4382720 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 106 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 106 + job_id: j1p3m06xg + job_status: Passed + torchscript_onnx_ort_dml_gpu: + inference_time: 2139.0 + throughput: 467.50818139317437 + estimated_peak_memory_range: + min: 5292032 + max: 5292032 + primary_compute_unit: CPU + precision: fp32 + layer_info: + layers_on_npu: 0 + layers_on_gpu: 0 + layers_on_cpu: 80 + total_layers: 80 + job_id: j1pvwk77g + job_status: Passed + reference_device_info: + name: Snapdragon X Elite CRD + os: '11' + form_factor: Compute + os_name: Windows + manufacturer: Qualcomm + chipset: Snapdragon® X Elite + timestamp: '2024-05-20T16:35:29.659765Z' diff --git a/qai_hub_models/models/mediapipe_hand/README.md b/qai_hub_models/models/mediapipe_hand/README.md index 481ebcb7..8e327a5a 100644 --- a/qai_hub_models/models/mediapipe_hand/README.md +++ b/qai_hub_models/models/mediapipe_hand/README.md @@ -14,6 +14,8 @@ accross various devices, can be found [here](https://aihub.qualcomm.com/models/m a hosted Qualcomm® device. + + ## Example & Usage diff --git a/qai_hub_models/models/mediapipe_hand/export.py b/qai_hub_models/models/mediapipe_hand/export.py index 8e8b2ec1..8e734c87 100644 --- a/qai_hub_models/models/mediapipe_hand/export.py +++ b/qai_hub_models/models/mediapipe_hand/export.py @@ -134,7 +134,7 @@ def export_model( # 2. 
Compile the models to an on-device asset model_compile_options = component.get_hub_compile_options( - target_runtime, compile_options + target_runtime, compile_options, hub_device ) print(f"Optimizing model {component_name} to run on-device") submitted_compile_job = hub.submit_compile_job( @@ -226,9 +226,7 @@ def export_model( def main(): warnings.filterwarnings("ignore") - parser = export_parser( - model_cls=Model, components=ALL_COMPONENTS, supports_ort=False - ) + parser = export_parser(model_cls=Model, components=ALL_COMPONENTS) args = parser.parse_args() export_model(**vars(args)) diff --git a/qai_hub_models/models/mediapipe_hand/model.py b/qai_hub_models/models/mediapipe_hand/model.py index 25d2c4e5..134f3751 100644 --- a/qai_hub_models/models/mediapipe_hand/model.py +++ b/qai_hub_models/models/mediapipe_hand/model.py @@ -14,7 +14,7 @@ from qai_hub_models.utils.input_spec import InputSpec MODEL_ID = __name__.split(".")[-2] -MODEL_ASSET_VERSION = 1 +MODEL_ASSET_VERSION = 2 # https://github.com/metalwhale/hand_tracking/blob/b2a650d61b4ab917a2367a05b85765b81c0564f2/run.py # 8 12 16 20 diff --git a/qai_hub_models/models/mediapipe_hand/perf.yaml b/qai_hub_models/models/mediapipe_hand/perf.yaml index 828ff486..18cb733d 100644 --- a/qai_hub_models/models/mediapipe_hand/perf.yaml +++ b/qai_hub_models/models/mediapipe_hand/perf.yaml @@ -22,6 +22,7 @@ aggregated: - Samsung Galaxy S24 Ultra - Samsung Galaxy S24+ - Samsung Galaxy Tab S8 + - Snapdragon X Elite CRD - Xiaomi 12 - Xiaomi 12 Pro supported_chipsets: @@ -30,15 +31,16 @@ aggregated: - Snapdragon® 8 Gen 2 - Snapdragon® 8 Gen 3 - Snapdragon® 888 + - Snapdragon® X Elite models: - name: MediaPipeHandDetector performance_metrics: - torchscript_onnx_tflite: - inference_time: 953.0 - throughput: 1049.3179433368311 + inference_time: 957.0 + throughput: 1044.932079414838 estimated_peak_memory_range: min: 12288 - max: 7786576 + max: 2098904 primary_compute_unit: NPU precision: fp16 layer_info: @@ -46,14 +48,14 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 152 - job_id: jlpeeyd7p + job_id: j7gjlnq7p job_status: Passed torchscript_onnx_qnn: - inference_time: 1019.0 - throughput: 981.3542688910696 + inference_time: 1014.0 + throughput: 986.1932938856016 estimated_peak_memory_range: - min: 806912 - max: 8813592 + min: 12288 + max: 21477272 primary_compute_unit: NPU precision: fp16 layer_info: @@ -61,22 +63,22 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 197 - job_id: jnp1y64kp + job_id: jvgdv12kg job_status: Passed torchscript_onnx_ort: - inference_time: 1219.0 - throughput: 820.3445447087777 + inference_time: 1160.0 + throughput: 862.0689655172414 estimated_peak_memory_range: min: 12288 - max: 19518840 + max: 18289360 primary_compute_unit: NPU precision: fp16 layer_info: - layers_on_npu: 1 + layers_on_npu: 196 layers_on_gpu: 0 layers_on_cpu: 0 - total_layers: 1 - job_id: j0pxnxkj5 + total_layers: 196 + job_id: jqpyd3xrp job_status: Passed reference_device_info: name: Samsung Galaxy S23 @@ -85,13 +87,13 @@ models: os_name: Android manufacturer: Samsung chipset: Snapdragon® 8 Gen 2 - timestamp: '2024-04-23T18:42:32.499757Z' + timestamp: '2024-05-20T16:35:29.706483Z' - torchscript_onnx_tflite: - inference_time: 679.0 - throughput: 1472.7540500736377 + inference_time: 680.0 + throughput: 1470.5882352941176 estimated_peak_memory_range: min: 12288 - max: 52020064 + max: 53739952 primary_compute_unit: NPU precision: fp16 layer_info: @@ -99,14 +101,14 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 152 - job_id: 
jz5w24ez5 + job_id: jygz7dnzp job_status: Passed torchscript_onnx_qnn: - inference_time: 722.0 - throughput: 1385.0415512465374 + inference_time: 725.0 + throughput: 1379.3103448275863 estimated_peak_memory_range: min: 802816 - max: 57062560 + max: 62597664 primary_compute_unit: NPU precision: fp16 layer_info: @@ -114,22 +116,22 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 197 - job_id: jz5709yqg + job_id: jqp4wr3qg job_status: Passed torchscript_onnx_ort: - inference_time: 838.0 - throughput: 1193.3174224343675 + inference_time: 868.0 + throughput: 1152.073732718894 estimated_peak_memory_range: - min: 565248 - max: 29618560 + min: 380928 + max: 38582032 primary_compute_unit: NPU precision: fp16 layer_info: - layers_on_npu: 1 + layers_on_npu: 196 layers_on_gpu: 0 layers_on_cpu: 0 - total_layers: 1 - job_id: jegnlk0v5 + total_layers: 196 + job_id: j1p87yxz5 job_status: Passed reference_device_info: name: Samsung Galaxy S24 @@ -138,13 +140,13 @@ models: os_name: Android manufacturer: Samsung chipset: Snapdragon® 8 Gen 3 - timestamp: '2024-04-23T18:42:32.499831Z' + timestamp: '2024-05-20T16:35:29.706509Z' - torchscript_onnx_tflite: - inference_time: 959.0 - throughput: 1042.752867570386 + inference_time: 956.0 + throughput: 1046.0251046025105 estimated_peak_memory_range: min: 24576 - max: 3871952 + max: 4980488 primary_compute_unit: NPU precision: fp16 layer_info: @@ -152,14 +154,14 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 152 - job_id: j1p3vr8xg + job_id: jmg94ndq5 job_status: Passed torchscript_onnx_qnn: - inference_time: 1020.0 - throughput: 980.3921568627451 + inference_time: 1011.0 + throughput: 989.1196834817013 estimated_peak_memory_range: - min: 806912 - max: 7974248 + min: 802816 + max: 6723176 primary_compute_unit: NPU precision: fp16 layer_info: @@ -167,7 +169,7 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 197 - job_id: jnp1ym3kp + job_id: jopry30vg job_status: Passed reference_device_info: name: QCS8550 (Proxy) @@ -176,15 +178,68 @@ models: os_name: Android manufacturer: Qualcomm chipset: Qcs8550 - timestamp: '2024-04-23T18:42:32.499901Z' + timestamp: '2024-05-20T16:35:29.706525Z' + - torchscript_onnx_qnn: + inference_time: 1052.0 + throughput: 950.5703422053232 + estimated_peak_memory_range: + min: 786432 + max: 786432 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 196 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 196 + job_id: jo5mzx8yp + job_status: Passed + torchscript_onnx_ort: + inference_time: 1200.0 + throughput: 833.3333333333334 + estimated_peak_memory_range: + min: 868352 + max: 868352 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 196 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 196 + job_id: jn5q2qy75 + job_status: Passed + torchscript_onnx_ort_dml_gpu: + inference_time: 16080.0 + throughput: 62.18905472636816 + estimated_peak_memory_range: + min: 802816 + max: 802816 + primary_compute_unit: CPU + precision: fp32 + layer_info: + layers_on_npu: 0 + layers_on_gpu: 0 + layers_on_cpu: 116 + total_layers: 116 + job_id: jw56147vp + job_status: Passed + reference_device_info: + name: Snapdragon X Elite CRD + os: '11' + form_factor: Compute + os_name: Windows + manufacturer: Qualcomm + chipset: Snapdragon® X Elite + timestamp: '2024-05-20T16:35:29.706548Z' - name: MediaPipeHandLandmarkDetector performance_metrics: - torchscript_onnx_tflite: - inference_time: 1259.0 - throughput: 794.2811755361398 + inference_time: 1214.0 + throughput: 823.7232289950576 
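An editorial aside on the metrics being updated here: throughout these perf.yaml files, `throughput` is simply the reciprocal of `inference_time`, which appears to be reported in microseconds, making throughput inferences per second. A quick check against the entry just above; the numbers are taken from the diff, while the unit interpretation is an assumption.

```python
# throughput = 1e6 / inference_time, assuming inference_time is in microseconds.
inference_time_us = 1214.0        # MediaPipeHandLandmarkDetector, Galaxy S23, TFLite (from the entry above)
throughput = 1_000_000 / inference_time_us
print(throughput)                 # 823.7232289950576, matching the updated perf.yaml value
```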
estimated_peak_memory_range: - min: 24576 - max: 1977616 + min: 16384 + max: 2188824 primary_compute_unit: NPU precision: fp16 layer_info: @@ -192,14 +247,14 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 159 - job_id: jygzon3z5 + job_id: jlpevmy75 job_status: Passed torchscript_onnx_qnn: - inference_time: 1293.0 - throughput: 773.3952049497293 + inference_time: 1284.0 + throughput: 778.816199376947 estimated_peak_memory_range: - min: 638976 - max: 10247184 + min: 16384 + max: 51815576 primary_compute_unit: NPU precision: fp16 layer_info: @@ -207,22 +262,22 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 210 - job_id: jvgde2xk5 + job_id: jz57dr9q5 job_status: Passed torchscript_onnx_ort: - inference_time: 54823.0 - throughput: 18.240519489995076 + inference_time: 1506.0 + throughput: 664.0106241699867 estimated_peak_memory_range: - min: 217088 - max: 18000624 + min: 12288 + max: 42058584 primary_compute_unit: NPU precision: fp16 layer_info: - layers_on_npu: 1 + layers_on_npu: 209 layers_on_gpu: 0 layers_on_cpu: 0 - total_layers: 1 - job_id: jo5mq8nyp + total_layers: 209 + job_id: j2p0r0j2p job_status: Passed reference_device_info: name: Samsung Galaxy S23 @@ -231,13 +286,13 @@ models: os_name: Android manufacturer: Samsung chipset: Snapdragon® 8 Gen 2 - timestamp: '2024-04-23T18:42:32.499968Z' + timestamp: '2024-05-20T16:35:29.706573Z' - torchscript_onnx_tflite: - inference_time: 901.0 - throughput: 1109.8779134295228 + inference_time: 889.0 + throughput: 1124.859392575928 estimated_peak_memory_range: min: 12288 - max: 56691584 + max: 57135392 primary_compute_unit: NPU precision: fp16 layer_info: @@ -245,14 +300,14 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 159 - job_id: jmg9jdlq5 + job_id: jz5w964zp job_status: Passed torchscript_onnx_qnn: - inference_time: 963.0 - throughput: 1038.4215991692627 + inference_time: 948.0 + throughput: 1054.8523206751054 estimated_peak_memory_range: min: 802816 - max: 62409504 + max: 63945952 primary_compute_unit: NPU precision: fp16 layer_info: @@ -260,22 +315,22 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 210 - job_id: jqp4k3lqg + job_id: j0px1oxjg job_status: Passed torchscript_onnx_ort: - inference_time: 41069.0 - throughput: 24.34926586963403 + inference_time: 1099.0 + throughput: 909.9181073703367 estimated_peak_memory_range: - min: 868352 - max: 30450496 + min: 802816 + max: 33494480 primary_compute_unit: NPU precision: fp16 layer_info: - layers_on_npu: 1 + layers_on_npu: 209 layers_on_gpu: 0 layers_on_cpu: 0 - total_layers: 1 - job_id: jopr8w6v5 + total_layers: 209 + job_id: jogkyx4yp job_status: Passed reference_device_info: name: Samsung Galaxy S24 @@ -284,13 +339,13 @@ models: os_name: Android manufacturer: Samsung chipset: Snapdragon® 8 Gen 3 - timestamp: '2024-04-23T18:42:32.500029Z' + timestamp: '2024-05-20T16:35:29.706594Z' - torchscript_onnx_tflite: - inference_time: 1206.0 - throughput: 829.1873963515754 + inference_time: 1200.0 + throughput: 833.3333333333334 estimated_peak_memory_range: - min: 40960 - max: 2078488 + min: 12288 + max: 2557040 primary_compute_unit: NPU precision: fp16 layer_info: @@ -298,14 +353,14 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 159 - job_id: jwgok9m4p + job_id: jnp18z6kg job_status: Passed torchscript_onnx_qnn: - inference_time: 1308.0 - throughput: 764.525993883792 + inference_time: 1311.0 + throughput: 762.7765064836003 estimated_peak_memory_range: - min: 811008 - max: 8238832 + min: 815104 + max: 52770200 primary_compute_unit: NPU 
precision: fp16 layer_info: @@ -313,7 +368,7 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 210 - job_id: jvgdem0k5 + job_id: jep2mywx5 job_status: Passed reference_device_info: name: QCS8550 (Proxy) @@ -322,4 +377,57 @@ models: os_name: Android manufacturer: Qualcomm chipset: Qcs8550 - timestamp: '2024-04-23T18:42:32.500084Z' + timestamp: '2024-05-20T16:35:29.706611Z' + - torchscript_onnx_qnn: + inference_time: 1461.0 + throughput: 684.4626967830253 + estimated_peak_memory_range: + min: 786432 + max: 786432 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 209 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 209 + job_id: jegnevnvg + job_status: Passed + torchscript_onnx_ort: + inference_time: 1548.0 + throughput: 645.9948320413437 + estimated_peak_memory_range: + min: 19423232 + max: 19423232 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 209 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 209 + job_id: j1glkmxep + job_status: Passed + torchscript_onnx_ort_dml_gpu: + inference_time: 8524.0 + throughput: 117.31581417175035 + estimated_peak_memory_range: + min: 20221952 + max: 20221952 + primary_compute_unit: GPU + precision: fp16 + layer_info: + layers_on_npu: 0 + layers_on_gpu: 1 + layers_on_cpu: 0 + total_layers: 1 + job_id: j1p3m09xg + job_status: Passed + reference_device_info: + name: Snapdragon X Elite CRD + os: '11' + form_factor: Compute + os_name: Windows + manufacturer: Qualcomm + chipset: Snapdragon® X Elite + timestamp: '2024-05-20T16:35:29.706632Z' diff --git a/qai_hub_models/models/mediapipe_pose/README.md b/qai_hub_models/models/mediapipe_pose/README.md index 06fd35f4..97b007ee 100644 --- a/qai_hub_models/models/mediapipe_pose/README.md +++ b/qai_hub_models/models/mediapipe_pose/README.md @@ -14,6 +14,8 @@ accross various devices, can be found [here](https://aihub.qualcomm.com/models/m a hosted Qualcomm® device. + + ## Example & Usage diff --git a/qai_hub_models/models/mediapipe_pose/export.py b/qai_hub_models/models/mediapipe_pose/export.py index a187daf4..71d05f58 100644 --- a/qai_hub_models/models/mediapipe_pose/export.py +++ b/qai_hub_models/models/mediapipe_pose/export.py @@ -134,7 +134,7 @@ def export_model( # 2. 
Compile the models to an on-device asset model_compile_options = component.get_hub_compile_options( - target_runtime, compile_options + target_runtime, compile_options, hub_device ) print(f"Optimizing model {component_name} to run on-device") submitted_compile_job = hub.submit_compile_job( @@ -226,9 +226,7 @@ def export_model( def main(): warnings.filterwarnings("ignore") - parser = export_parser( - model_cls=Model, components=ALL_COMPONENTS, supports_ort=False - ) + parser = export_parser(model_cls=Model, components=ALL_COMPONENTS) args = parser.parse_args() export_model(**vars(args)) diff --git a/qai_hub_models/models/mediapipe_pose/model.py b/qai_hub_models/models/mediapipe_pose/model.py index 7c96d6e5..583773a6 100644 --- a/qai_hub_models/models/mediapipe_pose/model.py +++ b/qai_hub_models/models/mediapipe_pose/model.py @@ -13,7 +13,7 @@ from qai_hub_models.utils.input_spec import InputSpec MODEL_ID = __name__.split(".")[-2] -MODEL_ASSET_VERSION = 1 +MODEL_ASSET_VERSION = 2 POSE_LANDMARK_CONNECTIONS = [ (0, 1), diff --git a/qai_hub_models/models/mediapipe_pose/perf.yaml b/qai_hub_models/models/mediapipe_pose/perf.yaml index 68281558..d3b007a4 100644 --- a/qai_hub_models/models/mediapipe_pose/perf.yaml +++ b/qai_hub_models/models/mediapipe_pose/perf.yaml @@ -22,6 +22,7 @@ aggregated: - Samsung Galaxy S24 Ultra - Samsung Galaxy S24+ - Samsung Galaxy Tab S8 + - Snapdragon X Elite CRD - Xiaomi 12 - Xiaomi 12 Pro supported_chipsets: @@ -30,15 +31,16 @@ aggregated: - Snapdragon® 8 Gen 2 - Snapdragon® 8 Gen 3 - Snapdragon® 888 + - Snapdragon® X Elite models: - name: MediaPipePoseDetector performance_metrics: - torchscript_onnx_tflite: - inference_time: 835.0 - throughput: 1197.6047904191616 + inference_time: 839.0 + throughput: 1191.8951132300358 estimated_peak_memory_range: - min: 16384 - max: 1889240 + min: 24576 + max: 2326784 primary_compute_unit: NPU precision: fp16 layer_info: @@ -46,14 +48,14 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 107 - job_id: j2p03642p + job_id: jwgov6r45 job_status: Passed torchscript_onnx_qnn: - inference_time: 884.0 - throughput: 1131.2217194570135 + inference_time: 873.0 + throughput: 1145.475372279496 estimated_peak_memory_range: - min: 69632 - max: 15459024 + min: 12288 + max: 16427488 primary_compute_unit: NPU precision: fp16 layer_info: @@ -61,22 +63,22 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 140 - job_id: j1gl6l4eg + job_id: jmg94nmq5 job_status: Passed torchscript_onnx_ort: - inference_time: 1006.0 - throughput: 994.0357852882704 + inference_time: 1003.0 + throughput: 997.0089730807578 estimated_peak_memory_range: - min: 16384 - max: 9676016 + min: 36864 + max: 10321904 primary_compute_unit: NPU precision: fp16 layer_info: - layers_on_npu: 1 + layers_on_npu: 139 layers_on_gpu: 0 layers_on_cpu: 0 - total_layers: 1 - job_id: j1pv07q75 + total_layers: 139 + job_id: jopry3lvg job_status: Passed reference_device_info: name: Samsung Galaxy S23 @@ -85,13 +87,13 @@ models: os_name: Android manufacturer: Samsung chipset: Snapdragon® 8 Gen 2 - timestamp: '2024-04-23T18:42:32.544709Z' + timestamp: '2024-05-20T16:35:29.766048Z' - torchscript_onnx_tflite: - inference_time: 612.0 - throughput: 1633.986928104575 + inference_time: 606.0 + throughput: 1650.1650165016501 estimated_peak_memory_range: min: 16384 - max: 40580928 + max: 41021648 primary_compute_unit: NPU precision: fp16 layer_info: @@ -99,14 +101,14 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 107 - job_id: jogk78vyp + job_id: j7gjln77p job_status: Passed 
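Related editorial note: the `export.py` changes in this patch (inception_v3_quantized, mediapipe_face, mediapipe_hand, mediapipe_pose) drop the `supports_ort=False` and `supports_qnn=False` restrictions from `export_parser`, so ONNX Runtime becomes a selectable export target for these models. The sketch below is hedged: the module path and function names come from the diff, while the command-line flag shown in the comment is an assumption about the generated parser rather than something this patch adds.

```python
# The updated main() now builds an unrestricted parser for every runtime:
#     parser = export_parser(model_cls=Model, components=ALL_COMPONENTS)
# so an ORT export can be requested when invoking the script, e.g. (flag name assumed):
#     python -m qai_hub_models.models.mediapipe_pose.export --target-runtime ort
from qai_hub_models.models.mediapipe_pose.export import export_model, main  # module updated in this patch
```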
torchscript_onnx_qnn: - inference_time: 636.0 - throughput: 1572.3270440251572 + inference_time: 630.0 + throughput: 1587.3015873015872 estimated_peak_memory_range: - min: 208896 - max: 44032080 + min: 0 + max: 45101520 primary_compute_unit: NPU precision: fp16 layer_info: @@ -114,22 +116,22 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 140 - job_id: j1p3v6nxg + job_id: jvgdv13kg job_status: Passed torchscript_onnx_ort: - inference_time: 732.0 - throughput: 1366.120218579235 + inference_time: 769.0 + throughput: 1300.3901170351105 estimated_peak_memory_range: - min: 208896 - max: 21601008 + min: 212992 + max: 30386624 primary_compute_unit: NPU precision: fp16 layer_info: - layers_on_npu: 1 + layers_on_npu: 139 layers_on_gpu: 0 layers_on_cpu: 0 - total_layers: 1 - job_id: jlpeeyo7p + total_layers: 139 + job_id: jqpyd3orp job_status: Passed reference_device_info: name: Samsung Galaxy S24 @@ -138,13 +140,13 @@ models: os_name: Android manufacturer: Samsung chipset: Snapdragon® 8 Gen 3 - timestamp: '2024-04-23T18:42:32.544766Z' + timestamp: '2024-05-20T16:35:29.766075Z' - torchscript_onnx_tflite: - inference_time: 845.0 - throughput: 1183.4319526627219 + inference_time: 829.0 + throughput: 1206.2726176115802 estimated_peak_memory_range: - min: 32768 - max: 1538160 + min: 77824 + max: 1838752 primary_compute_unit: NPU precision: fp16 layer_info: @@ -152,14 +154,14 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 107 - job_id: j2p03xdep + job_id: jygz7dmzp job_status: Passed torchscript_onnx_qnn: - inference_time: 886.0 - throughput: 1128.6681715575621 + inference_time: 875.0 + throughput: 1142.857142857143 estimated_peak_memory_range: - min: 12288 - max: 104292296 + min: 229376 + max: 5314120 primary_compute_unit: NPU precision: fp16 layer_info: @@ -167,7 +169,7 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 140 - job_id: j1pv0nem5 + job_id: jo5mzxmyp job_status: Passed reference_device_info: name: QCS8550 (Proxy) @@ -176,15 +178,68 @@ models: os_name: Android manufacturer: Qualcomm chipset: Qcs8550 - timestamp: '2024-04-23T18:42:32.544811Z' + timestamp: '2024-05-20T16:35:29.766091Z' + - torchscript_onnx_qnn: + inference_time: 1047.0 + throughput: 955.1098376313277 + estimated_peak_memory_range: + min: 540672 + max: 540672 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 139 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 139 + job_id: jqp4wr1qg + job_status: Passed + torchscript_onnx_ort: + inference_time: 1080.0 + throughput: 925.925925925926 + estimated_peak_memory_range: + min: 1073152 + max: 1073152 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 139 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 139 + job_id: j1p87yez5 + job_status: Passed + torchscript_onnx_ort_dml_gpu: + inference_time: 15947.0 + throughput: 62.70771932024832 + estimated_peak_memory_range: + min: 26939392 + max: 26939392 + primary_compute_unit: CPU + precision: fp32 + layer_info: + layers_on_npu: 0 + layers_on_gpu: 0 + layers_on_cpu: 81 + total_layers: 81 + job_id: jn5q2ql75 + job_status: Passed + reference_device_info: + name: Snapdragon X Elite CRD + os: '11' + form_factor: Compute + os_name: Windows + manufacturer: Qualcomm + chipset: Snapdragon® X Elite + timestamp: '2024-05-20T16:35:29.766117Z' - name: MediaPipePoseLandmarkDetector performance_metrics: - torchscript_onnx_tflite: - inference_time: 1206.0 - throughput: 829.1873963515754 + inference_time: 1204.0 + throughput: 830.5647840531561 
estimated_peak_memory_range: - min: 16384 - max: 2448848 + min: 24576 + max: 2528368 primary_compute_unit: NPU precision: fp16 layer_info: @@ -192,14 +247,14 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 230 - job_id: j1p8012zg + job_id: j1pvwkd7g job_status: Passed torchscript_onnx_qnn: - inference_time: 1297.0 - throughput: 771.0100231303007 + inference_time: 1311.0 + throughput: 762.7765064836003 estimated_peak_memory_range: - min: 12288 - max: 15533680 + min: 16384 + max: 13548072 primary_compute_unit: NPU precision: fp16 layer_info: @@ -207,22 +262,22 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 306 - job_id: jw56ew2vg + job_id: jnp18zjkg job_status: Passed torchscript_onnx_ort: - inference_time: 106535.0 - throughput: 9.386586567794621 + inference_time: 1658.0 + throughput: 603.1363088057901 estimated_peak_memory_range: - min: 102400 - max: 26214168 + min: 53248 + max: 26730224 primary_compute_unit: NPU precision: fp16 layer_info: - layers_on_npu: 1 + layers_on_npu: 304 layers_on_gpu: 0 layers_on_cpu: 0 - total_layers: 1 - job_id: j7gjzqd75 + total_layers: 304 + job_id: jep2myrx5 job_status: Passed reference_device_info: name: Samsung Galaxy S23 @@ -231,13 +286,13 @@ models: os_name: Android manufacturer: Samsung chipset: Snapdragon® 8 Gen 2 - timestamp: '2024-04-23T18:42:32.544899Z' + timestamp: '2024-05-20T16:35:29.766142Z' - torchscript_onnx_tflite: - inference_time: 880.0 - throughput: 1136.3636363636363 + inference_time: 864.0 + throughput: 1157.4074074074074 estimated_peak_memory_range: - min: 16384 - max: 87924496 + min: 20480 + max: 88312288 primary_compute_unit: NPU precision: fp16 layer_info: @@ -245,14 +300,14 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 230 - job_id: jn5qev075 + job_id: jlpevmz75 job_status: Passed torchscript_onnx_qnn: - inference_time: 964.0 - throughput: 1037.344398340249 + inference_time: 948.0 + throughput: 1054.8523206751054 estimated_peak_memory_range: min: 802816 - max: 83648384 + max: 89559840 primary_compute_unit: NPU precision: fp16 layer_info: @@ -260,22 +315,22 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 306 - job_id: jwgok8z4p + job_id: jz57dr4q5 job_status: Passed torchscript_onnx_ort: - inference_time: 82694.0 - throughput: 12.092775775751566 + inference_time: 1253.0 + throughput: 798.0845969672786 estimated_peak_memory_range: - min: 819200 - max: 35448288 + min: 454656 + max: 38605792 primary_compute_unit: NPU precision: fp16 layer_info: - layers_on_npu: 1 + layers_on_npu: 304 layers_on_gpu: 0 layers_on_cpu: 0 - total_layers: 1 - job_id: jygzon2z5 + total_layers: 304 + job_id: j2p0r0m2p job_status: Passed reference_device_info: name: Samsung Galaxy S24 @@ -284,13 +339,13 @@ models: os_name: Android manufacturer: Samsung chipset: Snapdragon® 8 Gen 3 - timestamp: '2024-04-23T18:42:32.544985Z' + timestamp: '2024-05-20T16:35:29.766163Z' - torchscript_onnx_tflite: - inference_time: 1247.0 - throughput: 801.924619085806 + inference_time: 1244.0 + throughput: 803.8585209003215 estimated_peak_memory_range: - min: 12288 - max: 2817072 + min: 86016 + max: 3237392 primary_compute_unit: NPU precision: fp16 layer_info: @@ -298,14 +353,14 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 230 - job_id: j1p80k68g + job_id: jz5w967zp job_status: Passed torchscript_onnx_qnn: - inference_time: 1291.0 - throughput: 774.5933384972889 + inference_time: 1309.0 + throughput: 763.9419404125287 estimated_peak_memory_range: - min: 24576 - max: 13908424 + min: 12288 + max: 14098200 
primary_compute_unit: NPU precision: fp16 layer_info: @@ -313,7 +368,7 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 306 - job_id: j7gjz8o85 + job_id: jegnevzvg job_status: Passed reference_device_info: name: QCS8550 (Proxy) @@ -322,4 +377,57 @@ models: os_name: Android manufacturer: Qualcomm chipset: Qcs8550 - timestamp: '2024-04-23T18:42:32.545055Z' + timestamp: '2024-05-20T16:35:29.766179Z' + - torchscript_onnx_qnn: + inference_time: 1501.0 + throughput: 666.2225183211193 + estimated_peak_memory_range: + min: 786432 + max: 786432 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 305 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 305 + job_id: j0px1o4jg + job_status: Passed + torchscript_onnx_ort: + inference_time: 1633.0 + throughput: 612.369871402327 + estimated_peak_memory_range: + min: 7917568 + max: 7917568 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 304 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 304 + job_id: jogkyx2yp + job_status: Passed + torchscript_onnx_ort_dml_gpu: + inference_time: 6059.0 + throughput: 165.0437365901964 + estimated_peak_memory_range: + min: 20336640 + max: 20336640 + primary_compute_unit: GPU + precision: fp16 + layer_info: + layers_on_npu: 0 + layers_on_gpu: 1 + layers_on_cpu: 0 + total_layers: 1 + job_id: j1glkmyep + job_status: Passed + reference_device_info: + name: Snapdragon X Elite CRD + os: '11' + form_factor: Compute + os_name: Windows + manufacturer: Qualcomm + chipset: Snapdragon® X Elite + timestamp: '2024-05-20T16:35:29.766201Z' diff --git a/qai_hub_models/models/mediapipe_selfie/README.md b/qai_hub_models/models/mediapipe_selfie/README.md index 350d2545..fd842e56 100644 --- a/qai_hub_models/models/mediapipe_selfie/README.md +++ b/qai_hub_models/models/mediapipe_selfie/README.md @@ -14,6 +14,8 @@ accross various devices, can be found [here](https://aihub.qualcomm.com/models/m a hosted Qualcomm® device. + + ## Example & Usage Install the package via pip: diff --git a/qai_hub_models/models/mediapipe_selfie/export.py b/qai_hub_models/models/mediapipe_selfie/export.py index d257c256..ec8731e8 100644 --- a/qai_hub_models/models/mediapipe_selfie/export.py +++ b/qai_hub_models/models/mediapipe_selfie/export.py @@ -120,12 +120,16 @@ def export_model( model.eval() source_model = torch.jit.trace(model.to("cpu"), make_torch_inputs(input_spec)) + # Convert outputs from channel last to channel first (preferred I/O format for QNN and TensorFlow Lite) + channel_last_flags = ( + " --force_channel_last_input image" + " --force_channel_last_output output_0" + if target_runtime != TargetRuntime.ORT + else "" + ) + # 2. 
Compile the model to an on-device asset model_compile_options = model.get_hub_compile_options( - target_runtime, - compile_options - + " --force_channel_last_input image" - + " --force_channel_last_output output_0", + target_runtime, compile_options + channel_last_flags, hub_device ) print(f"Optimizing model {model_name} to run on-device") submitted_compile_job = hub.submit_compile_job( @@ -163,8 +167,10 @@ def export_model( ) sample_inputs = model.sample_inputs(input_spec) # Convert inputs from channel first to channel last - hub_inputs = transpose_channel_first_to_last( - "image", sample_inputs, target_runtime + hub_inputs = ( + sample_inputs + if target_runtime == TargetRuntime.ORT + else transpose_channel_first_to_last("image", sample_inputs, target_runtime) ) submitted_inference_job = hub.submit_inference_job( model=compile_job.get_target_model(), @@ -192,8 +198,12 @@ def export_model( assert inference_job is not None and inference_job.wait().success inference_result: hub.client.DatasetEntries = inference_job.download_output_data() # type: ignore # Convert outputs from channel last to channel first - inference_result = transpose_channel_last_to_first( - "output_0", inference_result, target_runtime + inference_result = ( + inference_result + if target_runtime == TargetRuntime.ORT + else transpose_channel_last_to_first( + "output_0", inference_result, target_runtime + ) ) print_inference_metrics(inference_job, inference_result, torch_out) @@ -205,7 +215,7 @@ def export_model( def main(): warnings.filterwarnings("ignore") - parser = export_parser(model_cls=Model, supports_ort=False) + parser = export_parser(model_cls=Model) args = parser.parse_args() export_model(**vars(args)) diff --git a/qai_hub_models/models/mediapipe_selfie/perf.yaml b/qai_hub_models/models/mediapipe_selfie/perf.yaml index 46644d9a..f988331b 100644 --- a/qai_hub_models/models/mediapipe_selfie/perf.yaml +++ b/qai_hub_models/models/mediapipe_selfie/perf.yaml @@ -22,6 +22,7 @@ aggregated: - Samsung Galaxy S24 Ultra - Samsung Galaxy S24+ - Samsung Galaxy Tab S8 + - Snapdragon X Elite CRD - Xiaomi 12 - Xiaomi 12 Pro supported_chipsets: @@ -30,15 +31,16 @@ aggregated: - Snapdragon® 8 Gen 2 - Snapdragon® 8 Gen 3 - Snapdragon® 888 + - Snapdragon® X Elite models: - name: MediaPipe-Selfie-Segmentation performance_metrics: - torchscript_onnx_tflite: - inference_time: 792.0 - throughput: 1262.6262626262626 + inference_time: 807.0 + throughput: 1239.1573729863692 estimated_peak_memory_range: min: 12288 - max: 4536656 + max: 1954960 primary_compute_unit: NPU precision: fp16 layer_info: @@ -46,14 +48,14 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 118 - job_id: jnp1y62kp + job_id: jw56148vp job_status: Passed torchscript_onnx_qnn: - inference_time: 773.0 - throughput: 1293.6610608020699 + inference_time: 787.0 + throughput: 1270.6480304955528 estimated_peak_memory_range: - min: 32768 - max: 18516080 + min: 28672 + max: 13500824 primary_compute_unit: NPU precision: fp16 layer_info: @@ -61,22 +63,22 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 138 - job_id: jz57092qg + job_id: j1pvwkl7g job_status: Passed torchscript_onnx_ort: - inference_time: 164651.0 - throughput: 6.073452332509368 + inference_time: 1327.0 + throughput: 753.5795026375282 estimated_peak_memory_range: - min: 1437696 - max: 5932024 + min: 802816 + max: 5487496 primary_compute_unit: NPU precision: fp16 layer_info: - layers_on_npu: 1 + layers_on_npu: 140 layers_on_gpu: 0 layers_on_cpu: 0 - total_layers: 1 - job_id: j0pxnx9j5 + total_layers: 
140 + job_id: jz5w96lzp job_status: Passed reference_device_info: name: Samsung Galaxy S23 @@ -85,13 +87,13 @@ models: os_name: Android manufacturer: Samsung chipset: Snapdragon® 8 Gen 2 - timestamp: '2024-04-23T18:42:32.589824Z' + timestamp: '2024-05-20T16:35:29.823981Z' - torchscript_onnx_tflite: - inference_time: 536.0 - throughput: 1865.6716417910447 + inference_time: 542.0 + throughput: 1845.018450184502 estimated_peak_memory_range: - min: 12288 - max: 23055696 + min: 16384 + max: 23610032 primary_compute_unit: NPU precision: fp16 layer_info: @@ -99,14 +101,14 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 118 - job_id: jvgde2nk5 + job_id: j1p3m0zxg job_status: Passed torchscript_onnx_qnn: - inference_time: 525.0 - throughput: 1904.7619047619048 + inference_time: 510.0 + throughput: 1960.7843137254902 estimated_peak_memory_range: min: 176128 - max: 41755712 + max: 41845584 primary_compute_unit: NPU precision: fp16 layer_info: @@ -114,22 +116,22 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 138 - job_id: jqp4k3nqg + job_id: j7gjlnr7p job_status: Passed torchscript_onnx_ort: - inference_time: 121169.0 - throughput: 8.252935981975588 + inference_time: 945.0 + throughput: 1058.2010582010582 estimated_peak_memory_range: min: 12288 - max: 18735968 + max: 20917104 primary_compute_unit: NPU precision: fp16 layer_info: - layers_on_npu: 1 + layers_on_npu: 140 layers_on_gpu: 0 layers_on_cpu: 0 - total_layers: 1 - job_id: jo5mq8eyp + total_layers: 140 + job_id: jmg94nzq5 job_status: Passed reference_device_info: name: Samsung Galaxy S24 @@ -138,13 +140,13 @@ models: os_name: Android manufacturer: Samsung chipset: Snapdragon® 8 Gen 3 - timestamp: '2024-04-23T18:42:32.589884Z' + timestamp: '2024-05-20T16:35:29.824007Z' - torchscript_onnx_tflite: - inference_time: 785.0 - throughput: 1273.8853503184714 + inference_time: 809.0 + throughput: 1236.0939431396787 estimated_peak_memory_range: - min: 12288 - max: 2039720 + min: 20480 + max: 1607472 primary_compute_unit: NPU precision: fp16 layer_info: @@ -152,14 +154,14 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 118 - job_id: j0pxnzd95 + job_id: jwgov6l45 job_status: Passed torchscript_onnx_qnn: - inference_time: 772.0 - throughput: 1295.3367875647668 + inference_time: 787.0 + throughput: 1270.6480304955528 estimated_peak_memory_range: - min: 819200 - max: 8273816 + min: 806912 + max: 41288280 primary_compute_unit: NPU precision: fp16 layer_info: @@ -167,7 +169,7 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 138 - job_id: jep20zvmg + job_id: jygz7dlzp job_status: Passed reference_device_info: name: QCS8550 (Proxy) @@ -176,4 +178,57 @@ models: os_name: Android manufacturer: Qualcomm chipset: Qcs8550 - timestamp: '2024-04-23T18:42:32.589938Z' + timestamp: '2024-05-20T16:35:29.824029Z' + - torchscript_onnx_qnn: + inference_time: 945.0 + throughput: 1058.2010582010582 + estimated_peak_memory_range: + min: 786432 + max: 786432 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 138 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 138 + job_id: jlpevm775 + job_status: Passed + torchscript_onnx_ort: + inference_time: 1395.0 + throughput: 716.8458781362007 + estimated_peak_memory_range: + min: 2465792 + max: 2465792 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 140 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 140 + job_id: jnp18znkg + job_status: Passed + torchscript_onnx_ort_dml_gpu: + inference_time: 4582.0 + throughput: 218.2453077258839 + 
estimated_peak_memory_range: + min: 16928768 + max: 16928768 + primary_compute_unit: CPU + precision: fp32 + layer_info: + layers_on_npu: 0 + layers_on_gpu: 0 + layers_on_cpu: 110 + total_layers: 110 + job_id: jvgdv1dkg + job_status: Passed + reference_device_info: + name: Snapdragon X Elite CRD + os: '11' + form_factor: Compute + os_name: Windows + manufacturer: Qualcomm + chipset: Snapdragon® X Elite + timestamp: '2024-05-20T16:35:29.824052Z' diff --git a/qai_hub_models/models/midas/README.md b/qai_hub_models/models/midas/README.md new file mode 100644 index 00000000..69a660f5 --- /dev/null +++ b/qai_hub_models/models/midas/README.md @@ -0,0 +1,56 @@ +[![Qualcomm® AI Hub Models](https://qaihub-public-assets.s3.us-west-2.amazonaws.com/qai-hub-models/quic-logo.jpg)](../../README.md) + + +# [Midas-V2: Deep Convolutional Neural Network model for depth estimation](#) + +Midas is designed for estimating depth at each point in an image. + +This is based on the implementation of Midas-V2 found +[here](https://github.com/isl-org/MiDaS). This repository contains scripts for optimized on-device +export suitable to run on Qualcomm® devices. More details on model performance +across various devices can be found [here](#). + +[Sign up](https://myaccount.qualcomm.com/signup) for early access to run these models on +a hosted Qualcomm® device. + + + + +## Example & Usage + + +Once the `qai_hub_models` package is installed, run the following simple CLI demo: + +```bash +python -m qai_hub_models.models.midas.demo +``` +More details on the CLI tool can be found with the `--help` option. See +[demo.py](demo.py) for sample usage of the model including pre/post processing +scripts. Please refer to our [general instructions on using +models](../../../#getting-started) for more usage instructions. + +## Export for on-device deployment + +This repository contains export scripts that produce a model optimized for +on-device deployment. This can be run as follows: + +```bash +python -m qai_hub_models.models.midas.export +``` +Additional options are documented with the `--help` option. Note that the above +script requires access to Qualcomm® AI Hub; refer to the Qualcomm® AI Hub deployment instructions for details. + +## License +- The license for the original implementation of Midas-V2 can be found + [here](https://github.com/isl-org/MiDaS/blob/master/LICENSE). +- The license for the compiled assets for on-device deployment can be found [here](https://qaihub-public-assets.s3.us-west-2.amazonaws.com/qai-hub-models/Qualcomm+AI+Hub+Proprietary+License.pdf) + +## References +* [Towards Robust Monocular Depth Estimation: Mixing Datasets for Zero-shot Cross-dataset Transfer](https://arxiv.org/abs/1907.01341v3) +* [Source Model Implementation](https://github.com/isl-org/MiDaS) + +## Community +* Join [our AI Hub Slack community](https://qualcomm-ai-hub.slack.com/join/shared_invite/zt-2d5zsmas3-Sj0Q9TzslueCjS31eXG2UA#/shared-invite/email) to collaborate, post questions and learn more about on-device AI. +* For questions or feedback please [reach out to us](mailto:ai-hub-support@qti.qualcomm.com). + + diff --git a/qai_hub_models/models/midas/__init__.py b/qai_hub_models/models/midas/__init__.py new file mode 100644 index 00000000..10b63ec3 --- /dev/null +++ b/qai_hub_models/models/midas/__init__.py @@ -0,0 +1,7 @@ +# --------------------------------------------------------------------- +# Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved.
+# SPDX-License-Identifier: BSD-3-Clause +# --------------------------------------------------------------------- +from .app import MidasApp as App # noqa: F401 +from .model import MODEL_ID # noqa: F401 +from .model import Midas as Model # noqa: F401 diff --git a/qai_hub_models/models/midas/app.py b/qai_hub_models/models/midas/app.py new file mode 100644 index 00000000..949c87b1 --- /dev/null +++ b/qai_hub_models/models/midas/app.py @@ -0,0 +1,63 @@ +# --------------------------------------------------------------------- +# Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. +# SPDX-License-Identifier: BSD-3-Clause +# --------------------------------------------------------------------- +from __future__ import annotations + +from typing import Callable, List + +import matplotlib.pyplot as plt +import numpy as np +import torch +from PIL import Image +from torchvision import transforms + +from qai_hub_models.utils.image_processing import pil_resize_pad, undo_resize_pad + + +class MidasApp: + def __init__( + self, + model: Callable[[torch.Tensor], torch.Tensor], + input_height: int, + input_width: int, + ): + self.model = model + self.input_height = input_height + self.input_width = input_width + + def predict(self, *args, **kwargs): + return self.estimate_depth(*args, **kwargs) + + def estimate_depth( + self, + image: Image.Image, + raw_output: bool = False, + ) -> List[Image.Image] | np.ndarray: + """ + Estimates the depth at each point in an image and produces a heatmap. + + Parameters: + image: PIL Image to estimate depth. + raw_output: If set, returns the raw depth estimates instead of a heatmap. + + Returns: + A heatmap PIL Image or an np array of depth estimates. + np array will be shape (h, w) where h, w are the dimensions of the input. + np array will contain raw depth estimates, while PIL image will normalize + the values and display them as an RGB image. + """ + resized_image, scale, padding = pil_resize_pad( + image, (self.input_height, self.input_width) + ) + image_tensor = transforms.ToTensor()(resized_image).unsqueeze(0) + with torch.no_grad(): + prediction = self.model(image_tensor) + prediction = undo_resize_pad( + prediction.unsqueeze(0), image.size, scale, padding + ) + numpy_output = prediction.squeeze().cpu().numpy() + if raw_output: + return numpy_output + heatmap = plt.cm.plasma(numpy_output / numpy_output.max())[..., :3] + return Image.fromarray((heatmap * 255).astype(np.uint8)) diff --git a/qai_hub_models/models/midas/conftest.py b/qai_hub_models/models/midas/conftest.py new file mode 100644 index 00000000..fb82cdde --- /dev/null +++ b/qai_hub_models/models/midas/conftest.py @@ -0,0 +1,39 @@ +# --------------------------------------------------------------------- +# Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. +# SPDX-License-Identifier: BSD-3-Clause +# --------------------------------------------------------------------- +# THIS FILE WAS AUTO-GENERATED. DO NOT EDIT MANUALLY. + +import inspect + +import pytest + +from qai_hub_models.models.midas import Model +from qai_hub_models.utils.testing import skip_clone_repo_check + + +# Instantiate the model only once for all tests. +# Mock from_pretrained to always return the initialized model. +# This speeds up tests and limits memory leaks. 
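+# The fixture below monkeypatches Model.from_pretrained with a cache keyed on the
+# stringified positional and keyword arguments, so repeated calls with the same
+# arguments (e.g. Model.from_pretrained("MiDaS_small")) return the same instance
+# for the duration of this test module.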
+@pytest.fixture(scope="module", autouse=True) +def cached_from_pretrained(): + with pytest.MonkeyPatch.context() as mp: + pretrained_cache = {} + from_pretrained = Model.from_pretrained + sig = inspect.signature(from_pretrained) + + @skip_clone_repo_check + def _cached_from_pretrained(*args, **kwargs): + cache_key = str(args) + str(kwargs) + model = pretrained_cache.get(cache_key, None) + if model: + return model + else: + model = from_pretrained(*args, **kwargs) + pretrained_cache[cache_key] = model + return model + + _cached_from_pretrained.__signature__ = sig + + mp.setattr(Model, "from_pretrained", _cached_from_pretrained) + yield mp diff --git a/qai_hub_models/models/midas/demo.py b/qai_hub_models/models/midas/demo.py new file mode 100644 index 00000000..9b2aa9b7 --- /dev/null +++ b/qai_hub_models/models/midas/demo.py @@ -0,0 +1,62 @@ +# --------------------------------------------------------------------- +# Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. +# SPDX-License-Identifier: BSD-3-Clause +# --------------------------------------------------------------------- +from __future__ import annotations + +from typing import Type + +from qai_hub_models.models.midas.app import MidasApp +from qai_hub_models.models.midas.model import MODEL_ASSET_VERSION, MODEL_ID, Midas +from qai_hub_models.utils.args import ( + demo_model_from_cli_args, + get_model_cli_parser, + get_on_device_demo_parser, + validate_on_device_demo_args, +) +from qai_hub_models.utils.asset_loaders import CachedWebModelAsset, load_image +from qai_hub_models.utils.display import display_or_save_image + +# Demo image comes from https://github.com/pytorch/hub/raw/master/images/dog.jpg +INPUT_IMAGE_ADDRESS = CachedWebModelAsset.from_asset_store( + MODEL_ID, MODEL_ASSET_VERSION, "test_input_image.jpg" +) + + +# Run Midas end-to-end on a sample image. +# The demo will display a heatmap of the estimated depth at each point in the image. +def midas_demo(model_cls: Type[Midas], is_test: bool = False): + # Demo parameters + parser = get_model_cli_parser(model_cls) + parser = get_on_device_demo_parser(parser, add_output_dir=True) + parser.add_argument( + "--image", + type=str, + default=INPUT_IMAGE_ADDRESS, + help="image file path or URL", + ) + args = parser.parse_args([] if is_test else None) + model = demo_model_from_cli_args(model_cls, MODEL_ID, args) + validate_on_device_demo_args(args, MODEL_ID) + + # Load image + (_, _, height, width) = model_cls.get_input_spec()["image"][0] + image = load_image(args.image) + print("Model Loaded") + + app = MidasApp(model, height, width) + heatmap_image = app.estimate_depth(image) + + if not is_test: + # Resize / unpad annotated image + display_or_save_image( + heatmap_image, args.output_dir, "midas_heatmap.png", "heatmap" + ) + + +def main(is_test: bool = False): + return midas_demo(model_cls=Midas, is_test=is_test) + + +if __name__ == "__main__": + main() diff --git a/qai_hub_models/models/midas/export.py b/qai_hub_models/models/midas/export.py new file mode 100644 index 00000000..b02c2b68 --- /dev/null +++ b/qai_hub_models/models/midas/export.py @@ -0,0 +1,217 @@ +# --------------------------------------------------------------------- +# Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. +# SPDX-License-Identifier: BSD-3-Clause +# --------------------------------------------------------------------- +# THIS FILE WAS AUTO-GENERATED. DO NOT EDIT MANUALLY. 
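+# Typical CLI invocation, as documented in the model README; options such as the
+# target device and runtime are described under --help:
+#   python -m qai_hub_models.models.midas.export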
+ + +from __future__ import annotations + +import os +import warnings +from pathlib import Path +from typing import Any, Dict, List, Optional, Tuple, cast + +import qai_hub as hub +import torch + +from qai_hub_models.models.midas import Model +from qai_hub_models.utils.args import ( + export_parser, + get_input_spec_kwargs, + get_model_kwargs, +) +from qai_hub_models.utils.base_model import TargetRuntime +from qai_hub_models.utils.compare import torch_inference +from qai_hub_models.utils.input_spec import make_torch_inputs +from qai_hub_models.utils.printing import ( + print_inference_metrics, + print_on_target_demo_cmd, + print_profile_metrics_from_job, +) +from qai_hub_models.utils.qai_hub_helpers import ( + can_access_qualcomm_ai_hub, + export_without_hub_access, + transpose_channel_first_to_last, +) + + +def export_model( + device: str = "Samsung Galaxy S23", + chipset: Optional[str] = None, + skip_profiling: bool = False, + skip_inferencing: bool = False, + skip_downloading: bool = False, + skip_summary: bool = False, + output_dir: Optional[str] = None, + target_runtime: TargetRuntime = TargetRuntime.TFLITE, + compile_options: str = "", + profile_options: str = "", + **additional_model_kwargs, +) -> Tuple[hub.CompileJob, Optional[hub.ProfileJob], Optional[hub.InferenceJob]] | List[ + str +]: + """ + This function accomplishes 6 main tasks: + + 1. Instantiates a PyTorch model and converts it to a traced TorchScript format. + 2. Compiles the model to an asset that can be run on device. + 3. Profiles the model performance on real devices. + 4. Inferences the model on sample inputs. + 5. Downloads the model asset to the local directory. + 6. Summarizes the results from profiling and inference. + + Each of the last four steps can be optionally skipped using the input options. + + Parameters: + device: Device for which to export the model. + Full list of available devices can be found by running `hub.get_devices()`. + Defaults to DEFAULT_DEVICE if not specified. + chipset: If set, will choose a random device with this chipset. + Overrides the `device` argument. + skip_profiling: If set, skips profiling of compiled model on real devices. + skip_inferencing: If set, skips computing on-device outputs from sample data. + skip_downloading: If set, skips downloading of compiled model. + skip_summary: If set, skips waiting for and summarizing results + from profiling and inference. + output_dir: Directory to store generated assets (e.g. compiled model). + Defaults to `/build/`. + target_runtime: Which on-device runtime to target. Default is TFLite. + compile_options: Additional options to pass when submitting the compile job. + profile_options: Additional options to pass when submitting the profile job. + **additional_model_kwargs: Additional optional kwargs used to customize + `model_cls.from_pretrained` and `model.get_input_spec` + + Returns: + A 3-tuple of: + * A CompileJob object containing metadata about the compile job submitted to hub. + * A ProfileJob containing metadata about the profile job (None if profiling skipped). + * An InferenceJob containing metadata about the inference job (None if inferencing skipped). 
+ """ + model_name = "midas" + output_path = Path(output_dir or Path.cwd() / "build" / model_name) + if chipset: + hub_device = hub.Device(attributes=f"chipset:{chipset}") + else: + hub_device = hub.Device(name=device) + if not can_access_qualcomm_ai_hub(): + return export_without_hub_access( + "midas", + "Midas-V2", + device, + skip_profiling, + skip_inferencing, + skip_downloading, + skip_summary, + output_path, + target_runtime, + compile_options, + profile_options, + ) + + # 1. Initialize PyTorch model + model = Model.from_pretrained(**get_model_kwargs(Model, additional_model_kwargs)) + input_spec = model.get_input_spec( + **get_input_spec_kwargs(model, additional_model_kwargs) + ) + + # Trace the model + model.eval() + source_model = torch.jit.trace( + model.to("cpu"), make_torch_inputs(input_spec), check_trace=False + ) + + # Convert outputs from channel last to channel first (preferred I/O format for QNN and TensorFlow Lite) + channel_last_flags = ( + " --force_channel_last_input image" + if target_runtime != TargetRuntime.ORT + else "" + ) + + # 2. Compile the model to an on-device asset + model_compile_options = model.get_hub_compile_options( + target_runtime, compile_options + channel_last_flags, hub_device + ) + print(f"Optimizing model {model_name} to run on-device") + submitted_compile_job = hub.submit_compile_job( + model=source_model, + input_specs=input_spec, + device=hub_device, + name=model_name, + options=model_compile_options, + ) + compile_job = cast(hub.client.CompileJob, submitted_compile_job) + + # 3. Profile the model asset on real devices + profile_job: Optional[hub.client.ProfileJob] = None + if not skip_profiling: + profile_options_all = model.get_hub_profile_options( + target_runtime, profile_options + ) + print(f"Profiling model {model_name} on a hosted device.") + submitted_profile_job = hub.submit_profile_job( + model=compile_job.get_target_model(), + device=hub_device, + name=model_name, + options=profile_options_all, + ) + profile_job = cast(hub.client.ProfileJob, submitted_profile_job) + + # 4. Run inference on-device with sample inputs + inference_job: Optional[hub.client.InferenceJob] = None + if not skip_inferencing: + profile_options_all = model.get_hub_profile_options( + target_runtime, profile_options + ) + print( + f"Running inference for {model_name} on a hosted device with example inputs." + ) + sample_inputs = model.sample_inputs(input_spec) + # Convert inputs from channel first to channel last + hub_inputs = ( + sample_inputs + if target_runtime == TargetRuntime.ORT + else transpose_channel_first_to_last("image", sample_inputs, target_runtime) + ) + submitted_inference_job = hub.submit_inference_job( + model=compile_job.get_target_model(), + inputs=hub_inputs, + device=hub_device, + name=model_name, + options=profile_options_all, + ) + inference_job = cast(hub.client.InferenceJob, submitted_inference_job) + + # 5. Download the model asset to a local file + if not skip_downloading: + os.makedirs(output_path, exist_ok=True) + target_model: hub.Model = compile_job.get_target_model() # type: ignore + target_model.download(str(output_path / f"{model_name}.tflite")) + + # 6. 
Summarize the results from profiling and inference + if not skip_summary and not skip_profiling: + assert profile_job is not None and profile_job.wait().success + profile_data: Dict[str, Any] = profile_job.download_profile() # type: ignore + print_profile_metrics_from_job(profile_job, profile_data) + + if not skip_summary and not skip_inferencing: + torch_out = torch_inference(model, sample_inputs) + assert inference_job is not None and inference_job.wait().success + inference_result: hub.client.DatasetEntries = inference_job.download_output_data() # type: ignore + print_inference_metrics(inference_job, inference_result, torch_out) + + if not skip_summary: + print_on_target_demo_cmd(compile_job, Path(__file__).parent.resolve(), device) + + return (compile_job, profile_job, inference_job) + + +def main(): + warnings.filterwarnings("ignore") + parser = export_parser(model_cls=Model) + args = parser.parse_args() + export_model(**vars(args)) + + +if __name__ == "__main__": + main() diff --git a/qai_hub_models/models/midas/info.yaml b/qai_hub_models/models/midas/info.yaml new file mode 100644 index 00000000..be8b2527 --- /dev/null +++ b/qai_hub_models/models/midas/info.yaml @@ -0,0 +1,34 @@ +name: Midas-V2 +# id must match with the model dir name in qai_hub_models +id: midas +status: public +headline: Deep Convolutional Neural Network model for depth estimation. +domain: Computer Vision +use_case: Depth Estimation +description: Midas is designed for estimating depth at each point in an image. +tags: [] +research_paper: https://arxiv.org/abs/1907.01341v3 +research_paper_title: 'Towards Robust Monocular Depth Estimation: Mixing Datasets + for Zero-shot Cross-dataset Transfer' +license: https://github.com/isl-org/MiDaS/blob/master/LICENSE +deploy_license: + https://qaihub-public-assets.s3.us-west-2.amazonaws.com/qai-hub-models/Qualcomm+AI+Hub+Proprietary+License.pdf +source_repo: https://github.com/isl-org/MiDaS +technical_details: + Model checkpoint: MiDaS_small + Input resolution: 256x256 + Number of parameters: 16.6M + Model size: 63.2 MB +applicable_scenarios: + - Anomaly Detection + - Inventory Management +related_models: [] +form_factors: + - Phone + - Tablet + - IoT +has_static_banner: yes +has_animated_banner: no +license_type: mit +deploy_license_type: AI Model Hub License +dataset: [] diff --git a/qai_hub_models/models/midas/model.py b/qai_hub_models/models/midas/model.py new file mode 100644 index 00000000..3f6b1d53 --- /dev/null +++ b/qai_hub_models/models/midas/model.py @@ -0,0 +1,54 @@ +# --------------------------------------------------------------------- +# Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. 
+# SPDX-License-Identifier: BSD-3-Clause +# --------------------------------------------------------------------- +from __future__ import annotations + +import torch + +from qai_hub_models.utils.base_model import BaseModel +from qai_hub_models.utils.image_processing import normalize_image_torchvision +from qai_hub_models.utils.input_spec import InputSpec + +MODEL_ID = __name__.split(".")[-2] +MODEL_ASSET_VERSION = 1 +DEFAULT_WEIGHTS = "MiDaS_small" + + +class Midas(BaseModel): + """Exportable Midas depth estimation model.""" + + def __init__( + self, + model: torch.nn.Module, + normalize_input: bool = True, + ) -> None: + super().__init__() + self.model = model + self.normalize_input = normalize_input + + @classmethod + def from_pretrained(cls, weights: str = DEFAULT_WEIGHTS) -> Midas: + model = torch.hub.load("intel-isl/MiDaS", weights).eval() + return cls(model) + + @staticmethod + def get_input_spec(height: int = 256, width: int = 256) -> InputSpec: + return {"image": ((1, 3, height, width), "float32")} + + def forward(self, image): + """ + Runs the model on an image tensor and returns a tensor of depth estimates + + Parameters: + image: A [1, 3, H, W] image. + Pixel values pre-processed for encoder consumption. + Range: float[0, 1] if self.normalize_input, else ~[-2.5, 2.5] + 3-channel Color Space: RGB + + Returns: + Tensor of depth estimates of size [1, H, W]. + """ + if self.normalize_input: + image = normalize_image_torchvision(image) + return self.model(image) diff --git a/qai_hub_models/models/midas/test.py b/qai_hub_models/models/midas/test.py new file mode 100644 index 00000000..11377780 --- /dev/null +++ b/qai_hub_models/models/midas/test.py @@ -0,0 +1,51 @@ +# --------------------------------------------------------------------- +# Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. +# SPDX-License-Identifier: BSD-3-Clause +# --------------------------------------------------------------------- +import numpy as np +import pytest + +from qai_hub_models.models.midas.app import MidasApp +from qai_hub_models.models.midas.demo import INPUT_IMAGE_ADDRESS +from qai_hub_models.models.midas.demo import main as demo_main +from qai_hub_models.models.midas.model import MODEL_ASSET_VERSION, MODEL_ID, Midas +from qai_hub_models.utils.asset_loaders import CachedWebModelAsset, load_image +from qai_hub_models.utils.testing import skip_clone_repo_check + +OUTPUT_IMAGE_ADDRESS = CachedWebModelAsset.from_asset_store( + MODEL_ID, MODEL_ASSET_VERSION, "midas_output.png" +) + + +# Verify that the output from Torch is as expected. 
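+# A minimal usage sketch of the pieces exercised below (assumes network access for
+# the intel-isl/MiDaS torch.hub checkpoint and for the cached test image):
+#   model = Midas.from_pretrained()
+#   (_, _, h, w) = Midas.get_input_spec()["image"][0]
+#   depth = MidasApp(model, h, w).estimate_depth(load_image(INPUT_IMAGE_ADDRESS), raw_output=True)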
+@skip_clone_repo_check +def test_task(): + (_, _, height, width) = Midas.get_input_spec()["image"][0] + app = MidasApp(Midas.from_pretrained(), height, width) + original_image = load_image(INPUT_IMAGE_ADDRESS) + output_image = app.estimate_depth(original_image) + output_image_oracle = load_image(OUTPUT_IMAGE_ADDRESS) + + np.testing.assert_allclose( + np.asarray(output_image), np.asarray(output_image_oracle), atol=3 + ) + + +@pytest.mark.trace +@skip_clone_repo_check +def test_trace(): + (_, _, height, width) = Midas.get_input_spec()["image"][0] + traced_model = Midas.from_pretrained().convert_to_torchscript(check_trace=False) + app = MidasApp(traced_model, height, width) + original_image = load_image(INPUT_IMAGE_ADDRESS) + output_image = app.estimate_depth(original_image) + output_image_oracle = load_image(OUTPUT_IMAGE_ADDRESS) + + np.testing.assert_allclose( + np.asarray(output_image), np.asarray(output_image_oracle), atol=3 + ) + + +@skip_clone_repo_check +def test_demo(): + demo_main(is_test=True) diff --git a/qai_hub_models/models/mnasnet05/README.md b/qai_hub_models/models/mnasnet05/README.md index 3921fdd5..f17444f4 100644 --- a/qai_hub_models/models/mnasnet05/README.md +++ b/qai_hub_models/models/mnasnet05/README.md @@ -14,6 +14,8 @@ accross various devices, can be found [here](https://aihub.qualcomm.com/models/m a hosted Qualcomm® device. + + ## Example & Usage diff --git a/qai_hub_models/models/mnasnet05/export.py b/qai_hub_models/models/mnasnet05/export.py index bf5d429c..046dda81 100644 --- a/qai_hub_models/models/mnasnet05/export.py +++ b/qai_hub_models/models/mnasnet05/export.py @@ -119,9 +119,16 @@ def export_model( model.eval() source_model = torch.jit.trace(model.to("cpu"), make_torch_inputs(input_spec)) + # Convert outputs from channel last to channel first (preferred I/O format for QNN and TensorFlow Lite) + channel_last_flags = ( + " --force_channel_last_input image_tensor" + if target_runtime != TargetRuntime.ORT + else "" + ) + # 2. 
Compile the model to an on-device asset model_compile_options = model.get_hub_compile_options( - target_runtime, compile_options + " --force_channel_last_input image_tensor" + target_runtime, compile_options + channel_last_flags, hub_device ) print(f"Optimizing model {model_name} to run on-device") submitted_compile_job = hub.submit_compile_job( @@ -159,8 +166,12 @@ def export_model( ) sample_inputs = model.sample_inputs(input_spec) # Convert inputs from channel first to channel last - hub_inputs = transpose_channel_first_to_last( - "image_tensor", sample_inputs, target_runtime + hub_inputs = ( + sample_inputs + if target_runtime == TargetRuntime.ORT + else transpose_channel_first_to_last( + "image_tensor", sample_inputs, target_runtime + ) ) submitted_inference_job = hub.submit_inference_job( model=compile_job.get_target_model(), @@ -197,7 +208,7 @@ def export_model( def main(): warnings.filterwarnings("ignore") - parser = export_parser(model_cls=Model, supports_ort=False) + parser = export_parser(model_cls=Model) args = parser.parse_args() export_model(**vars(args)) diff --git a/qai_hub_models/models/mnasnet05/perf.yaml b/qai_hub_models/models/mnasnet05/perf.yaml index 63b22c4e..8bfa98ec 100644 --- a/qai_hub_models/models/mnasnet05/perf.yaml +++ b/qai_hub_models/models/mnasnet05/perf.yaml @@ -22,6 +22,7 @@ aggregated: - Samsung Galaxy S24 Ultra - Samsung Galaxy S24+ - Samsung Galaxy Tab S8 + - Snapdragon X Elite CRD - Xiaomi 12 - Xiaomi 12 Pro supported_chipsets: @@ -30,15 +31,16 @@ aggregated: - Snapdragon® 8 Gen 2 - Snapdragon® 8 Gen 3 - Snapdragon® 888 + - Snapdragon® X Elite models: - name: MNASNet05 performance_metrics: - torchscript_onnx_tflite: - inference_time: 800.0 - throughput: 1250.0 + inference_time: 771.0 + throughput: 1297.0168612191958 estimated_peak_memory_range: - min: 16384 - max: 1867832 + min: 49152 + max: 2163152 primary_compute_unit: NPU precision: fp16 layer_info: @@ -46,14 +48,14 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 71 - job_id: jopr8w8v5 + job_id: jqpyd384p job_status: Passed torchscript_onnx_qnn: - inference_time: 848.0 - throughput: 1179.245283018868 + inference_time: 824.0 + throughput: 1213.5922330097087 estimated_peak_memory_range: - min: 630784 - max: 4926760 + min: 16384 + max: 45567712 primary_compute_unit: NPU precision: fp16 layer_info: @@ -61,22 +63,22 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 103 - job_id: jqpyrmrr5 + job_id: jogkyx6op job_status: Passed torchscript_onnx_ort: - inference_time: 990.0 - throughput: 1010.10101010101 + inference_time: 768.0 + throughput: 1302.0833333333333 estimated_peak_memory_range: - min: 12288 - max: 21275160 + min: 16384 + max: 18880896 primary_compute_unit: NPU precision: fp16 layer_info: - layers_on_npu: 1 + layers_on_npu: 104 layers_on_gpu: 0 layers_on_cpu: 0 - total_layers: 1 - job_id: j1p8010zg + total_layers: 104 + job_id: j1p3m0ozg job_status: Passed reference_device_info: name: Samsung Galaxy S23 @@ -85,13 +87,13 @@ models: os_name: Android manufacturer: Samsung chipset: Snapdragon® 8 Gen 2 - timestamp: '2024-04-23T18:42:32.613841Z' + timestamp: '2024-05-20T16:35:29.865693Z' - torchscript_onnx_tflite: - inference_time: 530.0 - throughput: 1886.7924528301887 + inference_time: 522.0 + throughput: 1915.7088122605364 estimated_peak_memory_range: - min: 12288 - max: 45612800 + min: 16384 + max: 46214320 primary_compute_unit: NPU precision: fp16 layer_info: @@ -99,14 +101,14 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 71 - job_id: jep20e0xg + job_id: j2p0r0oep 
job_status: Passed torchscript_onnx_qnn: - inference_time: 565.0 - throughput: 1769.9115044247787 + inference_time: 562.0 + throughput: 1779.3594306049822 estimated_peak_memory_range: min: 0 - max: 41195552 + max: 38662336 primary_compute_unit: NPU precision: fp16 layer_info: @@ -114,22 +116,22 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 103 - job_id: j2p03632p + job_id: jn5q2q4m5 job_status: Passed torchscript_onnx_ort: - inference_time: 641.0 - throughput: 1560.0624024960998 + inference_time: 531.0 + throughput: 1883.2391713747645 estimated_peak_memory_range: - min: 24576 - max: 21468016 + min: 634880 + max: 26749664 primary_compute_unit: NPU precision: fp16 layer_info: - layers_on_npu: 1 + layers_on_npu: 104 layers_on_gpu: 0 layers_on_cpu: 0 - total_layers: 1 - job_id: jogk787yp + total_layers: 104 + job_id: jwgov6dd5 job_status: Passed reference_device_info: name: Samsung Galaxy S24 @@ -138,13 +140,13 @@ models: os_name: Android manufacturer: Samsung chipset: Snapdragon® 8 Gen 3 - timestamp: '2024-04-23T18:42:32.613900Z' + timestamp: '2024-05-20T16:35:29.865722Z' - torchscript_onnx_tflite: - inference_time: 799.0 - throughput: 1251.5644555694619 + inference_time: 774.0 + throughput: 1291.9896640826873 estimated_peak_memory_range: - min: 20480 - max: 1900528 + min: 28672 + max: 1977952 primary_compute_unit: NPU precision: fp16 layer_info: @@ -152,14 +154,14 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 71 - job_id: j1p3vrwzg + job_id: j1p87yj85 job_status: Passed torchscript_onnx_qnn: - inference_time: 852.0 - throughput: 1173.7089201877934 + inference_time: 834.0 + throughput: 1199.0407673860911 estimated_peak_memory_range: - min: 0 - max: 47875160 + min: 16384 + max: 24694288 primary_compute_unit: NPU precision: fp16 layer_info: @@ -167,7 +169,7 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 103 - job_id: jlpeenl0p + job_id: jw5614o7p job_status: Passed reference_device_info: name: QCS8550 (Proxy) @@ -176,4 +178,57 @@ models: os_name: Android manufacturer: Qualcomm chipset: Qcs8550 - timestamp: '2024-04-23T18:42:32.613941Z' + timestamp: '2024-05-20T16:35:29.865740Z' + - torchscript_onnx_qnn: + inference_time: 952.0 + throughput: 1050.420168067227 + estimated_peak_memory_range: + min: 602112 + max: 602112 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 103 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 103 + job_id: j1glkmwlp + job_status: Passed + torchscript_onnx_ort: + inference_time: 816.0 + throughput: 1225.4901960784314 + estimated_peak_memory_range: + min: 15839232 + max: 15839232 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 104 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 104 + job_id: j1pvwk2mg + job_status: Passed + torchscript_onnx_ort_dml_gpu: + inference_time: 2632.0 + throughput: 379.9392097264438 + estimated_peak_memory_range: + min: 11706368 + max: 11706368 + primary_compute_unit: GPU + precision: fp16 + layer_info: + layers_on_npu: 0 + layers_on_gpu: 1 + layers_on_cpu: 0 + total_layers: 1 + job_id: j7gjln38p + job_status: Passed + reference_device_info: + name: Snapdragon X Elite CRD + os: '11' + form_factor: Compute + os_name: Windows + manufacturer: Qualcomm + chipset: Snapdragon® X Elite + timestamp: '2024-05-20T16:35:29.865762Z' diff --git a/qai_hub_models/models/mobilenet_v2/README.md b/qai_hub_models/models/mobilenet_v2/README.md index 4fe640ba..4c9f4616 100644 --- a/qai_hub_models/models/mobilenet_v2/README.md +++ 
b/qai_hub_models/models/mobilenet_v2/README.md @@ -14,6 +14,8 @@ accross various devices, can be found [here](https://aihub.qualcomm.com/models/m a hosted Qualcomm® device. + + ## Example & Usage diff --git a/qai_hub_models/models/mobilenet_v2/export.py b/qai_hub_models/models/mobilenet_v2/export.py index 14a6b5ae..5e134e8a 100644 --- a/qai_hub_models/models/mobilenet_v2/export.py +++ b/qai_hub_models/models/mobilenet_v2/export.py @@ -119,9 +119,16 @@ def export_model( model.eval() source_model = torch.jit.trace(model.to("cpu"), make_torch_inputs(input_spec)) + # Convert outputs from channel last to channel first (preferred I/O format for QNN and TensorFlow Lite) + channel_last_flags = ( + " --force_channel_last_input image_tensor" + if target_runtime != TargetRuntime.ORT + else "" + ) + # 2. Compile the model to an on-device asset model_compile_options = model.get_hub_compile_options( - target_runtime, compile_options + " --force_channel_last_input image_tensor" + target_runtime, compile_options + channel_last_flags, hub_device ) print(f"Optimizing model {model_name} to run on-device") submitted_compile_job = hub.submit_compile_job( @@ -159,8 +166,12 @@ def export_model( ) sample_inputs = model.sample_inputs(input_spec) # Convert inputs from channel first to channel last - hub_inputs = transpose_channel_first_to_last( - "image_tensor", sample_inputs, target_runtime + hub_inputs = ( + sample_inputs + if target_runtime == TargetRuntime.ORT + else transpose_channel_first_to_last( + "image_tensor", sample_inputs, target_runtime + ) ) submitted_inference_job = hub.submit_inference_job( model=compile_job.get_target_model(), @@ -197,7 +208,7 @@ def export_model( def main(): warnings.filterwarnings("ignore") - parser = export_parser(model_cls=Model, supports_ort=False) + parser = export_parser(model_cls=Model) args = parser.parse_args() export_model(**vars(args)) diff --git a/qai_hub_models/models/mobilenet_v2/perf.yaml b/qai_hub_models/models/mobilenet_v2/perf.yaml index 27021fb5..3818b729 100644 --- a/qai_hub_models/models/mobilenet_v2/perf.yaml +++ b/qai_hub_models/models/mobilenet_v2/perf.yaml @@ -22,6 +22,7 @@ aggregated: - Samsung Galaxy S24 Ultra - Samsung Galaxy S24+ - Samsung Galaxy Tab S8 + - Snapdragon X Elite CRD - Xiaomi 12 - Xiaomi 12 Pro supported_chipsets: @@ -30,15 +31,16 @@ aggregated: - Snapdragon® 8 Gen 2 - Snapdragon® 8 Gen 3 - Snapdragon® 888 + - Snapdragon® X Elite models: - name: MobileNet-v2 performance_metrics: - torchscript_onnx_tflite: - inference_time: 974.0 - throughput: 1026.694045174538 + inference_time: 935.0 + throughput: 1069.51871657754 estimated_peak_memory_range: min: 20480 - max: 1954912 + max: 1805232 primary_compute_unit: NPU precision: fp16 layer_info: @@ -46,14 +48,14 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 72 - job_id: j1gl6l6eg + job_id: jlpevm605 job_status: Passed torchscript_onnx_qnn: - inference_time: 1281.0 - throughput: 780.64012490242 + inference_time: 1268.0 + throughput: 788.6435331230284 estimated_peak_memory_range: min: 622592 - max: 7823048 + max: 52139528 primary_compute_unit: NPU precision: fp16 layer_info: @@ -61,22 +63,22 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 105 - job_id: j1p3v6vxg + job_id: jmg94nov5 job_status: Passed torchscript_onnx_ort: - inference_time: 1110.0 - throughput: 900.9009009009009 + inference_time: 926.0 + throughput: 1079.913606911447 estimated_peak_memory_range: min: 12288 - max: 31867536 + max: 26577912 primary_compute_unit: NPU precision: fp16 layer_info: - layers_on_npu: 1 + 
layers_on_npu: 105 layers_on_gpu: 0 layers_on_cpu: 0 - total_layers: 1 - job_id: j1pv07075 + total_layers: 105 + job_id: jqp4wr4lg job_status: Passed reference_device_info: name: Samsung Galaxy S23 @@ -85,13 +87,13 @@ models: os_name: Android manufacturer: Samsung chipset: Snapdragon® 8 Gen 2 - timestamp: '2024-04-23T18:42:32.638108Z' + timestamp: '2024-05-20T16:35:29.896619Z' - torchscript_onnx_tflite: - inference_time: 651.0 - throughput: 1536.0983102918588 + inference_time: 622.0 + throughput: 1607.717041800643 estimated_peak_memory_range: - min: 16384 - max: 56986240 + min: 12288 + max: 56265456 primary_compute_unit: NPU precision: fp16 layer_info: @@ -99,14 +101,14 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 72 - job_id: jw56ewevg + job_id: jygz7dz6p job_status: Passed torchscript_onnx_qnn: - inference_time: 836.0 - throughput: 1196.1722488038276 + inference_time: 828.0 + throughput: 1207.729468599034 estimated_peak_memory_range: min: 618496 - max: 42487872 + max: 39673920 primary_compute_unit: NPU precision: fp16 layer_info: @@ -114,22 +116,22 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 105 - job_id: jwgok8k4p + job_id: jnp18zolg job_status: Passed torchscript_onnx_ort: - inference_time: 750.0 - throughput: 1333.3333333333333 + inference_time: 638.0 + throughput: 1567.398119122257 estimated_peak_memory_range: - min: 12288 - max: 22319216 + min: 471040 + max: 25361728 primary_compute_unit: NPU precision: fp16 layer_info: - layers_on_npu: 1 + layers_on_npu: 105 layers_on_gpu: 0 layers_on_cpu: 0 - total_layers: 1 - job_id: j7gjzqz75 + total_layers: 105 + job_id: j0px1or9g job_status: Passed reference_device_info: name: Samsung Galaxy S24 @@ -138,13 +140,13 @@ models: os_name: Android manufacturer: Samsung chipset: Snapdragon® 8 Gen 3 - timestamp: '2024-04-23T18:42:32.638177Z' + timestamp: '2024-05-20T16:35:29.896647Z' - torchscript_onnx_tflite: - inference_time: 957.0 - throughput: 1044.932079414838 + inference_time: 941.0 + throughput: 1062.6992561105208 estimated_peak_memory_range: - min: 57344 - max: 1611720 + min: 28672 + max: 1470792 primary_compute_unit: NPU precision: fp16 layer_info: @@ -152,14 +154,14 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 72 - job_id: jvgdemzl5 + job_id: jz5w96yjp job_status: Passed torchscript_onnx_qnn: - inference_time: 1265.0 - throughput: 790.5138339920949 + inference_time: 1269.0 + throughput: 788.0220646178093 estimated_peak_memory_range: - min: 618496 - max: 128931224 + min: 16384 + max: 143728912 primary_compute_unit: NPU precision: fp16 layer_info: @@ -167,7 +169,7 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 105 - job_id: jo5mql8qp + job_id: jz57drnr5 job_status: Passed reference_device_info: name: QCS8550 (Proxy) @@ -176,4 +178,57 @@ models: os_name: Android manufacturer: Qualcomm chipset: Qcs8550 - timestamp: '2024-04-23T18:42:32.638216Z' + timestamp: '2024-05-20T16:35:29.896664Z' + - torchscript_onnx_qnn: + inference_time: 1516.0 + throughput: 659.6306068601583 + estimated_peak_memory_range: + min: 602112 + max: 602112 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 105 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 105 + job_id: jvgdv16lg + job_status: Passed + torchscript_onnx_ort: + inference_time: 994.0 + throughput: 1006.0362173038229 + estimated_peak_memory_range: + min: 17502208 + max: 17502208 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 105 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 105 + job_id: 
jo5mzxkqp + job_status: Passed + torchscript_onnx_ort_dml_gpu: + inference_time: 8063.0 + throughput: 124.0233163834801 + estimated_peak_memory_range: + min: 798720 + max: 798720 + primary_compute_unit: CPU + precision: fp32 + layer_info: + layers_on_npu: 0 + layers_on_gpu: 0 + layers_on_cpu: 57 + total_layers: 57 + job_id: jegnevqmg + job_status: Passed + reference_device_info: + name: Snapdragon X Elite CRD + os: '11' + form_factor: Compute + os_name: Windows + manufacturer: Qualcomm + chipset: Snapdragon® X Elite + timestamp: '2024-05-20T16:35:29.896687Z' diff --git a/qai_hub_models/models/mobilenet_v2_quantized/README.md b/qai_hub_models/models/mobilenet_v2_quantized/README.md index 3fd6afea..2f07b35a 100644 --- a/qai_hub_models/models/mobilenet_v2_quantized/README.md +++ b/qai_hub_models/models/mobilenet_v2_quantized/README.md @@ -14,6 +14,8 @@ accross various devices, can be found [here](https://aihub.qualcomm.com/models/m a hosted Qualcomm® device. + + ## Example & Usage diff --git a/qai_hub_models/models/mobilenet_v2_quantized/export.py b/qai_hub_models/models/mobilenet_v2_quantized/export.py index 1d8b1899..4db91c07 100644 --- a/qai_hub_models/models/mobilenet_v2_quantized/export.py +++ b/qai_hub_models/models/mobilenet_v2_quantized/export.py @@ -123,9 +123,16 @@ def export_model( else: quant_calibration_data = model.get_calibration_data(target_runtime, input_spec) + # Convert outputs from channel last to channel first (preferred I/O format for QNN and TensorFlow Lite) + channel_last_flags = ( + " --force_channel_last_input image_tensor" + if target_runtime != TargetRuntime.ORT + else "" + ) + # 2. Compile the model to an on-device asset model_compile_options = model.get_hub_compile_options( - target_runtime, compile_options + " --force_channel_last_input image_tensor" + target_runtime, compile_options + channel_last_flags, hub_device ) print(f"Optimizing model {model_name} to run on-device") submitted_compile_job = hub.submit_compile_job( @@ -167,8 +174,12 @@ def export_model( if target_runtime == TargetRuntime.QNN: hub_inputs = get_qnn_inputs(compile_job, sample_inputs) # Convert inputs from channel first to channel last - hub_inputs = transpose_channel_first_to_last( - "image_tensor", sample_inputs, target_runtime + hub_inputs = ( + sample_inputs + if target_runtime == TargetRuntime.ORT + else transpose_channel_first_to_last( + "image_tensor", sample_inputs, target_runtime + ) ) submitted_inference_job = hub.submit_inference_job( model=compile_job.get_target_model(), @@ -207,7 +218,7 @@ def export_model( def main(): warnings.filterwarnings("ignore") - parser = export_parser(model_cls=Model, supports_ort=False) + parser = export_parser(model_cls=Model) args = parser.parse_args() export_model(**vars(args)) diff --git a/qai_hub_models/models/mobilenet_v2_quantized/perf.yaml b/qai_hub_models/models/mobilenet_v2_quantized/perf.yaml index 361584aa..c7088b78 100644 --- a/qai_hub_models/models/mobilenet_v2_quantized/perf.yaml +++ b/qai_hub_models/models/mobilenet_v2_quantized/perf.yaml @@ -26,6 +26,7 @@ aggregated: - Samsung Galaxy S24 Ultra - Samsung Galaxy S24+ - Samsung Galaxy Tab S8 + - Snapdragon X Elite CRD - Xiaomi 12 - Xiaomi 12 Pro supported_chipsets: @@ -36,15 +37,16 @@ aggregated: - Snapdragon® 8 Gen 2 - Snapdragon® 8 Gen 3 - Snapdragon® 888 + - Snapdragon® X Elite models: - name: MobileNet-v2-Quantized performance_metrics: - torchscript_onnx_tflite: - inference_time: 302.0 - throughput: 3311.2582781456954 + inference_time: 295.0 + throughput: 3389.830508474576 
estimated_peak_memory_range: - min: 16384 - max: 1568424 + min: 40960 + max: 6698304 primary_compute_unit: NPU precision: int8 layer_info: @@ -52,14 +54,14 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 72 - job_id: jygzonoz5 + job_id: jopry3deg job_status: Passed torchscript_onnx_qnn: - inference_time: 666.0 - throughput: 1501.5015015015015 + inference_time: 654.0 + throughput: 1529.051987767584 estimated_peak_memory_range: - min: 12288 - max: 75287400 + min: 172032 + max: 5185880 primary_compute_unit: NPU precision: int8 layer_info: @@ -67,22 +69,22 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 71 - job_id: jmg9jdjq5 + job_id: j2p0r09ep job_status: Passed torchscript_onnx_ort: - inference_time: 897.0 - throughput: 1114.8272017837235 + inference_time: 634.0 + throughput: 1577.2870662460568 estimated_peak_memory_range: - min: 12288 - max: 146664848 + min: 200704 + max: 21639208 primary_compute_unit: NPU precision: int8 layer_info: - layers_on_npu: 1 + layers_on_npu: 77 layers_on_gpu: 0 layers_on_cpu: 0 - total_layers: 1 - job_id: jvgde2ek5 + total_layers: 77 + job_id: j1glkm8lp job_status: Passed reference_device_info: name: Samsung Galaxy S23 @@ -91,13 +93,13 @@ models: os_name: Android manufacturer: Samsung chipset: Snapdragon® 8 Gen 2 - timestamp: '2024-04-23T18:42:32.662420Z' + timestamp: '2024-05-20T16:35:29.927306Z' - torchscript_onnx_tflite: - inference_time: 233.0 - throughput: 4291.845493562232 + inference_time: 238.0 + throughput: 4201.680672268908 estimated_peak_memory_range: min: 12288 - max: 37162256 + max: 37430768 primary_compute_unit: NPU precision: int8 layer_info: @@ -105,14 +107,14 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 72 - job_id: jz5w242z5 + job_id: jep2mydm5 job_status: Passed torchscript_onnx_qnn: - inference_time: 480.0 - throughput: 2083.3333333333335 + inference_time: 474.0 + throughput: 2109.7046413502107 estimated_peak_memory_range: - min: 159744 - max: 36918192 + min: 163840 + max: 38345472 primary_compute_unit: NPU precision: int8 layer_info: @@ -120,22 +122,22 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 71 - job_id: jnp1y6ykp + job_id: j1p87yr85 job_status: Passed torchscript_onnx_ort: - inference_time: 644.0 - throughput: 1552.7950310559006 + inference_time: 463.0 + throughput: 2159.827213822894 estimated_peak_memory_range: min: 0 - max: 18572416 + max: 22362560 primary_compute_unit: NPU precision: int8 layer_info: - layers_on_npu: 1 + layers_on_npu: 77 layers_on_gpu: 0 layers_on_cpu: 0 - total_layers: 1 - job_id: jz57090qg + total_layers: 77 + job_id: jw5614m7p job_status: Passed reference_device_info: name: Samsung Galaxy S24 @@ -144,51 +146,74 @@ models: os_name: Android manufacturer: Samsung chipset: Snapdragon® 8 Gen 3 - timestamp: '2024-04-23T18:42:32.662476Z' + timestamp: '2024-05-20T16:35:29.927332Z' - torchscript_onnx_tflite: - inference_time: 949.0 - throughput: 1053.740779768177 + inference_time: 296.0 + throughput: 3378.3783783783783 estimated_peak_memory_range: min: 12288 - max: 23229040 + max: 1719624 primary_compute_unit: NPU precision: int8 layer_info: - layers_on_npu: 74 + layers_on_npu: 72 layers_on_gpu: 0 layers_on_cpu: 0 - total_layers: 74 - job_id: jogk7k8op + total_layers: 72 + job_id: jqpyd324p job_status: Passed torchscript_onnx_qnn: - inference_time: 'null' - throughput: 'null' + inference_time: 659.0 + throughput: 1517.4506828528072 estimated_peak_memory_range: - min: 0 - max: 0 - primary_compute_unit: 'null' - precision: 'null' + min: 20480 + max: 75350840 + 
primary_compute_unit: NPU + precision: int8 layer_info: - layers_on_npu: 0 + layers_on_npu: 71 layers_on_gpu: 0 layers_on_cpu: 0 - total_layers: 0 - job_id: jwgok98dp - job_status: Failed - torchscript_onnx_ort: - inference_time: 6507.0 - throughput: 153.68065160596282 + total_layers: 71 + job_id: jn5q2q1m5 + job_status: Passed + reference_device_info: + name: QCS8550 (Proxy) + os: '12' + form_factor: Iot + os_name: Android + manufacturer: Qualcomm + chipset: Qcs8550 + timestamp: '2024-05-20T16:35:29.927350Z' + - torchscript_onnx_tflite: + inference_time: 853.0 + throughput: 1172.3329425556858 estimated_peak_memory_range: - min: 335872 - max: 43247408 - primary_compute_unit: CPU - precision: fp32 + min: 12288 + max: 23360768 + primary_compute_unit: NPU + precision: int8 layer_info: - layers_on_npu: 0 + layers_on_npu: 72 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 72 + job_id: j0pxyrqlg + job_status: Passed + torchscript_onnx_qnn: + inference_time: 1400.0 + throughput: 714.2857142857143 + estimated_peak_memory_range: + min: 0 + max: 34410432 + primary_compute_unit: NPU + precision: int8 + layer_info: + layers_on_npu: 71 layers_on_gpu: 0 - layers_on_cpu: 84 - total_layers: 84 - job_id: jqp4k3kqg + layers_on_cpu: 0 + total_layers: 71 + job_id: j2p0l9wnp job_status: Passed reference_device_info: name: RB3 Gen 2 (Proxy) @@ -197,21 +222,21 @@ models: os_name: Android manufacturer: Qualcomm chipset: Qcs6490 - timestamp: '2024-04-23T18:42:32.662525Z' + timestamp: '2024-05-20T16:35:29.927366Z' - torchscript_onnx_tflite: - inference_time: 7442.0 - throughput: 134.37248051599033 + inference_time: 7603.0 + throughput: 131.5270288044193 estimated_peak_memory_range: - min: 12288 - max: 11587968 + min: 20480 + max: 11376824 primary_compute_unit: NPU precision: int8 layer_info: - layers_on_npu: 72 + layers_on_npu: 70 layers_on_gpu: 2 layers_on_cpu: 0 - total_layers: 74 - job_id: j1gl2wkep + total_layers: 72 + job_id: jo5m3k79g job_status: Passed reference_device_info: name: RB5 (Proxy) @@ -220,42 +245,57 @@ models: os_name: Android manufacturer: Qualcomm chipset: Qcs8250 - timestamp: '2024-04-23T18:42:32.662544Z' - - torchscript_onnx_tflite: - inference_time: 325.0 - throughput: 3076.923076923077 + timestamp: '2024-05-20T16:35:29.927377Z' + - torchscript_onnx_qnn: + inference_time: 762.0 + throughput: 1312.3359580052493 estimated_peak_memory_range: - min: 20480 - max: 1768808 + min: 573440 + max: 573440 primary_compute_unit: NPU precision: int8 layer_info: - layers_on_npu: 74 + layers_on_npu: 71 layers_on_gpu: 0 layers_on_cpu: 0 - total_layers: 74 - job_id: j1p80k18g + total_layers: 71 + job_id: jogkyx0op job_status: Passed - torchscript_onnx_qnn: - inference_time: 695.0 - throughput: 1438.8489208633093 + torchscript_onnx_ort: + inference_time: 677.0 + throughput: 1477.1048744460857 estimated_peak_memory_range: - min: 20480 - max: 131789128 + min: 19963904 + max: 19963904 primary_compute_unit: NPU precision: int8 layer_info: - layers_on_npu: 73 + layers_on_npu: 77 layers_on_gpu: 0 layers_on_cpu: 0 - total_layers: 73 - job_id: jw56e0w7g + total_layers: 77 + job_id: j1p3m07zg + job_status: Passed + torchscript_onnx_ort_dml_gpu: + inference_time: 43191.0 + throughput: 23.15297168391563 + estimated_peak_memory_range: + min: 20062208 + max: 20062208 + primary_compute_unit: GPU + precision: int8 + layer_info: + layers_on_npu: 0 + layers_on_gpu: 1 + layers_on_cpu: 0 + total_layers: 1 + job_id: jwgov6wd5 job_status: Passed reference_device_info: - name: QCS8550 (Proxy) - os: '12' - form_factor: Iot - 
os_name: Android + name: Snapdragon X Elite CRD + os: '11' + form_factor: Compute + os_name: Windows manufacturer: Qualcomm - chipset: Qcs8550 - timestamp: '2024-04-23T18:42:32.662578Z' + chipset: Snapdragon® X Elite + timestamp: '2024-05-20T16:35:29.927401Z' diff --git a/qai_hub_models/models/mobilenet_v3_large/README.md b/qai_hub_models/models/mobilenet_v3_large/README.md index 7cb2fa15..bf675b22 100644 --- a/qai_hub_models/models/mobilenet_v3_large/README.md +++ b/qai_hub_models/models/mobilenet_v3_large/README.md @@ -14,6 +14,8 @@ accross various devices, can be found [here](https://aihub.qualcomm.com/models/m a hosted Qualcomm® device. + + ## Example & Usage diff --git a/qai_hub_models/models/mobilenet_v3_large/export.py b/qai_hub_models/models/mobilenet_v3_large/export.py index c75123cf..28b64384 100644 --- a/qai_hub_models/models/mobilenet_v3_large/export.py +++ b/qai_hub_models/models/mobilenet_v3_large/export.py @@ -119,9 +119,16 @@ def export_model( model.eval() source_model = torch.jit.trace(model.to("cpu"), make_torch_inputs(input_spec)) + # Convert outputs from channel last to channel first (preferred I/O format for QNN and TensorFlow Lite) + channel_last_flags = ( + " --force_channel_last_input image_tensor" + if target_runtime != TargetRuntime.ORT + else "" + ) + # 2. Compile the model to an on-device asset model_compile_options = model.get_hub_compile_options( - target_runtime, compile_options + " --force_channel_last_input image_tensor" + target_runtime, compile_options + channel_last_flags, hub_device ) print(f"Optimizing model {model_name} to run on-device") submitted_compile_job = hub.submit_compile_job( @@ -159,8 +166,12 @@ def export_model( ) sample_inputs = model.sample_inputs(input_spec) # Convert inputs from channel first to channel last - hub_inputs = transpose_channel_first_to_last( - "image_tensor", sample_inputs, target_runtime + hub_inputs = ( + sample_inputs + if target_runtime == TargetRuntime.ORT + else transpose_channel_first_to_last( + "image_tensor", sample_inputs, target_runtime + ) ) submitted_inference_job = hub.submit_inference_job( model=compile_job.get_target_model(), @@ -197,7 +208,7 @@ def export_model( def main(): warnings.filterwarnings("ignore") - parser = export_parser(model_cls=Model, supports_qnn=False, supports_ort=False) + parser = export_parser(model_cls=Model) args = parser.parse_args() export_model(**vars(args)) diff --git a/qai_hub_models/models/mobilenet_v3_large/perf.yaml b/qai_hub_models/models/mobilenet_v3_large/perf.yaml index c1e8a4af..07819107 100644 --- a/qai_hub_models/models/mobilenet_v3_large/perf.yaml +++ b/qai_hub_models/models/mobilenet_v3_large/perf.yaml @@ -22,6 +22,7 @@ aggregated: - Samsung Galaxy S24 Ultra - Samsung Galaxy S24+ - Samsung Galaxy Tab S8 + - Snapdragon X Elite CRD - Xiaomi 12 - Xiaomi 12 Pro supported_chipsets: @@ -30,15 +31,16 @@ aggregated: - Snapdragon® 8 Gen 2 - Snapdragon® 8 Gen 3 - Snapdragon® 888 + - Snapdragon® X Elite models: - name: MobileNet-v3-Large performance_metrics: - torchscript_onnx_tflite: - inference_time: 1022.0 - throughput: 978.4735812133073 + inference_time: 1002.0 + throughput: 998.003992015968 estimated_peak_memory_range: - min: 16384 - max: 1643944 + min: 12288 + max: 1963520 primary_compute_unit: NPU precision: fp16 layer_info: @@ -46,22 +48,37 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 136 - job_id: j0pxnxnj5 + job_id: j1pvwkmmg job_status: Passed - torchscript_onnx_ort: - inference_time: 3790.0 - throughput: 263.85224274406335 + torchscript_onnx_qnn: + 
inference_time: 1037.0 + throughput: 964.3201542912246 estimated_peak_memory_range: min: 0 - max: 28283024 + max: 68891008 primary_compute_unit: NPU precision: fp16 layer_info: - layers_on_npu: 9 + layers_on_npu: 144 layers_on_gpu: 0 - layers_on_cpu: 8 - total_layers: 17 - job_id: jegnlkmv5 + layers_on_cpu: 0 + total_layers: 144 + job_id: jygz7dy6p + job_status: Passed + torchscript_onnx_ort: + inference_time: 1038.0 + throughput: 963.3911368015414 + estimated_peak_memory_range: + min: 12288 + max: 87795632 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 162 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 162 + job_id: jvgdv14lg job_status: Passed reference_device_info: name: Samsung Galaxy S23 @@ -70,13 +87,13 @@ models: os_name: Android manufacturer: Samsung chipset: Snapdragon® 8 Gen 2 - timestamp: '2024-04-23T18:42:32.697406Z' + timestamp: '2024-05-20T16:35:29.966635Z' - torchscript_onnx_tflite: - inference_time: 691.0 - throughput: 1447.178002894356 + inference_time: 702.0 + throughput: 1424.5014245014245 estimated_peak_memory_range: - min: 16384 - max: 61060464 + min: 12288 + max: 61294288 primary_compute_unit: NPU precision: fp16 layer_info: @@ -84,22 +101,37 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 136 - job_id: jo5mq8qyp + job_id: j7gjlny8p + job_status: Passed + torchscript_onnx_qnn: + inference_time: 712.0 + throughput: 1404.4943820224719 + estimated_peak_memory_range: + min: 0 + max: 51701120 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 144 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 144 + job_id: jz5w96zjp job_status: Passed torchscript_onnx_ort: - inference_time: 2766.0 - throughput: 361.53289949385396 + inference_time: 719.0 + throughput: 1390.8205841446454 estimated_peak_memory_range: - min: 12288 - max: 25734304 + min: 618496 + max: 32246576 primary_compute_unit: NPU precision: fp16 layer_info: - layers_on_npu: 9 + layers_on_npu: 162 layers_on_gpu: 0 - layers_on_cpu: 8 - total_layers: 17 - job_id: jopr8w2v5 + layers_on_cpu: 0 + total_layers: 162 + job_id: jz57dr8r5 job_status: Passed reference_device_info: name: Samsung Galaxy S24 @@ -108,13 +140,13 @@ models: os_name: Android manufacturer: Samsung chipset: Snapdragon® 8 Gen 3 - timestamp: '2024-04-23T18:42:32.697460Z' + timestamp: '2024-05-20T16:35:29.966661Z' - torchscript_onnx_tflite: - inference_time: 1022.0 - throughput: 978.4735812133073 + inference_time: 1001.0 + throughput: 999.000999000999 estimated_peak_memory_range: - min: 24576 - max: 1929640 + min: 20480 + max: 1880160 primary_compute_unit: NPU precision: fp16 layer_info: @@ -122,7 +154,22 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 136 - job_id: jz5w2r4j5 + job_id: jlpevmx05 + job_status: Passed + torchscript_onnx_qnn: + inference_time: 1041.0 + throughput: 960.6147934678194 + estimated_peak_memory_range: + min: 20480 + max: 47502336 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 144 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 144 + job_id: jnp18z1lg job_status: Passed reference_device_info: name: QCS8550 (Proxy) @@ -131,4 +178,57 @@ models: os_name: Android manufacturer: Qualcomm chipset: Qcs8550 - timestamp: '2024-04-23T18:42:32.697490Z' + timestamp: '2024-05-20T16:35:29.966678Z' + - torchscript_onnx_qnn: + inference_time: 1207.0 + throughput: 828.5004142502071 + estimated_peak_memory_range: + min: 602112 + max: 602112 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 144 + 
layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 144 + job_id: jmg94n2v5 + job_status: Passed + torchscript_onnx_ort: + inference_time: 1108.0 + throughput: 902.5270758122743 + estimated_peak_memory_range: + min: 54001664 + max: 54001664 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 162 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 162 + job_id: jqp4wr2lg + job_status: Passed + torchscript_onnx_ort_dml_gpu: + inference_time: 9897.0 + throughput: 101.0407194099222 + estimated_peak_memory_range: + min: 1593344 + max: 1593344 + primary_compute_unit: CPU + precision: fp32 + layer_info: + layers_on_npu: 0 + layers_on_gpu: 0 + layers_on_cpu: 126 + total_layers: 126 + job_id: j0px1oz9g + job_status: Passed + reference_device_info: + name: Snapdragon X Elite CRD + os: '11' + form_factor: Compute + os_name: Windows + manufacturer: Qualcomm + chipset: Snapdragon® X Elite + timestamp: '2024-05-20T16:35:29.966700Z' diff --git a/qai_hub_models/models/mobilenet_v3_large_quantized/README.md b/qai_hub_models/models/mobilenet_v3_large_quantized/README.md index 1c9f24d6..8873f0b0 100644 --- a/qai_hub_models/models/mobilenet_v3_large_quantized/README.md +++ b/qai_hub_models/models/mobilenet_v3_large_quantized/README.md @@ -14,6 +14,8 @@ accross various devices, can be found [here](https://aihub.qualcomm.com/models/m a hosted Qualcomm® device. + + ## Example & Usage diff --git a/qai_hub_models/models/mobilenet_v3_large_quantized/export.py b/qai_hub_models/models/mobilenet_v3_large_quantized/export.py index 9555199e..0b17aeb4 100644 --- a/qai_hub_models/models/mobilenet_v3_large_quantized/export.py +++ b/qai_hub_models/models/mobilenet_v3_large_quantized/export.py @@ -30,6 +30,7 @@ from qai_hub_models.utils.qai_hub_helpers import ( can_access_qualcomm_ai_hub, export_without_hub_access, + transpose_channel_first_to_last, ) from qai_hub_models.utils.qnn_helpers import get_qnn_inputs @@ -122,9 +123,16 @@ def export_model( else: quant_calibration_data = model.get_calibration_data(target_runtime, input_spec) + # Convert outputs from channel last to channel first (preferred I/O format for QNN and TensorFlow Lite) + channel_last_flags = ( + " --force_channel_last_input image_tensor" + if target_runtime != TargetRuntime.ORT + else "" + ) + # 2. 
Compile the model to an on-device asset model_compile_options = model.get_hub_compile_options( - target_runtime, compile_options + target_runtime, compile_options + channel_last_flags, hub_device ) print(f"Optimizing model {model_name} to run on-device") submitted_compile_job = hub.submit_compile_job( @@ -165,6 +173,14 @@ def export_model( hub_inputs = sample_inputs if target_runtime == TargetRuntime.QNN: hub_inputs = get_qnn_inputs(compile_job, sample_inputs) + # Convert inputs from channel first to channel last + hub_inputs = ( + sample_inputs + if target_runtime == TargetRuntime.ORT + else transpose_channel_first_to_last( + "image_tensor", sample_inputs, target_runtime + ) + ) submitted_inference_job = hub.submit_inference_job( model=compile_job.get_target_model(), inputs=hub_inputs, @@ -200,7 +216,7 @@ def export_model( def main(): warnings.filterwarnings("ignore") - parser = export_parser(model_cls=Model, supports_qnn=False, supports_ort=False) + parser = export_parser(model_cls=Model) args = parser.parse_args() export_model(**vars(args)) diff --git a/qai_hub_models/models/mobilenet_v3_large_quantized/perf.yaml b/qai_hub_models/models/mobilenet_v3_large_quantized/perf.yaml index eabb0a70..fad12147 100644 --- a/qai_hub_models/models/mobilenet_v3_large_quantized/perf.yaml +++ b/qai_hub_models/models/mobilenet_v3_large_quantized/perf.yaml @@ -26,6 +26,7 @@ aggregated: - Samsung Galaxy S24 Ultra - Samsung Galaxy S24+ - Samsung Galaxy Tab S8 + - Snapdragon X Elite CRD - Xiaomi 12 - Xiaomi 12 Pro supported_chipsets: @@ -36,38 +37,54 @@ aggregated: - Snapdragon® 8 Gen 2 - Snapdragon® 8 Gen 3 - Snapdragon® 888 + - Snapdragon® X Elite models: - name: MobileNet-v3-Large-Quantized performance_metrics: - torchscript_onnx_tflite: - inference_time: 585.0 - throughput: 1709.4017094017095 + inference_time: 357.0 + throughput: 2801.1204481792715 + estimated_peak_memory_range: + min: 16384 + max: 2663832 + primary_compute_unit: NPU + precision: int8 + layer_info: + layers_on_npu: 135 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 135 + job_id: jo5mzxlqp + job_status: Passed + torchscript_onnx_qnn: + inference_time: 623.0 + throughput: 1605.1364365971108 estimated_peak_memory_range: min: 12288 - max: 1681920 + max: 7124224 primary_compute_unit: NPU precision: int8 layer_info: - layers_on_npu: 136 + layers_on_npu: 126 layers_on_gpu: 0 layers_on_cpu: 0 - total_layers: 136 - job_id: jqpyrmjr5 + total_layers: 126 + job_id: jep2myzm5 job_status: Passed torchscript_onnx_ort: - inference_time: 6430.0 - throughput: 155.52099533437013 + inference_time: 5302.0 + throughput: 188.6080724254998 estimated_peak_memory_range: - min: 15818752 - max: 29085400 + min: 15572992 + max: 31527200 primary_compute_unit: NPU precision: int8 layer_info: - layers_on_npu: 9 + layers_on_npu: 150 layers_on_gpu: 0 - layers_on_cpu: 8 - total_layers: 17 - job_id: j1p801mzg + layers_on_cpu: 24 + total_layers: 174 + job_id: jogkyxkop job_status: Passed reference_device_info: name: Samsung Galaxy S23 @@ -76,36 +93,51 @@ models: os_name: Android manufacturer: Samsung chipset: Snapdragon® 8 Gen 2 - timestamp: '2024-04-23T18:42:32.715082Z' + timestamp: '2024-05-20T16:35:29.996961Z' - torchscript_onnx_tflite: - inference_time: 413.0 - throughput: 2421.3075060532688 + inference_time: 277.0 + throughput: 3610.1083032490974 estimated_peak_memory_range: min: 12288 - max: 46829184 + max: 47595728 primary_compute_unit: NPU precision: int8 layer_info: - layers_on_npu: 136 + layers_on_npu: 135 layers_on_gpu: 0 layers_on_cpu: 0 - total_layers: 
136 - job_id: j2p03622p + total_layers: 135 + job_id: jegnevwmg + job_status: Passed + torchscript_onnx_qnn: + inference_time: 452.0 + throughput: 2212.3893805309735 + estimated_peak_memory_range: + min: 0 + max: 45251296 + primary_compute_unit: NPU + precision: int8 + layer_info: + layers_on_npu: 126 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 126 + job_id: jqpyd3y4p job_status: Passed torchscript_onnx_ort: - inference_time: 4730.0 - throughput: 211.41649048625794 + inference_time: 4131.0 + throughput: 242.0721374969741 estimated_peak_memory_range: - min: 21893120 - max: 53274160 + min: 21827584 + max: 58653840 primary_compute_unit: NPU precision: int8 layer_info: - layers_on_npu: 9 + layers_on_npu: 150 layers_on_gpu: 0 - layers_on_cpu: 8 - total_layers: 17 - job_id: jogk78qyp + layers_on_cpu: 24 + total_layers: 174 + job_id: jn5q2qdm5 job_status: Passed reference_device_info: name: Samsung Galaxy S24 @@ -114,88 +146,156 @@ models: os_name: Android manufacturer: Samsung chipset: Snapdragon® 8 Gen 3 - timestamp: '2024-04-23T18:42:32.715131Z' + timestamp: '2024-05-20T16:35:29.996991Z' - torchscript_onnx_tflite: - inference_time: 1547.0 - throughput: 646.4124111182934 + inference_time: 351.0 + throughput: 2849.002849002849 estimated_peak_memory_range: min: 12288 - max: 28081232 + max: 1686776 primary_compute_unit: NPU precision: int8 layer_info: - layers_on_npu: 138 + layers_on_npu: 135 layers_on_gpu: 0 layers_on_cpu: 0 - total_layers: 138 - job_id: jegnlwnm5 + total_layers: 135 + job_id: jopry37eg job_status: Passed - torchscript_onnx_ort: - inference_time: 10400.0 - throughput: 96.15384615384616 + torchscript_onnx_qnn: + inference_time: 624.0 + throughput: 1602.5641025641025 estimated_peak_memory_range: - min: 11681792 - max: 108762160 - primary_compute_unit: CPU - precision: fp32 + min: 24576 + max: 15252232 + primary_compute_unit: NPU + precision: int8 layer_info: - layers_on_npu: 0 + layers_on_npu: 126 layers_on_gpu: 0 - layers_on_cpu: 218 - total_layers: 218 - job_id: jn5qevr75 + layers_on_cpu: 0 + total_layers: 126 + job_id: j1p87yk85 job_status: Passed reference_device_info: - name: RB3 Gen 2 (Proxy) + name: QCS8550 (Proxy) os: '12' form_factor: Iot os_name: Android manufacturer: Qualcomm - chipset: Qcs6490 - timestamp: '2024-04-23T18:42:32.715190Z' + chipset: Qcs8550 + timestamp: '2024-05-20T16:35:29.997008Z' - torchscript_onnx_tflite: - inference_time: 5306.0 - throughput: 188.46588767433096 + inference_time: 1189.0 + throughput: 841.0428931875525 estimated_peak_memory_range: - min: 40960 - max: 2748408 + min: 12288 + max: 28245440 primary_compute_unit: NPU precision: int8 layer_info: - layers_on_npu: 138 + layers_on_npu: 135 layers_on_gpu: 0 layers_on_cpu: 0 - total_layers: 138 - job_id: jw56zo1vg + total_layers: 135 + job_id: jqp4v4z1p job_status: Passed + torchscript_onnx_qnn: + inference_time: 'null' + throughput: 'null' + estimated_peak_memory_range: + min: 0 + max: 0 + primary_compute_unit: 'null' + precision: 'null' + layer_info: + layers_on_npu: 0 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 0 + job_id: jqpy629l5 + job_status: Failed reference_device_info: - name: RB5 (Proxy) + name: RB3 Gen 2 (Proxy) os: '12' form_factor: Iot os_name: Android manufacturer: Qualcomm - chipset: Qcs8250 - timestamp: '2024-04-23T18:42:32.715215Z' + chipset: Qcs6490 + timestamp: '2024-05-20T16:35:29.997028Z' - torchscript_onnx_tflite: - inference_time: 667.0 - throughput: 1499.2503748125937 + inference_time: 6580.0 + throughput: 151.9756838905775 estimated_peak_memory_range: 
- min: 40960 - max: 1853728 + min: 45056 + max: 10222544 primary_compute_unit: NPU precision: int8 layer_info: - layers_on_npu: 138 + layers_on_npu: 135 layers_on_gpu: 0 layers_on_cpu: 0 - total_layers: 138 - job_id: jo5mqlmqp + total_layers: 135 + job_id: j0pxyrwlg job_status: Passed reference_device_info: - name: QCS8550 (Proxy) + name: RB5 (Proxy) os: '12' form_factor: Iot os_name: Android manufacturer: Qualcomm - chipset: Qcs8550 - timestamp: '2024-04-23T18:42:32.715252Z' + chipset: Qcs8250 + timestamp: '2024-05-20T16:35:29.997039Z' + - torchscript_onnx_qnn: + inference_time: 705.0 + throughput: 1418.4397163120568 + estimated_peak_memory_range: + min: 520192 + max: 520192 + primary_compute_unit: NPU + precision: int8 + layer_info: + layers_on_npu: 126 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 126 + job_id: j2p0r0xep + job_status: Passed + torchscript_onnx_ort: + inference_time: 4772.0 + throughput: 209.55574182732607 + estimated_peak_memory_range: + min: 25464832 + max: 25464832 + primary_compute_unit: NPU + precision: int8 + layer_info: + layers_on_npu: 150 + layers_on_gpu: 0 + layers_on_cpu: 24 + total_layers: 174 + job_id: j1glkmqlp + job_status: Passed + torchscript_onnx_ort_dml_gpu: + inference_time: 28805.0 + throughput: 34.71619510501649 + estimated_peak_memory_range: + min: 20099072 + max: 20099072 + primary_compute_unit: GPU + precision: int8 + layer_info: + layers_on_npu: 0 + layers_on_gpu: 1 + layers_on_cpu: 0 + total_layers: 1 + job_id: jw561407p + job_status: Passed + reference_device_info: + name: Snapdragon X Elite CRD + os: '11' + form_factor: Compute + os_name: Windows + manufacturer: Qualcomm + chipset: Snapdragon® X Elite + timestamp: '2024-05-20T16:35:29.997063Z' diff --git a/qai_hub_models/models/mobilenet_v3_small/README.md b/qai_hub_models/models/mobilenet_v3_small/README.md index cb3ce811..9d058839 100644 --- a/qai_hub_models/models/mobilenet_v3_small/README.md +++ b/qai_hub_models/models/mobilenet_v3_small/README.md @@ -14,6 +14,8 @@ accross various devices, can be found [here](https://aihub.qualcomm.com/models/m a hosted Qualcomm® device. + + ## Example & Usage diff --git a/qai_hub_models/models/mobilenet_v3_small/export.py b/qai_hub_models/models/mobilenet_v3_small/export.py index c1a8b1ad..57bde8ba 100644 --- a/qai_hub_models/models/mobilenet_v3_small/export.py +++ b/qai_hub_models/models/mobilenet_v3_small/export.py @@ -119,9 +119,16 @@ def export_model( model.eval() source_model = torch.jit.trace(model.to("cpu"), make_torch_inputs(input_spec)) + # Convert outputs from channel last to channel first (preferred I/O format for QNN and TensorFlow Lite) + channel_last_flags = ( + " --force_channel_last_input image_tensor" + if target_runtime != TargetRuntime.ORT + else "" + ) + # 2. 
Compile the model to an on-device asset model_compile_options = model.get_hub_compile_options( - target_runtime, compile_options + " --force_channel_last_input image_tensor" + target_runtime, compile_options + channel_last_flags, hub_device ) print(f"Optimizing model {model_name} to run on-device") submitted_compile_job = hub.submit_compile_job( @@ -159,8 +166,12 @@ def export_model( ) sample_inputs = model.sample_inputs(input_spec) # Convert inputs from channel first to channel last - hub_inputs = transpose_channel_first_to_last( - "image_tensor", sample_inputs, target_runtime + hub_inputs = ( + sample_inputs + if target_runtime == TargetRuntime.ORT + else transpose_channel_first_to_last( + "image_tensor", sample_inputs, target_runtime + ) ) submitted_inference_job = hub.submit_inference_job( model=compile_job.get_target_model(), @@ -197,7 +208,7 @@ def export_model( def main(): warnings.filterwarnings("ignore") - parser = export_parser(model_cls=Model, supports_qnn=False, supports_ort=False) + parser = export_parser(model_cls=Model) args = parser.parse_args() export_model(**vars(args)) diff --git a/qai_hub_models/models/mobilenet_v3_small/perf.yaml b/qai_hub_models/models/mobilenet_v3_small/perf.yaml index d5089b1d..5f36a06f 100644 --- a/qai_hub_models/models/mobilenet_v3_small/perf.yaml +++ b/qai_hub_models/models/mobilenet_v3_small/perf.yaml @@ -22,6 +22,7 @@ aggregated: - Samsung Galaxy S24 Ultra - Samsung Galaxy S24+ - Samsung Galaxy Tab S8 + - Snapdragon X Elite CRD - Xiaomi 12 - Xiaomi 12 Pro supported_chipsets: @@ -30,15 +31,16 @@ aggregated: - Snapdragon® 8 Gen 2 - Snapdragon® 8 Gen 3 - Snapdragon® 888 + - Snapdragon® X Elite models: - name: MobileNet-v3-Small performance_metrics: - torchscript_onnx_tflite: - inference_time: 840.0 - throughput: 1190.4761904761904 + inference_time: 834.0 + throughput: 1199.0407673860911 estimated_peak_memory_range: - min: 12288 - max: 1842512 + min: 16384 + max: 1577560 primary_compute_unit: NPU precision: fp16 layer_info: @@ -46,22 +48,37 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 124 - job_id: j1gl6l2eg + job_id: j1p3m0rzg job_status: Passed - torchscript_onnx_ort: - inference_time: 3404.0 - throughput: 293.7720329024677 + torchscript_onnx_qnn: + inference_time: 866.0 + throughput: 1154.7344110854503 estimated_peak_memory_range: min: 16384 - max: 13250040 + max: 24077256 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 126 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 126 + job_id: j7gjln88p + job_status: Passed + torchscript_onnx_ort: + inference_time: 813.0 + throughput: 1230.0123001230013 + estimated_peak_memory_range: + min: 12288 + max: 34364368 primary_compute_unit: NPU precision: fp16 layer_info: - layers_on_npu: 10 + layers_on_npu: 146 layers_on_gpu: 0 - layers_on_cpu: 9 - total_layers: 19 - job_id: j1p3v61xg + layers_on_cpu: 0 + total_layers: 146 + job_id: jmg94nqv5 job_status: Passed reference_device_info: name: Samsung Galaxy S23 @@ -70,13 +87,13 @@ models: os_name: Android manufacturer: Samsung chipset: Snapdragon® 8 Gen 2 - timestamp: '2024-04-23T18:42:32.742071Z' + timestamp: '2024-05-20T16:35:30.036067Z' - torchscript_onnx_tflite: - inference_time: 547.0 - throughput: 1828.1535648994516 + inference_time: 545.0 + throughput: 1834.8623853211009 estimated_peak_memory_range: - min: 12288 - max: 40731056 + min: 20480 + max: 41085008 primary_compute_unit: NPU precision: fp16 layer_info: @@ -84,22 +101,37 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 124 - job_id: jw56ewzvg 
+ job_id: jwgov69d5 job_status: Passed - torchscript_onnx_ort: - inference_time: 3006.0 - throughput: 332.667997338656 + torchscript_onnx_qnn: + inference_time: 582.0 + throughput: 1718.213058419244 estimated_peak_memory_range: min: 12288 - max: 27095152 + max: 46524832 primary_compute_unit: NPU precision: fp16 layer_info: - layers_on_npu: 10 + layers_on_npu: 126 layers_on_gpu: 0 - layers_on_cpu: 9 - total_layers: 19 - job_id: jwgok8n4p + layers_on_cpu: 0 + total_layers: 126 + job_id: jlpevmn05 + job_status: Passed + torchscript_onnx_ort: + inference_time: 560.0 + throughput: 1785.7142857142858 + estimated_peak_memory_range: + min: 618496 + max: 27970128 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 146 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 146 + job_id: jnp18zmlg job_status: Passed reference_device_info: name: Samsung Galaxy S24 @@ -108,13 +140,13 @@ models: os_name: Android manufacturer: Samsung chipset: Snapdragon® 8 Gen 3 - timestamp: '2024-04-23T18:42:32.742122Z' + timestamp: '2024-05-20T16:35:30.036094Z' - torchscript_onnx_tflite: - inference_time: 844.0 - throughput: 1184.8341232227488 + inference_time: 826.0 + throughput: 1210.6537530266344 estimated_peak_memory_range: - min: 12288 - max: 1902856 + min: 24576 + max: 1999704 primary_compute_unit: NPU precision: fp16 layer_info: @@ -122,7 +154,22 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 124 - job_id: j1gl6qxlg + job_id: j1pvwknmg + job_status: Passed + torchscript_onnx_qnn: + inference_time: 866.0 + throughput: 1154.7344110854503 + estimated_peak_memory_range: + min: 0 + max: 25356816 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 126 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 126 + job_id: jz5w96rjp job_status: Passed reference_device_info: name: QCS8550 (Proxy) @@ -131,4 +178,57 @@ models: os_name: Android manufacturer: Qualcomm chipset: Qcs8550 - timestamp: '2024-04-23T18:42:32.742147Z' + timestamp: '2024-05-20T16:35:30.036111Z' + - torchscript_onnx_qnn: + inference_time: 1032.0 + throughput: 968.9922480620155 + estimated_peak_memory_range: + min: 602112 + max: 602112 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 126 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 126 + job_id: jygz7d06p + job_status: Passed + torchscript_onnx_ort: + inference_time: 908.0 + throughput: 1101.3215859030836 + estimated_peak_memory_range: + min: 3018752 + max: 3018752 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 146 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 146 + job_id: jvgdv1mlg + job_status: Passed + torchscript_onnx_ort_dml_gpu: + inference_time: 4962.0 + throughput: 201.53164046755342 + estimated_peak_memory_range: + min: 1437696 + max: 1437696 + primary_compute_unit: CPU + precision: fp32 + layer_info: + layers_on_npu: 0 + layers_on_gpu: 0 + layers_on_cpu: 114 + total_layers: 114 + job_id: jz57dr1r5 + job_status: Passed + reference_device_info: + name: Snapdragon X Elite CRD + os: '11' + form_factor: Compute + os_name: Windows + manufacturer: Qualcomm + chipset: Snapdragon® X Elite + timestamp: '2024-05-20T16:35:30.036134Z' diff --git a/qai_hub_models/models/openai_clip/README.md b/qai_hub_models/models/openai_clip/README.md index d22ca80d..06c429e8 100644 --- a/qai_hub_models/models/openai_clip/README.md +++ b/qai_hub_models/models/openai_clip/README.md @@ -14,6 +14,8 @@ accross various devices, can be found [here](https://aihub.qualcomm.com/models/o a hosted 
Qualcomm® device. + + ## Example & Usage Install the package via pip: diff --git a/qai_hub_models/models/openai_clip/export.py b/qai_hub_models/models/openai_clip/export.py index 2c00d8fd..68ef8cab 100644 --- a/qai_hub_models/models/openai_clip/export.py +++ b/qai_hub_models/models/openai_clip/export.py @@ -134,7 +134,7 @@ def export_model( # 2. Compile the models to an on-device asset model_compile_options = component.get_hub_compile_options( - target_runtime, compile_options + target_runtime, compile_options, hub_device ) print(f"Optimizing model {component_name} to run on-device") submitted_compile_job = hub.submit_compile_job( @@ -226,12 +226,7 @@ def export_model( def main(): warnings.filterwarnings("ignore") - parser = export_parser( - model_cls=Model, - components=ALL_COMPONENTS, - supports_qnn=False, - supports_ort=False, - ) + parser = export_parser(model_cls=Model, components=ALL_COMPONENTS) args = parser.parse_args() export_model(**vars(args)) diff --git a/qai_hub_models/models/openai_clip/perf.yaml b/qai_hub_models/models/openai_clip/perf.yaml index 22fec649..c61f9847 100644 --- a/qai_hub_models/models/openai_clip/perf.yaml +++ b/qai_hub_models/models/openai_clip/perf.yaml @@ -22,6 +22,7 @@ aggregated: - Samsung Galaxy S24 Ultra - Samsung Galaxy S24+ - Samsung Galaxy Tab S8 + - Snapdragon X Elite CRD - Xiaomi 12 - Xiaomi 12 Pro supported_chipsets: @@ -30,15 +31,16 @@ aggregated: - Snapdragon® 8 Gen 2 - Snapdragon® 8 Gen 3 - Snapdragon® 888 + - Snapdragon® X Elite models: - name: CLIPTextEncoder performance_metrics: - torchscript_onnx_tflite: - inference_time: 15395.0 - throughput: 64.95615459564793 + inference_time: 13312.0 + throughput: 75.1201923076923 estimated_peak_memory_range: - min: 32768 - max: 2875584 + min: 20480 + max: 2971744 primary_compute_unit: NPU precision: fp16 layer_info: @@ -46,22 +48,37 @@ models: layers_on_gpu: 0 layers_on_cpu: 2 total_layers: 576 - job_id: j7gjzq275 + job_id: jqp4wr6lg + job_status: Passed + torchscript_onnx_qnn: + inference_time: 7826.0 + throughput: 127.77919754663941 + estimated_peak_memory_range: + min: 45056 + max: 25299672 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 377 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 377 + job_id: jqpyd3k4p job_status: Passed torchscript_onnx_ort: - inference_time: 33201.0 - throughput: 30.119574711605072 + inference_time: 31411.0 + throughput: 31.83598102575531 estimated_peak_memory_range: - min: 40960 - max: 328459688 + min: 16384 + max: 325180960 primary_compute_unit: NPU precision: fp16 layer_info: - layers_on_npu: 1 + layers_on_npu: 389 layers_on_gpu: 0 - layers_on_cpu: 1 - total_layers: 2 - job_id: jmg9jdyq5 + layers_on_cpu: 0 + total_layers: 389 + job_id: jwgov67d5 job_status: Passed reference_device_info: name: Samsung Galaxy S23 @@ -70,13 +87,13 @@ models: os_name: Android manufacturer: Samsung chipset: Snapdragon® 8 Gen 2 - timestamp: '2024-04-23T18:42:32.759891Z' + timestamp: '2024-05-20T16:35:30.066622Z' - torchscript_onnx_tflite: - inference_time: 11237.0 - throughput: 88.99172376968941 + inference_time: 9410.0 + throughput: 106.26992561105207 estimated_peak_memory_range: min: 16384 - max: 219358080 + max: 211565584 primary_compute_unit: NPU precision: fp16 layer_info: @@ -84,22 +101,37 @@ models: layers_on_gpu: 0 layers_on_cpu: 2 total_layers: 576 - job_id: jygzonjz5 + job_id: jo5mzx1qp + job_status: Passed + torchscript_onnx_qnn: + inference_time: 5494.0 + throughput: 182.01674554058974 + estimated_peak_memory_range: + min: 0 + max: 141191120 + 
primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 377 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 377 + job_id: j1p87yd85 job_status: Passed torchscript_onnx_ort: - inference_time: 23967.0 - throughput: 41.7240372178412 + inference_time: 22506.0 + throughput: 44.43259575224385 estimated_peak_memory_range: min: 36864 - max: 216279616 + max: 184881664 primary_compute_unit: NPU precision: fp16 layer_info: - layers_on_npu: 1 + layers_on_npu: 389 layers_on_gpu: 0 - layers_on_cpu: 1 - total_layers: 2 - job_id: jvgde2qk5 + layers_on_cpu: 0 + total_layers: 389 + job_id: j7gjln68p job_status: Passed reference_device_info: name: Samsung Galaxy S24 @@ -108,13 +140,13 @@ models: os_name: Android manufacturer: Samsung chipset: Snapdragon® 8 Gen 3 - timestamp: '2024-04-23T18:42:32.759974Z' + timestamp: '2024-05-20T16:35:30.066650Z' - torchscript_onnx_tflite: - inference_time: 15367.0 - throughput: 65.07451031430989 + inference_time: 13176.0 + throughput: 75.89556769884639 estimated_peak_memory_range: - min: 49152 - max: 3357800 + min: 16384 + max: 3268096 primary_compute_unit: NPU precision: fp16 layer_info: @@ -122,7 +154,22 @@ models: layers_on_gpu: 0 layers_on_cpu: 2 total_layers: 576 - job_id: j7gjz8785 + job_id: jopry3meg + job_status: Passed + torchscript_onnx_qnn: + inference_time: 7787.0 + throughput: 128.4191601386927 + estimated_peak_memory_range: + min: 32768 + max: 17390072 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 377 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 377 + job_id: jw561497p job_status: Passed reference_device_info: name: QCS8550 (Proxy) @@ -131,15 +178,68 @@ models: os_name: Android manufacturer: Qualcomm chipset: Qcs8550 - timestamp: '2024-04-23T18:42:32.760069Z' + timestamp: '2024-05-20T16:35:30.066668Z' + - torchscript_onnx_qnn: + inference_time: 8463.0 + throughput: 118.16140848398913 + estimated_peak_memory_range: + min: 229376 + max: 229376 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 377 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 377 + job_id: jn5q2qxm5 + job_status: Passed + torchscript_onnx_ort: + inference_time: 32986.0 + throughput: 30.315891590371674 + estimated_peak_memory_range: + min: 137265152 + max: 137265152 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 389 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 389 + job_id: jygz7dq6p + job_status: Passed + torchscript_onnx_ort_dml_gpu: + inference_time: 9446.0 + throughput: 105.86491636671607 + estimated_peak_memory_range: + min: 684032 + max: 684032 + primary_compute_unit: CPU + precision: fp32 + layer_info: + layers_on_npu: 0 + layers_on_gpu: 0 + layers_on_cpu: 290 + total_layers: 290 + job_id: jmg94n7v5 + job_status: Passed + reference_device_info: + name: Snapdragon X Elite CRD + os: '11' + form_factor: Compute + os_name: Windows + manufacturer: Qualcomm + chipset: Snapdragon® X Elite + timestamp: '2024-05-20T16:35:30.066690Z' - name: CLIPImageEncoder performance_metrics: - torchscript_onnx_tflite: - inference_time: 126657.0 - throughput: 7.895339381163299 + inference_time: 126619.0 + throughput: 7.8977088746554625 estimated_peak_memory_range: - min: 163840 - max: 3470824 + min: 126976 + max: 4408960 primary_compute_unit: NPU precision: fp16 layer_info: @@ -147,23 +247,38 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 576 - job_id: jlpeeyw7p + job_id: j0px1o89g + job_status: Passed + torchscript_onnx_qnn: + inference_time: 50334.0 + throughput: 
19.86728652600628 + estimated_peak_memory_range: + min: 16384 + max: 67772216 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 371 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 371 + job_id: j2p0r08ep job_status: Passed torchscript_onnx_ort: - inference_time: 'null' - throughput: 'null' + inference_time: 173185.0 + throughput: 5.774172128071138 estimated_peak_memory_range: - min: 0 - max: 0 - primary_compute_unit: 'null' - precision: 'null' + min: 40960 + max: 529782032 + primary_compute_unit: NPU + precision: fp16 layer_info: - layers_on_npu: 0 + layers_on_npu: 382 layers_on_gpu: 0 layers_on_cpu: 0 - total_layers: 0 - job_id: jnp1y6wkp - job_status: Failed + total_layers: 382 + job_id: j1pvwkymg + job_status: Passed reference_device_info: name: Samsung Galaxy S23 os: '13' @@ -171,13 +286,13 @@ models: os_name: Android manufacturer: Samsung chipset: Snapdragon® 8 Gen 2 - timestamp: '2024-04-23T18:42:32.760150Z' + timestamp: '2024-05-20T16:35:30.066714Z' - torchscript_onnx_tflite: - inference_time: 96976.0 - throughput: 10.31182973106748 + inference_time: 95991.0 + throughput: 10.417643320727985 estimated_peak_memory_range: - min: 229376 - max: 865695568 + min: 204800 + max: 748165536 primary_compute_unit: NPU precision: fp16 layer_info: @@ -185,22 +300,37 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 576 - job_id: jz5w243z5 + job_id: jegnevdmg + job_status: Passed + torchscript_onnx_qnn: + inference_time: 37870.0 + throughput: 26.406126221283337 + estimated_peak_memory_range: + min: 655360 + max: 195252672 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 371 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 371 + job_id: jogkyxwop job_status: Passed torchscript_onnx_ort: - inference_time: 128177.0 - throughput: 7.801711695546003 + inference_time: 131060.0 + throughput: 7.630093087135663 estimated_peak_memory_range: - min: 774144 - max: 1720363664 + min: 618496 + max: 1274243488 primary_compute_unit: NPU precision: fp16 layer_info: - layers_on_npu: 1 + layers_on_npu: 382 layers_on_gpu: 0 layers_on_cpu: 0 - total_layers: 1 - job_id: jz5w243j5 + total_layers: 382 + job_id: jlpevm005 job_status: Passed reference_device_info: name: Samsung Galaxy S24 @@ -209,13 +339,13 @@ models: os_name: Android manufacturer: Samsung chipset: Snapdragon® 8 Gen 3 - timestamp: '2024-04-23T18:42:32.760227Z' + timestamp: '2024-05-20T16:35:30.066735Z' - torchscript_onnx_tflite: - inference_time: 127012.0 - throughput: 7.873271816836205 + inference_time: 126196.0 + throughput: 7.924181432058068 estimated_peak_memory_range: - min: 184320 - max: 4508448 + min: 155648 + max: 4526472 primary_compute_unit: NPU precision: fp16 layer_info: @@ -223,7 +353,22 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 576 - job_id: jlpeenz0p + job_id: jep2myqm5 + job_status: Passed + torchscript_onnx_qnn: + inference_time: 50570.0 + throughput: 19.774569903104606 + estimated_peak_memory_range: + min: 57344 + max: 57651824 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 371 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 371 + job_id: j1p3m0lzg job_status: Passed reference_device_info: name: QCS8550 (Proxy) @@ -232,4 +377,57 @@ models: os_name: Android manufacturer: Qualcomm chipset: Qcs8550 - timestamp: '2024-04-23T18:42:32.760295Z' + timestamp: '2024-05-20T16:35:30.066751Z' + - torchscript_onnx_qnn: + inference_time: 48896.0 + throughput: 20.451570680628272 + estimated_peak_memory_range: + min: 602112 + max: 
602112 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 369 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 369 + job_id: j1glkm9lp + job_status: Passed + torchscript_onnx_ort: + inference_time: 168856.0 + throughput: 5.922205903254844 + estimated_peak_memory_range: + min: 492744704 + max: 492744704 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 382 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 382 + job_id: jz5w960jp + job_status: Passed + torchscript_onnx_ort_dml_gpu: + inference_time: 'null' + throughput: 'null' + estimated_peak_memory_range: + min: 0 + max: 0 + primary_compute_unit: 'null' + precision: 'null' + layer_info: + layers_on_npu: 0 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 0 + job_id: jnp18zklg + job_status: Failed + reference_device_info: + name: Snapdragon X Elite CRD + os: '11' + form_factor: Compute + os_name: Windows + manufacturer: Qualcomm + chipset: Snapdragon® X Elite + timestamp: '2024-05-20T16:35:30.066780Z' diff --git a/qai_hub_models/models/openpose/README.md b/qai_hub_models/models/openpose/README.md index 1789e21e..f5b7c4f4 100644 --- a/qai_hub_models/models/openpose/README.md +++ b/qai_hub_models/models/openpose/README.md @@ -14,6 +14,8 @@ accross various devices, can be found [here](https://aihub.qualcomm.com/models/o a hosted Qualcomm® device. + + ## Example & Usage Install the package via pip: diff --git a/qai_hub_models/models/openpose/export.py b/qai_hub_models/models/openpose/export.py index d8e1d30e..430e7ecb 100644 --- a/qai_hub_models/models/openpose/export.py +++ b/qai_hub_models/models/openpose/export.py @@ -120,12 +120,17 @@ def export_model( model.eval() source_model = torch.jit.trace(model.to("cpu"), make_torch_inputs(input_spec)) + # Convert outputs from channel last to channel first (preferred I/O format for QNN and TensorFlow Lite) + channel_last_flags = ( + " --force_channel_last_input image" + + " --force_channel_last_output output_0,output_1" + if target_runtime != TargetRuntime.ORT + else "" + ) + # 2. 
Compile the model to an on-device asset model_compile_options = model.get_hub_compile_options( - target_runtime, - compile_options - + " --force_channel_last_input image" - + " --force_channel_last_output output_0,output_1", + target_runtime, compile_options + channel_last_flags, hub_device ) print(f"Optimizing model {model_name} to run on-device") submitted_compile_job = hub.submit_compile_job( @@ -163,8 +168,10 @@ def export_model( ) sample_inputs = model.sample_inputs(input_spec) # Convert inputs from channel first to channel last - hub_inputs = transpose_channel_first_to_last( - "image", sample_inputs, target_runtime + hub_inputs = ( + sample_inputs + if target_runtime == TargetRuntime.ORT + else transpose_channel_first_to_last("image", sample_inputs, target_runtime) ) submitted_inference_job = hub.submit_inference_job( model=compile_job.get_target_model(), @@ -192,8 +199,12 @@ def export_model( assert inference_job is not None and inference_job.wait().success inference_result: hub.client.DatasetEntries = inference_job.download_output_data() # type: ignore # Convert outputs from channel last to channel first - inference_result = transpose_channel_last_to_first( - "output_0,output_1", inference_result, target_runtime + inference_result = ( + inference_result + if target_runtime == TargetRuntime.ORT + else transpose_channel_last_to_first( + "output_0,output_1", inference_result, target_runtime + ) ) print_inference_metrics(inference_job, inference_result, torch_out) @@ -205,7 +216,7 @@ def export_model( def main(): warnings.filterwarnings("ignore") - parser = export_parser(model_cls=Model, supports_ort=False) + parser = export_parser(model_cls=Model) args = parser.parse_args() export_model(**vars(args)) diff --git a/qai_hub_models/models/openpose/info.yaml b/qai_hub_models/models/openpose/info.yaml index 3ec39a9a..8b1c1e00 100644 --- a/qai_hub_models/models/openpose/info.yaml +++ b/qai_hub_models/models/openpose/info.yaml @@ -31,7 +31,7 @@ related_models: - litehrnet - mediapipe_pose has_static_banner: yes -has_animated_banner: no +has_animated_banner: yes license_type: other deploy_license_type: AI Model Hub License dataset: [] diff --git a/qai_hub_models/models/openpose/perf.yaml b/qai_hub_models/models/openpose/perf.yaml index df272c6b..196de26e 100644 --- a/qai_hub_models/models/openpose/perf.yaml +++ b/qai_hub_models/models/openpose/perf.yaml @@ -22,6 +22,7 @@ aggregated: - Samsung Galaxy S24 Ultra - Samsung Galaxy S24+ - Samsung Galaxy Tab S8 + - Snapdragon X Elite CRD - Xiaomi 12 - Xiaomi 12 Pro supported_chipsets: @@ -30,15 +31,16 @@ aggregated: - Snapdragon® 8 Gen 2 - Snapdragon® 8 Gen 3 - Snapdragon® 888 + - Snapdragon® X Elite models: - name: OpenPose performance_metrics: - torchscript_onnx_tflite: - inference_time: 11751.0 - throughput: 85.09914049868097 + inference_time: 11697.0 + throughput: 85.4920064973925 estimated_peak_memory_range: - min: 225280 - max: 2603680 + min: 204800 + max: 2413880 primary_compute_unit: NPU precision: fp16 layer_info: @@ -46,14 +48,14 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 103 - job_id: j0pxnxy95 + job_id: jvgdv1ylg job_status: Passed torchscript_onnx_qnn: - inference_time: 11827.0 - throughput: 84.5522955948254 + inference_time: 11783.0 + throughput: 84.86803021301876 estimated_peak_memory_range: - min: 651264 - max: 242798248 + min: 638976 + max: 240653744 primary_compute_unit: NPU precision: fp16 layer_info: @@ -61,22 +63,22 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 186 - job_id: jegnlk3m5 + job_id: 
jnp18z92g job_status: Passed torchscript_onnx_ort: - inference_time: 12055.0 - throughput: 82.9531314807134 + inference_time: 11925.0 + throughput: 83.85744234800839 estimated_peak_memory_range: - min: 589824 - max: 430729112 + min: 622592 + max: 408558976 primary_compute_unit: NPU precision: fp16 layer_info: - layers_on_npu: 1 + layers_on_npu: 189 layers_on_gpu: 0 layers_on_cpu: 0 - total_layers: 1 - job_id: jep20elmg + total_layers: 189 + job_id: j0px1oq1g job_status: Passed reference_device_info: name: Samsung Galaxy S23 @@ -85,13 +87,13 @@ models: os_name: Android manufacturer: Samsung chipset: Snapdragon® 8 Gen 2 - timestamp: '2024-04-23T18:42:32.792235Z' + timestamp: '2024-05-20T16:35:30.123582Z' - torchscript_onnx_tflite: - inference_time: 8779.0 - throughput: 113.90818999886092 + inference_time: 8714.0 + throughput: 114.75786091347257 estimated_peak_memory_range: - min: 196608 - max: 34017488 + min: 212992 + max: 35487584 primary_compute_unit: NPU precision: fp16 layer_info: @@ -99,14 +101,14 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 103 - job_id: jo5mq83qp + job_id: jz5w96k6p job_status: Passed torchscript_onnx_qnn: - inference_time: 8774.0 - throughput: 113.97310234784591 + inference_time: 8761.0 + throughput: 114.1422212076247 estimated_peak_memory_range: - min: 638976 - max: 51579776 + min: 618496 + max: 53231792 primary_compute_unit: NPU precision: fp16 layer_info: @@ -114,22 +116,22 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 186 - job_id: jopr8wee5 + job_id: jvgdv1keg job_status: Passed torchscript_onnx_ort: - inference_time: 9248.0 - throughput: 108.13148788927336 + inference_time: 9189.0 + throughput: 108.82576994232234 estimated_peak_memory_range: - min: 622592 - max: 22342656 + min: 2715648 + max: 30463376 primary_compute_unit: NPU precision: fp16 layer_info: - layers_on_npu: 1 + layers_on_npu: 189 layers_on_gpu: 0 layers_on_cpu: 0 - total_layers: 1 - job_id: jqpyrm645 + total_layers: 189 + job_id: jo5mzx7wp job_status: Passed reference_device_info: name: Samsung Galaxy S24 @@ -138,13 +140,13 @@ models: os_name: Android manufacturer: Samsung chipset: Snapdragon® 8 Gen 3 - timestamp: '2024-04-23T18:42:32.792298Z' + timestamp: '2024-05-20T16:35:30.123609Z' - torchscript_onnx_tflite: - inference_time: 11875.0 - throughput: 84.21052631578948 + inference_time: 11765.0 + throughput: 84.99787505312368 estimated_peak_memory_range: - min: 139264 - max: 2225560 + min: 233472 + max: 2374096 primary_compute_unit: NPU precision: fp16 layer_info: @@ -152,14 +154,14 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 103 - job_id: j0pxnzl15 + job_id: jmg94nrl5 job_status: Passed torchscript_onnx_qnn: - inference_time: 11826.0 - throughput: 84.5594452900389 + inference_time: 11798.0 + throughput: 84.76012883539583 estimated_peak_memory_range: - min: 663552 - max: 242581864 + min: 622592 + max: 241415392 primary_compute_unit: NPU precision: fp16 layer_info: @@ -167,7 +169,7 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 186 - job_id: jep20zr4g + job_id: jqp4wr7vg job_status: Passed reference_device_info: name: QCS8550 (Proxy) @@ -176,4 +178,57 @@ models: os_name: Android manufacturer: Qualcomm chipset: Qcs8550 - timestamp: '2024-04-23T18:42:32.792350Z' + timestamp: '2024-05-20T16:35:30.123627Z' + - torchscript_onnx_qnn: + inference_time: 14112.0 + throughput: 70.86167800453515 + estimated_peak_memory_range: + min: 602112 + max: 602112 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 186 + layers_on_gpu: 0 + 
layers_on_cpu: 0 + total_layers: 186 + job_id: jz57drml5 + job_status: Passed + torchscript_onnx_ort: + inference_time: 12340.0 + throughput: 81.03727714748784 + estimated_peak_memory_range: + min: 90116096 + max: 90116096 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 189 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 189 + job_id: jegnev4rg + job_status: Passed + torchscript_onnx_ort_dml_gpu: + inference_time: 167707.0 + throughput: 5.962780325210039 + estimated_peak_memory_range: + min: 87339008 + max: 87339008 + primary_compute_unit: CPU + precision: fp32 + layer_info: + layers_on_npu: 0 + layers_on_gpu: 0 + layers_on_cpu: 103 + total_layers: 103 + job_id: jopry3r9g + job_status: Passed + reference_device_info: + name: Snapdragon X Elite CRD + os: '11' + form_factor: Compute + os_name: Windows + manufacturer: Qualcomm + chipset: Snapdragon® X Elite + timestamp: '2024-05-20T16:35:30.123650Z' diff --git a/qai_hub_models/models/posenet_mobilenet/README.md b/qai_hub_models/models/posenet_mobilenet/README.md new file mode 100644 index 00000000..38ed61a7 --- /dev/null +++ b/qai_hub_models/models/posenet_mobilenet/README.md @@ -0,0 +1,56 @@ +[![Qualcomm® AI Hub Models](https://qaihub-public-assets.s3.us-west-2.amazonaws.com/qai-hub-models/quic-logo.jpg)](../../README.md) + + +# [Posenet-Mobilenet: Perform accurate human pose estimation](#) + +Posenet performs pose estimation on human images. + +This is based on the implementation of Posenet-Mobilenet found +[here](https://github.com/rwightman/posenet-pytorch). This repository contains scripts for optimized on-device +export suitable to run on Qualcomm® devices. More details on model performance +accross various devices, can be found [here](#). + +[Sign up](https://myaccount.qualcomm.com/signup) for early access to run these models on +a hosted Qualcomm® device. + + + + +## Example & Usage + + +Once installed, run the following simple CLI demo: + +```bash +python -m qai_hub_models.models.posenet_mobilenet.demo +``` +More details on the CLI tool can be found with the `--help` option. See +[demo.py](demo.py) for sample usage of the model including pre/post processing +scripts. Please refer to our [general instructions on using +models](../../../#getting-started) for more usage instructions. + +## Export for on-device deployment + +This repository contains export scripts that produce a model optimized for +on-device deployment. This can be run as follows: + +```bash +python -m qai_hub_models.models.posenet_mobilenet.export +``` +Additional options are documented with the `--help` option. Note that the above +script requires access to Deployment instructions for Qualcomm® AI Hub. + +## License +- The license for the original implementation of Posenet-Mobilenet can be found + [here](https://github.com/rwightman/posenet-pytorch/blob/master/LICENSE.txt). +- The license for the compiled assets for on-device deployment can be found [here]({deploy_license_url}) + +## References +* [PersonLab: Person Pose Estimation and Instance Segmentation with a Bottom-Up, Part-Based, Geometric Embedding Model](https://arxiv.org/abs/1803.08225) +* [Source Model Implementation](https://github.com/rwightman/posenet-pytorch) + +## Community +* Join [our AI Hub Slack community](https://qualcomm-ai-hub.slack.com/join/shared_invite/zt-2d5zsmas3-Sj0Q9TzslueCjS31eXG2UA#/shared-invite/email) to collaborate, post questions and learn more about on-device AI. 
+* For questions or feedback please [reach out to us](mailto:ai-hub-support@qti.qualcomm.com). + + diff --git a/qai_hub_models/models/posenet_mobilenet/__init__.py b/qai_hub_models/models/posenet_mobilenet/__init__.py new file mode 100644 index 00000000..87c45151 --- /dev/null +++ b/qai_hub_models/models/posenet_mobilenet/__init__.py @@ -0,0 +1,7 @@ +# --------------------------------------------------------------------- +# Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. +# SPDX-License-Identifier: BSD-3-Clause +# --------------------------------------------------------------------- +from .app import PosenetApp # noqa: F401 +from .model import MODEL_ID # noqa: F401 +from .model import PosenetMobilenet as Model # noqa: F401 diff --git a/qai_hub_models/models/posenet_mobilenet/app.py b/qai_hub_models/models/posenet_mobilenet/app.py new file mode 100644 index 00000000..2ccca2f2 --- /dev/null +++ b/qai_hub_models/models/posenet_mobilenet/app.py @@ -0,0 +1,588 @@ +# --------------------------------------------------------------------- +# Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. +# SPDX-License-Identifier: BSD-3-Clause +# --------------------------------------------------------------------- + +from __future__ import annotations + +from typing import Callable, List, Tuple + +import cv2 +import numpy as np +import torch +from PIL import Image +from torchvision import transforms + +from qai_hub_models.models.posenet_mobilenet.model import OUTPUT_STRIDE +from qai_hub_models.utils.draw import draw_points +from qai_hub_models.utils.image_processing import pil_resize_pad, pil_undo_resize_pad + +# Most code here is from the source repo https://github.com/rwightman/posenet-pytorch + +PART_NAMES = [ + "nose", + "leftEye", + "rightEye", + "leftEar", + "rightEar", + "leftShoulder", + "rightShoulder", + "leftElbow", + "rightElbow", + "leftWrist", + "rightWrist", + "leftHip", + "rightHip", + "leftKnee", + "rightKnee", + "leftAnkle", + "rightAnkle", +] + +NUM_KEYPOINTS = len(PART_NAMES) + +PART_IDS = {pn: pid for pid, pn in enumerate(PART_NAMES)} +LOCAL_MAXIMUM_RADIUS = 1 + +POSE_CHAIN = [ + ("nose", "leftEye"), + ("leftEye", "leftEar"), + ("nose", "rightEye"), + ("rightEye", "rightEar"), + ("nose", "leftShoulder"), + ("leftShoulder", "leftElbow"), + ("leftElbow", "leftWrist"), + ("leftShoulder", "leftHip"), + ("leftHip", "leftKnee"), + ("leftKnee", "leftAnkle"), + ("nose", "rightShoulder"), + ("rightShoulder", "rightElbow"), + ("rightElbow", "rightWrist"), + ("rightShoulder", "rightHip"), + ("rightHip", "rightKnee"), + ("rightKnee", "rightAnkle"), +] + +PARENT_CHILD_TUPLES = [ + (PART_IDS[parent], PART_IDS[child]) for parent, child in POSE_CHAIN +] +CONNECTED_PART_NAMES = [ + ("leftHip", "leftShoulder"), + ("leftElbow", "leftShoulder"), + ("leftElbow", "leftWrist"), + ("leftHip", "leftKnee"), + ("leftKnee", "leftAnkle"), + ("rightHip", "rightShoulder"), + ("rightElbow", "rightShoulder"), + ("rightElbow", "rightWrist"), + ("rightHip", "rightKnee"), + ("rightKnee", "rightAnkle"), + ("leftShoulder", "rightShoulder"), + ("leftHip", "rightHip"), +] + +CONNECTED_PART_INDICES = [(PART_IDS[a], PART_IDS[b]) for a, b in CONNECTED_PART_NAMES] + + +def traverse_to_targ_keypoint( + edge_id: int, + source_keypoint: np.ndarray, + target_keypoint_id: int, + scores: np.ndarray, + offsets: np.ndarray, + displacements: np.ndarray, +) -> Tuple[float, np.ndarray]: + """ + Given a source keypoint and target_keypoint_id, + predict the score and coordinates of the target 
keypoint. + + Parameters: + edge_id: Index of the edge being considered. + Equivalent to the index in `POSE_CHAIN`. + source_keypoint: (y, x) coordinates of the keypoint. + target_keypoint_id: Which body part type of the 17 this keypoint is. + scores: See `decode_multiple_poses`. + offsets: See `decode_multiple_poses`. + displacements: See `decode_multiple_poses`. + + Returns: + Tuple of target keypoint score and coordinates. + """ + height = scores.shape[1] + width = scores.shape[2] + + source_keypoint_indices = np.clip( + np.round(source_keypoint / OUTPUT_STRIDE), + a_min=0, + a_max=[height - 1, width - 1], + ).astype(np.int32) + + displaced_point = ( + source_keypoint + + displacements[edge_id, source_keypoint_indices[0], source_keypoint_indices[1]] + ) + + displaced_point_indices = np.clip( + np.round(displaced_point / OUTPUT_STRIDE), + a_min=0, + a_max=[height - 1, width - 1], + ).astype(np.int32) + + score = scores[ + target_keypoint_id, displaced_point_indices[0], displaced_point_indices[1] + ] + + image_coord = ( + displaced_point_indices * OUTPUT_STRIDE + + offsets[ + target_keypoint_id, displaced_point_indices[0], displaced_point_indices[1] + ] + ) + + return score, image_coord + + +def decode_pose( + root_score: float, + root_id: int, + root_image_coord: np.ndarray, + scores: np.ndarray, + offsets: np.ndarray, + displacements_fwd: np.ndarray, + displacements_bwd: np.ndarray, +) -> Tuple[np.ndarray, np.ndarray]: + """ + Get all keypoint predictions for a pose given a root keypoint with a high score. + + Parameters: + root_score: The confidence score of the root keypoint. + root_id: Which body part type of the 17 this keypoint is. + root_image_coord: (y, x) coordinates of the keypoint. + scores: See `decode_multiple_poses`. + offsets: See `decode_multiple_poses`. + displacements_fwd: See `decode_multiple_poses`. + displacements_bwd: See `decode_multiple_poses`. + + Returns: + Tuple of list of keypoint scores and list of coordinates. + """ + num_parts = scores.shape[0] + num_edges = len(PARENT_CHILD_TUPLES) + + instance_keypoint_scores = np.zeros(num_parts) + instance_keypoint_coords = np.zeros((num_parts, 2)) + instance_keypoint_scores[root_id] = root_score + instance_keypoint_coords[root_id] = root_image_coord + + for edge in reversed(range(num_edges)): + target_keypoint_id, source_keypoint_id = PARENT_CHILD_TUPLES[edge] + if ( + instance_keypoint_scores[source_keypoint_id] > 0.0 + and instance_keypoint_scores[target_keypoint_id] == 0.0 + ): + score, coords = traverse_to_targ_keypoint( + edge, + instance_keypoint_coords[source_keypoint_id], + target_keypoint_id, + scores, + offsets, + displacements_bwd, + ) + instance_keypoint_scores[target_keypoint_id] = score + instance_keypoint_coords[target_keypoint_id] = coords + + for edge in range(num_edges): + source_keypoint_id, target_keypoint_id = PARENT_CHILD_TUPLES[edge] + if ( + instance_keypoint_scores[source_keypoint_id] > 0.0 + and instance_keypoint_scores[target_keypoint_id] == 0.0 + ): + score, coords = traverse_to_targ_keypoint( + edge, + instance_keypoint_coords[source_keypoint_id], + target_keypoint_id, + scores, + offsets, + displacements_fwd, + ) + instance_keypoint_scores[target_keypoint_id] = score + instance_keypoint_coords[target_keypoint_id] = coords + + return instance_keypoint_scores, instance_keypoint_coords + + +def within_nms_radius_fast( + pose_coords: np.ndarray, nms_radius: float, point: np.ndarray +) -> bool: + """ + Whether the candidate point is nearby any existing point in `pose_coords`. 
+ + pose_coords: + Numpy array of points, shape (N, 2). + nms_radius: + The distance between two points for them to be considered nearby. + point: + The candidate point, shape (2,). + """ + if not pose_coords.shape[0]: + return False + return np.any(np.sum((pose_coords - point) ** 2, axis=1) <= nms_radius**2) + + +def get_instance_score_fast( + exist_pose_coords: np.ndarray, + nms_radius: int, + keypoint_scores: np.ndarray, + keypoint_coords: np.ndarray, +) -> float: + """ + Compute a probability that the given pose is real. + Equal to the average confidence of each keypoint, excluding keypoints + that are shared with existing poses. + + Parameters: + exist_pose_coords: Keypoint coordinates of poses that have already been found. + Shape (N, 17, 2) + nms_radius: + If two candidate keypoints for the same body part are within this distance, + they are considered the same, and the lower confidence one discarded. + keypoint_scores: + Keypoint scores for the new pose. Shape (17,) + keypoint_coords: + Coordinates for the new pose. Shape (17, 2) + + Returns: + Confidence score for the pose. + """ + if exist_pose_coords.shape[0]: + s = np.sum((exist_pose_coords - keypoint_coords) ** 2, axis=2) > nms_radius**2 + not_overlapped_scores = np.sum(keypoint_scores[np.all(s, axis=0)]) + else: + not_overlapped_scores = np.sum(keypoint_scores) + return not_overlapped_scores / len(keypoint_scores) + + +def build_part_with_score_torch( + score_threshold: float, max_vals: torch.Tensor, scores: torch.Tensor +) -> Tuple[torch.Tensor, torch.Tensor]: + """ + Get candidate keypoints to be considered the root for a pose. + Score for the keypoint must be >= all neighboring scores. + Score must also be above given score_threshold. + + Parameters: + score_threshold: Minimum score for a keypoint to be considered as a root. + max_vals: See `decode_multiple_poses`. + scores: See `decode_multiple_poses`. + + Returns: + Tuple of: + - Torch scores for each keypoint to be considered. + - Indices of the considered keypoints. Shape (N, 3) where the 3 indices + map to the dimensions of the scores tensor with shape (17, h, w). + """ + max_loc = (scores == max_vals) & (scores >= score_threshold) + max_loc_idx = max_loc.nonzero() + scores_vec = scores[max_loc] + sort_idx = torch.argsort(scores_vec, descending=True) + return scores_vec[sort_idx], max_loc_idx[sort_idx] + + +def decode_multiple_poses( + scores: torch.Tensor, + offsets: torch.Tensor, + displacements_fwd: torch.Tensor, + displacements_bwd: torch.Tensor, + max_vals: torch.Tensor, + max_pose_detections: int = 10, + score_threshold: float = 0.25, + nms_radius: int = 20, + min_pose_score: float = 0.25, +) -> Tuple[np.ndarray, np.ndarray, np.ndarray]: + """ + Converts raw model outputs into image with keypoints drawn. + Can detect multiple poses in the same image, up to `max_pose_detections`. + This model has 17 candidate keypoints it predicts. + In this docstring, (h, w) correspond to height and width of the grid + and are roughly equal to input image size divided by 16. + + Parameters: + scores: + Tensor of scores in range [0, 1] indicating probability + a candidate pose is real. Shape [17, h, w]. + offsets: + Tensor of offsets for a given keypoint, relative to the grid point. + Shape [34, h, w]. + displacements_fwd: + When tracing the points for a pose, given a source keypoint, this value + gives the displacement to the next keypoint in the pose. There are 16 + connections from one keypoint to another (it's a minimum spanning tree). + Shape [32, h, w]. 
+ displacements_bwd: + Same as displacements_fwd, except when traversing keypoint connections + in the opposite direction. + max_vals: + Same as scores except with a max pool applied with kernel size 3. + max_pose_detections: + Maximum number of distinct poses to detect in a single image. + score_threshold: + Minimum score for a keypoint to be considered the root for a pose. + nms_radius: + If two candidate keypoints for the same body part are within this distance, + they are considered the same, and the lower confidence one discarded. + min_pose_score: + Minimum confidence that a pose exists for it to be displayed. + + Returns: + Tuple of: + - Numpy array of pose confidence scores. + - Numpy array of keypoint confidence scores. + - Numpy array of keypoint coordinates. + """ + part_scores, part_idx = build_part_with_score_torch( + score_threshold, max_vals, scores + ) + part_scores = part_scores.cpu().numpy() + part_idx = part_idx.cpu().numpy() + + scores = scores.cpu().numpy() + height = scores.shape[1] + width = scores.shape[2] + # change dimensions from (x, h, w) to (x//2, h, w, 2) to allow return of complete coord array + offsets = ( + offsets.cpu().numpy().reshape(2, -1, height, width).transpose((1, 2, 3, 0)) + ) + displacements_fwd = ( + displacements_fwd.cpu() + .numpy() + .reshape(2, -1, height, width) + .transpose((1, 2, 3, 0)) + ) + displacements_bwd = ( + displacements_bwd.cpu() + .numpy() + .reshape(2, -1, height, width) + .transpose((1, 2, 3, 0)) + ) + + pose_count = 0 + pose_scores = np.zeros(max_pose_detections) + pose_keypoint_scores = np.zeros((max_pose_detections, NUM_KEYPOINTS)) + pose_keypoint_coords = np.zeros((max_pose_detections, NUM_KEYPOINTS, 2)) + + for root_score, (root_id, root_coord_y, root_coord_x) in zip(part_scores, part_idx): + root_coord = np.array([root_coord_y, root_coord_x]) + root_image_coords = ( + root_coord * OUTPUT_STRIDE + offsets[root_id, root_coord_y, root_coord_x] + ) + + if within_nms_radius_fast( + pose_keypoint_coords[:pose_count, root_id, :], + nms_radius, + root_image_coords, + ): + continue + + keypoint_scores, keypoint_coords = decode_pose( + root_score, + root_id, + root_image_coords, + scores, + offsets, + displacements_fwd, + displacements_bwd, + ) + + pose_score = get_instance_score_fast( + pose_keypoint_coords[:pose_count, :, :], + nms_radius, + keypoint_scores, + keypoint_coords, + ) + + # NOTE this isn't in the original implementation, but it appears that by initially ordering by + # part scores, and having a max # of detections, we can end up populating the returned poses with + # lower scored poses than if we discard 'bad' ones and continue (higher pose scores can still come later). + # Set min_pose_score to 0. to revert to original behaviour + if min_pose_score == 0.0 or pose_score >= min_pose_score: + pose_scores[pose_count] = pose_score + pose_keypoint_scores[pose_count, :] = keypoint_scores + pose_keypoint_coords[pose_count, :, :] = keypoint_coords + pose_count += 1 + + if pose_count >= max_pose_detections: + break + + return pose_scores, pose_keypoint_scores, pose_keypoint_coords + + +def get_adjacent_keypoints( + keypoint_scores: np.ndarray, keypoint_coords: np.ndarray, score_threshold: float +) -> List[np.ndarray]: + """ + Compute which keypoints should be connected in the image. + + keypoint_scores: + Scores for all candidate keypoints in the pose. + keypoint_coords: + Coordinates for all candidate keypoints in the pose. + score_threshold: + If either keypoint in a candidate edge is below this threshold, omit the edge. 
+
+    Returns:
+        List of (2, 2) numpy arrays containing coordinates of edge endpoints.
+    """
+    results = []
+    for left, right in CONNECTED_PART_INDICES:
+        if (
+            keypoint_scores[left] < score_threshold
+            or keypoint_scores[right] < score_threshold
+        ):
+            continue
+        results.append(
+            np.array(
+                [keypoint_coords[left][::-1], keypoint_coords[right][::-1]]
+            ).astype(np.int32),
+        )
+    return results
+
+
+def draw_skel_and_kp(
+    img: np.ndarray,
+    instance_scores: np.ndarray,
+    keypoint_scores: np.ndarray,
+    keypoint_coords: np.ndarray,
+    min_pose_score: float = 0.5,
+    min_part_score: float = 0.5,
+) -> None:
+    """
+    Draw the keypoints and edges on the input numpy array image in-place.
+
+    Parameters:
+        img: Numpy array of the image.
+        instance_scores: Numpy array of confidence for each pose.
+        keypoint_scores: Numpy array of confidence for each keypoint.
+        keypoint_coords: Numpy array of coordinates for each keypoint.
+        min_pose_score: Minimum score for a pose to be displayed.
+        min_part_score: Minimum score for a keypoint to be displayed.
+    """
+    adjacent_keypoints = []
+    points = []
+    sizes = []
+    for ii, score in enumerate(instance_scores):
+        if score < min_pose_score:
+            continue
+
+        new_connections = get_adjacent_keypoints(
+            keypoint_scores[ii, :], keypoint_coords[ii, :, :], min_part_score
+        )
+        adjacent_keypoints.extend(new_connections)
+
+        for ks, kc in zip(keypoint_scores[ii, :], keypoint_coords[ii, :, :]):
+            if ks < min_part_score:
+                continue
+            points.append([kc[1], kc[0]])
+            sizes.append(10.0 * ks)
+
+    if points:
+        points_np = np.array(points)
+        draw_points(img, points_np, color=(255, 255, 0), size=sizes)
+        cv2.polylines(img, adjacent_keypoints, isClosed=False, color=(255, 255, 0))
+
+
+class PosenetApp:
+    """
+    This class consists of light-weight "app code" that is required to perform end-to-end inference with Posenet.
+
+    The app uses 1 model:
+        * Posenet
+
+    For a given image input, the app will:
+        * pre-process the image
+        * Run Posenet inference
+        * Convert the output into a list of keypoint coordinates
+        * Return raw coordinates or an image with keypoints overlaid
+    """
+
+    def __init__(
+        self,
+        model: Callable[
+            [torch.Tensor],
+            Tuple[
+                torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor
+            ],
+        ],
+        input_height: int,
+        input_width: int,
+    ):
+        self.model = model
+        self.input_height = input_height
+        self.input_width = input_width
+
+    def predict(self, *args, **kwargs):
+        # See predict_pose_keypoints.
+        return self.predict_pose_keypoints(*args, **kwargs)
+
+    def predict_pose_keypoints(
+        self,
+        image: Image.Image,
+        raw_output: bool = False,
+    ) -> np.ndarray | Image.Image:
+        """
+        Predicts up to 17 pose keypoints for up to 10 people in the image.
+
+        Parameters:
+            image: Image on which to predict pose keypoints.
+            raw_output: bool
+                See "returns" doc section for details.
+
+        Returns:
+            If raw_output is true, returns:
+                pose_scores: np.ndarray, shape (10,)
+                    Confidence score that a given pose is real for up to 10 poses.
+                keypoint_scores: np.ndarray, shape (10, 17)
+                    Confidence score that a given keypoint is real. There can be up to
+                    10 poses and up to 17 keypoints per pose.
+                keypoint_coords: np.ndarray, shape (10, 17, 2)
+                    Coordinates of predicted keypoints in (y, x) format.
+
+            Otherwise, returns:
+                predicted_images: PIL.Image.Image
+                    Image with keypoints drawn.
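+
+        Example (an illustrative sketch; `person.jpg` is a placeholder path,
+        `PosenetMobilenet` comes from qai_hub_models.models.posenet_mobilenet.model,
+        and 513x257 matches the default get_input_spec):
+            model = PosenetMobilenet.from_pretrained()
+            app = PosenetApp(model, input_height=513, input_width=257)
+            result_image = app.predict_pose_keypoints(Image.open("person.jpg"))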
+ """ + original_size = (image.size[-2], image.size[-1]) + image, scale, padding = pil_resize_pad( + image, (self.input_height, self.input_width) + ) + tensor = transforms.ToTensor()(image) + tensor = tensor.reshape(1, 3, self.input_height, self.input_width) + + np.save("build/posenet_inputs", tensor.numpy()) + with torch.no_grad(): + ( + heatmaps_result, + offsets_result, + displacement_fwd_result, + displacement_bwd_result, + max_vals, + ) = self.model(tensor) + pose_scores, keypoint_scores, keypoint_coords = decode_multiple_poses( + heatmaps_result.squeeze(0), + offsets_result.squeeze(0), + displacement_fwd_result.squeeze(0), + displacement_bwd_result.squeeze(0), + max_vals.squeeze(0), + max_pose_detections=10, + min_pose_score=0.25, + ) + if raw_output: + return pose_scores, keypoint_scores, keypoint_coords + output_arr = np.array(image) + draw_skel_and_kp( + output_arr, + pose_scores, + keypoint_scores, + keypoint_coords, + min_pose_score=0.25, + min_part_score=0.25, + ) + image_result = Image.fromarray(output_arr) + return pil_undo_resize_pad(image_result, original_size, scale, padding) diff --git a/qai_hub_models/models/posenet_mobilenet/conftest.py b/qai_hub_models/models/posenet_mobilenet/conftest.py new file mode 100644 index 00000000..6c1bdeed --- /dev/null +++ b/qai_hub_models/models/posenet_mobilenet/conftest.py @@ -0,0 +1,39 @@ +# --------------------------------------------------------------------- +# Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. +# SPDX-License-Identifier: BSD-3-Clause +# --------------------------------------------------------------------- +# THIS FILE WAS AUTO-GENERATED. DO NOT EDIT MANUALLY. + +import inspect + +import pytest + +from qai_hub_models.models.posenet_mobilenet import Model +from qai_hub_models.utils.testing import skip_clone_repo_check + + +# Instantiate the model only once for all tests. +# Mock from_pretrained to always return the initialized model. +# This speeds up tests and limits memory leaks. +@pytest.fixture(scope="module", autouse=True) +def cached_from_pretrained(): + with pytest.MonkeyPatch.context() as mp: + pretrained_cache = {} + from_pretrained = Model.from_pretrained + sig = inspect.signature(from_pretrained) + + @skip_clone_repo_check + def _cached_from_pretrained(*args, **kwargs): + cache_key = str(args) + str(kwargs) + model = pretrained_cache.get(cache_key, None) + if model: + return model + else: + model = from_pretrained(*args, **kwargs) + pretrained_cache[cache_key] = model + return model + + _cached_from_pretrained.__signature__ = sig + + mp.setattr(Model, "from_pretrained", _cached_from_pretrained) + yield mp diff --git a/qai_hub_models/models/posenet_mobilenet/demo.py b/qai_hub_models/models/posenet_mobilenet/demo.py new file mode 100644 index 00000000..57a045f3 --- /dev/null +++ b/qai_hub_models/models/posenet_mobilenet/demo.py @@ -0,0 +1,62 @@ +# --------------------------------------------------------------------- +# Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. 
+# SPDX-License-Identifier: BSD-3-Clause +# --------------------------------------------------------------------- +from __future__ import annotations + +from typing import Type + +from qai_hub_models.models.posenet_mobilenet.app import PosenetApp +from qai_hub_models.models.posenet_mobilenet.model import ( + MODEL_ASSET_VERSION, + MODEL_ID, + PosenetMobilenet, +) +from qai_hub_models.utils.args import ( + demo_model_from_cli_args, + get_model_cli_parser, + get_on_device_demo_parser, + validate_on_device_demo_args, +) +from qai_hub_models.utils.asset_loaders import CachedWebModelAsset, load_image +from qai_hub_models.utils.display import display_or_save_image + +IMAGE_ADDRESS = CachedWebModelAsset.from_asset_store( + MODEL_ID, MODEL_ASSET_VERSION, "posenet_demo.jpg" +) + + +# The demo will display a image with the predicted keypoints. +def posenet_demo(model_cls: Type[PosenetMobilenet], is_test: bool = False): + # Demo parameters + parser = get_model_cli_parser(model_cls) + parser = get_on_device_demo_parser(parser, add_output_dir=True) + parser.add_argument( + "--image", + type=str, + default=IMAGE_ADDRESS, + help="image file path or URL", + ) + args = parser.parse_args([] if is_test else None) + validate_on_device_demo_args(args, MODEL_ID) + + # Load image & model + model = demo_model_from_cli_args(model_cls, MODEL_ID, args) + image = load_image(args.image) + print("Model Loaded") + + h, w = model_cls.get_input_spec()["image"][0][2:] + app = PosenetApp(model, h, w) + keypoints = app.predict_pose_keypoints(image) + if not is_test: + display_or_save_image( + keypoints, args.output_dir, "posenet_demo_output.png", "keypoints" + ) + + +def main(is_test: bool = False): + return posenet_demo(PosenetMobilenet, is_test) + + +if __name__ == "__main__": + main() diff --git a/qai_hub_models/models/posenet_mobilenet/export.py b/qai_hub_models/models/posenet_mobilenet/export.py new file mode 100644 index 00000000..e948410d --- /dev/null +++ b/qai_hub_models/models/posenet_mobilenet/export.py @@ -0,0 +1,215 @@ +# --------------------------------------------------------------------- +# Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. +# SPDX-License-Identifier: BSD-3-Clause +# --------------------------------------------------------------------- +# THIS FILE WAS AUTO-GENERATED. DO NOT EDIT MANUALLY. 
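+#
+# Note (illustrative, following the repository's convention for export scripts):
+# this module can typically be invoked from the command line as
+#   python -m qai_hub_models.models.posenet_mobilenet.export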
+ + +from __future__ import annotations + +import os +import warnings +from pathlib import Path +from typing import Any, Dict, List, Optional, Tuple, cast + +import qai_hub as hub +import torch + +from qai_hub_models.models.posenet_mobilenet import Model +from qai_hub_models.utils.args import ( + export_parser, + get_input_spec_kwargs, + get_model_kwargs, +) +from qai_hub_models.utils.base_model import TargetRuntime +from qai_hub_models.utils.compare import torch_inference +from qai_hub_models.utils.input_spec import make_torch_inputs +from qai_hub_models.utils.printing import ( + print_inference_metrics, + print_on_target_demo_cmd, + print_profile_metrics_from_job, +) +from qai_hub_models.utils.qai_hub_helpers import ( + can_access_qualcomm_ai_hub, + export_without_hub_access, + transpose_channel_first_to_last, +) + + +def export_model( + device: str = "Samsung Galaxy S23", + chipset: Optional[str] = None, + skip_profiling: bool = False, + skip_inferencing: bool = False, + skip_downloading: bool = False, + skip_summary: bool = False, + output_dir: Optional[str] = None, + target_runtime: TargetRuntime = TargetRuntime.TFLITE, + compile_options: str = "", + profile_options: str = "", + **additional_model_kwargs, +) -> Tuple[hub.CompileJob, Optional[hub.ProfileJob], Optional[hub.InferenceJob]] | List[ + str +]: + """ + This function accomplishes 6 main tasks: + + 1. Instantiates a PyTorch model and converts it to a traced TorchScript format. + 2. Compiles the model to an asset that can be run on device. + 3. Profiles the model performance on real devices. + 4. Inferences the model on sample inputs. + 5. Downloads the model asset to the local directory. + 6. Summarizes the results from profiling and inference. + + Each of the last four steps can be optionally skipped using the input options. + + Parameters: + device: Device for which to export the model. + Full list of available devices can be found by running `hub.get_devices()`. + Defaults to DEFAULT_DEVICE if not specified. + chipset: If set, will choose a random device with this chipset. + Overrides the `device` argument. + skip_profiling: If set, skips profiling of compiled model on real devices. + skip_inferencing: If set, skips computing on-device outputs from sample data. + skip_downloading: If set, skips downloading of compiled model. + skip_summary: If set, skips waiting for and summarizing results + from profiling and inference. + output_dir: Directory to store generated assets (e.g. compiled model). + Defaults to `/build/`. + target_runtime: Which on-device runtime to target. Default is TFLite. + compile_options: Additional options to pass when submitting the compile job. + profile_options: Additional options to pass when submitting the profile job. + **additional_model_kwargs: Additional optional kwargs used to customize + `model_cls.from_pretrained` and `model.get_input_spec` + + Returns: + A 3-tuple of: + * A CompileJob object containing metadata about the compile job submitted to hub. + * A ProfileJob containing metadata about the profile job (None if profiling skipped). + * An InferenceJob containing metadata about the inference job (None if inferencing skipped). 
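+
+    Example (illustrative only; assumes Qualcomm AI Hub access is configured):
+        export_model(device="Samsung Galaxy S23", skip_profiling=True)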
+ """ + model_name = "posenet_mobilenet" + output_path = Path(output_dir or Path.cwd() / "build" / model_name) + if chipset: + hub_device = hub.Device(attributes=f"chipset:{chipset}") + else: + hub_device = hub.Device(name=device) + if not can_access_qualcomm_ai_hub(): + return export_without_hub_access( + "posenet_mobilenet", + "Posenet-Mobilenet", + device, + skip_profiling, + skip_inferencing, + skip_downloading, + skip_summary, + output_path, + target_runtime, + compile_options, + profile_options, + ) + + # 1. Initialize PyTorch model + model = Model.from_pretrained(**get_model_kwargs(Model, additional_model_kwargs)) + input_spec = model.get_input_spec( + **get_input_spec_kwargs(model, additional_model_kwargs) + ) + + # Trace the model + model.eval() + source_model = torch.jit.trace(model.to("cpu"), make_torch_inputs(input_spec)) + + # Convert outputs from channel last to channel first (preferred I/O format for QNN and TensorFlow Lite) + channel_last_flags = ( + " --force_channel_last_input image" + if target_runtime != TargetRuntime.ORT + else "" + ) + + # 2. Compile the model to an on-device asset + model_compile_options = model.get_hub_compile_options( + target_runtime, compile_options + channel_last_flags, hub_device + ) + print(f"Optimizing model {model_name} to run on-device") + submitted_compile_job = hub.submit_compile_job( + model=source_model, + input_specs=input_spec, + device=hub_device, + name=model_name, + options=model_compile_options, + ) + compile_job = cast(hub.client.CompileJob, submitted_compile_job) + + # 3. Profile the model asset on real devices + profile_job: Optional[hub.client.ProfileJob] = None + if not skip_profiling: + profile_options_all = model.get_hub_profile_options( + target_runtime, profile_options + ) + print(f"Profiling model {model_name} on a hosted device.") + submitted_profile_job = hub.submit_profile_job( + model=compile_job.get_target_model(), + device=hub_device, + name=model_name, + options=profile_options_all, + ) + profile_job = cast(hub.client.ProfileJob, submitted_profile_job) + + # 4. Run inference on-device with sample inputs + inference_job: Optional[hub.client.InferenceJob] = None + if not skip_inferencing: + profile_options_all = model.get_hub_profile_options( + target_runtime, profile_options + ) + print( + f"Running inference for {model_name} on a hosted device with example inputs." + ) + sample_inputs = model.sample_inputs(input_spec) + # Convert inputs from channel first to channel last + hub_inputs = ( + sample_inputs + if target_runtime == TargetRuntime.ORT + else transpose_channel_first_to_last("image", sample_inputs, target_runtime) + ) + submitted_inference_job = hub.submit_inference_job( + model=compile_job.get_target_model(), + inputs=hub_inputs, + device=hub_device, + name=model_name, + options=profile_options_all, + ) + inference_job = cast(hub.client.InferenceJob, submitted_inference_job) + + # 5. Download the model asset to a local file + if not skip_downloading: + os.makedirs(output_path, exist_ok=True) + target_model: hub.Model = compile_job.get_target_model() # type: ignore + target_model.download(str(output_path / f"{model_name}.tflite")) + + # 6. 
Summarize the results from profiling and inference + if not skip_summary and not skip_profiling: + assert profile_job is not None and profile_job.wait().success + profile_data: Dict[str, Any] = profile_job.download_profile() # type: ignore + print_profile_metrics_from_job(profile_job, profile_data) + + if not skip_summary and not skip_inferencing: + torch_out = torch_inference(model, sample_inputs) + assert inference_job is not None and inference_job.wait().success + inference_result: hub.client.DatasetEntries = inference_job.download_output_data() # type: ignore + print_inference_metrics(inference_job, inference_result, torch_out) + + if not skip_summary: + print_on_target_demo_cmd(compile_job, Path(__file__).parent.resolve(), device) + + return (compile_job, profile_job, inference_job) + + +def main(): + warnings.filterwarnings("ignore") + parser = export_parser(model_cls=Model) + args = parser.parse_args() + export_model(**vars(args)) + + +if __name__ == "__main__": + main() diff --git a/qai_hub_models/models/posenet_mobilenet/info.yaml b/qai_hub_models/models/posenet_mobilenet/info.yaml new file mode 100644 index 00000000..beaeaabb --- /dev/null +++ b/qai_hub_models/models/posenet_mobilenet/info.yaml @@ -0,0 +1,39 @@ +name: Posenet-Mobilenet +# id must match with the model dir name in qai_hub_models +id: posenet_mobilenet +status: public +headline: Perform accurate human pose estimation. +domain: Computer Vision +use_case: Pose Estimation +description: Posenet performs pose estimation on human images. +tags: [] +research_paper: https://arxiv.org/abs/1803.08225 +research_paper_title: 'PersonLab: Person Pose Estimation and Instance Segmentation + with a Bottom-Up, Part-Based, Geometric Embedding Model' +license: https://github.com/rwightman/posenet-pytorch/blob/master/LICENSE.txt +deploy_license: + https://qaihub-public-assets.s3.us-west-2.amazonaws.com/qai-hub-models/Qualcomm+AI+Hub+Proprietary+License.pdf +source_repo: https://github.com/rwightman/posenet-pytorch +technical_details: + Model checkpoint: mobilenet_v1_101 + Input resolution: 257x193 + Number of parameters: 3.31M + Model size: 12.7 MB +applicable_scenarios: + - Injury prevention training + - Sports performance analysis + - Posture recognition +form_factors: + - Phone + - Tablet + - IoT +related_models: + - litehrnet + - openpose + - hrnet_pose +has_static_banner: yes +has_animated_banner: yes +license_type: other +deploy_license_type: AI Model Hub License +dataset: + - coco diff --git a/qai_hub_models/models/posenet_mobilenet/model.py b/qai_hub_models/models/posenet_mobilenet/model.py new file mode 100644 index 00000000..c4f17782 --- /dev/null +++ b/qai_hub_models/models/posenet_mobilenet/model.py @@ -0,0 +1,81 @@ +# --------------------------------------------------------------------- +# Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. 
+# SPDX-License-Identifier: BSD-3-Clause +# --------------------------------------------------------------------- +from __future__ import annotations + +import os +from pathlib import Path + +import torch.nn as nn +import torch.nn.functional as F + +from qai_hub_models.models.common import SampleInputsType +from qai_hub_models.utils.asset_loaders import ( + CachedWebModelAsset, + SourceAsRoot, + load_numpy, +) +from qai_hub_models.utils.base_model import BaseModel +from qai_hub_models.utils.input_spec import InputSpec + +MODEL_ID = __name__.split(".")[-2] +MODEL_ASSET_VERSION = 2 +SOURCE_REPOSITORY = "https://github.com/rwightman/posenet-pytorch" +COMMIT_HASH = "6f7376d47683553b99d6b67734bc8b368dbcda73" +SAMPLE_INPUTS = CachedWebModelAsset.from_asset_store( + MODEL_ID, MODEL_ASSET_VERSION, "posenet_inputs.npy" +) +DEFAULT_MODEL_WEIGHTS = CachedWebModelAsset.from_asset_store( + MODEL_ID, MODEL_ASSET_VERSION, "mobilenet_v1_101.pth" +) +OUTPUT_STRIDE = 16 + + +class PosenetMobilenet(BaseModel): + def __init__(self, model: nn.Module) -> None: + super().__init__() + self.model = model + + @classmethod + def from_pretrained( + cls, + model_id: int = 101, + ) -> PosenetMobilenet: + with SourceAsRoot( + SOURCE_REPOSITORY, + COMMIT_HASH, + MODEL_ID, + MODEL_ASSET_VERSION, + ) as repo_path: + # Built in weights downloading is sometimes flaky. + # Download default weights from Qualcomm AWS + ckpt_path = Path(repo_path) / "_models" / DEFAULT_MODEL_WEIGHTS.path().name + if not ckpt_path.exists(): + DEFAULT_MODEL_WEIGHTS.fetch() + os.makedirs(ckpt_path.parent, exist_ok=True) + os.symlink(DEFAULT_MODEL_WEIGHTS.path(), ckpt_path) + + import posenet + + model = posenet.load_model(model_id) + + return cls(model).eval() + + def forward(self, image): + """ + Image inputs are expected to be in RGB format in the range [0, 1]. + """ + raw_output = self.model(image * 2.0 - 1.0) + max_vals = F.max_pool2d(raw_output[0], 3, stride=1, padding=1) + return (*raw_output, max_vals) + + @staticmethod + def get_input_spec( + height: int = 513, + width: int = 257, + ) -> InputSpec: + return {"image": ((1, 3, height, width), "float32")} + + def sample_inputs(self, input_spec: InputSpec | None = None) -> SampleInputsType: + return {"image": [load_numpy(SAMPLE_INPUTS)]} diff --git a/qai_hub_models/models/posenet_mobilenet/test.py b/qai_hub_models/models/posenet_mobilenet/test.py new file mode 100644 index 00000000..889ec641 --- /dev/null +++ b/qai_hub_models/models/posenet_mobilenet/test.py @@ -0,0 +1,53 @@ +# --------------------------------------------------------------------- +# Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. 
+# SPDX-License-Identifier: BSD-3-Clause +# --------------------------------------------------------------------- +import numpy as np + +from qai_hub_models.models.posenet_mobilenet.app import PosenetApp +from qai_hub_models.models.posenet_mobilenet.demo import IMAGE_ADDRESS +from qai_hub_models.models.posenet_mobilenet.demo import main as demo_main +from qai_hub_models.models.posenet_mobilenet.model import ( + MODEL_ASSET_VERSION, + MODEL_ID, + PosenetMobilenet, +) +from qai_hub_models.utils.asset_loaders import ( + CachedWebModelAsset, + load_image, + load_numpy, +) +from qai_hub_models.utils.testing import skip_clone_repo_check + +KEYPOINT_SCORES_GT = CachedWebModelAsset.from_asset_store( + MODEL_ID, MODEL_ASSET_VERSION, "keypoint_scores_gt.npy" +) +KEYPOINT_COORDS_GT = CachedWebModelAsset.from_asset_store( + MODEL_ID, MODEL_ASSET_VERSION, "keypoint_coords_gt.npy" +) + + +@skip_clone_repo_check +def test_task(): + image = load_image(IMAGE_ADDRESS) + model = PosenetMobilenet.from_pretrained() + h, w = PosenetMobilenet.get_input_spec()["image"][0][2:] + app = PosenetApp(model, h, w) + pose_scores, keypoint_scores, keypoint_coords = app.predict(image, raw_output=True) + + assert pose_scores[0] >= 0.5 + assert pose_scores[1] >= 0.5 + for score in pose_scores[2:]: + assert score < 1e-4 + + np.testing.assert_allclose( + keypoint_scores[:2], load_numpy(KEYPOINT_SCORES_GT), atol=1e-4 + ) + np.testing.assert_allclose( + keypoint_coords[:2], load_numpy(KEYPOINT_COORDS_GT), atol=1e-4 + ) + + +@skip_clone_repo_check +def test_demo(): + demo_main(is_test=True) diff --git a/qai_hub_models/models/protocols.py b/qai_hub_models/models/protocols.py index adfdcf5e..1d79a391 100644 --- a/qai_hub_models/models/protocols.py +++ b/qai_hub_models/models/protocols.py @@ -20,9 +20,9 @@ from abc import abstractmethod from pathlib import Path -from typing import Any, Protocol, Type, TypeVar, runtime_checkable +from typing import Any, List, Optional, Protocol, Type, TypeVar, runtime_checkable -from qai_hub.client import DatasetEntries, SourceModel +from qai_hub.client import DatasetEntries, Device, SourceModel from qai_hub_models.evaluators.base_evaluators import BaseEvaluator, _DataLoader from qai_hub_models.models.common import ( @@ -200,6 +200,8 @@ def convert_to_hub_source_model( output_path: str | Path, input_spec: InputSpec | None = None, check_trace: bool = True, + external_onnx_weights: bool = False, + output_names: Optional[List[str]] = None, ) -> SourceModel: ... @@ -207,6 +209,7 @@ def get_hub_compile_options( self, target_runtime: TargetRuntime, other_compile_options: str = "", + device: Optional[Device] = None, ) -> str: """ AI Hub compile options recommended for the model. diff --git a/qai_hub_models/models/quicksrnetlarge/README.md b/qai_hub_models/models/quicksrnetlarge/README.md index af704650..12c61b60 100644 --- a/qai_hub_models/models/quicksrnetlarge/README.md +++ b/qai_hub_models/models/quicksrnetlarge/README.md @@ -14,6 +14,8 @@ accross various devices, can be found [here](https://aihub.qualcomm.com/models/q a hosted Qualcomm® device. 
+ + ## Example & Usage diff --git a/qai_hub_models/models/quicksrnetlarge/export.py b/qai_hub_models/models/quicksrnetlarge/export.py index c25dbb5a..4ea18e1d 100644 --- a/qai_hub_models/models/quicksrnetlarge/export.py +++ b/qai_hub_models/models/quicksrnetlarge/export.py @@ -120,12 +120,16 @@ def export_model( model.eval() source_model = torch.jit.trace(model.to("cpu"), make_torch_inputs(input_spec)) + # Convert outputs from channel last to channel first (preferred I/O format for QNN and TensorFlow Lite) + channel_last_flags = ( + " --force_channel_last_input image" + " --force_channel_last_output output_0" + if target_runtime != TargetRuntime.ORT + else "" + ) + # 2. Compile the model to an on-device asset model_compile_options = model.get_hub_compile_options( - target_runtime, - compile_options - + " --force_channel_last_input image" - + " --force_channel_last_output output_0", + target_runtime, compile_options + channel_last_flags, hub_device ) print(f"Optimizing model {model_name} to run on-device") submitted_compile_job = hub.submit_compile_job( @@ -163,8 +167,10 @@ def export_model( ) sample_inputs = model.sample_inputs(input_spec) # Convert inputs from channel first to channel last - hub_inputs = transpose_channel_first_to_last( - "image", sample_inputs, target_runtime + hub_inputs = ( + sample_inputs + if target_runtime == TargetRuntime.ORT + else transpose_channel_first_to_last("image", sample_inputs, target_runtime) ) submitted_inference_job = hub.submit_inference_job( model=compile_job.get_target_model(), @@ -192,8 +198,12 @@ def export_model( assert inference_job is not None and inference_job.wait().success inference_result: hub.client.DatasetEntries = inference_job.download_output_data() # type: ignore # Convert outputs from channel last to channel first - inference_result = transpose_channel_last_to_first( - "output_0", inference_result, target_runtime + inference_result = ( + inference_result + if target_runtime == TargetRuntime.ORT + else transpose_channel_last_to_first( + "output_0", inference_result, target_runtime + ) ) print_inference_metrics(inference_job, inference_result, torch_out) @@ -205,7 +215,7 @@ def export_model( def main(): warnings.filterwarnings("ignore") - parser = export_parser(model_cls=Model, supports_ort=False) + parser = export_parser(model_cls=Model) args = parser.parse_args() export_model(**vars(args)) diff --git a/qai_hub_models/models/quicksrnetlarge/model.py b/qai_hub_models/models/quicksrnetlarge/model.py index bfed7f6c..bac993cc 100644 --- a/qai_hub_models/models/quicksrnetlarge/model.py +++ b/qai_hub_models/models/quicksrnetlarge/model.py @@ -57,7 +57,7 @@ def from_pretrained(cls) -> QuickSRNetLarge: def get_evaluator(self) -> BaseEvaluator: return SuperResolutionOutputEvaluator() - def forward(self, image: torch.Tensor) -> torch.Tensor: + def forward(self, image): """ Run QuickSRNet-Large on `image`, and produce an upscaled image diff --git a/qai_hub_models/models/quicksrnetlarge/perf.yaml b/qai_hub_models/models/quicksrnetlarge/perf.yaml index c94c1aaa..08f17738 100644 --- a/qai_hub_models/models/quicksrnetlarge/perf.yaml +++ b/qai_hub_models/models/quicksrnetlarge/perf.yaml @@ -22,6 +22,7 @@ aggregated: - Samsung Galaxy S24 Ultra - Samsung Galaxy S24+ - Samsung Galaxy Tab S8 + - Snapdragon X Elite CRD - Xiaomi 12 - Xiaomi 12 Pro supported_chipsets: @@ -30,15 +31,16 @@ aggregated: - Snapdragon® 8 Gen 2 - Snapdragon® 8 Gen 3 - Snapdragon® 888 + - Snapdragon® X Elite models: - name: QuickSRNetLarge performance_metrics: - 
torchscript_onnx_tflite: - inference_time: 2492.0 - throughput: 401.2841091492777 + inference_time: 2434.0 + throughput: 410.84634346754314 estimated_peak_memory_range: - min: 16384 - max: 8350520 + min: 24576 + max: 1530712 primary_compute_unit: NPU precision: fp16 layer_info: @@ -46,14 +48,14 @@ models: layers_on_gpu: 0 layers_on_cpu: 3 total_layers: 31 - job_id: j1p801z8g + job_id: j7gjln9xp job_status: Passed torchscript_onnx_qnn: - inference_time: 2101.0 - throughput: 475.9638267491671 + inference_time: 2102.0 + throughput: 475.7373929590866 estimated_peak_memory_range: - min: 225280 - max: 5584760 + min: 16384 + max: 6719848 primary_compute_unit: NPU precision: fp16 layer_info: @@ -61,22 +63,22 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 31 - job_id: jn5qev3m5 + job_id: jz5w96j6p job_status: Passed torchscript_onnx_ort: - inference_time: 2738.0 - throughput: 365.23009495982467 + inference_time: 2677.0 + throughput: 373.55248412401943 estimated_peak_memory_range: - min: 12288 - max: 5692928 + min: 28672 + max: 47131704 primary_compute_unit: NPU precision: fp16 layer_info: - layers_on_npu: 1 + layers_on_npu: 33 layers_on_gpu: 0 layers_on_cpu: 0 - total_layers: 1 - job_id: jw56ewn7g + total_layers: 33 + job_id: jz57drql5 job_status: Passed reference_device_info: name: Samsung Galaxy S23 @@ -85,13 +87,13 @@ models: os_name: Android manufacturer: Samsung chipset: Snapdragon® 8 Gen 2 - timestamp: '2024-04-23T18:42:32.816274Z' + timestamp: '2024-05-20T16:35:30.164349Z' - torchscript_onnx_tflite: - inference_time: 1917.0 - throughput: 521.6484089723526 + inference_time: 1778.0 + throughput: 562.429696287964 estimated_peak_memory_range: min: 16384 - max: 28332832 + max: 28468960 primary_compute_unit: NPU precision: fp16 layer_info: @@ -99,14 +101,14 @@ models: layers_on_gpu: 0 layers_on_cpu: 3 total_layers: 31 - job_id: jogk783op + job_id: jlpevmq15 job_status: Passed torchscript_onnx_qnn: - inference_time: 1500.0 - throughput: 666.6666666666666 + inference_time: 1506.0 + throughput: 664.0106241699867 estimated_peak_memory_range: - min: 208896 - max: 17648384 + min: 204800 + max: 21459584 primary_compute_unit: NPU precision: fp16 layer_info: @@ -114,22 +116,22 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 31 - job_id: j1gl6l3lg + job_id: jmg94n6l5 job_status: Passed torchscript_onnx_ort: - inference_time: 1897.0 - throughput: 527.1481286241434 + inference_time: 1850.0 + throughput: 540.5405405405405 estimated_peak_memory_range: min: 212992 - max: 19230192 + max: 18821168 primary_compute_unit: NPU precision: fp16 layer_info: - layers_on_npu: 1 + layers_on_npu: 33 layers_on_gpu: 0 layers_on_cpu: 0 - total_layers: 1 - job_id: j1p3v6ezg + total_layers: 33 + job_id: jqp4wrzvg job_status: Passed reference_device_info: name: Samsung Galaxy S24 @@ -138,13 +140,13 @@ models: os_name: Android manufacturer: Samsung chipset: Snapdragon® 8 Gen 3 - timestamp: '2024-04-23T18:42:32.816325Z' + timestamp: '2024-05-20T16:35:30.164374Z' - torchscript_onnx_tflite: - inference_time: 2485.0 - throughput: 402.4144869215292 + inference_time: 2448.0 + throughput: 408.4967320261438 estimated_peak_memory_range: - min: 32768 - max: 1755936 + min: 16384 + max: 7574720 primary_compute_unit: NPU precision: fp16 layer_info: @@ -152,14 +154,14 @@ models: layers_on_gpu: 0 layers_on_cpu: 3 total_layers: 31 - job_id: j1p80kexg + job_id: jygz7d6kp job_status: Passed torchscript_onnx_qnn: inference_time: 2097.0 throughput: 476.87172150691464 estimated_peak_memory_range: - min: 225280 - max: 13035320 + 
min: 212992 + max: 78311448 primary_compute_unit: NPU precision: fp16 layer_info: @@ -167,7 +169,7 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 31 - job_id: jw56e080g + job_id: jvgdv1jeg job_status: Passed reference_device_info: name: QCS8550 (Proxy) @@ -176,4 +178,57 @@ models: os_name: Android manufacturer: Qualcomm chipset: Qcs8550 - timestamp: '2024-04-23T18:42:32.816351Z' + timestamp: '2024-05-20T16:35:30.164392Z' + - torchscript_onnx_qnn: + inference_time: 2961.0 + throughput: 337.7237419790611 + estimated_peak_memory_range: + min: 217088 + max: 217088 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 31 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 31 + job_id: jnp18zr2g + job_status: Passed + torchscript_onnx_ort: + inference_time: 2660.0 + throughput: 375.9398496240602 + estimated_peak_memory_range: + min: 13025280 + max: 13025280 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 33 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 33 + job_id: j0px1ow1g + job_status: Passed + torchscript_onnx_ort_dml_gpu: + inference_time: 14976.0 + throughput: 66.77350427350427 + estimated_peak_memory_range: + min: 31150080 + max: 31150080 + primary_compute_unit: CPU + precision: fp32 + layer_info: + layers_on_npu: 0 + layers_on_gpu: 0 + layers_on_cpu: 15 + total_layers: 15 + job_id: jo5mzxjwp + job_status: Passed + reference_device_info: + name: Snapdragon X Elite CRD + os: '11' + form_factor: Compute + os_name: Windows + manufacturer: Qualcomm + chipset: Snapdragon® X Elite + timestamp: '2024-05-20T16:35:30.164414Z' diff --git a/qai_hub_models/models/quicksrnetlarge_quantized/README.md b/qai_hub_models/models/quicksrnetlarge_quantized/README.md index 23624f4c..f749b985 100644 --- a/qai_hub_models/models/quicksrnetlarge_quantized/README.md +++ b/qai_hub_models/models/quicksrnetlarge_quantized/README.md @@ -14,6 +14,8 @@ accross various devices, can be found [here](https://aihub.qualcomm.com/models/q a hosted Qualcomm® device. + + ## Example & Usage diff --git a/qai_hub_models/models/quicksrnetlarge_quantized/export.py b/qai_hub_models/models/quicksrnetlarge_quantized/export.py index 4cb7e308..125242cf 100644 --- a/qai_hub_models/models/quicksrnetlarge_quantized/export.py +++ b/qai_hub_models/models/quicksrnetlarge_quantized/export.py @@ -124,12 +124,16 @@ def export_model( else: quant_calibration_data = model.get_calibration_data(target_runtime, input_spec) + # Convert outputs from channel last to channel first (preferred I/O format for QNN and TensorFlow Lite) + channel_last_flags = ( + " --force_channel_last_input image" + " --force_channel_last_output output_0" + if target_runtime != TargetRuntime.ORT + else "" + ) + # 2. 
Compile the model to an on-device asset model_compile_options = model.get_hub_compile_options( - target_runtime, - compile_options - + " --force_channel_last_input image" - + " --force_channel_last_output output_0", + target_runtime, compile_options + channel_last_flags, hub_device ) print(f"Optimizing model {model_name} to run on-device") submitted_compile_job = hub.submit_compile_job( @@ -171,8 +175,10 @@ def export_model( if target_runtime == TargetRuntime.QNN: hub_inputs = get_qnn_inputs(compile_job, sample_inputs) # Convert inputs from channel first to channel last - hub_inputs = transpose_channel_first_to_last( - "image", sample_inputs, target_runtime + hub_inputs = ( + sample_inputs + if target_runtime == TargetRuntime.ORT + else transpose_channel_first_to_last("image", sample_inputs, target_runtime) ) submitted_inference_job = hub.submit_inference_job( model=compile_job.get_target_model(), @@ -200,8 +206,12 @@ def export_model( assert inference_job is not None and inference_job.wait().success inference_result: hub.client.DatasetEntries = inference_job.download_output_data() # type: ignore # Convert outputs from channel last to channel first - inference_result = transpose_channel_last_to_first( - "output_0", inference_result, target_runtime + inference_result = ( + inference_result + if target_runtime == TargetRuntime.ORT + else transpose_channel_last_to_first( + "output_0", inference_result, target_runtime + ) ) print_inference_metrics(inference_job, inference_result, torch_out) @@ -213,7 +223,7 @@ def export_model( def main(): warnings.filterwarnings("ignore") - parser = export_parser(model_cls=Model, supports_qnn=False, supports_ort=False) + parser = export_parser(model_cls=Model) args = parser.parse_args() export_model(**vars(args)) diff --git a/qai_hub_models/models/quicksrnetlarge_quantized/model.py b/qai_hub_models/models/quicksrnetlarge_quantized/model.py index e9185b68..b1541f6d 100644 --- a/qai_hub_models/models/quicksrnetlarge_quantized/model.py +++ b/qai_hub_models/models/quicksrnetlarge_quantized/model.py @@ -8,30 +8,24 @@ # This verifies aimet is installed, and this must be included first. 
from qai_hub_models.utils.quantization_aimet import ( AIMETQuantizableMixin, + constrain_quantized_inputs_to_image_range, ) # isort: on import torch from aimet_torch.cross_layer_equalization import equalize_model +from aimet_torch.model_preparer import prepare_model from aimet_torch.quantsim import QuantizationSimModel, load_encodings_to_sim -from qai_hub_models.models.common import SourceModelFormat, TargetRuntime from qai_hub_models.models.quicksrnetlarge.model import QuickSRNetLarge -from qai_hub_models.utils.aimet.config_loader import get_default_aimet_config_legacy_v2 +from qai_hub_models.utils.aimet.config_loader import get_default_aimet_config from qai_hub_models.utils.asset_loaders import CachedWebModelAsset MODEL_ID = __name__.split(".")[-2] -MODEL_ASSET_VERSION = 2 +MODEL_ASSET_VERSION = 3 -# Weights and config stored in S3 are sourced from -# https://github.com/quic/aimet-model-zoo/blob/develop/aimet_zoo_torch/quicksrnet/model/model_cards/quicksrnet_large_4x_w8a8.json: -# https://github.com/quic/aimet-model-zoo/releases/download/phase_2_january_artifacts/quicksrnet_large_4x_checkpoint_int8.pth -# and -# https://raw.githubusercontent.com/quic/aimet/release-aimet-1.23/TrainingExtensions/common/src/python/aimet_common/quantsim_config/default_config_per_channel.js -# Encodings were generated with AIMET QuantSim library -QUANTIZED_WEIGHTS = "quicksrnet_large_4x_checkpoint_int8.pth" -AIMET_ENCODINGS = "aimet_quantization_encodings.json" +DEFAULT_ENCODINGS = "quicksrnetlarge_quantized_encodings.json" SCALING_FACTOR = 4 @@ -46,9 +40,7 @@ def __init__( quicksrnet_model: QuantizationSimModel, ) -> None: QuickSRNetLarge.__init__(self, quicksrnet_model.model) - AIMETQuantizableMixin.__init__( - self, quicksrnet_model, needs_onnx_direct_aimet_export=True - ) + AIMETQuantizableMixin.__init__(self, quicksrnet_model) @classmethod def from_pretrained( @@ -63,46 +55,27 @@ def from_pretrained( else: Interprets as a filepath and loads the encodings stored there. """ # Load Model - quicksrnet = QuickSRNetLarge.from_pretrained() - input_shape = quicksrnet.get_input_spec()["image"][0] - equalize_model(quicksrnet, input_shape) - - # Download weights and quantization parameters - weights = CachedWebModelAsset.from_asset_store( - MODEL_ID, MODEL_ASSET_VERSION, QUANTIZED_WEIGHTS - ).fetch() - aimet_config = get_default_aimet_config_legacy_v2() + fp16_model = QuickSRNetLarge.from_pretrained() + input_shape = cls.get_input_spec()["image"][0] + model = prepare_model(fp16_model) + equalize_model(model, input_shape) - # Load the model weights and quantization parameters - # In this particular instance, the state_dict keys from the model are all named "model." - # where is the name of each key in the weights file - without the word model. - # We rename all the keys to add the word model - state_dict = torch.load(weights, map_location=torch.device("cpu"))["state_dict"] - new_state_dict = {"model." 
+ key: value for key, value in state_dict.items()} - quicksrnet.load_state_dict(new_state_dict) sim = QuantizationSimModel( - quicksrnet, + model, quant_scheme="tf_enhanced", default_param_bw=8, default_output_bw=8, - config_file=aimet_config, + config_file=get_default_aimet_config(), dummy_input=torch.rand(input_shape), ) + constrain_quantized_inputs_to_image_range(sim) if aimet_encodings: if aimet_encodings == "DEFAULT": aimet_encodings = CachedWebModelAsset.from_asset_store( - MODEL_ID, MODEL_ASSET_VERSION, AIMET_ENCODINGS + MODEL_ID, MODEL_ASSET_VERSION, DEFAULT_ENCODINGS ).fetch() load_encodings_to_sim(sim, aimet_encodings) sim.model.eval() return cls(sim) - - def preferred_hub_source_model_format( - self, target_runtime: TargetRuntime - ) -> SourceModelFormat: - if target_runtime == TargetRuntime.QNN: - return SourceModelFormat.ONNX - else: - return SourceModelFormat.TORCHSCRIPT diff --git a/qai_hub_models/models/quicksrnetlarge_quantized/perf.yaml b/qai_hub_models/models/quicksrnetlarge_quantized/perf.yaml index 5108803d..6ba5074a 100644 --- a/qai_hub_models/models/quicksrnetlarge_quantized/perf.yaml +++ b/qai_hub_models/models/quicksrnetlarge_quantized/perf.yaml @@ -26,6 +26,7 @@ aggregated: - Samsung Galaxy S24 Ultra - Samsung Galaxy S24+ - Samsung Galaxy Tab S8 + - Snapdragon X Elite CRD - Xiaomi 12 - Xiaomi 12 Pro supported_chipsets: @@ -36,15 +37,16 @@ aggregated: - Snapdragon® 8 Gen 2 - Snapdragon® 8 Gen 3 - Snapdragon® 888 + - Snapdragon® X Elite models: - name: QuickSRNetLarge-Quantized performance_metrics: - torchscript_onnx_tflite: - inference_time: 1512.0 - throughput: 661.3756613756614 + inference_time: 1340.0 + throughput: 746.2686567164179 estimated_peak_memory_range: - min: 20480 - max: 1404424 + min: 16384 + max: 1701800 primary_compute_unit: NPU precision: int8 layer_info: @@ -52,7 +54,37 @@ models: layers_on_gpu: 0 layers_on_cpu: 3 total_layers: 31 - job_id: j1pv07vm5 + job_id: jegnevjrg + job_status: Passed + torchscript_onnx_qnn: + inference_time: 1156.0 + throughput: 865.0519031141869 + estimated_peak_memory_range: + min: 16384 + max: 8330600 + primary_compute_unit: NPU + precision: int8 + layer_info: + layers_on_npu: 19 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 19 + job_id: jqpyd397p + job_status: Passed + torchscript_onnx_ort: + inference_time: 1455.0 + throughput: 687.2852233676975 + estimated_peak_memory_range: + min: 212992 + max: 8065904 + primary_compute_unit: NPU + precision: int8 + layer_info: + layers_on_npu: 24 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 24 + job_id: jn5q2qj45 job_status: Passed reference_device_info: name: Samsung Galaxy S23 @@ -61,13 +93,13 @@ models: os_name: Android manufacturer: Samsung chipset: Snapdragon® 8 Gen 2 - timestamp: '2024-04-23T18:42:32.840385Z' + timestamp: '2024-05-20T16:35:30.195379Z' - torchscript_onnx_tflite: - inference_time: 1167.0 - throughput: 856.898029134533 + inference_time: 996.0 + throughput: 1004.0160642570281 estimated_peak_memory_range: - min: 12288 - max: 25644128 + min: 16384 + max: 24755152 primary_compute_unit: NPU precision: int8 layer_info: @@ -75,7 +107,37 @@ models: layers_on_gpu: 0 layers_on_cpu: 3 total_layers: 31 - job_id: j7gjzqe85 + job_id: jopry3z9g + job_status: Passed + torchscript_onnx_qnn: + inference_time: 811.0 + throughput: 1233.0456226880394 + estimated_peak_memory_range: + min: 12288 + max: 18436512 + primary_compute_unit: NPU + precision: int8 + layer_info: + layers_on_npu: 19 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 19 + job_id: j2p0r0n6p + 
job_status: Passed + torchscript_onnx_ort: + inference_time: 1054.0 + throughput: 948.7666034155598 + estimated_peak_memory_range: + min: 0 + max: 16738208 + primary_compute_unit: NPU + precision: int8 + layer_info: + layers_on_npu: 24 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 24 + job_id: j1glkmj8p job_status: Passed reference_device_info: name: Samsung Galaxy S24 @@ -84,73 +146,156 @@ models: os_name: Android manufacturer: Samsung chipset: Snapdragon® 8 Gen 3 - timestamp: '2024-04-23T18:42:32.840407Z' + timestamp: '2024-05-20T16:35:30.195406Z' - torchscript_onnx_tflite: - inference_time: 6024.0 - throughput: 166.00265604249668 + inference_time: 1313.0 + throughput: 761.6146230007616 estimated_peak_memory_range: - min: 40960 - max: 19668928 + min: 360448 + max: 2507680 primary_compute_unit: NPU precision: int8 layer_info: - layers_on_npu: 30 + layers_on_npu: 28 layers_on_gpu: 0 layers_on_cpu: 3 - total_layers: 33 - job_id: jz5w2ry65 + total_layers: 31 + job_id: jep2my245 + job_status: Passed + torchscript_onnx_qnn: + inference_time: 1162.0 + throughput: 860.5851979345955 + estimated_peak_memory_range: + min: 20480 + max: 11496296 + primary_compute_unit: NPU + precision: int8 + layer_info: + layers_on_npu: 19 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 19 + job_id: jogkyxj2p job_status: Passed reference_device_info: - name: RB3 Gen 2 (Proxy) + name: QCS8550 (Proxy) os: '12' form_factor: Iot os_name: Android manufacturer: Qualcomm - chipset: Qcs6490 - timestamp: '2024-04-23T18:42:32.840436Z' + chipset: Qcs8550 + timestamp: '2024-05-20T16:35:30.195422Z' - torchscript_onnx_tflite: - inference_time: 41995.0 - throughput: 23.81235861412073 + inference_time: 4195.0 + throughput: 238.37902264600714 estimated_peak_memory_range: - min: 1863680 - max: 4699224 + min: 45056 + max: 18644448 primary_compute_unit: NPU precision: int8 layer_info: - layers_on_npu: 30 + layers_on_npu: 28 layers_on_gpu: 0 layers_on_cpu: 3 - total_layers: 33 - job_id: j1p31omxg + total_layers: 31 + job_id: jz5wqzvm5 job_status: Passed + torchscript_onnx_qnn: + inference_time: 'null' + throughput: 'null' + estimated_peak_memory_range: + min: 0 + max: 0 + primary_compute_unit: 'null' + precision: 'null' + layer_info: + layers_on_npu: 0 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 0 + job_id: j0pxyrjlg + job_status: Failed reference_device_info: - name: RB5 (Proxy) + name: RB3 Gen 2 (Proxy) os: '12' form_factor: Iot os_name: Android manufacturer: Qualcomm - chipset: Qcs8250 - timestamp: '2024-04-23T18:42:32.840455Z' + chipset: Qcs6490 + timestamp: '2024-05-20T16:35:30.195441Z' - torchscript_onnx_tflite: - inference_time: 1874.0 - throughput: 533.6179295624333 + inference_time: 37890.0 + throughput: 26.392187912377935 estimated_peak_memory_range: - min: 24576 - max: 6948872 + min: 3629056 + max: 6133384 primary_compute_unit: NPU precision: int8 layer_info: - layers_on_npu: 30 + layers_on_npu: 28 layers_on_gpu: 0 layers_on_cpu: 3 - total_layers: 33 - job_id: jygzo0lk5 + total_layers: 31 + job_id: jmg9w218p job_status: Passed reference_device_info: - name: QCS8550 (Proxy) + name: RB5 (Proxy) os: '12' form_factor: Iot os_name: Android manufacturer: Qualcomm - chipset: Qcs8550 - timestamp: '2024-04-23T18:42:32.840469Z' + chipset: Qcs8250 + timestamp: '2024-05-20T16:35:30.195453Z' + - torchscript_onnx_qnn: + inference_time: 1082.0 + throughput: 924.2144177449168 + estimated_peak_memory_range: + min: 53248 + max: 53248 + primary_compute_unit: NPU + precision: int8 + layer_info: + layers_on_npu: 19 + 
layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 19 + job_id: j1p87ylx5 + job_status: Passed + torchscript_onnx_ort: + inference_time: 1417.0 + throughput: 705.7163020465773 + estimated_peak_memory_range: + min: 8822784 + max: 8822784 + primary_compute_unit: NPU + precision: int8 + layer_info: + layers_on_npu: 24 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 24 + job_id: jw5614k0p + job_status: Passed + torchscript_onnx_ort_dml_gpu: + inference_time: 212078.0 + throughput: 4.715246277313064 + estimated_peak_memory_range: + min: 29732864 + max: 29732864 + primary_compute_unit: GPU + precision: int8 + layer_info: + layers_on_npu: 0 + layers_on_gpu: 1 + layers_on_cpu: 0 + total_layers: 1 + job_id: j1p3m0ylg + job_status: Passed + reference_device_info: + name: Snapdragon X Elite CRD + os: '11' + form_factor: Compute + os_name: Windows + manufacturer: Qualcomm + chipset: Snapdragon® X Elite + timestamp: '2024-05-20T16:35:30.195475Z' diff --git a/qai_hub_models/models/quicksrnetlarge_quantized/test.py b/qai_hub_models/models/quicksrnetlarge_quantized/test.py index 32337b60..16e59332 100644 --- a/qai_hub_models/models/quicksrnetlarge_quantized/test.py +++ b/qai_hub_models/models/quicksrnetlarge_quantized/test.py @@ -3,7 +3,6 @@ # SPDX-License-Identifier: BSD-3-Clause # --------------------------------------------------------------------- import os -import tempfile import zipfile import numpy as np @@ -18,7 +17,11 @@ MODEL_ID, QuickSRNetLargeQuantizable, ) -from qai_hub_models.utils.asset_loaders import CachedWebModelAsset, load_image +from qai_hub_models.utils.asset_loaders import ( + CachedWebModelAsset, + load_image, + qaihm_temp_dir, +) from qai_hub_models.utils.testing import assert_most_close, skip_clone_repo_check OUTPUT_IMAGE_ADDRESS = CachedWebModelAsset.from_asset_store( @@ -69,7 +72,7 @@ def test_trace(): def test_aimet_export(): model = QuickSRNetLargeQuantizable.from_pretrained() name = model.__class__.__name__ - with tempfile.TemporaryDirectory() as tmpdir: + with qaihm_temp_dir() as tmpdir: output_zip = model.convert_to_onnx_and_aimet_encodings( tmpdir, ) diff --git a/qai_hub_models/models/quicksrnetmedium/README.md b/qai_hub_models/models/quicksrnetmedium/README.md index 191dd8dc..cb5b80f1 100644 --- a/qai_hub_models/models/quicksrnetmedium/README.md +++ b/qai_hub_models/models/quicksrnetmedium/README.md @@ -14,6 +14,8 @@ accross various devices, can be found [here](https://aihub.qualcomm.com/models/q a hosted Qualcomm® device. + + ## Example & Usage diff --git a/qai_hub_models/models/quicksrnetmedium/export.py b/qai_hub_models/models/quicksrnetmedium/export.py index 6c99ed79..32246017 100644 --- a/qai_hub_models/models/quicksrnetmedium/export.py +++ b/qai_hub_models/models/quicksrnetmedium/export.py @@ -120,12 +120,16 @@ def export_model( model.eval() source_model = torch.jit.trace(model.to("cpu"), make_torch_inputs(input_spec)) + # Convert outputs from channel last to channel first (preferred I/O format for QNN and TensorFlow Lite) + channel_last_flags = ( + " --force_channel_last_input image" + " --force_channel_last_output output_0" + if target_runtime != TargetRuntime.ORT + else "" + ) + # 2. 
Compile the model to an on-device asset model_compile_options = model.get_hub_compile_options( - target_runtime, - compile_options - + " --force_channel_last_input image" - + " --force_channel_last_output output_0", + target_runtime, compile_options + channel_last_flags, hub_device ) print(f"Optimizing model {model_name} to run on-device") submitted_compile_job = hub.submit_compile_job( @@ -163,8 +167,10 @@ def export_model( ) sample_inputs = model.sample_inputs(input_spec) # Convert inputs from channel first to channel last - hub_inputs = transpose_channel_first_to_last( - "image", sample_inputs, target_runtime + hub_inputs = ( + sample_inputs + if target_runtime == TargetRuntime.ORT + else transpose_channel_first_to_last("image", sample_inputs, target_runtime) ) submitted_inference_job = hub.submit_inference_job( model=compile_job.get_target_model(), @@ -192,8 +198,12 @@ def export_model( assert inference_job is not None and inference_job.wait().success inference_result: hub.client.DatasetEntries = inference_job.download_output_data() # type: ignore # Convert outputs from channel last to channel first - inference_result = transpose_channel_last_to_first( - "output_0", inference_result, target_runtime + inference_result = ( + inference_result + if target_runtime == TargetRuntime.ORT + else transpose_channel_last_to_first( + "output_0", inference_result, target_runtime + ) ) print_inference_metrics(inference_job, inference_result, torch_out) @@ -205,7 +215,7 @@ def export_model( def main(): warnings.filterwarnings("ignore") - parser = export_parser(model_cls=Model, supports_ort=False) + parser = export_parser(model_cls=Model) args = parser.parse_args() export_model(**vars(args)) diff --git a/qai_hub_models/models/quicksrnetmedium/model.py b/qai_hub_models/models/quicksrnetmedium/model.py index e050d160..abb5817a 100644 --- a/qai_hub_models/models/quicksrnetmedium/model.py +++ b/qai_hub_models/models/quicksrnetmedium/model.py @@ -35,6 +35,7 @@ def __init__( quicksrnet_model: torch.nn.Module, ) -> None: super().__init__() + self.relu = torch.nn.ReLU() self.model = quicksrnet_model @classmethod @@ -57,7 +58,7 @@ def from_pretrained(cls) -> QuickSRNetMedium: def get_evaluator(self) -> BaseEvaluator: return SuperResolutionOutputEvaluator() - def forward(self, image: torch.Tensor) -> torch.Tensor: + def forward(self, image): """ Run QuickSRNet-Medium on `image`, and produce an upscaled image @@ -71,7 +72,7 @@ def forward(self, image: torch.Tensor) -> torch.Tensor: Range: float[0, 1] 3-channel Color Space: RGB """ - + # image = self.relu(image) return self.model(image) @staticmethod diff --git a/qai_hub_models/models/quicksrnetmedium/perf.yaml b/qai_hub_models/models/quicksrnetmedium/perf.yaml index d7547648..bf8575e7 100644 --- a/qai_hub_models/models/quicksrnetmedium/perf.yaml +++ b/qai_hub_models/models/quicksrnetmedium/perf.yaml @@ -22,6 +22,7 @@ aggregated: - Samsung Galaxy S24 Ultra - Samsung Galaxy S24+ - Samsung Galaxy Tab S8 + - Snapdragon X Elite CRD - Xiaomi 12 - Xiaomi 12 Pro supported_chipsets: @@ -30,15 +31,16 @@ aggregated: - Snapdragon® 8 Gen 2 - Snapdragon® 8 Gen 3 - Snapdragon® 888 + - Snapdragon® X Elite models: - name: QuickSRNetMedium performance_metrics: - torchscript_onnx_tflite: - inference_time: 1385.0 - throughput: 722.0216606498195 + inference_time: 1388.0 + throughput: 720.4610951008646 estimated_peak_memory_range: - min: 16384 - max: 1507064 + min: 32768 + max: 1844064 primary_compute_unit: NPU precision: fp16 layer_info: @@ -46,14 +48,14 @@ models: layers_on_gpu: 0 
layers_on_cpu: 3 total_layers: 17 - job_id: jlpeeyk0p + job_id: jwgov6jx5 job_status: Passed torchscript_onnx_qnn: - inference_time: 998.0 - throughput: 1002.0040080160321 + inference_time: 1011.0 + throughput: 989.1196834817013 estimated_peak_memory_range: - min: 221184 - max: 7358048 + min: 28672 + max: 8507224 primary_compute_unit: NPU precision: fp16 layer_info: @@ -61,22 +63,22 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 17 - job_id: jz5w24qj5 + job_id: jlpevmj15 job_status: Passed torchscript_onnx_ort: - inference_time: 1500.0 - throughput: 666.6666666666666 + inference_time: 1498.0 + throughput: 667.5567423230974 estimated_peak_memory_range: - min: 212992 - max: 8597144 + min: 12288 + max: 8500872 primary_compute_unit: NPU precision: fp16 layer_info: - layers_on_npu: 1 + layers_on_npu: 19 layers_on_gpu: 0 layers_on_cpu: 0 - total_layers: 1 - job_id: jnp1y6elp + total_layers: 19 + job_id: jnp18z02g job_status: Passed reference_device_info: name: Samsung Galaxy S23 @@ -85,13 +87,13 @@ models: os_name: Android manufacturer: Samsung chipset: Snapdragon® 8 Gen 2 - timestamp: '2024-04-23T18:42:32.860888Z' + timestamp: '2024-05-20T16:35:30.235442Z' - torchscript_onnx_tflite: - inference_time: 871.0 - throughput: 1148.105625717566 + inference_time: 923.0 + throughput: 1083.4236186348862 estimated_peak_memory_range: min: 16384 - max: 19182544 + max: 19845568 primary_compute_unit: NPU precision: fp16 layer_info: @@ -99,14 +101,14 @@ models: layers_on_gpu: 0 layers_on_cpu: 3 total_layers: 17 - job_id: jygzonr65 + job_id: j1pvwkjjg job_status: Passed torchscript_onnx_qnn: - inference_time: 641.0 - throughput: 1560.0624024960998 + inference_time: 648.0 + throughput: 1543.20987654321 estimated_peak_memory_range: min: 208896 - max: 14603312 + max: 15787072 primary_compute_unit: NPU precision: fp16 layer_info: @@ -114,22 +116,22 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 17 - job_id: jmg9jdwv5 + job_id: jygz7d1kp job_status: Passed torchscript_onnx_ort: - inference_time: 1118.0 - throughput: 894.4543828264758 + inference_time: 1030.0 + throughput: 970.8737864077669 estimated_peak_memory_range: - min: 217088 - max: 15048656 + min: 0 + max: 14123616 primary_compute_unit: NPU precision: fp16 layer_info: - layers_on_npu: 1 + layers_on_npu: 19 layers_on_gpu: 0 layers_on_cpu: 0 - total_layers: 1 - job_id: jvgde2ol5 + total_layers: 19 + job_id: jvgdv1weg job_status: Passed reference_device_info: name: Samsung Galaxy S24 @@ -138,13 +140,13 @@ models: os_name: Android manufacturer: Samsung chipset: Snapdragon® 8 Gen 3 - timestamp: '2024-04-23T18:42:32.860931Z' + timestamp: '2024-05-20T16:35:30.235469Z' - torchscript_onnx_tflite: - inference_time: 1365.0 - throughput: 732.6007326007326 + inference_time: 1370.0 + throughput: 729.92700729927 estimated_peak_memory_range: min: 24576 - max: 16231088 + max: 1369376 primary_compute_unit: NPU precision: fp16 layer_info: @@ -152,14 +154,14 @@ models: layers_on_gpu: 0 layers_on_cpu: 3 total_layers: 17 - job_id: jz5708olg + job_id: j7gjlnjxp job_status: Passed torchscript_onnx_qnn: - inference_time: 1005.0 - throughput: 995.0248756218906 + inference_time: 1008.0 + throughput: 992.063492063492 estimated_peak_memory_range: min: 221184 - max: 6072368 + max: 12353904 primary_compute_unit: NPU precision: fp16 layer_info: @@ -167,7 +169,7 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 17 - job_id: jegnlw1r5 + job_id: jmg94nvl5 job_status: Passed reference_device_info: name: QCS8550 (Proxy) @@ -176,4 +178,57 @@ models: os_name: 
Android manufacturer: Qualcomm chipset: Qcs8550 - timestamp: '2024-04-23T18:42:32.860962Z' + timestamp: '2024-05-20T16:35:30.235492Z' + - torchscript_onnx_qnn: + inference_time: 1129.0 + throughput: 885.7395925597874 + estimated_peak_memory_range: + min: 217088 + max: 217088 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 17 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 17 + job_id: jz5w96o6p + job_status: Passed + torchscript_onnx_ort: + inference_time: 1559.0 + throughput: 641.4368184733804 + estimated_peak_memory_range: + min: 8896512 + max: 8896512 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 19 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 19 + job_id: jz57drzl5 + job_status: Passed + torchscript_onnx_ort_dml_gpu: + inference_time: 3368.0 + throughput: 296.91211401425176 + estimated_peak_memory_range: + min: 33103872 + max: 33103872 + primary_compute_unit: GPU + precision: fp16 + layer_info: + layers_on_npu: 0 + layers_on_gpu: 1 + layers_on_cpu: 0 + total_layers: 1 + job_id: jqp4wrqvg + job_status: Passed + reference_device_info: + name: Snapdragon X Elite CRD + os: '11' + form_factor: Compute + os_name: Windows + manufacturer: Qualcomm + chipset: Snapdragon® X Elite + timestamp: '2024-05-20T16:35:30.235514Z' diff --git a/qai_hub_models/models/quicksrnetmedium_quantized/README.md b/qai_hub_models/models/quicksrnetmedium_quantized/README.md index 732326e4..4b09275b 100644 --- a/qai_hub_models/models/quicksrnetmedium_quantized/README.md +++ b/qai_hub_models/models/quicksrnetmedium_quantized/README.md @@ -14,6 +14,8 @@ accross various devices, can be found [here](https://aihub.qualcomm.com/models/q a hosted Qualcomm® device. + + ## Example & Usage diff --git a/qai_hub_models/models/quicksrnetmedium_quantized/export.py b/qai_hub_models/models/quicksrnetmedium_quantized/export.py index fad49cad..9c4ced9f 100644 --- a/qai_hub_models/models/quicksrnetmedium_quantized/export.py +++ b/qai_hub_models/models/quicksrnetmedium_quantized/export.py @@ -124,12 +124,16 @@ def export_model( else: quant_calibration_data = model.get_calibration_data(target_runtime, input_spec) + # Convert outputs from channel last to channel first (preferred I/O format for QNN and TensorFlow Lite) + channel_last_flags = ( + " --force_channel_last_input image" + " --force_channel_last_output output_0" + if target_runtime != TargetRuntime.ORT + else "" + ) + # 2. 
Compile the model to an on-device asset model_compile_options = model.get_hub_compile_options( - target_runtime, - compile_options - + " --force_channel_last_input image" - + " --force_channel_last_output output_0", + target_runtime, compile_options + channel_last_flags, hub_device ) print(f"Optimizing model {model_name} to run on-device") submitted_compile_job = hub.submit_compile_job( @@ -171,8 +175,10 @@ def export_model( if target_runtime == TargetRuntime.QNN: hub_inputs = get_qnn_inputs(compile_job, sample_inputs) # Convert inputs from channel first to channel last - hub_inputs = transpose_channel_first_to_last( - "image", sample_inputs, target_runtime + hub_inputs = ( + sample_inputs + if target_runtime == TargetRuntime.ORT + else transpose_channel_first_to_last("image", sample_inputs, target_runtime) ) submitted_inference_job = hub.submit_inference_job( model=compile_job.get_target_model(), @@ -200,8 +206,12 @@ def export_model( assert inference_job is not None and inference_job.wait().success inference_result: hub.client.DatasetEntries = inference_job.download_output_data() # type: ignore # Convert outputs from channel last to channel first - inference_result = transpose_channel_last_to_first( - "output_0", inference_result, target_runtime + inference_result = ( + inference_result + if target_runtime == TargetRuntime.ORT + else transpose_channel_last_to_first( + "output_0", inference_result, target_runtime + ) ) print_inference_metrics(inference_job, inference_result, torch_out) @@ -213,7 +223,7 @@ def export_model( def main(): warnings.filterwarnings("ignore") - parser = export_parser(model_cls=Model, supports_qnn=False, supports_ort=False) + parser = export_parser(model_cls=Model) args = parser.parse_args() export_model(**vars(args)) diff --git a/qai_hub_models/models/quicksrnetmedium_quantized/model.py b/qai_hub_models/models/quicksrnetmedium_quantized/model.py index 939d8e67..1c17a3dc 100644 --- a/qai_hub_models/models/quicksrnetmedium_quantized/model.py +++ b/qai_hub_models/models/quicksrnetmedium_quantized/model.py @@ -8,30 +8,24 @@ # This verifies aimet is installed, and this must be included first. 
from qai_hub_models.utils.quantization_aimet import ( AIMETQuantizableMixin, + constrain_quantized_inputs_to_image_range, ) # isort: on import torch from aimet_torch.cross_layer_equalization import equalize_model +from aimet_torch.model_preparer import prepare_model from aimet_torch.quantsim import QuantizationSimModel, load_encodings_to_sim -from qai_hub_models.models.common import SourceModelFormat, TargetRuntime from qai_hub_models.models.quicksrnetmedium.model import QuickSRNetMedium -from qai_hub_models.utils.aimet.config_loader import get_default_aimet_config_legacy_v2 +from qai_hub_models.utils.aimet.config_loader import get_default_aimet_config from qai_hub_models.utils.asset_loaders import CachedWebModelAsset MODEL_ID = __name__.split(".")[-2] -MODEL_ASSET_VERSION = 2 +MODEL_ASSET_VERSION = 4 -# Weights and config stored in S3 are sourced from -# https://github.com/quic/aimet-model-zoo/blob/develop/aimet_zoo_torch/quicksrnet/model/model_cards/quicksrnet_medium_4x_w8a8.json: -# https://github.com/quic/aimet-model-zoo/releases/download/phase_2_january_artifacts/quicksrnet_medium_4x_checkpoint_int8.pth -# and -# https://raw.githubusercontent.com/quic/aimet/release-aimet-1.23/TrainingExtensions/common/src/python/aimet_common/quantsim_config/default_config_per_channel.js -# Encodings were generated with AIMET QuantSim library -QUANTIZED_WEIGHTS = "quicksrnet_medium_4x_checkpoint_int8.pth" -AIMET_ENCODINGS = "aimet_quantization_encodings.json" +DEFAULT_ENCODINGS = "quicksrnetmedium_quantized_encodings.json" SCALING_FACTOR = 4 @@ -45,9 +39,7 @@ def __init__( quicksrnet_model: QuantizationSimModel, ) -> None: QuickSRNetMedium.__init__(self, quicksrnet_model.model) - AIMETQuantizableMixin.__init__( - self, quicksrnet_model, needs_onnx_direct_aimet_export=True - ) + AIMETQuantizableMixin.__init__(self, quicksrnet_model) @classmethod def from_pretrained( @@ -62,46 +54,27 @@ def from_pretrained( else: Interprets as a filepath and loads the encodings stored there. """ # Load Model - quicksrnet = QuickSRNetMedium.from_pretrained() - input_shape = quicksrnet.get_input_spec()["image"][0] - equalize_model(quicksrnet, input_shape) - - # Download weights and quantization parameters - weights = CachedWebModelAsset.from_asset_store( - MODEL_ID, MODEL_ASSET_VERSION, QUANTIZED_WEIGHTS - ).fetch() - aimet_config = get_default_aimet_config_legacy_v2() - - # Load the model weights and quantization parameters - # In this particular instance, the state_dict keys from the model are all named "model." - # where is the name of each key in the weights file - without the word model. - # We rename all the keys to add the word model - state_dict = torch.load(weights, map_location=torch.device("cpu"))["state_dict"] - new_state_dict = {"model." 
+ key: value for key, value in state_dict.items()} - quicksrnet.load_state_dict(new_state_dict) + fp16_model = QuickSRNetMedium.from_pretrained() + input_shape = cls.get_input_spec()["image"][0] + model = prepare_model(fp16_model) + equalize_model(model, input_shape) sim = QuantizationSimModel( - quicksrnet, + model, quant_scheme="tf_enhanced", default_param_bw=8, default_output_bw=8, - config_file=aimet_config, + config_file=get_default_aimet_config(), dummy_input=torch.rand(input_shape), ) + constrain_quantized_inputs_to_image_range(sim) + if aimet_encodings: if aimet_encodings == "DEFAULT": aimet_encodings = CachedWebModelAsset.from_asset_store( - MODEL_ID, MODEL_ASSET_VERSION, AIMET_ENCODINGS + MODEL_ID, MODEL_ASSET_VERSION, DEFAULT_ENCODINGS ).fetch() load_encodings_to_sim(sim, aimet_encodings) sim.model.eval() return cls(sim) - - def preferred_hub_source_model_format( - self, target_runtime: TargetRuntime - ) -> SourceModelFormat: - if target_runtime == TargetRuntime.QNN: - return SourceModelFormat.ONNX - else: - return SourceModelFormat.TORCHSCRIPT diff --git a/qai_hub_models/models/quicksrnetmedium_quantized/perf.yaml b/qai_hub_models/models/quicksrnetmedium_quantized/perf.yaml index 4c7d84e5..d69ab3d0 100644 --- a/qai_hub_models/models/quicksrnetmedium_quantized/perf.yaml +++ b/qai_hub_models/models/quicksrnetmedium_quantized/perf.yaml @@ -26,6 +26,7 @@ aggregated: - Samsung Galaxy S24 Ultra - Samsung Galaxy S24+ - Samsung Galaxy Tab S8 + - Snapdragon X Elite CRD - Xiaomi 12 - Xiaomi 12 Pro supported_chipsets: @@ -36,15 +37,16 @@ aggregated: - Snapdragon® 8 Gen 2 - Snapdragon® 8 Gen 3 - Snapdragon® 888 + - Snapdragon® X Elite models: - name: QuickSRNetMedium-Quantized performance_metrics: - torchscript_onnx_tflite: - inference_time: 1046.0 - throughput: 956.0229445506692 + inference_time: 992.0 + throughput: 1008.0645161290323 estimated_peak_memory_range: - min: 1339392 - max: 2781424 + min: 12288 + max: 1410992 primary_compute_unit: NPU precision: int8 layer_info: @@ -52,7 +54,37 @@ models: layers_on_gpu: 0 layers_on_cpu: 3 total_layers: 17 - job_id: jqp4k3wlg + job_id: j0px1ov1g + job_status: Passed + torchscript_onnx_qnn: + inference_time: 801.0 + throughput: 1248.4394506866417 + estimated_peak_memory_range: + min: 65536 + max: 68916056 + primary_compute_unit: NPU + precision: int8 + layer_info: + layers_on_npu: 11 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 11 + job_id: jopry3k9g + job_status: Passed + torchscript_onnx_ort: + inference_time: 1215.0 + throughput: 823.0452674897119 + estimated_peak_memory_range: + min: 12288 + max: 9491496 + primary_compute_unit: NPU + precision: int8 + layer_info: + layers_on_npu: 16 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 16 + job_id: j1p87yox5 job_status: Passed reference_device_info: name: Samsung Galaxy S23 @@ -61,13 +93,13 @@ models: os_name: Android manufacturer: Samsung chipset: Snapdragon® 8 Gen 2 - timestamp: '2024-04-23T18:42:32.884809Z' + timestamp: '2024-05-20T16:35:30.266116Z' - torchscript_onnx_tflite: - inference_time: 871.0 - throughput: 1148.105625717566 + inference_time: 865.0 + throughput: 1156.0693641618498 estimated_peak_memory_range: min: 16384 - max: 19479952 + max: 19816736 primary_compute_unit: NPU precision: int8 layer_info: @@ -75,7 +107,37 @@ models: layers_on_gpu: 0 layers_on_cpu: 3 total_layers: 17 - job_id: j0pxnx195 + job_id: jo5mzxrwp + job_status: Passed + torchscript_onnx_qnn: + inference_time: 550.0 + throughput: 1818.1818181818182 + estimated_peak_memory_range: + min: 65536 + max: 
15505168 + primary_compute_unit: NPU + precision: int8 + layer_info: + layers_on_npu: 11 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 11 + job_id: jep2my845 + job_status: Passed + torchscript_onnx_ort: + inference_time: 882.0 + throughput: 1133.7868480725624 + estimated_peak_memory_range: + min: 0 + max: 14140464 + primary_compute_unit: NPU + precision: int8 + layer_info: + layers_on_npu: 16 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 16 + job_id: jogkyxz2p job_status: Passed reference_device_info: name: Samsung Galaxy S24 @@ -84,73 +146,156 @@ models: os_name: Android manufacturer: Samsung chipset: Snapdragon® 8 Gen 3 - timestamp: '2024-04-23T18:42:32.884829Z' + timestamp: '2024-05-20T16:35:30.266144Z' - torchscript_onnx_tflite: - inference_time: 3381.0 - throughput: 295.77048210588583 + inference_time: 1016.0 + throughput: 984.2519685039371 estimated_peak_memory_range: - min: 12288 - max: 15175488 + min: 69632 + max: 1384896 primary_compute_unit: NPU precision: int8 layer_info: - layers_on_npu: 16 + layers_on_npu: 14 layers_on_gpu: 0 layers_on_cpu: 3 - total_layers: 19 - job_id: j1p80kjxg + total_layers: 17 + job_id: jegnev2rg + job_status: Passed + torchscript_onnx_qnn: + inference_time: 803.0 + throughput: 1245.3300124533 + estimated_peak_memory_range: + min: 65536 + max: 70718264 + primary_compute_unit: NPU + precision: int8 + layer_info: + layers_on_npu: 11 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 11 + job_id: j2p0r0y6p job_status: Passed reference_device_info: - name: RB3 Gen 2 (Proxy) + name: QCS8550 (Proxy) os: '12' form_factor: Iot os_name: Android manufacturer: Qualcomm - chipset: Qcs6490 - timestamp: '2024-04-23T18:42:32.884857Z' + chipset: Qcs8550 + timestamp: '2024-05-20T16:35:30.266161Z' - torchscript_onnx_tflite: - inference_time: 15536.0 - throughput: 64.36663233779609 + inference_time: 1823.0 + throughput: 548.5463521667581 estimated_peak_memory_range: - min: 1720320 - max: 4755304 + min: 20480 + max: 13941344 primary_compute_unit: NPU precision: int8 layer_info: - layers_on_npu: 16 + layers_on_npu: 14 layers_on_gpu: 0 layers_on_cpu: 3 - total_layers: 19 - job_id: jwgondv4p + total_layers: 17 + job_id: jygzrykx5 + job_status: Passed + torchscript_onnx_qnn: + inference_time: 1392.0 + throughput: 718.3908045977012 + estimated_peak_memory_range: + min: 65536 + max: 15064032 + primary_compute_unit: NPU + precision: int8 + layer_info: + layers_on_npu: 11 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 11 + job_id: jmg9w2emp job_status: Passed reference_device_info: - name: RB5 (Proxy) + name: RB3 Gen 2 (Proxy) os: '12' form_factor: Iot os_name: Android manufacturer: Qualcomm - chipset: Qcs8250 - timestamp: '2024-04-23T18:42:32.884871Z' + chipset: Qcs6490 + timestamp: '2024-05-20T16:35:30.266177Z' - torchscript_onnx_tflite: - inference_time: 1396.0 - throughput: 716.3323782234957 + inference_time: 9357.0 + throughput: 106.87186063909373 estimated_peak_memory_range: - min: 32768 - max: 1677424 + min: 3276800 + max: 6753144 primary_compute_unit: NPU precision: int8 layer_info: - layers_on_npu: 16 + layers_on_npu: 14 layers_on_gpu: 0 layers_on_cpu: 3 - total_layers: 19 - job_id: j1gl6qw8g + total_layers: 17 + job_id: jz5wqznm5 job_status: Passed reference_device_info: - name: QCS8550 (Proxy) + name: RB5 (Proxy) os: '12' form_factor: Iot os_name: Android manufacturer: Qualcomm - chipset: Qcs8550 - timestamp: '2024-04-23T18:42:32.884887Z' + chipset: Qcs8250 + timestamp: '2024-05-20T16:35:30.266189Z' + - torchscript_onnx_qnn: + inference_time: 
794.0 + throughput: 1259.4458438287154 + estimated_peak_memory_range: + min: 53248 + max: 53248 + primary_compute_unit: NPU + precision: int8 + layer_info: + layers_on_npu: 11 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 11 + job_id: jqpyd3e7p + job_status: Passed + torchscript_onnx_ort: + inference_time: 1142.0 + throughput: 875.6567425569177 + estimated_peak_memory_range: + min: 8826880 + max: 8826880 + primary_compute_unit: NPU + precision: int8 + layer_info: + layers_on_npu: 16 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 16 + job_id: jn5q2q845 + job_status: Passed + torchscript_onnx_ort_dml_gpu: + inference_time: 3479.0 + throughput: 287.4389192296637 + estimated_peak_memory_range: + min: 15757312 + max: 15757312 + primary_compute_unit: CPU + precision: fp32 + layer_info: + layers_on_npu: 0 + layers_on_gpu: 0 + layers_on_cpu: 17 + total_layers: 17 + job_id: j1glkmn8p + job_status: Passed + reference_device_info: + name: Snapdragon X Elite CRD + os: '11' + form_factor: Compute + os_name: Windows + manufacturer: Qualcomm + chipset: Snapdragon® X Elite + timestamp: '2024-05-20T16:35:30.266211Z' diff --git a/qai_hub_models/models/quicksrnetmedium_quantized/test.py b/qai_hub_models/models/quicksrnetmedium_quantized/test.py index 4da76b9d..c8c6ea58 100644 --- a/qai_hub_models/models/quicksrnetmedium_quantized/test.py +++ b/qai_hub_models/models/quicksrnetmedium_quantized/test.py @@ -3,7 +3,6 @@ # SPDX-License-Identifier: BSD-3-Clause # --------------------------------------------------------------------- import os -import tempfile import zipfile import numpy as np @@ -18,7 +17,11 @@ MODEL_ID, QuickSRNetMediumQuantizable, ) -from qai_hub_models.utils.asset_loaders import CachedWebModelAsset, load_image +from qai_hub_models.utils.asset_loaders import ( + CachedWebModelAsset, + load_image, + qaihm_temp_dir, +) from qai_hub_models.utils.testing import assert_most_close, skip_clone_repo_check OUTPUT_IMAGE_ADDRESS = CachedWebModelAsset.from_asset_store( @@ -69,7 +72,7 @@ def test_trace(): def test_aimet_export(): model = QuickSRNetMediumQuantizable.from_pretrained() name = model.__class__.__name__ - with tempfile.TemporaryDirectory() as tmpdir: + with qaihm_temp_dir() as tmpdir: output_zip = model.convert_to_onnx_and_aimet_encodings( tmpdir, ) diff --git a/qai_hub_models/models/quicksrnetsmall/README.md b/qai_hub_models/models/quicksrnetsmall/README.md index 3c3e06ac..665e005e 100644 --- a/qai_hub_models/models/quicksrnetsmall/README.md +++ b/qai_hub_models/models/quicksrnetsmall/README.md @@ -14,6 +14,8 @@ accross various devices, can be found [here](https://aihub.qualcomm.com/models/q a hosted Qualcomm® device. + + ## Example & Usage diff --git a/qai_hub_models/models/quicksrnetsmall/export.py b/qai_hub_models/models/quicksrnetsmall/export.py index bc672fe9..0449ff6c 100644 --- a/qai_hub_models/models/quicksrnetsmall/export.py +++ b/qai_hub_models/models/quicksrnetsmall/export.py @@ -120,12 +120,16 @@ def export_model( model.eval() source_model = torch.jit.trace(model.to("cpu"), make_torch_inputs(input_spec)) + # Convert outputs from channel last to channel first (preferred I/O format for QNN and TensorFlow Lite) + channel_last_flags = ( + " --force_channel_last_input image" + " --force_channel_last_output output_0" + if target_runtime != TargetRuntime.ORT + else "" + ) + # 2. 
Compile the model to an on-device asset model_compile_options = model.get_hub_compile_options( - target_runtime, - compile_options - + " --force_channel_last_input image" - + " --force_channel_last_output output_0", + target_runtime, compile_options + channel_last_flags, hub_device ) print(f"Optimizing model {model_name} to run on-device") submitted_compile_job = hub.submit_compile_job( @@ -163,8 +167,10 @@ def export_model( ) sample_inputs = model.sample_inputs(input_spec) # Convert inputs from channel first to channel last - hub_inputs = transpose_channel_first_to_last( - "image", sample_inputs, target_runtime + hub_inputs = ( + sample_inputs + if target_runtime == TargetRuntime.ORT + else transpose_channel_first_to_last("image", sample_inputs, target_runtime) ) submitted_inference_job = hub.submit_inference_job( model=compile_job.get_target_model(), @@ -192,8 +198,12 @@ def export_model( assert inference_job is not None and inference_job.wait().success inference_result: hub.client.DatasetEntries = inference_job.download_output_data() # type: ignore # Convert outputs from channel last to channel first - inference_result = transpose_channel_last_to_first( - "output_0", inference_result, target_runtime + inference_result = ( + inference_result + if target_runtime == TargetRuntime.ORT + else transpose_channel_last_to_first( + "output_0", inference_result, target_runtime + ) ) print_inference_metrics(inference_job, inference_result, torch_out) @@ -205,7 +215,7 @@ def export_model( def main(): warnings.filterwarnings("ignore") - parser = export_parser(model_cls=Model, supports_ort=False) + parser = export_parser(model_cls=Model) args = parser.parse_args() export_model(**vars(args)) diff --git a/qai_hub_models/models/quicksrnetsmall/model.py b/qai_hub_models/models/quicksrnetsmall/model.py index 54b22d82..f1eb380f 100644 --- a/qai_hub_models/models/quicksrnetsmall/model.py +++ b/qai_hub_models/models/quicksrnetsmall/model.py @@ -57,7 +57,7 @@ def from_pretrained(cls) -> QuickSRNetSmall: def get_evaluator(self) -> BaseEvaluator: return SuperResolutionOutputEvaluator() - def forward(self, image: torch.Tensor) -> torch.Tensor: + def forward(self, image): """ Run QuickSRNet-Small on `image`, and produce an upscaled image diff --git a/qai_hub_models/models/quicksrnetsmall/perf.yaml b/qai_hub_models/models/quicksrnetsmall/perf.yaml index 41a8c83d..311e6769 100644 --- a/qai_hub_models/models/quicksrnetsmall/perf.yaml +++ b/qai_hub_models/models/quicksrnetsmall/perf.yaml @@ -22,6 +22,7 @@ aggregated: - Samsung Galaxy S24 Ultra - Samsung Galaxy S24+ - Samsung Galaxy Tab S8 + - Snapdragon X Elite CRD - Xiaomi 12 - Xiaomi 12 Pro supported_chipsets: @@ -30,15 +31,16 @@ aggregated: - Snapdragon® 8 Gen 2 - Snapdragon® 8 Gen 3 - Snapdragon® 888 + - Snapdragon® X Elite models: - name: QuickSRNetSmall performance_metrics: - torchscript_onnx_tflite: - inference_time: 1316.0 - throughput: 759.8784194528876 + inference_time: 1315.0 + throughput: 760.4562737642585 estimated_peak_memory_range: - min: 24576 - max: 8392968 + min: 16384 + max: 8193912 primary_compute_unit: NPU precision: fp16 layer_info: @@ -46,14 +48,14 @@ models: layers_on_gpu: 0 layers_on_cpu: 3 total_layers: 11 - job_id: jo5mq8zqp + job_id: jw561460p job_status: Passed torchscript_onnx_qnn: - inference_time: 1010.0 - throughput: 990.0990099009902 + inference_time: 999.0 + throughput: 1001.001001001001 estimated_peak_memory_range: - min: 217088 - max: 51877032 + min: 229376 + max: 63786312 primary_compute_unit: NPU precision: fp16 layer_info: @@ 
-61,22 +63,22 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 11 - job_id: jopr8wye5 + job_id: j1pvwk3jg job_status: Passed torchscript_onnx_ort: - inference_time: 1411.0 - throughput: 708.7172218284904 + inference_time: 1418.0 + throughput: 705.2186177715091 estimated_peak_memory_range: - min: 217088 - max: 8686544 + min: 90112 + max: 2421520 primary_compute_unit: NPU precision: fp16 layer_info: - layers_on_npu: 1 + layers_on_npu: 13 layers_on_gpu: 0 layers_on_cpu: 0 - total_layers: 1 - job_id: jqpyrmd45 + total_layers: 13 + job_id: jz5w96v6p job_status: Passed reference_device_info: name: Samsung Galaxy S23 @@ -85,13 +87,13 @@ models: os_name: Android manufacturer: Samsung chipset: Snapdragon® 8 Gen 2 - timestamp: '2024-04-23T18:42:32.905242Z' + timestamp: '2024-05-20T16:35:30.306313Z' - torchscript_onnx_tflite: - inference_time: 914.0 - throughput: 1094.0919037199126 + inference_time: 884.0 + throughput: 1131.2217194570135 estimated_peak_memory_range: - min: 16384 - max: 18347856 + min: 20480 + max: 18573536 primary_compute_unit: NPU precision: fp16 layer_info: @@ -99,14 +101,14 @@ models: layers_on_gpu: 0 layers_on_cpu: 3 total_layers: 11 - job_id: jegnlkem5 + job_id: j1p3m0klg job_status: Passed torchscript_onnx_qnn: - inference_time: 617.0 - throughput: 1620.7455429497568 + inference_time: 621.0 + throughput: 1610.3059581320451 estimated_peak_memory_range: - min: 208896 - max: 14414800 + min: 0 + max: 14770544 primary_compute_unit: NPU precision: fp16 layer_info: @@ -114,22 +116,22 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 11 - job_id: jep20emmg + job_id: j7gjlnxxp job_status: Passed torchscript_onnx_ort: - inference_time: 1011.0 - throughput: 989.1196834817013 + inference_time: 931.0 + throughput: 1074.1138560687432 estimated_peak_memory_range: - min: 0 - max: 12267184 + min: 12288 + max: 12222752 primary_compute_unit: NPU precision: fp16 layer_info: - layers_on_npu: 1 + layers_on_npu: 13 layers_on_gpu: 0 layers_on_cpu: 0 - total_layers: 1 - job_id: j2p036rep + total_layers: 13 + job_id: jmg94n1l5 job_status: Passed reference_device_info: name: Samsung Galaxy S24 @@ -138,13 +140,13 @@ models: os_name: Android manufacturer: Samsung chipset: Snapdragon® 8 Gen 3 - timestamp: '2024-04-23T18:42:32.905287Z' + timestamp: '2024-05-20T16:35:30.306339Z' - torchscript_onnx_tflite: - inference_time: 1327.0 - throughput: 753.5795026375282 + inference_time: 1314.0 + throughput: 761.03500761035 estimated_peak_memory_range: - min: 28672 - max: 8134240 + min: 20480 + max: 7936728 primary_compute_unit: NPU precision: fp16 layer_info: @@ -152,14 +154,14 @@ models: layers_on_gpu: 0 layers_on_cpu: 3 total_layers: 11 - job_id: j1p3vrolg + job_id: jwgov6yx5 job_status: Passed torchscript_onnx_qnn: - inference_time: 1021.0 - throughput: 979.4319294809011 + inference_time: 996.0 + throughput: 1004.0160642570281 estimated_peak_memory_range: - min: 249856 - max: 7951808 + min: 229376 + max: 3511288 primary_compute_unit: NPU precision: fp16 layer_info: @@ -167,7 +169,7 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 11 - job_id: jlpeen61p + job_id: jygz7dekp job_status: Passed reference_device_info: name: QCS8550 (Proxy) @@ -176,4 +178,57 @@ models: os_name: Android manufacturer: Qualcomm chipset: Qcs8550 - timestamp: '2024-04-23T18:42:32.905312Z' + timestamp: '2024-05-20T16:35:30.306357Z' + - torchscript_onnx_qnn: + inference_time: 1089.0 + throughput: 918.2736455463728 + estimated_peak_memory_range: + min: 241664 + max: 241664 + primary_compute_unit: NPU + precision: 
fp16 + layer_info: + layers_on_npu: 11 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 11 + job_id: jlpevm915 + job_status: Passed + torchscript_onnx_ort: + inference_time: 1478.0 + throughput: 676.5899864682003 + estimated_peak_memory_range: + min: 8847360 + max: 8847360 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 13 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 13 + job_id: jnp18zl2g + job_status: Passed + torchscript_onnx_ort_dml_gpu: + inference_time: 2483.0 + throughput: 402.7386226339106 + estimated_peak_memory_range: + min: 33112064 + max: 33112064 + primary_compute_unit: GPU + precision: fp16 + layer_info: + layers_on_npu: 0 + layers_on_gpu: 1 + layers_on_cpu: 0 + total_layers: 1 + job_id: jvgdv19eg + job_status: Passed + reference_device_info: + name: Snapdragon X Elite CRD + os: '11' + form_factor: Compute + os_name: Windows + manufacturer: Qualcomm + chipset: Snapdragon® X Elite + timestamp: '2024-05-20T16:35:30.306381Z' diff --git a/qai_hub_models/models/quicksrnetsmall_quantized/README.md b/qai_hub_models/models/quicksrnetsmall_quantized/README.md index 20fa4de8..af0f5e82 100644 --- a/qai_hub_models/models/quicksrnetsmall_quantized/README.md +++ b/qai_hub_models/models/quicksrnetsmall_quantized/README.md @@ -14,6 +14,8 @@ accross various devices, can be found [here](https://aihub.qualcomm.com/models/q a hosted Qualcomm® device. + + ## Example & Usage diff --git a/qai_hub_models/models/quicksrnetsmall_quantized/export.py b/qai_hub_models/models/quicksrnetsmall_quantized/export.py index f573669f..5a2eb8d2 100644 --- a/qai_hub_models/models/quicksrnetsmall_quantized/export.py +++ b/qai_hub_models/models/quicksrnetsmall_quantized/export.py @@ -124,12 +124,16 @@ def export_model( else: quant_calibration_data = model.get_calibration_data(target_runtime, input_spec) + # Convert outputs from channel last to channel first (preferred I/O format for QNN and TensorFlow Lite) + channel_last_flags = ( + " --force_channel_last_input image" + " --force_channel_last_output output_0" + if target_runtime != TargetRuntime.ORT + else "" + ) + # 2. 
Compile the model to an on-device asset model_compile_options = model.get_hub_compile_options( - target_runtime, - compile_options - + " --force_channel_last_input image" - + " --force_channel_last_output output_0", + target_runtime, compile_options + channel_last_flags, hub_device ) print(f"Optimizing model {model_name} to run on-device") submitted_compile_job = hub.submit_compile_job( @@ -171,8 +175,10 @@ def export_model( if target_runtime == TargetRuntime.QNN: hub_inputs = get_qnn_inputs(compile_job, sample_inputs) # Convert inputs from channel first to channel last - hub_inputs = transpose_channel_first_to_last( - "image", sample_inputs, target_runtime + hub_inputs = ( + sample_inputs + if target_runtime == TargetRuntime.ORT + else transpose_channel_first_to_last("image", sample_inputs, target_runtime) ) submitted_inference_job = hub.submit_inference_job( model=compile_job.get_target_model(), @@ -200,8 +206,12 @@ def export_model( assert inference_job is not None and inference_job.wait().success inference_result: hub.client.DatasetEntries = inference_job.download_output_data() # type: ignore # Convert outputs from channel last to channel first - inference_result = transpose_channel_last_to_first( - "output_0", inference_result, target_runtime + inference_result = ( + inference_result + if target_runtime == TargetRuntime.ORT + else transpose_channel_last_to_first( + "output_0", inference_result, target_runtime + ) ) print_inference_metrics(inference_job, inference_result, torch_out) @@ -213,7 +223,7 @@ def export_model( def main(): warnings.filterwarnings("ignore") - parser = export_parser(model_cls=Model, supports_qnn=False, supports_ort=False) + parser = export_parser(model_cls=Model) args = parser.parse_args() export_model(**vars(args)) diff --git a/qai_hub_models/models/quicksrnetsmall_quantized/model.py b/qai_hub_models/models/quicksrnetsmall_quantized/model.py index 9102f5f9..57c495a8 100644 --- a/qai_hub_models/models/quicksrnetsmall_quantized/model.py +++ b/qai_hub_models/models/quicksrnetsmall_quantized/model.py @@ -8,30 +8,24 @@ # This verifies aimet is installed, and this must be included first. 
from qai_hub_models.utils.quantization_aimet import ( AIMETQuantizableMixin, + constrain_quantized_inputs_to_image_range, ) # isort: on import torch from aimet_torch.cross_layer_equalization import equalize_model +from aimet_torch.model_preparer import prepare_model from aimet_torch.quantsim import QuantizationSimModel, load_encodings_to_sim -from qai_hub_models.models.common import SourceModelFormat, TargetRuntime from qai_hub_models.models.quicksrnetsmall.model import QuickSRNetSmall -from qai_hub_models.utils.aimet.config_loader import get_default_aimet_config_legacy_v2 +from qai_hub_models.utils.aimet.config_loader import get_default_aimet_config from qai_hub_models.utils.asset_loaders import CachedWebModelAsset MODEL_ID = __name__.split(".")[-2] -MODEL_ASSET_VERSION = 2 +MODEL_ASSET_VERSION = 4 -# Weights and config stored in S3 are sourced from -# https://github.com/quic/aimet-model-zoo/blob/develop/aimet_zoo_torch/quicksrnet/model/model_cards/quicksrnet_small_4x_w8a8.json: -# https://github.com/quic/aimet-model-zoo/releases/download/phase_2_january_artifacts/quicksrnet_small_4x_checkpoint_int8.pth -# and -# https://raw.githubusercontent.com/quic/aimet/release-aimet-1.23/TrainingExtensions/common/src/python/aimet_common/quantsim_config/default_config_per_channel.js -# Encodings were generated with AIMET QuantSim library -QUANTIZED_WEIGHTS = "quicksrnet_small_4x_checkpoint_int8.pth" -AIMET_ENCODINGS = "aimet_quantization_encodings.json" +DEFAULT_ENCODINGS = "quicksrnetsmall_quantized_encodings.json" SCALING_FACTOR = 4 @@ -45,9 +39,7 @@ def __init__( quicksrnet_model: QuantizationSimModel, ) -> None: QuickSRNetSmall.__init__(self, quicksrnet_model.model) - AIMETQuantizableMixin.__init__( - self, quicksrnet_model, needs_onnx_direct_aimet_export=True - ) + AIMETQuantizableMixin.__init__(self, quicksrnet_model) @classmethod def from_pretrained( @@ -61,46 +53,27 @@ def from_pretrained( else: Interprets as a filepath and loads the encodings stored there. """ # Load Model - quicksrnet = QuickSRNetSmall.from_pretrained() - input_shape = quicksrnet.get_input_spec()["image"][0] - equalize_model(quicksrnet, input_shape) - - # Download weights and quantization parameters - weights = CachedWebModelAsset.from_asset_store( - MODEL_ID, MODEL_ASSET_VERSION, QUANTIZED_WEIGHTS - ).fetch() - aimet_config = get_default_aimet_config_legacy_v2() - - # Load the model weights and quantization parameters - # In this particular instance, the state_dict keys from the model are all named "model." - # where is the name of each key in the weights file - without the word model. - # We rename all the keys to add the word model - state_dict = torch.load(weights, map_location=torch.device("cpu"))["state_dict"] - new_state_dict = {"model." 
+ key: value for key, value in state_dict.items()} - quicksrnet.load_state_dict(new_state_dict) + fp16_model = QuickSRNetSmall.from_pretrained() + input_shape = cls.get_input_spec()["image"][0] + model = prepare_model(fp16_model) + equalize_model(model, input_shape) sim = QuantizationSimModel( - quicksrnet, + fp16_model, quant_scheme="tf_enhanced", default_param_bw=8, default_output_bw=8, - config_file=aimet_config, + config_file=get_default_aimet_config(), dummy_input=torch.rand(input_shape), ) + constrain_quantized_inputs_to_image_range(sim) + if aimet_encodings: if aimet_encodings == "DEFAULT": aimet_encodings = CachedWebModelAsset.from_asset_store( - MODEL_ID, MODEL_ASSET_VERSION, AIMET_ENCODINGS + MODEL_ID, MODEL_ASSET_VERSION, DEFAULT_ENCODINGS ).fetch() load_encodings_to_sim(sim, aimet_encodings) sim.model.eval() return cls(sim) - - def preferred_hub_source_model_format( - self, target_runtime: TargetRuntime - ) -> SourceModelFormat: - if target_runtime == TargetRuntime.QNN: - return SourceModelFormat.ONNX - else: - return SourceModelFormat.TORCHSCRIPT diff --git a/qai_hub_models/models/quicksrnetsmall_quantized/perf.yaml b/qai_hub_models/models/quicksrnetsmall_quantized/perf.yaml index 829787bd..20ec2659 100644 --- a/qai_hub_models/models/quicksrnetsmall_quantized/perf.yaml +++ b/qai_hub_models/models/quicksrnetsmall_quantized/perf.yaml @@ -26,6 +26,7 @@ aggregated: - Samsung Galaxy S24 Ultra - Samsung Galaxy S24+ - Samsung Galaxy Tab S8 + - Snapdragon X Elite CRD - Xiaomi 12 - Xiaomi 12 Pro supported_chipsets: @@ -36,15 +37,16 @@ aggregated: - Snapdragon® 8 Gen 2 - Snapdragon® 8 Gen 3 - Snapdragon® 888 + - Snapdragon® X Elite models: - name: QuickSRNetSmall-Quantized performance_metrics: - torchscript_onnx_tflite: - inference_time: 987.0 - throughput: 1013.1712259371834 + inference_time: 957.0 + throughput: 1044.932079414838 estimated_peak_memory_range: - min: 20480 - max: 1821960 + min: 1048576 + max: 3323920 primary_compute_unit: NPU precision: int8 layer_info: @@ -52,7 +54,37 @@ models: layers_on_gpu: 0 layers_on_cpu: 3 total_layers: 11 - job_id: jogk78yop + job_id: jz57drwl5 + job_status: Passed + torchscript_onnx_qnn: + inference_time: 662.0 + throughput: 1510.5740181268882 + estimated_peak_memory_range: + min: 20480 + max: 2419512 + primary_compute_unit: NPU + precision: int8 + layer_info: + layers_on_npu: 8 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 8 + job_id: jo5mzx2wp + job_status: Passed + torchscript_onnx_ort: + inference_time: 1143.0 + throughput: 874.8906386701663 + estimated_peak_memory_range: + min: 212992 + max: 2520600 + primary_compute_unit: NPU + precision: int8 + layer_info: + layers_on_npu: 14 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 14 + job_id: jqpyd3w7p job_status: Passed reference_device_info: name: Samsung Galaxy S23 @@ -61,13 +93,13 @@ models: os_name: Android manufacturer: Samsung chipset: Snapdragon® 8 Gen 2 - timestamp: '2024-04-23T18:42:32.929166Z' + timestamp: '2024-05-20T16:35:30.337379Z' - torchscript_onnx_tflite: - inference_time: 1612.0 - throughput: 620.3473945409429 + inference_time: 788.0 + throughput: 1269.0355329949239 estimated_peak_memory_range: - min: 16384 - max: 18121488 + min: 0 + max: 18194848 primary_compute_unit: NPU precision: int8 layer_info: @@ -75,7 +107,37 @@ models: layers_on_gpu: 0 layers_on_cpu: 3 total_layers: 11 - job_id: jn5qev2m5 + job_id: jqp4wrovg + job_status: Passed + torchscript_onnx_qnn: + inference_time: 450.0 + throughput: 2222.222222222222 + estimated_peak_memory_range: + min: 61440 + 
max: 12988496 + primary_compute_unit: NPU + precision: int8 + layer_info: + layers_on_npu: 8 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 8 + job_id: jegnevyrg + job_status: Passed + torchscript_onnx_ort: + inference_time: 818.0 + throughput: 1222.4938875305625 + estimated_peak_memory_range: + min: 212992 + max: 14543472 + primary_compute_unit: NPU + precision: int8 + layer_info: + layers_on_npu: 14 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 14 + job_id: j2p0r0q6p job_status: Passed reference_device_info: name: Samsung Galaxy S24 @@ -84,73 +146,156 @@ models: os_name: Android manufacturer: Samsung chipset: Snapdragon® 8 Gen 3 - timestamp: '2024-04-23T18:42:32.929186Z' + timestamp: '2024-05-20T16:35:30.337406Z' - torchscript_onnx_tflite: - inference_time: 3227.0 - throughput: 309.88534242330337 + inference_time: 979.0 + throughput: 1021.4504596527069 estimated_peak_memory_range: - min: 49152 - max: 15102016 + min: 28672 + max: 2811096 primary_compute_unit: NPU precision: int8 layer_info: - layers_on_npu: 10 + layers_on_npu: 8 layers_on_gpu: 0 layers_on_cpu: 3 - total_layers: 13 - job_id: jqp4k24vg + total_layers: 11 + job_id: j0px1oj1g + job_status: Passed + torchscript_onnx_qnn: + inference_time: 661.0 + throughput: 1512.8593040847202 + estimated_peak_memory_range: + min: 20480 + max: 11468640 + primary_compute_unit: NPU + precision: int8 + layer_info: + layers_on_npu: 8 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 8 + job_id: jep2my645 job_status: Passed reference_device_info: - name: RB3 Gen 2 (Proxy) + name: QCS8550 (Proxy) os: '12' form_factor: Iot os_name: Android manufacturer: Qualcomm - chipset: Qcs6490 - timestamp: '2024-04-23T18:42:32.929214Z' + chipset: Qcs8550 + timestamp: '2024-05-20T16:35:30.337423Z' - torchscript_onnx_tflite: - inference_time: 12108.0 - throughput: 82.59002312520647 + inference_time: 1682.0 + throughput: 594.5303210463734 estimated_peak_memory_range: - min: 5685248 - max: 13091440 + min: 12288 + max: 13230640 primary_compute_unit: NPU precision: int8 layer_info: - layers_on_npu: 10 + layers_on_npu: 8 layers_on_gpu: 0 layers_on_cpu: 3 - total_layers: 13 - job_id: j1pvr2w75 + total_layers: 11 + job_id: j1p3er0m5 + job_status: Passed + torchscript_onnx_qnn: + inference_time: 1294.0 + throughput: 772.7975270479135 + estimated_peak_memory_range: + min: 65536 + max: 12983280 + primary_compute_unit: NPU + precision: int8 + layer_info: + layers_on_npu: 8 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 8 + job_id: jz5wqr645 job_status: Passed reference_device_info: - name: RB5 (Proxy) + name: RB3 Gen 2 (Proxy) os: '12' form_factor: Iot os_name: Android manufacturer: Qualcomm - chipset: Qcs8250 - timestamp: '2024-04-23T18:42:32.929227Z' + chipset: Qcs6490 + timestamp: '2024-05-20T16:35:30.337440Z' - torchscript_onnx_tflite: - inference_time: 1388.0 - throughput: 720.4610951008646 + inference_time: 5698.0 + throughput: 175.5001755001755 estimated_peak_memory_range: - min: 24576 - max: 1828056 + min: 3362816 + max: 13394304 primary_compute_unit: NPU precision: int8 layer_info: - layers_on_npu: 10 + layers_on_npu: 8 layers_on_gpu: 0 layers_on_cpu: 3 - total_layers: 13 - job_id: j0pxnzr15 + total_layers: 11 + job_id: jwgo3961g job_status: Passed reference_device_info: - name: QCS8550 (Proxy) + name: RB5 (Proxy) os: '12' form_factor: Iot os_name: Android manufacturer: Qualcomm - chipset: Qcs8550 - timestamp: '2024-04-23T18:42:32.929244Z' + chipset: Qcs8250 + timestamp: '2024-05-20T16:35:30.337451Z' + - torchscript_onnx_qnn: + 
inference_time: 762.0 + throughput: 1312.3359580052493 + estimated_peak_memory_range: + min: 49152 + max: 49152 + primary_compute_unit: NPU + precision: int8 + layer_info: + layers_on_npu: 8 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 8 + job_id: jopry3q9g + job_status: Passed + torchscript_onnx_ort: + inference_time: 1088.0 + throughput: 919.1176470588235 + estimated_peak_memory_range: + min: 9007104 + max: 9007104 + primary_compute_unit: NPU + precision: int8 + layer_info: + layers_on_npu: 14 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 14 + job_id: j1p87y9x5 + job_status: Passed + torchscript_onnx_ort_dml_gpu: + inference_time: 8332.0 + throughput: 120.01920307249159 + estimated_peak_memory_range: + min: 33210368 + max: 33210368 + primary_compute_unit: GPU + precision: int8 + layer_info: + layers_on_npu: 0 + layers_on_gpu: 1 + layers_on_cpu: 0 + total_layers: 1 + job_id: jogkyxn2p + job_status: Passed + reference_device_info: + name: Snapdragon X Elite CRD + os: '11' + form_factor: Compute + os_name: Windows + manufacturer: Qualcomm + chipset: Snapdragon® X Elite + timestamp: '2024-05-20T16:35:30.337478Z' diff --git a/qai_hub_models/models/quicksrnetsmall_quantized/test.py b/qai_hub_models/models/quicksrnetsmall_quantized/test.py index be878b99..b23accfd 100644 --- a/qai_hub_models/models/quicksrnetsmall_quantized/test.py +++ b/qai_hub_models/models/quicksrnetsmall_quantized/test.py @@ -3,7 +3,6 @@ # SPDX-License-Identifier: BSD-3-Clause # --------------------------------------------------------------------- import os -import tempfile import zipfile import numpy as np @@ -18,7 +17,11 @@ MODEL_ID, QuickSRNetSmallQuantizable, ) -from qai_hub_models.utils.asset_loaders import CachedWebModelAsset, load_image +from qai_hub_models.utils.asset_loaders import ( + CachedWebModelAsset, + load_image, + qaihm_temp_dir, +) from qai_hub_models.utils.testing import assert_most_close, skip_clone_repo_check OUTPUT_IMAGE_ADDRESS = CachedWebModelAsset.from_asset_store( @@ -69,7 +72,7 @@ def test_trace(): def test_aimet_export(): model = QuickSRNetSmallQuantizable.from_pretrained() name = model.__class__.__name__ - with tempfile.TemporaryDirectory() as tmpdir: + with qaihm_temp_dir() as tmpdir: output_zip = model.convert_to_onnx_and_aimet_encodings( tmpdir, ) diff --git a/qai_hub_models/models/real_esrgan_general_x4v3/README.md b/qai_hub_models/models/real_esrgan_general_x4v3/README.md index f3f03e6e..c25f5606 100644 --- a/qai_hub_models/models/real_esrgan_general_x4v3/README.md +++ b/qai_hub_models/models/real_esrgan_general_x4v3/README.md @@ -14,6 +14,8 @@ accross various devices, can be found [here](https://aihub.qualcomm.com/models/r a hosted Qualcomm® device. + + ## Example & Usage Install the package via pip: diff --git a/qai_hub_models/models/real_esrgan_general_x4v3/export.py b/qai_hub_models/models/real_esrgan_general_x4v3/export.py index d1672a8d..88259dab 100644 --- a/qai_hub_models/models/real_esrgan_general_x4v3/export.py +++ b/qai_hub_models/models/real_esrgan_general_x4v3/export.py @@ -120,12 +120,16 @@ def export_model( model.eval() source_model = torch.jit.trace(model.to("cpu"), make_torch_inputs(input_spec)) + # Convert outputs from channel last to channel first (preferred I/O format for QNN and TensorFlow Lite) + channel_last_flags = ( + " --force_channel_last_input image" + " --force_channel_last_output output_0" + if target_runtime != TargetRuntime.ORT + else "" + ) + # 2. 
Compile the model to an on-device asset model_compile_options = model.get_hub_compile_options( - target_runtime, - compile_options - + " --force_channel_last_input image" - + " --force_channel_last_output output_0", + target_runtime, compile_options + channel_last_flags, hub_device ) print(f"Optimizing model {model_name} to run on-device") submitted_compile_job = hub.submit_compile_job( @@ -163,8 +167,10 @@ def export_model( ) sample_inputs = model.sample_inputs(input_spec) # Convert inputs from channel first to channel last - hub_inputs = transpose_channel_first_to_last( - "image", sample_inputs, target_runtime + hub_inputs = ( + sample_inputs + if target_runtime == TargetRuntime.ORT + else transpose_channel_first_to_last("image", sample_inputs, target_runtime) ) submitted_inference_job = hub.submit_inference_job( model=compile_job.get_target_model(), @@ -192,8 +198,12 @@ def export_model( assert inference_job is not None and inference_job.wait().success inference_result: hub.client.DatasetEntries = inference_job.download_output_data() # type: ignore # Convert outputs from channel last to channel first - inference_result = transpose_channel_last_to_first( - "output_0", inference_result, target_runtime + inference_result = ( + inference_result + if target_runtime == TargetRuntime.ORT + else transpose_channel_last_to_first( + "output_0", inference_result, target_runtime + ) ) print_inference_metrics(inference_job, inference_result, torch_out) @@ -205,7 +215,7 @@ def export_model( def main(): warnings.filterwarnings("ignore") - parser = export_parser(model_cls=Model, supports_ort=False) + parser = export_parser(model_cls=Model) args = parser.parse_args() export_model(**vars(args)) diff --git a/qai_hub_models/models/real_esrgan_general_x4v3/perf.yaml b/qai_hub_models/models/real_esrgan_general_x4v3/perf.yaml index f10d3449..d62918c6 100644 --- a/qai_hub_models/models/real_esrgan_general_x4v3/perf.yaml +++ b/qai_hub_models/models/real_esrgan_general_x4v3/perf.yaml @@ -22,6 +22,7 @@ aggregated: - Samsung Galaxy S24 Ultra - Samsung Galaxy S24+ - Samsung Galaxy Tab S8 + - Snapdragon X Elite CRD - Xiaomi 12 - Xiaomi 12 Pro supported_chipsets: @@ -30,15 +31,16 @@ aggregated: - Snapdragon® 8 Gen 2 - Snapdragon® 8 Gen 3 - Snapdragon® 888 + - Snapdragon® X Elite models: - name: Real-ESRGAN-General-x4v3 performance_metrics: - torchscript_onnx_tflite: - inference_time: 7205.0 - throughput: 138.79250520471894 + inference_time: 7261.0 + throughput: 137.72207684891887 estimated_peak_memory_range: - min: 15941632 - max: 27205736 + min: 17612800 + max: 21719648 primary_compute_unit: NPU precision: fp16 layer_info: @@ -46,14 +48,14 @@ models: layers_on_gpu: 0 layers_on_cpu: 3 total_layers: 72 - job_id: j1gl6lklg + job_id: jn5q2qk45 job_status: Passed torchscript_onnx_qnn: - inference_time: 7008.0 - throughput: 142.69406392694063 + inference_time: 6254.0 + throughput: 159.89766549408378 estimated_peak_memory_range: - min: 45056 - max: 45937496 + min: 245760 + max: 5108560 primary_compute_unit: NPU precision: fp16 layer_info: @@ -61,22 +63,22 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 72 - job_id: j1p3v6mzg + job_id: j1p3m03lg job_status: Passed torchscript_onnx_ort: - inference_time: 7130.0 - throughput: 140.25245441795232 + inference_time: 6861.0 + throughput: 145.75134819997086 estimated_peak_memory_range: - min: 8429568 - max: 23590888 + min: 6336512 + max: 17772656 primary_compute_unit: NPU precision: fp16 layer_info: - layers_on_npu: 1 + layers_on_npu: 74 layers_on_gpu: 0 layers_on_cpu: 0 
- total_layers: 1 - job_id: j1pv07wm5 + total_layers: 74 + job_id: jlpevm115 job_status: Passed reference_device_info: name: Samsung Galaxy S23 @@ -85,13 +87,13 @@ models: os_name: Android manufacturer: Samsung chipset: Snapdragon® 8 Gen 2 - timestamp: '2024-04-23T18:42:32.949581Z' + timestamp: '2024-05-20T16:35:30.377608Z' - torchscript_onnx_tflite: - inference_time: 5369.0 - throughput: 186.25442354255912 + inference_time: 5603.0 + throughput: 178.4758165268606 estimated_peak_memory_range: - min: 20480 - max: 55365360 + min: 16384 + max: 55868880 primary_compute_unit: NPU precision: fp16 layer_info: @@ -99,14 +101,14 @@ models: layers_on_gpu: 0 layers_on_cpu: 3 total_layers: 72 - job_id: jw56ew17g + job_id: j1glkmz8p job_status: Passed torchscript_onnx_qnn: - inference_time: 4934.0 - throughput: 202.67531414673692 + inference_time: 4592.0 + throughput: 217.77003484320556 estimated_peak_memory_range: - min: 12288 - max: 31445424 + min: 208896 + max: 33800560 primary_compute_unit: NPU precision: fp16 layer_info: @@ -114,22 +116,22 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 72 - job_id: jwgok8vdp + job_id: jwgov60x5 job_status: Passed torchscript_onnx_ort: - inference_time: 5279.0 - throughput: 189.42981625307823 + inference_time: 5149.0 + throughput: 194.21246844047388 estimated_peak_memory_range: - min: 8392704 - max: 47488976 + min: 2310144 + max: 36369760 primary_compute_unit: NPU precision: fp16 layer_info: - layers_on_npu: 1 + layers_on_npu: 74 layers_on_gpu: 0 layers_on_cpu: 0 - total_layers: 1 - job_id: j7gjzql85 + total_layers: 74 + job_id: jygz7d9kp job_status: Passed reference_device_info: name: Samsung Galaxy S24 @@ -138,13 +140,13 @@ models: os_name: Android manufacturer: Samsung chipset: Snapdragon® 8 Gen 3 - timestamp: '2024-04-23T18:42:32.949640Z' + timestamp: '2024-05-20T16:35:30.377636Z' - torchscript_onnx_tflite: - inference_time: 7123.0 - throughput: 140.39028499227854 + inference_time: 7335.0 + throughput: 136.332651670075 estimated_peak_memory_range: - min: 15777792 - max: 23652120 + min: 9465856 + max: 18689240 primary_compute_unit: NPU precision: fp16 layer_info: @@ -152,14 +154,14 @@ models: layers_on_gpu: 0 layers_on_cpu: 3 total_layers: 72 - job_id: jopr87d95 + job_id: jw5614j0p job_status: Passed torchscript_onnx_qnn: - inference_time: 7016.0 - throughput: 142.53135689851769 + inference_time: 6280.0 + throughput: 159.23566878980893 estimated_peak_memory_range: - min: 32768 - max: 10477536 + min: 53248 + max: 43875408 primary_compute_unit: NPU precision: fp16 layer_info: @@ -167,7 +169,7 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 72 - job_id: j1p80krxg + job_id: j7gjlnmxp job_status: Passed reference_device_info: name: QCS8550 (Proxy) @@ -176,4 +178,57 @@ models: os_name: Android manufacturer: Qualcomm chipset: Qcs8550 - timestamp: '2024-04-23T18:42:32.949674Z' + timestamp: '2024-05-20T16:35:30.377655Z' + - torchscript_onnx_qnn: + inference_time: 8724.0 + throughput: 114.62631820265933 + estimated_peak_memory_range: + min: 229376 + max: 229376 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 72 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 72 + job_id: j1pvwkojg + job_status: Passed + torchscript_onnx_ort: + inference_time: 7228.0 + throughput: 138.35085777531822 + estimated_peak_memory_range: + min: 8613888 + max: 8613888 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 74 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 74 + job_id: jz5w96n6p + job_status: Passed 
+ torchscript_onnx_ort_dml_gpu: + inference_time: 58952.0 + throughput: 16.96295291084272 + estimated_peak_memory_range: + min: 26607616 + max: 26607616 + primary_compute_unit: CPU + precision: fp32 + layer_info: + layers_on_npu: 0 + layers_on_gpu: 0 + layers_on_cpu: 70 + total_layers: 70 + job_id: jmg94nel5 + job_status: Passed + reference_device_info: + name: Snapdragon X Elite CRD + os: '11' + form_factor: Compute + os_name: Windows + manufacturer: Qualcomm + chipset: Snapdragon® X Elite + timestamp: '2024-05-20T16:35:30.377681Z' diff --git a/qai_hub_models/models/real_esrgan_x4plus/README.md b/qai_hub_models/models/real_esrgan_x4plus/README.md index c3e6d01a..89551a63 100644 --- a/qai_hub_models/models/real_esrgan_x4plus/README.md +++ b/qai_hub_models/models/real_esrgan_x4plus/README.md @@ -14,6 +14,8 @@ accross various devices, can be found [here](https://aihub.qualcomm.com/models/r a hosted Qualcomm® device. + + ## Example & Usage Install the package via pip: diff --git a/qai_hub_models/models/real_esrgan_x4plus/export.py b/qai_hub_models/models/real_esrgan_x4plus/export.py index a5693ae6..f10bbd12 100644 --- a/qai_hub_models/models/real_esrgan_x4plus/export.py +++ b/qai_hub_models/models/real_esrgan_x4plus/export.py @@ -120,7 +120,7 @@ def export_model( # 2. Compile the model to an on-device asset model_compile_options = model.get_hub_compile_options( - target_runtime, compile_options + target_runtime, compile_options, hub_device ) print(f"Optimizing model {model_name} to run on-device") submitted_compile_job = hub.submit_compile_job( @@ -192,7 +192,7 @@ def export_model( def main(): warnings.filterwarnings("ignore") - parser = export_parser(model_cls=Model, supports_ort=False) + parser = export_parser(model_cls=Model) args = parser.parse_args() export_model(**vars(args)) diff --git a/qai_hub_models/models/real_esrgan_x4plus/perf.yaml b/qai_hub_models/models/real_esrgan_x4plus/perf.yaml index a51c1d3e..02636e72 100644 --- a/qai_hub_models/models/real_esrgan_x4plus/perf.yaml +++ b/qai_hub_models/models/real_esrgan_x4plus/perf.yaml @@ -22,6 +22,7 @@ aggregated: - Samsung Galaxy S24 Ultra - Samsung Galaxy S24+ - Samsung Galaxy Tab S8 + - Snapdragon X Elite CRD - Xiaomi 12 - Xiaomi 12 Pro supported_chipsets: @@ -30,15 +31,31 @@ aggregated: - Snapdragon® 8 Gen 2 - Snapdragon® 8 Gen 3 - Snapdragon® 888 + - Snapdragon® X Elite models: - name: Real-ESRGAN-x4plus performance_metrics: - - torchscript_onnx_qnn: - inference_time: 65726.0 - throughput: 15.214679122417309 + - torchscript_onnx_tflite: + inference_time: 68854.0 + throughput: 14.523484474395097 estimated_peak_memory_range: - min: 102400 - max: 107703704 + min: 28672 + max: 3752144 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 1028 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 1028 + job_id: jnp18zx2g + job_status: Passed + torchscript_onnx_qnn: + inference_time: 68240.0 + throughput: 14.654161781946073 + estimated_peak_memory_range: + min: 94208 + max: 108186752 primary_compute_unit: NPU precision: fp16 layer_info: @@ -46,22 +63,22 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 1031 - job_id: jygzon765 + job_id: jmg94new5 job_status: Passed torchscript_onnx_ort: - inference_time: 69431.0 - throughput: 14.402788379830335 + inference_time: 67823.0 + throughput: 14.744260796484967 estimated_peak_memory_range: - min: 6467584 - max: 119585224 + min: 6422528 + max: 150577760 primary_compute_unit: NPU precision: fp16 layer_info: - layers_on_npu: 1 + layers_on_npu: 1030 layers_on_gpu: 0 
layers_on_cpu: 0 - total_layers: 1 - job_id: jmg9jd4v5 + total_layers: 1030 + job_id: jqp4wr08g job_status: Passed reference_device_info: name: Samsung Galaxy S23 @@ -70,13 +87,28 @@ models: os_name: Android manufacturer: Samsung chipset: Snapdragon® 8 Gen 2 - timestamp: '2024-04-23T18:42:32.973754Z' - - torchscript_onnx_qnn: - inference_time: 50526.0 - throughput: 19.79179036535645 + timestamp: '2024-05-20T16:35:30.408294Z' + - torchscript_onnx_tflite: + inference_time: 54608.0 + throughput: 18.312335188983297 + estimated_peak_memory_range: + min: 3264512 + max: 587498384 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 1028 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 1028 + job_id: jvgdv1leg + job_status: Passed + torchscript_onnx_qnn: + inference_time: 50248.0 + throughput: 19.901289603566312 estimated_peak_memory_range: - min: 53248 - max: 259398784 + min: 86016 + max: 262075680 primary_compute_unit: NPU precision: fp16 layer_info: @@ -84,22 +116,22 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 1031 - job_id: jz5w249j5 + job_id: jnp18zx8g job_status: Passed torchscript_onnx_ort: - inference_time: 50628.0 - throughput: 19.751915935845776 + inference_time: 51447.0 + throughput: 19.43747934767819 estimated_peak_memory_range: - min: 7217152 - max: 193898256 + min: 6303744 + max: 192645232 primary_compute_unit: NPU precision: fp16 layer_info: - layers_on_npu: 1 + layers_on_npu: 1030 layers_on_gpu: 0 layers_on_cpu: 0 - total_layers: 1 - job_id: jvgde2vl5 + total_layers: 1030 + job_id: j0px1o23g job_status: Passed reference_device_info: name: Samsung Galaxy S24 @@ -108,13 +140,28 @@ models: os_name: Android manufacturer: Samsung chipset: Snapdragon® 8 Gen 3 - timestamp: '2024-04-23T18:42:32.973885Z' - - torchscript_onnx_qnn: - inference_time: 67718.0 - throughput: 14.767122478513837 + timestamp: '2024-05-20T16:35:30.408323Z' + - torchscript_onnx_tflite: + inference_time: 74054.0 + throughput: 13.503659491722257 + estimated_peak_memory_range: + min: 3284992 + max: 5941440 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 1028 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 1028 + job_id: jz5w96n3p + job_status: Passed + torchscript_onnx_qnn: + inference_time: 64798.0 + throughput: 15.432575079477761 estimated_peak_memory_range: - min: 163840 - max: 107805352 + min: 102400 + max: 107714376 primary_compute_unit: NPU precision: fp16 layer_info: @@ -122,7 +169,7 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 1031 - job_id: j1p3vr7lg + job_id: jz57dr3v5 job_status: Passed reference_device_info: name: QCS8550 (Proxy) @@ -131,4 +178,57 @@ models: os_name: Android manufacturer: Qualcomm chipset: Qcs8550 - timestamp: '2024-04-23T18:42:32.974009Z' + timestamp: '2024-05-20T16:35:30.408341Z' + - torchscript_onnx_qnn: + inference_time: 73958.0 + throughput: 13.521187701127667 + estimated_peak_memory_range: + min: 217088 + max: 217088 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 1030 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 1030 + job_id: jvgdv1lrg + job_status: Passed + torchscript_onnx_ort: + inference_time: 65800.0 + throughput: 15.19756838905775 + estimated_peak_memory_range: + min: 1351680 + max: 1351680 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 1030 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 1030 + job_id: jo5mzxydp + job_status: Passed + torchscript_onnx_ort_dml_gpu: + inference_time: 598980.0 + throughput: 
1.669504824868944 + estimated_peak_memory_range: + min: 550260736 + max: 550260736 + primary_compute_unit: GPU + precision: fp16 + layer_info: + layers_on_npu: 0 + layers_on_gpu: 1 + layers_on_cpu: 0 + total_layers: 1 + job_id: jegnev8kg + job_status: Passed + reference_device_info: + name: Snapdragon X Elite CRD + os: '11' + form_factor: Compute + os_name: Windows + manufacturer: Qualcomm + chipset: Snapdragon® X Elite + timestamp: '2024-05-20T16:35:30.408364Z' diff --git a/qai_hub_models/models/regnet/README.md b/qai_hub_models/models/regnet/README.md index e47cbccd..96c82923 100644 --- a/qai_hub_models/models/regnet/README.md +++ b/qai_hub_models/models/regnet/README.md @@ -14,6 +14,8 @@ accross various devices, can be found [here](https://aihub.qualcomm.com/models/r a hosted Qualcomm® device. + + ## Example & Usage diff --git a/qai_hub_models/models/regnet/export.py b/qai_hub_models/models/regnet/export.py index b7eea153..079e7a95 100644 --- a/qai_hub_models/models/regnet/export.py +++ b/qai_hub_models/models/regnet/export.py @@ -119,9 +119,16 @@ def export_model( model.eval() source_model = torch.jit.trace(model.to("cpu"), make_torch_inputs(input_spec)) + # Convert outputs from channel last to channel first (preferred I/O format for QNN and TensorFlow Lite) + channel_last_flags = ( + " --force_channel_last_input image_tensor" + if target_runtime != TargetRuntime.ORT + else "" + ) + # 2. Compile the model to an on-device asset model_compile_options = model.get_hub_compile_options( - target_runtime, compile_options + " --force_channel_last_input image_tensor" + target_runtime, compile_options + channel_last_flags, hub_device ) print(f"Optimizing model {model_name} to run on-device") submitted_compile_job = hub.submit_compile_job( @@ -159,8 +166,12 @@ def export_model( ) sample_inputs = model.sample_inputs(input_spec) # Convert inputs from channel first to channel last - hub_inputs = transpose_channel_first_to_last( - "image_tensor", sample_inputs, target_runtime + hub_inputs = ( + sample_inputs + if target_runtime == TargetRuntime.ORT + else transpose_channel_first_to_last( + "image_tensor", sample_inputs, target_runtime + ) ) submitted_inference_job = hub.submit_inference_job( model=compile_job.get_target_model(), @@ -197,7 +208,7 @@ def export_model( def main(): warnings.filterwarnings("ignore") - parser = export_parser(model_cls=Model, supports_ort=False) + parser = export_parser(model_cls=Model) args = parser.parse_args() export_model(**vars(args)) diff --git a/qai_hub_models/models/regnet/perf.yaml b/qai_hub_models/models/regnet/perf.yaml index 4e124163..1569c2bd 100644 --- a/qai_hub_models/models/regnet/perf.yaml +++ b/qai_hub_models/models/regnet/perf.yaml @@ -22,6 +22,7 @@ aggregated: - Samsung Galaxy S24 Ultra - Samsung Galaxy S24+ - Samsung Galaxy Tab S8 + - Snapdragon X Elite CRD - Xiaomi 12 - Xiaomi 12 Pro supported_chipsets: @@ -30,15 +31,16 @@ aggregated: - Snapdragon® 8 Gen 2 - Snapdragon® 8 Gen 3 - Snapdragon® 888 + - Snapdragon® X Elite models: - name: RegNet performance_metrics: - torchscript_onnx_tflite: - inference_time: 2314.0 - throughput: 432.152117545376 + inference_time: 2321.0 + throughput: 430.8487720809996 estimated_peak_memory_range: - min: 16384 - max: 2190392 + min: 28672 + max: 2093984 primary_compute_unit: NPU precision: fp16 layer_info: @@ -46,14 +48,14 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 114 - job_id: jqp4k3xlg + job_id: jopry3j0g job_status: Passed torchscript_onnx_qnn: - inference_time: 2128.0 - throughput: 469.9248120300752 + 
inference_time: 2130.0 + throughput: 469.4835680751174 estimated_peak_memory_range: - min: 20480 - max: 15932376 + min: 16384 + max: 16919216 primary_compute_unit: NPU precision: fp16 layer_info: @@ -61,22 +63,22 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 188 - job_id: jo5mq8wqp + job_id: j2p0r079p job_status: Passed torchscript_onnx_ort: - inference_time: 2423.0 - throughput: 412.71151465125877 + inference_time: 2312.0 + throughput: 432.52595155709344 estimated_peak_memory_range: - min: 12288 - max: 87079712 + min: 49152 + max: 79165336 primary_compute_unit: NPU precision: fp16 layer_info: - layers_on_npu: 1 + layers_on_npu: 190 layers_on_gpu: 0 layers_on_cpu: 0 - total_layers: 1 - job_id: jopr8w4e5 + total_layers: 190 + job_id: j1glkmrjp job_status: Passed reference_device_info: name: Samsung Galaxy S23 @@ -85,13 +87,13 @@ models: os_name: Android manufacturer: Samsung chipset: Snapdragon® 8 Gen 2 - timestamp: '2024-04-23T18:42:32.991772Z' + timestamp: '2024-05-20T16:35:30.439253Z' - torchscript_onnx_tflite: - inference_time: 1616.0 - throughput: 618.8118811881188 + inference_time: 1625.0 + throughput: 615.3846153846154 estimated_peak_memory_range: - min: 12288 - max: 134209840 + min: 78073856 + max: 211737456 primary_compute_unit: NPU precision: fp16 layer_info: @@ -99,14 +101,14 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 114 - job_id: j0pxnx795 + job_id: jep2mynr5 job_status: Passed torchscript_onnx_qnn: - inference_time: 1506.0 - throughput: 664.0106241699867 + inference_time: 1481.0 + throughput: 675.219446320054 estimated_peak_memory_range: - min: 618496 - max: 77239488 + min: 0 + max: 72188080 primary_compute_unit: NPU precision: fp16 layer_info: @@ -114,22 +116,22 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 188 - job_id: jegnlk9m5 + job_id: j1p87yvk5 job_status: Passed torchscript_onnx_ort: - inference_time: 1699.0 - throughput: 588.5815185403178 + inference_time: 1586.0 + throughput: 630.517023959647 estimated_peak_memory_range: - min: 618496 - max: 36167024 + min: 0 + max: 38564464 primary_compute_unit: NPU precision: fp16 layer_info: - layers_on_npu: 1 + layers_on_npu: 190 layers_on_gpu: 0 layers_on_cpu: 0 - total_layers: 1 - job_id: jep20e7mg + total_layers: 190 + job_id: jw5614l6p job_status: Passed reference_device_info: name: Samsung Galaxy S24 @@ -138,13 +140,13 @@ models: os_name: Android manufacturer: Samsung chipset: Snapdragon® 8 Gen 3 - timestamp: '2024-04-23T18:42:32.991836Z' + timestamp: '2024-05-20T16:35:30.439280Z' - torchscript_onnx_tflite: - inference_time: 2329.0 - throughput: 429.36882782310005 + inference_time: 2331.0 + throughput: 429.000429000429 estimated_peak_memory_range: - min: 24576 - max: 2315288 + min: 32768 + max: 2367144 primary_compute_unit: NPU precision: fp16 layer_info: @@ -152,14 +154,14 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 114 - job_id: jvgdemme5 + job_id: jqpyd308p job_status: Passed torchscript_onnx_qnn: - inference_time: 2130.0 - throughput: 469.4835680751174 + inference_time: 2139.0 + throughput: 467.50818139317437 estimated_peak_memory_range: - min: 12288 - max: 56502216 + min: 16384 + max: 66236928 primary_compute_unit: NPU precision: fp16 layer_info: @@ -167,7 +169,7 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 188 - job_id: jo5mqllwp + job_id: jn5q2qon5 job_status: Passed reference_device_info: name: QCS8550 (Proxy) @@ -176,4 +178,57 @@ models: os_name: Android manufacturer: Qualcomm chipset: Qcs8550 - timestamp: '2024-04-23T18:42:32.991902Z' + 
timestamp: '2024-05-20T16:35:30.439298Z' + - torchscript_onnx_qnn: + inference_time: 2466.0 + throughput: 405.51500405515003 + estimated_peak_memory_range: + min: 602112 + max: 602112 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 188 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 188 + job_id: jogkyxmwp + job_status: Passed + torchscript_onnx_ort: + inference_time: 2190.0 + throughput: 456.62100456621005 + estimated_peak_memory_range: + min: 34840576 + max: 34840576 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 190 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 190 + job_id: j1p3m023g + job_status: Passed + torchscript_onnx_ort_dml_gpu: + inference_time: 14744.0 + throughput: 67.82419967444385 + estimated_peak_memory_range: + min: 70148096 + max: 70148096 + primary_compute_unit: CPU + precision: fp32 + layer_info: + layers_on_npu: 0 + layers_on_gpu: 0 + layers_on_cpu: 85 + total_layers: 85 + job_id: jwgov6qq5 + job_status: Passed + reference_device_info: + name: Snapdragon X Elite CRD + os: '11' + form_factor: Compute + os_name: Windows + manufacturer: Qualcomm + chipset: Snapdragon® X Elite + timestamp: '2024-05-20T16:35:30.439321Z' diff --git a/qai_hub_models/models/resnet101/README.md b/qai_hub_models/models/resnet101/README.md index 145fa7e8..218a6131 100644 --- a/qai_hub_models/models/resnet101/README.md +++ b/qai_hub_models/models/resnet101/README.md @@ -14,6 +14,8 @@ accross various devices, can be found [here](https://aihub.qualcomm.com/models/r a hosted Qualcomm® device. + + ## Example & Usage diff --git a/qai_hub_models/models/resnet101/export.py b/qai_hub_models/models/resnet101/export.py index feaaa511..d2b19892 100644 --- a/qai_hub_models/models/resnet101/export.py +++ b/qai_hub_models/models/resnet101/export.py @@ -119,9 +119,16 @@ def export_model( model.eval() source_model = torch.jit.trace(model.to("cpu"), make_torch_inputs(input_spec)) + # Convert outputs from channel last to channel first (preferred I/O format for QNN and TensorFlow Lite) + channel_last_flags = ( + " --force_channel_last_input image_tensor" + if target_runtime != TargetRuntime.ORT + else "" + ) + # 2. 
Compile the model to an on-device asset model_compile_options = model.get_hub_compile_options( - target_runtime, compile_options + " --force_channel_last_input image_tensor" + target_runtime, compile_options + channel_last_flags, hub_device ) print(f"Optimizing model {model_name} to run on-device") submitted_compile_job = hub.submit_compile_job( @@ -159,8 +166,12 @@ def export_model( ) sample_inputs = model.sample_inputs(input_spec) # Convert inputs from channel first to channel last - hub_inputs = transpose_channel_first_to_last( - "image_tensor", sample_inputs, target_runtime + hub_inputs = ( + sample_inputs + if target_runtime == TargetRuntime.ORT + else transpose_channel_first_to_last( + "image_tensor", sample_inputs, target_runtime + ) ) submitted_inference_job = hub.submit_inference_job( model=compile_job.get_target_model(), @@ -197,7 +208,7 @@ def export_model( def main(): warnings.filterwarnings("ignore") - parser = export_parser(model_cls=Model, supports_ort=False) + parser = export_parser(model_cls=Model) args = parser.parse_args() export_model(**vars(args)) diff --git a/qai_hub_models/models/resnet101/perf.yaml b/qai_hub_models/models/resnet101/perf.yaml index ec4a90fd..b8e31514 100644 --- a/qai_hub_models/models/resnet101/perf.yaml +++ b/qai_hub_models/models/resnet101/perf.yaml @@ -22,6 +22,7 @@ aggregated: - Samsung Galaxy S24 Ultra - Samsung Galaxy S24+ - Samsung Galaxy Tab S8 + - Snapdragon X Elite CRD - Xiaomi 12 - Xiaomi 12 Pro supported_chipsets: @@ -30,15 +31,16 @@ aggregated: - Snapdragon® 8 Gen 2 - Snapdragon® 8 Gen 3 - Snapdragon® 888 + - Snapdragon® X Elite models: - name: ResNet101 performance_metrics: - torchscript_onnx_tflite: - inference_time: 3390.0 - throughput: 294.9852507374631 + inference_time: 3366.0 + throughput: 297.08853238265004 estimated_peak_memory_range: - min: 28672 - max: 1775440 + min: 36864 + max: 2178824 primary_compute_unit: NPU precision: fp16 layer_info: @@ -46,14 +48,14 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 147 - job_id: j7gjzq085 + job_id: jo5mznxdp job_status: Passed torchscript_onnx_qnn: - inference_time: 3448.0 - throughput: 290.0232018561485 + inference_time: 3453.0 + throughput: 289.6032435563278 estimated_peak_memory_range: - min: 638976 - max: 216598456 + min: 618496 + max: 173565024 primary_compute_unit: NPU precision: fp16 layer_info: @@ -61,22 +63,22 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 245 - job_id: jygzonx65 + job_id: jep2mkyr5 job_status: Passed torchscript_onnx_ort: - inference_time: 3747.0 - throughput: 266.88017080330934 + inference_time: 3601.0 + throughput: 277.700638711469 estimated_peak_memory_range: - min: 618496 - max: 366172984 + min: 12288 + max: 300122744 primary_compute_unit: NPU precision: fp16 layer_info: - layers_on_npu: 1 + layers_on_npu: 247 layers_on_gpu: 0 layers_on_cpu: 0 - total_layers: 1 - job_id: jmg9jd3v5 + total_layers: 247 + job_id: jn5q26qn5 job_status: Passed reference_device_info: name: Samsung Galaxy S23 @@ -85,13 +87,13 @@ models: os_name: Android manufacturer: Samsung chipset: Snapdragon® 8 Gen 2 - timestamp: '2024-04-23T18:42:33.051295Z' + timestamp: '2024-05-20T16:35:30.509475Z' - torchscript_onnx_tflite: - inference_time: 2446.0 - throughput: 408.8307440719542 + inference_time: 2430.0 + throughput: 411.52263374485597 estimated_peak_memory_range: - min: 212992 - max: 104476752 + min: 16384 + max: 107021088 primary_compute_unit: NPU precision: fp16 layer_info: @@ -99,14 +101,14 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 147 - job_id: 
jlpeeyr0p + job_id: jegne6vkg job_status: Passed torchscript_onnx_qnn: - inference_time: 2469.0 - throughput: 405.0222762251924 + inference_time: 2501.0 + throughput: 399.8400639744102 estimated_peak_memory_range: - min: 434176 - max: 81113840 + min: 618496 + max: 81769760 primary_compute_unit: NPU precision: fp16 layer_info: @@ -114,22 +116,22 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 245 - job_id: jz5w24dj5 + job_id: jqpyd138p job_status: Passed torchscript_onnx_ort: - inference_time: 2676.0 - throughput: 373.69207772795215 + inference_time: 2626.0 + throughput: 380.8073115003808 estimated_peak_memory_range: min: 618496 - max: 44227744 + max: 47698672 primary_compute_unit: NPU precision: fp16 layer_info: - layers_on_npu: 1 + layers_on_npu: 247 layers_on_gpu: 0 layers_on_cpu: 0 - total_layers: 1 - job_id: jnp1y6dlp + total_layers: 247 + job_id: j1glkvmjp job_status: Passed reference_device_info: name: Samsung Galaxy S24 @@ -138,13 +140,13 @@ models: os_name: Android manufacturer: Samsung chipset: Snapdragon® 8 Gen 3 - timestamp: '2024-04-23T18:42:33.051366Z' + timestamp: '2024-05-20T16:35:30.509503Z' - torchscript_onnx_tflite: - inference_time: 3443.0 - throughput: 290.4443799012489 + inference_time: 3408.0 + throughput: 293.42723004694835 estimated_peak_memory_range: min: 24576 - max: 2329152 + max: 2314664 primary_compute_unit: NPU precision: fp16 layer_info: @@ -152,14 +154,14 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 147 - job_id: jep20zqrg + job_id: jopryv30g job_status: Passed torchscript_onnx_qnn: - inference_time: 3473.0 - throughput: 287.93550244745177 + inference_time: 3469.0 + throughput: 288.2675122513693 estimated_peak_memory_range: min: 622592 - max: 217592784 + max: 173821024 primary_compute_unit: NPU precision: fp16 layer_info: @@ -167,7 +169,7 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 245 - job_id: jn5qedxn5 + job_id: j1p87qyk5 job_status: Passed reference_device_info: name: QCS8550 (Proxy) @@ -176,4 +178,57 @@ models: os_name: Android manufacturer: Qualcomm chipset: Qcs8550 - timestamp: '2024-04-23T18:42:33.051462Z' + timestamp: '2024-05-20T16:35:30.509521Z' + - torchscript_onnx_qnn: + inference_time: 3993.0 + throughput: 250.4382669671926 + estimated_peak_memory_range: + min: 602112 + max: 602112 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 245 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 245 + job_id: j2p0rz09p + job_status: Passed + torchscript_onnx_ort: + inference_time: 3504.0 + throughput: 285.38812785388126 + estimated_peak_memory_range: + min: 56750080 + max: 56750080 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 247 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 247 + job_id: jw561y46p + job_status: Passed + torchscript_onnx_ort_dml_gpu: + inference_time: 28994.0 + throughput: 34.48989446092295 + estimated_peak_memory_range: + min: 51179520 + max: 51179520 + primary_compute_unit: CPU + precision: fp32 + layer_info: + layers_on_npu: 0 + layers_on_gpu: 0 + layers_on_cpu: 110 + total_layers: 110 + job_id: j1p3mj03g + job_status: Passed + reference_device_info: + name: Snapdragon X Elite CRD + os: '11' + form_factor: Compute + os_name: Windows + manufacturer: Qualcomm + chipset: Snapdragon® X Elite + timestamp: '2024-05-20T16:35:30.509545Z' diff --git a/qai_hub_models/models/resnet101_quantized/README.md b/qai_hub_models/models/resnet101_quantized/README.md index 61c6fb55..f8ee5f88 100644 --- 
a/qai_hub_models/models/resnet101_quantized/README.md +++ b/qai_hub_models/models/resnet101_quantized/README.md @@ -14,6 +14,8 @@ accross various devices, can be found [here](https://aihub.qualcomm.com/models/r a hosted Qualcomm® device. + + ## Example & Usage diff --git a/qai_hub_models/models/resnet101_quantized/export.py b/qai_hub_models/models/resnet101_quantized/export.py index aacb445c..46a96d0c 100644 --- a/qai_hub_models/models/resnet101_quantized/export.py +++ b/qai_hub_models/models/resnet101_quantized/export.py @@ -123,9 +123,16 @@ def export_model( else: quant_calibration_data = model.get_calibration_data(target_runtime, input_spec) + # Convert outputs from channel last to channel first (preferred I/O format for QNN and TensorFlow Lite) + channel_last_flags = ( + " --force_channel_last_input image_tensor" + if target_runtime != TargetRuntime.ORT + else "" + ) + # 2. Compile the model to an on-device asset model_compile_options = model.get_hub_compile_options( - target_runtime, compile_options + " --force_channel_last_input image_tensor" + target_runtime, compile_options + channel_last_flags, hub_device ) print(f"Optimizing model {model_name} to run on-device") submitted_compile_job = hub.submit_compile_job( @@ -167,8 +174,12 @@ def export_model( if target_runtime == TargetRuntime.QNN: hub_inputs = get_qnn_inputs(compile_job, sample_inputs) # Convert inputs from channel first to channel last - hub_inputs = transpose_channel_first_to_last( - "image_tensor", sample_inputs, target_runtime + hub_inputs = ( + sample_inputs + if target_runtime == TargetRuntime.ORT + else transpose_channel_first_to_last( + "image_tensor", sample_inputs, target_runtime + ) ) submitted_inference_job = hub.submit_inference_job( model=compile_job.get_target_model(), @@ -207,7 +218,7 @@ def export_model( def main(): warnings.filterwarnings("ignore") - parser = export_parser(model_cls=Model, supports_ort=False) + parser = export_parser(model_cls=Model) args = parser.parse_args() export_model(**vars(args)) diff --git a/qai_hub_models/models/resnet101_quantized/perf.yaml b/qai_hub_models/models/resnet101_quantized/perf.yaml index 30dae2ef..e3636200 100644 --- a/qai_hub_models/models/resnet101_quantized/perf.yaml +++ b/qai_hub_models/models/resnet101_quantized/perf.yaml @@ -26,6 +26,7 @@ aggregated: - Samsung Galaxy S24 Ultra - Samsung Galaxy S24+ - Samsung Galaxy Tab S8 + - Snapdragon X Elite CRD - Xiaomi 12 - Xiaomi 12 Pro supported_chipsets: @@ -36,15 +37,16 @@ aggregated: - Snapdragon® 8 Gen 2 - Snapdragon® 8 Gen 3 - Snapdragon® 888 + - Snapdragon® X Elite models: - name: ResNet101Quantized performance_metrics: - torchscript_onnx_tflite: - inference_time: 1171.0 - throughput: 853.9709649871904 + inference_time: 1181.0 + throughput: 846.740050804403 estimated_peak_memory_range: - min: 28672 - max: 1746016 + min: 40960 + max: 2202864 primary_compute_unit: NPU precision: int8 layer_info: @@ -52,14 +54,14 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 148 - job_id: jz5709vrg + job_id: jlpevddo5 job_status: Passed torchscript_onnx_qnn: - inference_time: 1394.0 - throughput: 717.3601147776184 + inference_time: 1381.0 + throughput: 724.112961622013 estimated_peak_memory_range: - min: 12288 - max: 186309248 + min: 172032 + max: 8857136 primary_compute_unit: NPU precision: int8 layer_info: @@ -67,22 +69,22 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 146 - job_id: jopr8w1e5 + job_id: jmg94llw5 job_status: Passed torchscript_onnx_ort: - inference_time: 1804.0 - throughput: 554.3237250554324 
+ inference_time: 1574.0 + throughput: 635.3240152477764 estimated_peak_memory_range: - min: 12288 - max: 70503128 + min: 28672 + max: 151107432 primary_compute_unit: NPU precision: int8 layer_info: - layers_on_npu: 1 + layers_on_npu: 154 layers_on_gpu: 0 layers_on_cpu: 0 - total_layers: 1 - job_id: jogk78rop + total_layers: 154 + job_id: jqp4wll8g job_status: Passed reference_device_info: name: Samsung Galaxy S23 @@ -91,13 +93,13 @@ models: os_name: Android manufacturer: Samsung chipset: Snapdragon® 8 Gen 2 - timestamp: '2024-04-23T18:42:33.075407Z' + timestamp: '2024-05-20T16:35:30.539412Z' - torchscript_onnx_tflite: - inference_time: 922.0 - throughput: 1084.5986984815618 + inference_time: 889.0 + throughput: 1124.859392575928 estimated_peak_memory_range: - min: 16384 - max: 92718400 + min: 12288 + max: 92553280 primary_compute_unit: NPU precision: int8 layer_info: @@ -105,14 +107,14 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 148 - job_id: jo5mq8vqp + job_id: jygz733op job_status: Passed torchscript_onnx_qnn: - inference_time: 1061.0 - throughput: 942.5070688030161 + inference_time: 1045.0 + throughput: 956.9377990430622 estimated_peak_memory_range: - min: 167936 - max: 59048544 + min: 116203520 + max: 179474976 primary_compute_unit: NPU precision: int8 layer_info: @@ -120,22 +122,22 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 146 - job_id: jep20e3mg + job_id: jnp18448g job_status: Passed torchscript_onnx_ort: - inference_time: 1380.0 - throughput: 724.6376811594203 + inference_time: 1217.0 + throughput: 821.6926869350863 estimated_peak_memory_range: - min: 618496 - max: 46374032 + min: 0 + max: 43890864 primary_compute_unit: NPU precision: int8 layer_info: - layers_on_npu: 1 + layers_on_npu: 154 layers_on_gpu: 0 layers_on_cpu: 0 - total_layers: 1 - job_id: jn5qev9m5 + total_layers: 154 + job_id: j0px1kk3g job_status: Passed reference_device_info: name: Samsung Galaxy S24 @@ -144,51 +146,74 @@ models: os_name: Android manufacturer: Samsung chipset: Snapdragon® 8 Gen 3 - timestamp: '2024-04-23T18:42:33.075467Z' + timestamp: '2024-05-20T16:35:30.539439Z' - torchscript_onnx_tflite: - inference_time: 4806.0 - throughput: 208.07324178110696 + inference_time: 1190.0 + throughput: 840.3361344537815 estimated_peak_memory_range: - min: 24576 - max: 27299616 + min: 45056 + max: 1732448 primary_compute_unit: NPU precision: int8 layer_info: - layers_on_npu: 150 + layers_on_npu: 148 layers_on_gpu: 0 layers_on_cpu: 0 - total_layers: 150 - job_id: jvgdemjr5 + total_layers: 148 + job_id: jz5w9ee3p job_status: Passed torchscript_onnx_qnn: - inference_time: 'null' - throughput: 'null' + inference_time: 1380.0 + throughput: 724.6376811594203 estimated_peak_memory_range: - min: 0 - max: 0 - primary_compute_unit: 'null' - precision: 'null' + min: 20480 + max: 100094264 + primary_compute_unit: NPU + precision: int8 layer_info: - layers_on_npu: 0 + layers_on_npu: 146 layers_on_gpu: 0 layers_on_cpu: 0 - total_layers: 0 - job_id: j1gl6qnjg - job_status: Failed - torchscript_onnx_ort: - inference_time: 53190.0 - throughput: 18.80052641473961 + total_layers: 146 + job_id: jz57dyyv5 + job_status: Passed + reference_device_info: + name: QCS8550 (Proxy) + os: '12' + form_factor: Iot + os_name: Android + manufacturer: Qualcomm + chipset: Qcs8550 + timestamp: '2024-05-20T16:35:30.539460Z' + - torchscript_onnx_tflite: + inference_time: 4782.0 + throughput: 209.11752404851526 estimated_peak_memory_range: - min: 12480512 - max: 88971072 - primary_compute_unit: CPU - precision: fp32 + min: 
12288 + max: 29359024 + primary_compute_unit: NPU + precision: int8 layer_info: - layers_on_npu: 0 + layers_on_npu: 148 layers_on_gpu: 0 - layers_on_cpu: 156 - total_layers: 156 - job_id: j1gl6lelg + layers_on_cpu: 0 + total_layers: 148 + job_id: j2p0lx30p + job_status: Passed + torchscript_onnx_qnn: + inference_time: 5013.0 + throughput: 199.48134849391582 + estimated_peak_memory_range: + min: 163840 + max: 61142352 + primary_compute_unit: NPU + precision: int8 + layer_info: + layers_on_npu: 146 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 146 + job_id: j1p3ervm5 job_status: Passed reference_device_info: name: RB3 Gen 2 (Proxy) @@ -197,21 +222,21 @@ models: os_name: Android manufacturer: Qualcomm chipset: Qcs6490 - timestamp: '2024-04-23T18:42:33.075531Z' + timestamp: '2024-05-20T16:35:30.539477Z' - torchscript_onnx_tflite: - inference_time: 17430.0 - throughput: 57.37234652897303 + inference_time: 17166.0 + throughput: 58.25468950250495 estimated_peak_memory_range: - min: 16384 - max: 2096256 + min: 36864 + max: 4633128 primary_compute_unit: NPU precision: int8 layer_info: - layers_on_npu: 150 + layers_on_npu: 148 layers_on_gpu: 0 layers_on_cpu: 0 - total_layers: 150 - job_id: jlpew6v7p + total_layers: 148 + job_id: j1p8zk0qp job_status: Passed reference_device_info: name: RB5 (Proxy) @@ -220,42 +245,57 @@ models: os_name: Android manufacturer: Qualcomm chipset: Qcs8250 - timestamp: '2024-04-23T18:42:33.075558Z' - - torchscript_onnx_tflite: - inference_time: 1196.0 - throughput: 836.1204013377926 + timestamp: '2024-05-20T16:35:30.539488Z' + - torchscript_onnx_qnn: + inference_time: 1408.0 + throughput: 710.2272727272727 estimated_peak_memory_range: - min: 24576 - max: 2116496 + min: 356352 + max: 356352 primary_compute_unit: NPU precision: int8 layer_info: - layers_on_npu: 150 + layers_on_npu: 146 layers_on_gpu: 0 layers_on_cpu: 0 - total_layers: 150 - job_id: jygzo01o5 + total_layers: 146 + job_id: jvgdvxxrg job_status: Passed - torchscript_onnx_qnn: - inference_time: 1433.0 - throughput: 697.8367062107467 + torchscript_onnx_ort: + inference_time: 1445.0 + throughput: 692.0415224913495 estimated_peak_memory_range: - min: 61440 - max: 10820048 + min: 43081728 + max: 43081728 primary_compute_unit: NPU precision: int8 layer_info: - layers_on_npu: 148 + layers_on_npu: 154 layers_on_gpu: 0 layers_on_cpu: 0 - total_layers: 148 - job_id: jlpeen9op + total_layers: 154 + job_id: jo5mznndp + job_status: Passed + torchscript_onnx_ort_dml_gpu: + inference_time: 6842.0 + throughput: 146.15609470914936 + estimated_peak_memory_range: + min: 1634304 + max: 1634304 + primary_compute_unit: CPU + precision: fp32 + layer_info: + layers_on_npu: 0 + layers_on_gpu: 0 + layers_on_cpu: 156 + total_layers: 156 + job_id: jegne66kg job_status: Passed reference_device_info: - name: QCS8550 (Proxy) - os: '12' - form_factor: Iot - os_name: Android + name: Snapdragon X Elite CRD + os: '11' + form_factor: Compute + os_name: Windows manufacturer: Qualcomm - chipset: Qcs8550 - timestamp: '2024-04-23T18:42:33.075622Z' + chipset: Snapdragon® X Elite + timestamp: '2024-05-20T16:35:30.539511Z' diff --git a/qai_hub_models/models/resnet18/README.md b/qai_hub_models/models/resnet18/README.md index 956e4791..de48498f 100644 --- a/qai_hub_models/models/resnet18/README.md +++ b/qai_hub_models/models/resnet18/README.md @@ -14,6 +14,8 @@ accross various devices, can be found [here](https://aihub.qualcomm.com/models/r a hosted Qualcomm® device. 
+ + ## Example & Usage diff --git a/qai_hub_models/models/resnet18/export.py b/qai_hub_models/models/resnet18/export.py index bb2f3c45..c7157f5b 100644 --- a/qai_hub_models/models/resnet18/export.py +++ b/qai_hub_models/models/resnet18/export.py @@ -119,9 +119,16 @@ def export_model( model.eval() source_model = torch.jit.trace(model.to("cpu"), make_torch_inputs(input_spec)) + # Convert outputs from channel last to channel first (preferred I/O format for QNN and TensorFlow Lite) + channel_last_flags = ( + " --force_channel_last_input image_tensor" + if target_runtime != TargetRuntime.ORT + else "" + ) + # 2. Compile the model to an on-device asset model_compile_options = model.get_hub_compile_options( - target_runtime, compile_options + " --force_channel_last_input image_tensor" + target_runtime, compile_options + channel_last_flags, hub_device ) print(f"Optimizing model {model_name} to run on-device") submitted_compile_job = hub.submit_compile_job( @@ -159,8 +166,12 @@ def export_model( ) sample_inputs = model.sample_inputs(input_spec) # Convert inputs from channel first to channel last - hub_inputs = transpose_channel_first_to_last( - "image_tensor", sample_inputs, target_runtime + hub_inputs = ( + sample_inputs + if target_runtime == TargetRuntime.ORT + else transpose_channel_first_to_last( + "image_tensor", sample_inputs, target_runtime + ) ) submitted_inference_job = hub.submit_inference_job( model=compile_job.get_target_model(), @@ -197,7 +208,7 @@ def export_model( def main(): warnings.filterwarnings("ignore") - parser = export_parser(model_cls=Model, supports_ort=False) + parser = export_parser(model_cls=Model) args = parser.parse_args() export_model(**vars(args)) diff --git a/qai_hub_models/models/resnet18/perf.yaml b/qai_hub_models/models/resnet18/perf.yaml index 43a76a31..10d59660 100644 --- a/qai_hub_models/models/resnet18/perf.yaml +++ b/qai_hub_models/models/resnet18/perf.yaml @@ -22,6 +22,7 @@ aggregated: - Samsung Galaxy S24 Ultra - Samsung Galaxy S24+ - Samsung Galaxy Tab S8 + - Snapdragon X Elite CRD - Xiaomi 12 - Xiaomi 12 Pro supported_chipsets: @@ -30,15 +31,16 @@ aggregated: - Snapdragon® 8 Gen 2 - Snapdragon® 8 Gen 3 - Snapdragon® 888 + - Snapdragon® X Elite models: - name: ResNet18 performance_metrics: - torchscript_onnx_tflite: - inference_time: 1398.0 - throughput: 715.307582260372 + inference_time: 1410.0 + throughput: 709.2198581560284 estimated_peak_memory_range: - min: 24576 - max: 2046480 + min: 12288 + max: 1495520 primary_compute_unit: NPU precision: fp16 layer_info: @@ -46,14 +48,14 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 38 - job_id: jw56ewq7g + job_id: jopryvv0g job_status: Passed torchscript_onnx_qnn: - inference_time: 1489.0 - throughput: 671.591672263264 + inference_time: 1471.0 + throughput: 679.8096532970768 estimated_peak_memory_range: - min: 12288 - max: 83625152 + min: 16384 + max: 94295528 primary_compute_unit: NPU precision: fp16 layer_info: @@ -61,22 +63,22 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 53 - job_id: jwgok8edp + job_id: j2p0rzz9p job_status: Passed torchscript_onnx_ort: - inference_time: 1543.0 - throughput: 648.0881399870383 + inference_time: 1335.0 + throughput: 749.0636704119851 estimated_peak_memory_range: - min: 16384 - max: 82413040 + min: 61440 + max: 90905104 primary_compute_unit: NPU precision: fp16 layer_info: - layers_on_npu: 1 + layers_on_npu: 55 layers_on_gpu: 0 layers_on_cpu: 0 - total_layers: 1 - job_id: j7gjzqk85 + total_layers: 55 + job_id: j1glkvvjp job_status: Passed 
reference_device_info: name: Samsung Galaxy S23 @@ -85,13 +87,13 @@ models: os_name: Android manufacturer: Samsung chipset: Snapdragon® 8 Gen 2 - timestamp: '2024-04-23T18:42:33.110933Z' + timestamp: '2024-05-20T16:35:30.578687Z' - torchscript_onnx_tflite: - inference_time: 987.0 - throughput: 1013.1712259371834 + inference_time: 981.0 + throughput: 1019.367991845056 estimated_peak_memory_range: min: 12288 - max: 24202432 + max: 24130336 primary_compute_unit: NPU precision: fp16 layer_info: @@ -99,14 +101,14 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 38 - job_id: j1p3v6qzg + job_id: jep2mkkr5 job_status: Passed torchscript_onnx_qnn: inference_time: 1015.0 throughput: 985.2216748768473 estimated_peak_memory_range: - min: 0 - max: 31898144 + min: 618496 + max: 30836368 primary_compute_unit: NPU precision: fp16 layer_info: @@ -114,22 +116,22 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 53 - job_id: j1pv07zm5 + job_id: j1p87qqk5 job_status: Passed torchscript_onnx_ort: - inference_time: 1128.0 - throughput: 886.5248226950355 + inference_time: 947.0 + throughput: 1055.9662090813094 estimated_peak_memory_range: - min: 618496 - max: 19073216 + min: 0 + max: 20884768 primary_compute_unit: NPU precision: fp16 layer_info: - layers_on_npu: 1 + layers_on_npu: 55 layers_on_gpu: 0 layers_on_cpu: 0 - total_layers: 1 - job_id: jlpeey40p + total_layers: 55 + job_id: jw561yy6p job_status: Passed reference_device_info: name: Samsung Galaxy S24 @@ -138,13 +140,13 @@ models: os_name: Android manufacturer: Samsung chipset: Snapdragon® 8 Gen 3 - timestamp: '2024-04-23T18:42:33.110984Z' + timestamp: '2024-05-20T16:35:30.578715Z' - torchscript_onnx_tflite: - inference_time: 1376.0 - throughput: 726.7441860465116 + inference_time: 1408.0 + throughput: 710.2272727272727 estimated_peak_memory_range: - min: 20480 - max: 1963688 + min: 24576 + max: 1608360 primary_compute_unit: NPU precision: fp16 layer_info: @@ -152,14 +154,14 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 38 - job_id: j2p03xqnp + job_id: jqpyd118p job_status: Passed torchscript_onnx_qnn: - inference_time: 1485.0 - throughput: 673.4006734006734 + inference_time: 1473.0 + throughput: 678.8866259334691 estimated_peak_memory_range: - min: 16384 - max: 83668248 + min: 20480 + max: 83818904 primary_compute_unit: NPU precision: fp16 layer_info: @@ -167,7 +169,7 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 53 - job_id: j1gl6qrmg + job_id: jn5q266n5 job_status: Passed reference_device_info: name: QCS8550 (Proxy) @@ -176,4 +178,57 @@ models: os_name: Android manufacturer: Qualcomm chipset: Qcs8550 - timestamp: '2024-04-23T18:42:33.111013Z' + timestamp: '2024-05-20T16:35:30.578738Z' + - torchscript_onnx_qnn: + inference_time: 1572.0 + throughput: 636.1323155216285 + estimated_peak_memory_range: + min: 602112 + max: 602112 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 53 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 53 + job_id: jogkyeewp + job_status: Passed + torchscript_onnx_ort: + inference_time: 1329.0 + throughput: 752.4454477050414 + estimated_peak_memory_range: + min: 32423936 + max: 32423936 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 55 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 55 + job_id: j1p3mjj3g + job_status: Passed + torchscript_onnx_ort_dml_gpu: + inference_time: 6023.0 + throughput: 166.03021749958492 + estimated_peak_memory_range: + min: 22114304 + max: 22114304 + primary_compute_unit: CPU + precision: 
fp32 + layer_info: + layers_on_npu: 0 + layers_on_gpu: 0 + layers_on_cpu: 26 + total_layers: 26 + job_id: jwgov22q5 + job_status: Passed + reference_device_info: + name: Snapdragon X Elite CRD + os: '11' + form_factor: Compute + os_name: Windows + manufacturer: Qualcomm + chipset: Snapdragon® X Elite + timestamp: '2024-05-20T16:35:30.578760Z' diff --git a/qai_hub_models/models/resnet18_quantized/README.md b/qai_hub_models/models/resnet18_quantized/README.md index 4096efd2..5232da7c 100644 --- a/qai_hub_models/models/resnet18_quantized/README.md +++ b/qai_hub_models/models/resnet18_quantized/README.md @@ -14,6 +14,8 @@ accross various devices, can be found [here](https://aihub.qualcomm.com/models/r a hosted Qualcomm® device. + + ## Example & Usage diff --git a/qai_hub_models/models/resnet18_quantized/export.py b/qai_hub_models/models/resnet18_quantized/export.py index d3a4be9d..51943fbc 100644 --- a/qai_hub_models/models/resnet18_quantized/export.py +++ b/qai_hub_models/models/resnet18_quantized/export.py @@ -123,9 +123,16 @@ def export_model( else: quant_calibration_data = model.get_calibration_data(target_runtime, input_spec) + # Convert outputs from channel last to channel first (preferred I/O format for QNN and TensorFlow Lite) + channel_last_flags = ( + " --force_channel_last_input image_tensor" + if target_runtime != TargetRuntime.ORT + else "" + ) + # 2. Compile the model to an on-device asset model_compile_options = model.get_hub_compile_options( - target_runtime, compile_options + " --force_channel_last_input image_tensor" + target_runtime, compile_options + channel_last_flags, hub_device ) print(f"Optimizing model {model_name} to run on-device") submitted_compile_job = hub.submit_compile_job( @@ -167,8 +174,12 @@ def export_model( if target_runtime == TargetRuntime.QNN: hub_inputs = get_qnn_inputs(compile_job, sample_inputs) # Convert inputs from channel first to channel last - hub_inputs = transpose_channel_first_to_last( - "image_tensor", sample_inputs, target_runtime + hub_inputs = ( + sample_inputs + if target_runtime == TargetRuntime.ORT + else transpose_channel_first_to_last( + "image_tensor", sample_inputs, target_runtime + ) ) submitted_inference_job = hub.submit_inference_job( model=compile_job.get_target_model(), @@ -207,7 +218,7 @@ def export_model( def main(): warnings.filterwarnings("ignore") - parser = export_parser(model_cls=Model, supports_ort=False) + parser = export_parser(model_cls=Model) args = parser.parse_args() export_model(**vars(args)) diff --git a/qai_hub_models/models/resnet18_quantized/perf.yaml b/qai_hub_models/models/resnet18_quantized/perf.yaml index f0cea05c..a879df3c 100644 --- a/qai_hub_models/models/resnet18_quantized/perf.yaml +++ b/qai_hub_models/models/resnet18_quantized/perf.yaml @@ -26,6 +26,7 @@ aggregated: - Samsung Galaxy S24 Ultra - Samsung Galaxy S24+ - Samsung Galaxy Tab S8 + - Snapdragon X Elite CRD - Xiaomi 12 - Xiaomi 12 Pro supported_chipsets: @@ -36,15 +37,16 @@ aggregated: - Snapdragon® 8 Gen 2 - Snapdragon® 8 Gen 3 - Snapdragon® 888 + - Snapdragon® X Elite models: - name: ResNet18Quantized performance_metrics: - torchscript_onnx_tflite: - inference_time: 427.0 - throughput: 2341.92037470726 + inference_time: 421.0 + throughput: 2375.296912114014 estimated_peak_memory_range: - min: 24576 - max: 14744816 + min: 16384 + max: 14552648 primary_compute_unit: NPU precision: int8 layer_info: @@ -52,14 +54,14 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 39 - job_id: jz5w24mj5 + job_id: j1pvw6qkg job_status: Passed 
torchscript_onnx_qnn: - inference_time: 633.0 - throughput: 1579.778830963665 + inference_time: 636.0 + throughput: 1572.3270440251572 estimated_peak_memory_range: min: 16384 - max: 61110464 + max: 29686208 primary_compute_unit: NPU precision: int8 layer_info: @@ -67,22 +69,22 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 37 - job_id: jnp1y6qlp + job_id: jygz732op job_status: Passed torchscript_onnx_ort: - inference_time: 977.0 - throughput: 1023.5414534288639 + inference_time: 752.0 + throughput: 1329.787234042553 estimated_peak_memory_range: - min: 45056 - max: 142126416 + min: 24576 + max: 30406712 primary_compute_unit: NPU precision: int8 layer_info: - layers_on_npu: 1 + layers_on_npu: 45 layers_on_gpu: 0 layers_on_cpu: 0 - total_layers: 1 - job_id: jz5w24x65 + total_layers: 45 + job_id: jvgdvxnrg job_status: Passed reference_device_info: name: Samsung Galaxy S23 @@ -91,13 +93,13 @@ models: os_name: Android manufacturer: Samsung chipset: Snapdragon® 8 Gen 2 - timestamp: '2024-04-23T18:42:33.135184Z' + timestamp: '2024-05-20T16:35:30.609007Z' - torchscript_onnx_tflite: - inference_time: 351.0 - throughput: 2849.002849002849 + inference_time: 343.0 + throughput: 2915.451895043732 estimated_peak_memory_range: min: 12288 - max: 24268608 + max: 23898080 primary_compute_unit: NPU precision: int8 layer_info: @@ -105,14 +107,14 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 39 - job_id: jmg9jd9v5 + job_id: j7gjlvdvp job_status: Passed torchscript_onnx_qnn: inference_time: 480.0 throughput: 2083.3333333333335 estimated_peak_memory_range: - min: 0 - max: 26088768 + min: 163840 + max: 27124384 primary_compute_unit: NPU precision: int8 layer_info: @@ -120,22 +122,22 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 37 - job_id: jvgde27l5 + job_id: jz5w9ew3p job_status: Passed torchscript_onnx_ort: - inference_time: 750.0 - throughput: 1333.3333333333333 + inference_time: 631.0 + throughput: 1584.7860538827258 estimated_peak_memory_range: min: 0 - max: 19250192 + max: 21432704 primary_compute_unit: NPU precision: int8 layer_info: - layers_on_npu: 1 + layers_on_npu: 45 layers_on_gpu: 0 layers_on_cpu: 0 - total_layers: 1 - job_id: jmg9jd8l5 + total_layers: 45 + job_id: jz57dy2v5 job_status: Passed reference_device_info: name: Samsung Galaxy S24 @@ -144,51 +146,74 @@ models: os_name: Android manufacturer: Samsung chipset: Snapdragon® 8 Gen 3 - timestamp: '2024-04-23T18:42:33.135231Z' + timestamp: '2024-05-20T16:35:30.609034Z' - torchscript_onnx_tflite: - inference_time: 1555.0 - throughput: 643.0868167202573 + inference_time: 419.0 + throughput: 2386.634844868735 estimated_peak_memory_range: - min: 16384 - max: 14843920 + min: 12288 + max: 1584296 primary_compute_unit: NPU precision: int8 layer_info: - layers_on_npu: 41 + layers_on_npu: 39 layers_on_gpu: 0 layers_on_cpu: 0 - total_layers: 41 - job_id: jygzo0kx5 + total_layers: 39 + job_id: jlpevdoo5 job_status: Passed torchscript_onnx_qnn: - inference_time: 'null' - throughput: 'null' + inference_time: 630.0 + throughput: 1587.3015873015872 estimated_peak_memory_range: - min: 0 - max: 0 - primary_compute_unit: 'null' - precision: 'null' + min: 20480 + max: 29451032 + primary_compute_unit: NPU + precision: int8 layer_info: - layers_on_npu: 0 + layers_on_npu: 37 layers_on_gpu: 0 layers_on_cpu: 0 - total_layers: 0 - job_id: jqpyry0l5 - job_status: Failed - torchscript_onnx_ort: - inference_time: 11826.0 - throughput: 84.5594452900389 + total_layers: 37 + job_id: jnp18428g + job_status: Passed + reference_device_info: + 
name: QCS8550 (Proxy) + os: '12' + form_factor: Iot + os_name: Android + manufacturer: Qualcomm + chipset: Qcs8550 + timestamp: '2024-05-20T16:35:30.609055Z' + - torchscript_onnx_tflite: + inference_time: 1452.0 + throughput: 688.7052341597796 estimated_peak_memory_range: - min: 1556480 - max: 29105488 - primary_compute_unit: CPU - precision: fp32 + min: 12288 + max: 14834800 + primary_compute_unit: NPU + precision: int8 layer_info: - layers_on_npu: 0 + layers_on_npu: 39 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 39 + job_id: jo5m3lqyg + job_status: Passed + torchscript_onnx_qnn: + inference_time: 1817.0 + throughput: 550.357732526142 + estimated_peak_memory_range: + min: 12288 + max: 24293456 + primary_compute_unit: NPU + precision: int8 + layer_info: + layers_on_npu: 37 layers_on_gpu: 0 - layers_on_cpu: 47 - total_layers: 47 - job_id: jvgde20e5 + layers_on_cpu: 0 + total_layers: 37 + job_id: j1p8zkmzp job_status: Passed reference_device_info: name: RB3 Gen 2 (Proxy) @@ -197,21 +222,21 @@ models: os_name: Android manufacturer: Qualcomm chipset: Qcs6490 - timestamp: '2024-04-23T18:42:33.135269Z' + timestamp: '2024-05-20T16:35:30.609072Z' - torchscript_onnx_tflite: - inference_time: 7308.0 - throughput: 136.83634373289544 + inference_time: 7043.0 + throughput: 141.9849495953429 estimated_peak_memory_range: min: 12288 - max: 6786960 + max: 6989920 primary_compute_unit: NPU precision: int8 layer_info: - layers_on_npu: 41 + layers_on_npu: 39 layers_on_gpu: 0 layers_on_cpu: 0 - total_layers: 41 - job_id: jygzjz7zp + total_layers: 39 + job_id: jegn3wmv5 job_status: Passed reference_device_info: name: RB5 (Proxy) @@ -220,42 +245,57 @@ models: os_name: Android manufacturer: Qualcomm chipset: Qcs8250 - timestamp: '2024-04-23T18:42:33.135284Z' - - torchscript_onnx_tflite: - inference_time: 463.0 - throughput: 2159.827213822894 + timestamp: '2024-05-20T16:35:30.609083Z' + - torchscript_onnx_qnn: + inference_time: 768.0 + throughput: 1302.0833333333333 estimated_peak_memory_range: - min: 20480 - max: 15182520 + min: 569344 + max: 569344 primary_compute_unit: NPU precision: int8 layer_info: - layers_on_npu: 41 + layers_on_npu: 37 layers_on_gpu: 0 layers_on_cpu: 0 - total_layers: 41 - job_id: jlpeen3vp + total_layers: 37 + job_id: jmg94l0w5 job_status: Passed - torchscript_onnx_qnn: - inference_time: 680.0 - throughput: 1470.5882352941176 + torchscript_onnx_ort: + inference_time: 714.0 + throughput: 1400.5602240896358 estimated_peak_memory_range: - min: 24576 - max: 60765408 + min: 11710464 + max: 11710464 primary_compute_unit: NPU precision: int8 layer_info: - layers_on_npu: 39 + layers_on_npu: 45 layers_on_gpu: 0 layers_on_cpu: 0 - total_layers: 39 - job_id: jo5mqly9p + total_layers: 45 + job_id: jqp4wln8g + job_status: Passed + torchscript_onnx_ort_dml_gpu: + inference_time: 143079.0 + throughput: 6.989145856484879 + estimated_peak_memory_range: + min: 7467008 + max: 7467008 + primary_compute_unit: GPU + precision: int8 + layer_info: + layers_on_npu: 0 + layers_on_gpu: 1 + layers_on_cpu: 0 + total_layers: 1 + job_id: j0px1k93g job_status: Passed reference_device_info: - name: QCS8550 (Proxy) - os: '12' - form_factor: Iot - os_name: Android + name: Snapdragon X Elite CRD + os: '11' + form_factor: Compute + os_name: Windows manufacturer: Qualcomm - chipset: Qcs8550 - timestamp: '2024-04-23T18:42:33.135313Z' + chipset: Snapdragon® X Elite + timestamp: '2024-05-20T16:35:30.609107Z' diff --git a/qai_hub_models/models/resnet50/README.md b/qai_hub_models/models/resnet50/README.md index 
9723fbec..6abe8dfa 100644 --- a/qai_hub_models/models/resnet50/README.md +++ b/qai_hub_models/models/resnet50/README.md @@ -14,6 +14,8 @@ accross various devices, can be found [here](https://aihub.qualcomm.com/models/r a hosted Qualcomm® device. + + ## Example & Usage diff --git a/qai_hub_models/models/resnet50/export.py b/qai_hub_models/models/resnet50/export.py index d36c2229..b7c78bc7 100644 --- a/qai_hub_models/models/resnet50/export.py +++ b/qai_hub_models/models/resnet50/export.py @@ -119,9 +119,16 @@ def export_model( model.eval() source_model = torch.jit.trace(model.to("cpu"), make_torch_inputs(input_spec)) + # Convert outputs from channel last to channel first (preferred I/O format for QNN and TensorFlow Lite) + channel_last_flags = ( + " --force_channel_last_input image_tensor" + if target_runtime != TargetRuntime.ORT + else "" + ) + # 2. Compile the model to an on-device asset model_compile_options = model.get_hub_compile_options( - target_runtime, compile_options + " --force_channel_last_input image_tensor" + target_runtime, compile_options + channel_last_flags, hub_device ) print(f"Optimizing model {model_name} to run on-device") submitted_compile_job = hub.submit_compile_job( @@ -159,8 +166,12 @@ def export_model( ) sample_inputs = model.sample_inputs(input_spec) # Convert inputs from channel first to channel last - hub_inputs = transpose_channel_first_to_last( - "image_tensor", sample_inputs, target_runtime + hub_inputs = ( + sample_inputs + if target_runtime == TargetRuntime.ORT + else transpose_channel_first_to_last( + "image_tensor", sample_inputs, target_runtime + ) ) submitted_inference_job = hub.submit_inference_job( model=compile_job.get_target_model(), @@ -197,7 +208,7 @@ def export_model( def main(): warnings.filterwarnings("ignore") - parser = export_parser(model_cls=Model, supports_ort=False) + parser = export_parser(model_cls=Model) args = parser.parse_args() export_model(**vars(args)) diff --git a/qai_hub_models/models/resnet50/perf.yaml b/qai_hub_models/models/resnet50/perf.yaml index 91e9cb28..9c0750f2 100644 --- a/qai_hub_models/models/resnet50/perf.yaml +++ b/qai_hub_models/models/resnet50/perf.yaml @@ -22,6 +22,7 @@ aggregated: - Samsung Galaxy S24 Ultra - Samsung Galaxy S24+ - Samsung Galaxy Tab S8 + - Snapdragon X Elite CRD - Xiaomi 12 - Xiaomi 12 Pro supported_chipsets: @@ -30,15 +31,16 @@ aggregated: - Snapdragon® 8 Gen 2 - Snapdragon® 8 Gen 3 - Snapdragon® 888 + - Snapdragon® X Elite models: - name: ResNet50 performance_metrics: - torchscript_onnx_tflite: - inference_time: 2302.0 - throughput: 434.4048653344918 + inference_time: 2272.0 + throughput: 440.14084507042253 estimated_peak_memory_range: - min: 20480 - max: 2370264 + min: 12288 + max: 1939880 primary_compute_unit: NPU precision: fp16 layer_info: @@ -46,14 +48,14 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 79 - job_id: jqp4k38vg + job_id: jo5mznedp job_status: Passed torchscript_onnx_qnn: - inference_time: 2340.0 - throughput: 427.35042735042737 + inference_time: 2382.0 + throughput: 419.81528127623847 estimated_peak_memory_range: - min: 20480 - max: 185567384 + min: 622592 + max: 186262680 primary_compute_unit: NPU precision: fp16 layer_info: @@ -61,22 +63,22 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 126 - job_id: jegnlkxr5 + job_id: jep2mkxr5 job_status: Passed torchscript_onnx_ort: - inference_time: 2587.0 - throughput: 386.5481252415926 + inference_time: 2370.0 + throughput: 421.9409282700422 estimated_peak_memory_range: min: 12288 - max: 217558712 + max: 
205580248 primary_compute_unit: NPU precision: fp16 layer_info: - layers_on_npu: 1 + layers_on_npu: 128 layers_on_gpu: 0 layers_on_cpu: 0 - total_layers: 1 - job_id: jep20ej4g + total_layers: 128 + job_id: jogkyevwp job_status: Passed reference_device_info: name: Samsung Galaxy S23 @@ -85,13 +87,13 @@ models: os_name: Android manufacturer: Samsung chipset: Snapdragon® 8 Gen 2 - timestamp: '2024-04-23T18:42:33.170270Z' + timestamp: '2024-05-20T16:35:30.648313Z' - torchscript_onnx_tflite: - inference_time: 1648.0 - throughput: 606.7961165048544 + inference_time: 1645.0 + throughput: 607.90273556231 estimated_peak_memory_range: min: 16384 - max: 69510112 + max: 70261792 primary_compute_unit: NPU precision: fp16 layer_info: @@ -99,14 +101,14 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 79 - job_id: jo5mq84wp + job_id: jegne60kg job_status: Passed torchscript_onnx_qnn: - inference_time: 1630.0 - throughput: 613.4969325153374 + inference_time: 1682.0 + throughput: 594.5303210463734 estimated_peak_memory_range: min: 618496 - max: 51350896 + max: 50091680 primary_compute_unit: NPU precision: fp16 layer_info: @@ -114,22 +116,22 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 126 - job_id: jopr8w995 + job_id: jqpyd1z8p job_status: Passed torchscript_onnx_ort: - inference_time: 1868.0 - throughput: 535.3319057815846 + inference_time: 1734.0 + throughput: 576.7012687427913 estimated_peak_memory_range: - min: 0 - max: 35536992 + min: 142139392 + max: 174512736 primary_compute_unit: NPU precision: fp16 layer_info: - layers_on_npu: 1 + layers_on_npu: 128 layers_on_gpu: 0 layers_on_cpu: 0 - total_layers: 1 - job_id: jqpyrmn75 + total_layers: 128 + job_id: jn5q260n5 job_status: Passed reference_device_info: name: Samsung Galaxy S24 @@ -138,13 +140,13 @@ models: os_name: Android manufacturer: Samsung chipset: Snapdragon® 8 Gen 3 - timestamp: '2024-04-23T18:42:33.170321Z' + timestamp: '2024-05-20T16:35:30.648341Z' - torchscript_onnx_tflite: - inference_time: 2299.0 - throughput: 434.97172683775557 + inference_time: 2272.0 + throughput: 440.14084507042253 estimated_peak_memory_range: - min: 24576 - max: 2160472 + min: 28672 + max: 2414432 primary_compute_unit: NPU precision: fp16 layer_info: @@ -152,14 +154,14 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 79 - job_id: jvgdey1z5 + job_id: jopryv60g job_status: Passed torchscript_onnx_qnn: - inference_time: 2343.0 - throughput: 426.8032437046522 + inference_time: 2386.0 + throughput: 419.11148365465215 estimated_peak_memory_range: - min: 626688 - max: 186221872 + min: 618496 + max: 186113032 primary_compute_unit: NPU precision: fp16 layer_info: @@ -167,7 +169,7 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 126 - job_id: jopr8m375 + job_id: j1p87q2k5 job_status: Passed reference_device_info: name: QCS8550 (Proxy) @@ -176,4 +178,57 @@ models: os_name: Android manufacturer: Qualcomm chipset: Qcs8550 - timestamp: '2024-04-23T18:42:33.170363Z' + timestamp: '2024-05-20T16:35:30.648359Z' + - torchscript_onnx_qnn: + inference_time: 2691.0 + throughput: 371.6090672612412 + estimated_peak_memory_range: + min: 602112 + max: 602112 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 126 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 126 + job_id: j2p0rz49p + job_status: Passed + torchscript_onnx_ort: + inference_time: 2284.0 + throughput: 437.82837127845886 + estimated_peak_memory_range: + min: 76500992 + max: 76500992 + primary_compute_unit: NPU + precision: fp16 + layer_info: + 
layers_on_npu: 128 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 128 + job_id: j1glkv4jp + job_status: Passed + torchscript_onnx_ort_dml_gpu: + inference_time: 15563.0 + throughput: 64.2549636959455 + estimated_peak_memory_range: + min: 40939520 + max: 40939520 + primary_compute_unit: CPU + precision: fp32 + layer_info: + layers_on_npu: 0 + layers_on_gpu: 0 + layers_on_cpu: 59 + total_layers: 59 + job_id: jw561y26p + job_status: Passed + reference_device_info: + name: Snapdragon X Elite CRD + os: '11' + form_factor: Compute + os_name: Windows + manufacturer: Qualcomm + chipset: Snapdragon® X Elite + timestamp: '2024-05-20T16:35:30.648386Z' diff --git a/qai_hub_models/models/resnext101/README.md b/qai_hub_models/models/resnext101/README.md index f738cf44..dbe49a2a 100644 --- a/qai_hub_models/models/resnext101/README.md +++ b/qai_hub_models/models/resnext101/README.md @@ -14,6 +14,8 @@ accross various devices, can be found [here](https://aihub.qualcomm.com/models/r a hosted Qualcomm® device. + + ## Example & Usage diff --git a/qai_hub_models/models/resnext101/export.py b/qai_hub_models/models/resnext101/export.py index a8654b64..703436a5 100644 --- a/qai_hub_models/models/resnext101/export.py +++ b/qai_hub_models/models/resnext101/export.py @@ -119,9 +119,16 @@ def export_model( model.eval() source_model = torch.jit.trace(model.to("cpu"), make_torch_inputs(input_spec)) + # Convert outputs from channel last to channel first (preferred I/O format for QNN and TensorFlow Lite) + channel_last_flags = ( + " --force_channel_last_input image_tensor" + if target_runtime != TargetRuntime.ORT + else "" + ) + # 2. Compile the model to an on-device asset model_compile_options = model.get_hub_compile_options( - target_runtime, compile_options + " --force_channel_last_input image_tensor" + target_runtime, compile_options + channel_last_flags, hub_device ) print(f"Optimizing model {model_name} to run on-device") submitted_compile_job = hub.submit_compile_job( @@ -159,8 +166,12 @@ def export_model( ) sample_inputs = model.sample_inputs(input_spec) # Convert inputs from channel first to channel last - hub_inputs = transpose_channel_first_to_last( - "image_tensor", sample_inputs, target_runtime + hub_inputs = ( + sample_inputs + if target_runtime == TargetRuntime.ORT + else transpose_channel_first_to_last( + "image_tensor", sample_inputs, target_runtime + ) ) submitted_inference_job = hub.submit_inference_job( model=compile_job.get_target_model(), @@ -197,7 +208,7 @@ def export_model( def main(): warnings.filterwarnings("ignore") - parser = export_parser(model_cls=Model, supports_ort=False) + parser = export_parser(model_cls=Model) args = parser.parse_args() export_model(**vars(args)) diff --git a/qai_hub_models/models/resnext101/perf.yaml b/qai_hub_models/models/resnext101/perf.yaml index 779de30a..042adf13 100644 --- a/qai_hub_models/models/resnext101/perf.yaml +++ b/qai_hub_models/models/resnext101/perf.yaml @@ -22,6 +22,7 @@ aggregated: - Samsung Galaxy S24 Ultra - Samsung Galaxy S24+ - Samsung Galaxy Tab S8 + - Snapdragon X Elite CRD - Xiaomi 12 - Xiaomi 12 Pro supported_chipsets: @@ -30,15 +31,16 @@ aggregated: - Snapdragon® 8 Gen 2 - Snapdragon® 8 Gen 3 - Snapdragon® 888 + - Snapdragon® X Elite models: - name: ResNeXt101 performance_metrics: - torchscript_onnx_tflite: - inference_time: 6665.0 - throughput: 150.03750937734435 + inference_time: 6708.0 + throughput: 149.0757304710793 estimated_peak_memory_range: - min: 53248 - max: 3235600 + min: 24576 + max: 2889376 primary_compute_unit: NPU 
precision: fp16 layer_info: @@ -46,14 +48,14 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 147 - job_id: jogk78o2p + job_id: jo5mzn69p job_status: Passed torchscript_onnx_qnn: - inference_time: 6665.0 - throughput: 150.03750937734435 + inference_time: 6648.0 + throughput: 150.42117930204572 estimated_peak_memory_range: - min: 94208 - max: 34973960 + min: 16384 + max: 35804344 primary_compute_unit: NPU precision: fp16 layer_info: @@ -61,22 +63,22 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 245 - job_id: j1gl6lo8g + job_id: jep2mk9q5 job_status: Passed torchscript_onnx_ort: - inference_time: 7040.0 - throughput: 142.04545454545453 + inference_time: 6983.0 + throughput: 143.20492624946297 estimated_peak_memory_range: - min: 0 - max: 454692632 + min: 32768 + max: 451743424 primary_compute_unit: NPU precision: fp16 layer_info: - layers_on_npu: 1 + layers_on_npu: 247 layers_on_gpu: 0 layers_on_cpu: 0 - total_layers: 1 - job_id: j1p3v6xlg + total_layers: 247 + job_id: jogkyeqnp job_status: Passed reference_device_info: name: Samsung Galaxy S23 @@ -85,13 +87,13 @@ models: os_name: Android manufacturer: Samsung chipset: Snapdragon® 8 Gen 2 - timestamp: '2024-04-23T18:42:33.282825Z' + timestamp: '2024-05-20T16:35:30.789744Z' - torchscript_onnx_tflite: - inference_time: 4816.0 - throughput: 207.64119601328903 + inference_time: 4868.0 + throughput: 205.42317173377157 estimated_peak_memory_range: min: 20480 - max: 366481792 + max: 365272832 primary_compute_unit: NPU precision: fp16 layer_info: @@ -99,14 +101,14 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 147 - job_id: jn5qevz45 + job_id: jegne6mqg job_status: Passed torchscript_onnx_qnn: - inference_time: 4797.0 - throughput: 208.46362309776944 + inference_time: 4799.0 + throughput: 208.37674515524068 estimated_peak_memory_range: - min: 618496 - max: 131176640 + min: 0 + max: 123278800 primary_compute_unit: NPU precision: fp16 layer_info: @@ -114,22 +116,22 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 245 - job_id: jw56ewr0g + job_id: jqpyd1jlp job_status: Passed torchscript_onnx_ort: - inference_time: 5231.0 - throughput: 191.16803670426305 + inference_time: 5124.0 + throughput: 195.160031225605 estimated_peak_memory_range: - min: 618496 - max: 100656704 + min: 626688 + max: 90094496 primary_compute_unit: NPU precision: fp16 layer_info: - layers_on_npu: 1 + layers_on_npu: 247 layers_on_gpu: 0 layers_on_cpu: 0 - total_layers: 1 - job_id: jwgok8oxp + total_layers: 247 + job_id: jn5q26ro5 job_status: Passed reference_device_info: name: Samsung Galaxy S24 @@ -138,13 +140,13 @@ models: os_name: Android manufacturer: Samsung chipset: Snapdragon® 8 Gen 3 - timestamp: '2024-04-23T18:42:33.282896Z' + timestamp: '2024-05-20T16:35:30.789769Z' - torchscript_onnx_tflite: - inference_time: 6712.0 - throughput: 148.98688915375448 + inference_time: 6665.0 + throughput: 150.03750937734435 estimated_peak_memory_range: - min: 36864 - max: 3053288 + min: 57344 + max: 2680608 primary_compute_unit: NPU precision: fp16 layer_info: @@ -152,14 +154,14 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 147 - job_id: j7gjz6215 + job_id: jopryv27g job_status: Passed torchscript_onnx_qnn: - inference_time: 6586.0 - throughput: 151.83723048891588 + inference_time: 6622.0 + throughput: 151.01177891875565 estimated_peak_memory_range: - min: 16384 - max: 36067624 + min: 0 + max: 37100696 primary_compute_unit: NPU precision: fp16 layer_info: @@ -167,7 +169,7 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 245 
- job_id: jmg9j7ym5 + job_id: j1p87qmo5 job_status: Passed reference_device_info: name: QCS8550 (Proxy) @@ -176,4 +178,57 @@ models: os_name: Android manufacturer: Qualcomm chipset: Qcs8550 - timestamp: '2024-04-23T18:42:33.282959Z' + timestamp: '2024-05-20T16:35:30.789786Z' + - torchscript_onnx_qnn: + inference_time: 9078.0 + throughput: 110.15642211940956 + estimated_peak_memory_range: + min: 602112 + max: 602112 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 245 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 245 + job_id: j2p0rz2np + job_status: Passed + torchscript_onnx_ort: + inference_time: 6736.0 + throughput: 148.45605700712588 + estimated_peak_memory_range: + min: 108900352 + max: 108900352 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 247 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 247 + job_id: j1glkv3mp + job_status: Passed + torchscript_onnx_ort_dml_gpu: + inference_time: 63884.0 + throughput: 15.653371736271993 + estimated_peak_memory_range: + min: 101425152 + max: 101425152 + primary_compute_unit: CPU + precision: fp32 + layer_info: + layers_on_npu: 0 + layers_on_gpu: 0 + layers_on_cpu: 110 + total_layers: 110 + job_id: jw561ynyp + job_status: Passed + reference_device_info: + name: Snapdragon X Elite CRD + os: '11' + form_factor: Compute + os_name: Windows + manufacturer: Qualcomm + chipset: Snapdragon® X Elite + timestamp: '2024-05-20T16:35:30.789808Z' diff --git a/qai_hub_models/models/resnext101_quantized/README.md b/qai_hub_models/models/resnext101_quantized/README.md index bc51c825..ae6f1db4 100644 --- a/qai_hub_models/models/resnext101_quantized/README.md +++ b/qai_hub_models/models/resnext101_quantized/README.md @@ -14,6 +14,8 @@ accross various devices, can be found [here](https://aihub.qualcomm.com/models/r a hosted Qualcomm® device. + + ## Example & Usage diff --git a/qai_hub_models/models/resnext101_quantized/export.py b/qai_hub_models/models/resnext101_quantized/export.py index 50847416..d449e32d 100644 --- a/qai_hub_models/models/resnext101_quantized/export.py +++ b/qai_hub_models/models/resnext101_quantized/export.py @@ -123,9 +123,16 @@ def export_model( else: quant_calibration_data = model.get_calibration_data(target_runtime, input_spec) + # Convert outputs from channel last to channel first (preferred I/O format for QNN and TensorFlow Lite) + channel_last_flags = ( + " --force_channel_last_input image_tensor" + if target_runtime != TargetRuntime.ORT + else "" + ) + # 2. 
Compile the model to an on-device asset model_compile_options = model.get_hub_compile_options( - target_runtime, compile_options + " --force_channel_last_input image_tensor" + target_runtime, compile_options + channel_last_flags, hub_device ) print(f"Optimizing model {model_name} to run on-device") submitted_compile_job = hub.submit_compile_job( @@ -167,8 +174,12 @@ def export_model( if target_runtime == TargetRuntime.QNN: hub_inputs = get_qnn_inputs(compile_job, sample_inputs) # Convert inputs from channel first to channel last - hub_inputs = transpose_channel_first_to_last( - "image_tensor", sample_inputs, target_runtime + hub_inputs = ( + sample_inputs + if target_runtime == TargetRuntime.ORT + else transpose_channel_first_to_last( + "image_tensor", sample_inputs, target_runtime + ) ) submitted_inference_job = hub.submit_inference_job( model=compile_job.get_target_model(), @@ -207,7 +218,7 @@ def export_model( def main(): warnings.filterwarnings("ignore") - parser = export_parser(model_cls=Model, supports_qnn=False, supports_ort=False) + parser = export_parser(model_cls=Model) args = parser.parse_args() export_model(**vars(args)) diff --git a/qai_hub_models/models/resnext101_quantized/perf.yaml b/qai_hub_models/models/resnext101_quantized/perf.yaml index 1e112f32..f6ab2ff3 100644 --- a/qai_hub_models/models/resnext101_quantized/perf.yaml +++ b/qai_hub_models/models/resnext101_quantized/perf.yaml @@ -26,6 +26,7 @@ aggregated: - Samsung Galaxy S24 Ultra - Samsung Galaxy S24+ - Samsung Galaxy Tab S8 + - Snapdragon X Elite CRD - Xiaomi 12 - Xiaomi 12 Pro supported_chipsets: @@ -36,15 +37,16 @@ aggregated: - Snapdragon® 8 Gen 2 - Snapdragon® 8 Gen 3 - Snapdragon® 888 + - Snapdragon® X Elite models: - name: ResNeXt101Quantized performance_metrics: - torchscript_onnx_tflite: - inference_time: 2913.0 - throughput: 343.2887058015791 + inference_time: 3033.0 + throughput: 329.7065611605671 estimated_peak_memory_range: - min: 24576 - max: 1706912 + min: 16384 + max: 2184152 primary_compute_unit: NPU precision: int8 layer_info: @@ -52,22 +54,37 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 148 - job_id: j7gjzqox5 + job_id: jlpevdkv5 job_status: Passed - torchscript_onnx_ort: - inference_time: 3921.0 - throughput: 255.03698036215252 + torchscript_onnx_qnn: + inference_time: 3107.0 + throughput: 321.853878339234 estimated_peak_memory_range: min: 12288 - max: 136560960 + max: 32784840 + primary_compute_unit: NPU + precision: int8 + layer_info: + layers_on_npu: 146 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 146 + job_id: jmg94lw85 + job_status: Passed + torchscript_onnx_ort: + inference_time: 3421.0 + throughput: 292.3121894182987 + estimated_peak_memory_range: + min: 0 + max: 137016264 primary_compute_unit: NPU precision: int8 layer_info: - layers_on_npu: 1 + layers_on_npu: 154 layers_on_gpu: 0 layers_on_cpu: 0 - total_layers: 1 - job_id: jygzon8k5 + total_layers: 154 + job_id: jqp4wlv1g job_status: Passed reference_device_info: name: Samsung Galaxy S23 @@ -76,13 +93,13 @@ models: os_name: Android manufacturer: Samsung chipset: Snapdragon® 8 Gen 2 - timestamp: '2024-04-23T18:42:33.306938Z' + timestamp: '2024-05-20T16:35:30.820290Z' - torchscript_onnx_tflite: - inference_time: 2167.0 - throughput: 461.4674665436087 + inference_time: 2053.0 + throughput: 487.0920603994155 estimated_peak_memory_range: min: 12288 - max: 262604528 + max: 258014032 primary_compute_unit: NPU precision: int8 layer_info: @@ -90,22 +107,37 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 148 - 
job_id: jlpeey81p + job_id: jygz73rxp + job_status: Passed + torchscript_onnx_qnn: + inference_time: 2280.0 + throughput: 438.5964912280702 + estimated_peak_memory_range: + min: 12288 + max: 118044256 + primary_compute_unit: NPU + precision: int8 + layer_info: + layers_on_npu: 146 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 146 + job_id: jnp184e7g job_status: Passed torchscript_onnx_ort: - inference_time: 2990.0 - throughput: 334.44816053511704 + inference_time: 2540.0 + throughput: 393.7007874015748 estimated_peak_memory_range: min: 618496 - max: 95251808 + max: 92001632 primary_compute_unit: NPU precision: int8 layer_info: - layers_on_npu: 1 + layers_on_npu: 154 layers_on_gpu: 0 layers_on_cpu: 0 - total_layers: 1 - job_id: jz5w24165 + total_layers: 154 + job_id: j0px1kylg job_status: Passed reference_device_info: name: Samsung Galaxy S24 @@ -114,36 +146,74 @@ models: os_name: Android manufacturer: Samsung chipset: Snapdragon® 8 Gen 3 - timestamp: '2024-04-23T18:42:33.306993Z' + timestamp: '2024-05-20T16:35:30.820316Z' - torchscript_onnx_tflite: - inference_time: 10468.0 - throughput: 95.52923194497517 + inference_time: 2932.0 + throughput: 341.06412005457025 estimated_peak_memory_range: - min: 32768 - max: 199144352 + min: 24576 + max: 2554384 primary_compute_unit: NPU precision: int8 layer_info: - layers_on_npu: 150 + layers_on_npu: 148 layers_on_gpu: 0 layers_on_cpu: 0 - total_layers: 150 - job_id: jlpee0v8p + total_layers: 148 + job_id: jz5w9eqmp job_status: Passed - torchscript_onnx_ort: - inference_time: 88885.0 - throughput: 11.250492209034146 + torchscript_onnx_qnn: + inference_time: 3081.0 + throughput: 324.5699448231094 estimated_peak_memory_range: - min: 8159232 - max: 88001424 - primary_compute_unit: CPU - precision: fp32 + min: 16384 + max: 35435296 + primary_compute_unit: NPU + precision: int8 layer_info: - layers_on_npu: 0 + layers_on_npu: 146 layers_on_gpu: 0 - layers_on_cpu: 156 - total_layers: 156 - job_id: jmg9jdxl5 + layers_on_cpu: 0 + total_layers: 146 + job_id: jz57dyx95 + job_status: Passed + reference_device_info: + name: QCS8550 (Proxy) + os: '12' + form_factor: Iot + os_name: Android + manufacturer: Qualcomm + chipset: Qcs8550 + timestamp: '2024-05-20T16:35:30.820333Z' + - torchscript_onnx_tflite: + inference_time: 10331.0 + throughput: 96.79605072113058 + estimated_peak_memory_range: + min: 45056 + max: 199157040 + primary_compute_unit: NPU + precision: int8 + layer_info: + layers_on_npu: 148 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 148 + job_id: jygzr07z5 + job_status: Passed + torchscript_onnx_qnn: + inference_time: 11010.0 + throughput: 90.82652134423252 + estimated_peak_memory_range: + min: 167936 + max: 124990144 + primary_compute_unit: NPU + precision: int8 + layer_info: + layers_on_npu: 146 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 146 + job_id: jqp4v2wqp job_status: Passed reference_device_info: name: RB3 Gen 2 (Proxy) @@ -152,21 +222,21 @@ models: os_name: Android manufacturer: Qualcomm chipset: Qcs6490 - timestamp: '2024-04-23T18:42:33.307043Z' + timestamp: '2024-05-20T16:35:30.820350Z' - torchscript_onnx_tflite: - inference_time: 134216.0 - throughput: 7.450676521428146 + inference_time: 133798.0 + throughput: 7.473953272844138 estimated_peak_memory_range: - min: 24576 - max: 357047544 + min: 184320 + max: 355878408 primary_compute_unit: GPU precision: int8 layer_info: - layers_on_npu: 14 + layers_on_npu: 12 layers_on_gpu: 125 layers_on_cpu: 11 - total_layers: 150 - job_id: jmg9yo4q5 + total_layers: 148 + job_id: 
jz5wqr9z5 job_status: Passed reference_device_info: name: RB5 (Proxy) @@ -175,27 +245,57 @@ models: os_name: Android manufacturer: Qualcomm chipset: Qcs8250 - timestamp: '2024-04-23T18:42:33.307071Z' - - torchscript_onnx_tflite: - inference_time: 2909.0 - throughput: 343.7607425232039 + timestamp: '2024-05-20T16:35:30.820361Z' + - torchscript_onnx_qnn: + inference_time: 3328.0 + throughput: 300.4807692307692 estimated_peak_memory_range: - min: 16384 - max: 2753672 + min: 249856 + max: 249856 primary_compute_unit: NPU precision: int8 layer_info: - layers_on_npu: 150 + layers_on_npu: 146 layers_on_gpu: 0 layers_on_cpu: 0 - total_layers: 150 - job_id: j1p3vlqxg + total_layers: 146 + job_id: jvgdvxozg + job_status: Passed + torchscript_onnx_ort: + inference_time: 3366.0 + throughput: 297.08853238265004 + estimated_peak_memory_range: + min: 137375744 + max: 137375744 + primary_compute_unit: NPU + precision: int8 + layer_info: + layers_on_npu: 154 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 154 + job_id: jo5mzn39p + job_status: Passed + torchscript_onnx_ort_dml_gpu: + inference_time: 228816.0 + throughput: 4.370323753583666 + estimated_peak_memory_range: + min: 1384448 + max: 1384448 + primary_compute_unit: CPU + precision: fp32 + layer_info: + layers_on_npu: 0 + layers_on_gpu: 0 + layers_on_cpu: 156 + total_layers: 156 + job_id: jegne63qg job_status: Passed reference_device_info: - name: QCS8550 (Proxy) - os: '12' - form_factor: Iot - os_name: Android + name: Snapdragon X Elite CRD + os: '11' + form_factor: Compute + os_name: Windows manufacturer: Qualcomm - chipset: Qcs8550 - timestamp: '2024-04-23T18:42:33.307097Z' + chipset: Snapdragon® X Elite + timestamp: '2024-05-20T16:35:30.820384Z' diff --git a/qai_hub_models/models/resnext50/README.md b/qai_hub_models/models/resnext50/README.md index 18cce04c..57bd0206 100644 --- a/qai_hub_models/models/resnext50/README.md +++ b/qai_hub_models/models/resnext50/README.md @@ -14,6 +14,8 @@ accross various devices, can be found [here](https://aihub.qualcomm.com/models/r a hosted Qualcomm® device. + + ## Example & Usage diff --git a/qai_hub_models/models/resnext50/export.py b/qai_hub_models/models/resnext50/export.py index 25d7ee96..3fc8a566 100644 --- a/qai_hub_models/models/resnext50/export.py +++ b/qai_hub_models/models/resnext50/export.py @@ -119,9 +119,16 @@ def export_model( model.eval() source_model = torch.jit.trace(model.to("cpu"), make_torch_inputs(input_spec)) + # Convert outputs from channel last to channel first (preferred I/O format for QNN and TensorFlow Lite) + channel_last_flags = ( + " --force_channel_last_input image_tensor" + if target_runtime != TargetRuntime.ORT + else "" + ) + # 2. 
Compile the model to an on-device asset model_compile_options = model.get_hub_compile_options( - target_runtime, compile_options + " --force_channel_last_input image_tensor" + target_runtime, compile_options + channel_last_flags, hub_device ) print(f"Optimizing model {model_name} to run on-device") submitted_compile_job = hub.submit_compile_job( @@ -159,8 +166,12 @@ def export_model( ) sample_inputs = model.sample_inputs(input_spec) # Convert inputs from channel first to channel last - hub_inputs = transpose_channel_first_to_last( - "image_tensor", sample_inputs, target_runtime + hub_inputs = ( + sample_inputs + if target_runtime == TargetRuntime.ORT + else transpose_channel_first_to_last( + "image_tensor", sample_inputs, target_runtime + ) ) submitted_inference_job = hub.submit_inference_job( model=compile_job.get_target_model(), @@ -197,7 +208,7 @@ def export_model( def main(): warnings.filterwarnings("ignore") - parser = export_parser(model_cls=Model, supports_ort=False) + parser = export_parser(model_cls=Model) args = parser.parse_args() export_model(**vars(args)) diff --git a/qai_hub_models/models/resnext50/perf.yaml b/qai_hub_models/models/resnext50/perf.yaml index b9ae5b5d..8552317d 100644 --- a/qai_hub_models/models/resnext50/perf.yaml +++ b/qai_hub_models/models/resnext50/perf.yaml @@ -22,6 +22,7 @@ aggregated: - Samsung Galaxy S24 Ultra - Samsung Galaxy S24+ - Samsung Galaxy Tab S8 + - Snapdragon X Elite CRD - Xiaomi 12 - Xiaomi 12 Pro supported_chipsets: @@ -30,15 +31,16 @@ aggregated: - Snapdragon® 8 Gen 2 - Snapdragon® 8 Gen 3 - Snapdragon® 888 + - Snapdragon® X Elite models: - name: ResNeXt50 performance_metrics: - torchscript_onnx_tflite: - inference_time: 2502.0 - throughput: 399.68025579536373 + inference_time: 2512.0 + throughput: 398.0891719745223 estimated_peak_memory_range: - min: 16384 - max: 2039136 + min: 12288 + max: 2465560 primary_compute_unit: NPU precision: fp16 layer_info: @@ -46,14 +48,14 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 79 - job_id: jnp1y6v2p + job_id: jopryve7g job_status: Passed torchscript_onnx_qnn: - inference_time: 2619.0 - throughput: 381.82512409316536 + inference_time: 2556.0 + throughput: 391.23630672926447 estimated_peak_memory_range: - min: 12288 - max: 67332096 + min: 16384 + max: 87753520 primary_compute_unit: NPU precision: fp16 layer_info: @@ -61,22 +63,22 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 126 - job_id: jz57097lg + job_id: j2p0rzlnp job_status: Passed torchscript_onnx_ort: - inference_time: 2938.0 - throughput: 340.3675970047652 + inference_time: 2844.0 + throughput: 351.6174402250352 estimated_peak_memory_range: - min: 90112 - max: 153500352 + min: 229376 + max: 171515144 primary_compute_unit: NPU precision: fp16 layer_info: - layers_on_npu: 1 + layers_on_npu: 128 layers_on_gpu: 0 layers_on_cpu: 0 - total_layers: 1 - job_id: j0pxnxd15 + total_layers: 128 + job_id: j1glkvkmp job_status: Passed reference_device_info: name: Samsung Galaxy S23 @@ -85,13 +87,13 @@ models: os_name: Android manufacturer: Samsung chipset: Snapdragon® 8 Gen 2 - timestamp: '2024-04-23T18:42:33.333746Z' + timestamp: '2024-05-20T16:35:30.859710Z' - torchscript_onnx_tflite: - inference_time: 1788.0 - throughput: 559.2841163310962 + inference_time: 1790.0 + throughput: 558.659217877095 estimated_peak_memory_range: min: 16384 - max: 164107600 + max: 160881424 primary_compute_unit: NPU precision: fp16 layer_info: @@ -99,14 +101,14 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 79 - job_id: jvgde2ze5 + job_id: 
jep2mklq5 job_status: Passed torchscript_onnx_qnn: - inference_time: 1857.0 - throughput: 538.5029617662897 + inference_time: 1858.0 + throughput: 538.2131324004306 estimated_peak_memory_range: - min: 0 - max: 60102256 + min: 618496 + max: 60637072 primary_compute_unit: NPU precision: fp16 layer_info: @@ -114,22 +116,22 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 126 - job_id: jqp4k39vg + job_id: j1p87qzo5 job_status: Passed torchscript_onnx_ort: - inference_time: 2158.0 - throughput: 463.3920296570899 + inference_time: 2037.0 + throughput: 490.9180166912126 estimated_peak_memory_range: min: 618496 - max: 42526736 + max: 41012496 primary_compute_unit: NPU precision: fp16 layer_info: - layers_on_npu: 1 + layers_on_npu: 128 layers_on_gpu: 0 layers_on_cpu: 0 - total_layers: 1 - job_id: jo5mq8dwp + total_layers: 128 + job_id: jw561y1yp job_status: Passed reference_device_info: name: Samsung Galaxy S24 @@ -138,13 +140,13 @@ models: os_name: Android manufacturer: Samsung chipset: Snapdragon® 8 Gen 3 - timestamp: '2024-04-23T18:42:33.333800Z' + timestamp: '2024-05-20T16:35:30.859737Z' - torchscript_onnx_tflite: - inference_time: 2497.0 - throughput: 400.4805766920304 + inference_time: 2499.0 + throughput: 400.16006402561027 estimated_peak_memory_range: - min: 53248 - max: 2221936 + min: 24576 + max: 2189296 primary_compute_unit: NPU precision: fp16 layer_info: @@ -152,14 +154,14 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 79 - job_id: jnp1yk3kp + job_id: jqpyd16lp job_status: Passed torchscript_onnx_qnn: - inference_time: 2594.0 - throughput: 385.50501156515037 + inference_time: 2548.0 + throughput: 392.4646781789639 estimated_peak_memory_range: - min: 618496 - max: 68165536 + min: 622592 + max: 88624416 primary_compute_unit: NPU precision: fp16 layer_info: @@ -167,7 +169,7 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 126 - job_id: j0pxn8mj5 + job_id: jn5q263o5 job_status: Passed reference_device_info: name: QCS8550 (Proxy) @@ -176,4 +178,57 @@ models: os_name: Android manufacturer: Qualcomm chipset: Qcs8550 - timestamp: '2024-04-23T18:42:33.333840Z' + timestamp: '2024-05-20T16:35:30.859755Z' + - torchscript_onnx_qnn: + inference_time: 2925.0 + throughput: 341.88034188034186 + estimated_peak_memory_range: + min: 602112 + max: 602112 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 126 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 126 + job_id: jogkye3np + job_status: Passed + torchscript_onnx_ort: + inference_time: 2645.0 + throughput: 378.0718336483932 + estimated_peak_memory_range: + min: 75046912 + max: 75046912 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 128 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 128 + job_id: j1p3mjmng + job_status: Passed + torchscript_onnx_ort_dml_gpu: + inference_time: 23055.0 + throughput: 43.37453914552158 + estimated_peak_memory_range: + min: 31170560 + max: 31170560 + primary_compute_unit: CPU + precision: fp32 + layer_info: + layers_on_npu: 0 + layers_on_gpu: 0 + layers_on_cpu: 59 + total_layers: 59 + job_id: jwgov2vk5 + job_status: Passed + reference_device_info: + name: Snapdragon X Elite CRD + os: '11' + form_factor: Compute + os_name: Windows + manufacturer: Qualcomm + chipset: Snapdragon® X Elite + timestamp: '2024-05-20T16:35:30.859779Z' diff --git a/qai_hub_models/models/resnext50_quantized/README.md b/qai_hub_models/models/resnext50_quantized/README.md index a4d1fd1d..60f8d368 100644 --- 
a/qai_hub_models/models/resnext50_quantized/README.md +++ b/qai_hub_models/models/resnext50_quantized/README.md @@ -14,6 +14,8 @@ accross various devices, can be found [here](https://aihub.qualcomm.com/models/r a hosted Qualcomm® device. + + ## Example & Usage diff --git a/qai_hub_models/models/resnext50_quantized/export.py b/qai_hub_models/models/resnext50_quantized/export.py index ae5a4d3a..b6afc50c 100644 --- a/qai_hub_models/models/resnext50_quantized/export.py +++ b/qai_hub_models/models/resnext50_quantized/export.py @@ -123,9 +123,16 @@ def export_model( else: quant_calibration_data = model.get_calibration_data(target_runtime, input_spec) + # Convert outputs from channel last to channel first (preferred I/O format for QNN and TensorFlow Lite) + channel_last_flags = ( + " --force_channel_last_input image_tensor" + if target_runtime != TargetRuntime.ORT + else "" + ) + # 2. Compile the model to an on-device asset model_compile_options = model.get_hub_compile_options( - target_runtime, compile_options + " --force_channel_last_input image_tensor" + target_runtime, compile_options + channel_last_flags, hub_device ) print(f"Optimizing model {model_name} to run on-device") submitted_compile_job = hub.submit_compile_job( @@ -167,8 +174,12 @@ def export_model( if target_runtime == TargetRuntime.QNN: hub_inputs = get_qnn_inputs(compile_job, sample_inputs) # Convert inputs from channel first to channel last - hub_inputs = transpose_channel_first_to_last( - "image_tensor", sample_inputs, target_runtime + hub_inputs = ( + sample_inputs + if target_runtime == TargetRuntime.ORT + else transpose_channel_first_to_last( + "image_tensor", sample_inputs, target_runtime + ) ) submitted_inference_job = hub.submit_inference_job( model=compile_job.get_target_model(), @@ -207,7 +218,7 @@ def export_model( def main(): warnings.filterwarnings("ignore") - parser = export_parser(model_cls=Model, supports_qnn=False, supports_ort=False) + parser = export_parser(model_cls=Model) args = parser.parse_args() export_model(**vars(args)) diff --git a/qai_hub_models/models/resnext50_quantized/perf.yaml b/qai_hub_models/models/resnext50_quantized/perf.yaml index 57e9183e..5e55abcf 100644 --- a/qai_hub_models/models/resnext50_quantized/perf.yaml +++ b/qai_hub_models/models/resnext50_quantized/perf.yaml @@ -26,6 +26,7 @@ aggregated: - Samsung Galaxy S24 Ultra - Samsung Galaxy S24+ - Samsung Galaxy Tab S8 + - Snapdragon X Elite CRD - Xiaomi 12 - Xiaomi 12 Pro supported_chipsets: @@ -36,15 +37,16 @@ aggregated: - Snapdragon® 8 Gen 2 - Snapdragon® 8 Gen 3 - Snapdragon® 888 + - Snapdragon® X Elite models: - name: ResNeXt50Quantized performance_metrics: - torchscript_onnx_tflite: - inference_time: 949.0 - throughput: 1053.740779768177 + inference_time: 945.0 + throughput: 1058.2010582010582 estimated_peak_memory_range: - min: 40960 - max: 32336880 + min: 16384 + max: 1978440 primary_compute_unit: NPU precision: int8 layer_info: @@ -52,22 +54,37 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 80 - job_id: jopr8wn95 + job_id: j1pvw6wrg + job_status: Passed + torchscript_onnx_qnn: + inference_time: 1182.0 + throughput: 846.0236886632825 + estimated_peak_memory_range: + min: 12288 + max: 98529472 + primary_compute_unit: NPU + precision: int8 + layer_info: + layers_on_npu: 78 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 78 + job_id: jygz737xp job_status: Passed torchscript_onnx_ort: - inference_time: 1749.0 - throughput: 571.7552887364208 + inference_time: 1456.0 + throughput: 686.8131868131868 
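Several export.py changes in this diff replace the hard-coded `--force_channel_last_input image_tensor` compile option with a `channel_last_flags` string that is left empty for the ORT target, and likewise skip `transpose_channel_first_to_last` on the sample inputs when `target_runtime == TargetRuntime.ORT`. A rough numpy illustration of what such a layout conversion does (the helpers below are illustrative stand-ins, not the qai_hub_models implementation, and the 224x224 resolution is only an assumed example):

```python
import numpy as np

def to_channel_last(x: np.ndarray) -> np.ndarray:
    # NCHW -> NHWC: move the channel axis from position 1 to the end.
    return np.transpose(x, (0, 2, 3, 1))

def to_channel_first(x: np.ndarray) -> np.ndarray:
    # NHWC -> NCHW: move the channel axis back next to the batch dimension.
    return np.transpose(x, (0, 3, 1, 2))

image_tensor = np.zeros((1, 3, 224, 224), dtype=np.float32)  # channel-first, as traced from PyTorch
nhwc = to_channel_last(image_tensor)
assert nhwc.shape == (1, 224, 224, 3)
assert to_channel_first(nhwc).shape == image_tensor.shape
```

Per the comments in the diff, channel-last is the preferred I/O format for the QNN and TensorFlow Lite targets, while the ONNX Runtime path keeps the traced model's channel-first layout, so both the compile flag and the input transpose are skipped for `TargetRuntime.ORT`.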
estimated_peak_memory_range: min: 12288 - max: 65405552 + max: 110506920 primary_compute_unit: NPU precision: int8 layer_info: - layers_on_npu: 1 + layers_on_npu: 86 layers_on_gpu: 0 layers_on_cpu: 0 - total_layers: 1 - job_id: jqpyrm775 + total_layers: 86 + job_id: jvgdvxvzg job_status: Passed reference_device_info: name: Samsung Galaxy S23 @@ -76,13 +93,13 @@ models: os_name: Android manufacturer: Samsung chipset: Snapdragon® 8 Gen 2 - timestamp: '2024-04-23T18:42:33.357768Z' + timestamp: '2024-05-20T16:35:30.890153Z' - torchscript_onnx_tflite: - inference_time: 724.0 - throughput: 1381.2154696132598 + inference_time: 710.0 + throughput: 1408.4507042253522 estimated_peak_memory_range: min: 12288 - max: 99522896 + max: 99630928 primary_compute_unit: NPU precision: int8 layer_info: @@ -90,22 +107,37 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 80 - job_id: jep20ev4g + job_id: j7gjlvlep + job_status: Passed + torchscript_onnx_qnn: + inference_time: 882.0 + throughput: 1133.7868480725624 + estimated_peak_memory_range: + min: 167936 + max: 57911616 + primary_compute_unit: NPU + precision: int8 + layer_info: + layers_on_npu: 78 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 78 + job_id: jz5w9e9mp job_status: Passed torchscript_onnx_ort: - inference_time: 1274.0 - throughput: 784.9293563579278 + inference_time: 1110.0 + throughput: 900.9009009009009 estimated_peak_memory_range: min: 0 - max: 42945536 + max: 41473776 primary_compute_unit: NPU precision: int8 layer_info: - layers_on_npu: 1 + layers_on_npu: 86 layers_on_gpu: 0 layers_on_cpu: 0 - total_layers: 1 - job_id: j2p036v6p + total_layers: 86 + job_id: jz57dyd95 job_status: Passed reference_device_info: name: Samsung Galaxy S24 @@ -114,36 +146,74 @@ models: os_name: Android manufacturer: Samsung chipset: Snapdragon® 8 Gen 3 - timestamp: '2024-04-23T18:42:33.357805Z' + timestamp: '2024-05-20T16:35:30.890180Z' - torchscript_onnx_tflite: - inference_time: 3105.0 - throughput: 322.061191626409 + inference_time: 940.0 + throughput: 1063.8297872340424 estimated_peak_memory_range: min: 12288 - max: 54933392 + max: 1980696 primary_compute_unit: NPU precision: int8 layer_info: - layers_on_npu: 82 + layers_on_npu: 80 layers_on_gpu: 0 layers_on_cpu: 0 - total_layers: 82 - job_id: jygzoq8z5 + total_layers: 80 + job_id: jlpevdvv5 job_status: Passed - torchscript_onnx_ort: - inference_time: 31790.0 - throughput: 31.456432840515884 + torchscript_onnx_qnn: + inference_time: 1178.0 + throughput: 848.8964346349745 estimated_peak_memory_range: - min: 8765440 - max: 56053712 - primary_compute_unit: CPU - precision: fp32 + min: 167936 + max: 10889000 + primary_compute_unit: NPU + precision: int8 layer_info: - layers_on_npu: 0 + layers_on_npu: 78 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 78 + job_id: jnp18487g + job_status: Passed + reference_device_info: + name: QCS8550 (Proxy) + os: '12' + form_factor: Iot + os_name: Android + manufacturer: Qualcomm + chipset: Qcs8550 + timestamp: '2024-05-20T16:35:30.890198Z' + - torchscript_onnx_tflite: + inference_time: 3222.0 + throughput: 310.36623215394167 + estimated_peak_memory_range: + min: 16384 + max: 54803712 + primary_compute_unit: NPU + precision: int8 + layer_info: + layers_on_npu: 80 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 80 + job_id: jlpeknr7p + job_status: Passed + torchscript_onnx_qnn: + inference_time: 3456.0 + throughput: 289.35185185185185 + estimated_peak_memory_range: + min: 163840 + max: 51993520 + primary_compute_unit: NPU + precision: int8 + layer_info: 
+ layers_on_npu: 78 layers_on_gpu: 0 - layers_on_cpu: 88 - total_layers: 88 - job_id: j1p8014xg + layers_on_cpu: 0 + total_layers: 78 + job_id: jz5wqrdj5 job_status: Passed reference_device_info: name: RB3 Gen 2 (Proxy) @@ -152,21 +222,21 @@ models: os_name: Android manufacturer: Qualcomm chipset: Qcs6490 - timestamp: '2024-04-23T18:42:33.357842Z' + timestamp: '2024-05-20T16:35:30.890215Z' - torchscript_onnx_tflite: - inference_time: 64556.0 - throughput: 15.49042691616581 + inference_time: 65861.0 + throughput: 15.183492506946449 estimated_peak_memory_range: - min: 0 - max: 94711912 + min: 8355840 + max: 26461872 primary_compute_unit: GPU precision: int8 layer_info: - layers_on_npu: 14 + layers_on_npu: 12 layers_on_gpu: 57 layers_on_cpu: 11 - total_layers: 82 - job_id: jnp1wo8kg + total_layers: 80 + job_id: jygzr0xz5 job_status: Passed reference_device_info: name: RB5 (Proxy) @@ -175,27 +245,57 @@ models: os_name: Android manufacturer: Qualcomm chipset: Qcs8250 - timestamp: '2024-04-23T18:42:33.357866Z' - - torchscript_onnx_tflite: - inference_time: 987.0 - throughput: 1013.1712259371834 + timestamp: '2024-05-20T16:35:30.890226Z' + - torchscript_onnx_qnn: + inference_time: 1353.0 + throughput: 739.0983000739099 + estimated_peak_memory_range: + min: 438272 + max: 438272 + primary_compute_unit: NPU + precision: int8 + layer_info: + layers_on_npu: 78 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 78 + job_id: jmg94l485 + job_status: Passed + torchscript_onnx_ort: + inference_time: 1402.0 + throughput: 713.2667617689016 estimated_peak_memory_range: - min: 24576 - max: 1688688 + min: 52183040 + max: 52183040 primary_compute_unit: NPU precision: int8 layer_info: - layers_on_npu: 82 + layers_on_npu: 86 layers_on_gpu: 0 layers_on_cpu: 0 - total_layers: 82 - job_id: jogk7woyp + total_layers: 86 + job_id: jqp4wlw1g + job_status: Passed + torchscript_onnx_ort_dml_gpu: + inference_time: 202223.0 + throughput: 4.945035925686 + estimated_peak_memory_range: + min: 20660224 + max: 20660224 + primary_compute_unit: GPU + precision: int8 + layer_info: + layers_on_npu: 0 + layers_on_gpu: 1 + layers_on_cpu: 0 + total_layers: 1 + job_id: j0px1k1lg job_status: Passed reference_device_info: - name: QCS8550 (Proxy) - os: '12' - form_factor: Iot - os_name: Android + name: Snapdragon X Elite CRD + os: '11' + form_factor: Compute + os_name: Windows manufacturer: Qualcomm - chipset: Qcs8550 - timestamp: '2024-04-23T18:42:33.357887Z' + chipset: Snapdragon® X Elite + timestamp: '2024-05-20T16:35:30.890248Z' diff --git a/qai_hub_models/models/riffusion_quantized/README.md b/qai_hub_models/models/riffusion_quantized/README.md new file mode 100644 index 00000000..e4090d13 --- /dev/null +++ b/qai_hub_models/models/riffusion_quantized/README.md @@ -0,0 +1,83 @@ +[![Qualcomm® AI Hub Models](https://qaihub-public-assets.s3.us-west-2.amazonaws.com/qai-hub-models/quic-logo.jpg)](../../README.md) + + +# [Riffusion: State-of-the-art generative AI model used to generate spectrogram images given any text input. These spectrograms can be converted into audio clips](#) + +Generates high resolution spectrograms images from text prompts using a latent diffusion model. This model uses CLIP ViT-L/14 as text encoder, U-Net based latent denoising, and VAE based decoder to generate the final image. + +This is based on the implementation of Riffusion found +[here](https://github.com/CompVis/stable-diffusion/tree/main). This repository contains scripts for optimized on-device +export suitable to run on Qualcomm® devices. 
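The text-encoder, U-Net, and VAE-decoder flow described here is wired up for real by the shared `stable_diffusion_demo` helper elsewhere in this diff; the sketch below only illustrates the data flow and the tensor shapes declared in `riffusion_quantized/model.py` ((1, 77) int32 token ids, a (1, 64, 64, 4) latent, a (1, 1280) time embedding, and a (1, 77, 768) text embedding). The `encode`, `denoise_step`, and `decode` callables, the step count, and the output image shape are hypothetical placeholders, not the actual component APIs:

```python
import numpy as np

# Hypothetical stand-ins for the three precompiled components (shapes from model.py).
def encode(token_ids: np.ndarray) -> np.ndarray:
    # Text encoder: (1, 77) int32 token ids -> (1, 77, 768) text embedding.
    return np.zeros((1, 77, 768), dtype=np.float32)

def denoise_step(latent: np.ndarray, time_emb: np.ndarray, text_emb: np.ndarray) -> np.ndarray:
    # U-Net: placeholder for one latent denoising step conditioned on time and text embeddings.
    return latent

def decode(latent: np.ndarray) -> np.ndarray:
    # VAE decoder: (1, 64, 64, 4) latent -> spectrogram image (output shape illustrative).
    return np.zeros((1, 512, 512, 3), dtype=np.float32)

token_ids = np.zeros((1, 77), dtype=np.int32)               # tokenized text prompt
text_emb = encode(token_ids)
latent = np.random.randn(1, 64, 64, 4).astype(np.float32)   # random starting latent
for _ in range(20):                                         # illustrative number of denoising steps
    time_emb = np.zeros((1, 1280), dtype=np.float32)        # timestep embedding for this step
    latent = denoise_step(latent, time_emb, text_emb)
spectrogram = decode(latent)                                # converted to an audio clip downstream
```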
More details on model performance +accross various devices, can be found [here](#). + +[Sign up](https://myaccount.qualcomm.com/signup) for early access to run these models on +a hosted Qualcomm® device. + + + + +## Example & Usage + +Install the package via pip: +```bash +pip install "qai_hub_models[riffusion_quantized]" +``` + + +Once installed, run the following simple CLI demo: + +```bash +python -m qai_hub_models.models.riffusion_quantized.demo +``` +More details on the CLI tool can be found with the `--help` option. See +[demo.py](demo.py) for sample usage of the model including pre/post processing +scripts. Please refer to our [general instructions on using +models](../../../#getting-started) for more usage instructions. + +## Export for on-device deployment + +This repository contains export scripts that produce a model optimized for +on-device deployment. This can be run as follows: + +```bash +python -m qai_hub_models.models.riffusion_quantized.export +``` +Additional options are documented with the `--help` option. Note that the above +script requires access to Deployment instructions for Qualcomm® AI Hub. + +## License +- The license for the original implementation of Riffusion can be found + [here](https://github.com/CompVis/stable-diffusion/blob/main/LICENSE). +- The license for the compiled assets for on-device deployment can be found [here]({deploy_license_url}) + +## References +* [High-Resolution Image Synthesis with Latent Diffusion Models](https://arxiv.org/abs/2112.10752) +* [Source Model Implementation](https://github.com/CompVis/stable-diffusion/tree/main) + +## Community +* Join [our AI Hub Slack community](https://qualcomm-ai-hub.slack.com/join/shared_invite/zt-2d5zsmas3-Sj0Q9TzslueCjS31eXG2UA#/shared-invite/email) to collaborate, post questions and learn more about on-device AI. +* For questions or feedback please [reach out to us](mailto:ai-hub-support@qti.qualcomm.com). + + +## Usage and Limitations + +This model may not be used for or in connection with any of the following applications: + +- Accessing essential private and public services and benefits; +- Administration of justice and democratic processes; +- Assessing or recognizing the emotional state of a person; +- Biometric and biometrics-based systems, including categorization of persons based on sensitive characteristics; +- Education and vocational training; +- Employment and workers management; +- Exploitation of the vulnerabilities of persons resulting in harmful behavior; +- General purpose social scoring; +- Law enforcement; +- Management and operation of critical infrastructure; +- Migration, asylum and border control management; +- Predictive policing; +- Real-time remote biometric identification in public spaces; +- Recommender systems of social media platforms; +- Scraping of facial images (from the internet or otherwise); and/or +- Subliminal manipulation + + diff --git a/qai_hub_models/models/riffusion_quantized/__init__.py b/qai_hub_models/models/riffusion_quantized/__init__.py new file mode 100644 index 00000000..74856492 --- /dev/null +++ b/qai_hub_models/models/riffusion_quantized/__init__.py @@ -0,0 +1,8 @@ +# --------------------------------------------------------------------- +# Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. 
+# SPDX-License-Identifier: BSD-3-Clause +# --------------------------------------------------------------------- +from qai_hub_models.models.riffusion_quantized.model import MODEL_ID # noqa: F401 +from qai_hub_models.models.riffusion_quantized.model import ( # noqa: F401 + RiffusionQuantized as Model, +) diff --git a/qai_hub_models/models/riffusion_quantized/demo.py b/qai_hub_models/models/riffusion_quantized/demo.py new file mode 100644 index 00000000..410a1e06 --- /dev/null +++ b/qai_hub_models/models/riffusion_quantized/demo.py @@ -0,0 +1,53 @@ +# --------------------------------------------------------------------- +# Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. +# SPDX-License-Identifier: BSD-3-Clause +# --------------------------------------------------------------------- +from diffusers import DPMSolverMultistepScheduler, UNet2DConditionModel +from transformers import CLIPTokenizer + +from qai_hub_models.models._shared.stable_diffusion.demo import stable_diffusion_demo +from qai_hub_models.models.riffusion_quantized.model import ( + MODEL_ASSET_VERSION, + MODEL_ID, + ClipVITTextEncoder, + Unet, + VAEDecoder, +) + + +# Run Riffusion end-to-end on a given prompt. The demo will output an +AI-generated image based on the description in the prompt. +def main(is_test: bool = False): + tokenizer = CLIPTokenizer.from_pretrained( + "openai/clip-vit-large-patch14", subfolder="", revision="main" + ) + + scheduler = DPMSolverMultistepScheduler( + beta_start=0.00085, + beta_end=0.012, + beta_schedule="scaled_linear", + num_train_timesteps=1000, + ) + + time_embedding = UNet2DConditionModel.from_pretrained( + "riffusion/riffusion-model-v1", subfolder="unet" + ).time_embedding + + text_encoder = ClipVITTextEncoder.from_precompiled() + unet = Unet.from_precompiled() + vae_decoder = VAEDecoder.from_precompiled() + stable_diffusion_demo( + model_id=MODEL_ID, + model_asset_version=MODEL_ASSET_VERSION, + text_encoder=text_encoder, + unet=unet, + vae_decoder=vae_decoder, + tokenizer=tokenizer, + scheduler=scheduler, + time_embedding=time_embedding, + is_test=is_test, + ) + + +if __name__ == "__main__": + main() diff --git a/qai_hub_models/models/stable_diffusion_quantized/export.py b/qai_hub_models/models/riffusion_quantized/export.py similarity index 97% rename from qai_hub_models/models/stable_diffusion_quantized/export.py rename to qai_hub_models/models/riffusion_quantized/export.py index 7242bbb7..428d3b0f 100644 --- a/qai_hub_models/models/stable_diffusion_quantized/export.py +++ b/qai_hub_models/models/riffusion_quantized/export.py @@ -13,7 +13,7 @@ import qai_hub as hub -from qai_hub_models.models.stable_diffusion_quantized import Model +from qai_hub_models.models.riffusion_quantized import Model from qai_hub_models.utils.args import export_parser from qai_hub_models.utils.base_model import BasePrecompiledModel, TargetRuntime from qai_hub_models.utils.printing import print_profile_metrics_from_job @@ -74,7 +74,7 @@ def export_model( * A ProfileJob containing metadata about the profile job (None if profiling skipped). * An InferenceJob containing metadata about the inference job (None if inferencing skipped).
""" - model_name = "stable_diffusion_quantized" + model_name = "riffusion_quantized" output_path = Path(output_dir or Path.cwd() / "build" / model_name) if chipset: hub_device = hub.Device(attributes=f"chipset:{chipset}") @@ -87,8 +87,8 @@ def export_model( raise ValueError(f"Invalid component {component_name}.") if not can_access_qualcomm_ai_hub(): return export_without_hub_access( - "stable_diffusion_quantized", - "Stable-Diffusion", + "riffusion_quantized", + "Riffusion", device, skip_profiling, skip_inferencing, diff --git a/qai_hub_models/models/riffusion_quantized/info.yaml b/qai_hub_models/models/riffusion_quantized/info.yaml new file mode 100644 index 00000000..e2a2669c --- /dev/null +++ b/qai_hub_models/models/riffusion_quantized/info.yaml @@ -0,0 +1,39 @@ +name: Riffusion +id: riffusion_quantized +status: public +headline: State-of-the-art generative AI model used to generate spectrogram images given + any text input. These spectrograms can be converted into audio clips. +domain: Generative AI +description: Generates high resolution spectrograms images from text prompts using a + latent diffusion model. This model uses CLIP ViT-L/14 as text encoder, U-Net based + latent denoising, and VAE based decoder to generate the final image. +use_case: Image Generation +tags: + - generative-ai + - quantized +research_paper: https://arxiv.org/abs/2112.10752 +research_paper_title: High-Resolution Image Synthesis with Latent Diffusion Models +license: https://github.com/CompVis/stable-diffusion/blob/main/LICENSE +deploy_license: https://github.com/CompVis/stable-diffusion/blob/main/LICENSE +source_repo: https://github.com/CompVis/stable-diffusion/tree/main +technical_details: + Input: Text prompt to generate spectrogram image + QNN-SDK: '2.20' + Text Encoder Number of parameters: 340M + UNet Number of parameters: 865M + VAE Decoder Number of parameters: 83M + Model size: 1GB +applicable_scenarios: + - Music Generation + - Music Editing + - Content Creation +related_models: + - stable_diffusion_v1_5_quantized +form_factors: + - Phone + - Tablet +has_static_banner: yes +has_animated_banner: yes +license_type: creativeml-openrail-m +deploy_license_type: creativeml-openrail-m +dataset: [] diff --git a/qai_hub_models/models/riffusion_quantized/model.py b/qai_hub_models/models/riffusion_quantized/model.py new file mode 100644 index 00000000..8e26b375 --- /dev/null +++ b/qai_hub_models/models/riffusion_quantized/model.py @@ -0,0 +1,105 @@ +# --------------------------------------------------------------------- +# Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. 
+# SPDX-License-Identifier: BSD-3-Clause +# --------------------------------------------------------------------- +from __future__ import annotations + +from qai_hub_models.models.protocols import FromPrecompiledProtocol +from qai_hub_models.utils.asset_loaders import CachedWebModelAsset +from qai_hub_models.utils.base_model import BasePrecompiledModel, CollectionModel +from qai_hub_models.utils.input_spec import InputSpec + +MODEL_ID = __name__.split(".")[-2] +MODEL_ASSET_VERSION = 1 +TEXT_ENCODER = "text_encoder.serialized.bin" +UNET_DIFFUSER = "unet.serialized.bin" +VAE_DECODER = "vae_decoder.serialized.bin" + + +class RiffusionQuantized(FromPrecompiledProtocol, CollectionModel): + """ + Riffusion wrapper class consists of + - Text Encoder + - UNet based diffuser + - VAE decoder + + All three models are pre-trained, quantized (int8 weight, uint16 activations) + and compiled into serialized binary for Qualcomm Snapdragon Gen2+. + """ + + def __init__(self, text_encoder, unet, vae_decoder) -> None: + self.text_encoder = text_encoder + self.unet = unet + self.vae_decoder = vae_decoder + + @classmethod + def from_precompiled(cls) -> "RiffusionQuantized": + return RiffusionQuantized( + text_encoder=ClipVITTextEncoder.from_precompiled(), + unet=Unet.from_precompiled(), + vae_decoder=VAEDecoder.from_precompiled(), + ) + + +class ClipVITTextEncoder(BasePrecompiledModel): + """ + CLIP-ViT based Text Encoder. + + Pre-trained, quantized (int8 weight, uint16 activations) + and compiled into serialized binary for Qualcomm Snapdragon Gen2+. + """ + + @classmethod + def from_precompiled(cls) -> "ClipVITTextEncoder": + text_encoder_path = CachedWebModelAsset.from_asset_store( + MODEL_ID, MODEL_ASSET_VERSION, TEXT_ENCODER + ).fetch() + return ClipVITTextEncoder(text_encoder_path) + + @staticmethod + def get_input_spec() -> InputSpec: + return {"input_1": ((1, 77), "int32")} + + +class Unet(BasePrecompiledModel): + """ + UNet model to denoise image in latent space. + + Pre-trained, quantized (int8 weight, uint16 activations) + and compiled into serialized binary for Qualcomm Snapdragon Gen2+. + """ + + @classmethod + def from_precompiled(cls) -> "Unet": + model_path = CachedWebModelAsset.from_asset_store( + MODEL_ID, MODEL_ASSET_VERSION, UNET_DIFFUSER + ).fetch() + return Unet(model_path) + + @staticmethod + def get_input_spec() -> InputSpec: + return { + "input_1": ((1, 64, 64, 4), "float32"), + "input_2": ((1, 1280), "float32"), + "input_3": ((1, 77, 768), "float32"), + } + + +class VAEDecoder(BasePrecompiledModel): + """ + Decodes image from latent into output generated image. + + Pre-trained, quantized (int8 weight, uint16 activations) + and compiled into serialized binary for Qualcomm Snapdragon Gen2+. 
+ """ + + @classmethod + def from_precompiled(cls) -> "VAEDecoder": + model_path = CachedWebModelAsset.from_asset_store( + MODEL_ID, MODEL_ASSET_VERSION, VAE_DECODER + ).fetch() + return VAEDecoder(model_path) + + @staticmethod + def get_input_spec() -> InputSpec: + return {"input_1": ((1, 64, 64, 4), "float32")} diff --git a/qai_hub_models/models/stable_diffusion_quantized/requirements.txt b/qai_hub_models/models/riffusion_quantized/requirements.txt similarity index 100% rename from qai_hub_models/models/stable_diffusion_quantized/requirements.txt rename to qai_hub_models/models/riffusion_quantized/requirements.txt diff --git a/qai_hub_models/models/riffusion_quantized/test.py b/qai_hub_models/models/riffusion_quantized/test.py new file mode 100644 index 00000000..b4f5bd0e --- /dev/null +++ b/qai_hub_models/models/riffusion_quantized/test.py @@ -0,0 +1,28 @@ +# --------------------------------------------------------------------- +# Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. +# SPDX-License-Identifier: BSD-3-Clause +# --------------------------------------------------------------------- +import pytest + +from qai_hub_models.models._shared.stable_diffusion.test_utils import ( + export_for_component, +) +from qai_hub_models.models.riffusion_quantized.demo import main as demo_main +from qai_hub_models.models.riffusion_quantized.export import export_model +from qai_hub_models.models.riffusion_quantized.model import RiffusionQuantized + + +def test_from_precompiled(): + RiffusionQuantized.from_precompiled() + + +@pytest.mark.skip("#105 move slow_cloud and slow tests to nightly.") +@pytest.mark.slow_cloud +def test_export(): + export_for_component(export_model, "TextEncoder_Quantized") + + +@pytest.mark.skip("#105 move slow_cloud and slow tests to nightly.") +@pytest.mark.slow_cloud +def test_demo(): + demo_main(is_test=True) diff --git a/qai_hub_models/models/sam/README.md b/qai_hub_models/models/sam/README.md index 2297b9ae..e4bc8748 100644 --- a/qai_hub_models/models/sam/README.md +++ b/qai_hub_models/models/sam/README.md @@ -14,6 +14,8 @@ accross various devices, can be found [here](https://aihub.qualcomm.com/models/s a hosted Qualcomm® device. + + ## Example & Usage Install the package via pip: diff --git a/qai_hub_models/models/sam/export.py b/qai_hub_models/models/sam/export.py index 0216f3ab..56e34545 100644 --- a/qai_hub_models/models/sam/export.py +++ b/qai_hub_models/models/sam/export.py @@ -31,7 +31,6 @@ ) ALL_COMPONENTS = ["SAMDecoder", "SAMEncoder"] -DEFAULT_COMPONENTS = ["SAMDecoder"] def export_model( @@ -97,7 +96,7 @@ def export_model( else: hub_device = hub.Device(name=device) component_arg = components - components = components or DEFAULT_COMPONENTS + components = components or ALL_COMPONENTS for component_name in components: if component_name not in ALL_COMPONENTS: raise ValueError(f"Invalid component {component_name}.") @@ -145,7 +144,7 @@ def export_model( # 2. 
Compile the models to an on-device asset model_compile_options = component.get_hub_compile_options( - target_runtime, compile_options + target_runtime, compile_options, hub_device ) print(f"Optimizing model {component_name} to run on-device") submitted_compile_job = hub.submit_compile_job( diff --git a/qai_hub_models/models/sam/model.py b/qai_hub_models/models/sam/model.py index 7b730c71..2b8d3c3d 100644 --- a/qai_hub_models/models/sam/model.py +++ b/qai_hub_models/models/sam/model.py @@ -6,7 +6,6 @@ import os import sys -import tempfile from typing import Callable, Tuple import numpy as np @@ -16,6 +15,7 @@ CachedWebModelAsset, load_path, maybe_clone_git_repo, + qaihm_temp_dir, ) from qai_hub_models.utils.base_model import BaseModel, CollectionModel from qai_hub_models.utils.input_spec import InputSpec @@ -290,7 +290,7 @@ def load_sam_model( ) -> torch.nn.Module: """Loads SAM model of given model type""" weights_url = _get_weights_url(model_type) - with tempfile.TemporaryDirectory() as tmpdir: + with qaihm_temp_dir() as tmpdir: weights_path = load_path(weights_url, tmpdir) sam = sam_model_registry[model_type](weights_path) sam.eval() @@ -311,8 +311,10 @@ def _patch_sam_with_qaihm_modules(): SamPredictor: segment_anything.SamPredictor Python class wrapper to call image encoder - decoder """ - sam_repo_path = maybe_clone_git_repo( - SAM_SOURCE_REPO, SAM_SOURCE_REPO_COMMIT, MODEL_ID, MODEL_ASSET_VERSION + sam_repo_path = str( + maybe_clone_git_repo( + SAM_SOURCE_REPO, SAM_SOURCE_REPO_COMMIT, MODEL_ID, MODEL_ASSET_VERSION + ) ) cwd = os.getcwd() try: diff --git a/qai_hub_models/models/sam/perf.yaml b/qai_hub_models/models/sam/perf.yaml index c39ffd7a..894f56df 100644 --- a/qai_hub_models/models/sam/perf.yaml +++ b/qai_hub_models/models/sam/perf.yaml @@ -22,6 +22,7 @@ aggregated: - Samsung Galaxy S24 Ultra - Samsung Galaxy S24+ - Samsung Galaxy Tab S8 + - Snapdragon X Elite CRD - Xiaomi 12 - Xiaomi 12 Pro supported_chipsets: @@ -30,38 +31,39 @@ aggregated: - Snapdragon® 8 Gen 2 - Snapdragon® 8 Gen 3 - Snapdragon® 888 + - Snapdragon® X Elite models: - name: SAMDecoder performance_metrics: - torchscript_onnx_tflite: - inference_time: 47957.0 - throughput: 20.852013261880433 + inference_time: 48417.0 + throughput: 20.653902554887747 estimated_peak_memory_range: - min: 4009984 - max: 23686696 + min: 4046848 + max: 13471792 primary_compute_unit: NPU precision: fp16 layer_info: - layers_on_npu: 340 + layers_on_npu: 342 layers_on_gpu: 0 layers_on_cpu: 0 - total_layers: 340 - job_id: jogk7892p + total_layers: 342 + job_id: jo5mznz9p job_status: Passed torchscript_onnx_ort: - inference_time: 1089085.0 - throughput: 0.9182019768888563 + inference_time: 35687.0 + throughput: 28.021408355983972 estimated_peak_memory_range: - min: 15695872 - max: 53847464 + min: 21266432 + max: 62118592 primary_compute_unit: NPU precision: fp16 layer_info: - layers_on_npu: 2 + layers_on_npu: 351 layers_on_gpu: 0 layers_on_cpu: 1 - total_layers: 3 - job_id: j1gl6l18g + total_layers: 352 + job_id: j1p87q7o5 job_status: Passed reference_device_info: name: Samsung Galaxy S23 @@ -70,36 +72,36 @@ models: os_name: Android manufacturer: Samsung chipset: Snapdragon® 8 Gen 2 - timestamp: '2024-04-23T18:42:33.384775Z' + timestamp: '2024-05-20T16:35:30.929656Z' - torchscript_onnx_tflite: - inference_time: 33609.0 - throughput: 29.75393495789818 + inference_time: 34847.0 + throughput: 28.696874910322265 estimated_peak_memory_range: - min: 61440 - max: 246507888 + min: 2396160 + max: 250176160 primary_compute_unit: NPU precision: fp16 
layer_info: - layers_on_npu: 340 + layers_on_npu: 342 layers_on_gpu: 0 layers_on_cpu: 0 - total_layers: 340 - job_id: jn5qevm45 + total_layers: 342 + job_id: jopryvy7g job_status: Passed torchscript_onnx_ort: - inference_time: 809800.0 - throughput: 1.2348728081007656 + inference_time: 25375.0 + throughput: 39.40886699507389 estimated_peak_memory_range: - min: 19857408 - max: 115862864 + min: 27185152 + max: 114627488 primary_compute_unit: NPU precision: fp16 layer_info: - layers_on_npu: 2 + layers_on_npu: 351 layers_on_gpu: 0 layers_on_cpu: 1 - total_layers: 3 - job_id: jw56ewd0g + total_layers: 352 + job_id: jn5q262o5 job_status: Passed reference_device_info: name: Samsung Galaxy S24 @@ -108,21 +110,21 @@ models: os_name: Android manufacturer: Samsung chipset: Snapdragon® 8 Gen 3 - timestamp: '2024-04-23T18:42:33.384834Z' + timestamp: '2024-05-20T16:35:30.929678Z' - torchscript_onnx_tflite: - inference_time: 48295.0 - throughput: 20.706077233668083 + inference_time: 48322.0 + throughput: 20.694507677662347 estimated_peak_memory_range: - min: 3977216 - max: 12384360 + min: 4030464 + max: 7393624 primary_compute_unit: NPU precision: fp16 layer_info: - layers_on_npu: 340 + layers_on_npu: 342 layers_on_gpu: 0 layers_on_cpu: 0 - total_layers: 340 - job_id: jnp1yk7kp + total_layers: 342 + job_id: jqpyd1dlp job_status: Passed reference_device_info: name: QCS8550 (Proxy) @@ -131,4 +133,181 @@ models: os_name: Android manufacturer: Qualcomm chipset: Qcs8550 - timestamp: '2024-04-23T18:42:33.384881Z' + timestamp: '2024-05-20T16:35:30.929690Z' + - torchscript_onnx_ort: + inference_time: 35991.0 + throughput: 27.78472395876747 + estimated_peak_memory_range: + min: 38920192 + max: 38920192 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 351 + layers_on_gpu: 0 + layers_on_cpu: 1 + total_layers: 352 + job_id: jw561y3yp + job_status: Passed + torchscript_onnx_ort_dml_gpu: + inference_time: 'null' + throughput: 'null' + estimated_peak_memory_range: + min: 0 + max: 0 + primary_compute_unit: 'null' + precision: 'null' + layer_info: + layers_on_npu: 0 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 0 + job_id: jwgov21k5 + job_status: Failed + reference_device_info: + name: Snapdragon X Elite CRD + os: '11' + form_factor: Compute + os_name: Windows + manufacturer: Qualcomm + chipset: Snapdragon® X Elite + timestamp: '2024-05-20T16:35:30.929711Z' +- name: SAMEncoder + performance_metrics: + - torchscript_onnx_tflite: + inference_time: 12002934.0 + throughput: 0.08331296331380311 + estimated_peak_memory_range: + min: 2745298944 + max: 2749256400 + primary_compute_unit: CPU + precision: fp32 + layer_info: + layers_on_npu: 0 + layers_on_gpu: 37 + layers_on_cpu: 771 + total_layers: 808 + job_id: jegne6eqg + job_status: Passed + torchscript_onnx_ort: + inference_time: 'null' + throughput: 'null' + estimated_peak_memory_range: + min: 0 + max: 0 + primary_compute_unit: 'null' + precision: 'null' + layer_info: + layers_on_npu: 0 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 0 + job_id: jogkyeynp + job_status: Failed + reference_device_info: + name: Samsung Galaxy S23 + os: '13' + form_factor: Phone + os_name: Android + manufacturer: Samsung + chipset: Snapdragon® 8 Gen 2 + timestamp: '2024-05-20T16:35:30.929729Z' + - torchscript_onnx_tflite: + inference_time: 10788785.0 + throughput: 0.09268884309030165 + estimated_peak_memory_range: + min: 2551681024 + max: 2911589120 + primary_compute_unit: CPU + precision: fp32 + layer_info: + layers_on_npu: 0 + layers_on_gpu: 37 + 
layers_on_cpu: 771 + total_layers: 808 + job_id: jep2mkmq5 + job_status: Passed + torchscript_onnx_ort: + inference_time: 'null' + throughput: 'null' + estimated_peak_memory_range: + min: 0 + max: 0 + primary_compute_unit: 'null' + precision: 'null' + layer_info: + layers_on_npu: 0 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 0 + job_id: j1glkv0mp + job_status: Failed + reference_device_info: + name: Samsung Galaxy S24 + os: '14' + form_factor: Phone + os_name: Android + manufacturer: Samsung + chipset: Snapdragon® 8 Gen 3 + timestamp: '2024-05-20T16:35:30.929747Z' + - torchscript_onnx_tflite: + inference_time: 11903922.0 + throughput: 0.08400592678614661 + estimated_peak_memory_range: + min: 2721533952 + max: 2726534168 + primary_compute_unit: CPU + precision: fp32 + layer_info: + layers_on_npu: 0 + layers_on_gpu: 37 + layers_on_cpu: 771 + total_layers: 808 + job_id: j2p0rzrnp + job_status: Passed + reference_device_info: + name: QCS8550 (Proxy) + os: '12' + form_factor: Iot + os_name: Android + manufacturer: Qualcomm + chipset: Qcs8550 + timestamp: '2024-05-20T16:35:30.929757Z' + - torchscript_onnx_ort: + inference_time: 'null' + throughput: 'null' + estimated_peak_memory_range: + min: 0 + max: 0 + primary_compute_unit: 'null' + precision: 'null' + layer_info: + layers_on_npu: 0 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 0 + job_id: j1p3mj4ng + job_status: Failed + torchscript_onnx_ort_dml_gpu: + inference_time: 'null' + throughput: 'null' + estimated_peak_memory_range: + min: 0 + max: 0 + primary_compute_unit: 'null' + precision: 'null' + layer_info: + layers_on_npu: 0 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 0 + job_id: j1pvw61rg + job_status: Failed + reference_device_info: + name: Snapdragon X Elite CRD + os: '11' + form_factor: Compute + os_name: Windows + manufacturer: Qualcomm + chipset: Snapdragon® X Elite + timestamp: '2024-05-20T16:35:30.929773Z' diff --git a/qai_hub_models/models/sesr_m5/README.md b/qai_hub_models/models/sesr_m5/README.md index 37bc4f6d..eb36ea36 100644 --- a/qai_hub_models/models/sesr_m5/README.md +++ b/qai_hub_models/models/sesr_m5/README.md @@ -14,6 +14,8 @@ accross various devices, can be found [here](https://aihub.qualcomm.com/models/s a hosted Qualcomm® device. + + ## Example & Usage diff --git a/qai_hub_models/models/sesr_m5/export.py b/qai_hub_models/models/sesr_m5/export.py index eba502da..56a3d124 100644 --- a/qai_hub_models/models/sesr_m5/export.py +++ b/qai_hub_models/models/sesr_m5/export.py @@ -119,12 +119,16 @@ def export_model( model.eval() source_model = torch.jit.trace(model.to("cpu"), make_torch_inputs(input_spec)) + # Convert outputs from channel last to channel first (preferred I/O format for QNN and TensorFlow Lite) + channel_last_flags = ( + " --force_channel_last_input image" + " --force_channel_last_output output_0" + if target_runtime != TargetRuntime.ORT + else "" + ) + # 2. 
Compile the model to an on-device asset model_compile_options = model.get_hub_compile_options( - target_runtime, - compile_options - + " --force_channel_last_input image" - + " --force_channel_last_output output_0", + target_runtime, compile_options + channel_last_flags, hub_device ) print(f"Optimizing model {model_name} to run on-device") submitted_compile_job = hub.submit_compile_job( @@ -162,8 +166,10 @@ def export_model( ) sample_inputs = model.sample_inputs(input_spec) # Convert inputs from channel first to channel last - hub_inputs = transpose_channel_first_to_last( - "image", sample_inputs, target_runtime + hub_inputs = ( + sample_inputs + if target_runtime == TargetRuntime.ORT + else transpose_channel_first_to_last("image", sample_inputs, target_runtime) ) submitted_inference_job = hub.submit_inference_job( model=compile_job.get_target_model(), @@ -191,8 +197,12 @@ def export_model( assert inference_job is not None and inference_job.wait().success inference_result: hub.client.DatasetEntries = inference_job.download_output_data() # type: ignore # Convert outputs from channel last to channel first - inference_result = transpose_channel_last_to_first( - "output_0", inference_result, target_runtime + inference_result = ( + inference_result + if target_runtime == TargetRuntime.ORT + else transpose_channel_last_to_first( + "output_0", inference_result, target_runtime + ) ) print_inference_metrics(inference_job, inference_result, torch_out) @@ -201,7 +211,7 @@ def export_model( def main(): warnings.filterwarnings("ignore") - parser = export_parser(model_cls=Model, supports_ort=False) + parser = export_parser(model_cls=Model) args = parser.parse_args() export_model(**vars(args)) diff --git a/qai_hub_models/models/sesr_m5/perf.yaml b/qai_hub_models/models/sesr_m5/perf.yaml index f52e91e4..9f832592 100644 --- a/qai_hub_models/models/sesr_m5/perf.yaml +++ b/qai_hub_models/models/sesr_m5/perf.yaml @@ -22,6 +22,7 @@ aggregated: - Samsung Galaxy S24 Ultra - Samsung Galaxy S24+ - Samsung Galaxy Tab S8 + - Snapdragon X Elite CRD - Xiaomi 12 - Xiaomi 12 Pro supported_chipsets: @@ -30,15 +31,16 @@ aggregated: - Snapdragon® 8 Gen 2 - Snapdragon® 8 Gen 3 - Snapdragon® 888 + - Snapdragon® X Elite models: - name: SESR-M5 performance_metrics: - torchscript_onnx_tflite: - inference_time: 2236.0 - throughput: 447.2271914132379 + inference_time: 2229.0 + throughput: 448.63167339614176 estimated_peak_memory_range: - min: 24576 - max: 1639560 + min: 28672 + max: 1751584 primary_compute_unit: NPU precision: fp16 layer_info: @@ -46,14 +48,14 @@ models: layers_on_gpu: 0 layers_on_cpu: 3 total_layers: 25 - job_id: jwgok84xp + job_id: j7gjlv0ep job_status: Passed torchscript_onnx_qnn: - inference_time: 2141.0 - throughput: 467.07146193367583 + inference_time: 2149.0 + throughput: 465.33271288971616 estimated_peak_memory_range: - min: 217088 - max: 66412728 + min: 24576 + max: 3705880 primary_compute_unit: NPU precision: fp16 layer_info: @@ -61,22 +63,22 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 31 - job_id: j7gjzqwx5 + job_id: jz5w9edmp job_status: Passed torchscript_onnx_ort: - inference_time: 2959.0 - throughput: 337.95201081446436 + inference_time: 2907.0 + throughput: 343.9972480220158 estimated_peak_memory_range: - min: 28672 - max: 6879728 + min: 12288 + max: 5644152 primary_compute_unit: NPU precision: fp16 layer_info: - layers_on_npu: 1 + layers_on_npu: 33 layers_on_gpu: 0 layers_on_cpu: 0 - total_layers: 1 - job_id: jygzon4k5 + total_layers: 33 + job_id: jz5w9ed4p job_status: Passed 
reference_device_info: name: Samsung Galaxy S23 @@ -85,13 +87,13 @@ models: os_name: Android manufacturer: Samsung chipset: Snapdragon® 8 Gen 2 - timestamp: '2024-04-23T18:42:33.402615Z' + timestamp: '2024-05-20T16:35:30.969933Z' - torchscript_onnx_tflite: - inference_time: 1608.0 - throughput: 621.8905472636816 + inference_time: 1652.0 + throughput: 605.3268765133172 estimated_peak_memory_range: min: 16384 - max: 24474768 + max: 24934032 primary_compute_unit: NPU precision: fp16 layer_info: @@ -99,14 +101,14 @@ models: layers_on_gpu: 0 layers_on_cpu: 3 total_layers: 25 - job_id: j1pv079j5 + job_id: jlpevdrv5 job_status: Passed torchscript_onnx_qnn: - inference_time: 1452.0 - throughput: 688.7052341597796 + inference_time: 1450.0 + throughput: 689.6551724137931 estimated_peak_memory_range: - min: 208896 - max: 24978944 + min: 9527296 + max: 32336704 primary_compute_unit: NPU precision: fp16 layer_info: @@ -114,23 +116,23 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 31 - job_id: jlpeeyl1p + job_id: jmg94l385 job_status: Passed torchscript_onnx_ort: - inference_time: 2024.0 - throughput: 494.0711462450593 + inference_time: 'null' + throughput: 'null' estimated_peak_memory_range: - min: 208896 - max: 16041184 - primary_compute_unit: NPU - precision: fp16 + min: 0 + max: 0 + primary_compute_unit: 'null' + precision: 'null' layer_info: - layers_on_npu: 1 + layers_on_npu: 0 layers_on_gpu: 0 layers_on_cpu: 0 - total_layers: 1 - job_id: jz5w24465 - job_status: Passed + total_layers: 0 + job_id: jmg94l3m5 + job_status: Failed reference_device_info: name: Samsung Galaxy S24 os: '14' @@ -138,13 +140,13 @@ models: os_name: Android manufacturer: Samsung chipset: Snapdragon® 8 Gen 3 - timestamp: '2024-04-23T18:42:33.402651Z' + timestamp: '2024-05-20T16:35:30.969973Z' - torchscript_onnx_tflite: - inference_time: 2223.0 - throughput: 449.842555105713 + inference_time: 2266.0 + throughput: 441.306266548985 estimated_peak_memory_range: - min: 20480 - max: 8844744 + min: 12607488 + max: 14159192 primary_compute_unit: NPU precision: fp16 layer_info: @@ -152,14 +154,14 @@ models: layers_on_gpu: 0 layers_on_cpu: 3 total_layers: 25 - job_id: j0pxn8d95 + job_id: jygz73xxp job_status: Passed torchscript_onnx_qnn: - inference_time: 2148.0 - throughput: 465.54934823091247 + inference_time: 2141.0 + throughput: 467.07146193367583 estimated_peak_memory_range: - min: 229376 - max: 4684448 + min: 221184 + max: 4063112 primary_compute_unit: NPU precision: fp16 layer_info: @@ -167,7 +169,7 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 31 - job_id: jep20qvmg + job_id: jvgdvxrzg job_status: Passed reference_device_info: name: QCS8550 (Proxy) @@ -176,4 +178,57 @@ models: os_name: Android manufacturer: Qualcomm chipset: Qcs8550 - timestamp: '2024-04-23T18:42:33.402678Z' + timestamp: '2024-05-20T16:35:30.969990Z' + - torchscript_onnx_qnn: + inference_time: 2969.0 + throughput: 336.81374200067364 + estimated_peak_memory_range: + min: 212992 + max: 212992 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 31 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 31 + job_id: jnp184d7g + job_status: Passed + torchscript_onnx_ort: + inference_time: 2971.0 + throughput: 336.58700774150117 + estimated_peak_memory_range: + min: 13090816 + max: 13090816 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 33 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 33 + job_id: jnp184dng + job_status: Passed + torchscript_onnx_ort_dml_gpu: + inference_time: 
17098.0 + throughput: 58.486372675166685 + estimated_peak_memory_range: + min: 83427328 + max: 83427328 + primary_compute_unit: GPU + precision: fp16 + layer_info: + layers_on_npu: 0 + layers_on_gpu: 1 + layers_on_cpu: 0 + total_layers: 1 + job_id: jvgdvxr6g + job_status: Passed + reference_device_info: + name: Snapdragon X Elite CRD + os: '11' + form_factor: Compute + os_name: Windows + manufacturer: Qualcomm + chipset: Snapdragon® X Elite + timestamp: '2024-05-20T16:35:30.970016Z' diff --git a/qai_hub_models/models/sesr_m5_quantized/README.md b/qai_hub_models/models/sesr_m5_quantized/README.md index 1f6bd0dc..e93dd579 100644 --- a/qai_hub_models/models/sesr_m5_quantized/README.md +++ b/qai_hub_models/models/sesr_m5_quantized/README.md @@ -14,6 +14,8 @@ accross various devices, can be found [here](https://aihub.qualcomm.com/models/s a hosted Qualcomm® device. + + ## Example & Usage diff --git a/qai_hub_models/models/sesr_m5_quantized/export.py b/qai_hub_models/models/sesr_m5_quantized/export.py index 03f22916..27c722a5 100644 --- a/qai_hub_models/models/sesr_m5_quantized/export.py +++ b/qai_hub_models/models/sesr_m5_quantized/export.py @@ -122,9 +122,16 @@ def export_model( else: quant_calibration_data = model.get_calibration_data(target_runtime, input_spec) + # Convert outputs from channel last to channel first (preferred I/O format for QNN and TensorFlow Lite) + channel_last_flags = ( + " --force_channel_last_input image" + if target_runtime != TargetRuntime.ORT + else "" + ) + # 2. Compile the model to an on-device asset model_compile_options = model.get_hub_compile_options( - target_runtime, compile_options + " --force_channel_last_input image" + target_runtime, compile_options + channel_last_flags, hub_device ) print(f"Optimizing model {model_name} to run on-device") submitted_compile_job = hub.submit_compile_job( @@ -166,8 +173,10 @@ def export_model( if target_runtime == TargetRuntime.QNN: hub_inputs = get_qnn_inputs(compile_job, sample_inputs) # Convert inputs from channel first to channel last - hub_inputs = transpose_channel_first_to_last( - "image", sample_inputs, target_runtime + hub_inputs = ( + sample_inputs + if target_runtime == TargetRuntime.ORT + else transpose_channel_first_to_last("image", sample_inputs, target_runtime) ) submitted_inference_job = hub.submit_inference_job( model=compile_job.get_target_model(), @@ -201,7 +210,7 @@ def export_model( def main(): warnings.filterwarnings("ignore") - parser = export_parser(model_cls=Model, supports_qnn=False, supports_ort=False) + parser = export_parser(model_cls=Model) args = parser.parse_args() export_model(**vars(args)) diff --git a/qai_hub_models/models/sesr_m5_quantized/model.py b/qai_hub_models/models/sesr_m5_quantized/model.py index 8782ceaf..de5c875e 100644 --- a/qai_hub_models/models/sesr_m5_quantized/model.py +++ b/qai_hub_models/models/sesr_m5_quantized/model.py @@ -14,20 +14,24 @@ import torch from aimet_torch.cross_layer_equalization import equalize_model +from aimet_torch.model_preparer import prepare_model from aimet_torch.quantsim import QuantizationSimModel, load_encodings_to_sim from qai_hub_models.models._shared.sesr.common import _load_sesr_source_model -from qai_hub_models.models.common import SourceModelFormat, TargetRuntime from qai_hub_models.models.sesr_m5.model import ( NUM_CHANNELS, NUM_LBLOCKS, SCALING_FACTOR, SESR_M5, ) +from qai_hub_models.utils.aimet.config_loader import get_default_aimet_config from qai_hub_models.utils.asset_loaders import CachedWebModelAsset +from 
qai_hub_models.utils.quantization_aimet import ( + constrain_quantized_inputs_to_image_range, +) MODEL_ID = __name__.split(".")[-2] -MODEL_ASSET_VERSION = 2 +MODEL_ASSET_VERSION = 3 # Weights and config stored in S3 are sourced from # https://github.com/quic/aimet-model-zoo/blob/develop/aimet_zoo_torch/sesr/model/model_cards/sesr_m5_4x_w8a8.json: @@ -37,7 +41,6 @@ # Encodings were generated with AIMET QuantSim library QUANTIZED_WEIGHTS = "sesr_m5_4x_checkpoint_int8.pth" AIMET_ENCODINGS = "sesr_m5_quantized_encodings.json" -AIMET_CONFIG = "default_config_per_channel.json" class SESR_M5Quantizable(AIMETQuantizableMixin, SESR_M5): @@ -51,9 +54,7 @@ def __init__( sesr_model: QuantizationSimModel, ) -> None: SESR_M5.__init__(self, sesr_model.model) - AIMETQuantizableMixin.__init__( - self, sesr_model, needs_onnx_direct_aimet_export=False - ) + AIMETQuantizableMixin.__init__(self, sesr_model) @classmethod def from_pretrained( @@ -62,32 +63,31 @@ def from_pretrained( ) -> SESR_M5Quantizable: # Load Model sesr = _load_sesr_source_model(SCALING_FACTOR, NUM_CHANNELS, NUM_LBLOCKS) + # The model is collapsed pre-quantization - see + # https://github.com/quic/aimet-model-zoo/blob/d09d2b0404d10f71a7640a87e9d5e5257b028802/aimet_zoo_torch/common/super_resolution/models.py#L110 + sesr.collapse() input_shape = SESR_M5.get_input_spec()["image"][0] + sesr = prepare_model(sesr) equalize_model(sesr, input_shape) # Download weights and quantization parameters weights = CachedWebModelAsset.from_asset_store( MODEL_ID, MODEL_ASSET_VERSION, QUANTIZED_WEIGHTS ).fetch() - aimet_config = CachedWebModelAsset.from_asset_store( - MODEL_ID, MODEL_ASSET_VERSION, AIMET_CONFIG - ).fetch() # Load the model weights and quantization parameters state_dict = torch.load(weights, map_location=torch.device("cpu"))["state_dict"] - # Here we collapse before loading the quantized weights. 
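Note: the sesr_m5_quantized/model.py hunk reorders the AIMET setup — the SESR network is collapsed and passed through `prepare_model` before cross-layer equalization, the bundled per-channel config file is replaced by `get_default_aimet_config()`, and quantized inputs are constrained to the image range. Below is a condensed sketch of that flow, assuming the caller resolves the model object, int8 checkpoint, and encodings path (e.g. via `_load_sesr_source_model` and the cached web assets referenced in this diff); it illustrates the recipe rather than reproducing the exact `from_pretrained` implementation.

```python
import torch
from aimet_torch.cross_layer_equalization import equalize_model
from aimet_torch.model_preparer import prepare_model
from aimet_torch.quantsim import QuantizationSimModel, load_encodings_to_sim

from qai_hub_models.utils.aimet.config_loader import get_default_aimet_config
from qai_hub_models.utils.quantization_aimet import (
    constrain_quantized_inputs_to_image_range,
)


def build_quantsim(sesr, input_shape, weights_path, encodings_path):
    # Collapse residual blocks before quantization, mirroring the
    # aimet-model-zoo recipe linked in the diff.
    sesr.collapse()
    # Rewrite the graph into an AIMET-traceable form, then equalize layers.
    sesr = prepare_model(sesr)
    equalize_model(sesr, input_shape)
    # Load the int8 checkpoint on top of the prepared FP32 graph.
    state_dict = torch.load(weights_path, map_location="cpu")["state_dict"]
    sesr.load_state_dict(state_dict)
    # Simulate 8-bit weights/activations with the shared default config.
    sim = QuantizationSimModel(
        sesr,
        quant_scheme="tf_enhanced",
        default_param_bw=8,
        default_output_bw=8,
        config_file=get_default_aimet_config(),
        dummy_input=torch.rand(input_shape),
    )
    # Constrain the input quantizers to the expected image range,
    # then apply the pre-computed encodings.
    constrain_quantized_inputs_to_image_range(sim)
    load_encodings_to_sim(sim, encodings_path)
    sim.model.eval()
    return sim
```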
- # The model is collapsed pre-quantization - see - # https://github.com/quic/aimet-model-zoo/blob/d09d2b0404d10f71a7640a87e9d5e5257b028802/aimet_zoo_torch/common/super_resolution/models.py#L110 - sesr.collapse() sesr.load_state_dict(state_dict) sim = QuantizationSimModel( sesr, quant_scheme="tf_enhanced", default_param_bw=8, default_output_bw=8, - config_file=aimet_config, + config_file=get_default_aimet_config(), dummy_input=torch.rand(input_shape), ) + constrain_quantized_inputs_to_image_range(sim) + if aimet_encodings: if aimet_encodings == "DEFAULT": aimet_encodings = CachedWebModelAsset.from_asset_store( @@ -98,11 +98,3 @@ def from_pretrained( sim.model.eval() return cls(sim) - - def preferred_hub_source_model_format( - self, target_runtime: TargetRuntime - ) -> SourceModelFormat: - if target_runtime == TargetRuntime.QNN: - return SourceModelFormat.ONNX - else: - return SourceModelFormat.TORCHSCRIPT diff --git a/qai_hub_models/models/sesr_m5_quantized/perf.yaml b/qai_hub_models/models/sesr_m5_quantized/perf.yaml index 581307ec..e83193b4 100644 --- a/qai_hub_models/models/sesr_m5_quantized/perf.yaml +++ b/qai_hub_models/models/sesr_m5_quantized/perf.yaml @@ -26,6 +26,7 @@ aggregated: - Samsung Galaxy S24 Ultra - Samsung Galaxy S24+ - Samsung Galaxy Tab S8 + - Snapdragon X Elite CRD - Xiaomi 12 - Xiaomi 12 Pro supported_chipsets: @@ -36,15 +37,16 @@ aggregated: - Snapdragon® 8 Gen 2 - Snapdragon® 8 Gen 3 - Snapdragon® 888 + - Snapdragon® X Elite models: - name: SESR-M5-Quantized performance_metrics: - torchscript_onnx_tflite: - inference_time: 1356.0 - throughput: 737.4631268436578 + inference_time: 1329.0 + throughput: 752.4454477050414 estimated_peak_memory_range: - min: 24576 - max: 1678184 + min: 32768 + max: 2149856 primary_compute_unit: NPU precision: int8 layer_info: @@ -52,7 +54,37 @@ models: layers_on_gpu: 0 layers_on_cpu: 3 total_layers: 14 - job_id: jnp1y662p + job_id: jz57dyjn5 + job_status: Passed + torchscript_onnx_qnn: + inference_time: 774.0 + throughput: 1291.9896640826873 + estimated_peak_memory_range: + min: 28672 + max: 18606256 + primary_compute_unit: NPU + precision: int8 + layer_info: + layers_on_npu: 14 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 14 + job_id: jegne69jg + job_status: Passed + torchscript_onnx_ort: + inference_time: 1153.0 + throughput: 867.3026886383348 + estimated_peak_memory_range: + min: 2109440 + max: 19388976 + primary_compute_unit: NPU + precision: int8 + layer_info: + layers_on_npu: 19 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 19 + job_id: j2p0rze0p job_status: Passed reference_device_info: name: Samsung Galaxy S23 @@ -61,13 +93,13 @@ models: os_name: Android manufacturer: Samsung chipset: Snapdragon® 8 Gen 2 - timestamp: '2024-04-23T18:42:33.426676Z' + timestamp: '2024-05-20T16:35:31.000448Z' - torchscript_onnx_tflite: - inference_time: 1067.0 - throughput: 937.207122774133 + inference_time: 1111.0 + throughput: 900.0900090009001 estimated_peak_memory_range: min: 12288 - max: 21689744 + max: 21726352 primary_compute_unit: NPU precision: int8 layer_info: @@ -75,7 +107,37 @@ models: layers_on_gpu: 0 layers_on_cpu: 3 total_layers: 14 - job_id: jvgde22e5 + job_id: jqp4wlx2g + job_status: Passed + torchscript_onnx_qnn: + inference_time: 530.0 + throughput: 1886.7924528301887 + estimated_peak_memory_range: + min: 65536 + max: 16933392 + primary_compute_unit: NPU + precision: int8 + layer_info: + layers_on_npu: 14 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 14 + job_id: jopryv4kg + job_status: Passed + 
torchscript_onnx_ort: + inference_time: 834.0 + throughput: 1199.0407673860911 + estimated_peak_memory_range: + min: 212992 + max: 13346208 + primary_compute_unit: NPU + precision: int8 + layer_info: + layers_on_npu: 19 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 19 + job_id: j1p87qwq5 job_status: Passed reference_device_info: name: Samsung Galaxy S24 @@ -84,21 +146,74 @@ models: os_name: Android manufacturer: Samsung chipset: Snapdragon® 8 Gen 3 - timestamp: '2024-04-23T18:42:33.426696Z' + timestamp: '2024-05-20T16:35:31.000474Z' - torchscript_onnx_tflite: - inference_time: 3752.0 - throughput: 266.52452025586354 + inference_time: 1328.0 + throughput: 753.0120481927711 estimated_peak_memory_range: - min: 49152 - max: 14587664 + min: 24576 + max: 1624240 primary_compute_unit: NPU precision: int8 layer_info: - layers_on_npu: 13 + layers_on_npu: 11 layers_on_gpu: 0 layers_on_cpu: 3 - total_layers: 16 - job_id: jwgok74dp + total_layers: 14 + job_id: j0px1k78g + job_status: Passed + torchscript_onnx_qnn: + inference_time: 778.0 + throughput: 1285.3470437017995 + estimated_peak_memory_range: + min: 28672 + max: 12397048 + primary_compute_unit: NPU + precision: int8 + layer_info: + layers_on_npu: 14 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 14 + job_id: jqpyd140p + job_status: Passed + reference_device_info: + name: QCS8550 (Proxy) + os: '12' + form_factor: Iot + os_name: Android + manufacturer: Qualcomm + chipset: Qcs8550 + timestamp: '2024-05-20T16:35:31.000504Z' + - torchscript_onnx_tflite: + inference_time: 3342.0 + throughput: 299.22202274087374 + estimated_peak_memory_range: + min: 45056 + max: 14433024 + primary_compute_unit: NPU + precision: int8 + layer_info: + layers_on_npu: 11 + layers_on_gpu: 0 + layers_on_cpu: 3 + total_layers: 14 + job_id: jw56n0q7g + job_status: Passed + torchscript_onnx_qnn: + inference_time: 1809.0 + throughput: 552.791597567717 + estimated_peak_memory_range: + min: 61440 + max: 17655776 + primary_compute_unit: NPU + precision: int8 + layer_info: + layers_on_npu: 14 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 14 + job_id: jygzr0v65 job_status: Passed reference_device_info: name: RB3 Gen 2 (Proxy) @@ -107,21 +222,21 @@ models: os_name: Android manufacturer: Qualcomm chipset: Qcs6490 - timestamp: '2024-04-23T18:42:33.426710Z' + timestamp: '2024-05-20T16:35:31.000520Z' - torchscript_onnx_tflite: - inference_time: 12810.0 - throughput: 78.06401249024199 + inference_time: 5039.0 + throughput: 198.45207382417146 estimated_peak_memory_range: - min: 5787648 - max: 13604584 + min: 1916928 + max: 9296352 primary_compute_unit: NPU precision: int8 layer_info: - layers_on_npu: 12 + layers_on_npu: 10 layers_on_gpu: 0 layers_on_cpu: 4 - total_layers: 16 - job_id: jvgdq6vk5 + total_layers: 14 + job_id: j1p3erqz5 job_status: Passed reference_device_info: name: RB5 (Proxy) @@ -130,27 +245,57 @@ models: os_name: Android manufacturer: Qualcomm chipset: Qcs8250 - timestamp: '2024-04-23T18:42:33.426724Z' - - torchscript_onnx_tflite: - inference_time: 1743.0 - throughput: 573.7234652897304 + timestamp: '2024-05-20T16:35:31.000535Z' + - torchscript_onnx_qnn: + inference_time: 745.0 + throughput: 1342.2818791946308 estimated_peak_memory_range: - min: 28672 - max: 1454440 + min: 49152 + max: 49152 primary_compute_unit: NPU precision: int8 layer_info: - layers_on_npu: 13 + layers_on_npu: 14 layers_on_gpu: 0 - layers_on_cpu: 3 - total_layers: 16 - job_id: j1p3vlwzg + layers_on_cpu: 0 + total_layers: 14 + job_id: jep2mk765 + job_status: Passed + 
torchscript_onnx_ort: + inference_time: 1179.0 + throughput: 848.1764206955047 + estimated_peak_memory_range: + min: 8998912 + max: 8998912 + primary_compute_unit: NPU + precision: int8 + layer_info: + layers_on_npu: 19 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 19 + job_id: jogkyervp + job_status: Passed + torchscript_onnx_ort_dml_gpu: + inference_time: 72803.0 + throughput: 13.735697704764913 + estimated_peak_memory_range: + min: 32956416 + max: 32956416 + primary_compute_unit: GPU + precision: int8 + layer_info: + layers_on_npu: 0 + layers_on_gpu: 1 + layers_on_cpu: 0 + total_layers: 1 + job_id: jn5q269e5 job_status: Passed reference_device_info: - name: QCS8550 (Proxy) - os: '12' - form_factor: Iot - os_name: Android + name: Snapdragon X Elite CRD + os: '11' + form_factor: Compute + os_name: Windows manufacturer: Qualcomm - chipset: Qcs8550 - timestamp: '2024-04-23T18:42:33.426738Z' + chipset: Snapdragon® X Elite + timestamp: '2024-05-20T16:35:31.000558Z' diff --git a/qai_hub_models/models/sesr_m5_quantized/test.py b/qai_hub_models/models/sesr_m5_quantized/test.py index 86bb6543..0ed36c55 100644 --- a/qai_hub_models/models/sesr_m5_quantized/test.py +++ b/qai_hub_models/models/sesr_m5_quantized/test.py @@ -3,7 +3,6 @@ # SPDX-License-Identifier: BSD-3-Clause # --------------------------------------------------------------------- import os -import tempfile import zipfile import numpy as np @@ -18,7 +17,11 @@ MODEL_ID, SESR_M5Quantizable, ) -from qai_hub_models.utils.asset_loaders import CachedWebModelAsset, load_image +from qai_hub_models.utils.asset_loaders import ( + CachedWebModelAsset, + load_image, + qaihm_temp_dir, +) from qai_hub_models.utils.testing import assert_most_close, skip_clone_repo_check OUTPUT_IMAGE_LOCAL_PATH = "sesr_m5_quantized_demo_output.png" @@ -69,7 +72,7 @@ def test_trace(): def test_aimet_export(): model = SESR_M5Quantizable.from_pretrained() name = model.__class__.__name__ - with tempfile.TemporaryDirectory() as tmpdir: + with qaihm_temp_dir() as tmpdir: output_zip = model.convert_to_onnx_and_aimet_encodings( tmpdir, model.get_input_spec(), diff --git a/qai_hub_models/models/shufflenet_v2/README.md b/qai_hub_models/models/shufflenet_v2/README.md index 420ef994..97694e8a 100644 --- a/qai_hub_models/models/shufflenet_v2/README.md +++ b/qai_hub_models/models/shufflenet_v2/README.md @@ -14,6 +14,8 @@ accross various devices, can be found [here](https://aihub.qualcomm.com/models/s a hosted Qualcomm® device. + + ## Example & Usage diff --git a/qai_hub_models/models/shufflenet_v2/export.py b/qai_hub_models/models/shufflenet_v2/export.py index 9fe96bb7..046c346c 100644 --- a/qai_hub_models/models/shufflenet_v2/export.py +++ b/qai_hub_models/models/shufflenet_v2/export.py @@ -119,9 +119,16 @@ def export_model( model.eval() source_model = torch.jit.trace(model.to("cpu"), make_torch_inputs(input_spec)) + # Convert outputs from channel last to channel first (preferred I/O format for QNN and TensorFlow Lite) + channel_last_flags = ( + " --force_channel_last_input image_tensor" + if target_runtime != TargetRuntime.ORT + else "" + ) + # 2. 
Compile the model to an on-device asset model_compile_options = model.get_hub_compile_options( - target_runtime, compile_options + " --force_channel_last_input image_tensor" + target_runtime, compile_options + channel_last_flags, hub_device ) print(f"Optimizing model {model_name} to run on-device") submitted_compile_job = hub.submit_compile_job( @@ -159,8 +166,12 @@ def export_model( ) sample_inputs = model.sample_inputs(input_spec) # Convert inputs from channel first to channel last - hub_inputs = transpose_channel_first_to_last( - "image_tensor", sample_inputs, target_runtime + hub_inputs = ( + sample_inputs + if target_runtime == TargetRuntime.ORT + else transpose_channel_first_to_last( + "image_tensor", sample_inputs, target_runtime + ) ) submitted_inference_job = hub.submit_inference_job( model=compile_job.get_target_model(), @@ -197,7 +208,7 @@ def export_model( def main(): warnings.filterwarnings("ignore") - parser = export_parser(model_cls=Model, supports_ort=False) + parser = export_parser(model_cls=Model) args = parser.parse_args() export_model(**vars(args)) diff --git a/qai_hub_models/models/shufflenet_v2/perf.yaml b/qai_hub_models/models/shufflenet_v2/perf.yaml index 686e2320..cc6f20b7 100644 --- a/qai_hub_models/models/shufflenet_v2/perf.yaml +++ b/qai_hub_models/models/shufflenet_v2/perf.yaml @@ -22,6 +22,7 @@ aggregated: - Samsung Galaxy S24 Ultra - Samsung Galaxy S24+ - Samsung Galaxy Tab S8 + - Snapdragon X Elite CRD - Xiaomi 12 - Xiaomi 12 Pro supported_chipsets: @@ -30,15 +31,16 @@ aggregated: - Snapdragon® 8 Gen 2 - Snapdragon® 8 Gen 3 - Snapdragon® 888 + - Snapdragon® X Elite models: - name: Shufflenet-v2 performance_metrics: - torchscript_onnx_tflite: - inference_time: 1290.0 - throughput: 775.1937984496124 + inference_time: 1228.0 + throughput: 814.3322475570033 estimated_peak_memory_range: - min: 16384 - max: 6876504 + min: 12288 + max: 2415688 primary_compute_unit: NPU precision: fp16 layer_info: @@ -46,14 +48,14 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 204 - job_id: jz57099lg + job_id: j1p3mjqmg job_status: Passed torchscript_onnx_qnn: - inference_time: 797.0 - throughput: 1254.7051442910915 + inference_time: 765.0 + throughput: 1307.18954248366 estimated_peak_memory_range: - min: 622592 - max: 68665608 + min: 16384 + max: 4038080 primary_compute_unit: NPU precision: fp16 layer_info: @@ -61,22 +63,22 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 158 - job_id: j0pxnxx15 + job_id: j7gjlvk1p job_status: Passed torchscript_onnx_ort: - inference_time: 1264.0 - throughput: 791.1392405063291 + inference_time: 1085.0 + throughput: 921.6589861751152 estimated_peak_memory_range: - min: 12288 - max: 11265544 + min: 315392 + max: 4250040 primary_compute_unit: NPU precision: fp16 layer_info: - layers_on_npu: 1 + layers_on_npu: 223 layers_on_gpu: 0 layers_on_cpu: 0 - total_layers: 1 - job_id: jegnlkkr5 + total_layers: 223 + job_id: jmg94l9m5 job_status: Passed reference_device_info: name: Samsung Galaxy S23 @@ -85,13 +87,13 @@ models: os_name: Android manufacturer: Samsung chipset: Snapdragon® 8 Gen 2 - timestamp: '2024-04-23T18:42:33.447120Z' + timestamp: '2024-05-20T16:35:31.039692Z' - torchscript_onnx_tflite: - inference_time: 855.0 - throughput: 1169.5906432748538 + inference_time: 791.0 + throughput: 1264.2225031605562 estimated_peak_memory_range: - min: 16384 - max: 33284208 + min: 20480 + max: 33699040 primary_compute_unit: NPU precision: fp16 layer_info: @@ -99,14 +101,14 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 204 - 
job_id: jqp4k33vg + job_id: jwgov2e15 job_status: Passed torchscript_onnx_qnn: - inference_time: 528.0 - throughput: 1893.939393939394 + inference_time: 515.0 + throughput: 1941.7475728155339 estimated_peak_memory_range: - min: 618496 - max: 53183776 + min: 12288 + max: 56897984 primary_compute_unit: NPU precision: fp16 layer_info: @@ -114,22 +116,22 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 158 - job_id: jo5mq88wp + job_id: jlpevd485 job_status: Passed torchscript_onnx_ort: - inference_time: 836.0 - throughput: 1196.1722488038276 + inference_time: 742.0 + throughput: 1347.7088948787061 estimated_peak_memory_range: min: 12288 - max: 17464352 + max: 24844160 primary_compute_unit: NPU precision: fp16 layer_info: - layers_on_npu: 1 + layers_on_npu: 223 layers_on_gpu: 0 layers_on_cpu: 0 - total_layers: 1 - job_id: jopr8ww95 + total_layers: 223 + job_id: jnp184qng job_status: Passed reference_device_info: name: Samsung Galaxy S24 @@ -138,13 +140,13 @@ models: os_name: Android manufacturer: Samsung chipset: Snapdragon® 8 Gen 3 - timestamp: '2024-04-23T18:42:33.447190Z' + timestamp: '2024-05-20T16:35:31.039719Z' - torchscript_onnx_tflite: - inference_time: 1291.0 - throughput: 774.5933384972889 + inference_time: 1227.0 + throughput: 814.9959250203749 estimated_peak_memory_range: min: 20480 - max: 6952312 + max: 1798552 primary_compute_unit: NPU precision: fp16 layer_info: @@ -152,14 +154,14 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 204 - job_id: jz5w201j5 + job_id: j1pvw6zzg job_status: Passed torchscript_onnx_qnn: - inference_time: 803.0 - throughput: 1245.3300124533 + inference_time: 762.0 + throughput: 1312.3359580052493 estimated_peak_memory_range: - min: 618496 - max: 103577192 + min: 622592 + max: 4805336 primary_compute_unit: NPU precision: fp16 layer_info: @@ -167,7 +169,7 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 158 - job_id: j0pxn8x95 + job_id: jz5w9em4p job_status: Passed reference_device_info: name: QCS8550 (Proxy) @@ -176,4 +178,57 @@ models: os_name: Android manufacturer: Qualcomm chipset: Qcs8550 - timestamp: '2024-04-23T18:42:33.447245Z' + timestamp: '2024-05-20T16:35:31.039737Z' + - torchscript_onnx_qnn: + inference_time: 929.0 + throughput: 1076.4262648008612 + estimated_peak_memory_range: + min: 602112 + max: 602112 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 158 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 158 + job_id: jygz73v4p + job_status: Passed + torchscript_onnx_ort: + inference_time: 1125.0 + throughput: 888.8888888888889 + estimated_peak_memory_range: + min: 10477568 + max: 10477568 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 223 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 223 + job_id: jvgdvx76g + job_status: Passed + torchscript_onnx_ort_dml_gpu: + inference_time: 1715.0 + throughput: 583.0903790087464 + estimated_peak_memory_range: + min: 12304384 + max: 12304384 + primary_compute_unit: GPU + precision: fp16 + layer_info: + layers_on_npu: 0 + layers_on_gpu: 1 + layers_on_cpu: 0 + total_layers: 1 + job_id: jz57dyvn5 + job_status: Passed + reference_device_info: + name: Snapdragon X Elite CRD + os: '11' + form_factor: Compute + os_name: Windows + manufacturer: Qualcomm + chipset: Snapdragon® X Elite + timestamp: '2024-05-20T16:35:31.039760Z' diff --git a/qai_hub_models/models/shufflenet_v2_quantized/README.md b/qai_hub_models/models/shufflenet_v2_quantized/README.md index 3a11090f..2d50ee72 100644 --- 
a/qai_hub_models/models/shufflenet_v2_quantized/README.md +++ b/qai_hub_models/models/shufflenet_v2_quantized/README.md @@ -14,6 +14,8 @@ accross various devices, can be found [here](https://aihub.qualcomm.com/models/s a hosted Qualcomm® device. + + ## Example & Usage diff --git a/qai_hub_models/models/shufflenet_v2_quantized/export.py b/qai_hub_models/models/shufflenet_v2_quantized/export.py index 59ef9fee..beda56ff 100644 --- a/qai_hub_models/models/shufflenet_v2_quantized/export.py +++ b/qai_hub_models/models/shufflenet_v2_quantized/export.py @@ -123,9 +123,16 @@ def export_model( else: quant_calibration_data = model.get_calibration_data(target_runtime, input_spec) + # Convert outputs from channel last to channel first (preferred I/O format for QNN and TensorFlow Lite) + channel_last_flags = ( + " --force_channel_last_input image_tensor" + if target_runtime != TargetRuntime.ORT + else "" + ) + # 2. Compile the model to an on-device asset model_compile_options = model.get_hub_compile_options( - target_runtime, compile_options + " --force_channel_last_input image_tensor" + target_runtime, compile_options + channel_last_flags, hub_device ) print(f"Optimizing model {model_name} to run on-device") submitted_compile_job = hub.submit_compile_job( @@ -167,8 +174,12 @@ def export_model( if target_runtime == TargetRuntime.QNN: hub_inputs = get_qnn_inputs(compile_job, sample_inputs) # Convert inputs from channel first to channel last - hub_inputs = transpose_channel_first_to_last( - "image_tensor", sample_inputs, target_runtime + hub_inputs = ( + sample_inputs + if target_runtime == TargetRuntime.ORT + else transpose_channel_first_to_last( + "image_tensor", sample_inputs, target_runtime + ) ) submitted_inference_job = hub.submit_inference_job( model=compile_job.get_target_model(), diff --git a/qai_hub_models/models/shufflenet_v2_quantized/perf.yaml b/qai_hub_models/models/shufflenet_v2_quantized/perf.yaml index b3cf96f4..fe83b379 100644 --- a/qai_hub_models/models/shufflenet_v2_quantized/perf.yaml +++ b/qai_hub_models/models/shufflenet_v2_quantized/perf.yaml @@ -26,6 +26,7 @@ aggregated: - Samsung Galaxy S24 Ultra - Samsung Galaxy S24+ - Samsung Galaxy Tab S8 + - Snapdragon X Elite CRD - Xiaomi 12 - Xiaomi 12 Pro supported_chipsets: @@ -36,15 +37,16 @@ aggregated: - Snapdragon® 8 Gen 2 - Snapdragon® 8 Gen 3 - Snapdragon® 888 + - Snapdragon® X Elite models: - name: Shufflenet-v2Quantized performance_metrics: - torchscript_onnx_tflite: - inference_time: 644.0 - throughput: 1552.7950310559006 + inference_time: 629.0 + throughput: 1589.825119236884 estimated_peak_memory_range: min: 12288 - max: 1838712 + max: 1960224 primary_compute_unit: NPU precision: int8 layer_info: @@ -52,14 +54,14 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 205 - job_id: jqpyrmm75 + job_id: jqp4wlj2g job_status: Passed torchscript_onnx_qnn: - inference_time: 592.0 - throughput: 1689.1891891891892 + inference_time: 584.0 + throughput: 1712.3287671232877 estimated_peak_memory_range: - min: 172032 - max: 9372520 + min: 24576 + max: 3645424 primary_compute_unit: NPU precision: int8 layer_info: @@ -67,8 +69,23 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 122 - job_id: j1p8011xg + job_id: jegne6rjg job_status: Passed + torchscript_onnx_ort: + inference_time: 'null' + throughput: 'null' + estimated_peak_memory_range: + min: 0 + max: 0 + primary_compute_unit: 'null' + precision: 'null' + layer_info: + layers_on_npu: 0 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 0 + job_id: j2p0rzk0p + job_status: 
Failed reference_device_info: name: Samsung Galaxy S23 os: '13' @@ -76,13 +93,13 @@ models: os_name: Android manufacturer: Samsung chipset: Snapdragon® 8 Gen 2 - timestamp: '2024-04-23T18:42:33.471327Z' + timestamp: '2024-05-20T16:48:45.827261Z' - torchscript_onnx_tflite: - inference_time: 464.0 - throughput: 2155.1724137931033 + inference_time: 458.0 + throughput: 2183.406113537118 estimated_peak_memory_range: min: 12288 - max: 22792592 + max: 22451232 primary_compute_unit: NPU precision: int8 layer_info: @@ -90,14 +107,14 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 205 - job_id: j2p03666p + job_id: j0px1ke8g job_status: Passed torchscript_onnx_qnn: - inference_time: 424.0 - throughput: 2358.490566037736 + inference_time: 419.0 + throughput: 2386.634844868735 estimated_peak_memory_range: min: 163840 - max: 45354944 + max: 45935136 primary_compute_unit: NPU precision: int8 layer_info: @@ -105,8 +122,23 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 122 - job_id: jogk7882p + job_id: jopryv1kg job_status: Passed + torchscript_onnx_ort: + inference_time: 'null' + throughput: 'null' + estimated_peak_memory_range: + min: 0 + max: 0 + primary_compute_unit: 'null' + precision: 'null' + layer_info: + layers_on_npu: 0 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 0 + job_id: j1p87q8q5 + job_status: Failed reference_device_info: name: Samsung Galaxy S24 os: '14' @@ -114,37 +146,75 @@ models: os_name: Android manufacturer: Samsung chipset: Snapdragon® 8 Gen 3 - timestamp: '2024-04-23T18:42:33.471384Z' + timestamp: '2024-05-20T16:48:45.827325Z' - torchscript_onnx_tflite: - inference_time: 1064.0 - throughput: 939.8496240601504 + inference_time: 649.0 + throughput: 1540.8320493066255 estimated_peak_memory_range: min: 12288 - max: 16582800 + max: 1657808 primary_compute_unit: NPU precision: int8 layer_info: - layers_on_npu: 207 + layers_on_npu: 205 layers_on_gpu: 0 layers_on_cpu: 0 - total_layers: 207 - job_id: jogk7w8op + total_layers: 205 + job_id: jn5q3dwmp job_status: Passed torchscript_onnx_qnn: - inference_time: 'null' - throughput: 'null' + inference_time: 585.0 + throughput: 1709.4017094017095 estimated_peak_memory_range: - min: 0 - max: 0 - primary_compute_unit: 'null' - precision: 'null' + min: 16384 + max: 13811248 + primary_compute_unit: NPU + precision: int8 layer_info: - layers_on_npu: 0 + layers_on_npu: 122 layers_on_gpu: 0 layers_on_cpu: 0 - total_layers: 0 - job_id: jwgok78dp - job_status: Failed + total_layers: 122 + job_id: jqpyd1v0p + job_status: Passed + reference_device_info: + name: QCS8550 (Proxy) + os: '12' + form_factor: Iot + os_name: Android + manufacturer: Qualcomm + chipset: Qcs8550 + timestamp: '2024-05-20T16:48:45.827382Z' + - torchscript_onnx_tflite: + inference_time: 946.0 + throughput: 1057.0824524312895 + estimated_peak_memory_range: + min: 12288 + max: 16954944 + primary_compute_unit: NPU + precision: int8 + layer_info: + layers_on_npu: 205 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 205 + job_id: j1gl3q7lg + job_status: Passed + torchscript_onnx_qnn: + inference_time: 1140.0 + throughput: 877.1929824561404 + estimated_peak_memory_range: + min: 294912 + max: 42839088 + primary_compute_unit: NPU + precision: int8 + layer_info: + layers_on_npu: 122 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 122 + job_id: jlpekn20p + job_status: Passed reference_device_info: name: RB3 Gen 2 (Proxy) os: '12' @@ -152,21 +222,21 @@ models: os_name: Android manufacturer: Qualcomm chipset: Qcs6490 - timestamp: 
'2024-04-23T18:42:33.471427Z' + timestamp: '2024-05-20T16:48:45.827429Z' - torchscript_onnx_tflite: - inference_time: 10090.0 - throughput: 99.10802775024777 + inference_time: 8918.0 + throughput: 112.13276519398968 estimated_peak_memory_range: - min: 12288 - max: 6455280 + min: 53248 + max: 6490632 primary_compute_unit: CPU precision: fp32 layer_info: - layers_on_npu: 44 + layers_on_npu: 43 layers_on_gpu: 9 - layers_on_cpu: 154 - total_layers: 207 - job_id: jz5w3y9jp + layers_on_cpu: 153 + total_layers: 205 + job_id: jw56n0v7g job_status: Passed reference_device_info: name: RB5 (Proxy) @@ -175,42 +245,57 @@ models: os_name: Android manufacturer: Qualcomm chipset: Qcs8250 - timestamp: '2024-04-23T18:42:33.471459Z' - - torchscript_onnx_tflite: - inference_time: 667.0 - throughput: 1499.2503748125937 + timestamp: '2024-05-20T16:48:45.827459Z' + - torchscript_onnx_qnn: + inference_time: 669.0 + throughput: 1494.7683109118086 estimated_peak_memory_range: - min: 24576 - max: 2164120 + min: 532480 + max: 532480 primary_compute_unit: NPU precision: int8 layer_info: - layers_on_npu: 207 + layers_on_npu: 122 layers_on_gpu: 0 layers_on_cpu: 0 - total_layers: 207 - job_id: jn5qexvm5 + total_layers: 122 + job_id: jep2mk365 job_status: Passed - torchscript_onnx_qnn: - inference_time: 618.0 - throughput: 1618.1229773462783 + torchscript_onnx_ort: + inference_time: 'null' + throughput: 'null' estimated_peak_memory_range: - min: 634880 - max: 8982056 - primary_compute_unit: NPU - precision: int8 + min: 0 + max: 0 + primary_compute_unit: 'null' + precision: 'null' layer_info: - layers_on_npu: 124 + layers_on_npu: 0 layers_on_gpu: 0 layers_on_cpu: 0 - total_layers: 124 - job_id: j7gjz6q85 + total_layers: 0 + job_id: jogkyedvp + job_status: Failed + torchscript_onnx_ort_dml_gpu: + inference_time: 1478.0 + throughput: 676.5899864682003 + estimated_peak_memory_range: + min: 6258688 + max: 6258688 + primary_compute_unit: CPU + precision: fp32 + layer_info: + layers_on_npu: 0 + layers_on_gpu: 0 + layers_on_cpu: 263 + total_layers: 263 + job_id: jn5q26we5 job_status: Passed reference_device_info: - name: QCS8550 (Proxy) - os: '12' - form_factor: Iot - os_name: Android + name: Snapdragon X Elite CRD + os: '11' + form_factor: Compute + os_name: Windows manufacturer: Qualcomm - chipset: Qcs8550 - timestamp: '2024-04-23T18:42:33.471511Z' + chipset: Snapdragon® X Elite + timestamp: '2024-05-20T16:48:45.827519Z' diff --git a/qai_hub_models/models/sinet/README.md b/qai_hub_models/models/sinet/README.md index 48577d92..601b6d46 100644 --- a/qai_hub_models/models/sinet/README.md +++ b/qai_hub_models/models/sinet/README.md @@ -14,6 +14,8 @@ accross various devices, can be found [here](https://aihub.qualcomm.com/models/s a hosted Qualcomm® device. + + ## Example & Usage diff --git a/qai_hub_models/models/sinet/export.py b/qai_hub_models/models/sinet/export.py index ad102c99..6840f297 100644 --- a/qai_hub_models/models/sinet/export.py +++ b/qai_hub_models/models/sinet/export.py @@ -120,12 +120,16 @@ def export_model( model.eval() source_model = torch.jit.trace(model.to("cpu"), make_torch_inputs(input_spec)) + # Convert outputs from channel last to channel first (preferred I/O format for QNN and TensorFlow Lite) + channel_last_flags = ( + " --force_channel_last_input image" + " --force_channel_last_output output_0" + if target_runtime != TargetRuntime.ORT + else "" + ) + # 2. 
Compile the model to an on-device asset model_compile_options = model.get_hub_compile_options( - target_runtime, - compile_options - + " --force_channel_last_input image" - + " --force_channel_last_output output_0", + target_runtime, compile_options + channel_last_flags, hub_device ) print(f"Optimizing model {model_name} to run on-device") submitted_compile_job = hub.submit_compile_job( @@ -163,8 +167,10 @@ def export_model( ) sample_inputs = model.sample_inputs(input_spec) # Convert inputs from channel first to channel last - hub_inputs = transpose_channel_first_to_last( - "image", sample_inputs, target_runtime + hub_inputs = ( + sample_inputs + if target_runtime == TargetRuntime.ORT + else transpose_channel_first_to_last("image", sample_inputs, target_runtime) ) submitted_inference_job = hub.submit_inference_job( model=compile_job.get_target_model(), @@ -192,8 +198,12 @@ def export_model( assert inference_job is not None and inference_job.wait().success inference_result: hub.client.DatasetEntries = inference_job.download_output_data() # type: ignore # Convert outputs from channel last to channel first - inference_result = transpose_channel_last_to_first( - "output_0", inference_result, target_runtime + inference_result = ( + inference_result + if target_runtime == TargetRuntime.ORT + else transpose_channel_last_to_first( + "output_0", inference_result, target_runtime + ) ) print_inference_metrics(inference_job, inference_result, torch_out) @@ -205,7 +215,7 @@ def export_model( def main(): warnings.filterwarnings("ignore") - parser = export_parser(model_cls=Model, supports_ort=False) + parser = export_parser(model_cls=Model) args = parser.parse_args() export_model(**vars(args)) diff --git a/qai_hub_models/models/sinet/perf.yaml b/qai_hub_models/models/sinet/perf.yaml index 9f70c128..f4e03b55 100644 --- a/qai_hub_models/models/sinet/perf.yaml +++ b/qai_hub_models/models/sinet/perf.yaml @@ -22,6 +22,7 @@ aggregated: - Samsung Galaxy S24 Ultra - Samsung Galaxy S24+ - Samsung Galaxy Tab S8 + - Snapdragon X Elite CRD - Xiaomi 12 - Xiaomi 12 Pro supported_chipsets: @@ -30,15 +31,16 @@ aggregated: - Snapdragon® 8 Gen 2 - Snapdragon® 8 Gen 3 - Snapdragon® 888 + - Snapdragon® X Elite models: - name: SINet performance_metrics: - torchscript_onnx_tflite: - inference_time: 1826.0 - throughput: 547.645125958379 + inference_time: 1797.0 + throughput: 556.4830272676684 estimated_peak_memory_range: min: 12288 - max: 2609144 + max: 2452968 primary_compute_unit: NPU precision: fp16 layer_info: @@ -46,14 +48,14 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 240 - job_id: jn5qevv45 + job_id: j1glkv72p job_status: Passed torchscript_onnx_qnn: - inference_time: 1184.0 - throughput: 844.5945945945946 + inference_time: 1171.0 + throughput: 853.9709649871904 estimated_peak_memory_range: - min: 618496 - max: 4714320 + min: 2113536 + max: 14886760 primary_compute_unit: NPU precision: fp16 layer_info: @@ -61,23 +63,23 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 186 - job_id: jw56eww0g + job_id: jwgov2m15 job_status: Passed torchscript_onnx_ort: - inference_time: 'null' - throughput: 'null' + inference_time: 2285.0 + throughput: 437.636761487965 estimated_peak_memory_range: - min: 0 - max: 0 - primary_compute_unit: 'null' - precision: 'null' + min: 618496 + max: 35536752 + primary_compute_unit: NPU + precision: fp16 layer_info: - layers_on_npu: 0 + layers_on_npu: 229 layers_on_gpu: 0 layers_on_cpu: 0 - total_layers: 0 - job_id: jwgok88xp - job_status: Failed + total_layers: 229 + job_id: 
jygz73w4p + job_status: Passed reference_device_info: name: Samsung Galaxy S23 os: '13' @@ -85,13 +87,13 @@ models: os_name: Android manufacturer: Samsung chipset: Snapdragon® 8 Gen 2 - timestamp: '2024-04-23T18:42:33.500477Z' + timestamp: '2024-05-20T16:35:31.109606Z' - torchscript_onnx_tflite: - inference_time: 1171.0 - throughput: 853.9709649871904 + inference_time: 1169.0 + throughput: 855.4319931565441 estimated_peak_memory_range: min: 12288 - max: 25301888 + max: 25617584 primary_compute_unit: NPU precision: fp16 layer_info: @@ -99,14 +101,14 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 240 - job_id: j1gl6ll8g + job_id: jw561yvnp job_status: Passed torchscript_onnx_qnn: - inference_time: 799.0 - throughput: 1251.5644555694619 + inference_time: 780.0 + throughput: 1282.051282051282 estimated_peak_memory_range: - min: 12288 - max: 64850320 + min: 618496 + max: 71418032 primary_compute_unit: NPU precision: fp16 layer_info: @@ -114,23 +116,23 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 186 - job_id: j1p3v66lg + job_id: j1pvw64zg job_status: Passed torchscript_onnx_ort: - inference_time: 'null' - throughput: 'null' + inference_time: 1599.0 + throughput: 625.3908692933084 estimated_peak_memory_range: - min: 0 - max: 0 - primary_compute_unit: 'null' - precision: 'null' + min: 12288 + max: 27418000 + primary_compute_unit: NPU + precision: fp16 layer_info: - layers_on_npu: 0 + layers_on_npu: 229 layers_on_gpu: 0 layers_on_cpu: 0 - total_layers: 0 - job_id: j1pv077j5 - job_status: Failed + total_layers: 229 + job_id: jz5w9ex4p + job_status: Passed reference_device_info: name: Samsung Galaxy S24 os: '14' @@ -138,13 +140,13 @@ models: os_name: Android manufacturer: Samsung chipset: Snapdragon® 8 Gen 3 - timestamp: '2024-04-23T18:42:33.500551Z' + timestamp: '2024-05-20T16:35:31.109633Z' - torchscript_onnx_tflite: - inference_time: 1823.0 - throughput: 548.5463521667581 + inference_time: 1810.0 + throughput: 552.4861878453039 estimated_peak_memory_range: - min: 24576 - max: 1974184 + min: 16384 + max: 2390784 primary_compute_unit: NPU precision: fp16 layer_info: @@ -152,14 +154,14 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 240 - job_id: jz57014rg + job_id: j1p3mj8mg job_status: Passed torchscript_onnx_qnn: - inference_time: 1185.0 - throughput: 843.8818565400844 + inference_time: 1168.0 + throughput: 856.1643835616438 estimated_peak_memory_range: - min: 634880 - max: 5992992 + min: 626688 + max: 8177944 primary_compute_unit: NPU precision: fp16 layer_info: @@ -167,7 +169,7 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 186 - job_id: jopr8m0e5 + job_id: jlpevd285 job_status: Passed reference_device_info: name: QCS8550 (Proxy) @@ -176,4 +178,57 @@ models: os_name: Android manufacturer: Qualcomm chipset: Qcs8550 - timestamp: '2024-04-23T18:42:33.500612Z' + timestamp: '2024-05-20T16:35:31.109650Z' + - torchscript_onnx_qnn: + inference_time: 1401.0 + throughput: 713.7758743754462 + estimated_peak_memory_range: + min: 602112 + max: 602112 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 186 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 186 + job_id: j7gjlv11p + job_status: Passed + torchscript_onnx_ort: + inference_time: 2469.0 + throughput: 405.0222762251924 + estimated_peak_memory_range: + min: 3219456 + max: 3219456 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 229 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 229 + job_id: jmg94l8m5 + job_status: Passed + 
torchscript_onnx_ort_dml_gpu: + inference_time: 2976.0 + throughput: 336.02150537634407 + estimated_peak_memory_range: + min: 13578240 + max: 13578240 + primary_compute_unit: GPU + precision: fp16 + layer_info: + layers_on_npu: 0 + layers_on_gpu: 1 + layers_on_cpu: 0 + total_layers: 1 + job_id: jnp1843ng + job_status: Passed + reference_device_info: + name: Snapdragon X Elite CRD + os: '11' + form_factor: Compute + os_name: Windows + manufacturer: Qualcomm + chipset: Snapdragon® X Elite + timestamp: '2024-05-20T16:35:31.109674Z' diff --git a/qai_hub_models/models/squeezenet1_1/README.md b/qai_hub_models/models/squeezenet1_1/README.md index e35838b5..879ef789 100644 --- a/qai_hub_models/models/squeezenet1_1/README.md +++ b/qai_hub_models/models/squeezenet1_1/README.md @@ -14,6 +14,8 @@ accross various devices, can be found [here](https://aihub.qualcomm.com/models/s a hosted Qualcomm® device. + + ## Example & Usage diff --git a/qai_hub_models/models/squeezenet1_1/export.py b/qai_hub_models/models/squeezenet1_1/export.py index c1840cfa..21488484 100644 --- a/qai_hub_models/models/squeezenet1_1/export.py +++ b/qai_hub_models/models/squeezenet1_1/export.py @@ -119,9 +119,16 @@ def export_model( model.eval() source_model = torch.jit.trace(model.to("cpu"), make_torch_inputs(input_spec)) + # Convert outputs from channel last to channel first (preferred I/O format for QNN and TensorFlow Lite) + channel_last_flags = ( + " --force_channel_last_input image_tensor" + if target_runtime != TargetRuntime.ORT + else "" + ) + # 2. Compile the model to an on-device asset model_compile_options = model.get_hub_compile_options( - target_runtime, compile_options + " --force_channel_last_input image_tensor" + target_runtime, compile_options + channel_last_flags, hub_device ) print(f"Optimizing model {model_name} to run on-device") submitted_compile_job = hub.submit_compile_job( @@ -159,8 +166,12 @@ def export_model( ) sample_inputs = model.sample_inputs(input_spec) # Convert inputs from channel first to channel last - hub_inputs = transpose_channel_first_to_last( - "image_tensor", sample_inputs, target_runtime + hub_inputs = ( + sample_inputs + if target_runtime == TargetRuntime.ORT + else transpose_channel_first_to_last( + "image_tensor", sample_inputs, target_runtime + ) ) submitted_inference_job = hub.submit_inference_job( model=compile_job.get_target_model(), @@ -197,7 +208,7 @@ def export_model( def main(): warnings.filterwarnings("ignore") - parser = export_parser(model_cls=Model, supports_ort=False) + parser = export_parser(model_cls=Model) args = parser.parse_args() export_model(**vars(args)) diff --git a/qai_hub_models/models/squeezenet1_1/perf.yaml b/qai_hub_models/models/squeezenet1_1/perf.yaml index 0326ac8b..e6eb2648 100644 --- a/qai_hub_models/models/squeezenet1_1/perf.yaml +++ b/qai_hub_models/models/squeezenet1_1/perf.yaml @@ -22,6 +22,7 @@ aggregated: - Samsung Galaxy S24 Ultra - Samsung Galaxy S24+ - Samsung Galaxy Tab S8 + - Snapdragon X Elite CRD - Xiaomi 12 - Xiaomi 12 Pro supported_chipsets: @@ -30,15 +31,16 @@ aggregated: - Snapdragon® 8 Gen 2 - Snapdragon® 8 Gen 3 - Snapdragon® 888 + - Snapdragon® X Elite models: - name: SqueezeNet-1_1 performance_metrics: - torchscript_onnx_tflite: - inference_time: 672.0 - throughput: 1488.095238095238 + inference_time: 664.0 + throughput: 1506.0240963855422 estimated_peak_memory_range: min: 12288 - max: 1740976 + max: 1506784 primary_compute_unit: NPU precision: fp16 layer_info: @@ -46,14 +48,14 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 
41 - job_id: jlpeeyy1p + job_id: jvgdvx06g job_status: Passed torchscript_onnx_qnn: - inference_time: 711.0 - throughput: 1406.4697609001407 + inference_time: 712.0 + throughput: 1404.4943820224719 estimated_peak_memory_range: - min: 638976 - max: 12256680 + min: 618496 + max: 7468520 primary_compute_unit: NPU precision: fp16 layer_info: @@ -61,22 +63,22 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 70 - job_id: jz5w24765 + job_id: j0px1km8g job_status: Passed torchscript_onnx_ort: - inference_time: 861.0 - throughput: 1161.4401858304298 + inference_time: 651.0 + throughput: 1536.0983102918588 estimated_peak_memory_range: min: 12288 - max: 10395112 + max: 7201352 primary_compute_unit: NPU precision: fp16 layer_info: - layers_on_npu: 1 + layers_on_npu: 71 layers_on_gpu: 0 layers_on_cpu: 0 - total_layers: 1 - job_id: jnp1y6j2p + total_layers: 71 + job_id: jep2mkj65 job_status: Passed reference_device_info: name: Samsung Galaxy S23 @@ -85,13 +87,13 @@ models: os_name: Android manufacturer: Samsung chipset: Snapdragon® 8 Gen 2 - timestamp: '2024-04-23T18:42:33.524625Z' + timestamp: '2024-05-20T16:35:31.140012Z' - torchscript_onnx_tflite: - inference_time: 453.0 - throughput: 2207.5055187637968 + inference_time: 477.0 + throughput: 2096.4360587002097 estimated_peak_memory_range: - min: 12288 - max: 22540768 + min: 0 + max: 22219968 primary_compute_unit: NPU precision: fp16 layer_info: @@ -99,14 +101,14 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 41 - job_id: jygzonnk5 + job_id: jz57dy6n5 job_status: Passed torchscript_onnx_qnn: inference_time: 490.0 throughput: 2040.8163265306123 estimated_peak_memory_range: min: 618496 - max: 28785760 + max: 27578288 primary_compute_unit: NPU precision: fp16 layer_info: @@ -114,22 +116,22 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 70 - job_id: jmg9jdml5 + job_id: jo5mzn47p job_status: Passed torchscript_onnx_ort: - inference_time: 618.0 - throughput: 1618.1229773462783 + inference_time: 488.0 + throughput: 2049.1803278688526 estimated_peak_memory_range: - min: 618496 - max: 20314848 + min: 24576 + max: 17829040 primary_compute_unit: NPU precision: fp16 layer_info: - layers_on_npu: 1 + layers_on_npu: 71 layers_on_gpu: 0 layers_on_cpu: 0 - total_layers: 1 - job_id: jvgde23e5 + total_layers: 71 + job_id: jqpyd1n0p job_status: Passed reference_device_info: name: Samsung Galaxy S24 @@ -138,13 +140,13 @@ models: os_name: Android manufacturer: Samsung chipset: Snapdragon® 8 Gen 3 - timestamp: '2024-04-23T18:42:33.524666Z' + timestamp: '2024-05-20T16:35:31.140037Z' - torchscript_onnx_tflite: - inference_time: 672.0 - throughput: 1488.095238095238 + inference_time: 664.0 + throughput: 1506.0240963855422 estimated_peak_memory_range: - min: 12288 - max: 1757808 + min: 20480 + max: 1789832 primary_compute_unit: NPU precision: fp16 layer_info: @@ -152,14 +154,14 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 41 - job_id: j1pv0ydm5 + job_id: jqp4wl82g job_status: Passed torchscript_onnx_qnn: - inference_time: 718.0 - throughput: 1392.757660167131 + inference_time: 701.0 + throughput: 1426.5335235378031 estimated_peak_memory_range: - min: 618496 - max: 75568808 + min: 626688 + max: 3276112 primary_compute_unit: NPU precision: fp16 layer_info: @@ -167,7 +169,7 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 70 - job_id: jnp1ykjlp + job_id: jopryv9kg job_status: Passed reference_device_info: name: QCS8550 (Proxy) @@ -176,4 +178,57 @@ models: os_name: Android manufacturer: Qualcomm chipset: Qcs8550 - timestamp: 
'2024-04-23T18:42:33.524695Z' + timestamp: '2024-05-20T16:35:31.140054Z' + - torchscript_onnx_qnn: + inference_time: 828.0 + throughput: 1207.729468599034 + estimated_peak_memory_range: + min: 602112 + max: 602112 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 70 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 70 + job_id: jegne6xjg + job_status: Passed + torchscript_onnx_ort: + inference_time: 696.0 + throughput: 1436.7816091954023 + estimated_peak_memory_range: + min: 3063808 + max: 3063808 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 71 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 71 + job_id: j2p0rzd0p + job_status: Passed + torchscript_onnx_ort_dml_gpu: + inference_time: 2093.0 + throughput: 477.78308647873865 + estimated_peak_memory_range: + min: 9494528 + max: 9494528 + primary_compute_unit: CPU + precision: fp32 + layer_info: + layers_on_npu: 0 + layers_on_gpu: 0 + layers_on_cpu: 41 + total_layers: 41 + job_id: j1p87q6q5 + job_status: Passed + reference_device_info: + name: Snapdragon X Elite CRD + os: '11' + form_factor: Compute + os_name: Windows + manufacturer: Qualcomm + chipset: Snapdragon® X Elite + timestamp: '2024-05-20T16:35:31.140076Z' diff --git a/qai_hub_models/models/squeezenet1_1_quantized/README.md b/qai_hub_models/models/squeezenet1_1_quantized/README.md index 305ab62f..f13301a8 100644 --- a/qai_hub_models/models/squeezenet1_1_quantized/README.md +++ b/qai_hub_models/models/squeezenet1_1_quantized/README.md @@ -14,6 +14,8 @@ accross various devices, can be found [here](https://aihub.qualcomm.com/models/s a hosted Qualcomm® device. + + ## Example & Usage diff --git a/qai_hub_models/models/squeezenet1_1_quantized/export.py b/qai_hub_models/models/squeezenet1_1_quantized/export.py index 202478bd..964eb563 100644 --- a/qai_hub_models/models/squeezenet1_1_quantized/export.py +++ b/qai_hub_models/models/squeezenet1_1_quantized/export.py @@ -123,9 +123,16 @@ def export_model( else: quant_calibration_data = model.get_calibration_data(target_runtime, input_spec) + # Convert outputs from channel last to channel first (preferred I/O format for QNN and TensorFlow Lite) + channel_last_flags = ( + " --force_channel_last_input image_tensor" + if target_runtime != TargetRuntime.ORT + else "" + ) + # 2. 
Compile the model to an on-device asset model_compile_options = model.get_hub_compile_options( - target_runtime, compile_options + " --force_channel_last_input image_tensor" + target_runtime, compile_options + channel_last_flags, hub_device ) print(f"Optimizing model {model_name} to run on-device") submitted_compile_job = hub.submit_compile_job( @@ -167,8 +174,12 @@ def export_model( if target_runtime == TargetRuntime.QNN: hub_inputs = get_qnn_inputs(compile_job, sample_inputs) # Convert inputs from channel first to channel last - hub_inputs = transpose_channel_first_to_last( - "image_tensor", sample_inputs, target_runtime + hub_inputs = ( + sample_inputs + if target_runtime == TargetRuntime.ORT + else transpose_channel_first_to_last( + "image_tensor", sample_inputs, target_runtime + ) ) submitted_inference_job = hub.submit_inference_job( model=compile_job.get_target_model(), @@ -205,7 +216,7 @@ def export_model( def main(): warnings.filterwarnings("ignore") - parser = export_parser(model_cls=Model, supports_ort=False) + parser = export_parser(model_cls=Model) args = parser.parse_args() export_model(**vars(args)) diff --git a/qai_hub_models/models/squeezenet1_1_quantized/perf.yaml b/qai_hub_models/models/squeezenet1_1_quantized/perf.yaml index 0ee10d5b..d7ad8b01 100644 --- a/qai_hub_models/models/squeezenet1_1_quantized/perf.yaml +++ b/qai_hub_models/models/squeezenet1_1_quantized/perf.yaml @@ -26,6 +26,7 @@ aggregated: - Samsung Galaxy S24 Ultra - Samsung Galaxy S24+ - Samsung Galaxy Tab S8 + - Snapdragon X Elite CRD - Xiaomi 12 - Xiaomi 12 Pro supported_chipsets: @@ -36,15 +37,16 @@ aggregated: - Snapdragon® 8 Gen 2 - Snapdragon® 8 Gen 3 - Snapdragon® 888 + - Snapdragon® X Elite models: - name: SqueezeNet-1_1Quantized performance_metrics: - torchscript_onnx_tflite: - inference_time: 218.0 - throughput: 4587.155963302752 + inference_time: 221.0 + throughput: 4524.886877828054 estimated_peak_memory_range: - min: 24576 - max: 1453208 + min: 12288 + max: 2523424 primary_compute_unit: NPU precision: int8 layer_info: @@ -52,14 +54,14 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 41 - job_id: jmg9jdmw5 + job_id: jogkyeovp job_status: Passed torchscript_onnx_qnn: - inference_time: 466.0 - throughput: 2145.922746781116 + inference_time: 467.0 + throughput: 2141.3276231263385 estimated_peak_memory_range: - min: 12288 - max: 10115704 + min: 176128 + max: 80481816 primary_compute_unit: NPU precision: int8 layer_info: @@ -67,22 +69,22 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 45 - job_id: jvgde23r5 + job_id: jw561yrnp job_status: Passed torchscript_onnx_ort: - inference_time: 811.0 - throughput: 1233.0456226880394 + inference_time: 550.0 + throughput: 1818.1818181818182 estimated_peak_memory_range: min: 618496 - max: 5355192 + max: 7743200 primary_compute_unit: NPU precision: int8 layer_info: - layers_on_npu: 1 + layers_on_npu: 49 layers_on_gpu: 0 layers_on_cpu: 0 - total_layers: 1 - job_id: jqp4k318g + total_layers: 49 + job_id: j7gjlvo1p job_status: Passed reference_device_info: name: Samsung Galaxy S23 @@ -91,13 +93,13 @@ models: os_name: Android manufacturer: Samsung chipset: Snapdragon® 8 Gen 2 - timestamp: '2024-04-23T18:42:33.548624Z' + timestamp: '2024-05-20T16:35:31.170251Z' - torchscript_onnx_tflite: - inference_time: 178.0 - throughput: 5617.9775280898875 + inference_time: 184.0 + throughput: 5434.782608695652 estimated_peak_memory_range: min: 12288 - max: 21783424 + max: 22090256 primary_compute_unit: NPU precision: int8 layer_info: @@ -105,14 +107,14 @@ 
models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 41 - job_id: jnp1y6j8p + job_id: jn5q26ze5 job_status: Passed torchscript_onnx_qnn: - inference_time: 343.0 - throughput: 2915.451895043732 + inference_time: 341.0 + throughput: 2932.551319648094 estimated_peak_memory_range: - min: 167936 - max: 23042032 + min: 163840 + max: 26837472 primary_compute_unit: NPU precision: int8 layer_info: @@ -120,22 +122,22 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 45 - job_id: jz57094vg + job_id: j1p3mjxmg job_status: Passed torchscript_onnx_ort: - inference_time: 632.0 - throughput: 1582.2784810126582 + inference_time: 421.0 + throughput: 2375.296912114014 estimated_peak_memory_range: min: 12288 - max: 16606592 + max: 16755200 primary_compute_unit: NPU precision: int8 layer_info: - layers_on_npu: 1 + layers_on_npu: 49 layers_on_gpu: 0 layers_on_cpu: 0 - total_layers: 1 - job_id: j0pxnx435 + total_layers: 49 + job_id: jlpevd885 job_status: Passed reference_device_info: name: Samsung Galaxy S24 @@ -144,51 +146,74 @@ models: os_name: Android manufacturer: Samsung chipset: Snapdragon® 8 Gen 3 - timestamp: '2024-04-23T18:42:33.548662Z' + timestamp: '2024-05-20T16:35:31.170278Z' - torchscript_onnx_tflite: - inference_time: 645.0 - throughput: 1550.3875968992247 + inference_time: 225.0 + throughput: 4444.444444444444 estimated_peak_memory_range: - min: 12288 - max: 14710928 + min: 28672 + max: 1537872 primary_compute_unit: NPU precision: int8 layer_info: - layers_on_npu: 43 + layers_on_npu: 41 layers_on_gpu: 0 layers_on_cpu: 0 - total_layers: 43 - job_id: jogk7w2op + total_layers: 41 + job_id: j1glkvo2p job_status: Passed torchscript_onnx_qnn: - inference_time: 'null' - throughput: 'null' + inference_time: 471.0 + throughput: 2123.1422505307855 estimated_peak_memory_range: - min: 0 - max: 0 - primary_compute_unit: 'null' - precision: 'null' + min: 12288 + max: 9792120 + primary_compute_unit: NPU + precision: int8 layer_info: - layers_on_npu: 0 + layers_on_npu: 45 layers_on_gpu: 0 layers_on_cpu: 0 - total_layers: 0 - job_id: j1pv0ylm5 - job_status: Failed - torchscript_onnx_ort: - inference_time: 3597.0 - throughput: 278.00945232137894 + total_layers: 45 + job_id: j1pvw6ezg + job_status: Passed + reference_device_info: + name: QCS8550 (Proxy) + os: '12' + form_factor: Iot + os_name: Android + manufacturer: Qualcomm + chipset: Qcs8550 + timestamp: '2024-05-20T16:35:31.170295Z' + - torchscript_onnx_tflite: + inference_time: 538.0 + throughput: 1858.736059479554 estimated_peak_memory_range: - min: 0 - max: 28318256 - primary_compute_unit: CPU - precision: fp32 + min: 12288 + max: 14558896 + primary_compute_unit: NPU + precision: int8 layer_info: - layers_on_npu: 0 + layers_on_npu: 41 layers_on_gpu: 0 - layers_on_cpu: 51 - total_layers: 51 - job_id: jo5mq8mdp + layers_on_cpu: 0 + total_layers: 41 + job_id: jmg9wqkvp + job_status: Passed + torchscript_onnx_qnn: + inference_time: 958.0 + throughput: 1043.8413361169103 + estimated_peak_memory_range: + min: 163840 + max: 22853712 + primary_compute_unit: NPU + precision: int8 + layer_info: + layers_on_npu: 45 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 45 + job_id: jo5m3ldqg job_status: Passed reference_device_info: name: RB3 Gen 2 (Proxy) @@ -197,21 +222,21 @@ models: os_name: Android manufacturer: Qualcomm chipset: Qcs6490 - timestamp: '2024-04-23T18:42:33.548701Z' + timestamp: '2024-05-20T16:35:31.170312Z' - torchscript_onnx_tflite: - inference_time: 4261.0 - throughput: 234.6866932644919 + inference_time: 4066.0 + throughput: 
245.94195769798327 estimated_peak_memory_range: - min: 90112 - max: 1970416 + min: 28672 + max: 6476760 primary_compute_unit: NPU precision: int8 layer_info: - layers_on_npu: 43 + layers_on_npu: 41 layers_on_gpu: 0 layers_on_cpu: 0 - total_layers: 43 - job_id: jmg9yo4v5 + total_layers: 41 + job_id: jnp1em7lg job_status: Passed reference_device_info: name: RB5 (Proxy) @@ -220,42 +245,57 @@ models: os_name: Android manufacturer: Qualcomm chipset: Qcs8250 - timestamp: '2024-04-23T18:42:33.548716Z' - - torchscript_onnx_tflite: - inference_time: 246.0 - throughput: 4065.040650406504 + timestamp: '2024-05-20T16:35:31.170322Z' + - torchscript_onnx_qnn: + inference_time: 580.0 + throughput: 1724.1379310344828 estimated_peak_memory_range: - min: 12288 - max: 1876064 + min: 622592 + max: 622592 primary_compute_unit: NPU precision: int8 layer_info: - layers_on_npu: 43 + layers_on_npu: 45 layers_on_gpu: 0 layers_on_cpu: 0 - total_layers: 43 - job_id: j1gl69ylg + total_layers: 45 + job_id: jwgov2o15 job_status: Passed - torchscript_onnx_qnn: - inference_time: 507.0 - throughput: 1972.3865877712033 + torchscript_onnx_ort: + inference_time: 571.0 + throughput: 1751.3134851138354 estimated_peak_memory_range: - min: 528384 - max: 12189432 + min: 1773568 + max: 1773568 primary_compute_unit: NPU precision: int8 layer_info: - layers_on_npu: 47 + layers_on_npu: 49 layers_on_gpu: 0 layers_on_cpu: 0 - total_layers: 47 - job_id: jygzoql65 + total_layers: 49 + job_id: jygz7384p + job_status: Passed + torchscript_onnx_ort_dml_gpu: + inference_time: 979.0 + throughput: 1021.4504596527069 + estimated_peak_memory_range: + min: 4251648 + max: 4251648 + primary_compute_unit: CPU + precision: fp32 + layer_info: + layers_on_npu: 0 + layers_on_gpu: 0 + layers_on_cpu: 51 + total_layers: 51 + job_id: jz5w9e84p job_status: Passed reference_device_info: - name: QCS8550 (Proxy) - os: '12' - form_factor: Iot - os_name: Android + name: Snapdragon X Elite CRD + os: '11' + form_factor: Compute + os_name: Windows manufacturer: Qualcomm - chipset: Qcs8550 - timestamp: '2024-04-23T18:42:33.548742Z' + chipset: Snapdragon® X Elite + timestamp: '2024-05-20T16:35:31.170345Z' diff --git a/qai_hub_models/models/stable_diffusion_quantized/README.md b/qai_hub_models/models/stable_diffusion_v1_5_quantized/README.md similarity index 85% rename from qai_hub_models/models/stable_diffusion_quantized/README.md rename to qai_hub_models/models/stable_diffusion_v1_5_quantized/README.md index de6f68d1..e7447ff1 100644 --- a/qai_hub_models/models/stable_diffusion_quantized/README.md +++ b/qai_hub_models/models/stable_diffusion_v1_5_quantized/README.md @@ -1,31 +1,33 @@ [![Qualcomm® AI Hub Models](https://qaihub-public-assets.s3.us-west-2.amazonaws.com/qai-hub-models/quic-logo.jpg)](../../README.md) -# [Stable-Diffusion: State-of-the-art generative AI model used to generate detailed images conditioned on text descriptions](https://aihub.qualcomm.com/models/stable_diffusion_quantized) +# [Stable-Diffusion-v1.5: State-of-the-art generative AI model used to generate detailed images conditioned on text descriptions](https://aihub.qualcomm.com/models/stable_diffusion_v1_5_quantized) Generates high resolution images from text prompts using a latent diffusion model. This model uses CLIP ViT-L/14 as text encoder, U-Net based latent denoising, and VAE based decoder to generate the final image. 
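As a rough illustration of how the three components named in that description interact, the sketch below walks one text-to-image generation loop. It is a minimal sketch only: the `tokenizer`, `text_encoder`, `unet`, `vae_decoder`, and `scheduler` objects and their call signatures are assumptions made for exposition, not the interfaces of the precompiled components added in this diff (the real wiring lives in the demo script introduced below).

```python
# Minimal latent-diffusion sketch; every interface here is assumed for illustration
# and does not mirror the precompiled components shipped in this package.
import numpy as np

def generate_image(prompt, tokenizer, text_encoder, unet, vae_decoder, scheduler, steps=20):
    tokens = tokenizer(prompt)                      # text prompt -> token ids
    text_emb = text_encoder(tokens)                 # CLIP ViT-L/14 text embedding
    latent = np.random.randn(1, 4, 64, 64).astype(np.float32)  # start from Gaussian noise
    for t in scheduler.timesteps(steps):            # e.g. 20 denoising steps
        noise_pred = unet(latent, t, text_emb)      # U-Net predicts the noise at step t
        latent = scheduler.step(noise_pred, t, latent)  # remove a little noise
    return vae_decoder(latent)                      # decode the latent into the final image
```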
-This is based on the implementation of Stable-Diffusion found +This is based on the implementation of Stable-Diffusion-v1.5 found [here](https://github.com/CompVis/stable-diffusion/tree/main). This repository contains scripts for optimized on-device export suitable to run on Qualcomm® devices. More details on model performance -accross various devices, can be found [here](https://aihub.qualcomm.com/models/stable_diffusion_quantized). +accross various devices, can be found [here](https://aihub.qualcomm.com/models/stable_diffusion_v1_5_quantized). [Sign up](https://myaccount.qualcomm.com/signup) for early access to run these models on a hosted Qualcomm® device. + + ## Example & Usage Install the package via pip: ```bash -pip install "qai_hub_models[stable_diffusion_quantized]" +pip install "qai_hub_models[stable_diffusion_v1_5_quantized]" ``` Once installed, run the following simple CLI demo: ```bash -python -m qai_hub_models.models.stable_diffusion_quantized.demo +python -m qai_hub_models.models.stable_diffusion_v1_5_quantized.demo ``` More details on the CLI tool can be found with the `--help` option. See [demo.py](demo.py) for sample usage of the model including pre/post processing @@ -38,13 +40,13 @@ This repository contains export scripts that produce a model optimized for on-device deployment. This can be run as follows: ```bash -python -m qai_hub_models.models.stable_diffusion_quantized.export +python -m qai_hub_models.models.stable_diffusion_v1_5_quantized.export ``` Additional options are documented with the `--help` option. Note that the above script requires access to Deployment instructions for Qualcomm® AI Hub. ## License -- The license for the original implementation of Stable-Diffusion can be found +- The license for the original implementation of Stable-Diffusion-v1.5 can be found [here](https://github.com/CompVis/stable-diffusion/blob/main/LICENSE). - The license for the compiled assets for on-device deployment can be found [here]({deploy_license_url}) diff --git a/qai_hub_models/models/stable_diffusion_quantized/__init__.py b/qai_hub_models/models/stable_diffusion_v1_5_quantized/__init__.py similarity index 58% rename from qai_hub_models/models/stable_diffusion_quantized/__init__.py rename to qai_hub_models/models/stable_diffusion_v1_5_quantized/__init__.py index 7cc325fb..bb1d5cf9 100644 --- a/qai_hub_models/models/stable_diffusion_quantized/__init__.py +++ b/qai_hub_models/models/stable_diffusion_v1_5_quantized/__init__.py @@ -2,11 +2,9 @@ # Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. # SPDX-License-Identifier: BSD-3-Clause # --------------------------------------------------------------------- -from qai_hub_models.models.stable_diffusion_quantized.model import ( # noqa: F401 +from qai_hub_models.models.stable_diffusion_v1_5_quantized.model import ( # noqa: F401 MODEL_ID, ) -from qai_hub_models.models.stable_diffusion_quantized.model import ( # noqa: F401 +from qai_hub_models.models.stable_diffusion_v1_5_quantized.model import ( # noqa: F401 StableDiffusionQuantized as Model, ) - -from .app import StableDiffusionApp as App # noqa: F401 diff --git a/qai_hub_models/models/stable_diffusion_v1_5_quantized/demo.py b/qai_hub_models/models/stable_diffusion_v1_5_quantized/demo.py new file mode 100644 index 00000000..ff364004 --- /dev/null +++ b/qai_hub_models/models/stable_diffusion_v1_5_quantized/demo.py @@ -0,0 +1,53 @@ +# --------------------------------------------------------------------- +# Copyright (c) 2024 Qualcomm Innovation Center, Inc. 
All rights reserved. +# SPDX-License-Identifier: BSD-3-Clause +# --------------------------------------------------------------------- +from diffusers import DPMSolverMultistepScheduler, UNet2DConditionModel +from transformers import CLIPTokenizer + +from qai_hub_models.models._shared.stable_diffusion.demo import stable_diffusion_demo +from qai_hub_models.models.stable_diffusion_v1_5_quantized.model import ( + MODEL_ASSET_VERSION, + MODEL_ID, + ClipVITTextEncoder, + Unet, + VAEDecoder, +) + + +# Run Stable Diffuison end-to-end on a given prompt. The demo will output an +# AI-generated image based on the description in the prompt. +def main(is_test: bool = False): + tokenizer = CLIPTokenizer.from_pretrained( + "openai/clip-vit-large-patch14", subfolder="", revision="main" + ) + + scheduler = DPMSolverMultistepScheduler( + beta_start=0.00085, + beta_end=0.012, + beta_schedule="scaled_linear", + num_train_timesteps=1000, + ) + + time_embedding = UNet2DConditionModel.from_pretrained( + "runwayml/stable-diffusion-v1-5", subfolder="unet" + ).time_embedding + + text_encoder = ClipVITTextEncoder.from_precompiled() + unet = Unet.from_precompiled() + vae_decoder = VAEDecoder.from_precompiled() + stable_diffusion_demo( + model_id=MODEL_ID, + model_asset_version=MODEL_ASSET_VERSION, + text_encoder=text_encoder, + unet=unet, + vae_decoder=vae_decoder, + tokenizer=tokenizer, + scheduler=scheduler, + time_embedding=time_embedding, + is_test=is_test, + ) + + +if __name__ == "__main__": + main() diff --git a/qai_hub_models/models/stable_diffusion_v1_5_quantized/export.py b/qai_hub_models/models/stable_diffusion_v1_5_quantized/export.py new file mode 100644 index 00000000..c6394ccb --- /dev/null +++ b/qai_hub_models/models/stable_diffusion_v1_5_quantized/export.py @@ -0,0 +1,191 @@ +# --------------------------------------------------------------------- +# Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. +# SPDX-License-Identifier: BSD-3-Clause +# --------------------------------------------------------------------- +# THIS FILE WAS AUTO-GENERATED. DO NOT EDIT MANUALLY. + + +from __future__ import annotations + +import warnings +from pathlib import Path +from typing import Any, Dict, List, Mapping, Optional, Tuple, cast + +import qai_hub as hub + +from qai_hub_models.models.stable_diffusion_v1_5_quantized import Model +from qai_hub_models.utils.args import export_parser +from qai_hub_models.utils.base_model import BasePrecompiledModel, TargetRuntime +from qai_hub_models.utils.printing import print_profile_metrics_from_job +from qai_hub_models.utils.qai_hub_helpers import ( + can_access_qualcomm_ai_hub, + export_without_hub_access, +) + +ALL_COMPONENTS = ["TextEncoder_Quantized", "UNet_Quantized", "VAEDecoder_Quantized"] +DEFAULT_COMPONENTS = ["TextEncoder_Quantized", "VAEDecoder_Quantized", "UNet_Quantized"] + + +def export_model( + device: str = "Samsung Galaxy S23", + chipset: Optional[str] = None, + components: Optional[List[str]] = None, + skip_profiling: bool = False, + skip_inferencing: bool = False, + skip_summary: bool = False, + output_dir: Optional[str] = None, + profile_options: str = "", + **additional_model_kwargs, +) -> Mapping[str, Tuple[Optional[hub.ProfileJob], Optional[hub.InferenceJob]]] | List[ + str +]: + """ + This function accomplishes 5 main tasks: + + 1. Initialize model. + 2. Upload model assets to hub. + 3. Profiles the model performance on real devices. + 4. Inferences the model on sample inputs. + 5. Summarizes the results from profiling. 
+ + Each of the last three steps can be optionally skipped using the input options. + + Parameters: + device: Device for which to export the model. + Full list of available devices can be found by running `hub.get_devices()`. + Defaults to DEFAULT_DEVICE if not specified. + chipset: If set, will choose a random device with this chipset. + Overrides the `device` argument. + components: List of sub-components of the model that will be exported. + Each component is compiled and profiled separately. + Defaults to ALL_COMPONENTS if not specified. + skip_profiling: If set, skips profiling of compiled model on real devices. + skip_inferencing: If set, skips computing on-device outputs from sample data. + skip_summary: If set, skips waiting for and summarizing results + from profiling. + output_dir: Directory to store generated assets (e.g. compiled model). + Defaults to `/build/`. + profile_options: Additional options to pass when submitting the profile job. + **additional_model_kwargs: Additional optional kwargs used to customize + `model_cls.from_precompiled` + + Returns: + A Mapping from component_name to a 2-tuple of: + * A ProfileJob containing metadata about the profile job (None if profiling skipped). + * An InferenceJob containing metadata about the inference job (None if inferencing skipped). + """ + model_name = "stable_diffusion_v1_5_quantized" + output_path = Path(output_dir or Path.cwd() / "build" / model_name) + if chipset: + hub_device = hub.Device(attributes=f"chipset:{chipset}") + else: + hub_device = hub.Device(name=device) + component_arg = components + components = components or DEFAULT_COMPONENTS + for component_name in components: + if component_name not in ALL_COMPONENTS: + raise ValueError(f"Invalid component {component_name}.") + if not can_access_qualcomm_ai_hub(): + return export_without_hub_access( + "stable_diffusion_v1_5_quantized", + "Stable-Diffusion-v1.5", + device, + skip_profiling, + skip_inferencing, + False, + skip_summary, + output_path, + TargetRuntime.QNN, + "", + profile_options, + component_arg, + ) + + target_runtime = TargetRuntime.TFLITE + # 1. Initialize model + print("Initializing model class") + model = Model.from_precompiled() + components_dict: Dict[str, BasePrecompiledModel] = {} + if "TextEncoder_Quantized" in components: + components_dict["TextEncoder_Quantized"] = model.text_encoder # type: ignore + if "UNet_Quantized" in components: + components_dict["UNet_Quantized"] = model.unet # type: ignore + if "VAEDecoder_Quantized" in components: + components_dict["VAEDecoder_Quantized"] = model.vae_decoder # type: ignore + + # 2. Upload model assets to hub + print("Uploading model assets on hub") + uploaded_models = {} + for component_name in components: + uploaded_models[component_name] = hub.upload_model( + components_dict[component_name].get_target_model_path() + ) + + # 3. Profile the model assets on real devices + profile_jobs: Dict[str, hub.client.ProfileJob] = {} + if not skip_profiling: + for component_name in components: + profile_options_all = components_dict[ + component_name + ].get_hub_profile_options(target_runtime, profile_options) + print(f"Profiling model {component_name} on a hosted device.") + submitted_profile_job = hub.submit_profile_job( + model=uploaded_models[component_name], + device=hub_device, + name=f"{model_name}_{component_name}", + options=profile_options_all, + ) + profile_jobs[component_name] = cast( + hub.client.ProfileJob, submitted_profile_job + ) + + # 4. 
Run inference on-device with sample inputs + inference_jobs: Dict[str, hub.client.InferenceJob] = {} + if not skip_inferencing: + for component_name in components: + print( + f"Running inference for {component_name} on a hosted device with example inputs." + ) + profile_options_all = components_dict[ + component_name + ].get_hub_profile_options(target_runtime, profile_options) + sample_inputs = components_dict[component_name].sample_inputs() + submitted_inference_job = hub.submit_inference_job( + model=uploaded_models[component_name], + inputs=sample_inputs, + device=hub_device, + name=f"{model_name}_{component_name}", + options=profile_options_all, + ) + inference_jobs[component_name] = cast( + hub.client.InferenceJob, submitted_inference_job + ) + + # 5. Summarize the results from profiling + if not skip_summary and not skip_profiling: + for component_name in components: + profile_job = profile_jobs[component_name] + assert profile_job is not None and profile_job.wait().success + profile_data: Dict[str, Any] = profile_job.download_profile() # type: ignore + print_profile_metrics_from_job(profile_job, profile_data) + + return { + component_name: ( + profile_jobs.get(component_name, None), + inference_jobs.get(component_name, None), + ) + for component_name in components + } + + +def main(): + warnings.filterwarnings("ignore") + parser = export_parser( + model_cls=Model, components=ALL_COMPONENTS, exporting_compiled_model=True + ) + args = parser.parse_args() + export_model(**vars(args)) + + +if __name__ == "__main__": + main() diff --git a/qai_hub_models/models/stable_diffusion_quantized/info.yaml b/qai_hub_models/models/stable_diffusion_v1_5_quantized/info.yaml similarity index 91% rename from qai_hub_models/models/stable_diffusion_quantized/info.yaml rename to qai_hub_models/models/stable_diffusion_v1_5_quantized/info.yaml index ceac7d79..7bf7c3d0 100644 --- a/qai_hub_models/models/stable_diffusion_quantized/info.yaml +++ b/qai_hub_models/models/stable_diffusion_v1_5_quantized/info.yaml @@ -1,5 +1,5 @@ -name: Stable-Diffusion -id: stable_diffusion_quantized +name: Stable-Diffusion-v1.5 +id: stable_diffusion_v1_5_quantized status: public headline: State-of-the-art generative AI model used to generate detailed images conditioned on text descriptions. 
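As a usage note on the auto-generated `export.py` above: besides the CLI entry point, `export_model()` can be called directly from Python. The sketch below is hedged: the parameter names follow the signature added in this diff, and it assumes Qualcomm® AI Hub access is configured (without access the function returns a list of strings instead of a job mapping).

```python
# Sketch of driving the v1.5 export flow from Python; parameter names follow the
# export_model() signature added above, and AI Hub access is assumed.
from qai_hub_models.models.stable_diffusion_v1_5_quantized.export import export_model

jobs = export_model(
    device="Samsung Galaxy S23",
    components=["TextEncoder_Quantized", "VAEDecoder_Quantized"],
    skip_inferencing=True,  # upload + profile only, no on-device inference job
)
# With hub access, the result maps component name -> (ProfileJob or None, InferenceJob or None).
for component, (profile_job, inference_job) in jobs.items():
    print(component, profile_job, inference_job)
```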
@@ -18,7 +18,7 @@ deploy_license: https://github.com/CompVis/stable-diffusion/blob/main/LICENSE source_repo: https://github.com/CompVis/stable-diffusion/tree/main technical_details: Input: Text prompt to generate image - QNN-SDK: '2.19' + QNN-SDK: '2.20' Text Encoder Number of parameters: 340M UNet Number of parameters: 865M VAE Decoder Number of parameters: 83M @@ -28,6 +28,7 @@ applicable_scenarios: - Image Editing - Content Creation related_models: + - stable_diffusion_v2_1_quantized - controlnet_quantized form_factors: - Phone diff --git a/qai_hub_models/models/stable_diffusion_quantized/model.py b/qai_hub_models/models/stable_diffusion_v1_5_quantized/model.py similarity index 99% rename from qai_hub_models/models/stable_diffusion_quantized/model.py rename to qai_hub_models/models/stable_diffusion_v1_5_quantized/model.py index f9da4488..0325e913 100644 --- a/qai_hub_models/models/stable_diffusion_quantized/model.py +++ b/qai_hub_models/models/stable_diffusion_v1_5_quantized/model.py @@ -13,7 +13,7 @@ MODEL_ID = __name__.split(".")[-2] MODEL_ASSET_VERSION = 1 -QNN_SDK_PREFIX = "QNN219" +QNN_SDK_PREFIX = "QNN220" TEXT_ENCODER = os.path.join(QNN_SDK_PREFIX, "text_encoder.serialized.bin") UNET_DIFFUSER = os.path.join(QNN_SDK_PREFIX, "unet.serialized.bin") VAE_DECODER = os.path.join(QNN_SDK_PREFIX, "vae_decoder.serialized.bin") diff --git a/qai_hub_models/models/stable_diffusion_quantized/perf.yaml b/qai_hub_models/models/stable_diffusion_v1_5_quantized/perf.yaml similarity index 100% rename from qai_hub_models/models/stable_diffusion_quantized/perf.yaml rename to qai_hub_models/models/stable_diffusion_v1_5_quantized/perf.yaml diff --git a/qai_hub_models/models/stable_diffusion_v1_5_quantized/requirements.txt b/qai_hub_models/models/stable_diffusion_v1_5_quantized/requirements.txt new file mode 100644 index 00000000..83aa3d48 --- /dev/null +++ b/qai_hub_models/models/stable_diffusion_v1_5_quantized/requirements.txt @@ -0,0 +1,2 @@ +transformers==4.27.4 +diffusers[torch]==0.21.4 diff --git a/qai_hub_models/models/stable_diffusion_v1_5_quantized/test.py b/qai_hub_models/models/stable_diffusion_v1_5_quantized/test.py new file mode 100644 index 00000000..5cd49388 --- /dev/null +++ b/qai_hub_models/models/stable_diffusion_v1_5_quantized/test.py @@ -0,0 +1,30 @@ +# --------------------------------------------------------------------- +# Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. 
+# SPDX-License-Identifier: BSD-3-Clause +# --------------------------------------------------------------------- +import pytest + +from qai_hub_models.models._shared.stable_diffusion.test_utils import ( + export_for_component, +) +from qai_hub_models.models.stable_diffusion_v1_5_quantized.demo import main as demo_main +from qai_hub_models.models.stable_diffusion_v1_5_quantized.export import export_model +from qai_hub_models.models.stable_diffusion_v1_5_quantized.model import ( + StableDiffusionQuantized, +) + + +def test_from_precompiled(): + StableDiffusionQuantized.from_precompiled() + + +@pytest.mark.skip("#105 move slow_cloud and slow tests to nightly.") +@pytest.mark.slow_cloud +def test_export(): + export_for_component(export_model, "TextEncoder_Quantized") + + +@pytest.mark.skip("#105 move slow_cloud and slow tests to nightly.") +@pytest.mark.slow_cloud +def test_demo(): + demo_main(is_test=True) diff --git a/qai_hub_models/models/stable_diffusion_v2_1_quantized/README.md b/qai_hub_models/models/stable_diffusion_v2_1_quantized/README.md new file mode 100644 index 00000000..2c8bd7d6 --- /dev/null +++ b/qai_hub_models/models/stable_diffusion_v2_1_quantized/README.md @@ -0,0 +1,83 @@ +[![Qualcomm® AI Hub Models](https://qaihub-public-assets.s3.us-west-2.amazonaws.com/qai-hub-models/quic-logo.jpg)](../../README.md) + + +# [Stable-Diffusion-v2.1: State-of-the-art generative AI model used to generate detailed images conditioned on text descriptions](#) + +Generates high resolution images from text prompts using a latent diffusion model. This model uses CLIP ViT-L/14 as text encoder, U-Net based latent denoising, and VAE based decoder to generate the final image. + +This is based on the implementation of Stable-Diffusion-v2.1 found +[here](https://github.com/CompVis/stable-diffusion/tree/main). This repository contains scripts for optimized on-device +export suitable to run on Qualcomm® devices. More details on model performance +accross various devices, can be found [here](#). + +[Sign up](https://myaccount.qualcomm.com/signup) for early access to run these models on +a hosted Qualcomm® device. + + + + +## Example & Usage + +Install the package via pip: +```bash +pip install "qai_hub_models[stable_diffusion_v2_1_quantized]" +``` + + +Once installed, run the following simple CLI demo: + +```bash +python -m qai_hub_models.models.stable_diffusion_v2_1_quantized.demo +``` +More details on the CLI tool can be found with the `--help` option. See +[demo.py](demo.py) for sample usage of the model including pre/post processing +scripts. Please refer to our [general instructions on using +models](../../../#getting-started) for more usage instructions. + +## Export for on-device deployment + +This repository contains export scripts that produce a model optimized for +on-device deployment. This can be run as follows: + +```bash +python -m qai_hub_models.models.stable_diffusion_v2_1_quantized.export +``` +Additional options are documented with the `--help` option. Note that the above +script requires access to Deployment instructions for Qualcomm® AI Hub. + +## License +- The license for the original implementation of Stable-Diffusion-v2.1 can be found + [here](https://github.com/CompVis/stable-diffusion/blob/main/LICENSE). 
+- The license for the compiled assets for on-device deployment can be found [here]({deploy_license_url}) + +## References +* [High-Resolution Image Synthesis with Latent Diffusion Models](https://arxiv.org/abs/2112.10752) +* [Source Model Implementation](https://github.com/CompVis/stable-diffusion/tree/main) + +## Community +* Join [our AI Hub Slack community](https://qualcomm-ai-hub.slack.com/join/shared_invite/zt-2d5zsmas3-Sj0Q9TzslueCjS31eXG2UA#/shared-invite/email) to collaborate, post questions and learn more about on-device AI. +* For questions or feedback please [reach out to us](mailto:ai-hub-support@qti.qualcomm.com). + + +## Usage and Limitations + +This model may not be used for or in connection with any of the following applications: + +- Accessing essential private and public services and benefits; +- Administration of justice and democratic processes; +- Assessing or recognizing the emotional state of a person; +- Biometric and biometrics-based systems, including categorization of persons based on sensitive characteristics; +- Education and vocational training; +- Employment and workers management; +- Exploitation of the vulnerabilities of persons resulting in harmful behavior; +- General purpose social scoring; +- Law enforcement; +- Management and operation of critical infrastructure; +- Migration, asylum and border control management; +- Predictive policing; +- Real-time remote biometric identification in public spaces; +- Recommender systems of social media platforms; +- Scraping of facial images (from the internet or otherwise); and/or +- Subliminal manipulation + + diff --git a/qai_hub_models/models/stable_diffusion_v2_1_quantized/__init__.py b/qai_hub_models/models/stable_diffusion_v2_1_quantized/__init__.py new file mode 100644 index 00000000..7a6b1a25 --- /dev/null +++ b/qai_hub_models/models/stable_diffusion_v2_1_quantized/__init__.py @@ -0,0 +1,10 @@ +# --------------------------------------------------------------------- +# Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. +# SPDX-License-Identifier: BSD-3-Clause +# --------------------------------------------------------------------- +from qai_hub_models.models.stable_diffusion_v2_1_quantized.model import ( # noqa: F401 + MODEL_ID, +) +from qai_hub_models.models.stable_diffusion_v2_1_quantized.model import ( # noqa: F401 + StableDiffusionQuantized as Model, +) diff --git a/qai_hub_models/models/stable_diffusion_v2_1_quantized/demo.py b/qai_hub_models/models/stable_diffusion_v2_1_quantized/demo.py new file mode 100644 index 00000000..2ee347ec --- /dev/null +++ b/qai_hub_models/models/stable_diffusion_v2_1_quantized/demo.py @@ -0,0 +1,54 @@ +# --------------------------------------------------------------------- +# Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. +# SPDX-License-Identifier: BSD-3-Clause +# --------------------------------------------------------------------- +from diffusers import DPMSolverMultistepScheduler, UNet2DConditionModel +from transformers import CLIPTokenizer + +from qai_hub_models.models._shared.stable_diffusion.demo import stable_diffusion_demo +from qai_hub_models.models.stable_diffusion_v2_1_quantized.model import ( + MODEL_ASSET_VERSION, + MODEL_ID, + ClipVITTextEncoder, + Unet, + VAEDecoder, +) + + +# Run Stable Diffuison end-to-end on a given prompt. The demo will output an +# AI-generated image based on the description in the prompt. 
+def main(is_test: bool = False): + tokenizer = CLIPTokenizer.from_pretrained( + "stabilityai/stable-diffusion-2-1-base", subfolder="tokenizer", revision="main" + ) + + scheduler = DPMSolverMultistepScheduler( + beta_start=0.00085, + beta_end=0.012, + beta_schedule="scaled_linear", + num_train_timesteps=1000, + ) + + time_embedding = UNet2DConditionModel.from_pretrained( + "stabilityai/stable-diffusion-2-1-base", subfolder="unet", revision="main" + ).time_embedding + + text_encoder = ClipVITTextEncoder.from_precompiled() + unet = Unet.from_precompiled() + vae_decoder = VAEDecoder.from_precompiled() + stable_diffusion_demo( + model_id=MODEL_ID, + model_asset_version=MODEL_ASSET_VERSION, + text_encoder=text_encoder, + unet=unet, + vae_decoder=vae_decoder, + tokenizer=tokenizer, + scheduler=scheduler, + time_embedding=time_embedding, + channel_last_latent=False, + is_test=is_test, + ) + + +if __name__ == "__main__": + main() diff --git a/qai_hub_models/models/stable_diffusion_v2_1_quantized/export.py b/qai_hub_models/models/stable_diffusion_v2_1_quantized/export.py new file mode 100644 index 00000000..d2b0dffd --- /dev/null +++ b/qai_hub_models/models/stable_diffusion_v2_1_quantized/export.py @@ -0,0 +1,191 @@ +# --------------------------------------------------------------------- +# Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. +# SPDX-License-Identifier: BSD-3-Clause +# --------------------------------------------------------------------- +# THIS FILE WAS AUTO-GENERATED. DO NOT EDIT MANUALLY. + + +from __future__ import annotations + +import warnings +from pathlib import Path +from typing import Any, Dict, List, Mapping, Optional, Tuple, cast + +import qai_hub as hub + +from qai_hub_models.models.stable_diffusion_v2_1_quantized import Model +from qai_hub_models.utils.args import export_parser +from qai_hub_models.utils.base_model import BasePrecompiledModel, TargetRuntime +from qai_hub_models.utils.printing import print_profile_metrics_from_job +from qai_hub_models.utils.qai_hub_helpers import ( + can_access_qualcomm_ai_hub, + export_without_hub_access, +) + +ALL_COMPONENTS = ["TextEncoder_Quantized", "UNet_Quantized", "VAEDecoder_Quantized"] +DEFAULT_COMPONENTS = ["TextEncoder_Quantized", "VAEDecoder_Quantized", "UNet_Quantized"] + + +def export_model( + device: str = "Samsung Galaxy S23", + chipset: Optional[str] = None, + components: Optional[List[str]] = None, + skip_profiling: bool = False, + skip_inferencing: bool = False, + skip_summary: bool = False, + output_dir: Optional[str] = None, + profile_options: str = "", + **additional_model_kwargs, +) -> Mapping[str, Tuple[Optional[hub.ProfileJob], Optional[hub.InferenceJob]]] | List[ + str +]: + """ + This function accomplishes 5 main tasks: + + 1. Initialize model. + 2. Upload model assets to hub. + 3. Profiles the model performance on real devices. + 4. Inferences the model on sample inputs. + 5. Summarizes the results from profiling. + + Each of the last three steps can be optionally skipped using the input options. + + Parameters: + device: Device for which to export the model. + Full list of available devices can be found by running `hub.get_devices()`. + Defaults to DEFAULT_DEVICE if not specified. + chipset: If set, will choose a random device with this chipset. + Overrides the `device` argument. + components: List of sub-components of the model that will be exported. + Each component is compiled and profiled separately. + Defaults to ALL_COMPONENTS if not specified. 
+ skip_profiling: If set, skips profiling of compiled model on real devices. + skip_inferencing: If set, skips computing on-device outputs from sample data. + skip_summary: If set, skips waiting for and summarizing results + from profiling. + output_dir: Directory to store generated assets (e.g. compiled model). + Defaults to `/build/`. + profile_options: Additional options to pass when submitting the profile job. + **additional_model_kwargs: Additional optional kwargs used to customize + `model_cls.from_precompiled` + + Returns: + A Mapping from component_name to a 2-tuple of: + * A ProfileJob containing metadata about the profile job (None if profiling skipped). + * An InferenceJob containing metadata about the inference job (None if inferencing skipped). + """ + model_name = "stable_diffusion_v2_1_quantized" + output_path = Path(output_dir or Path.cwd() / "build" / model_name) + if chipset: + hub_device = hub.Device(attributes=f"chipset:{chipset}") + else: + hub_device = hub.Device(name=device) + component_arg = components + components = components or DEFAULT_COMPONENTS + for component_name in components: + if component_name not in ALL_COMPONENTS: + raise ValueError(f"Invalid component {component_name}.") + if not can_access_qualcomm_ai_hub(): + return export_without_hub_access( + "stable_diffusion_v2_1_quantized", + "Stable-Diffusion-v2.1", + device, + skip_profiling, + skip_inferencing, + False, + skip_summary, + output_path, + TargetRuntime.QNN, + "", + profile_options, + component_arg, + ) + + target_runtime = TargetRuntime.TFLITE + # 1. Initialize model + print("Initializing model class") + model = Model.from_precompiled() + components_dict: Dict[str, BasePrecompiledModel] = {} + if "TextEncoder_Quantized" in components: + components_dict["TextEncoder_Quantized"] = model.text_encoder # type: ignore + if "UNet_Quantized" in components: + components_dict["UNet_Quantized"] = model.unet # type: ignore + if "VAEDecoder_Quantized" in components: + components_dict["VAEDecoder_Quantized"] = model.vae_decoder # type: ignore + + # 2. Upload model assets to hub + print("Uploading model assets on hub") + uploaded_models = {} + for component_name in components: + uploaded_models[component_name] = hub.upload_model( + components_dict[component_name].get_target_model_path() + ) + + # 3. Profile the model assets on real devices + profile_jobs: Dict[str, hub.client.ProfileJob] = {} + if not skip_profiling: + for component_name in components: + profile_options_all = components_dict[ + component_name + ].get_hub_profile_options(target_runtime, profile_options) + print(f"Profiling model {component_name} on a hosted device.") + submitted_profile_job = hub.submit_profile_job( + model=uploaded_models[component_name], + device=hub_device, + name=f"{model_name}_{component_name}", + options=profile_options_all, + ) + profile_jobs[component_name] = cast( + hub.client.ProfileJob, submitted_profile_job + ) + + # 4. Run inference on-device with sample inputs + inference_jobs: Dict[str, hub.client.InferenceJob] = {} + if not skip_inferencing: + for component_name in components: + print( + f"Running inference for {component_name} on a hosted device with example inputs." 
+ ) + profile_options_all = components_dict[ + component_name + ].get_hub_profile_options(target_runtime, profile_options) + sample_inputs = components_dict[component_name].sample_inputs() + submitted_inference_job = hub.submit_inference_job( + model=uploaded_models[component_name], + inputs=sample_inputs, + device=hub_device, + name=f"{model_name}_{component_name}", + options=profile_options_all, + ) + inference_jobs[component_name] = cast( + hub.client.InferenceJob, submitted_inference_job + ) + + # 5. Summarize the results from profiling + if not skip_summary and not skip_profiling: + for component_name in components: + profile_job = profile_jobs[component_name] + assert profile_job is not None and profile_job.wait().success + profile_data: Dict[str, Any] = profile_job.download_profile() # type: ignore + print_profile_metrics_from_job(profile_job, profile_data) + + return { + component_name: ( + profile_jobs.get(component_name, None), + inference_jobs.get(component_name, None), + ) + for component_name in components + } + + +def main(): + warnings.filterwarnings("ignore") + parser = export_parser( + model_cls=Model, components=ALL_COMPONENTS, exporting_compiled_model=True + ) + args = parser.parse_args() + export_model(**vars(args)) + + +if __name__ == "__main__": + main() diff --git a/qai_hub_models/models/stable_diffusion_v2_1_quantized/info.yaml b/qai_hub_models/models/stable_diffusion_v2_1_quantized/info.yaml new file mode 100644 index 00000000..e298e2a5 --- /dev/null +++ b/qai_hub_models/models/stable_diffusion_v2_1_quantized/info.yaml @@ -0,0 +1,40 @@ +name: Stable-Diffusion-v2.1 +id: stable_diffusion_v2_1_quantized +status: public +headline: State-of-the-art generative AI model used to generate detailed images conditioned + on text descriptions. +domain: Generative AI +description: Generates high resolution images from text prompts using a latent diffusion + model. This model uses CLIP ViT-L/14 as text encoder, U-Net based latent denoising, + and VAE based decoder to generate the final image. +use_case: Image Generation +tags: + - generative-ai + - quantized +research_paper: https://arxiv.org/abs/2112.10752 +research_paper_title: High-Resolution Image Synthesis with Latent Diffusion Models +license: https://github.com/CompVis/stable-diffusion/blob/main/LICENSE +deploy_license: https://github.com/CompVis/stable-diffusion/blob/main/LICENSE +source_repo: https://github.com/CompVis/stable-diffusion/tree/main +technical_details: + Input: Text prompt to generate image + QNN-SDK: '2.20' + Text Encoder Number of parameters: 340M + UNet Number of parameters: 865M + VAE Decoder Number of parameters: 83M + Model size: 1GB +applicable_scenarios: + - Image Generation + - Image Editing + - Content Creation +related_models: + - stable_diffusion_v1_5_quantized + - controlnet_quantized +form_factors: + - Phone + - Tablet +has_static_banner: yes +has_animated_banner: yes +license_type: creativeml-openrail-m +deploy_license_type: creativeml-openrail-m +dataset: [] diff --git a/qai_hub_models/models/stable_diffusion_v2_1_quantized/model.py b/qai_hub_models/models/stable_diffusion_v2_1_quantized/model.py new file mode 100644 index 00000000..b1e6c86d --- /dev/null +++ b/qai_hub_models/models/stable_diffusion_v2_1_quantized/model.py @@ -0,0 +1,105 @@ +# --------------------------------------------------------------------- +# Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. 
+# SPDX-License-Identifier: BSD-3-Clause +# --------------------------------------------------------------------- +from __future__ import annotations + +from qai_hub_models.models.protocols import FromPrecompiledProtocol +from qai_hub_models.utils.asset_loaders import CachedWebModelAsset +from qai_hub_models.utils.base_model import BasePrecompiledModel, CollectionModel +from qai_hub_models.utils.input_spec import InputSpec + +MODEL_ID = __name__.split(".")[-2] +MODEL_ASSET_VERSION = 1 +TEXT_ENCODER = "text_encoder.serialized.bin" +UNET_DIFFUSER = "unet.serialized.bin" +VAE_DECODER = "vae.serialized.bin" + + +class StableDiffusionQuantized(FromPrecompiledProtocol, CollectionModel): + """ + Stable Diffusion wrapper class consists of + - Text Encoder + - UNet based diffuser + - VAE decoder + + All three models are pre-trained, quantized (int8 weight, uint16 activations) + and compiled into serialized binary for Qualcomm Snapdragon Gen2+. + """ + + def __init__(self, text_encoder, unet, vae_decoder) -> None: + self.text_encoder = text_encoder + self.unet = unet + self.vae_decoder = vae_decoder + + @classmethod + def from_precompiled(cls) -> "StableDiffusionQuantized": + return StableDiffusionQuantized( + text_encoder=ClipVITTextEncoder.from_precompiled(), + unet=Unet.from_precompiled(), + vae_decoder=VAEDecoder.from_precompiled(), + ) + + +class ClipVITTextEncoder(BasePrecompiledModel): + """ + CLIP-ViT based Text Encoder. + + Pre-trained, quantized (int8 weight, uint16 activations) + and compiled into serialized binary for Qualcomm Snapdragon Gen2+. + """ + + @classmethod + def from_precompiled(cls) -> "ClipVITTextEncoder": + text_encoder_path = CachedWebModelAsset.from_asset_store( + MODEL_ID, MODEL_ASSET_VERSION, TEXT_ENCODER + ).fetch() + return ClipVITTextEncoder(text_encoder_path) + + @staticmethod + def get_input_spec() -> InputSpec: + return {"tokens": ((1, 77), "int32")} + + +class Unet(BasePrecompiledModel): + """ + UNet model to denoise image in latent space. + + Pre-trained, quantized (int8 weight, uint16 activations) + and compiled into serialized binary for Qualcomm Snapdragon Gen2+. + """ + + @classmethod + def from_precompiled(cls) -> "Unet": + model_path = CachedWebModelAsset.from_asset_store( + MODEL_ID, MODEL_ASSET_VERSION, UNET_DIFFUSER + ).fetch() + return Unet(model_path) + + @staticmethod + def get_input_spec() -> InputSpec: + return { + "latent": ((1, 4, 64, 64), "float32"), + "time_emb": ((1, 1280), "float32"), + "text_emb": ((1, 77, 1024), "float32"), + } + + +class VAEDecoder(BasePrecompiledModel): + """ + Decodes image from latent into output generated image. + + Pre-trained, quantized (int8 weight, uint16 activations) + and compiled into serialized binary for Qualcomm Snapdragon Gen2+. 
+ """ + + @classmethod + def from_precompiled(cls) -> "VAEDecoder": + model_path = CachedWebModelAsset.from_asset_store( + MODEL_ID, MODEL_ASSET_VERSION, VAE_DECODER + ).fetch() + return VAEDecoder(model_path) + + @staticmethod + def get_input_spec() -> InputSpec: + return {"latent": ((1, 4, 64, 64), "float32")} diff --git a/qai_hub_models/models/stable_diffusion_v2_1_quantized/requirements.txt b/qai_hub_models/models/stable_diffusion_v2_1_quantized/requirements.txt new file mode 100644 index 00000000..83aa3d48 --- /dev/null +++ b/qai_hub_models/models/stable_diffusion_v2_1_quantized/requirements.txt @@ -0,0 +1,2 @@ +transformers==4.27.4 +diffusers[torch]==0.21.4 diff --git a/qai_hub_models/models/stable_diffusion_v2_1_quantized/test.py b/qai_hub_models/models/stable_diffusion_v2_1_quantized/test.py new file mode 100644 index 00000000..a19408ae --- /dev/null +++ b/qai_hub_models/models/stable_diffusion_v2_1_quantized/test.py @@ -0,0 +1,30 @@ +# --------------------------------------------------------------------- +# Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. +# SPDX-License-Identifier: BSD-3-Clause +# --------------------------------------------------------------------- +import pytest + +from qai_hub_models.models._shared.stable_diffusion.test_utils import ( + export_for_component, +) +from qai_hub_models.models.stable_diffusion_v2_1_quantized.demo import main as demo_main +from qai_hub_models.models.stable_diffusion_v2_1_quantized.export import export_model +from qai_hub_models.models.stable_diffusion_v2_1_quantized.model import ( + StableDiffusionQuantized, +) + + +def test_from_precompiled(): + StableDiffusionQuantized.from_precompiled() + + +# @pytest.mark.skip("#105 move slow_cloud and slow tests to nightly.") +@pytest.mark.slow_cloud +def test_export(): + export_for_component(export_model, "TextEncoder_Quantized") + + +# @pytest.mark.skip("#105 move slow_cloud and slow tests to nightly.") +@pytest.mark.slow_cloud +def test_demo(): + demo_main(is_test=True) diff --git a/qai_hub_models/models/stylegan2/README.md b/qai_hub_models/models/stylegan2/README.md index c3671d48..ea9e6792 100644 --- a/qai_hub_models/models/stylegan2/README.md +++ b/qai_hub_models/models/stylegan2/README.md @@ -14,6 +14,8 @@ accross various devices, can be found [here](https://aihub.qualcomm.com/models/s a hosted Qualcomm® device. 
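Stepping back to the Stable-Diffusion-v2.1 `model.py` added a little earlier in this diff: the `get_input_spec()` entries also describe the arrays a caller would feed to each precompiled component. A hedged sketch follows; the shapes and dtypes are copied from those specs, while packaging the arrays as `{name: [array]}` dictionaries is an assumption made for illustration.

```python
# Sample inputs shaped to match the Stable-Diffusion-v2.1 get_input_spec() entries above.
# The {name: [array]} packaging is an assumption, not a documented requirement.
import numpy as np

text_encoder_inputs = {"tokens": [np.zeros((1, 77), dtype=np.int32)]}
unet_inputs = {
    "latent": [np.random.randn(1, 4, 64, 64).astype(np.float32)],
    "time_emb": [np.zeros((1, 1280), dtype=np.float32)],
    "text_emb": [np.zeros((1, 77, 1024), dtype=np.float32)],
}
vae_decoder_inputs = {"latent": [np.random.randn(1, 4, 64, 64).astype(np.float32)]}
```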
+ + ## Example & Usage Install the package via pip: diff --git a/qai_hub_models/models/stylegan2/demo.py b/qai_hub_models/models/stylegan2/demo.py index dc28952f..be0c4842 100644 --- a/qai_hub_models/models/stylegan2/demo.py +++ b/qai_hub_models/models/stylegan2/demo.py @@ -41,8 +41,6 @@ def main(is_test: bool = False): help="Class[es] to use for image generation (if applicable).", ) args = parser.parse_args([] if is_test else None) - if not args.inference_options: - args.inference_options = "--compute_unit gpu" # Create model and app model = model_from_cli_args(StyleGAN2, args) diff --git a/qai_hub_models/models/stylegan2/export.py b/qai_hub_models/models/stylegan2/export.py index fc61a8ec..25cd6b7f 100644 --- a/qai_hub_models/models/stylegan2/export.py +++ b/qai_hub_models/models/stylegan2/export.py @@ -118,9 +118,16 @@ def export_model( model.eval() source_model = torch.jit.trace(model.to("cpu"), make_torch_inputs(input_spec)) + # Convert outputs from channel last to channel first (preferred I/O format for QNN and TensorFlow Lite) + channel_last_flags = ( + " --force_channel_last_output output_0" + if target_runtime != TargetRuntime.ORT + else "" + ) + # 2. Compile the model to an on-device asset model_compile_options = model.get_hub_compile_options( - target_runtime, compile_options + " --force_channel_last_output output_0" + target_runtime, compile_options + channel_last_flags, hub_device ) print(f"Optimizing model {model_name} to run on-device") submitted_compile_job = hub.submit_compile_job( @@ -183,8 +190,12 @@ def export_model( assert inference_job is not None and inference_job.wait().success inference_result: hub.client.DatasetEntries = inference_job.download_output_data() # type: ignore # Convert outputs from channel last to channel first - inference_result = transpose_channel_last_to_first( - "output_0", inference_result, target_runtime + inference_result = ( + inference_result + if target_runtime == TargetRuntime.ORT + else transpose_channel_last_to_first( + "output_0", inference_result, target_runtime + ) ) print_inference_metrics(inference_job, inference_result, torch_out) diff --git a/qai_hub_models/models/stylegan2/model.py b/qai_hub_models/models/stylegan2/model.py index 906fdde7..05ac7791 100644 --- a/qai_hub_models/models/stylegan2/model.py +++ b/qai_hub_models/models/stylegan2/model.py @@ -4,10 +4,11 @@ # --------------------------------------------------------------------- from __future__ import annotations -from typing import Any, Callable, Dict, List +from typing import Any, Callable, Dict, List, Optional import numpy as np import torch +from qai_hub.client import Device from qai_hub_models.utils.asset_loaders import SourceAsRoot from qai_hub_models.utils.base_model import BaseModel, TargetRuntime @@ -122,12 +123,20 @@ def sample_inputs( return inputs def get_hub_compile_options( - self, target_runtime: TargetRuntime, other_compile_options: str = "" + self, + target_runtime: TargetRuntime, + other_compile_options: str = "", + device: Optional[Device] = None, ) -> str: compile_options = super().get_hub_compile_options( - target_runtime, other_compile_options + target_runtime, other_compile_options, device ) - return compile_options + " --compute_unit gpu" + if ( + target_runtime == TargetRuntime.TFLITE + and "--compute_unit" not in compile_options + ): + compile_options = compile_options + " --compute_unit gpu" + return compile_options def get_hub_profile_options( self, target_runtime: TargetRuntime, other_profile_options: str = "" @@ -135,7 +144,12 @@ def 
get_hub_profile_options( profile_options = super().get_hub_profile_options( target_runtime, other_profile_options ) - return profile_options + " --compute_unit gpu" + if ( + target_runtime == TargetRuntime.TFLITE + and "--compute_unit" not in profile_options + ): + profile_options = profile_options + " --compute_unit gpu" + return profile_options def _get_qaihm_upfirdn2d_ref(misc: Any, conv2d_gradfix: Callable, upfirdn2d: Any): diff --git a/qai_hub_models/models/stylegan2/perf.yaml b/qai_hub_models/models/stylegan2/perf.yaml index ede5aaaa..123e840c 100644 --- a/qai_hub_models/models/stylegan2/perf.yaml +++ b/qai_hub_models/models/stylegan2/perf.yaml @@ -22,6 +22,7 @@ aggregated: - Samsung Galaxy S24 Ultra - Samsung Galaxy S24+ - Samsung Galaxy Tab S8 + - Snapdragon X Elite CRD - Xiaomi 12 - Xiaomi 12 Pro supported_chipsets: @@ -30,15 +31,16 @@ aggregated: - Snapdragon® 8 Gen 2 - Snapdragon® 8 Gen 3 - Snapdragon® 888 + - Snapdragon® X Elite models: - name: StyleGAN2 performance_metrics: - torchscript_onnx_tflite: - inference_time: 1317970.0 - throughput: 0.7587426117438182 + inference_time: 1588522.0 + throughput: 0.6295159903356705 estimated_peak_memory_range: - min: 1448136704 - max: 2566842336 + min: 1459597312 + max: 2294159464 primary_compute_unit: CPU precision: fp32 layer_info: @@ -46,9 +48,9 @@ models: layers_on_gpu: 78 layers_on_cpu: 402 total_layers: 480 - job_id: jegnlknk5 + job_id: jmg94lkm5 job_status: Passed - torchscript_onnx_ort: + torchscript_onnx_qnn: inference_time: 'null' throughput: 'null' estimated_peak_memory_range: @@ -61,8 +63,23 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 0 - job_id: jep20ewrg + job_id: jz57dykn5 job_status: Failed + torchscript_onnx_ort: + inference_time: 640892.0 + throughput: 1.560325296617839 + estimated_peak_memory_range: + min: 206315520 + max: 337724960 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 454 + layers_on_gpu: 0 + layers_on_cpu: 89 + total_layers: 543 + job_id: jo5mzno7p + job_status: Passed reference_device_info: name: Samsung Galaxy S23 os: '13' @@ -70,13 +87,13 @@ models: os_name: Android manufacturer: Samsung chipset: Snapdragon® 8 Gen 2 - timestamp: '2024-04-23T18:42:33.583802Z' + timestamp: '2024-05-20T16:35:31.209775Z' - torchscript_onnx_tflite: - inference_time: 1012977.0 - throughput: 0.9871892451654875 + inference_time: 1240378.0 + throughput: 0.8062058501521311 estimated_peak_memory_range: - min: 954945536 - max: 980253632 + min: 1137418240 + max: 1169458160 primary_compute_unit: CPU precision: fp32 layer_info: @@ -84,9 +101,9 @@ models: layers_on_gpu: 78 layers_on_cpu: 402 total_layers: 480 - job_id: jopr8w005 + job_id: jnp1847ng job_status: Passed - torchscript_onnx_ort: + torchscript_onnx_qnn: inference_time: 'null' throughput: 'null' estimated_peak_memory_range: @@ -99,8 +116,23 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 0 - job_id: jqpyrmx85 + job_id: jqp4wlm2g job_status: Failed + torchscript_onnx_ort: + inference_time: 508041.0 + throughput: 1.9683450745117028 + estimated_peak_memory_range: + min: 300343296 + max: 1069005056 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 454 + layers_on_gpu: 0 + layers_on_cpu: 89 + total_layers: 543 + job_id: jegne6ojg + job_status: Passed reference_device_info: name: Samsung Galaxy S24 os: '14' @@ -108,13 +140,13 @@ models: os_name: Android manufacturer: Samsung chipset: Snapdragon® 8 Gen 3 - timestamp: '2024-04-23T18:42:33.583878Z' + timestamp: '2024-05-20T16:35:31.209803Z' - 
torchscript_onnx_tflite: - inference_time: 1253049.0 - throughput: 0.7980533881755622 + inference_time: 1643379.0 + throughput: 0.6085023600764036 estimated_peak_memory_range: - min: 941391872 - max: 2204990360 + min: 1178169344 + max: 1181322952 primary_compute_unit: CPU precision: fp32 layer_info: @@ -122,8 +154,23 @@ models: layers_on_gpu: 78 layers_on_cpu: 402 total_layers: 480 - job_id: j0pxn8015 + job_id: jvgdvx86g job_status: Passed + torchscript_onnx_qnn: + inference_time: 'null' + throughput: 'null' + estimated_peak_memory_range: + min: 0 + max: 0 + primary_compute_unit: 'null' + precision: 'null' + layer_info: + layers_on_npu: 0 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 0 + job_id: j0px1k38g + job_status: Failed reference_device_info: name: QCS8550 (Proxy) os: '12' @@ -131,4 +178,42 @@ models: os_name: Android manufacturer: Qualcomm chipset: Qcs8550 - timestamp: '2024-04-23T18:42:33.583943Z' + timestamp: '2024-05-20T16:35:31.209820Z' + - torchscript_onnx_ort: + inference_time: 'null' + throughput: 'null' + estimated_peak_memory_range: + min: 0 + max: 0 + primary_compute_unit: 'null' + precision: 'null' + layer_info: + layers_on_npu: 0 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 0 + job_id: jopryvokg + job_status: Failed + torchscript_onnx_ort_dml_gpu: + inference_time: 'null' + throughput: 'null' + estimated_peak_memory_range: + min: 0 + max: 0 + primary_compute_unit: 'null' + precision: 'null' + layer_info: + layers_on_npu: 0 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 0 + job_id: jep2mk465 + job_status: Failed + reference_device_info: + name: Snapdragon X Elite CRD + os: '11' + form_factor: Compute + os_name: Windows + manufacturer: Qualcomm + chipset: Snapdragon® X Elite + timestamp: '2024-05-20T16:35:31.209838Z' diff --git a/qai_hub_models/models/stylegan2/requirements.txt b/qai_hub_models/models/stylegan2/requirements.txt index 7317e178..2f72dd5e 100644 --- a/qai_hub_models/models/stylegan2/requirements.txt +++ b/qai_hub_models/models/stylegan2/requirements.txt @@ -1 +1 @@ -click==8.0 +click==8.1.7 diff --git a/qai_hub_models/models/swin_base/README.md b/qai_hub_models/models/swin_base/README.md index 8c239d97..e1b53caa 100644 --- a/qai_hub_models/models/swin_base/README.md +++ b/qai_hub_models/models/swin_base/README.md @@ -14,6 +14,8 @@ accross various devices, can be found [here](https://aihub.qualcomm.com/models/s a hosted Qualcomm® device. + + ## Example & Usage diff --git a/qai_hub_models/models/swin_base/export.py b/qai_hub_models/models/swin_base/export.py index 5847165f..fcf1640d 100644 --- a/qai_hub_models/models/swin_base/export.py +++ b/qai_hub_models/models/swin_base/export.py @@ -119,9 +119,16 @@ def export_model( model.eval() source_model = torch.jit.trace(model.to("cpu"), make_torch_inputs(input_spec)) + # Convert outputs from channel last to channel first (preferred I/O format for QNN and TensorFlow Lite) + channel_last_flags = ( + " --force_channel_last_input image_tensor" + if target_runtime != TargetRuntime.ORT + else "" + ) + # 2. 
Compile the model to an on-device asset model_compile_options = model.get_hub_compile_options( - target_runtime, compile_options + " --force_channel_last_input image_tensor" + target_runtime, compile_options + channel_last_flags, hub_device ) print(f"Optimizing model {model_name} to run on-device") submitted_compile_job = hub.submit_compile_job( @@ -159,8 +166,12 @@ def export_model( ) sample_inputs = model.sample_inputs(input_spec) # Convert inputs from channel first to channel last - hub_inputs = transpose_channel_first_to_last( - "image_tensor", sample_inputs, target_runtime + hub_inputs = ( + sample_inputs + if target_runtime == TargetRuntime.ORT + else transpose_channel_first_to_last( + "image_tensor", sample_inputs, target_runtime + ) ) submitted_inference_job = hub.submit_inference_job( model=compile_job.get_target_model(), @@ -197,7 +208,7 @@ def export_model( def main(): warnings.filterwarnings("ignore") - parser = export_parser(model_cls=Model, supports_qnn=False, supports_ort=False) + parser = export_parser(model_cls=Model) args = parser.parse_args() export_model(**vars(args)) diff --git a/qai_hub_models/models/swin_base/perf.yaml b/qai_hub_models/models/swin_base/perf.yaml index c847057a..bedb6e0b 100644 --- a/qai_hub_models/models/swin_base/perf.yaml +++ b/qai_hub_models/models/swin_base/perf.yaml @@ -22,6 +22,7 @@ aggregated: - Samsung Galaxy S24 Ultra - Samsung Galaxy S24+ - Samsung Galaxy Tab S8 + - Snapdragon X Elite CRD - Xiaomi 12 - Xiaomi 12 Pro supported_chipsets: @@ -30,15 +31,16 @@ aggregated: - Snapdragon® 8 Gen 2 - Snapdragon® 8 Gen 3 - Snapdragon® 888 + - Snapdragon® X Elite models: - name: Swin-Base performance_metrics: - torchscript_onnx_tflite: - inference_time: 61028.0 - throughput: 16.38592121649079 + inference_time: 38211.0 + throughput: 26.170474470702153 estimated_peak_memory_range: - min: 106496 - max: 3418200 + min: 0 + max: 7586888 primary_compute_unit: NPU precision: fp16 layer_info: @@ -46,22 +48,37 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 1568 - job_id: j1p801xkg + job_id: jqpyd1q0p + job_status: Passed + torchscript_onnx_qnn: + inference_time: 31640.0 + throughput: 31.605562579013906 + estimated_peak_memory_range: + min: 40960 + max: 49217704 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 1255 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 1255 + job_id: jogkye9vp job_status: Passed torchscript_onnx_ort: - inference_time: 72900.0 - throughput: 13.717421124828531 + inference_time: 64134.0 + throughput: 15.592353509838775 estimated_peak_memory_range: - min: 118784 - max: 421108168 + min: 114688 + max: 476901736 primary_compute_unit: NPU precision: fp16 layer_info: - layers_on_npu: 1 + layers_on_npu: 1163 layers_on_gpu: 0 layers_on_cpu: 0 - total_layers: 1 - job_id: jn5qevyn5 + total_layers: 1163 + job_id: j1p3mjwmg job_status: Passed reference_device_info: name: Samsung Galaxy S23 @@ -70,13 +87,13 @@ models: os_name: Android manufacturer: Samsung chipset: Snapdragon® 8 Gen 2 - timestamp: '2024-04-23T18:42:33.601688Z' + timestamp: '2024-05-20T16:35:31.233759Z' - torchscript_onnx_tflite: - inference_time: 39474.0 - throughput: 25.333130668287986 + inference_time: 26230.0 + throughput: 38.12428516965307 estimated_peak_memory_range: - min: 73728 - max: 512044160 + min: 53248 + max: 498968400 primary_compute_unit: NPU precision: fp16 layer_info: @@ -84,22 +101,37 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 1568 - job_id: jogk784wp + job_id: j2p0rzv0p + job_status: Passed + 
torchscript_onnx_qnn: + inference_time: 21887.0 + throughput: 45.68922191255083 + estimated_peak_memory_range: + min: 0 + max: 408673168 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 1255 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 1255 + job_id: jn5q26me5 job_status: Passed torchscript_onnx_ort: - inference_time: 51726.0 - throughput: 19.332637358388432 + inference_time: 44459.0 + throughput: 22.49263366247554 estimated_peak_memory_range: - min: 651264 - max: 268896832 + min: 626688 + max: 202092528 primary_compute_unit: NPU precision: fp16 layer_info: - layers_on_npu: 1 + layers_on_npu: 1163 layers_on_gpu: 0 layers_on_cpu: 0 - total_layers: 1 - job_id: j1gl6lxjg + total_layers: 1163 + job_id: jwgov2415 job_status: Passed reference_device_info: name: Samsung Galaxy S24 @@ -108,13 +140,13 @@ models: os_name: Android manufacturer: Samsung chipset: Snapdragon® 8 Gen 3 - timestamp: '2024-04-23T18:42:33.601867Z' + timestamp: '2024-05-20T16:35:31.233787Z' - torchscript_onnx_tflite: - inference_time: 61645.0 - throughput: 16.221915808256956 + inference_time: 38283.0 + throughput: 26.121254865083717 estimated_peak_memory_range: - min: 28672 - max: 3282368 + min: 98304 + max: 3696992 primary_compute_unit: NPU precision: fp16 layer_info: @@ -122,7 +154,22 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 1568 - job_id: jw56e9o0g + job_id: j1p87q4q5 + job_status: Passed + torchscript_onnx_qnn: + inference_time: 31310.0 + throughput: 31.938677738741617 + estimated_peak_memory_range: + min: 45056 + max: 48773208 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 1255 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 1255 + job_id: jw561ydnp job_status: Passed reference_device_info: name: QCS8550 (Proxy) @@ -131,4 +178,57 @@ models: os_name: Android manufacturer: Qualcomm chipset: Qcs8550 - timestamp: '2024-04-23T18:42:33.602032Z' + timestamp: '2024-05-20T16:35:31.233806Z' + - torchscript_onnx_qnn: + inference_time: 38967.0 + throughput: 25.662740267405752 + estimated_peak_memory_range: + min: 602112 + max: 602112 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 1255 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 1255 + job_id: j1glkv12p + job_status: Passed + torchscript_onnx_ort: + inference_time: 66278.0 + throughput: 15.087962823259604 + estimated_peak_memory_range: + min: 685105152 + max: 685105152 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 1163 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 1163 + job_id: j1pvw69zg + job_status: Passed + torchscript_onnx_ort_dml_gpu: + inference_time: 'null' + throughput: 'null' + estimated_peak_memory_range: + min: 0 + max: 0 + primary_compute_unit: 'null' + precision: 'null' + layer_info: + layers_on_npu: 0 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 0 + job_id: j7gjlvw1p + job_status: Failed + reference_device_info: + name: Snapdragon X Elite CRD + os: '11' + form_factor: Compute + os_name: Windows + manufacturer: Qualcomm + chipset: Snapdragon® X Elite + timestamp: '2024-05-20T16:35:31.233833Z' diff --git a/qai_hub_models/models/swin_small/README.md b/qai_hub_models/models/swin_small/README.md index eae34fe2..01c8a31a 100644 --- a/qai_hub_models/models/swin_small/README.md +++ b/qai_hub_models/models/swin_small/README.md @@ -14,6 +14,8 @@ accross various devices, can be found [here](https://aihub.qualcomm.com/models/s a hosted Qualcomm® device. 
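The export-script updates in this diff (swin_base above, swin_small just below, and the earlier vision models) all apply the same runtime-dependent branching: the channel-last compile flag and the NCHW-to-NHWC input transpose are used only when the target is not ONNX Runtime. A condensed, standalone sketch of that pattern is shown below; the `TargetRuntime` import path matches this diff, while the `transpose_channel_first_to_last` import path is an assumption.

```python
# Condensed sketch of the runtime-dependent channel-layout handling added in these
# export scripts; the transpose helper's import path is assumed for illustration.
from qai_hub_models.utils.base_model import TargetRuntime
from qai_hub_models.utils.qai_hub_helpers import transpose_channel_first_to_last  # assumed path

def channel_last_compile_flags(target_runtime, input_name="image_tensor"):
    # ONNX Runtime keeps channel-first (NCHW) I/O, so the flag is only appended
    # for TFLite / QNN targets.
    if target_runtime == TargetRuntime.ORT:
        return ""
    return f" --force_channel_last_input {input_name}"

def prepare_hub_inputs(target_runtime, sample_inputs, input_name="image_tensor"):
    # Likewise, sample inputs are transposed to channel-last only for non-ORT runtimes.
    if target_runtime == TargetRuntime.ORT:
        return sample_inputs
    return transpose_channel_first_to_last(input_name, sample_inputs, target_runtime)
```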
+ + ## Example & Usage diff --git a/qai_hub_models/models/swin_small/export.py b/qai_hub_models/models/swin_small/export.py index f7a264ad..4215098d 100644 --- a/qai_hub_models/models/swin_small/export.py +++ b/qai_hub_models/models/swin_small/export.py @@ -119,9 +119,16 @@ def export_model( model.eval() source_model = torch.jit.trace(model.to("cpu"), make_torch_inputs(input_spec)) + # Convert outputs from channel last to channel first (preferred I/O format for QNN and TensorFlow Lite) + channel_last_flags = ( + " --force_channel_last_input image_tensor" + if target_runtime != TargetRuntime.ORT + else "" + ) + # 2. Compile the model to an on-device asset model_compile_options = model.get_hub_compile_options( - target_runtime, compile_options + " --force_channel_last_input image_tensor" + target_runtime, compile_options + channel_last_flags, hub_device ) print(f"Optimizing model {model_name} to run on-device") submitted_compile_job = hub.submit_compile_job( @@ -159,8 +166,12 @@ def export_model( ) sample_inputs = model.sample_inputs(input_spec) # Convert inputs from channel first to channel last - hub_inputs = transpose_channel_first_to_last( - "image_tensor", sample_inputs, target_runtime + hub_inputs = ( + sample_inputs + if target_runtime == TargetRuntime.ORT + else transpose_channel_first_to_last( + "image_tensor", sample_inputs, target_runtime + ) ) submitted_inference_job = hub.submit_inference_job( model=compile_job.get_target_model(), @@ -197,7 +208,7 @@ def export_model( def main(): warnings.filterwarnings("ignore") - parser = export_parser(model_cls=Model, supports_qnn=False, supports_ort=False) + parser = export_parser(model_cls=Model) args = parser.parse_args() export_model(**vars(args)) diff --git a/qai_hub_models/models/swin_small/perf.yaml b/qai_hub_models/models/swin_small/perf.yaml index 41e018e7..fc750018 100644 --- a/qai_hub_models/models/swin_small/perf.yaml +++ b/qai_hub_models/models/swin_small/perf.yaml @@ -22,6 +22,7 @@ aggregated: - Samsung Galaxy S24 Ultra - Samsung Galaxy S24+ - Samsung Galaxy Tab S8 + - Snapdragon X Elite CRD - Xiaomi 12 - Xiaomi 12 Pro supported_chipsets: @@ -30,15 +31,16 @@ aggregated: - Snapdragon® 8 Gen 2 - Snapdragon® 8 Gen 3 - Snapdragon® 888 + - Snapdragon® X Elite models: - name: Swin-Small performance_metrics: - torchscript_onnx_tflite: - inference_time: 46059.0 - throughput: 21.711283353959054 + inference_time: 29128.0 + throughput: 34.33122768470201 estimated_peak_memory_range: - min: 28672 - max: 8907776 + min: 36864 + max: 2408576 primary_compute_unit: NPU precision: fp16 layer_info: @@ -46,22 +48,37 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 1563 - job_id: j1p3v693g + job_id: jlpevdl85 + job_status: Passed + torchscript_onnx_qnn: + inference_time: 23681.0 + throughput: 42.22794645496389 + estimated_peak_memory_range: + min: 16384 + max: 45345336 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 1246 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 1246 + job_id: jmg94lxm5 job_status: Passed torchscript_onnx_ort: - inference_time: 61104.0 - throughput: 16.365540717465304 + inference_time: 56992.0 + throughput: 17.54632229084784 estimated_peak_memory_range: - min: 12288 - max: 250842792 + min: 40960 + max: 225148824 primary_compute_unit: NPU precision: fp16 layer_info: - layers_on_npu: 1 + layers_on_npu: 1158 layers_on_gpu: 0 layers_on_cpu: 0 - total_layers: 1 - job_id: j1pv07lk5 + total_layers: 1158 + job_id: jmg94lxq5 job_status: Passed reference_device_info: name: Samsung Galaxy S23 
@@ -70,13 +87,13 @@ models: os_name: Android manufacturer: Samsung chipset: Snapdragon® 8 Gen 2 - timestamp: '2024-04-23T18:42:33.619812Z' + timestamp: '2024-05-20T16:35:31.258354Z' - torchscript_onnx_tflite: - inference_time: 29579.0 - throughput: 33.80776902532202 + inference_time: 19660.0 + throughput: 50.8646998982706 estimated_peak_memory_range: - min: 45056 - max: 479603376 + min: 49152 + max: 467994720 primary_compute_unit: NPU precision: fp16 layer_info: @@ -84,22 +101,37 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 1563 - job_id: jwgok8rqp + job_id: jygz7344p + job_status: Passed + torchscript_onnx_qnn: + inference_time: 16138.0 + throughput: 61.96554715578139 + estimated_peak_memory_range: + min: 0 + max: 376584720 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 1246 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 1246 + job_id: jnp184vng job_status: Passed torchscript_onnx_ort: - inference_time: 43618.0 - throughput: 22.926314824155167 + inference_time: 39508.0 + throughput: 25.311329351017516 estimated_peak_memory_range: - min: 696320 - max: 646499600 + min: 88776704 + max: 260548080 primary_compute_unit: NPU precision: fp16 layer_info: - layers_on_npu: 1 + layers_on_npu: 1158 layers_on_gpu: 0 layers_on_cpu: 0 - total_layers: 1 - job_id: j7gjzqrv5 + total_layers: 1158 + job_id: jnp184vkg job_status: Passed reference_device_info: name: Samsung Galaxy S24 @@ -108,13 +140,13 @@ models: os_name: Android manufacturer: Samsung chipset: Snapdragon® 8 Gen 3 - timestamp: '2024-04-23T18:42:33.619995Z' + timestamp: '2024-05-20T16:35:31.258381Z' - torchscript_onnx_tflite: - inference_time: 45406.0 - throughput: 22.023521120556754 + inference_time: 29352.0 + throughput: 34.06922867266285 estimated_peak_memory_range: - min: 94208 - max: 3127248 + min: 20480 + max: 8413168 primary_compute_unit: NPU precision: fp16 layer_info: @@ -122,7 +154,22 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 1563 - job_id: jz5701nlg + job_id: jz5w9e14p + job_status: Passed + torchscript_onnx_qnn: + inference_time: 23705.0 + throughput: 42.185192997257964 + estimated_peak_memory_range: + min: 53248 + max: 45854248 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 1246 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 1246 + job_id: jz5w9e1zp job_status: Passed reference_device_info: name: QCS8550 (Proxy) @@ -131,4 +178,57 @@ models: os_name: Android manufacturer: Qualcomm chipset: Qcs8550 - timestamp: '2024-04-23T18:42:33.620167Z' + timestamp: '2024-05-20T16:35:31.258398Z' + - torchscript_onnx_qnn: + inference_time: 23881.0 + throughput: 41.87429337129936 + estimated_peak_memory_range: + min: 602112 + max: 602112 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 1246 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 1246 + job_id: jvgdvxz6g + job_status: Passed + torchscript_onnx_ort: + inference_time: 59131.0 + throughput: 16.91160305085319 + estimated_peak_memory_range: + min: 473104384 + max: 473104384 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 1158 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 1158 + job_id: jvgdvxzkg + job_status: Passed + torchscript_onnx_ort_dml_gpu: + inference_time: 449448.0 + throughput: 2.2249514960573857 + estimated_peak_memory_range: + min: 1191936 + max: 1191936 + primary_compute_unit: CPU + precision: fp32 + layer_info: + layers_on_npu: 0 + layers_on_gpu: 0 + layers_on_cpu: 1050 + total_layers: 1050 + job_id: jz57dy7q5 
+ job_status: Passed + reference_device_info: + name: Snapdragon X Elite CRD + os: '11' + form_factor: Compute + os_name: Windows + manufacturer: Qualcomm + chipset: Snapdragon® X Elite + timestamp: '2024-05-20T16:35:31.258421Z' diff --git a/qai_hub_models/models/swin_tiny/README.md b/qai_hub_models/models/swin_tiny/README.md index 25b9d845..8549a629 100644 --- a/qai_hub_models/models/swin_tiny/README.md +++ b/qai_hub_models/models/swin_tiny/README.md @@ -14,6 +14,8 @@ accross various devices, can be found [here](https://aihub.qualcomm.com/models/s a hosted Qualcomm® device. + + ## Example & Usage diff --git a/qai_hub_models/models/swin_tiny/export.py b/qai_hub_models/models/swin_tiny/export.py index ae43d850..05142b6c 100644 --- a/qai_hub_models/models/swin_tiny/export.py +++ b/qai_hub_models/models/swin_tiny/export.py @@ -119,9 +119,16 @@ def export_model( model.eval() source_model = torch.jit.trace(model.to("cpu"), make_torch_inputs(input_spec)) + # Convert outputs from channel last to channel first (preferred I/O format for QNN and TensorFlow Lite) + channel_last_flags = ( + " --force_channel_last_input image_tensor" + if target_runtime != TargetRuntime.ORT + else "" + ) + # 2. Compile the model to an on-device asset model_compile_options = model.get_hub_compile_options( - target_runtime, compile_options + " --force_channel_last_input image_tensor" + target_runtime, compile_options + channel_last_flags, hub_device ) print(f"Optimizing model {model_name} to run on-device") submitted_compile_job = hub.submit_compile_job( @@ -159,8 +166,12 @@ def export_model( ) sample_inputs = model.sample_inputs(input_spec) # Convert inputs from channel first to channel last - hub_inputs = transpose_channel_first_to_last( - "image_tensor", sample_inputs, target_runtime + hub_inputs = ( + sample_inputs + if target_runtime == TargetRuntime.ORT + else transpose_channel_first_to_last( + "image_tensor", sample_inputs, target_runtime + ) ) submitted_inference_job = hub.submit_inference_job( model=compile_job.get_target_model(), @@ -197,7 +208,7 @@ def export_model( def main(): warnings.filterwarnings("ignore") - parser = export_parser(model_cls=Model, supports_qnn=False, supports_ort=False) + parser = export_parser(model_cls=Model) args = parser.parse_args() export_model(**vars(args)) diff --git a/qai_hub_models/models/swin_tiny/perf.yaml b/qai_hub_models/models/swin_tiny/perf.yaml index cc35a05a..9a0129a2 100644 --- a/qai_hub_models/models/swin_tiny/perf.yaml +++ b/qai_hub_models/models/swin_tiny/perf.yaml @@ -22,6 +22,7 @@ aggregated: - Samsung Galaxy S24 Ultra - Samsung Galaxy S24+ - Samsung Galaxy Tab S8 + - Snapdragon X Elite CRD - Xiaomi 12 - Xiaomi 12 Pro supported_chipsets: @@ -30,15 +31,16 @@ aggregated: - Snapdragon® 8 Gen 2 - Snapdragon® 8 Gen 3 - Snapdragon® 888 + - Snapdragon® X Elite models: - name: Swin-Tiny performance_metrics: - torchscript_onnx_tflite: - inference_time: 28481.0 - throughput: 35.11112671605632 + inference_time: 17594.0 + throughput: 56.83755825849722 estimated_peak_memory_range: - min: 217088 - max: 74292680 + min: 0 + max: 2690144 primary_compute_unit: NPU precision: fp16 layer_info: @@ -46,22 +48,37 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 837 - job_id: jygzonlo5 + job_id: jqp4wl9qg + job_status: Passed + torchscript_onnx_qnn: + inference_time: 15006.0 + throughput: 66.6400106624017 + estimated_peak_memory_range: + min: 0 + max: 28760920 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 700 + layers_on_gpu: 0 + layers_on_cpu: 0 + 
total_layers: 700 + job_id: jegne6kvg job_status: Passed torchscript_onnx_ort: - inference_time: 27887.0 - throughput: 35.85900240255316 + inference_time: 34124.0 + throughput: 29.304888055327627 estimated_peak_memory_range: - min: 16384 - max: 164109776 + min: 65536 + max: 157394912 primary_compute_unit: NPU precision: fp16 layer_info: - layers_on_npu: 1 + layers_on_npu: 624 layers_on_gpu: 0 layers_on_cpu: 0 - total_layers: 1 - job_id: jmg9jdzw5 + total_layers: 624 + job_id: j2p0rz62p job_status: Passed reference_device_info: name: Samsung Galaxy S23 @@ -70,13 +87,13 @@ models: os_name: Android manufacturer: Samsung chipset: Snapdragon® 8 Gen 2 - timestamp: '2024-04-23T18:42:33.637970Z' + timestamp: '2024-05-20T16:35:31.283034Z' - torchscript_onnx_tflite: - inference_time: 18310.0 - throughput: 54.614964500273075 + inference_time: 11804.0 + throughput: 84.71704506946797 estimated_peak_memory_range: min: 40960 - max: 293649808 + max: 289709760 primary_compute_unit: NPU precision: fp16 layer_info: @@ -84,22 +101,37 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 837 - job_id: jz5w24l35 + job_id: j0px1kdjg job_status: Passed + torchscript_onnx_qnn: + inference_time: 'null' + throughput: 'null' + estimated_peak_memory_range: + min: 0 + max: 0 + primary_compute_unit: 'null' + precision: 'null' + layer_info: + layers_on_npu: 0 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 0 + job_id: jopryvwvg + job_status: Failed torchscript_onnx_ort: - inference_time: 19785.0 - throughput: 50.543340914834474 + inference_time: 23681.0 + throughput: 42.22794645496389 estimated_peak_memory_range: - min: 634880 - max: 162638432 + min: 28672 + max: 109585264 primary_compute_unit: NPU precision: fp16 layer_info: - layers_on_npu: 1 + layers_on_npu: 624 layers_on_gpu: 0 layers_on_cpu: 0 - total_layers: 1 - job_id: jnp1y6n8p + total_layers: 624 + job_id: j1p87q1z5 job_status: Passed reference_device_info: name: Samsung Galaxy S24 @@ -108,13 +140,13 @@ models: os_name: Android manufacturer: Samsung chipset: Snapdragon® 8 Gen 3 - timestamp: '2024-04-23T18:42:33.638080Z' + timestamp: '2024-05-20T16:35:31.283062Z' - torchscript_onnx_tflite: - inference_time: 28405.0 - throughput: 35.205069530012324 + inference_time: 17554.0 + throughput: 56.96707303178763 estimated_peak_memory_range: - min: 57344 - max: 3112384 + min: 28672 + max: 2913592 primary_compute_unit: NPU precision: fp16 layer_info: @@ -122,7 +154,22 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 837 - job_id: jep20qd4g + job_id: jo5mzndyp + job_status: Passed + torchscript_onnx_qnn: + inference_time: 14942.0 + throughput: 66.9254450542096 + estimated_peak_memory_range: + min: 225280 + max: 27331792 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 700 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 700 + job_id: jqpyd1mrp job_status: Passed reference_device_info: name: QCS8550 (Proxy) @@ -131,4 +178,57 @@ models: os_name: Android manufacturer: Qualcomm chipset: Qcs8550 - timestamp: '2024-04-23T18:42:33.638179Z' + timestamp: '2024-05-20T16:35:31.283079Z' + - torchscript_onnx_qnn: + inference_time: 14251.0 + throughput: 70.17051434987019 + estimated_peak_memory_range: + min: 602112 + max: 602112 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 700 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 700 + job_id: jep2mkex5 + job_status: Passed + torchscript_onnx_ort: + inference_time: 35507.0 + throughput: 28.16346072605402 + estimated_peak_memory_range: + min: 241229824 + 
max: 241229824 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 624 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 624 + job_id: jogkye8yp + job_status: Passed + torchscript_onnx_ort_dml_gpu: + inference_time: 17912.0 + throughput: 55.828494863778474 + estimated_peak_memory_range: + min: 1433600 + max: 1433600 + primary_compute_unit: CPU + precision: fp32 + layer_info: + layers_on_npu: 0 + layers_on_gpu: 0 + layers_on_cpu: 564 + total_layers: 564 + job_id: jn5q26v75 + job_status: Passed + reference_device_info: + name: Snapdragon X Elite CRD + os: '11' + form_factor: Compute + os_name: Windows + manufacturer: Qualcomm + chipset: Snapdragon® X Elite + timestamp: '2024-05-20T16:35:31.283101Z' diff --git a/qai_hub_models/models/trocr/README.md b/qai_hub_models/models/trocr/README.md index 9343aea7..8e1b963a 100644 --- a/qai_hub_models/models/trocr/README.md +++ b/qai_hub_models/models/trocr/README.md @@ -14,6 +14,8 @@ accross various devices, can be found [here](https://aihub.qualcomm.com/models/t a hosted Qualcomm® device. + + ## Example & Usage Install the package via pip: diff --git a/qai_hub_models/models/trocr/export.py b/qai_hub_models/models/trocr/export.py index b005f639..d0528db8 100644 --- a/qai_hub_models/models/trocr/export.py +++ b/qai_hub_models/models/trocr/export.py @@ -134,7 +134,7 @@ def export_model( # 2. Compile the models to an on-device asset model_compile_options = component.get_hub_compile_options( - target_runtime, compile_options + target_runtime, compile_options, hub_device ) print(f"Optimizing model {component_name} to run on-device") submitted_compile_job = hub.submit_compile_job( @@ -227,10 +227,7 @@ def export_model( def main(): warnings.filterwarnings("ignore") parser = export_parser( - model_cls=Model, - components=ALL_COMPONENTS, - supports_qnn=False, - supports_ort=False, + model_cls=Model, components=ALL_COMPONENTS, supports_qnn=False ) args = parser.parse_args() export_model(**vars(args)) diff --git a/qai_hub_models/models/trocr/perf.yaml b/qai_hub_models/models/trocr/perf.yaml index 9538686c..2bf9904d 100644 --- a/qai_hub_models/models/trocr/perf.yaml +++ b/qai_hub_models/models/trocr/perf.yaml @@ -22,6 +22,7 @@ aggregated: - Samsung Galaxy S24 Ultra - Samsung Galaxy S24+ - Samsung Galaxy Tab S8 + - Snapdragon X Elite CRD - Xiaomi 12 - Xiaomi 12 Pro supported_chipsets: @@ -30,15 +31,16 @@ aggregated: - Snapdragon® 8 Gen 2 - Snapdragon® 8 Gen 3 - Snapdragon® 888 + - Snapdragon® X Elite models: - name: TrOCREncoder performance_metrics: - torchscript_onnx_tflite: - inference_time: 216492.0 - throughput: 4.619108327328492 + inference_time: 149663.0 + throughput: 6.68167817028925 estimated_peak_memory_range: - min: 7274496 - max: 10306224 + min: 7266304 + max: 10722008 primary_compute_unit: NPU precision: fp16 layer_info: @@ -46,22 +48,37 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 592 - job_id: jz5709evg + job_id: j1glkvlep + job_status: Passed + torchscript_onnx_qnn: + inference_time: 123961.0 + throughput: 8.067053347423787 + estimated_peak_memory_range: + min: 32768 + max: 24931512 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 469 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 469 + job_id: jlpevdy75 job_status: Passed torchscript_onnx_ort: - inference_time: 189041.0 - throughput: 5.289857755724949 + inference_time: 111209.0 + throughput: 8.992077979300236 estimated_peak_memory_range: - min: 69632 - max: 125141888 + min: 143360 + max: 114159672 primary_compute_unit: NPU 
precision: fp16 layer_info: - layers_on_npu: 1 + layers_on_npu: 396 layers_on_gpu: 0 layers_on_cpu: 0 - total_layers: 1 - job_id: jegnlkzk5 + total_layers: 396 + job_id: jz57dy9q5 job_status: Passed reference_device_info: name: Samsung Galaxy S23 @@ -70,13 +87,13 @@ models: os_name: Android manufacturer: Samsung chipset: Snapdragon® 8 Gen 2 - timestamp: '2024-04-23T18:42:33.655937Z' + timestamp: '2024-05-20T16:35:31.307706Z' - torchscript_onnx_tflite: - inference_time: 162590.0 - throughput: 6.1504397564425854 + inference_time: 111478.0 + throughput: 8.970379805880981 estimated_peak_memory_range: - min: 5963776 - max: 327025904 + min: 6787072 + max: 349351296 primary_compute_unit: NPU precision: fp16 layer_info: @@ -84,22 +101,37 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 592 - job_id: j0pxnxl35 + job_id: j1p3mj6xg + job_status: Passed + torchscript_onnx_qnn: + inference_time: 92809.0 + throughput: 10.77481709747977 + estimated_peak_memory_range: + min: 1785856 + max: 169310384 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 469 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 469 + job_id: jz5w9e4zp job_status: Passed torchscript_onnx_ort: - inference_time: 143879.0 - throughput: 6.95028461415495 + inference_time: 84299.0 + throughput: 11.86253692214617 estimated_peak_memory_range: - min: 14708736 - max: 90842000 + min: 11382784 + max: 88625792 primary_compute_unit: NPU precision: fp16 layer_info: - layers_on_npu: 1 + layers_on_npu: 396 layers_on_gpu: 0 layers_on_cpu: 0 - total_layers: 1 - job_id: jep20errg + total_layers: 396 + job_id: j0px1kxjg job_status: Passed reference_device_info: name: Samsung Galaxy S24 @@ -108,13 +140,13 @@ models: os_name: Android manufacturer: Samsung chipset: Snapdragon® 8 Gen 3 - timestamp: '2024-04-23T18:42:33.656015Z' + timestamp: '2024-05-20T16:35:31.307733Z' - torchscript_onnx_tflite: - inference_time: 216411.0 - throughput: 4.620837203284491 + inference_time: 149781.0 + throughput: 6.676414231444576 estimated_peak_memory_range: min: 7274496 - max: 10398120 + max: 10723104 primary_compute_unit: NPU precision: fp16 layer_info: @@ -122,7 +154,22 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 592 - job_id: jlpee0x1p + job_id: j1pvw677g + job_status: Passed + torchscript_onnx_qnn: + inference_time: 123679.0 + throughput: 8.085447003937613 + estimated_peak_memory_range: + min: 1929216 + max: 24597888 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 469 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 469 + job_id: jnp1846kg job_status: Passed reference_device_info: name: QCS8550 (Proxy) @@ -131,15 +178,53 @@ models: os_name: Android manufacturer: Qualcomm chipset: Qcs8550 - timestamp: '2024-04-23T18:42:33.656082Z' + timestamp: '2024-05-20T16:35:31.307750Z' + - torchscript_onnx_ort: + inference_time: 111834.0 + throughput: 8.941824489868912 + estimated_peak_memory_range: + min: 34922496 + max: 34922496 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 396 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 396 + job_id: jegne6nvg + job_status: Passed + torchscript_onnx_ort_dml_gpu: + inference_time: 39277.0 + throughput: 25.46019298826285 + estimated_peak_memory_range: + min: 2703360 + max: 2703360 + primary_compute_unit: CPU + precision: fp32 + layer_info: + layers_on_npu: 0 + layers_on_gpu: 0 + layers_on_cpu: 365 + total_layers: 365 + job_id: jep2mkwx5 + job_status: Passed + reference_device_info: + name: Snapdragon X Elite CRD + os: 
'11' + form_factor: Compute + os_name: Windows + manufacturer: Qualcomm + chipset: Snapdragon® X Elite + timestamp: '2024-05-20T16:35:31.307768Z' - name: TrOCRDecoder performance_metrics: - torchscript_onnx_tflite: - inference_time: 2684.0 - throughput: 372.5782414307005 + inference_time: 2717.0 + throughput: 368.052999631947 estimated_peak_memory_range: - min: 16384 - max: 2557552 + min: 20480 + max: 2492240 primary_compute_unit: NPU precision: fp16 layer_info: @@ -147,22 +232,37 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 370 - job_id: jqp4k3y8g + job_id: jw561ywvp job_status: Passed + torchscript_onnx_qnn: + inference_time: 'null' + throughput: 'null' + estimated_peak_memory_range: + min: 0 + max: 0 + primary_compute_unit: 'null' + precision: 'null' + layer_info: + layers_on_npu: 0 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 0 + job_id: jygz73nzp + job_status: Failed torchscript_onnx_ort: - inference_time: 2944.0 - throughput: 339.67391304347825 + inference_time: 2875.0 + throughput: 347.82608695652175 estimated_peak_memory_range: - min: 28672 - max: 392358928 + min: 0 + max: 575282800 primary_compute_unit: NPU precision: fp16 layer_info: - layers_on_npu: 1 + layers_on_npu: 352 layers_on_gpu: 0 layers_on_cpu: 1 - total_layers: 2 - job_id: jopr8wl05 + total_layers: 353 + job_id: jqp4wl3qg job_status: Passed reference_device_info: name: Samsung Galaxy S23 @@ -171,13 +271,13 @@ models: os_name: Android manufacturer: Samsung chipset: Snapdragon® 8 Gen 2 - timestamp: '2024-04-23T18:42:33.656134Z' + timestamp: '2024-05-20T16:35:31.307792Z' - torchscript_onnx_tflite: - inference_time: 1948.0 - throughput: 513.347022587269 + inference_time: 1998.0 + throughput: 500.5005005005005 estimated_peak_memory_range: min: 12288 - max: 192910976 + max: 192263456 primary_compute_unit: NPU precision: fp16 layer_info: @@ -185,22 +285,37 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 370 - job_id: jo5mq80dp + job_id: jwgov2845 job_status: Passed + torchscript_onnx_qnn: + inference_time: 'null' + throughput: 'null' + estimated_peak_memory_range: + min: 0 + max: 0 + primary_compute_unit: 'null' + precision: 'null' + layer_info: + layers_on_npu: 0 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 0 + job_id: jmg94ldq5 + job_status: Failed torchscript_onnx_ort: - inference_time: 2482.0 - throughput: 402.90088638195004 + inference_time: 2139.0 + throughput: 467.50818139317437 estimated_peak_memory_range: min: 0 - max: 36159696 + max: 45855536 primary_compute_unit: NPU precision: fp16 layer_info: - layers_on_npu: 1 + layers_on_npu: 352 layers_on_gpu: 0 layers_on_cpu: 1 - total_layers: 2 - job_id: jqpyrmo85 + total_layers: 353 + job_id: jo5mzn8yp job_status: Passed reference_device_info: name: Samsung Galaxy S24 @@ -209,13 +324,13 @@ models: os_name: Android manufacturer: Samsung chipset: Snapdragon® 8 Gen 3 - timestamp: '2024-04-23T18:42:33.656185Z' + timestamp: '2024-05-20T16:35:31.307816Z' - torchscript_onnx_tflite: - inference_time: 2691.0 - throughput: 371.6090672612412 + inference_time: 2735.0 + throughput: 365.6307129798903 estimated_peak_memory_range: min: 16384 - max: 2038272 + max: 2426968 primary_compute_unit: NPU precision: fp16 layer_info: @@ -223,8 +338,23 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 370 - job_id: jygzoqyk5 + job_id: j7gjlvq7p job_status: Passed + torchscript_onnx_qnn: + inference_time: 'null' + throughput: 'null' + estimated_peak_memory_range: + min: 0 + max: 0 + primary_compute_unit: 'null' + precision: 'null' + layer_info: + 
layers_on_npu: 0 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 0 + job_id: jvgdvx2kg + job_status: Failed reference_device_info: name: QCS8550 (Proxy) os: '12' @@ -232,4 +362,42 @@ models: os_name: Android manufacturer: Qualcomm chipset: Qcs8550 - timestamp: '2024-04-23T18:42:33.656227Z' + timestamp: '2024-05-20T16:35:31.307832Z' + - torchscript_onnx_ort: + inference_time: 2647.0 + throughput: 377.7861730260673 + estimated_peak_memory_range: + min: 355991552 + max: 355991552 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 352 + layers_on_gpu: 0 + layers_on_cpu: 1 + total_layers: 353 + job_id: jopryv0vg + job_status: Passed + torchscript_onnx_ort_dml_gpu: + inference_time: 1426.0 + throughput: 701.2622720897616 + estimated_peak_memory_range: + min: 7168000 + max: 7168000 + primary_compute_unit: CPU + precision: fp32 + layer_info: + layers_on_npu: 0 + layers_on_gpu: 0 + layers_on_cpu: 300 + total_layers: 300 + job_id: jqpyd1xrp + job_status: Passed + reference_device_info: + name: Snapdragon X Elite CRD + os: '11' + form_factor: Compute + os_name: Windows + manufacturer: Qualcomm + chipset: Snapdragon® X Elite + timestamp: '2024-05-20T16:35:31.307850Z' diff --git a/qai_hub_models/models/unet_segmentation/README.md b/qai_hub_models/models/unet_segmentation/README.md index 03162771..78dfce4f 100644 --- a/qai_hub_models/models/unet_segmentation/README.md +++ b/qai_hub_models/models/unet_segmentation/README.md @@ -14,6 +14,8 @@ accross various devices, can be found [here](https://aihub.qualcomm.com/models/u a hosted Qualcomm® device. + + ## Example & Usage diff --git a/qai_hub_models/models/unet_segmentation/export.py b/qai_hub_models/models/unet_segmentation/export.py index be9c6471..2ecf01e9 100644 --- a/qai_hub_models/models/unet_segmentation/export.py +++ b/qai_hub_models/models/unet_segmentation/export.py @@ -120,12 +120,16 @@ def export_model( model.eval() source_model = torch.jit.trace(model.to("cpu"), make_torch_inputs(input_spec)) + # Convert outputs from channel last to channel first (preferred I/O format for QNN and TensorFlow Lite) + channel_last_flags = ( + " --force_channel_last_input image" + " --force_channel_last_output output_0" + if target_runtime != TargetRuntime.ORT + else "" + ) + # 2. 
Compile the model to an on-device asset model_compile_options = model.get_hub_compile_options( - target_runtime, - compile_options - + " --force_channel_last_input image" - + " --force_channel_last_output output_0", + target_runtime, compile_options + channel_last_flags, hub_device ) print(f"Optimizing model {model_name} to run on-device") submitted_compile_job = hub.submit_compile_job( @@ -163,8 +167,10 @@ def export_model( ) sample_inputs = model.sample_inputs(input_spec) # Convert inputs from channel first to channel last - hub_inputs = transpose_channel_first_to_last( - "image", sample_inputs, target_runtime + hub_inputs = ( + sample_inputs + if target_runtime == TargetRuntime.ORT + else transpose_channel_first_to_last("image", sample_inputs, target_runtime) ) submitted_inference_job = hub.submit_inference_job( model=compile_job.get_target_model(), @@ -192,8 +198,12 @@ def export_model( assert inference_job is not None and inference_job.wait().success inference_result: hub.client.DatasetEntries = inference_job.download_output_data() # type: ignore # Convert outputs from channel last to channel first - inference_result = transpose_channel_last_to_first( - "output_0", inference_result, target_runtime + inference_result = ( + inference_result + if target_runtime == TargetRuntime.ORT + else transpose_channel_last_to_first( + "output_0", inference_result, target_runtime + ) ) print_inference_metrics(inference_job, inference_result, torch_out) @@ -205,7 +215,7 @@ def export_model( def main(): warnings.filterwarnings("ignore") - parser = export_parser(model_cls=Model, supports_ort=False) + parser = export_parser(model_cls=Model) args = parser.parse_args() export_model(**vars(args)) diff --git a/qai_hub_models/models/unet_segmentation/perf.yaml b/qai_hub_models/models/unet_segmentation/perf.yaml index c45f70ad..dfcd3e11 100644 --- a/qai_hub_models/models/unet_segmentation/perf.yaml +++ b/qai_hub_models/models/unet_segmentation/perf.yaml @@ -22,6 +22,7 @@ aggregated: - Samsung Galaxy S24 Ultra - Samsung Galaxy S24+ - Samsung Galaxy Tab S8 + - Snapdragon X Elite CRD - Xiaomi 12 - Xiaomi 12 Pro supported_chipsets: @@ -30,15 +31,16 @@ aggregated: - Snapdragon® 8 Gen 2 - Snapdragon® 8 Gen 3 - Snapdragon® 888 + - Snapdragon® X Elite models: - name: Unet-Segmentation performance_metrics: - torchscript_onnx_tflite: - inference_time: 155616.0 - throughput: 6.4260744396463085 + inference_time: 161691.0 + throughput: 6.184636126933472 estimated_peak_memory_range: - min: 6692864 - max: 229373376 + min: 16384 + max: 237098920 primary_compute_unit: NPU precision: fp16 layer_info: @@ -46,14 +48,14 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 31 - job_id: jogk782wp + job_id: j2p0rzj2p job_status: Passed torchscript_onnx_qnn: - inference_time: 150609.0 - throughput: 6.63970944631463 + inference_time: 149965.0 + throughput: 6.668222585269897 estimated_peak_memory_range: - min: 9854976 - max: 34064640 + min: 9981952 + max: 30872736 primary_compute_unit: NPU precision: fp16 layer_info: @@ -61,22 +63,22 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 51 - job_id: j1gl6lyjg + job_id: jn5q26y75 job_status: Passed torchscript_onnx_ort: - inference_time: 150132.0 - throughput: 6.6608051581275145 + inference_time: 157701.0 + throughput: 6.341113880064172 estimated_peak_memory_range: - min: 13246464 - max: 147066768 + min: 13557760 + max: 158096808 primary_compute_unit: NPU precision: fp16 layer_info: - layers_on_npu: 1 + layers_on_npu: 53 layers_on_gpu: 0 layers_on_cpu: 0 - total_layers: 1 - 
job_id: j1p3v6z3g + total_layers: 53 + job_id: jwgov2r45 job_status: Passed reference_device_info: name: Samsung Galaxy S23 @@ -85,13 +87,13 @@ models: os_name: Android manufacturer: Samsung chipset: Snapdragon® 8 Gen 2 - timestamp: '2024-04-23T18:42:33.687955Z' + timestamp: '2024-05-20T16:35:31.352363Z' - torchscript_onnx_tflite: - inference_time: 112866.0 - throughput: 8.860064146864424 + inference_time: 115442.0 + throughput: 8.662358586996067 estimated_peak_memory_range: - min: 5500928 - max: 359682512 + min: 4841472 + max: 335577584 primary_compute_unit: NPU precision: fp16 layer_info: @@ -99,14 +101,14 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 31 - job_id: jn5qevln5 + job_id: j1p87qxz5 job_status: Passed torchscript_onnx_qnn: - inference_time: 111273.0 - throughput: 8.98690607784458 + inference_time: 109130.0 + throughput: 9.163383121048291 estimated_peak_memory_range: - min: 9814016 - max: 110733232 + min: 9969664 + max: 88942624 primary_compute_unit: NPU precision: fp16 layer_info: @@ -114,22 +116,22 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 51 - job_id: jw56ew86g + job_id: j1glkvxep job_status: Passed torchscript_onnx_ort: - inference_time: 110582.0 - throughput: 9.043063066321825 + inference_time: 118569.0 + throughput: 8.433907682446508 estimated_peak_memory_range: - min: 16162816 - max: 113694432 + min: 22605824 + max: 100595248 primary_compute_unit: NPU precision: fp16 layer_info: - layers_on_npu: 1 + layers_on_npu: 53 layers_on_gpu: 0 layers_on_cpu: 0 - total_layers: 1 - job_id: jwgok8lqp + total_layers: 53 + job_id: j1pvw6d7g job_status: Passed reference_device_info: name: Samsung Galaxy S24 @@ -138,13 +140,13 @@ models: os_name: Android manufacturer: Samsung chipset: Snapdragon® 8 Gen 3 - timestamp: '2024-04-23T18:42:33.687994Z' + timestamp: '2024-05-20T16:35:31.352389Z' - torchscript_onnx_tflite: - inference_time: 160844.0 - throughput: 6.2172042475939415 + inference_time: 157031.0 + throughput: 6.368169342359152 estimated_peak_memory_range: - min: 323584 - max: 237497504 + min: 6692864 + max: 464186128 primary_compute_unit: NPU precision: fp16 layer_info: @@ -152,14 +154,14 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 31 - job_id: jw56e900g + job_id: jogkye4yp job_status: Passed torchscript_onnx_qnn: - inference_time: 150008.0 - throughput: 6.666311130073063 + inference_time: 146356.0 + throughput: 6.832654622974118 estimated_peak_memory_range: - min: 9900032 - max: 34159264 + min: 9895936 + max: 31713392 primary_compute_unit: NPU precision: fp16 layer_info: @@ -167,7 +169,7 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 51 - job_id: jo5mq11wp + job_id: j1p3mj9xg job_status: Passed reference_device_info: name: QCS8550 (Proxy) @@ -176,4 +178,57 @@ models: os_name: Android manufacturer: Qualcomm chipset: Qcs8550 - timestamp: '2024-04-23T18:42:33.688021Z' + timestamp: '2024-05-20T16:35:31.352406Z' + - torchscript_onnx_qnn: + inference_time: 190735.0 + throughput: 5.24287624190631 + estimated_peak_memory_range: + min: 9850880 + max: 9850880 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 51 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 51 + job_id: jw561y7vp + job_status: Passed + torchscript_onnx_ort: + inference_time: 146581.0 + throughput: 6.82216658366364 + estimated_peak_memory_range: + min: 9854976 + max: 9854976 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 53 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 53 + job_id: j7gjlv77p + 
job_status: Passed + torchscript_onnx_ort_dml_gpu: + inference_time: 1963452.0 + throughput: 0.5093070775348723 + estimated_peak_memory_range: + min: 1940811776 + max: 1940811776 + primary_compute_unit: CPU + precision: fp32 + layer_info: + layers_on_npu: 0 + layers_on_gpu: 0 + layers_on_cpu: 31 + total_layers: 31 + job_id: jlpevdz75 + job_status: Passed + reference_device_info: + name: Snapdragon X Elite CRD + os: '11' + form_factor: Compute + os_name: Windows + manufacturer: Qualcomm + chipset: Snapdragon® X Elite + timestamp: '2024-05-20T16:35:31.352430Z' diff --git a/qai_hub_models/models/vit/README.md b/qai_hub_models/models/vit/README.md index 3a7735f0..314d20ef 100644 --- a/qai_hub_models/models/vit/README.md +++ b/qai_hub_models/models/vit/README.md @@ -14,6 +14,8 @@ accross various devices, can be found [here](https://aihub.qualcomm.com/models/v a hosted Qualcomm® device. + + ## Example & Usage diff --git a/qai_hub_models/models/vit/export.py b/qai_hub_models/models/vit/export.py index 9b96fb31..62a9bca5 100644 --- a/qai_hub_models/models/vit/export.py +++ b/qai_hub_models/models/vit/export.py @@ -121,9 +121,16 @@ def export_model( model.to("cpu"), make_torch_inputs(input_spec), check_trace=False ) + # Convert outputs from channel last to channel first (preferred I/O format for QNN and TensorFlow Lite) + channel_last_flags = ( + " --force_channel_last_input image_tensor" + if target_runtime != TargetRuntime.ORT + else "" + ) + # 2. Compile the model to an on-device asset model_compile_options = model.get_hub_compile_options( - target_runtime, compile_options + " --force_channel_last_input image_tensor" + target_runtime, compile_options + channel_last_flags, hub_device ) print(f"Optimizing model {model_name} to run on-device") submitted_compile_job = hub.submit_compile_job( @@ -161,8 +168,12 @@ def export_model( ) sample_inputs = model.sample_inputs(input_spec) # Convert inputs from channel first to channel last - hub_inputs = transpose_channel_first_to_last( - "image_tensor", sample_inputs, target_runtime + hub_inputs = ( + sample_inputs + if target_runtime == TargetRuntime.ORT + else transpose_channel_first_to_last( + "image_tensor", sample_inputs, target_runtime + ) ) submitted_inference_job = hub.submit_inference_job( model=compile_job.get_target_model(), @@ -199,7 +210,7 @@ def export_model( def main(): warnings.filterwarnings("ignore") - parser = export_parser(model_cls=Model, supports_qnn=False, supports_ort=False) + parser = export_parser(model_cls=Model, supports_qnn=False) args = parser.parse_args() export_model(**vars(args)) diff --git a/qai_hub_models/models/vit/perf.yaml b/qai_hub_models/models/vit/perf.yaml index f7c0334b..09b5ee29 100644 --- a/qai_hub_models/models/vit/perf.yaml +++ b/qai_hub_models/models/vit/perf.yaml @@ -22,6 +22,7 @@ aggregated: - Samsung Galaxy S24 Ultra - Samsung Galaxy S24+ - Samsung Galaxy Tab S8 + - Snapdragon X Elite CRD - Xiaomi 12 - Xiaomi 12 Pro supported_chipsets: @@ -30,15 +31,16 @@ aggregated: - Snapdragon® 8 Gen 2 - Snapdragon® 8 Gen 3 - Snapdragon® 888 + - Snapdragon® X Elite models: - name: VIT performance_metrics: - torchscript_onnx_tflite: - inference_time: 119744.0 - throughput: 8.351149118118654 + inference_time: 79223.0 + throughput: 12.622596973101246 estimated_peak_memory_range: - min: 196608 - max: 3447072 + min: 126976 + max: 3307040 primary_compute_unit: NPU precision: fp16 layer_info: @@ -46,22 +48,37 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 535 - job_id: j7gjzq3v5 + job_id: jygz73mzp + job_status: 
Passed + torchscript_onnx_qnn: + inference_time: 67117.0 + throughput: 14.899354857934652 + estimated_peak_memory_range: + min: 32768 + max: 42487808 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 386 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 386 + job_id: jnp184jkg job_status: Passed torchscript_onnx_ort: - inference_time: 128755.0 - throughput: 7.766688672284571 + inference_time: 104492.0 + throughput: 9.570110630478888 estimated_peak_memory_range: - min: 36864 - max: 430908512 + min: 73728 + max: 437745512 primary_compute_unit: NPU precision: fp16 layer_info: - layers_on_npu: 1 + layers_on_npu: 376 layers_on_gpu: 0 layers_on_cpu: 0 - total_layers: 1 - job_id: jygzonzo5 + total_layers: 376 + job_id: j0px1k4jg job_status: Passed reference_device_info: name: Samsung Galaxy S23 @@ -70,13 +87,13 @@ models: os_name: Android manufacturer: Samsung chipset: Snapdragon® 8 Gen 2 - timestamp: '2024-04-23T18:42:33.712224Z' + timestamp: '2024-05-20T16:35:31.382740Z' - torchscript_onnx_tflite: - inference_time: 89024.0 - throughput: 11.23292595255212 + inference_time: 56817.0 + throughput: 17.60036608761462 estimated_peak_memory_range: - min: 151552 - max: 407939792 + min: 114688 + max: 373000000 primary_compute_unit: NPU precision: fp16 layer_info: @@ -84,22 +101,37 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 535 - job_id: jlpeey6op + job_id: jz5w9e7zp + job_status: Passed + torchscript_onnx_qnn: + inference_time: 48402.0 + throughput: 20.660303293252344 + estimated_peak_memory_range: + min: 0 + max: 164302880 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 386 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 386 + job_id: jvgdvx3kg job_status: Passed torchscript_onnx_ort: - inference_time: 98667.0 - throughput: 10.135100894929408 + inference_time: 76327.0 + throughput: 13.101523707207148 estimated_peak_memory_range: - min: 663552 - max: 874006192 + min: 638976 + max: 514001424 primary_compute_unit: NPU precision: fp16 layer_info: - layers_on_npu: 1 + layers_on_npu: 376 layers_on_gpu: 0 layers_on_cpu: 0 - total_layers: 1 - job_id: jz5w24y35 + total_layers: 376 + job_id: jo5mznmyp job_status: Passed reference_device_info: name: Samsung Galaxy S24 @@ -108,13 +140,13 @@ models: os_name: Android manufacturer: Samsung chipset: Snapdragon® 8 Gen 3 - timestamp: '2024-04-23T18:42:33.712295Z' + timestamp: '2024-05-20T16:35:31.382767Z' - torchscript_onnx_tflite: - inference_time: 119402.0 - throughput: 8.375069094320029 + inference_time: 78953.0 + throughput: 12.665763175560143 estimated_peak_memory_range: - min: 135168 - max: 4419520 + min: 143360 + max: 3490600 primary_compute_unit: NPU precision: fp16 layer_info: @@ -122,7 +154,22 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 535 - job_id: jqpyrkk75 + job_id: jmg94lmq5 + job_status: Passed + torchscript_onnx_qnn: + inference_time: 67350.0 + throughput: 14.847809948032666 + estimated_peak_memory_range: + min: 12288 + max: 46277240 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 386 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 386 + job_id: jqp4wl1qg job_status: Passed reference_device_info: name: QCS8550 (Proxy) @@ -131,4 +178,57 @@ models: os_name: Android manufacturer: Qualcomm chipset: Qcs8550 - timestamp: '2024-04-23T18:42:33.712360Z' + timestamp: '2024-05-20T16:35:31.382784Z' + - torchscript_onnx_qnn: + inference_time: 65972.0 + throughput: 15.157945795185837 + estimated_peak_memory_range: + min: 602112 + max: 
602112 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 385 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 385 + job_id: jz57dy4q5 + job_status: Passed + torchscript_onnx_ort: + inference_time: 103551.0 + throughput: 9.657077189017972 + estimated_peak_memory_range: + min: 176091136 + max: 176091136 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 376 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 376 + job_id: jegne6zvg + job_status: Passed + torchscript_onnx_ort_dml_gpu: + inference_time: 'null' + throughput: 'null' + estimated_peak_memory_range: + min: 0 + max: 0 + primary_compute_unit: 'null' + precision: 'null' + layer_info: + layers_on_npu: 0 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 0 + job_id: jopryvlvg + job_status: Failed + reference_device_info: + name: Snapdragon X Elite CRD + os: '11' + form_factor: Compute + os_name: Windows + manufacturer: Qualcomm + chipset: Snapdragon® X Elite + timestamp: '2024-05-20T16:35:31.382809Z' diff --git a/qai_hub_models/models/whisper_base_en/README.md b/qai_hub_models/models/whisper_base_en/README.md index 6939e9d7..d751e49d 100644 --- a/qai_hub_models/models/whisper_base_en/README.md +++ b/qai_hub_models/models/whisper_base_en/README.md @@ -14,6 +14,8 @@ accross various devices, can be found [here](https://aihub.qualcomm.com/models/w a hosted Qualcomm® device. + + ## Example & Usage Install the package via pip: diff --git a/qai_hub_models/models/whisper_base_en/export.py b/qai_hub_models/models/whisper_base_en/export.py index 2b462de6..12f78f5b 100644 --- a/qai_hub_models/models/whisper_base_en/export.py +++ b/qai_hub_models/models/whisper_base_en/export.py @@ -134,7 +134,7 @@ def export_model( # 2. Compile the models to an on-device asset model_compile_options = component.get_hub_compile_options( - target_runtime, compile_options + target_runtime, compile_options, hub_device ) print(f"Optimizing model {component_name} to run on-device") submitted_compile_job = hub.submit_compile_job( @@ -226,12 +226,7 @@ def export_model( def main(): warnings.filterwarnings("ignore") - parser = export_parser( - model_cls=Model, - components=ALL_COMPONENTS, - supports_qnn=False, - supports_ort=False, - ) + parser = export_parser(model_cls=Model, components=ALL_COMPONENTS) args = parser.parse_args() export_model(**vars(args)) diff --git a/qai_hub_models/models/whisper_base_en/perf.yaml b/qai_hub_models/models/whisper_base_en/perf.yaml index 277067ac..c88444dd 100644 --- a/qai_hub_models/models/whisper_base_en/perf.yaml +++ b/qai_hub_models/models/whisper_base_en/perf.yaml @@ -22,6 +22,7 @@ aggregated: - Samsung Galaxy S24 Ultra - Samsung Galaxy S24+ - Samsung Galaxy Tab S8 + - Snapdragon X Elite CRD - Xiaomi 12 - Xiaomi 12 Pro supported_chipsets: @@ -30,39 +31,55 @@ aggregated: - Snapdragon® 8 Gen 2 - Snapdragon® 8 Gen 3 - Snapdragon® 888 + - Snapdragon® X Elite models: - name: WhisperEncoder performance_metrics: - torchscript_onnx_tflite: - inference_time: 154415.0 - throughput: 6.476054787423502 + inference_time: 159429.0 + throughput: 6.272384572442905 estimated_peak_memory_range: - min: 36925440 - max: 139242008 + min: 25227264 + max: 130754096 primary_compute_unit: GPU precision: fp16 layer_info: layers_on_npu: 0 - layers_on_gpu: 303 + layers_on_gpu: 419 layers_on_cpu: 0 - total_layers: 303 - job_id: jnp1y6o8p + total_layers: 419 + job_id: jep2mkrx5 + job_status: Passed + torchscript_onnx_qnn: + inference_time: 622656.0 + throughput: 1.6060232295199919 + 
estimated_peak_memory_range: + min: 12288 + max: 87059512 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 580 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 580 + job_id: j1glkvyep job_status: Passed torchscript_onnx_ort: - inference_time: 'null' - throughput: 'null' + inference_time: 394707.0 + throughput: 2.53352486781334 estimated_peak_memory_range: - min: 0 - max: 0 - primary_compute_unit: 'null' - precision: 'null' + min: 75538432 + max: 255421288 + primary_compute_unit: NPU + precision: fp16 layer_info: - layers_on_npu: 0 + layers_on_npu: 380 layers_on_gpu: 0 layers_on_cpu: 0 - total_layers: 0 - job_id: j0pxnx035 - job_status: Failed + total_layers: 380 + job_id: jz5w9elzp + job_status: Passed reference_device_info: name: Samsung Galaxy S23 os: '13' @@ -70,37 +87,52 @@ models: os_name: Android manufacturer: Samsung chipset: Snapdragon® 8 Gen 2 - timestamp: '2024-04-23T18:42:33.730149Z' + timestamp: '2024-05-20T16:35:31.407290Z' - torchscript_onnx_tflite: - inference_time: 118628.0 - throughput: 8.42971305256769 + inference_time: 122468.0 + throughput: 8.16539830812947 estimated_peak_memory_range: - min: 36814848 - max: 61467824 + min: 0 + max: 42440336 primary_compute_unit: GPU precision: fp16 layer_info: layers_on_npu: 0 - layers_on_gpu: 303 + layers_on_gpu: 419 layers_on_cpu: 0 - total_layers: 303 - job_id: jz5709ovg + total_layers: 419 + job_id: j2p0rzm2p job_status: Passed - torchscript_onnx_ort: - inference_time: 'null' - throughput: 'null' + torchscript_onnx_qnn: + inference_time: 454603.0 + throughput: 2.1997215152561687 estimated_peak_memory_range: min: 0 - max: 0 - primary_compute_unit: 'null' - precision: 'null' + max: 198547792 + primary_compute_unit: NPU + precision: fp16 layer_info: - layers_on_npu: 0 + layers_on_npu: 580 layers_on_gpu: 0 layers_on_cpu: 0 - total_layers: 0 - job_id: jegnlk1k5 - job_status: Failed + total_layers: 580 + job_id: j1p3mjzxg + job_status: Passed + torchscript_onnx_ort: + inference_time: 304852.0 + throughput: 3.280280267146025 + estimated_peak_memory_range: + min: 73445376 + max: 277367024 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 380 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 380 + job_id: jnp184nkg + job_status: Passed reference_device_info: name: Samsung Galaxy S24 os: '14' @@ -108,21 +140,36 @@ models: os_name: Android manufacturer: Samsung chipset: Snapdragon® 8 Gen 3 - timestamp: '2024-04-23T18:42:33.730201Z' + timestamp: '2024-05-20T16:35:31.407316Z' - torchscript_onnx_tflite: - inference_time: 157798.0 - throughput: 6.337215934295745 + inference_time: 157524.0 + throughput: 6.348238998501816 estimated_peak_memory_range: - min: 25370624 - max: 124671888 + min: 29507584 + max: 129166896 primary_compute_unit: GPU precision: fp16 layer_info: layers_on_npu: 0 - layers_on_gpu: 303 + layers_on_gpu: 419 layers_on_cpu: 0 - total_layers: 303 - job_id: jlpee001p + total_layers: 419 + job_id: jogkye2yp + job_status: Passed + torchscript_onnx_qnn: + inference_time: 625414.0 + throughput: 1.5989408615732938 + estimated_peak_memory_range: + min: 1048576 + max: 78119600 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 580 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 580 + job_id: jlpevd775 job_status: Passed reference_device_info: name: QCS8550 (Proxy) @@ -131,38 +178,106 @@ models: os_name: Android manufacturer: Qualcomm chipset: Qcs8550 - timestamp: '2024-04-23T18:42:33.730243Z' + timestamp: '2024-05-20T16:35:31.407333Z' + - 
torchscript_onnx_qnn: + inference_time: 459784.0 + throughput: 2.1749343169836273 + estimated_peak_memory_range: + min: 962560 + max: 962560 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 579 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 579 + job_id: j1pvw6l7g + job_status: Passed + torchscript_onnx_ort: + inference_time: 390367.0 + throughput: 2.56169194629669 + estimated_peak_memory_range: + min: 139673600 + max: 139673600 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 380 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 380 + job_id: jz57dyeq5 + job_status: Passed + torchscript_onnx_ort_dml_gpu: + inference_time: 'null' + throughput: 'null' + estimated_peak_memory_range: + min: 0 + max: 0 + primary_compute_unit: 'null' + precision: 'null' + layer_info: + layers_on_npu: 0 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 0 + job_id: j0px1kljg + job_status: Failed + reference_device_info: + name: Snapdragon X Elite CRD + os: '11' + form_factor: Compute + os_name: Windows + manufacturer: Qualcomm + chipset: Snapdragon® X Elite + timestamp: '2024-05-20T16:35:31.407358Z' - name: WhisperDecoder performance_metrics: - torchscript_onnx_tflite: - inference_time: 13793.0 - throughput: 72.50054375407815 + inference_time: 23342.0 + throughput: 42.84123040013709 estimated_peak_memory_range: - min: 5775360 - max: 8469096 + min: 5783552 + max: 8760040 primary_compute_unit: NPU precision: fp16 layer_info: - layers_on_npu: 447 + layers_on_npu: 983 layers_on_gpu: 0 - layers_on_cpu: 2 - total_layers: 449 - job_id: jvgde26r5 + layers_on_cpu: 0 + total_layers: 983 + job_id: jqpyd1orp + job_status: Passed + torchscript_onnx_qnn: + inference_time: 23335.0 + throughput: 42.854081851296336 + estimated_peak_memory_range: + min: 41029632 + max: 57664648 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 821 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 821 + job_id: jw561y8vp job_status: Passed torchscript_onnx_ort: - inference_time: 17653.0 - throughput: 56.64759530957911 + inference_time: 24574.0 + throughput: 40.6934158053227 estimated_peak_memory_range: - min: 11657216 - max: 330606792 - primary_compute_unit: CPU - precision: fp32 + min: 11902976 + max: 207621344 + primary_compute_unit: NPU + precision: fp16 layer_info: - layers_on_npu: 1 + layers_on_npu: 844 layers_on_gpu: 0 - layers_on_cpu: 2 - total_layers: 3 - job_id: jo5mq89dp + layers_on_cpu: 0 + total_layers: 844 + job_id: jmg94lzq5 job_status: Passed reference_device_info: name: Samsung Galaxy S23 @@ -171,36 +286,51 @@ models: os_name: Android manufacturer: Samsung chipset: Snapdragon® 8 Gen 2 - timestamp: '2024-04-23T18:42:33.730305Z' + timestamp: '2024-05-20T16:35:31.407380Z' - torchscript_onnx_tflite: - inference_time: 10194.0 - throughput: 98.09691975671964 + inference_time: 19155.0 + throughput: 52.205690420255806 estimated_peak_memory_range: - min: 3768320 - max: 98615936 + min: 3674112 + max: 90342624 primary_compute_unit: NPU precision: fp16 layer_info: - layers_on_npu: 447 + layers_on_npu: 983 layers_on_gpu: 0 - layers_on_cpu: 2 - total_layers: 449 - job_id: jqp4k3e8g + layers_on_cpu: 0 + total_layers: 983 + job_id: j1p87qez5 + job_status: Passed + torchscript_onnx_qnn: + inference_time: 18519.0 + throughput: 53.99859603650305 + estimated_peak_memory_range: + min: 131715072 + max: 412276656 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 821 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 821 + 
job_id: jwgov2l45 job_status: Passed torchscript_onnx_ort: - inference_time: 14072.0 - throughput: 71.0631040363843 + inference_time: 20701.0 + throughput: 48.30684507994783 estimated_peak_memory_range: - min: 52715520 - max: 167779568 - primary_compute_unit: CPU - precision: fp32 + min: 55021568 + max: 137177376 + primary_compute_unit: NPU + precision: fp16 layer_info: - layers_on_npu: 1 + layers_on_npu: 844 layers_on_gpu: 0 - layers_on_cpu: 2 - total_layers: 3 - job_id: jopr8wx05 + layers_on_cpu: 0 + total_layers: 844 + job_id: jvgdvxdkg job_status: Passed reference_device_info: name: Samsung Galaxy S24 @@ -209,21 +339,36 @@ models: os_name: Android manufacturer: Samsung chipset: Snapdragon® 8 Gen 3 - timestamp: '2024-04-23T18:42:33.730361Z' + timestamp: '2024-05-20T16:35:31.407400Z' - torchscript_onnx_tflite: - inference_time: 13928.0 - throughput: 71.79781734635267 + inference_time: 23210.0 + throughput: 43.084877208099954 + estimated_peak_memory_range: + min: 1146880 + max: 5317720 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 983 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 983 + job_id: jn5q26l75 + job_status: Passed + torchscript_onnx_qnn: + inference_time: 23685.0 + throughput: 42.22081486172683 estimated_peak_memory_range: - min: 5758976 - max: 8442936 + min: 42434560 + max: 57209568 primary_compute_unit: NPU precision: fp16 layer_info: - layers_on_npu: 447 + layers_on_npu: 821 layers_on_gpu: 0 - layers_on_cpu: 2 - total_layers: 449 - job_id: jygzoqqk5 + layers_on_cpu: 0 + total_layers: 821 + job_id: jygz73lzp job_status: Passed reference_device_info: name: QCS8550 (Proxy) @@ -232,4 +377,57 @@ models: os_name: Android manufacturer: Qualcomm chipset: Qcs8550 - timestamp: '2024-04-23T18:42:33.730409Z' + timestamp: '2024-05-20T16:35:31.407416Z' + - torchscript_onnx_qnn: + inference_time: 13480.0 + throughput: 74.1839762611276 + estimated_peak_memory_range: + min: 42463232 + max: 42463232 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 821 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 821 + job_id: j7gjlvr7p + job_status: Passed + torchscript_onnx_ort: + inference_time: 20213.0 + throughput: 49.47311136397368 + estimated_peak_memory_range: + min: 112713728 + max: 112713728 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 844 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 844 + job_id: jqp4wlyqg + job_status: Passed + torchscript_onnx_ort_dml_gpu: + inference_time: 'null' + throughput: 'null' + estimated_peak_memory_range: + min: 0 + max: 0 + primary_compute_unit: 'null' + precision: 'null' + layer_info: + layers_on_npu: 0 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 0 + job_id: jo5mzn0yp + job_status: Failed + reference_device_info: + name: Snapdragon X Elite CRD + os: '11' + form_factor: Compute + os_name: Windows + manufacturer: Qualcomm + chipset: Snapdragon® X Elite + timestamp: '2024-05-20T16:35:31.407452Z' diff --git a/qai_hub_models/models/whisper_small_en/README.md b/qai_hub_models/models/whisper_small_en/README.md index e32ddef2..f0f96498 100644 --- a/qai_hub_models/models/whisper_small_en/README.md +++ b/qai_hub_models/models/whisper_small_en/README.md @@ -14,6 +14,8 @@ accross various devices, can be found [here](https://aihub.qualcomm.com/models/w a hosted Qualcomm® device. 
+ + ## Example & Usage Install the package via pip: diff --git a/qai_hub_models/models/whisper_small_en/export.py b/qai_hub_models/models/whisper_small_en/export.py index 9c455526..82d19aa8 100644 --- a/qai_hub_models/models/whisper_small_en/export.py +++ b/qai_hub_models/models/whisper_small_en/export.py @@ -134,7 +134,7 @@ def export_model( # 2. Compile the models to an on-device asset model_compile_options = component.get_hub_compile_options( - target_runtime, compile_options + target_runtime, compile_options, hub_device ) print(f"Optimizing model {component_name} to run on-device") submitted_compile_job = hub.submit_compile_job( @@ -226,12 +226,7 @@ def export_model( def main(): warnings.filterwarnings("ignore") - parser = export_parser( - model_cls=Model, - components=ALL_COMPONENTS, - supports_qnn=False, - supports_ort=False, - ) + parser = export_parser(model_cls=Model, components=ALL_COMPONENTS) args = parser.parse_args() export_model(**vars(args)) diff --git a/qai_hub_models/models/whisper_small_en/perf.yaml b/qai_hub_models/models/whisper_small_en/perf.yaml index c98a944d..91b45ae6 100644 --- a/qai_hub_models/models/whisper_small_en/perf.yaml +++ b/qai_hub_models/models/whisper_small_en/perf.yaml @@ -22,6 +22,7 @@ aggregated: - Samsung Galaxy S24 Ultra - Samsung Galaxy S24+ - Samsung Galaxy Tab S8 + - Snapdragon X Elite CRD - Xiaomi 12 - Xiaomi 12 Pro supported_chipsets: @@ -30,24 +31,40 @@ aggregated: - Snapdragon® 8 Gen 2 - Snapdragon® 8 Gen 3 - Snapdragon® 888 + - Snapdragon® X Elite models: - name: WhisperEncoder performance_metrics: - torchscript_onnx_tflite: - inference_time: 600006.0 - throughput: 1.666650000166665 + inference_time: 615600.0 + throughput: 1.6244314489928524 estimated_peak_memory_range: - min: 79036416 - max: 532898328 + min: 12288 + max: 448683040 primary_compute_unit: GPU precision: fp16 layer_info: layers_on_npu: 0 - layers_on_gpu: 585 + layers_on_gpu: 911 layers_on_cpu: 0 - total_layers: 585 - job_id: j2p036o9p + total_layers: 911 + job_id: jegne61vg job_status: Passed + torchscript_onnx_qnn: + inference_time: 'null' + throughput: 'null' + estimated_peak_memory_range: + min: 0 + max: 0 + primary_compute_unit: 'null' + precision: 'null' + layer_info: + layers_on_npu: 0 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 0 + job_id: jogkye6yp + job_status: Failed torchscript_onnx_ort: inference_time: 'null' throughput: 'null' @@ -61,7 +78,7 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 0 - job_id: j1gl6lwjg + job_id: jlpevd675 job_status: Failed reference_device_info: name: Samsung Galaxy S23 @@ -70,37 +87,52 @@ models: os_name: Android manufacturer: Samsung chipset: Snapdragon® 8 Gen 2 - timestamp: '2024-04-23T18:42:33.762329Z' + timestamp: '2024-05-20T16:35:31.453698Z' - torchscript_onnx_tflite: - inference_time: 465622.0 - throughput: 2.1476648440151025 + inference_time: 470667.0 + throughput: 2.124644387645618 estimated_peak_memory_range: - min: 110800896 - max: 143440272 + min: 108802048 + max: 205784096 primary_compute_unit: GPU precision: fp16 layer_info: layers_on_npu: 0 - layers_on_gpu: 585 + layers_on_gpu: 911 layers_on_cpu: 0 - total_layers: 585 - job_id: jogk786wp + total_layers: 911 + job_id: jep2mkox5 job_status: Passed - torchscript_onnx_ort: - inference_time: 'null' - throughput: 'null' + torchscript_onnx_qnn: + inference_time: 1479203.0 + throughput: 0.6760397322071413 estimated_peak_memory_range: min: 0 - max: 0 - primary_compute_unit: 'null' - precision: 'null' + max: 569102256 + primary_compute_unit: NPU + precision: 
fp16 layer_info: - layers_on_npu: 0 + layers_on_npu: 1474 layers_on_gpu: 0 layers_on_cpu: 0 - total_layers: 0 - job_id: j1p3v6o3g - job_status: Failed + total_layers: 1474 + job_id: j1glkvwep + job_status: Passed + torchscript_onnx_ort: + inference_time: 1261557.0 + throughput: 0.7926712784281645 + estimated_peak_memory_range: + min: 999424 + max: 563911776 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 884 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 884 + job_id: jz5w9eyzp + job_status: Passed reference_device_info: name: Samsung Galaxy S24 os: '14' @@ -108,22 +140,37 @@ models: os_name: Android manufacturer: Samsung chipset: Snapdragon® 8 Gen 3 - timestamp: '2024-04-23T18:42:33.762404Z' + timestamp: '2024-05-20T16:35:31.453725Z' - torchscript_onnx_tflite: - inference_time: 602366.0 - throughput: 1.66012025911157 + inference_time: 612583.0 + throughput: 1.63243185005134 estimated_peak_memory_range: - min: 72904704 - max: 522853520 + min: 16384 + max: 444838416 primary_compute_unit: GPU precision: fp16 layer_info: layers_on_npu: 0 - layers_on_gpu: 585 + layers_on_gpu: 911 layers_on_cpu: 0 - total_layers: 585 - job_id: j2p038w9p + total_layers: 911 + job_id: j2p0rzo2p job_status: Passed + torchscript_onnx_qnn: + inference_time: 'null' + throughput: 'null' + estimated_peak_memory_range: + min: 0 + max: 0 + primary_compute_unit: 'null' + precision: 'null' + layer_info: + layers_on_npu: 0 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 0 + job_id: j1pvw627g + job_status: Failed reference_device_info: name: QCS8550 (Proxy) os: '12' @@ -131,38 +178,106 @@ models: os_name: Android manufacturer: Qualcomm chipset: Qcs8550 - timestamp: '2024-04-23T18:42:33.762472Z' + timestamp: '2024-05-20T16:35:31.453757Z' + - torchscript_onnx_qnn: + inference_time: 1707514.0 + throughput: 0.5856467355465313 + estimated_peak_memory_range: + min: 962560 + max: 962560 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 1473 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 1473 + job_id: j1p3mjoxg + job_status: Passed + torchscript_onnx_ort: + inference_time: 1518658.0 + throughput: 0.6584761019268328 + estimated_peak_memory_range: + min: 555753472 + max: 555753472 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 884 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 884 + job_id: jnp184okg + job_status: Passed + torchscript_onnx_ort_dml_gpu: + inference_time: 'null' + throughput: 'null' + estimated_peak_memory_range: + min: 0 + max: 0 + primary_compute_unit: 'null' + precision: 'null' + layer_info: + layers_on_npu: 0 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 0 + job_id: jz5w9eyjp + job_status: Failed + reference_device_info: + name: Snapdragon X Elite CRD + os: '11' + form_factor: Compute + os_name: Windows + manufacturer: Qualcomm + chipset: Snapdragon® X Elite + timestamp: '2024-05-20T16:35:31.453784Z' - name: WhisperDecoder performance_metrics: - torchscript_onnx_tflite: - inference_time: 45614.0 - throughput: 21.92309378699522 + inference_time: 26229.0 + throughput: 38.12573868618704 estimated_peak_memory_range: - min: 16830464 - max: 20007784 + min: 16203776 + max: 19541664 primary_compute_unit: NPU precision: fp16 layer_info: - layers_on_npu: 879 + layers_on_npu: 2573 layers_on_gpu: 0 - layers_on_cpu: 2 - total_layers: 881 - job_id: j1p801jkg + layers_on_cpu: 0 + total_layers: 2573 + job_id: jopryvxvg + job_status: Passed + torchscript_onnx_qnn: + inference_time: 24425.0 + throughput: 
40.941658137154555 + estimated_peak_memory_range: + min: 121384960 + max: 195379040 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 2255 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 2255 + job_id: jn5q26475 job_status: Passed torchscript_onnx_ort: - inference_time: 75579.0 - throughput: 13.231188557668135 + inference_time: 62618.0 + throughput: 15.969848925229167 estimated_peak_memory_range: - min: 40751104 - max: 289480944 - primary_compute_unit: CPU - precision: fp32 + min: 49823744 + max: 691829120 + primary_compute_unit: NPU + precision: fp16 layer_info: - layers_on_npu: 1 + layers_on_npu: 2302 layers_on_gpu: 0 - layers_on_cpu: 2 - total_layers: 3 - job_id: jw56ewo6g + layers_on_cpu: 0 + total_layers: 2302 + job_id: jygz73zzp job_status: Passed reference_device_info: name: Samsung Galaxy S23 @@ -171,36 +286,51 @@ models: os_name: Android manufacturer: Samsung chipset: Snapdragon® 8 Gen 2 - timestamp: '2024-04-23T18:42:33.762570Z' + timestamp: '2024-05-20T16:35:31.453805Z' - torchscript_onnx_tflite: - inference_time: 34559.0 - throughput: 28.936022454353424 + inference_time: 19526.0 + throughput: 51.21376626037079 estimated_peak_memory_range: - min: 15560704 - max: 1589538480 + min: 16277504 + max: 1152242688 primary_compute_unit: NPU precision: fp16 layer_info: - layers_on_npu: 879 + layers_on_npu: 2573 layers_on_gpu: 0 - layers_on_cpu: 2 - total_layers: 881 - job_id: jn5qev4n5 + layers_on_cpu: 0 + total_layers: 2573 + job_id: jqpyd18rp + job_status: Passed + torchscript_onnx_qnn: + inference_time: 19235.0 + throughput: 51.988562516246425 + estimated_peak_memory_range: + min: 110612480 + max: 902217440 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 2255 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 2255 + job_id: jw561yovp job_status: Passed torchscript_onnx_ort: - inference_time: 60639.0 - throughput: 16.49103712132456 + inference_time: 53225.0 + throughput: 18.788163457022076 estimated_peak_memory_range: - min: 160247808 - max: 557923088 - primary_compute_unit: CPU - precision: fp32 + min: 84680704 + max: 354730464 + primary_compute_unit: NPU + precision: fp16 layer_info: - layers_on_npu: 1 + layers_on_npu: 2302 layers_on_gpu: 0 - layers_on_cpu: 2 - total_layers: 3 - job_id: jwgok8dqp + layers_on_cpu: 0 + total_layers: 2302 + job_id: jmg94loq5 job_status: Passed reference_device_info: name: Samsung Galaxy S24 @@ -209,21 +339,36 @@ models: os_name: Android manufacturer: Samsung chipset: Snapdragon® 8 Gen 3 - timestamp: '2024-04-23T18:42:33.762666Z' + timestamp: '2024-05-20T16:35:31.453825Z' - torchscript_onnx_tflite: - inference_time: 45957.0 - throughput: 21.75947080966991 + inference_time: 27363.0 + throughput: 36.54570039834813 estimated_peak_memory_range: min: 16830464 - max: 19552208 + max: 19976992 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 2573 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 2573 + job_id: j1p87qjz5 + job_status: Passed + torchscript_onnx_qnn: + inference_time: 25042.0 + throughput: 39.93291270665282 + estimated_peak_memory_range: + min: 127197184 + max: 202463224 primary_compute_unit: NPU precision: fp16 layer_info: - layers_on_npu: 879 + layers_on_npu: 2255 layers_on_gpu: 0 - layers_on_cpu: 2 - total_layers: 881 - job_id: j1p80dnkg + layers_on_cpu: 0 + total_layers: 2255 + job_id: j7gjlv37p job_status: Passed reference_device_info: name: QCS8550 (Proxy) @@ -232,4 +377,57 @@ models: os_name: Android manufacturer: Qualcomm chipset: Qcs8550 - 
timestamp: '2024-04-23T18:42:33.762757Z' + timestamp: '2024-05-20T16:35:31.453845Z' + - torchscript_onnx_qnn: + inference_time: 20874.0 + throughput: 47.906486538277285 + estimated_peak_memory_range: + min: 127381504 + max: 127381504 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 2255 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 2255 + job_id: jwgov2d45 + job_status: Passed + torchscript_onnx_ort: + inference_time: 54047.0 + throughput: 18.502414565100747 + estimated_peak_memory_range: + min: 347856896 + max: 347856896 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 2302 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 2302 + job_id: jvgdvx6kg + job_status: Passed + torchscript_onnx_ort_dml_gpu: + inference_time: 'null' + throughput: 'null' + estimated_peak_memory_range: + min: 0 + max: 0 + primary_compute_unit: 'null' + precision: 'null' + layer_info: + layers_on_npu: 0 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 0 + job_id: jmg94lov5 + job_status: Failed + reference_device_info: + name: Snapdragon X Elite CRD + os: '11' + form_factor: Compute + os_name: Windows + manufacturer: Qualcomm + chipset: Snapdragon® X Elite + timestamp: '2024-05-20T16:35:31.453870Z' diff --git a/qai_hub_models/models/whisper_tiny_en/README.md b/qai_hub_models/models/whisper_tiny_en/README.md index 00d0e87f..e541696e 100644 --- a/qai_hub_models/models/whisper_tiny_en/README.md +++ b/qai_hub_models/models/whisper_tiny_en/README.md @@ -14,6 +14,8 @@ accross various devices, can be found [here](https://aihub.qualcomm.com/models/w a hosted Qualcomm® device. + + ## Example & Usage Install the package via pip: diff --git a/qai_hub_models/models/whisper_tiny_en/export.py b/qai_hub_models/models/whisper_tiny_en/export.py index 038202f9..58af8d0b 100644 --- a/qai_hub_models/models/whisper_tiny_en/export.py +++ b/qai_hub_models/models/whisper_tiny_en/export.py @@ -134,7 +134,7 @@ def export_model( # 2. 
Compile the models to an on-device asset model_compile_options = component.get_hub_compile_options( - target_runtime, compile_options + target_runtime, compile_options, hub_device ) print(f"Optimizing model {component_name} to run on-device") submitted_compile_job = hub.submit_compile_job( @@ -226,12 +226,7 @@ def export_model( def main(): warnings.filterwarnings("ignore") - parser = export_parser( - model_cls=Model, - components=ALL_COMPONENTS, - supports_qnn=False, - supports_ort=False, - ) + parser = export_parser(model_cls=Model, components=ALL_COMPONENTS) args = parser.parse_args() export_model(**vars(args)) diff --git a/qai_hub_models/models/whisper_tiny_en/perf.yaml b/qai_hub_models/models/whisper_tiny_en/perf.yaml index a16bafaf..724a32a5 100644 --- a/qai_hub_models/models/whisper_tiny_en/perf.yaml +++ b/qai_hub_models/models/whisper_tiny_en/perf.yaml @@ -22,6 +22,7 @@ aggregated: - Samsung Galaxy S24 Ultra - Samsung Galaxy S24+ - Samsung Galaxy Tab S8 + - Snapdragon X Elite CRD - Xiaomi 12 - Xiaomi 12 Pro supported_chipsets: @@ -30,23 +31,39 @@ aggregated: - Snapdragon® 8 Gen 2 - Snapdragon® 8 Gen 3 - Snapdragon® 888 + - Snapdragon® X Elite models: - name: WhisperEncoder performance_metrics: - torchscript_onnx_tflite: - inference_time: 67351.0 - throughput: 14.847589493845675 + inference_time: 68887.0 + throughput: 14.516527066064715 estimated_peak_memory_range: - min: 16117760 - max: 104999648 + min: 11296768 + max: 56646392 primary_compute_unit: GPU precision: fp16 layer_info: layers_on_npu: 0 - layers_on_gpu: 209 + layers_on_gpu: 271 layers_on_cpu: 0 - total_layers: 209 - job_id: jlpeeyxop + total_layers: 271 + job_id: jnp184olg + job_status: Passed + torchscript_onnx_qnn: + inference_time: 288969.0 + throughput: 3.4605788164128333 + estimated_peak_memory_range: + min: 159744 + max: 54792792 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 338 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 338 + job_id: jegne6qmg job_status: Passed torchscript_onnx_ort: inference_time: 'null' @@ -61,7 +78,7 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 0 - job_id: jnp1y618p + job_id: j1glkv8lp job_status: Failed reference_device_info: name: Samsung Galaxy S23 @@ -70,21 +87,36 @@ models: os_name: Android manufacturer: Samsung chipset: Snapdragon® 8 Gen 2 - timestamp: '2024-04-23T18:42:33.794639Z' + timestamp: '2024-05-20T16:35:31.499723Z' - torchscript_onnx_tflite: - inference_time: 52682.0 - throughput: 18.981815420826848 + inference_time: 54355.0 + throughput: 18.397571520559286 estimated_peak_memory_range: min: 0 - max: 28255008 + max: 32722000 primary_compute_unit: GPU precision: fp16 layer_info: layers_on_npu: 0 - layers_on_gpu: 209 + layers_on_gpu: 271 layers_on_cpu: 0 - total_layers: 209 - job_id: jz5w24z35 + total_layers: 271 + job_id: jz57dynr5 + job_status: Passed + torchscript_onnx_qnn: + inference_time: 218798.0 + throughput: 4.570425689448715 + estimated_peak_memory_range: + min: 999424 + max: 138033888 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 338 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 338 + job_id: jep2mkdm5 job_status: Passed torchscript_onnx_ort: inference_time: 'null' @@ -99,7 +131,7 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 0 - job_id: jz5709nvg + job_id: j1p3mj7zg job_status: Failed reference_device_info: name: Samsung Galaxy S24 @@ -108,21 +140,36 @@ models: os_name: Android manufacturer: Samsung chipset: Snapdragon® 8 Gen 3 - timestamp: 
'2024-04-23T18:42:33.794681Z' + timestamp: '2024-05-20T16:35:31.499751Z' - torchscript_onnx_tflite: - inference_time: 67311.0 - throughput: 14.856412770572417 + inference_time: 68575.0 + throughput: 14.582573824279985 estimated_peak_memory_range: - min: 17125376 - max: 63332656 + min: 12288 + max: 95017352 primary_compute_unit: GPU precision: fp16 layer_info: layers_on_npu: 0 - layers_on_gpu: 209 + layers_on_gpu: 271 + layers_on_cpu: 0 + total_layers: 271 + job_id: j0px1kr9g + job_status: Passed + torchscript_onnx_qnn: + inference_time: 293385.0 + throughput: 3.40849054995995 + estimated_peak_memory_range: + min: 978944 + max: 49011728 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 338 + layers_on_gpu: 0 layers_on_cpu: 0 - total_layers: 209 - job_id: jygzoq1o5 + total_layers: 338 + job_id: jogkye0op job_status: Passed reference_device_info: name: QCS8550 (Proxy) @@ -131,38 +178,106 @@ models: os_name: Android manufacturer: Qualcomm chipset: Qcs8550 - timestamp: '2024-04-23T18:42:33.794711Z' + timestamp: '2024-05-20T16:35:31.499770Z' + - torchscript_onnx_qnn: + inference_time: 240121.0 + throughput: 4.164567030788644 + estimated_peak_memory_range: + min: 962560 + max: 962560 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 337 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 337 + job_id: j2p0rz9ep + job_status: Passed + torchscript_onnx_ort: + inference_time: 'null' + throughput: 'null' + estimated_peak_memory_range: + min: 0 + max: 0 + primary_compute_unit: 'null' + precision: 'null' + layer_info: + layers_on_npu: 0 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 0 + job_id: j1pvw6mmg + job_status: Failed + torchscript_onnx_ort_dml_gpu: + inference_time: 'null' + throughput: 'null' + estimated_peak_memory_range: + min: 0 + max: 0 + primary_compute_unit: 'null' + precision: 'null' + layer_info: + layers_on_npu: 0 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 0 + job_id: jlpevdx05 + job_status: Failed + reference_device_info: + name: Snapdragon X Elite CRD + os: '11' + form_factor: Compute + os_name: Windows + manufacturer: Qualcomm + chipset: Snapdragon® X Elite + timestamp: '2024-05-20T16:35:31.499797Z' - name: WhisperDecoder performance_metrics: - torchscript_onnx_tflite: - inference_time: 7115.0 - throughput: 140.54813773717498 + inference_time: 3871.0 + throughput: 258.3311805734952 estimated_peak_memory_range: min: 2977792 - max: 5417544 + max: 5435904 primary_compute_unit: NPU precision: fp16 layer_info: - layers_on_npu: 303 + layers_on_npu: 557 layers_on_gpu: 0 - layers_on_cpu: 2 - total_layers: 305 - job_id: jygzonyo5 + layers_on_cpu: 0 + total_layers: 557 + job_id: jvgdvx6lg + job_status: Passed + torchscript_onnx_qnn: + inference_time: 3646.0 + throughput: 274.27317608337904 + estimated_peak_memory_range: + min: 9920512 + max: 47146336 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 447 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 447 + job_id: jopryvdeg job_status: Passed torchscript_onnx_ort: - inference_time: 8714.0 - throughput: 114.75786091347257 + inference_time: 5287.0 + throughput: 189.14318138831095 estimated_peak_memory_range: - min: 6172672 - max: 212702328 - primary_compute_unit: CPU - precision: fp32 + min: 6336512 + max: 214447104 + primary_compute_unit: NPU + precision: fp16 layer_info: - layers_on_npu: 1 + layers_on_npu: 462 layers_on_gpu: 0 - layers_on_cpu: 2 - total_layers: 3 - job_id: jvgde24r5 + layers_on_cpu: 0 + total_layers: 462 + job_id: 
jw561ym7p job_status: Passed reference_device_info: name: Samsung Galaxy S23 @@ -171,36 +286,51 @@ models: os_name: Android manufacturer: Samsung chipset: Snapdragon® 8 Gen 2 - timestamp: '2024-04-23T18:42:33.794757Z' + timestamp: '2024-05-20T16:35:31.499823Z' - torchscript_onnx_tflite: - inference_time: 5479.0 - throughput: 182.5150574922431 + inference_time: 3044.0 + throughput: 328.515111695138 estimated_peak_memory_range: - min: 2871296 - max: 232253952 + min: 36864 + max: 223105088 primary_compute_unit: NPU precision: fp16 layer_info: - layers_on_npu: 303 + layers_on_npu: 557 layers_on_gpu: 0 - layers_on_cpu: 2 - total_layers: 305 - job_id: jmg9jd2w5 + layers_on_cpu: 0 + total_layers: 557 + job_id: jqp4wl4lg + job_status: Passed + torchscript_onnx_qnn: + inference_time: 2767.0 + throughput: 361.4022406938923 + estimated_peak_memory_range: + min: 9170944 + max: 143104560 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 447 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 447 + job_id: jqpyd124p job_status: Passed torchscript_onnx_ort: - inference_time: 6141.0 - throughput: 162.83992835043153 + inference_time: 4230.0 + throughput: 236.4066193853428 estimated_peak_memory_range: - min: 24158208 - max: 103238656 - primary_compute_unit: CPU - precision: fp32 + min: 27504640 + max: 86953184 + primary_compute_unit: NPU + precision: fp16 layer_info: - layers_on_npu: 1 + layers_on_npu: 462 layers_on_gpu: 0 - layers_on_cpu: 2 - total_layers: 3 - job_id: jqp4k348g + layers_on_cpu: 0 + total_layers: 462 + job_id: jwgov2wd5 job_status: Passed reference_device_info: name: Samsung Galaxy S24 @@ -209,21 +339,36 @@ models: os_name: Android manufacturer: Samsung chipset: Snapdragon® 8 Gen 3 - timestamp: '2024-04-23T18:42:33.794805Z' + timestamp: '2024-05-20T16:35:31.499844Z' - torchscript_onnx_tflite: - inference_time: 7148.0 - throughput: 139.89927252378288 + inference_time: 3892.0 + throughput: 256.9373072970195 estimated_peak_memory_range: min: 2977792 - max: 5388280 + max: 7226936 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 557 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 557 + job_id: jo5mznkqp + job_status: Passed + torchscript_onnx_qnn: + inference_time: 3696.0 + throughput: 270.56277056277054 + estimated_peak_memory_range: + min: 11145216 + max: 48599472 primary_compute_unit: NPU precision: fp16 layer_info: - layers_on_npu: 303 + layers_on_npu: 447 layers_on_gpu: 0 - layers_on_cpu: 2 - total_layers: 305 - job_id: jz5w20j35 + layers_on_cpu: 0 + total_layers: 447 + job_id: jn5q261m5 job_status: Passed reference_device_info: name: QCS8550 (Proxy) @@ -232,4 +377,57 @@ models: os_name: Android manufacturer: Qualcomm chipset: Qcs8550 - timestamp: '2024-04-23T18:42:33.794844Z' + timestamp: '2024-05-20T16:35:31.499860Z' + - torchscript_onnx_qnn: + inference_time: 3823.0 + throughput: 261.5746795710175 + estimated_peak_memory_range: + min: 21233664 + max: 21233664 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 447 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 447 + job_id: j1p87qr85 + job_status: Passed + torchscript_onnx_ort: + inference_time: 4460.0 + throughput: 224.2152466367713 + estimated_peak_memory_range: + min: 21245952 + max: 21245952 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 462 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 462 + job_id: j7gjlvy8p + job_status: Passed + torchscript_onnx_ort_dml_gpu: + inference_time: 'null' + throughput: 
'null' + estimated_peak_memory_range: + min: 0 + max: 0 + primary_compute_unit: 'null' + precision: 'null' + layer_info: + layers_on_npu: 0 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 0 + job_id: jygz73y6p + job_status: Failed + reference_device_info: + name: Snapdragon X Elite CRD + os: '11' + form_factor: Compute + os_name: Windows + manufacturer: Qualcomm + chipset: Snapdragon® X Elite + timestamp: '2024-05-20T16:35:31.499881Z' diff --git a/qai_hub_models/models/wideresnet50/README.md b/qai_hub_models/models/wideresnet50/README.md index fe17a2b9..1fd5bb18 100644 --- a/qai_hub_models/models/wideresnet50/README.md +++ b/qai_hub_models/models/wideresnet50/README.md @@ -14,6 +14,8 @@ accross various devices, can be found [here](https://aihub.qualcomm.com/models/w a hosted Qualcomm® device. + + ## Example & Usage diff --git a/qai_hub_models/models/wideresnet50/export.py b/qai_hub_models/models/wideresnet50/export.py index 4b8f0722..0fa960e8 100644 --- a/qai_hub_models/models/wideresnet50/export.py +++ b/qai_hub_models/models/wideresnet50/export.py @@ -119,9 +119,16 @@ def export_model( model.eval() source_model = torch.jit.trace(model.to("cpu"), make_torch_inputs(input_spec)) + # Convert outputs from channel last to channel first (preferred I/O format for QNN and TensorFlow Lite) + channel_last_flags = ( + " --force_channel_last_input image_tensor" + if target_runtime != TargetRuntime.ORT + else "" + ) + # 2. Compile the model to an on-device asset model_compile_options = model.get_hub_compile_options( - target_runtime, compile_options + " --force_channel_last_input image_tensor" + target_runtime, compile_options + channel_last_flags, hub_device ) print(f"Optimizing model {model_name} to run on-device") submitted_compile_job = hub.submit_compile_job( @@ -159,8 +166,12 @@ def export_model( ) sample_inputs = model.sample_inputs(input_spec) # Convert inputs from channel first to channel last - hub_inputs = transpose_channel_first_to_last( - "image_tensor", sample_inputs, target_runtime + hub_inputs = ( + sample_inputs + if target_runtime == TargetRuntime.ORT + else transpose_channel_first_to_last( + "image_tensor", sample_inputs, target_runtime + ) ) submitted_inference_job = hub.submit_inference_job( model=compile_job.get_target_model(), @@ -197,7 +208,7 @@ def export_model( def main(): warnings.filterwarnings("ignore") - parser = export_parser(model_cls=Model, supports_ort=False) + parser = export_parser(model_cls=Model) args = parser.parse_args() export_model(**vars(args)) diff --git a/qai_hub_models/models/wideresnet50/perf.yaml b/qai_hub_models/models/wideresnet50/perf.yaml index e403f8b2..bc894e3e 100644 --- a/qai_hub_models/models/wideresnet50/perf.yaml +++ b/qai_hub_models/models/wideresnet50/perf.yaml @@ -22,6 +22,7 @@ aggregated: - Samsung Galaxy S24 Ultra - Samsung Galaxy S24+ - Samsung Galaxy Tab S8 + - Snapdragon X Elite CRD - Xiaomi 12 - Xiaomi 12 Pro supported_chipsets: @@ -30,15 +31,16 @@ aggregated: - Snapdragon® 8 Gen 2 - Snapdragon® 8 Gen 3 - Snapdragon® 888 + - Snapdragon® X Elite models: - name: WideResNet50 performance_metrics: - torchscript_onnx_tflite: - inference_time: 4900.0 - throughput: 204.08163265306123 + inference_time: 4874.0 + throughput: 205.1702913418137 estimated_peak_memory_range: - min: 49152 - max: 2616288 + min: 20480 + max: 2339968 primary_compute_unit: NPU precision: fp16 layer_info: @@ -46,14 +48,14 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 79 - job_id: jegnlkqk5 + job_id: jz5w9ezjp job_status: Passed torchscript_onnx_qnn: - 
inference_time: 5767.0 - throughput: 173.40038148083926 + inference_time: 5693.0 + throughput: 175.65431231336729 estimated_peak_memory_range: - min: 618496 - max: 261398592 + min: 643072 + max: 344558120 primary_compute_unit: NPU precision: fp16 layer_info: @@ -61,22 +63,22 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 126 - job_id: jep20edrg + job_id: jvgdvx4lg job_status: Passed torchscript_onnx_ort: - inference_time: 5427.0 - throughput: 184.26386585590566 + inference_time: 5517.0 + throughput: 181.257930034439 estimated_peak_memory_range: - min: 36864 - max: 457326944 + min: 24576 + max: 414560576 primary_compute_unit: NPU precision: fp16 layer_info: - layers_on_npu: 1 + layers_on_npu: 128 layers_on_gpu: 0 layers_on_cpu: 0 - total_layers: 1 - job_id: j2p03699p + total_layers: 128 + job_id: jo5mznlqp job_status: Passed reference_device_info: name: Samsung Galaxy S23 @@ -85,13 +87,13 @@ models: os_name: Android manufacturer: Samsung chipset: Snapdragon® 8 Gen 2 - timestamp: '2024-04-23T18:42:33.826943Z' + timestamp: '2024-05-20T16:35:31.545698Z' - torchscript_onnx_tflite: - inference_time: 3655.0 - throughput: 273.59781121751024 + inference_time: 3649.0 + throughput: 274.0476842970677 estimated_peak_memory_range: - min: 16384 - max: 97733152 + min: 12288 + max: 97464480 primary_compute_unit: NPU precision: fp16 layer_info: @@ -99,14 +101,14 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 79 - job_id: jopr8wd05 + job_id: jmg94l2v5 job_status: Passed torchscript_onnx_qnn: - inference_time: 4245.0 - throughput: 235.57126030624264 + inference_time: 4302.0 + throughput: 232.4500232450023 estimated_peak_memory_range: - min: 618496 - max: 53403616 + min: 270987264 + max: 325564848 primary_compute_unit: NPU precision: fp16 layer_info: @@ -114,22 +116,22 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 126 - job_id: jqpyrm285 + job_id: jz57dy8r5 job_status: Passed torchscript_onnx_ort: - inference_time: 4122.0 - throughput: 242.600679281902 + inference_time: 4156.0 + throughput: 240.61597690086623 estimated_peak_memory_range: min: 618496 - max: 39529440 + max: 36255216 primary_compute_unit: NPU precision: fp16 layer_info: - layers_on_npu: 1 + layers_on_npu: 128 layers_on_gpu: 0 layers_on_cpu: 0 - total_layers: 1 - job_id: j1p801rkg + total_layers: 128 + job_id: jegne6wmg job_status: Passed reference_device_info: name: Samsung Galaxy S24 @@ -138,13 +140,13 @@ models: os_name: Android manufacturer: Samsung chipset: Snapdragon® 8 Gen 3 - timestamp: '2024-04-23T18:42:33.826991Z' + timestamp: '2024-05-20T16:35:31.545724Z' - torchscript_onnx_tflite: - inference_time: 4907.0 - throughput: 203.79050336254332 + inference_time: 4864.0 + throughput: 205.5921052631579 estimated_peak_memory_range: - min: 28672 - max: 2415760 + min: 24576 + max: 2245440 primary_compute_unit: NPU precision: fp16 layer_info: @@ -152,14 +154,14 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 79 - job_id: j2p038n9p + job_id: jnp1841lg job_status: Passed torchscript_onnx_qnn: - inference_time: 5790.0 - throughput: 172.71157167530225 + inference_time: 5687.0 + throughput: 175.83963425356075 estimated_peak_memory_range: - min: 622592 - max: 209332032 + min: 647168 + max: 355205232 primary_compute_unit: NPU precision: fp16 layer_info: @@ -167,7 +169,7 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 126 - job_id: jw56e9k6g + job_id: j0px1kz9g job_status: Passed reference_device_info: name: QCS8550 (Proxy) @@ -176,4 +178,57 @@ models: os_name: Android manufacturer: Qualcomm 
chipset: Qcs8550 - timestamp: '2024-04-23T18:42:33.827030Z' + timestamp: '2024-05-20T16:35:31.545745Z' + - torchscript_onnx_qnn: + inference_time: 5857.0 + throughput: 170.73587160662456 + estimated_peak_memory_range: + min: 602112 + max: 602112 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 126 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 126 + job_id: jqp4wl2lg + job_status: Passed + torchscript_onnx_ort: + inference_time: 5137.0 + throughput: 194.66614755693985 + estimated_peak_memory_range: + min: 46718976 + max: 46718976 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 128 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 128 + job_id: jopryv7eg + job_status: Passed + torchscript_onnx_ort_dml_gpu: + inference_time: 27924.0 + throughput: 35.8114883254548 + estimated_peak_memory_range: + min: 36831232 + max: 36831232 + primary_compute_unit: GPU + precision: fp16 + layer_info: + layers_on_npu: 0 + layers_on_gpu: 1 + layers_on_cpu: 0 + total_layers: 1 + job_id: jep2mkzm5 + job_status: Passed + reference_device_info: + name: Snapdragon X Elite CRD + os: '11' + form_factor: Compute + os_name: Windows + manufacturer: Qualcomm + chipset: Snapdragon® X Elite + timestamp: '2024-05-20T16:35:31.545770Z' diff --git a/qai_hub_models/models/wideresnet50_quantized/README.md b/qai_hub_models/models/wideresnet50_quantized/README.md index ed33868e..cb6dc1eb 100644 --- a/qai_hub_models/models/wideresnet50_quantized/README.md +++ b/qai_hub_models/models/wideresnet50_quantized/README.md @@ -14,6 +14,8 @@ accross various devices, can be found [here](https://aihub.qualcomm.com/models/w a hosted Qualcomm® device. + + ## Example & Usage diff --git a/qai_hub_models/models/wideresnet50_quantized/export.py b/qai_hub_models/models/wideresnet50_quantized/export.py index 83cb894f..fd3f6e92 100644 --- a/qai_hub_models/models/wideresnet50_quantized/export.py +++ b/qai_hub_models/models/wideresnet50_quantized/export.py @@ -123,9 +123,16 @@ def export_model( else: quant_calibration_data = model.get_calibration_data(target_runtime, input_spec) + # Convert outputs from channel last to channel first (preferred I/O format for QNN and TensorFlow Lite) + channel_last_flags = ( + " --force_channel_last_input image_tensor" + if target_runtime != TargetRuntime.ORT + else "" + ) + # 2. 
Compile the model to an on-device asset model_compile_options = model.get_hub_compile_options( - target_runtime, compile_options + " --force_channel_last_input image_tensor" + target_runtime, compile_options + channel_last_flags, hub_device ) print(f"Optimizing model {model_name} to run on-device") submitted_compile_job = hub.submit_compile_job( @@ -167,8 +174,12 @@ def export_model( if target_runtime == TargetRuntime.QNN: hub_inputs = get_qnn_inputs(compile_job, sample_inputs) # Convert inputs from channel first to channel last - hub_inputs = transpose_channel_first_to_last( - "image_tensor", sample_inputs, target_runtime + hub_inputs = ( + sample_inputs + if target_runtime == TargetRuntime.ORT + else transpose_channel_first_to_last( + "image_tensor", sample_inputs, target_runtime + ) ) submitted_inference_job = hub.submit_inference_job( model=compile_job.get_target_model(), @@ -205,7 +216,7 @@ def export_model( def main(): warnings.filterwarnings("ignore") - parser = export_parser(model_cls=Model, supports_ort=False) + parser = export_parser(model_cls=Model) args = parser.parse_args() export_model(**vars(args)) diff --git a/qai_hub_models/models/wideresnet50_quantized/perf.yaml b/qai_hub_models/models/wideresnet50_quantized/perf.yaml index d234b9ec..fdc9206b 100644 --- a/qai_hub_models/models/wideresnet50_quantized/perf.yaml +++ b/qai_hub_models/models/wideresnet50_quantized/perf.yaml @@ -26,6 +26,7 @@ aggregated: - Samsung Galaxy S24 Ultra - Samsung Galaxy S24+ - Samsung Galaxy Tab S8 + - Snapdragon X Elite CRD - Xiaomi 12 - Xiaomi 12 Pro supported_chipsets: @@ -36,15 +37,16 @@ aggregated: - Snapdragon® 8 Gen 2 - Snapdragon® 8 Gen 3 - Snapdragon® 888 + - Snapdragon® X Elite models: - name: WideResNet50-Quantized performance_metrics: - torchscript_onnx_tflite: - inference_time: 1807.0 - throughput: 553.4034311012729 + inference_time: 1821.0 + throughput: 549.1488193300385 estimated_peak_memory_range: - min: 49152 - max: 2181928 + min: 24576 + max: 2584464 primary_compute_unit: NPU precision: int8 layer_info: @@ -52,14 +54,14 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 80 - job_id: jn5qev1n5 + job_id: jqpyd1y4p job_status: Passed torchscript_onnx_qnn: - inference_time: 2119.0 - throughput: 471.92071731949034 + inference_time: 2043.0 + throughput: 489.47626040137055 estimated_peak_memory_range: - min: 0 - max: 480120320 + min: 16384 + max: 250792696 primary_compute_unit: NPU precision: int8 layer_info: @@ -67,22 +69,22 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 78 - job_id: jw56ewm6g + job_id: jogkyekop job_status: Passed torchscript_onnx_ort: - inference_time: 2464.0 - throughput: 405.84415584415586 + inference_time: 2117.0 + throughput: 472.3665564478035 estimated_peak_memory_range: - min: 24576 - max: 187692992 + min: 110592 + max: 324998136 primary_compute_unit: NPU precision: int8 layer_info: - layers_on_npu: 1 + layers_on_npu: 86 layers_on_gpu: 0 layers_on_cpu: 0 - total_layers: 1 - job_id: jwgok8wqp + total_layers: 86 + job_id: j1p3mjrzg job_status: Passed reference_device_info: name: Samsung Galaxy S23 @@ -91,13 +93,13 @@ models: os_name: Android manufacturer: Samsung chipset: Snapdragon® 8 Gen 2 - timestamp: '2024-04-23T18:42:33.850968Z' + timestamp: '2024-05-20T16:35:31.575903Z' - torchscript_onnx_tflite: - inference_time: 1351.0 - throughput: 740.1924500370096 + inference_time: 1377.0 + throughput: 726.2164124909223 estimated_peak_memory_range: min: 12288 - max: 55206416 + max: 54112960 primary_compute_unit: NPU precision: int8 layer_info: @@ 
-105,14 +107,14 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 80 - job_id: j1gl6l8jg + job_id: j2p0rzxep job_status: Passed torchscript_onnx_qnn: - inference_time: 1589.0 - throughput: 629.3266205160478 + inference_time: 1526.0 + throughput: 655.307994757536 estimated_peak_memory_range: - min: 167936 - max: 45857248 + min: 0 + max: 44606448 primary_compute_unit: NPU precision: int8 layer_info: @@ -120,22 +122,22 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 78 - job_id: j1p3v673g + job_id: jn5q26dm5 job_status: Passed torchscript_onnx_ort: - inference_time: 1858.0 - throughput: 538.2131324004306 + inference_time: 1713.0 + throughput: 583.7711617046118 estimated_peak_memory_range: min: 0 - max: 28645856 + max: 30424256 primary_compute_unit: NPU precision: int8 layer_info: - layers_on_npu: 1 + layers_on_npu: 86 layers_on_gpu: 0 layers_on_cpu: 0 - total_layers: 1 - job_id: j1pv07nk5 + total_layers: 86 + job_id: jwgov29d5 job_status: Passed reference_device_info: name: Samsung Galaxy S24 @@ -144,51 +146,74 @@ models: os_name: Android manufacturer: Samsung chipset: Snapdragon® 8 Gen 3 - timestamp: '2024-04-23T18:42:33.851014Z' + timestamp: '2024-05-20T16:35:31.575930Z' - torchscript_onnx_tflite: - inference_time: 8152.0 - throughput: 122.6692836113837 + inference_time: 1831.0 + throughput: 546.1496450027307 estimated_peak_memory_range: - min: 12288 - max: 25276096 + min: 61440 + max: 1506248 primary_compute_unit: NPU precision: int8 layer_info: - layers_on_npu: 82 + layers_on_npu: 80 layers_on_gpu: 0 layers_on_cpu: 0 - total_layers: 82 - job_id: j1pv0yxk5 + total_layers: 80 + job_id: j1p87qk85 job_status: Passed torchscript_onnx_qnn: - inference_time: 'null' - throughput: 'null' + inference_time: 2035.0 + throughput: 491.4004914004914 estimated_peak_memory_range: - min: 0 - max: 0 - primary_compute_unit: 'null' - precision: 'null' + min: 16384 + max: 250480080 + primary_compute_unit: NPU + precision: int8 layer_info: - layers_on_npu: 0 + layers_on_npu: 78 layers_on_gpu: 0 layers_on_cpu: 0 - total_layers: 0 - job_id: jw56ex4yg - job_status: Failed - torchscript_onnx_ort: - inference_time: 75852.0 - throughput: 13.183568000843747 + total_layers: 78 + job_id: jw561y07p + job_status: Passed + reference_device_info: + name: QCS8550 (Proxy) + os: '12' + form_factor: Iot + os_name: Android + manufacturer: Qualcomm + chipset: Qcs8550 + timestamp: '2024-05-20T16:35:31.575948Z' + - torchscript_onnx_tflite: + inference_time: 8208.0 + throughput: 121.83235867446393 estimated_peak_memory_range: - min: 4431872 - max: 54054544 - primary_compute_unit: CPU - precision: fp32 + min: 12288 + max: 26585200 + primary_compute_unit: NPU + precision: int8 layer_info: - layers_on_npu: 0 + layers_on_npu: 80 layers_on_gpu: 0 - layers_on_cpu: 88 - total_layers: 88 - job_id: j7gjzq8v5 + layers_on_cpu: 0 + total_layers: 80 + job_id: j2p0lxj6p + job_status: Passed + torchscript_onnx_qnn: + inference_time: 8312.0 + throughput: 120.30798845043311 + estimated_peak_memory_range: + min: 94208 + max: 42576560 + primary_compute_unit: NPU + precision: int8 + layer_info: + layers_on_npu: 78 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 78 + job_id: j1p3er9l5 job_status: Passed reference_device_info: name: RB3 Gen 2 (Proxy) @@ -197,21 +222,21 @@ models: os_name: Android manufacturer: Qualcomm chipset: Qcs6490 - timestamp: '2024-04-23T18:42:33.851058Z' + timestamp: '2024-05-20T16:35:31.575965Z' - torchscript_onnx_tflite: - inference_time: 24077.0 - throughput: 41.533413631266356 + inference_time: 
23889.0 + throughput: 41.8602704173469 estimated_peak_memory_range: min: 45056 - max: 2559568 + max: 2992736 primary_compute_unit: NPU precision: int8 layer_info: - layers_on_npu: 82 + layers_on_npu: 80 layers_on_gpu: 0 layers_on_cpu: 0 - total_layers: 82 - job_id: jnp1wo8lg + total_layers: 80 + job_id: j1p8zkxxp job_status: Passed reference_device_info: name: RB5 (Proxy) @@ -220,42 +245,57 @@ models: os_name: Android manufacturer: Qualcomm chipset: Qcs8250 - timestamp: '2024-04-23T18:42:33.851077Z' - - torchscript_onnx_tflite: - inference_time: 1831.0 - throughput: 546.1496450027307 + timestamp: '2024-05-20T16:35:31.575975Z' + - torchscript_onnx_qnn: + inference_time: 1966.0 + throughput: 508.646998982706 estimated_peak_memory_range: - min: 32768 - max: 1466192 + min: 344064 + max: 344064 primary_compute_unit: NPU precision: int8 layer_info: - layers_on_npu: 82 + layers_on_npu: 78 layers_on_gpu: 0 layers_on_cpu: 0 - total_layers: 82 - job_id: j2p038q9p + total_layers: 78 + job_id: j1glkvqlp job_status: Passed - torchscript_onnx_qnn: - inference_time: 2151.0 - throughput: 464.9000464900046 + torchscript_onnx_ort: + inference_time: 1912.0 + throughput: 523.0125523012553 estimated_peak_memory_range: - min: 622592 - max: 7136072 + min: 115851264 + max: 115851264 primary_compute_unit: NPU precision: int8 layer_info: - layers_on_npu: 80 + layers_on_npu: 86 layers_on_gpu: 0 layers_on_cpu: 0 - total_layers: 80 - job_id: jqp4k601g + total_layers: 86 + job_id: j1pvw6nmg + job_status: Passed + torchscript_onnx_ort_dml_gpu: + inference_time: 256303.0 + throughput: 3.9016320526876394 + estimated_peak_memory_range: + min: 20701184 + max: 20701184 + primary_compute_unit: GPU + precision: int8 + layer_info: + layers_on_npu: 0 + layers_on_gpu: 1 + layers_on_cpu: 0 + total_layers: 1 + job_id: j7gjlv88p job_status: Passed reference_device_info: - name: QCS8550 (Proxy) - os: '12' - form_factor: Iot - os_name: Android + name: Snapdragon X Elite CRD + os: '11' + form_factor: Compute + os_name: Windows manufacturer: Qualcomm - chipset: Qcs8550 - timestamp: '2024-04-23T18:42:33.851108Z' + chipset: Snapdragon® X Elite + timestamp: '2024-05-20T16:35:31.575998Z' diff --git a/qai_hub_models/models/xlsr/README.md b/qai_hub_models/models/xlsr/README.md index 34d830be..1b462ab6 100644 --- a/qai_hub_models/models/xlsr/README.md +++ b/qai_hub_models/models/xlsr/README.md @@ -14,6 +14,8 @@ accross various devices, can be found [here](https://aihub.qualcomm.com/models/x a hosted Qualcomm® device. + + ## Example & Usage diff --git a/qai_hub_models/models/xlsr/export.py b/qai_hub_models/models/xlsr/export.py index d1edd6e3..917c64ab 100644 --- a/qai_hub_models/models/xlsr/export.py +++ b/qai_hub_models/models/xlsr/export.py @@ -119,12 +119,16 @@ def export_model( model.eval() source_model = torch.jit.trace(model.to("cpu"), make_torch_inputs(input_spec)) + # Convert outputs from channel last to channel first (preferred I/O format for QNN and TensorFlow Lite) + channel_last_flags = ( + " --force_channel_last_input image" + " --force_channel_last_output output_0" + if target_runtime != TargetRuntime.ORT + else "" + ) + # 2. 
Compile the model to an on-device asset model_compile_options = model.get_hub_compile_options( - target_runtime, - compile_options - + " --force_channel_last_input image" - + " --force_channel_last_output output_0", + target_runtime, compile_options + channel_last_flags, hub_device ) print(f"Optimizing model {model_name} to run on-device") submitted_compile_job = hub.submit_compile_job( @@ -162,8 +166,10 @@ def export_model( ) sample_inputs = model.sample_inputs(input_spec) # Convert inputs from channel first to channel last - hub_inputs = transpose_channel_first_to_last( - "image", sample_inputs, target_runtime + hub_inputs = ( + sample_inputs + if target_runtime == TargetRuntime.ORT + else transpose_channel_first_to_last("image", sample_inputs, target_runtime) ) submitted_inference_job = hub.submit_inference_job( model=compile_job.get_target_model(), @@ -191,8 +197,12 @@ def export_model( assert inference_job is not None and inference_job.wait().success inference_result: hub.client.DatasetEntries = inference_job.download_output_data() # type: ignore # Convert outputs from channel last to channel first - inference_result = transpose_channel_last_to_first( - "output_0", inference_result, target_runtime + inference_result = ( + inference_result + if target_runtime == TargetRuntime.ORT + else transpose_channel_last_to_first( + "output_0", inference_result, target_runtime + ) ) print_inference_metrics(inference_job, inference_result, torch_out) @@ -201,7 +211,7 @@ def export_model( def main(): warnings.filterwarnings("ignore") - parser = export_parser(model_cls=Model, supports_ort=False) + parser = export_parser(model_cls=Model) args = parser.parse_args() export_model(**vars(args)) diff --git a/qai_hub_models/models/xlsr/model.py b/qai_hub_models/models/xlsr/model.py index aaee1928..5ad0eed8 100644 --- a/qai_hub_models/models/xlsr/model.py +++ b/qai_hub_models/models/xlsr/model.py @@ -49,7 +49,7 @@ def from_pretrained(cls) -> XLSR: def get_evaluator(self) -> BaseEvaluator: return SuperResolutionOutputEvaluator() - def forward(self, image: torch.Tensor) -> torch.Tensor: + def forward(self, image): """ Run XLSR on `image`, and produce an upscaled image diff --git a/qai_hub_models/models/xlsr/perf.yaml b/qai_hub_models/models/xlsr/perf.yaml index 5931cad8..c232cb11 100644 --- a/qai_hub_models/models/xlsr/perf.yaml +++ b/qai_hub_models/models/xlsr/perf.yaml @@ -22,6 +22,7 @@ aggregated: - Samsung Galaxy S24 Ultra - Samsung Galaxy S24+ - Samsung Galaxy Tab S8 + - Snapdragon X Elite CRD - Xiaomi 12 - Xiaomi 12 Pro supported_chipsets: @@ -30,15 +31,16 @@ aggregated: - Snapdragon® 8 Gen 2 - Snapdragon® 8 Gen 3 - Snapdragon® 888 + - Snapdragon® X Elite models: - name: XLSR performance_metrics: - torchscript_onnx_tflite: - inference_time: 2596.0 - throughput: 385.2080123266564 + inference_time: 2482.0 + throughput: 402.90088638195004 estimated_peak_memory_range: - min: 12288 - max: 1829544 + min: 16384 + max: 1867704 primary_compute_unit: NPU precision: fp16 layer_info: @@ -46,14 +48,14 @@ models: layers_on_gpu: 0 layers_on_cpu: 3 total_layers: 16 - job_id: jlpeeynop + job_id: jlpevdn05 job_status: Passed torchscript_onnx_qnn: - inference_time: 971.0 - throughput: 1029.8661174047375 + inference_time: 1346.0 + throughput: 742.9420505200594 estimated_peak_memory_range: - min: 217088 - max: 11994560 + min: 16384 + max: 5062976 primary_compute_unit: NPU precision: fp16 layer_info: @@ -61,22 +63,22 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 21 - job_id: jz5w24r35 + job_id: jmg94lqv5 
job_status: Passed torchscript_onnx_ort: - inference_time: 1502.0 - throughput: 665.7789613848203 + inference_time: 1552.0 + throughput: 644.3298969072165 estimated_peak_memory_range: - min: 212992 - max: 8613544 + min: 16384 + max: 72227024 primary_compute_unit: NPU precision: fp16 layer_info: - layers_on_npu: 1 + layers_on_npu: 23 layers_on_gpu: 0 layers_on_cpu: 0 - total_layers: 1 - job_id: jnp1y6m8p + total_layers: 23 + job_id: jqp4wl6lg job_status: Passed reference_device_info: name: Samsung Galaxy S23 @@ -85,13 +87,13 @@ models: os_name: Android manufacturer: Samsung chipset: Snapdragon® 8 Gen 2 - timestamp: '2024-04-23T18:42:33.886071Z' + timestamp: '2024-05-20T16:35:31.615271Z' - torchscript_onnx_tflite: - inference_time: 1833.0 - throughput: 545.5537370430987 + inference_time: 1775.0 + throughput: 563.3802816901408 estimated_peak_memory_range: min: 16384 - max: 19549104 + max: 20190320 primary_compute_unit: NPU precision: fp16 layer_info: @@ -99,14 +101,14 @@ models: layers_on_gpu: 0 layers_on_cpu: 3 total_layers: 16 - job_id: jygzon0o5 + job_id: jygz7306p job_status: Passed torchscript_onnx_qnn: - inference_time: 632.0 - throughput: 1582.2784810126582 + inference_time: 834.0 + throughput: 1199.0407673860911 estimated_peak_memory_range: - min: 208896 - max: 17756816 + min: 0 + max: 16978032 primary_compute_unit: NPU precision: fp16 layer_info: @@ -114,22 +116,22 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 21 - job_id: jmg9jdqw5 + job_id: jnp184mlg job_status: Passed torchscript_onnx_ort: - inference_time: 1006.0 - throughput: 994.0357852882704 + inference_time: 1029.0 + throughput: 971.8172983479105 estimated_peak_memory_range: - min: 344064 - max: 16233520 + min: 0 + max: 15374048 primary_compute_unit: NPU precision: fp16 layer_info: - layers_on_npu: 1 + layers_on_npu: 23 layers_on_gpu: 0 layers_on_cpu: 0 - total_layers: 1 - job_id: jvgde2mr5 + total_layers: 23 + job_id: j0px1k89g job_status: Passed reference_device_info: name: Samsung Galaxy S24 @@ -138,13 +140,13 @@ models: os_name: Android manufacturer: Samsung chipset: Snapdragon® 8 Gen 3 - timestamp: '2024-04-23T18:42:33.886105Z' + timestamp: '2024-05-20T16:35:31.615298Z' - torchscript_onnx_tflite: - inference_time: 2709.0 - throughput: 369.139904023625 + inference_time: 2490.0 + throughput: 401.60642570281124 estimated_peak_memory_range: - min: 6631424 - max: 8101008 + min: 12623872 + max: 14367408 primary_compute_unit: NPU precision: fp16 layer_info: @@ -152,14 +154,14 @@ models: layers_on_gpu: 0 layers_on_cpu: 3 total_layers: 16 - job_id: jmg9jrn85 + job_id: jz5w9erjp job_status: Passed torchscript_onnx_qnn: - inference_time: 963.0 - throughput: 1038.4215991692627 + inference_time: 1362.0 + throughput: 734.2143906020558 estimated_peak_memory_range: - min: 212992 - max: 33066344 + min: 49152 + max: 9493856 primary_compute_unit: NPU precision: fp16 layer_info: @@ -167,7 +169,7 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 21 - job_id: jqp4k7r1g + job_id: jz57dy1r5 job_status: Passed reference_device_info: name: QCS8550 (Proxy) @@ -176,4 +178,57 @@ models: os_name: Android manufacturer: Qualcomm chipset: Qcs8550 - timestamp: '2024-04-23T18:42:33.886129Z' + timestamp: '2024-05-20T16:35:31.615315Z' + - torchscript_onnx_qnn: + inference_time: 3991.0 + throughput: 250.56376847907794 + estimated_peak_memory_range: + min: 237568 + max: 237568 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 21 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 21 + job_id: jvgdvxmlg + 
job_status: Passed + torchscript_onnx_ort: + inference_time: 1578.0 + throughput: 633.7135614702155 + estimated_peak_memory_range: + min: 8957952 + max: 8957952 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 23 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 23 + job_id: jo5mzn1qp + job_status: Passed + torchscript_onnx_ort_dml_gpu: + inference_time: 3324.0 + throughput: 300.84235860409143 + estimated_peak_memory_range: + min: 16203776 + max: 16203776 + primary_compute_unit: CPU + precision: fp32 + layer_info: + layers_on_npu: 0 + layers_on_gpu: 0 + layers_on_cpu: 14 + total_layers: 14 + job_id: jegne6dmg + job_status: Passed + reference_device_info: + name: Snapdragon X Elite CRD + os: '11' + form_factor: Compute + os_name: Windows + manufacturer: Qualcomm + chipset: Snapdragon® X Elite + timestamp: '2024-05-20T16:35:31.615338Z' diff --git a/qai_hub_models/models/xlsr_quantized/README.md b/qai_hub_models/models/xlsr_quantized/README.md index b4b99361..968a6349 100644 --- a/qai_hub_models/models/xlsr_quantized/README.md +++ b/qai_hub_models/models/xlsr_quantized/README.md @@ -14,6 +14,8 @@ accross various devices, can be found [here](https://aihub.qualcomm.com/models/x a hosted Qualcomm® device. + + ## Example & Usage diff --git a/qai_hub_models/models/xlsr_quantized/export.py b/qai_hub_models/models/xlsr_quantized/export.py index aafd8724..ccbe279b 100644 --- a/qai_hub_models/models/xlsr_quantized/export.py +++ b/qai_hub_models/models/xlsr_quantized/export.py @@ -123,12 +123,16 @@ def export_model( else: quant_calibration_data = model.get_calibration_data(target_runtime, input_spec) + # Convert outputs from channel last to channel first (preferred I/O format for QNN and TensorFlow Lite) + channel_last_flags = ( + " --force_channel_last_input image" + " --force_channel_last_output output_0" + if target_runtime != TargetRuntime.ORT + else "" + ) + # 2. 
Compile the model to an on-device asset model_compile_options = model.get_hub_compile_options( - target_runtime, - compile_options - + " --force_channel_last_input image" - + " --force_channel_last_output output_0", + target_runtime, compile_options + channel_last_flags, hub_device ) print(f"Optimizing model {model_name} to run on-device") submitted_compile_job = hub.submit_compile_job( @@ -170,8 +174,10 @@ def export_model( if target_runtime == TargetRuntime.QNN: hub_inputs = get_qnn_inputs(compile_job, sample_inputs) # Convert inputs from channel first to channel last - hub_inputs = transpose_channel_first_to_last( - "image", sample_inputs, target_runtime + hub_inputs = ( + sample_inputs + if target_runtime == TargetRuntime.ORT + else transpose_channel_first_to_last("image", sample_inputs, target_runtime) ) submitted_inference_job = hub.submit_inference_job( model=compile_job.get_target_model(), @@ -199,8 +205,12 @@ def export_model( assert inference_job is not None and inference_job.wait().success inference_result: hub.client.DatasetEntries = inference_job.download_output_data() # type: ignore # Convert outputs from channel last to channel first - inference_result = transpose_channel_last_to_first( - "output_0", inference_result, target_runtime + inference_result = ( + inference_result + if target_runtime == TargetRuntime.ORT + else transpose_channel_last_to_first( + "output_0", inference_result, target_runtime + ) ) print_inference_metrics(inference_job, inference_result, torch_out) @@ -209,7 +219,7 @@ def export_model( def main(): warnings.filterwarnings("ignore") - parser = export_parser(model_cls=Model, supports_qnn=False, supports_ort=False) + parser = export_parser(model_cls=Model) args = parser.parse_args() export_model(**vars(args)) diff --git a/qai_hub_models/models/xlsr_quantized/model.py b/qai_hub_models/models/xlsr_quantized/model.py index cbf2ec5a..7ff4cd2c 100644 --- a/qai_hub_models/models/xlsr_quantized/model.py +++ b/qai_hub_models/models/xlsr_quantized/model.py @@ -8,28 +8,23 @@ # This verifies aimet is installed, and this must be included first. 
from qai_hub_models.utils.quantization_aimet import ( AIMETQuantizableMixin, + constrain_quantized_inputs_to_image_range, ) # isort: on import torch +from aimet_torch.cross_layer_equalization import equalize_model +from aimet_torch.model_preparer import prepare_model from aimet_torch.quantsim import QuantizationSimModel, load_encodings_to_sim -from qai_hub_models.models.common import SourceModelFormat, TargetRuntime -from qai_hub_models.models.xlsr.model import XLSR, _load_xlsr_source_model +from qai_hub_models.models.xlsr.model import XLSR +from qai_hub_models.utils.aimet.config_loader import get_default_aimet_config from qai_hub_models.utils.asset_loaders import CachedWebModelAsset MODEL_ID = __name__.split(".")[-2] -MODEL_ASSET_VERSION = 2 -# Weights and config stored in S3 are sourced from -# https://github.com/quic/aimet-model-zoo/blob/develop/aimet_zoo_torch/xlsr/model/model_cards/xlsr_4x_w8a8.json: -# https://github.com/quic/aimet-model-zoo/releases/download/phase_2_february_artifacts/xlsr_4x_checkpoint_int8.pth -# and -# https://raw.githubusercontent.com/quic/aimet/release-aimet-1.23/TrainingExtensions/common/src/python/aimet_common/quantsim_config/default_config_per_channel.js -# Encodings were generated with AIMET QuantSim library -XLSR_QUANTIZED_WEIGHTS = "xlsr_4x_checkpoint_int8.pth" -AIMET_ENCODINGS = "aimet_quantization_encodings.json" -AIMET_CONFIG = "default_config_per_channel.json" +MODEL_ASSET_VERSION = 3 +DEFAULT_ENCODINGS = "xlsr_quantized_encodings.json" SCALING_FACTOR = 4 @@ -44,9 +39,7 @@ def __init__( xlsr_model: QuantizationSimModel, ) -> None: XLSR.__init__(self, xlsr_model.model) - AIMETQuantizableMixin.__init__( - self, xlsr_model, needs_onnx_direct_aimet_export=True - ) + AIMETQuantizableMixin.__init__(self, xlsr_model) @classmethod def from_pretrained( @@ -60,40 +53,27 @@ def from_pretrained( elif None: Doesn't load any encodings. Used when computing encodings. else: Interprets as a filepath and loads the encodings stored there. 
""" - xlsr = _load_xlsr_source_model() - input_shape = XLSR.get_input_spec()["image"][0] + fp16_model = XLSR.from_pretrained() + input_shape = cls.get_input_spec()["image"][0] - weights = CachedWebModelAsset.from_asset_store( - MODEL_ID, MODEL_ASSET_VERSION, XLSR_QUANTIZED_WEIGHTS - ).fetch() - aimet_config = CachedWebModelAsset.from_asset_store( - MODEL_ID, MODEL_ASSET_VERSION, AIMET_CONFIG - ).fetch() + model = prepare_model(fp16_model) + equalize_model(model, input_shape) - # Load the model weights and quantization parameters - state_dict = torch.load(weights, map_location=torch.device("cpu"))["state_dict"] - xlsr.load_state_dict(state_dict) sim = QuantizationSimModel( - xlsr, + model, quant_scheme="tf_enhanced", default_param_bw=8, default_output_bw=8, - config_file=aimet_config, + config_file=get_default_aimet_config(), dummy_input=torch.rand(input_shape), ) + constrain_quantized_inputs_to_image_range(sim) + if aimet_encodings: if aimet_encodings == "DEFAULT": aimet_encodings = CachedWebModelAsset.from_asset_store( - MODEL_ID, MODEL_ASSET_VERSION, AIMET_ENCODINGS + MODEL_ID, MODEL_ASSET_VERSION, DEFAULT_ENCODINGS ).fetch() load_encodings_to_sim(sim, aimet_encodings) return cls(sim) - - def preferred_hub_source_model_format( - self, target_runtime: TargetRuntime - ) -> SourceModelFormat: - if target_runtime == TargetRuntime.QNN: - return SourceModelFormat.ONNX - else: - return SourceModelFormat.TORCHSCRIPT diff --git a/qai_hub_models/models/xlsr_quantized/perf.yaml b/qai_hub_models/models/xlsr_quantized/perf.yaml index 9dc9925a..a33479bf 100644 --- a/qai_hub_models/models/xlsr_quantized/perf.yaml +++ b/qai_hub_models/models/xlsr_quantized/perf.yaml @@ -26,6 +26,7 @@ aggregated: - Samsung Galaxy S24 Ultra - Samsung Galaxy S24+ - Samsung Galaxy Tab S8 + - Snapdragon X Elite CRD - Xiaomi 12 - Xiaomi 12 Pro supported_chipsets: @@ -36,15 +37,16 @@ aggregated: - Snapdragon® 8 Gen 2 - Snapdragon® 8 Gen 3 - Snapdragon® 888 + - Snapdragon® X Elite models: - name: XLSR-Quantized performance_metrics: - torchscript_onnx_tflite: - inference_time: 1128.0 - throughput: 886.5248226950355 + inference_time: 1142.0 + throughput: 875.6567425569177 estimated_peak_memory_range: - min: 12288 - max: 1590504 + min: 20480 + max: 1494816 primary_compute_unit: NPU precision: int8 layer_info: @@ -52,7 +54,37 @@ models: layers_on_gpu: 0 layers_on_cpu: 3 total_layers: 17 - job_id: jmg9jdq85 + job_id: jopryvmeg + job_status: Passed + torchscript_onnx_qnn: + inference_time: 798.0 + throughput: 1253.1328320802006 + estimated_peak_memory_range: + min: 65536 + max: 74050712 + primary_compute_unit: NPU + precision: int8 + layer_info: + layers_on_npu: 17 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 17 + job_id: j2p0rz8ep + job_status: Passed + torchscript_onnx_ort: + inference_time: 1166.0 + throughput: 857.6329331046312 + estimated_peak_memory_range: + min: 12288 + max: 10231824 + primary_compute_unit: NPU + precision: int8 + layer_info: + layers_on_npu: 21 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 21 + job_id: j1glkv9lp job_status: Passed reference_device_info: name: Samsung Galaxy S23 @@ -61,13 +93,13 @@ models: os_name: Android manufacturer: Samsung chipset: Snapdragon® 8 Gen 2 - timestamp: '2024-04-23T18:42:33.910002Z' + timestamp: '2024-05-20T16:35:31.645635Z' - torchscript_onnx_tflite: - inference_time: 1209.0 - throughput: 827.129859387924 + inference_time: 948.0 + throughput: 1054.8523206751054 estimated_peak_memory_range: - min: 53248 - max: 20193472 + min: 16384 + max: 20809824 
primary_compute_unit: NPU precision: int8 layer_info: @@ -75,7 +107,37 @@ models: layers_on_gpu: 0 layers_on_cpu: 3 total_layers: 17 - job_id: jnp1y6m7p + job_id: jep2mkqm5 + job_status: Passed + torchscript_onnx_qnn: + inference_time: 548.0 + throughput: 1824.8175182481752 + estimated_peak_memory_range: + min: 65536 + max: 18623024 + primary_compute_unit: NPU + precision: int8 + layer_info: + layers_on_npu: 17 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 17 + job_id: j1p87qd85 + job_status: Passed + torchscript_onnx_ort: + inference_time: 864.0 + throughput: 1157.4074074074074 + estimated_peak_memory_range: + min: 344064 + max: 17534000 + primary_compute_unit: NPU + precision: int8 + layer_info: + layers_on_npu: 21 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 21 + job_id: jw561y97p job_status: Passed reference_device_info: name: Samsung Galaxy S24 @@ -84,21 +146,74 @@ models: os_name: Android manufacturer: Samsung chipset: Snapdragon® 8 Gen 3 - timestamp: '2024-04-23T18:42:33.910020Z' + timestamp: '2024-05-20T16:35:31.645660Z' - torchscript_onnx_tflite: - inference_time: 3053.0 - throughput: 327.54667540124467 + inference_time: 1133.0 + throughput: 882.61253309797 estimated_peak_memory_range: - min: 57344 - max: 15609680 + min: 12288 + max: 1909504 primary_compute_unit: NPU precision: int8 layer_info: - layers_on_npu: 16 + layers_on_npu: 14 layers_on_gpu: 0 layers_on_cpu: 3 - total_layers: 19 - job_id: jopr8r375 + total_layers: 17 + job_id: jqpyd1k4p + job_status: Passed + torchscript_onnx_qnn: + inference_time: 793.0 + throughput: 1261.034047919294 + estimated_peak_memory_range: + min: 69632 + max: 73885472 + primary_compute_unit: NPU + precision: int8 + layer_info: + layers_on_npu: 17 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 17 + job_id: jn5q26xm5 + job_status: Passed + reference_device_info: + name: QCS8550 (Proxy) + os: '12' + form_factor: Iot + os_name: Android + manufacturer: Qualcomm + chipset: Qcs8550 + timestamp: '2024-05-20T16:35:31.645677Z' + - torchscript_onnx_tflite: + inference_time: 2418.0 + throughput: 413.564929693962 + estimated_peak_memory_range: + min: 12288 + max: 14878432 + primary_compute_unit: NPU + precision: int8 + layer_info: + layers_on_npu: 14 + layers_on_gpu: 0 + layers_on_cpu: 3 + total_layers: 17 + job_id: jqpy6yo75 + job_status: Passed + torchscript_onnx_qnn: + inference_time: 1550.0 + throughput: 645.1612903225806 + estimated_peak_memory_range: + min: 65536 + max: 17596976 + primary_compute_unit: NPU + precision: int8 + layer_info: + layers_on_npu: 17 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 17 + job_id: jw56n080g job_status: Passed reference_device_info: name: RB3 Gen 2 (Proxy) @@ -107,21 +222,21 @@ models: os_name: Android manufacturer: Qualcomm chipset: Qcs6490 - timestamp: '2024-04-23T18:42:33.910035Z' + timestamp: '2024-05-20T16:35:31.645694Z' - torchscript_onnx_tflite: - inference_time: 15998.0 - throughput: 62.50781347668458 + inference_time: 14145.0 + throughput: 70.69635913750442 estimated_peak_memory_range: - min: 45056 - max: 17827664 + min: 4235264 + max: 15314136 primary_compute_unit: GPU precision: int8 layer_info: - layers_on_npu: 5 + layers_on_npu: 3 layers_on_gpu: 9 layers_on_cpu: 5 - total_layers: 19 - job_id: jvgdq6vl5 + total_layers: 17 + job_id: j2p0lxm6p job_status: Passed reference_device_info: name: RB5 (Proxy) @@ -130,27 +245,57 @@ models: os_name: Android manufacturer: Qualcomm chipset: Qcs8250 - timestamp: '2024-04-23T18:42:33.910050Z' - - torchscript_onnx_tflite: - inference_time: 
1313.0 - throughput: 761.6146230007616 + timestamp: '2024-05-20T16:35:31.645705Z' + - torchscript_onnx_qnn: + inference_time: 933.0 + throughput: 1071.8113612004288 estimated_peak_memory_range: - min: 28672 - max: 5004672 + min: 49152 + max: 49152 primary_compute_unit: NPU precision: int8 layer_info: - layers_on_npu: 16 + layers_on_npu: 17 layers_on_gpu: 0 - layers_on_cpu: 3 - total_layers: 19 - job_id: j2p03w0np + layers_on_cpu: 0 + total_layers: 17 + job_id: jogkyewop + job_status: Passed + torchscript_onnx_ort: + inference_time: 1191.0 + throughput: 839.6305625524769 + estimated_peak_memory_range: + min: 8818688 + max: 8818688 + primary_compute_unit: NPU + precision: int8 + layer_info: + layers_on_npu: 21 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 21 + job_id: j1p3mjlzg + job_status: Passed + torchscript_onnx_ort_dml_gpu: + inference_time: 11552.0 + throughput: 86.56509695290859 + estimated_peak_memory_range: + min: 33103872 + max: 33103872 + primary_compute_unit: GPU + precision: int8 + layer_info: + layers_on_npu: 0 + layers_on_gpu: 1 + layers_on_cpu: 0 + total_layers: 1 + job_id: jwgov27d5 + job_status: Passed reference_device_info: - name: QCS8550 (Proxy) - os: '12' - form_factor: Iot - os_name: Android + name: Snapdragon X Elite CRD + os: '11' + form_factor: Compute + os_name: Windows manufacturer: Qualcomm - chipset: Qcs8550 - timestamp: '2024-04-23T18:42:33.910063Z' + chipset: Snapdragon® X Elite + timestamp: '2024-05-20T16:35:31.645729Z' diff --git a/qai_hub_models/models/yolonas/README.md b/qai_hub_models/models/yolonas/README.md new file mode 100644 index 00000000..15cb8fb0 --- /dev/null +++ b/qai_hub_models/models/yolonas/README.md @@ -0,0 +1,61 @@ +[![Qualcomm® AI Hub Models](https://qaihub-public-assets.s3.us-west-2.amazonaws.com/qai-hub-models/quic-logo.jpg)](../../README.md) + + +# [Yolo-NAS: Real-time object detection optimized for mobile and edge](https://aihub.qualcomm.com/models/yolonas) + +YoloNAS is a machine learning model that predicts bounding boxes and classes of objects in an image. + +This is based on the implementation of Yolo-NAS found +[here](https://github.com/Deci-AI/super-gradients). This repository contains scripts for optimized on-device +export suitable to run on Qualcomm® devices. More details on model performance +across various devices can be found [here](https://aihub.qualcomm.com/models/yolonas). + +[Sign up](https://myaccount.qualcomm.com/signup) for early access to run these models on +a hosted Qualcomm® device. + + + + +## Example & Usage + +Install the package via pip: +```bash +pip install "qai_hub_models[yolonas]" +``` + + +Once installed, run the following simple CLI demo: + +```bash +python -m qai_hub_models.models.yolonas.demo +``` +More details on the CLI tool can be found with the `--help` option. See +[demo.py](demo.py) for sample usage of the model including pre/post processing +scripts. Please refer to our [general instructions on using +models](../../../#getting-started) for more usage instructions. + +## Export for on-device deployment + +This repository contains export scripts that produce a model optimized for +on-device deployment. This can be run as follows: + +```bash +python -m qai_hub_models.models.yolonas.export +``` +Additional options are documented with the `--help` option. Note that the above +script requires access to Deployment instructions for Qualcomm® AI Hub.
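+
+## Example Python usage
+
+The classes added in this folder can also be used directly from Python. The snippet
+below is an illustrative sketch rather than an official API reference; it mirrors what
+[demo.py](demo.py) and [test.py](test.py) in this folder do, and only uses names that
+exist in this repository (`YoloNAS`, `YoloNASDetectionApp`, `predict_boxes_from_image`,
+the sample `IMAGE_ADDRESS`, and `load_image`):
+
+```python
+from qai_hub_models.models.yolonas.app import YoloNASDetectionApp
+from qai_hub_models.models.yolonas.model import YoloNAS
+from qai_hub_models.models.yolov7.demo import IMAGE_ADDRESS
+from qai_hub_models.utils.asset_loaders import load_image
+
+# Build the detector and wrap it in the detection app used by the demo and tests.
+app = YoloNASDetectionApp(YoloNAS.from_pretrained(), nms_score_threshold=0.7)
+
+# Run detection on the demo's sample image and inspect the raw predicted boxes.
+boxes = app.predict_boxes_from_image(load_image(IMAGE_ADDRESS), raw_output=True)[0][0]
+print(boxes.shape)
+```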
+ +## License +- The license for the original implementation of Yolo-NAS can be found + [here](https://github.com/Deci-AI/super-gradients/blob/master/LICENSE.YOLONAS.md). +- The license for the compiled assets for on-device deployment can be found [here]({deploy_license_url}) + +## References +* [YOLO-NAS by Deci Achieves SOTA Performance on Object Detection Using Neural Architecture Search](https://deci.ai/blog/yolo-nas-object-detection-foundation-model/) +* [Source Model Implementation](https://github.com/Deci-AI/super-gradients) + +## Community +* Join [our AI Hub Slack community](https://qualcomm-ai-hub.slack.com/join/shared_invite/zt-2d5zsmas3-Sj0Q9TzslueCjS31eXG2UA#/shared-invite/email) to collaborate, post questions and learn more about on-device AI. +* For questions or feedback please [reach out to us](mailto:ai-hub-support@qti.qualcomm.com). + + diff --git a/qai_hub_models/models/yolonas/__init__.py b/qai_hub_models/models/yolonas/__init__.py new file mode 100644 index 00000000..6d2ecd39 --- /dev/null +++ b/qai_hub_models/models/yolonas/__init__.py @@ -0,0 +1,8 @@ +# --------------------------------------------------------------------- +# Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. +# SPDX-License-Identifier: BSD-3-Clause +# --------------------------------------------------------------------- +from qai_hub_models.models.yolonas.app import YoloNASDetectionApp as App # noqa: F401 + +from .model import MODEL_ID # noqa: F401 +from .model import YoloNAS as Model # noqa: F401 diff --git a/qai_hub_models/models/yolonas/app.py b/qai_hub_models/models/yolonas/app.py new file mode 100644 index 00000000..06cb65fb --- /dev/null +++ b/qai_hub_models/models/yolonas/app.py @@ -0,0 +1,52 @@ +# --------------------------------------------------------------------- +# Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. +# SPDX-License-Identifier: BSD-3-Clause +# --------------------------------------------------------------------- +from __future__ import annotations + +from typing import Tuple + +import torch + +from qai_hub_models.models._shared.yolo.app import YoloObjectDetectionApp +from qai_hub_models.models.yolonas.model import YoloNAS + + +class YoloNASDetectionApp(YoloObjectDetectionApp): + def check_image_size(self, pixel_values: torch.Tensor) -> None: + """ + Verify image size is a valid model input. Image size should be shape + [batch_size, num_channels, height, width], where height and width are multiples + of `YoloNAS.STRIDE_MULTIPLE`. + """ + if len(pixel_values.shape) != 4: + raise ValueError("Pixel Values must be rank 4: [batch, channels, x, y]") + if ( + pixel_values.shape[2] % YoloNAS.STRIDE_MULTIPLE != 0 + or pixel_values.shape[3] % YoloNAS.STRIDE_MULTIPLE != 0 + ): + raise ValueError( + f"Pixel values must have spatial dimensions (H & W) that are multiples of {YoloNAS.STRIDE_MULTIPLE}." + ) + + def pre_nms_postprocess( + self, *predictions: torch.Tensor + ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: + """ + Process the output of the YOLO detector for input to NMS. + + Parameters: + predictions: + Should contain two tensors: boxes and scores. + + Returns: + boxes: torch.Tensor + Bounding box locations. Shape [batch, num preds, 4] where 4 == (left_x, top_y, right_x, bottom_y) + scores: torch.Tensor + Confidence score that the given box is the predicted class: Shape is [batch, num_preds] + class_idx: torch.tensor + Shape is [batch, num_preds] where the last dim is the index of the most probable class of the prediction. 
+ """ + boxes, scores = predictions + scores, class_idx = torch.max(scores, -1, keepdim=False) + return boxes, scores, class_idx diff --git a/qai_hub_models/models/yolonas/conftest.py b/qai_hub_models/models/yolonas/conftest.py new file mode 100644 index 00000000..2d67f608 --- /dev/null +++ b/qai_hub_models/models/yolonas/conftest.py @@ -0,0 +1,39 @@ +# --------------------------------------------------------------------- +# Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. +# SPDX-License-Identifier: BSD-3-Clause +# --------------------------------------------------------------------- +# THIS FILE WAS AUTO-GENERATED. DO NOT EDIT MANUALLY. + +import inspect + +import pytest + +from qai_hub_models.models.yolonas import Model +from qai_hub_models.utils.testing import skip_clone_repo_check + + +# Instantiate the model only once for all tests. +# Mock from_pretrained to always return the initialized model. +# This speeds up tests and limits memory leaks. +@pytest.fixture(scope="module", autouse=True) +def cached_from_pretrained(): + with pytest.MonkeyPatch.context() as mp: + pretrained_cache = {} + from_pretrained = Model.from_pretrained + sig = inspect.signature(from_pretrained) + + @skip_clone_repo_check + def _cached_from_pretrained(*args, **kwargs): + cache_key = str(args) + str(kwargs) + model = pretrained_cache.get(cache_key, None) + if model: + return model + else: + model = from_pretrained(*args, **kwargs) + pretrained_cache[cache_key] = model + return model + + _cached_from_pretrained.__signature__ = sig + + mp.setattr(Model, "from_pretrained", _cached_from_pretrained) + yield mp diff --git a/qai_hub_models/models/yolonas/demo.py b/qai_hub_models/models/yolonas/demo.py new file mode 100644 index 00000000..fa4ade05 --- /dev/null +++ b/qai_hub_models/models/yolonas/demo.py @@ -0,0 +1,24 @@ +# --------------------------------------------------------------------- +# Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. +# SPDX-License-Identifier: BSD-3-Clause +# --------------------------------------------------------------------- +from qai_hub_models.models._shared.yolo.demo import yolo_detection_demo +from qai_hub_models.models.yolonas.app import YoloNASDetectionApp +from qai_hub_models.models.yolonas.model import MODEL_ID, YoloNAS +from qai_hub_models.models.yolov7.demo import IMAGE_ADDRESS + + +def main(is_test: bool = False): + yolo_detection_demo( + YoloNAS, + MODEL_ID, + YoloNASDetectionApp, + IMAGE_ADDRESS, + YoloNAS.STRIDE_MULTIPLE, + is_test=is_test, + default_score_threshold=0.7, + ) + + +if __name__ == "__main__": + main() diff --git a/qai_hub_models/models/yolonas/export.py b/qai_hub_models/models/yolonas/export.py new file mode 100644 index 00000000..44dbdd27 --- /dev/null +++ b/qai_hub_models/models/yolonas/export.py @@ -0,0 +1,217 @@ +# --------------------------------------------------------------------- +# Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. +# SPDX-License-Identifier: BSD-3-Clause +# --------------------------------------------------------------------- +# THIS FILE WAS AUTO-GENERATED. DO NOT EDIT MANUALLY. 
+ + +from __future__ import annotations + +import os +import warnings +from pathlib import Path +from typing import Any, Dict, List, Optional, Tuple, cast + +import qai_hub as hub +import torch + +from qai_hub_models.models.yolonas import Model +from qai_hub_models.utils.args import ( + export_parser, + get_input_spec_kwargs, + get_model_kwargs, +) +from qai_hub_models.utils.base_model import TargetRuntime +from qai_hub_models.utils.compare import torch_inference +from qai_hub_models.utils.input_spec import make_torch_inputs +from qai_hub_models.utils.printing import ( + print_inference_metrics, + print_on_target_demo_cmd, + print_profile_metrics_from_job, +) +from qai_hub_models.utils.qai_hub_helpers import ( + can_access_qualcomm_ai_hub, + export_without_hub_access, + transpose_channel_first_to_last, +) + + +def export_model( + device: str = "Samsung Galaxy S23", + chipset: Optional[str] = None, + skip_profiling: bool = False, + skip_inferencing: bool = False, + skip_downloading: bool = False, + skip_summary: bool = False, + output_dir: Optional[str] = None, + target_runtime: TargetRuntime = TargetRuntime.TFLITE, + compile_options: str = "", + profile_options: str = "", + **additional_model_kwargs, +) -> Tuple[hub.CompileJob, Optional[hub.ProfileJob], Optional[hub.InferenceJob]] | List[ + str +]: + """ + This function accomplishes 6 main tasks: + + 1. Instantiates a PyTorch model and converts it to a traced TorchScript format. + 2. Compiles the model to an asset that can be run on device. + 3. Profiles the model performance on real devices. + 4. Inferences the model on sample inputs. + 5. Downloads the model asset to the local directory. + 6. Summarizes the results from profiling and inference. + + Each of the last four steps can be optionally skipped using the input options. + + Parameters: + device: Device for which to export the model. + Full list of available devices can be found by running `hub.get_devices()`. + Defaults to DEFAULT_DEVICE if not specified. + chipset: If set, will choose a random device with this chipset. + Overrides the `device` argument. + skip_profiling: If set, skips profiling of compiled model on real devices. + skip_inferencing: If set, skips computing on-device outputs from sample data. + skip_downloading: If set, skips downloading of compiled model. + skip_summary: If set, skips waiting for and summarizing results + from profiling and inference. + output_dir: Directory to store generated assets (e.g. compiled model). + Defaults to `/build/`. + target_runtime: Which on-device runtime to target. Default is TFLite. + compile_options: Additional options to pass when submitting the compile job. + profile_options: Additional options to pass when submitting the profile job. + **additional_model_kwargs: Additional optional kwargs used to customize + `model_cls.from_pretrained` and `model.get_input_spec` + + Returns: + A 3-tuple of: + * A CompileJob object containing metadata about the compile job submitted to hub. + * A ProfileJob containing metadata about the profile job (None if profiling skipped). + * An InferenceJob containing metadata about the inference job (None if inferencing skipped). 
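+
+    Example:
+        A minimal, illustrative invocation (the argument values shown are simply
+        the documented defaults; any supported device or runtime may be used)::
+
+            jobs = export_model(
+                device="Samsung Galaxy S23",
+                target_runtime=TargetRuntime.TFLITE,
+            )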
+ """ + model_name = "yolonas" + output_path = Path(output_dir or Path.cwd() / "build" / model_name) + if chipset: + hub_device = hub.Device(attributes=f"chipset:{chipset}") + else: + hub_device = hub.Device(name=device) + if not can_access_qualcomm_ai_hub(): + return export_without_hub_access( + "yolonas", + "Yolo-NAS", + device, + skip_profiling, + skip_inferencing, + skip_downloading, + skip_summary, + output_path, + target_runtime, + compile_options, + profile_options, + ) + + # 1. Initialize PyTorch model + model = Model.from_pretrained(**get_model_kwargs(Model, additional_model_kwargs)) + input_spec = model.get_input_spec( + **get_input_spec_kwargs(model, additional_model_kwargs) + ) + + # Trace the model + model.eval() + source_model = torch.jit.trace(model.to("cpu"), make_torch_inputs(input_spec)) + + # Convert outputs from channel last to channel first (preferred I/O format for QNN and TensorFlow Lite) + channel_last_flags = ( + " --force_channel_last_input image" + if target_runtime != TargetRuntime.ORT + else "" + ) + + # 2. Compile the model to an on-device asset + model_compile_options = model.get_hub_compile_options( + target_runtime, compile_options + channel_last_flags, hub_device + ) + print(f"Optimizing model {model_name} to run on-device") + submitted_compile_job = hub.submit_compile_job( + model=source_model, + input_specs=input_spec, + device=hub_device, + name=model_name, + options=model_compile_options, + ) + compile_job = cast(hub.client.CompileJob, submitted_compile_job) + + # 3. Profile the model asset on real devices + profile_job: Optional[hub.client.ProfileJob] = None + if not skip_profiling: + profile_options_all = model.get_hub_profile_options( + target_runtime, profile_options + ) + print(f"Profiling model {model_name} on a hosted device.") + submitted_profile_job = hub.submit_profile_job( + model=compile_job.get_target_model(), + device=hub_device, + name=model_name, + options=profile_options_all, + ) + profile_job = cast(hub.client.ProfileJob, submitted_profile_job) + + # 4. Run inference on-device with sample inputs + inference_job: Optional[hub.client.InferenceJob] = None + if not skip_inferencing: + profile_options_all = model.get_hub_profile_options( + target_runtime, profile_options + ) + print( + f"Running inference for {model_name} on a hosted device with example inputs." + ) + sample_inputs = model.sample_inputs(input_spec) + # Convert inputs from channel first to channel last + hub_inputs = ( + sample_inputs + if target_runtime == TargetRuntime.ORT + else transpose_channel_first_to_last("image", sample_inputs, target_runtime) + ) + submitted_inference_job = hub.submit_inference_job( + model=compile_job.get_target_model(), + inputs=hub_inputs, + device=hub_device, + name=model_name, + options=profile_options_all, + ) + inference_job = cast(hub.client.InferenceJob, submitted_inference_job) + + # 5. Download the model asset to a local file + if not skip_downloading: + os.makedirs(output_path, exist_ok=True) + target_model: hub.Model = compile_job.get_target_model() # type: ignore + target_model.download(str(output_path / f"{model_name}.tflite")) + + # 6. 
Summarize the results from profiling and inference + if not skip_summary and not skip_profiling: + assert profile_job is not None and profile_job.wait().success + profile_data: Dict[str, Any] = profile_job.download_profile() # type: ignore + print_profile_metrics_from_job(profile_job, profile_data) + + if not skip_summary and not skip_inferencing: + torch_out = torch_inference(model, sample_inputs) + assert inference_job is not None and inference_job.wait().success + inference_result: hub.client.DatasetEntries = inference_job.download_output_data() # type: ignore + print_inference_metrics( + inference_job, inference_result, torch_out, outputs_to_skip=[2] + ) + + if not skip_summary: + print_on_target_demo_cmd(compile_job, Path(__file__).parent.resolve(), device) + + return (compile_job, profile_job, inference_job) + + +def main(): + warnings.filterwarnings("ignore") + parser = export_parser(model_cls=Model) + args = parser.parse_args() + export_model(**vars(args)) + + +if __name__ == "__main__": + main() diff --git a/qai_hub_models/models/yolonas/info.yaml b/qai_hub_models/models/yolonas/info.yaml new file mode 100644 index 00000000..b2b6b9e2 --- /dev/null +++ b/qai_hub_models/models/yolonas/info.yaml @@ -0,0 +1,40 @@ +name: Yolo-NAS +# id must match with the model dir name in qai_hub_models +id: yolonas +status: public +headline: Real-time object detection optimized for mobile and edge. +domain: Computer Vision +description: YoloNAS is a machine learning model that predicts bounding boxes and classes + of objects in an image. +use_case: Object Detection +tags: + - real-time +research_paper: https://deci.ai/blog/yolo-nas-object-detection-foundation-model/ +research_paper_title: 'YOLO-NAS by Deci Achieves SOTA Performance on Object Detection Using Neural Architecture Search' +license: https://github.com/Deci-AI/super-gradients/blob/master/LICENSE.YOLONAS.md +deploy_license: https://qaihub-public-assets.s3.us-west-2.amazonaws.com/qai-hub-models/Qualcomm+AI+Hub+Proprietary+License.pdf +source_repo: https://github.com/Deci-AI/super-gradients +technical_details: + Model checkpoint: YoloNAS Small + Input resolution: 640x640 + Number of parameters: 12.2M + Model size: 46.6 MB +applicable_scenarios: + - Factory Automation + - Robotic Navigation + - Camera +related_models: + - yolov6 + - yolov7 + - yolov8_det +form_factors: + - Phone + - Tablet + - IoT + - XR +has_static_banner: yes +has_animated_banner: yes +license_type: apache-2.0 +deploy_license_type: AI Model Hub License +dataset: + - COCO diff --git a/qai_hub_models/models/yolonas/model.py b/qai_hub_models/models/yolonas/model.py new file mode 100644 index 00000000..b2f5e62f --- /dev/null +++ b/qai_hub_models/models/yolonas/model.py @@ -0,0 +1,160 @@ +# --------------------------------------------------------------------- +# Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. 
+# SPDX-License-Identifier: BSD-3-Clause +# --------------------------------------------------------------------- +from __future__ import annotations + +import os +import sys + +import torch + +from qai_hub_models.evaluators.base_evaluators import BaseEvaluator +from qai_hub_models.evaluators.detection_evaluator import DetectionEvaluator +from qai_hub_models.models._shared.yolo.utils import yolo_sample_inputs +from qai_hub_models.models.common import SampleInputsType +from qai_hub_models.utils.asset_loaders import SourceAsRoot, find_replace_in_repo +from qai_hub_models.utils.base_model import BaseModel +from qai_hub_models.utils.input_spec import InputSpec + +SOURCE_REPOSITORY = "https://github.com/Deci-AI/super-gradients/" +SOURCE_REPO_COMMIT = "00a1f86da1a5bfdbbac44bfeda177de9439f4c73" +MODEL_ID = __name__.split(".")[-2] +DEFAULT_WEIGHTS = "yolo_nas_s" +MODEL_ASSET_VERSION = 1 +YOLO_HEAD_FILE = ( + "src/super_gradients/training/models/detection_models/pp_yolo_e/pp_yolo_head.py" +) +DFL_HEAD_FILE = ( + "src/super_gradients/training/models/detection_models/yolo_nas/dfl_heads.py" +) + + +class YoloNAS(BaseModel): + """Exportable YoloNAS bounding box detector, end-to-end.""" + + def __init__( + self, + model: torch.nn.Module, + include_postprocessing: bool = True, + class_dtype: torch.dtype = torch.float32, + ) -> None: + super().__init__() + self.model = model + self.include_postprocessing = include_postprocessing + self.class_dtype = class_dtype + + # All image input spatial dimensions should be a multiple of this stride. + STRIDE_MULTIPLE = 32 + + def get_evaluator(self) -> BaseEvaluator: + return DetectionEvaluator(*self.get_input_spec()["image"][0][2:]) + + @classmethod + def from_pretrained( + cls, + weights_name: str = DEFAULT_WEIGHTS, + include_postprocessing: bool = True, + ): + with SourceAsRoot( + SOURCE_REPOSITORY, + SOURCE_REPO_COMMIT, + MODEL_ID, + MODEL_ASSET_VERSION, + ) as repo_root: + # There are some places where the input shape is derived dynamically + # from tensors that doesn't play nice with AIMET. Set the `eval_size` + # based on the model input spec and use that instead to derive shapes. + find_replace_in_repo( + repo_root, + YOLO_HEAD_FILE, + "feats: Tuple[Tensor, ...],\n", + "feats: Tuple[Tensor, ...], eval_size: Tuple[Tensor, Tensor],\n", + ) + find_replace_in_repo( + repo_root, + YOLO_HEAD_FILE, + "_, _, h, w = feat.shape", + "h, w = (eval_size[0] // stride, eval_size[1] // stride)", + ) + find_replace_in_repo( + repo_root, + DFL_HEAD_FILE, + "feats, self.fpn_strides", + "feats, self.eval_size, self.fpn_strides", + ) + find_replace_in_repo( + repo_root, DFL_HEAD_FILE, "if feats is not None:", "if False:" + ) + find_replace_in_repo( + repo_root, DFL_HEAD_FILE, "if self.eval_size:", "if False:" + ) + find_replace_in_repo( + repo_root, DFL_HEAD_FILE, "dtype=dtype", "dtype=torch.float32" + ) + find_replace_in_repo( + repo_root, DFL_HEAD_FILE, "device=device", "device='cpu'" + ) + + os.chdir("src") + sys.path.append(".") + + from super_gradients.training import models + + model = models.get(weights_name, pretrained_weights="coco") + input_size = cls.get_input_spec()["image"][0] + model.prep_model_for_conversion(input_size=input_size) + model.heads.eval_size = input_size[2:] + return cls(model.eval(), include_postprocessing) + + def forward(self, image): + """ + Run YoloNAS on `image`, and produce a predicted set of bounding boxes and associated class probabilities. + + Parameters: + image: Pixel values pre-processed for encoder consumption. 
+ Range: float[0, 1] + 3-channel Color Space: BGR + + Returns: + If self.include_postprocessing: + boxes: torch.Tensor + Bounding box locations. Shape [batch, num preds, 4] where 4 == (left_x, top_y, right_x, bottom_y) + scores: torch.Tensor + Confidence score that the given box is the predicted class: Shape is [batch, num_preds] + class_idx: torch.tensor + Shape is [batch, num_preds] where the last dim is the index of the most probable class of the prediction. + else: + boxes: torch.Tensor + Bounding box locations. Shape [batch, num preds, 4] where 4 == (left_x, top_y, right_x, bottom_y) + scores: torch.Tensor + Probability distribution over the classes for each box prediction. + Shape is [batch, num_preds, num_classes] + """ + out = self.model(image) + if isinstance(out[0], tuple): + out = out[0] + boxes, scores = out + if not self.include_postprocessing: + return boxes, scores + scores, class_idx = torch.max(scores, -1, keepdim=False) + return boxes, scores, class_idx.to(self.class_dtype) + + @staticmethod + def get_input_spec( + batch_size: int = 1, + num_channels: int = 3, + height: int = 640, + width: int = 640, + ) -> InputSpec: + """ + Returns the input specification (name -> (shape, type). This can be + used to submit profiling job on Qualcomm AI Hub. + """ + return {"image": ((batch_size, num_channels, height, width), "float32")} + + def sample_inputs(self, input_spec: InputSpec | None = None) -> SampleInputsType: + if input_spec is not None and input_spec != self.get_input_spec(): + raise ValueError("Sample input has a fixed size that cannot be changed") + + return yolo_sample_inputs() diff --git a/qai_hub_models/models/yolonas/perf.yaml b/qai_hub_models/models/yolonas/perf.yaml new file mode 100644 index 00000000..56ca583e --- /dev/null +++ b/qai_hub_models/models/yolonas/perf.yaml @@ -0,0 +1,234 @@ +aggregated: + supported_oses: + - Android + supported_devices: + - Google Pixel 3 + - Google Pixel 3a + - Google Pixel 3a XL + - Google Pixel 4 + - Google Pixel 4a + - Google Pixel 5a 5G + - QCS8550 (Proxy) + - Samsung Galaxy S21 + - Samsung Galaxy S21 Ultra + - Samsung Galaxy S21+ + - Samsung Galaxy S22 5G + - Samsung Galaxy S22 Ultra 5G + - Samsung Galaxy S22+ 5G + - Samsung Galaxy S23 + - Samsung Galaxy S23 Ultra + - Samsung Galaxy S23+ + - Samsung Galaxy S24 + - Samsung Galaxy S24 Ultra + - Samsung Galaxy S24+ + - Samsung Galaxy Tab S8 + - Snapdragon X Elite CRD + - Xiaomi 12 + - Xiaomi 12 Pro + supported_chipsets: + - Qcs8550 + - Snapdragon® 8 Gen 1 + - Snapdragon® 8 Gen 2 + - Snapdragon® 8 Gen 3 + - Snapdragon® 888 + - Snapdragon® X Elite +models: +- name: Yolo-NAS + performance_metrics: + - torchscript_onnx_tflite: + inference_time: 11744.0 + throughput: 85.14986376021798 + estimated_peak_memory_range: + min: 20480 + max: 7339120 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 201 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 201 + job_id: j1pvw6ymg + job_status: Passed + torchscript_onnx_qnn: + inference_time: 14893.0 + throughput: 67.14563889075404 + estimated_peak_memory_range: + min: 6094848 + max: 24240072 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 289 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 289 + job_id: jygz73q6p + job_status: Passed + torchscript_onnx_ort: + inference_time: 9987.0 + throughput: 100.13016921998599 + estimated_peak_memory_range: + min: 32768 + max: 59395840 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 290 + layers_on_gpu: 0 + 
layers_on_cpu: 0 + total_layers: 290 + job_id: jvgdvxylg + job_status: Passed + reference_device_info: + name: Samsung Galaxy S23 + os: '13' + form_factor: Phone + os_name: Android + manufacturer: Samsung + chipset: Snapdragon® 8 Gen 2 + timestamp: '2024-05-20T16:35:31.684983Z' + - torchscript_onnx_tflite: + inference_time: 8017.0 + throughput: 124.73493825620557 + estimated_peak_memory_range: + min: 229376 + max: 96302464 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 201 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 201 + job_id: j7gjlv68p + job_status: Passed + torchscript_onnx_qnn: + inference_time: 10167.0 + throughput: 98.35743090390478 + estimated_peak_memory_range: + min: 4931584 + max: 93285616 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 289 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 289 + job_id: jz5w9e0jp + job_status: Passed + torchscript_onnx_ort: + inference_time: 6706.0 + throughput: 149.1201908738443 + estimated_peak_memory_range: + min: 4931584 + max: 51901312 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 290 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 290 + job_id: jz57dymr5 + job_status: Passed + reference_device_info: + name: Samsung Galaxy S24 + os: '14' + form_factor: Phone + os_name: Android + manufacturer: Samsung + chipset: Snapdragon® 8 Gen 3 + timestamp: '2024-05-20T16:35:31.685021Z' + - torchscript_onnx_tflite: + inference_time: 11751.0 + throughput: 85.09914049868097 + estimated_peak_memory_range: + min: 249856 + max: 7448824 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 201 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 201 + job_id: jlpevd005 + job_status: Passed + torchscript_onnx_qnn: + inference_time: 15283.0 + throughput: 65.43217954590068 + estimated_peak_memory_range: + min: 4947968 + max: 24255016 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 289 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 289 + job_id: jnp184klg + job_status: Passed + reference_device_info: + name: QCS8550 (Proxy) + os: '12' + form_factor: Iot + os_name: Android + manufacturer: Qualcomm + chipset: Qcs8550 + timestamp: '2024-05-20T16:35:31.685050Z' + - torchscript_onnx_qnn: + inference_time: 11900.0 + throughput: 84.03361344537815 + estimated_peak_memory_range: + min: 4923392 + max: 4923392 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 289 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 289 + job_id: jmg94l7v5 + job_status: Passed + torchscript_onnx_ort: + inference_time: 10117.0 + throughput: 98.84353069091628 + estimated_peak_memory_range: + min: 15732736 + max: 15732736 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 290 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 290 + job_id: jqp4wl7lg + job_status: Passed + torchscript_onnx_ort_dml_gpu: + inference_time: 49609.0 + throughput: 20.157632687617166 + estimated_peak_memory_range: + min: 70164480 + max: 70164480 + primary_compute_unit: CPU + precision: fp32 + layer_info: + layers_on_npu: 0 + layers_on_gpu: 0 + layers_on_cpu: 186 + total_layers: 186 + job_id: j0px1kq9g + job_status: Passed + reference_device_info: + name: Snapdragon X Elite CRD + os: '11' + form_factor: Compute + os_name: Windows + manufacturer: Qualcomm + chipset: Snapdragon® X Elite + timestamp: '2024-05-20T16:35:31.685076Z' diff --git a/qai_hub_models/models/yolonas/requirements.txt 
b/qai_hub_models/models/yolonas/requirements.txt new file mode 100644 index 00000000..b6f0ec66 --- /dev/null +++ b/qai_hub_models/models/yolonas/requirements.txt @@ -0,0 +1,9 @@ +object-detection-metrics==0.4.post1 +stringcase==1.2.0 +rapidfuzz==3.8.1 +treelib==1.6.1 +imagesize==1.4.1 +einops==0.3.2 +Deprecated==1.2.11 +data-gradients==0.3.1 +shapely==2.0.3 diff --git a/qai_hub_models/models/yolonas/test.py b/qai_hub_models/models/yolonas/test.py new file mode 100644 index 00000000..19ce0b85 --- /dev/null +++ b/qai_hub_models/models/yolonas/test.py @@ -0,0 +1,39 @@ +# --------------------------------------------------------------------- +# Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. +# SPDX-License-Identifier: BSD-3-Clause +# --------------------------------------------------------------------- +from qai_hub_models.models.yolonas.app import YoloNASDetectionApp +from qai_hub_models.models.yolonas.demo import main as demo_main +from qai_hub_models.models.yolonas.model import MODEL_ASSET_VERSION, MODEL_ID, YoloNAS +from qai_hub_models.models.yolov7.demo import IMAGE_ADDRESS +from qai_hub_models.utils.asset_loaders import ( + CachedWebModelAsset, + load_image, + load_numpy, +) +from qai_hub_models.utils.bounding_box_processing import get_iou +from qai_hub_models.utils.testing import skip_clone_repo_check + +GT_BOXES = CachedWebModelAsset.from_asset_store( + MODEL_ID, MODEL_ASSET_VERSION, "yolonas_boxes.npy" +) + + +@skip_clone_repo_check +def test_task(): + image = load_image(IMAGE_ADDRESS) + app = YoloNASDetectionApp(YoloNAS.from_pretrained(), nms_score_threshold=0.7) + boxes = app.predict_boxes_from_image(image, raw_output=True)[0][0].numpy() + print(boxes.shape) + boxes_gt = load_numpy(GT_BOXES) + boxes = sorted(boxes, key=lambda box: box[0]) + boxes_gt = sorted(boxes_gt, key=lambda box: box[0]) + assert len(boxes) == len(boxes_gt) + ious = [get_iou(box, box_gt) for box, box_gt in zip(boxes, boxes_gt)] + for iou in ious: + assert iou > 0.95 + + +@skip_clone_repo_check +def test_demo(): + demo_main(is_test=True) diff --git a/qai_hub_models/models/yolonas_quantized/README.md b/qai_hub_models/models/yolonas_quantized/README.md new file mode 100644 index 00000000..eed10e62 --- /dev/null +++ b/qai_hub_models/models/yolonas_quantized/README.md @@ -0,0 +1,61 @@ +[![Qualcomm® AI Hub Models](https://qaihub-public-assets.s3.us-west-2.amazonaws.com/qai-hub-models/quic-logo.jpg)](../../README.md) + + +# [Yolo-NAS-Quantized: Quantized real-time object detection optimized for mobile and edge](https://aihub.qualcomm.com/models/yolonas_quantized) + +YoloNAS is a machine learning model that predicts bounding boxes and classes of objects in an image. This model is post-training quantized to int8 using samples from the COCO dataset. + +This is based on the implementation of Yolo-NAS-Quantized found +[here](https://github.com/Deci-AI/super-gradients). This repository contains scripts for optimized on-device +export suitable to run on Qualcomm® devices. More details on model performance +across various devices can be found [here](https://aihub.qualcomm.com/models/yolonas_quantized). + +[Sign up](https://myaccount.qualcomm.com/signup) for early access to run these models on +a hosted Qualcomm® device.
+ + + + +## Example & Usage + +Install the package via pip: +```bash +pip install "qai_hub_models[yolonas_quantized]" +``` + + +Once installed, run the following simple CLI demo: + +```bash +python -m qai_hub_models.models.yolonas_quantized.demo +``` +More details on the CLI tool can be found with the `--help` option. See +[demo.py](demo.py) for sample usage of the model including pre/post processing +scripts. Please refer to our [general instructions on using +models](../../../#getting-started) for more usage instructions. + +## Export for on-device deployment + +This repository contains export scripts that produce a model optimized for +on-device deployment. This can be run as follows: + +```bash +python -m qai_hub_models.models.yolonas_quantized.export +``` +Additional options are documented with the `--help` option. Note that the above +script requires access to Deployment instructions for Qualcomm® AI Hub. + +## License +- The license for the original implementation of Yolo-NAS-Quantized can be found + [here](https://github.com/Deci-AI/super-gradients/blob/master/LICENSE.YOLONAS.md). +- The license for the compiled assets for on-device deployment can be found [here]({deploy_license_url}) + +## References +* [YOLO-NAS by Deci Achieves SOTA Performance on Object Detection Using Neural Architecture Search](https://deci.ai/blog/yolo-nas-object-detection-foundation-model/) +* [Source Model Implementation](https://github.com/Deci-AI/super-gradients) + +## Community +* Join [our AI Hub Slack community](https://qualcomm-ai-hub.slack.com/join/shared_invite/zt-2d5zsmas3-Sj0Q9TzslueCjS31eXG2UA#/shared-invite/email) to collaborate, post questions and learn more about on-device AI. +* For questions or feedback please [reach out to us](mailto:ai-hub-support@qti.qualcomm.com). + + diff --git a/qai_hub_models/models/yolonas_quantized/__init__.py b/qai_hub_models/models/yolonas_quantized/__init__.py new file mode 100644 index 00000000..28fd836f --- /dev/null +++ b/qai_hub_models/models/yolonas_quantized/__init__.py @@ -0,0 +1,8 @@ +# --------------------------------------------------------------------- +# Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. +# SPDX-License-Identifier: BSD-3-Clause +# --------------------------------------------------------------------- +from qai_hub_models.models.yolonas.app import YoloNASDetectionApp as App # noqa: F401 + +from .model import MODEL_ID # noqa: F401 +from .model import YoloNASQuantizable as Model # noqa: F401 diff --git a/qai_hub_models/models/yolonas_quantized/conftest.py b/qai_hub_models/models/yolonas_quantized/conftest.py new file mode 100644 index 00000000..61cc2334 --- /dev/null +++ b/qai_hub_models/models/yolonas_quantized/conftest.py @@ -0,0 +1,39 @@ +# --------------------------------------------------------------------- +# Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. +# SPDX-License-Identifier: BSD-3-Clause +# --------------------------------------------------------------------- +# THIS FILE WAS AUTO-GENERATED. DO NOT EDIT MANUALLY. + +import inspect + +import pytest + +from qai_hub_models.models.yolonas_quantized import Model +from qai_hub_models.utils.testing import skip_clone_repo_check + + +# Instantiate the model only once for all tests. +# Mock from_pretrained to always return the initialized model. +# This speeds up tests and limits memory leaks. 
+@pytest.fixture(scope="module", autouse=True) +def cached_from_pretrained(): + with pytest.MonkeyPatch.context() as mp: + pretrained_cache = {} + from_pretrained = Model.from_pretrained + sig = inspect.signature(from_pretrained) + + @skip_clone_repo_check + def _cached_from_pretrained(*args, **kwargs): + cache_key = str(args) + str(kwargs) + model = pretrained_cache.get(cache_key, None) + if model: + return model + else: + model = from_pretrained(*args, **kwargs) + pretrained_cache[cache_key] = model + return model + + _cached_from_pretrained.__signature__ = sig + + mp.setattr(Model, "from_pretrained", _cached_from_pretrained) + yield mp diff --git a/qai_hub_models/models/yolonas_quantized/demo.py b/qai_hub_models/models/yolonas_quantized/demo.py new file mode 100644 index 00000000..fb2e64a3 --- /dev/null +++ b/qai_hub_models/models/yolonas_quantized/demo.py @@ -0,0 +1,24 @@ +# --------------------------------------------------------------------- +# Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. +# SPDX-License-Identifier: BSD-3-Clause +# --------------------------------------------------------------------- +from qai_hub_models.models._shared.yolo.demo import yolo_detection_demo +from qai_hub_models.models.yolonas.app import YoloNASDetectionApp +from qai_hub_models.models.yolonas_quantized.model import MODEL_ID, YoloNASQuantizable +from qai_hub_models.models.yolov7.demo import IMAGE_ADDRESS + + +def main(is_test: bool = False): + yolo_detection_demo( + YoloNASQuantizable, + MODEL_ID, + YoloNASDetectionApp, + IMAGE_ADDRESS, + YoloNASQuantizable.STRIDE_MULTIPLE, + is_test=is_test, + default_score_threshold=0.7, + ) + + +if __name__ == "__main__": + main() diff --git a/qai_hub_models/models/yolonas_quantized/export.py b/qai_hub_models/models/yolonas_quantized/export.py new file mode 100644 index 00000000..4f5733e9 --- /dev/null +++ b/qai_hub_models/models/yolonas_quantized/export.py @@ -0,0 +1,225 @@ +# --------------------------------------------------------------------- +# Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. +# SPDX-License-Identifier: BSD-3-Clause +# --------------------------------------------------------------------- +# THIS FILE WAS AUTO-GENERATED. DO NOT EDIT MANUALLY. 
+ + +from __future__ import annotations + +import os +import warnings +from pathlib import Path +from typing import Any, Dict, List, Optional, Tuple, cast + +import qai_hub as hub + +from qai_hub_models.models.yolonas_quantized import Model +from qai_hub_models.utils.args import ( + export_parser, + get_input_spec_kwargs, + get_model_kwargs, +) +from qai_hub_models.utils.base_model import TargetRuntime +from qai_hub_models.utils.compare import torch_inference +from qai_hub_models.utils.printing import ( + print_inference_metrics, + print_on_target_demo_cmd, + print_profile_metrics_from_job, +) +from qai_hub_models.utils.qai_hub_helpers import ( + can_access_qualcomm_ai_hub, + export_without_hub_access, + transpose_channel_first_to_last, +) +from qai_hub_models.utils.qnn_helpers import get_qnn_inputs + + +def export_model( + device: str = "Samsung Galaxy S23", + chipset: Optional[str] = None, + skip_profiling: bool = False, + skip_inferencing: bool = False, + skip_downloading: bool = False, + skip_summary: bool = False, + output_dir: Optional[str] = None, + target_runtime: TargetRuntime = TargetRuntime.TFLITE, + compile_options: str = "", + profile_options: str = "", + **additional_model_kwargs, +) -> Tuple[hub.CompileJob, Optional[hub.ProfileJob], Optional[hub.InferenceJob]] | List[ + str +]: + """ + This function accomplishes 6 main tasks: + + 1. Instantiates a PyTorch model and converts it to a traced TorchScript format. + 2. Compiles the model to an asset that can be run on device. + 3. Profiles the model performance on real devices. + 4. Inferences the model on sample inputs. + 5. Downloads the model asset to the local directory. + 6. Summarizes the results from profiling and inference. + + Each of the last four steps can be optionally skipped using the input options. + + Parameters: + device: Device for which to export the model. + Full list of available devices can be found by running `hub.get_devices()`. + Defaults to DEFAULT_DEVICE if not specified. + chipset: If set, will choose a random device with this chipset. + Overrides the `device` argument. + skip_profiling: If set, skips profiling of compiled model on real devices. + skip_inferencing: If set, skips computing on-device outputs from sample data. + skip_downloading: If set, skips downloading of compiled model. + skip_summary: If set, skips waiting for and summarizing results + from profiling and inference. + output_dir: Directory to store generated assets (e.g. compiled model). + Defaults to `/build/`. + target_runtime: Which on-device runtime to target. Default is TFLite. + compile_options: Additional options to pass when submitting the compile job. + profile_options: Additional options to pass when submitting the profile job. + **additional_model_kwargs: Additional optional kwargs used to customize + `model_cls.from_pretrained` and `model.get_input_spec` + + Returns: + A 3-tuple of: + * A CompileJob object containing metadata about the compile job submitted to hub. + * A ProfileJob containing metadata about the profile job (None if profiling skipped). + * An InferenceJob containing metadata about the inference job (None if inferencing skipped). 
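+
+    Example:
+        An illustrative invocation that only compiles the model, skipping the
+        device-dependent steps (all keyword arguments are parameters documented
+        above)::
+
+            jobs = export_model(
+                device="Samsung Galaxy S23",
+                skip_profiling=True,
+                skip_inferencing=True,
+            )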
+ """ + model_name = "yolonas_quantized" + output_path = Path(output_dir or Path.cwd() / "build" / model_name) + if chipset: + hub_device = hub.Device(attributes=f"chipset:{chipset}") + else: + hub_device = hub.Device(name=device) + if not can_access_qualcomm_ai_hub(): + return export_without_hub_access( + "yolonas_quantized", + "Yolo-NAS-Quantized", + device, + skip_profiling, + skip_inferencing, + skip_downloading, + skip_summary, + output_path, + target_runtime, + compile_options, + profile_options, + ) + + # 1. Initialize PyTorch model + model = Model.from_pretrained(**get_model_kwargs(Model, additional_model_kwargs)) + input_spec = model.get_input_spec( + **get_input_spec_kwargs(model, additional_model_kwargs) + ) + + # Trace the model + source_model = model.convert_to_hub_source_model( + target_runtime, output_path, input_spec + ) + if target_runtime == TargetRuntime.TFLITE: + quant_calibration_data = None + else: + quant_calibration_data = model.get_calibration_data(target_runtime, input_spec) + + # Convert outputs from channel last to channel first (preferred I/O format for QNN and TensorFlow Lite) + channel_last_flags = ( + " --force_channel_last_input image" + if target_runtime != TargetRuntime.ORT + else "" + ) + + # 2. Compile the model to an on-device asset + model_compile_options = model.get_hub_compile_options( + target_runtime, compile_options + channel_last_flags, hub_device + ) + print(f"Optimizing model {model_name} to run on-device") + submitted_compile_job = hub.submit_compile_job( + model=source_model, + input_specs=input_spec, + device=hub_device, + name=model_name, + calibration_data=quant_calibration_data, + options=model_compile_options, + ) + compile_job = cast(hub.client.CompileJob, submitted_compile_job) + + # 3. Profile the model asset on real devices + profile_job: Optional[hub.client.ProfileJob] = None + if not skip_profiling: + profile_options_all = model.get_hub_profile_options( + target_runtime, profile_options + ) + print(f"Profiling model {model_name} on a hosted device.") + submitted_profile_job = hub.submit_profile_job( + model=compile_job.get_target_model(), + device=hub_device, + name=model_name, + options=profile_options_all, + ) + profile_job = cast(hub.client.ProfileJob, submitted_profile_job) + + # 4. Run inference on-device with sample inputs + inference_job: Optional[hub.client.InferenceJob] = None + if not skip_inferencing: + profile_options_all = model.get_hub_profile_options( + target_runtime, profile_options + ) + print( + f"Running inference for {model_name} on a hosted device with example inputs." + ) + sample_inputs = model.sample_inputs(input_spec) + hub_inputs = sample_inputs + if target_runtime == TargetRuntime.QNN: + hub_inputs = get_qnn_inputs(compile_job, sample_inputs) + # Convert inputs from channel first to channel last + hub_inputs = ( + sample_inputs + if target_runtime == TargetRuntime.ORT + else transpose_channel_first_to_last("image", sample_inputs, target_runtime) + ) + submitted_inference_job = hub.submit_inference_job( + model=compile_job.get_target_model(), + inputs=hub_inputs, + device=hub_device, + name=model_name, + options=profile_options_all, + ) + inference_job = cast(hub.client.InferenceJob, submitted_inference_job) + + # 5. Download the model asset to a local file + if not skip_downloading: + os.makedirs(output_path, exist_ok=True) + target_model: hub.Model = compile_job.get_target_model() # type: ignore + target_model.download(str(output_path / f"{model_name}.tflite")) + + # 6. 
Summarize the results from profiling and inference + if not skip_summary and not skip_profiling: + assert profile_job is not None and profile_job.wait().success + profile_data: Dict[str, Any] = profile_job.download_profile() # type: ignore + print_profile_metrics_from_job(profile_job, profile_data) + + if not skip_summary and not skip_inferencing: + torch_out = torch_inference(model, sample_inputs) + assert inference_job is not None and inference_job.wait().success + inference_result: hub.client.DatasetEntries = inference_job.download_output_data() # type: ignore + print_inference_metrics( + inference_job, inference_result, torch_out, outputs_to_skip=[2] + ) + + if not skip_summary: + print_on_target_demo_cmd(compile_job, Path(__file__).parent.resolve(), device) + + return (compile_job, profile_job, inference_job) + + +def main(): + warnings.filterwarnings("ignore") + parser = export_parser(model_cls=Model, supports_qnn=False, supports_ort=False) + args = parser.parse_args() + export_model(**vars(args)) + + +if __name__ == "__main__": + main() diff --git a/qai_hub_models/models/yolonas_quantized/info.yaml b/qai_hub_models/models/yolonas_quantized/info.yaml new file mode 100644 index 00000000..d3a0e9d4 --- /dev/null +++ b/qai_hub_models/models/yolonas_quantized/info.yaml @@ -0,0 +1,42 @@ +name: Yolo-NAS-Quantized +# id must match with the model dir name in qai_hub_models +id: yolonas_quantized +status: public +headline: Quantized real-time object detection optimized for mobile and edge. +domain: Computer Vision +description: YoloNAS is a machine learning model that predicts bounding boxes and classes + of objects in an image. This model is post-training quantized to int8 using samples + from the COCO dataset. +use_case: Object Detection +tags: + - real-time + - quantized +research_paper: https://deci.ai/blog/yolo-nas-object-detection-foundation-model/ +research_paper_title: 'YOLO-NAS by Deci Achieves SOTA Performance on Object Detection Using Neural Architecture Search' +license: https://github.com/Deci-AI/super-gradients/blob/master/LICENSE.YOLONAS.md +deploy_license: https://qaihub-public-assets.s3.us-west-2.amazonaws.com/qai-hub-models/Qualcomm+AI+Hub+Proprietary+License.pdf +source_repo: https://github.com/Deci-AI/super-gradients +technical_details: + Model checkpoint: YoloNAS Small + Input resolution: 640x640 + Number of parameters: 12.2M + Model size: 12.1 MB +applicable_scenarios: + - Factory Automation + - Robotic Navigation + - Camera +related_models: + - yolov6 + - yolov7 + - yolov8_det +form_factors: + - Phone + - Tablet + - IoT + - XR +has_static_banner: yes +has_animated_banner: yes +license_type: apache-2.0 +deploy_license_type: AI Model Hub License +dataset: + - COCO diff --git a/qai_hub_models/models/yolonas_quantized/model.py b/qai_hub_models/models/yolonas_quantized/model.py new file mode 100644 index 00000000..52c5fb5e --- /dev/null +++ b/qai_hub_models/models/yolonas_quantized/model.py @@ -0,0 +1,93 @@ +# --------------------------------------------------------------------- +# Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. +# SPDX-License-Identifier: BSD-3-Clause +# --------------------------------------------------------------------- +from __future__ import annotations + +# isort: off +# This verifies aimet is installed, and this must be included first. 
+from qai_hub_models.utils.quantization_aimet import ( + AIMETQuantizableMixin, + constrain_quantized_inputs_to_image_range, +) + +# isort: on + +from typing import Optional + +import torch +from aimet_torch.cross_layer_equalization import equalize_model +from aimet_torch.model_preparer import prepare_model +from aimet_torch.quantsim import QuantizationSimModel, load_encodings_to_sim + +from qai_hub_models.models.yolonas.model import DEFAULT_WEIGHTS, YoloNAS +from qai_hub_models.utils.aimet.config_loader import get_default_aimet_config +from qai_hub_models.utils.asset_loaders import CachedWebModelAsset +from qai_hub_models.utils.quantization_aimet import tie_observers + +MODEL_ID = __name__.split(".")[-2] +MODEL_ASSET_VERSION = 1 +DEFAULT_ENCODINGS = "yolonas_quantized_encodings.json" + + +class YoloNASQuantizable(AIMETQuantizableMixin, YoloNAS): + """Exportable Quantized YoloNAS bounding box detector, end-to-end.""" + + def __init__( + self, + sim_model: QuantizationSimModel, + ) -> None: + # Sim model will already include postprocessing + torch.nn.Module.__init__(self) + AIMETQuantizableMixin.__init__(self, sim_model) + self.model = sim_model.model + + @classmethod + def from_pretrained( + cls, + weights_name: Optional[str] = DEFAULT_WEIGHTS, + aimet_encodings: str | None = "DEFAULT", + include_postprocessing: bool = True, + ) -> "YoloNASQuantizable": + """Load YoloNAS from a weightfile created by the source YoloNAS repository.""" + fp16_model = YoloNAS.from_pretrained( + weights_name, + include_postprocessing=include_postprocessing, + ) + fp16_model.class_dtype = torch.int8 + + input_shape = cls.get_input_spec()["image"][0] + + model = prepare_model(fp16_model) + equalize_model(model, input_shape) + + sim = QuantizationSimModel( + model, + quant_scheme="tf_enhanced", + default_param_bw=8, + default_output_bw=8, + config_file=get_default_aimet_config(), + dummy_input=torch.rand(input_shape), + ) + tie_observers(sim) + constrain_quantized_inputs_to_image_range(sim) + + if aimet_encodings: + if aimet_encodings == "DEFAULT": + aimet_encodings = CachedWebModelAsset.from_asset_store( + MODEL_ID, MODEL_ASSET_VERSION, DEFAULT_ENCODINGS + ).fetch() + load_encodings_to_sim(sim, aimet_encodings) + + sim.model.eval() + final_model = cls(sim) + return final_model + + def forward(self, image: torch.Tensor): + """ + Run YoloNASQuantizable on `image`, and produce a + predicted set of bounding boxes and associated class probabilities. + + See YoloNAS model for details. 
+ """ + return self.model(image) diff --git a/qai_hub_models/models/yolonas_quantized/perf.yaml b/qai_hub_models/models/yolonas_quantized/perf.yaml new file mode 100644 index 00000000..b30254ec --- /dev/null +++ b/qai_hub_models/models/yolonas_quantized/perf.yaml @@ -0,0 +1,286 @@ +aggregated: + supported_oses: + - Android + supported_devices: + - Google Pixel 3 + - Google Pixel 3a + - Google Pixel 3a XL + - Google Pixel 4 + - Google Pixel 4a + - Google Pixel 5a 5G + - QCS6490 (Proxy) + - QCS8250 (Proxy) + - QCS8550 (Proxy) + - RB3 Gen 2 (Proxy) + - RB5 (Proxy) + - Samsung Galaxy S21 + - Samsung Galaxy S21 Ultra + - Samsung Galaxy S21+ + - Samsung Galaxy S22 5G + - Samsung Galaxy S22 Ultra 5G + - Samsung Galaxy S22+ 5G + - Samsung Galaxy S23 + - Samsung Galaxy S23 Ultra + - Samsung Galaxy S23+ + - Samsung Galaxy S24 + - Samsung Galaxy S24 Ultra + - Samsung Galaxy S24+ + - Samsung Galaxy Tab S8 + - Snapdragon X Elite CRD + - Xiaomi 12 + - Xiaomi 12 Pro + supported_chipsets: + - Qcs6490 + - Qcs8250 + - Qcs8550 + - Snapdragon® 8 Gen 1 + - Snapdragon® 8 Gen 2 + - Snapdragon® 8 Gen 3 + - Snapdragon® 888 + - Snapdragon® X Elite +models: +- name: Yolo-NAS-Quantized + performance_metrics: + - torchscript_onnx_tflite: + inference_time: 6961.0 + throughput: 143.65752047119668 + estimated_peak_memory_range: + min: 9187328 + max: 12329304 + primary_compute_unit: NPU + precision: int8 + layer_info: + layers_on_npu: 200 + layers_on_gpu: 0 + layers_on_cpu: 3 + total_layers: 203 + job_id: jo5mzn7qp + job_status: Passed + torchscript_onnx_qnn: + inference_time: 'null' + throughput: 'null' + estimated_peak_memory_range: + min: 0 + max: 0 + primary_compute_unit: 'null' + precision: 'null' + layer_info: + layers_on_npu: 0 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 0 + job_id: jep2mk1m5 + job_status: Failed + torchscript_onnx_ort: + inference_time: 'null' + throughput: 'null' + estimated_peak_memory_range: + min: 0 + max: 0 + primary_compute_unit: 'null' + precision: 'null' + layer_info: + layers_on_npu: 0 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 0 + job_id: j1p87qn85 + job_status: Failed + reference_device_info: + name: Samsung Galaxy S23 + os: '13' + form_factor: Phone + os_name: Android + manufacturer: Samsung + chipset: Snapdragon® 8 Gen 2 + timestamp: '2024-05-20T16:35:31.715440Z' + - torchscript_onnx_tflite: + inference_time: 4940.0 + throughput: 202.42914979757086 + estimated_peak_memory_range: + min: 712704 + max: 62991232 + primary_compute_unit: NPU + precision: int8 + layer_info: + layers_on_npu: 200 + layers_on_gpu: 0 + layers_on_cpu: 3 + total_layers: 203 + job_id: jegne64mg + job_status: Passed + torchscript_onnx_qnn: + inference_time: 'null' + throughput: 'null' + estimated_peak_memory_range: + min: 0 + max: 0 + primary_compute_unit: 'null' + precision: 'null' + layer_info: + layers_on_npu: 0 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 0 + job_id: jqpyd1l4p + job_status: Failed + torchscript_onnx_ort: + inference_time: 'null' + throughput: 'null' + estimated_peak_memory_range: + min: 0 + max: 0 + primary_compute_unit: 'null' + precision: 'null' + layer_info: + layers_on_npu: 0 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 0 + job_id: jogkye1op + job_status: Failed + reference_device_info: + name: Samsung Galaxy S24 + os: '14' + form_factor: Phone + os_name: Android + manufacturer: Samsung + chipset: Snapdragon® 8 Gen 3 + timestamp: '2024-05-20T16:35:31.715466Z' + - torchscript_onnx_tflite: + inference_time: 6961.0 + throughput: 143.65752047119668 + 
estimated_peak_memory_range: + min: 102400 + max: 12952792 + primary_compute_unit: NPU + precision: int8 + layer_info: + layers_on_npu: 200 + layers_on_gpu: 0 + layers_on_cpu: 3 + total_layers: 203 + job_id: jopryvreg + job_status: Passed + torchscript_onnx_qnn: + inference_time: 'null' + throughput: 'null' + estimated_peak_memory_range: + min: 0 + max: 0 + primary_compute_unit: 'null' + precision: 'null' + layer_info: + layers_on_npu: 0 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 0 + job_id: j2p0rzwep + job_status: Failed + reference_device_info: + name: QCS8550 (Proxy) + os: '12' + form_factor: Iot + os_name: Android + manufacturer: Qualcomm + chipset: Qcs8550 + timestamp: '2024-05-20T16:35:31.715498Z' + - torchscript_onnx_tflite: + inference_time: 18142.0 + throughput: 55.120714364458166 + estimated_peak_memory_range: + min: 69632 + max: 59977872 + primary_compute_unit: NPU + precision: int8 + layer_info: + layers_on_npu: 200 + layers_on_gpu: 0 + layers_on_cpu: 1 + total_layers: 201 + job_id: jep2lzo4g + job_status: Passed + torchscript_onnx_qnn: + inference_time: 'null' + throughput: 'null' + estimated_peak_memory_range: + min: 0 + max: 0 + primary_compute_unit: 'null' + precision: 'null' + layer_info: + layers_on_npu: 0 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 0 + job_id: j1gl3qw8g + job_status: Failed + reference_device_info: + name: RB3 Gen 2 (Proxy) + os: '12' + form_factor: Iot + os_name: Android + manufacturer: Qualcomm + chipset: Qcs6490 + timestamp: '2024-05-20T16:35:31.715517Z' + - torchscript_onnx_tflite: + inference_time: 'null' + throughput: 'null' + estimated_peak_memory_range: + min: 0 + max: 0 + primary_compute_unit: 'null' + precision: 'null' + layer_info: + layers_on_npu: 0 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 0 + job_id: jqpy6y875 + job_status: Failed + reference_device_info: + name: RB5 (Proxy) + os: '12' + form_factor: Iot + os_name: Android + manufacturer: Qualcomm + chipset: Qcs8250 + timestamp: '2024-05-20T16:35:31.715528Z' + - torchscript_onnx_ort: + inference_time: 'null' + throughput: 'null' + estimated_peak_memory_range: + min: 0 + max: 0 + primary_compute_unit: 'null' + precision: 'null' + layer_info: + layers_on_npu: 0 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 0 + job_id: jn5q26nm5 + job_status: Failed + torchscript_onnx_ort_dml_gpu: + inference_time: 25114.0 + throughput: 39.81842796846381 + estimated_peak_memory_range: + min: 36032512 + max: 36032512 + primary_compute_unit: CPU + precision: fp32 + layer_info: + layers_on_npu: 0 + layers_on_gpu: 0 + layers_on_cpu: 279 + total_layers: 279 + job_id: j1glkvdlp + job_status: Passed + reference_device_info: + name: Snapdragon X Elite CRD + os: '11' + form_factor: Compute + os_name: Windows + manufacturer: Qualcomm + chipset: Snapdragon® X Elite + timestamp: '2024-05-20T16:35:31.715550Z' diff --git a/qai_hub_models/models/yolonas_quantized/requirements.txt b/qai_hub_models/models/yolonas_quantized/requirements.txt new file mode 100644 index 00000000..b6f0ec66 --- /dev/null +++ b/qai_hub_models/models/yolonas_quantized/requirements.txt @@ -0,0 +1,9 @@ +object-detection-metrics==0.4.post1 +stringcase==1.2.0 +rapidfuzz==3.8.1 +treelib==1.6.1 +imagesize==1.4.1 +einops==0.3.2 +Deprecated==1.2.11 +data-gradients==0.3.1 +shapely==2.0.3 diff --git a/qai_hub_models/models/yolonas_quantized/test.py b/qai_hub_models/models/yolonas_quantized/test.py new file mode 100644 index 00000000..2c1afe58 --- /dev/null +++ b/qai_hub_models/models/yolonas_quantized/test.py @@ -0,0 
+1,45 @@ +# --------------------------------------------------------------------- +# Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. +# SPDX-License-Identifier: BSD-3-Clause +# --------------------------------------------------------------------- +from qai_hub_models.models.yolonas.app import YoloNASDetectionApp +from qai_hub_models.models.yolonas_quantized.demo import main as demo_main +from qai_hub_models.models.yolonas_quantized.model import ( + MODEL_ASSET_VERSION, + MODEL_ID, + YoloNASQuantizable, +) +from qai_hub_models.models.yolov7.demo import IMAGE_ADDRESS +from qai_hub_models.utils.asset_loaders import ( + CachedWebModelAsset, + load_image, + load_numpy, +) +from qai_hub_models.utils.bounding_box_processing import get_iou +from qai_hub_models.utils.testing import skip_clone_repo_check + +GT_BOXES = CachedWebModelAsset.from_asset_store( + MODEL_ID, MODEL_ASSET_VERSION, "yolonas_boxes.npy" +) + + +@skip_clone_repo_check +def test_task(): + image = load_image(IMAGE_ADDRESS) + app = YoloNASDetectionApp( + YoloNASQuantizable.from_pretrained(), nms_score_threshold=0.7 + ) + boxes = app.predict_boxes_from_image(image, raw_output=True)[0][0].numpy() + print(boxes) + boxes_gt = load_numpy(GT_BOXES) + boxes = sorted(boxes, key=lambda box: box[0]) + boxes_gt = sorted(boxes_gt, key=lambda box: box[0]) + assert len(boxes) == len(boxes_gt) + ious = [get_iou(box, box_gt) for box, box_gt in zip(boxes, boxes_gt)] + for iou in ious: + assert iou > 0.75 + + +@skip_clone_repo_check +def test_demo(): + demo_main(is_test=True) diff --git a/qai_hub_models/models/yolov6/README.md b/qai_hub_models/models/yolov6/README.md index bddb6ad0..d3d4f458 100644 --- a/qai_hub_models/models/yolov6/README.md +++ b/qai_hub_models/models/yolov6/README.md @@ -14,6 +14,8 @@ accross various devices, can be found [here](https://aihub.qualcomm.com/models/y a hosted Qualcomm® device. + + ## Example & Usage diff --git a/qai_hub_models/models/yolov6/export.py b/qai_hub_models/models/yolov6/export.py index 06e18952..8dfb4ffe 100644 --- a/qai_hub_models/models/yolov6/export.py +++ b/qai_hub_models/models/yolov6/export.py @@ -119,9 +119,16 @@ def export_model( model.eval() source_model = torch.jit.trace(model.to("cpu"), make_torch_inputs(input_spec)) + # Convert outputs from channel last to channel first (preferred I/O format for QNN and TensorFlow Lite) + channel_last_flags = ( + " --force_channel_last_input image" + if target_runtime != TargetRuntime.ORT + else "" + ) + # 2. 
Compile the model to an on-device asset model_compile_options = model.get_hub_compile_options( - target_runtime, compile_options + " --force_channel_last_input image" + target_runtime, compile_options + channel_last_flags, hub_device ) print(f"Optimizing model {model_name} to run on-device") submitted_compile_job = hub.submit_compile_job( @@ -159,8 +166,10 @@ def export_model( ) sample_inputs = model.sample_inputs(input_spec) # Convert inputs from channel first to channel last - hub_inputs = transpose_channel_first_to_last( - "image", sample_inputs, target_runtime + hub_inputs = ( + sample_inputs + if target_runtime == TargetRuntime.ORT + else transpose_channel_first_to_last("image", sample_inputs, target_runtime) ) submitted_inference_job = hub.submit_inference_job( model=compile_job.get_target_model(), @@ -199,7 +208,7 @@ def export_model( def main(): warnings.filterwarnings("ignore") - parser = export_parser(model_cls=Model, supports_ort=False) + parser = export_parser(model_cls=Model) args = parser.parse_args() export_model(**vars(args)) diff --git a/qai_hub_models/models/yolov6/model.py b/qai_hub_models/models/yolov6/model.py index c30783cd..8960bd81 100644 --- a/qai_hub_models/models/yolov6/model.py +++ b/qai_hub_models/models/yolov6/model.py @@ -4,7 +4,6 @@ # --------------------------------------------------------------------- from __future__ import annotations -import tempfile from importlib import reload import torch @@ -15,6 +14,7 @@ CachedWebModelAsset, SourceAsRoot, load_path, + qaihm_temp_dir, ) from qai_hub_models.utils.base_model import BaseModel from qai_hub_models.utils.input_spec import InputSpec @@ -93,7 +93,7 @@ def get_input_spec( def _load_yolov6_source_model_from_weights( ckpt_path: str | CachedWebModelAsset, ) -> torch.nn.Module: - with tempfile.TemporaryDirectory() as tmpdir: + with qaihm_temp_dir() as tmpdir: model_path = load_path(ckpt_path, tmpdir) with SourceAsRoot( YOLOV6_SOURCE_REPOSITORY, diff --git a/qai_hub_models/models/yolov6/perf.yaml b/qai_hub_models/models/yolov6/perf.yaml index 49cd5919..a89eb61b 100644 --- a/qai_hub_models/models/yolov6/perf.yaml +++ b/qai_hub_models/models/yolov6/perf.yaml @@ -22,6 +22,7 @@ aggregated: - Samsung Galaxy S24 Ultra - Samsung Galaxy S24+ - Samsung Galaxy Tab S8 + - Snapdragon X Elite CRD - Xiaomi 12 - Xiaomi 12 Pro supported_chipsets: @@ -30,15 +31,16 @@ aggregated: - Snapdragon® 8 Gen 2 - Snapdragon® 8 Gen 3 - Snapdragon® 888 + - Snapdragon® X Elite models: - name: Yolo-v6 performance_metrics: - torchscript_onnx_tflite: - inference_time: 7953.0 - throughput: 125.7387149503332 + inference_time: 7322.0 + throughput: 136.5747063643813 estimated_peak_memory_range: - min: 2138112 - max: 5576840 + min: 225280 + max: 2559408 primary_compute_unit: NPU precision: fp16 layer_info: @@ -46,37 +48,37 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 182 - job_id: jvgde2mz5 + job_id: jw561yx7p job_status: Passed torchscript_onnx_qnn: - inference_time: 6885.0 - throughput: 145.24328249818447 + inference_time: 5353.0 + throughput: 186.81113394358303 estimated_peak_memory_range: - min: 4939776 - max: 18625080 + min: 4947968 + max: 15312024 primary_compute_unit: NPU precision: fp16 layer_info: - layers_on_npu: 229 + layers_on_npu: 228 layers_on_gpu: 0 layers_on_cpu: 0 - total_layers: 229 - job_id: jqp4k321g + total_layers: 228 + job_id: j1pvw68mg job_status: Passed torchscript_onnx_ort: - inference_time: 6690.0 - throughput: 149.47683109118086 + inference_time: 6762.0 + throughput: 147.88524105294292 
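The yolov6 export.py change above (and the matching edits to the other YOLO export scripts later in this patch) makes the channel-last handling conditional on the runtime: TFLite and QNN targets still get `--force_channel_last_input image` and have their sample inputs transposed, while ONNX Runtime keeps the original channel-first layout untouched. For orientation, here is a rough sketch of what that transpose amounts to for a single hypothetical image input; the real `transpose_channel_first_to_last` helper operates on the Hub input dictionary rather than a bare array:

```python
import numpy as np

def to_channel_last(image_nchw: np.ndarray) -> np.ndarray:
    """Rearrange (batch, channels, height, width) into (batch, height, width, channels)."""
    return np.transpose(image_nchw, (0, 2, 3, 1))

sample = np.zeros((1, 3, 640, 640), dtype=np.float32)  # channel-first, as traced from PyTorch
print(to_channel_last(sample).shape)  # (1, 640, 640, 3) -- the layout TFLite and QNN prefer
```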
estimated_peak_memory_range: - min: 5345280 - max: 37259592 + min: 5337088 + max: 34449840 primary_compute_unit: NPU precision: fp16 layer_info: - layers_on_npu: 1 + layers_on_npu: 228 layers_on_gpu: 0 layers_on_cpu: 0 - total_layers: 1 - job_id: jo5mq8l9p + total_layers: 228 + job_id: jz5w9ekjp job_status: Passed reference_device_info: name: Samsung Galaxy S23 @@ -85,13 +87,13 @@ models: os_name: Android manufacturer: Samsung chipset: Snapdragon® 8 Gen 2 - timestamp: '2024-04-23T18:42:33.930548Z' + timestamp: '2024-05-20T16:35:31.754074Z' - torchscript_onnx_tflite: - inference_time: 5649.0 - throughput: 177.02248185519562 + inference_time: 5305.0 + throughput: 188.5014137606032 estimated_peak_memory_range: - min: 16384 - max: 82704608 + min: 49152 + max: 78708192 primary_compute_unit: NPU precision: fp16 layer_info: @@ -99,37 +101,37 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 182 - job_id: jz570989g + job_id: j1p3mjdzg job_status: Passed torchscript_onnx_qnn: - inference_time: 4867.0 - throughput: 205.4653790836244 + inference_time: 3962.0 + throughput: 252.39777889954567 estimated_peak_memory_range: min: 4931584 - max: 98473200 + max: 96111232 primary_compute_unit: NPU precision: fp16 layer_info: - layers_on_npu: 229 + layers_on_npu: 228 layers_on_gpu: 0 layers_on_cpu: 0 - total_layers: 229 - job_id: j0pxnxzl5 + total_layers: 228 + job_id: j7gjlv98p job_status: Passed torchscript_onnx_ort: - inference_time: 4842.0 - throughput: 206.52622883106156 + inference_time: 4919.0 + throughput: 203.29335230737954 estimated_peak_memory_range: - min: 4931584 - max: 66299664 + min: 3256320 + max: 65642736 primary_compute_unit: NPU precision: fp16 layer_info: - layers_on_npu: 1 + layers_on_npu: 228 layers_on_gpu: 0 layers_on_cpu: 0 - total_layers: 1 - job_id: jegnlkwq5 + total_layers: 228 + job_id: jmg94lrv5 job_status: Passed reference_device_info: name: Samsung Galaxy S24 @@ -138,13 +140,13 @@ models: os_name: Android manufacturer: Samsung chipset: Snapdragon® 8 Gen 3 - timestamp: '2024-04-23T18:42:33.930616Z' + timestamp: '2024-05-20T16:35:31.754099Z' - torchscript_onnx_tflite: - inference_time: 7952.0 - throughput: 125.75452716297787 + inference_time: 7402.0 + throughput: 135.09862199405566 estimated_peak_memory_range: - min: 217088 - max: 3444928 + min: 229376 + max: 3627424 primary_compute_unit: NPU precision: fp16 layer_info: @@ -152,22 +154,22 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 182 - job_id: jn5qenqo5 + job_id: jwgov2xd5 job_status: Passed torchscript_onnx_qnn: - inference_time: 6878.0 - throughput: 145.39110206455365 + inference_time: 5362.0 + throughput: 186.4975755315181 estimated_peak_memory_range: - min: 4952064 - max: 19343808 + min: 4939776 + max: 15305728 primary_compute_unit: NPU precision: fp16 layer_info: - layers_on_npu: 229 + layers_on_npu: 228 layers_on_gpu: 0 layers_on_cpu: 0 - total_layers: 229 - job_id: j7gjz9ve5 + total_layers: 228 + job_id: jygz7366p job_status: Passed reference_device_info: name: QCS8550 (Proxy) @@ -176,4 +178,57 @@ models: os_name: Android manufacturer: Qualcomm chipset: Qcs8550 - timestamp: '2024-04-23T18:42:33.930670Z' + timestamp: '2024-05-20T16:35:31.754129Z' + - torchscript_onnx_qnn: + inference_time: 6754.0 + throughput: 148.06040864672786 + estimated_peak_memory_range: + min: 4923392 + max: 4923392 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 228 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 228 + job_id: jlpevdq05 + job_status: Passed + torchscript_onnx_ort: + 
inference_time: 6563.0 + throughput: 152.36934328813044 + estimated_peak_memory_range: + min: 7618560 + max: 7618560 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 228 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 228 + job_id: jnp1849lg + job_status: Passed + torchscript_onnx_ort_dml_gpu: + inference_time: 17525.0 + throughput: 57.06134094151213 + estimated_peak_memory_range: + min: 35479552 + max: 35479552 + primary_compute_unit: CPU + precision: fp32 + layer_info: + layers_on_npu: 0 + layers_on_gpu: 0 + layers_on_cpu: 149 + total_layers: 149 + job_id: jvgdvxklg + job_status: Passed + reference_device_info: + name: Snapdragon X Elite CRD + os: '11' + form_factor: Compute + os_name: Windows + manufacturer: Qualcomm + chipset: Snapdragon® X Elite + timestamp: '2024-05-20T16:35:31.754151Z' diff --git a/qai_hub_models/models/yolov7/README.md b/qai_hub_models/models/yolov7/README.md index 02430d15..e6ab3b03 100644 --- a/qai_hub_models/models/yolov7/README.md +++ b/qai_hub_models/models/yolov7/README.md @@ -14,6 +14,8 @@ accross various devices, can be found [here](https://aihub.qualcomm.com/models/y a hosted Qualcomm® device. + + ## Example & Usage Install the package via pip: diff --git a/qai_hub_models/models/yolov7/app.py b/qai_hub_models/models/yolov7/app.py index bc694b38..69de2553 100644 --- a/qai_hub_models/models/yolov7/app.py +++ b/qai_hub_models/models/yolov7/app.py @@ -16,7 +16,9 @@ class YoloV7DetectionApp(YoloObjectDetectionApp): def check_image_size(self, pixel_values: torch.Tensor) -> None: """ - Verify image size is valid model input. + Verify image size is a valid model input. Image size should be shape + [batch_size, num_channels, height, width], where height and width are multiples + of `YoloNAS.STRIDE_MULTIPLE`. """ if len(pixel_values.shape) != 4: raise ValueError("Pixel Values must be rank 4: [batch, channels, x, y]") diff --git a/qai_hub_models/models/yolov7/export.py b/qai_hub_models/models/yolov7/export.py index db8f85b2..0e4f08d9 100644 --- a/qai_hub_models/models/yolov7/export.py +++ b/qai_hub_models/models/yolov7/export.py @@ -119,9 +119,16 @@ def export_model( model.eval() source_model = torch.jit.trace(model.to("cpu"), make_torch_inputs(input_spec)) + # Convert outputs from channel last to channel first (preferred I/O format for QNN and TensorFlow Lite) + channel_last_flags = ( + " --force_channel_last_input image" + if target_runtime != TargetRuntime.ORT + else "" + ) + # 2. 
Compile the model to an on-device asset model_compile_options = model.get_hub_compile_options( - target_runtime, compile_options + " --force_channel_last_input image" + target_runtime, compile_options + channel_last_flags, hub_device ) print(f"Optimizing model {model_name} to run on-device") submitted_compile_job = hub.submit_compile_job( @@ -159,8 +166,10 @@ def export_model( ) sample_inputs = model.sample_inputs(input_spec) # Convert inputs from channel first to channel last - hub_inputs = transpose_channel_first_to_last( - "image", sample_inputs, target_runtime + hub_inputs = ( + sample_inputs + if target_runtime == TargetRuntime.ORT + else transpose_channel_first_to_last("image", sample_inputs, target_runtime) ) submitted_inference_job = hub.submit_inference_job( model=compile_job.get_target_model(), @@ -199,7 +208,7 @@ def export_model( def main(): warnings.filterwarnings("ignore") - parser = export_parser(model_cls=Model, supports_qnn=False, supports_ort=False) + parser = export_parser(model_cls=Model, supports_qnn=False) args = parser.parse_args() export_model(**vars(args)) diff --git a/qai_hub_models/models/yolov7/model.py b/qai_hub_models/models/yolov7/model.py index ccbcc00e..e937d71a 100644 --- a/qai_hub_models/models/yolov7/model.py +++ b/qai_hub_models/models/yolov7/model.py @@ -49,7 +49,7 @@ def __init__( STRIDE_MULTIPLE = 32 def get_evaluator(self) -> BaseEvaluator: - return DetectionEvaluator(640, 640) + return DetectionEvaluator(*self.get_input_spec()["image"][0][2:]) @classmethod def from_pretrained( @@ -96,7 +96,7 @@ def forward(self, image): Returns: If self.include_postprocessing: boxes: torch.Tensor - Bounding box locations. Shape [batch, num preds, 4] where 4 == (center_x, center_y, w, h) + Bounding box locations. Shape [batch, num preds, 4] where 4 == (left_x, top_y, right_x, bottom_y) scores: torch.Tensor class scores multiplied by confidence: Shape is [batch, num_preds] class_idx: torch.tensor diff --git a/qai_hub_models/models/yolov7/perf.yaml b/qai_hub_models/models/yolov7/perf.yaml index 35945597..14de581f 100644 --- a/qai_hub_models/models/yolov7/perf.yaml +++ b/qai_hub_models/models/yolov7/perf.yaml @@ -22,6 +22,7 @@ aggregated: - Samsung Galaxy S24 Ultra - Samsung Galaxy S24+ - Samsung Galaxy Tab S8 + - Snapdragon X Elite CRD - Xiaomi 12 - Xiaomi 12 Pro supported_chipsets: @@ -30,38 +31,54 @@ aggregated: - Snapdragon® 8 Gen 2 - Snapdragon® 8 Gen 3 - Snapdragon® 888 + - Snapdragon® X Elite models: - name: Yolo-v7 performance_metrics: - torchscript_onnx_tflite: - inference_time: 20875.0 - throughput: 47.90419161676647 + inference_time: 15991.0 + throughput: 62.53517603652054 estimated_peak_memory_range: - min: 9580544 - max: 45193728 + min: 1212416 + max: 3555464 primary_compute_unit: NPU precision: fp16 layer_info: - layers_on_npu: 292 + layers_on_npu: 203 layers_on_gpu: 0 - layers_on_cpu: 21 - total_layers: 313 - job_id: jep20ezqg + layers_on_cpu: 12 + total_layers: 215 + job_id: jz5w9ej6p job_status: Passed + torchscript_onnx_qnn: + inference_time: 'null' + throughput: 'null' + estimated_peak_memory_range: + min: 0 + max: 0 + primary_compute_unit: 'null' + precision: 'null' + layer_info: + layers_on_npu: 0 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 0 + job_id: jvgdvxjeg + job_status: Failed torchscript_onnx_ort: - inference_time: 22899.0 - throughput: 43.670029258919605 + inference_time: 13667.0 + throughput: 73.16894709885125 estimated_peak_memory_range: - min: 9625600 - max: 55617832 - primary_compute_unit: CPU - precision: fp32 + min: 6905856 
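The yolov7 model.py change above also stops hard-coding the evaluator size at 640x640 and instead reads it off the input spec. As the yolov8_det model later in this patch makes explicit, an input spec maps each input name to a (shape, dtype) pair with the shape ordered (batch, channels, height, width), so slicing the last two shape entries yields (height, width). A small illustration under that assumption, with a hypothetical spec value:

```python
# Hypothetical input spec mirroring the ((batch, channels, height, width), dtype) layout used in this repo.
input_spec = {"image": ((1, 3, 640, 640), "float32")}

shape, dtype = input_spec["image"]
height, width = shape[2:]
print(height, width)  # 640 640 -- the arguments DetectionEvaluator now receives via *shape[2:]
```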
+ max: 39411600 + primary_compute_unit: NPU + precision: fp16 layer_info: - layers_on_npu: 2 + layers_on_npu: 213 layers_on_gpu: 0 - layers_on_cpu: 21 - total_layers: 23 - job_id: j2p036xnp + layers_on_cpu: 12 + total_layers: 225 + job_id: j0px1kw1g job_status: Passed reference_device_info: name: Samsung Galaxy S23 @@ -70,36 +87,51 @@ models: os_name: Android manufacturer: Samsung chipset: Snapdragon® 8 Gen 2 - timestamp: '2024-04-23T18:42:33.954673Z' + timestamp: '2024-05-20T16:35:31.784368Z' - torchscript_onnx_tflite: - inference_time: 16244.0 - throughput: 61.56119182467373 + inference_time: 10824.0 + throughput: 92.38728750923873 estimated_peak_memory_range: - min: 40960 - max: 202538080 + min: 188416 + max: 59790128 primary_compute_unit: NPU precision: fp16 layer_info: - layers_on_npu: 292 + layers_on_npu: 203 layers_on_gpu: 0 - layers_on_cpu: 21 - total_layers: 313 - job_id: jqpyrmyl5 + layers_on_cpu: 12 + total_layers: 215 + job_id: jmg94l6l5 job_status: Passed + torchscript_onnx_qnn: + inference_time: 'null' + throughput: 'null' + estimated_peak_memory_range: + min: 0 + max: 0 + primary_compute_unit: 'null' + precision: 'null' + layer_info: + layers_on_npu: 0 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 0 + job_id: jz57dyql5 + job_status: Failed torchscript_onnx_ort: - inference_time: 18014.0 - throughput: 55.51237926057511 + inference_time: 9691.0 + throughput: 103.18852543597151 estimated_peak_memory_range: - min: 17952768 - max: 200617376 - primary_compute_unit: CPU - precision: fp32 + min: 6680576 + max: 67183456 + primary_compute_unit: NPU + precision: fp16 layer_info: - layers_on_npu: 2 + layers_on_npu: 213 layers_on_gpu: 0 - layers_on_cpu: 21 - total_layers: 23 - job_id: j1p801kog + layers_on_cpu: 12 + total_layers: 225 + job_id: jo5mznjwp job_status: Passed reference_device_info: name: Samsung Galaxy S24 @@ -108,22 +140,37 @@ models: os_name: Android manufacturer: Samsung chipset: Snapdragon® 8 Gen 3 - timestamp: '2024-04-23T18:42:33.954727Z' + timestamp: '2024-05-20T16:35:31.784396Z' - torchscript_onnx_tflite: - inference_time: 20857.0 - throughput: 47.94553387351968 + inference_time: 15945.0 + throughput: 62.715584822828475 estimated_peak_memory_range: - min: 9539584 - max: 12396608 + min: 1220608 + max: 3533376 primary_compute_unit: NPU precision: fp16 layer_info: - layers_on_npu: 292 + layers_on_npu: 203 layers_on_gpu: 0 - layers_on_cpu: 21 - total_layers: 313 - job_id: jvgdekxz5 + layers_on_cpu: 12 + total_layers: 215 + job_id: jnp184r2g job_status: Passed + torchscript_onnx_qnn: + inference_time: 'null' + throughput: 'null' + estimated_peak_memory_range: + min: 0 + max: 0 + primary_compute_unit: 'null' + precision: 'null' + layer_info: + layers_on_npu: 0 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 0 + job_id: jqp4wlzvg + job_status: Failed reference_device_info: name: QCS8550 (Proxy) os: '12' @@ -131,4 +178,42 @@ models: os_name: Android manufacturer: Qualcomm chipset: Qcs8550 - timestamp: '2024-04-23T18:42:33.954767Z' + timestamp: '2024-05-20T16:35:31.784414Z' + - torchscript_onnx_ort: + inference_time: 13497.0 + throughput: 74.0905386382159 + estimated_peak_memory_range: + min: 4927488 + max: 4927488 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 213 + layers_on_gpu: 0 + layers_on_cpu: 12 + total_layers: 225 + job_id: jegne6jrg + job_status: Passed + torchscript_onnx_ort_dml_gpu: + inference_time: 38053.0 + throughput: 26.279136993141144 + estimated_peak_memory_range: + min: 150495232 + max: 150495232 + 
primary_compute_unit: GPU + precision: fp16 + layer_info: + layers_on_npu: 0 + layers_on_gpu: 1 + layers_on_cpu: 0 + total_layers: 1 + job_id: jopryvz9g + job_status: Passed + reference_device_info: + name: Snapdragon X Elite CRD + os: '11' + form_factor: Compute + os_name: Windows + manufacturer: Qualcomm + chipset: Snapdragon® X Elite + timestamp: '2024-05-20T16:35:31.784434Z' diff --git a/qai_hub_models/models/yolov7/requirements.txt b/qai_hub_models/models/yolov7/requirements.txt index 4dd59a05..faa7f850 100644 --- a/qai_hub_models/models/yolov7/requirements.txt +++ b/qai_hub_models/models/yolov7/requirements.txt @@ -2,3 +2,4 @@ matplotlib==3.7.4 object-detection-metrics==0.4.post1 scipy==1.8.1 seaborn==0.11.0 +shapely==2.0.3 diff --git a/qai_hub_models/models/yolov7_quantized/README.md b/qai_hub_models/models/yolov7_quantized/README.md index 2c305a70..390b486d 100644 --- a/qai_hub_models/models/yolov7_quantized/README.md +++ b/qai_hub_models/models/yolov7_quantized/README.md @@ -3,7 +3,7 @@ # [Yolo-v7-Quantized: Quantized real-time object detection optimized for mobile and edge](https://aihub.qualcomm.com/models/yolov7_quantized) -YoloV7 is a machine learning model that predicts bounding boxes and classes of objects in an image. This model is post-training quantized to int8 using samples from the [COCO dataset](https://cocodataset.org/#home). +YoloV7 is a machine learning model that predicts bounding boxes and classes of objects in an image. This model is post-training quantized to int8 using samples from the COCO dataset. This is based on the implementation of Yolo-v7-Quantized found [here](https://github.com/WongKinYiu/yolov7/). This repository contains scripts for optimized on-device @@ -14,6 +14,8 @@ accross various devices, can be found [here](https://aihub.qualcomm.com/models/y a hosted Qualcomm® device. + + ## Example & Usage Install the package via pip: diff --git a/qai_hub_models/models/yolov7_quantized/export.py b/qai_hub_models/models/yolov7_quantized/export.py index a6d017a8..d2f3d51e 100644 --- a/qai_hub_models/models/yolov7_quantized/export.py +++ b/qai_hub_models/models/yolov7_quantized/export.py @@ -123,9 +123,16 @@ def export_model( else: quant_calibration_data = model.get_calibration_data(target_runtime, input_spec) + # Convert outputs from channel last to channel first (preferred I/O format for QNN and TensorFlow Lite) + channel_last_flags = ( + " --force_channel_last_input image" + if target_runtime != TargetRuntime.ORT + else "" + ) + # 2. 
Compile the model to an on-device asset model_compile_options = model.get_hub_compile_options( - target_runtime, compile_options + " --force_channel_last_input image" + target_runtime, compile_options + channel_last_flags, hub_device ) print(f"Optimizing model {model_name} to run on-device") submitted_compile_job = hub.submit_compile_job( @@ -167,8 +174,10 @@ def export_model( if target_runtime == TargetRuntime.QNN: hub_inputs = get_qnn_inputs(compile_job, sample_inputs) # Convert inputs from channel first to channel last - hub_inputs = transpose_channel_first_to_last( - "image", sample_inputs, target_runtime + hub_inputs = ( + sample_inputs + if target_runtime == TargetRuntime.ORT + else transpose_channel_first_to_last("image", sample_inputs, target_runtime) ) submitted_inference_job = hub.submit_inference_job( model=compile_job.get_target_model(), diff --git a/qai_hub_models/models/yolov7_quantized/info.yaml b/qai_hub_models/models/yolov7_quantized/info.yaml index 17802e3b..9ce2d281 100644 --- a/qai_hub_models/models/yolov7_quantized/info.yaml +++ b/qai_hub_models/models/yolov7_quantized/info.yaml @@ -6,7 +6,7 @@ headline: Quantized real-time object detection optimized for mobile and edge. domain: Computer Vision description: YoloV7 is a machine learning model that predicts bounding boxes and classes of objects in an image. This model is post-training quantized to int8 using samples - from the [COCO dataset](https://cocodataset.org/#home). + from the COCO dataset. use_case: Object Detection tags: - real-time diff --git a/qai_hub_models/models/yolov7_quantized/perf.yaml b/qai_hub_models/models/yolov7_quantized/perf.yaml index c7be0fc9..f2da51b5 100644 --- a/qai_hub_models/models/yolov7_quantized/perf.yaml +++ b/qai_hub_models/models/yolov7_quantized/perf.yaml @@ -9,8 +9,10 @@ aggregated: - Google Pixel 4a - Google Pixel 5a 5G - QCS6490 (Proxy) + - QCS8250 (Proxy) - QCS8550 (Proxy) - RB3 Gen 2 (Proxy) + - RB5 (Proxy) - Samsung Galaxy S21 - Samsung Galaxy S21 Ultra - Samsung Galaxy S21+ @@ -24,48 +26,66 @@ aggregated: - Samsung Galaxy S24 Ultra - Samsung Galaxy S24+ - Samsung Galaxy Tab S8 + - Snapdragon X Elite CRD - Xiaomi 12 - Xiaomi 12 Pro supported_chipsets: - Qcs6490 + - Qcs8250 - Qcs8550 - Snapdragon® 8 Gen 1 - Snapdragon® 8 Gen 2 - Snapdragon® 8 Gen 3 - Snapdragon® 888 + - Snapdragon® X Elite models: - name: Yolo-v7-Quantized performance_metrics: - torchscript_onnx_tflite: - inference_time: 6122.0 - throughput: 163.3453119895459 + inference_time: 4575.0 + throughput: 218.5792349726776 estimated_peak_memory_range: - min: 278528 - max: 13519408 + min: 323584 + max: 2051176 primary_compute_unit: NPU precision: int8 layer_info: - layers_on_npu: 224 + layers_on_npu: 225 layers_on_gpu: 0 - layers_on_cpu: 0 - total_layers: 224 - job_id: j1gl6j32g + layers_on_cpu: 1 + total_layers: 226 + job_id: jep2mk245 job_status: Passed torchscript_onnx_qnn: - inference_time: 5732.0 - throughput: 174.45917655268667 + inference_time: 'null' + throughput: 'null' estimated_peak_memory_range: - min: 16384 - max: 12543776 - primary_compute_unit: NPU - precision: int8 + min: 0 + max: 0 + primary_compute_unit: 'null' + precision: 'null' layer_info: - layers_on_npu: 219 + layers_on_npu: 0 layers_on_gpu: 0 layers_on_cpu: 0 - total_layers: 219 - job_id: j1p80l0og - job_status: Passed + total_layers: 0 + job_id: j1p87qlx5 + job_status: Failed + torchscript_onnx_ort: + inference_time: 'null' + throughput: 'null' + estimated_peak_memory_range: + min: 0 + max: 0 + primary_compute_unit: 'null' + precision: 'null' + 
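Several of the quantized perf.yaml files in this patch now record 'null' metrics together with `job_status: Failed` wherever a runtime path did not produce a working asset, so anything consuming these files should filter on job status before trusting the numbers. A hedged sketch of such a consumer, assuming PyYAML and the field layout shown in these files; the path is only an example:

```python
import yaml

with open("qai_hub_models/models/yolov7_quantized/perf.yaml") as f:
    perf = yaml.safe_load(f)

for model in perf["models"]:
    for entry in model["performance_metrics"]:
        device = entry["reference_device_info"]["name"]
        for runtime, result in entry.items():
            # Skip non-metric fields (reference_device_info, timestamp) and failed jobs.
            if not isinstance(result, dict) or result.get("job_status") != "Passed":
                continue
            print(f"{device} / {runtime}: {result['inference_time']} us")
```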
layer_info: + layers_on_npu: 0 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 0 + job_id: j1glkvj8p + job_status: Failed reference_device_info: name: Samsung Galaxy S23 os: '13' @@ -73,37 +93,52 @@ models: os_name: Android manufacturer: Samsung chipset: Snapdragon® 8 Gen 2 - timestamp: '2024-04-23T18:42:33.972519Z' + timestamp: '2024-05-20T16:35:31.813972Z' - torchscript_onnx_tflite: - inference_time: 4059.0 - throughput: 246.3661000246366 + inference_time: 2984.0 + throughput: 335.1206434316354 estimated_peak_memory_range: min: 40960 - max: 67566064 + max: 60470096 primary_compute_unit: NPU precision: int8 layer_info: - layers_on_npu: 224 + layers_on_npu: 225 layers_on_gpu: 0 - layers_on_cpu: 0 - total_layers: 224 - job_id: jogk7jyvp + layers_on_cpu: 1 + total_layers: 226 + job_id: jqpyd197p job_status: Passed torchscript_onnx_qnn: - inference_time: 3804.0 - throughput: 262.88117770767616 + inference_time: 'null' + throughput: 'null' estimated_peak_memory_range: - min: 1245184 - max: 89862128 - primary_compute_unit: NPU - precision: int8 + min: 0 + max: 0 + primary_compute_unit: 'null' + precision: 'null' layer_info: - layers_on_npu: 219 + layers_on_npu: 0 layers_on_gpu: 0 layers_on_cpu: 0 - total_layers: 219 - job_id: jmg9j64m5 - job_status: Passed + total_layers: 0 + job_id: jogkyej2p + job_status: Failed + torchscript_onnx_ort: + inference_time: 'null' + throughput: 'null' + estimated_peak_memory_range: + min: 0 + max: 0 + primary_compute_unit: 'null' + precision: 'null' + layer_info: + layers_on_npu: 0 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 0 + job_id: jw561yk0p + job_status: Failed reference_device_info: name: Samsung Galaxy S24 os: '14' @@ -111,8 +146,23 @@ models: os_name: Android manufacturer: Samsung chipset: Snapdragon® 8 Gen 3 - timestamp: '2024-04-23T18:42:33.972581Z' - - torchscript_onnx_qnn: + timestamp: '2024-05-20T16:35:31.813999Z' + - torchscript_onnx_tflite: + inference_time: 4604.0 + throughput: 217.2024326672459 + estimated_peak_memory_range: + min: 282624 + max: 2513496 + primary_compute_unit: NPU + precision: int8 + layer_info: + layers_on_npu: 225 + layers_on_gpu: 0 + layers_on_cpu: 1 + total_layers: 226 + job_id: j2p0rzn6p + job_status: Passed + torchscript_onnx_qnn: inference_time: 'null' throughput: 'null' estimated_peak_memory_range: @@ -125,36 +175,112 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 0 - job_id: j1p80nqog + job_id: jn5q26j45 job_status: Failed reference_device_info: - name: RB3 Gen 2 (Proxy) + name: QCS8550 (Proxy) os: '12' form_factor: Iot os_name: Android manufacturer: Qualcomm - chipset: Qcs6490 - timestamp: '2024-04-23T18:42:33.972595Z' - - torchscript_onnx_qnn: - inference_time: 5978.0 - throughput: 167.2800267648043 + chipset: Qcs8550 + timestamp: '2024-05-20T16:35:31.814017Z' + - torchscript_onnx_tflite: + inference_time: 11128.0 + throughput: 89.86340762041696 estimated_peak_memory_range: - min: 4939776 - max: 15407880 + min: 262144 + max: 60976880 primary_compute_unit: NPU precision: int8 layer_info: - layers_on_npu: 220 + layers_on_npu: 225 + layers_on_gpu: 0 + layers_on_cpu: 1 + total_layers: 226 + job_id: jn5q3d1np + job_status: Passed + torchscript_onnx_qnn: + inference_time: 'null' + throughput: 'null' + estimated_peak_memory_range: + min: 0 + max: 0 + primary_compute_unit: 'null' + precision: 'null' + layer_info: + layers_on_npu: 0 layers_on_gpu: 0 layers_on_cpu: 0 - total_layers: 220 - job_id: j2p03wznp + total_layers: 0 + job_id: j7gje88v5 + job_status: Failed + reference_device_info: + name: RB3 
Gen 2 (Proxy) + os: '12' + form_factor: Iot + os_name: Android + manufacturer: Qualcomm + chipset: Qcs6490 + timestamp: '2024-05-20T16:35:31.814035Z' + - torchscript_onnx_tflite: + inference_time: 86803.0 + throughput: 11.520339158784834 + estimated_peak_memory_range: + min: 4190208 + max: 40909296 + primary_compute_unit: GPU + precision: int8 + layer_info: + layers_on_npu: 32 + layers_on_gpu: 126 + layers_on_cpu: 68 + total_layers: 226 + job_id: j1gl3q8jg job_status: Passed reference_device_info: - name: QCS8550 (Proxy) + name: RB5 (Proxy) os: '12' form_factor: Iot os_name: Android manufacturer: Qualcomm - chipset: Qcs8550 - timestamp: '2024-04-23T18:42:33.972626Z' + chipset: Qcs8250 + timestamp: '2024-05-20T16:35:31.814046Z' + - torchscript_onnx_ort: + inference_time: 'null' + throughput: 'null' + estimated_peak_memory_range: + min: 0 + max: 0 + primary_compute_unit: 'null' + precision: 'null' + layer_info: + layers_on_npu: 0 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 0 + job_id: j1p3mjylg + job_status: Failed + torchscript_onnx_ort_dml_gpu: + inference_time: 15343.0 + throughput: 65.17630189663039 + estimated_peak_memory_range: + min: 51806208 + max: 51806208 + primary_compute_unit: CPU + precision: fp32 + layer_info: + layers_on_npu: 0 + layers_on_gpu: 0 + layers_on_cpu: 256 + total_layers: 256 + job_id: jwgov2jx5 + job_status: Passed + reference_device_info: + name: Snapdragon X Elite CRD + os: '11' + form_factor: Compute + os_name: Windows + manufacturer: Qualcomm + chipset: Snapdragon® X Elite + timestamp: '2024-05-20T16:35:31.814068Z' diff --git a/qai_hub_models/models/yolov7_quantized/requirements.txt b/qai_hub_models/models/yolov7_quantized/requirements.txt index 4dd59a05..faa7f850 100644 --- a/qai_hub_models/models/yolov7_quantized/requirements.txt +++ b/qai_hub_models/models/yolov7_quantized/requirements.txt @@ -2,3 +2,4 @@ matplotlib==3.7.4 object-detection-metrics==0.4.post1 scipy==1.8.1 seaborn==0.11.0 +shapely==2.0.3 diff --git a/qai_hub_models/models/yolov8_det/README.md b/qai_hub_models/models/yolov8_det/README.md index 610006a8..c82afce9 100644 --- a/qai_hub_models/models/yolov8_det/README.md +++ b/qai_hub_models/models/yolov8_det/README.md @@ -14,6 +14,8 @@ accross various devices, can be found [here](https://aihub.qualcomm.com/models/y a hosted Qualcomm® device. + + ## Example & Usage Install the package via pip: diff --git a/qai_hub_models/models/yolov8_det/export.py b/qai_hub_models/models/yolov8_det/export.py index 4d4d321c..71b23405 100644 --- a/qai_hub_models/models/yolov8_det/export.py +++ b/qai_hub_models/models/yolov8_det/export.py @@ -121,9 +121,16 @@ def export_model( model.to("cpu"), make_torch_inputs(input_spec), check_trace=False ) + # Convert outputs from channel last to channel first (preferred I/O format for QNN and TensorFlow Lite) + channel_last_flags = ( + " --force_channel_last_input image" + if target_runtime != TargetRuntime.ORT + else "" + ) + # 2. 
Compile the model to an on-device asset model_compile_options = model.get_hub_compile_options( - target_runtime, compile_options + " --force_channel_last_input image" + target_runtime, compile_options + channel_last_flags, hub_device ) print(f"Optimizing model {model_name} to run on-device") submitted_compile_job = hub.submit_compile_job( @@ -161,8 +168,10 @@ def export_model( ) sample_inputs = model.sample_inputs(input_spec) # Convert inputs from channel first to channel last - hub_inputs = transpose_channel_first_to_last( - "image", sample_inputs, target_runtime + hub_inputs = ( + sample_inputs + if target_runtime == TargetRuntime.ORT + else transpose_channel_first_to_last("image", sample_inputs, target_runtime) ) submitted_inference_job = hub.submit_inference_job( model=compile_job.get_target_model(), @@ -201,7 +210,7 @@ def export_model( def main(): warnings.filterwarnings("ignore") - parser = export_parser(model_cls=Model, supports_qnn=False, supports_ort=False) + parser = export_parser(model_cls=Model) args = parser.parse_args() export_model(**vars(args)) diff --git a/qai_hub_models/models/yolov8_det/model.py b/qai_hub_models/models/yolov8_det/model.py index 48ecd1f9..224497fd 100644 --- a/qai_hub_models/models/yolov8_det/model.py +++ b/qai_hub_models/models/yolov8_det/model.py @@ -160,7 +160,7 @@ def get_input_spec( return {"image": ((batch_size, num_channels, height, width), "float32")} def get_evaluator(self) -> BaseEvaluator: - return DetectionEvaluator(640, 640) + return DetectionEvaluator(*self.get_input_spec()["image"][0][2:]) def yolov8_detect_postprocess( diff --git a/qai_hub_models/models/yolov8_det/perf.yaml b/qai_hub_models/models/yolov8_det/perf.yaml index 526f1b82..6ba185bb 100644 --- a/qai_hub_models/models/yolov8_det/perf.yaml +++ b/qai_hub_models/models/yolov8_det/perf.yaml @@ -8,6 +8,7 @@ aggregated: - Google Pixel 4 - Google Pixel 4a - Google Pixel 5a 5G + - QCS8550 (Proxy) - Samsung Galaxy S21 - Samsung Galaxy S21 Ultra - Samsung Galaxy S21+ @@ -21,30 +22,63 @@ aggregated: - Samsung Galaxy S24 Ultra - Samsung Galaxy S24+ - Samsung Galaxy Tab S8 + - Snapdragon X Elite CRD - Xiaomi 12 - Xiaomi 12 Pro supported_chipsets: + - Qcs8550 - Snapdragon® 8 Gen 1 - Snapdragon® 8 Gen 2 - Snapdragon® 8 Gen 3 - Snapdragon® 888 + - Snapdragon® X Elite models: - name: YOLOv8-Detection performance_metrics: - torchscript_onnx_tflite: - inference_time: 6113.0 - throughput: 163.5858007524947 + inference_time: 5873.0 + throughput: 170.2707304614337 estimated_peak_memory_range: - min: 233472 - max: 8968336 + min: 245760 + max: 8436704 primary_compute_unit: NPU precision: fp16 layer_info: - layers_on_npu: 300 + layers_on_npu: 290 layers_on_gpu: 0 layers_on_cpu: 0 - total_layers: 300 - job_id: jqpyzm28g + total_layers: 290 + job_id: j1pvw6jjg + job_status: Passed + torchscript_onnx_qnn: + inference_time: 5218.0 + throughput: 191.64430816404752 + estimated_peak_memory_range: + min: 6332416 + max: 18723960 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 285 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 285 + job_id: jygz731kp + job_status: Passed + torchscript_onnx_ort: + inference_time: 6644.0 + throughput: 150.51173991571343 + estimated_peak_memory_range: + min: 6328320 + max: 32755768 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 286 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 286 + job_id: jvgdvxweg job_status: Passed reference_device_info: name: Samsung Galaxy S23 @@ -53,36 +87,51 @@ models: os_name: Android 
manufacturer: Samsung chipset: Snapdragon® 8 Gen 2 - timestamp: '2024-04-02T15:27:43.907101Z' + timestamp: '2024-05-20T16:35:31.892347Z' + - torchscript_onnx_tflite: + inference_time: 4141.0 + throughput: 241.48756339048538 + estimated_peak_memory_range: + min: 36864 + max: 84965392 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 290 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 290 + job_id: j7gjlvjxp + job_status: Passed torchscript_onnx_qnn: - inference_time: 5316.0 - throughput: 188.11136192626034 + inference_time: 3671.0 + throughput: 272.40533914464726 estimated_peak_memory_range: - min: 4935680 - max: 19108344 + min: 78393344 + max: 180541088 primary_compute_unit: NPU precision: fp16 layer_info: - layers_on_npu: 293 + layers_on_npu: 285 layers_on_gpu: 0 layers_on_cpu: 0 - total_layers: 293 - job_id: j1p821rkp + total_layers: 285 + job_id: jz5w9eo6p job_status: Passed - - torchscript_onnx_tflite: - inference_time: 4320.0 - throughput: 231.4814814814815 + torchscript_onnx_ort: + inference_time: 4354.0 + throughput: 229.67386311437758 estimated_peak_memory_range: - min: 73728 - max: 88723920 + min: 4956160 + max: 70229504 primary_compute_unit: NPU precision: fp16 layer_info: - layers_on_npu: 300 + layers_on_npu: 286 layers_on_gpu: 0 layers_on_cpu: 0 - total_layers: 300 - job_id: j2p04699g + total_layers: 286 + job_id: jz57dyzl5 job_status: Passed reference_device_info: name: Samsung Galaxy S24 @@ -91,19 +140,95 @@ models: os_name: Android manufacturer: Samsung chipset: Snapdragon® 8 Gen 3 - timestamp: '2024-04-02T15:30:24.719256Z' + timestamp: '2024-05-20T16:35:31.892377Z' + - torchscript_onnx_tflite: + inference_time: 5872.0 + throughput: 170.29972752043597 + estimated_peak_memory_range: + min: 16384 + max: 5135584 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 290 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 290 + job_id: jlpevdj15 + job_status: Passed torchscript_onnx_qnn: - inference_time: 3677.0 - throughput: 271.9608376393799 + inference_time: 5208.0 + throughput: 192.01228878648234 + estimated_peak_memory_range: + min: 4935680 + max: 18248224 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 285 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 285 + job_id: jnp18402g + job_status: Passed + reference_device_info: + name: QCS8550 (Proxy) + os: '12' + form_factor: Iot + os_name: Android + manufacturer: Qualcomm + chipset: Qcs8550 + timestamp: '2024-05-20T16:35:31.892393Z' + - torchscript_onnx_qnn: + inference_time: 5820.0 + throughput: 171.82130584192439 + estimated_peak_memory_range: + min: 4923392 + max: 4923392 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 285 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 285 + job_id: jmg94lvl5 + job_status: Passed + torchscript_onnx_ort: + inference_time: 6424.0 + throughput: 155.6662515566625 estimated_peak_memory_range: - min: 4931584 - max: 110753456 + min: 10039296 + max: 10039296 primary_compute_unit: NPU precision: fp16 layer_info: - layers_on_npu: 293 + layers_on_npu: 286 layers_on_gpu: 0 layers_on_cpu: 0 - total_layers: 293 - job_id: jogkv80wp + total_layers: 286 + job_id: jqp4wlqvg job_status: Passed + torchscript_onnx_ort_dml_gpu: + inference_time: 14327.0 + throughput: 69.79828296223913 + estimated_peak_memory_range: + min: 82149376 + max: 82149376 + primary_compute_unit: GPU + precision: fp16 + layer_info: + layers_on_npu: 0 + layers_on_gpu: 1 + layers_on_cpu: 0 + total_layers: 1 + 
job_id: j0px1kv1g + job_status: Passed + reference_device_info: + name: Snapdragon X Elite CRD + os: '11' + form_factor: Compute + os_name: Windows + manufacturer: Qualcomm + chipset: Snapdragon® X Elite + timestamp: '2024-05-20T16:35:31.892415Z' diff --git a/qai_hub_models/models/yolov8_det/requirements.txt b/qai_hub_models/models/yolov8_det/requirements.txt index 48634d89..158fe9bd 100644 --- a/qai_hub_models/models/yolov8_det/requirements.txt +++ b/qai_hub_models/models/yolov8_det/requirements.txt @@ -2,3 +2,4 @@ object-detection-metrics==0.4.post1 seaborn==0.11.0 thop==0.1.1.post2209072238 ultralytics==8.0.193 +shapely==2.0.3 diff --git a/qai_hub_models/models/yolov8_det_quantized/README.md b/qai_hub_models/models/yolov8_det_quantized/README.md index 9e6d342c..874a00c7 100644 --- a/qai_hub_models/models/yolov8_det_quantized/README.md +++ b/qai_hub_models/models/yolov8_det_quantized/README.md @@ -3,7 +3,7 @@ # [YOLOv8-Detection-Quantized: Quantized real-time object detection optimized for mobile and edge by Ultralytics](https://aihub.qualcomm.com/models/yolov8_det_quantized) -Ultralytics YOLOv8 is a machine learning model that predicts bounding boxes and classes of objects in an image. This model is post-training quantized to int8 using samples from the [COCO dataset](https://cocodataset.org/#home). +Ultralytics YOLOv8 is a machine learning model that predicts bounding boxes and classes of objects in an image. This model is post-training quantized to int8 using samples from the COCO dataset. This is based on the implementation of YOLOv8-Detection-Quantized found [here](https://github.com/ultralytics/ultralytics/tree/main/ultralytics/models/yolo/detect). This repository contains scripts for optimized on-device @@ -14,6 +14,8 @@ accross various devices, can be found [here](https://aihub.qualcomm.com/models/y a hosted Qualcomm® device. + + ## Example & Usage Install the package via pip: diff --git a/qai_hub_models/models/yolov8_det_quantized/export.py b/qai_hub_models/models/yolov8_det_quantized/export.py index 7482e447..ddf0d6eb 100644 --- a/qai_hub_models/models/yolov8_det_quantized/export.py +++ b/qai_hub_models/models/yolov8_det_quantized/export.py @@ -123,9 +123,16 @@ def export_model( else: quant_calibration_data = model.get_calibration_data(target_runtime, input_spec) + # Convert outputs from channel last to channel first (preferred I/O format for QNN and TensorFlow Lite) + channel_last_flags = ( + " --force_channel_last_input image" + if target_runtime != TargetRuntime.ORT + else "" + ) + # 2. 
Compile the model to an on-device asset model_compile_options = model.get_hub_compile_options( - target_runtime, compile_options + " --force_channel_last_input image" + target_runtime, compile_options + channel_last_flags, hub_device ) print(f"Optimizing model {model_name} to run on-device") submitted_compile_job = hub.submit_compile_job( @@ -167,8 +174,10 @@ def export_model( if target_runtime == TargetRuntime.QNN: hub_inputs = get_qnn_inputs(compile_job, sample_inputs) # Convert inputs from channel first to channel last - hub_inputs = transpose_channel_first_to_last( - "image", sample_inputs, target_runtime + hub_inputs = ( + sample_inputs + if target_runtime == TargetRuntime.ORT + else transpose_channel_first_to_last("image", sample_inputs, target_runtime) ) submitted_inference_job = hub.submit_inference_job( model=compile_job.get_target_model(), diff --git a/qai_hub_models/models/yolov8_det_quantized/info.yaml b/qai_hub_models/models/yolov8_det_quantized/info.yaml index 9e8e4bea..09e86fec 100644 --- a/qai_hub_models/models/yolov8_det_quantized/info.yaml +++ b/qai_hub_models/models/yolov8_det_quantized/info.yaml @@ -7,7 +7,7 @@ domain: Computer Vision use_case: Object Detection description: Ultralytics YOLOv8 is a machine learning model that predicts bounding boxes and classes of objects in an image. This model is post-training quantized - to int8 using samples from the [COCO dataset](https://cocodataset.org/#home). + to int8 using samples from the COCO dataset. tags: - real-time - quantized diff --git a/qai_hub_models/models/yolov8_det_quantized/perf.yaml b/qai_hub_models/models/yolov8_det_quantized/perf.yaml index b99d8e6a..3542b0ae 100644 --- a/qai_hub_models/models/yolov8_det_quantized/perf.yaml +++ b/qai_hub_models/models/yolov8_det_quantized/perf.yaml @@ -8,6 +8,11 @@ aggregated: - Google Pixel 4 - Google Pixel 4a - Google Pixel 5a 5G + - QCS6490 (Proxy) + - QCS8250 (Proxy) + - QCS8550 (Proxy) + - RB3 Gen 2 (Proxy) + - RB5 (Proxy) - Samsung Galaxy S21 - Samsung Galaxy S21 Ultra - Samsung Galaxy S21+ @@ -21,46 +26,66 @@ aggregated: - Samsung Galaxy S24 Ultra - Samsung Galaxy S24+ - Samsung Galaxy Tab S8 + - Snapdragon X Elite CRD - Xiaomi 12 - Xiaomi 12 Pro supported_chipsets: + - Qcs6490 + - Qcs8250 + - Qcs8550 - Snapdragon® 8 Gen 1 - Snapdragon® 8 Gen 2 - Snapdragon® 8 Gen 3 - Snapdragon® 888 + - Snapdragon® X Elite models: - name: YOLOv8-Detection-Quantized performance_metrics: - torchscript_onnx_tflite: - inference_time: 2122.0 - throughput: 471.25353440150803 + inference_time: 2343.0 + throughput: 426.8032437046522 estimated_peak_memory_range: min: 12288 - max: 2262728 + max: 2559336 primary_compute_unit: NPU precision: int8 layer_info: - layers_on_npu: 274 + layers_on_npu: 276 layers_on_gpu: 0 - layers_on_cpu: 0 - total_layers: 274 - job_id: jwgokj31p + layers_on_cpu: 1 + total_layers: 277 + job_id: jo5mznrwp job_status: Passed torchscript_onnx_qnn: - inference_time: 2121.0 - throughput: 471.4757190004715 + inference_time: 'null' + throughput: 'null' estimated_peak_memory_range: - min: 1249280 - max: 12007368 - primary_compute_unit: NPU - precision: int8 + min: 0 + max: 0 + primary_compute_unit: 'null' + precision: 'null' layer_info: - layers_on_npu: 272 + layers_on_npu: 0 layers_on_gpu: 0 layers_on_cpu: 0 - total_layers: 272 - job_id: jnp1yrwnp - job_status: Passed + total_layers: 0 + job_id: jep2mk845 + job_status: Failed + torchscript_onnx_ort: + inference_time: 'null' + throughput: 'null' + estimated_peak_memory_range: + min: 0 + max: 0 + primary_compute_unit: 'null' + 
precision: 'null' + layer_info: + layers_on_npu: 0 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 0 + job_id: j1p87qox5 + job_status: Failed reference_device_info: name: Samsung Galaxy S23 os: '13' @@ -68,37 +93,52 @@ models: os_name: Android manufacturer: Samsung chipset: Snapdragon® 8 Gen 2 - timestamp: '2024-04-23T18:42:34.010775Z' + timestamp: '2024-05-20T16:35:31.922821Z' - torchscript_onnx_tflite: - inference_time: 1422.0 - throughput: 703.2348804500704 + inference_time: 1587.0 + throughput: 630.119722747322 estimated_peak_memory_range: min: 12288 - max: 49561728 + max: 49417568 primary_compute_unit: NPU precision: int8 layer_info: - layers_on_npu: 274 + layers_on_npu: 276 layers_on_gpu: 0 - layers_on_cpu: 0 - total_layers: 274 - job_id: j1gl6jk2g + layers_on_cpu: 1 + total_layers: 277 + job_id: jegne62rg job_status: Passed torchscript_onnx_qnn: - inference_time: 1420.0 - throughput: 704.2253521126761 + inference_time: 'null' + throughput: 'null' estimated_peak_memory_range: - min: 1245184 - max: 107412320 - primary_compute_unit: NPU - precision: int8 + min: 0 + max: 0 + primary_compute_unit: 'null' + precision: 'null' layer_info: - layers_on_npu: 272 + layers_on_npu: 0 layers_on_gpu: 0 layers_on_cpu: 0 - total_layers: 272 - job_id: jvgdejv65 - job_status: Passed + total_layers: 0 + job_id: jqpyd1e7p + job_status: Failed + torchscript_onnx_ort: + inference_time: 'null' + throughput: 'null' + estimated_peak_memory_range: + min: 0 + max: 0 + primary_compute_unit: 'null' + precision: 'null' + layer_info: + layers_on_npu: 0 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 0 + job_id: jogkyez2p + job_status: Failed reference_device_info: name: Samsung Galaxy S24 os: '14' @@ -106,4 +146,141 @@ models: os_name: Android manufacturer: Samsung chipset: Snapdragon® 8 Gen 3 - timestamp: '2024-04-23T18:42:34.010850Z' + timestamp: '2024-05-20T16:35:31.922851Z' + - torchscript_onnx_tflite: + inference_time: 2345.0 + throughput: 426.43923240938165 + estimated_peak_memory_range: + min: 12288 + max: 3644216 + primary_compute_unit: NPU + precision: int8 + layer_info: + layers_on_npu: 276 + layers_on_gpu: 0 + layers_on_cpu: 1 + total_layers: 277 + job_id: jopryvk9g + job_status: Passed + torchscript_onnx_qnn: + inference_time: 'null' + throughput: 'null' + estimated_peak_memory_range: + min: 0 + max: 0 + primary_compute_unit: 'null' + precision: 'null' + layer_info: + layers_on_npu: 0 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 0 + job_id: j2p0rzy6p + job_status: Failed + reference_device_info: + name: QCS8550 (Proxy) + os: '12' + form_factor: Iot + os_name: Android + manufacturer: Qualcomm + chipset: Qcs8550 + timestamp: '2024-05-20T16:35:31.922869Z' + - torchscript_onnx_tflite: + inference_time: 5342.0 + throughput: 187.19580681392736 + estimated_peak_memory_range: + min: 12288 + max: 37726400 + primary_compute_unit: NPU + precision: int8 + layer_info: + layers_on_npu: 274 + layers_on_gpu: 0 + layers_on_cpu: 1 + total_layers: 275 + job_id: jogk3kkw5 + job_status: Passed + torchscript_onnx_qnn: + inference_time: 'null' + throughput: 'null' + estimated_peak_memory_range: + min: 0 + max: 0 + primary_compute_unit: 'null' + precision: 'null' + layer_info: + layers_on_npu: 0 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 0 + job_id: j1pvvnykp + job_status: Failed + reference_device_info: + name: RB3 Gen 2 (Proxy) + os: '12' + form_factor: Iot + os_name: Android + manufacturer: Qualcomm + chipset: Qcs6490 + timestamp: '2024-05-20T16:35:31.922886Z' + - torchscript_onnx_tflite: + 
inference_time: 44633.0 + throughput: 22.404947012300315 + estimated_peak_memory_range: + min: 3031040 + max: 12276104 + primary_compute_unit: NPU + precision: int8 + layer_info: + layers_on_npu: 273 + layers_on_gpu: 1 + layers_on_cpu: 1 + total_layers: 275 + job_id: jn5q3ddnp + job_status: Passed + reference_device_info: + name: RB5 (Proxy) + os: '12' + form_factor: Iot + os_name: Android + manufacturer: Qualcomm + chipset: Qcs8250 + timestamp: '2024-05-20T16:35:31.922898Z' + - torchscript_onnx_ort: + inference_time: 'null' + throughput: 'null' + estimated_peak_memory_range: + min: 0 + max: 0 + primary_compute_unit: 'null' + precision: 'null' + layer_info: + layers_on_npu: 0 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 0 + job_id: jn5q26845 + job_status: Failed + torchscript_onnx_ort_dml_gpu: + inference_time: 63514.0 + throughput: 15.744560254432093 + estimated_peak_memory_range: + min: 82382848 + max: 82382848 + primary_compute_unit: GPU + precision: int8 + layer_info: + layers_on_npu: 0 + layers_on_gpu: 1 + layers_on_cpu: 0 + total_layers: 1 + job_id: j1glkvn8p + job_status: Passed + reference_device_info: + name: Snapdragon X Elite CRD + os: '11' + form_factor: Compute + os_name: Windows + manufacturer: Qualcomm + chipset: Snapdragon® X Elite + timestamp: '2024-05-20T16:35:31.922917Z' diff --git a/qai_hub_models/models/yolov8_det_quantized/requirements.txt b/qai_hub_models/models/yolov8_det_quantized/requirements.txt index 48634d89..158fe9bd 100644 --- a/qai_hub_models/models/yolov8_det_quantized/requirements.txt +++ b/qai_hub_models/models/yolov8_det_quantized/requirements.txt @@ -2,3 +2,4 @@ object-detection-metrics==0.4.post1 seaborn==0.11.0 thop==0.1.1.post2209072238 ultralytics==8.0.193 +shapely==2.0.3 diff --git a/qai_hub_models/models/yolov8_seg/README.md b/qai_hub_models/models/yolov8_seg/README.md index f9d01e5e..518fab1f 100644 --- a/qai_hub_models/models/yolov8_seg/README.md +++ b/qai_hub_models/models/yolov8_seg/README.md @@ -14,6 +14,8 @@ accross various devices, can be found [here](https://aihub.qualcomm.com/models/y a hosted Qualcomm® device. + + ## Example & Usage Install the package via pip: diff --git a/qai_hub_models/models/yolov8_seg/export.py b/qai_hub_models/models/yolov8_seg/export.py index 362c5898..a632483a 100644 --- a/qai_hub_models/models/yolov8_seg/export.py +++ b/qai_hub_models/models/yolov8_seg/export.py @@ -121,9 +121,16 @@ def export_model( model.to("cpu"), make_torch_inputs(input_spec), check_trace=False ) + # Convert outputs from channel last to channel first (preferred I/O format for QNN and TensorFlow Lite) + channel_last_flags = ( + " --force_channel_last_input image" + if target_runtime != TargetRuntime.ORT + else "" + ) + # 2. 
Compile the model to an on-device asset model_compile_options = model.get_hub_compile_options( - target_runtime, compile_options + " --force_channel_last_input image" + target_runtime, compile_options + channel_last_flags, hub_device ) print(f"Optimizing model {model_name} to run on-device") submitted_compile_job = hub.submit_compile_job( @@ -161,8 +168,10 @@ def export_model( ) sample_inputs = model.sample_inputs(input_spec) # Convert inputs from channel first to channel last - hub_inputs = transpose_channel_first_to_last( - "image", sample_inputs, target_runtime + hub_inputs = ( + sample_inputs + if target_runtime == TargetRuntime.ORT + else transpose_channel_first_to_last("image", sample_inputs, target_runtime) ) submitted_inference_job = hub.submit_inference_job( model=compile_job.get_target_model(), @@ -201,7 +210,7 @@ def export_model( def main(): warnings.filterwarnings("ignore") - parser = export_parser(model_cls=Model, supports_qnn=False, supports_ort=False) + parser = export_parser(model_cls=Model, supports_qnn=False) args = parser.parse_args() export_model(**vars(args)) diff --git a/qai_hub_models/models/yolov8_seg/info.yaml b/qai_hub_models/models/yolov8_seg/info.yaml index e6fec415..c766b93d 100644 --- a/qai_hub_models/models/yolov8_seg/info.yaml +++ b/qai_hub_models/models/yolov8_seg/info.yaml @@ -37,7 +37,7 @@ form_factors: - IoT - XR has_static_banner: yes -has_animated_banner: no +has_animated_banner: yes license_type: agpl-3.0 deploy_license_type: agpl-3.0 dataset: [] diff --git a/qai_hub_models/models/yolov8_seg/perf.yaml b/qai_hub_models/models/yolov8_seg/perf.yaml index fd75d70b..2f2dba93 100644 --- a/qai_hub_models/models/yolov8_seg/perf.yaml +++ b/qai_hub_models/models/yolov8_seg/perf.yaml @@ -22,6 +22,7 @@ aggregated: - Samsung Galaxy S24 Ultra - Samsung Galaxy S24+ - Samsung Galaxy Tab S8 + - Snapdragon X Elite CRD - Xiaomi 12 - Xiaomi 12 Pro supported_chipsets: @@ -30,15 +31,16 @@ aggregated: - Snapdragon® 8 Gen 2 - Snapdragon® 8 Gen 3 - Snapdragon® 888 + - Snapdragon® X Elite models: - name: YOLOv8-Segmentation performance_metrics: - torchscript_onnx_tflite: - inference_time: 7033.0 - throughput: 142.18683349921798 + inference_time: 7377.0 + throughput: 135.556459265284 estimated_peak_memory_range: - min: 4595712 - max: 6959144 + min: 4571136 + max: 14729800 primary_compute_unit: NPU precision: fp16 layer_info: @@ -46,22 +48,37 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 337 - job_id: jqp4k361g + job_id: jw561y60p + job_status: Passed + torchscript_onnx_qnn: + inference_time: 6398.0 + throughput: 156.29884338855894 + estimated_peak_memory_range: + min: 6324224 + max: 17126264 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 333 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 333 + job_id: j1pvw63jg job_status: Passed torchscript_onnx_ort: - inference_time: 8072.0 - throughput: 123.8850346878097 + inference_time: 8007.0 + throughput: 124.89072061945798 estimated_peak_memory_range: - min: 15532032 - max: 36380192 + min: 14934016 + max: 41914744 primary_compute_unit: NPU precision: fp16 layer_info: - layers_on_npu: 1 + layers_on_npu: 336 layers_on_gpu: 0 layers_on_cpu: 0 - total_layers: 1 - job_id: jo5mq819p + total_layers: 336 + job_id: jz5w9ev6p job_status: Passed reference_device_info: name: Samsung Galaxy S23 @@ -70,13 +87,13 @@ models: os_name: Android manufacturer: Samsung chipset: Snapdragon® 8 Gen 2 - timestamp: '2024-04-23T18:42:34.024849Z' + timestamp: '2024-05-20T16:35:31.961611Z' - torchscript_onnx_tflite: 
- inference_time: 5210.0 - throughput: 191.93857965451056 + inference_time: 5365.0 + throughput: 186.39328984156572 estimated_peak_memory_range: - min: 40960 - max: 98992992 + min: 16384 + max: 95805104 primary_compute_unit: NPU precision: fp16 layer_info: @@ -84,22 +101,37 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 337 - job_id: j0pxnx8l5 + job_id: j1p3mjklg + job_status: Passed + torchscript_onnx_qnn: + inference_time: 4560.0 + throughput: 219.2982456140351 + estimated_peak_memory_range: + min: 4931584 + max: 119239328 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 333 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 333 + job_id: j7gjlvxxp job_status: Passed torchscript_onnx_ort: - inference_time: 5653.0 - throughput: 176.89722271360338 + inference_time: 5499.0 + throughput: 181.8512456810329 estimated_peak_memory_range: - min: 17702912 - max: 83989088 + min: 16408576 + max: 80100880 primary_compute_unit: NPU precision: fp16 layer_info: - layers_on_npu: 1 + layers_on_npu: 336 layers_on_gpu: 0 layers_on_cpu: 0 - total_layers: 1 - job_id: jegnlkdq5 + total_layers: 336 + job_id: jmg94l1l5 job_status: Passed reference_device_info: name: Samsung Galaxy S24 @@ -108,13 +140,13 @@ models: os_name: Android manufacturer: Samsung chipset: Snapdragon® 8 Gen 3 - timestamp: '2024-04-23T18:42:34.024902Z' + timestamp: '2024-05-20T16:35:31.961638Z' - torchscript_onnx_tflite: - inference_time: 7217.0 - throughput: 138.56172925038103 + inference_time: 7372.0 + throughput: 135.6483993488877 estimated_peak_memory_range: min: 4579328 - max: 18295080 + max: 7772616 primary_compute_unit: NPU precision: fp16 layer_info: @@ -122,7 +154,22 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 337 - job_id: j0pxnq9l5 + job_id: jwgov2yx5 + job_status: Passed + torchscript_onnx_qnn: + inference_time: 6402.0 + throughput: 156.20118712902217 + estimated_peak_memory_range: + min: 4939776 + max: 15507456 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 333 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 333 + job_id: jygz73ekp job_status: Passed reference_device_info: name: QCS8550 (Proxy) @@ -131,4 +178,57 @@ models: os_name: Android manufacturer: Qualcomm chipset: Qcs8550 - timestamp: '2024-04-23T18:42:34.024944Z' + timestamp: '2024-05-20T16:35:31.961654Z' + - torchscript_onnx_qnn: + inference_time: 7604.0 + throughput: 131.5097317201473 + estimated_peak_memory_range: + min: 4923392 + max: 4923392 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 333 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 333 + job_id: jlpevd915 + job_status: Passed + torchscript_onnx_ort: + inference_time: 8070.0 + throughput: 123.91573729863693 + estimated_peak_memory_range: + min: 22331392 + max: 22331392 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 336 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 336 + job_id: jnp184l2g + job_status: Passed + torchscript_onnx_ort_dml_gpu: + inference_time: 22496.0 + throughput: 44.45234708392603 + estimated_peak_memory_range: + min: 104538112 + max: 104538112 + primary_compute_unit: GPU + precision: fp16 + layer_info: + layers_on_npu: 0 + layers_on_gpu: 1 + layers_on_cpu: 0 + total_layers: 1 + job_id: jvgdvx9eg + job_status: Passed + reference_device_info: + name: Snapdragon X Elite CRD + os: '11' + form_factor: Compute + os_name: Windows + manufacturer: Qualcomm + chipset: Snapdragon® X Elite + timestamp: '2024-05-20T16:35:31.961681Z' diff 
--git a/qai_hub_models/requirements-dev.txt b/qai_hub_models/requirements-dev.txt index be5243dc..e1b0706d 100644 --- a/qai_hub_models/requirements-dev.txt +++ b/qai_hub_models/requirements-dev.txt @@ -1,6 +1,6 @@ boto3==1.34.40 botocore==1.34.40 -coverage==6.5.0 +coverage==5.3.1 imageio[ffmpeg]==2.31.5 jinja2==3.0.3 mypy==0.991 diff --git a/qai_hub_models/utils/aimet/default_config_llama.json b/qai_hub_models/utils/aimet/default_config_llama.json new file mode 100644 index 00000000..f084e00d --- /dev/null +++ b/qai_hub_models/utils/aimet/default_config_llama.json @@ -0,0 +1,176 @@ +{ + "defaults": + { + "ops": + { + "is_output_quantized": "True" + }, + "params": + { + "is_quantized": "True", + "is_symmetric": "True" + }, + "per_channel_quantization": "False", + "strict_symmetric": "False", + "unsigned_symmetric": "False" + }, + + "params": + { + "bias": + { + "is_quantized": "False" + } + }, + + "op_type": + { + "Squeeze": + { + "is_output_quantized": "False" + }, + "Pad": + { + "is_output_quantized": "False" + }, + "Reshape": + { + "is_output_quantized": "False" + }, + "ChannelShuffle": + { + "is_output_quantized": "False" + }, + "Tile": + { + "is_output_quantized": "False" + }, + "Cast": + { + "is_output_quantized": "False" + }, + "TopK": + { + "is_output_quantized": "False" + }, + "GatherND": + { + "is_output_quantized": "False" + }, + "ReduceMin": + { + "is_output_quantized": "False" + }, + "ReduceMax": + { + "is_output_quantized": "False" + }, + "Slice": + { + "is_output_quantized": "False" + }, + "NonZero": + { + "is_output_quantized": "False" + }, + "DepthToSpace": + { + "is_output_quantized": "False" + }, + "MaxPool": + { + "is_output_quantized": "False" + }, + "Split": + { + "is_output_quantized": "False" + }, + "Mean": + { + "is_output_quantized": "False" + }, + "Gemm": + { + "per_channel_quantization": "True" + }, + "Conv": + { + "per_channel_quantization": "True" + }, + "Transpose": + { + "is_output_quantized": "False" + }, + "LayerNorm": + { + "per_channel_quantization": "False", + "params": { + "weight": { + "is_symmetric": "False" + } + } + }, + "Gather": + { + "is_output_quantized": "True" + }, + "Sigmoid": + { + "encoding_constraints": + { + "min": 0.0, + "max": 1.0 + } + }, + "Softmax": + { + "encoding_constraints": + { + "min": 0.0, + "max": 1.0 + } + } + }, + + "supergroups": + [ + { + "op_list": ["Conv", "Relu"] + }, + { + "op_list": ["Conv", "Clip"] + }, + { + "op_list": ["Conv", "BatchNormalization", "Relu"] + }, + { + "op_list": ["ConvTranspose", "Relu"] + }, + { + "op_list": ["Add", "Relu"] + }, + { + "op_list": ["Gemm", "Relu"] + }, + { + "op_list": ["Conv", "PRelu"] + }, + { + "op_list": ["Conv", "BatchNormalization","PRelu"] + }, + { + "op_list": ["Conv", "HardSwish"] + }, + { + "op_list": ["Conv", "BatchNormalization","HardSwish"] + } + ], + + "model_input": + { + "is_input_quantized": "True" + }, + + "model_output": + {} +} diff --git a/qai_hub_models/utils/args.py b/qai_hub_models/utils/args.py index 93185c2c..9261722d 100644 --- a/qai_hub_models/utils/args.py +++ b/qai_hub_models/utils/args.py @@ -56,6 +56,17 @@ def add_output_dir_arg(parser: argparse.ArgumentParser) -> argparse.ArgumentPars return parser +def _get_default_runtime(available_runtimes: List[TargetRuntime]): + if len(available_runtimes) == 0: + raise RuntimeError("available_runtimes empty, expecting at-least one runtime.") + + return ( + TargetRuntime.TFLITE + if TargetRuntime.TFLITE in available_runtimes + else available_runtimes[0] + ) + + def add_target_runtime_arg( parser: 
argparse.ArgumentParser, help: str, @@ -116,11 +127,7 @@ def get_on_device_demo_parser( default="", help="If running on-device, use these options when submitting the inference job.", ) - default_runtime = ( - TargetRuntime.TFLITE - if TargetRuntime.TFLITE in available_target_runtimes - else available_target_runtimes[0] - ) + default_runtime = _get_default_runtime(available_runtimes=available_target_runtimes) add_target_runtime_arg( parser, help="The runtime to demo (if --on-device is specified).", @@ -378,9 +385,12 @@ def get_qcom_chipsets() -> Set[str]: def export_parser( model_cls: Type[FromPretrainedTypeVar] | Type[FromPrecompiledTypeVar], components: Optional[List[str]] = None, - supports_qnn=True, - supports_ort=True, - exporting_compiled_model=False, + supports_tflite: bool = True, + supports_qnn: bool = True, + supports_ort: bool = True, + default_runtime: TargetRuntime = TargetRuntime.TFLITE, + exporting_compiled_model: bool = False, + default_export_device: str = DEFAULT_EXPORT_DEVICE, ) -> argparse.ArgumentParser: """ Arg parser to be used in export scripts. @@ -401,6 +411,8 @@ def export_parser( True when exporting compiled model. If set, removing skip_profiling flag from export arguments. Default = False. + default_export_device: + Default device to set for export. Returns: Arg parser object. @@ -409,7 +421,7 @@ def export_parser( parser.add_argument( "--device", type=str, - default=DEFAULT_EXPORT_DEVICE, + default=default_export_device, help="Device for which to export.", ) parser.add_argument( @@ -450,14 +462,19 @@ def export_parser( ) if not exporting_compiled_model: # Default runtime for compiled model is fixed for given model - available_runtimes = [TargetRuntime.TFLITE] + available_runtimes = [] + if supports_tflite: + available_runtimes.append(TargetRuntime.TFLITE) if supports_qnn: available_runtimes.append(TargetRuntime.QNN) if supports_ort: available_runtimes.append(TargetRuntime.ORT) + + default_runtime = _get_default_runtime(available_runtimes) add_target_runtime_arg( parser, available_target_runtimes=available_runtimes, + default=default_runtime, help="The runtime for which to export.", ) # No compilation for compiled models diff --git a/qai_hub_models/utils/asset_loaders.py b/qai_hub_models/utils/asset_loaders.py index d7d6b0dd..ab2dd61d 100644 --- a/qai_hub_models/utils/asset_loaders.py +++ b/qai_hub_models/utils/asset_loaders.py @@ -30,6 +30,7 @@ from git import Repo from PIL import Image from schema import And, Schema, SchemaError +from tqdm import tqdm ASSET_BASES_DEFAULT_PATH = os.path.join( os.path.dirname(os.path.dirname(__file__)), "asset_bases.yaml" @@ -112,7 +113,7 @@ def maybe_clone_git_repo( model_name: str, model_version: VersionType, patches: List[str] = [], -) -> str: +) -> Path: """Clone (or pull) a repository, save it to disk in a standard location, and return the absolute path to the cloned location. Patches can be applied by providing a list of paths to diff files.""" @@ -242,12 +243,14 @@ def SourceAsRoot( Only one of this class should be active per Python session. 
""" - repository_path = maybe_clone_git_repo( - source_repo_url, - source_repo_commit_hash, - source_repo_name, - source_repo_version, - patches=source_repo_patches, + repository_path = str( + maybe_clone_git_repo( + source_repo_url, + source_repo_commit_hash, + source_repo_name, + source_repo_version, + patches=source_repo_patches, + ) ) SOURCE_AS_ROOT_LOCK.acquire() original_path = list(sys.path) @@ -384,63 +387,85 @@ def get_web_asset_url(self, model_id: str, type: QAIHM_WEB_ASSET): raise NotImplementedError("unsupported web asset type") return f"{self.asset_url}/{ModelZooAssetConfig._replace_path_keywords(self.web_asset_folder, model_id=model_id)}/{file}" + def get_local_store_path(self) -> Path: + return Path(self.local_store_path) + def get_local_store_model_path( self, model_name: str, version: VersionType, filename: str - ) -> str: - model_dir = os.path.join( - self.local_store_path, - self.get_relative_model_asset_path(model_name, version, filename), + ) -> Path: + return self.local_store_path / self.get_relative_model_asset_path( + model_name, version, filename ) - return model_dir def get_local_store_dataset_path( self, dataset_name: str, version: VersionType, filename: str - ) -> str: - model_dir = os.path.join( - self.local_store_path, - self.get_relative_dataset_asset_path(dataset_name, version, filename), + ) -> Path: + return self.local_store_path / self.get_relative_dataset_asset_path( + dataset_name, version, filename ) - return model_dir def get_relative_model_asset_path( self, model_id: str, version: Union[int, str], file_name: str - ): + ) -> Path: assert not file_name.startswith("/") and not file_name.startswith("\\") - return f"{ModelZooAssetConfig._replace_path_keywords(self.model_asset_folder, model_id=model_id, version=version)}/{file_name}" + return ( + Path( + ModelZooAssetConfig._replace_path_keywords( + self.model_asset_folder, model_id=model_id, version=version + ) + ) + / file_name + ) def get_relative_dataset_asset_path( self, dataset_id: str, version: Union[int, str], file_name: str - ): + ) -> Path: assert not file_name.startswith("/") and not file_name.startswith("\\") - return f"{ModelZooAssetConfig._replace_path_keywords(self.dataset_asset_folder, dataset_id=dataset_id, version=version)}/{file_name}" + return ( + Path( + ModelZooAssetConfig._replace_path_keywords( + self.dataset_asset_folder, dataset_id=dataset_id, version=version + ) + ) + / file_name + ) def get_model_asset_url( self, model_id: str, version: Union[int, str], file_name: str - ): + ) -> str: assert not file_name.startswith("/") and not file_name.startswith("\\") - return f"{self.asset_url}/{self.get_relative_model_asset_path(model_id, version, file_name)}" + return f"{self.asset_url}/{self.get_relative_model_asset_path(model_id, version, file_name).as_posix()}" def get_dataset_asset_url( self, dataset_id: str, version: Union[int, str], file_name: str - ): + ) -> str: assert not file_name.startswith("/") and not file_name.startswith("\\") - return f"{self.asset_url}/{self.get_relative_dataset_asset_path(dataset_id, version, file_name)}" + return f"{self.asset_url}/{self.get_relative_dataset_asset_path(dataset_id, version, file_name).as_posix()}" - def get_qaihm_repo(self, model_id: str, relative=True): - relative_path = f"{ModelZooAssetConfig._replace_path_keywords(self.qaihm_repo, model_id=model_id)}" + def get_qaihm_repo(self, model_id: str, relative=True) -> Path | str: + relative_path = Path( + ModelZooAssetConfig._replace_path_keywords( + self.qaihm_repo, model_id=model_id + ) + ) 
if not relative: - return self.repo_url + "/" + relative_path - + return f"{self.repo_url}/{relative_path.as_posix()}" return relative_path - def get_website_url(self, model_id: str, relative=False): - relative_path = f"{ModelZooAssetConfig._replace_path_keywords(self.models_website_relative_path, model_id=model_id)}" + def get_website_url(self, model_id: str, relative=False) -> str: + relative_path = Path( + ModelZooAssetConfig._replace_path_keywords( + self.models_website_relative_path, model_id=model_id + ) + ).as_posix() if not relative: - return self.models_website_url + "/" + relative_path + return f"{self.models_website_url}/{relative_path}" return relative_path - def get_example_use(self, model_id: str): - return f"{ModelZooAssetConfig._replace_path_keywords(self.example_use, model_id=model_id)}" + def get_example_use(self, model_id: str) -> str: + return ModelZooAssetConfig._replace_path_keywords( + self.example_use, model_id=model_id + ) ### # Helpers @@ -558,7 +583,7 @@ class CachedWebAsset: def __init__( self, url: str, - local_cache_path: str, + local_cache_path: Path, asset_config=ASSET_CONFIG, model_downloader: Callable[[str, str, int], str] | None = None, downloader_num_retries=4, @@ -573,12 +598,12 @@ def __init__( path, ext = os.path.splitext(self.local_cache_path) if not ext: file_name = self.url.rsplit("/", 1)[-1] - self.local_cache_path = os.path.join(path, file_name) + self.local_cache_path = Path(path) / file_name # Set is_extracted if already extracted on disk file, _ = os.path.splitext(self.local_cache_path) self.is_extracted = list( - filter(local_cache_path.endswith, [".zip", ".tar", ".tar.gz", ".tgz"]) + filter(str(local_cache_path).endswith, [".zip", ".tar", ".tar.gz", ".tgz"]) ) != [] and os.path.isdir(file) def __repr__(self): @@ -602,7 +627,7 @@ def from_asset_store( web_store_path = f"{asset_config.asset_url}/{relative_store_file_path}" return CachedWebAsset( web_store_path, - relative_store_file_path, + Path(relative_store_file_path), asset_config, download_file, num_retries, @@ -611,7 +636,7 @@ def from_asset_store( @staticmethod def from_google_drive( gdrive_file_id: str, - relative_store_file_path: str, + relative_store_file_path: str | Path, num_retries=4, asset_config=ASSET_CONFIG, ): @@ -630,7 +655,7 @@ def from_google_drive( """ return CachedWebAsset( f"https://drive.google.com/uc?id={gdrive_file_id}", - relative_store_file_path, + Path(relative_store_file_path), asset_config, download_and_cache_google_drive, num_retries, @@ -647,12 +672,13 @@ def path(self, extracted=None) -> Path: extracted: If true, return the path of the extracted asset on disk. If false, return the path of the archive path on disk. """ + file: str | Path if (extracted is None and self.is_extracted) or extracted: file, _ = os.path.splitext(self.local_cache_path) else: file = self.local_cache_path - return Path(self.asset_config.local_store_path) / file + return self.asset_config.get_local_store_path() / file def fetch(self, force=False, extract=False) -> Path: """ @@ -930,11 +956,22 @@ def download_file(web_url: str, dst_path: str, num_retries: int = 4) -> str: """ if not os.path.exists(dst_path): print(f"Downloading data at {web_url} to {dst_path}... ", end="") - file_data = requests.get(web_url) - if file_data.status_code != 200: + + # Streaming, so we can iterate over the response. + response = requests.get(web_url, stream=True) + + # Sizes in bytes. 
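# If the server omits Content-Length, total_size stays 0 and tqdm simply counts
# bytes without a percentage; otherwise the bar shows overall progress.
# Writing chunk by chunk avoids holding the entire download in memory, unlike
# the previous response.content approach.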
+ total_size = int(response.headers.get("content-length", 0)) + block_size = 1024 + + with tqdm(total=total_size, unit="B", unit_scale=True) as progress_bar: + with open(dst_path, "wb") as file: + for data in response.iter_content(block_size): + progress_bar.update(len(data)) + file.write(data) + + if response.status_code != 200: raise ValueError(f"Unable to download file at {web_url}") - with open(dst_path, "wb") as dst_file: - dst_file.write(file_data.content) print("Done") return dst_path @@ -1020,4 +1057,16 @@ def callback_with_retry( return callback_with_retry(num_retries - 1, callback, *args, **kwargs) +@contextmanager +def qaihm_temp_dir(): + """ + Keep temp file under LOCAL_STORE_DEFAULT_PATH instead of /tmp which has + limited space. + """ + path = os.path.join(LOCAL_STORE_DEFAULT_PATH, "tmp") + os.makedirs(path, exist_ok=True) + with tempfile.TemporaryDirectory(dir=path) as tempdir: + yield tempdir + + PathType = Union[str, Path, CachedWebAsset] diff --git a/qai_hub_models/utils/base_model.py b/qai_hub_models/utils/base_model.py index fb87155d..bc8a9a87 100644 --- a/qai_hub_models/utils/base_model.py +++ b/qai_hub_models/utils/base_model.py @@ -5,10 +5,11 @@ from __future__ import annotations from pathlib import Path -from typing import Any +from typing import Any, List, Optional +import qai_hub import torch -from qai_hub.client import SourceModel +from qai_hub.client import Device, SourceModel from qai_hub_models.models.common import ( SampleInputsType, @@ -124,6 +125,8 @@ def convert_to_hub_source_model( output_path: str | Path, input_spec: InputSpec | None = None, check_trace: bool = True, + external_onnx_weights: bool = False, + output_names: Optional[List[str]] = None, ) -> SourceModel: """ Convert to a AI Hub source model appropriate for the export method. @@ -138,6 +141,8 @@ def convert_to_hub_source_model( output_path=output_path, input_spec=input_spec, check_trace=check_trace, + external_onnx_weights=external_onnx_weights, + output_names=output_names, ) return source_model @@ -145,17 +150,44 @@ def get_hub_compile_options( self, target_runtime: TargetRuntime, other_compile_options: str = "", + device: Optional[Device] = None, ) -> str: """ AI Hub compile options recommended for the model. 
""" - compile_options = "" - if target_runtime == TargetRuntime.QNN: - compile_options = "--target_runtime qnn_lib_aarch64_android" - if target_runtime == TargetRuntime.ORT: - compile_options = "--target_runtime onnx" + target_runtime_flag = None + if "--target_runtime" not in other_compile_options: + if target_runtime == TargetRuntime.QNN: + if device: + if not device.attributes: + # Only name / os specified + devices = qai_hub.get_devices(device.name, device.os) + elif not device.name: + # Only attribute specified + devices = qai_hub.get_devices(attributes=device.attributes) + else: + devices = [device] + + for device in devices: + if "os:android" not in device.attributes: + target_runtime_flag = "qnn_bin" + break + + target_runtime_flag = target_runtime_flag or "qnn_lib_aarch64_android" + elif target_runtime == TargetRuntime.ORT: + target_runtime_flag = "onnx" + elif target_runtime == TargetRuntime.TFLITE: + target_runtime_flag = "tflite" + else: + raise NotImplementedError() + + compile_options = ( + f"--target_runtime {target_runtime_flag}" if target_runtime_flag else "" + ) + if other_compile_options != "": return compile_options + " " + other_compile_options + return compile_options def preferred_hub_source_model_format( diff --git a/qai_hub_models/utils/compare.py b/qai_hub_models/utils/compare.py index a89d437e..8b887ba9 100644 --- a/qai_hub_models/utils/compare.py +++ b/qai_hub_models/utils/compare.py @@ -4,13 +4,24 @@ # --------------------------------------------------------------------- from __future__ import annotations -from typing import Dict, List, Union +from typing import Dict, List, Optional, Union import numpy as np import pandas as pd import torch +def _flatten_tuple(out_tuple): + if not isinstance(out_tuple, tuple): + return (out_tuple.detach(),) + + flattened_tuple = [] + for elem in out_tuple: + flattened_tuple.extend(_flatten_tuple(elem)) + + return tuple(flattened_tuple) + + def torch_inference( model: torch.nn.Module, sample_inputs: Dict[str, List[np.ndarray]] ) -> List[np.ndarray]: @@ -33,8 +44,10 @@ def torch_inference( "cpu" ) with torch.no_grad(): - out = model(**inputs) + out = model(*inputs.values()) out_tuple = (out,) if isinstance(out, torch.Tensor) else out + out_tuple = _flatten_tuple(out_tuple) + for i, out_val in enumerate(out_tuple): if i == len(torch_outs): torch_outs.append([]) @@ -120,7 +133,7 @@ def compute_top_k_accuracy(expected, actual, k): def generate_comparison_metrics( expected: List[np.ndarray], actual: List[np.ndarray], - names: List[str] | None = None, + names: Optional[List[str]] = None, metrics: str = "psnr", ) -> pd.DataFrame: """ diff --git a/qai_hub_models/utils/config_loaders.py b/qai_hub_models/utils/config_loaders.py index 14b4136c..e0c71145 100644 --- a/qai_hub_models/utils/config_loaders.py +++ b/qai_hub_models/utils/config_loaders.py @@ -155,6 +155,10 @@ def __repr__(self) -> str: return self.__str__() +def is_gen_ai_model(tags: List[MODEL_TAG]) -> bool: + return MODEL_TAG.LLM in tags or MODEL_TAG.GENERATIVE_AI in tags + + class MODEL_STATUS(Enum): PUBLIC = 0 PRIVATE = 1 @@ -176,6 +180,7 @@ class MODEL_USE_CASE(Enum): IMAGE_GENERATION = 102 SUPER_RESOLUTION = 103 SEMANTIC_SEGMENTATION = 104 + DEPTH_ESTIMATION = 105 # Ex: OCR, image caption IMAGE_TO_TEXT = 105 OBJECT_DETECTION = 106 @@ -479,6 +484,7 @@ def __init__( has_on_target_demo: bool, qnn_export_failure_reason: str, tflite_export_failure_reason: str, + ort_export_failure_reason: str, has_demo: bool, check_trace: bool, channel_last_input: List[str], @@ -490,15 +496,17 
@@ def __init__( skip_tests: bool, is_precompiled: bool, no_assets: bool, + skip_export: bool, global_requirements_incompatible: bool, torchscript_opt: List[str], inference_metrics: str, - supports_ort: bool, + additional_readme_section: str, ) -> None: self.is_aimet = is_aimet self.has_on_target_demo = has_on_target_demo self.qnn_export_failure_reason = qnn_export_failure_reason self.tflite_export_failure_reason = tflite_export_failure_reason + self.ort_export_failure_reason = ort_export_failure_reason self.has_demo = has_demo self.check_trace = check_trace self.channel_last_input = channel_last_input @@ -513,7 +521,8 @@ def __init__( self.global_requirements_incompatible = global_requirements_incompatible self.torchscript_opt = torchscript_opt self.inference_metrics = inference_metrics - self.supports_ort = supports_ort + self.additional_readme_section = additional_readme_section + self.skip_export = skip_export def validate(self) -> Tuple[bool, Optional[str]]: """Returns false with a reason if the info spec for this model is not valid.""" @@ -537,6 +546,7 @@ def from_yaml( code_gen_config["has_on_target_demo"], code_gen_config["qnn_export_failure_reason"], code_gen_config["tflite_export_failure_reason"], + code_gen_config["ort_export_failure_reason"], code_gen_config["has_demo"], code_gen_config["check_trace"], code_gen_config["channel_last_input"], @@ -551,7 +561,8 @@ def from_yaml( code_gen_config["global_requirements_incompatible"], code_gen_config["torchscript_opt"], code_gen_config["inference_metrics"], - code_gen_config["supports_ort"], + code_gen_config["additional_readme_section"], + code_gen_config["skip_export"], ) # Schema for code-gen.yaml @@ -563,6 +574,7 @@ def from_yaml( OptionalSchema("has_on_target_demo", default=False): bool, OptionalSchema("qnn_export_failure_reason", default=""): str, OptionalSchema("tflite_export_failure_reason", default=""): str, + OptionalSchema("ort_export_failure_reason", default=""): str, OptionalSchema("has_demo", default=True): bool, OptionalSchema("check_trace", default=True): bool, OptionalSchema("channel_last_input", default=[]): list, @@ -577,7 +589,8 @@ def from_yaml( OptionalSchema("global_requirements_incompatible", default=False): bool, OptionalSchema("torchscript_opt", default=[]): list, OptionalSchema("inference_metrics", default="psnr"): str, - OptionalSchema("supports_ort", default=False): bool, + OptionalSchema("additional_readme_section", default=""): str, + OptionalSchema("skip_export", default=False): bool, } ) ) @@ -736,7 +749,7 @@ def validate(self) -> Tuple[bool, Optional[str]]: if session.head(animated_banner_url).status_code != requests.codes.ok: return False, f"Animated banner is missing at {animated_banner_url}" - expected_qaihm_repo = f"qai_hub_models/models/{self.id}" + expected_qaihm_repo = Path("qai_hub_models") / "models" / self.id if expected_qaihm_repo != ASSET_CONFIG.get_qaihm_repo(self.id): return False, "QAIHM repo not pointing to expected relative path" diff --git a/qai_hub_models/utils/draw.py b/qai_hub_models/utils/draw.py index 9352e7ef..2f89e8fc 100644 --- a/qai_hub_models/utils/draw.py +++ b/qai_hub_models/utils/draw.py @@ -4,27 +4,27 @@ # --------------------------------------------------------------------- from __future__ import annotations -from typing import List, Optional, Tuple +from typing import List, Optional, Tuple, Union import cv2 -import numpy +import numpy as np import torch def draw_points( - frame: numpy.ndarray, - points: numpy.ndarray | torch.Tensor, + frame: np.ndarray, + points: 
np.ndarray | torch.Tensor, color: Tuple[int, int, int] = (0, 0, 0), - size: int = 3, + size: Union[int, List[int]] = 10, ): """ Draw the given points on the frame. Parameters: - frame: numpy.ndarray - numpy array (H W C x uint8, BGR) + frame: np.ndarray + np array (H W C x uint8, BGR) - points: numpy.ndarray | torch.Tensor + points: np.ndarray | torch.Tensor array (N, 2) where layout is [x1, y1] [x2, y2], ... or @@ -40,38 +40,49 @@ def draw_points( Returns: None; modifies frame in place. """ - n2 = len(points.shape) == 2 - for i in range(0, len(points) if n2 else len(points) // 2): - x, y = points[i] if n2 else (points[i * 2], points[i * 2 + 1]) - cv2.circle(frame, (int(x), int(y)), size, color, thickness=size) + if len(points.shape) == 1: + points = points.reshape(-1, 2) + assert isinstance(size, int) or len(size) == len(points) + cv_keypoints = [] + for i, (x, y) in enumerate(points): + curr_size = size if isinstance(size, int) else size[i] + cv_keypoints.append(cv2.KeyPoint(int(x), int(y), curr_size)) + + cv2.drawKeypoints( + frame, + cv_keypoints, + outImage=frame, + color=color, + flags=cv2.DRAW_MATCHES_FLAGS_DRAW_RICH_KEYPOINTS, + ) def draw_connections( - frame: numpy.ndarray, - points: numpy.ndarray | torch.Tensor, + frame: np.ndarray, + points: np.ndarray | torch.Tensor, connections: List[Tuple[int, int]], color: Tuple[int, int, int] = (0, 0, 0), - size: int = 3, + size: int = 1, ): """ Draw connecting lines between the given points on the frame. Parameters: - frame: numpy.ndarray - numpy array (H W C x uint8, BGR) + frame: + np array (H W C x uint8, BGR) - points: numpy.ndarray | torch.Tensor + points: array (N, 2) where layout is [x1, y1] [x2, y2], ... or array (N * 2,) where layout is x1, y1, x2, y2, ... - connections: List[Tuple[int, int]] + connections: List of points that should be connected by a line. Format is [(src point index, dst point index), ...] - color: Tuple[int, int, int] + color: Color of drawn points (RGB) size: int @@ -80,34 +91,28 @@ def draw_connections( Returns: None; modifies frame in place. """ - n2 = len(points.shape) == 2 - for connection in connections: - x0, y0 = ( - points[connection[0]] - if n2 - else (points[connection[0] * 2], points[connection[0] * 2 + 1]) - ) - x1, y1 = ( - points[connection[1]] - if n2 - else (points[connection[1] * 2], points[connection[1] * 2 + 1]) - ) - x0, y0 = int(x0), int(y0) - x1, y1 = int(x1), int(y1) - cv2.line(frame, (x0, y0), (x1, y1), color, size) + if len(points.shape) == 1: + points = points.reshape(-1, 2) + point_pairs = [ + ((int(points[i][0]), int(points[i][1])), (int(points[j][0]), int(points[j][1]))) + for (i, j) in connections + ] + cv2.polylines( + frame, np.array(point_pairs), isClosed=False, color=color, thickness=size # type: ignore + ) def draw_box_from_corners( - frame: numpy.ndarray, corners: numpy.ndarray | torch.Tensor, color=(0, 0, 0), size=3 + frame: np.ndarray, corners: np.ndarray | torch.Tensor, color=(0, 0, 0), size=3 ): """ Draw a box using the 4 points provided as boundaries. Parameters: - frame: numpy.ndarray - numpy array (H W C x uint8, BGR) + frame: np.ndarray + np array (H W C x uint8, BGR) - corners: numpy.ndarray | torch.Tensor + corners: np.ndarray | torch.Tensor array (4, 2) where layout is [x1, y1] [x2, y2], ... 
or @@ -128,8 +133,8 @@ def draw_box_from_corners( def draw_box_from_xywh( - frame: numpy.ndarray, - box: numpy.ndarray | torch.Tensor, + frame: np.ndarray, + box: np.ndarray | torch.Tensor, color: Tuple[int, int, int] = (0, 0, 0), size: int = 3, ): @@ -137,10 +142,10 @@ def draw_box_from_xywh( Draw a box using the provided data (center / height / width) to compute the box. Parameters: - frame: numpy.ndarray - numpy array (H W C x uint8, BGR) + frame: np.ndarray + np array (H W C x uint8, BGR) - box: numpy.ndarray | torch.Tensor + box: np.ndarray | torch.Tensor array (4), where layout is [xcenter, ycenter, h, w] @@ -160,9 +165,9 @@ def draw_box_from_xywh( def draw_box_from_xyxy( - frame: numpy.ndarray, - top_left: numpy.ndarray | torch.Tensor | Tuple[int, int], - bottom_right: numpy.ndarray | torch.Tensor | Tuple[int, int], + frame: np.ndarray, + top_left: np.ndarray | torch.Tensor | Tuple[int, int], + bottom_right: np.ndarray | torch.Tensor | Tuple[int, int], color: Tuple[int, int, int] = (0, 0, 0), size: int = 3, text: Optional[str] = None, @@ -171,10 +176,10 @@ def draw_box_from_xyxy( Draw a box using the provided top left / bottom right points to compute the box. Parameters: - frame: numpy.ndarray - numpy array (H W C x uint8, BGR) + frame: np.ndarray + np array (H W C x uint8, BGR) - box: numpy.ndarray | torch.Tensor + box: np.ndarray | torch.Tensor array (4), where layout is [xc, yc, h, w] @@ -217,7 +222,7 @@ def create_color_map(num_classes): Returns: A list of `num_classes` colors in RGB format. """ - numpy.random.seed(42) # For reproducible results - color_map = numpy.random.randint(0, 256, size=(num_classes, 3), dtype=numpy.uint8) + np.random.seed(42) # For reproducible results + color_map = np.random.randint(0, 256, size=(num_classes, 3), dtype=np.uint8) color_map[0] = [0, 0, 0] # Background class, usually black return color_map diff --git a/qai_hub_models/utils/huggingface.py b/qai_hub_models/utils/huggingface.py index d278d95c..4ddd9bef 100644 --- a/qai_hub_models/utils/huggingface.py +++ b/qai_hub_models/utils/huggingface.py @@ -8,7 +8,9 @@ from pathlib import Path from typing import List -from huggingface_hub import HfFileSystem, hf_hub_download +from huggingface_hub import HfApi, HfFileSystem, hf_hub_download +from huggingface_hub.utils import GatedRepoError +from packaging import version from qai_hub_models.utils.asset_loaders import ASSET_CONFIG, ModelZooAssetConfig from qai_hub_models.utils.base_model import TargetRuntime @@ -45,3 +47,37 @@ def fetch_huggingface_target_model( paths.append(path) return paths + + +def has_model_access(repo_name: str, repo_url: str): + # Huggingface returns GatedRepoError if model is not accessible to current User. + # ref: https://github.com/huggingface/huggingface_hub/blob/5ff2d150d121d04799b78bc08f2343c21b8f07a9/src/huggingface_hub/utils/_errors.py#L135 + + try: + hf_api = HfApi() + hf_api.model_info(repo_name) + except GatedRepoError: + no_access_error = ( + f"Seems like you don't have access to {repo_name} yet.\nPlease follow the following steps:" + f"\n 1. Apply for access at {repo_url}" + f"\n 2. Setup Huggingface API token as described in https://huggingface.co/docs/huggingface_hub/en/quick-start#login-command" + f"\nOnce access request is approved, you should be able to export/load {repo_name} via AI-Hub." + ) + raise RuntimeError(no_access_error) + + # Model is accesible for current User. 
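# Only GatedRepoError is caught above; any other failure from
# hf_api.model_info() (unknown repo id, network error) propagates unchanged.
# Typical use is a guard before export, e.g.
#   has_model_access("some-org/gated-model", "https://huggingface.co/some-org/gated-model")
# (hypothetical repo id, for illustration only).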
+ return True + + +def ensure_has_required_transformer(least_expected_version): + # import transformer as part of this function + # to avoid leaking installation globally on file import. + # NOTE: #10761 this function should not be required once AIMET (https://pypi.org/project/aimet-torch/) + # remove tight dependency on transformers. + import transformers + + if version.parse(transformers.__version__) < version.parse(least_expected_version): + raise RuntimeError( + f"Installed transformers version not supported. Expected >= {least_expected_version}, got {str(transformers.__version__)}\n" + f"Please run `pip install transformers=={least_expected_version}`" + ) diff --git a/qai_hub_models/utils/image_processing.py b/qai_hub_models/utils/image_processing.py index 4fb9405a..8d86f924 100644 --- a/qai_hub_models/utils/image_processing.py +++ b/qai_hub_models/utils/image_processing.py @@ -5,6 +5,7 @@ from __future__ import annotations import functools +import math from typing import Callable, List, Tuple import cv2 @@ -16,6 +17,15 @@ from torch.nn.functional import interpolate, pad from torchvision import transforms +IMAGENET_DIM = 224 +IMAGENET_TRANSFORM = transforms.Compose( + [ + transforms.Resize(256), + transforms.CenterCrop(IMAGENET_DIM), + transforms.ToTensor(), + ] +) + def app_to_net_image_inputs( pixel_values_or_image: torch.Tensor | np.ndarray | Image | List[Image], @@ -175,12 +185,15 @@ def resize_pad(image: torch.Tensor, dst_size: Tuple[int, int]): h_ratio = dst_frame_height / height w_ratio = dst_frame_width / width - if width * h_ratio > dst_frame_height: - scale = w_ratio - else: + scale = min(h_ratio, w_ratio) + if h_ratio < w_ratio: scale = h_ratio - - import math + new_height = dst_frame_height + new_width = math.floor(width * scale) + else: + scale = w_ratio + new_height = math.floor(height * scale) + new_width = dst_frame_width new_height = math.floor(height * scale) new_width = math.floor(width * scale) diff --git a/qai_hub_models/utils/inference.py b/qai_hub_models/utils/inference.py index ef927a52..e122014d 100644 --- a/qai_hub_models/utils/inference.py +++ b/qai_hub_models/utils/inference.py @@ -5,9 +5,8 @@ from __future__ import annotations import os -import tempfile from pathlib import Path -from typing import List, Mapping, Tuple +from typing import List, Mapping, Optional, Tuple import numpy as np import qai_hub as hub @@ -15,7 +14,7 @@ from qai_hub.public_rest_api import DatasetEntries from qai_hub_models.models.protocols import ExecutableModelProtocol -from qai_hub_models.utils.asset_loaders import ModelZooAssetConfig +from qai_hub_models.utils.asset_loaders import ModelZooAssetConfig, qaihm_temp_dir from qai_hub_models.utils.base_model import BaseModel, SourceModelFormat, TargetRuntime from qai_hub_models.utils.input_spec import InputSpec from qai_hub_models.utils.qai_hub_helpers import ( @@ -38,6 +37,8 @@ def prepare_compile_zoo_model_to_hub( input_spec: InputSpec | None = None, check_trace: bool = True, prepare_compile_options_only: bool = False, + external_onnx_weights: bool = False, + output_names: Optional[List[str]] = None, ) -> Tuple[str | None, str]: """ Args: @@ -86,12 +87,19 @@ def prepare_compile_zoo_model_to_hub( compilation_options = model.get_hub_compile_options(target_runtime) + if output_names is None: + output_names = [] + if is_aimet: if source_model_format == SourceModelFormat.ONNX: def export_model_func(): + print("Exporting model to ONNX and generating AIMET encodings") return model.convert_to_onnx_and_aimet_encodings( - output_path, 
model_name=model_name + output_path, + model_name=model_name, + external_weights=external_onnx_weights, + output_names=output_names, ) elif ( @@ -100,6 +108,7 @@ def export_model_func(): ): def export_model_func(): + print("Converting model to Torchscript") traced_model = model.convert_to_torchscript( input_spec=input_spec, check_trace=check_trace ) @@ -111,6 +120,7 @@ def export_model_func(): else: # Torchscript and QNN def export_model_func(): + print("Converting model to Torchscript and generating AIMET encodings") exported_model = model.convert_to_torchscript_and_aimet_encodings( # type: ignore output_path, model_name=model_name, @@ -161,7 +171,7 @@ def compile_zoo_model_to_hub( model_name = model.__class__.__name__ - with tempfile.TemporaryDirectory() as tmp_dir: + with qaihm_temp_dir() as tmp_dir: assert tmp_dir is not None source_model, compilation_options = prepare_compile_zoo_model_to_hub( model=model, @@ -218,11 +228,13 @@ def __init__( input_names: List[str], device: hub.Device, inference_options: str = "", + output_names: Optional[List[str]] = None, ): self.model = model self.input_names = input_names self.device = device self.inference_options = inference_options + self.output_names = [] if output_names is None else output_names def __call__( self, @@ -309,9 +321,12 @@ def forward( target_runtime, ) # type: ignore + outputs = output_dataset.values() # type: ignore + if len(self.output_names) > 0: + outputs = [output_dataset[out_name] for out_name in self.output_names] # type: ignore + output_torch = [ - torch.from_numpy(np.concatenate(outputs, axis=0)) - for outputs in output_dataset.values() # type: ignore + torch.from_numpy(np.concatenate(output, axis=0)) for output in outputs ] if len(output_torch) == 1: @@ -334,9 +349,8 @@ def get_uploaded_precompiled_model( model_name, model_version, f"{model_component}_model_id.cached" ) - use_cached_model = not ignore_cached_model or os.path.exists(model_id_path) uploaded_model = None - if use_cached_model: + if not ignore_cached_model: try: with open(model_id_path, "r") as model_id_file: model_id = model_id_file.readline().strip() @@ -346,8 +360,7 @@ def get_uploaded_precompiled_model( return uploaded_model except Exception: - # Try uploading model instead - use_cached_model = False + pass # Upload model on hub uploaded_model = hub.upload_model(model_path) diff --git a/qai_hub_models/utils/measurement.py b/qai_hub_models/utils/measurement.py index 2c4a8f21..b0eb555a 100644 --- a/qai_hub_models/utils/measurement.py +++ b/qai_hub_models/utils/measurement.py @@ -5,7 +5,6 @@ from __future__ import annotations import os -import tempfile from pathlib import Path from typing import List, Union @@ -13,6 +12,8 @@ import qai_hub as hub from tflite import Model as TFModel # type: ignore +from qai_hub_models.utils.asset_loaders import qaihm_temp_dir + def display_with_sig_figs(num: float, num_sig_figs: int = 3) -> str: """ @@ -103,7 +104,7 @@ def get_model_size_mb(hub_model: hub.Model) -> float: """Return target model size in MB. 
This is a special case for ease of testing""" assert hub_model is not None - with tempfile.TemporaryDirectory() as tmp_dir: + with qaihm_temp_dir() as tmp_dir: download_path = Path(tmp_dir) / "model" # Download the model into the temporary directory hub_model.download(download_path) # type: ignore diff --git a/qai_hub_models/utils/model_adapters.py b/qai_hub_models/utils/model_adapters.py index 44e94b16..720a3aa5 100644 --- a/qai_hub_models/utils/model_adapters.py +++ b/qai_hub_models/utils/model_adapters.py @@ -16,7 +16,7 @@ def flatten(obj): flattened_list = [] for item in obj: if isinstance(item, tgt_type): - flattened_list.extend(flatten(item, tgt_type)) + flattened_list.extend(flatten(item)) else: flattened_list.append(item) return flattened_list diff --git a/qai_hub_models/utils/printing.py b/qai_hub_models/utils/printing.py index 89e5ab3f..95aa9bdc 100644 --- a/qai_hub_models/utils/printing.py +++ b/qai_hub_models/utils/printing.py @@ -4,7 +4,7 @@ # --------------------------------------------------------------------- from collections import Counter from pathlib import Path -from typing import Any, Dict, List, Optional +from typing import Any, Dict, List, Optional, Union import numpy as np import qai_hub as hub @@ -20,17 +20,36 @@ _INFO_DASH = "-" * 60 +def print_with_box(data: List[str]) -> None: + """ + Print input list with box around it as follows + +-----------------------------+ + | list data 1 | + | list data 2 that is longest | + | data | + +-----------------------------+ + """ + size = max(len(line) for line in data) + size += 2 + print("+" + "-" * size + "+") + for line in data: + print("| {:<{}} |".format(line, size - 2)) + print("+" + "-" * size + "+") + + def print_inference_metrics( inference_job: hub.InferenceJob, inference_result: DatasetEntries, torch_out: List[np.ndarray], outputs_to_skip: Optional[List[int]] = None, + output_names: Optional[List[str]] = None, metrics: str = "psnr", ) -> None: + if output_names is None: + output_names = list(inference_result.keys()) inference_data = [ - np.concatenate(outputs, axis=0) for outputs in inference_result.values() + np.concatenate(inference_result[out_name], axis=0) for out_name in output_names ] - output_names = list(inference_result.keys()) df_eval = generate_comparison_metrics( torch_out, inference_data, names=output_names, metrics=metrics ) @@ -78,7 +97,7 @@ def print_profile_metrics_from_job( runtime = TargetRuntime.TFLITE elif is_qnn_hub_model(profile_job.model): runtime = TargetRuntime.QNN - elif profile_job.model.model_type == SourceModelType.ORT: + elif profile_job.model.model_type in [SourceModelType.ORT, SourceModelType.ONNX]: runtime = TargetRuntime.ORT else: raise NotImplementedError() @@ -128,18 +147,30 @@ def print_profile_metrics( def print_on_target_demo_cmd( - compile_job: hub.CompileJob, model_folder: Path, device: str + compile_job: Union[hub.CompileJob, List[hub.CompileJob]], + model_folder: Path, + device: str, ) -> None: """ Outputs a command that will run a model's demo script via inference job. 
""" - assert compile_job.wait().success - print("\nRun this model on a hosted device on sample data using:") - target_model = compile_job.get_target_model() - assert target_model is not None + if isinstance(compile_job, hub.CompileJob): + compile_job = [compile_job] + + target_model_id = [] + for job in compile_job: + assert job.wait().success + target_model = job.get_target_model() + assert target_model is not None + target_model_id.append(target_model.model_id) + + target_model_id_str = ",".join(target_model_id) + print( + f"\nRun compiled model{'s' if len(target_model_id) > 1 else ''} on a hosted device on sample data using:" + ) print( f"python {model_folder / 'demo.py'} " "--on-device " - f"--hub-model-id {target_model.model_id} " + f"--hub-model-id {target_model_id_str} " f'--device "{device}"\n' ) diff --git a/qai_hub_models/utils/qai_hub_helpers.py b/qai_hub_models/utils/qai_hub_helpers.py index 89deb1d6..d8db058e 100644 --- a/qai_hub_models/utils/qai_hub_helpers.py +++ b/qai_hub_models/utils/qai_hub_helpers.py @@ -6,7 +6,7 @@ import os from pathlib import Path -from typing import Dict, List +from typing import Dict, List, Optional import numpy as np import qai_hub as hub @@ -89,7 +89,7 @@ def export_without_hub_access( target_runtime: TargetRuntime, compile_options: str, profile_options: str, - components: List[str] | None = None, + components: Optional[List[str]] = None, ) -> List[str]: print(_WARNING_DASH) print( diff --git a/qai_hub_models/utils/quantization_aimet.py b/qai_hub_models/utils/quantization_aimet.py index 02ebd2a1..0a0a61d0 100644 --- a/qai_hub_models/utils/quantization_aimet.py +++ b/qai_hub_models/utils/quantization_aimet.py @@ -31,15 +31,16 @@ ) import shutil -import tempfile from pathlib import Path from typing import Any, Dict, List, Optional, Tuple -from zipfile import ZipFile +from zipfile import ZIP_DEFLATED, ZipFile import aimet_torch.elementwise_ops as aimet_ops import torch import torch.nn.modules as nn -from qai_hub.client import DatasetEntries +from onnx import load_model as load_onnx_model +from onnx import save_model as save_onnx_model +from qai_hub.client import DatasetEntries, Device from qai_hub_models.evaluators.base_evaluators import _DataLoader, _for_each_batch from qai_hub_models.models._shared.common import apply_module_function_recursively @@ -48,6 +49,7 @@ PretrainedHubModelProtocol, QuantizableModelProtocol, ) +from qai_hub_models.utils.asset_loaders import qaihm_temp_dir from qai_hub_models.utils.input_spec import InputSpec, make_torch_inputs @@ -58,19 +60,27 @@ def _should_tie_observers(op: torch.nn.Module) -> bool: if not hasattr(op, "_module_to_wrap"): return False wrapped_op = op._module_to_wrap - op_types_to_tie = [nn.MaxPool2d, nn.AvgPool2d, nn.Upsample, aimet_ops.Concat] + op_types_to_tie = [ + nn.MaxPool2d, + nn.AvgPool2d, + nn.Upsample, + aimet_ops.Concat, + aimet_ops.Interpolate, + ] for op_type in op_types_to_tie: if isinstance(wrapped_op, op_type): return True return False -def _get_observer_module_name(modules: Dict[str, Any], name: str) -> Optional[str]: - module = modules.get(name) +def _get_observer_module_name(modules: Dict[str, Any], target: Any) -> Optional[str]: + if not isinstance(target, str): + return None + module = modules.get(target) if isinstance(module, QcQuantizeWrapper): - return name + return target elif isinstance(module, aimet_ops.CustomSiLU): - return name + ".mul" + return target + ".mul" return None @@ -140,7 +150,13 @@ def tie_observers(quant_sim: QuantizationSimModel) -> None: modules, 
input_node.target ) ) is None: + if input_node.target == getattr: + # If the input node is getting a tensor attribute (e.g. shape) + # No observers need to be tied + break input_node = input_node.all_input_nodes[0] + if input_node.target == getattr or observer_module_name is None: + continue if observer_module_name not in quantizer_deps: quantizer_deps[observer_module_name] = [] quantizer_deps[observer_module_name].append(node.target) @@ -315,7 +331,7 @@ def convert_to_torchscript_and_aimet_encodings( zip_path = os.path.join(output_dir, f"{model_name}.aimet.zip") base_dir = Path(f"{model_name}.aimet") - with tempfile.TemporaryDirectory() as tmpdir: + with qaihm_temp_dir() as tmpdir: base_path = Path(tmpdir) / base_dir os.makedirs(base_path) self.quant_sim.export( @@ -343,6 +359,8 @@ def convert_to_onnx_and_aimet_encodings( output_dir: str | Path, input_spec: InputSpec | None = None, model_name: str | None = None, + external_weights: bool = False, + output_names: Optional[List[str]] = None, ) -> str: """ Converts the torch module to a zip file containing an @@ -357,27 +375,53 @@ def convert_to_onnx_and_aimet_encodings( zip_path = os.path.join(output_dir, f"{model_name}.aimet.zip") base_dir = Path(f"{model_name}.aimet") - with tempfile.TemporaryDirectory() as tmpdir: + with qaihm_temp_dir() as tmpdir: base_path = Path(tmpdir) / base_dir if base_path.exists(): shutil.rmtree(base_path) os.makedirs(base_path) onnx_utils.EXPORT_TO_ONNX_DIRECT = self.needs_onnx_direct_aimet_export + self.quant_sim.export( str(base_path), model_name, tuple(make_torch_inputs(input_spec)), - onnx_export_args=dict(input_names=[name for name in input_spec]), + onnx_export_args=dict( + input_names=[name for name in input_spec], output_names=output_names + ), ) - onnx_file_name = f"{model_name}.onnx" encodings_file_name = f"{model_name}.encodings" - with ZipFile(zip_path, "w") as zip_object: + external_weights_file_name = f"{model_name}.data" + + if external_weights: + # Torch exports to onnx with external weights scattered in a directory. + # Save ONNX model with weights to one file. + onnx_file_path = str(base_path / onnx_file_name) + onnx_model = load_onnx_model(onnx_file_path) + save_onnx_model( + onnx_model, + str(onnx_file_path), + save_as_external_data=True, + all_tensors_to_one_file=True, + location=external_weights_file_name, + ) + + # compresslevel defines how fine compression should run + # higher the level, heavier algorithm is used leading to more time. + # For large models, higher compression takes longer time to compress. 
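# For ZIP_DEFLATED, compresslevel follows zlib's 0-9 scale; 4 keeps the archive
# noticeably smaller than no compression while staying much faster than level 9,
# a reasonable middle ground for large ONNX exports with external weights.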
+ with ZipFile(zip_path, "w", ZIP_DEFLATED, compresslevel=4) as zip_object: zip_object.write(base_path, base_dir) + zip_object.write( base_path / onnx_file_name, os.path.join(base_dir, onnx_file_name) ) + if external_weights: + zip_object.write( + base_path / external_weights_file_name, + os.path.join(base_dir, external_weights_file_name), + ) zip_object.write( base_path / encodings_file_name, os.path.join(base_dir, encodings_file_name), @@ -391,7 +435,7 @@ def convert_to_torchscript( if not input_spec: input_spec = self.get_input_spec() - with tempfile.TemporaryDirectory() as tempdir: + with qaihm_temp_dir() as tempdir: self.quant_sim.export( tempdir, "model", @@ -415,12 +459,19 @@ def get_calibration_data( return {k: v.numpy() for k, v in zip(input_spec.keys(), inputs)} def get_hub_compile_options( - self, target_runtime: TargetRuntime, other_compile_options: str = "" + self, + target_runtime: TargetRuntime, + other_compile_options: str = "", + device: Optional[Device] = None, ) -> str: compile_options = super().get_hub_compile_options( # type: ignore - target_runtime, other_compile_options + target_runtime, other_compile_options, device ) - return compile_options + " --quantize_full_type int8 --quantize_io" + compile_options = compile_options + " --quantize_full_type int8" + if target_runtime != TargetRuntime.ORT: + # TODO(#10896): Restore quantize_io flag when targeting ORT + compile_options = compile_options + " --quantize_io" + return compile_options def preferred_hub_source_model_format( self, target_runtime: TargetRuntime diff --git a/qai_hub_models/utils/scorecard/common.py b/qai_hub_models/utils/scorecard/common.py index f08b909b..b3230a71 100644 --- a/qai_hub_models/utils/scorecard/common.py +++ b/qai_hub_models/utils/scorecard/common.py @@ -2,35 +2,262 @@ # Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. # SPDX-License-Identifier: BSD-3-Clause # --------------------------------------------------------------------- -import qai_hub as hub +import os +from enum import Enum +from typing import Dict, List, Optional, Tuple -SCORECARD_DEVICE_NAME_TO_CHIPSET_NAME = { - "s23": "qualcomm-snapdragon-8gen2", - "s24": "qualcomm-snapdragon-8gen3", - "6490": "qualcomm-qcs6490", - "8250": "qualcomm-qcs8250", - "8550": "qualcomm-qcs8550", -} +import qai_hub as hub +from qai_hub_models.models.common import TargetRuntime -SCORECARD_DEVICE_NAME_TO_CHIPSET = { - device: f"chipset:{chipset}" - for device, chipset in SCORECARD_DEVICE_NAME_TO_CHIPSET_NAME.items() -} +_DEVICE_CACHE: Dict[str, hub.Device] = {} -def __get_device(device_name) -> hub.Device: +def _get_cached_device(device_name: str) -> hub.Device: # Gets a device with attributes & OS. 
This only comes from hub.get_devices() - for device in hub.get_devices(): - if device.name == device_name: - return device - raise ValueError(f"No device named {device_name}") - - -REFERENCE_DEVICE_PER_SUPPORTED_CHIPSETS = { - "qualcomm-snapdragon-8gen2": __get_device("Samsung Galaxy S23"), - "qualcomm-snapdragon-8gen3": __get_device("Samsung Galaxy S24"), - "qualcomm-qcs6490": __get_device("RB3 Gen 2 (Proxy)"), - "qualcomm-qcs8250": __get_device("RB5 (Proxy)"), - "qualcomm-qcs8550": __get_device("QCS8550 (Proxy)"), -} + device = _DEVICE_CACHE.get(device_name, None) + if not device: + device = hub.get_devices(device_name)[0] + _DEVICE_CACHE[device_name] = device + return device + + +class ScorecardDevice(Enum): + any = 0 # no specific device (usable only during compilation) + + # cs == chipset + cs_8_gen_2 = 1 + cs_8_gen_3 = 2 + cs_6490 = 3 + cs_8250 = 4 + cs_8550 = 5 + cs_x_elite = 6 + + def enabled(self) -> bool: + valid_test_devices = os.environ.get("WHITELISTED_PROFILE_TEST_DEVICES", "ALL") + return ( + valid_test_devices == "ALL" + or self == ScorecardDevice.any + or self.name in valid_test_devices.split(",") + ) + + def all_enabled(self) -> List["ScorecardDevice"]: + return [x for x in ScorecardDevice if x.enabled()] + + def get_reference_device(self) -> hub.Device: + if self in [ScorecardDevice.cs_8_gen_2, ScorecardDevice.any]: + return _get_cached_device("Samsung Galaxy S23") + if self == ScorecardDevice.cs_8_gen_3: + return _get_cached_device("Samsung Galaxy S24") + if self == ScorecardDevice.cs_6490: + return _get_cached_device("RB3 Gen 2 (Proxy)") + if self == ScorecardDevice.cs_8250: + return _get_cached_device("RB5 (Proxy)") + if self == ScorecardDevice.cs_8550: + return _get_cached_device("QCS8550 (Proxy)") + if self == ScorecardDevice.cs_x_elite: + return _get_cached_device("Snapdragon X Elite CRD") + raise NotImplementedError(f"No reference device for {self.name}") + + def get_chipset(self) -> str: + if self in [ScorecardDevice.cs_8_gen_2, ScorecardDevice.any]: + return "qualcomm-snapdragon-8gen2" + if self == ScorecardDevice.cs_8_gen_3: + return "qualcomm-snapdragon-8gen3" + if self == ScorecardDevice.cs_6490: + return "qualcomm-qcs6490" + if self == ScorecardDevice.cs_8250: + return "qualcomm-qcs8250" + if self == ScorecardDevice.cs_8550: + return "qualcomm-qcs8550" + if self == ScorecardDevice.cs_x_elite: + return "qualcomm-snapdragon-x-elite" + raise NotImplementedError(f"No chipset for {self.name}") + + def get_os(self) -> str: + for attr in self.get_reference_device().attributes: + if attr.startswith("os:"): + return attr[3:] + raise ValueError(f"OS Not found for device: {self.name}") + + +class ScorecardCompilePath(Enum): + TFLITE = 0 + QNN = 1 + ORT = 2 + + def __str__(self): + return self.name.lower() + + @property + def long_name(self): + return f"torchscript_onnx_{self.name.lower()}" + + def enabled(self) -> bool: + valid_test_runtimes = os.environ.get("WHITELISTED_TEST_RUNTIMES", "ALL") + return valid_test_runtimes == "ALL" or ( + self.get_runtime().name.lower() + in [x.lower() for x in valid_test_runtimes.split(",")] + ) + + @staticmethod + def all_enabled() -> List["ScorecardCompilePath"]: + return [x for x in ScorecardCompilePath if x.enabled()] + + @staticmethod + def get_parameterized_test_config( + aimet_model=False, + only_enabled_paths=True, + only_enabled_devices=True, + ) -> List[Tuple["ScorecardCompilePath", ScorecardDevice]]: + path_list: List[ScorecardCompilePath] = ScorecardCompilePath.all_enabled() if only_enabled_paths else ScorecardCompilePath # 
type: ignore + path_devices_dict = { + sc_path: sc_path.get_test_devices(aimet_model, only_enabled_devices) + for sc_path in path_list + } + return [ + (key, dev) for key, devices in path_devices_dict.items() for dev in devices + ] + + def get_runtime(self) -> TargetRuntime: + if self == ScorecardCompilePath.TFLITE: + return TargetRuntime.TFLITE + if self == ScorecardCompilePath.ORT: + return TargetRuntime.ORT + if self == ScorecardCompilePath.QNN: + return TargetRuntime.QNN + raise NotImplementedError() + + def get_test_devices( + self, aimet_model=False, only_enabled=True + ) -> List[ScorecardDevice]: + if self == ScorecardCompilePath.QNN: + devices = [ScorecardDevice.any, ScorecardDevice.cs_x_elite] + else: + devices = [ScorecardDevice.any] + + return [x for x in devices if x.enabled()] if only_enabled else devices + + def get_compile_options(self, aimet_model=False) -> str: + if aimet_model and self.get_runtime() == TargetRuntime.ORT: + # TODO(#10896): Restore quantize_io flag to + # the default set of flags used to target ORT. + # This flag can be removed when that happens. + return "--quantize_io" + return "" + + def get_job_cache_name( + self, + model: str, + device: ScorecardDevice = ScorecardDevice.any, + component: Optional[str] = None, + ): + if device not in self.get_test_devices(): + device = ScorecardDevice.any # default to the "generic" compilation path + return f"{model}_{self.name}{'-' + device.name if device != ScorecardDevice.any else ''}{'_' + component if component else ''}" + + +class ScorecardProfilePath(Enum): + TFLITE = 0 + QNN = 1 + ORT = 2 + ORT_DML_GPU = 3 + + def __str__(self): + return self.name.lower() + + @property + def long_name(self): + return f"torchscript_onnx_{self.name.lower()}" + + def enabled(self) -> bool: + valid_test_runtimes = os.environ.get("WHITELISTED_TEST_RUNTIMES", "ALL") + return valid_test_runtimes == "ALL" or ( + self.get_runtime().name.lower() + in [x.lower() for x in valid_test_runtimes.split(",")] + ) + + @staticmethod + def all_enabled() -> List["ScorecardProfilePath"]: + return [x for x in ScorecardProfilePath if x.enabled()] + + @staticmethod + def get_parameterized_test_config( + aimet_model=False, + only_enabled_paths=True, + only_enabled_devices=True, + ) -> List[Tuple["ScorecardProfilePath", ScorecardDevice]]: + path_list: List[ScorecardProfilePath] = ScorecardProfilePath.all_enabled() if only_enabled_paths else ScorecardProfilePath # type: ignore + path_devices_dict = { + sc_path: sc_path.get_test_devices(aimet_model, only_enabled_devices) + for sc_path in path_list + } + return [ + (key, dev) for key, devices in path_devices_dict.items() for dev in devices + ] + + def get_runtime(self) -> TargetRuntime: + if self == ScorecardProfilePath.TFLITE: + return TargetRuntime.TFLITE + if self in [ScorecardProfilePath.ORT, ScorecardProfilePath.ORT_DML_GPU]: + return TargetRuntime.ORT + if self == ScorecardProfilePath.QNN: + return TargetRuntime.QNN + raise NotImplementedError() + + def get_compile_path(self) -> ScorecardCompilePath: + if self == ScorecardProfilePath.TFLITE: + return ScorecardCompilePath.TFLITE + if self in [ScorecardProfilePath.ORT, ScorecardProfilePath.ORT_DML_GPU]: + return ScorecardCompilePath.ORT + if self == ScorecardProfilePath.QNN: + return ScorecardCompilePath.QNN + raise NotImplementedError() + + def get_profile_options(self) -> str: + if self == ScorecardProfilePath.ORT_DML_GPU: + return "--compute_unit gpu" + return "" + + def get_test_devices( + self, aimet_model=False, only_enabled=True + ) -> 
List[ScorecardDevice]: + if self == ScorecardProfilePath.TFLITE: + devices = [ + ScorecardDevice.cs_8_gen_2, + ScorecardDevice.cs_8_gen_3, + ScorecardDevice.cs_8550, + ] + ( + [ScorecardDevice.cs_6490, ScorecardDevice.cs_8250] + if aimet_model + else [] + ) + elif self == ScorecardProfilePath.ORT: + devices = [ + ScorecardDevice.cs_8_gen_2, + ScorecardDevice.cs_8_gen_3, + ScorecardDevice.cs_x_elite, + ] + elif self == ScorecardProfilePath.QNN: + devices = [ + ScorecardDevice.cs_8_gen_2, + ScorecardDevice.cs_8_gen_3, + ScorecardDevice.cs_x_elite, + ScorecardDevice.cs_8550, + ] + ([ScorecardDevice.cs_6490] if aimet_model else []) + elif self == ScorecardProfilePath.ORT_DML_GPU: + devices = [ScorecardDevice.cs_x_elite] + else: + raise NotImplementedError() + + return [x for x in devices if x.enabled()] if only_enabled else devices + + def get_job_cache_name( + self, + model: str, + device: ScorecardDevice, + component: Optional[str] = None, + ): + return ( + f"{model}_{self.name}-{device.name}{'_' + component if component else ''}" + ) diff --git a/qai_hub_models/utils/scorecard/job_summary.py b/qai_hub_models/utils/scorecard/job_summary.py index a9fc112f..683dda40 100644 --- a/qai_hub_models/utils/scorecard/job_summary.py +++ b/qai_hub_models/utils/scorecard/job_summary.py @@ -8,11 +8,11 @@ import qai_hub as hub -from qai_hub_models.models.common import TargetRuntime from qai_hub_models.utils.config_loaders import QAIHMModelCodeGen, QAIHMModelInfo from qai_hub_models.utils.scorecard.common import ( - REFERENCE_DEVICE_PER_SUPPORTED_CHIPSETS, - SCORECARD_DEVICE_NAME_TO_CHIPSET_NAME, + ScorecardCompilePath, + ScorecardDevice, + ScorecardProfilePath, ) @@ -20,11 +20,10 @@ class JobSummary: model_id: str job_id: Optional[str] - runtime: TargetRuntime + _device: ScorecardDevice def __post_init__(self): assert self.model_id - assert self.runtime # Verify Job Exists if self.job_id: assert self.job @@ -100,6 +99,8 @@ def quantized(self) -> str: @dataclass class CompileJobSummary(JobSummary): + path: ScorecardCompilePath + @classmethod def from_model_id( cls: Type["CompileJobSummary"], model_id: str, job_ids: Dict[str, str] @@ -123,25 +124,25 @@ def from_model_id( components = model_code_gen.default_components else: components = list(model_code_gen.components.keys()) + else: + components.append(None) # type: ignore - for runtime in TargetRuntime: - if not components: - model_runs.append( - cls( - model_id=model_info.name, - job_id=job_ids.get(f"{model_id}_{runtime.name}", None), - runtime=runtime, - ) - ) - else: - for component in components: + path: ScorecardCompilePath + for path in ScorecardCompilePath.all_enabled(): + for component in components: + for device in path.get_test_devices(model_code_gen.is_aimet): model_runs.append( cls( - model_id=component, + model_id=component or model_info.name, job_id=job_ids.get( - f"{model_id}_{runtime.name}_{component}", None + path.get_job_cache_name( + model=model_id, + device=device, + component=component, + ) ), - runtime=runtime, + path=path, + _device=device, ) ) @@ -162,7 +163,7 @@ def compile_job(self) -> Optional[hub.CompileJob]: @dataclass class ProfileJobSummary(JobSummary): - _chipset: str + path: ScorecardProfilePath @classmethod def from_model_id( @@ -187,43 +188,33 @@ def from_model_id( components = model_code_gen.default_components else: components = list(model_code_gen.components.keys()) + else: + components.append(None) # type: ignore - for runtime in TargetRuntime: - for device, chipset in SCORECARD_DEVICE_NAME_TO_CHIPSET_NAME.items(): - 
run_dev = f"{runtime.name}-{device}" - if not components: - if (job_id := job_ids.get(f"{model_id}_{run_dev}", None)) is None: - continue + path: ScorecardProfilePath + for path in ScorecardProfilePath.all_enabled(): + for component in components: + for device in path.get_test_devices(model_code_gen.is_aimet): model_runs.append( cls( - model_id=model_info.name, - job_id=job_id, - runtime=runtime, - _chipset=chipset, + model_id=component or model_info.name, + job_id=job_ids.get( + path.get_job_cache_name( + model=model_id, + device=device, + component=component, + ), + None, + ), + _device=device, + path=path, ) ) - else: - for component in components: - if ( - job_id := job_ids.get( - f"{model_id}_{run_dev}_{component}", None - ) - ) is None: - continue - model_runs.append( - cls( - model_id=component, - job_id=job_id, - runtime=runtime, - _chipset=chipset, - ) - ) return model_runs def __post_init__(self): super().__post_init__() - assert self.chipset in REFERENCE_DEVICE_PER_SUPPORTED_CHIPSETS if not self.skipped: assert isinstance(self.job, hub.ProfileJob) if self._job_status.success: @@ -233,7 +224,7 @@ def __post_init__(self): def chipset(self) -> str: """Chipset the job was run on.""" if not self.job: - return self._chipset + return self._device.get_chipset() hub_device = self.job.device for attr in hub_device.attributes: @@ -243,11 +234,7 @@ def chipset(self) -> str: @cached_property def device(self) -> hub.Device: - return ( - self.job.device - if self.job - else REFERENCE_DEVICE_PER_SUPPORTED_CHIPSETS[self.chipset] - ) + return self.job.device if self.job else self._device.get_reference_device() @cached_property def profile_job(self) -> Optional[hub.ProfileJob]: diff --git a/qai_hub_models/utils/scorecard/model_card.py b/qai_hub_models/utils/scorecard/model_card.py index 1b989dbc..ae5798e4 100644 --- a/qai_hub_models/utils/scorecard/model_card.py +++ b/qai_hub_models/utils/scorecard/model_card.py @@ -13,10 +13,11 @@ import qai_hub as hub -from qai_hub_models.models.common import TargetRuntime from qai_hub_models.utils.config_loaders import MODEL_IDS from qai_hub_models.utils.scorecard.common import ( - REFERENCE_DEVICE_PER_SUPPORTED_CHIPSETS, + ScorecardCompilePath, + ScorecardDevice, + ScorecardProfilePath, ) from qai_hub_models.utils.scorecard.job_summary import ( CompileJobSummary, @@ -127,9 +128,10 @@ def supported_oses() -> List[str]: __REFERENCE_DEVICE_INFO_PER_CHIPSET = {} -def get_reference_device_info(chipset: str) -> Dict[str, str]: +def get_reference_device_info(device: ScorecardDevice) -> Dict[str, str]: + chipset = device.get_chipset() if chipset not in __REFERENCE_DEVICE_INFO_PER_CHIPSET: - hub_device = REFERENCE_DEVICE_PER_SUPPORTED_CHIPSETS[chipset] + hub_device = device.get_reference_device() device_name = hub_device.name os_version = hub_device.os os_name, form_factor, manufacturer = "", "", "" @@ -153,28 +155,26 @@ def get_reference_device_info(chipset: str) -> Dict[str, str]: @dataclass -class ChipsetPerfSummary: - chipset_name: str - run_per_runtime: Dict[TargetRuntime, ProfileJobSummary] # Map +class DevicePerfSummary: + device: ScorecardDevice + run_per_path: Dict[ScorecardProfilePath, ProfileJobSummary] # Map @staticmethod - def from_runs(chipset_name: str, runtime_runs: List[ProfileJobSummary]): + def from_runs(device: ScorecardDevice, path_runs: List[ProfileJobSummary]): # Figure out unique devices in various baselines - run_per_runtime: Dict[TargetRuntime, ProfileJobSummary] = {} - for run in runtime_runs: - assert run.chipset == chipset_name # Chipset 
should match - run_per_runtime[run.runtime] = run + run_per_path: Dict[ScorecardProfilePath, ProfileJobSummary] = {} + for run in path_runs: + assert run._device == device # Device should match + run_per_path[run.path] = run - return ChipsetPerfSummary(chipset_name, run_per_runtime) + return DevicePerfSummary(device, run_per_path) def get_perf_card(self) -> Dict[str, str | Dict[str, str]]: perf_card: Dict[str, str | Dict[str, str]] = {} - for runtime, run in self.run_per_runtime.items(): + for path, run in self.run_per_path.items(): if not run.skipped: # Skipped runs are not included - perf_card[runtime.long_name] = run.performance_metrics - perf_card["reference_device_info"] = get_reference_device_info( - self.chipset_name - ) + perf_card[path.long_name] = run.performance_metrics + perf_card["reference_device_info"] = get_reference_device_info(self.device) perf_card["timestamp"] = datetime.datetime.utcnow().isoformat() + "Z" return perf_card @@ -185,29 +185,31 @@ def __repr__(self) -> str: @dataclass class ModelPerfSummary: model_id: str - runs_per_chipset: Dict[str, ChipsetPerfSummary] # Map + runs_per_device: Dict[ + ScorecardDevice, DevicePerfSummary + ] # Map @staticmethod def from_runs(model_id: str, device_runs: List[ProfileJobSummary]): # Figure out unique devices in various baselines - runs_per_chipset: Dict[str, List[ProfileJobSummary]] = {} + runs_per_device: Dict[ScorecardDevice, List[ProfileJobSummary]] = {} for run in device_runs: assert run.model_id == model_id # All should have the same model ID - list = runs_per_chipset.get(run.chipset or "", []) - runs_per_chipset[run.chipset] = list + list = runs_per_device.get(run._device, []) + runs_per_device[run._device] = list list.append(run) return ModelPerfSummary( model_id, { - chipset_name: ChipsetPerfSummary.from_runs(chipset_name, runs) - for chipset_name, runs in runs_per_chipset.items() + device: DevicePerfSummary.from_runs(device, runs) + for device, runs in runs_per_device.items() }, ) def get_perf_card(self) -> List[Dict[str, Union[str, Dict[str, str]]]]: perf_card = [] - for summary in self.runs_per_chipset.values(): + for summary in self.runs_per_device.values(): perf_card.append(summary.get_perf_card()) return perf_card @@ -226,8 +228,8 @@ def from_model_ids( """ Reads jobs for every `model_id` from the dictionary and creates summaries for each. `job_ids` format: Either: - ||| : job_id - || : job_id + _-_ : job_id + _- : job_id Returns models in this format: model_id: List[Summary] @@ -248,8 +250,8 @@ def from_model_id( """ Reads jobs for every `model_id` from the dictionary and creates summaries for each. 
`job_ids` format: Either: - <model_id>|<runtime>|<device>|<component> : job_id - <model_id>|<runtime>|<device> : job_id + <model_id>_<path>-<device>_<component> : job_id + <model_id>_<path>-<device> : job_id Returns models in this format: model_id: List[Summary] @@ -277,7 +279,9 @@ def from_runs(model_runs: List[ProfileJobSummary]): def get_chipsets(self) -> Set[str]: chips: Set[str] = set() for _, model_summary in self.runs_per_model.items(): - chips.update(model_summary.runs_per_chipset.keys()) + chips.update( + [x.get_chipset() for x in model_summary.runs_per_device.keys()] + ) return chips def get_perf_card(self) -> Dict[str, str | List[Any] | Dict[str, Any]]: @@ -302,20 +306,44 @@ def __repr__(self): return pprint.pformat(self.get_perf_card()) +@dataclass +class DeviceCompileSummary: + device: ScorecardDevice + run_per_path: Dict[ScorecardCompilePath, CompileJobSummary] # Map + + @staticmethod + def from_runs(device: ScorecardDevice, path_runs: List[CompileJobSummary]): + # Figure out unique devices in various baselines + run_per_path: Dict[ScorecardCompilePath, CompileJobSummary] = {} + for run in path_runs: + assert run._device == device # Device should match + run_per_path[run.path] = run + + return DeviceCompileSummary(device, run_per_path) + + @dataclass class ModelCompileSummary: model_id: str - runs_per_runtime: Dict[ - TargetRuntime, CompileJobSummary + runs_per_device: Dict[ + ScorecardDevice, DeviceCompileSummary ] # Map @staticmethod - def from_runs(model_id: str, runtime_runs: List[CompileJobSummary]): - run_per_runtime: Dict[TargetRuntime, CompileJobSummary] = {} - for run in runtime_runs: + def from_runs(model_id: str, path_runs: List[CompileJobSummary]): + runs_per_device: Dict[ScorecardDevice, List[CompileJobSummary]] = {} + for run in path_runs: assert run.model_id == model_id # model id should match - run_per_runtime[run.runtime] = run - return ModelCompileSummary(model_id, run_per_runtime) + list = runs_per_device.get(run._device, []) + runs_per_device[run._device] = list + list.append(run) + return ModelCompileSummary( + model_id, + { + device: DeviceCompileSummary.from_runs(device, runs) + for device, runs in runs_per_device.items() + }, + ) @dataclass @@ -329,8 +357,10 @@ def from_model_ids( """ Reads jobs for every `model_id` from the dictionary and creates summaries for each. `job_ids` format: Either: - <model_id>|<runtime>|<device>|<component> : job_id - <model_id>|<runtime>|<device> : job_id + <model_id>_<path>-<device>_<component> : job_id + <model_id>_<path>-<device> : job_id + <model_id>_<path>_<component> : job_id + <model_id>_<path> : job_id Returns models in this format: model_id: List[Summary] @@ -351,8 +381,10 @@ def from_model_id( """ Reads jobs for every `model_id` from the dictionary and creates summaries for each. `job_ids` format: Either: - <model_id>|<runtime>|<device>|<component> : job_id - <model_id>|<runtime>|<device> : job_id + <model_id>_<path>-<device>_<component> : job_id + <model_id>_<path>-<device> : job_id + <model_id>_<path>_<component> : job_id + <model_id>_<path> : job_id Returns models in this format: model_id: List[Summary] diff --git a/qai_hub_models/utils/system_info.py b/qai_hub_models/utils/system_info.py new file mode 100644 index 00000000..50f8d531 --- /dev/null +++ b/qai_hub_models/utils/system_info.py @@ -0,0 +1,49 @@ +# --------------------------------------------------------------------- +# Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. +# SPDX-License-Identifier: BSD-3-Clause +# --------------------------------------------------------------------- + +import psutil + +from qai_hub_models.utils.printing import print_with_box + + +def has_recommended_memory(required_memory_in_gb: float) -> None: + """ + Prints a warning if the system has less memory (RAM + swap space) than recommended. 
+ """ + total_ram = psutil.virtual_memory().total + total_swap = psutil.swap_memory().total + + # Get total memory in GB + total_ram_in_gb = total_ram / 1024**3 + total_swap_in_gb = total_swap / 1024**3 + + total_memory_in_gb = int(total_ram_in_gb + total_swap_in_gb) + + if required_memory_in_gb > total_memory_in_gb: + recommended_swap = int(required_memory_in_gb - total_ram_in_gb) + 1 + warning_msgs = [ + f"Recommended minimum memory of {required_memory_in_gb} GB memory (RAM + swap-space), found {total_memory_in_gb} GB.", + "You might see process killed error due to OOM during export/demo.", + "", + "Please increase your swap-space temporarily as a work-around. It might slow down export but allow you to export successfully.", + "You can refer to https://askubuntu.com/questions/178712/how-to-increase-swap-space for instructions", + "or run following commands: ", + "", + "sudo swapoff -a", + "# bs=", + "# count=number of to allocate for swapfile", + "# Total size = * count", + "# = 1 MB * 40k = ~40GB", + f"sudo dd if=/dev/zero of=/local/mnt/swapfile bs=1M count={recommended_swap}k", + "" "# Set the correct permissions", + "sudo chmod 0600 /local/mnt/swapfile", + "", + "sudo mkswap /local/mnt/swapfile # Set up a Linux swap area", + "sudo swapon /local/mnt/swapfile # Turn the swap on", + "", + "You can update `count` to increase swap space that works for machine." + "NOTE: above commands does not persist through reboot.", + ] + print_with_box(warning_msgs) diff --git a/scripts/build_and_test.py b/scripts/build_and_test.py index 7768a28f..471bae6c 100755 --- a/scripts/build_and_test.py +++ b/scripts/build_and_test.py @@ -374,7 +374,7 @@ def test_all_models(self, plan: Plan, step_id: str = "test_all_models") -> str: PyTestModelsTask( self.python_executable, all_models, - [], + REPRESENTATIVE_EXPORT_MODELS, self.venv_path, venv_for_each_model=False, use_shared_cache=True, diff --git a/scripts/examples/quantize_yolo.py b/scripts/examples/quantize_detector_coco.py similarity index 92% rename from scripts/examples/quantize_yolo.py rename to scripts/examples/quantize_detector_coco.py index 66f3df10..b5e405e3 100644 --- a/scripts/examples/quantize_yolo.py +++ b/scripts/examples/quantize_detector_coco.py @@ -3,8 +3,8 @@ # SPDX-License-Identifier: BSD-3-Clause # --------------------------------------------------------------------- """ -This is a sample script showing how to take a AIMET model zoo model without -pre-computed activations, and compute those activations using QAIHM. +This is a sample script showing how to compute AIMET encodings for a YOLO model + using the COCO dataset. This script assumes the model is added to QAIHM, but is missing quantization parameters. """ import argparse @@ -14,6 +14,7 @@ from torch.utils.data import DataLoader from qai_hub_models.datasets.coco import CocoDataset +from qai_hub_models.models.yolonas_quantized.model import YoloNASQuantizable from qai_hub_models.models.yolov7_quantized.model import YoloV7Quantizable from qai_hub_models.models.yolov8_det_quantized.model import YoloV8DetectorQuantizable @@ -25,6 +26,7 @@ MODELS = { "yolov7": YoloV7Quantizable, "yolov8": YoloV8DetectorQuantizable, + "yolonas": YoloNASQuantizable, } if __name__ == "__main__": diff --git a/scripts/examples/quantize_ffnet.py b/scripts/examples/quantize_ffnet.py index beb54b5b..eca22d69 100644 --- a/scripts/examples/quantize_ffnet.py +++ b/scripts/examples/quantize_ffnet.py @@ -2,6 +2,11 @@ # Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. 
# SPDX-License-Identifier: BSD-3-Clause # --------------------------------------------------------------------- +""" +This is a sample script showing how to compute AIMET encodings for an FFNet model + using the Cityscapes dataset. +This script assumes the model is added to QAISM, but is missing quantization parameters. +""" import argparse from pathlib import Path @@ -20,13 +25,6 @@ "ffnet_78s": FFNet78SQuantizable, } - -""" -This is a sample script showing how to take a AIMET model zoo model without -pre-computed activations, and compute those activations using QAISM. - -This script assumes the model is added to QAISM, but is missing quantization parameters. -""" if __name__ == "__main__": # Args parser = argparse.ArgumentParser() diff --git a/scripts/examples/quantize_hrnet.py b/scripts/examples/quantize_hrnet.py index b8c40b13..835beb6c 100644 --- a/scripts/examples/quantize_hrnet.py +++ b/scripts/examples/quantize_hrnet.py @@ -3,9 +3,8 @@ # SPDX-License-Identifier: BSD-3-Clause # --------------------------------------------------------------------- """ -This is a sample script showing how to take a AIMET model zoo model without -pre-computed activations, and compute those activations using QAIHM. - +This is a sample script showing how to compute AIMET encodings for an HRNet model + using the COCO dataset. This script assumes the model is added to QAIHM, but is missing quantization parameters. """ import argparse diff --git a/scripts/examples/quantize_imagenet_classifier.py b/scripts/examples/quantize_imagenet_classifier.py index 980bfef6..907e877b 100644 --- a/scripts/examples/quantize_imagenet_classifier.py +++ b/scripts/examples/quantize_imagenet_classifier.py @@ -3,8 +3,8 @@ # SPDX-License-Identifier: BSD-3-Clause # --------------------------------------------------------------------- """ -This is a sample script showing how to take a AIMET model zoo model without -pre-computed activations, and compute those activations using QAIHM. +This is a sample script showing how to compute AIMET encodings for an + Imagenet Classifier using the Imagenette dataset. This script assumes the model is added to QAIHM, but is missing quantization parameters. 
""" import argparse @@ -14,6 +14,12 @@ from torch.utils.data import DataLoader from qai_hub_models.datasets.imagenette import ImagenetteDataset +from qai_hub_models.models.convnext_tiny_w8a8_quantized.model import ( + ConvNextTinyW8A8Quantizable, +) +from qai_hub_models.models.convnext_tiny_w8a16_quantized.model import ( + ConvNextTinyW8A16Quantizable, +) from qai_hub_models.models.googlenet_quantized.model import GoogLeNetQuantizable from qai_hub_models.models.inception_v3_quantized.model import InceptionNetV3Quantizable from qai_hub_models.models.mobilenet_v2_quantized.model import MobileNetV2Quantizable @@ -45,6 +51,8 @@ "shufflenet_v2": ShufflenetV2Quantizable, "squeezenet1_1": SqueezeNetQuantizable, "wideresnet50": WideResNet50Quantizable, + "convnext_tiny_w8a8": ConvNextTinyW8A8Quantizable, + "convnext_tiny_w8a16": ConvNextTinyW8A16Quantizable, } if __name__ == "__main__": diff --git a/scripts/examples/quantize_deeplabv3.py b/scripts/examples/quantize_segmenter_voc.py similarity index 91% rename from scripts/examples/quantize_deeplabv3.py rename to scripts/examples/quantize_segmenter_voc.py index ad160ee7..2efa487a 100644 --- a/scripts/examples/quantize_deeplabv3.py +++ b/scripts/examples/quantize_segmenter_voc.py @@ -3,9 +3,8 @@ # SPDX-License-Identifier: BSD-3-Clause # --------------------------------------------------------------------- """ -This is a sample script showing how to take a AIMET model zoo model without -pre-computed activations, and compute those activations using QAIHM. - +This is a sample script showing how to compute AIMET encodings for a DeepLab model + using the PASCAL VOC dataset. This script assumes the model is added to QAIHM, but is missing quantization parameters. """ import argparse @@ -18,9 +17,11 @@ from qai_hub_models.models.deeplabv3_plus_mobilenet_quantized.model import ( DeepLabV3PlusMobilenetQuantizable, ) +from qai_hub_models.models.fcn_resnet50_quantized.model import FCN_ResNet50Quantizable MODELS = { "deeplabv3_plus_mobilenet": DeepLabV3PlusMobilenetQuantizable, + "fcn_resnet50": FCN_ResNet50Quantizable, } if __name__ == "__main__": @@ -69,7 +70,6 @@ evaluator = model.get_evaluator() evaluator.add_from_dataset(model, dataloader, args.num_iter) accuracy_fp32 = evaluator.get_accuracy_score() - model.quantize(dataloader, args.num_iter, data_has_gt=True) evaluator.reset() evaluator.add_from_dataset(model, dataloader, args.num_iter) diff --git a/scripts/examples/quantize_superresolution.py b/scripts/examples/quantize_superresolution.py index 0a354e54..6807c7d2 100644 --- a/scripts/examples/quantize_superresolution.py +++ b/scripts/examples/quantize_superresolution.py @@ -3,53 +3,88 @@ # SPDX-License-Identifier: BSD-3-Clause # --------------------------------------------------------------------- """ -This is a sample script showing how to take a AIMET model zoo model without -pre-computed activations, and compute those activations using QAISM. - +This is a sample script showing how to compute AIMET encodings for an SuperResolution + model using the BSD300 dataset. This script assumes the model is added to QAISM, but is missing quantization parameters. 
""" import argparse -import importlib from pathlib import Path import torch from torch.utils.data import DataLoader from qai_hub_models.datasets.bsd300 import BSD300Dataset +from qai_hub_models.models.quicksrnetlarge_quantized.model import ( + QuickSRNetLargeQuantizable, +) +from qai_hub_models.models.quicksrnetmedium_quantized.model import ( + QuickSRNetMediumQuantizable, +) +from qai_hub_models.models.quicksrnetsmall_quantized.model import ( + QuickSRNetSmallQuantizable, +) +from qai_hub_models.models.xlsr_quantized.model import XLSRQuantizable from qai_hub_models.utils.quantization_aimet import ( # isort: skip AIMETQuantizableMixin, ) +MODELS = { + "xlsr": XLSRQuantizable, + "quicksrnetsmall": QuickSRNetSmallQuantizable, + "quicksrnetmedium": QuickSRNetMediumQuantizable, + "quicksrnetlarge": QuickSRNetLargeQuantizable, +} + + if __name__ == "__main__": # Args parser = argparse.ArgumentParser() parser.add_argument( - "--num-iter", type=int, default=1, help="Number of batches to use." + "--num-iter", type=int, default=128, help="Number of batches to use." ) parser.add_argument( "--batch-size", type=int, - default=128, + default=1, help="Batch size to use on each iteration.", ) parser.add_argument( "--model", type=str, - default="sesr_m5_quantized", + choices=MODELS.keys(), + required=True, help="Name of the model folder to compute encodings. This script expects a super resolution model with a scaling parameter, eg SESR M5 Quantized.", ) + parser.add_argument( + "--output-dir", + type=str, + default=None, + help="Directory where encodings should be stored. Defaults to ./build.", + ) + parser.add_argument( + "--output-name", + type=str, + default=None, + help="Encodings filename. Defaults to _encodings.", + ) parser.add_argument( "--seed", type=int, default=42, help="Manual seed to ensure reproducibility for quantization.", ) + parser.add_argument( + "--scale-factor", + type=int, + default=4, + help="Scaling factor of the model.", + ) args = parser.parse_args() - module = importlib.import_module(f"qai_hub_models.models.{args.model}") + model = MODELS[args.model].from_pretrained(aimet_encodings=None) # Load dataset - dataset = BSD300Dataset(scaling_factor=module.model.SCALING_FACTOR) + dataset = BSD300Dataset(scaling_factor=args.scale_factor) torch.manual_seed(args.seed) # Pass it to the dataloader dataloader = DataLoader( @@ -57,7 +92,6 @@ ) # Load model and confirm it's a quantizable type. 
- model = module.Model.from_pretrained(aimet_encodings=None) assert isinstance(model, AIMETQuantizableMixin) evaluator = model.get_evaluator() @@ -73,8 +107,10 @@ evaluator.add_from_dataset(model, dataloader, args.num_iter) accuracy_int8 = evaluator.get_accuracy_score() - print(f"FP32 PSNR: {accuracy_fp32} dB") - print(f"INT8 PSNR: {accuracy_int8} dB") + print(f"FP32 PSNR: {accuracy_fp32:.2f} dB") + print(f"INT8 PSNR: {accuracy_int8:.2f} dB") # Export encodings - model.quant_sim.save_encodings_to_json(Path() / "build", module.MODEL_ID) + output_path = args.output_dir or str(Path() / "build") + output_name = args.output_name or f"{args.model}_quantized_encodings" + model.quant_sim.save_encodings_to_json(output_path, output_name) diff --git a/scripts/examples/test_numerics_imagenet_classifier_quantized.py b/scripts/examples/test_numerics_imagenet_classifier_quantized.py index e278ff1f..d9191056 100644 --- a/scripts/examples/test_numerics_imagenet_classifier_quantized.py +++ b/scripts/examples/test_numerics_imagenet_classifier_quantized.py @@ -16,6 +16,12 @@ from qai_hub_models.datasets.imagenette import ImagenetteDataset from qai_hub_models.models._shared.imagenet_classifier.model import ImagenetClassifier +from qai_hub_models.models.convnext_tiny_w8a8_quantized.model import ( + ConvNextTinyW8A8Quantizable, +) +from qai_hub_models.models.convnext_tiny_w8a16_quantized.model import ( + ConvNextTinyW8A16Quantizable, +) from qai_hub_models.models.inception_v3_quantized.model import InceptionNetV3Quantizable from qai_hub_models.models.mobilenet_v2_quantized.model import MobileNetV2Quantizable from qai_hub_models.models.mobilenet_v3_large_quantized.model import ( @@ -89,69 +95,47 @@ def test_dataloader_is_deterministic(data_loaders): assert labels[:5].tolist() == expected_test_labels -@pytest.fixture( - scope="module", - params=[ - # Class, AIMET accuracy - (MobileNetV2Quantizable, 0.8100), - (MobileNetV3LargeQuantizable, 0.8430), - (ResNet18Quantizable, 0.8010), - (ResNet50Quantizable, 0.8520), - (ResNet101Quantizable, 0.8530), - (ResNeXt50Quantizable, 0.8880), - (ResNeXt101Quantizable, 0.9250), - (SqueezeNetQuantizable, 0.6410), - (RegNetQuantizable, 0.8750), - (WideResNet50Quantizable, 0.9190), - (ShufflenetV2Quantizable, 0.6740), - (InceptionNetV3Quantizable, 0.8430), - ], -) -def quantized_model(request, data_loaders, test_data): - """ - Create encoding from calibration data and returned quantized model with - validated off-target accuracy computed on QuantSim - """ - img_test, label_test, hub_dataset = test_data - calib_loader, test_loader = data_loaders - model_cls, target_sim_acc = request.param - model = model_cls.from_pretrained(aimet_encodings=None) - - # Calibration in quantization - num_calib_batches = 3 - model.quantize(calib_loader, num_calib_batches, data_has_gt=True) - - # QuantSim evaluation on eval set - evaluator = model.get_evaluator() - - batch_size = 32 - for i in tqdm(list(range(0, img_test.size(0), batch_size)), desc="QuantSim eval"): - img_batch = img_test[i : i + batch_size] - label_batch = label_test[i : i + batch_size] - - sim_out = model(img_batch).detach() - evaluator.add_batch(sim_out, label_batch) - - sim_acc = evaluator.get_accuracy_score() - print(f"{model_cls=}, {sim_acc=}") - np.testing.assert_allclose(target_sim_acc, sim_acc, atol=0.01) - return model - - @on_device @pytest.mark.parametrize( - "source_model_format,target_runtime,hub_needs_calib_data", + "model_cls,target_runtime,expected_size_mb,expected_acc", [ - (SourceModelFormat.ONNX, TargetRuntime.TFLITE, 
False), - (SourceModelFormat.ONNX, TargetRuntime.QNN, False), + (MobileNetV2Quantizable, TargetRuntime.TFLITE, 3.64, 0.816), + (MobileNetV2Quantizable, TargetRuntime.QNN, 4.09, 0.813), + (MobileNetV3LargeQuantizable, TargetRuntime.TFLITE, 5.72, 0.848), + # MobileNetV3LargeQuantizable, TargetRuntime.QNN fails to convert (AISW-87206) + (ResNet18Quantizable, TargetRuntime.TFLITE, 11.30, 0.805), + (ResNet18Quantizable, TargetRuntime.QNN, 11.65, 0.796), + (ResNet50Quantizable, TargetRuntime.TFLITE, 25.09, 0.847), + (ResNet50Quantizable, TargetRuntime.QNN, 25.41, 0.848), + (ResNet101Quantizable, TargetRuntime.TFLITE, 43.88, 0.858), + (ResNet101Quantizable, TargetRuntime.QNN, 44.08, 0.831), + (ResNeXt50Quantizable, TargetRuntime.TFLITE, 24.77, 0.891), + (ResNeXt50Quantizable, TargetRuntime.QNN, 25.03, 0.893), + (ResNeXt101Quantizable, TargetRuntime.TFLITE, 87.28, 0.926), + # Fails to infer (#9827) + (ResNeXt101Quantizable, TargetRuntime.QNN, 87.26, None), + (SqueezeNetQuantizable, TargetRuntime.TFLITE, 1.30, 0.637), + (SqueezeNetQuantizable, TargetRuntime.QNN, 1.69, 0.636), + (RegNetQuantizable, TargetRuntime.TFLITE, 15.42, 0.872), + (RegNetQuantizable, TargetRuntime.QNN, 15.89, 0.876), + (WideResNet50Quantizable, TargetRuntime.TFLITE, 66.59, 0.923), + (WideResNet50Quantizable, TargetRuntime.QNN, 66.86, 0.922), + (ShufflenetV2Quantizable, TargetRuntime.TFLITE, 1.46, 0.674), + (ShufflenetV2Quantizable, TargetRuntime.QNN, 1.99, 0.670), + (InceptionNetV3Quantizable, TargetRuntime.TFLITE, 23.32, 0.841), + (InceptionNetV3Quantizable, TargetRuntime.QNN, 23.85, 0.845), + # ConvNextTinyW8A8Quantizable, SourceModelFormat.ONNX not supported yet (#10862) + (ConvNextTinyW8A8Quantizable, TargetRuntime.QNN, 28.33, 0.846), + # ConvNextTinyW8A16Quantizable, SourceModelFormat.ONNX not supported yet (#10862) + (ConvNextTinyW8A16Quantizable, TargetRuntime.QNN, 28.34, 0.876), ], ) -def test_make_encoding_w8a8_accuracy( - source_model_format, +def test_quantized_accuracy( + model_cls, target_runtime, - hub_needs_calib_data, + expected_size_mb, + expected_acc, test_data, - quantized_model, data_loaders, ): """ @@ -160,133 +144,18 @@ def test_make_encoding_w8a8_accuracy( Note: We don't run profile job to get perf here but leave that to the score card. 
""" - model = quantized_model - - expected_size_mb_and_acc = { - (SourceModelFormat.ONNX, TargetRuntime.TFLITE, MobileNetV2Quantizable): ( - 3.64, - 0.801, - ), - (SourceModelFormat.ONNX, TargetRuntime.QNN, MobileNetV2Quantizable): ( - 4.02, - 0.801, - ), - (SourceModelFormat.ONNX, TargetRuntime.TFLITE, MobileNetV3LargeQuantizable): ( - 5.79, - 0.859, - ), - (SourceModelFormat.ONNX, TargetRuntime.QNN, MobileNetV3LargeQuantizable): ( - None, # Fails to convert (AISW-87206) - None, - ), - (SourceModelFormat.ONNX, TargetRuntime.TFLITE, ResNet18Quantizable): ( - 11.30, - 0.778, - ), - (SourceModelFormat.ONNX, TargetRuntime.QNN, ResNet18Quantizable): ( - 11.61, - 0.789, - ), - (SourceModelFormat.ONNX, TargetRuntime.TFLITE, ResNet50Quantizable): ( - 25.09, - 0.837, - ), - (SourceModelFormat.ONNX, TargetRuntime.QNN, ResNet50Quantizable): ( - 25.33, - 0.834, - ), - (SourceModelFormat.ONNX, TargetRuntime.TFLITE, ResNet101Quantizable): ( - 43.89, - 0.827, - ), - (SourceModelFormat.ONNX, TargetRuntime.QNN, ResNet101Quantizable): ( - 44.08, - 0.831, - ), - (SourceModelFormat.ONNX, TargetRuntime.TFLITE, ResNeXt50Quantizable): ( - 24.77, - 0.888, - ), - (SourceModelFormat.ONNX, TargetRuntime.QNN, ResNeXt50Quantizable): ( - 24.96, - 0.888, - ), - (SourceModelFormat.ONNX, TargetRuntime.TFLITE, ResNeXt101Quantizable): ( - 87.29, - 0.906, - ), - (SourceModelFormat.ONNX, TargetRuntime.QNN, ResNeXt101Quantizable): ( - 87.11, - None, # Fails to infer (#9827) - ), - (SourceModelFormat.ONNX, TargetRuntime.TFLITE, SqueezeNetQuantizable): ( - 1.30, - 0.609, - ), - (SourceModelFormat.ONNX, TargetRuntime.QNN, SqueezeNetQuantizable): ( - 1.66, - 0.609, - ), - (SourceModelFormat.ONNX, TargetRuntime.TFLITE, RegNetQuantizable): ( - 15.43, - 0.859, - ), - (SourceModelFormat.ONNX, TargetRuntime.QNN, RegNetQuantizable): ( - 15.77, - 0.859, - ), - (SourceModelFormat.ONNX, TargetRuntime.TFLITE, WideResNet50Quantizable): ( - 66.59, - 0.900, - ), - (SourceModelFormat.ONNX, TargetRuntime.QNN, WideResNet50Quantizable): ( - 66.78, - 0.897, - ), - (SourceModelFormat.ONNX, TargetRuntime.TFLITE, ShufflenetV2Quantizable): ( - 1.47, - 0.661, - ), - (SourceModelFormat.ONNX, TargetRuntime.QNN, ShufflenetV2Quantizable): ( - 1.90, - 0.661, - ), - (SourceModelFormat.ONNX, TargetRuntime.TFLITE, InceptionNetV3Quantizable): ( - 23.33, - 0.843, - ), - (SourceModelFormat.ONNX, TargetRuntime.QNN, InceptionNetV3Quantizable): ( - 23.81, - 0.844, - ), - } - expected_size_mb, expected_acc = expected_size_mb_and_acc[ - (source_model_format, target_runtime, model.__class__) - ] - if expected_size_mb is None: - pytest.skip("Fails to compile") + model = model_cls.from_pretrained() img_test, label_test, hub_dataset = test_data calib_loader, test_loader = data_loaders - # calibration data - calibration_data = None - if hub_needs_calib_data: - # AIMET export has missing encoding and needs calibration data - num_calib_batches = 3 - calib_imgs = [] - for b, (img_calib, labels) in enumerate(iter(calib_loader)): - if b >= num_calib_batches: - break - img_np = img_calib.numpy() - calib_imgs.extend(np.split(img_np, img_np.shape[0])) - calibration_data = {list(model.get_input_spec().keys())[0]: calib_imgs} + calibration_data = model.get_calibration_data(target_runtime) # On-device inference device = hub.Device("Samsung Galaxy S23") hub_model = compile_zoo_model_to_hub( model=model, - source_model_format=source_model_format, + source_model_format=SourceModelFormat.ONNX, device=device, target_runtime=target_runtime, calibration_data=calibration_data, @@ -294,10 
+163,7 @@ def test_make_encoding_w8a8_accuracy( # Make sure model is quantized tgt_model_size_mb = get_model_size_mb(hub_model.model) - model_cls = quantized_model.__class__ - print( - f"{model_cls=}, {source_model_format=}, {target_runtime=}, {tgt_model_size_mb=}" - ) + print(f"{model_cls=}, {target_runtime=}, {tgt_model_size_mb=}") np.testing.assert_allclose(expected_size_mb, tgt_model_size_mb, rtol=0.1) if expected_acc is None: @@ -308,5 +174,5 @@ def test_make_encoding_w8a8_accuracy( evaluator = model.get_evaluator() evaluator.add_batch(hub_out, label_test) hub_acc = evaluator.get_accuracy_score() - print(f"{model_cls=}, {source_model_format=}, {target_runtime=}, {hub_acc=}") + print(f"{model_cls=}, {target_runtime=}, {hub_acc=}") np.testing.assert_allclose(expected_acc, hub_acc, atol=0.01) diff --git a/scripts/examples/yolov6_evaluation.py b/scripts/examples/yolov6_evaluation.py index 6ecb2fe7..367e3d71 100644 --- a/scripts/examples/yolov6_evaluation.py +++ b/scripts/examples/yolov6_evaluation.py @@ -3,9 +3,7 @@ # SPDX-License-Identifier: BSD-3-Clause # --------------------------------------------------------------------- """ -This is a sample script showing how to take a AIMET model zoo model without -pre-computed activations, and compute those activations using QAIHM. -This script assumes the model is added to QAIHM, but is missing quantization parameters. +This is a sample script showing how to evaluate accuracy (mAP) of a yolov6 model. Packages to install: pycocotools, object-detection-metrics==0.4.post1, shapely """ diff --git a/scripts/tasks/changes.py b/scripts/tasks/changes.py index 0f806300..bebe6068 100644 --- a/scripts/tasks/changes.py +++ b/scripts/tasks/changes.py @@ -37,6 +37,7 @@ "qai_hub_models/models/resnet18_quantized/model.py", ], "qai_hub_models/utils/printing.py": REPRESENTATIVE_EXPORT_FILES, + "qai_hub_models/utils/config_loaders.py": REPRESENTATIVE_EXPORT_FILES, } diff --git a/scripts/tasks/venv.py b/scripts/tasks/venv.py index b1c08eb0..f73fc35c 100644 --- a/scripts/tasks/venv.py +++ b/scripts/tasks/venv.py @@ -52,17 +52,8 @@ def __init__( ) -> None: tasks = [] - extras_str = f"[{','.join(extras)}]" if extras else "" - tasks.append( - RunCommandsWithVenvTask( - group_name=f"Install QAIHM{extras_str}", - venv=venv_path, - commands=[ - f'pip install -e "{PY_PACKAGE_INSTALL_ROOT}{extras_str}" -f https://download.openmmlab.com/mmcv/dist/cpu/torch1.13/index.html', - ], - ) - ) - + # Install AIMET before model requirements to give preference over + # model specific versions. 
if include_aimet: if can_support_aimet(): if is_package_installed("aimet_torch", venv_path): @@ -95,6 +86,17 @@ def __init__( ) ) + extras_str = f"[{','.join(extras)}]" if extras else "" + tasks.append( + RunCommandsWithVenvTask( + group_name=f"Install QAIHM{extras_str}", + venv=venv_path, + commands=[ + f'pip install -e "{PY_PACKAGE_INSTALL_ROOT}{extras_str}" -f https://download.openmmlab.com/mmcv/dist/cpu/torch1.13/index.html', + ], + ) + ) + super().__init__( f"Create Local QAIHM{extras_str} Virtual Environment at {venv_path}", [task for task in tasks], diff --git a/setup.py b/setup.py index a40bd1ae..22aa2931 100644 --- a/setup.py +++ b/setup.py @@ -29,10 +29,12 @@ def get_data_files() -> List[str]: data_files = [] for ext in data_file_extensions: data_files.extend( - glob.glob(f"{str(qaihm_path.absolute())}/**/*.{ext}", recursive=True) + glob.glob( + f"{str(qaihm_path.absolute() / '**' / f'*.{ext}')}", recursive=True + ) ) for i in range(0, len(data_files)): - data_files[i] = data_files[i].split("/qai_hub_models/")[1] + data_files[i] = data_files[i].split("qai_hub_models")[1][1:] return data_files
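The final setup.py hunk derives each data file's relative path by splitting on the package name alone and dropping the single leading separator, rather than splitting on "/qai_hub_models/", so the split no longer assumes forward slashes. A minimal sketch of that behavior, using hypothetical example paths (not taken from the diff):

# Sketch: why data_files[i].split("qai_hub_models")[1][1:] is separator-agnostic.
# The example paths below are hypothetical; only the split logic mirrors setup.py.
posix_path = "/repo/qai_hub_models/models/googlenet/info.yaml"
windows_path = r"C:\repo\qai_hub_models\models\googlenet\info.yaml"

# Old logic: requires the path to contain a literal "/qai_hub_models/".
assert posix_path.split("/qai_hub_models/")[1] == "models/googlenet/info.yaml"

# New logic: split on the package name, then strip the leading "/" or "\".
assert posix_path.split("qai_hub_models")[1][1:] == "models/googlenet/info.yaml"
assert windows_path.split("qai_hub_models")[1][1:] == r"models\googlenet\info.yaml"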