diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 633a8882..cb4e51f1 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -37,6 +37,7 @@ repos: args: [--allow-multiple-documents] - id: trailing-whitespace exclude: '\.diff$' + args: [--markdown-linebreak-ext=md] - id: check-added-large-files args: ['--maxkb=1024'] - id: check-merge-conflict @@ -56,6 +57,7 @@ repos: rev: v0.9.0.6 hooks: - id: shellcheck + exclude: '\.yml$' - repo: https://github.com/pycqa/isort rev: 5.12.0 hooks: diff --git a/README.md b/README.md index eb316a71..716bd959 100644 --- a/README.md +++ b/README.md @@ -12,12 +12,21 @@ memory etc.) and ready to deploy on Qualcomm® devices. * Access the models through [Hugging Face](https://huggingface.co/qualcomm). * [Sign up](https://myaccount.qualcomm.com/signup) to run these models on hosted Qualcomm® devices. +Supported **python package host machine** Operating Systems: +- Linux (x86, ARM) +- Windows (x86) +- Windows (ARM-- ONLY via x86 Python, not ARM Python) +- MacOS (x86, ARM) + Supported runtimes * [TensorFlow Lite](https://www.tensorflow.org/lite) * [Qualcomm AI Engine Direct](https://www.qualcomm.com/developer/artificial-intelligence#overview) +* [ONNX](https://onnxruntime.ai/docs/execution-providers/QNN-ExecutionProvider.html) -Supported operating systems: -* Android 11+ +Models can be deployed on: +* Android +* Windows +* Linux Supported compute units * CPU, GPU, NPU (includes [Hexagon DSP](https://developer.qualcomm.com/software/hexagon-dsp-sdk/dsp-processor), [HTP](https://developer.qualcomm.com/hardware/qualcomm-innovators-development-kit/ai-resources-overview/ai-hardware-cores-accelerators)) @@ -28,12 +37,13 @@ Supported precision Supported chipsets * [Snapdragon 845](https://www.qualcomm.com/products/mobile/snapdragon/smartphones/snapdragon-8-series-mobile-platforms/snapdragon-845-mobile-platform), [Snapdragon 855/855+](https://www.qualcomm.com/products/mobile/snapdragon/smartphones/snapdragon-8-series-mobile-platforms/snapdragon-855-mobile-platform), [Snapdragon 865/865+](https://www.qualcomm.com/products/mobile/snapdragon/smartphones/snapdragon-8-series-mobile-platforms/snapdragon-865-plus-5g-mobile-platform), [Snapdragon 888/888+](https://www.qualcomm.com/products/mobile/snapdragon/smartphones/snapdragon-8-series-mobile-platforms/snapdragon-888-5g-mobile-platform) -* [Snapdragon 8 Gen 1](https://www.qualcomm.com/products/mobile/snapdragon/smartphones/snapdragon-8-series-mobile-platforms/snapdragon-8-gen-1-mobile-platform), [Snapdragon 8 Gen 2](https://www.qualcomm.com/products/mobile/snapdragon/smartphones/snapdragon-8-series-mobile-platforms/snapdragon-8-gen-2-mobile-platform), [Snapdragon 8 Gen 3](https://www.qualcomm.com/products/mobile/snapdragon/smartphones/snapdragon-8-series-mobile-platforms/snapdragon-8-gen-3-mobile-platform) +* [Snapdragon 8 Gen 1](https://www.qualcomm.com/products/mobile/snapdragon/smartphones/snapdragon-8-series-mobile-platforms/snapdragon-8-gen-1-mobile-platform), [Snapdragon 8 Gen 2](https://www.qualcomm.com/products/mobile/snapdragon/smartphones/snapdragon-8-series-mobile-platforms/snapdragon-8-gen-2-mobile-platform), [Snapdragon 8 Gen 3](https://www.qualcomm.com/products/mobile/snapdragon/smartphones/snapdragon-8-series-mobile-platforms/snapdragon-8-gen-3-mobile-platform), [Snapdragon X Elite](https://www.qualcomm.com/products/mobile/snapdragon/pcs-and-tablets/snapdragon-x-elite) Select supported devices * Samsung Galaxy S21 Series, Galaxy S22 Series, Galaxy S23 Series, Galaxy S24 Series * 
Xiaomi 12, 13 * Google Pixel 3, 4, 5 +* Snapdragon X Elite CRD (Compute Reference Device) and many more. @@ -261,6 +271,8 @@ Qualcomm® AI Hub Models is licensed under BSD-3. See the [LICENSE file](../LICE | | | | | | **Image Classification** | [ConvNext-Tiny](https://aihub.qualcomm.com/models/convnext_tiny) | [qai_hub_models.models.convnext_tiny](qai_hub_models/models/convnext_tiny/README.md) | ✔️ | ✔️ | ✔️ +| [ConvNext-Tiny-w8a16-Quantized](qai_hub_models/models/convnext_tiny_w8a16_quantized/README.md) | [qai_hub_models.models.convnext_tiny_w8a16_quantized](qai_hub_models/models/convnext_tiny_w8a16_quantized/README.md) | ✔️ | ✔️ | ✔️ +| [ConvNext-Tiny-w8a8-Quantized](qai_hub_models/models/convnext_tiny_w8a8_quantized/README.md) | [qai_hub_models.models.convnext_tiny_w8a8_quantized](qai_hub_models/models/convnext_tiny_w8a8_quantized/README.md) | ✔️ | ✔️ | ✔️ | [DenseNet-121](https://aihub.qualcomm.com/models/densenet121) | [qai_hub_models.models.densenet121](qai_hub_models/models/densenet121/README.md) | ✔️ | ✔️ | ✔️ | [EfficientNet-B0](https://aihub.qualcomm.com/models/efficientnet_b0) | [qai_hub_models.models.efficientnet_b0](qai_hub_models/models/efficientnet_b0/README.md) | ✔️ | ✔️ | ✔️ | [GoogLeNet](https://aihub.qualcomm.com/models/googlenet) | [qai_hub_models.models.googlenet](qai_hub_models/models/googlenet/README.md) | ✔️ | ✔️ | ✔️ @@ -321,7 +333,8 @@ Qualcomm® AI Hub Models is licensed under BSD-3. See the [LICENSE file](../LICE | [DeepLabV3-Plus-MobileNet](https://aihub.qualcomm.com/models/deeplabv3_plus_mobilenet) | [qai_hub_models.models.deeplabv3_plus_mobilenet](qai_hub_models/models/deeplabv3_plus_mobilenet/README.md) | ✔️ | ✔️ | ✔️ | [DeepLabV3-Plus-MobileNet-Quantized](https://aihub.qualcomm.com/models/deeplabv3_plus_mobilenet_quantized) | [qai_hub_models.models.deeplabv3_plus_mobilenet_quantized](qai_hub_models/models/deeplabv3_plus_mobilenet_quantized/README.md) | ✔️ | ✔️ | ✔️ | [DeepLabV3-ResNet50](https://aihub.qualcomm.com/models/deeplabv3_resnet50) | [qai_hub_models.models.deeplabv3_resnet50](qai_hub_models/models/deeplabv3_resnet50/README.md) | ✔️ | ✔️ | ✔️ -| [FCN_ResNet50](https://aihub.qualcomm.com/models/fcn_resnet50) | [qai_hub_models.models.fcn_resnet50](qai_hub_models/models/fcn_resnet50/README.md) | ✔️ | ✔️ | ✔️ +| [FCN-ResNet50](https://aihub.qualcomm.com/models/fcn_resnet50) | [qai_hub_models.models.fcn_resnet50](qai_hub_models/models/fcn_resnet50/README.md) | ✔️ | ✔️ | ✔️ +| [FCN-ResNet50-Quantized](https://aihub.qualcomm.com/models/fcn_resnet50_quantized) | [qai_hub_models.models.fcn_resnet50_quantized](qai_hub_models/models/fcn_resnet50_quantized/README.md) | ✔️ | ✔️ | ✔️ | [FFNet-122NS-LowRes](https://aihub.qualcomm.com/models/ffnet_122ns_lowres) | [qai_hub_models.models.ffnet_122ns_lowres](qai_hub_models/models/ffnet_122ns_lowres/README.md) | ✔️ | ✔️ | ✔️ | [FFNet-40S](https://aihub.qualcomm.com/models/ffnet_40s) | [qai_hub_models.models.ffnet_40s](qai_hub_models/models/ffnet_40s/README.md) | ✔️ | ✔️ | ✔️ | [FFNet-40S-Quantized](https://aihub.qualcomm.com/models/ffnet_40s_quantized) | [qai_hub_models.models.ffnet_40s_quantized](qai_hub_models/models/ffnet_40s_quantized/README.md) | ✔️ | ✔️ | ✔️ @@ -347,6 +360,8 @@ Qualcomm® AI Hub Models is licensed under BSD-3. 
See the [LICENSE file](../LICE | [MediaPipe-Hand-Detection](https://aihub.qualcomm.com/models/mediapipe_hand) | [qai_hub_models.models.mediapipe_hand](qai_hub_models/models/mediapipe_hand/README.md) | ✔️ | ✔️ | ✔️ | [YOLOv8-Detection](https://aihub.qualcomm.com/models/yolov8_det) | [qai_hub_models.models.yolov8_det](qai_hub_models/models/yolov8_det/README.md) | ✔️ | ✔️ | ✔️ | [YOLOv8-Detection-Quantized](https://aihub.qualcomm.com/models/yolov8_det_quantized) | [qai_hub_models.models.yolov8_det_quantized](qai_hub_models/models/yolov8_det_quantized/README.md) | ✔️ | ✔️ | ✔️ +| [Yolo-NAS](https://aihub.qualcomm.com/models/yolonas) | [qai_hub_models.models.yolonas](qai_hub_models/models/yolonas/README.md) | ✔️ | ✔️ | ✔️ +| [Yolo-NAS-Quantized](https://aihub.qualcomm.com/models/yolonas_quantized) | [qai_hub_models.models.yolonas_quantized](qai_hub_models/models/yolonas_quantized/README.md) | ✔️ | ✔️ | ✔️ | [Yolo-v6](https://aihub.qualcomm.com/models/yolov6) | [qai_hub_models.models.yolov6](qai_hub_models/models/yolov6/README.md) | ✔️ | ✔️ | ✔️ | [Yolo-v7](https://aihub.qualcomm.com/models/yolov7) | [qai_hub_models.models.yolov7](qai_hub_models/models/yolov7/README.md) | ✔️ | ✔️ | ✔️ | [Yolo-v7-Quantized](https://aihub.qualcomm.com/models/yolov7_quantized) | [qai_hub_models.models.yolov7_quantized](qai_hub_models/models/yolov7_quantized/README.md) | ✔️ | ✔️ | ✔️ @@ -356,6 +371,10 @@ Qualcomm® AI Hub Models is licensed under BSD-3. See the [LICENSE file](../LICE | [LiteHRNet](https://aihub.qualcomm.com/models/litehrnet) | [qai_hub_models.models.litehrnet](qai_hub_models/models/litehrnet/README.md) | ✔️ | ✔️ | ✔️ | [MediaPipe-Pose-Estimation](https://aihub.qualcomm.com/models/mediapipe_pose) | [qai_hub_models.models.mediapipe_pose](qai_hub_models/models/mediapipe_pose/README.md) | ✔️ | ✔️ | ✔️ | [OpenPose](https://aihub.qualcomm.com/models/openpose) | [qai_hub_models.models.openpose](qai_hub_models/models/openpose/README.md) | ✔️ | ✔️ | ✔️ +| [Posenet-Mobilenet](qai_hub_models/models/posenet_mobilenet/README.md) | [qai_hub_models.models.posenet_mobilenet](qai_hub_models/models/posenet_mobilenet/README.md) | ✔️ | ✔️ | ✔️ +| | | | | +| **Depth Estimation** +| [Midas-V2](qai_hub_models/models/midas/README.md) | [qai_hub_models.models.midas](qai_hub_models/models/midas/README.md) | ✔️ | ✔️ | ✔️ ### Audio @@ -386,7 +405,9 @@ Qualcomm® AI Hub Models is licensed under BSD-3. 
See the [LICENSE file](../LICE | | | | | | **Image Generation** | [ControlNet](https://aihub.qualcomm.com/models/controlnet_quantized) | [qai_hub_models.models.controlnet_quantized](qai_hub_models/models/controlnet_quantized/README.md) | ✔️ | ✔️ | ✔️ -| [Stable-Diffusion](https://aihub.qualcomm.com/models/stable_diffusion_quantized) | [qai_hub_models.models.stable_diffusion_quantized](qai_hub_models/models/stable_diffusion_quantized/README.md) | ✔️ | ✔️ | ✔️ +| [Riffusion](qai_hub_models/models/riffusion_quantized/README.md) | [qai_hub_models.models.riffusion_quantized](qai_hub_models/models/riffusion_quantized/README.md) | ✔️ | ✔️ | ✔️ +| [Stable-Diffusion-v1.5](https://aihub.qualcomm.com/models/stable_diffusion_v1_5_quantized) | [qai_hub_models.models.stable_diffusion_v1_5_quantized](qai_hub_models/models/stable_diffusion_v1_5_quantized/README.md) | ✔️ | ✔️ | ✔️ +| [Stable-Diffusion-v2.1](qai_hub_models/models/stable_diffusion_v2_1_quantized/README.md) | [qai_hub_models.models.stable_diffusion_v2_1_quantized](qai_hub_models/models/stable_diffusion_v2_1_quantized/README.md) | ✔️ | ✔️ | ✔️ | | | | | | **Text Generation** | [Baichuan-7B](https://aihub.qualcomm.com/models/baichuan_7b_quantized) | [qai_hub_models.models.baichuan_7b_quantized](qai_hub_models/models/baichuan_7b_quantized/README.md) | ✔️ | ✔️ | ✔️ diff --git a/apps/android/ImageClassification/README.md b/apps/android/ImageClassification/README.md index 29945ae2..e5916678 100644 --- a/apps/android/ImageClassification/README.md +++ b/apps/android/ImageClassification/README.md @@ -85,5 +85,5 @@ Also, you can use AI-HUB Model name as mentioned in models directory, to directl You can also select the model provided in the list menu during the execution of build_apk.py without specifying the model name and model path. ``` - python build_apk.py -q "" + python build_apk.py -q "" ``` diff --git a/qai_hub_models/_version.py b/qai_hub_models/_version.py index 21c6afdf..f9e86596 100644 --- a/qai_hub_models/_version.py +++ b/qai_hub_models/_version.py @@ -2,4 +2,4 @@ # Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. 
# SPDX-License-Identifier: BSD-3-Clause # --------------------------------------------------------------------- -__version__ = "0.5.1" +__version__ = "0.6.0" diff --git a/qai_hub_models/datasets/bsd300.py b/qai_hub_models/datasets/bsd300.py index a6d534b7..8a2c9bf5 100644 --- a/qai_hub_models/datasets/bsd300.py +++ b/qai_hub_models/datasets/bsd300.py @@ -32,20 +32,17 @@ class BSD300Dataset(BaseDataset): def __init__(self, scaling_factor=4): self.bsd_path = BSD300_ASSET.path(extracted=True) - self.images_path = os.path.join(self.bsd_path, "images/train") + self.images_path = self.bsd_path / "images" / "train" BaseDataset.__init__(self, self.bsd_path) self.scaling_factor = scaling_factor def _validate_data(self) -> bool: - images_path = os.path.join(self.dataset_path, "images/train") - # Check image path exists - if not os.path.exists(images_path): + if not self.images_path.exists(): return False # Ensure the correct number of images are there - files = os.listdir(images_path) - images = [f for f in files if ".jpg" in f] + images = [f for f in self.images_path.iterdir() if ".jpg" in f.name] if len(images) != DATASET_LENGTH: return False @@ -53,18 +50,18 @@ def _validate_data(self) -> bool: def _prepare_data(self): # Rename images to be more friendly to enumeration - directory = os.path.join(self.dataset_path, "images/train") - files = os.listdir(directory) - for i, filename in enumerate(files): - if filename.endswith(".jpg"): + # directory = os.path.join(self.dataset_path, "images/train") + # files = os.listdir(directory) + for i, filepath in enumerate(self.images_path.iterdir()): + if filepath.name.endswith(".jpg"): # Open the image and convert it to png try: - with Image.open(os.path.join(directory, filename)) as img: - img.save(os.path.join(directory, f"img_{i + 1:03d}_HR.jpg")) + with Image.open(filepath) as img: + img.save(self.images_path / f"img_{i + 1:03d}_HR.jpg") # delete the old image - os.remove(os.path.join(directory, filename)) + os.remove(filepath) except ValueError: - print(f"File {filename} does not exist!") + print(f"File {filepath} does not exist!") def __len__(self): return DATASET_LENGTH diff --git a/qai_hub_models/datasets/common.py b/qai_hub_models/datasets/common.py index ff2bf47b..4579cf6d 100644 --- a/qai_hub_models/datasets/common.py +++ b/qai_hub_models/datasets/common.py @@ -7,6 +7,7 @@ import os import shutil from abc import ABC, abstractmethod +from pathlib import Path from typing import final from torch.utils.data import Dataset @@ -17,17 +18,17 @@ class BaseDataset(Dataset, ABC): Base class to be extended by Datasets used in this repo for quantizing models. """ - def __init__(self, dataset_path: str): - self.dataset_path = dataset_path + def __init__(self, dataset_path: str | Path): + self.dataset_path = Path(dataset_path) self.download_data() @final def download_data(self) -> None: if self._validate_data(): return - if os.path.exists(self.dataset_path): + if self.dataset_path.exists(): # Data is corrupted, delete and re-download - if os.path.isdir(self.dataset_path): + if self.dataset_path.is_dir(): shutil.rmtree(self.dataset_path) else: os.remove(self.dataset_path) @@ -49,4 +50,4 @@ def _validate_data(self) -> bool: """ Validates data downloaded on disk. By default just checks that folder exists. 
""" - return os.path.exists(self.dataset_path) + return self.dataset_path.exists() diff --git a/qai_hub_models/datasets/imagenet.py b/qai_hub_models/datasets/imagenet.py new file mode 100644 index 00000000..96d8ff71 --- /dev/null +++ b/qai_hub_models/datasets/imagenet.py @@ -0,0 +1,94 @@ +# --------------------------------------------------------------------- +# Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. +# SPDX-License-Identifier: BSD-3-Clause +# --------------------------------------------------------------------- +import os +import subprocess + +from torchvision.datasets import ImageNet + +from qai_hub_models.datasets.common import BaseDataset +from qai_hub_models.utils.asset_loaders import CachedWebDatasetAsset +from qai_hub_models.utils.image_processing import IMAGENET_TRANSFORM + +IMAGENET_FOLDER_NAME = "imagenet" +IMAGENET_VERSION = 1 + +IMAGENET_ASSET = CachedWebDatasetAsset( + "https://image-net.org/data/ILSVRC/2012/ILSVRC2012_img_val.tar", + IMAGENET_FOLDER_NAME, + IMAGENET_VERSION, + "ILSVRC2012_img_val.tar", +) +DEVKIT_NAME = "ILSVRC2012_devkit_t12.tar.gz" +DEVKIT_ASSET = CachedWebDatasetAsset( + f"https://image-net.org/data/ILSVRC/2012/{DEVKIT_NAME}", + IMAGENET_FOLDER_NAME, + IMAGENET_VERSION, + DEVKIT_NAME, +) +VAL_PREP_ASSET = CachedWebDatasetAsset( + "https://raw.githubusercontent.com/soumith/imagenetloader.torch/master/valprep.sh", + IMAGENET_FOLDER_NAME, + IMAGENET_VERSION, + "valprep.sh", +) + + +class ImagenetDataset(BaseDataset, ImageNet): + """ + Wrapper class for using the Imagenet validation dataset: https://www.image-net.org/ + """ + + def __init__(self): + """ + A direct download link for the validation set is not available. + Users should download the validation dataset manually and pass the local filepath + as an argument here. After this is done once, it will be symlinked to an + internal location and doesn't need to be passed again. + + input_data_path: Local filepath to imagenet validation set. + """ + BaseDataset.__init__(self, IMAGENET_ASSET.path().parent) + ImageNet.__init__( + self, + root=self.dataset_path, + split="val", + transform=IMAGENET_TRANSFORM, + ) + + def _validate_data(self) -> bool: + val_path = self.dataset_path / "val" + if not (self.dataset_path / DEVKIT_NAME).exists(): + print("Missing Devkit.") + return False + + subdirs = [filepath for filepath in val_path.iterdir() if filepath.is_dir()] + if len(subdirs) != 1000: + print(f"Expected 1000 subdirectories but got {len(subdirs)}") + return False + + total_images = 0 + for subdir in subdirs: + total_images += len(list(subdir.iterdir())) + + if total_images != 50000: + print(f"Expected 50000 images but got {total_images}") + return False + return True + + def _download_data(self) -> None: + val_path = self.dataset_path / "val" + os.makedirs(val_path, exist_ok=True) + + IMAGENET_ASSET.fetch(extract=True) + DEVKIT_ASSET.fetch() + VAL_PREP_ASSET.fetch() + + os.rename(VAL_PREP_ASSET.path(), val_path / VAL_PREP_ASSET.path().name) + for filepath in self.dataset_path.iterdir(): + if filepath.name.endswith(".JPEG"): + os.rename(filepath, val_path / filepath.name) + + print("Moving images to appropriate class folder. 
This may take a few minutes.") + subprocess.call(f"sh {VAL_PREP_ASSET.path().name}", shell=True, cwd=val_path) diff --git a/qai_hub_models/datasets/imagenette.py b/qai_hub_models/datasets/imagenette.py index 31f4d24c..f9b92fa6 100644 --- a/qai_hub_models/datasets/imagenette.py +++ b/qai_hub_models/datasets/imagenette.py @@ -9,14 +9,16 @@ from qai_hub_models.datasets.common import BaseDataset from qai_hub_models.utils.asset_loaders import CachedWebDatasetAsset +from qai_hub_models.utils.image_processing import IMAGENET_TRANSFORM IMAGENETTE_FOLDER_NAME = "imagenette2-320" IMAGENETTE_VERSION = 1 +DEVKIT_NAME = "ILSVRC2012_devkit_t12.tar.gz" DEVKIT_ASSET = CachedWebDatasetAsset( - "https://image-net.org/data/ILSVRC/2012/ILSVRC2012_devkit_t12.tar.gz", + f"https://image-net.org/data/ILSVRC/2012/{DEVKIT_NAME}", IMAGENETTE_FOLDER_NAME, IMAGENETTE_VERSION, - "ILSVRC2012_devkit_t12.tar.gz", + DEVKIT_NAME, ) IMAGENETTE_ASSET = CachedWebDatasetAsset( "https://s3.amazonaws.com/fast-ai-imageclas/imagenette2-320.tgz", @@ -51,11 +53,6 @@ class ImagenetteDataset(BaseDataset, ImageNet): def __init__(self): BaseDataset.__init__(self, str(IMAGENETTE_ASSET.path(extracted=True))) - # Avoid circular import - from qai_hub_models.models._shared.imagenet_classifier.app import ( - IMAGENET_TRANSFORM, - ) - ImageNet.__init__( self, root=IMAGENETTE_ASSET.path(), @@ -77,18 +74,18 @@ def _validate_data(self) -> bool: return False # Check val data exists - val_data_path = os.path.join(self.dataset_path, "val") - if not os.path.exists(val_data_path): + val_data_path = self.dataset_path / "val" + if not val_data_path.exists(): return False # Ensure 10 classes - subdirs = os.listdir(val_data_path) + subdirs = list(val_data_path.iterdir()) if len(subdirs) != 10: return False # Ensure >= 300 samples per classes for subdir in subdirs: - if len(os.listdir(os.path.join(val_data_path, subdir))) < 300: + if len(list(subdir.iterdir())) < 300: return False return True @@ -97,6 +94,6 @@ def _download_data(self) -> None: devkit_path = DEVKIT_ASSET.fetch() devkit_st = os.stat(devkit_path) os.chmod(devkit_path, devkit_st.st_mode | stat.S_IEXEC) - target_path = IMAGENETTE_ASSET.path() / os.path.basename(DEVKIT_ASSET.path()) - if not os.path.exists(target_path): + target_path = IMAGENETTE_ASSET.path() / DEVKIT_NAME + if not target_path.exists(): os.symlink(DEVKIT_ASSET.path(), target_path) diff --git a/qai_hub_models/datasets/pascal_voc.py b/qai_hub_models/datasets/pascal_voc.py index a7f5b9ea..1da92aa9 100644 --- a/qai_hub_models/datasets/pascal_voc.py +++ b/qai_hub_models/datasets/pascal_voc.py @@ -3,7 +3,6 @@ # SPDX-License-Identifier: BSD-3-Clause # --------------------------------------------------------------------- -from pathlib import Path from typing import Tuple import numpy as np @@ -36,7 +35,7 @@ def __init__(self, split: str = "train", image_size: Tuple[int, int] = (224, 224 assert split in ["train", "val", "trainval"] self.split = split - base_path = Path(self.dataset_path) / "VOC2012" + base_path = self.dataset_path / "VOC2012" image_dir = base_path / "JPEGImages" category_dir = base_path / "SegmentationClass" splits_dir = base_path / "ImageSets" / "Segmentation" diff --git a/qai_hub_models/evaluators/image_evaluator.py b/qai_hub_models/evaluators/segmentation_evaluator.py similarity index 91% rename from qai_hub_models/evaluators/image_evaluator.py rename to qai_hub_models/evaluators/segmentation_evaluator.py index a5439a5d..4f2adfb8 100644 --- a/qai_hub_models/evaluators/image_evaluator.py +++ 
b/qai_hub_models/evaluators/segmentation_evaluator.py @@ -10,7 +10,7 @@ class SegmentationOutputEvaluator(BaseEvaluator): - """Evaluator for comparing a batched image output.""" + """Evaluator for comparing segmentation output against ground truth.""" def __init__(self, num_classes): self.num_classes = num_classes @@ -18,6 +18,7 @@ def __init__(self, num_classes): def add_batch(self, output: torch.Tensor, gt: torch.Tensor): # This evaluator supports only 1 output tensor at a time. + output = output.argmax(1).cpu() assert gt.shape == output.shape self.confusion_matrix += self._generate_matrix(gt, output) @@ -62,3 +63,6 @@ def _generate_matrix(self, gt_image, pre_image): count = torch.bincount(label, minlength=self.num_classes**2) confusion_matrix = count.reshape(self.num_classes, self.num_classes) return confusion_matrix + + def get_accuracy_score(self) -> float: + return self.Mean_Intersection_over_Union() diff --git a/qai_hub_models/global_requirements.txt b/qai_hub_models/global_requirements.txt index 15343116..567fddcb 100644 --- a/qai_hub_models/global_requirements.txt +++ b/qai_hub_models/global_requirements.txt @@ -4,18 +4,22 @@ # - Then install this requirements file # That should create an environment that works for every single model. +Deprecated==1.2.11 PySoundFile; sys_platform == 'win32' albumentations==0.5.2 av==10.0.0 basicsr==1.4.2 -click==8.0 +click==8.1.7 +data-gradients==0.3.1 datasets==2.14.5 diffusers[torch]==0.21.4 easydict==1.10 +einops==0.3.2 ffmpeg==1.4 ftfy==6.1.1 hydra-core==1.3.0 imageio[ffmpeg]==2.31.5 +imagesize==1.4.1 kornia==0.5.0 librosa==0.10.1 matplotlib==3.7.4 @@ -26,19 +30,23 @@ object-detection-metrics==0.4.post1 openai-whisper==20230314 pycocotools==2.0.7 pytorch-lightning==1.6.0 +rapidfuzz==3.8.1 regex==2023.12.25 scikit-image==0.21.0 scikit-learn==1.1.3 scipy==1.8.1 seaborn==0.11.0 sentencepiece==0.2.0 +shapely==2.0.3 soundfile==0.12.1 +stringcase==1.2.0 tflite==2.10.0 thop==0.1.1.post2209072238 timm==0.9.11 tensorboard==2.13.0 torchaudio==0.13.1 transformers==4.27.4 +treelib==1.6.1 tucker-conv==1.0.1 ultralytics==8.0.193 webdataset==0.2.86 diff --git a/qai_hub_models/models/_shared/cityscapes_segmentation/evaluator.py b/qai_hub_models/models/_shared/cityscapes_segmentation/evaluator.py index d1eb4a7c..e9c02522 100644 --- a/qai_hub_models/models/_shared/cityscapes_segmentation/evaluator.py +++ b/qai_hub_models/models/_shared/cityscapes_segmentation/evaluator.py @@ -5,7 +5,7 @@ import torch.nn.functional as F from torch import Tensor -from qai_hub_models.evaluators.image_evaluator import SegmentationOutputEvaluator +from qai_hub_models.evaluators.segmentation_evaluator import SegmentationOutputEvaluator class CityscapesSegmentationEvaluator(SegmentationOutputEvaluator): @@ -15,8 +15,4 @@ class CityscapesSegmentationEvaluator(SegmentationOutputEvaluator): def add_batch(self, output: Tensor, gt: Tensor): output_match_size = F.interpolate(output, gt.shape[1:3], mode="bilinear") - output_class = output_match_size.argmax(1).cpu() - return super().add_batch(output_class, gt) - - def get_accuracy_score(self) -> float: - return super().Mean_Intersection_over_Union() + return super().add_batch(output_match_size, gt) diff --git a/qai_hub_models/models/_shared/convnext_tiny_quantized/__init__.py b/qai_hub_models/models/_shared/convnext_tiny_quantized/__init__.py new file mode 100644 index 00000000..21a22b31 --- /dev/null +++ b/qai_hub_models/models/_shared/convnext_tiny_quantized/__init__.py @@ -0,0 +1,4 @@ +# 
--------------------------------------------------------------------- +# Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. +# SPDX-License-Identifier: BSD-3-Clause +# --------------------------------------------------------------------- diff --git a/qai_hub_models/models/_shared/convnext_tiny_quantized/model.py b/qai_hub_models/models/_shared/convnext_tiny_quantized/model.py new file mode 100644 index 00000000..c098f281 --- /dev/null +++ b/qai_hub_models/models/_shared/convnext_tiny_quantized/model.py @@ -0,0 +1,126 @@ +# --------------------------------------------------------------------- +# Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. +# SPDX-License-Identifier: BSD-3-Clause +# --------------------------------------------------------------------- +from __future__ import annotations + +# isort: off +# This verifies aimet is installed, and this must be included first. +from qai_hub_models.utils.quantization_aimet import ( + AIMETQuantizableMixin, +) + +# isort: on + +from abc import abstractmethod +from pathlib import Path + +import torch +import torch.nn as nn +from aimet_torch.cross_layer_equalization import equalize_model +from aimet_torch.model_preparer import prepare_model +from aimet_torch.quantsim import QuantizationSimModel, load_encodings_to_sim +from torchvision.models.convnext import LayerNorm2d as ConvNextLayerNorm2d +from torchvision.ops.misc import Permute + +from qai_hub_models.models._shared.common import replace_module_recursively +from qai_hub_models.models.convnext_tiny.model import DEFAULT_WEIGHTS, ConvNextTiny +from qai_hub_models.utils.aimet.config_loader import get_default_aimet_config +from qai_hub_models.utils.quantization_aimet import ( + constrain_quantized_inputs_to_image_range, +) + + +# The ConvNext LayerNorm uses a functional LayerNorm that is currently not +# automatically handled by AIMET (AIMET-3928). With this fix, the LayerNorms +# will not get quantization observers. +class AIMETLayerNorm2d(nn.Sequential): + def __init__(self, orig_layer_norm: ConvNextLayerNorm2d): + layer_norm = nn.LayerNorm( + orig_layer_norm.normalized_shape, + eps=orig_layer_norm.eps, + elementwise_affine=orig_layer_norm.elementwise_affine, + ) + layer_norm.bias = orig_layer_norm.bias + layer_norm.weight = orig_layer_norm.weight + super().__init__( + Permute([0, 2, 3, 1]), + layer_norm, + Permute([0, 3, 1, 2]), + ) + + +class ConvNextTinyQuantizableBase(AIMETQuantizableMixin, ConvNextTiny): + def __init__( + self, + quant_sim_model: QuantizationSimModel, + ) -> None: + # Input is already normalized by sim_model. Disable it in the wrapper model. + ConvNextTiny.__init__(self, quant_sim_model.model, normalize_input=False) + AIMETQuantizableMixin.__init__( + self, + quant_sim_model, + ) + + @classmethod + @abstractmethod + def _default_aimet_encodings(cls) -> str | Path: + """ + Default AIMET encodings path. + """ + ... + + @classmethod + @abstractmethod + def _output_bw(cls) -> int: + """ + Quantization bitwidth of activations. + """ + ... + + @classmethod + def from_pretrained( + cls, + weights: str = DEFAULT_WEIGHTS, + aimet_encodings: str | None = "DEFAULT", + ) -> "ConvNextTinyQuantizableBase": + """ + Parameters: + weights: + Weights of the model. See Torchvision ConvNext for information of + the format of this object. + aimet_encodings: + if "DEFAULT": Loads the model with aimet encodings calibrated on imagenette. + elif None: Doesn't load any encodings. Used when computing encodings. 
+ else: Interprets as a filepath and loads the encodings stored there. + """ + # Load Model + model = ConvNextTiny.from_pretrained(weights=weights) + + replace_module_recursively( + model, + ConvNextLayerNorm2d, + AIMETLayerNorm2d, + ) + + input_shape = cls.get_input_spec()["image_tensor"][0] + model = prepare_model(model) + equalize_model(model, input_shape) + + sim = QuantizationSimModel( + model, + quant_scheme="tf_enhanced", + default_param_bw=8, + default_output_bw=cls._output_bw(), + config_file=get_default_aimet_config(), + dummy_input=torch.rand(input_shape), + ) + constrain_quantized_inputs_to_image_range(sim) + + if aimet_encodings: + if aimet_encodings == "DEFAULT": + aimet_encodings = cls._default_aimet_encodings() + load_encodings_to_sim(sim, aimet_encodings) + + sim.model.eval() + return cls(sim) diff --git a/qai_hub_models/models/_shared/deeplab/evaluator.py b/qai_hub_models/models/_shared/deeplab/evaluator.py deleted file mode 100644 index 32a836c1..00000000 --- a/qai_hub_models/models/_shared/deeplab/evaluator.py +++ /dev/null @@ -1,24 +0,0 @@ -# --------------------------------------------------------------------- -# Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. -# SPDX-License-Identifier: BSD-3-Clause -# --------------------------------------------------------------------- -from torch import Tensor - -from qai_hub_models.evaluators.image_evaluator import SegmentationOutputEvaluator - - -class DeepLabV3Evaluator(SegmentationOutputEvaluator): - """ - Evaluates the output of DeepLabV3Plus - - Expected data format for this evaluator: - * output has the same shape & meaning as output of any deeplabV3 forward() function. - * gt is argmax'd on the first dimension (see add_batch). - """ - - def add_batch(self, output: Tensor, gt: Tensor): - output = output.argmax(1).cpu() - return super().add_batch(output, gt) - - def get_accuracy_score(self) -> float: - return super().Mean_Intersection_over_Union() diff --git a/qai_hub_models/models/_shared/deeplab/model.py b/qai_hub_models/models/_shared/deeplab/model.py index 75d45bbe..7b7d1351 100644 --- a/qai_hub_models/models/_shared/deeplab/model.py +++ b/qai_hub_models/models/_shared/deeplab/model.py @@ -5,7 +5,7 @@ import torch from qai_hub_models.evaluators.base_evaluators import BaseEvaluator -from qai_hub_models.models._shared.deeplab.evaluator import DeepLabV3Evaluator +from qai_hub_models.evaluators.segmentation_evaluator import SegmentationOutputEvaluator from qai_hub_models.utils.base_model import BaseModel from qai_hub_models.utils.image_processing import normalize_image_torchvision from qai_hub_models.utils.input_spec import InputSpec @@ -24,7 +24,7 @@ def __init__( self.normalize_input = normalize_input def get_evaluator(self) -> BaseEvaluator: - return DeepLabV3Evaluator(NUM_CLASSES) + return SegmentationOutputEvaluator(NUM_CLASSES) def forward(self, image): """ diff --git a/qai_hub_models/models/_shared/fastsam/demo.py b/qai_hub_models/models/_shared/fastsam/demo.py index 59281888..bd6544d1 100644 --- a/qai_hub_models/models/_shared/fastsam/demo.py +++ b/qai_hub_models/models/_shared/fastsam/demo.py @@ -5,7 +5,6 @@ from __future__ import annotations import os -import tempfile from typing import Type from PIL import Image @@ -17,7 +16,11 @@ get_on_device_demo_parser, validate_on_device_demo_args, ) -from qai_hub_models.utils.asset_loaders import CachedWebAsset, load_image +from qai_hub_models.utils.asset_loaders import ( + CachedWebAsset, + load_image, + qaihm_temp_dir, +) from 
qai_hub_models.utils.base_model import BaseModel from qai_hub_models.utils.display import display_or_save_image @@ -46,7 +49,7 @@ def fastsam_demo( image = load_image(args.image) - with tempfile.TemporaryDirectory() as tmpdir: + with qaihm_temp_dir() as tmpdir: image_path = os.path.join(tmpdir, "inp_image.jpg") image.save(image_path) pred, prompt_process = app.segment_image(image_path) diff --git a/qai_hub_models/models/_shared/imagenet_classifier/app.py b/qai_hub_models/models/_shared/imagenet_classifier/app.py index ac0424ad..bee0e780 100644 --- a/qai_hub_models/models/_shared/imagenet_classifier/app.py +++ b/qai_hub_models/models/_shared/imagenet_classifier/app.py @@ -6,18 +6,11 @@ import torch from PIL.Image import Image -from torchvision import transforms -from qai_hub_models.models._shared.imagenet_classifier.model import IMAGENET_DIM from qai_hub_models.models.protocols import ExecutableModelProtocol -from qai_hub_models.utils.image_processing import normalize_image_transform - -IMAGENET_TRANSFORM = transforms.Compose( - [ - transforms.Resize(256), - transforms.CenterCrop(IMAGENET_DIM), - transforms.ToTensor(), - ] +from qai_hub_models.utils.image_processing import ( + IMAGENET_TRANSFORM, + normalize_image_transform, ) diff --git a/qai_hub_models/models/_shared/imagenet_classifier/model.py b/qai_hub_models/models/_shared/imagenet_classifier/model.py index 3e0f904b..ac4e1b4d 100644 --- a/qai_hub_models/models/_shared/imagenet_classifier/model.py +++ b/qai_hub_models/models/_shared/imagenet_classifier/model.py @@ -12,13 +12,15 @@ from qai_hub_models.evaluators.base_evaluators import BaseEvaluator from qai_hub_models.evaluators.classification_evaluator import ClassificationEvaluator from qai_hub_models.utils.base_model import BaseModel -from qai_hub_models.utils.image_processing import normalize_image_torchvision +from qai_hub_models.utils.image_processing import ( + IMAGENET_DIM, + normalize_image_torchvision, +) from qai_hub_models.utils.input_spec import InputSpec from qai_hub_models.utils.quantization import get_image_quantization_samples MODEL_ASSET_VERSION = 1 MODEL_ID = __name__.split(".")[-2] -IMAGENET_DIM = 224 class ImagenetClassifier(BaseModel): diff --git a/qai_hub_models/models/_shared/mediapipe/app.py b/qai_hub_models/models/_shared/mediapipe/app.py index 05ffcb6d..a121d4f9 100644 --- a/qai_hub_models/models/_shared/mediapipe/app.py +++ b/qai_hub_models/models/_shared/mediapipe/app.py @@ -566,7 +566,7 @@ def _draw_box_and_roi( # Draw detector bounding box draw_box_from_xyxy(NHWC_int_numpy_frame, box[0], box[1], (255, 0, 0), 1) # Draw detector keypoints - draw_points(NHWC_int_numpy_frame, kp) + draw_points(NHWC_int_numpy_frame, kp, size=30) # Draw region of interest box computed from the detector box & keypoints # (this is the input to the landmark detector) draw_box_from_corners(NHWC_int_numpy_frame, roi, (0, 255, 0)) diff --git a/qai_hub_models/models/stable_diffusion_quantized/app.py b/qai_hub_models/models/_shared/stable_diffusion/app.py similarity index 78% rename from qai_hub_models/models/stable_diffusion_quantized/app.py rename to qai_hub_models/models/_shared/stable_diffusion/app.py index 48d33849..8246d90a 100644 --- a/qai_hub_models/models/stable_diffusion_quantized/app.py +++ b/qai_hub_models/models/_shared/stable_diffusion/app.py @@ -2,10 +2,16 @@ # Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. 
# SPDX-License-Identifier: BSD-3-Clause # --------------------------------------------------------------------- -from typing import Any, Callable, Tuple +from __future__ import annotations +from typing import Any, Tuple + +import diffusers import torch from diffusers.models.embeddings import get_timestep_embedding +from transformers import CLIPTokenizer + +from qai_hub_models.utils.inference import HubModel OUT_H, OUT_W = 512, 512 @@ -28,12 +34,13 @@ class StableDiffusionApp: def __init__( self, - text_encoder: Callable[..., Tuple[torch.Tensor, ...]], - vae_decoder: Callable[..., Tuple[torch.Tensor, ...]], - unet: Callable[..., Tuple[torch.Tensor, ...]], - tokenizer: Any, - scheduler: Any, - time_embedding: Any, + text_encoder: HubModel | torch.nn.Module, + vae_decoder: HubModel | torch.nn.Module, + unet: HubModel | torch.nn.Module, + tokenizer: CLIPTokenizer | Any, + scheduler: diffusers.DPMSolverMultistepScheduler, + time_embedding: diffusers.embeddings.TimeEmbedding, + channel_last_latent: bool, ): """ Initializes StableDiffusionApp with required neural networks for end-to-end pipeline. @@ -55,6 +62,9 @@ def __init__( Updates latent space during each iteration. time_embedding: Projects time-step into embedding used during denoising in latent space. + channel_last_latent: + True if unet outputs latent of shape like (1, 64, 64, 4). False + for (1, 4, 64, 64) """ self.text_encoder = text_encoder @@ -63,21 +73,39 @@ def __init__( self.tokenizer = tokenizer self.scheduler = scheduler self.time_embedding = time_embedding + self.channel_last_latent = channel_last_latent def get_time_embedding(self, timestep): + """ + Since these time embeddings aren't dependent on prompt, they can be + pre-computed (for a pre-defined set of timesteps) in deployment and + skip these computation. We include them in demo for better clarity. + """ timestep = torch.tensor([timestep]) + # TODO: pull 320 from UNet block output dim t_emb = get_timestep_embedding(timestep, 320, True, 0) emb = self.time_embedding(t_emb) return emb - def _encode_text_prompt(self, prompt: str) -> torch.Tensor: + def _encode_text_prompt(self, prompt: str) -> Tuple[torch.Tensor, torch.Tensor]: """ Takes a text prompt and returns a tensor with its text embedding. Parameters ---------- prompt: The text prompt to encode. + + Returns + ------- + cond_embedding + + uncond_embedding + + Note that uncond_embedding is the same for any prompt (since it's not + conditioned on the prompt). So in deploymenet this should be + cached instead of computed every time. We compute it here for better + clarity. """ # Tokenize input prompt text_input = self.tokenizer( @@ -153,9 +181,9 @@ def generate_image( Returns ------- torch.Tensor - The generated image in RGB scaled in [0, 1] with tensor shape (H, - W, 3). The height and the width may depend on the underlying Stable - Diffusion version, but is typically 512x512. + The generated image in RGB scaled in [0, 1] with tensor shape + (OUT_H, OUT_W, 3). The height and the width may depend on the + underlying Stable Diffusion version, but is typically 512x512. 
""" # Encode text prompt @@ -182,7 +210,8 @@ def _make_channel_first_torch(input_tensor): print(f"\nStep: {i + 1}\n{'-' * 10}") time_emb = self.get_time_embedding(t) latent_model_input = self.scheduler.scale_model_input(latents, t) - latent_model_input = _make_channel_last_torch(latent_model_input) + if self.channel_last_latent: + latent_model_input = _make_channel_last_torch(latent_model_input) print(f"\nDenoising image in latent space (inference on UNet)\n{'-' * 50}") # Denoise image in latent space @@ -195,11 +224,13 @@ def _make_channel_first_torch(input_tensor): noise_cond, noise_uncond = torch.split(noise, 1, 0) noise_pred = noise_uncond + guidance_scale * (noise_cond - noise_uncond) - noise_pred = _make_channel_first_torch(noise_pred) + if self.channel_last_latent: + noise_pred = _make_channel_first_torch(noise_pred) latents = self.scheduler.step(noise_pred, t, latents).prev_sample print(f"\nDecoding generated image (inference on VAEDecoder)\n{'-' * 50}") # Decode generated image from latent space - latents_vae = _make_channel_last_torch(latents) - image = self.vae_decoder(latents_vae) + if self.channel_last_latent: + latents = _make_channel_last_torch(latents) + image = self.vae_decoder(latents) return image diff --git a/qai_hub_models/models/stable_diffusion_quantized/demo.py b/qai_hub_models/models/_shared/stable_diffusion/demo.py similarity index 70% rename from qai_hub_models/models/stable_diffusion_quantized/demo.py rename to qai_hub_models/models/_shared/stable_diffusion/demo.py index d95ec30e..d09f2461 100644 --- a/qai_hub_models/models/stable_diffusion_quantized/demo.py +++ b/qai_hub_models/models/_shared/stable_diffusion/demo.py @@ -2,22 +2,19 @@ # Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. # SPDX-License-Identifier: BSD-3-Clause # --------------------------------------------------------------------- +from __future__ import annotations + import argparse +from typing import Any +import diffusers import numpy as np import qai_hub as hub -from diffusers import DPMSolverMultistepScheduler, UNet2DConditionModel +from diffusers import DPMSolverMultistepScheduler from PIL import Image from transformers import CLIPTokenizer -from qai_hub_models.models.stable_diffusion_quantized.app import StableDiffusionApp -from qai_hub_models.models.stable_diffusion_quantized.model import ( - MODEL_ASSET_VERSION, - MODEL_ID, - ClipVITTextEncoder, - Unet, - VAEDecoder, -) +from qai_hub_models.models._shared.stable_diffusion.app import StableDiffusionApp from qai_hub_models.utils.args import add_output_dir_arg from qai_hub_models.utils.base_model import BasePrecompiledModel from qai_hub_models.utils.display import display_or_save_image @@ -29,11 +26,13 @@ def _get_hub_model( + model_id: str, + model_asset_version: str, input_model: BasePrecompiledModel, model_name: str, ignore_cached_model: bool = False, device_name=DEFAULT_DEVICE_NAME, -): +) -> HubModel: if not can_access_qualcomm_ai_hub(): raise RuntimeError( "Stable-diffusion on-device demo requires access to QAI-Hub.\n" @@ -42,8 +41,8 @@ def _get_hub_model( # Upload model uploaded_model = get_uploaded_precompiled_model( input_model.get_target_model_path(), - MODEL_ID, - MODEL_ASSET_VERSION, + model_id, + model_asset_version, model_name, ignore_cached_model=ignore_cached_model, ) @@ -53,7 +52,25 @@ def _get_hub_model( # Run Stable Diffuison end-to-end on a given prompt. The demo will output an # AI-generated image based on the description in the prompt. 
-def main(is_test: bool = False): +def stable_diffusion_demo( + model_id: str, + model_asset_version: str, + text_encoder: BasePrecompiledModel, + unet: BasePrecompiledModel, + vae_decoder: BasePrecompiledModel, + tokenizer: CLIPTokenizer | Any, + scheduler: DPMSolverMultistepScheduler, + time_embedding: diffusers.embeddings.TimeEmbedding, + channel_last_latent: bool = True, + is_test: bool = False, +): + """ + Generate an image by running text_encoder, unet, vae_decoder via AI Hub + inference job on target physical device, and tokenizer, scheduler, and + time embedding in torch locally. + + See parser arguments for parameters. + """ parser = argparse.ArgumentParser() parser.add_argument( "--prompt", @@ -64,7 +81,7 @@ def main(is_test: bool = False): "--num-steps", default=5, type=int, - help="The number of diffusion iteration steps (higher means better quality).", + help="The number of diffusion steps (higher means better quality).", ) parser.add_argument( "--seed", @@ -110,50 +127,47 @@ def main(is_test: bool = False): print(f"Downloading model assets\n{'-' * 35}") # Load target models - text_encoder = ClipVITTextEncoder.from_precompiled() - unet = Unet.from_precompiled() - vae_decoder = VAEDecoder.from_precompiled() # Create three HubModel instances to prepare for on-device inference. # This is similar to initializing PyTorch model to call forward method later. # Instead of forward, we later submit inference_jobs on QAI-Hub for # on-device evaluation. print(f"Uploading model assets on QAI-Hub\n{'-' * 35}") - text_encoder = _get_hub_model( - text_encoder, "text_encoder", args.ignore_cached_model, args.device_name + hub_text_encoder = _get_hub_model( + model_id, + model_asset_version, + text_encoder, + "text_encoder", + args.ignore_cached_model, + args.device_name, ) - unet = _get_hub_model(unet, "unet", args.ignore_cached_model, args.device_name) - vae_decoder = _get_hub_model( - vae_decoder, "vae_decoder", args.ignore_cached_model, args.device_name + hub_unet = _get_hub_model( + model_id, + model_asset_version, + unet, + "unet", + args.ignore_cached_model, + args.device_name, ) - - # Create tokenizer, scheduler and time_embedding required - # for stable-diffusion pipeline. 
- tokenizer = CLIPTokenizer.from_pretrained( - "stabilityai/stable-diffusion-2-1-base", subfolder="tokenizer", revision="main" + hub_vae_decoder = _get_hub_model( + model_id, + model_asset_version, + vae_decoder, + "vae_decoder", + args.ignore_cached_model, + args.device_name, ) - scheduler = DPMSolverMultistepScheduler( - beta_start=0.00085, - beta_end=0.012, - beta_schedule="scaled_linear", - num_train_timesteps=1000, - ) - - time_embedding = UNet2DConditionModel.from_pretrained( - "runwayml/stable-diffusion-v1-5", subfolder="unet" - ).time_embedding - # Load Application app = StableDiffusionApp( - text_encoder=text_encoder, - vae_decoder=vae_decoder, - unet=unet, + text_encoder=hub_text_encoder, + vae_decoder=hub_vae_decoder, + unet=hub_unet, tokenizer=tokenizer, scheduler=scheduler, time_embedding=time_embedding, + channel_last_latent=channel_last_latent, ) - # Generate image image = app.generate_image( args.prompt, num_steps=args.num_steps, @@ -165,7 +179,3 @@ def main(is_test: bool = False): if not is_test: display_or_save_image(pil_img, args.output_dir) - - -if __name__ == "__main__": - main() diff --git a/qai_hub_models/models/stable_diffusion_quantized/test.py b/qai_hub_models/models/_shared/stable_diffusion/test_utils.py similarity index 54% rename from qai_hub_models/models/stable_diffusion_quantized/test.py rename to qai_hub_models/models/_shared/stable_diffusion/test_utils.py index b0cc4bf5..29e61824 100644 --- a/qai_hub_models/models/stable_diffusion_quantized/test.py +++ b/qai_hub_models/models/_shared/stable_diffusion/test_utils.py @@ -2,29 +2,15 @@ # Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. # SPDX-License-Identifier: BSD-3-Clause # --------------------------------------------------------------------- -import tempfile +from qai_hub_models.utils.asset_loaders import qaihm_temp_dir -import pytest -from qai_hub_models.models.stable_diffusion_quantized.demo import main as demo_main -from qai_hub_models.models.stable_diffusion_quantized.export import export_model -from qai_hub_models.models.stable_diffusion_quantized.model import ( - StableDiffusionQuantized, -) - - -def test_from_precompiled(): - StableDiffusionQuantized.from_precompiled() - - -@pytest.mark.skip("#105 move slow_cloud and slow tests to nightly.") -@pytest.mark.slow_cloud -def test_export(): - with tempfile.TemporaryDirectory() as tmpdir: +def export_for_component(export_model, component_name: str): + with qaihm_temp_dir() as tmpdir: exported_jobs = export_model( # Testing text_encoder as it's smallest model in # Stable-Diffusion pipeline - components=["text_encoder"], + components=[component_name], skip_inferencing=True, skip_downloading=True, skip_summary=True, @@ -38,9 +24,3 @@ def test_export(): profile_job, inference_job = jobs[0], jobs[1] assert profile_job is not None assert inference_job is None - - -@pytest.mark.skip("#105 move slow_cloud and slow tests to nightly.") -@pytest.mark.slow_cloud -def test_demo(): - demo_main(is_test=True) diff --git a/qai_hub_models/models/_shared/video_classifier/demo.py b/qai_hub_models/models/_shared/video_classifier/demo.py index 99ce64a8..2a0974f9 100644 --- a/qai_hub_models/models/_shared/video_classifier/demo.py +++ b/qai_hub_models/models/_shared/video_classifier/demo.py @@ -4,13 +4,12 @@ # --------------------------------------------------------------------- from __future__ import annotations -import tempfile from typing import Type from qai_hub_models.models._shared.video_classifier.app import KineticsClassifierApp from 
qai_hub_models.models._shared.video_classifier.model import KineticsClassifier from qai_hub_models.utils.args import get_model_cli_parser, model_from_cli_args -from qai_hub_models.utils.asset_loaders import CachedWebAsset, load_path +from qai_hub_models.utils.asset_loaders import CachedWebAsset, load_path, qaihm_temp_dir # @@ -35,7 +34,7 @@ def kinetics_classifier_demo( model = model_from_cli_args(model_type, args) app = KineticsClassifierApp(model) print("Model Loaded") - with tempfile.TemporaryDirectory() as tmpdir: + with qaihm_temp_dir() as tmpdir: dst_path = load_path(args.video, tmpdir) predictions = app.predict(path=str(dst_path)) top5_classes = ", ".join(predictions) diff --git a/qai_hub_models/models/_shared/whisper/app.py b/qai_hub_models/models/_shared/whisper/app.py index b99f54ba..27deac3d 100644 --- a/qai_hub_models/models/_shared/whisper/app.py +++ b/qai_hub_models/models/_shared/whisper/app.py @@ -36,7 +36,7 @@ def __init__(self, whisper: Whisper): self.num_decoder_blocks = whisper.num_decoder_blocks self.num_decoder_heads = whisper.num_decoder_heads self.attention_dim = whisper.attention_dim - self.max_decode_len = whisper.max_decode_len + self.mean_decode_len = whisper.mean_decode_len # Wraps torch Module so it takes np ndarray as input and outputs if isinstance(encoder, torch.nn.Module): @@ -64,14 +64,28 @@ def transcribe(self, mel_input: np.ndarray) -> str: - transcribed texts """ - cross_attn_cache = self.encoder(mel_input) + k_cache_cross, v_cache_cross = self.encoder(mel_input) # Start decoding # coreml only takes float tensors x = np.array([[TOKEN_SOT]]) decoded_tokens = [TOKEN_SOT] - sample_len = self.max_decode_len # max # of tokens to sample - cache_tensor = np.zeros((1, sample_len, self.attention_dim)).astype(np.float32) - self_attn_cache = [cache_tensor] * 2 * self.num_decoder_blocks + sample_len = self.mean_decode_len # mean # of tokens to sample + k_cache_self = np.zeros( + ( + self.num_decoder_blocks, + self.num_decoder_heads, + self.attention_dim // self.num_decoder_heads, + sample_len, + ) + ).astype(np.float32) + v_cache_self = np.zeros( + ( + self.num_decoder_blocks, + self.num_decoder_heads, + sample_len, + self.attention_dim // self.num_decoder_heads, + ) + ).astype(np.float32) sum_logprobs = 0 for i in range(sample_len): @@ -80,15 +94,13 @@ def transcribe(self, mel_input: np.ndarray) -> str: # index - used to get positional embedding correctly. index = torch.zeros([1, 1], dtype=torch.int32) index[0, 0] = i - # Use mask to get the k_cache updated with new key - mask = torch.zeros(1, sample_len, self.attention_dim, dtype=torch.bool) - mask[:, i, :] = 1 decoder_out = self.decoder( - x, index, mask, *cross_attn_cache, *self_attn_cache + x, index, k_cache_cross, v_cache_cross, k_cache_self, v_cache_self ) # logit has shape (1, decoded_len, 51864) logits = decoder_out[0] - self_attn_cache = decoder_out[1:] # type: ignore + k_cache_self = decoder_out[1] + v_cache_self = decoder_out[2] # logit has shape (51864,) logits = logits[0, -1] # consider only the last token diff --git a/qai_hub_models/models/_shared/whisper/model.py b/qai_hub_models/models/_shared/whisper/model.py index 06e2a0d3..1ea0acb4 100644 --- a/qai_hub_models/models/_shared/whisper/model.py +++ b/qai_hub_models/models/_shared/whisper/model.py @@ -13,7 +13,14 @@ from qai_hub_models.utils.base_model import BaseModel, CollectionModel, TargetRuntime from qai_hub_models.utils.input_spec import InputSpec -MAX_DECODE_LEN = 224 +# The official default max decoded length is 448. 
We use mean decoded length 224 for benchmarking purpose +MEAN_DECODE_LEN = 224 + +# The number of 20ms audio contexts in 30 seconds of audio +AUDIO_EMB_LEN = 1500 + +# The number of Mel features per audio context +N_MELS = 80 MODEL_ID = "whisper_asr_shared" MODEL_ASSET_VERSION = 1 @@ -36,7 +43,7 @@ def __init__( self.num_decoder_blocks = num_decoder_blocks self.attention_dim = attention_dim self.num_decoder_heads = num_heads - self.max_decode_len = MAX_DECODE_LEN + self.mean_decode_len = MEAN_DECODE_LEN @classmethod def from_pretrained(cls, model: str = "tiny.en"): @@ -63,16 +70,40 @@ class WhisperEncoderInf(BaseModel): def __init__(self, model: whisper.model.Whisper): super().__init__() - self.model = model + self.encoder = model.encoder + dims = model.dims + + states_per_head = dims.n_audio_state // dims.n_audio_head + scale = states_per_head**-0.25 + + self.cross_attn_key_list = torch.nn.ModuleList( + [ + SplitLinear(block.cross_attn.key, dims.n_audio_head, scale) + for block in model.decoder.blocks + ] + ) + self.cross_attn_value_list = torch.nn.ModuleList( + [ + SplitLinear(block.cross_attn.value, dims.n_audio_head) + for block in model.decoder.blocks + ] + ) def forward(self, audio: torch.Tensor) -> List[torch.Tensor]: - # Return 2 * self.num_blocks tensors (k, v for each block) - encoder_out = self.model.encoder(audio) - res = [] - for residual_block in self.model.decoder.blocks: - res.append(residual_block.cross_attn.key(encoder_out)) - res.append(residual_block.cross_attn.value(encoder_out)) - return res + # Return cross attention key and value cache tensors + encoder_out = self.encoder(audio) + k_cache = torch.cat( + [ + key(encoder_out, transpose=True).unsqueeze(0) + for key in self.cross_attn_key_list + ], + dim=0, + ) + v_cache = torch.cat( + [value(encoder_out).unsqueeze(0) for value in self.cross_attn_value_list], + dim=0, + ) + return k_cache, v_cache @staticmethod def get_input_spec() -> InputSpec: @@ -80,7 +111,7 @@ def get_input_spec() -> InputSpec: Returns the input specification (name -> (shape, type). This can be used to submit profiling job on Qualcomm AI Hub. """ - return dict(audio=((1, 80, 3000), "float32")) + return dict(audio=((1, N_MELS, AUDIO_EMB_LEN * 2), "float32")) @classmethod def from_pretrained(cls): @@ -92,7 +123,12 @@ def get_hub_profile_options( profile_options = super().get_hub_profile_options( target_runtime, other_profile_options ) - return profile_options + " --max_profiler_iterations 10" + " --compute_unit gpu" + if ( + target_runtime == TargetRuntime.TFLITE + and "--compute_unit" not in profile_options + ): + profile_options = profile_options + " --compute_unit gpu" + return profile_options + " --max_profiler_iterations 10" class WhisperDecoderInf(BaseModel): @@ -105,20 +141,47 @@ class WhisperDecoderInf(BaseModel): 2. 
kv cache inputs are required, not optional """ - def __init__(self, model: whisper.model.TextDecoder): + def __init__( + self, model: whisper.model.TextDecoder, max_decode_len: int = MEAN_DECODE_LEN + ): super().__init__() assert isinstance(model, whisper.model.TextDecoder) + self.max_decode_len = max_decode_len + # Wraps `ResidualAttentionBlock` in # `ResidualAttentionBlockWrapper` self.blocks = torch.nn.ModuleList( [ResidualAttentionBlockWrapper(b) for b in model.blocks] ) - self.num_blocks = len(self.blocks) + for m in ["token_embedding", "ln"]: self.add_module(m, getattr(model, m)) - for p in ["positional_embedding"]: - self.register_parameter(p, getattr(model, p)) + + # Replace `whisper.model.TextDecoder.positional_embedding` (nn.Parameter) with nn.Embedding for easier lookup + self.positional_embedding = torch.nn.Embedding( + max_decode_len, self.token_embedding.weight.shape[1] + ) + self.positional_embedding.weight = torch.nn.Parameter( + model.positional_embedding[:max_decode_len, :] + ) + + self.logits = torch.nn.Linear( + self.token_embedding.weight.shape[1], + self.token_embedding.weight.shape[0], + bias=False, + ) + self.logits.weight = self.token_embedding.weight + + # Since kv cache is a fixed size, mask out elements + # that correspond to not yet used entries. + # The kv cache for current token is inserted at the last + # index, with the previous cache shifted down by one element. + self.mask = torch.nn.Embedding(max_decode_len, max_decode_len) + mask = torch.zeros([max_decode_len, max_decode_len], dtype=torch.float32) + for c_idx in range(0, max_decode_len): + mask[c_idx, 0 : max_decode_len - c_idx - 1] = -100 + self.mask.weight = torch.nn.Parameter(mask) @property def attention_dim(self): @@ -128,13 +191,18 @@ def attention_dim(self): def num_heads(self): return self.blocks[0].attn.n_head + @property + def num_blocks(self): + return len(self.blocks) + def forward( self, x: torch.Tensor, index: torch.Tensor, - mask: torch.Tensor, - *kv_cache_args, - **kv_cache_kwargs, + k_cache_cross: torch.Tensor, + v_cache_cross: torch.Tensor, + k_cache_self: torch.Tensor, + v_cache_self: torch.Tensor, ): """ Args: @@ -145,56 +213,54 @@ def forward( - index: torch.tensor, shape = (1, 1) index to get the positional encoding for x. - - mask: torch.tensor, shape = (1, max_sample_length, attn_dim) - Mask helps create kv_cache while keeping the size consistent. - - - kv_cache_args: Tuple of length 4 * num_decoder_blocks. Elements are: - - b{i}_cross_attn_k: [1, 1500, attn_dim] - b{i}_cross_attn_v: [1, 1500, attn_dim] + - k_cache_cross: key cache for cross attention: + [num_blocks, num_heads, attn_dim/num_heads, AUDIO_EMB_LEN] - for i = 0, ..., num_blocks + - v_cache_cross: value cache for cross attention: + [num_blocks, num_heads, AUDIO_EMB_LEN, attn_dim/num_heads] - followed by + - k_cache_self: key cache for self attention: + [num_blocks, num_heads, attn_dim/num_heads, self.max_decode_len] + pass zeros for first call (index 0), otherwise pass in + previous decoder output - b{i}_self_attn_k: [1, max_sample_length, attn_dim] - b{i}_self_attn_v: [1, max_sample_length, attn_dim] - - for i = 0, ..., num_blocks + - v_cache_self: value cache for self attention: + [num_blocks, num_heads, self.max_decode_len, attn_dim/num_heads] + pass zeros for first call (index 0), otherwise pass in + previous decoder output Returns: - logits: of shape [1, 1, 51864] - - b0_self_attn_k, b0_self_attn_v, b1_self_attn_k, ...: Updated self attn cache. 
- 2*num_decoder_blocks + - k_cache_self_new: updated key cache for self attention + - v_cache_self_new: updated value cache for self attention """ - if not kv_cache_args: - kv_cache_args = list(kv_cache_kwargs.values()) - assert isinstance(self.token_embedding, torch.nn.Module) # for mypy assert isinstance(self.ln, torch.nn.Module) # for mypy - assert isinstance(self.positional_embedding, torch.nn.Parameter) # for mypy + assert isinstance(self.positional_embedding, torch.nn.Embedding) # for mypy # Set up kv_cache kv_cache = {} # torch.nn.Module -> torch.Tensor for i, block in enumerate(self.blocks): kv_cache.update( { - block.attn.key: kv_cache_args[2 * self.num_blocks + i * 2], - block.attn.value: kv_cache_args[2 * self.num_blocks + i * 2 + 1], - block.cross_attn.key: kv_cache_args[i * 2], - block.cross_attn.value: kv_cache_args[i * 2 + 1], + block.attn.key: k_cache_self[i : i + 1], + block.attn.value: v_cache_self[i : i + 1], + block.cross_attn.key: k_cache_cross[i : i + 1], + block.cross_attn.value: v_cache_cross[i : i + 1], } ) - x = self.token_embedding(x) + self.positional_embedding[index.long()] + x = self.token_embedding(x) + self.positional_embedding(index) + mask = self.mask(index) # x shape: (1, 1, 384) - kv_cache_new = [] - for block in self.blocks: - x, k_cache, v_cache = block(x, index, mask, kv_cache=kv_cache) - kv_cache_new.append(k_cache.float()) - kv_cache_new.append(v_cache.float()) + k_cache_new = [] + v_cache_new = [] + for block_idx in range(self.num_blocks): + x, k_cache, v_cache = self.blocks[block_idx](x, mask, kv_cache=kv_cache) + k_cache_new.append(k_cache.float()) + v_cache_new.append(v_cache.float()) x = self.ln(x) logits = ( @@ -203,9 +269,9 @@ def forward( self.token_embedding.weight.to(x.dtype), 0, 1 # type: ignore ) ).float() + logits = self.logits(x).float() - # shape: [1, 1, 51864] - return (logits,) + tuple(kv_cache_new) + return logits, torch.cat(k_cache_new), torch.cat(v_cache_new) @staticmethod def get_input_spec( @@ -218,21 +284,23 @@ def get_input_spec( specs = dict( x=((1, 1), "int32"), index=((1, 1), "int32"), - mask=((1, MAX_DECODE_LEN, attention_dim), "int32"), - ) - for i in range(num_blocks): - specs[f"b{i}_cross_attn_k"] = ((1, 1500, attention_dim), "float32") - specs[f"b{i}_cross_attn_v"] = ((1, 1500, attention_dim), "float32") - - for i in range(num_blocks): - specs[f"b{i}_self_attn_k"] = ( - (1, MAX_DECODE_LEN, attention_dim), + k_cache_cross=( + (num_blocks, num_heads, attention_dim // num_heads, AUDIO_EMB_LEN), "float32", - ) - specs[f"b{i}_self_attn_v"] = ( - (1, MAX_DECODE_LEN, attention_dim), + ), + v_cache_cross=( + (num_blocks, num_heads, AUDIO_EMB_LEN, attention_dim // num_heads), "float32", - ) + ), + k_cache_self=( + (num_blocks, num_heads, attention_dim // num_heads, MEAN_DECODE_LEN), + "float32", + ), + v_cache_self=( + (num_blocks, num_heads, MEAN_DECODE_LEN, attention_dim // num_heads), + "float32", + ), + ) return specs @@ -246,6 +314,50 @@ def from_pretrained(cls): return Whisper.from_pretrained().decoder +class SplitLinear(torch.nn.Module): + def __init__(self, linear: torch.nn.Module, num_splits: int, scale: float = 1.0): + """ + Split Linear operation into multiple instances + Multi-head cross attention + Uses pre-computed cross kv cache passed as input to the + decoder model + """ + super().__init__() + weight = linear.weight + has_bias = False if linear.bias is None else True + if has_bias: + bias = linear.bias.reshape(num_splits, -1) * scale + split_weight = weight.reshape(num_splits, -1, weight.shape[1]) * scale + 
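+        # One small Linear per split (one per attention head); each instance's
+        # weight (and bias, if present) is overwritten below with its slice of
+        # the original projection, already pre-scaled for the attention computation.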
self.split_linears = torch.nn.ModuleList( + [ + torch.nn.Linear( + split_weight.shape[1], split_weight.shape[2], bias=has_bias + ) + for split_idx in range(num_splits) + ] + ) + for split_idx in range(num_splits): + self.split_linears[split_idx].weight = torch.nn.Parameter( + split_weight[split_idx, :, :] + ) + if has_bias: + self.split_linears[split_idx].bias = torch.nn.Parameter(bias[split_idx]) + + def forward(self, x: torch.Tensor, transpose: bool = False): + """ + produces output with dimension + [num_splits, input rows, output_features / num_splits] + If transpose is True, will transpose last two indices + """ + if transpose: + x = torch.cat( + [spl(x).transpose(-1, -2) for spl in self.split_linears], dim=-3 + ) + else: + x = torch.cat([spl(x) for spl in self.split_linears], dim=-3) + return x + + class MHAWrapper(torch.nn.Module): """ Wrapper around whisper.model.MultiHeadAttention to leverage kv cache for @@ -275,7 +387,6 @@ def __init__(self, model: whisper.model.MultiHeadAttention, attn_type: str): def forward( self, x: torch.Tensor, - index: torch.Tensor, mask: torch.Tensor, kv_cache: Dict[torch.nn.Module, torch.Tensor], ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: @@ -285,30 +396,36 @@ def forward( - x: shape [1, 1, attention_dim]. Input feature. - kv_cache: 4 * num_decoder_blocks entries representing self attention - and cross attention from all attention blocks. Each entry of shape - [1, decoded_len, attention_dim]. We'd only use cache relevant to this - particular attention layer and ignore other entries in the dict. + and cross attention from all attention blocks. Each k entry of shape + [1, num_heads, attention_dim // num_heads, context_len] and + each v entry of shape + [1, num_heads, context_len, attention_dim // num_heads]. + We'd only use cache relevant to this particular attention layer + and ignore other entries in the dict. 
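+        - mask: additive attention mask taken from the decoder's mask embedding.
+          Positions of not-yet-filled self-attention cache entries hold large
+          negative values so they contribute ~0 after softmax. Only applied in
+          the self-attention path.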
Returns: - x_out: attention output - - updated k, v cache: of shape [1, decoded_len+1, attention_dim] + - updated k, v cache: with same shape as input """ assert isinstance(self.query, torch.nn.Module) # for mypy assert isinstance(self.key, torch.nn.Module) # for mypy assert isinstance(self.value, torch.nn.Module) # for mypy assert isinstance(self.out, torch.nn.Module) # for mypy q = self.query(x) + q = q.view(q.shape[0], self.n_head, 1, -1) if self.attn_type == "self_attention": k_cache = kv_cache[self.key] v_cache = kv_cache[self.value] - k = torch.zeros(k_cache.shape) - v = torch.zeros(v_cache.shape) - k = mask * self.key(x) + k_cache - v = mask * self.value(x) + v_cache - new_index = torch.tensor([index[0, 0] + 1]).long() - wv = qkv_attention(q, k[:, :new_index], v[:, :new_index], self.n_head) + k = self.key(x).unsqueeze(3) + k = k.view(k.shape[0], self.n_head, -1, 1) + v = self.value(x).unsqueeze(2) + v = v.view(k.shape[0], self.n_head, 1, -1) + # shift kv cache and insert new k and v entries + k = torch.cat((k_cache[:, :, :, 1:], k), dim=-1) + v = torch.cat((v_cache[:, :, 1:, :], v), dim=-2) + wv = qkv_attention(q, k, v, self.n_head, mask=mask) else: # cross_attention k, v = kv_cache[self.key], kv_cache[self.value] wv = qkv_attention(q, k, v, self.n_head) @@ -327,21 +444,17 @@ def qkv_attention( """ Adapted from whisper.model.MultiHeadAttention.qkv_attention """ - n_batch, n_ctx, n_state = q.shape - - scale = (n_state // n_head) ** -0.25 - q = q.view(*q.shape[:2], n_head, -1).permute(0, 2, 1, 3) * scale - k = k.view(*k.shape[:2], n_head, -1).permute(0, 2, 3, 1) * scale - v = v.view(*v.shape[:2], n_head, -1).permute(0, 2, 1, 3) - - qk = q @ k - if mask is not None: - qk = qk + mask - # Use negative infinity to mask the zeros when doing the softmax. 
- qk = qk.float() - - w = torch.nn.functional.softmax(qk, dim=-1).to(q.dtype) - return (w @ v).permute(0, 2, 1, 3).flatten(start_dim=2) + wv_list = [] + # Split heads in qkv calculation + for h in range(n_head): + qk = q[:, h : h + 1, :, :] @ k[:, h : h + 1, :, :] + if mask is not None: + qk = qk + mask + w = torch.nn.functional.softmax(qk, dim=-1) + wv_list.append(w @ v[:, h : h + 1, :, :]) + wv = torch.cat(wv_list, dim=1) + wv = wv.view(wv.shape[0], 1, -1) + return wv class ResidualAttentionBlockWrapper(torch.nn.Module): @@ -357,14 +470,26 @@ def __init__(self, model: whisper.model.ResidualAttentionBlock): assert isinstance(model, whisper.model.ResidualAttentionBlock) # Wraps `MultiheadAttention` to `MultiheadAttentionWrapper` self.attn = MHAWrapper(model.attn, "self_attention") + + states_per_head = model.attn.query.weight.shape[0] // model.attn.n_head + scale = states_per_head**-0.25 self.cross_attn = MHAWrapper(model.cross_attn, "cross_attention") + + # Apply scale for qkv to parameters + with torch.no_grad(): + self.attn.query.weight *= scale + self.attn.query.bias *= scale + self.attn.key.weight *= scale + self.cross_attn.query.weight *= scale + self.cross_attn.query.bias *= scale + self.cross_attn.key.weight *= scale + for m in ["attn_ln", "cross_attn_ln", "mlp", "mlp_ln"]: self.add_module(m, getattr(model, m)) def forward( self, x: torch.Tensor, - index: torch.Tensor, mask: torch.Tensor, kv_cache: Dict[torch.nn.Module, torch.Tensor], ): @@ -380,14 +505,14 @@ def forward( assert isinstance(self.mlp, torch.nn.Module) # for mypy assert isinstance(self.mlp_ln, torch.nn.Module) # for mypy x_attn, k_cache, v_cache = self.attn( - self.attn_ln(x), index=index, mask=mask, kv_cache=kv_cache + self.attn_ln(x), mask=mask, kv_cache=kv_cache ) x = x + x_attn if self.cross_attn: # Ignore cross attn kv cache which is constant (pre-computed in # `WhisperCrossAttnKVCacheTorch`) x_cross_attn, _, _ = self.cross_attn( - self.cross_attn_ln(x), index=index, mask=mask, kv_cache=kv_cache + self.cross_attn_ln(x), mask=mask, kv_cache=kv_cache ) x = x + x_cross_attn x = x + self.mlp(self.mlp_ln(x)) diff --git a/qai_hub_models/models/_shared/whisper/test_utils.py b/qai_hub_models/models/_shared/whisper/test_utils.py index b3657a76..9b4fb089 100644 --- a/qai_hub_models/models/_shared/whisper/test_utils.py +++ b/qai_hub_models/models/_shared/whisper/test_utils.py @@ -13,7 +13,7 @@ ) from qai_hub_models.models._shared.whisper.demo import TEST_AUDIO_PATH from qai_hub_models.models._shared.whisper.model import ( - MAX_DECODE_LEN, + MEAN_DECODE_LEN, MEL_FILTER_PATH, Whisper, WhisperDecoderInf, @@ -49,19 +49,36 @@ def run_test_wrapper_numerics(whisper_version): encoder = WhisperEncoderInf(model) decoder = WhisperDecoderInf(model.decoder) - cross_attn_cache = encoder(mel_input) - sample_len = MAX_DECODE_LEN - cache_tensor = np.zeros([1, sample_len, decoder.attention_dim]).astype(np.float32) + k_cache_cross, v_cache_cross = encoder(mel_input) + sample_len = MEAN_DECODE_LEN + + k_cache_self = torch.zeros( + ( + decoder.num_blocks, + decoder.num_heads, + decoder.attention_dim // decoder.num_heads, + sample_len, + ), + dtype=torch.float32, + ) + v_cache_self = torch.zeros( + ( + decoder.num_blocks, + decoder.num_heads, + sample_len, + decoder.attention_dim // decoder.num_heads, + ), + dtype=torch.float32, + ) index = torch.zeros([1, 1], dtype=torch.int32) index[0, 0] = 0 - mask = torch.zeros(1, sample_len, decoder.attention_dim, dtype=torch.bool) - mask[:, 0, :] = 1 - self_attn_cache = [cache_tensor] * 2 * 
decoder.num_blocks with torch.no_grad(): - decoder_out = decoder(tokens, index, mask, *cross_attn_cache, *self_attn_cache) + decoder_out = decoder( + tokens, index, k_cache_cross, v_cache_cross, k_cache_self, v_cache_self + ) logits = decoder_out[0].detach().numpy() - np.testing.assert_allclose(logits_orig, logits) + np.testing.assert_allclose(logits_orig, logits, rtol=5e-3) def run_test_transcribe(whisper_version): diff --git a/qai_hub_models/models/_shared/yolo/demo.py b/qai_hub_models/models/_shared/yolo/demo.py index 9da62ba3..046bed7d 100644 --- a/qai_hub_models/models/_shared/yolo/demo.py +++ b/qai_hub_models/models/_shared/yolo/demo.py @@ -29,6 +29,7 @@ def yolo_detection_demo( default_image: str | CachedWebAsset, stride_multiple: int | None = None, is_test: bool = False, + default_score_threshold: float = 0.45, ): # Demo parameters parser = get_model_cli_parser(model_type) @@ -40,7 +41,7 @@ def yolo_detection_demo( parser.add_argument( "--score-threshold", type=float, - default=0.45, + default=default_score_threshold, help="Score threshold for NonMaximumSuppression", ) parser.add_argument( diff --git a/qai_hub_models/models/aotgan/README.md b/qai_hub_models/models/aotgan/README.md index 5e0f84a4..dd02b51a 100644 --- a/qai_hub_models/models/aotgan/README.md +++ b/qai_hub_models/models/aotgan/README.md @@ -14,6 +14,8 @@ accross various devices, can be found [here](https://aihub.qualcomm.com/models/a a hosted Qualcomm® device. + + ## Example & Usage diff --git a/qai_hub_models/models/aotgan/export.py b/qai_hub_models/models/aotgan/export.py index 031cded6..9b628a56 100644 --- a/qai_hub_models/models/aotgan/export.py +++ b/qai_hub_models/models/aotgan/export.py @@ -120,12 +120,17 @@ def export_model( model.eval() source_model = torch.jit.trace(model.to("cpu"), make_torch_inputs(input_spec)) + # Convert outputs from channel last to channel first (preferred I/O format for QNN and TensorFlow Lite) + channel_last_flags = ( + " --force_channel_last_input image,mask" + + " --force_channel_last_output output_0" + if target_runtime != TargetRuntime.ORT + else "" + ) + # 2. 
Compile the model to an on-device asset model_compile_options = model.get_hub_compile_options( - target_runtime, - compile_options - + " --force_channel_last_input image,mask" - + " --force_channel_last_output output_0", + target_runtime, compile_options + channel_last_flags, hub_device ) print(f"Optimizing model {model_name} to run on-device") submitted_compile_job = hub.submit_compile_job( @@ -163,8 +168,12 @@ def export_model( ) sample_inputs = model.sample_inputs(input_spec) # Convert inputs from channel first to channel last - hub_inputs = transpose_channel_first_to_last( - "image,mask", sample_inputs, target_runtime + hub_inputs = ( + sample_inputs + if target_runtime == TargetRuntime.ORT + else transpose_channel_first_to_last( + "image,mask", sample_inputs, target_runtime + ) ) submitted_inference_job = hub.submit_inference_job( model=compile_job.get_target_model(), @@ -192,8 +201,12 @@ def export_model( assert inference_job is not None and inference_job.wait().success inference_result: hub.client.DatasetEntries = inference_job.download_output_data() # type: ignore # Convert outputs from channel last to channel first - inference_result = transpose_channel_last_to_first( - "output_0", inference_result, target_runtime + inference_result = ( + inference_result + if target_runtime == TargetRuntime.ORT + else transpose_channel_last_to_first( + "output_0", inference_result, target_runtime + ) ) print_inference_metrics(inference_job, inference_result, torch_out) diff --git a/qai_hub_models/models/aotgan/perf.yaml b/qai_hub_models/models/aotgan/perf.yaml index a83056a2..7f8e4c42 100644 --- a/qai_hub_models/models/aotgan/perf.yaml +++ b/qai_hub_models/models/aotgan/perf.yaml @@ -22,6 +22,7 @@ aggregated: - Samsung Galaxy S24 Ultra - Samsung Galaxy S24+ - Samsung Galaxy Tab S8 + - Snapdragon X Elite CRD - Xiaomi 12 - Xiaomi 12 Pro supported_chipsets: @@ -30,15 +31,16 @@ aggregated: - Snapdragon® 8 Gen 2 - Snapdragon® 8 Gen 3 - Snapdragon® 888 + - Snapdragon® X Elite models: - name: AOT-GAN performance_metrics: - torchscript_onnx_tflite: - inference_time: 172218.0 - throughput: 5.806593968110186 + inference_time: 164598.0 + throughput: 6.075407963644759 estimated_peak_memory_range: - min: 3301376 - max: 6608312 + min: 4349952 + max: 7789760 primary_compute_unit: NPU precision: fp16 layer_info: @@ -46,14 +48,14 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 235 - job_id: jlpeelxop + job_id: jmg9werlp job_status: Passed torchscript_onnx_qnn: - inference_time: 162527.0 - throughput: 6.15282383850068 + inference_time: 164540.0 + throughput: 6.077549532028686 estimated_peak_memory_range: - min: 4247552 - max: 34036840 + min: 4341760 + max: 36913480 primary_compute_unit: NPU precision: fp16 layer_info: @@ -61,7 +63,7 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 275 - job_id: jz5w21z35 + job_id: jz57x3mlg job_status: Passed torchscript_onnx_ort: inference_time: 'null' @@ -76,7 +78,7 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 0 - job_id: jnp1yv18p + job_id: jegn384r5 job_status: Failed reference_device_info: name: Samsung Galaxy S23 @@ -85,13 +87,13 @@ models: os_name: Android manufacturer: Samsung chipset: Snapdragon® 8 Gen 2 - timestamp: '2024-04-23T18:42:31.302216Z' + timestamp: '2024-05-20T16:35:27.553176Z' - torchscript_onnx_tflite: - inference_time: 126778.0 - throughput: 7.887803877644386 + inference_time: 120809.0 + throughput: 8.277528992045294 estimated_peak_memory_range: - min: 2174976 - max: 256099504 + min: 2879488 + max: 222384800 
primary_compute_unit: NPU precision: fp16 layer_info: @@ -99,14 +101,14 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 235 - job_id: jygzo4yo5 + job_id: jnp1ex92g job_status: Passed torchscript_onnx_qnn: - inference_time: 119306.0 - throughput: 8.381808123648433 + inference_time: 121163.0 + throughput: 8.253344667926678 estimated_peak_memory_range: - min: 3887104 - max: 166111904 + min: 3252224 + max: 144610800 primary_compute_unit: NPU precision: fp16 layer_info: @@ -114,7 +116,7 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 275 - job_id: jmg9jx2w5 + job_id: jqp4v07vp job_status: Passed torchscript_onnx_ort: inference_time: 'null' @@ -129,7 +131,7 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 0 - job_id: jvgdez4r5 + job_id: joprejr95 job_status: Failed reference_device_info: name: Samsung Galaxy S24 @@ -138,13 +140,13 @@ models: os_name: Android manufacturer: Samsung chipset: Snapdragon® 8 Gen 3 - timestamp: '2024-04-23T18:42:31.302592Z' + timestamp: '2024-05-20T16:35:27.684430Z' - torchscript_onnx_tflite: - inference_time: 171670.0 - throughput: 5.825129609133803 + inference_time: 161130.0 + throughput: 6.206168931918326 estimated_peak_memory_range: - min: 3219456 - max: 6614600 + min: 3170304 + max: 13340440 primary_compute_unit: NPU precision: fp16 layer_info: @@ -152,14 +154,14 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 235 - job_id: jopr8dz05 + job_id: jvgdolkep job_status: Passed torchscript_onnx_qnn: - inference_time: 162527.0 - throughput: 6.15282383850068 + inference_time: 164457.0 + throughput: 6.080616817769995 estimated_peak_memory_range: - min: 4337664 - max: 32953256 + min: 4214784 + max: 29715440 primary_compute_unit: NPU precision: fp16 layer_info: @@ -167,7 +169,7 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 275 - job_id: j1p80rlkg + job_id: jo5m3y7wg job_status: Passed reference_device_info: name: QCS8550 (Proxy) @@ -176,4 +178,57 @@ models: os_name: Android manufacturer: Qualcomm chipset: Qcs8550 - timestamp: '2024-04-23T18:42:31.302905Z' + timestamp: '2024-05-20T16:35:27.816239Z' + - torchscript_onnx_qnn: + inference_time: 145454.0 + throughput: 6.87502578134668 + estimated_peak_memory_range: + min: 4202496 + max: 4202496 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 275 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 275 + job_id: j0pxy2q1g + job_status: Passed + torchscript_onnx_ort: + inference_time: 'null' + throughput: 'null' + estimated_peak_memory_range: + min: 0 + max: 0 + primary_compute_unit: 'null' + precision: 'null' + layer_info: + layers_on_npu: 0 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 0 + job_id: jep2ln14g + job_status: Failed + torchscript_onnx_ort_dml_gpu: + inference_time: 'null' + throughput: 'null' + estimated_peak_memory_range: + min: 0 + max: 0 + primary_compute_unit: 'null' + precision: 'null' + layer_info: + layers_on_npu: 0 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 0 + job_id: jqpy60l75 + job_status: Failed + reference_device_info: + name: Snapdragon X Elite CRD + os: '11' + form_factor: Compute + os_name: Windows + manufacturer: Qualcomm + chipset: Snapdragon® X Elite + timestamp: '2024-05-20T16:35:27.949164Z' diff --git a/qai_hub_models/models/baichuan_7b_quantized/README.md b/qai_hub_models/models/baichuan_7b_quantized/README.md index 0e5427aa..cff30e46 100644 --- a/qai_hub_models/models/baichuan_7b_quantized/README.md +++ b/qai_hub_models/models/baichuan_7b_quantized/README.md @@ -15,6 +15,8 @@ a hosted 
Qualcomm® device. + + ## License - The license for the original implementation of Baichuan-7B can be found [here](https://github.com/baichuan-inc/Baichuan-7B/blob/main/LICENSE). @@ -29,3 +31,25 @@ a hosted Qualcomm® device. * For questions or feedback please [reach out to us](mailto:ai-hub-support@qti.qualcomm.com). +## Usage and Limitations + +This model may not be used for or in connection with any of the following applications: + +- Accessing essential private and public services and benefits; +- Administration of justice and democratic processes; +- Assessing or recognizing the emotional state of a person; +- Biometric and biometrics-based systems, including categorization of persons based on sensitive characteristics; +- Education and vocational training; +- Employment and workers management; +- Exploitation of the vulnerabilities of persons resulting in harmful behavior; +- General purpose social scoring; +- Law enforcement; +- Management and operation of critical infrastructure; +- Migration, asylum and border control management; +- Predictive policing; +- Real-time remote biometric identification in public spaces; +- Recommender systems of social media platforms; +- Scraping of facial images (from the internet or otherwise); and/or +- Subliminal manipulation + + diff --git a/qai_hub_models/models/controlnet_quantized/README.md b/qai_hub_models/models/controlnet_quantized/README.md index db6e93dc..2c155773 100644 --- a/qai_hub_models/models/controlnet_quantized/README.md +++ b/qai_hub_models/models/controlnet_quantized/README.md @@ -14,6 +14,8 @@ accross various devices, can be found [here](https://aihub.qualcomm.com/models/c a hosted Qualcomm® device. + + ## Example & Usage Install the package via pip: diff --git a/qai_hub_models/models/controlnet_quantized/info.yaml b/qai_hub_models/models/controlnet_quantized/info.yaml index 97f9fa98..5c000cba 100644 --- a/qai_hub_models/models/controlnet_quantized/info.yaml +++ b/qai_hub_models/models/controlnet_quantized/info.yaml @@ -29,7 +29,7 @@ applicable_scenarios: - Image Editing - Content Creation related_models: - - stable_diffusion_quantized + - stable_diffusion_v1_5_quantized form_factors: - Phone - Tablet diff --git a/qai_hub_models/models/controlnet_quantized/test.py b/qai_hub_models/models/controlnet_quantized/test.py index 18c31392..d7b23999 100644 --- a/qai_hub_models/models/controlnet_quantized/test.py +++ b/qai_hub_models/models/controlnet_quantized/test.py @@ -2,13 +2,12 @@ # Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. 
# SPDX-License-Identifier: BSD-3-Clause # --------------------------------------------------------------------- -import tempfile - import pytest from qai_hub_models.models.controlnet_quantized.demo import main as demo_main from qai_hub_models.models.controlnet_quantized.export import export_model from qai_hub_models.models.controlnet_quantized.model import ControlNetQuantized +from qai_hub_models.utils.asset_loaders import qaihm_temp_dir def test_from_precompiled(): @@ -18,7 +17,7 @@ def test_from_precompiled(): @pytest.mark.skip("#105 move slow_cloud and slow tests to nightly.") @pytest.mark.slow_cloud def test_export(): - with tempfile.TemporaryDirectory() as tmpdir: + with qaihm_temp_dir() as tmpdir: exported_jobs = export_model( # Testing text_encoder as it's smallest model in # ControlNet pipeline diff --git a/qai_hub_models/models/convnext_tiny/README.md b/qai_hub_models/models/convnext_tiny/README.md index bd6e6674..9e71c767 100644 --- a/qai_hub_models/models/convnext_tiny/README.md +++ b/qai_hub_models/models/convnext_tiny/README.md @@ -14,6 +14,8 @@ accross various devices, can be found [here](https://aihub.qualcomm.com/models/c a hosted Qualcomm® device. + + ## Example & Usage diff --git a/qai_hub_models/models/convnext_tiny/export.py b/qai_hub_models/models/convnext_tiny/export.py index f00fe11b..54ff513c 100644 --- a/qai_hub_models/models/convnext_tiny/export.py +++ b/qai_hub_models/models/convnext_tiny/export.py @@ -119,9 +119,16 @@ def export_model( model.eval() source_model = torch.jit.trace(model.to("cpu"), make_torch_inputs(input_spec)) + # Convert outputs from channel last to channel first (preferred I/O format for QNN and TensorFlow Lite) + channel_last_flags = ( + " --force_channel_last_input image_tensor" + if target_runtime != TargetRuntime.ORT + else "" + ) + # 2. 
Compile the model to an on-device asset model_compile_options = model.get_hub_compile_options( - target_runtime, compile_options + " --force_channel_last_input image_tensor" + target_runtime, compile_options + channel_last_flags, hub_device ) print(f"Optimizing model {model_name} to run on-device") submitted_compile_job = hub.submit_compile_job( @@ -159,8 +166,12 @@ def export_model( ) sample_inputs = model.sample_inputs(input_spec) # Convert inputs from channel first to channel last - hub_inputs = transpose_channel_first_to_last( - "image_tensor", sample_inputs, target_runtime + hub_inputs = ( + sample_inputs + if target_runtime == TargetRuntime.ORT + else transpose_channel_first_to_last( + "image_tensor", sample_inputs, target_runtime + ) ) submitted_inference_job = hub.submit_inference_job( model=compile_job.get_target_model(), @@ -197,7 +208,7 @@ def export_model( def main(): warnings.filterwarnings("ignore") - parser = export_parser(model_cls=Model, supports_qnn=False, supports_ort=False) + parser = export_parser(model_cls=Model) args = parser.parse_args() export_model(**vars(args)) diff --git a/qai_hub_models/models/convnext_tiny/perf.yaml b/qai_hub_models/models/convnext_tiny/perf.yaml index f8254acd..9d8dec94 100644 --- a/qai_hub_models/models/convnext_tiny/perf.yaml +++ b/qai_hub_models/models/convnext_tiny/perf.yaml @@ -8,6 +8,7 @@ aggregated: - Google Pixel 4 - Google Pixel 4a - Google Pixel 5a 5G + - QCS8550 (Proxy) - Samsung Galaxy S21 - Samsung Galaxy S21 Ultra - Samsung Galaxy S21+ @@ -21,30 +22,63 @@ aggregated: - Samsung Galaxy S24 Ultra - Samsung Galaxy S24+ - Samsung Galaxy Tab S8 + - Snapdragon X Elite CRD - Xiaomi 12 - Xiaomi 12 Pro supported_chipsets: + - Qcs8550 - Snapdragon® 8 Gen 1 - Snapdragon® 8 Gen 2 - Snapdragon® 8 Gen 3 - Snapdragon® 888 + - Snapdragon® X Elite models: - name: ConvNext-Tiny performance_metrics: - torchscript_onnx_tflite: - inference_time: 11504.0 - throughput: 86.92628650904034 + inference_time: 5710.0 + throughput: 175.13134851138355 + estimated_peak_memory_range: + min: 49152 + max: 2555016 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 328 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 328 + job_id: j2p0l7w6p + job_status: Passed + torchscript_onnx_qnn: + inference_time: 3790.0 + throughput: 263.85224274406335 + estimated_peak_memory_range: + min: 86016 + max: 170428944 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 223 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 223 + job_id: jn5q3on4p + job_status: Passed + torchscript_onnx_ort: + inference_time: 16263.0 + throughput: 61.48927012236365 estimated_peak_memory_range: min: 32768 - max: 2493040 + max: 155815696 primary_compute_unit: NPU precision: fp16 layer_info: - layers_on_npu: 380 + layers_on_npu: 189 layers_on_gpu: 0 layers_on_cpu: 0 - total_layers: 380 - job_id: jlpeoyo7g + total_layers: 189 + job_id: jwgo3qxxg job_status: Passed reference_device_info: name: Samsung Galaxy S23 @@ -53,36 +87,51 @@ models: os_name: Android manufacturer: Samsung chipset: Snapdragon® 8 Gen 2 - timestamp: '2024-04-02T15:30:19.195043Z' + timestamp: '2024-05-20T16:35:27.983972Z' + - torchscript_onnx_tflite: + inference_time: 3967.0 + throughput: 252.07965717166624 + estimated_peak_memory_range: + min: 16384 + max: 210597920 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 328 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 328 + job_id: j1p8zvnxp + job_status: Passed torchscript_onnx_qnn: - 
inference_time: 'null' - throughput: 'null' + inference_time: 2727.0 + throughput: 366.70333700036673 estimated_peak_memory_range: min: 0 - max: 0 - primary_compute_unit: 'null' - precision: 'null' + max: 88673616 + primary_compute_unit: NPU + precision: fp16 layer_info: - layers_on_npu: 0 + layers_on_npu: 223 layers_on_gpu: 0 layers_on_cpu: 0 - total_layers: 0 - job_id: '' - job_status: Skipped - - torchscript_onnx_tflite: - inference_time: 8139.0 - throughput: 122.86521685710775 + total_layers: 223 + job_id: j1gl3rd8g + job_status: Passed + torchscript_onnx_ort: + inference_time: 11672.0 + throughput: 85.67511994516792 estimated_peak_memory_range: - min: 20480 - max: 209217264 + min: 618496 + max: 58874592 primary_compute_unit: NPU precision: fp16 layer_info: - layers_on_npu: 380 + layers_on_npu: 189 layers_on_gpu: 0 layers_on_cpu: 0 - total_layers: 380 - job_id: jygz2n2zg + total_layers: 189 + job_id: j1pvvx8jp job_status: Passed reference_device_info: name: Samsung Galaxy S24 @@ -91,19 +140,95 @@ models: os_name: Android manufacturer: Samsung chipset: Snapdragon® 8 Gen 3 - timestamp: '2024-04-02T15:30:19.195057Z' + timestamp: '2024-05-20T16:35:27.984000Z' + - torchscript_onnx_tflite: + inference_time: 5754.0 + throughput: 173.79214459506431 + estimated_peak_memory_range: + min: 24576 + max: 2610064 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 328 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 328 + job_id: jogk3m125 + job_status: Passed torchscript_onnx_qnn: - inference_time: 'null' - throughput: 'null' + inference_time: 3773.0 + throughput: 265.041081367612 estimated_peak_memory_range: - min: 0 - max: 0 - primary_compute_unit: 'null' - precision: 'null' + min: 90112 + max: 202074560 + primary_compute_unit: NPU + precision: fp16 layer_info: - layers_on_npu: 0 + layers_on_npu: 223 layers_on_gpu: 0 layers_on_cpu: 0 - total_layers: 0 - job_id: '' - job_status: Skipped + total_layers: 223 + job_id: j1p3e2dl5 + job_status: Passed + reference_device_info: + name: QCS8550 (Proxy) + os: '12' + form_factor: Iot + os_name: Android + manufacturer: Qualcomm + chipset: Qcs8550 + timestamp: '2024-05-20T16:35:27.984018Z' + - torchscript_onnx_qnn: + inference_time: 3953.0 + throughput: 252.97242600556538 + estimated_peak_memory_range: + min: 602112 + max: 602112 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 223 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 223 + job_id: jw56nlx0g + job_status: Passed + torchscript_onnx_ort: + inference_time: 17160.0 + throughput: 58.27505827505828 + estimated_peak_memory_range: + min: 441618432 + max: 441618432 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 189 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 189 + job_id: j7gje49x5 + job_status: Passed + torchscript_onnx_ort_dml_gpu: + inference_time: 36452.0 + throughput: 27.4333369911116 + estimated_peak_memory_range: + min: 1425408 + max: 1425408 + primary_compute_unit: CPU + precision: fp32 + layer_info: + layers_on_npu: 0 + layers_on_gpu: 0 + layers_on_cpu: 202 + total_layers: 202 + job_id: jlpek3q1p + job_status: Passed + reference_device_info: + name: Snapdragon X Elite CRD + os: '11' + form_factor: Compute + os_name: Windows + manufacturer: Qualcomm + chipset: Snapdragon® X Elite + timestamp: '2024-05-20T16:35:27.984040Z' diff --git a/qai_hub_models/models/convnext_tiny_w8a16_quantized/README.md b/qai_hub_models/models/convnext_tiny_w8a16_quantized/README.md new file mode 100644 index 
00000000..4266dbef --- /dev/null +++ b/qai_hub_models/models/convnext_tiny_w8a16_quantized/README.md @@ -0,0 +1,56 @@ +[![Qualcomm® AI Hub Models](https://qaihub-public-assets.s3.us-west-2.amazonaws.com/qai-hub-models/quic-logo.jpg)](../../README.md) + + +# [ConvNext-Tiny-w8a16-Quantized: Imagenet classifier and general purpose backbone](#) + +ConvNextTiny is a machine learning model that can classify images from the Imagenet dataset. It can also be used as a backbone in building more complex models for specific use cases. + +This is based on the implementation of ConvNext-Tiny-w8a16-Quantized found +[here](https://github.com/pytorch/vision/blob/main/torchvision/models/convnext.py). This repository contains scripts for optimized on-device +export suitable to run on Qualcomm® devices. More details on model performance +accross various devices, can be found [here](#). + +[Sign up](https://myaccount.qualcomm.com/signup) for early access to run these models on +a hosted Qualcomm® device. + + + + +## Example & Usage + + +Once installed, run the following simple CLI demo: + +```bash +python -m qai_hub_models.models.convnext_tiny_w8a16_quantized.demo +``` +More details on the CLI tool can be found with the `--help` option. See +[demo.py](demo.py) for sample usage of the model including pre/post processing +scripts. Please refer to our [general instructions on using +models](../../../#getting-started) for more usage instructions. + +## Export for on-device deployment + +This repository contains export scripts that produce a model optimized for +on-device deployment. This can be run as follows: + +```bash +python -m qai_hub_models.models.convnext_tiny_w8a16_quantized.export +``` +Additional options are documented with the `--help` option. Note that the above +script requires access to Deployment instructions for Qualcomm® AI Hub. + +## License +- The license for the original implementation of ConvNext-Tiny-w8a16-Quantized can be found + [here](https://github.com/pytorch/vision/blob/main/LICENSE). +- The license for the compiled assets for on-device deployment can be found [here]({deploy_license_url}) + +## References +* [A ConvNet for the 2020s](https://arxiv.org/abs/2201.03545) +* [Source Model Implementation](https://github.com/pytorch/vision/blob/main/torchvision/models/convnext.py) + +## Community +* Join [our AI Hub Slack community](https://qualcomm-ai-hub.slack.com/join/shared_invite/zt-2d5zsmas3-Sj0Q9TzslueCjS31eXG2UA#/shared-invite/email) to collaborate, post questions and learn more about on-device AI. +* For questions or feedback please [reach out to us](mailto:ai-hub-support@qti.qualcomm.com). + + diff --git a/qai_hub_models/models/convnext_tiny_w8a16_quantized/__init__.py b/qai_hub_models/models/convnext_tiny_w8a16_quantized/__init__.py new file mode 100644 index 00000000..599858c4 --- /dev/null +++ b/qai_hub_models/models/convnext_tiny_w8a16_quantized/__init__.py @@ -0,0 +1,10 @@ +# --------------------------------------------------------------------- +# Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. 
+# SPDX-License-Identifier: BSD-3-Clause +# --------------------------------------------------------------------- +from qai_hub_models.models._shared.imagenet_classifier.app import ( # noqa: F401 + ImagenetClassifierApp as App, +) + +from .model import MODEL_ID # noqa: F401 +from .model import ConvNextTinyW8A16Quantizable as Model # noqa: F401 diff --git a/qai_hub_models/models/convnext_tiny_w8a16_quantized/conftest.py b/qai_hub_models/models/convnext_tiny_w8a16_quantized/conftest.py new file mode 100644 index 00000000..1f2c01d5 --- /dev/null +++ b/qai_hub_models/models/convnext_tiny_w8a16_quantized/conftest.py @@ -0,0 +1,39 @@ +# --------------------------------------------------------------------- +# Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. +# SPDX-License-Identifier: BSD-3-Clause +# --------------------------------------------------------------------- +# THIS FILE WAS AUTO-GENERATED. DO NOT EDIT MANUALLY. + +import inspect + +import pytest + +from qai_hub_models.models.convnext_tiny_w8a16_quantized import Model +from qai_hub_models.utils.testing import skip_clone_repo_check + + +# Instantiate the model only once for all tests. +# Mock from_pretrained to always return the initialized model. +# This speeds up tests and limits memory leaks. +@pytest.fixture(scope="module", autouse=True) +def cached_from_pretrained(): + with pytest.MonkeyPatch.context() as mp: + pretrained_cache = {} + from_pretrained = Model.from_pretrained + sig = inspect.signature(from_pretrained) + + @skip_clone_repo_check + def _cached_from_pretrained(*args, **kwargs): + cache_key = str(args) + str(kwargs) + model = pretrained_cache.get(cache_key, None) + if model: + return model + else: + model = from_pretrained(*args, **kwargs) + pretrained_cache[cache_key] = model + return model + + _cached_from_pretrained.__signature__ = sig + + mp.setattr(Model, "from_pretrained", _cached_from_pretrained) + yield mp diff --git a/qai_hub_models/models/convnext_tiny_w8a16_quantized/demo.py b/qai_hub_models/models/convnext_tiny_w8a16_quantized/demo.py new file mode 100644 index 00000000..927a3d6c --- /dev/null +++ b/qai_hub_models/models/convnext_tiny_w8a16_quantized/demo.py @@ -0,0 +1,17 @@ +# --------------------------------------------------------------------- +# Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. +# SPDX-License-Identifier: BSD-3-Clause +# --------------------------------------------------------------------- +from qai_hub_models.models._shared.imagenet_classifier.demo import imagenet_demo +from qai_hub_models.models.convnext_tiny_w8a16_quantized.model import ( + MODEL_ID, + ConvNextTinyW8A16Quantizable, +) + + +def main(is_test: bool = False): + imagenet_demo(ConvNextTinyW8A16Quantizable, MODEL_ID, is_test) + + +if __name__ == "__main__": + main() diff --git a/qai_hub_models/models/convnext_tiny_w8a16_quantized/export.py b/qai_hub_models/models/convnext_tiny_w8a16_quantized/export.py new file mode 100644 index 00000000..7dcb6b96 --- /dev/null +++ b/qai_hub_models/models/convnext_tiny_w8a16_quantized/export.py @@ -0,0 +1,227 @@ +# --------------------------------------------------------------------- +# Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. +# SPDX-License-Identifier: BSD-3-Clause +# --------------------------------------------------------------------- +# THIS FILE WAS AUTO-GENERATED. DO NOT EDIT MANUALLY. 
+ + +from __future__ import annotations + +import os +import warnings +from pathlib import Path +from typing import Any, Dict, List, Optional, Tuple, cast + +import qai_hub as hub + +from qai_hub_models.models.convnext_tiny_w8a16_quantized import Model +from qai_hub_models.utils.args import ( + export_parser, + get_input_spec_kwargs, + get_model_kwargs, +) +from qai_hub_models.utils.base_model import TargetRuntime +from qai_hub_models.utils.compare import torch_inference +from qai_hub_models.utils.printing import ( + print_inference_metrics, + print_on_target_demo_cmd, + print_profile_metrics_from_job, +) +from qai_hub_models.utils.qai_hub_helpers import ( + can_access_qualcomm_ai_hub, + export_without_hub_access, + transpose_channel_first_to_last, +) +from qai_hub_models.utils.qnn_helpers import get_qnn_inputs + + +def export_model( + device: str = "Samsung Galaxy S23", + chipset: Optional[str] = None, + skip_profiling: bool = False, + skip_inferencing: bool = False, + skip_downloading: bool = False, + skip_summary: bool = False, + output_dir: Optional[str] = None, + target_runtime: TargetRuntime = TargetRuntime.TFLITE, + compile_options: str = "", + profile_options: str = "", + **additional_model_kwargs, +) -> Tuple[hub.CompileJob, Optional[hub.ProfileJob], Optional[hub.InferenceJob]] | List[ + str +]: + """ + This function accomplishes 6 main tasks: + + 1. Instantiates a PyTorch model and converts it to a traced TorchScript format. + 2. Compiles the model to an asset that can be run on device. + 3. Profiles the model performance on real devices. + 4. Inferences the model on sample inputs. + 5. Downloads the model asset to the local directory. + 6. Summarizes the results from profiling and inference. + + Each of the last four steps can be optionally skipped using the input options. + + Parameters: + device: Device for which to export the model. + Full list of available devices can be found by running `hub.get_devices()`. + Defaults to DEFAULT_DEVICE if not specified. + chipset: If set, will choose a random device with this chipset. + Overrides the `device` argument. + skip_profiling: If set, skips profiling of compiled model on real devices. + skip_inferencing: If set, skips computing on-device outputs from sample data. + skip_downloading: If set, skips downloading of compiled model. + skip_summary: If set, skips waiting for and summarizing results + from profiling and inference. + output_dir: Directory to store generated assets (e.g. compiled model). + Defaults to `/build/`. + target_runtime: Which on-device runtime to target. Default is TFLite. + compile_options: Additional options to pass when submitting the compile job. + profile_options: Additional options to pass when submitting the profile job. + **additional_model_kwargs: Additional optional kwargs used to customize + `model_cls.from_pretrained` and `model.get_input_spec` + + Returns: + A 3-tuple of: + * A CompileJob object containing metadata about the compile job submitted to hub. + * A ProfileJob containing metadata about the profile job (None if profiling skipped). + * An InferenceJob containing metadata about the inference job (None if inferencing skipped). 
+ """ + model_name = "convnext_tiny_w8a16_quantized" + output_path = Path(output_dir or Path.cwd() / "build" / model_name) + if chipset: + hub_device = hub.Device(attributes=f"chipset:{chipset}") + else: + hub_device = hub.Device(name=device) + if not can_access_qualcomm_ai_hub(): + return export_without_hub_access( + "convnext_tiny_w8a16_quantized", + "ConvNext-Tiny-w8a16-Quantized", + device, + skip_profiling, + skip_inferencing, + skip_downloading, + skip_summary, + output_path, + target_runtime, + compile_options, + profile_options, + ) + + # 1. Initialize PyTorch model + model = Model.from_pretrained(**get_model_kwargs(Model, additional_model_kwargs)) + input_spec = model.get_input_spec( + **get_input_spec_kwargs(model, additional_model_kwargs) + ) + + # Trace the model + source_model = model.convert_to_hub_source_model( + target_runtime, output_path, input_spec + ) + if target_runtime == TargetRuntime.TFLITE: + quant_calibration_data = None + else: + quant_calibration_data = model.get_calibration_data(target_runtime, input_spec) + + # Convert outputs from channel last to channel first (preferred I/O format for QNN and TensorFlow Lite) + channel_last_flags = ( + " --force_channel_last_input image_tensor" + if target_runtime != TargetRuntime.ORT + else "" + ) + + # 2. Compile the model to an on-device asset + model_compile_options = model.get_hub_compile_options( + target_runtime, compile_options + channel_last_flags, hub_device + ) + print(f"Optimizing model {model_name} to run on-device") + submitted_compile_job = hub.submit_compile_job( + model=source_model, + input_specs=input_spec, + device=hub_device, + name=model_name, + calibration_data=quant_calibration_data, + options=model_compile_options, + ) + compile_job = cast(hub.client.CompileJob, submitted_compile_job) + + # 3. Profile the model asset on real devices + profile_job: Optional[hub.client.ProfileJob] = None + if not skip_profiling: + profile_options_all = model.get_hub_profile_options( + target_runtime, profile_options + ) + print(f"Profiling model {model_name} on a hosted device.") + submitted_profile_job = hub.submit_profile_job( + model=compile_job.get_target_model(), + device=hub_device, + name=model_name, + options=profile_options_all, + ) + profile_job = cast(hub.client.ProfileJob, submitted_profile_job) + + # 4. Run inference on-device with sample inputs + inference_job: Optional[hub.client.InferenceJob] = None + if not skip_inferencing: + profile_options_all = model.get_hub_profile_options( + target_runtime, profile_options + ) + print( + f"Running inference for {model_name} on a hosted device with example inputs." + ) + sample_inputs = model.sample_inputs(input_spec) + hub_inputs = sample_inputs + if target_runtime == TargetRuntime.QNN: + hub_inputs = get_qnn_inputs(compile_job, sample_inputs) + # Convert inputs from channel first to channel last + hub_inputs = ( + sample_inputs + if target_runtime == TargetRuntime.ORT + else transpose_channel_first_to_last( + "image_tensor", sample_inputs, target_runtime + ) + ) + submitted_inference_job = hub.submit_inference_job( + model=compile_job.get_target_model(), + inputs=hub_inputs, + device=hub_device, + name=model_name, + options=profile_options_all, + ) + inference_job = cast(hub.client.InferenceJob, submitted_inference_job) + + # 5. 
Download the model asset to a local file + if not skip_downloading: + os.makedirs(output_path, exist_ok=True) + target_model: hub.Model = compile_job.get_target_model() # type: ignore + target_model.download(str(output_path / f"{model_name}.tflite")) + + # 6. Summarize the results from profiling and inference + if not skip_summary and not skip_profiling: + assert profile_job is not None and profile_job.wait().success + profile_data: Dict[str, Any] = profile_job.download_profile() # type: ignore + print_profile_metrics_from_job(profile_job, profile_data) + + if not skip_summary and not skip_inferencing: + torch_out = torch_inference(model, sample_inputs) + assert inference_job is not None and inference_job.wait().success + inference_result: hub.client.DatasetEntries = inference_job.download_output_data() # type: ignore + print_inference_metrics( + inference_job, inference_result, torch_out, metrics="psnr,top1,top5" + ) + + if not skip_summary: + print_on_target_demo_cmd(compile_job, Path(__file__).parent.resolve(), device) + + return (compile_job, profile_job, inference_job) + + +def main(): + warnings.filterwarnings("ignore") + parser = export_parser(model_cls=Model, supports_ort=False) + args = parser.parse_args() + export_model(**vars(args)) + + +if __name__ == "__main__": + main() diff --git a/qai_hub_models/models/convnext_tiny_w8a16_quantized/info.yaml b/qai_hub_models/models/convnext_tiny_w8a16_quantized/info.yaml new file mode 100644 index 00000000..5370c05d --- /dev/null +++ b/qai_hub_models/models/convnext_tiny_w8a16_quantized/info.yaml @@ -0,0 +1,42 @@ +name: ConvNext-Tiny-w8a16-Quantized +# id must match with the model dir name in qai_hub_models +id: convnext_tiny_w8a16_quantized +status: public +headline: Imagenet classifier and general purpose backbone. +domain: Computer Vision +description: ConvNextTiny is a machine learning model that can classify images from + the Imagenet dataset. It can also be used as a backbone in building more complex + models for specific use cases. +use_case: Image Classification +tags: + - quantized +research_paper: https://arxiv.org/abs/2201.03545 +research_paper_title: A ConvNet for the 2020s +license: https://github.com/pytorch/vision/blob/main/LICENSE +deploy_license: https://qaihub-public-assets.s3.us-west-2.amazonaws.com/qai-hub-models/Qualcomm+AI+Hub+Proprietary+License.pdf +source_repo: https://github.com/pytorch/vision/blob/main/torchvision/models/convnext.py +technical_details: + Model checkpoint: Imagenet + Input resolution: 224x224 + Number of parameters: 28.6M + Model size: 28 MB + Precision: w8a16 (8-bit weights, 16-bit activations) +applicable_scenarios: + - Medical Imaging + - Anomaly Detection + - Inventory Management +form_factors: + - Phone + - Tablet + - IoT +related_models: + - mobilenet_v2 + - densenet121 + - googlenet +has_static_banner: yes +has_animated_banner: yes +license_type: bsd-3-clause +deploy_license_type: AI Model Hub License +dataset: + - imagenet-1k + - imagenet-22k diff --git a/qai_hub_models/models/convnext_tiny_w8a16_quantized/model.py b/qai_hub_models/models/convnext_tiny_w8a16_quantized/model.py new file mode 100644 index 00000000..3da97038 --- /dev/null +++ b/qai_hub_models/models/convnext_tiny_w8a16_quantized/model.py @@ -0,0 +1,37 @@ +# --------------------------------------------------------------------- +# Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. 
+# SPDX-License-Identifier: BSD-3-Clause +# --------------------------------------------------------------------- +from __future__ import annotations + +from pathlib import Path + +from aimet_torch.quantsim import QuantizationSimModel + +from qai_hub_models.models._shared.convnext_tiny_quantized.model import ( + ConvNextTinyQuantizableBase, +) +from qai_hub_models.utils.asset_loaders import CachedWebModelAsset + +MODEL_ID = __name__.split(".")[-2] +MODEL_ASSET_VERSION = 1 + +DEFAULT_ENCODINGS = "convnext_tiny_w8a16_quantized_encodings.json" + + +class ConvNextTinyW8A16Quantizable(ConvNextTinyQuantizableBase): + def __init__( + self, + quant_sim_model: QuantizationSimModel, + ) -> None: + ConvNextTinyQuantizableBase.__init__(self, quant_sim_model) + + @classmethod + def _default_aimet_encodings(cls) -> str | Path: + return CachedWebModelAsset.from_asset_store( + MODEL_ID, MODEL_ASSET_VERSION, DEFAULT_ENCODINGS + ).fetch() + + @classmethod + def _output_bw(cls) -> int: + return 16 diff --git a/qai_hub_models/models/convnext_tiny_w8a16_quantized/test.py b/qai_hub_models/models/convnext_tiny_w8a16_quantized/test.py new file mode 100644 index 00000000..2931e003 --- /dev/null +++ b/qai_hub_models/models/convnext_tiny_w8a16_quantized/test.py @@ -0,0 +1,31 @@ +# --------------------------------------------------------------------- +# Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. +# SPDX-License-Identifier: BSD-3-Clause +# --------------------------------------------------------------------- +from qai_hub_models.models._shared.imagenet_classifier.test_utils import ( + run_imagenet_classifier_test, +) +from qai_hub_models.models.convnext_tiny_w8a16_quantized.demo import main as demo_main +from qai_hub_models.models.convnext_tiny_w8a16_quantized.model import ( + MODEL_ASSET_VERSION, + MODEL_ID, + ConvNextTinyW8A16Quantizable, +) +from qai_hub_models.utils.testing import skip_clone_repo_check + + +@skip_clone_repo_check +def test_task(): + run_imagenet_classifier_test( + ConvNextTinyW8A16Quantizable.from_pretrained(), + MODEL_ID, + asset_version=MODEL_ASSET_VERSION, + probability_threshold=0.56, + diff_tol=0.06, + ) + + +@skip_clone_repo_check +def test_demo(): + # Verify demo does not crash + demo_main(is_test=True) diff --git a/qai_hub_models/models/convnext_tiny_w8a8_quantized/README.md b/qai_hub_models/models/convnext_tiny_w8a8_quantized/README.md new file mode 100644 index 00000000..3f86e66c --- /dev/null +++ b/qai_hub_models/models/convnext_tiny_w8a8_quantized/README.md @@ -0,0 +1,56 @@ +[![Qualcomm® AI Hub Models](https://qaihub-public-assets.s3.us-west-2.amazonaws.com/qai-hub-models/quic-logo.jpg)](../../README.md) + + +# [ConvNext-Tiny-w8a8-Quantized: Imagenet classifier and general purpose backbone](#) + +ConvNextTiny is a machine learning model that can classify images from the Imagenet dataset. It can also be used as a backbone in building more complex models for specific use cases. + +This is based on the implementation of ConvNext-Tiny-w8a8-Quantized found +[here](https://github.com/pytorch/vision/blob/main/torchvision/models/convnext.py). This repository contains scripts for optimized on-device +export suitable to run on Qualcomm® devices. More details on model performance +accross various devices, can be found [here](#). + +[Sign up](https://myaccount.qualcomm.com/signup) for early access to run these models on +a hosted Qualcomm® device. 
+ + + + +## Example & Usage + + +Once installed, run the following simple CLI demo: + +```bash +python -m qai_hub_models.models.convnext_tiny_w8a8_quantized.demo +``` +More details on the CLI tool can be found with the `--help` option. See +[demo.py](demo.py) for sample usage of the model including pre/post processing +scripts. Please refer to our [general instructions on using +models](../../../#getting-started) for more usage instructions. + +## Export for on-device deployment + +This repository contains export scripts that produce a model optimized for +on-device deployment. This can be run as follows: + +```bash +python -m qai_hub_models.models.convnext_tiny_w8a8_quantized.export +``` +Additional options are documented with the `--help` option. Note that the above +script requires access to Deployment instructions for Qualcomm® AI Hub. + +## License +- The license for the original implementation of ConvNext-Tiny-w8a8-Quantized can be found + [here](https://github.com/pytorch/vision/blob/main/LICENSE). +- The license for the compiled assets for on-device deployment can be found [here]({deploy_license_url}) + +## References +* [A ConvNet for the 2020s](https://arxiv.org/abs/2201.03545) +* [Source Model Implementation](https://github.com/pytorch/vision/blob/main/torchvision/models/convnext.py) + +## Community +* Join [our AI Hub Slack community](https://qualcomm-ai-hub.slack.com/join/shared_invite/zt-2d5zsmas3-Sj0Q9TzslueCjS31eXG2UA#/shared-invite/email) to collaborate, post questions and learn more about on-device AI. +* For questions or feedback please [reach out to us](mailto:ai-hub-support@qti.qualcomm.com). + + diff --git a/qai_hub_models/models/convnext_tiny_w8a8_quantized/__init__.py b/qai_hub_models/models/convnext_tiny_w8a8_quantized/__init__.py new file mode 100644 index 00000000..13778437 --- /dev/null +++ b/qai_hub_models/models/convnext_tiny_w8a8_quantized/__init__.py @@ -0,0 +1,10 @@ +# --------------------------------------------------------------------- +# Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. +# SPDX-License-Identifier: BSD-3-Clause +# --------------------------------------------------------------------- +from qai_hub_models.models._shared.imagenet_classifier.app import ( # noqa: F401 + ImagenetClassifierApp as App, +) + +from .model import MODEL_ID # noqa: F401 +from .model import ConvNextTinyW8A8Quantizable as Model # noqa: F401 diff --git a/qai_hub_models/models/convnext_tiny_w8a8_quantized/conftest.py b/qai_hub_models/models/convnext_tiny_w8a8_quantized/conftest.py new file mode 100644 index 00000000..e737cdbc --- /dev/null +++ b/qai_hub_models/models/convnext_tiny_w8a8_quantized/conftest.py @@ -0,0 +1,39 @@ +# --------------------------------------------------------------------- +# Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. +# SPDX-License-Identifier: BSD-3-Clause +# --------------------------------------------------------------------- +# THIS FILE WAS AUTO-GENERATED. DO NOT EDIT MANUALLY. + +import inspect + +import pytest + +from qai_hub_models.models.convnext_tiny_w8a8_quantized import Model +from qai_hub_models.utils.testing import skip_clone_repo_check + + +# Instantiate the model only once for all tests. +# Mock from_pretrained to always return the initialized model. +# This speeds up tests and limits memory leaks. 
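+# The cache key is str(args) + str(kwargs), so repeated calls with identical
+# arguments within this test module reuse a single model instance.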
+@pytest.fixture(scope="module", autouse=True) +def cached_from_pretrained(): + with pytest.MonkeyPatch.context() as mp: + pretrained_cache = {} + from_pretrained = Model.from_pretrained + sig = inspect.signature(from_pretrained) + + @skip_clone_repo_check + def _cached_from_pretrained(*args, **kwargs): + cache_key = str(args) + str(kwargs) + model = pretrained_cache.get(cache_key, None) + if model: + return model + else: + model = from_pretrained(*args, **kwargs) + pretrained_cache[cache_key] = model + return model + + _cached_from_pretrained.__signature__ = sig + + mp.setattr(Model, "from_pretrained", _cached_from_pretrained) + yield mp diff --git a/qai_hub_models/models/convnext_tiny_w8a8_quantized/demo.py b/qai_hub_models/models/convnext_tiny_w8a8_quantized/demo.py new file mode 100644 index 00000000..adc48957 --- /dev/null +++ b/qai_hub_models/models/convnext_tiny_w8a8_quantized/demo.py @@ -0,0 +1,17 @@ +# --------------------------------------------------------------------- +# Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. +# SPDX-License-Identifier: BSD-3-Clause +# --------------------------------------------------------------------- +from qai_hub_models.models._shared.imagenet_classifier.demo import imagenet_demo +from qai_hub_models.models.convnext_tiny_w8a8_quantized.model import ( + MODEL_ID, + ConvNextTinyW8A8Quantizable, +) + + +def main(is_test: bool = False): + imagenet_demo(ConvNextTinyW8A8Quantizable, MODEL_ID, is_test) + + +if __name__ == "__main__": + main() diff --git a/qai_hub_models/models/convnext_tiny_w8a8_quantized/export.py b/qai_hub_models/models/convnext_tiny_w8a8_quantized/export.py new file mode 100644 index 00000000..8d9cca73 --- /dev/null +++ b/qai_hub_models/models/convnext_tiny_w8a8_quantized/export.py @@ -0,0 +1,227 @@ +# --------------------------------------------------------------------- +# Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. +# SPDX-License-Identifier: BSD-3-Clause +# --------------------------------------------------------------------- +# THIS FILE WAS AUTO-GENERATED. DO NOT EDIT MANUALLY. + + +from __future__ import annotations + +import os +import warnings +from pathlib import Path +from typing import Any, Dict, List, Optional, Tuple, cast + +import qai_hub as hub + +from qai_hub_models.models.convnext_tiny_w8a8_quantized import Model +from qai_hub_models.utils.args import ( + export_parser, + get_input_spec_kwargs, + get_model_kwargs, +) +from qai_hub_models.utils.base_model import TargetRuntime +from qai_hub_models.utils.compare import torch_inference +from qai_hub_models.utils.printing import ( + print_inference_metrics, + print_on_target_demo_cmd, + print_profile_metrics_from_job, +) +from qai_hub_models.utils.qai_hub_helpers import ( + can_access_qualcomm_ai_hub, + export_without_hub_access, + transpose_channel_first_to_last, +) +from qai_hub_models.utils.qnn_helpers import get_qnn_inputs + + +def export_model( + device: str = "Samsung Galaxy S23", + chipset: Optional[str] = None, + skip_profiling: bool = False, + skip_inferencing: bool = False, + skip_downloading: bool = False, + skip_summary: bool = False, + output_dir: Optional[str] = None, + target_runtime: TargetRuntime = TargetRuntime.TFLITE, + compile_options: str = "", + profile_options: str = "", + **additional_model_kwargs, +) -> Tuple[hub.CompileJob, Optional[hub.ProfileJob], Optional[hub.InferenceJob]] | List[ + str +]: + """ + This function accomplishes 6 main tasks: + + 1. 
Instantiates a PyTorch model and converts it to a traced TorchScript format. + 2. Compiles the model to an asset that can be run on device. + 3. Profiles the model performance on real devices. + 4. Inferences the model on sample inputs. + 5. Downloads the model asset to the local directory. + 6. Summarizes the results from profiling and inference. + + Each of the last four steps can be optionally skipped using the input options. + + Parameters: + device: Device for which to export the model. + Full list of available devices can be found by running `hub.get_devices()`. + Defaults to DEFAULT_DEVICE if not specified. + chipset: If set, will choose a random device with this chipset. + Overrides the `device` argument. + skip_profiling: If set, skips profiling of compiled model on real devices. + skip_inferencing: If set, skips computing on-device outputs from sample data. + skip_downloading: If set, skips downloading of compiled model. + skip_summary: If set, skips waiting for and summarizing results + from profiling and inference. + output_dir: Directory to store generated assets (e.g. compiled model). + Defaults to `/build/`. + target_runtime: Which on-device runtime to target. Default is TFLite. + compile_options: Additional options to pass when submitting the compile job. + profile_options: Additional options to pass when submitting the profile job. + **additional_model_kwargs: Additional optional kwargs used to customize + `model_cls.from_pretrained` and `model.get_input_spec` + + Returns: + A 3-tuple of: + * A CompileJob object containing metadata about the compile job submitted to hub. + * A ProfileJob containing metadata about the profile job (None if profiling skipped). + * An InferenceJob containing metadata about the inference job (None if inferencing skipped). + """ + model_name = "convnext_tiny_w8a8_quantized" + output_path = Path(output_dir or Path.cwd() / "build" / model_name) + if chipset: + hub_device = hub.Device(attributes=f"chipset:{chipset}") + else: + hub_device = hub.Device(name=device) + if not can_access_qualcomm_ai_hub(): + return export_without_hub_access( + "convnext_tiny_w8a8_quantized", + "ConvNext-Tiny-w8a8-Quantized", + device, + skip_profiling, + skip_inferencing, + skip_downloading, + skip_summary, + output_path, + target_runtime, + compile_options, + profile_options, + ) + + # 1. Initialize PyTorch model + model = Model.from_pretrained(**get_model_kwargs(Model, additional_model_kwargs)) + input_spec = model.get_input_spec( + **get_input_spec_kwargs(model, additional_model_kwargs) + ) + + # Trace the model + source_model = model.convert_to_hub_source_model( + target_runtime, output_path, input_spec + ) + if target_runtime == TargetRuntime.TFLITE: + quant_calibration_data = None + else: + quant_calibration_data = model.get_calibration_data(target_runtime, input_spec) + + # Convert outputs from channel last to channel first (preferred I/O format for QNN and TensorFlow Lite) + channel_last_flags = ( + " --force_channel_last_input image_tensor" + if target_runtime != TargetRuntime.ORT + else "" + ) + + # 2. 
Compile the model to an on-device asset + model_compile_options = model.get_hub_compile_options( + target_runtime, compile_options + channel_last_flags, hub_device + ) + print(f"Optimizing model {model_name} to run on-device") + submitted_compile_job = hub.submit_compile_job( + model=source_model, + input_specs=input_spec, + device=hub_device, + name=model_name, + calibration_data=quant_calibration_data, + options=model_compile_options, + ) + compile_job = cast(hub.client.CompileJob, submitted_compile_job) + + # 3. Profile the model asset on real devices + profile_job: Optional[hub.client.ProfileJob] = None + if not skip_profiling: + profile_options_all = model.get_hub_profile_options( + target_runtime, profile_options + ) + print(f"Profiling model {model_name} on a hosted device.") + submitted_profile_job = hub.submit_profile_job( + model=compile_job.get_target_model(), + device=hub_device, + name=model_name, + options=profile_options_all, + ) + profile_job = cast(hub.client.ProfileJob, submitted_profile_job) + + # 4. Run inference on-device with sample inputs + inference_job: Optional[hub.client.InferenceJob] = None + if not skip_inferencing: + profile_options_all = model.get_hub_profile_options( + target_runtime, profile_options + ) + print( + f"Running inference for {model_name} on a hosted device with example inputs." + ) + sample_inputs = model.sample_inputs(input_spec) + hub_inputs = sample_inputs + if target_runtime == TargetRuntime.QNN: + hub_inputs = get_qnn_inputs(compile_job, sample_inputs) + # Convert inputs from channel first to channel last + hub_inputs = ( + sample_inputs + if target_runtime == TargetRuntime.ORT + else transpose_channel_first_to_last( + "image_tensor", sample_inputs, target_runtime + ) + ) + submitted_inference_job = hub.submit_inference_job( + model=compile_job.get_target_model(), + inputs=hub_inputs, + device=hub_device, + name=model_name, + options=profile_options_all, + ) + inference_job = cast(hub.client.InferenceJob, submitted_inference_job) + + # 5. Download the model asset to a local file + if not skip_downloading: + os.makedirs(output_path, exist_ok=True) + target_model: hub.Model = compile_job.get_target_model() # type: ignore + target_model.download(str(output_path / f"{model_name}.tflite")) + + # 6. 
Summarize the results from profiling and inference + if not skip_summary and not skip_profiling: + assert profile_job is not None and profile_job.wait().success + profile_data: Dict[str, Any] = profile_job.download_profile() # type: ignore + print_profile_metrics_from_job(profile_job, profile_data) + + if not skip_summary and not skip_inferencing: + torch_out = torch_inference(model, sample_inputs) + assert inference_job is not None and inference_job.wait().success + inference_result: hub.client.DatasetEntries = inference_job.download_output_data() # type: ignore + print_inference_metrics( + inference_job, inference_result, torch_out, metrics="psnr,top1,top5" + ) + + if not skip_summary: + print_on_target_demo_cmd(compile_job, Path(__file__).parent.resolve(), device) + + return (compile_job, profile_job, inference_job) + + +def main(): + warnings.filterwarnings("ignore") + parser = export_parser(model_cls=Model, supports_ort=False) + args = parser.parse_args() + export_model(**vars(args)) + + +if __name__ == "__main__": + main() diff --git a/qai_hub_models/models/convnext_tiny_w8a8_quantized/info.yaml b/qai_hub_models/models/convnext_tiny_w8a8_quantized/info.yaml new file mode 100644 index 00000000..b3770255 --- /dev/null +++ b/qai_hub_models/models/convnext_tiny_w8a8_quantized/info.yaml @@ -0,0 +1,42 @@ +name: ConvNext-Tiny-w8a8-Quantized +# id must match with the model dir name in qai_hub_models +id: convnext_tiny_w8a8_quantized +status: public +headline: Imagenet classifier and general purpose backbone. +domain: Computer Vision +description: ConvNextTiny is a machine learning model that can classify images from + the Imagenet dataset. It can also be used as a backbone in building more complex + models for specific use cases. +use_case: Image Classification +tags: + - quantized +research_paper: https://arxiv.org/abs/2201.03545 +research_paper_title: A ConvNet for the 2020s +license: https://github.com/pytorch/vision/blob/main/LICENSE +deploy_license: https://qaihub-public-assets.s3.us-west-2.amazonaws.com/qai-hub-models/Qualcomm+AI+Hub+Proprietary+License.pdf +source_repo: https://github.com/pytorch/vision/blob/main/torchvision/models/convnext.py +technical_details: + Model checkpoint: Imagenet + Input resolution: 224x224 + Number of parameters: 28.6M + Model size: 28 MB + Precision: w8a8 (8-bit weights, 8-bit activations) +applicable_scenarios: + - Medical Imaging + - Anomaly Detection + - Inventory Management +form_factors: + - Phone + - Tablet + - IoT +related_models: + - mobilenet_v2 + - densenet121 + - googlenet +has_static_banner: yes +has_animated_banner: yes +license_type: bsd-3-clause +deploy_license_type: AI Model Hub License +dataset: + - imagenet-1k + - imagenet-22k diff --git a/qai_hub_models/models/convnext_tiny_w8a8_quantized/model.py b/qai_hub_models/models/convnext_tiny_w8a8_quantized/model.py new file mode 100644 index 00000000..5e332910 --- /dev/null +++ b/qai_hub_models/models/convnext_tiny_w8a8_quantized/model.py @@ -0,0 +1,37 @@ +# --------------------------------------------------------------------- +# Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. 
+# SPDX-License-Identifier: BSD-3-Clause +# --------------------------------------------------------------------- +from __future__ import annotations + +from pathlib import Path + +from aimet_torch.quantsim import QuantizationSimModel + +from qai_hub_models.models._shared.convnext_tiny_quantized.model import ( + ConvNextTinyQuantizableBase, +) +from qai_hub_models.utils.asset_loaders import CachedWebModelAsset + +MODEL_ID = __name__.split(".")[-2] +MODEL_ASSET_VERSION = 1 + +DEFAULT_ENCODINGS = "convnext_tiny_w8a8_quantized_encodings.json" + + +class ConvNextTinyW8A8Quantizable(ConvNextTinyQuantizableBase): + def __init__( + self, + quant_sim_model: QuantizationSimModel, + ) -> None: + ConvNextTinyQuantizableBase.__init__(self, quant_sim_model) + + @classmethod + def _default_aimet_encodings(cls) -> str | Path: + return CachedWebModelAsset.from_asset_store( + MODEL_ID, MODEL_ASSET_VERSION, DEFAULT_ENCODINGS + ).fetch() + + @classmethod + def _output_bw(cls) -> int: + return 8 diff --git a/qai_hub_models/models/convnext_tiny_w8a8_quantized/test.py b/qai_hub_models/models/convnext_tiny_w8a8_quantized/test.py new file mode 100644 index 00000000..b7fedd53 --- /dev/null +++ b/qai_hub_models/models/convnext_tiny_w8a8_quantized/test.py @@ -0,0 +1,31 @@ +# --------------------------------------------------------------------- +# Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. +# SPDX-License-Identifier: BSD-3-Clause +# --------------------------------------------------------------------- +from qai_hub_models.models._shared.imagenet_classifier.test_utils import ( + run_imagenet_classifier_test, +) +from qai_hub_models.models.convnext_tiny_w8a8_quantized.demo import main as demo_main +from qai_hub_models.models.convnext_tiny_w8a8_quantized.model import ( + MODEL_ASSET_VERSION, + MODEL_ID, + ConvNextTinyW8A8Quantizable, +) +from qai_hub_models.utils.testing import skip_clone_repo_check + + +@skip_clone_repo_check +def test_task(): + run_imagenet_classifier_test( + ConvNextTinyW8A8Quantizable.from_pretrained(), + MODEL_ID, + asset_version=MODEL_ASSET_VERSION, + probability_threshold=0.56, + diff_tol=0.06, + ) + + +@skip_clone_repo_check +def test_demo(): + # Verify demo does not crash + demo_main(is_test=True) diff --git a/qai_hub_models/models/ddrnet23_slim/README.md b/qai_hub_models/models/ddrnet23_slim/README.md index fab4f087..22b47996 100644 --- a/qai_hub_models/models/ddrnet23_slim/README.md +++ b/qai_hub_models/models/ddrnet23_slim/README.md @@ -14,6 +14,8 @@ accross various devices, can be found [here](https://aihub.qualcomm.com/models/d a hosted Qualcomm® device. + + ## Example & Usage diff --git a/qai_hub_models/models/ddrnet23_slim/export.py b/qai_hub_models/models/ddrnet23_slim/export.py index 2e17af1d..da2e4cdc 100644 --- a/qai_hub_models/models/ddrnet23_slim/export.py +++ b/qai_hub_models/models/ddrnet23_slim/export.py @@ -120,12 +120,16 @@ def export_model( model.eval() source_model = torch.jit.trace(model.to("cpu"), make_torch_inputs(input_spec)) + # Convert outputs from channel last to channel first (preferred I/O format for QNN and TensorFlow Lite) + channel_last_flags = ( + " --force_channel_last_input image" + " --force_channel_last_output output_0" + if target_runtime != TargetRuntime.ORT + else "" + ) + # 2. 
Compile the model to an on-device asset model_compile_options = model.get_hub_compile_options( - target_runtime, - compile_options - + " --force_channel_last_input image" - + " --force_channel_last_output output_0", + target_runtime, compile_options + channel_last_flags, hub_device ) print(f"Optimizing model {model_name} to run on-device") submitted_compile_job = hub.submit_compile_job( @@ -163,8 +167,10 @@ def export_model( ) sample_inputs = model.sample_inputs(input_spec) # Convert inputs from channel first to channel last - hub_inputs = transpose_channel_first_to_last( - "image", sample_inputs, target_runtime + hub_inputs = ( + sample_inputs + if target_runtime == TargetRuntime.ORT + else transpose_channel_first_to_last("image", sample_inputs, target_runtime) ) submitted_inference_job = hub.submit_inference_job( model=compile_job.get_target_model(), @@ -192,8 +198,12 @@ def export_model( assert inference_job is not None and inference_job.wait().success inference_result: hub.client.DatasetEntries = inference_job.download_output_data() # type: ignore # Convert outputs from channel last to channel first - inference_result = transpose_channel_last_to_first( - "output_0", inference_result, target_runtime + inference_result = ( + inference_result + if target_runtime == TargetRuntime.ORT + else transpose_channel_last_to_first( + "output_0", inference_result, target_runtime + ) ) print_inference_metrics(inference_job, inference_result, torch_out) @@ -205,7 +215,7 @@ def export_model( def main(): warnings.filterwarnings("ignore") - parser = export_parser(model_cls=Model, supports_qnn=False, supports_ort=False) + parser = export_parser(model_cls=Model, supports_qnn=False) args = parser.parse_args() export_model(**vars(args)) diff --git a/qai_hub_models/models/ddrnet23_slim/perf.yaml b/qai_hub_models/models/ddrnet23_slim/perf.yaml index bd99b239..8c0b1142 100644 --- a/qai_hub_models/models/ddrnet23_slim/perf.yaml +++ b/qai_hub_models/models/ddrnet23_slim/perf.yaml @@ -22,6 +22,7 @@ aggregated: - Samsung Galaxy S24 Ultra - Samsung Galaxy S24+ - Samsung Galaxy Tab S8 + - Snapdragon X Elite CRD - Xiaomi 12 - Xiaomi 12 Pro supported_chipsets: @@ -30,15 +31,16 @@ aggregated: - Snapdragon® 8 Gen 2 - Snapdragon® 8 Gen 3 - Snapdragon® 888 + - Snapdragon® X Elite models: - name: DDRNet23-Slim performance_metrics: - torchscript_onnx_tflite: - inference_time: 6651.0 - throughput: 150.35333032626673 + inference_time: 6617.0 + throughput: 151.1258878645912 estimated_peak_memory_range: - min: 1007616 - max: 2683032 + min: 212992 + max: 2249480 primary_compute_unit: NPU precision: fp16 layer_info: @@ -46,23 +48,23 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 131 - job_id: j0pxndrl5 + job_id: jogk3mj25 job_status: Passed torchscript_onnx_ort: - inference_time: 'null' - throughput: 'null' + inference_time: 9555.0 + throughput: 104.65724751439038 estimated_peak_memory_range: - min: 0 - max: 0 - primary_compute_unit: 'null' - precision: 'null' + min: 11808768 + max: 48661000 + primary_compute_unit: NPU + precision: fp16 layer_info: - layers_on_npu: 0 + layers_on_npu: 155 layers_on_gpu: 0 layers_on_cpu: 0 - total_layers: 0 - job_id: jegnl7qq5 - job_status: Failed + total_layers: 155 + job_id: jw56nlk0g + job_status: Passed reference_device_info: name: Samsung Galaxy S23 os: '13' @@ -70,13 +72,13 @@ models: os_name: Android manufacturer: Samsung chipset: Snapdragon® 8 Gen 2 - timestamp: '2024-04-23T18:42:31.356614Z' + timestamp: '2024-05-20T16:35:28.530240Z' - torchscript_onnx_tflite: - inference_time: 
4569.0 - throughput: 218.8662727073758 + inference_time: 4661.0 + throughput: 214.5462347135808 estimated_peak_memory_range: - min: 16384 - max: 71802832 + min: 40960 + max: 74706752 primary_compute_unit: NPU precision: fp16 layer_info: @@ -84,23 +86,23 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 131 - job_id: jo5mqdk9p + job_id: jn5q3oj4p job_status: Passed torchscript_onnx_ort: - inference_time: 'null' - throughput: 'null' + inference_time: 6060.0 + throughput: 165.01650165016503 estimated_peak_memory_range: - min: 0 - max: 0 - primary_compute_unit: 'null' - precision: 'null' + min: 2203648 + max: 39944288 + primary_compute_unit: NPU + precision: fp16 layer_info: - layers_on_npu: 0 + layers_on_npu: 155 layers_on_gpu: 0 layers_on_cpu: 0 - total_layers: 0 - job_id: jopr8nd75 - job_status: Failed + total_layers: 155 + job_id: j1p3e2yl5 + job_status: Passed reference_device_info: name: Samsung Galaxy S24 os: '14' @@ -108,13 +110,13 @@ models: os_name: Android manufacturer: Samsung chipset: Snapdragon® 8 Gen 3 - timestamp: '2024-04-23T18:42:31.356678Z' + timestamp: '2024-05-20T16:35:28.530261Z' - torchscript_onnx_tflite: - inference_time: 6682.0 - throughput: 149.655791679138 + inference_time: 6700.0 + throughput: 149.2537313432836 estimated_peak_memory_range: - min: 1011712 - max: 3063152 + min: 1019904 + max: 2922360 primary_compute_unit: NPU precision: fp16 layer_info: @@ -122,7 +124,7 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 131 - job_id: jnp1y108p + job_id: j1gl3rj8g job_status: Passed reference_device_info: name: QCS8550 (Proxy) @@ -131,4 +133,42 @@ models: os_name: Android manufacturer: Qualcomm chipset: Qcs8550 - timestamp: '2024-04-23T18:42:31.356703Z' + timestamp: '2024-05-20T16:35:28.530273Z' + - torchscript_onnx_ort: + inference_time: 9528.0 + throughput: 104.95382031905962 + estimated_peak_memory_range: + min: 9854976 + max: 9854976 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 155 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 155 + job_id: jwgo3qjxg + job_status: Passed + torchscript_onnx_ort_dml_gpu: + inference_time: 38162.0 + throughput: 26.20407735443635 + estimated_peak_memory_range: + min: 104890368 + max: 104890368 + primary_compute_unit: CPU + precision: fp32 + layer_info: + layers_on_npu: 0 + layers_on_gpu: 0 + layers_on_cpu: 105 + total_layers: 105 + job_id: j1pvvxjjp + job_status: Passed + reference_device_info: + name: Snapdragon X Elite CRD + os: '11' + form_factor: Compute + os_name: Windows + manufacturer: Qualcomm + chipset: Snapdragon® X Elite + timestamp: '2024-05-20T16:35:28.530292Z' diff --git a/qai_hub_models/models/deeplabv3_plus_mobilenet/README.md b/qai_hub_models/models/deeplabv3_plus_mobilenet/README.md index 175a743c..49ab4d78 100644 --- a/qai_hub_models/models/deeplabv3_plus_mobilenet/README.md +++ b/qai_hub_models/models/deeplabv3_plus_mobilenet/README.md @@ -14,6 +14,8 @@ accross various devices, can be found [here](https://aihub.qualcomm.com/models/d a hosted Qualcomm® device. 
+ + ## Example & Usage diff --git a/qai_hub_models/models/deeplabv3_plus_mobilenet/export.py b/qai_hub_models/models/deeplabv3_plus_mobilenet/export.py index 42105651..0f531913 100644 --- a/qai_hub_models/models/deeplabv3_plus_mobilenet/export.py +++ b/qai_hub_models/models/deeplabv3_plus_mobilenet/export.py @@ -120,12 +120,16 @@ def export_model( model.eval() source_model = torch.jit.trace(model.to("cpu"), make_torch_inputs(input_spec)) + # Convert outputs from channel last to channel first (preferred I/O format for QNN and TensorFlow Lite) + channel_last_flags = ( + " --force_channel_last_input image" + " --force_channel_last_output output_0" + if target_runtime != TargetRuntime.ORT + else "" + ) + # 2. Compile the model to an on-device asset model_compile_options = model.get_hub_compile_options( - target_runtime, - compile_options - + " --force_channel_last_input image" - + " --force_channel_last_output output_0", + target_runtime, compile_options + channel_last_flags, hub_device ) print(f"Optimizing model {model_name} to run on-device") submitted_compile_job = hub.submit_compile_job( @@ -163,8 +167,10 @@ def export_model( ) sample_inputs = model.sample_inputs(input_spec) # Convert inputs from channel first to channel last - hub_inputs = transpose_channel_first_to_last( - "image", sample_inputs, target_runtime + hub_inputs = ( + sample_inputs + if target_runtime == TargetRuntime.ORT + else transpose_channel_first_to_last("image", sample_inputs, target_runtime) ) submitted_inference_job = hub.submit_inference_job( model=compile_job.get_target_model(), @@ -192,8 +198,12 @@ def export_model( assert inference_job is not None and inference_job.wait().success inference_result: hub.client.DatasetEntries = inference_job.download_output_data() # type: ignore # Convert outputs from channel last to channel first - inference_result = transpose_channel_last_to_first( - "output_0", inference_result, target_runtime + inference_result = ( + inference_result + if target_runtime == TargetRuntime.ORT + else transpose_channel_last_to_first( + "output_0", inference_result, target_runtime + ) ) print_inference_metrics(inference_job, inference_result, torch_out) @@ -205,7 +215,7 @@ def export_model( def main(): warnings.filterwarnings("ignore") - parser = export_parser(model_cls=Model, supports_ort=False) + parser = export_parser(model_cls=Model) args = parser.parse_args() export_model(**vars(args)) diff --git a/qai_hub_models/models/deeplabv3_plus_mobilenet/perf.yaml b/qai_hub_models/models/deeplabv3_plus_mobilenet/perf.yaml index b2d5c52e..6a911252 100644 --- a/qai_hub_models/models/deeplabv3_plus_mobilenet/perf.yaml +++ b/qai_hub_models/models/deeplabv3_plus_mobilenet/perf.yaml @@ -22,6 +22,7 @@ aggregated: - Samsung Galaxy S24 Ultra - Samsung Galaxy S24+ - Samsung Galaxy Tab S8 + - Snapdragon X Elite CRD - Xiaomi 12 - Xiaomi 12 Pro supported_chipsets: @@ -30,15 +31,16 @@ aggregated: - Snapdragon® 8 Gen 2 - Snapdragon® 8 Gen 3 - Snapdragon® 888 + - Snapdragon® X Elite models: - name: DeepLabV3-Plus-MobileNet performance_metrics: - torchscript_onnx_tflite: - inference_time: 13206.0 - throughput: 75.72315614114797 + inference_time: 13090.0 + throughput: 76.39419404125286 estimated_peak_memory_range: - min: 21012480 - max: 37177032 + min: 20566016 + max: 22394640 primary_compute_unit: NPU precision: fp16 layer_info: @@ -46,14 +48,14 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 98 - job_id: jz57xlxvg + job_id: j7gje4jx5 job_status: Passed torchscript_onnx_qnn: - inference_time: 12804.0 - 
throughput: 78.10059356451109 + inference_time: 12915.0 + throughput: 77.42934572202864 estimated_peak_memory_range: - min: 1888256 - max: 20259784 + min: 2191360 + max: 18354728 primary_compute_unit: NPU precision: fp16 layer_info: @@ -61,7 +63,22 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 124 - job_id: jo5m363dg + job_id: jz5wqno65 + job_status: Passed + torchscript_onnx_ort: + inference_time: 18188.0 + throughput: 54.98130635583902 + estimated_peak_memory_range: + min: 46182400 + max: 80384080 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 126 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 126 + job_id: jz57x3zlg job_status: Passed reference_device_info: name: Samsung Galaxy S23 @@ -70,13 +87,13 @@ models: os_name: Android manufacturer: Samsung chipset: Snapdragon® 8 Gen 2 - timestamp: '2024-04-30T00:18:21.085559Z' + timestamp: '2024-05-20T16:35:28.552289Z' - torchscript_onnx_tflite: - inference_time: 9587.0 - throughput: 104.30791697089809 + inference_time: 9659.0 + throughput: 103.53038616834041 estimated_peak_memory_range: - min: 49152 - max: 66968480 + min: 45056 + max: 67351648 primary_compute_unit: NPU precision: fp16 layer_info: @@ -84,14 +101,14 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 98 - job_id: jqp4vdv8p + job_id: jlpek3j1p job_status: Passed torchscript_onnx_qnn: - inference_time: 9430.0 - throughput: 106.04453870625663 + inference_time: 9460.0 + throughput: 105.70824524312897 estimated_peak_memory_range: - min: 3174400 - max: 56418592 + min: 3194880 + max: 56272272 primary_compute_unit: NPU precision: fp16 layer_info: @@ -99,7 +116,22 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 124 - job_id: jegn3m3k5 + job_id: jmg9wevlp + job_status: Passed + torchscript_onnx_ort: + inference_time: 14020.0 + throughput: 71.32667617689016 + estimated_peak_memory_range: + min: 51036160 + max: 82578320 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 126 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 126 + job_id: jqp4v0qvp job_status: Passed reference_device_info: name: Samsung Galaxy S24 @@ -108,13 +140,13 @@ models: os_name: Android manufacturer: Samsung chipset: Snapdragon® 8 Gen 3 - timestamp: '2024-04-30T00:18:21.085754Z' + timestamp: '2024-05-20T16:35:28.552316Z' - torchscript_onnx_tflite: - inference_time: 13237.0 - throughput: 75.54581853894386 + inference_time: 13152.0 + throughput: 76.03406326034063 estimated_peak_memory_range: - min: 22167552 - max: 24453856 + min: 19988480 + max: 21886552 primary_compute_unit: NPU precision: fp16 layer_info: @@ -122,14 +154,14 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 98 - job_id: j0pxy6y3g + job_id: jygzrk1k5 job_status: Passed torchscript_onnx_qnn: - inference_time: 12986.0 - throughput: 77.00600646850454 + inference_time: 12898.0 + throughput: 77.53140021708792 estimated_peak_memory_range: - min: 3194880 - max: 26458984 + min: 3207168 + max: 24171704 primary_compute_unit: NPU precision: fp16 layer_info: @@ -137,7 +169,7 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 124 - job_id: jopre2e05 + job_id: jvgdolwep job_status: Passed reference_device_info: name: QCS8550 (Proxy) @@ -146,4 +178,57 @@ models: os_name: Android manufacturer: Qualcomm chipset: Qcs8550 - timestamp: '2024-04-30T00:18:21.085930Z' + timestamp: '2024-05-20T16:35:28.552334Z' + - torchscript_onnx_qnn: + inference_time: 16530.0 + throughput: 60.49606775559589 + estimated_peak_memory_range: + min: 3170304 + max: 3170304 + 
primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 124 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 124 + job_id: jnp1ex02g + job_status: Passed + torchscript_onnx_ort: + inference_time: 16738.0 + throughput: 59.7442944198829 + estimated_peak_memory_range: + min: 107229184 + max: 107229184 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 126 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 126 + job_id: j0pxy2v1g + job_status: Passed + torchscript_onnx_ort_dml_gpu: + inference_time: 192375.0 + throughput: 5.198180636777128 + estimated_peak_memory_range: + min: 387981312 + max: 387981312 + primary_compute_unit: GPU + precision: fp16 + layer_info: + layers_on_npu: 0 + layers_on_gpu: 1 + layers_on_cpu: 0 + total_layers: 1 + job_id: jo5m3yrwg + job_status: Passed + reference_device_info: + name: Snapdragon X Elite CRD + os: '11' + form_factor: Compute + os_name: Windows + manufacturer: Qualcomm + chipset: Snapdragon® X Elite + timestamp: '2024-05-20T16:35:28.552356Z' diff --git a/qai_hub_models/models/deeplabv3_plus_mobilenet_quantized/README.md b/qai_hub_models/models/deeplabv3_plus_mobilenet_quantized/README.md index 82b1c2b3..4595e199 100644 --- a/qai_hub_models/models/deeplabv3_plus_mobilenet_quantized/README.md +++ b/qai_hub_models/models/deeplabv3_plus_mobilenet_quantized/README.md @@ -14,6 +14,8 @@ accross various devices, can be found [here](https://aihub.qualcomm.com/models/d a hosted Qualcomm® device. + + ## Example & Usage diff --git a/qai_hub_models/models/deeplabv3_plus_mobilenet_quantized/export.py b/qai_hub_models/models/deeplabv3_plus_mobilenet_quantized/export.py index 9ba9af15..a4dc4b5c 100644 --- a/qai_hub_models/models/deeplabv3_plus_mobilenet_quantized/export.py +++ b/qai_hub_models/models/deeplabv3_plus_mobilenet_quantized/export.py @@ -124,12 +124,16 @@ def export_model( else: quant_calibration_data = model.get_calibration_data(target_runtime, input_spec) + # Convert outputs from channel last to channel first (preferred I/O format for QNN and TensorFlow Lite) + channel_last_flags = ( + " --force_channel_last_input image" + " --force_channel_last_output output_0" + if target_runtime != TargetRuntime.ORT + else "" + ) + # 2. 
Compile the model to an on-device asset model_compile_options = model.get_hub_compile_options( - target_runtime, - compile_options - + " --force_channel_last_input image" - + " --force_channel_last_output output_0", + target_runtime, compile_options + channel_last_flags, hub_device ) print(f"Optimizing model {model_name} to run on-device") submitted_compile_job = hub.submit_compile_job( @@ -171,8 +175,10 @@ def export_model( if target_runtime == TargetRuntime.QNN: hub_inputs = get_qnn_inputs(compile_job, sample_inputs) # Convert inputs from channel first to channel last - hub_inputs = transpose_channel_first_to_last( - "image", sample_inputs, target_runtime + hub_inputs = ( + sample_inputs + if target_runtime == TargetRuntime.ORT + else transpose_channel_first_to_last("image", sample_inputs, target_runtime) ) submitted_inference_job = hub.submit_inference_job( model=compile_job.get_target_model(), @@ -200,8 +206,12 @@ def export_model( assert inference_job is not None and inference_job.wait().success inference_result: hub.client.DatasetEntries = inference_job.download_output_data() # type: ignore # Convert outputs from channel last to channel first - inference_result = transpose_channel_last_to_first( - "output_0", inference_result, target_runtime + inference_result = ( + inference_result + if target_runtime == TargetRuntime.ORT + else transpose_channel_last_to_first( + "output_0", inference_result, target_runtime + ) ) print_inference_metrics(inference_job, inference_result, torch_out) @@ -213,7 +223,7 @@ def export_model( def main(): warnings.filterwarnings("ignore") - parser = export_parser(model_cls=Model, supports_ort=False) + parser = export_parser(model_cls=Model) args = parser.parse_args() export_model(**vars(args)) diff --git a/qai_hub_models/models/deeplabv3_plus_mobilenet_quantized/perf.yaml b/qai_hub_models/models/deeplabv3_plus_mobilenet_quantized/perf.yaml index 71558a21..37b1f507 100644 --- a/qai_hub_models/models/deeplabv3_plus_mobilenet_quantized/perf.yaml +++ b/qai_hub_models/models/deeplabv3_plus_mobilenet_quantized/perf.yaml @@ -26,6 +26,7 @@ aggregated: - Samsung Galaxy S24 Ultra - Samsung Galaxy S24+ - Samsung Galaxy Tab S8 + - Snapdragon X Elite CRD - Xiaomi 12 - Xiaomi 12 Pro supported_chipsets: @@ -36,15 +37,16 @@ aggregated: - Snapdragon® 8 Gen 2 - Snapdragon® 8 Gen 3 - Snapdragon® 888 + - Snapdragon® X Elite models: - name: DeepLabV3-Plus-MobileNet-Quantized performance_metrics: - torchscript_onnx_tflite: - inference_time: 3523.0 - throughput: 283.84899233607723 + inference_time: 3349.0 + throughput: 298.59659599880564 estimated_peak_memory_range: min: 12288 - max: 2061128 + max: 1753112 primary_compute_unit: NPU precision: int8 layer_info: @@ -52,14 +54,14 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 99 - job_id: j2p0l2l9p + job_id: jegn382r5 job_status: Passed torchscript_onnx_qnn: - inference_time: 5308.0 - throughput: 188.39487565938205 + inference_time: 5370.0 + throughput: 186.21973929236498 estimated_peak_memory_range: min: 806912 - max: 9579664 + max: 8194984 primary_compute_unit: NPU precision: int8 layer_info: @@ -67,7 +69,22 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 100 - job_id: jw56nzn6g + job_id: jqpy60e75 + job_status: Passed + torchscript_onnx_ort: + inference_time: 18506.0 + throughput: 54.03652869339674 + estimated_peak_memory_range: + min: 102789120 + max: 122435512 + primary_compute_unit: NPU + precision: int8 + layer_info: + layers_on_npu: 122 + layers_on_gpu: 0 + layers_on_cpu: 51 + total_layers: 173 + job_id: 
jn5q3o84p job_status: Passed reference_device_info: name: Samsung Galaxy S23 @@ -76,13 +93,13 @@ models: os_name: Android manufacturer: Samsung chipset: Snapdragon® 8 Gen 2 - timestamp: '2024-04-30T00:18:21.431467Z' + timestamp: '2024-05-20T16:35:28.582800Z' - torchscript_onnx_tflite: - inference_time: 2623.0 - throughput: 381.2428516965307 + inference_time: 2567.0 + throughput: 389.5597974289053 estimated_peak_memory_range: min: 12288 - max: 58004960 + max: 57529696 primary_compute_unit: NPU precision: int8 layer_info: @@ -90,14 +107,14 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 99 - job_id: j1p8zmzkp + job_id: joprejk95 job_status: Passed torchscript_onnx_qnn: - inference_time: 3894.0 - throughput: 256.8053415511043 + inference_time: 3971.0 + throughput: 251.82573659027952 estimated_peak_memory_range: - min: 802816 - max: 58260400 + min: 962560 + max: 56354464 primary_compute_unit: NPU precision: int8 layer_info: @@ -105,7 +122,22 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 100 - job_id: j1p3e1e35 + job_id: j2p0l7y6p + job_status: Passed + torchscript_onnx_ort: + inference_time: 13687.0 + throughput: 73.06202966318405 + estimated_peak_memory_range: + min: 80236544 + max: 138756336 + primary_compute_unit: NPU + precision: int8 + layer_info: + layers_on_npu: 122 + layers_on_gpu: 0 + layers_on_cpu: 51 + total_layers: 173 + job_id: j1gl3rn8g job_status: Passed reference_device_info: name: Samsung Galaxy S24 @@ -114,13 +146,13 @@ models: os_name: Android manufacturer: Samsung chipset: Snapdragon® 8 Gen 3 - timestamp: '2024-04-30T00:18:21.431668Z' + timestamp: '2024-05-20T16:35:28.582825Z' - torchscript_onnx_tflite: - inference_time: 15123.0 - throughput: 66.12444620776301 + inference_time: 3337.0 + throughput: 299.6703626011387 estimated_peak_memory_range: - min: 40960 - max: 41498720 + min: 12288 + max: 2058944 primary_compute_unit: NPU precision: int8 layer_info: @@ -128,14 +160,14 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 99 - job_id: jn5q3r3np + job_id: jep2ln84g job_status: Passed torchscript_onnx_qnn: - inference_time: 19868.0 - throughput: 50.33219247030401 + inference_time: 5351.0 + throughput: 186.88095683049897 estimated_peak_memory_range: - min: 802816 - max: 50369568 + min: 0 + max: 6063744 primary_compute_unit: NPU precision: int8 layer_info: @@ -143,75 +175,83 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 100 - job_id: j1pvvrwkp + job_id: jogk3mz25 job_status: Passed reference_device_info: - name: RB3 Gen 2 (Proxy) + name: QCS8550 (Proxy) os: '12' form_factor: Iot os_name: Android manufacturer: Qualcomm - chipset: Qcs6490 - timestamp: '2024-04-30T00:18:21.431848Z' + chipset: Qcs8550 + timestamp: '2024-05-20T16:35:28.582842Z' - torchscript_onnx_tflite: - inference_time: 124831.0 - throughput: 8.010830643029376 + inference_time: 15025.0 + throughput: 66.55574043261231 estimated_peak_memory_range: - min: 11464704 - max: 28637488 + min: 5541888 + max: 47370848 primary_compute_unit: NPU precision: int8 layer_info: - layers_on_npu: 96 - layers_on_gpu: 3 + layers_on_npu: 99 + layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 99 - job_id: j1gl323jg + job_id: jnp1e1wkg job_status: Passed torchscript_onnx_qnn: - inference_time: 'null' - throughput: 'null' + inference_time: 20149.0 + throughput: 49.63025460320611 estimated_peak_memory_range: - min: 0 - max: 0 - primary_compute_unit: 'null' - precision: 'null' + min: 12288 + max: 49872128 + primary_compute_unit: NPU + precision: int8 layer_info: - layers_on_npu: 0 + layers_on_npu: 
100 layers_on_gpu: 0 layers_on_cpu: 0 - total_layers: 0 - job_id: j7gje2lv5 - job_status: Failed + total_layers: 100 + job_id: jegn3q3v5 + job_status: Passed reference_device_info: - name: RB5 (Proxy) + name: RB3 Gen 2 (Proxy) os: '12' form_factor: Iot os_name: Android manufacturer: Qualcomm - chipset: Qcs8250 - timestamp: '2024-04-30T00:18:21.432019Z' + chipset: Qcs6490 + timestamp: '2024-05-20T16:35:28.582857Z' - torchscript_onnx_tflite: - inference_time: 3534.0 - throughput: 282.9654782116582 + inference_time: 125926.0 + throughput: 7.941171799310706 estimated_peak_memory_range: - min: 12288 - max: 17568944 + min: 11571200 + max: 17936624 primary_compute_unit: NPU precision: int8 layer_info: - layers_on_npu: 99 - layers_on_gpu: 0 + layers_on_npu: 96 + layers_on_gpu: 3 layers_on_cpu: 0 total_layers: 99 - job_id: jogk3q3w5 + job_id: jvgdo4qkp job_status: Passed - torchscript_onnx_qnn: - inference_time: 5297.0 - throughput: 188.78610534264678 + reference_device_info: + name: RB5 (Proxy) + os: '12' + form_factor: Iot + os_name: Android + manufacturer: Qualcomm + chipset: Qcs8250 + timestamp: '2024-05-20T16:35:28.582868Z' + - torchscript_onnx_qnn: + inference_time: 5343.0 + throughput: 187.16077110237694 estimated_peak_memory_range: - min: 831488 - max: 14169232 + min: 790528 + max: 790528 primary_compute_unit: NPU precision: int8 layer_info: @@ -219,13 +259,43 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 100 - job_id: jwgo3n3qg + job_id: j1p8zvoxp + job_status: Passed + torchscript_onnx_ort: + inference_time: 50376.0 + throughput: 19.850722566301414 + estimated_peak_memory_range: + min: 130891776 + max: 130891776 + primary_compute_unit: NPU + precision: int8 + layer_info: + layers_on_npu: 122 + layers_on_gpu: 0 + layers_on_cpu: 51 + total_layers: 173 + job_id: jw56nl60g + job_status: Passed + torchscript_onnx_ort_dml_gpu: + inference_time: 472873.0 + throughput: 2.114732708359327 + estimated_peak_memory_range: + min: 248066048 + max: 248066048 + primary_compute_unit: GPU + precision: int8 + layer_info: + layers_on_npu: 0 + layers_on_gpu: 1 + layers_on_cpu: 0 + total_layers: 1 + job_id: j1p3e2kl5 job_status: Passed reference_device_info: - name: QCS8550 (Proxy) - os: '12' - form_factor: Iot - os_name: Android + name: Snapdragon X Elite CRD + os: '11' + form_factor: Compute + os_name: Windows manufacturer: Qualcomm - chipset: Qcs8550 - timestamp: '2024-04-30T00:18:21.432205Z' + chipset: Snapdragon® X Elite + timestamp: '2024-05-20T16:35:28.582890Z' diff --git a/qai_hub_models/models/deeplabv3_plus_mobilenet_quantized/test.py b/qai_hub_models/models/deeplabv3_plus_mobilenet_quantized/test.py index b2203267..999eebf7 100644 --- a/qai_hub_models/models/deeplabv3_plus_mobilenet_quantized/test.py +++ b/qai_hub_models/models/deeplabv3_plus_mobilenet_quantized/test.py @@ -3,7 +3,6 @@ # SPDX-License-Identifier: BSD-3-Clause # --------------------------------------------------------------------- import os -import tempfile import zipfile import torch @@ -23,6 +22,7 @@ CachedWebModelAsset, load_image, load_numpy, + qaihm_temp_dir, ) from qai_hub_models.utils.testing import skip_clone_repo_check @@ -48,7 +48,7 @@ def test_task(): def test_aimet_export(): model = DeepLabV3PlusMobilenetQuantizable.from_pretrained() name = model.__class__.__name__ - with tempfile.TemporaryDirectory() as tmpdir: + with qaihm_temp_dir() as tmpdir: output_zip = model.convert_to_onnx_and_aimet_encodings( tmpdir, ) diff --git a/qai_hub_models/models/deeplabv3_resnet50/README.md 
b/qai_hub_models/models/deeplabv3_resnet50/README.md index ab4e87f0..1fec3ba3 100644 --- a/qai_hub_models/models/deeplabv3_resnet50/README.md +++ b/qai_hub_models/models/deeplabv3_resnet50/README.md @@ -14,6 +14,8 @@ accross various devices, can be found [here](https://aihub.qualcomm.com/models/d a hosted Qualcomm® device. + + ## Example & Usage diff --git a/qai_hub_models/models/deeplabv3_resnet50/export.py b/qai_hub_models/models/deeplabv3_resnet50/export.py index 3f35964c..0e6e2e19 100644 --- a/qai_hub_models/models/deeplabv3_resnet50/export.py +++ b/qai_hub_models/models/deeplabv3_resnet50/export.py @@ -120,12 +120,17 @@ def export_model( model.eval() source_model = torch.jit.trace(model.to("cpu"), make_torch_inputs(input_spec)) + # Convert outputs from channel last to channel first (preferred I/O format for QNN and TensorFlow Lite) + channel_last_flags = ( + " --force_channel_last_input image" + + " --force_channel_last_output output_0,output_1" + if target_runtime != TargetRuntime.ORT + else "" + ) + # 2. Compile the model to an on-device asset model_compile_options = model.get_hub_compile_options( - target_runtime, - compile_options - + " --force_channel_last_input image" - + " --force_channel_last_output output_0,output_1", + target_runtime, compile_options + channel_last_flags, hub_device ) print(f"Optimizing model {model_name} to run on-device") submitted_compile_job = hub.submit_compile_job( @@ -163,8 +168,10 @@ def export_model( ) sample_inputs = model.sample_inputs(input_spec) # Convert inputs from channel first to channel last - hub_inputs = transpose_channel_first_to_last( - "image", sample_inputs, target_runtime + hub_inputs = ( + sample_inputs + if target_runtime == TargetRuntime.ORT + else transpose_channel_first_to_last("image", sample_inputs, target_runtime) ) submitted_inference_job = hub.submit_inference_job( model=compile_job.get_target_model(), @@ -192,8 +199,12 @@ def export_model( assert inference_job is not None and inference_job.wait().success inference_result: hub.client.DatasetEntries = inference_job.download_output_data() # type: ignore # Convert outputs from channel last to channel first - inference_result = transpose_channel_last_to_first( - "output_0,output_1", inference_result, target_runtime + inference_result = ( + inference_result + if target_runtime == TargetRuntime.ORT + else transpose_channel_last_to_first( + "output_0,output_1", inference_result, target_runtime + ) ) print_inference_metrics(inference_job, inference_result, torch_out) @@ -205,7 +216,7 @@ def export_model( def main(): warnings.filterwarnings("ignore") - parser = export_parser(model_cls=Model, supports_ort=False) + parser = export_parser(model_cls=Model, supports_qnn=False, supports_ort=False) args = parser.parse_args() export_model(**vars(args)) diff --git a/qai_hub_models/models/deeplabv3_resnet50/model.py b/qai_hub_models/models/deeplabv3_resnet50/model.py index 190f6e20..9dc8cdb7 100644 --- a/qai_hub_models/models/deeplabv3_resnet50/model.py +++ b/qai_hub_models/models/deeplabv3_resnet50/model.py @@ -4,7 +4,10 @@ # --------------------------------------------------------------------- from __future__ import annotations +from typing import Optional + import torchvision.models as tv_models +from qai_hub.client import Device from qai_hub_models.models._shared.deeplab.model import DeepLabV3Model from qai_hub_models.utils.base_model import TargetRuntime @@ -23,20 +26,36 @@ def from_pretrained(cls, weights: str = DEFAULT_WEIGHTS) -> DeepLabV3_ResNet50: return cls(model) def 
get_hub_compile_options( - self, target_runtime: TargetRuntime, other_compile_options: str = "" + self, + target_runtime: TargetRuntime, + other_compile_options: str = "", + device: Optional[Device] = None, ) -> str: compile_options = super().get_hub_compile_options( - target_runtime, other_compile_options + target_runtime, other_compile_options, device ) - return compile_options + " --compute_unit gpu" + if ( + target_runtime == TargetRuntime.TFLITE + and "--compute_unit" not in compile_options + ): + compile_options = compile_options + " --compute_unit gpu" + return compile_options def get_hub_profile_options( - self, target_runtime: TargetRuntime, other_profile_options: str = "" + self, + target_runtime: TargetRuntime, + other_profile_options: str = "", ) -> str: profile_options = super().get_hub_profile_options( - target_runtime, other_profile_options + target_runtime, + other_profile_options, ) - return profile_options + " --compute_unit gpu" + if ( + target_runtime == TargetRuntime.TFLITE + and "--compute_unit" not in profile_options + ): + profile_options = profile_options + " --compute_unit gpu" + return profile_options def forward(self, image): return super().forward(image)["out"] diff --git a/qai_hub_models/models/deeplabv3_resnet50/perf.yaml b/qai_hub_models/models/deeplabv3_resnet50/perf.yaml index 30be88e6..972d9c32 100644 --- a/qai_hub_models/models/deeplabv3_resnet50/perf.yaml +++ b/qai_hub_models/models/deeplabv3_resnet50/perf.yaml @@ -22,6 +22,7 @@ aggregated: - Samsung Galaxy S24 Ultra - Samsung Galaxy S24+ - Samsung Galaxy Tab S8 + - Snapdragon X Elite CRD - Xiaomi 12 - Xiaomi 12 Pro supported_chipsets: @@ -30,15 +31,16 @@ aggregated: - Snapdragon® 8 Gen 2 - Snapdragon® 8 Gen 3 - Snapdragon® 888 + - Snapdragon® X Elite models: - name: DeepLabV3-ResNet50 performance_metrics: - torchscript_onnx_tflite: - inference_time: 290847.0 - throughput: 3.4382338480369405 + inference_time: 295509.0 + throughput: 3.383991688916412 estimated_peak_memory_range: - min: 32768 - max: 223952912 + min: 12288 + max: 211050624 primary_compute_unit: GPU precision: fp16 layer_info: @@ -46,23 +48,38 @@ models: layers_on_gpu: 95 layers_on_cpu: 0 total_layers: 95 - job_id: jz5wq3935 + job_id: jwgo3qyxg job_status: Passed torchscript_onnx_qnn: - inference_time: 810711.0 - throughput: 1.23348517535842 + inference_time: 'null' + throughput: 'null' estimated_peak_memory_range: - min: 3481600 - max: 11830488 - primary_compute_unit: GPU - precision: fp16 + min: 0 + max: 0 + primary_compute_unit: 'null' + precision: 'null' layer_info: layers_on_npu: 0 - layers_on_gpu: 83 + layers_on_gpu: 0 layers_on_cpu: 0 - total_layers: 83 - job_id: jvgdoqvrp - job_status: Passed + total_layers: 0 + job_id: jlpek391p + job_status: Failed + torchscript_onnx_ort: + inference_time: 'null' + throughput: 'null' + estimated_peak_memory_range: + min: 0 + max: 0 + primary_compute_unit: 'null' + precision: 'null' + layer_info: + layers_on_npu: 0 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 0 + job_id: jmg9we1lp + job_status: Failed reference_device_info: name: Samsung Galaxy S23 os: '13' @@ -70,13 +87,13 @@ models: os_name: Android manufacturer: Samsung chipset: Snapdragon® 8 Gen 2 - timestamp: '2024-04-30T00:18:21.450422Z' + timestamp: '2024-05-20T16:35:28.622222Z' - torchscript_onnx_tflite: - inference_time: 228363.0 - throughput: 4.37899309432789 + inference_time: 227563.0 + throughput: 4.394387488299944 estimated_peak_memory_range: - min: 102400 - max: 31114256 + min: 69632 + max: 30257776 primary_compute_unit: GPU 
precision: fp16 layer_info: @@ -84,23 +101,38 @@ models: layers_on_gpu: 95 layers_on_cpu: 0 total_layers: 95 - job_id: jmg9wy4wp + job_id: j1pvvx3jp job_status: Passed torchscript_onnx_qnn: - inference_time: 588856.0 - throughput: 1.6982080508647275 + inference_time: 'null' + throughput: 'null' estimated_peak_memory_range: - min: 3207168 - max: 37364864 - primary_compute_unit: GPU - precision: fp16 + min: 0 + max: 0 + primary_compute_unit: 'null' + precision: 'null' layer_info: layers_on_npu: 0 - layers_on_gpu: 83 + layers_on_gpu: 0 layers_on_cpu: 0 - total_layers: 83 - job_id: jz57xldvg - job_status: Passed + total_layers: 0 + job_id: jygzrkek5 + job_status: Failed + torchscript_onnx_ort: + inference_time: 'null' + throughput: 'null' + estimated_peak_memory_range: + min: 0 + max: 0 + primary_compute_unit: 'null' + precision: 'null' + layer_info: + layers_on_npu: 0 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 0 + job_id: jnp1exl2g + job_status: Failed reference_device_info: name: Samsung Galaxy S24 os: '14' @@ -108,8 +140,23 @@ models: os_name: Android manufacturer: Samsung chipset: Snapdragon® 8 Gen 3 - timestamp: '2024-04-30T00:18:21.450461Z' + timestamp: '2024-05-20T16:35:28.622250Z' - torchscript_onnx_tflite: + inference_time: 292688.0 + throughput: 3.416607445470945 + estimated_peak_memory_range: + min: 1380352 + max: 149690448 + primary_compute_unit: GPU + precision: fp16 + layer_info: + layers_on_npu: 0 + layers_on_gpu: 95 + layers_on_cpu: 0 + total_layers: 95 + job_id: j7gje4xx5 + job_status: Passed + torchscript_onnx_qnn: inference_time: 'null' throughput: 'null' estimated_peak_memory_range: @@ -122,23 +169,8 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 0 - job_id: jnp1ew88g + job_id: jz5wqnv65 job_status: Failed - torchscript_onnx_qnn: - inference_time: 821173.0 - throughput: 1.217770189716418 - estimated_peak_memory_range: - min: 3436544 - max: 12462344 - primary_compute_unit: GPU - precision: fp16 - layer_info: - layers_on_npu: 0 - layers_on_gpu: 83 - layers_on_cpu: 0 - total_layers: 83 - job_id: jqp4vdw8p - job_status: Passed reference_device_info: name: QCS8550 (Proxy) os: '12' @@ -146,4 +178,12 @@ models: os_name: Android manufacturer: Qualcomm chipset: Qcs8550 - timestamp: '2024-04-30T00:18:21.450490Z' + timestamp: '2024-05-20T16:35:28.622267Z' + - reference_device_info: + name: Snapdragon X Elite CRD + os: '11' + form_factor: Compute + os_name: Windows + manufacturer: Qualcomm + chipset: Snapdragon® X Elite + timestamp: '2024-05-20T16:35:28.622274Z' diff --git a/qai_hub_models/models/densenet121/README.md b/qai_hub_models/models/densenet121/README.md index d0d9ab9e..1f95a118 100644 --- a/qai_hub_models/models/densenet121/README.md +++ b/qai_hub_models/models/densenet121/README.md @@ -14,6 +14,8 @@ accross various devices, can be found [here](https://aihub.qualcomm.com/models/d a hosted Qualcomm® device. + + ## Example & Usage diff --git a/qai_hub_models/models/densenet121/export.py b/qai_hub_models/models/densenet121/export.py index 3fcb6bf7..341660ea 100644 --- a/qai_hub_models/models/densenet121/export.py +++ b/qai_hub_models/models/densenet121/export.py @@ -119,9 +119,16 @@ def export_model( model.eval() source_model = torch.jit.trace(model.to("cpu"), make_torch_inputs(input_spec)) + # Convert outputs from channel last to channel first (preferred I/O format for QNN and TensorFlow Lite) + channel_last_flags = ( + " --force_channel_last_input image_tensor" + if target_runtime != TargetRuntime.ORT + else "" + ) + # 2. 
Compile the model to an on-device asset model_compile_options = model.get_hub_compile_options( - target_runtime, compile_options + " --force_channel_last_input image_tensor" + target_runtime, compile_options + channel_last_flags, hub_device ) print(f"Optimizing model {model_name} to run on-device") submitted_compile_job = hub.submit_compile_job( @@ -159,8 +166,12 @@ def export_model( ) sample_inputs = model.sample_inputs(input_spec) # Convert inputs from channel first to channel last - hub_inputs = transpose_channel_first_to_last( - "image_tensor", sample_inputs, target_runtime + hub_inputs = ( + sample_inputs + if target_runtime == TargetRuntime.ORT + else transpose_channel_first_to_last( + "image_tensor", sample_inputs, target_runtime + ) ) submitted_inference_job = hub.submit_inference_job( model=compile_job.get_target_model(), @@ -197,7 +208,7 @@ def export_model( def main(): warnings.filterwarnings("ignore") - parser = export_parser(model_cls=Model, supports_ort=False) + parser = export_parser(model_cls=Model) args = parser.parse_args() export_model(**vars(args)) diff --git a/qai_hub_models/models/densenet121/perf.yaml b/qai_hub_models/models/densenet121/perf.yaml index 97bd9840..29060c6f 100644 --- a/qai_hub_models/models/densenet121/perf.yaml +++ b/qai_hub_models/models/densenet121/perf.yaml @@ -22,6 +22,7 @@ aggregated: - Samsung Galaxy S24 Ultra - Samsung Galaxy S24+ - Samsung Galaxy Tab S8 + - Snapdragon X Elite CRD - Xiaomi 12 - Xiaomi 12 Pro supported_chipsets: @@ -30,15 +31,16 @@ aggregated: - Snapdragon® 8 Gen 2 - Snapdragon® 8 Gen 3 - Snapdragon® 888 + - Snapdragon® X Elite models: - name: DenseNet-121 performance_metrics: - torchscript_onnx_tflite: - inference_time: 1945.0 - throughput: 514.1388174807198 + inference_time: 1948.0 + throughput: 513.347022587269 estimated_peak_memory_range: min: 16384 - max: 2306688 + max: 2162632 primary_compute_unit: NPU precision: fp16 layer_info: @@ -46,14 +48,14 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 312 - job_id: jvgdezmz5 + job_id: jvgdol9ep job_status: Passed torchscript_onnx_qnn: - inference_time: 2005.0 - throughput: 498.75311720698255 + inference_time: 1981.0 + throughput: 504.79555779909134 estimated_peak_memory_range: min: 12288 - max: 40807680 + max: 18832024 primary_compute_unit: NPU precision: fp16 layer_info: @@ -61,23 +63,23 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 372 - job_id: jqp4k921g + job_id: jnp1exl8g job_status: Passed torchscript_onnx_ort: - inference_time: 'null' - throughput: 'null' + inference_time: 1971.0 + throughput: 507.35667174023337 estimated_peak_memory_range: - min: 0 - max: 0 - primary_compute_unit: 'null' - precision: 'null' + min: 12288 + max: 46477632 + primary_compute_unit: NPU + precision: fp16 layer_info: - layers_on_npu: 0 + layers_on_npu: 374 layers_on_gpu: 0 layers_on_cpu: 0 - total_layers: 0 - job_id: jo5mqdl9p - job_status: Failed + total_layers: 374 + job_id: j0pxy2j3g + job_status: Passed reference_device_info: name: Samsung Galaxy S23 os: '13' @@ -85,13 +87,13 @@ models: os_name: Android manufacturer: Samsung chipset: Snapdragon® 8 Gen 2 - timestamp: '2024-04-23T18:42:31.432252Z' + timestamp: '2024-05-20T16:35:28.650430Z' - torchscript_onnx_tflite: - inference_time: 1282.0 - throughput: 780.0312012480499 + inference_time: 1321.0 + throughput: 757.002271006813 estimated_peak_memory_range: - min: 12288 - max: 95228096 + min: 16384 + max: 95688784 primary_compute_unit: NPU precision: fp16 layer_info: @@ -99,14 +101,14 @@ models: layers_on_gpu: 0 
layers_on_cpu: 0 total_layers: 312 - job_id: jz570789g + job_id: jz5wqnv35 job_status: Passed torchscript_onnx_qnn: - inference_time: 1330.0 - throughput: 751.8796992481203 + inference_time: 1319.0 + throughput: 758.1501137225171 estimated_peak_memory_range: min: 618496 - max: 155690704 + max: 162806592 primary_compute_unit: NPU precision: fp16 layer_info: @@ -114,23 +116,23 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 372 - job_id: j0pxndzl5 + job_id: jvgdol9rp job_status: Passed torchscript_onnx_ort: - inference_time: 'null' - throughput: 'null' + inference_time: 1355.0 + throughput: 738.0073800738007 estimated_peak_memory_range: - min: 0 - max: 0 - primary_compute_unit: 'null' - precision: 'null' + min: 618496 + max: 49577808 + primary_compute_unit: NPU + precision: fp16 layer_info: - layers_on_npu: 0 + layers_on_npu: 374 layers_on_gpu: 0 layers_on_cpu: 0 - total_layers: 0 - job_id: jegnl7wq5 - job_status: Failed + total_layers: 374 + job_id: jo5m3y2dg + job_status: Passed reference_device_info: name: Samsung Galaxy S24 os: '14' @@ -138,13 +140,13 @@ models: os_name: Android manufacturer: Samsung chipset: Snapdragon® 8 Gen 3 - timestamp: '2024-04-23T18:42:31.432349Z' + timestamp: '2024-05-20T16:35:28.650456Z' - torchscript_onnx_tflite: - inference_time: 1944.0 - throughput: 514.40329218107 + inference_time: 1948.0 + throughput: 513.347022587269 estimated_peak_memory_range: - min: 20480 - max: 2194800 + min: 28672 + max: 2603520 primary_compute_unit: NPU precision: fp16 layer_info: @@ -152,14 +154,14 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 312 - job_id: j0pxnrjl5 + job_id: jmg9we1wp job_status: Passed torchscript_onnx_qnn: - inference_time: 2008.0 - throughput: 498.00796812749 + inference_time: 1983.0 + throughput: 504.2864346949067 estimated_peak_memory_range: - min: 12288 - max: 40726728 + min: 622592 + max: 6049752 primary_compute_unit: NPU precision: fp16 layer_info: @@ -167,7 +169,7 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 372 - job_id: jep20d6qg + job_id: jqp4v0o8p job_status: Passed reference_device_info: name: QCS8550 (Proxy) @@ -176,4 +178,57 @@ models: os_name: Android manufacturer: Qualcomm chipset: Qcs8550 - timestamp: '2024-04-23T18:42:31.432464Z' + timestamp: '2024-05-20T16:35:28.650472Z' + - torchscript_onnx_qnn: + inference_time: 2255.0 + throughput: 443.4589800443459 + estimated_peak_memory_range: + min: 602112 + max: 602112 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 372 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 372 + job_id: jz57x3wvg + job_status: Passed + torchscript_onnx_ort: + inference_time: 2014.0 + throughput: 496.52432969215494 + estimated_peak_memory_range: + min: 606208 + max: 606208 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 374 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 374 + job_id: jegn38yk5 + job_status: Passed + torchscript_onnx_ort_dml_gpu: + inference_time: 17596.0 + throughput: 56.83109797681291 + estimated_peak_memory_range: + min: 856064 + max: 856064 + primary_compute_unit: CPU + precision: fp32 + layer_info: + layers_on_npu: 0 + layers_on_gpu: 0 + layers_on_cpu: 311 + total_layers: 311 + job_id: joprejq05 + job_status: Passed + reference_device_info: + name: Snapdragon X Elite CRD + os: '11' + form_factor: Compute + os_name: Windows + manufacturer: Qualcomm + chipset: Snapdragon® X Elite + timestamp: '2024-05-20T16:35:28.650494Z' diff --git a/qai_hub_models/models/detr_resnet101/README.md 
b/qai_hub_models/models/detr_resnet101/README.md index 6ed10970..adaff2fc 100644 --- a/qai_hub_models/models/detr_resnet101/README.md +++ b/qai_hub_models/models/detr_resnet101/README.md @@ -14,6 +14,8 @@ accross various devices, can be found [here](https://aihub.qualcomm.com/models/d a hosted Qualcomm® device. + + ## Example & Usage Install the package via pip: diff --git a/qai_hub_models/models/detr_resnet101/export.py b/qai_hub_models/models/detr_resnet101/export.py index 334955f6..ff0acdce 100644 --- a/qai_hub_models/models/detr_resnet101/export.py +++ b/qai_hub_models/models/detr_resnet101/export.py @@ -119,9 +119,16 @@ def export_model( model.eval() source_model = torch.jit.trace(model.to("cpu"), make_torch_inputs(input_spec)) + # Convert outputs from channel last to channel first (preferred I/O format for QNN and TensorFlow Lite) + channel_last_flags = ( + " --force_channel_last_input image" + if target_runtime != TargetRuntime.ORT + else "" + ) + # 2. Compile the model to an on-device asset model_compile_options = model.get_hub_compile_options( - target_runtime, compile_options + " --force_channel_last_input image" + target_runtime, compile_options + channel_last_flags, hub_device ) print(f"Optimizing model {model_name} to run on-device") submitted_compile_job = hub.submit_compile_job( @@ -159,8 +166,10 @@ def export_model( ) sample_inputs = model.sample_inputs(input_spec) # Convert inputs from channel first to channel last - hub_inputs = transpose_channel_first_to_last( - "image", sample_inputs, target_runtime + hub_inputs = ( + sample_inputs + if target_runtime == TargetRuntime.ORT + else transpose_channel_first_to_last("image", sample_inputs, target_runtime) ) submitted_inference_job = hub.submit_inference_job( model=compile_job.get_target_model(), @@ -197,7 +206,7 @@ def export_model( def main(): warnings.filterwarnings("ignore") - parser = export_parser(model_cls=Model, supports_qnn=False, supports_ort=False) + parser = export_parser(model_cls=Model, supports_qnn=False) args = parser.parse_args() export_model(**vars(args)) diff --git a/qai_hub_models/models/detr_resnet101/perf.yaml b/qai_hub_models/models/detr_resnet101/perf.yaml index 41a76998..5f60a399 100644 --- a/qai_hub_models/models/detr_resnet101/perf.yaml +++ b/qai_hub_models/models/detr_resnet101/perf.yaml @@ -22,6 +22,7 @@ aggregated: - Samsung Galaxy S24 Ultra - Samsung Galaxy S24+ - Samsung Galaxy Tab S8 + - Snapdragon X Elite CRD - Xiaomi 12 - Xiaomi 12 Pro supported_chipsets: @@ -30,38 +31,54 @@ aggregated: - Snapdragon® 8 Gen 2 - Snapdragon® 8 Gen 3 - Snapdragon® 888 + - Snapdragon® X Elite models: - name: DETR-ResNet101 performance_metrics: - torchscript_onnx_tflite: - inference_time: 47978.0 - throughput: 20.842886322897996 + inference_time: 24664.0 + throughput: 40.5449237755433 estimated_peak_memory_range: - min: 94208 - max: 9060976 + min: 438272 + max: 3728248 primary_compute_unit: NPU precision: fp16 layer_info: - layers_on_npu: 910 - layers_on_gpu: 2 + layers_on_npu: 839 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 839 + job_id: jep2ln6rg + job_status: Passed + torchscript_onnx_qnn: + inference_time: 21040.0 + throughput: 47.52851711026616 + estimated_peak_memory_range: + min: 2801664 + max: 31885224 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 1084 + layers_on_gpu: 0 layers_on_cpu: 0 - total_layers: 912 - job_id: jep20vzqg + total_layers: 1084 + job_id: j1p8zv9kp job_status: Passed torchscript_onnx_ort: - inference_time: 26243.0 - throughput: 38.105399535114124 + 
inference_time: 22542.0 + throughput: 44.36163605713779 estimated_peak_memory_range: - min: 0 - max: 299546600 - primary_compute_unit: CPU - precision: fp32 + min: 16384 + max: 296984832 + primary_compute_unit: NPU + precision: fp16 layer_info: - layers_on_npu: 3 + layers_on_npu: 856 layers_on_gpu: 0 - layers_on_cpu: 5 - total_layers: 8 - job_id: j2p03vxnp + layers_on_cpu: 0 + total_layers: 856 + job_id: jw56nlj6g job_status: Passed reference_device_info: name: Samsung Galaxy S23 @@ -70,36 +87,51 @@ models: os_name: Android manufacturer: Samsung chipset: Snapdragon® 8 Gen 2 - timestamp: '2024-04-23T18:42:31.456092Z' + timestamp: '2024-05-20T16:35:28.680620Z' - torchscript_onnx_tflite: - inference_time: 35573.0 - throughput: 28.111207938605123 + inference_time: 17307.0 + throughput: 57.78008898133703 estimated_peak_memory_range: - min: 28672 - max: 261178736 + min: 106496 + max: 282048208 primary_compute_unit: NPU precision: fp16 layer_info: - layers_on_npu: 910 - layers_on_gpu: 2 + layers_on_npu: 839 + layers_on_gpu: 0 layers_on_cpu: 0 - total_layers: 912 - job_id: jqpyr7yl5 + total_layers: 839 + job_id: jqpy60w85 + job_status: Passed + torchscript_onnx_qnn: + inference_time: 15126.0 + throughput: 66.11133148221606 + estimated_peak_memory_range: + min: 2797568 + max: 330730224 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 1084 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 1084 + job_id: jogk3mnw5 job_status: Passed torchscript_onnx_ort: - inference_time: 19779.0 - throughput: 50.558673340411545 + inference_time: 15844.0 + throughput: 63.11537490532694 estimated_peak_memory_range: - min: 3723264 - max: 90043392 - primary_compute_unit: CPU - precision: fp32 + min: 2781184 + max: 113431904 + primary_compute_unit: NPU + precision: fp16 layer_info: - layers_on_npu: 3 + layers_on_npu: 856 layers_on_gpu: 0 - layers_on_cpu: 5 - total_layers: 8 - job_id: j1p804kog + layers_on_cpu: 0 + total_layers: 856 + job_id: j1p3e2335 job_status: Passed reference_device_info: name: Samsung Galaxy S24 @@ -108,21 +140,36 @@ models: os_name: Android manufacturer: Samsung chipset: Snapdragon® 8 Gen 3 - timestamp: '2024-04-23T18:42:31.456228Z' + timestamp: '2024-05-20T16:35:28.680646Z' - torchscript_onnx_tflite: - inference_time: 48057.0 - throughput: 20.80862309340991 + inference_time: 24760.0 + throughput: 40.38772213247173 estimated_peak_memory_range: - min: 1380352 - max: 12433288 + min: 405504 + max: 3265984 primary_compute_unit: NPU precision: fp16 layer_info: - layers_on_npu: 910 - layers_on_gpu: 2 + layers_on_npu: 839 + layers_on_gpu: 0 layers_on_cpu: 0 - total_layers: 912 - job_id: j1gl68rmg + total_layers: 839 + job_id: j2p0l7q9p + job_status: Passed + torchscript_onnx_qnn: + inference_time: 21118.0 + throughput: 47.35296903115825 + estimated_peak_memory_range: + min: 2813952 + max: 31273000 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 1084 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 1084 + job_id: j1gl3rzjg job_status: Passed reference_device_info: name: QCS8550 (Proxy) @@ -131,4 +178,57 @@ models: os_name: Android manufacturer: Qualcomm chipset: Qcs8550 - timestamp: '2024-04-23T18:42:31.456328Z' + timestamp: '2024-05-20T16:35:28.680664Z' + - torchscript_onnx_qnn: + inference_time: 31213.0 + throughput: 32.03793291256848 + estimated_peak_memory_range: + min: 2768896 + max: 2768896 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 1084 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 1084 + 
job_id: jn5q3oknp + job_status: Passed + torchscript_onnx_ort: + inference_time: 23126.0 + throughput: 43.24137334601747 + estimated_peak_memory_range: + min: 117997568 + max: 117997568 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 856 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 856 + job_id: jwgo3q0qg + job_status: Passed + torchscript_onnx_ort_dml_gpu: + inference_time: 1815598.0 + throughput: 0.5507827173195828 + estimated_peak_memory_range: + min: 280969216 + max: 280969216 + primary_compute_unit: CPU + precision: fp32 + layer_info: + layers_on_npu: 0 + layers_on_gpu: 0 + layers_on_cpu: 628 + total_layers: 628 + job_id: j1pvvxxkp + job_status: Passed + reference_device_info: + name: Snapdragon X Elite CRD + os: '11' + form_factor: Compute + os_name: Windows + manufacturer: Qualcomm + chipset: Snapdragon® X Elite + timestamp: '2024-05-20T16:35:28.680687Z' diff --git a/qai_hub_models/models/detr_resnet101_dc5/README.md b/qai_hub_models/models/detr_resnet101_dc5/README.md index 05a1484e..272c64ab 100644 --- a/qai_hub_models/models/detr_resnet101_dc5/README.md +++ b/qai_hub_models/models/detr_resnet101_dc5/README.md @@ -14,6 +14,8 @@ accross various devices, can be found [here](https://aihub.qualcomm.com/models/d a hosted Qualcomm® device. + + ## Example & Usage Install the package via pip: diff --git a/qai_hub_models/models/detr_resnet101_dc5/export.py b/qai_hub_models/models/detr_resnet101_dc5/export.py index 36390c36..2a05937e 100644 --- a/qai_hub_models/models/detr_resnet101_dc5/export.py +++ b/qai_hub_models/models/detr_resnet101_dc5/export.py @@ -119,9 +119,16 @@ def export_model( model.eval() source_model = torch.jit.trace(model.to("cpu"), make_torch_inputs(input_spec)) + # Convert outputs from channel last to channel first (preferred I/O format for QNN and TensorFlow Lite) + channel_last_flags = ( + " --force_channel_last_input image" + if target_runtime != TargetRuntime.ORT + else "" + ) + # 2. 
Compile the model to an on-device asset model_compile_options = model.get_hub_compile_options( - target_runtime, compile_options + " --force_channel_last_input image" + target_runtime, compile_options + channel_last_flags, hub_device ) print(f"Optimizing model {model_name} to run on-device") submitted_compile_job = hub.submit_compile_job( @@ -159,8 +166,10 @@ def export_model( ) sample_inputs = model.sample_inputs(input_spec) # Convert inputs from channel first to channel last - hub_inputs = transpose_channel_first_to_last( - "image", sample_inputs, target_runtime + hub_inputs = ( + sample_inputs + if target_runtime == TargetRuntime.ORT + else transpose_channel_first_to_last("image", sample_inputs, target_runtime) ) submitted_inference_job = hub.submit_inference_job( model=compile_job.get_target_model(), @@ -197,7 +206,7 @@ def export_model( def main(): warnings.filterwarnings("ignore") - parser = export_parser(model_cls=Model, supports_qnn=False, supports_ort=False) + parser = export_parser(model_cls=Model, supports_qnn=False) args = parser.parse_args() export_model(**vars(args)) diff --git a/qai_hub_models/models/detr_resnet101_dc5/perf.yaml b/qai_hub_models/models/detr_resnet101_dc5/perf.yaml index a8021e26..51fb42f3 100644 --- a/qai_hub_models/models/detr_resnet101_dc5/perf.yaml +++ b/qai_hub_models/models/detr_resnet101_dc5/perf.yaml @@ -22,6 +22,7 @@ aggregated: - Samsung Galaxy S24 Ultra - Samsung Galaxy S24+ - Samsung Galaxy Tab S8 + - Snapdragon X Elite CRD - Xiaomi 12 - Xiaomi 12 Pro supported_chipsets: @@ -30,38 +31,54 @@ aggregated: - Snapdragon® 8 Gen 2 - Snapdragon® 8 Gen 3 - Snapdragon® 888 + - Snapdragon® X Elite models: - name: DETR-ResNet101-DC5 performance_metrics: - torchscript_onnx_tflite: - inference_time: 407929.0 - throughput: 2.451406985039063 + inference_time: 146017.0 + throughput: 6.848517638357178 estimated_peak_memory_range: - min: 7622656 - max: 15500416 + min: 1216512 + max: 4088024 primary_compute_unit: NPU precision: fp16 layer_info: - layers_on_npu: 911 - layers_on_gpu: 2 + layers_on_npu: 840 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 840 + job_id: j7gje44v5 + job_status: Passed + torchscript_onnx_qnn: + inference_time: 142673.0 + throughput: 7.009034645658254 + estimated_peak_memory_range: + min: 2891776 + max: 63987360 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 1084 + layers_on_gpu: 0 layers_on_cpu: 0 - total_layers: 913 - job_id: jn5qemdo5 + total_layers: 1084 + job_id: jz5wqnn35 job_status: Passed torchscript_onnx_ort: - inference_time: 179129.0 - throughput: 5.582568986596252 + inference_time: 135442.0 + throughput: 7.383234151887893 estimated_peak_memory_range: - min: 2637824 - max: 309754336 - primary_compute_unit: CPU - precision: fp32 + min: 2297856 + max: 306707784 + primary_compute_unit: NPU + precision: fp16 layer_info: - layers_on_npu: 3 + layers_on_npu: 856 layers_on_gpu: 0 - layers_on_cpu: 5 - total_layers: 8 - job_id: jw56ed9yg + layers_on_cpu: 0 + total_layers: 856 + job_id: jz57x33vg job_status: Passed reference_device_info: name: Samsung Galaxy S23 @@ -70,36 +87,51 @@ models: os_name: Android manufacturer: Samsung chipset: Snapdragon® 8 Gen 2 - timestamp: '2024-04-23T18:42:31.473830Z' + timestamp: '2024-05-20T16:35:28.711241Z' - torchscript_onnx_tflite: - inference_time: 311354.0 - throughput: 3.2117782331365583 + inference_time: 107206.0 + throughput: 9.327836128574893 estimated_peak_memory_range: - min: 90112 - max: 447334464 + min: 790528 + max: 492355520 primary_compute_unit: NPU 
precision: fp16 layer_info: - layers_on_npu: 911 - layers_on_gpu: 2 + layers_on_npu: 840 + layers_on_gpu: 0 layers_on_cpu: 0 - total_layers: 913 - job_id: j1gl619mg + total_layers: 840 + job_id: jlpek33op + job_status: Passed + torchscript_onnx_qnn: + inference_time: 100534.0 + throughput: 9.946883641355164 + estimated_peak_memory_range: + min: 460566528 + max: 811388336 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 1084 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 1084 + job_id: jmg9weewp job_status: Passed torchscript_onnx_ort: - inference_time: 135318.0 - throughput: 7.3899998522000026 + inference_time: 95212.0 + throughput: 10.502877788514052 estimated_peak_memory_range: - min: 10055680 - max: 190681632 - primary_compute_unit: CPU - precision: fp32 + min: 4116480 + max: 168196992 + primary_compute_unit: NPU + precision: fp16 layer_info: - layers_on_npu: 3 + layers_on_npu: 856 layers_on_gpu: 0 - layers_on_cpu: 5 - total_layers: 8 - job_id: j1p3vwlng + layers_on_cpu: 0 + total_layers: 856 + job_id: jqp4v008p job_status: Passed reference_device_info: name: Samsung Galaxy S24 @@ -108,21 +140,36 @@ models: os_name: Android manufacturer: Samsung chipset: Snapdragon® 8 Gen 3 - timestamp: '2024-04-23T18:42:31.473979Z' + timestamp: '2024-05-20T16:35:28.711267Z' - torchscript_onnx_tflite: - inference_time: 405436.0 - throughput: 2.4664805296026993 + inference_time: 141747.0 + throughput: 7.054823029764298 estimated_peak_memory_range: - min: 6467584 - max: 13861952 + min: 184320 + max: 5835464 primary_compute_unit: NPU precision: fp16 layer_info: - layers_on_npu: 911 - layers_on_gpu: 2 + layers_on_npu: 840 + layers_on_gpu: 0 layers_on_cpu: 0 - total_layers: 913 - job_id: jlpeex3vp + total_layers: 840 + job_id: jygzrkko5 + job_status: Passed + torchscript_onnx_qnn: + inference_time: 144502.0 + throughput: 6.92031944194544 + estimated_peak_memory_range: + min: 2871296 + max: 58689696 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 1084 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 1084 + job_id: jvgdollrp job_status: Passed reference_device_info: name: QCS8550 (Proxy) @@ -131,4 +178,57 @@ models: os_name: Android manufacturer: Qualcomm chipset: Qcs8550 - timestamp: '2024-04-23T18:42:31.474090Z' + timestamp: '2024-05-20T16:35:28.711285Z' + - torchscript_onnx_qnn: + inference_time: 172453.0 + throughput: 5.798681379854221 + estimated_peak_memory_range: + min: 2772992 + max: 2772992 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 1084 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 1084 + job_id: jnp1exx8g + job_status: Passed + torchscript_onnx_ort: + inference_time: 125853.0 + throughput: 7.945778010853933 + estimated_peak_memory_range: + min: 119799808 + max: 119799808 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 856 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 856 + job_id: j0pxy223g + job_status: Passed + torchscript_onnx_ort_dml_gpu: + inference_time: 2208720.0 + throughput: 0.4527509145568474 + estimated_peak_memory_range: + min: 280973312 + max: 280973312 + primary_compute_unit: CPU + precision: fp32 + layer_info: + layers_on_npu: 0 + layers_on_gpu: 0 + layers_on_cpu: 628 + total_layers: 628 + job_id: jo5m3yydg + job_status: Passed + reference_device_info: + name: Snapdragon X Elite CRD + os: '11' + form_factor: Compute + os_name: Windows + manufacturer: Qualcomm + chipset: Snapdragon® X Elite + timestamp: '2024-05-20T16:35:28.711307Z' diff 
--git a/qai_hub_models/models/detr_resnet50/README.md b/qai_hub_models/models/detr_resnet50/README.md index 362f9f43..e37e8210 100644 --- a/qai_hub_models/models/detr_resnet50/README.md +++ b/qai_hub_models/models/detr_resnet50/README.md @@ -14,6 +14,8 @@ accross various devices, can be found [here](https://aihub.qualcomm.com/models/d a hosted Qualcomm® device. + + ## Example & Usage Install the package via pip: diff --git a/qai_hub_models/models/detr_resnet50/export.py b/qai_hub_models/models/detr_resnet50/export.py index f775cebf..3d9e505c 100644 --- a/qai_hub_models/models/detr_resnet50/export.py +++ b/qai_hub_models/models/detr_resnet50/export.py @@ -119,9 +119,16 @@ def export_model( model.eval() source_model = torch.jit.trace(model.to("cpu"), make_torch_inputs(input_spec)) + # Convert outputs from channel last to channel first (preferred I/O format for QNN and TensorFlow Lite) + channel_last_flags = ( + " --force_channel_last_input image" + if target_runtime != TargetRuntime.ORT + else "" + ) + # 2. Compile the model to an on-device asset model_compile_options = model.get_hub_compile_options( - target_runtime, compile_options + " --force_channel_last_input image" + target_runtime, compile_options + channel_last_flags, hub_device ) print(f"Optimizing model {model_name} to run on-device") submitted_compile_job = hub.submit_compile_job( @@ -159,8 +166,10 @@ def export_model( ) sample_inputs = model.sample_inputs(input_spec) # Convert inputs from channel first to channel last - hub_inputs = transpose_channel_first_to_last( - "image", sample_inputs, target_runtime + hub_inputs = ( + sample_inputs + if target_runtime == TargetRuntime.ORT + else transpose_channel_first_to_last("image", sample_inputs, target_runtime) ) submitted_inference_job = hub.submit_inference_job( model=compile_job.get_target_model(), @@ -197,7 +206,7 @@ def export_model( def main(): warnings.filterwarnings("ignore") - parser = export_parser(model_cls=Model, supports_qnn=False, supports_ort=False) + parser = export_parser(model_cls=Model, supports_qnn=False) args = parser.parse_args() export_model(**vars(args)) diff --git a/qai_hub_models/models/detr_resnet50/perf.yaml b/qai_hub_models/models/detr_resnet50/perf.yaml index cf9904e7..dacb9380 100644 --- a/qai_hub_models/models/detr_resnet50/perf.yaml +++ b/qai_hub_models/models/detr_resnet50/perf.yaml @@ -22,6 +22,7 @@ aggregated: - Samsung Galaxy S24 Ultra - Samsung Galaxy S24+ - Samsung Galaxy Tab S8 + - Snapdragon X Elite CRD - Xiaomi 12 - Xiaomi 12 Pro supported_chipsets: @@ -30,38 +31,54 @@ aggregated: - Snapdragon® 8 Gen 2 - Snapdragon® 8 Gen 3 - Snapdragon® 888 + - Snapdragon® X Elite models: - name: DETR-ResNet50 performance_metrics: - torchscript_onnx_tflite: - inference_time: 39035.0 - throughput: 25.618035096708084 + inference_time: 20791.0 + throughput: 48.0977345967005 estimated_peak_memory_range: - min: 1327104 - max: 9193440 + min: 57344 + max: 3249616 primary_compute_unit: NPU precision: fp16 layer_info: - layers_on_npu: 842 - layers_on_gpu: 2 + layers_on_npu: 771 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 771 + job_id: jegn388k5 + job_status: Passed + torchscript_onnx_qnn: + inference_time: 19328.0 + throughput: 51.73841059602649 + estimated_peak_memory_range: + min: 2805760 + max: 23254680 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 863 + layers_on_gpu: 0 layers_on_cpu: 0 - total_layers: 844 - job_id: j1pv09yr5 + total_layers: 863 + job_id: jqpy60085 job_status: Passed torchscript_onnx_ort: - inference_time: 
22280.0 - throughput: 44.88330341113106 + inference_time: 16790.0 + throughput: 59.55926146515783 estimated_peak_memory_range: - min: 1789952 - max: 205559344 - primary_compute_unit: CPU - precision: fp32 + min: 536576 + max: 208713080 + primary_compute_unit: NPU + precision: fp16 layer_info: - layers_on_npu: 3 + layers_on_npu: 737 layers_on_gpu: 0 - layers_on_cpu: 5 - total_layers: 8 - job_id: jlpeel0vp + layers_on_cpu: 0 + total_layers: 737 + job_id: jn5q3oonp job_status: Passed reference_device_info: name: Samsung Galaxy S23 @@ -70,36 +87,51 @@ models: os_name: Android manufacturer: Samsung chipset: Snapdragon® 8 Gen 2 - timestamp: '2024-04-23T18:42:31.491676Z' + timestamp: '2024-05-20T16:35:28.741938Z' - torchscript_onnx_tflite: - inference_time: 28469.0 - throughput: 35.12592644631002 + inference_time: 14384.0 + throughput: 69.52169076751946 estimated_peak_memory_range: - min: 1241088 - max: 215942624 + min: 409600 + max: 231124128 primary_compute_unit: NPU precision: fp16 layer_info: - layers_on_npu: 842 - layers_on_gpu: 2 + layers_on_npu: 771 + layers_on_gpu: 0 layers_on_cpu: 0 - total_layers: 844 - job_id: j7gjzw6e5 + total_layers: 771 + job_id: joprejj05 + job_status: Passed + torchscript_onnx_qnn: + inference_time: 13592.0 + throughput: 73.57268981753973 + estimated_peak_memory_range: + min: 2801664 + max: 247117184 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 863 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 863 + job_id: j2p0l779p job_status: Passed torchscript_onnx_ort: - inference_time: 17238.0 - throughput: 58.0113702285648 + inference_time: 11524.0 + throughput: 86.77542519958348 estimated_peak_memory_range: - min: 3723264 - max: 80445392 - primary_compute_unit: CPU - precision: fp32 + min: 4878336 + max: 99183200 + primary_compute_unit: NPU + precision: fp16 layer_info: - layers_on_npu: 3 + layers_on_npu: 737 layers_on_gpu: 0 - layers_on_cpu: 5 - total_layers: 8 - job_id: jygzo4qx5 + layers_on_cpu: 0 + total_layers: 737 + job_id: j1gl3rrjg job_status: Passed reference_device_info: name: Samsung Galaxy S24 @@ -108,21 +140,36 @@ models: os_name: Android manufacturer: Samsung chipset: Snapdragon® 8 Gen 3 - timestamp: '2024-04-23T18:42:31.491805Z' + timestamp: '2024-05-20T16:35:28.741963Z' - torchscript_onnx_tflite: - inference_time: 38866.0 - throughput: 25.729429321257655 + inference_time: 20731.0 + throughput: 48.23693984853601 estimated_peak_memory_range: - min: 1429504 - max: 8463712 + min: 405504 + max: 3824656 primary_compute_unit: NPU precision: fp16 layer_info: - layers_on_npu: 842 - layers_on_gpu: 2 + layers_on_npu: 771 + layers_on_gpu: 0 layers_on_cpu: 0 - total_layers: 844 - job_id: jz570n39g + total_layers: 771 + job_id: jep2lnnrg + job_status: Passed + torchscript_onnx_qnn: + inference_time: 19426.0 + throughput: 51.47740142077628 + estimated_peak_memory_range: + min: 40960 + max: 25594136 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 863 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 863 + job_id: jogk3mmw5 job_status: Passed reference_device_info: name: QCS8550 (Proxy) @@ -131,4 +178,57 @@ models: os_name: Android manufacturer: Qualcomm chipset: Qcs8550 - timestamp: '2024-04-23T18:42:31.491909Z' + timestamp: '2024-05-20T16:35:28.741980Z' + - torchscript_onnx_qnn: + inference_time: 22410.0 + throughput: 44.62293618920125 + estimated_peak_memory_range: + min: 2768896 + max: 2768896 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 863 + layers_on_gpu: 0 + 
layers_on_cpu: 0 + total_layers: 863 + job_id: j1p8zvvkp + job_status: Passed + torchscript_onnx_ort: + inference_time: 17039.0 + throughput: 58.68889019308645 + estimated_peak_memory_range: + min: 33472512 + max: 33472512 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 737 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 737 + job_id: jw56nll6g + job_status: Passed + torchscript_onnx_ort_dml_gpu: + inference_time: 68004.0 + throughput: 14.705017351920475 + estimated_peak_memory_range: + min: 3866624 + max: 3866624 + primary_compute_unit: CPU + precision: fp32 + layer_info: + layers_on_npu: 0 + layers_on_gpu: 0 + layers_on_cpu: 577 + total_layers: 577 + job_id: j1p3e2235 + job_status: Passed + reference_device_info: + name: Snapdragon X Elite CRD + os: '11' + form_factor: Compute + os_name: Windows + manufacturer: Qualcomm + chipset: Snapdragon® X Elite + timestamp: '2024-05-20T16:35:28.742002Z' diff --git a/qai_hub_models/models/detr_resnet50_dc5/README.md b/qai_hub_models/models/detr_resnet50_dc5/README.md index cb3249d0..39efe2a1 100644 --- a/qai_hub_models/models/detr_resnet50_dc5/README.md +++ b/qai_hub_models/models/detr_resnet50_dc5/README.md @@ -14,6 +14,8 @@ accross various devices, can be found [here](https://aihub.qualcomm.com/models/d a hosted Qualcomm® device. + + ## Example & Usage Install the package via pip: diff --git a/qai_hub_models/models/detr_resnet50_dc5/export.py b/qai_hub_models/models/detr_resnet50_dc5/export.py index 4415c02a..ac9e21c6 100644 --- a/qai_hub_models/models/detr_resnet50_dc5/export.py +++ b/qai_hub_models/models/detr_resnet50_dc5/export.py @@ -119,9 +119,16 @@ def export_model( model.eval() source_model = torch.jit.trace(model.to("cpu"), make_torch_inputs(input_spec)) + # Convert outputs from channel last to channel first (preferred I/O format for QNN and TensorFlow Lite) + channel_last_flags = ( + " --force_channel_last_input image" + if target_runtime != TargetRuntime.ORT + else "" + ) + # 2. 
Compile the model to an on-device asset model_compile_options = model.get_hub_compile_options( - target_runtime, compile_options + " --force_channel_last_input image" + target_runtime, compile_options + channel_last_flags, hub_device ) print(f"Optimizing model {model_name} to run on-device") submitted_compile_job = hub.submit_compile_job( @@ -159,8 +166,10 @@ def export_model( ) sample_inputs = model.sample_inputs(input_spec) # Convert inputs from channel first to channel last - hub_inputs = transpose_channel_first_to_last( - "image", sample_inputs, target_runtime + hub_inputs = ( + sample_inputs + if target_runtime == TargetRuntime.ORT + else transpose_channel_first_to_last("image", sample_inputs, target_runtime) ) submitted_inference_job = hub.submit_inference_job( model=compile_job.get_target_model(), @@ -197,7 +206,7 @@ def export_model( def main(): warnings.filterwarnings("ignore") - parser = export_parser(model_cls=Model, supports_qnn=False, supports_ort=False) + parser = export_parser(model_cls=Model, supports_qnn=False) args = parser.parse_args() export_model(**vars(args)) diff --git a/qai_hub_models/models/detr_resnet50_dc5/perf.yaml b/qai_hub_models/models/detr_resnet50_dc5/perf.yaml index 8cf30015..5ee7970c 100644 --- a/qai_hub_models/models/detr_resnet50_dc5/perf.yaml +++ b/qai_hub_models/models/detr_resnet50_dc5/perf.yaml @@ -22,6 +22,7 @@ aggregated: - Samsung Galaxy S24 Ultra - Samsung Galaxy S24+ - Samsung Galaxy Tab S8 + - Snapdragon X Elite CRD - Xiaomi 12 - Xiaomi 12 Pro supported_chipsets: @@ -30,38 +31,54 @@ aggregated: - Snapdragon® 8 Gen 2 - Snapdragon® 8 Gen 3 - Snapdragon® 888 + - Snapdragon® X Elite models: - name: DETR-ResNet50-DC5 performance_metrics: - torchscript_onnx_tflite: - inference_time: 405395.0 - throughput: 2.4667299794028046 + inference_time: 135457.0 + throughput: 7.382416560236828 estimated_peak_memory_range: - min: 339968 - max: 8125832 + min: 1200128 + max: 4621488 primary_compute_unit: NPU precision: fp16 layer_info: - layers_on_npu: 843 - layers_on_gpu: 2 + layers_on_npu: 772 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 772 + job_id: jwgo3qqqg + job_status: Passed + torchscript_onnx_qnn: + inference_time: 121332.0 + throughput: 8.2418488115254 + estimated_peak_memory_range: + min: 65536 + max: 55100088 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 863 + layers_on_gpu: 0 layers_on_cpu: 0 - total_layers: 845 - job_id: jmg9jx785 + total_layers: 863 + job_id: jlpevmmo5 job_status: Passed torchscript_onnx_ort: - inference_time: 174726.0 - throughput: 5.723246683378547 + inference_time: 119137.0 + throughput: 8.39369801153294 estimated_peak_memory_range: - min: 7774208 - max: 210473208 - primary_compute_unit: CPU - precision: fp32 + min: 679936 + max: 229172048 + primary_compute_unit: NPU + precision: fp16 layer_info: - layers_on_npu: 3 + layers_on_npu: 737 layers_on_gpu: 0 - layers_on_cpu: 5 - total_layers: 8 - job_id: jvgdezyz5 + layers_on_cpu: 0 + total_layers: 737 + job_id: jnp18zz8g job_status: Passed reference_device_info: name: Samsung Galaxy S23 @@ -70,36 +87,51 @@ models: os_name: Android manufacturer: Samsung chipset: Snapdragon® 8 Gen 2 - timestamp: '2024-04-23T18:42:31.509285Z' + timestamp: '2024-05-20T16:35:28.772381Z' - torchscript_onnx_tflite: - inference_time: 306266.0 - throughput: 3.26513553577609 + inference_time: 102211.0 + throughput: 9.78368277386974 estimated_peak_memory_range: - min: 16384 - max: 412400848 + min: 1204224 + max: 442913328 primary_compute_unit: NPU precision: fp16 
layer_info: - layers_on_npu: 843 - layers_on_gpu: 2 + layers_on_npu: 772 + layers_on_gpu: 0 layers_on_cpu: 0 - total_layers: 845 - job_id: jnp1yvk7p + total_layers: 772 + job_id: j1pvwkkkg + job_status: Passed + torchscript_onnx_qnn: + inference_time: 92508.0 + throughput: 10.809875902624638 + estimated_peak_memory_range: + min: 2818048 + max: 287246416 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 863 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 863 + job_id: jygz7ddop job_status: Passed torchscript_onnx_ort: - inference_time: 130531.0 - throughput: 7.66101539097992 + inference_time: 90890.0 + throughput: 11.002310485201892 estimated_peak_memory_range: - min: 10014720 - max: 184574640 - primary_compute_unit: CPU - precision: fp32 + min: 4927488 + max: 146881408 + primary_compute_unit: NPU + precision: fp16 layer_info: - layers_on_npu: 3 + layers_on_npu: 737 layers_on_gpu: 0 - layers_on_cpu: 5 - total_layers: 8 - job_id: jz570719g + layers_on_cpu: 0 + total_layers: 737 + job_id: jvgdv11rg job_status: Passed reference_device_info: name: Samsung Galaxy S24 @@ -108,21 +140,36 @@ models: os_name: Android manufacturer: Samsung chipset: Snapdragon® 8 Gen 3 - timestamp: '2024-04-23T18:42:31.509384Z' + timestamp: '2024-05-20T16:35:28.772406Z' - torchscript_onnx_tflite: - inference_time: 400391.0 - throughput: 2.497558636432887 + inference_time: 134542.0 + throughput: 7.432623270056934 estimated_peak_memory_range: - min: 7581696 - max: 16235952 + min: 1204224 + max: 4576992 primary_compute_unit: NPU precision: fp16 layer_info: - layers_on_npu: 843 - layers_on_gpu: 2 + layers_on_npu: 772 + layers_on_gpu: 0 layers_on_cpu: 0 - total_layers: 845 - job_id: jegnlq8q5 + total_layers: 772 + job_id: j7gjlnnvp + job_status: Passed + torchscript_onnx_qnn: + inference_time: 133524.0 + throughput: 7.4892903148497645 + estimated_peak_memory_range: + min: 16384 + max: 52330520 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 863 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 863 + job_id: jmg94nnw5 job_status: Passed reference_device_info: name: QCS8550 (Proxy) @@ -131,4 +178,57 @@ models: os_name: Android manufacturer: Qualcomm chipset: Qcs8550 - timestamp: '2024-04-23T18:42:31.509493Z' + timestamp: '2024-05-20T16:35:28.772424Z' + - torchscript_onnx_qnn: + inference_time: 165859.0 + throughput: 6.029217588433549 + estimated_peak_memory_range: + min: 2772992 + max: 2772992 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 863 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 863 + job_id: jz5w9663p + job_status: Passed + torchscript_onnx_ort: + inference_time: 119044.0 + throughput: 8.40025536776318 + estimated_peak_memory_range: + min: 31268864 + max: 31268864 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 737 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 737 + job_id: jz57drrv5 + job_status: Passed + torchscript_onnx_ort_dml_gpu: + inference_time: 'null' + throughput: 'null' + estimated_peak_memory_range: + min: 0 + max: 0 + primary_compute_unit: 'null' + precision: 'null' + layer_info: + layers_on_npu: 0 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 0 + job_id: jqp4wrr8g + job_status: Failed + reference_device_info: + name: Snapdragon X Elite CRD + os: '11' + form_factor: Compute + os_name: Windows + manufacturer: Qualcomm + chipset: Snapdragon® X Elite + timestamp: '2024-05-20T16:35:28.772448Z' diff --git a/qai_hub_models/models/efficientnet_b0/README.md 
b/qai_hub_models/models/efficientnet_b0/README.md index 197315d7..31379b53 100644 --- a/qai_hub_models/models/efficientnet_b0/README.md +++ b/qai_hub_models/models/efficientnet_b0/README.md @@ -14,6 +14,8 @@ accross various devices, can be found [here](https://aihub.qualcomm.com/models/e a hosted Qualcomm® device. + + ## Example & Usage diff --git a/qai_hub_models/models/efficientnet_b0/export.py b/qai_hub_models/models/efficientnet_b0/export.py index 2bff3de6..d745fda1 100644 --- a/qai_hub_models/models/efficientnet_b0/export.py +++ b/qai_hub_models/models/efficientnet_b0/export.py @@ -119,9 +119,16 @@ def export_model( model.eval() source_model = torch.jit.trace(model.to("cpu"), make_torch_inputs(input_spec)) + # Convert outputs from channel last to channel first (preferred I/O format for QNN and TensorFlow Lite) + channel_last_flags = ( + " --force_channel_last_input image_tensor" + if target_runtime != TargetRuntime.ORT + else "" + ) + # 2. Compile the model to an on-device asset model_compile_options = model.get_hub_compile_options( - target_runtime, compile_options + " --force_channel_last_input image_tensor" + target_runtime, compile_options + channel_last_flags, hub_device ) print(f"Optimizing model {model_name} to run on-device") submitted_compile_job = hub.submit_compile_job( @@ -159,8 +166,12 @@ def export_model( ) sample_inputs = model.sample_inputs(input_spec) # Convert inputs from channel first to channel last - hub_inputs = transpose_channel_first_to_last( - "image_tensor", sample_inputs, target_runtime + hub_inputs = ( + sample_inputs + if target_runtime == TargetRuntime.ORT + else transpose_channel_first_to_last( + "image_tensor", sample_inputs, target_runtime + ) ) submitted_inference_job = hub.submit_inference_job( model=compile_job.get_target_model(), @@ -197,7 +208,7 @@ def export_model( def main(): warnings.filterwarnings("ignore") - parser = export_parser(model_cls=Model, supports_ort=False) + parser = export_parser(model_cls=Model) args = parser.parse_args() export_model(**vars(args)) diff --git a/qai_hub_models/models/efficientnet_b0/perf.yaml b/qai_hub_models/models/efficientnet_b0/perf.yaml index 40fef96a..bdfab403 100644 --- a/qai_hub_models/models/efficientnet_b0/perf.yaml +++ b/qai_hub_models/models/efficientnet_b0/perf.yaml @@ -22,6 +22,7 @@ aggregated: - Samsung Galaxy S24 Ultra - Samsung Galaxy S24+ - Samsung Galaxy Tab S8 + - Snapdragon X Elite CRD - Xiaomi 12 - Xiaomi 12 Pro supported_chipsets: @@ -30,15 +31,16 @@ aggregated: - Snapdragon® 8 Gen 2 - Snapdragon® 8 Gen 3 - Snapdragon® 888 + - Snapdragon® X Elite models: - name: EfficientNet-B0 performance_metrics: - torchscript_onnx_tflite: - inference_time: 1637.0 - throughput: 610.8735491753207 + inference_time: 1623.0 + throughput: 616.1429451632779 estimated_peak_memory_range: min: 24576 - max: 18330576 + max: 2090224 primary_compute_unit: NPU precision: fp16 layer_info: @@ -46,14 +48,14 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 245 - job_id: j0pxnd8l5 + job_id: j0px1oo3g job_status: Passed torchscript_onnx_qnn: - inference_time: 1692.0 - throughput: 591.016548463357 + inference_time: 1678.0 + throughput: 595.9475566150179 estimated_peak_memory_range: - min: 16384 - max: 89136624 + min: 12288 + max: 88022416 primary_compute_unit: NPU precision: fp16 layer_info: @@ -61,22 +63,22 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 243 - job_id: jegnl7dq5 + job_id: jopry330g job_status: Passed torchscript_onnx_ort: - inference_time: 1847.0 - throughput: 541.4185165132648 + 
inference_time: 1575.0 + throughput: 634.9206349206349 estimated_peak_memory_range: min: 12288 - max: 80485720 + max: 80602048 primary_compute_unit: NPU precision: fp16 layer_info: - layers_on_npu: 1 + layers_on_npu: 245 layers_on_gpu: 0 layers_on_cpu: 0 - total_layers: 1 - job_id: jep20vqqg + total_layers: 245 + job_id: j1p87yyk5 job_status: Passed reference_device_info: name: Samsung Galaxy S23 @@ -85,13 +87,13 @@ models: os_name: Android manufacturer: Samsung chipset: Snapdragon® 8 Gen 2 - timestamp: '2024-04-23T18:42:31.527091Z' + timestamp: '2024-05-20T16:35:28.802791Z' - torchscript_onnx_tflite: - inference_time: 1177.0 - throughput: 849.6176720475786 + inference_time: 1162.0 + throughput: 860.5851979345955 estimated_peak_memory_range: min: 16384 - max: 70869408 + max: 71535472 primary_compute_unit: NPU precision: fp16 layer_info: @@ -99,14 +101,14 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 245 - job_id: jo5mqd19p + job_id: jo5mzxxdp job_status: Passed torchscript_onnx_qnn: - inference_time: 1180.0 - throughput: 847.457627118644 + inference_time: 1182.0 + throughput: 846.0236886632825 estimated_peak_memory_range: - min: 0 - max: 70362624 + min: 618496 + max: 69430064 primary_compute_unit: NPU precision: fp16 layer_info: @@ -114,22 +116,22 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 243 - job_id: jopr8nm75 + job_id: jep2myyr5 job_status: Passed torchscript_onnx_ort: - inference_time: 1299.0 - throughput: 769.8229407236336 + inference_time: 1137.0 + throughput: 879.5074758135444 estimated_peak_memory_range: - min: 761856 - max: 28745360 + min: 0 + max: 34872096 primary_compute_unit: NPU precision: fp16 layer_info: - layers_on_npu: 1 + layers_on_npu: 245 layers_on_gpu: 0 layers_on_cpu: 0 - total_layers: 1 - job_id: jqpyr7kl5 + total_layers: 245 + job_id: jogkyxxwp job_status: Passed reference_device_info: name: Samsung Galaxy S24 @@ -138,13 +140,13 @@ models: os_name: Android manufacturer: Samsung chipset: Snapdragon® 8 Gen 3 - timestamp: '2024-04-23T18:42:31.527166Z' + timestamp: '2024-05-20T16:35:28.802817Z' - torchscript_onnx_tflite: - inference_time: 1635.0 - throughput: 611.6207951070336 + inference_time: 1626.0 + throughput: 615.0061500615006 estimated_peak_memory_range: - min: 28672 - max: 2553520 + min: 24576 + max: 2679392 primary_compute_unit: NPU precision: fp16 layer_info: @@ -152,14 +154,14 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 245 - job_id: j1gl6qmmg + job_id: jegnevvkg job_status: Passed torchscript_onnx_qnn: - inference_time: 1694.0 - throughput: 590.318772136954 + inference_time: 1668.0 + throughput: 599.5203836930456 estimated_peak_memory_range: - min: 622592 - max: 68146216 + min: 16384 + max: 14848360 primary_compute_unit: NPU precision: fp16 layer_info: @@ -167,7 +169,7 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 243 - job_id: j1pv0nkr5 + job_id: j2p0r009p job_status: Passed reference_device_info: name: QCS8550 (Proxy) @@ -176,4 +178,57 @@ models: os_name: Android manufacturer: Qualcomm chipset: Qcs8550 - timestamp: '2024-04-23T18:42:31.527250Z' + timestamp: '2024-05-20T16:35:28.802834Z' + - torchscript_onnx_qnn: + inference_time: 1835.0 + throughput: 544.9591280653951 + estimated_peak_memory_range: + min: 602112 + max: 602112 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 243 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 243 + job_id: jqpyd338p + job_status: Passed + torchscript_onnx_ort: + inference_time: 1670.0 + throughput: 598.8023952095808 + 
estimated_peak_memory_range: + min: 34729984 + max: 34729984 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 245 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 245 + job_id: jn5q2qqn5 + job_status: Passed + torchscript_onnx_ort_dml_gpu: + inference_time: 10374.0 + throughput: 96.3948332369385 + estimated_peak_memory_range: + min: 36884480 + max: 36884480 + primary_compute_unit: GPU + precision: fp16 + layer_info: + layers_on_npu: 0 + layers_on_gpu: 1 + layers_on_cpu: 0 + total_layers: 1 + job_id: j1glkmmjp + job_status: Passed + reference_device_info: + name: Snapdragon X Elite CRD + os: '11' + form_factor: Compute + os_name: Windows + manufacturer: Qualcomm + chipset: Snapdragon® X Elite + timestamp: '2024-05-20T16:35:28.802858Z' diff --git a/qai_hub_models/models/esrgan/README.md b/qai_hub_models/models/esrgan/README.md index 71784914..4afc7424 100644 --- a/qai_hub_models/models/esrgan/README.md +++ b/qai_hub_models/models/esrgan/README.md @@ -14,6 +14,8 @@ accross various devices, can be found [here](https://aihub.qualcomm.com/models/e a hosted Qualcomm® device. + + ## Example & Usage diff --git a/qai_hub_models/models/esrgan/export.py b/qai_hub_models/models/esrgan/export.py index 6bc2ec3f..2a7d632d 100644 --- a/qai_hub_models/models/esrgan/export.py +++ b/qai_hub_models/models/esrgan/export.py @@ -119,12 +119,16 @@ def export_model( model.eval() source_model = torch.jit.trace(model.to("cpu"), make_torch_inputs(input_spec)) + # Convert outputs from channel last to channel first (preferred I/O format for QNN and TensorFlow Lite) + channel_last_flags = ( + " --force_channel_last_input image" + " --force_channel_last_output output_0" + if target_runtime != TargetRuntime.ORT + else "" + ) + # 2. Compile the model to an on-device asset model_compile_options = model.get_hub_compile_options( - target_runtime, - compile_options - + " --force_channel_last_input image" - + " --force_channel_last_output output_0", + target_runtime, compile_options + channel_last_flags, hub_device ) print(f"Optimizing model {model_name} to run on-device") submitted_compile_job = hub.submit_compile_job( @@ -162,8 +166,10 @@ def export_model( ) sample_inputs = model.sample_inputs(input_spec) # Convert inputs from channel first to channel last - hub_inputs = transpose_channel_first_to_last( - "image", sample_inputs, target_runtime + hub_inputs = ( + sample_inputs + if target_runtime == TargetRuntime.ORT + else transpose_channel_first_to_last("image", sample_inputs, target_runtime) ) submitted_inference_job = hub.submit_inference_job( model=compile_job.get_target_model(), @@ -191,8 +197,12 @@ def export_model( assert inference_job is not None and inference_job.wait().success inference_result: hub.client.DatasetEntries = inference_job.download_output_data() # type: ignore # Convert outputs from channel last to channel first - inference_result = transpose_channel_last_to_first( - "output_0", inference_result, target_runtime + inference_result = ( + inference_result + if target_runtime == TargetRuntime.ORT + else transpose_channel_last_to_first( + "output_0", inference_result, target_runtime + ) ) print_inference_metrics(inference_job, inference_result, torch_out) @@ -201,7 +211,7 @@ def export_model( def main(): warnings.filterwarnings("ignore") - parser = export_parser(model_cls=Model, supports_ort=False) + parser = export_parser(model_cls=Model) args = parser.parse_args() export_model(**vars(args)) diff --git a/qai_hub_models/models/esrgan/perf.yaml 
b/qai_hub_models/models/esrgan/perf.yaml index 8cbaa145..ae40c9f2 100644 --- a/qai_hub_models/models/esrgan/perf.yaml +++ b/qai_hub_models/models/esrgan/perf.yaml @@ -22,6 +22,7 @@ aggregated: - Samsung Galaxy S24 Ultra - Samsung Galaxy S24+ - Samsung Galaxy Tab S8 + - Snapdragon X Elite CRD - Xiaomi 12 - Xiaomi 12 Pro supported_chipsets: @@ -30,15 +31,16 @@ aggregated: - Snapdragon® 8 Gen 2 - Snapdragon® 8 Gen 3 - Snapdragon® 888 + - Snapdragon® X Elite models: - name: ESRGAN performance_metrics: - torchscript_onnx_tflite: - inference_time: 65051.0 - throughput: 15.372553842369832 + inference_time: 68602.0 + throughput: 14.576834494621147 estimated_peak_memory_range: - min: 3252224 - max: 6824744 + min: 4915200 + max: 8401176 primary_compute_unit: NPU precision: fp16 layer_info: @@ -46,14 +48,14 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 1024 - job_id: j1p804dog + job_id: jw561446p job_status: Passed torchscript_onnx_qnn: - inference_time: 65381.0 - throughput: 15.294963368562732 + inference_time: 67537.0 + throughput: 14.806698550424212 estimated_peak_memory_range: - min: 102400 - max: 104823816 + min: 122880 + max: 105180416 primary_compute_unit: NPU precision: fp16 layer_info: @@ -61,22 +63,22 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 1026 - job_id: jn5qemxo5 + job_id: j1pvwk6kg job_status: Passed torchscript_onnx_ort: - inference_time: 70770.0 - throughput: 14.130281192595733 + inference_time: 70574.0 + throughput: 14.169524187377787 estimated_peak_memory_range: - min: 3174400 - max: 141778696 + min: 6324224 + max: 153237392 primary_compute_unit: NPU precision: fp16 layer_info: - layers_on_npu: 1 + layers_on_npu: 1028 layers_on_gpu: 0 layers_on_cpu: 0 - total_layers: 1 - job_id: jw56edxyg + total_layers: 1028 + job_id: jz5w96e3p job_status: Passed reference_device_info: name: Samsung Galaxy S23 @@ -85,13 +87,13 @@ models: os_name: Android manufacturer: Samsung chipset: Snapdragon® 8 Gen 2 - timestamp: '2024-04-23T18:42:31.551433Z' + timestamp: '2024-05-20T16:35:28.833041Z' - torchscript_onnx_tflite: - inference_time: 51233.0 - throughput: 19.518669607479556 + inference_time: 51332.0 + throughput: 19.48102548118133 estimated_peak_memory_range: - min: 94208 - max: 579142256 + min: 3239936 + max: 585991072 primary_compute_unit: NPU precision: fp16 layer_info: @@ -99,14 +101,14 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 1024 - job_id: jogk79wnp + job_id: j1p3m003g job_status: Passed torchscript_onnx_qnn: - inference_time: 50830.0 - throughput: 19.673421207948063 + inference_time: 50345.0 + throughput: 19.86294567484358 estimated_peak_memory_range: - min: 102400 - max: 255173680 + min: 12288 + max: 260077888 primary_compute_unit: NPU precision: fp16 layer_info: @@ -114,22 +116,22 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 1026 - job_id: j1gl61dmg + job_id: j7gjlnvvp job_status: Passed torchscript_onnx_ort: - inference_time: 51607.0 - throughput: 19.37721626911078 + inference_time: 51390.0 + throughput: 19.45903872348706 estimated_peak_memory_range: - min: 6688768 - max: 197563712 + min: 6324224 + max: 192683632 primary_compute_unit: NPU precision: fp16 layer_info: - layers_on_npu: 1 + layers_on_npu: 1028 layers_on_gpu: 0 layers_on_cpu: 0 - total_layers: 1 - job_id: j1p3vwdng + total_layers: 1028 + job_id: jmg94nlw5 job_status: Passed reference_device_info: name: Samsung Galaxy S24 @@ -138,13 +140,13 @@ models: os_name: Android manufacturer: Samsung chipset: Snapdragon® 8 Gen 3 - timestamp: '2024-04-23T18:42:31.551673Z' + 
timestamp: '2024-05-20T16:35:28.833066Z' - torchscript_onnx_tflite: - inference_time: 71702.0 - throughput: 13.946612367855847 + inference_time: 71946.0 + throughput: 13.899313373919329 estimated_peak_memory_range: - min: 3293184 - max: 6629192 + min: 0 + max: 3606600 primary_compute_unit: NPU precision: fp16 layer_info: @@ -152,14 +154,14 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 1024 - job_id: jmg9jqn85 + job_id: jwgov66q5 job_status: Passed torchscript_onnx_qnn: - inference_time: 68263.0 - throughput: 14.649224323572067 + inference_time: 70208.0 + throughput: 14.243391066545122 estimated_peak_memory_range: - min: 118784 - max: 62391352 + min: 196608 + max: 104068704 primary_compute_unit: NPU precision: fp16 layer_info: @@ -167,7 +169,7 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 1026 - job_id: jqp4k2r1g + job_id: jygz7d3op job_status: Passed reference_device_info: name: QCS8550 (Proxy) @@ -176,4 +178,57 @@ models: os_name: Android manufacturer: Qualcomm chipset: Qcs8550 - timestamp: '2024-04-23T18:42:31.551903Z' + timestamp: '2024-05-20T16:35:28.833084Z' + - torchscript_onnx_qnn: + inference_time: 73168.0 + throughput: 13.667176907937897 + estimated_peak_memory_range: + min: 204800 + max: 204800 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 1026 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 1026 + job_id: jlpevmdo5 + job_status: Passed + torchscript_onnx_ort: + inference_time: 65764.0 + throughput: 15.205887719725078 + estimated_peak_memory_range: + min: 1138688 + max: 1138688 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 1028 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 1028 + job_id: jnp18z48g + job_status: Passed + torchscript_onnx_ort_dml_gpu: + inference_time: 641039.0 + throughput: 1.5599674902775027 + estimated_peak_memory_range: + min: 554172416 + max: 554172416 + primary_compute_unit: GPU + precision: fp16 + layer_info: + layers_on_npu: 0 + layers_on_gpu: 1 + layers_on_cpu: 0 + total_layers: 1 + job_id: jvgdv1xrg + job_status: Passed + reference_device_info: + name: Snapdragon X Elite CRD + os: '11' + form_factor: Compute + os_name: Windows + manufacturer: Qualcomm + chipset: Snapdragon® X Elite + timestamp: '2024-05-20T16:35:28.833106Z' diff --git a/qai_hub_models/models/facebook_denoiser/README.md b/qai_hub_models/models/facebook_denoiser/README.md index 72d5dba1..eaa05e87 100644 --- a/qai_hub_models/models/facebook_denoiser/README.md +++ b/qai_hub_models/models/facebook_denoiser/README.md @@ -14,6 +14,8 @@ accross various devices, can be found [here](https://aihub.qualcomm.com/models/f a hosted Qualcomm® device. 
+ + ## Example & Usage Install the package via pip: diff --git a/qai_hub_models/models/facebook_denoiser/demo.py b/qai_hub_models/models/facebook_denoiser/demo.py index cb8eb9d1..4239c719 100644 --- a/qai_hub_models/models/facebook_denoiser/demo.py +++ b/qai_hub_models/models/facebook_denoiser/demo.py @@ -3,7 +3,6 @@ # SPDX-License-Identifier: BSD-3-Clause # --------------------------------------------------------------------- import os -import tempfile from pathlib import Path from typing import List @@ -23,7 +22,11 @@ get_on_device_demo_parser, validate_on_device_demo_args, ) -from qai_hub_models.utils.asset_loaders import CachedWebModelAsset, load_path +from qai_hub_models.utils.asset_loaders import ( + CachedWebModelAsset, + load_path, + qaihm_temp_dir, +) EXAMPLE_RECORDING = CachedWebModelAsset.from_asset_store( MODEL_ID, ASSET_VERSION, "icsi_meeting_recording.wav" @@ -57,7 +60,7 @@ def main(is_test: bool = False): # Download data audio_files: List[str] = args.audio audio_tensors = [] - with tempfile.TemporaryDirectory() as tmpdir: + with qaihm_temp_dir() as tmpdir: for idx, file in enumerate(audio_files): audio_file = load_path(file, tmpdir) audio, sample_rate = torchaudio.load(audio_file) diff --git a/qai_hub_models/models/facebook_denoiser/export.py b/qai_hub_models/models/facebook_denoiser/export.py index da820ee0..8b898270 100644 --- a/qai_hub_models/models/facebook_denoiser/export.py +++ b/qai_hub_models/models/facebook_denoiser/export.py @@ -120,7 +120,7 @@ def export_model( # 2. Compile the model to an on-device asset model_compile_options = model.get_hub_compile_options( - target_runtime, compile_options + target_runtime, compile_options, hub_device ) print(f"Optimizing model {model_name} to run on-device") submitted_compile_job = hub.submit_compile_job( @@ -192,7 +192,7 @@ def export_model( def main(): warnings.filterwarnings("ignore") - parser = export_parser(model_cls=Model, supports_qnn=False, supports_ort=False) + parser = export_parser(model_cls=Model, supports_qnn=False) args = parser.parse_args() export_model(**vars(args)) diff --git a/qai_hub_models/models/facebook_denoiser/perf.yaml b/qai_hub_models/models/facebook_denoiser/perf.yaml index 98736731..7def156d 100644 --- a/qai_hub_models/models/facebook_denoiser/perf.yaml +++ b/qai_hub_models/models/facebook_denoiser/perf.yaml @@ -22,6 +22,7 @@ aggregated: - Samsung Galaxy S24 Ultra - Samsung Galaxy S24+ - Samsung Galaxy Tab S8 + - Snapdragon X Elite CRD - Xiaomi 12 - Xiaomi 12 Pro supported_chipsets: @@ -30,15 +31,16 @@ aggregated: - Snapdragon® 8 Gen 2 - Snapdragon® 8 Gen 3 - Snapdragon® 888 + - Snapdragon® X Elite models: - name: Facebook-Denoiser performance_metrics: - torchscript_onnx_tflite: - inference_time: 683713.0 - throughput: 1.4626019982068499 + inference_time: 727870.0 + throughput: 1.37387170785992 estimated_peak_memory_range: - min: 380928 - max: 375423608 + min: 45551616 + max: 416715824 primary_compute_unit: CPU precision: fp32 layer_info: @@ -46,22 +48,22 @@ models: layers_on_gpu: 0 layers_on_cpu: 209 total_layers: 209 - job_id: j1pv098r5 + job_id: jz57dryv5 job_status: Passed torchscript_onnx_ort: - inference_time: 14433398.0 - throughput: 0.0692837542483066 + inference_time: 14547237.0 + throughput: 0.06874157614947773 estimated_peak_memory_range: - min: 1519616 - max: 86092704 + min: 143360 + max: 92274744 primary_compute_unit: NPU precision: fp16 layer_info: - layers_on_npu: 4 + layers_on_npu: 175 layers_on_gpu: 0 layers_on_cpu: 3 - total_layers: 7 - job_id: jlpeelqvp + total_layers: 178 + 
job_id: jo5mzxndp job_status: Passed reference_device_info: name: Samsung Galaxy S23 @@ -70,13 +72,13 @@ models: os_name: Android manufacturer: Samsung chipset: Snapdragon® 8 Gen 2 - timestamp: '2024-04-23T18:42:31.576051Z' + timestamp: '2024-05-20T16:35:28.863583Z' - torchscript_onnx_tflite: - inference_time: 677141.0 - throughput: 1.476797299233099 + inference_time: 779484.0 + throughput: 1.2828999697235608 estimated_peak_memory_range: - min: 363802624 - max: 387318224 + min: 430981120 + max: 452244496 primary_compute_unit: CPU precision: fp32 layer_info: @@ -84,22 +86,22 @@ models: layers_on_gpu: 0 layers_on_cpu: 209 total_layers: 209 - job_id: j7gjzw9e5 + job_id: jqp4wrl8g job_status: Passed torchscript_onnx_ort: - inference_time: 10716749.0 - throughput: 0.09331188030997087 + inference_time: 10691874.0 + throughput: 0.09352897349893947 estimated_peak_memory_range: - min: 19521536 - max: 273877616 + min: 17801216 + max: 224185136 primary_compute_unit: NPU precision: fp16 layer_info: - layers_on_npu: 4 + layers_on_npu: 175 layers_on_gpu: 0 layers_on_cpu: 3 - total_layers: 7 - job_id: jygzo46x5 + total_layers: 178 + job_id: jegnev6kg job_status: Passed reference_device_info: name: Samsung Galaxy S24 @@ -108,13 +110,13 @@ models: os_name: Android manufacturer: Samsung chipset: Snapdragon® 8 Gen 3 - timestamp: '2024-04-23T18:42:31.576099Z' + timestamp: '2024-05-20T16:35:28.863604Z' - torchscript_onnx_tflite: - inference_time: 704020.0 - throughput: 1.4204141927786142 + inference_time: 727753.0 + throughput: 1.3740925836100986 estimated_peak_memory_range: - min: 321875968 - max: 538203832 + min: 235909120 + max: 447833184 primary_compute_unit: CPU precision: fp32 layer_info: @@ -122,7 +124,7 @@ models: layers_on_gpu: 0 layers_on_cpu: 209 total_layers: 209 - job_id: j1p80kyog + job_id: j0px1ok3g job_status: Passed reference_device_info: name: QCS8550 (Proxy) @@ -131,4 +133,42 @@ models: os_name: Android manufacturer: Qualcomm chipset: Qcs8550 - timestamp: '2024-04-23T18:42:31.576158Z' + timestamp: '2024-05-20T16:35:28.863616Z' + - torchscript_onnx_ort: + inference_time: 15602048.0 + throughput: 0.06409414969111747 + estimated_peak_memory_range: + min: 450560 + max: 450560 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 175 + layers_on_gpu: 0 + layers_on_cpu: 3 + total_layers: 178 + job_id: jopry3v0g + job_status: Passed + torchscript_onnx_ort_dml_gpu: + inference_time: 138131.0 + throughput: 7.239504528310082 + estimated_peak_memory_range: + min: 139943936 + max: 139943936 + primary_compute_unit: CPU + precision: fp32 + layer_info: + layers_on_npu: 0 + layers_on_gpu: 0 + layers_on_cpu: 107 + total_layers: 107 + job_id: jep2mykr5 + job_status: Passed + reference_device_info: + name: Snapdragon X Elite CRD + os: '11' + form_factor: Compute + os_name: Windows + manufacturer: Qualcomm + chipset: Snapdragon® X Elite + timestamp: '2024-05-20T16:35:28.863636Z' diff --git a/qai_hub_models/models/fastsam_s/README.md b/qai_hub_models/models/fastsam_s/README.md index 6d29272a..717a36e8 100644 --- a/qai_hub_models/models/fastsam_s/README.md +++ b/qai_hub_models/models/fastsam_s/README.md @@ -14,6 +14,8 @@ accross various devices, can be found [here](https://aihub.qualcomm.com/models/f a hosted Qualcomm® device. 
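The export.py hunks throughout this diff (detr_resnet*, efficientnet_b0, esrgan, and the fastsam models below) all repeat the same change: the `--force_channel_last_input` / `--force_channel_last_output` compile flags and the input/output transposes are now applied only when the target runtime is not ORT, since ONNX Runtime keeps channel-first I/O while QNN and TensorFlow Lite prefer channel-last. A minimal, self-contained sketch of that gating pattern is below; the `TargetRuntime` enum here is a stand-in mirroring the one referenced in the diff, and `build_compile_options` plus the NumPy transpose helpers are hypothetical illustrations, not the library's own utilities.

```python
# Sketch of the runtime-gated channel-last handling used in the export.py
# changes in this diff. TargetRuntime mirrors the enum named in the diff;
# build_compile_options and the transpose helpers are illustrative only.
from enum import Enum

import numpy as np


class TargetRuntime(Enum):
    TFLITE = "tflite"
    QNN = "qnn"
    ORT = "ort"


def build_compile_options(base_options: str, target_runtime: TargetRuntime,
                          input_name: str = "image") -> str:
    # QNN and TFLite prefer channel-last (NHWC) I/O, so the flag is appended
    # only for non-ORT targets; ONNX Runtime keeps channel-first (NCHW).
    channel_last_flags = (
        f" --force_channel_last_input {input_name}"
        if target_runtime != TargetRuntime.ORT
        else ""
    )
    return base_options + channel_last_flags


def to_runtime_layout(x: np.ndarray, target_runtime: TargetRuntime) -> np.ndarray:
    # Inputs start channel-first (N, C, H, W); transpose to (N, H, W, C)
    # only when the compiled model expects channel-last input.
    if target_runtime == TargetRuntime.ORT:
        return x
    return np.transpose(x, (0, 2, 3, 1))


if __name__ == "__main__":
    image = np.zeros((1, 3, 224, 224), dtype=np.float32)
    print(build_compile_options("", TargetRuntime.TFLITE))
    print(build_compile_options("", TargetRuntime.ORT))
    assert to_runtime_layout(image, TargetRuntime.QNN).shape == (1, 224, 224, 3)
    assert to_runtime_layout(image, TargetRuntime.ORT).shape == (1, 3, 224, 224)
```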
+ + ## Example & Usage Install the package via pip: diff --git a/qai_hub_models/models/fastsam_s/export.py b/qai_hub_models/models/fastsam_s/export.py index 1f5c1637..e808d3d9 100644 --- a/qai_hub_models/models/fastsam_s/export.py +++ b/qai_hub_models/models/fastsam_s/export.py @@ -122,12 +122,17 @@ def export_model( model.to("cpu"), make_torch_inputs(input_spec), check_trace=False ) + # Convert outputs from channel last to channel first (preferred I/O format for QNN and TensorFlow Lite) + channel_last_flags = ( + " --force_channel_last_input image" + + " --force_channel_last_output output_1,output_2,output_3,output_5" + if target_runtime != TargetRuntime.ORT + else "" + ) + # 2. Compile the model to an on-device asset model_compile_options = model.get_hub_compile_options( - target_runtime, - compile_options - + " --force_channel_last_input image" - + " --force_channel_last_output output_1,output_2,output_3,output_5", + target_runtime, compile_options + channel_last_flags, hub_device ) print(f"Optimizing model {model_name} to run on-device") submitted_compile_job = hub.submit_compile_job( @@ -165,8 +170,10 @@ def export_model( ) sample_inputs = model.sample_inputs(input_spec) # Convert inputs from channel first to channel last - hub_inputs = transpose_channel_first_to_last( - "image", sample_inputs, target_runtime + hub_inputs = ( + sample_inputs + if target_runtime == TargetRuntime.ORT + else transpose_channel_first_to_last("image", sample_inputs, target_runtime) ) submitted_inference_job = hub.submit_inference_job( model=compile_job.get_target_model(), @@ -194,8 +201,12 @@ def export_model( assert inference_job is not None and inference_job.wait().success inference_result: hub.client.DatasetEntries = inference_job.download_output_data() # type: ignore # Convert outputs from channel last to channel first - inference_result = transpose_channel_last_to_first( - "output_1,output_2,output_3,output_5", inference_result, target_runtime + inference_result = ( + inference_result + if target_runtime == TargetRuntime.ORT + else transpose_channel_last_to_first( + "output_1,output_2,output_3,output_5", inference_result, target_runtime + ) ) print_inference_metrics(inference_job, inference_result, torch_out) @@ -207,7 +218,7 @@ def export_model( def main(): warnings.filterwarnings("ignore") - parser = export_parser(model_cls=Model, supports_qnn=False, supports_ort=False) + parser = export_parser(model_cls=Model, supports_qnn=False) args = parser.parse_args() export_model(**vars(args)) diff --git a/qai_hub_models/models/fastsam_s/perf.yaml b/qai_hub_models/models/fastsam_s/perf.yaml index 345dbbcb..91456b39 100644 --- a/qai_hub_models/models/fastsam_s/perf.yaml +++ b/qai_hub_models/models/fastsam_s/perf.yaml @@ -22,6 +22,7 @@ aggregated: - Samsung Galaxy S24 Ultra - Samsung Galaxy S24+ - Samsung Galaxy Tab S8 + - Snapdragon X Elite CRD - Xiaomi 12 - Xiaomi 12 Pro supported_chipsets: @@ -30,15 +31,16 @@ aggregated: - Snapdragon® 8 Gen 2 - Snapdragon® 8 Gen 3 - Snapdragon® 888 + - Snapdragon® X Elite models: - name: FastSam-S performance_metrics: - torchscript_onnx_tflite: - inference_time: 8729.0 - throughput: 114.56065986940085 + inference_time: 8636.0 + throughput: 115.7943492357573 estimated_peak_memory_range: - min: 7823360 - max: 10576056 + min: 8404992 + max: 26145480 primary_compute_unit: NPU precision: fp16 layer_info: @@ -46,22 +48,37 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 288 - job_id: jmg9jxr85 + job_id: jqpyd318p + job_status: Passed + torchscript_onnx_qnn: + 
inference_time: 8361.0 + throughput: 119.60291831120679 + estimated_peak_memory_range: + min: 4947968 + max: 19891312 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 286 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 286 + job_id: jogkyxewp job_status: Passed torchscript_onnx_ort: - inference_time: 10386.0 - throughput: 96.28345850182939 + inference_time: 10837.0 + throughput: 92.27646027498385 estimated_peak_memory_range: - min: 20791296 - max: 84541352 + min: 21467136 + max: 77311024 primary_compute_unit: NPU precision: fp16 layer_info: - layers_on_npu: 1 + layers_on_npu: 289 layers_on_gpu: 0 layers_on_cpu: 0 - total_layers: 1 - job_id: jvgdezkz5 + total_layers: 289 + job_id: j1p3m0j3g job_status: Passed reference_device_info: name: Samsung Galaxy S23 @@ -70,13 +87,13 @@ models: os_name: Android manufacturer: Samsung chipset: Snapdragon® 8 Gen 2 - timestamp: '2024-04-23T18:42:31.594002Z' + timestamp: '2024-05-20T16:35:28.890936Z' - torchscript_onnx_tflite: - inference_time: 6438.0 - throughput: 155.32774153463808 + inference_time: 6531.0 + throughput: 153.11590874291838 estimated_peak_memory_range: - min: 6541312 - max: 77737344 + min: 5767168 + max: 76610048 primary_compute_unit: NPU precision: fp16 layer_info: @@ -84,22 +101,37 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 288 - job_id: jnp1yv97p + job_id: j2p0r0z9p + job_status: Passed + torchscript_onnx_qnn: + inference_time: 6171.0 + throughput: 162.04829039053638 + estimated_peak_memory_range: + min: 4952064 + max: 91897808 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 286 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 286 + job_id: jn5q2q6n5 job_status: Passed torchscript_onnx_ort: - inference_time: 7468.0 - throughput: 133.9046598821639 + inference_time: 7948.0 + throughput: 125.81781580271766 estimated_peak_memory_range: - min: 24322048 - max: 63913008 + min: 28004352 + max: 71806784 primary_compute_unit: NPU precision: fp16 layer_info: - layers_on_npu: 1 + layers_on_npu: 289 layers_on_gpu: 0 layers_on_cpu: 0 - total_layers: 1 - job_id: jz5707m9g + total_layers: 289 + job_id: jwgov62q5 job_status: Passed reference_device_info: name: Samsung Galaxy S24 @@ -108,13 +140,13 @@ models: os_name: Android manufacturer: Samsung chipset: Snapdragon® 8 Gen 3 - timestamp: '2024-04-23T18:42:31.594052Z' + timestamp: '2024-05-20T16:35:28.890962Z' - torchscript_onnx_tflite: - inference_time: 8739.0 - throughput: 114.42956860052638 + inference_time: 8645.0 + throughput: 115.6737998843262 estimated_peak_memory_range: - min: 7802880 - max: 25345168 + min: 7819264 + max: 25353920 primary_compute_unit: NPU precision: fp16 layer_info: @@ -122,7 +154,22 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 288 - job_id: jw56e0yyg + job_id: j1p87yqk5 + job_status: Passed + torchscript_onnx_qnn: + inference_time: 8210.0 + throughput: 121.8026796589525 + estimated_peak_memory_range: + min: 4984832 + max: 19259848 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 286 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 286 + job_id: jw5614y6p job_status: Passed reference_device_info: name: QCS8550 (Proxy) @@ -131,4 +178,57 @@ models: os_name: Android manufacturer: Qualcomm chipset: Qcs8550 - timestamp: '2024-04-23T18:42:31.594089Z' + timestamp: '2024-05-20T16:35:28.890979Z' + - torchscript_onnx_qnn: + inference_time: 9182.0 + throughput: 108.90873448050533 + estimated_peak_memory_range: + min: 4935680 + max: 4935680 + 
primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 286 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 286 + job_id: j1glkmvjp + job_status: Passed + torchscript_onnx_ort: + inference_time: 10779.0 + throughput: 92.77298450691158 + estimated_peak_memory_range: + min: 67710976 + max: 67710976 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 289 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 289 + job_id: j1pvwkqkg + job_status: Passed + torchscript_onnx_ort_dml_gpu: + inference_time: 62697.0 + throughput: 15.949726462191174 + estimated_peak_memory_range: + min: 70156288 + max: 70156288 + primary_compute_unit: CPU + precision: fp32 + layer_info: + layers_on_npu: 0 + layers_on_gpu: 0 + layers_on_cpu: 201 + total_layers: 201 + job_id: j7gjlndvp + job_status: Passed + reference_device_info: + name: Snapdragon X Elite CRD + os: '11' + form_factor: Compute + os_name: Windows + manufacturer: Qualcomm + chipset: Snapdragon® X Elite + timestamp: '2024-05-20T16:35:28.891004Z' diff --git a/qai_hub_models/models/fastsam_x/README.md b/qai_hub_models/models/fastsam_x/README.md index 8c7c2091..b3c84891 100644 --- a/qai_hub_models/models/fastsam_x/README.md +++ b/qai_hub_models/models/fastsam_x/README.md @@ -14,6 +14,8 @@ accross various devices, can be found [here](https://aihub.qualcomm.com/models/f a hosted Qualcomm® device. + + ## Example & Usage Install the package via pip: diff --git a/qai_hub_models/models/fastsam_x/export.py b/qai_hub_models/models/fastsam_x/export.py index 494a3229..b87735a2 100644 --- a/qai_hub_models/models/fastsam_x/export.py +++ b/qai_hub_models/models/fastsam_x/export.py @@ -122,12 +122,17 @@ def export_model( model.to("cpu"), make_torch_inputs(input_spec), check_trace=False ) + # Convert outputs from channel last to channel first (preferred I/O format for QNN and TensorFlow Lite) + channel_last_flags = ( + " --force_channel_last_input image" + + " --force_channel_last_output output_1,output_2,output_3,output_5" + if target_runtime != TargetRuntime.ORT + else "" + ) + # 2. 
Compile the model to an on-device asset model_compile_options = model.get_hub_compile_options( - target_runtime, - compile_options - + " --force_channel_last_input image" - + " --force_channel_last_output output_1,output_2,output_3,output_5", + target_runtime, compile_options + channel_last_flags, hub_device ) print(f"Optimizing model {model_name} to run on-device") submitted_compile_job = hub.submit_compile_job( @@ -165,8 +170,10 @@ def export_model( ) sample_inputs = model.sample_inputs(input_spec) # Convert inputs from channel first to channel last - hub_inputs = transpose_channel_first_to_last( - "image", sample_inputs, target_runtime + hub_inputs = ( + sample_inputs + if target_runtime == TargetRuntime.ORT + else transpose_channel_first_to_last("image", sample_inputs, target_runtime) ) submitted_inference_job = hub.submit_inference_job( model=compile_job.get_target_model(), @@ -194,8 +201,12 @@ def export_model( assert inference_job is not None and inference_job.wait().success inference_result: hub.client.DatasetEntries = inference_job.download_output_data() # type: ignore # Convert outputs from channel last to channel first - inference_result = transpose_channel_last_to_first( - "output_1,output_2,output_3,output_5", inference_result, target_runtime + inference_result = ( + inference_result + if target_runtime == TargetRuntime.ORT + else transpose_channel_last_to_first( + "output_1,output_2,output_3,output_5", inference_result, target_runtime + ) ) print_inference_metrics(inference_job, inference_result, torch_out) @@ -207,7 +218,7 @@ def export_model( def main(): warnings.filterwarnings("ignore") - parser = export_parser(model_cls=Model, supports_qnn=False, supports_ort=False) + parser = export_parser(model_cls=Model, supports_qnn=False) args = parser.parse_args() export_model(**vars(args)) diff --git a/qai_hub_models/models/fastsam_x/perf.yaml b/qai_hub_models/models/fastsam_x/perf.yaml index 4b8e861d..629b0462 100644 --- a/qai_hub_models/models/fastsam_x/perf.yaml +++ b/qai_hub_models/models/fastsam_x/perf.yaml @@ -22,6 +22,7 @@ aggregated: - Samsung Galaxy S24 Ultra - Samsung Galaxy S24+ - Samsung Galaxy Tab S8 + - Snapdragon X Elite CRD - Xiaomi 12 - Xiaomi 12 Pro supported_chipsets: @@ -30,15 +31,16 @@ aggregated: - Snapdragon® 8 Gen 2 - Snapdragon® 8 Gen 3 - Snapdragon® 888 + - Snapdragon® X Elite models: - name: FastSam-X performance_metrics: - torchscript_onnx_tflite: - inference_time: 50012.0 - throughput: 19.995201151723585 + inference_time: 49665.0 + throughput: 20.13490385583409 estimated_peak_memory_range: - min: 9154560 - max: 13813200 + min: 9117696 + max: 14327728 primary_compute_unit: NPU precision: fp16 layer_info: @@ -46,22 +48,37 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 420 - job_id: j0pxndql5 + job_id: jlpevmoo5 + job_status: Passed + torchscript_onnx_qnn: + inference_time: 46166.0 + throughput: 21.66096261317853 + estimated_peak_memory_range: + min: 4935680 + max: 20646312 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 418 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 418 + job_id: jmg94n0w5 job_status: Passed torchscript_onnx_ort: - inference_time: 50171.0 - throughput: 19.93183313069303 + inference_time: 50328.0 + throughput: 19.86965506278811 estimated_peak_memory_range: - min: 24637440 - max: 351124872 + min: 25731072 + max: 346581656 primary_compute_unit: NPU precision: fp16 layer_info: - layers_on_npu: 1 + layers_on_npu: 421 layers_on_gpu: 0 layers_on_cpu: 0 - total_layers: 1 - job_id: jegnl74q5 + 
total_layers: 421 + job_id: jmg94n085 job_status: Passed reference_device_info: name: Samsung Galaxy S23 @@ -70,13 +87,13 @@ models: os_name: Android manufacturer: Samsung chipset: Snapdragon® 8 Gen 2 - timestamp: '2024-04-23T18:42:31.611861Z' + timestamp: '2024-05-20T16:35:28.921155Z' - torchscript_onnx_tflite: - inference_time: 36802.0 - throughput: 27.172436280636923 + inference_time: 36007.0 + throughput: 27.772377593245757 estimated_peak_memory_range: - min: 8462336 - max: 149995872 + min: 73728 + max: 135466464 primary_compute_unit: NPU precision: fp16 layer_info: @@ -84,22 +101,37 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 420 - job_id: jo5mqd79p + job_id: jygz7d2op + job_status: Passed + torchscript_onnx_qnn: + inference_time: 34949.0 + throughput: 28.61312197773899 + estimated_peak_memory_range: + min: 4096000 + max: 127015584 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 418 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 418 + job_id: jnp18z28g job_status: Passed torchscript_onnx_ort: - inference_time: 36880.0 - throughput: 27.114967462039047 + inference_time: 36890.0 + throughput: 27.107617240444565 estimated_peak_memory_range: - min: 26107904 - max: 93739104 + min: 29392896 + max: 93988544 primary_compute_unit: NPU precision: fp16 layer_info: - layers_on_npu: 1 + layers_on_npu: 421 layers_on_gpu: 0 layers_on_cpu: 0 - total_layers: 1 - job_id: jopr8nr75 + total_layers: 421 + job_id: jnp18z27g job_status: Passed reference_device_info: name: Samsung Galaxy S24 @@ -108,13 +140,13 @@ models: os_name: Android manufacturer: Samsung chipset: Snapdragon® 8 Gen 3 - timestamp: '2024-04-23T18:42:31.611951Z' + timestamp: '2024-05-20T16:35:28.921181Z' - torchscript_onnx_tflite: - inference_time: 52081.0 - throughput: 19.200860198536894 + inference_time: 50541.0 + throughput: 19.785916384717357 estimated_peak_memory_range: - min: 9240576 - max: 13789008 + min: 9220096 + max: 14009928 primary_compute_unit: NPU precision: fp16 layer_info: @@ -122,7 +154,22 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 420 - job_id: jmg9jql85 + job_id: jz5w96w3p + job_status: Passed + torchscript_onnx_qnn: + inference_time: 45832.0 + throughput: 21.81881654739047 + estimated_peak_memory_range: + min: 4988928 + max: 21102120 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 418 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 418 + job_id: jz5w96wmp job_status: Passed reference_device_info: name: QCS8550 (Proxy) @@ -131,4 +178,57 @@ models: os_name: Android manufacturer: Qualcomm chipset: Qcs8550 - timestamp: '2024-04-23T18:42:31.612007Z' + timestamp: '2024-05-20T16:35:28.921198Z' + - torchscript_onnx_qnn: + inference_time: 57556.0 + throughput: 17.374383209396065 + estimated_peak_memory_range: + min: 4939776 + max: 4939776 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 418 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 418 + job_id: jvgdv1nrg + job_status: Passed + torchscript_onnx_ort: + inference_time: 49642.0 + throughput: 20.144232706176222 + estimated_peak_memory_range: + min: 36737024 + max: 36737024 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 421 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 421 + job_id: jvgdv1nzg + job_status: Passed + torchscript_onnx_ort_dml_gpu: + inference_time: 2190810.0 + throughput: 0.45645217978738456 + estimated_peak_memory_range: + min: 582156288 + max: 582156288 + primary_compute_unit: GPU + precision: 
fp16 + layer_info: + layers_on_npu: 0 + layers_on_gpu: 1 + layers_on_cpu: 0 + total_layers: 1 + job_id: jz57dr295 + job_status: Passed + reference_device_info: + name: Snapdragon X Elite CRD + os: '11' + form_factor: Compute + os_name: Windows + manufacturer: Qualcomm + chipset: Snapdragon® X Elite + timestamp: '2024-05-20T16:35:28.921219Z' diff --git a/qai_hub_models/models/fcn_resnet50/README.md b/qai_hub_models/models/fcn_resnet50/README.md index 674f6c47..c2af6df1 100644 --- a/qai_hub_models/models/fcn_resnet50/README.md +++ b/qai_hub_models/models/fcn_resnet50/README.md @@ -1,11 +1,11 @@ [![Qualcomm® AI Hub Models](https://qaihub-public-assets.s3.us-west-2.amazonaws.com/qai-hub-models/quic-logo.jpg)](../../README.md) -# [FCN_ResNet50: Fully-convolutional network model for image segmentation](https://aihub.qualcomm.com/models/fcn_resnet50) +# [FCN-ResNet50: Fully-convolutional network model for image segmentation](https://aihub.qualcomm.com/models/fcn_resnet50) FCN_ResNet50 is a machine learning model that can segment images from the COCO dataset. It uses ResNet50 as a backbone. -This is based on the implementation of FCN_ResNet50 found +This is based on the implementation of FCN-ResNet50 found [here](https://github.com/pytorch/vision/blob/main/torchvision/models/segmentation/fcn.py). This repository contains scripts for optimized on-device export suitable to run on Qualcomm® devices. More details on model performance accross various devices, can be found [here](https://aihub.qualcomm.com/models/fcn_resnet50). @@ -14,6 +14,8 @@ accross various devices, can be found [here](https://aihub.qualcomm.com/models/f a hosted Qualcomm® device. + + ## Example & Usage @@ -39,7 +41,7 @@ Additional options are documented with the `--help` option. Note that the above script requires access to Deployment instructions for Qualcomm® AI Hub. ## License -- The license for the original implementation of FCN_ResNet50 can be found +- The license for the original implementation of FCN-ResNet50 can be found [here](https://github.com/pytorch/vision/blob/main/LICENSE). - The license for the compiled assets for on-device deployment can be found [here]({deploy_license_url}) diff --git a/qai_hub_models/models/fcn_resnet50/app.py b/qai_hub_models/models/fcn_resnet50/app.py index f00519d8..08d0329a 100644 --- a/qai_hub_models/models/fcn_resnet50/app.py +++ b/qai_hub_models/models/fcn_resnet50/app.py @@ -15,7 +15,6 @@ from qai_hub_models.models.fcn_resnet50.model import NUM_CLASSES from qai_hub_models.utils.draw import create_color_map -from qai_hub_models.utils.image_processing import normalize_image_transform def preprocess_image(image: Image) -> torch.Tensor: @@ -30,13 +29,7 @@ def preprocess_image(image: Image) -> torch.Tensor: Returns: torch tensor to be directly passed to the model. """ - transform = transforms.Compose( - [ - transforms.ToTensor(), - normalize_image_transform(), - ] - ) - out_tensor: torch.Tensor = transform(image) # type: ignore + out_tensor: torch.Tensor = transforms.ToTensor()(image) # type: ignore return out_tensor.unsqueeze(0) diff --git a/qai_hub_models/models/fcn_resnet50/demo.py b/qai_hub_models/models/fcn_resnet50/demo.py index 6c12063c..2a997b45 100644 --- a/qai_hub_models/models/fcn_resnet50/demo.py +++ b/qai_hub_models/models/fcn_resnet50/demo.py @@ -2,6 +2,8 @@ # Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. 
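The app.py hunk just above strips the mean/std normalization out of `preprocess_image`, and the model.py hunk later in this patch applies `normalize_image_torchvision` inside `forward` instead, so exported assets take raw [0, 1] images. A minimal sketch of the resulting split, assuming the helper applies the standard torchvision ImageNet normalization:

```python
# Hedged sketch of the new preprocessing split (illustrative, not the literal repo code).
import torch
from PIL import Image
from torchvision import transforms


def preprocess_image(image: Image.Image) -> torch.Tensor:
    # App side: only convert to a [0, 1] float tensor and add a batch dimension;
    # no mean/std normalization here any more.
    return transforms.ToTensor()(image).unsqueeze(0)


def segment(model: torch.nn.Module, image: torch.Tensor) -> torch.Tensor:
    # Model side: normalization now happens inside the network's forward pass
    # (assumed ImageNet mean/std, matching torchvision's pretrained weights).
    normalize = transforms.Normalize(
        mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]
    )
    return model(normalize(image))["out"]
```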
# SPDX-License-Identifier: BSD-3-Clause # --------------------------------------------------------------------- +from typing import Type + from qai_hub_models.models.fcn_resnet50.app import FCN_ResNet50App from qai_hub_models.models.fcn_resnet50.model import ( MODEL_ASSET_VERSION, @@ -26,9 +28,9 @@ ) -def main(is_test: bool = False): +def fcn_resnet50_demo(model_cls: Type[FCN_ResNet50], is_test: bool = False): # Demo parameters - parser = get_model_cli_parser(FCN_ResNet50) + parser = get_model_cli_parser(model_cls) parser = get_on_device_demo_parser(parser, add_output_dir=True) parser.add_argument( "--image", @@ -39,12 +41,12 @@ def main(is_test: bool = False): args = parser.parse_args([] if is_test else None) validate_on_device_demo_args(args, MODEL_ID) - model = demo_model_from_cli_args(FCN_ResNet50, MODEL_ID, args) + model = demo_model_from_cli_args(model_cls, MODEL_ID, args) # This FCN ResNet 50 demo comes from # https://pytorch.org/hub/pytorch_vision_fcn_resnet101/ # load image - (_, _, height, width) = FCN_ResNet50.get_input_spec()["image"][0] + (_, _, height, width) = model_cls.get_input_spec()["image"][0] orig_image = load_image(args.image) image, scale, padding = pil_resize_pad(orig_image, (height, width)) input_image = image.convert("RGB") @@ -58,5 +60,9 @@ def main(is_test: bool = False): display_or_save_image(image_annotated, args.output_dir, "fcn_demo_output.png") +def main(is_test: bool = False): + return fcn_resnet50_demo(FCN_ResNet50, is_test=is_test) + + if __name__ == "__main__": main() diff --git a/qai_hub_models/models/fcn_resnet50/export.py b/qai_hub_models/models/fcn_resnet50/export.py index d0a84c70..9b549255 100644 --- a/qai_hub_models/models/fcn_resnet50/export.py +++ b/qai_hub_models/models/fcn_resnet50/export.py @@ -98,7 +98,7 @@ def export_model( if not can_access_qualcomm_ai_hub(): return export_without_hub_access( "fcn_resnet50", - "FCN_ResNet50", + "FCN-ResNet50", device, skip_profiling, skip_inferencing, @@ -120,12 +120,16 @@ def export_model( model.eval() source_model = torch.jit.trace(model.to("cpu"), make_torch_inputs(input_spec)) + # Convert outputs from channel last to channel first (preferred I/O format for QNN and TensorFlow Lite) + channel_last_flags = ( + " --force_channel_last_input image" + " --force_channel_last_output output_0" + if target_runtime != TargetRuntime.ORT + else "" + ) + # 2. 
Compile the model to an on-device asset model_compile_options = model.get_hub_compile_options( - target_runtime, - compile_options - + " --force_channel_last_input image" - + " --force_channel_last_output output_0", + target_runtime, compile_options + channel_last_flags, hub_device ) print(f"Optimizing model {model_name} to run on-device") submitted_compile_job = hub.submit_compile_job( @@ -163,8 +167,10 @@ def export_model( ) sample_inputs = model.sample_inputs(input_spec) # Convert inputs from channel first to channel last - hub_inputs = transpose_channel_first_to_last( - "image", sample_inputs, target_runtime + hub_inputs = ( + sample_inputs + if target_runtime == TargetRuntime.ORT + else transpose_channel_first_to_last("image", sample_inputs, target_runtime) ) submitted_inference_job = hub.submit_inference_job( model=compile_job.get_target_model(), @@ -192,8 +198,12 @@ def export_model( assert inference_job is not None and inference_job.wait().success inference_result: hub.client.DatasetEntries = inference_job.download_output_data() # type: ignore # Convert outputs from channel last to channel first - inference_result = transpose_channel_last_to_first( - "output_0", inference_result, target_runtime + inference_result = ( + inference_result + if target_runtime == TargetRuntime.ORT + else transpose_channel_last_to_first( + "output_0", inference_result, target_runtime + ) ) print_inference_metrics(inference_job, inference_result, torch_out) @@ -205,7 +215,7 @@ def export_model( def main(): warnings.filterwarnings("ignore") - parser = export_parser(model_cls=Model, supports_ort=False) + parser = export_parser(model_cls=Model) args = parser.parse_args() export_model(**vars(args)) diff --git a/qai_hub_models/models/fcn_resnet50/info.yaml b/qai_hub_models/models/fcn_resnet50/info.yaml index 87c73764..fcaeaafa 100644 --- a/qai_hub_models/models/fcn_resnet50/info.yaml +++ b/qai_hub_models/models/fcn_resnet50/info.yaml @@ -1,4 +1,4 @@ -name: FCN_ResNet50 +name: FCN-ResNet50 # id must match with the model dir name in qai_hub_models id: fcn_resnet50 status: public @@ -24,7 +24,7 @@ applicable_scenarios: - Inventory Management related_models: - sam - - unet_segmentation + - deeplabv3_plus_mobilenet - ddrnet23_slim form_factors: - Phone @@ -35,4 +35,5 @@ has_static_banner: yes has_animated_banner: no license_type: bsd-3-clause deploy_license_type: AI Model Hub License -dataset: [] +dataset: + - coco diff --git a/qai_hub_models/models/fcn_resnet50/model.py b/qai_hub_models/models/fcn_resnet50/model.py index 156e63ec..e336cecf 100644 --- a/qai_hub_models/models/fcn_resnet50/model.py +++ b/qai_hub_models/models/fcn_resnet50/model.py @@ -7,7 +7,10 @@ import torch import torchvision.models as tv_models +from qai_hub_models.evaluators.base_evaluators import BaseEvaluator +from qai_hub_models.evaluators.segmentation_evaluator import SegmentationOutputEvaluator from qai_hub_models.utils.base_model import BaseModel +from qai_hub_models.utils.image_processing import normalize_image_torchvision from qai_hub_models.utils.input_spec import InputSpec MODEL_ID = __name__.split(".")[-2] @@ -29,9 +32,13 @@ def __init__( @classmethod def from_pretrained(cls, weights: str = DEFAULT_WEIGHTS) -> FCN_ResNet50: model = tv_models.segmentation.fcn_resnet50(weights=weights).eval() + model.aux_classifier = None return cls(model) - def forward(self, image: torch.Tensor) -> torch.Tensor: + def get_evaluator(self) -> BaseEvaluator: + return SegmentationOutputEvaluator(NUM_CLASSES) + + def forward(self, image): """ Run 
FCN_ResNet50 on `image`, and produce a tensor of classes for segmentation @@ -43,14 +50,14 @@ def forward(self, image: torch.Tensor) -> torch.Tensor: Returns: tensor: 1x21xHxW tensor of class logits per pixel """ - return self.model(image)["out"] + return self.model(normalize_image_torchvision(image))["out"] @staticmethod def get_input_spec( batch_size: int = 1, num_channels: int = 3, - height: int = 224, - width: int = 224, + height: int = 512, + width: int = 512, ) -> InputSpec: # Get the input specification ordered (name -> (shape, type)) pairs for this model. # diff --git a/qai_hub_models/models/fcn_resnet50/perf.yaml b/qai_hub_models/models/fcn_resnet50/perf.yaml index 17a1dc92..23e73bfb 100644 --- a/qai_hub_models/models/fcn_resnet50/perf.yaml +++ b/qai_hub_models/models/fcn_resnet50/perf.yaml @@ -22,6 +22,7 @@ aggregated: - Samsung Galaxy S24 Ultra - Samsung Galaxy S24+ - Samsung Galaxy Tab S8 + - Snapdragon X Elite CRD - Xiaomi 12 - Xiaomi 12 Pro supported_chipsets: @@ -30,53 +31,54 @@ aggregated: - Snapdragon® 8 Gen 2 - Snapdragon® 8 Gen 3 - Snapdragon® 888 + - Snapdragon® X Elite models: -- name: FCN_ResNet50 +- name: FCN-ResNet50 performance_metrics: - torchscript_onnx_tflite: - inference_time: 8481.0 - throughput: 117.91062374719962 + inference_time: 42451.0 + throughput: 23.55657110550988 estimated_peak_memory_range: - min: 4251648 - max: 6673424 + min: 22093824 + max: 24844120 primary_compute_unit: NPU precision: fp16 layer_info: - layers_on_npu: 84 + layers_on_npu: 86 layers_on_gpu: 0 layers_on_cpu: 0 - total_layers: 84 - job_id: jqpyr7ll5 + total_layers: 86 + job_id: jqp4wrn1g job_status: Passed torchscript_onnx_qnn: - inference_time: 7915.0 - throughput: 126.34238787113077 + inference_time: 42160.0 + throughput: 23.719165085388994 estimated_peak_memory_range: - min: 32768 - max: 14371224 + min: 3166208 + max: 20971816 primary_compute_unit: NPU precision: fp16 layer_info: - layers_on_npu: 125 + layers_on_npu: 127 layers_on_gpu: 0 layers_on_cpu: 0 - total_layers: 125 - job_id: j1p804nog + total_layers: 127 + job_id: jegnev0qg job_status: Passed torchscript_onnx_ort: - inference_time: 434382.0 - throughput: 2.3021211744501384 + inference_time: 42833.0 + throughput: 23.346485186655148 estimated_peak_memory_range: - min: 229376 - max: 157385104 + min: 46034944 + max: 200591552 primary_compute_unit: NPU precision: fp16 layer_info: - layers_on_npu: 1 + layers_on_npu: 129 layers_on_gpu: 0 layers_on_cpu: 0 - total_layers: 1 - job_id: jn5qemno5 + total_layers: 129 + job_id: j2p0r04np job_status: Passed reference_device_info: name: Samsung Galaxy S23 @@ -85,51 +87,51 @@ models: os_name: Android manufacturer: Samsung chipset: Snapdragon® 8 Gen 2 - timestamp: '2024-04-23T18:42:31.629765Z' + timestamp: '2024-05-20T16:35:28.951494Z' - torchscript_onnx_tflite: - inference_time: 6385.0 - throughput: 156.61707126076743 + inference_time: 30899.0 + throughput: 32.363506909608724 estimated_peak_memory_range: - min: 4259840 - max: 81999104 + min: 20209664 + max: 155788144 primary_compute_unit: NPU precision: fp16 layer_info: - layers_on_npu: 84 + layers_on_npu: 86 layers_on_gpu: 0 layers_on_cpu: 0 - total_layers: 84 - job_id: j2p03vwnp + total_layers: 86 + job_id: j0px1o9lg job_status: Passed torchscript_onnx_qnn: - inference_time: 5804.0 - throughput: 172.2949689869056 + inference_time: 31911.0 + throughput: 31.337156466422236 estimated_peak_memory_range: - min: 618496 - max: 57524672 + min: 2564096 + max: 76317072 primary_compute_unit: NPU precision: fp16 layer_info: - layers_on_npu: 125 + 
layers_on_npu: 127 layers_on_gpu: 0 layers_on_cpu: 0 - total_layers: 125 - job_id: jogk791np + total_layers: 127 + job_id: jopry367g job_status: Passed torchscript_onnx_ort: - inference_time: 334126.0 - throughput: 2.9928829244057633 + inference_time: 32386.0 + throughput: 30.877539677638485 estimated_peak_memory_range: - min: 3608576 - max: 48710400 + min: 43917312 + max: 112401296 primary_compute_unit: NPU precision: fp16 layer_info: - layers_on_npu: 1 + layers_on_npu: 129 layers_on_gpu: 0 layers_on_cpu: 0 - total_layers: 1 - job_id: j1gl61jmg + total_layers: 129 + job_id: j1p87y2o5 job_status: Passed reference_device_info: name: Samsung Galaxy S24 @@ -138,36 +140,36 @@ models: os_name: Android manufacturer: Samsung chipset: Snapdragon® 8 Gen 3 - timestamp: '2024-04-23T18:42:31.629842Z' + timestamp: '2024-05-20T16:35:28.951519Z' - torchscript_onnx_tflite: - inference_time: 8533.0 - throughput: 117.19207781553968 + inference_time: 42178.0 + throughput: 23.709042628858647 estimated_peak_memory_range: - min: 4243456 - max: 6395552 + min: 18853888 + max: 20525048 primary_compute_unit: NPU precision: fp16 layer_info: - layers_on_npu: 84 + layers_on_npu: 86 layers_on_gpu: 0 layers_on_cpu: 0 - total_layers: 84 - job_id: jvgdemx65 + total_layers: 86 + job_id: jo5mzxe9p job_status: Passed torchscript_onnx_qnn: - inference_time: 7887.0 - throughput: 126.79092177000126 + inference_time: 42067.0 + throughput: 23.77160244372073 estimated_peak_memory_range: - min: 16384 - max: 14326120 + min: 3178496 + max: 20597416 primary_compute_unit: NPU precision: fp16 layer_info: - layers_on_npu: 125 + layers_on_npu: 127 layers_on_gpu: 0 layers_on_cpu: 0 - total_layers: 125 - job_id: jo5mqln7p + total_layers: 127 + job_id: jqpyd3zlp job_status: Passed reference_device_info: name: QCS8550 (Proxy) @@ -176,4 +178,57 @@ models: os_name: Android manufacturer: Qualcomm chipset: Qcs8550 - timestamp: '2024-04-23T18:42:31.629884Z' + timestamp: '2024-05-20T16:35:28.951537Z' + - torchscript_onnx_qnn: + inference_time: 68578.0 + throughput: 14.581935897809792 + estimated_peak_memory_range: + min: 3153920 + max: 3153920 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 127 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 127 + job_id: jep2myxq5 + job_status: Passed + torchscript_onnx_ort: + inference_time: 42426.0 + throughput: 23.57045208127092 + estimated_peak_memory_range: + min: 40243200 + max: 40243200 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 129 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 129 + job_id: jogkyxvnp + job_status: Passed + torchscript_onnx_ort_dml_gpu: + inference_time: 340971.0 + throughput: 2.932800736719545 + estimated_peak_memory_range: + min: 278179840 + max: 278179840 + primary_compute_unit: CPU + precision: fp32 + layer_info: + layers_on_npu: 0 + layers_on_gpu: 0 + layers_on_cpu: 59 + total_layers: 59 + job_id: jn5q2q0o5 + job_status: Passed + reference_device_info: + name: Snapdragon X Elite CRD + os: '11' + form_factor: Compute + os_name: Windows + manufacturer: Qualcomm + chipset: Snapdragon® X Elite + timestamp: '2024-05-20T16:35:28.951559Z' diff --git a/qai_hub_models/models/fcn_resnet50_quantized/README.md b/qai_hub_models/models/fcn_resnet50_quantized/README.md new file mode 100644 index 00000000..72302a12 --- /dev/null +++ b/qai_hub_models/models/fcn_resnet50_quantized/README.md @@ -0,0 +1,56 @@ +[![Qualcomm® AI Hub 
Models](https://qaihub-public-assets.s3.us-west-2.amazonaws.com/qai-hub-models/quic-logo.jpg)](../../README.md) + + +# [FCN-ResNet50-Quantized: Quantized fully-convolutional network model for image segmentation](https://aihub.qualcomm.com/models/fcn_resnet50_quantized) + +FCN_ResNet50 is a quantized machine learning model that can segment images from the COCO dataset. It uses ResNet50 as a backbone. + +This is based on the implementation of FCN-ResNet50-Quantized found +[here](https://github.com/pytorch/vision/blob/main/torchvision/models/segmentation/fcn.py). This repository contains scripts for optimized on-device +export suitable to run on Qualcomm® devices. More details on model performance +accross various devices, can be found [here](https://aihub.qualcomm.com/models/fcn_resnet50_quantized). + +[Sign up](https://myaccount.qualcomm.com/signup) for early access to run these models on +a hosted Qualcomm® device. + + + + +## Example & Usage + + +Once installed, run the following simple CLI demo: + +```bash +python -m qai_hub_models.models.fcn_resnet50_quantized.demo +``` +More details on the CLI tool can be found with the `--help` option. See +[demo.py](demo.py) for sample usage of the model including pre/post processing +scripts. Please refer to our [general instructions on using +models](../../../#getting-started) for more usage instructions. + +## Export for on-device deployment + +This repository contains export scripts that produce a model optimized for +on-device deployment. This can be run as follows: + +```bash +python -m qai_hub_models.models.fcn_resnet50_quantized.export +``` +Additional options are documented with the `--help` option. Note that the above +script requires access to Deployment instructions for Qualcomm® AI Hub. + +## License +- The license for the original implementation of FCN-ResNet50-Quantized can be found + [here](https://github.com/pytorch/vision/blob/main/LICENSE). +- The license for the compiled assets for on-device deployment can be found [here]({deploy_license_url}) + +## References +* [Fully Convolutional Networks for Semantic Segmentation](https://arxiv.org/abs/1411.4038) +* [Source Model Implementation](https://github.com/pytorch/vision/blob/main/torchvision/models/segmentation/fcn.py) + +## Community +* Join [our AI Hub Slack community](https://qualcomm-ai-hub.slack.com/join/shared_invite/zt-2d5zsmas3-Sj0Q9TzslueCjS31eXG2UA#/shared-invite/email) to collaborate, post questions and learn more about on-device AI. +* For questions or feedback please [reach out to us](mailto:ai-hub-support@qti.qualcomm.com). + + diff --git a/qai_hub_models/models/fcn_resnet50_quantized/__init__.py b/qai_hub_models/models/fcn_resnet50_quantized/__init__.py new file mode 100644 index 00000000..6f6e853c --- /dev/null +++ b/qai_hub_models/models/fcn_resnet50_quantized/__init__.py @@ -0,0 +1,8 @@ +# --------------------------------------------------------------------- +# Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. 
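The `__init__.py` introduced here (continued just below) follows the repository convention of re-exporting the concrete classes as `Model` and `App`, so the quantized variant is driven exactly like the FP32 one. A hedged usage sketch, mirroring the calls made in this package's test.py; the image path is hypothetical:

```python
# Hedged usage sketch; predict(image, True) mirrors the package's test.py and is
# assumed to return a per-pixel class mask.
from PIL import Image

from qai_hub_models.models.fcn_resnet50_quantized import App, Model

model = Model.from_pretrained()   # FCN_ResNet50Quantizable (AIMET QuantSim wrapper)
app = App(model)                  # FCN_ResNet50App, shared with the FP32 package
image = Image.open("input.jpg")   # hypothetical local image
mask = app.predict(image, True)
```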
+# SPDX-License-Identifier: BSD-3-Clause +# --------------------------------------------------------------------- +from qai_hub_models.models.fcn_resnet50.app import FCN_ResNet50App as App # noqa: F401 + +from .model import MODEL_ID # noqa: F401 +from .model import FCN_ResNet50Quantizable as Model # noqa: F401 diff --git a/qai_hub_models/models/fcn_resnet50_quantized/conftest.py b/qai_hub_models/models/fcn_resnet50_quantized/conftest.py new file mode 100644 index 00000000..8fd7c424 --- /dev/null +++ b/qai_hub_models/models/fcn_resnet50_quantized/conftest.py @@ -0,0 +1,39 @@ +# --------------------------------------------------------------------- +# Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. +# SPDX-License-Identifier: BSD-3-Clause +# --------------------------------------------------------------------- +# THIS FILE WAS AUTO-GENERATED. DO NOT EDIT MANUALLY. + +import inspect + +import pytest + +from qai_hub_models.models.fcn_resnet50_quantized import Model +from qai_hub_models.utils.testing import skip_clone_repo_check + + +# Instantiate the model only once for all tests. +# Mock from_pretrained to always return the initialized model. +# This speeds up tests and limits memory leaks. +@pytest.fixture(scope="module", autouse=True) +def cached_from_pretrained(): + with pytest.MonkeyPatch.context() as mp: + pretrained_cache = {} + from_pretrained = Model.from_pretrained + sig = inspect.signature(from_pretrained) + + @skip_clone_repo_check + def _cached_from_pretrained(*args, **kwargs): + cache_key = str(args) + str(kwargs) + model = pretrained_cache.get(cache_key, None) + if model: + return model + else: + model = from_pretrained(*args, **kwargs) + pretrained_cache[cache_key] = model + return model + + _cached_from_pretrained.__signature__ = sig + + mp.setattr(Model, "from_pretrained", _cached_from_pretrained) + yield mp diff --git a/qai_hub_models/models/fcn_resnet50_quantized/demo.py b/qai_hub_models/models/fcn_resnet50_quantized/demo.py new file mode 100644 index 00000000..cc6abc16 --- /dev/null +++ b/qai_hub_models/models/fcn_resnet50_quantized/demo.py @@ -0,0 +1,14 @@ +# --------------------------------------------------------------------- +# Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. +# SPDX-License-Identifier: BSD-3-Clause +# --------------------------------------------------------------------- +from qai_hub_models.models.fcn_resnet50.demo import fcn_resnet50_demo +from qai_hub_models.models.fcn_resnet50_quantized.model import FCN_ResNet50Quantizable + + +def main(is_test: bool = False): + fcn_resnet50_demo(FCN_ResNet50Quantizable, is_test) + + +if __name__ == "__main__": + main() diff --git a/qai_hub_models/models/fcn_resnet50_quantized/export.py b/qai_hub_models/models/fcn_resnet50_quantized/export.py new file mode 100644 index 00000000..cf0b371a --- /dev/null +++ b/qai_hub_models/models/fcn_resnet50_quantized/export.py @@ -0,0 +1,232 @@ +# --------------------------------------------------------------------- +# Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. +# SPDX-License-Identifier: BSD-3-Clause +# --------------------------------------------------------------------- +# THIS FILE WAS AUTO-GENERATED. DO NOT EDIT MANUALLY. 
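The auto-generated export script that follows can also be driven programmatically. A hedged sketch of a minimal call, using only parameters from the `export_model` signature below and assuming Qualcomm AI Hub access is configured:

```python
# Hedged sketch: programmatic use of the generated export entry point.
# Without AI Hub credentials the function falls back to export_without_hub_access.
from qai_hub_models.models.fcn_resnet50_quantized.export import export_model
from qai_hub_models.utils.base_model import TargetRuntime

jobs = export_model(
    device="Samsung Galaxy S23",          # default device in the signature below
    target_runtime=TargetRuntime.TFLITE,  # default runtime
    skip_profiling=True,                  # compile and run sample inference only
)
# With hub access, `jobs` is a (CompileJob, ProfileJob | None, InferenceJob | None) tuple.
```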
+ + +from __future__ import annotations + +import os +import warnings +from pathlib import Path +from typing import Any, Dict, List, Optional, Tuple, cast + +import qai_hub as hub + +from qai_hub_models.models.fcn_resnet50_quantized import Model +from qai_hub_models.utils.args import ( + export_parser, + get_input_spec_kwargs, + get_model_kwargs, +) +from qai_hub_models.utils.base_model import TargetRuntime +from qai_hub_models.utils.compare import torch_inference +from qai_hub_models.utils.printing import ( + print_inference_metrics, + print_on_target_demo_cmd, + print_profile_metrics_from_job, +) +from qai_hub_models.utils.qai_hub_helpers import ( + can_access_qualcomm_ai_hub, + export_without_hub_access, + transpose_channel_first_to_last, + transpose_channel_last_to_first, +) +from qai_hub_models.utils.qnn_helpers import get_qnn_inputs + + +def export_model( + device: str = "Samsung Galaxy S23", + chipset: Optional[str] = None, + skip_profiling: bool = False, + skip_inferencing: bool = False, + skip_downloading: bool = False, + skip_summary: bool = False, + output_dir: Optional[str] = None, + target_runtime: TargetRuntime = TargetRuntime.TFLITE, + compile_options: str = "", + profile_options: str = "", + **additional_model_kwargs, +) -> Tuple[hub.CompileJob, Optional[hub.ProfileJob], Optional[hub.InferenceJob]] | List[ + str +]: + """ + This function accomplishes 6 main tasks: + + 1. Instantiates a PyTorch model and converts it to a traced TorchScript format. + 2. Compiles the model to an asset that can be run on device. + 3. Profiles the model performance on real devices. + 4. Inferences the model on sample inputs. + 5. Downloads the model asset to the local directory. + 6. Summarizes the results from profiling and inference. + + Each of the last four steps can be optionally skipped using the input options. + + Parameters: + device: Device for which to export the model. + Full list of available devices can be found by running `hub.get_devices()`. + Defaults to DEFAULT_DEVICE if not specified. + chipset: If set, will choose a random device with this chipset. + Overrides the `device` argument. + skip_profiling: If set, skips profiling of compiled model on real devices. + skip_inferencing: If set, skips computing on-device outputs from sample data. + skip_downloading: If set, skips downloading of compiled model. + skip_summary: If set, skips waiting for and summarizing results + from profiling and inference. + output_dir: Directory to store generated assets (e.g. compiled model). + Defaults to `/build/`. + target_runtime: Which on-device runtime to target. Default is TFLite. + compile_options: Additional options to pass when submitting the compile job. + profile_options: Additional options to pass when submitting the profile job. + **additional_model_kwargs: Additional optional kwargs used to customize + `model_cls.from_pretrained` and `model.get_input_spec` + + Returns: + A 3-tuple of: + * A CompileJob object containing metadata about the compile job submitted to hub. + * A ProfileJob containing metadata about the profile job (None if profiling skipped). + * An InferenceJob containing metadata about the inference job (None if inferencing skipped). 
+ """ + model_name = "fcn_resnet50_quantized" + output_path = Path(output_dir or Path.cwd() / "build" / model_name) + if chipset: + hub_device = hub.Device(attributes=f"chipset:{chipset}") + else: + hub_device = hub.Device(name=device) + if not can_access_qualcomm_ai_hub(): + return export_without_hub_access( + "fcn_resnet50_quantized", + "FCN-ResNet50-Quantized", + device, + skip_profiling, + skip_inferencing, + skip_downloading, + skip_summary, + output_path, + target_runtime, + compile_options, + profile_options, + ) + + # 1. Initialize PyTorch model + model = Model.from_pretrained(**get_model_kwargs(Model, additional_model_kwargs)) + input_spec = model.get_input_spec( + **get_input_spec_kwargs(model, additional_model_kwargs) + ) + + # Trace the model + source_model = model.convert_to_hub_source_model( + target_runtime, output_path, input_spec + ) + if target_runtime == TargetRuntime.TFLITE: + quant_calibration_data = None + else: + quant_calibration_data = model.get_calibration_data(target_runtime, input_spec) + + # Convert outputs from channel last to channel first (preferred I/O format for QNN and TensorFlow Lite) + channel_last_flags = ( + " --force_channel_last_input image" + " --force_channel_last_output output_0" + if target_runtime != TargetRuntime.ORT + else "" + ) + + # 2. Compile the model to an on-device asset + model_compile_options = model.get_hub_compile_options( + target_runtime, compile_options + channel_last_flags, hub_device + ) + print(f"Optimizing model {model_name} to run on-device") + submitted_compile_job = hub.submit_compile_job( + model=source_model, + input_specs=input_spec, + device=hub_device, + name=model_name, + calibration_data=quant_calibration_data, + options=model_compile_options, + ) + compile_job = cast(hub.client.CompileJob, submitted_compile_job) + + # 3. Profile the model asset on real devices + profile_job: Optional[hub.client.ProfileJob] = None + if not skip_profiling: + profile_options_all = model.get_hub_profile_options( + target_runtime, profile_options + ) + print(f"Profiling model {model_name} on a hosted device.") + submitted_profile_job = hub.submit_profile_job( + model=compile_job.get_target_model(), + device=hub_device, + name=model_name, + options=profile_options_all, + ) + profile_job = cast(hub.client.ProfileJob, submitted_profile_job) + + # 4. Run inference on-device with sample inputs + inference_job: Optional[hub.client.InferenceJob] = None + if not skip_inferencing: + profile_options_all = model.get_hub_profile_options( + target_runtime, profile_options + ) + print( + f"Running inference for {model_name} on a hosted device with example inputs." + ) + sample_inputs = model.sample_inputs(input_spec) + hub_inputs = sample_inputs + if target_runtime == TargetRuntime.QNN: + hub_inputs = get_qnn_inputs(compile_job, sample_inputs) + # Convert inputs from channel first to channel last + hub_inputs = ( + sample_inputs + if target_runtime == TargetRuntime.ORT + else transpose_channel_first_to_last("image", sample_inputs, target_runtime) + ) + submitted_inference_job = hub.submit_inference_job( + model=compile_job.get_target_model(), + inputs=hub_inputs, + device=hub_device, + name=model_name, + options=profile_options_all, + ) + inference_job = cast(hub.client.InferenceJob, submitted_inference_job) + + # 5. 
Download the model asset to a local file + if not skip_downloading: + os.makedirs(output_path, exist_ok=True) + target_model: hub.Model = compile_job.get_target_model() # type: ignore + target_model.download(str(output_path / f"{model_name}.tflite")) + + # 6. Summarize the results from profiling and inference + if not skip_summary and not skip_profiling: + assert profile_job is not None and profile_job.wait().success + profile_data: Dict[str, Any] = profile_job.download_profile() # type: ignore + print_profile_metrics_from_job(profile_job, profile_data) + + if not skip_summary and not skip_inferencing: + torch_out = torch_inference(model, sample_inputs) + assert inference_job is not None and inference_job.wait().success + inference_result: hub.client.DatasetEntries = inference_job.download_output_data() # type: ignore + # Convert outputs from channel last to channel first + inference_result = ( + inference_result + if target_runtime == TargetRuntime.ORT + else transpose_channel_last_to_first( + "output_0", inference_result, target_runtime + ) + ) + print_inference_metrics(inference_job, inference_result, torch_out) + + if not skip_summary: + print_on_target_demo_cmd(compile_job, Path(__file__).parent.resolve(), device) + + return (compile_job, profile_job, inference_job) + + +def main(): + warnings.filterwarnings("ignore") + parser = export_parser(model_cls=Model) + args = parser.parse_args() + export_model(**vars(args)) + + +if __name__ == "__main__": + main() diff --git a/qai_hub_models/models/fcn_resnet50_quantized/info.yaml b/qai_hub_models/models/fcn_resnet50_quantized/info.yaml new file mode 100644 index 00000000..21939860 --- /dev/null +++ b/qai_hub_models/models/fcn_resnet50_quantized/info.yaml @@ -0,0 +1,41 @@ +name: FCN-ResNet50-Quantized +# id must match with the model dir name in qai_hub_models +id: fcn_resnet50_quantized +status: public +headline: Quantized fully-convolutional network model for image segmentation. +domain: Computer Vision +use_case: Semantic Segmentation +description: FCN_ResNet50 is a quantized machine learning model that can segment images from + the COCO dataset. It uses ResNet50 as a backbone. +tags: + - quantized +research_paper: https://arxiv.org/abs/1411.4038 +research_paper_title: Fully Convolutional Networks for Semantic Segmentation +license: https://github.com/pytorch/vision/blob/main/LICENSE +deploy_license: + https://qaihub-public-assets.s3.us-west-2.amazonaws.com/qai-hub-models/Qualcomm+AI+Hub+Proprietary+License.pdf +source_repo: + https://github.com/pytorch/vision/blob/main/torchvision/models/segmentation/fcn.py +technical_details: + Model checkpoint: COCO_WITH_VOC_LABELS_V1 + Input resolution: 512x512 + Number of parameters: 33.0M + Model size: 32.2 MB +applicable_scenarios: + - Anomaly Detection + - Inventory Management +related_models: + - sam + - deeplabv3_plus_mobilenet + - ddrnet23_slim +form_factors: + - Phone + - Tablet + - IoT + - XR +has_static_banner: yes +has_animated_banner: no +license_type: bsd-3-clause +deploy_license_type: AI Model Hub License +dataset: + - coco diff --git a/qai_hub_models/models/fcn_resnet50_quantized/model.py b/qai_hub_models/models/fcn_resnet50_quantized/model.py new file mode 100644 index 00000000..affc65ef --- /dev/null +++ b/qai_hub_models/models/fcn_resnet50_quantized/model.py @@ -0,0 +1,87 @@ +# --------------------------------------------------------------------- +# Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. 
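The same pattern recurs in every export script touched by this patch: the channel-last compile flags and the input/output transposes are applied only for non-ORT runtimes, since the ONNX Runtime path keeps the model's native channel-first layout. A condensed, hedged sketch of that control flow, reusing the helper names imported in the diffs above:

```python
# Hedged sketch of the runtime-conditional layout handling (illustrative only).
from qai_hub_models.utils.base_model import TargetRuntime
from qai_hub_models.utils.qai_hub_helpers import (
    transpose_channel_first_to_last,
    transpose_channel_last_to_first,
)


def layout_flags(target_runtime: TargetRuntime) -> str:
    # QNN and TFLite prefer channel-last I/O; ORT keeps channel-first.
    if target_runtime == TargetRuntime.ORT:
        return ""
    return " --force_channel_last_input image --force_channel_last_output output_0"


def to_hub_layout(sample_inputs, target_runtime: TargetRuntime):
    if target_runtime == TargetRuntime.ORT:
        return sample_inputs
    return transpose_channel_first_to_last("image", sample_inputs, target_runtime)


def from_hub_layout(inference_result, target_runtime: TargetRuntime):
    if target_runtime == TargetRuntime.ORT:
        return inference_result
    return transpose_channel_last_to_first("output_0", inference_result, target_runtime)
```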
+# SPDX-License-Identifier: BSD-3-Clause +# --------------------------------------------------------------------- +from __future__ import annotations + +# isort: off +# This verifies aimet is installed, and this must be included first. +from qai_hub_models.utils.quantization_aimet import ( + AIMETQuantizableMixin, +) + +# isort: on + +import torch +from aimet_torch.cross_layer_equalization import equalize_model +from aimet_torch.model_preparer import prepare_model +from aimet_torch.quantsim import QuantizationSimModel, load_encodings_to_sim + +from qai_hub_models.models.fcn_resnet50.model import FCN_ResNet50 +from qai_hub_models.utils.aimet.config_loader import get_default_aimet_config +from qai_hub_models.utils.asset_loaders import CachedWebModelAsset +from qai_hub_models.utils.quantization_aimet import ( + constrain_quantized_inputs_to_image_range, + tie_observers, +) + +MODEL_ID = __name__.split(".")[-2] +MODEL_ASSET_VERSION = 1 +DEFAULT_ENCODINGS = "fcn_resnet50_quantized_encodings.json" + + +class FCN_ResNet50Quantizable(AIMETQuantizableMixin, FCN_ResNet50): + """ + FCN_ResNet50 with post train quantization support. + + Supports only 8 bit weights and activations + """ + + def __init__( + self, + model: QuantizationSimModel, + ) -> None: + FCN_ResNet50.__init__(self, model.model) + AIMETQuantizableMixin.__init__(self, model) + + @classmethod + def from_pretrained( + cls, + aimet_encodings: str | None = "DEFAULT", + ) -> "FCN_ResNet50Quantizable": + # Load Model + fp16_model = FCN_ResNet50.from_pretrained() + input_shape = cls.get_input_spec()["image"][0] + + model = prepare_model(fp16_model) + equalize_model(model, input_shape) + + sim = QuantizationSimModel( + model, + quant_scheme="tf_enhanced", + default_param_bw=8, + default_output_bw=8, + config_file=get_default_aimet_config(), + dummy_input=torch.rand(input_shape), + ) + tie_observers(sim) + constrain_quantized_inputs_to_image_range(sim) + + if aimet_encodings: + if aimet_encodings == "DEFAULT": + aimet_encodings = CachedWebModelAsset.from_asset_store( + MODEL_ID, MODEL_ASSET_VERSION, DEFAULT_ENCODINGS + ).fetch() + load_encodings_to_sim(sim, aimet_encodings) + + sim.model.eval() + final_model = cls(sim) + return final_model + + def forward(self, image: torch.Tensor): + """ + Run FCN_ResNet50Quantizable on `image`, and produce a segmentation mask. + + See FCN_ResNet50 model for details. 
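The perf.yaml added below records `inference_time` in microseconds and `throughput` in inferences per second; assuming those units, the two are reciprocals, which gives a quick sanity check on any row. For example, using the Samsung Galaxy S23 TFLite entry below:

```python
# Sanity check of a perf.yaml row (assumption: inference_time is in microseconds).
inference_time_us = 14056.0                  # torchscript_onnx_tflite on Galaxy S23
throughput_ips = 1_000_000 / inference_time_us
print(f"{throughput_ips:.6f}")               # 71.143995, matching the recorded value
```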
+ """ + return self.model(image) diff --git a/qai_hub_models/models/fcn_resnet50_quantized/perf.yaml b/qai_hub_models/models/fcn_resnet50_quantized/perf.yaml new file mode 100644 index 00000000..dae0447f --- /dev/null +++ b/qai_hub_models/models/fcn_resnet50_quantized/perf.yaml @@ -0,0 +1,301 @@ +aggregated: + supported_oses: + - Android + supported_devices: + - Google Pixel 3 + - Google Pixel 3a + - Google Pixel 3a XL + - Google Pixel 4 + - Google Pixel 4a + - Google Pixel 5a 5G + - QCS6490 (Proxy) + - QCS8250 (Proxy) + - QCS8550 (Proxy) + - RB3 Gen 2 (Proxy) + - RB5 (Proxy) + - Samsung Galaxy S21 + - Samsung Galaxy S21 Ultra + - Samsung Galaxy S21+ + - Samsung Galaxy S22 5G + - Samsung Galaxy S22 Ultra 5G + - Samsung Galaxy S22+ 5G + - Samsung Galaxy S23 + - Samsung Galaxy S23 Ultra + - Samsung Galaxy S23+ + - Samsung Galaxy S24 + - Samsung Galaxy S24 Ultra + - Samsung Galaxy S24+ + - Samsung Galaxy Tab S8 + - Snapdragon X Elite CRD + - Xiaomi 12 + - Xiaomi 12 Pro + supported_chipsets: + - Qcs6490 + - Qcs8250 + - Qcs8550 + - Snapdragon® 8 Gen 1 + - Snapdragon® 8 Gen 2 + - Snapdragon® 8 Gen 3 + - Snapdragon® 888 + - Snapdragon® X Elite +models: +- name: FCN-ResNet50-Quantized + performance_metrics: + - torchscript_onnx_tflite: + inference_time: 14056.0 + throughput: 71.14399544678429 + estimated_peak_memory_range: + min: 5554176 + max: 7613336 + primary_compute_unit: NPU + precision: int8 + layer_info: + layers_on_npu: 87 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 87 + job_id: j1glkm6mp + job_status: Passed + torchscript_onnx_qnn: + inference_time: 15255.0 + throughput: 65.55227794165847 + estimated_peak_memory_range: + min: 16384 + max: 85850320 + primary_compute_unit: NPU + precision: int8 + layer_info: + layers_on_npu: 79 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 79 + job_id: jwgov6kk5 + job_status: Passed + torchscript_onnx_ort: + inference_time: 19290.0 + throughput: 51.84033177812338 + estimated_peak_memory_range: + min: 44077056 + max: 93926136 + primary_compute_unit: NPU + precision: int8 + layer_info: + layers_on_npu: 82 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 82 + job_id: jygz7doxp + job_status: Passed + reference_device_info: + name: Samsung Galaxy S23 + os: '13' + form_factor: Phone + os_name: Android + manufacturer: Samsung + chipset: Snapdragon® 8 Gen 2 + timestamp: '2024-05-20T16:35:28.981987Z' + - torchscript_onnx_tflite: + inference_time: 10013.0 + throughput: 99.87016878058525 + estimated_peak_memory_range: + min: 49152 + max: 82780048 + primary_compute_unit: NPU + precision: int8 + layer_info: + layers_on_npu: 87 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 87 + job_id: jw5614eyp + job_status: Passed + torchscript_onnx_qnn: + inference_time: 11218.0 + throughput: 89.14244963451596 + estimated_peak_memory_range: + min: 802816 + max: 56818672 + primary_compute_unit: NPU + precision: int8 + layer_info: + layers_on_npu: 79 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 79 + job_id: j1pvwk0rg + job_status: Passed + torchscript_onnx_ort: + inference_time: 14506.0 + throughput: 68.93699158968703 + estimated_peak_memory_range: + min: 48697344 + max: 95933808 + primary_compute_unit: NPU + precision: int8 + layer_info: + layers_on_npu: 82 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 82 + job_id: jz5w962mp + job_status: Passed + reference_device_info: + name: Samsung Galaxy S24 + os: '14' + form_factor: Phone + os_name: Android + manufacturer: Samsung + chipset: Snapdragon® 8 Gen 3 + timestamp: 
'2024-05-20T16:35:28.982012Z' + - torchscript_onnx_tflite: + inference_time: 14093.0 + throughput: 70.95721280068119 + estimated_peak_memory_range: + min: 5595136 + max: 7636376 + primary_compute_unit: NPU + precision: int8 + layer_info: + layers_on_npu: 87 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 87 + job_id: j1p3m0vng + job_status: Passed + torchscript_onnx_qnn: + inference_time: 15248.0 + throughput: 65.58237145855195 + estimated_peak_memory_range: + min: 16384 + max: 73538552 + primary_compute_unit: NPU + precision: int8 + layer_info: + layers_on_npu: 79 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 79 + job_id: jlpevmev5 + job_status: Passed + reference_device_info: + name: QCS8550 (Proxy) + os: '12' + form_factor: Iot + os_name: Android + manufacturer: Qualcomm + chipset: Qcs8550 + timestamp: '2024-05-20T16:35:28.982029Z' + - torchscript_onnx_tflite: + inference_time: 90967.0 + throughput: 10.992997460617586 + estimated_peak_memory_range: + min: 274432 + max: 138676000 + primary_compute_unit: NPU + precision: int8 + layer_info: + layers_on_npu: 87 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 87 + job_id: jqpy62q45 + job_status: Passed + torchscript_onnx_qnn: + inference_time: 93267.0 + throughput: 10.721905925997405 + estimated_peak_memory_range: + min: 905216 + max: 129785056 + primary_compute_unit: NPU + precision: int8 + layer_info: + layers_on_npu: 79 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 79 + job_id: jw56nmr7g + job_status: Passed + reference_device_info: + name: RB3 Gen 2 (Proxy) + os: '12' + form_factor: Iot + os_name: Android + manufacturer: Qualcomm + chipset: Qcs6490 + timestamp: '2024-05-20T16:35:28.982045Z' + - torchscript_onnx_tflite: + inference_time: 703201.0 + throughput: 1.422068512416791 + estimated_peak_memory_range: + min: 51548160 + max: 190297032 + primary_compute_unit: NPU + precision: int8 + layer_info: + layers_on_npu: 66 + layers_on_gpu: 9 + layers_on_cpu: 12 + total_layers: 87 + job_id: j2p0l9dep + job_status: Passed + reference_device_info: + name: RB5 (Proxy) + os: '12' + form_factor: Iot + os_name: Android + manufacturer: Qualcomm + chipset: Qcs8250 + timestamp: '2024-05-20T16:35:28.982055Z' + - torchscript_onnx_qnn: + inference_time: 16865.0 + throughput: 59.29439667951379 + estimated_peak_memory_range: + min: 786432 + max: 786432 + primary_compute_unit: NPU + precision: int8 + layer_info: + layers_on_npu: 79 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 79 + job_id: j7gjlnzep + job_status: Passed + torchscript_onnx_ort: + inference_time: 17493.0 + throughput: 57.16572343223004 + estimated_peak_memory_range: + min: 72589312 + max: 72589312 + primary_compute_unit: NPU + precision: int8 + layer_info: + layers_on_npu: 82 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 82 + job_id: jmg94nj85 + job_status: Passed + torchscript_onnx_ort_dml_gpu: + inference_time: 'null' + throughput: 'null' + estimated_peak_memory_range: + min: 0 + max: 0 + primary_compute_unit: 'null' + precision: 'null' + layer_info: + layers_on_npu: 0 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 0 + job_id: jnp18zy7g + job_status: Failed + reference_device_info: + name: Snapdragon X Elite CRD + os: '11' + form_factor: Compute + os_name: Windows + manufacturer: Qualcomm + chipset: Snapdragon® X Elite + timestamp: '2024-05-20T16:35:28.982080Z' diff --git a/qai_hub_models/models/fcn_resnet50_quantized/test.py b/qai_hub_models/models/fcn_resnet50_quantized/test.py new file mode 100644 index 00000000..d5dae110 --- /dev/null 
+++ b/qai_hub_models/models/fcn_resnet50_quantized/test.py @@ -0,0 +1,40 @@ +# --------------------------------------------------------------------- +# Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. +# SPDX-License-Identifier: BSD-3-Clause +# --------------------------------------------------------------------- +import torch + +from qai_hub_models.models.fcn_resnet50.app import FCN_ResNet50App +from qai_hub_models.models.fcn_resnet50.demo import INPUT_IMAGE_ADDRESS +from qai_hub_models.models.fcn_resnet50_quantized.demo import main as demo_main +from qai_hub_models.models.fcn_resnet50_quantized.model import ( + MODEL_ASSET_VERSION, + MODEL_ID, + FCN_ResNet50Quantizable, +) +from qai_hub_models.utils.asset_loaders import ( + CachedWebModelAsset, + load_image, + load_numpy, +) +from qai_hub_models.utils.testing import skip_clone_repo_check + +OUTPUT_IMAGE_MASK = CachedWebModelAsset.from_asset_store( + MODEL_ID, MODEL_ASSET_VERSION, "fcn_resnet50_output_mask.npy" +) + + +@skip_clone_repo_check +def test_task(): + # AIMET Quantization Simulator introduces randomness. Eliminate that for this test. + torch.manual_seed(0) + image = load_image(INPUT_IMAGE_ADDRESS) + app = FCN_ResNet50App(FCN_ResNet50Quantizable.from_pretrained()) + output_mask = app.predict(image, True) + output_mask_gt = load_numpy(OUTPUT_IMAGE_MASK) + assert (output_mask == output_mask_gt).mean() > 0.95 + + +@skip_clone_repo_check +def test_demo(): + demo_main(is_test=True) diff --git a/qai_hub_models/models/ffnet_122ns_lowres/README.md b/qai_hub_models/models/ffnet_122ns_lowres/README.md index e4102369..6d34b21a 100644 --- a/qai_hub_models/models/ffnet_122ns_lowres/README.md +++ b/qai_hub_models/models/ffnet_122ns_lowres/README.md @@ -14,6 +14,8 @@ accross various devices, can be found [here](https://aihub.qualcomm.com/models/f a hosted Qualcomm® device. + + ## Example & Usage Install the package via pip: diff --git a/qai_hub_models/models/ffnet_122ns_lowres/export.py b/qai_hub_models/models/ffnet_122ns_lowres/export.py index 68bd1824..1bb5ca1b 100644 --- a/qai_hub_models/models/ffnet_122ns_lowres/export.py +++ b/qai_hub_models/models/ffnet_122ns_lowres/export.py @@ -119,12 +119,16 @@ def export_model( model.eval() source_model = torch.jit.trace(model.to("cpu"), make_torch_inputs(input_spec)) + # Convert outputs from channel last to channel first (preferred I/O format for QNN and TensorFlow Lite) + channel_last_flags = ( + " --force_channel_last_input image" + " --force_channel_last_output output_0" + if target_runtime != TargetRuntime.ORT + else "" + ) + # 2. 
Compile the model to an on-device asset model_compile_options = model.get_hub_compile_options( - target_runtime, - compile_options - + " --force_channel_last_input image" - + " --force_channel_last_output output_0", + target_runtime, compile_options + channel_last_flags, hub_device ) print(f"Optimizing model {model_name} to run on-device") submitted_compile_job = hub.submit_compile_job( @@ -162,8 +166,10 @@ def export_model( ) sample_inputs = model.sample_inputs(input_spec) # Convert inputs from channel first to channel last - hub_inputs = transpose_channel_first_to_last( - "image", sample_inputs, target_runtime + hub_inputs = ( + sample_inputs + if target_runtime == TargetRuntime.ORT + else transpose_channel_first_to_last("image", sample_inputs, target_runtime) ) submitted_inference_job = hub.submit_inference_job( model=compile_job.get_target_model(), @@ -191,8 +197,12 @@ def export_model( assert inference_job is not None and inference_job.wait().success inference_result: hub.client.DatasetEntries = inference_job.download_output_data() # type: ignore # Convert outputs from channel last to channel first - inference_result = transpose_channel_last_to_first( - "output_0", inference_result, target_runtime + inference_result = ( + inference_result + if target_runtime == TargetRuntime.ORT + else transpose_channel_last_to_first( + "output_0", inference_result, target_runtime + ) ) print_inference_metrics(inference_job, inference_result, torch_out) @@ -201,7 +211,7 @@ def export_model( def main(): warnings.filterwarnings("ignore") - parser = export_parser(model_cls=Model, supports_ort=False) + parser = export_parser(model_cls=Model) args = parser.parse_args() export_model(**vars(args)) diff --git a/qai_hub_models/models/ffnet_122ns_lowres/perf.yaml b/qai_hub_models/models/ffnet_122ns_lowres/perf.yaml index 66734ffc..af26da00 100644 --- a/qai_hub_models/models/ffnet_122ns_lowres/perf.yaml +++ b/qai_hub_models/models/ffnet_122ns_lowres/perf.yaml @@ -22,6 +22,7 @@ aggregated: - Samsung Galaxy S24 Ultra - Samsung Galaxy S24+ - Samsung Galaxy Tab S8 + - Snapdragon X Elite CRD - Xiaomi 12 - Xiaomi 12 Pro supported_chipsets: @@ -30,15 +31,16 @@ aggregated: - Snapdragon® 8 Gen 2 - Snapdragon® 8 Gen 3 - Snapdragon® 888 + - Snapdragon® X Elite models: - name: FFNet-122NS-LowRes performance_metrics: - torchscript_onnx_tflite: - inference_time: 9669.0 - throughput: 103.42331161443789 + inference_time: 9717.0 + throughput: 102.91242152927859 estimated_peak_memory_range: - min: 675840 - max: 2991672 + min: 651264 + max: 3155872 primary_compute_unit: NPU precision: fp16 layer_info: @@ -46,14 +48,14 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 216 - job_id: j1p3vwyng + job_id: jvgdv1ezg job_status: Passed torchscript_onnx_qnn: - inference_time: 10768.0 - throughput: 92.86775631500743 + inference_time: 10869.0 + throughput: 92.00478424878094 estimated_peak_memory_range: - min: 6320128 - max: 41702576 + min: 8364032 + max: 43265120 primary_compute_unit: NPU precision: fp16 layer_info: @@ -61,22 +63,22 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 348 - job_id: j1pv09jr5 + job_id: j0px1onlg job_status: Passed torchscript_onnx_ort: - inference_time: 7374.0 - throughput: 135.61160835367508 + inference_time: 7858.0 + throughput: 127.25884448969204 estimated_peak_memory_range: - min: 1433600 - max: 142206056 + min: 2232320 + max: 141084128 primary_compute_unit: NPU precision: fp16 layer_info: - layers_on_npu: 1 + layers_on_npu: 350 layers_on_gpu: 0 layers_on_cpu: 0 - total_layers: 1 - job_id: 
jlpeeljvp + total_layers: 350 + job_id: jep2my0q5 job_status: Passed reference_device_info: name: Samsung Galaxy S23 @@ -85,13 +87,13 @@ models: os_name: Android manufacturer: Samsung chipset: Snapdragon® 8 Gen 2 - timestamp: '2024-04-23T18:42:31.654178Z' + timestamp: '2024-05-20T16:35:29.021584Z' - torchscript_onnx_tflite: - inference_time: 6839.0 - throughput: 146.22020763269484 + inference_time: 6794.0 + throughput: 147.18869590815424 estimated_peak_memory_range: - min: 569344 - max: 59671696 + min: 303104 + max: 60447344 primary_compute_unit: NPU precision: fp16 layer_info: @@ -99,14 +101,14 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 216 - job_id: jwgok4jkp + job_id: jz57dr095 job_status: Passed torchscript_onnx_qnn: - inference_time: 7605.0 - throughput: 131.49243918474687 + inference_time: 7585.0 + throughput: 131.83915622940015 estimated_peak_memory_range: min: 6307840 - max: 88354272 + max: 88988128 primary_compute_unit: NPU precision: fp16 layer_info: @@ -114,22 +116,22 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 348 - job_id: j7gjzwje5 + job_id: jo5mzxq9p job_status: Passed torchscript_onnx_ort: - inference_time: 5809.0 - throughput: 172.14666896195558 + inference_time: 5761.0 + throughput: 173.58097552508247 estimated_peak_memory_range: - min: 61464576 - max: 106276496 + min: 5238784 + max: 60652944 primary_compute_unit: NPU precision: fp16 layer_info: - layers_on_npu: 1 + layers_on_npu: 350 layers_on_gpu: 0 layers_on_cpu: 0 - total_layers: 1 - job_id: jygzo41x5 + total_layers: 350 + job_id: jqpyd3rlp job_status: Passed reference_device_info: name: Samsung Galaxy S24 @@ -138,13 +140,13 @@ models: os_name: Android manufacturer: Samsung chipset: Snapdragon® 8 Gen 3 - timestamp: '2024-04-23T18:42:31.654276Z' + timestamp: '2024-05-20T16:35:29.021611Z' - torchscript_onnx_tflite: - inference_time: 9658.0 - throughput: 103.54110581901014 + inference_time: 9668.0 + throughput: 103.4340091021928 estimated_peak_memory_range: - min: 0 - max: 4034800 + min: 651264 + max: 2883976 primary_compute_unit: NPU precision: fp16 layer_info: @@ -152,14 +154,14 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 216 - job_id: jqpyry105 + job_id: jqp4wrk1g job_status: Passed torchscript_onnx_qnn: - inference_time: 10822.0 - throughput: 92.40436148586214 + inference_time: 10900.0 + throughput: 91.74311926605505 estimated_peak_memory_range: - min: 6328320 - max: 38539008 + min: 6332416 + max: 40664968 primary_compute_unit: NPU precision: fp16 layer_info: @@ -167,7 +169,7 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 348 - job_id: jn5qed0e5 + job_id: jopry387g job_status: Passed reference_device_info: name: QCS8550 (Proxy) @@ -176,4 +178,57 @@ models: os_name: Android manufacturer: Qualcomm chipset: Qcs8550 - timestamp: '2024-04-23T18:42:31.654354Z' + timestamp: '2024-05-20T16:35:29.021627Z' + - torchscript_onnx_qnn: + inference_time: 17551.0 + throughput: 56.976810438151674 + estimated_peak_memory_range: + min: 6303744 + max: 6303744 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 348 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 348 + job_id: jegnevlqg + job_status: Passed + torchscript_onnx_ort: + inference_time: 7536.0 + throughput: 132.6963906581741 + estimated_peak_memory_range: + min: 6365184 + max: 6365184 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 350 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 350 + job_id: j2p0r03np + job_status: Passed + 
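A quick note for reading the perf.yaml hunks in this diff: `inference_time` is the per-inference latency in microseconds and `throughput` is simply its reciprocal in inferences per second. A minimal sketch of that relationship (the helper name is illustrative, not part of the repository):

```python
# Illustrative only: perf.yaml pairs inference_time (microseconds) with
# throughput (inferences per second); the two fields are reciprocals.
def throughput_from_inference_time_us(inference_time_us: float) -> float:
    return 1_000_000.0 / inference_time_us

# Example: the Snapdragon X Elite torchscript_onnx_qnn entry above reports
# inference_time 17551.0 us, i.e. roughly 56.98 inferences per second.
print(throughput_from_inference_time_us(17551.0))  # 56.976810438151674
```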
torchscript_onnx_ort_dml_gpu: + inference_time: 38423.0 + throughput: 26.026078130286546 + estimated_peak_memory_range: + min: 6307840 + max: 6307840 + primary_compute_unit: CPU + precision: fp32 + layer_info: + layers_on_npu: 0 + layers_on_gpu: 0 + layers_on_cpu: 151 + total_layers: 151 + job_id: j1p87y0o5 + job_status: Passed + reference_device_info: + name: Snapdragon X Elite CRD + os: '11' + form_factor: Compute + os_name: Windows + manufacturer: Qualcomm + chipset: Snapdragon® X Elite + timestamp: '2024-05-20T16:35:29.021650Z' diff --git a/qai_hub_models/models/ffnet_40s/README.md b/qai_hub_models/models/ffnet_40s/README.md index 1e3b56a5..f1911ec4 100644 --- a/qai_hub_models/models/ffnet_40s/README.md +++ b/qai_hub_models/models/ffnet_40s/README.md @@ -14,6 +14,8 @@ accross various devices, can be found [here](https://aihub.qualcomm.com/models/f a hosted Qualcomm® device. + + ## Example & Usage Install the package via pip: diff --git a/qai_hub_models/models/ffnet_40s/export.py b/qai_hub_models/models/ffnet_40s/export.py index 8fe5c587..fd46c18f 100644 --- a/qai_hub_models/models/ffnet_40s/export.py +++ b/qai_hub_models/models/ffnet_40s/export.py @@ -119,12 +119,16 @@ def export_model( model.eval() source_model = torch.jit.trace(model.to("cpu"), make_torch_inputs(input_spec)) + # Convert outputs from channel last to channel first (preferred I/O format for QNN and TensorFlow Lite) + channel_last_flags = ( + " --force_channel_last_input image" + " --force_channel_last_output output_0" + if target_runtime != TargetRuntime.ORT + else "" + ) + # 2. Compile the model to an on-device asset model_compile_options = model.get_hub_compile_options( - target_runtime, - compile_options - + " --force_channel_last_input image" - + " --force_channel_last_output output_0", + target_runtime, compile_options + channel_last_flags, hub_device ) print(f"Optimizing model {model_name} to run on-device") submitted_compile_job = hub.submit_compile_job( @@ -162,8 +166,10 @@ def export_model( ) sample_inputs = model.sample_inputs(input_spec) # Convert inputs from channel first to channel last - hub_inputs = transpose_channel_first_to_last( - "image", sample_inputs, target_runtime + hub_inputs = ( + sample_inputs + if target_runtime == TargetRuntime.ORT + else transpose_channel_first_to_last("image", sample_inputs, target_runtime) ) submitted_inference_job = hub.submit_inference_job( model=compile_job.get_target_model(), @@ -191,8 +197,12 @@ def export_model( assert inference_job is not None and inference_job.wait().success inference_result: hub.client.DatasetEntries = inference_job.download_output_data() # type: ignore # Convert outputs from channel last to channel first - inference_result = transpose_channel_last_to_first( - "output_0", inference_result, target_runtime + inference_result = ( + inference_result + if target_runtime == TargetRuntime.ORT + else transpose_channel_last_to_first( + "output_0", inference_result, target_runtime + ) ) print_inference_metrics(inference_job, inference_result, torch_out) @@ -201,7 +211,7 @@ def export_model( def main(): warnings.filterwarnings("ignore") - parser = export_parser(model_cls=Model, supports_ort=False) + parser = export_parser(model_cls=Model) args = parser.parse_args() export_model(**vars(args)) diff --git a/qai_hub_models/models/ffnet_40s/perf.yaml b/qai_hub_models/models/ffnet_40s/perf.yaml index ee5c5c18..d8dea744 100644 --- a/qai_hub_models/models/ffnet_40s/perf.yaml +++ b/qai_hub_models/models/ffnet_40s/perf.yaml @@ -22,6 +22,7 @@ aggregated: - Samsung 
Galaxy S24 Ultra - Samsung Galaxy S24+ - Samsung Galaxy Tab S8 + - Snapdragon X Elite CRD - Xiaomi 12 - Xiaomi 12 Pro supported_chipsets: @@ -30,15 +31,16 @@ aggregated: - Snapdragon® 8 Gen 2 - Snapdragon® 8 Gen 3 - Snapdragon® 888 + - Snapdragon® X Elite models: - name: FFNet-40S performance_metrics: - torchscript_onnx_tflite: - inference_time: 23048.0 - throughput: 43.38771259979174 + inference_time: 23181.0 + throughput: 43.138777447047154 estimated_peak_memory_range: - min: 0 - max: 30911488 + min: 2527232 + max: 5196976 primary_compute_unit: NPU precision: fp16 layer_info: @@ -46,14 +48,14 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 92 - job_id: jmg9jx685 + job_id: jogkyx7np job_status: Passed torchscript_onnx_qnn: - inference_time: 17363.0 - throughput: 57.59373380176237 + inference_time: 17245.0 + throughput: 57.98782255726297 estimated_peak_memory_range: - min: 25214976 - max: 44166488 + min: 1662976 + max: 17190312 primary_compute_unit: NPU precision: fp16 layer_info: @@ -61,22 +63,22 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 140 - job_id: jvgdezjz5 + job_id: jw5614zyp job_status: Passed torchscript_onnx_ort: - inference_time: 28590.0 - throughput: 34.97726477789437 + inference_time: 27135.0 + throughput: 36.852773171181134 estimated_peak_memory_range: - min: 30191616 - max: 118917360 + min: 33619968 + max: 118794368 primary_compute_unit: NPU precision: fp16 layer_info: - layers_on_npu: 1 + layers_on_npu: 142 layers_on_gpu: 0 layers_on_cpu: 0 - total_layers: 1 - job_id: jmg9jx6m5 + total_layers: 142 + job_id: j7gjln2ep job_status: Passed reference_device_info: name: Samsung Galaxy S23 @@ -85,13 +87,13 @@ models: os_name: Android manufacturer: Samsung chipset: Snapdragon® 8 Gen 2 - timestamp: '2024-04-23T18:42:31.678643Z' + timestamp: '2024-05-20T16:35:29.051849Z' - torchscript_onnx_tflite: - inference_time: 16867.0 - throughput: 59.28736586233474 + inference_time: 16628.0 + throughput: 60.13952369497233 estimated_peak_memory_range: - min: 32768 - max: 105460576 + min: 65536 + max: 96903808 primary_compute_unit: NPU precision: fp16 layer_info: @@ -99,14 +101,14 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 92 - job_id: jnp1yvr7p + job_id: jn5q2qeo5 job_status: Passed torchscript_onnx_qnn: - inference_time: 12552.0 - throughput: 79.66857871255577 + inference_time: 12571.0 + throughput: 79.54816641476414 estimated_peak_memory_range: - min: 25202688 - max: 84533840 + min: 25198592 + max: 80803488 primary_compute_unit: NPU precision: fp16 layer_info: @@ -114,22 +116,22 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 140 - job_id: jz5w21j45 + job_id: j1p3m01ng job_status: Passed torchscript_onnx_ort: - inference_time: 20354.0 - throughput: 49.13039206052864 + inference_time: 19730.0 + throughput: 50.68423720223011 estimated_peak_memory_range: - min: 352256 - max: 45279760 + min: 32903168 + max: 79929760 primary_compute_unit: NPU precision: fp16 layer_info: - layers_on_npu: 1 + layers_on_npu: 142 layers_on_gpu: 0 layers_on_cpu: 0 - total_layers: 1 - job_id: jnp1yvrnp + total_layers: 142 + job_id: jlpevmwv5 job_status: Passed reference_device_info: name: Samsung Galaxy S24 @@ -138,13 +140,13 @@ models: os_name: Android manufacturer: Samsung chipset: Snapdragon® 8 Gen 3 - timestamp: '2024-04-23T18:42:31.678725Z' + timestamp: '2024-05-20T16:35:29.051876Z' - torchscript_onnx_tflite: - inference_time: 22456.0 - throughput: 44.53152832205201 + inference_time: 23514.0 + throughput: 42.527855745513314 estimated_peak_memory_range: - min: 
32768 - max: 1647568 + min: 2555904 + max: 4820560 primary_compute_unit: NPU precision: fp16 layer_info: @@ -152,14 +154,14 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 92 - job_id: jygzo0245 + job_id: j1glkm2mp job_status: Passed torchscript_onnx_qnn: - inference_time: 17241.0 - throughput: 58.00127602807262 + inference_time: 17349.0 + throughput: 57.64020981036371 estimated_peak_memory_range: - min: 25214976 - max: 52246888 + min: 25227264 + max: 46301352 primary_compute_unit: NPU precision: fp16 layer_info: @@ -167,7 +169,7 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 140 - job_id: jvgdemn65 + job_id: j1pvwkrrg job_status: Passed reference_device_info: name: QCS8550 (Proxy) @@ -176,4 +178,57 @@ models: os_name: Android manufacturer: Qualcomm chipset: Qcs8550 - timestamp: '2024-04-23T18:42:31.678770Z' + timestamp: '2024-05-20T16:35:29.051893Z' + - torchscript_onnx_qnn: + inference_time: 23285.0 + throughput: 42.94610264118531 + estimated_peak_memory_range: + min: 25214976 + max: 25214976 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 140 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 140 + job_id: jwgov6nk5 + job_status: Passed + torchscript_onnx_ort: + inference_time: 26395.0 + throughput: 37.885963250615646 + estimated_peak_memory_range: + min: 25223168 + max: 25223168 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 142 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 142 + job_id: jygz7djxp + job_status: Passed + torchscript_onnx_ort_dml_gpu: + inference_time: 143723.0 + throughput: 6.957828600850247 + estimated_peak_memory_range: + min: 208834560 + max: 208834560 + primary_compute_unit: CPU + precision: fp32 + layer_info: + layers_on_npu: 0 + layers_on_gpu: 0 + layers_on_cpu: 67 + total_layers: 67 + job_id: jz5w963mp + job_status: Passed + reference_device_info: + name: Snapdragon X Elite CRD + os: '11' + form_factor: Compute + os_name: Windows + manufacturer: Qualcomm + chipset: Snapdragon® X Elite + timestamp: '2024-05-20T16:35:29.051916Z' diff --git a/qai_hub_models/models/ffnet_40s_quantized/README.md b/qai_hub_models/models/ffnet_40s_quantized/README.md index bafe50ad..7767cf30 100644 --- a/qai_hub_models/models/ffnet_40s_quantized/README.md +++ b/qai_hub_models/models/ffnet_40s_quantized/README.md @@ -14,6 +14,8 @@ accross various devices, can be found [here](https://aihub.qualcomm.com/models/f a hosted Qualcomm® device. + + ## Example & Usage Install the package via pip: diff --git a/qai_hub_models/models/ffnet_40s_quantized/export.py b/qai_hub_models/models/ffnet_40s_quantized/export.py index a40f6ed5..c61ae011 100644 --- a/qai_hub_models/models/ffnet_40s_quantized/export.py +++ b/qai_hub_models/models/ffnet_40s_quantized/export.py @@ -123,12 +123,16 @@ def export_model( else: quant_calibration_data = model.get_calibration_data(target_runtime, input_spec) + # Convert outputs from channel last to channel first (preferred I/O format for QNN and TensorFlow Lite) + channel_last_flags = ( + " --force_channel_last_input image" + " --force_channel_last_output output_0" + if target_runtime != TargetRuntime.ORT + else "" + ) + # 2. 
Compile the model to an on-device asset model_compile_options = model.get_hub_compile_options( - target_runtime, - compile_options - + " --force_channel_last_input image" - + " --force_channel_last_output output_0", + target_runtime, compile_options + channel_last_flags, hub_device ) print(f"Optimizing model {model_name} to run on-device") submitted_compile_job = hub.submit_compile_job( @@ -170,8 +174,10 @@ def export_model( if target_runtime == TargetRuntime.QNN: hub_inputs = get_qnn_inputs(compile_job, sample_inputs) # Convert inputs from channel first to channel last - hub_inputs = transpose_channel_first_to_last( - "image", sample_inputs, target_runtime + hub_inputs = ( + sample_inputs + if target_runtime == TargetRuntime.ORT + else transpose_channel_first_to_last("image", sample_inputs, target_runtime) ) submitted_inference_job = hub.submit_inference_job( model=compile_job.get_target_model(), @@ -199,8 +205,12 @@ def export_model( assert inference_job is not None and inference_job.wait().success inference_result: hub.client.DatasetEntries = inference_job.download_output_data() # type: ignore # Convert outputs from channel last to channel first - inference_result = transpose_channel_last_to_first( - "output_0", inference_result, target_runtime + inference_result = ( + inference_result + if target_runtime == TargetRuntime.ORT + else transpose_channel_last_to_first( + "output_0", inference_result, target_runtime + ) ) print_inference_metrics(inference_job, inference_result, torch_out) @@ -209,7 +219,7 @@ def export_model( def main(): warnings.filterwarnings("ignore") - parser = export_parser(model_cls=Model, supports_qnn=False, supports_ort=False) + parser = export_parser(model_cls=Model, supports_qnn=False) args = parser.parse_args() export_model(**vars(args)) diff --git a/qai_hub_models/models/ffnet_40s_quantized/perf.yaml b/qai_hub_models/models/ffnet_40s_quantized/perf.yaml index a4fab1f8..79fff9d7 100644 --- a/qai_hub_models/models/ffnet_40s_quantized/perf.yaml +++ b/qai_hub_models/models/ffnet_40s_quantized/perf.yaml @@ -26,6 +26,7 @@ aggregated: - Samsung Galaxy S24 Ultra - Samsung Galaxy S24+ - Samsung Galaxy Tab S8 + - Snapdragon X Elite CRD - Xiaomi 12 - Xiaomi 12 Pro supported_chipsets: @@ -36,15 +37,16 @@ aggregated: - Snapdragon® 8 Gen 2 - Snapdragon® 8 Gen 3 - Snapdragon® 888 + - Snapdragon® X Elite models: - name: FFNet-40S-Quantized performance_metrics: - torchscript_onnx_tflite: - inference_time: 6424.0 - throughput: 155.6662515566625 + inference_time: 6448.0 + throughput: 155.08684863523573 estimated_peak_memory_range: - min: 651264 - max: 25140680 + min: 823296 + max: 2440760 primary_compute_unit: NPU precision: int8 layer_info: @@ -52,22 +54,37 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 97 - job_id: jz5707qng + job_id: jmg94ny85 + job_status: Passed + torchscript_onnx_qnn: + inference_time: 4328.0 + throughput: 231.0536044362292 + estimated_peak_memory_range: + min: 6303744 + max: 20703816 + primary_compute_unit: NPU + precision: int8 + layer_info: + layers_on_npu: 89 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 89 + job_id: jz57drl95 job_status: Passed torchscript_onnx_ort: - inference_time: 50173.0 - throughput: 19.93103860642178 + inference_time: 11529.0 + throughput: 86.73779165582444 estimated_peak_memory_range: - min: 29384704 - max: 58656168 + min: 25239552 + max: 52880320 primary_compute_unit: NPU precision: int8 layer_info: - layers_on_npu: 1 + layers_on_npu: 94 layers_on_gpu: 0 layers_on_cpu: 0 - total_layers: 1 - job_id: j0pxndw85 + 
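The export.py hunks above all apply the same change: the `--force_channel_last_input` / `--force_channel_last_output` compile flags, and the matching input/output transposes, are now skipped when the target runtime is ONNX Runtime, since only TFLite and QNN prefer channel-last I/O. A small self-contained sketch of that branching, using a stand-in `TargetRuntime` enum (the real one lives in qai_hub_models; the flag strings are copied from the hunks above):

```python
# Stand-in sketch of the channel_last_flags branching added to each export.py.
from enum import Enum, auto

class TargetRuntime(Enum):  # stand-in for qai_hub_models' TargetRuntime
    TFLITE = auto()
    QNN = auto()
    ORT = auto()

def channel_last_flags(target_runtime: TargetRuntime) -> str:
    # TFLite and QNN prefer channel-last (NHWC) I/O, so the transposes are
    # forced at compile time; ONNX Runtime keeps the channel-first layout.
    return (
        " --force_channel_last_input image"
        " --force_channel_last_output output_0"
        if target_runtime != TargetRuntime.ORT
        else ""
    )

for rt in TargetRuntime:
    print(rt.name, repr(channel_last_flags(rt)))
```

The same `target_runtime == TargetRuntime.ORT` test is why `transpose_channel_first_to_last` and `transpose_channel_last_to_first` are bypassed for the ONNX path later in each file.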
total_layers: 94 + job_id: jegnevmqg job_status: Passed reference_device_info: name: Samsung Galaxy S23 @@ -76,13 +93,13 @@ models: os_name: Android manufacturer: Samsung chipset: Snapdragon® 8 Gen 2 - timestamp: '2024-04-23T18:42:32.012713Z' + timestamp: '2024-05-20T16:35:29.082174Z' - torchscript_onnx_tflite: inference_time: 4623.0 throughput: 216.3097555699762 estimated_peak_memory_range: - min: 20480 - max: 67550048 + min: 36864 + max: 67842848 primary_compute_unit: NPU precision: int8 layer_info: @@ -90,22 +107,37 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 97 - job_id: jqp4k9z2g + job_id: jnp18zw7g + job_status: Passed + torchscript_onnx_qnn: + inference_time: 3154.0 + throughput: 317.0577045022194 + estimated_peak_memory_range: + min: 6311936 + max: 56225968 + primary_compute_unit: NPU + precision: int8 + layer_info: + layers_on_npu: 89 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 89 + job_id: jqp4wrd1g job_status: Passed torchscript_onnx_ort: - inference_time: 31095.0 - throughput: 32.15951117543013 + inference_time: 8449.0 + throughput: 118.35720203574388 estimated_peak_memory_range: - min: 31465472 - max: 65073664 + min: 29212672 + max: 64461296 primary_compute_unit: NPU precision: int8 layer_info: - layers_on_npu: 1 + layers_on_npu: 94 layers_on_gpu: 0 layers_on_cpu: 0 - total_layers: 1 - job_id: jo5mqdj7p + total_layers: 94 + job_id: jopry327g job_status: Passed reference_device_info: name: Samsung Galaxy S24 @@ -114,88 +146,156 @@ models: os_name: Android manufacturer: Samsung chipset: Snapdragon® 8 Gen 3 - timestamp: '2024-04-23T18:42:32.012866Z' + timestamp: '2024-05-20T16:35:29.082201Z' - torchscript_onnx_tflite: - inference_time: 46106.0 - throughput: 21.68915108662647 + inference_time: 6431.0 + throughput: 155.49681231534754 estimated_peak_memory_range: - min: 12288 - max: 52922016 + min: 651264 + max: 2546568 primary_compute_unit: NPU precision: int8 layer_info: - layers_on_npu: 99 + layers_on_npu: 97 layers_on_gpu: 0 layers_on_cpu: 0 - total_layers: 99 - job_id: jegnlw0j5 + total_layers: 97 + job_id: jvgdv1qzg job_status: Passed - torchscript_onnx_ort: - inference_time: 362244.0 - throughput: 2.7605702233853426 + torchscript_onnx_qnn: + inference_time: 4293.0 + throughput: 232.93733985557884 estimated_peak_memory_range: - min: 159432704 - max: 207613904 - primary_compute_unit: CPU - precision: fp32 + min: 6332416 + max: 23257352 + primary_compute_unit: NPU + precision: int8 layer_info: - layers_on_npu: 0 + layers_on_npu: 89 layers_on_gpu: 0 - layers_on_cpu: 92 - total_layers: 92 - job_id: jegnl7jj5 + layers_on_cpu: 0 + total_layers: 89 + job_id: jo5mzx69p job_status: Passed reference_device_info: - name: RB3 Gen 2 (Proxy) + name: QCS8550 (Proxy) os: '12' form_factor: Iot os_name: Android manufacturer: Qualcomm - chipset: Qcs6490 - timestamp: '2024-04-23T18:42:32.013041Z' + chipset: Qcs8550 + timestamp: '2024-05-20T16:35:29.082219Z' - torchscript_onnx_tflite: - inference_time: 206934.0 - throughput: 4.832458658316178 + inference_time: 35053.0 + throughput: 28.528228682281117 + estimated_peak_memory_range: + min: 147456 + max: 42857344 + primary_compute_unit: NPU + precision: int8 + layer_info: + layers_on_npu: 97 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 97 + job_id: j1gl3818g + job_status: Passed + torchscript_onnx_qnn: + inference_time: 28024.0 + throughput: 35.68369968598344 estimated_peak_memory_range: - min: 2678784 - max: 4932640 + min: 6324224 + max: 55642400 primary_compute_unit: NPU precision: int8 layer_info: - layers_on_npu: 
99 + layers_on_npu: 89 layers_on_gpu: 0 layers_on_cpu: 0 - total_layers: 99 - job_id: jep29omxg + total_layers: 89 + job_id: jlpekxl1p job_status: Passed reference_device_info: - name: RB5 (Proxy) + name: RB3 Gen 2 (Proxy) os: '12' form_factor: Iot os_name: Android manufacturer: Qualcomm - chipset: Qcs8250 - timestamp: '2024-04-23T18:42:32.013136Z' + chipset: Qcs6490 + timestamp: '2024-05-20T16:35:29.082235Z' - torchscript_onnx_tflite: - inference_time: 8927.0 - throughput: 112.0197154699227 + inference_time: 186982.0 + throughput: 5.348108374068092 estimated_peak_memory_range: - min: 2711552 - max: 19152008 + min: 774144 + max: 11267904 primary_compute_unit: NPU precision: int8 layer_info: - layers_on_npu: 99 + layers_on_npu: 97 layers_on_gpu: 0 layers_on_cpu: 0 - total_layers: 99 - job_id: jopr876k5 + total_layers: 97 + job_id: jw56nmd0g job_status: Passed reference_device_info: - name: QCS8550 (Proxy) + name: RB5 (Proxy) os: '12' form_factor: Iot os_name: Android manufacturer: Qualcomm - chipset: Qcs8550 - timestamp: '2024-04-23T18:42:32.013229Z' + chipset: Qcs8250 + timestamp: '2024-05-20T16:35:29.082246Z' + - torchscript_onnx_qnn: + inference_time: 5258.0 + throughput: 190.1863826550019 + estimated_peak_memory_range: + min: 6303744 + max: 6303744 + primary_compute_unit: NPU + precision: int8 + layer_info: + layers_on_npu: 89 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 89 + job_id: j0px1o6lg + job_status: Passed + torchscript_onnx_ort: + inference_time: 10886.0 + throughput: 91.86110600771633 + estimated_peak_memory_range: + min: 25223168 + max: 25223168 + primary_compute_unit: NPU + precision: int8 + layer_info: + layers_on_npu: 94 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 94 + job_id: jep2my9q5 + job_status: Passed + torchscript_onnx_ort_dml_gpu: + inference_time: 551323.0 + throughput: 1.813818759601903 + estimated_peak_memory_range: + min: 204230656 + max: 204230656 + primary_compute_unit: GPU + precision: int8 + layer_info: + layers_on_npu: 0 + layers_on_gpu: 1 + layers_on_cpu: 0 + total_layers: 1 + job_id: jqpyd3jlp + job_status: Passed + reference_device_info: + name: Snapdragon X Elite CRD + os: '11' + form_factor: Compute + os_name: Windows + manufacturer: Qualcomm + chipset: Snapdragon® X Elite + timestamp: '2024-05-20T16:35:29.082269Z' diff --git a/qai_hub_models/models/ffnet_54s/README.md b/qai_hub_models/models/ffnet_54s/README.md index b1f072c6..6aea8fe7 100644 --- a/qai_hub_models/models/ffnet_54s/README.md +++ b/qai_hub_models/models/ffnet_54s/README.md @@ -14,6 +14,8 @@ accross various devices, can be found [here](https://aihub.qualcomm.com/models/f a hosted Qualcomm® device. + + ## Example & Usage Install the package via pip: diff --git a/qai_hub_models/models/ffnet_54s/export.py b/qai_hub_models/models/ffnet_54s/export.py index b8f1207c..4acec5b3 100644 --- a/qai_hub_models/models/ffnet_54s/export.py +++ b/qai_hub_models/models/ffnet_54s/export.py @@ -119,12 +119,16 @@ def export_model( model.eval() source_model = torch.jit.trace(model.to("cpu"), make_torch_inputs(input_spec)) + # Convert outputs from channel last to channel first (preferred I/O format for QNN and TensorFlow Lite) + channel_last_flags = ( + " --force_channel_last_input image" + " --force_channel_last_output output_0" + if target_runtime != TargetRuntime.ORT + else "" + ) + # 2. 
Compile the model to an on-device asset model_compile_options = model.get_hub_compile_options( - target_runtime, - compile_options - + " --force_channel_last_input image" - + " --force_channel_last_output output_0", + target_runtime, compile_options + channel_last_flags, hub_device ) print(f"Optimizing model {model_name} to run on-device") submitted_compile_job = hub.submit_compile_job( @@ -162,8 +166,10 @@ def export_model( ) sample_inputs = model.sample_inputs(input_spec) # Convert inputs from channel first to channel last - hub_inputs = transpose_channel_first_to_last( - "image", sample_inputs, target_runtime + hub_inputs = ( + sample_inputs + if target_runtime == TargetRuntime.ORT + else transpose_channel_first_to_last("image", sample_inputs, target_runtime) ) submitted_inference_job = hub.submit_inference_job( model=compile_job.get_target_model(), @@ -191,8 +197,12 @@ def export_model( assert inference_job is not None and inference_job.wait().success inference_result: hub.client.DatasetEntries = inference_job.download_output_data() # type: ignore # Convert outputs from channel last to channel first - inference_result = transpose_channel_last_to_first( - "output_0", inference_result, target_runtime + inference_result = ( + inference_result + if target_runtime == TargetRuntime.ORT + else transpose_channel_last_to_first( + "output_0", inference_result, target_runtime + ) ) print_inference_metrics(inference_job, inference_result, torch_out) @@ -201,7 +211,7 @@ def export_model( def main(): warnings.filterwarnings("ignore") - parser = export_parser(model_cls=Model, supports_ort=False) + parser = export_parser(model_cls=Model) args = parser.parse_args() export_model(**vars(args)) diff --git a/qai_hub_models/models/ffnet_54s/perf.yaml b/qai_hub_models/models/ffnet_54s/perf.yaml index aff4821c..8826e4ad 100644 --- a/qai_hub_models/models/ffnet_54s/perf.yaml +++ b/qai_hub_models/models/ffnet_54s/perf.yaml @@ -22,6 +22,7 @@ aggregated: - Samsung Galaxy S24 Ultra - Samsung Galaxy S24+ - Samsung Galaxy Tab S8 + - Snapdragon X Elite CRD - Xiaomi 12 - Xiaomi 12 Pro supported_chipsets: @@ -30,15 +31,16 @@ aggregated: - Snapdragon® 8 Gen 2 - Snapdragon® 8 Gen 3 - Snapdragon® 888 + - Snapdragon® X Elite models: - name: FFNet-54S performance_metrics: - torchscript_onnx_tflite: - inference_time: 25024.0 - throughput: 39.9616368286445 + inference_time: 25556.0 + throughput: 39.12975426514321 estimated_peak_memory_range: - min: 2580480 - max: 5287928 + min: 2527232 + max: 5075256 primary_compute_unit: NPU precision: fp16 layer_info: @@ -46,14 +48,14 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 113 - job_id: jopr8nzk5 + job_id: j2p0r02np job_status: Passed torchscript_onnx_qnn: - inference_time: 19758.0 - throughput: 50.61241016297196 + inference_time: 20540.0 + throughput: 48.685491723466406 estimated_peak_memory_range: - min: 25214976 - max: 48724312 + min: 25178112 + max: 46235888 primary_compute_unit: NPU precision: fp16 layer_info: @@ -61,22 +63,22 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 175 - job_id: jqpyr7905 + job_id: jn5q2qro5 job_status: Passed torchscript_onnx_ort: - inference_time: 30799.0 - throughput: 32.46858664242345 + inference_time: 30453.0 + throughput: 32.837487275473684 estimated_peak_memory_range: - min: 30203904 - max: 103625272 + min: 33370112 + max: 130933960 primary_compute_unit: NPU precision: fp16 layer_info: - layers_on_npu: 1 + layers_on_npu: 177 layers_on_gpu: 0 layers_on_cpu: 0 - total_layers: 1 - job_id: j1p804oqg + total_layers: 177 + 
job_id: jwgov63k5 job_status: Passed reference_device_info: name: Samsung Galaxy S23 @@ -85,13 +87,13 @@ models: os_name: Android manufacturer: Samsung chipset: Snapdragon® 8 Gen 2 - timestamp: '2024-04-23T18:42:32.053359Z' + timestamp: '2024-05-20T16:35:29.121526Z' - torchscript_onnx_tflite: - inference_time: 18446.0 - throughput: 54.21229534858506 + inference_time: 18475.0 + throughput: 54.12719891745602 estimated_peak_memory_range: - min: 1429504 - max: 120768592 + min: 2248704 + max: 109217248 primary_compute_unit: NPU precision: fp16 layer_info: @@ -99,14 +101,14 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 113 - job_id: jep20v26g + job_id: j1p87ymo5 job_status: Passed torchscript_onnx_qnn: - inference_time: 14552.0 - throughput: 68.71907641561297 + inference_time: 14482.0 + throughput: 69.05123601712471 estimated_peak_memory_range: - min: 180420608 - max: 252953088 + min: 24494080 + max: 90410912 primary_compute_unit: NPU precision: fp16 layer_info: @@ -114,22 +116,22 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 175 - job_id: j2p03vy0p + job_id: j1glkm3mp job_status: Passed torchscript_onnx_ort: - inference_time: 23498.0 - throughput: 42.556813345816664 + inference_time: 23113.0 + throughput: 43.265694630727296 estimated_peak_memory_range: - min: 30953472 - max: 85531952 + min: 29417472 + max: 74020448 primary_compute_unit: NPU precision: fp16 layer_info: - layers_on_npu: 1 + layers_on_npu: 177 layers_on_gpu: 0 layers_on_cpu: 0 - total_layers: 1 - job_id: jogk79zvp + total_layers: 177 + job_id: j1pvwkvrg job_status: Passed reference_device_info: name: Samsung Galaxy S24 @@ -138,13 +140,13 @@ models: os_name: Android manufacturer: Samsung chipset: Snapdragon® 8 Gen 3 - timestamp: '2024-04-23T18:42:32.053423Z' + timestamp: '2024-05-20T16:35:29.121553Z' - torchscript_onnx_tflite: - inference_time: 25045.0 - throughput: 39.92812936713915 + inference_time: 25895.0 + throughput: 38.61749372465727 estimated_peak_memory_range: - min: 2555904 - max: 5156288 + min: 2523136 + max: 5051104 primary_compute_unit: NPU precision: fp16 layer_info: @@ -152,14 +154,14 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 113 - job_id: jn5qedee5 + job_id: jogkyxqnp job_status: Passed torchscript_onnx_qnn: - inference_time: 19986.0 - throughput: 50.035024517162014 + inference_time: 20155.0 + throughput: 49.61548002976929 estimated_peak_memory_range: min: 25214976 - max: 55043864 + max: 43633496 primary_compute_unit: NPU precision: fp16 layer_info: @@ -167,7 +169,7 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 175 - job_id: jwgok9k1p + job_id: j1p3m0eng job_status: Passed reference_device_info: name: QCS8550 (Proxy) @@ -176,4 +178,57 @@ models: os_name: Android manufacturer: Qualcomm chipset: Qcs8550 - timestamp: '2024-04-23T18:42:32.053476Z' + timestamp: '2024-05-20T16:35:29.121570Z' + - torchscript_onnx_qnn: + inference_time: 25810.0 + throughput: 38.74467260751646 + estimated_peak_memory_range: + min: 25219072 + max: 25219072 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 175 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 175 + job_id: jw5614nyp + job_status: Passed + torchscript_onnx_ort: + inference_time: 29548.0 + throughput: 33.84323812102342 + estimated_peak_memory_range: + min: 25219072 + max: 25219072 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 177 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 177 + job_id: j7gjlneep + job_status: Passed + torchscript_onnx_ort_dml_gpu: + 
inference_time: 176490.0 + throughput: 5.666043401892458 + estimated_peak_memory_range: + min: 414695424 + max: 414695424 + primary_compute_unit: CPU + precision: fp32 + layer_info: + layers_on_npu: 0 + layers_on_gpu: 0 + layers_on_cpu: 81 + total_layers: 81 + job_id: jlpevmkv5 + job_status: Passed + reference_device_info: + name: Snapdragon X Elite CRD + os: '11' + form_factor: Compute + os_name: Windows + manufacturer: Qualcomm + chipset: Snapdragon® X Elite + timestamp: '2024-05-20T16:35:29.121592Z' diff --git a/qai_hub_models/models/ffnet_54s_quantized/README.md b/qai_hub_models/models/ffnet_54s_quantized/README.md index ce069620..9f4d0a9c 100644 --- a/qai_hub_models/models/ffnet_54s_quantized/README.md +++ b/qai_hub_models/models/ffnet_54s_quantized/README.md @@ -14,6 +14,8 @@ accross various devices, can be found [here](https://aihub.qualcomm.com/models/f a hosted Qualcomm® device. + + ## Example & Usage Install the package via pip: diff --git a/qai_hub_models/models/ffnet_54s_quantized/export.py b/qai_hub_models/models/ffnet_54s_quantized/export.py index 04980844..5deb7808 100644 --- a/qai_hub_models/models/ffnet_54s_quantized/export.py +++ b/qai_hub_models/models/ffnet_54s_quantized/export.py @@ -123,12 +123,16 @@ def export_model( else: quant_calibration_data = model.get_calibration_data(target_runtime, input_spec) + # Convert outputs from channel last to channel first (preferred I/O format for QNN and TensorFlow Lite) + channel_last_flags = ( + " --force_channel_last_input image" + " --force_channel_last_output output_0" + if target_runtime != TargetRuntime.ORT + else "" + ) + # 2. Compile the model to an on-device asset model_compile_options = model.get_hub_compile_options( - target_runtime, - compile_options - + " --force_channel_last_input image" - + " --force_channel_last_output output_0", + target_runtime, compile_options + channel_last_flags, hub_device ) print(f"Optimizing model {model_name} to run on-device") submitted_compile_job = hub.submit_compile_job( @@ -170,8 +174,10 @@ def export_model( if target_runtime == TargetRuntime.QNN: hub_inputs = get_qnn_inputs(compile_job, sample_inputs) # Convert inputs from channel first to channel last - hub_inputs = transpose_channel_first_to_last( - "image", sample_inputs, target_runtime + hub_inputs = ( + sample_inputs + if target_runtime == TargetRuntime.ORT + else transpose_channel_first_to_last("image", sample_inputs, target_runtime) ) submitted_inference_job = hub.submit_inference_job( model=compile_job.get_target_model(), @@ -199,8 +205,12 @@ def export_model( assert inference_job is not None and inference_job.wait().success inference_result: hub.client.DatasetEntries = inference_job.download_output_data() # type: ignore # Convert outputs from channel last to channel first - inference_result = transpose_channel_last_to_first( - "output_0", inference_result, target_runtime + inference_result = ( + inference_result + if target_runtime == TargetRuntime.ORT + else transpose_channel_last_to_first( + "output_0", inference_result, target_runtime + ) ) print_inference_metrics(inference_job, inference_result, torch_out) @@ -209,7 +219,7 @@ def export_model( def main(): warnings.filterwarnings("ignore") - parser = export_parser(model_cls=Model, supports_qnn=False, supports_ort=False) + parser = export_parser(model_cls=Model, supports_qnn=False) args = parser.parse_args() export_model(**vars(args)) diff --git a/qai_hub_models/models/ffnet_54s_quantized/perf.yaml b/qai_hub_models/models/ffnet_54s_quantized/perf.yaml index 
e1c908a5..1a7b2f06 100644 --- a/qai_hub_models/models/ffnet_54s_quantized/perf.yaml +++ b/qai_hub_models/models/ffnet_54s_quantized/perf.yaml @@ -26,6 +26,7 @@ aggregated: - Samsung Galaxy S24 Ultra - Samsung Galaxy S24+ - Samsung Galaxy Tab S8 + - Snapdragon X Elite CRD - Xiaomi 12 - Xiaomi 12 Pro supported_chipsets: @@ -36,15 +37,16 @@ aggregated: - Snapdragon® 8 Gen 2 - Snapdragon® 8 Gen 3 - Snapdragon® 888 + - Snapdragon® X Elite models: - name: FFNet-54S-Quantized performance_metrics: - torchscript_onnx_tflite: - inference_time: 7125.0 - throughput: 140.35087719298247 + inference_time: 7101.0 + throughput: 140.8252358822701 estimated_peak_memory_range: - min: 647168 - max: 2562192 + min: 692224 + max: 2279272 primary_compute_unit: NPU precision: int8 layer_info: @@ -52,22 +54,37 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 118 - job_id: j1gl61n2g + job_id: jygz7drxp + job_status: Passed + torchscript_onnx_qnn: + inference_time: 4974.0 + throughput: 201.04543626859672 + estimated_peak_memory_range: + min: 6311936 + max: 20048864 + primary_compute_unit: NPU + precision: int8 + layer_info: + layers_on_npu: 110 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 110 + job_id: jnp18ze7g job_status: Passed torchscript_onnx_ort: - inference_time: 51385.0 - throughput: 19.46093217865136 + inference_time: 11814.0 + throughput: 84.64533604198408 estimated_peak_memory_range: - min: 29982720 - max: 70964288 + min: 30167040 + max: 62607768 primary_compute_unit: NPU precision: int8 layer_info: - layers_on_npu: 1 + layers_on_npu: 115 layers_on_gpu: 0 layers_on_cpu: 0 - total_layers: 1 - job_id: j1p3vwkmg + total_layers: 115 + job_id: j0px1oylg job_status: Passed reference_device_info: name: Samsung Galaxy S23 @@ -76,13 +93,13 @@ models: os_name: Android manufacturer: Samsung chipset: Snapdragon® 8 Gen 2 - timestamp: '2024-04-23T18:42:32.077877Z' + timestamp: '2024-05-20T16:35:29.151840Z' - torchscript_onnx_tflite: - inference_time: 5099.0 - throughput: 196.11688566385567 + inference_time: 5164.0 + throughput: 193.64833462432222 estimated_peak_memory_range: min: 16384 - max: 75082320 + max: 74278720 primary_compute_unit: NPU precision: int8 layer_info: @@ -90,22 +107,37 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 118 - job_id: jw56ed6ng + job_id: jz5w96qmp + job_status: Passed + torchscript_onnx_qnn: + inference_time: 3622.0 + throughput: 276.09055770292656 + estimated_peak_memory_range: + min: 6307840 + max: 63588464 + primary_compute_unit: NPU + precision: int8 + layer_info: + layers_on_npu: 110 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 110 + job_id: jvgdv1ozg job_status: Passed torchscript_onnx_ort: - inference_time: 31008.0 - throughput: 32.24974200206398 + inference_time: 9025.0 + throughput: 110.80332409972299 estimated_peak_memory_range: - min: 15433728 - max: 55696624 + min: 675840 + max: 35809952 primary_compute_unit: NPU precision: int8 layer_info: - layers_on_npu: 1 + layers_on_npu: 115 layers_on_gpu: 0 layers_on_cpu: 0 - total_layers: 1 - job_id: jwgok4y1p + total_layers: 115 + job_id: jo5mzx39p job_status: Passed reference_device_info: name: Samsung Galaxy S24 @@ -114,88 +146,156 @@ models: os_name: Android manufacturer: Samsung chipset: Snapdragon® 8 Gen 3 - timestamp: '2024-04-23T18:42:32.077918Z' + timestamp: '2024-05-20T16:35:29.151867Z' - torchscript_onnx_tflite: - inference_time: 49684.0 - throughput: 20.127203928830205 + inference_time: 7134.0 + throughput: 140.17381553125875 estimated_peak_memory_range: - min: 126976 - max: 56138256 + 
min: 643072 + max: 3436240 primary_compute_unit: NPU precision: int8 layer_info: - layers_on_npu: 120 + layers_on_npu: 118 layers_on_gpu: 0 layers_on_cpu: 0 - total_layers: 120 - job_id: jygzo0o45 + total_layers: 118 + job_id: jmg94nw85 job_status: Passed - torchscript_onnx_ort: - inference_time: 420355.0 - throughput: 2.3789416088782103 + torchscript_onnx_qnn: + inference_time: 4965.0 + throughput: 201.4098690835851 estimated_peak_memory_range: - min: 187011072 - max: 248380464 - primary_compute_unit: CPU - precision: fp32 + min: 6307840 + max: 20582560 + primary_compute_unit: NPU + precision: int8 layer_info: - layers_on_npu: 0 + layers_on_npu: 110 layers_on_gpu: 0 - layers_on_cpu: 113 - total_layers: 113 - job_id: j1pv093z5 + layers_on_cpu: 0 + total_layers: 110 + job_id: jqp4wrv1g job_status: Passed reference_device_info: - name: RB3 Gen 2 (Proxy) + name: QCS8550 (Proxy) os: '12' form_factor: Iot os_name: Android manufacturer: Qualcomm - chipset: Qcs6490 - timestamp: '2024-04-23T18:42:32.077964Z' + chipset: Qcs8550 + timestamp: '2024-05-20T16:35:29.151884Z' - torchscript_onnx_tflite: - inference_time: 216291.0 - throughput: 4.623400881220208 + inference_time: 39060.0 + throughput: 25.60163850486431 + estimated_peak_memory_range: + min: 40960 + max: 43989968 + primary_compute_unit: NPU + precision: int8 + layer_info: + layers_on_npu: 118 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 118 + job_id: jn5q31v4p + job_status: Passed + torchscript_onnx_qnn: + inference_time: 31116.0 + throughput: 32.13780691605605 estimated_peak_memory_range: - min: 2650112 - max: 4899184 + min: 6332416 + max: 62882080 primary_compute_unit: NPU precision: int8 layer_info: - layers_on_npu: 120 + layers_on_npu: 110 layers_on_gpu: 0 layers_on_cpu: 0 - total_layers: 120 - job_id: jqpyj8drp + total_layers: 110 + job_id: j7gjeyqx5 job_status: Passed reference_device_info: - name: RB5 (Proxy) + name: RB3 Gen 2 (Proxy) os: '12' form_factor: Iot os_name: Android manufacturer: Qualcomm - chipset: Qcs8250 - timestamp: '2024-04-23T18:42:32.077990Z' + chipset: Qcs6490 + timestamp: '2024-05-20T16:35:29.151900Z' - torchscript_onnx_tflite: - inference_time: 10210.0 - throughput: 97.94319294809011 + inference_time: 200139.0 + throughput: 4.996527413447654 estimated_peak_memory_range: - min: 2527232 - max: 4340680 + min: 765952 + max: 3306112 primary_compute_unit: NPU precision: int8 layer_info: - layers_on_npu: 120 + layers_on_npu: 118 layers_on_gpu: 0 layers_on_cpu: 0 - total_layers: 120 - job_id: jvgdeme65 + total_layers: 118 + job_id: j1gl38l8g job_status: Passed reference_device_info: - name: QCS8550 (Proxy) + name: RB5 (Proxy) os: '12' form_factor: Iot os_name: Android manufacturer: Qualcomm - chipset: Qcs8550 - timestamp: '2024-04-23T18:42:32.078022Z' + chipset: Qcs8250 + timestamp: '2024-05-20T16:35:29.151911Z' + - torchscript_onnx_qnn: + inference_time: 6006.0 + throughput: 166.5001665001665 + estimated_peak_memory_range: + min: 6303744 + max: 6303744 + primary_compute_unit: NPU + precision: int8 + layer_info: + layers_on_npu: 110 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 110 + job_id: jz57drx95 + job_status: Passed + torchscript_onnx_ort: + inference_time: 11424.0 + throughput: 87.53501400560224 + estimated_peak_memory_range: + min: 25227264 + max: 25227264 + primary_compute_unit: NPU + precision: int8 + layer_info: + layers_on_npu: 115 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 115 + job_id: jegnev3qg + job_status: Passed + torchscript_onnx_ort_dml_gpu: + inference_time: 870655.0 + 
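For the new Snapdragon X Elite CRD rows, `torchscript_onnx_ort` presumably exercises the exported ONNX asset through ONNX Runtime's QNN (NPU) execution provider, while `torchscript_onnx_ort_dml_gpu` routes it through DirectML. A hypothetical sketch of running such an asset on a Windows-on-Snapdragon machine; the model path, input resolution, and installed ORT builds (onnxruntime-qnn / onnxruntime-directml) are assumptions, and only the tensor names `image` and `output_0` come from this diff:

```python
# Hypothetical: run an exported FFNet ONNX asset via ONNX Runtime on Windows on Snapdragon.
import numpy as np
import onnxruntime as ort

model_path = "ffnet_54s_quantized.onnx"  # placeholder path for the exported asset

# NPU path (QNN execution provider) and GPU path (DirectML), each with CPU fallback.
npu_session = ort.InferenceSession(
    model_path, providers=["QNNExecutionProvider", "CPUExecutionProvider"]
)
gpu_session = ort.InferenceSession(
    model_path, providers=["DmlExecutionProvider", "CPUExecutionProvider"]
)

image = np.zeros((1, 3, 1024, 2048), dtype=np.float32)  # assumed NCHW input shape
for session in (npu_session, gpu_session):
    (output_0,) = session.run(None, {"image": image})  # single output, per the diff
    print(output_0.shape)
```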
throughput: 1.1485605664700713 + estimated_peak_memory_range: + min: 241315840 + max: 241315840 + primary_compute_unit: CPU + precision: fp32 + layer_info: + layers_on_npu: 0 + layers_on_gpu: 0 + layers_on_cpu: 113 + total_layers: 113 + job_id: jopry3e7g + job_status: Passed + reference_device_info: + name: Snapdragon X Elite CRD + os: '11' + form_factor: Compute + os_name: Windows + manufacturer: Qualcomm + chipset: Snapdragon® X Elite + timestamp: '2024-05-20T16:35:29.151933Z' diff --git a/qai_hub_models/models/ffnet_78s/README.md b/qai_hub_models/models/ffnet_78s/README.md index 9f577cda..c3f6b6dc 100644 --- a/qai_hub_models/models/ffnet_78s/README.md +++ b/qai_hub_models/models/ffnet_78s/README.md @@ -14,6 +14,8 @@ accross various devices, can be found [here](https://aihub.qualcomm.com/models/f a hosted Qualcomm® device. + + ## Example & Usage Install the package via pip: diff --git a/qai_hub_models/models/ffnet_78s/export.py b/qai_hub_models/models/ffnet_78s/export.py index 4084ce61..70bebcd0 100644 --- a/qai_hub_models/models/ffnet_78s/export.py +++ b/qai_hub_models/models/ffnet_78s/export.py @@ -119,12 +119,16 @@ def export_model( model.eval() source_model = torch.jit.trace(model.to("cpu"), make_torch_inputs(input_spec)) + # Convert outputs from channel last to channel first (preferred I/O format for QNN and TensorFlow Lite) + channel_last_flags = ( + " --force_channel_last_input image" + " --force_channel_last_output output_0" + if target_runtime != TargetRuntime.ORT + else "" + ) + # 2. Compile the model to an on-device asset model_compile_options = model.get_hub_compile_options( - target_runtime, - compile_options - + " --force_channel_last_input image" - + " --force_channel_last_output output_0", + target_runtime, compile_options + channel_last_flags, hub_device ) print(f"Optimizing model {model_name} to run on-device") submitted_compile_job = hub.submit_compile_job( @@ -162,8 +166,10 @@ def export_model( ) sample_inputs = model.sample_inputs(input_spec) # Convert inputs from channel first to channel last - hub_inputs = transpose_channel_first_to_last( - "image", sample_inputs, target_runtime + hub_inputs = ( + sample_inputs + if target_runtime == TargetRuntime.ORT + else transpose_channel_first_to_last("image", sample_inputs, target_runtime) ) submitted_inference_job = hub.submit_inference_job( model=compile_job.get_target_model(), @@ -191,8 +197,12 @@ def export_model( assert inference_job is not None and inference_job.wait().success inference_result: hub.client.DatasetEntries = inference_job.download_output_data() # type: ignore # Convert outputs from channel last to channel first - inference_result = transpose_channel_last_to_first( - "output_0", inference_result, target_runtime + inference_result = ( + inference_result + if target_runtime == TargetRuntime.ORT + else transpose_channel_last_to_first( + "output_0", inference_result, target_runtime + ) ) print_inference_metrics(inference_job, inference_result, torch_out) @@ -201,7 +211,7 @@ def export_model( def main(): warnings.filterwarnings("ignore") - parser = export_parser(model_cls=Model, supports_ort=False) + parser = export_parser(model_cls=Model) args = parser.parse_args() export_model(**vars(args)) diff --git a/qai_hub_models/models/ffnet_78s/perf.yaml b/qai_hub_models/models/ffnet_78s/perf.yaml index 44f26730..6808013a 100644 --- a/qai_hub_models/models/ffnet_78s/perf.yaml +++ b/qai_hub_models/models/ffnet_78s/perf.yaml @@ -22,6 +22,7 @@ aggregated: - Samsung Galaxy S24 Ultra - Samsung Galaxy S24+ - Samsung Galaxy 
Tab S8 + - Snapdragon X Elite CRD - Xiaomi 12 - Xiaomi 12 Pro supported_chipsets: @@ -30,15 +31,16 @@ aggregated: - Snapdragon® 8 Gen 2 - Snapdragon® 8 Gen 3 - Snapdragon® 888 + - Snapdragon® X Elite models: - name: FFNet-78S performance_metrics: - torchscript_onnx_tflite: - inference_time: 29177.0 - throughput: 34.27357164890153 + inference_time: 29391.0 + throughput: 34.02402095879691 estimated_peak_memory_range: - min: 2576384 - max: 5205816 + min: 2580480 + max: 4887232 primary_compute_unit: NPU precision: fp16 layer_info: @@ -46,14 +48,14 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 149 - job_id: j7gjzwx15 + job_id: jep2mylq5 job_status: Passed torchscript_onnx_qnn: - inference_time: 23420.0 - throughput: 42.69854824935952 + inference_time: 23544.0 + throughput: 42.473666326877336 estimated_peak_memory_range: - min: 24846336 - max: 48603008 + min: 25210880 + max: 46779104 primary_compute_unit: NPU precision: fp16 layer_info: @@ -61,22 +63,22 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 235 - job_id: jygzo4e45 + job_id: j1p87yzo5 job_status: Passed torchscript_onnx_ort: - inference_time: 35439.0 - throughput: 28.21750049380626 + inference_time: 34349.0 + throughput: 29.1129290517919 estimated_peak_memory_range: - min: 30183424 - max: 150703648 + min: 30216192 + max: 174827344 primary_compute_unit: NPU precision: fp16 layer_info: - layers_on_npu: 1 + layers_on_npu: 237 layers_on_gpu: 0 layers_on_cpu: 0 - total_layers: 1 - job_id: jmg9jxvm5 + total_layers: 237 + job_id: jw56141yp job_status: Passed reference_device_info: name: Samsung Galaxy S23 @@ -85,13 +87,13 @@ models: os_name: Android manufacturer: Samsung chipset: Snapdragon® 8 Gen 2 - timestamp: '2024-04-23T18:42:32.105190Z' + timestamp: '2024-05-20T16:35:29.191393Z' - torchscript_onnx_tflite: - inference_time: 21728.0 - throughput: 46.02356406480118 + inference_time: 21206.0 + throughput: 47.15646515137225 estimated_peak_memory_range: - min: 0 - max: 133794256 + min: 794624 + max: 119306480 primary_compute_unit: NPU precision: fp16 layer_info: @@ -99,14 +101,14 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 149 - job_id: jlpeel98p + job_id: jqpyd36lp job_status: Passed torchscript_onnx_qnn: - inference_time: 17745.0 - throughput: 56.353902507748664 + inference_time: 17482.0 + throughput: 57.201693170117835 estimated_peak_memory_range: - min: 25317376 - max: 101665296 + min: 20983808 + max: 100170336 primary_compute_unit: NPU precision: fp16 layer_info: @@ -114,22 +116,22 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 235 - job_id: jz5w21o45 + job_id: jogkyx3np job_status: Passed torchscript_onnx_ort: - inference_time: 26731.0 - throughput: 37.40974898058434 + inference_time: 26382.0 + throughput: 37.904631946023805 estimated_peak_memory_range: min: 29417472 - max: 90195264 + max: 78554720 primary_compute_unit: NPU precision: fp16 layer_info: - layers_on_npu: 1 + layers_on_npu: 237 layers_on_gpu: 0 layers_on_cpu: 0 - total_layers: 1 - job_id: jnp1yv0np + total_layers: 237 + job_id: j1p3m0mng job_status: Passed reference_device_info: name: Samsung Galaxy S24 @@ -138,13 +140,13 @@ models: os_name: Android manufacturer: Samsung chipset: Snapdragon® 8 Gen 3 - timestamp: '2024-04-23T18:42:32.105263Z' + timestamp: '2024-05-20T16:35:29.191421Z' - torchscript_onnx_tflite: - inference_time: 29631.0 - throughput: 33.748439134690024 + inference_time: 29621.0 + throughput: 33.759832551230545 estimated_peak_memory_range: - min: 499712 - max: 1916448 + min: 2560000 + max: 5156816 
primary_compute_unit: NPU precision: fp16 layer_info: @@ -152,14 +154,14 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 149 - job_id: jegnlwlj5 + job_id: j2p0r0lnp job_status: Passed torchscript_onnx_qnn: - inference_time: 23601.0 - throughput: 42.371085970933436 + inference_time: 23548.0 + throughput: 42.466451503312385 estimated_peak_memory_range: - min: 25165824 - max: 55560608 + min: 25202688 + max: 46491072 primary_compute_unit: NPU precision: fp16 layer_info: @@ -167,7 +169,7 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 235 - job_id: j2p03x20p + job_id: j1glkmkmp job_status: Passed reference_device_info: name: QCS8550 (Proxy) @@ -176,4 +178,57 @@ models: os_name: Android manufacturer: Qualcomm chipset: Qcs8550 - timestamp: '2024-04-23T18:42:32.105324Z' + timestamp: '2024-05-20T16:35:29.191438Z' + - torchscript_onnx_qnn: + inference_time: 32624.0 + throughput: 30.65228052967141 + estimated_peak_memory_range: + min: 25214976 + max: 25214976 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 235 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 235 + job_id: jn5q2q3o5 + job_status: Passed + torchscript_onnx_ort: + inference_time: 33277.0 + throughput: 30.050785828049403 + estimated_peak_memory_range: + min: 26583040 + max: 26583040 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 237 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 237 + job_id: jwgov6vk5 + job_status: Passed + torchscript_onnx_ort_dml_gpu: + inference_time: 207214.0 + throughput: 4.825928749987935 + estimated_peak_memory_range: + min: 139489280 + max: 139489280 + primary_compute_unit: CPU + precision: fp32 + layer_info: + layers_on_npu: 0 + layers_on_gpu: 0 + layers_on_cpu: 105 + total_layers: 105 + job_id: j1pvwkwrg + job_status: Passed + reference_device_info: + name: Snapdragon X Elite CRD + os: '11' + form_factor: Compute + os_name: Windows + manufacturer: Qualcomm + chipset: Snapdragon® X Elite + timestamp: '2024-05-20T16:35:29.191460Z' diff --git a/qai_hub_models/models/ffnet_78s_lowres/README.md b/qai_hub_models/models/ffnet_78s_lowres/README.md index 0139c054..ac546964 100644 --- a/qai_hub_models/models/ffnet_78s_lowres/README.md +++ b/qai_hub_models/models/ffnet_78s_lowres/README.md @@ -14,6 +14,8 @@ accross various devices, can be found [here](https://aihub.qualcomm.com/models/f a hosted Qualcomm® device. + + ## Example & Usage Install the package via pip: diff --git a/qai_hub_models/models/ffnet_78s_lowres/export.py b/qai_hub_models/models/ffnet_78s_lowres/export.py index badf0c59..74ef3914 100644 --- a/qai_hub_models/models/ffnet_78s_lowres/export.py +++ b/qai_hub_models/models/ffnet_78s_lowres/export.py @@ -119,12 +119,16 @@ def export_model( model.eval() source_model = torch.jit.trace(model.to("cpu"), make_torch_inputs(input_spec)) + # Convert outputs from channel last to channel first (preferred I/O format for QNN and TensorFlow Lite) + channel_last_flags = ( + " --force_channel_last_input image" + " --force_channel_last_output output_0" + if target_runtime != TargetRuntime.ORT + else "" + ) + # 2. 
Compile the model to an on-device asset model_compile_options = model.get_hub_compile_options( - target_runtime, - compile_options - + " --force_channel_last_input image" - + " --force_channel_last_output output_0", + target_runtime, compile_options + channel_last_flags, hub_device ) print(f"Optimizing model {model_name} to run on-device") submitted_compile_job = hub.submit_compile_job( @@ -162,8 +166,10 @@ def export_model( ) sample_inputs = model.sample_inputs(input_spec) # Convert inputs from channel first to channel last - hub_inputs = transpose_channel_first_to_last( - "image", sample_inputs, target_runtime + hub_inputs = ( + sample_inputs + if target_runtime == TargetRuntime.ORT + else transpose_channel_first_to_last("image", sample_inputs, target_runtime) ) submitted_inference_job = hub.submit_inference_job( model=compile_job.get_target_model(), @@ -191,8 +197,12 @@ def export_model( assert inference_job is not None and inference_job.wait().success inference_result: hub.client.DatasetEntries = inference_job.download_output_data() # type: ignore # Convert outputs from channel last to channel first - inference_result = transpose_channel_last_to_first( - "output_0", inference_result, target_runtime + inference_result = ( + inference_result + if target_runtime == TargetRuntime.ORT + else transpose_channel_last_to_first( + "output_0", inference_result, target_runtime + ) ) print_inference_metrics(inference_job, inference_result, torch_out) @@ -201,7 +211,7 @@ def export_model( def main(): warnings.filterwarnings("ignore") - parser = export_parser(model_cls=Model, supports_ort=False) + parser = export_parser(model_cls=Model) args = parser.parse_args() export_model(**vars(args)) diff --git a/qai_hub_models/models/ffnet_78s_lowres/perf.yaml b/qai_hub_models/models/ffnet_78s_lowres/perf.yaml index 42001680..5fbc7a1b 100644 --- a/qai_hub_models/models/ffnet_78s_lowres/perf.yaml +++ b/qai_hub_models/models/ffnet_78s_lowres/perf.yaml @@ -22,6 +22,7 @@ aggregated: - Samsung Galaxy S24 Ultra - Samsung Galaxy S24+ - Samsung Galaxy Tab S8 + - Snapdragon X Elite CRD - Xiaomi 12 - Xiaomi 12 Pro supported_chipsets: @@ -30,15 +31,16 @@ aggregated: - Snapdragon® 8 Gen 2 - Snapdragon® 8 Gen 3 - Snapdragon® 888 + - Snapdragon® X Elite models: - name: FFNet-78S-LowRes performance_metrics: - torchscript_onnx_tflite: - inference_time: 10805.0 - throughput: 92.5497454881999 + inference_time: 10832.0 + throughput: 92.31905465288035 estimated_peak_memory_range: min: 667648 - max: 2943392 + max: 2444712 primary_compute_unit: NPU precision: fp16 layer_info: @@ -46,14 +48,14 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 149 - job_id: jz5707zng + job_id: j7gjlnlep job_status: Passed torchscript_onnx_qnn: - inference_time: 11389.0 - throughput: 87.80402142418123 + inference_time: 11360.0 + throughput: 88.02816901408451 estimated_peak_memory_range: - min: 32768 - max: 63143120 + min: 135168 + max: 63213296 primary_compute_unit: NPU precision: fp16 layer_info: @@ -61,22 +63,22 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 236 - job_id: j0pxndv85 + job_id: jz5w969mp job_status: Passed torchscript_onnx_ort: - inference_time: 7820.0 - throughput: 127.8772378516624 + inference_time: 8961.0 + throughput: 111.59468809284678 estimated_peak_memory_range: - min: 2232320 - max: 124968440 + min: 2129920 + max: 131892976 primary_compute_unit: NPU precision: fp16 layer_info: - layers_on_npu: 1 + layers_on_npu: 238 layers_on_gpu: 0 layers_on_cpu: 0 - total_layers: 1 - job_id: jegnl72j5 + total_layers: 238 
+ job_id: jz5w9694p job_status: Passed reference_device_info: name: Samsung Galaxy S23 @@ -85,13 +87,13 @@ models: os_name: Android manufacturer: Samsung chipset: Snapdragon® 8 Gen 2 - timestamp: '2024-04-23T18:42:32.129479Z' + timestamp: '2024-05-20T16:35:29.221675Z' - torchscript_onnx_tflite: - inference_time: 7620.0 - throughput: 131.23359580052494 + inference_time: 7598.0 + throughput: 131.61358252171624 estimated_peak_memory_range: - min: 299008 - max: 53659920 + min: 32768 + max: 51441440 primary_compute_unit: NPU precision: fp16 layer_info: @@ -99,14 +101,14 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 149 - job_id: jqp4k9q2g + job_id: jlpevmvv5 job_status: Passed torchscript_onnx_qnn: - inference_time: 7996.0 - throughput: 125.06253126563281 + inference_time: 7919.0 + throughput: 126.27857052658165 estimated_peak_memory_range: - min: 6324224 - max: 70041552 + min: 6307840 + max: 73605024 primary_compute_unit: NPU precision: fp16 layer_info: @@ -114,22 +116,22 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 236 - job_id: jo5mqdr7p + job_id: jmg94n485 job_status: Passed torchscript_onnx_ort: - inference_time: 5925.0 - throughput: 168.77637130801688 + inference_time: 6622.0 + throughput: 151.01177891875565 estimated_peak_memory_range: - min: 6332416 - max: 48029072 + min: 6012928 + max: 45766784 primary_compute_unit: NPU precision: fp16 layer_info: - layers_on_npu: 1 + layers_on_npu: 238 layers_on_gpu: 0 layers_on_cpu: 0 - total_layers: 1 - job_id: jopr8nkk5 + total_layers: 238 + job_id: jmg94n4m5 job_status: Passed reference_device_info: name: Samsung Galaxy S24 @@ -138,13 +140,13 @@ models: os_name: Android manufacturer: Samsung chipset: Snapdragon® 8 Gen 3 - timestamp: '2024-04-23T18:42:32.129559Z' + timestamp: '2024-05-20T16:35:29.221702Z' - torchscript_onnx_tflite: - inference_time: 10747.0 - throughput: 93.04922303898762 + inference_time: 10817.0 + throughput: 92.44707405010631 estimated_peak_memory_range: - min: 655360 - max: 2972672 + min: 692224 + max: 2481904 primary_compute_unit: NPU precision: fp16 layer_info: @@ -152,14 +154,14 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 149 - job_id: jw56e0zng + job_id: jygz7d7xp job_status: Passed torchscript_onnx_qnn: - inference_time: 11414.0 - throughput: 87.61170492377782 + inference_time: 11402.0 + throughput: 87.70391159445711 estimated_peak_memory_range: - min: 6336512 - max: 38367920 + min: 1359872 + max: 53966200 primary_compute_unit: NPU precision: fp16 layer_info: @@ -167,7 +169,7 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 236 - job_id: j7gjz8215 + job_id: jvgdv1vzg job_status: Passed reference_device_info: name: QCS8550 (Proxy) @@ -176,4 +178,57 @@ models: os_name: Android manufacturer: Qualcomm chipset: Qcs8550 - timestamp: '2024-04-23T18:42:32.129627Z' + timestamp: '2024-05-20T16:35:29.221720Z' + - torchscript_onnx_qnn: + inference_time: 20470.0 + throughput: 48.85197850512946 + estimated_peak_memory_range: + min: 6303744 + max: 6303744 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 236 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 236 + job_id: jnp18z87g + job_status: Passed + torchscript_onnx_ort: + inference_time: 8747.0 + throughput: 114.32491139819366 + estimated_peak_memory_range: + min: 42668032 + max: 42668032 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 238 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 238 + job_id: jnp18z8ng + job_status: Passed + torchscript_onnx_ort_dml_gpu: + 
inference_time: 64289.0 + throughput: 15.554760534461572 + estimated_peak_memory_range: + min: 42369024 + max: 42369024 + primary_compute_unit: CPU + precision: fp32 + layer_info: + layers_on_npu: 0 + layers_on_gpu: 0 + layers_on_cpu: 106 + total_layers: 106 + job_id: jvgdv1v6g + job_status: Passed + reference_device_info: + name: Snapdragon X Elite CRD + os: '11' + form_factor: Compute + os_name: Windows + manufacturer: Qualcomm + chipset: Snapdragon® X Elite + timestamp: '2024-05-20T16:35:29.221743Z' diff --git a/qai_hub_models/models/ffnet_78s_quantized/README.md b/qai_hub_models/models/ffnet_78s_quantized/README.md index c9cd33a7..43dcb2af 100644 --- a/qai_hub_models/models/ffnet_78s_quantized/README.md +++ b/qai_hub_models/models/ffnet_78s_quantized/README.md @@ -14,6 +14,8 @@ accross various devices, can be found [here](https://aihub.qualcomm.com/models/f a hosted Qualcomm® device. + + ## Example & Usage Install the package via pip: diff --git a/qai_hub_models/models/ffnet_78s_quantized/export.py b/qai_hub_models/models/ffnet_78s_quantized/export.py index e53a076e..c2f4f09e 100644 --- a/qai_hub_models/models/ffnet_78s_quantized/export.py +++ b/qai_hub_models/models/ffnet_78s_quantized/export.py @@ -123,12 +123,16 @@ def export_model( else: quant_calibration_data = model.get_calibration_data(target_runtime, input_spec) + # Convert outputs from channel last to channel first (preferred I/O format for QNN and TensorFlow Lite) + channel_last_flags = ( + " --force_channel_last_input image" + " --force_channel_last_output output_0" + if target_runtime != TargetRuntime.ORT + else "" + ) + # 2. Compile the model to an on-device asset model_compile_options = model.get_hub_compile_options( - target_runtime, - compile_options - + " --force_channel_last_input image" - + " --force_channel_last_output output_0", + target_runtime, compile_options + channel_last_flags, hub_device ) print(f"Optimizing model {model_name} to run on-device") submitted_compile_job = hub.submit_compile_job( @@ -170,8 +174,10 @@ def export_model( if target_runtime == TargetRuntime.QNN: hub_inputs = get_qnn_inputs(compile_job, sample_inputs) # Convert inputs from channel first to channel last - hub_inputs = transpose_channel_first_to_last( - "image", sample_inputs, target_runtime + hub_inputs = ( + sample_inputs + if target_runtime == TargetRuntime.ORT + else transpose_channel_first_to_last("image", sample_inputs, target_runtime) ) submitted_inference_job = hub.submit_inference_job( model=compile_job.get_target_model(), @@ -199,8 +205,12 @@ def export_model( assert inference_job is not None and inference_job.wait().success inference_result: hub.client.DatasetEntries = inference_job.download_output_data() # type: ignore # Convert outputs from channel last to channel first - inference_result = transpose_channel_last_to_first( - "output_0", inference_result, target_runtime + inference_result = ( + inference_result + if target_runtime == TargetRuntime.ORT + else transpose_channel_last_to_first( + "output_0", inference_result, target_runtime + ) ) print_inference_metrics(inference_job, inference_result, torch_out) @@ -209,7 +219,7 @@ def export_model( def main(): warnings.filterwarnings("ignore") - parser = export_parser(model_cls=Model, supports_qnn=False, supports_ort=False) + parser = export_parser(model_cls=Model, supports_qnn=False) args = parser.parse_args() export_model(**vars(args)) diff --git a/qai_hub_models/models/ffnet_78s_quantized/perf.yaml b/qai_hub_models/models/ffnet_78s_quantized/perf.yaml index 
76857574..1c52bc46 100644 --- a/qai_hub_models/models/ffnet_78s_quantized/perf.yaml +++ b/qai_hub_models/models/ffnet_78s_quantized/perf.yaml @@ -26,6 +26,7 @@ aggregated: - Samsung Galaxy S24 Ultra - Samsung Galaxy S24+ - Samsung Galaxy Tab S8 + - Snapdragon X Elite CRD - Xiaomi 12 - Xiaomi 12 Pro supported_chipsets: @@ -36,15 +37,16 @@ aggregated: - Snapdragon® 8 Gen 2 - Snapdragon® 8 Gen 3 - Snapdragon® 888 + - Snapdragon® X Elite models: - name: FFNet-78S-Quantized performance_metrics: - torchscript_onnx_tflite: - inference_time: 8382.0 - throughput: 119.30326890956812 + inference_time: 8341.0 + throughput: 119.88970147464333 estimated_peak_memory_range: - min: 688128 - max: 2625256 + min: 684032 + max: 2437040 primary_compute_unit: NPU precision: int8 layer_info: @@ -52,22 +54,37 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 154 - job_id: jqpyr7e05 + job_id: jz57drdn5 + job_status: Passed + torchscript_onnx_qnn: + inference_time: 5952.0 + throughput: 168.01075268817203 + estimated_peak_memory_range: + min: 8372224 + max: 27369456 + primary_compute_unit: NPU + precision: int8 + layer_info: + layers_on_npu: 146 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 146 + job_id: jo5mzxz7p job_status: Passed torchscript_onnx_ort: - inference_time: 53059.0 - throughput: 18.846943968035582 + inference_time: 12352.0 + throughput: 80.95854922279793 estimated_peak_memory_range: - min: 30326784 - max: 75211072 + min: 30101504 + max: 79464896 primary_compute_unit: NPU precision: int8 layer_info: - layers_on_npu: 1 + layers_on_npu: 151 layers_on_gpu: 0 layers_on_cpu: 0 - total_layers: 1 - job_id: j1p8049qg + total_layers: 151 + job_id: jqpyd3d0p job_status: Passed reference_device_info: name: Samsung Galaxy S23 @@ -76,13 +93,13 @@ models: os_name: Android manufacturer: Samsung chipset: Snapdragon® 8 Gen 2 - timestamp: '2024-04-23T18:42:32.154340Z' + timestamp: '2024-05-20T16:35:29.251992Z' - torchscript_onnx_tflite: - inference_time: 5988.0 - throughput: 167.000668002672 + inference_time: 5972.0 + throughput: 167.44809109176154 estimated_peak_memory_range: - min: 20480 - max: 87117952 + min: 12288 + max: 88653408 primary_compute_unit: NPU precision: int8 layer_info: @@ -90,22 +107,37 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 154 - job_id: j2p03vq0p + job_id: jqp4wrw2g + job_status: Passed + torchscript_onnx_qnn: + inference_time: 4317.0 + throughput: 231.6423442205235 + estimated_peak_memory_range: + min: 6307840 + max: 75240272 + primary_compute_unit: NPU + precision: int8 + layer_info: + layers_on_npu: 146 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 146 + job_id: jegnevejg job_status: Passed torchscript_onnx_ort: - inference_time: 31534.0 - throughput: 31.71180313312615 + inference_time: 9441.0 + throughput: 105.92098294672175 estimated_peak_memory_range: - min: 31961088 - max: 77114832 + min: 31965184 + max: 81051088 primary_compute_unit: NPU precision: int8 layer_info: - layers_on_npu: 1 + layers_on_npu: 151 layers_on_gpu: 0 layers_on_cpu: 0 - total_layers: 1 - job_id: jogk79nvp + total_layers: 151 + job_id: j2p0r010p job_status: Passed reference_device_info: name: Samsung Galaxy S24 @@ -114,88 +146,156 @@ models: os_name: Android manufacturer: Samsung chipset: Snapdragon® 8 Gen 3 - timestamp: '2024-04-23T18:42:32.154385Z' + timestamp: '2024-05-20T16:35:29.252019Z' - torchscript_onnx_tflite: - inference_time: 57755.0 - throughput: 17.31451822353043 + inference_time: 8351.0 + throughput: 119.74613818704347 estimated_peak_memory_range: - min: 319488 - 
max: 58248928 + min: 696320 + max: 3185352 primary_compute_unit: NPU precision: int8 layer_info: - layers_on_npu: 156 + layers_on_npu: 154 layers_on_gpu: 0 layers_on_cpu: 0 - total_layers: 156 - job_id: jz5708lng + total_layers: 154 + job_id: j0px1o18g job_status: Passed - torchscript_onnx_ort: - inference_time: 547799.0 - throughput: 1.825487085591613 + torchscript_onnx_qnn: + inference_time: 5974.0 + throughput: 167.39203213927016 estimated_peak_memory_range: - min: 166916096 - max: 242608960 - primary_compute_unit: CPU - precision: fp32 + min: 6336512 + max: 26276408 + primary_compute_unit: NPU + precision: int8 layer_info: - layers_on_npu: 0 + layers_on_npu: 146 layers_on_gpu: 0 - layers_on_cpu: 149 - total_layers: 149 - job_id: jn5qemke5 + layers_on_cpu: 0 + total_layers: 146 + job_id: jep2mym65 job_status: Passed reference_device_info: - name: RB3 Gen 2 (Proxy) + name: QCS8550 (Proxy) os: '12' form_factor: Iot os_name: Android manufacturer: Qualcomm - chipset: Qcs6490 - timestamp: '2024-04-23T18:42:32.154436Z' + chipset: Qcs8550 + timestamp: '2024-05-20T16:35:29.252037Z' - torchscript_onnx_tflite: - inference_time: 235689.0 - throughput: 4.242879387667647 + inference_time: 45673.0 + throughput: 21.89477371751363 + estimated_peak_memory_range: + min: 774144 + max: 49521760 + primary_compute_unit: NPU + precision: int8 + layer_info: + layers_on_npu: 154 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 154 + job_id: jz5wqzl65 + job_status: Passed + torchscript_onnx_qnn: + inference_time: 37262.0 + throughput: 26.83699210992432 estimated_peak_memory_range: - min: 2572288 - max: 5196608 + min: 6307840 + max: 71671376 primary_compute_unit: NPU precision: int8 layer_info: - layers_on_npu: 156 + layers_on_npu: 146 layers_on_gpu: 0 layers_on_cpu: 0 - total_layers: 156 - job_id: j2p02or25 + total_layers: 146 + job_id: j0pxyrl1g job_status: Passed reference_device_info: - name: RB5 (Proxy) + name: RB3 Gen 2 (Proxy) os: '12' form_factor: Iot os_name: Android manufacturer: Qualcomm - chipset: Qcs8250 - timestamp: '2024-04-23T18:42:32.154466Z' + chipset: Qcs6490 + timestamp: '2024-05-20T16:35:29.252054Z' - torchscript_onnx_tflite: - inference_time: 10675.0 - throughput: 93.6768149882904 + inference_time: 218485.0 + throughput: 4.576973247591368 estimated_peak_memory_range: - min: 2576384 - max: 4529144 + min: 770048 + max: 10557616 primary_compute_unit: NPU precision: int8 layer_info: - layers_on_npu: 156 + layers_on_npu: 154 layers_on_gpu: 0 layers_on_cpu: 0 - total_layers: 156 - job_id: jvgdemq65 + total_layers: 154 + job_id: jmg9w2zlp job_status: Passed reference_device_info: - name: QCS8550 (Proxy) + name: RB5 (Proxy) os: '12' form_factor: Iot os_name: Android manufacturer: Qualcomm - chipset: Qcs8550 - timestamp: '2024-04-23T18:42:32.154494Z' + chipset: Qcs8250 + timestamp: '2024-05-20T16:35:29.252065Z' + - torchscript_onnx_qnn: + inference_time: 7096.0 + throughput: 140.92446448703495 + estimated_peak_memory_range: + min: 6303744 + max: 6303744 + primary_compute_unit: NPU + precision: int8 + layer_info: + layers_on_npu: 146 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 146 + job_id: jopry3ykg + job_status: Passed + torchscript_onnx_ort: + inference_time: 13843.0 + throughput: 72.23867658744491 + estimated_peak_memory_range: + min: 34721792 + max: 34721792 + primary_compute_unit: NPU + precision: int8 + layer_info: + layers_on_npu: 151 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 151 + job_id: j1p87y3q5 + job_status: Passed + torchscript_onnx_ort_dml_gpu: + 
inference_time: 801403.0 + throughput: 1.2478116503182544 + estimated_peak_memory_range: + min: 204279808 + max: 204279808 + primary_compute_unit: GPU + precision: int8 + layer_info: + layers_on_npu: 0 + layers_on_gpu: 1 + layers_on_cpu: 0 + total_layers: 1 + job_id: jogkyxlvp + job_status: Passed + reference_device_info: + name: Snapdragon X Elite CRD + os: '11' + form_factor: Compute + os_name: Windows + manufacturer: Qualcomm + chipset: Snapdragon® X Elite + timestamp: '2024-05-20T16:35:29.252087Z' diff --git a/qai_hub_models/models/googlenet/README.md b/qai_hub_models/models/googlenet/README.md index ddf7fdbf..71a8d343 100644 --- a/qai_hub_models/models/googlenet/README.md +++ b/qai_hub_models/models/googlenet/README.md @@ -14,6 +14,8 @@ accross various devices, can be found [here](https://aihub.qualcomm.com/models/g a hosted Qualcomm® device. + + ## Example & Usage diff --git a/qai_hub_models/models/googlenet/export.py b/qai_hub_models/models/googlenet/export.py index 3226da2e..eec00f37 100644 --- a/qai_hub_models/models/googlenet/export.py +++ b/qai_hub_models/models/googlenet/export.py @@ -119,9 +119,16 @@ def export_model( model.eval() source_model = torch.jit.trace(model.to("cpu"), make_torch_inputs(input_spec)) + # Convert outputs from channel last to channel first (preferred I/O format for QNN and TensorFlow Lite) + channel_last_flags = ( + " --force_channel_last_input image_tensor" + if target_runtime != TargetRuntime.ORT + else "" + ) + # 2. Compile the model to an on-device asset model_compile_options = model.get_hub_compile_options( - target_runtime, compile_options + " --force_channel_last_input image_tensor" + target_runtime, compile_options + channel_last_flags, hub_device ) print(f"Optimizing model {model_name} to run on-device") submitted_compile_job = hub.submit_compile_job( @@ -159,8 +166,12 @@ def export_model( ) sample_inputs = model.sample_inputs(input_spec) # Convert inputs from channel first to channel last - hub_inputs = transpose_channel_first_to_last( - "image_tensor", sample_inputs, target_runtime + hub_inputs = ( + sample_inputs + if target_runtime == TargetRuntime.ORT + else transpose_channel_first_to_last( + "image_tensor", sample_inputs, target_runtime + ) ) submitted_inference_job = hub.submit_inference_job( model=compile_job.get_target_model(), @@ -197,7 +208,7 @@ def export_model( def main(): warnings.filterwarnings("ignore") - parser = export_parser(model_cls=Model, supports_ort=False) + parser = export_parser(model_cls=Model) args = parser.parse_args() export_model(**vars(args)) diff --git a/qai_hub_models/models/googlenet/perf.yaml b/qai_hub_models/models/googlenet/perf.yaml index ff2fab34..0f54510e 100644 --- a/qai_hub_models/models/googlenet/perf.yaml +++ b/qai_hub_models/models/googlenet/perf.yaml @@ -22,6 +22,7 @@ aggregated: - Samsung Galaxy S24 Ultra - Samsung Galaxy S24+ - Samsung Galaxy Tab S8 + - Snapdragon X Elite CRD - Xiaomi 12 - Xiaomi 12 Pro supported_chipsets: @@ -30,15 +31,16 @@ aggregated: - Snapdragon® 8 Gen 2 - Snapdragon® 8 Gen 3 - Snapdragon® 888 + - Snapdragon® X Elite models: - name: GoogLeNet performance_metrics: - torchscript_onnx_tflite: - inference_time: 1044.0 - throughput: 957.8544061302682 + inference_time: 1047.0 + throughput: 955.1098376313277 estimated_peak_memory_range: - min: 28672 - max: 2002104 + min: 16384 + max: 1526704 primary_compute_unit: NPU precision: fp16 layer_info: @@ -46,14 +48,14 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 84 - job_id: jnp1yvlnp + job_id: jqp4wrx2g job_status: Passed 
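Editor's note on the regenerated perf.yaml numbers: throughput appears to be derived directly from `inference_time`, consistent with latencies reported in microseconds (the unit is inferred from the numbers, not stated in the diff). A quick sanity check against the GoogLeNet figure just above and the FFNet-78S-LowRes figure earlier in this diff:

```python
def throughput_from_latency(inference_time_us: float) -> float:
    # Inferences per second implied by a single-inference latency in microseconds.
    return 1_000_000.0 / inference_time_us


# Value pairs taken from the updated perf.yaml entries in this diff.
assert abs(throughput_from_latency(1047.0) - 955.1098376313277) < 1e-6   # GoogLeNet, TFLite
assert abs(throughput_from_latency(10832.0) - 92.31905465288035) < 1e-6  # FFNet-78S-LowRes, TFLite
```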
torchscript_onnx_qnn: - inference_time: 1075.0 - throughput: 930.2325581395348 + inference_time: 1089.0 + throughput: 918.2736455463728 estimated_peak_memory_range: - min: 20480 - max: 26621784 + min: 618496 + max: 4593576 primary_compute_unit: NPU precision: fp16 layer_info: @@ -61,22 +63,22 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 143 - job_id: jz5707wng + job_id: jegnev9jg job_status: Passed torchscript_onnx_ort: - inference_time: 1293.0 - throughput: 773.3952049497293 + inference_time: 1227.0 + throughput: 814.9959250203749 estimated_peak_memory_range: - min: 12288 - max: 46074600 + min: 16384 + max: 45472688 primary_compute_unit: NPU precision: fp16 layer_info: - layers_on_npu: 1 + layers_on_npu: 145 layers_on_gpu: 0 layers_on_cpu: 0 - total_layers: 1 - job_id: j0pxndj85 + total_layers: 145 + job_id: j2p0r0e0p job_status: Passed reference_device_info: name: Samsung Galaxy S23 @@ -85,13 +87,13 @@ models: os_name: Android manufacturer: Samsung chipset: Snapdragon® 8 Gen 2 - timestamp: '2024-04-23T18:42:32.213322Z' + timestamp: '2024-05-20T16:35:29.373835Z' - torchscript_onnx_tflite: - inference_time: 650.0 - throughput: 1538.4615384615386 + inference_time: 691.0 + throughput: 1447.178002894356 estimated_peak_memory_range: - min: 16384 - max: 45786064 + min: 12288 + max: 46214624 primary_compute_unit: NPU precision: fp16 layer_info: @@ -99,14 +101,14 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 84 - job_id: jvgdez965 + job_id: j0px1o78g job_status: Passed torchscript_onnx_qnn: - inference_time: 693.0 - throughput: 1443.001443001443 + inference_time: 699.0 + throughput: 1430.615164520744 estimated_peak_memory_range: min: 0 - max: 53494384 + max: 56918592 primary_compute_unit: NPU precision: fp16 layer_info: @@ -114,22 +116,22 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 143 - job_id: jqp4k9o2g + job_id: jopry34kg job_status: Passed torchscript_onnx_ort: - inference_time: 852.0 - throughput: 1173.7089201877934 + inference_time: 898.0 + throughput: 1113.5857461024498 estimated_peak_memory_range: - min: 618496 - max: 24414912 + min: 602112 + max: 25082000 primary_compute_unit: NPU precision: fp16 layer_info: - layers_on_npu: 1 + layers_on_npu: 145 layers_on_gpu: 0 layers_on_cpu: 0 - total_layers: 1 - job_id: jo5mqd27p + total_layers: 145 + job_id: j1p87ywq5 job_status: Passed reference_device_info: name: Samsung Galaxy S24 @@ -138,13 +140,13 @@ models: os_name: Android manufacturer: Samsung chipset: Snapdragon® 8 Gen 3 - timestamp: '2024-04-23T18:42:32.213386Z' + timestamp: '2024-05-20T16:35:29.373860Z' - torchscript_onnx_tflite: - inference_time: 1043.0 - throughput: 958.7727708533077 + inference_time: 1047.0 + throughput: 955.1098376313277 estimated_peak_memory_range: min: 12288 - max: 1850480 + max: 17376784 primary_compute_unit: NPU precision: fp16 layer_info: @@ -152,14 +154,14 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 84 - job_id: jlpeenk8p + job_id: jo5mzxw7p job_status: Passed torchscript_onnx_qnn: - inference_time: 1090.0 - throughput: 917.4311926605504 + inference_time: 1094.0 + throughput: 914.0767824497258 estimated_peak_memory_range: min: 622592 - max: 4955600 + max: 5356744 primary_compute_unit: NPU precision: fp16 layer_info: @@ -167,7 +169,7 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 143 - job_id: jnp1ymenp + job_id: jqpyd340p job_status: Passed reference_device_info: name: QCS8550 (Proxy) @@ -176,4 +178,57 @@ models: os_name: Android manufacturer: Qualcomm chipset: Qcs8550 - timestamp: 
'2024-04-23T18:42:32.213429Z' + timestamp: '2024-05-20T16:35:29.373877Z' + - torchscript_onnx_qnn: + inference_time: 1259.0 + throughput: 794.2811755361398 + estimated_peak_memory_range: + min: 602112 + max: 602112 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 143 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 143 + job_id: jep2my765 + job_status: Passed + torchscript_onnx_ort: + inference_time: 1333.0 + throughput: 750.1875468867216 + estimated_peak_memory_range: + min: 11251712 + max: 11251712 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 145 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 145 + job_id: jogkyxrvp + job_status: Passed + torchscript_onnx_ort_dml_gpu: + inference_time: 5736.0 + throughput: 174.33751743375174 + estimated_peak_memory_range: + min: 11059200 + max: 11059200 + primary_compute_unit: CPU + precision: fp32 + layer_info: + layers_on_npu: 0 + layers_on_gpu: 0 + layers_on_cpu: 84 + total_layers: 84 + job_id: jn5q2q9e5 + job_status: Passed + reference_device_info: + name: Snapdragon X Elite CRD + os: '11' + form_factor: Compute + os_name: Windows + manufacturer: Qualcomm + chipset: Snapdragon® X Elite + timestamp: '2024-05-20T16:35:29.373901Z' diff --git a/qai_hub_models/models/googlenet_quantized/README.md b/qai_hub_models/models/googlenet_quantized/README.md index dfa75bdd..38fef799 100644 --- a/qai_hub_models/models/googlenet_quantized/README.md +++ b/qai_hub_models/models/googlenet_quantized/README.md @@ -14,6 +14,8 @@ accross various devices, can be found [here](https://aihub.qualcomm.com/models/g a hosted Qualcomm® device. + + ## Example & Usage diff --git a/qai_hub_models/models/googlenet_quantized/export.py b/qai_hub_models/models/googlenet_quantized/export.py index 44eb1ac6..df3a2329 100644 --- a/qai_hub_models/models/googlenet_quantized/export.py +++ b/qai_hub_models/models/googlenet_quantized/export.py @@ -123,9 +123,16 @@ def export_model( else: quant_calibration_data = model.get_calibration_data(target_runtime, input_spec) + # Convert outputs from channel last to channel first (preferred I/O format for QNN and TensorFlow Lite) + channel_last_flags = ( + " --force_channel_last_input image_tensor" + if target_runtime != TargetRuntime.ORT + else "" + ) + # 2. 
Compile the model to an on-device asset model_compile_options = model.get_hub_compile_options( - target_runtime, compile_options + " --force_channel_last_input image_tensor" + target_runtime, compile_options + channel_last_flags, hub_device ) print(f"Optimizing model {model_name} to run on-device") submitted_compile_job = hub.submit_compile_job( @@ -167,8 +174,12 @@ def export_model( if target_runtime == TargetRuntime.QNN: hub_inputs = get_qnn_inputs(compile_job, sample_inputs) # Convert inputs from channel first to channel last - hub_inputs = transpose_channel_first_to_last( - "image_tensor", sample_inputs, target_runtime + hub_inputs = ( + sample_inputs + if target_runtime == TargetRuntime.ORT + else transpose_channel_first_to_last( + "image_tensor", sample_inputs, target_runtime + ) ) submitted_inference_job = hub.submit_inference_job( model=compile_job.get_target_model(), @@ -207,7 +218,7 @@ def export_model( def main(): warnings.filterwarnings("ignore") - parser = export_parser(model_cls=Model, supports_ort=False) + parser = export_parser(model_cls=Model) args = parser.parse_args() export_model(**vars(args)) diff --git a/qai_hub_models/models/googlenet_quantized/perf.yaml b/qai_hub_models/models/googlenet_quantized/perf.yaml index 164b7f27..17e41ae4 100644 --- a/qai_hub_models/models/googlenet_quantized/perf.yaml +++ b/qai_hub_models/models/googlenet_quantized/perf.yaml @@ -26,6 +26,7 @@ aggregated: - Samsung Galaxy S24 Ultra - Samsung Galaxy S24+ - Samsung Galaxy Tab S8 + - Snapdragon X Elite CRD - Xiaomi 12 - Xiaomi 12 Pro supported_chipsets: @@ -36,6 +37,7 @@ aggregated: - Snapdragon® 8 Gen 2 - Snapdragon® 8 Gen 3 - Snapdragon® 888 + - Snapdragon® X Elite models: - name: GoogLeNetQuantized performance_metrics: @@ -44,7 +46,7 @@ models: throughput: 3367.003367003367 estimated_peak_memory_range: min: 12288 - max: 1529584 + max: 1659216 primary_compute_unit: NPU precision: int8 layer_info: @@ -52,14 +54,14 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 84 - job_id: jopr8nqk5 + job_id: j1glkme2p job_status: Passed torchscript_onnx_qnn: - inference_time: 346.0 - throughput: 2890.173410404624 + inference_time: 345.0 + throughput: 2898.550724637681 estimated_peak_memory_range: - min: 16384 - max: 139797592 + min: 90112 + max: 4621032 primary_compute_unit: NPU precision: int8 layer_info: @@ -67,22 +69,22 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 86 - job_id: jogk79mvp + job_id: jwgov6e15 job_status: Passed torchscript_onnx_ort: - inference_time: 756.0 - throughput: 1322.7513227513227 + inference_time: 623.0 + throughput: 1605.1364365971108 estimated_peak_memory_range: min: 12288 - max: 22997816 + max: 31466656 primary_compute_unit: NPU precision: int8 layer_info: - layers_on_npu: 1 + layers_on_npu: 94 layers_on_gpu: 0 layers_on_cpu: 0 - total_layers: 1 - job_id: j1gl61r2g + total_layers: 94 + job_id: jygz7dv4p job_status: Passed reference_device_info: name: Samsung Galaxy S23 @@ -91,13 +93,13 @@ models: os_name: Android manufacturer: Samsung chipset: Snapdragon® 8 Gen 2 - timestamp: '2024-04-23T18:42:32.237563Z' + timestamp: '2024-05-20T16:35:29.404186Z' - torchscript_onnx_tflite: - inference_time: 229.0 - throughput: 4366.812227074236 + inference_time: 214.0 + throughput: 4672.897196261682 estimated_peak_memory_range: min: 12288 - max: 32807600 + max: 33138256 primary_compute_unit: NPU precision: int8 layer_info: @@ -105,14 +107,14 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 84 - job_id: jqpyr7w05 + job_id: jw5614qnp job_status: Passed 
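Editor's note: because these perf.yaml hunks are long, a small reader can help when comparing the regenerated numbers across runtimes and devices. A minimal sketch assuming the layout shown in these hunks; PyYAML and the file path are assumptions, and failed entries (which carry the string `'null'`) are skipped via `job_status`:

```python
import yaml  # PyYAML

# Illustrative path; any of the perf.yaml files touched in this diff has the same layout.
with open("qai_hub_models/models/googlenet_quantized/perf.yaml") as f:
    perf = yaml.safe_load(f)

for model in perf["models"]:
    for entry in model["performance_metrics"]:
        device = entry["reference_device_info"]["name"]
        for key, metrics in entry.items():
            if not key.startswith("torchscript_onnx"):
                continue  # skip reference_device_info / timestamp
            if metrics.get("job_status") != "Passed":
                continue  # failed runs report 'null' metrics
            print(f"{model['name']} | {device} | {key}: "
                  f"inference_time={metrics['inference_time']}, "
                  f"throughput={metrics['throughput']}")
```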
torchscript_onnx_qnn: - inference_time: 242.0 - throughput: 4132.231404958678 + inference_time: 250.0 + throughput: 4000.0 estimated_peak_memory_range: - min: 163840 - max: 41416608 + min: 0 + max: 43090384 primary_compute_unit: NPU precision: int8 layer_info: @@ -120,22 +122,22 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 86 - job_id: jn5qemoe5 + job_id: j1pvwkzzg job_status: Passed torchscript_onnx_ort: - inference_time: 547.0 - throughput: 1828.1535648994516 + inference_time: 475.0 + throughput: 2105.2631578947367 estimated_peak_memory_range: - min: 3473408 - max: 30390976 + min: 0 + max: 26393632 primary_compute_unit: NPU precision: int8 layer_info: - layers_on_npu: 1 + layers_on_npu: 94 layers_on_gpu: 0 layers_on_cpu: 0 - total_layers: 1 - job_id: jw56edlng + total_layers: 94 + job_id: jz5w96m4p job_status: Passed reference_device_info: name: Samsung Galaxy S24 @@ -144,13 +146,28 @@ models: os_name: Android manufacturer: Samsung chipset: Snapdragon® 8 Gen 3 - timestamp: '2024-04-23T18:42:32.237614Z' + timestamp: '2024-05-20T16:35:29.404215Z' - torchscript_onnx_tflite: - inference_time: 1013.0 - throughput: 987.1668311944719 + inference_time: 297.0 + throughput: 3367.003367003367 estimated_peak_memory_range: - min: 20480 - max: 16869552 + min: 12288 + max: 1518064 + primary_compute_unit: NPU + precision: int8 + layer_info: + layers_on_npu: 84 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 84 + job_id: j1p3m0qmg + job_status: Passed + torchscript_onnx_qnn: + inference_time: 344.0 + throughput: 2906.9767441860463 + estimated_peak_memory_range: + min: 16384 + max: 100695528 primary_compute_unit: NPU precision: int8 layer_info: @@ -158,37 +175,45 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 86 - job_id: jz5708xqg + job_id: jlpevm485 job_status: Passed - torchscript_onnx_qnn: - inference_time: 'null' - throughput: 'null' + reference_device_info: + name: QCS8550 (Proxy) + os: '12' + form_factor: Iot + os_name: Android + manufacturer: Qualcomm + chipset: Qcs8550 + timestamp: '2024-05-20T16:35:29.404232Z' + - torchscript_onnx_tflite: + inference_time: 950.0 + throughput: 1052.6315789473683 estimated_peak_memory_range: - min: 0 - max: 0 - primary_compute_unit: 'null' - precision: 'null' + min: 12288 + max: 17406016 + primary_compute_unit: NPU + precision: int8 layer_info: - layers_on_npu: 0 + layers_on_npu: 84 layers_on_gpu: 0 layers_on_cpu: 0 - total_layers: 0 - job_id: jegnlwev5 - job_status: Failed - torchscript_onnx_ort: - inference_time: 10247.0 - throughput: 97.58953840148337 + total_layers: 84 + job_id: jygzrylk5 + job_status: Passed + torchscript_onnx_qnn: + inference_time: 1100.0 + throughput: 909.0909090909091 estimated_peak_memory_range: - min: 2646016 - max: 50596416 - primary_compute_unit: CPU - precision: fp32 + min: 163840 + max: 37495168 + primary_compute_unit: NPU + precision: int8 layer_info: - layers_on_npu: 0 + layers_on_npu: 86 layers_on_gpu: 0 - layers_on_cpu: 95 - total_layers: 95 - job_id: j1p3vw2mg + layers_on_cpu: 0 + total_layers: 86 + job_id: jmg9w2owp job_status: Passed reference_device_info: name: RB3 Gen 2 (Proxy) @@ -197,21 +222,21 @@ models: os_name: Android manufacturer: Qualcomm chipset: Qcs6490 - timestamp: '2024-04-23T18:42:32.237671Z' + timestamp: '2024-05-20T16:35:29.404247Z' - torchscript_onnx_tflite: - inference_time: 5919.0 - throughput: 168.94745734076702 + inference_time: 5755.0 + throughput: 173.7619461337967 estimated_peak_memory_range: min: 20480 - max: 6396208 + max: 7049192 primary_compute_unit: NPU precision: 
int8 layer_info: - layers_on_npu: 86 + layers_on_npu: 84 layers_on_gpu: 0 layers_on_cpu: 0 - total_layers: 86 - job_id: j1p8mj7z5 + total_layers: 84 + job_id: jz5wqzy65 job_status: Passed reference_device_info: name: RB5 (Proxy) @@ -220,13 +245,13 @@ models: os_name: Android manufacturer: Qualcomm chipset: Qcs8250 - timestamp: '2024-04-23T18:42:32.237693Z' - - torchscript_onnx_tflite: - inference_time: 322.0 - throughput: 3105.590062111801 + timestamp: '2024-05-20T16:35:29.404257Z' + - torchscript_onnx_qnn: + inference_time: 465.0 + throughput: 2150.537634408602 estimated_peak_memory_range: - min: 12288 - max: 2046792 + min: 540672 + max: 540672 primary_compute_unit: NPU precision: int8 layer_info: @@ -234,28 +259,43 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 86 - job_id: j0pxnzyj5 + job_id: j7gjlnk1p job_status: Passed - torchscript_onnx_qnn: - inference_time: 365.0 - throughput: 2739.72602739726 + torchscript_onnx_ort: + inference_time: 616.0 + throughput: 1623.3766233766235 estimated_peak_memory_range: - min: 634880 - max: 5391328 + min: 19083264 + max: 19083264 primary_compute_unit: NPU precision: int8 layer_info: - layers_on_npu: 88 + layers_on_npu: 94 layers_on_gpu: 0 layers_on_cpu: 0 - total_layers: 88 - job_id: jep20zmxg + total_layers: 94 + job_id: jmg94n9m5 + job_status: Passed + torchscript_onnx_ort_dml_gpu: + inference_time: 2182.0 + throughput: 458.29514207149407 + estimated_peak_memory_range: + min: 1978368 + max: 1978368 + primary_compute_unit: CPU + precision: fp32 + layer_info: + layers_on_npu: 0 + layers_on_gpu: 0 + layers_on_cpu: 94 + total_layers: 94 + job_id: jnp18zqng job_status: Passed reference_device_info: - name: QCS8550 (Proxy) - os: '12' - form_factor: Iot - os_name: Android + name: Snapdragon X Elite CRD + os: '11' + form_factor: Compute + os_name: Windows manufacturer: Qualcomm - chipset: Qcs8550 - timestamp: '2024-04-23T18:42:32.237731Z' + chipset: Snapdragon® X Elite + timestamp: '2024-05-20T16:35:29.404279Z' diff --git a/qai_hub_models/models/hrnet_pose/README.md b/qai_hub_models/models/hrnet_pose/README.md index 60809ac6..1291e266 100644 --- a/qai_hub_models/models/hrnet_pose/README.md +++ b/qai_hub_models/models/hrnet_pose/README.md @@ -14,6 +14,8 @@ accross various devices, can be found [here](https://aihub.qualcomm.com/models/h a hosted Qualcomm® device. 
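Editor's note: the regenerated perf.yaml files (GoogLeNetQuantized just above, and the others in this diff) also gain `torchscript_onnx_ort_dml_gpu` entries measured on the Windows-based Snapdragon X Elite CRD; the label points to ONNX Runtime with the DirectML execution provider. For readers who want to reproduce such a run locally, a minimal sketch — the model file, input shape, and availability of an ONNX Runtime build with DirectML support are assumptions, not part of this diff:

```python
import numpy as np
import onnxruntime as ort

# Assumes an onnxruntime build with DirectML support and an exported ONNX model file.
session = ort.InferenceSession("model.onnx", providers=["DmlExecutionProvider"])
input_name = session.get_inputs()[0].name
dummy = np.zeros((1, 3, 224, 224), dtype=np.float32)  # illustrative NCHW shape; adjust to the model
outputs = session.run(None, {input_name: dummy})
print(session.get_providers(), [o.shape for o in outputs])
```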
+ + ## Example & Usage Install the package via pip: diff --git a/qai_hub_models/models/hrnet_pose/app.py b/qai_hub_models/models/hrnet_pose/app.py index 7f500e96..7f97c3a7 100644 --- a/qai_hub_models/models/hrnet_pose/app.py +++ b/qai_hub_models/models/hrnet_pose/app.py @@ -200,6 +200,6 @@ def predict_pose_keypoints( predicted_images = [] for i, img in enumerate(NHWC_int_numpy_frames): - draw_points(img, keypoints[i], color=(255, 0, 0), size=2) + draw_points(img, keypoints[i], color=(255, 0, 0), size=6) predicted_images.append(fromarray(img)) return predicted_images diff --git a/qai_hub_models/models/hrnet_pose/export.py b/qai_hub_models/models/hrnet_pose/export.py index 7a1669ca..f8c0f803 100644 --- a/qai_hub_models/models/hrnet_pose/export.py +++ b/qai_hub_models/models/hrnet_pose/export.py @@ -120,12 +120,16 @@ def export_model( model.eval() source_model = torch.jit.trace(model.to("cpu"), make_torch_inputs(input_spec)) + # Convert outputs from channel last to channel first (preferred I/O format for QNN and TensorFlow Lite) + channel_last_flags = ( + " --force_channel_last_input image" + " --force_channel_last_output output_0" + if target_runtime != TargetRuntime.ORT + else "" + ) + # 2. Compile the model to an on-device asset model_compile_options = model.get_hub_compile_options( - target_runtime, - compile_options - + " --force_channel_last_input image" - + " --force_channel_last_output output_0", + target_runtime, compile_options + channel_last_flags, hub_device ) print(f"Optimizing model {model_name} to run on-device") submitted_compile_job = hub.submit_compile_job( @@ -163,8 +167,10 @@ def export_model( ) sample_inputs = model.sample_inputs(input_spec) # Convert inputs from channel first to channel last - hub_inputs = transpose_channel_first_to_last( - "image", sample_inputs, target_runtime + hub_inputs = ( + sample_inputs + if target_runtime == TargetRuntime.ORT + else transpose_channel_first_to_last("image", sample_inputs, target_runtime) ) submitted_inference_job = hub.submit_inference_job( model=compile_job.get_target_model(), @@ -192,8 +198,12 @@ def export_model( assert inference_job is not None and inference_job.wait().success inference_result: hub.client.DatasetEntries = inference_job.download_output_data() # type: ignore # Convert outputs from channel last to channel first - inference_result = transpose_channel_last_to_first( - "output_0", inference_result, target_runtime + inference_result = ( + inference_result + if target_runtime == TargetRuntime.ORT + else transpose_channel_last_to_first( + "output_0", inference_result, target_runtime + ) ) print_inference_metrics(inference_job, inference_result, torch_out) @@ -205,7 +215,7 @@ def export_model( def main(): warnings.filterwarnings("ignore") - parser = export_parser(model_cls=Model, supports_ort=False) + parser = export_parser(model_cls=Model) args = parser.parse_args() export_model(**vars(args)) diff --git a/qai_hub_models/models/hrnet_pose/info.yaml b/qai_hub_models/models/hrnet_pose/info.yaml index bf4f1b15..9220f62f 100644 --- a/qai_hub_models/models/hrnet_pose/info.yaml +++ b/qai_hub_models/models/hrnet_pose/info.yaml @@ -16,7 +16,7 @@ source_repo: https://github.com/quic/aimet-model-zoo/tree/develop/aimet_zoo_torch/hrnet_posenet technical_details: Model checkpoint: hrnet_posenet_FP32_state_dict - Input resolution: 192x256 + Input resolution: 256x192 Number of parameters: 28.5M Model size: 109 MB applicable_scenarios: @@ -29,7 +29,7 @@ form_factors: - IoT related_models: [litehrnet, openpose] has_static_banner: yes 
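Editor's note: the hrnet_pose info.yaml hunk above corrects the stated input resolution from 192x256 to 256x192, matching the standard HRNet pose input of 256 (height) by 192 (width). Expressed as an input spec in the style this repo uses elsewhere (the exact dict below is illustrative, not taken from this diff):

```python
# Hypothetical input spec for HRNetPose: NCHW with height 256 and width 192.
input_spec = {"image": ((1, 3, 256, 192), "float32")}
```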
-has_animated_banner: no +has_animated_banner: yes license_type: other deploy_license_type: AI Model Hub License dataset: [] diff --git a/qai_hub_models/models/hrnet_pose/model.py b/qai_hub_models/models/hrnet_pose/model.py index 592ec066..c4d3c102 100644 --- a/qai_hub_models/models/hrnet_pose/model.py +++ b/qai_hub_models/models/hrnet_pose/model.py @@ -21,7 +21,7 @@ from qai_hub_models.utils.input_spec import InputSpec MODEL_ID = __name__.split(".")[-2] -MODEL_ASSET_VERSION = 1 +MODEL_ASSET_VERSION = 2 # This model originally comes from https://github.com/leoxiaobin/deep-high-resolution-net.pytorch # but we'll use the weights from AIMET # Weights and config stored in S3 are sourced from diff --git a/qai_hub_models/models/hrnet_pose/perf.yaml b/qai_hub_models/models/hrnet_pose/perf.yaml index 7e98ec4d..50386875 100644 --- a/qai_hub_models/models/hrnet_pose/perf.yaml +++ b/qai_hub_models/models/hrnet_pose/perf.yaml @@ -22,6 +22,7 @@ aggregated: - Samsung Galaxy S24 Ultra - Samsung Galaxy S24+ - Samsung Galaxy Tab S8 + - Snapdragon X Elite CRD - Xiaomi 12 - Xiaomi 12 Pro supported_chipsets: @@ -30,53 +31,54 @@ aggregated: - Snapdragon® 8 Gen 2 - Snapdragon® 8 Gen 3 - Snapdragon® 888 + - Snapdragon® X Elite models: - name: HRNetPose performance_metrics: - torchscript_onnx_tflite: - inference_time: 2289.0 - throughput: 436.871996505024 + inference_time: 2818.0 + throughput: 354.86160397444996 estimated_peak_memory_range: - min: 16384 - max: 2655344 + min: 28672 + max: 2913312 primary_compute_unit: NPU precision: fp16 layer_info: - layers_on_npu: 514 + layers_on_npu: 516 layers_on_gpu: 0 layers_on_cpu: 0 - total_layers: 514 - job_id: jwgok4q1p + total_layers: 516 + job_id: jvgdv176g job_status: Passed torchscript_onnx_qnn: - inference_time: 2297.0 - throughput: 435.35045711798 + inference_time: 2886.0 + throughput: 346.5003465003465 estimated_peak_memory_range: - min: 12288 - max: 59340792 + min: 16384 + max: 20957856 primary_compute_unit: NPU precision: fp16 layer_info: - layers_on_npu: 745 + layers_on_npu: 747 layers_on_gpu: 0 layers_on_cpu: 0 - total_layers: 745 - job_id: j7gjzw415 + total_layers: 747 + job_id: j0px1oe8g job_status: Passed torchscript_onnx_ort: - inference_time: 3007.0 - throughput: 332.5573661456601 + inference_time: 3134.0 + throughput: 319.0810465858328 estimated_peak_memory_range: min: 0 - max: 148641888 + max: 128298872 primary_compute_unit: NPU precision: fp16 layer_info: - layers_on_npu: 1 + layers_on_npu: 749 layers_on_gpu: 0 layers_on_cpu: 0 - total_layers: 1 - job_id: jygzo4k45 + total_layers: 749 + job_id: jep2my365 job_status: Passed reference_device_info: name: Samsung Galaxy S23 @@ -85,51 +87,51 @@ models: os_name: Android manufacturer: Samsung chipset: Snapdragon® 8 Gen 2 - timestamp: '2024-04-23T18:42:32.272940Z' + timestamp: '2024-05-20T16:35:29.443668Z' - torchscript_onnx_tflite: - inference_time: 1753.0 - throughput: 570.4506560182544 + inference_time: 2065.0 + throughput: 484.26150121065376 estimated_peak_memory_range: - min: 225280 - max: 107290736 + min: 12288 + max: 109086992 primary_compute_unit: NPU precision: fp16 layer_info: - layers_on_npu: 514 + layers_on_npu: 516 layers_on_gpu: 0 layers_on_cpu: 0 - total_layers: 514 - job_id: j1pv09xz5 + total_layers: 516 + job_id: jz57drvn5 job_status: Passed torchscript_onnx_qnn: - inference_time: 1719.0 - throughput: 581.7335660267597 + inference_time: 2134.0 + throughput: 468.6035613870665 estimated_peak_memory_range: - min: 606208 - max: 177224704 + min: 0 + max: 189704832 primary_compute_unit: NPU 
precision: fp16 layer_info: - layers_on_npu: 745 + layers_on_npu: 747 layers_on_gpu: 0 layers_on_cpu: 0 - total_layers: 745 - job_id: jlpeel38p + total_layers: 747 + job_id: jo5mzxv7p job_status: Passed torchscript_onnx_ort: - inference_time: 2250.0 - throughput: 444.44444444444446 + inference_time: 2215.0 + throughput: 451.46726862302484 estimated_peak_memory_range: min: 12288 - max: 81136704 + max: 93863680 primary_compute_unit: NPU precision: fp16 layer_info: - layers_on_npu: 1 + layers_on_npu: 749 layers_on_gpu: 0 layers_on_cpu: 0 - total_layers: 1 - job_id: jz5w21n45 + total_layers: 749 + job_id: jqpyd3v0p job_status: Passed reference_device_info: name: Samsung Galaxy S24 @@ -138,36 +140,36 @@ models: os_name: Android manufacturer: Samsung chipset: Snapdragon® 8 Gen 3 - timestamp: '2024-04-23T18:42:32.273098Z' + timestamp: '2024-05-20T16:35:29.443693Z' - torchscript_onnx_tflite: - inference_time: 2294.0 - throughput: 435.9197907585004 + inference_time: 2881.0 + throughput: 347.1017007983339 estimated_peak_memory_range: - min: 16384 - max: 3533472 + min: 24576 + max: 4152200 primary_compute_unit: NPU precision: fp16 layer_info: - layers_on_npu: 514 + layers_on_npu: 516 layers_on_gpu: 0 layers_on_cpu: 0 - total_layers: 514 - job_id: jogk7kyyp + total_layers: 516 + job_id: jqp4wrj2g job_status: Passed torchscript_onnx_qnn: - inference_time: 2291.0 - throughput: 436.4906154517678 + inference_time: 2909.0 + throughput: 343.7607425232039 estimated_peak_memory_range: - min: 610304 - max: 59474648 + min: 630784 + max: 16131888 primary_compute_unit: NPU precision: fp16 layer_info: - layers_on_npu: 745 + layers_on_npu: 747 layers_on_gpu: 0 layers_on_cpu: 0 - total_layers: 745 - job_id: j1p3vrmxg + total_layers: 747 + job_id: jopry31kg job_status: Passed reference_device_info: name: QCS8550 (Proxy) @@ -176,4 +178,57 @@ models: os_name: Android manufacturer: Qualcomm chipset: Qcs8550 - timestamp: '2024-04-23T18:42:32.273247Z' + timestamp: '2024-05-20T16:35:29.443710Z' + - torchscript_onnx_qnn: + inference_time: 3156.0 + throughput: 316.85678073510775 + estimated_peak_memory_range: + min: 589824 + max: 589824 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 747 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 747 + job_id: jegnevrjg + job_status: Passed + torchscript_onnx_ort: + inference_time: 2975.0 + throughput: 336.1344537815126 + estimated_peak_memory_range: + min: 54882304 + max: 54882304 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 749 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 749 + job_id: j2p0r0k0p + job_status: Passed + torchscript_onnx_ort_dml_gpu: + inference_time: 19453.0 + throughput: 51.40595280933532 + estimated_peak_memory_range: + min: 37265408 + max: 37265408 + primary_compute_unit: GPU + precision: fp16 + layer_info: + layers_on_npu: 0 + layers_on_gpu: 1 + layers_on_cpu: 0 + total_layers: 1 + job_id: j1p87y8q5 + job_status: Passed + reference_device_info: + name: Snapdragon X Elite CRD + os: '11' + form_factor: Compute + os_name: Windows + manufacturer: Qualcomm + chipset: Snapdragon® X Elite + timestamp: '2024-05-20T16:35:29.443733Z' diff --git a/qai_hub_models/models/huggingface_wavlm_base_plus/README.md b/qai_hub_models/models/huggingface_wavlm_base_plus/README.md index 3e9e7062..fc585abc 100644 --- a/qai_hub_models/models/huggingface_wavlm_base_plus/README.md +++ b/qai_hub_models/models/huggingface_wavlm_base_plus/README.md @@ -14,6 +14,8 @@ accross various devices, can be found 
[here](https://aihub.qualcomm.com/models/h a hosted Qualcomm® device. + + ## Example & Usage Install the package via pip: diff --git a/qai_hub_models/models/huggingface_wavlm_base_plus/export.py b/qai_hub_models/models/huggingface_wavlm_base_plus/export.py index 170b15f0..44c79380 100644 --- a/qai_hub_models/models/huggingface_wavlm_base_plus/export.py +++ b/qai_hub_models/models/huggingface_wavlm_base_plus/export.py @@ -119,7 +119,7 @@ def export_model( # 2. Compile the model to an on-device asset model_compile_options = model.get_hub_compile_options( - target_runtime, compile_options + target_runtime, compile_options, hub_device ) print(f"Optimizing model {model_name} to run on-device") submitted_compile_job = hub.submit_compile_job( diff --git a/qai_hub_models/models/huggingface_wavlm_base_plus/model.py b/qai_hub_models/models/huggingface_wavlm_base_plus/model.py index 5074e78e..f476aa98 100644 --- a/qai_hub_models/models/huggingface_wavlm_base_plus/model.py +++ b/qai_hub_models/models/huggingface_wavlm_base_plus/model.py @@ -82,7 +82,12 @@ def get_hub_profile_options( profile_options = super().get_hub_profile_options( target_runtime, other_profile_options ) - return profile_options + " --compute_unit cpu" + if ( + target_runtime == TargetRuntime.TFLITE + and "--compute_unit" not in profile_options + ): + profile_options = profile_options + " --compute_unit gpu" + return profile_options # Modules used to override Huggingface WavLM to be NPU friendly diff --git a/qai_hub_models/models/huggingface_wavlm_base_plus/perf.yaml b/qai_hub_models/models/huggingface_wavlm_base_plus/perf.yaml index ed9b36af..63c58551 100644 --- a/qai_hub_models/models/huggingface_wavlm_base_plus/perf.yaml +++ b/qai_hub_models/models/huggingface_wavlm_base_plus/perf.yaml @@ -22,6 +22,7 @@ aggregated: - Samsung Galaxy S24 Ultra - Samsung Galaxy S24+ - Samsung Galaxy Tab S8 + - Snapdragon X Elite CRD - Xiaomi 12 - Xiaomi 12 Pro supported_chipsets: @@ -30,15 +31,16 @@ aggregated: - Snapdragon® 8 Gen 2 - Snapdragon® 8 Gen 3 - Snapdragon® 888 + - Snapdragon® X Elite models: - name: HuggingFace-WavLM-Base-Plus performance_metrics: - torchscript_onnx_tflite: - inference_time: 884463.0 - throughput: 1.1306295458374178 + inference_time: 938575.0 + throughput: 1.0654449564499373 estimated_peak_memory_range: - min: 149233664 - max: 152668384 + min: 130052096 + max: 143676568 primary_compute_unit: CPU precision: fp32 layer_info: @@ -46,23 +48,38 @@ models: layers_on_gpu: 0 layers_on_cpu: 811 total_layers: 811 - job_id: jo5mqdy7p + job_id: jmg94n8m5 job_status: Passed + torchscript_onnx_qnn: + inference_time: 'null' + throughput: 'null' + estimated_peak_memory_range: + min: 0 + max: 0 + primary_compute_unit: 'null' + precision: 'null' + layer_info: + layers_on_npu: 0 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 0 + job_id: jz57dr6n5 + job_status: Failed torchscript_onnx_ort: - inference_time: 613080.0 - throughput: 1.631108501337509 + inference_time: 'null' + throughput: 'null' estimated_peak_memory_range: - min: 16220160 - max: 44091568 - primary_compute_unit: CPU - precision: fp32 + min: 0 + max: 0 + primary_compute_unit: 'null' + precision: 'null' layer_info: layers_on_npu: 0 layers_on_gpu: 0 - layers_on_cpu: 484 - total_layers: 484 - job_id: jopr8njk5 - job_status: Passed + layers_on_cpu: 0 + total_layers: 0 + job_id: jo5mzx47p + job_status: Failed reference_device_info: name: Samsung Galaxy S23 os: '13' @@ -70,13 +87,13 @@ models: os_name: Android manufacturer: Samsung chipset: Snapdragon® 8 Gen 2 - timestamp: 
'2024-04-23T18:42:32.343742Z' + timestamp: '2024-05-20T16:35:29.513720Z' - torchscript_onnx_tflite: - inference_time: 789013.0 - throughput: 1.2674062404548467 + inference_time: 852446.0 + throughput: 1.173094835332678 estimated_peak_memory_range: - min: 148623360 - max: 174462192 + min: 148041728 + max: 183065760 primary_compute_unit: CPU precision: fp32 layer_info: @@ -84,23 +101,38 @@ models: layers_on_gpu: 0 layers_on_cpu: 811 total_layers: 811 - job_id: jegnl78j5 + job_id: jnp18z3ng job_status: Passed + torchscript_onnx_qnn: + inference_time: 'null' + throughput: 'null' + estimated_peak_memory_range: + min: 0 + max: 0 + primary_compute_unit: 'null' + precision: 'null' + layer_info: + layers_on_npu: 0 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 0 + job_id: jqp4wr82g + job_status: Failed torchscript_onnx_ort: - inference_time: 513891.0 - throughput: 1.9459379518224682 + inference_time: 'null' + throughput: 'null' estimated_peak_memory_range: - min: 995328 - max: 204911264 - primary_compute_unit: CPU - precision: fp32 + min: 0 + max: 0 + primary_compute_unit: 'null' + precision: 'null' layer_info: layers_on_npu: 0 layers_on_gpu: 0 - layers_on_cpu: 484 - total_layers: 484 - job_id: jep20vn6g - job_status: Passed + layers_on_cpu: 0 + total_layers: 0 + job_id: jegnevxjg + job_status: Failed reference_device_info: name: Samsung Galaxy S24 os: '14' @@ -108,13 +140,13 @@ models: os_name: Android manufacturer: Samsung chipset: Snapdragon® 8 Gen 3 - timestamp: '2024-04-23T18:42:32.343896Z' + timestamp: '2024-05-20T16:35:29.513746Z' - torchscript_onnx_tflite: - inference_time: 928773.0 - throughput: 1.0766893525113241 + inference_time: 867664.0 + throughput: 1.1525198694425491 estimated_peak_memory_range: - min: 150151168 - max: 158231104 + min: 149274624 + max: 152991232 primary_compute_unit: CPU precision: fp32 layer_info: @@ -122,8 +154,23 @@ models: layers_on_gpu: 0 layers_on_cpu: 811 total_layers: 811 - job_id: jqp4k2wqg + job_id: jvgdv106g job_status: Passed + torchscript_onnx_qnn: + inference_time: 'null' + throughput: 'null' + estimated_peak_memory_range: + min: 0 + max: 0 + primary_compute_unit: 'null' + precision: 'null' + layer_info: + layers_on_npu: 0 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 0 + job_id: j0px1om8g + job_status: Failed reference_device_info: name: QCS8550 (Proxy) os: '12' @@ -131,4 +178,42 @@ models: os_name: Android manufacturer: Qualcomm chipset: Qcs8550 - timestamp: '2024-04-23T18:42:32.343990Z' + timestamp: '2024-05-20T16:35:29.513762Z' + - torchscript_onnx_ort: + inference_time: 'null' + throughput: 'null' + estimated_peak_memory_range: + min: 0 + max: 0 + primary_compute_unit: 'null' + precision: 'null' + layer_info: + layers_on_npu: 0 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 0 + job_id: jopry39kg + job_status: Failed + torchscript_onnx_ort_dml_gpu: + inference_time: 'null' + throughput: 'null' + estimated_peak_memory_range: + min: 0 + max: 0 + primary_compute_unit: 'null' + precision: 'null' + layer_info: + layers_on_npu: 0 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 0 + job_id: jep2myj65 + job_status: Failed + reference_device_info: + name: Snapdragon X Elite CRD + os: '11' + form_factor: Compute + os_name: Windows + manufacturer: Qualcomm + chipset: Snapdragon® X Elite + timestamp: '2024-05-20T16:35:29.513781Z' diff --git a/qai_hub_models/models/inception_v3/README.md b/qai_hub_models/models/inception_v3/README.md index 33e4e2ca..0b085c5e 100644 --- a/qai_hub_models/models/inception_v3/README.md +++ 
b/qai_hub_models/models/inception_v3/README.md @@ -14,6 +14,8 @@ accross various devices, can be found [here](https://aihub.qualcomm.com/models/i a hosted Qualcomm® device. + + ## Example & Usage diff --git a/qai_hub_models/models/inception_v3/export.py b/qai_hub_models/models/inception_v3/export.py index 94f8800d..e3919318 100644 --- a/qai_hub_models/models/inception_v3/export.py +++ b/qai_hub_models/models/inception_v3/export.py @@ -119,9 +119,16 @@ def export_model( model.eval() source_model = torch.jit.trace(model.to("cpu"), make_torch_inputs(input_spec)) + # Convert outputs from channel last to channel first (preferred I/O format for QNN and TensorFlow Lite) + channel_last_flags = ( + " --force_channel_last_input image_tensor" + if target_runtime != TargetRuntime.ORT + else "" + ) + # 2. Compile the model to an on-device asset model_compile_options = model.get_hub_compile_options( - target_runtime, compile_options + " --force_channel_last_input image_tensor" + target_runtime, compile_options + channel_last_flags, hub_device ) print(f"Optimizing model {model_name} to run on-device") submitted_compile_job = hub.submit_compile_job( @@ -159,8 +166,12 @@ def export_model( ) sample_inputs = model.sample_inputs(input_spec) # Convert inputs from channel first to channel last - hub_inputs = transpose_channel_first_to_last( - "image_tensor", sample_inputs, target_runtime + hub_inputs = ( + sample_inputs + if target_runtime == TargetRuntime.ORT + else transpose_channel_first_to_last( + "image_tensor", sample_inputs, target_runtime + ) ) submitted_inference_job = hub.submit_inference_job( model=compile_job.get_target_model(), @@ -197,7 +208,7 @@ def export_model( def main(): warnings.filterwarnings("ignore") - parser = export_parser(model_cls=Model, supports_ort=False) + parser = export_parser(model_cls=Model) args = parser.parse_args() export_model(**vars(args)) diff --git a/qai_hub_models/models/inception_v3/perf.yaml b/qai_hub_models/models/inception_v3/perf.yaml index e11be443..4d8aab78 100644 --- a/qai_hub_models/models/inception_v3/perf.yaml +++ b/qai_hub_models/models/inception_v3/perf.yaml @@ -22,6 +22,7 @@ aggregated: - Samsung Galaxy S24 Ultra - Samsung Galaxy S24+ - Samsung Galaxy Tab S8 + - Snapdragon X Elite CRD - Xiaomi 12 - Xiaomi 12 Pro supported_chipsets: @@ -30,15 +31,16 @@ aggregated: - Snapdragon® 8 Gen 2 - Snapdragon® 8 Gen 3 - Snapdragon® 888 + - Snapdragon® X Elite models: - name: Inception-v3 performance_metrics: - torchscript_onnx_tflite: - inference_time: 1337.0 - throughput: 747.9431563201197 + inference_time: 1342.0 + throughput: 745.156482861401 estimated_peak_memory_range: - min: 20480 - max: 2064624 + min: 12288 + max: 1685032 primary_compute_unit: NPU precision: fp16 layer_info: @@ -46,14 +48,14 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 129 - job_id: j2p03600p + job_id: jqpyd3n0p job_status: Passed torchscript_onnx_qnn: - inference_time: 1396.0 - throughput: 716.3323782234957 + inference_time: 1414.0 + throughput: 707.2135785007072 estimated_peak_memory_range: min: 16384 - max: 150190256 + max: 149750296 primary_compute_unit: NPU precision: fp16 layer_info: @@ -61,22 +63,22 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 219 - job_id: jogk78xvp + job_id: jogkyxovp job_status: Passed torchscript_onnx_ort: - inference_time: 1728.0 - throughput: 578.7037037037037 + inference_time: 1719.0 + throughput: 581.7335660267597 estimated_peak_memory_range: - min: 57344 - max: 214567960 + min: 12288 + max: 214330432 primary_compute_unit: NPU 
precision: fp16 layer_info: - layers_on_npu: 1 + layers_on_npu: 221 layers_on_gpu: 0 layers_on_cpu: 0 - total_layers: 1 - job_id: j1gl6lm2g + total_layers: 221 + job_id: j1p3m0xmg job_status: Passed reference_device_info: name: Samsung Galaxy S23 @@ -85,13 +87,13 @@ models: os_name: Android manufacturer: Samsung chipset: Snapdragon® 8 Gen 2 - timestamp: '2024-04-23T18:42:32.362074Z' + timestamp: '2024-05-20T16:35:29.537865Z' - torchscript_onnx_tflite: - inference_time: 1019.0 - throughput: 981.3542688910696 + inference_time: 1013.0 + throughput: 987.1668311944719 estimated_peak_memory_range: - min: 12288 - max: 51945968 + min: 16384 + max: 52159904 primary_compute_unit: NPU precision: fp16 layer_info: @@ -99,14 +101,14 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 129 - job_id: j1p801yqg + job_id: j2p0r0d0p job_status: Passed torchscript_onnx_qnn: - inference_time: 1044.0 - throughput: 957.8544061302682 + inference_time: 1043.0 + throughput: 958.7727708533077 estimated_peak_memory_range: - min: 618496 - max: 62186832 + min: 0 + max: 66127216 primary_compute_unit: NPU precision: fp16 layer_info: @@ -114,22 +116,22 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 219 - job_id: jn5qevqe5 + job_id: jn5q2qze5 job_status: Passed torchscript_onnx_ort: - inference_time: 1343.0 - throughput: 744.6016381236038 + inference_time: 1333.0 + throughput: 750.1875468867216 estimated_peak_memory_range: min: 618496 - max: 25688304 + max: 28967744 primary_compute_unit: NPU precision: fp16 layer_info: - layers_on_npu: 1 + layers_on_npu: 221 layers_on_gpu: 0 layers_on_cpu: 0 - total_layers: 1 - job_id: jw56ew4ng + total_layers: 221 + job_id: jwgov6o15 job_status: Passed reference_device_info: name: Samsung Galaxy S24 @@ -138,13 +140,13 @@ models: os_name: Android manufacturer: Samsung chipset: Snapdragon® 8 Gen 3 - timestamp: '2024-04-23T18:42:32.362144Z' + timestamp: '2024-05-20T16:35:29.537891Z' - torchscript_onnx_tflite: - inference_time: 1335.0 - throughput: 749.0636704119851 + inference_time: 1352.0 + throughput: 739.6449704142012 estimated_peak_memory_range: - min: 24576 - max: 1812440 + min: 16384 + max: 2133976 primary_compute_unit: NPU precision: fp16 layer_info: @@ -152,14 +154,14 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 129 - job_id: jogk7klyp + job_id: j1p87y6q5 job_status: Passed torchscript_onnx_qnn: - inference_time: 1397.0 - throughput: 715.8196134574088 + inference_time: 1421.0 + throughput: 703.7297677691766 estimated_peak_memory_range: - min: 36864 - max: 150659520 + min: 20480 + max: 150041024 primary_compute_unit: NPU precision: fp16 layer_info: @@ -167,7 +169,7 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 219 - job_id: j1p3vr4xg + job_id: jw5614rnp job_status: Passed reference_device_info: name: QCS8550 (Proxy) @@ -176,4 +178,57 @@ models: os_name: Android manufacturer: Qualcomm chipset: Qcs8550 - timestamp: '2024-04-23T18:42:32.362196Z' + timestamp: '2024-05-20T16:35:29.537908Z' + - torchscript_onnx_qnn: + inference_time: 1636.0 + throughput: 611.2469437652812 + estimated_peak_memory_range: + min: 602112 + max: 602112 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 219 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 219 + job_id: j1glkmo2p + job_status: Passed + torchscript_onnx_ort: + inference_time: 1677.0 + throughput: 596.3029218843172 + estimated_peak_memory_range: + min: 48324608 + max: 48324608 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 221 + layers_on_gpu: 0 
+ layers_on_cpu: 0 + total_layers: 221 + job_id: j1pvwkezg + job_status: Passed + torchscript_onnx_ort_dml_gpu: + inference_time: 12033.0 + throughput: 83.10479514667996 + estimated_peak_memory_range: + min: 26181632 + max: 26181632 + primary_compute_unit: CPU + precision: fp32 + layer_info: + layers_on_npu: 0 + layers_on_gpu: 0 + layers_on_cpu: 123 + total_layers: 123 + job_id: j7gjlno1p + job_status: Passed + reference_device_info: + name: Snapdragon X Elite CRD + os: '11' + form_factor: Compute + os_name: Windows + manufacturer: Qualcomm + chipset: Snapdragon® X Elite + timestamp: '2024-05-20T16:35:29.537932Z' diff --git a/qai_hub_models/models/inception_v3_quantized/README.md b/qai_hub_models/models/inception_v3_quantized/README.md index ee26f496..a0f99c07 100644 --- a/qai_hub_models/models/inception_v3_quantized/README.md +++ b/qai_hub_models/models/inception_v3_quantized/README.md @@ -3,7 +3,7 @@ # [Inception-v3-Quantized: Quantized Imagenet classifier and general purpose backbone](https://aihub.qualcomm.com/models/inception_v3_quantized) -InceptionNetV3 is a machine learning model that can classify images from the Imagenet dataset. It can also be used as a backbone in building more complex models for specific use cases. This model is post-training quantized to int8 using samples from [Google's open images dataset](https://storage.googleapis.com/openimages/web/index.html). +InceptionNetV3 is a machine learning model that can classify images from the Imagenet dataset. It can also be used as a backbone in building more complex models for specific use cases. This model is post-training quantized to int8 using samples from Google's open images dataset. This is based on the implementation of Inception-v3-Quantized found [here](https://github.com/pytorch/vision/blob/main/torchvision/models/inception.py). This repository contains scripts for optimized on-device @@ -14,6 +14,8 @@ accross various devices, can be found [here](https://aihub.qualcomm.com/models/i a hosted Qualcomm® device. + + ## Example & Usage diff --git a/qai_hub_models/models/inception_v3_quantized/export.py b/qai_hub_models/models/inception_v3_quantized/export.py index c6c03ade..f5eab10b 100644 --- a/qai_hub_models/models/inception_v3_quantized/export.py +++ b/qai_hub_models/models/inception_v3_quantized/export.py @@ -123,9 +123,16 @@ def export_model( else: quant_calibration_data = model.get_calibration_data(target_runtime, input_spec) + # Convert outputs from channel last to channel first (preferred I/O format for QNN and TensorFlow Lite) + channel_last_flags = ( + " --force_channel_last_input image_tensor" + if target_runtime != TargetRuntime.ORT + else "" + ) + # 2. 
Compile the model to an on-device asset model_compile_options = model.get_hub_compile_options( - target_runtime, compile_options + " --force_channel_last_input image_tensor" + target_runtime, compile_options + channel_last_flags, hub_device ) print(f"Optimizing model {model_name} to run on-device") submitted_compile_job = hub.submit_compile_job( @@ -167,8 +174,12 @@ def export_model( if target_runtime == TargetRuntime.QNN: hub_inputs = get_qnn_inputs(compile_job, sample_inputs) # Convert inputs from channel first to channel last - hub_inputs = transpose_channel_first_to_last( - "image_tensor", sample_inputs, target_runtime + hub_inputs = ( + sample_inputs + if target_runtime == TargetRuntime.ORT + else transpose_channel_first_to_last( + "image_tensor", sample_inputs, target_runtime + ) ) submitted_inference_job = hub.submit_inference_job( model=compile_job.get_target_model(), @@ -207,7 +218,7 @@ def export_model( def main(): warnings.filterwarnings("ignore") - parser = export_parser(model_cls=Model, supports_ort=False) + parser = export_parser(model_cls=Model) args = parser.parse_args() export_model(**vars(args)) diff --git a/qai_hub_models/models/inception_v3_quantized/info.yaml b/qai_hub_models/models/inception_v3_quantized/info.yaml index 06f3bf87..c3d40275 100644 --- a/qai_hub_models/models/inception_v3_quantized/info.yaml +++ b/qai_hub_models/models/inception_v3_quantized/info.yaml @@ -7,7 +7,7 @@ domain: Computer Vision description: InceptionNetV3 is a machine learning model that can classify images from the Imagenet dataset. It can also be used as a backbone in building more complex models for specific use cases. This model is post-training quantized to int8 using - samples from [Google's open images dataset](https://storage.googleapis.com/openimages/web/index.html). + samples from Google's open images dataset. 
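An editorial note on the channel-ordering changes above: the updated `export.py` scripts now build the extra compile options conditionally, so channel-last I/O is only forced (and sample inputs only transposed) for QNN and TensorFlow Lite, while ONNX Runtime (ORT) keeps channel-first tensors. Below is a minimal, self-contained sketch of that rule; the class and helper names are illustrative stand-ins that mirror identifiers appearing in the diff (`TargetRuntime.ORT`, `--force_channel_last_input`), not the repository's actual implementation.

```python
from enum import Enum

import numpy as np


class TargetRuntime(Enum):  # placeholder mirroring the TargetRuntime enum used in export.py
    TFLITE = "tflite"
    QNN = "qnn"
    ORT = "ort"


def channel_last_flags(target_runtime: TargetRuntime, input_names: str) -> str:
    """Extra compile options: empty for ORT, channel-last forcing for QNN/TFLite."""
    if target_runtime == TargetRuntime.ORT:
        return ""
    return f" --force_channel_last_input {input_names}"


def prepare_inputs(target_runtime: TargetRuntime, nchw: np.ndarray) -> np.ndarray:
    """Transpose NCHW -> NHWC for QNN/TFLite; pass inputs through unchanged for ORT."""
    if target_runtime == TargetRuntime.ORT:
        return nchw
    return nchw.transpose(0, 2, 3, 1)


image = np.zeros((1, 3, 224, 224), dtype=np.float32)
assert prepare_inputs(TargetRuntime.QNN, image).shape == (1, 224, 224, 3)
assert prepare_inputs(TargetRuntime.ORT, image).shape == (1, 3, 224, 224)
assert channel_last_flags(TargetRuntime.TFLITE, "image_tensor").strip() == "--force_channel_last_input image_tensor"
```

The same conditional pattern repeats later in this patch for models with multiple tensors (for example `image,mask` plus `--force_channel_last_output output_0` in lama_dilated), with ORT always taking the pass-through branch.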
use_case: Image Classification tags: - backbone diff --git a/qai_hub_models/models/inception_v3_quantized/perf.yaml b/qai_hub_models/models/inception_v3_quantized/perf.yaml index 0ab60e20..04c24471 100644 --- a/qai_hub_models/models/inception_v3_quantized/perf.yaml +++ b/qai_hub_models/models/inception_v3_quantized/perf.yaml @@ -26,6 +26,7 @@ aggregated: - Samsung Galaxy S24 Ultra - Samsung Galaxy S24+ - Samsung Galaxy Tab S8 + - Snapdragon X Elite CRD - Xiaomi 12 - Xiaomi 12 Pro supported_chipsets: @@ -36,15 +37,16 @@ aggregated: - Snapdragon® 8 Gen 2 - Snapdragon® 8 Gen 3 - Snapdragon® 888 + - Snapdragon® X Elite models: - name: Inception-v3-Quantized performance_metrics: - torchscript_onnx_tflite: - inference_time: 623.0 - throughput: 1605.1364365971108 + inference_time: 615.0 + throughput: 1626.0162601626016 estimated_peak_memory_range: - min: 40960 - max: 1585824 + min: 20480 + max: 1835968 primary_compute_unit: NPU precision: int8 layer_info: @@ -52,22 +54,37 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 144 - job_id: jwgok861p + job_id: jlpevm885 + job_status: Passed + torchscript_onnx_qnn: + inference_time: 656.0 + throughput: 1524.3902439024391 + estimated_peak_memory_range: + min: 16384 + max: 70614144 + primary_compute_unit: NPU + precision: int8 + layer_info: + layers_on_npu: 134 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 134 + job_id: jmg94nkm5 job_status: Passed torchscript_onnx_ort: - inference_time: 1098.0 - throughput: 910.7468123861566 + inference_time: 934.0 + throughput: 1070.6638115631692 estimated_peak_memory_range: - min: 53248 - max: 53526464 + min: 12288 + max: 63129504 primary_compute_unit: NPU precision: int8 layer_info: - layers_on_npu: 1 + layers_on_npu: 137 layers_on_gpu: 0 layers_on_cpu: 0 - total_layers: 1 - job_id: j7gjzqn15 + total_layers: 137 + job_id: jmg94nkq5 job_status: Passed reference_device_info: name: Samsung Galaxy S23 @@ -76,13 +93,13 @@ models: os_name: Android manufacturer: Samsung chipset: Snapdragon® 8 Gen 2 - timestamp: '2024-04-23T18:42:32.386127Z' + timestamp: '2024-05-20T16:35:29.568165Z' - torchscript_onnx_tflite: - inference_time: 492.0 - throughput: 2032.520325203252 + inference_time: 466.0 + throughput: 2145.922746781116 estimated_peak_memory_range: min: 12288 - max: 64321136 + max: 65030624 primary_compute_unit: NPU precision: int8 layer_info: @@ -90,22 +107,37 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 144 - job_id: j1pv07kz5 + job_id: jygz7d84p + job_status: Passed + torchscript_onnx_qnn: + inference_time: 493.0 + throughput: 2028.3975659229209 + estimated_peak_memory_range: + min: 163840 + max: 49682240 + primary_compute_unit: NPU + precision: int8 + layer_info: + layers_on_npu: 134 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 134 + job_id: jnp18z7ng job_status: Passed torchscript_onnx_ort: - inference_time: 880.0 - throughput: 1136.3636363636363 + inference_time: 708.0 + throughput: 1412.4293785310736 estimated_peak_memory_range: - min: 618496 - max: 36779824 + min: 0 + max: 35132704 primary_compute_unit: NPU precision: int8 layer_info: - layers_on_npu: 1 + layers_on_npu: 137 layers_on_gpu: 0 layers_on_cpu: 0 - total_layers: 1 - job_id: jlpeeym8p + total_layers: 137 + job_id: jnp18z7kg job_status: Passed reference_device_info: name: Samsung Galaxy S24 @@ -114,36 +146,74 @@ models: os_name: Android manufacturer: Samsung chipset: Snapdragon® 8 Gen 3 - timestamp: '2024-04-23T18:42:32.386165Z' + timestamp: '2024-05-20T16:35:29.568191Z' + - torchscript_onnx_tflite: + inference_time: 
627.0 + throughput: 1594.896331738437 + estimated_peak_memory_range: + min: 16384 + max: 2002888 + primary_compute_unit: NPU + precision: int8 + layer_info: + layers_on_npu: 144 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 144 + job_id: jz5w9684p + job_status: Passed + torchscript_onnx_qnn: + inference_time: 645.0 + throughput: 1550.3875968992247 + estimated_peak_memory_range: + min: 24576 + max: 70914568 + primary_compute_unit: NPU + precision: int8 + layer_info: + layers_on_npu: 134 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 134 + job_id: jz5w968zp + job_status: Passed + reference_device_info: + name: QCS8550 (Proxy) + os: '12' + form_factor: Iot + os_name: Android + manufacturer: Qualcomm + chipset: Qcs8550 + timestamp: '2024-05-20T16:35:29.568208Z' - torchscript_onnx_tflite: - inference_time: 2624.0 - throughput: 381.0975609756098 + inference_time: 2476.0 + throughput: 403.8772213247173 estimated_peak_memory_range: min: 12288 - max: 20812688 + max: 21173984 primary_compute_unit: NPU precision: int8 layer_info: - layers_on_npu: 146 + layers_on_npu: 144 layers_on_gpu: 0 layers_on_cpu: 0 - total_layers: 146 - job_id: j7gjz8075 + total_layers: 144 + job_id: jygzry0o5 job_status: Passed - torchscript_onnx_ort: - inference_time: 26460.0 - throughput: 37.79289493575208 + torchscript_onnx_qnn: + inference_time: 2578.0 + throughput: 387.8975950349108 estimated_peak_memory_range: - min: 17575936 - max: 85502320 - primary_compute_unit: CPU - precision: fp32 + min: 163840 + max: 52566912 + primary_compute_unit: NPU + precision: int8 layer_info: - layers_on_npu: 0 + layers_on_npu: 134 layers_on_gpu: 0 - layers_on_cpu: 138 - total_layers: 138 - job_id: jygzond45 + layers_on_cpu: 0 + total_layers: 134 + job_id: jqp4v428p job_status: Passed reference_device_info: name: RB3 Gen 2 (Proxy) @@ -152,21 +222,21 @@ models: os_name: Android manufacturer: Qualcomm chipset: Qcs6490 - timestamp: '2024-04-23T18:42:32.386219Z' + timestamp: '2024-05-20T16:35:29.568224Z' - torchscript_onnx_tflite: - inference_time: 7950.0 - throughput: 125.78616352201257 + inference_time: 7805.0 + throughput: 128.12299807815504 estimated_peak_memory_range: - min: 45056 - max: 4402544 + min: 16384 + max: 7895408 primary_compute_unit: NPU precision: int8 layer_info: - layers_on_npu: 146 + layers_on_npu: 144 layers_on_gpu: 0 layers_on_cpu: 0 - total_layers: 146 - job_id: jn5qr427p + total_layers: 144 + job_id: jz5wqzr35 job_status: Passed reference_device_info: name: RB5 (Proxy) @@ -175,27 +245,57 @@ models: os_name: Android manufacturer: Qualcomm chipset: Qcs8250 - timestamp: '2024-04-23T18:42:32.386245Z' - - torchscript_onnx_tflite: - inference_time: 641.0 - throughput: 1560.0624024960998 + timestamp: '2024-05-20T16:35:29.568235Z' + - torchscript_onnx_qnn: + inference_time: 716.0 + throughput: 1396.6480446927374 estimated_peak_memory_range: - min: 12288 - max: 1923000 + min: 413696 + max: 413696 + primary_compute_unit: NPU + precision: int8 + layer_info: + layers_on_npu: 134 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 134 + job_id: jvgdv186g + job_status: Passed + torchscript_onnx_ort: + inference_time: 910.0 + throughput: 1098.901098901099 + estimated_peak_memory_range: + min: 39702528 + max: 39702528 primary_compute_unit: NPU precision: int8 layer_info: - layers_on_npu: 146 + layers_on_npu: 137 layers_on_gpu: 0 layers_on_cpu: 0 - total_layers: 146 - job_id: jlpeenr7p + total_layers: 137 + job_id: jvgdv18kg + job_status: Passed + torchscript_onnx_ort_dml_gpu: + inference_time: 21412.0 + throughput: 
46.70278348589576 + estimated_peak_memory_range: + min: 20770816 + max: 20770816 + primary_compute_unit: GPU + precision: int8 + layer_info: + layers_on_npu: 0 + layers_on_gpu: 1 + layers_on_cpu: 0 + total_layers: 1 + job_id: jz57drkq5 job_status: Passed reference_device_info: - name: QCS8550 (Proxy) - os: '12' - form_factor: Iot - os_name: Android + name: Snapdragon X Elite CRD + os: '11' + form_factor: Compute + os_name: Windows manufacturer: Qualcomm - chipset: Qcs8550 - timestamp: '2024-04-23T18:42:32.386270Z' + chipset: Snapdragon® X Elite + timestamp: '2024-05-20T16:35:29.568257Z' diff --git a/qai_hub_models/models/lama_dilated/README.md b/qai_hub_models/models/lama_dilated/README.md index 34259a6a..511bdc4a 100644 --- a/qai_hub_models/models/lama_dilated/README.md +++ b/qai_hub_models/models/lama_dilated/README.md @@ -14,6 +14,8 @@ accross various devices, can be found [here](https://aihub.qualcomm.com/models/l a hosted Qualcomm® device. + + ## Example & Usage Install the package via pip: diff --git a/qai_hub_models/models/lama_dilated/export.py b/qai_hub_models/models/lama_dilated/export.py index 64713a41..a3f5ee20 100644 --- a/qai_hub_models/models/lama_dilated/export.py +++ b/qai_hub_models/models/lama_dilated/export.py @@ -120,12 +120,17 @@ def export_model( model.eval() source_model = torch.jit.trace(model.to("cpu"), make_torch_inputs(input_spec)) + # Convert outputs from channel last to channel first (preferred I/O format for QNN and TensorFlow Lite) + channel_last_flags = ( + " --force_channel_last_input image,mask" + + " --force_channel_last_output output_0" + if target_runtime != TargetRuntime.ORT + else "" + ) + # 2. Compile the model to an on-device asset model_compile_options = model.get_hub_compile_options( - target_runtime, - compile_options - + " --force_channel_last_input image,mask" - + " --force_channel_last_output output_0", + target_runtime, compile_options + channel_last_flags, hub_device ) print(f"Optimizing model {model_name} to run on-device") submitted_compile_job = hub.submit_compile_job( @@ -163,8 +168,12 @@ def export_model( ) sample_inputs = model.sample_inputs(input_spec) # Convert inputs from channel first to channel last - hub_inputs = transpose_channel_first_to_last( - "image,mask", sample_inputs, target_runtime + hub_inputs = ( + sample_inputs + if target_runtime == TargetRuntime.ORT + else transpose_channel_first_to_last( + "image,mask", sample_inputs, target_runtime + ) ) submitted_inference_job = hub.submit_inference_job( model=compile_job.get_target_model(), @@ -192,8 +201,12 @@ def export_model( assert inference_job is not None and inference_job.wait().success inference_result: hub.client.DatasetEntries = inference_job.download_output_data() # type: ignore # Convert outputs from channel last to channel first - inference_result = transpose_channel_last_to_first( - "output_0", inference_result, target_runtime + inference_result = ( + inference_result + if target_runtime == TargetRuntime.ORT + else transpose_channel_last_to_first( + "output_0", inference_result, target_runtime + ) ) print_inference_metrics(inference_job, inference_result, torch_out) diff --git a/qai_hub_models/models/lama_dilated/perf.yaml b/qai_hub_models/models/lama_dilated/perf.yaml index 80ce8f88..45244c8f 100644 --- a/qai_hub_models/models/lama_dilated/perf.yaml +++ b/qai_hub_models/models/lama_dilated/perf.yaml @@ -22,6 +22,7 @@ aggregated: - Samsung Galaxy S24 Ultra - Samsung Galaxy S24+ - Samsung Galaxy Tab S8 + - Snapdragon X Elite CRD - Xiaomi 12 - Xiaomi 12 Pro 
supported_chipsets: @@ -30,15 +31,16 @@ aggregated: - Snapdragon® 8 Gen 2 - Snapdragon® 8 Gen 3 - Snapdragon® 888 + - Snapdragon® X Elite models: - name: LaMa-Dilated performance_metrics: - torchscript_onnx_tflite: - inference_time: 87925.0 - throughput: 11.373329542223486 + inference_time: 87247.0 + throughput: 11.46171215056105 estimated_peak_memory_range: - min: 0 - max: 3269648 + min: 2240512 + max: 138049312 primary_compute_unit: NPU precision: fp16 layer_info: @@ -46,22 +48,22 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 347 - job_id: jz5w24645 + job_id: jqp4wrmqg job_status: Passed torchscript_onnx_qnn: - inference_time: 81938.0 - throughput: 12.204349630208206 + inference_time: 81632.0 + throughput: 12.250098000784007 estimated_peak_memory_range: - min: 1654784 - max: 33961664 + min: 4276224 + max: 42687880 primary_compute_unit: NPU precision: fp16 layer_info: - layers_on_npu: 332 + layers_on_npu: 333 layers_on_gpu: 0 layers_on_cpu: 0 - total_layers: 332 - job_id: jnp1y6znp + total_layers: 333 + job_id: jegnev7vg job_status: Passed torchscript_onnx_ort: inference_time: 'null' @@ -76,7 +78,7 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 0 - job_id: jz5w246z5 + job_id: j2p0r0v2p job_status: Failed reference_device_info: name: Samsung Galaxy S23 @@ -85,13 +87,13 @@ models: os_name: Android manufacturer: Samsung chipset: Snapdragon® 8 Gen 2 - timestamp: '2024-04-23T18:42:32.412866Z' + timestamp: '2024-05-20T16:35:29.607433Z' - torchscript_onnx_tflite: - inference_time: 60997.0 - throughput: 16.39424889748676 + inference_time: 59804.0 + throughput: 16.721289545849775 estimated_peak_memory_range: - min: 2707456 - max: 271146544 + min: 2932736 + max: 243608672 primary_compute_unit: NPU precision: fp16 layer_info: @@ -99,22 +101,22 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 347 - job_id: jmg9jdnm5 + job_id: j0px1o3jg job_status: Passed torchscript_onnx_qnn: - inference_time: 57249.0 - throughput: 17.4675540184108 + inference_time: 57736.0 + throughput: 17.32021615629763 estimated_peak_memory_range: - min: 4161536 - max: 189298048 + min: 2392064 + max: 161784064 primary_compute_unit: NPU precision: fp16 layer_info: - layers_on_npu: 332 + layers_on_npu: 333 layers_on_gpu: 0 layers_on_cpu: 0 - total_layers: 332 - job_id: jvgde2165 + total_layers: 333 + job_id: jopry3nvg job_status: Passed torchscript_onnx_ort: inference_time: 'null' @@ -129,7 +131,7 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 0 - job_id: jmg9jdnq5 + job_id: j1p87y4z5 job_status: Failed reference_device_info: name: Samsung Galaxy S24 @@ -138,13 +140,13 @@ models: os_name: Android manufacturer: Samsung chipset: Snapdragon® 8 Gen 3 - timestamp: '2024-04-23T18:42:32.412968Z' + timestamp: '2024-05-20T16:35:29.607460Z' - torchscript_onnx_tflite: - inference_time: 87453.0 - throughput: 11.434713503253176 + inference_time: 85940.0 + throughput: 11.63602513381429 estimated_peak_memory_range: - min: 3260416 - max: 139194808 + min: 3170304 + max: 139550144 primary_compute_unit: NPU precision: fp16 layer_info: @@ -152,22 +154,22 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 347 - job_id: jopr871v5 + job_id: jo5mzxoyp job_status: Passed torchscript_onnx_qnn: - inference_time: 82234.0 - throughput: 12.160420264124328 + inference_time: 80913.0 + throughput: 12.358953443822378 estimated_peak_memory_range: - min: 3178496 - max: 33096560 + min: 3190784 + max: 42527696 primary_compute_unit: NPU precision: fp16 layer_info: - layers_on_npu: 332 + layers_on_npu: 333 
layers_on_gpu: 0 layers_on_cpu: 0 - total_layers: 332 - job_id: j1p80kwzg + total_layers: 333 + job_id: jqpyd37rp job_status: Passed reference_device_info: name: QCS8550 (Proxy) @@ -176,4 +178,57 @@ models: os_name: Android manufacturer: Qualcomm chipset: Qcs8550 - timestamp: '2024-04-23T18:42:32.413058Z' + timestamp: '2024-05-20T16:35:29.607478Z' + - torchscript_onnx_qnn: + inference_time: 92003.0 + throughput: 10.869210786604784 + estimated_peak_memory_range: + min: 4202496 + max: 4202496 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 333 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 333 + job_id: jep2myvx5 + job_status: Passed + torchscript_onnx_ort: + inference_time: 'null' + throughput: 'null' + estimated_peak_memory_range: + min: 0 + max: 0 + primary_compute_unit: 'null' + precision: 'null' + layer_info: + layers_on_npu: 0 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 0 + job_id: jogkyx9yp + job_status: Failed + torchscript_onnx_ort_dml_gpu: + inference_time: 659315.0 + throughput: 1.5167256925748693 + estimated_peak_memory_range: + min: 278200320 + max: 278200320 + primary_compute_unit: CPU + precision: fp32 + layer_info: + layers_on_npu: 0 + layers_on_gpu: 0 + layers_on_cpu: 220 + total_layers: 220 + job_id: jn5q2qm75 + job_status: Passed + reference_device_info: + name: Snapdragon X Elite CRD + os: '11' + form_factor: Compute + os_name: Windows + manufacturer: Qualcomm + chipset: Snapdragon® X Elite + timestamp: '2024-05-20T16:35:29.607503Z' diff --git a/qai_hub_models/models/litehrnet/README.md b/qai_hub_models/models/litehrnet/README.md index a8956a11..13fa47e0 100644 --- a/qai_hub_models/models/litehrnet/README.md +++ b/qai_hub_models/models/litehrnet/README.md @@ -14,6 +14,8 @@ accross various devices, can be found [here](https://aihub.qualcomm.com/models/l a hosted Qualcomm® device. + + ## Example & Usage Install the package via pip: diff --git a/qai_hub_models/models/litehrnet/app.py b/qai_hub_models/models/litehrnet/app.py index f2f714f4..0f3fb398 100644 --- a/qai_hub_models/models/litehrnet/app.py +++ b/qai_hub_models/models/litehrnet/app.py @@ -103,6 +103,6 @@ def predict_pose_keypoints( predicted_images = [] for i, img in enumerate(NHWC_int_numpy_frames): - draw_points(img, keypoints[i], color=(255, 0, 0), size=2) + draw_points(img, keypoints[i], color=(255, 0, 0), size=6) predicted_images.append(fromarray(img)) return predicted_images diff --git a/qai_hub_models/models/litehrnet/export.py b/qai_hub_models/models/litehrnet/export.py index 92d23418..79e94273 100644 --- a/qai_hub_models/models/litehrnet/export.py +++ b/qai_hub_models/models/litehrnet/export.py @@ -120,7 +120,7 @@ def export_model( # 2. 
Compile the model to an on-device asset model_compile_options = model.get_hub_compile_options( - target_runtime, compile_options + target_runtime, compile_options, hub_device ) print(f"Optimizing model {model_name} to run on-device") submitted_compile_job = hub.submit_compile_job( diff --git a/qai_hub_models/models/litehrnet/info.yaml b/qai_hub_models/models/litehrnet/info.yaml index 31da35bd..663ccd8a 100644 --- a/qai_hub_models/models/litehrnet/info.yaml +++ b/qai_hub_models/models/litehrnet/info.yaml @@ -27,7 +27,7 @@ form_factors: - IoT related_models: [openpose, hrnet_pose] has_static_banner: yes -has_animated_banner: no +has_animated_banner: yes license_type: apache-2.0 deploy_license_type: AI Model Hub License dataset: [] diff --git a/qai_hub_models/models/litehrnet/perf.yaml b/qai_hub_models/models/litehrnet/perf.yaml index 4eea5b8e..6ae1b3c7 100644 --- a/qai_hub_models/models/litehrnet/perf.yaml +++ b/qai_hub_models/models/litehrnet/perf.yaml @@ -22,6 +22,7 @@ aggregated: - Samsung Galaxy S24 Ultra - Samsung Galaxy S24+ - Samsung Galaxy Tab S8 + - Snapdragon X Elite CRD - Xiaomi 12 - Xiaomi 12 Pro supported_chipsets: @@ -30,15 +31,16 @@ aggregated: - Snapdragon® 8 Gen 2 - Snapdragon® 8 Gen 3 - Snapdragon® 888 + - Snapdragon® X Elite models: - name: LiteHRNet performance_metrics: - torchscript_onnx_tflite: - inference_time: 15561.0 - throughput: 64.263222157959 + inference_time: 11083.0 + throughput: 90.22827754218171 estimated_peak_memory_range: - min: 6553600 - max: 13181120 + min: 6615040 + max: 31875176 primary_compute_unit: NPU precision: fp16 layer_info: @@ -46,7 +48,7 @@ models: layers_on_gpu: 0 layers_on_cpu: 10 total_layers: 1236 - job_id: jvgde21k5 + job_id: j1glkm1ep job_status: Passed torchscript_onnx_ort: inference_time: 'null' @@ -61,7 +63,7 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 0 - job_id: jqp4k3rqg + job_id: jwgov6445 job_status: Failed reference_device_info: name: Samsung Galaxy S23 @@ -70,13 +72,13 @@ models: os_name: Android manufacturer: Samsung chipset: Snapdragon® 8 Gen 2 - timestamp: '2024-04-23T18:42:32.436998Z' + timestamp: '2024-05-20T16:35:29.637644Z' - torchscript_onnx_tflite: - inference_time: 10344.0 - throughput: 96.67440061871616 + inference_time: 7847.0 + throughput: 127.43723716069836 estimated_peak_memory_range: - min: 20480 - max: 73273328 + min: 16384 + max: 74259408 primary_compute_unit: NPU precision: fp16 layer_info: @@ -84,7 +86,7 @@ models: layers_on_gpu: 0 layers_on_cpu: 10 total_layers: 1236 - job_id: jz5709rqg + job_id: jw5614dvp job_status: Passed torchscript_onnx_ort: inference_time: 'null' @@ -99,7 +101,7 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 0 - job_id: j0pxnxoj5 + job_id: j1pvwk97g job_status: Failed reference_device_info: name: Samsung Galaxy S24 @@ -108,13 +110,13 @@ models: os_name: Android manufacturer: Samsung chipset: Snapdragon® 8 Gen 3 - timestamp: '2024-04-23T18:42:32.437166Z' + timestamp: '2024-05-20T16:35:29.637666Z' - torchscript_onnx_tflite: - inference_time: 15632.0 - throughput: 63.97134083930399 + inference_time: 11125.0 + throughput: 89.88764044943821 estimated_peak_memory_range: - min: 6529024 - max: 10764512 + min: 6553600 + max: 11774200 primary_compute_unit: NPU precision: fp16 layer_info: @@ -122,7 +124,7 @@ models: layers_on_gpu: 0 layers_on_cpu: 10 total_layers: 1236 - job_id: j1gl6qeeg + job_id: j1p3m0wxg job_status: Passed reference_device_info: name: QCS8550 (Proxy) @@ -131,4 +133,42 @@ models: os_name: Android manufacturer: Qualcomm chipset: Qcs8550 - timestamp: 
'2024-04-23T18:42:32.437300Z' + timestamp: '2024-05-20T16:35:29.637678Z' + - torchscript_onnx_ort: + inference_time: 'null' + throughput: 'null' + estimated_peak_memory_range: + min: 0 + max: 0 + primary_compute_unit: 'null' + precision: 'null' + layer_info: + layers_on_npu: 0 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 0 + job_id: j7gjlnw7p + job_status: Failed + torchscript_onnx_ort_dml_gpu: + inference_time: 17746.0 + throughput: 56.35072692437733 + estimated_peak_memory_range: + min: 9547776 + max: 9547776 + primary_compute_unit: CPU + precision: fp32 + layer_info: + layers_on_npu: 0 + layers_on_gpu: 0 + layers_on_cpu: 939 + total_layers: 939 + job_id: jlpevml75 + job_status: Passed + reference_device_info: + name: Snapdragon X Elite CRD + os: '11' + form_factor: Compute + os_name: Windows + manufacturer: Qualcomm + chipset: Snapdragon® X Elite + timestamp: '2024-05-20T16:35:29.637696Z' diff --git a/qai_hub_models/models/llama_v2_7b_chat_quantized/README.md b/qai_hub_models/models/llama_v2_7b_chat_quantized/README.md index ccb5645f..5f1fc83b 100644 --- a/qai_hub_models/models/llama_v2_7b_chat_quantized/README.md +++ b/qai_hub_models/models/llama_v2_7b_chat_quantized/README.md @@ -15,6 +15,8 @@ a hosted Qualcomm® device. + + ## License - The license for the original implementation of Llama-v2-7B-Chat can be found [here](https://github.com/facebookresearch/llama/blob/main/LICENSE). @@ -29,3 +31,25 @@ a hosted Qualcomm® device. * For questions or feedback please [reach out to us](mailto:ai-hub-support@qti.qualcomm.com). +## Usage and Limitations + +This model may not be used for or in connection with any of the following applications: + +- Accessing essential private and public services and benefits; +- Administration of justice and democratic processes; +- Assessing or recognizing the emotional state of a person; +- Biometric and biometrics-based systems, including categorization of persons based on sensitive characteristics; +- Education and vocational training; +- Employment and workers management; +- Exploitation of the vulnerabilities of persons resulting in harmful behavior; +- General purpose social scoring; +- Law enforcement; +- Management and operation of critical infrastructure; +- Migration, asylum and border control management; +- Predictive policing; +- Real-time remote biometric identification in public spaces; +- Recommender systems of social media platforms; +- Scraping of facial images (from the internet or otherwise); and/or +- Subliminal manipulation + + diff --git a/qai_hub_models/models/llama_v2_7b_chat_quantized/info.yaml b/qai_hub_models/models/llama_v2_7b_chat_quantized/info.yaml index 92bb1baf..fc8a7ba1 100644 --- a/qai_hub_models/models/llama_v2_7b_chat_quantized/info.yaml +++ b/qai_hub_models/models/llama_v2_7b_chat_quantized/info.yaml @@ -1,6 +1,6 @@ name: Llama-v2-7B-Chat id: llama_v2_7b_chat_quantized -status: public # Renable when approved by marketing #9577 +status: public headline: State-of-the-art large language model useful on a variety of language understanding and generation tasks. domain: Generative AI diff --git a/qai_hub_models/models/mediapipe_face/README.md b/qai_hub_models/models/mediapipe_face/README.md index 5fa48aab..1a280d17 100644 --- a/qai_hub_models/models/mediapipe_face/README.md +++ b/qai_hub_models/models/mediapipe_face/README.md @@ -14,6 +14,8 @@ accross various devices, can be found [here](https://aihub.qualcomm.com/models/m a hosted Qualcomm® device. 
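A note on the new `torchscript_onnx_ort_dml_gpu` rows added to the perf.yaml files in this patch (litehrnet above, the mediapipe models below): these presumably report ONNX Runtime running through the DirectML execution provider on the Windows-based Snapdragon X Elite CRD, which is consistent with the CPU/GPU compute units recorded for those rows instead of the NPU. A minimal session setup for that provider looks roughly like the sketch below; the model path is a placeholder and the provider names are the standard onnxruntime identifiers.

```python
import onnxruntime as ort

# DirectML (GPU) first, CPU as fallback. Requires the onnxruntime-directml package
# on Windows; "model.onnx" is a placeholder path, not an asset from this repo.
session = ort.InferenceSession(
    "model.onnx",
    providers=["DmlExecutionProvider", "CPUExecutionProvider"],
)
print(session.get_providers())  # confirms which providers were actually loaded
```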
+ + ## Example & Usage diff --git a/qai_hub_models/models/mediapipe_face/export.py b/qai_hub_models/models/mediapipe_face/export.py index 8b99631f..9aca58fe 100644 --- a/qai_hub_models/models/mediapipe_face/export.py +++ b/qai_hub_models/models/mediapipe_face/export.py @@ -134,7 +134,7 @@ def export_model( # 2. Compile the models to an on-device asset model_compile_options = component.get_hub_compile_options( - target_runtime, compile_options + target_runtime, compile_options, hub_device ) print(f"Optimizing model {component_name} to run on-device") submitted_compile_job = hub.submit_compile_job( @@ -226,12 +226,7 @@ def export_model( def main(): warnings.filterwarnings("ignore") - parser = export_parser( - model_cls=Model, - components=ALL_COMPONENTS, - supports_qnn=False, - supports_ort=False, - ) + parser = export_parser(model_cls=Model, components=ALL_COMPONENTS) args = parser.parse_args() export_model(**vars(args)) diff --git a/qai_hub_models/models/mediapipe_face/model.py b/qai_hub_models/models/mediapipe_face/model.py index 29b79435..e7d62e57 100644 --- a/qai_hub_models/models/mediapipe_face/model.py +++ b/qai_hub_models/models/mediapipe_face/model.py @@ -13,7 +13,7 @@ from qai_hub_models.utils.input_spec import InputSpec MODEL_ID = __name__.split(".")[-2] -MODEL_ASSET_VERSION = 1 +MODEL_ASSET_VERSION = 2 # Vertex indices can be found in # https://github.com/google/mediapipe/blob/0.8.1/mediapipe/modules/face_geometry/data/canonical_face_model_uv_visualization.png diff --git a/qai_hub_models/models/mediapipe_face/perf.yaml b/qai_hub_models/models/mediapipe_face/perf.yaml index 030a5d9e..14a2bc5b 100644 --- a/qai_hub_models/models/mediapipe_face/perf.yaml +++ b/qai_hub_models/models/mediapipe_face/perf.yaml @@ -22,6 +22,7 @@ aggregated: - Samsung Galaxy S24 Ultra - Samsung Galaxy S24+ - Samsung Galaxy Tab S8 + - Snapdragon X Elite CRD - Xiaomi 12 - Xiaomi 12 Pro supported_chipsets: @@ -30,15 +31,16 @@ aggregated: - Snapdragon® 8 Gen 2 - Snapdragon® 8 Gen 3 - Snapdragon® 888 + - Snapdragon® X Elite models: - name: MediaPipeFaceDetector performance_metrics: - torchscript_onnx_tflite: - inference_time: 785.0 - throughput: 1273.8853503184714 + inference_time: 815.0 + throughput: 1226.993865030675 estimated_peak_memory_range: - min: 12288 - max: 1533536 + min: 20480 + max: 1627976 primary_compute_unit: NPU precision: fp16 layer_info: @@ -46,14 +48,14 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 112 - job_id: jegnlk6v5 + job_id: jygz7d4zp job_status: Passed torchscript_onnx_qnn: - inference_time: 839.0 - throughput: 1191.8951132300358 + inference_time: 843.0 + throughput: 1186.2396204033214 estimated_peak_memory_range: - min: 815104 - max: 6910200 + min: 806912 + max: 6902688 primary_compute_unit: NPU precision: fp16 layer_info: @@ -61,22 +63,22 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 148 - job_id: j2p036z2p + job_id: jqp4wr9qg job_status: Passed torchscript_onnx_ort: - inference_time: 996.0 - throughput: 1004.0160642570281 + inference_time: 993.0 + throughput: 1007.0493454179255 estimated_peak_memory_range: - min: 806912 - max: 6602536 + min: 802816 + max: 72047760 primary_compute_unit: NPU precision: fp16 layer_info: - layers_on_npu: 1 + layers_on_npu: 147 layers_on_gpu: 0 layers_on_cpu: 0 - total_layers: 1 - job_id: j1gl6lveg + total_layers: 147 + job_id: j1p87y1z5 job_status: Passed reference_device_info: name: Samsung Galaxy S23 @@ -85,13 +87,13 @@ models: os_name: Android manufacturer: Samsung chipset: Snapdragon® 8 Gen 2 - timestamp: 
'2024-04-23T18:42:32.455108Z' + timestamp: '2024-05-20T16:35:29.659622Z' - torchscript_onnx_tflite: - inference_time: 544.0 - throughput: 1838.235294117647 + inference_time: 569.0 + throughput: 1757.469244288225 estimated_peak_memory_range: min: 12288 - max: 28679584 + max: 30017104 primary_compute_unit: NPU precision: fp16 layer_info: @@ -99,14 +101,14 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 112 - job_id: jep20ekxg + job_id: jmg94nxq5 job_status: Passed torchscript_onnx_qnn: - inference_time: 595.0 - throughput: 1680.672268907563 + inference_time: 592.0 + throughput: 1689.1891891891892 estimated_peak_memory_range: - min: 802816 - max: 47837376 + min: 12288 + max: 47426416 primary_compute_unit: NPU precision: fp16 layer_info: @@ -114,22 +116,22 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 148 - job_id: jogk78eyp + job_id: jo5mzxdyp job_status: Passed torchscript_onnx_ort: - inference_time: 706.0 - throughput: 1416.4305949008499 + inference_time: 719.0 + throughput: 1390.8205841446454 estimated_peak_memory_range: min: 12288 - max: 20347024 + max: 22023952 primary_compute_unit: NPU precision: fp16 layer_info: - layers_on_npu: 1 + layers_on_npu: 147 layers_on_gpu: 0 layers_on_cpu: 0 - total_layers: 1 - job_id: j1p3v6jxg + total_layers: 147 + job_id: jn5q2qv75 job_status: Passed reference_device_info: name: Samsung Galaxy S24 @@ -138,13 +140,13 @@ models: os_name: Android manufacturer: Samsung chipset: Snapdragon® 8 Gen 3 - timestamp: '2024-04-23T18:42:32.455176Z' + timestamp: '2024-05-20T16:35:29.659647Z' - torchscript_onnx_tflite: - inference_time: 784.0 - throughput: 1275.5102040816328 + inference_time: 778.0 + throughput: 1285.3470437017995 estimated_peak_memory_range: - min: 24576 - max: 1602632 + min: 12288 + max: 1913768 primary_compute_unit: NPU precision: fp16 layer_info: @@ -152,14 +154,14 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 112 - job_id: jlpeen47p + job_id: jvgdv1zkg job_status: Passed torchscript_onnx_qnn: - inference_time: 840.0 - throughput: 1190.4761904761904 + inference_time: 845.0 + throughput: 1183.4319526627219 estimated_peak_memory_range: - min: 815104 - max: 6172048 + min: 806912 + max: 100815984 primary_compute_unit: NPU precision: fp16 layer_info: @@ -167,7 +169,7 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 148 - job_id: j0pxnzej5 + job_id: jqpyd3mrp job_status: Passed reference_device_info: name: QCS8550 (Proxy) @@ -176,15 +178,68 @@ models: os_name: Android manufacturer: Qualcomm chipset: Qcs8550 - timestamp: '2024-04-23T18:42:32.455221Z' + timestamp: '2024-05-20T16:35:29.659664Z' + - torchscript_onnx_qnn: + inference_time: 928.0 + throughput: 1077.5862068965516 + estimated_peak_memory_range: + min: 786432 + max: 786432 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 147 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 147 + job_id: jopry3wvg + job_status: Passed + torchscript_onnx_ort: + inference_time: 1157.0 + throughput: 864.304235090752 + estimated_peak_memory_range: + min: 3178496 + max: 3178496 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 147 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 147 + job_id: jw5614wvp + job_status: Passed + torchscript_onnx_ort_dml_gpu: + inference_time: 5344.0 + throughput: 187.125748502994 + estimated_peak_memory_range: + min: 9064448 + max: 9064448 + primary_compute_unit: GPU + precision: fp16 + layer_info: + layers_on_npu: 0 + layers_on_gpu: 1 + layers_on_cpu: 0 + total_layers: 1 + job_id: 
jwgov6845 + job_status: Passed + reference_device_info: + name: Snapdragon X Elite CRD + os: '11' + form_factor: Compute + os_name: Windows + manufacturer: Qualcomm + chipset: Snapdragon® X Elite + timestamp: '2024-05-20T16:35:29.659686Z' - name: MediaPipeFaceLandmarkDetector performance_metrics: - torchscript_onnx_tflite: - inference_time: 315.0 - throughput: 3174.6031746031745 + inference_time: 325.0 + throughput: 3076.923076923077 estimated_peak_memory_range: - min: 24576 - max: 1781952 + min: 32768 + max: 4219616 primary_compute_unit: NPU precision: fp16 layer_info: @@ -192,14 +247,14 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 101 - job_id: jopr8wvv5 + job_id: jz5w961zp job_status: Passed torchscript_onnx_qnn: - inference_time: 390.0 - throughput: 2564.102564102564 + inference_time: 400.0 + throughput: 2500.0 estimated_peak_memory_range: - min: 458752 - max: 94680040 + min: 462848 + max: 42261400 primary_compute_unit: NPU precision: fp16 layer_info: @@ -207,22 +262,22 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 107 - job_id: j1p801qzg + job_id: j0px1odjg job_status: Passed torchscript_onnx_ort: - inference_time: 494.0 - throughput: 2024.2914979757086 + inference_time: 506.0 + throughput: 1976.2845849802372 estimated_peak_memory_range: min: 12288 - max: 7623304 + max: 7765592 primary_compute_unit: NPU precision: fp16 layer_info: - layers_on_npu: 1 + layers_on_npu: 106 layers_on_gpu: 0 layers_on_cpu: 0 - total_layers: 1 - job_id: jw56ewyvg + total_layers: 106 + job_id: jogkyx8yp job_status: Passed reference_device_info: name: Samsung Galaxy S23 @@ -231,13 +286,13 @@ models: os_name: Android manufacturer: Samsung chipset: Snapdragon® 8 Gen 2 - timestamp: '2024-04-23T18:42:32.455272Z' + timestamp: '2024-05-20T16:35:29.659708Z' - torchscript_onnx_tflite: - inference_time: 230.0 - throughput: 4347.826086956522 + inference_time: 235.0 + throughput: 4255.31914893617 estimated_peak_memory_range: min: 12288 - max: 25090016 + max: 25797520 primary_compute_unit: NPU precision: fp16 layer_info: @@ -245,14 +300,14 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 101 - job_id: jqpyrm1r5 + job_id: jnp18zvkg job_status: Passed torchscript_onnx_qnn: - inference_time: 285.0 - throughput: 3508.7719298245615 + inference_time: 282.0 + throughput: 3546.099290780142 estimated_peak_memory_range: min: 12288 - max: 33592960 + max: 39404800 primary_compute_unit: NPU precision: fp16 layer_info: @@ -260,22 +315,22 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 107 - job_id: jn5qev675 + job_id: jegnevkvg job_status: Passed torchscript_onnx_ort: - inference_time: 408.0 - throughput: 2450.9803921568628 + inference_time: 395.0 + throughput: 2531.6455696202534 estimated_peak_memory_range: min: 12288 - max: 15898592 + max: 21486416 primary_compute_unit: NPU precision: fp16 layer_info: - layers_on_npu: 1 + layers_on_npu: 106 layers_on_gpu: 0 layers_on_cpu: 0 - total_layers: 1 - job_id: jwgok824p + total_layers: 106 + job_id: j1glkmlep job_status: Passed reference_device_info: name: Samsung Galaxy S24 @@ -284,13 +339,13 @@ models: os_name: Android manufacturer: Samsung chipset: Snapdragon® 8 Gen 3 - timestamp: '2024-04-23T18:42:32.455318Z' + timestamp: '2024-05-20T16:35:29.659728Z' - torchscript_onnx_tflite: - inference_time: 326.0 - throughput: 3067.4846625766872 + inference_time: 306.0 + throughput: 3267.97385620915 estimated_peak_memory_range: - min: 24576 - max: 1871744 + min: 28672 + max: 1867256 primary_compute_unit: NPU precision: fp16 layer_info: @@ -298,14 
+353,14 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 101 - job_id: jygzo0vz5 + job_id: jz57dr7q5 job_status: Passed torchscript_onnx_qnn: - inference_time: 396.0 - throughput: 2525.252525252525 + inference_time: 378.0 + throughput: 2645.5026455026455 estimated_peak_memory_range: - min: 458752 - max: 81438752 + min: 466944 + max: 20140984 primary_compute_unit: NPU precision: fp16 layer_info: @@ -313,7 +368,7 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 107 - job_id: jo5mqlvyp + job_id: j2p0r062p job_status: Passed reference_device_info: name: QCS8550 (Proxy) @@ -322,4 +377,57 @@ models: os_name: Android manufacturer: Qualcomm chipset: Qcs8550 - timestamp: '2024-04-23T18:42:32.455361Z' + timestamp: '2024-05-20T16:35:29.659743Z' + - torchscript_onnx_qnn: + inference_time: 546.0 + throughput: 1831.5018315018315 + estimated_peak_memory_range: + min: 442368 + max: 442368 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 106 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 106 + job_id: jep2myex5 + job_status: Passed + torchscript_onnx_ort: + inference_time: 529.0 + throughput: 1890.359168241966 + estimated_peak_memory_range: + min: 4382720 + max: 4382720 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 106 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 106 + job_id: j1p3m06xg + job_status: Passed + torchscript_onnx_ort_dml_gpu: + inference_time: 2139.0 + throughput: 467.50818139317437 + estimated_peak_memory_range: + min: 5292032 + max: 5292032 + primary_compute_unit: CPU + precision: fp32 + layer_info: + layers_on_npu: 0 + layers_on_gpu: 0 + layers_on_cpu: 80 + total_layers: 80 + job_id: j1pvwk77g + job_status: Passed + reference_device_info: + name: Snapdragon X Elite CRD + os: '11' + form_factor: Compute + os_name: Windows + manufacturer: Qualcomm + chipset: Snapdragon® X Elite + timestamp: '2024-05-20T16:35:29.659765Z' diff --git a/qai_hub_models/models/mediapipe_hand/README.md b/qai_hub_models/models/mediapipe_hand/README.md index 481ebcb7..8e327a5a 100644 --- a/qai_hub_models/models/mediapipe_hand/README.md +++ b/qai_hub_models/models/mediapipe_hand/README.md @@ -14,6 +14,8 @@ accross various devices, can be found [here](https://aihub.qualcomm.com/models/m a hosted Qualcomm® device. + + ## Example & Usage diff --git a/qai_hub_models/models/mediapipe_hand/export.py b/qai_hub_models/models/mediapipe_hand/export.py index 8e8b2ec1..8e734c87 100644 --- a/qai_hub_models/models/mediapipe_hand/export.py +++ b/qai_hub_models/models/mediapipe_hand/export.py @@ -134,7 +134,7 @@ def export_model( # 2. 
Compile the models to an on-device asset model_compile_options = component.get_hub_compile_options( - target_runtime, compile_options + target_runtime, compile_options, hub_device ) print(f"Optimizing model {component_name} to run on-device") submitted_compile_job = hub.submit_compile_job( @@ -226,9 +226,7 @@ def export_model( def main(): warnings.filterwarnings("ignore") - parser = export_parser( - model_cls=Model, components=ALL_COMPONENTS, supports_ort=False - ) + parser = export_parser(model_cls=Model, components=ALL_COMPONENTS) args = parser.parse_args() export_model(**vars(args)) diff --git a/qai_hub_models/models/mediapipe_hand/model.py b/qai_hub_models/models/mediapipe_hand/model.py index 25d2c4e5..134f3751 100644 --- a/qai_hub_models/models/mediapipe_hand/model.py +++ b/qai_hub_models/models/mediapipe_hand/model.py @@ -14,7 +14,7 @@ from qai_hub_models.utils.input_spec import InputSpec MODEL_ID = __name__.split(".")[-2] -MODEL_ASSET_VERSION = 1 +MODEL_ASSET_VERSION = 2 # https://github.com/metalwhale/hand_tracking/blob/b2a650d61b4ab917a2367a05b85765b81c0564f2/run.py # 8 12 16 20 diff --git a/qai_hub_models/models/mediapipe_hand/perf.yaml b/qai_hub_models/models/mediapipe_hand/perf.yaml index 828ff486..18cb733d 100644 --- a/qai_hub_models/models/mediapipe_hand/perf.yaml +++ b/qai_hub_models/models/mediapipe_hand/perf.yaml @@ -22,6 +22,7 @@ aggregated: - Samsung Galaxy S24 Ultra - Samsung Galaxy S24+ - Samsung Galaxy Tab S8 + - Snapdragon X Elite CRD - Xiaomi 12 - Xiaomi 12 Pro supported_chipsets: @@ -30,15 +31,16 @@ aggregated: - Snapdragon® 8 Gen 2 - Snapdragon® 8 Gen 3 - Snapdragon® 888 + - Snapdragon® X Elite models: - name: MediaPipeHandDetector performance_metrics: - torchscript_onnx_tflite: - inference_time: 953.0 - throughput: 1049.3179433368311 + inference_time: 957.0 + throughput: 1044.932079414838 estimated_peak_memory_range: min: 12288 - max: 7786576 + max: 2098904 primary_compute_unit: NPU precision: fp16 layer_info: @@ -46,14 +48,14 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 152 - job_id: jlpeeyd7p + job_id: j7gjlnq7p job_status: Passed torchscript_onnx_qnn: - inference_time: 1019.0 - throughput: 981.3542688910696 + inference_time: 1014.0 + throughput: 986.1932938856016 estimated_peak_memory_range: - min: 806912 - max: 8813592 + min: 12288 + max: 21477272 primary_compute_unit: NPU precision: fp16 layer_info: @@ -61,22 +63,22 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 197 - job_id: jnp1y64kp + job_id: jvgdv12kg job_status: Passed torchscript_onnx_ort: - inference_time: 1219.0 - throughput: 820.3445447087777 + inference_time: 1160.0 + throughput: 862.0689655172414 estimated_peak_memory_range: min: 12288 - max: 19518840 + max: 18289360 primary_compute_unit: NPU precision: fp16 layer_info: - layers_on_npu: 1 + layers_on_npu: 196 layers_on_gpu: 0 layers_on_cpu: 0 - total_layers: 1 - job_id: j0pxnxkj5 + total_layers: 196 + job_id: jqpyd3xrp job_status: Passed reference_device_info: name: Samsung Galaxy S23 @@ -85,13 +87,13 @@ models: os_name: Android manufacturer: Samsung chipset: Snapdragon® 8 Gen 2 - timestamp: '2024-04-23T18:42:32.499757Z' + timestamp: '2024-05-20T16:35:29.706483Z' - torchscript_onnx_tflite: - inference_time: 679.0 - throughput: 1472.7540500736377 + inference_time: 680.0 + throughput: 1470.5882352941176 estimated_peak_memory_range: min: 12288 - max: 52020064 + max: 53739952 primary_compute_unit: NPU precision: fp16 layer_info: @@ -99,14 +101,14 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 152 - job_id: 
jz5w24ez5 + job_id: jygz7dnzp job_status: Passed torchscript_onnx_qnn: - inference_time: 722.0 - throughput: 1385.0415512465374 + inference_time: 725.0 + throughput: 1379.3103448275863 estimated_peak_memory_range: min: 802816 - max: 57062560 + max: 62597664 primary_compute_unit: NPU precision: fp16 layer_info: @@ -114,22 +116,22 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 197 - job_id: jz5709yqg + job_id: jqp4wr3qg job_status: Passed torchscript_onnx_ort: - inference_time: 838.0 - throughput: 1193.3174224343675 + inference_time: 868.0 + throughput: 1152.073732718894 estimated_peak_memory_range: - min: 565248 - max: 29618560 + min: 380928 + max: 38582032 primary_compute_unit: NPU precision: fp16 layer_info: - layers_on_npu: 1 + layers_on_npu: 196 layers_on_gpu: 0 layers_on_cpu: 0 - total_layers: 1 - job_id: jegnlk0v5 + total_layers: 196 + job_id: j1p87yxz5 job_status: Passed reference_device_info: name: Samsung Galaxy S24 @@ -138,13 +140,13 @@ models: os_name: Android manufacturer: Samsung chipset: Snapdragon® 8 Gen 3 - timestamp: '2024-04-23T18:42:32.499831Z' + timestamp: '2024-05-20T16:35:29.706509Z' - torchscript_onnx_tflite: - inference_time: 959.0 - throughput: 1042.752867570386 + inference_time: 956.0 + throughput: 1046.0251046025105 estimated_peak_memory_range: min: 24576 - max: 3871952 + max: 4980488 primary_compute_unit: NPU precision: fp16 layer_info: @@ -152,14 +154,14 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 152 - job_id: j1p3vr8xg + job_id: jmg94ndq5 job_status: Passed torchscript_onnx_qnn: - inference_time: 1020.0 - throughput: 980.3921568627451 + inference_time: 1011.0 + throughput: 989.1196834817013 estimated_peak_memory_range: - min: 806912 - max: 7974248 + min: 802816 + max: 6723176 primary_compute_unit: NPU precision: fp16 layer_info: @@ -167,7 +169,7 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 197 - job_id: jnp1ym3kp + job_id: jopry30vg job_status: Passed reference_device_info: name: QCS8550 (Proxy) @@ -176,15 +178,68 @@ models: os_name: Android manufacturer: Qualcomm chipset: Qcs8550 - timestamp: '2024-04-23T18:42:32.499901Z' + timestamp: '2024-05-20T16:35:29.706525Z' + - torchscript_onnx_qnn: + inference_time: 1052.0 + throughput: 950.5703422053232 + estimated_peak_memory_range: + min: 786432 + max: 786432 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 196 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 196 + job_id: jo5mzx8yp + job_status: Passed + torchscript_onnx_ort: + inference_time: 1200.0 + throughput: 833.3333333333334 + estimated_peak_memory_range: + min: 868352 + max: 868352 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 196 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 196 + job_id: jn5q2qy75 + job_status: Passed + torchscript_onnx_ort_dml_gpu: + inference_time: 16080.0 + throughput: 62.18905472636816 + estimated_peak_memory_range: + min: 802816 + max: 802816 + primary_compute_unit: CPU + precision: fp32 + layer_info: + layers_on_npu: 0 + layers_on_gpu: 0 + layers_on_cpu: 116 + total_layers: 116 + job_id: jw56147vp + job_status: Passed + reference_device_info: + name: Snapdragon X Elite CRD + os: '11' + form_factor: Compute + os_name: Windows + manufacturer: Qualcomm + chipset: Snapdragon® X Elite + timestamp: '2024-05-20T16:35:29.706548Z' - name: MediaPipeHandLandmarkDetector performance_metrics: - torchscript_onnx_tflite: - inference_time: 1259.0 - throughput: 794.2811755361398 + inference_time: 1214.0 + throughput: 823.7232289950576 
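An editorial aside on the metrics being updated here: throughout these perf.yaml files, `throughput` is simply the reciprocal of `inference_time`, which appears to be reported in microseconds, making throughput inferences per second. A quick check against the entry just above; the numbers are taken from the diff, while the unit interpretation is an assumption.

```python
# throughput = 1e6 / inference_time, assuming inference_time is in microseconds.
inference_time_us = 1214.0        # MediaPipeHandLandmarkDetector, Galaxy S23, TFLite (from the entry above)
throughput = 1_000_000 / inference_time_us
print(throughput)                 # 823.7232289950576, matching the updated perf.yaml value
```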
estimated_peak_memory_range: - min: 24576 - max: 1977616 + min: 16384 + max: 2188824 primary_compute_unit: NPU precision: fp16 layer_info: @@ -192,14 +247,14 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 159 - job_id: jygzon3z5 + job_id: jlpevmy75 job_status: Passed torchscript_onnx_qnn: - inference_time: 1293.0 - throughput: 773.3952049497293 + inference_time: 1284.0 + throughput: 778.816199376947 estimated_peak_memory_range: - min: 638976 - max: 10247184 + min: 16384 + max: 51815576 primary_compute_unit: NPU precision: fp16 layer_info: @@ -207,22 +262,22 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 210 - job_id: jvgde2xk5 + job_id: jz57dr9q5 job_status: Passed torchscript_onnx_ort: - inference_time: 54823.0 - throughput: 18.240519489995076 + inference_time: 1506.0 + throughput: 664.0106241699867 estimated_peak_memory_range: - min: 217088 - max: 18000624 + min: 12288 + max: 42058584 primary_compute_unit: NPU precision: fp16 layer_info: - layers_on_npu: 1 + layers_on_npu: 209 layers_on_gpu: 0 layers_on_cpu: 0 - total_layers: 1 - job_id: jo5mq8nyp + total_layers: 209 + job_id: j2p0r0j2p job_status: Passed reference_device_info: name: Samsung Galaxy S23 @@ -231,13 +286,13 @@ models: os_name: Android manufacturer: Samsung chipset: Snapdragon® 8 Gen 2 - timestamp: '2024-04-23T18:42:32.499968Z' + timestamp: '2024-05-20T16:35:29.706573Z' - torchscript_onnx_tflite: - inference_time: 901.0 - throughput: 1109.8779134295228 + inference_time: 889.0 + throughput: 1124.859392575928 estimated_peak_memory_range: min: 12288 - max: 56691584 + max: 57135392 primary_compute_unit: NPU precision: fp16 layer_info: @@ -245,14 +300,14 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 159 - job_id: jmg9jdlq5 + job_id: jz5w964zp job_status: Passed torchscript_onnx_qnn: - inference_time: 963.0 - throughput: 1038.4215991692627 + inference_time: 948.0 + throughput: 1054.8523206751054 estimated_peak_memory_range: min: 802816 - max: 62409504 + max: 63945952 primary_compute_unit: NPU precision: fp16 layer_info: @@ -260,22 +315,22 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 210 - job_id: jqp4k3lqg + job_id: j0px1oxjg job_status: Passed torchscript_onnx_ort: - inference_time: 41069.0 - throughput: 24.34926586963403 + inference_time: 1099.0 + throughput: 909.9181073703367 estimated_peak_memory_range: - min: 868352 - max: 30450496 + min: 802816 + max: 33494480 primary_compute_unit: NPU precision: fp16 layer_info: - layers_on_npu: 1 + layers_on_npu: 209 layers_on_gpu: 0 layers_on_cpu: 0 - total_layers: 1 - job_id: jopr8w6v5 + total_layers: 209 + job_id: jogkyx4yp job_status: Passed reference_device_info: name: Samsung Galaxy S24 @@ -284,13 +339,13 @@ models: os_name: Android manufacturer: Samsung chipset: Snapdragon® 8 Gen 3 - timestamp: '2024-04-23T18:42:32.500029Z' + timestamp: '2024-05-20T16:35:29.706594Z' - torchscript_onnx_tflite: - inference_time: 1206.0 - throughput: 829.1873963515754 + inference_time: 1200.0 + throughput: 833.3333333333334 estimated_peak_memory_range: - min: 40960 - max: 2078488 + min: 12288 + max: 2557040 primary_compute_unit: NPU precision: fp16 layer_info: @@ -298,14 +353,14 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 159 - job_id: jwgok9m4p + job_id: jnp18z6kg job_status: Passed torchscript_onnx_qnn: - inference_time: 1308.0 - throughput: 764.525993883792 + inference_time: 1311.0 + throughput: 762.7765064836003 estimated_peak_memory_range: - min: 811008 - max: 8238832 + min: 815104 + max: 52770200 primary_compute_unit: NPU 
precision: fp16 layer_info: @@ -313,7 +368,7 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 210 - job_id: jvgdem0k5 + job_id: jep2mywx5 job_status: Passed reference_device_info: name: QCS8550 (Proxy) @@ -322,4 +377,57 @@ models: os_name: Android manufacturer: Qualcomm chipset: Qcs8550 - timestamp: '2024-04-23T18:42:32.500084Z' + timestamp: '2024-05-20T16:35:29.706611Z' + - torchscript_onnx_qnn: + inference_time: 1461.0 + throughput: 684.4626967830253 + estimated_peak_memory_range: + min: 786432 + max: 786432 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 209 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 209 + job_id: jegnevnvg + job_status: Passed + torchscript_onnx_ort: + inference_time: 1548.0 + throughput: 645.9948320413437 + estimated_peak_memory_range: + min: 19423232 + max: 19423232 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 209 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 209 + job_id: j1glkmxep + job_status: Passed + torchscript_onnx_ort_dml_gpu: + inference_time: 8524.0 + throughput: 117.31581417175035 + estimated_peak_memory_range: + min: 20221952 + max: 20221952 + primary_compute_unit: GPU + precision: fp16 + layer_info: + layers_on_npu: 0 + layers_on_gpu: 1 + layers_on_cpu: 0 + total_layers: 1 + job_id: j1p3m09xg + job_status: Passed + reference_device_info: + name: Snapdragon X Elite CRD + os: '11' + form_factor: Compute + os_name: Windows + manufacturer: Qualcomm + chipset: Snapdragon® X Elite + timestamp: '2024-05-20T16:35:29.706632Z' diff --git a/qai_hub_models/models/mediapipe_pose/README.md b/qai_hub_models/models/mediapipe_pose/README.md index 06fd35f4..97b007ee 100644 --- a/qai_hub_models/models/mediapipe_pose/README.md +++ b/qai_hub_models/models/mediapipe_pose/README.md @@ -14,6 +14,8 @@ accross various devices, can be found [here](https://aihub.qualcomm.com/models/m a hosted Qualcomm® device. + + ## Example & Usage diff --git a/qai_hub_models/models/mediapipe_pose/export.py b/qai_hub_models/models/mediapipe_pose/export.py index a187daf4..71d05f58 100644 --- a/qai_hub_models/models/mediapipe_pose/export.py +++ b/qai_hub_models/models/mediapipe_pose/export.py @@ -134,7 +134,7 @@ def export_model( # 2. 
Compile the models to an on-device asset model_compile_options = component.get_hub_compile_options( - target_runtime, compile_options + target_runtime, compile_options, hub_device ) print(f"Optimizing model {component_name} to run on-device") submitted_compile_job = hub.submit_compile_job( @@ -226,9 +226,7 @@ def export_model( def main(): warnings.filterwarnings("ignore") - parser = export_parser( - model_cls=Model, components=ALL_COMPONENTS, supports_ort=False - ) + parser = export_parser(model_cls=Model, components=ALL_COMPONENTS) args = parser.parse_args() export_model(**vars(args)) diff --git a/qai_hub_models/models/mediapipe_pose/model.py b/qai_hub_models/models/mediapipe_pose/model.py index 7c96d6e5..583773a6 100644 --- a/qai_hub_models/models/mediapipe_pose/model.py +++ b/qai_hub_models/models/mediapipe_pose/model.py @@ -13,7 +13,7 @@ from qai_hub_models.utils.input_spec import InputSpec MODEL_ID = __name__.split(".")[-2] -MODEL_ASSET_VERSION = 1 +MODEL_ASSET_VERSION = 2 POSE_LANDMARK_CONNECTIONS = [ (0, 1), diff --git a/qai_hub_models/models/mediapipe_pose/perf.yaml b/qai_hub_models/models/mediapipe_pose/perf.yaml index 68281558..d3b007a4 100644 --- a/qai_hub_models/models/mediapipe_pose/perf.yaml +++ b/qai_hub_models/models/mediapipe_pose/perf.yaml @@ -22,6 +22,7 @@ aggregated: - Samsung Galaxy S24 Ultra - Samsung Galaxy S24+ - Samsung Galaxy Tab S8 + - Snapdragon X Elite CRD - Xiaomi 12 - Xiaomi 12 Pro supported_chipsets: @@ -30,15 +31,16 @@ aggregated: - Snapdragon® 8 Gen 2 - Snapdragon® 8 Gen 3 - Snapdragon® 888 + - Snapdragon® X Elite models: - name: MediaPipePoseDetector performance_metrics: - torchscript_onnx_tflite: - inference_time: 835.0 - throughput: 1197.6047904191616 + inference_time: 839.0 + throughput: 1191.8951132300358 estimated_peak_memory_range: - min: 16384 - max: 1889240 + min: 24576 + max: 2326784 primary_compute_unit: NPU precision: fp16 layer_info: @@ -46,14 +48,14 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 107 - job_id: j2p03642p + job_id: jwgov6r45 job_status: Passed torchscript_onnx_qnn: - inference_time: 884.0 - throughput: 1131.2217194570135 + inference_time: 873.0 + throughput: 1145.475372279496 estimated_peak_memory_range: - min: 69632 - max: 15459024 + min: 12288 + max: 16427488 primary_compute_unit: NPU precision: fp16 layer_info: @@ -61,22 +63,22 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 140 - job_id: j1gl6l4eg + job_id: jmg94nmq5 job_status: Passed torchscript_onnx_ort: - inference_time: 1006.0 - throughput: 994.0357852882704 + inference_time: 1003.0 + throughput: 997.0089730807578 estimated_peak_memory_range: - min: 16384 - max: 9676016 + min: 36864 + max: 10321904 primary_compute_unit: NPU precision: fp16 layer_info: - layers_on_npu: 1 + layers_on_npu: 139 layers_on_gpu: 0 layers_on_cpu: 0 - total_layers: 1 - job_id: j1pv07q75 + total_layers: 139 + job_id: jopry3lvg job_status: Passed reference_device_info: name: Samsung Galaxy S23 @@ -85,13 +87,13 @@ models: os_name: Android manufacturer: Samsung chipset: Snapdragon® 8 Gen 2 - timestamp: '2024-04-23T18:42:32.544709Z' + timestamp: '2024-05-20T16:35:29.766048Z' - torchscript_onnx_tflite: - inference_time: 612.0 - throughput: 1633.986928104575 + inference_time: 606.0 + throughput: 1650.1650165016501 estimated_peak_memory_range: min: 16384 - max: 40580928 + max: 41021648 primary_compute_unit: NPU precision: fp16 layer_info: @@ -99,14 +101,14 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 107 - job_id: jogk78vyp + job_id: j7gjln77p job_status: Passed 
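Related editorial note: the `export.py` changes in this patch (inception_v3_quantized, mediapipe_face, mediapipe_hand, mediapipe_pose) drop the `supports_ort=False` and `supports_qnn=False` restrictions from `export_parser`, so ONNX Runtime becomes a selectable export target for these models. The sketch below is hedged: the module path and function names come from the diff, while the command-line flag shown in the comment is an assumption about the generated parser rather than something this patch adds.

```python
# The updated main() now builds an unrestricted parser for every runtime:
#     parser = export_parser(model_cls=Model, components=ALL_COMPONENTS)
# so an ORT export can be requested when invoking the script, e.g. (flag name assumed):
#     python -m qai_hub_models.models.mediapipe_pose.export --target-runtime ort
from qai_hub_models.models.mediapipe_pose.export import export_model, main  # module updated in this patch
```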
torchscript_onnx_qnn: - inference_time: 636.0 - throughput: 1572.3270440251572 + inference_time: 630.0 + throughput: 1587.3015873015872 estimated_peak_memory_range: - min: 208896 - max: 44032080 + min: 0 + max: 45101520 primary_compute_unit: NPU precision: fp16 layer_info: @@ -114,22 +116,22 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 140 - job_id: j1p3v6nxg + job_id: jvgdv13kg job_status: Passed torchscript_onnx_ort: - inference_time: 732.0 - throughput: 1366.120218579235 + inference_time: 769.0 + throughput: 1300.3901170351105 estimated_peak_memory_range: - min: 208896 - max: 21601008 + min: 212992 + max: 30386624 primary_compute_unit: NPU precision: fp16 layer_info: - layers_on_npu: 1 + layers_on_npu: 139 layers_on_gpu: 0 layers_on_cpu: 0 - total_layers: 1 - job_id: jlpeeyo7p + total_layers: 139 + job_id: jqpyd3orp job_status: Passed reference_device_info: name: Samsung Galaxy S24 @@ -138,13 +140,13 @@ models: os_name: Android manufacturer: Samsung chipset: Snapdragon® 8 Gen 3 - timestamp: '2024-04-23T18:42:32.544766Z' + timestamp: '2024-05-20T16:35:29.766075Z' - torchscript_onnx_tflite: - inference_time: 845.0 - throughput: 1183.4319526627219 + inference_time: 829.0 + throughput: 1206.2726176115802 estimated_peak_memory_range: - min: 32768 - max: 1538160 + min: 77824 + max: 1838752 primary_compute_unit: NPU precision: fp16 layer_info: @@ -152,14 +154,14 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 107 - job_id: j2p03xdep + job_id: jygz7dmzp job_status: Passed torchscript_onnx_qnn: - inference_time: 886.0 - throughput: 1128.6681715575621 + inference_time: 875.0 + throughput: 1142.857142857143 estimated_peak_memory_range: - min: 12288 - max: 104292296 + min: 229376 + max: 5314120 primary_compute_unit: NPU precision: fp16 layer_info: @@ -167,7 +169,7 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 140 - job_id: j1pv0nem5 + job_id: jo5mzxmyp job_status: Passed reference_device_info: name: QCS8550 (Proxy) @@ -176,15 +178,68 @@ models: os_name: Android manufacturer: Qualcomm chipset: Qcs8550 - timestamp: '2024-04-23T18:42:32.544811Z' + timestamp: '2024-05-20T16:35:29.766091Z' + - torchscript_onnx_qnn: + inference_time: 1047.0 + throughput: 955.1098376313277 + estimated_peak_memory_range: + min: 540672 + max: 540672 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 139 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 139 + job_id: jqp4wr1qg + job_status: Passed + torchscript_onnx_ort: + inference_time: 1080.0 + throughput: 925.925925925926 + estimated_peak_memory_range: + min: 1073152 + max: 1073152 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 139 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 139 + job_id: j1p87yez5 + job_status: Passed + torchscript_onnx_ort_dml_gpu: + inference_time: 15947.0 + throughput: 62.70771932024832 + estimated_peak_memory_range: + min: 26939392 + max: 26939392 + primary_compute_unit: CPU + precision: fp32 + layer_info: + layers_on_npu: 0 + layers_on_gpu: 0 + layers_on_cpu: 81 + total_layers: 81 + job_id: jn5q2ql75 + job_status: Passed + reference_device_info: + name: Snapdragon X Elite CRD + os: '11' + form_factor: Compute + os_name: Windows + manufacturer: Qualcomm + chipset: Snapdragon® X Elite + timestamp: '2024-05-20T16:35:29.766117Z' - name: MediaPipePoseLandmarkDetector performance_metrics: - torchscript_onnx_tflite: - inference_time: 1206.0 - throughput: 829.1873963515754 + inference_time: 1204.0 + throughput: 830.5647840531561 
estimated_peak_memory_range: - min: 16384 - max: 2448848 + min: 24576 + max: 2528368 primary_compute_unit: NPU precision: fp16 layer_info: @@ -192,14 +247,14 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 230 - job_id: j1p8012zg + job_id: j1pvwkd7g job_status: Passed torchscript_onnx_qnn: - inference_time: 1297.0 - throughput: 771.0100231303007 + inference_time: 1311.0 + throughput: 762.7765064836003 estimated_peak_memory_range: - min: 12288 - max: 15533680 + min: 16384 + max: 13548072 primary_compute_unit: NPU precision: fp16 layer_info: @@ -207,22 +262,22 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 306 - job_id: jw56ew2vg + job_id: jnp18zjkg job_status: Passed torchscript_onnx_ort: - inference_time: 106535.0 - throughput: 9.386586567794621 + inference_time: 1658.0 + throughput: 603.1363088057901 estimated_peak_memory_range: - min: 102400 - max: 26214168 + min: 53248 + max: 26730224 primary_compute_unit: NPU precision: fp16 layer_info: - layers_on_npu: 1 + layers_on_npu: 304 layers_on_gpu: 0 layers_on_cpu: 0 - total_layers: 1 - job_id: j7gjzqd75 + total_layers: 304 + job_id: jep2myrx5 job_status: Passed reference_device_info: name: Samsung Galaxy S23 @@ -231,13 +286,13 @@ models: os_name: Android manufacturer: Samsung chipset: Snapdragon® 8 Gen 2 - timestamp: '2024-04-23T18:42:32.544899Z' + timestamp: '2024-05-20T16:35:29.766142Z' - torchscript_onnx_tflite: - inference_time: 880.0 - throughput: 1136.3636363636363 + inference_time: 864.0 + throughput: 1157.4074074074074 estimated_peak_memory_range: - min: 16384 - max: 87924496 + min: 20480 + max: 88312288 primary_compute_unit: NPU precision: fp16 layer_info: @@ -245,14 +300,14 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 230 - job_id: jn5qev075 + job_id: jlpevmz75 job_status: Passed torchscript_onnx_qnn: - inference_time: 964.0 - throughput: 1037.344398340249 + inference_time: 948.0 + throughput: 1054.8523206751054 estimated_peak_memory_range: min: 802816 - max: 83648384 + max: 89559840 primary_compute_unit: NPU precision: fp16 layer_info: @@ -260,22 +315,22 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 306 - job_id: jwgok8z4p + job_id: jz57dr4q5 job_status: Passed torchscript_onnx_ort: - inference_time: 82694.0 - throughput: 12.092775775751566 + inference_time: 1253.0 + throughput: 798.0845969672786 estimated_peak_memory_range: - min: 819200 - max: 35448288 + min: 454656 + max: 38605792 primary_compute_unit: NPU precision: fp16 layer_info: - layers_on_npu: 1 + layers_on_npu: 304 layers_on_gpu: 0 layers_on_cpu: 0 - total_layers: 1 - job_id: jygzon2z5 + total_layers: 304 + job_id: j2p0r0m2p job_status: Passed reference_device_info: name: Samsung Galaxy S24 @@ -284,13 +339,13 @@ models: os_name: Android manufacturer: Samsung chipset: Snapdragon® 8 Gen 3 - timestamp: '2024-04-23T18:42:32.544985Z' + timestamp: '2024-05-20T16:35:29.766163Z' - torchscript_onnx_tflite: - inference_time: 1247.0 - throughput: 801.924619085806 + inference_time: 1244.0 + throughput: 803.8585209003215 estimated_peak_memory_range: - min: 12288 - max: 2817072 + min: 86016 + max: 3237392 primary_compute_unit: NPU precision: fp16 layer_info: @@ -298,14 +353,14 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 230 - job_id: j1p80k68g + job_id: jz5w967zp job_status: Passed torchscript_onnx_qnn: - inference_time: 1291.0 - throughput: 774.5933384972889 + inference_time: 1309.0 + throughput: 763.9419404125287 estimated_peak_memory_range: - min: 24576 - max: 13908424 + min: 12288 + max: 14098200 
primary_compute_unit: NPU precision: fp16 layer_info: @@ -313,7 +368,7 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 306 - job_id: j7gjz8o85 + job_id: jegnevzvg job_status: Passed reference_device_info: name: QCS8550 (Proxy) @@ -322,4 +377,57 @@ models: os_name: Android manufacturer: Qualcomm chipset: Qcs8550 - timestamp: '2024-04-23T18:42:32.545055Z' + timestamp: '2024-05-20T16:35:29.766179Z' + - torchscript_onnx_qnn: + inference_time: 1501.0 + throughput: 666.2225183211193 + estimated_peak_memory_range: + min: 786432 + max: 786432 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 305 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 305 + job_id: j0px1o4jg + job_status: Passed + torchscript_onnx_ort: + inference_time: 1633.0 + throughput: 612.369871402327 + estimated_peak_memory_range: + min: 7917568 + max: 7917568 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 304 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 304 + job_id: jogkyx2yp + job_status: Passed + torchscript_onnx_ort_dml_gpu: + inference_time: 6059.0 + throughput: 165.0437365901964 + estimated_peak_memory_range: + min: 20336640 + max: 20336640 + primary_compute_unit: GPU + precision: fp16 + layer_info: + layers_on_npu: 0 + layers_on_gpu: 1 + layers_on_cpu: 0 + total_layers: 1 + job_id: j1glkmyep + job_status: Passed + reference_device_info: + name: Snapdragon X Elite CRD + os: '11' + form_factor: Compute + os_name: Windows + manufacturer: Qualcomm + chipset: Snapdragon® X Elite + timestamp: '2024-05-20T16:35:29.766201Z' diff --git a/qai_hub_models/models/mediapipe_selfie/README.md b/qai_hub_models/models/mediapipe_selfie/README.md index 350d2545..fd842e56 100644 --- a/qai_hub_models/models/mediapipe_selfie/README.md +++ b/qai_hub_models/models/mediapipe_selfie/README.md @@ -14,6 +14,8 @@ accross various devices, can be found [here](https://aihub.qualcomm.com/models/m a hosted Qualcomm® device. + + ## Example & Usage Install the package via pip: diff --git a/qai_hub_models/models/mediapipe_selfie/export.py b/qai_hub_models/models/mediapipe_selfie/export.py index d257c256..ec8731e8 100644 --- a/qai_hub_models/models/mediapipe_selfie/export.py +++ b/qai_hub_models/models/mediapipe_selfie/export.py @@ -120,12 +120,16 @@ def export_model( model.eval() source_model = torch.jit.trace(model.to("cpu"), make_torch_inputs(input_spec)) + # Convert outputs from channel last to channel first (preferred I/O format for QNN and TensorFlow Lite) + channel_last_flags = ( + " --force_channel_last_input image" + " --force_channel_last_output output_0" + if target_runtime != TargetRuntime.ORT + else "" + ) + # 2. 
Compile the model to an on-device asset model_compile_options = model.get_hub_compile_options( - target_runtime, - compile_options - + " --force_channel_last_input image" - + " --force_channel_last_output output_0", + target_runtime, compile_options + channel_last_flags, hub_device ) print(f"Optimizing model {model_name} to run on-device") submitted_compile_job = hub.submit_compile_job( @@ -163,8 +167,10 @@ def export_model( ) sample_inputs = model.sample_inputs(input_spec) # Convert inputs from channel first to channel last - hub_inputs = transpose_channel_first_to_last( - "image", sample_inputs, target_runtime + hub_inputs = ( + sample_inputs + if target_runtime == TargetRuntime.ORT + else transpose_channel_first_to_last("image", sample_inputs, target_runtime) ) submitted_inference_job = hub.submit_inference_job( model=compile_job.get_target_model(), @@ -192,8 +198,12 @@ def export_model( assert inference_job is not None and inference_job.wait().success inference_result: hub.client.DatasetEntries = inference_job.download_output_data() # type: ignore # Convert outputs from channel last to channel first - inference_result = transpose_channel_last_to_first( - "output_0", inference_result, target_runtime + inference_result = ( + inference_result + if target_runtime == TargetRuntime.ORT + else transpose_channel_last_to_first( + "output_0", inference_result, target_runtime + ) ) print_inference_metrics(inference_job, inference_result, torch_out) @@ -205,7 +215,7 @@ def export_model( def main(): warnings.filterwarnings("ignore") - parser = export_parser(model_cls=Model, supports_ort=False) + parser = export_parser(model_cls=Model) args = parser.parse_args() export_model(**vars(args)) diff --git a/qai_hub_models/models/mediapipe_selfie/perf.yaml b/qai_hub_models/models/mediapipe_selfie/perf.yaml index 46644d9a..f988331b 100644 --- a/qai_hub_models/models/mediapipe_selfie/perf.yaml +++ b/qai_hub_models/models/mediapipe_selfie/perf.yaml @@ -22,6 +22,7 @@ aggregated: - Samsung Galaxy S24 Ultra - Samsung Galaxy S24+ - Samsung Galaxy Tab S8 + - Snapdragon X Elite CRD - Xiaomi 12 - Xiaomi 12 Pro supported_chipsets: @@ -30,15 +31,16 @@ aggregated: - Snapdragon® 8 Gen 2 - Snapdragon® 8 Gen 3 - Snapdragon® 888 + - Snapdragon® X Elite models: - name: MediaPipe-Selfie-Segmentation performance_metrics: - torchscript_onnx_tflite: - inference_time: 792.0 - throughput: 1262.6262626262626 + inference_time: 807.0 + throughput: 1239.1573729863692 estimated_peak_memory_range: min: 12288 - max: 4536656 + max: 1954960 primary_compute_unit: NPU precision: fp16 layer_info: @@ -46,14 +48,14 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 118 - job_id: jnp1y62kp + job_id: jw56148vp job_status: Passed torchscript_onnx_qnn: - inference_time: 773.0 - throughput: 1293.6610608020699 + inference_time: 787.0 + throughput: 1270.6480304955528 estimated_peak_memory_range: - min: 32768 - max: 18516080 + min: 28672 + max: 13500824 primary_compute_unit: NPU precision: fp16 layer_info: @@ -61,22 +63,22 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 138 - job_id: jz57092qg + job_id: j1pvwkl7g job_status: Passed torchscript_onnx_ort: - inference_time: 164651.0 - throughput: 6.073452332509368 + inference_time: 1327.0 + throughput: 753.5795026375282 estimated_peak_memory_range: - min: 1437696 - max: 5932024 + min: 802816 + max: 5487496 primary_compute_unit: NPU precision: fp16 layer_info: - layers_on_npu: 1 + layers_on_npu: 140 layers_on_gpu: 0 layers_on_cpu: 0 - total_layers: 1 - job_id: j0pxnx9j5 + total_layers: 
140 + job_id: jz5w96lzp job_status: Passed reference_device_info: name: Samsung Galaxy S23 @@ -85,13 +87,13 @@ models: os_name: Android manufacturer: Samsung chipset: Snapdragon® 8 Gen 2 - timestamp: '2024-04-23T18:42:32.589824Z' + timestamp: '2024-05-20T16:35:29.823981Z' - torchscript_onnx_tflite: - inference_time: 536.0 - throughput: 1865.6716417910447 + inference_time: 542.0 + throughput: 1845.018450184502 estimated_peak_memory_range: - min: 12288 - max: 23055696 + min: 16384 + max: 23610032 primary_compute_unit: NPU precision: fp16 layer_info: @@ -99,14 +101,14 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 118 - job_id: jvgde2nk5 + job_id: j1p3m0zxg job_status: Passed torchscript_onnx_qnn: - inference_time: 525.0 - throughput: 1904.7619047619048 + inference_time: 510.0 + throughput: 1960.7843137254902 estimated_peak_memory_range: min: 176128 - max: 41755712 + max: 41845584 primary_compute_unit: NPU precision: fp16 layer_info: @@ -114,22 +116,22 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 138 - job_id: jqp4k3nqg + job_id: j7gjlnr7p job_status: Passed torchscript_onnx_ort: - inference_time: 121169.0 - throughput: 8.252935981975588 + inference_time: 945.0 + throughput: 1058.2010582010582 estimated_peak_memory_range: min: 12288 - max: 18735968 + max: 20917104 primary_compute_unit: NPU precision: fp16 layer_info: - layers_on_npu: 1 + layers_on_npu: 140 layers_on_gpu: 0 layers_on_cpu: 0 - total_layers: 1 - job_id: jo5mq8eyp + total_layers: 140 + job_id: jmg94nzq5 job_status: Passed reference_device_info: name: Samsung Galaxy S24 @@ -138,13 +140,13 @@ models: os_name: Android manufacturer: Samsung chipset: Snapdragon® 8 Gen 3 - timestamp: '2024-04-23T18:42:32.589884Z' + timestamp: '2024-05-20T16:35:29.824007Z' - torchscript_onnx_tflite: - inference_time: 785.0 - throughput: 1273.8853503184714 + inference_time: 809.0 + throughput: 1236.0939431396787 estimated_peak_memory_range: - min: 12288 - max: 2039720 + min: 20480 + max: 1607472 primary_compute_unit: NPU precision: fp16 layer_info: @@ -152,14 +154,14 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 118 - job_id: j0pxnzd95 + job_id: jwgov6l45 job_status: Passed torchscript_onnx_qnn: - inference_time: 772.0 - throughput: 1295.3367875647668 + inference_time: 787.0 + throughput: 1270.6480304955528 estimated_peak_memory_range: - min: 819200 - max: 8273816 + min: 806912 + max: 41288280 primary_compute_unit: NPU precision: fp16 layer_info: @@ -167,7 +169,7 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 138 - job_id: jep20zvmg + job_id: jygz7dlzp job_status: Passed reference_device_info: name: QCS8550 (Proxy) @@ -176,4 +178,57 @@ models: os_name: Android manufacturer: Qualcomm chipset: Qcs8550 - timestamp: '2024-04-23T18:42:32.589938Z' + timestamp: '2024-05-20T16:35:29.824029Z' + - torchscript_onnx_qnn: + inference_time: 945.0 + throughput: 1058.2010582010582 + estimated_peak_memory_range: + min: 786432 + max: 786432 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 138 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 138 + job_id: jlpevm775 + job_status: Passed + torchscript_onnx_ort: + inference_time: 1395.0 + throughput: 716.8458781362007 + estimated_peak_memory_range: + min: 2465792 + max: 2465792 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 140 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 140 + job_id: jnp18znkg + job_status: Passed + torchscript_onnx_ort_dml_gpu: + inference_time: 4582.0 + throughput: 218.2453077258839 + 
estimated_peak_memory_range: + min: 16928768 + max: 16928768 + primary_compute_unit: CPU + precision: fp32 + layer_info: + layers_on_npu: 0 + layers_on_gpu: 0 + layers_on_cpu: 110 + total_layers: 110 + job_id: jvgdv1dkg + job_status: Passed + reference_device_info: + name: Snapdragon X Elite CRD + os: '11' + form_factor: Compute + os_name: Windows + manufacturer: Qualcomm + chipset: Snapdragon® X Elite + timestamp: '2024-05-20T16:35:29.824052Z' diff --git a/qai_hub_models/models/midas/README.md b/qai_hub_models/models/midas/README.md new file mode 100644 index 00000000..69a660f5 --- /dev/null +++ b/qai_hub_models/models/midas/README.md @@ -0,0 +1,56 @@ +[![Qualcomm® AI Hub Models](https://qaihub-public-assets.s3.us-west-2.amazonaws.com/qai-hub-models/quic-logo.jpg)](../../README.md) + + +# [Midas-V2: Deep Convolutional Neural Network model for depth estimation](#) + +Midas is designed for estimating depth at each point in an image. + +This is based on the implementation of Midas-V2 found +[here](https://github.com/isl-org/MiDaS). This repository contains scripts for optimized on-device +export suitable to run on Qualcomm® devices. More details on model performance +across various devices can be found [here](#). + +[Sign up](https://myaccount.qualcomm.com/signup) for early access to run these models on +a hosted Qualcomm® device. + + + + +## Example & Usage + + +Once the `qai_hub_models` package is installed, run the following simple CLI demo: + +```bash +python -m qai_hub_models.models.midas.demo +``` +More details on the CLI tool can be found with the `--help` option. See +[demo.py](demo.py) for sample usage of the model including pre/post processing +scripts. Please refer to our [general instructions on using +models](../../../#getting-started) for more usage instructions. + +## Export for on-device deployment + +This repository contains export scripts that produce a model optimized for +on-device deployment. This can be run as follows: + +```bash +python -m qai_hub_models.models.midas.export +``` +Additional options are documented with the `--help` option. Note that the above +script requires access to Qualcomm® AI Hub; refer to the Qualcomm® AI Hub deployment instructions for details. + +## License +- The license for the original implementation of Midas-V2 can be found + [here](https://github.com/isl-org/MiDaS/blob/master/LICENSE). +- The license for the compiled assets for on-device deployment can be found [here](https://qaihub-public-assets.s3.us-west-2.amazonaws.com/qai-hub-models/Qualcomm+AI+Hub+Proprietary+License.pdf) + +## References +* [Towards Robust Monocular Depth Estimation: Mixing Datasets for Zero-shot Cross-dataset Transfer](https://arxiv.org/abs/1907.01341v3) +* [Source Model Implementation](https://github.com/isl-org/MiDaS) + +## Community +* Join [our AI Hub Slack community](https://qualcomm-ai-hub.slack.com/join/shared_invite/zt-2d5zsmas3-Sj0Q9TzslueCjS31eXG2UA#/shared-invite/email) to collaborate, post questions and learn more about on-device AI. +* For questions or feedback please [reach out to us](mailto:ai-hub-support@qti.qualcomm.com). + + diff --git a/qai_hub_models/models/midas/__init__.py b/qai_hub_models/models/midas/__init__.py new file mode 100644 index 00000000..10b63ec3 --- /dev/null +++ b/qai_hub_models/models/midas/__init__.py @@ -0,0 +1,7 @@ +# --------------------------------------------------------------------- +# Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved.
+# SPDX-License-Identifier: BSD-3-Clause +# --------------------------------------------------------------------- +from .app import MidasApp as App # noqa: F401 +from .model import MODEL_ID # noqa: F401 +from .model import Midas as Model # noqa: F401 diff --git a/qai_hub_models/models/midas/app.py b/qai_hub_models/models/midas/app.py new file mode 100644 index 00000000..949c87b1 --- /dev/null +++ b/qai_hub_models/models/midas/app.py @@ -0,0 +1,63 @@ +# --------------------------------------------------------------------- +# Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. +# SPDX-License-Identifier: BSD-3-Clause +# --------------------------------------------------------------------- +from __future__ import annotations + +from typing import Callable, List + +import matplotlib.pyplot as plt +import numpy as np +import torch +from PIL import Image +from torchvision import transforms + +from qai_hub_models.utils.image_processing import pil_resize_pad, undo_resize_pad + + +class MidasApp: + def __init__( + self, + model: Callable[[torch.Tensor], torch.Tensor], + input_height: int, + input_width: int, + ): + self.model = model + self.input_height = input_height + self.input_width = input_width + + def predict(self, *args, **kwargs): + return self.estimate_depth(*args, **kwargs) + + def estimate_depth( + self, + image: Image.Image, + raw_output: bool = False, + ) -> List[Image.Image] | np.ndarray: + """ + Estimates the depth at each point in an image and produces a heatmap. + + Parameters: + image: PIL Image to estimate depth. + raw_output: If set, returns the raw depth estimates instead of a heatmap. + + Returns: + A heatmap PIL Image or an np array of depth estimates. + np array will be shape (h, w) where h, w are the dimensions of the input. + np array will contain raw depth estimates, while PIL image will normalize + the values and display them as an RGB image. + """ + resized_image, scale, padding = pil_resize_pad( + image, (self.input_height, self.input_width) + ) + image_tensor = transforms.ToTensor()(resized_image).unsqueeze(0) + with torch.no_grad(): + prediction = self.model(image_tensor) + prediction = undo_resize_pad( + prediction.unsqueeze(0), image.size, scale, padding + ) + numpy_output = prediction.squeeze().cpu().numpy() + if raw_output: + return numpy_output + heatmap = plt.cm.plasma(numpy_output / numpy_output.max())[..., :3] + return Image.fromarray((heatmap * 255).astype(np.uint8)) diff --git a/qai_hub_models/models/midas/conftest.py b/qai_hub_models/models/midas/conftest.py new file mode 100644 index 00000000..fb82cdde --- /dev/null +++ b/qai_hub_models/models/midas/conftest.py @@ -0,0 +1,39 @@ +# --------------------------------------------------------------------- +# Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. +# SPDX-License-Identifier: BSD-3-Clause +# --------------------------------------------------------------------- +# THIS FILE WAS AUTO-GENERATED. DO NOT EDIT MANUALLY. + +import inspect + +import pytest + +from qai_hub_models.models.midas import Model +from qai_hub_models.utils.testing import skip_clone_repo_check + + +# Instantiate the model only once for all tests. +# Mock from_pretrained to always return the initialized model. +# This speeds up tests and limits memory leaks. 
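+# The fixture below monkeypatches Model.from_pretrained with a cache keyed on the
+# stringified positional and keyword arguments, so repeated calls with the same
+# arguments (e.g. Model.from_pretrained("MiDaS_small")) return the same instance
+# for the duration of this test module.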
+@pytest.fixture(scope="module", autouse=True) +def cached_from_pretrained(): + with pytest.MonkeyPatch.context() as mp: + pretrained_cache = {} + from_pretrained = Model.from_pretrained + sig = inspect.signature(from_pretrained) + + @skip_clone_repo_check + def _cached_from_pretrained(*args, **kwargs): + cache_key = str(args) + str(kwargs) + model = pretrained_cache.get(cache_key, None) + if model: + return model + else: + model = from_pretrained(*args, **kwargs) + pretrained_cache[cache_key] = model + return model + + _cached_from_pretrained.__signature__ = sig + + mp.setattr(Model, "from_pretrained", _cached_from_pretrained) + yield mp diff --git a/qai_hub_models/models/midas/demo.py b/qai_hub_models/models/midas/demo.py new file mode 100644 index 00000000..9b2aa9b7 --- /dev/null +++ b/qai_hub_models/models/midas/demo.py @@ -0,0 +1,62 @@ +# --------------------------------------------------------------------- +# Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. +# SPDX-License-Identifier: BSD-3-Clause +# --------------------------------------------------------------------- +from __future__ import annotations + +from typing import Type + +from qai_hub_models.models.midas.app import MidasApp +from qai_hub_models.models.midas.model import MODEL_ASSET_VERSION, MODEL_ID, Midas +from qai_hub_models.utils.args import ( + demo_model_from_cli_args, + get_model_cli_parser, + get_on_device_demo_parser, + validate_on_device_demo_args, +) +from qai_hub_models.utils.asset_loaders import CachedWebModelAsset, load_image +from qai_hub_models.utils.display import display_or_save_image + +# Demo image comes from https://github.com/pytorch/hub/raw/master/images/dog.jpg +INPUT_IMAGE_ADDRESS = CachedWebModelAsset.from_asset_store( + MODEL_ID, MODEL_ASSET_VERSION, "test_input_image.jpg" +) + + +# Run Midas end-to-end on a sample image. +# The demo will display a heatmap of the estimated depth at each point in the image. +def midas_demo(model_cls: Type[Midas], is_test: bool = False): + # Demo parameters + parser = get_model_cli_parser(model_cls) + parser = get_on_device_demo_parser(parser, add_output_dir=True) + parser.add_argument( + "--image", + type=str, + default=INPUT_IMAGE_ADDRESS, + help="image file path or URL", + ) + args = parser.parse_args([] if is_test else None) + model = demo_model_from_cli_args(model_cls, MODEL_ID, args) + validate_on_device_demo_args(args, MODEL_ID) + + # Load image + (_, _, height, width) = model_cls.get_input_spec()["image"][0] + image = load_image(args.image) + print("Model Loaded") + + app = MidasApp(model, height, width) + heatmap_image = app.estimate_depth(image) + + if not is_test: + # Resize / unpad annotated image + display_or_save_image( + heatmap_image, args.output_dir, "midas_heatmap.png", "heatmap" + ) + + +def main(is_test: bool = False): + return midas_demo(model_cls=Midas, is_test=is_test) + + +if __name__ == "__main__": + main() diff --git a/qai_hub_models/models/midas/export.py b/qai_hub_models/models/midas/export.py new file mode 100644 index 00000000..b02c2b68 --- /dev/null +++ b/qai_hub_models/models/midas/export.py @@ -0,0 +1,217 @@ +# --------------------------------------------------------------------- +# Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. +# SPDX-License-Identifier: BSD-3-Clause +# --------------------------------------------------------------------- +# THIS FILE WAS AUTO-GENERATED. DO NOT EDIT MANUALLY. 
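+# Typical CLI invocation, as documented in the model README; options such as the
+# target device and runtime are described under --help:
+#   python -m qai_hub_models.models.midas.export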
+ + +from __future__ import annotations + +import os +import warnings +from pathlib import Path +from typing import Any, Dict, List, Optional, Tuple, cast + +import qai_hub as hub +import torch + +from qai_hub_models.models.midas import Model +from qai_hub_models.utils.args import ( + export_parser, + get_input_spec_kwargs, + get_model_kwargs, +) +from qai_hub_models.utils.base_model import TargetRuntime +from qai_hub_models.utils.compare import torch_inference +from qai_hub_models.utils.input_spec import make_torch_inputs +from qai_hub_models.utils.printing import ( + print_inference_metrics, + print_on_target_demo_cmd, + print_profile_metrics_from_job, +) +from qai_hub_models.utils.qai_hub_helpers import ( + can_access_qualcomm_ai_hub, + export_without_hub_access, + transpose_channel_first_to_last, +) + + +def export_model( + device: str = "Samsung Galaxy S23", + chipset: Optional[str] = None, + skip_profiling: bool = False, + skip_inferencing: bool = False, + skip_downloading: bool = False, + skip_summary: bool = False, + output_dir: Optional[str] = None, + target_runtime: TargetRuntime = TargetRuntime.TFLITE, + compile_options: str = "", + profile_options: str = "", + **additional_model_kwargs, +) -> Tuple[hub.CompileJob, Optional[hub.ProfileJob], Optional[hub.InferenceJob]] | List[ + str +]: + """ + This function accomplishes 6 main tasks: + + 1. Instantiates a PyTorch model and converts it to a traced TorchScript format. + 2. Compiles the model to an asset that can be run on device. + 3. Profiles the model performance on real devices. + 4. Inferences the model on sample inputs. + 5. Downloads the model asset to the local directory. + 6. Summarizes the results from profiling and inference. + + Each of the last four steps can be optionally skipped using the input options. + + Parameters: + device: Device for which to export the model. + Full list of available devices can be found by running `hub.get_devices()`. + Defaults to DEFAULT_DEVICE if not specified. + chipset: If set, will choose a random device with this chipset. + Overrides the `device` argument. + skip_profiling: If set, skips profiling of compiled model on real devices. + skip_inferencing: If set, skips computing on-device outputs from sample data. + skip_downloading: If set, skips downloading of compiled model. + skip_summary: If set, skips waiting for and summarizing results + from profiling and inference. + output_dir: Directory to store generated assets (e.g. compiled model). + Defaults to `/build/`. + target_runtime: Which on-device runtime to target. Default is TFLite. + compile_options: Additional options to pass when submitting the compile job. + profile_options: Additional options to pass when submitting the profile job. + **additional_model_kwargs: Additional optional kwargs used to customize + `model_cls.from_pretrained` and `model.get_input_spec` + + Returns: + A 3-tuple of: + * A CompileJob object containing metadata about the compile job submitted to hub. + * A ProfileJob containing metadata about the profile job (None if profiling skipped). + * An InferenceJob containing metadata about the inference job (None if inferencing skipped). 
+ """ + model_name = "midas" + output_path = Path(output_dir or Path.cwd() / "build" / model_name) + if chipset: + hub_device = hub.Device(attributes=f"chipset:{chipset}") + else: + hub_device = hub.Device(name=device) + if not can_access_qualcomm_ai_hub(): + return export_without_hub_access( + "midas", + "Midas-V2", + device, + skip_profiling, + skip_inferencing, + skip_downloading, + skip_summary, + output_path, + target_runtime, + compile_options, + profile_options, + ) + + # 1. Initialize PyTorch model + model = Model.from_pretrained(**get_model_kwargs(Model, additional_model_kwargs)) + input_spec = model.get_input_spec( + **get_input_spec_kwargs(model, additional_model_kwargs) + ) + + # Trace the model + model.eval() + source_model = torch.jit.trace( + model.to("cpu"), make_torch_inputs(input_spec), check_trace=False + ) + + # Convert outputs from channel last to channel first (preferred I/O format for QNN and TensorFlow Lite) + channel_last_flags = ( + " --force_channel_last_input image" + if target_runtime != TargetRuntime.ORT + else "" + ) + + # 2. Compile the model to an on-device asset + model_compile_options = model.get_hub_compile_options( + target_runtime, compile_options + channel_last_flags, hub_device + ) + print(f"Optimizing model {model_name} to run on-device") + submitted_compile_job = hub.submit_compile_job( + model=source_model, + input_specs=input_spec, + device=hub_device, + name=model_name, + options=model_compile_options, + ) + compile_job = cast(hub.client.CompileJob, submitted_compile_job) + + # 3. Profile the model asset on real devices + profile_job: Optional[hub.client.ProfileJob] = None + if not skip_profiling: + profile_options_all = model.get_hub_profile_options( + target_runtime, profile_options + ) + print(f"Profiling model {model_name} on a hosted device.") + submitted_profile_job = hub.submit_profile_job( + model=compile_job.get_target_model(), + device=hub_device, + name=model_name, + options=profile_options_all, + ) + profile_job = cast(hub.client.ProfileJob, submitted_profile_job) + + # 4. Run inference on-device with sample inputs + inference_job: Optional[hub.client.InferenceJob] = None + if not skip_inferencing: + profile_options_all = model.get_hub_profile_options( + target_runtime, profile_options + ) + print( + f"Running inference for {model_name} on a hosted device with example inputs." + ) + sample_inputs = model.sample_inputs(input_spec) + # Convert inputs from channel first to channel last + hub_inputs = ( + sample_inputs + if target_runtime == TargetRuntime.ORT + else transpose_channel_first_to_last("image", sample_inputs, target_runtime) + ) + submitted_inference_job = hub.submit_inference_job( + model=compile_job.get_target_model(), + inputs=hub_inputs, + device=hub_device, + name=model_name, + options=profile_options_all, + ) + inference_job = cast(hub.client.InferenceJob, submitted_inference_job) + + # 5. Download the model asset to a local file + if not skip_downloading: + os.makedirs(output_path, exist_ok=True) + target_model: hub.Model = compile_job.get_target_model() # type: ignore + target_model.download(str(output_path / f"{model_name}.tflite")) + + # 6. 
Summarize the results from profiling and inference + if not skip_summary and not skip_profiling: + assert profile_job is not None and profile_job.wait().success + profile_data: Dict[str, Any] = profile_job.download_profile() # type: ignore + print_profile_metrics_from_job(profile_job, profile_data) + + if not skip_summary and not skip_inferencing: + torch_out = torch_inference(model, sample_inputs) + assert inference_job is not None and inference_job.wait().success + inference_result: hub.client.DatasetEntries = inference_job.download_output_data() # type: ignore + print_inference_metrics(inference_job, inference_result, torch_out) + + if not skip_summary: + print_on_target_demo_cmd(compile_job, Path(__file__).parent.resolve(), device) + + return (compile_job, profile_job, inference_job) + + +def main(): + warnings.filterwarnings("ignore") + parser = export_parser(model_cls=Model) + args = parser.parse_args() + export_model(**vars(args)) + + +if __name__ == "__main__": + main() diff --git a/qai_hub_models/models/midas/info.yaml b/qai_hub_models/models/midas/info.yaml new file mode 100644 index 00000000..be8b2527 --- /dev/null +++ b/qai_hub_models/models/midas/info.yaml @@ -0,0 +1,34 @@ +name: Midas-V2 +# id must match with the model dir name in qai_hub_models +id: midas +status: public +headline: Deep Convolutional Neural Network model for depth estimation. +domain: Computer Vision +use_case: Depth Estimation +description: Midas is designed for estimating depth at each point in an image. +tags: [] +research_paper: https://arxiv.org/abs/1907.01341v3 +research_paper_title: 'Towards Robust Monocular Depth Estimation: Mixing Datasets + for Zero-shot Cross-dataset Transfer' +license: https://github.com/isl-org/MiDaS/blob/master/LICENSE +deploy_license: + https://qaihub-public-assets.s3.us-west-2.amazonaws.com/qai-hub-models/Qualcomm+AI+Hub+Proprietary+License.pdf +source_repo: https://github.com/isl-org/MiDaS +technical_details: + Model checkpoint: MiDaS_small + Input resolution: 256x256 + Number of parameters: 16.6M + Model size: 63.2 MB +applicable_scenarios: + - Anomaly Detection + - Inventory Management +related_models: [] +form_factors: + - Phone + - Tablet + - IoT +has_static_banner: yes +has_animated_banner: no +license_type: mit +deploy_license_type: AI Model Hub License +dataset: [] diff --git a/qai_hub_models/models/midas/model.py b/qai_hub_models/models/midas/model.py new file mode 100644 index 00000000..3f6b1d53 --- /dev/null +++ b/qai_hub_models/models/midas/model.py @@ -0,0 +1,54 @@ +# --------------------------------------------------------------------- +# Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. 
+# SPDX-License-Identifier: BSD-3-Clause +# --------------------------------------------------------------------- +from __future__ import annotations + +import torch + +from qai_hub_models.utils.base_model import BaseModel +from qai_hub_models.utils.image_processing import normalize_image_torchvision +from qai_hub_models.utils.input_spec import InputSpec + +MODEL_ID = __name__.split(".")[-2] +MODEL_ASSET_VERSION = 1 +DEFAULT_WEIGHTS = "MiDaS_small" + + +class Midas(BaseModel): + """Exportable Midas depth estimation model.""" + + def __init__( + self, + model: torch.nn.Module, + normalize_input: bool = True, + ) -> None: + super().__init__() + self.model = model + self.normalize_input = normalize_input + + @classmethod + def from_pretrained(cls, weights: str = DEFAULT_WEIGHTS) -> Midas: + model = torch.hub.load("intel-isl/MiDaS", weights).eval() + return cls(model) + + @staticmethod + def get_input_spec(height: int = 256, width: int = 256) -> InputSpec: + return {"image": ((1, 3, height, width), "float32")} + + def forward(self, image): + """ + Runs the model on an image tensor and returns a tensor of depth estimates + + Parameters: + image: A [1, 3, H, W] image. + Pixel values pre-processed for encoder consumption. + Range: float[0, 1] if self.normalize_input, else ~[-2.5, 2.5] + 3-channel Color Space: RGB + + Returns: + Tensor of depth estimates of size [1, H, W]. + """ + if self.normalize_input: + image = normalize_image_torchvision(image) + return self.model(image) diff --git a/qai_hub_models/models/midas/test.py b/qai_hub_models/models/midas/test.py new file mode 100644 index 00000000..11377780 --- /dev/null +++ b/qai_hub_models/models/midas/test.py @@ -0,0 +1,51 @@ +# --------------------------------------------------------------------- +# Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. +# SPDX-License-Identifier: BSD-3-Clause +# --------------------------------------------------------------------- +import numpy as np +import pytest + +from qai_hub_models.models.midas.app import MidasApp +from qai_hub_models.models.midas.demo import INPUT_IMAGE_ADDRESS +from qai_hub_models.models.midas.demo import main as demo_main +from qai_hub_models.models.midas.model import MODEL_ASSET_VERSION, MODEL_ID, Midas +from qai_hub_models.utils.asset_loaders import CachedWebModelAsset, load_image +from qai_hub_models.utils.testing import skip_clone_repo_check + +OUTPUT_IMAGE_ADDRESS = CachedWebModelAsset.from_asset_store( + MODEL_ID, MODEL_ASSET_VERSION, "midas_output.png" +) + + +# Verify that the output from Torch is as expected. 
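+# A minimal usage sketch of the pieces exercised below (assumes network access for
+# the intel-isl/MiDaS torch.hub checkpoint and for the cached test image):
+#   model = Midas.from_pretrained()
+#   (_, _, h, w) = Midas.get_input_spec()["image"][0]
+#   depth = MidasApp(model, h, w).estimate_depth(load_image(INPUT_IMAGE_ADDRESS), raw_output=True)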
+@skip_clone_repo_check +def test_task(): + (_, _, height, width) = Midas.get_input_spec()["image"][0] + app = MidasApp(Midas.from_pretrained(), height, width) + original_image = load_image(INPUT_IMAGE_ADDRESS) + output_image = app.estimate_depth(original_image) + output_image_oracle = load_image(OUTPUT_IMAGE_ADDRESS) + + np.testing.assert_allclose( + np.asarray(output_image), np.asarray(output_image_oracle), atol=3 + ) + + +@pytest.mark.trace +@skip_clone_repo_check +def test_trace(): + (_, _, height, width) = Midas.get_input_spec()["image"][0] + traced_model = Midas.from_pretrained().convert_to_torchscript(check_trace=False) + app = MidasApp(traced_model, height, width) + original_image = load_image(INPUT_IMAGE_ADDRESS) + output_image = app.estimate_depth(original_image) + output_image_oracle = load_image(OUTPUT_IMAGE_ADDRESS) + + np.testing.assert_allclose( + np.asarray(output_image), np.asarray(output_image_oracle), atol=3 + ) + + +@skip_clone_repo_check +def test_demo(): + demo_main(is_test=True) diff --git a/qai_hub_models/models/mnasnet05/README.md b/qai_hub_models/models/mnasnet05/README.md index 3921fdd5..f17444f4 100644 --- a/qai_hub_models/models/mnasnet05/README.md +++ b/qai_hub_models/models/mnasnet05/README.md @@ -14,6 +14,8 @@ accross various devices, can be found [here](https://aihub.qualcomm.com/models/m a hosted Qualcomm® device. + + ## Example & Usage diff --git a/qai_hub_models/models/mnasnet05/export.py b/qai_hub_models/models/mnasnet05/export.py index bf5d429c..046dda81 100644 --- a/qai_hub_models/models/mnasnet05/export.py +++ b/qai_hub_models/models/mnasnet05/export.py @@ -119,9 +119,16 @@ def export_model( model.eval() source_model = torch.jit.trace(model.to("cpu"), make_torch_inputs(input_spec)) + # Convert outputs from channel last to channel first (preferred I/O format for QNN and TensorFlow Lite) + channel_last_flags = ( + " --force_channel_last_input image_tensor" + if target_runtime != TargetRuntime.ORT + else "" + ) + # 2. 
Compile the model to an on-device asset model_compile_options = model.get_hub_compile_options( - target_runtime, compile_options + " --force_channel_last_input image_tensor" + target_runtime, compile_options + channel_last_flags, hub_device ) print(f"Optimizing model {model_name} to run on-device") submitted_compile_job = hub.submit_compile_job( @@ -159,8 +166,12 @@ def export_model( ) sample_inputs = model.sample_inputs(input_spec) # Convert inputs from channel first to channel last - hub_inputs = transpose_channel_first_to_last( - "image_tensor", sample_inputs, target_runtime + hub_inputs = ( + sample_inputs + if target_runtime == TargetRuntime.ORT + else transpose_channel_first_to_last( + "image_tensor", sample_inputs, target_runtime + ) ) submitted_inference_job = hub.submit_inference_job( model=compile_job.get_target_model(), @@ -197,7 +208,7 @@ def export_model( def main(): warnings.filterwarnings("ignore") - parser = export_parser(model_cls=Model, supports_ort=False) + parser = export_parser(model_cls=Model) args = parser.parse_args() export_model(**vars(args)) diff --git a/qai_hub_models/models/mnasnet05/perf.yaml b/qai_hub_models/models/mnasnet05/perf.yaml index 63b22c4e..8bfa98ec 100644 --- a/qai_hub_models/models/mnasnet05/perf.yaml +++ b/qai_hub_models/models/mnasnet05/perf.yaml @@ -22,6 +22,7 @@ aggregated: - Samsung Galaxy S24 Ultra - Samsung Galaxy S24+ - Samsung Galaxy Tab S8 + - Snapdragon X Elite CRD - Xiaomi 12 - Xiaomi 12 Pro supported_chipsets: @@ -30,15 +31,16 @@ aggregated: - Snapdragon® 8 Gen 2 - Snapdragon® 8 Gen 3 - Snapdragon® 888 + - Snapdragon® X Elite models: - name: MNASNet05 performance_metrics: - torchscript_onnx_tflite: - inference_time: 800.0 - throughput: 1250.0 + inference_time: 771.0 + throughput: 1297.0168612191958 estimated_peak_memory_range: - min: 16384 - max: 1867832 + min: 49152 + max: 2163152 primary_compute_unit: NPU precision: fp16 layer_info: @@ -46,14 +48,14 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 71 - job_id: jopr8w8v5 + job_id: jqpyd384p job_status: Passed torchscript_onnx_qnn: - inference_time: 848.0 - throughput: 1179.245283018868 + inference_time: 824.0 + throughput: 1213.5922330097087 estimated_peak_memory_range: - min: 630784 - max: 4926760 + min: 16384 + max: 45567712 primary_compute_unit: NPU precision: fp16 layer_info: @@ -61,22 +63,22 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 103 - job_id: jqpyrmrr5 + job_id: jogkyx6op job_status: Passed torchscript_onnx_ort: - inference_time: 990.0 - throughput: 1010.10101010101 + inference_time: 768.0 + throughput: 1302.0833333333333 estimated_peak_memory_range: - min: 12288 - max: 21275160 + min: 16384 + max: 18880896 primary_compute_unit: NPU precision: fp16 layer_info: - layers_on_npu: 1 + layers_on_npu: 104 layers_on_gpu: 0 layers_on_cpu: 0 - total_layers: 1 - job_id: j1p8010zg + total_layers: 104 + job_id: j1p3m0ozg job_status: Passed reference_device_info: name: Samsung Galaxy S23 @@ -85,13 +87,13 @@ models: os_name: Android manufacturer: Samsung chipset: Snapdragon® 8 Gen 2 - timestamp: '2024-04-23T18:42:32.613841Z' + timestamp: '2024-05-20T16:35:29.865693Z' - torchscript_onnx_tflite: - inference_time: 530.0 - throughput: 1886.7924528301887 + inference_time: 522.0 + throughput: 1915.7088122605364 estimated_peak_memory_range: - min: 12288 - max: 45612800 + min: 16384 + max: 46214320 primary_compute_unit: NPU precision: fp16 layer_info: @@ -99,14 +101,14 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 71 - job_id: jep20e0xg + job_id: j2p0r0oep 
job_status: Passed torchscript_onnx_qnn: - inference_time: 565.0 - throughput: 1769.9115044247787 + inference_time: 562.0 + throughput: 1779.3594306049822 estimated_peak_memory_range: min: 0 - max: 41195552 + max: 38662336 primary_compute_unit: NPU precision: fp16 layer_info: @@ -114,22 +116,22 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 103 - job_id: j2p03632p + job_id: jn5q2q4m5 job_status: Passed torchscript_onnx_ort: - inference_time: 641.0 - throughput: 1560.0624024960998 + inference_time: 531.0 + throughput: 1883.2391713747645 estimated_peak_memory_range: - min: 24576 - max: 21468016 + min: 634880 + max: 26749664 primary_compute_unit: NPU precision: fp16 layer_info: - layers_on_npu: 1 + layers_on_npu: 104 layers_on_gpu: 0 layers_on_cpu: 0 - total_layers: 1 - job_id: jogk787yp + total_layers: 104 + job_id: jwgov6dd5 job_status: Passed reference_device_info: name: Samsung Galaxy S24 @@ -138,13 +140,13 @@ models: os_name: Android manufacturer: Samsung chipset: Snapdragon® 8 Gen 3 - timestamp: '2024-04-23T18:42:32.613900Z' + timestamp: '2024-05-20T16:35:29.865722Z' - torchscript_onnx_tflite: - inference_time: 799.0 - throughput: 1251.5644555694619 + inference_time: 774.0 + throughput: 1291.9896640826873 estimated_peak_memory_range: - min: 20480 - max: 1900528 + min: 28672 + max: 1977952 primary_compute_unit: NPU precision: fp16 layer_info: @@ -152,14 +154,14 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 71 - job_id: j1p3vrwzg + job_id: j1p87yj85 job_status: Passed torchscript_onnx_qnn: - inference_time: 852.0 - throughput: 1173.7089201877934 + inference_time: 834.0 + throughput: 1199.0407673860911 estimated_peak_memory_range: - min: 0 - max: 47875160 + min: 16384 + max: 24694288 primary_compute_unit: NPU precision: fp16 layer_info: @@ -167,7 +169,7 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 103 - job_id: jlpeenl0p + job_id: jw5614o7p job_status: Passed reference_device_info: name: QCS8550 (Proxy) @@ -176,4 +178,57 @@ models: os_name: Android manufacturer: Qualcomm chipset: Qcs8550 - timestamp: '2024-04-23T18:42:32.613941Z' + timestamp: '2024-05-20T16:35:29.865740Z' + - torchscript_onnx_qnn: + inference_time: 952.0 + throughput: 1050.420168067227 + estimated_peak_memory_range: + min: 602112 + max: 602112 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 103 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 103 + job_id: j1glkmwlp + job_status: Passed + torchscript_onnx_ort: + inference_time: 816.0 + throughput: 1225.4901960784314 + estimated_peak_memory_range: + min: 15839232 + max: 15839232 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 104 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 104 + job_id: j1pvwk2mg + job_status: Passed + torchscript_onnx_ort_dml_gpu: + inference_time: 2632.0 + throughput: 379.9392097264438 + estimated_peak_memory_range: + min: 11706368 + max: 11706368 + primary_compute_unit: GPU + precision: fp16 + layer_info: + layers_on_npu: 0 + layers_on_gpu: 1 + layers_on_cpu: 0 + total_layers: 1 + job_id: j7gjln38p + job_status: Passed + reference_device_info: + name: Snapdragon X Elite CRD + os: '11' + form_factor: Compute + os_name: Windows + manufacturer: Qualcomm + chipset: Snapdragon® X Elite + timestamp: '2024-05-20T16:35:29.865762Z' diff --git a/qai_hub_models/models/mobilenet_v2/README.md b/qai_hub_models/models/mobilenet_v2/README.md index 4fe640ba..4c9f4616 100644 --- a/qai_hub_models/models/mobilenet_v2/README.md +++ 
b/qai_hub_models/models/mobilenet_v2/README.md @@ -14,6 +14,8 @@ accross various devices, can be found [here](https://aihub.qualcomm.com/models/m a hosted Qualcomm® device. + + ## Example & Usage diff --git a/qai_hub_models/models/mobilenet_v2/export.py b/qai_hub_models/models/mobilenet_v2/export.py index 14a6b5ae..5e134e8a 100644 --- a/qai_hub_models/models/mobilenet_v2/export.py +++ b/qai_hub_models/models/mobilenet_v2/export.py @@ -119,9 +119,16 @@ def export_model( model.eval() source_model = torch.jit.trace(model.to("cpu"), make_torch_inputs(input_spec)) + # Convert outputs from channel last to channel first (preferred I/O format for QNN and TensorFlow Lite) + channel_last_flags = ( + " --force_channel_last_input image_tensor" + if target_runtime != TargetRuntime.ORT + else "" + ) + # 2. Compile the model to an on-device asset model_compile_options = model.get_hub_compile_options( - target_runtime, compile_options + " --force_channel_last_input image_tensor" + target_runtime, compile_options + channel_last_flags, hub_device ) print(f"Optimizing model {model_name} to run on-device") submitted_compile_job = hub.submit_compile_job( @@ -159,8 +166,12 @@ def export_model( ) sample_inputs = model.sample_inputs(input_spec) # Convert inputs from channel first to channel last - hub_inputs = transpose_channel_first_to_last( - "image_tensor", sample_inputs, target_runtime + hub_inputs = ( + sample_inputs + if target_runtime == TargetRuntime.ORT + else transpose_channel_first_to_last( + "image_tensor", sample_inputs, target_runtime + ) ) submitted_inference_job = hub.submit_inference_job( model=compile_job.get_target_model(), @@ -197,7 +208,7 @@ def export_model( def main(): warnings.filterwarnings("ignore") - parser = export_parser(model_cls=Model, supports_ort=False) + parser = export_parser(model_cls=Model) args = parser.parse_args() export_model(**vars(args)) diff --git a/qai_hub_models/models/mobilenet_v2/perf.yaml b/qai_hub_models/models/mobilenet_v2/perf.yaml index 27021fb5..3818b729 100644 --- a/qai_hub_models/models/mobilenet_v2/perf.yaml +++ b/qai_hub_models/models/mobilenet_v2/perf.yaml @@ -22,6 +22,7 @@ aggregated: - Samsung Galaxy S24 Ultra - Samsung Galaxy S24+ - Samsung Galaxy Tab S8 + - Snapdragon X Elite CRD - Xiaomi 12 - Xiaomi 12 Pro supported_chipsets: @@ -30,15 +31,16 @@ aggregated: - Snapdragon® 8 Gen 2 - Snapdragon® 8 Gen 3 - Snapdragon® 888 + - Snapdragon® X Elite models: - name: MobileNet-v2 performance_metrics: - torchscript_onnx_tflite: - inference_time: 974.0 - throughput: 1026.694045174538 + inference_time: 935.0 + throughput: 1069.51871657754 estimated_peak_memory_range: min: 20480 - max: 1954912 + max: 1805232 primary_compute_unit: NPU precision: fp16 layer_info: @@ -46,14 +48,14 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 72 - job_id: j1gl6l6eg + job_id: jlpevm605 job_status: Passed torchscript_onnx_qnn: - inference_time: 1281.0 - throughput: 780.64012490242 + inference_time: 1268.0 + throughput: 788.6435331230284 estimated_peak_memory_range: min: 622592 - max: 7823048 + max: 52139528 primary_compute_unit: NPU precision: fp16 layer_info: @@ -61,22 +63,22 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 105 - job_id: j1p3v6vxg + job_id: jmg94nov5 job_status: Passed torchscript_onnx_ort: - inference_time: 1110.0 - throughput: 900.9009009009009 + inference_time: 926.0 + throughput: 1079.913606911447 estimated_peak_memory_range: min: 12288 - max: 31867536 + max: 26577912 primary_compute_unit: NPU precision: fp16 layer_info: - layers_on_npu: 1 + 
layers_on_npu: 105 layers_on_gpu: 0 layers_on_cpu: 0 - total_layers: 1 - job_id: j1pv07075 + total_layers: 105 + job_id: jqp4wr4lg job_status: Passed reference_device_info: name: Samsung Galaxy S23 @@ -85,13 +87,13 @@ models: os_name: Android manufacturer: Samsung chipset: Snapdragon® 8 Gen 2 - timestamp: '2024-04-23T18:42:32.638108Z' + timestamp: '2024-05-20T16:35:29.896619Z' - torchscript_onnx_tflite: - inference_time: 651.0 - throughput: 1536.0983102918588 + inference_time: 622.0 + throughput: 1607.717041800643 estimated_peak_memory_range: - min: 16384 - max: 56986240 + min: 12288 + max: 56265456 primary_compute_unit: NPU precision: fp16 layer_info: @@ -99,14 +101,14 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 72 - job_id: jw56ewevg + job_id: jygz7dz6p job_status: Passed torchscript_onnx_qnn: - inference_time: 836.0 - throughput: 1196.1722488038276 + inference_time: 828.0 + throughput: 1207.729468599034 estimated_peak_memory_range: min: 618496 - max: 42487872 + max: 39673920 primary_compute_unit: NPU precision: fp16 layer_info: @@ -114,22 +116,22 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 105 - job_id: jwgok8k4p + job_id: jnp18zolg job_status: Passed torchscript_onnx_ort: - inference_time: 750.0 - throughput: 1333.3333333333333 + inference_time: 638.0 + throughput: 1567.398119122257 estimated_peak_memory_range: - min: 12288 - max: 22319216 + min: 471040 + max: 25361728 primary_compute_unit: NPU precision: fp16 layer_info: - layers_on_npu: 1 + layers_on_npu: 105 layers_on_gpu: 0 layers_on_cpu: 0 - total_layers: 1 - job_id: j7gjzqz75 + total_layers: 105 + job_id: j0px1or9g job_status: Passed reference_device_info: name: Samsung Galaxy S24 @@ -138,13 +140,13 @@ models: os_name: Android manufacturer: Samsung chipset: Snapdragon® 8 Gen 3 - timestamp: '2024-04-23T18:42:32.638177Z' + timestamp: '2024-05-20T16:35:29.896647Z' - torchscript_onnx_tflite: - inference_time: 957.0 - throughput: 1044.932079414838 + inference_time: 941.0 + throughput: 1062.6992561105208 estimated_peak_memory_range: - min: 57344 - max: 1611720 + min: 28672 + max: 1470792 primary_compute_unit: NPU precision: fp16 layer_info: @@ -152,14 +154,14 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 72 - job_id: jvgdemzl5 + job_id: jz5w96yjp job_status: Passed torchscript_onnx_qnn: - inference_time: 1265.0 - throughput: 790.5138339920949 + inference_time: 1269.0 + throughput: 788.0220646178093 estimated_peak_memory_range: - min: 618496 - max: 128931224 + min: 16384 + max: 143728912 primary_compute_unit: NPU precision: fp16 layer_info: @@ -167,7 +169,7 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 105 - job_id: jo5mql8qp + job_id: jz57drnr5 job_status: Passed reference_device_info: name: QCS8550 (Proxy) @@ -176,4 +178,57 @@ models: os_name: Android manufacturer: Qualcomm chipset: Qcs8550 - timestamp: '2024-04-23T18:42:32.638216Z' + timestamp: '2024-05-20T16:35:29.896664Z' + - torchscript_onnx_qnn: + inference_time: 1516.0 + throughput: 659.6306068601583 + estimated_peak_memory_range: + min: 602112 + max: 602112 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 105 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 105 + job_id: jvgdv16lg + job_status: Passed + torchscript_onnx_ort: + inference_time: 994.0 + throughput: 1006.0362173038229 + estimated_peak_memory_range: + min: 17502208 + max: 17502208 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 105 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 105 + job_id: 
jo5mzxkqp + job_status: Passed + torchscript_onnx_ort_dml_gpu: + inference_time: 8063.0 + throughput: 124.0233163834801 + estimated_peak_memory_range: + min: 798720 + max: 798720 + primary_compute_unit: CPU + precision: fp32 + layer_info: + layers_on_npu: 0 + layers_on_gpu: 0 + layers_on_cpu: 57 + total_layers: 57 + job_id: jegnevqmg + job_status: Passed + reference_device_info: + name: Snapdragon X Elite CRD + os: '11' + form_factor: Compute + os_name: Windows + manufacturer: Qualcomm + chipset: Snapdragon® X Elite + timestamp: '2024-05-20T16:35:29.896687Z' diff --git a/qai_hub_models/models/mobilenet_v2_quantized/README.md b/qai_hub_models/models/mobilenet_v2_quantized/README.md index 3fd6afea..2f07b35a 100644 --- a/qai_hub_models/models/mobilenet_v2_quantized/README.md +++ b/qai_hub_models/models/mobilenet_v2_quantized/README.md @@ -14,6 +14,8 @@ accross various devices, can be found [here](https://aihub.qualcomm.com/models/m a hosted Qualcomm® device. + + ## Example & Usage diff --git a/qai_hub_models/models/mobilenet_v2_quantized/export.py b/qai_hub_models/models/mobilenet_v2_quantized/export.py index 1d8b1899..4db91c07 100644 --- a/qai_hub_models/models/mobilenet_v2_quantized/export.py +++ b/qai_hub_models/models/mobilenet_v2_quantized/export.py @@ -123,9 +123,16 @@ def export_model( else: quant_calibration_data = model.get_calibration_data(target_runtime, input_spec) + # Convert outputs from channel last to channel first (preferred I/O format for QNN and TensorFlow Lite) + channel_last_flags = ( + " --force_channel_last_input image_tensor" + if target_runtime != TargetRuntime.ORT + else "" + ) + # 2. Compile the model to an on-device asset model_compile_options = model.get_hub_compile_options( - target_runtime, compile_options + " --force_channel_last_input image_tensor" + target_runtime, compile_options + channel_last_flags, hub_device ) print(f"Optimizing model {model_name} to run on-device") submitted_compile_job = hub.submit_compile_job( @@ -167,8 +174,12 @@ def export_model( if target_runtime == TargetRuntime.QNN: hub_inputs = get_qnn_inputs(compile_job, sample_inputs) # Convert inputs from channel first to channel last - hub_inputs = transpose_channel_first_to_last( - "image_tensor", sample_inputs, target_runtime + hub_inputs = ( + sample_inputs + if target_runtime == TargetRuntime.ORT + else transpose_channel_first_to_last( + "image_tensor", sample_inputs, target_runtime + ) ) submitted_inference_job = hub.submit_inference_job( model=compile_job.get_target_model(), @@ -207,7 +218,7 @@ def export_model( def main(): warnings.filterwarnings("ignore") - parser = export_parser(model_cls=Model, supports_ort=False) + parser = export_parser(model_cls=Model) args = parser.parse_args() export_model(**vars(args)) diff --git a/qai_hub_models/models/mobilenet_v2_quantized/perf.yaml b/qai_hub_models/models/mobilenet_v2_quantized/perf.yaml index 361584aa..c7088b78 100644 --- a/qai_hub_models/models/mobilenet_v2_quantized/perf.yaml +++ b/qai_hub_models/models/mobilenet_v2_quantized/perf.yaml @@ -26,6 +26,7 @@ aggregated: - Samsung Galaxy S24 Ultra - Samsung Galaxy S24+ - Samsung Galaxy Tab S8 + - Snapdragon X Elite CRD - Xiaomi 12 - Xiaomi 12 Pro supported_chipsets: @@ -36,15 +37,16 @@ aggregated: - Snapdragon® 8 Gen 2 - Snapdragon® 8 Gen 3 - Snapdragon® 888 + - Snapdragon® X Elite models: - name: MobileNet-v2-Quantized performance_metrics: - torchscript_onnx_tflite: - inference_time: 302.0 - throughput: 3311.2582781456954 + inference_time: 295.0 + throughput: 3389.830508474576 
estimated_peak_memory_range: - min: 16384 - max: 1568424 + min: 40960 + max: 6698304 primary_compute_unit: NPU precision: int8 layer_info: @@ -52,14 +54,14 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 72 - job_id: jygzonoz5 + job_id: jopry3deg job_status: Passed torchscript_onnx_qnn: - inference_time: 666.0 - throughput: 1501.5015015015015 + inference_time: 654.0 + throughput: 1529.051987767584 estimated_peak_memory_range: - min: 12288 - max: 75287400 + min: 172032 + max: 5185880 primary_compute_unit: NPU precision: int8 layer_info: @@ -67,22 +69,22 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 71 - job_id: jmg9jdjq5 + job_id: j2p0r09ep job_status: Passed torchscript_onnx_ort: - inference_time: 897.0 - throughput: 1114.8272017837235 + inference_time: 634.0 + throughput: 1577.2870662460568 estimated_peak_memory_range: - min: 12288 - max: 146664848 + min: 200704 + max: 21639208 primary_compute_unit: NPU precision: int8 layer_info: - layers_on_npu: 1 + layers_on_npu: 77 layers_on_gpu: 0 layers_on_cpu: 0 - total_layers: 1 - job_id: jvgde2ek5 + total_layers: 77 + job_id: j1glkm8lp job_status: Passed reference_device_info: name: Samsung Galaxy S23 @@ -91,13 +93,13 @@ models: os_name: Android manufacturer: Samsung chipset: Snapdragon® 8 Gen 2 - timestamp: '2024-04-23T18:42:32.662420Z' + timestamp: '2024-05-20T16:35:29.927306Z' - torchscript_onnx_tflite: - inference_time: 233.0 - throughput: 4291.845493562232 + inference_time: 238.0 + throughput: 4201.680672268908 estimated_peak_memory_range: min: 12288 - max: 37162256 + max: 37430768 primary_compute_unit: NPU precision: int8 layer_info: @@ -105,14 +107,14 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 72 - job_id: jz5w242z5 + job_id: jep2mydm5 job_status: Passed torchscript_onnx_qnn: - inference_time: 480.0 - throughput: 2083.3333333333335 + inference_time: 474.0 + throughput: 2109.7046413502107 estimated_peak_memory_range: - min: 159744 - max: 36918192 + min: 163840 + max: 38345472 primary_compute_unit: NPU precision: int8 layer_info: @@ -120,22 +122,22 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 71 - job_id: jnp1y6ykp + job_id: j1p87yr85 job_status: Passed torchscript_onnx_ort: - inference_time: 644.0 - throughput: 1552.7950310559006 + inference_time: 463.0 + throughput: 2159.827213822894 estimated_peak_memory_range: min: 0 - max: 18572416 + max: 22362560 primary_compute_unit: NPU precision: int8 layer_info: - layers_on_npu: 1 + layers_on_npu: 77 layers_on_gpu: 0 layers_on_cpu: 0 - total_layers: 1 - job_id: jz57090qg + total_layers: 77 + job_id: jw5614m7p job_status: Passed reference_device_info: name: Samsung Galaxy S24 @@ -144,51 +146,74 @@ models: os_name: Android manufacturer: Samsung chipset: Snapdragon® 8 Gen 3 - timestamp: '2024-04-23T18:42:32.662476Z' + timestamp: '2024-05-20T16:35:29.927332Z' - torchscript_onnx_tflite: - inference_time: 949.0 - throughput: 1053.740779768177 + inference_time: 296.0 + throughput: 3378.3783783783783 estimated_peak_memory_range: min: 12288 - max: 23229040 + max: 1719624 primary_compute_unit: NPU precision: int8 layer_info: - layers_on_npu: 74 + layers_on_npu: 72 layers_on_gpu: 0 layers_on_cpu: 0 - total_layers: 74 - job_id: jogk7k8op + total_layers: 72 + job_id: jqpyd324p job_status: Passed torchscript_onnx_qnn: - inference_time: 'null' - throughput: 'null' + inference_time: 659.0 + throughput: 1517.4506828528072 estimated_peak_memory_range: - min: 0 - max: 0 - primary_compute_unit: 'null' - precision: 'null' + min: 20480 + max: 75350840 + 
primary_compute_unit: NPU + precision: int8 layer_info: - layers_on_npu: 0 + layers_on_npu: 71 layers_on_gpu: 0 layers_on_cpu: 0 - total_layers: 0 - job_id: jwgok98dp - job_status: Failed - torchscript_onnx_ort: - inference_time: 6507.0 - throughput: 153.68065160596282 + total_layers: 71 + job_id: jn5q2q1m5 + job_status: Passed + reference_device_info: + name: QCS8550 (Proxy) + os: '12' + form_factor: Iot + os_name: Android + manufacturer: Qualcomm + chipset: Qcs8550 + timestamp: '2024-05-20T16:35:29.927350Z' + - torchscript_onnx_tflite: + inference_time: 853.0 + throughput: 1172.3329425556858 estimated_peak_memory_range: - min: 335872 - max: 43247408 - primary_compute_unit: CPU - precision: fp32 + min: 12288 + max: 23360768 + primary_compute_unit: NPU + precision: int8 layer_info: - layers_on_npu: 0 + layers_on_npu: 72 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 72 + job_id: j0pxyrqlg + job_status: Passed + torchscript_onnx_qnn: + inference_time: 1400.0 + throughput: 714.2857142857143 + estimated_peak_memory_range: + min: 0 + max: 34410432 + primary_compute_unit: NPU + precision: int8 + layer_info: + layers_on_npu: 71 layers_on_gpu: 0 - layers_on_cpu: 84 - total_layers: 84 - job_id: jqp4k3kqg + layers_on_cpu: 0 + total_layers: 71 + job_id: j2p0l9wnp job_status: Passed reference_device_info: name: RB3 Gen 2 (Proxy) @@ -197,21 +222,21 @@ models: os_name: Android manufacturer: Qualcomm chipset: Qcs6490 - timestamp: '2024-04-23T18:42:32.662525Z' + timestamp: '2024-05-20T16:35:29.927366Z' - torchscript_onnx_tflite: - inference_time: 7442.0 - throughput: 134.37248051599033 + inference_time: 7603.0 + throughput: 131.5270288044193 estimated_peak_memory_range: - min: 12288 - max: 11587968 + min: 20480 + max: 11376824 primary_compute_unit: NPU precision: int8 layer_info: - layers_on_npu: 72 + layers_on_npu: 70 layers_on_gpu: 2 layers_on_cpu: 0 - total_layers: 74 - job_id: j1gl2wkep + total_layers: 72 + job_id: jo5m3k79g job_status: Passed reference_device_info: name: RB5 (Proxy) @@ -220,42 +245,57 @@ models: os_name: Android manufacturer: Qualcomm chipset: Qcs8250 - timestamp: '2024-04-23T18:42:32.662544Z' - - torchscript_onnx_tflite: - inference_time: 325.0 - throughput: 3076.923076923077 + timestamp: '2024-05-20T16:35:29.927377Z' + - torchscript_onnx_qnn: + inference_time: 762.0 + throughput: 1312.3359580052493 estimated_peak_memory_range: - min: 20480 - max: 1768808 + min: 573440 + max: 573440 primary_compute_unit: NPU precision: int8 layer_info: - layers_on_npu: 74 + layers_on_npu: 71 layers_on_gpu: 0 layers_on_cpu: 0 - total_layers: 74 - job_id: j1p80k18g + total_layers: 71 + job_id: jogkyx0op job_status: Passed - torchscript_onnx_qnn: - inference_time: 695.0 - throughput: 1438.8489208633093 + torchscript_onnx_ort: + inference_time: 677.0 + throughput: 1477.1048744460857 estimated_peak_memory_range: - min: 20480 - max: 131789128 + min: 19963904 + max: 19963904 primary_compute_unit: NPU precision: int8 layer_info: - layers_on_npu: 73 + layers_on_npu: 77 layers_on_gpu: 0 layers_on_cpu: 0 - total_layers: 73 - job_id: jw56e0w7g + total_layers: 77 + job_id: j1p3m07zg + job_status: Passed + torchscript_onnx_ort_dml_gpu: + inference_time: 43191.0 + throughput: 23.15297168391563 + estimated_peak_memory_range: + min: 20062208 + max: 20062208 + primary_compute_unit: GPU + precision: int8 + layer_info: + layers_on_npu: 0 + layers_on_gpu: 1 + layers_on_cpu: 0 + total_layers: 1 + job_id: jwgov6wd5 job_status: Passed reference_device_info: - name: QCS8550 (Proxy) - os: '12' - form_factor: Iot - 
os_name: Android + name: Snapdragon X Elite CRD + os: '11' + form_factor: Compute + os_name: Windows manufacturer: Qualcomm - chipset: Qcs8550 - timestamp: '2024-04-23T18:42:32.662578Z' + chipset: Snapdragon® X Elite + timestamp: '2024-05-20T16:35:29.927401Z' diff --git a/qai_hub_models/models/mobilenet_v3_large/README.md b/qai_hub_models/models/mobilenet_v3_large/README.md index 7cb2fa15..bf675b22 100644 --- a/qai_hub_models/models/mobilenet_v3_large/README.md +++ b/qai_hub_models/models/mobilenet_v3_large/README.md @@ -14,6 +14,8 @@ accross various devices, can be found [here](https://aihub.qualcomm.com/models/m a hosted Qualcomm® device. + + ## Example & Usage diff --git a/qai_hub_models/models/mobilenet_v3_large/export.py b/qai_hub_models/models/mobilenet_v3_large/export.py index c75123cf..28b64384 100644 --- a/qai_hub_models/models/mobilenet_v3_large/export.py +++ b/qai_hub_models/models/mobilenet_v3_large/export.py @@ -119,9 +119,16 @@ def export_model( model.eval() source_model = torch.jit.trace(model.to("cpu"), make_torch_inputs(input_spec)) + # Convert outputs from channel last to channel first (preferred I/O format for QNN and TensorFlow Lite) + channel_last_flags = ( + " --force_channel_last_input image_tensor" + if target_runtime != TargetRuntime.ORT + else "" + ) + # 2. Compile the model to an on-device asset model_compile_options = model.get_hub_compile_options( - target_runtime, compile_options + " --force_channel_last_input image_tensor" + target_runtime, compile_options + channel_last_flags, hub_device ) print(f"Optimizing model {model_name} to run on-device") submitted_compile_job = hub.submit_compile_job( @@ -159,8 +166,12 @@ def export_model( ) sample_inputs = model.sample_inputs(input_spec) # Convert inputs from channel first to channel last - hub_inputs = transpose_channel_first_to_last( - "image_tensor", sample_inputs, target_runtime + hub_inputs = ( + sample_inputs + if target_runtime == TargetRuntime.ORT + else transpose_channel_first_to_last( + "image_tensor", sample_inputs, target_runtime + ) ) submitted_inference_job = hub.submit_inference_job( model=compile_job.get_target_model(), @@ -197,7 +208,7 @@ def export_model( def main(): warnings.filterwarnings("ignore") - parser = export_parser(model_cls=Model, supports_qnn=False, supports_ort=False) + parser = export_parser(model_cls=Model) args = parser.parse_args() export_model(**vars(args)) diff --git a/qai_hub_models/models/mobilenet_v3_large/perf.yaml b/qai_hub_models/models/mobilenet_v3_large/perf.yaml index c1e8a4af..07819107 100644 --- a/qai_hub_models/models/mobilenet_v3_large/perf.yaml +++ b/qai_hub_models/models/mobilenet_v3_large/perf.yaml @@ -22,6 +22,7 @@ aggregated: - Samsung Galaxy S24 Ultra - Samsung Galaxy S24+ - Samsung Galaxy Tab S8 + - Snapdragon X Elite CRD - Xiaomi 12 - Xiaomi 12 Pro supported_chipsets: @@ -30,15 +31,16 @@ aggregated: - Snapdragon® 8 Gen 2 - Snapdragon® 8 Gen 3 - Snapdragon® 888 + - Snapdragon® X Elite models: - name: MobileNet-v3-Large performance_metrics: - torchscript_onnx_tflite: - inference_time: 1022.0 - throughput: 978.4735812133073 + inference_time: 1002.0 + throughput: 998.003992015968 estimated_peak_memory_range: - min: 16384 - max: 1643944 + min: 12288 + max: 1963520 primary_compute_unit: NPU precision: fp16 layer_info: @@ -46,22 +48,37 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 136 - job_id: j0pxnxnj5 + job_id: j1pvwkmmg job_status: Passed - torchscript_onnx_ort: - inference_time: 3790.0 - throughput: 263.85224274406335 + torchscript_onnx_qnn: + 
inference_time: 1037.0 + throughput: 964.3201542912246 estimated_peak_memory_range: min: 0 - max: 28283024 + max: 68891008 primary_compute_unit: NPU precision: fp16 layer_info: - layers_on_npu: 9 + layers_on_npu: 144 layers_on_gpu: 0 - layers_on_cpu: 8 - total_layers: 17 - job_id: jegnlkmv5 + layers_on_cpu: 0 + total_layers: 144 + job_id: jygz7dy6p + job_status: Passed + torchscript_onnx_ort: + inference_time: 1038.0 + throughput: 963.3911368015414 + estimated_peak_memory_range: + min: 12288 + max: 87795632 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 162 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 162 + job_id: jvgdv14lg job_status: Passed reference_device_info: name: Samsung Galaxy S23 @@ -70,13 +87,13 @@ models: os_name: Android manufacturer: Samsung chipset: Snapdragon® 8 Gen 2 - timestamp: '2024-04-23T18:42:32.697406Z' + timestamp: '2024-05-20T16:35:29.966635Z' - torchscript_onnx_tflite: - inference_time: 691.0 - throughput: 1447.178002894356 + inference_time: 702.0 + throughput: 1424.5014245014245 estimated_peak_memory_range: - min: 16384 - max: 61060464 + min: 12288 + max: 61294288 primary_compute_unit: NPU precision: fp16 layer_info: @@ -84,22 +101,37 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 136 - job_id: jo5mq8qyp + job_id: j7gjlny8p + job_status: Passed + torchscript_onnx_qnn: + inference_time: 712.0 + throughput: 1404.4943820224719 + estimated_peak_memory_range: + min: 0 + max: 51701120 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 144 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 144 + job_id: jz5w96zjp job_status: Passed torchscript_onnx_ort: - inference_time: 2766.0 - throughput: 361.53289949385396 + inference_time: 719.0 + throughput: 1390.8205841446454 estimated_peak_memory_range: - min: 12288 - max: 25734304 + min: 618496 + max: 32246576 primary_compute_unit: NPU precision: fp16 layer_info: - layers_on_npu: 9 + layers_on_npu: 162 layers_on_gpu: 0 - layers_on_cpu: 8 - total_layers: 17 - job_id: jopr8w2v5 + layers_on_cpu: 0 + total_layers: 162 + job_id: jz57dr8r5 job_status: Passed reference_device_info: name: Samsung Galaxy S24 @@ -108,13 +140,13 @@ models: os_name: Android manufacturer: Samsung chipset: Snapdragon® 8 Gen 3 - timestamp: '2024-04-23T18:42:32.697460Z' + timestamp: '2024-05-20T16:35:29.966661Z' - torchscript_onnx_tflite: - inference_time: 1022.0 - throughput: 978.4735812133073 + inference_time: 1001.0 + throughput: 999.000999000999 estimated_peak_memory_range: - min: 24576 - max: 1929640 + min: 20480 + max: 1880160 primary_compute_unit: NPU precision: fp16 layer_info: @@ -122,7 +154,22 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 136 - job_id: jz5w2r4j5 + job_id: jlpevmx05 + job_status: Passed + torchscript_onnx_qnn: + inference_time: 1041.0 + throughput: 960.6147934678194 + estimated_peak_memory_range: + min: 20480 + max: 47502336 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 144 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 144 + job_id: jnp18z1lg job_status: Passed reference_device_info: name: QCS8550 (Proxy) @@ -131,4 +178,57 @@ models: os_name: Android manufacturer: Qualcomm chipset: Qcs8550 - timestamp: '2024-04-23T18:42:32.697490Z' + timestamp: '2024-05-20T16:35:29.966678Z' + - torchscript_onnx_qnn: + inference_time: 1207.0 + throughput: 828.5004142502071 + estimated_peak_memory_range: + min: 602112 + max: 602112 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 144 + 
layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 144 + job_id: jmg94n2v5 + job_status: Passed + torchscript_onnx_ort: + inference_time: 1108.0 + throughput: 902.5270758122743 + estimated_peak_memory_range: + min: 54001664 + max: 54001664 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 162 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 162 + job_id: jqp4wr2lg + job_status: Passed + torchscript_onnx_ort_dml_gpu: + inference_time: 9897.0 + throughput: 101.0407194099222 + estimated_peak_memory_range: + min: 1593344 + max: 1593344 + primary_compute_unit: CPU + precision: fp32 + layer_info: + layers_on_npu: 0 + layers_on_gpu: 0 + layers_on_cpu: 126 + total_layers: 126 + job_id: j0px1oz9g + job_status: Passed + reference_device_info: + name: Snapdragon X Elite CRD + os: '11' + form_factor: Compute + os_name: Windows + manufacturer: Qualcomm + chipset: Snapdragon® X Elite + timestamp: '2024-05-20T16:35:29.966700Z' diff --git a/qai_hub_models/models/mobilenet_v3_large_quantized/README.md b/qai_hub_models/models/mobilenet_v3_large_quantized/README.md index 1c9f24d6..8873f0b0 100644 --- a/qai_hub_models/models/mobilenet_v3_large_quantized/README.md +++ b/qai_hub_models/models/mobilenet_v3_large_quantized/README.md @@ -14,6 +14,8 @@ accross various devices, can be found [here](https://aihub.qualcomm.com/models/m a hosted Qualcomm® device. + + ## Example & Usage diff --git a/qai_hub_models/models/mobilenet_v3_large_quantized/export.py b/qai_hub_models/models/mobilenet_v3_large_quantized/export.py index 9555199e..0b17aeb4 100644 --- a/qai_hub_models/models/mobilenet_v3_large_quantized/export.py +++ b/qai_hub_models/models/mobilenet_v3_large_quantized/export.py @@ -30,6 +30,7 @@ from qai_hub_models.utils.qai_hub_helpers import ( can_access_qualcomm_ai_hub, export_without_hub_access, + transpose_channel_first_to_last, ) from qai_hub_models.utils.qnn_helpers import get_qnn_inputs @@ -122,9 +123,16 @@ def export_model( else: quant_calibration_data = model.get_calibration_data(target_runtime, input_spec) + # Convert outputs from channel last to channel first (preferred I/O format for QNN and TensorFlow Lite) + channel_last_flags = ( + " --force_channel_last_input image_tensor" + if target_runtime != TargetRuntime.ORT + else "" + ) + # 2. 
Compile the model to an on-device asset model_compile_options = model.get_hub_compile_options( - target_runtime, compile_options + target_runtime, compile_options + channel_last_flags, hub_device ) print(f"Optimizing model {model_name} to run on-device") submitted_compile_job = hub.submit_compile_job( @@ -165,6 +173,14 @@ def export_model( hub_inputs = sample_inputs if target_runtime == TargetRuntime.QNN: hub_inputs = get_qnn_inputs(compile_job, sample_inputs) + # Convert inputs from channel first to channel last + hub_inputs = ( + sample_inputs + if target_runtime == TargetRuntime.ORT + else transpose_channel_first_to_last( + "image_tensor", sample_inputs, target_runtime + ) + ) submitted_inference_job = hub.submit_inference_job( model=compile_job.get_target_model(), inputs=hub_inputs, @@ -200,7 +216,7 @@ def export_model( def main(): warnings.filterwarnings("ignore") - parser = export_parser(model_cls=Model, supports_qnn=False, supports_ort=False) + parser = export_parser(model_cls=Model) args = parser.parse_args() export_model(**vars(args)) diff --git a/qai_hub_models/models/mobilenet_v3_large_quantized/perf.yaml b/qai_hub_models/models/mobilenet_v3_large_quantized/perf.yaml index eabb0a70..fad12147 100644 --- a/qai_hub_models/models/mobilenet_v3_large_quantized/perf.yaml +++ b/qai_hub_models/models/mobilenet_v3_large_quantized/perf.yaml @@ -26,6 +26,7 @@ aggregated: - Samsung Galaxy S24 Ultra - Samsung Galaxy S24+ - Samsung Galaxy Tab S8 + - Snapdragon X Elite CRD - Xiaomi 12 - Xiaomi 12 Pro supported_chipsets: @@ -36,38 +37,54 @@ aggregated: - Snapdragon® 8 Gen 2 - Snapdragon® 8 Gen 3 - Snapdragon® 888 + - Snapdragon® X Elite models: - name: MobileNet-v3-Large-Quantized performance_metrics: - torchscript_onnx_tflite: - inference_time: 585.0 - throughput: 1709.4017094017095 + inference_time: 357.0 + throughput: 2801.1204481792715 + estimated_peak_memory_range: + min: 16384 + max: 2663832 + primary_compute_unit: NPU + precision: int8 + layer_info: + layers_on_npu: 135 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 135 + job_id: jo5mzxlqp + job_status: Passed + torchscript_onnx_qnn: + inference_time: 623.0 + throughput: 1605.1364365971108 estimated_peak_memory_range: min: 12288 - max: 1681920 + max: 7124224 primary_compute_unit: NPU precision: int8 layer_info: - layers_on_npu: 136 + layers_on_npu: 126 layers_on_gpu: 0 layers_on_cpu: 0 - total_layers: 136 - job_id: jqpyrmjr5 + total_layers: 126 + job_id: jep2myzm5 job_status: Passed torchscript_onnx_ort: - inference_time: 6430.0 - throughput: 155.52099533437013 + inference_time: 5302.0 + throughput: 188.6080724254998 estimated_peak_memory_range: - min: 15818752 - max: 29085400 + min: 15572992 + max: 31527200 primary_compute_unit: NPU precision: int8 layer_info: - layers_on_npu: 9 + layers_on_npu: 150 layers_on_gpu: 0 - layers_on_cpu: 8 - total_layers: 17 - job_id: j1p801mzg + layers_on_cpu: 24 + total_layers: 174 + job_id: jogkyxkop job_status: Passed reference_device_info: name: Samsung Galaxy S23 @@ -76,36 +93,51 @@ models: os_name: Android manufacturer: Samsung chipset: Snapdragon® 8 Gen 2 - timestamp: '2024-04-23T18:42:32.715082Z' + timestamp: '2024-05-20T16:35:29.996961Z' - torchscript_onnx_tflite: - inference_time: 413.0 - throughput: 2421.3075060532688 + inference_time: 277.0 + throughput: 3610.1083032490974 estimated_peak_memory_range: min: 12288 - max: 46829184 + max: 47595728 primary_compute_unit: NPU precision: int8 layer_info: - layers_on_npu: 136 + layers_on_npu: 135 layers_on_gpu: 0 layers_on_cpu: 0 - total_layers: 
136 - job_id: j2p03622p + total_layers: 135 + job_id: jegnevwmg + job_status: Passed + torchscript_onnx_qnn: + inference_time: 452.0 + throughput: 2212.3893805309735 + estimated_peak_memory_range: + min: 0 + max: 45251296 + primary_compute_unit: NPU + precision: int8 + layer_info: + layers_on_npu: 126 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 126 + job_id: jqpyd3y4p job_status: Passed torchscript_onnx_ort: - inference_time: 4730.0 - throughput: 211.41649048625794 + inference_time: 4131.0 + throughput: 242.0721374969741 estimated_peak_memory_range: - min: 21893120 - max: 53274160 + min: 21827584 + max: 58653840 primary_compute_unit: NPU precision: int8 layer_info: - layers_on_npu: 9 + layers_on_npu: 150 layers_on_gpu: 0 - layers_on_cpu: 8 - total_layers: 17 - job_id: jogk78qyp + layers_on_cpu: 24 + total_layers: 174 + job_id: jn5q2qdm5 job_status: Passed reference_device_info: name: Samsung Galaxy S24 @@ -114,88 +146,156 @@ models: os_name: Android manufacturer: Samsung chipset: Snapdragon® 8 Gen 3 - timestamp: '2024-04-23T18:42:32.715131Z' + timestamp: '2024-05-20T16:35:29.996991Z' - torchscript_onnx_tflite: - inference_time: 1547.0 - throughput: 646.4124111182934 + inference_time: 351.0 + throughput: 2849.002849002849 estimated_peak_memory_range: min: 12288 - max: 28081232 + max: 1686776 primary_compute_unit: NPU precision: int8 layer_info: - layers_on_npu: 138 + layers_on_npu: 135 layers_on_gpu: 0 layers_on_cpu: 0 - total_layers: 138 - job_id: jegnlwnm5 + total_layers: 135 + job_id: jopry37eg job_status: Passed - torchscript_onnx_ort: - inference_time: 10400.0 - throughput: 96.15384615384616 + torchscript_onnx_qnn: + inference_time: 624.0 + throughput: 1602.5641025641025 estimated_peak_memory_range: - min: 11681792 - max: 108762160 - primary_compute_unit: CPU - precision: fp32 + min: 24576 + max: 15252232 + primary_compute_unit: NPU + precision: int8 layer_info: - layers_on_npu: 0 + layers_on_npu: 126 layers_on_gpu: 0 - layers_on_cpu: 218 - total_layers: 218 - job_id: jn5qevr75 + layers_on_cpu: 0 + total_layers: 126 + job_id: j1p87yk85 job_status: Passed reference_device_info: - name: RB3 Gen 2 (Proxy) + name: QCS8550 (Proxy) os: '12' form_factor: Iot os_name: Android manufacturer: Qualcomm - chipset: Qcs6490 - timestamp: '2024-04-23T18:42:32.715190Z' + chipset: Qcs8550 + timestamp: '2024-05-20T16:35:29.997008Z' - torchscript_onnx_tflite: - inference_time: 5306.0 - throughput: 188.46588767433096 + inference_time: 1189.0 + throughput: 841.0428931875525 estimated_peak_memory_range: - min: 40960 - max: 2748408 + min: 12288 + max: 28245440 primary_compute_unit: NPU precision: int8 layer_info: - layers_on_npu: 138 + layers_on_npu: 135 layers_on_gpu: 0 layers_on_cpu: 0 - total_layers: 138 - job_id: jw56zo1vg + total_layers: 135 + job_id: jqp4v4z1p job_status: Passed + torchscript_onnx_qnn: + inference_time: 'null' + throughput: 'null' + estimated_peak_memory_range: + min: 0 + max: 0 + primary_compute_unit: 'null' + precision: 'null' + layer_info: + layers_on_npu: 0 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 0 + job_id: jqpy629l5 + job_status: Failed reference_device_info: - name: RB5 (Proxy) + name: RB3 Gen 2 (Proxy) os: '12' form_factor: Iot os_name: Android manufacturer: Qualcomm - chipset: Qcs8250 - timestamp: '2024-04-23T18:42:32.715215Z' + chipset: Qcs6490 + timestamp: '2024-05-20T16:35:29.997028Z' - torchscript_onnx_tflite: - inference_time: 667.0 - throughput: 1499.2503748125937 + inference_time: 6580.0 + throughput: 151.9756838905775 estimated_peak_memory_range: 
- min: 40960 - max: 1853728 + min: 45056 + max: 10222544 primary_compute_unit: NPU precision: int8 layer_info: - layers_on_npu: 138 + layers_on_npu: 135 layers_on_gpu: 0 layers_on_cpu: 0 - total_layers: 138 - job_id: jo5mqlmqp + total_layers: 135 + job_id: j0pxyrwlg job_status: Passed reference_device_info: - name: QCS8550 (Proxy) + name: RB5 (Proxy) os: '12' form_factor: Iot os_name: Android manufacturer: Qualcomm - chipset: Qcs8550 - timestamp: '2024-04-23T18:42:32.715252Z' + chipset: Qcs8250 + timestamp: '2024-05-20T16:35:29.997039Z' + - torchscript_onnx_qnn: + inference_time: 705.0 + throughput: 1418.4397163120568 + estimated_peak_memory_range: + min: 520192 + max: 520192 + primary_compute_unit: NPU + precision: int8 + layer_info: + layers_on_npu: 126 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 126 + job_id: j2p0r0xep + job_status: Passed + torchscript_onnx_ort: + inference_time: 4772.0 + throughput: 209.55574182732607 + estimated_peak_memory_range: + min: 25464832 + max: 25464832 + primary_compute_unit: NPU + precision: int8 + layer_info: + layers_on_npu: 150 + layers_on_gpu: 0 + layers_on_cpu: 24 + total_layers: 174 + job_id: j1glkmqlp + job_status: Passed + torchscript_onnx_ort_dml_gpu: + inference_time: 28805.0 + throughput: 34.71619510501649 + estimated_peak_memory_range: + min: 20099072 + max: 20099072 + primary_compute_unit: GPU + precision: int8 + layer_info: + layers_on_npu: 0 + layers_on_gpu: 1 + layers_on_cpu: 0 + total_layers: 1 + job_id: jw561407p + job_status: Passed + reference_device_info: + name: Snapdragon X Elite CRD + os: '11' + form_factor: Compute + os_name: Windows + manufacturer: Qualcomm + chipset: Snapdragon® X Elite + timestamp: '2024-05-20T16:35:29.997063Z' diff --git a/qai_hub_models/models/mobilenet_v3_small/README.md b/qai_hub_models/models/mobilenet_v3_small/README.md index cb3ce811..9d058839 100644 --- a/qai_hub_models/models/mobilenet_v3_small/README.md +++ b/qai_hub_models/models/mobilenet_v3_small/README.md @@ -14,6 +14,8 @@ accross various devices, can be found [here](https://aihub.qualcomm.com/models/m a hosted Qualcomm® device. + + ## Example & Usage diff --git a/qai_hub_models/models/mobilenet_v3_small/export.py b/qai_hub_models/models/mobilenet_v3_small/export.py index c1a8b1ad..57bde8ba 100644 --- a/qai_hub_models/models/mobilenet_v3_small/export.py +++ b/qai_hub_models/models/mobilenet_v3_small/export.py @@ -119,9 +119,16 @@ def export_model( model.eval() source_model = torch.jit.trace(model.to("cpu"), make_torch_inputs(input_spec)) + # Convert outputs from channel last to channel first (preferred I/O format for QNN and TensorFlow Lite) + channel_last_flags = ( + " --force_channel_last_input image_tensor" + if target_runtime != TargetRuntime.ORT + else "" + ) + # 2. 
Compile the model to an on-device asset model_compile_options = model.get_hub_compile_options( - target_runtime, compile_options + " --force_channel_last_input image_tensor" + target_runtime, compile_options + channel_last_flags, hub_device ) print(f"Optimizing model {model_name} to run on-device") submitted_compile_job = hub.submit_compile_job( @@ -159,8 +166,12 @@ def export_model( ) sample_inputs = model.sample_inputs(input_spec) # Convert inputs from channel first to channel last - hub_inputs = transpose_channel_first_to_last( - "image_tensor", sample_inputs, target_runtime + hub_inputs = ( + sample_inputs + if target_runtime == TargetRuntime.ORT + else transpose_channel_first_to_last( + "image_tensor", sample_inputs, target_runtime + ) ) submitted_inference_job = hub.submit_inference_job( model=compile_job.get_target_model(), @@ -197,7 +208,7 @@ def export_model( def main(): warnings.filterwarnings("ignore") - parser = export_parser(model_cls=Model, supports_qnn=False, supports_ort=False) + parser = export_parser(model_cls=Model) args = parser.parse_args() export_model(**vars(args)) diff --git a/qai_hub_models/models/mobilenet_v3_small/perf.yaml b/qai_hub_models/models/mobilenet_v3_small/perf.yaml index d5089b1d..5f36a06f 100644 --- a/qai_hub_models/models/mobilenet_v3_small/perf.yaml +++ b/qai_hub_models/models/mobilenet_v3_small/perf.yaml @@ -22,6 +22,7 @@ aggregated: - Samsung Galaxy S24 Ultra - Samsung Galaxy S24+ - Samsung Galaxy Tab S8 + - Snapdragon X Elite CRD - Xiaomi 12 - Xiaomi 12 Pro supported_chipsets: @@ -30,15 +31,16 @@ aggregated: - Snapdragon® 8 Gen 2 - Snapdragon® 8 Gen 3 - Snapdragon® 888 + - Snapdragon® X Elite models: - name: MobileNet-v3-Small performance_metrics: - torchscript_onnx_tflite: - inference_time: 840.0 - throughput: 1190.4761904761904 + inference_time: 834.0 + throughput: 1199.0407673860911 estimated_peak_memory_range: - min: 12288 - max: 1842512 + min: 16384 + max: 1577560 primary_compute_unit: NPU precision: fp16 layer_info: @@ -46,22 +48,37 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 124 - job_id: j1gl6l2eg + job_id: j1p3m0rzg job_status: Passed - torchscript_onnx_ort: - inference_time: 3404.0 - throughput: 293.7720329024677 + torchscript_onnx_qnn: + inference_time: 866.0 + throughput: 1154.7344110854503 estimated_peak_memory_range: min: 16384 - max: 13250040 + max: 24077256 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 126 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 126 + job_id: j7gjln88p + job_status: Passed + torchscript_onnx_ort: + inference_time: 813.0 + throughput: 1230.0123001230013 + estimated_peak_memory_range: + min: 12288 + max: 34364368 primary_compute_unit: NPU precision: fp16 layer_info: - layers_on_npu: 10 + layers_on_npu: 146 layers_on_gpu: 0 - layers_on_cpu: 9 - total_layers: 19 - job_id: j1p3v61xg + layers_on_cpu: 0 + total_layers: 146 + job_id: jmg94nqv5 job_status: Passed reference_device_info: name: Samsung Galaxy S23 @@ -70,13 +87,13 @@ models: os_name: Android manufacturer: Samsung chipset: Snapdragon® 8 Gen 2 - timestamp: '2024-04-23T18:42:32.742071Z' + timestamp: '2024-05-20T16:35:30.036067Z' - torchscript_onnx_tflite: - inference_time: 547.0 - throughput: 1828.1535648994516 + inference_time: 545.0 + throughput: 1834.8623853211009 estimated_peak_memory_range: - min: 12288 - max: 40731056 + min: 20480 + max: 41085008 primary_compute_unit: NPU precision: fp16 layer_info: @@ -84,22 +101,37 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 124 - job_id: jw56ewzvg 
+ job_id: jwgov69d5 job_status: Passed - torchscript_onnx_ort: - inference_time: 3006.0 - throughput: 332.667997338656 + torchscript_onnx_qnn: + inference_time: 582.0 + throughput: 1718.213058419244 estimated_peak_memory_range: min: 12288 - max: 27095152 + max: 46524832 primary_compute_unit: NPU precision: fp16 layer_info: - layers_on_npu: 10 + layers_on_npu: 126 layers_on_gpu: 0 - layers_on_cpu: 9 - total_layers: 19 - job_id: jwgok8n4p + layers_on_cpu: 0 + total_layers: 126 + job_id: jlpevmn05 + job_status: Passed + torchscript_onnx_ort: + inference_time: 560.0 + throughput: 1785.7142857142858 + estimated_peak_memory_range: + min: 618496 + max: 27970128 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 146 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 146 + job_id: jnp18zmlg job_status: Passed reference_device_info: name: Samsung Galaxy S24 @@ -108,13 +140,13 @@ models: os_name: Android manufacturer: Samsung chipset: Snapdragon® 8 Gen 3 - timestamp: '2024-04-23T18:42:32.742122Z' + timestamp: '2024-05-20T16:35:30.036094Z' - torchscript_onnx_tflite: - inference_time: 844.0 - throughput: 1184.8341232227488 + inference_time: 826.0 + throughput: 1210.6537530266344 estimated_peak_memory_range: - min: 12288 - max: 1902856 + min: 24576 + max: 1999704 primary_compute_unit: NPU precision: fp16 layer_info: @@ -122,7 +154,22 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 124 - job_id: j1gl6qxlg + job_id: j1pvwknmg + job_status: Passed + torchscript_onnx_qnn: + inference_time: 866.0 + throughput: 1154.7344110854503 + estimated_peak_memory_range: + min: 0 + max: 25356816 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 126 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 126 + job_id: jz5w96rjp job_status: Passed reference_device_info: name: QCS8550 (Proxy) @@ -131,4 +178,57 @@ models: os_name: Android manufacturer: Qualcomm chipset: Qcs8550 - timestamp: '2024-04-23T18:42:32.742147Z' + timestamp: '2024-05-20T16:35:30.036111Z' + - torchscript_onnx_qnn: + inference_time: 1032.0 + throughput: 968.9922480620155 + estimated_peak_memory_range: + min: 602112 + max: 602112 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 126 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 126 + job_id: jygz7d06p + job_status: Passed + torchscript_onnx_ort: + inference_time: 908.0 + throughput: 1101.3215859030836 + estimated_peak_memory_range: + min: 3018752 + max: 3018752 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 146 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 146 + job_id: jvgdv1mlg + job_status: Passed + torchscript_onnx_ort_dml_gpu: + inference_time: 4962.0 + throughput: 201.53164046755342 + estimated_peak_memory_range: + min: 1437696 + max: 1437696 + primary_compute_unit: CPU + precision: fp32 + layer_info: + layers_on_npu: 0 + layers_on_gpu: 0 + layers_on_cpu: 114 + total_layers: 114 + job_id: jz57dr1r5 + job_status: Passed + reference_device_info: + name: Snapdragon X Elite CRD + os: '11' + form_factor: Compute + os_name: Windows + manufacturer: Qualcomm + chipset: Snapdragon® X Elite + timestamp: '2024-05-20T16:35:30.036134Z' diff --git a/qai_hub_models/models/openai_clip/README.md b/qai_hub_models/models/openai_clip/README.md index d22ca80d..06c429e8 100644 --- a/qai_hub_models/models/openai_clip/README.md +++ b/qai_hub_models/models/openai_clip/README.md @@ -14,6 +14,8 @@ accross various devices, can be found [here](https://aihub.qualcomm.com/models/o a hosted 
Qualcomm® device. + + ## Example & Usage Install the package via pip: diff --git a/qai_hub_models/models/openai_clip/export.py b/qai_hub_models/models/openai_clip/export.py index 2c00d8fd..68ef8cab 100644 --- a/qai_hub_models/models/openai_clip/export.py +++ b/qai_hub_models/models/openai_clip/export.py @@ -134,7 +134,7 @@ def export_model( # 2. Compile the models to an on-device asset model_compile_options = component.get_hub_compile_options( - target_runtime, compile_options + target_runtime, compile_options, hub_device ) print(f"Optimizing model {component_name} to run on-device") submitted_compile_job = hub.submit_compile_job( @@ -226,12 +226,7 @@ def export_model( def main(): warnings.filterwarnings("ignore") - parser = export_parser( - model_cls=Model, - components=ALL_COMPONENTS, - supports_qnn=False, - supports_ort=False, - ) + parser = export_parser(model_cls=Model, components=ALL_COMPONENTS) args = parser.parse_args() export_model(**vars(args)) diff --git a/qai_hub_models/models/openai_clip/perf.yaml b/qai_hub_models/models/openai_clip/perf.yaml index 22fec649..c61f9847 100644 --- a/qai_hub_models/models/openai_clip/perf.yaml +++ b/qai_hub_models/models/openai_clip/perf.yaml @@ -22,6 +22,7 @@ aggregated: - Samsung Galaxy S24 Ultra - Samsung Galaxy S24+ - Samsung Galaxy Tab S8 + - Snapdragon X Elite CRD - Xiaomi 12 - Xiaomi 12 Pro supported_chipsets: @@ -30,15 +31,16 @@ aggregated: - Snapdragon® 8 Gen 2 - Snapdragon® 8 Gen 3 - Snapdragon® 888 + - Snapdragon® X Elite models: - name: CLIPTextEncoder performance_metrics: - torchscript_onnx_tflite: - inference_time: 15395.0 - throughput: 64.95615459564793 + inference_time: 13312.0 + throughput: 75.1201923076923 estimated_peak_memory_range: - min: 32768 - max: 2875584 + min: 20480 + max: 2971744 primary_compute_unit: NPU precision: fp16 layer_info: @@ -46,22 +48,37 @@ models: layers_on_gpu: 0 layers_on_cpu: 2 total_layers: 576 - job_id: j7gjzq275 + job_id: jqp4wr6lg + job_status: Passed + torchscript_onnx_qnn: + inference_time: 7826.0 + throughput: 127.77919754663941 + estimated_peak_memory_range: + min: 45056 + max: 25299672 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 377 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 377 + job_id: jqpyd3k4p job_status: Passed torchscript_onnx_ort: - inference_time: 33201.0 - throughput: 30.119574711605072 + inference_time: 31411.0 + throughput: 31.83598102575531 estimated_peak_memory_range: - min: 40960 - max: 328459688 + min: 16384 + max: 325180960 primary_compute_unit: NPU precision: fp16 layer_info: - layers_on_npu: 1 + layers_on_npu: 389 layers_on_gpu: 0 - layers_on_cpu: 1 - total_layers: 2 - job_id: jmg9jdyq5 + layers_on_cpu: 0 + total_layers: 389 + job_id: jwgov67d5 job_status: Passed reference_device_info: name: Samsung Galaxy S23 @@ -70,13 +87,13 @@ models: os_name: Android manufacturer: Samsung chipset: Snapdragon® 8 Gen 2 - timestamp: '2024-04-23T18:42:32.759891Z' + timestamp: '2024-05-20T16:35:30.066622Z' - torchscript_onnx_tflite: - inference_time: 11237.0 - throughput: 88.99172376968941 + inference_time: 9410.0 + throughput: 106.26992561105207 estimated_peak_memory_range: min: 16384 - max: 219358080 + max: 211565584 primary_compute_unit: NPU precision: fp16 layer_info: @@ -84,22 +101,37 @@ models: layers_on_gpu: 0 layers_on_cpu: 2 total_layers: 576 - job_id: jygzonjz5 + job_id: jo5mzx1qp + job_status: Passed + torchscript_onnx_qnn: + inference_time: 5494.0 + throughput: 182.01674554058974 + estimated_peak_memory_range: + min: 0 + max: 141191120 + 
primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 377 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 377 + job_id: j1p87yd85 job_status: Passed torchscript_onnx_ort: - inference_time: 23967.0 - throughput: 41.7240372178412 + inference_time: 22506.0 + throughput: 44.43259575224385 estimated_peak_memory_range: min: 36864 - max: 216279616 + max: 184881664 primary_compute_unit: NPU precision: fp16 layer_info: - layers_on_npu: 1 + layers_on_npu: 389 layers_on_gpu: 0 - layers_on_cpu: 1 - total_layers: 2 - job_id: jvgde2qk5 + layers_on_cpu: 0 + total_layers: 389 + job_id: j7gjln68p job_status: Passed reference_device_info: name: Samsung Galaxy S24 @@ -108,13 +140,13 @@ models: os_name: Android manufacturer: Samsung chipset: Snapdragon® 8 Gen 3 - timestamp: '2024-04-23T18:42:32.759974Z' + timestamp: '2024-05-20T16:35:30.066650Z' - torchscript_onnx_tflite: - inference_time: 15367.0 - throughput: 65.07451031430989 + inference_time: 13176.0 + throughput: 75.89556769884639 estimated_peak_memory_range: - min: 49152 - max: 3357800 + min: 16384 + max: 3268096 primary_compute_unit: NPU precision: fp16 layer_info: @@ -122,7 +154,22 @@ models: layers_on_gpu: 0 layers_on_cpu: 2 total_layers: 576 - job_id: j7gjz8785 + job_id: jopry3meg + job_status: Passed + torchscript_onnx_qnn: + inference_time: 7787.0 + throughput: 128.4191601386927 + estimated_peak_memory_range: + min: 32768 + max: 17390072 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 377 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 377 + job_id: jw561497p job_status: Passed reference_device_info: name: QCS8550 (Proxy) @@ -131,15 +178,68 @@ models: os_name: Android manufacturer: Qualcomm chipset: Qcs8550 - timestamp: '2024-04-23T18:42:32.760069Z' + timestamp: '2024-05-20T16:35:30.066668Z' + - torchscript_onnx_qnn: + inference_time: 8463.0 + throughput: 118.16140848398913 + estimated_peak_memory_range: + min: 229376 + max: 229376 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 377 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 377 + job_id: jn5q2qxm5 + job_status: Passed + torchscript_onnx_ort: + inference_time: 32986.0 + throughput: 30.315891590371674 + estimated_peak_memory_range: + min: 137265152 + max: 137265152 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 389 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 389 + job_id: jygz7dq6p + job_status: Passed + torchscript_onnx_ort_dml_gpu: + inference_time: 9446.0 + throughput: 105.86491636671607 + estimated_peak_memory_range: + min: 684032 + max: 684032 + primary_compute_unit: CPU + precision: fp32 + layer_info: + layers_on_npu: 0 + layers_on_gpu: 0 + layers_on_cpu: 290 + total_layers: 290 + job_id: jmg94n7v5 + job_status: Passed + reference_device_info: + name: Snapdragon X Elite CRD + os: '11' + form_factor: Compute + os_name: Windows + manufacturer: Qualcomm + chipset: Snapdragon® X Elite + timestamp: '2024-05-20T16:35:30.066690Z' - name: CLIPImageEncoder performance_metrics: - torchscript_onnx_tflite: - inference_time: 126657.0 - throughput: 7.895339381163299 + inference_time: 126619.0 + throughput: 7.8977088746554625 estimated_peak_memory_range: - min: 163840 - max: 3470824 + min: 126976 + max: 4408960 primary_compute_unit: NPU precision: fp16 layer_info: @@ -147,23 +247,38 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 576 - job_id: jlpeeyw7p + job_id: j0px1o89g + job_status: Passed + torchscript_onnx_qnn: + inference_time: 50334.0 + throughput: 
19.86728652600628 + estimated_peak_memory_range: + min: 16384 + max: 67772216 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 371 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 371 + job_id: j2p0r08ep job_status: Passed torchscript_onnx_ort: - inference_time: 'null' - throughput: 'null' + inference_time: 173185.0 + throughput: 5.774172128071138 estimated_peak_memory_range: - min: 0 - max: 0 - primary_compute_unit: 'null' - precision: 'null' + min: 40960 + max: 529782032 + primary_compute_unit: NPU + precision: fp16 layer_info: - layers_on_npu: 0 + layers_on_npu: 382 layers_on_gpu: 0 layers_on_cpu: 0 - total_layers: 0 - job_id: jnp1y6wkp - job_status: Failed + total_layers: 382 + job_id: j1pvwkymg + job_status: Passed reference_device_info: name: Samsung Galaxy S23 os: '13' @@ -171,13 +286,13 @@ models: os_name: Android manufacturer: Samsung chipset: Snapdragon® 8 Gen 2 - timestamp: '2024-04-23T18:42:32.760150Z' + timestamp: '2024-05-20T16:35:30.066714Z' - torchscript_onnx_tflite: - inference_time: 96976.0 - throughput: 10.31182973106748 + inference_time: 95991.0 + throughput: 10.417643320727985 estimated_peak_memory_range: - min: 229376 - max: 865695568 + min: 204800 + max: 748165536 primary_compute_unit: NPU precision: fp16 layer_info: @@ -185,22 +300,37 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 576 - job_id: jz5w243z5 + job_id: jegnevdmg + job_status: Passed + torchscript_onnx_qnn: + inference_time: 37870.0 + throughput: 26.406126221283337 + estimated_peak_memory_range: + min: 655360 + max: 195252672 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 371 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 371 + job_id: jogkyxwop job_status: Passed torchscript_onnx_ort: - inference_time: 128177.0 - throughput: 7.801711695546003 + inference_time: 131060.0 + throughput: 7.630093087135663 estimated_peak_memory_range: - min: 774144 - max: 1720363664 + min: 618496 + max: 1274243488 primary_compute_unit: NPU precision: fp16 layer_info: - layers_on_npu: 1 + layers_on_npu: 382 layers_on_gpu: 0 layers_on_cpu: 0 - total_layers: 1 - job_id: jz5w243j5 + total_layers: 382 + job_id: jlpevm005 job_status: Passed reference_device_info: name: Samsung Galaxy S24 @@ -209,13 +339,13 @@ models: os_name: Android manufacturer: Samsung chipset: Snapdragon® 8 Gen 3 - timestamp: '2024-04-23T18:42:32.760227Z' + timestamp: '2024-05-20T16:35:30.066735Z' - torchscript_onnx_tflite: - inference_time: 127012.0 - throughput: 7.873271816836205 + inference_time: 126196.0 + throughput: 7.924181432058068 estimated_peak_memory_range: - min: 184320 - max: 4508448 + min: 155648 + max: 4526472 primary_compute_unit: NPU precision: fp16 layer_info: @@ -223,7 +353,22 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 576 - job_id: jlpeenz0p + job_id: jep2myqm5 + job_status: Passed + torchscript_onnx_qnn: + inference_time: 50570.0 + throughput: 19.774569903104606 + estimated_peak_memory_range: + min: 57344 + max: 57651824 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 371 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 371 + job_id: j1p3m0lzg job_status: Passed reference_device_info: name: QCS8550 (Proxy) @@ -232,4 +377,57 @@ models: os_name: Android manufacturer: Qualcomm chipset: Qcs8550 - timestamp: '2024-04-23T18:42:32.760295Z' + timestamp: '2024-05-20T16:35:30.066751Z' + - torchscript_onnx_qnn: + inference_time: 48896.0 + throughput: 20.451570680628272 + estimated_peak_memory_range: + min: 602112 + max: 
602112 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 369 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 369 + job_id: j1glkm9lp + job_status: Passed + torchscript_onnx_ort: + inference_time: 168856.0 + throughput: 5.922205903254844 + estimated_peak_memory_range: + min: 492744704 + max: 492744704 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 382 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 382 + job_id: jz5w960jp + job_status: Passed + torchscript_onnx_ort_dml_gpu: + inference_time: 'null' + throughput: 'null' + estimated_peak_memory_range: + min: 0 + max: 0 + primary_compute_unit: 'null' + precision: 'null' + layer_info: + layers_on_npu: 0 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 0 + job_id: jnp18zklg + job_status: Failed + reference_device_info: + name: Snapdragon X Elite CRD + os: '11' + form_factor: Compute + os_name: Windows + manufacturer: Qualcomm + chipset: Snapdragon® X Elite + timestamp: '2024-05-20T16:35:30.066780Z' diff --git a/qai_hub_models/models/openpose/README.md b/qai_hub_models/models/openpose/README.md index 1789e21e..f5b7c4f4 100644 --- a/qai_hub_models/models/openpose/README.md +++ b/qai_hub_models/models/openpose/README.md @@ -14,6 +14,8 @@ accross various devices, can be found [here](https://aihub.qualcomm.com/models/o a hosted Qualcomm® device. + + ## Example & Usage Install the package via pip: diff --git a/qai_hub_models/models/openpose/export.py b/qai_hub_models/models/openpose/export.py index d8e1d30e..430e7ecb 100644 --- a/qai_hub_models/models/openpose/export.py +++ b/qai_hub_models/models/openpose/export.py @@ -120,12 +120,17 @@ def export_model( model.eval() source_model = torch.jit.trace(model.to("cpu"), make_torch_inputs(input_spec)) + # Convert outputs from channel last to channel first (preferred I/O format for QNN and TensorFlow Lite) + channel_last_flags = ( + " --force_channel_last_input image" + + " --force_channel_last_output output_0,output_1" + if target_runtime != TargetRuntime.ORT + else "" + ) + # 2. 
Compile the model to an on-device asset model_compile_options = model.get_hub_compile_options( - target_runtime, - compile_options - + " --force_channel_last_input image" - + " --force_channel_last_output output_0,output_1", + target_runtime, compile_options + channel_last_flags, hub_device ) print(f"Optimizing model {model_name} to run on-device") submitted_compile_job = hub.submit_compile_job( @@ -163,8 +168,10 @@ def export_model( ) sample_inputs = model.sample_inputs(input_spec) # Convert inputs from channel first to channel last - hub_inputs = transpose_channel_first_to_last( - "image", sample_inputs, target_runtime + hub_inputs = ( + sample_inputs + if target_runtime == TargetRuntime.ORT + else transpose_channel_first_to_last("image", sample_inputs, target_runtime) ) submitted_inference_job = hub.submit_inference_job( model=compile_job.get_target_model(), @@ -192,8 +199,12 @@ def export_model( assert inference_job is not None and inference_job.wait().success inference_result: hub.client.DatasetEntries = inference_job.download_output_data() # type: ignore # Convert outputs from channel last to channel first - inference_result = transpose_channel_last_to_first( - "output_0,output_1", inference_result, target_runtime + inference_result = ( + inference_result + if target_runtime == TargetRuntime.ORT + else transpose_channel_last_to_first( + "output_0,output_1", inference_result, target_runtime + ) ) print_inference_metrics(inference_job, inference_result, torch_out) @@ -205,7 +216,7 @@ def export_model( def main(): warnings.filterwarnings("ignore") - parser = export_parser(model_cls=Model, supports_ort=False) + parser = export_parser(model_cls=Model) args = parser.parse_args() export_model(**vars(args)) diff --git a/qai_hub_models/models/openpose/info.yaml b/qai_hub_models/models/openpose/info.yaml index 3ec39a9a..8b1c1e00 100644 --- a/qai_hub_models/models/openpose/info.yaml +++ b/qai_hub_models/models/openpose/info.yaml @@ -31,7 +31,7 @@ related_models: - litehrnet - mediapipe_pose has_static_banner: yes -has_animated_banner: no +has_animated_banner: yes license_type: other deploy_license_type: AI Model Hub License dataset: [] diff --git a/qai_hub_models/models/openpose/perf.yaml b/qai_hub_models/models/openpose/perf.yaml index df272c6b..196de26e 100644 --- a/qai_hub_models/models/openpose/perf.yaml +++ b/qai_hub_models/models/openpose/perf.yaml @@ -22,6 +22,7 @@ aggregated: - Samsung Galaxy S24 Ultra - Samsung Galaxy S24+ - Samsung Galaxy Tab S8 + - Snapdragon X Elite CRD - Xiaomi 12 - Xiaomi 12 Pro supported_chipsets: @@ -30,15 +31,16 @@ aggregated: - Snapdragon® 8 Gen 2 - Snapdragon® 8 Gen 3 - Snapdragon® 888 + - Snapdragon® X Elite models: - name: OpenPose performance_metrics: - torchscript_onnx_tflite: - inference_time: 11751.0 - throughput: 85.09914049868097 + inference_time: 11697.0 + throughput: 85.4920064973925 estimated_peak_memory_range: - min: 225280 - max: 2603680 + min: 204800 + max: 2413880 primary_compute_unit: NPU precision: fp16 layer_info: @@ -46,14 +48,14 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 103 - job_id: j0pxnxy95 + job_id: jvgdv1ylg job_status: Passed torchscript_onnx_qnn: - inference_time: 11827.0 - throughput: 84.5522955948254 + inference_time: 11783.0 + throughput: 84.86803021301876 estimated_peak_memory_range: - min: 651264 - max: 242798248 + min: 638976 + max: 240653744 primary_compute_unit: NPU precision: fp16 layer_info: @@ -61,22 +63,22 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 186 - job_id: jegnlk3m5 + job_id: 
jnp18z92g job_status: Passed torchscript_onnx_ort: - inference_time: 12055.0 - throughput: 82.9531314807134 + inference_time: 11925.0 + throughput: 83.85744234800839 estimated_peak_memory_range: - min: 589824 - max: 430729112 + min: 622592 + max: 408558976 primary_compute_unit: NPU precision: fp16 layer_info: - layers_on_npu: 1 + layers_on_npu: 189 layers_on_gpu: 0 layers_on_cpu: 0 - total_layers: 1 - job_id: jep20elmg + total_layers: 189 + job_id: j0px1oq1g job_status: Passed reference_device_info: name: Samsung Galaxy S23 @@ -85,13 +87,13 @@ models: os_name: Android manufacturer: Samsung chipset: Snapdragon® 8 Gen 2 - timestamp: '2024-04-23T18:42:32.792235Z' + timestamp: '2024-05-20T16:35:30.123582Z' - torchscript_onnx_tflite: - inference_time: 8779.0 - throughput: 113.90818999886092 + inference_time: 8714.0 + throughput: 114.75786091347257 estimated_peak_memory_range: - min: 196608 - max: 34017488 + min: 212992 + max: 35487584 primary_compute_unit: NPU precision: fp16 layer_info: @@ -99,14 +101,14 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 103 - job_id: jo5mq83qp + job_id: jz5w96k6p job_status: Passed torchscript_onnx_qnn: - inference_time: 8774.0 - throughput: 113.97310234784591 + inference_time: 8761.0 + throughput: 114.1422212076247 estimated_peak_memory_range: - min: 638976 - max: 51579776 + min: 618496 + max: 53231792 primary_compute_unit: NPU precision: fp16 layer_info: @@ -114,22 +116,22 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 186 - job_id: jopr8wee5 + job_id: jvgdv1keg job_status: Passed torchscript_onnx_ort: - inference_time: 9248.0 - throughput: 108.13148788927336 + inference_time: 9189.0 + throughput: 108.82576994232234 estimated_peak_memory_range: - min: 622592 - max: 22342656 + min: 2715648 + max: 30463376 primary_compute_unit: NPU precision: fp16 layer_info: - layers_on_npu: 1 + layers_on_npu: 189 layers_on_gpu: 0 layers_on_cpu: 0 - total_layers: 1 - job_id: jqpyrm645 + total_layers: 189 + job_id: jo5mzx7wp job_status: Passed reference_device_info: name: Samsung Galaxy S24 @@ -138,13 +140,13 @@ models: os_name: Android manufacturer: Samsung chipset: Snapdragon® 8 Gen 3 - timestamp: '2024-04-23T18:42:32.792298Z' + timestamp: '2024-05-20T16:35:30.123609Z' - torchscript_onnx_tflite: - inference_time: 11875.0 - throughput: 84.21052631578948 + inference_time: 11765.0 + throughput: 84.99787505312368 estimated_peak_memory_range: - min: 139264 - max: 2225560 + min: 233472 + max: 2374096 primary_compute_unit: NPU precision: fp16 layer_info: @@ -152,14 +154,14 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 103 - job_id: j0pxnzl15 + job_id: jmg94nrl5 job_status: Passed torchscript_onnx_qnn: - inference_time: 11826.0 - throughput: 84.5594452900389 + inference_time: 11798.0 + throughput: 84.76012883539583 estimated_peak_memory_range: - min: 663552 - max: 242581864 + min: 622592 + max: 241415392 primary_compute_unit: NPU precision: fp16 layer_info: @@ -167,7 +169,7 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 186 - job_id: jep20zr4g + job_id: jqp4wr7vg job_status: Passed reference_device_info: name: QCS8550 (Proxy) @@ -176,4 +178,57 @@ models: os_name: Android manufacturer: Qualcomm chipset: Qcs8550 - timestamp: '2024-04-23T18:42:32.792350Z' + timestamp: '2024-05-20T16:35:30.123627Z' + - torchscript_onnx_qnn: + inference_time: 14112.0 + throughput: 70.86167800453515 + estimated_peak_memory_range: + min: 602112 + max: 602112 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 186 + layers_on_gpu: 0 + 
layers_on_cpu: 0 + total_layers: 186 + job_id: jz57drml5 + job_status: Passed + torchscript_onnx_ort: + inference_time: 12340.0 + throughput: 81.03727714748784 + estimated_peak_memory_range: + min: 90116096 + max: 90116096 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 189 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 189 + job_id: jegnev4rg + job_status: Passed + torchscript_onnx_ort_dml_gpu: + inference_time: 167707.0 + throughput: 5.962780325210039 + estimated_peak_memory_range: + min: 87339008 + max: 87339008 + primary_compute_unit: CPU + precision: fp32 + layer_info: + layers_on_npu: 0 + layers_on_gpu: 0 + layers_on_cpu: 103 + total_layers: 103 + job_id: jopry3r9g + job_status: Passed + reference_device_info: + name: Snapdragon X Elite CRD + os: '11' + form_factor: Compute + os_name: Windows + manufacturer: Qualcomm + chipset: Snapdragon® X Elite + timestamp: '2024-05-20T16:35:30.123650Z' diff --git a/qai_hub_models/models/posenet_mobilenet/README.md b/qai_hub_models/models/posenet_mobilenet/README.md new file mode 100644 index 00000000..38ed61a7 --- /dev/null +++ b/qai_hub_models/models/posenet_mobilenet/README.md @@ -0,0 +1,56 @@ +[![Qualcomm® AI Hub Models](https://qaihub-public-assets.s3.us-west-2.amazonaws.com/qai-hub-models/quic-logo.jpg)](../../README.md) + + +# [Posenet-Mobilenet: Perform accurate human pose estimation](#) + +Posenet performs pose estimation on human images. + +This is based on the implementation of Posenet-Mobilenet found +[here](https://github.com/rwightman/posenet-pytorch). This repository contains scripts for optimized on-device +export suitable to run on Qualcomm® devices. More details on model performance +accross various devices, can be found [here](#). + +[Sign up](https://myaccount.qualcomm.com/signup) for early access to run these models on +a hosted Qualcomm® device. + + + + +## Example & Usage + + +Once installed, run the following simple CLI demo: + +```bash +python -m qai_hub_models.models.posenet_mobilenet.demo +``` +More details on the CLI tool can be found with the `--help` option. See +[demo.py](demo.py) for sample usage of the model including pre/post processing +scripts. Please refer to our [general instructions on using +models](../../../#getting-started) for more usage instructions. + +## Export for on-device deployment + +This repository contains export scripts that produce a model optimized for +on-device deployment. This can be run as follows: + +```bash +python -m qai_hub_models.models.posenet_mobilenet.export +``` +Additional options are documented with the `--help` option. Note that the above +script requires access to Deployment instructions for Qualcomm® AI Hub. + +## License +- The license for the original implementation of Posenet-Mobilenet can be found + [here](https://github.com/rwightman/posenet-pytorch/blob/master/LICENSE.txt). +- The license for the compiled assets for on-device deployment can be found [here]({deploy_license_url}) + +## References +* [PersonLab: Person Pose Estimation and Instance Segmentation with a Bottom-Up, Part-Based, Geometric Embedding Model](https://arxiv.org/abs/1803.08225) +* [Source Model Implementation](https://github.com/rwightman/posenet-pytorch) + +## Community +* Join [our AI Hub Slack community](https://qualcomm-ai-hub.slack.com/join/shared_invite/zt-2d5zsmas3-Sj0Q9TzslueCjS31eXG2UA#/shared-invite/email) to collaborate, post questions and learn more about on-device AI. 
+* For questions or feedback please [reach out to us](mailto:ai-hub-support@qti.qualcomm.com). + + diff --git a/qai_hub_models/models/posenet_mobilenet/__init__.py b/qai_hub_models/models/posenet_mobilenet/__init__.py new file mode 100644 index 00000000..87c45151 --- /dev/null +++ b/qai_hub_models/models/posenet_mobilenet/__init__.py @@ -0,0 +1,7 @@ +# --------------------------------------------------------------------- +# Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. +# SPDX-License-Identifier: BSD-3-Clause +# --------------------------------------------------------------------- +from .app import PosenetApp # noqa: F401 +from .model import MODEL_ID # noqa: F401 +from .model import PosenetMobilenet as Model # noqa: F401 diff --git a/qai_hub_models/models/posenet_mobilenet/app.py b/qai_hub_models/models/posenet_mobilenet/app.py new file mode 100644 index 00000000..2ccca2f2 --- /dev/null +++ b/qai_hub_models/models/posenet_mobilenet/app.py @@ -0,0 +1,588 @@ +# --------------------------------------------------------------------- +# Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. +# SPDX-License-Identifier: BSD-3-Clause +# --------------------------------------------------------------------- + +from __future__ import annotations + +from typing import Callable, List, Tuple + +import cv2 +import numpy as np +import torch +from PIL import Image +from torchvision import transforms + +from qai_hub_models.models.posenet_mobilenet.model import OUTPUT_STRIDE +from qai_hub_models.utils.draw import draw_points +from qai_hub_models.utils.image_processing import pil_resize_pad, pil_undo_resize_pad + +# Most code here is from the source repo https://github.com/rwightman/posenet-pytorch + +PART_NAMES = [ + "nose", + "leftEye", + "rightEye", + "leftEar", + "rightEar", + "leftShoulder", + "rightShoulder", + "leftElbow", + "rightElbow", + "leftWrist", + "rightWrist", + "leftHip", + "rightHip", + "leftKnee", + "rightKnee", + "leftAnkle", + "rightAnkle", +] + +NUM_KEYPOINTS = len(PART_NAMES) + +PART_IDS = {pn: pid for pid, pn in enumerate(PART_NAMES)} +LOCAL_MAXIMUM_RADIUS = 1 + +POSE_CHAIN = [ + ("nose", "leftEye"), + ("leftEye", "leftEar"), + ("nose", "rightEye"), + ("rightEye", "rightEar"), + ("nose", "leftShoulder"), + ("leftShoulder", "leftElbow"), + ("leftElbow", "leftWrist"), + ("leftShoulder", "leftHip"), + ("leftHip", "leftKnee"), + ("leftKnee", "leftAnkle"), + ("nose", "rightShoulder"), + ("rightShoulder", "rightElbow"), + ("rightElbow", "rightWrist"), + ("rightShoulder", "rightHip"), + ("rightHip", "rightKnee"), + ("rightKnee", "rightAnkle"), +] + +PARENT_CHILD_TUPLES = [ + (PART_IDS[parent], PART_IDS[child]) for parent, child in POSE_CHAIN +] +CONNECTED_PART_NAMES = [ + ("leftHip", "leftShoulder"), + ("leftElbow", "leftShoulder"), + ("leftElbow", "leftWrist"), + ("leftHip", "leftKnee"), + ("leftKnee", "leftAnkle"), + ("rightHip", "rightShoulder"), + ("rightElbow", "rightShoulder"), + ("rightElbow", "rightWrist"), + ("rightHip", "rightKnee"), + ("rightKnee", "rightAnkle"), + ("leftShoulder", "rightShoulder"), + ("leftHip", "rightHip"), +] + +CONNECTED_PART_INDICES = [(PART_IDS[a], PART_IDS[b]) for a, b in CONNECTED_PART_NAMES] + + +def traverse_to_targ_keypoint( + edge_id: int, + source_keypoint: np.ndarray, + target_keypoint_id: int, + scores: np.ndarray, + offsets: np.ndarray, + displacements: np.ndarray, +) -> Tuple[float, np.ndarray]: + """ + Given a source keypoint and target_keypoint_id, + predict the score and coordinates of the target 
keypoint. + + Parameters: + edge_id: Index of the edge being considered. + Equivalent to the index in `POSE_CHAIN`. + source_keypoint: (y, x) coordinates of the keypoint. + target_keypoint_id: Which body part type of the 17 this keypoint is. + scores: See `decode_multiple_poses`. + offsets: See `decode_multiple_poses`. + displacements: See `decode_multiple_poses`. + + Returns: + Tuple of target keypoint score and coordinates. + """ + height = scores.shape[1] + width = scores.shape[2] + + source_keypoint_indices = np.clip( + np.round(source_keypoint / OUTPUT_STRIDE), + a_min=0, + a_max=[height - 1, width - 1], + ).astype(np.int32) + + displaced_point = ( + source_keypoint + + displacements[edge_id, source_keypoint_indices[0], source_keypoint_indices[1]] + ) + + displaced_point_indices = np.clip( + np.round(displaced_point / OUTPUT_STRIDE), + a_min=0, + a_max=[height - 1, width - 1], + ).astype(np.int32) + + score = scores[ + target_keypoint_id, displaced_point_indices[0], displaced_point_indices[1] + ] + + image_coord = ( + displaced_point_indices * OUTPUT_STRIDE + + offsets[ + target_keypoint_id, displaced_point_indices[0], displaced_point_indices[1] + ] + ) + + return score, image_coord + + +def decode_pose( + root_score: float, + root_id: int, + root_image_coord: np.ndarray, + scores: np.ndarray, + offsets: np.ndarray, + displacements_fwd: np.ndarray, + displacements_bwd: np.ndarray, +) -> Tuple[np.ndarray, np.ndarray]: + """ + Get all keypoint predictions for a pose given a root keypoint with a high score. + + Parameters: + root_score: The confidence score of the root keypoint. + root_id: Which body part type of the 17 this keypoint is. + root_image_coord: (y, x) coordinates of the keypoint. + scores: See `decode_multiple_poses`. + offsets: See `decode_multiple_poses`. + displacements_fwd: See `decode_multiple_poses`. + displacements_bwd: See `decode_multiple_poses`. + + Returns: + Tuple of list of keypoint scores and list of coordinates. + """ + num_parts = scores.shape[0] + num_edges = len(PARENT_CHILD_TUPLES) + + instance_keypoint_scores = np.zeros(num_parts) + instance_keypoint_coords = np.zeros((num_parts, 2)) + instance_keypoint_scores[root_id] = root_score + instance_keypoint_coords[root_id] = root_image_coord + + for edge in reversed(range(num_edges)): + target_keypoint_id, source_keypoint_id = PARENT_CHILD_TUPLES[edge] + if ( + instance_keypoint_scores[source_keypoint_id] > 0.0 + and instance_keypoint_scores[target_keypoint_id] == 0.0 + ): + score, coords = traverse_to_targ_keypoint( + edge, + instance_keypoint_coords[source_keypoint_id], + target_keypoint_id, + scores, + offsets, + displacements_bwd, + ) + instance_keypoint_scores[target_keypoint_id] = score + instance_keypoint_coords[target_keypoint_id] = coords + + for edge in range(num_edges): + source_keypoint_id, target_keypoint_id = PARENT_CHILD_TUPLES[edge] + if ( + instance_keypoint_scores[source_keypoint_id] > 0.0 + and instance_keypoint_scores[target_keypoint_id] == 0.0 + ): + score, coords = traverse_to_targ_keypoint( + edge, + instance_keypoint_coords[source_keypoint_id], + target_keypoint_id, + scores, + offsets, + displacements_fwd, + ) + instance_keypoint_scores[target_keypoint_id] = score + instance_keypoint_coords[target_keypoint_id] = coords + + return instance_keypoint_scores, instance_keypoint_coords + + +def within_nms_radius_fast( + pose_coords: np.ndarray, nms_radius: float, point: np.ndarray +) -> bool: + """ + Whether the candidate point is nearby any existing point in `pose_coords`. 
+ + pose_coords: + Numpy array of points, shape (N, 2). + nms_radius: + The distance between two points for them to be considered nearby. + point: + The candidate point, shape (2,). + """ + if not pose_coords.shape[0]: + return False + return np.any(np.sum((pose_coords - point) ** 2, axis=1) <= nms_radius**2) + + +def get_instance_score_fast( + exist_pose_coords: np.ndarray, + nms_radius: int, + keypoint_scores: np.ndarray, + keypoint_coords: np.ndarray, +) -> float: + """ + Compute a probability that the given pose is real. + Equal to the average confidence of each keypoint, excluding keypoints + that are shared with existing poses. + + Parameters: + exist_pose_coords: Keypoint coordinates of poses that have already been found. + Shape (N, 17, 2) + nms_radius: + If two candidate keypoints for the same body part are within this distance, + they are considered the same, and the lower confidence one discarded. + keypoint_scores: + Keypoint scores for the new pose. Shape (17,) + keypoint_coords: + Coordinates for the new pose. Shape (17, 2) + + Returns: + Confidence score for the pose. + """ + if exist_pose_coords.shape[0]: + s = np.sum((exist_pose_coords - keypoint_coords) ** 2, axis=2) > nms_radius**2 + not_overlapped_scores = np.sum(keypoint_scores[np.all(s, axis=0)]) + else: + not_overlapped_scores = np.sum(keypoint_scores) + return not_overlapped_scores / len(keypoint_scores) + + +def build_part_with_score_torch( + score_threshold: float, max_vals: torch.Tensor, scores: torch.Tensor +) -> Tuple[torch.Tensor, torch.Tensor]: + """ + Get candidate keypoints to be considered the root for a pose. + Score for the keypoint must be >= all neighboring scores. + Score must also be above given score_threshold. + + Parameters: + score_threshold: Minimum score for a keypoint to be considered as a root. + max_vals: See `decode_multiple_poses`. + scores: See `decode_multiple_poses`. + + Returns: + Tuple of: + - Torch scores for each keypoint to be considered. + - Indices of the considered keypoints. Shape (N, 3) where the 3 indices + map to the dimensions of the scores tensor with shape (17, h, w). + """ + max_loc = (scores == max_vals) & (scores >= score_threshold) + max_loc_idx = max_loc.nonzero() + scores_vec = scores[max_loc] + sort_idx = torch.argsort(scores_vec, descending=True) + return scores_vec[sort_idx], max_loc_idx[sort_idx] + + +def decode_multiple_poses( + scores: torch.Tensor, + offsets: torch.Tensor, + displacements_fwd: torch.Tensor, + displacements_bwd: torch.Tensor, + max_vals: torch.Tensor, + max_pose_detections: int = 10, + score_threshold: float = 0.25, + nms_radius: int = 20, + min_pose_score: float = 0.25, +) -> Tuple[np.ndarray, np.ndarray, np.ndarray]: + """ + Converts raw model outputs into image with keypoints drawn. + Can detect multiple poses in the same image, up to `max_pose_detections`. + This model has 17 candidate keypoints it predicts. + In this docstring, (h, w) correspond to height and width of the grid + and are roughly equal to input image size divided by 16. + + Parameters: + scores: + Tensor of scores in range [0, 1] indicating probability + a candidate pose is real. Shape [17, h, w]. + offsets: + Tensor of offsets for a given keypoint, relative to the grid point. + Shape [34, h, w]. + displacements_fwd: + When tracing the points for a pose, given a source keypoint, this value + gives the displacement to the next keypoint in the pose. There are 16 + connections from one keypoint to another (it's a minimum spanning tree). + Shape [32, h, w]. 
+ displacements_bwd: + Same as displacements_fwd, except when traversing keypoint connections + in the opposite direction. + max_vals: + Same as scores except with a max pool applied with kernel size 3. + max_pose_detections: + Maximum number of distinct poses to detect in a single image. + score_threshold: + Minimum score for a keypoint to be considered the root for a pose. + nms_radius: + If two candidate keypoints for the same body part are within this distance, + they are considered the same, and the lower confidence one discarded. + min_pose_score: + Minimum confidence that a pose exists for it to be displayed. + + Returns: + Tuple of: + - Numpy array of pose confidence scores. + - Numpy array of keypoint confidence scores. + - Numpy array of keypoint coordinates. + """ + part_scores, part_idx = build_part_with_score_torch( + score_threshold, max_vals, scores + ) + part_scores = part_scores.cpu().numpy() + part_idx = part_idx.cpu().numpy() + + scores = scores.cpu().numpy() + height = scores.shape[1] + width = scores.shape[2] + # change dimensions from (x, h, w) to (x//2, h, w, 2) to allow return of complete coord array + offsets = ( + offsets.cpu().numpy().reshape(2, -1, height, width).transpose((1, 2, 3, 0)) + ) + displacements_fwd = ( + displacements_fwd.cpu() + .numpy() + .reshape(2, -1, height, width) + .transpose((1, 2, 3, 0)) + ) + displacements_bwd = ( + displacements_bwd.cpu() + .numpy() + .reshape(2, -1, height, width) + .transpose((1, 2, 3, 0)) + ) + + pose_count = 0 + pose_scores = np.zeros(max_pose_detections) + pose_keypoint_scores = np.zeros((max_pose_detections, NUM_KEYPOINTS)) + pose_keypoint_coords = np.zeros((max_pose_detections, NUM_KEYPOINTS, 2)) + + for root_score, (root_id, root_coord_y, root_coord_x) in zip(part_scores, part_idx): + root_coord = np.array([root_coord_y, root_coord_x]) + root_image_coords = ( + root_coord * OUTPUT_STRIDE + offsets[root_id, root_coord_y, root_coord_x] + ) + + if within_nms_radius_fast( + pose_keypoint_coords[:pose_count, root_id, :], + nms_radius, + root_image_coords, + ): + continue + + keypoint_scores, keypoint_coords = decode_pose( + root_score, + root_id, + root_image_coords, + scores, + offsets, + displacements_fwd, + displacements_bwd, + ) + + pose_score = get_instance_score_fast( + pose_keypoint_coords[:pose_count, :, :], + nms_radius, + keypoint_scores, + keypoint_coords, + ) + + # NOTE this isn't in the original implementation, but it appears that by initially ordering by + # part scores, and having a max # of detections, we can end up populating the returned poses with + # lower scored poses than if we discard 'bad' ones and continue (higher pose scores can still come later). + # Set min_pose_score to 0. to revert to original behaviour + if min_pose_score == 0.0 or pose_score >= min_pose_score: + pose_scores[pose_count] = pose_score + pose_keypoint_scores[pose_count, :] = keypoint_scores + pose_keypoint_coords[pose_count, :, :] = keypoint_coords + pose_count += 1 + + if pose_count >= max_pose_detections: + break + + return pose_scores, pose_keypoint_scores, pose_keypoint_coords + + +def get_adjacent_keypoints( + keypoint_scores: np.ndarray, keypoint_coords: np.ndarray, score_threshold: float +) -> List[np.ndarray]: + """ + Compute which keypoints should be connected in the image. + + keypoint_scores: + Scores for all candidate keypoints in the pose. + keypoint_coords: + Coordinates for all candidate keypoints in the pose. + score_threshold: + If either keypoint in a candidate edge is below this threshold, omit the edge. 
+
+    Returns:
+        List of (2, 2) numpy arrays containing coordinates of edge endpoints.
+    """
+    results = []
+    for left, right in CONNECTED_PART_INDICES:
+        if (
+            keypoint_scores[left] < score_threshold
+            or keypoint_scores[right] < score_threshold
+        ):
+            continue
+        results.append(
+            np.array(
+                [keypoint_coords[left][::-1], keypoint_coords[right][::-1]]
+            ).astype(np.int32),
+        )
+    return results
+
+
+def draw_skel_and_kp(
+    img: np.ndarray,
+    instance_scores: np.ndarray,
+    keypoint_scores: np.ndarray,
+    keypoint_coords: np.ndarray,
+    min_pose_score: float = 0.5,
+    min_part_score: float = 0.5,
+) -> None:
+    """
+    Draw the keypoints and edges on the input numpy array image in-place.
+
+    Parameters:
+        img: Numpy array of the image.
+        instance_scores: Numpy array of confidence for each pose.
+        keypoint_scores: Numpy array of confidence for each keypoint.
+        keypoint_coords: Numpy array of coordinates for each keypoint.
+        min_pose_score: Minimum score for a pose to be displayed.
+        min_part_score: Minimum score for a keypoint to be displayed.
+    """
+    adjacent_keypoints = []
+    points = []
+    sizes = []
+    for ii, score in enumerate(instance_scores):
+        if score < min_pose_score:
+            continue
+
+        new_connections = get_adjacent_keypoints(
+            keypoint_scores[ii, :], keypoint_coords[ii, :, :], min_part_score
+        )
+        adjacent_keypoints.extend(new_connections)
+
+        for ks, kc in zip(keypoint_scores[ii, :], keypoint_coords[ii, :, :]):
+            if ks < min_part_score:
+                continue
+            points.append([kc[1], kc[0]])
+            sizes.append(10.0 * ks)
+
+    if points:
+        points_np = np.array(points)
+        draw_points(img, points_np, color=(255, 255, 0), size=sizes)
+        cv2.polylines(img, adjacent_keypoints, isClosed=False, color=(255, 255, 0))
+
+
+class PosenetApp:
+    """
+    This class consists of light-weight "app code" that is required to perform end-to-end inference with Posenet.
+
+    The app uses 1 model:
+        * Posenet
+
+    For a given image input, the app will:
+        * pre-process the image
+        * Run Posenet inference
+        * Convert the output into a list of keypoint coordinates
+        * Return raw coordinates or an image with keypoints overlaid
+    """
+
+    def __init__(
+        self,
+        model: Callable[
+            [torch.Tensor],
+            Tuple[
+                torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor
+            ],
+        ],
+        input_height: int,
+        input_width: int,
+    ):
+        self.model = model
+        self.input_height = input_height
+        self.input_width = input_width
+
+    def predict(self, *args, **kwargs):
+        # See predict_pose_keypoints.
+        return self.predict_pose_keypoints(*args, **kwargs)
+
+    def predict_pose_keypoints(
+        self,
+        image: Image.Image,
+        raw_output: bool = False,
+    ) -> np.ndarray | Image.Image:
+        """
+        Predicts up to 17 pose keypoints for up to 10 people in the image.
+
+        Parameters:
+            image: Image on which to predict pose keypoints.
+            raw_output: bool
+                See "returns" doc section for details.
+
+        Returns:
+            If raw_output is true, returns:
+                pose_scores: np.ndarray, shape (10,)
+                    Confidence score that a given pose is real for up to 10 poses.
+                keypoint_scores: np.ndarray, shape (10, 17)
+                    Confidence score that a given keypoint is real. There can be up to
+                    10 poses and up to 17 keypoints per pose.
+                keypoint_coords: np.ndarray, shape (10, 17, 2)
+                    Coordinates of predicted keypoints in (y, x) format.
+
+            Otherwise, returns:
+                predicted_images: PIL.Image.Image
+                    Image with keypoints drawn.
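+
+        Example (an illustrative sketch; `person.jpg` is a placeholder path,
+        `PosenetMobilenet` comes from qai_hub_models.models.posenet_mobilenet.model,
+        and 513x257 matches the default get_input_spec):
+            model = PosenetMobilenet.from_pretrained()
+            app = PosenetApp(model, input_height=513, input_width=257)
+            result_image = app.predict_pose_keypoints(Image.open("person.jpg"))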
+ """ + original_size = (image.size[-2], image.size[-1]) + image, scale, padding = pil_resize_pad( + image, (self.input_height, self.input_width) + ) + tensor = transforms.ToTensor()(image) + tensor = tensor.reshape(1, 3, self.input_height, self.input_width) + + np.save("build/posenet_inputs", tensor.numpy()) + with torch.no_grad(): + ( + heatmaps_result, + offsets_result, + displacement_fwd_result, + displacement_bwd_result, + max_vals, + ) = self.model(tensor) + pose_scores, keypoint_scores, keypoint_coords = decode_multiple_poses( + heatmaps_result.squeeze(0), + offsets_result.squeeze(0), + displacement_fwd_result.squeeze(0), + displacement_bwd_result.squeeze(0), + max_vals.squeeze(0), + max_pose_detections=10, + min_pose_score=0.25, + ) + if raw_output: + return pose_scores, keypoint_scores, keypoint_coords + output_arr = np.array(image) + draw_skel_and_kp( + output_arr, + pose_scores, + keypoint_scores, + keypoint_coords, + min_pose_score=0.25, + min_part_score=0.25, + ) + image_result = Image.fromarray(output_arr) + return pil_undo_resize_pad(image_result, original_size, scale, padding) diff --git a/qai_hub_models/models/posenet_mobilenet/conftest.py b/qai_hub_models/models/posenet_mobilenet/conftest.py new file mode 100644 index 00000000..6c1bdeed --- /dev/null +++ b/qai_hub_models/models/posenet_mobilenet/conftest.py @@ -0,0 +1,39 @@ +# --------------------------------------------------------------------- +# Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. +# SPDX-License-Identifier: BSD-3-Clause +# --------------------------------------------------------------------- +# THIS FILE WAS AUTO-GENERATED. DO NOT EDIT MANUALLY. + +import inspect + +import pytest + +from qai_hub_models.models.posenet_mobilenet import Model +from qai_hub_models.utils.testing import skip_clone_repo_check + + +# Instantiate the model only once for all tests. +# Mock from_pretrained to always return the initialized model. +# This speeds up tests and limits memory leaks. +@pytest.fixture(scope="module", autouse=True) +def cached_from_pretrained(): + with pytest.MonkeyPatch.context() as mp: + pretrained_cache = {} + from_pretrained = Model.from_pretrained + sig = inspect.signature(from_pretrained) + + @skip_clone_repo_check + def _cached_from_pretrained(*args, **kwargs): + cache_key = str(args) + str(kwargs) + model = pretrained_cache.get(cache_key, None) + if model: + return model + else: + model = from_pretrained(*args, **kwargs) + pretrained_cache[cache_key] = model + return model + + _cached_from_pretrained.__signature__ = sig + + mp.setattr(Model, "from_pretrained", _cached_from_pretrained) + yield mp diff --git a/qai_hub_models/models/posenet_mobilenet/demo.py b/qai_hub_models/models/posenet_mobilenet/demo.py new file mode 100644 index 00000000..57a045f3 --- /dev/null +++ b/qai_hub_models/models/posenet_mobilenet/demo.py @@ -0,0 +1,62 @@ +# --------------------------------------------------------------------- +# Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. 
+# SPDX-License-Identifier: BSD-3-Clause +# --------------------------------------------------------------------- +from __future__ import annotations + +from typing import Type + +from qai_hub_models.models.posenet_mobilenet.app import PosenetApp +from qai_hub_models.models.posenet_mobilenet.model import ( + MODEL_ASSET_VERSION, + MODEL_ID, + PosenetMobilenet, +) +from qai_hub_models.utils.args import ( + demo_model_from_cli_args, + get_model_cli_parser, + get_on_device_demo_parser, + validate_on_device_demo_args, +) +from qai_hub_models.utils.asset_loaders import CachedWebModelAsset, load_image +from qai_hub_models.utils.display import display_or_save_image + +IMAGE_ADDRESS = CachedWebModelAsset.from_asset_store( + MODEL_ID, MODEL_ASSET_VERSION, "posenet_demo.jpg" +) + + +# The demo will display a image with the predicted keypoints. +def posenet_demo(model_cls: Type[PosenetMobilenet], is_test: bool = False): + # Demo parameters + parser = get_model_cli_parser(model_cls) + parser = get_on_device_demo_parser(parser, add_output_dir=True) + parser.add_argument( + "--image", + type=str, + default=IMAGE_ADDRESS, + help="image file path or URL", + ) + args = parser.parse_args([] if is_test else None) + validate_on_device_demo_args(args, MODEL_ID) + + # Load image & model + model = demo_model_from_cli_args(model_cls, MODEL_ID, args) + image = load_image(args.image) + print("Model Loaded") + + h, w = model_cls.get_input_spec()["image"][0][2:] + app = PosenetApp(model, h, w) + keypoints = app.predict_pose_keypoints(image) + if not is_test: + display_or_save_image( + keypoints, args.output_dir, "posenet_demo_output.png", "keypoints" + ) + + +def main(is_test: bool = False): + return posenet_demo(PosenetMobilenet, is_test) + + +if __name__ == "__main__": + main() diff --git a/qai_hub_models/models/posenet_mobilenet/export.py b/qai_hub_models/models/posenet_mobilenet/export.py new file mode 100644 index 00000000..e948410d --- /dev/null +++ b/qai_hub_models/models/posenet_mobilenet/export.py @@ -0,0 +1,215 @@ +# --------------------------------------------------------------------- +# Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. +# SPDX-License-Identifier: BSD-3-Clause +# --------------------------------------------------------------------- +# THIS FILE WAS AUTO-GENERATED. DO NOT EDIT MANUALLY. 
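+#
+# Note (illustrative, following the repository's convention for export scripts):
+# this module can typically be invoked from the command line as
+#   python -m qai_hub_models.models.posenet_mobilenet.export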
+ + +from __future__ import annotations + +import os +import warnings +from pathlib import Path +from typing import Any, Dict, List, Optional, Tuple, cast + +import qai_hub as hub +import torch + +from qai_hub_models.models.posenet_mobilenet import Model +from qai_hub_models.utils.args import ( + export_parser, + get_input_spec_kwargs, + get_model_kwargs, +) +from qai_hub_models.utils.base_model import TargetRuntime +from qai_hub_models.utils.compare import torch_inference +from qai_hub_models.utils.input_spec import make_torch_inputs +from qai_hub_models.utils.printing import ( + print_inference_metrics, + print_on_target_demo_cmd, + print_profile_metrics_from_job, +) +from qai_hub_models.utils.qai_hub_helpers import ( + can_access_qualcomm_ai_hub, + export_without_hub_access, + transpose_channel_first_to_last, +) + + +def export_model( + device: str = "Samsung Galaxy S23", + chipset: Optional[str] = None, + skip_profiling: bool = False, + skip_inferencing: bool = False, + skip_downloading: bool = False, + skip_summary: bool = False, + output_dir: Optional[str] = None, + target_runtime: TargetRuntime = TargetRuntime.TFLITE, + compile_options: str = "", + profile_options: str = "", + **additional_model_kwargs, +) -> Tuple[hub.CompileJob, Optional[hub.ProfileJob], Optional[hub.InferenceJob]] | List[ + str +]: + """ + This function accomplishes 6 main tasks: + + 1. Instantiates a PyTorch model and converts it to a traced TorchScript format. + 2. Compiles the model to an asset that can be run on device. + 3. Profiles the model performance on real devices. + 4. Inferences the model on sample inputs. + 5. Downloads the model asset to the local directory. + 6. Summarizes the results from profiling and inference. + + Each of the last four steps can be optionally skipped using the input options. + + Parameters: + device: Device for which to export the model. + Full list of available devices can be found by running `hub.get_devices()`. + Defaults to DEFAULT_DEVICE if not specified. + chipset: If set, will choose a random device with this chipset. + Overrides the `device` argument. + skip_profiling: If set, skips profiling of compiled model on real devices. + skip_inferencing: If set, skips computing on-device outputs from sample data. + skip_downloading: If set, skips downloading of compiled model. + skip_summary: If set, skips waiting for and summarizing results + from profiling and inference. + output_dir: Directory to store generated assets (e.g. compiled model). + Defaults to `/build/`. + target_runtime: Which on-device runtime to target. Default is TFLite. + compile_options: Additional options to pass when submitting the compile job. + profile_options: Additional options to pass when submitting the profile job. + **additional_model_kwargs: Additional optional kwargs used to customize + `model_cls.from_pretrained` and `model.get_input_spec` + + Returns: + A 3-tuple of: + * A CompileJob object containing metadata about the compile job submitted to hub. + * A ProfileJob containing metadata about the profile job (None if profiling skipped). + * An InferenceJob containing metadata about the inference job (None if inferencing skipped). 
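+
+    Example (illustrative only; assumes Qualcomm AI Hub access is configured):
+        export_model(device="Samsung Galaxy S23", skip_profiling=True)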
+ """ + model_name = "posenet_mobilenet" + output_path = Path(output_dir or Path.cwd() / "build" / model_name) + if chipset: + hub_device = hub.Device(attributes=f"chipset:{chipset}") + else: + hub_device = hub.Device(name=device) + if not can_access_qualcomm_ai_hub(): + return export_without_hub_access( + "posenet_mobilenet", + "Posenet-Mobilenet", + device, + skip_profiling, + skip_inferencing, + skip_downloading, + skip_summary, + output_path, + target_runtime, + compile_options, + profile_options, + ) + + # 1. Initialize PyTorch model + model = Model.from_pretrained(**get_model_kwargs(Model, additional_model_kwargs)) + input_spec = model.get_input_spec( + **get_input_spec_kwargs(model, additional_model_kwargs) + ) + + # Trace the model + model.eval() + source_model = torch.jit.trace(model.to("cpu"), make_torch_inputs(input_spec)) + + # Convert outputs from channel last to channel first (preferred I/O format for QNN and TensorFlow Lite) + channel_last_flags = ( + " --force_channel_last_input image" + if target_runtime != TargetRuntime.ORT + else "" + ) + + # 2. Compile the model to an on-device asset + model_compile_options = model.get_hub_compile_options( + target_runtime, compile_options + channel_last_flags, hub_device + ) + print(f"Optimizing model {model_name} to run on-device") + submitted_compile_job = hub.submit_compile_job( + model=source_model, + input_specs=input_spec, + device=hub_device, + name=model_name, + options=model_compile_options, + ) + compile_job = cast(hub.client.CompileJob, submitted_compile_job) + + # 3. Profile the model asset on real devices + profile_job: Optional[hub.client.ProfileJob] = None + if not skip_profiling: + profile_options_all = model.get_hub_profile_options( + target_runtime, profile_options + ) + print(f"Profiling model {model_name} on a hosted device.") + submitted_profile_job = hub.submit_profile_job( + model=compile_job.get_target_model(), + device=hub_device, + name=model_name, + options=profile_options_all, + ) + profile_job = cast(hub.client.ProfileJob, submitted_profile_job) + + # 4. Run inference on-device with sample inputs + inference_job: Optional[hub.client.InferenceJob] = None + if not skip_inferencing: + profile_options_all = model.get_hub_profile_options( + target_runtime, profile_options + ) + print( + f"Running inference for {model_name} on a hosted device with example inputs." + ) + sample_inputs = model.sample_inputs(input_spec) + # Convert inputs from channel first to channel last + hub_inputs = ( + sample_inputs + if target_runtime == TargetRuntime.ORT + else transpose_channel_first_to_last("image", sample_inputs, target_runtime) + ) + submitted_inference_job = hub.submit_inference_job( + model=compile_job.get_target_model(), + inputs=hub_inputs, + device=hub_device, + name=model_name, + options=profile_options_all, + ) + inference_job = cast(hub.client.InferenceJob, submitted_inference_job) + + # 5. Download the model asset to a local file + if not skip_downloading: + os.makedirs(output_path, exist_ok=True) + target_model: hub.Model = compile_job.get_target_model() # type: ignore + target_model.download(str(output_path / f"{model_name}.tflite")) + + # 6. 
Summarize the results from profiling and inference + if not skip_summary and not skip_profiling: + assert profile_job is not None and profile_job.wait().success + profile_data: Dict[str, Any] = profile_job.download_profile() # type: ignore + print_profile_metrics_from_job(profile_job, profile_data) + + if not skip_summary and not skip_inferencing: + torch_out = torch_inference(model, sample_inputs) + assert inference_job is not None and inference_job.wait().success + inference_result: hub.client.DatasetEntries = inference_job.download_output_data() # type: ignore + print_inference_metrics(inference_job, inference_result, torch_out) + + if not skip_summary: + print_on_target_demo_cmd(compile_job, Path(__file__).parent.resolve(), device) + + return (compile_job, profile_job, inference_job) + + +def main(): + warnings.filterwarnings("ignore") + parser = export_parser(model_cls=Model) + args = parser.parse_args() + export_model(**vars(args)) + + +if __name__ == "__main__": + main() diff --git a/qai_hub_models/models/posenet_mobilenet/info.yaml b/qai_hub_models/models/posenet_mobilenet/info.yaml new file mode 100644 index 00000000..beaeaabb --- /dev/null +++ b/qai_hub_models/models/posenet_mobilenet/info.yaml @@ -0,0 +1,39 @@ +name: Posenet-Mobilenet +# id must match with the model dir name in qai_hub_models +id: posenet_mobilenet +status: public +headline: Perform accurate human pose estimation. +domain: Computer Vision +use_case: Pose Estimation +description: Posenet performs pose estimation on human images. +tags: [] +research_paper: https://arxiv.org/abs/1803.08225 +research_paper_title: 'PersonLab: Person Pose Estimation and Instance Segmentation + with a Bottom-Up, Part-Based, Geometric Embedding Model' +license: https://github.com/rwightman/posenet-pytorch/blob/master/LICENSE.txt +deploy_license: + https://qaihub-public-assets.s3.us-west-2.amazonaws.com/qai-hub-models/Qualcomm+AI+Hub+Proprietary+License.pdf +source_repo: https://github.com/rwightman/posenet-pytorch +technical_details: + Model checkpoint: mobilenet_v1_101 + Input resolution: 257x193 + Number of parameters: 3.31M + Model size: 12.7 MB +applicable_scenarios: + - Injury prevention training + - Sports performance analysis + - Posture recognition +form_factors: + - Phone + - Tablet + - IoT +related_models: + - litehrnet + - openpose + - hrnet_pose +has_static_banner: yes +has_animated_banner: yes +license_type: other +deploy_license_type: AI Model Hub License +dataset: + - coco diff --git a/qai_hub_models/models/posenet_mobilenet/model.py b/qai_hub_models/models/posenet_mobilenet/model.py new file mode 100644 index 00000000..c4f17782 --- /dev/null +++ b/qai_hub_models/models/posenet_mobilenet/model.py @@ -0,0 +1,81 @@ +# --------------------------------------------------------------------- +# Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. 
+# SPDX-License-Identifier: BSD-3-Clause +# --------------------------------------------------------------------- +from __future__ import annotations + +import os +from pathlib import Path + +import torch.nn as nn +import torch.nn.functional as F + +from qai_hub_models.models.common import SampleInputsType +from qai_hub_models.utils.asset_loaders import ( + CachedWebModelAsset, + SourceAsRoot, + load_numpy, +) +from qai_hub_models.utils.base_model import BaseModel +from qai_hub_models.utils.input_spec import InputSpec + +MODEL_ID = __name__.split(".")[-2] +MODEL_ASSET_VERSION = 2 +SOURCE_REPOSITORY = "https://github.com/rwightman/posenet-pytorch" +COMMIT_HASH = "6f7376d47683553b99d6b67734bc8b368dbcda73" +SAMPLE_INPUTS = CachedWebModelAsset.from_asset_store( + MODEL_ID, MODEL_ASSET_VERSION, "posenet_inputs.npy" +) +DEFAULT_MODEL_WEIGHTS = CachedWebModelAsset.from_asset_store( + MODEL_ID, MODEL_ASSET_VERSION, "mobilenet_v1_101.pth" +) +OUTPUT_STRIDE = 16 + + +class PosenetMobilenet(BaseModel): + def __init__(self, model: nn.Module) -> None: + super().__init__() + self.model = model + + @classmethod + def from_pretrained( + cls, + model_id: int = 101, + ) -> PosenetMobilenet: + with SourceAsRoot( + SOURCE_REPOSITORY, + COMMIT_HASH, + MODEL_ID, + MODEL_ASSET_VERSION, + ) as repo_path: + # Built in weights downloading is sometimes flaky. + # Download default weights from Qualcomm AWS + ckpt_path = Path(repo_path) / "_models" / DEFAULT_MODEL_WEIGHTS.path().name + if not ckpt_path.exists(): + DEFAULT_MODEL_WEIGHTS.fetch() + os.makedirs(ckpt_path.parent, exist_ok=True) + os.symlink(DEFAULT_MODEL_WEIGHTS.path(), ckpt_path) + + import posenet + + model = posenet.load_model(model_id) + + return cls(model).eval() + + def forward(self, image): + """ + Image inputs are expected to be in RGB format in the range [0, 1]. + """ + raw_output = self.model(image * 2.0 - 1.0) + max_vals = F.max_pool2d(raw_output[0], 3, stride=1, padding=1) + return (*raw_output, max_vals) + + @staticmethod + def get_input_spec( + height: int = 513, + width: int = 257, + ) -> InputSpec: + return {"image": ((1, 3, height, width), "float32")} + + def sample_inputs(self, input_spec: InputSpec | None = None) -> SampleInputsType: + return {"image": [load_numpy(SAMPLE_INPUTS)]} diff --git a/qai_hub_models/models/posenet_mobilenet/test.py b/qai_hub_models/models/posenet_mobilenet/test.py new file mode 100644 index 00000000..889ec641 --- /dev/null +++ b/qai_hub_models/models/posenet_mobilenet/test.py @@ -0,0 +1,53 @@ +# --------------------------------------------------------------------- +# Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. 
+# SPDX-License-Identifier: BSD-3-Clause +# --------------------------------------------------------------------- +import numpy as np + +from qai_hub_models.models.posenet_mobilenet.app import PosenetApp +from qai_hub_models.models.posenet_mobilenet.demo import IMAGE_ADDRESS +from qai_hub_models.models.posenet_mobilenet.demo import main as demo_main +from qai_hub_models.models.posenet_mobilenet.model import ( + MODEL_ASSET_VERSION, + MODEL_ID, + PosenetMobilenet, +) +from qai_hub_models.utils.asset_loaders import ( + CachedWebModelAsset, + load_image, + load_numpy, +) +from qai_hub_models.utils.testing import skip_clone_repo_check + +KEYPOINT_SCORES_GT = CachedWebModelAsset.from_asset_store( + MODEL_ID, MODEL_ASSET_VERSION, "keypoint_scores_gt.npy" +) +KEYPOINT_COORDS_GT = CachedWebModelAsset.from_asset_store( + MODEL_ID, MODEL_ASSET_VERSION, "keypoint_coords_gt.npy" +) + + +@skip_clone_repo_check +def test_task(): + image = load_image(IMAGE_ADDRESS) + model = PosenetMobilenet.from_pretrained() + h, w = PosenetMobilenet.get_input_spec()["image"][0][2:] + app = PosenetApp(model, h, w) + pose_scores, keypoint_scores, keypoint_coords = app.predict(image, raw_output=True) + + assert pose_scores[0] >= 0.5 + assert pose_scores[1] >= 0.5 + for score in pose_scores[2:]: + assert score < 1e-4 + + np.testing.assert_allclose( + keypoint_scores[:2], load_numpy(KEYPOINT_SCORES_GT), atol=1e-4 + ) + np.testing.assert_allclose( + keypoint_coords[:2], load_numpy(KEYPOINT_COORDS_GT), atol=1e-4 + ) + + +@skip_clone_repo_check +def test_demo(): + demo_main(is_test=True) diff --git a/qai_hub_models/models/protocols.py b/qai_hub_models/models/protocols.py index adfdcf5e..1d79a391 100644 --- a/qai_hub_models/models/protocols.py +++ b/qai_hub_models/models/protocols.py @@ -20,9 +20,9 @@ from abc import abstractmethod from pathlib import Path -from typing import Any, Protocol, Type, TypeVar, runtime_checkable +from typing import Any, List, Optional, Protocol, Type, TypeVar, runtime_checkable -from qai_hub.client import DatasetEntries, SourceModel +from qai_hub.client import DatasetEntries, Device, SourceModel from qai_hub_models.evaluators.base_evaluators import BaseEvaluator, _DataLoader from qai_hub_models.models.common import ( @@ -200,6 +200,8 @@ def convert_to_hub_source_model( output_path: str | Path, input_spec: InputSpec | None = None, check_trace: bool = True, + external_onnx_weights: bool = False, + output_names: Optional[List[str]] = None, ) -> SourceModel: ... @@ -207,6 +209,7 @@ def get_hub_compile_options( self, target_runtime: TargetRuntime, other_compile_options: str = "", + device: Optional[Device] = None, ) -> str: """ AI Hub compile options recommended for the model. diff --git a/qai_hub_models/models/quicksrnetlarge/README.md b/qai_hub_models/models/quicksrnetlarge/README.md index af704650..12c61b60 100644 --- a/qai_hub_models/models/quicksrnetlarge/README.md +++ b/qai_hub_models/models/quicksrnetlarge/README.md @@ -14,6 +14,8 @@ accross various devices, can be found [here](https://aihub.qualcomm.com/models/q a hosted Qualcomm® device. 
+ + ## Example & Usage diff --git a/qai_hub_models/models/quicksrnetlarge/export.py b/qai_hub_models/models/quicksrnetlarge/export.py index c25dbb5a..4ea18e1d 100644 --- a/qai_hub_models/models/quicksrnetlarge/export.py +++ b/qai_hub_models/models/quicksrnetlarge/export.py @@ -120,12 +120,16 @@ def export_model( model.eval() source_model = torch.jit.trace(model.to("cpu"), make_torch_inputs(input_spec)) + # Convert outputs from channel last to channel first (preferred I/O format for QNN and TensorFlow Lite) + channel_last_flags = ( + " --force_channel_last_input image" + " --force_channel_last_output output_0" + if target_runtime != TargetRuntime.ORT + else "" + ) + # 2. Compile the model to an on-device asset model_compile_options = model.get_hub_compile_options( - target_runtime, - compile_options - + " --force_channel_last_input image" - + " --force_channel_last_output output_0", + target_runtime, compile_options + channel_last_flags, hub_device ) print(f"Optimizing model {model_name} to run on-device") submitted_compile_job = hub.submit_compile_job( @@ -163,8 +167,10 @@ def export_model( ) sample_inputs = model.sample_inputs(input_spec) # Convert inputs from channel first to channel last - hub_inputs = transpose_channel_first_to_last( - "image", sample_inputs, target_runtime + hub_inputs = ( + sample_inputs + if target_runtime == TargetRuntime.ORT + else transpose_channel_first_to_last("image", sample_inputs, target_runtime) ) submitted_inference_job = hub.submit_inference_job( model=compile_job.get_target_model(), @@ -192,8 +198,12 @@ def export_model( assert inference_job is not None and inference_job.wait().success inference_result: hub.client.DatasetEntries = inference_job.download_output_data() # type: ignore # Convert outputs from channel last to channel first - inference_result = transpose_channel_last_to_first( - "output_0", inference_result, target_runtime + inference_result = ( + inference_result + if target_runtime == TargetRuntime.ORT + else transpose_channel_last_to_first( + "output_0", inference_result, target_runtime + ) ) print_inference_metrics(inference_job, inference_result, torch_out) @@ -205,7 +215,7 @@ def export_model( def main(): warnings.filterwarnings("ignore") - parser = export_parser(model_cls=Model, supports_ort=False) + parser = export_parser(model_cls=Model) args = parser.parse_args() export_model(**vars(args)) diff --git a/qai_hub_models/models/quicksrnetlarge/model.py b/qai_hub_models/models/quicksrnetlarge/model.py index bfed7f6c..bac993cc 100644 --- a/qai_hub_models/models/quicksrnetlarge/model.py +++ b/qai_hub_models/models/quicksrnetlarge/model.py @@ -57,7 +57,7 @@ def from_pretrained(cls) -> QuickSRNetLarge: def get_evaluator(self) -> BaseEvaluator: return SuperResolutionOutputEvaluator() - def forward(self, image: torch.Tensor) -> torch.Tensor: + def forward(self, image): """ Run QuickSRNet-Large on `image`, and produce an upscaled image diff --git a/qai_hub_models/models/quicksrnetlarge/perf.yaml b/qai_hub_models/models/quicksrnetlarge/perf.yaml index c94c1aaa..08f17738 100644 --- a/qai_hub_models/models/quicksrnetlarge/perf.yaml +++ b/qai_hub_models/models/quicksrnetlarge/perf.yaml @@ -22,6 +22,7 @@ aggregated: - Samsung Galaxy S24 Ultra - Samsung Galaxy S24+ - Samsung Galaxy Tab S8 + - Snapdragon X Elite CRD - Xiaomi 12 - Xiaomi 12 Pro supported_chipsets: @@ -30,15 +31,16 @@ aggregated: - Snapdragon® 8 Gen 2 - Snapdragon® 8 Gen 3 - Snapdragon® 888 + - Snapdragon® X Elite models: - name: QuickSRNetLarge performance_metrics: - 
torchscript_onnx_tflite: - inference_time: 2492.0 - throughput: 401.2841091492777 + inference_time: 2434.0 + throughput: 410.84634346754314 estimated_peak_memory_range: - min: 16384 - max: 8350520 + min: 24576 + max: 1530712 primary_compute_unit: NPU precision: fp16 layer_info: @@ -46,14 +48,14 @@ models: layers_on_gpu: 0 layers_on_cpu: 3 total_layers: 31 - job_id: j1p801z8g + job_id: j7gjln9xp job_status: Passed torchscript_onnx_qnn: - inference_time: 2101.0 - throughput: 475.9638267491671 + inference_time: 2102.0 + throughput: 475.7373929590866 estimated_peak_memory_range: - min: 225280 - max: 5584760 + min: 16384 + max: 6719848 primary_compute_unit: NPU precision: fp16 layer_info: @@ -61,22 +63,22 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 31 - job_id: jn5qev3m5 + job_id: jz5w96j6p job_status: Passed torchscript_onnx_ort: - inference_time: 2738.0 - throughput: 365.23009495982467 + inference_time: 2677.0 + throughput: 373.55248412401943 estimated_peak_memory_range: - min: 12288 - max: 5692928 + min: 28672 + max: 47131704 primary_compute_unit: NPU precision: fp16 layer_info: - layers_on_npu: 1 + layers_on_npu: 33 layers_on_gpu: 0 layers_on_cpu: 0 - total_layers: 1 - job_id: jw56ewn7g + total_layers: 33 + job_id: jz57drql5 job_status: Passed reference_device_info: name: Samsung Galaxy S23 @@ -85,13 +87,13 @@ models: os_name: Android manufacturer: Samsung chipset: Snapdragon® 8 Gen 2 - timestamp: '2024-04-23T18:42:32.816274Z' + timestamp: '2024-05-20T16:35:30.164349Z' - torchscript_onnx_tflite: - inference_time: 1917.0 - throughput: 521.6484089723526 + inference_time: 1778.0 + throughput: 562.429696287964 estimated_peak_memory_range: min: 16384 - max: 28332832 + max: 28468960 primary_compute_unit: NPU precision: fp16 layer_info: @@ -99,14 +101,14 @@ models: layers_on_gpu: 0 layers_on_cpu: 3 total_layers: 31 - job_id: jogk783op + job_id: jlpevmq15 job_status: Passed torchscript_onnx_qnn: - inference_time: 1500.0 - throughput: 666.6666666666666 + inference_time: 1506.0 + throughput: 664.0106241699867 estimated_peak_memory_range: - min: 208896 - max: 17648384 + min: 204800 + max: 21459584 primary_compute_unit: NPU precision: fp16 layer_info: @@ -114,22 +116,22 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 31 - job_id: j1gl6l3lg + job_id: jmg94n6l5 job_status: Passed torchscript_onnx_ort: - inference_time: 1897.0 - throughput: 527.1481286241434 + inference_time: 1850.0 + throughput: 540.5405405405405 estimated_peak_memory_range: min: 212992 - max: 19230192 + max: 18821168 primary_compute_unit: NPU precision: fp16 layer_info: - layers_on_npu: 1 + layers_on_npu: 33 layers_on_gpu: 0 layers_on_cpu: 0 - total_layers: 1 - job_id: j1p3v6ezg + total_layers: 33 + job_id: jqp4wrzvg job_status: Passed reference_device_info: name: Samsung Galaxy S24 @@ -138,13 +140,13 @@ models: os_name: Android manufacturer: Samsung chipset: Snapdragon® 8 Gen 3 - timestamp: '2024-04-23T18:42:32.816325Z' + timestamp: '2024-05-20T16:35:30.164374Z' - torchscript_onnx_tflite: - inference_time: 2485.0 - throughput: 402.4144869215292 + inference_time: 2448.0 + throughput: 408.4967320261438 estimated_peak_memory_range: - min: 32768 - max: 1755936 + min: 16384 + max: 7574720 primary_compute_unit: NPU precision: fp16 layer_info: @@ -152,14 +154,14 @@ models: layers_on_gpu: 0 layers_on_cpu: 3 total_layers: 31 - job_id: j1p80kexg + job_id: jygz7d6kp job_status: Passed torchscript_onnx_qnn: inference_time: 2097.0 throughput: 476.87172150691464 estimated_peak_memory_range: - min: 225280 - max: 13035320 + 
min: 212992 + max: 78311448 primary_compute_unit: NPU precision: fp16 layer_info: @@ -167,7 +169,7 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 31 - job_id: jw56e080g + job_id: jvgdv1jeg job_status: Passed reference_device_info: name: QCS8550 (Proxy) @@ -176,4 +178,57 @@ models: os_name: Android manufacturer: Qualcomm chipset: Qcs8550 - timestamp: '2024-04-23T18:42:32.816351Z' + timestamp: '2024-05-20T16:35:30.164392Z' + - torchscript_onnx_qnn: + inference_time: 2961.0 + throughput: 337.7237419790611 + estimated_peak_memory_range: + min: 217088 + max: 217088 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 31 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 31 + job_id: jnp18zr2g + job_status: Passed + torchscript_onnx_ort: + inference_time: 2660.0 + throughput: 375.9398496240602 + estimated_peak_memory_range: + min: 13025280 + max: 13025280 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 33 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 33 + job_id: j0px1ow1g + job_status: Passed + torchscript_onnx_ort_dml_gpu: + inference_time: 14976.0 + throughput: 66.77350427350427 + estimated_peak_memory_range: + min: 31150080 + max: 31150080 + primary_compute_unit: CPU + precision: fp32 + layer_info: + layers_on_npu: 0 + layers_on_gpu: 0 + layers_on_cpu: 15 + total_layers: 15 + job_id: jo5mzxjwp + job_status: Passed + reference_device_info: + name: Snapdragon X Elite CRD + os: '11' + form_factor: Compute + os_name: Windows + manufacturer: Qualcomm + chipset: Snapdragon® X Elite + timestamp: '2024-05-20T16:35:30.164414Z' diff --git a/qai_hub_models/models/quicksrnetlarge_quantized/README.md b/qai_hub_models/models/quicksrnetlarge_quantized/README.md index 23624f4c..f749b985 100644 --- a/qai_hub_models/models/quicksrnetlarge_quantized/README.md +++ b/qai_hub_models/models/quicksrnetlarge_quantized/README.md @@ -14,6 +14,8 @@ accross various devices, can be found [here](https://aihub.qualcomm.com/models/q a hosted Qualcomm® device. + + ## Example & Usage diff --git a/qai_hub_models/models/quicksrnetlarge_quantized/export.py b/qai_hub_models/models/quicksrnetlarge_quantized/export.py index 4cb7e308..125242cf 100644 --- a/qai_hub_models/models/quicksrnetlarge_quantized/export.py +++ b/qai_hub_models/models/quicksrnetlarge_quantized/export.py @@ -124,12 +124,16 @@ def export_model( else: quant_calibration_data = model.get_calibration_data(target_runtime, input_spec) + # Convert outputs from channel last to channel first (preferred I/O format for QNN and TensorFlow Lite) + channel_last_flags = ( + " --force_channel_last_input image" + " --force_channel_last_output output_0" + if target_runtime != TargetRuntime.ORT + else "" + ) + # 2. 
Compile the model to an on-device asset model_compile_options = model.get_hub_compile_options( - target_runtime, - compile_options - + " --force_channel_last_input image" - + " --force_channel_last_output output_0", + target_runtime, compile_options + channel_last_flags, hub_device ) print(f"Optimizing model {model_name} to run on-device") submitted_compile_job = hub.submit_compile_job( @@ -171,8 +175,10 @@ def export_model( if target_runtime == TargetRuntime.QNN: hub_inputs = get_qnn_inputs(compile_job, sample_inputs) # Convert inputs from channel first to channel last - hub_inputs = transpose_channel_first_to_last( - "image", sample_inputs, target_runtime + hub_inputs = ( + sample_inputs + if target_runtime == TargetRuntime.ORT + else transpose_channel_first_to_last("image", sample_inputs, target_runtime) ) submitted_inference_job = hub.submit_inference_job( model=compile_job.get_target_model(), @@ -200,8 +206,12 @@ def export_model( assert inference_job is not None and inference_job.wait().success inference_result: hub.client.DatasetEntries = inference_job.download_output_data() # type: ignore # Convert outputs from channel last to channel first - inference_result = transpose_channel_last_to_first( - "output_0", inference_result, target_runtime + inference_result = ( + inference_result + if target_runtime == TargetRuntime.ORT + else transpose_channel_last_to_first( + "output_0", inference_result, target_runtime + ) ) print_inference_metrics(inference_job, inference_result, torch_out) @@ -213,7 +223,7 @@ def export_model( def main(): warnings.filterwarnings("ignore") - parser = export_parser(model_cls=Model, supports_qnn=False, supports_ort=False) + parser = export_parser(model_cls=Model) args = parser.parse_args() export_model(**vars(args)) diff --git a/qai_hub_models/models/quicksrnetlarge_quantized/model.py b/qai_hub_models/models/quicksrnetlarge_quantized/model.py index e9185b68..b1541f6d 100644 --- a/qai_hub_models/models/quicksrnetlarge_quantized/model.py +++ b/qai_hub_models/models/quicksrnetlarge_quantized/model.py @@ -8,30 +8,24 @@ # This verifies aimet is installed, and this must be included first. 
from qai_hub_models.utils.quantization_aimet import ( AIMETQuantizableMixin, + constrain_quantized_inputs_to_image_range, ) # isort: on import torch from aimet_torch.cross_layer_equalization import equalize_model +from aimet_torch.model_preparer import prepare_model from aimet_torch.quantsim import QuantizationSimModel, load_encodings_to_sim -from qai_hub_models.models.common import SourceModelFormat, TargetRuntime from qai_hub_models.models.quicksrnetlarge.model import QuickSRNetLarge -from qai_hub_models.utils.aimet.config_loader import get_default_aimet_config_legacy_v2 +from qai_hub_models.utils.aimet.config_loader import get_default_aimet_config from qai_hub_models.utils.asset_loaders import CachedWebModelAsset MODEL_ID = __name__.split(".")[-2] -MODEL_ASSET_VERSION = 2 +MODEL_ASSET_VERSION = 3 -# Weights and config stored in S3 are sourced from -# https://github.com/quic/aimet-model-zoo/blob/develop/aimet_zoo_torch/quicksrnet/model/model_cards/quicksrnet_large_4x_w8a8.json: -# https://github.com/quic/aimet-model-zoo/releases/download/phase_2_january_artifacts/quicksrnet_large_4x_checkpoint_int8.pth -# and -# https://raw.githubusercontent.com/quic/aimet/release-aimet-1.23/TrainingExtensions/common/src/python/aimet_common/quantsim_config/default_config_per_channel.js -# Encodings were generated with AIMET QuantSim library -QUANTIZED_WEIGHTS = "quicksrnet_large_4x_checkpoint_int8.pth" -AIMET_ENCODINGS = "aimet_quantization_encodings.json" +DEFAULT_ENCODINGS = "quicksrnetlarge_quantized_encodings.json" SCALING_FACTOR = 4 @@ -46,9 +40,7 @@ def __init__( quicksrnet_model: QuantizationSimModel, ) -> None: QuickSRNetLarge.__init__(self, quicksrnet_model.model) - AIMETQuantizableMixin.__init__( - self, quicksrnet_model, needs_onnx_direct_aimet_export=True - ) + AIMETQuantizableMixin.__init__(self, quicksrnet_model) @classmethod def from_pretrained( @@ -63,46 +55,27 @@ def from_pretrained( else: Interprets as a filepath and loads the encodings stored there. """ # Load Model - quicksrnet = QuickSRNetLarge.from_pretrained() - input_shape = quicksrnet.get_input_spec()["image"][0] - equalize_model(quicksrnet, input_shape) - - # Download weights and quantization parameters - weights = CachedWebModelAsset.from_asset_store( - MODEL_ID, MODEL_ASSET_VERSION, QUANTIZED_WEIGHTS - ).fetch() - aimet_config = get_default_aimet_config_legacy_v2() + fp16_model = QuickSRNetLarge.from_pretrained() + input_shape = cls.get_input_spec()["image"][0] + model = prepare_model(fp16_model) + equalize_model(model, input_shape) - # Load the model weights and quantization parameters - # In this particular instance, the state_dict keys from the model are all named "model." - # where is the name of each key in the weights file - without the word model. - # We rename all the keys to add the word model - state_dict = torch.load(weights, map_location=torch.device("cpu"))["state_dict"] - new_state_dict = {"model." 
+ key: value for key, value in state_dict.items()} - quicksrnet.load_state_dict(new_state_dict) sim = QuantizationSimModel( - quicksrnet, + model, quant_scheme="tf_enhanced", default_param_bw=8, default_output_bw=8, - config_file=aimet_config, + config_file=get_default_aimet_config(), dummy_input=torch.rand(input_shape), ) + constrain_quantized_inputs_to_image_range(sim) if aimet_encodings: if aimet_encodings == "DEFAULT": aimet_encodings = CachedWebModelAsset.from_asset_store( - MODEL_ID, MODEL_ASSET_VERSION, AIMET_ENCODINGS + MODEL_ID, MODEL_ASSET_VERSION, DEFAULT_ENCODINGS ).fetch() load_encodings_to_sim(sim, aimet_encodings) sim.model.eval() return cls(sim) - - def preferred_hub_source_model_format( - self, target_runtime: TargetRuntime - ) -> SourceModelFormat: - if target_runtime == TargetRuntime.QNN: - return SourceModelFormat.ONNX - else: - return SourceModelFormat.TORCHSCRIPT diff --git a/qai_hub_models/models/quicksrnetlarge_quantized/perf.yaml b/qai_hub_models/models/quicksrnetlarge_quantized/perf.yaml index 5108803d..6ba5074a 100644 --- a/qai_hub_models/models/quicksrnetlarge_quantized/perf.yaml +++ b/qai_hub_models/models/quicksrnetlarge_quantized/perf.yaml @@ -26,6 +26,7 @@ aggregated: - Samsung Galaxy S24 Ultra - Samsung Galaxy S24+ - Samsung Galaxy Tab S8 + - Snapdragon X Elite CRD - Xiaomi 12 - Xiaomi 12 Pro supported_chipsets: @@ -36,15 +37,16 @@ aggregated: - Snapdragon® 8 Gen 2 - Snapdragon® 8 Gen 3 - Snapdragon® 888 + - Snapdragon® X Elite models: - name: QuickSRNetLarge-Quantized performance_metrics: - torchscript_onnx_tflite: - inference_time: 1512.0 - throughput: 661.3756613756614 + inference_time: 1340.0 + throughput: 746.2686567164179 estimated_peak_memory_range: - min: 20480 - max: 1404424 + min: 16384 + max: 1701800 primary_compute_unit: NPU precision: int8 layer_info: @@ -52,7 +54,37 @@ models: layers_on_gpu: 0 layers_on_cpu: 3 total_layers: 31 - job_id: j1pv07vm5 + job_id: jegnevjrg + job_status: Passed + torchscript_onnx_qnn: + inference_time: 1156.0 + throughput: 865.0519031141869 + estimated_peak_memory_range: + min: 16384 + max: 8330600 + primary_compute_unit: NPU + precision: int8 + layer_info: + layers_on_npu: 19 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 19 + job_id: jqpyd397p + job_status: Passed + torchscript_onnx_ort: + inference_time: 1455.0 + throughput: 687.2852233676975 + estimated_peak_memory_range: + min: 212992 + max: 8065904 + primary_compute_unit: NPU + precision: int8 + layer_info: + layers_on_npu: 24 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 24 + job_id: jn5q2qj45 job_status: Passed reference_device_info: name: Samsung Galaxy S23 @@ -61,13 +93,13 @@ models: os_name: Android manufacturer: Samsung chipset: Snapdragon® 8 Gen 2 - timestamp: '2024-04-23T18:42:32.840385Z' + timestamp: '2024-05-20T16:35:30.195379Z' - torchscript_onnx_tflite: - inference_time: 1167.0 - throughput: 856.898029134533 + inference_time: 996.0 + throughput: 1004.0160642570281 estimated_peak_memory_range: - min: 12288 - max: 25644128 + min: 16384 + max: 24755152 primary_compute_unit: NPU precision: int8 layer_info: @@ -75,7 +107,37 @@ models: layers_on_gpu: 0 layers_on_cpu: 3 total_layers: 31 - job_id: j7gjzqe85 + job_id: jopry3z9g + job_status: Passed + torchscript_onnx_qnn: + inference_time: 811.0 + throughput: 1233.0456226880394 + estimated_peak_memory_range: + min: 12288 + max: 18436512 + primary_compute_unit: NPU + precision: int8 + layer_info: + layers_on_npu: 19 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 19 + job_id: j2p0r0n6p + 
job_status: Passed + torchscript_onnx_ort: + inference_time: 1054.0 + throughput: 948.7666034155598 + estimated_peak_memory_range: + min: 0 + max: 16738208 + primary_compute_unit: NPU + precision: int8 + layer_info: + layers_on_npu: 24 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 24 + job_id: j1glkmj8p job_status: Passed reference_device_info: name: Samsung Galaxy S24 @@ -84,73 +146,156 @@ models: os_name: Android manufacturer: Samsung chipset: Snapdragon® 8 Gen 3 - timestamp: '2024-04-23T18:42:32.840407Z' + timestamp: '2024-05-20T16:35:30.195406Z' - torchscript_onnx_tflite: - inference_time: 6024.0 - throughput: 166.00265604249668 + inference_time: 1313.0 + throughput: 761.6146230007616 estimated_peak_memory_range: - min: 40960 - max: 19668928 + min: 360448 + max: 2507680 primary_compute_unit: NPU precision: int8 layer_info: - layers_on_npu: 30 + layers_on_npu: 28 layers_on_gpu: 0 layers_on_cpu: 3 - total_layers: 33 - job_id: jz5w2ry65 + total_layers: 31 + job_id: jep2my245 + job_status: Passed + torchscript_onnx_qnn: + inference_time: 1162.0 + throughput: 860.5851979345955 + estimated_peak_memory_range: + min: 20480 + max: 11496296 + primary_compute_unit: NPU + precision: int8 + layer_info: + layers_on_npu: 19 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 19 + job_id: jogkyxj2p job_status: Passed reference_device_info: - name: RB3 Gen 2 (Proxy) + name: QCS8550 (Proxy) os: '12' form_factor: Iot os_name: Android manufacturer: Qualcomm - chipset: Qcs6490 - timestamp: '2024-04-23T18:42:32.840436Z' + chipset: Qcs8550 + timestamp: '2024-05-20T16:35:30.195422Z' - torchscript_onnx_tflite: - inference_time: 41995.0 - throughput: 23.81235861412073 + inference_time: 4195.0 + throughput: 238.37902264600714 estimated_peak_memory_range: - min: 1863680 - max: 4699224 + min: 45056 + max: 18644448 primary_compute_unit: NPU precision: int8 layer_info: - layers_on_npu: 30 + layers_on_npu: 28 layers_on_gpu: 0 layers_on_cpu: 3 - total_layers: 33 - job_id: j1p31omxg + total_layers: 31 + job_id: jz5wqzvm5 job_status: Passed + torchscript_onnx_qnn: + inference_time: 'null' + throughput: 'null' + estimated_peak_memory_range: + min: 0 + max: 0 + primary_compute_unit: 'null' + precision: 'null' + layer_info: + layers_on_npu: 0 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 0 + job_id: j0pxyrjlg + job_status: Failed reference_device_info: - name: RB5 (Proxy) + name: RB3 Gen 2 (Proxy) os: '12' form_factor: Iot os_name: Android manufacturer: Qualcomm - chipset: Qcs8250 - timestamp: '2024-04-23T18:42:32.840455Z' + chipset: Qcs6490 + timestamp: '2024-05-20T16:35:30.195441Z' - torchscript_onnx_tflite: - inference_time: 1874.0 - throughput: 533.6179295624333 + inference_time: 37890.0 + throughput: 26.392187912377935 estimated_peak_memory_range: - min: 24576 - max: 6948872 + min: 3629056 + max: 6133384 primary_compute_unit: NPU precision: int8 layer_info: - layers_on_npu: 30 + layers_on_npu: 28 layers_on_gpu: 0 layers_on_cpu: 3 - total_layers: 33 - job_id: jygzo0lk5 + total_layers: 31 + job_id: jmg9w218p job_status: Passed reference_device_info: - name: QCS8550 (Proxy) + name: RB5 (Proxy) os: '12' form_factor: Iot os_name: Android manufacturer: Qualcomm - chipset: Qcs8550 - timestamp: '2024-04-23T18:42:32.840469Z' + chipset: Qcs8250 + timestamp: '2024-05-20T16:35:30.195453Z' + - torchscript_onnx_qnn: + inference_time: 1082.0 + throughput: 924.2144177449168 + estimated_peak_memory_range: + min: 53248 + max: 53248 + primary_compute_unit: NPU + precision: int8 + layer_info: + layers_on_npu: 19 + 
layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 19 + job_id: j1p87ylx5 + job_status: Passed + torchscript_onnx_ort: + inference_time: 1417.0 + throughput: 705.7163020465773 + estimated_peak_memory_range: + min: 8822784 + max: 8822784 + primary_compute_unit: NPU + precision: int8 + layer_info: + layers_on_npu: 24 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 24 + job_id: jw5614k0p + job_status: Passed + torchscript_onnx_ort_dml_gpu: + inference_time: 212078.0 + throughput: 4.715246277313064 + estimated_peak_memory_range: + min: 29732864 + max: 29732864 + primary_compute_unit: GPU + precision: int8 + layer_info: + layers_on_npu: 0 + layers_on_gpu: 1 + layers_on_cpu: 0 + total_layers: 1 + job_id: j1p3m0ylg + job_status: Passed + reference_device_info: + name: Snapdragon X Elite CRD + os: '11' + form_factor: Compute + os_name: Windows + manufacturer: Qualcomm + chipset: Snapdragon® X Elite + timestamp: '2024-05-20T16:35:30.195475Z' diff --git a/qai_hub_models/models/quicksrnetlarge_quantized/test.py b/qai_hub_models/models/quicksrnetlarge_quantized/test.py index 32337b60..16e59332 100644 --- a/qai_hub_models/models/quicksrnetlarge_quantized/test.py +++ b/qai_hub_models/models/quicksrnetlarge_quantized/test.py @@ -3,7 +3,6 @@ # SPDX-License-Identifier: BSD-3-Clause # --------------------------------------------------------------------- import os -import tempfile import zipfile import numpy as np @@ -18,7 +17,11 @@ MODEL_ID, QuickSRNetLargeQuantizable, ) -from qai_hub_models.utils.asset_loaders import CachedWebModelAsset, load_image +from qai_hub_models.utils.asset_loaders import ( + CachedWebModelAsset, + load_image, + qaihm_temp_dir, +) from qai_hub_models.utils.testing import assert_most_close, skip_clone_repo_check OUTPUT_IMAGE_ADDRESS = CachedWebModelAsset.from_asset_store( @@ -69,7 +72,7 @@ def test_trace(): def test_aimet_export(): model = QuickSRNetLargeQuantizable.from_pretrained() name = model.__class__.__name__ - with tempfile.TemporaryDirectory() as tmpdir: + with qaihm_temp_dir() as tmpdir: output_zip = model.convert_to_onnx_and_aimet_encodings( tmpdir, ) diff --git a/qai_hub_models/models/quicksrnetmedium/README.md b/qai_hub_models/models/quicksrnetmedium/README.md index 191dd8dc..cb5b80f1 100644 --- a/qai_hub_models/models/quicksrnetmedium/README.md +++ b/qai_hub_models/models/quicksrnetmedium/README.md @@ -14,6 +14,8 @@ accross various devices, can be found [here](https://aihub.qualcomm.com/models/q a hosted Qualcomm® device. + + ## Example & Usage diff --git a/qai_hub_models/models/quicksrnetmedium/export.py b/qai_hub_models/models/quicksrnetmedium/export.py index 6c99ed79..32246017 100644 --- a/qai_hub_models/models/quicksrnetmedium/export.py +++ b/qai_hub_models/models/quicksrnetmedium/export.py @@ -120,12 +120,16 @@ def export_model( model.eval() source_model = torch.jit.trace(model.to("cpu"), make_torch_inputs(input_spec)) + # Convert outputs from channel last to channel first (preferred I/O format for QNN and TensorFlow Lite) + channel_last_flags = ( + " --force_channel_last_input image" + " --force_channel_last_output output_0" + if target_runtime != TargetRuntime.ORT + else "" + ) + # 2. 
Compile the model to an on-device asset model_compile_options = model.get_hub_compile_options( - target_runtime, - compile_options - + " --force_channel_last_input image" - + " --force_channel_last_output output_0", + target_runtime, compile_options + channel_last_flags, hub_device ) print(f"Optimizing model {model_name} to run on-device") submitted_compile_job = hub.submit_compile_job( @@ -163,8 +167,10 @@ def export_model( ) sample_inputs = model.sample_inputs(input_spec) # Convert inputs from channel first to channel last - hub_inputs = transpose_channel_first_to_last( - "image", sample_inputs, target_runtime + hub_inputs = ( + sample_inputs + if target_runtime == TargetRuntime.ORT + else transpose_channel_first_to_last("image", sample_inputs, target_runtime) ) submitted_inference_job = hub.submit_inference_job( model=compile_job.get_target_model(), @@ -192,8 +198,12 @@ def export_model( assert inference_job is not None and inference_job.wait().success inference_result: hub.client.DatasetEntries = inference_job.download_output_data() # type: ignore # Convert outputs from channel last to channel first - inference_result = transpose_channel_last_to_first( - "output_0", inference_result, target_runtime + inference_result = ( + inference_result + if target_runtime == TargetRuntime.ORT + else transpose_channel_last_to_first( + "output_0", inference_result, target_runtime + ) ) print_inference_metrics(inference_job, inference_result, torch_out) @@ -205,7 +215,7 @@ def export_model( def main(): warnings.filterwarnings("ignore") - parser = export_parser(model_cls=Model, supports_ort=False) + parser = export_parser(model_cls=Model) args = parser.parse_args() export_model(**vars(args)) diff --git a/qai_hub_models/models/quicksrnetmedium/model.py b/qai_hub_models/models/quicksrnetmedium/model.py index e050d160..abb5817a 100644 --- a/qai_hub_models/models/quicksrnetmedium/model.py +++ b/qai_hub_models/models/quicksrnetmedium/model.py @@ -35,6 +35,7 @@ def __init__( quicksrnet_model: torch.nn.Module, ) -> None: super().__init__() + self.relu = torch.nn.ReLU() self.model = quicksrnet_model @classmethod @@ -57,7 +58,7 @@ def from_pretrained(cls) -> QuickSRNetMedium: def get_evaluator(self) -> BaseEvaluator: return SuperResolutionOutputEvaluator() - def forward(self, image: torch.Tensor) -> torch.Tensor: + def forward(self, image): """ Run QuickSRNet-Medium on `image`, and produce an upscaled image @@ -71,7 +72,7 @@ def forward(self, image: torch.Tensor) -> torch.Tensor: Range: float[0, 1] 3-channel Color Space: RGB """ - + # image = self.relu(image) return self.model(image) @staticmethod diff --git a/qai_hub_models/models/quicksrnetmedium/perf.yaml b/qai_hub_models/models/quicksrnetmedium/perf.yaml index d7547648..bf8575e7 100644 --- a/qai_hub_models/models/quicksrnetmedium/perf.yaml +++ b/qai_hub_models/models/quicksrnetmedium/perf.yaml @@ -22,6 +22,7 @@ aggregated: - Samsung Galaxy S24 Ultra - Samsung Galaxy S24+ - Samsung Galaxy Tab S8 + - Snapdragon X Elite CRD - Xiaomi 12 - Xiaomi 12 Pro supported_chipsets: @@ -30,15 +31,16 @@ aggregated: - Snapdragon® 8 Gen 2 - Snapdragon® 8 Gen 3 - Snapdragon® 888 + - Snapdragon® X Elite models: - name: QuickSRNetMedium performance_metrics: - torchscript_onnx_tflite: - inference_time: 1385.0 - throughput: 722.0216606498195 + inference_time: 1388.0 + throughput: 720.4610951008646 estimated_peak_memory_range: - min: 16384 - max: 1507064 + min: 32768 + max: 1844064 primary_compute_unit: NPU precision: fp16 layer_info: @@ -46,14 +48,14 @@ models: layers_on_gpu: 0 
layers_on_cpu: 3 total_layers: 17 - job_id: jlpeeyk0p + job_id: jwgov6jx5 job_status: Passed torchscript_onnx_qnn: - inference_time: 998.0 - throughput: 1002.0040080160321 + inference_time: 1011.0 + throughput: 989.1196834817013 estimated_peak_memory_range: - min: 221184 - max: 7358048 + min: 28672 + max: 8507224 primary_compute_unit: NPU precision: fp16 layer_info: @@ -61,22 +63,22 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 17 - job_id: jz5w24qj5 + job_id: jlpevmj15 job_status: Passed torchscript_onnx_ort: - inference_time: 1500.0 - throughput: 666.6666666666666 + inference_time: 1498.0 + throughput: 667.5567423230974 estimated_peak_memory_range: - min: 212992 - max: 8597144 + min: 12288 + max: 8500872 primary_compute_unit: NPU precision: fp16 layer_info: - layers_on_npu: 1 + layers_on_npu: 19 layers_on_gpu: 0 layers_on_cpu: 0 - total_layers: 1 - job_id: jnp1y6elp + total_layers: 19 + job_id: jnp18z02g job_status: Passed reference_device_info: name: Samsung Galaxy S23 @@ -85,13 +87,13 @@ models: os_name: Android manufacturer: Samsung chipset: Snapdragon® 8 Gen 2 - timestamp: '2024-04-23T18:42:32.860888Z' + timestamp: '2024-05-20T16:35:30.235442Z' - torchscript_onnx_tflite: - inference_time: 871.0 - throughput: 1148.105625717566 + inference_time: 923.0 + throughput: 1083.4236186348862 estimated_peak_memory_range: min: 16384 - max: 19182544 + max: 19845568 primary_compute_unit: NPU precision: fp16 layer_info: @@ -99,14 +101,14 @@ models: layers_on_gpu: 0 layers_on_cpu: 3 total_layers: 17 - job_id: jygzonr65 + job_id: j1pvwkjjg job_status: Passed torchscript_onnx_qnn: - inference_time: 641.0 - throughput: 1560.0624024960998 + inference_time: 648.0 + throughput: 1543.20987654321 estimated_peak_memory_range: min: 208896 - max: 14603312 + max: 15787072 primary_compute_unit: NPU precision: fp16 layer_info: @@ -114,22 +116,22 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 17 - job_id: jmg9jdwv5 + job_id: jygz7d1kp job_status: Passed torchscript_onnx_ort: - inference_time: 1118.0 - throughput: 894.4543828264758 + inference_time: 1030.0 + throughput: 970.8737864077669 estimated_peak_memory_range: - min: 217088 - max: 15048656 + min: 0 + max: 14123616 primary_compute_unit: NPU precision: fp16 layer_info: - layers_on_npu: 1 + layers_on_npu: 19 layers_on_gpu: 0 layers_on_cpu: 0 - total_layers: 1 - job_id: jvgde2ol5 + total_layers: 19 + job_id: jvgdv1weg job_status: Passed reference_device_info: name: Samsung Galaxy S24 @@ -138,13 +140,13 @@ models: os_name: Android manufacturer: Samsung chipset: Snapdragon® 8 Gen 3 - timestamp: '2024-04-23T18:42:32.860931Z' + timestamp: '2024-05-20T16:35:30.235469Z' - torchscript_onnx_tflite: - inference_time: 1365.0 - throughput: 732.6007326007326 + inference_time: 1370.0 + throughput: 729.92700729927 estimated_peak_memory_range: min: 24576 - max: 16231088 + max: 1369376 primary_compute_unit: NPU precision: fp16 layer_info: @@ -152,14 +154,14 @@ models: layers_on_gpu: 0 layers_on_cpu: 3 total_layers: 17 - job_id: jz5708olg + job_id: j7gjlnjxp job_status: Passed torchscript_onnx_qnn: - inference_time: 1005.0 - throughput: 995.0248756218906 + inference_time: 1008.0 + throughput: 992.063492063492 estimated_peak_memory_range: min: 221184 - max: 6072368 + max: 12353904 primary_compute_unit: NPU precision: fp16 layer_info: @@ -167,7 +169,7 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 17 - job_id: jegnlw1r5 + job_id: jmg94nvl5 job_status: Passed reference_device_info: name: QCS8550 (Proxy) @@ -176,4 +178,57 @@ models: os_name: 
Android manufacturer: Qualcomm chipset: Qcs8550 - timestamp: '2024-04-23T18:42:32.860962Z' + timestamp: '2024-05-20T16:35:30.235492Z' + - torchscript_onnx_qnn: + inference_time: 1129.0 + throughput: 885.7395925597874 + estimated_peak_memory_range: + min: 217088 + max: 217088 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 17 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 17 + job_id: jz5w96o6p + job_status: Passed + torchscript_onnx_ort: + inference_time: 1559.0 + throughput: 641.4368184733804 + estimated_peak_memory_range: + min: 8896512 + max: 8896512 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 19 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 19 + job_id: jz57drzl5 + job_status: Passed + torchscript_onnx_ort_dml_gpu: + inference_time: 3368.0 + throughput: 296.91211401425176 + estimated_peak_memory_range: + min: 33103872 + max: 33103872 + primary_compute_unit: GPU + precision: fp16 + layer_info: + layers_on_npu: 0 + layers_on_gpu: 1 + layers_on_cpu: 0 + total_layers: 1 + job_id: jqp4wrqvg + job_status: Passed + reference_device_info: + name: Snapdragon X Elite CRD + os: '11' + form_factor: Compute + os_name: Windows + manufacturer: Qualcomm + chipset: Snapdragon® X Elite + timestamp: '2024-05-20T16:35:30.235514Z' diff --git a/qai_hub_models/models/quicksrnetmedium_quantized/README.md b/qai_hub_models/models/quicksrnetmedium_quantized/README.md index 732326e4..4b09275b 100644 --- a/qai_hub_models/models/quicksrnetmedium_quantized/README.md +++ b/qai_hub_models/models/quicksrnetmedium_quantized/README.md @@ -14,6 +14,8 @@ accross various devices, can be found [here](https://aihub.qualcomm.com/models/q a hosted Qualcomm® device. + + ## Example & Usage diff --git a/qai_hub_models/models/quicksrnetmedium_quantized/export.py b/qai_hub_models/models/quicksrnetmedium_quantized/export.py index fad49cad..9c4ced9f 100644 --- a/qai_hub_models/models/quicksrnetmedium_quantized/export.py +++ b/qai_hub_models/models/quicksrnetmedium_quantized/export.py @@ -124,12 +124,16 @@ def export_model( else: quant_calibration_data = model.get_calibration_data(target_runtime, input_spec) + # Convert outputs from channel last to channel first (preferred I/O format for QNN and TensorFlow Lite) + channel_last_flags = ( + " --force_channel_last_input image" + " --force_channel_last_output output_0" + if target_runtime != TargetRuntime.ORT + else "" + ) + # 2. 
Compile the model to an on-device asset model_compile_options = model.get_hub_compile_options( - target_runtime, - compile_options - + " --force_channel_last_input image" - + " --force_channel_last_output output_0", + target_runtime, compile_options + channel_last_flags, hub_device ) print(f"Optimizing model {model_name} to run on-device") submitted_compile_job = hub.submit_compile_job( @@ -171,8 +175,10 @@ def export_model( if target_runtime == TargetRuntime.QNN: hub_inputs = get_qnn_inputs(compile_job, sample_inputs) # Convert inputs from channel first to channel last - hub_inputs = transpose_channel_first_to_last( - "image", sample_inputs, target_runtime + hub_inputs = ( + sample_inputs + if target_runtime == TargetRuntime.ORT + else transpose_channel_first_to_last("image", sample_inputs, target_runtime) ) submitted_inference_job = hub.submit_inference_job( model=compile_job.get_target_model(), @@ -200,8 +206,12 @@ def export_model( assert inference_job is not None and inference_job.wait().success inference_result: hub.client.DatasetEntries = inference_job.download_output_data() # type: ignore # Convert outputs from channel last to channel first - inference_result = transpose_channel_last_to_first( - "output_0", inference_result, target_runtime + inference_result = ( + inference_result + if target_runtime == TargetRuntime.ORT + else transpose_channel_last_to_first( + "output_0", inference_result, target_runtime + ) ) print_inference_metrics(inference_job, inference_result, torch_out) @@ -213,7 +223,7 @@ def export_model( def main(): warnings.filterwarnings("ignore") - parser = export_parser(model_cls=Model, supports_qnn=False, supports_ort=False) + parser = export_parser(model_cls=Model) args = parser.parse_args() export_model(**vars(args)) diff --git a/qai_hub_models/models/quicksrnetmedium_quantized/model.py b/qai_hub_models/models/quicksrnetmedium_quantized/model.py index 939d8e67..1c17a3dc 100644 --- a/qai_hub_models/models/quicksrnetmedium_quantized/model.py +++ b/qai_hub_models/models/quicksrnetmedium_quantized/model.py @@ -8,30 +8,24 @@ # This verifies aimet is installed, and this must be included first. 
from qai_hub_models.utils.quantization_aimet import ( AIMETQuantizableMixin, + constrain_quantized_inputs_to_image_range, ) # isort: on import torch from aimet_torch.cross_layer_equalization import equalize_model +from aimet_torch.model_preparer import prepare_model from aimet_torch.quantsim import QuantizationSimModel, load_encodings_to_sim -from qai_hub_models.models.common import SourceModelFormat, TargetRuntime from qai_hub_models.models.quicksrnetmedium.model import QuickSRNetMedium -from qai_hub_models.utils.aimet.config_loader import get_default_aimet_config_legacy_v2 +from qai_hub_models.utils.aimet.config_loader import get_default_aimet_config from qai_hub_models.utils.asset_loaders import CachedWebModelAsset MODEL_ID = __name__.split(".")[-2] -MODEL_ASSET_VERSION = 2 +MODEL_ASSET_VERSION = 4 -# Weights and config stored in S3 are sourced from -# https://github.com/quic/aimet-model-zoo/blob/develop/aimet_zoo_torch/quicksrnet/model/model_cards/quicksrnet_medium_4x_w8a8.json: -# https://github.com/quic/aimet-model-zoo/releases/download/phase_2_january_artifacts/quicksrnet_medium_4x_checkpoint_int8.pth -# and -# https://raw.githubusercontent.com/quic/aimet/release-aimet-1.23/TrainingExtensions/common/src/python/aimet_common/quantsim_config/default_config_per_channel.js -# Encodings were generated with AIMET QuantSim library -QUANTIZED_WEIGHTS = "quicksrnet_medium_4x_checkpoint_int8.pth" -AIMET_ENCODINGS = "aimet_quantization_encodings.json" +DEFAULT_ENCODINGS = "quicksrnetmedium_quantized_encodings.json" SCALING_FACTOR = 4 @@ -45,9 +39,7 @@ def __init__( quicksrnet_model: QuantizationSimModel, ) -> None: QuickSRNetMedium.__init__(self, quicksrnet_model.model) - AIMETQuantizableMixin.__init__( - self, quicksrnet_model, needs_onnx_direct_aimet_export=True - ) + AIMETQuantizableMixin.__init__(self, quicksrnet_model) @classmethod def from_pretrained( @@ -62,46 +54,27 @@ def from_pretrained( else: Interprets as a filepath and loads the encodings stored there. """ # Load Model - quicksrnet = QuickSRNetMedium.from_pretrained() - input_shape = quicksrnet.get_input_spec()["image"][0] - equalize_model(quicksrnet, input_shape) - - # Download weights and quantization parameters - weights = CachedWebModelAsset.from_asset_store( - MODEL_ID, MODEL_ASSET_VERSION, QUANTIZED_WEIGHTS - ).fetch() - aimet_config = get_default_aimet_config_legacy_v2() - - # Load the model weights and quantization parameters - # In this particular instance, the state_dict keys from the model are all named "model." - # where is the name of each key in the weights file - without the word model. - # We rename all the keys to add the word model - state_dict = torch.load(weights, map_location=torch.device("cpu"))["state_dict"] - new_state_dict = {"model." 
+ key: value for key, value in state_dict.items()} - quicksrnet.load_state_dict(new_state_dict) + fp16_model = QuickSRNetMedium.from_pretrained() + input_shape = cls.get_input_spec()["image"][0] + model = prepare_model(fp16_model) + equalize_model(model, input_shape) sim = QuantizationSimModel( - quicksrnet, + model, quant_scheme="tf_enhanced", default_param_bw=8, default_output_bw=8, - config_file=aimet_config, + config_file=get_default_aimet_config(), dummy_input=torch.rand(input_shape), ) + constrain_quantized_inputs_to_image_range(sim) + if aimet_encodings: if aimet_encodings == "DEFAULT": aimet_encodings = CachedWebModelAsset.from_asset_store( - MODEL_ID, MODEL_ASSET_VERSION, AIMET_ENCODINGS + MODEL_ID, MODEL_ASSET_VERSION, DEFAULT_ENCODINGS ).fetch() load_encodings_to_sim(sim, aimet_encodings) sim.model.eval() return cls(sim) - - def preferred_hub_source_model_format( - self, target_runtime: TargetRuntime - ) -> SourceModelFormat: - if target_runtime == TargetRuntime.QNN: - return SourceModelFormat.ONNX - else: - return SourceModelFormat.TORCHSCRIPT diff --git a/qai_hub_models/models/quicksrnetmedium_quantized/perf.yaml b/qai_hub_models/models/quicksrnetmedium_quantized/perf.yaml index 4c7d84e5..d69ab3d0 100644 --- a/qai_hub_models/models/quicksrnetmedium_quantized/perf.yaml +++ b/qai_hub_models/models/quicksrnetmedium_quantized/perf.yaml @@ -26,6 +26,7 @@ aggregated: - Samsung Galaxy S24 Ultra - Samsung Galaxy S24+ - Samsung Galaxy Tab S8 + - Snapdragon X Elite CRD - Xiaomi 12 - Xiaomi 12 Pro supported_chipsets: @@ -36,15 +37,16 @@ aggregated: - Snapdragon® 8 Gen 2 - Snapdragon® 8 Gen 3 - Snapdragon® 888 + - Snapdragon® X Elite models: - name: QuickSRNetMedium-Quantized performance_metrics: - torchscript_onnx_tflite: - inference_time: 1046.0 - throughput: 956.0229445506692 + inference_time: 992.0 + throughput: 1008.0645161290323 estimated_peak_memory_range: - min: 1339392 - max: 2781424 + min: 12288 + max: 1410992 primary_compute_unit: NPU precision: int8 layer_info: @@ -52,7 +54,37 @@ models: layers_on_gpu: 0 layers_on_cpu: 3 total_layers: 17 - job_id: jqp4k3wlg + job_id: j0px1ov1g + job_status: Passed + torchscript_onnx_qnn: + inference_time: 801.0 + throughput: 1248.4394506866417 + estimated_peak_memory_range: + min: 65536 + max: 68916056 + primary_compute_unit: NPU + precision: int8 + layer_info: + layers_on_npu: 11 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 11 + job_id: jopry3k9g + job_status: Passed + torchscript_onnx_ort: + inference_time: 1215.0 + throughput: 823.0452674897119 + estimated_peak_memory_range: + min: 12288 + max: 9491496 + primary_compute_unit: NPU + precision: int8 + layer_info: + layers_on_npu: 16 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 16 + job_id: j1p87yox5 job_status: Passed reference_device_info: name: Samsung Galaxy S23 @@ -61,13 +93,13 @@ models: os_name: Android manufacturer: Samsung chipset: Snapdragon® 8 Gen 2 - timestamp: '2024-04-23T18:42:32.884809Z' + timestamp: '2024-05-20T16:35:30.266116Z' - torchscript_onnx_tflite: - inference_time: 871.0 - throughput: 1148.105625717566 + inference_time: 865.0 + throughput: 1156.0693641618498 estimated_peak_memory_range: min: 16384 - max: 19479952 + max: 19816736 primary_compute_unit: NPU precision: int8 layer_info: @@ -75,7 +107,37 @@ models: layers_on_gpu: 0 layers_on_cpu: 3 total_layers: 17 - job_id: j0pxnx195 + job_id: jo5mzxrwp + job_status: Passed + torchscript_onnx_qnn: + inference_time: 550.0 + throughput: 1818.1818181818182 + estimated_peak_memory_range: + min: 65536 + max: 
15505168 + primary_compute_unit: NPU + precision: int8 + layer_info: + layers_on_npu: 11 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 11 + job_id: jep2my845 + job_status: Passed + torchscript_onnx_ort: + inference_time: 882.0 + throughput: 1133.7868480725624 + estimated_peak_memory_range: + min: 0 + max: 14140464 + primary_compute_unit: NPU + precision: int8 + layer_info: + layers_on_npu: 16 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 16 + job_id: jogkyxz2p job_status: Passed reference_device_info: name: Samsung Galaxy S24 @@ -84,73 +146,156 @@ models: os_name: Android manufacturer: Samsung chipset: Snapdragon® 8 Gen 3 - timestamp: '2024-04-23T18:42:32.884829Z' + timestamp: '2024-05-20T16:35:30.266144Z' - torchscript_onnx_tflite: - inference_time: 3381.0 - throughput: 295.77048210588583 + inference_time: 1016.0 + throughput: 984.2519685039371 estimated_peak_memory_range: - min: 12288 - max: 15175488 + min: 69632 + max: 1384896 primary_compute_unit: NPU precision: int8 layer_info: - layers_on_npu: 16 + layers_on_npu: 14 layers_on_gpu: 0 layers_on_cpu: 3 - total_layers: 19 - job_id: j1p80kjxg + total_layers: 17 + job_id: jegnev2rg + job_status: Passed + torchscript_onnx_qnn: + inference_time: 803.0 + throughput: 1245.3300124533 + estimated_peak_memory_range: + min: 65536 + max: 70718264 + primary_compute_unit: NPU + precision: int8 + layer_info: + layers_on_npu: 11 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 11 + job_id: j2p0r0y6p job_status: Passed reference_device_info: - name: RB3 Gen 2 (Proxy) + name: QCS8550 (Proxy) os: '12' form_factor: Iot os_name: Android manufacturer: Qualcomm - chipset: Qcs6490 - timestamp: '2024-04-23T18:42:32.884857Z' + chipset: Qcs8550 + timestamp: '2024-05-20T16:35:30.266161Z' - torchscript_onnx_tflite: - inference_time: 15536.0 - throughput: 64.36663233779609 + inference_time: 1823.0 + throughput: 548.5463521667581 estimated_peak_memory_range: - min: 1720320 - max: 4755304 + min: 20480 + max: 13941344 primary_compute_unit: NPU precision: int8 layer_info: - layers_on_npu: 16 + layers_on_npu: 14 layers_on_gpu: 0 layers_on_cpu: 3 - total_layers: 19 - job_id: jwgondv4p + total_layers: 17 + job_id: jygzrykx5 + job_status: Passed + torchscript_onnx_qnn: + inference_time: 1392.0 + throughput: 718.3908045977012 + estimated_peak_memory_range: + min: 65536 + max: 15064032 + primary_compute_unit: NPU + precision: int8 + layer_info: + layers_on_npu: 11 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 11 + job_id: jmg9w2emp job_status: Passed reference_device_info: - name: RB5 (Proxy) + name: RB3 Gen 2 (Proxy) os: '12' form_factor: Iot os_name: Android manufacturer: Qualcomm - chipset: Qcs8250 - timestamp: '2024-04-23T18:42:32.884871Z' + chipset: Qcs6490 + timestamp: '2024-05-20T16:35:30.266177Z' - torchscript_onnx_tflite: - inference_time: 1396.0 - throughput: 716.3323782234957 + inference_time: 9357.0 + throughput: 106.87186063909373 estimated_peak_memory_range: - min: 32768 - max: 1677424 + min: 3276800 + max: 6753144 primary_compute_unit: NPU precision: int8 layer_info: - layers_on_npu: 16 + layers_on_npu: 14 layers_on_gpu: 0 layers_on_cpu: 3 - total_layers: 19 - job_id: j1gl6qw8g + total_layers: 17 + job_id: jz5wqznm5 job_status: Passed reference_device_info: - name: QCS8550 (Proxy) + name: RB5 (Proxy) os: '12' form_factor: Iot os_name: Android manufacturer: Qualcomm - chipset: Qcs8550 - timestamp: '2024-04-23T18:42:32.884887Z' + chipset: Qcs8250 + timestamp: '2024-05-20T16:35:30.266189Z' + - torchscript_onnx_qnn: + inference_time: 
794.0 + throughput: 1259.4458438287154 + estimated_peak_memory_range: + min: 53248 + max: 53248 + primary_compute_unit: NPU + precision: int8 + layer_info: + layers_on_npu: 11 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 11 + job_id: jqpyd3e7p + job_status: Passed + torchscript_onnx_ort: + inference_time: 1142.0 + throughput: 875.6567425569177 + estimated_peak_memory_range: + min: 8826880 + max: 8826880 + primary_compute_unit: NPU + precision: int8 + layer_info: + layers_on_npu: 16 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 16 + job_id: jn5q2q845 + job_status: Passed + torchscript_onnx_ort_dml_gpu: + inference_time: 3479.0 + throughput: 287.4389192296637 + estimated_peak_memory_range: + min: 15757312 + max: 15757312 + primary_compute_unit: CPU + precision: fp32 + layer_info: + layers_on_npu: 0 + layers_on_gpu: 0 + layers_on_cpu: 17 + total_layers: 17 + job_id: j1glkmn8p + job_status: Passed + reference_device_info: + name: Snapdragon X Elite CRD + os: '11' + form_factor: Compute + os_name: Windows + manufacturer: Qualcomm + chipset: Snapdragon® X Elite + timestamp: '2024-05-20T16:35:30.266211Z' diff --git a/qai_hub_models/models/quicksrnetmedium_quantized/test.py b/qai_hub_models/models/quicksrnetmedium_quantized/test.py index 4da76b9d..c8c6ea58 100644 --- a/qai_hub_models/models/quicksrnetmedium_quantized/test.py +++ b/qai_hub_models/models/quicksrnetmedium_quantized/test.py @@ -3,7 +3,6 @@ # SPDX-License-Identifier: BSD-3-Clause # --------------------------------------------------------------------- import os -import tempfile import zipfile import numpy as np @@ -18,7 +17,11 @@ MODEL_ID, QuickSRNetMediumQuantizable, ) -from qai_hub_models.utils.asset_loaders import CachedWebModelAsset, load_image +from qai_hub_models.utils.asset_loaders import ( + CachedWebModelAsset, + load_image, + qaihm_temp_dir, +) from qai_hub_models.utils.testing import assert_most_close, skip_clone_repo_check OUTPUT_IMAGE_ADDRESS = CachedWebModelAsset.from_asset_store( @@ -69,7 +72,7 @@ def test_trace(): def test_aimet_export(): model = QuickSRNetMediumQuantizable.from_pretrained() name = model.__class__.__name__ - with tempfile.TemporaryDirectory() as tmpdir: + with qaihm_temp_dir() as tmpdir: output_zip = model.convert_to_onnx_and_aimet_encodings( tmpdir, ) diff --git a/qai_hub_models/models/quicksrnetsmall/README.md b/qai_hub_models/models/quicksrnetsmall/README.md index 3c3e06ac..665e005e 100644 --- a/qai_hub_models/models/quicksrnetsmall/README.md +++ b/qai_hub_models/models/quicksrnetsmall/README.md @@ -14,6 +14,8 @@ accross various devices, can be found [here](https://aihub.qualcomm.com/models/q a hosted Qualcomm® device. + + ## Example & Usage diff --git a/qai_hub_models/models/quicksrnetsmall/export.py b/qai_hub_models/models/quicksrnetsmall/export.py index bc672fe9..0449ff6c 100644 --- a/qai_hub_models/models/quicksrnetsmall/export.py +++ b/qai_hub_models/models/quicksrnetsmall/export.py @@ -120,12 +120,16 @@ def export_model( model.eval() source_model = torch.jit.trace(model.to("cpu"), make_torch_inputs(input_spec)) + # Convert outputs from channel last to channel first (preferred I/O format for QNN and TensorFlow Lite) + channel_last_flags = ( + " --force_channel_last_input image" + " --force_channel_last_output output_0" + if target_runtime != TargetRuntime.ORT + else "" + ) + # 2. 
Compile the model to an on-device asset model_compile_options = model.get_hub_compile_options( - target_runtime, - compile_options - + " --force_channel_last_input image" - + " --force_channel_last_output output_0", + target_runtime, compile_options + channel_last_flags, hub_device ) print(f"Optimizing model {model_name} to run on-device") submitted_compile_job = hub.submit_compile_job( @@ -163,8 +167,10 @@ def export_model( ) sample_inputs = model.sample_inputs(input_spec) # Convert inputs from channel first to channel last - hub_inputs = transpose_channel_first_to_last( - "image", sample_inputs, target_runtime + hub_inputs = ( + sample_inputs + if target_runtime == TargetRuntime.ORT + else transpose_channel_first_to_last("image", sample_inputs, target_runtime) ) submitted_inference_job = hub.submit_inference_job( model=compile_job.get_target_model(), @@ -192,8 +198,12 @@ def export_model( assert inference_job is not None and inference_job.wait().success inference_result: hub.client.DatasetEntries = inference_job.download_output_data() # type: ignore # Convert outputs from channel last to channel first - inference_result = transpose_channel_last_to_first( - "output_0", inference_result, target_runtime + inference_result = ( + inference_result + if target_runtime == TargetRuntime.ORT + else transpose_channel_last_to_first( + "output_0", inference_result, target_runtime + ) ) print_inference_metrics(inference_job, inference_result, torch_out) @@ -205,7 +215,7 @@ def export_model( def main(): warnings.filterwarnings("ignore") - parser = export_parser(model_cls=Model, supports_ort=False) + parser = export_parser(model_cls=Model) args = parser.parse_args() export_model(**vars(args)) diff --git a/qai_hub_models/models/quicksrnetsmall/model.py b/qai_hub_models/models/quicksrnetsmall/model.py index 54b22d82..f1eb380f 100644 --- a/qai_hub_models/models/quicksrnetsmall/model.py +++ b/qai_hub_models/models/quicksrnetsmall/model.py @@ -57,7 +57,7 @@ def from_pretrained(cls) -> QuickSRNetSmall: def get_evaluator(self) -> BaseEvaluator: return SuperResolutionOutputEvaluator() - def forward(self, image: torch.Tensor) -> torch.Tensor: + def forward(self, image): """ Run QuickSRNet-Small on `image`, and produce an upscaled image diff --git a/qai_hub_models/models/quicksrnetsmall/perf.yaml b/qai_hub_models/models/quicksrnetsmall/perf.yaml index 41a8c83d..311e6769 100644 --- a/qai_hub_models/models/quicksrnetsmall/perf.yaml +++ b/qai_hub_models/models/quicksrnetsmall/perf.yaml @@ -22,6 +22,7 @@ aggregated: - Samsung Galaxy S24 Ultra - Samsung Galaxy S24+ - Samsung Galaxy Tab S8 + - Snapdragon X Elite CRD - Xiaomi 12 - Xiaomi 12 Pro supported_chipsets: @@ -30,15 +31,16 @@ aggregated: - Snapdragon® 8 Gen 2 - Snapdragon® 8 Gen 3 - Snapdragon® 888 + - Snapdragon® X Elite models: - name: QuickSRNetSmall performance_metrics: - torchscript_onnx_tflite: - inference_time: 1316.0 - throughput: 759.8784194528876 + inference_time: 1315.0 + throughput: 760.4562737642585 estimated_peak_memory_range: - min: 24576 - max: 8392968 + min: 16384 + max: 8193912 primary_compute_unit: NPU precision: fp16 layer_info: @@ -46,14 +48,14 @@ models: layers_on_gpu: 0 layers_on_cpu: 3 total_layers: 11 - job_id: jo5mq8zqp + job_id: jw561460p job_status: Passed torchscript_onnx_qnn: - inference_time: 1010.0 - throughput: 990.0990099009902 + inference_time: 999.0 + throughput: 1001.001001001001 estimated_peak_memory_range: - min: 217088 - max: 51877032 + min: 229376 + max: 63786312 primary_compute_unit: NPU precision: fp16 layer_info: @@ 
-61,22 +63,22 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 11 - job_id: jopr8wye5 + job_id: j1pvwk3jg job_status: Passed torchscript_onnx_ort: - inference_time: 1411.0 - throughput: 708.7172218284904 + inference_time: 1418.0 + throughput: 705.2186177715091 estimated_peak_memory_range: - min: 217088 - max: 8686544 + min: 90112 + max: 2421520 primary_compute_unit: NPU precision: fp16 layer_info: - layers_on_npu: 1 + layers_on_npu: 13 layers_on_gpu: 0 layers_on_cpu: 0 - total_layers: 1 - job_id: jqpyrmd45 + total_layers: 13 + job_id: jz5w96v6p job_status: Passed reference_device_info: name: Samsung Galaxy S23 @@ -85,13 +87,13 @@ models: os_name: Android manufacturer: Samsung chipset: Snapdragon® 8 Gen 2 - timestamp: '2024-04-23T18:42:32.905242Z' + timestamp: '2024-05-20T16:35:30.306313Z' - torchscript_onnx_tflite: - inference_time: 914.0 - throughput: 1094.0919037199126 + inference_time: 884.0 + throughput: 1131.2217194570135 estimated_peak_memory_range: - min: 16384 - max: 18347856 + min: 20480 + max: 18573536 primary_compute_unit: NPU precision: fp16 layer_info: @@ -99,14 +101,14 @@ models: layers_on_gpu: 0 layers_on_cpu: 3 total_layers: 11 - job_id: jegnlkem5 + job_id: j1p3m0klg job_status: Passed torchscript_onnx_qnn: - inference_time: 617.0 - throughput: 1620.7455429497568 + inference_time: 621.0 + throughput: 1610.3059581320451 estimated_peak_memory_range: - min: 208896 - max: 14414800 + min: 0 + max: 14770544 primary_compute_unit: NPU precision: fp16 layer_info: @@ -114,22 +116,22 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 11 - job_id: jep20emmg + job_id: j7gjlnxxp job_status: Passed torchscript_onnx_ort: - inference_time: 1011.0 - throughput: 989.1196834817013 + inference_time: 931.0 + throughput: 1074.1138560687432 estimated_peak_memory_range: - min: 0 - max: 12267184 + min: 12288 + max: 12222752 primary_compute_unit: NPU precision: fp16 layer_info: - layers_on_npu: 1 + layers_on_npu: 13 layers_on_gpu: 0 layers_on_cpu: 0 - total_layers: 1 - job_id: j2p036rep + total_layers: 13 + job_id: jmg94n1l5 job_status: Passed reference_device_info: name: Samsung Galaxy S24 @@ -138,13 +140,13 @@ models: os_name: Android manufacturer: Samsung chipset: Snapdragon® 8 Gen 3 - timestamp: '2024-04-23T18:42:32.905287Z' + timestamp: '2024-05-20T16:35:30.306339Z' - torchscript_onnx_tflite: - inference_time: 1327.0 - throughput: 753.5795026375282 + inference_time: 1314.0 + throughput: 761.03500761035 estimated_peak_memory_range: - min: 28672 - max: 8134240 + min: 20480 + max: 7936728 primary_compute_unit: NPU precision: fp16 layer_info: @@ -152,14 +154,14 @@ models: layers_on_gpu: 0 layers_on_cpu: 3 total_layers: 11 - job_id: j1p3vrolg + job_id: jwgov6yx5 job_status: Passed torchscript_onnx_qnn: - inference_time: 1021.0 - throughput: 979.4319294809011 + inference_time: 996.0 + throughput: 1004.0160642570281 estimated_peak_memory_range: - min: 249856 - max: 7951808 + min: 229376 + max: 3511288 primary_compute_unit: NPU precision: fp16 layer_info: @@ -167,7 +169,7 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 11 - job_id: jlpeen61p + job_id: jygz7dekp job_status: Passed reference_device_info: name: QCS8550 (Proxy) @@ -176,4 +178,57 @@ models: os_name: Android manufacturer: Qualcomm chipset: Qcs8550 - timestamp: '2024-04-23T18:42:32.905312Z' + timestamp: '2024-05-20T16:35:30.306357Z' + - torchscript_onnx_qnn: + inference_time: 1089.0 + throughput: 918.2736455463728 + estimated_peak_memory_range: + min: 241664 + max: 241664 + primary_compute_unit: NPU + precision: 
fp16 + layer_info: + layers_on_npu: 11 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 11 + job_id: jlpevm915 + job_status: Passed + torchscript_onnx_ort: + inference_time: 1478.0 + throughput: 676.5899864682003 + estimated_peak_memory_range: + min: 8847360 + max: 8847360 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 13 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 13 + job_id: jnp18zl2g + job_status: Passed + torchscript_onnx_ort_dml_gpu: + inference_time: 2483.0 + throughput: 402.7386226339106 + estimated_peak_memory_range: + min: 33112064 + max: 33112064 + primary_compute_unit: GPU + precision: fp16 + layer_info: + layers_on_npu: 0 + layers_on_gpu: 1 + layers_on_cpu: 0 + total_layers: 1 + job_id: jvgdv19eg + job_status: Passed + reference_device_info: + name: Snapdragon X Elite CRD + os: '11' + form_factor: Compute + os_name: Windows + manufacturer: Qualcomm + chipset: Snapdragon® X Elite + timestamp: '2024-05-20T16:35:30.306381Z' diff --git a/qai_hub_models/models/quicksrnetsmall_quantized/README.md b/qai_hub_models/models/quicksrnetsmall_quantized/README.md index 20fa4de8..af0f5e82 100644 --- a/qai_hub_models/models/quicksrnetsmall_quantized/README.md +++ b/qai_hub_models/models/quicksrnetsmall_quantized/README.md @@ -14,6 +14,8 @@ accross various devices, can be found [here](https://aihub.qualcomm.com/models/q a hosted Qualcomm® device. + + ## Example & Usage diff --git a/qai_hub_models/models/quicksrnetsmall_quantized/export.py b/qai_hub_models/models/quicksrnetsmall_quantized/export.py index f573669f..5a2eb8d2 100644 --- a/qai_hub_models/models/quicksrnetsmall_quantized/export.py +++ b/qai_hub_models/models/quicksrnetsmall_quantized/export.py @@ -124,12 +124,16 @@ def export_model( else: quant_calibration_data = model.get_calibration_data(target_runtime, input_spec) + # Convert outputs from channel last to channel first (preferred I/O format for QNN and TensorFlow Lite) + channel_last_flags = ( + " --force_channel_last_input image" + " --force_channel_last_output output_0" + if target_runtime != TargetRuntime.ORT + else "" + ) + # 2. 
Compile the model to an on-device asset model_compile_options = model.get_hub_compile_options( - target_runtime, - compile_options - + " --force_channel_last_input image" - + " --force_channel_last_output output_0", + target_runtime, compile_options + channel_last_flags, hub_device ) print(f"Optimizing model {model_name} to run on-device") submitted_compile_job = hub.submit_compile_job( @@ -171,8 +175,10 @@ def export_model( if target_runtime == TargetRuntime.QNN: hub_inputs = get_qnn_inputs(compile_job, sample_inputs) # Convert inputs from channel first to channel last - hub_inputs = transpose_channel_first_to_last( - "image", sample_inputs, target_runtime + hub_inputs = ( + sample_inputs + if target_runtime == TargetRuntime.ORT + else transpose_channel_first_to_last("image", sample_inputs, target_runtime) ) submitted_inference_job = hub.submit_inference_job( model=compile_job.get_target_model(), @@ -200,8 +206,12 @@ def export_model( assert inference_job is not None and inference_job.wait().success inference_result: hub.client.DatasetEntries = inference_job.download_output_data() # type: ignore # Convert outputs from channel last to channel first - inference_result = transpose_channel_last_to_first( - "output_0", inference_result, target_runtime + inference_result = ( + inference_result + if target_runtime == TargetRuntime.ORT + else transpose_channel_last_to_first( + "output_0", inference_result, target_runtime + ) ) print_inference_metrics(inference_job, inference_result, torch_out) @@ -213,7 +223,7 @@ def export_model( def main(): warnings.filterwarnings("ignore") - parser = export_parser(model_cls=Model, supports_qnn=False, supports_ort=False) + parser = export_parser(model_cls=Model) args = parser.parse_args() export_model(**vars(args)) diff --git a/qai_hub_models/models/quicksrnetsmall_quantized/model.py b/qai_hub_models/models/quicksrnetsmall_quantized/model.py index 9102f5f9..57c495a8 100644 --- a/qai_hub_models/models/quicksrnetsmall_quantized/model.py +++ b/qai_hub_models/models/quicksrnetsmall_quantized/model.py @@ -8,30 +8,24 @@ # This verifies aimet is installed, and this must be included first. 
from qai_hub_models.utils.quantization_aimet import ( AIMETQuantizableMixin, + constrain_quantized_inputs_to_image_range, ) # isort: on import torch from aimet_torch.cross_layer_equalization import equalize_model +from aimet_torch.model_preparer import prepare_model from aimet_torch.quantsim import QuantizationSimModel, load_encodings_to_sim -from qai_hub_models.models.common import SourceModelFormat, TargetRuntime from qai_hub_models.models.quicksrnetsmall.model import QuickSRNetSmall -from qai_hub_models.utils.aimet.config_loader import get_default_aimet_config_legacy_v2 +from qai_hub_models.utils.aimet.config_loader import get_default_aimet_config from qai_hub_models.utils.asset_loaders import CachedWebModelAsset MODEL_ID = __name__.split(".")[-2] -MODEL_ASSET_VERSION = 2 +MODEL_ASSET_VERSION = 4 -# Weights and config stored in S3 are sourced from -# https://github.com/quic/aimet-model-zoo/blob/develop/aimet_zoo_torch/quicksrnet/model/model_cards/quicksrnet_small_4x_w8a8.json: -# https://github.com/quic/aimet-model-zoo/releases/download/phase_2_january_artifacts/quicksrnet_small_4x_checkpoint_int8.pth -# and -# https://raw.githubusercontent.com/quic/aimet/release-aimet-1.23/TrainingExtensions/common/src/python/aimet_common/quantsim_config/default_config_per_channel.js -# Encodings were generated with AIMET QuantSim library -QUANTIZED_WEIGHTS = "quicksrnet_small_4x_checkpoint_int8.pth" -AIMET_ENCODINGS = "aimet_quantization_encodings.json" +DEFAULT_ENCODINGS = "quicksrnetsmall_quantized_encodings.json" SCALING_FACTOR = 4 @@ -45,9 +39,7 @@ def __init__( quicksrnet_model: QuantizationSimModel, ) -> None: QuickSRNetSmall.__init__(self, quicksrnet_model.model) - AIMETQuantizableMixin.__init__( - self, quicksrnet_model, needs_onnx_direct_aimet_export=True - ) + AIMETQuantizableMixin.__init__(self, quicksrnet_model) @classmethod def from_pretrained( @@ -61,46 +53,27 @@ def from_pretrained( else: Interprets as a filepath and loads the encodings stored there. """ # Load Model - quicksrnet = QuickSRNetSmall.from_pretrained() - input_shape = quicksrnet.get_input_spec()["image"][0] - equalize_model(quicksrnet, input_shape) - - # Download weights and quantization parameters - weights = CachedWebModelAsset.from_asset_store( - MODEL_ID, MODEL_ASSET_VERSION, QUANTIZED_WEIGHTS - ).fetch() - aimet_config = get_default_aimet_config_legacy_v2() - - # Load the model weights and quantization parameters - # In this particular instance, the state_dict keys from the model are all named "model." - # where is the name of each key in the weights file - without the word model. - # We rename all the keys to add the word model - state_dict = torch.load(weights, map_location=torch.device("cpu"))["state_dict"] - new_state_dict = {"model." 
+ key: value for key, value in state_dict.items()} - quicksrnet.load_state_dict(new_state_dict) + fp16_model = QuickSRNetSmall.from_pretrained() + input_shape = cls.get_input_spec()["image"][0] + model = prepare_model(fp16_model) + equalize_model(model, input_shape) sim = QuantizationSimModel( - quicksrnet, + fp16_model, quant_scheme="tf_enhanced", default_param_bw=8, default_output_bw=8, - config_file=aimet_config, + config_file=get_default_aimet_config(), dummy_input=torch.rand(input_shape), ) + constrain_quantized_inputs_to_image_range(sim) + if aimet_encodings: if aimet_encodings == "DEFAULT": aimet_encodings = CachedWebModelAsset.from_asset_store( - MODEL_ID, MODEL_ASSET_VERSION, AIMET_ENCODINGS + MODEL_ID, MODEL_ASSET_VERSION, DEFAULT_ENCODINGS ).fetch() load_encodings_to_sim(sim, aimet_encodings) sim.model.eval() return cls(sim) - - def preferred_hub_source_model_format( - self, target_runtime: TargetRuntime - ) -> SourceModelFormat: - if target_runtime == TargetRuntime.QNN: - return SourceModelFormat.ONNX - else: - return SourceModelFormat.TORCHSCRIPT diff --git a/qai_hub_models/models/quicksrnetsmall_quantized/perf.yaml b/qai_hub_models/models/quicksrnetsmall_quantized/perf.yaml index 829787bd..20ec2659 100644 --- a/qai_hub_models/models/quicksrnetsmall_quantized/perf.yaml +++ b/qai_hub_models/models/quicksrnetsmall_quantized/perf.yaml @@ -26,6 +26,7 @@ aggregated: - Samsung Galaxy S24 Ultra - Samsung Galaxy S24+ - Samsung Galaxy Tab S8 + - Snapdragon X Elite CRD - Xiaomi 12 - Xiaomi 12 Pro supported_chipsets: @@ -36,15 +37,16 @@ aggregated: - Snapdragon® 8 Gen 2 - Snapdragon® 8 Gen 3 - Snapdragon® 888 + - Snapdragon® X Elite models: - name: QuickSRNetSmall-Quantized performance_metrics: - torchscript_onnx_tflite: - inference_time: 987.0 - throughput: 1013.1712259371834 + inference_time: 957.0 + throughput: 1044.932079414838 estimated_peak_memory_range: - min: 20480 - max: 1821960 + min: 1048576 + max: 3323920 primary_compute_unit: NPU precision: int8 layer_info: @@ -52,7 +54,37 @@ models: layers_on_gpu: 0 layers_on_cpu: 3 total_layers: 11 - job_id: jogk78yop + job_id: jz57drwl5 + job_status: Passed + torchscript_onnx_qnn: + inference_time: 662.0 + throughput: 1510.5740181268882 + estimated_peak_memory_range: + min: 20480 + max: 2419512 + primary_compute_unit: NPU + precision: int8 + layer_info: + layers_on_npu: 8 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 8 + job_id: jo5mzx2wp + job_status: Passed + torchscript_onnx_ort: + inference_time: 1143.0 + throughput: 874.8906386701663 + estimated_peak_memory_range: + min: 212992 + max: 2520600 + primary_compute_unit: NPU + precision: int8 + layer_info: + layers_on_npu: 14 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 14 + job_id: jqpyd3w7p job_status: Passed reference_device_info: name: Samsung Galaxy S23 @@ -61,13 +93,13 @@ models: os_name: Android manufacturer: Samsung chipset: Snapdragon® 8 Gen 2 - timestamp: '2024-04-23T18:42:32.929166Z' + timestamp: '2024-05-20T16:35:30.337379Z' - torchscript_onnx_tflite: - inference_time: 1612.0 - throughput: 620.3473945409429 + inference_time: 788.0 + throughput: 1269.0355329949239 estimated_peak_memory_range: - min: 16384 - max: 18121488 + min: 0 + max: 18194848 primary_compute_unit: NPU precision: int8 layer_info: @@ -75,7 +107,37 @@ models: layers_on_gpu: 0 layers_on_cpu: 3 total_layers: 11 - job_id: jn5qev2m5 + job_id: jqp4wrovg + job_status: Passed + torchscript_onnx_qnn: + inference_time: 450.0 + throughput: 2222.222222222222 + estimated_peak_memory_range: + min: 61440 + 
max: 12988496 + primary_compute_unit: NPU + precision: int8 + layer_info: + layers_on_npu: 8 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 8 + job_id: jegnevyrg + job_status: Passed + torchscript_onnx_ort: + inference_time: 818.0 + throughput: 1222.4938875305625 + estimated_peak_memory_range: + min: 212992 + max: 14543472 + primary_compute_unit: NPU + precision: int8 + layer_info: + layers_on_npu: 14 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 14 + job_id: j2p0r0q6p job_status: Passed reference_device_info: name: Samsung Galaxy S24 @@ -84,73 +146,156 @@ models: os_name: Android manufacturer: Samsung chipset: Snapdragon® 8 Gen 3 - timestamp: '2024-04-23T18:42:32.929186Z' + timestamp: '2024-05-20T16:35:30.337406Z' - torchscript_onnx_tflite: - inference_time: 3227.0 - throughput: 309.88534242330337 + inference_time: 979.0 + throughput: 1021.4504596527069 estimated_peak_memory_range: - min: 49152 - max: 15102016 + min: 28672 + max: 2811096 primary_compute_unit: NPU precision: int8 layer_info: - layers_on_npu: 10 + layers_on_npu: 8 layers_on_gpu: 0 layers_on_cpu: 3 - total_layers: 13 - job_id: jqp4k24vg + total_layers: 11 + job_id: j0px1oj1g + job_status: Passed + torchscript_onnx_qnn: + inference_time: 661.0 + throughput: 1512.8593040847202 + estimated_peak_memory_range: + min: 20480 + max: 11468640 + primary_compute_unit: NPU + precision: int8 + layer_info: + layers_on_npu: 8 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 8 + job_id: jep2my645 job_status: Passed reference_device_info: - name: RB3 Gen 2 (Proxy) + name: QCS8550 (Proxy) os: '12' form_factor: Iot os_name: Android manufacturer: Qualcomm - chipset: Qcs6490 - timestamp: '2024-04-23T18:42:32.929214Z' + chipset: Qcs8550 + timestamp: '2024-05-20T16:35:30.337423Z' - torchscript_onnx_tflite: - inference_time: 12108.0 - throughput: 82.59002312520647 + inference_time: 1682.0 + throughput: 594.5303210463734 estimated_peak_memory_range: - min: 5685248 - max: 13091440 + min: 12288 + max: 13230640 primary_compute_unit: NPU precision: int8 layer_info: - layers_on_npu: 10 + layers_on_npu: 8 layers_on_gpu: 0 layers_on_cpu: 3 - total_layers: 13 - job_id: j1pvr2w75 + total_layers: 11 + job_id: j1p3er0m5 + job_status: Passed + torchscript_onnx_qnn: + inference_time: 1294.0 + throughput: 772.7975270479135 + estimated_peak_memory_range: + min: 65536 + max: 12983280 + primary_compute_unit: NPU + precision: int8 + layer_info: + layers_on_npu: 8 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 8 + job_id: jz5wqr645 job_status: Passed reference_device_info: - name: RB5 (Proxy) + name: RB3 Gen 2 (Proxy) os: '12' form_factor: Iot os_name: Android manufacturer: Qualcomm - chipset: Qcs8250 - timestamp: '2024-04-23T18:42:32.929227Z' + chipset: Qcs6490 + timestamp: '2024-05-20T16:35:30.337440Z' - torchscript_onnx_tflite: - inference_time: 1388.0 - throughput: 720.4610951008646 + inference_time: 5698.0 + throughput: 175.5001755001755 estimated_peak_memory_range: - min: 24576 - max: 1828056 + min: 3362816 + max: 13394304 primary_compute_unit: NPU precision: int8 layer_info: - layers_on_npu: 10 + layers_on_npu: 8 layers_on_gpu: 0 layers_on_cpu: 3 - total_layers: 13 - job_id: j0pxnzr15 + total_layers: 11 + job_id: jwgo3961g job_status: Passed reference_device_info: - name: QCS8550 (Proxy) + name: RB5 (Proxy) os: '12' form_factor: Iot os_name: Android manufacturer: Qualcomm - chipset: Qcs8550 - timestamp: '2024-04-23T18:42:32.929244Z' + chipset: Qcs8250 + timestamp: '2024-05-20T16:35:30.337451Z' + - torchscript_onnx_qnn: + 
inference_time: 762.0 + throughput: 1312.3359580052493 + estimated_peak_memory_range: + min: 49152 + max: 49152 + primary_compute_unit: NPU + precision: int8 + layer_info: + layers_on_npu: 8 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 8 + job_id: jopry3q9g + job_status: Passed + torchscript_onnx_ort: + inference_time: 1088.0 + throughput: 919.1176470588235 + estimated_peak_memory_range: + min: 9007104 + max: 9007104 + primary_compute_unit: NPU + precision: int8 + layer_info: + layers_on_npu: 14 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 14 + job_id: j1p87y9x5 + job_status: Passed + torchscript_onnx_ort_dml_gpu: + inference_time: 8332.0 + throughput: 120.01920307249159 + estimated_peak_memory_range: + min: 33210368 + max: 33210368 + primary_compute_unit: GPU + precision: int8 + layer_info: + layers_on_npu: 0 + layers_on_gpu: 1 + layers_on_cpu: 0 + total_layers: 1 + job_id: jogkyxn2p + job_status: Passed + reference_device_info: + name: Snapdragon X Elite CRD + os: '11' + form_factor: Compute + os_name: Windows + manufacturer: Qualcomm + chipset: Snapdragon® X Elite + timestamp: '2024-05-20T16:35:30.337478Z' diff --git a/qai_hub_models/models/quicksrnetsmall_quantized/test.py b/qai_hub_models/models/quicksrnetsmall_quantized/test.py index be878b99..b23accfd 100644 --- a/qai_hub_models/models/quicksrnetsmall_quantized/test.py +++ b/qai_hub_models/models/quicksrnetsmall_quantized/test.py @@ -3,7 +3,6 @@ # SPDX-License-Identifier: BSD-3-Clause # --------------------------------------------------------------------- import os -import tempfile import zipfile import numpy as np @@ -18,7 +17,11 @@ MODEL_ID, QuickSRNetSmallQuantizable, ) -from qai_hub_models.utils.asset_loaders import CachedWebModelAsset, load_image +from qai_hub_models.utils.asset_loaders import ( + CachedWebModelAsset, + load_image, + qaihm_temp_dir, +) from qai_hub_models.utils.testing import assert_most_close, skip_clone_repo_check OUTPUT_IMAGE_ADDRESS = CachedWebModelAsset.from_asset_store( @@ -69,7 +72,7 @@ def test_trace(): def test_aimet_export(): model = QuickSRNetSmallQuantizable.from_pretrained() name = model.__class__.__name__ - with tempfile.TemporaryDirectory() as tmpdir: + with qaihm_temp_dir() as tmpdir: output_zip = model.convert_to_onnx_and_aimet_encodings( tmpdir, ) diff --git a/qai_hub_models/models/real_esrgan_general_x4v3/README.md b/qai_hub_models/models/real_esrgan_general_x4v3/README.md index f3f03e6e..c25f5606 100644 --- a/qai_hub_models/models/real_esrgan_general_x4v3/README.md +++ b/qai_hub_models/models/real_esrgan_general_x4v3/README.md @@ -14,6 +14,8 @@ accross various devices, can be found [here](https://aihub.qualcomm.com/models/r a hosted Qualcomm® device. + + ## Example & Usage Install the package via pip: diff --git a/qai_hub_models/models/real_esrgan_general_x4v3/export.py b/qai_hub_models/models/real_esrgan_general_x4v3/export.py index d1672a8d..88259dab 100644 --- a/qai_hub_models/models/real_esrgan_general_x4v3/export.py +++ b/qai_hub_models/models/real_esrgan_general_x4v3/export.py @@ -120,12 +120,16 @@ def export_model( model.eval() source_model = torch.jit.trace(model.to("cpu"), make_torch_inputs(input_spec)) + # Convert outputs from channel last to channel first (preferred I/O format for QNN and TensorFlow Lite) + channel_last_flags = ( + " --force_channel_last_input image" + " --force_channel_last_output output_0" + if target_runtime != TargetRuntime.ORT + else "" + ) + # 2. 
Compile the model to an on-device asset model_compile_options = model.get_hub_compile_options( - target_runtime, - compile_options - + " --force_channel_last_input image" - + " --force_channel_last_output output_0", + target_runtime, compile_options + channel_last_flags, hub_device ) print(f"Optimizing model {model_name} to run on-device") submitted_compile_job = hub.submit_compile_job( @@ -163,8 +167,10 @@ def export_model( ) sample_inputs = model.sample_inputs(input_spec) # Convert inputs from channel first to channel last - hub_inputs = transpose_channel_first_to_last( - "image", sample_inputs, target_runtime + hub_inputs = ( + sample_inputs + if target_runtime == TargetRuntime.ORT + else transpose_channel_first_to_last("image", sample_inputs, target_runtime) ) submitted_inference_job = hub.submit_inference_job( model=compile_job.get_target_model(), @@ -192,8 +198,12 @@ def export_model( assert inference_job is not None and inference_job.wait().success inference_result: hub.client.DatasetEntries = inference_job.download_output_data() # type: ignore # Convert outputs from channel last to channel first - inference_result = transpose_channel_last_to_first( - "output_0", inference_result, target_runtime + inference_result = ( + inference_result + if target_runtime == TargetRuntime.ORT + else transpose_channel_last_to_first( + "output_0", inference_result, target_runtime + ) ) print_inference_metrics(inference_job, inference_result, torch_out) @@ -205,7 +215,7 @@ def export_model( def main(): warnings.filterwarnings("ignore") - parser = export_parser(model_cls=Model, supports_ort=False) + parser = export_parser(model_cls=Model) args = parser.parse_args() export_model(**vars(args)) diff --git a/qai_hub_models/models/real_esrgan_general_x4v3/perf.yaml b/qai_hub_models/models/real_esrgan_general_x4v3/perf.yaml index f10d3449..d62918c6 100644 --- a/qai_hub_models/models/real_esrgan_general_x4v3/perf.yaml +++ b/qai_hub_models/models/real_esrgan_general_x4v3/perf.yaml @@ -22,6 +22,7 @@ aggregated: - Samsung Galaxy S24 Ultra - Samsung Galaxy S24+ - Samsung Galaxy Tab S8 + - Snapdragon X Elite CRD - Xiaomi 12 - Xiaomi 12 Pro supported_chipsets: @@ -30,15 +31,16 @@ aggregated: - Snapdragon® 8 Gen 2 - Snapdragon® 8 Gen 3 - Snapdragon® 888 + - Snapdragon® X Elite models: - name: Real-ESRGAN-General-x4v3 performance_metrics: - torchscript_onnx_tflite: - inference_time: 7205.0 - throughput: 138.79250520471894 + inference_time: 7261.0 + throughput: 137.72207684891887 estimated_peak_memory_range: - min: 15941632 - max: 27205736 + min: 17612800 + max: 21719648 primary_compute_unit: NPU precision: fp16 layer_info: @@ -46,14 +48,14 @@ models: layers_on_gpu: 0 layers_on_cpu: 3 total_layers: 72 - job_id: j1gl6lklg + job_id: jn5q2qk45 job_status: Passed torchscript_onnx_qnn: - inference_time: 7008.0 - throughput: 142.69406392694063 + inference_time: 6254.0 + throughput: 159.89766549408378 estimated_peak_memory_range: - min: 45056 - max: 45937496 + min: 245760 + max: 5108560 primary_compute_unit: NPU precision: fp16 layer_info: @@ -61,22 +63,22 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 72 - job_id: j1p3v6mzg + job_id: j1p3m03lg job_status: Passed torchscript_onnx_ort: - inference_time: 7130.0 - throughput: 140.25245441795232 + inference_time: 6861.0 + throughput: 145.75134819997086 estimated_peak_memory_range: - min: 8429568 - max: 23590888 + min: 6336512 + max: 17772656 primary_compute_unit: NPU precision: fp16 layer_info: - layers_on_npu: 1 + layers_on_npu: 74 layers_on_gpu: 0 layers_on_cpu: 0 
- total_layers: 1 - job_id: j1pv07wm5 + total_layers: 74 + job_id: jlpevm115 job_status: Passed reference_device_info: name: Samsung Galaxy S23 @@ -85,13 +87,13 @@ models: os_name: Android manufacturer: Samsung chipset: Snapdragon® 8 Gen 2 - timestamp: '2024-04-23T18:42:32.949581Z' + timestamp: '2024-05-20T16:35:30.377608Z' - torchscript_onnx_tflite: - inference_time: 5369.0 - throughput: 186.25442354255912 + inference_time: 5603.0 + throughput: 178.4758165268606 estimated_peak_memory_range: - min: 20480 - max: 55365360 + min: 16384 + max: 55868880 primary_compute_unit: NPU precision: fp16 layer_info: @@ -99,14 +101,14 @@ models: layers_on_gpu: 0 layers_on_cpu: 3 total_layers: 72 - job_id: jw56ew17g + job_id: j1glkmz8p job_status: Passed torchscript_onnx_qnn: - inference_time: 4934.0 - throughput: 202.67531414673692 + inference_time: 4592.0 + throughput: 217.77003484320556 estimated_peak_memory_range: - min: 12288 - max: 31445424 + min: 208896 + max: 33800560 primary_compute_unit: NPU precision: fp16 layer_info: @@ -114,22 +116,22 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 72 - job_id: jwgok8vdp + job_id: jwgov60x5 job_status: Passed torchscript_onnx_ort: - inference_time: 5279.0 - throughput: 189.42981625307823 + inference_time: 5149.0 + throughput: 194.21246844047388 estimated_peak_memory_range: - min: 8392704 - max: 47488976 + min: 2310144 + max: 36369760 primary_compute_unit: NPU precision: fp16 layer_info: - layers_on_npu: 1 + layers_on_npu: 74 layers_on_gpu: 0 layers_on_cpu: 0 - total_layers: 1 - job_id: j7gjzql85 + total_layers: 74 + job_id: jygz7d9kp job_status: Passed reference_device_info: name: Samsung Galaxy S24 @@ -138,13 +140,13 @@ models: os_name: Android manufacturer: Samsung chipset: Snapdragon® 8 Gen 3 - timestamp: '2024-04-23T18:42:32.949640Z' + timestamp: '2024-05-20T16:35:30.377636Z' - torchscript_onnx_tflite: - inference_time: 7123.0 - throughput: 140.39028499227854 + inference_time: 7335.0 + throughput: 136.332651670075 estimated_peak_memory_range: - min: 15777792 - max: 23652120 + min: 9465856 + max: 18689240 primary_compute_unit: NPU precision: fp16 layer_info: @@ -152,14 +154,14 @@ models: layers_on_gpu: 0 layers_on_cpu: 3 total_layers: 72 - job_id: jopr87d95 + job_id: jw5614j0p job_status: Passed torchscript_onnx_qnn: - inference_time: 7016.0 - throughput: 142.53135689851769 + inference_time: 6280.0 + throughput: 159.23566878980893 estimated_peak_memory_range: - min: 32768 - max: 10477536 + min: 53248 + max: 43875408 primary_compute_unit: NPU precision: fp16 layer_info: @@ -167,7 +169,7 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 72 - job_id: j1p80krxg + job_id: j7gjlnmxp job_status: Passed reference_device_info: name: QCS8550 (Proxy) @@ -176,4 +178,57 @@ models: os_name: Android manufacturer: Qualcomm chipset: Qcs8550 - timestamp: '2024-04-23T18:42:32.949674Z' + timestamp: '2024-05-20T16:35:30.377655Z' + - torchscript_onnx_qnn: + inference_time: 8724.0 + throughput: 114.62631820265933 + estimated_peak_memory_range: + min: 229376 + max: 229376 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 72 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 72 + job_id: j1pvwkojg + job_status: Passed + torchscript_onnx_ort: + inference_time: 7228.0 + throughput: 138.35085777531822 + estimated_peak_memory_range: + min: 8613888 + max: 8613888 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 74 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 74 + job_id: jz5w96n6p + job_status: Passed 
+ torchscript_onnx_ort_dml_gpu: + inference_time: 58952.0 + throughput: 16.96295291084272 + estimated_peak_memory_range: + min: 26607616 + max: 26607616 + primary_compute_unit: CPU + precision: fp32 + layer_info: + layers_on_npu: 0 + layers_on_gpu: 0 + layers_on_cpu: 70 + total_layers: 70 + job_id: jmg94nel5 + job_status: Passed + reference_device_info: + name: Snapdragon X Elite CRD + os: '11' + form_factor: Compute + os_name: Windows + manufacturer: Qualcomm + chipset: Snapdragon® X Elite + timestamp: '2024-05-20T16:35:30.377681Z' diff --git a/qai_hub_models/models/real_esrgan_x4plus/README.md b/qai_hub_models/models/real_esrgan_x4plus/README.md index c3e6d01a..89551a63 100644 --- a/qai_hub_models/models/real_esrgan_x4plus/README.md +++ b/qai_hub_models/models/real_esrgan_x4plus/README.md @@ -14,6 +14,8 @@ accross various devices, can be found [here](https://aihub.qualcomm.com/models/r a hosted Qualcomm® device. + + ## Example & Usage Install the package via pip: diff --git a/qai_hub_models/models/real_esrgan_x4plus/export.py b/qai_hub_models/models/real_esrgan_x4plus/export.py index a5693ae6..f10bbd12 100644 --- a/qai_hub_models/models/real_esrgan_x4plus/export.py +++ b/qai_hub_models/models/real_esrgan_x4plus/export.py @@ -120,7 +120,7 @@ def export_model( # 2. Compile the model to an on-device asset model_compile_options = model.get_hub_compile_options( - target_runtime, compile_options + target_runtime, compile_options, hub_device ) print(f"Optimizing model {model_name} to run on-device") submitted_compile_job = hub.submit_compile_job( @@ -192,7 +192,7 @@ def export_model( def main(): warnings.filterwarnings("ignore") - parser = export_parser(model_cls=Model, supports_ort=False) + parser = export_parser(model_cls=Model) args = parser.parse_args() export_model(**vars(args)) diff --git a/qai_hub_models/models/real_esrgan_x4plus/perf.yaml b/qai_hub_models/models/real_esrgan_x4plus/perf.yaml index a51c1d3e..02636e72 100644 --- a/qai_hub_models/models/real_esrgan_x4plus/perf.yaml +++ b/qai_hub_models/models/real_esrgan_x4plus/perf.yaml @@ -22,6 +22,7 @@ aggregated: - Samsung Galaxy S24 Ultra - Samsung Galaxy S24+ - Samsung Galaxy Tab S8 + - Snapdragon X Elite CRD - Xiaomi 12 - Xiaomi 12 Pro supported_chipsets: @@ -30,15 +31,31 @@ aggregated: - Snapdragon® 8 Gen 2 - Snapdragon® 8 Gen 3 - Snapdragon® 888 + - Snapdragon® X Elite models: - name: Real-ESRGAN-x4plus performance_metrics: - - torchscript_onnx_qnn: - inference_time: 65726.0 - throughput: 15.214679122417309 + - torchscript_onnx_tflite: + inference_time: 68854.0 + throughput: 14.523484474395097 estimated_peak_memory_range: - min: 102400 - max: 107703704 + min: 28672 + max: 3752144 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 1028 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 1028 + job_id: jnp18zx2g + job_status: Passed + torchscript_onnx_qnn: + inference_time: 68240.0 + throughput: 14.654161781946073 + estimated_peak_memory_range: + min: 94208 + max: 108186752 primary_compute_unit: NPU precision: fp16 layer_info: @@ -46,22 +63,22 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 1031 - job_id: jygzon765 + job_id: jmg94new5 job_status: Passed torchscript_onnx_ort: - inference_time: 69431.0 - throughput: 14.402788379830335 + inference_time: 67823.0 + throughput: 14.744260796484967 estimated_peak_memory_range: - min: 6467584 - max: 119585224 + min: 6422528 + max: 150577760 primary_compute_unit: NPU precision: fp16 layer_info: - layers_on_npu: 1 + layers_on_npu: 1030 layers_on_gpu: 0 
layers_on_cpu: 0 - total_layers: 1 - job_id: jmg9jd4v5 + total_layers: 1030 + job_id: jqp4wr08g job_status: Passed reference_device_info: name: Samsung Galaxy S23 @@ -70,13 +87,28 @@ models: os_name: Android manufacturer: Samsung chipset: Snapdragon® 8 Gen 2 - timestamp: '2024-04-23T18:42:32.973754Z' - - torchscript_onnx_qnn: - inference_time: 50526.0 - throughput: 19.79179036535645 + timestamp: '2024-05-20T16:35:30.408294Z' + - torchscript_onnx_tflite: + inference_time: 54608.0 + throughput: 18.312335188983297 + estimated_peak_memory_range: + min: 3264512 + max: 587498384 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 1028 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 1028 + job_id: jvgdv1leg + job_status: Passed + torchscript_onnx_qnn: + inference_time: 50248.0 + throughput: 19.901289603566312 estimated_peak_memory_range: - min: 53248 - max: 259398784 + min: 86016 + max: 262075680 primary_compute_unit: NPU precision: fp16 layer_info: @@ -84,22 +116,22 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 1031 - job_id: jz5w249j5 + job_id: jnp18zx8g job_status: Passed torchscript_onnx_ort: - inference_time: 50628.0 - throughput: 19.751915935845776 + inference_time: 51447.0 + throughput: 19.43747934767819 estimated_peak_memory_range: - min: 7217152 - max: 193898256 + min: 6303744 + max: 192645232 primary_compute_unit: NPU precision: fp16 layer_info: - layers_on_npu: 1 + layers_on_npu: 1030 layers_on_gpu: 0 layers_on_cpu: 0 - total_layers: 1 - job_id: jvgde2vl5 + total_layers: 1030 + job_id: j0px1o23g job_status: Passed reference_device_info: name: Samsung Galaxy S24 @@ -108,13 +140,28 @@ models: os_name: Android manufacturer: Samsung chipset: Snapdragon® 8 Gen 3 - timestamp: '2024-04-23T18:42:32.973885Z' - - torchscript_onnx_qnn: - inference_time: 67718.0 - throughput: 14.767122478513837 + timestamp: '2024-05-20T16:35:30.408323Z' + - torchscript_onnx_tflite: + inference_time: 74054.0 + throughput: 13.503659491722257 + estimated_peak_memory_range: + min: 3284992 + max: 5941440 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 1028 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 1028 + job_id: jz5w96n3p + job_status: Passed + torchscript_onnx_qnn: + inference_time: 64798.0 + throughput: 15.432575079477761 estimated_peak_memory_range: - min: 163840 - max: 107805352 + min: 102400 + max: 107714376 primary_compute_unit: NPU precision: fp16 layer_info: @@ -122,7 +169,7 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 1031 - job_id: j1p3vr7lg + job_id: jz57dr3v5 job_status: Passed reference_device_info: name: QCS8550 (Proxy) @@ -131,4 +178,57 @@ models: os_name: Android manufacturer: Qualcomm chipset: Qcs8550 - timestamp: '2024-04-23T18:42:32.974009Z' + timestamp: '2024-05-20T16:35:30.408341Z' + - torchscript_onnx_qnn: + inference_time: 73958.0 + throughput: 13.521187701127667 + estimated_peak_memory_range: + min: 217088 + max: 217088 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 1030 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 1030 + job_id: jvgdv1lrg + job_status: Passed + torchscript_onnx_ort: + inference_time: 65800.0 + throughput: 15.19756838905775 + estimated_peak_memory_range: + min: 1351680 + max: 1351680 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 1030 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 1030 + job_id: jo5mzxydp + job_status: Passed + torchscript_onnx_ort_dml_gpu: + inference_time: 598980.0 + throughput: 
1.669504824868944 + estimated_peak_memory_range: + min: 550260736 + max: 550260736 + primary_compute_unit: GPU + precision: fp16 + layer_info: + layers_on_npu: 0 + layers_on_gpu: 1 + layers_on_cpu: 0 + total_layers: 1 + job_id: jegnev8kg + job_status: Passed + reference_device_info: + name: Snapdragon X Elite CRD + os: '11' + form_factor: Compute + os_name: Windows + manufacturer: Qualcomm + chipset: Snapdragon® X Elite + timestamp: '2024-05-20T16:35:30.408364Z' diff --git a/qai_hub_models/models/regnet/README.md b/qai_hub_models/models/regnet/README.md index e47cbccd..96c82923 100644 --- a/qai_hub_models/models/regnet/README.md +++ b/qai_hub_models/models/regnet/README.md @@ -14,6 +14,8 @@ accross various devices, can be found [here](https://aihub.qualcomm.com/models/r a hosted Qualcomm® device. + + ## Example & Usage diff --git a/qai_hub_models/models/regnet/export.py b/qai_hub_models/models/regnet/export.py index b7eea153..079e7a95 100644 --- a/qai_hub_models/models/regnet/export.py +++ b/qai_hub_models/models/regnet/export.py @@ -119,9 +119,16 @@ def export_model( model.eval() source_model = torch.jit.trace(model.to("cpu"), make_torch_inputs(input_spec)) + # Convert outputs from channel last to channel first (preferred I/O format for QNN and TensorFlow Lite) + channel_last_flags = ( + " --force_channel_last_input image_tensor" + if target_runtime != TargetRuntime.ORT + else "" + ) + # 2. Compile the model to an on-device asset model_compile_options = model.get_hub_compile_options( - target_runtime, compile_options + " --force_channel_last_input image_tensor" + target_runtime, compile_options + channel_last_flags, hub_device ) print(f"Optimizing model {model_name} to run on-device") submitted_compile_job = hub.submit_compile_job( @@ -159,8 +166,12 @@ def export_model( ) sample_inputs = model.sample_inputs(input_spec) # Convert inputs from channel first to channel last - hub_inputs = transpose_channel_first_to_last( - "image_tensor", sample_inputs, target_runtime + hub_inputs = ( + sample_inputs + if target_runtime == TargetRuntime.ORT + else transpose_channel_first_to_last( + "image_tensor", sample_inputs, target_runtime + ) ) submitted_inference_job = hub.submit_inference_job( model=compile_job.get_target_model(), @@ -197,7 +208,7 @@ def export_model( def main(): warnings.filterwarnings("ignore") - parser = export_parser(model_cls=Model, supports_ort=False) + parser = export_parser(model_cls=Model) args = parser.parse_args() export_model(**vars(args)) diff --git a/qai_hub_models/models/regnet/perf.yaml b/qai_hub_models/models/regnet/perf.yaml index 4e124163..1569c2bd 100644 --- a/qai_hub_models/models/regnet/perf.yaml +++ b/qai_hub_models/models/regnet/perf.yaml @@ -22,6 +22,7 @@ aggregated: - Samsung Galaxy S24 Ultra - Samsung Galaxy S24+ - Samsung Galaxy Tab S8 + - Snapdragon X Elite CRD - Xiaomi 12 - Xiaomi 12 Pro supported_chipsets: @@ -30,15 +31,16 @@ aggregated: - Snapdragon® 8 Gen 2 - Snapdragon® 8 Gen 3 - Snapdragon® 888 + - Snapdragon® X Elite models: - name: RegNet performance_metrics: - torchscript_onnx_tflite: - inference_time: 2314.0 - throughput: 432.152117545376 + inference_time: 2321.0 + throughput: 430.8487720809996 estimated_peak_memory_range: - min: 16384 - max: 2190392 + min: 28672 + max: 2093984 primary_compute_unit: NPU precision: fp16 layer_info: @@ -46,14 +48,14 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 114 - job_id: jqp4k3xlg + job_id: jopry3j0g job_status: Passed torchscript_onnx_qnn: - inference_time: 2128.0 - throughput: 469.9248120300752 + 
inference_time: 2130.0 + throughput: 469.4835680751174 estimated_peak_memory_range: - min: 20480 - max: 15932376 + min: 16384 + max: 16919216 primary_compute_unit: NPU precision: fp16 layer_info: @@ -61,22 +63,22 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 188 - job_id: jo5mq8wqp + job_id: j2p0r079p job_status: Passed torchscript_onnx_ort: - inference_time: 2423.0 - throughput: 412.71151465125877 + inference_time: 2312.0 + throughput: 432.52595155709344 estimated_peak_memory_range: - min: 12288 - max: 87079712 + min: 49152 + max: 79165336 primary_compute_unit: NPU precision: fp16 layer_info: - layers_on_npu: 1 + layers_on_npu: 190 layers_on_gpu: 0 layers_on_cpu: 0 - total_layers: 1 - job_id: jopr8w4e5 + total_layers: 190 + job_id: j1glkmrjp job_status: Passed reference_device_info: name: Samsung Galaxy S23 @@ -85,13 +87,13 @@ models: os_name: Android manufacturer: Samsung chipset: Snapdragon® 8 Gen 2 - timestamp: '2024-04-23T18:42:32.991772Z' + timestamp: '2024-05-20T16:35:30.439253Z' - torchscript_onnx_tflite: - inference_time: 1616.0 - throughput: 618.8118811881188 + inference_time: 1625.0 + throughput: 615.3846153846154 estimated_peak_memory_range: - min: 12288 - max: 134209840 + min: 78073856 + max: 211737456 primary_compute_unit: NPU precision: fp16 layer_info: @@ -99,14 +101,14 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 114 - job_id: j0pxnx795 + job_id: jep2mynr5 job_status: Passed torchscript_onnx_qnn: - inference_time: 1506.0 - throughput: 664.0106241699867 + inference_time: 1481.0 + throughput: 675.219446320054 estimated_peak_memory_range: - min: 618496 - max: 77239488 + min: 0 + max: 72188080 primary_compute_unit: NPU precision: fp16 layer_info: @@ -114,22 +116,22 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 188 - job_id: jegnlk9m5 + job_id: j1p87yvk5 job_status: Passed torchscript_onnx_ort: - inference_time: 1699.0 - throughput: 588.5815185403178 + inference_time: 1586.0 + throughput: 630.517023959647 estimated_peak_memory_range: - min: 618496 - max: 36167024 + min: 0 + max: 38564464 primary_compute_unit: NPU precision: fp16 layer_info: - layers_on_npu: 1 + layers_on_npu: 190 layers_on_gpu: 0 layers_on_cpu: 0 - total_layers: 1 - job_id: jep20e7mg + total_layers: 190 + job_id: jw5614l6p job_status: Passed reference_device_info: name: Samsung Galaxy S24 @@ -138,13 +140,13 @@ models: os_name: Android manufacturer: Samsung chipset: Snapdragon® 8 Gen 3 - timestamp: '2024-04-23T18:42:32.991836Z' + timestamp: '2024-05-20T16:35:30.439280Z' - torchscript_onnx_tflite: - inference_time: 2329.0 - throughput: 429.36882782310005 + inference_time: 2331.0 + throughput: 429.000429000429 estimated_peak_memory_range: - min: 24576 - max: 2315288 + min: 32768 + max: 2367144 primary_compute_unit: NPU precision: fp16 layer_info: @@ -152,14 +154,14 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 114 - job_id: jvgdemme5 + job_id: jqpyd308p job_status: Passed torchscript_onnx_qnn: - inference_time: 2130.0 - throughput: 469.4835680751174 + inference_time: 2139.0 + throughput: 467.50818139317437 estimated_peak_memory_range: - min: 12288 - max: 56502216 + min: 16384 + max: 66236928 primary_compute_unit: NPU precision: fp16 layer_info: @@ -167,7 +169,7 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 188 - job_id: jo5mqllwp + job_id: jn5q2qon5 job_status: Passed reference_device_info: name: QCS8550 (Proxy) @@ -176,4 +178,57 @@ models: os_name: Android manufacturer: Qualcomm chipset: Qcs8550 - timestamp: '2024-04-23T18:42:32.991902Z' + 
timestamp: '2024-05-20T16:35:30.439298Z' + - torchscript_onnx_qnn: + inference_time: 2466.0 + throughput: 405.51500405515003 + estimated_peak_memory_range: + min: 602112 + max: 602112 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 188 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 188 + job_id: jogkyxmwp + job_status: Passed + torchscript_onnx_ort: + inference_time: 2190.0 + throughput: 456.62100456621005 + estimated_peak_memory_range: + min: 34840576 + max: 34840576 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 190 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 190 + job_id: j1p3m023g + job_status: Passed + torchscript_onnx_ort_dml_gpu: + inference_time: 14744.0 + throughput: 67.82419967444385 + estimated_peak_memory_range: + min: 70148096 + max: 70148096 + primary_compute_unit: CPU + precision: fp32 + layer_info: + layers_on_npu: 0 + layers_on_gpu: 0 + layers_on_cpu: 85 + total_layers: 85 + job_id: jwgov6qq5 + job_status: Passed + reference_device_info: + name: Snapdragon X Elite CRD + os: '11' + form_factor: Compute + os_name: Windows + manufacturer: Qualcomm + chipset: Snapdragon® X Elite + timestamp: '2024-05-20T16:35:30.439321Z' diff --git a/qai_hub_models/models/resnet101/README.md b/qai_hub_models/models/resnet101/README.md index 145fa7e8..218a6131 100644 --- a/qai_hub_models/models/resnet101/README.md +++ b/qai_hub_models/models/resnet101/README.md @@ -14,6 +14,8 @@ accross various devices, can be found [here](https://aihub.qualcomm.com/models/r a hosted Qualcomm® device. + + ## Example & Usage diff --git a/qai_hub_models/models/resnet101/export.py b/qai_hub_models/models/resnet101/export.py index feaaa511..d2b19892 100644 --- a/qai_hub_models/models/resnet101/export.py +++ b/qai_hub_models/models/resnet101/export.py @@ -119,9 +119,16 @@ def export_model( model.eval() source_model = torch.jit.trace(model.to("cpu"), make_torch_inputs(input_spec)) + # Convert outputs from channel last to channel first (preferred I/O format for QNN and TensorFlow Lite) + channel_last_flags = ( + " --force_channel_last_input image_tensor" + if target_runtime != TargetRuntime.ORT + else "" + ) + # 2. 
Compile the model to an on-device asset model_compile_options = model.get_hub_compile_options( - target_runtime, compile_options + " --force_channel_last_input image_tensor" + target_runtime, compile_options + channel_last_flags, hub_device ) print(f"Optimizing model {model_name} to run on-device") submitted_compile_job = hub.submit_compile_job( @@ -159,8 +166,12 @@ def export_model( ) sample_inputs = model.sample_inputs(input_spec) # Convert inputs from channel first to channel last - hub_inputs = transpose_channel_first_to_last( - "image_tensor", sample_inputs, target_runtime + hub_inputs = ( + sample_inputs + if target_runtime == TargetRuntime.ORT + else transpose_channel_first_to_last( + "image_tensor", sample_inputs, target_runtime + ) ) submitted_inference_job = hub.submit_inference_job( model=compile_job.get_target_model(), @@ -197,7 +208,7 @@ def export_model( def main(): warnings.filterwarnings("ignore") - parser = export_parser(model_cls=Model, supports_ort=False) + parser = export_parser(model_cls=Model) args = parser.parse_args() export_model(**vars(args)) diff --git a/qai_hub_models/models/resnet101/perf.yaml b/qai_hub_models/models/resnet101/perf.yaml index ec4a90fd..b8e31514 100644 --- a/qai_hub_models/models/resnet101/perf.yaml +++ b/qai_hub_models/models/resnet101/perf.yaml @@ -22,6 +22,7 @@ aggregated: - Samsung Galaxy S24 Ultra - Samsung Galaxy S24+ - Samsung Galaxy Tab S8 + - Snapdragon X Elite CRD - Xiaomi 12 - Xiaomi 12 Pro supported_chipsets: @@ -30,15 +31,16 @@ aggregated: - Snapdragon® 8 Gen 2 - Snapdragon® 8 Gen 3 - Snapdragon® 888 + - Snapdragon® X Elite models: - name: ResNet101 performance_metrics: - torchscript_onnx_tflite: - inference_time: 3390.0 - throughput: 294.9852507374631 + inference_time: 3366.0 + throughput: 297.08853238265004 estimated_peak_memory_range: - min: 28672 - max: 1775440 + min: 36864 + max: 2178824 primary_compute_unit: NPU precision: fp16 layer_info: @@ -46,14 +48,14 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 147 - job_id: j7gjzq085 + job_id: jo5mznxdp job_status: Passed torchscript_onnx_qnn: - inference_time: 3448.0 - throughput: 290.0232018561485 + inference_time: 3453.0 + throughput: 289.6032435563278 estimated_peak_memory_range: - min: 638976 - max: 216598456 + min: 618496 + max: 173565024 primary_compute_unit: NPU precision: fp16 layer_info: @@ -61,22 +63,22 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 245 - job_id: jygzonx65 + job_id: jep2mkyr5 job_status: Passed torchscript_onnx_ort: - inference_time: 3747.0 - throughput: 266.88017080330934 + inference_time: 3601.0 + throughput: 277.700638711469 estimated_peak_memory_range: - min: 618496 - max: 366172984 + min: 12288 + max: 300122744 primary_compute_unit: NPU precision: fp16 layer_info: - layers_on_npu: 1 + layers_on_npu: 247 layers_on_gpu: 0 layers_on_cpu: 0 - total_layers: 1 - job_id: jmg9jd3v5 + total_layers: 247 + job_id: jn5q26qn5 job_status: Passed reference_device_info: name: Samsung Galaxy S23 @@ -85,13 +87,13 @@ models: os_name: Android manufacturer: Samsung chipset: Snapdragon® 8 Gen 2 - timestamp: '2024-04-23T18:42:33.051295Z' + timestamp: '2024-05-20T16:35:30.509475Z' - torchscript_onnx_tflite: - inference_time: 2446.0 - throughput: 408.8307440719542 + inference_time: 2430.0 + throughput: 411.52263374485597 estimated_peak_memory_range: - min: 212992 - max: 104476752 + min: 16384 + max: 107021088 primary_compute_unit: NPU precision: fp16 layer_info: @@ -99,14 +101,14 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 147 - job_id: 
jlpeeyr0p + job_id: jegne6vkg job_status: Passed torchscript_onnx_qnn: - inference_time: 2469.0 - throughput: 405.0222762251924 + inference_time: 2501.0 + throughput: 399.8400639744102 estimated_peak_memory_range: - min: 434176 - max: 81113840 + min: 618496 + max: 81769760 primary_compute_unit: NPU precision: fp16 layer_info: @@ -114,22 +116,22 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 245 - job_id: jz5w24dj5 + job_id: jqpyd138p job_status: Passed torchscript_onnx_ort: - inference_time: 2676.0 - throughput: 373.69207772795215 + inference_time: 2626.0 + throughput: 380.8073115003808 estimated_peak_memory_range: min: 618496 - max: 44227744 + max: 47698672 primary_compute_unit: NPU precision: fp16 layer_info: - layers_on_npu: 1 + layers_on_npu: 247 layers_on_gpu: 0 layers_on_cpu: 0 - total_layers: 1 - job_id: jnp1y6dlp + total_layers: 247 + job_id: j1glkvmjp job_status: Passed reference_device_info: name: Samsung Galaxy S24 @@ -138,13 +140,13 @@ models: os_name: Android manufacturer: Samsung chipset: Snapdragon® 8 Gen 3 - timestamp: '2024-04-23T18:42:33.051366Z' + timestamp: '2024-05-20T16:35:30.509503Z' - torchscript_onnx_tflite: - inference_time: 3443.0 - throughput: 290.4443799012489 + inference_time: 3408.0 + throughput: 293.42723004694835 estimated_peak_memory_range: min: 24576 - max: 2329152 + max: 2314664 primary_compute_unit: NPU precision: fp16 layer_info: @@ -152,14 +154,14 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 147 - job_id: jep20zqrg + job_id: jopryv30g job_status: Passed torchscript_onnx_qnn: - inference_time: 3473.0 - throughput: 287.93550244745177 + inference_time: 3469.0 + throughput: 288.2675122513693 estimated_peak_memory_range: min: 622592 - max: 217592784 + max: 173821024 primary_compute_unit: NPU precision: fp16 layer_info: @@ -167,7 +169,7 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 245 - job_id: jn5qedxn5 + job_id: j1p87qyk5 job_status: Passed reference_device_info: name: QCS8550 (Proxy) @@ -176,4 +178,57 @@ models: os_name: Android manufacturer: Qualcomm chipset: Qcs8550 - timestamp: '2024-04-23T18:42:33.051462Z' + timestamp: '2024-05-20T16:35:30.509521Z' + - torchscript_onnx_qnn: + inference_time: 3993.0 + throughput: 250.4382669671926 + estimated_peak_memory_range: + min: 602112 + max: 602112 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 245 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 245 + job_id: j2p0rz09p + job_status: Passed + torchscript_onnx_ort: + inference_time: 3504.0 + throughput: 285.38812785388126 + estimated_peak_memory_range: + min: 56750080 + max: 56750080 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 247 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 247 + job_id: jw561y46p + job_status: Passed + torchscript_onnx_ort_dml_gpu: + inference_time: 28994.0 + throughput: 34.48989446092295 + estimated_peak_memory_range: + min: 51179520 + max: 51179520 + primary_compute_unit: CPU + precision: fp32 + layer_info: + layers_on_npu: 0 + layers_on_gpu: 0 + layers_on_cpu: 110 + total_layers: 110 + job_id: j1p3mj03g + job_status: Passed + reference_device_info: + name: Snapdragon X Elite CRD + os: '11' + form_factor: Compute + os_name: Windows + manufacturer: Qualcomm + chipset: Snapdragon® X Elite + timestamp: '2024-05-20T16:35:30.509545Z' diff --git a/qai_hub_models/models/resnet101_quantized/README.md b/qai_hub_models/models/resnet101_quantized/README.md index 61c6fb55..f8ee5f88 100644 --- 
a/qai_hub_models/models/resnet101_quantized/README.md +++ b/qai_hub_models/models/resnet101_quantized/README.md @@ -14,6 +14,8 @@ accross various devices, can be found [here](https://aihub.qualcomm.com/models/r a hosted Qualcomm® device. + + ## Example & Usage diff --git a/qai_hub_models/models/resnet101_quantized/export.py b/qai_hub_models/models/resnet101_quantized/export.py index aacb445c..46a96d0c 100644 --- a/qai_hub_models/models/resnet101_quantized/export.py +++ b/qai_hub_models/models/resnet101_quantized/export.py @@ -123,9 +123,16 @@ def export_model( else: quant_calibration_data = model.get_calibration_data(target_runtime, input_spec) + # Convert outputs from channel last to channel first (preferred I/O format for QNN and TensorFlow Lite) + channel_last_flags = ( + " --force_channel_last_input image_tensor" + if target_runtime != TargetRuntime.ORT + else "" + ) + # 2. Compile the model to an on-device asset model_compile_options = model.get_hub_compile_options( - target_runtime, compile_options + " --force_channel_last_input image_tensor" + target_runtime, compile_options + channel_last_flags, hub_device ) print(f"Optimizing model {model_name} to run on-device") submitted_compile_job = hub.submit_compile_job( @@ -167,8 +174,12 @@ def export_model( if target_runtime == TargetRuntime.QNN: hub_inputs = get_qnn_inputs(compile_job, sample_inputs) # Convert inputs from channel first to channel last - hub_inputs = transpose_channel_first_to_last( - "image_tensor", sample_inputs, target_runtime + hub_inputs = ( + sample_inputs + if target_runtime == TargetRuntime.ORT + else transpose_channel_first_to_last( + "image_tensor", sample_inputs, target_runtime + ) ) submitted_inference_job = hub.submit_inference_job( model=compile_job.get_target_model(), @@ -207,7 +218,7 @@ def export_model( def main(): warnings.filterwarnings("ignore") - parser = export_parser(model_cls=Model, supports_ort=False) + parser = export_parser(model_cls=Model) args = parser.parse_args() export_model(**vars(args)) diff --git a/qai_hub_models/models/resnet101_quantized/perf.yaml b/qai_hub_models/models/resnet101_quantized/perf.yaml index 30dae2ef..e3636200 100644 --- a/qai_hub_models/models/resnet101_quantized/perf.yaml +++ b/qai_hub_models/models/resnet101_quantized/perf.yaml @@ -26,6 +26,7 @@ aggregated: - Samsung Galaxy S24 Ultra - Samsung Galaxy S24+ - Samsung Galaxy Tab S8 + - Snapdragon X Elite CRD - Xiaomi 12 - Xiaomi 12 Pro supported_chipsets: @@ -36,15 +37,16 @@ aggregated: - Snapdragon® 8 Gen 2 - Snapdragon® 8 Gen 3 - Snapdragon® 888 + - Snapdragon® X Elite models: - name: ResNet101Quantized performance_metrics: - torchscript_onnx_tflite: - inference_time: 1171.0 - throughput: 853.9709649871904 + inference_time: 1181.0 + throughput: 846.740050804403 estimated_peak_memory_range: - min: 28672 - max: 1746016 + min: 40960 + max: 2202864 primary_compute_unit: NPU precision: int8 layer_info: @@ -52,14 +54,14 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 148 - job_id: jz5709vrg + job_id: jlpevddo5 job_status: Passed torchscript_onnx_qnn: - inference_time: 1394.0 - throughput: 717.3601147776184 + inference_time: 1381.0 + throughput: 724.112961622013 estimated_peak_memory_range: - min: 12288 - max: 186309248 + min: 172032 + max: 8857136 primary_compute_unit: NPU precision: int8 layer_info: @@ -67,22 +69,22 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 146 - job_id: jopr8w1e5 + job_id: jmg94llw5 job_status: Passed torchscript_onnx_ort: - inference_time: 1804.0 - throughput: 554.3237250554324 
+ inference_time: 1574.0 + throughput: 635.3240152477764 estimated_peak_memory_range: - min: 12288 - max: 70503128 + min: 28672 + max: 151107432 primary_compute_unit: NPU precision: int8 layer_info: - layers_on_npu: 1 + layers_on_npu: 154 layers_on_gpu: 0 layers_on_cpu: 0 - total_layers: 1 - job_id: jogk78rop + total_layers: 154 + job_id: jqp4wll8g job_status: Passed reference_device_info: name: Samsung Galaxy S23 @@ -91,13 +93,13 @@ models: os_name: Android manufacturer: Samsung chipset: Snapdragon® 8 Gen 2 - timestamp: '2024-04-23T18:42:33.075407Z' + timestamp: '2024-05-20T16:35:30.539412Z' - torchscript_onnx_tflite: - inference_time: 922.0 - throughput: 1084.5986984815618 + inference_time: 889.0 + throughput: 1124.859392575928 estimated_peak_memory_range: - min: 16384 - max: 92718400 + min: 12288 + max: 92553280 primary_compute_unit: NPU precision: int8 layer_info: @@ -105,14 +107,14 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 148 - job_id: jo5mq8vqp + job_id: jygz733op job_status: Passed torchscript_onnx_qnn: - inference_time: 1061.0 - throughput: 942.5070688030161 + inference_time: 1045.0 + throughput: 956.9377990430622 estimated_peak_memory_range: - min: 167936 - max: 59048544 + min: 116203520 + max: 179474976 primary_compute_unit: NPU precision: int8 layer_info: @@ -120,22 +122,22 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 146 - job_id: jep20e3mg + job_id: jnp18448g job_status: Passed torchscript_onnx_ort: - inference_time: 1380.0 - throughput: 724.6376811594203 + inference_time: 1217.0 + throughput: 821.6926869350863 estimated_peak_memory_range: - min: 618496 - max: 46374032 + min: 0 + max: 43890864 primary_compute_unit: NPU precision: int8 layer_info: - layers_on_npu: 1 + layers_on_npu: 154 layers_on_gpu: 0 layers_on_cpu: 0 - total_layers: 1 - job_id: jn5qev9m5 + total_layers: 154 + job_id: j0px1kk3g job_status: Passed reference_device_info: name: Samsung Galaxy S24 @@ -144,51 +146,74 @@ models: os_name: Android manufacturer: Samsung chipset: Snapdragon® 8 Gen 3 - timestamp: '2024-04-23T18:42:33.075467Z' + timestamp: '2024-05-20T16:35:30.539439Z' - torchscript_onnx_tflite: - inference_time: 4806.0 - throughput: 208.07324178110696 + inference_time: 1190.0 + throughput: 840.3361344537815 estimated_peak_memory_range: - min: 24576 - max: 27299616 + min: 45056 + max: 1732448 primary_compute_unit: NPU precision: int8 layer_info: - layers_on_npu: 150 + layers_on_npu: 148 layers_on_gpu: 0 layers_on_cpu: 0 - total_layers: 150 - job_id: jvgdemjr5 + total_layers: 148 + job_id: jz5w9ee3p job_status: Passed torchscript_onnx_qnn: - inference_time: 'null' - throughput: 'null' + inference_time: 1380.0 + throughput: 724.6376811594203 estimated_peak_memory_range: - min: 0 - max: 0 - primary_compute_unit: 'null' - precision: 'null' + min: 20480 + max: 100094264 + primary_compute_unit: NPU + precision: int8 layer_info: - layers_on_npu: 0 + layers_on_npu: 146 layers_on_gpu: 0 layers_on_cpu: 0 - total_layers: 0 - job_id: j1gl6qnjg - job_status: Failed - torchscript_onnx_ort: - inference_time: 53190.0 - throughput: 18.80052641473961 + total_layers: 146 + job_id: jz57dyyv5 + job_status: Passed + reference_device_info: + name: QCS8550 (Proxy) + os: '12' + form_factor: Iot + os_name: Android + manufacturer: Qualcomm + chipset: Qcs8550 + timestamp: '2024-05-20T16:35:30.539460Z' + - torchscript_onnx_tflite: + inference_time: 4782.0 + throughput: 209.11752404851526 estimated_peak_memory_range: - min: 12480512 - max: 88971072 - primary_compute_unit: CPU - precision: fp32 + min: 
12288 + max: 29359024 + primary_compute_unit: NPU + precision: int8 layer_info: - layers_on_npu: 0 + layers_on_npu: 148 layers_on_gpu: 0 - layers_on_cpu: 156 - total_layers: 156 - job_id: j1gl6lelg + layers_on_cpu: 0 + total_layers: 148 + job_id: j2p0lx30p + job_status: Passed + torchscript_onnx_qnn: + inference_time: 5013.0 + throughput: 199.48134849391582 + estimated_peak_memory_range: + min: 163840 + max: 61142352 + primary_compute_unit: NPU + precision: int8 + layer_info: + layers_on_npu: 146 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 146 + job_id: j1p3ervm5 job_status: Passed reference_device_info: name: RB3 Gen 2 (Proxy) @@ -197,21 +222,21 @@ models: os_name: Android manufacturer: Qualcomm chipset: Qcs6490 - timestamp: '2024-04-23T18:42:33.075531Z' + timestamp: '2024-05-20T16:35:30.539477Z' - torchscript_onnx_tflite: - inference_time: 17430.0 - throughput: 57.37234652897303 + inference_time: 17166.0 + throughput: 58.25468950250495 estimated_peak_memory_range: - min: 16384 - max: 2096256 + min: 36864 + max: 4633128 primary_compute_unit: NPU precision: int8 layer_info: - layers_on_npu: 150 + layers_on_npu: 148 layers_on_gpu: 0 layers_on_cpu: 0 - total_layers: 150 - job_id: jlpew6v7p + total_layers: 148 + job_id: j1p8zk0qp job_status: Passed reference_device_info: name: RB5 (Proxy) @@ -220,42 +245,57 @@ models: os_name: Android manufacturer: Qualcomm chipset: Qcs8250 - timestamp: '2024-04-23T18:42:33.075558Z' - - torchscript_onnx_tflite: - inference_time: 1196.0 - throughput: 836.1204013377926 + timestamp: '2024-05-20T16:35:30.539488Z' + - torchscript_onnx_qnn: + inference_time: 1408.0 + throughput: 710.2272727272727 estimated_peak_memory_range: - min: 24576 - max: 2116496 + min: 356352 + max: 356352 primary_compute_unit: NPU precision: int8 layer_info: - layers_on_npu: 150 + layers_on_npu: 146 layers_on_gpu: 0 layers_on_cpu: 0 - total_layers: 150 - job_id: jygzo01o5 + total_layers: 146 + job_id: jvgdvxxrg job_status: Passed - torchscript_onnx_qnn: - inference_time: 1433.0 - throughput: 697.8367062107467 + torchscript_onnx_ort: + inference_time: 1445.0 + throughput: 692.0415224913495 estimated_peak_memory_range: - min: 61440 - max: 10820048 + min: 43081728 + max: 43081728 primary_compute_unit: NPU precision: int8 layer_info: - layers_on_npu: 148 + layers_on_npu: 154 layers_on_gpu: 0 layers_on_cpu: 0 - total_layers: 148 - job_id: jlpeen9op + total_layers: 154 + job_id: jo5mznndp + job_status: Passed + torchscript_onnx_ort_dml_gpu: + inference_time: 6842.0 + throughput: 146.15609470914936 + estimated_peak_memory_range: + min: 1634304 + max: 1634304 + primary_compute_unit: CPU + precision: fp32 + layer_info: + layers_on_npu: 0 + layers_on_gpu: 0 + layers_on_cpu: 156 + total_layers: 156 + job_id: jegne66kg job_status: Passed reference_device_info: - name: QCS8550 (Proxy) - os: '12' - form_factor: Iot - os_name: Android + name: Snapdragon X Elite CRD + os: '11' + form_factor: Compute + os_name: Windows manufacturer: Qualcomm - chipset: Qcs8550 - timestamp: '2024-04-23T18:42:33.075622Z' + chipset: Snapdragon® X Elite + timestamp: '2024-05-20T16:35:30.539511Z' diff --git a/qai_hub_models/models/resnet18/README.md b/qai_hub_models/models/resnet18/README.md index 956e4791..de48498f 100644 --- a/qai_hub_models/models/resnet18/README.md +++ b/qai_hub_models/models/resnet18/README.md @@ -14,6 +14,8 @@ accross various devices, can be found [here](https://aihub.qualcomm.com/models/r a hosted Qualcomm® device. 
+ + ## Example & Usage diff --git a/qai_hub_models/models/resnet18/export.py b/qai_hub_models/models/resnet18/export.py index bb2f3c45..c7157f5b 100644 --- a/qai_hub_models/models/resnet18/export.py +++ b/qai_hub_models/models/resnet18/export.py @@ -119,9 +119,16 @@ def export_model( model.eval() source_model = torch.jit.trace(model.to("cpu"), make_torch_inputs(input_spec)) + # Convert outputs from channel last to channel first (preferred I/O format for QNN and TensorFlow Lite) + channel_last_flags = ( + " --force_channel_last_input image_tensor" + if target_runtime != TargetRuntime.ORT + else "" + ) + # 2. Compile the model to an on-device asset model_compile_options = model.get_hub_compile_options( - target_runtime, compile_options + " --force_channel_last_input image_tensor" + target_runtime, compile_options + channel_last_flags, hub_device ) print(f"Optimizing model {model_name} to run on-device") submitted_compile_job = hub.submit_compile_job( @@ -159,8 +166,12 @@ def export_model( ) sample_inputs = model.sample_inputs(input_spec) # Convert inputs from channel first to channel last - hub_inputs = transpose_channel_first_to_last( - "image_tensor", sample_inputs, target_runtime + hub_inputs = ( + sample_inputs + if target_runtime == TargetRuntime.ORT + else transpose_channel_first_to_last( + "image_tensor", sample_inputs, target_runtime + ) ) submitted_inference_job = hub.submit_inference_job( model=compile_job.get_target_model(), @@ -197,7 +208,7 @@ def export_model( def main(): warnings.filterwarnings("ignore") - parser = export_parser(model_cls=Model, supports_ort=False) + parser = export_parser(model_cls=Model) args = parser.parse_args() export_model(**vars(args)) diff --git a/qai_hub_models/models/resnet18/perf.yaml b/qai_hub_models/models/resnet18/perf.yaml index 43a76a31..10d59660 100644 --- a/qai_hub_models/models/resnet18/perf.yaml +++ b/qai_hub_models/models/resnet18/perf.yaml @@ -22,6 +22,7 @@ aggregated: - Samsung Galaxy S24 Ultra - Samsung Galaxy S24+ - Samsung Galaxy Tab S8 + - Snapdragon X Elite CRD - Xiaomi 12 - Xiaomi 12 Pro supported_chipsets: @@ -30,15 +31,16 @@ aggregated: - Snapdragon® 8 Gen 2 - Snapdragon® 8 Gen 3 - Snapdragon® 888 + - Snapdragon® X Elite models: - name: ResNet18 performance_metrics: - torchscript_onnx_tflite: - inference_time: 1398.0 - throughput: 715.307582260372 + inference_time: 1410.0 + throughput: 709.2198581560284 estimated_peak_memory_range: - min: 24576 - max: 2046480 + min: 12288 + max: 1495520 primary_compute_unit: NPU precision: fp16 layer_info: @@ -46,14 +48,14 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 38 - job_id: jw56ewq7g + job_id: jopryvv0g job_status: Passed torchscript_onnx_qnn: - inference_time: 1489.0 - throughput: 671.591672263264 + inference_time: 1471.0 + throughput: 679.8096532970768 estimated_peak_memory_range: - min: 12288 - max: 83625152 + min: 16384 + max: 94295528 primary_compute_unit: NPU precision: fp16 layer_info: @@ -61,22 +63,22 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 53 - job_id: jwgok8edp + job_id: j2p0rzz9p job_status: Passed torchscript_onnx_ort: - inference_time: 1543.0 - throughput: 648.0881399870383 + inference_time: 1335.0 + throughput: 749.0636704119851 estimated_peak_memory_range: - min: 16384 - max: 82413040 + min: 61440 + max: 90905104 primary_compute_unit: NPU precision: fp16 layer_info: - layers_on_npu: 1 + layers_on_npu: 55 layers_on_gpu: 0 layers_on_cpu: 0 - total_layers: 1 - job_id: j7gjzqk85 + total_layers: 55 + job_id: j1glkvvjp job_status: Passed 
reference_device_info: name: Samsung Galaxy S23 @@ -85,13 +87,13 @@ models: os_name: Android manufacturer: Samsung chipset: Snapdragon® 8 Gen 2 - timestamp: '2024-04-23T18:42:33.110933Z' + timestamp: '2024-05-20T16:35:30.578687Z' - torchscript_onnx_tflite: - inference_time: 987.0 - throughput: 1013.1712259371834 + inference_time: 981.0 + throughput: 1019.367991845056 estimated_peak_memory_range: min: 12288 - max: 24202432 + max: 24130336 primary_compute_unit: NPU precision: fp16 layer_info: @@ -99,14 +101,14 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 38 - job_id: j1p3v6qzg + job_id: jep2mkkr5 job_status: Passed torchscript_onnx_qnn: inference_time: 1015.0 throughput: 985.2216748768473 estimated_peak_memory_range: - min: 0 - max: 31898144 + min: 618496 + max: 30836368 primary_compute_unit: NPU precision: fp16 layer_info: @@ -114,22 +116,22 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 53 - job_id: j1pv07zm5 + job_id: j1p87qqk5 job_status: Passed torchscript_onnx_ort: - inference_time: 1128.0 - throughput: 886.5248226950355 + inference_time: 947.0 + throughput: 1055.9662090813094 estimated_peak_memory_range: - min: 618496 - max: 19073216 + min: 0 + max: 20884768 primary_compute_unit: NPU precision: fp16 layer_info: - layers_on_npu: 1 + layers_on_npu: 55 layers_on_gpu: 0 layers_on_cpu: 0 - total_layers: 1 - job_id: jlpeey40p + total_layers: 55 + job_id: jw561yy6p job_status: Passed reference_device_info: name: Samsung Galaxy S24 @@ -138,13 +140,13 @@ models: os_name: Android manufacturer: Samsung chipset: Snapdragon® 8 Gen 3 - timestamp: '2024-04-23T18:42:33.110984Z' + timestamp: '2024-05-20T16:35:30.578715Z' - torchscript_onnx_tflite: - inference_time: 1376.0 - throughput: 726.7441860465116 + inference_time: 1408.0 + throughput: 710.2272727272727 estimated_peak_memory_range: - min: 20480 - max: 1963688 + min: 24576 + max: 1608360 primary_compute_unit: NPU precision: fp16 layer_info: @@ -152,14 +154,14 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 38 - job_id: j2p03xqnp + job_id: jqpyd118p job_status: Passed torchscript_onnx_qnn: - inference_time: 1485.0 - throughput: 673.4006734006734 + inference_time: 1473.0 + throughput: 678.8866259334691 estimated_peak_memory_range: - min: 16384 - max: 83668248 + min: 20480 + max: 83818904 primary_compute_unit: NPU precision: fp16 layer_info: @@ -167,7 +169,7 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 53 - job_id: j1gl6qrmg + job_id: jn5q266n5 job_status: Passed reference_device_info: name: QCS8550 (Proxy) @@ -176,4 +178,57 @@ models: os_name: Android manufacturer: Qualcomm chipset: Qcs8550 - timestamp: '2024-04-23T18:42:33.111013Z' + timestamp: '2024-05-20T16:35:30.578738Z' + - torchscript_onnx_qnn: + inference_time: 1572.0 + throughput: 636.1323155216285 + estimated_peak_memory_range: + min: 602112 + max: 602112 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 53 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 53 + job_id: jogkyeewp + job_status: Passed + torchscript_onnx_ort: + inference_time: 1329.0 + throughput: 752.4454477050414 + estimated_peak_memory_range: + min: 32423936 + max: 32423936 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 55 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 55 + job_id: j1p3mjj3g + job_status: Passed + torchscript_onnx_ort_dml_gpu: + inference_time: 6023.0 + throughput: 166.03021749958492 + estimated_peak_memory_range: + min: 22114304 + max: 22114304 + primary_compute_unit: CPU + precision: 
fp32 + layer_info: + layers_on_npu: 0 + layers_on_gpu: 0 + layers_on_cpu: 26 + total_layers: 26 + job_id: jwgov22q5 + job_status: Passed + reference_device_info: + name: Snapdragon X Elite CRD + os: '11' + form_factor: Compute + os_name: Windows + manufacturer: Qualcomm + chipset: Snapdragon® X Elite + timestamp: '2024-05-20T16:35:30.578760Z' diff --git a/qai_hub_models/models/resnet18_quantized/README.md b/qai_hub_models/models/resnet18_quantized/README.md index 4096efd2..5232da7c 100644 --- a/qai_hub_models/models/resnet18_quantized/README.md +++ b/qai_hub_models/models/resnet18_quantized/README.md @@ -14,6 +14,8 @@ accross various devices, can be found [here](https://aihub.qualcomm.com/models/r a hosted Qualcomm® device. + + ## Example & Usage diff --git a/qai_hub_models/models/resnet18_quantized/export.py b/qai_hub_models/models/resnet18_quantized/export.py index d3a4be9d..51943fbc 100644 --- a/qai_hub_models/models/resnet18_quantized/export.py +++ b/qai_hub_models/models/resnet18_quantized/export.py @@ -123,9 +123,16 @@ def export_model( else: quant_calibration_data = model.get_calibration_data(target_runtime, input_spec) + # Convert outputs from channel last to channel first (preferred I/O format for QNN and TensorFlow Lite) + channel_last_flags = ( + " --force_channel_last_input image_tensor" + if target_runtime != TargetRuntime.ORT + else "" + ) + # 2. Compile the model to an on-device asset model_compile_options = model.get_hub_compile_options( - target_runtime, compile_options + " --force_channel_last_input image_tensor" + target_runtime, compile_options + channel_last_flags, hub_device ) print(f"Optimizing model {model_name} to run on-device") submitted_compile_job = hub.submit_compile_job( @@ -167,8 +174,12 @@ def export_model( if target_runtime == TargetRuntime.QNN: hub_inputs = get_qnn_inputs(compile_job, sample_inputs) # Convert inputs from channel first to channel last - hub_inputs = transpose_channel_first_to_last( - "image_tensor", sample_inputs, target_runtime + hub_inputs = ( + sample_inputs + if target_runtime == TargetRuntime.ORT + else transpose_channel_first_to_last( + "image_tensor", sample_inputs, target_runtime + ) ) submitted_inference_job = hub.submit_inference_job( model=compile_job.get_target_model(), @@ -207,7 +218,7 @@ def export_model( def main(): warnings.filterwarnings("ignore") - parser = export_parser(model_cls=Model, supports_ort=False) + parser = export_parser(model_cls=Model) args = parser.parse_args() export_model(**vars(args)) diff --git a/qai_hub_models/models/resnet18_quantized/perf.yaml b/qai_hub_models/models/resnet18_quantized/perf.yaml index f0cea05c..a879df3c 100644 --- a/qai_hub_models/models/resnet18_quantized/perf.yaml +++ b/qai_hub_models/models/resnet18_quantized/perf.yaml @@ -26,6 +26,7 @@ aggregated: - Samsung Galaxy S24 Ultra - Samsung Galaxy S24+ - Samsung Galaxy Tab S8 + - Snapdragon X Elite CRD - Xiaomi 12 - Xiaomi 12 Pro supported_chipsets: @@ -36,15 +37,16 @@ aggregated: - Snapdragon® 8 Gen 2 - Snapdragon® 8 Gen 3 - Snapdragon® 888 + - Snapdragon® X Elite models: - name: ResNet18Quantized performance_metrics: - torchscript_onnx_tflite: - inference_time: 427.0 - throughput: 2341.92037470726 + inference_time: 421.0 + throughput: 2375.296912114014 estimated_peak_memory_range: - min: 24576 - max: 14744816 + min: 16384 + max: 14552648 primary_compute_unit: NPU precision: int8 layer_info: @@ -52,14 +54,14 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 39 - job_id: jz5w24mj5 + job_id: j1pvw6qkg job_status: Passed 
torchscript_onnx_qnn: - inference_time: 633.0 - throughput: 1579.778830963665 + inference_time: 636.0 + throughput: 1572.3270440251572 estimated_peak_memory_range: min: 16384 - max: 61110464 + max: 29686208 primary_compute_unit: NPU precision: int8 layer_info: @@ -67,22 +69,22 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 37 - job_id: jnp1y6qlp + job_id: jygz732op job_status: Passed torchscript_onnx_ort: - inference_time: 977.0 - throughput: 1023.5414534288639 + inference_time: 752.0 + throughput: 1329.787234042553 estimated_peak_memory_range: - min: 45056 - max: 142126416 + min: 24576 + max: 30406712 primary_compute_unit: NPU precision: int8 layer_info: - layers_on_npu: 1 + layers_on_npu: 45 layers_on_gpu: 0 layers_on_cpu: 0 - total_layers: 1 - job_id: jz5w24x65 + total_layers: 45 + job_id: jvgdvxnrg job_status: Passed reference_device_info: name: Samsung Galaxy S23 @@ -91,13 +93,13 @@ models: os_name: Android manufacturer: Samsung chipset: Snapdragon® 8 Gen 2 - timestamp: '2024-04-23T18:42:33.135184Z' + timestamp: '2024-05-20T16:35:30.609007Z' - torchscript_onnx_tflite: - inference_time: 351.0 - throughput: 2849.002849002849 + inference_time: 343.0 + throughput: 2915.451895043732 estimated_peak_memory_range: min: 12288 - max: 24268608 + max: 23898080 primary_compute_unit: NPU precision: int8 layer_info: @@ -105,14 +107,14 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 39 - job_id: jmg9jd9v5 + job_id: j7gjlvdvp job_status: Passed torchscript_onnx_qnn: inference_time: 480.0 throughput: 2083.3333333333335 estimated_peak_memory_range: - min: 0 - max: 26088768 + min: 163840 + max: 27124384 primary_compute_unit: NPU precision: int8 layer_info: @@ -120,22 +122,22 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 37 - job_id: jvgde27l5 + job_id: jz5w9ew3p job_status: Passed torchscript_onnx_ort: - inference_time: 750.0 - throughput: 1333.3333333333333 + inference_time: 631.0 + throughput: 1584.7860538827258 estimated_peak_memory_range: min: 0 - max: 19250192 + max: 21432704 primary_compute_unit: NPU precision: int8 layer_info: - layers_on_npu: 1 + layers_on_npu: 45 layers_on_gpu: 0 layers_on_cpu: 0 - total_layers: 1 - job_id: jmg9jd8l5 + total_layers: 45 + job_id: jz57dy2v5 job_status: Passed reference_device_info: name: Samsung Galaxy S24 @@ -144,51 +146,74 @@ models: os_name: Android manufacturer: Samsung chipset: Snapdragon® 8 Gen 3 - timestamp: '2024-04-23T18:42:33.135231Z' + timestamp: '2024-05-20T16:35:30.609034Z' - torchscript_onnx_tflite: - inference_time: 1555.0 - throughput: 643.0868167202573 + inference_time: 419.0 + throughput: 2386.634844868735 estimated_peak_memory_range: - min: 16384 - max: 14843920 + min: 12288 + max: 1584296 primary_compute_unit: NPU precision: int8 layer_info: - layers_on_npu: 41 + layers_on_npu: 39 layers_on_gpu: 0 layers_on_cpu: 0 - total_layers: 41 - job_id: jygzo0kx5 + total_layers: 39 + job_id: jlpevdoo5 job_status: Passed torchscript_onnx_qnn: - inference_time: 'null' - throughput: 'null' + inference_time: 630.0 + throughput: 1587.3015873015872 estimated_peak_memory_range: - min: 0 - max: 0 - primary_compute_unit: 'null' - precision: 'null' + min: 20480 + max: 29451032 + primary_compute_unit: NPU + precision: int8 layer_info: - layers_on_npu: 0 + layers_on_npu: 37 layers_on_gpu: 0 layers_on_cpu: 0 - total_layers: 0 - job_id: jqpyry0l5 - job_status: Failed - torchscript_onnx_ort: - inference_time: 11826.0 - throughput: 84.5594452900389 + total_layers: 37 + job_id: jnp18428g + job_status: Passed + reference_device_info: + 
name: QCS8550 (Proxy) + os: '12' + form_factor: Iot + os_name: Android + manufacturer: Qualcomm + chipset: Qcs8550 + timestamp: '2024-05-20T16:35:30.609055Z' + - torchscript_onnx_tflite: + inference_time: 1452.0 + throughput: 688.7052341597796 estimated_peak_memory_range: - min: 1556480 - max: 29105488 - primary_compute_unit: CPU - precision: fp32 + min: 12288 + max: 14834800 + primary_compute_unit: NPU + precision: int8 layer_info: - layers_on_npu: 0 + layers_on_npu: 39 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 39 + job_id: jo5m3lqyg + job_status: Passed + torchscript_onnx_qnn: + inference_time: 1817.0 + throughput: 550.357732526142 + estimated_peak_memory_range: + min: 12288 + max: 24293456 + primary_compute_unit: NPU + precision: int8 + layer_info: + layers_on_npu: 37 layers_on_gpu: 0 - layers_on_cpu: 47 - total_layers: 47 - job_id: jvgde20e5 + layers_on_cpu: 0 + total_layers: 37 + job_id: j1p8zkmzp job_status: Passed reference_device_info: name: RB3 Gen 2 (Proxy) @@ -197,21 +222,21 @@ models: os_name: Android manufacturer: Qualcomm chipset: Qcs6490 - timestamp: '2024-04-23T18:42:33.135269Z' + timestamp: '2024-05-20T16:35:30.609072Z' - torchscript_onnx_tflite: - inference_time: 7308.0 - throughput: 136.83634373289544 + inference_time: 7043.0 + throughput: 141.9849495953429 estimated_peak_memory_range: min: 12288 - max: 6786960 + max: 6989920 primary_compute_unit: NPU precision: int8 layer_info: - layers_on_npu: 41 + layers_on_npu: 39 layers_on_gpu: 0 layers_on_cpu: 0 - total_layers: 41 - job_id: jygzjz7zp + total_layers: 39 + job_id: jegn3wmv5 job_status: Passed reference_device_info: name: RB5 (Proxy) @@ -220,42 +245,57 @@ models: os_name: Android manufacturer: Qualcomm chipset: Qcs8250 - timestamp: '2024-04-23T18:42:33.135284Z' - - torchscript_onnx_tflite: - inference_time: 463.0 - throughput: 2159.827213822894 + timestamp: '2024-05-20T16:35:30.609083Z' + - torchscript_onnx_qnn: + inference_time: 768.0 + throughput: 1302.0833333333333 estimated_peak_memory_range: - min: 20480 - max: 15182520 + min: 569344 + max: 569344 primary_compute_unit: NPU precision: int8 layer_info: - layers_on_npu: 41 + layers_on_npu: 37 layers_on_gpu: 0 layers_on_cpu: 0 - total_layers: 41 - job_id: jlpeen3vp + total_layers: 37 + job_id: jmg94l0w5 job_status: Passed - torchscript_onnx_qnn: - inference_time: 680.0 - throughput: 1470.5882352941176 + torchscript_onnx_ort: + inference_time: 714.0 + throughput: 1400.5602240896358 estimated_peak_memory_range: - min: 24576 - max: 60765408 + min: 11710464 + max: 11710464 primary_compute_unit: NPU precision: int8 layer_info: - layers_on_npu: 39 + layers_on_npu: 45 layers_on_gpu: 0 layers_on_cpu: 0 - total_layers: 39 - job_id: jo5mqly9p + total_layers: 45 + job_id: jqp4wln8g + job_status: Passed + torchscript_onnx_ort_dml_gpu: + inference_time: 143079.0 + throughput: 6.989145856484879 + estimated_peak_memory_range: + min: 7467008 + max: 7467008 + primary_compute_unit: GPU + precision: int8 + layer_info: + layers_on_npu: 0 + layers_on_gpu: 1 + layers_on_cpu: 0 + total_layers: 1 + job_id: j0px1k93g job_status: Passed reference_device_info: - name: QCS8550 (Proxy) - os: '12' - form_factor: Iot - os_name: Android + name: Snapdragon X Elite CRD + os: '11' + form_factor: Compute + os_name: Windows manufacturer: Qualcomm - chipset: Qcs8550 - timestamp: '2024-04-23T18:42:33.135313Z' + chipset: Snapdragon® X Elite + timestamp: '2024-05-20T16:35:30.609107Z' diff --git a/qai_hub_models/models/resnet50/README.md b/qai_hub_models/models/resnet50/README.md index 
9723fbec..6abe8dfa 100644 --- a/qai_hub_models/models/resnet50/README.md +++ b/qai_hub_models/models/resnet50/README.md @@ -14,6 +14,8 @@ accross various devices, can be found [here](https://aihub.qualcomm.com/models/r a hosted Qualcomm® device. + + ## Example & Usage diff --git a/qai_hub_models/models/resnet50/export.py b/qai_hub_models/models/resnet50/export.py index d36c2229..b7c78bc7 100644 --- a/qai_hub_models/models/resnet50/export.py +++ b/qai_hub_models/models/resnet50/export.py @@ -119,9 +119,16 @@ def export_model( model.eval() source_model = torch.jit.trace(model.to("cpu"), make_torch_inputs(input_spec)) + # Convert outputs from channel last to channel first (preferred I/O format for QNN and TensorFlow Lite) + channel_last_flags = ( + " --force_channel_last_input image_tensor" + if target_runtime != TargetRuntime.ORT + else "" + ) + # 2. Compile the model to an on-device asset model_compile_options = model.get_hub_compile_options( - target_runtime, compile_options + " --force_channel_last_input image_tensor" + target_runtime, compile_options + channel_last_flags, hub_device ) print(f"Optimizing model {model_name} to run on-device") submitted_compile_job = hub.submit_compile_job( @@ -159,8 +166,12 @@ def export_model( ) sample_inputs = model.sample_inputs(input_spec) # Convert inputs from channel first to channel last - hub_inputs = transpose_channel_first_to_last( - "image_tensor", sample_inputs, target_runtime + hub_inputs = ( + sample_inputs + if target_runtime == TargetRuntime.ORT + else transpose_channel_first_to_last( + "image_tensor", sample_inputs, target_runtime + ) ) submitted_inference_job = hub.submit_inference_job( model=compile_job.get_target_model(), @@ -197,7 +208,7 @@ def export_model( def main(): warnings.filterwarnings("ignore") - parser = export_parser(model_cls=Model, supports_ort=False) + parser = export_parser(model_cls=Model) args = parser.parse_args() export_model(**vars(args)) diff --git a/qai_hub_models/models/resnet50/perf.yaml b/qai_hub_models/models/resnet50/perf.yaml index 91e9cb28..9c0750f2 100644 --- a/qai_hub_models/models/resnet50/perf.yaml +++ b/qai_hub_models/models/resnet50/perf.yaml @@ -22,6 +22,7 @@ aggregated: - Samsung Galaxy S24 Ultra - Samsung Galaxy S24+ - Samsung Galaxy Tab S8 + - Snapdragon X Elite CRD - Xiaomi 12 - Xiaomi 12 Pro supported_chipsets: @@ -30,15 +31,16 @@ aggregated: - Snapdragon® 8 Gen 2 - Snapdragon® 8 Gen 3 - Snapdragon® 888 + - Snapdragon® X Elite models: - name: ResNet50 performance_metrics: - torchscript_onnx_tflite: - inference_time: 2302.0 - throughput: 434.4048653344918 + inference_time: 2272.0 + throughput: 440.14084507042253 estimated_peak_memory_range: - min: 20480 - max: 2370264 + min: 12288 + max: 1939880 primary_compute_unit: NPU precision: fp16 layer_info: @@ -46,14 +48,14 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 79 - job_id: jqp4k38vg + job_id: jo5mznedp job_status: Passed torchscript_onnx_qnn: - inference_time: 2340.0 - throughput: 427.35042735042737 + inference_time: 2382.0 + throughput: 419.81528127623847 estimated_peak_memory_range: - min: 20480 - max: 185567384 + min: 622592 + max: 186262680 primary_compute_unit: NPU precision: fp16 layer_info: @@ -61,22 +63,22 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 126 - job_id: jegnlkxr5 + job_id: jep2mkxr5 job_status: Passed torchscript_onnx_ort: - inference_time: 2587.0 - throughput: 386.5481252415926 + inference_time: 2370.0 + throughput: 421.9409282700422 estimated_peak_memory_range: min: 12288 - max: 217558712 + max: 
205580248 primary_compute_unit: NPU precision: fp16 layer_info: - layers_on_npu: 1 + layers_on_npu: 128 layers_on_gpu: 0 layers_on_cpu: 0 - total_layers: 1 - job_id: jep20ej4g + total_layers: 128 + job_id: jogkyevwp job_status: Passed reference_device_info: name: Samsung Galaxy S23 @@ -85,13 +87,13 @@ models: os_name: Android manufacturer: Samsung chipset: Snapdragon® 8 Gen 2 - timestamp: '2024-04-23T18:42:33.170270Z' + timestamp: '2024-05-20T16:35:30.648313Z' - torchscript_onnx_tflite: - inference_time: 1648.0 - throughput: 606.7961165048544 + inference_time: 1645.0 + throughput: 607.90273556231 estimated_peak_memory_range: min: 16384 - max: 69510112 + max: 70261792 primary_compute_unit: NPU precision: fp16 layer_info: @@ -99,14 +101,14 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 79 - job_id: jo5mq84wp + job_id: jegne60kg job_status: Passed torchscript_onnx_qnn: - inference_time: 1630.0 - throughput: 613.4969325153374 + inference_time: 1682.0 + throughput: 594.5303210463734 estimated_peak_memory_range: min: 618496 - max: 51350896 + max: 50091680 primary_compute_unit: NPU precision: fp16 layer_info: @@ -114,22 +116,22 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 126 - job_id: jopr8w995 + job_id: jqpyd1z8p job_status: Passed torchscript_onnx_ort: - inference_time: 1868.0 - throughput: 535.3319057815846 + inference_time: 1734.0 + throughput: 576.7012687427913 estimated_peak_memory_range: - min: 0 - max: 35536992 + min: 142139392 + max: 174512736 primary_compute_unit: NPU precision: fp16 layer_info: - layers_on_npu: 1 + layers_on_npu: 128 layers_on_gpu: 0 layers_on_cpu: 0 - total_layers: 1 - job_id: jqpyrmn75 + total_layers: 128 + job_id: jn5q260n5 job_status: Passed reference_device_info: name: Samsung Galaxy S24 @@ -138,13 +140,13 @@ models: os_name: Android manufacturer: Samsung chipset: Snapdragon® 8 Gen 3 - timestamp: '2024-04-23T18:42:33.170321Z' + timestamp: '2024-05-20T16:35:30.648341Z' - torchscript_onnx_tflite: - inference_time: 2299.0 - throughput: 434.97172683775557 + inference_time: 2272.0 + throughput: 440.14084507042253 estimated_peak_memory_range: - min: 24576 - max: 2160472 + min: 28672 + max: 2414432 primary_compute_unit: NPU precision: fp16 layer_info: @@ -152,14 +154,14 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 79 - job_id: jvgdey1z5 + job_id: jopryv60g job_status: Passed torchscript_onnx_qnn: - inference_time: 2343.0 - throughput: 426.8032437046522 + inference_time: 2386.0 + throughput: 419.11148365465215 estimated_peak_memory_range: - min: 626688 - max: 186221872 + min: 618496 + max: 186113032 primary_compute_unit: NPU precision: fp16 layer_info: @@ -167,7 +169,7 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 126 - job_id: jopr8m375 + job_id: j1p87q2k5 job_status: Passed reference_device_info: name: QCS8550 (Proxy) @@ -176,4 +178,57 @@ models: os_name: Android manufacturer: Qualcomm chipset: Qcs8550 - timestamp: '2024-04-23T18:42:33.170363Z' + timestamp: '2024-05-20T16:35:30.648359Z' + - torchscript_onnx_qnn: + inference_time: 2691.0 + throughput: 371.6090672612412 + estimated_peak_memory_range: + min: 602112 + max: 602112 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 126 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 126 + job_id: j2p0rz49p + job_status: Passed + torchscript_onnx_ort: + inference_time: 2284.0 + throughput: 437.82837127845886 + estimated_peak_memory_range: + min: 76500992 + max: 76500992 + primary_compute_unit: NPU + precision: fp16 + layer_info: + 
layers_on_npu: 128 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 128 + job_id: j1glkv4jp + job_status: Passed + torchscript_onnx_ort_dml_gpu: + inference_time: 15563.0 + throughput: 64.2549636959455 + estimated_peak_memory_range: + min: 40939520 + max: 40939520 + primary_compute_unit: CPU + precision: fp32 + layer_info: + layers_on_npu: 0 + layers_on_gpu: 0 + layers_on_cpu: 59 + total_layers: 59 + job_id: jw561y26p + job_status: Passed + reference_device_info: + name: Snapdragon X Elite CRD + os: '11' + form_factor: Compute + os_name: Windows + manufacturer: Qualcomm + chipset: Snapdragon® X Elite + timestamp: '2024-05-20T16:35:30.648386Z' diff --git a/qai_hub_models/models/resnext101/README.md b/qai_hub_models/models/resnext101/README.md index f738cf44..dbe49a2a 100644 --- a/qai_hub_models/models/resnext101/README.md +++ b/qai_hub_models/models/resnext101/README.md @@ -14,6 +14,8 @@ accross various devices, can be found [here](https://aihub.qualcomm.com/models/r a hosted Qualcomm® device. + + ## Example & Usage diff --git a/qai_hub_models/models/resnext101/export.py b/qai_hub_models/models/resnext101/export.py index a8654b64..703436a5 100644 --- a/qai_hub_models/models/resnext101/export.py +++ b/qai_hub_models/models/resnext101/export.py @@ -119,9 +119,16 @@ def export_model( model.eval() source_model = torch.jit.trace(model.to("cpu"), make_torch_inputs(input_spec)) + # Convert outputs from channel last to channel first (preferred I/O format for QNN and TensorFlow Lite) + channel_last_flags = ( + " --force_channel_last_input image_tensor" + if target_runtime != TargetRuntime.ORT + else "" + ) + # 2. Compile the model to an on-device asset model_compile_options = model.get_hub_compile_options( - target_runtime, compile_options + " --force_channel_last_input image_tensor" + target_runtime, compile_options + channel_last_flags, hub_device ) print(f"Optimizing model {model_name} to run on-device") submitted_compile_job = hub.submit_compile_job( @@ -159,8 +166,12 @@ def export_model( ) sample_inputs = model.sample_inputs(input_spec) # Convert inputs from channel first to channel last - hub_inputs = transpose_channel_first_to_last( - "image_tensor", sample_inputs, target_runtime + hub_inputs = ( + sample_inputs + if target_runtime == TargetRuntime.ORT + else transpose_channel_first_to_last( + "image_tensor", sample_inputs, target_runtime + ) ) submitted_inference_job = hub.submit_inference_job( model=compile_job.get_target_model(), @@ -197,7 +208,7 @@ def export_model( def main(): warnings.filterwarnings("ignore") - parser = export_parser(model_cls=Model, supports_ort=False) + parser = export_parser(model_cls=Model) args = parser.parse_args() export_model(**vars(args)) diff --git a/qai_hub_models/models/resnext101/perf.yaml b/qai_hub_models/models/resnext101/perf.yaml index 779de30a..042adf13 100644 --- a/qai_hub_models/models/resnext101/perf.yaml +++ b/qai_hub_models/models/resnext101/perf.yaml @@ -22,6 +22,7 @@ aggregated: - Samsung Galaxy S24 Ultra - Samsung Galaxy S24+ - Samsung Galaxy Tab S8 + - Snapdragon X Elite CRD - Xiaomi 12 - Xiaomi 12 Pro supported_chipsets: @@ -30,15 +31,16 @@ aggregated: - Snapdragon® 8 Gen 2 - Snapdragon® 8 Gen 3 - Snapdragon® 888 + - Snapdragon® X Elite models: - name: ResNeXt101 performance_metrics: - torchscript_onnx_tflite: - inference_time: 6665.0 - throughput: 150.03750937734435 + inference_time: 6708.0 + throughput: 149.0757304710793 estimated_peak_memory_range: - min: 53248 - max: 3235600 + min: 24576 + max: 2889376 primary_compute_unit: NPU 
precision: fp16 layer_info: @@ -46,14 +48,14 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 147 - job_id: jogk78o2p + job_id: jo5mzn69p job_status: Passed torchscript_onnx_qnn: - inference_time: 6665.0 - throughput: 150.03750937734435 + inference_time: 6648.0 + throughput: 150.42117930204572 estimated_peak_memory_range: - min: 94208 - max: 34973960 + min: 16384 + max: 35804344 primary_compute_unit: NPU precision: fp16 layer_info: @@ -61,22 +63,22 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 245 - job_id: j1gl6lo8g + job_id: jep2mk9q5 job_status: Passed torchscript_onnx_ort: - inference_time: 7040.0 - throughput: 142.04545454545453 + inference_time: 6983.0 + throughput: 143.20492624946297 estimated_peak_memory_range: - min: 0 - max: 454692632 + min: 32768 + max: 451743424 primary_compute_unit: NPU precision: fp16 layer_info: - layers_on_npu: 1 + layers_on_npu: 247 layers_on_gpu: 0 layers_on_cpu: 0 - total_layers: 1 - job_id: j1p3v6xlg + total_layers: 247 + job_id: jogkyeqnp job_status: Passed reference_device_info: name: Samsung Galaxy S23 @@ -85,13 +87,13 @@ models: os_name: Android manufacturer: Samsung chipset: Snapdragon® 8 Gen 2 - timestamp: '2024-04-23T18:42:33.282825Z' + timestamp: '2024-05-20T16:35:30.789744Z' - torchscript_onnx_tflite: - inference_time: 4816.0 - throughput: 207.64119601328903 + inference_time: 4868.0 + throughput: 205.42317173377157 estimated_peak_memory_range: min: 20480 - max: 366481792 + max: 365272832 primary_compute_unit: NPU precision: fp16 layer_info: @@ -99,14 +101,14 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 147 - job_id: jn5qevz45 + job_id: jegne6mqg job_status: Passed torchscript_onnx_qnn: - inference_time: 4797.0 - throughput: 208.46362309776944 + inference_time: 4799.0 + throughput: 208.37674515524068 estimated_peak_memory_range: - min: 618496 - max: 131176640 + min: 0 + max: 123278800 primary_compute_unit: NPU precision: fp16 layer_info: @@ -114,22 +116,22 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 245 - job_id: jw56ewr0g + job_id: jqpyd1jlp job_status: Passed torchscript_onnx_ort: - inference_time: 5231.0 - throughput: 191.16803670426305 + inference_time: 5124.0 + throughput: 195.160031225605 estimated_peak_memory_range: - min: 618496 - max: 100656704 + min: 626688 + max: 90094496 primary_compute_unit: NPU precision: fp16 layer_info: - layers_on_npu: 1 + layers_on_npu: 247 layers_on_gpu: 0 layers_on_cpu: 0 - total_layers: 1 - job_id: jwgok8oxp + total_layers: 247 + job_id: jn5q26ro5 job_status: Passed reference_device_info: name: Samsung Galaxy S24 @@ -138,13 +140,13 @@ models: os_name: Android manufacturer: Samsung chipset: Snapdragon® 8 Gen 3 - timestamp: '2024-04-23T18:42:33.282896Z' + timestamp: '2024-05-20T16:35:30.789769Z' - torchscript_onnx_tflite: - inference_time: 6712.0 - throughput: 148.98688915375448 + inference_time: 6665.0 + throughput: 150.03750937734435 estimated_peak_memory_range: - min: 36864 - max: 3053288 + min: 57344 + max: 2680608 primary_compute_unit: NPU precision: fp16 layer_info: @@ -152,14 +154,14 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 147 - job_id: j7gjz6215 + job_id: jopryv27g job_status: Passed torchscript_onnx_qnn: - inference_time: 6586.0 - throughput: 151.83723048891588 + inference_time: 6622.0 + throughput: 151.01177891875565 estimated_peak_memory_range: - min: 16384 - max: 36067624 + min: 0 + max: 37100696 primary_compute_unit: NPU precision: fp16 layer_info: @@ -167,7 +169,7 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 245 
- job_id: jmg9j7ym5 + job_id: j1p87qmo5 job_status: Passed reference_device_info: name: QCS8550 (Proxy) @@ -176,4 +178,57 @@ models: os_name: Android manufacturer: Qualcomm chipset: Qcs8550 - timestamp: '2024-04-23T18:42:33.282959Z' + timestamp: '2024-05-20T16:35:30.789786Z' + - torchscript_onnx_qnn: + inference_time: 9078.0 + throughput: 110.15642211940956 + estimated_peak_memory_range: + min: 602112 + max: 602112 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 245 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 245 + job_id: j2p0rz2np + job_status: Passed + torchscript_onnx_ort: + inference_time: 6736.0 + throughput: 148.45605700712588 + estimated_peak_memory_range: + min: 108900352 + max: 108900352 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 247 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 247 + job_id: j1glkv3mp + job_status: Passed + torchscript_onnx_ort_dml_gpu: + inference_time: 63884.0 + throughput: 15.653371736271993 + estimated_peak_memory_range: + min: 101425152 + max: 101425152 + primary_compute_unit: CPU + precision: fp32 + layer_info: + layers_on_npu: 0 + layers_on_gpu: 0 + layers_on_cpu: 110 + total_layers: 110 + job_id: jw561ynyp + job_status: Passed + reference_device_info: + name: Snapdragon X Elite CRD + os: '11' + form_factor: Compute + os_name: Windows + manufacturer: Qualcomm + chipset: Snapdragon® X Elite + timestamp: '2024-05-20T16:35:30.789808Z' diff --git a/qai_hub_models/models/resnext101_quantized/README.md b/qai_hub_models/models/resnext101_quantized/README.md index bc51c825..ae6f1db4 100644 --- a/qai_hub_models/models/resnext101_quantized/README.md +++ b/qai_hub_models/models/resnext101_quantized/README.md @@ -14,6 +14,8 @@ accross various devices, can be found [here](https://aihub.qualcomm.com/models/r a hosted Qualcomm® device. + + ## Example & Usage diff --git a/qai_hub_models/models/resnext101_quantized/export.py b/qai_hub_models/models/resnext101_quantized/export.py index 50847416..d449e32d 100644 --- a/qai_hub_models/models/resnext101_quantized/export.py +++ b/qai_hub_models/models/resnext101_quantized/export.py @@ -123,9 +123,16 @@ def export_model( else: quant_calibration_data = model.get_calibration_data(target_runtime, input_spec) + # Convert outputs from channel last to channel first (preferred I/O format for QNN and TensorFlow Lite) + channel_last_flags = ( + " --force_channel_last_input image_tensor" + if target_runtime != TargetRuntime.ORT + else "" + ) + # 2. 
Compile the model to an on-device asset model_compile_options = model.get_hub_compile_options( - target_runtime, compile_options + " --force_channel_last_input image_tensor" + target_runtime, compile_options + channel_last_flags, hub_device ) print(f"Optimizing model {model_name} to run on-device") submitted_compile_job = hub.submit_compile_job( @@ -167,8 +174,12 @@ def export_model( if target_runtime == TargetRuntime.QNN: hub_inputs = get_qnn_inputs(compile_job, sample_inputs) # Convert inputs from channel first to channel last - hub_inputs = transpose_channel_first_to_last( - "image_tensor", sample_inputs, target_runtime + hub_inputs = ( + sample_inputs + if target_runtime == TargetRuntime.ORT + else transpose_channel_first_to_last( + "image_tensor", sample_inputs, target_runtime + ) ) submitted_inference_job = hub.submit_inference_job( model=compile_job.get_target_model(), @@ -207,7 +218,7 @@ def export_model( def main(): warnings.filterwarnings("ignore") - parser = export_parser(model_cls=Model, supports_qnn=False, supports_ort=False) + parser = export_parser(model_cls=Model) args = parser.parse_args() export_model(**vars(args)) diff --git a/qai_hub_models/models/resnext101_quantized/perf.yaml b/qai_hub_models/models/resnext101_quantized/perf.yaml index 1e112f32..f6ab2ff3 100644 --- a/qai_hub_models/models/resnext101_quantized/perf.yaml +++ b/qai_hub_models/models/resnext101_quantized/perf.yaml @@ -26,6 +26,7 @@ aggregated: - Samsung Galaxy S24 Ultra - Samsung Galaxy S24+ - Samsung Galaxy Tab S8 + - Snapdragon X Elite CRD - Xiaomi 12 - Xiaomi 12 Pro supported_chipsets: @@ -36,15 +37,16 @@ aggregated: - Snapdragon® 8 Gen 2 - Snapdragon® 8 Gen 3 - Snapdragon® 888 + - Snapdragon® X Elite models: - name: ResNeXt101Quantized performance_metrics: - torchscript_onnx_tflite: - inference_time: 2913.0 - throughput: 343.2887058015791 + inference_time: 3033.0 + throughput: 329.7065611605671 estimated_peak_memory_range: - min: 24576 - max: 1706912 + min: 16384 + max: 2184152 primary_compute_unit: NPU precision: int8 layer_info: @@ -52,22 +54,37 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 148 - job_id: j7gjzqox5 + job_id: jlpevdkv5 job_status: Passed - torchscript_onnx_ort: - inference_time: 3921.0 - throughput: 255.03698036215252 + torchscript_onnx_qnn: + inference_time: 3107.0 + throughput: 321.853878339234 estimated_peak_memory_range: min: 12288 - max: 136560960 + max: 32784840 + primary_compute_unit: NPU + precision: int8 + layer_info: + layers_on_npu: 146 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 146 + job_id: jmg94lw85 + job_status: Passed + torchscript_onnx_ort: + inference_time: 3421.0 + throughput: 292.3121894182987 + estimated_peak_memory_range: + min: 0 + max: 137016264 primary_compute_unit: NPU precision: int8 layer_info: - layers_on_npu: 1 + layers_on_npu: 154 layers_on_gpu: 0 layers_on_cpu: 0 - total_layers: 1 - job_id: jygzon8k5 + total_layers: 154 + job_id: jqp4wlv1g job_status: Passed reference_device_info: name: Samsung Galaxy S23 @@ -76,13 +93,13 @@ models: os_name: Android manufacturer: Samsung chipset: Snapdragon® 8 Gen 2 - timestamp: '2024-04-23T18:42:33.306938Z' + timestamp: '2024-05-20T16:35:30.820290Z' - torchscript_onnx_tflite: - inference_time: 2167.0 - throughput: 461.4674665436087 + inference_time: 2053.0 + throughput: 487.0920603994155 estimated_peak_memory_range: min: 12288 - max: 262604528 + max: 258014032 primary_compute_unit: NPU precision: int8 layer_info: @@ -90,22 +107,37 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 148 - 
job_id: jlpeey81p + job_id: jygz73rxp + job_status: Passed + torchscript_onnx_qnn: + inference_time: 2280.0 + throughput: 438.5964912280702 + estimated_peak_memory_range: + min: 12288 + max: 118044256 + primary_compute_unit: NPU + precision: int8 + layer_info: + layers_on_npu: 146 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 146 + job_id: jnp184e7g job_status: Passed torchscript_onnx_ort: - inference_time: 2990.0 - throughput: 334.44816053511704 + inference_time: 2540.0 + throughput: 393.7007874015748 estimated_peak_memory_range: min: 618496 - max: 95251808 + max: 92001632 primary_compute_unit: NPU precision: int8 layer_info: - layers_on_npu: 1 + layers_on_npu: 154 layers_on_gpu: 0 layers_on_cpu: 0 - total_layers: 1 - job_id: jz5w24165 + total_layers: 154 + job_id: j0px1kylg job_status: Passed reference_device_info: name: Samsung Galaxy S24 @@ -114,36 +146,74 @@ models: os_name: Android manufacturer: Samsung chipset: Snapdragon® 8 Gen 3 - timestamp: '2024-04-23T18:42:33.306993Z' + timestamp: '2024-05-20T16:35:30.820316Z' - torchscript_onnx_tflite: - inference_time: 10468.0 - throughput: 95.52923194497517 + inference_time: 2932.0 + throughput: 341.06412005457025 estimated_peak_memory_range: - min: 32768 - max: 199144352 + min: 24576 + max: 2554384 primary_compute_unit: NPU precision: int8 layer_info: - layers_on_npu: 150 + layers_on_npu: 148 layers_on_gpu: 0 layers_on_cpu: 0 - total_layers: 150 - job_id: jlpee0v8p + total_layers: 148 + job_id: jz5w9eqmp job_status: Passed - torchscript_onnx_ort: - inference_time: 88885.0 - throughput: 11.250492209034146 + torchscript_onnx_qnn: + inference_time: 3081.0 + throughput: 324.5699448231094 estimated_peak_memory_range: - min: 8159232 - max: 88001424 - primary_compute_unit: CPU - precision: fp32 + min: 16384 + max: 35435296 + primary_compute_unit: NPU + precision: int8 layer_info: - layers_on_npu: 0 + layers_on_npu: 146 layers_on_gpu: 0 - layers_on_cpu: 156 - total_layers: 156 - job_id: jmg9jdxl5 + layers_on_cpu: 0 + total_layers: 146 + job_id: jz57dyx95 + job_status: Passed + reference_device_info: + name: QCS8550 (Proxy) + os: '12' + form_factor: Iot + os_name: Android + manufacturer: Qualcomm + chipset: Qcs8550 + timestamp: '2024-05-20T16:35:30.820333Z' + - torchscript_onnx_tflite: + inference_time: 10331.0 + throughput: 96.79605072113058 + estimated_peak_memory_range: + min: 45056 + max: 199157040 + primary_compute_unit: NPU + precision: int8 + layer_info: + layers_on_npu: 148 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 148 + job_id: jygzr07z5 + job_status: Passed + torchscript_onnx_qnn: + inference_time: 11010.0 + throughput: 90.82652134423252 + estimated_peak_memory_range: + min: 167936 + max: 124990144 + primary_compute_unit: NPU + precision: int8 + layer_info: + layers_on_npu: 146 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 146 + job_id: jqp4v2wqp job_status: Passed reference_device_info: name: RB3 Gen 2 (Proxy) @@ -152,21 +222,21 @@ models: os_name: Android manufacturer: Qualcomm chipset: Qcs6490 - timestamp: '2024-04-23T18:42:33.307043Z' + timestamp: '2024-05-20T16:35:30.820350Z' - torchscript_onnx_tflite: - inference_time: 134216.0 - throughput: 7.450676521428146 + inference_time: 133798.0 + throughput: 7.473953272844138 estimated_peak_memory_range: - min: 24576 - max: 357047544 + min: 184320 + max: 355878408 primary_compute_unit: GPU precision: int8 layer_info: - layers_on_npu: 14 + layers_on_npu: 12 layers_on_gpu: 125 layers_on_cpu: 11 - total_layers: 150 - job_id: jmg9yo4q5 + total_layers: 148 + job_id: 
jz5wqr9z5 job_status: Passed reference_device_info: name: RB5 (Proxy) @@ -175,27 +245,57 @@ models: os_name: Android manufacturer: Qualcomm chipset: Qcs8250 - timestamp: '2024-04-23T18:42:33.307071Z' - - torchscript_onnx_tflite: - inference_time: 2909.0 - throughput: 343.7607425232039 + timestamp: '2024-05-20T16:35:30.820361Z' + - torchscript_onnx_qnn: + inference_time: 3328.0 + throughput: 300.4807692307692 estimated_peak_memory_range: - min: 16384 - max: 2753672 + min: 249856 + max: 249856 primary_compute_unit: NPU precision: int8 layer_info: - layers_on_npu: 150 + layers_on_npu: 146 layers_on_gpu: 0 layers_on_cpu: 0 - total_layers: 150 - job_id: j1p3vlqxg + total_layers: 146 + job_id: jvgdvxozg + job_status: Passed + torchscript_onnx_ort: + inference_time: 3366.0 + throughput: 297.08853238265004 + estimated_peak_memory_range: + min: 137375744 + max: 137375744 + primary_compute_unit: NPU + precision: int8 + layer_info: + layers_on_npu: 154 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 154 + job_id: jo5mzn39p + job_status: Passed + torchscript_onnx_ort_dml_gpu: + inference_time: 228816.0 + throughput: 4.370323753583666 + estimated_peak_memory_range: + min: 1384448 + max: 1384448 + primary_compute_unit: CPU + precision: fp32 + layer_info: + layers_on_npu: 0 + layers_on_gpu: 0 + layers_on_cpu: 156 + total_layers: 156 + job_id: jegne63qg job_status: Passed reference_device_info: - name: QCS8550 (Proxy) - os: '12' - form_factor: Iot - os_name: Android + name: Snapdragon X Elite CRD + os: '11' + form_factor: Compute + os_name: Windows manufacturer: Qualcomm - chipset: Qcs8550 - timestamp: '2024-04-23T18:42:33.307097Z' + chipset: Snapdragon® X Elite + timestamp: '2024-05-20T16:35:30.820384Z' diff --git a/qai_hub_models/models/resnext50/README.md b/qai_hub_models/models/resnext50/README.md index 18cce04c..57bd0206 100644 --- a/qai_hub_models/models/resnext50/README.md +++ b/qai_hub_models/models/resnext50/README.md @@ -14,6 +14,8 @@ accross various devices, can be found [here](https://aihub.qualcomm.com/models/r a hosted Qualcomm® device. + + ## Example & Usage diff --git a/qai_hub_models/models/resnext50/export.py b/qai_hub_models/models/resnext50/export.py index 25d7ee96..3fc8a566 100644 --- a/qai_hub_models/models/resnext50/export.py +++ b/qai_hub_models/models/resnext50/export.py @@ -119,9 +119,16 @@ def export_model( model.eval() source_model = torch.jit.trace(model.to("cpu"), make_torch_inputs(input_spec)) + # Convert outputs from channel last to channel first (preferred I/O format for QNN and TensorFlow Lite) + channel_last_flags = ( + " --force_channel_last_input image_tensor" + if target_runtime != TargetRuntime.ORT + else "" + ) + # 2. 
Compile the model to an on-device asset model_compile_options = model.get_hub_compile_options( - target_runtime, compile_options + " --force_channel_last_input image_tensor" + target_runtime, compile_options + channel_last_flags, hub_device ) print(f"Optimizing model {model_name} to run on-device") submitted_compile_job = hub.submit_compile_job( @@ -159,8 +166,12 @@ def export_model( ) sample_inputs = model.sample_inputs(input_spec) # Convert inputs from channel first to channel last - hub_inputs = transpose_channel_first_to_last( - "image_tensor", sample_inputs, target_runtime + hub_inputs = ( + sample_inputs + if target_runtime == TargetRuntime.ORT + else transpose_channel_first_to_last( + "image_tensor", sample_inputs, target_runtime + ) ) submitted_inference_job = hub.submit_inference_job( model=compile_job.get_target_model(), @@ -197,7 +208,7 @@ def export_model( def main(): warnings.filterwarnings("ignore") - parser = export_parser(model_cls=Model, supports_ort=False) + parser = export_parser(model_cls=Model) args = parser.parse_args() export_model(**vars(args)) diff --git a/qai_hub_models/models/resnext50/perf.yaml b/qai_hub_models/models/resnext50/perf.yaml index b9ae5b5d..8552317d 100644 --- a/qai_hub_models/models/resnext50/perf.yaml +++ b/qai_hub_models/models/resnext50/perf.yaml @@ -22,6 +22,7 @@ aggregated: - Samsung Galaxy S24 Ultra - Samsung Galaxy S24+ - Samsung Galaxy Tab S8 + - Snapdragon X Elite CRD - Xiaomi 12 - Xiaomi 12 Pro supported_chipsets: @@ -30,15 +31,16 @@ aggregated: - Snapdragon® 8 Gen 2 - Snapdragon® 8 Gen 3 - Snapdragon® 888 + - Snapdragon® X Elite models: - name: ResNeXt50 performance_metrics: - torchscript_onnx_tflite: - inference_time: 2502.0 - throughput: 399.68025579536373 + inference_time: 2512.0 + throughput: 398.0891719745223 estimated_peak_memory_range: - min: 16384 - max: 2039136 + min: 12288 + max: 2465560 primary_compute_unit: NPU precision: fp16 layer_info: @@ -46,14 +48,14 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 79 - job_id: jnp1y6v2p + job_id: jopryve7g job_status: Passed torchscript_onnx_qnn: - inference_time: 2619.0 - throughput: 381.82512409316536 + inference_time: 2556.0 + throughput: 391.23630672926447 estimated_peak_memory_range: - min: 12288 - max: 67332096 + min: 16384 + max: 87753520 primary_compute_unit: NPU precision: fp16 layer_info: @@ -61,22 +63,22 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 126 - job_id: jz57097lg + job_id: j2p0rzlnp job_status: Passed torchscript_onnx_ort: - inference_time: 2938.0 - throughput: 340.3675970047652 + inference_time: 2844.0 + throughput: 351.6174402250352 estimated_peak_memory_range: - min: 90112 - max: 153500352 + min: 229376 + max: 171515144 primary_compute_unit: NPU precision: fp16 layer_info: - layers_on_npu: 1 + layers_on_npu: 128 layers_on_gpu: 0 layers_on_cpu: 0 - total_layers: 1 - job_id: j0pxnxd15 + total_layers: 128 + job_id: j1glkvkmp job_status: Passed reference_device_info: name: Samsung Galaxy S23 @@ -85,13 +87,13 @@ models: os_name: Android manufacturer: Samsung chipset: Snapdragon® 8 Gen 2 - timestamp: '2024-04-23T18:42:33.333746Z' + timestamp: '2024-05-20T16:35:30.859710Z' - torchscript_onnx_tflite: - inference_time: 1788.0 - throughput: 559.2841163310962 + inference_time: 1790.0 + throughput: 558.659217877095 estimated_peak_memory_range: min: 16384 - max: 164107600 + max: 160881424 primary_compute_unit: NPU precision: fp16 layer_info: @@ -99,14 +101,14 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 79 - job_id: jvgde2ze5 + job_id: 
jep2mklq5 job_status: Passed torchscript_onnx_qnn: - inference_time: 1857.0 - throughput: 538.5029617662897 + inference_time: 1858.0 + throughput: 538.2131324004306 estimated_peak_memory_range: - min: 0 - max: 60102256 + min: 618496 + max: 60637072 primary_compute_unit: NPU precision: fp16 layer_info: @@ -114,22 +116,22 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 126 - job_id: jqp4k39vg + job_id: j1p87qzo5 job_status: Passed torchscript_onnx_ort: - inference_time: 2158.0 - throughput: 463.3920296570899 + inference_time: 2037.0 + throughput: 490.9180166912126 estimated_peak_memory_range: min: 618496 - max: 42526736 + max: 41012496 primary_compute_unit: NPU precision: fp16 layer_info: - layers_on_npu: 1 + layers_on_npu: 128 layers_on_gpu: 0 layers_on_cpu: 0 - total_layers: 1 - job_id: jo5mq8dwp + total_layers: 128 + job_id: jw561y1yp job_status: Passed reference_device_info: name: Samsung Galaxy S24 @@ -138,13 +140,13 @@ models: os_name: Android manufacturer: Samsung chipset: Snapdragon® 8 Gen 3 - timestamp: '2024-04-23T18:42:33.333800Z' + timestamp: '2024-05-20T16:35:30.859737Z' - torchscript_onnx_tflite: - inference_time: 2497.0 - throughput: 400.4805766920304 + inference_time: 2499.0 + throughput: 400.16006402561027 estimated_peak_memory_range: - min: 53248 - max: 2221936 + min: 24576 + max: 2189296 primary_compute_unit: NPU precision: fp16 layer_info: @@ -152,14 +154,14 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 79 - job_id: jnp1yk3kp + job_id: jqpyd16lp job_status: Passed torchscript_onnx_qnn: - inference_time: 2594.0 - throughput: 385.50501156515037 + inference_time: 2548.0 + throughput: 392.4646781789639 estimated_peak_memory_range: - min: 618496 - max: 68165536 + min: 622592 + max: 88624416 primary_compute_unit: NPU precision: fp16 layer_info: @@ -167,7 +169,7 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 126 - job_id: j0pxn8mj5 + job_id: jn5q263o5 job_status: Passed reference_device_info: name: QCS8550 (Proxy) @@ -176,4 +178,57 @@ models: os_name: Android manufacturer: Qualcomm chipset: Qcs8550 - timestamp: '2024-04-23T18:42:33.333840Z' + timestamp: '2024-05-20T16:35:30.859755Z' + - torchscript_onnx_qnn: + inference_time: 2925.0 + throughput: 341.88034188034186 + estimated_peak_memory_range: + min: 602112 + max: 602112 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 126 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 126 + job_id: jogkye3np + job_status: Passed + torchscript_onnx_ort: + inference_time: 2645.0 + throughput: 378.0718336483932 + estimated_peak_memory_range: + min: 75046912 + max: 75046912 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 128 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 128 + job_id: j1p3mjmng + job_status: Passed + torchscript_onnx_ort_dml_gpu: + inference_time: 23055.0 + throughput: 43.37453914552158 + estimated_peak_memory_range: + min: 31170560 + max: 31170560 + primary_compute_unit: CPU + precision: fp32 + layer_info: + layers_on_npu: 0 + layers_on_gpu: 0 + layers_on_cpu: 59 + total_layers: 59 + job_id: jwgov2vk5 + job_status: Passed + reference_device_info: + name: Snapdragon X Elite CRD + os: '11' + form_factor: Compute + os_name: Windows + manufacturer: Qualcomm + chipset: Snapdragon® X Elite + timestamp: '2024-05-20T16:35:30.859779Z' diff --git a/qai_hub_models/models/resnext50_quantized/README.md b/qai_hub_models/models/resnext50_quantized/README.md index a4d1fd1d..60f8d368 100644 --- 
a/qai_hub_models/models/resnext50_quantized/README.md +++ b/qai_hub_models/models/resnext50_quantized/README.md @@ -14,6 +14,8 @@ accross various devices, can be found [here](https://aihub.qualcomm.com/models/r a hosted Qualcomm® device. + + ## Example & Usage diff --git a/qai_hub_models/models/resnext50_quantized/export.py b/qai_hub_models/models/resnext50_quantized/export.py index ae5a4d3a..b6afc50c 100644 --- a/qai_hub_models/models/resnext50_quantized/export.py +++ b/qai_hub_models/models/resnext50_quantized/export.py @@ -123,9 +123,16 @@ def export_model( else: quant_calibration_data = model.get_calibration_data(target_runtime, input_spec) + # Convert outputs from channel last to channel first (preferred I/O format for QNN and TensorFlow Lite) + channel_last_flags = ( + " --force_channel_last_input image_tensor" + if target_runtime != TargetRuntime.ORT + else "" + ) + # 2. Compile the model to an on-device asset model_compile_options = model.get_hub_compile_options( - target_runtime, compile_options + " --force_channel_last_input image_tensor" + target_runtime, compile_options + channel_last_flags, hub_device ) print(f"Optimizing model {model_name} to run on-device") submitted_compile_job = hub.submit_compile_job( @@ -167,8 +174,12 @@ def export_model( if target_runtime == TargetRuntime.QNN: hub_inputs = get_qnn_inputs(compile_job, sample_inputs) # Convert inputs from channel first to channel last - hub_inputs = transpose_channel_first_to_last( - "image_tensor", sample_inputs, target_runtime + hub_inputs = ( + sample_inputs + if target_runtime == TargetRuntime.ORT + else transpose_channel_first_to_last( + "image_tensor", sample_inputs, target_runtime + ) ) submitted_inference_job = hub.submit_inference_job( model=compile_job.get_target_model(), @@ -207,7 +218,7 @@ def export_model( def main(): warnings.filterwarnings("ignore") - parser = export_parser(model_cls=Model, supports_qnn=False, supports_ort=False) + parser = export_parser(model_cls=Model) args = parser.parse_args() export_model(**vars(args)) diff --git a/qai_hub_models/models/resnext50_quantized/perf.yaml b/qai_hub_models/models/resnext50_quantized/perf.yaml index 57e9183e..5e55abcf 100644 --- a/qai_hub_models/models/resnext50_quantized/perf.yaml +++ b/qai_hub_models/models/resnext50_quantized/perf.yaml @@ -26,6 +26,7 @@ aggregated: - Samsung Galaxy S24 Ultra - Samsung Galaxy S24+ - Samsung Galaxy Tab S8 + - Snapdragon X Elite CRD - Xiaomi 12 - Xiaomi 12 Pro supported_chipsets: @@ -36,15 +37,16 @@ aggregated: - Snapdragon® 8 Gen 2 - Snapdragon® 8 Gen 3 - Snapdragon® 888 + - Snapdragon® X Elite models: - name: ResNeXt50Quantized performance_metrics: - torchscript_onnx_tflite: - inference_time: 949.0 - throughput: 1053.740779768177 + inference_time: 945.0 + throughput: 1058.2010582010582 estimated_peak_memory_range: - min: 40960 - max: 32336880 + min: 16384 + max: 1978440 primary_compute_unit: NPU precision: int8 layer_info: @@ -52,22 +54,37 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 80 - job_id: jopr8wn95 + job_id: j1pvw6wrg + job_status: Passed + torchscript_onnx_qnn: + inference_time: 1182.0 + throughput: 846.0236886632825 + estimated_peak_memory_range: + min: 12288 + max: 98529472 + primary_compute_unit: NPU + precision: int8 + layer_info: + layers_on_npu: 78 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 78 + job_id: jygz737xp job_status: Passed torchscript_onnx_ort: - inference_time: 1749.0 - throughput: 571.7552887364208 + inference_time: 1456.0 + throughput: 686.8131868131868 
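Several export.py changes in this diff replace the hard-coded `--force_channel_last_input image_tensor` compile option with a `channel_last_flags` string that is left empty for the ORT target, and likewise skip `transpose_channel_first_to_last` on the sample inputs when `target_runtime == TargetRuntime.ORT`. A rough numpy illustration of what such a layout conversion does (the helpers below are illustrative stand-ins, not the qai_hub_models implementation, and the 224x224 resolution is only an assumed example):

```python
import numpy as np

def to_channel_last(x: np.ndarray) -> np.ndarray:
    # NCHW -> NHWC: move the channel axis from position 1 to the end.
    return np.transpose(x, (0, 2, 3, 1))

def to_channel_first(x: np.ndarray) -> np.ndarray:
    # NHWC -> NCHW: move the channel axis back next to the batch dimension.
    return np.transpose(x, (0, 3, 1, 2))

image_tensor = np.zeros((1, 3, 224, 224), dtype=np.float32)  # channel-first, as traced from PyTorch
nhwc = to_channel_last(image_tensor)
assert nhwc.shape == (1, 224, 224, 3)
assert to_channel_first(nhwc).shape == image_tensor.shape
```

Per the comments in the diff, channel-last is the preferred I/O format for the QNN and TensorFlow Lite targets, while the ONNX Runtime path keeps the traced model's channel-first layout, so both the compile flag and the input transpose are skipped for `TargetRuntime.ORT`.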
estimated_peak_memory_range: min: 12288 - max: 65405552 + max: 110506920 primary_compute_unit: NPU precision: int8 layer_info: - layers_on_npu: 1 + layers_on_npu: 86 layers_on_gpu: 0 layers_on_cpu: 0 - total_layers: 1 - job_id: jqpyrm775 + total_layers: 86 + job_id: jvgdvxvzg job_status: Passed reference_device_info: name: Samsung Galaxy S23 @@ -76,13 +93,13 @@ models: os_name: Android manufacturer: Samsung chipset: Snapdragon® 8 Gen 2 - timestamp: '2024-04-23T18:42:33.357768Z' + timestamp: '2024-05-20T16:35:30.890153Z' - torchscript_onnx_tflite: - inference_time: 724.0 - throughput: 1381.2154696132598 + inference_time: 710.0 + throughput: 1408.4507042253522 estimated_peak_memory_range: min: 12288 - max: 99522896 + max: 99630928 primary_compute_unit: NPU precision: int8 layer_info: @@ -90,22 +107,37 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 80 - job_id: jep20ev4g + job_id: j7gjlvlep + job_status: Passed + torchscript_onnx_qnn: + inference_time: 882.0 + throughput: 1133.7868480725624 + estimated_peak_memory_range: + min: 167936 + max: 57911616 + primary_compute_unit: NPU + precision: int8 + layer_info: + layers_on_npu: 78 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 78 + job_id: jz5w9e9mp job_status: Passed torchscript_onnx_ort: - inference_time: 1274.0 - throughput: 784.9293563579278 + inference_time: 1110.0 + throughput: 900.9009009009009 estimated_peak_memory_range: min: 0 - max: 42945536 + max: 41473776 primary_compute_unit: NPU precision: int8 layer_info: - layers_on_npu: 1 + layers_on_npu: 86 layers_on_gpu: 0 layers_on_cpu: 0 - total_layers: 1 - job_id: j2p036v6p + total_layers: 86 + job_id: jz57dyd95 job_status: Passed reference_device_info: name: Samsung Galaxy S24 @@ -114,36 +146,74 @@ models: os_name: Android manufacturer: Samsung chipset: Snapdragon® 8 Gen 3 - timestamp: '2024-04-23T18:42:33.357805Z' + timestamp: '2024-05-20T16:35:30.890180Z' - torchscript_onnx_tflite: - inference_time: 3105.0 - throughput: 322.061191626409 + inference_time: 940.0 + throughput: 1063.8297872340424 estimated_peak_memory_range: min: 12288 - max: 54933392 + max: 1980696 primary_compute_unit: NPU precision: int8 layer_info: - layers_on_npu: 82 + layers_on_npu: 80 layers_on_gpu: 0 layers_on_cpu: 0 - total_layers: 82 - job_id: jygzoq8z5 + total_layers: 80 + job_id: jlpevdvv5 job_status: Passed - torchscript_onnx_ort: - inference_time: 31790.0 - throughput: 31.456432840515884 + torchscript_onnx_qnn: + inference_time: 1178.0 + throughput: 848.8964346349745 estimated_peak_memory_range: - min: 8765440 - max: 56053712 - primary_compute_unit: CPU - precision: fp32 + min: 167936 + max: 10889000 + primary_compute_unit: NPU + precision: int8 layer_info: - layers_on_npu: 0 + layers_on_npu: 78 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 78 + job_id: jnp18487g + job_status: Passed + reference_device_info: + name: QCS8550 (Proxy) + os: '12' + form_factor: Iot + os_name: Android + manufacturer: Qualcomm + chipset: Qcs8550 + timestamp: '2024-05-20T16:35:30.890198Z' + - torchscript_onnx_tflite: + inference_time: 3222.0 + throughput: 310.36623215394167 + estimated_peak_memory_range: + min: 16384 + max: 54803712 + primary_compute_unit: NPU + precision: int8 + layer_info: + layers_on_npu: 80 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 80 + job_id: jlpeknr7p + job_status: Passed + torchscript_onnx_qnn: + inference_time: 3456.0 + throughput: 289.35185185185185 + estimated_peak_memory_range: + min: 163840 + max: 51993520 + primary_compute_unit: NPU + precision: int8 + layer_info: 
+ layers_on_npu: 78 layers_on_gpu: 0 - layers_on_cpu: 88 - total_layers: 88 - job_id: j1p8014xg + layers_on_cpu: 0 + total_layers: 78 + job_id: jz5wqrdj5 job_status: Passed reference_device_info: name: RB3 Gen 2 (Proxy) @@ -152,21 +222,21 @@ models: os_name: Android manufacturer: Qualcomm chipset: Qcs6490 - timestamp: '2024-04-23T18:42:33.357842Z' + timestamp: '2024-05-20T16:35:30.890215Z' - torchscript_onnx_tflite: - inference_time: 64556.0 - throughput: 15.49042691616581 + inference_time: 65861.0 + throughput: 15.183492506946449 estimated_peak_memory_range: - min: 0 - max: 94711912 + min: 8355840 + max: 26461872 primary_compute_unit: GPU precision: int8 layer_info: - layers_on_npu: 14 + layers_on_npu: 12 layers_on_gpu: 57 layers_on_cpu: 11 - total_layers: 82 - job_id: jnp1wo8kg + total_layers: 80 + job_id: jygzr0xz5 job_status: Passed reference_device_info: name: RB5 (Proxy) @@ -175,27 +245,57 @@ models: os_name: Android manufacturer: Qualcomm chipset: Qcs8250 - timestamp: '2024-04-23T18:42:33.357866Z' - - torchscript_onnx_tflite: - inference_time: 987.0 - throughput: 1013.1712259371834 + timestamp: '2024-05-20T16:35:30.890226Z' + - torchscript_onnx_qnn: + inference_time: 1353.0 + throughput: 739.0983000739099 + estimated_peak_memory_range: + min: 438272 + max: 438272 + primary_compute_unit: NPU + precision: int8 + layer_info: + layers_on_npu: 78 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 78 + job_id: jmg94l485 + job_status: Passed + torchscript_onnx_ort: + inference_time: 1402.0 + throughput: 713.2667617689016 estimated_peak_memory_range: - min: 24576 - max: 1688688 + min: 52183040 + max: 52183040 primary_compute_unit: NPU precision: int8 layer_info: - layers_on_npu: 82 + layers_on_npu: 86 layers_on_gpu: 0 layers_on_cpu: 0 - total_layers: 82 - job_id: jogk7woyp + total_layers: 86 + job_id: jqp4wlw1g + job_status: Passed + torchscript_onnx_ort_dml_gpu: + inference_time: 202223.0 + throughput: 4.945035925686 + estimated_peak_memory_range: + min: 20660224 + max: 20660224 + primary_compute_unit: GPU + precision: int8 + layer_info: + layers_on_npu: 0 + layers_on_gpu: 1 + layers_on_cpu: 0 + total_layers: 1 + job_id: j0px1k1lg job_status: Passed reference_device_info: - name: QCS8550 (Proxy) - os: '12' - form_factor: Iot - os_name: Android + name: Snapdragon X Elite CRD + os: '11' + form_factor: Compute + os_name: Windows manufacturer: Qualcomm - chipset: Qcs8550 - timestamp: '2024-04-23T18:42:33.357887Z' + chipset: Snapdragon® X Elite + timestamp: '2024-05-20T16:35:30.890248Z' diff --git a/qai_hub_models/models/riffusion_quantized/README.md b/qai_hub_models/models/riffusion_quantized/README.md new file mode 100644 index 00000000..e4090d13 --- /dev/null +++ b/qai_hub_models/models/riffusion_quantized/README.md @@ -0,0 +1,83 @@ +[![Qualcomm® AI Hub Models](https://qaihub-public-assets.s3.us-west-2.amazonaws.com/qai-hub-models/quic-logo.jpg)](../../README.md) + + +# [Riffusion: State-of-the-art generative AI model used to generate spectrogram images given any text input. These spectrograms can be converted into audio clips](#) + +Generates high resolution spectrograms images from text prompts using a latent diffusion model. This model uses CLIP ViT-L/14 as text encoder, U-Net based latent denoising, and VAE based decoder to generate the final image. + +This is based on the implementation of Riffusion found +[here](https://github.com/CompVis/stable-diffusion/tree/main). This repository contains scripts for optimized on-device +export suitable to run on Qualcomm® devices. 
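The text-encoder, U-Net, and VAE-decoder flow described here is wired up for real by the shared `stable_diffusion_demo` helper elsewhere in this diff; the sketch below only illustrates the data flow and the tensor shapes declared in `riffusion_quantized/model.py` ((1, 77) int32 token ids, a (1, 64, 64, 4) latent, a (1, 1280) time embedding, and a (1, 77, 768) text embedding). The `encode`, `denoise_step`, and `decode` callables, the step count, and the output image shape are hypothetical placeholders, not the actual component APIs:

```python
import numpy as np

# Hypothetical stand-ins for the three precompiled components (shapes from model.py).
def encode(token_ids: np.ndarray) -> np.ndarray:
    # Text encoder: (1, 77) int32 token ids -> (1, 77, 768) text embedding.
    return np.zeros((1, 77, 768), dtype=np.float32)

def denoise_step(latent: np.ndarray, time_emb: np.ndarray, text_emb: np.ndarray) -> np.ndarray:
    # U-Net: placeholder for one latent denoising step conditioned on time and text embeddings.
    return latent

def decode(latent: np.ndarray) -> np.ndarray:
    # VAE decoder: (1, 64, 64, 4) latent -> spectrogram image (output shape illustrative).
    return np.zeros((1, 512, 512, 3), dtype=np.float32)

token_ids = np.zeros((1, 77), dtype=np.int32)               # tokenized text prompt
text_emb = encode(token_ids)
latent = np.random.randn(1, 64, 64, 4).astype(np.float32)   # random starting latent
for _ in range(20):                                         # illustrative number of denoising steps
    time_emb = np.zeros((1, 1280), dtype=np.float32)        # timestep embedding for this step
    latent = denoise_step(latent, time_emb, text_emb)
spectrogram = decode(latent)                                # converted to an audio clip downstream
```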
More details on model performance +accross various devices, can be found [here](#). + +[Sign up](https://myaccount.qualcomm.com/signup) for early access to run these models on +a hosted Qualcomm® device. + + + + +## Example & Usage + +Install the package via pip: +```bash +pip install "qai_hub_models[riffusion_quantized]" +``` + + +Once installed, run the following simple CLI demo: + +```bash +python -m qai_hub_models.models.riffusion_quantized.demo +``` +More details on the CLI tool can be found with the `--help` option. See +[demo.py](demo.py) for sample usage of the model including pre/post processing +scripts. Please refer to our [general instructions on using +models](../../../#getting-started) for more usage instructions. + +## Export for on-device deployment + +This repository contains export scripts that produce a model optimized for +on-device deployment. This can be run as follows: + +```bash +python -m qai_hub_models.models.riffusion_quantized.export +``` +Additional options are documented with the `--help` option. Note that the above +script requires access to Deployment instructions for Qualcomm® AI Hub. + +## License +- The license for the original implementation of Riffusion can be found + [here](https://github.com/CompVis/stable-diffusion/blob/main/LICENSE). +- The license for the compiled assets for on-device deployment can be found [here]({deploy_license_url}) + +## References +* [High-Resolution Image Synthesis with Latent Diffusion Models](https://arxiv.org/abs/2112.10752) +* [Source Model Implementation](https://github.com/CompVis/stable-diffusion/tree/main) + +## Community +* Join [our AI Hub Slack community](https://qualcomm-ai-hub.slack.com/join/shared_invite/zt-2d5zsmas3-Sj0Q9TzslueCjS31eXG2UA#/shared-invite/email) to collaborate, post questions and learn more about on-device AI. +* For questions or feedback please [reach out to us](mailto:ai-hub-support@qti.qualcomm.com). + + +## Usage and Limitations + +This model may not be used for or in connection with any of the following applications: + +- Accessing essential private and public services and benefits; +- Administration of justice and democratic processes; +- Assessing or recognizing the emotional state of a person; +- Biometric and biometrics-based systems, including categorization of persons based on sensitive characteristics; +- Education and vocational training; +- Employment and workers management; +- Exploitation of the vulnerabilities of persons resulting in harmful behavior; +- General purpose social scoring; +- Law enforcement; +- Management and operation of critical infrastructure; +- Migration, asylum and border control management; +- Predictive policing; +- Real-time remote biometric identification in public spaces; +- Recommender systems of social media platforms; +- Scraping of facial images (from the internet or otherwise); and/or +- Subliminal manipulation + + diff --git a/qai_hub_models/models/riffusion_quantized/__init__.py b/qai_hub_models/models/riffusion_quantized/__init__.py new file mode 100644 index 00000000..74856492 --- /dev/null +++ b/qai_hub_models/models/riffusion_quantized/__init__.py @@ -0,0 +1,8 @@ +# --------------------------------------------------------------------- +# Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. 
+# SPDX-License-Identifier: BSD-3-Clause +# --------------------------------------------------------------------- +from qai_hub_models.models.riffusion_quantized.model import MODEL_ID # noqa: F401 +from qai_hub_models.models.riffusion_quantized.model import ( # noqa: F401 + RiffusionQuantized as Model, +) diff --git a/qai_hub_models/models/riffusion_quantized/demo.py b/qai_hub_models/models/riffusion_quantized/demo.py new file mode 100644 index 00000000..410a1e06 --- /dev/null +++ b/qai_hub_models/models/riffusion_quantized/demo.py @@ -0,0 +1,53 @@ +# --------------------------------------------------------------------- +# Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. +# SPDX-License-Identifier: BSD-3-Clause +# --------------------------------------------------------------------- +from diffusers import DPMSolverMultistepScheduler, UNet2DConditionModel +from transformers import CLIPTokenizer + +from qai_hub_models.models._shared.stable_diffusion.demo import stable_diffusion_demo +from qai_hub_models.models.riffusion_quantized.model import ( + MODEL_ASSET_VERSION, + MODEL_ID, + ClipVITTextEncoder, + Unet, + VAEDecoder, +) + + +# Run Riffusion end-to-end on a given prompt. The demo will output an +AI-generated image based on the description in the prompt. +def main(is_test: bool = False): + tokenizer = CLIPTokenizer.from_pretrained( + "openai/clip-vit-large-patch14", subfolder="", revision="main" + ) + + scheduler = DPMSolverMultistepScheduler( + beta_start=0.00085, + beta_end=0.012, + beta_schedule="scaled_linear", + num_train_timesteps=1000, + ) + + time_embedding = UNet2DConditionModel.from_pretrained( + "riffusion/riffusion-model-v1", subfolder="unet" + ).time_embedding + + text_encoder = ClipVITTextEncoder.from_precompiled() + unet = Unet.from_precompiled() + vae_decoder = VAEDecoder.from_precompiled() + stable_diffusion_demo( + model_id=MODEL_ID, + model_asset_version=MODEL_ASSET_VERSION, + text_encoder=text_encoder, + unet=unet, + vae_decoder=vae_decoder, + tokenizer=tokenizer, + scheduler=scheduler, + time_embedding=time_embedding, + is_test=is_test, + ) + + +if __name__ == "__main__": + main() diff --git a/qai_hub_models/models/stable_diffusion_quantized/export.py b/qai_hub_models/models/riffusion_quantized/export.py similarity index 97% rename from qai_hub_models/models/stable_diffusion_quantized/export.py rename to qai_hub_models/models/riffusion_quantized/export.py index 7242bbb7..428d3b0f 100644 --- a/qai_hub_models/models/stable_diffusion_quantized/export.py +++ b/qai_hub_models/models/riffusion_quantized/export.py @@ -13,7 +13,7 @@ import qai_hub as hub -from qai_hub_models.models.stable_diffusion_quantized import Model +from qai_hub_models.models.riffusion_quantized import Model from qai_hub_models.utils.args import export_parser from qai_hub_models.utils.base_model import BasePrecompiledModel, TargetRuntime from qai_hub_models.utils.printing import print_profile_metrics_from_job @@ -74,7 +74,7 @@ def export_model( * A ProfileJob containing metadata about the profile job (None if profiling skipped). * An InferenceJob containing metadata about the inference job (None if inferencing skipped).
""" - model_name = "stable_diffusion_quantized" + model_name = "riffusion_quantized" output_path = Path(output_dir or Path.cwd() / "build" / model_name) if chipset: hub_device = hub.Device(attributes=f"chipset:{chipset}") @@ -87,8 +87,8 @@ def export_model( raise ValueError(f"Invalid component {component_name}.") if not can_access_qualcomm_ai_hub(): return export_without_hub_access( - "stable_diffusion_quantized", - "Stable-Diffusion", + "riffusion_quantized", + "Riffusion", device, skip_profiling, skip_inferencing, diff --git a/qai_hub_models/models/riffusion_quantized/info.yaml b/qai_hub_models/models/riffusion_quantized/info.yaml new file mode 100644 index 00000000..e2a2669c --- /dev/null +++ b/qai_hub_models/models/riffusion_quantized/info.yaml @@ -0,0 +1,39 @@ +name: Riffusion +id: riffusion_quantized +status: public +headline: State-of-the-art generative AI model used to generate spectrogram images given + any text input. These spectrograms can be converted into audio clips. +domain: Generative AI +description: Generates high resolution spectrograms images from text prompts using a + latent diffusion model. This model uses CLIP ViT-L/14 as text encoder, U-Net based + latent denoising, and VAE based decoder to generate the final image. +use_case: Image Generation +tags: + - generative-ai + - quantized +research_paper: https://arxiv.org/abs/2112.10752 +research_paper_title: High-Resolution Image Synthesis with Latent Diffusion Models +license: https://github.com/CompVis/stable-diffusion/blob/main/LICENSE +deploy_license: https://github.com/CompVis/stable-diffusion/blob/main/LICENSE +source_repo: https://github.com/CompVis/stable-diffusion/tree/main +technical_details: + Input: Text prompt to generate spectrogram image + QNN-SDK: '2.20' + Text Encoder Number of parameters: 340M + UNet Number of parameters: 865M + VAE Decoder Number of parameters: 83M + Model size: 1GB +applicable_scenarios: + - Music Generation + - Music Editing + - Content Creation +related_models: + - stable_diffusion_v1_5_quantized +form_factors: + - Phone + - Tablet +has_static_banner: yes +has_animated_banner: yes +license_type: creativeml-openrail-m +deploy_license_type: creativeml-openrail-m +dataset: [] diff --git a/qai_hub_models/models/riffusion_quantized/model.py b/qai_hub_models/models/riffusion_quantized/model.py new file mode 100644 index 00000000..8e26b375 --- /dev/null +++ b/qai_hub_models/models/riffusion_quantized/model.py @@ -0,0 +1,105 @@ +# --------------------------------------------------------------------- +# Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. 
+# SPDX-License-Identifier: BSD-3-Clause +# --------------------------------------------------------------------- +from __future__ import annotations + +from qai_hub_models.models.protocols import FromPrecompiledProtocol +from qai_hub_models.utils.asset_loaders import CachedWebModelAsset +from qai_hub_models.utils.base_model import BasePrecompiledModel, CollectionModel +from qai_hub_models.utils.input_spec import InputSpec + +MODEL_ID = __name__.split(".")[-2] +MODEL_ASSET_VERSION = 1 +TEXT_ENCODER = "text_encoder.serialized.bin" +UNET_DIFFUSER = "unet.serialized.bin" +VAE_DECODER = "vae_decoder.serialized.bin" + + +class RiffusionQuantized(FromPrecompiledProtocol, CollectionModel): + """ + Riffusion wrapper class consists of + - Text Encoder + - UNet based diffuser + - VAE decoder + + All three models are pre-trained, quantized (int8 weight, uint16 activations) + and compiled into serialized binary for Qualcomm Snapdragon Gen2+. + """ + + def __init__(self, text_encoder, unet, vae_decoder) -> None: + self.text_encoder = text_encoder + self.unet = unet + self.vae_decoder = vae_decoder + + @classmethod + def from_precompiled(cls) -> "RiffusionQuantized": + return RiffusionQuantized( + text_encoder=ClipVITTextEncoder.from_precompiled(), + unet=Unet.from_precompiled(), + vae_decoder=VAEDecoder.from_precompiled(), + ) + + +class ClipVITTextEncoder(BasePrecompiledModel): + """ + CLIP-ViT based Text Encoder. + + Pre-trained, quantized (int8 weight, uint16 activations) + and compiled into serialized binary for Qualcomm Snapdragon Gen2+. + """ + + @classmethod + def from_precompiled(cls) -> "ClipVITTextEncoder": + text_encoder_path = CachedWebModelAsset.from_asset_store( + MODEL_ID, MODEL_ASSET_VERSION, TEXT_ENCODER + ).fetch() + return ClipVITTextEncoder(text_encoder_path) + + @staticmethod + def get_input_spec() -> InputSpec: + return {"input_1": ((1, 77), "int32")} + + +class Unet(BasePrecompiledModel): + """ + UNet model to denoise image in latent space. + + Pre-trained, quantized (int8 weight, uint16 activations) + and compiled into serialized binary for Qualcomm Snapdragon Gen2+. + """ + + @classmethod + def from_precompiled(cls) -> "Unet": + model_path = CachedWebModelAsset.from_asset_store( + MODEL_ID, MODEL_ASSET_VERSION, UNET_DIFFUSER + ).fetch() + return Unet(model_path) + + @staticmethod + def get_input_spec() -> InputSpec: + return { + "input_1": ((1, 64, 64, 4), "float32"), + "input_2": ((1, 1280), "float32"), + "input_3": ((1, 77, 768), "float32"), + } + + +class VAEDecoder(BasePrecompiledModel): + """ + Decodes image from latent into output generated image. + + Pre-trained, quantized (int8 weight, uint16 activations) + and compiled into serialized binary for Qualcomm Snapdragon Gen2+. 
+ """ + + @classmethod + def from_precompiled(cls) -> "VAEDecoder": + model_path = CachedWebModelAsset.from_asset_store( + MODEL_ID, MODEL_ASSET_VERSION, VAE_DECODER + ).fetch() + return VAEDecoder(model_path) + + @staticmethod + def get_input_spec() -> InputSpec: + return {"input_1": ((1, 64, 64, 4), "float32")} diff --git a/qai_hub_models/models/stable_diffusion_quantized/requirements.txt b/qai_hub_models/models/riffusion_quantized/requirements.txt similarity index 100% rename from qai_hub_models/models/stable_diffusion_quantized/requirements.txt rename to qai_hub_models/models/riffusion_quantized/requirements.txt diff --git a/qai_hub_models/models/riffusion_quantized/test.py b/qai_hub_models/models/riffusion_quantized/test.py new file mode 100644 index 00000000..b4f5bd0e --- /dev/null +++ b/qai_hub_models/models/riffusion_quantized/test.py @@ -0,0 +1,28 @@ +# --------------------------------------------------------------------- +# Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. +# SPDX-License-Identifier: BSD-3-Clause +# --------------------------------------------------------------------- +import pytest + +from qai_hub_models.models._shared.stable_diffusion.test_utils import ( + export_for_component, +) +from qai_hub_models.models.riffusion_quantized.demo import main as demo_main +from qai_hub_models.models.riffusion_quantized.export import export_model +from qai_hub_models.models.riffusion_quantized.model import RiffusionQuantized + + +def test_from_precompiled(): + RiffusionQuantized.from_precompiled() + + +@pytest.mark.skip("#105 move slow_cloud and slow tests to nightly.") +@pytest.mark.slow_cloud +def test_export(): + export_for_component(export_model, "TextEncoder_Quantized") + + +@pytest.mark.skip("#105 move slow_cloud and slow tests to nightly.") +@pytest.mark.slow_cloud +def test_demo(): + demo_main(is_test=True) diff --git a/qai_hub_models/models/sam/README.md b/qai_hub_models/models/sam/README.md index 2297b9ae..e4bc8748 100644 --- a/qai_hub_models/models/sam/README.md +++ b/qai_hub_models/models/sam/README.md @@ -14,6 +14,8 @@ accross various devices, can be found [here](https://aihub.qualcomm.com/models/s a hosted Qualcomm® device. + + ## Example & Usage Install the package via pip: diff --git a/qai_hub_models/models/sam/export.py b/qai_hub_models/models/sam/export.py index 0216f3ab..56e34545 100644 --- a/qai_hub_models/models/sam/export.py +++ b/qai_hub_models/models/sam/export.py @@ -31,7 +31,6 @@ ) ALL_COMPONENTS = ["SAMDecoder", "SAMEncoder"] -DEFAULT_COMPONENTS = ["SAMDecoder"] def export_model( @@ -97,7 +96,7 @@ def export_model( else: hub_device = hub.Device(name=device) component_arg = components - components = components or DEFAULT_COMPONENTS + components = components or ALL_COMPONENTS for component_name in components: if component_name not in ALL_COMPONENTS: raise ValueError(f"Invalid component {component_name}.") @@ -145,7 +144,7 @@ def export_model( # 2. 
Compile the models to an on-device asset model_compile_options = component.get_hub_compile_options( - target_runtime, compile_options + target_runtime, compile_options, hub_device ) print(f"Optimizing model {component_name} to run on-device") submitted_compile_job = hub.submit_compile_job( diff --git a/qai_hub_models/models/sam/model.py b/qai_hub_models/models/sam/model.py index 7b730c71..2b8d3c3d 100644 --- a/qai_hub_models/models/sam/model.py +++ b/qai_hub_models/models/sam/model.py @@ -6,7 +6,6 @@ import os import sys -import tempfile from typing import Callable, Tuple import numpy as np @@ -16,6 +15,7 @@ CachedWebModelAsset, load_path, maybe_clone_git_repo, + qaihm_temp_dir, ) from qai_hub_models.utils.base_model import BaseModel, CollectionModel from qai_hub_models.utils.input_spec import InputSpec @@ -290,7 +290,7 @@ def load_sam_model( ) -> torch.nn.Module: """Loads SAM model of given model type""" weights_url = _get_weights_url(model_type) - with tempfile.TemporaryDirectory() as tmpdir: + with qaihm_temp_dir() as tmpdir: weights_path = load_path(weights_url, tmpdir) sam = sam_model_registry[model_type](weights_path) sam.eval() @@ -311,8 +311,10 @@ def _patch_sam_with_qaihm_modules(): SamPredictor: segment_anything.SamPredictor Python class wrapper to call image encoder - decoder """ - sam_repo_path = maybe_clone_git_repo( - SAM_SOURCE_REPO, SAM_SOURCE_REPO_COMMIT, MODEL_ID, MODEL_ASSET_VERSION + sam_repo_path = str( + maybe_clone_git_repo( + SAM_SOURCE_REPO, SAM_SOURCE_REPO_COMMIT, MODEL_ID, MODEL_ASSET_VERSION + ) ) cwd = os.getcwd() try: diff --git a/qai_hub_models/models/sam/perf.yaml b/qai_hub_models/models/sam/perf.yaml index c39ffd7a..894f56df 100644 --- a/qai_hub_models/models/sam/perf.yaml +++ b/qai_hub_models/models/sam/perf.yaml @@ -22,6 +22,7 @@ aggregated: - Samsung Galaxy S24 Ultra - Samsung Galaxy S24+ - Samsung Galaxy Tab S8 + - Snapdragon X Elite CRD - Xiaomi 12 - Xiaomi 12 Pro supported_chipsets: @@ -30,38 +31,39 @@ aggregated: - Snapdragon® 8 Gen 2 - Snapdragon® 8 Gen 3 - Snapdragon® 888 + - Snapdragon® X Elite models: - name: SAMDecoder performance_metrics: - torchscript_onnx_tflite: - inference_time: 47957.0 - throughput: 20.852013261880433 + inference_time: 48417.0 + throughput: 20.653902554887747 estimated_peak_memory_range: - min: 4009984 - max: 23686696 + min: 4046848 + max: 13471792 primary_compute_unit: NPU precision: fp16 layer_info: - layers_on_npu: 340 + layers_on_npu: 342 layers_on_gpu: 0 layers_on_cpu: 0 - total_layers: 340 - job_id: jogk7892p + total_layers: 342 + job_id: jo5mznz9p job_status: Passed torchscript_onnx_ort: - inference_time: 1089085.0 - throughput: 0.9182019768888563 + inference_time: 35687.0 + throughput: 28.021408355983972 estimated_peak_memory_range: - min: 15695872 - max: 53847464 + min: 21266432 + max: 62118592 primary_compute_unit: NPU precision: fp16 layer_info: - layers_on_npu: 2 + layers_on_npu: 351 layers_on_gpu: 0 layers_on_cpu: 1 - total_layers: 3 - job_id: j1gl6l18g + total_layers: 352 + job_id: j1p87q7o5 job_status: Passed reference_device_info: name: Samsung Galaxy S23 @@ -70,36 +72,36 @@ models: os_name: Android manufacturer: Samsung chipset: Snapdragon® 8 Gen 2 - timestamp: '2024-04-23T18:42:33.384775Z' + timestamp: '2024-05-20T16:35:30.929656Z' - torchscript_onnx_tflite: - inference_time: 33609.0 - throughput: 29.75393495789818 + inference_time: 34847.0 + throughput: 28.696874910322265 estimated_peak_memory_range: - min: 61440 - max: 246507888 + min: 2396160 + max: 250176160 primary_compute_unit: NPU precision: fp16 
layer_info: - layers_on_npu: 340 + layers_on_npu: 342 layers_on_gpu: 0 layers_on_cpu: 0 - total_layers: 340 - job_id: jn5qevm45 + total_layers: 342 + job_id: jopryvy7g job_status: Passed torchscript_onnx_ort: - inference_time: 809800.0 - throughput: 1.2348728081007656 + inference_time: 25375.0 + throughput: 39.40886699507389 estimated_peak_memory_range: - min: 19857408 - max: 115862864 + min: 27185152 + max: 114627488 primary_compute_unit: NPU precision: fp16 layer_info: - layers_on_npu: 2 + layers_on_npu: 351 layers_on_gpu: 0 layers_on_cpu: 1 - total_layers: 3 - job_id: jw56ewd0g + total_layers: 352 + job_id: jn5q262o5 job_status: Passed reference_device_info: name: Samsung Galaxy S24 @@ -108,21 +110,21 @@ models: os_name: Android manufacturer: Samsung chipset: Snapdragon® 8 Gen 3 - timestamp: '2024-04-23T18:42:33.384834Z' + timestamp: '2024-05-20T16:35:30.929678Z' - torchscript_onnx_tflite: - inference_time: 48295.0 - throughput: 20.706077233668083 + inference_time: 48322.0 + throughput: 20.694507677662347 estimated_peak_memory_range: - min: 3977216 - max: 12384360 + min: 4030464 + max: 7393624 primary_compute_unit: NPU precision: fp16 layer_info: - layers_on_npu: 340 + layers_on_npu: 342 layers_on_gpu: 0 layers_on_cpu: 0 - total_layers: 340 - job_id: jnp1yk7kp + total_layers: 342 + job_id: jqpyd1dlp job_status: Passed reference_device_info: name: QCS8550 (Proxy) @@ -131,4 +133,181 @@ models: os_name: Android manufacturer: Qualcomm chipset: Qcs8550 - timestamp: '2024-04-23T18:42:33.384881Z' + timestamp: '2024-05-20T16:35:30.929690Z' + - torchscript_onnx_ort: + inference_time: 35991.0 + throughput: 27.78472395876747 + estimated_peak_memory_range: + min: 38920192 + max: 38920192 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 351 + layers_on_gpu: 0 + layers_on_cpu: 1 + total_layers: 352 + job_id: jw561y3yp + job_status: Passed + torchscript_onnx_ort_dml_gpu: + inference_time: 'null' + throughput: 'null' + estimated_peak_memory_range: + min: 0 + max: 0 + primary_compute_unit: 'null' + precision: 'null' + layer_info: + layers_on_npu: 0 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 0 + job_id: jwgov21k5 + job_status: Failed + reference_device_info: + name: Snapdragon X Elite CRD + os: '11' + form_factor: Compute + os_name: Windows + manufacturer: Qualcomm + chipset: Snapdragon® X Elite + timestamp: '2024-05-20T16:35:30.929711Z' +- name: SAMEncoder + performance_metrics: + - torchscript_onnx_tflite: + inference_time: 12002934.0 + throughput: 0.08331296331380311 + estimated_peak_memory_range: + min: 2745298944 + max: 2749256400 + primary_compute_unit: CPU + precision: fp32 + layer_info: + layers_on_npu: 0 + layers_on_gpu: 37 + layers_on_cpu: 771 + total_layers: 808 + job_id: jegne6eqg + job_status: Passed + torchscript_onnx_ort: + inference_time: 'null' + throughput: 'null' + estimated_peak_memory_range: + min: 0 + max: 0 + primary_compute_unit: 'null' + precision: 'null' + layer_info: + layers_on_npu: 0 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 0 + job_id: jogkyeynp + job_status: Failed + reference_device_info: + name: Samsung Galaxy S23 + os: '13' + form_factor: Phone + os_name: Android + manufacturer: Samsung + chipset: Snapdragon® 8 Gen 2 + timestamp: '2024-05-20T16:35:30.929729Z' + - torchscript_onnx_tflite: + inference_time: 10788785.0 + throughput: 0.09268884309030165 + estimated_peak_memory_range: + min: 2551681024 + max: 2911589120 + primary_compute_unit: CPU + precision: fp32 + layer_info: + layers_on_npu: 0 + layers_on_gpu: 37 + 
layers_on_cpu: 771 + total_layers: 808 + job_id: jep2mkmq5 + job_status: Passed + torchscript_onnx_ort: + inference_time: 'null' + throughput: 'null' + estimated_peak_memory_range: + min: 0 + max: 0 + primary_compute_unit: 'null' + precision: 'null' + layer_info: + layers_on_npu: 0 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 0 + job_id: j1glkv0mp + job_status: Failed + reference_device_info: + name: Samsung Galaxy S24 + os: '14' + form_factor: Phone + os_name: Android + manufacturer: Samsung + chipset: Snapdragon® 8 Gen 3 + timestamp: '2024-05-20T16:35:30.929747Z' + - torchscript_onnx_tflite: + inference_time: 11903922.0 + throughput: 0.08400592678614661 + estimated_peak_memory_range: + min: 2721533952 + max: 2726534168 + primary_compute_unit: CPU + precision: fp32 + layer_info: + layers_on_npu: 0 + layers_on_gpu: 37 + layers_on_cpu: 771 + total_layers: 808 + job_id: j2p0rzrnp + job_status: Passed + reference_device_info: + name: QCS8550 (Proxy) + os: '12' + form_factor: Iot + os_name: Android + manufacturer: Qualcomm + chipset: Qcs8550 + timestamp: '2024-05-20T16:35:30.929757Z' + - torchscript_onnx_ort: + inference_time: 'null' + throughput: 'null' + estimated_peak_memory_range: + min: 0 + max: 0 + primary_compute_unit: 'null' + precision: 'null' + layer_info: + layers_on_npu: 0 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 0 + job_id: j1p3mj4ng + job_status: Failed + torchscript_onnx_ort_dml_gpu: + inference_time: 'null' + throughput: 'null' + estimated_peak_memory_range: + min: 0 + max: 0 + primary_compute_unit: 'null' + precision: 'null' + layer_info: + layers_on_npu: 0 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 0 + job_id: j1pvw61rg + job_status: Failed + reference_device_info: + name: Snapdragon X Elite CRD + os: '11' + form_factor: Compute + os_name: Windows + manufacturer: Qualcomm + chipset: Snapdragon® X Elite + timestamp: '2024-05-20T16:35:30.929773Z' diff --git a/qai_hub_models/models/sesr_m5/README.md b/qai_hub_models/models/sesr_m5/README.md index 37bc4f6d..eb36ea36 100644 --- a/qai_hub_models/models/sesr_m5/README.md +++ b/qai_hub_models/models/sesr_m5/README.md @@ -14,6 +14,8 @@ accross various devices, can be found [here](https://aihub.qualcomm.com/models/s a hosted Qualcomm® device. + + ## Example & Usage diff --git a/qai_hub_models/models/sesr_m5/export.py b/qai_hub_models/models/sesr_m5/export.py index eba502da..56a3d124 100644 --- a/qai_hub_models/models/sesr_m5/export.py +++ b/qai_hub_models/models/sesr_m5/export.py @@ -119,12 +119,16 @@ def export_model( model.eval() source_model = torch.jit.trace(model.to("cpu"), make_torch_inputs(input_spec)) + # Convert outputs from channel last to channel first (preferred I/O format for QNN and TensorFlow Lite) + channel_last_flags = ( + " --force_channel_last_input image" + " --force_channel_last_output output_0" + if target_runtime != TargetRuntime.ORT + else "" + ) + # 2. 
Compile the model to an on-device asset model_compile_options = model.get_hub_compile_options( - target_runtime, - compile_options - + " --force_channel_last_input image" - + " --force_channel_last_output output_0", + target_runtime, compile_options + channel_last_flags, hub_device ) print(f"Optimizing model {model_name} to run on-device") submitted_compile_job = hub.submit_compile_job( @@ -162,8 +166,10 @@ def export_model( ) sample_inputs = model.sample_inputs(input_spec) # Convert inputs from channel first to channel last - hub_inputs = transpose_channel_first_to_last( - "image", sample_inputs, target_runtime + hub_inputs = ( + sample_inputs + if target_runtime == TargetRuntime.ORT + else transpose_channel_first_to_last("image", sample_inputs, target_runtime) ) submitted_inference_job = hub.submit_inference_job( model=compile_job.get_target_model(), @@ -191,8 +197,12 @@ def export_model( assert inference_job is not None and inference_job.wait().success inference_result: hub.client.DatasetEntries = inference_job.download_output_data() # type: ignore # Convert outputs from channel last to channel first - inference_result = transpose_channel_last_to_first( - "output_0", inference_result, target_runtime + inference_result = ( + inference_result + if target_runtime == TargetRuntime.ORT + else transpose_channel_last_to_first( + "output_0", inference_result, target_runtime + ) ) print_inference_metrics(inference_job, inference_result, torch_out) @@ -201,7 +211,7 @@ def export_model( def main(): warnings.filterwarnings("ignore") - parser = export_parser(model_cls=Model, supports_ort=False) + parser = export_parser(model_cls=Model) args = parser.parse_args() export_model(**vars(args)) diff --git a/qai_hub_models/models/sesr_m5/perf.yaml b/qai_hub_models/models/sesr_m5/perf.yaml index f52e91e4..9f832592 100644 --- a/qai_hub_models/models/sesr_m5/perf.yaml +++ b/qai_hub_models/models/sesr_m5/perf.yaml @@ -22,6 +22,7 @@ aggregated: - Samsung Galaxy S24 Ultra - Samsung Galaxy S24+ - Samsung Galaxy Tab S8 + - Snapdragon X Elite CRD - Xiaomi 12 - Xiaomi 12 Pro supported_chipsets: @@ -30,15 +31,16 @@ aggregated: - Snapdragon® 8 Gen 2 - Snapdragon® 8 Gen 3 - Snapdragon® 888 + - Snapdragon® X Elite models: - name: SESR-M5 performance_metrics: - torchscript_onnx_tflite: - inference_time: 2236.0 - throughput: 447.2271914132379 + inference_time: 2229.0 + throughput: 448.63167339614176 estimated_peak_memory_range: - min: 24576 - max: 1639560 + min: 28672 + max: 1751584 primary_compute_unit: NPU precision: fp16 layer_info: @@ -46,14 +48,14 @@ models: layers_on_gpu: 0 layers_on_cpu: 3 total_layers: 25 - job_id: jwgok84xp + job_id: j7gjlv0ep job_status: Passed torchscript_onnx_qnn: - inference_time: 2141.0 - throughput: 467.07146193367583 + inference_time: 2149.0 + throughput: 465.33271288971616 estimated_peak_memory_range: - min: 217088 - max: 66412728 + min: 24576 + max: 3705880 primary_compute_unit: NPU precision: fp16 layer_info: @@ -61,22 +63,22 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 31 - job_id: j7gjzqwx5 + job_id: jz5w9edmp job_status: Passed torchscript_onnx_ort: - inference_time: 2959.0 - throughput: 337.95201081446436 + inference_time: 2907.0 + throughput: 343.9972480220158 estimated_peak_memory_range: - min: 28672 - max: 6879728 + min: 12288 + max: 5644152 primary_compute_unit: NPU precision: fp16 layer_info: - layers_on_npu: 1 + layers_on_npu: 33 layers_on_gpu: 0 layers_on_cpu: 0 - total_layers: 1 - job_id: jygzon4k5 + total_layers: 33 + job_id: jz5w9ed4p job_status: Passed 
reference_device_info: name: Samsung Galaxy S23 @@ -85,13 +87,13 @@ models: os_name: Android manufacturer: Samsung chipset: Snapdragon® 8 Gen 2 - timestamp: '2024-04-23T18:42:33.402615Z' + timestamp: '2024-05-20T16:35:30.969933Z' - torchscript_onnx_tflite: - inference_time: 1608.0 - throughput: 621.8905472636816 + inference_time: 1652.0 + throughput: 605.3268765133172 estimated_peak_memory_range: min: 16384 - max: 24474768 + max: 24934032 primary_compute_unit: NPU precision: fp16 layer_info: @@ -99,14 +101,14 @@ models: layers_on_gpu: 0 layers_on_cpu: 3 total_layers: 25 - job_id: j1pv079j5 + job_id: jlpevdrv5 job_status: Passed torchscript_onnx_qnn: - inference_time: 1452.0 - throughput: 688.7052341597796 + inference_time: 1450.0 + throughput: 689.6551724137931 estimated_peak_memory_range: - min: 208896 - max: 24978944 + min: 9527296 + max: 32336704 primary_compute_unit: NPU precision: fp16 layer_info: @@ -114,23 +116,23 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 31 - job_id: jlpeeyl1p + job_id: jmg94l385 job_status: Passed torchscript_onnx_ort: - inference_time: 2024.0 - throughput: 494.0711462450593 + inference_time: 'null' + throughput: 'null' estimated_peak_memory_range: - min: 208896 - max: 16041184 - primary_compute_unit: NPU - precision: fp16 + min: 0 + max: 0 + primary_compute_unit: 'null' + precision: 'null' layer_info: - layers_on_npu: 1 + layers_on_npu: 0 layers_on_gpu: 0 layers_on_cpu: 0 - total_layers: 1 - job_id: jz5w24465 - job_status: Passed + total_layers: 0 + job_id: jmg94l3m5 + job_status: Failed reference_device_info: name: Samsung Galaxy S24 os: '14' @@ -138,13 +140,13 @@ models: os_name: Android manufacturer: Samsung chipset: Snapdragon® 8 Gen 3 - timestamp: '2024-04-23T18:42:33.402651Z' + timestamp: '2024-05-20T16:35:30.969973Z' - torchscript_onnx_tflite: - inference_time: 2223.0 - throughput: 449.842555105713 + inference_time: 2266.0 + throughput: 441.306266548985 estimated_peak_memory_range: - min: 20480 - max: 8844744 + min: 12607488 + max: 14159192 primary_compute_unit: NPU precision: fp16 layer_info: @@ -152,14 +154,14 @@ models: layers_on_gpu: 0 layers_on_cpu: 3 total_layers: 25 - job_id: j0pxn8d95 + job_id: jygz73xxp job_status: Passed torchscript_onnx_qnn: - inference_time: 2148.0 - throughput: 465.54934823091247 + inference_time: 2141.0 + throughput: 467.07146193367583 estimated_peak_memory_range: - min: 229376 - max: 4684448 + min: 221184 + max: 4063112 primary_compute_unit: NPU precision: fp16 layer_info: @@ -167,7 +169,7 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 31 - job_id: jep20qvmg + job_id: jvgdvxrzg job_status: Passed reference_device_info: name: QCS8550 (Proxy) @@ -176,4 +178,57 @@ models: os_name: Android manufacturer: Qualcomm chipset: Qcs8550 - timestamp: '2024-04-23T18:42:33.402678Z' + timestamp: '2024-05-20T16:35:30.969990Z' + - torchscript_onnx_qnn: + inference_time: 2969.0 + throughput: 336.81374200067364 + estimated_peak_memory_range: + min: 212992 + max: 212992 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 31 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 31 + job_id: jnp184d7g + job_status: Passed + torchscript_onnx_ort: + inference_time: 2971.0 + throughput: 336.58700774150117 + estimated_peak_memory_range: + min: 13090816 + max: 13090816 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 33 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 33 + job_id: jnp184dng + job_status: Passed + torchscript_onnx_ort_dml_gpu: + inference_time: 
17098.0 + throughput: 58.486372675166685 + estimated_peak_memory_range: + min: 83427328 + max: 83427328 + primary_compute_unit: GPU + precision: fp16 + layer_info: + layers_on_npu: 0 + layers_on_gpu: 1 + layers_on_cpu: 0 + total_layers: 1 + job_id: jvgdvxr6g + job_status: Passed + reference_device_info: + name: Snapdragon X Elite CRD + os: '11' + form_factor: Compute + os_name: Windows + manufacturer: Qualcomm + chipset: Snapdragon® X Elite + timestamp: '2024-05-20T16:35:30.970016Z' diff --git a/qai_hub_models/models/sesr_m5_quantized/README.md b/qai_hub_models/models/sesr_m5_quantized/README.md index 1f6bd0dc..e93dd579 100644 --- a/qai_hub_models/models/sesr_m5_quantized/README.md +++ b/qai_hub_models/models/sesr_m5_quantized/README.md @@ -14,6 +14,8 @@ accross various devices, can be found [here](https://aihub.qualcomm.com/models/s a hosted Qualcomm® device. + + ## Example & Usage diff --git a/qai_hub_models/models/sesr_m5_quantized/export.py b/qai_hub_models/models/sesr_m5_quantized/export.py index 03f22916..27c722a5 100644 --- a/qai_hub_models/models/sesr_m5_quantized/export.py +++ b/qai_hub_models/models/sesr_m5_quantized/export.py @@ -122,9 +122,16 @@ def export_model( else: quant_calibration_data = model.get_calibration_data(target_runtime, input_spec) + # Convert outputs from channel last to channel first (preferred I/O format for QNN and TensorFlow Lite) + channel_last_flags = ( + " --force_channel_last_input image" + if target_runtime != TargetRuntime.ORT + else "" + ) + # 2. Compile the model to an on-device asset model_compile_options = model.get_hub_compile_options( - target_runtime, compile_options + " --force_channel_last_input image" + target_runtime, compile_options + channel_last_flags, hub_device ) print(f"Optimizing model {model_name} to run on-device") submitted_compile_job = hub.submit_compile_job( @@ -166,8 +173,10 @@ def export_model( if target_runtime == TargetRuntime.QNN: hub_inputs = get_qnn_inputs(compile_job, sample_inputs) # Convert inputs from channel first to channel last - hub_inputs = transpose_channel_first_to_last( - "image", sample_inputs, target_runtime + hub_inputs = ( + sample_inputs + if target_runtime == TargetRuntime.ORT + else transpose_channel_first_to_last("image", sample_inputs, target_runtime) ) submitted_inference_job = hub.submit_inference_job( model=compile_job.get_target_model(), @@ -201,7 +210,7 @@ def export_model( def main(): warnings.filterwarnings("ignore") - parser = export_parser(model_cls=Model, supports_qnn=False, supports_ort=False) + parser = export_parser(model_cls=Model) args = parser.parse_args() export_model(**vars(args)) diff --git a/qai_hub_models/models/sesr_m5_quantized/model.py b/qai_hub_models/models/sesr_m5_quantized/model.py index 8782ceaf..de5c875e 100644 --- a/qai_hub_models/models/sesr_m5_quantized/model.py +++ b/qai_hub_models/models/sesr_m5_quantized/model.py @@ -14,20 +14,24 @@ import torch from aimet_torch.cross_layer_equalization import equalize_model +from aimet_torch.model_preparer import prepare_model from aimet_torch.quantsim import QuantizationSimModel, load_encodings_to_sim from qai_hub_models.models._shared.sesr.common import _load_sesr_source_model -from qai_hub_models.models.common import SourceModelFormat, TargetRuntime from qai_hub_models.models.sesr_m5.model import ( NUM_CHANNELS, NUM_LBLOCKS, SCALING_FACTOR, SESR_M5, ) +from qai_hub_models.utils.aimet.config_loader import get_default_aimet_config from qai_hub_models.utils.asset_loaders import CachedWebModelAsset +from 
qai_hub_models.utils.quantization_aimet import ( + constrain_quantized_inputs_to_image_range, +) MODEL_ID = __name__.split(".")[-2] -MODEL_ASSET_VERSION = 2 +MODEL_ASSET_VERSION = 3 # Weights and config stored in S3 are sourced from # https://github.com/quic/aimet-model-zoo/blob/develop/aimet_zoo_torch/sesr/model/model_cards/sesr_m5_4x_w8a8.json: @@ -37,7 +41,6 @@ # Encodings were generated with AIMET QuantSim library QUANTIZED_WEIGHTS = "sesr_m5_4x_checkpoint_int8.pth" AIMET_ENCODINGS = "sesr_m5_quantized_encodings.json" -AIMET_CONFIG = "default_config_per_channel.json" class SESR_M5Quantizable(AIMETQuantizableMixin, SESR_M5): @@ -51,9 +54,7 @@ def __init__( sesr_model: QuantizationSimModel, ) -> None: SESR_M5.__init__(self, sesr_model.model) - AIMETQuantizableMixin.__init__( - self, sesr_model, needs_onnx_direct_aimet_export=False - ) + AIMETQuantizableMixin.__init__(self, sesr_model) @classmethod def from_pretrained( @@ -62,32 +63,31 @@ def from_pretrained( ) -> SESR_M5Quantizable: # Load Model sesr = _load_sesr_source_model(SCALING_FACTOR, NUM_CHANNELS, NUM_LBLOCKS) + # The model is collapsed pre-quantization - see + # https://github.com/quic/aimet-model-zoo/blob/d09d2b0404d10f71a7640a87e9d5e5257b028802/aimet_zoo_torch/common/super_resolution/models.py#L110 + sesr.collapse() input_shape = SESR_M5.get_input_spec()["image"][0] + sesr = prepare_model(sesr) equalize_model(sesr, input_shape) # Download weights and quantization parameters weights = CachedWebModelAsset.from_asset_store( MODEL_ID, MODEL_ASSET_VERSION, QUANTIZED_WEIGHTS ).fetch() - aimet_config = CachedWebModelAsset.from_asset_store( - MODEL_ID, MODEL_ASSET_VERSION, AIMET_CONFIG - ).fetch() # Load the model weights and quantization parameters state_dict = torch.load(weights, map_location=torch.device("cpu"))["state_dict"] - # Here we collapse before loading the quantized weights. 
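Note: the sesr_m5_quantized/model.py hunk reorders the AIMET setup — the SESR network is collapsed and passed through `prepare_model` before cross-layer equalization, the bundled per-channel config file is replaced by `get_default_aimet_config()`, and quantized inputs are constrained to the image range. Below is a condensed sketch of that flow, assuming the caller resolves the model object, int8 checkpoint, and encodings path (e.g. via `_load_sesr_source_model` and the cached web assets referenced in this diff); it illustrates the recipe rather than reproducing the exact `from_pretrained` implementation.

```python
import torch
from aimet_torch.cross_layer_equalization import equalize_model
from aimet_torch.model_preparer import prepare_model
from aimet_torch.quantsim import QuantizationSimModel, load_encodings_to_sim

from qai_hub_models.utils.aimet.config_loader import get_default_aimet_config
from qai_hub_models.utils.quantization_aimet import (
    constrain_quantized_inputs_to_image_range,
)


def build_quantsim(sesr, input_shape, weights_path, encodings_path):
    # Collapse residual blocks before quantization, mirroring the
    # aimet-model-zoo recipe linked in the diff.
    sesr.collapse()
    # Rewrite the graph into an AIMET-traceable form, then equalize layers.
    sesr = prepare_model(sesr)
    equalize_model(sesr, input_shape)
    # Load the int8 checkpoint on top of the prepared FP32 graph.
    state_dict = torch.load(weights_path, map_location="cpu")["state_dict"]
    sesr.load_state_dict(state_dict)
    # Simulate 8-bit weights/activations with the shared default config.
    sim = QuantizationSimModel(
        sesr,
        quant_scheme="tf_enhanced",
        default_param_bw=8,
        default_output_bw=8,
        config_file=get_default_aimet_config(),
        dummy_input=torch.rand(input_shape),
    )
    # Constrain the input quantizers to the expected image range,
    # then apply the pre-computed encodings.
    constrain_quantized_inputs_to_image_range(sim)
    load_encodings_to_sim(sim, encodings_path)
    sim.model.eval()
    return sim
```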
- # The model is collapsed pre-quantization - see - # https://github.com/quic/aimet-model-zoo/blob/d09d2b0404d10f71a7640a87e9d5e5257b028802/aimet_zoo_torch/common/super_resolution/models.py#L110 - sesr.collapse() sesr.load_state_dict(state_dict) sim = QuantizationSimModel( sesr, quant_scheme="tf_enhanced", default_param_bw=8, default_output_bw=8, - config_file=aimet_config, + config_file=get_default_aimet_config(), dummy_input=torch.rand(input_shape), ) + constrain_quantized_inputs_to_image_range(sim) + if aimet_encodings: if aimet_encodings == "DEFAULT": aimet_encodings = CachedWebModelAsset.from_asset_store( @@ -98,11 +98,3 @@ def from_pretrained( sim.model.eval() return cls(sim) - - def preferred_hub_source_model_format( - self, target_runtime: TargetRuntime - ) -> SourceModelFormat: - if target_runtime == TargetRuntime.QNN: - return SourceModelFormat.ONNX - else: - return SourceModelFormat.TORCHSCRIPT diff --git a/qai_hub_models/models/sesr_m5_quantized/perf.yaml b/qai_hub_models/models/sesr_m5_quantized/perf.yaml index 581307ec..e83193b4 100644 --- a/qai_hub_models/models/sesr_m5_quantized/perf.yaml +++ b/qai_hub_models/models/sesr_m5_quantized/perf.yaml @@ -26,6 +26,7 @@ aggregated: - Samsung Galaxy S24 Ultra - Samsung Galaxy S24+ - Samsung Galaxy Tab S8 + - Snapdragon X Elite CRD - Xiaomi 12 - Xiaomi 12 Pro supported_chipsets: @@ -36,15 +37,16 @@ aggregated: - Snapdragon® 8 Gen 2 - Snapdragon® 8 Gen 3 - Snapdragon® 888 + - Snapdragon® X Elite models: - name: SESR-M5-Quantized performance_metrics: - torchscript_onnx_tflite: - inference_time: 1356.0 - throughput: 737.4631268436578 + inference_time: 1329.0 + throughput: 752.4454477050414 estimated_peak_memory_range: - min: 24576 - max: 1678184 + min: 32768 + max: 2149856 primary_compute_unit: NPU precision: int8 layer_info: @@ -52,7 +54,37 @@ models: layers_on_gpu: 0 layers_on_cpu: 3 total_layers: 14 - job_id: jnp1y662p + job_id: jz57dyjn5 + job_status: Passed + torchscript_onnx_qnn: + inference_time: 774.0 + throughput: 1291.9896640826873 + estimated_peak_memory_range: + min: 28672 + max: 18606256 + primary_compute_unit: NPU + precision: int8 + layer_info: + layers_on_npu: 14 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 14 + job_id: jegne69jg + job_status: Passed + torchscript_onnx_ort: + inference_time: 1153.0 + throughput: 867.3026886383348 + estimated_peak_memory_range: + min: 2109440 + max: 19388976 + primary_compute_unit: NPU + precision: int8 + layer_info: + layers_on_npu: 19 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 19 + job_id: j2p0rze0p job_status: Passed reference_device_info: name: Samsung Galaxy S23 @@ -61,13 +93,13 @@ models: os_name: Android manufacturer: Samsung chipset: Snapdragon® 8 Gen 2 - timestamp: '2024-04-23T18:42:33.426676Z' + timestamp: '2024-05-20T16:35:31.000448Z' - torchscript_onnx_tflite: - inference_time: 1067.0 - throughput: 937.207122774133 + inference_time: 1111.0 + throughput: 900.0900090009001 estimated_peak_memory_range: min: 12288 - max: 21689744 + max: 21726352 primary_compute_unit: NPU precision: int8 layer_info: @@ -75,7 +107,37 @@ models: layers_on_gpu: 0 layers_on_cpu: 3 total_layers: 14 - job_id: jvgde22e5 + job_id: jqp4wlx2g + job_status: Passed + torchscript_onnx_qnn: + inference_time: 530.0 + throughput: 1886.7924528301887 + estimated_peak_memory_range: + min: 65536 + max: 16933392 + primary_compute_unit: NPU + precision: int8 + layer_info: + layers_on_npu: 14 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 14 + job_id: jopryv4kg + job_status: Passed + 
torchscript_onnx_ort: + inference_time: 834.0 + throughput: 1199.0407673860911 + estimated_peak_memory_range: + min: 212992 + max: 13346208 + primary_compute_unit: NPU + precision: int8 + layer_info: + layers_on_npu: 19 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 19 + job_id: j1p87qwq5 job_status: Passed reference_device_info: name: Samsung Galaxy S24 @@ -84,21 +146,74 @@ models: os_name: Android manufacturer: Samsung chipset: Snapdragon® 8 Gen 3 - timestamp: '2024-04-23T18:42:33.426696Z' + timestamp: '2024-05-20T16:35:31.000474Z' - torchscript_onnx_tflite: - inference_time: 3752.0 - throughput: 266.52452025586354 + inference_time: 1328.0 + throughput: 753.0120481927711 estimated_peak_memory_range: - min: 49152 - max: 14587664 + min: 24576 + max: 1624240 primary_compute_unit: NPU precision: int8 layer_info: - layers_on_npu: 13 + layers_on_npu: 11 layers_on_gpu: 0 layers_on_cpu: 3 - total_layers: 16 - job_id: jwgok74dp + total_layers: 14 + job_id: j0px1k78g + job_status: Passed + torchscript_onnx_qnn: + inference_time: 778.0 + throughput: 1285.3470437017995 + estimated_peak_memory_range: + min: 28672 + max: 12397048 + primary_compute_unit: NPU + precision: int8 + layer_info: + layers_on_npu: 14 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 14 + job_id: jqpyd140p + job_status: Passed + reference_device_info: + name: QCS8550 (Proxy) + os: '12' + form_factor: Iot + os_name: Android + manufacturer: Qualcomm + chipset: Qcs8550 + timestamp: '2024-05-20T16:35:31.000504Z' + - torchscript_onnx_tflite: + inference_time: 3342.0 + throughput: 299.22202274087374 + estimated_peak_memory_range: + min: 45056 + max: 14433024 + primary_compute_unit: NPU + precision: int8 + layer_info: + layers_on_npu: 11 + layers_on_gpu: 0 + layers_on_cpu: 3 + total_layers: 14 + job_id: jw56n0q7g + job_status: Passed + torchscript_onnx_qnn: + inference_time: 1809.0 + throughput: 552.791597567717 + estimated_peak_memory_range: + min: 61440 + max: 17655776 + primary_compute_unit: NPU + precision: int8 + layer_info: + layers_on_npu: 14 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 14 + job_id: jygzr0v65 job_status: Passed reference_device_info: name: RB3 Gen 2 (Proxy) @@ -107,21 +222,21 @@ models: os_name: Android manufacturer: Qualcomm chipset: Qcs6490 - timestamp: '2024-04-23T18:42:33.426710Z' + timestamp: '2024-05-20T16:35:31.000520Z' - torchscript_onnx_tflite: - inference_time: 12810.0 - throughput: 78.06401249024199 + inference_time: 5039.0 + throughput: 198.45207382417146 estimated_peak_memory_range: - min: 5787648 - max: 13604584 + min: 1916928 + max: 9296352 primary_compute_unit: NPU precision: int8 layer_info: - layers_on_npu: 12 + layers_on_npu: 10 layers_on_gpu: 0 layers_on_cpu: 4 - total_layers: 16 - job_id: jvgdq6vk5 + total_layers: 14 + job_id: j1p3erqz5 job_status: Passed reference_device_info: name: RB5 (Proxy) @@ -130,27 +245,57 @@ models: os_name: Android manufacturer: Qualcomm chipset: Qcs8250 - timestamp: '2024-04-23T18:42:33.426724Z' - - torchscript_onnx_tflite: - inference_time: 1743.0 - throughput: 573.7234652897304 + timestamp: '2024-05-20T16:35:31.000535Z' + - torchscript_onnx_qnn: + inference_time: 745.0 + throughput: 1342.2818791946308 estimated_peak_memory_range: - min: 28672 - max: 1454440 + min: 49152 + max: 49152 primary_compute_unit: NPU precision: int8 layer_info: - layers_on_npu: 13 + layers_on_npu: 14 layers_on_gpu: 0 - layers_on_cpu: 3 - total_layers: 16 - job_id: j1p3vlwzg + layers_on_cpu: 0 + total_layers: 14 + job_id: jep2mk765 + job_status: Passed + 
torchscript_onnx_ort: + inference_time: 1179.0 + throughput: 848.1764206955047 + estimated_peak_memory_range: + min: 8998912 + max: 8998912 + primary_compute_unit: NPU + precision: int8 + layer_info: + layers_on_npu: 19 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 19 + job_id: jogkyervp + job_status: Passed + torchscript_onnx_ort_dml_gpu: + inference_time: 72803.0 + throughput: 13.735697704764913 + estimated_peak_memory_range: + min: 32956416 + max: 32956416 + primary_compute_unit: GPU + precision: int8 + layer_info: + layers_on_npu: 0 + layers_on_gpu: 1 + layers_on_cpu: 0 + total_layers: 1 + job_id: jn5q269e5 job_status: Passed reference_device_info: - name: QCS8550 (Proxy) - os: '12' - form_factor: Iot - os_name: Android + name: Snapdragon X Elite CRD + os: '11' + form_factor: Compute + os_name: Windows manufacturer: Qualcomm - chipset: Qcs8550 - timestamp: '2024-04-23T18:42:33.426738Z' + chipset: Snapdragon® X Elite + timestamp: '2024-05-20T16:35:31.000558Z' diff --git a/qai_hub_models/models/sesr_m5_quantized/test.py b/qai_hub_models/models/sesr_m5_quantized/test.py index 86bb6543..0ed36c55 100644 --- a/qai_hub_models/models/sesr_m5_quantized/test.py +++ b/qai_hub_models/models/sesr_m5_quantized/test.py @@ -3,7 +3,6 @@ # SPDX-License-Identifier: BSD-3-Clause # --------------------------------------------------------------------- import os -import tempfile import zipfile import numpy as np @@ -18,7 +17,11 @@ MODEL_ID, SESR_M5Quantizable, ) -from qai_hub_models.utils.asset_loaders import CachedWebModelAsset, load_image +from qai_hub_models.utils.asset_loaders import ( + CachedWebModelAsset, + load_image, + qaihm_temp_dir, +) from qai_hub_models.utils.testing import assert_most_close, skip_clone_repo_check OUTPUT_IMAGE_LOCAL_PATH = "sesr_m5_quantized_demo_output.png" @@ -69,7 +72,7 @@ def test_trace(): def test_aimet_export(): model = SESR_M5Quantizable.from_pretrained() name = model.__class__.__name__ - with tempfile.TemporaryDirectory() as tmpdir: + with qaihm_temp_dir() as tmpdir: output_zip = model.convert_to_onnx_and_aimet_encodings( tmpdir, model.get_input_spec(), diff --git a/qai_hub_models/models/shufflenet_v2/README.md b/qai_hub_models/models/shufflenet_v2/README.md index 420ef994..97694e8a 100644 --- a/qai_hub_models/models/shufflenet_v2/README.md +++ b/qai_hub_models/models/shufflenet_v2/README.md @@ -14,6 +14,8 @@ accross various devices, can be found [here](https://aihub.qualcomm.com/models/s a hosted Qualcomm® device. + + ## Example & Usage diff --git a/qai_hub_models/models/shufflenet_v2/export.py b/qai_hub_models/models/shufflenet_v2/export.py index 9fe96bb7..046c346c 100644 --- a/qai_hub_models/models/shufflenet_v2/export.py +++ b/qai_hub_models/models/shufflenet_v2/export.py @@ -119,9 +119,16 @@ def export_model( model.eval() source_model = torch.jit.trace(model.to("cpu"), make_torch_inputs(input_spec)) + # Convert outputs from channel last to channel first (preferred I/O format for QNN and TensorFlow Lite) + channel_last_flags = ( + " --force_channel_last_input image_tensor" + if target_runtime != TargetRuntime.ORT + else "" + ) + # 2. 
Compile the model to an on-device asset model_compile_options = model.get_hub_compile_options( - target_runtime, compile_options + " --force_channel_last_input image_tensor" + target_runtime, compile_options + channel_last_flags, hub_device ) print(f"Optimizing model {model_name} to run on-device") submitted_compile_job = hub.submit_compile_job( @@ -159,8 +166,12 @@ def export_model( ) sample_inputs = model.sample_inputs(input_spec) # Convert inputs from channel first to channel last - hub_inputs = transpose_channel_first_to_last( - "image_tensor", sample_inputs, target_runtime + hub_inputs = ( + sample_inputs + if target_runtime == TargetRuntime.ORT + else transpose_channel_first_to_last( + "image_tensor", sample_inputs, target_runtime + ) ) submitted_inference_job = hub.submit_inference_job( model=compile_job.get_target_model(), @@ -197,7 +208,7 @@ def export_model( def main(): warnings.filterwarnings("ignore") - parser = export_parser(model_cls=Model, supports_ort=False) + parser = export_parser(model_cls=Model) args = parser.parse_args() export_model(**vars(args)) diff --git a/qai_hub_models/models/shufflenet_v2/perf.yaml b/qai_hub_models/models/shufflenet_v2/perf.yaml index 686e2320..cc6f20b7 100644 --- a/qai_hub_models/models/shufflenet_v2/perf.yaml +++ b/qai_hub_models/models/shufflenet_v2/perf.yaml @@ -22,6 +22,7 @@ aggregated: - Samsung Galaxy S24 Ultra - Samsung Galaxy S24+ - Samsung Galaxy Tab S8 + - Snapdragon X Elite CRD - Xiaomi 12 - Xiaomi 12 Pro supported_chipsets: @@ -30,15 +31,16 @@ aggregated: - Snapdragon® 8 Gen 2 - Snapdragon® 8 Gen 3 - Snapdragon® 888 + - Snapdragon® X Elite models: - name: Shufflenet-v2 performance_metrics: - torchscript_onnx_tflite: - inference_time: 1290.0 - throughput: 775.1937984496124 + inference_time: 1228.0 + throughput: 814.3322475570033 estimated_peak_memory_range: - min: 16384 - max: 6876504 + min: 12288 + max: 2415688 primary_compute_unit: NPU precision: fp16 layer_info: @@ -46,14 +48,14 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 204 - job_id: jz57099lg + job_id: j1p3mjqmg job_status: Passed torchscript_onnx_qnn: - inference_time: 797.0 - throughput: 1254.7051442910915 + inference_time: 765.0 + throughput: 1307.18954248366 estimated_peak_memory_range: - min: 622592 - max: 68665608 + min: 16384 + max: 4038080 primary_compute_unit: NPU precision: fp16 layer_info: @@ -61,22 +63,22 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 158 - job_id: j0pxnxx15 + job_id: j7gjlvk1p job_status: Passed torchscript_onnx_ort: - inference_time: 1264.0 - throughput: 791.1392405063291 + inference_time: 1085.0 + throughput: 921.6589861751152 estimated_peak_memory_range: - min: 12288 - max: 11265544 + min: 315392 + max: 4250040 primary_compute_unit: NPU precision: fp16 layer_info: - layers_on_npu: 1 + layers_on_npu: 223 layers_on_gpu: 0 layers_on_cpu: 0 - total_layers: 1 - job_id: jegnlkkr5 + total_layers: 223 + job_id: jmg94l9m5 job_status: Passed reference_device_info: name: Samsung Galaxy S23 @@ -85,13 +87,13 @@ models: os_name: Android manufacturer: Samsung chipset: Snapdragon® 8 Gen 2 - timestamp: '2024-04-23T18:42:33.447120Z' + timestamp: '2024-05-20T16:35:31.039692Z' - torchscript_onnx_tflite: - inference_time: 855.0 - throughput: 1169.5906432748538 + inference_time: 791.0 + throughput: 1264.2225031605562 estimated_peak_memory_range: - min: 16384 - max: 33284208 + min: 20480 + max: 33699040 primary_compute_unit: NPU precision: fp16 layer_info: @@ -99,14 +101,14 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 204 - 
job_id: jqp4k33vg + job_id: jwgov2e15 job_status: Passed torchscript_onnx_qnn: - inference_time: 528.0 - throughput: 1893.939393939394 + inference_time: 515.0 + throughput: 1941.7475728155339 estimated_peak_memory_range: - min: 618496 - max: 53183776 + min: 12288 + max: 56897984 primary_compute_unit: NPU precision: fp16 layer_info: @@ -114,22 +116,22 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 158 - job_id: jo5mq88wp + job_id: jlpevd485 job_status: Passed torchscript_onnx_ort: - inference_time: 836.0 - throughput: 1196.1722488038276 + inference_time: 742.0 + throughput: 1347.7088948787061 estimated_peak_memory_range: min: 12288 - max: 17464352 + max: 24844160 primary_compute_unit: NPU precision: fp16 layer_info: - layers_on_npu: 1 + layers_on_npu: 223 layers_on_gpu: 0 layers_on_cpu: 0 - total_layers: 1 - job_id: jopr8ww95 + total_layers: 223 + job_id: jnp184qng job_status: Passed reference_device_info: name: Samsung Galaxy S24 @@ -138,13 +140,13 @@ models: os_name: Android manufacturer: Samsung chipset: Snapdragon® 8 Gen 3 - timestamp: '2024-04-23T18:42:33.447190Z' + timestamp: '2024-05-20T16:35:31.039719Z' - torchscript_onnx_tflite: - inference_time: 1291.0 - throughput: 774.5933384972889 + inference_time: 1227.0 + throughput: 814.9959250203749 estimated_peak_memory_range: min: 20480 - max: 6952312 + max: 1798552 primary_compute_unit: NPU precision: fp16 layer_info: @@ -152,14 +154,14 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 204 - job_id: jz5w201j5 + job_id: j1pvw6zzg job_status: Passed torchscript_onnx_qnn: - inference_time: 803.0 - throughput: 1245.3300124533 + inference_time: 762.0 + throughput: 1312.3359580052493 estimated_peak_memory_range: - min: 618496 - max: 103577192 + min: 622592 + max: 4805336 primary_compute_unit: NPU precision: fp16 layer_info: @@ -167,7 +169,7 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 158 - job_id: j0pxn8x95 + job_id: jz5w9em4p job_status: Passed reference_device_info: name: QCS8550 (Proxy) @@ -176,4 +178,57 @@ models: os_name: Android manufacturer: Qualcomm chipset: Qcs8550 - timestamp: '2024-04-23T18:42:33.447245Z' + timestamp: '2024-05-20T16:35:31.039737Z' + - torchscript_onnx_qnn: + inference_time: 929.0 + throughput: 1076.4262648008612 + estimated_peak_memory_range: + min: 602112 + max: 602112 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 158 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 158 + job_id: jygz73v4p + job_status: Passed + torchscript_onnx_ort: + inference_time: 1125.0 + throughput: 888.8888888888889 + estimated_peak_memory_range: + min: 10477568 + max: 10477568 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 223 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 223 + job_id: jvgdvx76g + job_status: Passed + torchscript_onnx_ort_dml_gpu: + inference_time: 1715.0 + throughput: 583.0903790087464 + estimated_peak_memory_range: + min: 12304384 + max: 12304384 + primary_compute_unit: GPU + precision: fp16 + layer_info: + layers_on_npu: 0 + layers_on_gpu: 1 + layers_on_cpu: 0 + total_layers: 1 + job_id: jz57dyvn5 + job_status: Passed + reference_device_info: + name: Snapdragon X Elite CRD + os: '11' + form_factor: Compute + os_name: Windows + manufacturer: Qualcomm + chipset: Snapdragon® X Elite + timestamp: '2024-05-20T16:35:31.039760Z' diff --git a/qai_hub_models/models/shufflenet_v2_quantized/README.md b/qai_hub_models/models/shufflenet_v2_quantized/README.md index 3a11090f..2d50ee72 100644 --- 
a/qai_hub_models/models/shufflenet_v2_quantized/README.md +++ b/qai_hub_models/models/shufflenet_v2_quantized/README.md @@ -14,6 +14,8 @@ accross various devices, can be found [here](https://aihub.qualcomm.com/models/s a hosted Qualcomm® device. + + ## Example & Usage diff --git a/qai_hub_models/models/shufflenet_v2_quantized/export.py b/qai_hub_models/models/shufflenet_v2_quantized/export.py index 59ef9fee..beda56ff 100644 --- a/qai_hub_models/models/shufflenet_v2_quantized/export.py +++ b/qai_hub_models/models/shufflenet_v2_quantized/export.py @@ -123,9 +123,16 @@ def export_model( else: quant_calibration_data = model.get_calibration_data(target_runtime, input_spec) + # Convert outputs from channel last to channel first (preferred I/O format for QNN and TensorFlow Lite) + channel_last_flags = ( + " --force_channel_last_input image_tensor" + if target_runtime != TargetRuntime.ORT + else "" + ) + # 2. Compile the model to an on-device asset model_compile_options = model.get_hub_compile_options( - target_runtime, compile_options + " --force_channel_last_input image_tensor" + target_runtime, compile_options + channel_last_flags, hub_device ) print(f"Optimizing model {model_name} to run on-device") submitted_compile_job = hub.submit_compile_job( @@ -167,8 +174,12 @@ def export_model( if target_runtime == TargetRuntime.QNN: hub_inputs = get_qnn_inputs(compile_job, sample_inputs) # Convert inputs from channel first to channel last - hub_inputs = transpose_channel_first_to_last( - "image_tensor", sample_inputs, target_runtime + hub_inputs = ( + sample_inputs + if target_runtime == TargetRuntime.ORT + else transpose_channel_first_to_last( + "image_tensor", sample_inputs, target_runtime + ) ) submitted_inference_job = hub.submit_inference_job( model=compile_job.get_target_model(), diff --git a/qai_hub_models/models/shufflenet_v2_quantized/perf.yaml b/qai_hub_models/models/shufflenet_v2_quantized/perf.yaml index b3cf96f4..fe83b379 100644 --- a/qai_hub_models/models/shufflenet_v2_quantized/perf.yaml +++ b/qai_hub_models/models/shufflenet_v2_quantized/perf.yaml @@ -26,6 +26,7 @@ aggregated: - Samsung Galaxy S24 Ultra - Samsung Galaxy S24+ - Samsung Galaxy Tab S8 + - Snapdragon X Elite CRD - Xiaomi 12 - Xiaomi 12 Pro supported_chipsets: @@ -36,15 +37,16 @@ aggregated: - Snapdragon® 8 Gen 2 - Snapdragon® 8 Gen 3 - Snapdragon® 888 + - Snapdragon® X Elite models: - name: Shufflenet-v2Quantized performance_metrics: - torchscript_onnx_tflite: - inference_time: 644.0 - throughput: 1552.7950310559006 + inference_time: 629.0 + throughput: 1589.825119236884 estimated_peak_memory_range: min: 12288 - max: 1838712 + max: 1960224 primary_compute_unit: NPU precision: int8 layer_info: @@ -52,14 +54,14 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 205 - job_id: jqpyrmm75 + job_id: jqp4wlj2g job_status: Passed torchscript_onnx_qnn: - inference_time: 592.0 - throughput: 1689.1891891891892 + inference_time: 584.0 + throughput: 1712.3287671232877 estimated_peak_memory_range: - min: 172032 - max: 9372520 + min: 24576 + max: 3645424 primary_compute_unit: NPU precision: int8 layer_info: @@ -67,8 +69,23 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 122 - job_id: j1p8011xg + job_id: jegne6rjg job_status: Passed + torchscript_onnx_ort: + inference_time: 'null' + throughput: 'null' + estimated_peak_memory_range: + min: 0 + max: 0 + primary_compute_unit: 'null' + precision: 'null' + layer_info: + layers_on_npu: 0 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 0 + job_id: j2p0rzk0p + job_status: 
Failed reference_device_info: name: Samsung Galaxy S23 os: '13' @@ -76,13 +93,13 @@ models: os_name: Android manufacturer: Samsung chipset: Snapdragon® 8 Gen 2 - timestamp: '2024-04-23T18:42:33.471327Z' + timestamp: '2024-05-20T16:48:45.827261Z' - torchscript_onnx_tflite: - inference_time: 464.0 - throughput: 2155.1724137931033 + inference_time: 458.0 + throughput: 2183.406113537118 estimated_peak_memory_range: min: 12288 - max: 22792592 + max: 22451232 primary_compute_unit: NPU precision: int8 layer_info: @@ -90,14 +107,14 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 205 - job_id: j2p03666p + job_id: j0px1ke8g job_status: Passed torchscript_onnx_qnn: - inference_time: 424.0 - throughput: 2358.490566037736 + inference_time: 419.0 + throughput: 2386.634844868735 estimated_peak_memory_range: min: 163840 - max: 45354944 + max: 45935136 primary_compute_unit: NPU precision: int8 layer_info: @@ -105,8 +122,23 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 122 - job_id: jogk7882p + job_id: jopryv1kg job_status: Passed + torchscript_onnx_ort: + inference_time: 'null' + throughput: 'null' + estimated_peak_memory_range: + min: 0 + max: 0 + primary_compute_unit: 'null' + precision: 'null' + layer_info: + layers_on_npu: 0 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 0 + job_id: j1p87q8q5 + job_status: Failed reference_device_info: name: Samsung Galaxy S24 os: '14' @@ -114,37 +146,75 @@ models: os_name: Android manufacturer: Samsung chipset: Snapdragon® 8 Gen 3 - timestamp: '2024-04-23T18:42:33.471384Z' + timestamp: '2024-05-20T16:48:45.827325Z' - torchscript_onnx_tflite: - inference_time: 1064.0 - throughput: 939.8496240601504 + inference_time: 649.0 + throughput: 1540.8320493066255 estimated_peak_memory_range: min: 12288 - max: 16582800 + max: 1657808 primary_compute_unit: NPU precision: int8 layer_info: - layers_on_npu: 207 + layers_on_npu: 205 layers_on_gpu: 0 layers_on_cpu: 0 - total_layers: 207 - job_id: jogk7w8op + total_layers: 205 + job_id: jn5q3dwmp job_status: Passed torchscript_onnx_qnn: - inference_time: 'null' - throughput: 'null' + inference_time: 585.0 + throughput: 1709.4017094017095 estimated_peak_memory_range: - min: 0 - max: 0 - primary_compute_unit: 'null' - precision: 'null' + min: 16384 + max: 13811248 + primary_compute_unit: NPU + precision: int8 layer_info: - layers_on_npu: 0 + layers_on_npu: 122 layers_on_gpu: 0 layers_on_cpu: 0 - total_layers: 0 - job_id: jwgok78dp - job_status: Failed + total_layers: 122 + job_id: jqpyd1v0p + job_status: Passed + reference_device_info: + name: QCS8550 (Proxy) + os: '12' + form_factor: Iot + os_name: Android + manufacturer: Qualcomm + chipset: Qcs8550 + timestamp: '2024-05-20T16:48:45.827382Z' + - torchscript_onnx_tflite: + inference_time: 946.0 + throughput: 1057.0824524312895 + estimated_peak_memory_range: + min: 12288 + max: 16954944 + primary_compute_unit: NPU + precision: int8 + layer_info: + layers_on_npu: 205 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 205 + job_id: j1gl3q7lg + job_status: Passed + torchscript_onnx_qnn: + inference_time: 1140.0 + throughput: 877.1929824561404 + estimated_peak_memory_range: + min: 294912 + max: 42839088 + primary_compute_unit: NPU + precision: int8 + layer_info: + layers_on_npu: 122 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 122 + job_id: jlpekn20p + job_status: Passed reference_device_info: name: RB3 Gen 2 (Proxy) os: '12' @@ -152,21 +222,21 @@ models: os_name: Android manufacturer: Qualcomm chipset: Qcs6490 - timestamp: 
'2024-04-23T18:42:33.471427Z' + timestamp: '2024-05-20T16:48:45.827429Z' - torchscript_onnx_tflite: - inference_time: 10090.0 - throughput: 99.10802775024777 + inference_time: 8918.0 + throughput: 112.13276519398968 estimated_peak_memory_range: - min: 12288 - max: 6455280 + min: 53248 + max: 6490632 primary_compute_unit: CPU precision: fp32 layer_info: - layers_on_npu: 44 + layers_on_npu: 43 layers_on_gpu: 9 - layers_on_cpu: 154 - total_layers: 207 - job_id: jz5w3y9jp + layers_on_cpu: 153 + total_layers: 205 + job_id: jw56n0v7g job_status: Passed reference_device_info: name: RB5 (Proxy) @@ -175,42 +245,57 @@ models: os_name: Android manufacturer: Qualcomm chipset: Qcs8250 - timestamp: '2024-04-23T18:42:33.471459Z' - - torchscript_onnx_tflite: - inference_time: 667.0 - throughput: 1499.2503748125937 + timestamp: '2024-05-20T16:48:45.827459Z' + - torchscript_onnx_qnn: + inference_time: 669.0 + throughput: 1494.7683109118086 estimated_peak_memory_range: - min: 24576 - max: 2164120 + min: 532480 + max: 532480 primary_compute_unit: NPU precision: int8 layer_info: - layers_on_npu: 207 + layers_on_npu: 122 layers_on_gpu: 0 layers_on_cpu: 0 - total_layers: 207 - job_id: jn5qexvm5 + total_layers: 122 + job_id: jep2mk365 job_status: Passed - torchscript_onnx_qnn: - inference_time: 618.0 - throughput: 1618.1229773462783 + torchscript_onnx_ort: + inference_time: 'null' + throughput: 'null' estimated_peak_memory_range: - min: 634880 - max: 8982056 - primary_compute_unit: NPU - precision: int8 + min: 0 + max: 0 + primary_compute_unit: 'null' + precision: 'null' layer_info: - layers_on_npu: 124 + layers_on_npu: 0 layers_on_gpu: 0 layers_on_cpu: 0 - total_layers: 124 - job_id: j7gjz6q85 + total_layers: 0 + job_id: jogkyedvp + job_status: Failed + torchscript_onnx_ort_dml_gpu: + inference_time: 1478.0 + throughput: 676.5899864682003 + estimated_peak_memory_range: + min: 6258688 + max: 6258688 + primary_compute_unit: CPU + precision: fp32 + layer_info: + layers_on_npu: 0 + layers_on_gpu: 0 + layers_on_cpu: 263 + total_layers: 263 + job_id: jn5q26we5 job_status: Passed reference_device_info: - name: QCS8550 (Proxy) - os: '12' - form_factor: Iot - os_name: Android + name: Snapdragon X Elite CRD + os: '11' + form_factor: Compute + os_name: Windows manufacturer: Qualcomm - chipset: Qcs8550 - timestamp: '2024-04-23T18:42:33.471511Z' + chipset: Snapdragon® X Elite + timestamp: '2024-05-20T16:48:45.827519Z' diff --git a/qai_hub_models/models/sinet/README.md b/qai_hub_models/models/sinet/README.md index 48577d92..601b6d46 100644 --- a/qai_hub_models/models/sinet/README.md +++ b/qai_hub_models/models/sinet/README.md @@ -14,6 +14,8 @@ accross various devices, can be found [here](https://aihub.qualcomm.com/models/s a hosted Qualcomm® device. + + ## Example & Usage diff --git a/qai_hub_models/models/sinet/export.py b/qai_hub_models/models/sinet/export.py index ad102c99..6840f297 100644 --- a/qai_hub_models/models/sinet/export.py +++ b/qai_hub_models/models/sinet/export.py @@ -120,12 +120,16 @@ def export_model( model.eval() source_model = torch.jit.trace(model.to("cpu"), make_torch_inputs(input_spec)) + # Convert outputs from channel last to channel first (preferred I/O format for QNN and TensorFlow Lite) + channel_last_flags = ( + " --force_channel_last_input image" + " --force_channel_last_output output_0" + if target_runtime != TargetRuntime.ORT + else "" + ) + # 2. 
Compile the model to an on-device asset model_compile_options = model.get_hub_compile_options( - target_runtime, - compile_options - + " --force_channel_last_input image" - + " --force_channel_last_output output_0", + target_runtime, compile_options + channel_last_flags, hub_device ) print(f"Optimizing model {model_name} to run on-device") submitted_compile_job = hub.submit_compile_job( @@ -163,8 +167,10 @@ def export_model( ) sample_inputs = model.sample_inputs(input_spec) # Convert inputs from channel first to channel last - hub_inputs = transpose_channel_first_to_last( - "image", sample_inputs, target_runtime + hub_inputs = ( + sample_inputs + if target_runtime == TargetRuntime.ORT + else transpose_channel_first_to_last("image", sample_inputs, target_runtime) ) submitted_inference_job = hub.submit_inference_job( model=compile_job.get_target_model(), @@ -192,8 +198,12 @@ def export_model( assert inference_job is not None and inference_job.wait().success inference_result: hub.client.DatasetEntries = inference_job.download_output_data() # type: ignore # Convert outputs from channel last to channel first - inference_result = transpose_channel_last_to_first( - "output_0", inference_result, target_runtime + inference_result = ( + inference_result + if target_runtime == TargetRuntime.ORT + else transpose_channel_last_to_first( + "output_0", inference_result, target_runtime + ) ) print_inference_metrics(inference_job, inference_result, torch_out) @@ -205,7 +215,7 @@ def export_model( def main(): warnings.filterwarnings("ignore") - parser = export_parser(model_cls=Model, supports_ort=False) + parser = export_parser(model_cls=Model) args = parser.parse_args() export_model(**vars(args)) diff --git a/qai_hub_models/models/sinet/perf.yaml b/qai_hub_models/models/sinet/perf.yaml index 9f70c128..f4e03b55 100644 --- a/qai_hub_models/models/sinet/perf.yaml +++ b/qai_hub_models/models/sinet/perf.yaml @@ -22,6 +22,7 @@ aggregated: - Samsung Galaxy S24 Ultra - Samsung Galaxy S24+ - Samsung Galaxy Tab S8 + - Snapdragon X Elite CRD - Xiaomi 12 - Xiaomi 12 Pro supported_chipsets: @@ -30,15 +31,16 @@ aggregated: - Snapdragon® 8 Gen 2 - Snapdragon® 8 Gen 3 - Snapdragon® 888 + - Snapdragon® X Elite models: - name: SINet performance_metrics: - torchscript_onnx_tflite: - inference_time: 1826.0 - throughput: 547.645125958379 + inference_time: 1797.0 + throughput: 556.4830272676684 estimated_peak_memory_range: min: 12288 - max: 2609144 + max: 2452968 primary_compute_unit: NPU precision: fp16 layer_info: @@ -46,14 +48,14 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 240 - job_id: jn5qevv45 + job_id: j1glkv72p job_status: Passed torchscript_onnx_qnn: - inference_time: 1184.0 - throughput: 844.5945945945946 + inference_time: 1171.0 + throughput: 853.9709649871904 estimated_peak_memory_range: - min: 618496 - max: 4714320 + min: 2113536 + max: 14886760 primary_compute_unit: NPU precision: fp16 layer_info: @@ -61,23 +63,23 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 186 - job_id: jw56eww0g + job_id: jwgov2m15 job_status: Passed torchscript_onnx_ort: - inference_time: 'null' - throughput: 'null' + inference_time: 2285.0 + throughput: 437.636761487965 estimated_peak_memory_range: - min: 0 - max: 0 - primary_compute_unit: 'null' - precision: 'null' + min: 618496 + max: 35536752 + primary_compute_unit: NPU + precision: fp16 layer_info: - layers_on_npu: 0 + layers_on_npu: 229 layers_on_gpu: 0 layers_on_cpu: 0 - total_layers: 0 - job_id: jwgok88xp - job_status: Failed + total_layers: 229 + job_id: 
jygz73w4p + job_status: Passed reference_device_info: name: Samsung Galaxy S23 os: '13' @@ -85,13 +87,13 @@ models: os_name: Android manufacturer: Samsung chipset: Snapdragon® 8 Gen 2 - timestamp: '2024-04-23T18:42:33.500477Z' + timestamp: '2024-05-20T16:35:31.109606Z' - torchscript_onnx_tflite: - inference_time: 1171.0 - throughput: 853.9709649871904 + inference_time: 1169.0 + throughput: 855.4319931565441 estimated_peak_memory_range: min: 12288 - max: 25301888 + max: 25617584 primary_compute_unit: NPU precision: fp16 layer_info: @@ -99,14 +101,14 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 240 - job_id: j1gl6ll8g + job_id: jw561yvnp job_status: Passed torchscript_onnx_qnn: - inference_time: 799.0 - throughput: 1251.5644555694619 + inference_time: 780.0 + throughput: 1282.051282051282 estimated_peak_memory_range: - min: 12288 - max: 64850320 + min: 618496 + max: 71418032 primary_compute_unit: NPU precision: fp16 layer_info: @@ -114,23 +116,23 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 186 - job_id: j1p3v66lg + job_id: j1pvw64zg job_status: Passed torchscript_onnx_ort: - inference_time: 'null' - throughput: 'null' + inference_time: 1599.0 + throughput: 625.3908692933084 estimated_peak_memory_range: - min: 0 - max: 0 - primary_compute_unit: 'null' - precision: 'null' + min: 12288 + max: 27418000 + primary_compute_unit: NPU + precision: fp16 layer_info: - layers_on_npu: 0 + layers_on_npu: 229 layers_on_gpu: 0 layers_on_cpu: 0 - total_layers: 0 - job_id: j1pv077j5 - job_status: Failed + total_layers: 229 + job_id: jz5w9ex4p + job_status: Passed reference_device_info: name: Samsung Galaxy S24 os: '14' @@ -138,13 +140,13 @@ models: os_name: Android manufacturer: Samsung chipset: Snapdragon® 8 Gen 3 - timestamp: '2024-04-23T18:42:33.500551Z' + timestamp: '2024-05-20T16:35:31.109633Z' - torchscript_onnx_tflite: - inference_time: 1823.0 - throughput: 548.5463521667581 + inference_time: 1810.0 + throughput: 552.4861878453039 estimated_peak_memory_range: - min: 24576 - max: 1974184 + min: 16384 + max: 2390784 primary_compute_unit: NPU precision: fp16 layer_info: @@ -152,14 +154,14 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 240 - job_id: jz57014rg + job_id: j1p3mj8mg job_status: Passed torchscript_onnx_qnn: - inference_time: 1185.0 - throughput: 843.8818565400844 + inference_time: 1168.0 + throughput: 856.1643835616438 estimated_peak_memory_range: - min: 634880 - max: 5992992 + min: 626688 + max: 8177944 primary_compute_unit: NPU precision: fp16 layer_info: @@ -167,7 +169,7 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 186 - job_id: jopr8m0e5 + job_id: jlpevd285 job_status: Passed reference_device_info: name: QCS8550 (Proxy) @@ -176,4 +178,57 @@ models: os_name: Android manufacturer: Qualcomm chipset: Qcs8550 - timestamp: '2024-04-23T18:42:33.500612Z' + timestamp: '2024-05-20T16:35:31.109650Z' + - torchscript_onnx_qnn: + inference_time: 1401.0 + throughput: 713.7758743754462 + estimated_peak_memory_range: + min: 602112 + max: 602112 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 186 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 186 + job_id: j7gjlv11p + job_status: Passed + torchscript_onnx_ort: + inference_time: 2469.0 + throughput: 405.0222762251924 + estimated_peak_memory_range: + min: 3219456 + max: 3219456 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 229 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 229 + job_id: jmg94l8m5 + job_status: Passed + 
torchscript_onnx_ort_dml_gpu: + inference_time: 2976.0 + throughput: 336.02150537634407 + estimated_peak_memory_range: + min: 13578240 + max: 13578240 + primary_compute_unit: GPU + precision: fp16 + layer_info: + layers_on_npu: 0 + layers_on_gpu: 1 + layers_on_cpu: 0 + total_layers: 1 + job_id: jnp1843ng + job_status: Passed + reference_device_info: + name: Snapdragon X Elite CRD + os: '11' + form_factor: Compute + os_name: Windows + manufacturer: Qualcomm + chipset: Snapdragon® X Elite + timestamp: '2024-05-20T16:35:31.109674Z' diff --git a/qai_hub_models/models/squeezenet1_1/README.md b/qai_hub_models/models/squeezenet1_1/README.md index e35838b5..879ef789 100644 --- a/qai_hub_models/models/squeezenet1_1/README.md +++ b/qai_hub_models/models/squeezenet1_1/README.md @@ -14,6 +14,8 @@ accross various devices, can be found [here](https://aihub.qualcomm.com/models/s a hosted Qualcomm® device. + + ## Example & Usage diff --git a/qai_hub_models/models/squeezenet1_1/export.py b/qai_hub_models/models/squeezenet1_1/export.py index c1840cfa..21488484 100644 --- a/qai_hub_models/models/squeezenet1_1/export.py +++ b/qai_hub_models/models/squeezenet1_1/export.py @@ -119,9 +119,16 @@ def export_model( model.eval() source_model = torch.jit.trace(model.to("cpu"), make_torch_inputs(input_spec)) + # Convert outputs from channel last to channel first (preferred I/O format for QNN and TensorFlow Lite) + channel_last_flags = ( + " --force_channel_last_input image_tensor" + if target_runtime != TargetRuntime.ORT + else "" + ) + # 2. Compile the model to an on-device asset model_compile_options = model.get_hub_compile_options( - target_runtime, compile_options + " --force_channel_last_input image_tensor" + target_runtime, compile_options + channel_last_flags, hub_device ) print(f"Optimizing model {model_name} to run on-device") submitted_compile_job = hub.submit_compile_job( @@ -159,8 +166,12 @@ def export_model( ) sample_inputs = model.sample_inputs(input_spec) # Convert inputs from channel first to channel last - hub_inputs = transpose_channel_first_to_last( - "image_tensor", sample_inputs, target_runtime + hub_inputs = ( + sample_inputs + if target_runtime == TargetRuntime.ORT + else transpose_channel_first_to_last( + "image_tensor", sample_inputs, target_runtime + ) ) submitted_inference_job = hub.submit_inference_job( model=compile_job.get_target_model(), @@ -197,7 +208,7 @@ def export_model( def main(): warnings.filterwarnings("ignore") - parser = export_parser(model_cls=Model, supports_ort=False) + parser = export_parser(model_cls=Model) args = parser.parse_args() export_model(**vars(args)) diff --git a/qai_hub_models/models/squeezenet1_1/perf.yaml b/qai_hub_models/models/squeezenet1_1/perf.yaml index 0326ac8b..e6eb2648 100644 --- a/qai_hub_models/models/squeezenet1_1/perf.yaml +++ b/qai_hub_models/models/squeezenet1_1/perf.yaml @@ -22,6 +22,7 @@ aggregated: - Samsung Galaxy S24 Ultra - Samsung Galaxy S24+ - Samsung Galaxy Tab S8 + - Snapdragon X Elite CRD - Xiaomi 12 - Xiaomi 12 Pro supported_chipsets: @@ -30,15 +31,16 @@ aggregated: - Snapdragon® 8 Gen 2 - Snapdragon® 8 Gen 3 - Snapdragon® 888 + - Snapdragon® X Elite models: - name: SqueezeNet-1_1 performance_metrics: - torchscript_onnx_tflite: - inference_time: 672.0 - throughput: 1488.095238095238 + inference_time: 664.0 + throughput: 1506.0240963855422 estimated_peak_memory_range: min: 12288 - max: 1740976 + max: 1506784 primary_compute_unit: NPU precision: fp16 layer_info: @@ -46,14 +48,14 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 
41 - job_id: jlpeeyy1p + job_id: jvgdvx06g job_status: Passed torchscript_onnx_qnn: - inference_time: 711.0 - throughput: 1406.4697609001407 + inference_time: 712.0 + throughput: 1404.4943820224719 estimated_peak_memory_range: - min: 638976 - max: 12256680 + min: 618496 + max: 7468520 primary_compute_unit: NPU precision: fp16 layer_info: @@ -61,22 +63,22 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 70 - job_id: jz5w24765 + job_id: j0px1km8g job_status: Passed torchscript_onnx_ort: - inference_time: 861.0 - throughput: 1161.4401858304298 + inference_time: 651.0 + throughput: 1536.0983102918588 estimated_peak_memory_range: min: 12288 - max: 10395112 + max: 7201352 primary_compute_unit: NPU precision: fp16 layer_info: - layers_on_npu: 1 + layers_on_npu: 71 layers_on_gpu: 0 layers_on_cpu: 0 - total_layers: 1 - job_id: jnp1y6j2p + total_layers: 71 + job_id: jep2mkj65 job_status: Passed reference_device_info: name: Samsung Galaxy S23 @@ -85,13 +87,13 @@ models: os_name: Android manufacturer: Samsung chipset: Snapdragon® 8 Gen 2 - timestamp: '2024-04-23T18:42:33.524625Z' + timestamp: '2024-05-20T16:35:31.140012Z' - torchscript_onnx_tflite: - inference_time: 453.0 - throughput: 2207.5055187637968 + inference_time: 477.0 + throughput: 2096.4360587002097 estimated_peak_memory_range: - min: 12288 - max: 22540768 + min: 0 + max: 22219968 primary_compute_unit: NPU precision: fp16 layer_info: @@ -99,14 +101,14 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 41 - job_id: jygzonnk5 + job_id: jz57dy6n5 job_status: Passed torchscript_onnx_qnn: inference_time: 490.0 throughput: 2040.8163265306123 estimated_peak_memory_range: min: 618496 - max: 28785760 + max: 27578288 primary_compute_unit: NPU precision: fp16 layer_info: @@ -114,22 +116,22 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 70 - job_id: jmg9jdml5 + job_id: jo5mzn47p job_status: Passed torchscript_onnx_ort: - inference_time: 618.0 - throughput: 1618.1229773462783 + inference_time: 488.0 + throughput: 2049.1803278688526 estimated_peak_memory_range: - min: 618496 - max: 20314848 + min: 24576 + max: 17829040 primary_compute_unit: NPU precision: fp16 layer_info: - layers_on_npu: 1 + layers_on_npu: 71 layers_on_gpu: 0 layers_on_cpu: 0 - total_layers: 1 - job_id: jvgde23e5 + total_layers: 71 + job_id: jqpyd1n0p job_status: Passed reference_device_info: name: Samsung Galaxy S24 @@ -138,13 +140,13 @@ models: os_name: Android manufacturer: Samsung chipset: Snapdragon® 8 Gen 3 - timestamp: '2024-04-23T18:42:33.524666Z' + timestamp: '2024-05-20T16:35:31.140037Z' - torchscript_onnx_tflite: - inference_time: 672.0 - throughput: 1488.095238095238 + inference_time: 664.0 + throughput: 1506.0240963855422 estimated_peak_memory_range: - min: 12288 - max: 1757808 + min: 20480 + max: 1789832 primary_compute_unit: NPU precision: fp16 layer_info: @@ -152,14 +154,14 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 41 - job_id: j1pv0ydm5 + job_id: jqp4wl82g job_status: Passed torchscript_onnx_qnn: - inference_time: 718.0 - throughput: 1392.757660167131 + inference_time: 701.0 + throughput: 1426.5335235378031 estimated_peak_memory_range: - min: 618496 - max: 75568808 + min: 626688 + max: 3276112 primary_compute_unit: NPU precision: fp16 layer_info: @@ -167,7 +169,7 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 70 - job_id: jnp1ykjlp + job_id: jopryv9kg job_status: Passed reference_device_info: name: QCS8550 (Proxy) @@ -176,4 +178,57 @@ models: os_name: Android manufacturer: Qualcomm chipset: Qcs8550 - timestamp: 
'2024-04-23T18:42:33.524695Z' + timestamp: '2024-05-20T16:35:31.140054Z' + - torchscript_onnx_qnn: + inference_time: 828.0 + throughput: 1207.729468599034 + estimated_peak_memory_range: + min: 602112 + max: 602112 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 70 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 70 + job_id: jegne6xjg + job_status: Passed + torchscript_onnx_ort: + inference_time: 696.0 + throughput: 1436.7816091954023 + estimated_peak_memory_range: + min: 3063808 + max: 3063808 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 71 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 71 + job_id: j2p0rzd0p + job_status: Passed + torchscript_onnx_ort_dml_gpu: + inference_time: 2093.0 + throughput: 477.78308647873865 + estimated_peak_memory_range: + min: 9494528 + max: 9494528 + primary_compute_unit: CPU + precision: fp32 + layer_info: + layers_on_npu: 0 + layers_on_gpu: 0 + layers_on_cpu: 41 + total_layers: 41 + job_id: j1p87q6q5 + job_status: Passed + reference_device_info: + name: Snapdragon X Elite CRD + os: '11' + form_factor: Compute + os_name: Windows + manufacturer: Qualcomm + chipset: Snapdragon® X Elite + timestamp: '2024-05-20T16:35:31.140076Z' diff --git a/qai_hub_models/models/squeezenet1_1_quantized/README.md b/qai_hub_models/models/squeezenet1_1_quantized/README.md index 305ab62f..f13301a8 100644 --- a/qai_hub_models/models/squeezenet1_1_quantized/README.md +++ b/qai_hub_models/models/squeezenet1_1_quantized/README.md @@ -14,6 +14,8 @@ accross various devices, can be found [here](https://aihub.qualcomm.com/models/s a hosted Qualcomm® device. + + ## Example & Usage diff --git a/qai_hub_models/models/squeezenet1_1_quantized/export.py b/qai_hub_models/models/squeezenet1_1_quantized/export.py index 202478bd..964eb563 100644 --- a/qai_hub_models/models/squeezenet1_1_quantized/export.py +++ b/qai_hub_models/models/squeezenet1_1_quantized/export.py @@ -123,9 +123,16 @@ def export_model( else: quant_calibration_data = model.get_calibration_data(target_runtime, input_spec) + # Convert outputs from channel last to channel first (preferred I/O format for QNN and TensorFlow Lite) + channel_last_flags = ( + " --force_channel_last_input image_tensor" + if target_runtime != TargetRuntime.ORT + else "" + ) + # 2. 
Compile the model to an on-device asset model_compile_options = model.get_hub_compile_options( - target_runtime, compile_options + " --force_channel_last_input image_tensor" + target_runtime, compile_options + channel_last_flags, hub_device ) print(f"Optimizing model {model_name} to run on-device") submitted_compile_job = hub.submit_compile_job( @@ -167,8 +174,12 @@ def export_model( if target_runtime == TargetRuntime.QNN: hub_inputs = get_qnn_inputs(compile_job, sample_inputs) # Convert inputs from channel first to channel last - hub_inputs = transpose_channel_first_to_last( - "image_tensor", sample_inputs, target_runtime + hub_inputs = ( + sample_inputs + if target_runtime == TargetRuntime.ORT + else transpose_channel_first_to_last( + "image_tensor", sample_inputs, target_runtime + ) ) submitted_inference_job = hub.submit_inference_job( model=compile_job.get_target_model(), @@ -205,7 +216,7 @@ def export_model( def main(): warnings.filterwarnings("ignore") - parser = export_parser(model_cls=Model, supports_ort=False) + parser = export_parser(model_cls=Model) args = parser.parse_args() export_model(**vars(args)) diff --git a/qai_hub_models/models/squeezenet1_1_quantized/perf.yaml b/qai_hub_models/models/squeezenet1_1_quantized/perf.yaml index 0ee10d5b..d7ad8b01 100644 --- a/qai_hub_models/models/squeezenet1_1_quantized/perf.yaml +++ b/qai_hub_models/models/squeezenet1_1_quantized/perf.yaml @@ -26,6 +26,7 @@ aggregated: - Samsung Galaxy S24 Ultra - Samsung Galaxy S24+ - Samsung Galaxy Tab S8 + - Snapdragon X Elite CRD - Xiaomi 12 - Xiaomi 12 Pro supported_chipsets: @@ -36,15 +37,16 @@ aggregated: - Snapdragon® 8 Gen 2 - Snapdragon® 8 Gen 3 - Snapdragon® 888 + - Snapdragon® X Elite models: - name: SqueezeNet-1_1Quantized performance_metrics: - torchscript_onnx_tflite: - inference_time: 218.0 - throughput: 4587.155963302752 + inference_time: 221.0 + throughput: 4524.886877828054 estimated_peak_memory_range: - min: 24576 - max: 1453208 + min: 12288 + max: 2523424 primary_compute_unit: NPU precision: int8 layer_info: @@ -52,14 +54,14 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 41 - job_id: jmg9jdmw5 + job_id: jogkyeovp job_status: Passed torchscript_onnx_qnn: - inference_time: 466.0 - throughput: 2145.922746781116 + inference_time: 467.0 + throughput: 2141.3276231263385 estimated_peak_memory_range: - min: 12288 - max: 10115704 + min: 176128 + max: 80481816 primary_compute_unit: NPU precision: int8 layer_info: @@ -67,22 +69,22 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 45 - job_id: jvgde23r5 + job_id: jw561yrnp job_status: Passed torchscript_onnx_ort: - inference_time: 811.0 - throughput: 1233.0456226880394 + inference_time: 550.0 + throughput: 1818.1818181818182 estimated_peak_memory_range: min: 618496 - max: 5355192 + max: 7743200 primary_compute_unit: NPU precision: int8 layer_info: - layers_on_npu: 1 + layers_on_npu: 49 layers_on_gpu: 0 layers_on_cpu: 0 - total_layers: 1 - job_id: jqp4k318g + total_layers: 49 + job_id: j7gjlvo1p job_status: Passed reference_device_info: name: Samsung Galaxy S23 @@ -91,13 +93,13 @@ models: os_name: Android manufacturer: Samsung chipset: Snapdragon® 8 Gen 2 - timestamp: '2024-04-23T18:42:33.548624Z' + timestamp: '2024-05-20T16:35:31.170251Z' - torchscript_onnx_tflite: - inference_time: 178.0 - throughput: 5617.9775280898875 + inference_time: 184.0 + throughput: 5434.782608695652 estimated_peak_memory_range: min: 12288 - max: 21783424 + max: 22090256 primary_compute_unit: NPU precision: int8 layer_info: @@ -105,14 +107,14 @@ 
models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 41 - job_id: jnp1y6j8p + job_id: jn5q26ze5 job_status: Passed torchscript_onnx_qnn: - inference_time: 343.0 - throughput: 2915.451895043732 + inference_time: 341.0 + throughput: 2932.551319648094 estimated_peak_memory_range: - min: 167936 - max: 23042032 + min: 163840 + max: 26837472 primary_compute_unit: NPU precision: int8 layer_info: @@ -120,22 +122,22 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 45 - job_id: jz57094vg + job_id: j1p3mjxmg job_status: Passed torchscript_onnx_ort: - inference_time: 632.0 - throughput: 1582.2784810126582 + inference_time: 421.0 + throughput: 2375.296912114014 estimated_peak_memory_range: min: 12288 - max: 16606592 + max: 16755200 primary_compute_unit: NPU precision: int8 layer_info: - layers_on_npu: 1 + layers_on_npu: 49 layers_on_gpu: 0 layers_on_cpu: 0 - total_layers: 1 - job_id: j0pxnx435 + total_layers: 49 + job_id: jlpevd885 job_status: Passed reference_device_info: name: Samsung Galaxy S24 @@ -144,51 +146,74 @@ models: os_name: Android manufacturer: Samsung chipset: Snapdragon® 8 Gen 3 - timestamp: '2024-04-23T18:42:33.548662Z' + timestamp: '2024-05-20T16:35:31.170278Z' - torchscript_onnx_tflite: - inference_time: 645.0 - throughput: 1550.3875968992247 + inference_time: 225.0 + throughput: 4444.444444444444 estimated_peak_memory_range: - min: 12288 - max: 14710928 + min: 28672 + max: 1537872 primary_compute_unit: NPU precision: int8 layer_info: - layers_on_npu: 43 + layers_on_npu: 41 layers_on_gpu: 0 layers_on_cpu: 0 - total_layers: 43 - job_id: jogk7w2op + total_layers: 41 + job_id: j1glkvo2p job_status: Passed torchscript_onnx_qnn: - inference_time: 'null' - throughput: 'null' + inference_time: 471.0 + throughput: 2123.1422505307855 estimated_peak_memory_range: - min: 0 - max: 0 - primary_compute_unit: 'null' - precision: 'null' + min: 12288 + max: 9792120 + primary_compute_unit: NPU + precision: int8 layer_info: - layers_on_npu: 0 + layers_on_npu: 45 layers_on_gpu: 0 layers_on_cpu: 0 - total_layers: 0 - job_id: j1pv0ylm5 - job_status: Failed - torchscript_onnx_ort: - inference_time: 3597.0 - throughput: 278.00945232137894 + total_layers: 45 + job_id: j1pvw6ezg + job_status: Passed + reference_device_info: + name: QCS8550 (Proxy) + os: '12' + form_factor: Iot + os_name: Android + manufacturer: Qualcomm + chipset: Qcs8550 + timestamp: '2024-05-20T16:35:31.170295Z' + - torchscript_onnx_tflite: + inference_time: 538.0 + throughput: 1858.736059479554 estimated_peak_memory_range: - min: 0 - max: 28318256 - primary_compute_unit: CPU - precision: fp32 + min: 12288 + max: 14558896 + primary_compute_unit: NPU + precision: int8 layer_info: - layers_on_npu: 0 + layers_on_npu: 41 layers_on_gpu: 0 - layers_on_cpu: 51 - total_layers: 51 - job_id: jo5mq8mdp + layers_on_cpu: 0 + total_layers: 41 + job_id: jmg9wqkvp + job_status: Passed + torchscript_onnx_qnn: + inference_time: 958.0 + throughput: 1043.8413361169103 + estimated_peak_memory_range: + min: 163840 + max: 22853712 + primary_compute_unit: NPU + precision: int8 + layer_info: + layers_on_npu: 45 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 45 + job_id: jo5m3ldqg job_status: Passed reference_device_info: name: RB3 Gen 2 (Proxy) @@ -197,21 +222,21 @@ models: os_name: Android manufacturer: Qualcomm chipset: Qcs6490 - timestamp: '2024-04-23T18:42:33.548701Z' + timestamp: '2024-05-20T16:35:31.170312Z' - torchscript_onnx_tflite: - inference_time: 4261.0 - throughput: 234.6866932644919 + inference_time: 4066.0 + throughput: 
245.94195769798327 estimated_peak_memory_range: - min: 90112 - max: 1970416 + min: 28672 + max: 6476760 primary_compute_unit: NPU precision: int8 layer_info: - layers_on_npu: 43 + layers_on_npu: 41 layers_on_gpu: 0 layers_on_cpu: 0 - total_layers: 43 - job_id: jmg9yo4v5 + total_layers: 41 + job_id: jnp1em7lg job_status: Passed reference_device_info: name: RB5 (Proxy) @@ -220,42 +245,57 @@ models: os_name: Android manufacturer: Qualcomm chipset: Qcs8250 - timestamp: '2024-04-23T18:42:33.548716Z' - - torchscript_onnx_tflite: - inference_time: 246.0 - throughput: 4065.040650406504 + timestamp: '2024-05-20T16:35:31.170322Z' + - torchscript_onnx_qnn: + inference_time: 580.0 + throughput: 1724.1379310344828 estimated_peak_memory_range: - min: 12288 - max: 1876064 + min: 622592 + max: 622592 primary_compute_unit: NPU precision: int8 layer_info: - layers_on_npu: 43 + layers_on_npu: 45 layers_on_gpu: 0 layers_on_cpu: 0 - total_layers: 43 - job_id: j1gl69ylg + total_layers: 45 + job_id: jwgov2o15 job_status: Passed - torchscript_onnx_qnn: - inference_time: 507.0 - throughput: 1972.3865877712033 + torchscript_onnx_ort: + inference_time: 571.0 + throughput: 1751.3134851138354 estimated_peak_memory_range: - min: 528384 - max: 12189432 + min: 1773568 + max: 1773568 primary_compute_unit: NPU precision: int8 layer_info: - layers_on_npu: 47 + layers_on_npu: 49 layers_on_gpu: 0 layers_on_cpu: 0 - total_layers: 47 - job_id: jygzoql65 + total_layers: 49 + job_id: jygz7384p + job_status: Passed + torchscript_onnx_ort_dml_gpu: + inference_time: 979.0 + throughput: 1021.4504596527069 + estimated_peak_memory_range: + min: 4251648 + max: 4251648 + primary_compute_unit: CPU + precision: fp32 + layer_info: + layers_on_npu: 0 + layers_on_gpu: 0 + layers_on_cpu: 51 + total_layers: 51 + job_id: jz5w9e84p job_status: Passed reference_device_info: - name: QCS8550 (Proxy) - os: '12' - form_factor: Iot - os_name: Android + name: Snapdragon X Elite CRD + os: '11' + form_factor: Compute + os_name: Windows manufacturer: Qualcomm - chipset: Qcs8550 - timestamp: '2024-04-23T18:42:33.548742Z' + chipset: Snapdragon® X Elite + timestamp: '2024-05-20T16:35:31.170345Z' diff --git a/qai_hub_models/models/stable_diffusion_quantized/README.md b/qai_hub_models/models/stable_diffusion_v1_5_quantized/README.md similarity index 85% rename from qai_hub_models/models/stable_diffusion_quantized/README.md rename to qai_hub_models/models/stable_diffusion_v1_5_quantized/README.md index de6f68d1..e7447ff1 100644 --- a/qai_hub_models/models/stable_diffusion_quantized/README.md +++ b/qai_hub_models/models/stable_diffusion_v1_5_quantized/README.md @@ -1,31 +1,33 @@ [![Qualcomm® AI Hub Models](https://qaihub-public-assets.s3.us-west-2.amazonaws.com/qai-hub-models/quic-logo.jpg)](../../README.md) -# [Stable-Diffusion: State-of-the-art generative AI model used to generate detailed images conditioned on text descriptions](https://aihub.qualcomm.com/models/stable_diffusion_quantized) +# [Stable-Diffusion-v1.5: State-of-the-art generative AI model used to generate detailed images conditioned on text descriptions](https://aihub.qualcomm.com/models/stable_diffusion_v1_5_quantized) Generates high resolution images from text prompts using a latent diffusion model. This model uses CLIP ViT-L/14 as text encoder, U-Net based latent denoising, and VAE based decoder to generate the final image. 
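As a rough illustration of how the three components named in that description interact, the sketch below walks one text-to-image generation loop. It is a minimal sketch only: the `tokenizer`, `text_encoder`, `unet`, `vae_decoder`, and `scheduler` objects and their call signatures are assumptions made for exposition, not the interfaces of the precompiled components added in this diff (the real wiring lives in the demo script introduced below).

```python
# Minimal latent-diffusion sketch; every interface here is assumed for illustration
# and does not mirror the precompiled components shipped in this package.
import numpy as np

def generate_image(prompt, tokenizer, text_encoder, unet, vae_decoder, scheduler, steps=20):
    tokens = tokenizer(prompt)                      # text prompt -> token ids
    text_emb = text_encoder(tokens)                 # CLIP ViT-L/14 text embedding
    latent = np.random.randn(1, 4, 64, 64).astype(np.float32)  # start from Gaussian noise
    for t in scheduler.timesteps(steps):            # e.g. 20 denoising steps
        noise_pred = unet(latent, t, text_emb)      # U-Net predicts the noise at step t
        latent = scheduler.step(noise_pred, t, latent)  # remove a little noise
    return vae_decoder(latent)                      # decode the latent into the final image
```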
-This is based on the implementation of Stable-Diffusion found +This is based on the implementation of Stable-Diffusion-v1.5 found [here](https://github.com/CompVis/stable-diffusion/tree/main). This repository contains scripts for optimized on-device export suitable to run on Qualcomm® devices. More details on model performance -accross various devices, can be found [here](https://aihub.qualcomm.com/models/stable_diffusion_quantized). +accross various devices, can be found [here](https://aihub.qualcomm.com/models/stable_diffusion_v1_5_quantized). [Sign up](https://myaccount.qualcomm.com/signup) for early access to run these models on a hosted Qualcomm® device. + + ## Example & Usage Install the package via pip: ```bash -pip install "qai_hub_models[stable_diffusion_quantized]" +pip install "qai_hub_models[stable_diffusion_v1_5_quantized]" ``` Once installed, run the following simple CLI demo: ```bash -python -m qai_hub_models.models.stable_diffusion_quantized.demo +python -m qai_hub_models.models.stable_diffusion_v1_5_quantized.demo ``` More details on the CLI tool can be found with the `--help` option. See [demo.py](demo.py) for sample usage of the model including pre/post processing @@ -38,13 +40,13 @@ This repository contains export scripts that produce a model optimized for on-device deployment. This can be run as follows: ```bash -python -m qai_hub_models.models.stable_diffusion_quantized.export +python -m qai_hub_models.models.stable_diffusion_v1_5_quantized.export ``` Additional options are documented with the `--help` option. Note that the above script requires access to Deployment instructions for Qualcomm® AI Hub. ## License -- The license for the original implementation of Stable-Diffusion can be found +- The license for the original implementation of Stable-Diffusion-v1.5 can be found [here](https://github.com/CompVis/stable-diffusion/blob/main/LICENSE). - The license for the compiled assets for on-device deployment can be found [here]({deploy_license_url}) diff --git a/qai_hub_models/models/stable_diffusion_quantized/__init__.py b/qai_hub_models/models/stable_diffusion_v1_5_quantized/__init__.py similarity index 58% rename from qai_hub_models/models/stable_diffusion_quantized/__init__.py rename to qai_hub_models/models/stable_diffusion_v1_5_quantized/__init__.py index 7cc325fb..bb1d5cf9 100644 --- a/qai_hub_models/models/stable_diffusion_quantized/__init__.py +++ b/qai_hub_models/models/stable_diffusion_v1_5_quantized/__init__.py @@ -2,11 +2,9 @@ # Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. # SPDX-License-Identifier: BSD-3-Clause # --------------------------------------------------------------------- -from qai_hub_models.models.stable_diffusion_quantized.model import ( # noqa: F401 +from qai_hub_models.models.stable_diffusion_v1_5_quantized.model import ( # noqa: F401 MODEL_ID, ) -from qai_hub_models.models.stable_diffusion_quantized.model import ( # noqa: F401 +from qai_hub_models.models.stable_diffusion_v1_5_quantized.model import ( # noqa: F401 StableDiffusionQuantized as Model, ) - -from .app import StableDiffusionApp as App # noqa: F401 diff --git a/qai_hub_models/models/stable_diffusion_v1_5_quantized/demo.py b/qai_hub_models/models/stable_diffusion_v1_5_quantized/demo.py new file mode 100644 index 00000000..ff364004 --- /dev/null +++ b/qai_hub_models/models/stable_diffusion_v1_5_quantized/demo.py @@ -0,0 +1,53 @@ +# --------------------------------------------------------------------- +# Copyright (c) 2024 Qualcomm Innovation Center, Inc. 
All rights reserved. +# SPDX-License-Identifier: BSD-3-Clause +# --------------------------------------------------------------------- +from diffusers import DPMSolverMultistepScheduler, UNet2DConditionModel +from transformers import CLIPTokenizer + +from qai_hub_models.models._shared.stable_diffusion.demo import stable_diffusion_demo +from qai_hub_models.models.stable_diffusion_v1_5_quantized.model import ( + MODEL_ASSET_VERSION, + MODEL_ID, + ClipVITTextEncoder, + Unet, + VAEDecoder, +) + + +# Run Stable Diffuison end-to-end on a given prompt. The demo will output an +# AI-generated image based on the description in the prompt. +def main(is_test: bool = False): + tokenizer = CLIPTokenizer.from_pretrained( + "openai/clip-vit-large-patch14", subfolder="", revision="main" + ) + + scheduler = DPMSolverMultistepScheduler( + beta_start=0.00085, + beta_end=0.012, + beta_schedule="scaled_linear", + num_train_timesteps=1000, + ) + + time_embedding = UNet2DConditionModel.from_pretrained( + "runwayml/stable-diffusion-v1-5", subfolder="unet" + ).time_embedding + + text_encoder = ClipVITTextEncoder.from_precompiled() + unet = Unet.from_precompiled() + vae_decoder = VAEDecoder.from_precompiled() + stable_diffusion_demo( + model_id=MODEL_ID, + model_asset_version=MODEL_ASSET_VERSION, + text_encoder=text_encoder, + unet=unet, + vae_decoder=vae_decoder, + tokenizer=tokenizer, + scheduler=scheduler, + time_embedding=time_embedding, + is_test=is_test, + ) + + +if __name__ == "__main__": + main() diff --git a/qai_hub_models/models/stable_diffusion_v1_5_quantized/export.py b/qai_hub_models/models/stable_diffusion_v1_5_quantized/export.py new file mode 100644 index 00000000..c6394ccb --- /dev/null +++ b/qai_hub_models/models/stable_diffusion_v1_5_quantized/export.py @@ -0,0 +1,191 @@ +# --------------------------------------------------------------------- +# Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. +# SPDX-License-Identifier: BSD-3-Clause +# --------------------------------------------------------------------- +# THIS FILE WAS AUTO-GENERATED. DO NOT EDIT MANUALLY. + + +from __future__ import annotations + +import warnings +from pathlib import Path +from typing import Any, Dict, List, Mapping, Optional, Tuple, cast + +import qai_hub as hub + +from qai_hub_models.models.stable_diffusion_v1_5_quantized import Model +from qai_hub_models.utils.args import export_parser +from qai_hub_models.utils.base_model import BasePrecompiledModel, TargetRuntime +from qai_hub_models.utils.printing import print_profile_metrics_from_job +from qai_hub_models.utils.qai_hub_helpers import ( + can_access_qualcomm_ai_hub, + export_without_hub_access, +) + +ALL_COMPONENTS = ["TextEncoder_Quantized", "UNet_Quantized", "VAEDecoder_Quantized"] +DEFAULT_COMPONENTS = ["TextEncoder_Quantized", "VAEDecoder_Quantized", "UNet_Quantized"] + + +def export_model( + device: str = "Samsung Galaxy S23", + chipset: Optional[str] = None, + components: Optional[List[str]] = None, + skip_profiling: bool = False, + skip_inferencing: bool = False, + skip_summary: bool = False, + output_dir: Optional[str] = None, + profile_options: str = "", + **additional_model_kwargs, +) -> Mapping[str, Tuple[Optional[hub.ProfileJob], Optional[hub.InferenceJob]]] | List[ + str +]: + """ + This function accomplishes 5 main tasks: + + 1. Initialize model. + 2. Upload model assets to hub. + 3. Profiles the model performance on real devices. + 4. Inferences the model on sample inputs. + 5. Summarizes the results from profiling. 
+ + Each of the last three steps can be optionally skipped using the input options. + + Parameters: + device: Device for which to export the model. + Full list of available devices can be found by running `hub.get_devices()`. + Defaults to DEFAULT_DEVICE if not specified. + chipset: If set, will choose a random device with this chipset. + Overrides the `device` argument. + components: List of sub-components of the model that will be exported. + Each component is compiled and profiled separately. + Defaults to ALL_COMPONENTS if not specified. + skip_profiling: If set, skips profiling of compiled model on real devices. + skip_inferencing: If set, skips computing on-device outputs from sample data. + skip_summary: If set, skips waiting for and summarizing results + from profiling. + output_dir: Directory to store generated assets (e.g. compiled model). + Defaults to `/build/`. + profile_options: Additional options to pass when submitting the profile job. + **additional_model_kwargs: Additional optional kwargs used to customize + `model_cls.from_precompiled` + + Returns: + A Mapping from component_name to a 2-tuple of: + * A ProfileJob containing metadata about the profile job (None if profiling skipped). + * An InferenceJob containing metadata about the inference job (None if inferencing skipped). + """ + model_name = "stable_diffusion_v1_5_quantized" + output_path = Path(output_dir or Path.cwd() / "build" / model_name) + if chipset: + hub_device = hub.Device(attributes=f"chipset:{chipset}") + else: + hub_device = hub.Device(name=device) + component_arg = components + components = components or DEFAULT_COMPONENTS + for component_name in components: + if component_name not in ALL_COMPONENTS: + raise ValueError(f"Invalid component {component_name}.") + if not can_access_qualcomm_ai_hub(): + return export_without_hub_access( + "stable_diffusion_v1_5_quantized", + "Stable-Diffusion-v1.5", + device, + skip_profiling, + skip_inferencing, + False, + skip_summary, + output_path, + TargetRuntime.QNN, + "", + profile_options, + component_arg, + ) + + target_runtime = TargetRuntime.TFLITE + # 1. Initialize model + print("Initializing model class") + model = Model.from_precompiled() + components_dict: Dict[str, BasePrecompiledModel] = {} + if "TextEncoder_Quantized" in components: + components_dict["TextEncoder_Quantized"] = model.text_encoder # type: ignore + if "UNet_Quantized" in components: + components_dict["UNet_Quantized"] = model.unet # type: ignore + if "VAEDecoder_Quantized" in components: + components_dict["VAEDecoder_Quantized"] = model.vae_decoder # type: ignore + + # 2. Upload model assets to hub + print("Uploading model assets on hub") + uploaded_models = {} + for component_name in components: + uploaded_models[component_name] = hub.upload_model( + components_dict[component_name].get_target_model_path() + ) + + # 3. Profile the model assets on real devices + profile_jobs: Dict[str, hub.client.ProfileJob] = {} + if not skip_profiling: + for component_name in components: + profile_options_all = components_dict[ + component_name + ].get_hub_profile_options(target_runtime, profile_options) + print(f"Profiling model {component_name} on a hosted device.") + submitted_profile_job = hub.submit_profile_job( + model=uploaded_models[component_name], + device=hub_device, + name=f"{model_name}_{component_name}", + options=profile_options_all, + ) + profile_jobs[component_name] = cast( + hub.client.ProfileJob, submitted_profile_job + ) + + # 4. 
Run inference on-device with sample inputs + inference_jobs: Dict[str, hub.client.InferenceJob] = {} + if not skip_inferencing: + for component_name in components: + print( + f"Running inference for {component_name} on a hosted device with example inputs." + ) + profile_options_all = components_dict[ + component_name + ].get_hub_profile_options(target_runtime, profile_options) + sample_inputs = components_dict[component_name].sample_inputs() + submitted_inference_job = hub.submit_inference_job( + model=uploaded_models[component_name], + inputs=sample_inputs, + device=hub_device, + name=f"{model_name}_{component_name}", + options=profile_options_all, + ) + inference_jobs[component_name] = cast( + hub.client.InferenceJob, submitted_inference_job + ) + + # 5. Summarize the results from profiling + if not skip_summary and not skip_profiling: + for component_name in components: + profile_job = profile_jobs[component_name] + assert profile_job is not None and profile_job.wait().success + profile_data: Dict[str, Any] = profile_job.download_profile() # type: ignore + print_profile_metrics_from_job(profile_job, profile_data) + + return { + component_name: ( + profile_jobs.get(component_name, None), + inference_jobs.get(component_name, None), + ) + for component_name in components + } + + +def main(): + warnings.filterwarnings("ignore") + parser = export_parser( + model_cls=Model, components=ALL_COMPONENTS, exporting_compiled_model=True + ) + args = parser.parse_args() + export_model(**vars(args)) + + +if __name__ == "__main__": + main() diff --git a/qai_hub_models/models/stable_diffusion_quantized/info.yaml b/qai_hub_models/models/stable_diffusion_v1_5_quantized/info.yaml similarity index 91% rename from qai_hub_models/models/stable_diffusion_quantized/info.yaml rename to qai_hub_models/models/stable_diffusion_v1_5_quantized/info.yaml index ceac7d79..7bf7c3d0 100644 --- a/qai_hub_models/models/stable_diffusion_quantized/info.yaml +++ b/qai_hub_models/models/stable_diffusion_v1_5_quantized/info.yaml @@ -1,5 +1,5 @@ -name: Stable-Diffusion -id: stable_diffusion_quantized +name: Stable-Diffusion-v1.5 +id: stable_diffusion_v1_5_quantized status: public headline: State-of-the-art generative AI model used to generate detailed images conditioned on text descriptions. 
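As a usage note on the auto-generated `export.py` above: besides the CLI entry point, `export_model()` can be called directly from Python. The sketch below is hedged: the parameter names follow the signature added in this diff, and it assumes Qualcomm® AI Hub access is configured (without access the function returns a list of strings instead of a job mapping).

```python
# Sketch of driving the v1.5 export flow from Python; parameter names follow the
# export_model() signature added above, and AI Hub access is assumed.
from qai_hub_models.models.stable_diffusion_v1_5_quantized.export import export_model

jobs = export_model(
    device="Samsung Galaxy S23",
    components=["TextEncoder_Quantized", "VAEDecoder_Quantized"],
    skip_inferencing=True,  # upload + profile only, no on-device inference job
)
# With hub access, the result maps component name -> (ProfileJob or None, InferenceJob or None).
for component, (profile_job, inference_job) in jobs.items():
    print(component, profile_job, inference_job)
```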
@@ -18,7 +18,7 @@ deploy_license: https://github.com/CompVis/stable-diffusion/blob/main/LICENSE source_repo: https://github.com/CompVis/stable-diffusion/tree/main technical_details: Input: Text prompt to generate image - QNN-SDK: '2.19' + QNN-SDK: '2.20' Text Encoder Number of parameters: 340M UNet Number of parameters: 865M VAE Decoder Number of parameters: 83M @@ -28,6 +28,7 @@ applicable_scenarios: - Image Editing - Content Creation related_models: + - stable_diffusion_v2_1_quantized - controlnet_quantized form_factors: - Phone diff --git a/qai_hub_models/models/stable_diffusion_quantized/model.py b/qai_hub_models/models/stable_diffusion_v1_5_quantized/model.py similarity index 99% rename from qai_hub_models/models/stable_diffusion_quantized/model.py rename to qai_hub_models/models/stable_diffusion_v1_5_quantized/model.py index f9da4488..0325e913 100644 --- a/qai_hub_models/models/stable_diffusion_quantized/model.py +++ b/qai_hub_models/models/stable_diffusion_v1_5_quantized/model.py @@ -13,7 +13,7 @@ MODEL_ID = __name__.split(".")[-2] MODEL_ASSET_VERSION = 1 -QNN_SDK_PREFIX = "QNN219" +QNN_SDK_PREFIX = "QNN220" TEXT_ENCODER = os.path.join(QNN_SDK_PREFIX, "text_encoder.serialized.bin") UNET_DIFFUSER = os.path.join(QNN_SDK_PREFIX, "unet.serialized.bin") VAE_DECODER = os.path.join(QNN_SDK_PREFIX, "vae_decoder.serialized.bin") diff --git a/qai_hub_models/models/stable_diffusion_quantized/perf.yaml b/qai_hub_models/models/stable_diffusion_v1_5_quantized/perf.yaml similarity index 100% rename from qai_hub_models/models/stable_diffusion_quantized/perf.yaml rename to qai_hub_models/models/stable_diffusion_v1_5_quantized/perf.yaml diff --git a/qai_hub_models/models/stable_diffusion_v1_5_quantized/requirements.txt b/qai_hub_models/models/stable_diffusion_v1_5_quantized/requirements.txt new file mode 100644 index 00000000..83aa3d48 --- /dev/null +++ b/qai_hub_models/models/stable_diffusion_v1_5_quantized/requirements.txt @@ -0,0 +1,2 @@ +transformers==4.27.4 +diffusers[torch]==0.21.4 diff --git a/qai_hub_models/models/stable_diffusion_v1_5_quantized/test.py b/qai_hub_models/models/stable_diffusion_v1_5_quantized/test.py new file mode 100644 index 00000000..5cd49388 --- /dev/null +++ b/qai_hub_models/models/stable_diffusion_v1_5_quantized/test.py @@ -0,0 +1,30 @@ +# --------------------------------------------------------------------- +# Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. 
+# SPDX-License-Identifier: BSD-3-Clause +# --------------------------------------------------------------------- +import pytest + +from qai_hub_models.models._shared.stable_diffusion.test_utils import ( + export_for_component, +) +from qai_hub_models.models.stable_diffusion_v1_5_quantized.demo import main as demo_main +from qai_hub_models.models.stable_diffusion_v1_5_quantized.export import export_model +from qai_hub_models.models.stable_diffusion_v1_5_quantized.model import ( + StableDiffusionQuantized, +) + + +def test_from_precompiled(): + StableDiffusionQuantized.from_precompiled() + + +@pytest.mark.skip("#105 move slow_cloud and slow tests to nightly.") +@pytest.mark.slow_cloud +def test_export(): + export_for_component(export_model, "TextEncoder_Quantized") + + +@pytest.mark.skip("#105 move slow_cloud and slow tests to nightly.") +@pytest.mark.slow_cloud +def test_demo(): + demo_main(is_test=True) diff --git a/qai_hub_models/models/stable_diffusion_v2_1_quantized/README.md b/qai_hub_models/models/stable_diffusion_v2_1_quantized/README.md new file mode 100644 index 00000000..2c8bd7d6 --- /dev/null +++ b/qai_hub_models/models/stable_diffusion_v2_1_quantized/README.md @@ -0,0 +1,83 @@ +[![Qualcomm® AI Hub Models](https://qaihub-public-assets.s3.us-west-2.amazonaws.com/qai-hub-models/quic-logo.jpg)](../../README.md) + + +# [Stable-Diffusion-v2.1: State-of-the-art generative AI model used to generate detailed images conditioned on text descriptions](#) + +Generates high resolution images from text prompts using a latent diffusion model. This model uses CLIP ViT-L/14 as text encoder, U-Net based latent denoising, and VAE based decoder to generate the final image. + +This is based on the implementation of Stable-Diffusion-v2.1 found +[here](https://github.com/CompVis/stable-diffusion/tree/main). This repository contains scripts for optimized on-device +export suitable to run on Qualcomm® devices. More details on model performance +accross various devices, can be found [here](#). + +[Sign up](https://myaccount.qualcomm.com/signup) for early access to run these models on +a hosted Qualcomm® device. + + + + +## Example & Usage + +Install the package via pip: +```bash +pip install "qai_hub_models[stable_diffusion_v2_1_quantized]" +``` + + +Once installed, run the following simple CLI demo: + +```bash +python -m qai_hub_models.models.stable_diffusion_v2_1_quantized.demo +``` +More details on the CLI tool can be found with the `--help` option. See +[demo.py](demo.py) for sample usage of the model including pre/post processing +scripts. Please refer to our [general instructions on using +models](../../../#getting-started) for more usage instructions. + +## Export for on-device deployment + +This repository contains export scripts that produce a model optimized for +on-device deployment. This can be run as follows: + +```bash +python -m qai_hub_models.models.stable_diffusion_v2_1_quantized.export +``` +Additional options are documented with the `--help` option. Note that the above +script requires access to Deployment instructions for Qualcomm® AI Hub. + +## License +- The license for the original implementation of Stable-Diffusion-v2.1 can be found + [here](https://github.com/CompVis/stable-diffusion/blob/main/LICENSE). 
+- The license for the compiled assets for on-device deployment can be found [here]({deploy_license_url}) + +## References +* [High-Resolution Image Synthesis with Latent Diffusion Models](https://arxiv.org/abs/2112.10752) +* [Source Model Implementation](https://github.com/CompVis/stable-diffusion/tree/main) + +## Community +* Join [our AI Hub Slack community](https://qualcomm-ai-hub.slack.com/join/shared_invite/zt-2d5zsmas3-Sj0Q9TzslueCjS31eXG2UA#/shared-invite/email) to collaborate, post questions and learn more about on-device AI. +* For questions or feedback please [reach out to us](mailto:ai-hub-support@qti.qualcomm.com). + + +## Usage and Limitations + +This model may not be used for or in connection with any of the following applications: + +- Accessing essential private and public services and benefits; +- Administration of justice and democratic processes; +- Assessing or recognizing the emotional state of a person; +- Biometric and biometrics-based systems, including categorization of persons based on sensitive characteristics; +- Education and vocational training; +- Employment and workers management; +- Exploitation of the vulnerabilities of persons resulting in harmful behavior; +- General purpose social scoring; +- Law enforcement; +- Management and operation of critical infrastructure; +- Migration, asylum and border control management; +- Predictive policing; +- Real-time remote biometric identification in public spaces; +- Recommender systems of social media platforms; +- Scraping of facial images (from the internet or otherwise); and/or +- Subliminal manipulation + + diff --git a/qai_hub_models/models/stable_diffusion_v2_1_quantized/__init__.py b/qai_hub_models/models/stable_diffusion_v2_1_quantized/__init__.py new file mode 100644 index 00000000..7a6b1a25 --- /dev/null +++ b/qai_hub_models/models/stable_diffusion_v2_1_quantized/__init__.py @@ -0,0 +1,10 @@ +# --------------------------------------------------------------------- +# Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. +# SPDX-License-Identifier: BSD-3-Clause +# --------------------------------------------------------------------- +from qai_hub_models.models.stable_diffusion_v2_1_quantized.model import ( # noqa: F401 + MODEL_ID, +) +from qai_hub_models.models.stable_diffusion_v2_1_quantized.model import ( # noqa: F401 + StableDiffusionQuantized as Model, +) diff --git a/qai_hub_models/models/stable_diffusion_v2_1_quantized/demo.py b/qai_hub_models/models/stable_diffusion_v2_1_quantized/demo.py new file mode 100644 index 00000000..2ee347ec --- /dev/null +++ b/qai_hub_models/models/stable_diffusion_v2_1_quantized/demo.py @@ -0,0 +1,54 @@ +# --------------------------------------------------------------------- +# Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. +# SPDX-License-Identifier: BSD-3-Clause +# --------------------------------------------------------------------- +from diffusers import DPMSolverMultistepScheduler, UNet2DConditionModel +from transformers import CLIPTokenizer + +from qai_hub_models.models._shared.stable_diffusion.demo import stable_diffusion_demo +from qai_hub_models.models.stable_diffusion_v2_1_quantized.model import ( + MODEL_ASSET_VERSION, + MODEL_ID, + ClipVITTextEncoder, + Unet, + VAEDecoder, +) + + +# Run Stable Diffuison end-to-end on a given prompt. The demo will output an +# AI-generated image based on the description in the prompt. 
+def main(is_test: bool = False): + tokenizer = CLIPTokenizer.from_pretrained( + "stabilityai/stable-diffusion-2-1-base", subfolder="tokenizer", revision="main" + ) + + scheduler = DPMSolverMultistepScheduler( + beta_start=0.00085, + beta_end=0.012, + beta_schedule="scaled_linear", + num_train_timesteps=1000, + ) + + time_embedding = UNet2DConditionModel.from_pretrained( + "stabilityai/stable-diffusion-2-1-base", subfolder="unet", revision="main" + ).time_embedding + + text_encoder = ClipVITTextEncoder.from_precompiled() + unet = Unet.from_precompiled() + vae_decoder = VAEDecoder.from_precompiled() + stable_diffusion_demo( + model_id=MODEL_ID, + model_asset_version=MODEL_ASSET_VERSION, + text_encoder=text_encoder, + unet=unet, + vae_decoder=vae_decoder, + tokenizer=tokenizer, + scheduler=scheduler, + time_embedding=time_embedding, + channel_last_latent=False, + is_test=is_test, + ) + + +if __name__ == "__main__": + main() diff --git a/qai_hub_models/models/stable_diffusion_v2_1_quantized/export.py b/qai_hub_models/models/stable_diffusion_v2_1_quantized/export.py new file mode 100644 index 00000000..d2b0dffd --- /dev/null +++ b/qai_hub_models/models/stable_diffusion_v2_1_quantized/export.py @@ -0,0 +1,191 @@ +# --------------------------------------------------------------------- +# Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. +# SPDX-License-Identifier: BSD-3-Clause +# --------------------------------------------------------------------- +# THIS FILE WAS AUTO-GENERATED. DO NOT EDIT MANUALLY. + + +from __future__ import annotations + +import warnings +from pathlib import Path +from typing import Any, Dict, List, Mapping, Optional, Tuple, cast + +import qai_hub as hub + +from qai_hub_models.models.stable_diffusion_v2_1_quantized import Model +from qai_hub_models.utils.args import export_parser +from qai_hub_models.utils.base_model import BasePrecompiledModel, TargetRuntime +from qai_hub_models.utils.printing import print_profile_metrics_from_job +from qai_hub_models.utils.qai_hub_helpers import ( + can_access_qualcomm_ai_hub, + export_without_hub_access, +) + +ALL_COMPONENTS = ["TextEncoder_Quantized", "UNet_Quantized", "VAEDecoder_Quantized"] +DEFAULT_COMPONENTS = ["TextEncoder_Quantized", "VAEDecoder_Quantized", "UNet_Quantized"] + + +def export_model( + device: str = "Samsung Galaxy S23", + chipset: Optional[str] = None, + components: Optional[List[str]] = None, + skip_profiling: bool = False, + skip_inferencing: bool = False, + skip_summary: bool = False, + output_dir: Optional[str] = None, + profile_options: str = "", + **additional_model_kwargs, +) -> Mapping[str, Tuple[Optional[hub.ProfileJob], Optional[hub.InferenceJob]]] | List[ + str +]: + """ + This function accomplishes 5 main tasks: + + 1. Initialize model. + 2. Upload model assets to hub. + 3. Profiles the model performance on real devices. + 4. Inferences the model on sample inputs. + 5. Summarizes the results from profiling. + + Each of the last three steps can be optionally skipped using the input options. + + Parameters: + device: Device for which to export the model. + Full list of available devices can be found by running `hub.get_devices()`. + Defaults to DEFAULT_DEVICE if not specified. + chipset: If set, will choose a random device with this chipset. + Overrides the `device` argument. + components: List of sub-components of the model that will be exported. + Each component is compiled and profiled separately. + Defaults to ALL_COMPONENTS if not specified. 
+ skip_profiling: If set, skips profiling of compiled model on real devices. + skip_inferencing: If set, skips computing on-device outputs from sample data. + skip_summary: If set, skips waiting for and summarizing results + from profiling. + output_dir: Directory to store generated assets (e.g. compiled model). + Defaults to `/build/`. + profile_options: Additional options to pass when submitting the profile job. + **additional_model_kwargs: Additional optional kwargs used to customize + `model_cls.from_precompiled` + + Returns: + A Mapping from component_name to a 2-tuple of: + * A ProfileJob containing metadata about the profile job (None if profiling skipped). + * An InferenceJob containing metadata about the inference job (None if inferencing skipped). + """ + model_name = "stable_diffusion_v2_1_quantized" + output_path = Path(output_dir or Path.cwd() / "build" / model_name) + if chipset: + hub_device = hub.Device(attributes=f"chipset:{chipset}") + else: + hub_device = hub.Device(name=device) + component_arg = components + components = components or DEFAULT_COMPONENTS + for component_name in components: + if component_name not in ALL_COMPONENTS: + raise ValueError(f"Invalid component {component_name}.") + if not can_access_qualcomm_ai_hub(): + return export_without_hub_access( + "stable_diffusion_v2_1_quantized", + "Stable-Diffusion-v2.1", + device, + skip_profiling, + skip_inferencing, + False, + skip_summary, + output_path, + TargetRuntime.QNN, + "", + profile_options, + component_arg, + ) + + target_runtime = TargetRuntime.TFLITE + # 1. Initialize model + print("Initializing model class") + model = Model.from_precompiled() + components_dict: Dict[str, BasePrecompiledModel] = {} + if "TextEncoder_Quantized" in components: + components_dict["TextEncoder_Quantized"] = model.text_encoder # type: ignore + if "UNet_Quantized" in components: + components_dict["UNet_Quantized"] = model.unet # type: ignore + if "VAEDecoder_Quantized" in components: + components_dict["VAEDecoder_Quantized"] = model.vae_decoder # type: ignore + + # 2. Upload model assets to hub + print("Uploading model assets on hub") + uploaded_models = {} + for component_name in components: + uploaded_models[component_name] = hub.upload_model( + components_dict[component_name].get_target_model_path() + ) + + # 3. Profile the model assets on real devices + profile_jobs: Dict[str, hub.client.ProfileJob] = {} + if not skip_profiling: + for component_name in components: + profile_options_all = components_dict[ + component_name + ].get_hub_profile_options(target_runtime, profile_options) + print(f"Profiling model {component_name} on a hosted device.") + submitted_profile_job = hub.submit_profile_job( + model=uploaded_models[component_name], + device=hub_device, + name=f"{model_name}_{component_name}", + options=profile_options_all, + ) + profile_jobs[component_name] = cast( + hub.client.ProfileJob, submitted_profile_job + ) + + # 4. Run inference on-device with sample inputs + inference_jobs: Dict[str, hub.client.InferenceJob] = {} + if not skip_inferencing: + for component_name in components: + print( + f"Running inference for {component_name} on a hosted device with example inputs." 
+ ) + profile_options_all = components_dict[ + component_name + ].get_hub_profile_options(target_runtime, profile_options) + sample_inputs = components_dict[component_name].sample_inputs() + submitted_inference_job = hub.submit_inference_job( + model=uploaded_models[component_name], + inputs=sample_inputs, + device=hub_device, + name=f"{model_name}_{component_name}", + options=profile_options_all, + ) + inference_jobs[component_name] = cast( + hub.client.InferenceJob, submitted_inference_job + ) + + # 5. Summarize the results from profiling + if not skip_summary and not skip_profiling: + for component_name in components: + profile_job = profile_jobs[component_name] + assert profile_job is not None and profile_job.wait().success + profile_data: Dict[str, Any] = profile_job.download_profile() # type: ignore + print_profile_metrics_from_job(profile_job, profile_data) + + return { + component_name: ( + profile_jobs.get(component_name, None), + inference_jobs.get(component_name, None), + ) + for component_name in components + } + + +def main(): + warnings.filterwarnings("ignore") + parser = export_parser( + model_cls=Model, components=ALL_COMPONENTS, exporting_compiled_model=True + ) + args = parser.parse_args() + export_model(**vars(args)) + + +if __name__ == "__main__": + main() diff --git a/qai_hub_models/models/stable_diffusion_v2_1_quantized/info.yaml b/qai_hub_models/models/stable_diffusion_v2_1_quantized/info.yaml new file mode 100644 index 00000000..e298e2a5 --- /dev/null +++ b/qai_hub_models/models/stable_diffusion_v2_1_quantized/info.yaml @@ -0,0 +1,40 @@ +name: Stable-Diffusion-v2.1 +id: stable_diffusion_v2_1_quantized +status: public +headline: State-of-the-art generative AI model used to generate detailed images conditioned + on text descriptions. +domain: Generative AI +description: Generates high resolution images from text prompts using a latent diffusion + model. This model uses CLIP ViT-L/14 as text encoder, U-Net based latent denoising, + and VAE based decoder to generate the final image. +use_case: Image Generation +tags: + - generative-ai + - quantized +research_paper: https://arxiv.org/abs/2112.10752 +research_paper_title: High-Resolution Image Synthesis with Latent Diffusion Models +license: https://github.com/CompVis/stable-diffusion/blob/main/LICENSE +deploy_license: https://github.com/CompVis/stable-diffusion/blob/main/LICENSE +source_repo: https://github.com/CompVis/stable-diffusion/tree/main +technical_details: + Input: Text prompt to generate image + QNN-SDK: '2.20' + Text Encoder Number of parameters: 340M + UNet Number of parameters: 865M + VAE Decoder Number of parameters: 83M + Model size: 1GB +applicable_scenarios: + - Image Generation + - Image Editing + - Content Creation +related_models: + - stable_diffusion_v1_5_quantized + - controlnet_quantized +form_factors: + - Phone + - Tablet +has_static_banner: yes +has_animated_banner: yes +license_type: creativeml-openrail-m +deploy_license_type: creativeml-openrail-m +dataset: [] diff --git a/qai_hub_models/models/stable_diffusion_v2_1_quantized/model.py b/qai_hub_models/models/stable_diffusion_v2_1_quantized/model.py new file mode 100644 index 00000000..b1e6c86d --- /dev/null +++ b/qai_hub_models/models/stable_diffusion_v2_1_quantized/model.py @@ -0,0 +1,105 @@ +# --------------------------------------------------------------------- +# Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. 
+# SPDX-License-Identifier: BSD-3-Clause +# --------------------------------------------------------------------- +from __future__ import annotations + +from qai_hub_models.models.protocols import FromPrecompiledProtocol +from qai_hub_models.utils.asset_loaders import CachedWebModelAsset +from qai_hub_models.utils.base_model import BasePrecompiledModel, CollectionModel +from qai_hub_models.utils.input_spec import InputSpec + +MODEL_ID = __name__.split(".")[-2] +MODEL_ASSET_VERSION = 1 +TEXT_ENCODER = "text_encoder.serialized.bin" +UNET_DIFFUSER = "unet.serialized.bin" +VAE_DECODER = "vae.serialized.bin" + + +class StableDiffusionQuantized(FromPrecompiledProtocol, CollectionModel): + """ + Stable Diffusion wrapper class consists of + - Text Encoder + - UNet based diffuser + - VAE decoder + + All three models are pre-trained, quantized (int8 weight, uint16 activations) + and compiled into serialized binary for Qualcomm Snapdragon Gen2+. + """ + + def __init__(self, text_encoder, unet, vae_decoder) -> None: + self.text_encoder = text_encoder + self.unet = unet + self.vae_decoder = vae_decoder + + @classmethod + def from_precompiled(cls) -> "StableDiffusionQuantized": + return StableDiffusionQuantized( + text_encoder=ClipVITTextEncoder.from_precompiled(), + unet=Unet.from_precompiled(), + vae_decoder=VAEDecoder.from_precompiled(), + ) + + +class ClipVITTextEncoder(BasePrecompiledModel): + """ + CLIP-ViT based Text Encoder. + + Pre-trained, quantized (int8 weight, uint16 activations) + and compiled into serialized binary for Qualcomm Snapdragon Gen2+. + """ + + @classmethod + def from_precompiled(cls) -> "ClipVITTextEncoder": + text_encoder_path = CachedWebModelAsset.from_asset_store( + MODEL_ID, MODEL_ASSET_VERSION, TEXT_ENCODER + ).fetch() + return ClipVITTextEncoder(text_encoder_path) + + @staticmethod + def get_input_spec() -> InputSpec: + return {"tokens": ((1, 77), "int32")} + + +class Unet(BasePrecompiledModel): + """ + UNet model to denoise image in latent space. + + Pre-trained, quantized (int8 weight, uint16 activations) + and compiled into serialized binary for Qualcomm Snapdragon Gen2+. + """ + + @classmethod + def from_precompiled(cls) -> "Unet": + model_path = CachedWebModelAsset.from_asset_store( + MODEL_ID, MODEL_ASSET_VERSION, UNET_DIFFUSER + ).fetch() + return Unet(model_path) + + @staticmethod + def get_input_spec() -> InputSpec: + return { + "latent": ((1, 4, 64, 64), "float32"), + "time_emb": ((1, 1280), "float32"), + "text_emb": ((1, 77, 1024), "float32"), + } + + +class VAEDecoder(BasePrecompiledModel): + """ + Decodes image from latent into output generated image. + + Pre-trained, quantized (int8 weight, uint16 activations) + and compiled into serialized binary for Qualcomm Snapdragon Gen2+. 
+ """ + + @classmethod + def from_precompiled(cls) -> "VAEDecoder": + model_path = CachedWebModelAsset.from_asset_store( + MODEL_ID, MODEL_ASSET_VERSION, VAE_DECODER + ).fetch() + return VAEDecoder(model_path) + + @staticmethod + def get_input_spec() -> InputSpec: + return {"latent": ((1, 4, 64, 64), "float32")} diff --git a/qai_hub_models/models/stable_diffusion_v2_1_quantized/requirements.txt b/qai_hub_models/models/stable_diffusion_v2_1_quantized/requirements.txt new file mode 100644 index 00000000..83aa3d48 --- /dev/null +++ b/qai_hub_models/models/stable_diffusion_v2_1_quantized/requirements.txt @@ -0,0 +1,2 @@ +transformers==4.27.4 +diffusers[torch]==0.21.4 diff --git a/qai_hub_models/models/stable_diffusion_v2_1_quantized/test.py b/qai_hub_models/models/stable_diffusion_v2_1_quantized/test.py new file mode 100644 index 00000000..a19408ae --- /dev/null +++ b/qai_hub_models/models/stable_diffusion_v2_1_quantized/test.py @@ -0,0 +1,30 @@ +# --------------------------------------------------------------------- +# Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. +# SPDX-License-Identifier: BSD-3-Clause +# --------------------------------------------------------------------- +import pytest + +from qai_hub_models.models._shared.stable_diffusion.test_utils import ( + export_for_component, +) +from qai_hub_models.models.stable_diffusion_v2_1_quantized.demo import main as demo_main +from qai_hub_models.models.stable_diffusion_v2_1_quantized.export import export_model +from qai_hub_models.models.stable_diffusion_v2_1_quantized.model import ( + StableDiffusionQuantized, +) + + +def test_from_precompiled(): + StableDiffusionQuantized.from_precompiled() + + +# @pytest.mark.skip("#105 move slow_cloud and slow tests to nightly.") +@pytest.mark.slow_cloud +def test_export(): + export_for_component(export_model, "TextEncoder_Quantized") + + +# @pytest.mark.skip("#105 move slow_cloud and slow tests to nightly.") +@pytest.mark.slow_cloud +def test_demo(): + demo_main(is_test=True) diff --git a/qai_hub_models/models/stylegan2/README.md b/qai_hub_models/models/stylegan2/README.md index c3671d48..ea9e6792 100644 --- a/qai_hub_models/models/stylegan2/README.md +++ b/qai_hub_models/models/stylegan2/README.md @@ -14,6 +14,8 @@ accross various devices, can be found [here](https://aihub.qualcomm.com/models/s a hosted Qualcomm® device. 
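Stepping back to the Stable-Diffusion-v2.1 `model.py` added a little earlier in this diff: the `get_input_spec()` entries also describe the arrays a caller would feed to each precompiled component. A hedged sketch follows; the shapes and dtypes are copied from those specs, while packaging the arrays as `{name: [array]}` dictionaries is an assumption made for illustration.

```python
# Sample inputs shaped to match the Stable-Diffusion-v2.1 get_input_spec() entries above.
# The {name: [array]} packaging is an assumption, not a documented requirement.
import numpy as np

text_encoder_inputs = {"tokens": [np.zeros((1, 77), dtype=np.int32)]}
unet_inputs = {
    "latent": [np.random.randn(1, 4, 64, 64).astype(np.float32)],
    "time_emb": [np.zeros((1, 1280), dtype=np.float32)],
    "text_emb": [np.zeros((1, 77, 1024), dtype=np.float32)],
}
vae_decoder_inputs = {"latent": [np.random.randn(1, 4, 64, 64).astype(np.float32)]}
```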
+ + ## Example & Usage Install the package via pip: diff --git a/qai_hub_models/models/stylegan2/demo.py b/qai_hub_models/models/stylegan2/demo.py index dc28952f..be0c4842 100644 --- a/qai_hub_models/models/stylegan2/demo.py +++ b/qai_hub_models/models/stylegan2/demo.py @@ -41,8 +41,6 @@ def main(is_test: bool = False): help="Class[es] to use for image generation (if applicable).", ) args = parser.parse_args([] if is_test else None) - if not args.inference_options: - args.inference_options = "--compute_unit gpu" # Create model and app model = model_from_cli_args(StyleGAN2, args) diff --git a/qai_hub_models/models/stylegan2/export.py b/qai_hub_models/models/stylegan2/export.py index fc61a8ec..25cd6b7f 100644 --- a/qai_hub_models/models/stylegan2/export.py +++ b/qai_hub_models/models/stylegan2/export.py @@ -118,9 +118,16 @@ def export_model( model.eval() source_model = torch.jit.trace(model.to("cpu"), make_torch_inputs(input_spec)) + # Convert outputs from channel last to channel first (preferred I/O format for QNN and TensorFlow Lite) + channel_last_flags = ( + " --force_channel_last_output output_0" + if target_runtime != TargetRuntime.ORT + else "" + ) + # 2. Compile the model to an on-device asset model_compile_options = model.get_hub_compile_options( - target_runtime, compile_options + " --force_channel_last_output output_0" + target_runtime, compile_options + channel_last_flags, hub_device ) print(f"Optimizing model {model_name} to run on-device") submitted_compile_job = hub.submit_compile_job( @@ -183,8 +190,12 @@ def export_model( assert inference_job is not None and inference_job.wait().success inference_result: hub.client.DatasetEntries = inference_job.download_output_data() # type: ignore # Convert outputs from channel last to channel first - inference_result = transpose_channel_last_to_first( - "output_0", inference_result, target_runtime + inference_result = ( + inference_result + if target_runtime == TargetRuntime.ORT + else transpose_channel_last_to_first( + "output_0", inference_result, target_runtime + ) ) print_inference_metrics(inference_job, inference_result, torch_out) diff --git a/qai_hub_models/models/stylegan2/model.py b/qai_hub_models/models/stylegan2/model.py index 906fdde7..05ac7791 100644 --- a/qai_hub_models/models/stylegan2/model.py +++ b/qai_hub_models/models/stylegan2/model.py @@ -4,10 +4,11 @@ # --------------------------------------------------------------------- from __future__ import annotations -from typing import Any, Callable, Dict, List +from typing import Any, Callable, Dict, List, Optional import numpy as np import torch +from qai_hub.client import Device from qai_hub_models.utils.asset_loaders import SourceAsRoot from qai_hub_models.utils.base_model import BaseModel, TargetRuntime @@ -122,12 +123,20 @@ def sample_inputs( return inputs def get_hub_compile_options( - self, target_runtime: TargetRuntime, other_compile_options: str = "" + self, + target_runtime: TargetRuntime, + other_compile_options: str = "", + device: Optional[Device] = None, ) -> str: compile_options = super().get_hub_compile_options( - target_runtime, other_compile_options + target_runtime, other_compile_options, device ) - return compile_options + " --compute_unit gpu" + if ( + target_runtime == TargetRuntime.TFLITE + and "--compute_unit" not in compile_options + ): + compile_options = compile_options + " --compute_unit gpu" + return compile_options def get_hub_profile_options( self, target_runtime: TargetRuntime, other_profile_options: str = "" @@ -135,7 +144,12 @@ def 
get_hub_profile_options( profile_options = super().get_hub_profile_options( target_runtime, other_profile_options ) - return profile_options + " --compute_unit gpu" + if ( + target_runtime == TargetRuntime.TFLITE + and "--compute_unit" not in profile_options + ): + profile_options = profile_options + " --compute_unit gpu" + return profile_options def _get_qaihm_upfirdn2d_ref(misc: Any, conv2d_gradfix: Callable, upfirdn2d: Any): diff --git a/qai_hub_models/models/stylegan2/perf.yaml b/qai_hub_models/models/stylegan2/perf.yaml index ede5aaaa..123e840c 100644 --- a/qai_hub_models/models/stylegan2/perf.yaml +++ b/qai_hub_models/models/stylegan2/perf.yaml @@ -22,6 +22,7 @@ aggregated: - Samsung Galaxy S24 Ultra - Samsung Galaxy S24+ - Samsung Galaxy Tab S8 + - Snapdragon X Elite CRD - Xiaomi 12 - Xiaomi 12 Pro supported_chipsets: @@ -30,15 +31,16 @@ aggregated: - Snapdragon® 8 Gen 2 - Snapdragon® 8 Gen 3 - Snapdragon® 888 + - Snapdragon® X Elite models: - name: StyleGAN2 performance_metrics: - torchscript_onnx_tflite: - inference_time: 1317970.0 - throughput: 0.7587426117438182 + inference_time: 1588522.0 + throughput: 0.6295159903356705 estimated_peak_memory_range: - min: 1448136704 - max: 2566842336 + min: 1459597312 + max: 2294159464 primary_compute_unit: CPU precision: fp32 layer_info: @@ -46,9 +48,9 @@ models: layers_on_gpu: 78 layers_on_cpu: 402 total_layers: 480 - job_id: jegnlknk5 + job_id: jmg94lkm5 job_status: Passed - torchscript_onnx_ort: + torchscript_onnx_qnn: inference_time: 'null' throughput: 'null' estimated_peak_memory_range: @@ -61,8 +63,23 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 0 - job_id: jep20ewrg + job_id: jz57dykn5 job_status: Failed + torchscript_onnx_ort: + inference_time: 640892.0 + throughput: 1.560325296617839 + estimated_peak_memory_range: + min: 206315520 + max: 337724960 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 454 + layers_on_gpu: 0 + layers_on_cpu: 89 + total_layers: 543 + job_id: jo5mzno7p + job_status: Passed reference_device_info: name: Samsung Galaxy S23 os: '13' @@ -70,13 +87,13 @@ models: os_name: Android manufacturer: Samsung chipset: Snapdragon® 8 Gen 2 - timestamp: '2024-04-23T18:42:33.583802Z' + timestamp: '2024-05-20T16:35:31.209775Z' - torchscript_onnx_tflite: - inference_time: 1012977.0 - throughput: 0.9871892451654875 + inference_time: 1240378.0 + throughput: 0.8062058501521311 estimated_peak_memory_range: - min: 954945536 - max: 980253632 + min: 1137418240 + max: 1169458160 primary_compute_unit: CPU precision: fp32 layer_info: @@ -84,9 +101,9 @@ models: layers_on_gpu: 78 layers_on_cpu: 402 total_layers: 480 - job_id: jopr8w005 + job_id: jnp1847ng job_status: Passed - torchscript_onnx_ort: + torchscript_onnx_qnn: inference_time: 'null' throughput: 'null' estimated_peak_memory_range: @@ -99,8 +116,23 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 0 - job_id: jqpyrmx85 + job_id: jqp4wlm2g job_status: Failed + torchscript_onnx_ort: + inference_time: 508041.0 + throughput: 1.9683450745117028 + estimated_peak_memory_range: + min: 300343296 + max: 1069005056 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 454 + layers_on_gpu: 0 + layers_on_cpu: 89 + total_layers: 543 + job_id: jegne6ojg + job_status: Passed reference_device_info: name: Samsung Galaxy S24 os: '14' @@ -108,13 +140,13 @@ models: os_name: Android manufacturer: Samsung chipset: Snapdragon® 8 Gen 3 - timestamp: '2024-04-23T18:42:33.583878Z' + timestamp: '2024-05-20T16:35:31.209803Z' - 
torchscript_onnx_tflite: - inference_time: 1253049.0 - throughput: 0.7980533881755622 + inference_time: 1643379.0 + throughput: 0.6085023600764036 estimated_peak_memory_range: - min: 941391872 - max: 2204990360 + min: 1178169344 + max: 1181322952 primary_compute_unit: CPU precision: fp32 layer_info: @@ -122,8 +154,23 @@ models: layers_on_gpu: 78 layers_on_cpu: 402 total_layers: 480 - job_id: j0pxn8015 + job_id: jvgdvx86g job_status: Passed + torchscript_onnx_qnn: + inference_time: 'null' + throughput: 'null' + estimated_peak_memory_range: + min: 0 + max: 0 + primary_compute_unit: 'null' + precision: 'null' + layer_info: + layers_on_npu: 0 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 0 + job_id: j0px1k38g + job_status: Failed reference_device_info: name: QCS8550 (Proxy) os: '12' @@ -131,4 +178,42 @@ models: os_name: Android manufacturer: Qualcomm chipset: Qcs8550 - timestamp: '2024-04-23T18:42:33.583943Z' + timestamp: '2024-05-20T16:35:31.209820Z' + - torchscript_onnx_ort: + inference_time: 'null' + throughput: 'null' + estimated_peak_memory_range: + min: 0 + max: 0 + primary_compute_unit: 'null' + precision: 'null' + layer_info: + layers_on_npu: 0 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 0 + job_id: jopryvokg + job_status: Failed + torchscript_onnx_ort_dml_gpu: + inference_time: 'null' + throughput: 'null' + estimated_peak_memory_range: + min: 0 + max: 0 + primary_compute_unit: 'null' + precision: 'null' + layer_info: + layers_on_npu: 0 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 0 + job_id: jep2mk465 + job_status: Failed + reference_device_info: + name: Snapdragon X Elite CRD + os: '11' + form_factor: Compute + os_name: Windows + manufacturer: Qualcomm + chipset: Snapdragon® X Elite + timestamp: '2024-05-20T16:35:31.209838Z' diff --git a/qai_hub_models/models/stylegan2/requirements.txt b/qai_hub_models/models/stylegan2/requirements.txt index 7317e178..2f72dd5e 100644 --- a/qai_hub_models/models/stylegan2/requirements.txt +++ b/qai_hub_models/models/stylegan2/requirements.txt @@ -1 +1 @@ -click==8.0 +click==8.1.7 diff --git a/qai_hub_models/models/swin_base/README.md b/qai_hub_models/models/swin_base/README.md index 8c239d97..e1b53caa 100644 --- a/qai_hub_models/models/swin_base/README.md +++ b/qai_hub_models/models/swin_base/README.md @@ -14,6 +14,8 @@ accross various devices, can be found [here](https://aihub.qualcomm.com/models/s a hosted Qualcomm® device. + + ## Example & Usage diff --git a/qai_hub_models/models/swin_base/export.py b/qai_hub_models/models/swin_base/export.py index 5847165f..fcf1640d 100644 --- a/qai_hub_models/models/swin_base/export.py +++ b/qai_hub_models/models/swin_base/export.py @@ -119,9 +119,16 @@ def export_model( model.eval() source_model = torch.jit.trace(model.to("cpu"), make_torch_inputs(input_spec)) + # Convert outputs from channel last to channel first (preferred I/O format for QNN and TensorFlow Lite) + channel_last_flags = ( + " --force_channel_last_input image_tensor" + if target_runtime != TargetRuntime.ORT + else "" + ) + # 2. 
Compile the model to an on-device asset model_compile_options = model.get_hub_compile_options( - target_runtime, compile_options + " --force_channel_last_input image_tensor" + target_runtime, compile_options + channel_last_flags, hub_device ) print(f"Optimizing model {model_name} to run on-device") submitted_compile_job = hub.submit_compile_job( @@ -159,8 +166,12 @@ def export_model( ) sample_inputs = model.sample_inputs(input_spec) # Convert inputs from channel first to channel last - hub_inputs = transpose_channel_first_to_last( - "image_tensor", sample_inputs, target_runtime + hub_inputs = ( + sample_inputs + if target_runtime == TargetRuntime.ORT + else transpose_channel_first_to_last( + "image_tensor", sample_inputs, target_runtime + ) ) submitted_inference_job = hub.submit_inference_job( model=compile_job.get_target_model(), @@ -197,7 +208,7 @@ def export_model( def main(): warnings.filterwarnings("ignore") - parser = export_parser(model_cls=Model, supports_qnn=False, supports_ort=False) + parser = export_parser(model_cls=Model) args = parser.parse_args() export_model(**vars(args)) diff --git a/qai_hub_models/models/swin_base/perf.yaml b/qai_hub_models/models/swin_base/perf.yaml index c847057a..bedb6e0b 100644 --- a/qai_hub_models/models/swin_base/perf.yaml +++ b/qai_hub_models/models/swin_base/perf.yaml @@ -22,6 +22,7 @@ aggregated: - Samsung Galaxy S24 Ultra - Samsung Galaxy S24+ - Samsung Galaxy Tab S8 + - Snapdragon X Elite CRD - Xiaomi 12 - Xiaomi 12 Pro supported_chipsets: @@ -30,15 +31,16 @@ aggregated: - Snapdragon® 8 Gen 2 - Snapdragon® 8 Gen 3 - Snapdragon® 888 + - Snapdragon® X Elite models: - name: Swin-Base performance_metrics: - torchscript_onnx_tflite: - inference_time: 61028.0 - throughput: 16.38592121649079 + inference_time: 38211.0 + throughput: 26.170474470702153 estimated_peak_memory_range: - min: 106496 - max: 3418200 + min: 0 + max: 7586888 primary_compute_unit: NPU precision: fp16 layer_info: @@ -46,22 +48,37 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 1568 - job_id: j1p801xkg + job_id: jqpyd1q0p + job_status: Passed + torchscript_onnx_qnn: + inference_time: 31640.0 + throughput: 31.605562579013906 + estimated_peak_memory_range: + min: 40960 + max: 49217704 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 1255 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 1255 + job_id: jogkye9vp job_status: Passed torchscript_onnx_ort: - inference_time: 72900.0 - throughput: 13.717421124828531 + inference_time: 64134.0 + throughput: 15.592353509838775 estimated_peak_memory_range: - min: 118784 - max: 421108168 + min: 114688 + max: 476901736 primary_compute_unit: NPU precision: fp16 layer_info: - layers_on_npu: 1 + layers_on_npu: 1163 layers_on_gpu: 0 layers_on_cpu: 0 - total_layers: 1 - job_id: jn5qevyn5 + total_layers: 1163 + job_id: j1p3mjwmg job_status: Passed reference_device_info: name: Samsung Galaxy S23 @@ -70,13 +87,13 @@ models: os_name: Android manufacturer: Samsung chipset: Snapdragon® 8 Gen 2 - timestamp: '2024-04-23T18:42:33.601688Z' + timestamp: '2024-05-20T16:35:31.233759Z' - torchscript_onnx_tflite: - inference_time: 39474.0 - throughput: 25.333130668287986 + inference_time: 26230.0 + throughput: 38.12428516965307 estimated_peak_memory_range: - min: 73728 - max: 512044160 + min: 53248 + max: 498968400 primary_compute_unit: NPU precision: fp16 layer_info: @@ -84,22 +101,37 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 1568 - job_id: jogk784wp + job_id: j2p0rzv0p + job_status: Passed + 
torchscript_onnx_qnn: + inference_time: 21887.0 + throughput: 45.68922191255083 + estimated_peak_memory_range: + min: 0 + max: 408673168 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 1255 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 1255 + job_id: jn5q26me5 job_status: Passed torchscript_onnx_ort: - inference_time: 51726.0 - throughput: 19.332637358388432 + inference_time: 44459.0 + throughput: 22.49263366247554 estimated_peak_memory_range: - min: 651264 - max: 268896832 + min: 626688 + max: 202092528 primary_compute_unit: NPU precision: fp16 layer_info: - layers_on_npu: 1 + layers_on_npu: 1163 layers_on_gpu: 0 layers_on_cpu: 0 - total_layers: 1 - job_id: j1gl6lxjg + total_layers: 1163 + job_id: jwgov2415 job_status: Passed reference_device_info: name: Samsung Galaxy S24 @@ -108,13 +140,13 @@ models: os_name: Android manufacturer: Samsung chipset: Snapdragon® 8 Gen 3 - timestamp: '2024-04-23T18:42:33.601867Z' + timestamp: '2024-05-20T16:35:31.233787Z' - torchscript_onnx_tflite: - inference_time: 61645.0 - throughput: 16.221915808256956 + inference_time: 38283.0 + throughput: 26.121254865083717 estimated_peak_memory_range: - min: 28672 - max: 3282368 + min: 98304 + max: 3696992 primary_compute_unit: NPU precision: fp16 layer_info: @@ -122,7 +154,22 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 1568 - job_id: jw56e9o0g + job_id: j1p87q4q5 + job_status: Passed + torchscript_onnx_qnn: + inference_time: 31310.0 + throughput: 31.938677738741617 + estimated_peak_memory_range: + min: 45056 + max: 48773208 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 1255 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 1255 + job_id: jw561ydnp job_status: Passed reference_device_info: name: QCS8550 (Proxy) @@ -131,4 +178,57 @@ models: os_name: Android manufacturer: Qualcomm chipset: Qcs8550 - timestamp: '2024-04-23T18:42:33.602032Z' + timestamp: '2024-05-20T16:35:31.233806Z' + - torchscript_onnx_qnn: + inference_time: 38967.0 + throughput: 25.662740267405752 + estimated_peak_memory_range: + min: 602112 + max: 602112 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 1255 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 1255 + job_id: j1glkv12p + job_status: Passed + torchscript_onnx_ort: + inference_time: 66278.0 + throughput: 15.087962823259604 + estimated_peak_memory_range: + min: 685105152 + max: 685105152 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 1163 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 1163 + job_id: j1pvw69zg + job_status: Passed + torchscript_onnx_ort_dml_gpu: + inference_time: 'null' + throughput: 'null' + estimated_peak_memory_range: + min: 0 + max: 0 + primary_compute_unit: 'null' + precision: 'null' + layer_info: + layers_on_npu: 0 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 0 + job_id: j7gjlvw1p + job_status: Failed + reference_device_info: + name: Snapdragon X Elite CRD + os: '11' + form_factor: Compute + os_name: Windows + manufacturer: Qualcomm + chipset: Snapdragon® X Elite + timestamp: '2024-05-20T16:35:31.233833Z' diff --git a/qai_hub_models/models/swin_small/README.md b/qai_hub_models/models/swin_small/README.md index eae34fe2..01c8a31a 100644 --- a/qai_hub_models/models/swin_small/README.md +++ b/qai_hub_models/models/swin_small/README.md @@ -14,6 +14,8 @@ accross various devices, can be found [here](https://aihub.qualcomm.com/models/s a hosted Qualcomm® device. 
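The export-script updates in this diff (swin_base above, swin_small just below, and the earlier vision models) all apply the same runtime-dependent branching: the channel-last compile flag and the NCHW-to-NHWC input transpose are used only when the target is not ONNX Runtime. A condensed, standalone sketch of that pattern is shown below; the `TargetRuntime` import path matches this diff, while the `transpose_channel_first_to_last` import path is an assumption.

```python
# Condensed sketch of the runtime-dependent channel-layout handling added in these
# export scripts; the transpose helper's import path is assumed for illustration.
from qai_hub_models.utils.base_model import TargetRuntime
from qai_hub_models.utils.qai_hub_helpers import transpose_channel_first_to_last  # assumed path

def channel_last_compile_flags(target_runtime, input_name="image_tensor"):
    # ONNX Runtime keeps channel-first (NCHW) I/O, so the flag is only appended
    # for TFLite / QNN targets.
    if target_runtime == TargetRuntime.ORT:
        return ""
    return f" --force_channel_last_input {input_name}"

def prepare_hub_inputs(target_runtime, sample_inputs, input_name="image_tensor"):
    # Likewise, sample inputs are transposed to channel-last only for non-ORT runtimes.
    if target_runtime == TargetRuntime.ORT:
        return sample_inputs
    return transpose_channel_first_to_last(input_name, sample_inputs, target_runtime)
```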
+ + ## Example & Usage diff --git a/qai_hub_models/models/swin_small/export.py b/qai_hub_models/models/swin_small/export.py index f7a264ad..4215098d 100644 --- a/qai_hub_models/models/swin_small/export.py +++ b/qai_hub_models/models/swin_small/export.py @@ -119,9 +119,16 @@ def export_model( model.eval() source_model = torch.jit.trace(model.to("cpu"), make_torch_inputs(input_spec)) + # Convert outputs from channel last to channel first (preferred I/O format for QNN and TensorFlow Lite) + channel_last_flags = ( + " --force_channel_last_input image_tensor" + if target_runtime != TargetRuntime.ORT + else "" + ) + # 2. Compile the model to an on-device asset model_compile_options = model.get_hub_compile_options( - target_runtime, compile_options + " --force_channel_last_input image_tensor" + target_runtime, compile_options + channel_last_flags, hub_device ) print(f"Optimizing model {model_name} to run on-device") submitted_compile_job = hub.submit_compile_job( @@ -159,8 +166,12 @@ def export_model( ) sample_inputs = model.sample_inputs(input_spec) # Convert inputs from channel first to channel last - hub_inputs = transpose_channel_first_to_last( - "image_tensor", sample_inputs, target_runtime + hub_inputs = ( + sample_inputs + if target_runtime == TargetRuntime.ORT + else transpose_channel_first_to_last( + "image_tensor", sample_inputs, target_runtime + ) ) submitted_inference_job = hub.submit_inference_job( model=compile_job.get_target_model(), @@ -197,7 +208,7 @@ def export_model( def main(): warnings.filterwarnings("ignore") - parser = export_parser(model_cls=Model, supports_qnn=False, supports_ort=False) + parser = export_parser(model_cls=Model) args = parser.parse_args() export_model(**vars(args)) diff --git a/qai_hub_models/models/swin_small/perf.yaml b/qai_hub_models/models/swin_small/perf.yaml index 41e018e7..fc750018 100644 --- a/qai_hub_models/models/swin_small/perf.yaml +++ b/qai_hub_models/models/swin_small/perf.yaml @@ -22,6 +22,7 @@ aggregated: - Samsung Galaxy S24 Ultra - Samsung Galaxy S24+ - Samsung Galaxy Tab S8 + - Snapdragon X Elite CRD - Xiaomi 12 - Xiaomi 12 Pro supported_chipsets: @@ -30,15 +31,16 @@ aggregated: - Snapdragon® 8 Gen 2 - Snapdragon® 8 Gen 3 - Snapdragon® 888 + - Snapdragon® X Elite models: - name: Swin-Small performance_metrics: - torchscript_onnx_tflite: - inference_time: 46059.0 - throughput: 21.711283353959054 + inference_time: 29128.0 + throughput: 34.33122768470201 estimated_peak_memory_range: - min: 28672 - max: 8907776 + min: 36864 + max: 2408576 primary_compute_unit: NPU precision: fp16 layer_info: @@ -46,22 +48,37 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 1563 - job_id: j1p3v693g + job_id: jlpevdl85 + job_status: Passed + torchscript_onnx_qnn: + inference_time: 23681.0 + throughput: 42.22794645496389 + estimated_peak_memory_range: + min: 16384 + max: 45345336 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 1246 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 1246 + job_id: jmg94lxm5 job_status: Passed torchscript_onnx_ort: - inference_time: 61104.0 - throughput: 16.365540717465304 + inference_time: 56992.0 + throughput: 17.54632229084784 estimated_peak_memory_range: - min: 12288 - max: 250842792 + min: 40960 + max: 225148824 primary_compute_unit: NPU precision: fp16 layer_info: - layers_on_npu: 1 + layers_on_npu: 1158 layers_on_gpu: 0 layers_on_cpu: 0 - total_layers: 1 - job_id: j1pv07lk5 + total_layers: 1158 + job_id: jmg94lxq5 job_status: Passed reference_device_info: name: Samsung Galaxy S23 
@@ -70,13 +87,13 @@ models: os_name: Android manufacturer: Samsung chipset: Snapdragon® 8 Gen 2 - timestamp: '2024-04-23T18:42:33.619812Z' + timestamp: '2024-05-20T16:35:31.258354Z' - torchscript_onnx_tflite: - inference_time: 29579.0 - throughput: 33.80776902532202 + inference_time: 19660.0 + throughput: 50.8646998982706 estimated_peak_memory_range: - min: 45056 - max: 479603376 + min: 49152 + max: 467994720 primary_compute_unit: NPU precision: fp16 layer_info: @@ -84,22 +101,37 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 1563 - job_id: jwgok8rqp + job_id: jygz7344p + job_status: Passed + torchscript_onnx_qnn: + inference_time: 16138.0 + throughput: 61.96554715578139 + estimated_peak_memory_range: + min: 0 + max: 376584720 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 1246 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 1246 + job_id: jnp184vng job_status: Passed torchscript_onnx_ort: - inference_time: 43618.0 - throughput: 22.926314824155167 + inference_time: 39508.0 + throughput: 25.311329351017516 estimated_peak_memory_range: - min: 696320 - max: 646499600 + min: 88776704 + max: 260548080 primary_compute_unit: NPU precision: fp16 layer_info: - layers_on_npu: 1 + layers_on_npu: 1158 layers_on_gpu: 0 layers_on_cpu: 0 - total_layers: 1 - job_id: j7gjzqrv5 + total_layers: 1158 + job_id: jnp184vkg job_status: Passed reference_device_info: name: Samsung Galaxy S24 @@ -108,13 +140,13 @@ models: os_name: Android manufacturer: Samsung chipset: Snapdragon® 8 Gen 3 - timestamp: '2024-04-23T18:42:33.619995Z' + timestamp: '2024-05-20T16:35:31.258381Z' - torchscript_onnx_tflite: - inference_time: 45406.0 - throughput: 22.023521120556754 + inference_time: 29352.0 + throughput: 34.06922867266285 estimated_peak_memory_range: - min: 94208 - max: 3127248 + min: 20480 + max: 8413168 primary_compute_unit: NPU precision: fp16 layer_info: @@ -122,7 +154,22 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 1563 - job_id: jz5701nlg + job_id: jz5w9e14p + job_status: Passed + torchscript_onnx_qnn: + inference_time: 23705.0 + throughput: 42.185192997257964 + estimated_peak_memory_range: + min: 53248 + max: 45854248 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 1246 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 1246 + job_id: jz5w9e1zp job_status: Passed reference_device_info: name: QCS8550 (Proxy) @@ -131,4 +178,57 @@ models: os_name: Android manufacturer: Qualcomm chipset: Qcs8550 - timestamp: '2024-04-23T18:42:33.620167Z' + timestamp: '2024-05-20T16:35:31.258398Z' + - torchscript_onnx_qnn: + inference_time: 23881.0 + throughput: 41.87429337129936 + estimated_peak_memory_range: + min: 602112 + max: 602112 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 1246 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 1246 + job_id: jvgdvxz6g + job_status: Passed + torchscript_onnx_ort: + inference_time: 59131.0 + throughput: 16.91160305085319 + estimated_peak_memory_range: + min: 473104384 + max: 473104384 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 1158 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 1158 + job_id: jvgdvxzkg + job_status: Passed + torchscript_onnx_ort_dml_gpu: + inference_time: 449448.0 + throughput: 2.2249514960573857 + estimated_peak_memory_range: + min: 1191936 + max: 1191936 + primary_compute_unit: CPU + precision: fp32 + layer_info: + layers_on_npu: 0 + layers_on_gpu: 0 + layers_on_cpu: 1050 + total_layers: 1050 + job_id: jz57dy7q5 
+ job_status: Passed + reference_device_info: + name: Snapdragon X Elite CRD + os: '11' + form_factor: Compute + os_name: Windows + manufacturer: Qualcomm + chipset: Snapdragon® X Elite + timestamp: '2024-05-20T16:35:31.258421Z' diff --git a/qai_hub_models/models/swin_tiny/README.md b/qai_hub_models/models/swin_tiny/README.md index 25b9d845..8549a629 100644 --- a/qai_hub_models/models/swin_tiny/README.md +++ b/qai_hub_models/models/swin_tiny/README.md @@ -14,6 +14,8 @@ accross various devices, can be found [here](https://aihub.qualcomm.com/models/s a hosted Qualcomm® device. + + ## Example & Usage diff --git a/qai_hub_models/models/swin_tiny/export.py b/qai_hub_models/models/swin_tiny/export.py index ae43d850..05142b6c 100644 --- a/qai_hub_models/models/swin_tiny/export.py +++ b/qai_hub_models/models/swin_tiny/export.py @@ -119,9 +119,16 @@ def export_model( model.eval() source_model = torch.jit.trace(model.to("cpu"), make_torch_inputs(input_spec)) + # Convert outputs from channel last to channel first (preferred I/O format for QNN and TensorFlow Lite) + channel_last_flags = ( + " --force_channel_last_input image_tensor" + if target_runtime != TargetRuntime.ORT + else "" + ) + # 2. Compile the model to an on-device asset model_compile_options = model.get_hub_compile_options( - target_runtime, compile_options + " --force_channel_last_input image_tensor" + target_runtime, compile_options + channel_last_flags, hub_device ) print(f"Optimizing model {model_name} to run on-device") submitted_compile_job = hub.submit_compile_job( @@ -159,8 +166,12 @@ def export_model( ) sample_inputs = model.sample_inputs(input_spec) # Convert inputs from channel first to channel last - hub_inputs = transpose_channel_first_to_last( - "image_tensor", sample_inputs, target_runtime + hub_inputs = ( + sample_inputs + if target_runtime == TargetRuntime.ORT + else transpose_channel_first_to_last( + "image_tensor", sample_inputs, target_runtime + ) ) submitted_inference_job = hub.submit_inference_job( model=compile_job.get_target_model(), @@ -197,7 +208,7 @@ def export_model( def main(): warnings.filterwarnings("ignore") - parser = export_parser(model_cls=Model, supports_qnn=False, supports_ort=False) + parser = export_parser(model_cls=Model) args = parser.parse_args() export_model(**vars(args)) diff --git a/qai_hub_models/models/swin_tiny/perf.yaml b/qai_hub_models/models/swin_tiny/perf.yaml index cc35a05a..9a0129a2 100644 --- a/qai_hub_models/models/swin_tiny/perf.yaml +++ b/qai_hub_models/models/swin_tiny/perf.yaml @@ -22,6 +22,7 @@ aggregated: - Samsung Galaxy S24 Ultra - Samsung Galaxy S24+ - Samsung Galaxy Tab S8 + - Snapdragon X Elite CRD - Xiaomi 12 - Xiaomi 12 Pro supported_chipsets: @@ -30,15 +31,16 @@ aggregated: - Snapdragon® 8 Gen 2 - Snapdragon® 8 Gen 3 - Snapdragon® 888 + - Snapdragon® X Elite models: - name: Swin-Tiny performance_metrics: - torchscript_onnx_tflite: - inference_time: 28481.0 - throughput: 35.11112671605632 + inference_time: 17594.0 + throughput: 56.83755825849722 estimated_peak_memory_range: - min: 217088 - max: 74292680 + min: 0 + max: 2690144 primary_compute_unit: NPU precision: fp16 layer_info: @@ -46,22 +48,37 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 837 - job_id: jygzonlo5 + job_id: jqp4wl9qg + job_status: Passed + torchscript_onnx_qnn: + inference_time: 15006.0 + throughput: 66.6400106624017 + estimated_peak_memory_range: + min: 0 + max: 28760920 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 700 + layers_on_gpu: 0 + layers_on_cpu: 0 + 
total_layers: 700 + job_id: jegne6kvg job_status: Passed torchscript_onnx_ort: - inference_time: 27887.0 - throughput: 35.85900240255316 + inference_time: 34124.0 + throughput: 29.304888055327627 estimated_peak_memory_range: - min: 16384 - max: 164109776 + min: 65536 + max: 157394912 primary_compute_unit: NPU precision: fp16 layer_info: - layers_on_npu: 1 + layers_on_npu: 624 layers_on_gpu: 0 layers_on_cpu: 0 - total_layers: 1 - job_id: jmg9jdzw5 + total_layers: 624 + job_id: j2p0rz62p job_status: Passed reference_device_info: name: Samsung Galaxy S23 @@ -70,13 +87,13 @@ models: os_name: Android manufacturer: Samsung chipset: Snapdragon® 8 Gen 2 - timestamp: '2024-04-23T18:42:33.637970Z' + timestamp: '2024-05-20T16:35:31.283034Z' - torchscript_onnx_tflite: - inference_time: 18310.0 - throughput: 54.614964500273075 + inference_time: 11804.0 + throughput: 84.71704506946797 estimated_peak_memory_range: min: 40960 - max: 293649808 + max: 289709760 primary_compute_unit: NPU precision: fp16 layer_info: @@ -84,22 +101,37 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 837 - job_id: jz5w24l35 + job_id: j0px1kdjg job_status: Passed + torchscript_onnx_qnn: + inference_time: 'null' + throughput: 'null' + estimated_peak_memory_range: + min: 0 + max: 0 + primary_compute_unit: 'null' + precision: 'null' + layer_info: + layers_on_npu: 0 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 0 + job_id: jopryvwvg + job_status: Failed torchscript_onnx_ort: - inference_time: 19785.0 - throughput: 50.543340914834474 + inference_time: 23681.0 + throughput: 42.22794645496389 estimated_peak_memory_range: - min: 634880 - max: 162638432 + min: 28672 + max: 109585264 primary_compute_unit: NPU precision: fp16 layer_info: - layers_on_npu: 1 + layers_on_npu: 624 layers_on_gpu: 0 layers_on_cpu: 0 - total_layers: 1 - job_id: jnp1y6n8p + total_layers: 624 + job_id: j1p87q1z5 job_status: Passed reference_device_info: name: Samsung Galaxy S24 @@ -108,13 +140,13 @@ models: os_name: Android manufacturer: Samsung chipset: Snapdragon® 8 Gen 3 - timestamp: '2024-04-23T18:42:33.638080Z' + timestamp: '2024-05-20T16:35:31.283062Z' - torchscript_onnx_tflite: - inference_time: 28405.0 - throughput: 35.205069530012324 + inference_time: 17554.0 + throughput: 56.96707303178763 estimated_peak_memory_range: - min: 57344 - max: 3112384 + min: 28672 + max: 2913592 primary_compute_unit: NPU precision: fp16 layer_info: @@ -122,7 +154,22 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 837 - job_id: jep20qd4g + job_id: jo5mzndyp + job_status: Passed + torchscript_onnx_qnn: + inference_time: 14942.0 + throughput: 66.9254450542096 + estimated_peak_memory_range: + min: 225280 + max: 27331792 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 700 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 700 + job_id: jqpyd1mrp job_status: Passed reference_device_info: name: QCS8550 (Proxy) @@ -131,4 +178,57 @@ models: os_name: Android manufacturer: Qualcomm chipset: Qcs8550 - timestamp: '2024-04-23T18:42:33.638179Z' + timestamp: '2024-05-20T16:35:31.283079Z' + - torchscript_onnx_qnn: + inference_time: 14251.0 + throughput: 70.17051434987019 + estimated_peak_memory_range: + min: 602112 + max: 602112 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 700 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 700 + job_id: jep2mkex5 + job_status: Passed + torchscript_onnx_ort: + inference_time: 35507.0 + throughput: 28.16346072605402 + estimated_peak_memory_range: + min: 241229824 + 
max: 241229824 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 624 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 624 + job_id: jogkye8yp + job_status: Passed + torchscript_onnx_ort_dml_gpu: + inference_time: 17912.0 + throughput: 55.828494863778474 + estimated_peak_memory_range: + min: 1433600 + max: 1433600 + primary_compute_unit: CPU + precision: fp32 + layer_info: + layers_on_npu: 0 + layers_on_gpu: 0 + layers_on_cpu: 564 + total_layers: 564 + job_id: jn5q26v75 + job_status: Passed + reference_device_info: + name: Snapdragon X Elite CRD + os: '11' + form_factor: Compute + os_name: Windows + manufacturer: Qualcomm + chipset: Snapdragon® X Elite + timestamp: '2024-05-20T16:35:31.283101Z' diff --git a/qai_hub_models/models/trocr/README.md b/qai_hub_models/models/trocr/README.md index 9343aea7..8e1b963a 100644 --- a/qai_hub_models/models/trocr/README.md +++ b/qai_hub_models/models/trocr/README.md @@ -14,6 +14,8 @@ accross various devices, can be found [here](https://aihub.qualcomm.com/models/t a hosted Qualcomm® device. + + ## Example & Usage Install the package via pip: diff --git a/qai_hub_models/models/trocr/export.py b/qai_hub_models/models/trocr/export.py index b005f639..d0528db8 100644 --- a/qai_hub_models/models/trocr/export.py +++ b/qai_hub_models/models/trocr/export.py @@ -134,7 +134,7 @@ def export_model( # 2. Compile the models to an on-device asset model_compile_options = component.get_hub_compile_options( - target_runtime, compile_options + target_runtime, compile_options, hub_device ) print(f"Optimizing model {component_name} to run on-device") submitted_compile_job = hub.submit_compile_job( @@ -227,10 +227,7 @@ def export_model( def main(): warnings.filterwarnings("ignore") parser = export_parser( - model_cls=Model, - components=ALL_COMPONENTS, - supports_qnn=False, - supports_ort=False, + model_cls=Model, components=ALL_COMPONENTS, supports_qnn=False ) args = parser.parse_args() export_model(**vars(args)) diff --git a/qai_hub_models/models/trocr/perf.yaml b/qai_hub_models/models/trocr/perf.yaml index 9538686c..2bf9904d 100644 --- a/qai_hub_models/models/trocr/perf.yaml +++ b/qai_hub_models/models/trocr/perf.yaml @@ -22,6 +22,7 @@ aggregated: - Samsung Galaxy S24 Ultra - Samsung Galaxy S24+ - Samsung Galaxy Tab S8 + - Snapdragon X Elite CRD - Xiaomi 12 - Xiaomi 12 Pro supported_chipsets: @@ -30,15 +31,16 @@ aggregated: - Snapdragon® 8 Gen 2 - Snapdragon® 8 Gen 3 - Snapdragon® 888 + - Snapdragon® X Elite models: - name: TrOCREncoder performance_metrics: - torchscript_onnx_tflite: - inference_time: 216492.0 - throughput: 4.619108327328492 + inference_time: 149663.0 + throughput: 6.68167817028925 estimated_peak_memory_range: - min: 7274496 - max: 10306224 + min: 7266304 + max: 10722008 primary_compute_unit: NPU precision: fp16 layer_info: @@ -46,22 +48,37 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 592 - job_id: jz5709evg + job_id: j1glkvlep + job_status: Passed + torchscript_onnx_qnn: + inference_time: 123961.0 + throughput: 8.067053347423787 + estimated_peak_memory_range: + min: 32768 + max: 24931512 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 469 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 469 + job_id: jlpevdy75 job_status: Passed torchscript_onnx_ort: - inference_time: 189041.0 - throughput: 5.289857755724949 + inference_time: 111209.0 + throughput: 8.992077979300236 estimated_peak_memory_range: - min: 69632 - max: 125141888 + min: 143360 + max: 114159672 primary_compute_unit: NPU 
precision: fp16 layer_info: - layers_on_npu: 1 + layers_on_npu: 396 layers_on_gpu: 0 layers_on_cpu: 0 - total_layers: 1 - job_id: jegnlkzk5 + total_layers: 396 + job_id: jz57dy9q5 job_status: Passed reference_device_info: name: Samsung Galaxy S23 @@ -70,13 +87,13 @@ models: os_name: Android manufacturer: Samsung chipset: Snapdragon® 8 Gen 2 - timestamp: '2024-04-23T18:42:33.655937Z' + timestamp: '2024-05-20T16:35:31.307706Z' - torchscript_onnx_tflite: - inference_time: 162590.0 - throughput: 6.1504397564425854 + inference_time: 111478.0 + throughput: 8.970379805880981 estimated_peak_memory_range: - min: 5963776 - max: 327025904 + min: 6787072 + max: 349351296 primary_compute_unit: NPU precision: fp16 layer_info: @@ -84,22 +101,37 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 592 - job_id: j0pxnxl35 + job_id: j1p3mj6xg + job_status: Passed + torchscript_onnx_qnn: + inference_time: 92809.0 + throughput: 10.77481709747977 + estimated_peak_memory_range: + min: 1785856 + max: 169310384 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 469 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 469 + job_id: jz5w9e4zp job_status: Passed torchscript_onnx_ort: - inference_time: 143879.0 - throughput: 6.95028461415495 + inference_time: 84299.0 + throughput: 11.86253692214617 estimated_peak_memory_range: - min: 14708736 - max: 90842000 + min: 11382784 + max: 88625792 primary_compute_unit: NPU precision: fp16 layer_info: - layers_on_npu: 1 + layers_on_npu: 396 layers_on_gpu: 0 layers_on_cpu: 0 - total_layers: 1 - job_id: jep20errg + total_layers: 396 + job_id: j0px1kxjg job_status: Passed reference_device_info: name: Samsung Galaxy S24 @@ -108,13 +140,13 @@ models: os_name: Android manufacturer: Samsung chipset: Snapdragon® 8 Gen 3 - timestamp: '2024-04-23T18:42:33.656015Z' + timestamp: '2024-05-20T16:35:31.307733Z' - torchscript_onnx_tflite: - inference_time: 216411.0 - throughput: 4.620837203284491 + inference_time: 149781.0 + throughput: 6.676414231444576 estimated_peak_memory_range: min: 7274496 - max: 10398120 + max: 10723104 primary_compute_unit: NPU precision: fp16 layer_info: @@ -122,7 +154,22 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 592 - job_id: jlpee0x1p + job_id: j1pvw677g + job_status: Passed + torchscript_onnx_qnn: + inference_time: 123679.0 + throughput: 8.085447003937613 + estimated_peak_memory_range: + min: 1929216 + max: 24597888 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 469 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 469 + job_id: jnp1846kg job_status: Passed reference_device_info: name: QCS8550 (Proxy) @@ -131,15 +178,53 @@ models: os_name: Android manufacturer: Qualcomm chipset: Qcs8550 - timestamp: '2024-04-23T18:42:33.656082Z' + timestamp: '2024-05-20T16:35:31.307750Z' + - torchscript_onnx_ort: + inference_time: 111834.0 + throughput: 8.941824489868912 + estimated_peak_memory_range: + min: 34922496 + max: 34922496 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 396 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 396 + job_id: jegne6nvg + job_status: Passed + torchscript_onnx_ort_dml_gpu: + inference_time: 39277.0 + throughput: 25.46019298826285 + estimated_peak_memory_range: + min: 2703360 + max: 2703360 + primary_compute_unit: CPU + precision: fp32 + layer_info: + layers_on_npu: 0 + layers_on_gpu: 0 + layers_on_cpu: 365 + total_layers: 365 + job_id: jep2mkwx5 + job_status: Passed + reference_device_info: + name: Snapdragon X Elite CRD + os: 
'11' + form_factor: Compute + os_name: Windows + manufacturer: Qualcomm + chipset: Snapdragon® X Elite + timestamp: '2024-05-20T16:35:31.307768Z' - name: TrOCRDecoder performance_metrics: - torchscript_onnx_tflite: - inference_time: 2684.0 - throughput: 372.5782414307005 + inference_time: 2717.0 + throughput: 368.052999631947 estimated_peak_memory_range: - min: 16384 - max: 2557552 + min: 20480 + max: 2492240 primary_compute_unit: NPU precision: fp16 layer_info: @@ -147,22 +232,37 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 370 - job_id: jqp4k3y8g + job_id: jw561ywvp job_status: Passed + torchscript_onnx_qnn: + inference_time: 'null' + throughput: 'null' + estimated_peak_memory_range: + min: 0 + max: 0 + primary_compute_unit: 'null' + precision: 'null' + layer_info: + layers_on_npu: 0 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 0 + job_id: jygz73nzp + job_status: Failed torchscript_onnx_ort: - inference_time: 2944.0 - throughput: 339.67391304347825 + inference_time: 2875.0 + throughput: 347.82608695652175 estimated_peak_memory_range: - min: 28672 - max: 392358928 + min: 0 + max: 575282800 primary_compute_unit: NPU precision: fp16 layer_info: - layers_on_npu: 1 + layers_on_npu: 352 layers_on_gpu: 0 layers_on_cpu: 1 - total_layers: 2 - job_id: jopr8wl05 + total_layers: 353 + job_id: jqp4wl3qg job_status: Passed reference_device_info: name: Samsung Galaxy S23 @@ -171,13 +271,13 @@ models: os_name: Android manufacturer: Samsung chipset: Snapdragon® 8 Gen 2 - timestamp: '2024-04-23T18:42:33.656134Z' + timestamp: '2024-05-20T16:35:31.307792Z' - torchscript_onnx_tflite: - inference_time: 1948.0 - throughput: 513.347022587269 + inference_time: 1998.0 + throughput: 500.5005005005005 estimated_peak_memory_range: min: 12288 - max: 192910976 + max: 192263456 primary_compute_unit: NPU precision: fp16 layer_info: @@ -185,22 +285,37 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 370 - job_id: jo5mq80dp + job_id: jwgov2845 job_status: Passed + torchscript_onnx_qnn: + inference_time: 'null' + throughput: 'null' + estimated_peak_memory_range: + min: 0 + max: 0 + primary_compute_unit: 'null' + precision: 'null' + layer_info: + layers_on_npu: 0 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 0 + job_id: jmg94ldq5 + job_status: Failed torchscript_onnx_ort: - inference_time: 2482.0 - throughput: 402.90088638195004 + inference_time: 2139.0 + throughput: 467.50818139317437 estimated_peak_memory_range: min: 0 - max: 36159696 + max: 45855536 primary_compute_unit: NPU precision: fp16 layer_info: - layers_on_npu: 1 + layers_on_npu: 352 layers_on_gpu: 0 layers_on_cpu: 1 - total_layers: 2 - job_id: jqpyrmo85 + total_layers: 353 + job_id: jo5mzn8yp job_status: Passed reference_device_info: name: Samsung Galaxy S24 @@ -209,13 +324,13 @@ models: os_name: Android manufacturer: Samsung chipset: Snapdragon® 8 Gen 3 - timestamp: '2024-04-23T18:42:33.656185Z' + timestamp: '2024-05-20T16:35:31.307816Z' - torchscript_onnx_tflite: - inference_time: 2691.0 - throughput: 371.6090672612412 + inference_time: 2735.0 + throughput: 365.6307129798903 estimated_peak_memory_range: min: 16384 - max: 2038272 + max: 2426968 primary_compute_unit: NPU precision: fp16 layer_info: @@ -223,8 +338,23 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 370 - job_id: jygzoqyk5 + job_id: j7gjlvq7p job_status: Passed + torchscript_onnx_qnn: + inference_time: 'null' + throughput: 'null' + estimated_peak_memory_range: + min: 0 + max: 0 + primary_compute_unit: 'null' + precision: 'null' + layer_info: + 
layers_on_npu: 0 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 0 + job_id: jvgdvx2kg + job_status: Failed reference_device_info: name: QCS8550 (Proxy) os: '12' @@ -232,4 +362,42 @@ models: os_name: Android manufacturer: Qualcomm chipset: Qcs8550 - timestamp: '2024-04-23T18:42:33.656227Z' + timestamp: '2024-05-20T16:35:31.307832Z' + - torchscript_onnx_ort: + inference_time: 2647.0 + throughput: 377.7861730260673 + estimated_peak_memory_range: + min: 355991552 + max: 355991552 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 352 + layers_on_gpu: 0 + layers_on_cpu: 1 + total_layers: 353 + job_id: jopryv0vg + job_status: Passed + torchscript_onnx_ort_dml_gpu: + inference_time: 1426.0 + throughput: 701.2622720897616 + estimated_peak_memory_range: + min: 7168000 + max: 7168000 + primary_compute_unit: CPU + precision: fp32 + layer_info: + layers_on_npu: 0 + layers_on_gpu: 0 + layers_on_cpu: 300 + total_layers: 300 + job_id: jqpyd1xrp + job_status: Passed + reference_device_info: + name: Snapdragon X Elite CRD + os: '11' + form_factor: Compute + os_name: Windows + manufacturer: Qualcomm + chipset: Snapdragon® X Elite + timestamp: '2024-05-20T16:35:31.307850Z' diff --git a/qai_hub_models/models/unet_segmentation/README.md b/qai_hub_models/models/unet_segmentation/README.md index 03162771..78dfce4f 100644 --- a/qai_hub_models/models/unet_segmentation/README.md +++ b/qai_hub_models/models/unet_segmentation/README.md @@ -14,6 +14,8 @@ accross various devices, can be found [here](https://aihub.qualcomm.com/models/u a hosted Qualcomm® device. + + ## Example & Usage diff --git a/qai_hub_models/models/unet_segmentation/export.py b/qai_hub_models/models/unet_segmentation/export.py index be9c6471..2ecf01e9 100644 --- a/qai_hub_models/models/unet_segmentation/export.py +++ b/qai_hub_models/models/unet_segmentation/export.py @@ -120,12 +120,16 @@ def export_model( model.eval() source_model = torch.jit.trace(model.to("cpu"), make_torch_inputs(input_spec)) + # Convert outputs from channel last to channel first (preferred I/O format for QNN and TensorFlow Lite) + channel_last_flags = ( + " --force_channel_last_input image" + " --force_channel_last_output output_0" + if target_runtime != TargetRuntime.ORT + else "" + ) + # 2. 
Compile the model to an on-device asset model_compile_options = model.get_hub_compile_options( - target_runtime, - compile_options - + " --force_channel_last_input image" - + " --force_channel_last_output output_0", + target_runtime, compile_options + channel_last_flags, hub_device ) print(f"Optimizing model {model_name} to run on-device") submitted_compile_job = hub.submit_compile_job( @@ -163,8 +167,10 @@ def export_model( ) sample_inputs = model.sample_inputs(input_spec) # Convert inputs from channel first to channel last - hub_inputs = transpose_channel_first_to_last( - "image", sample_inputs, target_runtime + hub_inputs = ( + sample_inputs + if target_runtime == TargetRuntime.ORT + else transpose_channel_first_to_last("image", sample_inputs, target_runtime) ) submitted_inference_job = hub.submit_inference_job( model=compile_job.get_target_model(), @@ -192,8 +198,12 @@ def export_model( assert inference_job is not None and inference_job.wait().success inference_result: hub.client.DatasetEntries = inference_job.download_output_data() # type: ignore # Convert outputs from channel last to channel first - inference_result = transpose_channel_last_to_first( - "output_0", inference_result, target_runtime + inference_result = ( + inference_result + if target_runtime == TargetRuntime.ORT + else transpose_channel_last_to_first( + "output_0", inference_result, target_runtime + ) ) print_inference_metrics(inference_job, inference_result, torch_out) @@ -205,7 +215,7 @@ def export_model( def main(): warnings.filterwarnings("ignore") - parser = export_parser(model_cls=Model, supports_ort=False) + parser = export_parser(model_cls=Model) args = parser.parse_args() export_model(**vars(args)) diff --git a/qai_hub_models/models/unet_segmentation/perf.yaml b/qai_hub_models/models/unet_segmentation/perf.yaml index c45f70ad..dfcd3e11 100644 --- a/qai_hub_models/models/unet_segmentation/perf.yaml +++ b/qai_hub_models/models/unet_segmentation/perf.yaml @@ -22,6 +22,7 @@ aggregated: - Samsung Galaxy S24 Ultra - Samsung Galaxy S24+ - Samsung Galaxy Tab S8 + - Snapdragon X Elite CRD - Xiaomi 12 - Xiaomi 12 Pro supported_chipsets: @@ -30,15 +31,16 @@ aggregated: - Snapdragon® 8 Gen 2 - Snapdragon® 8 Gen 3 - Snapdragon® 888 + - Snapdragon® X Elite models: - name: Unet-Segmentation performance_metrics: - torchscript_onnx_tflite: - inference_time: 155616.0 - throughput: 6.4260744396463085 + inference_time: 161691.0 + throughput: 6.184636126933472 estimated_peak_memory_range: - min: 6692864 - max: 229373376 + min: 16384 + max: 237098920 primary_compute_unit: NPU precision: fp16 layer_info: @@ -46,14 +48,14 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 31 - job_id: jogk782wp + job_id: j2p0rzj2p job_status: Passed torchscript_onnx_qnn: - inference_time: 150609.0 - throughput: 6.63970944631463 + inference_time: 149965.0 + throughput: 6.668222585269897 estimated_peak_memory_range: - min: 9854976 - max: 34064640 + min: 9981952 + max: 30872736 primary_compute_unit: NPU precision: fp16 layer_info: @@ -61,22 +63,22 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 51 - job_id: j1gl6lyjg + job_id: jn5q26y75 job_status: Passed torchscript_onnx_ort: - inference_time: 150132.0 - throughput: 6.6608051581275145 + inference_time: 157701.0 + throughput: 6.341113880064172 estimated_peak_memory_range: - min: 13246464 - max: 147066768 + min: 13557760 + max: 158096808 primary_compute_unit: NPU precision: fp16 layer_info: - layers_on_npu: 1 + layers_on_npu: 53 layers_on_gpu: 0 layers_on_cpu: 0 - total_layers: 1 - 
job_id: j1p3v6z3g + total_layers: 53 + job_id: jwgov2r45 job_status: Passed reference_device_info: name: Samsung Galaxy S23 @@ -85,13 +87,13 @@ models: os_name: Android manufacturer: Samsung chipset: Snapdragon® 8 Gen 2 - timestamp: '2024-04-23T18:42:33.687955Z' + timestamp: '2024-05-20T16:35:31.352363Z' - torchscript_onnx_tflite: - inference_time: 112866.0 - throughput: 8.860064146864424 + inference_time: 115442.0 + throughput: 8.662358586996067 estimated_peak_memory_range: - min: 5500928 - max: 359682512 + min: 4841472 + max: 335577584 primary_compute_unit: NPU precision: fp16 layer_info: @@ -99,14 +101,14 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 31 - job_id: jn5qevln5 + job_id: j1p87qxz5 job_status: Passed torchscript_onnx_qnn: - inference_time: 111273.0 - throughput: 8.98690607784458 + inference_time: 109130.0 + throughput: 9.163383121048291 estimated_peak_memory_range: - min: 9814016 - max: 110733232 + min: 9969664 + max: 88942624 primary_compute_unit: NPU precision: fp16 layer_info: @@ -114,22 +116,22 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 51 - job_id: jw56ew86g + job_id: j1glkvxep job_status: Passed torchscript_onnx_ort: - inference_time: 110582.0 - throughput: 9.043063066321825 + inference_time: 118569.0 + throughput: 8.433907682446508 estimated_peak_memory_range: - min: 16162816 - max: 113694432 + min: 22605824 + max: 100595248 primary_compute_unit: NPU precision: fp16 layer_info: - layers_on_npu: 1 + layers_on_npu: 53 layers_on_gpu: 0 layers_on_cpu: 0 - total_layers: 1 - job_id: jwgok8lqp + total_layers: 53 + job_id: j1pvw6d7g job_status: Passed reference_device_info: name: Samsung Galaxy S24 @@ -138,13 +140,13 @@ models: os_name: Android manufacturer: Samsung chipset: Snapdragon® 8 Gen 3 - timestamp: '2024-04-23T18:42:33.687994Z' + timestamp: '2024-05-20T16:35:31.352389Z' - torchscript_onnx_tflite: - inference_time: 160844.0 - throughput: 6.2172042475939415 + inference_time: 157031.0 + throughput: 6.368169342359152 estimated_peak_memory_range: - min: 323584 - max: 237497504 + min: 6692864 + max: 464186128 primary_compute_unit: NPU precision: fp16 layer_info: @@ -152,14 +154,14 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 31 - job_id: jw56e900g + job_id: jogkye4yp job_status: Passed torchscript_onnx_qnn: - inference_time: 150008.0 - throughput: 6.666311130073063 + inference_time: 146356.0 + throughput: 6.832654622974118 estimated_peak_memory_range: - min: 9900032 - max: 34159264 + min: 9895936 + max: 31713392 primary_compute_unit: NPU precision: fp16 layer_info: @@ -167,7 +169,7 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 51 - job_id: jo5mq11wp + job_id: j1p3mj9xg job_status: Passed reference_device_info: name: QCS8550 (Proxy) @@ -176,4 +178,57 @@ models: os_name: Android manufacturer: Qualcomm chipset: Qcs8550 - timestamp: '2024-04-23T18:42:33.688021Z' + timestamp: '2024-05-20T16:35:31.352406Z' + - torchscript_onnx_qnn: + inference_time: 190735.0 + throughput: 5.24287624190631 + estimated_peak_memory_range: + min: 9850880 + max: 9850880 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 51 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 51 + job_id: jw561y7vp + job_status: Passed + torchscript_onnx_ort: + inference_time: 146581.0 + throughput: 6.82216658366364 + estimated_peak_memory_range: + min: 9854976 + max: 9854976 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 53 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 53 + job_id: j7gjlv77p + 
job_status: Passed + torchscript_onnx_ort_dml_gpu: + inference_time: 1963452.0 + throughput: 0.5093070775348723 + estimated_peak_memory_range: + min: 1940811776 + max: 1940811776 + primary_compute_unit: CPU + precision: fp32 + layer_info: + layers_on_npu: 0 + layers_on_gpu: 0 + layers_on_cpu: 31 + total_layers: 31 + job_id: jlpevdz75 + job_status: Passed + reference_device_info: + name: Snapdragon X Elite CRD + os: '11' + form_factor: Compute + os_name: Windows + manufacturer: Qualcomm + chipset: Snapdragon® X Elite + timestamp: '2024-05-20T16:35:31.352430Z' diff --git a/qai_hub_models/models/vit/README.md b/qai_hub_models/models/vit/README.md index 3a7735f0..314d20ef 100644 --- a/qai_hub_models/models/vit/README.md +++ b/qai_hub_models/models/vit/README.md @@ -14,6 +14,8 @@ accross various devices, can be found [here](https://aihub.qualcomm.com/models/v a hosted Qualcomm® device. + + ## Example & Usage diff --git a/qai_hub_models/models/vit/export.py b/qai_hub_models/models/vit/export.py index 9b96fb31..62a9bca5 100644 --- a/qai_hub_models/models/vit/export.py +++ b/qai_hub_models/models/vit/export.py @@ -121,9 +121,16 @@ def export_model( model.to("cpu"), make_torch_inputs(input_spec), check_trace=False ) + # Convert outputs from channel last to channel first (preferred I/O format for QNN and TensorFlow Lite) + channel_last_flags = ( + " --force_channel_last_input image_tensor" + if target_runtime != TargetRuntime.ORT + else "" + ) + # 2. Compile the model to an on-device asset model_compile_options = model.get_hub_compile_options( - target_runtime, compile_options + " --force_channel_last_input image_tensor" + target_runtime, compile_options + channel_last_flags, hub_device ) print(f"Optimizing model {model_name} to run on-device") submitted_compile_job = hub.submit_compile_job( @@ -161,8 +168,12 @@ def export_model( ) sample_inputs = model.sample_inputs(input_spec) # Convert inputs from channel first to channel last - hub_inputs = transpose_channel_first_to_last( - "image_tensor", sample_inputs, target_runtime + hub_inputs = ( + sample_inputs + if target_runtime == TargetRuntime.ORT + else transpose_channel_first_to_last( + "image_tensor", sample_inputs, target_runtime + ) ) submitted_inference_job = hub.submit_inference_job( model=compile_job.get_target_model(), @@ -199,7 +210,7 @@ def export_model( def main(): warnings.filterwarnings("ignore") - parser = export_parser(model_cls=Model, supports_qnn=False, supports_ort=False) + parser = export_parser(model_cls=Model, supports_qnn=False) args = parser.parse_args() export_model(**vars(args)) diff --git a/qai_hub_models/models/vit/perf.yaml b/qai_hub_models/models/vit/perf.yaml index f7c0334b..09b5ee29 100644 --- a/qai_hub_models/models/vit/perf.yaml +++ b/qai_hub_models/models/vit/perf.yaml @@ -22,6 +22,7 @@ aggregated: - Samsung Galaxy S24 Ultra - Samsung Galaxy S24+ - Samsung Galaxy Tab S8 + - Snapdragon X Elite CRD - Xiaomi 12 - Xiaomi 12 Pro supported_chipsets: @@ -30,15 +31,16 @@ aggregated: - Snapdragon® 8 Gen 2 - Snapdragon® 8 Gen 3 - Snapdragon® 888 + - Snapdragon® X Elite models: - name: VIT performance_metrics: - torchscript_onnx_tflite: - inference_time: 119744.0 - throughput: 8.351149118118654 + inference_time: 79223.0 + throughput: 12.622596973101246 estimated_peak_memory_range: - min: 196608 - max: 3447072 + min: 126976 + max: 3307040 primary_compute_unit: NPU precision: fp16 layer_info: @@ -46,22 +48,37 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 535 - job_id: j7gjzq3v5 + job_id: jygz73mzp + job_status: 
Passed + torchscript_onnx_qnn: + inference_time: 67117.0 + throughput: 14.899354857934652 + estimated_peak_memory_range: + min: 32768 + max: 42487808 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 386 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 386 + job_id: jnp184jkg job_status: Passed torchscript_onnx_ort: - inference_time: 128755.0 - throughput: 7.766688672284571 + inference_time: 104492.0 + throughput: 9.570110630478888 estimated_peak_memory_range: - min: 36864 - max: 430908512 + min: 73728 + max: 437745512 primary_compute_unit: NPU precision: fp16 layer_info: - layers_on_npu: 1 + layers_on_npu: 376 layers_on_gpu: 0 layers_on_cpu: 0 - total_layers: 1 - job_id: jygzonzo5 + total_layers: 376 + job_id: j0px1k4jg job_status: Passed reference_device_info: name: Samsung Galaxy S23 @@ -70,13 +87,13 @@ models: os_name: Android manufacturer: Samsung chipset: Snapdragon® 8 Gen 2 - timestamp: '2024-04-23T18:42:33.712224Z' + timestamp: '2024-05-20T16:35:31.382740Z' - torchscript_onnx_tflite: - inference_time: 89024.0 - throughput: 11.23292595255212 + inference_time: 56817.0 + throughput: 17.60036608761462 estimated_peak_memory_range: - min: 151552 - max: 407939792 + min: 114688 + max: 373000000 primary_compute_unit: NPU precision: fp16 layer_info: @@ -84,22 +101,37 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 535 - job_id: jlpeey6op + job_id: jz5w9e7zp + job_status: Passed + torchscript_onnx_qnn: + inference_time: 48402.0 + throughput: 20.660303293252344 + estimated_peak_memory_range: + min: 0 + max: 164302880 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 386 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 386 + job_id: jvgdvx3kg job_status: Passed torchscript_onnx_ort: - inference_time: 98667.0 - throughput: 10.135100894929408 + inference_time: 76327.0 + throughput: 13.101523707207148 estimated_peak_memory_range: - min: 663552 - max: 874006192 + min: 638976 + max: 514001424 primary_compute_unit: NPU precision: fp16 layer_info: - layers_on_npu: 1 + layers_on_npu: 376 layers_on_gpu: 0 layers_on_cpu: 0 - total_layers: 1 - job_id: jz5w24y35 + total_layers: 376 + job_id: jo5mznmyp job_status: Passed reference_device_info: name: Samsung Galaxy S24 @@ -108,13 +140,13 @@ models: os_name: Android manufacturer: Samsung chipset: Snapdragon® 8 Gen 3 - timestamp: '2024-04-23T18:42:33.712295Z' + timestamp: '2024-05-20T16:35:31.382767Z' - torchscript_onnx_tflite: - inference_time: 119402.0 - throughput: 8.375069094320029 + inference_time: 78953.0 + throughput: 12.665763175560143 estimated_peak_memory_range: - min: 135168 - max: 4419520 + min: 143360 + max: 3490600 primary_compute_unit: NPU precision: fp16 layer_info: @@ -122,7 +154,22 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 535 - job_id: jqpyrkk75 + job_id: jmg94lmq5 + job_status: Passed + torchscript_onnx_qnn: + inference_time: 67350.0 + throughput: 14.847809948032666 + estimated_peak_memory_range: + min: 12288 + max: 46277240 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 386 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 386 + job_id: jqp4wl1qg job_status: Passed reference_device_info: name: QCS8550 (Proxy) @@ -131,4 +178,57 @@ models: os_name: Android manufacturer: Qualcomm chipset: Qcs8550 - timestamp: '2024-04-23T18:42:33.712360Z' + timestamp: '2024-05-20T16:35:31.382784Z' + - torchscript_onnx_qnn: + inference_time: 65972.0 + throughput: 15.157945795185837 + estimated_peak_memory_range: + min: 602112 + max: 
602112 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 385 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 385 + job_id: jz57dy4q5 + job_status: Passed + torchscript_onnx_ort: + inference_time: 103551.0 + throughput: 9.657077189017972 + estimated_peak_memory_range: + min: 176091136 + max: 176091136 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 376 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 376 + job_id: jegne6zvg + job_status: Passed + torchscript_onnx_ort_dml_gpu: + inference_time: 'null' + throughput: 'null' + estimated_peak_memory_range: + min: 0 + max: 0 + primary_compute_unit: 'null' + precision: 'null' + layer_info: + layers_on_npu: 0 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 0 + job_id: jopryvlvg + job_status: Failed + reference_device_info: + name: Snapdragon X Elite CRD + os: '11' + form_factor: Compute + os_name: Windows + manufacturer: Qualcomm + chipset: Snapdragon® X Elite + timestamp: '2024-05-20T16:35:31.382809Z' diff --git a/qai_hub_models/models/whisper_base_en/README.md b/qai_hub_models/models/whisper_base_en/README.md index 6939e9d7..d751e49d 100644 --- a/qai_hub_models/models/whisper_base_en/README.md +++ b/qai_hub_models/models/whisper_base_en/README.md @@ -14,6 +14,8 @@ accross various devices, can be found [here](https://aihub.qualcomm.com/models/w a hosted Qualcomm® device. + + ## Example & Usage Install the package via pip: diff --git a/qai_hub_models/models/whisper_base_en/export.py b/qai_hub_models/models/whisper_base_en/export.py index 2b462de6..12f78f5b 100644 --- a/qai_hub_models/models/whisper_base_en/export.py +++ b/qai_hub_models/models/whisper_base_en/export.py @@ -134,7 +134,7 @@ def export_model( # 2. Compile the models to an on-device asset model_compile_options = component.get_hub_compile_options( - target_runtime, compile_options + target_runtime, compile_options, hub_device ) print(f"Optimizing model {component_name} to run on-device") submitted_compile_job = hub.submit_compile_job( @@ -226,12 +226,7 @@ def export_model( def main(): warnings.filterwarnings("ignore") - parser = export_parser( - model_cls=Model, - components=ALL_COMPONENTS, - supports_qnn=False, - supports_ort=False, - ) + parser = export_parser(model_cls=Model, components=ALL_COMPONENTS) args = parser.parse_args() export_model(**vars(args)) diff --git a/qai_hub_models/models/whisper_base_en/perf.yaml b/qai_hub_models/models/whisper_base_en/perf.yaml index 277067ac..c88444dd 100644 --- a/qai_hub_models/models/whisper_base_en/perf.yaml +++ b/qai_hub_models/models/whisper_base_en/perf.yaml @@ -22,6 +22,7 @@ aggregated: - Samsung Galaxy S24 Ultra - Samsung Galaxy S24+ - Samsung Galaxy Tab S8 + - Snapdragon X Elite CRD - Xiaomi 12 - Xiaomi 12 Pro supported_chipsets: @@ -30,39 +31,55 @@ aggregated: - Snapdragon® 8 Gen 2 - Snapdragon® 8 Gen 3 - Snapdragon® 888 + - Snapdragon® X Elite models: - name: WhisperEncoder performance_metrics: - torchscript_onnx_tflite: - inference_time: 154415.0 - throughput: 6.476054787423502 + inference_time: 159429.0 + throughput: 6.272384572442905 estimated_peak_memory_range: - min: 36925440 - max: 139242008 + min: 25227264 + max: 130754096 primary_compute_unit: GPU precision: fp16 layer_info: layers_on_npu: 0 - layers_on_gpu: 303 + layers_on_gpu: 419 layers_on_cpu: 0 - total_layers: 303 - job_id: jnp1y6o8p + total_layers: 419 + job_id: jep2mkrx5 + job_status: Passed + torchscript_onnx_qnn: + inference_time: 622656.0 + throughput: 1.6060232295199919 + 
estimated_peak_memory_range: + min: 12288 + max: 87059512 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 580 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 580 + job_id: j1glkvyep job_status: Passed torchscript_onnx_ort: - inference_time: 'null' - throughput: 'null' + inference_time: 394707.0 + throughput: 2.53352486781334 estimated_peak_memory_range: - min: 0 - max: 0 - primary_compute_unit: 'null' - precision: 'null' + min: 75538432 + max: 255421288 + primary_compute_unit: NPU + precision: fp16 layer_info: - layers_on_npu: 0 + layers_on_npu: 380 layers_on_gpu: 0 layers_on_cpu: 0 - total_layers: 0 - job_id: j0pxnx035 - job_status: Failed + total_layers: 380 + job_id: jz5w9elzp + job_status: Passed reference_device_info: name: Samsung Galaxy S23 os: '13' @@ -70,37 +87,52 @@ models: os_name: Android manufacturer: Samsung chipset: Snapdragon® 8 Gen 2 - timestamp: '2024-04-23T18:42:33.730149Z' + timestamp: '2024-05-20T16:35:31.407290Z' - torchscript_onnx_tflite: - inference_time: 118628.0 - throughput: 8.42971305256769 + inference_time: 122468.0 + throughput: 8.16539830812947 estimated_peak_memory_range: - min: 36814848 - max: 61467824 + min: 0 + max: 42440336 primary_compute_unit: GPU precision: fp16 layer_info: layers_on_npu: 0 - layers_on_gpu: 303 + layers_on_gpu: 419 layers_on_cpu: 0 - total_layers: 303 - job_id: jz5709ovg + total_layers: 419 + job_id: j2p0rzm2p job_status: Passed - torchscript_onnx_ort: - inference_time: 'null' - throughput: 'null' + torchscript_onnx_qnn: + inference_time: 454603.0 + throughput: 2.1997215152561687 estimated_peak_memory_range: min: 0 - max: 0 - primary_compute_unit: 'null' - precision: 'null' + max: 198547792 + primary_compute_unit: NPU + precision: fp16 layer_info: - layers_on_npu: 0 + layers_on_npu: 580 layers_on_gpu: 0 layers_on_cpu: 0 - total_layers: 0 - job_id: jegnlk1k5 - job_status: Failed + total_layers: 580 + job_id: j1p3mjzxg + job_status: Passed + torchscript_onnx_ort: + inference_time: 304852.0 + throughput: 3.280280267146025 + estimated_peak_memory_range: + min: 73445376 + max: 277367024 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 380 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 380 + job_id: jnp184nkg + job_status: Passed reference_device_info: name: Samsung Galaxy S24 os: '14' @@ -108,21 +140,36 @@ models: os_name: Android manufacturer: Samsung chipset: Snapdragon® 8 Gen 3 - timestamp: '2024-04-23T18:42:33.730201Z' + timestamp: '2024-05-20T16:35:31.407316Z' - torchscript_onnx_tflite: - inference_time: 157798.0 - throughput: 6.337215934295745 + inference_time: 157524.0 + throughput: 6.348238998501816 estimated_peak_memory_range: - min: 25370624 - max: 124671888 + min: 29507584 + max: 129166896 primary_compute_unit: GPU precision: fp16 layer_info: layers_on_npu: 0 - layers_on_gpu: 303 + layers_on_gpu: 419 layers_on_cpu: 0 - total_layers: 303 - job_id: jlpee001p + total_layers: 419 + job_id: jogkye2yp + job_status: Passed + torchscript_onnx_qnn: + inference_time: 625414.0 + throughput: 1.5989408615732938 + estimated_peak_memory_range: + min: 1048576 + max: 78119600 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 580 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 580 + job_id: jlpevd775 job_status: Passed reference_device_info: name: QCS8550 (Proxy) @@ -131,38 +178,106 @@ models: os_name: Android manufacturer: Qualcomm chipset: Qcs8550 - timestamp: '2024-04-23T18:42:33.730243Z' + timestamp: '2024-05-20T16:35:31.407333Z' + - 
torchscript_onnx_qnn: + inference_time: 459784.0 + throughput: 2.1749343169836273 + estimated_peak_memory_range: + min: 962560 + max: 962560 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 579 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 579 + job_id: j1pvw6l7g + job_status: Passed + torchscript_onnx_ort: + inference_time: 390367.0 + throughput: 2.56169194629669 + estimated_peak_memory_range: + min: 139673600 + max: 139673600 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 380 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 380 + job_id: jz57dyeq5 + job_status: Passed + torchscript_onnx_ort_dml_gpu: + inference_time: 'null' + throughput: 'null' + estimated_peak_memory_range: + min: 0 + max: 0 + primary_compute_unit: 'null' + precision: 'null' + layer_info: + layers_on_npu: 0 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 0 + job_id: j0px1kljg + job_status: Failed + reference_device_info: + name: Snapdragon X Elite CRD + os: '11' + form_factor: Compute + os_name: Windows + manufacturer: Qualcomm + chipset: Snapdragon® X Elite + timestamp: '2024-05-20T16:35:31.407358Z' - name: WhisperDecoder performance_metrics: - torchscript_onnx_tflite: - inference_time: 13793.0 - throughput: 72.50054375407815 + inference_time: 23342.0 + throughput: 42.84123040013709 estimated_peak_memory_range: - min: 5775360 - max: 8469096 + min: 5783552 + max: 8760040 primary_compute_unit: NPU precision: fp16 layer_info: - layers_on_npu: 447 + layers_on_npu: 983 layers_on_gpu: 0 - layers_on_cpu: 2 - total_layers: 449 - job_id: jvgde26r5 + layers_on_cpu: 0 + total_layers: 983 + job_id: jqpyd1orp + job_status: Passed + torchscript_onnx_qnn: + inference_time: 23335.0 + throughput: 42.854081851296336 + estimated_peak_memory_range: + min: 41029632 + max: 57664648 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 821 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 821 + job_id: jw561y8vp job_status: Passed torchscript_onnx_ort: - inference_time: 17653.0 - throughput: 56.64759530957911 + inference_time: 24574.0 + throughput: 40.6934158053227 estimated_peak_memory_range: - min: 11657216 - max: 330606792 - primary_compute_unit: CPU - precision: fp32 + min: 11902976 + max: 207621344 + primary_compute_unit: NPU + precision: fp16 layer_info: - layers_on_npu: 1 + layers_on_npu: 844 layers_on_gpu: 0 - layers_on_cpu: 2 - total_layers: 3 - job_id: jo5mq89dp + layers_on_cpu: 0 + total_layers: 844 + job_id: jmg94lzq5 job_status: Passed reference_device_info: name: Samsung Galaxy S23 @@ -171,36 +286,51 @@ models: os_name: Android manufacturer: Samsung chipset: Snapdragon® 8 Gen 2 - timestamp: '2024-04-23T18:42:33.730305Z' + timestamp: '2024-05-20T16:35:31.407380Z' - torchscript_onnx_tflite: - inference_time: 10194.0 - throughput: 98.09691975671964 + inference_time: 19155.0 + throughput: 52.205690420255806 estimated_peak_memory_range: - min: 3768320 - max: 98615936 + min: 3674112 + max: 90342624 primary_compute_unit: NPU precision: fp16 layer_info: - layers_on_npu: 447 + layers_on_npu: 983 layers_on_gpu: 0 - layers_on_cpu: 2 - total_layers: 449 - job_id: jqp4k3e8g + layers_on_cpu: 0 + total_layers: 983 + job_id: j1p87qez5 + job_status: Passed + torchscript_onnx_qnn: + inference_time: 18519.0 + throughput: 53.99859603650305 + estimated_peak_memory_range: + min: 131715072 + max: 412276656 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 821 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 821 + 
job_id: jwgov2l45 job_status: Passed torchscript_onnx_ort: - inference_time: 14072.0 - throughput: 71.0631040363843 + inference_time: 20701.0 + throughput: 48.30684507994783 estimated_peak_memory_range: - min: 52715520 - max: 167779568 - primary_compute_unit: CPU - precision: fp32 + min: 55021568 + max: 137177376 + primary_compute_unit: NPU + precision: fp16 layer_info: - layers_on_npu: 1 + layers_on_npu: 844 layers_on_gpu: 0 - layers_on_cpu: 2 - total_layers: 3 - job_id: jopr8wx05 + layers_on_cpu: 0 + total_layers: 844 + job_id: jvgdvxdkg job_status: Passed reference_device_info: name: Samsung Galaxy S24 @@ -209,21 +339,36 @@ models: os_name: Android manufacturer: Samsung chipset: Snapdragon® 8 Gen 3 - timestamp: '2024-04-23T18:42:33.730361Z' + timestamp: '2024-05-20T16:35:31.407400Z' - torchscript_onnx_tflite: - inference_time: 13928.0 - throughput: 71.79781734635267 + inference_time: 23210.0 + throughput: 43.084877208099954 + estimated_peak_memory_range: + min: 1146880 + max: 5317720 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 983 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 983 + job_id: jn5q26l75 + job_status: Passed + torchscript_onnx_qnn: + inference_time: 23685.0 + throughput: 42.22081486172683 estimated_peak_memory_range: - min: 5758976 - max: 8442936 + min: 42434560 + max: 57209568 primary_compute_unit: NPU precision: fp16 layer_info: - layers_on_npu: 447 + layers_on_npu: 821 layers_on_gpu: 0 - layers_on_cpu: 2 - total_layers: 449 - job_id: jygzoqqk5 + layers_on_cpu: 0 + total_layers: 821 + job_id: jygz73lzp job_status: Passed reference_device_info: name: QCS8550 (Proxy) @@ -232,4 +377,57 @@ models: os_name: Android manufacturer: Qualcomm chipset: Qcs8550 - timestamp: '2024-04-23T18:42:33.730409Z' + timestamp: '2024-05-20T16:35:31.407416Z' + - torchscript_onnx_qnn: + inference_time: 13480.0 + throughput: 74.1839762611276 + estimated_peak_memory_range: + min: 42463232 + max: 42463232 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 821 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 821 + job_id: j7gjlvr7p + job_status: Passed + torchscript_onnx_ort: + inference_time: 20213.0 + throughput: 49.47311136397368 + estimated_peak_memory_range: + min: 112713728 + max: 112713728 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 844 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 844 + job_id: jqp4wlyqg + job_status: Passed + torchscript_onnx_ort_dml_gpu: + inference_time: 'null' + throughput: 'null' + estimated_peak_memory_range: + min: 0 + max: 0 + primary_compute_unit: 'null' + precision: 'null' + layer_info: + layers_on_npu: 0 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 0 + job_id: jo5mzn0yp + job_status: Failed + reference_device_info: + name: Snapdragon X Elite CRD + os: '11' + form_factor: Compute + os_name: Windows + manufacturer: Qualcomm + chipset: Snapdragon® X Elite + timestamp: '2024-05-20T16:35:31.407452Z' diff --git a/qai_hub_models/models/whisper_small_en/README.md b/qai_hub_models/models/whisper_small_en/README.md index e32ddef2..f0f96498 100644 --- a/qai_hub_models/models/whisper_small_en/README.md +++ b/qai_hub_models/models/whisper_small_en/README.md @@ -14,6 +14,8 @@ accross various devices, can be found [here](https://aihub.qualcomm.com/models/w a hosted Qualcomm® device. 
+ + ## Example & Usage Install the package via pip: diff --git a/qai_hub_models/models/whisper_small_en/export.py b/qai_hub_models/models/whisper_small_en/export.py index 9c455526..82d19aa8 100644 --- a/qai_hub_models/models/whisper_small_en/export.py +++ b/qai_hub_models/models/whisper_small_en/export.py @@ -134,7 +134,7 @@ def export_model( # 2. Compile the models to an on-device asset model_compile_options = component.get_hub_compile_options( - target_runtime, compile_options + target_runtime, compile_options, hub_device ) print(f"Optimizing model {component_name} to run on-device") submitted_compile_job = hub.submit_compile_job( @@ -226,12 +226,7 @@ def export_model( def main(): warnings.filterwarnings("ignore") - parser = export_parser( - model_cls=Model, - components=ALL_COMPONENTS, - supports_qnn=False, - supports_ort=False, - ) + parser = export_parser(model_cls=Model, components=ALL_COMPONENTS) args = parser.parse_args() export_model(**vars(args)) diff --git a/qai_hub_models/models/whisper_small_en/perf.yaml b/qai_hub_models/models/whisper_small_en/perf.yaml index c98a944d..91b45ae6 100644 --- a/qai_hub_models/models/whisper_small_en/perf.yaml +++ b/qai_hub_models/models/whisper_small_en/perf.yaml @@ -22,6 +22,7 @@ aggregated: - Samsung Galaxy S24 Ultra - Samsung Galaxy S24+ - Samsung Galaxy Tab S8 + - Snapdragon X Elite CRD - Xiaomi 12 - Xiaomi 12 Pro supported_chipsets: @@ -30,24 +31,40 @@ aggregated: - Snapdragon® 8 Gen 2 - Snapdragon® 8 Gen 3 - Snapdragon® 888 + - Snapdragon® X Elite models: - name: WhisperEncoder performance_metrics: - torchscript_onnx_tflite: - inference_time: 600006.0 - throughput: 1.666650000166665 + inference_time: 615600.0 + throughput: 1.6244314489928524 estimated_peak_memory_range: - min: 79036416 - max: 532898328 + min: 12288 + max: 448683040 primary_compute_unit: GPU precision: fp16 layer_info: layers_on_npu: 0 - layers_on_gpu: 585 + layers_on_gpu: 911 layers_on_cpu: 0 - total_layers: 585 - job_id: j2p036o9p + total_layers: 911 + job_id: jegne61vg job_status: Passed + torchscript_onnx_qnn: + inference_time: 'null' + throughput: 'null' + estimated_peak_memory_range: + min: 0 + max: 0 + primary_compute_unit: 'null' + precision: 'null' + layer_info: + layers_on_npu: 0 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 0 + job_id: jogkye6yp + job_status: Failed torchscript_onnx_ort: inference_time: 'null' throughput: 'null' @@ -61,7 +78,7 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 0 - job_id: j1gl6lwjg + job_id: jlpevd675 job_status: Failed reference_device_info: name: Samsung Galaxy S23 @@ -70,37 +87,52 @@ models: os_name: Android manufacturer: Samsung chipset: Snapdragon® 8 Gen 2 - timestamp: '2024-04-23T18:42:33.762329Z' + timestamp: '2024-05-20T16:35:31.453698Z' - torchscript_onnx_tflite: - inference_time: 465622.0 - throughput: 2.1476648440151025 + inference_time: 470667.0 + throughput: 2.124644387645618 estimated_peak_memory_range: - min: 110800896 - max: 143440272 + min: 108802048 + max: 205784096 primary_compute_unit: GPU precision: fp16 layer_info: layers_on_npu: 0 - layers_on_gpu: 585 + layers_on_gpu: 911 layers_on_cpu: 0 - total_layers: 585 - job_id: jogk786wp + total_layers: 911 + job_id: jep2mkox5 job_status: Passed - torchscript_onnx_ort: - inference_time: 'null' - throughput: 'null' + torchscript_onnx_qnn: + inference_time: 1479203.0 + throughput: 0.6760397322071413 estimated_peak_memory_range: min: 0 - max: 0 - primary_compute_unit: 'null' - precision: 'null' + max: 569102256 + primary_compute_unit: NPU + precision: 
fp16 layer_info: - layers_on_npu: 0 + layers_on_npu: 1474 layers_on_gpu: 0 layers_on_cpu: 0 - total_layers: 0 - job_id: j1p3v6o3g - job_status: Failed + total_layers: 1474 + job_id: j1glkvwep + job_status: Passed + torchscript_onnx_ort: + inference_time: 1261557.0 + throughput: 0.7926712784281645 + estimated_peak_memory_range: + min: 999424 + max: 563911776 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 884 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 884 + job_id: jz5w9eyzp + job_status: Passed reference_device_info: name: Samsung Galaxy S24 os: '14' @@ -108,22 +140,37 @@ models: os_name: Android manufacturer: Samsung chipset: Snapdragon® 8 Gen 3 - timestamp: '2024-04-23T18:42:33.762404Z' + timestamp: '2024-05-20T16:35:31.453725Z' - torchscript_onnx_tflite: - inference_time: 602366.0 - throughput: 1.66012025911157 + inference_time: 612583.0 + throughput: 1.63243185005134 estimated_peak_memory_range: - min: 72904704 - max: 522853520 + min: 16384 + max: 444838416 primary_compute_unit: GPU precision: fp16 layer_info: layers_on_npu: 0 - layers_on_gpu: 585 + layers_on_gpu: 911 layers_on_cpu: 0 - total_layers: 585 - job_id: j2p038w9p + total_layers: 911 + job_id: j2p0rzo2p job_status: Passed + torchscript_onnx_qnn: + inference_time: 'null' + throughput: 'null' + estimated_peak_memory_range: + min: 0 + max: 0 + primary_compute_unit: 'null' + precision: 'null' + layer_info: + layers_on_npu: 0 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 0 + job_id: j1pvw627g + job_status: Failed reference_device_info: name: QCS8550 (Proxy) os: '12' @@ -131,38 +178,106 @@ models: os_name: Android manufacturer: Qualcomm chipset: Qcs8550 - timestamp: '2024-04-23T18:42:33.762472Z' + timestamp: '2024-05-20T16:35:31.453757Z' + - torchscript_onnx_qnn: + inference_time: 1707514.0 + throughput: 0.5856467355465313 + estimated_peak_memory_range: + min: 962560 + max: 962560 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 1473 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 1473 + job_id: j1p3mjoxg + job_status: Passed + torchscript_onnx_ort: + inference_time: 1518658.0 + throughput: 0.6584761019268328 + estimated_peak_memory_range: + min: 555753472 + max: 555753472 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 884 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 884 + job_id: jnp184okg + job_status: Passed + torchscript_onnx_ort_dml_gpu: + inference_time: 'null' + throughput: 'null' + estimated_peak_memory_range: + min: 0 + max: 0 + primary_compute_unit: 'null' + precision: 'null' + layer_info: + layers_on_npu: 0 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 0 + job_id: jz5w9eyjp + job_status: Failed + reference_device_info: + name: Snapdragon X Elite CRD + os: '11' + form_factor: Compute + os_name: Windows + manufacturer: Qualcomm + chipset: Snapdragon® X Elite + timestamp: '2024-05-20T16:35:31.453784Z' - name: WhisperDecoder performance_metrics: - torchscript_onnx_tflite: - inference_time: 45614.0 - throughput: 21.92309378699522 + inference_time: 26229.0 + throughput: 38.12573868618704 estimated_peak_memory_range: - min: 16830464 - max: 20007784 + min: 16203776 + max: 19541664 primary_compute_unit: NPU precision: fp16 layer_info: - layers_on_npu: 879 + layers_on_npu: 2573 layers_on_gpu: 0 - layers_on_cpu: 2 - total_layers: 881 - job_id: j1p801jkg + layers_on_cpu: 0 + total_layers: 2573 + job_id: jopryvxvg + job_status: Passed + torchscript_onnx_qnn: + inference_time: 24425.0 + throughput: 
40.941658137154555 + estimated_peak_memory_range: + min: 121384960 + max: 195379040 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 2255 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 2255 + job_id: jn5q26475 job_status: Passed torchscript_onnx_ort: - inference_time: 75579.0 - throughput: 13.231188557668135 + inference_time: 62618.0 + throughput: 15.969848925229167 estimated_peak_memory_range: - min: 40751104 - max: 289480944 - primary_compute_unit: CPU - precision: fp32 + min: 49823744 + max: 691829120 + primary_compute_unit: NPU + precision: fp16 layer_info: - layers_on_npu: 1 + layers_on_npu: 2302 layers_on_gpu: 0 - layers_on_cpu: 2 - total_layers: 3 - job_id: jw56ewo6g + layers_on_cpu: 0 + total_layers: 2302 + job_id: jygz73zzp job_status: Passed reference_device_info: name: Samsung Galaxy S23 @@ -171,36 +286,51 @@ models: os_name: Android manufacturer: Samsung chipset: Snapdragon® 8 Gen 2 - timestamp: '2024-04-23T18:42:33.762570Z' + timestamp: '2024-05-20T16:35:31.453805Z' - torchscript_onnx_tflite: - inference_time: 34559.0 - throughput: 28.936022454353424 + inference_time: 19526.0 + throughput: 51.21376626037079 estimated_peak_memory_range: - min: 15560704 - max: 1589538480 + min: 16277504 + max: 1152242688 primary_compute_unit: NPU precision: fp16 layer_info: - layers_on_npu: 879 + layers_on_npu: 2573 layers_on_gpu: 0 - layers_on_cpu: 2 - total_layers: 881 - job_id: jn5qev4n5 + layers_on_cpu: 0 + total_layers: 2573 + job_id: jqpyd18rp + job_status: Passed + torchscript_onnx_qnn: + inference_time: 19235.0 + throughput: 51.988562516246425 + estimated_peak_memory_range: + min: 110612480 + max: 902217440 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 2255 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 2255 + job_id: jw561yovp job_status: Passed torchscript_onnx_ort: - inference_time: 60639.0 - throughput: 16.49103712132456 + inference_time: 53225.0 + throughput: 18.788163457022076 estimated_peak_memory_range: - min: 160247808 - max: 557923088 - primary_compute_unit: CPU - precision: fp32 + min: 84680704 + max: 354730464 + primary_compute_unit: NPU + precision: fp16 layer_info: - layers_on_npu: 1 + layers_on_npu: 2302 layers_on_gpu: 0 - layers_on_cpu: 2 - total_layers: 3 - job_id: jwgok8dqp + layers_on_cpu: 0 + total_layers: 2302 + job_id: jmg94loq5 job_status: Passed reference_device_info: name: Samsung Galaxy S24 @@ -209,21 +339,36 @@ models: os_name: Android manufacturer: Samsung chipset: Snapdragon® 8 Gen 3 - timestamp: '2024-04-23T18:42:33.762666Z' + timestamp: '2024-05-20T16:35:31.453825Z' - torchscript_onnx_tflite: - inference_time: 45957.0 - throughput: 21.75947080966991 + inference_time: 27363.0 + throughput: 36.54570039834813 estimated_peak_memory_range: min: 16830464 - max: 19552208 + max: 19976992 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 2573 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 2573 + job_id: j1p87qjz5 + job_status: Passed + torchscript_onnx_qnn: + inference_time: 25042.0 + throughput: 39.93291270665282 + estimated_peak_memory_range: + min: 127197184 + max: 202463224 primary_compute_unit: NPU precision: fp16 layer_info: - layers_on_npu: 879 + layers_on_npu: 2255 layers_on_gpu: 0 - layers_on_cpu: 2 - total_layers: 881 - job_id: j1p80dnkg + layers_on_cpu: 0 + total_layers: 2255 + job_id: j7gjlv37p job_status: Passed reference_device_info: name: QCS8550 (Proxy) @@ -232,4 +377,57 @@ models: os_name: Android manufacturer: Qualcomm chipset: Qcs8550 - 
timestamp: '2024-04-23T18:42:33.762757Z' + timestamp: '2024-05-20T16:35:31.453845Z' + - torchscript_onnx_qnn: + inference_time: 20874.0 + throughput: 47.906486538277285 + estimated_peak_memory_range: + min: 127381504 + max: 127381504 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 2255 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 2255 + job_id: jwgov2d45 + job_status: Passed + torchscript_onnx_ort: + inference_time: 54047.0 + throughput: 18.502414565100747 + estimated_peak_memory_range: + min: 347856896 + max: 347856896 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 2302 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 2302 + job_id: jvgdvx6kg + job_status: Passed + torchscript_onnx_ort_dml_gpu: + inference_time: 'null' + throughput: 'null' + estimated_peak_memory_range: + min: 0 + max: 0 + primary_compute_unit: 'null' + precision: 'null' + layer_info: + layers_on_npu: 0 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 0 + job_id: jmg94lov5 + job_status: Failed + reference_device_info: + name: Snapdragon X Elite CRD + os: '11' + form_factor: Compute + os_name: Windows + manufacturer: Qualcomm + chipset: Snapdragon® X Elite + timestamp: '2024-05-20T16:35:31.453870Z' diff --git a/qai_hub_models/models/whisper_tiny_en/README.md b/qai_hub_models/models/whisper_tiny_en/README.md index 00d0e87f..e541696e 100644 --- a/qai_hub_models/models/whisper_tiny_en/README.md +++ b/qai_hub_models/models/whisper_tiny_en/README.md @@ -14,6 +14,8 @@ accross various devices, can be found [here](https://aihub.qualcomm.com/models/w a hosted Qualcomm® device. + + ## Example & Usage Install the package via pip: diff --git a/qai_hub_models/models/whisper_tiny_en/export.py b/qai_hub_models/models/whisper_tiny_en/export.py index 038202f9..58af8d0b 100644 --- a/qai_hub_models/models/whisper_tiny_en/export.py +++ b/qai_hub_models/models/whisper_tiny_en/export.py @@ -134,7 +134,7 @@ def export_model( # 2. 
Compile the models to an on-device asset model_compile_options = component.get_hub_compile_options( - target_runtime, compile_options + target_runtime, compile_options, hub_device ) print(f"Optimizing model {component_name} to run on-device") submitted_compile_job = hub.submit_compile_job( @@ -226,12 +226,7 @@ def export_model( def main(): warnings.filterwarnings("ignore") - parser = export_parser( - model_cls=Model, - components=ALL_COMPONENTS, - supports_qnn=False, - supports_ort=False, - ) + parser = export_parser(model_cls=Model, components=ALL_COMPONENTS) args = parser.parse_args() export_model(**vars(args)) diff --git a/qai_hub_models/models/whisper_tiny_en/perf.yaml b/qai_hub_models/models/whisper_tiny_en/perf.yaml index a16bafaf..724a32a5 100644 --- a/qai_hub_models/models/whisper_tiny_en/perf.yaml +++ b/qai_hub_models/models/whisper_tiny_en/perf.yaml @@ -22,6 +22,7 @@ aggregated: - Samsung Galaxy S24 Ultra - Samsung Galaxy S24+ - Samsung Galaxy Tab S8 + - Snapdragon X Elite CRD - Xiaomi 12 - Xiaomi 12 Pro supported_chipsets: @@ -30,23 +31,39 @@ aggregated: - Snapdragon® 8 Gen 2 - Snapdragon® 8 Gen 3 - Snapdragon® 888 + - Snapdragon® X Elite models: - name: WhisperEncoder performance_metrics: - torchscript_onnx_tflite: - inference_time: 67351.0 - throughput: 14.847589493845675 + inference_time: 68887.0 + throughput: 14.516527066064715 estimated_peak_memory_range: - min: 16117760 - max: 104999648 + min: 11296768 + max: 56646392 primary_compute_unit: GPU precision: fp16 layer_info: layers_on_npu: 0 - layers_on_gpu: 209 + layers_on_gpu: 271 layers_on_cpu: 0 - total_layers: 209 - job_id: jlpeeyxop + total_layers: 271 + job_id: jnp184olg + job_status: Passed + torchscript_onnx_qnn: + inference_time: 288969.0 + throughput: 3.4605788164128333 + estimated_peak_memory_range: + min: 159744 + max: 54792792 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 338 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 338 + job_id: jegne6qmg job_status: Passed torchscript_onnx_ort: inference_time: 'null' @@ -61,7 +78,7 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 0 - job_id: jnp1y618p + job_id: j1glkv8lp job_status: Failed reference_device_info: name: Samsung Galaxy S23 @@ -70,21 +87,36 @@ models: os_name: Android manufacturer: Samsung chipset: Snapdragon® 8 Gen 2 - timestamp: '2024-04-23T18:42:33.794639Z' + timestamp: '2024-05-20T16:35:31.499723Z' - torchscript_onnx_tflite: - inference_time: 52682.0 - throughput: 18.981815420826848 + inference_time: 54355.0 + throughput: 18.397571520559286 estimated_peak_memory_range: min: 0 - max: 28255008 + max: 32722000 primary_compute_unit: GPU precision: fp16 layer_info: layers_on_npu: 0 - layers_on_gpu: 209 + layers_on_gpu: 271 layers_on_cpu: 0 - total_layers: 209 - job_id: jz5w24z35 + total_layers: 271 + job_id: jz57dynr5 + job_status: Passed + torchscript_onnx_qnn: + inference_time: 218798.0 + throughput: 4.570425689448715 + estimated_peak_memory_range: + min: 999424 + max: 138033888 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 338 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 338 + job_id: jep2mkdm5 job_status: Passed torchscript_onnx_ort: inference_time: 'null' @@ -99,7 +131,7 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 0 - job_id: jz5709nvg + job_id: j1p3mj7zg job_status: Failed reference_device_info: name: Samsung Galaxy S24 @@ -108,21 +140,36 @@ models: os_name: Android manufacturer: Samsung chipset: Snapdragon® 8 Gen 3 - timestamp: 
'2024-04-23T18:42:33.794681Z' + timestamp: '2024-05-20T16:35:31.499751Z' - torchscript_onnx_tflite: - inference_time: 67311.0 - throughput: 14.856412770572417 + inference_time: 68575.0 + throughput: 14.582573824279985 estimated_peak_memory_range: - min: 17125376 - max: 63332656 + min: 12288 + max: 95017352 primary_compute_unit: GPU precision: fp16 layer_info: layers_on_npu: 0 - layers_on_gpu: 209 + layers_on_gpu: 271 + layers_on_cpu: 0 + total_layers: 271 + job_id: j0px1kr9g + job_status: Passed + torchscript_onnx_qnn: + inference_time: 293385.0 + throughput: 3.40849054995995 + estimated_peak_memory_range: + min: 978944 + max: 49011728 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 338 + layers_on_gpu: 0 layers_on_cpu: 0 - total_layers: 209 - job_id: jygzoq1o5 + total_layers: 338 + job_id: jogkye0op job_status: Passed reference_device_info: name: QCS8550 (Proxy) @@ -131,38 +178,106 @@ models: os_name: Android manufacturer: Qualcomm chipset: Qcs8550 - timestamp: '2024-04-23T18:42:33.794711Z' + timestamp: '2024-05-20T16:35:31.499770Z' + - torchscript_onnx_qnn: + inference_time: 240121.0 + throughput: 4.164567030788644 + estimated_peak_memory_range: + min: 962560 + max: 962560 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 337 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 337 + job_id: j2p0rz9ep + job_status: Passed + torchscript_onnx_ort: + inference_time: 'null' + throughput: 'null' + estimated_peak_memory_range: + min: 0 + max: 0 + primary_compute_unit: 'null' + precision: 'null' + layer_info: + layers_on_npu: 0 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 0 + job_id: j1pvw6mmg + job_status: Failed + torchscript_onnx_ort_dml_gpu: + inference_time: 'null' + throughput: 'null' + estimated_peak_memory_range: + min: 0 + max: 0 + primary_compute_unit: 'null' + precision: 'null' + layer_info: + layers_on_npu: 0 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 0 + job_id: jlpevdx05 + job_status: Failed + reference_device_info: + name: Snapdragon X Elite CRD + os: '11' + form_factor: Compute + os_name: Windows + manufacturer: Qualcomm + chipset: Snapdragon® X Elite + timestamp: '2024-05-20T16:35:31.499797Z' - name: WhisperDecoder performance_metrics: - torchscript_onnx_tflite: - inference_time: 7115.0 - throughput: 140.54813773717498 + inference_time: 3871.0 + throughput: 258.3311805734952 estimated_peak_memory_range: min: 2977792 - max: 5417544 + max: 5435904 primary_compute_unit: NPU precision: fp16 layer_info: - layers_on_npu: 303 + layers_on_npu: 557 layers_on_gpu: 0 - layers_on_cpu: 2 - total_layers: 305 - job_id: jygzonyo5 + layers_on_cpu: 0 + total_layers: 557 + job_id: jvgdvx6lg + job_status: Passed + torchscript_onnx_qnn: + inference_time: 3646.0 + throughput: 274.27317608337904 + estimated_peak_memory_range: + min: 9920512 + max: 47146336 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 447 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 447 + job_id: jopryvdeg job_status: Passed torchscript_onnx_ort: - inference_time: 8714.0 - throughput: 114.75786091347257 + inference_time: 5287.0 + throughput: 189.14318138831095 estimated_peak_memory_range: - min: 6172672 - max: 212702328 - primary_compute_unit: CPU - precision: fp32 + min: 6336512 + max: 214447104 + primary_compute_unit: NPU + precision: fp16 layer_info: - layers_on_npu: 1 + layers_on_npu: 462 layers_on_gpu: 0 - layers_on_cpu: 2 - total_layers: 3 - job_id: jvgde24r5 + layers_on_cpu: 0 + total_layers: 462 + job_id: 
jw561ym7p job_status: Passed reference_device_info: name: Samsung Galaxy S23 @@ -171,36 +286,51 @@ models: os_name: Android manufacturer: Samsung chipset: Snapdragon® 8 Gen 2 - timestamp: '2024-04-23T18:42:33.794757Z' + timestamp: '2024-05-20T16:35:31.499823Z' - torchscript_onnx_tflite: - inference_time: 5479.0 - throughput: 182.5150574922431 + inference_time: 3044.0 + throughput: 328.515111695138 estimated_peak_memory_range: - min: 2871296 - max: 232253952 + min: 36864 + max: 223105088 primary_compute_unit: NPU precision: fp16 layer_info: - layers_on_npu: 303 + layers_on_npu: 557 layers_on_gpu: 0 - layers_on_cpu: 2 - total_layers: 305 - job_id: jmg9jd2w5 + layers_on_cpu: 0 + total_layers: 557 + job_id: jqp4wl4lg + job_status: Passed + torchscript_onnx_qnn: + inference_time: 2767.0 + throughput: 361.4022406938923 + estimated_peak_memory_range: + min: 9170944 + max: 143104560 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 447 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 447 + job_id: jqpyd124p job_status: Passed torchscript_onnx_ort: - inference_time: 6141.0 - throughput: 162.83992835043153 + inference_time: 4230.0 + throughput: 236.4066193853428 estimated_peak_memory_range: - min: 24158208 - max: 103238656 - primary_compute_unit: CPU - precision: fp32 + min: 27504640 + max: 86953184 + primary_compute_unit: NPU + precision: fp16 layer_info: - layers_on_npu: 1 + layers_on_npu: 462 layers_on_gpu: 0 - layers_on_cpu: 2 - total_layers: 3 - job_id: jqp4k348g + layers_on_cpu: 0 + total_layers: 462 + job_id: jwgov2wd5 job_status: Passed reference_device_info: name: Samsung Galaxy S24 @@ -209,21 +339,36 @@ models: os_name: Android manufacturer: Samsung chipset: Snapdragon® 8 Gen 3 - timestamp: '2024-04-23T18:42:33.794805Z' + timestamp: '2024-05-20T16:35:31.499844Z' - torchscript_onnx_tflite: - inference_time: 7148.0 - throughput: 139.89927252378288 + inference_time: 3892.0 + throughput: 256.9373072970195 estimated_peak_memory_range: min: 2977792 - max: 5388280 + max: 7226936 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 557 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 557 + job_id: jo5mznkqp + job_status: Passed + torchscript_onnx_qnn: + inference_time: 3696.0 + throughput: 270.56277056277054 + estimated_peak_memory_range: + min: 11145216 + max: 48599472 primary_compute_unit: NPU precision: fp16 layer_info: - layers_on_npu: 303 + layers_on_npu: 447 layers_on_gpu: 0 - layers_on_cpu: 2 - total_layers: 305 - job_id: jz5w20j35 + layers_on_cpu: 0 + total_layers: 447 + job_id: jn5q261m5 job_status: Passed reference_device_info: name: QCS8550 (Proxy) @@ -232,4 +377,57 @@ models: os_name: Android manufacturer: Qualcomm chipset: Qcs8550 - timestamp: '2024-04-23T18:42:33.794844Z' + timestamp: '2024-05-20T16:35:31.499860Z' + - torchscript_onnx_qnn: + inference_time: 3823.0 + throughput: 261.5746795710175 + estimated_peak_memory_range: + min: 21233664 + max: 21233664 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 447 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 447 + job_id: j1p87qr85 + job_status: Passed + torchscript_onnx_ort: + inference_time: 4460.0 + throughput: 224.2152466367713 + estimated_peak_memory_range: + min: 21245952 + max: 21245952 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 462 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 462 + job_id: j7gjlvy8p + job_status: Passed + torchscript_onnx_ort_dml_gpu: + inference_time: 'null' + throughput: 
'null' + estimated_peak_memory_range: + min: 0 + max: 0 + primary_compute_unit: 'null' + precision: 'null' + layer_info: + layers_on_npu: 0 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 0 + job_id: jygz73y6p + job_status: Failed + reference_device_info: + name: Snapdragon X Elite CRD + os: '11' + form_factor: Compute + os_name: Windows + manufacturer: Qualcomm + chipset: Snapdragon® X Elite + timestamp: '2024-05-20T16:35:31.499881Z' diff --git a/qai_hub_models/models/wideresnet50/README.md b/qai_hub_models/models/wideresnet50/README.md index fe17a2b9..1fd5bb18 100644 --- a/qai_hub_models/models/wideresnet50/README.md +++ b/qai_hub_models/models/wideresnet50/README.md @@ -14,6 +14,8 @@ accross various devices, can be found [here](https://aihub.qualcomm.com/models/w a hosted Qualcomm® device. + + ## Example & Usage diff --git a/qai_hub_models/models/wideresnet50/export.py b/qai_hub_models/models/wideresnet50/export.py index 4b8f0722..0fa960e8 100644 --- a/qai_hub_models/models/wideresnet50/export.py +++ b/qai_hub_models/models/wideresnet50/export.py @@ -119,9 +119,16 @@ def export_model( model.eval() source_model = torch.jit.trace(model.to("cpu"), make_torch_inputs(input_spec)) + # Convert outputs from channel last to channel first (preferred I/O format for QNN and TensorFlow Lite) + channel_last_flags = ( + " --force_channel_last_input image_tensor" + if target_runtime != TargetRuntime.ORT + else "" + ) + # 2. Compile the model to an on-device asset model_compile_options = model.get_hub_compile_options( - target_runtime, compile_options + " --force_channel_last_input image_tensor" + target_runtime, compile_options + channel_last_flags, hub_device ) print(f"Optimizing model {model_name} to run on-device") submitted_compile_job = hub.submit_compile_job( @@ -159,8 +166,12 @@ def export_model( ) sample_inputs = model.sample_inputs(input_spec) # Convert inputs from channel first to channel last - hub_inputs = transpose_channel_first_to_last( - "image_tensor", sample_inputs, target_runtime + hub_inputs = ( + sample_inputs + if target_runtime == TargetRuntime.ORT + else transpose_channel_first_to_last( + "image_tensor", sample_inputs, target_runtime + ) ) submitted_inference_job = hub.submit_inference_job( model=compile_job.get_target_model(), @@ -197,7 +208,7 @@ def export_model( def main(): warnings.filterwarnings("ignore") - parser = export_parser(model_cls=Model, supports_ort=False) + parser = export_parser(model_cls=Model) args = parser.parse_args() export_model(**vars(args)) diff --git a/qai_hub_models/models/wideresnet50/perf.yaml b/qai_hub_models/models/wideresnet50/perf.yaml index e403f8b2..bc894e3e 100644 --- a/qai_hub_models/models/wideresnet50/perf.yaml +++ b/qai_hub_models/models/wideresnet50/perf.yaml @@ -22,6 +22,7 @@ aggregated: - Samsung Galaxy S24 Ultra - Samsung Galaxy S24+ - Samsung Galaxy Tab S8 + - Snapdragon X Elite CRD - Xiaomi 12 - Xiaomi 12 Pro supported_chipsets: @@ -30,15 +31,16 @@ aggregated: - Snapdragon® 8 Gen 2 - Snapdragon® 8 Gen 3 - Snapdragon® 888 + - Snapdragon® X Elite models: - name: WideResNet50 performance_metrics: - torchscript_onnx_tflite: - inference_time: 4900.0 - throughput: 204.08163265306123 + inference_time: 4874.0 + throughput: 205.1702913418137 estimated_peak_memory_range: - min: 49152 - max: 2616288 + min: 20480 + max: 2339968 primary_compute_unit: NPU precision: fp16 layer_info: @@ -46,14 +48,14 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 79 - job_id: jegnlkqk5 + job_id: jz5w9ezjp job_status: Passed torchscript_onnx_qnn: - 
inference_time: 5767.0 - throughput: 173.40038148083926 + inference_time: 5693.0 + throughput: 175.65431231336729 estimated_peak_memory_range: - min: 618496 - max: 261398592 + min: 643072 + max: 344558120 primary_compute_unit: NPU precision: fp16 layer_info: @@ -61,22 +63,22 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 126 - job_id: jep20edrg + job_id: jvgdvx4lg job_status: Passed torchscript_onnx_ort: - inference_time: 5427.0 - throughput: 184.26386585590566 + inference_time: 5517.0 + throughput: 181.257930034439 estimated_peak_memory_range: - min: 36864 - max: 457326944 + min: 24576 + max: 414560576 primary_compute_unit: NPU precision: fp16 layer_info: - layers_on_npu: 1 + layers_on_npu: 128 layers_on_gpu: 0 layers_on_cpu: 0 - total_layers: 1 - job_id: j2p03699p + total_layers: 128 + job_id: jo5mznlqp job_status: Passed reference_device_info: name: Samsung Galaxy S23 @@ -85,13 +87,13 @@ models: os_name: Android manufacturer: Samsung chipset: Snapdragon® 8 Gen 2 - timestamp: '2024-04-23T18:42:33.826943Z' + timestamp: '2024-05-20T16:35:31.545698Z' - torchscript_onnx_tflite: - inference_time: 3655.0 - throughput: 273.59781121751024 + inference_time: 3649.0 + throughput: 274.0476842970677 estimated_peak_memory_range: - min: 16384 - max: 97733152 + min: 12288 + max: 97464480 primary_compute_unit: NPU precision: fp16 layer_info: @@ -99,14 +101,14 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 79 - job_id: jopr8wd05 + job_id: jmg94l2v5 job_status: Passed torchscript_onnx_qnn: - inference_time: 4245.0 - throughput: 235.57126030624264 + inference_time: 4302.0 + throughput: 232.4500232450023 estimated_peak_memory_range: - min: 618496 - max: 53403616 + min: 270987264 + max: 325564848 primary_compute_unit: NPU precision: fp16 layer_info: @@ -114,22 +116,22 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 126 - job_id: jqpyrm285 + job_id: jz57dy8r5 job_status: Passed torchscript_onnx_ort: - inference_time: 4122.0 - throughput: 242.600679281902 + inference_time: 4156.0 + throughput: 240.61597690086623 estimated_peak_memory_range: min: 618496 - max: 39529440 + max: 36255216 primary_compute_unit: NPU precision: fp16 layer_info: - layers_on_npu: 1 + layers_on_npu: 128 layers_on_gpu: 0 layers_on_cpu: 0 - total_layers: 1 - job_id: j1p801rkg + total_layers: 128 + job_id: jegne6wmg job_status: Passed reference_device_info: name: Samsung Galaxy S24 @@ -138,13 +140,13 @@ models: os_name: Android manufacturer: Samsung chipset: Snapdragon® 8 Gen 3 - timestamp: '2024-04-23T18:42:33.826991Z' + timestamp: '2024-05-20T16:35:31.545724Z' - torchscript_onnx_tflite: - inference_time: 4907.0 - throughput: 203.79050336254332 + inference_time: 4864.0 + throughput: 205.5921052631579 estimated_peak_memory_range: - min: 28672 - max: 2415760 + min: 24576 + max: 2245440 primary_compute_unit: NPU precision: fp16 layer_info: @@ -152,14 +154,14 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 79 - job_id: j2p038n9p + job_id: jnp1841lg job_status: Passed torchscript_onnx_qnn: - inference_time: 5790.0 - throughput: 172.71157167530225 + inference_time: 5687.0 + throughput: 175.83963425356075 estimated_peak_memory_range: - min: 622592 - max: 209332032 + min: 647168 + max: 355205232 primary_compute_unit: NPU precision: fp16 layer_info: @@ -167,7 +169,7 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 126 - job_id: jw56e9k6g + job_id: j0px1kz9g job_status: Passed reference_device_info: name: QCS8550 (Proxy) @@ -176,4 +178,57 @@ models: os_name: Android manufacturer: Qualcomm 
chipset: Qcs8550 - timestamp: '2024-04-23T18:42:33.827030Z' + timestamp: '2024-05-20T16:35:31.545745Z' + - torchscript_onnx_qnn: + inference_time: 5857.0 + throughput: 170.73587160662456 + estimated_peak_memory_range: + min: 602112 + max: 602112 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 126 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 126 + job_id: jqp4wl2lg + job_status: Passed + torchscript_onnx_ort: + inference_time: 5137.0 + throughput: 194.66614755693985 + estimated_peak_memory_range: + min: 46718976 + max: 46718976 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 128 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 128 + job_id: jopryv7eg + job_status: Passed + torchscript_onnx_ort_dml_gpu: + inference_time: 27924.0 + throughput: 35.8114883254548 + estimated_peak_memory_range: + min: 36831232 + max: 36831232 + primary_compute_unit: GPU + precision: fp16 + layer_info: + layers_on_npu: 0 + layers_on_gpu: 1 + layers_on_cpu: 0 + total_layers: 1 + job_id: jep2mkzm5 + job_status: Passed + reference_device_info: + name: Snapdragon X Elite CRD + os: '11' + form_factor: Compute + os_name: Windows + manufacturer: Qualcomm + chipset: Snapdragon® X Elite + timestamp: '2024-05-20T16:35:31.545770Z' diff --git a/qai_hub_models/models/wideresnet50_quantized/README.md b/qai_hub_models/models/wideresnet50_quantized/README.md index ed33868e..cb6dc1eb 100644 --- a/qai_hub_models/models/wideresnet50_quantized/README.md +++ b/qai_hub_models/models/wideresnet50_quantized/README.md @@ -14,6 +14,8 @@ accross various devices, can be found [here](https://aihub.qualcomm.com/models/w a hosted Qualcomm® device. + + ## Example & Usage diff --git a/qai_hub_models/models/wideresnet50_quantized/export.py b/qai_hub_models/models/wideresnet50_quantized/export.py index 83cb894f..fd3f6e92 100644 --- a/qai_hub_models/models/wideresnet50_quantized/export.py +++ b/qai_hub_models/models/wideresnet50_quantized/export.py @@ -123,9 +123,16 @@ def export_model( else: quant_calibration_data = model.get_calibration_data(target_runtime, input_spec) + # Convert outputs from channel last to channel first (preferred I/O format for QNN and TensorFlow Lite) + channel_last_flags = ( + " --force_channel_last_input image_tensor" + if target_runtime != TargetRuntime.ORT + else "" + ) + # 2. 
Compile the model to an on-device asset model_compile_options = model.get_hub_compile_options( - target_runtime, compile_options + " --force_channel_last_input image_tensor" + target_runtime, compile_options + channel_last_flags, hub_device ) print(f"Optimizing model {model_name} to run on-device") submitted_compile_job = hub.submit_compile_job( @@ -167,8 +174,12 @@ def export_model( if target_runtime == TargetRuntime.QNN: hub_inputs = get_qnn_inputs(compile_job, sample_inputs) # Convert inputs from channel first to channel last - hub_inputs = transpose_channel_first_to_last( - "image_tensor", sample_inputs, target_runtime + hub_inputs = ( + sample_inputs + if target_runtime == TargetRuntime.ORT + else transpose_channel_first_to_last( + "image_tensor", sample_inputs, target_runtime + ) ) submitted_inference_job = hub.submit_inference_job( model=compile_job.get_target_model(), @@ -205,7 +216,7 @@ def export_model( def main(): warnings.filterwarnings("ignore") - parser = export_parser(model_cls=Model, supports_ort=False) + parser = export_parser(model_cls=Model) args = parser.parse_args() export_model(**vars(args)) diff --git a/qai_hub_models/models/wideresnet50_quantized/perf.yaml b/qai_hub_models/models/wideresnet50_quantized/perf.yaml index d234b9ec..fdc9206b 100644 --- a/qai_hub_models/models/wideresnet50_quantized/perf.yaml +++ b/qai_hub_models/models/wideresnet50_quantized/perf.yaml @@ -26,6 +26,7 @@ aggregated: - Samsung Galaxy S24 Ultra - Samsung Galaxy S24+ - Samsung Galaxy Tab S8 + - Snapdragon X Elite CRD - Xiaomi 12 - Xiaomi 12 Pro supported_chipsets: @@ -36,15 +37,16 @@ aggregated: - Snapdragon® 8 Gen 2 - Snapdragon® 8 Gen 3 - Snapdragon® 888 + - Snapdragon® X Elite models: - name: WideResNet50-Quantized performance_metrics: - torchscript_onnx_tflite: - inference_time: 1807.0 - throughput: 553.4034311012729 + inference_time: 1821.0 + throughput: 549.1488193300385 estimated_peak_memory_range: - min: 49152 - max: 2181928 + min: 24576 + max: 2584464 primary_compute_unit: NPU precision: int8 layer_info: @@ -52,14 +54,14 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 80 - job_id: jn5qev1n5 + job_id: jqpyd1y4p job_status: Passed torchscript_onnx_qnn: - inference_time: 2119.0 - throughput: 471.92071731949034 + inference_time: 2043.0 + throughput: 489.47626040137055 estimated_peak_memory_range: - min: 0 - max: 480120320 + min: 16384 + max: 250792696 primary_compute_unit: NPU precision: int8 layer_info: @@ -67,22 +69,22 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 78 - job_id: jw56ewm6g + job_id: jogkyekop job_status: Passed torchscript_onnx_ort: - inference_time: 2464.0 - throughput: 405.84415584415586 + inference_time: 2117.0 + throughput: 472.3665564478035 estimated_peak_memory_range: - min: 24576 - max: 187692992 + min: 110592 + max: 324998136 primary_compute_unit: NPU precision: int8 layer_info: - layers_on_npu: 1 + layers_on_npu: 86 layers_on_gpu: 0 layers_on_cpu: 0 - total_layers: 1 - job_id: jwgok8wqp + total_layers: 86 + job_id: j1p3mjrzg job_status: Passed reference_device_info: name: Samsung Galaxy S23 @@ -91,13 +93,13 @@ models: os_name: Android manufacturer: Samsung chipset: Snapdragon® 8 Gen 2 - timestamp: '2024-04-23T18:42:33.850968Z' + timestamp: '2024-05-20T16:35:31.575903Z' - torchscript_onnx_tflite: - inference_time: 1351.0 - throughput: 740.1924500370096 + inference_time: 1377.0 + throughput: 726.2164124909223 estimated_peak_memory_range: min: 12288 - max: 55206416 + max: 54112960 primary_compute_unit: NPU precision: int8 layer_info: @@ 
-105,14 +107,14 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 80 - job_id: j1gl6l8jg + job_id: j2p0rzxep job_status: Passed torchscript_onnx_qnn: - inference_time: 1589.0 - throughput: 629.3266205160478 + inference_time: 1526.0 + throughput: 655.307994757536 estimated_peak_memory_range: - min: 167936 - max: 45857248 + min: 0 + max: 44606448 primary_compute_unit: NPU precision: int8 layer_info: @@ -120,22 +122,22 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 78 - job_id: j1p3v673g + job_id: jn5q26dm5 job_status: Passed torchscript_onnx_ort: - inference_time: 1858.0 - throughput: 538.2131324004306 + inference_time: 1713.0 + throughput: 583.7711617046118 estimated_peak_memory_range: min: 0 - max: 28645856 + max: 30424256 primary_compute_unit: NPU precision: int8 layer_info: - layers_on_npu: 1 + layers_on_npu: 86 layers_on_gpu: 0 layers_on_cpu: 0 - total_layers: 1 - job_id: j1pv07nk5 + total_layers: 86 + job_id: jwgov29d5 job_status: Passed reference_device_info: name: Samsung Galaxy S24 @@ -144,51 +146,74 @@ models: os_name: Android manufacturer: Samsung chipset: Snapdragon® 8 Gen 3 - timestamp: '2024-04-23T18:42:33.851014Z' + timestamp: '2024-05-20T16:35:31.575930Z' - torchscript_onnx_tflite: - inference_time: 8152.0 - throughput: 122.6692836113837 + inference_time: 1831.0 + throughput: 546.1496450027307 estimated_peak_memory_range: - min: 12288 - max: 25276096 + min: 61440 + max: 1506248 primary_compute_unit: NPU precision: int8 layer_info: - layers_on_npu: 82 + layers_on_npu: 80 layers_on_gpu: 0 layers_on_cpu: 0 - total_layers: 82 - job_id: j1pv0yxk5 + total_layers: 80 + job_id: j1p87qk85 job_status: Passed torchscript_onnx_qnn: - inference_time: 'null' - throughput: 'null' + inference_time: 2035.0 + throughput: 491.4004914004914 estimated_peak_memory_range: - min: 0 - max: 0 - primary_compute_unit: 'null' - precision: 'null' + min: 16384 + max: 250480080 + primary_compute_unit: NPU + precision: int8 layer_info: - layers_on_npu: 0 + layers_on_npu: 78 layers_on_gpu: 0 layers_on_cpu: 0 - total_layers: 0 - job_id: jw56ex4yg - job_status: Failed - torchscript_onnx_ort: - inference_time: 75852.0 - throughput: 13.183568000843747 + total_layers: 78 + job_id: jw561y07p + job_status: Passed + reference_device_info: + name: QCS8550 (Proxy) + os: '12' + form_factor: Iot + os_name: Android + manufacturer: Qualcomm + chipset: Qcs8550 + timestamp: '2024-05-20T16:35:31.575948Z' + - torchscript_onnx_tflite: + inference_time: 8208.0 + throughput: 121.83235867446393 estimated_peak_memory_range: - min: 4431872 - max: 54054544 - primary_compute_unit: CPU - precision: fp32 + min: 12288 + max: 26585200 + primary_compute_unit: NPU + precision: int8 layer_info: - layers_on_npu: 0 + layers_on_npu: 80 layers_on_gpu: 0 - layers_on_cpu: 88 - total_layers: 88 - job_id: j7gjzq8v5 + layers_on_cpu: 0 + total_layers: 80 + job_id: j2p0lxj6p + job_status: Passed + torchscript_onnx_qnn: + inference_time: 8312.0 + throughput: 120.30798845043311 + estimated_peak_memory_range: + min: 94208 + max: 42576560 + primary_compute_unit: NPU + precision: int8 + layer_info: + layers_on_npu: 78 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 78 + job_id: j1p3er9l5 job_status: Passed reference_device_info: name: RB3 Gen 2 (Proxy) @@ -197,21 +222,21 @@ models: os_name: Android manufacturer: Qualcomm chipset: Qcs6490 - timestamp: '2024-04-23T18:42:33.851058Z' + timestamp: '2024-05-20T16:35:31.575965Z' - torchscript_onnx_tflite: - inference_time: 24077.0 - throughput: 41.533413631266356 + inference_time: 
23889.0 + throughput: 41.8602704173469 estimated_peak_memory_range: min: 45056 - max: 2559568 + max: 2992736 primary_compute_unit: NPU precision: int8 layer_info: - layers_on_npu: 82 + layers_on_npu: 80 layers_on_gpu: 0 layers_on_cpu: 0 - total_layers: 82 - job_id: jnp1wo8lg + total_layers: 80 + job_id: j1p8zkxxp job_status: Passed reference_device_info: name: RB5 (Proxy) @@ -220,42 +245,57 @@ models: os_name: Android manufacturer: Qualcomm chipset: Qcs8250 - timestamp: '2024-04-23T18:42:33.851077Z' - - torchscript_onnx_tflite: - inference_time: 1831.0 - throughput: 546.1496450027307 + timestamp: '2024-05-20T16:35:31.575975Z' + - torchscript_onnx_qnn: + inference_time: 1966.0 + throughput: 508.646998982706 estimated_peak_memory_range: - min: 32768 - max: 1466192 + min: 344064 + max: 344064 primary_compute_unit: NPU precision: int8 layer_info: - layers_on_npu: 82 + layers_on_npu: 78 layers_on_gpu: 0 layers_on_cpu: 0 - total_layers: 82 - job_id: j2p038q9p + total_layers: 78 + job_id: j1glkvqlp job_status: Passed - torchscript_onnx_qnn: - inference_time: 2151.0 - throughput: 464.9000464900046 + torchscript_onnx_ort: + inference_time: 1912.0 + throughput: 523.0125523012553 estimated_peak_memory_range: - min: 622592 - max: 7136072 + min: 115851264 + max: 115851264 primary_compute_unit: NPU precision: int8 layer_info: - layers_on_npu: 80 + layers_on_npu: 86 layers_on_gpu: 0 layers_on_cpu: 0 - total_layers: 80 - job_id: jqp4k601g + total_layers: 86 + job_id: j1pvw6nmg + job_status: Passed + torchscript_onnx_ort_dml_gpu: + inference_time: 256303.0 + throughput: 3.9016320526876394 + estimated_peak_memory_range: + min: 20701184 + max: 20701184 + primary_compute_unit: GPU + precision: int8 + layer_info: + layers_on_npu: 0 + layers_on_gpu: 1 + layers_on_cpu: 0 + total_layers: 1 + job_id: j7gjlv88p job_status: Passed reference_device_info: - name: QCS8550 (Proxy) - os: '12' - form_factor: Iot - os_name: Android + name: Snapdragon X Elite CRD + os: '11' + form_factor: Compute + os_name: Windows manufacturer: Qualcomm - chipset: Qcs8550 - timestamp: '2024-04-23T18:42:33.851108Z' + chipset: Snapdragon® X Elite + timestamp: '2024-05-20T16:35:31.575998Z' diff --git a/qai_hub_models/models/xlsr/README.md b/qai_hub_models/models/xlsr/README.md index 34d830be..1b462ab6 100644 --- a/qai_hub_models/models/xlsr/README.md +++ b/qai_hub_models/models/xlsr/README.md @@ -14,6 +14,8 @@ accross various devices, can be found [here](https://aihub.qualcomm.com/models/x a hosted Qualcomm® device. + + ## Example & Usage diff --git a/qai_hub_models/models/xlsr/export.py b/qai_hub_models/models/xlsr/export.py index d1edd6e3..917c64ab 100644 --- a/qai_hub_models/models/xlsr/export.py +++ b/qai_hub_models/models/xlsr/export.py @@ -119,12 +119,16 @@ def export_model( model.eval() source_model = torch.jit.trace(model.to("cpu"), make_torch_inputs(input_spec)) + # Convert outputs from channel last to channel first (preferred I/O format for QNN and TensorFlow Lite) + channel_last_flags = ( + " --force_channel_last_input image" + " --force_channel_last_output output_0" + if target_runtime != TargetRuntime.ORT + else "" + ) + # 2. 
Compile the model to an on-device asset model_compile_options = model.get_hub_compile_options( - target_runtime, - compile_options - + " --force_channel_last_input image" - + " --force_channel_last_output output_0", + target_runtime, compile_options + channel_last_flags, hub_device ) print(f"Optimizing model {model_name} to run on-device") submitted_compile_job = hub.submit_compile_job( @@ -162,8 +166,10 @@ def export_model( ) sample_inputs = model.sample_inputs(input_spec) # Convert inputs from channel first to channel last - hub_inputs = transpose_channel_first_to_last( - "image", sample_inputs, target_runtime + hub_inputs = ( + sample_inputs + if target_runtime == TargetRuntime.ORT + else transpose_channel_first_to_last("image", sample_inputs, target_runtime) ) submitted_inference_job = hub.submit_inference_job( model=compile_job.get_target_model(), @@ -191,8 +197,12 @@ def export_model( assert inference_job is not None and inference_job.wait().success inference_result: hub.client.DatasetEntries = inference_job.download_output_data() # type: ignore # Convert outputs from channel last to channel first - inference_result = transpose_channel_last_to_first( - "output_0", inference_result, target_runtime + inference_result = ( + inference_result + if target_runtime == TargetRuntime.ORT + else transpose_channel_last_to_first( + "output_0", inference_result, target_runtime + ) ) print_inference_metrics(inference_job, inference_result, torch_out) @@ -201,7 +211,7 @@ def export_model( def main(): warnings.filterwarnings("ignore") - parser = export_parser(model_cls=Model, supports_ort=False) + parser = export_parser(model_cls=Model) args = parser.parse_args() export_model(**vars(args)) diff --git a/qai_hub_models/models/xlsr/model.py b/qai_hub_models/models/xlsr/model.py index aaee1928..5ad0eed8 100644 --- a/qai_hub_models/models/xlsr/model.py +++ b/qai_hub_models/models/xlsr/model.py @@ -49,7 +49,7 @@ def from_pretrained(cls) -> XLSR: def get_evaluator(self) -> BaseEvaluator: return SuperResolutionOutputEvaluator() - def forward(self, image: torch.Tensor) -> torch.Tensor: + def forward(self, image): """ Run XLSR on `image`, and produce an upscaled image diff --git a/qai_hub_models/models/xlsr/perf.yaml b/qai_hub_models/models/xlsr/perf.yaml index 5931cad8..c232cb11 100644 --- a/qai_hub_models/models/xlsr/perf.yaml +++ b/qai_hub_models/models/xlsr/perf.yaml @@ -22,6 +22,7 @@ aggregated: - Samsung Galaxy S24 Ultra - Samsung Galaxy S24+ - Samsung Galaxy Tab S8 + - Snapdragon X Elite CRD - Xiaomi 12 - Xiaomi 12 Pro supported_chipsets: @@ -30,15 +31,16 @@ aggregated: - Snapdragon® 8 Gen 2 - Snapdragon® 8 Gen 3 - Snapdragon® 888 + - Snapdragon® X Elite models: - name: XLSR performance_metrics: - torchscript_onnx_tflite: - inference_time: 2596.0 - throughput: 385.2080123266564 + inference_time: 2482.0 + throughput: 402.90088638195004 estimated_peak_memory_range: - min: 12288 - max: 1829544 + min: 16384 + max: 1867704 primary_compute_unit: NPU precision: fp16 layer_info: @@ -46,14 +48,14 @@ models: layers_on_gpu: 0 layers_on_cpu: 3 total_layers: 16 - job_id: jlpeeynop + job_id: jlpevdn05 job_status: Passed torchscript_onnx_qnn: - inference_time: 971.0 - throughput: 1029.8661174047375 + inference_time: 1346.0 + throughput: 742.9420505200594 estimated_peak_memory_range: - min: 217088 - max: 11994560 + min: 16384 + max: 5062976 primary_compute_unit: NPU precision: fp16 layer_info: @@ -61,22 +63,22 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 21 - job_id: jz5w24r35 + job_id: jmg94lqv5 
job_status: Passed torchscript_onnx_ort: - inference_time: 1502.0 - throughput: 665.7789613848203 + inference_time: 1552.0 + throughput: 644.3298969072165 estimated_peak_memory_range: - min: 212992 - max: 8613544 + min: 16384 + max: 72227024 primary_compute_unit: NPU precision: fp16 layer_info: - layers_on_npu: 1 + layers_on_npu: 23 layers_on_gpu: 0 layers_on_cpu: 0 - total_layers: 1 - job_id: jnp1y6m8p + total_layers: 23 + job_id: jqp4wl6lg job_status: Passed reference_device_info: name: Samsung Galaxy S23 @@ -85,13 +87,13 @@ models: os_name: Android manufacturer: Samsung chipset: Snapdragon® 8 Gen 2 - timestamp: '2024-04-23T18:42:33.886071Z' + timestamp: '2024-05-20T16:35:31.615271Z' - torchscript_onnx_tflite: - inference_time: 1833.0 - throughput: 545.5537370430987 + inference_time: 1775.0 + throughput: 563.3802816901408 estimated_peak_memory_range: min: 16384 - max: 19549104 + max: 20190320 primary_compute_unit: NPU precision: fp16 layer_info: @@ -99,14 +101,14 @@ models: layers_on_gpu: 0 layers_on_cpu: 3 total_layers: 16 - job_id: jygzon0o5 + job_id: jygz7306p job_status: Passed torchscript_onnx_qnn: - inference_time: 632.0 - throughput: 1582.2784810126582 + inference_time: 834.0 + throughput: 1199.0407673860911 estimated_peak_memory_range: - min: 208896 - max: 17756816 + min: 0 + max: 16978032 primary_compute_unit: NPU precision: fp16 layer_info: @@ -114,22 +116,22 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 21 - job_id: jmg9jdqw5 + job_id: jnp184mlg job_status: Passed torchscript_onnx_ort: - inference_time: 1006.0 - throughput: 994.0357852882704 + inference_time: 1029.0 + throughput: 971.8172983479105 estimated_peak_memory_range: - min: 344064 - max: 16233520 + min: 0 + max: 15374048 primary_compute_unit: NPU precision: fp16 layer_info: - layers_on_npu: 1 + layers_on_npu: 23 layers_on_gpu: 0 layers_on_cpu: 0 - total_layers: 1 - job_id: jvgde2mr5 + total_layers: 23 + job_id: j0px1k89g job_status: Passed reference_device_info: name: Samsung Galaxy S24 @@ -138,13 +140,13 @@ models: os_name: Android manufacturer: Samsung chipset: Snapdragon® 8 Gen 3 - timestamp: '2024-04-23T18:42:33.886105Z' + timestamp: '2024-05-20T16:35:31.615298Z' - torchscript_onnx_tflite: - inference_time: 2709.0 - throughput: 369.139904023625 + inference_time: 2490.0 + throughput: 401.60642570281124 estimated_peak_memory_range: - min: 6631424 - max: 8101008 + min: 12623872 + max: 14367408 primary_compute_unit: NPU precision: fp16 layer_info: @@ -152,14 +154,14 @@ models: layers_on_gpu: 0 layers_on_cpu: 3 total_layers: 16 - job_id: jmg9jrn85 + job_id: jz5w9erjp job_status: Passed torchscript_onnx_qnn: - inference_time: 963.0 - throughput: 1038.4215991692627 + inference_time: 1362.0 + throughput: 734.2143906020558 estimated_peak_memory_range: - min: 212992 - max: 33066344 + min: 49152 + max: 9493856 primary_compute_unit: NPU precision: fp16 layer_info: @@ -167,7 +169,7 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 21 - job_id: jqp4k7r1g + job_id: jz57dy1r5 job_status: Passed reference_device_info: name: QCS8550 (Proxy) @@ -176,4 +178,57 @@ models: os_name: Android manufacturer: Qualcomm chipset: Qcs8550 - timestamp: '2024-04-23T18:42:33.886129Z' + timestamp: '2024-05-20T16:35:31.615315Z' + - torchscript_onnx_qnn: + inference_time: 3991.0 + throughput: 250.56376847907794 + estimated_peak_memory_range: + min: 237568 + max: 237568 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 21 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 21 + job_id: jvgdvxmlg + 
job_status: Passed + torchscript_onnx_ort: + inference_time: 1578.0 + throughput: 633.7135614702155 + estimated_peak_memory_range: + min: 8957952 + max: 8957952 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 23 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 23 + job_id: jo5mzn1qp + job_status: Passed + torchscript_onnx_ort_dml_gpu: + inference_time: 3324.0 + throughput: 300.84235860409143 + estimated_peak_memory_range: + min: 16203776 + max: 16203776 + primary_compute_unit: CPU + precision: fp32 + layer_info: + layers_on_npu: 0 + layers_on_gpu: 0 + layers_on_cpu: 14 + total_layers: 14 + job_id: jegne6dmg + job_status: Passed + reference_device_info: + name: Snapdragon X Elite CRD + os: '11' + form_factor: Compute + os_name: Windows + manufacturer: Qualcomm + chipset: Snapdragon® X Elite + timestamp: '2024-05-20T16:35:31.615338Z' diff --git a/qai_hub_models/models/xlsr_quantized/README.md b/qai_hub_models/models/xlsr_quantized/README.md index b4b99361..968a6349 100644 --- a/qai_hub_models/models/xlsr_quantized/README.md +++ b/qai_hub_models/models/xlsr_quantized/README.md @@ -14,6 +14,8 @@ accross various devices, can be found [here](https://aihub.qualcomm.com/models/x a hosted Qualcomm® device. + + ## Example & Usage diff --git a/qai_hub_models/models/xlsr_quantized/export.py b/qai_hub_models/models/xlsr_quantized/export.py index aafd8724..ccbe279b 100644 --- a/qai_hub_models/models/xlsr_quantized/export.py +++ b/qai_hub_models/models/xlsr_quantized/export.py @@ -123,12 +123,16 @@ def export_model( else: quant_calibration_data = model.get_calibration_data(target_runtime, input_spec) + # Convert outputs from channel last to channel first (preferred I/O format for QNN and TensorFlow Lite) + channel_last_flags = ( + " --force_channel_last_input image" + " --force_channel_last_output output_0" + if target_runtime != TargetRuntime.ORT + else "" + ) + # 2. 
Compile the model to an on-device asset model_compile_options = model.get_hub_compile_options( - target_runtime, - compile_options - + " --force_channel_last_input image" - + " --force_channel_last_output output_0", + target_runtime, compile_options + channel_last_flags, hub_device ) print(f"Optimizing model {model_name} to run on-device") submitted_compile_job = hub.submit_compile_job( @@ -170,8 +174,10 @@ def export_model( if target_runtime == TargetRuntime.QNN: hub_inputs = get_qnn_inputs(compile_job, sample_inputs) # Convert inputs from channel first to channel last - hub_inputs = transpose_channel_first_to_last( - "image", sample_inputs, target_runtime + hub_inputs = ( + sample_inputs + if target_runtime == TargetRuntime.ORT + else transpose_channel_first_to_last("image", sample_inputs, target_runtime) ) submitted_inference_job = hub.submit_inference_job( model=compile_job.get_target_model(), @@ -199,8 +205,12 @@ def export_model( assert inference_job is not None and inference_job.wait().success inference_result: hub.client.DatasetEntries = inference_job.download_output_data() # type: ignore # Convert outputs from channel last to channel first - inference_result = transpose_channel_last_to_first( - "output_0", inference_result, target_runtime + inference_result = ( + inference_result + if target_runtime == TargetRuntime.ORT + else transpose_channel_last_to_first( + "output_0", inference_result, target_runtime + ) ) print_inference_metrics(inference_job, inference_result, torch_out) @@ -209,7 +219,7 @@ def export_model( def main(): warnings.filterwarnings("ignore") - parser = export_parser(model_cls=Model, supports_qnn=False, supports_ort=False) + parser = export_parser(model_cls=Model) args = parser.parse_args() export_model(**vars(args)) diff --git a/qai_hub_models/models/xlsr_quantized/model.py b/qai_hub_models/models/xlsr_quantized/model.py index cbf2ec5a..7ff4cd2c 100644 --- a/qai_hub_models/models/xlsr_quantized/model.py +++ b/qai_hub_models/models/xlsr_quantized/model.py @@ -8,28 +8,23 @@ # This verifies aimet is installed, and this must be included first. 
from qai_hub_models.utils.quantization_aimet import ( AIMETQuantizableMixin, + constrain_quantized_inputs_to_image_range, ) # isort: on import torch +from aimet_torch.cross_layer_equalization import equalize_model +from aimet_torch.model_preparer import prepare_model from aimet_torch.quantsim import QuantizationSimModel, load_encodings_to_sim -from qai_hub_models.models.common import SourceModelFormat, TargetRuntime -from qai_hub_models.models.xlsr.model import XLSR, _load_xlsr_source_model +from qai_hub_models.models.xlsr.model import XLSR +from qai_hub_models.utils.aimet.config_loader import get_default_aimet_config from qai_hub_models.utils.asset_loaders import CachedWebModelAsset MODEL_ID = __name__.split(".")[-2] -MODEL_ASSET_VERSION = 2 -# Weights and config stored in S3 are sourced from -# https://github.com/quic/aimet-model-zoo/blob/develop/aimet_zoo_torch/xlsr/model/model_cards/xlsr_4x_w8a8.json: -# https://github.com/quic/aimet-model-zoo/releases/download/phase_2_february_artifacts/xlsr_4x_checkpoint_int8.pth -# and -# https://raw.githubusercontent.com/quic/aimet/release-aimet-1.23/TrainingExtensions/common/src/python/aimet_common/quantsim_config/default_config_per_channel.js -# Encodings were generated with AIMET QuantSim library -XLSR_QUANTIZED_WEIGHTS = "xlsr_4x_checkpoint_int8.pth" -AIMET_ENCODINGS = "aimet_quantization_encodings.json" -AIMET_CONFIG = "default_config_per_channel.json" +MODEL_ASSET_VERSION = 3 +DEFAULT_ENCODINGS = "xlsr_quantized_encodings.json" SCALING_FACTOR = 4 @@ -44,9 +39,7 @@ def __init__( xlsr_model: QuantizationSimModel, ) -> None: XLSR.__init__(self, xlsr_model.model) - AIMETQuantizableMixin.__init__( - self, xlsr_model, needs_onnx_direct_aimet_export=True - ) + AIMETQuantizableMixin.__init__(self, xlsr_model) @classmethod def from_pretrained( @@ -60,40 +53,27 @@ def from_pretrained( elif None: Doesn't load any encodings. Used when computing encodings. else: Interprets as a filepath and loads the encodings stored there. 
""" - xlsr = _load_xlsr_source_model() - input_shape = XLSR.get_input_spec()["image"][0] + fp16_model = XLSR.from_pretrained() + input_shape = cls.get_input_spec()["image"][0] - weights = CachedWebModelAsset.from_asset_store( - MODEL_ID, MODEL_ASSET_VERSION, XLSR_QUANTIZED_WEIGHTS - ).fetch() - aimet_config = CachedWebModelAsset.from_asset_store( - MODEL_ID, MODEL_ASSET_VERSION, AIMET_CONFIG - ).fetch() + model = prepare_model(fp16_model) + equalize_model(model, input_shape) - # Load the model weights and quantization parameters - state_dict = torch.load(weights, map_location=torch.device("cpu"))["state_dict"] - xlsr.load_state_dict(state_dict) sim = QuantizationSimModel( - xlsr, + model, quant_scheme="tf_enhanced", default_param_bw=8, default_output_bw=8, - config_file=aimet_config, + config_file=get_default_aimet_config(), dummy_input=torch.rand(input_shape), ) + constrain_quantized_inputs_to_image_range(sim) + if aimet_encodings: if aimet_encodings == "DEFAULT": aimet_encodings = CachedWebModelAsset.from_asset_store( - MODEL_ID, MODEL_ASSET_VERSION, AIMET_ENCODINGS + MODEL_ID, MODEL_ASSET_VERSION, DEFAULT_ENCODINGS ).fetch() load_encodings_to_sim(sim, aimet_encodings) return cls(sim) - - def preferred_hub_source_model_format( - self, target_runtime: TargetRuntime - ) -> SourceModelFormat: - if target_runtime == TargetRuntime.QNN: - return SourceModelFormat.ONNX - else: - return SourceModelFormat.TORCHSCRIPT diff --git a/qai_hub_models/models/xlsr_quantized/perf.yaml b/qai_hub_models/models/xlsr_quantized/perf.yaml index 9dc9925a..a33479bf 100644 --- a/qai_hub_models/models/xlsr_quantized/perf.yaml +++ b/qai_hub_models/models/xlsr_quantized/perf.yaml @@ -26,6 +26,7 @@ aggregated: - Samsung Galaxy S24 Ultra - Samsung Galaxy S24+ - Samsung Galaxy Tab S8 + - Snapdragon X Elite CRD - Xiaomi 12 - Xiaomi 12 Pro supported_chipsets: @@ -36,15 +37,16 @@ aggregated: - Snapdragon® 8 Gen 2 - Snapdragon® 8 Gen 3 - Snapdragon® 888 + - Snapdragon® X Elite models: - name: XLSR-Quantized performance_metrics: - torchscript_onnx_tflite: - inference_time: 1128.0 - throughput: 886.5248226950355 + inference_time: 1142.0 + throughput: 875.6567425569177 estimated_peak_memory_range: - min: 12288 - max: 1590504 + min: 20480 + max: 1494816 primary_compute_unit: NPU precision: int8 layer_info: @@ -52,7 +54,37 @@ models: layers_on_gpu: 0 layers_on_cpu: 3 total_layers: 17 - job_id: jmg9jdq85 + job_id: jopryvmeg + job_status: Passed + torchscript_onnx_qnn: + inference_time: 798.0 + throughput: 1253.1328320802006 + estimated_peak_memory_range: + min: 65536 + max: 74050712 + primary_compute_unit: NPU + precision: int8 + layer_info: + layers_on_npu: 17 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 17 + job_id: j2p0rz8ep + job_status: Passed + torchscript_onnx_ort: + inference_time: 1166.0 + throughput: 857.6329331046312 + estimated_peak_memory_range: + min: 12288 + max: 10231824 + primary_compute_unit: NPU + precision: int8 + layer_info: + layers_on_npu: 21 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 21 + job_id: j1glkv9lp job_status: Passed reference_device_info: name: Samsung Galaxy S23 @@ -61,13 +93,13 @@ models: os_name: Android manufacturer: Samsung chipset: Snapdragon® 8 Gen 2 - timestamp: '2024-04-23T18:42:33.910002Z' + timestamp: '2024-05-20T16:35:31.645635Z' - torchscript_onnx_tflite: - inference_time: 1209.0 - throughput: 827.129859387924 + inference_time: 948.0 + throughput: 1054.8523206751054 estimated_peak_memory_range: - min: 53248 - max: 20193472 + min: 16384 + max: 20809824 
primary_compute_unit: NPU precision: int8 layer_info: @@ -75,7 +107,37 @@ models: layers_on_gpu: 0 layers_on_cpu: 3 total_layers: 17 - job_id: jnp1y6m7p + job_id: jep2mkqm5 + job_status: Passed + torchscript_onnx_qnn: + inference_time: 548.0 + throughput: 1824.8175182481752 + estimated_peak_memory_range: + min: 65536 + max: 18623024 + primary_compute_unit: NPU + precision: int8 + layer_info: + layers_on_npu: 17 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 17 + job_id: j1p87qd85 + job_status: Passed + torchscript_onnx_ort: + inference_time: 864.0 + throughput: 1157.4074074074074 + estimated_peak_memory_range: + min: 344064 + max: 17534000 + primary_compute_unit: NPU + precision: int8 + layer_info: + layers_on_npu: 21 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 21 + job_id: jw561y97p job_status: Passed reference_device_info: name: Samsung Galaxy S24 @@ -84,21 +146,74 @@ models: os_name: Android manufacturer: Samsung chipset: Snapdragon® 8 Gen 3 - timestamp: '2024-04-23T18:42:33.910020Z' + timestamp: '2024-05-20T16:35:31.645660Z' - torchscript_onnx_tflite: - inference_time: 3053.0 - throughput: 327.54667540124467 + inference_time: 1133.0 + throughput: 882.61253309797 estimated_peak_memory_range: - min: 57344 - max: 15609680 + min: 12288 + max: 1909504 primary_compute_unit: NPU precision: int8 layer_info: - layers_on_npu: 16 + layers_on_npu: 14 layers_on_gpu: 0 layers_on_cpu: 3 - total_layers: 19 - job_id: jopr8r375 + total_layers: 17 + job_id: jqpyd1k4p + job_status: Passed + torchscript_onnx_qnn: + inference_time: 793.0 + throughput: 1261.034047919294 + estimated_peak_memory_range: + min: 69632 + max: 73885472 + primary_compute_unit: NPU + precision: int8 + layer_info: + layers_on_npu: 17 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 17 + job_id: jn5q26xm5 + job_status: Passed + reference_device_info: + name: QCS8550 (Proxy) + os: '12' + form_factor: Iot + os_name: Android + manufacturer: Qualcomm + chipset: Qcs8550 + timestamp: '2024-05-20T16:35:31.645677Z' + - torchscript_onnx_tflite: + inference_time: 2418.0 + throughput: 413.564929693962 + estimated_peak_memory_range: + min: 12288 + max: 14878432 + primary_compute_unit: NPU + precision: int8 + layer_info: + layers_on_npu: 14 + layers_on_gpu: 0 + layers_on_cpu: 3 + total_layers: 17 + job_id: jqpy6yo75 + job_status: Passed + torchscript_onnx_qnn: + inference_time: 1550.0 + throughput: 645.1612903225806 + estimated_peak_memory_range: + min: 65536 + max: 17596976 + primary_compute_unit: NPU + precision: int8 + layer_info: + layers_on_npu: 17 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 17 + job_id: jw56n080g job_status: Passed reference_device_info: name: RB3 Gen 2 (Proxy) @@ -107,21 +222,21 @@ models: os_name: Android manufacturer: Qualcomm chipset: Qcs6490 - timestamp: '2024-04-23T18:42:33.910035Z' + timestamp: '2024-05-20T16:35:31.645694Z' - torchscript_onnx_tflite: - inference_time: 15998.0 - throughput: 62.50781347668458 + inference_time: 14145.0 + throughput: 70.69635913750442 estimated_peak_memory_range: - min: 45056 - max: 17827664 + min: 4235264 + max: 15314136 primary_compute_unit: GPU precision: int8 layer_info: - layers_on_npu: 5 + layers_on_npu: 3 layers_on_gpu: 9 layers_on_cpu: 5 - total_layers: 19 - job_id: jvgdq6vl5 + total_layers: 17 + job_id: j2p0lxm6p job_status: Passed reference_device_info: name: RB5 (Proxy) @@ -130,27 +245,57 @@ models: os_name: Android manufacturer: Qualcomm chipset: Qcs8250 - timestamp: '2024-04-23T18:42:33.910050Z' - - torchscript_onnx_tflite: - inference_time: 
1313.0 - throughput: 761.6146230007616 + timestamp: '2024-05-20T16:35:31.645705Z' + - torchscript_onnx_qnn: + inference_time: 933.0 + throughput: 1071.8113612004288 estimated_peak_memory_range: - min: 28672 - max: 5004672 + min: 49152 + max: 49152 primary_compute_unit: NPU precision: int8 layer_info: - layers_on_npu: 16 + layers_on_npu: 17 layers_on_gpu: 0 - layers_on_cpu: 3 - total_layers: 19 - job_id: j2p03w0np + layers_on_cpu: 0 + total_layers: 17 + job_id: jogkyewop + job_status: Passed + torchscript_onnx_ort: + inference_time: 1191.0 + throughput: 839.6305625524769 + estimated_peak_memory_range: + min: 8818688 + max: 8818688 + primary_compute_unit: NPU + precision: int8 + layer_info: + layers_on_npu: 21 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 21 + job_id: j1p3mjlzg + job_status: Passed + torchscript_onnx_ort_dml_gpu: + inference_time: 11552.0 + throughput: 86.56509695290859 + estimated_peak_memory_range: + min: 33103872 + max: 33103872 + primary_compute_unit: GPU + precision: int8 + layer_info: + layers_on_npu: 0 + layers_on_gpu: 1 + layers_on_cpu: 0 + total_layers: 1 + job_id: jwgov27d5 + job_status: Passed reference_device_info: - name: QCS8550 (Proxy) - os: '12' - form_factor: Iot - os_name: Android + name: Snapdragon X Elite CRD + os: '11' + form_factor: Compute + os_name: Windows manufacturer: Qualcomm - chipset: Qcs8550 - timestamp: '2024-04-23T18:42:33.910063Z' + chipset: Snapdragon® X Elite + timestamp: '2024-05-20T16:35:31.645729Z' diff --git a/qai_hub_models/models/yolonas/README.md b/qai_hub_models/models/yolonas/README.md new file mode 100644 index 00000000..15cb8fb0 --- /dev/null +++ b/qai_hub_models/models/yolonas/README.md @@ -0,0 +1,61 @@ +[![Qualcomm® AI Hub Models](https://qaihub-public-assets.s3.us-west-2.amazonaws.com/qai-hub-models/quic-logo.jpg)](../../README.md) + + +# [Yolo-NAS: Real-time object detection optimized for mobile and edge](https://aihub.qualcomm.com/models/yolonas) + +YoloNAS is a machine learning model that predicts bounding boxes and classes of objects in an image. + +This is based on the implementation of Yolo-NAS found +[here](https://github.com/Deci-AI/super-gradients). This repository contains scripts for optimized on-device +export suitable to run on Qualcomm® devices. More details on model performance +across various devices can be found [here](https://aihub.qualcomm.com/models/yolonas). + +[Sign up](https://myaccount.qualcomm.com/signup) for early access to run these models on +a hosted Qualcomm® device. + + + + +## Example & Usage + +Install the package via pip: +```bash +pip install "qai_hub_models[yolonas]" +``` + + +Once installed, run the following simple CLI demo: + +```bash +python -m qai_hub_models.models.yolonas.demo +``` +More details on the CLI tool can be found with the `--help` option. See +[demo.py](demo.py) for sample usage of the model including pre/post processing +scripts. Please refer to our [general instructions on using +models](../../../#getting-started) for more usage instructions. + +## Export for on-device deployment + +This repository contains export scripts that produce a model optimized for +on-device deployment. This can be run as follows: + +```bash +python -m qai_hub_models.models.yolonas.export +``` +Additional options are documented with the `--help` option. Note that the above +script requires access to Deployment instructions for Qualcomm® AI Hub.
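+
+## Example Python usage
+
+The classes added in this folder can also be used directly from Python. The snippet
+below is an illustrative sketch rather than an official API reference; it mirrors what
+[demo.py](demo.py) and [test.py](test.py) in this folder do, and only uses names that
+exist in this repository (`YoloNAS`, `YoloNASDetectionApp`, `predict_boxes_from_image`,
+the sample `IMAGE_ADDRESS`, and `load_image`):
+
+```python
+from qai_hub_models.models.yolonas.app import YoloNASDetectionApp
+from qai_hub_models.models.yolonas.model import YoloNAS
+from qai_hub_models.models.yolov7.demo import IMAGE_ADDRESS
+from qai_hub_models.utils.asset_loaders import load_image
+
+# Build the detector and wrap it in the detection app used by the demo and tests.
+app = YoloNASDetectionApp(YoloNAS.from_pretrained(), nms_score_threshold=0.7)
+
+# Run detection on the demo's sample image and inspect the raw predicted boxes.
+boxes = app.predict_boxes_from_image(load_image(IMAGE_ADDRESS), raw_output=True)[0][0]
+print(boxes.shape)
+```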
+ +## License +- The license for the original implementation of Yolo-NAS can be found + [here](https://github.com/Deci-AI/super-gradients/blob/master/LICENSE.YOLONAS.md). +- The license for the compiled assets for on-device deployment can be found [here]({deploy_license_url}) + +## References +* [YOLO-NAS by Deci Achieves SOTA Performance on Object Detection Using Neural Architecture Search](https://deci.ai/blog/yolo-nas-object-detection-foundation-model/) +* [Source Model Implementation](https://github.com/Deci-AI/super-gradients) + +## Community +* Join [our AI Hub Slack community](https://qualcomm-ai-hub.slack.com/join/shared_invite/zt-2d5zsmas3-Sj0Q9TzslueCjS31eXG2UA#/shared-invite/email) to collaborate, post questions and learn more about on-device AI. +* For questions or feedback please [reach out to us](mailto:ai-hub-support@qti.qualcomm.com). + + diff --git a/qai_hub_models/models/yolonas/__init__.py b/qai_hub_models/models/yolonas/__init__.py new file mode 100644 index 00000000..6d2ecd39 --- /dev/null +++ b/qai_hub_models/models/yolonas/__init__.py @@ -0,0 +1,8 @@ +# --------------------------------------------------------------------- +# Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. +# SPDX-License-Identifier: BSD-3-Clause +# --------------------------------------------------------------------- +from qai_hub_models.models.yolonas.app import YoloNASDetectionApp as App # noqa: F401 + +from .model import MODEL_ID # noqa: F401 +from .model import YoloNAS as Model # noqa: F401 diff --git a/qai_hub_models/models/yolonas/app.py b/qai_hub_models/models/yolonas/app.py new file mode 100644 index 00000000..06cb65fb --- /dev/null +++ b/qai_hub_models/models/yolonas/app.py @@ -0,0 +1,52 @@ +# --------------------------------------------------------------------- +# Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. +# SPDX-License-Identifier: BSD-3-Clause +# --------------------------------------------------------------------- +from __future__ import annotations + +from typing import Tuple + +import torch + +from qai_hub_models.models._shared.yolo.app import YoloObjectDetectionApp +from qai_hub_models.models.yolonas.model import YoloNAS + + +class YoloNASDetectionApp(YoloObjectDetectionApp): + def check_image_size(self, pixel_values: torch.Tensor) -> None: + """ + Verify image size is a valid model input. Image size should be shape + [batch_size, num_channels, height, width], where height and width are multiples + of `YoloNAS.STRIDE_MULTIPLE`. + """ + if len(pixel_values.shape) != 4: + raise ValueError("Pixel Values must be rank 4: [batch, channels, x, y]") + if ( + pixel_values.shape[2] % YoloNAS.STRIDE_MULTIPLE != 0 + or pixel_values.shape[3] % YoloNAS.STRIDE_MULTIPLE != 0 + ): + raise ValueError( + f"Pixel values must have spatial dimensions (H & W) that are multiples of {YoloNAS.STRIDE_MULTIPLE}." + ) + + def pre_nms_postprocess( + self, *predictions: torch.Tensor + ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: + """ + Process the output of the YOLO detector for input to NMS. + + Parameters: + predictions: + Should contain two tensors: boxes and scores. + + Returns: + boxes: torch.Tensor + Bounding box locations. Shape [batch, num preds, 4] where 4 == (left_x, top_y, right_x, bottom_y) + scores: torch.Tensor + Confidence score that the given box is the predicted class: Shape is [batch, num_preds] + class_idx: torch.tensor + Shape is [batch, num_preds] where the last dim is the index of the most probable class of the prediction. 
+ """ + boxes, scores = predictions + scores, class_idx = torch.max(scores, -1, keepdim=False) + return boxes, scores, class_idx diff --git a/qai_hub_models/models/yolonas/conftest.py b/qai_hub_models/models/yolonas/conftest.py new file mode 100644 index 00000000..2d67f608 --- /dev/null +++ b/qai_hub_models/models/yolonas/conftest.py @@ -0,0 +1,39 @@ +# --------------------------------------------------------------------- +# Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. +# SPDX-License-Identifier: BSD-3-Clause +# --------------------------------------------------------------------- +# THIS FILE WAS AUTO-GENERATED. DO NOT EDIT MANUALLY. + +import inspect + +import pytest + +from qai_hub_models.models.yolonas import Model +from qai_hub_models.utils.testing import skip_clone_repo_check + + +# Instantiate the model only once for all tests. +# Mock from_pretrained to always return the initialized model. +# This speeds up tests and limits memory leaks. +@pytest.fixture(scope="module", autouse=True) +def cached_from_pretrained(): + with pytest.MonkeyPatch.context() as mp: + pretrained_cache = {} + from_pretrained = Model.from_pretrained + sig = inspect.signature(from_pretrained) + + @skip_clone_repo_check + def _cached_from_pretrained(*args, **kwargs): + cache_key = str(args) + str(kwargs) + model = pretrained_cache.get(cache_key, None) + if model: + return model + else: + model = from_pretrained(*args, **kwargs) + pretrained_cache[cache_key] = model + return model + + _cached_from_pretrained.__signature__ = sig + + mp.setattr(Model, "from_pretrained", _cached_from_pretrained) + yield mp diff --git a/qai_hub_models/models/yolonas/demo.py b/qai_hub_models/models/yolonas/demo.py new file mode 100644 index 00000000..fa4ade05 --- /dev/null +++ b/qai_hub_models/models/yolonas/demo.py @@ -0,0 +1,24 @@ +# --------------------------------------------------------------------- +# Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. +# SPDX-License-Identifier: BSD-3-Clause +# --------------------------------------------------------------------- +from qai_hub_models.models._shared.yolo.demo import yolo_detection_demo +from qai_hub_models.models.yolonas.app import YoloNASDetectionApp +from qai_hub_models.models.yolonas.model import MODEL_ID, YoloNAS +from qai_hub_models.models.yolov7.demo import IMAGE_ADDRESS + + +def main(is_test: bool = False): + yolo_detection_demo( + YoloNAS, + MODEL_ID, + YoloNASDetectionApp, + IMAGE_ADDRESS, + YoloNAS.STRIDE_MULTIPLE, + is_test=is_test, + default_score_threshold=0.7, + ) + + +if __name__ == "__main__": + main() diff --git a/qai_hub_models/models/yolonas/export.py b/qai_hub_models/models/yolonas/export.py new file mode 100644 index 00000000..44dbdd27 --- /dev/null +++ b/qai_hub_models/models/yolonas/export.py @@ -0,0 +1,217 @@ +# --------------------------------------------------------------------- +# Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. +# SPDX-License-Identifier: BSD-3-Clause +# --------------------------------------------------------------------- +# THIS FILE WAS AUTO-GENERATED. DO NOT EDIT MANUALLY. 
+ + +from __future__ import annotations + +import os +import warnings +from pathlib import Path +from typing import Any, Dict, List, Optional, Tuple, cast + +import qai_hub as hub +import torch + +from qai_hub_models.models.yolonas import Model +from qai_hub_models.utils.args import ( + export_parser, + get_input_spec_kwargs, + get_model_kwargs, +) +from qai_hub_models.utils.base_model import TargetRuntime +from qai_hub_models.utils.compare import torch_inference +from qai_hub_models.utils.input_spec import make_torch_inputs +from qai_hub_models.utils.printing import ( + print_inference_metrics, + print_on_target_demo_cmd, + print_profile_metrics_from_job, +) +from qai_hub_models.utils.qai_hub_helpers import ( + can_access_qualcomm_ai_hub, + export_without_hub_access, + transpose_channel_first_to_last, +) + + +def export_model( + device: str = "Samsung Galaxy S23", + chipset: Optional[str] = None, + skip_profiling: bool = False, + skip_inferencing: bool = False, + skip_downloading: bool = False, + skip_summary: bool = False, + output_dir: Optional[str] = None, + target_runtime: TargetRuntime = TargetRuntime.TFLITE, + compile_options: str = "", + profile_options: str = "", + **additional_model_kwargs, +) -> Tuple[hub.CompileJob, Optional[hub.ProfileJob], Optional[hub.InferenceJob]] | List[ + str +]: + """ + This function accomplishes 6 main tasks: + + 1. Instantiates a PyTorch model and converts it to a traced TorchScript format. + 2. Compiles the model to an asset that can be run on device. + 3. Profiles the model performance on real devices. + 4. Inferences the model on sample inputs. + 5. Downloads the model asset to the local directory. + 6. Summarizes the results from profiling and inference. + + Each of the last four steps can be optionally skipped using the input options. + + Parameters: + device: Device for which to export the model. + Full list of available devices can be found by running `hub.get_devices()`. + Defaults to DEFAULT_DEVICE if not specified. + chipset: If set, will choose a random device with this chipset. + Overrides the `device` argument. + skip_profiling: If set, skips profiling of compiled model on real devices. + skip_inferencing: If set, skips computing on-device outputs from sample data. + skip_downloading: If set, skips downloading of compiled model. + skip_summary: If set, skips waiting for and summarizing results + from profiling and inference. + output_dir: Directory to store generated assets (e.g. compiled model). + Defaults to `/build/`. + target_runtime: Which on-device runtime to target. Default is TFLite. + compile_options: Additional options to pass when submitting the compile job. + profile_options: Additional options to pass when submitting the profile job. + **additional_model_kwargs: Additional optional kwargs used to customize + `model_cls.from_pretrained` and `model.get_input_spec` + + Returns: + A 3-tuple of: + * A CompileJob object containing metadata about the compile job submitted to hub. + * A ProfileJob containing metadata about the profile job (None if profiling skipped). + * An InferenceJob containing metadata about the inference job (None if inferencing skipped). 
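+
+    Example:
+        A minimal, illustrative invocation (the argument values shown are simply
+        the documented defaults; any supported device or runtime may be used)::
+
+            jobs = export_model(
+                device="Samsung Galaxy S23",
+                target_runtime=TargetRuntime.TFLITE,
+            )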
+ """ + model_name = "yolonas" + output_path = Path(output_dir or Path.cwd() / "build" / model_name) + if chipset: + hub_device = hub.Device(attributes=f"chipset:{chipset}") + else: + hub_device = hub.Device(name=device) + if not can_access_qualcomm_ai_hub(): + return export_without_hub_access( + "yolonas", + "Yolo-NAS", + device, + skip_profiling, + skip_inferencing, + skip_downloading, + skip_summary, + output_path, + target_runtime, + compile_options, + profile_options, + ) + + # 1. Initialize PyTorch model + model = Model.from_pretrained(**get_model_kwargs(Model, additional_model_kwargs)) + input_spec = model.get_input_spec( + **get_input_spec_kwargs(model, additional_model_kwargs) + ) + + # Trace the model + model.eval() + source_model = torch.jit.trace(model.to("cpu"), make_torch_inputs(input_spec)) + + # Convert outputs from channel last to channel first (preferred I/O format for QNN and TensorFlow Lite) + channel_last_flags = ( + " --force_channel_last_input image" + if target_runtime != TargetRuntime.ORT + else "" + ) + + # 2. Compile the model to an on-device asset + model_compile_options = model.get_hub_compile_options( + target_runtime, compile_options + channel_last_flags, hub_device + ) + print(f"Optimizing model {model_name} to run on-device") + submitted_compile_job = hub.submit_compile_job( + model=source_model, + input_specs=input_spec, + device=hub_device, + name=model_name, + options=model_compile_options, + ) + compile_job = cast(hub.client.CompileJob, submitted_compile_job) + + # 3. Profile the model asset on real devices + profile_job: Optional[hub.client.ProfileJob] = None + if not skip_profiling: + profile_options_all = model.get_hub_profile_options( + target_runtime, profile_options + ) + print(f"Profiling model {model_name} on a hosted device.") + submitted_profile_job = hub.submit_profile_job( + model=compile_job.get_target_model(), + device=hub_device, + name=model_name, + options=profile_options_all, + ) + profile_job = cast(hub.client.ProfileJob, submitted_profile_job) + + # 4. Run inference on-device with sample inputs + inference_job: Optional[hub.client.InferenceJob] = None + if not skip_inferencing: + profile_options_all = model.get_hub_profile_options( + target_runtime, profile_options + ) + print( + f"Running inference for {model_name} on a hosted device with example inputs." + ) + sample_inputs = model.sample_inputs(input_spec) + # Convert inputs from channel first to channel last + hub_inputs = ( + sample_inputs + if target_runtime == TargetRuntime.ORT + else transpose_channel_first_to_last("image", sample_inputs, target_runtime) + ) + submitted_inference_job = hub.submit_inference_job( + model=compile_job.get_target_model(), + inputs=hub_inputs, + device=hub_device, + name=model_name, + options=profile_options_all, + ) + inference_job = cast(hub.client.InferenceJob, submitted_inference_job) + + # 5. Download the model asset to a local file + if not skip_downloading: + os.makedirs(output_path, exist_ok=True) + target_model: hub.Model = compile_job.get_target_model() # type: ignore + target_model.download(str(output_path / f"{model_name}.tflite")) + + # 6. 
Summarize the results from profiling and inference + if not skip_summary and not skip_profiling: + assert profile_job is not None and profile_job.wait().success + profile_data: Dict[str, Any] = profile_job.download_profile() # type: ignore + print_profile_metrics_from_job(profile_job, profile_data) + + if not skip_summary and not skip_inferencing: + torch_out = torch_inference(model, sample_inputs) + assert inference_job is not None and inference_job.wait().success + inference_result: hub.client.DatasetEntries = inference_job.download_output_data() # type: ignore + print_inference_metrics( + inference_job, inference_result, torch_out, outputs_to_skip=[2] + ) + + if not skip_summary: + print_on_target_demo_cmd(compile_job, Path(__file__).parent.resolve(), device) + + return (compile_job, profile_job, inference_job) + + +def main(): + warnings.filterwarnings("ignore") + parser = export_parser(model_cls=Model) + args = parser.parse_args() + export_model(**vars(args)) + + +if __name__ == "__main__": + main() diff --git a/qai_hub_models/models/yolonas/info.yaml b/qai_hub_models/models/yolonas/info.yaml new file mode 100644 index 00000000..b2b6b9e2 --- /dev/null +++ b/qai_hub_models/models/yolonas/info.yaml @@ -0,0 +1,40 @@ +name: Yolo-NAS +# id must match with the model dir name in qai_hub_models +id: yolonas +status: public +headline: Real-time object detection optimized for mobile and edge. +domain: Computer Vision +description: YoloNAS is a machine learning model that predicts bounding boxes and classes + of objects in an image. +use_case: Object Detection +tags: + - real-time +research_paper: https://deci.ai/blog/yolo-nas-object-detection-foundation-model/ +research_paper_title: 'YOLO-NAS by Deci Achieves SOTA Performance on Object Detection Using Neural Architecture Search' +license: https://github.com/Deci-AI/super-gradients/blob/master/LICENSE.YOLONAS.md +deploy_license: https://qaihub-public-assets.s3.us-west-2.amazonaws.com/qai-hub-models/Qualcomm+AI+Hub+Proprietary+License.pdf +source_repo: https://github.com/Deci-AI/super-gradients +technical_details: + Model checkpoint: YoloNAS Small + Input resolution: 640x640 + Number of parameters: 12.2M + Model size: 46.6 MB +applicable_scenarios: + - Factory Automation + - Robotic Navigation + - Camera +related_models: + - yolov6 + - yolov7 + - yolov8_det +form_factors: + - Phone + - Tablet + - IoT + - XR +has_static_banner: yes +has_animated_banner: yes +license_type: apache-2.0 +deploy_license_type: AI Model Hub License +dataset: + - COCO diff --git a/qai_hub_models/models/yolonas/model.py b/qai_hub_models/models/yolonas/model.py new file mode 100644 index 00000000..b2f5e62f --- /dev/null +++ b/qai_hub_models/models/yolonas/model.py @@ -0,0 +1,160 @@ +# --------------------------------------------------------------------- +# Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. 
+# SPDX-License-Identifier: BSD-3-Clause +# --------------------------------------------------------------------- +from __future__ import annotations + +import os +import sys + +import torch + +from qai_hub_models.evaluators.base_evaluators import BaseEvaluator +from qai_hub_models.evaluators.detection_evaluator import DetectionEvaluator +from qai_hub_models.models._shared.yolo.utils import yolo_sample_inputs +from qai_hub_models.models.common import SampleInputsType +from qai_hub_models.utils.asset_loaders import SourceAsRoot, find_replace_in_repo +from qai_hub_models.utils.base_model import BaseModel +from qai_hub_models.utils.input_spec import InputSpec + +SOURCE_REPOSITORY = "https://github.com/Deci-AI/super-gradients/" +SOURCE_REPO_COMMIT = "00a1f86da1a5bfdbbac44bfeda177de9439f4c73" +MODEL_ID = __name__.split(".")[-2] +DEFAULT_WEIGHTS = "yolo_nas_s" +MODEL_ASSET_VERSION = 1 +YOLO_HEAD_FILE = ( + "src/super_gradients/training/models/detection_models/pp_yolo_e/pp_yolo_head.py" +) +DFL_HEAD_FILE = ( + "src/super_gradients/training/models/detection_models/yolo_nas/dfl_heads.py" +) + + +class YoloNAS(BaseModel): + """Exportable YoloNAS bounding box detector, end-to-end.""" + + def __init__( + self, + model: torch.nn.Module, + include_postprocessing: bool = True, + class_dtype: torch.dtype = torch.float32, + ) -> None: + super().__init__() + self.model = model + self.include_postprocessing = include_postprocessing + self.class_dtype = class_dtype + + # All image input spatial dimensions should be a multiple of this stride. + STRIDE_MULTIPLE = 32 + + def get_evaluator(self) -> BaseEvaluator: + return DetectionEvaluator(*self.get_input_spec()["image"][0][2:]) + + @classmethod + def from_pretrained( + cls, + weights_name: str = DEFAULT_WEIGHTS, + include_postprocessing: bool = True, + ): + with SourceAsRoot( + SOURCE_REPOSITORY, + SOURCE_REPO_COMMIT, + MODEL_ID, + MODEL_ASSET_VERSION, + ) as repo_root: + # There are some places where the input shape is derived dynamically + # from tensors that doesn't play nice with AIMET. Set the `eval_size` + # based on the model input spec and use that instead to derive shapes. + find_replace_in_repo( + repo_root, + YOLO_HEAD_FILE, + "feats: Tuple[Tensor, ...],\n", + "feats: Tuple[Tensor, ...], eval_size: Tuple[Tensor, Tensor],\n", + ) + find_replace_in_repo( + repo_root, + YOLO_HEAD_FILE, + "_, _, h, w = feat.shape", + "h, w = (eval_size[0] // stride, eval_size[1] // stride)", + ) + find_replace_in_repo( + repo_root, + DFL_HEAD_FILE, + "feats, self.fpn_strides", + "feats, self.eval_size, self.fpn_strides", + ) + find_replace_in_repo( + repo_root, DFL_HEAD_FILE, "if feats is not None:", "if False:" + ) + find_replace_in_repo( + repo_root, DFL_HEAD_FILE, "if self.eval_size:", "if False:" + ) + find_replace_in_repo( + repo_root, DFL_HEAD_FILE, "dtype=dtype", "dtype=torch.float32" + ) + find_replace_in_repo( + repo_root, DFL_HEAD_FILE, "device=device", "device='cpu'" + ) + + os.chdir("src") + sys.path.append(".") + + from super_gradients.training import models + + model = models.get(weights_name, pretrained_weights="coco") + input_size = cls.get_input_spec()["image"][0] + model.prep_model_for_conversion(input_size=input_size) + model.heads.eval_size = input_size[2:] + return cls(model.eval(), include_postprocessing) + + def forward(self, image): + """ + Run YoloNAS on `image`, and produce a predicted set of bounding boxes and associated class probabilities. + + Parameters: + image: Pixel values pre-processed for encoder consumption. 
+ Range: float[0, 1] + 3-channel Color Space: BGR + + Returns: + If self.include_postprocessing: + boxes: torch.Tensor + Bounding box locations. Shape [batch, num preds, 4] where 4 == (left_x, top_y, right_x, bottom_y) + scores: torch.Tensor + Confidence score that the given box is the predicted class: Shape is [batch, num_preds] + class_idx: torch.tensor + Shape is [batch, num_preds] where the last dim is the index of the most probable class of the prediction. + else: + boxes: torch.Tensor + Bounding box locations. Shape [batch, num preds, 4] where 4 == (left_x, top_y, right_x, bottom_y) + scores: torch.Tensor + Probability distribution over the classes for each box prediction. + Shape is [batch, num_preds, num_classes] + """ + out = self.model(image) + if isinstance(out[0], tuple): + out = out[0] + boxes, scores = out + if not self.include_postprocessing: + return boxes, scores + scores, class_idx = torch.max(scores, -1, keepdim=False) + return boxes, scores, class_idx.to(self.class_dtype) + + @staticmethod + def get_input_spec( + batch_size: int = 1, + num_channels: int = 3, + height: int = 640, + width: int = 640, + ) -> InputSpec: + """ + Returns the input specification (name -> (shape, type). This can be + used to submit profiling job on Qualcomm AI Hub. + """ + return {"image": ((batch_size, num_channels, height, width), "float32")} + + def sample_inputs(self, input_spec: InputSpec | None = None) -> SampleInputsType: + if input_spec is not None and input_spec != self.get_input_spec(): + raise ValueError("Sample input has a fixed size that cannot be changed") + + return yolo_sample_inputs() diff --git a/qai_hub_models/models/yolonas/perf.yaml b/qai_hub_models/models/yolonas/perf.yaml new file mode 100644 index 00000000..56ca583e --- /dev/null +++ b/qai_hub_models/models/yolonas/perf.yaml @@ -0,0 +1,234 @@ +aggregated: + supported_oses: + - Android + supported_devices: + - Google Pixel 3 + - Google Pixel 3a + - Google Pixel 3a XL + - Google Pixel 4 + - Google Pixel 4a + - Google Pixel 5a 5G + - QCS8550 (Proxy) + - Samsung Galaxy S21 + - Samsung Galaxy S21 Ultra + - Samsung Galaxy S21+ + - Samsung Galaxy S22 5G + - Samsung Galaxy S22 Ultra 5G + - Samsung Galaxy S22+ 5G + - Samsung Galaxy S23 + - Samsung Galaxy S23 Ultra + - Samsung Galaxy S23+ + - Samsung Galaxy S24 + - Samsung Galaxy S24 Ultra + - Samsung Galaxy S24+ + - Samsung Galaxy Tab S8 + - Snapdragon X Elite CRD + - Xiaomi 12 + - Xiaomi 12 Pro + supported_chipsets: + - Qcs8550 + - Snapdragon® 8 Gen 1 + - Snapdragon® 8 Gen 2 + - Snapdragon® 8 Gen 3 + - Snapdragon® 888 + - Snapdragon® X Elite +models: +- name: Yolo-NAS + performance_metrics: + - torchscript_onnx_tflite: + inference_time: 11744.0 + throughput: 85.14986376021798 + estimated_peak_memory_range: + min: 20480 + max: 7339120 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 201 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 201 + job_id: j1pvw6ymg + job_status: Passed + torchscript_onnx_qnn: + inference_time: 14893.0 + throughput: 67.14563889075404 + estimated_peak_memory_range: + min: 6094848 + max: 24240072 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 289 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 289 + job_id: jygz73q6p + job_status: Passed + torchscript_onnx_ort: + inference_time: 9987.0 + throughput: 100.13016921998599 + estimated_peak_memory_range: + min: 32768 + max: 59395840 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 290 + layers_on_gpu: 0 + 
layers_on_cpu: 0 + total_layers: 290 + job_id: jvgdvxylg + job_status: Passed + reference_device_info: + name: Samsung Galaxy S23 + os: '13' + form_factor: Phone + os_name: Android + manufacturer: Samsung + chipset: Snapdragon® 8 Gen 2 + timestamp: '2024-05-20T16:35:31.684983Z' + - torchscript_onnx_tflite: + inference_time: 8017.0 + throughput: 124.73493825620557 + estimated_peak_memory_range: + min: 229376 + max: 96302464 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 201 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 201 + job_id: j7gjlv68p + job_status: Passed + torchscript_onnx_qnn: + inference_time: 10167.0 + throughput: 98.35743090390478 + estimated_peak_memory_range: + min: 4931584 + max: 93285616 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 289 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 289 + job_id: jz5w9e0jp + job_status: Passed + torchscript_onnx_ort: + inference_time: 6706.0 + throughput: 149.1201908738443 + estimated_peak_memory_range: + min: 4931584 + max: 51901312 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 290 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 290 + job_id: jz57dymr5 + job_status: Passed + reference_device_info: + name: Samsung Galaxy S24 + os: '14' + form_factor: Phone + os_name: Android + manufacturer: Samsung + chipset: Snapdragon® 8 Gen 3 + timestamp: '2024-05-20T16:35:31.685021Z' + - torchscript_onnx_tflite: + inference_time: 11751.0 + throughput: 85.09914049868097 + estimated_peak_memory_range: + min: 249856 + max: 7448824 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 201 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 201 + job_id: jlpevd005 + job_status: Passed + torchscript_onnx_qnn: + inference_time: 15283.0 + throughput: 65.43217954590068 + estimated_peak_memory_range: + min: 4947968 + max: 24255016 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 289 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 289 + job_id: jnp184klg + job_status: Passed + reference_device_info: + name: QCS8550 (Proxy) + os: '12' + form_factor: Iot + os_name: Android + manufacturer: Qualcomm + chipset: Qcs8550 + timestamp: '2024-05-20T16:35:31.685050Z' + - torchscript_onnx_qnn: + inference_time: 11900.0 + throughput: 84.03361344537815 + estimated_peak_memory_range: + min: 4923392 + max: 4923392 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 289 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 289 + job_id: jmg94l7v5 + job_status: Passed + torchscript_onnx_ort: + inference_time: 10117.0 + throughput: 98.84353069091628 + estimated_peak_memory_range: + min: 15732736 + max: 15732736 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 290 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 290 + job_id: jqp4wl7lg + job_status: Passed + torchscript_onnx_ort_dml_gpu: + inference_time: 49609.0 + throughput: 20.157632687617166 + estimated_peak_memory_range: + min: 70164480 + max: 70164480 + primary_compute_unit: CPU + precision: fp32 + layer_info: + layers_on_npu: 0 + layers_on_gpu: 0 + layers_on_cpu: 186 + total_layers: 186 + job_id: j0px1kq9g + job_status: Passed + reference_device_info: + name: Snapdragon X Elite CRD + os: '11' + form_factor: Compute + os_name: Windows + manufacturer: Qualcomm + chipset: Snapdragon® X Elite + timestamp: '2024-05-20T16:35:31.685076Z' diff --git a/qai_hub_models/models/yolonas/requirements.txt 
b/qai_hub_models/models/yolonas/requirements.txt new file mode 100644 index 00000000..b6f0ec66 --- /dev/null +++ b/qai_hub_models/models/yolonas/requirements.txt @@ -0,0 +1,9 @@ +object-detection-metrics==0.4.post1 +stringcase==1.2.0 +rapidfuzz==3.8.1 +treelib==1.6.1 +imagesize==1.4.1 +einops==0.3.2 +Deprecated==1.2.11 +data-gradients==0.3.1 +shapely==2.0.3 diff --git a/qai_hub_models/models/yolonas/test.py b/qai_hub_models/models/yolonas/test.py new file mode 100644 index 00000000..19ce0b85 --- /dev/null +++ b/qai_hub_models/models/yolonas/test.py @@ -0,0 +1,39 @@ +# --------------------------------------------------------------------- +# Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. +# SPDX-License-Identifier: BSD-3-Clause +# --------------------------------------------------------------------- +from qai_hub_models.models.yolonas.app import YoloNASDetectionApp +from qai_hub_models.models.yolonas.demo import main as demo_main +from qai_hub_models.models.yolonas.model import MODEL_ASSET_VERSION, MODEL_ID, YoloNAS +from qai_hub_models.models.yolov7.demo import IMAGE_ADDRESS +from qai_hub_models.utils.asset_loaders import ( + CachedWebModelAsset, + load_image, + load_numpy, +) +from qai_hub_models.utils.bounding_box_processing import get_iou +from qai_hub_models.utils.testing import skip_clone_repo_check + +GT_BOXES = CachedWebModelAsset.from_asset_store( + MODEL_ID, MODEL_ASSET_VERSION, "yolonas_boxes.npy" +) + + +@skip_clone_repo_check +def test_task(): + image = load_image(IMAGE_ADDRESS) + app = YoloNASDetectionApp(YoloNAS.from_pretrained(), nms_score_threshold=0.7) + boxes = app.predict_boxes_from_image(image, raw_output=True)[0][0].numpy() + print(boxes.shape) + boxes_gt = load_numpy(GT_BOXES) + boxes = sorted(boxes, key=lambda box: box[0]) + boxes_gt = sorted(boxes_gt, key=lambda box: box[0]) + assert len(boxes) == len(boxes_gt) + ious = [get_iou(box, box_gt) for box, box_gt in zip(boxes, boxes_gt)] + for iou in ious: + assert iou > 0.95 + + +@skip_clone_repo_check +def test_demo(): + demo_main(is_test=True) diff --git a/qai_hub_models/models/yolonas_quantized/README.md b/qai_hub_models/models/yolonas_quantized/README.md new file mode 100644 index 00000000..eed10e62 --- /dev/null +++ b/qai_hub_models/models/yolonas_quantized/README.md @@ -0,0 +1,61 @@ +[![Qualcomm® AI Hub Models](https://qaihub-public-assets.s3.us-west-2.amazonaws.com/qai-hub-models/quic-logo.jpg)](../../README.md) + + +# [Yolo-NAS-Quantized: Quantized real-time object detection optimized for mobile and edge](https://aihub.qualcomm.com/models/yolonas_quantized) + +YoloNAS is a machine learning model that predicts bounding boxes and classes of objects in an image. This model is post-training quantized to int8 using samples from the COCO dataset. + +This is based on the implementation of Yolo-NAS-Quantized found +[here](https://github.com/Deci-AI/super-gradients). This repository contains scripts for optimized on-device +export suitable to run on Qualcomm® devices. More details on model performance +across various devices can be found [here](https://aihub.qualcomm.com/models/yolonas_quantized). + +[Sign up](https://myaccount.qualcomm.com/signup) for early access to run these models on +a hosted Qualcomm® device.
+ + + + +## Example & Usage + +Install the package via pip: +```bash +pip install "qai_hub_models[yolonas_quantized]" +``` + + +Once installed, run the following simple CLI demo: + +```bash +python -m qai_hub_models.models.yolonas_quantized.demo +``` +More details on the CLI tool can be found with the `--help` option. See +[demo.py](demo.py) for sample usage of the model including pre/post processing +scripts. Please refer to our [general instructions on using +models](../../../#getting-started) for more usage instructions. + +## Export for on-device deployment + +This repository contains export scripts that produce a model optimized for +on-device deployment. This can be run as follows: + +```bash +python -m qai_hub_models.models.yolonas_quantized.export +``` +Additional options are documented with the `--help` option. Note that the above +script requires access to Deployment instructions for Qualcomm® AI Hub. + +## License +- The license for the original implementation of Yolo-NAS-Quantized can be found + [here](https://github.com/Deci-AI/super-gradients/blob/master/LICENSE.YOLONAS.md). +- The license for the compiled assets for on-device deployment can be found [here]({deploy_license_url}) + +## References +* [YOLO-NAS by Deci Achieves SOTA Performance on Object Detection Using Neural Architecture Search](https://deci.ai/blog/yolo-nas-object-detection-foundation-model/) +* [Source Model Implementation](https://github.com/Deci-AI/super-gradients) + +## Community +* Join [our AI Hub Slack community](https://qualcomm-ai-hub.slack.com/join/shared_invite/zt-2d5zsmas3-Sj0Q9TzslueCjS31eXG2UA#/shared-invite/email) to collaborate, post questions and learn more about on-device AI. +* For questions or feedback please [reach out to us](mailto:ai-hub-support@qti.qualcomm.com). + + diff --git a/qai_hub_models/models/yolonas_quantized/__init__.py b/qai_hub_models/models/yolonas_quantized/__init__.py new file mode 100644 index 00000000..28fd836f --- /dev/null +++ b/qai_hub_models/models/yolonas_quantized/__init__.py @@ -0,0 +1,8 @@ +# --------------------------------------------------------------------- +# Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. +# SPDX-License-Identifier: BSD-3-Clause +# --------------------------------------------------------------------- +from qai_hub_models.models.yolonas.app import YoloNASDetectionApp as App # noqa: F401 + +from .model import MODEL_ID # noqa: F401 +from .model import YoloNASQuantizable as Model # noqa: F401 diff --git a/qai_hub_models/models/yolonas_quantized/conftest.py b/qai_hub_models/models/yolonas_quantized/conftest.py new file mode 100644 index 00000000..61cc2334 --- /dev/null +++ b/qai_hub_models/models/yolonas_quantized/conftest.py @@ -0,0 +1,39 @@ +# --------------------------------------------------------------------- +# Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. +# SPDX-License-Identifier: BSD-3-Clause +# --------------------------------------------------------------------- +# THIS FILE WAS AUTO-GENERATED. DO NOT EDIT MANUALLY. + +import inspect + +import pytest + +from qai_hub_models.models.yolonas_quantized import Model +from qai_hub_models.utils.testing import skip_clone_repo_check + + +# Instantiate the model only once for all tests. +# Mock from_pretrained to always return the initialized model. +# This speeds up tests and limits memory leaks. 
+@pytest.fixture(scope="module", autouse=True) +def cached_from_pretrained(): + with pytest.MonkeyPatch.context() as mp: + pretrained_cache = {} + from_pretrained = Model.from_pretrained + sig = inspect.signature(from_pretrained) + + @skip_clone_repo_check + def _cached_from_pretrained(*args, **kwargs): + cache_key = str(args) + str(kwargs) + model = pretrained_cache.get(cache_key, None) + if model: + return model + else: + model = from_pretrained(*args, **kwargs) + pretrained_cache[cache_key] = model + return model + + _cached_from_pretrained.__signature__ = sig + + mp.setattr(Model, "from_pretrained", _cached_from_pretrained) + yield mp diff --git a/qai_hub_models/models/yolonas_quantized/demo.py b/qai_hub_models/models/yolonas_quantized/demo.py new file mode 100644 index 00000000..fb2e64a3 --- /dev/null +++ b/qai_hub_models/models/yolonas_quantized/demo.py @@ -0,0 +1,24 @@ +# --------------------------------------------------------------------- +# Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. +# SPDX-License-Identifier: BSD-3-Clause +# --------------------------------------------------------------------- +from qai_hub_models.models._shared.yolo.demo import yolo_detection_demo +from qai_hub_models.models.yolonas.app import YoloNASDetectionApp +from qai_hub_models.models.yolonas_quantized.model import MODEL_ID, YoloNASQuantizable +from qai_hub_models.models.yolov7.demo import IMAGE_ADDRESS + + +def main(is_test: bool = False): + yolo_detection_demo( + YoloNASQuantizable, + MODEL_ID, + YoloNASDetectionApp, + IMAGE_ADDRESS, + YoloNASQuantizable.STRIDE_MULTIPLE, + is_test=is_test, + default_score_threshold=0.7, + ) + + +if __name__ == "__main__": + main() diff --git a/qai_hub_models/models/yolonas_quantized/export.py b/qai_hub_models/models/yolonas_quantized/export.py new file mode 100644 index 00000000..4f5733e9 --- /dev/null +++ b/qai_hub_models/models/yolonas_quantized/export.py @@ -0,0 +1,225 @@ +# --------------------------------------------------------------------- +# Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. +# SPDX-License-Identifier: BSD-3-Clause +# --------------------------------------------------------------------- +# THIS FILE WAS AUTO-GENERATED. DO NOT EDIT MANUALLY. 
+ + +from __future__ import annotations + +import os +import warnings +from pathlib import Path +from typing import Any, Dict, List, Optional, Tuple, cast + +import qai_hub as hub + +from qai_hub_models.models.yolonas_quantized import Model +from qai_hub_models.utils.args import ( + export_parser, + get_input_spec_kwargs, + get_model_kwargs, +) +from qai_hub_models.utils.base_model import TargetRuntime +from qai_hub_models.utils.compare import torch_inference +from qai_hub_models.utils.printing import ( + print_inference_metrics, + print_on_target_demo_cmd, + print_profile_metrics_from_job, +) +from qai_hub_models.utils.qai_hub_helpers import ( + can_access_qualcomm_ai_hub, + export_without_hub_access, + transpose_channel_first_to_last, +) +from qai_hub_models.utils.qnn_helpers import get_qnn_inputs + + +def export_model( + device: str = "Samsung Galaxy S23", + chipset: Optional[str] = None, + skip_profiling: bool = False, + skip_inferencing: bool = False, + skip_downloading: bool = False, + skip_summary: bool = False, + output_dir: Optional[str] = None, + target_runtime: TargetRuntime = TargetRuntime.TFLITE, + compile_options: str = "", + profile_options: str = "", + **additional_model_kwargs, +) -> Tuple[hub.CompileJob, Optional[hub.ProfileJob], Optional[hub.InferenceJob]] | List[ + str +]: + """ + This function accomplishes 6 main tasks: + + 1. Instantiates a PyTorch model and converts it to a traced TorchScript format. + 2. Compiles the model to an asset that can be run on device. + 3. Profiles the model performance on real devices. + 4. Inferences the model on sample inputs. + 5. Downloads the model asset to the local directory. + 6. Summarizes the results from profiling and inference. + + Each of the last four steps can be optionally skipped using the input options. + + Parameters: + device: Device for which to export the model. + Full list of available devices can be found by running `hub.get_devices()`. + Defaults to DEFAULT_DEVICE if not specified. + chipset: If set, will choose a random device with this chipset. + Overrides the `device` argument. + skip_profiling: If set, skips profiling of compiled model on real devices. + skip_inferencing: If set, skips computing on-device outputs from sample data. + skip_downloading: If set, skips downloading of compiled model. + skip_summary: If set, skips waiting for and summarizing results + from profiling and inference. + output_dir: Directory to store generated assets (e.g. compiled model). + Defaults to `/build/`. + target_runtime: Which on-device runtime to target. Default is TFLite. + compile_options: Additional options to pass when submitting the compile job. + profile_options: Additional options to pass when submitting the profile job. + **additional_model_kwargs: Additional optional kwargs used to customize + `model_cls.from_pretrained` and `model.get_input_spec` + + Returns: + A 3-tuple of: + * A CompileJob object containing metadata about the compile job submitted to hub. + * A ProfileJob containing metadata about the profile job (None if profiling skipped). + * An InferenceJob containing metadata about the inference job (None if inferencing skipped). 
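+
+    Example:
+        An illustrative invocation that only compiles the model, skipping the
+        device-dependent steps (all keyword arguments are parameters documented
+        above)::
+
+            jobs = export_model(
+                device="Samsung Galaxy S23",
+                skip_profiling=True,
+                skip_inferencing=True,
+            )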
+ """ + model_name = "yolonas_quantized" + output_path = Path(output_dir or Path.cwd() / "build" / model_name) + if chipset: + hub_device = hub.Device(attributes=f"chipset:{chipset}") + else: + hub_device = hub.Device(name=device) + if not can_access_qualcomm_ai_hub(): + return export_without_hub_access( + "yolonas_quantized", + "Yolo-NAS-Quantized", + device, + skip_profiling, + skip_inferencing, + skip_downloading, + skip_summary, + output_path, + target_runtime, + compile_options, + profile_options, + ) + + # 1. Initialize PyTorch model + model = Model.from_pretrained(**get_model_kwargs(Model, additional_model_kwargs)) + input_spec = model.get_input_spec( + **get_input_spec_kwargs(model, additional_model_kwargs) + ) + + # Trace the model + source_model = model.convert_to_hub_source_model( + target_runtime, output_path, input_spec + ) + if target_runtime == TargetRuntime.TFLITE: + quant_calibration_data = None + else: + quant_calibration_data = model.get_calibration_data(target_runtime, input_spec) + + # Convert outputs from channel last to channel first (preferred I/O format for QNN and TensorFlow Lite) + channel_last_flags = ( + " --force_channel_last_input image" + if target_runtime != TargetRuntime.ORT + else "" + ) + + # 2. Compile the model to an on-device asset + model_compile_options = model.get_hub_compile_options( + target_runtime, compile_options + channel_last_flags, hub_device + ) + print(f"Optimizing model {model_name} to run on-device") + submitted_compile_job = hub.submit_compile_job( + model=source_model, + input_specs=input_spec, + device=hub_device, + name=model_name, + calibration_data=quant_calibration_data, + options=model_compile_options, + ) + compile_job = cast(hub.client.CompileJob, submitted_compile_job) + + # 3. Profile the model asset on real devices + profile_job: Optional[hub.client.ProfileJob] = None + if not skip_profiling: + profile_options_all = model.get_hub_profile_options( + target_runtime, profile_options + ) + print(f"Profiling model {model_name} on a hosted device.") + submitted_profile_job = hub.submit_profile_job( + model=compile_job.get_target_model(), + device=hub_device, + name=model_name, + options=profile_options_all, + ) + profile_job = cast(hub.client.ProfileJob, submitted_profile_job) + + # 4. Run inference on-device with sample inputs + inference_job: Optional[hub.client.InferenceJob] = None + if not skip_inferencing: + profile_options_all = model.get_hub_profile_options( + target_runtime, profile_options + ) + print( + f"Running inference for {model_name} on a hosted device with example inputs." + ) + sample_inputs = model.sample_inputs(input_spec) + hub_inputs = sample_inputs + if target_runtime == TargetRuntime.QNN: + hub_inputs = get_qnn_inputs(compile_job, sample_inputs) + # Convert inputs from channel first to channel last + hub_inputs = ( + sample_inputs + if target_runtime == TargetRuntime.ORT + else transpose_channel_first_to_last("image", sample_inputs, target_runtime) + ) + submitted_inference_job = hub.submit_inference_job( + model=compile_job.get_target_model(), + inputs=hub_inputs, + device=hub_device, + name=model_name, + options=profile_options_all, + ) + inference_job = cast(hub.client.InferenceJob, submitted_inference_job) + + # 5. Download the model asset to a local file + if not skip_downloading: + os.makedirs(output_path, exist_ok=True) + target_model: hub.Model = compile_job.get_target_model() # type: ignore + target_model.download(str(output_path / f"{model_name}.tflite")) + + # 6. 
Summarize the results from profiling and inference + if not skip_summary and not skip_profiling: + assert profile_job is not None and profile_job.wait().success + profile_data: Dict[str, Any] = profile_job.download_profile() # type: ignore + print_profile_metrics_from_job(profile_job, profile_data) + + if not skip_summary and not skip_inferencing: + torch_out = torch_inference(model, sample_inputs) + assert inference_job is not None and inference_job.wait().success + inference_result: hub.client.DatasetEntries = inference_job.download_output_data() # type: ignore + print_inference_metrics( + inference_job, inference_result, torch_out, outputs_to_skip=[2] + ) + + if not skip_summary: + print_on_target_demo_cmd(compile_job, Path(__file__).parent.resolve(), device) + + return (compile_job, profile_job, inference_job) + + +def main(): + warnings.filterwarnings("ignore") + parser = export_parser(model_cls=Model, supports_qnn=False, supports_ort=False) + args = parser.parse_args() + export_model(**vars(args)) + + +if __name__ == "__main__": + main() diff --git a/qai_hub_models/models/yolonas_quantized/info.yaml b/qai_hub_models/models/yolonas_quantized/info.yaml new file mode 100644 index 00000000..d3a0e9d4 --- /dev/null +++ b/qai_hub_models/models/yolonas_quantized/info.yaml @@ -0,0 +1,42 @@ +name: Yolo-NAS-Quantized +# id must match with the model dir name in qai_hub_models +id: yolonas_quantized +status: public +headline: Quantized real-time object detection optimized for mobile and edge. +domain: Computer Vision +description: YoloNAS is a machine learning model that predicts bounding boxes and classes + of objects in an image. This model is post-training quantized to int8 using samples + from the COCO dataset. +use_case: Object Detection +tags: + - real-time + - quantized +research_paper: https://deci.ai/blog/yolo-nas-object-detection-foundation-model/ +research_paper_title: 'YOLO-NAS by Deci Achieves SOTA Performance on Object Detection Using Neural Architecture Search' +license: https://github.com/Deci-AI/super-gradients/blob/master/LICENSE.YOLONAS.md +deploy_license: https://qaihub-public-assets.s3.us-west-2.amazonaws.com/qai-hub-models/Qualcomm+AI+Hub+Proprietary+License.pdf +source_repo: https://github.com/Deci-AI/super-gradients +technical_details: + Model checkpoint: YoloNAS Small + Input resolution: 640x640 + Number of parameters: 12.2M + Model size: 12.1 MB +applicable_scenarios: + - Factory Automation + - Robotic Navigation + - Camera +related_models: + - yolov6 + - yolov7 + - yolov8_det +form_factors: + - Phone + - Tablet + - IoT + - XR +has_static_banner: yes +has_animated_banner: yes +license_type: apache-2.0 +deploy_license_type: AI Model Hub License +dataset: + - COCO diff --git a/qai_hub_models/models/yolonas_quantized/model.py b/qai_hub_models/models/yolonas_quantized/model.py new file mode 100644 index 00000000..52c5fb5e --- /dev/null +++ b/qai_hub_models/models/yolonas_quantized/model.py @@ -0,0 +1,93 @@ +# --------------------------------------------------------------------- +# Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. +# SPDX-License-Identifier: BSD-3-Clause +# --------------------------------------------------------------------- +from __future__ import annotations + +# isort: off +# This verifies aimet is installed, and this must be included first. 
+from qai_hub_models.utils.quantization_aimet import ( + AIMETQuantizableMixin, + constrain_quantized_inputs_to_image_range, +) + +# isort: on + +from typing import Optional + +import torch +from aimet_torch.cross_layer_equalization import equalize_model +from aimet_torch.model_preparer import prepare_model +from aimet_torch.quantsim import QuantizationSimModel, load_encodings_to_sim + +from qai_hub_models.models.yolonas.model import DEFAULT_WEIGHTS, YoloNAS +from qai_hub_models.utils.aimet.config_loader import get_default_aimet_config +from qai_hub_models.utils.asset_loaders import CachedWebModelAsset +from qai_hub_models.utils.quantization_aimet import tie_observers + +MODEL_ID = __name__.split(".")[-2] +MODEL_ASSET_VERSION = 1 +DEFAULT_ENCODINGS = "yolonas_quantized_encodings.json" + + +class YoloNASQuantizable(AIMETQuantizableMixin, YoloNAS): + """Exportable Quantized YoloNAS bounding box detector, end-to-end.""" + + def __init__( + self, + sim_model: QuantizationSimModel, + ) -> None: + # Sim model will already include postprocessing + torch.nn.Module.__init__(self) + AIMETQuantizableMixin.__init__(self, sim_model) + self.model = sim_model.model + + @classmethod + def from_pretrained( + cls, + weights_name: Optional[str] = DEFAULT_WEIGHTS, + aimet_encodings: str | None = "DEFAULT", + include_postprocessing: bool = True, + ) -> "YoloNASQuantizable": + """Load YoloNAS from a weightfile created by the source YoloNAS repository.""" + fp16_model = YoloNAS.from_pretrained( + weights_name, + include_postprocessing=include_postprocessing, + ) + fp16_model.class_dtype = torch.int8 + + input_shape = cls.get_input_spec()["image"][0] + + model = prepare_model(fp16_model) + equalize_model(model, input_shape) + + sim = QuantizationSimModel( + model, + quant_scheme="tf_enhanced", + default_param_bw=8, + default_output_bw=8, + config_file=get_default_aimet_config(), + dummy_input=torch.rand(input_shape), + ) + tie_observers(sim) + constrain_quantized_inputs_to_image_range(sim) + + if aimet_encodings: + if aimet_encodings == "DEFAULT": + aimet_encodings = CachedWebModelAsset.from_asset_store( + MODEL_ID, MODEL_ASSET_VERSION, DEFAULT_ENCODINGS + ).fetch() + load_encodings_to_sim(sim, aimet_encodings) + + sim.model.eval() + final_model = cls(sim) + return final_model + + def forward(self, image: torch.Tensor): + """ + Run YoloNASQuantizable on `image`, and produce a + predicted set of bounding boxes and associated class probabilities. + + See YoloNAS model for details. 
+ """ + return self.model(image) diff --git a/qai_hub_models/models/yolonas_quantized/perf.yaml b/qai_hub_models/models/yolonas_quantized/perf.yaml new file mode 100644 index 00000000..b30254ec --- /dev/null +++ b/qai_hub_models/models/yolonas_quantized/perf.yaml @@ -0,0 +1,286 @@ +aggregated: + supported_oses: + - Android + supported_devices: + - Google Pixel 3 + - Google Pixel 3a + - Google Pixel 3a XL + - Google Pixel 4 + - Google Pixel 4a + - Google Pixel 5a 5G + - QCS6490 (Proxy) + - QCS8250 (Proxy) + - QCS8550 (Proxy) + - RB3 Gen 2 (Proxy) + - RB5 (Proxy) + - Samsung Galaxy S21 + - Samsung Galaxy S21 Ultra + - Samsung Galaxy S21+ + - Samsung Galaxy S22 5G + - Samsung Galaxy S22 Ultra 5G + - Samsung Galaxy S22+ 5G + - Samsung Galaxy S23 + - Samsung Galaxy S23 Ultra + - Samsung Galaxy S23+ + - Samsung Galaxy S24 + - Samsung Galaxy S24 Ultra + - Samsung Galaxy S24+ + - Samsung Galaxy Tab S8 + - Snapdragon X Elite CRD + - Xiaomi 12 + - Xiaomi 12 Pro + supported_chipsets: + - Qcs6490 + - Qcs8250 + - Qcs8550 + - Snapdragon® 8 Gen 1 + - Snapdragon® 8 Gen 2 + - Snapdragon® 8 Gen 3 + - Snapdragon® 888 + - Snapdragon® X Elite +models: +- name: Yolo-NAS-Quantized + performance_metrics: + - torchscript_onnx_tflite: + inference_time: 6961.0 + throughput: 143.65752047119668 + estimated_peak_memory_range: + min: 9187328 + max: 12329304 + primary_compute_unit: NPU + precision: int8 + layer_info: + layers_on_npu: 200 + layers_on_gpu: 0 + layers_on_cpu: 3 + total_layers: 203 + job_id: jo5mzn7qp + job_status: Passed + torchscript_onnx_qnn: + inference_time: 'null' + throughput: 'null' + estimated_peak_memory_range: + min: 0 + max: 0 + primary_compute_unit: 'null' + precision: 'null' + layer_info: + layers_on_npu: 0 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 0 + job_id: jep2mk1m5 + job_status: Failed + torchscript_onnx_ort: + inference_time: 'null' + throughput: 'null' + estimated_peak_memory_range: + min: 0 + max: 0 + primary_compute_unit: 'null' + precision: 'null' + layer_info: + layers_on_npu: 0 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 0 + job_id: j1p87qn85 + job_status: Failed + reference_device_info: + name: Samsung Galaxy S23 + os: '13' + form_factor: Phone + os_name: Android + manufacturer: Samsung + chipset: Snapdragon® 8 Gen 2 + timestamp: '2024-05-20T16:35:31.715440Z' + - torchscript_onnx_tflite: + inference_time: 4940.0 + throughput: 202.42914979757086 + estimated_peak_memory_range: + min: 712704 + max: 62991232 + primary_compute_unit: NPU + precision: int8 + layer_info: + layers_on_npu: 200 + layers_on_gpu: 0 + layers_on_cpu: 3 + total_layers: 203 + job_id: jegne64mg + job_status: Passed + torchscript_onnx_qnn: + inference_time: 'null' + throughput: 'null' + estimated_peak_memory_range: + min: 0 + max: 0 + primary_compute_unit: 'null' + precision: 'null' + layer_info: + layers_on_npu: 0 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 0 + job_id: jqpyd1l4p + job_status: Failed + torchscript_onnx_ort: + inference_time: 'null' + throughput: 'null' + estimated_peak_memory_range: + min: 0 + max: 0 + primary_compute_unit: 'null' + precision: 'null' + layer_info: + layers_on_npu: 0 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 0 + job_id: jogkye1op + job_status: Failed + reference_device_info: + name: Samsung Galaxy S24 + os: '14' + form_factor: Phone + os_name: Android + manufacturer: Samsung + chipset: Snapdragon® 8 Gen 3 + timestamp: '2024-05-20T16:35:31.715466Z' + - torchscript_onnx_tflite: + inference_time: 6961.0 + throughput: 143.65752047119668 + 
estimated_peak_memory_range: + min: 102400 + max: 12952792 + primary_compute_unit: NPU + precision: int8 + layer_info: + layers_on_npu: 200 + layers_on_gpu: 0 + layers_on_cpu: 3 + total_layers: 203 + job_id: jopryvreg + job_status: Passed + torchscript_onnx_qnn: + inference_time: 'null' + throughput: 'null' + estimated_peak_memory_range: + min: 0 + max: 0 + primary_compute_unit: 'null' + precision: 'null' + layer_info: + layers_on_npu: 0 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 0 + job_id: j2p0rzwep + job_status: Failed + reference_device_info: + name: QCS8550 (Proxy) + os: '12' + form_factor: Iot + os_name: Android + manufacturer: Qualcomm + chipset: Qcs8550 + timestamp: '2024-05-20T16:35:31.715498Z' + - torchscript_onnx_tflite: + inference_time: 18142.0 + throughput: 55.120714364458166 + estimated_peak_memory_range: + min: 69632 + max: 59977872 + primary_compute_unit: NPU + precision: int8 + layer_info: + layers_on_npu: 200 + layers_on_gpu: 0 + layers_on_cpu: 1 + total_layers: 201 + job_id: jep2lzo4g + job_status: Passed + torchscript_onnx_qnn: + inference_time: 'null' + throughput: 'null' + estimated_peak_memory_range: + min: 0 + max: 0 + primary_compute_unit: 'null' + precision: 'null' + layer_info: + layers_on_npu: 0 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 0 + job_id: j1gl3qw8g + job_status: Failed + reference_device_info: + name: RB3 Gen 2 (Proxy) + os: '12' + form_factor: Iot + os_name: Android + manufacturer: Qualcomm + chipset: Qcs6490 + timestamp: '2024-05-20T16:35:31.715517Z' + - torchscript_onnx_tflite: + inference_time: 'null' + throughput: 'null' + estimated_peak_memory_range: + min: 0 + max: 0 + primary_compute_unit: 'null' + precision: 'null' + layer_info: + layers_on_npu: 0 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 0 + job_id: jqpy6y875 + job_status: Failed + reference_device_info: + name: RB5 (Proxy) + os: '12' + form_factor: Iot + os_name: Android + manufacturer: Qualcomm + chipset: Qcs8250 + timestamp: '2024-05-20T16:35:31.715528Z' + - torchscript_onnx_ort: + inference_time: 'null' + throughput: 'null' + estimated_peak_memory_range: + min: 0 + max: 0 + primary_compute_unit: 'null' + precision: 'null' + layer_info: + layers_on_npu: 0 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 0 + job_id: jn5q26nm5 + job_status: Failed + torchscript_onnx_ort_dml_gpu: + inference_time: 25114.0 + throughput: 39.81842796846381 + estimated_peak_memory_range: + min: 36032512 + max: 36032512 + primary_compute_unit: CPU + precision: fp32 + layer_info: + layers_on_npu: 0 + layers_on_gpu: 0 + layers_on_cpu: 279 + total_layers: 279 + job_id: j1glkvdlp + job_status: Passed + reference_device_info: + name: Snapdragon X Elite CRD + os: '11' + form_factor: Compute + os_name: Windows + manufacturer: Qualcomm + chipset: Snapdragon® X Elite + timestamp: '2024-05-20T16:35:31.715550Z' diff --git a/qai_hub_models/models/yolonas_quantized/requirements.txt b/qai_hub_models/models/yolonas_quantized/requirements.txt new file mode 100644 index 00000000..b6f0ec66 --- /dev/null +++ b/qai_hub_models/models/yolonas_quantized/requirements.txt @@ -0,0 +1,9 @@ +object-detection-metrics==0.4.post1 +stringcase==1.2.0 +rapidfuzz==3.8.1 +treelib==1.6.1 +imagesize==1.4.1 +einops==0.3.2 +Deprecated==1.2.11 +data-gradients==0.3.1 +shapely==2.0.3 diff --git a/qai_hub_models/models/yolonas_quantized/test.py b/qai_hub_models/models/yolonas_quantized/test.py new file mode 100644 index 00000000..2c1afe58 --- /dev/null +++ b/qai_hub_models/models/yolonas_quantized/test.py @@ -0,0 
+1,45 @@ +# --------------------------------------------------------------------- +# Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. +# SPDX-License-Identifier: BSD-3-Clause +# --------------------------------------------------------------------- +from qai_hub_models.models.yolonas.app import YoloNASDetectionApp +from qai_hub_models.models.yolonas_quantized.demo import main as demo_main +from qai_hub_models.models.yolonas_quantized.model import ( + MODEL_ASSET_VERSION, + MODEL_ID, + YoloNASQuantizable, +) +from qai_hub_models.models.yolov7.demo import IMAGE_ADDRESS +from qai_hub_models.utils.asset_loaders import ( + CachedWebModelAsset, + load_image, + load_numpy, +) +from qai_hub_models.utils.bounding_box_processing import get_iou +from qai_hub_models.utils.testing import skip_clone_repo_check + +GT_BOXES = CachedWebModelAsset.from_asset_store( + MODEL_ID, MODEL_ASSET_VERSION, "yolonas_boxes.npy" +) + + +@skip_clone_repo_check +def test_task(): + image = load_image(IMAGE_ADDRESS) + app = YoloNASDetectionApp( + YoloNASQuantizable.from_pretrained(), nms_score_threshold=0.7 + ) + boxes = app.predict_boxes_from_image(image, raw_output=True)[0][0].numpy() + print(boxes) + boxes_gt = load_numpy(GT_BOXES) + boxes = sorted(boxes, key=lambda box: box[0]) + boxes_gt = sorted(boxes_gt, key=lambda box: box[0]) + assert len(boxes) == len(boxes_gt) + ious = [get_iou(box, box_gt) for box, box_gt in zip(boxes, boxes_gt)] + for iou in ious: + assert iou > 0.75 + + +@skip_clone_repo_check +def test_demo(): + demo_main(is_test=True) diff --git a/qai_hub_models/models/yolov6/README.md b/qai_hub_models/models/yolov6/README.md index bddb6ad0..d3d4f458 100644 --- a/qai_hub_models/models/yolov6/README.md +++ b/qai_hub_models/models/yolov6/README.md @@ -14,6 +14,8 @@ accross various devices, can be found [here](https://aihub.qualcomm.com/models/y a hosted Qualcomm® device. + + ## Example & Usage diff --git a/qai_hub_models/models/yolov6/export.py b/qai_hub_models/models/yolov6/export.py index 06e18952..8dfb4ffe 100644 --- a/qai_hub_models/models/yolov6/export.py +++ b/qai_hub_models/models/yolov6/export.py @@ -119,9 +119,16 @@ def export_model( model.eval() source_model = torch.jit.trace(model.to("cpu"), make_torch_inputs(input_spec)) + # Convert outputs from channel last to channel first (preferred I/O format for QNN and TensorFlow Lite) + channel_last_flags = ( + " --force_channel_last_input image" + if target_runtime != TargetRuntime.ORT + else "" + ) + # 2. 
Compile the model to an on-device asset model_compile_options = model.get_hub_compile_options( - target_runtime, compile_options + " --force_channel_last_input image" + target_runtime, compile_options + channel_last_flags, hub_device ) print(f"Optimizing model {model_name} to run on-device") submitted_compile_job = hub.submit_compile_job( @@ -159,8 +166,10 @@ def export_model( ) sample_inputs = model.sample_inputs(input_spec) # Convert inputs from channel first to channel last - hub_inputs = transpose_channel_first_to_last( - "image", sample_inputs, target_runtime + hub_inputs = ( + sample_inputs + if target_runtime == TargetRuntime.ORT + else transpose_channel_first_to_last("image", sample_inputs, target_runtime) ) submitted_inference_job = hub.submit_inference_job( model=compile_job.get_target_model(), @@ -199,7 +208,7 @@ def export_model( def main(): warnings.filterwarnings("ignore") - parser = export_parser(model_cls=Model, supports_ort=False) + parser = export_parser(model_cls=Model) args = parser.parse_args() export_model(**vars(args)) diff --git a/qai_hub_models/models/yolov6/model.py b/qai_hub_models/models/yolov6/model.py index c30783cd..8960bd81 100644 --- a/qai_hub_models/models/yolov6/model.py +++ b/qai_hub_models/models/yolov6/model.py @@ -4,7 +4,6 @@ # --------------------------------------------------------------------- from __future__ import annotations -import tempfile from importlib import reload import torch @@ -15,6 +14,7 @@ CachedWebModelAsset, SourceAsRoot, load_path, + qaihm_temp_dir, ) from qai_hub_models.utils.base_model import BaseModel from qai_hub_models.utils.input_spec import InputSpec @@ -93,7 +93,7 @@ def get_input_spec( def _load_yolov6_source_model_from_weights( ckpt_path: str | CachedWebModelAsset, ) -> torch.nn.Module: - with tempfile.TemporaryDirectory() as tmpdir: + with qaihm_temp_dir() as tmpdir: model_path = load_path(ckpt_path, tmpdir) with SourceAsRoot( YOLOV6_SOURCE_REPOSITORY, diff --git a/qai_hub_models/models/yolov6/perf.yaml b/qai_hub_models/models/yolov6/perf.yaml index 49cd5919..a89eb61b 100644 --- a/qai_hub_models/models/yolov6/perf.yaml +++ b/qai_hub_models/models/yolov6/perf.yaml @@ -22,6 +22,7 @@ aggregated: - Samsung Galaxy S24 Ultra - Samsung Galaxy S24+ - Samsung Galaxy Tab S8 + - Snapdragon X Elite CRD - Xiaomi 12 - Xiaomi 12 Pro supported_chipsets: @@ -30,15 +31,16 @@ aggregated: - Snapdragon® 8 Gen 2 - Snapdragon® 8 Gen 3 - Snapdragon® 888 + - Snapdragon® X Elite models: - name: Yolo-v6 performance_metrics: - torchscript_onnx_tflite: - inference_time: 7953.0 - throughput: 125.7387149503332 + inference_time: 7322.0 + throughput: 136.5747063643813 estimated_peak_memory_range: - min: 2138112 - max: 5576840 + min: 225280 + max: 2559408 primary_compute_unit: NPU precision: fp16 layer_info: @@ -46,37 +48,37 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 182 - job_id: jvgde2mz5 + job_id: jw561yx7p job_status: Passed torchscript_onnx_qnn: - inference_time: 6885.0 - throughput: 145.24328249818447 + inference_time: 5353.0 + throughput: 186.81113394358303 estimated_peak_memory_range: - min: 4939776 - max: 18625080 + min: 4947968 + max: 15312024 primary_compute_unit: NPU precision: fp16 layer_info: - layers_on_npu: 229 + layers_on_npu: 228 layers_on_gpu: 0 layers_on_cpu: 0 - total_layers: 229 - job_id: jqp4k321g + total_layers: 228 + job_id: j1pvw68mg job_status: Passed torchscript_onnx_ort: - inference_time: 6690.0 - throughput: 149.47683109118086 + inference_time: 6762.0 + throughput: 147.88524105294292 
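The yolov6 export.py change above (and the matching edits to the other YOLO export scripts later in this patch) makes the channel-last handling conditional on the runtime: TFLite and QNN targets still get `--force_channel_last_input image` and have their sample inputs transposed, while ONNX Runtime keeps the original channel-first layout untouched. For orientation, here is a rough sketch of what that transpose amounts to for a single hypothetical image input; the real `transpose_channel_first_to_last` helper operates on the Hub input dictionary rather than a bare array:

```python
import numpy as np

def to_channel_last(image_nchw: np.ndarray) -> np.ndarray:
    """Rearrange (batch, channels, height, width) into (batch, height, width, channels)."""
    return np.transpose(image_nchw, (0, 2, 3, 1))

sample = np.zeros((1, 3, 640, 640), dtype=np.float32)  # channel-first, as traced from PyTorch
print(to_channel_last(sample).shape)  # (1, 640, 640, 3) -- the layout TFLite and QNN prefer
```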
estimated_peak_memory_range: - min: 5345280 - max: 37259592 + min: 5337088 + max: 34449840 primary_compute_unit: NPU precision: fp16 layer_info: - layers_on_npu: 1 + layers_on_npu: 228 layers_on_gpu: 0 layers_on_cpu: 0 - total_layers: 1 - job_id: jo5mq8l9p + total_layers: 228 + job_id: jz5w9ekjp job_status: Passed reference_device_info: name: Samsung Galaxy S23 @@ -85,13 +87,13 @@ models: os_name: Android manufacturer: Samsung chipset: Snapdragon® 8 Gen 2 - timestamp: '2024-04-23T18:42:33.930548Z' + timestamp: '2024-05-20T16:35:31.754074Z' - torchscript_onnx_tflite: - inference_time: 5649.0 - throughput: 177.02248185519562 + inference_time: 5305.0 + throughput: 188.5014137606032 estimated_peak_memory_range: - min: 16384 - max: 82704608 + min: 49152 + max: 78708192 primary_compute_unit: NPU precision: fp16 layer_info: @@ -99,37 +101,37 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 182 - job_id: jz570989g + job_id: j1p3mjdzg job_status: Passed torchscript_onnx_qnn: - inference_time: 4867.0 - throughput: 205.4653790836244 + inference_time: 3962.0 + throughput: 252.39777889954567 estimated_peak_memory_range: min: 4931584 - max: 98473200 + max: 96111232 primary_compute_unit: NPU precision: fp16 layer_info: - layers_on_npu: 229 + layers_on_npu: 228 layers_on_gpu: 0 layers_on_cpu: 0 - total_layers: 229 - job_id: j0pxnxzl5 + total_layers: 228 + job_id: j7gjlv98p job_status: Passed torchscript_onnx_ort: - inference_time: 4842.0 - throughput: 206.52622883106156 + inference_time: 4919.0 + throughput: 203.29335230737954 estimated_peak_memory_range: - min: 4931584 - max: 66299664 + min: 3256320 + max: 65642736 primary_compute_unit: NPU precision: fp16 layer_info: - layers_on_npu: 1 + layers_on_npu: 228 layers_on_gpu: 0 layers_on_cpu: 0 - total_layers: 1 - job_id: jegnlkwq5 + total_layers: 228 + job_id: jmg94lrv5 job_status: Passed reference_device_info: name: Samsung Galaxy S24 @@ -138,13 +140,13 @@ models: os_name: Android manufacturer: Samsung chipset: Snapdragon® 8 Gen 3 - timestamp: '2024-04-23T18:42:33.930616Z' + timestamp: '2024-05-20T16:35:31.754099Z' - torchscript_onnx_tflite: - inference_time: 7952.0 - throughput: 125.75452716297787 + inference_time: 7402.0 + throughput: 135.09862199405566 estimated_peak_memory_range: - min: 217088 - max: 3444928 + min: 229376 + max: 3627424 primary_compute_unit: NPU precision: fp16 layer_info: @@ -152,22 +154,22 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 182 - job_id: jn5qenqo5 + job_id: jwgov2xd5 job_status: Passed torchscript_onnx_qnn: - inference_time: 6878.0 - throughput: 145.39110206455365 + inference_time: 5362.0 + throughput: 186.4975755315181 estimated_peak_memory_range: - min: 4952064 - max: 19343808 + min: 4939776 + max: 15305728 primary_compute_unit: NPU precision: fp16 layer_info: - layers_on_npu: 229 + layers_on_npu: 228 layers_on_gpu: 0 layers_on_cpu: 0 - total_layers: 229 - job_id: j7gjz9ve5 + total_layers: 228 + job_id: jygz7366p job_status: Passed reference_device_info: name: QCS8550 (Proxy) @@ -176,4 +178,57 @@ models: os_name: Android manufacturer: Qualcomm chipset: Qcs8550 - timestamp: '2024-04-23T18:42:33.930670Z' + timestamp: '2024-05-20T16:35:31.754129Z' + - torchscript_onnx_qnn: + inference_time: 6754.0 + throughput: 148.06040864672786 + estimated_peak_memory_range: + min: 4923392 + max: 4923392 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 228 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 228 + job_id: jlpevdq05 + job_status: Passed + torchscript_onnx_ort: + 
inference_time: 6563.0 + throughput: 152.36934328813044 + estimated_peak_memory_range: + min: 7618560 + max: 7618560 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 228 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 228 + job_id: jnp1849lg + job_status: Passed + torchscript_onnx_ort_dml_gpu: + inference_time: 17525.0 + throughput: 57.06134094151213 + estimated_peak_memory_range: + min: 35479552 + max: 35479552 + primary_compute_unit: CPU + precision: fp32 + layer_info: + layers_on_npu: 0 + layers_on_gpu: 0 + layers_on_cpu: 149 + total_layers: 149 + job_id: jvgdvxklg + job_status: Passed + reference_device_info: + name: Snapdragon X Elite CRD + os: '11' + form_factor: Compute + os_name: Windows + manufacturer: Qualcomm + chipset: Snapdragon® X Elite + timestamp: '2024-05-20T16:35:31.754151Z' diff --git a/qai_hub_models/models/yolov7/README.md b/qai_hub_models/models/yolov7/README.md index 02430d15..e6ab3b03 100644 --- a/qai_hub_models/models/yolov7/README.md +++ b/qai_hub_models/models/yolov7/README.md @@ -14,6 +14,8 @@ accross various devices, can be found [here](https://aihub.qualcomm.com/models/y a hosted Qualcomm® device. + + ## Example & Usage Install the package via pip: diff --git a/qai_hub_models/models/yolov7/app.py b/qai_hub_models/models/yolov7/app.py index bc694b38..69de2553 100644 --- a/qai_hub_models/models/yolov7/app.py +++ b/qai_hub_models/models/yolov7/app.py @@ -16,7 +16,9 @@ class YoloV7DetectionApp(YoloObjectDetectionApp): def check_image_size(self, pixel_values: torch.Tensor) -> None: """ - Verify image size is valid model input. + Verify image size is a valid model input. Image size should be shape + [batch_size, num_channels, height, width], where height and width are multiples + of `YoloNAS.STRIDE_MULTIPLE`. """ if len(pixel_values.shape) != 4: raise ValueError("Pixel Values must be rank 4: [batch, channels, x, y]") diff --git a/qai_hub_models/models/yolov7/export.py b/qai_hub_models/models/yolov7/export.py index db8f85b2..0e4f08d9 100644 --- a/qai_hub_models/models/yolov7/export.py +++ b/qai_hub_models/models/yolov7/export.py @@ -119,9 +119,16 @@ def export_model( model.eval() source_model = torch.jit.trace(model.to("cpu"), make_torch_inputs(input_spec)) + # Convert outputs from channel last to channel first (preferred I/O format for QNN and TensorFlow Lite) + channel_last_flags = ( + " --force_channel_last_input image" + if target_runtime != TargetRuntime.ORT + else "" + ) + # 2. 
Compile the model to an on-device asset model_compile_options = model.get_hub_compile_options( - target_runtime, compile_options + " --force_channel_last_input image" + target_runtime, compile_options + channel_last_flags, hub_device ) print(f"Optimizing model {model_name} to run on-device") submitted_compile_job = hub.submit_compile_job( @@ -159,8 +166,10 @@ def export_model( ) sample_inputs = model.sample_inputs(input_spec) # Convert inputs from channel first to channel last - hub_inputs = transpose_channel_first_to_last( - "image", sample_inputs, target_runtime + hub_inputs = ( + sample_inputs + if target_runtime == TargetRuntime.ORT + else transpose_channel_first_to_last("image", sample_inputs, target_runtime) ) submitted_inference_job = hub.submit_inference_job( model=compile_job.get_target_model(), @@ -199,7 +208,7 @@ def export_model( def main(): warnings.filterwarnings("ignore") - parser = export_parser(model_cls=Model, supports_qnn=False, supports_ort=False) + parser = export_parser(model_cls=Model, supports_qnn=False) args = parser.parse_args() export_model(**vars(args)) diff --git a/qai_hub_models/models/yolov7/model.py b/qai_hub_models/models/yolov7/model.py index ccbcc00e..e937d71a 100644 --- a/qai_hub_models/models/yolov7/model.py +++ b/qai_hub_models/models/yolov7/model.py @@ -49,7 +49,7 @@ def __init__( STRIDE_MULTIPLE = 32 def get_evaluator(self) -> BaseEvaluator: - return DetectionEvaluator(640, 640) + return DetectionEvaluator(*self.get_input_spec()["image"][0][2:]) @classmethod def from_pretrained( @@ -96,7 +96,7 @@ def forward(self, image): Returns: If self.include_postprocessing: boxes: torch.Tensor - Bounding box locations. Shape [batch, num preds, 4] where 4 == (center_x, center_y, w, h) + Bounding box locations. Shape [batch, num preds, 4] where 4 == (left_x, top_y, right_x, bottom_y) scores: torch.Tensor class scores multiplied by confidence: Shape is [batch, num_preds] class_idx: torch.tensor diff --git a/qai_hub_models/models/yolov7/perf.yaml b/qai_hub_models/models/yolov7/perf.yaml index 35945597..14de581f 100644 --- a/qai_hub_models/models/yolov7/perf.yaml +++ b/qai_hub_models/models/yolov7/perf.yaml @@ -22,6 +22,7 @@ aggregated: - Samsung Galaxy S24 Ultra - Samsung Galaxy S24+ - Samsung Galaxy Tab S8 + - Snapdragon X Elite CRD - Xiaomi 12 - Xiaomi 12 Pro supported_chipsets: @@ -30,38 +31,54 @@ aggregated: - Snapdragon® 8 Gen 2 - Snapdragon® 8 Gen 3 - Snapdragon® 888 + - Snapdragon® X Elite models: - name: Yolo-v7 performance_metrics: - torchscript_onnx_tflite: - inference_time: 20875.0 - throughput: 47.90419161676647 + inference_time: 15991.0 + throughput: 62.53517603652054 estimated_peak_memory_range: - min: 9580544 - max: 45193728 + min: 1212416 + max: 3555464 primary_compute_unit: NPU precision: fp16 layer_info: - layers_on_npu: 292 + layers_on_npu: 203 layers_on_gpu: 0 - layers_on_cpu: 21 - total_layers: 313 - job_id: jep20ezqg + layers_on_cpu: 12 + total_layers: 215 + job_id: jz5w9ej6p job_status: Passed + torchscript_onnx_qnn: + inference_time: 'null' + throughput: 'null' + estimated_peak_memory_range: + min: 0 + max: 0 + primary_compute_unit: 'null' + precision: 'null' + layer_info: + layers_on_npu: 0 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 0 + job_id: jvgdvxjeg + job_status: Failed torchscript_onnx_ort: - inference_time: 22899.0 - throughput: 43.670029258919605 + inference_time: 13667.0 + throughput: 73.16894709885125 estimated_peak_memory_range: - min: 9625600 - max: 55617832 - primary_compute_unit: CPU - precision: fp32 + min: 6905856 
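The yolov7 model.py change above also stops hard-coding the evaluator size at 640x640 and instead reads it off the input spec. As the yolov8_det model later in this patch makes explicit, an input spec maps each input name to a (shape, dtype) pair with the shape ordered (batch, channels, height, width), so slicing the last two shape entries yields (height, width). A small illustration under that assumption, with a hypothetical spec value:

```python
# Hypothetical input spec mirroring the ((batch, channels, height, width), dtype) layout used in this repo.
input_spec = {"image": ((1, 3, 640, 640), "float32")}

shape, dtype = input_spec["image"]
height, width = shape[2:]
print(height, width)  # 640 640 -- the arguments DetectionEvaluator now receives via *shape[2:]
```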
+ max: 39411600 + primary_compute_unit: NPU + precision: fp16 layer_info: - layers_on_npu: 2 + layers_on_npu: 213 layers_on_gpu: 0 - layers_on_cpu: 21 - total_layers: 23 - job_id: j2p036xnp + layers_on_cpu: 12 + total_layers: 225 + job_id: j0px1kw1g job_status: Passed reference_device_info: name: Samsung Galaxy S23 @@ -70,36 +87,51 @@ models: os_name: Android manufacturer: Samsung chipset: Snapdragon® 8 Gen 2 - timestamp: '2024-04-23T18:42:33.954673Z' + timestamp: '2024-05-20T16:35:31.784368Z' - torchscript_onnx_tflite: - inference_time: 16244.0 - throughput: 61.56119182467373 + inference_time: 10824.0 + throughput: 92.38728750923873 estimated_peak_memory_range: - min: 40960 - max: 202538080 + min: 188416 + max: 59790128 primary_compute_unit: NPU precision: fp16 layer_info: - layers_on_npu: 292 + layers_on_npu: 203 layers_on_gpu: 0 - layers_on_cpu: 21 - total_layers: 313 - job_id: jqpyrmyl5 + layers_on_cpu: 12 + total_layers: 215 + job_id: jmg94l6l5 job_status: Passed + torchscript_onnx_qnn: + inference_time: 'null' + throughput: 'null' + estimated_peak_memory_range: + min: 0 + max: 0 + primary_compute_unit: 'null' + precision: 'null' + layer_info: + layers_on_npu: 0 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 0 + job_id: jz57dyql5 + job_status: Failed torchscript_onnx_ort: - inference_time: 18014.0 - throughput: 55.51237926057511 + inference_time: 9691.0 + throughput: 103.18852543597151 estimated_peak_memory_range: - min: 17952768 - max: 200617376 - primary_compute_unit: CPU - precision: fp32 + min: 6680576 + max: 67183456 + primary_compute_unit: NPU + precision: fp16 layer_info: - layers_on_npu: 2 + layers_on_npu: 213 layers_on_gpu: 0 - layers_on_cpu: 21 - total_layers: 23 - job_id: j1p801kog + layers_on_cpu: 12 + total_layers: 225 + job_id: jo5mznjwp job_status: Passed reference_device_info: name: Samsung Galaxy S24 @@ -108,22 +140,37 @@ models: os_name: Android manufacturer: Samsung chipset: Snapdragon® 8 Gen 3 - timestamp: '2024-04-23T18:42:33.954727Z' + timestamp: '2024-05-20T16:35:31.784396Z' - torchscript_onnx_tflite: - inference_time: 20857.0 - throughput: 47.94553387351968 + inference_time: 15945.0 + throughput: 62.715584822828475 estimated_peak_memory_range: - min: 9539584 - max: 12396608 + min: 1220608 + max: 3533376 primary_compute_unit: NPU precision: fp16 layer_info: - layers_on_npu: 292 + layers_on_npu: 203 layers_on_gpu: 0 - layers_on_cpu: 21 - total_layers: 313 - job_id: jvgdekxz5 + layers_on_cpu: 12 + total_layers: 215 + job_id: jnp184r2g job_status: Passed + torchscript_onnx_qnn: + inference_time: 'null' + throughput: 'null' + estimated_peak_memory_range: + min: 0 + max: 0 + primary_compute_unit: 'null' + precision: 'null' + layer_info: + layers_on_npu: 0 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 0 + job_id: jqp4wlzvg + job_status: Failed reference_device_info: name: QCS8550 (Proxy) os: '12' @@ -131,4 +178,42 @@ models: os_name: Android manufacturer: Qualcomm chipset: Qcs8550 - timestamp: '2024-04-23T18:42:33.954767Z' + timestamp: '2024-05-20T16:35:31.784414Z' + - torchscript_onnx_ort: + inference_time: 13497.0 + throughput: 74.0905386382159 + estimated_peak_memory_range: + min: 4927488 + max: 4927488 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 213 + layers_on_gpu: 0 + layers_on_cpu: 12 + total_layers: 225 + job_id: jegne6jrg + job_status: Passed + torchscript_onnx_ort_dml_gpu: + inference_time: 38053.0 + throughput: 26.279136993141144 + estimated_peak_memory_range: + min: 150495232 + max: 150495232 + 
primary_compute_unit: GPU + precision: fp16 + layer_info: + layers_on_npu: 0 + layers_on_gpu: 1 + layers_on_cpu: 0 + total_layers: 1 + job_id: jopryvz9g + job_status: Passed + reference_device_info: + name: Snapdragon X Elite CRD + os: '11' + form_factor: Compute + os_name: Windows + manufacturer: Qualcomm + chipset: Snapdragon® X Elite + timestamp: '2024-05-20T16:35:31.784434Z' diff --git a/qai_hub_models/models/yolov7/requirements.txt b/qai_hub_models/models/yolov7/requirements.txt index 4dd59a05..faa7f850 100644 --- a/qai_hub_models/models/yolov7/requirements.txt +++ b/qai_hub_models/models/yolov7/requirements.txt @@ -2,3 +2,4 @@ matplotlib==3.7.4 object-detection-metrics==0.4.post1 scipy==1.8.1 seaborn==0.11.0 +shapely==2.0.3 diff --git a/qai_hub_models/models/yolov7_quantized/README.md b/qai_hub_models/models/yolov7_quantized/README.md index 2c305a70..390b486d 100644 --- a/qai_hub_models/models/yolov7_quantized/README.md +++ b/qai_hub_models/models/yolov7_quantized/README.md @@ -3,7 +3,7 @@ # [Yolo-v7-Quantized: Quantized real-time object detection optimized for mobile and edge](https://aihub.qualcomm.com/models/yolov7_quantized) -YoloV7 is a machine learning model that predicts bounding boxes and classes of objects in an image. This model is post-training quantized to int8 using samples from the [COCO dataset](https://cocodataset.org/#home). +YoloV7 is a machine learning model that predicts bounding boxes and classes of objects in an image. This model is post-training quantized to int8 using samples from the COCO dataset. This is based on the implementation of Yolo-v7-Quantized found [here](https://github.com/WongKinYiu/yolov7/). This repository contains scripts for optimized on-device @@ -14,6 +14,8 @@ accross various devices, can be found [here](https://aihub.qualcomm.com/models/y a hosted Qualcomm® device. + + ## Example & Usage Install the package via pip: diff --git a/qai_hub_models/models/yolov7_quantized/export.py b/qai_hub_models/models/yolov7_quantized/export.py index a6d017a8..d2f3d51e 100644 --- a/qai_hub_models/models/yolov7_quantized/export.py +++ b/qai_hub_models/models/yolov7_quantized/export.py @@ -123,9 +123,16 @@ def export_model( else: quant_calibration_data = model.get_calibration_data(target_runtime, input_spec) + # Convert outputs from channel last to channel first (preferred I/O format for QNN and TensorFlow Lite) + channel_last_flags = ( + " --force_channel_last_input image" + if target_runtime != TargetRuntime.ORT + else "" + ) + # 2. 
Compile the model to an on-device asset model_compile_options = model.get_hub_compile_options( - target_runtime, compile_options + " --force_channel_last_input image" + target_runtime, compile_options + channel_last_flags, hub_device ) print(f"Optimizing model {model_name} to run on-device") submitted_compile_job = hub.submit_compile_job( @@ -167,8 +174,10 @@ def export_model( if target_runtime == TargetRuntime.QNN: hub_inputs = get_qnn_inputs(compile_job, sample_inputs) # Convert inputs from channel first to channel last - hub_inputs = transpose_channel_first_to_last( - "image", sample_inputs, target_runtime + hub_inputs = ( + sample_inputs + if target_runtime == TargetRuntime.ORT + else transpose_channel_first_to_last("image", sample_inputs, target_runtime) ) submitted_inference_job = hub.submit_inference_job( model=compile_job.get_target_model(), diff --git a/qai_hub_models/models/yolov7_quantized/info.yaml b/qai_hub_models/models/yolov7_quantized/info.yaml index 17802e3b..9ce2d281 100644 --- a/qai_hub_models/models/yolov7_quantized/info.yaml +++ b/qai_hub_models/models/yolov7_quantized/info.yaml @@ -6,7 +6,7 @@ headline: Quantized real-time object detection optimized for mobile and edge. domain: Computer Vision description: YoloV7 is a machine learning model that predicts bounding boxes and classes of objects in an image. This model is post-training quantized to int8 using samples - from the [COCO dataset](https://cocodataset.org/#home). + from the COCO dataset. use_case: Object Detection tags: - real-time diff --git a/qai_hub_models/models/yolov7_quantized/perf.yaml b/qai_hub_models/models/yolov7_quantized/perf.yaml index c7be0fc9..f2da51b5 100644 --- a/qai_hub_models/models/yolov7_quantized/perf.yaml +++ b/qai_hub_models/models/yolov7_quantized/perf.yaml @@ -9,8 +9,10 @@ aggregated: - Google Pixel 4a - Google Pixel 5a 5G - QCS6490 (Proxy) + - QCS8250 (Proxy) - QCS8550 (Proxy) - RB3 Gen 2 (Proxy) + - RB5 (Proxy) - Samsung Galaxy S21 - Samsung Galaxy S21 Ultra - Samsung Galaxy S21+ @@ -24,48 +26,66 @@ aggregated: - Samsung Galaxy S24 Ultra - Samsung Galaxy S24+ - Samsung Galaxy Tab S8 + - Snapdragon X Elite CRD - Xiaomi 12 - Xiaomi 12 Pro supported_chipsets: - Qcs6490 + - Qcs8250 - Qcs8550 - Snapdragon® 8 Gen 1 - Snapdragon® 8 Gen 2 - Snapdragon® 8 Gen 3 - Snapdragon® 888 + - Snapdragon® X Elite models: - name: Yolo-v7-Quantized performance_metrics: - torchscript_onnx_tflite: - inference_time: 6122.0 - throughput: 163.3453119895459 + inference_time: 4575.0 + throughput: 218.5792349726776 estimated_peak_memory_range: - min: 278528 - max: 13519408 + min: 323584 + max: 2051176 primary_compute_unit: NPU precision: int8 layer_info: - layers_on_npu: 224 + layers_on_npu: 225 layers_on_gpu: 0 - layers_on_cpu: 0 - total_layers: 224 - job_id: j1gl6j32g + layers_on_cpu: 1 + total_layers: 226 + job_id: jep2mk245 job_status: Passed torchscript_onnx_qnn: - inference_time: 5732.0 - throughput: 174.45917655268667 + inference_time: 'null' + throughput: 'null' estimated_peak_memory_range: - min: 16384 - max: 12543776 - primary_compute_unit: NPU - precision: int8 + min: 0 + max: 0 + primary_compute_unit: 'null' + precision: 'null' layer_info: - layers_on_npu: 219 + layers_on_npu: 0 layers_on_gpu: 0 layers_on_cpu: 0 - total_layers: 219 - job_id: j1p80l0og - job_status: Passed + total_layers: 0 + job_id: j1p87qlx5 + job_status: Failed + torchscript_onnx_ort: + inference_time: 'null' + throughput: 'null' + estimated_peak_memory_range: + min: 0 + max: 0 + primary_compute_unit: 'null' + precision: 'null' + 
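Several of the quantized perf.yaml files in this patch now record 'null' metrics together with `job_status: Failed` wherever a runtime path did not produce a working asset, so anything consuming these files should filter on job status before trusting the numbers. A hedged sketch of such a consumer, assuming PyYAML and the field layout shown in these files; the path is only an example:

```python
import yaml

with open("qai_hub_models/models/yolov7_quantized/perf.yaml") as f:
    perf = yaml.safe_load(f)

for model in perf["models"]:
    for entry in model["performance_metrics"]:
        device = entry["reference_device_info"]["name"]
        for runtime, result in entry.items():
            # Skip non-metric fields (reference_device_info, timestamp) and failed jobs.
            if not isinstance(result, dict) or result.get("job_status") != "Passed":
                continue
            print(f"{device} / {runtime}: {result['inference_time']} us")
```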
layer_info: + layers_on_npu: 0 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 0 + job_id: j1glkvj8p + job_status: Failed reference_device_info: name: Samsung Galaxy S23 os: '13' @@ -73,37 +93,52 @@ models: os_name: Android manufacturer: Samsung chipset: Snapdragon® 8 Gen 2 - timestamp: '2024-04-23T18:42:33.972519Z' + timestamp: '2024-05-20T16:35:31.813972Z' - torchscript_onnx_tflite: - inference_time: 4059.0 - throughput: 246.3661000246366 + inference_time: 2984.0 + throughput: 335.1206434316354 estimated_peak_memory_range: min: 40960 - max: 67566064 + max: 60470096 primary_compute_unit: NPU precision: int8 layer_info: - layers_on_npu: 224 + layers_on_npu: 225 layers_on_gpu: 0 - layers_on_cpu: 0 - total_layers: 224 - job_id: jogk7jyvp + layers_on_cpu: 1 + total_layers: 226 + job_id: jqpyd197p job_status: Passed torchscript_onnx_qnn: - inference_time: 3804.0 - throughput: 262.88117770767616 + inference_time: 'null' + throughput: 'null' estimated_peak_memory_range: - min: 1245184 - max: 89862128 - primary_compute_unit: NPU - precision: int8 + min: 0 + max: 0 + primary_compute_unit: 'null' + precision: 'null' layer_info: - layers_on_npu: 219 + layers_on_npu: 0 layers_on_gpu: 0 layers_on_cpu: 0 - total_layers: 219 - job_id: jmg9j64m5 - job_status: Passed + total_layers: 0 + job_id: jogkyej2p + job_status: Failed + torchscript_onnx_ort: + inference_time: 'null' + throughput: 'null' + estimated_peak_memory_range: + min: 0 + max: 0 + primary_compute_unit: 'null' + precision: 'null' + layer_info: + layers_on_npu: 0 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 0 + job_id: jw561yk0p + job_status: Failed reference_device_info: name: Samsung Galaxy S24 os: '14' @@ -111,8 +146,23 @@ models: os_name: Android manufacturer: Samsung chipset: Snapdragon® 8 Gen 3 - timestamp: '2024-04-23T18:42:33.972581Z' - - torchscript_onnx_qnn: + timestamp: '2024-05-20T16:35:31.813999Z' + - torchscript_onnx_tflite: + inference_time: 4604.0 + throughput: 217.2024326672459 + estimated_peak_memory_range: + min: 282624 + max: 2513496 + primary_compute_unit: NPU + precision: int8 + layer_info: + layers_on_npu: 225 + layers_on_gpu: 0 + layers_on_cpu: 1 + total_layers: 226 + job_id: j2p0rzn6p + job_status: Passed + torchscript_onnx_qnn: inference_time: 'null' throughput: 'null' estimated_peak_memory_range: @@ -125,36 +175,112 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 0 - job_id: j1p80nqog + job_id: jn5q26j45 job_status: Failed reference_device_info: - name: RB3 Gen 2 (Proxy) + name: QCS8550 (Proxy) os: '12' form_factor: Iot os_name: Android manufacturer: Qualcomm - chipset: Qcs6490 - timestamp: '2024-04-23T18:42:33.972595Z' - - torchscript_onnx_qnn: - inference_time: 5978.0 - throughput: 167.2800267648043 + chipset: Qcs8550 + timestamp: '2024-05-20T16:35:31.814017Z' + - torchscript_onnx_tflite: + inference_time: 11128.0 + throughput: 89.86340762041696 estimated_peak_memory_range: - min: 4939776 - max: 15407880 + min: 262144 + max: 60976880 primary_compute_unit: NPU precision: int8 layer_info: - layers_on_npu: 220 + layers_on_npu: 225 + layers_on_gpu: 0 + layers_on_cpu: 1 + total_layers: 226 + job_id: jn5q3d1np + job_status: Passed + torchscript_onnx_qnn: + inference_time: 'null' + throughput: 'null' + estimated_peak_memory_range: + min: 0 + max: 0 + primary_compute_unit: 'null' + precision: 'null' + layer_info: + layers_on_npu: 0 layers_on_gpu: 0 layers_on_cpu: 0 - total_layers: 220 - job_id: j2p03wznp + total_layers: 0 + job_id: j7gje88v5 + job_status: Failed + reference_device_info: + name: RB3 
Gen 2 (Proxy) + os: '12' + form_factor: Iot + os_name: Android + manufacturer: Qualcomm + chipset: Qcs6490 + timestamp: '2024-05-20T16:35:31.814035Z' + - torchscript_onnx_tflite: + inference_time: 86803.0 + throughput: 11.520339158784834 + estimated_peak_memory_range: + min: 4190208 + max: 40909296 + primary_compute_unit: GPU + precision: int8 + layer_info: + layers_on_npu: 32 + layers_on_gpu: 126 + layers_on_cpu: 68 + total_layers: 226 + job_id: j1gl3q8jg job_status: Passed reference_device_info: - name: QCS8550 (Proxy) + name: RB5 (Proxy) os: '12' form_factor: Iot os_name: Android manufacturer: Qualcomm - chipset: Qcs8550 - timestamp: '2024-04-23T18:42:33.972626Z' + chipset: Qcs8250 + timestamp: '2024-05-20T16:35:31.814046Z' + - torchscript_onnx_ort: + inference_time: 'null' + throughput: 'null' + estimated_peak_memory_range: + min: 0 + max: 0 + primary_compute_unit: 'null' + precision: 'null' + layer_info: + layers_on_npu: 0 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 0 + job_id: j1p3mjylg + job_status: Failed + torchscript_onnx_ort_dml_gpu: + inference_time: 15343.0 + throughput: 65.17630189663039 + estimated_peak_memory_range: + min: 51806208 + max: 51806208 + primary_compute_unit: CPU + precision: fp32 + layer_info: + layers_on_npu: 0 + layers_on_gpu: 0 + layers_on_cpu: 256 + total_layers: 256 + job_id: jwgov2jx5 + job_status: Passed + reference_device_info: + name: Snapdragon X Elite CRD + os: '11' + form_factor: Compute + os_name: Windows + manufacturer: Qualcomm + chipset: Snapdragon® X Elite + timestamp: '2024-05-20T16:35:31.814068Z' diff --git a/qai_hub_models/models/yolov7_quantized/requirements.txt b/qai_hub_models/models/yolov7_quantized/requirements.txt index 4dd59a05..faa7f850 100644 --- a/qai_hub_models/models/yolov7_quantized/requirements.txt +++ b/qai_hub_models/models/yolov7_quantized/requirements.txt @@ -2,3 +2,4 @@ matplotlib==3.7.4 object-detection-metrics==0.4.post1 scipy==1.8.1 seaborn==0.11.0 +shapely==2.0.3 diff --git a/qai_hub_models/models/yolov8_det/README.md b/qai_hub_models/models/yolov8_det/README.md index 610006a8..c82afce9 100644 --- a/qai_hub_models/models/yolov8_det/README.md +++ b/qai_hub_models/models/yolov8_det/README.md @@ -14,6 +14,8 @@ accross various devices, can be found [here](https://aihub.qualcomm.com/models/y a hosted Qualcomm® device. + + ## Example & Usage Install the package via pip: diff --git a/qai_hub_models/models/yolov8_det/export.py b/qai_hub_models/models/yolov8_det/export.py index 4d4d321c..71b23405 100644 --- a/qai_hub_models/models/yolov8_det/export.py +++ b/qai_hub_models/models/yolov8_det/export.py @@ -121,9 +121,16 @@ def export_model( model.to("cpu"), make_torch_inputs(input_spec), check_trace=False ) + # Convert outputs from channel last to channel first (preferred I/O format for QNN and TensorFlow Lite) + channel_last_flags = ( + " --force_channel_last_input image" + if target_runtime != TargetRuntime.ORT + else "" + ) + # 2. 
Compile the model to an on-device asset model_compile_options = model.get_hub_compile_options( - target_runtime, compile_options + " --force_channel_last_input image" + target_runtime, compile_options + channel_last_flags, hub_device ) print(f"Optimizing model {model_name} to run on-device") submitted_compile_job = hub.submit_compile_job( @@ -161,8 +168,10 @@ def export_model( ) sample_inputs = model.sample_inputs(input_spec) # Convert inputs from channel first to channel last - hub_inputs = transpose_channel_first_to_last( - "image", sample_inputs, target_runtime + hub_inputs = ( + sample_inputs + if target_runtime == TargetRuntime.ORT + else transpose_channel_first_to_last("image", sample_inputs, target_runtime) ) submitted_inference_job = hub.submit_inference_job( model=compile_job.get_target_model(), @@ -201,7 +210,7 @@ def export_model( def main(): warnings.filterwarnings("ignore") - parser = export_parser(model_cls=Model, supports_qnn=False, supports_ort=False) + parser = export_parser(model_cls=Model) args = parser.parse_args() export_model(**vars(args)) diff --git a/qai_hub_models/models/yolov8_det/model.py b/qai_hub_models/models/yolov8_det/model.py index 48ecd1f9..224497fd 100644 --- a/qai_hub_models/models/yolov8_det/model.py +++ b/qai_hub_models/models/yolov8_det/model.py @@ -160,7 +160,7 @@ def get_input_spec( return {"image": ((batch_size, num_channels, height, width), "float32")} def get_evaluator(self) -> BaseEvaluator: - return DetectionEvaluator(640, 640) + return DetectionEvaluator(*self.get_input_spec()["image"][0][2:]) def yolov8_detect_postprocess( diff --git a/qai_hub_models/models/yolov8_det/perf.yaml b/qai_hub_models/models/yolov8_det/perf.yaml index 526f1b82..6ba185bb 100644 --- a/qai_hub_models/models/yolov8_det/perf.yaml +++ b/qai_hub_models/models/yolov8_det/perf.yaml @@ -8,6 +8,7 @@ aggregated: - Google Pixel 4 - Google Pixel 4a - Google Pixel 5a 5G + - QCS8550 (Proxy) - Samsung Galaxy S21 - Samsung Galaxy S21 Ultra - Samsung Galaxy S21+ @@ -21,30 +22,63 @@ aggregated: - Samsung Galaxy S24 Ultra - Samsung Galaxy S24+ - Samsung Galaxy Tab S8 + - Snapdragon X Elite CRD - Xiaomi 12 - Xiaomi 12 Pro supported_chipsets: + - Qcs8550 - Snapdragon® 8 Gen 1 - Snapdragon® 8 Gen 2 - Snapdragon® 8 Gen 3 - Snapdragon® 888 + - Snapdragon® X Elite models: - name: YOLOv8-Detection performance_metrics: - torchscript_onnx_tflite: - inference_time: 6113.0 - throughput: 163.5858007524947 + inference_time: 5873.0 + throughput: 170.2707304614337 estimated_peak_memory_range: - min: 233472 - max: 8968336 + min: 245760 + max: 8436704 primary_compute_unit: NPU precision: fp16 layer_info: - layers_on_npu: 300 + layers_on_npu: 290 layers_on_gpu: 0 layers_on_cpu: 0 - total_layers: 300 - job_id: jqpyzm28g + total_layers: 290 + job_id: j1pvw6jjg + job_status: Passed + torchscript_onnx_qnn: + inference_time: 5218.0 + throughput: 191.64430816404752 + estimated_peak_memory_range: + min: 6332416 + max: 18723960 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 285 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 285 + job_id: jygz731kp + job_status: Passed + torchscript_onnx_ort: + inference_time: 6644.0 + throughput: 150.51173991571343 + estimated_peak_memory_range: + min: 6328320 + max: 32755768 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 286 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 286 + job_id: jvgdvxweg job_status: Passed reference_device_info: name: Samsung Galaxy S23 @@ -53,36 +87,51 @@ models: os_name: Android 
manufacturer: Samsung chipset: Snapdragon® 8 Gen 2 - timestamp: '2024-04-02T15:27:43.907101Z' + timestamp: '2024-05-20T16:35:31.892347Z' + - torchscript_onnx_tflite: + inference_time: 4141.0 + throughput: 241.48756339048538 + estimated_peak_memory_range: + min: 36864 + max: 84965392 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 290 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 290 + job_id: j7gjlvjxp + job_status: Passed torchscript_onnx_qnn: - inference_time: 5316.0 - throughput: 188.11136192626034 + inference_time: 3671.0 + throughput: 272.40533914464726 estimated_peak_memory_range: - min: 4935680 - max: 19108344 + min: 78393344 + max: 180541088 primary_compute_unit: NPU precision: fp16 layer_info: - layers_on_npu: 293 + layers_on_npu: 285 layers_on_gpu: 0 layers_on_cpu: 0 - total_layers: 293 - job_id: j1p821rkp + total_layers: 285 + job_id: jz5w9eo6p job_status: Passed - - torchscript_onnx_tflite: - inference_time: 4320.0 - throughput: 231.4814814814815 + torchscript_onnx_ort: + inference_time: 4354.0 + throughput: 229.67386311437758 estimated_peak_memory_range: - min: 73728 - max: 88723920 + min: 4956160 + max: 70229504 primary_compute_unit: NPU precision: fp16 layer_info: - layers_on_npu: 300 + layers_on_npu: 286 layers_on_gpu: 0 layers_on_cpu: 0 - total_layers: 300 - job_id: j2p04699g + total_layers: 286 + job_id: jz57dyzl5 job_status: Passed reference_device_info: name: Samsung Galaxy S24 @@ -91,19 +140,95 @@ models: os_name: Android manufacturer: Samsung chipset: Snapdragon® 8 Gen 3 - timestamp: '2024-04-02T15:30:24.719256Z' + timestamp: '2024-05-20T16:35:31.892377Z' + - torchscript_onnx_tflite: + inference_time: 5872.0 + throughput: 170.29972752043597 + estimated_peak_memory_range: + min: 16384 + max: 5135584 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 290 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 290 + job_id: jlpevdj15 + job_status: Passed torchscript_onnx_qnn: - inference_time: 3677.0 - throughput: 271.9608376393799 + inference_time: 5208.0 + throughput: 192.01228878648234 + estimated_peak_memory_range: + min: 4935680 + max: 18248224 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 285 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 285 + job_id: jnp18402g + job_status: Passed + reference_device_info: + name: QCS8550 (Proxy) + os: '12' + form_factor: Iot + os_name: Android + manufacturer: Qualcomm + chipset: Qcs8550 + timestamp: '2024-05-20T16:35:31.892393Z' + - torchscript_onnx_qnn: + inference_time: 5820.0 + throughput: 171.82130584192439 + estimated_peak_memory_range: + min: 4923392 + max: 4923392 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 285 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 285 + job_id: jmg94lvl5 + job_status: Passed + torchscript_onnx_ort: + inference_time: 6424.0 + throughput: 155.6662515566625 estimated_peak_memory_range: - min: 4931584 - max: 110753456 + min: 10039296 + max: 10039296 primary_compute_unit: NPU precision: fp16 layer_info: - layers_on_npu: 293 + layers_on_npu: 286 layers_on_gpu: 0 layers_on_cpu: 0 - total_layers: 293 - job_id: jogkv80wp + total_layers: 286 + job_id: jqp4wlqvg job_status: Passed + torchscript_onnx_ort_dml_gpu: + inference_time: 14327.0 + throughput: 69.79828296223913 + estimated_peak_memory_range: + min: 82149376 + max: 82149376 + primary_compute_unit: GPU + precision: fp16 + layer_info: + layers_on_npu: 0 + layers_on_gpu: 1 + layers_on_cpu: 0 + total_layers: 1 + 
job_id: j0px1kv1g + job_status: Passed + reference_device_info: + name: Snapdragon X Elite CRD + os: '11' + form_factor: Compute + os_name: Windows + manufacturer: Qualcomm + chipset: Snapdragon® X Elite + timestamp: '2024-05-20T16:35:31.892415Z' diff --git a/qai_hub_models/models/yolov8_det/requirements.txt b/qai_hub_models/models/yolov8_det/requirements.txt index 48634d89..158fe9bd 100644 --- a/qai_hub_models/models/yolov8_det/requirements.txt +++ b/qai_hub_models/models/yolov8_det/requirements.txt @@ -2,3 +2,4 @@ object-detection-metrics==0.4.post1 seaborn==0.11.0 thop==0.1.1.post2209072238 ultralytics==8.0.193 +shapely==2.0.3 diff --git a/qai_hub_models/models/yolov8_det_quantized/README.md b/qai_hub_models/models/yolov8_det_quantized/README.md index 9e6d342c..874a00c7 100644 --- a/qai_hub_models/models/yolov8_det_quantized/README.md +++ b/qai_hub_models/models/yolov8_det_quantized/README.md @@ -3,7 +3,7 @@ # [YOLOv8-Detection-Quantized: Quantized real-time object detection optimized for mobile and edge by Ultralytics](https://aihub.qualcomm.com/models/yolov8_det_quantized) -Ultralytics YOLOv8 is a machine learning model that predicts bounding boxes and classes of objects in an image. This model is post-training quantized to int8 using samples from the [COCO dataset](https://cocodataset.org/#home). +Ultralytics YOLOv8 is a machine learning model that predicts bounding boxes and classes of objects in an image. This model is post-training quantized to int8 using samples from the COCO dataset. This is based on the implementation of YOLOv8-Detection-Quantized found [here](https://github.com/ultralytics/ultralytics/tree/main/ultralytics/models/yolo/detect). This repository contains scripts for optimized on-device @@ -14,6 +14,8 @@ accross various devices, can be found [here](https://aihub.qualcomm.com/models/y a hosted Qualcomm® device. + + ## Example & Usage Install the package via pip: diff --git a/qai_hub_models/models/yolov8_det_quantized/export.py b/qai_hub_models/models/yolov8_det_quantized/export.py index 7482e447..ddf0d6eb 100644 --- a/qai_hub_models/models/yolov8_det_quantized/export.py +++ b/qai_hub_models/models/yolov8_det_quantized/export.py @@ -123,9 +123,16 @@ def export_model( else: quant_calibration_data = model.get_calibration_data(target_runtime, input_spec) + # Convert outputs from channel last to channel first (preferred I/O format for QNN and TensorFlow Lite) + channel_last_flags = ( + " --force_channel_last_input image" + if target_runtime != TargetRuntime.ORT + else "" + ) + # 2. 
Compile the model to an on-device asset model_compile_options = model.get_hub_compile_options( - target_runtime, compile_options + " --force_channel_last_input image" + target_runtime, compile_options + channel_last_flags, hub_device ) print(f"Optimizing model {model_name} to run on-device") submitted_compile_job = hub.submit_compile_job( @@ -167,8 +174,10 @@ def export_model( if target_runtime == TargetRuntime.QNN: hub_inputs = get_qnn_inputs(compile_job, sample_inputs) # Convert inputs from channel first to channel last - hub_inputs = transpose_channel_first_to_last( - "image", sample_inputs, target_runtime + hub_inputs = ( + sample_inputs + if target_runtime == TargetRuntime.ORT + else transpose_channel_first_to_last("image", sample_inputs, target_runtime) ) submitted_inference_job = hub.submit_inference_job( model=compile_job.get_target_model(), diff --git a/qai_hub_models/models/yolov8_det_quantized/info.yaml b/qai_hub_models/models/yolov8_det_quantized/info.yaml index 9e8e4bea..09e86fec 100644 --- a/qai_hub_models/models/yolov8_det_quantized/info.yaml +++ b/qai_hub_models/models/yolov8_det_quantized/info.yaml @@ -7,7 +7,7 @@ domain: Computer Vision use_case: Object Detection description: Ultralytics YOLOv8 is a machine learning model that predicts bounding boxes and classes of objects in an image. This model is post-training quantized - to int8 using samples from the [COCO dataset](https://cocodataset.org/#home). + to int8 using samples from the COCO dataset. tags: - real-time - quantized diff --git a/qai_hub_models/models/yolov8_det_quantized/perf.yaml b/qai_hub_models/models/yolov8_det_quantized/perf.yaml index b99d8e6a..3542b0ae 100644 --- a/qai_hub_models/models/yolov8_det_quantized/perf.yaml +++ b/qai_hub_models/models/yolov8_det_quantized/perf.yaml @@ -8,6 +8,11 @@ aggregated: - Google Pixel 4 - Google Pixel 4a - Google Pixel 5a 5G + - QCS6490 (Proxy) + - QCS8250 (Proxy) + - QCS8550 (Proxy) + - RB3 Gen 2 (Proxy) + - RB5 (Proxy) - Samsung Galaxy S21 - Samsung Galaxy S21 Ultra - Samsung Galaxy S21+ @@ -21,46 +26,66 @@ aggregated: - Samsung Galaxy S24 Ultra - Samsung Galaxy S24+ - Samsung Galaxy Tab S8 + - Snapdragon X Elite CRD - Xiaomi 12 - Xiaomi 12 Pro supported_chipsets: + - Qcs6490 + - Qcs8250 + - Qcs8550 - Snapdragon® 8 Gen 1 - Snapdragon® 8 Gen 2 - Snapdragon® 8 Gen 3 - Snapdragon® 888 + - Snapdragon® X Elite models: - name: YOLOv8-Detection-Quantized performance_metrics: - torchscript_onnx_tflite: - inference_time: 2122.0 - throughput: 471.25353440150803 + inference_time: 2343.0 + throughput: 426.8032437046522 estimated_peak_memory_range: min: 12288 - max: 2262728 + max: 2559336 primary_compute_unit: NPU precision: int8 layer_info: - layers_on_npu: 274 + layers_on_npu: 276 layers_on_gpu: 0 - layers_on_cpu: 0 - total_layers: 274 - job_id: jwgokj31p + layers_on_cpu: 1 + total_layers: 277 + job_id: jo5mznrwp job_status: Passed torchscript_onnx_qnn: - inference_time: 2121.0 - throughput: 471.4757190004715 + inference_time: 'null' + throughput: 'null' estimated_peak_memory_range: - min: 1249280 - max: 12007368 - primary_compute_unit: NPU - precision: int8 + min: 0 + max: 0 + primary_compute_unit: 'null' + precision: 'null' layer_info: - layers_on_npu: 272 + layers_on_npu: 0 layers_on_gpu: 0 layers_on_cpu: 0 - total_layers: 272 - job_id: jnp1yrwnp - job_status: Passed + total_layers: 0 + job_id: jep2mk845 + job_status: Failed + torchscript_onnx_ort: + inference_time: 'null' + throughput: 'null' + estimated_peak_memory_range: + min: 0 + max: 0 + primary_compute_unit: 'null' + 
precision: 'null' + layer_info: + layers_on_npu: 0 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 0 + job_id: j1p87qox5 + job_status: Failed reference_device_info: name: Samsung Galaxy S23 os: '13' @@ -68,37 +93,52 @@ models: os_name: Android manufacturer: Samsung chipset: Snapdragon® 8 Gen 2 - timestamp: '2024-04-23T18:42:34.010775Z' + timestamp: '2024-05-20T16:35:31.922821Z' - torchscript_onnx_tflite: - inference_time: 1422.0 - throughput: 703.2348804500704 + inference_time: 1587.0 + throughput: 630.119722747322 estimated_peak_memory_range: min: 12288 - max: 49561728 + max: 49417568 primary_compute_unit: NPU precision: int8 layer_info: - layers_on_npu: 274 + layers_on_npu: 276 layers_on_gpu: 0 - layers_on_cpu: 0 - total_layers: 274 - job_id: j1gl6jk2g + layers_on_cpu: 1 + total_layers: 277 + job_id: jegne62rg job_status: Passed torchscript_onnx_qnn: - inference_time: 1420.0 - throughput: 704.2253521126761 + inference_time: 'null' + throughput: 'null' estimated_peak_memory_range: - min: 1245184 - max: 107412320 - primary_compute_unit: NPU - precision: int8 + min: 0 + max: 0 + primary_compute_unit: 'null' + precision: 'null' layer_info: - layers_on_npu: 272 + layers_on_npu: 0 layers_on_gpu: 0 layers_on_cpu: 0 - total_layers: 272 - job_id: jvgdejv65 - job_status: Passed + total_layers: 0 + job_id: jqpyd1e7p + job_status: Failed + torchscript_onnx_ort: + inference_time: 'null' + throughput: 'null' + estimated_peak_memory_range: + min: 0 + max: 0 + primary_compute_unit: 'null' + precision: 'null' + layer_info: + layers_on_npu: 0 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 0 + job_id: jogkyez2p + job_status: Failed reference_device_info: name: Samsung Galaxy S24 os: '14' @@ -106,4 +146,141 @@ models: os_name: Android manufacturer: Samsung chipset: Snapdragon® 8 Gen 3 - timestamp: '2024-04-23T18:42:34.010850Z' + timestamp: '2024-05-20T16:35:31.922851Z' + - torchscript_onnx_tflite: + inference_time: 2345.0 + throughput: 426.43923240938165 + estimated_peak_memory_range: + min: 12288 + max: 3644216 + primary_compute_unit: NPU + precision: int8 + layer_info: + layers_on_npu: 276 + layers_on_gpu: 0 + layers_on_cpu: 1 + total_layers: 277 + job_id: jopryvk9g + job_status: Passed + torchscript_onnx_qnn: + inference_time: 'null' + throughput: 'null' + estimated_peak_memory_range: + min: 0 + max: 0 + primary_compute_unit: 'null' + precision: 'null' + layer_info: + layers_on_npu: 0 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 0 + job_id: j2p0rzy6p + job_status: Failed + reference_device_info: + name: QCS8550 (Proxy) + os: '12' + form_factor: Iot + os_name: Android + manufacturer: Qualcomm + chipset: Qcs8550 + timestamp: '2024-05-20T16:35:31.922869Z' + - torchscript_onnx_tflite: + inference_time: 5342.0 + throughput: 187.19580681392736 + estimated_peak_memory_range: + min: 12288 + max: 37726400 + primary_compute_unit: NPU + precision: int8 + layer_info: + layers_on_npu: 274 + layers_on_gpu: 0 + layers_on_cpu: 1 + total_layers: 275 + job_id: jogk3kkw5 + job_status: Passed + torchscript_onnx_qnn: + inference_time: 'null' + throughput: 'null' + estimated_peak_memory_range: + min: 0 + max: 0 + primary_compute_unit: 'null' + precision: 'null' + layer_info: + layers_on_npu: 0 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 0 + job_id: j1pvvnykp + job_status: Failed + reference_device_info: + name: RB3 Gen 2 (Proxy) + os: '12' + form_factor: Iot + os_name: Android + manufacturer: Qualcomm + chipset: Qcs6490 + timestamp: '2024-05-20T16:35:31.922886Z' + - torchscript_onnx_tflite: + 
inference_time: 44633.0 + throughput: 22.404947012300315 + estimated_peak_memory_range: + min: 3031040 + max: 12276104 + primary_compute_unit: NPU + precision: int8 + layer_info: + layers_on_npu: 273 + layers_on_gpu: 1 + layers_on_cpu: 1 + total_layers: 275 + job_id: jn5q3ddnp + job_status: Passed + reference_device_info: + name: RB5 (Proxy) + os: '12' + form_factor: Iot + os_name: Android + manufacturer: Qualcomm + chipset: Qcs8250 + timestamp: '2024-05-20T16:35:31.922898Z' + - torchscript_onnx_ort: + inference_time: 'null' + throughput: 'null' + estimated_peak_memory_range: + min: 0 + max: 0 + primary_compute_unit: 'null' + precision: 'null' + layer_info: + layers_on_npu: 0 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 0 + job_id: jn5q26845 + job_status: Failed + torchscript_onnx_ort_dml_gpu: + inference_time: 63514.0 + throughput: 15.744560254432093 + estimated_peak_memory_range: + min: 82382848 + max: 82382848 + primary_compute_unit: GPU + precision: int8 + layer_info: + layers_on_npu: 0 + layers_on_gpu: 1 + layers_on_cpu: 0 + total_layers: 1 + job_id: j1glkvn8p + job_status: Passed + reference_device_info: + name: Snapdragon X Elite CRD + os: '11' + form_factor: Compute + os_name: Windows + manufacturer: Qualcomm + chipset: Snapdragon® X Elite + timestamp: '2024-05-20T16:35:31.922917Z' diff --git a/qai_hub_models/models/yolov8_det_quantized/requirements.txt b/qai_hub_models/models/yolov8_det_quantized/requirements.txt index 48634d89..158fe9bd 100644 --- a/qai_hub_models/models/yolov8_det_quantized/requirements.txt +++ b/qai_hub_models/models/yolov8_det_quantized/requirements.txt @@ -2,3 +2,4 @@ object-detection-metrics==0.4.post1 seaborn==0.11.0 thop==0.1.1.post2209072238 ultralytics==8.0.193 +shapely==2.0.3 diff --git a/qai_hub_models/models/yolov8_seg/README.md b/qai_hub_models/models/yolov8_seg/README.md index f9d01e5e..518fab1f 100644 --- a/qai_hub_models/models/yolov8_seg/README.md +++ b/qai_hub_models/models/yolov8_seg/README.md @@ -14,6 +14,8 @@ accross various devices, can be found [here](https://aihub.qualcomm.com/models/y a hosted Qualcomm® device. + + ## Example & Usage Install the package via pip: diff --git a/qai_hub_models/models/yolov8_seg/export.py b/qai_hub_models/models/yolov8_seg/export.py index 362c5898..a632483a 100644 --- a/qai_hub_models/models/yolov8_seg/export.py +++ b/qai_hub_models/models/yolov8_seg/export.py @@ -121,9 +121,16 @@ def export_model( model.to("cpu"), make_torch_inputs(input_spec), check_trace=False ) + # Convert outputs from channel last to channel first (preferred I/O format for QNN and TensorFlow Lite) + channel_last_flags = ( + " --force_channel_last_input image" + if target_runtime != TargetRuntime.ORT + else "" + ) + # 2. 
Compile the model to an on-device asset model_compile_options = model.get_hub_compile_options( - target_runtime, compile_options + " --force_channel_last_input image" + target_runtime, compile_options + channel_last_flags, hub_device ) print(f"Optimizing model {model_name} to run on-device") submitted_compile_job = hub.submit_compile_job( @@ -161,8 +168,10 @@ def export_model( ) sample_inputs = model.sample_inputs(input_spec) # Convert inputs from channel first to channel last - hub_inputs = transpose_channel_first_to_last( - "image", sample_inputs, target_runtime + hub_inputs = ( + sample_inputs + if target_runtime == TargetRuntime.ORT + else transpose_channel_first_to_last("image", sample_inputs, target_runtime) ) submitted_inference_job = hub.submit_inference_job( model=compile_job.get_target_model(), @@ -201,7 +210,7 @@ def export_model( def main(): warnings.filterwarnings("ignore") - parser = export_parser(model_cls=Model, supports_qnn=False, supports_ort=False) + parser = export_parser(model_cls=Model, supports_qnn=False) args = parser.parse_args() export_model(**vars(args)) diff --git a/qai_hub_models/models/yolov8_seg/info.yaml b/qai_hub_models/models/yolov8_seg/info.yaml index e6fec415..c766b93d 100644 --- a/qai_hub_models/models/yolov8_seg/info.yaml +++ b/qai_hub_models/models/yolov8_seg/info.yaml @@ -37,7 +37,7 @@ form_factors: - IoT - XR has_static_banner: yes -has_animated_banner: no +has_animated_banner: yes license_type: agpl-3.0 deploy_license_type: agpl-3.0 dataset: [] diff --git a/qai_hub_models/models/yolov8_seg/perf.yaml b/qai_hub_models/models/yolov8_seg/perf.yaml index fd75d70b..2f2dba93 100644 --- a/qai_hub_models/models/yolov8_seg/perf.yaml +++ b/qai_hub_models/models/yolov8_seg/perf.yaml @@ -22,6 +22,7 @@ aggregated: - Samsung Galaxy S24 Ultra - Samsung Galaxy S24+ - Samsung Galaxy Tab S8 + - Snapdragon X Elite CRD - Xiaomi 12 - Xiaomi 12 Pro supported_chipsets: @@ -30,15 +31,16 @@ aggregated: - Snapdragon® 8 Gen 2 - Snapdragon® 8 Gen 3 - Snapdragon® 888 + - Snapdragon® X Elite models: - name: YOLOv8-Segmentation performance_metrics: - torchscript_onnx_tflite: - inference_time: 7033.0 - throughput: 142.18683349921798 + inference_time: 7377.0 + throughput: 135.556459265284 estimated_peak_memory_range: - min: 4595712 - max: 6959144 + min: 4571136 + max: 14729800 primary_compute_unit: NPU precision: fp16 layer_info: @@ -46,22 +48,37 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 337 - job_id: jqp4k361g + job_id: jw561y60p + job_status: Passed + torchscript_onnx_qnn: + inference_time: 6398.0 + throughput: 156.29884338855894 + estimated_peak_memory_range: + min: 6324224 + max: 17126264 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 333 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 333 + job_id: j1pvw63jg job_status: Passed torchscript_onnx_ort: - inference_time: 8072.0 - throughput: 123.8850346878097 + inference_time: 8007.0 + throughput: 124.89072061945798 estimated_peak_memory_range: - min: 15532032 - max: 36380192 + min: 14934016 + max: 41914744 primary_compute_unit: NPU precision: fp16 layer_info: - layers_on_npu: 1 + layers_on_npu: 336 layers_on_gpu: 0 layers_on_cpu: 0 - total_layers: 1 - job_id: jo5mq819p + total_layers: 336 + job_id: jz5w9ev6p job_status: Passed reference_device_info: name: Samsung Galaxy S23 @@ -70,13 +87,13 @@ models: os_name: Android manufacturer: Samsung chipset: Snapdragon® 8 Gen 2 - timestamp: '2024-04-23T18:42:34.024849Z' + timestamp: '2024-05-20T16:35:31.961611Z' - torchscript_onnx_tflite: 
- inference_time: 5210.0 - throughput: 191.93857965451056 + inference_time: 5365.0 + throughput: 186.39328984156572 estimated_peak_memory_range: - min: 40960 - max: 98992992 + min: 16384 + max: 95805104 primary_compute_unit: NPU precision: fp16 layer_info: @@ -84,22 +101,37 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 337 - job_id: j0pxnx8l5 + job_id: j1p3mjklg + job_status: Passed + torchscript_onnx_qnn: + inference_time: 4560.0 + throughput: 219.2982456140351 + estimated_peak_memory_range: + min: 4931584 + max: 119239328 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 333 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 333 + job_id: j7gjlvxxp job_status: Passed torchscript_onnx_ort: - inference_time: 5653.0 - throughput: 176.89722271360338 + inference_time: 5499.0 + throughput: 181.8512456810329 estimated_peak_memory_range: - min: 17702912 - max: 83989088 + min: 16408576 + max: 80100880 primary_compute_unit: NPU precision: fp16 layer_info: - layers_on_npu: 1 + layers_on_npu: 336 layers_on_gpu: 0 layers_on_cpu: 0 - total_layers: 1 - job_id: jegnlkdq5 + total_layers: 336 + job_id: jmg94l1l5 job_status: Passed reference_device_info: name: Samsung Galaxy S24 @@ -108,13 +140,13 @@ models: os_name: Android manufacturer: Samsung chipset: Snapdragon® 8 Gen 3 - timestamp: '2024-04-23T18:42:34.024902Z' + timestamp: '2024-05-20T16:35:31.961638Z' - torchscript_onnx_tflite: - inference_time: 7217.0 - throughput: 138.56172925038103 + inference_time: 7372.0 + throughput: 135.6483993488877 estimated_peak_memory_range: min: 4579328 - max: 18295080 + max: 7772616 primary_compute_unit: NPU precision: fp16 layer_info: @@ -122,7 +154,22 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 337 - job_id: j0pxnq9l5 + job_id: jwgov2yx5 + job_status: Passed + torchscript_onnx_qnn: + inference_time: 6402.0 + throughput: 156.20118712902217 + estimated_peak_memory_range: + min: 4939776 + max: 15507456 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 333 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 333 + job_id: jygz73ekp job_status: Passed reference_device_info: name: QCS8550 (Proxy) @@ -131,4 +178,57 @@ models: os_name: Android manufacturer: Qualcomm chipset: Qcs8550 - timestamp: '2024-04-23T18:42:34.024944Z' + timestamp: '2024-05-20T16:35:31.961654Z' + - torchscript_onnx_qnn: + inference_time: 7604.0 + throughput: 131.5097317201473 + estimated_peak_memory_range: + min: 4923392 + max: 4923392 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 333 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 333 + job_id: jlpevd915 + job_status: Passed + torchscript_onnx_ort: + inference_time: 8070.0 + throughput: 123.91573729863693 + estimated_peak_memory_range: + min: 22331392 + max: 22331392 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 336 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 336 + job_id: jnp184l2g + job_status: Passed + torchscript_onnx_ort_dml_gpu: + inference_time: 22496.0 + throughput: 44.45234708392603 + estimated_peak_memory_range: + min: 104538112 + max: 104538112 + primary_compute_unit: GPU + precision: fp16 + layer_info: + layers_on_npu: 0 + layers_on_gpu: 1 + layers_on_cpu: 0 + total_layers: 1 + job_id: jvgdvx9eg + job_status: Passed + reference_device_info: + name: Snapdragon X Elite CRD + os: '11' + form_factor: Compute + os_name: Windows + manufacturer: Qualcomm + chipset: Snapdragon® X Elite + timestamp: '2024-05-20T16:35:31.961681Z' diff 
--git a/qai_hub_models/requirements-dev.txt b/qai_hub_models/requirements-dev.txt index be5243dc..e1b0706d 100644 --- a/qai_hub_models/requirements-dev.txt +++ b/qai_hub_models/requirements-dev.txt @@ -1,6 +1,6 @@ boto3==1.34.40 botocore==1.34.40 -coverage==6.5.0 +coverage==5.3.1 imageio[ffmpeg]==2.31.5 jinja2==3.0.3 mypy==0.991 diff --git a/qai_hub_models/utils/aimet/default_config_llama.json b/qai_hub_models/utils/aimet/default_config_llama.json new file mode 100644 index 00000000..f084e00d --- /dev/null +++ b/qai_hub_models/utils/aimet/default_config_llama.json @@ -0,0 +1,176 @@ +{ + "defaults": + { + "ops": + { + "is_output_quantized": "True" + }, + "params": + { + "is_quantized": "True", + "is_symmetric": "True" + }, + "per_channel_quantization": "False", + "strict_symmetric": "False", + "unsigned_symmetric": "False" + }, + + "params": + { + "bias": + { + "is_quantized": "False" + } + }, + + "op_type": + { + "Squeeze": + { + "is_output_quantized": "False" + }, + "Pad": + { + "is_output_quantized": "False" + }, + "Reshape": + { + "is_output_quantized": "False" + }, + "ChannelShuffle": + { + "is_output_quantized": "False" + }, + "Tile": + { + "is_output_quantized": "False" + }, + "Cast": + { + "is_output_quantized": "False" + }, + "TopK": + { + "is_output_quantized": "False" + }, + "GatherND": + { + "is_output_quantized": "False" + }, + "ReduceMin": + { + "is_output_quantized": "False" + }, + "ReduceMax": + { + "is_output_quantized": "False" + }, + "Slice": + { + "is_output_quantized": "False" + }, + "NonZero": + { + "is_output_quantized": "False" + }, + "DepthToSpace": + { + "is_output_quantized": "False" + }, + "MaxPool": + { + "is_output_quantized": "False" + }, + "Split": + { + "is_output_quantized": "False" + }, + "Mean": + { + "is_output_quantized": "False" + }, + "Gemm": + { + "per_channel_quantization": "True" + }, + "Conv": + { + "per_channel_quantization": "True" + }, + "Transpose": + { + "is_output_quantized": "False" + }, + "LayerNorm": + { + "per_channel_quantization": "False", + "params": { + "weight": { + "is_symmetric": "False" + } + } + }, + "Gather": + { + "is_output_quantized": "True" + }, + "Sigmoid": + { + "encoding_constraints": + { + "min": 0.0, + "max": 1.0 + } + }, + "Softmax": + { + "encoding_constraints": + { + "min": 0.0, + "max": 1.0 + } + } + }, + + "supergroups": + [ + { + "op_list": ["Conv", "Relu"] + }, + { + "op_list": ["Conv", "Clip"] + }, + { + "op_list": ["Conv", "BatchNormalization", "Relu"] + }, + { + "op_list": ["ConvTranspose", "Relu"] + }, + { + "op_list": ["Add", "Relu"] + }, + { + "op_list": ["Gemm", "Relu"] + }, + { + "op_list": ["Conv", "PRelu"] + }, + { + "op_list": ["Conv", "BatchNormalization","PRelu"] + }, + { + "op_list": ["Conv", "HardSwish"] + }, + { + "op_list": ["Conv", "BatchNormalization","HardSwish"] + } + ], + + "model_input": + { + "is_input_quantized": "True" + }, + + "model_output": + {} +} diff --git a/qai_hub_models/utils/args.py b/qai_hub_models/utils/args.py index 93185c2c..9261722d 100644 --- a/qai_hub_models/utils/args.py +++ b/qai_hub_models/utils/args.py @@ -56,6 +56,17 @@ def add_output_dir_arg(parser: argparse.ArgumentParser) -> argparse.ArgumentPars return parser +def _get_default_runtime(available_runtimes: List[TargetRuntime]): + if len(available_runtimes) == 0: + raise RuntimeError("available_runtimes empty, expecting at-least one runtime.") + + return ( + TargetRuntime.TFLITE + if TargetRuntime.TFLITE in available_runtimes + else available_runtimes[0] + ) + + def add_target_runtime_arg( parser: 
argparse.ArgumentParser, help: str, @@ -116,11 +127,7 @@ def get_on_device_demo_parser( default="", help="If running on-device, use these options when submitting the inference job.", ) - default_runtime = ( - TargetRuntime.TFLITE - if TargetRuntime.TFLITE in available_target_runtimes - else available_target_runtimes[0] - ) + default_runtime = _get_default_runtime(available_runtimes=available_target_runtimes) add_target_runtime_arg( parser, help="The runtime to demo (if --on-device is specified).", @@ -378,9 +385,12 @@ def get_qcom_chipsets() -> Set[str]: def export_parser( model_cls: Type[FromPretrainedTypeVar] | Type[FromPrecompiledTypeVar], components: Optional[List[str]] = None, - supports_qnn=True, - supports_ort=True, - exporting_compiled_model=False, + supports_tflite: bool = True, + supports_qnn: bool = True, + supports_ort: bool = True, + default_runtime: TargetRuntime = TargetRuntime.TFLITE, + exporting_compiled_model: bool = False, + default_export_device: str = DEFAULT_EXPORT_DEVICE, ) -> argparse.ArgumentParser: """ Arg parser to be used in export scripts. @@ -401,6 +411,8 @@ def export_parser( True when exporting compiled model. If set, removing skip_profiling flag from export arguments. Default = False. + default_export_device: + Default device to set for export. Returns: Arg parser object. @@ -409,7 +421,7 @@ def export_parser( parser.add_argument( "--device", type=str, - default=DEFAULT_EXPORT_DEVICE, + default=default_export_device, help="Device for which to export.", ) parser.add_argument( @@ -450,14 +462,19 @@ def export_parser( ) if not exporting_compiled_model: # Default runtime for compiled model is fixed for given model - available_runtimes = [TargetRuntime.TFLITE] + available_runtimes = [] + if supports_tflite: + available_runtimes.append(TargetRuntime.TFLITE) if supports_qnn: available_runtimes.append(TargetRuntime.QNN) if supports_ort: available_runtimes.append(TargetRuntime.ORT) + + default_runtime = _get_default_runtime(available_runtimes) add_target_runtime_arg( parser, available_target_runtimes=available_runtimes, + default=default_runtime, help="The runtime for which to export.", ) # No compilation for compiled models diff --git a/qai_hub_models/utils/asset_loaders.py b/qai_hub_models/utils/asset_loaders.py index d7d6b0dd..ab2dd61d 100644 --- a/qai_hub_models/utils/asset_loaders.py +++ b/qai_hub_models/utils/asset_loaders.py @@ -30,6 +30,7 @@ from git import Repo from PIL import Image from schema import And, Schema, SchemaError +from tqdm import tqdm ASSET_BASES_DEFAULT_PATH = os.path.join( os.path.dirname(os.path.dirname(__file__)), "asset_bases.yaml" @@ -112,7 +113,7 @@ def maybe_clone_git_repo( model_name: str, model_version: VersionType, patches: List[str] = [], -) -> str: +) -> Path: """Clone (or pull) a repository, save it to disk in a standard location, and return the absolute path to the cloned location. Patches can be applied by providing a list of paths to diff files.""" @@ -242,12 +243,14 @@ def SourceAsRoot( Only one of this class should be active per Python session. 
""" - repository_path = maybe_clone_git_repo( - source_repo_url, - source_repo_commit_hash, - source_repo_name, - source_repo_version, - patches=source_repo_patches, + repository_path = str( + maybe_clone_git_repo( + source_repo_url, + source_repo_commit_hash, + source_repo_name, + source_repo_version, + patches=source_repo_patches, + ) ) SOURCE_AS_ROOT_LOCK.acquire() original_path = list(sys.path) @@ -384,63 +387,85 @@ def get_web_asset_url(self, model_id: str, type: QAIHM_WEB_ASSET): raise NotImplementedError("unsupported web asset type") return f"{self.asset_url}/{ModelZooAssetConfig._replace_path_keywords(self.web_asset_folder, model_id=model_id)}/{file}" + def get_local_store_path(self) -> Path: + return Path(self.local_store_path) + def get_local_store_model_path( self, model_name: str, version: VersionType, filename: str - ) -> str: - model_dir = os.path.join( - self.local_store_path, - self.get_relative_model_asset_path(model_name, version, filename), + ) -> Path: + return self.local_store_path / self.get_relative_model_asset_path( + model_name, version, filename ) - return model_dir def get_local_store_dataset_path( self, dataset_name: str, version: VersionType, filename: str - ) -> str: - model_dir = os.path.join( - self.local_store_path, - self.get_relative_dataset_asset_path(dataset_name, version, filename), + ) -> Path: + return self.local_store_path / self.get_relative_dataset_asset_path( + dataset_name, version, filename ) - return model_dir def get_relative_model_asset_path( self, model_id: str, version: Union[int, str], file_name: str - ): + ) -> Path: assert not file_name.startswith("/") and not file_name.startswith("\\") - return f"{ModelZooAssetConfig._replace_path_keywords(self.model_asset_folder, model_id=model_id, version=version)}/{file_name}" + return ( + Path( + ModelZooAssetConfig._replace_path_keywords( + self.model_asset_folder, model_id=model_id, version=version + ) + ) + / file_name + ) def get_relative_dataset_asset_path( self, dataset_id: str, version: Union[int, str], file_name: str - ): + ) -> Path: assert not file_name.startswith("/") and not file_name.startswith("\\") - return f"{ModelZooAssetConfig._replace_path_keywords(self.dataset_asset_folder, dataset_id=dataset_id, version=version)}/{file_name}" + return ( + Path( + ModelZooAssetConfig._replace_path_keywords( + self.dataset_asset_folder, dataset_id=dataset_id, version=version + ) + ) + / file_name + ) def get_model_asset_url( self, model_id: str, version: Union[int, str], file_name: str - ): + ) -> str: assert not file_name.startswith("/") and not file_name.startswith("\\") - return f"{self.asset_url}/{self.get_relative_model_asset_path(model_id, version, file_name)}" + return f"{self.asset_url}/{self.get_relative_model_asset_path(model_id, version, file_name).as_posix()}" def get_dataset_asset_url( self, dataset_id: str, version: Union[int, str], file_name: str - ): + ) -> str: assert not file_name.startswith("/") and not file_name.startswith("\\") - return f"{self.asset_url}/{self.get_relative_dataset_asset_path(dataset_id, version, file_name)}" + return f"{self.asset_url}/{self.get_relative_dataset_asset_path(dataset_id, version, file_name).as_posix()}" - def get_qaihm_repo(self, model_id: str, relative=True): - relative_path = f"{ModelZooAssetConfig._replace_path_keywords(self.qaihm_repo, model_id=model_id)}" + def get_qaihm_repo(self, model_id: str, relative=True) -> Path | str: + relative_path = Path( + ModelZooAssetConfig._replace_path_keywords( + self.qaihm_repo, model_id=model_id + ) + ) 
if not relative: - return self.repo_url + "/" + relative_path - + return f"{self.repo_url}/{relative_path.as_posix()}" return relative_path - def get_website_url(self, model_id: str, relative=False): - relative_path = f"{ModelZooAssetConfig._replace_path_keywords(self.models_website_relative_path, model_id=model_id)}" + def get_website_url(self, model_id: str, relative=False) -> str: + relative_path = Path( + ModelZooAssetConfig._replace_path_keywords( + self.models_website_relative_path, model_id=model_id + ) + ).as_posix() if not relative: - return self.models_website_url + "/" + relative_path + return f"{self.models_website_url}/{relative_path}" return relative_path - def get_example_use(self, model_id: str): - return f"{ModelZooAssetConfig._replace_path_keywords(self.example_use, model_id=model_id)}" + def get_example_use(self, model_id: str) -> str: + return ModelZooAssetConfig._replace_path_keywords( + self.example_use, model_id=model_id + ) ### # Helpers @@ -558,7 +583,7 @@ class CachedWebAsset: def __init__( self, url: str, - local_cache_path: str, + local_cache_path: Path, asset_config=ASSET_CONFIG, model_downloader: Callable[[str, str, int], str] | None = None, downloader_num_retries=4, @@ -573,12 +598,12 @@ def __init__( path, ext = os.path.splitext(self.local_cache_path) if not ext: file_name = self.url.rsplit("/", 1)[-1] - self.local_cache_path = os.path.join(path, file_name) + self.local_cache_path = Path(path) / file_name # Set is_extracted if already extracted on disk file, _ = os.path.splitext(self.local_cache_path) self.is_extracted = list( - filter(local_cache_path.endswith, [".zip", ".tar", ".tar.gz", ".tgz"]) + filter(str(local_cache_path).endswith, [".zip", ".tar", ".tar.gz", ".tgz"]) ) != [] and os.path.isdir(file) def __repr__(self): @@ -602,7 +627,7 @@ def from_asset_store( web_store_path = f"{asset_config.asset_url}/{relative_store_file_path}" return CachedWebAsset( web_store_path, - relative_store_file_path, + Path(relative_store_file_path), asset_config, download_file, num_retries, @@ -611,7 +636,7 @@ def from_asset_store( @staticmethod def from_google_drive( gdrive_file_id: str, - relative_store_file_path: str, + relative_store_file_path: str | Path, num_retries=4, asset_config=ASSET_CONFIG, ): @@ -630,7 +655,7 @@ def from_google_drive( """ return CachedWebAsset( f"https://drive.google.com/uc?id={gdrive_file_id}", - relative_store_file_path, + Path(relative_store_file_path), asset_config, download_and_cache_google_drive, num_retries, @@ -647,12 +672,13 @@ def path(self, extracted=None) -> Path: extracted: If true, return the path of the extracted asset on disk. If false, return the path of the archive path on disk. """ + file: str | Path if (extracted is None and self.is_extracted) or extracted: file, _ = os.path.splitext(self.local_cache_path) else: file = self.local_cache_path - return Path(self.asset_config.local_store_path) / file + return self.asset_config.get_local_store_path() / file def fetch(self, force=False, extract=False) -> Path: """ @@ -930,11 +956,22 @@ def download_file(web_url: str, dst_path: str, num_retries: int = 4) -> str: """ if not os.path.exists(dst_path): print(f"Downloading data at {web_url} to {dst_path}... ", end="") - file_data = requests.get(web_url) - if file_data.status_code != 200: + + # Streaming, so we can iterate over the response. + response = requests.get(web_url, stream=True) + + # Sizes in bytes. 
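# If the server omits Content-Length, total_size stays 0 and tqdm simply counts
# bytes without a percentage; otherwise the bar shows overall progress.
# Writing chunk by chunk avoids holding the entire download in memory, unlike
# the previous response.content approach.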
+ total_size = int(response.headers.get("content-length", 0)) + block_size = 1024 + + with tqdm(total=total_size, unit="B", unit_scale=True) as progress_bar: + with open(dst_path, "wb") as file: + for data in response.iter_content(block_size): + progress_bar.update(len(data)) + file.write(data) + + if response.status_code != 200: raise ValueError(f"Unable to download file at {web_url}") - with open(dst_path, "wb") as dst_file: - dst_file.write(file_data.content) print("Done") return dst_path @@ -1020,4 +1057,16 @@ def callback_with_retry( return callback_with_retry(num_retries - 1, callback, *args, **kwargs) +@contextmanager +def qaihm_temp_dir(): + """ + Keep temp file under LOCAL_STORE_DEFAULT_PATH instead of /tmp which has + limited space. + """ + path = os.path.join(LOCAL_STORE_DEFAULT_PATH, "tmp") + os.makedirs(path, exist_ok=True) + with tempfile.TemporaryDirectory(dir=path) as tempdir: + yield tempdir + + PathType = Union[str, Path, CachedWebAsset] diff --git a/qai_hub_models/utils/base_model.py b/qai_hub_models/utils/base_model.py index fb87155d..bc8a9a87 100644 --- a/qai_hub_models/utils/base_model.py +++ b/qai_hub_models/utils/base_model.py @@ -5,10 +5,11 @@ from __future__ import annotations from pathlib import Path -from typing import Any +from typing import Any, List, Optional +import qai_hub import torch -from qai_hub.client import SourceModel +from qai_hub.client import Device, SourceModel from qai_hub_models.models.common import ( SampleInputsType, @@ -124,6 +125,8 @@ def convert_to_hub_source_model( output_path: str | Path, input_spec: InputSpec | None = None, check_trace: bool = True, + external_onnx_weights: bool = False, + output_names: Optional[List[str]] = None, ) -> SourceModel: """ Convert to a AI Hub source model appropriate for the export method. @@ -138,6 +141,8 @@ def convert_to_hub_source_model( output_path=output_path, input_spec=input_spec, check_trace=check_trace, + external_onnx_weights=external_onnx_weights, + output_names=output_names, ) return source_model @@ -145,17 +150,44 @@ def get_hub_compile_options( self, target_runtime: TargetRuntime, other_compile_options: str = "", + device: Optional[Device] = None, ) -> str: """ AI Hub compile options recommended for the model. 
""" - compile_options = "" - if target_runtime == TargetRuntime.QNN: - compile_options = "--target_runtime qnn_lib_aarch64_android" - if target_runtime == TargetRuntime.ORT: - compile_options = "--target_runtime onnx" + target_runtime_flag = None + if "--target_runtime" not in other_compile_options: + if target_runtime == TargetRuntime.QNN: + if device: + if not device.attributes: + # Only name / os specified + devices = qai_hub.get_devices(device.name, device.os) + elif not device.name: + # Only attribute specified + devices = qai_hub.get_devices(attributes=device.attributes) + else: + devices = [device] + + for device in devices: + if "os:android" not in device.attributes: + target_runtime_flag = "qnn_bin" + break + + target_runtime_flag = target_runtime_flag or "qnn_lib_aarch64_android" + elif target_runtime == TargetRuntime.ORT: + target_runtime_flag = "onnx" + elif target_runtime == TargetRuntime.TFLITE: + target_runtime_flag = "tflite" + else: + raise NotImplementedError() + + compile_options = ( + f"--target_runtime {target_runtime_flag}" if target_runtime_flag else "" + ) + if other_compile_options != "": return compile_options + " " + other_compile_options + return compile_options def preferred_hub_source_model_format( diff --git a/qai_hub_models/utils/compare.py b/qai_hub_models/utils/compare.py index a89d437e..8b887ba9 100644 --- a/qai_hub_models/utils/compare.py +++ b/qai_hub_models/utils/compare.py @@ -4,13 +4,24 @@ # --------------------------------------------------------------------- from __future__ import annotations -from typing import Dict, List, Union +from typing import Dict, List, Optional, Union import numpy as np import pandas as pd import torch +def _flatten_tuple(out_tuple): + if not isinstance(out_tuple, tuple): + return (out_tuple.detach(),) + + flattened_tuple = [] + for elem in out_tuple: + flattened_tuple.extend(_flatten_tuple(elem)) + + return tuple(flattened_tuple) + + def torch_inference( model: torch.nn.Module, sample_inputs: Dict[str, List[np.ndarray]] ) -> List[np.ndarray]: @@ -33,8 +44,10 @@ def torch_inference( "cpu" ) with torch.no_grad(): - out = model(**inputs) + out = model(*inputs.values()) out_tuple = (out,) if isinstance(out, torch.Tensor) else out + out_tuple = _flatten_tuple(out_tuple) + for i, out_val in enumerate(out_tuple): if i == len(torch_outs): torch_outs.append([]) @@ -120,7 +133,7 @@ def compute_top_k_accuracy(expected, actual, k): def generate_comparison_metrics( expected: List[np.ndarray], actual: List[np.ndarray], - names: List[str] | None = None, + names: Optional[List[str]] = None, metrics: str = "psnr", ) -> pd.DataFrame: """ diff --git a/qai_hub_models/utils/config_loaders.py b/qai_hub_models/utils/config_loaders.py index 14b4136c..e0c71145 100644 --- a/qai_hub_models/utils/config_loaders.py +++ b/qai_hub_models/utils/config_loaders.py @@ -155,6 +155,10 @@ def __repr__(self) -> str: return self.__str__() +def is_gen_ai_model(tags: List[MODEL_TAG]) -> bool: + return MODEL_TAG.LLM in tags or MODEL_TAG.GENERATIVE_AI in tags + + class MODEL_STATUS(Enum): PUBLIC = 0 PRIVATE = 1 @@ -176,6 +180,7 @@ class MODEL_USE_CASE(Enum): IMAGE_GENERATION = 102 SUPER_RESOLUTION = 103 SEMANTIC_SEGMENTATION = 104 + DEPTH_ESTIMATION = 105 # Ex: OCR, image caption IMAGE_TO_TEXT = 105 OBJECT_DETECTION = 106 @@ -479,6 +484,7 @@ def __init__( has_on_target_demo: bool, qnn_export_failure_reason: str, tflite_export_failure_reason: str, + ort_export_failure_reason: str, has_demo: bool, check_trace: bool, channel_last_input: List[str], @@ -490,15 +496,17 
@@ def __init__( skip_tests: bool, is_precompiled: bool, no_assets: bool, + skip_export: bool, global_requirements_incompatible: bool, torchscript_opt: List[str], inference_metrics: str, - supports_ort: bool, + additional_readme_section: str, ) -> None: self.is_aimet = is_aimet self.has_on_target_demo = has_on_target_demo self.qnn_export_failure_reason = qnn_export_failure_reason self.tflite_export_failure_reason = tflite_export_failure_reason + self.ort_export_failure_reason = ort_export_failure_reason self.has_demo = has_demo self.check_trace = check_trace self.channel_last_input = channel_last_input @@ -513,7 +521,8 @@ def __init__( self.global_requirements_incompatible = global_requirements_incompatible self.torchscript_opt = torchscript_opt self.inference_metrics = inference_metrics - self.supports_ort = supports_ort + self.additional_readme_section = additional_readme_section + self.skip_export = skip_export def validate(self) -> Tuple[bool, Optional[str]]: """Returns false with a reason if the info spec for this model is not valid.""" @@ -537,6 +546,7 @@ def from_yaml( code_gen_config["has_on_target_demo"], code_gen_config["qnn_export_failure_reason"], code_gen_config["tflite_export_failure_reason"], + code_gen_config["ort_export_failure_reason"], code_gen_config["has_demo"], code_gen_config["check_trace"], code_gen_config["channel_last_input"], @@ -551,7 +561,8 @@ def from_yaml( code_gen_config["global_requirements_incompatible"], code_gen_config["torchscript_opt"], code_gen_config["inference_metrics"], - code_gen_config["supports_ort"], + code_gen_config["additional_readme_section"], + code_gen_config["skip_export"], ) # Schema for code-gen.yaml @@ -563,6 +574,7 @@ def from_yaml( OptionalSchema("has_on_target_demo", default=False): bool, OptionalSchema("qnn_export_failure_reason", default=""): str, OptionalSchema("tflite_export_failure_reason", default=""): str, + OptionalSchema("ort_export_failure_reason", default=""): str, OptionalSchema("has_demo", default=True): bool, OptionalSchema("check_trace", default=True): bool, OptionalSchema("channel_last_input", default=[]): list, @@ -577,7 +589,8 @@ def from_yaml( OptionalSchema("global_requirements_incompatible", default=False): bool, OptionalSchema("torchscript_opt", default=[]): list, OptionalSchema("inference_metrics", default="psnr"): str, - OptionalSchema("supports_ort", default=False): bool, + OptionalSchema("additional_readme_section", default=""): str, + OptionalSchema("skip_export", default=False): bool, } ) ) @@ -736,7 +749,7 @@ def validate(self) -> Tuple[bool, Optional[str]]: if session.head(animated_banner_url).status_code != requests.codes.ok: return False, f"Animated banner is missing at {animated_banner_url}" - expected_qaihm_repo = f"qai_hub_models/models/{self.id}" + expected_qaihm_repo = Path("qai_hub_models") / "models" / self.id if expected_qaihm_repo != ASSET_CONFIG.get_qaihm_repo(self.id): return False, "QAIHM repo not pointing to expected relative path" diff --git a/qai_hub_models/utils/draw.py b/qai_hub_models/utils/draw.py index 9352e7ef..2f89e8fc 100644 --- a/qai_hub_models/utils/draw.py +++ b/qai_hub_models/utils/draw.py @@ -4,27 +4,27 @@ # --------------------------------------------------------------------- from __future__ import annotations -from typing import List, Optional, Tuple +from typing import List, Optional, Tuple, Union import cv2 -import numpy +import numpy as np import torch def draw_points( - frame: numpy.ndarray, - points: numpy.ndarray | torch.Tensor, + frame: np.ndarray, + points: 
np.ndarray | torch.Tensor, color: Tuple[int, int, int] = (0, 0, 0), - size: int = 3, + size: Union[int, List[int]] = 10, ): """ Draw the given points on the frame. Parameters: - frame: numpy.ndarray - numpy array (H W C x uint8, BGR) + frame: np.ndarray + np array (H W C x uint8, BGR) - points: numpy.ndarray | torch.Tensor + points: np.ndarray | torch.Tensor array (N, 2) where layout is [x1, y1] [x2, y2], ... or @@ -40,38 +40,49 @@ def draw_points( Returns: None; modifies frame in place. """ - n2 = len(points.shape) == 2 - for i in range(0, len(points) if n2 else len(points) // 2): - x, y = points[i] if n2 else (points[i * 2], points[i * 2 + 1]) - cv2.circle(frame, (int(x), int(y)), size, color, thickness=size) + if len(points.shape) == 1: + points = points.reshape(-1, 2) + assert isinstance(size, int) or len(size) == len(points) + cv_keypoints = [] + for i, (x, y) in enumerate(points): + curr_size = size if isinstance(size, int) else size[i] + cv_keypoints.append(cv2.KeyPoint(int(x), int(y), curr_size)) + + cv2.drawKeypoints( + frame, + cv_keypoints, + outImage=frame, + color=color, + flags=cv2.DRAW_MATCHES_FLAGS_DRAW_RICH_KEYPOINTS, + ) def draw_connections( - frame: numpy.ndarray, - points: numpy.ndarray | torch.Tensor, + frame: np.ndarray, + points: np.ndarray | torch.Tensor, connections: List[Tuple[int, int]], color: Tuple[int, int, int] = (0, 0, 0), - size: int = 3, + size: int = 1, ): """ Draw connecting lines between the given points on the frame. Parameters: - frame: numpy.ndarray - numpy array (H W C x uint8, BGR) + frame: + np array (H W C x uint8, BGR) - points: numpy.ndarray | torch.Tensor + points: array (N, 2) where layout is [x1, y1] [x2, y2], ... or array (N * 2,) where layout is x1, y1, x2, y2, ... - connections: List[Tuple[int, int]] + connections: List of points that should be connected by a line. Format is [(src point index, dst point index), ...] - color: Tuple[int, int, int] + color: Color of drawn points (RGB) size: int @@ -80,34 +91,28 @@ def draw_connections( Returns: None; modifies frame in place. """ - n2 = len(points.shape) == 2 - for connection in connections: - x0, y0 = ( - points[connection[0]] - if n2 - else (points[connection[0] * 2], points[connection[0] * 2 + 1]) - ) - x1, y1 = ( - points[connection[1]] - if n2 - else (points[connection[1] * 2], points[connection[1] * 2 + 1]) - ) - x0, y0 = int(x0), int(y0) - x1, y1 = int(x1), int(y1) - cv2.line(frame, (x0, y0), (x1, y1), color, size) + if len(points.shape) == 1: + points = points.reshape(-1, 2) + point_pairs = [ + ((int(points[i][0]), int(points[i][1])), (int(points[j][0]), int(points[j][1]))) + for (i, j) in connections + ] + cv2.polylines( + frame, np.array(point_pairs), isClosed=False, color=color, thickness=size # type: ignore + ) def draw_box_from_corners( - frame: numpy.ndarray, corners: numpy.ndarray | torch.Tensor, color=(0, 0, 0), size=3 + frame: np.ndarray, corners: np.ndarray | torch.Tensor, color=(0, 0, 0), size=3 ): """ Draw a box using the 4 points provided as boundaries. Parameters: - frame: numpy.ndarray - numpy array (H W C x uint8, BGR) + frame: np.ndarray + np array (H W C x uint8, BGR) - corners: numpy.ndarray | torch.Tensor + corners: np.ndarray | torch.Tensor array (4, 2) where layout is [x1, y1] [x2, y2], ... 
or @@ -128,8 +133,8 @@ def draw_box_from_corners( def draw_box_from_xywh( - frame: numpy.ndarray, - box: numpy.ndarray | torch.Tensor, + frame: np.ndarray, + box: np.ndarray | torch.Tensor, color: Tuple[int, int, int] = (0, 0, 0), size: int = 3, ): @@ -137,10 +142,10 @@ def draw_box_from_xywh( Draw a box using the provided data (center / height / width) to compute the box. Parameters: - frame: numpy.ndarray - numpy array (H W C x uint8, BGR) + frame: np.ndarray + np array (H W C x uint8, BGR) - box: numpy.ndarray | torch.Tensor + box: np.ndarray | torch.Tensor array (4), where layout is [xcenter, ycenter, h, w] @@ -160,9 +165,9 @@ def draw_box_from_xywh( def draw_box_from_xyxy( - frame: numpy.ndarray, - top_left: numpy.ndarray | torch.Tensor | Tuple[int, int], - bottom_right: numpy.ndarray | torch.Tensor | Tuple[int, int], + frame: np.ndarray, + top_left: np.ndarray | torch.Tensor | Tuple[int, int], + bottom_right: np.ndarray | torch.Tensor | Tuple[int, int], color: Tuple[int, int, int] = (0, 0, 0), size: int = 3, text: Optional[str] = None, @@ -171,10 +176,10 @@ def draw_box_from_xyxy( Draw a box using the provided top left / bottom right points to compute the box. Parameters: - frame: numpy.ndarray - numpy array (H W C x uint8, BGR) + frame: np.ndarray + np array (H W C x uint8, BGR) - box: numpy.ndarray | torch.Tensor + box: np.ndarray | torch.Tensor array (4), where layout is [xc, yc, h, w] @@ -217,7 +222,7 @@ def create_color_map(num_classes): Returns: A list of `num_classes` colors in RGB format. """ - numpy.random.seed(42) # For reproducible results - color_map = numpy.random.randint(0, 256, size=(num_classes, 3), dtype=numpy.uint8) + np.random.seed(42) # For reproducible results + color_map = np.random.randint(0, 256, size=(num_classes, 3), dtype=np.uint8) color_map[0] = [0, 0, 0] # Background class, usually black return color_map diff --git a/qai_hub_models/utils/huggingface.py b/qai_hub_models/utils/huggingface.py index d278d95c..4ddd9bef 100644 --- a/qai_hub_models/utils/huggingface.py +++ b/qai_hub_models/utils/huggingface.py @@ -8,7 +8,9 @@ from pathlib import Path from typing import List -from huggingface_hub import HfFileSystem, hf_hub_download +from huggingface_hub import HfApi, HfFileSystem, hf_hub_download +from huggingface_hub.utils import GatedRepoError +from packaging import version from qai_hub_models.utils.asset_loaders import ASSET_CONFIG, ModelZooAssetConfig from qai_hub_models.utils.base_model import TargetRuntime @@ -45,3 +47,37 @@ def fetch_huggingface_target_model( paths.append(path) return paths + + +def has_model_access(repo_name: str, repo_url: str): + # Huggingface returns GatedRepoError if model is not accessible to current User. + # ref: https://github.com/huggingface/huggingface_hub/blob/5ff2d150d121d04799b78bc08f2343c21b8f07a9/src/huggingface_hub/utils/_errors.py#L135 + + try: + hf_api = HfApi() + hf_api.model_info(repo_name) + except GatedRepoError: + no_access_error = ( + f"Seems like you don't have access to {repo_name} yet.\nPlease follow the following steps:" + f"\n 1. Apply for access at {repo_url}" + f"\n 2. Setup Huggingface API token as described in https://huggingface.co/docs/huggingface_hub/en/quick-start#login-command" + f"\nOnce access request is approved, you should be able to export/load {repo_name} via AI-Hub." + ) + raise RuntimeError(no_access_error) + + # Model is accesible for current User. 
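# Only GatedRepoError is caught above; any other failure from
# hf_api.model_info() (unknown repo id, network error) propagates unchanged.
# Typical use is a guard before export, e.g.
#   has_model_access("some-org/gated-model", "https://huggingface.co/some-org/gated-model")
# (hypothetical repo id, for illustration only).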
+ return True + + +def ensure_has_required_transformer(least_expected_version): + # import transformer as part of this function + # to avoid leaking installation globally on file import. + # NOTE: #10761 this function should not be required once AIMET (https://pypi.org/project/aimet-torch/) + # remove tight dependency on transformers. + import transformers + + if version.parse(transformers.__version__) < version.parse(least_expected_version): + raise RuntimeError( + f"Installed transformers version not supported. Expected >= {least_expected_version}, got {str(transformers.__version__)}\n" + f"Please run `pip install transformers=={least_expected_version}`" + ) diff --git a/qai_hub_models/utils/image_processing.py b/qai_hub_models/utils/image_processing.py index 4fb9405a..8d86f924 100644 --- a/qai_hub_models/utils/image_processing.py +++ b/qai_hub_models/utils/image_processing.py @@ -5,6 +5,7 @@ from __future__ import annotations import functools +import math from typing import Callable, List, Tuple import cv2 @@ -16,6 +17,15 @@ from torch.nn.functional import interpolate, pad from torchvision import transforms +IMAGENET_DIM = 224 +IMAGENET_TRANSFORM = transforms.Compose( + [ + transforms.Resize(256), + transforms.CenterCrop(IMAGENET_DIM), + transforms.ToTensor(), + ] +) + def app_to_net_image_inputs( pixel_values_or_image: torch.Tensor | np.ndarray | Image | List[Image], @@ -175,12 +185,15 @@ def resize_pad(image: torch.Tensor, dst_size: Tuple[int, int]): h_ratio = dst_frame_height / height w_ratio = dst_frame_width / width - if width * h_ratio > dst_frame_height: - scale = w_ratio - else: + scale = min(h_ratio, w_ratio) + if h_ratio < w_ratio: scale = h_ratio - - import math + new_height = dst_frame_height + new_width = math.floor(width * scale) + else: + scale = w_ratio + new_height = math.floor(height * scale) + new_width = dst_frame_width new_height = math.floor(height * scale) new_width = math.floor(width * scale) diff --git a/qai_hub_models/utils/inference.py b/qai_hub_models/utils/inference.py index ef927a52..e122014d 100644 --- a/qai_hub_models/utils/inference.py +++ b/qai_hub_models/utils/inference.py @@ -5,9 +5,8 @@ from __future__ import annotations import os -import tempfile from pathlib import Path -from typing import List, Mapping, Tuple +from typing import List, Mapping, Optional, Tuple import numpy as np import qai_hub as hub @@ -15,7 +14,7 @@ from qai_hub.public_rest_api import DatasetEntries from qai_hub_models.models.protocols import ExecutableModelProtocol -from qai_hub_models.utils.asset_loaders import ModelZooAssetConfig +from qai_hub_models.utils.asset_loaders import ModelZooAssetConfig, qaihm_temp_dir from qai_hub_models.utils.base_model import BaseModel, SourceModelFormat, TargetRuntime from qai_hub_models.utils.input_spec import InputSpec from qai_hub_models.utils.qai_hub_helpers import ( @@ -38,6 +37,8 @@ def prepare_compile_zoo_model_to_hub( input_spec: InputSpec | None = None, check_trace: bool = True, prepare_compile_options_only: bool = False, + external_onnx_weights: bool = False, + output_names: Optional[List[str]] = None, ) -> Tuple[str | None, str]: """ Args: @@ -86,12 +87,19 @@ def prepare_compile_zoo_model_to_hub( compilation_options = model.get_hub_compile_options(target_runtime) + if output_names is None: + output_names = [] + if is_aimet: if source_model_format == SourceModelFormat.ONNX: def export_model_func(): + print("Exporting model to ONNX and generating AIMET encodings") return model.convert_to_onnx_and_aimet_encodings( - output_path, 
model_name=model_name + output_path, + model_name=model_name, + external_weights=external_onnx_weights, + output_names=output_names, ) elif ( @@ -100,6 +108,7 @@ def export_model_func(): ): def export_model_func(): + print("Converting model to Torchscript") traced_model = model.convert_to_torchscript( input_spec=input_spec, check_trace=check_trace ) @@ -111,6 +120,7 @@ def export_model_func(): else: # Torchscript and QNN def export_model_func(): + print("Converting model to Torchscript and generating AIMET encodings") exported_model = model.convert_to_torchscript_and_aimet_encodings( # type: ignore output_path, model_name=model_name, @@ -161,7 +171,7 @@ def compile_zoo_model_to_hub( model_name = model.__class__.__name__ - with tempfile.TemporaryDirectory() as tmp_dir: + with qaihm_temp_dir() as tmp_dir: assert tmp_dir is not None source_model, compilation_options = prepare_compile_zoo_model_to_hub( model=model, @@ -218,11 +228,13 @@ def __init__( input_names: List[str], device: hub.Device, inference_options: str = "", + output_names: Optional[List[str]] = None, ): self.model = model self.input_names = input_names self.device = device self.inference_options = inference_options + self.output_names = [] if output_names is None else output_names def __call__( self, @@ -309,9 +321,12 @@ def forward( target_runtime, ) # type: ignore + outputs = output_dataset.values() # type: ignore + if len(self.output_names) > 0: + outputs = [output_dataset[out_name] for out_name in self.output_names] # type: ignore + output_torch = [ - torch.from_numpy(np.concatenate(outputs, axis=0)) - for outputs in output_dataset.values() # type: ignore + torch.from_numpy(np.concatenate(output, axis=0)) for output in outputs ] if len(output_torch) == 1: @@ -334,9 +349,8 @@ def get_uploaded_precompiled_model( model_name, model_version, f"{model_component}_model_id.cached" ) - use_cached_model = not ignore_cached_model or os.path.exists(model_id_path) uploaded_model = None - if use_cached_model: + if not ignore_cached_model: try: with open(model_id_path, "r") as model_id_file: model_id = model_id_file.readline().strip() @@ -346,8 +360,7 @@ def get_uploaded_precompiled_model( return uploaded_model except Exception: - # Try uploading model instead - use_cached_model = False + pass # Upload model on hub uploaded_model = hub.upload_model(model_path) diff --git a/qai_hub_models/utils/measurement.py b/qai_hub_models/utils/measurement.py index 2c4a8f21..b0eb555a 100644 --- a/qai_hub_models/utils/measurement.py +++ b/qai_hub_models/utils/measurement.py @@ -5,7 +5,6 @@ from __future__ import annotations import os -import tempfile from pathlib import Path from typing import List, Union @@ -13,6 +12,8 @@ import qai_hub as hub from tflite import Model as TFModel # type: ignore +from qai_hub_models.utils.asset_loaders import qaihm_temp_dir + def display_with_sig_figs(num: float, num_sig_figs: int = 3) -> str: """ @@ -103,7 +104,7 @@ def get_model_size_mb(hub_model: hub.Model) -> float: """Return target model size in MB. 
This is a special case for ease of testing""" assert hub_model is not None - with tempfile.TemporaryDirectory() as tmp_dir: + with qaihm_temp_dir() as tmp_dir: download_path = Path(tmp_dir) / "model" # Download the model into the temporary directory hub_model.download(download_path) # type: ignore diff --git a/qai_hub_models/utils/model_adapters.py b/qai_hub_models/utils/model_adapters.py index 44e94b16..720a3aa5 100644 --- a/qai_hub_models/utils/model_adapters.py +++ b/qai_hub_models/utils/model_adapters.py @@ -16,7 +16,7 @@ def flatten(obj): flattened_list = [] for item in obj: if isinstance(item, tgt_type): - flattened_list.extend(flatten(item, tgt_type)) + flattened_list.extend(flatten(item)) else: flattened_list.append(item) return flattened_list diff --git a/qai_hub_models/utils/printing.py b/qai_hub_models/utils/printing.py index 89e5ab3f..95aa9bdc 100644 --- a/qai_hub_models/utils/printing.py +++ b/qai_hub_models/utils/printing.py @@ -4,7 +4,7 @@ # --------------------------------------------------------------------- from collections import Counter from pathlib import Path -from typing import Any, Dict, List, Optional +from typing import Any, Dict, List, Optional, Union import numpy as np import qai_hub as hub @@ -20,17 +20,36 @@ _INFO_DASH = "-" * 60 +def print_with_box(data: List[str]) -> None: + """ + Print input list with box around it as follows + +-----------------------------+ + | list data 1 | + | list data 2 that is longest | + | data | + +-----------------------------+ + """ + size = max(len(line) for line in data) + size += 2 + print("+" + "-" * size + "+") + for line in data: + print("| {:<{}} |".format(line, size - 2)) + print("+" + "-" * size + "+") + + def print_inference_metrics( inference_job: hub.InferenceJob, inference_result: DatasetEntries, torch_out: List[np.ndarray], outputs_to_skip: Optional[List[int]] = None, + output_names: Optional[List[str]] = None, metrics: str = "psnr", ) -> None: + if output_names is None: + output_names = list(inference_result.keys()) inference_data = [ - np.concatenate(outputs, axis=0) for outputs in inference_result.values() + np.concatenate(inference_result[out_name], axis=0) for out_name in output_names ] - output_names = list(inference_result.keys()) df_eval = generate_comparison_metrics( torch_out, inference_data, names=output_names, metrics=metrics ) @@ -78,7 +97,7 @@ def print_profile_metrics_from_job( runtime = TargetRuntime.TFLITE elif is_qnn_hub_model(profile_job.model): runtime = TargetRuntime.QNN - elif profile_job.model.model_type == SourceModelType.ORT: + elif profile_job.model.model_type in [SourceModelType.ORT, SourceModelType.ONNX]: runtime = TargetRuntime.ORT else: raise NotImplementedError() @@ -128,18 +147,30 @@ def print_profile_metrics( def print_on_target_demo_cmd( - compile_job: hub.CompileJob, model_folder: Path, device: str + compile_job: Union[hub.CompileJob, List[hub.CompileJob]], + model_folder: Path, + device: str, ) -> None: """ Outputs a command that will run a model's demo script via inference job. 
""" - assert compile_job.wait().success - print("\nRun this model on a hosted device on sample data using:") - target_model = compile_job.get_target_model() - assert target_model is not None + if isinstance(compile_job, hub.CompileJob): + compile_job = [compile_job] + + target_model_id = [] + for job in compile_job: + assert job.wait().success + target_model = job.get_target_model() + assert target_model is not None + target_model_id.append(target_model.model_id) + + target_model_id_str = ",".join(target_model_id) + print( + f"\nRun compiled model{'s' if len(target_model_id) > 1 else ''} on a hosted device on sample data using:" + ) print( f"python {model_folder / 'demo.py'} " "--on-device " - f"--hub-model-id {target_model.model_id} " + f"--hub-model-id {target_model_id_str} " f'--device "{device}"\n' ) diff --git a/qai_hub_models/utils/qai_hub_helpers.py b/qai_hub_models/utils/qai_hub_helpers.py index 89deb1d6..d8db058e 100644 --- a/qai_hub_models/utils/qai_hub_helpers.py +++ b/qai_hub_models/utils/qai_hub_helpers.py @@ -6,7 +6,7 @@ import os from pathlib import Path -from typing import Dict, List +from typing import Dict, List, Optional import numpy as np import qai_hub as hub @@ -89,7 +89,7 @@ def export_without_hub_access( target_runtime: TargetRuntime, compile_options: str, profile_options: str, - components: List[str] | None = None, + components: Optional[List[str]] = None, ) -> List[str]: print(_WARNING_DASH) print( diff --git a/qai_hub_models/utils/quantization_aimet.py b/qai_hub_models/utils/quantization_aimet.py index 02ebd2a1..0a0a61d0 100644 --- a/qai_hub_models/utils/quantization_aimet.py +++ b/qai_hub_models/utils/quantization_aimet.py @@ -31,15 +31,16 @@ ) import shutil -import tempfile from pathlib import Path from typing import Any, Dict, List, Optional, Tuple -from zipfile import ZipFile +from zipfile import ZIP_DEFLATED, ZipFile import aimet_torch.elementwise_ops as aimet_ops import torch import torch.nn.modules as nn -from qai_hub.client import DatasetEntries +from onnx import load_model as load_onnx_model +from onnx import save_model as save_onnx_model +from qai_hub.client import DatasetEntries, Device from qai_hub_models.evaluators.base_evaluators import _DataLoader, _for_each_batch from qai_hub_models.models._shared.common import apply_module_function_recursively @@ -48,6 +49,7 @@ PretrainedHubModelProtocol, QuantizableModelProtocol, ) +from qai_hub_models.utils.asset_loaders import qaihm_temp_dir from qai_hub_models.utils.input_spec import InputSpec, make_torch_inputs @@ -58,19 +60,27 @@ def _should_tie_observers(op: torch.nn.Module) -> bool: if not hasattr(op, "_module_to_wrap"): return False wrapped_op = op._module_to_wrap - op_types_to_tie = [nn.MaxPool2d, nn.AvgPool2d, nn.Upsample, aimet_ops.Concat] + op_types_to_tie = [ + nn.MaxPool2d, + nn.AvgPool2d, + nn.Upsample, + aimet_ops.Concat, + aimet_ops.Interpolate, + ] for op_type in op_types_to_tie: if isinstance(wrapped_op, op_type): return True return False -def _get_observer_module_name(modules: Dict[str, Any], name: str) -> Optional[str]: - module = modules.get(name) +def _get_observer_module_name(modules: Dict[str, Any], target: Any) -> Optional[str]: + if not isinstance(target, str): + return None + module = modules.get(target) if isinstance(module, QcQuantizeWrapper): - return name + return target elif isinstance(module, aimet_ops.CustomSiLU): - return name + ".mul" + return target + ".mul" return None @@ -140,7 +150,13 @@ def tie_observers(quant_sim: QuantizationSimModel) -> None: modules, 
input_node.target ) ) is None: + if input_node.target == getattr: + # If the input node is getting a tensor attribute (e.g. shape) + # No observers need to be tied + break input_node = input_node.all_input_nodes[0] + if input_node.target == getattr or observer_module_name is None: + continue if observer_module_name not in quantizer_deps: quantizer_deps[observer_module_name] = [] quantizer_deps[observer_module_name].append(node.target) @@ -315,7 +331,7 @@ def convert_to_torchscript_and_aimet_encodings( zip_path = os.path.join(output_dir, f"{model_name}.aimet.zip") base_dir = Path(f"{model_name}.aimet") - with tempfile.TemporaryDirectory() as tmpdir: + with qaihm_temp_dir() as tmpdir: base_path = Path(tmpdir) / base_dir os.makedirs(base_path) self.quant_sim.export( @@ -343,6 +359,8 @@ def convert_to_onnx_and_aimet_encodings( output_dir: str | Path, input_spec: InputSpec | None = None, model_name: str | None = None, + external_weights: bool = False, + output_names: Optional[List[str]] = None, ) -> str: """ Converts the torch module to a zip file containing an @@ -357,27 +375,53 @@ def convert_to_onnx_and_aimet_encodings( zip_path = os.path.join(output_dir, f"{model_name}.aimet.zip") base_dir = Path(f"{model_name}.aimet") - with tempfile.TemporaryDirectory() as tmpdir: + with qaihm_temp_dir() as tmpdir: base_path = Path(tmpdir) / base_dir if base_path.exists(): shutil.rmtree(base_path) os.makedirs(base_path) onnx_utils.EXPORT_TO_ONNX_DIRECT = self.needs_onnx_direct_aimet_export + self.quant_sim.export( str(base_path), model_name, tuple(make_torch_inputs(input_spec)), - onnx_export_args=dict(input_names=[name for name in input_spec]), + onnx_export_args=dict( + input_names=[name for name in input_spec], output_names=output_names + ), ) - onnx_file_name = f"{model_name}.onnx" encodings_file_name = f"{model_name}.encodings" - with ZipFile(zip_path, "w") as zip_object: + external_weights_file_name = f"{model_name}.data" + + if external_weights: + # Torch exports to onnx with external weights scattered in a directory. + # Save ONNX model with weights to one file. + onnx_file_path = str(base_path / onnx_file_name) + onnx_model = load_onnx_model(onnx_file_path) + save_onnx_model( + onnx_model, + str(onnx_file_path), + save_as_external_data=True, + all_tensors_to_one_file=True, + location=external_weights_file_name, + ) + + # compresslevel defines how fine compression should run + # higher the level, heavier algorithm is used leading to more time. + # For large models, higher compression takes longer time to compress. 
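# For ZIP_DEFLATED, compresslevel follows zlib's 0-9 scale; 4 keeps the archive
# noticeably smaller than no compression while staying much faster than level 9,
# a reasonable middle ground for large ONNX exports with external weights.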
+ with ZipFile(zip_path, "w", ZIP_DEFLATED, compresslevel=4) as zip_object: zip_object.write(base_path, base_dir) + zip_object.write( base_path / onnx_file_name, os.path.join(base_dir, onnx_file_name) ) + if external_weights: + zip_object.write( + base_path / external_weights_file_name, + os.path.join(base_dir, external_weights_file_name), + ) zip_object.write( base_path / encodings_file_name, os.path.join(base_dir, encodings_file_name), @@ -391,7 +435,7 @@ def convert_to_torchscript( if not input_spec: input_spec = self.get_input_spec() - with tempfile.TemporaryDirectory() as tempdir: + with qaihm_temp_dir() as tempdir: self.quant_sim.export( tempdir, "model", @@ -415,12 +459,19 @@ def get_calibration_data( return {k: v.numpy() for k, v in zip(input_spec.keys(), inputs)} def get_hub_compile_options( - self, target_runtime: TargetRuntime, other_compile_options: str = "" + self, + target_runtime: TargetRuntime, + other_compile_options: str = "", + device: Optional[Device] = None, ) -> str: compile_options = super().get_hub_compile_options( # type: ignore - target_runtime, other_compile_options + target_runtime, other_compile_options, device ) - return compile_options + " --quantize_full_type int8 --quantize_io" + compile_options = compile_options + " --quantize_full_type int8" + if target_runtime != TargetRuntime.ORT: + # TODO(#10896): Restore quantize_io flag when targeting ORT + compile_options = compile_options + " --quantize_io" + return compile_options def preferred_hub_source_model_format( self, target_runtime: TargetRuntime diff --git a/qai_hub_models/utils/scorecard/common.py b/qai_hub_models/utils/scorecard/common.py index f08b909b..b3230a71 100644 --- a/qai_hub_models/utils/scorecard/common.py +++ b/qai_hub_models/utils/scorecard/common.py @@ -2,35 +2,262 @@ # Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. # SPDX-License-Identifier: BSD-3-Clause # --------------------------------------------------------------------- -import qai_hub as hub +import os +from enum import Enum +from typing import Dict, List, Optional, Tuple -SCORECARD_DEVICE_NAME_TO_CHIPSET_NAME = { - "s23": "qualcomm-snapdragon-8gen2", - "s24": "qualcomm-snapdragon-8gen3", - "6490": "qualcomm-qcs6490", - "8250": "qualcomm-qcs8250", - "8550": "qualcomm-qcs8550", -} +import qai_hub as hub +from qai_hub_models.models.common import TargetRuntime -SCORECARD_DEVICE_NAME_TO_CHIPSET = { - device: f"chipset:{chipset}" - for device, chipset in SCORECARD_DEVICE_NAME_TO_CHIPSET_NAME.items() -} +_DEVICE_CACHE: Dict[str, hub.Device] = {} -def __get_device(device_name) -> hub.Device: +def _get_cached_device(device_name: str) -> hub.Device: # Gets a device with attributes & OS. 
This only comes from hub.get_devices() - for device in hub.get_devices(): - if device.name == device_name: - return device - raise ValueError(f"No device named {device_name}") - - -REFERENCE_DEVICE_PER_SUPPORTED_CHIPSETS = { - "qualcomm-snapdragon-8gen2": __get_device("Samsung Galaxy S23"), - "qualcomm-snapdragon-8gen3": __get_device("Samsung Galaxy S24"), - "qualcomm-qcs6490": __get_device("RB3 Gen 2 (Proxy)"), - "qualcomm-qcs8250": __get_device("RB5 (Proxy)"), - "qualcomm-qcs8550": __get_device("QCS8550 (Proxy)"), -} + device = _DEVICE_CACHE.get(device_name, None) + if not device: + device = hub.get_devices(device_name)[0] + _DEVICE_CACHE[device_name] = device + return device + + +class ScorecardDevice(Enum): + any = 0 # no specific device (usable only during compilation) + + # cs == chipset + cs_8_gen_2 = 1 + cs_8_gen_3 = 2 + cs_6490 = 3 + cs_8250 = 4 + cs_8550 = 5 + cs_x_elite = 6 + + def enabled(self) -> bool: + valid_test_devices = os.environ.get("WHITELISTED_PROFILE_TEST_DEVICES", "ALL") + return ( + valid_test_devices == "ALL" + or self == ScorecardDevice.any + or self.name in valid_test_devices.split(",") + ) + + def all_enabled(self) -> List["ScorecardDevice"]: + return [x for x in ScorecardDevice if x.enabled()] + + def get_reference_device(self) -> hub.Device: + if self in [ScorecardDevice.cs_8_gen_2, ScorecardDevice.any]: + return _get_cached_device("Samsung Galaxy S23") + if self == ScorecardDevice.cs_8_gen_3: + return _get_cached_device("Samsung Galaxy S24") + if self == ScorecardDevice.cs_6490: + return _get_cached_device("RB3 Gen 2 (Proxy)") + if self == ScorecardDevice.cs_8250: + return _get_cached_device("RB5 (Proxy)") + if self == ScorecardDevice.cs_8550: + return _get_cached_device("QCS8550 (Proxy)") + if self == ScorecardDevice.cs_x_elite: + return _get_cached_device("Snapdragon X Elite CRD") + raise NotImplementedError(f"No reference device for {self.name}") + + def get_chipset(self) -> str: + if self in [ScorecardDevice.cs_8_gen_2, ScorecardDevice.any]: + return "qualcomm-snapdragon-8gen2" + if self == ScorecardDevice.cs_8_gen_3: + return "qualcomm-snapdragon-8gen3" + if self == ScorecardDevice.cs_6490: + return "qualcomm-qcs6490" + if self == ScorecardDevice.cs_8250: + return "qualcomm-qcs8250" + if self == ScorecardDevice.cs_8550: + return "qualcomm-qcs8550" + if self == ScorecardDevice.cs_x_elite: + return "qualcomm-snapdragon-x-elite" + raise NotImplementedError(f"No chipset for {self.name}") + + def get_os(self) -> str: + for attr in self.get_reference_device().attributes: + if attr.startswith("os:"): + return attr[3:] + raise ValueError(f"OS Not found for device: {self.name}") + + +class ScorecardCompilePath(Enum): + TFLITE = 0 + QNN = 1 + ORT = 2 + + def __str__(self): + return self.name.lower() + + @property + def long_name(self): + return f"torchscript_onnx_{self.name.lower()}" + + def enabled(self) -> bool: + valid_test_runtimes = os.environ.get("WHITELISTED_TEST_RUNTIMES", "ALL") + return valid_test_runtimes == "ALL" or ( + self.get_runtime().name.lower() + in [x.lower() for x in valid_test_runtimes.split(",")] + ) + + @staticmethod + def all_enabled() -> List["ScorecardCompilePath"]: + return [x for x in ScorecardCompilePath if x.enabled()] + + @staticmethod + def get_parameterized_test_config( + aimet_model=False, + only_enabled_paths=True, + only_enabled_devices=True, + ) -> List[Tuple["ScorecardCompilePath", ScorecardDevice]]: + path_list: List[ScorecardCompilePath] = ScorecardCompilePath.all_enabled() if only_enabled_paths else ScorecardCompilePath # 
type: ignore + path_devices_dict = { + sc_path: sc_path.get_test_devices(aimet_model, only_enabled_devices) + for sc_path in path_list + } + return [ + (key, dev) for key, devices in path_devices_dict.items() for dev in devices + ] + + def get_runtime(self) -> TargetRuntime: + if self == ScorecardCompilePath.TFLITE: + return TargetRuntime.TFLITE + if self == ScorecardCompilePath.ORT: + return TargetRuntime.ORT + if self == ScorecardCompilePath.QNN: + return TargetRuntime.QNN + raise NotImplementedError() + + def get_test_devices( + self, aimet_model=False, only_enabled=True + ) -> List[ScorecardDevice]: + if self == ScorecardCompilePath.QNN: + devices = [ScorecardDevice.any, ScorecardDevice.cs_x_elite] + else: + devices = [ScorecardDevice.any] + + return [x for x in devices if x.enabled()] if only_enabled else devices + + def get_compile_options(self, aimet_model=False) -> str: + if aimet_model and self.get_runtime() == TargetRuntime.ORT: + # TODO(#10896): Restore quantize_io flag to + # the default set of flags used to target ORT. + # This flag can be removed when that happens. + return "--quantize_io" + return "" + + def get_job_cache_name( + self, + model: str, + device: ScorecardDevice = ScorecardDevice.any, + component: Optional[str] = None, + ): + if device not in self.get_test_devices(): + device = ScorecardDevice.any # default to the "generic" compilation path + return f"{model}_{self.name}{'-' + device.name if device != ScorecardDevice.any else ''}{'_' + component if component else ''}" + + +class ScorecardProfilePath(Enum): + TFLITE = 0 + QNN = 1 + ORT = 2 + ORT_DML_GPU = 3 + + def __str__(self): + return self.name.lower() + + @property + def long_name(self): + return f"torchscript_onnx_{self.name.lower()}" + + def enabled(self) -> bool: + valid_test_runtimes = os.environ.get("WHITELISTED_TEST_RUNTIMES", "ALL") + return valid_test_runtimes == "ALL" or ( + self.get_runtime().name.lower() + in [x.lower() for x in valid_test_runtimes.split(",")] + ) + + @staticmethod + def all_enabled() -> List["ScorecardProfilePath"]: + return [x for x in ScorecardProfilePath if x.enabled()] + + @staticmethod + def get_parameterized_test_config( + aimet_model=False, + only_enabled_paths=True, + only_enabled_devices=True, + ) -> List[Tuple["ScorecardProfilePath", ScorecardDevice]]: + path_list: List[ScorecardProfilePath] = ScorecardProfilePath.all_enabled() if only_enabled_paths else ScorecardProfilePath # type: ignore + path_devices_dict = { + sc_path: sc_path.get_test_devices(aimet_model, only_enabled_devices) + for sc_path in path_list + } + return [ + (key, dev) for key, devices in path_devices_dict.items() for dev in devices + ] + + def get_runtime(self) -> TargetRuntime: + if self == ScorecardProfilePath.TFLITE: + return TargetRuntime.TFLITE + if self in [ScorecardProfilePath.ORT, ScorecardProfilePath.ORT_DML_GPU]: + return TargetRuntime.ORT + if self == ScorecardProfilePath.QNN: + return TargetRuntime.QNN + raise NotImplementedError() + + def get_compile_path(self) -> ScorecardCompilePath: + if self == ScorecardProfilePath.TFLITE: + return ScorecardCompilePath.TFLITE + if self in [ScorecardProfilePath.ORT, ScorecardProfilePath.ORT_DML_GPU]: + return ScorecardCompilePath.ORT + if self == ScorecardProfilePath.QNN: + return ScorecardCompilePath.QNN + raise NotImplementedError() + + def get_profile_options(self) -> str: + if self == ScorecardProfilePath.ORT_DML_GPU: + return "--compute_unit gpu" + return "" + + def get_test_devices( + self, aimet_model=False, only_enabled=True + ) -> 
List[ScorecardDevice]: + if self == ScorecardProfilePath.TFLITE: + devices = [ + ScorecardDevice.cs_8_gen_2, + ScorecardDevice.cs_8_gen_3, + ScorecardDevice.cs_8550, + ] + ( + [ScorecardDevice.cs_6490, ScorecardDevice.cs_8250] + if aimet_model + else [] + ) + elif self == ScorecardProfilePath.ORT: + devices = [ + ScorecardDevice.cs_8_gen_2, + ScorecardDevice.cs_8_gen_3, + ScorecardDevice.cs_x_elite, + ] + elif self == ScorecardProfilePath.QNN: + devices = [ + ScorecardDevice.cs_8_gen_2, + ScorecardDevice.cs_8_gen_3, + ScorecardDevice.cs_x_elite, + ScorecardDevice.cs_8550, + ] + ([ScorecardDevice.cs_6490] if aimet_model else []) + elif self == ScorecardProfilePath.ORT_DML_GPU: + devices = [ScorecardDevice.cs_x_elite] + else: + raise NotImplementedError() + + return [x for x in devices if x.enabled()] if only_enabled else devices + + def get_job_cache_name( + self, + model: str, + device: ScorecardDevice, + component: Optional[str] = None, + ): + return ( + f"{model}_{self.name}-{device.name}{'_' + component if component else ''}" + ) diff --git a/qai_hub_models/utils/scorecard/job_summary.py b/qai_hub_models/utils/scorecard/job_summary.py index a9fc112f..683dda40 100644 --- a/qai_hub_models/utils/scorecard/job_summary.py +++ b/qai_hub_models/utils/scorecard/job_summary.py @@ -8,11 +8,11 @@ import qai_hub as hub -from qai_hub_models.models.common import TargetRuntime from qai_hub_models.utils.config_loaders import QAIHMModelCodeGen, QAIHMModelInfo from qai_hub_models.utils.scorecard.common import ( - REFERENCE_DEVICE_PER_SUPPORTED_CHIPSETS, - SCORECARD_DEVICE_NAME_TO_CHIPSET_NAME, + ScorecardCompilePath, + ScorecardDevice, + ScorecardProfilePath, ) @@ -20,11 +20,10 @@ class JobSummary: model_id: str job_id: Optional[str] - runtime: TargetRuntime + _device: ScorecardDevice def __post_init__(self): assert self.model_id - assert self.runtime # Verify Job Exists if self.job_id: assert self.job @@ -100,6 +99,8 @@ def quantized(self) -> str: @dataclass class CompileJobSummary(JobSummary): + path: ScorecardCompilePath + @classmethod def from_model_id( cls: Type["CompileJobSummary"], model_id: str, job_ids: Dict[str, str] @@ -123,25 +124,25 @@ def from_model_id( components = model_code_gen.default_components else: components = list(model_code_gen.components.keys()) + else: + components.append(None) # type: ignore - for runtime in TargetRuntime: - if not components: - model_runs.append( - cls( - model_id=model_info.name, - job_id=job_ids.get(f"{model_id}_{runtime.name}", None), - runtime=runtime, - ) - ) - else: - for component in components: + path: ScorecardCompilePath + for path in ScorecardCompilePath.all_enabled(): + for component in components: + for device in path.get_test_devices(model_code_gen.is_aimet): model_runs.append( cls( - model_id=component, + model_id=component or model_info.name, job_id=job_ids.get( - f"{model_id}_{runtime.name}_{component}", None + path.get_job_cache_name( + model=model_id, + device=device, + component=component, + ) ), - runtime=runtime, + path=path, + _device=device, ) ) @@ -162,7 +163,7 @@ def compile_job(self) -> Optional[hub.CompileJob]: @dataclass class ProfileJobSummary(JobSummary): - _chipset: str + path: ScorecardProfilePath @classmethod def from_model_id( @@ -187,43 +188,33 @@ def from_model_id( components = model_code_gen.default_components else: components = list(model_code_gen.components.keys()) + else: + components.append(None) # type: ignore - for runtime in TargetRuntime: - for device, chipset in SCORECARD_DEVICE_NAME_TO_CHIPSET_NAME.items(): - 
run_dev = f"{runtime.name}-{device}" - if not components: - if (job_id := job_ids.get(f"{model_id}_{run_dev}", None)) is None: - continue + path: ScorecardProfilePath + for path in ScorecardProfilePath.all_enabled(): + for component in components: + for device in path.get_test_devices(model_code_gen.is_aimet): model_runs.append( cls( - model_id=model_info.name, - job_id=job_id, - runtime=runtime, - _chipset=chipset, + model_id=component or model_info.name, + job_id=job_ids.get( + path.get_job_cache_name( + model=model_id, + device=device, + component=component, + ), + None, + ), + _device=device, + path=path, ) ) - else: - for component in components: - if ( - job_id := job_ids.get( - f"{model_id}_{run_dev}_{component}", None - ) - ) is None: - continue - model_runs.append( - cls( - model_id=component, - job_id=job_id, - runtime=runtime, - _chipset=chipset, - ) - ) return model_runs def __post_init__(self): super().__post_init__() - assert self.chipset in REFERENCE_DEVICE_PER_SUPPORTED_CHIPSETS if not self.skipped: assert isinstance(self.job, hub.ProfileJob) if self._job_status.success: @@ -233,7 +224,7 @@ def __post_init__(self): def chipset(self) -> str: """Chipset the job was run on.""" if not self.job: - return self._chipset + return self._device.get_chipset() hub_device = self.job.device for attr in hub_device.attributes: @@ -243,11 +234,7 @@ def chipset(self) -> str: @cached_property def device(self) -> hub.Device: - return ( - self.job.device - if self.job - else REFERENCE_DEVICE_PER_SUPPORTED_CHIPSETS[self.chipset] - ) + return self.job.device if self.job else self._device.get_reference_device() @cached_property def profile_job(self) -> Optional[hub.ProfileJob]: diff --git a/qai_hub_models/utils/scorecard/model_card.py b/qai_hub_models/utils/scorecard/model_card.py index 1b989dbc..ae5798e4 100644 --- a/qai_hub_models/utils/scorecard/model_card.py +++ b/qai_hub_models/utils/scorecard/model_card.py @@ -13,10 +13,11 @@ import qai_hub as hub -from qai_hub_models.models.common import TargetRuntime from qai_hub_models.utils.config_loaders import MODEL_IDS from qai_hub_models.utils.scorecard.common import ( - REFERENCE_DEVICE_PER_SUPPORTED_CHIPSETS, + ScorecardCompilePath, + ScorecardDevice, + ScorecardProfilePath, ) from qai_hub_models.utils.scorecard.job_summary import ( CompileJobSummary, @@ -127,9 +128,10 @@ def supported_oses() -> List[str]: __REFERENCE_DEVICE_INFO_PER_CHIPSET = {} -def get_reference_device_info(chipset: str) -> Dict[str, str]: +def get_reference_device_info(device: ScorecardDevice) -> Dict[str, str]: + chipset = device.get_chipset() if chipset not in __REFERENCE_DEVICE_INFO_PER_CHIPSET: - hub_device = REFERENCE_DEVICE_PER_SUPPORTED_CHIPSETS[chipset] + hub_device = device.get_reference_device() device_name = hub_device.name os_version = hub_device.os os_name, form_factor, manufacturer = "", "", "" @@ -153,28 +155,26 @@ def get_reference_device_info(chipset: str) -> Dict[str, str]: @dataclass -class ChipsetPerfSummary: - chipset_name: str - run_per_runtime: Dict[TargetRuntime, ProfileJobSummary] # Map +class DevicePerfSummary: + device: ScorecardDevice + run_per_path: Dict[ScorecardProfilePath, ProfileJobSummary] # Map @staticmethod - def from_runs(chipset_name: str, runtime_runs: List[ProfileJobSummary]): + def from_runs(device: ScorecardDevice, path_runs: List[ProfileJobSummary]): # Figure out unique devices in various baselines - run_per_runtime: Dict[TargetRuntime, ProfileJobSummary] = {} - for run in runtime_runs: - assert run.chipset == chipset_name # Chipset 
should match - run_per_runtime[run.runtime] = run + run_per_path: Dict[ScorecardProfilePath, ProfileJobSummary] = {} + for run in path_runs: + assert run._device == device # Device should match + run_per_path[run.path] = run - return ChipsetPerfSummary(chipset_name, run_per_runtime) + return DevicePerfSummary(device, run_per_path) def get_perf_card(self) -> Dict[str, str | Dict[str, str]]: perf_card: Dict[str, str | Dict[str, str]] = {} - for runtime, run in self.run_per_runtime.items(): + for path, run in self.run_per_path.items(): if not run.skipped: # Skipped runs are not included - perf_card[runtime.long_name] = run.performance_metrics - perf_card["reference_device_info"] = get_reference_device_info( - self.chipset_name - ) + perf_card[path.long_name] = run.performance_metrics + perf_card["reference_device_info"] = get_reference_device_info(self.device) perf_card["timestamp"] = datetime.datetime.utcnow().isoformat() + "Z" return perf_card @@ -185,29 +185,31 @@ def __repr__(self) -> str: @dataclass class ModelPerfSummary: model_id: str - runs_per_chipset: Dict[str, ChipsetPerfSummary] # Map + runs_per_device: Dict[ + ScorecardDevice, DevicePerfSummary + ] # Map @staticmethod def from_runs(model_id: str, device_runs: List[ProfileJobSummary]): # Figure out unique devices in various baselines - runs_per_chipset: Dict[str, List[ProfileJobSummary]] = {} + runs_per_device: Dict[ScorecardDevice, List[ProfileJobSummary]] = {} for run in device_runs: assert run.model_id == model_id # All should have the same model ID - list = runs_per_chipset.get(run.chipset or "", []) - runs_per_chipset[run.chipset] = list + list = runs_per_device.get(run._device, []) + runs_per_device[run._device] = list list.append(run) return ModelPerfSummary( model_id, { - chipset_name: ChipsetPerfSummary.from_runs(chipset_name, runs) - for chipset_name, runs in runs_per_chipset.items() + device: DevicePerfSummary.from_runs(device, runs) + for device, runs in runs_per_device.items() }, ) def get_perf_card(self) -> List[Dict[str, Union[str, Dict[str, str]]]]: perf_card = [] - for summary in self.runs_per_chipset.values(): + for summary in self.runs_per_device.values(): perf_card.append(summary.get_perf_card()) return perf_card @@ -226,8 +228,8 @@ def from_model_ids( """ Reads jobs for every `model_id` from the dictionary and creates summaries for each. `job_ids` format: Either: - ||| : job_id - || : job_id + _-_ : job_id + _- : job_id Returns models in this format: model_id: List[Summary] @@ -248,8 +250,8 @@ def from_model_id( """ Reads jobs for every `model_id` from the dictionary and creates summaries for each. 
`job_ids` format: Either: - <model_id>|<runtime>|<device>|<component> : job_id - <model_id>|<runtime>|<device> : job_id + <model_id>_<path>-<device>_<component> : job_id + <model_id>_<path>-<device> : job_id Returns models in this format: model_id: List[Summary] @@ -277,7 +279,9 @@ def from_runs(model_runs: List[ProfileJobSummary]): def get_chipsets(self) -> Set[str]: chips: Set[str] = set() for _, model_summary in self.runs_per_model.items(): - chips.update(model_summary.runs_per_chipset.keys()) + chips.update( + [x.get_chipset() for x in model_summary.runs_per_device.keys()] + ) return chips def get_perf_card(self) -> Dict[str, str | List[Any] | Dict[str, Any]]: @@ -302,20 +306,44 @@ def __repr__(self): return pprint.pformat(self.get_perf_card()) +@dataclass +class DeviceCompileSummary: + device: ScorecardDevice + run_per_path: Dict[ScorecardCompilePath, CompileJobSummary] # Map + + @staticmethod + def from_runs(device: ScorecardDevice, path_runs: List[CompileJobSummary]): + # Figure out unique devices in various baselines + run_per_path: Dict[ScorecardCompilePath, CompileJobSummary] = {} + for run in path_runs: + assert run._device == device # Device should match + run_per_path[run.path] = run + + return DeviceCompileSummary(device, run_per_path) + + @dataclass class ModelCompileSummary: model_id: str - runs_per_runtime: Dict[ - TargetRuntime, CompileJobSummary + runs_per_device: Dict[ + ScorecardDevice, DeviceCompileSummary ] # Map @staticmethod - def from_runs(model_id: str, runtime_runs: List[CompileJobSummary]): - run_per_runtime: Dict[TargetRuntime, CompileJobSummary] = {} - for run in runtime_runs: + def from_runs(model_id: str, path_runs: List[CompileJobSummary]): + runs_per_device: Dict[ScorecardDevice, List[CompileJobSummary]] = {} + for run in path_runs: assert run.model_id == model_id # model id should match - run_per_runtime[run.runtime] = run - return ModelCompileSummary(model_id, run_per_runtime) + list = runs_per_device.get(run._device, []) + runs_per_device[run._device] = list + list.append(run) + return ModelCompileSummary( + model_id, + { + device: DeviceCompileSummary.from_runs(device, runs) + for device, runs in runs_per_device.items() + }, + ) @dataclass @@ -329,8 +357,10 @@ def from_model_ids( """ Reads jobs for every `model_id` from the dictionary and creates summaries for each. `job_ids` format: Either: - <model_id>|<runtime>|<device>|<component> : job_id - <model_id>|<runtime>|<device> : job_id + <model_id>_<path>-<device>_<component> : job_id + <model_id>_<path>-<device> : job_id + <model_id>_<path>_<component> : job_id + <model_id>_<path> : job_id Returns models in this format: model_id: List[Summary] @@ -351,8 +381,10 @@ def from_model_id( """ Reads jobs for every `model_id` from the dictionary and creates summaries for each. `job_ids` format: Either: - <model_id>|<runtime>|<device>|<component> : job_id - <model_id>|<runtime>|<device> : job_id + <model_id>_<path>-<device>_<component> : job_id + <model_id>_<path>-<device> : job_id + <model_id>_<path>_<component> : job_id + <model_id>_<path> : job_id Returns models in this format: model_id: List[Summary] diff --git a/qai_hub_models/utils/system_info.py b/qai_hub_models/utils/system_info.py new file mode 100644 index 00000000..50f8d531 --- /dev/null +++ b/qai_hub_models/utils/system_info.py @@ -0,0 +1,49 @@ +# --------------------------------------------------------------------- +# Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. +# SPDX-License-Identifier: BSD-3-Clause +# --------------------------------------------------------------------- + +import psutil + +from qai_hub_models.utils.printing import print_with_box + + +def has_recommended_memory(required_memory_in_gb: float) -> None: + """ + Prints a warning if the system has less memory (RAM + swap space) than recommended. 
+ """ + total_ram = psutil.virtual_memory().total + total_swap = psutil.swap_memory().total + + # Get total memory in GB + total_ram_in_gb = total_ram / 1024**3 + total_swap_in_gb = total_swap / 1024**3 + + total_memory_in_gb = int(total_ram_in_gb + total_swap_in_gb) + + if required_memory_in_gb > total_memory_in_gb: + recommended_swap = int(required_memory_in_gb - total_ram_in_gb) + 1 + warning_msgs = [ + f"Recommended minimum memory of {required_memory_in_gb} GB memory (RAM + swap-space), found {total_memory_in_gb} GB.", + "You might see process killed error due to OOM during export/demo.", + "", + "Please increase your swap-space temporarily as a work-around. It might slow down export but allow you to export successfully.", + "You can refer to https://askubuntu.com/questions/178712/how-to-increase-swap-space for instructions", + "or run following commands: ", + "", + "sudo swapoff -a", + "# bs=", + "# count=number of to allocate for swapfile", + "# Total size = * count", + "# = 1 MB * 40k = ~40GB", + f"sudo dd if=/dev/zero of=/local/mnt/swapfile bs=1M count={recommended_swap}k", + "" "# Set the correct permissions", + "sudo chmod 0600 /local/mnt/swapfile", + "", + "sudo mkswap /local/mnt/swapfile # Set up a Linux swap area", + "sudo swapon /local/mnt/swapfile # Turn the swap on", + "", + "You can update `count` to increase swap space that works for machine." + "NOTE: above commands does not persist through reboot.", + ] + print_with_box(warning_msgs) diff --git a/scripts/build_and_test.py b/scripts/build_and_test.py index 7768a28f..471bae6c 100755 --- a/scripts/build_and_test.py +++ b/scripts/build_and_test.py @@ -374,7 +374,7 @@ def test_all_models(self, plan: Plan, step_id: str = "test_all_models") -> str: PyTestModelsTask( self.python_executable, all_models, - [], + REPRESENTATIVE_EXPORT_MODELS, self.venv_path, venv_for_each_model=False, use_shared_cache=True, diff --git a/scripts/examples/quantize_yolo.py b/scripts/examples/quantize_detector_coco.py similarity index 92% rename from scripts/examples/quantize_yolo.py rename to scripts/examples/quantize_detector_coco.py index 66f3df10..b5e405e3 100644 --- a/scripts/examples/quantize_yolo.py +++ b/scripts/examples/quantize_detector_coco.py @@ -3,8 +3,8 @@ # SPDX-License-Identifier: BSD-3-Clause # --------------------------------------------------------------------- """ -This is a sample script showing how to take a AIMET model zoo model without -pre-computed activations, and compute those activations using QAIHM. +This is a sample script showing how to compute AIMET encodings for a YOLO model + using the COCO dataset. This script assumes the model is added to QAIHM, but is missing quantization parameters. """ import argparse @@ -14,6 +14,7 @@ from torch.utils.data import DataLoader from qai_hub_models.datasets.coco import CocoDataset +from qai_hub_models.models.yolonas_quantized.model import YoloNASQuantizable from qai_hub_models.models.yolov7_quantized.model import YoloV7Quantizable from qai_hub_models.models.yolov8_det_quantized.model import YoloV8DetectorQuantizable @@ -25,6 +26,7 @@ MODELS = { "yolov7": YoloV7Quantizable, "yolov8": YoloV8DetectorQuantizable, + "yolonas": YoloNASQuantizable, } if __name__ == "__main__": diff --git a/scripts/examples/quantize_ffnet.py b/scripts/examples/quantize_ffnet.py index beb54b5b..eca22d69 100644 --- a/scripts/examples/quantize_ffnet.py +++ b/scripts/examples/quantize_ffnet.py @@ -2,6 +2,11 @@ # Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. 
# SPDX-License-Identifier: BSD-3-Clause # --------------------------------------------------------------------- +""" +This is a sample script showing how to compute AIMET encodings for an FFNet model + using the Cityscapes dataset. +This script assumes the model is added to QAISM, but is missing quantization parameters. +""" import argparse from pathlib import Path @@ -20,13 +25,6 @@ "ffnet_78s": FFNet78SQuantizable, } - -""" -This is a sample script showing how to take a AIMET model zoo model without -pre-computed activations, and compute those activations using QAISM. - -This script assumes the model is added to QAISM, but is missing quantization parameters. -""" if __name__ == "__main__": # Args parser = argparse.ArgumentParser() diff --git a/scripts/examples/quantize_hrnet.py b/scripts/examples/quantize_hrnet.py index b8c40b13..835beb6c 100644 --- a/scripts/examples/quantize_hrnet.py +++ b/scripts/examples/quantize_hrnet.py @@ -3,9 +3,8 @@ # SPDX-License-Identifier: BSD-3-Clause # --------------------------------------------------------------------- """ -This is a sample script showing how to take a AIMET model zoo model without -pre-computed activations, and compute those activations using QAIHM. - +This is a sample script showing how to compute AIMET encodings for an HRNet model + using the COCO dataset. This script assumes the model is added to QAIHM, but is missing quantization parameters. """ import argparse diff --git a/scripts/examples/quantize_imagenet_classifier.py b/scripts/examples/quantize_imagenet_classifier.py index 980bfef6..907e877b 100644 --- a/scripts/examples/quantize_imagenet_classifier.py +++ b/scripts/examples/quantize_imagenet_classifier.py @@ -3,8 +3,8 @@ # SPDX-License-Identifier: BSD-3-Clause # --------------------------------------------------------------------- """ -This is a sample script showing how to take a AIMET model zoo model without -pre-computed activations, and compute those activations using QAIHM. +This is a sample script showing how to compute AIMET encodings for an + Imagenet Classifier using the Imagenette dataset. This script assumes the model is added to QAIHM, but is missing quantization parameters. 
""" import argparse @@ -14,6 +14,12 @@ from torch.utils.data import DataLoader from qai_hub_models.datasets.imagenette import ImagenetteDataset +from qai_hub_models.models.convnext_tiny_w8a8_quantized.model import ( + ConvNextTinyW8A8Quantizable, +) +from qai_hub_models.models.convnext_tiny_w8a16_quantized.model import ( + ConvNextTinyW8A16Quantizable, +) from qai_hub_models.models.googlenet_quantized.model import GoogLeNetQuantizable from qai_hub_models.models.inception_v3_quantized.model import InceptionNetV3Quantizable from qai_hub_models.models.mobilenet_v2_quantized.model import MobileNetV2Quantizable @@ -45,6 +51,8 @@ "shufflenet_v2": ShufflenetV2Quantizable, "squeezenet1_1": SqueezeNetQuantizable, "wideresnet50": WideResNet50Quantizable, + "convnext_tiny_w8a8": ConvNextTinyW8A8Quantizable, + "convnext_tiny_w8a16": ConvNextTinyW8A16Quantizable, } if __name__ == "__main__": diff --git a/scripts/examples/quantize_deeplabv3.py b/scripts/examples/quantize_segmenter_voc.py similarity index 91% rename from scripts/examples/quantize_deeplabv3.py rename to scripts/examples/quantize_segmenter_voc.py index ad160ee7..2efa487a 100644 --- a/scripts/examples/quantize_deeplabv3.py +++ b/scripts/examples/quantize_segmenter_voc.py @@ -3,9 +3,8 @@ # SPDX-License-Identifier: BSD-3-Clause # --------------------------------------------------------------------- """ -This is a sample script showing how to take a AIMET model zoo model without -pre-computed activations, and compute those activations using QAIHM. - +This is a sample script showing how to compute AIMET encodings for a DeepLab model + using the PASCAL VOC dataset. This script assumes the model is added to QAIHM, but is missing quantization parameters. """ import argparse @@ -18,9 +17,11 @@ from qai_hub_models.models.deeplabv3_plus_mobilenet_quantized.model import ( DeepLabV3PlusMobilenetQuantizable, ) +from qai_hub_models.models.fcn_resnet50_quantized.model import FCN_ResNet50Quantizable MODELS = { "deeplabv3_plus_mobilenet": DeepLabV3PlusMobilenetQuantizable, + "fcn_resnet50": FCN_ResNet50Quantizable, } if __name__ == "__main__": @@ -69,7 +70,6 @@ evaluator = model.get_evaluator() evaluator.add_from_dataset(model, dataloader, args.num_iter) accuracy_fp32 = evaluator.get_accuracy_score() - model.quantize(dataloader, args.num_iter, data_has_gt=True) evaluator.reset() evaluator.add_from_dataset(model, dataloader, args.num_iter) diff --git a/scripts/examples/quantize_superresolution.py b/scripts/examples/quantize_superresolution.py index 0a354e54..6807c7d2 100644 --- a/scripts/examples/quantize_superresolution.py +++ b/scripts/examples/quantize_superresolution.py @@ -3,53 +3,88 @@ # SPDX-License-Identifier: BSD-3-Clause # --------------------------------------------------------------------- """ -This is a sample script showing how to take a AIMET model zoo model without -pre-computed activations, and compute those activations using QAISM. - +This is a sample script showing how to compute AIMET encodings for an SuperResolution + model using the BSD300 dataset. This script assumes the model is added to QAISM, but is missing quantization parameters. 
""" import argparse -import importlib from pathlib import Path import torch from torch.utils.data import DataLoader from qai_hub_models.datasets.bsd300 import BSD300Dataset +from qai_hub_models.models.quicksrnetlarge_quantized.model import ( + QuickSRNetLargeQuantizable, +) +from qai_hub_models.models.quicksrnetmedium_quantized.model import ( + QuickSRNetMediumQuantizable, +) +from qai_hub_models.models.quicksrnetsmall_quantized.model import ( + QuickSRNetSmallQuantizable, +) +from qai_hub_models.models.xlsr_quantized.model import XLSRQuantizable from qai_hub_models.utils.quantization_aimet import ( # isort: skip AIMETQuantizableMixin, ) +MODELS = { + "xlsr": XLSRQuantizable, + "quicksrnetsmall": QuickSRNetSmallQuantizable, + "quicksrnetmedium": QuickSRNetMediumQuantizable, + "quicksrnetlarge": QuickSRNetLargeQuantizable, +} + + if __name__ == "__main__": # Args parser = argparse.ArgumentParser() parser.add_argument( - "--num-iter", type=int, default=1, help="Number of batches to use." + "--num-iter", type=int, default=128, help="Number of batches to use." ) parser.add_argument( "--batch-size", type=int, - default=128, + default=1, help="Batch size to use on each iteration.", ) parser.add_argument( "--model", type=str, - default="sesr_m5_quantized", + choices=MODELS.keys(), + required=True, help="Name of the model folder to compute encodings. This script expects a super resolution model with a scaling parameter, eg SESR M5 Quantized.", ) + parser.add_argument( + "--output-dir", + type=str, + default=None, + help="Directory where encodings should be stored. Defaults to ./build.", + ) + parser.add_argument( + "--output-name", + type=str, + default=None, + help="Encodings filename. Defaults to _encodings.", + ) parser.add_argument( "--seed", type=int, default=42, help="Manual seed to ensure reproducibility for quantization.", ) + parser.add_argument( + "--scale-factor", + type=int, + default=4, + help="Scaling factor of the model.", + ) args = parser.parse_args() - module = importlib.import_module(f"qai_hub_models.models.{args.model}") + model = MODELS[args.model].from_pretrained(aimet_encodings=None) # Load dataset - dataset = BSD300Dataset(scaling_factor=module.model.SCALING_FACTOR) + dataset = BSD300Dataset(scaling_factor=args.scale_factor) torch.manual_seed(args.seed) # Pass it to the dataloader dataloader = DataLoader( @@ -57,7 +92,6 @@ ) # Load model and confirm it's a quantizable type. 
- model = module.Model.from_pretrained(aimet_encodings=None) assert isinstance(model, AIMETQuantizableMixin) evaluator = model.get_evaluator() @@ -73,8 +107,10 @@ evaluator.add_from_dataset(model, dataloader, args.num_iter) accuracy_int8 = evaluator.get_accuracy_score() - print(f"FP32 PSNR: {accuracy_fp32} dB") - print(f"INT8 PSNR: {accuracy_int8} dB") + print(f"FP32 PSNR: {accuracy_fp32:.2f} dB") + print(f"INT8 PSNR: {accuracy_int8:.2f} dB") # Export encodings - model.quant_sim.save_encodings_to_json(Path() / "build", module.MODEL_ID) + output_path = args.output_dir or str(Path() / "build") + output_name = args.output_name or f"{args.model}_quantized_encodings" + model.quant_sim.save_encodings_to_json(output_path, output_name) diff --git a/scripts/examples/test_numerics_imagenet_classifier_quantized.py b/scripts/examples/test_numerics_imagenet_classifier_quantized.py index e278ff1f..d9191056 100644 --- a/scripts/examples/test_numerics_imagenet_classifier_quantized.py +++ b/scripts/examples/test_numerics_imagenet_classifier_quantized.py @@ -16,6 +16,12 @@ from qai_hub_models.datasets.imagenette import ImagenetteDataset from qai_hub_models.models._shared.imagenet_classifier.model import ImagenetClassifier +from qai_hub_models.models.convnext_tiny_w8a8_quantized.model import ( + ConvNextTinyW8A8Quantizable, +) +from qai_hub_models.models.convnext_tiny_w8a16_quantized.model import ( + ConvNextTinyW8A16Quantizable, +) from qai_hub_models.models.inception_v3_quantized.model import InceptionNetV3Quantizable from qai_hub_models.models.mobilenet_v2_quantized.model import MobileNetV2Quantizable from qai_hub_models.models.mobilenet_v3_large_quantized.model import ( @@ -89,69 +95,47 @@ def test_dataloader_is_deterministic(data_loaders): assert labels[:5].tolist() == expected_test_labels -@pytest.fixture( - scope="module", - params=[ - # Class, AIMET accuracy - (MobileNetV2Quantizable, 0.8100), - (MobileNetV3LargeQuantizable, 0.8430), - (ResNet18Quantizable, 0.8010), - (ResNet50Quantizable, 0.8520), - (ResNet101Quantizable, 0.8530), - (ResNeXt50Quantizable, 0.8880), - (ResNeXt101Quantizable, 0.9250), - (SqueezeNetQuantizable, 0.6410), - (RegNetQuantizable, 0.8750), - (WideResNet50Quantizable, 0.9190), - (ShufflenetV2Quantizable, 0.6740), - (InceptionNetV3Quantizable, 0.8430), - ], -) -def quantized_model(request, data_loaders, test_data): - """ - Create encoding from calibration data and returned quantized model with - validated off-target accuracy computed on QuantSim - """ - img_test, label_test, hub_dataset = test_data - calib_loader, test_loader = data_loaders - model_cls, target_sim_acc = request.param - model = model_cls.from_pretrained(aimet_encodings=None) - - # Calibration in quantization - num_calib_batches = 3 - model.quantize(calib_loader, num_calib_batches, data_has_gt=True) - - # QuantSim evaluation on eval set - evaluator = model.get_evaluator() - - batch_size = 32 - for i in tqdm(list(range(0, img_test.size(0), batch_size)), desc="QuantSim eval"): - img_batch = img_test[i : i + batch_size] - label_batch = label_test[i : i + batch_size] - - sim_out = model(img_batch).detach() - evaluator.add_batch(sim_out, label_batch) - - sim_acc = evaluator.get_accuracy_score() - print(f"{model_cls=}, {sim_acc=}") - np.testing.assert_allclose(target_sim_acc, sim_acc, atol=0.01) - return model - - @on_device @pytest.mark.parametrize( - "source_model_format,target_runtime,hub_needs_calib_data", + "model_cls,target_runtime,expected_size_mb,expected_acc", [ - (SourceModelFormat.ONNX, TargetRuntime.TFLITE, 
False), - (SourceModelFormat.ONNX, TargetRuntime.QNN, False), + (MobileNetV2Quantizable, TargetRuntime.TFLITE, 3.64, 0.816), + (MobileNetV2Quantizable, TargetRuntime.QNN, 4.09, 0.813), + (MobileNetV3LargeQuantizable, TargetRuntime.TFLITE, 5.72, 0.848), + # MobileNetV3LargeQuantizable, TargetRuntime.QNN fails to convert (AISW-87206) + (ResNet18Quantizable, TargetRuntime.TFLITE, 11.30, 0.805), + (ResNet18Quantizable, TargetRuntime.QNN, 11.65, 0.796), + (ResNet50Quantizable, TargetRuntime.TFLITE, 25.09, 0.847), + (ResNet50Quantizable, TargetRuntime.QNN, 25.41, 0.848), + (ResNet101Quantizable, TargetRuntime.TFLITE, 43.88, 0.858), + (ResNet101Quantizable, TargetRuntime.QNN, 44.08, 0.831), + (ResNeXt50Quantizable, TargetRuntime.TFLITE, 24.77, 0.891), + (ResNeXt50Quantizable, TargetRuntime.QNN, 25.03, 0.893), + (ResNeXt101Quantizable, TargetRuntime.TFLITE, 87.28, 0.926), + # Fails to infer (#9827) + (ResNeXt101Quantizable, TargetRuntime.QNN, 87.26, None), + (SqueezeNetQuantizable, TargetRuntime.TFLITE, 1.30, 0.637), + (SqueezeNetQuantizable, TargetRuntime.QNN, 1.69, 0.636), + (RegNetQuantizable, TargetRuntime.TFLITE, 15.42, 0.872), + (RegNetQuantizable, TargetRuntime.QNN, 15.89, 0.876), + (WideResNet50Quantizable, TargetRuntime.TFLITE, 66.59, 0.923), + (WideResNet50Quantizable, TargetRuntime.QNN, 66.86, 0.922), + (ShufflenetV2Quantizable, TargetRuntime.TFLITE, 1.46, 0.674), + (ShufflenetV2Quantizable, TargetRuntime.QNN, 1.99, 0.670), + (InceptionNetV3Quantizable, TargetRuntime.TFLITE, 23.32, 0.841), + (InceptionNetV3Quantizable, TargetRuntime.QNN, 23.85, 0.845), + # ConvNextTinyW8A8Quantizable, SourceModelFormat.ONNX not supported yet (#10862) + (ConvNextTinyW8A8Quantizable, TargetRuntime.QNN, 28.33, 0.846), + # ConvNextTinyW8A16Quantizable, SourceModelFormat.ONNX not supported yet (#10862) + (ConvNextTinyW8A16Quantizable, TargetRuntime.QNN, 28.34, 0.876), ], ) -def test_make_encoding_w8a8_accuracy( - source_model_format, +def test_quantized_accuracy( + model_cls, target_runtime, - hub_needs_calib_data, + expected_size_mb, + expected_acc, test_data, - quantized_model, data_loaders, ): """ @@ -160,133 +144,18 @@ def test_make_encoding_w8a8_accuracy( Note: We don't run profile job to get perf here but leave that to the score card. 
""" - model = quantized_model - - expected_size_mb_and_acc = { - (SourceModelFormat.ONNX, TargetRuntime.TFLITE, MobileNetV2Quantizable): ( - 3.64, - 0.801, - ), - (SourceModelFormat.ONNX, TargetRuntime.QNN, MobileNetV2Quantizable): ( - 4.02, - 0.801, - ), - (SourceModelFormat.ONNX, TargetRuntime.TFLITE, MobileNetV3LargeQuantizable): ( - 5.79, - 0.859, - ), - (SourceModelFormat.ONNX, TargetRuntime.QNN, MobileNetV3LargeQuantizable): ( - None, # Fails to convert (AISW-87206) - None, - ), - (SourceModelFormat.ONNX, TargetRuntime.TFLITE, ResNet18Quantizable): ( - 11.30, - 0.778, - ), - (SourceModelFormat.ONNX, TargetRuntime.QNN, ResNet18Quantizable): ( - 11.61, - 0.789, - ), - (SourceModelFormat.ONNX, TargetRuntime.TFLITE, ResNet50Quantizable): ( - 25.09, - 0.837, - ), - (SourceModelFormat.ONNX, TargetRuntime.QNN, ResNet50Quantizable): ( - 25.33, - 0.834, - ), - (SourceModelFormat.ONNX, TargetRuntime.TFLITE, ResNet101Quantizable): ( - 43.89, - 0.827, - ), - (SourceModelFormat.ONNX, TargetRuntime.QNN, ResNet101Quantizable): ( - 44.08, - 0.831, - ), - (SourceModelFormat.ONNX, TargetRuntime.TFLITE, ResNeXt50Quantizable): ( - 24.77, - 0.888, - ), - (SourceModelFormat.ONNX, TargetRuntime.QNN, ResNeXt50Quantizable): ( - 24.96, - 0.888, - ), - (SourceModelFormat.ONNX, TargetRuntime.TFLITE, ResNeXt101Quantizable): ( - 87.29, - 0.906, - ), - (SourceModelFormat.ONNX, TargetRuntime.QNN, ResNeXt101Quantizable): ( - 87.11, - None, # Fails to infer (#9827) - ), - (SourceModelFormat.ONNX, TargetRuntime.TFLITE, SqueezeNetQuantizable): ( - 1.30, - 0.609, - ), - (SourceModelFormat.ONNX, TargetRuntime.QNN, SqueezeNetQuantizable): ( - 1.66, - 0.609, - ), - (SourceModelFormat.ONNX, TargetRuntime.TFLITE, RegNetQuantizable): ( - 15.43, - 0.859, - ), - (SourceModelFormat.ONNX, TargetRuntime.QNN, RegNetQuantizable): ( - 15.77, - 0.859, - ), - (SourceModelFormat.ONNX, TargetRuntime.TFLITE, WideResNet50Quantizable): ( - 66.59, - 0.900, - ), - (SourceModelFormat.ONNX, TargetRuntime.QNN, WideResNet50Quantizable): ( - 66.78, - 0.897, - ), - (SourceModelFormat.ONNX, TargetRuntime.TFLITE, ShufflenetV2Quantizable): ( - 1.47, - 0.661, - ), - (SourceModelFormat.ONNX, TargetRuntime.QNN, ShufflenetV2Quantizable): ( - 1.90, - 0.661, - ), - (SourceModelFormat.ONNX, TargetRuntime.TFLITE, InceptionNetV3Quantizable): ( - 23.33, - 0.843, - ), - (SourceModelFormat.ONNX, TargetRuntime.QNN, InceptionNetV3Quantizable): ( - 23.81, - 0.844, - ), - } - expected_size_mb, expected_acc = expected_size_mb_and_acc[ - (source_model_format, target_runtime, model.__class__) - ] - if expected_size_mb is None: - pytest.skip("Fails to compile") + model = model_cls.from_pretrained() img_test, label_test, hub_dataset = test_data calib_loader, test_loader = data_loaders - # calibration data - calibration_data = None - if hub_needs_calib_data: - # AIMET export has missing encoding and needs calibration data - num_calib_batches = 3 - calib_imgs = [] - for b, (img_calib, labels) in enumerate(iter(calib_loader)): - if b >= num_calib_batches: - break - img_np = img_calib.numpy() - calib_imgs.extend(np.split(img_np, img_np.shape[0])) - calibration_data = {list(model.get_input_spec().keys())[0]: calib_imgs} + calibration_data = model.get_calibration_data(target_runtime) # On-device inference device = hub.Device("Samsung Galaxy S23") hub_model = compile_zoo_model_to_hub( model=model, - source_model_format=source_model_format, + source_model_format=SourceModelFormat.ONNX, device=device, target_runtime=target_runtime, calibration_data=calibration_data, @@ -294,10 
+163,7 @@ def test_make_encoding_w8a8_accuracy( # Make sure model is quantized tgt_model_size_mb = get_model_size_mb(hub_model.model) - model_cls = quantized_model.__class__ - print( - f"{model_cls=}, {source_model_format=}, {target_runtime=}, {tgt_model_size_mb=}" - ) + print(f"{model_cls=}, {target_runtime=}, {tgt_model_size_mb=}") np.testing.assert_allclose(expected_size_mb, tgt_model_size_mb, rtol=0.1) if expected_acc is None: @@ -308,5 +174,5 @@ def test_make_encoding_w8a8_accuracy( evaluator = model.get_evaluator() evaluator.add_batch(hub_out, label_test) hub_acc = evaluator.get_accuracy_score() - print(f"{model_cls=}, {source_model_format=}, {target_runtime=}, {hub_acc=}") + print(f"{model_cls=}, {target_runtime=}, {hub_acc=}") np.testing.assert_allclose(expected_acc, hub_acc, atol=0.01) diff --git a/scripts/examples/yolov6_evaluation.py b/scripts/examples/yolov6_evaluation.py index 6ecb2fe7..367e3d71 100644 --- a/scripts/examples/yolov6_evaluation.py +++ b/scripts/examples/yolov6_evaluation.py @@ -3,9 +3,7 @@ # SPDX-License-Identifier: BSD-3-Clause # --------------------------------------------------------------------- """ -This is a sample script showing how to take a AIMET model zoo model without -pre-computed activations, and compute those activations using QAIHM. -This script assumes the model is added to QAIHM, but is missing quantization parameters. +This is a sample script showing how to evaluate accuracy (mAP) of a yolov6 model. Packages to install: pycocotools, object-detection-metrics==0.4.post1, shapely """ diff --git a/scripts/tasks/changes.py b/scripts/tasks/changes.py index 0f806300..bebe6068 100644 --- a/scripts/tasks/changes.py +++ b/scripts/tasks/changes.py @@ -37,6 +37,7 @@ "qai_hub_models/models/resnet18_quantized/model.py", ], "qai_hub_models/utils/printing.py": REPRESENTATIVE_EXPORT_FILES, + "qai_hub_models/utils/config_loaders.py": REPRESENTATIVE_EXPORT_FILES, } diff --git a/scripts/tasks/venv.py b/scripts/tasks/venv.py index b1c08eb0..f73fc35c 100644 --- a/scripts/tasks/venv.py +++ b/scripts/tasks/venv.py @@ -52,17 +52,8 @@ def __init__( ) -> None: tasks = [] - extras_str = f"[{','.join(extras)}]" if extras else "" - tasks.append( - RunCommandsWithVenvTask( - group_name=f"Install QAIHM{extras_str}", - venv=venv_path, - commands=[ - f'pip install -e "{PY_PACKAGE_INSTALL_ROOT}{extras_str}" -f https://download.openmmlab.com/mmcv/dist/cpu/torch1.13/index.html', - ], - ) - ) - + # Install AIMET before model requirements to give preference over + # model specific versions. 
if include_aimet: if can_support_aimet(): if is_package_installed("aimet_torch", venv_path): @@ -95,6 +86,17 @@ def __init__( ) ) + extras_str = f"[{','.join(extras)}]" if extras else "" + tasks.append( + RunCommandsWithVenvTask( + group_name=f"Install QAIHM{extras_str}", + venv=venv_path, + commands=[ + f'pip install -e "{PY_PACKAGE_INSTALL_ROOT}{extras_str}" -f https://download.openmmlab.com/mmcv/dist/cpu/torch1.13/index.html', + ], + ) + ) + super().__init__( f"Create Local QAIHM{extras_str} Virtual Environment at {venv_path}", [task for task in tasks], diff --git a/setup.py b/setup.py index a40bd1ae..22aa2931 100644 --- a/setup.py +++ b/setup.py @@ -29,10 +29,12 @@ def get_data_files() -> List[str]: data_files = [] for ext in data_file_extensions: data_files.extend( - glob.glob(f"{str(qaihm_path.absolute())}/**/*.{ext}", recursive=True) + glob.glob( + f"{str(qaihm_path.absolute() / '**' / f'*.{ext}')}", recursive=True + ) ) for i in range(0, len(data_files)): - data_files[i] = data_files[i].split("/qai_hub_models/")[1] + data_files[i] = data_files[i].split("qai_hub_models")[1][1:] return data_files
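The final setup.py hunk derives each data file's relative path by splitting on the package name alone and dropping the single leading separator, rather than splitting on "/qai_hub_models/", so the split no longer assumes forward slashes. A minimal sketch of that behavior, using hypothetical example paths (not taken from the diff):

# Sketch: why data_files[i].split("qai_hub_models")[1][1:] is separator-agnostic.
# The example paths below are hypothetical; only the split logic mirrors setup.py.
posix_path = "/repo/qai_hub_models/models/googlenet/info.yaml"
windows_path = r"C:\repo\qai_hub_models\models\googlenet\info.yaml"

# Old logic: requires the path to contain a literal "/qai_hub_models/".
assert posix_path.split("/qai_hub_models/")[1] == "models/googlenet/info.yaml"

# New logic: split on the package name, then strip the leading "/" or "\".
assert posix_path.split("qai_hub_models")[1][1:] == "models/googlenet/info.yaml"
assert windows_path.split("qai_hub_models")[1][1:] == r"models\googlenet\info.yaml"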