From 5bae6e829b6076ea9aaabd8be75aa27749b5b7f3 Mon Sep 17 00:00:00 2001
From: Qualcomm AI Stack Models Bot
Date: Mon, 18 Mar 2024 16:37:50 -0700
Subject: [PATCH] v0.4.0

Signed-off-by: QAIHM Team
---
 .gitattributes | 1 +
 .gitignore | 115 --
 README.md | 118 +-
 apps/android/ImageClassification/README.md | 72 ++
 apps/android/ImageClassification/build.gradle | 10 +
 apps/android/ImageClassification/build_apk.py | 163 +++
 .../classification/build.gradle | 63 ++
 .../classification/proguard-rules.pro | 21 +
 .../src/main/AndroidManifest.xml | 37 +
 .../src/main/assets/Sample1.png | 3 +
 .../src/main/assets/Sample2.png | 3 +
 .../src/main/assets/Sample3.png | 3 +
 .../src/main/assets/Sample4.png | 3 +
 .../src/main/assets/Sample5.png | 3 +
 .../classification/src/main/assets/labels.txt | 1001 +++++++++++++++++
 .../ImageClassification.java | 214 ++++
 .../ImageClassificationResult.java | 32 +
 .../qcom/imageclassification/QNNActivity.java | 177 +++
 .../com/qcom/imageclassification/Result.java | 27 +
 .../com/qcom/imageclassification/Utils.java | 27 +
 .../drawable-v24/ic_launcher_foreground.xml | 30 +
 .../res/drawable/ic_launcher_background.xml | 170 +++
 .../drawable/image_classification_icon.png | 3 +
 .../res/layout/activity_classification.xml | 93 ++
 .../res/mipmap-anydpi-v26/ic_launcher.xml | 5 +
 .../mipmap-anydpi-v26/ic_launcher_round.xml | 6 +
 .../src/main/res/mipmap-hdpi/ic_launcher.png | 3 +
 .../res/mipmap-hdpi/ic_launcher_round.png | 3 +
 .../src/main/res/mipmap-mdpi/ic_launcher.png | 3 +
 .../res/mipmap-mdpi/ic_launcher_round.png | 3 +
 .../src/main/res/mipmap-xhdpi/ic_launcher.png | 3 +
 .../res/mipmap-xhdpi/ic_launcher_round.png | 3 +
 .../main/res/mipmap-xxhdpi/ic_launcher.png | 3 +
 .../res/mipmap-xxhdpi/ic_launcher_round.png | 3 +
 .../main/res/mipmap-xxxhdpi/ic_launcher.png | 3 +
 .../res/mipmap-xxxhdpi/ic_launcher_round.png | 3 +
 .../src/main/res/values-night/themes.xml | 17 +
 .../src/main/res/values/colors.xml | 11 +
 .../src/main/res/values/strings.xml | 4 +
 .../src/main/res/values/themes.xml | 18 +
 .../ImageClassification/gradle.properties | 20 +
 .../gradle/wrapper/gradle-wrapper.jar | 3 +
 .../gradle/wrapper/gradle-wrapper.properties | 6 +
 apps/android/ImageClassification/gradlew | 185 +++
 apps/android/ImageClassification/gradlew.bat | 89 ++
 .../ImageClassification/settings.gradle | 29 +
 apps/android/ImageSuperResolution/README.md | 66 ++
 .../android/ImageSuperResolution/build.gradle | 10 +
 .../ImageSuperResolution/build.properties | 2 +
 .../android/ImageSuperResolution/build_apk.py | 182 +++
 .../ImageSuperResolution/gradle.properties | 20 +
 .../gradle/wrapper/gradle-wrapper.jar | 3 +
 .../gradle/wrapper/gradle-wrapper.properties | 6 +
 apps/android/ImageSuperResolution/gradlew | 185 +++
 apps/android/ImageSuperResolution/gradlew.bat | 89 ++
 .../ImageSuperResolution/settings.gradle | 27 +
 .../superresolution/build.gradle | 71 ++
 .../superresolution/proguard-rules.pro | 21 +
 .../src/main/AndroidManifest.xml | 36 +
 .../src/main/assets/Sample1.jpg | 3 +
 .../src/main/assets/Sample2.jpg | 3 +
 .../com/qcom/imagesuperres/QNNActivity.java | 243 ++++
 .../java/com/qcom/imagesuperres/Result.java | 38 +
 .../qcom/imagesuperres/SuperResolution.java | 165 +++
 .../imagesuperres/SuperResolutionResult.java | 19 +
 .../java/com/qcom/imagesuperres/Utils.java | 36 +
 .../com/qcom/imagesuperres/UtilsESRGAN.java | 35 +
 .../drawable-v24/ic_launcher_foreground.xml | 30 +
 .../res/drawable/ic_launcher_background.xml | 170 +++
 .../src/main/res/layout/activity_superres.xml | 140 +++
 .../res/mipmap-anydpi-v26/ic_launcher.xml | 5 +
.../mipmap-anydpi-v26/ic_launcher_round.xml | 6 + .../src/main/res/mipmap-hdpi/ic_launcher.png | 3 + .../res/mipmap-hdpi/ic_launcher_round.png | 3 + .../src/main/res/mipmap-mdpi/ic_launcher.png | 3 + .../res/mipmap-mdpi/ic_launcher_round.png | 3 + .../src/main/res/mipmap-xhdpi/ic_launcher.png | 3 + .../res/mipmap-xhdpi/ic_launcher_round.png | 3 + .../main/res/mipmap-xxhdpi/ic_launcher.png | 3 + .../res/mipmap-xxhdpi/ic_launcher_round.png | 3 + .../main/res/mipmap-xxxhdpi/ic_launcher.png | 3 + .../res/mipmap-xxxhdpi/ic_launcher_round.png | 3 + .../src/main/res/values-night/themes.xml | 17 + .../src/main/res/values/colors.xml | 11 + .../src/main/res/values/strings.xml | 4 + .../src/main/res/values/themes.xml | 18 + qai_hub_models/_version.py | 2 +- qai_hub_models/asset_bases.yaml | 4 +- qai_hub_models/conftest.py | 7 + qai_hub_models/global_requirements.txt | 44 + .../_shared/cityscapes_segmentation/demo.py | 4 +- qai_hub_models/models/_shared/common.py | 35 +- qai_hub_models/models/_shared/deeplab/demo.py | 5 +- qai_hub_models/models/_shared/detr/demo.py | 5 +- qai_hub_models/models/_shared/detr/model.py | 2 +- qai_hub_models/models/_shared/fastsam/demo.py | 35 +- .../models/_shared/fastsam/model.py | 2 +- qai_hub_models/models/_shared/ffnet/model.py | 11 + .../_shared/imagenet_classifier/demo.py | 21 +- .../_shared/imagenet_classifier/model.py | 19 +- .../_shared/imagenet_classifier/test_utils.py | 2 +- .../models/_shared/quicksrnet/common.py | 24 +- qai_hub_models/models/_shared/repaint/app.py | 38 +- qai_hub_models/models/_shared/repaint/demo.py | 12 +- qai_hub_models/models/_shared/sesr/common.py | 12 +- .../models/_shared/super_resolution/demo.py | 4 +- .../models/_shared/video_classifier/model.py | 2 +- .../models/_shared/whisper/__init__.py | 4 + .../{whisper_asr => _shared/whisper}/app.py | 6 +- .../{whisper_asr => _shared/whisper}/demo.py | 14 +- .../{whisper_asr => _shared/whisper}/model.py | 49 +- .../test.py => _shared/whisper/test_utils.py} | 36 +- qai_hub_models/models/_shared/yolo/demo.py | 5 +- qai_hub_models/models/_shared/yolo/utils.py | 4 +- qai_hub_models/models/aotgan/README.md | 54 + qai_hub_models/models/aotgan/__init__.py | 10 + qai_hub_models/models/aotgan/conftest.py | 26 + qai_hub_models/models/aotgan/demo.py | 19 + qai_hub_models/models/aotgan/export.py | 206 ++++ qai_hub_models/models/aotgan/info.yaml | 31 + qai_hub_models/models/aotgan/model.py | 131 +++ .../models/aotgan/patches/layer_norm.diff | 14 + qai_hub_models/models/aotgan/perf.yaml | 108 ++ qai_hub_models/models/aotgan/test.py | 68 ++ .../models/baichuan_7b_quantized/README.md | 4 +- .../models/baichuan_7b_quantized/info.yaml | 4 +- qai_hub_models/models/common.py | 24 + .../models/controlnet_quantized/README.md | 6 +- .../models/controlnet_quantized/export.py | 77 +- .../models/controlnet_quantized/info.yaml | 2 + .../models/controlnet_quantized/model.py | 41 +- .../controlnet_quantized/requirements.txt | 3 +- .../models/controlnet_quantized/test.py | 5 + qai_hub_models/models/convnext_tiny/README.md | 6 +- .../models/convnext_tiny/conftest.py | 24 + qai_hub_models/models/convnext_tiny/demo.py | 4 +- qai_hub_models/models/convnext_tiny/export.py | 42 +- qai_hub_models/models/convnext_tiny/info.yaml | 2 + qai_hub_models/models/convnext_tiny/model.py | 2 +- qai_hub_models/models/convnext_tiny/perf.yaml | 59 +- qai_hub_models/models/convnext_tiny/test.py | 3 + qai_hub_models/models/ddrnet23_slim/README.md | 6 +- .../models/ddrnet23_slim/conftest.py | 26 + qai_hub_models/models/ddrnet23_slim/demo.py | 4 
+- qai_hub_models/models/ddrnet23_slim/export.py | 42 +- qai_hub_models/models/ddrnet23_slim/info.yaml | 2 + qai_hub_models/models/ddrnet23_slim/perf.yaml | 59 +- qai_hub_models/models/ddrnet23_slim/test.py | 2 + .../models/deeplabv3_resnet50/README.md | 6 +- .../models/deeplabv3_resnet50/conftest.py | 26 + .../models/deeplabv3_resnet50/demo.py | 4 +- .../models/deeplabv3_resnet50/export.py | 39 +- .../models/deeplabv3_resnet50/info.yaml | 2 + .../models/deeplabv3_resnet50/model.py | 18 +- .../models/deeplabv3_resnet50/perf.yaml | 69 +- .../models/deeplabv3_resnet50/test.py | 2 + qai_hub_models/models/densenet121/README.md | 6 +- qai_hub_models/models/densenet121/conftest.py | 24 + qai_hub_models/models/densenet121/demo.py | 4 +- qai_hub_models/models/densenet121/export.py | 42 +- qai_hub_models/models/densenet121/info.yaml | 2 + qai_hub_models/models/densenet121/model.py | 2 +- qai_hub_models/models/densenet121/perf.yaml | 69 +- qai_hub_models/models/densenet121/test.py | 3 + .../models/detr_resnet101/README.md | 6 +- .../models/detr_resnet101/conftest.py | 24 + qai_hub_models/models/detr_resnet101/demo.py | 2 +- .../models/detr_resnet101/export.py | 42 +- .../models/detr_resnet101/info.yaml | 2 + .../models/detr_resnet101/perf.yaml | 65 +- .../models/detr_resnet101/requirements.txt | 4 +- qai_hub_models/models/detr_resnet101/test.py | 3 + .../models/detr_resnet101_dc5/README.md | 6 +- .../models/detr_resnet101_dc5/conftest.py | 24 + .../models/detr_resnet101_dc5/demo.py | 2 +- .../models/detr_resnet101_dc5/export.py | 42 +- .../models/detr_resnet101_dc5/info.yaml | 2 + .../models/detr_resnet101_dc5/perf.yaml | 65 +- .../detr_resnet101_dc5/requirements.txt | 4 +- .../models/detr_resnet101_dc5/test.py | 3 + qai_hub_models/models/detr_resnet50/README.md | 6 +- .../models/detr_resnet50/conftest.py | 24 + qai_hub_models/models/detr_resnet50/demo.py | 2 +- qai_hub_models/models/detr_resnet50/export.py | 42 +- qai_hub_models/models/detr_resnet50/info.yaml | 2 + qai_hub_models/models/detr_resnet50/perf.yaml | 65 +- .../models/detr_resnet50/requirements.txt | 4 +- qai_hub_models/models/detr_resnet50/test.py | 3 + .../models/detr_resnet50_dc5/README.md | 6 +- .../models/detr_resnet50_dc5/conftest.py | 24 + .../models/detr_resnet50_dc5/demo.py | 2 +- .../models/detr_resnet50_dc5/export.py | 42 +- .../models/detr_resnet50_dc5/info.yaml | 2 + .../models/detr_resnet50_dc5/perf.yaml | 65 +- .../models/detr_resnet50_dc5/requirements.txt | 4 +- .../models/detr_resnet50_dc5/test.py | 3 + .../models/efficientnet_b0/README.md | 6 +- .../models/efficientnet_b0/conftest.py | 24 + qai_hub_models/models/efficientnet_b0/demo.py | 4 +- .../models/efficientnet_b0/export.py | 42 +- .../models/efficientnet_b0/info.yaml | 2 + .../models/efficientnet_b0/model.py | 2 +- .../models/efficientnet_b0/perf.yaml | 69 +- qai_hub_models/models/efficientnet_b0/test.py | 3 + qai_hub_models/models/esrgan/README.md | 6 +- qai_hub_models/models/esrgan/conftest.py | 26 + qai_hub_models/models/esrgan/demo.py | 1 + qai_hub_models/models/esrgan/export.py | 39 +- qai_hub_models/models/esrgan/info.yaml | 2 + qai_hub_models/models/esrgan/perf.yaml | 69 +- qai_hub_models/models/esrgan/test.py | 2 + .../models/facebook_denoiser/README.md | 6 +- .../models/facebook_denoiser/app.py | 14 +- .../models/facebook_denoiser/conftest.py | 26 + .../models/facebook_denoiser/demo.py | 53 +- .../models/facebook_denoiser/export.py | 43 +- .../models/facebook_denoiser/info.yaml | 2 + .../models/facebook_denoiser/model.py | 26 +- 
.../models/facebook_denoiser/perf.yaml | 59 +- .../models/facebook_denoiser/requirements.txt | 4 +- .../models/facebook_denoiser/test.py | 13 +- qai_hub_models/models/fastsam_s/README.md | 6 +- qai_hub_models/models/fastsam_s/conftest.py | 24 + qai_hub_models/models/fastsam_s/demo.py | 2 +- qai_hub_models/models/fastsam_s/export.py | 42 +- qai_hub_models/models/fastsam_s/info.yaml | 2 + qai_hub_models/models/fastsam_s/perf.yaml | 59 +- .../models/fastsam_s/requirements.txt | 3 +- qai_hub_models/models/fastsam_x/README.md | 6 +- qai_hub_models/models/fastsam_x/conftest.py | 24 + qai_hub_models/models/fastsam_x/demo.py | 2 +- qai_hub_models/models/fastsam_x/export.py | 42 +- qai_hub_models/models/fastsam_x/info.yaml | 2 + qai_hub_models/models/fastsam_x/perf.yaml | 59 +- .../models/fastsam_x/requirements.txt | 3 +- qai_hub_models/models/fcn_resnet50/README.md | 6 +- .../models/fcn_resnet50/conftest.py | 26 + qai_hub_models/models/fcn_resnet50/demo.py | 4 +- qai_hub_models/models/fcn_resnet50/export.py | 42 +- qai_hub_models/models/fcn_resnet50/info.yaml | 2 + qai_hub_models/models/fcn_resnet50/perf.yaml | 65 +- qai_hub_models/models/fcn_resnet50/test.py | 3 + .../models/ffnet_122ns_lowres/README.md | 6 +- .../models/ffnet_122ns_lowres/conftest.py | 26 + .../models/ffnet_122ns_lowres/export.py | 39 +- .../models/ffnet_122ns_lowres/info.yaml | 4 +- .../models/ffnet_122ns_lowres/perf.yaml | 69 +- .../ffnet_122ns_lowres/requirements.txt | 2 +- qai_hub_models/models/ffnet_40s/README.md | 6 +- qai_hub_models/models/ffnet_40s/conftest.py | 26 + qai_hub_models/models/ffnet_40s/export.py | 39 +- qai_hub_models/models/ffnet_40s/info.yaml | 4 +- qai_hub_models/models/ffnet_40s/perf.yaml | 69 +- .../models/ffnet_40s/requirements.txt | 2 +- .../models/ffnet_40s_quantized/README.md | 6 +- .../models/ffnet_40s_quantized/conftest.py | 26 + .../models/ffnet_40s_quantized/export.py | 37 +- .../models/ffnet_40s_quantized/info.yaml | 4 +- .../models/ffnet_40s_quantized/perf.yaml | 61 +- .../models/ffnet_40s_quantized/test.py | 1 + qai_hub_models/models/ffnet_54s/README.md | 6 +- qai_hub_models/models/ffnet_54s/conftest.py | 26 + qai_hub_models/models/ffnet_54s/export.py | 39 +- qai_hub_models/models/ffnet_54s/info.yaml | 4 +- qai_hub_models/models/ffnet_54s/perf.yaml | 69 +- .../models/ffnet_54s/requirements.txt | 2 +- .../models/ffnet_54s_quantized/README.md | 6 +- .../models/ffnet_54s_quantized/conftest.py | 26 + .../models/ffnet_54s_quantized/export.py | 37 +- .../models/ffnet_54s_quantized/info.yaml | 4 +- .../models/ffnet_54s_quantized/perf.yaml | 61 +- .../models/ffnet_54s_quantized/test.py | 1 + qai_hub_models/models/ffnet_78s/README.md | 6 +- qai_hub_models/models/ffnet_78s/conftest.py | 26 + qai_hub_models/models/ffnet_78s/export.py | 39 +- qai_hub_models/models/ffnet_78s/info.yaml | 4 +- qai_hub_models/models/ffnet_78s/perf.yaml | 69 +- .../models/ffnet_78s/requirements.txt | 2 +- .../models/ffnet_78s_lowres/README.md | 6 +- .../models/ffnet_78s_lowres/conftest.py | 26 + .../models/ffnet_78s_lowres/export.py | 39 +- .../models/ffnet_78s_lowres/info.yaml | 4 +- .../models/ffnet_78s_lowres/perf.yaml | 69 +- .../models/ffnet_78s_lowres/requirements.txt | 2 +- .../models/ffnet_78s_quantized/README.md | 6 +- .../models/ffnet_78s_quantized/conftest.py | 26 + .../models/ffnet_78s_quantized/export.py | 37 +- .../models/ffnet_78s_quantized/info.yaml | 4 +- .../models/ffnet_78s_quantized/perf.yaml | 61 +- .../models/ffnet_78s_quantized/test.py | 1 + qai_hub_models/models/googlenet/README.md | 6 +- 
qai_hub_models/models/googlenet/conftest.py | 24 + qai_hub_models/models/googlenet/demo.py | 4 +- qai_hub_models/models/googlenet/export.py | 42 +- qai_hub_models/models/googlenet/info.yaml | 2 + qai_hub_models/models/googlenet/model.py | 6 +- qai_hub_models/models/googlenet/perf.yaml | 77 +- qai_hub_models/models/googlenet/test.py | 3 + .../models/googlenet_quantized/README.md | 6 +- .../models/googlenet_quantized/conftest.py | 24 + .../models/googlenet_quantized/demo.py | 7 +- .../models/googlenet_quantized/export.py | 42 +- .../models/googlenet_quantized/info.yaml | 4 +- .../models/googlenet_quantized/model.py | 58 +- .../models/googlenet_quantized/perf.yaml | 85 +- .../models/googlenet_quantized/test.py | 11 - qai_hub_models/models/hrnet_pose/README.md | 6 +- qai_hub_models/models/hrnet_pose/conftest.py | 26 + qai_hub_models/models/hrnet_pose/demo.py | 4 +- qai_hub_models/models/hrnet_pose/export.py | 42 +- qai_hub_models/models/hrnet_pose/info.yaml | 2 + qai_hub_models/models/hrnet_pose/model.py | 14 +- qai_hub_models/models/hrnet_pose/perf.yaml | 69 +- .../models/hrnet_pose/requirements.txt | 4 +- .../models/hrnet_pose_quantized/README.md | 6 +- .../models/hrnet_pose_quantized/conftest.py | 26 + .../models/hrnet_pose_quantized/demo.py | 4 +- .../models/hrnet_pose_quantized/export.py | 40 +- .../models/hrnet_pose_quantized/info.yaml | 2 + .../models/hrnet_pose_quantized/perf.yaml | 61 +- .../hrnet_pose_quantized/requirements.txt | 4 +- .../huggingface_wavlm_base_plus/README.md | 6 +- .../huggingface_wavlm_base_plus/conftest.py | 26 + .../huggingface_wavlm_base_plus/export.py | 39 +- .../huggingface_wavlm_base_plus/info.yaml | 6 +- .../huggingface_wavlm_base_plus/model.py | 20 +- .../huggingface_wavlm_base_plus/perf.yaml | 71 +- .../requirements.txt | 8 +- .../huggingface_wavlm_base_plus/test.py | 2 + qai_hub_models/models/inception_v3/README.md | 6 +- .../models/inception_v3/conftest.py | 24 + qai_hub_models/models/inception_v3/demo.py | 4 +- qai_hub_models/models/inception_v3/export.py | 42 +- qai_hub_models/models/inception_v3/info.yaml | 2 + qai_hub_models/models/inception_v3/model.py | 6 +- qai_hub_models/models/inception_v3/perf.yaml | 77 +- qai_hub_models/models/inception_v3/test.py | 3 + .../models/inception_v3_quantized/README.md | 12 +- .../models/inception_v3_quantized/conftest.py | 24 + .../models/inception_v3_quantized/demo.py | 7 +- .../models/inception_v3_quantized/export.py | 42 +- .../models/inception_v3_quantized/info.yaml | 8 +- .../models/inception_v3_quantized/model.py | 145 ++- .../models/inception_v3_quantized/perf.yaml | 67 +- .../models/inception_v3_quantized/test.py | 11 - qai_hub_models/models/lama_dilated/README.md | 6 +- .../models/lama_dilated/conftest.py | 26 + qai_hub_models/models/lama_dilated/demo.py | 2 +- qai_hub_models/models/lama_dilated/export.py | 42 +- qai_hub_models/models/lama_dilated/info.yaml | 2 + qai_hub_models/models/lama_dilated/model.py | 16 +- qai_hub_models/models/lama_dilated/perf.yaml | 69 +- .../models/lama_dilated/requirements.txt | 9 +- qai_hub_models/models/lama_dilated/test.py | 2 + qai_hub_models/models/litehrnet/README.md | 6 +- qai_hub_models/models/litehrnet/conftest.py | 24 + qai_hub_models/models/litehrnet/demo.py | 4 +- qai_hub_models/models/litehrnet/export.py | 42 +- qai_hub_models/models/litehrnet/info.yaml | 2 + qai_hub_models/models/litehrnet/model.py | 2 +- qai_hub_models/models/litehrnet/perf.yaml | 59 +- .../models/litehrnet/requirements.txt | 4 +- qai_hub_models/models/litehrnet/test.py | 2 + 
.../llama_v2_7b_chat_quantized/README.md | 4 +- .../llama_v2_7b_chat_quantized/info.yaml | 2 + .../models/mediapipe_face/README.md | 11 +- .../models/mediapipe_face/conftest.py | 26 + .../models/mediapipe_face/export.py | 69 +- .../models/mediapipe_face/info.yaml | 2 + qai_hub_models/models/mediapipe_face/model.py | 6 +- .../models/mediapipe_face/perf.yaml | 139 ++- .../models/mediapipe_face/requirements.txt | 2 - .../models/mediapipe_hand/README.md | 11 +- .../models/mediapipe_hand/conftest.py | 26 + .../models/mediapipe_hand/export.py | 69 +- .../models/mediapipe_hand/info.yaml | 2 + qai_hub_models/models/mediapipe_hand/model.py | 6 +- .../models/mediapipe_hand/perf.yaml | 141 ++- .../models/mediapipe_hand/requirements.txt | 2 - .../models/mediapipe_pose/README.md | 11 +- .../models/mediapipe_pose/conftest.py | 26 + .../models/mediapipe_pose/export.py | 69 +- .../models/mediapipe_pose/info.yaml | 2 + qai_hub_models/models/mediapipe_pose/model.py | 6 +- .../models/mediapipe_pose/perf.yaml | 143 ++- .../models/mediapipe_pose/requirements.txt | 2 - .../models/mediapipe_selfie/README.md | 6 +- .../models/mediapipe_selfie/conftest.py | 24 + .../models/mediapipe_selfie/demo.py | 43 +- .../models/mediapipe_selfie/export.py | 43 +- .../models/mediapipe_selfie/info.yaml | 2 + .../models/mediapipe_selfie/model.py | 5 +- .../models/mediapipe_selfie/perf.yaml | 67 +- qai_hub_models/models/mnasnet05/README.md | 6 +- qai_hub_models/models/mnasnet05/conftest.py | 24 + qai_hub_models/models/mnasnet05/demo.py | 4 +- qai_hub_models/models/mnasnet05/export.py | 42 +- qai_hub_models/models/mnasnet05/info.yaml | 2 + qai_hub_models/models/mnasnet05/model.py | 2 +- qai_hub_models/models/mnasnet05/perf.yaml | 69 +- qai_hub_models/models/mnasnet05/test.py | 3 + qai_hub_models/models/mobilenet_v2/README.md | 6 +- .../models/mobilenet_v2/conftest.py | 26 + qai_hub_models/models/mobilenet_v2/demo.py | 4 +- qai_hub_models/models/mobilenet_v2/export.py | 42 +- qai_hub_models/models/mobilenet_v2/info.yaml | 2 + qai_hub_models/models/mobilenet_v2/model.py | 9 +- qai_hub_models/models/mobilenet_v2/perf.yaml | 69 +- qai_hub_models/models/mobilenet_v2/test.py | 3 + .../models/mobilenet_v2_quantized/README.md | 6 +- .../models/mobilenet_v2_quantized/conftest.py | 26 + .../models/mobilenet_v2_quantized/demo.py | 7 +- .../models/mobilenet_v2_quantized/export.py | 40 +- .../models/mobilenet_v2_quantized/info.yaml | 2 + .../models/mobilenet_v2_quantized/model.py | 35 +- .../models/mobilenet_v2_quantized/perf.yaml | 79 +- .../models/mobilenet_v2_quantized/test.py | 10 - .../models/mobilenet_v3_large/README.md | 8 +- .../models/mobilenet_v3_large/conftest.py | 24 + .../models/mobilenet_v3_large/demo.py | 4 +- .../models/mobilenet_v3_large/export.py | 42 +- .../models/mobilenet_v3_large/info.yaml | 4 +- .../models/mobilenet_v3_large/model.py | 2 +- .../models/mobilenet_v3_large/perf.yaml | 59 +- .../models/mobilenet_v3_large/test.py | 3 + .../mobilenet_v3_large_quantized/README.md | 54 + .../mobilenet_v3_large_quantized/__init__.py | 13 + .../mobilenet_v3_large_quantized/conftest.py | 24 + .../mobilenet_v3_large_quantized/demo.py | 23 + .../mobilenet_v3_large_quantized/export.py | 202 ++++ .../mobilenet_v3_large_quantized/info.yaml | 44 + .../mobilenet_v3_large_quantized/model.py | 85 ++ .../mobilenet_v3_large_quantized/perf.yaml | 108 ++ .../mobilenet_v3_large_quantized/test.py | 29 + .../models/mobilenet_v3_small/README.md | 6 +- .../models/mobilenet_v3_small/conftest.py | 24 + .../models/mobilenet_v3_small/demo.py | 4 +- 
.../models/mobilenet_v3_small/export.py | 42 +- .../models/mobilenet_v3_small/info.yaml | 2 + .../models/mobilenet_v3_small/model.py | 2 +- .../models/mobilenet_v3_small/perf.yaml | 59 +- .../models/mobilenet_v3_small/test.py | 3 + qai_hub_models/models/openai_clip/README.md | 6 +- qai_hub_models/models/openai_clip/app.py | 2 +- qai_hub_models/models/openai_clip/conftest.py | 26 + qai_hub_models/models/openai_clip/export.py | 69 +- qai_hub_models/models/openai_clip/info.yaml | 2 + qai_hub_models/models/openai_clip/model.py | 4 +- qai_hub_models/models/openai_clip/perf.yaml | 145 ++- .../models/openai_clip/requirements.txt | 1 - qai_hub_models/models/openpose/README.md | 6 +- qai_hub_models/models/openpose/conftest.py | 26 + qai_hub_models/models/openpose/demo.py | 30 +- qai_hub_models/models/openpose/export.py | 43 +- qai_hub_models/models/openpose/info.yaml | 2 + qai_hub_models/models/openpose/model.py | 2 +- qai_hub_models/models/openpose/perf.yaml | 67 +- .../models/openpose/requirements.txt | 4 +- qai_hub_models/models/protocols.py | 194 ++++ .../models/quicksrnetlarge/README.md | 6 +- .../models/quicksrnetlarge/conftest.py | 26 + qai_hub_models/models/quicksrnetlarge/demo.py | 1 + .../models/quicksrnetlarge/export.py | 43 +- .../models/quicksrnetlarge/info.yaml | 4 +- .../models/quicksrnetlarge/model.py | 2 - .../models/quicksrnetlarge/perf.yaml | 67 +- qai_hub_models/models/quicksrnetlarge/test.py | 1 + .../quicksrnetlarge_quantized/README.md | 54 + .../quicksrnetlarge_quantized/__init__.py | 10 + .../quicksrnetlarge_quantized/conftest.py | 26 + .../models/quicksrnetlarge_quantized/demo.py | 28 + .../quicksrnetlarge_quantized/export.py | 215 ++++ .../quicksrnetlarge_quantized/info.yaml | 35 + .../models/quicksrnetlarge_quantized/model.py | 99 ++ .../quicksrnetlarge_quantized/perf.yaml | 108 ++ .../models/quicksrnetlarge_quantized/test.py | 89 ++ .../models/quicksrnetmedium/README.md | 6 +- .../models/quicksrnetmedium/conftest.py | 26 + .../models/quicksrnetmedium/demo.py | 1 + .../models/quicksrnetmedium/export.py | 43 +- .../models/quicksrnetmedium/info.yaml | 4 +- .../models/quicksrnetmedium/model.py | 2 - .../models/quicksrnetmedium/perf.yaml | 163 ++- .../models/quicksrnetmedium/test.py | 1 + .../quicksrnetmedium_quantized/README.md | 54 + .../quicksrnetmedium_quantized/__init__.py | 10 + .../quicksrnetmedium_quantized/conftest.py | 26 + .../models/quicksrnetmedium_quantized/demo.py | 28 + .../quicksrnetmedium_quantized/export.py | 215 ++++ .../quicksrnetmedium_quantized/info.yaml | 35 + .../quicksrnetmedium_quantized/model.py | 98 ++ .../quicksrnetmedium_quantized/perf.yaml | 108 ++ .../models/quicksrnetmedium_quantized/test.py | 91 ++ .../models/quicksrnetsmall/README.md | 6 +- .../models/quicksrnetsmall/conftest.py | 26 + qai_hub_models/models/quicksrnetsmall/demo.py | 1 + .../models/quicksrnetsmall/export.py | 42 +- .../models/quicksrnetsmall/info.yaml | 4 +- .../models/quicksrnetsmall/model.py | 2 - .../models/quicksrnetsmall/perf.yaml | 67 +- qai_hub_models/models/quicksrnetsmall/test.py | 1 + .../quicksrnetsmall_quantized/README.md | 54 + .../quicksrnetsmall_quantized/__init__.py | 10 + .../quicksrnetsmall_quantized/conftest.py | 26 + .../models/quicksrnetsmall_quantized/demo.py | 28 + .../quicksrnetsmall_quantized/export.py | 215 ++++ .../quicksrnetsmall_quantized/info.yaml | 35 + .../models/quicksrnetsmall_quantized/model.py | 97 ++ .../quicksrnetsmall_quantized/perf.yaml | 108 ++ .../models/quicksrnetsmall_quantized/test.py | 87 ++ 
.../models/real_esrgan_general_x4v3/README.md | 6 +- .../real_esrgan_general_x4v3/conftest.py | 26 + .../models/real_esrgan_general_x4v3/demo.py | 1 + .../models/real_esrgan_general_x4v3/export.py | 43 +- .../models/real_esrgan_general_x4v3/info.yaml | 2 + .../models/real_esrgan_general_x4v3/perf.yaml | 69 +- .../real_esrgan_general_x4v3/requirements.txt | 9 +- .../models/real_esrgan_x4plus/README.md | 6 +- .../models/real_esrgan_x4plus/conftest.py | 26 + .../models/real_esrgan_x4plus/demo.py | 1 + .../models/real_esrgan_x4plus/export.py | 43 +- .../models/real_esrgan_x4plus/info.yaml | 2 + .../models/real_esrgan_x4plus/perf.yaml | 79 +- .../real_esrgan_x4plus/requirements.txt | 7 +- .../models/real_esrgan_x4plus/test.py | 1 + qai_hub_models/models/regnet/README.md | 6 +- qai_hub_models/models/regnet/conftest.py | 24 + qai_hub_models/models/regnet/demo.py | 4 +- qai_hub_models/models/regnet/export.py | 42 +- qai_hub_models/models/regnet/info.yaml | 2 + qai_hub_models/models/regnet/perf.yaml | 69 +- qai_hub_models/models/regnet/test.py | 3 + qai_hub_models/models/resnet101/README.md | 6 +- qai_hub_models/models/resnet101/conftest.py | 24 + qai_hub_models/models/resnet101/demo.py | 4 +- qai_hub_models/models/resnet101/export.py | 42 +- qai_hub_models/models/resnet101/info.yaml | 2 + qai_hub_models/models/resnet101/perf.yaml | 65 +- qai_hub_models/models/resnet101/test.py | 3 + .../models/resnet101_quantized/README.md | 6 +- .../models/resnet101_quantized/conftest.py | 24 + .../models/resnet101_quantized/demo.py | 7 +- .../models/resnet101_quantized/export.py | 42 +- .../models/resnet101_quantized/info.yaml | 2 + .../models/resnet101_quantized/model.py | 33 +- .../models/resnet101_quantized/perf.yaml | 85 +- .../models/resnet101_quantized/test.py | 11 - qai_hub_models/models/resnet18/README.md | 6 +- qai_hub_models/models/resnet18/conftest.py | 24 + qai_hub_models/models/resnet18/demo.py | 4 +- qai_hub_models/models/resnet18/export.py | 42 +- qai_hub_models/models/resnet18/info.yaml | 2 + qai_hub_models/models/resnet18/perf.yaml | 69 +- qai_hub_models/models/resnet18/test.py | 3 + .../models/resnet18_quantized/README.md | 6 +- .../models/resnet18_quantized/conftest.py | 24 + .../models/resnet18_quantized/demo.py | 4 +- .../models/resnet18_quantized/export.py | 40 +- .../models/resnet18_quantized/info.yaml | 2 + .../models/resnet18_quantized/model.py | 37 +- .../models/resnet18_quantized/perf.yaml | 71 +- .../models/resnet18_quantized/test.py | 11 - qai_hub_models/models/resnet50/README.md | 6 +- qai_hub_models/models/resnet50/conftest.py | 24 + qai_hub_models/models/resnet50/demo.py | 4 +- qai_hub_models/models/resnet50/export.py | 42 +- qai_hub_models/models/resnet50/info.yaml | 2 + qai_hub_models/models/resnet50/perf.yaml | 69 +- qai_hub_models/models/resnet50/test.py | 3 + qai_hub_models/models/resnext101/README.md | 6 +- qai_hub_models/models/resnext101/conftest.py | 24 + qai_hub_models/models/resnext101/demo.py | 4 +- qai_hub_models/models/resnext101/export.py | 42 +- qai_hub_models/models/resnext101/info.yaml | 2 + qai_hub_models/models/resnext101/perf.yaml | 65 +- qai_hub_models/models/resnext101/test.py | 3 + .../models/resnext101_quantized/README.md | 6 +- .../models/resnext101_quantized/conftest.py | 24 + .../models/resnext101_quantized/demo.py | 7 +- .../models/resnext101_quantized/export.py | 40 +- .../models/resnext101_quantized/info.yaml | 2 + .../models/resnext101_quantized/model.py | 34 +- .../models/resnext101_quantized/perf.yaml | 65 +- 
.../models/resnext101_quantized/test.py | 11 - qai_hub_models/models/resnext50/README.md | 6 +- qai_hub_models/models/resnext50/conftest.py | 24 + qai_hub_models/models/resnext50/demo.py | 4 +- qai_hub_models/models/resnext50/export.py | 42 +- qai_hub_models/models/resnext50/info.yaml | 2 + qai_hub_models/models/resnext50/model.py | 2 +- qai_hub_models/models/resnext50/perf.yaml | 63 +- qai_hub_models/models/resnext50/test.py | 3 + .../models/resnext50_quantized/README.md | 54 + .../models/resnext50_quantized/__init__.py | 10 + .../models/resnext50_quantized/conftest.py | 24 + .../models/resnext50_quantized/demo.py | 17 + .../models/resnext50_quantized/export.py | 209 ++++ .../models/resnext50_quantized/info.yaml | 43 + .../models/resnext50_quantized/model.py | 93 ++ .../perf.yaml | 75 +- .../models/resnext50_quantized/test.py | 30 + qai_hub_models/models/sam/README.md | 6 +- qai_hub_models/models/sam/conftest.py | 28 + qai_hub_models/models/sam/export.py | 79 +- qai_hub_models/models/sam/info.yaml | 2 + qai_hub_models/models/sam/model.py | 64 +- qai_hub_models/models/sam/perf.yaml | 69 +- qai_hub_models/models/sam/requirements.txt | 6 +- qai_hub_models/models/sam/test.py | 4 +- qai_hub_models/models/sesr_m5/README.md | 6 +- qai_hub_models/models/sesr_m5/conftest.py | 26 + qai_hub_models/models/sesr_m5/demo.py | 1 + qai_hub_models/models/sesr_m5/export.py | 39 +- qai_hub_models/models/sesr_m5/info.yaml | 2 + qai_hub_models/models/sesr_m5/model.py | 2 - qai_hub_models/models/sesr_m5/perf.yaml | 69 +- qai_hub_models/models/sesr_m5/test.py | 1 + .../models/sesr_m5_quantized/README.md | 6 +- .../models/sesr_m5_quantized/conftest.py | 26 + .../models/sesr_m5_quantized/demo.py | 1 + .../models/sesr_m5_quantized/export.py | 37 +- .../models/sesr_m5_quantized/info.yaml | 2 + .../models/sesr_m5_quantized/model.py | 4 +- .../models/sesr_m5_quantized/perf.yaml | 61 +- .../models/sesr_m5_quantized/test.py | 4 +- qai_hub_models/models/shufflenet_v2/README.md | 6 +- .../models/shufflenet_v2/conftest.py | 24 + qai_hub_models/models/shufflenet_v2/demo.py | 4 +- qai_hub_models/models/shufflenet_v2/export.py | 42 +- qai_hub_models/models/shufflenet_v2/info.yaml | 2 + qai_hub_models/models/shufflenet_v2/model.py | 2 +- qai_hub_models/models/shufflenet_v2/perf.yaml | 69 +- qai_hub_models/models/shufflenet_v2/test.py | 3 + .../models/shufflenet_v2_quantized/README.md | 6 +- .../shufflenet_v2_quantized/conftest.py | 24 + .../models/shufflenet_v2_quantized/demo.py | 7 +- .../models/shufflenet_v2_quantized/export.py | 42 +- .../models/shufflenet_v2_quantized/info.yaml | 2 + .../models/shufflenet_v2_quantized/model.py | 104 +- .../models/shufflenet_v2_quantized/perf.yaml | 81 +- .../models/shufflenet_v2_quantized/test.py | 11 - qai_hub_models/models/sinet/README.md | 6 +- qai_hub_models/models/sinet/conftest.py | 26 + qai_hub_models/models/sinet/demo.py | 4 +- qai_hub_models/models/sinet/export.py | 42 +- qai_hub_models/models/sinet/info.yaml | 2 + qai_hub_models/models/sinet/model.py | 19 +- qai_hub_models/models/sinet/perf.yaml | 69 +- qai_hub_models/models/sinet/test.py | 1 + qai_hub_models/models/squeezenet1_1/README.md | 6 +- .../models/squeezenet1_1/conftest.py | 24 + qai_hub_models/models/squeezenet1_1/demo.py | 4 +- qai_hub_models/models/squeezenet1_1/export.py | 42 +- qai_hub_models/models/squeezenet1_1/info.yaml | 2 + qai_hub_models/models/squeezenet1_1/model.py | 2 +- qai_hub_models/models/squeezenet1_1/perf.yaml | 67 +- qai_hub_models/models/squeezenet1_1/test.py | 3 + 
.../models/squeezenet1_1_quantized/README.md | 6 +- .../squeezenet1_1_quantized/conftest.py | 24 + .../models/squeezenet1_1_quantized/demo.py | 7 +- .../models/squeezenet1_1_quantized/export.py | 40 +- .../models/squeezenet1_1_quantized/info.yaml | 2 + .../models/squeezenet1_1_quantized/model.py | 29 +- .../models/squeezenet1_1_quantized/perf.yaml | 79 +- .../models/squeezenet1_1_quantized/test.py | 11 - .../stable_diffusion_quantized/README.md | 6 +- .../stable_diffusion_quantized/export.py | 65 +- .../stable_diffusion_quantized/info.yaml | 2 + .../stable_diffusion_quantized/model.py | 32 +- .../requirements.txt | 2 +- .../models/stable_diffusion_quantized/test.py | 7 + qai_hub_models/models/stylegan2/README.md | 6 +- qai_hub_models/models/stylegan2/conftest.py | 26 + qai_hub_models/models/stylegan2/demo.py | 4 +- qai_hub_models/models/stylegan2/export.py | 39 +- qai_hub_models/models/stylegan2/info.yaml | 2 + qai_hub_models/models/stylegan2/model.py | 38 +- qai_hub_models/models/stylegan2/perf.yaml | 63 +- .../models/stylegan2/requirements.txt | 2 +- qai_hub_models/models/stylegan2/test.py | 2 + qai_hub_models/models/swin_base/README.md | 6 +- qai_hub_models/models/swin_base/conftest.py | 24 + qai_hub_models/models/swin_base/demo.py | 4 +- qai_hub_models/models/swin_base/export.py | 42 +- qai_hub_models/models/swin_base/info.yaml | 2 + qai_hub_models/models/swin_base/perf.yaml | 67 +- qai_hub_models/models/swin_base/test.py | 2 +- qai_hub_models/models/swin_small/README.md | 6 +- qai_hub_models/models/swin_small/conftest.py | 24 + qai_hub_models/models/swin_small/demo.py | 4 +- qai_hub_models/models/swin_small/export.py | 42 +- qai_hub_models/models/swin_small/info.yaml | 2 + qai_hub_models/models/swin_small/perf.yaml | 67 +- qai_hub_models/models/swin_small/test.py | 2 +- qai_hub_models/models/swin_tiny/README.md | 6 +- qai_hub_models/models/swin_tiny/conftest.py | 24 + qai_hub_models/models/swin_tiny/demo.py | 4 +- qai_hub_models/models/swin_tiny/export.py | 42 +- qai_hub_models/models/swin_tiny/info.yaml | 2 + qai_hub_models/models/swin_tiny/perf.yaml | 67 +- qai_hub_models/models/swin_tiny/test.py | 2 +- qai_hub_models/models/trocr/README.md | 6 +- qai_hub_models/models/trocr/conftest.py | 24 + qai_hub_models/models/trocr/export.py | 69 +- qai_hub_models/models/trocr/info.yaml | 2 + qai_hub_models/models/trocr/model.py | 25 +- qai_hub_models/models/trocr/perf.yaml | 115 +- qai_hub_models/models/trocr/requirements.txt | 4 +- .../models/unet_segmentation/README.md | 6 +- .../models/unet_segmentation/conftest.py | 24 + .../models/unet_segmentation/demo.py | 6 +- .../models/unet_segmentation/export.py | 42 +- .../models/unet_segmentation/info.yaml | 2 + .../models/unet_segmentation/perf.yaml | 69 +- qai_hub_models/models/vit/README.md | 6 +- qai_hub_models/models/vit/conftest.py | 24 + qai_hub_models/models/vit/demo.py | 4 +- qai_hub_models/models/vit/export.py | 42 +- qai_hub_models/models/vit/info.yaml | 2 + qai_hub_models/models/vit/model.py | 2 +- qai_hub_models/models/vit/perf.yaml | 59 +- qai_hub_models/models/whisper_asr/info.yaml | 38 - .../README.md | 22 +- .../__init__.py | 5 +- .../models/whisper_base_en/conftest.py | 24 + qai_hub_models/models/whisper_base_en/demo.py | 14 + .../export.py | 77 +- .../models/whisper_base_en/info.yaml | 40 + .../models/whisper_base_en/model.py | 16 + .../models/whisper_base_en/perf.yaml | 186 +++ .../requirements.txt | 0 qai_hub_models/models/whisper_base_en/test.py | 22 + .../models/whisper_small_en/README.md | 59 + 
.../models/whisper_small_en/__init__.py | 8 + .../models/whisper_small_en/conftest.py | 24 + .../models/whisper_small_en/demo.py | 14 + .../models/whisper_small_en/export.py | 229 ++++ .../models/whisper_small_en/info.yaml | 40 + .../models/whisper_small_en/model.py | 16 + .../models/whisper_small_en/perf.yaml | 186 +++ .../models/whisper_small_en/requirements.txt | 2 + .../models/whisper_small_en/test.py | 22 + .../models/whisper_small_multi/code-gen.yaml | 4 + .../models/whisper_small_multi/demo.py | 14 + .../models/whisper_tiny_en/README.md | 59 + .../models/whisper_tiny_en/__init__.py | 8 + .../models/whisper_tiny_en/conftest.py | 24 + qai_hub_models/models/whisper_tiny_en/demo.py | 14 + .../models/whisper_tiny_en/export.py | 229 ++++ .../models/whisper_tiny_en/info.yaml | 40 + .../models/whisper_tiny_en/model.py | 16 + .../models/whisper_tiny_en/perf.yaml | 186 +++ .../models/whisper_tiny_en/requirements.txt | 2 + qai_hub_models/models/whisper_tiny_en/test.py | 22 + qai_hub_models/models/wideresnet50/README.md | 6 +- .../models/wideresnet50/conftest.py | 24 + qai_hub_models/models/wideresnet50/demo.py | 4 +- qai_hub_models/models/wideresnet50/export.py | 42 +- qai_hub_models/models/wideresnet50/info.yaml | 2 + qai_hub_models/models/wideresnet50/model.py | 2 +- qai_hub_models/models/wideresnet50/perf.yaml | 69 +- qai_hub_models/models/wideresnet50/test.py | 3 + .../models/wideresnet50_quantized/README.md | 6 +- .../models/wideresnet50_quantized/conftest.py | 24 + .../models/wideresnet50_quantized/demo.py | 7 +- .../models/wideresnet50_quantized/export.py | 40 +- .../models/wideresnet50_quantized/info.yaml | 2 + .../models/wideresnet50_quantized/model.py | 40 +- .../models/wideresnet50_quantized/perf.yaml | 77 +- .../models/wideresnet50_quantized/test.py | 11 - qai_hub_models/models/xlsr/README.md | 6 +- qai_hub_models/models/xlsr/conftest.py | 26 + qai_hub_models/models/xlsr/demo.py | 2 +- qai_hub_models/models/xlsr/export.py | 39 +- qai_hub_models/models/xlsr/info.yaml | 2 + qai_hub_models/models/xlsr/model.py | 10 +- qai_hub_models/models/xlsr/perf.yaml | 69 +- .../models/xlsr_quantized/README.md | 6 +- .../models/xlsr_quantized/conftest.py | 26 + qai_hub_models/models/xlsr_quantized/demo.py | 1 + .../models/xlsr_quantized/export.py | 37 +- .../models/xlsr_quantized/info.yaml | 2 + .../models/xlsr_quantized/perf.yaml | 61 +- qai_hub_models/models/yolov6/README.md | 6 +- qai_hub_models/models/yolov6/conftest.py | 26 + qai_hub_models/models/yolov6/demo.py | 1 + qai_hub_models/models/yolov6/export.py | 42 +- qai_hub_models/models/yolov6/info.yaml | 3 + qai_hub_models/models/yolov6/perf.yaml | 69 +- qai_hub_models/models/yolov6/test.py | 1 + qai_hub_models/models/yolov7/README.md | 6 +- qai_hub_models/models/yolov7/conftest.py | 26 + qai_hub_models/models/yolov7/demo.py | 1 + qai_hub_models/models/yolov7/export.py | 42 +- qai_hub_models/models/yolov7/info.yaml | 2 + qai_hub_models/models/yolov7/model.py | 5 +- qai_hub_models/models/yolov7/perf.yaml | 59 +- qai_hub_models/models/yolov7/requirements.txt | 9 +- qai_hub_models/models/yolov7/test.py | 2 + qai_hub_models/models/yolov8_det/README.md | 14 +- qai_hub_models/models/yolov8_det/conftest.py | 24 + qai_hub_models/models/yolov8_det/demo.py | 1 + qai_hub_models/models/yolov8_det/export.py | 44 +- qai_hub_models/models/yolov8_det/info.yaml | 10 +- qai_hub_models/models/yolov8_det/perf.yaml | 71 +- .../models/yolov8_det/requirements.txt | 2 + qai_hub_models/models/yolov8_det/test.py | 2 - qai_hub_models/models/yolov8_seg/README.md | 14 +- 
qai_hub_models/models/yolov8_seg/conftest.py | 24 + qai_hub_models/models/yolov8_seg/demo.py | 4 +- qai_hub_models/models/yolov8_seg/export.py | 44 +- qai_hub_models/models/yolov8_seg/info.yaml | 10 +- qai_hub_models/models/yolov8_seg/model.py | 2 +- qai_hub_models/models/yolov8_seg/perf.yaml | 59 +- .../models/yolov8_seg/requirements.txt | 2 + qai_hub_models/models/yolov8_seg/test.py | 2 + qai_hub_models/requirements-dev.txt | 24 +- qai_hub_models/requirements.txt | 32 +- qai_hub_models/test/e2e/test_aimet_compile.py | 11 +- qai_hub_models/utils/aimet/config_loader.py | 14 +- .../utils/aimet/default_config.json | 21 +- .../utils/aimet/default_config_legacy_v1.json | 71 ++ ...nel.json => default_config_legacy_v2.json} | 0 qai_hub_models/utils/aimet/repo.py | 32 + qai_hub_models/utils/args.py | 36 +- qai_hub_models/utils/asset_loaders.py | 49 +- qai_hub_models/utils/base_model.py | 225 ++-- qai_hub_models/utils/compare.py | 4 +- qai_hub_models/utils/config_loaders.py | 84 +- qai_hub_models/utils/inference.py | 75 +- qai_hub_models/utils/input_spec.py | 2 +- qai_hub_models/utils/measurement.py | 22 +- qai_hub_models/utils/model_card.py | 160 ++- qai_hub_models/utils/path_helpers.py | 9 +- qai_hub_models/utils/perf_summary.py | 155 +-- qai_hub_models/utils/printing.py | 4 +- qai_hub_models/utils/qai_hub_helpers.py | 20 +- qai_hub_models/utils/qnn_helpers.py | 7 +- qai_hub_models/utils/quantization_aimet.py | 228 ++-- scripts/build_and_test.py | 43 +- scripts/ci/git-credential-helper.sh | 4 + scripts/{ => examples}/quantize_ffnet.py | 0 .../examples/quantize_imagenet_classifier.py | 41 +- ..._numerics_imagenet_classifier_quantized.py | 306 +++++ .../test_numerics_mobilenet_v2_quantized.py | 177 --- scripts/github/create-aws-profile.sh | 18 + scripts/tasks/changes.py | 39 +- scripts/tasks/release.py | 1 + scripts/tasks/test.py | 54 +- scripts/tasks/venv.py | 42 +- scripts/util/common.sh | 4 + scripts/util/env_create.sh | 4 + scripts/util/env_sync.sh | 4 + scripts/util/github.sh | 4 + scripts/util/pytest_with_coverage.sh | 4 + scripts/util/run_mypy.sh | 6 +- setup.py | 2 +- 857 files changed, 21748 insertions(+), 4591 deletions(-) delete mode 100644 .gitignore create mode 100644 apps/android/ImageClassification/README.md create mode 100644 apps/android/ImageClassification/build.gradle create mode 100644 apps/android/ImageClassification/build_apk.py create mode 100644 apps/android/ImageClassification/classification/build.gradle create mode 100644 apps/android/ImageClassification/classification/proguard-rules.pro create mode 100644 apps/android/ImageClassification/classification/src/main/AndroidManifest.xml create mode 100644 apps/android/ImageClassification/classification/src/main/assets/Sample1.png create mode 100644 apps/android/ImageClassification/classification/src/main/assets/Sample2.png create mode 100644 apps/android/ImageClassification/classification/src/main/assets/Sample3.png create mode 100644 apps/android/ImageClassification/classification/src/main/assets/Sample4.png create mode 100644 apps/android/ImageClassification/classification/src/main/assets/Sample5.png create mode 100644 apps/android/ImageClassification/classification/src/main/assets/labels.txt create mode 100644 apps/android/ImageClassification/classification/src/main/java/com/qcom/imageclassification/ImageClassification.java create mode 100644 apps/android/ImageClassification/classification/src/main/java/com/qcom/imageclassification/ImageClassificationResult.java create mode 100644 
apps/android/ImageClassification/classification/src/main/java/com/qcom/imageclassification/QNNActivity.java create mode 100644 apps/android/ImageClassification/classification/src/main/java/com/qcom/imageclassification/Result.java create mode 100644 apps/android/ImageClassification/classification/src/main/java/com/qcom/imageclassification/Utils.java create mode 100644 apps/android/ImageClassification/classification/src/main/res/drawable-v24/ic_launcher_foreground.xml create mode 100644 apps/android/ImageClassification/classification/src/main/res/drawable/ic_launcher_background.xml create mode 100644 apps/android/ImageClassification/classification/src/main/res/drawable/image_classification_icon.png create mode 100644 apps/android/ImageClassification/classification/src/main/res/layout/activity_classification.xml create mode 100644 apps/android/ImageClassification/classification/src/main/res/mipmap-anydpi-v26/ic_launcher.xml create mode 100644 apps/android/ImageClassification/classification/src/main/res/mipmap-anydpi-v26/ic_launcher_round.xml create mode 100644 apps/android/ImageClassification/classification/src/main/res/mipmap-hdpi/ic_launcher.png create mode 100644 apps/android/ImageClassification/classification/src/main/res/mipmap-hdpi/ic_launcher_round.png create mode 100644 apps/android/ImageClassification/classification/src/main/res/mipmap-mdpi/ic_launcher.png create mode 100644 apps/android/ImageClassification/classification/src/main/res/mipmap-mdpi/ic_launcher_round.png create mode 100644 apps/android/ImageClassification/classification/src/main/res/mipmap-xhdpi/ic_launcher.png create mode 100644 apps/android/ImageClassification/classification/src/main/res/mipmap-xhdpi/ic_launcher_round.png create mode 100644 apps/android/ImageClassification/classification/src/main/res/mipmap-xxhdpi/ic_launcher.png create mode 100644 apps/android/ImageClassification/classification/src/main/res/mipmap-xxhdpi/ic_launcher_round.png create mode 100644 apps/android/ImageClassification/classification/src/main/res/mipmap-xxxhdpi/ic_launcher.png create mode 100644 apps/android/ImageClassification/classification/src/main/res/mipmap-xxxhdpi/ic_launcher_round.png create mode 100644 apps/android/ImageClassification/classification/src/main/res/values-night/themes.xml create mode 100644 apps/android/ImageClassification/classification/src/main/res/values/colors.xml create mode 100644 apps/android/ImageClassification/classification/src/main/res/values/strings.xml create mode 100644 apps/android/ImageClassification/classification/src/main/res/values/themes.xml create mode 100644 apps/android/ImageClassification/gradle.properties create mode 100644 apps/android/ImageClassification/gradle/wrapper/gradle-wrapper.jar create mode 100644 apps/android/ImageClassification/gradle/wrapper/gradle-wrapper.properties create mode 100644 apps/android/ImageClassification/gradlew create mode 100644 apps/android/ImageClassification/gradlew.bat create mode 100644 apps/android/ImageClassification/settings.gradle create mode 100644 apps/android/ImageSuperResolution/README.md create mode 100644 apps/android/ImageSuperResolution/build.gradle create mode 100644 apps/android/ImageSuperResolution/build.properties create mode 100644 apps/android/ImageSuperResolution/build_apk.py create mode 100644 apps/android/ImageSuperResolution/gradle.properties create mode 100644 apps/android/ImageSuperResolution/gradle/wrapper/gradle-wrapper.jar create mode 100644 apps/android/ImageSuperResolution/gradle/wrapper/gradle-wrapper.properties create mode 100644 
apps/android/ImageSuperResolution/gradlew create mode 100644 apps/android/ImageSuperResolution/gradlew.bat create mode 100644 apps/android/ImageSuperResolution/settings.gradle create mode 100644 apps/android/ImageSuperResolution/superresolution/build.gradle create mode 100644 apps/android/ImageSuperResolution/superresolution/proguard-rules.pro create mode 100644 apps/android/ImageSuperResolution/superresolution/src/main/AndroidManifest.xml create mode 100644 apps/android/ImageSuperResolution/superresolution/src/main/assets/Sample1.jpg create mode 100644 apps/android/ImageSuperResolution/superresolution/src/main/assets/Sample2.jpg create mode 100644 apps/android/ImageSuperResolution/superresolution/src/main/java/com/qcom/imagesuperres/QNNActivity.java create mode 100644 apps/android/ImageSuperResolution/superresolution/src/main/java/com/qcom/imagesuperres/Result.java create mode 100644 apps/android/ImageSuperResolution/superresolution/src/main/java/com/qcom/imagesuperres/SuperResolution.java create mode 100644 apps/android/ImageSuperResolution/superresolution/src/main/java/com/qcom/imagesuperres/SuperResolutionResult.java create mode 100644 apps/android/ImageSuperResolution/superresolution/src/main/java/com/qcom/imagesuperres/Utils.java create mode 100644 apps/android/ImageSuperResolution/superresolution/src/main/java/com/qcom/imagesuperres/UtilsESRGAN.java create mode 100644 apps/android/ImageSuperResolution/superresolution/src/main/res/drawable-v24/ic_launcher_foreground.xml create mode 100644 apps/android/ImageSuperResolution/superresolution/src/main/res/drawable/ic_launcher_background.xml create mode 100644 apps/android/ImageSuperResolution/superresolution/src/main/res/layout/activity_superres.xml create mode 100644 apps/android/ImageSuperResolution/superresolution/src/main/res/mipmap-anydpi-v26/ic_launcher.xml create mode 100644 apps/android/ImageSuperResolution/superresolution/src/main/res/mipmap-anydpi-v26/ic_launcher_round.xml create mode 100644 apps/android/ImageSuperResolution/superresolution/src/main/res/mipmap-hdpi/ic_launcher.png create mode 100644 apps/android/ImageSuperResolution/superresolution/src/main/res/mipmap-hdpi/ic_launcher_round.png create mode 100644 apps/android/ImageSuperResolution/superresolution/src/main/res/mipmap-mdpi/ic_launcher.png create mode 100644 apps/android/ImageSuperResolution/superresolution/src/main/res/mipmap-mdpi/ic_launcher_round.png create mode 100644 apps/android/ImageSuperResolution/superresolution/src/main/res/mipmap-xhdpi/ic_launcher.png create mode 100644 apps/android/ImageSuperResolution/superresolution/src/main/res/mipmap-xhdpi/ic_launcher_round.png create mode 100644 apps/android/ImageSuperResolution/superresolution/src/main/res/mipmap-xxhdpi/ic_launcher.png create mode 100644 apps/android/ImageSuperResolution/superresolution/src/main/res/mipmap-xxhdpi/ic_launcher_round.png create mode 100644 apps/android/ImageSuperResolution/superresolution/src/main/res/mipmap-xxxhdpi/ic_launcher.png create mode 100644 apps/android/ImageSuperResolution/superresolution/src/main/res/mipmap-xxxhdpi/ic_launcher_round.png create mode 100644 apps/android/ImageSuperResolution/superresolution/src/main/res/values-night/themes.xml create mode 100644 apps/android/ImageSuperResolution/superresolution/src/main/res/values/colors.xml create mode 100644 apps/android/ImageSuperResolution/superresolution/src/main/res/values/strings.xml create mode 100644 apps/android/ImageSuperResolution/superresolution/src/main/res/values/themes.xml create mode 100644 
qai_hub_models/global_requirements.txt create mode 100644 qai_hub_models/models/_shared/whisper/__init__.py rename qai_hub_models/models/{whisper_asr => _shared/whisper}/app.py (98%) rename qai_hub_models/models/{whisper_asr => _shared/whisper}/demo.py (82%) rename qai_hub_models/models/{whisper_asr => _shared/whisper}/model.py (88%) rename qai_hub_models/models/{whisper_asr/test.py => _shared/whisper/test_utils.py} (71%) create mode 100644 qai_hub_models/models/aotgan/README.md create mode 100644 qai_hub_models/models/aotgan/__init__.py create mode 100644 qai_hub_models/models/aotgan/conftest.py create mode 100644 qai_hub_models/models/aotgan/demo.py create mode 100644 qai_hub_models/models/aotgan/export.py create mode 100644 qai_hub_models/models/aotgan/info.yaml create mode 100644 qai_hub_models/models/aotgan/model.py create mode 100644 qai_hub_models/models/aotgan/patches/layer_norm.diff create mode 100644 qai_hub_models/models/aotgan/perf.yaml create mode 100644 qai_hub_models/models/aotgan/test.py create mode 100644 qai_hub_models/models/common.py create mode 100644 qai_hub_models/models/convnext_tiny/conftest.py create mode 100644 qai_hub_models/models/ddrnet23_slim/conftest.py create mode 100644 qai_hub_models/models/deeplabv3_resnet50/conftest.py create mode 100644 qai_hub_models/models/densenet121/conftest.py create mode 100644 qai_hub_models/models/detr_resnet101/conftest.py create mode 100644 qai_hub_models/models/detr_resnet101_dc5/conftest.py create mode 100644 qai_hub_models/models/detr_resnet50/conftest.py create mode 100644 qai_hub_models/models/detr_resnet50_dc5/conftest.py create mode 100644 qai_hub_models/models/efficientnet_b0/conftest.py create mode 100644 qai_hub_models/models/esrgan/conftest.py create mode 100644 qai_hub_models/models/facebook_denoiser/conftest.py create mode 100644 qai_hub_models/models/fastsam_s/conftest.py create mode 100644 qai_hub_models/models/fastsam_x/conftest.py create mode 100644 qai_hub_models/models/fcn_resnet50/conftest.py create mode 100644 qai_hub_models/models/ffnet_122ns_lowres/conftest.py create mode 100644 qai_hub_models/models/ffnet_40s/conftest.py create mode 100644 qai_hub_models/models/ffnet_40s_quantized/conftest.py create mode 100644 qai_hub_models/models/ffnet_54s/conftest.py create mode 100644 qai_hub_models/models/ffnet_54s_quantized/conftest.py create mode 100644 qai_hub_models/models/ffnet_78s/conftest.py create mode 100644 qai_hub_models/models/ffnet_78s_lowres/conftest.py create mode 100644 qai_hub_models/models/ffnet_78s_quantized/conftest.py create mode 100644 qai_hub_models/models/googlenet/conftest.py create mode 100644 qai_hub_models/models/googlenet_quantized/conftest.py create mode 100644 qai_hub_models/models/hrnet_pose/conftest.py create mode 100644 qai_hub_models/models/hrnet_pose_quantized/conftest.py create mode 100644 qai_hub_models/models/huggingface_wavlm_base_plus/conftest.py create mode 100644 qai_hub_models/models/inception_v3/conftest.py create mode 100644 qai_hub_models/models/inception_v3_quantized/conftest.py create mode 100644 qai_hub_models/models/lama_dilated/conftest.py create mode 100644 qai_hub_models/models/litehrnet/conftest.py create mode 100644 qai_hub_models/models/mediapipe_face/conftest.py delete mode 100644 qai_hub_models/models/mediapipe_face/requirements.txt create mode 100644 qai_hub_models/models/mediapipe_hand/conftest.py delete mode 100644 qai_hub_models/models/mediapipe_hand/requirements.txt create mode 100644 qai_hub_models/models/mediapipe_pose/conftest.py delete mode 
100644 qai_hub_models/models/mediapipe_pose/requirements.txt create mode 100644 qai_hub_models/models/mediapipe_selfie/conftest.py create mode 100644 qai_hub_models/models/mnasnet05/conftest.py create mode 100644 qai_hub_models/models/mobilenet_v2/conftest.py create mode 100644 qai_hub_models/models/mobilenet_v2_quantized/conftest.py create mode 100644 qai_hub_models/models/mobilenet_v3_large/conftest.py create mode 100644 qai_hub_models/models/mobilenet_v3_large_quantized/README.md create mode 100644 qai_hub_models/models/mobilenet_v3_large_quantized/__init__.py create mode 100644 qai_hub_models/models/mobilenet_v3_large_quantized/conftest.py create mode 100644 qai_hub_models/models/mobilenet_v3_large_quantized/demo.py create mode 100644 qai_hub_models/models/mobilenet_v3_large_quantized/export.py create mode 100644 qai_hub_models/models/mobilenet_v3_large_quantized/info.yaml create mode 100644 qai_hub_models/models/mobilenet_v3_large_quantized/model.py create mode 100644 qai_hub_models/models/mobilenet_v3_large_quantized/perf.yaml create mode 100644 qai_hub_models/models/mobilenet_v3_large_quantized/test.py create mode 100644 qai_hub_models/models/mobilenet_v3_small/conftest.py create mode 100644 qai_hub_models/models/openai_clip/conftest.py create mode 100644 qai_hub_models/models/openpose/conftest.py create mode 100644 qai_hub_models/models/protocols.py create mode 100644 qai_hub_models/models/quicksrnetlarge/conftest.py create mode 100644 qai_hub_models/models/quicksrnetlarge_quantized/README.md create mode 100644 qai_hub_models/models/quicksrnetlarge_quantized/__init__.py create mode 100644 qai_hub_models/models/quicksrnetlarge_quantized/conftest.py create mode 100644 qai_hub_models/models/quicksrnetlarge_quantized/demo.py create mode 100644 qai_hub_models/models/quicksrnetlarge_quantized/export.py create mode 100644 qai_hub_models/models/quicksrnetlarge_quantized/info.yaml create mode 100644 qai_hub_models/models/quicksrnetlarge_quantized/model.py create mode 100644 qai_hub_models/models/quicksrnetlarge_quantized/perf.yaml create mode 100644 qai_hub_models/models/quicksrnetlarge_quantized/test.py create mode 100644 qai_hub_models/models/quicksrnetmedium/conftest.py create mode 100644 qai_hub_models/models/quicksrnetmedium_quantized/README.md create mode 100644 qai_hub_models/models/quicksrnetmedium_quantized/__init__.py create mode 100644 qai_hub_models/models/quicksrnetmedium_quantized/conftest.py create mode 100644 qai_hub_models/models/quicksrnetmedium_quantized/demo.py create mode 100644 qai_hub_models/models/quicksrnetmedium_quantized/export.py create mode 100644 qai_hub_models/models/quicksrnetmedium_quantized/info.yaml create mode 100644 qai_hub_models/models/quicksrnetmedium_quantized/model.py create mode 100644 qai_hub_models/models/quicksrnetmedium_quantized/perf.yaml create mode 100644 qai_hub_models/models/quicksrnetmedium_quantized/test.py create mode 100644 qai_hub_models/models/quicksrnetsmall/conftest.py create mode 100644 qai_hub_models/models/quicksrnetsmall_quantized/README.md create mode 100644 qai_hub_models/models/quicksrnetsmall_quantized/__init__.py create mode 100644 qai_hub_models/models/quicksrnetsmall_quantized/conftest.py create mode 100644 qai_hub_models/models/quicksrnetsmall_quantized/demo.py create mode 100644 qai_hub_models/models/quicksrnetsmall_quantized/export.py create mode 100644 qai_hub_models/models/quicksrnetsmall_quantized/info.yaml create mode 100644 qai_hub_models/models/quicksrnetsmall_quantized/model.py create mode 100644 
qai_hub_models/models/quicksrnetsmall_quantized/perf.yaml create mode 100644 qai_hub_models/models/quicksrnetsmall_quantized/test.py create mode 100644 qai_hub_models/models/real_esrgan_general_x4v3/conftest.py create mode 100644 qai_hub_models/models/real_esrgan_x4plus/conftest.py create mode 100644 qai_hub_models/models/regnet/conftest.py create mode 100644 qai_hub_models/models/resnet101/conftest.py create mode 100644 qai_hub_models/models/resnet101_quantized/conftest.py create mode 100644 qai_hub_models/models/resnet18/conftest.py create mode 100644 qai_hub_models/models/resnet18_quantized/conftest.py create mode 100644 qai_hub_models/models/resnet50/conftest.py create mode 100644 qai_hub_models/models/resnext101/conftest.py create mode 100644 qai_hub_models/models/resnext101_quantized/conftest.py create mode 100644 qai_hub_models/models/resnext50/conftest.py create mode 100644 qai_hub_models/models/resnext50_quantized/README.md create mode 100644 qai_hub_models/models/resnext50_quantized/__init__.py create mode 100644 qai_hub_models/models/resnext50_quantized/conftest.py create mode 100644 qai_hub_models/models/resnext50_quantized/demo.py create mode 100644 qai_hub_models/models/resnext50_quantized/export.py create mode 100644 qai_hub_models/models/resnext50_quantized/info.yaml create mode 100644 qai_hub_models/models/resnext50_quantized/model.py rename qai_hub_models/models/{whisper_asr => resnext50_quantized}/perf.yaml (71%) create mode 100644 qai_hub_models/models/resnext50_quantized/test.py create mode 100644 qai_hub_models/models/sam/conftest.py create mode 100644 qai_hub_models/models/sesr_m5/conftest.py create mode 100644 qai_hub_models/models/sesr_m5_quantized/conftest.py create mode 100644 qai_hub_models/models/shufflenet_v2/conftest.py create mode 100644 qai_hub_models/models/shufflenet_v2_quantized/conftest.py create mode 100644 qai_hub_models/models/sinet/conftest.py create mode 100644 qai_hub_models/models/squeezenet1_1/conftest.py create mode 100644 qai_hub_models/models/squeezenet1_1_quantized/conftest.py create mode 100644 qai_hub_models/models/stylegan2/conftest.py create mode 100644 qai_hub_models/models/swin_base/conftest.py create mode 100644 qai_hub_models/models/swin_small/conftest.py create mode 100644 qai_hub_models/models/swin_tiny/conftest.py create mode 100644 qai_hub_models/models/trocr/conftest.py create mode 100644 qai_hub_models/models/unet_segmentation/conftest.py create mode 100644 qai_hub_models/models/vit/conftest.py delete mode 100644 qai_hub_models/models/whisper_asr/info.yaml rename qai_hub_models/models/{whisper_asr => whisper_base_en}/README.md (58%) rename qai_hub_models/models/{whisper_asr => whisper_base_en}/__init__.py (67%) create mode 100644 qai_hub_models/models/whisper_base_en/conftest.py create mode 100644 qai_hub_models/models/whisper_base_en/demo.py rename qai_hub_models/models/{whisper_asr => whisper_base_en}/export.py (74%) create mode 100644 qai_hub_models/models/whisper_base_en/info.yaml create mode 100644 qai_hub_models/models/whisper_base_en/model.py create mode 100644 qai_hub_models/models/whisper_base_en/perf.yaml rename qai_hub_models/models/{whisper_asr => whisper_base_en}/requirements.txt (100%) create mode 100644 qai_hub_models/models/whisper_base_en/test.py create mode 100644 qai_hub_models/models/whisper_small_en/README.md create mode 100644 qai_hub_models/models/whisper_small_en/__init__.py create mode 100644 qai_hub_models/models/whisper_small_en/conftest.py create mode 100644 
qai_hub_models/models/whisper_small_en/demo.py create mode 100644 qai_hub_models/models/whisper_small_en/export.py create mode 100644 qai_hub_models/models/whisper_small_en/info.yaml create mode 100644 qai_hub_models/models/whisper_small_en/model.py create mode 100644 qai_hub_models/models/whisper_small_en/perf.yaml create mode 100644 qai_hub_models/models/whisper_small_en/requirements.txt create mode 100644 qai_hub_models/models/whisper_small_en/test.py create mode 100644 qai_hub_models/models/whisper_small_multi/code-gen.yaml create mode 100644 qai_hub_models/models/whisper_small_multi/demo.py create mode 100644 qai_hub_models/models/whisper_tiny_en/README.md create mode 100644 qai_hub_models/models/whisper_tiny_en/__init__.py create mode 100644 qai_hub_models/models/whisper_tiny_en/conftest.py create mode 100644 qai_hub_models/models/whisper_tiny_en/demo.py create mode 100644 qai_hub_models/models/whisper_tiny_en/export.py create mode 100644 qai_hub_models/models/whisper_tiny_en/info.yaml create mode 100644 qai_hub_models/models/whisper_tiny_en/model.py create mode 100644 qai_hub_models/models/whisper_tiny_en/perf.yaml create mode 100644 qai_hub_models/models/whisper_tiny_en/requirements.txt create mode 100644 qai_hub_models/models/whisper_tiny_en/test.py create mode 100644 qai_hub_models/models/wideresnet50/conftest.py create mode 100644 qai_hub_models/models/wideresnet50_quantized/conftest.py create mode 100644 qai_hub_models/models/xlsr/conftest.py create mode 100644 qai_hub_models/models/xlsr_quantized/conftest.py create mode 100644 qai_hub_models/models/yolov6/conftest.py create mode 100644 qai_hub_models/models/yolov7/conftest.py create mode 100644 qai_hub_models/models/yolov8_det/conftest.py create mode 100644 qai_hub_models/models/yolov8_seg/conftest.py create mode 100644 qai_hub_models/utils/aimet/default_config_legacy_v1.json rename qai_hub_models/utils/aimet/{default_config_per_channel.json => default_config_legacy_v2.json} (100%) create mode 100644 qai_hub_models/utils/aimet/repo.py rename scripts/{ => examples}/quantize_ffnet.py (100%) create mode 100644 scripts/examples/test_numerics_imagenet_classifier_quantized.py delete mode 100644 scripts/examples/test_numerics_mobilenet_v2_quantized.py create mode 100755 scripts/github/create-aws-profile.sh diff --git a/.gitattributes b/.gitattributes index 3a241177..308063f2 100644 --- a/.gitattributes +++ b/.gitattributes @@ -1,2 +1,3 @@ *.jpg filter=lfs diff=lfs merge=lfs -text *.png filter=lfs diff=lfs merge=lfs -text +*.jar filter=lfs diff=lfs merge=lfs -text diff --git a/.gitignore b/.gitignore deleted file mode 100644 index 916d385a..00000000 --- a/.gitignore +++ /dev/null @@ -1,115 +0,0 @@ -# Byte-compiled / optimized / DLL files -__pycache__/ -*.py[cod] -*$py.class - -# Distribution / packaging -.Python -/bench/ -build/ -dist/ -demo_artifacts/ -develop-eggs/ -downloads/ -eggs/ -.eggs/ -parts/ -sdist/ -var/ -wheels/ -pip-wheel-metadata/ -share/python-wheels/ -*.egg-info/ -.installed.cfg -*.egg -MANIFEST - -# PyInstaller -# Usually these files are written by a python script from a template -# before PyInstaller builds the exe, so as to inject date/other infos into it. 
-*.manifest -*.spec - -# Installer logs -pip-log.txt -pip-delete-this-directory.txt - -# Unit test / coverage reports -htmlcov/ -.tox/ -.nox/ -.coverage -.coverage.* -.cache -nosetests.xml -coverage.xml -*.cover -*.py,cover -.hypothesis/ -.pytest_cache/ - -# Translations -*.mo -*.pot - -# PyBuilder -target/ - -# Jupyter Notebook -.ipynb_checkpoints - -# IPython -profile_default/ -ipython_config.py - -# pyenv -.python-version -qaihm-dev - -# pipenv -# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. -# However, in case of collaboration, if having platform-specific dependencies or dependencies -# having no cross-platform support, pipenv may install dependencies that don't work, or not -# install all needed dependencies. -#Pipfile.lock - -# PEP 582; used by e.g. github.com/David-OConnor/pyflow -__pypackages__/ - -# Environments -.venv -env/ -envs/ -venv/ -ENV/ -env.bak/ -venv.bak/ - -# mypy -.mypy_cache/ -.dmypy.json -dmypy.json - -# Pyre type checker -.pyre/ - -*build/* -*.onnx -*.mlmodelc* -*.pt -*.wav -*.npy -*.csv -*.dylib -*.fpie -*bin/ADSP_Inference_Test -*.DS_Store - -# Hub exports -**/*.mlmodel -**/*.tflite - -# Zoo Readmes (these are autogenerated) -qai_hub_models/**/README.md -# Hugging Face Model Cards (these are autogenerated) -qai_hub_models/**/HF_MODEL_CARD.md diff --git a/README.md b/README.md index 4d69627d..a6efef3b 100644 --- a/README.md +++ b/README.md @@ -10,7 +10,7 @@ memory etc.) and ready to deploy on Qualcomm® devices. * View open-source recipes to quantize, optimize, and deploy these models on-device. * Browse through [performance metrics](https://aihub.qualcomm.com/models) captured for these models on several devices. * Access the models through [Hugging Face](https://huggingface.co/qualcomm). -* [Sign up](https://aihub.qualcomm.com/) to run these models on hosted Qualcomm® devices. +* [Sign up](https://myaccount.qualcomm.com/signup) to run these models on hosted Qualcomm® devices. Supported runtimes * [TensorFlow Lite](https://www.tensorflow.org/lite) @@ -67,7 +67,7 @@ pip install "qai_hub_models[yolov7]" Each model comes with the following set of CLI demos: * Locally runnable PyTorch based CLI demo to validate the model off device. -* On-device CLI demo that produces a model ready for on-device deployment and runs the model on a hosted Qualcomm® device (needs [sign up](https://aihub.qualcomm.com/)). +* On-device CLI demo that produces a model ready for on-device deployment and runs the model on a hosted Qualcomm® device (needs [sign up](https://myaccount.qualcomm.com/signup)). All the models produced by these demos are freely available on [Hugging Face](https://huggingface.co/qualcomm) or through our @@ -129,7 +129,7 @@ Image.fromarray(pred_image).show() Qualcomm® device using [Qualcomm® AI Hub](https://aihub.qualcomm.com). To run the model on a hosted device, [sign up for access to Qualcomm® AI -Hub](https://aihub.qualcomm.com). Sign-in to Qualcomm® AI Hub with your +Hub](https://myaccount.qualcomm.com/signup). Sign-in to Qualcomm® AI Hub with your Qualcomm® ID. Once signed in navigate to Account -> Settings -> API Token. With this API token, you can configure your client to run models on the cloud @@ -242,6 +242,14 @@ python -m pytest --pyargs qai_hub_models.models.yolov7.test For any issues, please contact us at ai-hub-support@qti.qualcomm.com. + +--- + +### LICENSE + +Qualcomm® AI Hub Models is licensed under BSD-3. See the [LICENSE file](../LICENSE). 
+ + --- ## Model Directory @@ -252,93 +260,99 @@ For any issues, please contact us at ai-hub-support@qti.qualcomm.com. | -- | -- | -- | -- | -- | | | | | | **Image Classification** -| [MobileNet-v2-Quantized](https://aihub.qualcomm.com/models/mobilenet_v2_quantized) | [qai_hub_models.models.mobilenet_v2_quantized](qai_hub_models/models/mobilenet_v2_quantized/README.md) | ✔️ | ✔️ | ✔️ +| [VIT](https://aihub.qualcomm.com/models/vit) | [qai_hub_models.models.vit](qai_hub_models/models/vit/README.md) | ✔️ | ✔️ | ✔️ +| [Inception-v3-Quantized](https://aihub.qualcomm.com/models/inception_v3_quantized) | [qai_hub_models.models.inception_v3_quantized](qai_hub_models/models/inception_v3_quantized/README.md) | ✔️ | ✔️ | ✔️ +| [Inception-v3](https://aihub.qualcomm.com/models/inception_v3) | [qai_hub_models.models.inception_v3](qai_hub_models/models/inception_v3/README.md) | ✔️ | ✔️ | ✔️ +| [MobileNet-v3-Large](https://aihub.qualcomm.com/models/mobilenet_v3_large) | [qai_hub_models.models.mobilenet_v3_large](qai_hub_models/models/mobilenet_v3_large/README.md) | ✔️ | ✔️ | ✔️ | [GoogLeNet](https://aihub.qualcomm.com/models/googlenet) | [qai_hub_models.models.googlenet](qai_hub_models/models/googlenet/README.md) | ✔️ | ✔️ | ✔️ +| [ResNeXt101](https://aihub.qualcomm.com/models/resnext101) | [qai_hub_models.models.resnext101](qai_hub_models/models/resnext101/README.md) | ✔️ | ✔️ | ✔️ | [ResNet50](https://aihub.qualcomm.com/models/resnet50) | [qai_hub_models.models.resnet50](qai_hub_models/models/resnet50/README.md) | ✔️ | ✔️ | ✔️ -| [Swin-Small](https://aihub.qualcomm.com/models/swin_small) | [qai_hub_models.models.swin_small](qai_hub_models/models/swin_small/README.md) | ✔️ | ✔️ | ✔️ -| [Inception-v3Quantized](https://aihub.qualcomm.com/models/inception_v3_quantized) | [qai_hub_models.models.inception_v3_quantized](qai_hub_models/models/inception_v3_quantized/README.md) | ✔️ | ✔️ | ✔️ -| [MobileNet-v3-Small](https://aihub.qualcomm.com/models/mobilenet_v3_small) | [qai_hub_models.models.mobilenet_v3_small](qai_hub_models/models/mobilenet_v3_small/README.md) | ✔️ | ✔️ | ✔️ -| [GoogLeNetQuantized](https://aihub.qualcomm.com/models/googlenet_quantized) | [qai_hub_models.models.googlenet_quantized](qai_hub_models/models/googlenet_quantized/README.md) | ✔️ | ✔️ | ✔️ -| [RegNet](https://aihub.qualcomm.com/models/regnet) | [qai_hub_models.models.regnet](qai_hub_models/models/regnet/README.md) | ✔️ | ✔️ | ✔️ | [ResNeXt50](https://aihub.qualcomm.com/models/resnext50) | [qai_hub_models.models.resnext50](qai_hub_models/models/resnext50/README.md) | ✔️ | ✔️ | ✔️ -| [VIT](https://aihub.qualcomm.com/models/vit) | [qai_hub_models.models.vit](qai_hub_models/models/vit/README.md) | ✔️ | ✔️ | ✔️ -| [ResNet18Quantized](https://aihub.qualcomm.com/models/resnet18_quantized) | [qai_hub_models.models.resnet18_quantized](qai_hub_models/models/resnet18_quantized/README.md) | ✔️ | ✔️ | ✔️ -| [ResNet101](https://aihub.qualcomm.com/models/resnet101) | [qai_hub_models.models.resnet101](qai_hub_models/models/resnet101/README.md) | ✔️ | ✔️ | ✔️ -| [ResNeXt101](https://aihub.qualcomm.com/models/resnext101) | [qai_hub_models.models.resnext101](qai_hub_models/models/resnext101/README.md) | ✔️ | ✔️ | ✔️ -| [MobileNet-v2](https://aihub.qualcomm.com/models/mobilenet_v2) | [qai_hub_models.models.mobilenet_v2](qai_hub_models/models/mobilenet_v2/README.md) | ✔️ | ✔️ | ✔️ | [SqueezeNet-1_1](https://aihub.qualcomm.com/models/squeezenet1_1) | [qai_hub_models.models.squeezenet1_1](qai_hub_models/models/squeezenet1_1/README.md) | ✔️ | ✔️ | ✔️ -| 
[SqueezeNet-1_1Quantized](https://aihub.qualcomm.com/models/squeezenet1_1_quantized) | [qai_hub_models.models.squeezenet1_1_quantized](qai_hub_models/models/squeezenet1_1_quantized/README.md) | ✔️ | ✔️ | ✔️ -| [Inception-v3](https://aihub.qualcomm.com/models/inception_v3) | [qai_hub_models.models.inception_v3](qai_hub_models/models/inception_v3/README.md) | ✔️ | ✔️ | ✔️ -| [WideResNet50](https://aihub.qualcomm.com/models/wideresnet50) | [qai_hub_models.models.wideresnet50](qai_hub_models/models/wideresnet50/README.md) | ✔️ | ✔️ | ✔️ -| [ResNet101Quantized](https://aihub.qualcomm.com/models/resnet101_quantized) | [qai_hub_models.models.resnet101_quantized](qai_hub_models/models/resnet101_quantized/README.md) | ✔️ | ✔️ | ✔️ -| [MNASNet05](https://aihub.qualcomm.com/models/mnasnet05) | [qai_hub_models.models.mnasnet05](qai_hub_models/models/mnasnet05/README.md) | ✔️ | ✔️ | ✔️ -| [Swin-Base](https://aihub.qualcomm.com/models/swin_base) | [qai_hub_models.models.swin_base](qai_hub_models/models/swin_base/README.md) | ✔️ | ✔️ | ✔️ -| [DenseNet-121](https://aihub.qualcomm.com/models/densenet121) | [qai_hub_models.models.densenet121](qai_hub_models/models/densenet121/README.md) | ✔️ | ✔️ | ✔️ +| [ResNeXt101Quantized](https://aihub.qualcomm.com/models/resnext101_quantized) | [qai_hub_models.models.resnext101_quantized](qai_hub_models/models/resnext101_quantized/README.md) | ✔️ | ✔️ | ✔️ | [Shufflenet-v2Quantized](https://aihub.qualcomm.com/models/shufflenet_v2_quantized) | [qai_hub_models.models.shufflenet_v2_quantized](qai_hub_models/models/shufflenet_v2_quantized/README.md) | ✔️ | ✔️ | ✔️ | [Shufflenet-v2](https://aihub.qualcomm.com/models/shufflenet_v2) | [qai_hub_models.models.shufflenet_v2](qai_hub_models/models/shufflenet_v2/README.md) | ✔️ | ✔️ | ✔️ -| [ResNeXt101Quantized](https://aihub.qualcomm.com/models/resnext101_quantized) | [qai_hub_models.models.resnext101_quantized](qai_hub_models/models/resnext101_quantized/README.md) | ✔️ | ✔️ | ✔️ | [ResNet18](https://aihub.qualcomm.com/models/resnet18) | [qai_hub_models.models.resnet18](qai_hub_models/models/resnet18/README.md) | ✔️ | ✔️ | ✔️ +| [ResNeXt50Quantized](https://aihub.qualcomm.com/models/resnext50_quantized) | [qai_hub_models.models.resnext50_quantized](qai_hub_models/models/resnext50_quantized/README.md) | ✔️ | ✔️ | ✔️ +| [DenseNet-121](https://aihub.qualcomm.com/models/densenet121) | [qai_hub_models.models.densenet121](qai_hub_models/models/densenet121/README.md) | ✔️ | ✔️ | ✔️ +| [Swin-Base](https://aihub.qualcomm.com/models/swin_base) | [qai_hub_models.models.swin_base](qai_hub_models/models/swin_base/README.md) | ✔️ | ✔️ | ✔️ +| [ResNet101](https://aihub.qualcomm.com/models/resnet101) | [qai_hub_models.models.resnet101](qai_hub_models/models/resnet101/README.md) | ✔️ | ✔️ | ✔️ | [EfficientNet-B0](https://aihub.qualcomm.com/models/efficientnet_b0) | [qai_hub_models.models.efficientnet_b0](qai_hub_models/models/efficientnet_b0/README.md) | ✔️ | ✔️ | ✔️ -| [MobileNet-v3-Large](https://aihub.qualcomm.com/models/mobilenet_v3_large) | [qai_hub_models.models.mobilenet_v3_large](qai_hub_models/models/mobilenet_v3_large/README.md) | ✔️ | ✔️ | ✔️ +| [ResNet101Quantized](https://aihub.qualcomm.com/models/resnet101_quantized) | [qai_hub_models.models.resnet101_quantized](qai_hub_models/models/resnet101_quantized/README.md) | ✔️ | ✔️ | ✔️ | [WideResNet50-Quantized](https://aihub.qualcomm.com/models/wideresnet50_quantized) | [qai_hub_models.models.wideresnet50_quantized](qai_hub_models/models/wideresnet50_quantized/README.md) | ✔️ | ✔️ | ✔️ -| 
[ConvNext-Tiny](https://aihub.qualcomm.com/models/convnext_tiny) | [qai_hub_models.models.convnext_tiny](qai_hub_models/models/convnext_tiny/README.md) | ✔️ | ✔️ | ✔️ +| [SqueezeNet-1_1Quantized](https://aihub.qualcomm.com/models/squeezenet1_1_quantized) | [qai_hub_models.models.squeezenet1_1_quantized](qai_hub_models/models/squeezenet1_1_quantized/README.md) | ✔️ | ✔️ | ✔️ +| [MNASNet05](https://aihub.qualcomm.com/models/mnasnet05) | [qai_hub_models.models.mnasnet05](qai_hub_models/models/mnasnet05/README.md) | ✔️ | ✔️ | ✔️ +| [MobileNet-v3-Small](https://aihub.qualcomm.com/models/mobilenet_v3_small) | [qai_hub_models.models.mobilenet_v3_small](qai_hub_models/models/mobilenet_v3_small/README.md) | ✔️ | ✔️ | ✔️ +| [ResNet18Quantized](https://aihub.qualcomm.com/models/resnet18_quantized) | [qai_hub_models.models.resnet18_quantized](qai_hub_models/models/resnet18_quantized/README.md) | ✔️ | ✔️ | ✔️ +| [Swin-Small](https://aihub.qualcomm.com/models/swin_small) | [qai_hub_models.models.swin_small](qai_hub_models/models/swin_small/README.md) | ✔️ | ✔️ | ✔️ +| [GoogLeNetQuantized](https://aihub.qualcomm.com/models/googlenet_quantized) | [qai_hub_models.models.googlenet_quantized](qai_hub_models/models/googlenet_quantized/README.md) | ✔️ | ✔️ | ✔️ +| [MobileNet-v3-Large-Quantized](https://aihub.qualcomm.com/models/mobilenet_v3_large_quantized) | [qai_hub_models.models.mobilenet_v3_large_quantized](qai_hub_models/models/mobilenet_v3_large_quantized/README.md) | ✔️ | ✔️ | ✔️ +| [MobileNet-v2](https://aihub.qualcomm.com/models/mobilenet_v2) | [qai_hub_models.models.mobilenet_v2](qai_hub_models/models/mobilenet_v2/README.md) | ✔️ | ✔️ | ✔️ +| [WideResNet50](https://aihub.qualcomm.com/models/wideresnet50) | [qai_hub_models.models.wideresnet50](qai_hub_models/models/wideresnet50/README.md) | ✔️ | ✔️ | ✔️ +| [RegNet](https://aihub.qualcomm.com/models/regnet) | [qai_hub_models.models.regnet](qai_hub_models/models/regnet/README.md) | ✔️ | ✔️ | ✔️ | [Swin-Tiny](https://aihub.qualcomm.com/models/swin_tiny) | [qai_hub_models.models.swin_tiny](qai_hub_models/models/swin_tiny/README.md) | ✔️ | ✔️ | ✔️ +| [ConvNext-Tiny](https://aihub.qualcomm.com/models/convnext_tiny) | [qai_hub_models.models.convnext_tiny](qai_hub_models/models/convnext_tiny/README.md) | ✔️ | ✔️ | ✔️ +| [MobileNet-v2-Quantized](https://aihub.qualcomm.com/models/mobilenet_v2_quantized) | [qai_hub_models.models.mobilenet_v2_quantized](qai_hub_models/models/mobilenet_v2_quantized/README.md) | ✔️ | ✔️ | ✔️ | | | | | | **Image Editing** | [LaMa-Dilated](https://aihub.qualcomm.com/models/lama_dilated) | [qai_hub_models.models.lama_dilated](qai_hub_models/models/lama_dilated/README.md) | ✔️ | ✔️ | ✔️ +| [AOT-GAN](https://aihub.qualcomm.com/models/aotgan) | [qai_hub_models.models.aotgan](qai_hub_models/models/aotgan/README.md) | ✔️ | ✔️ | ✔️ | | | | | | **Image Generation** | [StyleGAN2](https://aihub.qualcomm.com/models/stylegan2) | [qai_hub_models.models.stylegan2](qai_hub_models/models/stylegan2/README.md) | ✔️ | ✔️ | ✔️ | | | | | | **Super Resolution** -| [QuickSRNetLarge](https://aihub.qualcomm.com/models/quicksrnetlarge) | [qai_hub_models.models.quicksrnetlarge](qai_hub_models/models/quicksrnetlarge/README.md) | ✔️ | ✔️ | ✔️ -| [ESRGAN](https://aihub.qualcomm.com/models/esrgan) | [qai_hub_models.models.esrgan](qai_hub_models/models/esrgan/README.md) | ✔️ | ✔️ | ✔️ -| [Real-ESRGAN-x4plus](https://aihub.qualcomm.com/models/real_esrgan_x4plus) | [qai_hub_models.models.real_esrgan_x4plus](qai_hub_models/models/real_esrgan_x4plus/README.md) | ✔️ | 
✔️ | ✔️ | [XLSR-Quantized](https://aihub.qualcomm.com/models/xlsr_quantized) | [qai_hub_models.models.xlsr_quantized](qai_hub_models/models/xlsr_quantized/README.md) | ✔️ | ✔️ | ✔️ -| [QuickSRNetMedium](https://aihub.qualcomm.com/models/quicksrnetmedium) | [qai_hub_models.models.quicksrnetmedium](qai_hub_models/models/quicksrnetmedium/README.md) | ✔️ | ✔️ | ✔️ +| [QuickSRNetLarge-Quantized](https://aihub.qualcomm.com/models/quicksrnetlarge_quantized) | [qai_hub_models.models.quicksrnetlarge_quantized](qai_hub_models/models/quicksrnetlarge_quantized/README.md) | ✔️ | ✔️ | ✔️ +| [QuickSRNetMedium-Quantized](https://aihub.qualcomm.com/models/quicksrnetmedium_quantized) | [qai_hub_models.models.quicksrnetmedium_quantized](qai_hub_models/models/quicksrnetmedium_quantized/README.md) | ✔️ | ✔️ | ✔️ +| [Real-ESRGAN-x4plus](https://aihub.qualcomm.com/models/real_esrgan_x4plus) | [qai_hub_models.models.real_esrgan_x4plus](qai_hub_models/models/real_esrgan_x4plus/README.md) | ✔️ | ✔️ | ✔️ | [Real-ESRGAN-General-x4v3](https://aihub.qualcomm.com/models/real_esrgan_general_x4v3) | [qai_hub_models.models.real_esrgan_general_x4v3](qai_hub_models/models/real_esrgan_general_x4v3/README.md) | ✔️ | ✔️ | ✔️ -| [SESR-M5-Quantized](https://aihub.qualcomm.com/models/sesr_m5_quantized) | [qai_hub_models.models.sesr_m5_quantized](qai_hub_models/models/sesr_m5_quantized/README.md) | ✔️ | ✔️ | ✔️ +| [QuickSRNetMedium](https://aihub.qualcomm.com/models/quicksrnetmedium) | [qai_hub_models.models.quicksrnetmedium](qai_hub_models/models/quicksrnetmedium/README.md) | ✔️ | ✔️ | ✔️ +| [ESRGAN](https://aihub.qualcomm.com/models/esrgan) | [qai_hub_models.models.esrgan](qai_hub_models/models/esrgan/README.md) | ✔️ | ✔️ | ✔️ | [QuickSRNetSmall](https://aihub.qualcomm.com/models/quicksrnetsmall) | [qai_hub_models.models.quicksrnetsmall](qai_hub_models/models/quicksrnetsmall/README.md) | ✔️ | ✔️ | ✔️ | [SESR-M5](https://aihub.qualcomm.com/models/sesr_m5) | [qai_hub_models.models.sesr_m5](qai_hub_models/models/sesr_m5/README.md) | ✔️ | ✔️ | ✔️ +| [SESR-M5-Quantized](https://aihub.qualcomm.com/models/sesr_m5_quantized) | [qai_hub_models.models.sesr_m5_quantized](qai_hub_models/models/sesr_m5_quantized/README.md) | ✔️ | ✔️ | ✔️ +| [QuickSRNetSmall-Quantized](https://aihub.qualcomm.com/models/quicksrnetsmall_quantized) | [qai_hub_models.models.quicksrnetsmall_quantized](qai_hub_models/models/quicksrnetsmall_quantized/README.md) | ✔️ | ✔️ | ✔️ | [XLSR](https://aihub.qualcomm.com/models/xlsr) | [qai_hub_models.models.xlsr](qai_hub_models/models/xlsr/README.md) | ✔️ | ✔️ | ✔️ +| [QuickSRNetLarge](https://aihub.qualcomm.com/models/quicksrnetlarge) | [qai_hub_models.models.quicksrnetlarge](qai_hub_models/models/quicksrnetlarge/README.md) | ✔️ | ✔️ | ✔️ | | | | | | **Semantic Segmentation** -| [Yolo-v8-Segmentation](https://aihub.qualcomm.com/models/yolov8_seg) | [qai_hub_models.models.yolov8_seg](qai_hub_models/models/yolov8_seg/README.md) | ✔️ | ✔️ | ✔️ -| [SINet](https://aihub.qualcomm.com/models/sinet) | [qai_hub_models.models.sinet](qai_hub_models/models/sinet/README.md) | ✔️ | ✔️ | ✔️ -| [Unet-Segmentation](https://aihub.qualcomm.com/models/unet_segmentation) | [qai_hub_models.models.unet_segmentation](qai_hub_models/models/unet_segmentation/README.md) | ✔️ | ✔️ | ✔️ -| [FCN_ResNet50](https://aihub.qualcomm.com/models/fcn_resnet50) | [qai_hub_models.models.fcn_resnet50](qai_hub_models/models/fcn_resnet50/README.md) | ✔️ | ✔️ | ✔️ -| [DDRNet23-Slim](https://aihub.qualcomm.com/models/ddrnet23_slim) | 
[qai_hub_models.models.ddrnet23_slim](qai_hub_models/models/ddrnet23_slim/README.md) | ✔️ | ✔️ | ✔️ -| [FastSam-S](https://aihub.qualcomm.com/models/fastsam_s) | [qai_hub_models.models.fastsam_s](qai_hub_models/models/fastsam_s/README.md) | ✔️ | ✔️ | ✔️ -| [FFNet-122NS-LowRes](https://aihub.qualcomm.com/models/ffnet_122ns_lowres) | [qai_hub_models.models.ffnet_122ns_lowres](qai_hub_models/models/ffnet_122ns_lowres/README.md) | ✔️ | ✔️ | ✔️ -| [FFNet-78S-Quantized](https://aihub.qualcomm.com/models/ffnet_78s_quantized) | [qai_hub_models.models.ffnet_78s_quantized](qai_hub_models/models/ffnet_78s_quantized/README.md) | ✔️ | ✔️ | ✔️ +| [FFNet-54S-Quantized](https://aihub.qualcomm.com/models/ffnet_54s_quantized) | [qai_hub_models.models.ffnet_54s_quantized](qai_hub_models/models/ffnet_54s_quantized/README.md) | ✔️ | ✔️ | ✔️ | [FFNet-40S-Quantized](https://aihub.qualcomm.com/models/ffnet_40s_quantized) | [qai_hub_models.models.ffnet_40s_quantized](qai_hub_models/models/ffnet_40s_quantized/README.md) | ✔️ | ✔️ | ✔️ +| [FCN_ResNet50](https://aihub.qualcomm.com/models/fcn_resnet50) | [qai_hub_models.models.fcn_resnet50](qai_hub_models/models/fcn_resnet50/README.md) | ✔️ | ✔️ | ✔️ +| [FastSam-X](https://aihub.qualcomm.com/models/fastsam_x) | [qai_hub_models.models.fastsam_x](qai_hub_models/models/fastsam_x/README.md) | ✔️ | ✔️ | ✔️ | [MediaPipe-Selfie-Segmentation](https://aihub.qualcomm.com/models/mediapipe_selfie) | [qai_hub_models.models.mediapipe_selfie](qai_hub_models/models/mediapipe_selfie/README.md) | ✔️ | ✔️ | ✔️ +| [Segment-Anything-Model](https://aihub.qualcomm.com/models/sam) | [qai_hub_models.models.sam](qai_hub_models/models/sam/README.md) | ✔️ | ✔️ | ✔️ +| [Unet-Segmentation](https://aihub.qualcomm.com/models/unet_segmentation) | [qai_hub_models.models.unet_segmentation](qai_hub_models/models/unet_segmentation/README.md) | ✔️ | ✔️ | ✔️ +| [FFNet-40S](https://aihub.qualcomm.com/models/ffnet_40s) | [qai_hub_models.models.ffnet_40s](qai_hub_models/models/ffnet_40s/README.md) | ✔️ | ✔️ | ✔️ +| [DDRNet23-Slim](https://aihub.qualcomm.com/models/ddrnet23_slim) | [qai_hub_models.models.ddrnet23_slim](qai_hub_models/models/ddrnet23_slim/README.md) | ✔️ | ✔️ | ✔️ | [DeepLabV3-ResNet50](https://aihub.qualcomm.com/models/deeplabv3_resnet50) | [qai_hub_models.models.deeplabv3_resnet50](qai_hub_models/models/deeplabv3_resnet50/README.md) | ✔️ | ✔️ | ✔️ -| [FastSam-X](https://aihub.qualcomm.com/models/fastsam_x) | [qai_hub_models.models.fastsam_x](qai_hub_models/models/fastsam_x/README.md) | ✔️ | ✔️ | ✔️ +| [SINet](https://aihub.qualcomm.com/models/sinet) | [qai_hub_models.models.sinet](qai_hub_models/models/sinet/README.md) | ✔️ | ✔️ | ✔️ +| [FFNet-78S-Quantized](https://aihub.qualcomm.com/models/ffnet_78s_quantized) | [qai_hub_models.models.ffnet_78s_quantized](qai_hub_models/models/ffnet_78s_quantized/README.md) | ✔️ | ✔️ | ✔️ | [FFNet-54S](https://aihub.qualcomm.com/models/ffnet_54s) | [qai_hub_models.models.ffnet_54s](qai_hub_models/models/ffnet_54s/README.md) | ✔️ | ✔️ | ✔️ +| [FFNet-122NS-LowRes](https://aihub.qualcomm.com/models/ffnet_122ns_lowres) | [qai_hub_models.models.ffnet_122ns_lowres](qai_hub_models/models/ffnet_122ns_lowres/README.md) | ✔️ | ✔️ | ✔️ | [FFNet-78S-LowRes](https://aihub.qualcomm.com/models/ffnet_78s_lowres) | [qai_hub_models.models.ffnet_78s_lowres](qai_hub_models/models/ffnet_78s_lowres/README.md) | ✔️ | ✔️ | ✔️ -| [Segment-Anything-Model](https://aihub.qualcomm.com/models/sam) | [qai_hub_models.models.sam](qai_hub_models/models/sam/README.md) | ✔️ | ✔️ | ✔️ +| 
[YOLOv8-Segmentation](https://aihub.qualcomm.com/models/yolov8_seg) | [qai_hub_models.models.yolov8_seg](qai_hub_models/models/yolov8_seg/README.md) | ✔️ | ✔️ | ✔️ | [FFNet-78S](https://aihub.qualcomm.com/models/ffnet_78s) | [qai_hub_models.models.ffnet_78s](qai_hub_models/models/ffnet_78s/README.md) | ✔️ | ✔️ | ✔️ -| [FFNet-40S](https://aihub.qualcomm.com/models/ffnet_40s) | [qai_hub_models.models.ffnet_40s](qai_hub_models/models/ffnet_40s/README.md) | ✔️ | ✔️ | ✔️ -| [FFNet-54S-Quantized](https://aihub.qualcomm.com/models/ffnet_54s_quantized) | [qai_hub_models.models.ffnet_54s_quantized](qai_hub_models/models/ffnet_54s_quantized/README.md) | ✔️ | ✔️ | ✔️ +| [FastSam-S](https://aihub.qualcomm.com/models/fastsam_s) | [qai_hub_models.models.fastsam_s](qai_hub_models/models/fastsam_s/README.md) | ✔️ | ✔️ | ✔️ | | | | | | **Object Detection** -| [MediaPipe-Hand-Detection](https://aihub.qualcomm.com/models/mediapipe_hand) | [qai_hub_models.models.mediapipe_hand](qai_hub_models/models/mediapipe_hand/README.md) | ✔️ | ✔️ | ✔️ -| [DETR-ResNet50-DC5](https://aihub.qualcomm.com/models/detr_resnet50_dc5) | [qai_hub_models.models.detr_resnet50_dc5](qai_hub_models/models/detr_resnet50_dc5/README.md) | ✔️ | ✔️ | ✔️ -| [DETR-ResNet101-DC5](https://aihub.qualcomm.com/models/detr_resnet101_dc5) | [qai_hub_models.models.detr_resnet101_dc5](qai_hub_models/models/detr_resnet101_dc5/README.md) | ✔️ | ✔️ | ✔️ -| [Yolo-v8-Detection](https://aihub.qualcomm.com/models/yolov8_det) | [qai_hub_models.models.yolov8_det](qai_hub_models/models/yolov8_det/README.md) | ✔️ | ✔️ | ✔️ | [DETR-ResNet101](https://aihub.qualcomm.com/models/detr_resnet101) | [qai_hub_models.models.detr_resnet101](qai_hub_models/models/detr_resnet101/README.md) | ✔️ | ✔️ | ✔️ -| [DETR-ResNet50](https://aihub.qualcomm.com/models/detr_resnet50) | [qai_hub_models.models.detr_resnet50](qai_hub_models/models/detr_resnet50/README.md) | ✔️ | ✔️ | ✔️ +| [MediaPipe-Face-Detection](https://aihub.qualcomm.com/models/mediapipe_face) | [qai_hub_models.models.mediapipe_face](qai_hub_models/models/mediapipe_face/README.md) | ✔️ | ✔️ | ✔️ +| [DETR-ResNet50-DC5](https://aihub.qualcomm.com/models/detr_resnet50_dc5) | [qai_hub_models.models.detr_resnet50_dc5](qai_hub_models/models/detr_resnet50_dc5/README.md) | ✔️ | ✔️ | ✔️ | [Yolo-v7](https://aihub.qualcomm.com/models/yolov7) | [qai_hub_models.models.yolov7](qai_hub_models/models/yolov7/README.md) | ✔️ | ✔️ | ✔️ +| [YOLOv8-Detection](https://aihub.qualcomm.com/models/yolov8_det) | [qai_hub_models.models.yolov8_det](qai_hub_models/models/yolov8_det/README.md) | ✔️ | ✔️ | ✔️ | [Yolo-v6](https://aihub.qualcomm.com/models/yolov6) | [qai_hub_models.models.yolov6](qai_hub_models/models/yolov6/README.md) | ✔️ | ✔️ | ✔️ -| [MediaPipe-Face-Detection](https://aihub.qualcomm.com/models/mediapipe_face) | [qai_hub_models.models.mediapipe_face](qai_hub_models/models/mediapipe_face/README.md) | ✔️ | ✔️ | ✔️ +| [DETR-ResNet101-DC5](https://aihub.qualcomm.com/models/detr_resnet101_dc5) | [qai_hub_models.models.detr_resnet101_dc5](qai_hub_models/models/detr_resnet101_dc5/README.md) | ✔️ | ✔️ | ✔️ +| [DETR-ResNet50](https://aihub.qualcomm.com/models/detr_resnet50) | [qai_hub_models.models.detr_resnet50](qai_hub_models/models/detr_resnet50/README.md) | ✔️ | ✔️ | ✔️ +| [MediaPipe-Hand-Detection](https://aihub.qualcomm.com/models/mediapipe_hand) | [qai_hub_models.models.mediapipe_hand](qai_hub_models/models/mediapipe_hand/README.md) | ✔️ | ✔️ | ✔️ | | | | | | **Pose Estimation** -| 
[HRNetPoseQuantized](https://aihub.qualcomm.com/models/hrnet_pose_quantized) | [qai_hub_models.models.hrnet_pose_quantized](qai_hub_models/models/hrnet_pose_quantized/README.md) | ✔️ | ✔️ | ✔️ | [MediaPipe-Pose-Estimation](https://aihub.qualcomm.com/models/mediapipe_pose) | [qai_hub_models.models.mediapipe_pose](qai_hub_models/models/mediapipe_pose/README.md) | ✔️ | ✔️ | ✔️ +| [OpenPose](https://aihub.qualcomm.com/models/openpose) | [qai_hub_models.models.openpose](qai_hub_models/models/openpose/README.md) | ✔️ | ✔️ | ✔️ | [LiteHRNet](https://aihub.qualcomm.com/models/litehrnet) | [qai_hub_models.models.litehrnet](qai_hub_models/models/litehrnet/README.md) | ✔️ | ✔️ | ✔️ | [HRNetPose](https://aihub.qualcomm.com/models/hrnet_pose) | [qai_hub_models.models.hrnet_pose](qai_hub_models/models/hrnet_pose/README.md) | ✔️ | ✔️ | ✔️ -| [OpenPose](https://aihub.qualcomm.com/models/openpose) | [qai_hub_models.models.openpose](qai_hub_models/models/openpose/README.md) | ✔️ | ✔️ | ✔️ +| [HRNetPoseQuantized](https://aihub.qualcomm.com/models/hrnet_pose_quantized) | [qai_hub_models.models.hrnet_pose_quantized](qai_hub_models/models/hrnet_pose_quantized/README.md) | ✔️ | ✔️ | ✔️ ### Audio @@ -346,7 +360,9 @@ For any issues, please contact us at ai-hub-support@qti.qualcomm.com. | -- | -- | -- | -- | -- | | | | | | **Speech Recognition** -| [Whisper-Base](https://aihub.qualcomm.com/models/whisper_asr) | [qai_hub_models.models.whisper_asr](qai_hub_models/models/whisper_asr/README.md) | ✔️ | ✔️ | ✔️ +| [Whisper-Small-En](https://aihub.qualcomm.com/models/whisper_small_en) | [qai_hub_models.models.whisper_small_en](qai_hub_models/models/whisper_small_en/README.md) | ✔️ | ✔️ | ✔️ +| [Whisper-Tiny-En](https://aihub.qualcomm.com/models/whisper_tiny_en) | [qai_hub_models.models.whisper_tiny_en](qai_hub_models/models/whisper_tiny_en/README.md) | ✔️ | ✔️ | ✔️ +| [Whisper-Base-En](https://aihub.qualcomm.com/models/whisper_base_en) | [qai_hub_models.models.whisper_base_en](qai_hub_models/models/whisper_base_en/README.md) | ✔️ | ✔️ | ✔️ | [HuggingFace-WavLM-Base-Plus](https://aihub.qualcomm.com/models/huggingface_wavlm_base_plus) | [qai_hub_models.models.huggingface_wavlm_base_plus](qai_hub_models/models/huggingface_wavlm_base_plus/README.md) | ✔️ | ✔️ | ✔️ | | | | | | **Audio Enhancement** @@ -366,8 +382,8 @@ For any issues, please contact us at ai-hub-support@qti.qualcomm.com. | -- | -- | -- | -- | -- | | | | | | **Image Generation** -| [Stable-Diffusion](https://aihub.qualcomm.com/models/stable_diffusion_quantized) | [qai_hub_models.models.stable_diffusion_quantized](qai_hub_models/models/stable_diffusion_quantized/README.md) | ✔️ | ✔️ | ✔️ | [ControlNet](https://aihub.qualcomm.com/models/controlnet_quantized) | [qai_hub_models.models.controlnet_quantized](qai_hub_models/models/controlnet_quantized/README.md) | ✔️ | ✔️ | ✔️ +| [Stable-Diffusion](https://aihub.qualcomm.com/models/stable_diffusion_quantized) | [qai_hub_models.models.stable_diffusion_quantized](qai_hub_models/models/stable_diffusion_quantized/README.md) | ✔️ | ✔️ | ✔️ | | | | | | **Text Generation** | [Llama-v2-7B-Chat](https://aihub.qualcomm.com/models/llama_v2_7b_chat_quantized) | [qai_hub_models.models.llama_v2_7b_chat_quantized](qai_hub_models/models/llama_v2_7b_chat_quantized/README.md) | ✔️ | ✔️ | ✔️ diff --git a/apps/android/ImageClassification/README.md b/apps/android/ImageClassification/README.md new file mode 100644 index 00000000..84a83be4 --- /dev/null +++ b/apps/android/ImageClassification/README.md @@ -0,0 +1,72 @@ +### Requirements + +1. 
Java, android-sdk and sdkmanager are already set up at the user's end +2. User should have the Linux QNN SDK on their local machine. + + +## Info +Right now we use the mobilenet_v3_small.tflite model, which takes a 224x224 image as input and gives an array of 1000 class scores as output. You can replace it with any tflite classification model, but you have to change the pre-processing, post-processing and dimensions in the app code based on the model parameters. + + +## Preprocessing + + +``` + for (int x = 0; x < input_dims1; x++) { + for (int y = 0; y < input_dims2; y++) { + int pixel = inputBitmap.getPixel(x, y); + List<Float> rgb = Arrays.asList((float)Color.red(pixel), (float)Color.green(pixel), (float)Color.blue(pixel)); + for(int z = 0;z<3; z++){ + floatinputarray[0][z][x][y] = (float)((rgb.get(z))-ImageMean.get(z))/ImageStd.get(z); + } + } + } +``` + + +## PostProcessing + + +``` + public static List<Integer> findTop3Indices(float[] arr) { + List<Integer> topIndices = new ArrayList<>(); + + for (int i = 0; i < 3; i++) { + int maxIndex = 0; + float maxValue = arr[0]; + + for (int j = 1; j < arr.length; j++) { + if (arr[j] > maxValue && !topIndices.contains(j)) { + maxValue = arr[j]; + maxIndex = j; + } + } + + topIndices.add(maxIndex); + } + + return topIndices; + } +``` + +### Build App: + +You have to run build_apk.py for Image Classification. It will generate classification-debug.apk and install it on the connected device. + + + build_apk.py [-h] -q QNNSDK (-m MODEL_PATH | -e MODEL_NAME) + + + +### Example + +Here, with -m, give your tflite model path, i.e. the full path to the `*.tflite` file, and it will copy the model file to the assets folder to build the Android app. +``` + python build_apk.py -q "<QNN_SDK_PATH>" -m "Path\to\TFLITE\Model" +``` + +Also, you can use the AI-Hub model name as mentioned in the models directory, to directly export the model from AI-Hub and copy it to the app assets. + +``` + python build_apk.py -q "<QNN_SDK_PATH>" -e <model_name> +``` diff --git a/apps/android/ImageClassification/build.gradle b/apps/android/ImageClassification/build.gradle new file mode 100644 index 00000000..798f7515 --- /dev/null +++ b/apps/android/ImageClassification/build.gradle @@ -0,0 +1,10 @@ + +// Top-level build file where you can add configuration options common to all sub-projects/modules. +plugins { + id 'com.android.application' version '7.2.1' apply false + id 'com.android.library' version '7.2.1' apply false +} + +task clean(type: Delete) { + delete rootProject.buildDir +} diff --git a/apps/android/ImageClassification/build_apk.py b/apps/android/ImageClassification/build_apk.py new file mode 100644 index 00000000..66e6cd0d --- /dev/null +++ b/apps/android/ImageClassification/build_apk.py @@ -0,0 +1,163 @@ +# --------------------------------------------------------------------- +# Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. +# SPDX-License-Identifier: BSD-3-Clause +# --------------------------------------------------------------------- +import argparse +import glob +import os +import shutil +import subprocess +import sys +from enum import Enum + + +class MODELNAME(Enum): + mobilenet_v3_large = 1 + resnet50 = 2 + resnext50 = 3 + inception_v3 = 4 + + +def printmenu(): + print("*****************************") + print("* TYPE OF MODEL *") + print("*****************************") + for m in MODELNAME: + print(str(m.value) + ". 
" + m.name) + print("*****************************") + + +## Initialize parser +parser = argparse.ArgumentParser() +parser.add_argument("-q", "--qnnsdk", required=True, help="Give path of QNN SDK") + +parser.add_argument("-m", "--model_name", type=str, help="Model Name") + + +# group = parser.add_mutually_exclusive_group() +# group.add_argument('-stopdownload', '--stopdownload', action = "store_true", help = "Do NOT Download Model from AI HUB") +parser.add_argument("-path", "--model_path", type=str, help="TFLITE model file") + +args = parser.parse_args() + + +##based on this pre-post can be decided +if not args.model_name: + printmenu() + inp_model_name = int(input("Please select one: ")) + args.model_name = MODELNAME(inp_model_name).name + + +destAsset = os.path.join(".", "classification", "src", "main", "assets") +if not os.path.exists(destAsset): + os.makedirs(destAsset) + + +## MODEL PATH NOT MENTIONED, add information into model_path +if not args.model_path: + exportstatus = input("Do you want us to download the model from AI hub (y/n)") + + ##DOWNLAOD USING EXPORT.PY + if exportstatus.lower().startswith("y"): + print("EXPORT form path") + pathtomodel = os.path.join( + "..", + "..", + "..", + "", + "qai_hub_models", + "models", + args.model_name, + "export.py", + ) + if not os.path.exists(pathtomodel): + print("PATH DO NOT EXIST: " + pathtomodel) + exit() + subprocess.run(["python", pathtomodel, "--skip-inferencing"]) + tflite_file = glob.glob( + "build" + os.sep + args.model_name + os.sep + "*.tflite", recursive=True + ) + args.model_path = tflite_file[0] + # shutil.copy(tflite_file[0], destAsset+os.sep+"superresmodel.tflite") + + ##GET USER TO GIVE PATH + else: + args.model_path = input("Give model File as input") + # if not os.path.exists(tflite_file): + # print("PATH DO NOT EXIST: "+tflite_file) + # exit() + # shutil.copy(tflite_file, destAsset+os.sep+"superresmodel.tflite") + + +if args.model_path: + print(args.model_path) + if not os.path.exists(args.model_path): + print("PATH DO NOT EXIST: " + args.model_path) + exit() + shutil.copy(args.model_path, destAsset + os.sep + "classification.tflite") + + +## COPYING REQUIRED FILES FROM QNN SDK +destJNI = os.path.join(".", "classification", "src", "main", "jniLibs", "arm64-v8a") +if not os.path.exists(destJNI): + os.makedirs(destJNI) + +# copy *.so from $qnn_sdk/libs/aarch64-android to $jni_lib_dir +qnnbasiclibs = os.path.join(args.qnnsdk, "lib", "aarch64-android") +shutil.copytree(qnnbasiclibs, destJNI, dirs_exist_ok=True) + +# copy $qnn_sdk/lib/hexagon-v**/unsigned/libQnnHtpV**Skel.so to $jni_lib_dir +skelstubfiles = os.path.join(args.qnnsdk, "lib", "hexagon-v**", "unsigned", "*.so") +for file in glob.glob(skelstubfiles): + shutil.copy(file, destJNI) + +# copy qtld-release.aar to $test_app_root/Application/ +destaar = os.path.join(".", "classification", "libs") +if not os.path.exists(destaar): + os.makedirs(destaar) +aarfile = os.path.join(args.qnnsdk, "lib", "android", "qtld-release.aar") +shutil.copy(aarfile, destaar) + + +## BUILDING APK +if sys.platform.startswith("win"): + print("Detected platform is windows") + gradleoutput = subprocess.run(["gradlew.bat", "assembleDebug"], cwd=".") +elif sys.platform.startswith("darwin"): + print("Detected platform is MAC") + gradleoutput = subprocess.run(["./gradlew", "assembleDebug"], cwd=".") +else: + print("Detected platform is Linux") + gradleoutput = subprocess.run(["./gradlew", "assembleDebug"], cwd=".") + + +## COPYING APK TO CWD +ApkPath = os.path.join( + os.getcwd(), + 
"classification", + "build", + "outputs", + "apk", + "debug", + "classification-debug.apk", +) +print("APK Is copied at current Working Directory") +shutil.copy(ApkPath, ".") + + +install_perm = input("Do you want to install this apk in connected device") +## INSTALLING AND RUNNING APK +if install_perm.lower().startswith("y"): + command_to_install = ["adb", "install", "classification-debug.apk"] + subprocess.run(command_to_install, cwd=".") + command_to_run = [ + "adb", + "shell", + "am", + "start", + "-a", + "com.example.ACTION_NAME", + "-n", + "com.qcom.imagesuperres/com.qcom.imagesuperres.QNNActivity", + ] + subprocess.run(command_to_run, cwd=".") diff --git a/apps/android/ImageClassification/classification/build.gradle b/apps/android/ImageClassification/classification/build.gradle new file mode 100644 index 00000000..5f289662 --- /dev/null +++ b/apps/android/ImageClassification/classification/build.gradle @@ -0,0 +1,63 @@ + +plugins { + id 'com.android.application' +} + + + +android { + compileSdk 32 + + defaultConfig { + applicationId "com.qcom.imageclassification" + minSdk 26 + targetSdk 32 + versionCode 1 + versionName "1.0" + + testInstrumentationRunner "androidx.test.runner.AndroidJUnitRunner" + externalNativeBuild { + cmake { + cppFlags '' + } + } + + } + aaptOptions { + noCompress "tflite" + } + + buildTypes { + release { + minifyEnabled false + proguardFiles getDefaultProguardFile('proguard-android-optimize.txt'), 'proguard-rules.pro' + } + } + compileOptions { + sourceCompatibility JavaVersion.VERSION_1_8 + targetCompatibility JavaVersion.VERSION_1_8 + } + + packagingOptions + { + doNotStrip "**/*.so" + } +} +project.ext.LIB_DIR = projectDir.toString() + '/libs/' +project.ext.ASSET_DIR = projectDir.toString() + '/src/main/assets' + +dependencies { + + implementation 'androidx.appcompat:appcompat:1.4.2' + implementation 'com.google.android.material:material:1.6.1' + implementation 'androidx.constraintlayout:constraintlayout:2.1.4' + testImplementation 'junit:junit:4.13.2' + androidTestImplementation 'androidx.test.ext:junit:1.1.3' + androidTestImplementation 'androidx.test.espresso:espresso-core:3.4.0' + implementation 'org.tensorflow:tensorflow-lite:2.13.0' + implementation 'org.tensorflow:tensorflow-lite-select-tf-ops:2.9.0' + implementation 'org.tensorflow:tensorflow-lite-support:0.4.3' + implementation fileTree(dir: "libs", include: ["*.aar"]) + implementation files('libs/qtld-release') + +} diff --git a/apps/android/ImageClassification/classification/proguard-rules.pro b/apps/android/ImageClassification/classification/proguard-rules.pro new file mode 100644 index 00000000..36e00091 --- /dev/null +++ b/apps/android/ImageClassification/classification/proguard-rules.pro @@ -0,0 +1,21 @@ +# Add project specific ProGuard rules here. +# You can control the set of applied configuration files using the +# proguardFiles setting in build.gradle. +# +# For more details, see +# http://developer.android.com/guide/developing/tools/proguard.html + +# If your project uses WebView with JS, uncomment the following +# and specify the fully qualified class name to the JavaScript interface +# class: +#-keepclassmembers class fqcn.of.javascript.interface.for.webview { +# public *; +#} + +# Uncomment this to preserve the line number information for +# debugging stack traces. +#-keepattributes SourceFile,LineNumberTable + +# If you keep the line number information, uncomment this to +# hide the original source file name. 
+#-renamesourcefileattribute SourceFile diff --git a/apps/android/ImageClassification/classification/src/main/AndroidManifest.xml b/apps/android/ImageClassification/classification/src/main/AndroidManifest.xml new file mode 100644 index 00000000..598795bf --- /dev/null +++ b/apps/android/ImageClassification/classification/src/main/AndroidManifest.xml @@ -0,0 +1,37 @@ + + + + + + + + + + + + + + + + + + + + diff --git a/apps/android/ImageClassification/classification/src/main/assets/Sample1.png b/apps/android/ImageClassification/classification/src/main/assets/Sample1.png new file mode 100644 index 00000000..24969fbd --- /dev/null +++ b/apps/android/ImageClassification/classification/src/main/assets/Sample1.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8649d7ebb60c7e52ef496739616111a1ecd9797e7ecec3e1881504449a648077 +size 3155242 diff --git a/apps/android/ImageClassification/classification/src/main/assets/Sample2.png b/apps/android/ImageClassification/classification/src/main/assets/Sample2.png new file mode 100644 index 00000000..eed3f738 --- /dev/null +++ b/apps/android/ImageClassification/classification/src/main/assets/Sample2.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6d4a1bdf3ca10aa7d7c03fdceccda5ef3f6811bcbe3cb0968a8d5c606e572529 +size 1284181 diff --git a/apps/android/ImageClassification/classification/src/main/assets/Sample3.png b/apps/android/ImageClassification/classification/src/main/assets/Sample3.png new file mode 100644 index 00000000..319ca063 --- /dev/null +++ b/apps/android/ImageClassification/classification/src/main/assets/Sample3.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6c18dc2c8ef40fddddf2a68367ba2a434427518beae80f83777036d08e9c04e1 +size 1335289 diff --git a/apps/android/ImageClassification/classification/src/main/assets/Sample4.png b/apps/android/ImageClassification/classification/src/main/assets/Sample4.png new file mode 100644 index 00000000..24969fbd --- /dev/null +++ b/apps/android/ImageClassification/classification/src/main/assets/Sample4.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8649d7ebb60c7e52ef496739616111a1ecd9797e7ecec3e1881504449a648077 +size 3155242 diff --git a/apps/android/ImageClassification/classification/src/main/assets/Sample5.png b/apps/android/ImageClassification/classification/src/main/assets/Sample5.png new file mode 100644 index 00000000..eed3f738 --- /dev/null +++ b/apps/android/ImageClassification/classification/src/main/assets/Sample5.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6d4a1bdf3ca10aa7d7c03fdceccda5ef3f6811bcbe3cb0968a8d5c606e572529 +size 1284181 diff --git a/apps/android/ImageClassification/classification/src/main/assets/labels.txt b/apps/android/ImageClassification/classification/src/main/assets/labels.txt new file mode 100644 index 00000000..a85dcd85 --- /dev/null +++ b/apps/android/ImageClassification/classification/src/main/assets/labels.txt @@ -0,0 +1,1001 @@ +background +tench +goldfish +great white shark +tiger shark +hammerhead +electric ray +stingray +cock +hen +ostrich +brambling +goldfinch +house finch +junco +indigo bunting +robin +bulbul +jay +magpie +chickadee +water ouzel +kite +bald eagle +vulture +great grey owl +European fire salamander +common newt +eft +spotted salamander +axolotl +bullfrog +tree frog +tailed frog +loggerhead +leatherback turtle +mud turtle +terrapin +box turtle +banded gecko +common iguana +American chameleon +whiptail +agama +frilled 
lizard +alligator lizard +Gila monster +green lizard +African chameleon +Komodo dragon +African crocodile +American alligator +triceratops +thunder snake +ringneck snake +hognose snake +green snake +king snake +garter snake +water snake +vine snake +night snake +boa constrictor +rock python +Indian cobra +green mamba +sea snake +horned viper +diamondback +sidewinder +trilobite +harvestman +scorpion +black and gold garden spider +barn spider +garden spider +black widow +tarantula +wolf spider +tick +centipede +black grouse +ptarmigan +ruffed grouse +prairie chicken +peacock +quail +partridge +African grey +macaw +sulphur-crested cockatoo +lorikeet +coucal +bee eater +hornbill +hummingbird +jacamar +toucan +drake +red-breasted merganser +goose +black swan +tusker +echidna +platypus +wallaby +koala +wombat +jellyfish +sea anemone +brain coral +flatworm +nematode +conch +snail +slug +sea slug +chiton +chambered nautilus +Dungeness crab +rock crab +fiddler crab +king crab +American lobster +spiny lobster +crayfish +hermit crab +isopod +white stork +black stork +spoonbill +flamingo +little blue heron +American egret +bittern +crane +limpkin +European gallinule +American coot +bustard +ruddy turnstone +red-backed sandpiper +redshank +dowitcher +oystercatcher +pelican +king penguin +albatross +grey whale +killer whale +dugong +sea lion +Chihuahua +Japanese spaniel +Maltese dog +Pekinese +Shih-Tzu +Blenheim spaniel +papillon +toy terrier +Rhodesian ridgeback +Afghan hound +basset +beagle +bloodhound +bluetick +black-and-tan coonhound +Walker hound +English foxhound +redbone +borzoi +Irish wolfhound +Italian greyhound +whippet +Ibizan hound +Norwegian elkhound +otterhound +Saluki +Scottish deerhound +Weimaraner +Staffordshire bullterrier +American Staffordshire terrier +Bedlington terrier +Border terrier +Kerry blue terrier +Irish terrier +Norfolk terrier +Norwich terrier +Yorkshire terrier +wire-haired fox terrier +Lakeland terrier +Sealyham terrier +Airedale +cairn +Australian terrier +Dandie Dinmont +Boston bull +miniature schnauzer +giant schnauzer +standard schnauzer +Scotch terrier +Tibetan terrier +silky terrier +soft-coated wheaten terrier +West Highland white terrier +Lhasa +flat-coated retriever +curly-coated retriever +golden retriever +Labrador retriever +Chesapeake Bay retriever +German short-haired pointer +vizsla +English setter +Irish setter +Gordon setter +Brittany spaniel +clumber +English springer +Welsh springer spaniel +cocker spaniel +Sussex spaniel +Irish water spaniel +kuvasz +schipperke +groenendael +malinois +briard +kelpie +komondor +Old English sheepdog +Shetland sheepdog +collie +Border collie +Bouvier des Flandres +Rottweiler +German shepherd +Doberman +miniature pinscher +Greater Swiss Mountain dog +Bernese mountain dog +Appenzeller +EntleBucher +boxer +bull mastiff +Tibetan mastiff +French bulldog +Great Dane +Saint Bernard +Eskimo dog +malamute +Siberian husky +dalmatian +affenpinscher +basenji +pug +Leonberg +Newfoundland +Great Pyrenees +Samoyed +Pomeranian +chow +keeshond +Brabancon griffon +Pembroke +Cardigan +toy poodle +miniature poodle +standard poodle +Mexican hairless +timber wolf +white wolf +red wolf +coyote +dingo +dhole +African hunting dog +hyena +red fox +kit fox +Arctic fox +grey fox +tabby +tiger cat +Persian cat +Siamese cat +Egyptian cat +cougar +lynx +leopard +snow leopard +jaguar +lion +tiger +cheetah +brown bear +American black bear +ice bear +sloth bear +mongoose +meerkat +tiger beetle +ladybug +ground beetle +long-horned beetle +leaf beetle 
+dung beetle +rhinoceros beetle +weevil +fly +bee +ant +grasshopper +cricket +walking stick +cockroach +mantis +cicada +leafhopper +lacewing +dragonfly +damselfly +admiral +ringlet +monarch +cabbage butterfly +sulphur butterfly +lycaenid +starfish +sea urchin +sea cucumber +wood rabbit +hare +Angora +hamster +porcupine +fox squirrel +marmot +beaver +guinea pig +sorrel +zebra +hog +wild boar +warthog +hippopotamus +ox +water buffalo +bison +ram +bighorn +ibex +hartebeest +impala +gazelle +Arabian camel +llama +weasel +mink +polecat +black-footed ferret +otter +skunk +badger +armadillo +three-toed sloth +orangutan +gorilla +chimpanzee +gibbon +siamang +guenon +patas +baboon +macaque +langur +colobus +proboscis monkey +marmoset +capuchin +howler monkey +titi +spider monkey +squirrel monkey +Madagascar cat +indri +Indian elephant +African elephant +lesser panda +giant panda +barracouta +eel +coho +rock beauty +anemone fish +sturgeon +gar +lionfish +puffer +abacus +abaya +academic gown +accordion +acoustic guitar +aircraft carrier +airliner +airship +altar +ambulance +amphibian +analog clock +apiary +apron +ashcan +assault rifle +backpack +bakery +balance beam +balloon +ballpoint +Band Aid +banjo +bannister +barbell +barber chair +barbershop +barn +barometer +barrel +barrow +baseball +basketball +bassinet +bassoon +bathing cap +bath towel +bathtub +beach wagon +beacon +beaker +bearskin +beer bottle +beer glass +bell cote +bib +bicycle-built-for-two +bikini +binder +binoculars +birdhouse +boathouse +bobsled +bolo tie +bonnet +bookcase +bookshop +bottlecap +bow +bow tie +brass +brassiere +breakwater +breastplate +broom +bucket +buckle +bulletproof vest +bullet train +butcher shop +cab +caldron +candle +cannon +canoe +can opener +cardigan +car mirror +carousel +carpenter's kit +carton +car wheel +cash machine +cassette +cassette player +castle +catamaran +CD player +cello +cellular telephone +chain +chainlink fence +chain mail +chain saw +chest +chiffonier +chime +china cabinet +Christmas stocking +church +cinema +cleaver +cliff dwelling +cloak +clog +cocktail shaker +coffee mug +coffeepot +coil +combination lock +computer keyboard +confectionery +container ship +convertible +corkscrew +cornet +cowboy boot +cowboy hat +cradle +crane +crash helmet +crate +crib +Crock Pot +croquet ball +crutch +cuirass +dam +desk +desktop computer +dial telephone +diaper +digital clock +digital watch +dining table +dishrag +dishwasher +disk brake +dock +dogsled +dome +doormat +drilling platform +drum +drumstick +dumbbell +Dutch oven +electric fan +electric guitar +electric locomotive +entertainment center +envelope +espresso maker +face powder +feather boa +file +fireboat +fire engine +fire screen +flagpole +flute +folding chair +football helmet +forklift +fountain +fountain pen +four-poster +freight car +French horn +frying pan +fur coat +garbage truck +gasmask +gas pump +goblet +go-kart +golf ball +golfcart +gondola +gong +gown +grand piano +greenhouse +grille +grocery store +guillotine +hair slide +hair spray +half track +hammer +hamper +hand blower +hand-held computer +handkerchief +hard disc +harmonica +harp +harvester +hatchet +holster +home theater +honeycomb +hook +hoopskirt +horizontal bar +horse cart +hourglass +iPod +iron +jack-o'-lantern +jean +jeep +jersey +jigsaw puzzle +jinrikisha +joystick +kimono +knee pad +knot +lab coat +ladle +lampshade +laptop +lawn mower +lens cap +letter opener +library +lifeboat +lighter +limousine +liner +lipstick +Loafer +lotion +loudspeaker +loupe +lumbermill +magnetic 
compass +mailbag +mailbox +maillot +maillot +manhole cover +maraca +marimba +mask +matchstick +maypole +maze +measuring cup +medicine chest +megalith +microphone +microwave +military uniform +milk can +minibus +miniskirt +minivan +missile +mitten +mixing bowl +mobile home +Model T +modem +monastery +monitor +moped +mortar +mortarboard +mosque +mosquito net +motor scooter +mountain bike +mountain tent +mouse +mousetrap +moving van +muzzle +nail +neck brace +necklace +nipple +notebook +obelisk +oboe +ocarina +odometer +oil filter +organ +oscilloscope +overskirt +oxcart +oxygen mask +packet +paddle +paddlewheel +padlock +paintbrush +pajama +palace +panpipe +paper towel +parachute +parallel bars +park bench +parking meter +passenger car +patio +pay-phone +pedestal +pencil box +pencil sharpener +perfume +Petri dish +photocopier +pick +pickelhaube +picket fence +pickup +pier +piggy bank +pill bottle +pillow +ping-pong ball +pinwheel +pirate +pitcher +plane +planetarium +plastic bag +plate rack +plow +plunger +Polaroid camera +pole +police van +poncho +pool table +pop bottle +pot +potter's wheel +power drill +prayer rug +printer +prison +projectile +projector +puck +punching bag +purse +quill +quilt +racer +racket +radiator +radio +radio telescope +rain barrel +recreational vehicle +reel +reflex camera +refrigerator +remote control +restaurant +revolver +rifle +rocking chair +rotisserie +rubber eraser +rugby ball +rule +running shoe +safe +safety pin +saltshaker +sandal +sarong +sax +scabbard +scale +school bus +schooner +scoreboard +screen +screw +screwdriver +seat belt +sewing machine +shield +shoe shop +shoji +shopping basket +shopping cart +shovel +shower cap +shower curtain +ski +ski mask +sleeping bag +slide rule +sliding door +slot +snorkel +snowmobile +snowplow +soap dispenser +soccer ball +sock +solar dish +sombrero +soup bowl +space bar +space heater +space shuttle +spatula +speedboat +spider web +spindle +sports car +spotlight +stage +steam locomotive +steel arch bridge +steel drum +stethoscope +stole +stone wall +stopwatch +stove +strainer +streetcar +stretcher +studio couch +stupa +submarine +suit +sundial +sunglass +sunglasses +sunscreen +suspension bridge +swab +sweatshirt +swimming trunks +swing +switch +syringe +table lamp +tank +tape player +teapot +teddy +television +tennis ball +thatch +theater curtain +thimble +thresher +throne +tile roof +toaster +tobacco shop +toilet seat +torch +totem pole +tow truck +toyshop +tractor +trailer truck +tray +trench coat +tricycle +trimaran +tripod +triumphal arch +trolleybus +trombone +tub +turnstile +typewriter keyboard +umbrella +unicycle +upright +vacuum +vase +vault +velvet +vending machine +vestment +viaduct +violin +volleyball +waffle iron +wall clock +wallet +wardrobe +warplane +washbasin +washer +water bottle +water jug +water tower +whiskey jug +whistle +wig +window screen +window shade +Windsor tie +wine bottle +wing +wok +wooden spoon +wool +worm fence +wreck +yawl +yurt +web site +comic book +crossword puzzle +street sign +traffic light +book jacket +menu +plate +guacamole +consomme +hot pot +trifle +ice cream +ice lolly +French loaf +bagel +pretzel +cheeseburger +hotdog +mashed potato +head cabbage +broccoli +cauliflower +zucchini +spaghetti squash +acorn squash +butternut squash +cucumber +artichoke +bell pepper +cardoon +mushroom +Granny Smith +strawberry +orange +lemon +fig +pineapple +banana +jackfruit +custard apple +pomegranate +hay +carbonara +chocolate sauce +dough +meat loaf +pizza +potpie +burrito +red wine +espresso 
+cup +eggnog +alp +bubble +cliff +coral reef +geyser +lakeside +promontory +sandbar +seashore +valley +volcano +ballplayer +groom +scuba diver +rapeseed +daisy +yellow lady's slipper +corn +acorn +hip +buckeye +coral fungus +agaric +gyromitra +stinkhorn +earthstar +hen-of-the-woods +bolete +ear +toilet tissue diff --git a/apps/android/ImageClassification/classification/src/main/java/com/qcom/imageclassification/ImageClassification.java b/apps/android/ImageClassification/classification/src/main/java/com/qcom/imageclassification/ImageClassification.java new file mode 100644 index 00000000..50adfcf8 --- /dev/null +++ b/apps/android/ImageClassification/classification/src/main/java/com/qcom/imageclassification/ImageClassification.java @@ -0,0 +1,214 @@ +// --------------------------------------------------------------------- +// Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. +// SPDX-License-Identifier: BSD-3-Clause +// --------------------------------------------------------------------- +package com.qcom.imageclassification; + +import android.content.Context; +import android.graphics.Bitmap; +import java.io.BufferedReader; +import java.io.IOException; +import java.io.InputStreamReader; +import java.nio.MappedByteBuffer; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import com.qualcomm.qti.QnnDelegate; +import org.tensorflow.lite.Interpreter; +import android.content.res.AssetFileDescriptor; +import android.content.res.AssetManager; +import android.util.Log; +import android.widget.Toast; +import java.io.FileInputStream; +import java.nio.channels.FileChannel; + +public class ImageClassification { + + private Context context; + private MappedByteBuffer tfLiteModel; + private Interpreter tfLite; + private Interpreter tfLite_QNN; + private QnnDelegate qnnDelegate = null; + private static final String TAG = "Sahin"; + private static final float IMAGE_MEAN = 127.7f; + private static final float IMAGE_STD =128f; + private List labelList; + private static final String LABEL_PATH = "labels.txt"; + boolean model_loaded = false; + public boolean getBuildStatus() + { + return model_loaded; + } + public void close() + { + if(qnnDelegate!=null) { + qnnDelegate.close(); + } + + if(tfLite != null){ + tfLite.close(); + } + + if(tfLiteModel!=null) + tfLiteModel.clear(); + + if(labelList!=null) + labelList.clear(); + } + public boolean initializeModel(Context context,String TFLITE_FILE) throws IOException { + + this.context = context; + + try { + tfLiteModel = loadModelFile(context.getApplicationContext().getAssets(), TFLITE_FILE); + Log.i(TAG, "MODEL LOADED"); + Interpreter.Options tfLiteOptions = new Interpreter.Options(); + tfLiteOptions.setNumThreads(4); + tfLiteOptions.setUseXNNPACK(true); + + tfLite = new Interpreter(tfLiteModel, tfLiteOptions); + + QnnDelegate.Options options = new QnnDelegate.Options(); + options.setBackendType(QnnDelegate.Options.BackendType.HTP_BACKEND); + options.setHtpPerformanceMode(QnnDelegate.Options.HtpPerformanceMode.HTP_PERFORMANCE_BURST); + options.setHtpPrecision(QnnDelegate.Options.HtpPrecision.HTP_PRECISION_FP16); + + Log.i(TAG, "NATIVE LIB PATH: " + context.getApplicationInfo().nativeLibraryDir); + options.setSkelLibraryDir(context.getApplicationInfo().nativeLibraryDir); + + qnnDelegate = new QnnDelegate(options); + tfLiteOptions.addDelegate(qnnDelegate); + tfLite_QNN = new Interpreter(tfLiteModel,tfLiteOptions); + Log.i(TAG, "QnnDelegate Option Added"); + 
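+            // Note (added comment): two Interpreter instances are kept on purpose. tfLite runs on the CPU
+            // (XNNPACK enabled), while tfLite_QNN wraps the same mapped model with the QNN HTP delegate,
+            // so inference() can pick a backend per call without reloading or re-mapping the model file.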
model_loaded= true; + Log.d(TAG,"Label list Loaded Successfully"); + labelList =loadLabelList(LABEL_PATH); + return true; + + } catch (IOException e) { + Log.e(TAG,"TFLite Model Loading Unsuccessfull"); + e.printStackTrace(); + return false; + } + } + + + public static List findTop3Indices(float[] arr) { + List topIndices = new ArrayList<>(); + + for (int i = 0; i < 3; i++) { + int maxIndex = 0; + float maxValue = arr[0]; + + for (int j = 1; j < arr.length; j++) { + if (arr[j] > maxValue && !topIndices.contains(j)) { + maxValue = arr[j]; + maxIndex = j; + } + } + + topIndices.add(maxIndex); + } + + return topIndices; + } + + public static MappedByteBuffer loadModelFile(AssetManager assets, String modelFilename) + throws IOException { + AssetFileDescriptor fileDescriptor = assets.openFd(modelFilename); + FileInputStream inputStream = new FileInputStream(fileDescriptor.getFileDescriptor()); + FileChannel fileChannel = inputStream.getChannel(); + long startOffset = fileDescriptor.getStartOffset(); + long declaredLength = fileDescriptor.getDeclaredLength(); + return fileChannel.map(FileChannel.MapMode.READ_ONLY, startOffset, declaredLength); + } + + public Result inference(Bitmap[] images, String backend) { + System.out.println("Processing %d images %dx%d."+ images.length+ images[0].getWidth()+ images[0].getHeight()); + + try { + + long Preprocessing_StartTime = System.nanoTime(); + Log.d(TAG,"Image Preprocessing"); + + Utils util = new Utils(); + List img_mean = Arrays.asList(IMAGE_MEAN, IMAGE_MEAN, IMAGE_MEAN); + List img_std = Arrays.asList(IMAGE_STD, IMAGE_STD, IMAGE_STD); + + int[] arr = tfLite.getInputTensor(0).shape(); //FOR VISION MODEL - input is normally like (B,H,W,C) + int channel = arr[3]; + int input_dims1 = arr[1]; + int input_dims2 = arr[2]; + + Bitmap scaledBitmap = Bitmap.createScaledBitmap(images[0],input_dims1,input_dims2,true); + + float[][][][] floatinputarray = new float[1][input_dims1][input_dims1][channel]; + util.PreProcess(scaledBitmap, input_dims1, input_dims2, floatinputarray, img_mean, img_std); + + long Preprocessing_EndTime = System.nanoTime(); + long Preporccsing_TimeDiff=Preprocessing_EndTime-Preprocessing_StartTime; + + Log.d(TAG,"Preprocessing Time: "+Preporccsing_TimeDiff/1000000+"ms"); + + Object[] inputArray = {floatinputarray}; + float[][] floatoutputarray = new float[1][1000]; + Map outputMap = new HashMap<>(); + outputMap.put(0, floatoutputarray); + + long inferenceStartTime = System.nanoTime(); + + if (backend.equals("NPU") && tfLite_QNN != null) { + System.out.println("NPU BACKEND"); + tfLite_QNN.runForMultipleInputsOutputs(inputArray, outputMap); + } + else if (backend.equals("CPU") && tfLite != null) { + System.out.println("TFLITE BACKEND"); + tfLite.runForMultipleInputsOutputs(inputArray, outputMap); + } + else + { + System.out.println("Sycronisation issue"); + } + + Log.i(TAG, "MODEL EXECUTED"); + long inferenceEndTime = System.nanoTime(); + long TimeDiff=inferenceEndTime-inferenceStartTime; + + Toast.makeText(context,"Inference Time: "+TimeDiff/1000000+"ms",Toast.LENGTH_SHORT).show(); + Log.i(TAG,"Inference Completed"); + + String res=""; + List indexList = findTop3Indices(floatoutputarray[0]); + + for(int i=0;i<3;i++){ + res+=labelList.get(indexList.get(i)+1)+", "; + } + + res = res.substring(0, res.length() - 2); //Removing comma from last + + ImageClassificationResult result = new ImageClassificationResult(indexList, res); + + return new Result<>(result, + (inferenceEndTime - inferenceStartTime) / 1000000); + + } catch (Exception ex) { + 
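findTop3Indices and the label lookup in inference() are plain Java, so they are easy to sanity-check off-device. The following minimal check is an editorial sketch, not part of the patch: `Top3Check` is an invented name, and it assumes it is compiled inside the classification module (for example as a unit test) so that ImageClassification resolves.

```
package com.qcom.imageclassification;

import java.util.List;

// Hypothetical sanity check for the top-3 selection above; not part of the patch.
public class Top3Check {
    public static void main(String[] args) {
        // Indices 3, 1 and 4 hold the three largest scores.
        float[] scores = {0.10f, 0.70f, 0.05f, 0.90f, 0.30f};
        List top3 = ImageClassification.findTop3Indices(scores);
        System.out.println(top3); // [3, 1, 4]
        // inference() then looks up labelList.get(index + 1), i.e. labels.txt
        // is read with a one-row offset.
        // Caveat: maxIndex restarts at 0 on every pass without consulting
        // topIndices, so an input whose largest score sits at index 0 is
        // reported as [0, 0, 0].
    }
}
```

If that duplicate-index edge case matters in practice, seeding maxIndex/maxValue from the first index not already in topIndices would close it.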
ex.printStackTrace(); + return null; + + } + } + + private List loadLabelList(String labelPath) throws IOException { + List labelList = new ArrayList<>(); + AssetManager assetManager= context.getAssets(); + BufferedReader reader = new BufferedReader(new InputStreamReader(assetManager.open(labelPath))); + String line; + while ((line = reader.readLine()) != null) { + labelList.add(line); + } + reader.close(); + return labelList; + } +} diff --git a/apps/android/ImageClassification/classification/src/main/java/com/qcom/imageclassification/ImageClassificationResult.java b/apps/android/ImageClassification/classification/src/main/java/com/qcom/imageclassification/ImageClassificationResult.java new file mode 100644 index 00000000..51c797fa --- /dev/null +++ b/apps/android/ImageClassification/classification/src/main/java/com/qcom/imageclassification/ImageClassificationResult.java @@ -0,0 +1,32 @@ +// --------------------------------------------------------------------- +// Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. +// SPDX-License-Identifier: BSD-3-Clause +// --------------------------------------------------------------------- +package com.qcom.imageclassification; + +import android.graphics.Bitmap; +import android.media.Image; + +import java.util.ArrayList; +import java.util.List; + +public class ImageClassificationResult { + + private List topindices; + private String ResultString; + + public ImageClassificationResult(List customlist, String res) + { + this.topindices = customlist; + this.ResultString = res; + } + + public List getIndices() + { + return topindices; + } + public String getResultString() + { + return ResultString; + } +} diff --git a/apps/android/ImageClassification/classification/src/main/java/com/qcom/imageclassification/QNNActivity.java b/apps/android/ImageClassification/classification/src/main/java/com/qcom/imageclassification/QNNActivity.java new file mode 100644 index 00000000..38ba98ea --- /dev/null +++ b/apps/android/ImageClassification/classification/src/main/java/com/qcom/imageclassification/QNNActivity.java @@ -0,0 +1,177 @@ +// --------------------------------------------------------------------- +// Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. 
+// SPDX-License-Identifier: BSD-3-Clause +// --------------------------------------------------------------------- +package com.qcom.imageclassification; + +import android.graphics.Bitmap; +import android.graphics.BitmapFactory; +import android.os.Bundle; +import android.view.View; +import android.widget.AdapterView; +import android.widget.ArrayAdapter; +import android.widget.ImageView; +import android.widget.RadioButton; +import android.widget.RadioGroup; +import android.widget.Spinner; +import android.widget.TextView; +import android.widget.Toast; + +import androidx.appcompat.app.AppCompatActivity; + +import java.io.IOException; +import java.io.InputStream; + +public class QNNActivity extends AppCompatActivity { + + public static InputStream originalFile = null; + ImageClassification imageClassification; + private final static String TFLITE_FILE = "classification.tflite"; + + //creating objects for UI element used in layout files (activity_classification.xml) + RadioButton rb1, rb2, rb3; + + String prev_runtime = ""; + ImageView imageView; + RadioGroup radioGroup; + TextView predicted_view; + Bitmap bmps = null; + private boolean spinInitialized = false; + private boolean radioGroupInitialized = false; + public static Result result = null; + Spinner spin; + private static final String TAG="Image_Classification"; + + String[] options = {"No Selection","Sample1.png","Sample2.png","Sample3.png"}; //Image filenames on which model inference is made + protected void executeRadioButton(int checkedId) { + switch (checkedId) { + case R.id.rb1: + // set text for your textview here + System.out.println("CPU instance running"); + result = process(bmps, "CPU"); + break; + case R.id.rb2: + // set text for your textview here + System.out.println("NPU instance running"); + System.out.println("Device runtime " + "NPU"); + result = process(bmps, "NPU"); + break; + default: + System.out.println("Do Nothing"); + } + } + + @Override + protected void onCreate(Bundle savedInstanceState) { + super.onCreate(savedInstanceState); + + //Initialization + setContentView(R.layout.activity_classification); + rb1 = (RadioButton) findViewById(R.id.rb1); + rb2 = (RadioButton) findViewById(R.id.rb2); + imageView = (ImageView) findViewById(R.id.im1); + radioGroup = (RadioGroup) findViewById(R.id.rg1); + spin = (Spinner) findViewById((R.id.spinner)); + predicted_view=(TextView)findViewById(R.id.textView4); + + predicted_view.setVisibility(View.INVISIBLE); + + imageClassification = new ImageClassification(); + + ArrayAdapter ad = new ArrayAdapter(this, android.R.layout.simple_spinner_item, options); + ad.setDropDownViewResource(android.R.layout.simple_spinner_dropdown_item); + spin.setAdapter(ad); + + + radioGroup.setOnCheckedChangeListener(new RadioGroup.OnCheckedChangeListener() { + @Override + public void onCheckedChanged(RadioGroup group, int checkedId) { + if (originalFile!=null && bmps!=null){ + executeRadioButton(checkedId); + } + else{ + if(radioGroupInitialized) { + Toast.makeText(getApplicationContext(), "Please select image first", Toast.LENGTH_SHORT).show(); + } + else + { + radioGroupInitialized = true; + } + } + } + }); + + spin.setOnItemSelectedListener(new AdapterView.OnItemSelectedListener() { + @Override + public void onItemSelected(AdapterView parent, View view, int position, long id) { + + // loading picture from assets... 
+ if (!parent.getItemAtPosition(position).equals("No Selection")) { + try { + originalFile = getAssets().open((String) parent.getItemAtPosition(position)); + } catch (IOException e) { + e.printStackTrace(); + } + + // Convert input image to Bitmap + bmps = BitmapFactory.decodeStream(originalFile); + + //Scaling the image size to show it on the ImageView + Bitmap scaled1 = Bitmap.createScaledBitmap(bmps, 512, 512, true); + try { + // Set the input image in UI view + imageView.setImageBitmap(scaled1); + } catch (Exception e) { + e.printStackTrace(); + } + //Taking the Runtime Environment input from Radio Button + int checkedID_RB = radioGroup.getCheckedRadioButtonId(); + if (originalFile!=null && bmps!=null && checkedID_RB !=-1){ + executeRadioButton(checkedID_RB); + } + } + else{ + + originalFile=null; + bmps=null; + imageView.setImageResource(R.drawable.ic_launcher_background); + radioGroup.clearCheck(); + + if(spinInitialized){ + Toast.makeText(getApplicationContext(), "Please select image first", Toast.LENGTH_SHORT).show(); + } + else + { + spinInitialized = true; + } + } + } + @Override + public void onNothingSelected(AdapterView parent) { + System.out.println("Nothing"); + } + }); + } + + public Result process(Bitmap bmps, String run_time){ + + Result result = null; + try { + if(imageClassification.getBuildStatus()==false) + imageClassification.initializeModel(this, TFLITE_FILE); + + result = imageClassification.inference(new Bitmap[] {bmps}, run_time); + } catch (IOException e) { + throw new RuntimeException(e); + } + predicted_view.setVisibility(View.VISIBLE); + predicted_view.setText(result.getResults().getResultString()); + return result; + } + + @Override + protected void onDestroy() { + super.onDestroy(); + imageClassification.close(); + } +} diff --git a/apps/android/ImageClassification/classification/src/main/java/com/qcom/imageclassification/Result.java b/apps/android/ImageClassification/classification/src/main/java/com/qcom/imageclassification/Result.java new file mode 100644 index 00000000..5428bbb9 --- /dev/null +++ b/apps/android/ImageClassification/classification/src/main/java/com/qcom/imageclassification/Result.java @@ -0,0 +1,27 @@ +// --------------------------------------------------------------------- +// Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. +// SPDX-License-Identifier: BSD-3-Clause +// --------------------------------------------------------------------- +package com.qcom.imageclassification; +import java.util.List; + +public class Result { + + private final E results; + private final long inferenceTime; + public Result(E results, long inferenceTime) { + + this.results = results; + this.inferenceTime = inferenceTime; + } + + public E getResults() { + return results; + } + + + public long getInferenceTime() { + return inferenceTime; + } + +} diff --git a/apps/android/ImageClassification/classification/src/main/java/com/qcom/imageclassification/Utils.java b/apps/android/ImageClassification/classification/src/main/java/com/qcom/imageclassification/Utils.java new file mode 100644 index 00000000..c3f8eadf --- /dev/null +++ b/apps/android/ImageClassification/classification/src/main/java/com/qcom/imageclassification/Utils.java @@ -0,0 +1,27 @@ +// --------------------------------------------------------------------- +// Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. 
+// SPDX-License-Identifier: BSD-3-Clause +// --------------------------------------------------------------------- +package com.qcom.imageclassification; + + +import android.graphics.Bitmap; +import android.graphics.Color; +import java.util.Arrays; +import java.util.List; + +public class Utils { + + //PRE PROCESSING Model Input + public void PreProcess(Bitmap inputBitmap, int input_dims1, int input_dims2, float[][][][] floatinputarray, List ImageMean, List ImageStd){ + for (int x = 0; x < input_dims1; x++) { + for (int y = 0; y < input_dims2; y++) { + int pixel = inputBitmap.getPixel(x, y); + List rgb = Arrays.asList((float)Color.red(pixel), (float)Color.green(pixel), (float)Color.blue(pixel)); + for(int z = 0;z<3; z++){ + floatinputarray[0][x][y][z] = (float)((rgb.get(z))-ImageMean.get(z))/ImageStd.get(z); + } + } + } + } +} diff --git a/apps/android/ImageClassification/classification/src/main/res/drawable-v24/ic_launcher_foreground.xml b/apps/android/ImageClassification/classification/src/main/res/drawable-v24/ic_launcher_foreground.xml new file mode 100644 index 00000000..1ff1154f --- /dev/null +++ b/apps/android/ImageClassification/classification/src/main/res/drawable-v24/ic_launcher_foreground.xml @@ -0,0 +1,30 @@ + + + + + + + + + + + diff --git a/apps/android/ImageClassification/classification/src/main/res/drawable/ic_launcher_background.xml b/apps/android/ImageClassification/classification/src/main/res/drawable/ic_launcher_background.xml new file mode 100644 index 00000000..a4f78de5 --- /dev/null +++ b/apps/android/ImageClassification/classification/src/main/res/drawable/ic_launcher_background.xml @@ -0,0 +1,170 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/apps/android/ImageClassification/classification/src/main/res/drawable/image_classification_icon.png b/apps/android/ImageClassification/classification/src/main/res/drawable/image_classification_icon.png new file mode 100644 index 00000000..23528a78 --- /dev/null +++ b/apps/android/ImageClassification/classification/src/main/res/drawable/image_classification_icon.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d97af3b197c294062cbb9f35bd277438befcd158eb6b47fa67552c00053dce49 +size 26951 diff --git a/apps/android/ImageClassification/classification/src/main/res/layout/activity_classification.xml b/apps/android/ImageClassification/classification/src/main/res/layout/activity_classification.xml new file mode 100644 index 00000000..6df6e740 --- /dev/null +++ b/apps/android/ImageClassification/classification/src/main/res/layout/activity_classification.xml @@ -0,0 +1,93 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/apps/android/ImageClassification/classification/src/main/res/mipmap-anydpi-v26/ic_launcher.xml b/apps/android/ImageClassification/classification/src/main/res/mipmap-anydpi-v26/ic_launcher.xml new file mode 100644 index 00000000..3564f5b0 --- /dev/null +++ b/apps/android/ImageClassification/classification/src/main/res/mipmap-anydpi-v26/ic_launcher.xml @@ -0,0 +1,5 @@ + + + + + diff --git a/apps/android/ImageClassification/classification/src/main/res/mipmap-anydpi-v26/ic_launcher_round.xml b/apps/android/ImageClassification/classification/src/main/res/mipmap-anydpi-v26/ic_launcher_round.xml new file mode 100644 index 00000000..0351084b --- /dev/null +++ b/apps/android/ImageClassification/classification/src/main/res/mipmap-anydpi-v26/ic_launcher_round.xml @@ -0,0 +1,6 @@ + + + + + + diff --git 
a/apps/android/ImageClassification/classification/src/main/res/mipmap-hdpi/ic_launcher.png b/apps/android/ImageClassification/classification/src/main/res/mipmap-hdpi/ic_launcher.png new file mode 100644 index 00000000..1ae43951 --- /dev/null +++ b/apps/android/ImageClassification/classification/src/main/res/mipmap-hdpi/ic_launcher.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5ee6ca2903e3094d64110dee90aea432f9eb9bc747c7e5c134496b8f7feff3b8 +size 3593 diff --git a/apps/android/ImageClassification/classification/src/main/res/mipmap-hdpi/ic_launcher_round.png b/apps/android/ImageClassification/classification/src/main/res/mipmap-hdpi/ic_launcher_round.png new file mode 100644 index 00000000..e898edde --- /dev/null +++ b/apps/android/ImageClassification/classification/src/main/res/mipmap-hdpi/ic_launcher_round.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:876e35db6919f81d28dbe042d8535414b2b3bb13d1139c16265f652ca5df65ac +size 5339 diff --git a/apps/android/ImageClassification/classification/src/main/res/mipmap-mdpi/ic_launcher.png b/apps/android/ImageClassification/classification/src/main/res/mipmap-mdpi/ic_launcher.png new file mode 100644 index 00000000..10afb4e6 --- /dev/null +++ b/apps/android/ImageClassification/classification/src/main/res/mipmap-mdpi/ic_launcher.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:62c4db47abd938c35f4926ea0b7b31b9d6c41bef1ddacb2c7685b5c6ea0890e5 +size 2636 diff --git a/apps/android/ImageClassification/classification/src/main/res/mipmap-mdpi/ic_launcher_round.png b/apps/android/ImageClassification/classification/src/main/res/mipmap-mdpi/ic_launcher_round.png new file mode 100644 index 00000000..dd1c781b --- /dev/null +++ b/apps/android/ImageClassification/classification/src/main/res/mipmap-mdpi/ic_launcher_round.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b9bc0ce206c8715db22eacfbceddff820f9b8a0ef3a519bbb88f7b6e65806d71 +size 3388 diff --git a/apps/android/ImageClassification/classification/src/main/res/mipmap-xhdpi/ic_launcher.png b/apps/android/ImageClassification/classification/src/main/res/mipmap-xhdpi/ic_launcher.png new file mode 100644 index 00000000..cc082f9a --- /dev/null +++ b/apps/android/ImageClassification/classification/src/main/res/mipmap-xhdpi/ic_launcher.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2a5e85ebec8c77c18f5c34d762949674373c7a95de57a8a82ce165c8db8cedbb +size 4926 diff --git a/apps/android/ImageClassification/classification/src/main/res/mipmap-xhdpi/ic_launcher_round.png b/apps/android/ImageClassification/classification/src/main/res/mipmap-xhdpi/ic_launcher_round.png new file mode 100644 index 00000000..3910511a --- /dev/null +++ b/apps/android/ImageClassification/classification/src/main/res/mipmap-xhdpi/ic_launcher_round.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:81b31d4cabd3e4358db2d99e6c31474b82ae6379dc06ea0c00267f41da1bd599 +size 7472 diff --git a/apps/android/ImageClassification/classification/src/main/res/mipmap-xxhdpi/ic_launcher.png b/apps/android/ImageClassification/classification/src/main/res/mipmap-xxhdpi/ic_launcher.png new file mode 100644 index 00000000..db7bbe05 --- /dev/null +++ b/apps/android/ImageClassification/classification/src/main/res/mipmap-xxhdpi/ic_launcher.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:01cec2d4d6cc59f250e4bbfa445042c3a3b9ebf9788b9f34796e85c0af874da4 +size 7909 diff --git 
a/apps/android/ImageClassification/classification/src/main/res/mipmap-xxhdpi/ic_launcher_round.png b/apps/android/ImageClassification/classification/src/main/res/mipmap-xxhdpi/ic_launcher_round.png new file mode 100644 index 00000000..17327017 --- /dev/null +++ b/apps/android/ImageClassification/classification/src/main/res/mipmap-xxhdpi/ic_launcher_round.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bf24682fa050affbc3bdaae15cd5532c13758a5a0e6d5305cfad52ddefc4d571 +size 11873 diff --git a/apps/android/ImageClassification/classification/src/main/res/mipmap-xxxhdpi/ic_launcher.png b/apps/android/ImageClassification/classification/src/main/res/mipmap-xxxhdpi/ic_launcher.png new file mode 100644 index 00000000..6413db3b --- /dev/null +++ b/apps/android/ImageClassification/classification/src/main/res/mipmap-xxxhdpi/ic_launcher.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b2f8a8d39587eb912f8d217f2160c4ba4e21a60fd7a6f84849c102bcac725975 +size 10652 diff --git a/apps/android/ImageClassification/classification/src/main/res/mipmap-xxxhdpi/ic_launcher_round.png b/apps/android/ImageClassification/classification/src/main/res/mipmap-xxxhdpi/ic_launcher_round.png new file mode 100644 index 00000000..5d1c9ba8 --- /dev/null +++ b/apps/android/ImageClassification/classification/src/main/res/mipmap-xxxhdpi/ic_launcher_round.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d7ed7010aa67cb843af6df357ff1a74ec1598d2407e4884d12633aa0f11e3a4b +size 16570 diff --git a/apps/android/ImageClassification/classification/src/main/res/values-night/themes.xml b/apps/android/ImageClassification/classification/src/main/res/values-night/themes.xml new file mode 100644 index 00000000..a9e583cb --- /dev/null +++ b/apps/android/ImageClassification/classification/src/main/res/values-night/themes.xml @@ -0,0 +1,17 @@ + + + + + diff --git a/apps/android/ImageClassification/classification/src/main/res/values/colors.xml b/apps/android/ImageClassification/classification/src/main/res/values/colors.xml new file mode 100644 index 00000000..977bb9f2 --- /dev/null +++ b/apps/android/ImageClassification/classification/src/main/res/values/colors.xml @@ -0,0 +1,11 @@ + + + + #FFBB86FC + #FF6200EE + #FF3700B3 + #FF03DAC5 + #FF018786 + #FF000000 + #FFFFFFFF + diff --git a/apps/android/ImageClassification/classification/src/main/res/values/strings.xml b/apps/android/ImageClassification/classification/src/main/res/values/strings.xml new file mode 100644 index 00000000..2967480c --- /dev/null +++ b/apps/android/ImageClassification/classification/src/main/res/values/strings.xml @@ -0,0 +1,4 @@ + + + Image_Classification + diff --git a/apps/android/ImageClassification/classification/src/main/res/values/themes.xml b/apps/android/ImageClassification/classification/src/main/res/values/themes.xml new file mode 100644 index 00000000..038239bb --- /dev/null +++ b/apps/android/ImageClassification/classification/src/main/res/values/themes.xml @@ -0,0 +1,18 @@ + + + + + + diff --git a/apps/android/ImageClassification/gradle.properties b/apps/android/ImageClassification/gradle.properties new file mode 100644 index 00000000..08e95206 --- /dev/null +++ b/apps/android/ImageClassification/gradle.properties @@ -0,0 +1,20 @@ +# Project-wide Gradle settings. +# IDE (e.g. Android Studio) users: +# Gradle settings configured through the IDE *will override* +# any settings specified in this file. 
+# For more details on how to configure your build environment visit +# http://www.gradle.org/docs/current/userguide/build_environment.html +# Specifies the JVM arguments used for the daemon process. +# The setting is particularly useful for tweaking memory settings. +org.gradle.jvmargs=-Xmx2048m -Dfile.encoding=UTF-8 +# When configured, Gradle will run in incubating parallel mode. +# This option should only be used with decoupled projects. More details, visit +# http://www.gradle.org/docs/current/userguide/multi_project_builds.html#sec:decoupled_projects +# org.gradle.parallel=true +# AndroidX package structure to make it clearer which packages are bundled with the +# Android operating system, and which are packaged with your app"s APK +# https://developer.android.com/topic/libraries/support-library/androidx-rn +android.useAndroidX=true +# Automatically convert third-party libraries to use AndroidX +#android.enableJetifier=true +android.nonTransitiveRClass=true diff --git a/apps/android/ImageClassification/gradle/wrapper/gradle-wrapper.jar b/apps/android/ImageClassification/gradle/wrapper/gradle-wrapper.jar new file mode 100644 index 00000000..c4868dfc --- /dev/null +++ b/apps/android/ImageClassification/gradle/wrapper/gradle-wrapper.jar @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:33ad4583fd7ee156f533778736fa1b4940bd83b433934d1cc4e9f608e99a6a89 +size 59536 diff --git a/apps/android/ImageClassification/gradle/wrapper/gradle-wrapper.properties b/apps/android/ImageClassification/gradle/wrapper/gradle-wrapper.properties new file mode 100644 index 00000000..f79f7cc5 --- /dev/null +++ b/apps/android/ImageClassification/gradle/wrapper/gradle-wrapper.properties @@ -0,0 +1,6 @@ +#Wed Feb 07 17:55:01 IST 2024 +distributionBase=GRADLE_USER_HOME +distributionPath=wrapper/dists +distributionUrl=https\://services.gradle.org/distributions/gradle-7.4-bin.zip +zipStoreBase=GRADLE_USER_HOME +zipStorePath=wrapper/dists diff --git a/apps/android/ImageClassification/gradlew b/apps/android/ImageClassification/gradlew new file mode 100644 index 00000000..744e882e --- /dev/null +++ b/apps/android/ImageClassification/gradlew @@ -0,0 +1,185 @@ +#!/usr/bin/env sh + +# +# Copyright 2015 the original author or authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +############################################################################## +## +## Gradle start up script for UN*X +## +############################################################################## + +# Attempt to set APP_HOME +# Resolve links: $0 may be a link +PRG="$0" +# Need this for relative symlinks. +while [ -h "$PRG" ] ; do + ls=`ls -ld "$PRG"` + link=`expr "$ls" : '.*-> \(.*\)$'` + if expr "$link" : '/.*' > /dev/null; then + PRG="$link" + else + PRG=`dirname "$PRG"`"/$link" + fi +done +SAVED="`pwd`" +cd "`dirname \"$PRG\"`/" >/dev/null +APP_HOME="`pwd -P`" +cd "$SAVED" >/dev/null + +APP_NAME="Gradle" +APP_BASE_NAME=`basename "$0"` + +# Add default JVM options here. 
You can also use JAVA_OPTS and GRADLE_OPTS to pass JVM options to this script. +DEFAULT_JVM_OPTS='"-Xmx64m" "-Xms64m"' + +# Use the maximum available, or set MAX_FD != -1 to use that value. +MAX_FD="maximum" + +warn () { + echo "$*" +} + +die () { + echo + echo "$*" + echo + exit 1 +} + +# OS specific support (must be 'true' or 'false'). +cygwin=false +msys=false +darwin=false +nonstop=false +case "`uname`" in + CYGWIN* ) + cygwin=true + ;; + Darwin* ) + darwin=true + ;; + MSYS* | MINGW* ) + msys=true + ;; + NONSTOP* ) + nonstop=true + ;; +esac + +CLASSPATH=$APP_HOME/gradle/wrapper/gradle-wrapper.jar + + +# Determine the Java command to use to start the JVM. +if [ -n "$JAVA_HOME" ] ; then + if [ -x "$JAVA_HOME/jre/sh/java" ] ; then + # IBM's JDK on AIX uses strange locations for the executables + JAVACMD="$JAVA_HOME/jre/sh/java" + else + JAVACMD="$JAVA_HOME/bin/java" + fi + if [ ! -x "$JAVACMD" ] ; then + die "ERROR: JAVA_HOME is set to an invalid directory: $JAVA_HOME + +Please set the JAVA_HOME variable in your environment to match the +location of your Java installation." + fi +else + JAVACMD="java" + which java >/dev/null 2>&1 || die "ERROR: JAVA_HOME is not set and no 'java' command could be found in your PATH. + +Please set the JAVA_HOME variable in your environment to match the +location of your Java installation." +fi + +# Increase the maximum file descriptors if we can. +if [ "$cygwin" = "false" -a "$darwin" = "false" -a "$nonstop" = "false" ] ; then + MAX_FD_LIMIT=`ulimit -H -n` + if [ $? -eq 0 ] ; then + if [ "$MAX_FD" = "maximum" -o "$MAX_FD" = "max" ] ; then + MAX_FD="$MAX_FD_LIMIT" + fi + ulimit -n $MAX_FD + if [ $? -ne 0 ] ; then + warn "Could not set maximum file descriptor limit: $MAX_FD" + fi + else + warn "Could not query maximum file descriptor limit: $MAX_FD_LIMIT" + fi +fi + +# For Darwin, add options to specify how the application appears in the dock +if $darwin; then + GRADLE_OPTS="$GRADLE_OPTS \"-Xdock:name=$APP_NAME\" \"-Xdock:icon=$APP_HOME/media/gradle.icns\"" +fi + +# For Cygwin or MSYS, switch paths to Windows format before running java +if [ "$cygwin" = "true" -o "$msys" = "true" ] ; then + APP_HOME=`cygpath --path --mixed "$APP_HOME"` + CLASSPATH=`cygpath --path --mixed "$CLASSPATH"` + + JAVACMD=`cygpath --unix "$JAVACMD"` + + # We build the pattern for arguments to be converted via cygpath + ROOTDIRSRAW=`find -L / -maxdepth 1 -mindepth 1 -type d 2>/dev/null` + SEP="" + for dir in $ROOTDIRSRAW ; do + ROOTDIRS="$ROOTDIRS$SEP$dir" + SEP="|" + done + OURCYGPATTERN="(^($ROOTDIRS))" + # Add a user-defined pattern to the cygpath arguments + if [ "$GRADLE_CYGPATTERN" != "" ] ; then + OURCYGPATTERN="$OURCYGPATTERN|($GRADLE_CYGPATTERN)" + fi + # Now convert the arguments - kludge to limit ourselves to /bin/sh + i=0 + for arg in "$@" ; do + CHECK=`echo "$arg"|egrep -c "$OURCYGPATTERN" -` + CHECK2=`echo "$arg"|egrep -c "^-"` ### Determine if an option + + if [ $CHECK -ne 0 ] && [ $CHECK2 -eq 0 ] ; then ### Added a condition + eval `echo args$i`=`cygpath --path --ignore --mixed "$arg"` + else + eval `echo args$i`="\"$arg\"" + fi + i=`expr $i + 1` + done + case $i in + 0) set -- ;; + 1) set -- "$args0" ;; + 2) set -- "$args0" "$args1" ;; + 3) set -- "$args0" "$args1" "$args2" ;; + 4) set -- "$args0" "$args1" "$args2" "$args3" ;; + 5) set -- "$args0" "$args1" "$args2" "$args3" "$args4" ;; + 6) set -- "$args0" "$args1" "$args2" "$args3" "$args4" "$args5" ;; + 7) set -- "$args0" "$args1" "$args2" "$args3" "$args4" "$args5" "$args6" ;; + 8) set -- "$args0" "$args1" "$args2" 
"$args3" "$args4" "$args5" "$args6" "$args7" ;; + 9) set -- "$args0" "$args1" "$args2" "$args3" "$args4" "$args5" "$args6" "$args7" "$args8" ;; + esac +fi + +# Escape application args +save () { + for i do printf %s\\n "$i" | sed "s/'/'\\\\''/g;1s/^/'/;\$s/\$/' \\\\/" ; done + echo " " +} +APP_ARGS=`save "$@"` + +# Collect all arguments for the java command, following the shell quoting and substitution rules +eval set -- $DEFAULT_JVM_OPTS $JAVA_OPTS $GRADLE_OPTS "\"-Dorg.gradle.appname=$APP_BASE_NAME\"" -classpath "\"$CLASSPATH\"" org.gradle.wrapper.GradleWrapperMain "$APP_ARGS" + +exec "$JAVACMD" "$@" diff --git a/apps/android/ImageClassification/gradlew.bat b/apps/android/ImageClassification/gradlew.bat new file mode 100644 index 00000000..ac1b06f9 --- /dev/null +++ b/apps/android/ImageClassification/gradlew.bat @@ -0,0 +1,89 @@ +@rem +@rem Copyright 2015 the original author or authors. +@rem +@rem Licensed under the Apache License, Version 2.0 (the "License"); +@rem you may not use this file except in compliance with the License. +@rem You may obtain a copy of the License at +@rem +@rem https://www.apache.org/licenses/LICENSE-2.0 +@rem +@rem Unless required by applicable law or agreed to in writing, software +@rem distributed under the License is distributed on an "AS IS" BASIS, +@rem WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +@rem See the License for the specific language governing permissions and +@rem limitations under the License. +@rem + +@if "%DEBUG%" == "" @echo off +@rem ########################################################################## +@rem +@rem Gradle startup script for Windows +@rem +@rem ########################################################################## + +@rem Set local scope for the variables with windows NT shell +if "%OS%"=="Windows_NT" setlocal + +set DIRNAME=%~dp0 +if "%DIRNAME%" == "" set DIRNAME=. +set APP_BASE_NAME=%~n0 +set APP_HOME=%DIRNAME% + +@rem Resolve any "." and ".." in APP_HOME to make it shorter. +for %%i in ("%APP_HOME%") do set APP_HOME=%%~fi + +@rem Add default JVM options here. You can also use JAVA_OPTS and GRADLE_OPTS to pass JVM options to this script. +set DEFAULT_JVM_OPTS="-Xmx64m" "-Xms64m" + +@rem Find java.exe +if defined JAVA_HOME goto findJavaFromJavaHome + +set JAVA_EXE=java.exe +%JAVA_EXE% -version >NUL 2>&1 +if "%ERRORLEVEL%" == "0" goto execute + +echo. +echo ERROR: JAVA_HOME is not set and no 'java' command could be found in your PATH. +echo. +echo Please set the JAVA_HOME variable in your environment to match the +echo location of your Java installation. + +goto fail + +:findJavaFromJavaHome +set JAVA_HOME=%JAVA_HOME:"=% +set JAVA_EXE=%JAVA_HOME%/bin/java.exe + +if exist "%JAVA_EXE%" goto execute + +echo. +echo ERROR: JAVA_HOME is set to an invalid directory: %JAVA_HOME% +echo. +echo Please set the JAVA_HOME variable in your environment to match the +echo location of your Java installation. + +goto fail + +:execute +@rem Setup the command line + +set CLASSPATH=%APP_HOME%\gradle\wrapper\gradle-wrapper.jar + + +@rem Execute Gradle +"%JAVA_EXE%" %DEFAULT_JVM_OPTS% %JAVA_OPTS% %GRADLE_OPTS% "-Dorg.gradle.appname=%APP_BASE_NAME%" -classpath "%CLASSPATH%" org.gradle.wrapper.GradleWrapperMain %* + +:end +@rem End local scope for the variables with windows NT shell +if "%ERRORLEVEL%"=="0" goto mainEnd + +:fail +rem Set variable GRADLE_EXIT_CONSOLE if you need the _script_ return code instead of +rem the _cmd.exe /c_ return code! 
+if not "" == "%GRADLE_EXIT_CONSOLE%" exit 1 +exit /b 1 + +:mainEnd +if "%OS%"=="Windows_NT" endlocal + +:omega diff --git a/apps/android/ImageClassification/settings.gradle b/apps/android/ImageClassification/settings.gradle new file mode 100644 index 00000000..2f5039bf --- /dev/null +++ b/apps/android/ImageClassification/settings.gradle @@ -0,0 +1,29 @@ +pluginManagement { + repositories { + gradlePluginPortal() + google() + mavenCentral() + } +} + +dependencyResolutionManagement { + repositoriesMode.set(RepositoriesMode.FAIL_ON_PROJECT_REPOS) + repositories { + google() + mavenCentral() + maven { // add this repo to use snapshots + name 'ossrh-snapshot' + url 'https://oss.sonatype.org/content/repositories/snapshots' + + } + flatDir { + dirs 'libs' + } + } +} + + + +//include ':snpe-release' +rootProject.name = "classification" +include ':classification' diff --git a/apps/android/ImageSuperResolution/README.md b/apps/android/ImageSuperResolution/README.md new file mode 100644 index 00000000..70658c0e --- /dev/null +++ b/apps/android/ImageSuperResolution/README.md @@ -0,0 +1,66 @@ +### Requirements + +1. Java, android-sdk and sdkmanager is already set at user's end +2. User should have Linux QNN SDK in local machine. +3. ANDROID_HOME is set to android-sdk path +4. AI-Hub is properly configured with user token. + + +## Info +Please execute build_apk.py. This script will compile and download a model from AI-Hub and paste it in your Android Proect and Generate superresolution-debug.apk + +This app takes model with image of size 128x128 as input and gives 512x512 as output. If you want, you can replace the model with any superesolution tflite model, but you have to change the pre-processing, post-processing and dimensions in the app code based on model parameters. + + +## Preprocessing + + +``` + public void PreProcess(Bitmap inputBitmap, int input_dims1, int input_dims2, float[][][][] floatinputarray){ + for (int x = 0; x < input_dims1; x++) { + for (int y = 0; y < input_dims2; y++) { + int pixel = inputBitmap.getPixel(x, y); + // Normalize channel values to [-1.0, 1.0]. Here, pixel values + // are positive so the effective range will be [0.0, 1.0] + floatinputarray[0][x][y][0] = (Color.red(pixel))/255.0f; + floatinputarray[0][x][y][1] = (Color.green(pixel))/255.0f; + floatinputarray[0][x][y][2] = (Color.blue(pixel))/255.0f; + } + } + } +``` + + +## PostProcessing + + +``` + public void PostProcess(Bitmap outbmp, int output_dims1, int output_dims2, float[][][][] floatoutputarray) { + for (int x = 0; x < output_dims1; x++) { + for (int y = 0; y < output_dims2; y++) { + int red = (int) (Math.max(0, Math.min(255, floatoutputarray[0][x][y][0] * 255))); + int green = (int) (Math.max(0, Math.min(255, floatoutputarray[0][x][y][1] * 255))); + int blue = (int) (Math.max(0, Math.min(255, floatoutputarray[0][x][y][2] * 255))); + int color = Color.argb(255, red, green, blue); + outbmp.setPixel(x, y, color); + } + } + } +``` + +### Build App: + +You have to run build_apk.py for Image Classification. It will generate classification-debug.apk and install it in connected device. 
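Before the command-line options listed next, here is a hedged sketch of how the PreProcess and PostProcess helpers shown above would typically be wired around a TFLite run for the 128x128 to 512x512 model this README describes. The `interpreter`, `utils`, and `inputBitmap` names are assumptions standing in for the objects the app code sets up elsewhere in this patch; only standard android.graphics.Bitmap and org.tensorflow.lite.Interpreter calls are used.

```
// Illustrative only: end-to-end use of the helpers above (assumed names).
Bitmap lowRes = Bitmap.createScaledBitmap(inputBitmap, 128, 128, true);

float[][][][] input = new float[1][128][128][3];
utils.PreProcess(lowRes, 128, 128, input);            // scale RGB to [0, 1]

float[][][][] output = new float[1][512][512][3];
interpreter.run(input, output);                       // float NHWC model

Bitmap highRes = Bitmap.createBitmap(512, 512, Bitmap.Config.ARGB_8888);
utils.PostProcess(highRes, 512, 512, output);         // clamp and scale back to [0, 255]
```

Swapping in a different super-resolution model means adjusting these dimensions and the two helpers to that model's expected layout, as noted in the Info section above.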
+ + + build_apk.py [-h] -q QNNSDK [-m MODEL_NAME] [-path MODEL_PATH] + +``` +options: + + -h, --help show this help message and exit + -q QNNSDK, --qnnsdk QNNSDK Give path of QNN SDK (REQUIRED) + -m MODEL_NAME, --model_name MODEL_NAME Model Name (Optional) + -path MODEL_PATH, --model_path MODEL_PATH Model Path (Optional) + +``` diff --git a/apps/android/ImageSuperResolution/build.gradle b/apps/android/ImageSuperResolution/build.gradle new file mode 100644 index 00000000..798f7515 --- /dev/null +++ b/apps/android/ImageSuperResolution/build.gradle @@ -0,0 +1,10 @@ + +// Top-level build file where you can add configuration options common to all sub-projects/modules. +plugins { + id 'com.android.application' version '7.2.1' apply false + id 'com.android.library' version '7.2.1' apply false +} + +task clean(type: Delete) { + delete rootProject.buildDir +} diff --git a/apps/android/ImageSuperResolution/build.properties b/apps/android/ImageSuperResolution/build.properties new file mode 100644 index 00000000..0ed786f4 --- /dev/null +++ b/apps/android/ImageSuperResolution/build.properties @@ -0,0 +1,2 @@ +MODELTYPE=XLSR +APPTYPE=SUPERRES diff --git a/apps/android/ImageSuperResolution/build_apk.py b/apps/android/ImageSuperResolution/build_apk.py new file mode 100644 index 00000000..28deae76 --- /dev/null +++ b/apps/android/ImageSuperResolution/build_apk.py @@ -0,0 +1,182 @@ +# --------------------------------------------------------------------- +# Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. +# SPDX-License-Identifier: BSD-3-Clause +# --------------------------------------------------------------------- +import argparse +import glob +import os +import shutil +import subprocess +import sys +from enum import Enum + + +class MODELNAME(Enum): + xlsr = 1 + esrgan = 2 + real_esrgan_general_x4v3 = 3 + # real_esrgan_x4plus = 4 + sesr_m5 = 5 + # quicksrnetsmall = 6 + # QuickSRNetMedium = 7 + # QuickSRNetLarge = 8 + # sesr_m5_quantized = 9 + # xlsr_quantized = 10 + + +def printmenu(): + print("*****************************") + print("* TYPE OF MODEL *") + print("*****************************") + for m in MODELNAME: + print(str(m.value) + ". 
" + m.name) + print("*****************************") + + +## Initialize parser +parser = argparse.ArgumentParser() +parser.add_argument("-q", "--qnnsdk", required=True, help="Give path of QNN SDK") + +parser.add_argument("-m", "--model_name", type=str, help="Model Name") + + +# group = parser.add_mutually_exclusive_group() +# group.add_argument('-stopdownload', '--stopdownload', action = "store_true", help = "Do NOT Download Model from AI HUB") +parser.add_argument("-path", "--model_path", type=str, help="TFLITE model file") + +args = parser.parse_args() + + +##based on this pre-post can be decided +if not args.model_name: + printmenu() + inp_model_name = int(input("Please select one: ")) + args.model_name = MODELNAME(inp_model_name).name + + +destAsset = os.path.join(".", "superresolution", "src", "main", "assets") +if not os.path.exists(destAsset): + os.makedirs(destAsset) + + +## MODEL PATH NOT MENTIONED, add information into model_path +if not args.model_path: + exportstatus = input("Do you want us to download the model from AI hub (y/n)") + + ##DOWNLAOD USING EXPORT.PY + if exportstatus.lower().startswith("y"): + print("EXPORT form path") + pathtomodel = os.path.join( + "..", + "..", + "..", + "", + "qai_hub_models", + "models", + args.model_name, + "export.py", + ) + if not os.path.exists(pathtomodel): + print("PATH DO NOT EXIST: " + pathtomodel) + exit() + subprocess.run(["python", pathtomodel, "--skip-inferencing"]) + tflite_file = glob.glob( + "build" + os.sep + args.model_name + os.sep + "*.tflite", recursive=True + ) + args.model_path = tflite_file[0] + # shutil.copy(tflite_file[0], destAsset+os.sep+"superresmodel.tflite") + + ##GET USER TO GIVE PATH + else: + args.model_path = input("Give model File as input") + # if not os.path.exists(tflite_file): + # print("PATH DO NOT EXIST: "+tflite_file) + # exit() + # shutil.copy(tflite_file, destAsset+os.sep+"superresmodel.tflite") + + +if args.model_path: + print(args.model_path) + if not os.path.exists(args.model_path): + print("PATH DO NOT EXIST: " + args.model_path) + exit() + shutil.copy(args.model_path, destAsset + os.sep + "superresmodel.tflite") + + +## GETTING PRE/POST PROCESSS BASED ON MODEL NAME +ProPostModel = "XLSR" +AppType = "SUPERRES" + +if args.model_name: + if "esrgan" in args.model_name.lower(): + print("MODEL IS ESRAGAN") + ProPostModel = "ESRGAN" + +with open("build.properties", "w") as f: + f.write(f"MODELTYPE={ProPostModel}\n") + f.write(f"APPTYPE={AppType}\n") + +## COPYING REQUIRED FILES FROM QNN SDK +destJNI = os.path.join(".", "superresolution", "src", "main", "jniLibs", "arm64-v8a") +if not os.path.exists(destJNI): + os.makedirs(destJNI) + +# copy *.so from $qnn_sdk/libs/aarch64-android to $jni_lib_dir +qnnbasiclibs = os.path.join(args.qnnsdk, "lib", "aarch64-android") +shutil.copytree(qnnbasiclibs, destJNI, dirs_exist_ok=True) + +# copy $qnn_sdk/lib/hexagon-v**/unsigned/libQnnHtpV**Skel.so to $jni_lib_dir +skelstubfiles = os.path.join(args.qnnsdk, "lib", "hexagon-v**", "unsigned", "*.so") +for file in glob.glob(skelstubfiles): + shutil.copy(file, destJNI) + +# copy qtld-release.aar to $test_app_root/Application/ +destaar = os.path.join(".", "superresolution", "libs") +if not os.path.exists(destaar): + os.makedirs(destaar) +aarfile = os.path.join(args.qnnsdk, "lib", "android", "qtld-release.aar") +shutil.copy(aarfile, destaar) + + +## BUILDING APK +if sys.platform.startswith("win"): + print("Detected platform is windows") + gradleoutput = subprocess.run(["gradlew.bat", "assembleDebug"], cwd=".") +elif 
sys.platform.startswith("darwin"): + print("Detected platform is MAC") + gradleoutput = subprocess.run(["./gradlew", "assembleDebug"], cwd=".") +else: + print("Detected platform is Linux") + gradleoutput = subprocess.run(["./gradlew", "assembleDebug"], cwd=".") + + +## COPYING APK TO CWD +ApkPath = os.path.join( + os.getcwd(), + "superresolution", + "build", + "outputs", + "apk", + "debug", + "superresolution-debug.apk", +) +print("APK Is copied at current Working Directory") +shutil.copy(ApkPath, ".") + + +install_perm = input("Do you want to install this apk in connected device") +## INSTALLING AND RUNNING APK +if install_perm.lower().startswith("y"): + command_to_install = ["adb", "install", "superresolution-debug.apk"] + subprocess.run(command_to_install, cwd=".") + command_to_run = [ + "adb", + "shell", + "am", + "start", + "-a", + "com.example.ACTION_NAME", + "-n", + "com.qcom.imagesuperres/com.qcom.imagesuperres.QNNActivity", + ] + subprocess.run(command_to_run, cwd=".") diff --git a/apps/android/ImageSuperResolution/gradle.properties b/apps/android/ImageSuperResolution/gradle.properties new file mode 100644 index 00000000..08e95206 --- /dev/null +++ b/apps/android/ImageSuperResolution/gradle.properties @@ -0,0 +1,20 @@ +# Project-wide Gradle settings. +# IDE (e.g. Android Studio) users: +# Gradle settings configured through the IDE *will override* +# any settings specified in this file. +# For more details on how to configure your build environment visit +# http://www.gradle.org/docs/current/userguide/build_environment.html +# Specifies the JVM arguments used for the daemon process. +# The setting is particularly useful for tweaking memory settings. +org.gradle.jvmargs=-Xmx2048m -Dfile.encoding=UTF-8 +# When configured, Gradle will run in incubating parallel mode. +# This option should only be used with decoupled projects. 
More details, visit +# http://www.gradle.org/docs/current/userguide/multi_project_builds.html#sec:decoupled_projects +# org.gradle.parallel=true +# AndroidX package structure to make it clearer which packages are bundled with the +# Android operating system, and which are packaged with your app"s APK +# https://developer.android.com/topic/libraries/support-library/androidx-rn +android.useAndroidX=true +# Automatically convert third-party libraries to use AndroidX +#android.enableJetifier=true +android.nonTransitiveRClass=true diff --git a/apps/android/ImageSuperResolution/gradle/wrapper/gradle-wrapper.jar b/apps/android/ImageSuperResolution/gradle/wrapper/gradle-wrapper.jar new file mode 100644 index 00000000..c4868dfc --- /dev/null +++ b/apps/android/ImageSuperResolution/gradle/wrapper/gradle-wrapper.jar @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:33ad4583fd7ee156f533778736fa1b4940bd83b433934d1cc4e9f608e99a6a89 +size 59536 diff --git a/apps/android/ImageSuperResolution/gradle/wrapper/gradle-wrapper.properties b/apps/android/ImageSuperResolution/gradle/wrapper/gradle-wrapper.properties new file mode 100644 index 00000000..4a0ccd62 --- /dev/null +++ b/apps/android/ImageSuperResolution/gradle/wrapper/gradle-wrapper.properties @@ -0,0 +1,6 @@ +#Fri Sep 09 10:14:39 IST 2022 +distributionBase=GRADLE_USER_HOME +distributionUrl=https\://services.gradle.org/distributions/gradle-7.4-bin.zip +distributionPath=wrapper/dists +zipStorePath=wrapper/dists +zipStoreBase=GRADLE_USER_HOME diff --git a/apps/android/ImageSuperResolution/gradlew b/apps/android/ImageSuperResolution/gradlew new file mode 100644 index 00000000..744e882e --- /dev/null +++ b/apps/android/ImageSuperResolution/gradlew @@ -0,0 +1,185 @@ +#!/usr/bin/env sh + +# +# Copyright 2015 the original author or authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +############################################################################## +## +## Gradle start up script for UN*X +## +############################################################################## + +# Attempt to set APP_HOME +# Resolve links: $0 may be a link +PRG="$0" +# Need this for relative symlinks. +while [ -h "$PRG" ] ; do + ls=`ls -ld "$PRG"` + link=`expr "$ls" : '.*-> \(.*\)$'` + if expr "$link" : '/.*' > /dev/null; then + PRG="$link" + else + PRG=`dirname "$PRG"`"/$link" + fi +done +SAVED="`pwd`" +cd "`dirname \"$PRG\"`/" >/dev/null +APP_HOME="`pwd -P`" +cd "$SAVED" >/dev/null + +APP_NAME="Gradle" +APP_BASE_NAME=`basename "$0"` + +# Add default JVM options here. You can also use JAVA_OPTS and GRADLE_OPTS to pass JVM options to this script. +DEFAULT_JVM_OPTS='"-Xmx64m" "-Xms64m"' + +# Use the maximum available, or set MAX_FD != -1 to use that value. +MAX_FD="maximum" + +warn () { + echo "$*" +} + +die () { + echo + echo "$*" + echo + exit 1 +} + +# OS specific support (must be 'true' or 'false'). 
+cygwin=false +msys=false +darwin=false +nonstop=false +case "`uname`" in + CYGWIN* ) + cygwin=true + ;; + Darwin* ) + darwin=true + ;; + MSYS* | MINGW* ) + msys=true + ;; + NONSTOP* ) + nonstop=true + ;; +esac + +CLASSPATH=$APP_HOME/gradle/wrapper/gradle-wrapper.jar + + +# Determine the Java command to use to start the JVM. +if [ -n "$JAVA_HOME" ] ; then + if [ -x "$JAVA_HOME/jre/sh/java" ] ; then + # IBM's JDK on AIX uses strange locations for the executables + JAVACMD="$JAVA_HOME/jre/sh/java" + else + JAVACMD="$JAVA_HOME/bin/java" + fi + if [ ! -x "$JAVACMD" ] ; then + die "ERROR: JAVA_HOME is set to an invalid directory: $JAVA_HOME + +Please set the JAVA_HOME variable in your environment to match the +location of your Java installation." + fi +else + JAVACMD="java" + which java >/dev/null 2>&1 || die "ERROR: JAVA_HOME is not set and no 'java' command could be found in your PATH. + +Please set the JAVA_HOME variable in your environment to match the +location of your Java installation." +fi + +# Increase the maximum file descriptors if we can. +if [ "$cygwin" = "false" -a "$darwin" = "false" -a "$nonstop" = "false" ] ; then + MAX_FD_LIMIT=`ulimit -H -n` + if [ $? -eq 0 ] ; then + if [ "$MAX_FD" = "maximum" -o "$MAX_FD" = "max" ] ; then + MAX_FD="$MAX_FD_LIMIT" + fi + ulimit -n $MAX_FD + if [ $? -ne 0 ] ; then + warn "Could not set maximum file descriptor limit: $MAX_FD" + fi + else + warn "Could not query maximum file descriptor limit: $MAX_FD_LIMIT" + fi +fi + +# For Darwin, add options to specify how the application appears in the dock +if $darwin; then + GRADLE_OPTS="$GRADLE_OPTS \"-Xdock:name=$APP_NAME\" \"-Xdock:icon=$APP_HOME/media/gradle.icns\"" +fi + +# For Cygwin or MSYS, switch paths to Windows format before running java +if [ "$cygwin" = "true" -o "$msys" = "true" ] ; then + APP_HOME=`cygpath --path --mixed "$APP_HOME"` + CLASSPATH=`cygpath --path --mixed "$CLASSPATH"` + + JAVACMD=`cygpath --unix "$JAVACMD"` + + # We build the pattern for arguments to be converted via cygpath + ROOTDIRSRAW=`find -L / -maxdepth 1 -mindepth 1 -type d 2>/dev/null` + SEP="" + for dir in $ROOTDIRSRAW ; do + ROOTDIRS="$ROOTDIRS$SEP$dir" + SEP="|" + done + OURCYGPATTERN="(^($ROOTDIRS))" + # Add a user-defined pattern to the cygpath arguments + if [ "$GRADLE_CYGPATTERN" != "" ] ; then + OURCYGPATTERN="$OURCYGPATTERN|($GRADLE_CYGPATTERN)" + fi + # Now convert the arguments - kludge to limit ourselves to /bin/sh + i=0 + for arg in "$@" ; do + CHECK=`echo "$arg"|egrep -c "$OURCYGPATTERN" -` + CHECK2=`echo "$arg"|egrep -c "^-"` ### Determine if an option + + if [ $CHECK -ne 0 ] && [ $CHECK2 -eq 0 ] ; then ### Added a condition + eval `echo args$i`=`cygpath --path --ignore --mixed "$arg"` + else + eval `echo args$i`="\"$arg\"" + fi + i=`expr $i + 1` + done + case $i in + 0) set -- ;; + 1) set -- "$args0" ;; + 2) set -- "$args0" "$args1" ;; + 3) set -- "$args0" "$args1" "$args2" ;; + 4) set -- "$args0" "$args1" "$args2" "$args3" ;; + 5) set -- "$args0" "$args1" "$args2" "$args3" "$args4" ;; + 6) set -- "$args0" "$args1" "$args2" "$args3" "$args4" "$args5" ;; + 7) set -- "$args0" "$args1" "$args2" "$args3" "$args4" "$args5" "$args6" ;; + 8) set -- "$args0" "$args1" "$args2" "$args3" "$args4" "$args5" "$args6" "$args7" ;; + 9) set -- "$args0" "$args1" "$args2" "$args3" "$args4" "$args5" "$args6" "$args7" "$args8" ;; + esac +fi + +# Escape application args +save () { + for i do printf %s\\n "$i" | sed "s/'/'\\\\''/g;1s/^/'/;\$s/\$/' \\\\/" ; done + echo " " +} +APP_ARGS=`save "$@"` + +# Collect all arguments 
for the java command, following the shell quoting and substitution rules +eval set -- $DEFAULT_JVM_OPTS $JAVA_OPTS $GRADLE_OPTS "\"-Dorg.gradle.appname=$APP_BASE_NAME\"" -classpath "\"$CLASSPATH\"" org.gradle.wrapper.GradleWrapperMain "$APP_ARGS" + +exec "$JAVACMD" "$@" diff --git a/apps/android/ImageSuperResolution/gradlew.bat b/apps/android/ImageSuperResolution/gradlew.bat new file mode 100644 index 00000000..ac1b06f9 --- /dev/null +++ b/apps/android/ImageSuperResolution/gradlew.bat @@ -0,0 +1,89 @@ +@rem +@rem Copyright 2015 the original author or authors. +@rem +@rem Licensed under the Apache License, Version 2.0 (the "License"); +@rem you may not use this file except in compliance with the License. +@rem You may obtain a copy of the License at +@rem +@rem https://www.apache.org/licenses/LICENSE-2.0 +@rem +@rem Unless required by applicable law or agreed to in writing, software +@rem distributed under the License is distributed on an "AS IS" BASIS, +@rem WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +@rem See the License for the specific language governing permissions and +@rem limitations under the License. +@rem + +@if "%DEBUG%" == "" @echo off +@rem ########################################################################## +@rem +@rem Gradle startup script for Windows +@rem +@rem ########################################################################## + +@rem Set local scope for the variables with windows NT shell +if "%OS%"=="Windows_NT" setlocal + +set DIRNAME=%~dp0 +if "%DIRNAME%" == "" set DIRNAME=. +set APP_BASE_NAME=%~n0 +set APP_HOME=%DIRNAME% + +@rem Resolve any "." and ".." in APP_HOME to make it shorter. +for %%i in ("%APP_HOME%") do set APP_HOME=%%~fi + +@rem Add default JVM options here. You can also use JAVA_OPTS and GRADLE_OPTS to pass JVM options to this script. +set DEFAULT_JVM_OPTS="-Xmx64m" "-Xms64m" + +@rem Find java.exe +if defined JAVA_HOME goto findJavaFromJavaHome + +set JAVA_EXE=java.exe +%JAVA_EXE% -version >NUL 2>&1 +if "%ERRORLEVEL%" == "0" goto execute + +echo. +echo ERROR: JAVA_HOME is not set and no 'java' command could be found in your PATH. +echo. +echo Please set the JAVA_HOME variable in your environment to match the +echo location of your Java installation. + +goto fail + +:findJavaFromJavaHome +set JAVA_HOME=%JAVA_HOME:"=% +set JAVA_EXE=%JAVA_HOME%/bin/java.exe + +if exist "%JAVA_EXE%" goto execute + +echo. +echo ERROR: JAVA_HOME is set to an invalid directory: %JAVA_HOME% +echo. +echo Please set the JAVA_HOME variable in your environment to match the +echo location of your Java installation. + +goto fail + +:execute +@rem Setup the command line + +set CLASSPATH=%APP_HOME%\gradle\wrapper\gradle-wrapper.jar + + +@rem Execute Gradle +"%JAVA_EXE%" %DEFAULT_JVM_OPTS% %JAVA_OPTS% %GRADLE_OPTS% "-Dorg.gradle.appname=%APP_BASE_NAME%" -classpath "%CLASSPATH%" org.gradle.wrapper.GradleWrapperMain %* + +:end +@rem End local scope for the variables with windows NT shell +if "%ERRORLEVEL%"=="0" goto mainEnd + +:fail +rem Set variable GRADLE_EXIT_CONSOLE if you need the _script_ return code instead of +rem the _cmd.exe /c_ return code! 
+if not "" == "%GRADLE_EXIT_CONSOLE%" exit 1 +exit /b 1 + +:mainEnd +if "%OS%"=="Windows_NT" endlocal + +:omega diff --git a/apps/android/ImageSuperResolution/settings.gradle b/apps/android/ImageSuperResolution/settings.gradle new file mode 100644 index 00000000..667b31b5 --- /dev/null +++ b/apps/android/ImageSuperResolution/settings.gradle @@ -0,0 +1,27 @@ +pluginManagement { + repositories { + gradlePluginPortal() + google() + mavenCentral() + } +} + +dependencyResolutionManagement { + repositoriesMode.set(RepositoriesMode.FAIL_ON_PROJECT_REPOS) + repositories { + google() + mavenCentral() + maven { // add this repo to use snapshots + name 'ossrh-snapshot' + url 'https://oss.sonatype.org/content/repositories/snapshots' + + } + flatDir { + dirs 'libs' + } + } +} + + +rootProject.name = "superresolution" +include ':superresolution' diff --git a/apps/android/ImageSuperResolution/superresolution/build.gradle b/apps/android/ImageSuperResolution/superresolution/build.gradle new file mode 100644 index 00000000..115fe986 --- /dev/null +++ b/apps/android/ImageSuperResolution/superresolution/build.gradle @@ -0,0 +1,71 @@ + +plugins { + id 'com.android.application' +} + + + +android { + compileSdk 32 + Properties properties = new Properties() + properties.load(project.rootProject.file("build.properties").newDataInputStream()) + + defaultConfig { + applicationId "com.qcom.imagesuperres" + minSdk 26 + targetSdk 32 + versionCode 1 + versionName "1.0" + + testInstrumentationRunner "androidx.test.runner.AndroidJUnitRunner" + externalNativeBuild { + cmake { + cppFlags '' + } + } + + resValue "string", "modeltype", properties.getProperty("MODELTYPE", "") + resValue "string", "apptype", properties.getProperty("APPTYPE", "") + + } + aaptOptions { + noCompress "tflite" + } + + buildTypes { + release { + minifyEnabled false + proguardFiles getDefaultProguardFile('proguard-android-optimize.txt'), 'proguard-rules.pro' + } + } + compileOptions { + sourceCompatibility JavaVersion.VERSION_1_8 + targetCompatibility JavaVersion.VERSION_1_8 + } + + packagingOptions + { + doNotStrip "**/*.so" + } +} +project.ext.LIB_DIR = projectDir.toString() + '/libs/' +project.ext.ASSET_DIR = projectDir.toString() + '/src/main/assets' + +dependencies { + + implementation 'androidx.appcompat:appcompat:1.4.2' + implementation 'com.google.android.material:material:1.6.1' + implementation 'androidx.constraintlayout:constraintlayout:2.1.4' + testImplementation 'junit:junit:4.13.2' + androidTestImplementation 'androidx.test.ext:junit:1.1.3' + androidTestImplementation 'androidx.test.espresso:espresso-core:3.4.0' + implementation 'org.tensorflow:tensorflow-lite:2.13.0' + implementation 'org.tensorflow:tensorflow-lite-select-tf-ops:2.9.0' + implementation 'org.tensorflow:tensorflow-lite-support:0.4.3' + + implementation fileTree(dir: "libs", include: ["*.aar"]) + implementation files('libs/qtld-release') +// implementation 'org.pytorch:pytorch_android_lite:1.10.0' + + // implementation fileTree(dir: "libs", include: ["*.aar"]) +} diff --git a/apps/android/ImageSuperResolution/superresolution/proguard-rules.pro b/apps/android/ImageSuperResolution/superresolution/proguard-rules.pro new file mode 100644 index 00000000..36e00091 --- /dev/null +++ b/apps/android/ImageSuperResolution/superresolution/proguard-rules.pro @@ -0,0 +1,21 @@ +# Add project specific ProGuard rules here. +# You can control the set of applied configuration files using the +# proguardFiles setting in build.gradle. 
+# +# For more details, see +# http://developer.android.com/guide/developing/tools/proguard.html + +# If your project uses WebView with JS, uncomment the following +# and specify the fully qualified class name to the JavaScript interface +# class: +#-keepclassmembers class fqcn.of.javascript.interface.for.webview { +# public *; +#} + +# Uncomment this to preserve the line number information for +# debugging stack traces. +#-keepattributes SourceFile,LineNumberTable + +# If you keep the line number information, uncomment this to +# hide the original source file name. +#-renamesourcefileattribute SourceFile diff --git a/apps/android/ImageSuperResolution/superresolution/src/main/AndroidManifest.xml b/apps/android/ImageSuperResolution/superresolution/src/main/AndroidManifest.xml new file mode 100644 index 00000000..e4b778c7 --- /dev/null +++ b/apps/android/ImageSuperResolution/superresolution/src/main/AndroidManifest.xml @@ -0,0 +1,36 @@ + + + + + + + + + + + + + + + + + + + + diff --git a/apps/android/ImageSuperResolution/superresolution/src/main/assets/Sample1.jpg b/apps/android/ImageSuperResolution/superresolution/src/main/assets/Sample1.jpg new file mode 100644 index 00000000..7a647f20 --- /dev/null +++ b/apps/android/ImageSuperResolution/superresolution/src/main/assets/Sample1.jpg @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:811f0c08ff16ef506de4855ca2b609ba6c67622f5d65550896cdd7e60c200db2 +size 17244 diff --git a/apps/android/ImageSuperResolution/superresolution/src/main/assets/Sample2.jpg b/apps/android/ImageSuperResolution/superresolution/src/main/assets/Sample2.jpg new file mode 100644 index 00000000..84a8f5a3 --- /dev/null +++ b/apps/android/ImageSuperResolution/superresolution/src/main/assets/Sample2.jpg @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1916e20918eb069c147377b581647c0d86277fdf01ba8e2ed07cf995e2a86cee +size 13864 diff --git a/apps/android/ImageSuperResolution/superresolution/src/main/java/com/qcom/imagesuperres/QNNActivity.java b/apps/android/ImageSuperResolution/superresolution/src/main/java/com/qcom/imagesuperres/QNNActivity.java new file mode 100644 index 00000000..6f6fa9fb --- /dev/null +++ b/apps/android/ImageSuperResolution/superresolution/src/main/java/com/qcom/imagesuperres/QNNActivity.java @@ -0,0 +1,243 @@ +// --------------------------------------------------------------------- +// Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. 
+// SPDX-License-Identifier: BSD-3-Clause +// --------------------------------------------------------------------- +package com.qcom.imagesuperres; + +import android.graphics.Bitmap; +import android.graphics.BitmapFactory; +import android.os.Bundle; +import android.view.MotionEvent; +import android.view.View; +import android.view.WindowManager; +import android.widget.AdapterView; +import android.widget.ArrayAdapter; +import android.widget.ImageView; +import android.widget.ProgressBar; +import android.widget.RadioGroup; +import android.widget.Spinner; +import android.widget.TextView; +import android.widget.Toast; +import androidx.appcompat.app.AppCompatActivity; +import java.io.IOException; +import java.io.InputStream; +import java.util.concurrent.ExecutorService; +import java.util.concurrent.Executors; + +public class QNNActivity extends AppCompatActivity { + public static final String MODEL_FILE_NAME = "superresmodel.tflite"; //Model file name + public static InputStream originalFile = null; + + private boolean spinInitialized = false; + private boolean radioGroupInitialized = false; + SuperResolution superResolution; + + String prev_runtime = ""; + //creating objects for UI element used in layout files (activity_superres.xml) + TextView txt_stat, tx_pr, tx_out, tx_sug; + private static int input_dims1 = 128; + private static int input_dims2 = 128; + ImageView imageView, imageView2; + RadioGroup radioGroup; + Bitmap bmps = null; + public static Result result = null; + Spinner spin; + String[] options = {"No Selection","Sample1.jpg","Sample2.jpg"}; //Image filenames on which model inference is made + protected void executeRadioButton(int checkedId) { + + ProgressBar progressBar; + progressBar = findViewById(R.id.indeterminateBar); + ExecutorService service = Executors.newSingleThreadExecutor(); + progressBar.setVisibility(View.VISIBLE); + getWindow().setFlags(WindowManager.LayoutParams.FLAG_NOT_TOUCHABLE, + WindowManager.LayoutParams.FLAG_NOT_TOUCHABLE); + + service.execute(new Runnable() { + @Override + public void run() { + try { + switch (checkedId) { + case R.id.rb1: + // set text for your textview here + System.out.println("CPU instance running"); + result = process(bmps, "TFLITE"); + break; + + case R.id.rb3: + System.out.println("NPU instance running"); + result = process(bmps, "QNNDELEGATE"); + break; + default: + System.out.println("Do Nothing"); + } + boolean final_status = result.getStatus(); + final String final_timestr = "INFERENCE TIME: "+ String.valueOf(result.getInferenceTime())+" ms"; + runOnUiThread(new Runnable() { + @Override + public void run() { + txt_stat.setText(final_timestr); + progressBar.setVisibility(View.INVISIBLE); + getWindow().clearFlags(WindowManager.LayoutParams.FLAG_NOT_TOUCHABLE); + if (final_status == true) { + String remark = result.getRemarks(); + if(!remark.equals("")) + Toast.makeText(getApplicationContext(),remark,Toast.LENGTH_LONG).show(); + imageView2.setImageBitmap(result.getResults().getHighResolutionImages()[0]); + imageView2.setVisibility(View.VISIBLE); + System.out.println("result displayed"); + txt_stat.setVisibility(View.VISIBLE); + tx_pr.setVisibility(View.INVISIBLE); + tx_out.setVisibility(View.VISIBLE); + tx_sug.setVisibility(View.VISIBLE); + } + } + }); + } catch (Exception e) { + runOnUiThread(new Runnable() { + @Override + public void run() { + getWindow().clearFlags(WindowManager.LayoutParams.FLAG_NOT_TOUCHABLE); + e.printStackTrace(); + } + }); + } + } + }); + } + @Override + protected void onCreate(Bundle savedInstanceState) { + 
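+        // onCreate wires up the views declared in activity_superres.xml (sample-image spinner,
+        // runtime radio buttons, input/output ImageViews, status text) and keeps the output
+        // views hidden until an inference has produced a result.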
super.onCreate(savedInstanceState); + setContentView(R.layout.activity_superres); + spin = (Spinner) findViewById((R.id.spinner)); + txt_stat = findViewById(R.id.textView4); + imageView = findViewById(R.id.im1); + imageView2 = findViewById(R.id.im2); + radioGroup = findViewById(R.id.rg1); + tx_pr = findViewById(R.id.textView); + tx_out = findViewById(R.id.textView2); + tx_sug = findViewById(R.id.textView_suggest); + imageView2.setVisibility(View.INVISIBLE); + tx_out.setVisibility(View.INVISIBLE); + tx_sug.setVisibility(View.INVISIBLE); + + superResolution = new SuperResolution(); + + imageView2.setOnTouchListener((view, motionEvent) -> { + switch (motionEvent.getAction()) { + case MotionEvent.ACTION_DOWN: { + imageView2.setVisibility(view.INVISIBLE); + tx_out.setVisibility(view.INVISIBLE); + tx_pr.setVisibility(view.VISIBLE); + break; + } + case MotionEvent.ACTION_UP: { + imageView2.setVisibility(view.VISIBLE); + tx_out.setVisibility(view.VISIBLE); + tx_pr.setVisibility(view.INVISIBLE); + break; + } + } + return false; + }); + + ArrayAdapter ad = new ArrayAdapter(this, android.R.layout.simple_spinner_item, options); + ad.setDropDownViewResource(android.R.layout.simple_spinner_dropdown_item); + spin.setAdapter(ad); + spin.setOnItemSelectedListener(new AdapterView.OnItemSelectedListener() { + @Override + public void onItemSelected(AdapterView parent, View view, int position, long id) { + // loading picture from assets... + if (!parent.getItemAtPosition(position).equals("No Selection")) { + imageView2.setImageResource(R.drawable.ic_launcher_background); + txt_stat.setText("Stats"); + try { + originalFile = getAssets().open((String) parent.getItemAtPosition(position)); + } catch (IOException e) { + e.printStackTrace(); + } + + // Convert input image to Bitmap + bmps = BitmapFactory.decodeStream(originalFile); + Bitmap scaled1 = Bitmap.createScaledBitmap(bmps, input_dims1, input_dims2, true); + try { + // Set the input image in UI view + imageView.setImageBitmap(scaled1); + + } catch (Exception e) { + e.printStackTrace(); + } + int checkedID_RB = radioGroup.getCheckedRadioButtonId(); + if (originalFile!=null && bmps!=null && checkedID_RB !=-1){ + executeRadioButton(checkedID_RB); + } + + } + else{ + originalFile=null; + bmps=null; + imageView.setImageResource(R.drawable.ic_launcher_background); + imageView2.setImageResource(R.drawable.ic_launcher_background); + imageView2.setVisibility(view.INVISIBLE); + txt_stat.setText("Stats"); + radioGroup.clearCheck(); + + if(spinInitialized){ + Toast.makeText(getApplicationContext(), "Please select image first", Toast.LENGTH_SHORT).show(); + } + else + { + spinInitialized = true; + } + } + } + @Override + public void onNothingSelected(AdapterView parent) { + System.out.println("Nothing"); + } + }); + + radioGroup.setOnCheckedChangeListener(new RadioGroup.OnCheckedChangeListener() { + @Override + public void onCheckedChanged(RadioGroup group, int checkedId) { + if (originalFile!=null && bmps!=null){ + executeRadioButton(checkedId); + } + else{ + if(radioGroupInitialized) { + Toast.makeText(getApplicationContext(), "Please select image first", Toast.LENGTH_SHORT).show(); + } + else + { + radioGroupInitialized = true; + } + } + } + }); + } + + public Result process(Bitmap bmps, String run_time) { + + Result result; + try { + + if(superResolution.getBuildStatus()==false) + superResolution.initializeModel(this, MODEL_FILE_NAME); + + //INFERENCING ON MODEL + result = superResolution.inference(new Bitmap[]{bmps}, run_time); + return result; + + } catch 
(Exception e) { + e.printStackTrace(); + return null; + } + + } + + @Override + protected void onDestroy() + { + super.onDestroy(); + superResolution.close(); + } +} diff --git a/apps/android/ImageSuperResolution/superresolution/src/main/java/com/qcom/imagesuperres/Result.java b/apps/android/ImageSuperResolution/superresolution/src/main/java/com/qcom/imagesuperres/Result.java new file mode 100644 index 00000000..17c5f909 --- /dev/null +++ b/apps/android/ImageSuperResolution/superresolution/src/main/java/com/qcom/imagesuperres/Result.java @@ -0,0 +1,38 @@ +// --------------------------------------------------------------------- +// Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. +// SPDX-License-Identifier: BSD-3-Clause +// --------------------------------------------------------------------- +package com.qcom.imagesuperres; +import java.util.List; + +public class Result { + + private final E results; + private final long inferenceTime; + private final String remarks; + private boolean status = false; + public Result(E results, long inferenceTime,String remarks) { + + this.results = results; + this.inferenceTime = inferenceTime; + this.remarks = remarks; + + + if(inferenceTime>0) this.status = true; + } + + public E getResults() { + return results; + } + + public String getRemarks() { + return remarks; + } + + public long getInferenceTime() { + return inferenceTime; + } + + public boolean getStatus(){return status; } + +} diff --git a/apps/android/ImageSuperResolution/superresolution/src/main/java/com/qcom/imagesuperres/SuperResolution.java b/apps/android/ImageSuperResolution/superresolution/src/main/java/com/qcom/imagesuperres/SuperResolution.java new file mode 100644 index 00000000..3b96a77f --- /dev/null +++ b/apps/android/ImageSuperResolution/superresolution/src/main/java/com/qcom/imagesuperres/SuperResolution.java @@ -0,0 +1,165 @@ +// --------------------------------------------------------------------- +// Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. 
+// SPDX-License-Identifier: BSD-3-Clause +// --------------------------------------------------------------------- +package com.qcom.imagesuperres; + + +import android.content.Context; +import android.graphics.Bitmap; +import java.io.IOException; +import java.nio.MappedByteBuffer; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import com.qualcomm.qti.QnnDelegate; +import org.tensorflow.lite.Interpreter; +import android.content.res.AssetFileDescriptor; +import android.content.res.AssetManager; +import android.util.Log; +import android.widget.ImageView; +import java.io.FileInputStream; +import java.nio.channels.FileChannel; + +public class SuperResolution { + private MappedByteBuffer tfLiteModel; + private Interpreter tfLite; + private Interpreter tfLite_QNN; + + private boolean model_loaded= false; + + private QnnDelegate qnnDelegate = null; + private static final String TAG = "SUPERRES"; + private static Utils util = new Utils(); + + + private static MappedByteBuffer loadModelFile(AssetManager assets, String modelFilename) + throws IOException { + AssetFileDescriptor fileDescriptor = assets.openFd(modelFilename); + FileInputStream inputStream = new FileInputStream(fileDescriptor.getFileDescriptor()); + FileChannel fileChannel = inputStream.getChannel(); + long startOffset = fileDescriptor.getStartOffset(); + long declaredLength = fileDescriptor.getDeclaredLength(); + return fileChannel.map(FileChannel.MapMode.READ_ONLY, startOffset, declaredLength); + } + + public void close() + { + if(qnnDelegate!=null) { + qnnDelegate.close(); + } + + if(tfLite != null){ + tfLite.close(); + } + + if(tfLiteModel!=null) + tfLiteModel.clear(); + } + + public boolean getBuildStatus() + { + return model_loaded; + } + public boolean initializeModel(Context context, String tflitemodelfileName) { + + //If modeltype is in Red, it will resolved after building the app + String kk = context.getString(R.string.modeltype); + Log.i(TAG,"MY STRING FROM GetProperty is : "+kk); + + if(kk == "ESRGAN") + util = new UtilsESRGAN(); + + try { + tfLiteModel = loadModelFile(context.getApplicationContext().getAssets(), tflitemodelfileName); + Log.i(TAG, "MODEL LOADED"); + Interpreter.Options tfLiteOptions = new Interpreter.Options(); + tfLiteOptions.setNumThreads(4); + tfLiteOptions.setUseXNNPACK(true); + tfLite = new Interpreter(tfLiteModel, tfLiteOptions); + + QnnDelegate.Options options = new QnnDelegate.Options(); + options.setBackendType(QnnDelegate.Options.BackendType.HTP_BACKEND); + options.setHtpPerformanceMode(QnnDelegate.Options.HtpPerformanceMode.HTP_PERFORMANCE_BURST); + options.setHtpPrecision(QnnDelegate.Options.HtpPrecision.HTP_PRECISION_FP16); + + Log.i(TAG, "NATIVE LIB PATH: " + context.getApplicationInfo().nativeLibraryDir); + options.setSkelLibraryDir(context.getApplicationInfo().nativeLibraryDir); + qnnDelegate = new QnnDelegate(options); + tfLiteOptions.addDelegate(qnnDelegate); + tfLite_QNN = new Interpreter(tfLiteModel,tfLiteOptions); + Log.i(TAG, "QnnDelegate Option Added "); + model_loaded= true; + return true; + } + catch (Exception e) + { + e.printStackTrace(); + return false; + } + + } + + public Result inference(Bitmap[] images, String backend) { + System.out.println("Processing %d images %dx%d."+ images.length+ images[0].getWidth()+ images[0].getHeight()); + String remarks = ""; + try{ + int[] arr = tfLite.getInputTensor(0).shape(); + int input_dims1 = arr[1]; + int input_dims2 = arr[2]; + + 
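+            // The demo assumes a square (1:1) model input; anything else is flagged via the
+            // remarks string so the UI can show a warning alongside the result.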
if(input_dims1!=input_dims2) + { + remarks = "THIS APP IS DESIGNED FOR 1:1 ASPECT RATIO"; + } + //PREPROCESSING INPUT to Model input Shape and Normalizing data + Bitmap scaledBitmap = Bitmap.createScaledBitmap(images[0],input_dims1,input_dims2,true); + float[][][][] floatinputarray = new float[1][input_dims1][input_dims2][3]; + util.PreProcess(scaledBitmap,input_dims1,input_dims2,floatinputarray); + + Object[] inputArray = {floatinputarray}; + int[] out_arr = tfLite.getOutputTensor(0).shape(); + int output_dims1 = out_arr[1]; + int output_dims2 = out_arr[2]; + + float[][][][] floatoutputarray = new float[1][output_dims1][output_dims2][3]; + Map outputMap = new HashMap<>(); + outputMap.put(0, floatoutputarray); + + Log.i(TAG, "inputTensor shape"+ Arrays.toString(tfLite.getInputTensor(0).shape())); + long inferenceStartTime = System.nanoTime(); + if (backend.equals("QNNDELEGATE") && tfLite_QNN != null) { + System.out.println("QNN BACKEND"); + tfLite_QNN.runForMultipleInputsOutputs(inputArray, outputMap); + } + else if (backend.equals("TFLITE") && tfLite != null) { + System.out.println("TFLITE BACKEND"); + tfLite.runForMultipleInputsOutputs(inputArray, outputMap); + } + else + { + System.out.println("PROBLEM WITH Model Iinitilization"); + } + long inferenceEndTime = System.nanoTime(); + Log.i(TAG,"MODEL EXECUTED"); + System.out.println("Inference time: "+ (inferenceEndTime - inferenceStartTime) / 1000);// calculated inference time + + + Bitmap outbmp = Bitmap.createBitmap(output_dims1, output_dims2, Bitmap.Config.ARGB_8888); + util.PostProcess(outbmp, output_dims1, output_dims2, floatoutputarray); + + Bitmap[] finalProcessedImages = new Bitmap[images.length]; + finalProcessedImages[0] = outbmp; + + SuperResolutionResult result = new SuperResolutionResult(finalProcessedImages); + return new Result<>(result, + (inferenceEndTime - inferenceStartTime) / 1000000, remarks); + + } catch (Exception ex) { + ex.printStackTrace(); + } + return null; + } +} diff --git a/apps/android/ImageSuperResolution/superresolution/src/main/java/com/qcom/imagesuperres/SuperResolutionResult.java b/apps/android/ImageSuperResolution/superresolution/src/main/java/com/qcom/imagesuperres/SuperResolutionResult.java new file mode 100644 index 00000000..6e825468 --- /dev/null +++ b/apps/android/ImageSuperResolution/superresolution/src/main/java/com/qcom/imagesuperres/SuperResolutionResult.java @@ -0,0 +1,19 @@ +// --------------------------------------------------------------------- +// Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. 
+// SPDX-License-Identifier: BSD-3-Clause +// --------------------------------------------------------------------- +package com.qcom.imagesuperres; + +import android.graphics.Bitmap; + +public class SuperResolutionResult { + private final Bitmap[] highResolutionImages; + + public SuperResolutionResult(Bitmap[] highResolutionImages) { + this.highResolutionImages = highResolutionImages; + } + + public Bitmap[] getHighResolutionImages() { + return highResolutionImages; + } +} diff --git a/apps/android/ImageSuperResolution/superresolution/src/main/java/com/qcom/imagesuperres/Utils.java b/apps/android/ImageSuperResolution/superresolution/src/main/java/com/qcom/imagesuperres/Utils.java new file mode 100644 index 00000000..8aa6a2f8 --- /dev/null +++ b/apps/android/ImageSuperResolution/superresolution/src/main/java/com/qcom/imagesuperres/Utils.java @@ -0,0 +1,36 @@ +// --------------------------------------------------------------------- +// Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. +// SPDX-License-Identifier: BSD-3-Clause +// --------------------------------------------------------------------- +package com.qcom.imagesuperres; + +import android.graphics.Bitmap; +import android.graphics.Color; + +public class Utils { + + public void PreProcess(Bitmap inputBitmap, int input_dims1, int input_dims2, float[][][][] floatinputarray){ + for (int x = 0; x < input_dims1; x++) { + for (int y = 0; y < input_dims2; y++) { + int pixel = inputBitmap.getPixel(x, y); + // Normalize channel values to [-1.0, 1.0]. Here, pixel values + // are positive so the effective range will be [0.0, 1.0] + floatinputarray[0][x][y][0] = (Color.red(pixel))/255.0f; + floatinputarray[0][x][y][1] = (Color.green(pixel))/255.0f; + floatinputarray[0][x][y][2] = (Color.blue(pixel))/255.0f; + } + } + } + + public void PostProcess(Bitmap outbmp, int output_dims1, int output_dims2, float[][][][] floatoutputarray) { + for (int x = 0; x < output_dims1; x++) { + for (int y = 0; y < output_dims2; y++) { + int red = (int) (Math.max(0, Math.min(255, floatoutputarray[0][x][y][0] * 255))); + int green = (int) (Math.max(0, Math.min(255, floatoutputarray[0][x][y][1] * 255))); + int blue = (int) (Math.max(0, Math.min(255, floatoutputarray[0][x][y][2] * 255))); + int color = Color.argb(255, red, green, blue); + outbmp.setPixel(x, y, color); + } + } + } +} diff --git a/apps/android/ImageSuperResolution/superresolution/src/main/java/com/qcom/imagesuperres/UtilsESRGAN.java b/apps/android/ImageSuperResolution/superresolution/src/main/java/com/qcom/imagesuperres/UtilsESRGAN.java new file mode 100644 index 00000000..80f3f877 --- /dev/null +++ b/apps/android/ImageSuperResolution/superresolution/src/main/java/com/qcom/imagesuperres/UtilsESRGAN.java @@ -0,0 +1,35 @@ +// --------------------------------------------------------------------- +// Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. 
+// SPDX-License-Identifier: BSD-3-Clause +// --------------------------------------------------------------------- +package com.qcom.imagesuperres; + +import android.graphics.Bitmap; +import android.graphics.Color; +import android.util.Log; + +public class UtilsESRGAN extends Utils{ + + public void PreProcess(Bitmap inputBitmap, int input_dims1, int input_dims2, float[][][][] floatinputarray){ + for (int x = 0; x < input_dims1; x++) { + for (int y = 0; y < input_dims2; y++) { + int pixel = inputBitmap.getPixel(x, y); + floatinputarray[0][x][y][0] = Color.red(pixel); + floatinputarray[0][x][y][1] = Color.green(pixel); + floatinputarray[0][x][y][2] = Color.blue(pixel); + } + } + } + + public void PostProcess(Bitmap outbmp, int output_dims1, int output_dims2, float[][][][] floatoutputarray) { + for (int x = 0; x < output_dims1; x++) { + for (int y = 0; y < output_dims2; y++) { + int red = (int) (Math.max(0, Math.min(255, floatoutputarray[0][x][y][0]))); + int green = (int) (Math.max(0, Math.min(255, floatoutputarray[0][x][y][1]))); + int blue = (int) (Math.max(0, Math.min(255, floatoutputarray[0][x][y][2]))); + int color = Color.argb(255, red, green, blue); + outbmp.setPixel(x, y, color); + } + } + } +} diff --git a/apps/android/ImageSuperResolution/superresolution/src/main/res/drawable-v24/ic_launcher_foreground.xml b/apps/android/ImageSuperResolution/superresolution/src/main/res/drawable-v24/ic_launcher_foreground.xml new file mode 100644 index 00000000..1ff1154f --- /dev/null +++ b/apps/android/ImageSuperResolution/superresolution/src/main/res/drawable-v24/ic_launcher_foreground.xml @@ -0,0 +1,30 @@ + + + + + + + + + + + diff --git a/apps/android/ImageSuperResolution/superresolution/src/main/res/drawable/ic_launcher_background.xml b/apps/android/ImageSuperResolution/superresolution/src/main/res/drawable/ic_launcher_background.xml new file mode 100644 index 00000000..a4f78de5 --- /dev/null +++ b/apps/android/ImageSuperResolution/superresolution/src/main/res/drawable/ic_launcher_background.xml @@ -0,0 +1,170 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/apps/android/ImageSuperResolution/superresolution/src/main/res/layout/activity_superres.xml b/apps/android/ImageSuperResolution/superresolution/src/main/res/layout/activity_superres.xml new file mode 100644 index 00000000..11ccc7fe --- /dev/null +++ b/apps/android/ImageSuperResolution/superresolution/src/main/res/layout/activity_superres.xml @@ -0,0 +1,140 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/apps/android/ImageSuperResolution/superresolution/src/main/res/mipmap-anydpi-v26/ic_launcher.xml b/apps/android/ImageSuperResolution/superresolution/src/main/res/mipmap-anydpi-v26/ic_launcher.xml new file mode 100644 index 00000000..3564f5b0 --- /dev/null +++ b/apps/android/ImageSuperResolution/superresolution/src/main/res/mipmap-anydpi-v26/ic_launcher.xml @@ -0,0 +1,5 @@ + + + + + diff --git a/apps/android/ImageSuperResolution/superresolution/src/main/res/mipmap-anydpi-v26/ic_launcher_round.xml b/apps/android/ImageSuperResolution/superresolution/src/main/res/mipmap-anydpi-v26/ic_launcher_round.xml new file mode 100644 index 00000000..0351084b --- /dev/null +++ b/apps/android/ImageSuperResolution/superresolution/src/main/res/mipmap-anydpi-v26/ic_launcher_round.xml @@ -0,0 +1,6 @@ + + + + + + diff --git a/apps/android/ImageSuperResolution/superresolution/src/main/res/mipmap-hdpi/ic_launcher.png 
b/apps/android/ImageSuperResolution/superresolution/src/main/res/mipmap-hdpi/ic_launcher.png new file mode 100644 index 00000000..1ae43951 --- /dev/null +++ b/apps/android/ImageSuperResolution/superresolution/src/main/res/mipmap-hdpi/ic_launcher.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5ee6ca2903e3094d64110dee90aea432f9eb9bc747c7e5c134496b8f7feff3b8 +size 3593 diff --git a/apps/android/ImageSuperResolution/superresolution/src/main/res/mipmap-hdpi/ic_launcher_round.png b/apps/android/ImageSuperResolution/superresolution/src/main/res/mipmap-hdpi/ic_launcher_round.png new file mode 100644 index 00000000..e898edde --- /dev/null +++ b/apps/android/ImageSuperResolution/superresolution/src/main/res/mipmap-hdpi/ic_launcher_round.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:876e35db6919f81d28dbe042d8535414b2b3bb13d1139c16265f652ca5df65ac +size 5339 diff --git a/apps/android/ImageSuperResolution/superresolution/src/main/res/mipmap-mdpi/ic_launcher.png b/apps/android/ImageSuperResolution/superresolution/src/main/res/mipmap-mdpi/ic_launcher.png new file mode 100644 index 00000000..10afb4e6 --- /dev/null +++ b/apps/android/ImageSuperResolution/superresolution/src/main/res/mipmap-mdpi/ic_launcher.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:62c4db47abd938c35f4926ea0b7b31b9d6c41bef1ddacb2c7685b5c6ea0890e5 +size 2636 diff --git a/apps/android/ImageSuperResolution/superresolution/src/main/res/mipmap-mdpi/ic_launcher_round.png b/apps/android/ImageSuperResolution/superresolution/src/main/res/mipmap-mdpi/ic_launcher_round.png new file mode 100644 index 00000000..dd1c781b --- /dev/null +++ b/apps/android/ImageSuperResolution/superresolution/src/main/res/mipmap-mdpi/ic_launcher_round.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b9bc0ce206c8715db22eacfbceddff820f9b8a0ef3a519bbb88f7b6e65806d71 +size 3388 diff --git a/apps/android/ImageSuperResolution/superresolution/src/main/res/mipmap-xhdpi/ic_launcher.png b/apps/android/ImageSuperResolution/superresolution/src/main/res/mipmap-xhdpi/ic_launcher.png new file mode 100644 index 00000000..cc082f9a --- /dev/null +++ b/apps/android/ImageSuperResolution/superresolution/src/main/res/mipmap-xhdpi/ic_launcher.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2a5e85ebec8c77c18f5c34d762949674373c7a95de57a8a82ce165c8db8cedbb +size 4926 diff --git a/apps/android/ImageSuperResolution/superresolution/src/main/res/mipmap-xhdpi/ic_launcher_round.png b/apps/android/ImageSuperResolution/superresolution/src/main/res/mipmap-xhdpi/ic_launcher_round.png new file mode 100644 index 00000000..3910511a --- /dev/null +++ b/apps/android/ImageSuperResolution/superresolution/src/main/res/mipmap-xhdpi/ic_launcher_round.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:81b31d4cabd3e4358db2d99e6c31474b82ae6379dc06ea0c00267f41da1bd599 +size 7472 diff --git a/apps/android/ImageSuperResolution/superresolution/src/main/res/mipmap-xxhdpi/ic_launcher.png b/apps/android/ImageSuperResolution/superresolution/src/main/res/mipmap-xxhdpi/ic_launcher.png new file mode 100644 index 00000000..db7bbe05 --- /dev/null +++ b/apps/android/ImageSuperResolution/superresolution/src/main/res/mipmap-xxhdpi/ic_launcher.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:01cec2d4d6cc59f250e4bbfa445042c3a3b9ebf9788b9f34796e85c0af874da4 +size 7909 diff --git 
a/apps/android/ImageSuperResolution/superresolution/src/main/res/mipmap-xxhdpi/ic_launcher_round.png b/apps/android/ImageSuperResolution/superresolution/src/main/res/mipmap-xxhdpi/ic_launcher_round.png new file mode 100644 index 00000000..17327017 --- /dev/null +++ b/apps/android/ImageSuperResolution/superresolution/src/main/res/mipmap-xxhdpi/ic_launcher_round.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bf24682fa050affbc3bdaae15cd5532c13758a5a0e6d5305cfad52ddefc4d571 +size 11873 diff --git a/apps/android/ImageSuperResolution/superresolution/src/main/res/mipmap-xxxhdpi/ic_launcher.png b/apps/android/ImageSuperResolution/superresolution/src/main/res/mipmap-xxxhdpi/ic_launcher.png new file mode 100644 index 00000000..6413db3b --- /dev/null +++ b/apps/android/ImageSuperResolution/superresolution/src/main/res/mipmap-xxxhdpi/ic_launcher.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b2f8a8d39587eb912f8d217f2160c4ba4e21a60fd7a6f84849c102bcac725975 +size 10652 diff --git a/apps/android/ImageSuperResolution/superresolution/src/main/res/mipmap-xxxhdpi/ic_launcher_round.png b/apps/android/ImageSuperResolution/superresolution/src/main/res/mipmap-xxxhdpi/ic_launcher_round.png new file mode 100644 index 00000000..5d1c9ba8 --- /dev/null +++ b/apps/android/ImageSuperResolution/superresolution/src/main/res/mipmap-xxxhdpi/ic_launcher_round.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d7ed7010aa67cb843af6df357ff1a74ec1598d2407e4884d12633aa0f11e3a4b +size 16570 diff --git a/apps/android/ImageSuperResolution/superresolution/src/main/res/values-night/themes.xml b/apps/android/ImageSuperResolution/superresolution/src/main/res/values-night/themes.xml new file mode 100644 index 00000000..a9e583cb --- /dev/null +++ b/apps/android/ImageSuperResolution/superresolution/src/main/res/values-night/themes.xml @@ -0,0 +1,17 @@ + + + + + diff --git a/apps/android/ImageSuperResolution/superresolution/src/main/res/values/colors.xml b/apps/android/ImageSuperResolution/superresolution/src/main/res/values/colors.xml new file mode 100644 index 00000000..977bb9f2 --- /dev/null +++ b/apps/android/ImageSuperResolution/superresolution/src/main/res/values/colors.xml @@ -0,0 +1,11 @@ + + + + #FFBB86FC + #FF6200EE + #FF3700B3 + #FF03DAC5 + #FF018786 + #FF000000 + #FFFFFFFF + diff --git a/apps/android/ImageSuperResolution/superresolution/src/main/res/values/strings.xml b/apps/android/ImageSuperResolution/superresolution/src/main/res/values/strings.xml new file mode 100644 index 00000000..cdac71e2 --- /dev/null +++ b/apps/android/ImageSuperResolution/superresolution/src/main/res/values/strings.xml @@ -0,0 +1,4 @@ + + + Super_Resolution + diff --git a/apps/android/ImageSuperResolution/superresolution/src/main/res/values/themes.xml b/apps/android/ImageSuperResolution/superresolution/src/main/res/values/themes.xml new file mode 100644 index 00000000..e69989a9 --- /dev/null +++ b/apps/android/ImageSuperResolution/superresolution/src/main/res/values/themes.xml @@ -0,0 +1,18 @@ + + + + + + diff --git a/qai_hub_models/_version.py b/qai_hub_models/_version.py index 9b229525..1fab5070 100644 --- a/qai_hub_models/_version.py +++ b/qai_hub_models/_version.py @@ -2,4 +2,4 @@ # Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. 
# SPDX-License-Identifier: BSD-3-Clause # --------------------------------------------------------------------- -__version__ = "0.3.2" +__version__ = "0.4.0" diff --git a/qai_hub_models/asset_bases.yaml b/qai_hub_models/asset_bases.yaml index 1b110e24..124db058 100644 --- a/qai_hub_models/asset_bases.yaml +++ b/qai_hub_models/asset_bases.yaml @@ -1,7 +1,7 @@ store_url: https://qaihub-public-assets.s3.us-west-2.amazonaws.com/qai-hub-models web_asset_folder: models/{model_id}/web-assets -static_web_banner_filename: banner.png -animated_web_banner_filename: banner.mp4 +static_web_banner_filename: model_demo.png +animated_web_banner_filename: model_demo.mp4 model_asset_folder: models/{model_id}/v{version} dataset_asset_folder: datasets/{dataset_id}/v{version} repo_url: https://github.com/quic/ai-hub-models/blob/main diff --git a/qai_hub_models/conftest.py b/qai_hub_models/conftest.py index 6f57c5d0..9dd11824 100644 --- a/qai_hub_models/conftest.py +++ b/qai_hub_models/conftest.py @@ -6,3 +6,10 @@ def pytest_configure(config): config.addinivalue_line("markers", "compile: Run compile tests.") config.addinivalue_line("markers", "profile: Run profile tests.") config.addinivalue_line("markers", "inference: Run inference tests.") + config.addinivalue_line("markers", "trace: Run trace accuracy tests.") + + +def pytest_collection_modifyitems(items, config): + for item in items: + if not any(item.iter_markers()): + item.add_marker("unmarked") diff --git a/qai_hub_models/global_requirements.txt b/qai_hub_models/global_requirements.txt new file mode 100644 index 00000000..cdfa95ed --- /dev/null +++ b/qai_hub_models/global_requirements.txt @@ -0,0 +1,44 @@ +# If you: +# - Install requirements.txt +# - Run the aimet installation script +# - Then install this requirements file +# That should create an environment that works for every single model. 
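+#
+# This file itself is installed with `pip install -r qai_hub_models/global_requirements.txt`;
+# most packages below are pinned to exact versions so that the shared environment stays
+# reproducible.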
+ +PySoundFile; sys_platform == 'win32' +albumentations==0.5.2 +av==10.0.0 +basicsr==1.4.2 +click==8.0 +datasets==2.14.5 +diffusers[torch]==0.21.4 +easydict==1.10 +ffmpeg==1.4 +ftfy==6.1.1 +hydra-core==1.3.0 +imageio[ffmpeg]==2.31.5 +kornia==0.5.0 +librosa==0.10.1 +matplotlib==3.7.4 +mmcv==2.1.0 +mmdet==3.2.0 +mmpose==1.2.0 +openai-whisper==20230314 +pycocotools==2.0.7 +pytorch-lightning==1.6.0 +regex==2023.12.25 +scikit-image==0.21.0 +scikit-learn==1.1.3 +scipy==1.8.1 +seaborn==0.11.0 +sentencepiece==0.2.0 +soundfile==0.12.1 +tflite==2.10.0 +thop==0.1.1.post2209072238 +timm==0.9.11 +tensorboard==2.13.0 +torchaudio==0.13.1 +transformers==4.27.4 +tucker-conv==1.0.1 +ultralytics==8.0.193 +webdataset==0.2.86 +yacs==0.1.8 diff --git a/qai_hub_models/models/_shared/cityscapes_segmentation/demo.py b/qai_hub_models/models/_shared/cityscapes_segmentation/demo.py index 4fb3518d..25921362 100644 --- a/qai_hub_models/models/_shared/cityscapes_segmentation/demo.py +++ b/qai_hub_models/models/_shared/cityscapes_segmentation/demo.py @@ -49,7 +49,7 @@ def cityscapes_segmentation_demo( help="File path or URL to an input image to use for the demo.", ) args = parser.parse_args([] if is_test else None) - validate_on_device_demo_args(args, model_type.get_model_id()) + validate_on_device_demo_args(args, model_id) if args.image is None: image = TEST_CITYSCAPES_LIKE_IMAGE_ASSET.fetch() @@ -60,7 +60,7 @@ def cityscapes_segmentation_demo( input_spec = model_type.get_input_spec() - inference_model = demo_model_from_cli_args(model_type, args) + inference_model = demo_model_from_cli_args(model_type, model_id, args) app = CityscapesSegmentationApp(inference_model) (_, _, height, width) = input_spec["image"][0] diff --git a/qai_hub_models/models/_shared/common.py b/qai_hub_models/models/_shared/common.py index 5e2038d4..af10bca3 100644 --- a/qai_hub_models/models/_shared/common.py +++ b/qai_hub_models/models/_shared/common.py @@ -2,11 +2,33 @@ # Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. # SPDX-License-Identifier: BSD-3-Clause # --------------------------------------------------------------------- -from typing import Type +from __future__ import annotations + +from typing import Callable, Type import torch +def apply_module_function_recursively( + module: torch.nn.Module, + tgt_cls: Type[torch.nn.Module], + apply_fn: Callable[torch.nn.Module, torch.nn.Module, str], + parent_module: Type[torch.nn.Module] = None, +): + """ + Recursively calls a function on all modules of a given type. + + The function `apply_fn` passes in the module, the parent module, and the + name of the module inside the parent module. 
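+
+    For example, `replace_module_recursively` below uses this helper with an `apply_fn`
+    that swaps each matching child module for a new instance via `setattr`.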
+ """ + for name, child in module.named_children(): + if isinstance(child, tgt_cls): + if parent_module is None or isinstance(module, parent_module): + apply_fn(child, module, name) + else: + apply_module_function_recursively(child, tgt_cls, apply_fn, parent_module) + + def replace_module_recursively( module: torch.nn.Module, tgt_cls: Type[torch.nn.Module], @@ -18,9 +40,8 @@ def replace_module_recursively( specified, `tgt_cls` instance must be an immediate member of `parent_module` (useful for limiting replacement scope) """ - for name, child in module.named_children(): - if isinstance(child, tgt_cls): - if parent_module is None or isinstance(module, parent_module): - setattr(module, name, new_cls(child)) - else: - replace_module_recursively(child, tgt_cls, new_cls) + + def apply_fn(child, pmodule, name): + setattr(pmodule, name, new_cls(child)) + + apply_module_function_recursively(module, tgt_cls, apply_fn, parent_module) diff --git a/qai_hub_models/models/_shared/deeplab/demo.py b/qai_hub_models/models/_shared/deeplab/demo.py index a14b7064..1b4b999d 100644 --- a/qai_hub_models/models/_shared/deeplab/demo.py +++ b/qai_hub_models/models/_shared/deeplab/demo.py @@ -21,6 +21,7 @@ def deeplabv3_demo( model_type: Type[BaseModel], + model_id: str, default_image: str | CachedWebAsset, num_classes: int, is_test: bool, @@ -35,7 +36,7 @@ def deeplabv3_demo( help="image file path or URL.", ) args = parser.parse_args([] if is_test else None) - validate_on_device_demo_args(args, model_type.get_model_id()) + validate_on_device_demo_args(args, model_id) input_spec = model_type.get_input_spec() @@ -47,7 +48,7 @@ def deeplabv3_demo( # This DeepLabV3 ResNet 50 demo comes from # https://pytorch.org/hub/pytorch_vision_deeplabv3_resnet101/ input_image = image.convert("RGB") - inference_model = demo_model_from_cli_args(model_type, args) + inference_model = demo_model_from_cli_args(model_type, model_id, args) app = DeepLabV3App(inference_model, num_classes=num_classes) # Run app diff --git a/qai_hub_models/models/_shared/detr/demo.py b/qai_hub_models/models/_shared/detr/demo.py index 0a513b0a..e0eeb66e 100644 --- a/qai_hub_models/models/_shared/detr/demo.py +++ b/qai_hub_models/models/_shared/detr/demo.py @@ -24,6 +24,7 @@ # The demo will display the predicted mask in a window. 
def detr_demo( model: Type[BaseModel], + model_id: str, default_weights: str, default_image: str | CachedWebAsset, is_test: bool = False, @@ -38,10 +39,10 @@ def detr_demo( help="test image file path or URL", ) args = parser.parse_args([] if is_test else None) - validate_on_device_demo_args(args, model.get_model_id()) + validate_on_device_demo_args(args, model_id) # Load image & model - detr = demo_model_from_cli_args(model, args) + detr = demo_model_from_cli_args(model, model_id, args) # Run app to scores, labels and boxes img = load_image(args.image) diff --git a/qai_hub_models/models/_shared/detr/model.py b/qai_hub_models/models/_shared/detr/model.py index 426186a1..9e277c0d 100644 --- a/qai_hub_models/models/_shared/detr/model.py +++ b/qai_hub_models/models/_shared/detr/model.py @@ -49,8 +49,8 @@ def forward( predictions = self.model(image, mask, return_dict=False) return predictions + @staticmethod def get_input_spec( - self, batch_size: int = 1, num_channels: int = 3, height: int = 480, diff --git a/qai_hub_models/models/_shared/fastsam/demo.py b/qai_hub_models/models/_shared/fastsam/demo.py index cc1241ff..59281888 100644 --- a/qai_hub_models/models/_shared/fastsam/demo.py +++ b/qai_hub_models/models/_shared/fastsam/demo.py @@ -17,13 +17,16 @@ get_on_device_demo_parser, validate_on_device_demo_args, ) -from qai_hub_models.utils.asset_loaders import CachedWebAsset, load_path +from qai_hub_models.utils.asset_loaders import CachedWebAsset, load_image from qai_hub_models.utils.base_model import BaseModel from qai_hub_models.utils.display import display_or_save_image def fastsam_demo( - model_type: Type[BaseModel], image_path: str | CachedWebAsset, is_test: bool + model_type: Type[BaseModel], + model_id: str, + image_path: str | CachedWebAsset, + is_test: bool, ): # Demo parameters parser = get_model_cli_parser(model_type) @@ -36,21 +39,27 @@ def fastsam_demo( ) args = parser.parse_args([] if is_test else None) - validate_on_device_demo_args(args, model_type.get_model_id()) + validate_on_device_demo_args(args, model_id) - model = demo_model_from_cli_args(model_type, args) + model = demo_model_from_cli_args(model_type, model_id, args) app = FastSAMApp(model) + image = load_image(args.image) + with tempfile.TemporaryDirectory() as tmpdir: - image_path = load_path(args.image, tmpdir) + image_path = os.path.join(tmpdir, "inp_image.jpg") + image.save(image_path) pred, prompt_process = app.segment_image(image_path) - # Store the output image - output_dirname, _ = os.path.split(image_path) - output_path = os.path.join(output_dirname, "output.jpg") - prompt_process.plot(annotations=pred, output=output_path) + # Store the output image + output_path = os.path.join(args.output_dir or tmpdir, "output.jpg") + + # Save the output + prompt_process.plot(annotations=pred, output=output_path) - # Display the output - output_image = Image.open(output_path) - if not is_test: - display_or_save_image(output_image, args.output_dir) + if is_test: + assert pred is not None + else: + display_or_save_image( + Image.open(output_path), args.output_dir, "output.jpg" + ) diff --git a/qai_hub_models/models/_shared/fastsam/model.py b/qai_hub_models/models/_shared/fastsam/model.py index 092d44e6..4342fb72 100644 --- a/qai_hub_models/models/_shared/fastsam/model.py +++ b/qai_hub_models/models/_shared/fastsam/model.py @@ -48,8 +48,8 @@ def forward(self, image: torch.Tensor): predictions[1][2], ) + @staticmethod def get_input_spec( - self, batch_size: int = 1, num_channels: int = 3, height: int = 640, diff --git 
a/qai_hub_models/models/_shared/ffnet/model.py b/qai_hub_models/models/_shared/ffnet/model.py index 16834289..21c94b67 100644 --- a/qai_hub_models/models/_shared/ffnet/model.py +++ b/qai_hub_models/models/_shared/ffnet/model.py @@ -5,6 +5,7 @@ from __future__ import annotations import os +from importlib import reload import torch @@ -105,6 +106,16 @@ def _load_ffnet_source_model(variant_name) -> torch.nn.Module: import config config.model_weights_base_path = root_weights_path + + # This repository has a top-level "models", which is common. We + # explicitly reload it in case it has been loaded and cached by another + # package (or our models when executing from qai_hub_models/). + # This reload must happen after the config fix, and before trying to + # load model_entrypoint. + import models + + reload(models) + from models.model_registry import model_entrypoint model = model_entrypoint(variant_name)().eval() diff --git a/qai_hub_models/models/_shared/imagenet_classifier/demo.py b/qai_hub_models/models/_shared/imagenet_classifier/demo.py index 41a81e70..7dad494c 100644 --- a/qai_hub_models/models/_shared/imagenet_classifier/demo.py +++ b/qai_hub_models/models/_shared/imagenet_classifier/demo.py @@ -2,7 +2,7 @@ # Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. # SPDX-License-Identifier: BSD-3-Clause # --------------------------------------------------------------------- -from typing import Type +from typing import List, Type import torch @@ -26,6 +26,7 @@ load_image, load_json, ) +from qai_hub_models.utils.base_model import TargetRuntime IMAGENET_LABELS_ASSET = CachedWebModelAsset( "https://raw.githubusercontent.com/anishathalye/imagenet-simple-labels/master/imagenet-simple-labels.json", @@ -37,10 +38,20 @@ # Run Imagenet Classifier end-to-end on a sample image. # The demo will print the predicted class to terminal. -def imagenet_demo(model_cls: Type[ImagenetClassifier], is_test: bool = False): +def imagenet_demo( + model_cls: Type[ImagenetClassifier], + model_id: str, + is_test: bool = False, + available_target_runtimes: List[TargetRuntime] = list( + TargetRuntime.__members__.values() + ), +): + # Demo parameters parser = get_model_cli_parser(model_cls) - parser = get_on_device_demo_parser(parser) + parser = get_on_device_demo_parser( + parser, available_target_runtimes=available_target_runtimes + ) parser.add_argument( "--image", type=str, @@ -48,9 +59,9 @@ def imagenet_demo(model_cls: Type[ImagenetClassifier], is_test: bool = False): help="test image file path or URL", ) args = parser.parse_args([] if is_test else None) - validate_on_device_demo_args(args, model_cls.get_model_id()) + validate_on_device_demo_args(args, model_id) - model = demo_model_from_cli_args(model_cls, args) + model = demo_model_from_cli_args(model_cls, model_id, args) app = ImagenetClassifierApp(model) print("Model Loaded") diff --git a/qai_hub_models/models/_shared/imagenet_classifier/model.py b/qai_hub_models/models/_shared/imagenet_classifier/model.py index fb680725..070e1437 100644 --- a/qai_hub_models/models/_shared/imagenet_classifier/model.py +++ b/qai_hub_models/models/_shared/imagenet_classifier/model.py @@ -28,16 +28,19 @@ class ImagenetClassifier(BaseModel): def __init__( self, net: torch.nn.Module, + transform_input: bool = False, ): """ Basic initializer which takes in a pretrained classifier network. Subclasses can choose to implement their own __init__ and forward methods. 
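+
+        transform_input: If True, forward() re-normalizes inputs that were standardized
+        with the ImageNet mean/std into the (x - 0.5) / 0.5 range expected by
+        Inception-style networks (equivalent to torchvision's transform_input, which
+        should then be disabled on the wrapped net).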
""" super().__init__() + self.transform_input = transform_input self.net = net self.eval() - def forward(self, image_tensor: torch.Tensor): + # Type annotation on image_tensor causes aimet onnx export failure + def forward(self, image_tensor): """ Predict class probabilities for an input `image`. @@ -54,14 +57,22 @@ def forward(self, image_tensor: torch.Tensor): A [1, 1000] where each value is the log-likelihood of the image belonging to the corresponding Imagenet class. """ + if self.transform_input: + # This is equivalent but converts better than the built-in. + # transform_input should be turned off in torchvision model. + shape = (1, 3, 1, 1) + scale = torch.tensor([0.229 / 0.5, 0.224 / 0.5, 0.225 / 0.5]).reshape(shape) + bias = torch.tensor( + [(0.485 - 0.5) / 0.5, (0.456 - 0.5) / 0.5, (0.406 - 0.5) / 0.5] + ).reshape(shape) + image_tensor = image_tensor * scale + bias return self.net(image_tensor) def get_evaluator(self) -> BaseEvaluator: return ClassificationEvaluator() - def get_input_spec( - self, - ) -> InputSpec: + @staticmethod + def get_input_spec() -> InputSpec: """ Returns the input specification (name -> (shape, type). This can be used to submit profiling job on Qualcomm® AI Hub. diff --git a/qai_hub_models/models/_shared/imagenet_classifier/test_utils.py b/qai_hub_models/models/_shared/imagenet_classifier/test_utils.py index d8865ca5..cf3bd022 100644 --- a/qai_hub_models/models/_shared/imagenet_classifier/test_utils.py +++ b/qai_hub_models/models/_shared/imagenet_classifier/test_utils.py @@ -100,7 +100,7 @@ def run_imagenet_classifier_trace_test( model.convert_to_torchscript(check_trace=check_trace) ) else: - trace_app = ImagenetClassifierApp(model.convert_to_quantized_torchscript()) + trace_app = ImagenetClassifierApp(model.convert_to_torchscript()) probabilities = app.predict(img) trace_probs = trace_app.predict(img) assert_most_close(probabilities.numpy(), trace_probs.numpy(), diff_tol, rtol, atol) diff --git a/qai_hub_models/models/_shared/quicksrnet/common.py b/qai_hub_models/models/_shared/quicksrnet/common.py index 248c125e..3883190e 100644 --- a/qai_hub_models/models/_shared/quicksrnet/common.py +++ b/qai_hub_models/models/_shared/quicksrnet/common.py @@ -4,15 +4,10 @@ # --------------------------------------------------------------------- import torch -from qai_hub_models.utils.asset_loaders import SourceAsRoot - -QUICKSRNET_SOURCE_REPOSITORY = "https://github.com/quic/aimet-model-zoo" -QUICKSRNET_SOURCE_REPO_COMMIT = "d09d2b0404d10f71a7640a87e9d5e5257b028802" +from qai_hub_models.utils.aimet.repo import aimet_zoo_as_root def _load_quicksrnet_source_model( - model_id, - model_asset_version, scaling_factor, num_channels, num_intermediate_layers, @@ -20,22 +15,7 @@ def _load_quicksrnet_source_model( ) -> torch.nn.Module: # Load QuickSRNet model from the source repository using the given weights. 
# Returns .utils.super_resolution.models.QuickSRNetBase - with SourceAsRoot( - QUICKSRNET_SOURCE_REPOSITORY, - QUICKSRNET_SOURCE_REPO_COMMIT, - model_id, - model_asset_version, - ): - # Remove import of model_definition.py as it has an import error itself, - # but we don't need anything from that file here - with open("aimet_zoo_torch/quicksrnet/__init__.py", "r") as file: - file_content = file.read() - new_content = file_content.replace( - "from .model.model_definition import QuickSRNet", " " - ) - with open("aimet_zoo_torch/quicksrnet/__init__.py", "w") as file: - file.write(new_content) - + with aimet_zoo_as_root(): from aimet_zoo_torch.quicksrnet.model.models import QuickSRNetBase return QuickSRNetBase( diff --git a/qai_hub_models/models/_shared/repaint/app.py b/qai_hub_models/models/_shared/repaint/app.py index bb82f8b8..5a6165c1 100644 --- a/qai_hub_models/models/_shared/repaint/app.py +++ b/qai_hub_models/models/_shared/repaint/app.py @@ -4,7 +4,7 @@ # --------------------------------------------------------------------- from __future__ import annotations -from typing import Callable, List +from typing import Callable, Dict, List import numpy as np import torch @@ -36,6 +36,26 @@ def predict(self, *args, **kwargs): # See paint_mask_on_image. return self.paint_mask_on_image(*args, **kwargs) + @staticmethod + def preprocess_inputs( + pixel_values_or_image: torch.Tensor | np.ndarray | Image | List[Image], + mask_pixel_values_or_image: torch.Tensor | np.ndarray | Image, + ) -> Dict[str, torch.Tensor]: + NCHW_fp32_torch_frames = app_to_net_image_inputs(pixel_values_or_image)[1] + NCHW_fp32_torch_masks = app_to_net_image_inputs(mask_pixel_values_or_image)[1] + + # The number of input images should equal the number of input masks. + if NCHW_fp32_torch_masks.shape[0] != 1: + NCHW_fp32_torch_masks = NCHW_fp32_torch_masks.tile( + (NCHW_fp32_torch_frames.shape[0], 1, 1, 1) + ) + + # Mask input image + image_masked = ( + NCHW_fp32_torch_frames * (1 - NCHW_fp32_torch_masks) + NCHW_fp32_torch_masks + ) + return {"image": image_masked, "mask": NCHW_fp32_torch_masks} + def paint_mask_on_image( self, pixel_values_or_image: torch.Tensor | np.ndarray | Image | List[Image], @@ -65,19 +85,9 @@ def paint_mask_on_image( images: List[PIL.Image] A list of predicted images (one list element per batch). """ - NCHW_fp32_torch_frames = app_to_net_image_inputs(pixel_values_or_image)[1] - NCHW_fp32_torch_masks = app_to_net_image_inputs(mask_pixel_values_or_image)[1] - - # The number of input images should equal the number of input masks. 
- if NCHW_fp32_torch_masks.shape[0] != 1: - NCHW_fp32_torch_masks = NCHW_fp32_torch_masks.tile( - (NCHW_fp32_torch_frames.shape[0], 1, 1, 1) - ) - - # Mask input image - image_masked = ( - NCHW_fp32_torch_frames * (1 - NCHW_fp32_torch_masks) + NCHW_fp32_torch_masks + inputs = self.preprocess_inputs( + pixel_values_or_image, mask_pixel_values_or_image ) - out = self.model(image_masked, NCHW_fp32_torch_masks) + out = self.model(inputs["image"], inputs["mask"]) return [torch_tensor_to_PIL_image(img) for img in out] diff --git a/qai_hub_models/models/_shared/repaint/demo.py b/qai_hub_models/models/_shared/repaint/demo.py index f67f8270..2df57b1c 100644 --- a/qai_hub_models/models/_shared/repaint/demo.py +++ b/qai_hub_models/models/_shared/repaint/demo.py @@ -4,7 +4,7 @@ # --------------------------------------------------------------------- from __future__ import annotations -from typing import Type +from typing import List, Type from qai_hub_models.models._shared.repaint.app import RepaintMaskApp from qai_hub_models.utils.args import ( @@ -22,14 +22,18 @@ # The demo will display the predicted image in a window. def repaint_demo( model_type: Type[BaseModel], + model_id: str, default_image: str | CachedWebAsset, default_mask: str | CachedWebAsset, is_test: bool = False, + available_target_runtimes: List[TargetRuntime] = list( + TargetRuntime.__members__.values() + ), ): # Demo parameters parser = get_model_cli_parser(model_type) parser = get_on_device_demo_parser( - parser, available_target_runtimes=[TargetRuntime.TFLITE], add_output_dir=True + parser, available_target_runtimes=available_target_runtimes, add_output_dir=True ) parser.add_argument( "--image", @@ -44,10 +48,10 @@ def repaint_demo( help="test mask file path or URL", ) args = parser.parse_args([] if is_test else None) - validate_on_device_demo_args(args, model_type.get_model_id()) + validate_on_device_demo_args(args, model_id) # Load image & model - model = demo_model_from_cli_args(model_type, args) + model = demo_model_from_cli_args(model_type, model_id, args) image = load_image(args.image) mask = load_image(args.mask) print("Model Loaded") diff --git a/qai_hub_models/models/_shared/sesr/common.py b/qai_hub_models/models/_shared/sesr/common.py index eebef83c..157fdea7 100644 --- a/qai_hub_models/models/_shared/sesr/common.py +++ b/qai_hub_models/models/_shared/sesr/common.py @@ -6,22 +6,18 @@ import torch -from qai_hub_models.utils.asset_loaders import SourceAsRoot +from qai_hub_models.utils.aimet.repo import aimet_zoo_as_root # SESR original repo is here: https://github.com/ARM-software/sesr -# But this is all written in TF and Keras. Torch version is in AIMET -SESR_SOURCE_REPOSITORY = "https://github.com/quic/aimet-model-zoo" -SESR_SOURCE_REPO_COMMIT = "d09d2b0404d10f71a7640a87e9d5e5257b028802" +# But this is all written in TF and Keras. Torch version is in AIMET Zoo def _load_sesr_source_model( - model_id, model_asset_version: int | str, scaling_factor, num_channels, num_lblocks + scaling_factor, num_channels, num_lblocks ) -> torch.nn.Module: # Load SESR model from the source repository using the given weights. 
# Returns .utils.super_resolution.models.SESRRelease - with SourceAsRoot( - SESR_SOURCE_REPOSITORY, SESR_SOURCE_REPO_COMMIT, model_id, model_asset_version - ): + with aimet_zoo_as_root(): from aimet_zoo_torch.common.super_resolution.models import SESRRelease diff --git a/qai_hub_models/models/_shared/super_resolution/demo.py b/qai_hub_models/models/_shared/super_resolution/demo.py index e8d545ec..3ed3cb5d 100644 --- a/qai_hub_models/models/_shared/super_resolution/demo.py +++ b/qai_hub_models/models/_shared/super_resolution/demo.py @@ -23,6 +23,7 @@ # The demo will display both the input image and the higher resolution output. def super_resolution_demo( model_cls: Type[BaseModel], + model_id: str, default_image: str | CachedWebAsset, is_test: bool = False, available_target_runtimes: List[TargetRuntime] = list( @@ -44,7 +45,7 @@ def super_resolution_demo( ) args = parser.parse_args([] if is_test else None) - validate_on_device_demo_args(args, model_cls.get_model_id()) + validate_on_device_demo_args(args, model_id) # Load image & model image = load_image(args.image) @@ -64,6 +65,7 @@ def super_resolution_demo( inference_model = demo_model_from_cli_args( model_cls, + model_id, args, ) app = SuperResolutionApp(inference_model) diff --git a/qai_hub_models/models/_shared/video_classifier/model.py b/qai_hub_models/models/_shared/video_classifier/model.py index 6b75ea3a..6fbacf89 100644 --- a/qai_hub_models/models/_shared/video_classifier/model.py +++ b/qai_hub_models/models/_shared/video_classifier/model.py @@ -43,8 +43,8 @@ def forward(self, video: torch.Tensor): """ return self.net(video) + @staticmethod def get_input_spec( - self, num_frames: int = 16, ) -> InputSpec: """ diff --git a/qai_hub_models/models/_shared/whisper/__init__.py b/qai_hub_models/models/_shared/whisper/__init__.py new file mode 100644 index 00000000..21a22b31 --- /dev/null +++ b/qai_hub_models/models/_shared/whisper/__init__.py @@ -0,0 +1,4 @@ +# --------------------------------------------------------------------- +# Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. 
+# SPDX-License-Identifier: BSD-3-Clause +# --------------------------------------------------------------------- diff --git a/qai_hub_models/models/whisper_asr/app.py b/qai_hub_models/models/_shared/whisper/app.py similarity index 98% rename from qai_hub_models/models/whisper_asr/app.py rename to qai_hub_models/models/_shared/whisper/app.py index dcdccd68..4a548b94 100644 --- a/qai_hub_models/models/whisper_asr/app.py +++ b/qai_hub_models/models/_shared/whisper/app.py @@ -11,7 +11,7 @@ import whisper # type: ignore from scipy import special as scipy_special # type: ignore -from qai_hub_models.models.whisper_asr.model import Whisper +from qai_hub_models.models._shared.whisper.model import Whisper from qai_hub_models.utils.model_adapters import TorchNumpyAdapter # hard-coded audio hyperparameters @@ -31,8 +31,8 @@ class WhisperApp: """ def __init__(self, whisper: Whisper): - decoder = whisper.decoder - encoder = whisper.encoder + decoder = whisper.decoder.to("cpu") + encoder = whisper.encoder.to("cpu") self.num_decoder_blocks = whisper.num_decoder_blocks self.attention_dim = whisper.attention_dim diff --git a/qai_hub_models/models/whisper_asr/demo.py b/qai_hub_models/models/_shared/whisper/demo.py similarity index 82% rename from qai_hub_models/models/whisper_asr/demo.py rename to qai_hub_models/models/_shared/whisper/demo.py index 9bb729c6..bd9a4fa8 100644 --- a/qai_hub_models/models/whisper_asr/demo.py +++ b/qai_hub_models/models/_shared/whisper/demo.py @@ -2,12 +2,14 @@ # Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. # SPDX-License-Identifier: BSD-3-Clause # --------------------------------------------------------------------- -from qai_hub_models.models.whisper_asr.app import ( +from typing import Type + +from qai_hub_models.models._shared.whisper.app import ( WhisperApp, load_audio, load_mel_filter, ) -from qai_hub_models.models.whisper_asr.model import ( +from qai_hub_models.models._shared.whisper.model import ( MEL_FILTER_PATH, MODEL_ASSET_VERSION, MODEL_ID, @@ -20,9 +22,9 @@ ) -def main(): +def whisper_demo(model_cls: Type[Whisper]): # For other model sizes, see https://github.com/openai/whisper/blob/main/whisper/__init__.py#L17 - app = WhisperApp(Whisper.from_pretrained()) + app = WhisperApp(model_cls.from_pretrained()) TEST_AUDIO_PATH.fetch() MEL_FILTER_PATH.fetch() @@ -36,7 +38,3 @@ def main(): # Perform transcription transcription = app.transcribe(mel_input) print("Transcription:", transcription) - - -if __name__ == "__main__": - main() diff --git a/qai_hub_models/models/whisper_asr/model.py b/qai_hub_models/models/_shared/whisper/model.py similarity index 88% rename from qai_hub_models/models/whisper_asr/model.py rename to qai_hub_models/models/_shared/whisper/model.py index 6ed6a7be..50bb546b 100644 --- a/qai_hub_models/models/whisper_asr/model.py +++ b/qai_hub_models/models/_shared/whisper/model.py @@ -10,12 +10,12 @@ import whisper # type: ignore from qai_hub_models.utils.asset_loaders import CachedWebModelAsset -from qai_hub_models.utils.base_model import BaseModel, CollectionModel +from qai_hub_models.utils.base_model import BaseModel, CollectionModel, TargetRuntime from qai_hub_models.utils.input_spec import InputSpec -MAX_DECODE_LEN = 448 +MAX_DECODE_LEN = 224 -MODEL_ID = __name__.split(".")[-2] +MODEL_ID = "whisper_asr_shared" MODEL_ASSET_VERSION = 1 MEL_FILTER_PATH = CachedWebModelAsset.from_asset_store( MODEL_ID, MODEL_ASSET_VERSION, "openai_assets/mel_filters.npz" @@ -70,7 +70,8 @@ def forward(self, audio: torch.Tensor) -> 
List[torch.Tensor]: res.append(residual_block.cross_attn.value(encoder_out)) return res - def get_input_spec(self) -> InputSpec: + @staticmethod + def get_input_spec() -> InputSpec: """ Returns the input specification (name -> (shape, type). This can be used to submit profiling job on Qualcomm AI Hub. @@ -81,6 +82,22 @@ def get_input_spec(self) -> InputSpec: def from_pretrained(cls): return Whisper.from_pretrained().encoder + def get_hub_compile_options( + self, target_runtime: TargetRuntime, other_compile_options: str = "" + ) -> str: + compile_options = super().get_hub_compile_options( + target_runtime, other_compile_options + ) + return compile_options + " --compute_unit gpu" + + def get_hub_profile_options( + self, target_runtime: TargetRuntime, other_profile_options: str = "" + ) -> str: + profile_options = super().get_hub_profile_options( + target_runtime, other_profile_options + ) + return profile_options + " --max_profiler_iterations 10" + " --compute_unit gpu" + class WhisperDecoderInf(BaseModel): """ @@ -101,6 +118,7 @@ def __init__(self, model: whisper.model.TextDecoder): self.blocks = torch.nn.ModuleList( [ResidualAttentionBlockWrapper(b) for b in model.blocks] ) + self.num_blocks = len(self.blocks) for m in ["token_embedding", "ln"]: self.add_module(m, getattr(model, m)) for p in ["positional_embedding"]: @@ -144,12 +162,11 @@ def forward(self, x: torch.Tensor, *kv_cache_args, **kv_cache_kwargs): assert isinstance(self.positional_embedding, torch.nn.Parameter) # for mypy # Set up kv_cache kv_cache = {} # torch.nn.Module -> torch.Tensor - num_blocks = len(self.blocks) for i, block in enumerate(self.blocks): kv_cache.update( { - block.attn.key: kv_cache_args[2 * num_blocks + i * 2], - block.attn.value: kv_cache_args[2 * num_blocks + i * 2 + 1], + block.attn.key: kv_cache_args[2 * self.num_blocks + i * 2], + block.attn.value: kv_cache_args[2 * self.num_blocks + i * 2 + 1], block.cross_attn.key: kv_cache_args[i * 2], block.cross_attn.value: kv_cache_args[i * 2 + 1], } @@ -178,31 +195,35 @@ def forward(self, x: torch.Tensor, *kv_cache_args, **kv_cache_kwargs): # shape: [1, 1, 51864] return (logits,) + tuple(kv_cache_new) - def get_input_spec(self) -> InputSpec: + @staticmethod + def get_input_spec(num_blocks: int, attention_dim: int) -> InputSpec: """ Returns the input specification (name -> (shape, type). This can be used to submit profiling job on Qualcomm AI Hub. 
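
The refactored `WhisperDecoderInf.forward` indexes its positional `kv_cache_args` with the cross-attention caches for every block first, followed by the self-attention caches. A small sketch of assembling arguments in that order, using the `tiny.en` sizes assumed elsewhere in this patch (4 blocks, attention dim 384):

```python
# Sketch of building kv_cache positional args in the order forward() expects:
# all cross-attn (k, v) pairs first, then all self-attn (k, v) pairs.
import torch

num_blocks = 4        # decoder.num_blocks for tiny.en (assumption)
attention_dim = 384   # decoder.attention_dim for tiny.en (assumption)

# Cross-attn caches come from the encoder and have a fixed length of 1500.
cross_attn_cache = []
for _ in range(num_blocks):
    cross_attn_cache.append(torch.zeros(1, 1500, attention_dim))  # b{i}_cross_attn_k
    cross_attn_cache.append(torch.zeros(1, 1500, attention_dim))  # b{i}_cross_attn_v

# Self-attn caches start empty (sequence length 0) and grow each decode step.
self_attn_cache = []
for _ in range(num_blocks):
    self_attn_cache.append(torch.zeros(1, 0, attention_dim))      # b{i}_self_attn_k
    self_attn_cache.append(torch.zeros(1, 0, attention_dim))      # b{i}_self_attn_v

# forward() reads cross-attn entries at indices [0, 2*num_blocks) and
# self-attn entries at [2*num_blocks, 4*num_blocks):
#   decoder(tokens, *cross_attn_cache, *self_attn_cache)
kv_cache_args = cross_attn_cache + self_attn_cache
assert len(kv_cache_args) == 4 * num_blocks
```
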
""" specs = dict(x=((1, 1), "int32")) - for i in range(len(self.blocks)): - specs[f"b{i}_cross_attn_k"] = ((1, 1500, self.attention_dim), "float32") - specs[f"b{i}_cross_attn_v"] = ((1, 1500, self.attention_dim), "float32") + for i in range(num_blocks): + specs[f"b{i}_cross_attn_k"] = ((1, 1500, attention_dim), "float32") + specs[f"b{i}_cross_attn_v"] = ((1, 1500, attention_dim), "float32") # Use mean length for profiling mean_decode_len = MAX_DECODE_LEN // 2 - for i in range(len(self.blocks)): + for i in range(num_blocks): specs[f"b{i}_self_attn_k"] = ( - (1, mean_decode_len, self.attention_dim), + (1, mean_decode_len, attention_dim), "float32", ) specs[f"b{i}_self_attn_v"] = ( - (1, mean_decode_len, self.attention_dim), + (1, mean_decode_len, attention_dim), "float32", ) return specs + def _get_input_spec_for_model_instance(self) -> InputSpec: + return self.__class__.get_input_spec(len(self.blocks), self.attention_dim) + @classmethod def from_pretrained(cls): return Whisper.from_pretrained().decoder diff --git a/qai_hub_models/models/whisper_asr/test.py b/qai_hub_models/models/_shared/whisper/test_utils.py similarity index 71% rename from qai_hub_models/models/whisper_asr/test.py rename to qai_hub_models/models/_shared/whisper/test_utils.py index 7b021e37..a75dd327 100644 --- a/qai_hub_models/models/whisper_asr/test.py +++ b/qai_hub_models/models/_shared/whisper/test_utils.py @@ -3,18 +3,16 @@ # SPDX-License-Identifier: BSD-3-Clause # --------------------------------------------------------------------- import numpy as np -import pytest import torch import whisper -from qai_hub_models.models.whisper_asr.app import ( +from qai_hub_models.models._shared.whisper.app import ( WhisperApp, load_audio, load_mel_filter, ) -from qai_hub_models.models.whisper_asr.demo import TEST_AUDIO_PATH -from qai_hub_models.models.whisper_asr.demo import main as demo_main -from qai_hub_models.models.whisper_asr.model import ( +from qai_hub_models.models._shared.whisper.demo import TEST_AUDIO_PATH +from qai_hub_models.models._shared.whisper.model import ( MEL_FILTER_PATH, Whisper, WhisperDecoderInf, @@ -22,20 +20,22 @@ ) -@pytest.fixture(scope="session") -def mel_input() -> np.ndarray: +def load_mel_input() -> np.ndarray: mel_filter_path = MEL_FILTER_PATH.fetch() mel_filter = load_mel_filter(mel_filter_path) audio_path = TEST_AUDIO_PATH.fetch() return load_audio(mel_filter, audio_path) -def test_numerics(mel_input): +def run_test_wrapper_numerics(whisper_version): """ - Test that wrapper classes predict logits (without post processing) that - matches with the original model's. + Test that wrapper classes, excluding the + app, predict logits (without post + processing) that matches with the + original model's. 
""" # OpenAI + mel_input = load_mel_input() with torch.no_grad(): mel_input = torch.from_numpy(mel_input) model = whisper.load_model("tiny.en") @@ -49,8 +49,8 @@ def test_numerics(mel_input): decoder = WhisperDecoderInf(model.decoder) cross_attn_cache = encoder(mel_input) - cache_tensor = np.array([], dtype=np.float32).reshape((1, 0, 384)) - self_attn_cache = [torch.from_numpy(cache_tensor)] * 2 * 4 + cache_tensor = np.array([], dtype=np.float32).reshape((1, 0, decoder.attention_dim)) + self_attn_cache = [torch.from_numpy(cache_tensor)] * 2 * decoder.num_blocks decoder_out = decoder(tokens, *cross_attn_cache, *self_attn_cache) logits = decoder_out[0].detach().numpy() @@ -58,14 +58,16 @@ def test_numerics(mel_input): np.testing.assert_allclose(logits_orig, logits) -def test_transcribe(mel_input): +def run_test_transcribe(whisper_version): """ - Test that pytorch wrappers produces end to end transcription results that + Test that WhisperApp produces end to end transcription results that matches with the original model """ + mel_input = load_mel_input() + # Run inference with OpenAI whisper with torch.no_grad(): - model = whisper.load_model("tiny.en") + model = whisper.load_model(whisper_version) options = whisper.DecodingOptions( language="en", without_timestamps=False, fp16=False ) @@ -77,7 +79,3 @@ def test_transcribe(mel_input): # Perform transcription transcription = app.transcribe(mel_input) assert transcription == text_orig - - -def test_demo(): - demo_main() diff --git a/qai_hub_models/models/_shared/yolo/demo.py b/qai_hub_models/models/_shared/yolo/demo.py index f9662848..0f6d4a1d 100644 --- a/qai_hub_models/models/_shared/yolo/demo.py +++ b/qai_hub_models/models/_shared/yolo/demo.py @@ -24,6 +24,7 @@ # The demo will display a image with the predicted bounding boxes. 
def yolo_detection_demo( model_type: Type[BaseModel], + model_id: str, app_type: Callable[..., YoloObjectDetectionApp], default_image: str | CachedWebAsset, stride_multiple: int | None = None, @@ -49,9 +50,9 @@ def yolo_detection_demo( help="Intersection over Union (IoU) threshold for NonMaximumSuppression", ) args = parser.parse_args([] if is_test else None) - validate_on_device_demo_args(args, model_type.get_model_id()) + validate_on_device_demo_args(args, model_id) - model = demo_model_from_cli_args(model_type, args) + model = demo_model_from_cli_args(model_type, model_id, args) app = app_type(model, args.score_threshold, args.iou_threshold) print("Model Loaded") diff --git a/qai_hub_models/models/_shared/yolo/utils.py b/qai_hub_models/models/_shared/yolo/utils.py index 217d1d7a..5911106f 100644 --- a/qai_hub_models/models/_shared/yolo/utils.py +++ b/qai_hub_models/models/_shared/yolo/utils.py @@ -6,8 +6,8 @@ import torch +from qai_hub_models.models.common import SampleInputsType from qai_hub_models.utils.asset_loaders import CachedWebModelAsset, load_image -from qai_hub_models.utils.base_model import InputsType from qai_hub_models.utils.image_processing import app_to_net_image_inputs @@ -103,7 +103,7 @@ class scores reduced to keep max score per prediction return torch.cat(max_scores, dim=-1), torch.cat(max_indices, dim=-1) -def yolo_sample_inputs() -> InputsType: +def yolo_sample_inputs() -> SampleInputsType: image_address = CachedWebModelAsset.from_asset_store( "yolov7", 1, "yolov7_demo_640.jpg" ) diff --git a/qai_hub_models/models/aotgan/README.md b/qai_hub_models/models/aotgan/README.md new file mode 100644 index 00000000..dff56fea --- /dev/null +++ b/qai_hub_models/models/aotgan/README.md @@ -0,0 +1,54 @@ +[![Qualcomm® AI Hub Models](https://qaihub-public-assets.s3.us-west-2.amazonaws.com/qai-hub-models/quic-logo.jpg)](../../README.md) + + +# [AOT-GAN: High resolution image in-painting on-device](https://aihub.qualcomm.com/models/aotgan) + +AOT-GAN is a machine learning model that allows to erase and in-paint part of given input image. + +This is based on the implementation of AOT-GAN found +[here](https://github.com/researchmm/AOT-GAN-for-Inpainting). This repository contains scripts for optimized on-device +export suitable to run on Qualcomm® devices. More details on model performance +accross various devices, can be found [here](https://aihub.qualcomm.com/models/aotgan). + +[Sign up](https://myaccount.qualcomm.com/signup) for early access to run these models on +a hosted Qualcomm® device. + + +## Example & Usage + + +Once installed, run the following simple CLI demo: + +```bash +python -m qai_hub_models.models.aotgan.demo +``` +More details on the CLI tool can be found with the `--help` option. See +[demo.py](demo.py) for sample usage of the model including pre/post processing +scripts. Please refer to our [general instructions on using +models](../../../#getting-started) for more usage instructions. + +## Export for on-device deployment + +This repository contains export scripts that produce a model optimized for +on-device deployment. This can be run as follows: + +```bash +python -m qai_hub_models.models.aotgan.export +``` +Additional options are documented with the `--help` option. Note that the above +script requires access to Deployment instructions for Qualcomm® AI Hub. + +## License +- The license for the original implementation of AOT-GAN can be found + [here](https://github.com/taki0112/AttnGAN-Tensorflow/blob/master/LICENSE). 
+- The license for the compiled assets for on-device deployment can be found [here]({deploy_license_url}) + +## References +* [Aggregated Contextual Transformations for High-Resolution Image Inpainting](https://arxiv.org/abs/2104.01431) +* [Source Model Implementation](https://github.com/researchmm/AOT-GAN-for-Inpainting) + +## Community +* Join [our AI Hub Slack community](https://join.slack.com/t/qualcomm-ai-hub/shared_invite/zt-2dgf95loi-CXHTDRR1rvPgQWPO~ZZZJg) to collaborate, post questions and learn more about on-device AI. +* For questions or feedback please [reach out to us](mailto:ai-hub-support@qti.qualcomm.com). + + diff --git a/qai_hub_models/models/aotgan/__init__.py b/qai_hub_models/models/aotgan/__init__.py new file mode 100644 index 00000000..27effd6f --- /dev/null +++ b/qai_hub_models/models/aotgan/__init__.py @@ -0,0 +1,10 @@ +# --------------------------------------------------------------------- +# Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. +# SPDX-License-Identifier: BSD-3-Clause +# --------------------------------------------------------------------- +from qai_hub_models.models._shared.repaint.app import ( # noqa: F401 + RepaintMaskApp as App, +) + +from .model import AOTGAN as Model # noqa: F401 +from .model import MODEL_ID # noqa: F401 diff --git a/qai_hub_models/models/aotgan/conftest.py b/qai_hub_models/models/aotgan/conftest.py new file mode 100644 index 00000000..ce808157 --- /dev/null +++ b/qai_hub_models/models/aotgan/conftest.py @@ -0,0 +1,26 @@ +# --------------------------------------------------------------------- +# Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. +# SPDX-License-Identifier: BSD-3-Clause +# --------------------------------------------------------------------- +# THIS FILE WAS AUTO-GENERATED. DO NOT EDIT MANUALLY. + +from unittest.mock import patch + +import pytest + +from qai_hub_models.models.aotgan import Model +from qai_hub_models.utils.testing import skip_clone_repo_check + + +@pytest.fixture(autouse=True) +@skip_clone_repo_check +def mock_from_pretrained(): + """ + Model.from_pretrained() can be slow. Invoke it once and cache it so all invocations + across all tests return the cached instance of the model. + """ + mock = patch( + "qai_hub_models.models.aotgan.Model.from_pretrained", + return_value=Model.from_pretrained(), + ) + mock.start() diff --git a/qai_hub_models/models/aotgan/demo.py b/qai_hub_models/models/aotgan/demo.py new file mode 100644 index 00000000..cc1d153d --- /dev/null +++ b/qai_hub_models/models/aotgan/demo.py @@ -0,0 +1,19 @@ +# --------------------------------------------------------------------- +# Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. +# SPDX-License-Identifier: BSD-3-Clause +# --------------------------------------------------------------------- +from qai_hub_models.models._shared.repaint.demo import repaint_demo +from qai_hub_models.models.aotgan.model import ( + AOTGAN, + IMAGE_ADDRESS, + MASK_ADDRESS, + MODEL_ID, +) + + +def main(is_test: bool = False): + repaint_demo(AOTGAN, MODEL_ID, IMAGE_ADDRESS, MASK_ADDRESS, is_test=is_test) + + +if __name__ == "__main__": + main() diff --git a/qai_hub_models/models/aotgan/export.py b/qai_hub_models/models/aotgan/export.py new file mode 100644 index 00000000..1316347d --- /dev/null +++ b/qai_hub_models/models/aotgan/export.py @@ -0,0 +1,206 @@ +# --------------------------------------------------------------------- +# Copyright (c) 2024 Qualcomm Innovation Center, Inc. 
All rights reserved. +# SPDX-License-Identifier: BSD-3-Clause +# --------------------------------------------------------------------- +# THIS FILE WAS AUTO-GENERATED. DO NOT EDIT MANUALLY. + + +from __future__ import annotations + +import os +import warnings +from pathlib import Path +from typing import Any, Dict, List, Optional, Tuple, cast + +import qai_hub as hub +import torch + +from qai_hub_models.models.aotgan import Model +from qai_hub_models.utils.args import ( + export_parser, + get_input_spec_kwargs, + get_model_kwargs, +) +from qai_hub_models.utils.base_model import TargetRuntime +from qai_hub_models.utils.compare import torch_inference +from qai_hub_models.utils.input_spec import make_torch_inputs +from qai_hub_models.utils.printing import ( + print_inference_metrics, + print_on_target_demo_cmd, + print_profile_metrics_from_job, +) +from qai_hub_models.utils.qai_hub_helpers import ( + can_access_qualcomm_ai_hub, + export_without_hub_access, + transpose_channel_first_to_last, + transpose_channel_last_to_first, +) + + +def export_model( + device: str = "Samsung Galaxy S23", + skip_profiling: bool = False, + skip_inferencing: bool = False, + skip_downloading: bool = False, + skip_summary: bool = False, + output_dir: Optional[str] = None, + target_runtime: TargetRuntime = TargetRuntime.TFLITE, + compile_options: str = "", + profile_options: str = "", + **additional_model_kwargs, +) -> Tuple[hub.CompileJob, Optional[hub.ProfileJob], Optional[hub.InferenceJob]] | List[ + str +]: + """ + This function accomplishes 6 main tasks: + + 1. Instantiates a PyTorch model and converts it to a traced TorchScript format. + 2. Compiles the model to an asset that can be run on device. + 3. Profiles the model performance on real devices. + 4. Inferences the model on sample inputs. + 5. Downloads the model asset to the local directory. + 6. Summarizes the results from profiling and inference. + + Each of the last four steps can be optionally skipped using the input options. + + Parameters: + device: Device for which to export the model. + Full list of available devices can be found by running `hub.get_devices()`. + Defaults to DEFAULT_DEVICE if not specified. + skip_profiling: If set, skips profiling of compiled model on real devices. + skip_inferencing: If set, skips computing on-device outputs from sample data. + skip_downloading: If set, skips downloading of compiled model. + skip_summary: If set, skips waiting for and summarizing results + from profiling and inference. + output_dir: Directory to store generated assets (e.g. compiled model). + Defaults to `/build/`. + target_runtime: Which on-device runtime to target. Default is TFLite. + compile_options: Additional options to pass when submitting the compile job. + profile_options: Additional options to pass when submitting the profile job. + **additional_model_kwargs: Additional optional kwargs used to customize + `model_cls.from_pretrained` and `model.get_input_spec` + + Returns: + A 3-tuple of: + * A CompileJob object containing metadata about the compile job submitted to hub. + * A ProfileJob containing metadata about the profile job (None if profiling skipped). + * An InferenceJob containing metadata about the inference job (None if inferencing skipped). 
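
For reference, a minimal sketch of driving the generated `export_model` entry point programmatically, assuming Qualcomm AI Hub access is configured; every argument shown is one of the documented parameters above:

```python
# Minimal sketch: compile, run sample inference, and download the asset,
# skipping on-device profiling for this run.
from qai_hub_models.models.aotgan.export import export_model

jobs = export_model(
    device="Samsung Galaxy S23",   # any device reported by hub.get_devices()
    skip_profiling=True,
    skip_inferencing=False,
    skip_downloading=False,
    output_dir="build/aotgan",
)
# With AI Hub access this returns (compile_job, profile_job, inference_job);
# profile_job is None here because profiling was skipped.
```
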
+ """ + model_name = "aotgan" + output_path = Path(output_dir or Path.cwd() / "build" / model_name) + if not can_access_qualcomm_ai_hub(): + return export_without_hub_access( + "aotgan", + "AOT-GAN", + device, + skip_profiling, + skip_inferencing, + skip_downloading, + skip_summary, + output_path, + target_runtime, + compile_options, + profile_options, + ) + + # 1. Initialize PyTorch model + model = Model.from_pretrained(**get_model_kwargs(Model, additional_model_kwargs)) + input_spec = model.get_input_spec( + **get_input_spec_kwargs(model, additional_model_kwargs) + ) + + # Trace the model + source_model = torch.jit.trace(model.to("cpu"), make_torch_inputs(input_spec)) + + # 2. Compile the model to an on-device asset + model_compile_options = model.get_hub_compile_options( + target_runtime, + compile_options + + " --force_channel_last_input image,mask" + + " --force_channel_last_output output_0", + ) + print(f"Optimizing model {model_name} to run on-device") + submitted_compile_job = hub.submit_compile_job( + model=source_model, + input_specs=input_spec, + device=hub.Device(device), + name=model_name, + options=model_compile_options, + ) + compile_job = cast(hub.client.CompileJob, submitted_compile_job) + + # 3. Profile the model asset on real devices + profile_job: Optional[hub.client.ProfileJob] = None + if not skip_profiling: + profile_options_all = model.get_hub_profile_options( + target_runtime, profile_options + ) + print(f"Profiling model {model_name} on a hosted device.") + submitted_profile_job = hub.submit_profile_job( + model=compile_job.get_target_model(), + device=hub.Device(device), + name=model_name, + options=profile_options_all, + ) + profile_job = cast(hub.client.ProfileJob, submitted_profile_job) + + # 4. Run inference on-device with sample inputs + inference_job: Optional[hub.client.InferenceJob] = None + if not skip_inferencing: + profile_options_all = model.get_hub_profile_options( + target_runtime, profile_options + ) + print( + f"Running inference for {model_name} on a hosted device with example inputs." + ) + sample_inputs = model.sample_inputs(input_spec) + # Convert inputs from channel first to channel last + hub_inputs = transpose_channel_first_to_last( + "image,mask", sample_inputs, target_runtime + ) + submitted_inference_job = hub.submit_inference_job( + model=compile_job.get_target_model(), + inputs=hub_inputs, + device=hub.Device(device), + name=model_name, + options=profile_options_all, + ) + inference_job = cast(hub.client.InferenceJob, submitted_inference_job) + + # 5. Download the model asset to a local file + if not skip_downloading: + os.makedirs(output_path, exist_ok=True) + target_model: hub.Model = compile_job.get_target_model() # type: ignore + target_model.download(str(output_path / f"{model_name}.tflite")) + + # 6. 
Summarize the results from profiling and inference + if not skip_summary and not skip_profiling: + assert profile_job is not None and profile_job.wait().success + profile_data: Dict[str, Any] = profile_job.download_profile() # type: ignore + print_profile_metrics_from_job(profile_job, profile_data) + + if not skip_summary and not skip_inferencing: + torch_out = torch_inference(model, sample_inputs) + assert inference_job is not None and inference_job.wait().success + inference_result: hub.client.DatasetEntries = inference_job.download_output_data() # type: ignore + # Convert outputs from channel last to channel first + inference_result = transpose_channel_last_to_first( + "output_0", inference_result, target_runtime + ) + print_inference_metrics(inference_job, inference_result, torch_out) + + if not skip_summary: + print_on_target_demo_cmd(compile_job, Path(__file__).parent.resolve(), device) + + return (compile_job, profile_job, inference_job) + + +def main(): + warnings.filterwarnings("ignore") + parser = export_parser(model_cls=Model) + args = parser.parse_args() + export_model(**vars(args)) + + +if __name__ == "__main__": + main() diff --git a/qai_hub_models/models/aotgan/info.yaml b/qai_hub_models/models/aotgan/info.yaml new file mode 100644 index 00000000..18157c64 --- /dev/null +++ b/qai_hub_models/models/aotgan/info.yaml @@ -0,0 +1,31 @@ +name: AOT-GAN +id: aotgan +status: public +headline: High resolution image in-painting on-device. +domain: Computer Vision +description: AOT-GAN is a machine learning model that allows to erase and in-paint + part of given input image. +use_case: Image Editing +tags: [] +research_paper: https://arxiv.org/abs/2104.01431 +research_paper_title: Aggregated Contextual Transformations for High-Resolution Image + Inpainting +license: https://github.com/taki0112/AttnGAN-Tensorflow/blob/master/LICENSE +deploy_license: https://qaihub-public-assets.s3.us-west-2.amazonaws.com/qai-hub-models/Qualcomm+AI+Hub+Proprietary+License.pdf +source_repo: https://github.com/researchmm/AOT-GAN-for-Inpainting +technical_details: + Model checkpoint: CelebAHQ + Input resolution: 512x512 + Number of parameters: 15.2M + Model size: 58.0 MB +applicable_scenarios: + - Image editing +form_factors: + - Phone + - Tablet +related_models: [] +has_static_banner: yes +has_animated_banner: yes +license_type: mit +deploy_license_type: AI Model Hub License +dataset: [] diff --git a/qai_hub_models/models/aotgan/model.py b/qai_hub_models/models/aotgan/model.py new file mode 100644 index 00000000..d5d33563 --- /dev/null +++ b/qai_hub_models/models/aotgan/model.py @@ -0,0 +1,131 @@ +# --------------------------------------------------------------------- +# Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. 
+# SPDX-License-Identifier: BSD-3-Clause +# --------------------------------------------------------------------- +from __future__ import annotations + +import os + +import torch +import torch.nn as nn + +from qai_hub_models.models.common import SampleInputsType +from qai_hub_models.utils.asset_loaders import ( + CachedWebModelAsset, + SourceAsRoot, + load_image, +) +from qai_hub_models.utils.base_model import BaseModel +from qai_hub_models.utils.input_spec import InputSpec + +AOTGAN_SOURCE_REPOSITORY = "https://github.com/researchmm/AOT-GAN-for-Inpainting/" +AOTGAN_SOURCE_REPO_COMMIT = "418034627392289bdfc118d62bc49e6abd3bb185" +AOTGAN_SOURCE_PATCHES = [ + # Prevent overflow in layer norm (and re-use mean) + # On both on TFLite/QNN, the divider by (n - 1) ends up before the sum, so + # overflow is avoided. + os.path.abspath( + os.path.join(os.path.dirname(__file__), "patches", "layer_norm.diff") + ) +] +MODEL_ID = __name__.split(".")[-2] +SUPPORTED_PRETRAINED_MODELS = set(["celebahq", "places2"]) +DEFAULT_WEIGHTS = "celebahq" +MODEL_ASSET_VERSION = 2 + +IMAGE_ADDRESS = CachedWebModelAsset.from_asset_store( + MODEL_ID, MODEL_ASSET_VERSION, "test_images/test_input_image.png" +) +MASK_ADDRESS = CachedWebModelAsset.from_asset_store( + MODEL_ID, MODEL_ASSET_VERSION, "test_images/test_input_mask.png" +) + + +class AOTGAN(BaseModel): + """Exportable AOTGAN for Image inpainting""" + + def __init__(self, model: nn.Module) -> None: + super().__init__() + self.model = model + + @classmethod + def from_pretrained(cls, ckpt_name: str = DEFAULT_WEIGHTS): + if ckpt_name not in SUPPORTED_PRETRAINED_MODELS: + raise ValueError( + "Unsupported pre_trained model requested. Please provide either 'celeabhq' or 'places2'." + ) + downloaded_model_path = CachedWebModelAsset.from_asset_store( + MODEL_ID, + MODEL_ASSET_VERSION, + f"pretrained_models/{ckpt_name}/G0000000.pt", + ).fetch() + with SourceAsRoot( + AOTGAN_SOURCE_REPOSITORY, + AOTGAN_SOURCE_REPO_COMMIT, + MODEL_ID, + MODEL_ASSET_VERSION, + source_repo_patches=AOTGAN_SOURCE_PATCHES, + ): + from src.model.aotgan import InpaintGenerator + + # AOT-GAN InpaintGenerator uses ArgParser to + # initialize model and it uses following two parameters + # - rates: default value [1, 2, 4, 8] + # - block_num: default value 8 + # creating dummy class with default values to set the same + class InpaintArgs: + def __init__(self): + self.rates = [1, 2, 4, 8] + self.block_num = 8 + + args = InpaintArgs() + model = InpaintGenerator(args) + model.load_state_dict(torch.load(downloaded_model_path, map_location="cpu")) + return cls(model) + + def forward(self, image: torch.Tensor, mask: torch.Tensor): + """ + Run AOTGAN Inpaint Generator on `image` with given `mask` + and generates new high-resolution in-painted image. + + Parameters: + image: Pixel values pre-processed of shape [N, C, H, W] + Range: float[0, 1] + 3-channel color Space: BGR + mask: Pixel values pre-processed to have have mask values either 0. or 1. + Range: float[0, 1] and only values of 0. or 1. + 1-channel binary image. + + Returns: + In-painted image for given image and mask of shape [N, C, H, W] + Range: float[0, 1] + 3-channel color space: RGB + """ + return self.model(image, mask) + + @staticmethod + def get_input_spec( + batch_size: int = 1, + num_channels: int = 3, + height: int = 512, + width: int = 512, + ) -> InputSpec: + """ + Returns the input specification (name -> (shape, type). This can be + used to submit profiling job on Qualcomm AI Hub. 
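
A quick smoke test of the wrapper on random tensors that match the default 512x512 input spec (a sketch only; real inputs should go through the repaint app preprocessing used by `sample_inputs` below):

```python
# Assumes the AOTGAN wrapper behaves as its docstrings describe:
# image is float [0, 1] with 3 channels, mask is a binary 1-channel float map.
import torch

from qai_hub_models.models.aotgan.model import AOTGAN

model = AOTGAN.from_pretrained()                    # downloads the "celebahq" checkpoint
image = torch.rand(1, 3, 512, 512)                  # float values in [0, 1]
mask = (torch.rand(1, 1, 512, 512) > 0.5).float()   # binary 0./1. mask

with torch.no_grad():
    painted = model(image, mask)                    # in-painted image, [1, 3, 512, 512]
print(painted.shape)
```
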
+ """ + return { + "image": ((batch_size, num_channels, height, width), "float32"), + "mask": ((batch_size, 1, height, width), "float32"), + } + + def sample_inputs(self, input_spec: InputSpec | None = None) -> SampleInputsType: + """ + Provides an example image of a man with a mask over the glasses. + """ + from qai_hub_models.models._shared.repaint.app import RepaintMaskApp + + image = load_image(IMAGE_ADDRESS) + mask = load_image(MASK_ADDRESS) + torch_inputs = RepaintMaskApp.preprocess_inputs(image, mask) + return {k: [v.detach().numpy()] for k, v in torch_inputs.items()} diff --git a/qai_hub_models/models/aotgan/patches/layer_norm.diff b/qai_hub_models/models/aotgan/patches/layer_norm.diff new file mode 100644 index 00000000..e67a5ad0 --- /dev/null +++ b/qai_hub_models/models/aotgan/patches/layer_norm.diff @@ -0,0 +1,14 @@ +diff --git a/src/model/aotgan.py b/src/model/aotgan.py +index 518b76c..75d96c3 100644 +--- a/src/model/aotgan.py ++++ b/src/model/aotgan.py +@@ -80,7 +80,8 @@ class AOTBlock(nn.Module): + + def my_layer_norm(feat): + mean = feat.mean((2, 3), keepdim=True) +- std = feat.std((2, 3), keepdim=True) + 1e-9 ++ num_samples = feat.shape[2] * feat.shape[3] ++ std = torch.sqrt(torch.sum((feat - mean) ** 2 / (num_samples - 1), (2, 3), keepdim=True)) + 1e-9 + feat = 2 * (feat - mean) / std - 1 + feat = 5 * feat + return feat diff --git a/qai_hub_models/models/aotgan/perf.yaml b/qai_hub_models/models/aotgan/perf.yaml new file mode 100644 index 00000000..895bc321 --- /dev/null +++ b/qai_hub_models/models/aotgan/perf.yaml @@ -0,0 +1,108 @@ +aggregated: + supported_oses: + - Android + supported_devices: + - Google Pixel 3 + - Google Pixel 3a + - Google Pixel 3a XL + - Google Pixel 4 + - Google Pixel 4a + - Google Pixel 5a 5G + - Samsung Galaxy S21 + - Samsung Galaxy S21 Ultra + - Samsung Galaxy S21+ + - Samsung Galaxy S22 5G + - Samsung Galaxy S22 Ultra 5G + - Samsung Galaxy S22+ 5G + - Samsung Galaxy S23 + - Samsung Galaxy S23 Ultra + - Samsung Galaxy S23+ + - Samsung Galaxy S24 + - Samsung Galaxy S24 Ultra + - Samsung Galaxy Tab S8 + - Xiaomi 12 + - Xiaomi 12 Pro + supported_chipsets: + - Snapdragon® 8 Gen 1 + - Snapdragon® 8 Gen 2 + - Snapdragon® 8 Gen 3 + - Snapdragon® 888 +models: +- name: AOT-GAN + performance_metrics: + - torchscript_onnx_tflite: + inference_time: 172836.0 + throughput: 5.785831655442153 + estimated_peak_memory_range: + min: 3305472 + max: 6628872 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 235 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 235 + job_id: jqpyel4gy + job_status: Passed + reference_device_info: + name: Samsung Galaxy S23 + os: '13' + form_factor: Phone + os_name: Android + manufacturer: Samsung + chipset: Snapdragon® 8 Gen 2 + timestamp: '2024-03-14T23:15:11.791489Z' + torchscript_onnx_qnn: + inference_time: 162909.0 + throughput: 6.138396282587212 + estimated_peak_memory_range: + min: 4268032 + max: 33754568 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 275 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 275 + job_id: j1p8on8g9 + job_status: Passed + - torchscript_onnx_tflite: + inference_time: 127366.0 + throughput: 7.851388910698303 + estimated_peak_memory_range: + min: 2334720 + max: 227053936 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 235 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 235 + job_id: j2p0ywegw + job_status: Passed + reference_device_info: + name: Samsung Galaxy S24 + os: '14' + form_factor: Phone + os_name: 
Android + manufacturer: Samsung + chipset: Snapdragon® 8 Gen 3 + timestamp: '2024-03-14T23:17:12.829523Z' + torchscript_onnx_qnn: + inference_time: 120027.0 + throughput: 8.331458755113433 + estimated_peak_memory_range: + min: 0 + max: 140852624 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 275 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 275 + job_id: jogkz1ogd + job_status: Passed diff --git a/qai_hub_models/models/aotgan/test.py b/qai_hub_models/models/aotgan/test.py new file mode 100644 index 00000000..302df4ef --- /dev/null +++ b/qai_hub_models/models/aotgan/test.py @@ -0,0 +1,68 @@ +# --------------------------------------------------------------------- +# Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. +# SPDX-License-Identifier: BSD-3-Clause +# --------------------------------------------------------------------- +import numpy as np +import pytest + +from qai_hub_models.models._shared.repaint.app import RepaintMaskApp +from qai_hub_models.models.aotgan.demo import main as demo_main +from qai_hub_models.models.aotgan.model import ( + AOTGAN, + IMAGE_ADDRESS, + MASK_ADDRESS, + MODEL_ASSET_VERSION, + MODEL_ID, +) +from qai_hub_models.utils.asset_loaders import CachedWebModelAsset, load_image +from qai_hub_models.utils.testing import assert_most_close, skip_clone_repo_check + +OUTPUT_ADDRESS = CachedWebModelAsset.from_asset_store( + MODEL_ID, MODEL_ASSET_VERSION, "test_images/test_output.png" +) + + +@skip_clone_repo_check +def test_task(): + app = RepaintMaskApp(AOTGAN.from_pretrained()) + + img = load_image(IMAGE_ADDRESS) + mask_image = load_image(MASK_ADDRESS) + + out_imgs = app.paint_mask_on_image(img, mask_image) + expected_out = load_image(OUTPUT_ADDRESS) + assert_most_close( + np.asarray(out_imgs[0], dtype=np.float32), + np.asarray(expected_out, dtype=np.float32), + 0.005, + rtol=0.02, + atol=1.5, + ) + + +@pytest.mark.trace +@skip_clone_repo_check +def test_trace(): + net = AOTGAN.from_pretrained() + input_spec = net.get_input_spec() + trace = net.convert_to_torchscript(input_spec) + + img = load_image(IMAGE_ADDRESS) + mask_image = load_image(MASK_ADDRESS) + app = RepaintMaskApp(trace) + + out_imgs = app.paint_mask_on_image(img, mask_image) + expected_out = load_image(OUTPUT_ADDRESS) + assert_most_close( + np.asarray(out_imgs[0], dtype=np.float32), + np.asarray(expected_out, dtype=np.float32), + 0.005, + rtol=0.02, + atol=1.5, + ) + + +@skip_clone_repo_check +def test_demo(): + # Run demo and verify it does not crash + demo_main(is_test=True) diff --git a/qai_hub_models/models/baichuan_7b_quantized/README.md b/qai_hub_models/models/baichuan_7b_quantized/README.md index 58a190d9..e7fb6962 100644 --- a/qai_hub_models/models/baichuan_7b_quantized/README.md +++ b/qai_hub_models/models/baichuan_7b_quantized/README.md @@ -10,7 +10,7 @@ This is based on the implementation of Baichuan-7B found export suitable to run on Qualcomm® devices. More details on model performance accross various devices, can be found [here](https://aihub.qualcomm.com/models/baichuan_7b_quantized). -[Sign up](https://aihub.qualcomm.com/) for early access to run these models on +[Sign up](https://myaccount.qualcomm.com/signup) for early access to run these models on a hosted Qualcomm® device. @@ -18,7 +18,7 @@ a hosted Qualcomm® device. ## License - The license for the original implementation of Baichuan-7B can be found [here](https://github.com/baichuan-inc/Baichuan-7B/blob/main/LICENSE). 
-- The license for the compiled assets for on-device deployment can be found [here](https://qaihub-public-assets.s3.us-west-2.amazonaws.com/qai-hub-models/Qualcomm+AI+Hub+Proprietary+License.pdf). +- The license for the compiled assets for on-device deployment can be found [here]({deploy_license_url}) ## References * [Baichuan 2: Open Large-scale Language Models](https://arxiv.org/abs/2309.10305) diff --git a/qai_hub_models/models/baichuan_7b_quantized/info.yaml b/qai_hub_models/models/baichuan_7b_quantized/info.yaml index 4fb26cc4..9d4c7a7d 100644 --- a/qai_hub_models/models/baichuan_7b_quantized/info.yaml +++ b/qai_hub_models/models/baichuan_7b_quantized/info.yaml @@ -17,6 +17,7 @@ tags: research_paper: https://arxiv.org/abs/2309.10305 research_paper_title: "Baichuan 2: Open Large-scale Language Models" license: https://github.com/baichuan-inc/Baichuan-7B/blob/main/LICENSE +deploy_license: https://github.com/baichuan-inc/Baichuan-7B/blob/main/LICENSE source_repo: https://github.com/baichuan-inc/Baichuan-7B/ technical_details: Number of parameters: 7B @@ -40,6 +41,7 @@ form_factors: - Phone - Tablet has_static_banner: yes -has_animated_banner: yes +has_animated_banner: no license_type: apache-2.0 +deploy_license_type: apache-2.0 dataset: [] diff --git a/qai_hub_models/models/common.py b/qai_hub_models/models/common.py new file mode 100644 index 00000000..bf635e0e --- /dev/null +++ b/qai_hub_models/models/common.py @@ -0,0 +1,24 @@ +# --------------------------------------------------------------------- +# Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. +# SPDX-License-Identifier: BSD-3-Clause +# --------------------------------------------------------------------- +from enum import Enum +from typing import Dict, List + +import numpy as np + + +class TargetRuntime(Enum): + TFLITE = 0 + QNN = 1 + + def __str__(self): + return self.name.lower() + + +class SourceModelFormat(Enum): + ONNX = 0 + TORCHSCRIPT = 1 + + +SampleInputsType = Dict[str, List[np.ndarray]] diff --git a/qai_hub_models/models/controlnet_quantized/README.md b/qai_hub_models/models/controlnet_quantized/README.md index 9c93c519..5224338a 100644 --- a/qai_hub_models/models/controlnet_quantized/README.md +++ b/qai_hub_models/models/controlnet_quantized/README.md @@ -10,7 +10,7 @@ This is based on the implementation of ControlNet found export suitable to run on Qualcomm® devices. More details on model performance accross various devices, can be found [here](https://aihub.qualcomm.com/models/controlnet_quantized). -[Sign up](https://aihub.qualcomm.com/) for early access to run these models on +[Sign up](https://myaccount.qualcomm.com/signup) for early access to run these models on a hosted Qualcomm® device. @@ -30,7 +30,7 @@ python -m qai_hub_models.models.controlnet_quantized.demo More details on the CLI tool can be found with the `--help` option. See [demo.py](demo.py) for sample usage of the model including pre/post processing scripts. Please refer to our [general instructions on using -models](../../#qai-hub-models) for more usage instructions. +models](../../../#getting-started) for more usage instructions. ## Export for on-device deployment @@ -46,7 +46,7 @@ script requires access to Deployment instructions for Qualcomm® AI Hub. ## License - The license for the original implementation of ControlNet can be found [here](https://github.com/lllyasviel/ControlNet/blob/main/LICENSE). 
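
The new `qai_hub_models/models/common.py` module above is small but referenced throughout this patch; a short illustration of its shared types:

```python
# TargetRuntime stringifies to a lowercase runtime name, and SampleInputsType
# is the dict-of-named-numpy-batches shape used by sample_inputs() methods.
import numpy as np

from qai_hub_models.models.common import SampleInputsType, TargetRuntime

print(str(TargetRuntime.TFLITE))  # -> "tflite"
print(str(TargetRuntime.QNN))     # -> "qnn"

sample: SampleInputsType = {
    "image": [np.zeros((1, 3, 512, 512), dtype=np.float32)],
    "mask": [np.zeros((1, 1, 512, 512), dtype=np.float32)],
}
```
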
-- The license for the compiled assets for on-device deployment can be found [here](https://qaihub-public-assets.s3.us-west-2.amazonaws.com/qai-hub-models/Qualcomm+AI+Hub+Proprietary+License.pdf). +- The license for the compiled assets for on-device deployment can be found [here]({deploy_license_url}) ## References * [Adding Conditional Control to Text-to-Image Diffusion Models](https://arxiv.org/abs/2302.05543) diff --git a/qai_hub_models/models/controlnet_quantized/export.py b/qai_hub_models/models/controlnet_quantized/export.py index b83cfb58..3bc3a0a0 100644 --- a/qai_hub_models/models/controlnet_quantized/export.py +++ b/qai_hub_models/models/controlnet_quantized/export.py @@ -9,13 +9,13 @@ import warnings from pathlib import Path -from typing import List, Mapping, Optional, Tuple +from typing import Any, Dict, List, Mapping, Optional, Tuple, cast import qai_hub as hub from qai_hub_models.models.controlnet_quantized import Model from qai_hub_models.utils.args import export_parser -from qai_hub_models.utils.base_model import TargetRuntime +from qai_hub_models.utils.base_model import BasePrecompiledModel, TargetRuntime from qai_hub_models.utils.printing import print_profile_metrics_from_job from qai_hub_models.utils.qai_hub_helpers import ( can_access_qualcomm_ai_hub, @@ -23,16 +23,16 @@ ) ALL_COMPONENTS = [ - "Text-Encoder-Quantized", - "UNet-Quantized", - "VAE-Decoder-Quantized", - "ControlNet-Quantized", + "TextEncoder_Quantized", + "UNet_Quantized", + "VAEDecoder_Quantized", + "ControlNet_Quantized", ] DEFAULT_COMPONENTS = [ - "Text-Encoder-Quantized", - "VAE-Decoder-Quantized", - "UNet-Quantized", - "ControlNet-Quantized", + "TextEncoder_Quantized", + "VAEDecoder_Quantized", + "UNet_Quantized", + "ControlNet_Quantized", ] @@ -85,9 +85,9 @@ def export_model( output_path = Path(output_dir or Path.cwd() / "build" / model_name) component_arg = components components = components or DEFAULT_COMPONENTS - for component in components: - if component not in ALL_COMPONENTS: - raise ValueError(f"Invalid component {component}.") + for component_name in components: + if component_name not in ALL_COMPONENTS: + raise ValueError(f"Invalid component {component_name}.") if not can_access_qualcomm_ai_hub(): return export_without_hub_access( "controlnet_quantized", @@ -104,18 +104,19 @@ def export_model( component_arg, ) + target_runtime = TargetRuntime.TFLITE # 1. Initialize model print("Initializing model class") model = Model.from_precompiled() - components_dict = {} - if "Text-Encoder-Quantized" in components: - components_dict["Text-Encoder-Quantized"] = model.text_encoder - if "UNet-Quantized" in components: - components_dict["UNet-Quantized"] = model.unet - if "VAE-Decoder-Quantized" in components: - components_dict["VAE-Decoder-Quantized"] = model.vae_decoder - if "ControlNet-Quantized" in components: - components_dict["ControlNet-Quantized"] = model.controlnet + components_dict: Dict[str, BasePrecompiledModel] = {} + if "TextEncoder_Quantized" in components: + components_dict["TextEncoder_Quantized"] = model.text_encoder # type: ignore + if "UNet_Quantized" in components: + components_dict["UNet_Quantized"] = model.unet # type: ignore + if "VAEDecoder_Quantized" in components: + components_dict["VAEDecoder_Quantized"] = model.vae_decoder # type: ignore + if "ControlNet_Quantized" in components: + components_dict["ControlNet_Quantized"] = model.controlnet # type: ignore # 2. Upload model assets to hub print("Uploading model assets on hub") @@ -126,39 +127,51 @@ def export_model( ) # 3. 
Profile the model assets on real devices - profile_jobs = {} + profile_jobs: Dict[str, hub.client.ProfileJob] = {} if not skip_profiling: for component_name in components: + profile_options_all = components_dict[ + component_name + ].get_hub_profile_options(target_runtime, profile_options) print(f"Profiling model {component_name} on a hosted device.") - profile_jobs[component_name] = hub.submit_profile_job( + submitted_profile_job = hub.submit_profile_job( model=uploaded_models[component_name], device=hub.Device(device), - name=f"{component_name}", - options=profile_options, + name=f"{model_name}_{component_name}", + options=profile_options_all, + ) + profile_jobs[component_name] = cast( + hub.client.ProfileJob, submitted_profile_job ) # 4. Run inference on-device with sample inputs - inference_jobs = {} + inference_jobs: Dict[str, hub.client.InferenceJob] = {} if not skip_inferencing: for component_name in components: print( f"Running inference for {component_name} on a hosted device with example inputs." ) + profile_options_all = components_dict[ + component_name + ].get_hub_profile_options(target_runtime, profile_options) sample_inputs = components_dict[component_name].sample_inputs() - inference_jobs[component_name] = hub.submit_inference_job( + submitted_inference_job = hub.submit_inference_job( model=uploaded_models[component_name], inputs=sample_inputs, device=hub.Device(device), - name=f"{component_name}", - options=profile_options, + name=f"{model_name}_{component_name}", + options=profile_options_all, + ) + inference_jobs[component_name] = cast( + hub.client.InferenceJob, submitted_inference_job ) # 5. Summarize the results from profiling if not skip_summary and not skip_profiling: for component_name in components: profile_job = profile_jobs[component_name] - assert profile_job.wait().success - profile_data = profile_job.download_profile() + assert profile_job is not None and profile_job.wait().success + profile_data: Dict[str, Any] = profile_job.download_profile() # type: ignore print_profile_metrics_from_job(profile_job, profile_data) return { diff --git a/qai_hub_models/models/controlnet_quantized/info.yaml b/qai_hub_models/models/controlnet_quantized/info.yaml index d2a7df33..1bec5406 100644 --- a/qai_hub_models/models/controlnet_quantized/info.yaml +++ b/qai_hub_models/models/controlnet_quantized/info.yaml @@ -13,6 +13,7 @@ tags: research_paper: https://arxiv.org/abs/2302.05543 research_paper_title: Adding Conditional Control to Text-to-Image Diffusion Models license: https://github.com/lllyasviel/ControlNet/blob/main/LICENSE +deploy_license: https://github.com/lllyasviel/ControlNet/blob/main/LICENSE source_repo: https://github.com/lllyasviel/ControlNet technical_details: Input: Text prompt and input image as a reference @@ -35,3 +36,4 @@ has_static_banner: yes has_animated_banner: no dataset: [] license_type: apache-2.0 +deploy_license_type: apache-2.0 diff --git a/qai_hub_models/models/controlnet_quantized/model.py b/qai_hub_models/models/controlnet_quantized/model.py index 57a882a0..0abe7574 100644 --- a/qai_hub_models/models/controlnet_quantized/model.py +++ b/qai_hub_models/models/controlnet_quantized/model.py @@ -6,8 +6,9 @@ import os +from qai_hub_models.models.protocols import FromPrecompiledProtocol from qai_hub_models.utils.asset_loaders import CachedWebModelAsset -from qai_hub_models.utils.base_model import BasePrecompiledModel +from qai_hub_models.utils.base_model import BasePrecompiledModel, CollectionModel from qai_hub_models.utils.input_spec import InputSpec 
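
With the ControlNet component identifiers renamed, only a subset of the pipeline needs to be exported per run. A sketch, assuming the controlnet export script shares the `device` and `skip_*` parameters of the other generated export scripts (its full signature is not shown in this hunk):

```python
# Export only the text encoder and ControlNet components; component names must
# come from ALL_COMPONENTS or export_model raises ValueError.
from qai_hub_models.models.controlnet_quantized.export import export_model

export_model(
    device="Samsung Galaxy S23",
    components=["TextEncoder_Quantized", "ControlNet_Quantized"],
    skip_profiling=True,  # upload + inference only for this run (assumed flag)
)
```
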
MODEL_ID = __name__.split(".")[-2] @@ -19,7 +20,7 @@ CONTROL_NET = os.path.join(QNN_SDK_PREFIX, "controlnet.serialized.bin") -class ControlNetQuantized: +class ControlNetQuantized(FromPrecompiledProtocol, CollectionModel): """ ControlNet class consists of - Text Encoder @@ -55,9 +56,6 @@ class ClipVITTextEncoder(BasePrecompiledModel): and compiled into serialized binary for Qualcomm Snapdragon Gen2+. """ - def __init__(self, target_model_path) -> None: - self.target_model_path = target_model_path - @classmethod def from_precompiled(cls) -> "ClipVITTextEncoder": text_encoder_path = CachedWebModelAsset.from_asset_store( @@ -65,10 +63,8 @@ def from_precompiled(cls) -> "ClipVITTextEncoder": ).fetch() return ClipVITTextEncoder(text_encoder_path) - def get_target_model_path(self) -> str: - return self.target_model_path - - def get_input_spec(self) -> InputSpec: + @staticmethod + def get_input_spec() -> InputSpec: return {"input_1": ((1, 77), "int32")} @@ -80,9 +76,6 @@ class Unet(BasePrecompiledModel): and compiled into serialized binary for Qualcomm Snapdragon Gen2+. """ - def __init__(self, target_model_path) -> None: - self.target_model_path = target_model_path - @classmethod def from_precompiled(cls) -> "Unet": model_path = CachedWebModelAsset.from_asset_store( @@ -90,10 +83,8 @@ def from_precompiled(cls) -> "Unet": ).fetch() return Unet(model_path) - def get_target_model_path(self) -> str: - return self.target_model_path - - def get_input_spec(self) -> InputSpec: + @staticmethod + def get_input_spec() -> InputSpec: return { "input_1": ((1, 64, 64, 4), "float32"), "input_2": ((1, 1280), "float32"), @@ -122,9 +113,6 @@ class VAEDecoder(BasePrecompiledModel): and compiled into serialized binary for Qualcomm Snapdragon Gen2+. """ - def __init__(self, target_model_path) -> None: - self.target_model_path = target_model_path - @classmethod def from_precompiled(cls) -> "VAEDecoder": model_path = CachedWebModelAsset.from_asset_store( @@ -132,10 +120,8 @@ def from_precompiled(cls) -> "VAEDecoder": ).fetch() return VAEDecoder(model_path) - def get_target_model_path(self) -> str: - return self.target_model_path - - def get_input_spec(self) -> InputSpec: + @staticmethod + def get_input_spec() -> InputSpec: return {"input_1": ((1, 64, 64, 4), "float32")} @@ -147,9 +133,6 @@ class ControlNet(BasePrecompiledModel): and compiled into serialized binary for Qualcomm Snapdragon Gen2+. 
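
Because the precompiled component classes now expose `get_input_spec` as a `staticmethod`, their input shapes can be queried without fetching any model assets, for example:

```python
# Input specs are available directly on the classes after this change.
from qai_hub_models.models.controlnet_quantized.model import ClipVITTextEncoder, Unet

print(ClipVITTextEncoder.get_input_spec())  # {"input_1": ((1, 77), "int32")}
print(Unet.get_input_spec()["input_1"])     # ((1, 64, 64, 4), "float32")
```
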
""" - def __init__(self, target_model_path) -> None: - self.target_model_path = target_model_path - @classmethod def from_precompiled(cls) -> "ControlNet": model_path = CachedWebModelAsset.from_asset_store( @@ -157,10 +140,8 @@ def from_precompiled(cls) -> "ControlNet": ).fetch() return ControlNet(model_path) - def get_target_model_path(self) -> str: - return self.target_model_path - - def get_input_spec(self) -> InputSpec: + @staticmethod + def get_input_spec() -> InputSpec: return { "input_1": ((1, 64, 64, 4), "float32"), "input_2": ((1, 1280), "float32"), diff --git a/qai_hub_models/models/controlnet_quantized/requirements.txt b/qai_hub_models/models/controlnet_quantized/requirements.txt index 8d0cd0c6..83aa3d48 100644 --- a/qai_hub_models/models/controlnet_quantized/requirements.txt +++ b/qai_hub_models/models/controlnet_quantized/requirements.txt @@ -1,3 +1,2 @@ -transformers==4.31.0 +transformers==4.27.4 diffusers[torch]==0.21.4 -opencv-python==4.8.1.78 diff --git a/qai_hub_models/models/controlnet_quantized/test.py b/qai_hub_models/models/controlnet_quantized/test.py index f192cd28..18c31392 100644 --- a/qai_hub_models/models/controlnet_quantized/test.py +++ b/qai_hub_models/models/controlnet_quantized/test.py @@ -8,6 +8,11 @@ from qai_hub_models.models.controlnet_quantized.demo import main as demo_main from qai_hub_models.models.controlnet_quantized.export import export_model +from qai_hub_models.models.controlnet_quantized.model import ControlNetQuantized + + +def test_from_precompiled(): + ControlNetQuantized.from_precompiled() @pytest.mark.skip("#105 move slow_cloud and slow tests to nightly.") diff --git a/qai_hub_models/models/convnext_tiny/README.md b/qai_hub_models/models/convnext_tiny/README.md index 836052f4..3f75d201 100644 --- a/qai_hub_models/models/convnext_tiny/README.md +++ b/qai_hub_models/models/convnext_tiny/README.md @@ -10,7 +10,7 @@ This is based on the implementation of ConvNext-Tiny found export suitable to run on Qualcomm® devices. More details on model performance accross various devices, can be found [here](https://aihub.qualcomm.com/models/convnext_tiny). -[Sign up](https://aihub.qualcomm.com/) for early access to run these models on +[Sign up](https://myaccount.qualcomm.com/signup) for early access to run these models on a hosted Qualcomm® device. @@ -25,7 +25,7 @@ python -m qai_hub_models.models.convnext_tiny.demo More details on the CLI tool can be found with the `--help` option. See [demo.py](demo.py) for sample usage of the model including pre/post processing scripts. Please refer to our [general instructions on using -models](../../#qai-hub-models) for more usage instructions. +models](../../../#getting-started) for more usage instructions. ## Export for on-device deployment @@ -41,7 +41,7 @@ script requires access to Deployment instructions for Qualcomm® AI Hub. ## License - The license for the original implementation of ConvNext-Tiny can be found [here](https://github.com/pytorch/vision/blob/main/LICENSE). -- The license for the compiled assets for on-device deployment can be found [here](https://qaihub-public-assets.s3.us-west-2.amazonaws.com/qai-hub-models/Qualcomm+AI+Hub+Proprietary+License.pdf). 
+- The license for the compiled assets for on-device deployment can be found [here]({deploy_license_url}) ## References * [A ConvNet for the 2020s](https://arxiv.org/abs/2201.03545) diff --git a/qai_hub_models/models/convnext_tiny/conftest.py b/qai_hub_models/models/convnext_tiny/conftest.py new file mode 100644 index 00000000..b53df396 --- /dev/null +++ b/qai_hub_models/models/convnext_tiny/conftest.py @@ -0,0 +1,24 @@ +# --------------------------------------------------------------------- +# Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. +# SPDX-License-Identifier: BSD-3-Clause +# --------------------------------------------------------------------- +# THIS FILE WAS AUTO-GENERATED. DO NOT EDIT MANUALLY. + +from unittest.mock import patch + +import pytest + +from qai_hub_models.models.convnext_tiny import Model + + +@pytest.fixture(autouse=True) +def mock_from_pretrained(): + """ + Model.from_pretrained() can be slow. Invoke it once and cache it so all invocations + across all tests return the cached instance of the model. + """ + mock = patch( + "qai_hub_models.models.convnext_tiny.Model.from_pretrained", + return_value=Model.from_pretrained(), + ) + mock.start() diff --git a/qai_hub_models/models/convnext_tiny/demo.py b/qai_hub_models/models/convnext_tiny/demo.py index 892edb19..42a15676 100644 --- a/qai_hub_models/models/convnext_tiny/demo.py +++ b/qai_hub_models/models/convnext_tiny/demo.py @@ -3,11 +3,11 @@ # SPDX-License-Identifier: BSD-3-Clause # --------------------------------------------------------------------- from qai_hub_models.models._shared.imagenet_classifier.demo import imagenet_demo -from qai_hub_models.models.convnext_tiny.model import ConvNextTiny +from qai_hub_models.models.convnext_tiny.model import MODEL_ID, ConvNextTiny def main(is_test: bool = False): - imagenet_demo(ConvNextTiny, is_test) + imagenet_demo(ConvNextTiny, MODEL_ID, is_test) if __name__ == "__main__": diff --git a/qai_hub_models/models/convnext_tiny/export.py b/qai_hub_models/models/convnext_tiny/export.py index b2173781..4649e4ef 100644 --- a/qai_hub_models/models/convnext_tiny/export.py +++ b/qai_hub_models/models/convnext_tiny/export.py @@ -10,7 +10,7 @@ import os import warnings from pathlib import Path -from typing import List, Optional, Tuple +from typing import Any, Dict, List, Optional, Tuple, cast import qai_hub as hub import torch @@ -109,35 +109,43 @@ def export_model( ) # Trace the model - source_model = torch.jit.trace(model, make_torch_inputs(input_spec)) + source_model = torch.jit.trace(model.to("cpu"), make_torch_inputs(input_spec)) # 2. Compile the model to an on-device asset model_compile_options = model.get_hub_compile_options( target_runtime, compile_options + " --force_channel_last_input image_tensor" ) - print(f"Optimizing model {model_name} to run on-device.") - compile_job = hub.submit_compile_job( + print(f"Optimizing model {model_name} to run on-device") + submitted_compile_job = hub.submit_compile_job( model=source_model, input_specs=input_spec, device=hub.Device(device), name=model_name, options=model_compile_options, ) + compile_job = cast(hub.client.CompileJob, submitted_compile_job) # 3. 
Profile the model asset on real devices - profile_job = None + profile_job: Optional[hub.client.ProfileJob] = None if not skip_profiling: + profile_options_all = model.get_hub_profile_options( + target_runtime, profile_options + ) print(f"Profiling model {model_name} on a hosted device.") - profile_job = hub.submit_profile_job( + submitted_profile_job = hub.submit_profile_job( model=compile_job.get_target_model(), device=hub.Device(device), name=model_name, - options=profile_options, + options=profile_options_all, ) + profile_job = cast(hub.client.ProfileJob, submitted_profile_job) # 4. Run inference on-device with sample inputs - inference_job = None + inference_job: Optional[hub.client.InferenceJob] = None if not skip_inferencing: + profile_options_all = model.get_hub_profile_options( + target_runtime, profile_options + ) print( f"Running inference for {model_name} on a hosted device with example inputs." ) @@ -146,33 +154,35 @@ def export_model( hub_inputs = transpose_channel_first_to_last( "image_tensor", sample_inputs, target_runtime ) - inference_job = hub.submit_inference_job( + submitted_inference_job = hub.submit_inference_job( model=compile_job.get_target_model(), inputs=hub_inputs, device=hub.Device(device), name=model_name, - options=profile_options, + options=profile_options_all, ) + inference_job = cast(hub.client.InferenceJob, submitted_inference_job) # 5. Download the model asset to a local file if not skip_downloading: os.makedirs(output_path, exist_ok=True) - target_model = compile_job.get_target_model() + target_model: hub.Model = compile_job.get_target_model() # type: ignore target_model.download(str(output_path / f"{model_name}.tflite")) # 6. Summarize the results from profiling and inference if not skip_summary and not skip_profiling: - assert profile_job.wait().success - profile_data = profile_job.download_profile() + assert profile_job is not None and profile_job.wait().success + profile_data: Dict[str, Any] = profile_job.download_profile() # type: ignore print_profile_metrics_from_job(profile_job, profile_data) if not skip_summary and not skip_inferencing: torch_out = torch_inference(model, sample_inputs) - assert inference_job.wait().success - inference_result = inference_job.download_output_data() + assert inference_job is not None and inference_job.wait().success + inference_result: hub.client.DatasetEntries = inference_job.download_output_data() # type: ignore print_inference_metrics(inference_job, inference_result, torch_out) - print_on_target_demo_cmd(compile_job, Path(__file__).parent.resolve(), device) + if not skip_summary: + print_on_target_demo_cmd(compile_job, Path(__file__).parent.resolve(), device) return (compile_job, profile_job, inference_job) diff --git a/qai_hub_models/models/convnext_tiny/info.yaml b/qai_hub_models/models/convnext_tiny/info.yaml index 70292699..f3e7e9e1 100644 --- a/qai_hub_models/models/convnext_tiny/info.yaml +++ b/qai_hub_models/models/convnext_tiny/info.yaml @@ -12,6 +12,7 @@ tags: [] research_paper: https://arxiv.org/abs/2201.03545 research_paper_title: A ConvNet for the 2020s license: https://github.com/pytorch/vision/blob/main/LICENSE +deploy_license: https://qaihub-public-assets.s3.us-west-2.amazonaws.com/qai-hub-models/Qualcomm+AI+Hub+Proprietary+License.pdf source_repo: https://github.com/pytorch/vision/blob/main/torchvision/models/convnext.py technical_details: Model checkpoint: Imagenet @@ -33,6 +34,7 @@ related_models: has_static_banner: yes has_animated_banner: yes license_type: bsd-3-clause +deploy_license_type: 
AI Model Hub License dataset: - imagenet-1k - imagenet-22k diff --git a/qai_hub_models/models/convnext_tiny/model.py b/qai_hub_models/models/convnext_tiny/model.py index 8b838392..3e6079f1 100644 --- a/qai_hub_models/models/convnext_tiny/model.py +++ b/qai_hub_models/models/convnext_tiny/model.py @@ -14,6 +14,6 @@ class ConvNextTiny(ImagenetClassifier): @classmethod - def from_pretrained(cls, weights: str = DEFAULT_WEIGHTS) -> ImagenetClassifier: + def from_pretrained(cls, weights: str = DEFAULT_WEIGHTS) -> ConvNextTiny: net = tv_models.convnext_tiny(weights=weights) return cls(net) diff --git a/qai_hub_models/models/convnext_tiny/perf.yaml b/qai_hub_models/models/convnext_tiny/perf.yaml index e76fd904..d850c5bf 100644 --- a/qai_hub_models/models/convnext_tiny/perf.yaml +++ b/qai_hub_models/models/convnext_tiny/perf.yaml @@ -17,22 +17,25 @@ aggregated: - Samsung Galaxy S23 - Samsung Galaxy S23 Ultra - Samsung Galaxy S23+ + - Samsung Galaxy S24 + - Samsung Galaxy S24 Ultra - Samsung Galaxy Tab S8 - Xiaomi 12 - Xiaomi 12 Pro supported_chipsets: - Snapdragon® 8 Gen 1 - Snapdragon® 8 Gen 2 + - Snapdragon® 8 Gen 3 - Snapdragon® 888 models: - name: ConvNext-Tiny performance_metrics: - torchscript_onnx_tflite: - inference_time: 11532.0 - throughput: 86.71522719389525 + inference_time: 11538.0 + throughput: 86.67013347200555 estimated_peak_memory_range: - min: 339968 - max: 2817216 + min: 53248 + max: 2750320 primary_compute_unit: NPU precision: fp16 layer_info: @@ -40,8 +43,16 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 380 - job_id: jmg9zykqp + job_id: jnp10l25q job_status: Passed + reference_device_info: + name: Samsung Galaxy S23 + os: '13' + form_factor: Phone + os_name: Android + manufacturer: Samsung + chipset: Snapdragon® 8 Gen 2 + timestamp: '2024-03-14T23:26:23.235644Z' torchscript_onnx_qnn: inference_time: 'null' throughput: 'null' @@ -57,11 +68,41 @@ models: total_layers: 0 job_id: '' job_status: Skipped + - torchscript_onnx_tflite: + inference_time: 8123.0 + throughput: 123.10722639418934 + estimated_peak_memory_range: + min: 40960 + max: 205818960 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 380 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 380 + job_id: jvgdw9e5j + job_status: Passed reference_device_info: - name: Samsung Galaxy S23 Ultra - os: '13' + name: Samsung Galaxy S24 + os: '14' form_factor: Phone os_name: Android manufacturer: Samsung - chipset: Snapdragon® 8 Gen 2 - timestamp: '2024-02-21T16:23:45.273161Z' + chipset: Snapdragon® 8 Gen 3 + timestamp: '2024-03-14T23:26:23.235670Z' + torchscript_onnx_qnn: + inference_time: 'null' + throughput: 'null' + estimated_peak_memory_range: + min: 0 + max: 0 + primary_compute_unit: 'null' + precision: 'null' + layer_info: + layers_on_npu: 0 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 0 + job_id: '' + job_status: Skipped diff --git a/qai_hub_models/models/convnext_tiny/test.py b/qai_hub_models/models/convnext_tiny/test.py index b45cb350..5b0b2b77 100644 --- a/qai_hub_models/models/convnext_tiny/test.py +++ b/qai_hub_models/models/convnext_tiny/test.py @@ -2,6 +2,8 @@ # Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. 
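
A consistency note on the perf.yaml entries: the reported throughput appears to be `1e6 / inference_time`, i.e. `inference_time` is in microseconds. This unit is an assumption (it is not documented in the YAML itself), but it checks out against the values listed above:

```python
# Spot-check: throughput (inferences/s) == 1e6 / inference_time (microseconds).
for inference_time_us, reported in [
    (172836.0, 5.785831655442153),    # AOT-GAN, Galaxy S23, TFLite
    (11538.0, 86.67013347200555),     # ConvNext-Tiny, Galaxy S23, TFLite
    (8123.0, 123.10722639418934),     # ConvNext-Tiny, Galaxy S24, TFLite
]:
    assert abs(1e6 / inference_time_us - reported) < 1e-6 * reported
```
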
# SPDX-License-Identifier: BSD-3-Clause # --------------------------------------------------------------------- +import pytest + from qai_hub_models.models._shared.imagenet_classifier.test_utils import ( run_imagenet_classifier_test, run_imagenet_classifier_trace_test, @@ -14,6 +16,7 @@ def test_task(): run_imagenet_classifier_test(ConvNextTiny.from_pretrained(), MODEL_ID) +@pytest.mark.trace def test_trace(): run_imagenet_classifier_trace_test(ConvNextTiny.from_pretrained()) diff --git a/qai_hub_models/models/ddrnet23_slim/README.md b/qai_hub_models/models/ddrnet23_slim/README.md index c69a3a18..374c2f4a 100644 --- a/qai_hub_models/models/ddrnet23_slim/README.md +++ b/qai_hub_models/models/ddrnet23_slim/README.md @@ -10,7 +10,7 @@ This is based on the implementation of DDRNet23-Slim found export suitable to run on Qualcomm® devices. More details on model performance accross various devices, can be found [here](https://aihub.qualcomm.com/models/ddrnet23_slim). -[Sign up](https://aihub.qualcomm.com/) for early access to run these models on +[Sign up](https://myaccount.qualcomm.com/signup) for early access to run these models on a hosted Qualcomm® device. @@ -25,7 +25,7 @@ python -m qai_hub_models.models.ddrnet23_slim.demo More details on the CLI tool can be found with the `--help` option. See [demo.py](demo.py) for sample usage of the model including pre/post processing scripts. Please refer to our [general instructions on using -models](../../#qai-hub-models) for more usage instructions. +models](../../../#getting-started) for more usage instructions. ## Export for on-device deployment @@ -41,7 +41,7 @@ script requires access to Deployment instructions for Qualcomm® AI Hub. ## License - The license for the original implementation of DDRNet23-Slim can be found [here](https://github.com/chenjun2hao/DDRNet.pytorch/blob/main/LICENSE). -- The license for the compiled assets for on-device deployment can be found [here](https://qaihub-public-assets.s3.us-west-2.amazonaws.com/qai-hub-models/Qualcomm+AI+Hub+Proprietary+License.pdf). +- The license for the compiled assets for on-device deployment can be found [here]({deploy_license_url}) ## References * [Deep Dual-resolution Networks for Real-time and Accurate Semantic Segmentation of Road Scenes](https://arxiv.org/abs/2101.06085) diff --git a/qai_hub_models/models/ddrnet23_slim/conftest.py b/qai_hub_models/models/ddrnet23_slim/conftest.py new file mode 100644 index 00000000..311a5aa2 --- /dev/null +++ b/qai_hub_models/models/ddrnet23_slim/conftest.py @@ -0,0 +1,26 @@ +# --------------------------------------------------------------------- +# Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. +# SPDX-License-Identifier: BSD-3-Clause +# --------------------------------------------------------------------- +# THIS FILE WAS AUTO-GENERATED. DO NOT EDIT MANUALLY. + +from unittest.mock import patch + +import pytest + +from qai_hub_models.models.ddrnet23_slim import Model +from qai_hub_models.utils.testing import skip_clone_repo_check + + +@pytest.fixture(autouse=True) +@skip_clone_repo_check +def mock_from_pretrained(): + """ + Model.from_pretrained() can be slow. Invoke it once and cache it so all invocations + across all tests return the cached instance of the model. 
+ """ + mock = patch( + "qai_hub_models.models.ddrnet23_slim.Model.from_pretrained", + return_value=Model.from_pretrained(), + ) + mock.start() diff --git a/qai_hub_models/models/ddrnet23_slim/demo.py b/qai_hub_models/models/ddrnet23_slim/demo.py index c97479cd..f6006e1a 100644 --- a/qai_hub_models/models/ddrnet23_slim/demo.py +++ b/qai_hub_models/models/ddrnet23_slim/demo.py @@ -36,8 +36,8 @@ def main(is_test: bool = False): help="image file path or URL", ) args = parser.parse_args([] if is_test else None) - model = demo_model_from_cli_args(DDRNet, args) - validate_on_device_demo_args(args, DDRNet.get_model_id()) + model = demo_model_from_cli_args(DDRNet, MODEL_ID, args) + validate_on_device_demo_args(args, MODEL_ID) # Load image (_, _, height, width) = DDRNet.get_input_spec()["image"][0] diff --git a/qai_hub_models/models/ddrnet23_slim/export.py b/qai_hub_models/models/ddrnet23_slim/export.py index ef4a4df7..62378aea 100644 --- a/qai_hub_models/models/ddrnet23_slim/export.py +++ b/qai_hub_models/models/ddrnet23_slim/export.py @@ -10,7 +10,7 @@ import os import warnings from pathlib import Path -from typing import List, Optional, Tuple +from typing import Any, Dict, List, Optional, Tuple, cast import qai_hub as hub import torch @@ -110,7 +110,7 @@ def export_model( ) # Trace the model - source_model = torch.jit.trace(model, make_torch_inputs(input_spec)) + source_model = torch.jit.trace(model.to("cpu"), make_torch_inputs(input_spec)) # 2. Compile the model to an on-device asset model_compile_options = model.get_hub_compile_options( @@ -119,29 +119,37 @@ def export_model( + " --force_channel_last_input image" + " --force_channel_last_output output_0", ) - print(f"Optimizing model {model_name} to run on-device.") - compile_job = hub.submit_compile_job( + print(f"Optimizing model {model_name} to run on-device") + submitted_compile_job = hub.submit_compile_job( model=source_model, input_specs=input_spec, device=hub.Device(device), name=model_name, options=model_compile_options, ) + compile_job = cast(hub.client.CompileJob, submitted_compile_job) # 3. Profile the model asset on real devices - profile_job = None + profile_job: Optional[hub.client.ProfileJob] = None if not skip_profiling: + profile_options_all = model.get_hub_profile_options( + target_runtime, profile_options + ) print(f"Profiling model {model_name} on a hosted device.") - profile_job = hub.submit_profile_job( + submitted_profile_job = hub.submit_profile_job( model=compile_job.get_target_model(), device=hub.Device(device), name=model_name, - options=profile_options, + options=profile_options_all, ) + profile_job = cast(hub.client.ProfileJob, submitted_profile_job) # 4. Run inference on-device with sample inputs - inference_job = None + inference_job: Optional[hub.client.InferenceJob] = None if not skip_inferencing: + profile_options_all = model.get_hub_profile_options( + target_runtime, profile_options + ) print( f"Running inference for {model_name} on a hosted device with example inputs." ) @@ -150,37 +158,39 @@ def export_model( hub_inputs = transpose_channel_first_to_last( "image", sample_inputs, target_runtime ) - inference_job = hub.submit_inference_job( + submitted_inference_job = hub.submit_inference_job( model=compile_job.get_target_model(), inputs=hub_inputs, device=hub.Device(device), name=model_name, - options=profile_options, + options=profile_options_all, ) + inference_job = cast(hub.client.InferenceJob, submitted_inference_job) # 5. 
Download the model asset to a local file if not skip_downloading: os.makedirs(output_path, exist_ok=True) - target_model = compile_job.get_target_model() + target_model: hub.Model = compile_job.get_target_model() # type: ignore target_model.download(str(output_path / f"{model_name}.tflite")) # 6. Summarize the results from profiling and inference if not skip_summary and not skip_profiling: - assert profile_job.wait().success - profile_data = profile_job.download_profile() + assert profile_job is not None and profile_job.wait().success + profile_data: Dict[str, Any] = profile_job.download_profile() # type: ignore print_profile_metrics_from_job(profile_job, profile_data) if not skip_summary and not skip_inferencing: torch_out = torch_inference(model, sample_inputs) - assert inference_job.wait().success - inference_result = inference_job.download_output_data() + assert inference_job is not None and inference_job.wait().success + inference_result: hub.client.DatasetEntries = inference_job.download_output_data() # type: ignore # Convert outputs from channel last to channel first inference_result = transpose_channel_last_to_first( "output_0", inference_result, target_runtime ) print_inference_metrics(inference_job, inference_result, torch_out) - print_on_target_demo_cmd(compile_job, Path(__file__).parent.resolve(), device) + if not skip_summary: + print_on_target_demo_cmd(compile_job, Path(__file__).parent.resolve(), device) return (compile_job, profile_job, inference_job) diff --git a/qai_hub_models/models/ddrnet23_slim/info.yaml b/qai_hub_models/models/ddrnet23_slim/info.yaml index 95e0fdab..cf0776a3 100644 --- a/qai_hub_models/models/ddrnet23_slim/info.yaml +++ b/qai_hub_models/models/ddrnet23_slim/info.yaml @@ -14,6 +14,7 @@ research_paper: https://arxiv.org/abs/2101.06085 research_paper_title: Deep Dual-resolution Networks for Real-time and Accurate Semantic Segmentation of Road Scenes license: https://github.com/chenjun2hao/DDRNet.pytorch/blob/main/LICENSE +deploy_license: https://qaihub-public-assets.s3.us-west-2.amazonaws.com/qai-hub-models/Qualcomm+AI+Hub+Proprietary+License.pdf source_repo: https://github.com/chenjun2hao/DDRNet.pytorch technical_details: Model checkpoint: DDRNet23s_imagenet.pth @@ -33,5 +34,6 @@ form_factors: has_static_banner: yes has_animated_banner: yes license_type: mit +deploy_license_type: AI Model Hub License dataset: - cityscapes diff --git a/qai_hub_models/models/ddrnet23_slim/perf.yaml b/qai_hub_models/models/ddrnet23_slim/perf.yaml index 1f405dd9..277e6c15 100644 --- a/qai_hub_models/models/ddrnet23_slim/perf.yaml +++ b/qai_hub_models/models/ddrnet23_slim/perf.yaml @@ -17,22 +17,25 @@ aggregated: - Samsung Galaxy S23 - Samsung Galaxy S23 Ultra - Samsung Galaxy S23+ + - Samsung Galaxy S24 + - Samsung Galaxy S24 Ultra - Samsung Galaxy Tab S8 - Xiaomi 12 - Xiaomi 12 Pro supported_chipsets: - Snapdragon® 8 Gen 1 - Snapdragon® 8 Gen 2 + - Snapdragon® 8 Gen 3 - Snapdragon® 888 models: - name: DDRNet23-Slim performance_metrics: - torchscript_onnx_tflite: - inference_time: 6736.0 - throughput: 148.45605700712588 + inference_time: 6741.0 + throughput: 148.3459427384661 estimated_peak_memory_range: - min: 991232 - max: 3246040 + min: 1024000 + max: 28696320 primary_compute_unit: NPU precision: fp16 layer_info: @@ -40,8 +43,16 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 131 - job_id: jvgddqv6g + job_id: jz5wo7zp1 job_status: Passed + reference_device_info: + name: Samsung Galaxy S23 + os: '13' + form_factor: Phone + os_name: Android + manufacturer: 
Samsung + chipset: Snapdragon® 8 Gen 2 + timestamp: '2024-03-14T23:17:36.932886Z' torchscript_onnx_qnn: inference_time: 'null' throughput: 'null' @@ -57,11 +68,41 @@ models: total_layers: 0 job_id: '' job_status: Skipped + - torchscript_onnx_tflite: + inference_time: 4644.0 + throughput: 215.33161068044788 + estimated_peak_memory_range: + min: 45056 + max: 68954288 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 131 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 131 + job_id: jmg9vmq57 + job_status: Passed reference_device_info: - name: Samsung Galaxy S23 Ultra - os: '13' + name: Samsung Galaxy S24 + os: '14' form_factor: Phone os_name: Android manufacturer: Samsung - chipset: Snapdragon® 8 Gen 2 - timestamp: '2024-02-21T16:12:22.404643Z' + chipset: Snapdragon® 8 Gen 3 + timestamp: '2024-03-14T23:17:36.932896Z' + torchscript_onnx_qnn: + inference_time: 'null' + throughput: 'null' + estimated_peak_memory_range: + min: 0 + max: 0 + primary_compute_unit: 'null' + precision: 'null' + layer_info: + layers_on_npu: 0 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 0 + job_id: '' + job_status: Skipped diff --git a/qai_hub_models/models/ddrnet23_slim/test.py b/qai_hub_models/models/ddrnet23_slim/test.py index 5833b4d1..bbffdb0d 100644 --- a/qai_hub_models/models/ddrnet23_slim/test.py +++ b/qai_hub_models/models/ddrnet23_slim/test.py @@ -3,6 +3,7 @@ # SPDX-License-Identifier: BSD-3-Clause # --------------------------------------------------------------------- import numpy as np +import pytest from qai_hub_models.models.ddrnet23_slim.app import DDRNetApp from qai_hub_models.models.ddrnet23_slim.demo import INPUT_IMAGE_ADDRESS @@ -33,6 +34,7 @@ def test_task(): ) +@pytest.mark.trace @skip_clone_repo_check def test_trace(): app = DDRNetApp(DDRNet.from_pretrained().convert_to_torchscript()) diff --git a/qai_hub_models/models/deeplabv3_resnet50/README.md b/qai_hub_models/models/deeplabv3_resnet50/README.md index 75610be5..a12383b3 100644 --- a/qai_hub_models/models/deeplabv3_resnet50/README.md +++ b/qai_hub_models/models/deeplabv3_resnet50/README.md @@ -10,7 +10,7 @@ This is based on the implementation of DeepLabV3-ResNet50 found export suitable to run on Qualcomm® devices. More details on model performance accross various devices, can be found [here](https://aihub.qualcomm.com/models/deeplabv3_resnet50). -[Sign up](https://aihub.qualcomm.com/) for early access to run these models on +[Sign up](https://myaccount.qualcomm.com/signup) for early access to run these models on a hosted Qualcomm® device. @@ -25,7 +25,7 @@ python -m qai_hub_models.models.deeplabv3_resnet50.demo More details on the CLI tool can be found with the `--help` option. See [demo.py](demo.py) for sample usage of the model including pre/post processing scripts. Please refer to our [general instructions on using -models](../../#qai-hub-models) for more usage instructions. +models](../../../#getting-started) for more usage instructions. ## Export for on-device deployment @@ -41,7 +41,7 @@ script requires access to Deployment instructions for Qualcomm® AI Hub. ## License - The license for the original implementation of DeepLabV3-ResNet50 can be found [here](https://github.com/pytorch/vision/blob/main/LICENSE). -- The license for the compiled assets for on-device deployment can be found [here](https://qaihub-public-assets.s3.us-west-2.amazonaws.com/qai-hub-models/Qualcomm+AI+Hub+Proprietary+License.pdf). 
+- The license for the compiled assets for on-device deployment can be found [here]({deploy_license_url}) ## References * [Rethinking Atrous Convolution for Semantic Image Segmentation](https://arxiv.org/abs/1706.05587) diff --git a/qai_hub_models/models/deeplabv3_resnet50/conftest.py b/qai_hub_models/models/deeplabv3_resnet50/conftest.py new file mode 100644 index 00000000..bc194115 --- /dev/null +++ b/qai_hub_models/models/deeplabv3_resnet50/conftest.py @@ -0,0 +1,26 @@ +# --------------------------------------------------------------------- +# Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. +# SPDX-License-Identifier: BSD-3-Clause +# --------------------------------------------------------------------- +# THIS FILE WAS AUTO-GENERATED. DO NOT EDIT MANUALLY. + +from unittest.mock import patch + +import pytest + +from qai_hub_models.models.deeplabv3_resnet50 import Model +from qai_hub_models.utils.testing import skip_clone_repo_check + + +@pytest.fixture(autouse=True) +@skip_clone_repo_check +def mock_from_pretrained(): + """ + Model.from_pretrained() can be slow. Invoke it once and cache it so all invocations + across all tests return the cached instance of the model. + """ + mock = patch( + "qai_hub_models.models.deeplabv3_resnet50.Model.from_pretrained", + return_value=Model.from_pretrained(), + ) + mock.start() diff --git a/qai_hub_models/models/deeplabv3_resnet50/demo.py b/qai_hub_models/models/deeplabv3_resnet50/demo.py index e182f4ae..5c5afb78 100644 --- a/qai_hub_models/models/deeplabv3_resnet50/demo.py +++ b/qai_hub_models/models/deeplabv3_resnet50/demo.py @@ -20,7 +20,9 @@ def main(is_test: bool = False): - deeplabv3_demo(DeepLabV3_ResNet50, INPUT_IMAGE_ADDRESS, NUM_CLASSES, is_test) + deeplabv3_demo( + DeepLabV3_ResNet50, MODEL_ID, INPUT_IMAGE_ADDRESS, NUM_CLASSES, is_test + ) if __name__ == "__main__": diff --git a/qai_hub_models/models/deeplabv3_resnet50/export.py b/qai_hub_models/models/deeplabv3_resnet50/export.py index 2bacc87c..ff57a6fc 100644 --- a/qai_hub_models/models/deeplabv3_resnet50/export.py +++ b/qai_hub_models/models/deeplabv3_resnet50/export.py @@ -10,7 +10,7 @@ import os import warnings from pathlib import Path -from typing import List, Optional, Tuple +from typing import Any, Dict, List, Optional, Tuple, cast import qai_hub as hub import torch @@ -109,7 +109,7 @@ def export_model( ) # Trace the model - source_model = torch.jit.trace(model, make_torch_inputs(input_spec)) + source_model = torch.jit.trace(model.to("cpu"), make_torch_inputs(input_spec)) # 2. Compile the model to an on-device asset model_compile_options = model.get_hub_compile_options( @@ -118,29 +118,37 @@ def export_model( + " --force_channel_last_input image" + " --force_channel_last_output output_0,output_1", ) - print(f"Optimizing model {model_name} to run on-device.") - compile_job = hub.submit_compile_job( + print(f"Optimizing model {model_name} to run on-device") + submitted_compile_job = hub.submit_compile_job( model=source_model, input_specs=input_spec, device=hub.Device(device), name=model_name, options=model_compile_options, ) + compile_job = cast(hub.client.CompileJob, submitted_compile_job) # 3. 
Profile the model asset on real devices - profile_job = None + profile_job: Optional[hub.client.ProfileJob] = None if not skip_profiling: + profile_options_all = model.get_hub_profile_options( + target_runtime, profile_options + ) print(f"Profiling model {model_name} on a hosted device.") - profile_job = hub.submit_profile_job( + submitted_profile_job = hub.submit_profile_job( model=compile_job.get_target_model(), device=hub.Device(device), name=model_name, - options=profile_options + " --compute_unit gpu", + options=profile_options_all, ) + profile_job = cast(hub.client.ProfileJob, submitted_profile_job) # 4. Run inference on-device with sample inputs - inference_job = None + inference_job: Optional[hub.client.InferenceJob] = None if not skip_inferencing: + profile_options_all = model.get_hub_profile_options( + target_runtime, profile_options + ) print( f"Running inference for {model_name} on a hosted device with example inputs." ) @@ -149,30 +157,31 @@ def export_model( hub_inputs = transpose_channel_first_to_last( "image", sample_inputs, target_runtime ) - inference_job = hub.submit_inference_job( + submitted_inference_job = hub.submit_inference_job( model=compile_job.get_target_model(), inputs=hub_inputs, device=hub.Device(device), name=model_name, - options=profile_options + " --compute_unit gpu", + options=profile_options_all, ) + inference_job = cast(hub.client.InferenceJob, submitted_inference_job) # 5. Download the model asset to a local file if not skip_downloading: os.makedirs(output_path, exist_ok=True) - target_model = compile_job.get_target_model() + target_model: hub.Model = compile_job.get_target_model() # type: ignore target_model.download(str(output_path / f"{model_name}.tflite")) # 6. Summarize the results from profiling and inference if not skip_summary and not skip_profiling: - assert profile_job.wait().success - profile_data = profile_job.download_profile() + assert profile_job is not None and profile_job.wait().success + profile_data: Dict[str, Any] = profile_job.download_profile() # type: ignore print_profile_metrics_from_job(profile_job, profile_data) if not skip_summary and not skip_inferencing: torch_out = torch_inference(model, sample_inputs) - assert inference_job.wait().success - inference_result = inference_job.download_output_data() + assert inference_job is not None and inference_job.wait().success + inference_result: hub.client.DatasetEntries = inference_job.download_output_data() # type: ignore # Convert outputs from channel last to channel first inference_result = transpose_channel_last_to_first( "output_0,output_1", inference_result, target_runtime diff --git a/qai_hub_models/models/deeplabv3_resnet50/info.yaml b/qai_hub_models/models/deeplabv3_resnet50/info.yaml index 2e150816..68562270 100644 --- a/qai_hub_models/models/deeplabv3_resnet50/info.yaml +++ b/qai_hub_models/models/deeplabv3_resnet50/info.yaml @@ -11,6 +11,7 @@ tags: [] research_paper: https://arxiv.org/abs/1706.05587 research_paper_title: Rethinking Atrous Convolution for Semantic Image Segmentation license: https://github.com/pytorch/vision/blob/main/LICENSE +deploy_license: https://qaihub-public-assets.s3.us-west-2.amazonaws.com/qai-hub-models/Qualcomm+AI+Hub+Proprietary+License.pdf source_repo: https://github.com/pytorch/vision/blob/main/torchvision/models/segmentation/deeplabv3.py technical_details: @@ -32,4 +33,5 @@ form_factors: has_static_banner: yes has_animated_banner: yes license_type: bsd-3-clause +deploy_license_type: AI Model Hub License dataset: [] diff --git 
a/qai_hub_models/models/deeplabv3_resnet50/model.py b/qai_hub_models/models/deeplabv3_resnet50/model.py index 3514cc0d..0fc2e6b1 100644 --- a/qai_hub_models/models/deeplabv3_resnet50/model.py +++ b/qai_hub_models/models/deeplabv3_resnet50/model.py @@ -9,7 +9,7 @@ from qai_hub_models.evaluators.base_evaluators import BaseEvaluator from qai_hub_models.models._shared.deeplab.evaluator import DeepLabV3Evaluator -from qai_hub_models.utils.base_model import BaseModel +from qai_hub_models.utils.base_model import BaseModel, TargetRuntime from qai_hub_models.utils.input_spec import InputSpec MODEL_ID = __name__.split(".")[-2] @@ -62,3 +62,19 @@ def get_input_spec( # This can be used with the qai_hub python API to declare # the model input specification upon submitting a profile job. return {"image": ((batch_size, num_channels, height, width), "float32")} + + def get_hub_compile_options( + self, target_runtime: TargetRuntime, other_compile_options: str = "" + ) -> str: + compile_options = super().get_hub_compile_options( + target_runtime, other_compile_options + ) + return compile_options + " --compute_unit gpu" + + def get_hub_profile_options( + self, target_runtime: TargetRuntime, other_profile_options: str = "" + ) -> str: + profile_options = super().get_hub_profile_options( + target_runtime, other_profile_options + ) + return profile_options + " --compute_unit gpu" diff --git a/qai_hub_models/models/deeplabv3_resnet50/perf.yaml b/qai_hub_models/models/deeplabv3_resnet50/perf.yaml index eeccef3a..1d0853f7 100644 --- a/qai_hub_models/models/deeplabv3_resnet50/perf.yaml +++ b/qai_hub_models/models/deeplabv3_resnet50/perf.yaml @@ -17,22 +17,25 @@ aggregated: - Samsung Galaxy S23 - Samsung Galaxy S23 Ultra - Samsung Galaxy S23+ + - Samsung Galaxy S24 + - Samsung Galaxy S24 Ultra - Samsung Galaxy Tab S8 - Xiaomi 12 - Xiaomi 12 Pro supported_chipsets: - Snapdragon® 8 Gen 1 - Snapdragon® 8 Gen 2 + - Snapdragon® 8 Gen 3 - Snapdragon® 888 models: - name: DeepLabV3-ResNet50 performance_metrics: - torchscript_onnx_tflite: - inference_time: 57759.0 - throughput: 17.313319136411643 + inference_time: 57559.0 + throughput: 17.373477649021005 estimated_peak_memory_range: - min: 12288 - max: 171360368 + min: 106496 + max: 3561872 primary_compute_unit: GPU precision: fp16 layer_info: @@ -40,14 +43,22 @@ models: layers_on_gpu: 96 layers_on_cpu: 0 total_layers: 96 - job_id: jqp4ydxqp + job_id: jw5663y5o job_status: Passed + reference_device_info: + name: Samsung Galaxy S23 + os: '13' + form_factor: Phone + os_name: Android + manufacturer: Samsung + chipset: Snapdragon® 8 Gen 2 + timestamp: '2024-03-14T23:49:36.627925Z' torchscript_onnx_qnn: - inference_time: 146022.0 - throughput: 6.848283135417951 + inference_time: 145372.0 + throughput: 6.878903777893955 estimated_peak_memory_range: - min: 806912 - max: 9532744 + min: 724992 + max: 17276040 primary_compute_unit: GPU precision: fp16 layer_info: @@ -55,13 +66,43 @@ models: layers_on_gpu: 82 layers_on_cpu: 0 total_layers: 82 - job_id: j0pxl67jp + job_id: jwgoy1k58 + job_status: Passed + - torchscript_onnx_tflite: + inference_time: 40153.0 + throughput: 24.904739371902473 + estimated_peak_memory_range: + min: 4358144 + max: 29236608 + primary_compute_unit: GPU + precision: fp16 + layer_info: + layers_on_npu: 0 + layers_on_gpu: 96 + layers_on_cpu: 0 + total_layers: 96 + job_id: j1p3k4n52 job_status: Passed reference_device_info: - name: Samsung Galaxy S23 Ultra - os: '13' + name: Samsung Galaxy S24 + os: '14' form_factor: Phone os_name: Android manufacturer: Samsung - 
chipset: Snapdragon® 8 Gen 2 - timestamp: '2024-02-21T16:31:27.279356Z' + chipset: Snapdragon® 8 Gen 3 + timestamp: '2024-03-14T23:51:31.429028Z' + torchscript_onnx_qnn: + inference_time: 104457.0 + throughput: 9.573317250160352 + estimated_peak_memory_range: + min: 675840 + max: 24520160 + primary_compute_unit: GPU + precision: fp16 + layer_info: + layers_on_npu: 0 + layers_on_gpu: 82 + layers_on_cpu: 0 + total_layers: 82 + job_id: j1pv31r5x + job_status: Passed diff --git a/qai_hub_models/models/deeplabv3_resnet50/test.py b/qai_hub_models/models/deeplabv3_resnet50/test.py index cfff53bf..72a58132 100644 --- a/qai_hub_models/models/deeplabv3_resnet50/test.py +++ b/qai_hub_models/models/deeplabv3_resnet50/test.py @@ -3,6 +3,7 @@ # SPDX-License-Identifier: BSD-3-Clause # --------------------------------------------------------------------- import numpy as np +import pytest from qai_hub_models.models._shared.deeplab.app import DeepLabV3App from qai_hub_models.models.deeplabv3_resnet50.demo import INPUT_IMAGE_ADDRESS @@ -37,6 +38,7 @@ def test_task(): ) +@pytest.mark.trace @skip_clone_repo_check def test_trace(): image = load_image(INPUT_IMAGE_ADDRESS) diff --git a/qai_hub_models/models/densenet121/README.md b/qai_hub_models/models/densenet121/README.md index a586b0d2..2689b6a2 100644 --- a/qai_hub_models/models/densenet121/README.md +++ b/qai_hub_models/models/densenet121/README.md @@ -10,7 +10,7 @@ This is based on the implementation of DenseNet-121 found export suitable to run on Qualcomm® devices. More details on model performance accross various devices, can be found [here](https://aihub.qualcomm.com/models/densenet121). -[Sign up](https://aihub.qualcomm.com/) for early access to run these models on +[Sign up](https://myaccount.qualcomm.com/signup) for early access to run these models on a hosted Qualcomm® device. @@ -25,7 +25,7 @@ python -m qai_hub_models.models.densenet121.demo More details on the CLI tool can be found with the `--help` option. See [demo.py](demo.py) for sample usage of the model including pre/post processing scripts. Please refer to our [general instructions on using -models](../../#qai-hub-models) for more usage instructions. +models](../../../#getting-started) for more usage instructions. ## Export for on-device deployment @@ -41,7 +41,7 @@ script requires access to Deployment instructions for Qualcomm® AI Hub. ## License - The license for the original implementation of DenseNet-121 can be found [here](https://github.com/pytorch/vision/blob/main/LICENSE). -- The license for the compiled assets for on-device deployment can be found [here](https://qaihub-public-assets.s3.us-west-2.amazonaws.com/qai-hub-models/Qualcomm+AI+Hub+Proprietary+License.pdf). +- The license for the compiled assets for on-device deployment can be found [here]({deploy_license_url}) ## References * [Densely Connected Convolutional Networks](https://arxiv.org/abs/1608.06993) diff --git a/qai_hub_models/models/densenet121/conftest.py b/qai_hub_models/models/densenet121/conftest.py new file mode 100644 index 00000000..81b42e00 --- /dev/null +++ b/qai_hub_models/models/densenet121/conftest.py @@ -0,0 +1,24 @@ +# --------------------------------------------------------------------- +# Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. +# SPDX-License-Identifier: BSD-3-Clause +# --------------------------------------------------------------------- +# THIS FILE WAS AUTO-GENERATED. DO NOT EDIT MANUALLY. 
+ +from unittest.mock import patch + +import pytest + +from qai_hub_models.models.densenet121 import Model + + +@pytest.fixture(autouse=True) +def mock_from_pretrained(): + """ + Model.from_pretrained() can be slow. Invoke it once and cache it so all invocations + across all tests return the cached instance of the model. + """ + mock = patch( + "qai_hub_models.models.densenet121.Model.from_pretrained", + return_value=Model.from_pretrained(), + ) + mock.start() diff --git a/qai_hub_models/models/densenet121/demo.py b/qai_hub_models/models/densenet121/demo.py index 72ba762b..d18d86bd 100644 --- a/qai_hub_models/models/densenet121/demo.py +++ b/qai_hub_models/models/densenet121/demo.py @@ -3,11 +3,11 @@ # SPDX-License-Identifier: BSD-3-Clause # --------------------------------------------------------------------- from qai_hub_models.models._shared.imagenet_classifier.demo import imagenet_demo -from qai_hub_models.models.densenet121.model import DenseNet +from qai_hub_models.models.densenet121.model import MODEL_ID, DenseNet def main(is_test: bool = False): - imagenet_demo(DenseNet, is_test) + imagenet_demo(DenseNet, MODEL_ID, is_test) if __name__ == "__main__": diff --git a/qai_hub_models/models/densenet121/export.py b/qai_hub_models/models/densenet121/export.py index 32e11a53..5d02faa4 100644 --- a/qai_hub_models/models/densenet121/export.py +++ b/qai_hub_models/models/densenet121/export.py @@ -10,7 +10,7 @@ import os import warnings from pathlib import Path -from typing import List, Optional, Tuple +from typing import Any, Dict, List, Optional, Tuple, cast import qai_hub as hub import torch @@ -109,35 +109,43 @@ def export_model( ) # Trace the model - source_model = torch.jit.trace(model, make_torch_inputs(input_spec)) + source_model = torch.jit.trace(model.to("cpu"), make_torch_inputs(input_spec)) # 2. Compile the model to an on-device asset model_compile_options = model.get_hub_compile_options( target_runtime, compile_options + " --force_channel_last_input image_tensor" ) - print(f"Optimizing model {model_name} to run on-device.") - compile_job = hub.submit_compile_job( + print(f"Optimizing model {model_name} to run on-device") + submitted_compile_job = hub.submit_compile_job( model=source_model, input_specs=input_spec, device=hub.Device(device), name=model_name, options=model_compile_options, ) + compile_job = cast(hub.client.CompileJob, submitted_compile_job) # 3. Profile the model asset on real devices - profile_job = None + profile_job: Optional[hub.client.ProfileJob] = None if not skip_profiling: + profile_options_all = model.get_hub_profile_options( + target_runtime, profile_options + ) print(f"Profiling model {model_name} on a hosted device.") - profile_job = hub.submit_profile_job( + submitted_profile_job = hub.submit_profile_job( model=compile_job.get_target_model(), device=hub.Device(device), name=model_name, - options=profile_options, + options=profile_options_all, ) + profile_job = cast(hub.client.ProfileJob, submitted_profile_job) # 4. Run inference on-device with sample inputs - inference_job = None + inference_job: Optional[hub.client.InferenceJob] = None if not skip_inferencing: + profile_options_all = model.get_hub_profile_options( + target_runtime, profile_options + ) print( f"Running inference for {model_name} on a hosted device with example inputs." 
) @@ -146,33 +154,35 @@ def export_model( hub_inputs = transpose_channel_first_to_last( "image_tensor", sample_inputs, target_runtime ) - inference_job = hub.submit_inference_job( + submitted_inference_job = hub.submit_inference_job( model=compile_job.get_target_model(), inputs=hub_inputs, device=hub.Device(device), name=model_name, - options=profile_options, + options=profile_options_all, ) + inference_job = cast(hub.client.InferenceJob, submitted_inference_job) # 5. Download the model asset to a local file if not skip_downloading: os.makedirs(output_path, exist_ok=True) - target_model = compile_job.get_target_model() + target_model: hub.Model = compile_job.get_target_model() # type: ignore target_model.download(str(output_path / f"{model_name}.tflite")) # 6. Summarize the results from profiling and inference if not skip_summary and not skip_profiling: - assert profile_job.wait().success - profile_data = profile_job.download_profile() + assert profile_job is not None and profile_job.wait().success + profile_data: Dict[str, Any] = profile_job.download_profile() # type: ignore print_profile_metrics_from_job(profile_job, profile_data) if not skip_summary and not skip_inferencing: torch_out = torch_inference(model, sample_inputs) - assert inference_job.wait().success - inference_result = inference_job.download_output_data() + assert inference_job is not None and inference_job.wait().success + inference_result: hub.client.DatasetEntries = inference_job.download_output_data() # type: ignore print_inference_metrics(inference_job, inference_result, torch_out) - print_on_target_demo_cmd(compile_job, Path(__file__).parent.resolve(), device) + if not skip_summary: + print_on_target_demo_cmd(compile_job, Path(__file__).parent.resolve(), device) return (compile_job, profile_job, inference_job) diff --git a/qai_hub_models/models/densenet121/info.yaml b/qai_hub_models/models/densenet121/info.yaml index 7eb5a937..9d1dda3b 100644 --- a/qai_hub_models/models/densenet121/info.yaml +++ b/qai_hub_models/models/densenet121/info.yaml @@ -13,6 +13,7 @@ tags: research_paper: https://arxiv.org/abs/1608.06993 research_paper_title: Densely Connected Convolutional Networks license: https://github.com/pytorch/vision/blob/main/LICENSE +deploy_license: https://qaihub-public-assets.s3.us-west-2.amazonaws.com/qai-hub-models/Qualcomm+AI+Hub+Proprietary+License.pdf source_repo: https://github.com/pytorch/vision/blob/main/torchvision/models/densenet.py technical_details: Model checkpoint: Imagenet @@ -34,6 +35,7 @@ form_factors: has_static_banner: yes has_animated_banner: yes license_type: bsd-3-clause +deploy_license_type: AI Model Hub License dataset: - imagenet-1k - imagenet-22k diff --git a/qai_hub_models/models/densenet121/model.py b/qai_hub_models/models/densenet121/model.py index 79faf024..dacbe0fe 100644 --- a/qai_hub_models/models/densenet121/model.py +++ b/qai_hub_models/models/densenet121/model.py @@ -14,6 +14,6 @@ class DenseNet(ImagenetClassifier): @classmethod - def from_pretrained(cls, weights: str = DEFAULT_WEIGHTS) -> ImagenetClassifier: + def from_pretrained(cls, weights: str = DEFAULT_WEIGHTS) -> DenseNet: net = tv_models.densenet121(weights=weights) return cls(net) diff --git a/qai_hub_models/models/densenet121/perf.yaml b/qai_hub_models/models/densenet121/perf.yaml index ed91b04c..42d62485 100644 --- a/qai_hub_models/models/densenet121/perf.yaml +++ b/qai_hub_models/models/densenet121/perf.yaml @@ -17,22 +17,25 @@ aggregated: - Samsung Galaxy S23 - Samsung Galaxy S23 Ultra - Samsung Galaxy S23+ + - 
Samsung Galaxy S24 + - Samsung Galaxy S24 Ultra - Samsung Galaxy Tab S8 - Xiaomi 12 - Xiaomi 12 Pro supported_chipsets: - Snapdragon® 8 Gen 1 - Snapdragon® 8 Gen 2 + - Snapdragon® 8 Gen 3 - Snapdragon® 888 models: - name: DenseNet-121 performance_metrics: - torchscript_onnx_tflite: - inference_time: 1605.0 - throughput: 623.0529595015577 + inference_time: 1603.0 + throughput: 623.8303181534623 estimated_peak_memory_range: - min: 28672 - max: 20688688 + min: 16384 + max: 20547528 primary_compute_unit: NPU precision: fp16 layer_info: @@ -40,14 +43,22 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 310 - job_id: jlpe7w275 + job_id: jqpyen0gy job_status: Passed + reference_device_info: + name: Samsung Galaxy S23 + os: '13' + form_factor: Phone + os_name: Android + manufacturer: Samsung + chipset: Snapdragon® 8 Gen 2 + timestamp: '2024-03-14T23:35:22.500705Z' torchscript_onnx_qnn: - inference_time: 1449.0 - throughput: 690.1311249137336 + inference_time: 1436.0 + throughput: 696.3788300835655 estimated_peak_memory_range: - min: 73728 - max: 209142552 + min: 618496 + max: 5887960 primary_compute_unit: NPU precision: fp16 layer_info: @@ -55,13 +66,43 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 371 - job_id: jygzljwz5 + job_id: j1p8o6qg9 + job_status: Passed + - torchscript_onnx_tflite: + inference_time: 1114.0 + throughput: 897.6660682226212 + estimated_peak_memory_range: + min: 12288 + max: 93424064 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 310 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 310 + job_id: j2p0yd0gw job_status: Passed reference_device_info: - name: Samsung Galaxy S23 Ultra - os: '13' + name: Samsung Galaxy S24 + os: '14' form_factor: Phone os_name: Android manufacturer: Samsung - chipset: Snapdragon® 8 Gen 2 - timestamp: '2024-02-21T16:21:38.990133Z' + chipset: Snapdragon® 8 Gen 3 + timestamp: '2024-03-14T23:39:50.803809Z' + torchscript_onnx_qnn: + inference_time: 985.0 + throughput: 1015.2284263959391 + estimated_peak_memory_range: + min: 618496 + max: 142978448 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 371 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 371 + job_id: jn5q8ze57 + job_status: Passed diff --git a/qai_hub_models/models/densenet121/test.py b/qai_hub_models/models/densenet121/test.py index 82b584c4..647eaf92 100644 --- a/qai_hub_models/models/densenet121/test.py +++ b/qai_hub_models/models/densenet121/test.py @@ -2,6 +2,8 @@ # Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. # SPDX-License-Identifier: BSD-3-Clause # --------------------------------------------------------------------- +import pytest + from qai_hub_models.models._shared.imagenet_classifier.test_utils import ( run_imagenet_classifier_test, run_imagenet_classifier_trace_test, @@ -14,6 +16,7 @@ def test_task(): run_imagenet_classifier_test(DenseNet.from_pretrained(), MODEL_ID) +@pytest.mark.trace def test_trace(): run_imagenet_classifier_trace_test(DenseNet.from_pretrained()) diff --git a/qai_hub_models/models/detr_resnet101/README.md b/qai_hub_models/models/detr_resnet101/README.md index 54ddfe1b..58a8d4e3 100644 --- a/qai_hub_models/models/detr_resnet101/README.md +++ b/qai_hub_models/models/detr_resnet101/README.md @@ -10,7 +10,7 @@ This is based on the implementation of DETR-ResNet101 found export suitable to run on Qualcomm® devices. More details on model performance accross various devices, can be found [here](https://aihub.qualcomm.com/models/detr_resnet101). 
-[Sign up](https://aihub.qualcomm.com/) for early access to run these models on +[Sign up](https://myaccount.qualcomm.com/signup) for early access to run these models on a hosted Qualcomm® device. @@ -30,7 +30,7 @@ python -m qai_hub_models.models.detr_resnet101.demo More details on the CLI tool can be found with the `--help` option. See [demo.py](demo.py) for sample usage of the model including pre/post processing scripts. Please refer to our [general instructions on using -models](../../#qai-hub-models) for more usage instructions. +models](../../../#getting-started) for more usage instructions. ## Export for on-device deployment @@ -46,7 +46,7 @@ script requires access to Deployment instructions for Qualcomm® AI Hub. ## License - The license for the original implementation of DETR-ResNet101 can be found [here](https://github.com/facebookresearch/detr/blob/main/LICENSE). -- The license for the compiled assets for on-device deployment can be found [here](https://qaihub-public-assets.s3.us-west-2.amazonaws.com/qai-hub-models/Qualcomm+AI+Hub+Proprietary+License.pdf). +- The license for the compiled assets for on-device deployment can be found [here]({deploy_license_url}) ## References * [End-to-End Object Detection with Transformers](https://arxiv.org/abs/2005.12872) diff --git a/qai_hub_models/models/detr_resnet101/conftest.py b/qai_hub_models/models/detr_resnet101/conftest.py new file mode 100644 index 00000000..cc6456c1 --- /dev/null +++ b/qai_hub_models/models/detr_resnet101/conftest.py @@ -0,0 +1,24 @@ +# --------------------------------------------------------------------- +# Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. +# SPDX-License-Identifier: BSD-3-Clause +# --------------------------------------------------------------------- +# THIS FILE WAS AUTO-GENERATED. DO NOT EDIT MANUALLY. + +from unittest.mock import patch + +import pytest + +from qai_hub_models.models.detr_resnet101 import Model + + +@pytest.fixture(autouse=True) +def mock_from_pretrained(): + """ + Model.from_pretrained() can be slow. Invoke it once and cache it so all invocations + across all tests return the cached instance of the model. + """ + mock = patch( + "qai_hub_models.models.detr_resnet101.Model.from_pretrained", + return_value=Model.from_pretrained(), + ) + mock.start() diff --git a/qai_hub_models/models/detr_resnet101/demo.py b/qai_hub_models/models/detr_resnet101/demo.py index d9ed9cc0..d84dbdbb 100644 --- a/qai_hub_models/models/detr_resnet101/demo.py +++ b/qai_hub_models/models/detr_resnet101/demo.py @@ -19,7 +19,7 @@ # Run DETR app end-to-end on a sample image. # The demo will display the predicted mask in a window. def main(is_test: bool = False): - detr_demo(DETRResNet101, DEFAULT_WEIGHTS, IMAGE_ADDRESS, is_test) + detr_demo(DETRResNet101, MODEL_ID, DEFAULT_WEIGHTS, IMAGE_ADDRESS, is_test) if __name__ == "__main__": diff --git a/qai_hub_models/models/detr_resnet101/export.py b/qai_hub_models/models/detr_resnet101/export.py index a3e36a66..f46ca4e5 100644 --- a/qai_hub_models/models/detr_resnet101/export.py +++ b/qai_hub_models/models/detr_resnet101/export.py @@ -10,7 +10,7 @@ import os import warnings from pathlib import Path -from typing import List, Optional, Tuple +from typing import Any, Dict, List, Optional, Tuple, cast import qai_hub as hub import torch @@ -109,35 +109,43 @@ def export_model( ) # Trace the model - source_model = torch.jit.trace(model, make_torch_inputs(input_spec)) + source_model = torch.jit.trace(model.to("cpu"), make_torch_inputs(input_spec)) # 2. 
Compile the model to an on-device asset model_compile_options = model.get_hub_compile_options( target_runtime, compile_options + " --force_channel_last_input image" ) - print(f"Optimizing model {model_name} to run on-device.") - compile_job = hub.submit_compile_job( + print(f"Optimizing model {model_name} to run on-device") + submitted_compile_job = hub.submit_compile_job( model=source_model, input_specs=input_spec, device=hub.Device(device), name=model_name, options=model_compile_options, ) + compile_job = cast(hub.client.CompileJob, submitted_compile_job) # 3. Profile the model asset on real devices - profile_job = None + profile_job: Optional[hub.client.ProfileJob] = None if not skip_profiling: + profile_options_all = model.get_hub_profile_options( + target_runtime, profile_options + ) print(f"Profiling model {model_name} on a hosted device.") - profile_job = hub.submit_profile_job( + submitted_profile_job = hub.submit_profile_job( model=compile_job.get_target_model(), device=hub.Device(device), name=model_name, - options=profile_options, + options=profile_options_all, ) + profile_job = cast(hub.client.ProfileJob, submitted_profile_job) # 4. Run inference on-device with sample inputs - inference_job = None + inference_job: Optional[hub.client.InferenceJob] = None if not skip_inferencing: + profile_options_all = model.get_hub_profile_options( + target_runtime, profile_options + ) print( f"Running inference for {model_name} on a hosted device with example inputs." ) @@ -146,33 +154,35 @@ def export_model( hub_inputs = transpose_channel_first_to_last( "image", sample_inputs, target_runtime ) - inference_job = hub.submit_inference_job( + submitted_inference_job = hub.submit_inference_job( model=compile_job.get_target_model(), inputs=hub_inputs, device=hub.Device(device), name=model_name, - options=profile_options, + options=profile_options_all, ) + inference_job = cast(hub.client.InferenceJob, submitted_inference_job) # 5. Download the model asset to a local file if not skip_downloading: os.makedirs(output_path, exist_ok=True) - target_model = compile_job.get_target_model() + target_model: hub.Model = compile_job.get_target_model() # type: ignore target_model.download(str(output_path / f"{model_name}.tflite")) # 6. 
Summarize the results from profiling and inference if not skip_summary and not skip_profiling: - assert profile_job.wait().success - profile_data = profile_job.download_profile() + assert profile_job is not None and profile_job.wait().success + profile_data: Dict[str, Any] = profile_job.download_profile() # type: ignore print_profile_metrics_from_job(profile_job, profile_data) if not skip_summary and not skip_inferencing: torch_out = torch_inference(model, sample_inputs) - assert inference_job.wait().success - inference_result = inference_job.download_output_data() + assert inference_job is not None and inference_job.wait().success + inference_result: hub.client.DatasetEntries = inference_job.download_output_data() # type: ignore print_inference_metrics(inference_job, inference_result, torch_out) - print_on_target_demo_cmd(compile_job, Path(__file__).parent.resolve(), device) + if not skip_summary: + print_on_target_demo_cmd(compile_job, Path(__file__).parent.resolve(), device) return (compile_job, profile_job, inference_job) diff --git a/qai_hub_models/models/detr_resnet101/info.yaml b/qai_hub_models/models/detr_resnet101/info.yaml index 45c4e48e..f7bc400e 100644 --- a/qai_hub_models/models/detr_resnet101/info.yaml +++ b/qai_hub_models/models/detr_resnet101/info.yaml @@ -11,6 +11,7 @@ use_case: Object Detection research_paper: https://arxiv.org/abs/2005.12872 research_paper_title: End-to-End Object Detection with Transformers license: https://github.com/facebookresearch/detr/blob/main/LICENSE +deploy_license: https://qaihub-public-assets.s3.us-west-2.amazonaws.com/qai-hub-models/Qualcomm+AI+Hub+Proprietary+License.pdf source_repo: https://github.com/facebookresearch/detr technical_details: Model checkpoint: ResNet101 @@ -32,5 +33,6 @@ form_factors: has_static_banner: yes has_animated_banner: yes license_type: apache-2.0 +deploy_license_type: AI Model Hub License dataset: - detection-datasets/coco diff --git a/qai_hub_models/models/detr_resnet101/perf.yaml b/qai_hub_models/models/detr_resnet101/perf.yaml index 701f35ef..ab365fa3 100644 --- a/qai_hub_models/models/detr_resnet101/perf.yaml +++ b/qai_hub_models/models/detr_resnet101/perf.yaml @@ -2,6 +2,12 @@ aggregated: supported_oses: - Android supported_devices: + - Google Pixel 3 + - Google Pixel 3a + - Google Pixel 3a XL + - Google Pixel 4 + - Google Pixel 4a + - Google Pixel 5a 5G - Samsung Galaxy S21 - Samsung Galaxy S21 Ultra - Samsung Galaxy S21+ @@ -11,22 +17,25 @@ aggregated: - Samsung Galaxy S23 - Samsung Galaxy S23 Ultra - Samsung Galaxy S23+ + - Samsung Galaxy S24 + - Samsung Galaxy S24 Ultra - Samsung Galaxy Tab S8 - Xiaomi 12 - Xiaomi 12 Pro supported_chipsets: - Snapdragon® 8 Gen 1 - Snapdragon® 8 Gen 2 + - Snapdragon® 8 Gen 3 - Snapdragon® 888 models: - name: DETR-ResNet101 performance_metrics: - torchscript_onnx_tflite: - inference_time: 640294.0 - throughput: 1.5617825561382739 + inference_time: 563957.0 + throughput: 1.7731848350140171 estimated_peak_memory_range: - min: 107266048 - max: 111542968 + min: 102526976 + max: 112477944 primary_compute_unit: CPU precision: fp32 layer_info: @@ -34,8 +43,16 @@ models: layers_on_gpu: 0 layers_on_cpu: 957 total_layers: 957 - job_id: jz5wl39zp + job_id: jmg9v8m57 job_status: Passed + reference_device_info: + name: Samsung Galaxy S23 + os: '13' + form_factor: Phone + os_name: Android + manufacturer: Samsung + chipset: Snapdragon® 8 Gen 2 + timestamp: '2024-03-14T23:10:49.800332Z' torchscript_onnx_qnn: inference_time: 'null' throughput: 'null' @@ -51,11 +68,41 @@ models: total_layers: 
0 job_id: '' job_status: Skipped + - torchscript_onnx_tflite: + inference_time: 489867.0 + throughput: 2.0413704127855112 + estimated_peak_memory_range: + min: 109977600 + max: 266823568 + primary_compute_unit: CPU + precision: fp32 + layer_info: + layers_on_npu: 0 + layers_on_gpu: 0 + layers_on_cpu: 957 + total_layers: 957 + job_id: jnp103n5q + job_status: Passed reference_device_info: - name: Samsung Galaxy S23 Ultra - os: '13' + name: Samsung Galaxy S24 + os: '14' form_factor: Phone os_name: Android manufacturer: Samsung - chipset: Snapdragon® 8 Gen 2 - timestamp: '2024-02-21T16:15:32.226652Z' + chipset: Snapdragon® 8 Gen 3 + timestamp: '2024-03-14T23:10:49.800340Z' + torchscript_onnx_qnn: + inference_time: 'null' + throughput: 'null' + estimated_peak_memory_range: + min: 0 + max: 0 + primary_compute_unit: 'null' + precision: 'null' + layer_info: + layers_on_npu: 0 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 0 + job_id: '' + job_status: Skipped diff --git a/qai_hub_models/models/detr_resnet101/requirements.txt b/qai_hub_models/models/detr_resnet101/requirements.txt index 3582ec2c..f9dfeb47 100644 --- a/qai_hub_models/models/detr_resnet101/requirements.txt +++ b/qai_hub_models/models/detr_resnet101/requirements.txt @@ -1,2 +1,2 @@ -transformers==4.31.0 -timm==0.9.7 +transformers==4.27.4 +timm==0.9.11 diff --git a/qai_hub_models/models/detr_resnet101/test.py b/qai_hub_models/models/detr_resnet101/test.py index 7a9b8da1..3b87d3bd 100644 --- a/qai_hub_models/models/detr_resnet101/test.py +++ b/qai_hub_models/models/detr_resnet101/test.py @@ -2,6 +2,8 @@ # Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. # SPDX-License-Identifier: BSD-3-Clause # --------------------------------------------------------------------- +import pytest + from qai_hub_models.models._shared.detr.app import DETRApp from qai_hub_models.models.detr_resnet101.demo import MODEL_ASSET_VERSION, MODEL_ID from qai_hub_models.models.detr_resnet101.demo import main as demo_main @@ -20,6 +22,7 @@ def test_task(): assert set(list(label.numpy())) == {75, 63, 17} +@pytest.mark.trace def test_trace(): net = DETRResNet101.from_pretrained(DEFAULT_WEIGHTS).convert_to_torchscript() img = load_image(IMAGE_ADDRESS) diff --git a/qai_hub_models/models/detr_resnet101_dc5/README.md b/qai_hub_models/models/detr_resnet101_dc5/README.md index 8e4b4c6b..68300db5 100644 --- a/qai_hub_models/models/detr_resnet101_dc5/README.md +++ b/qai_hub_models/models/detr_resnet101_dc5/README.md @@ -10,7 +10,7 @@ This is based on the implementation of DETR-ResNet101-DC5 found export suitable to run on Qualcomm® devices. More details on model performance accross various devices, can be found [here](https://aihub.qualcomm.com/models/detr_resnet101_dc5). -[Sign up](https://aihub.qualcomm.com/) for early access to run these models on +[Sign up](https://myaccount.qualcomm.com/signup) for early access to run these models on a hosted Qualcomm® device. @@ -30,7 +30,7 @@ python -m qai_hub_models.models.detr_resnet101_dc5.demo More details on the CLI tool can be found with the `--help` option. See [demo.py](demo.py) for sample usage of the model including pre/post processing scripts. Please refer to our [general instructions on using -models](../../#qai-hub-models) for more usage instructions. +models](../../../#getting-started) for more usage instructions. ## Export for on-device deployment @@ -46,7 +46,7 @@ script requires access to Deployment instructions for Qualcomm® AI Hub. 
## License - The license for the original implementation of DETR-ResNet101-DC5 can be found [here](https://github.com/facebookresearch/detr/blob/main/LICENSE). -- The license for the compiled assets for on-device deployment can be found [here](https://qaihub-public-assets.s3.us-west-2.amazonaws.com/qai-hub-models/Qualcomm+AI+Hub+Proprietary+License.pdf). +- The license for the compiled assets for on-device deployment can be found [here]({deploy_license_url}) ## References * [End-to-End Object Detection with Transformers](https://arxiv.org/abs/2005.12872) diff --git a/qai_hub_models/models/detr_resnet101_dc5/conftest.py b/qai_hub_models/models/detr_resnet101_dc5/conftest.py new file mode 100644 index 00000000..417445fd --- /dev/null +++ b/qai_hub_models/models/detr_resnet101_dc5/conftest.py @@ -0,0 +1,24 @@ +# --------------------------------------------------------------------- +# Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. +# SPDX-License-Identifier: BSD-3-Clause +# --------------------------------------------------------------------- +# THIS FILE WAS AUTO-GENERATED. DO NOT EDIT MANUALLY. + +from unittest.mock import patch + +import pytest + +from qai_hub_models.models.detr_resnet101_dc5 import Model + + +@pytest.fixture(autouse=True) +def mock_from_pretrained(): + """ + Model.from_pretrained() can be slow. Invoke it once and cache it so all invocations + across all tests return the cached instance of the model. + """ + mock = patch( + "qai_hub_models.models.detr_resnet101_dc5.Model.from_pretrained", + return_value=Model.from_pretrained(), + ) + mock.start() diff --git a/qai_hub_models/models/detr_resnet101_dc5/demo.py b/qai_hub_models/models/detr_resnet101_dc5/demo.py index fd286725..c90f12b1 100644 --- a/qai_hub_models/models/detr_resnet101_dc5/demo.py +++ b/qai_hub_models/models/detr_resnet101_dc5/demo.py @@ -19,7 +19,7 @@ # Run DETR app end-to-end on a sample image. # The demo will display the predicted mask in a window. def main(is_test: bool = False): - detr_demo(DETRResNet101DC5, DEFAULT_WEIGHTS, IMAGE_ADDRESS, is_test) + detr_demo(DETRResNet101DC5, MODEL_ID, DEFAULT_WEIGHTS, IMAGE_ADDRESS, is_test) if __name__ == "__main__": diff --git a/qai_hub_models/models/detr_resnet101_dc5/export.py b/qai_hub_models/models/detr_resnet101_dc5/export.py index b0b2cd5a..e8c61070 100644 --- a/qai_hub_models/models/detr_resnet101_dc5/export.py +++ b/qai_hub_models/models/detr_resnet101_dc5/export.py @@ -10,7 +10,7 @@ import os import warnings from pathlib import Path -from typing import List, Optional, Tuple +from typing import Any, Dict, List, Optional, Tuple, cast import qai_hub as hub import torch @@ -109,35 +109,43 @@ def export_model( ) # Trace the model - source_model = torch.jit.trace(model, make_torch_inputs(input_spec)) + source_model = torch.jit.trace(model.to("cpu"), make_torch_inputs(input_spec)) # 2. Compile the model to an on-device asset model_compile_options = model.get_hub_compile_options( target_runtime, compile_options + " --force_channel_last_input image" ) - print(f"Optimizing model {model_name} to run on-device.") - compile_job = hub.submit_compile_job( + print(f"Optimizing model {model_name} to run on-device") + submitted_compile_job = hub.submit_compile_job( model=source_model, input_specs=input_spec, device=hub.Device(device), name=model_name, options=model_compile_options, ) + compile_job = cast(hub.client.CompileJob, submitted_compile_job) # 3. 
Profile the model asset on real devices - profile_job = None + profile_job: Optional[hub.client.ProfileJob] = None if not skip_profiling: + profile_options_all = model.get_hub_profile_options( + target_runtime, profile_options + ) print(f"Profiling model {model_name} on a hosted device.") - profile_job = hub.submit_profile_job( + submitted_profile_job = hub.submit_profile_job( model=compile_job.get_target_model(), device=hub.Device(device), name=model_name, - options=profile_options, + options=profile_options_all, ) + profile_job = cast(hub.client.ProfileJob, submitted_profile_job) # 4. Run inference on-device with sample inputs - inference_job = None + inference_job: Optional[hub.client.InferenceJob] = None if not skip_inferencing: + profile_options_all = model.get_hub_profile_options( + target_runtime, profile_options + ) print( f"Running inference for {model_name} on a hosted device with example inputs." ) @@ -146,33 +154,35 @@ def export_model( hub_inputs = transpose_channel_first_to_last( "image", sample_inputs, target_runtime ) - inference_job = hub.submit_inference_job( + submitted_inference_job = hub.submit_inference_job( model=compile_job.get_target_model(), inputs=hub_inputs, device=hub.Device(device), name=model_name, - options=profile_options, + options=profile_options_all, ) + inference_job = cast(hub.client.InferenceJob, submitted_inference_job) # 5. Download the model asset to a local file if not skip_downloading: os.makedirs(output_path, exist_ok=True) - target_model = compile_job.get_target_model() + target_model: hub.Model = compile_job.get_target_model() # type: ignore target_model.download(str(output_path / f"{model_name}.tflite")) # 6. Summarize the results from profiling and inference if not skip_summary and not skip_profiling: - assert profile_job.wait().success - profile_data = profile_job.download_profile() + assert profile_job is not None and profile_job.wait().success + profile_data: Dict[str, Any] = profile_job.download_profile() # type: ignore print_profile_metrics_from_job(profile_job, profile_data) if not skip_summary and not skip_inferencing: torch_out = torch_inference(model, sample_inputs) - assert inference_job.wait().success - inference_result = inference_job.download_output_data() + assert inference_job is not None and inference_job.wait().success + inference_result: hub.client.DatasetEntries = inference_job.download_output_data() # type: ignore print_inference_metrics(inference_job, inference_result, torch_out) - print_on_target_demo_cmd(compile_job, Path(__file__).parent.resolve(), device) + if not skip_summary: + print_on_target_demo_cmd(compile_job, Path(__file__).parent.resolve(), device) return (compile_job, profile_job, inference_job) diff --git a/qai_hub_models/models/detr_resnet101_dc5/info.yaml b/qai_hub_models/models/detr_resnet101_dc5/info.yaml index 323c4743..e1ca9e22 100644 --- a/qai_hub_models/models/detr_resnet101_dc5/info.yaml +++ b/qai_hub_models/models/detr_resnet101_dc5/info.yaml @@ -11,6 +11,7 @@ use_case: Object Detection research_paper: https://arxiv.org/abs/2005.12872 research_paper_title: End-to-End Object Detection with Transformers license: https://github.com/facebookresearch/detr/blob/main/LICENSE +deploy_license: https://qaihub-public-assets.s3.us-west-2.amazonaws.com/qai-hub-models/Qualcomm+AI+Hub+Proprietary+License.pdf source_repo: https://github.com/facebookresearch/detr technical_details: Model checkpoint: ResNet101-DC5 @@ -32,5 +33,6 @@ form_factors: has_static_banner: yes has_animated_banner: yes license_type: 
apache-2.0 +deploy_license_type: AI Model Hub License dataset: - detection-datasets/coco diff --git a/qai_hub_models/models/detr_resnet101_dc5/perf.yaml b/qai_hub_models/models/detr_resnet101_dc5/perf.yaml index ce87325d..07e3fdc8 100644 --- a/qai_hub_models/models/detr_resnet101_dc5/perf.yaml +++ b/qai_hub_models/models/detr_resnet101_dc5/perf.yaml @@ -2,6 +2,12 @@ aggregated: supported_oses: - Android supported_devices: + - Google Pixel 3 + - Google Pixel 3a + - Google Pixel 3a XL + - Google Pixel 4 + - Google Pixel 4a + - Google Pixel 5a 5G - Samsung Galaxy S21 - Samsung Galaxy S21 Ultra - Samsung Galaxy S21+ @@ -11,22 +17,25 @@ aggregated: - Samsung Galaxy S23 - Samsung Galaxy S23 Ultra - Samsung Galaxy S23+ + - Samsung Galaxy S24 + - Samsung Galaxy S24 Ultra - Samsung Galaxy Tab S8 - Xiaomi 12 - Xiaomi 12 Pro supported_chipsets: - Snapdragon® 8 Gen 1 - Snapdragon® 8 Gen 2 + - Snapdragon® 8 Gen 3 - Snapdragon® 888 models: - name: DETR-ResNet101-DC5 performance_metrics: - torchscript_onnx_tflite: - inference_time: 971988.0 - throughput: 1.0288192858348046 + inference_time: 976351.0 + throughput: 1.0242218218652923 estimated_peak_memory_range: - min: 12288 - max: 291526464 + min: 168345600 + max: 171158408 primary_compute_unit: CPU precision: fp32 layer_info: @@ -34,8 +43,16 @@ models: layers_on_gpu: 0 layers_on_cpu: 958 total_layers: 958 - job_id: jlpe7w875 + job_id: jep28v6p6 job_status: Passed + reference_device_info: + name: Samsung Galaxy S23 + os: '13' + form_factor: Phone + os_name: Android + manufacturer: Samsung + chipset: Snapdragon® 8 Gen 2 + timestamp: '2024-03-15T00:03:43.829001Z' torchscript_onnx_qnn: inference_time: 'null' throughput: 'null' @@ -51,11 +68,41 @@ models: total_layers: 0 job_id: '' job_status: Skipped + - torchscript_onnx_tflite: + inference_time: 777938.0 + throughput: 1.2854494831207628 + estimated_peak_memory_range: + min: 175112192 + max: 339555616 + primary_compute_unit: CPU + precision: fp32 + layer_info: + layers_on_npu: 0 + layers_on_gpu: 0 + layers_on_cpu: 958 + total_layers: 958 + job_id: jqpye70gy + job_status: Passed reference_device_info: - name: Samsung Galaxy S23 Ultra - os: '13' + name: Samsung Galaxy S24 + os: '14' form_factor: Phone os_name: Android manufacturer: Samsung - chipset: Snapdragon® 8 Gen 2 - timestamp: '2024-02-21T16:40:02.166898Z' + chipset: Snapdragon® 8 Gen 3 + timestamp: '2024-03-15T00:03:43.829010Z' + torchscript_onnx_qnn: + inference_time: 'null' + throughput: 'null' + estimated_peak_memory_range: + min: 0 + max: 0 + primary_compute_unit: 'null' + precision: 'null' + layer_info: + layers_on_npu: 0 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 0 + job_id: '' + job_status: Skipped diff --git a/qai_hub_models/models/detr_resnet101_dc5/requirements.txt b/qai_hub_models/models/detr_resnet101_dc5/requirements.txt index 3582ec2c..f9dfeb47 100644 --- a/qai_hub_models/models/detr_resnet101_dc5/requirements.txt +++ b/qai_hub_models/models/detr_resnet101_dc5/requirements.txt @@ -1,2 +1,2 @@ -transformers==4.31.0 -timm==0.9.7 +transformers==4.27.4 +timm==0.9.11 diff --git a/qai_hub_models/models/detr_resnet101_dc5/test.py b/qai_hub_models/models/detr_resnet101_dc5/test.py index 64d63836..a0c0bfb4 100644 --- a/qai_hub_models/models/detr_resnet101_dc5/test.py +++ b/qai_hub_models/models/detr_resnet101_dc5/test.py @@ -2,6 +2,8 @@ # Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. 
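The regenerated export.py scripts in this patch wrap each `hub.submit_*_job` call in `typing.cast` and type the optional profile/inference handles as `Optional[...]`, guarding them with `assert ... is not None` before use. A minimal sketch of that typing pattern, using a hypothetical `Job` stand-in rather than the real `qai_hub` client classes:

```python
# Illustrative only: "Job" and "submit_job" are stand-ins, not the qai_hub API.
from typing import Optional, cast


class Job:
    def wait(self) -> "Job":
        return self

    @property
    def success(self) -> bool:
        return True


def submit_job() -> object:
    # Returns an untyped handle, as the submit_*_job helpers do in export.py.
    return Job()


skip_profiling = False
profile_job: Optional[Job] = None
if not skip_profiling:
    # cast() narrows the handle so static type checkers know its concrete type.
    profile_job = cast(Job, submit_job())

# Guard before use, mirroring `assert profile_job is not None and ...` above.
if profile_job is not None:
    assert profile_job.wait().success
```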
# SPDX-License-Identifier: BSD-3-Clause # --------------------------------------------------------------------- +import pytest + from qai_hub_models.models._shared.detr.app import DETRApp from qai_hub_models.models.detr_resnet101_dc5.demo import IMAGE_ADDRESS from qai_hub_models.models.detr_resnet101_dc5.demo import main as demo_main @@ -25,6 +27,7 @@ def test_task(): assert set(list(label.numpy())) == {75, 63, 17} +@pytest.mark.trace def test_trace(): net = DETRResNet101DC5.from_pretrained(DEFAULT_WEIGHTS).convert_to_torchscript() img = load_image(IMAGE_ADDRESS) diff --git a/qai_hub_models/models/detr_resnet50/README.md b/qai_hub_models/models/detr_resnet50/README.md index 1089a7bc..3ad8f2a2 100644 --- a/qai_hub_models/models/detr_resnet50/README.md +++ b/qai_hub_models/models/detr_resnet50/README.md @@ -10,7 +10,7 @@ This is based on the implementation of DETR-ResNet50 found export suitable to run on Qualcomm® devices. More details on model performance accross various devices, can be found [here](https://aihub.qualcomm.com/models/detr_resnet50). -[Sign up](https://aihub.qualcomm.com/) for early access to run these models on +[Sign up](https://myaccount.qualcomm.com/signup) for early access to run these models on a hosted Qualcomm® device. @@ -30,7 +30,7 @@ python -m qai_hub_models.models.detr_resnet50.demo More details on the CLI tool can be found with the `--help` option. See [demo.py](demo.py) for sample usage of the model including pre/post processing scripts. Please refer to our [general instructions on using -models](../../#qai-hub-models) for more usage instructions. +models](../../../#getting-started) for more usage instructions. ## Export for on-device deployment @@ -46,7 +46,7 @@ script requires access to Deployment instructions for Qualcomm® AI Hub. ## License - The license for the original implementation of DETR-ResNet50 can be found [here](https://github.com/facebookresearch/detr/blob/main/LICENSE). -- The license for the compiled assets for on-device deployment can be found [here](https://qaihub-public-assets.s3.us-west-2.amazonaws.com/qai-hub-models/Qualcomm+AI+Hub+Proprietary+License.pdf). +- The license for the compiled assets for on-device deployment can be found [here]({deploy_license_url}) ## References * [End-to-End Object Detection with Transformers](https://arxiv.org/abs/2005.12872) diff --git a/qai_hub_models/models/detr_resnet50/conftest.py b/qai_hub_models/models/detr_resnet50/conftest.py new file mode 100644 index 00000000..cf3b6330 --- /dev/null +++ b/qai_hub_models/models/detr_resnet50/conftest.py @@ -0,0 +1,24 @@ +# --------------------------------------------------------------------- +# Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. +# SPDX-License-Identifier: BSD-3-Clause +# --------------------------------------------------------------------- +# THIS FILE WAS AUTO-GENERATED. DO NOT EDIT MANUALLY. + +from unittest.mock import patch + +import pytest + +from qai_hub_models.models.detr_resnet50 import Model + + +@pytest.fixture(autouse=True) +def mock_from_pretrained(): + """ + Model.from_pretrained() can be slow. Invoke it once and cache it so all invocations + across all tests return the cached instance of the model. 
+ """ + mock = patch( + "qai_hub_models.models.detr_resnet50.Model.from_pretrained", + return_value=Model.from_pretrained(), + ) + mock.start() diff --git a/qai_hub_models/models/detr_resnet50/demo.py b/qai_hub_models/models/detr_resnet50/demo.py index 6fcf2a7f..169895e5 100644 --- a/qai_hub_models/models/detr_resnet50/demo.py +++ b/qai_hub_models/models/detr_resnet50/demo.py @@ -19,7 +19,7 @@ # Run DETR app end-to-end on a sample image. # The demo will display the predicted mask in a window. def main(is_test: bool = False): - detr_demo(DETRResNet50, DEFAULT_WEIGHTS, IMAGE_ADDRESS, is_test) + detr_demo(DETRResNet50, MODEL_ID, DEFAULT_WEIGHTS, IMAGE_ADDRESS, is_test) if __name__ == "__main__": diff --git a/qai_hub_models/models/detr_resnet50/export.py b/qai_hub_models/models/detr_resnet50/export.py index 7c6a0d6c..70838101 100644 --- a/qai_hub_models/models/detr_resnet50/export.py +++ b/qai_hub_models/models/detr_resnet50/export.py @@ -10,7 +10,7 @@ import os import warnings from pathlib import Path -from typing import List, Optional, Tuple +from typing import Any, Dict, List, Optional, Tuple, cast import qai_hub as hub import torch @@ -109,35 +109,43 @@ def export_model( ) # Trace the model - source_model = torch.jit.trace(model, make_torch_inputs(input_spec)) + source_model = torch.jit.trace(model.to("cpu"), make_torch_inputs(input_spec)) # 2. Compile the model to an on-device asset model_compile_options = model.get_hub_compile_options( target_runtime, compile_options + " --force_channel_last_input image" ) - print(f"Optimizing model {model_name} to run on-device.") - compile_job = hub.submit_compile_job( + print(f"Optimizing model {model_name} to run on-device") + submitted_compile_job = hub.submit_compile_job( model=source_model, input_specs=input_spec, device=hub.Device(device), name=model_name, options=model_compile_options, ) + compile_job = cast(hub.client.CompileJob, submitted_compile_job) # 3. Profile the model asset on real devices - profile_job = None + profile_job: Optional[hub.client.ProfileJob] = None if not skip_profiling: + profile_options_all = model.get_hub_profile_options( + target_runtime, profile_options + ) print(f"Profiling model {model_name} on a hosted device.") - profile_job = hub.submit_profile_job( + submitted_profile_job = hub.submit_profile_job( model=compile_job.get_target_model(), device=hub.Device(device), name=model_name, - options=profile_options, + options=profile_options_all, ) + profile_job = cast(hub.client.ProfileJob, submitted_profile_job) # 4. Run inference on-device with sample inputs - inference_job = None + inference_job: Optional[hub.client.InferenceJob] = None if not skip_inferencing: + profile_options_all = model.get_hub_profile_options( + target_runtime, profile_options + ) print( f"Running inference for {model_name} on a hosted device with example inputs." ) @@ -146,33 +154,35 @@ def export_model( hub_inputs = transpose_channel_first_to_last( "image", sample_inputs, target_runtime ) - inference_job = hub.submit_inference_job( + submitted_inference_job = hub.submit_inference_job( model=compile_job.get_target_model(), inputs=hub_inputs, device=hub.Device(device), name=model_name, - options=profile_options, + options=profile_options_all, ) + inference_job = cast(hub.client.InferenceJob, submitted_inference_job) # 5. 
Download the model asset to a local file if not skip_downloading: os.makedirs(output_path, exist_ok=True) - target_model = compile_job.get_target_model() + target_model: hub.Model = compile_job.get_target_model() # type: ignore target_model.download(str(output_path / f"{model_name}.tflite")) # 6. Summarize the results from profiling and inference if not skip_summary and not skip_profiling: - assert profile_job.wait().success - profile_data = profile_job.download_profile() + assert profile_job is not None and profile_job.wait().success + profile_data: Dict[str, Any] = profile_job.download_profile() # type: ignore print_profile_metrics_from_job(profile_job, profile_data) if not skip_summary and not skip_inferencing: torch_out = torch_inference(model, sample_inputs) - assert inference_job.wait().success - inference_result = inference_job.download_output_data() + assert inference_job is not None and inference_job.wait().success + inference_result: hub.client.DatasetEntries = inference_job.download_output_data() # type: ignore print_inference_metrics(inference_job, inference_result, torch_out) - print_on_target_demo_cmd(compile_job, Path(__file__).parent.resolve(), device) + if not skip_summary: + print_on_target_demo_cmd(compile_job, Path(__file__).parent.resolve(), device) return (compile_job, profile_job, inference_job) diff --git a/qai_hub_models/models/detr_resnet50/info.yaml b/qai_hub_models/models/detr_resnet50/info.yaml index 1daf3e8b..2270fe6e 100644 --- a/qai_hub_models/models/detr_resnet50/info.yaml +++ b/qai_hub_models/models/detr_resnet50/info.yaml @@ -11,6 +11,7 @@ use_case: Object Detection research_paper: https://arxiv.org/abs/2005.12872 research_paper_title: End-to-End Object Detection with Transformers license: https://github.com/facebookresearch/detr/blob/main/LICENSE +deploy_license: https://qaihub-public-assets.s3.us-west-2.amazonaws.com/qai-hub-models/Qualcomm+AI+Hub+Proprietary+License.pdf source_repo: https://github.com/facebookresearch/detr technical_details: Model checkpoint: ResNet50 @@ -32,5 +33,6 @@ form_factors: has_static_banner: yes has_animated_banner: yes license_type: apache-2.0 +deploy_license_type: AI Model Hub License dataset: - detection-datasets/coco diff --git a/qai_hub_models/models/detr_resnet50/perf.yaml b/qai_hub_models/models/detr_resnet50/perf.yaml index 6f7b0bd7..d07318ba 100644 --- a/qai_hub_models/models/detr_resnet50/perf.yaml +++ b/qai_hub_models/models/detr_resnet50/perf.yaml @@ -2,6 +2,12 @@ aggregated: supported_oses: - Android supported_devices: + - Google Pixel 3 + - Google Pixel 3a + - Google Pixel 3a XL + - Google Pixel 4 + - Google Pixel 4a + - Google Pixel 5a 5G - Samsung Galaxy S21 - Samsung Galaxy S21 Ultra - Samsung Galaxy S21+ @@ -11,22 +17,25 @@ aggregated: - Samsung Galaxy S23 - Samsung Galaxy S23 Ultra - Samsung Galaxy S23+ + - Samsung Galaxy S24 + - Samsung Galaxy S24 Ultra - Samsung Galaxy Tab S8 - Xiaomi 12 - Xiaomi 12 Pro supported_chipsets: - Snapdragon® 8 Gen 1 - Snapdragon® 8 Gen 2 + - Snapdragon® 8 Gen 3 - Snapdragon® 888 models: - name: DETR-ResNet50 performance_metrics: - torchscript_onnx_tflite: - inference_time: 346284.0 - throughput: 2.887803074932714 + inference_time: 365312.0 + throughput: 2.737386124737211 estimated_peak_memory_range: - min: 109121536 - max: 112011896 + min: 109416448 + max: 444976064 primary_compute_unit: CPU precision: fp32 layer_info: @@ -34,8 +43,16 @@ models: layers_on_gpu: 0 layers_on_cpu: 889 total_layers: 889 - job_id: jvgddqrkg + job_id: j1p3k7x52 job_status: Passed + 
reference_device_info: + name: Samsung Galaxy S23 + os: '13' + form_factor: Phone + os_name: Android + manufacturer: Samsung + chipset: Snapdragon® 8 Gen 2 + timestamp: '2024-03-15T00:08:31.933833Z' torchscript_onnx_qnn: inference_time: 'null' throughput: 'null' @@ -51,11 +68,41 @@ models: total_layers: 0 job_id: '' job_status: Skipped + - torchscript_onnx_tflite: + inference_time: 287302.0 + throughput: 3.480657983585217 + estimated_peak_memory_range: + min: 108204032 + max: 196940032 + primary_compute_unit: CPU + precision: fp32 + layer_info: + layers_on_npu: 0 + layers_on_gpu: 0 + layers_on_cpu: 889 + total_layers: 889 + job_id: jwgoyw458 + job_status: Passed reference_device_info: - name: Samsung Galaxy S23 Ultra - os: '13' + name: Samsung Galaxy S24 + os: '14' form_factor: Phone os_name: Android manufacturer: Samsung - chipset: Snapdragon® 8 Gen 2 - timestamp: '2024-02-21T16:35:06.340774Z' + chipset: Snapdragon® 8 Gen 3 + timestamp: '2024-03-15T00:08:31.933846Z' + torchscript_onnx_qnn: + inference_time: 'null' + throughput: 'null' + estimated_peak_memory_range: + min: 0 + max: 0 + primary_compute_unit: 'null' + precision: 'null' + layer_info: + layers_on_npu: 0 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 0 + job_id: '' + job_status: Skipped diff --git a/qai_hub_models/models/detr_resnet50/requirements.txt b/qai_hub_models/models/detr_resnet50/requirements.txt index 3582ec2c..f9dfeb47 100644 --- a/qai_hub_models/models/detr_resnet50/requirements.txt +++ b/qai_hub_models/models/detr_resnet50/requirements.txt @@ -1,2 +1,2 @@ -transformers==4.31.0 -timm==0.9.7 +transformers==4.27.4 +timm==0.9.11 diff --git a/qai_hub_models/models/detr_resnet50/test.py b/qai_hub_models/models/detr_resnet50/test.py index 9174f95b..634027b4 100644 --- a/qai_hub_models/models/detr_resnet50/test.py +++ b/qai_hub_models/models/detr_resnet50/test.py @@ -2,6 +2,8 @@ # Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. # SPDX-License-Identifier: BSD-3-Clause # --------------------------------------------------------------------- +import pytest + from qai_hub_models.models._shared.detr.app import DETRApp from qai_hub_models.models.detr_resnet50.demo import main as demo_main from qai_hub_models.models.detr_resnet50.model import ( @@ -32,6 +34,7 @@ def test_cli_from_pretrained(): assert model_from_cli_args(DETRResNet50, args) is not None +@pytest.mark.trace def test_trace(): net = DETRResNet50.from_pretrained() input_spec = net.get_input_spec() diff --git a/qai_hub_models/models/detr_resnet50_dc5/README.md b/qai_hub_models/models/detr_resnet50_dc5/README.md index f93b2bad..59d5fed2 100644 --- a/qai_hub_models/models/detr_resnet50_dc5/README.md +++ b/qai_hub_models/models/detr_resnet50_dc5/README.md @@ -10,7 +10,7 @@ This is based on the implementation of DETR-ResNet50-DC5 found export suitable to run on Qualcomm® devices. More details on model performance accross various devices, can be found [here](https://aihub.qualcomm.com/models/detr_resnet50_dc5). -[Sign up](https://aihub.qualcomm.com/) for early access to run these models on +[Sign up](https://myaccount.qualcomm.com/signup) for early access to run these models on a hosted Qualcomm® device. @@ -30,7 +30,7 @@ python -m qai_hub_models.models.detr_resnet50_dc5.demo More details on the CLI tool can be found with the `--help` option. See [demo.py](demo.py) for sample usage of the model including pre/post processing scripts. Please refer to our [general instructions on using -models](../../#qai-hub-models) for more usage instructions. 
+models](../../../#getting-started) for more usage instructions. ## Export for on-device deployment @@ -46,7 +46,7 @@ script requires access to Deployment instructions for Qualcomm® AI Hub. ## License - The license for the original implementation of DETR-ResNet50-DC5 can be found [here](https://github.com/facebookresearch/detr/blob/main/LICENSE). -- The license for the compiled assets for on-device deployment can be found [here](https://qaihub-public-assets.s3.us-west-2.amazonaws.com/qai-hub-models/Qualcomm+AI+Hub+Proprietary+License.pdf). +- The license for the compiled assets for on-device deployment can be found [here]({deploy_license_url}) ## References * [End-to-End Object Detection with Transformers](https://arxiv.org/abs/2005.12872) diff --git a/qai_hub_models/models/detr_resnet50_dc5/conftest.py b/qai_hub_models/models/detr_resnet50_dc5/conftest.py new file mode 100644 index 00000000..f2a22cab --- /dev/null +++ b/qai_hub_models/models/detr_resnet50_dc5/conftest.py @@ -0,0 +1,24 @@ +# --------------------------------------------------------------------- +# Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. +# SPDX-License-Identifier: BSD-3-Clause +# --------------------------------------------------------------------- +# THIS FILE WAS AUTO-GENERATED. DO NOT EDIT MANUALLY. + +from unittest.mock import patch + +import pytest + +from qai_hub_models.models.detr_resnet50_dc5 import Model + + +@pytest.fixture(autouse=True) +def mock_from_pretrained(): + """ + Model.from_pretrained() can be slow. Invoke it once and cache it so all invocations + across all tests return the cached instance of the model. + """ + mock = patch( + "qai_hub_models.models.detr_resnet50_dc5.Model.from_pretrained", + return_value=Model.from_pretrained(), + ) + mock.start() diff --git a/qai_hub_models/models/detr_resnet50_dc5/demo.py b/qai_hub_models/models/detr_resnet50_dc5/demo.py index 0eeaee66..a0552f0e 100644 --- a/qai_hub_models/models/detr_resnet50_dc5/demo.py +++ b/qai_hub_models/models/detr_resnet50_dc5/demo.py @@ -19,7 +19,7 @@ # Run DETR app end-to-end on a sample image. # The demo will display the predicted mask in a window. def main(is_test: bool = False): - detr_demo(DETRResNet50DC5, DEFAULT_WEIGHTS, IMAGE_ADDRESS, is_test) + detr_demo(DETRResNet50DC5, MODEL_ID, DEFAULT_WEIGHTS, IMAGE_ADDRESS, is_test) if __name__ == "__main__": diff --git a/qai_hub_models/models/detr_resnet50_dc5/export.py b/qai_hub_models/models/detr_resnet50_dc5/export.py index 496a68aa..09e9b406 100644 --- a/qai_hub_models/models/detr_resnet50_dc5/export.py +++ b/qai_hub_models/models/detr_resnet50_dc5/export.py @@ -10,7 +10,7 @@ import os import warnings from pathlib import Path -from typing import List, Optional, Tuple +from typing import Any, Dict, List, Optional, Tuple, cast import qai_hub as hub import torch @@ -109,35 +109,43 @@ def export_model( ) # Trace the model - source_model = torch.jit.trace(model, make_torch_inputs(input_spec)) + source_model = torch.jit.trace(model.to("cpu"), make_torch_inputs(input_spec)) # 2. 
Compile the model to an on-device asset model_compile_options = model.get_hub_compile_options( target_runtime, compile_options + " --force_channel_last_input image" ) - print(f"Optimizing model {model_name} to run on-device.") - compile_job = hub.submit_compile_job( + print(f"Optimizing model {model_name} to run on-device") + submitted_compile_job = hub.submit_compile_job( model=source_model, input_specs=input_spec, device=hub.Device(device), name=model_name, options=model_compile_options, ) + compile_job = cast(hub.client.CompileJob, submitted_compile_job) # 3. Profile the model asset on real devices - profile_job = None + profile_job: Optional[hub.client.ProfileJob] = None if not skip_profiling: + profile_options_all = model.get_hub_profile_options( + target_runtime, profile_options + ) print(f"Profiling model {model_name} on a hosted device.") - profile_job = hub.submit_profile_job( + submitted_profile_job = hub.submit_profile_job( model=compile_job.get_target_model(), device=hub.Device(device), name=model_name, - options=profile_options, + options=profile_options_all, ) + profile_job = cast(hub.client.ProfileJob, submitted_profile_job) # 4. Run inference on-device with sample inputs - inference_job = None + inference_job: Optional[hub.client.InferenceJob] = None if not skip_inferencing: + profile_options_all = model.get_hub_profile_options( + target_runtime, profile_options + ) print( f"Running inference for {model_name} on a hosted device with example inputs." ) @@ -146,33 +154,35 @@ def export_model( hub_inputs = transpose_channel_first_to_last( "image", sample_inputs, target_runtime ) - inference_job = hub.submit_inference_job( + submitted_inference_job = hub.submit_inference_job( model=compile_job.get_target_model(), inputs=hub_inputs, device=hub.Device(device), name=model_name, - options=profile_options, + options=profile_options_all, ) + inference_job = cast(hub.client.InferenceJob, submitted_inference_job) # 5. Download the model asset to a local file if not skip_downloading: os.makedirs(output_path, exist_ok=True) - target_model = compile_job.get_target_model() + target_model: hub.Model = compile_job.get_target_model() # type: ignore target_model.download(str(output_path / f"{model_name}.tflite")) # 6. 
Summarize the results from profiling and inference if not skip_summary and not skip_profiling: - assert profile_job.wait().success - profile_data = profile_job.download_profile() + assert profile_job is not None and profile_job.wait().success + profile_data: Dict[str, Any] = profile_job.download_profile() # type: ignore print_profile_metrics_from_job(profile_job, profile_data) if not skip_summary and not skip_inferencing: torch_out = torch_inference(model, sample_inputs) - assert inference_job.wait().success - inference_result = inference_job.download_output_data() + assert inference_job is not None and inference_job.wait().success + inference_result: hub.client.DatasetEntries = inference_job.download_output_data() # type: ignore print_inference_metrics(inference_job, inference_result, torch_out) - print_on_target_demo_cmd(compile_job, Path(__file__).parent.resolve(), device) + if not skip_summary: + print_on_target_demo_cmd(compile_job, Path(__file__).parent.resolve(), device) return (compile_job, profile_job, inference_job) diff --git a/qai_hub_models/models/detr_resnet50_dc5/info.yaml b/qai_hub_models/models/detr_resnet50_dc5/info.yaml index 4777b6ea..16b90ef9 100644 --- a/qai_hub_models/models/detr_resnet50_dc5/info.yaml +++ b/qai_hub_models/models/detr_resnet50_dc5/info.yaml @@ -11,6 +11,7 @@ use_case: Object Detection research_paper: https://arxiv.org/abs/2005.12872 research_paper_title: End-to-End Object Detection with Transformers license: https://github.com/facebookresearch/detr/blob/main/LICENSE +deploy_license: https://qaihub-public-assets.s3.us-west-2.amazonaws.com/qai-hub-models/Qualcomm+AI+Hub+Proprietary+License.pdf source_repo: https://github.com/facebookresearch/detr technical_details: Model checkpoint: ResNet50-DC5 @@ -32,5 +33,6 @@ form_factors: has_static_banner: yes has_animated_banner: yes license_type: apache-2.0 +deploy_license_type: AI Model Hub License dataset: - detection-datasets/coco diff --git a/qai_hub_models/models/detr_resnet50_dc5/perf.yaml b/qai_hub_models/models/detr_resnet50_dc5/perf.yaml index aaba6ce8..4d135c6c 100644 --- a/qai_hub_models/models/detr_resnet50_dc5/perf.yaml +++ b/qai_hub_models/models/detr_resnet50_dc5/perf.yaml @@ -2,6 +2,12 @@ aggregated: supported_oses: - Android supported_devices: + - Google Pixel 3 + - Google Pixel 3a + - Google Pixel 3a XL + - Google Pixel 4 + - Google Pixel 4a + - Google Pixel 5a 5G - Samsung Galaxy S21 - Samsung Galaxy S21 Ultra - Samsung Galaxy S21+ @@ -11,22 +17,25 @@ aggregated: - Samsung Galaxy S23 - Samsung Galaxy S23 Ultra - Samsung Galaxy S23+ + - Samsung Galaxy S24 + - Samsung Galaxy S24 Ultra - Samsung Galaxy Tab S8 - Xiaomi 12 - Xiaomi 12 Pro supported_chipsets: - Snapdragon® 8 Gen 1 - Snapdragon® 8 Gen 2 + - Snapdragon® 8 Gen 3 - Snapdragon® 888 models: - name: DETR-ResNet50-DC5 performance_metrics: - torchscript_onnx_tflite: - inference_time: 760148.0 - throughput: 1.3155332908854591 + inference_time: 692168.0 + throughput: 1.4447359600559402 estimated_peak_memory_range: - min: 251318272 - max: 254954864 + min: 117583872 + max: 529905552 primary_compute_unit: CPU precision: fp32 layer_info: @@ -34,8 +43,16 @@ models: layers_on_gpu: 0 layers_on_cpu: 890 total_layers: 890 - job_id: j1pvlr7m5 + job_id: jqp4q2lgo job_status: Passed + reference_device_info: + name: Samsung Galaxy S23 + os: '13' + form_factor: Phone + os_name: Android + manufacturer: Samsung + chipset: Snapdragon® 8 Gen 2 + timestamp: '2024-03-14T23:27:50.803823Z' torchscript_onnx_qnn: inference_time: 'null' throughput: 'null' @@ -51,11 
+68,41 @@ models: total_layers: 0 job_id: '' job_status: Skipped + - torchscript_onnx_tflite: + inference_time: 521991.0 + throughput: 1.9157418422923 + estimated_peak_memory_range: + min: 178831360 + max: 279734112 + primary_compute_unit: CPU + precision: fp32 + layer_info: + layers_on_npu: 0 + layers_on_gpu: 0 + layers_on_cpu: 890 + total_layers: 890 + job_id: j0pxvz9g7 + job_status: Passed reference_device_info: - name: Samsung Galaxy S23 Ultra - os: '13' + name: Samsung Galaxy S24 + os: '14' form_factor: Phone os_name: Android manufacturer: Samsung - chipset: Snapdragon® 8 Gen 2 - timestamp: '2024-02-21T16:15:27.657498Z' + chipset: Snapdragon® 8 Gen 3 + timestamp: '2024-03-14T23:27:50.803834Z' + torchscript_onnx_qnn: + inference_time: 'null' + throughput: 'null' + estimated_peak_memory_range: + min: 0 + max: 0 + primary_compute_unit: 'null' + precision: 'null' + layer_info: + layers_on_npu: 0 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 0 + job_id: '' + job_status: Skipped diff --git a/qai_hub_models/models/detr_resnet50_dc5/requirements.txt b/qai_hub_models/models/detr_resnet50_dc5/requirements.txt index 3582ec2c..f9dfeb47 100644 --- a/qai_hub_models/models/detr_resnet50_dc5/requirements.txt +++ b/qai_hub_models/models/detr_resnet50_dc5/requirements.txt @@ -1,2 +1,2 @@ -transformers==4.31.0 -timm==0.9.7 +transformers==4.27.4 +timm==0.9.11 diff --git a/qai_hub_models/models/detr_resnet50_dc5/test.py b/qai_hub_models/models/detr_resnet50_dc5/test.py index 28533460..04f51a81 100644 --- a/qai_hub_models/models/detr_resnet50_dc5/test.py +++ b/qai_hub_models/models/detr_resnet50_dc5/test.py @@ -2,6 +2,8 @@ # Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. # SPDX-License-Identifier: BSD-3-Clause # --------------------------------------------------------------------- +import pytest + from qai_hub_models.models._shared.detr.app import DETRApp from qai_hub_models.models.detr_resnet50_dc5.demo import MODEL_ASSET_VERSION, MODEL_ID from qai_hub_models.models.detr_resnet50_dc5.demo import main as demo_main @@ -23,6 +25,7 @@ def test_task(): assert set(list(label.numpy())) == {75, 63, 17} +@pytest.mark.trace def test_trace(): net = DETRResNet50DC5.from_pretrained(DEFAULT_WEIGHTS).convert_to_torchscript() img = load_image(IMAGE_ADDRESS) diff --git a/qai_hub_models/models/efficientnet_b0/README.md b/qai_hub_models/models/efficientnet_b0/README.md index 677b5dc7..e1241088 100644 --- a/qai_hub_models/models/efficientnet_b0/README.md +++ b/qai_hub_models/models/efficientnet_b0/README.md @@ -10,7 +10,7 @@ This is based on the implementation of EfficientNet-B0 found export suitable to run on Qualcomm® devices. More details on model performance accross various devices, can be found [here](https://aihub.qualcomm.com/models/efficientnet_b0). -[Sign up](https://aihub.qualcomm.com/) for early access to run these models on +[Sign up](https://myaccount.qualcomm.com/signup) for early access to run these models on a hosted Qualcomm® device. @@ -25,7 +25,7 @@ python -m qai_hub_models.models.efficientnet_b0.demo More details on the CLI tool can be found with the `--help` option. See [demo.py](demo.py) for sample usage of the model including pre/post processing scripts. Please refer to our [general instructions on using -models](../../#qai-hub-models) for more usage instructions. +models](../../../#getting-started) for more usage instructions. ## Export for on-device deployment @@ -41,7 +41,7 @@ script requires access to Deployment instructions for Qualcomm® AI Hub. 
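The test.py hunks above add a `@pytest.mark.trace` marker to the slow `torch.jit.trace` tests. The marker registration itself is not shown in these hunks; a minimal conftest-level sketch, assuming standard pytest marker registration, looks like:

```python
# Hypothetical registration sketch; the actual configuration used by
# qai_hub_models is not shown in these hunks.
def pytest_configure(config):
    config.addinivalue_line(
        "markers", "trace: marks slow torch.jit.trace tests"
    )
```

With a registration like this, `pytest -m "not trace"` deselects the tracing tests and `pytest -m trace` runs only them.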
## License - The license for the original implementation of EfficientNet-B0 can be found [here](https://github.com/pytorch/vision/blob/main/LICENSE). -- The license for the compiled assets for on-device deployment can be found [here](https://qaihub-public-assets.s3.us-west-2.amazonaws.com/qai-hub-models/Qualcomm+AI+Hub+Proprietary+License.pdf). +- The license for the compiled assets for on-device deployment can be found [here]({deploy_license_url}) ## References * [EfficientNet: Rethinking Model Scaling for Convolutional Neural Networks](https://arxiv.org/abs/1905.11946) diff --git a/qai_hub_models/models/efficientnet_b0/conftest.py b/qai_hub_models/models/efficientnet_b0/conftest.py new file mode 100644 index 00000000..72ffa71c --- /dev/null +++ b/qai_hub_models/models/efficientnet_b0/conftest.py @@ -0,0 +1,24 @@ +# --------------------------------------------------------------------- +# Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. +# SPDX-License-Identifier: BSD-3-Clause +# --------------------------------------------------------------------- +# THIS FILE WAS AUTO-GENERATED. DO NOT EDIT MANUALLY. + +from unittest.mock import patch + +import pytest + +from qai_hub_models.models.efficientnet_b0 import Model + + +@pytest.fixture(autouse=True) +def mock_from_pretrained(): + """ + Model.from_pretrained() can be slow. Invoke it once and cache it so all invocations + across all tests return the cached instance of the model. + """ + mock = patch( + "qai_hub_models.models.efficientnet_b0.Model.from_pretrained", + return_value=Model.from_pretrained(), + ) + mock.start() diff --git a/qai_hub_models/models/efficientnet_b0/demo.py b/qai_hub_models/models/efficientnet_b0/demo.py index 40bd18fb..9f01d294 100644 --- a/qai_hub_models/models/efficientnet_b0/demo.py +++ b/qai_hub_models/models/efficientnet_b0/demo.py @@ -3,11 +3,11 @@ # SPDX-License-Identifier: BSD-3-Clause # --------------------------------------------------------------------- from qai_hub_models.models._shared.imagenet_classifier.demo import imagenet_demo -from qai_hub_models.models.efficientnet_b0.model import EfficientNetB0 +from qai_hub_models.models.efficientnet_b0.model import MODEL_ID, EfficientNetB0 def main(is_test: bool = False): - imagenet_demo(EfficientNetB0, is_test) + imagenet_demo(EfficientNetB0, MODEL_ID, is_test) if __name__ == "__main__": diff --git a/qai_hub_models/models/efficientnet_b0/export.py b/qai_hub_models/models/efficientnet_b0/export.py index aba81864..44abf4a7 100644 --- a/qai_hub_models/models/efficientnet_b0/export.py +++ b/qai_hub_models/models/efficientnet_b0/export.py @@ -10,7 +10,7 @@ import os import warnings from pathlib import Path -from typing import List, Optional, Tuple +from typing import Any, Dict, List, Optional, Tuple, cast import qai_hub as hub import torch @@ -109,35 +109,43 @@ def export_model( ) # Trace the model - source_model = torch.jit.trace(model, make_torch_inputs(input_spec)) + source_model = torch.jit.trace(model.to("cpu"), make_torch_inputs(input_spec)) # 2. 
Compile the model to an on-device asset model_compile_options = model.get_hub_compile_options( target_runtime, compile_options + " --force_channel_last_input image_tensor" ) - print(f"Optimizing model {model_name} to run on-device.") - compile_job = hub.submit_compile_job( + print(f"Optimizing model {model_name} to run on-device") + submitted_compile_job = hub.submit_compile_job( model=source_model, input_specs=input_spec, device=hub.Device(device), name=model_name, options=model_compile_options, ) + compile_job = cast(hub.client.CompileJob, submitted_compile_job) # 3. Profile the model asset on real devices - profile_job = None + profile_job: Optional[hub.client.ProfileJob] = None if not skip_profiling: + profile_options_all = model.get_hub_profile_options( + target_runtime, profile_options + ) print(f"Profiling model {model_name} on a hosted device.") - profile_job = hub.submit_profile_job( + submitted_profile_job = hub.submit_profile_job( model=compile_job.get_target_model(), device=hub.Device(device), name=model_name, - options=profile_options, + options=profile_options_all, ) + profile_job = cast(hub.client.ProfileJob, submitted_profile_job) # 4. Run inference on-device with sample inputs - inference_job = None + inference_job: Optional[hub.client.InferenceJob] = None if not skip_inferencing: + profile_options_all = model.get_hub_profile_options( + target_runtime, profile_options + ) print( f"Running inference for {model_name} on a hosted device with example inputs." ) @@ -146,33 +154,35 @@ def export_model( hub_inputs = transpose_channel_first_to_last( "image_tensor", sample_inputs, target_runtime ) - inference_job = hub.submit_inference_job( + submitted_inference_job = hub.submit_inference_job( model=compile_job.get_target_model(), inputs=hub_inputs, device=hub.Device(device), name=model_name, - options=profile_options, + options=profile_options_all, ) + inference_job = cast(hub.client.InferenceJob, submitted_inference_job) # 5. Download the model asset to a local file if not skip_downloading: os.makedirs(output_path, exist_ok=True) - target_model = compile_job.get_target_model() + target_model: hub.Model = compile_job.get_target_model() # type: ignore target_model.download(str(output_path / f"{model_name}.tflite")) # 6. 
Summarize the results from profiling and inference if not skip_summary and not skip_profiling: - assert profile_job.wait().success - profile_data = profile_job.download_profile() + assert profile_job is not None and profile_job.wait().success + profile_data: Dict[str, Any] = profile_job.download_profile() # type: ignore print_profile_metrics_from_job(profile_job, profile_data) if not skip_summary and not skip_inferencing: torch_out = torch_inference(model, sample_inputs) - assert inference_job.wait().success - inference_result = inference_job.download_output_data() + assert inference_job is not None and inference_job.wait().success + inference_result: hub.client.DatasetEntries = inference_job.download_output_data() # type: ignore print_inference_metrics(inference_job, inference_result, torch_out) - print_on_target_demo_cmd(compile_job, Path(__file__).parent.resolve(), device) + if not skip_summary: + print_on_target_demo_cmd(compile_job, Path(__file__).parent.resolve(), device) return (compile_job, profile_job, inference_job) diff --git a/qai_hub_models/models/efficientnet_b0/info.yaml b/qai_hub_models/models/efficientnet_b0/info.yaml index d66d06a5..0e74436b 100644 --- a/qai_hub_models/models/efficientnet_b0/info.yaml +++ b/qai_hub_models/models/efficientnet_b0/info.yaml @@ -14,6 +14,7 @@ research_paper: https://arxiv.org/abs/1905.11946 research_paper_title: 'EfficientNet: Rethinking Model Scaling for Convolutional Neural Networks' license: https://github.com/pytorch/vision/blob/main/LICENSE +deploy_license: https://qaihub-public-assets.s3.us-west-2.amazonaws.com/qai-hub-models/Qualcomm+AI+Hub+Proprietary+License.pdf source_repo: https://github.com/pytorch/vision/blob/main/torchvision/models/efficientnet.py technical_details: Model checkpoint: Imagenet @@ -35,6 +36,7 @@ form_factors: has_static_banner: yes has_animated_banner: yes license_type: bsd-3-clause +deploy_license_type: AI Model Hub License dataset: - imagenet-1k - imagenet-22k diff --git a/qai_hub_models/models/efficientnet_b0/model.py b/qai_hub_models/models/efficientnet_b0/model.py index 956dc29b..4667e5c9 100644 --- a/qai_hub_models/models/efficientnet_b0/model.py +++ b/qai_hub_models/models/efficientnet_b0/model.py @@ -14,6 +14,6 @@ class EfficientNetB0(ImagenetClassifier): @classmethod - def from_pretrained(cls, weights: str = DEFAULT_WEIGHTS) -> ImagenetClassifier: + def from_pretrained(cls, weights: str = DEFAULT_WEIGHTS) -> EfficientNetB0: net = tv_models.efficientnet_b0(weights=weights) return cls(net) diff --git a/qai_hub_models/models/efficientnet_b0/perf.yaml b/qai_hub_models/models/efficientnet_b0/perf.yaml index ea6a7116..bce0b849 100644 --- a/qai_hub_models/models/efficientnet_b0/perf.yaml +++ b/qai_hub_models/models/efficientnet_b0/perf.yaml @@ -17,22 +17,25 @@ aggregated: - Samsung Galaxy S23 - Samsung Galaxy S23 Ultra - Samsung Galaxy S23+ + - Samsung Galaxy S24 + - Samsung Galaxy S24 Ultra - Samsung Galaxy Tab S8 - Xiaomi 12 - Xiaomi 12 Pro supported_chipsets: - Snapdragon® 8 Gen 1 - Snapdragon® 8 Gen 2 + - Snapdragon® 8 Gen 3 - Snapdragon® 888 models: - name: EfficientNet-B0 performance_metrics: - torchscript_onnx_tflite: - inference_time: 2184.0 - throughput: 457.87545787545787 + inference_time: 2174.0 + throughput: 459.9816007359706 estimated_peak_memory_range: - min: 12288 - max: 2340896 + min: 24576 + max: 2273464 primary_compute_unit: NPU precision: fp16 layer_info: @@ -40,14 +43,22 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 243 - job_id: j0pxl61jp + job_id: jlpe9l8gr job_status: Passed + 
reference_device_info: + name: Samsung Galaxy S23 + os: '13' + form_factor: Phone + os_name: Android + manufacturer: Samsung + chipset: Snapdragon® 8 Gen 2 + timestamp: '2024-03-14T23:37:36.573638Z' torchscript_onnx_qnn: - inference_time: 2166.0 - throughput: 461.6805170821791 + inference_time: 2173.0 + throughput: 460.1932811780948 estimated_peak_memory_range: - min: 12288 - max: 86865200 + min: 16384 + max: 87349280 primary_compute_unit: NPU precision: fp16 layer_info: @@ -55,13 +66,43 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 242 - job_id: jo5m06zyg + job_id: jz5wo14p1 + job_status: Passed + - torchscript_onnx_tflite: + inference_time: 1524.0 + throughput: 656.1679790026246 + estimated_peak_memory_range: + min: 12288 + max: 70874656 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 243 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 243 + job_id: jygze44g8 job_status: Passed reference_device_info: - name: Samsung Galaxy S23 Ultra - os: '13' + name: Samsung Galaxy S24 + os: '14' form_factor: Phone os_name: Android manufacturer: Samsung - chipset: Snapdragon® 8 Gen 2 - timestamp: '2024-02-21T16:27:58.826690Z' + chipset: Snapdragon® 8 Gen 3 + timestamp: '2024-03-14T23:42:10.776325Z' + torchscript_onnx_qnn: + inference_time: 1508.0 + throughput: 663.1299734748011 + estimated_peak_memory_range: + min: 618496 + max: 79231776 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 242 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 242 + job_id: jmg9vxm57 + job_status: Passed diff --git a/qai_hub_models/models/efficientnet_b0/test.py b/qai_hub_models/models/efficientnet_b0/test.py index b7808e69..5ac04c69 100644 --- a/qai_hub_models/models/efficientnet_b0/test.py +++ b/qai_hub_models/models/efficientnet_b0/test.py @@ -2,6 +2,8 @@ # Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. # SPDX-License-Identifier: BSD-3-Clause # --------------------------------------------------------------------- +import pytest + from qai_hub_models.models._shared.imagenet_classifier.test_utils import ( run_imagenet_classifier_test, run_imagenet_classifier_trace_test, @@ -14,6 +16,7 @@ def test_task(): run_imagenet_classifier_test(EfficientNetB0.from_pretrained(), MODEL_ID) +@pytest.mark.trace def test_trace(): run_imagenet_classifier_trace_test(EfficientNetB0.from_pretrained()) diff --git a/qai_hub_models/models/esrgan/README.md b/qai_hub_models/models/esrgan/README.md index 0607633f..781f6201 100644 --- a/qai_hub_models/models/esrgan/README.md +++ b/qai_hub_models/models/esrgan/README.md @@ -10,7 +10,7 @@ This is based on the implementation of ESRGAN found export suitable to run on Qualcomm® devices. More details on model performance accross various devices, can be found [here](https://aihub.qualcomm.com/models/esrgan). -[Sign up](https://aihub.qualcomm.com/) for early access to run these models on +[Sign up](https://myaccount.qualcomm.com/signup) for early access to run these models on a hosted Qualcomm® device. @@ -25,7 +25,7 @@ python -m qai_hub_models.models.esrgan.demo More details on the CLI tool can be found with the `--help` option. See [demo.py](demo.py) for sample usage of the model including pre/post processing scripts. Please refer to our [general instructions on using -models](../../#qai-hub-models) for more usage instructions. +models](../../../#getting-started) for more usage instructions. 
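In the perf.yaml entries above, `throughput` is the reciprocal of `inference_time`, which is reported in microseconds. A quick sanity check against the EfficientNet-B0 TFLite numbers added in this patch:

```python
# inference_time is in microseconds, so throughput (inferences/second) is 1e6 / time.
def throughput(inference_time_us: float) -> float:
    return 1e6 / inference_time_us


assert round(throughput(2174.0), 4) == 459.9816  # Samsung Galaxy S23
assert round(throughput(1524.0), 4) == 656.1680  # Samsung Galaxy S24
```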
## Export for on-device deployment @@ -41,7 +41,7 @@ script requires access to Deployment instructions for Qualcomm® AI Hub. ## License - The license for the original implementation of ESRGAN can be found [here](https://github.com/xinntao/ESRGAN/blob/master/LICENSE). -- The license for the compiled assets for on-device deployment can be found [here](https://qaihub-public-assets.s3.us-west-2.amazonaws.com/qai-hub-models/Qualcomm+AI+Hub+Proprietary+License.pdf). +- The license for the compiled assets for on-device deployment can be found [here]({deploy_license_url}) ## References * [ESRGAN: Enhanced Super-Resolution Generative Adversarial Networks](https://arxiv.org/abs/1809.00219) diff --git a/qai_hub_models/models/esrgan/conftest.py b/qai_hub_models/models/esrgan/conftest.py new file mode 100644 index 00000000..7e3e37cf --- /dev/null +++ b/qai_hub_models/models/esrgan/conftest.py @@ -0,0 +1,26 @@ +# --------------------------------------------------------------------- +# Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. +# SPDX-License-Identifier: BSD-3-Clause +# --------------------------------------------------------------------- +# THIS FILE WAS AUTO-GENERATED. DO NOT EDIT MANUALLY. + +from unittest.mock import patch + +import pytest + +from qai_hub_models.models.esrgan import Model +from qai_hub_models.utils.testing import skip_clone_repo_check + + +@pytest.fixture(autouse=True) +@skip_clone_repo_check +def mock_from_pretrained(): + """ + Model.from_pretrained() can be slow. Invoke it once and cache it so all invocations + across all tests return the cached instance of the model. + """ + mock = patch( + "qai_hub_models.models.esrgan.Model.from_pretrained", + return_value=Model.from_pretrained(), + ) + mock.start() diff --git a/qai_hub_models/models/esrgan/demo.py b/qai_hub_models/models/esrgan/demo.py index 3a15c695..03c02612 100644 --- a/qai_hub_models/models/esrgan/demo.py +++ b/qai_hub_models/models/esrgan/demo.py @@ -16,6 +16,7 @@ def main(is_test: bool = False): super_resolution_demo( model_cls=ESRGAN, + model_id=MODEL_ID, default_image=IMAGE_ADDRESS, is_test=is_test, ) diff --git a/qai_hub_models/models/esrgan/export.py b/qai_hub_models/models/esrgan/export.py index d82b3c60..01c64d63 100644 --- a/qai_hub_models/models/esrgan/export.py +++ b/qai_hub_models/models/esrgan/export.py @@ -10,7 +10,7 @@ import os import warnings from pathlib import Path -from typing import List, Optional, Tuple +from typing import Any, Dict, List, Optional, Tuple, cast import qai_hub as hub import torch @@ -109,7 +109,7 @@ def export_model( ) # Trace the model - source_model = torch.jit.trace(model, make_torch_inputs(input_spec)) + source_model = torch.jit.trace(model.to("cpu"), make_torch_inputs(input_spec)) # 2. Compile the model to an on-device asset model_compile_options = model.get_hub_compile_options( @@ -118,29 +118,37 @@ def export_model( + " --force_channel_last_input image" + " --force_channel_last_output output_0", ) - print(f"Optimizing model {model_name} to run on-device.") - compile_job = hub.submit_compile_job( + print(f"Optimizing model {model_name} to run on-device") + submitted_compile_job = hub.submit_compile_job( model=source_model, input_specs=input_spec, device=hub.Device(device), name=model_name, options=model_compile_options, ) + compile_job = cast(hub.client.CompileJob, submitted_compile_job) # 3. 
Profile the model asset on real devices - profile_job = None + profile_job: Optional[hub.client.ProfileJob] = None if not skip_profiling: + profile_options_all = model.get_hub_profile_options( + target_runtime, profile_options + ) print(f"Profiling model {model_name} on a hosted device.") - profile_job = hub.submit_profile_job( + submitted_profile_job = hub.submit_profile_job( model=compile_job.get_target_model(), device=hub.Device(device), name=model_name, - options=profile_options, + options=profile_options_all, ) + profile_job = cast(hub.client.ProfileJob, submitted_profile_job) # 4. Run inference on-device with sample inputs - inference_job = None + inference_job: Optional[hub.client.InferenceJob] = None if not skip_inferencing: + profile_options_all = model.get_hub_profile_options( + target_runtime, profile_options + ) print( f"Running inference for {model_name} on a hosted device with example inputs." ) @@ -149,30 +157,31 @@ def export_model( hub_inputs = transpose_channel_first_to_last( "image", sample_inputs, target_runtime ) - inference_job = hub.submit_inference_job( + submitted_inference_job = hub.submit_inference_job( model=compile_job.get_target_model(), inputs=hub_inputs, device=hub.Device(device), name=model_name, - options=profile_options, + options=profile_options_all, ) + inference_job = cast(hub.client.InferenceJob, submitted_inference_job) # 5. Download the model asset to a local file if not skip_downloading: os.makedirs(output_path, exist_ok=True) - target_model = compile_job.get_target_model() + target_model: hub.Model = compile_job.get_target_model() # type: ignore target_model.download(str(output_path / f"{model_name}.tflite")) # 6. Summarize the results from profiling and inference if not skip_summary and not skip_profiling: - assert profile_job.wait().success - profile_data = profile_job.download_profile() + assert profile_job is not None and profile_job.wait().success + profile_data: Dict[str, Any] = profile_job.download_profile() # type: ignore print_profile_metrics_from_job(profile_job, profile_data) if not skip_summary and not skip_inferencing: torch_out = torch_inference(model, sample_inputs) - assert inference_job.wait().success - inference_result = inference_job.download_output_data() + assert inference_job is not None and inference_job.wait().success + inference_result: hub.client.DatasetEntries = inference_job.download_output_data() # type: ignore # Convert outputs from channel last to channel first inference_result = transpose_channel_last_to_first( "output_0", inference_result, target_runtime diff --git a/qai_hub_models/models/esrgan/info.yaml b/qai_hub_models/models/esrgan/info.yaml index f0a23cd5..155c1676 100644 --- a/qai_hub_models/models/esrgan/info.yaml +++ b/qai_hub_models/models/esrgan/info.yaml @@ -11,6 +11,7 @@ tags: [] research_paper: https://arxiv.org/abs/1809.00219 research_paper_title: 'ESRGAN: Enhanced Super-Resolution Generative Adversarial Networks' license: https://github.com/xinntao/ESRGAN/blob/master/LICENSE +deploy_license: https://qaihub-public-assets.s3.us-west-2.amazonaws.com/qai-hub-models/Qualcomm+AI+Hub+Proprietary+License.pdf source_repo: https://github.com/xinntao/ESRGAN/ technical_details: Model checkpoint: ESRGAN_x4 @@ -30,4 +31,5 @@ related_models: has_static_banner: yes has_animated_banner: yes license_type: apache-2.0 +deploy_license_type: AI Model Hub License dataset: [] diff --git a/qai_hub_models/models/esrgan/perf.yaml b/qai_hub_models/models/esrgan/perf.yaml index 50e0e2bd..1ecf1170 100644 --- 
a/qai_hub_models/models/esrgan/perf.yaml +++ b/qai_hub_models/models/esrgan/perf.yaml @@ -17,22 +17,25 @@ aggregated: - Samsung Galaxy S23 - Samsung Galaxy S23 Ultra - Samsung Galaxy S23+ + - Samsung Galaxy S24 + - Samsung Galaxy S24 Ultra - Samsung Galaxy Tab S8 - Xiaomi 12 - Xiaomi 12 Pro supported_chipsets: - Snapdragon® 8 Gen 1 - Snapdragon® 8 Gen 2 + - Snapdragon® 8 Gen 3 - Snapdragon® 888 models: - name: ESRGAN performance_metrics: - torchscript_onnx_tflite: - inference_time: 76337.0 - throughput: 13.099807432830737 + inference_time: 74047.0 + throughput: 13.504936054127784 estimated_peak_memory_range: - min: 3301376 - max: 6221192 + min: 12288 + max: 4695144 primary_compute_unit: NPU precision: fp16 layer_info: @@ -40,14 +43,22 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 1024 - job_id: jnp1nw7kg + job_id: jnp10rl5q job_status: Passed + reference_device_info: + name: Samsung Galaxy S23 + os: '13' + form_factor: Phone + os_name: Android + manufacturer: Samsung + chipset: Snapdragon® 8 Gen 2 + timestamp: '2024-03-14T23:41:56.326001Z' torchscript_onnx_qnn: - inference_time: 66070.0 - throughput: 15.135462388375965 + inference_time: 65507.0 + throughput: 15.265544140320882 estimated_peak_memory_range: - min: 102400 - max: 101973424 + min: 57344 + max: 55933800 primary_compute_unit: NPU precision: fp16 layer_info: @@ -55,13 +66,43 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 1027 - job_id: jvgddq8kg + job_id: jz5woo6p1 + job_status: Passed + - torchscript_onnx_tflite: + inference_time: 53553.0 + throughput: 18.673090209698803 + estimated_peak_memory_range: + min: 3276800 + max: 574983152 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 1024 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 1024 + job_id: jvgdwjl5j job_status: Passed reference_device_info: - name: Samsung Galaxy S23 Ultra - os: '13' + name: Samsung Galaxy S24 + os: '14' form_factor: Phone os_name: Android manufacturer: Samsung - chipset: Snapdragon® 8 Gen 2 - timestamp: '2024-02-21T16:29:43.305116Z' + chipset: Snapdragon® 8 Gen 3 + timestamp: '2024-03-14T23:46:30.960659Z' + torchscript_onnx_qnn: + inference_time: 50563.0 + throughput: 19.777307517354586 + estimated_peak_memory_range: + min: 86016 + max: 240922112 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 1027 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 1027 + job_id: jmg9vvl57 + job_status: Passed diff --git a/qai_hub_models/models/esrgan/test.py b/qai_hub_models/models/esrgan/test.py index 25a9e20a..f7719ebc 100644 --- a/qai_hub_models/models/esrgan/test.py +++ b/qai_hub_models/models/esrgan/test.py @@ -3,6 +3,7 @@ # SPDX-License-Identifier: BSD-3-Clause # --------------------------------------------------------------------- import numpy as np +import pytest from qai_hub_models.models._shared.super_resolution.app import SuperResolutionApp from qai_hub_models.models.esrgan.demo import IMAGE_ADDRESS @@ -30,6 +31,7 @@ def test_task(): ) +@pytest.mark.trace @skip_clone_repo_check def test_trace(): image = load_image(IMAGE_ADDRESS) diff --git a/qai_hub_models/models/facebook_denoiser/README.md b/qai_hub_models/models/facebook_denoiser/README.md index 8b79544c..3c05adc1 100644 --- a/qai_hub_models/models/facebook_denoiser/README.md +++ b/qai_hub_models/models/facebook_denoiser/README.md @@ -10,7 +10,7 @@ This is based on the implementation of Facebook-Denoiser found export suitable to run on Qualcomm® devices. 
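The export scripts in this patch compile vision models with `--force_channel_last_input image` and convert sample inputs via `transpose_channel_first_to_last("image", sample_inputs, target_runtime)` before running on-device inference. The helper's implementation is not part of these hunks, but the conversion it names is the standard NCHW to NHWC layout swap; a standalone numpy sketch (illustrative names, not the real helper):

```python
import numpy as np


# Illustrative only: shows the NCHW -> NHWC swap implied by
# "channel first to channel last"; this is not the qai_hub_models helper.
def to_channel_last(tensor_nchw: np.ndarray) -> np.ndarray:
    return np.transpose(tensor_nchw, (0, 2, 3, 1))


image = np.zeros((1, 3, 224, 224), dtype=np.float32)  # N, C, H, W
assert to_channel_last(image).shape == (1, 224, 224, 3)  # N, H, W, C
```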
More details on model performance accross various devices, can be found [here](https://aihub.qualcomm.com/models/facebook_denoiser). -[Sign up](https://aihub.qualcomm.com/) for early access to run these models on +[Sign up](https://myaccount.qualcomm.com/signup) for early access to run these models on a hosted Qualcomm® device. @@ -30,7 +30,7 @@ python -m qai_hub_models.models.facebook_denoiser.demo More details on the CLI tool can be found with the `--help` option. See [demo.py](demo.py) for sample usage of the model including pre/post processing scripts. Please refer to our [general instructions on using -models](../../#qai-hub-models) for more usage instructions. +models](../../../#getting-started) for more usage instructions. ## Export for on-device deployment @@ -46,7 +46,7 @@ script requires access to Deployment instructions for Qualcomm® AI Hub. ## License - The license for the original implementation of Facebook-Denoiser can be found [here](https://github.com/facebookresearch/denoiser/blob/main/LICENSE). -- The license for the compiled assets for on-device deployment can be found [here](https://qaihub-public-assets.s3.us-west-2.amazonaws.com/qai-hub-models/Qualcomm+AI+Hub+Proprietary+License.pdf). +- The license for the compiled assets for on-device deployment can be found [here]({deploy_license_url}) ## References * [Real Time Speech Enhancement in the Waveform Domain](https://arxiv.org/abs/2006.12847) diff --git a/qai_hub_models/models/facebook_denoiser/app.py b/qai_hub_models/models/facebook_denoiser/app.py index 5af5473b..537fc024 100644 --- a/qai_hub_models/models/facebook_denoiser/app.py +++ b/qai_hub_models/models/facebook_denoiser/app.py @@ -4,7 +4,6 @@ # --------------------------------------------------------------------- from __future__ import annotations -import os from pathlib import Path from typing import Callable, List, Sequence @@ -40,7 +39,6 @@ def predict(self, *args, **kwargs): def denoise_audio( self, input_audio: Sequence[Path | str | torch.Tensor | np.ndarray], - out_dir: Path | str | None = None, ) -> List[Path | torch.Tensor]: """ Denoise and isolate the speech in the provided audio clip(s). @@ -80,19 +78,9 @@ def denoise_audio( for noisy in noisy_audios: out = self.denoiser(noisy) out = out / max(out.abs().max().item(), 1) # Normalize - if all_inputs_are_paths and out_dir: + if all_inputs_are_paths: # We don't run files in batches, take the first batch output out = out[:, 0] estimates.append(out) - if out_dir and all_inputs_are_paths: - output_files = [] - for path, estimate in zip(input_audio, estimates): - filename = os.path.join( - out_dir, os.path.basename(path).rsplit(".", 1)[0] - ) - filename = Path(f"{filename}_enhanced.wav") - torchaudio.save(filename, estimate, self.sample_rate) - output_files.append(filename) - return output_files return estimates diff --git a/qai_hub_models/models/facebook_denoiser/conftest.py b/qai_hub_models/models/facebook_denoiser/conftest.py new file mode 100644 index 00000000..f2883f91 --- /dev/null +++ b/qai_hub_models/models/facebook_denoiser/conftest.py @@ -0,0 +1,26 @@ +# --------------------------------------------------------------------- +# Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. +# SPDX-License-Identifier: BSD-3-Clause +# --------------------------------------------------------------------- +# THIS FILE WAS AUTO-GENERATED. DO NOT EDIT MANUALLY. 
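The auto-generated conftest.py files added throughout this patch (including the facebook_denoiser one beginning here) install an autouse fixture that patches `Model.from_pretrained` to return one cached instance, so tests in a package do not reload weights repeatedly. A self-contained sketch of the same idea, with a toy model class instead of the real one and the context-manager form of `patch` in place of the generated files' bare `mock.start()`:

```python
# Toy illustration of the caching fixture pattern; "SlowModel" is a stand-in.
from unittest.mock import patch

import pytest


class SlowModel:
    load_count = 0

    @classmethod
    def from_pretrained(cls) -> "SlowModel":
        cls.load_count += 1  # pretend this is an expensive weight download
        return cls()


@pytest.fixture(autouse=True)
def mock_from_pretrained():
    cached = SlowModel.from_pretrained()
    with patch.object(SlowModel, "from_pretrained", return_value=cached):
        yield


def test_reuses_cached_instance():
    a = SlowModel.from_pretrained()
    b = SlowModel.from_pretrained()
    assert a is b and SlowModel.load_count == 1
```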
+ +from unittest.mock import patch + +import pytest + +from qai_hub_models.models.facebook_denoiser import Model +from qai_hub_models.utils.testing import skip_clone_repo_check + + +@pytest.fixture(autouse=True) +@skip_clone_repo_check +def mock_from_pretrained(): + """ + Model.from_pretrained() can be slow. Invoke it once and cache it so all invocations + across all tests return the cached instance of the model. + """ + mock = patch( + "qai_hub_models.models.facebook_denoiser.Model.from_pretrained", + return_value=Model.from_pretrained(), + ) + mock.start() diff --git a/qai_hub_models/models/facebook_denoiser/demo.py b/qai_hub_models/models/facebook_denoiser/demo.py index 6d6a86fe..cb8eb9d1 100644 --- a/qai_hub_models/models/facebook_denoiser/demo.py +++ b/qai_hub_models/models/facebook_denoiser/demo.py @@ -4,16 +4,25 @@ # --------------------------------------------------------------------- import os import tempfile +from pathlib import Path from typing import List +import torchaudio + from qai_hub_models.models.facebook_denoiser.app import FacebookDenoiserApp from qai_hub_models.models.facebook_denoiser.model import ( ASSET_VERSION, + DEFAULT_SEQUENCE_LENGTH, MODEL_ID, SAMPLE_RATE, FacebookDenoiser, ) -from qai_hub_models.utils.args import get_model_cli_parser, model_from_cli_args +from qai_hub_models.utils.args import ( + demo_model_from_cli_args, + get_model_cli_parser, + get_on_device_demo_parser, + validate_on_device_demo_args, +) from qai_hub_models.utils.asset_loaders import CachedWebModelAsset, load_path EXAMPLE_RECORDING = CachedWebModelAsset.from_asset_store( @@ -26,6 +35,7 @@ def main(is_test: bool = False): Run facebook denoiser on a sample audio (`.wav`) file. """ parser = get_model_cli_parser(FacebookDenoiser) + parser = get_on_device_demo_parser(parser, add_output_dir=True) parser.add_argument( "--audio", nargs="+", @@ -38,26 +48,41 @@ def main(is_test: bool = False): default=SAMPLE_RATE, help="Audio sample rate the model was trained on", ) - parser.add_argument( - "--output-dir", - type=str, - default=os.getcwd(), - help="output directory (where output WAV should be written)", - ) args = parser.parse_args([] if is_test else None) + model = demo_model_from_cli_args(FacebookDenoiser, MODEL_ID, args) + validate_on_device_demo_args(args, MODEL_ID) - # Load Model - source_model = model_from_cli_args(FacebookDenoiser, args) - app = FacebookDenoiserApp(source_model, args.sample_rate) + app = FacebookDenoiserApp(model, args.sample_rate) # Download data - audio: List[str] = args.audio + audio_files: List[str] = args.audio + audio_tensors = [] with tempfile.TemporaryDirectory() as tmpdir: - for idx, file in enumerate(audio): - audio[idx] = load_path(file, tmpdir) + for idx, file in enumerate(audio_files): + audio_file = load_path(file, tmpdir) + audio, sample_rate = torchaudio.load(audio_file) + # By default, cut audio to the default sequence length + # since by default, model is compiled with this input size + audio_tensor = audio[0, :DEFAULT_SEQUENCE_LENGTH].unsqueeze(0).unsqueeze(0) + assert sample_rate == SAMPLE_RATE + audio_tensors.append(audio_tensor) # Dump output from app - output = app.denoise_audio(audio, args.output_dir) + output = app.denoise_audio(audio_tensors) + + if args.output_dir: + output_files = [] + for file, estimate in zip(audio_files, output): + local_path = load_path(file, tmpdir) + filename = os.path.join( + args.output_dir, os.path.basename(local_path).rsplit(".", 1)[0] + ) + filename = Path(f"{filename}_enhanced.wav") + # make input 2D: + estimate = 
estimate.squeeze().unsqueeze(0) + torchaudio.save(filename, estimate, SAMPLE_RATE) + output_files.append(filename) + return output_files if not is_test: print("Wrote files:") diff --git a/qai_hub_models/models/facebook_denoiser/export.py b/qai_hub_models/models/facebook_denoiser/export.py index 4f462b4f..f536b473 100644 --- a/qai_hub_models/models/facebook_denoiser/export.py +++ b/qai_hub_models/models/facebook_denoiser/export.py @@ -10,7 +10,7 @@ import os import warnings from pathlib import Path -from typing import List, Optional, Tuple +from typing import Any, Dict, List, Optional, Tuple, cast import qai_hub as hub import torch @@ -26,6 +26,7 @@ from qai_hub_models.utils.input_spec import make_torch_inputs from qai_hub_models.utils.printing import ( print_inference_metrics, + print_on_target_demo_cmd, print_profile_metrics_from_job, ) from qai_hub_models.utils.qai_hub_helpers import ( @@ -107,65 +108,77 @@ def export_model( ) # Trace the model - source_model = torch.jit.trace(model, make_torch_inputs(input_spec)) + source_model = torch.jit.trace(model.to("cpu"), make_torch_inputs(input_spec)) # 2. Compile the model to an on-device asset model_compile_options = model.get_hub_compile_options( target_runtime, compile_options ) - print(f"Optimizing model {model_name} to run on-device.") - compile_job = hub.submit_compile_job( + print(f"Optimizing model {model_name} to run on-device") + submitted_compile_job = hub.submit_compile_job( model=source_model, input_specs=input_spec, device=hub.Device(device), name=model_name, options=model_compile_options, ) + compile_job = cast(hub.client.CompileJob, submitted_compile_job) # 3. Profile the model asset on real devices - profile_job = None + profile_job: Optional[hub.client.ProfileJob] = None if not skip_profiling: + profile_options_all = model.get_hub_profile_options( + target_runtime, profile_options + ) print(f"Profiling model {model_name} on a hosted device.") - profile_job = hub.submit_profile_job( + submitted_profile_job = hub.submit_profile_job( model=compile_job.get_target_model(), device=hub.Device(device), name=model_name, - options=profile_options, + options=profile_options_all, ) + profile_job = cast(hub.client.ProfileJob, submitted_profile_job) # 4. Run inference on-device with sample inputs - inference_job = None + inference_job: Optional[hub.client.InferenceJob] = None if not skip_inferencing: + profile_options_all = model.get_hub_profile_options( + target_runtime, profile_options + ) print( f"Running inference for {model_name} on a hosted device with example inputs." ) sample_inputs = model.sample_inputs(input_spec) - inference_job = hub.submit_inference_job( + submitted_inference_job = hub.submit_inference_job( model=compile_job.get_target_model(), inputs=sample_inputs, device=hub.Device(device), name=model_name, - options=profile_options, + options=profile_options_all, ) + inference_job = cast(hub.client.InferenceJob, submitted_inference_job) # 5. Download the model asset to a local file if not skip_downloading: os.makedirs(output_path, exist_ok=True) - target_model = compile_job.get_target_model() + target_model: hub.Model = compile_job.get_target_model() # type: ignore target_model.download(str(output_path / f"{model_name}.tflite")) # 6. 
Summarize the results from profiling and inference if not skip_summary and not skip_profiling: - assert profile_job.wait().success - profile_data = profile_job.download_profile() + assert profile_job is not None and profile_job.wait().success + profile_data: Dict[str, Any] = profile_job.download_profile() # type: ignore print_profile_metrics_from_job(profile_job, profile_data) if not skip_summary and not skip_inferencing: torch_out = torch_inference(model, sample_inputs) - assert inference_job.wait().success - inference_result = inference_job.download_output_data() + assert inference_job is not None and inference_job.wait().success + inference_result: hub.client.DatasetEntries = inference_job.download_output_data() # type: ignore print_inference_metrics(inference_job, inference_result, torch_out) + if not skip_summary: + print_on_target_demo_cmd(compile_job, Path(__file__).parent.resolve(), device) + return (compile_job, profile_job, inference_job) diff --git a/qai_hub_models/models/facebook_denoiser/info.yaml b/qai_hub_models/models/facebook_denoiser/info.yaml index 0b4e8453..3e5cf4ed 100644 --- a/qai_hub_models/models/facebook_denoiser/info.yaml +++ b/qai_hub_models/models/facebook_denoiser/info.yaml @@ -11,6 +11,7 @@ tags: [] research_paper: https://arxiv.org/abs/2006.12847 research_paper_title: Real Time Speech Enhancement in the Waveform Domain license: https://github.com/facebookresearch/denoiser/blob/main/LICENSE +deploy_license: https://qaihub-public-assets.s3.us-west-2.amazonaws.com/qai-hub-models/Qualcomm+AI+Hub+Proprietary+License.pdf source_repo: https://github.com/facebookresearch/denoiser technical_details: Input resolution: 1x1x917 @@ -27,4 +28,5 @@ related_models: [] has_static_banner: yes has_animated_banner: yes license_type: cc-by-nc-4.0 +deploy_license_type: AI Model Hub License dataset: [] diff --git a/qai_hub_models/models/facebook_denoiser/model.py b/qai_hub_models/models/facebook_denoiser/model.py index 72a23b59..68e112f4 100644 --- a/qai_hub_models/models/facebook_denoiser/model.py +++ b/qai_hub_models/models/facebook_denoiser/model.py @@ -4,16 +4,19 @@ # --------------------------------------------------------------------- from __future__ import annotations +from typing import Optional + import torch -from denoiser import pretrained -from denoiser.pretrained import DNS_48_URL +from qai_hub_models.utils.asset_loaders import SourceAsRoot from qai_hub_models.utils.base_model import BaseModel from qai_hub_models.utils.input_spec import InputSpec +SOURCE_REPOSITORY = "https://github.com/facebookresearch/denoiser" +SOURCE_REPO_COMMIT = "8afd7c166699bb3c8b2d95b6dd706f71e1075df0" SAMPLE_RATE = 16000 HIDDEN_LAYER_COUNT = 48 -DEFAULT_SEQUENCE_LENGTH = 917 +DEFAULT_SEQUENCE_LENGTH = 100000 # This corresponds to about 6 seconds of audio MODEL_ID = "facebook_denoiser" ASSET_VERSION = 1 @@ -39,8 +42,8 @@ def forward(self, audio: torch.Tensor) -> torch.Tensor: """ return self.net(audio) + @staticmethod def get_input_spec( - self, batch_size: int = 1, sequence_length: int = DEFAULT_SEQUENCE_LENGTH, ) -> InputSpec: @@ -52,9 +55,14 @@ def get_input_spec( @classmethod def from_pretrained( - cls, state_dict_url: str = DNS_48_URL, hidden_layer_count=HIDDEN_LAYER_COUNT + cls, state_dict_url: Optional[str] = None, hidden_layer_count=HIDDEN_LAYER_COUNT ) -> FacebookDenoiser: - net = pretrained._demucs( - state_dict_url is not None, state_dict_url, hidden=hidden_layer_count - ) - return cls(net) + with SourceAsRoot( + SOURCE_REPOSITORY, SOURCE_REPO_COMMIT, MODEL_ID, ASSET_VERSION + ): + 
from denoiser.pretrained import DNS_48_URL, _demucs + + if state_dict_url is None: + state_dict_url = DNS_48_URL + net = _demucs(True, state_dict_url, hidden=hidden_layer_count) + return cls(net) diff --git a/qai_hub_models/models/facebook_denoiser/perf.yaml b/qai_hub_models/models/facebook_denoiser/perf.yaml index 4f7e4ae7..425ac777 100644 --- a/qai_hub_models/models/facebook_denoiser/perf.yaml +++ b/qai_hub_models/models/facebook_denoiser/perf.yaml @@ -17,22 +17,25 @@ aggregated: - Samsung Galaxy S23 - Samsung Galaxy S23 Ultra - Samsung Galaxy S23+ + - Samsung Galaxy S24 + - Samsung Galaxy S24 Ultra - Samsung Galaxy Tab S8 - Xiaomi 12 - Xiaomi 12 Pro supported_chipsets: - Snapdragon® 8 Gen 1 - Snapdragon® 8 Gen 2 + - Snapdragon® 8 Gen 3 - Snapdragon® 888 models: - name: Facebook-Denoiser performance_metrics: - torchscript_onnx_tflite: - inference_time: 6985.0 - throughput: 143.16392269148176 + inference_time: 711384.0 + throughput: 1.4057105585731475 estimated_peak_memory_range: - min: 28246016 - max: 51679504 + min: 236318720 + max: 349174920 primary_compute_unit: CPU precision: fp32 layer_info: @@ -40,8 +43,16 @@ models: layers_on_gpu: 0 layers_on_cpu: 209 total_layers: 209 - job_id: jn5qlrw7p + job_id: j1p3kwm52 job_status: Passed + reference_device_info: + name: Samsung Galaxy S23 + os: '13' + form_factor: Phone + os_name: Android + manufacturer: Samsung + chipset: Snapdragon® 8 Gen 2 + timestamp: '2024-03-14T23:10:56.043154Z' torchscript_onnx_qnn: inference_time: 'null' throughput: 'null' @@ -57,11 +68,41 @@ models: total_layers: 0 job_id: '' job_status: Skipped + - torchscript_onnx_tflite: + inference_time: 670316.0 + throughput: 1.4918337023135357 + estimated_peak_memory_range: + min: 481374208 + max: 504692832 + primary_compute_unit: CPU + precision: fp32 + layer_info: + layers_on_npu: 0 + layers_on_gpu: 0 + layers_on_cpu: 209 + total_layers: 209 + job_id: jwgoy4158 + job_status: Passed reference_device_info: - name: Samsung Galaxy S23 Ultra - os: '13' + name: Samsung Galaxy S24 + os: '14' form_factor: Phone os_name: Android manufacturer: Samsung - chipset: Snapdragon® 8 Gen 2 - timestamp: '2024-02-21T16:10:37.856306Z' + chipset: Snapdragon® 8 Gen 3 + timestamp: '2024-03-14T23:10:56.043167Z' + torchscript_onnx_qnn: + inference_time: 'null' + throughput: 'null' + estimated_peak_memory_range: + min: 0 + max: 0 + primary_compute_unit: 'null' + precision: 'null' + layer_info: + layers_on_npu: 0 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 0 + job_id: '' + job_status: Skipped diff --git a/qai_hub_models/models/facebook_denoiser/requirements.txt b/qai_hub_models/models/facebook_denoiser/requirements.txt index 0307e34c..855a9703 100644 --- a/qai_hub_models/models/facebook_denoiser/requirements.txt +++ b/qai_hub_models/models/facebook_denoiser/requirements.txt @@ -1,3 +1,3 @@ -denoiser -torchaudio +hydra-core==1.3.0 +torchaudio==0.13.1 PySoundFile; sys_platform == 'win32' diff --git a/qai_hub_models/models/facebook_denoiser/test.py b/qai_hub_models/models/facebook_denoiser/test.py index 56a46b6c..c77ab11a 100644 --- a/qai_hub_models/models/facebook_denoiser/test.py +++ b/qai_hub_models/models/facebook_denoiser/test.py @@ -2,8 +2,8 @@ # Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. 
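The `model.py` change above swaps the pip-installed `denoiser` package for a pinned checkout of the upstream repository, deferring the import until `SourceAsRoot` has put the clone on the import path. A sketch of the same pattern applied to a hypothetical upstream repo (repo URL, commit, and module names are placeholders, not real identifiers):

```python
from qai_hub_models.utils.asset_loaders import SourceAsRoot

UPSTREAM_REPO = "https://github.com/example/upstream-net"  # placeholder
UPSTREAM_COMMIT = "0123456789abcdef"                       # placeholder pin


def load_upstream_model():
    # The upstream package only becomes importable inside the context manager,
    # so the import must live in the function body, not at module scope.
    with SourceAsRoot(UPSTREAM_REPO, UPSTREAM_COMMIT, "example_model", 1):
        from upstream.pretrained import build_net  # placeholder module

        return build_net(pretrained=True)
```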
# SPDX-License-Identifier: BSD-3-Clause # --------------------------------------------------------------------- +import numpy as np import pytest -import torch import torchaudio from qai_hub_models.models.facebook_denoiser.app import FacebookDenoiserApp @@ -16,6 +16,7 @@ FacebookDenoiser, ) from qai_hub_models.utils.asset_loaders import CachedWebModelAsset +from qai_hub_models.utils.testing import skip_clone_repo_check ENHANCED_EXAMPLE_RECORDING = CachedWebModelAsset.from_asset_store( MODEL_ID, ASSET_VERSION, "icsi_meeting_recording_enhanced.wav" @@ -30,18 +31,21 @@ def _handle_runtime_error(e: RuntimeError): ) +@skip_clone_repo_check def test_task(): app = FacebookDenoiserApp(FacebookDenoiser.from_pretrained()) try: - out = app.predict([EXAMPLE_RECORDING.fetch()])[0][:, 0] + out = app.predict([EXAMPLE_RECORDING.fetch()])[0] except RuntimeError as e: _handle_runtime_error(e) return expected, _ = torchaudio.load(ENHANCED_EXAMPLE_RECORDING.fetch()) - torch.testing.assert_allclose(out, expected) + np.testing.assert_allclose(out, expected, atol=1e-07) @pytest.mark.skip(reason="Fails with a mysterious error in DefaultCPUAllocator.") +@pytest.mark.trace +@skip_clone_repo_check def test_trace(): try: input_data, sample_rate = torchaudio.load(EXAMPLE_RECORDING.fetch()) @@ -58,8 +62,9 @@ def test_trace(): return expected, _ = torchaudio.load(ENHANCED_EXAMPLE_RECORDING.fetch()) - torch.testing.assert_allclose(out, expected) + np.testing.assert_allclose(out, expected, atol=1e-07) +@skip_clone_repo_check def test_demo(): demo_main(is_test=True) diff --git a/qai_hub_models/models/fastsam_s/README.md b/qai_hub_models/models/fastsam_s/README.md index 1fc0e262..9d774e45 100644 --- a/qai_hub_models/models/fastsam_s/README.md +++ b/qai_hub_models/models/fastsam_s/README.md @@ -10,7 +10,7 @@ This is based on the implementation of FastSam-S found export suitable to run on Qualcomm® devices. More details on model performance accross various devices, can be found [here](https://aihub.qualcomm.com/models/fastsam_s). -[Sign up](https://aihub.qualcomm.com/) for early access to run these models on +[Sign up](https://myaccount.qualcomm.com/signup) for early access to run these models on a hosted Qualcomm® device. @@ -30,7 +30,7 @@ python -m qai_hub_models.models.fastsam_s.demo More details on the CLI tool can be found with the `--help` option. See [demo.py](demo.py) for sample usage of the model including pre/post processing scripts. Please refer to our [general instructions on using -models](../../#qai-hub-models) for more usage instructions. +models](../../../#getting-started) for more usage instructions. ## Export for on-device deployment @@ -46,7 +46,7 @@ script requires access to Deployment instructions for Qualcomm® AI Hub. ## License - The license for the original implementation of FastSam-S can be found [here](https://github.com/CASIA-IVA-Lab/FastSAM/blob/main/LICENSE). -- The license for the compiled assets for on-device deployment can be found [here](https://qaihub-public-assets.s3.us-west-2.amazonaws.com/qai-hub-models/Qualcomm+AI+Hub+Proprietary+License.pdf). 
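On the switch from `torch.testing.assert_allclose` to `np.testing.assert_allclose` in the test above: numpy's defaults are `rtol=1e-07, atol=0`, so reference samples at or near zero can only match exactly unless an absolute tolerance is given, which is why `atol=1e-07` is added. A small illustration:

```python
import numpy as np
import torch

reference = torch.zeros(4)
candidate = reference + 5e-8  # tiny absolute error on near-silent samples

# With the default atol=0 this would raise, because 5e-8 > rtol * |0|.
# np.testing.assert_allclose(candidate, reference)

# With an explicit absolute tolerance it passes.
np.testing.assert_allclose(candidate, reference, atol=1e-07)
```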
+- The license for the compiled assets for on-device deployment can be found [here]({deploy_license_url}) ## References * [Fast Segment Anything](https://arxiv.org/abs/2306.12156) diff --git a/qai_hub_models/models/fastsam_s/conftest.py b/qai_hub_models/models/fastsam_s/conftest.py new file mode 100644 index 00000000..367fe8b9 --- /dev/null +++ b/qai_hub_models/models/fastsam_s/conftest.py @@ -0,0 +1,24 @@ +# --------------------------------------------------------------------- +# Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. +# SPDX-License-Identifier: BSD-3-Clause +# --------------------------------------------------------------------- +# THIS FILE WAS AUTO-GENERATED. DO NOT EDIT MANUALLY. + +from unittest.mock import patch + +import pytest + +from qai_hub_models.models.fastsam_s import Model + + +@pytest.fixture(autouse=True) +def mock_from_pretrained(): + """ + Model.from_pretrained() can be slow. Invoke it once and cache it so all invocations + across all tests return the cached instance of the model. + """ + mock = patch( + "qai_hub_models.models.fastsam_s.Model.from_pretrained", + return_value=Model.from_pretrained(), + ) + mock.start() diff --git a/qai_hub_models/models/fastsam_s/demo.py b/qai_hub_models/models/fastsam_s/demo.py index d5396330..df4e5163 100644 --- a/qai_hub_models/models/fastsam_s/demo.py +++ b/qai_hub_models/models/fastsam_s/demo.py @@ -16,7 +16,7 @@ def main(is_test: bool = False): - fastsam_demo(FastSAM_S, image_path=INPUT_IMAGE, is_test=is_test) + fastsam_demo(FastSAM_S, MODEL_ID, image_path=INPUT_IMAGE, is_test=is_test) if __name__ == "__main__": diff --git a/qai_hub_models/models/fastsam_s/export.py b/qai_hub_models/models/fastsam_s/export.py index 61896353..4902b3a5 100644 --- a/qai_hub_models/models/fastsam_s/export.py +++ b/qai_hub_models/models/fastsam_s/export.py @@ -10,7 +10,7 @@ import os import warnings from pathlib import Path -from typing import List, Optional, Tuple +from typing import Any, Dict, List, Optional, Tuple, cast import qai_hub as hub import torch @@ -111,7 +111,7 @@ def export_model( # Trace the model source_model = torch.jit.trace( - model, make_torch_inputs(input_spec), check_trace=False + model.to("cpu"), make_torch_inputs(input_spec), check_trace=False ) # 2. Compile the model to an on-device asset @@ -121,29 +121,37 @@ def export_model( + " --force_channel_last_input image" + " --force_channel_last_output output_1,output_2,output_3,output_5", ) - print(f"Optimizing model {model_name} to run on-device.") - compile_job = hub.submit_compile_job( + print(f"Optimizing model {model_name} to run on-device") + submitted_compile_job = hub.submit_compile_job( model=source_model, input_specs=input_spec, device=hub.Device(device), name=model_name, options=model_compile_options, ) + compile_job = cast(hub.client.CompileJob, submitted_compile_job) # 3. Profile the model asset on real devices - profile_job = None + profile_job: Optional[hub.client.ProfileJob] = None if not skip_profiling: + profile_options_all = model.get_hub_profile_options( + target_runtime, profile_options + ) print(f"Profiling model {model_name} on a hosted device.") - profile_job = hub.submit_profile_job( + submitted_profile_job = hub.submit_profile_job( model=compile_job.get_target_model(), device=hub.Device(device), name=model_name, - options=profile_options, + options=profile_options_all, ) + profile_job = cast(hub.client.ProfileJob, submitted_profile_job) # 4. 
Run inference on-device with sample inputs - inference_job = None + inference_job: Optional[hub.client.InferenceJob] = None if not skip_inferencing: + profile_options_all = model.get_hub_profile_options( + target_runtime, profile_options + ) print( f"Running inference for {model_name} on a hosted device with example inputs." ) @@ -152,37 +160,39 @@ def export_model( hub_inputs = transpose_channel_first_to_last( "image", sample_inputs, target_runtime ) - inference_job = hub.submit_inference_job( + submitted_inference_job = hub.submit_inference_job( model=compile_job.get_target_model(), inputs=hub_inputs, device=hub.Device(device), name=model_name, - options=profile_options, + options=profile_options_all, ) + inference_job = cast(hub.client.InferenceJob, submitted_inference_job) # 5. Download the model asset to a local file if not skip_downloading: os.makedirs(output_path, exist_ok=True) - target_model = compile_job.get_target_model() + target_model: hub.Model = compile_job.get_target_model() # type: ignore target_model.download(str(output_path / f"{model_name}.tflite")) # 6. Summarize the results from profiling and inference if not skip_summary and not skip_profiling: - assert profile_job.wait().success - profile_data = profile_job.download_profile() + assert profile_job is not None and profile_job.wait().success + profile_data: Dict[str, Any] = profile_job.download_profile() # type: ignore print_profile_metrics_from_job(profile_job, profile_data) if not skip_summary and not skip_inferencing: torch_out = torch_inference(model, sample_inputs) - assert inference_job.wait().success - inference_result = inference_job.download_output_data() + assert inference_job is not None and inference_job.wait().success + inference_result: hub.client.DatasetEntries = inference_job.download_output_data() # type: ignore # Convert outputs from channel last to channel first inference_result = transpose_channel_last_to_first( "output_1,output_2,output_3,output_5", inference_result, target_runtime ) print_inference_metrics(inference_job, inference_result, torch_out) - print_on_target_demo_cmd(compile_job, Path(__file__).parent.resolve(), device) + if not skip_summary: + print_on_target_demo_cmd(compile_job, Path(__file__).parent.resolve(), device) return (compile_job, profile_job, inference_job) diff --git a/qai_hub_models/models/fastsam_s/info.yaml b/qai_hub_models/models/fastsam_s/info.yaml index f2a59ae2..4d701b49 100644 --- a/qai_hub_models/models/fastsam_s/info.yaml +++ b/qai_hub_models/models/fastsam_s/info.yaml @@ -14,6 +14,7 @@ tags: [] research_paper: https://arxiv.org/abs/2306.12156 research_paper_title: Fast Segment Anything license: https://github.com/CASIA-IVA-Lab/FastSAM/blob/main/LICENSE +deploy_license: https://github.com/CASIA-IVA-Lab/FastSAM/blob/main/LICENSE source_repo: https://github.com/CASIA-IVA-Lab/FastSAM technical_details: Model checkpoint: fastsam-s.pt @@ -33,4 +34,5 @@ form_factors: has_static_banner: yes has_animated_banner: yes license_type: agpl-3.0 +deploy_license_type: agpl-3.0 dataset: [] diff --git a/qai_hub_models/models/fastsam_s/perf.yaml b/qai_hub_models/models/fastsam_s/perf.yaml index ca26141a..af668b3d 100644 --- a/qai_hub_models/models/fastsam_s/perf.yaml +++ b/qai_hub_models/models/fastsam_s/perf.yaml @@ -17,22 +17,25 @@ aggregated: - Samsung Galaxy S23 - Samsung Galaxy S23 Ultra - Samsung Galaxy S23+ + - Samsung Galaxy S24 + - Samsung Galaxy S24 Ultra - Samsung Galaxy Tab S8 - Xiaomi 12 - Xiaomi 12 Pro supported_chipsets: - Snapdragon® 8 Gen 1 - Snapdragon® 8 Gen 2 + - 
Snapdragon® 8 Gen 3 - Snapdragon® 888 models: - name: FastSam-S performance_metrics: - torchscript_onnx_tflite: - inference_time: 13071.0 - throughput: 76.50524060898171 + inference_time: 13114.0 + throughput: 76.25438462711605 estimated_peak_memory_range: - min: 7827456 - max: 10814968 + min: 7823360 + max: 25444440 primary_compute_unit: NPU precision: fp16 layer_info: @@ -40,8 +43,16 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 288 - job_id: jn5qlr97p + job_id: jegn21vgo job_status: Passed + reference_device_info: + name: Samsung Galaxy S23 + os: '13' + form_factor: Phone + os_name: Android + manufacturer: Samsung + chipset: Snapdragon® 8 Gen 2 + timestamp: '2024-03-14T23:39:15.450027Z' torchscript_onnx_qnn: inference_time: 'null' throughput: 'null' @@ -57,11 +68,41 @@ models: total_layers: 0 job_id: '' job_status: Skipped + - torchscript_onnx_tflite: + inference_time: 9234.0 + throughput: 108.29542993285683 + estimated_peak_memory_range: + min: 6332416 + max: 79756208 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 288 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 288 + job_id: joprkxv50 + job_status: Passed reference_device_info: - name: Samsung Galaxy S23 Ultra - os: '13' + name: Samsung Galaxy S24 + os: '14' form_factor: Phone os_name: Android manufacturer: Samsung - chipset: Snapdragon® 8 Gen 2 - timestamp: '2024-02-21T16:18:24.085348Z' + chipset: Snapdragon® 8 Gen 3 + timestamp: '2024-03-14T23:39:15.450036Z' + torchscript_onnx_qnn: + inference_time: 'null' + throughput: 'null' + estimated_peak_memory_range: + min: 0 + max: 0 + primary_compute_unit: 'null' + precision: 'null' + layer_info: + layers_on_npu: 0 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 0 + job_id: '' + job_status: Skipped diff --git a/qai_hub_models/models/fastsam_s/requirements.txt b/qai_hub_models/models/fastsam_s/requirements.txt index 8d55bfa4..94980b0d 100644 --- a/qai_hub_models/models/fastsam_s/requirements.txt +++ b/qai_hub_models/models/fastsam_s/requirements.txt @@ -1,2 +1,3 @@ +seaborn==0.11.0 +thop==0.1.1.post2209072238 ultralytics==8.0.193 -torchvision diff --git a/qai_hub_models/models/fastsam_x/README.md b/qai_hub_models/models/fastsam_x/README.md index da7d4af4..4983e01b 100644 --- a/qai_hub_models/models/fastsam_x/README.md +++ b/qai_hub_models/models/fastsam_x/README.md @@ -10,7 +10,7 @@ This is based on the implementation of FastSam-X found export suitable to run on Qualcomm® devices. More details on model performance accross various devices, can be found [here](https://aihub.qualcomm.com/models/fastsam_x). -[Sign up](https://aihub.qualcomm.com/) for early access to run these models on +[Sign up](https://myaccount.qualcomm.com/signup) for early access to run these models on a hosted Qualcomm® device. @@ -30,7 +30,7 @@ python -m qai_hub_models.models.fastsam_x.demo More details on the CLI tool can be found with the `--help` option. See [demo.py](demo.py) for sample usage of the model including pre/post processing scripts. Please refer to our [general instructions on using -models](../../#qai-hub-models) for more usage instructions. +models](../../../#getting-started) for more usage instructions. ## Export for on-device deployment @@ -46,7 +46,7 @@ script requires access to Deployment instructions for Qualcomm® AI Hub. ## License - The license for the original implementation of FastSam-X can be found [here](https://github.com/CASIA-IVA-Lab/FastSAM/blob/main/LICENSE). 
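The `export.py` rewrites throughout this patch follow one repeated shape: keep the raw return value of each `submit_*` call, `cast` it to the concrete job type so later attribute access type-checks, and mark the optional jobs as `Optional`. A condensed sketch of the compile-and-profile portion (options and runtime handling trimmed; assumes a configured `qai-hub` client and an already-traced model):

```python
from typing import Optional, cast

import qai_hub as hub


def compile_then_profile(traced_model, input_spec, device_name: str,
                         model_name: str, skip_profiling: bool = False):
    submitted_compile_job = hub.submit_compile_job(
        model=traced_model,
        input_specs=input_spec,
        device=hub.Device(device_name),
        name=model_name,
    )
    # submit_compile_job returns a generic job handle; the cast lets type
    # checkers see CompileJob-only members such as get_target_model().
    compile_job = cast(hub.client.CompileJob, submitted_compile_job)

    profile_job: Optional[hub.client.ProfileJob] = None
    if not skip_profiling:
        submitted_profile_job = hub.submit_profile_job(
            model=compile_job.get_target_model(),
            device=hub.Device(device_name),
            name=model_name,
        )
        profile_job = cast(hub.client.ProfileJob, submitted_profile_job)
    return compile_job, profile_job
```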
-- The license for the compiled assets for on-device deployment can be found [here](https://qaihub-public-assets.s3.us-west-2.amazonaws.com/qai-hub-models/Qualcomm+AI+Hub+Proprietary+License.pdf). +- The license for the compiled assets for on-device deployment can be found [here]({deploy_license_url}) ## References * [Fast Segment Anything](https://arxiv.org/abs/2306.12156) diff --git a/qai_hub_models/models/fastsam_x/conftest.py b/qai_hub_models/models/fastsam_x/conftest.py new file mode 100644 index 00000000..e0c49878 --- /dev/null +++ b/qai_hub_models/models/fastsam_x/conftest.py @@ -0,0 +1,24 @@ +# --------------------------------------------------------------------- +# Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. +# SPDX-License-Identifier: BSD-3-Clause +# --------------------------------------------------------------------- +# THIS FILE WAS AUTO-GENERATED. DO NOT EDIT MANUALLY. + +from unittest.mock import patch + +import pytest + +from qai_hub_models.models.fastsam_x import Model + + +@pytest.fixture(autouse=True) +def mock_from_pretrained(): + """ + Model.from_pretrained() can be slow. Invoke it once and cache it so all invocations + across all tests return the cached instance of the model. + """ + mock = patch( + "qai_hub_models.models.fastsam_x.Model.from_pretrained", + return_value=Model.from_pretrained(), + ) + mock.start() diff --git a/qai_hub_models/models/fastsam_x/demo.py b/qai_hub_models/models/fastsam_x/demo.py index 0bce5d27..5acecbe5 100644 --- a/qai_hub_models/models/fastsam_x/demo.py +++ b/qai_hub_models/models/fastsam_x/demo.py @@ -16,7 +16,7 @@ def main(is_test: bool = False): - fastsam_demo(FastSAM_X, image_path=INPUT_IMAGE, is_test=is_test) + fastsam_demo(FastSAM_X, MODEL_ID, image_path=INPUT_IMAGE, is_test=is_test) if __name__ == "__main__": diff --git a/qai_hub_models/models/fastsam_x/export.py b/qai_hub_models/models/fastsam_x/export.py index c4cdf8cd..617e16c5 100644 --- a/qai_hub_models/models/fastsam_x/export.py +++ b/qai_hub_models/models/fastsam_x/export.py @@ -10,7 +10,7 @@ import os import warnings from pathlib import Path -from typing import List, Optional, Tuple +from typing import Any, Dict, List, Optional, Tuple, cast import qai_hub as hub import torch @@ -111,7 +111,7 @@ def export_model( # Trace the model source_model = torch.jit.trace( - model, make_torch_inputs(input_spec), check_trace=False + model.to("cpu"), make_torch_inputs(input_spec), check_trace=False ) # 2. Compile the model to an on-device asset @@ -121,29 +121,37 @@ def export_model( + " --force_channel_last_input image" + " --force_channel_last_output output_1,output_2,output_3,output_5", ) - print(f"Optimizing model {model_name} to run on-device.") - compile_job = hub.submit_compile_job( + print(f"Optimizing model {model_name} to run on-device") + submitted_compile_job = hub.submit_compile_job( model=source_model, input_specs=input_spec, device=hub.Device(device), name=model_name, options=model_compile_options, ) + compile_job = cast(hub.client.CompileJob, submitted_compile_job) # 3. 
Profile the model asset on real devices - profile_job = None + profile_job: Optional[hub.client.ProfileJob] = None if not skip_profiling: + profile_options_all = model.get_hub_profile_options( + target_runtime, profile_options + ) print(f"Profiling model {model_name} on a hosted device.") - profile_job = hub.submit_profile_job( + submitted_profile_job = hub.submit_profile_job( model=compile_job.get_target_model(), device=hub.Device(device), name=model_name, - options=profile_options, + options=profile_options_all, ) + profile_job = cast(hub.client.ProfileJob, submitted_profile_job) # 4. Run inference on-device with sample inputs - inference_job = None + inference_job: Optional[hub.client.InferenceJob] = None if not skip_inferencing: + profile_options_all = model.get_hub_profile_options( + target_runtime, profile_options + ) print( f"Running inference for {model_name} on a hosted device with example inputs." ) @@ -152,37 +160,39 @@ def export_model( hub_inputs = transpose_channel_first_to_last( "image", sample_inputs, target_runtime ) - inference_job = hub.submit_inference_job( + submitted_inference_job = hub.submit_inference_job( model=compile_job.get_target_model(), inputs=hub_inputs, device=hub.Device(device), name=model_name, - options=profile_options, + options=profile_options_all, ) + inference_job = cast(hub.client.InferenceJob, submitted_inference_job) # 5. Download the model asset to a local file if not skip_downloading: os.makedirs(output_path, exist_ok=True) - target_model = compile_job.get_target_model() + target_model: hub.Model = compile_job.get_target_model() # type: ignore target_model.download(str(output_path / f"{model_name}.tflite")) # 6. Summarize the results from profiling and inference if not skip_summary and not skip_profiling: - assert profile_job.wait().success - profile_data = profile_job.download_profile() + assert profile_job is not None and profile_job.wait().success + profile_data: Dict[str, Any] = profile_job.download_profile() # type: ignore print_profile_metrics_from_job(profile_job, profile_data) if not skip_summary and not skip_inferencing: torch_out = torch_inference(model, sample_inputs) - assert inference_job.wait().success - inference_result = inference_job.download_output_data() + assert inference_job is not None and inference_job.wait().success + inference_result: hub.client.DatasetEntries = inference_job.download_output_data() # type: ignore # Convert outputs from channel last to channel first inference_result = transpose_channel_last_to_first( "output_1,output_2,output_3,output_5", inference_result, target_runtime ) print_inference_metrics(inference_job, inference_result, torch_out) - print_on_target_demo_cmd(compile_job, Path(__file__).parent.resolve(), device) + if not skip_summary: + print_on_target_demo_cmd(compile_job, Path(__file__).parent.resolve(), device) return (compile_job, profile_job, inference_job) diff --git a/qai_hub_models/models/fastsam_x/info.yaml b/qai_hub_models/models/fastsam_x/info.yaml index a39f4a47..aa479646 100644 --- a/qai_hub_models/models/fastsam_x/info.yaml +++ b/qai_hub_models/models/fastsam_x/info.yaml @@ -14,6 +14,7 @@ tags: [] research_paper: https://arxiv.org/abs/2306.12156 research_paper_title: Fast Segment Anything license: https://github.com/CASIA-IVA-Lab/FastSAM/blob/main/LICENSE +deploy_license: https://github.com/CASIA-IVA-Lab/FastSAM/blob/main/LICENSE source_repo: https://github.com/CASIA-IVA-Lab/FastSAM technical_details: Model checkpoint: fastsam-x.pt @@ -33,4 +34,5 @@ form_factors: has_static_banner: 
yes has_animated_banner: yes license_type: agpl-3.0 +deploy_license_type: agpl-3.0 dataset: [] diff --git a/qai_hub_models/models/fastsam_x/perf.yaml b/qai_hub_models/models/fastsam_x/perf.yaml index 3fa52d1d..f4f2b30b 100644 --- a/qai_hub_models/models/fastsam_x/perf.yaml +++ b/qai_hub_models/models/fastsam_x/perf.yaml @@ -17,22 +17,25 @@ aggregated: - Samsung Galaxy S23 - Samsung Galaxy S23 Ultra - Samsung Galaxy S23+ + - Samsung Galaxy S24 + - Samsung Galaxy S24 Ultra - Samsung Galaxy Tab S8 - Xiaomi 12 - Xiaomi 12 Pro supported_chipsets: - Snapdragon® 8 Gen 1 - Snapdragon® 8 Gen 2 + - Snapdragon® 8 Gen 3 - Snapdragon® 888 models: - name: FastSam-X performance_metrics: - torchscript_onnx_tflite: - inference_time: 64468.0 - throughput: 15.511571632437798 + inference_time: 64155.0 + throughput: 15.587249629802821 estimated_peak_memory_range: - min: 9224192 - max: 14449200 + min: 9207808 + max: 14058240 primary_compute_unit: NPU precision: fp16 layer_info: @@ -40,8 +43,16 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 420 - job_id: jz5wl3xzp + job_id: jw566k75o job_status: Passed + reference_device_info: + name: Samsung Galaxy S23 + os: '13' + form_factor: Phone + os_name: Android + manufacturer: Samsung + chipset: Snapdragon® 8 Gen 2 + timestamp: '2024-03-14T23:56:58.796143Z' torchscript_onnx_qnn: inference_time: 'null' throughput: 'null' @@ -57,11 +68,41 @@ models: total_layers: 0 job_id: '' job_status: Skipped + - torchscript_onnx_tflite: + inference_time: 47867.0 + throughput: 20.891219420477572 + estimated_peak_memory_range: + min: 7962624 + max: 152777152 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 420 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 420 + job_id: j1p3kyz52 + job_status: Passed reference_device_info: - name: Samsung Galaxy S23 Ultra - os: '13' + name: Samsung Galaxy S24 + os: '14' form_factor: Phone os_name: Android manufacturer: Samsung - chipset: Snapdragon® 8 Gen 2 - timestamp: '2024-02-21T16:38:35.191434Z' + chipset: Snapdragon® 8 Gen 3 + timestamp: '2024-03-14T23:56:58.796153Z' + torchscript_onnx_qnn: + inference_time: 'null' + throughput: 'null' + estimated_peak_memory_range: + min: 0 + max: 0 + primary_compute_unit: 'null' + precision: 'null' + layer_info: + layers_on_npu: 0 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 0 + job_id: '' + job_status: Skipped diff --git a/qai_hub_models/models/fastsam_x/requirements.txt b/qai_hub_models/models/fastsam_x/requirements.txt index 8d55bfa4..94980b0d 100644 --- a/qai_hub_models/models/fastsam_x/requirements.txt +++ b/qai_hub_models/models/fastsam_x/requirements.txt @@ -1,2 +1,3 @@ +seaborn==0.11.0 +thop==0.1.1.post2209072238 ultralytics==8.0.193 -torchvision diff --git a/qai_hub_models/models/fcn_resnet50/README.md b/qai_hub_models/models/fcn_resnet50/README.md index c78904fe..275c82c9 100644 --- a/qai_hub_models/models/fcn_resnet50/README.md +++ b/qai_hub_models/models/fcn_resnet50/README.md @@ -10,7 +10,7 @@ This is based on the implementation of FCN_ResNet50 found export suitable to run on Qualcomm® devices. More details on model performance accross various devices, can be found [here](https://aihub.qualcomm.com/models/fcn_resnet50). -[Sign up](https://aihub.qualcomm.com/) for early access to run these models on +[Sign up](https://myaccount.qualcomm.com/signup) for early access to run these models on a hosted Qualcomm® device. 
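A note on reading the regenerated `perf.yaml` blocks in this patch: the figures are consistent with `inference_time` being reported in microseconds, with `throughput` simply its reciprocal in inferences per second. Checking the FastSam-X Galaxy S24 entry above:

```python
# inference_time is in microseconds; throughput is inferences per second.
inference_time_us = 47867.0
throughput = 1_000_000 / inference_time_us
print(throughput)  # 20.891219420477572, matching the perf.yaml entry above
```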
@@ -25,7 +25,7 @@ python -m qai_hub_models.models.fcn_resnet50.demo More details on the CLI tool can be found with the `--help` option. See [demo.py](demo.py) for sample usage of the model including pre/post processing scripts. Please refer to our [general instructions on using -models](../../#qai-hub-models) for more usage instructions. +models](../../../#getting-started) for more usage instructions. ## Export for on-device deployment @@ -41,7 +41,7 @@ script requires access to Deployment instructions for Qualcomm® AI Hub. ## License - The license for the original implementation of FCN_ResNet50 can be found [here](https://github.com/pytorch/vision/blob/main/LICENSE). -- The license for the compiled assets for on-device deployment can be found [here](https://qaihub-public-assets.s3.us-west-2.amazonaws.com/qai-hub-models/Qualcomm+AI+Hub+Proprietary+License.pdf). +- The license for the compiled assets for on-device deployment can be found [here]({deploy_license_url}) ## References * [Fully Convolutional Networks for Semantic Segmentation](https://arxiv.org/abs/1411.4038) diff --git a/qai_hub_models/models/fcn_resnet50/conftest.py b/qai_hub_models/models/fcn_resnet50/conftest.py new file mode 100644 index 00000000..21f265c6 --- /dev/null +++ b/qai_hub_models/models/fcn_resnet50/conftest.py @@ -0,0 +1,26 @@ +# --------------------------------------------------------------------- +# Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. +# SPDX-License-Identifier: BSD-3-Clause +# --------------------------------------------------------------------- +# THIS FILE WAS AUTO-GENERATED. DO NOT EDIT MANUALLY. + +from unittest.mock import patch + +import pytest + +from qai_hub_models.models.fcn_resnet50 import Model +from qai_hub_models.utils.testing import skip_clone_repo_check + + +@pytest.fixture(autouse=True) +@skip_clone_repo_check +def mock_from_pretrained(): + """ + Model.from_pretrained() can be slow. Invoke it once and cache it so all invocations + across all tests return the cached instance of the model. + """ + mock = patch( + "qai_hub_models.models.fcn_resnet50.Model.from_pretrained", + return_value=Model.from_pretrained(), + ) + mock.start() diff --git a/qai_hub_models/models/fcn_resnet50/demo.py b/qai_hub_models/models/fcn_resnet50/demo.py index 33478943..6c12063c 100644 --- a/qai_hub_models/models/fcn_resnet50/demo.py +++ b/qai_hub_models/models/fcn_resnet50/demo.py @@ -38,8 +38,8 @@ def main(is_test: bool = False): ) args = parser.parse_args([] if is_test else None) - validate_on_device_demo_args(args, FCN_ResNet50.get_model_id()) - model = demo_model_from_cli_args(FCN_ResNet50, args) + validate_on_device_demo_args(args, MODEL_ID) + model = demo_model_from_cli_args(FCN_ResNet50, MODEL_ID, args) # This FCN ResNet 50 demo comes from # https://pytorch.org/hub/pytorch_vision_fcn_resnet101/ diff --git a/qai_hub_models/models/fcn_resnet50/export.py b/qai_hub_models/models/fcn_resnet50/export.py index 63a60381..3a5dd85a 100644 --- a/qai_hub_models/models/fcn_resnet50/export.py +++ b/qai_hub_models/models/fcn_resnet50/export.py @@ -10,7 +10,7 @@ import os import warnings from pathlib import Path -from typing import List, Optional, Tuple +from typing import Any, Dict, List, Optional, Tuple, cast import qai_hub as hub import torch @@ -110,7 +110,7 @@ def export_model( ) # Trace the model - source_model = torch.jit.trace(model, make_torch_inputs(input_spec)) + source_model = torch.jit.trace(model.to("cpu"), make_torch_inputs(input_spec)) # 2. 
Compile the model to an on-device asset model_compile_options = model.get_hub_compile_options( @@ -119,29 +119,37 @@ def export_model( + " --force_channel_last_input image" + " --force_channel_last_output output_0", ) - print(f"Optimizing model {model_name} to run on-device.") - compile_job = hub.submit_compile_job( + print(f"Optimizing model {model_name} to run on-device") + submitted_compile_job = hub.submit_compile_job( model=source_model, input_specs=input_spec, device=hub.Device(device), name=model_name, options=model_compile_options, ) + compile_job = cast(hub.client.CompileJob, submitted_compile_job) # 3. Profile the model asset on real devices - profile_job = None + profile_job: Optional[hub.client.ProfileJob] = None if not skip_profiling: + profile_options_all = model.get_hub_profile_options( + target_runtime, profile_options + ) print(f"Profiling model {model_name} on a hosted device.") - profile_job = hub.submit_profile_job( + submitted_profile_job = hub.submit_profile_job( model=compile_job.get_target_model(), device=hub.Device(device), name=model_name, - options=profile_options, + options=profile_options_all, ) + profile_job = cast(hub.client.ProfileJob, submitted_profile_job) # 4. Run inference on-device with sample inputs - inference_job = None + inference_job: Optional[hub.client.InferenceJob] = None if not skip_inferencing: + profile_options_all = model.get_hub_profile_options( + target_runtime, profile_options + ) print( f"Running inference for {model_name} on a hosted device with example inputs." ) @@ -150,37 +158,39 @@ def export_model( hub_inputs = transpose_channel_first_to_last( "image", sample_inputs, target_runtime ) - inference_job = hub.submit_inference_job( + submitted_inference_job = hub.submit_inference_job( model=compile_job.get_target_model(), inputs=hub_inputs, device=hub.Device(device), name=model_name, - options=profile_options, + options=profile_options_all, ) + inference_job = cast(hub.client.InferenceJob, submitted_inference_job) # 5. Download the model asset to a local file if not skip_downloading: os.makedirs(output_path, exist_ok=True) - target_model = compile_job.get_target_model() + target_model: hub.Model = compile_job.get_target_model() # type: ignore target_model.download(str(output_path / f"{model_name}.tflite")) # 6. 
Summarize the results from profiling and inference if not skip_summary and not skip_profiling: - assert profile_job.wait().success - profile_data = profile_job.download_profile() + assert profile_job is not None and profile_job.wait().success + profile_data: Dict[str, Any] = profile_job.download_profile() # type: ignore print_profile_metrics_from_job(profile_job, profile_data) if not skip_summary and not skip_inferencing: torch_out = torch_inference(model, sample_inputs) - assert inference_job.wait().success - inference_result = inference_job.download_output_data() + assert inference_job is not None and inference_job.wait().success + inference_result: hub.client.DatasetEntries = inference_job.download_output_data() # type: ignore # Convert outputs from channel last to channel first inference_result = transpose_channel_last_to_first( "output_0", inference_result, target_runtime ) print_inference_metrics(inference_job, inference_result, torch_out) - print_on_target_demo_cmd(compile_job, Path(__file__).parent.resolve(), device) + if not skip_summary: + print_on_target_demo_cmd(compile_job, Path(__file__).parent.resolve(), device) return (compile_job, profile_job, inference_job) diff --git a/qai_hub_models/models/fcn_resnet50/info.yaml b/qai_hub_models/models/fcn_resnet50/info.yaml index 074affa6..87c73764 100644 --- a/qai_hub_models/models/fcn_resnet50/info.yaml +++ b/qai_hub_models/models/fcn_resnet50/info.yaml @@ -11,6 +11,7 @@ tags: [] research_paper: https://arxiv.org/abs/1411.4038 research_paper_title: Fully Convolutional Networks for Semantic Segmentation license: https://github.com/pytorch/vision/blob/main/LICENSE +deploy_license: https://qaihub-public-assets.s3.us-west-2.amazonaws.com/qai-hub-models/Qualcomm+AI+Hub+Proprietary+License.pdf source_repo: https://github.com/pytorch/vision/blob/main/torchvision/models/segmentation/fcn.py technical_details: @@ -33,4 +34,5 @@ form_factors: has_static_banner: yes has_animated_banner: no license_type: bsd-3-clause +deploy_license_type: AI Model Hub License dataset: [] diff --git a/qai_hub_models/models/fcn_resnet50/perf.yaml b/qai_hub_models/models/fcn_resnet50/perf.yaml index 7c674dea..477b5790 100644 --- a/qai_hub_models/models/fcn_resnet50/perf.yaml +++ b/qai_hub_models/models/fcn_resnet50/perf.yaml @@ -17,22 +17,25 @@ aggregated: - Samsung Galaxy S23 - Samsung Galaxy S23 Ultra - Samsung Galaxy S23+ + - Samsung Galaxy S24 + - Samsung Galaxy S24 Ultra - Samsung Galaxy Tab S8 - Xiaomi 12 - Xiaomi 12 Pro supported_chipsets: - Snapdragon® 8 Gen 1 - Snapdragon® 8 Gen 2 + - Snapdragon® 8 Gen 3 - Snapdragon® 888 models: - name: FCN_ResNet50 performance_metrics: - torchscript_onnx_tflite: - inference_time: 8563.0 - throughput: 116.78150181011328 + inference_time: 8550.0 + throughput: 116.95906432748538 estimated_peak_memory_range: min: 4263936 - max: 11057224 + max: 6443424 primary_compute_unit: NPU precision: fp16 layer_info: @@ -40,14 +43,22 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 84 - job_id: joprl21vp + job_id: jn5q8dm57 job_status: Passed + reference_device_info: + name: Samsung Galaxy S23 + os: '13' + form_factor: Phone + os_name: Android + manufacturer: Samsung + chipset: Snapdragon® 8 Gen 2 + timestamp: '2024-03-14T23:32:21.594233Z' torchscript_onnx_qnn: - inference_time: 7864.0 - throughput: 127.1617497456765 + inference_time: 7881.0 + throughput: 126.8874508311128 estimated_peak_memory_range: min: 20480 - max: 13081680 + max: 13250472 primary_compute_unit: NPU precision: fp16 layer_info: @@ -55,13 +66,43 @@ models: 
layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 126 - job_id: jep2r93xg + job_id: jw566075o + job_status: Passed + - torchscript_onnx_tflite: + inference_time: 6407.0 + throughput: 156.07928827844546 + estimated_peak_memory_range: + min: 4251648 + max: 76376944 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 84 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 84 + job_id: j1glnqlpv job_status: Passed reference_device_info: - name: Samsung Galaxy S23 Ultra - os: '13' + name: Samsung Galaxy S24 + os: '14' form_factor: Phone os_name: Android manufacturer: Samsung - chipset: Snapdragon® 8 Gen 2 - timestamp: '2024-02-21T16:21:53.248417Z' + chipset: Snapdragon® 8 Gen 3 + timestamp: '2024-03-14T23:34:20.486125Z' + torchscript_onnx_qnn: + inference_time: 5846.0 + throughput: 171.05713308244952 + estimated_peak_memory_range: + min: 638976 + max: 55934880 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 126 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 126 + job_id: j1p3krz52 + job_status: Passed diff --git a/qai_hub_models/models/fcn_resnet50/test.py b/qai_hub_models/models/fcn_resnet50/test.py index 7c30198c..4ff0b2ab 100644 --- a/qai_hub_models/models/fcn_resnet50/test.py +++ b/qai_hub_models/models/fcn_resnet50/test.py @@ -3,6 +3,7 @@ # SPDX-License-Identifier: BSD-3-Clause # --------------------------------------------------------------------- import numpy as np +import pytest from qai_hub_models.models.fcn_resnet50.app import FCN_ResNet50App from qai_hub_models.models.fcn_resnet50.demo import INPUT_IMAGE_ADDRESS @@ -39,10 +40,12 @@ def test_task(): _test_impl(FCN_ResNet50App(FCN_ResNet50.from_pretrained())) +@pytest.mark.trace @skip_clone_repo_check def test_trace(): _test_impl(FCN_ResNet50App(FCN_ResNet50.from_pretrained().convert_to_torchscript())) +@skip_clone_repo_check def test_demo(): demo_main(is_test=True) diff --git a/qai_hub_models/models/ffnet_122ns_lowres/README.md b/qai_hub_models/models/ffnet_122ns_lowres/README.md index b1f3e848..56834d32 100644 --- a/qai_hub_models/models/ffnet_122ns_lowres/README.md +++ b/qai_hub_models/models/ffnet_122ns_lowres/README.md @@ -10,7 +10,7 @@ This is based on the implementation of FFNet-122NS-LowRes found export suitable to run on Qualcomm® devices. More details on model performance accross various devices, can be found [here](https://aihub.qualcomm.com/models/ffnet_122ns_lowres). -[Sign up](https://aihub.qualcomm.com/) for early access to run these models on +[Sign up](https://myaccount.qualcomm.com/signup) for early access to run these models on a hosted Qualcomm® device. @@ -30,7 +30,7 @@ python -m qai_hub_models.models.ffnet_122ns_lowres.demo More details on the CLI tool can be found with the `--help` option. See [demo.py](demo.py) for sample usage of the model including pre/post processing scripts. Please refer to our [general instructions on using -models](../../#qai-hub-models) for more usage instructions. +models](../../../#getting-started) for more usage instructions. ## Export for on-device deployment @@ -46,7 +46,7 @@ script requires access to Deployment instructions for Qualcomm® AI Hub. ## License - The license for the original implementation of FFNet-122NS-LowRes can be found [here](https://github.com/Qualcomm-AI-research/FFNet/blob/master/LICENSE). -- The license for the compiled assets for on-device deployment can be found [here](https://qaihub-public-assets.s3.us-west-2.amazonaws.com/qai-hub-models/Qualcomm+AI+Hub+Proprietary+License.pdf). 
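On the `--force_channel_last_input image` / `--force_channel_last_output output_0` compile options and the paired `transpose_channel_first_to_last` / `transpose_channel_last_to_first` calls in the export scripts above: the compiled TFLite asset takes NHWC tensors while the PyTorch sample inputs are NCHW. The real helpers operate on named dataset entries; this only sketches the axis shuffle they imply:

```python
import numpy as np

# PyTorch-style sample input: (batch, channels, height, width).
nchw = np.random.rand(1, 3, 512, 512).astype(np.float32)

# What transpose_channel_first_to_last amounts to for a 4-D image tensor.
nhwc = nchw.transpose(0, 2, 3, 1)
assert nhwc.shape == (1, 512, 512, 3)

# And the inverse, applied to model outputs before comparing with torch results.
restored = nhwc.transpose(0, 3, 1, 2)
assert restored.shape == nchw.shape
```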
+- The license for the compiled assets for on-device deployment can be found [here]({deploy_license_url}) ## References * [Simple and Efficient Architectures for Semantic Segmentation](https://arxiv.org/abs/2206.08236) diff --git a/qai_hub_models/models/ffnet_122ns_lowres/conftest.py b/qai_hub_models/models/ffnet_122ns_lowres/conftest.py new file mode 100644 index 00000000..d6573b6a --- /dev/null +++ b/qai_hub_models/models/ffnet_122ns_lowres/conftest.py @@ -0,0 +1,26 @@ +# --------------------------------------------------------------------- +# Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. +# SPDX-License-Identifier: BSD-3-Clause +# --------------------------------------------------------------------- +# THIS FILE WAS AUTO-GENERATED. DO NOT EDIT MANUALLY. + +from unittest.mock import patch + +import pytest + +from qai_hub_models.models.ffnet_122ns_lowres import Model +from qai_hub_models.utils.testing import skip_clone_repo_check + + +@pytest.fixture(autouse=True) +@skip_clone_repo_check +def mock_from_pretrained(): + """ + Model.from_pretrained() can be slow. Invoke it once and cache it so all invocations + across all tests return the cached instance of the model. + """ + mock = patch( + "qai_hub_models.models.ffnet_122ns_lowres.Model.from_pretrained", + return_value=Model.from_pretrained(), + ) + mock.start() diff --git a/qai_hub_models/models/ffnet_122ns_lowres/export.py b/qai_hub_models/models/ffnet_122ns_lowres/export.py index 3125277a..05689b50 100644 --- a/qai_hub_models/models/ffnet_122ns_lowres/export.py +++ b/qai_hub_models/models/ffnet_122ns_lowres/export.py @@ -10,7 +10,7 @@ import os import warnings from pathlib import Path -from typing import List, Optional, Tuple +from typing import Any, Dict, List, Optional, Tuple, cast import qai_hub as hub import torch @@ -109,7 +109,7 @@ def export_model( ) # Trace the model - source_model = torch.jit.trace(model, make_torch_inputs(input_spec)) + source_model = torch.jit.trace(model.to("cpu"), make_torch_inputs(input_spec)) # 2. Compile the model to an on-device asset model_compile_options = model.get_hub_compile_options( @@ -118,29 +118,37 @@ def export_model( + " --force_channel_last_input image" + " --force_channel_last_output output_0", ) - print(f"Optimizing model {model_name} to run on-device.") - compile_job = hub.submit_compile_job( + print(f"Optimizing model {model_name} to run on-device") + submitted_compile_job = hub.submit_compile_job( model=source_model, input_specs=input_spec, device=hub.Device(device), name=model_name, options=model_compile_options, ) + compile_job = cast(hub.client.CompileJob, submitted_compile_job) # 3. Profile the model asset on real devices - profile_job = None + profile_job: Optional[hub.client.ProfileJob] = None if not skip_profiling: + profile_options_all = model.get_hub_profile_options( + target_runtime, profile_options + ) print(f"Profiling model {model_name} on a hosted device.") - profile_job = hub.submit_profile_job( + submitted_profile_job = hub.submit_profile_job( model=compile_job.get_target_model(), device=hub.Device(device), name=model_name, - options=profile_options, + options=profile_options_all, ) + profile_job = cast(hub.client.ProfileJob, submitted_profile_job) # 4. 
Run inference on-device with sample inputs - inference_job = None + inference_job: Optional[hub.client.InferenceJob] = None if not skip_inferencing: + profile_options_all = model.get_hub_profile_options( + target_runtime, profile_options + ) print( f"Running inference for {model_name} on a hosted device with example inputs." ) @@ -149,30 +157,31 @@ def export_model( hub_inputs = transpose_channel_first_to_last( "image", sample_inputs, target_runtime ) - inference_job = hub.submit_inference_job( + submitted_inference_job = hub.submit_inference_job( model=compile_job.get_target_model(), inputs=hub_inputs, device=hub.Device(device), name=model_name, - options=profile_options, + options=profile_options_all, ) + inference_job = cast(hub.client.InferenceJob, submitted_inference_job) # 5. Download the model asset to a local file if not skip_downloading: os.makedirs(output_path, exist_ok=True) - target_model = compile_job.get_target_model() + target_model: hub.Model = compile_job.get_target_model() # type: ignore target_model.download(str(output_path / f"{model_name}.tflite")) # 6. Summarize the results from profiling and inference if not skip_summary and not skip_profiling: - assert profile_job.wait().success - profile_data = profile_job.download_profile() + assert profile_job is not None and profile_job.wait().success + profile_data: Dict[str, Any] = profile_job.download_profile() # type: ignore print_profile_metrics_from_job(profile_job, profile_data) if not skip_summary and not skip_inferencing: torch_out = torch_inference(model, sample_inputs) - assert inference_job.wait().success - inference_result = inference_job.download_output_data() + assert inference_job is not None and inference_job.wait().success + inference_result: hub.client.DatasetEntries = inference_job.download_output_data() # type: ignore # Convert outputs from channel last to channel first inference_result = transpose_channel_last_to_first( "output_0", inference_result, target_runtime diff --git a/qai_hub_models/models/ffnet_122ns_lowres/info.yaml b/qai_hub_models/models/ffnet_122ns_lowres/info.yaml index 10198c19..a40f6ccd 100644 --- a/qai_hub_models/models/ffnet_122ns_lowres/info.yaml +++ b/qai_hub_models/models/ffnet_122ns_lowres/info.yaml @@ -12,6 +12,7 @@ tags: [] research_paper: https://arxiv.org/abs/2206.08236 research_paper_title: Simple and Efficient Architectures for Semantic Segmentation license: https://github.com/Qualcomm-AI-research/FFNet/blob/master/LICENSE +deploy_license: https://qaihub-public-assets.s3.us-west-2.amazonaws.com/qai-hub-models/Qualcomm+AI+Hub+Proprietary+License.pdf source_repo: https://github.com/Qualcomm-AI-research/FFNet technical_details: Model checkpoint: ffnet122NS_CCC_cityscapes_state_dict_quarts_pre_down @@ -30,7 +31,8 @@ form_factors: - Phone - Tablet has_static_banner: yes -has_animated_banner: yes +has_animated_banner: no license_type: bsd-3-clause +deploy_license_type: AI Model Hub License dataset: - cityscapes diff --git a/qai_hub_models/models/ffnet_122ns_lowres/perf.yaml b/qai_hub_models/models/ffnet_122ns_lowres/perf.yaml index f41f23a1..37ffb9dc 100644 --- a/qai_hub_models/models/ffnet_122ns_lowres/perf.yaml +++ b/qai_hub_models/models/ffnet_122ns_lowres/perf.yaml @@ -17,22 +17,25 @@ aggregated: - Samsung Galaxy S23 - Samsung Galaxy S23 Ultra - Samsung Galaxy S23+ + - Samsung Galaxy S24 + - Samsung Galaxy S24 Ultra - Samsung Galaxy Tab S8 - Xiaomi 12 - Xiaomi 12 Pro supported_chipsets: - Snapdragon® 8 Gen 1 - Snapdragon® 8 Gen 2 + - Snapdragon® 8 Gen 3 - Snapdragon® 888 models: - 
name: FFNet-122NS-LowRes performance_metrics: - torchscript_onnx_tflite: - inference_time: 10460.0 - throughput: 95.60229445506693 + inference_time: 10407.0 + throughput: 96.08917075045642 estimated_peak_memory_range: - min: 643072 - max: 2912400 + min: 12288 + max: 2345904 primary_compute_unit: NPU precision: fp16 layer_info: @@ -40,14 +43,22 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 216 - job_id: jqpyojnr5 + job_id: jmg9vel57 job_status: Passed + reference_device_info: + name: Samsung Galaxy S23 + os: '13' + form_factor: Phone + os_name: Android + manufacturer: Samsung + chipset: Snapdragon® 8 Gen 2 + timestamp: '2024-03-14T23:45:44.022843Z' torchscript_onnx_qnn: - inference_time: 10778.0 - throughput: 92.78159213212099 + inference_time: 10785.0 + throughput: 92.7213722763097 estimated_peak_memory_range: - min: 6332416 - max: 39442976 + min: 6205440 + max: 39312144 primary_compute_unit: NPU precision: fp16 layer_info: @@ -55,13 +66,43 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 349 - job_id: j2p0m2k2g + job_id: jvgdwle5j + job_status: Passed + - torchscript_onnx_tflite: + inference_time: 7373.0 + throughput: 135.63000135630003 + estimated_peak_memory_range: + min: 643072 + max: 58158976 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 216 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 216 + job_id: jnp10x25q job_status: Passed reference_device_info: - name: Samsung Galaxy S23 Ultra - os: '13' + name: Samsung Galaxy S24 + os: '14' form_factor: Phone os_name: Android manufacturer: Samsung - chipset: Snapdragon® 8 Gen 2 - timestamp: '2024-02-21T16:22:01.714758Z' + chipset: Snapdragon® 8 Gen 3 + timestamp: '2024-03-14T23:47:44.631260Z' + torchscript_onnx_qnn: + inference_time: 7627.0 + throughput: 131.1131506490101 + estimated_peak_memory_range: + min: 6311936 + max: 85982464 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 349 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 349 + job_id: jz57z3lp3 + job_status: Passed diff --git a/qai_hub_models/models/ffnet_122ns_lowres/requirements.txt b/qai_hub_models/models/ffnet_122ns_lowres/requirements.txt index 73ad8aa8..2470ac6c 100644 --- a/qai_hub_models/models/ffnet_122ns_lowres/requirements.txt +++ b/qai_hub_models/models/ffnet_122ns_lowres/requirements.txt @@ -1 +1 @@ -scikit-image>=0.21.0 +scikit-image==0.21.0 diff --git a/qai_hub_models/models/ffnet_40s/README.md b/qai_hub_models/models/ffnet_40s/README.md index ce9169df..4a670a0a 100644 --- a/qai_hub_models/models/ffnet_40s/README.md +++ b/qai_hub_models/models/ffnet_40s/README.md @@ -10,7 +10,7 @@ This is based on the implementation of FFNet-40S found export suitable to run on Qualcomm® devices. More details on model performance accross various devices, can be found [here](https://aihub.qualcomm.com/models/ffnet_40s). -[Sign up](https://aihub.qualcomm.com/) for early access to run these models on +[Sign up](https://myaccount.qualcomm.com/signup) for early access to run these models on a hosted Qualcomm® device. @@ -30,7 +30,7 @@ python -m qai_hub_models.models.ffnet_40s.demo More details on the CLI tool can be found with the `--help` option. See [demo.py](demo.py) for sample usage of the model including pre/post processing scripts. Please refer to our [general instructions on using -models](../../#qai-hub-models) for more usage instructions. +models](../../../#getting-started) for more usage instructions. 
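Throughout these READMEs the hard-coded deployment-license URL is replaced with a literal `{deploy_license_url}` placeholder while each `info.yaml` gains a `deploy_license` field, which suggests the license bullet is filled in from the model metadata at README generation time. A hedged sketch of that substitution (the template string and field name mirror the patch; the rendering step itself is an assumption):

```python
# Assumed rendering step: fill the README placeholder from the model's info.yaml.
deploy_license_line = (
    "- The license for the compiled assets for on-device deployment can be "
    "found [here]({deploy_license_url})"
)
info_yaml = {
    "deploy_license": "https://qaihub-public-assets.s3.us-west-2.amazonaws.com/"
    "qai-hub-models/Qualcomm+AI+Hub+Proprietary+License.pdf"
}
print(deploy_license_line.format(deploy_license_url=info_yaml["deploy_license"]))
```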
## Export for on-device deployment @@ -46,7 +46,7 @@ script requires access to Deployment instructions for Qualcomm® AI Hub. ## License - The license for the original implementation of FFNet-40S can be found [here](https://github.com/Qualcomm-AI-research/FFNet/blob/master/LICENSE). -- The license for the compiled assets for on-device deployment can be found [here](https://qaihub-public-assets.s3.us-west-2.amazonaws.com/qai-hub-models/Qualcomm+AI+Hub+Proprietary+License.pdf). +- The license for the compiled assets for on-device deployment can be found [here]({deploy_license_url}) ## References * [Simple and Efficient Architectures for Semantic Segmentation](https://arxiv.org/abs/2206.08236) diff --git a/qai_hub_models/models/ffnet_40s/conftest.py b/qai_hub_models/models/ffnet_40s/conftest.py new file mode 100644 index 00000000..61ded012 --- /dev/null +++ b/qai_hub_models/models/ffnet_40s/conftest.py @@ -0,0 +1,26 @@ +# --------------------------------------------------------------------- +# Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. +# SPDX-License-Identifier: BSD-3-Clause +# --------------------------------------------------------------------- +# THIS FILE WAS AUTO-GENERATED. DO NOT EDIT MANUALLY. + +from unittest.mock import patch + +import pytest + +from qai_hub_models.models.ffnet_40s import Model +from qai_hub_models.utils.testing import skip_clone_repo_check + + +@pytest.fixture(autouse=True) +@skip_clone_repo_check +def mock_from_pretrained(): + """ + Model.from_pretrained() can be slow. Invoke it once and cache it so all invocations + across all tests return the cached instance of the model. + """ + mock = patch( + "qai_hub_models.models.ffnet_40s.Model.from_pretrained", + return_value=Model.from_pretrained(), + ) + mock.start() diff --git a/qai_hub_models/models/ffnet_40s/export.py b/qai_hub_models/models/ffnet_40s/export.py index 0800da7f..dac6aea6 100644 --- a/qai_hub_models/models/ffnet_40s/export.py +++ b/qai_hub_models/models/ffnet_40s/export.py @@ -10,7 +10,7 @@ import os import warnings from pathlib import Path -from typing import List, Optional, Tuple +from typing import Any, Dict, List, Optional, Tuple, cast import qai_hub as hub import torch @@ -109,7 +109,7 @@ def export_model( ) # Trace the model - source_model = torch.jit.trace(model, make_torch_inputs(input_spec)) + source_model = torch.jit.trace(model.to("cpu"), make_torch_inputs(input_spec)) # 2. Compile the model to an on-device asset model_compile_options = model.get_hub_compile_options( @@ -118,29 +118,37 @@ def export_model( + " --force_channel_last_input image" + " --force_channel_last_output output_0", ) - print(f"Optimizing model {model_name} to run on-device.") - compile_job = hub.submit_compile_job( + print(f"Optimizing model {model_name} to run on-device") + submitted_compile_job = hub.submit_compile_job( model=source_model, input_specs=input_spec, device=hub.Device(device), name=model_name, options=model_compile_options, ) + compile_job = cast(hub.client.CompileJob, submitted_compile_job) # 3. 
Profile the model asset on real devices - profile_job = None + profile_job: Optional[hub.client.ProfileJob] = None if not skip_profiling: + profile_options_all = model.get_hub_profile_options( + target_runtime, profile_options + ) print(f"Profiling model {model_name} on a hosted device.") - profile_job = hub.submit_profile_job( + submitted_profile_job = hub.submit_profile_job( model=compile_job.get_target_model(), device=hub.Device(device), name=model_name, - options=profile_options, + options=profile_options_all, ) + profile_job = cast(hub.client.ProfileJob, submitted_profile_job) # 4. Run inference on-device with sample inputs - inference_job = None + inference_job: Optional[hub.client.InferenceJob] = None if not skip_inferencing: + profile_options_all = model.get_hub_profile_options( + target_runtime, profile_options + ) print( f"Running inference for {model_name} on a hosted device with example inputs." ) @@ -149,30 +157,31 @@ def export_model( hub_inputs = transpose_channel_first_to_last( "image", sample_inputs, target_runtime ) - inference_job = hub.submit_inference_job( + submitted_inference_job = hub.submit_inference_job( model=compile_job.get_target_model(), inputs=hub_inputs, device=hub.Device(device), name=model_name, - options=profile_options, + options=profile_options_all, ) + inference_job = cast(hub.client.InferenceJob, submitted_inference_job) # 5. Download the model asset to a local file if not skip_downloading: os.makedirs(output_path, exist_ok=True) - target_model = compile_job.get_target_model() + target_model: hub.Model = compile_job.get_target_model() # type: ignore target_model.download(str(output_path / f"{model_name}.tflite")) # 6. Summarize the results from profiling and inference if not skip_summary and not skip_profiling: - assert profile_job.wait().success - profile_data = profile_job.download_profile() + assert profile_job is not None and profile_job.wait().success + profile_data: Dict[str, Any] = profile_job.download_profile() # type: ignore print_profile_metrics_from_job(profile_job, profile_data) if not skip_summary and not skip_inferencing: torch_out = torch_inference(model, sample_inputs) - assert inference_job.wait().success - inference_result = inference_job.download_output_data() + assert inference_job is not None and inference_job.wait().success + inference_result: hub.client.DatasetEntries = inference_job.download_output_data() # type: ignore # Convert outputs from channel last to channel first inference_result = transpose_channel_last_to_first( "output_0", inference_result, target_runtime diff --git a/qai_hub_models/models/ffnet_40s/info.yaml b/qai_hub_models/models/ffnet_40s/info.yaml index 94830c01..967b8b32 100644 --- a/qai_hub_models/models/ffnet_40s/info.yaml +++ b/qai_hub_models/models/ffnet_40s/info.yaml @@ -13,6 +13,7 @@ tags: research_paper: https://arxiv.org/abs/2206.08236 research_paper_title: Simple and Efficient Architectures for Semantic Segmentation license: https://github.com/Qualcomm-AI-research/FFNet/blob/master/LICENSE +deploy_license: https://qaihub-public-assets.s3.us-west-2.amazonaws.com/qai-hub-models/Qualcomm+AI+Hub+Proprietary+License.pdf source_repo: https://github.com/Qualcomm-AI-research/FFNet technical_details: Model checkpoint: ffnet40S_dBBB_cityscapes_state_dict_quarts @@ -31,7 +32,8 @@ form_factors: - Phone - Tablet has_static_banner: yes -has_animated_banner: yes +has_animated_banner: no license_type: bsd-3-clause +deploy_license_type: AI Model Hub License dataset: - cityscapes diff --git 
a/qai_hub_models/models/ffnet_40s/perf.yaml b/qai_hub_models/models/ffnet_40s/perf.yaml index cd79f677..038d58da 100644 --- a/qai_hub_models/models/ffnet_40s/perf.yaml +++ b/qai_hub_models/models/ffnet_40s/perf.yaml @@ -17,22 +17,25 @@ aggregated: - Samsung Galaxy S23 - Samsung Galaxy S23 Ultra - Samsung Galaxy S23+ + - Samsung Galaxy S24 + - Samsung Galaxy S24 Ultra - Samsung Galaxy Tab S8 - Xiaomi 12 - Xiaomi 12 Pro supported_chipsets: - Snapdragon® 8 Gen 1 - Snapdragon® 8 Gen 2 + - Snapdragon® 8 Gen 3 - Snapdragon® 888 models: - name: FFNet-40S performance_metrics: - torchscript_onnx_tflite: - inference_time: 22739.0 - throughput: 43.97730770922204 + inference_time: 22513.0 + throughput: 44.41878026029405 estimated_peak_memory_range: - min: 2564096 - max: 5001048 + min: 2539520 + max: 5190832 primary_compute_unit: NPU precision: fp16 layer_info: @@ -40,14 +43,22 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 92 - job_id: jegnzm9vg + job_id: jwgoyl458 job_status: Passed + reference_device_info: + name: Samsung Galaxy S23 + os: '13' + form_factor: Phone + os_name: Android + manufacturer: Samsung + chipset: Snapdragon® 8 Gen 2 + timestamp: '2024-03-14T23:12:39.279085Z' torchscript_onnx_qnn: - inference_time: 17313.0 - throughput: 57.760064691272454 + inference_time: 17466.0 + throughput: 57.25409366769724 estimated_peak_memory_range: - min: 25202688 - max: 51306904 + min: 25210880 + max: 48310168 primary_compute_unit: NPU precision: fp16 layer_info: @@ -55,13 +66,43 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 141 - job_id: jep2r97xg + job_id: j7gjxr7pd + job_status: Passed + - torchscript_onnx_tflite: + inference_time: 16613.0 + throughput: 60.19382411364594 + estimated_peak_memory_range: + min: 61440 + max: 100488656 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 92 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 92 + job_id: j1pv3l75x job_status: Passed reference_device_info: - name: Samsung Galaxy S23 Ultra - os: '13' + name: Samsung Galaxy S24 + os: '14' form_factor: Phone os_name: Android manufacturer: Samsung - chipset: Snapdragon® 8 Gen 2 - timestamp: '2024-02-21T16:14:26.648274Z' + chipset: Snapdragon® 8 Gen 3 + timestamp: '2024-03-14T23:15:49.686166Z' + torchscript_onnx_qnn: + inference_time: 12681.0 + throughput: 78.85813421654444 + estimated_peak_memory_range: + min: 25182208 + max: 82551136 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 141 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 141 + job_id: jlpe977gr + job_status: Passed diff --git a/qai_hub_models/models/ffnet_40s/requirements.txt b/qai_hub_models/models/ffnet_40s/requirements.txt index 73ad8aa8..2470ac6c 100644 --- a/qai_hub_models/models/ffnet_40s/requirements.txt +++ b/qai_hub_models/models/ffnet_40s/requirements.txt @@ -1 +1 @@ -scikit-image>=0.21.0 +scikit-image==0.21.0 diff --git a/qai_hub_models/models/ffnet_40s_quantized/README.md b/qai_hub_models/models/ffnet_40s_quantized/README.md index 8508542a..18fb3a95 100644 --- a/qai_hub_models/models/ffnet_40s_quantized/README.md +++ b/qai_hub_models/models/ffnet_40s_quantized/README.md @@ -10,7 +10,7 @@ This is based on the implementation of FFNet-40S-Quantized found export suitable to run on Qualcomm® devices. More details on model performance accross various devices, can be found [here](https://aihub.qualcomm.com/models/ffnet_40s_quantized). 
-[Sign up](https://aihub.qualcomm.com/) for early access to run these models on +[Sign up](https://myaccount.qualcomm.com/signup) for early access to run these models on a hosted Qualcomm® device. @@ -25,7 +25,7 @@ python -m qai_hub_models.models.ffnet_40s_quantized.demo More details on the CLI tool can be found with the `--help` option. See [demo.py](demo.py) for sample usage of the model including pre/post processing scripts. Please refer to our [general instructions on using -models](../../#qai-hub-models) for more usage instructions. +models](../../../#getting-started) for more usage instructions. ## Export for on-device deployment @@ -41,7 +41,7 @@ script requires access to Deployment instructions for Qualcomm® AI Hub. ## License - The license for the original implementation of FFNet-40S-Quantized can be found [here](https://github.com/Qualcomm-AI-research/FFNet/blob/master/LICENSE). -- The license for the compiled assets for on-device deployment can be found [here](https://qaihub-public-assets.s3.us-west-2.amazonaws.com/qai-hub-models/Qualcomm+AI+Hub+Proprietary+License.pdf). +- The license for the compiled assets for on-device deployment can be found [here]({deploy_license_url}) ## References * [Simple and Efficient Architectures for Semantic Segmentation](https://arxiv.org/abs/2206.08236) diff --git a/qai_hub_models/models/ffnet_40s_quantized/conftest.py b/qai_hub_models/models/ffnet_40s_quantized/conftest.py new file mode 100644 index 00000000..f17ac459 --- /dev/null +++ b/qai_hub_models/models/ffnet_40s_quantized/conftest.py @@ -0,0 +1,26 @@ +# --------------------------------------------------------------------- +# Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. +# SPDX-License-Identifier: BSD-3-Clause +# --------------------------------------------------------------------- +# THIS FILE WAS AUTO-GENERATED. DO NOT EDIT MANUALLY. + +from unittest.mock import patch + +import pytest + +from qai_hub_models.models.ffnet_40s_quantized import Model +from qai_hub_models.utils.testing import skip_clone_repo_check + + +@pytest.fixture(autouse=True) +@skip_clone_repo_check +def mock_from_pretrained(): + """ + Model.from_pretrained() can be slow. Invoke it once and cache it so all invocations + across all tests return the cached instance of the model. + """ + mock = patch( + "qai_hub_models.models.ffnet_40s_quantized.Model.from_pretrained", + return_value=Model.from_pretrained(), + ) + mock.start() diff --git a/qai_hub_models/models/ffnet_40s_quantized/export.py b/qai_hub_models/models/ffnet_40s_quantized/export.py index a12d147c..08ed7624 100644 --- a/qai_hub_models/models/ffnet_40s_quantized/export.py +++ b/qai_hub_models/models/ffnet_40s_quantized/export.py @@ -10,7 +10,7 @@ import os import warnings from pathlib import Path -from typing import List, Optional, Tuple +from typing import Any, Dict, List, Optional, Tuple, cast import qai_hub as hub @@ -123,8 +123,8 @@ def export_model( + " --force_channel_last_input image" + " --force_channel_last_output output_0", ) - print(f"Optimizing model {model_name} to run on-device.") - compile_job = hub.submit_compile_job( + print(f"Optimizing model {model_name} to run on-device") + submitted_compile_job = hub.submit_compile_job( model=source_model, input_specs=input_spec, device=hub.Device(device), @@ -132,21 +132,29 @@ def export_model( calibration_data=quant_calibration_data, options=model_compile_options, ) + compile_job = cast(hub.client.CompileJob, submitted_compile_job) # 3. 
Profile the model asset on real devices - profile_job = None + profile_job: Optional[hub.client.ProfileJob] = None if not skip_profiling: + profile_options_all = model.get_hub_profile_options( + target_runtime, profile_options + ) print(f"Profiling model {model_name} on a hosted device.") - profile_job = hub.submit_profile_job( + submitted_profile_job = hub.submit_profile_job( model=compile_job.get_target_model(), device=hub.Device(device), name=model_name, - options=profile_options, + options=profile_options_all, ) + profile_job = cast(hub.client.ProfileJob, submitted_profile_job) # 4. Run inference on-device with sample inputs - inference_job = None + inference_job: Optional[hub.client.InferenceJob] = None if not skip_inferencing: + profile_options_all = model.get_hub_profile_options( + target_runtime, profile_options + ) print( f"Running inference for {model_name} on a hosted device with example inputs." ) @@ -158,30 +166,31 @@ def export_model( hub_inputs = transpose_channel_first_to_last( "image", sample_inputs, target_runtime ) - inference_job = hub.submit_inference_job( + submitted_inference_job = hub.submit_inference_job( model=compile_job.get_target_model(), inputs=hub_inputs, device=hub.Device(device), name=model_name, - options=profile_options, + options=profile_options_all, ) + inference_job = cast(hub.client.InferenceJob, submitted_inference_job) # 5. Download the model asset to a local file if not skip_downloading: os.makedirs(output_path, exist_ok=True) - target_model = compile_job.get_target_model() + target_model: hub.Model = compile_job.get_target_model() # type: ignore target_model.download(str(output_path / f"{model_name}.tflite")) # 6. Summarize the results from profiling and inference if not skip_summary and not skip_profiling: - assert profile_job.wait().success - profile_data = profile_job.download_profile() + assert profile_job is not None and profile_job.wait().success + profile_data: Dict[str, Any] = profile_job.download_profile() # type: ignore print_profile_metrics_from_job(profile_job, profile_data) if not skip_summary and not skip_inferencing: torch_out = torch_inference(model, sample_inputs) - assert inference_job.wait().success - inference_result = inference_job.download_output_data() + assert inference_job is not None and inference_job.wait().success + inference_result: hub.client.DatasetEntries = inference_job.download_output_data() # type: ignore # Convert outputs from channel last to channel first inference_result = transpose_channel_last_to_first( "output_0", inference_result, target_runtime diff --git a/qai_hub_models/models/ffnet_40s_quantized/info.yaml b/qai_hub_models/models/ffnet_40s_quantized/info.yaml index bf5bfbe3..163abd5d 100644 --- a/qai_hub_models/models/ffnet_40s_quantized/info.yaml +++ b/qai_hub_models/models/ffnet_40s_quantized/info.yaml @@ -14,6 +14,7 @@ tags: research_paper: https://arxiv.org/abs/2206.08236 research_paper_title: Simple and Efficient Architectures for Semantic Segmentation license: https://github.com/Qualcomm-AI-research/FFNet/blob/master/LICENSE +deploy_license: https://qaihub-public-assets.s3.us-west-2.amazonaws.com/qai-hub-models/Qualcomm+AI+Hub+Proprietary+License.pdf source_repo: https://github.com/Qualcomm-AI-research/FFNet technical_details: Model checkpoint: ffnet40S_dBBB_cityscapes_state_dict_quarts @@ -32,7 +33,8 @@ form_factors: - Phone - Tablet has_static_banner: yes -has_animated_banner: yes +has_animated_banner: no license_type: bsd-3-clause +deploy_license_type: AI Model Hub License dataset: - cityscapes 
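The export.py hunks above repeat one pattern for every FFNet variant: each `submit_*` call is captured in a `submitted_*` variable and cast to its concrete `hub.client.*Job` type, optional jobs are declared `Optional[...]`, and the summary step asserts the job is not None before calling `wait()` and downloading results. The sketch below isolates that pattern on its own, assuming the `qai_hub` client API exactly as it is used in these hunks; the wrapper function `compile_and_profile` and its parameters are illustrative and are not part of the repository.

```python
# Minimal sketch of the typed job-handling pattern applied in the export.py
# diffs above. Requires a configured `qai_hub` client; the wrapper itself is
# illustrative, not qai_hub_models code.
from typing import Any, Dict, Optional, cast

import qai_hub as hub


def compile_and_profile(
    source_model: Any,
    input_specs: Dict[str, Any],
    device: str,
    name: str,
    compile_options: str = "",
    profile_options: str = "",
    skip_profiling: bool = False,
) -> None:
    # 1. Compile: cast the submitted job so later code sees a CompileJob.
    submitted_compile_job = hub.submit_compile_job(
        model=source_model,
        input_specs=input_specs,
        device=hub.Device(device),
        name=name,
        options=compile_options,
    )
    compile_job = cast(hub.client.CompileJob, submitted_compile_job)

    # 2. Profile (optional): keep the variable Optional and only populate it
    #    when profiling is requested.
    profile_job: Optional[hub.client.ProfileJob] = None
    if not skip_profiling:
        submitted_profile_job = hub.submit_profile_job(
            model=compile_job.get_target_model(),
            device=hub.Device(device),
            name=name,
            options=profile_options,
        )
        profile_job = cast(hub.client.ProfileJob, submitted_profile_job)

    # 3. Summarize: guard against None before wait()/download_profile(),
    #    mirroring the added "assert profile_job is not None" lines.
    if profile_job is not None:
        assert profile_job.wait().success
        profile_data: Dict[str, Any] = profile_job.download_profile()
        print(f"Profile keys for {name}: {sorted(profile_data)}")
```

The same cast-then-guard shape is what the diffs apply to the inference job as well, which is why each `assert job.wait().success` becomes `assert job is not None and job.wait().success` once the job variables are typed as Optional.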
diff --git a/qai_hub_models/models/ffnet_40s_quantized/perf.yaml b/qai_hub_models/models/ffnet_40s_quantized/perf.yaml index d0408e2b..6d795ec7 100644 --- a/qai_hub_models/models/ffnet_40s_quantized/perf.yaml +++ b/qai_hub_models/models/ffnet_40s_quantized/perf.yaml @@ -17,31 +17,42 @@ aggregated: - Samsung Galaxy S23 - Samsung Galaxy S23 Ultra - Samsung Galaxy S23+ + - Samsung Galaxy S24 + - Samsung Galaxy S24 Ultra - Samsung Galaxy Tab S8 - Xiaomi 12 - Xiaomi 12 Pro supported_chipsets: - Snapdragon® 8 Gen 1 - Snapdragon® 8 Gen 2 + - Snapdragon® 8 Gen 3 - Snapdragon® 888 models: - name: FFNet-40S-Quantized performance_metrics: - torchscript_onnx_tflite: - inference_time: 6451.0 - throughput: 155.0147263990079 + inference_time: 6439.0 + throughput: 155.3036185743128 estimated_peak_memory_range: - min: 851968 - max: 2582296 + min: 888832 + max: 2660784 primary_compute_unit: NPU - precision: fp16 + precision: int8 layer_info: layers_on_npu: 97 layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 97 - job_id: j0pxl6x9p + job_id: jqp4q92go job_status: Passed + reference_device_info: + name: Samsung Galaxy S23 + os: '13' + form_factor: Phone + os_name: Android + manufacturer: Samsung + chipset: Snapdragon® 8 Gen 2 + timestamp: '2024-03-14T23:35:32.125659Z' torchscript_onnx_qnn: inference_time: 'null' throughput: 'null' @@ -57,11 +68,41 @@ models: total_layers: 0 job_id: '' job_status: Skipped + - torchscript_onnx_tflite: + inference_time: 4671.0 + throughput: 214.08691928923142 + estimated_peak_memory_range: + min: 16384 + max: 65022448 + primary_compute_unit: NPU + precision: int8 + layer_info: + layers_on_npu: 97 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 97 + job_id: j0pxvd8g7 + job_status: Passed reference_device_info: - name: Samsung Galaxy S23 Ultra - os: '13' + name: Samsung Galaxy S24 + os: '14' form_factor: Phone os_name: Android manufacturer: Samsung - chipset: Snapdragon® 8 Gen 2 - timestamp: '2024-02-21T16:15:22.015621Z' + chipset: Snapdragon® 8 Gen 3 + timestamp: '2024-03-14T23:35:32.125673Z' + torchscript_onnx_qnn: + inference_time: 'null' + throughput: 'null' + estimated_peak_memory_range: + min: 0 + max: 0 + primary_compute_unit: 'null' + precision: 'null' + layer_info: + layers_on_npu: 0 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 0 + job_id: '' + job_status: Skipped diff --git a/qai_hub_models/models/ffnet_40s_quantized/test.py b/qai_hub_models/models/ffnet_40s_quantized/test.py index 12e10323..9d2ebf14 100644 --- a/qai_hub_models/models/ffnet_40s_quantized/test.py +++ b/qai_hub_models/models/ffnet_40s_quantized/test.py @@ -17,5 +17,6 @@ def test_off_target_numerical(): ) +@skip_clone_repo_check def test_demo(): demo_main(is_test=True) diff --git a/qai_hub_models/models/ffnet_54s/README.md b/qai_hub_models/models/ffnet_54s/README.md index ecb28e6c..487434ac 100644 --- a/qai_hub_models/models/ffnet_54s/README.md +++ b/qai_hub_models/models/ffnet_54s/README.md @@ -10,7 +10,7 @@ This is based on the implementation of FFNet-54S found export suitable to run on Qualcomm® devices. More details on model performance accross various devices, can be found [here](https://aihub.qualcomm.com/models/ffnet_54s). -[Sign up](https://aihub.qualcomm.com/) for early access to run these models on +[Sign up](https://myaccount.qualcomm.com/signup) for early access to run these models on a hosted Qualcomm® device. @@ -30,7 +30,7 @@ python -m qai_hub_models.models.ffnet_54s.demo More details on the CLI tool can be found with the `--help` option. 
See [demo.py](demo.py) for sample usage of the model including pre/post processing scripts. Please refer to our [general instructions on using -models](../../#qai-hub-models) for more usage instructions. +models](../../../#getting-started) for more usage instructions. ## Export for on-device deployment @@ -46,7 +46,7 @@ script requires access to Deployment instructions for Qualcomm® AI Hub. ## License - The license for the original implementation of FFNet-54S can be found [here](https://github.com/Qualcomm-AI-research/FFNet/blob/master/LICENSE). -- The license for the compiled assets for on-device deployment can be found [here](https://qaihub-public-assets.s3.us-west-2.amazonaws.com/qai-hub-models/Qualcomm+AI+Hub+Proprietary+License.pdf). +- The license for the compiled assets for on-device deployment can be found [here]({deploy_license_url}) ## References * [Simple and Efficient Architectures for Semantic Segmentation](https://arxiv.org/abs/2206.08236) diff --git a/qai_hub_models/models/ffnet_54s/conftest.py b/qai_hub_models/models/ffnet_54s/conftest.py new file mode 100644 index 00000000..e87eb0b0 --- /dev/null +++ b/qai_hub_models/models/ffnet_54s/conftest.py @@ -0,0 +1,26 @@ +# --------------------------------------------------------------------- +# Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. +# SPDX-License-Identifier: BSD-3-Clause +# --------------------------------------------------------------------- +# THIS FILE WAS AUTO-GENERATED. DO NOT EDIT MANUALLY. + +from unittest.mock import patch + +import pytest + +from qai_hub_models.models.ffnet_54s import Model +from qai_hub_models.utils.testing import skip_clone_repo_check + + +@pytest.fixture(autouse=True) +@skip_clone_repo_check +def mock_from_pretrained(): + """ + Model.from_pretrained() can be slow. Invoke it once and cache it so all invocations + across all tests return the cached instance of the model. + """ + mock = patch( + "qai_hub_models.models.ffnet_54s.Model.from_pretrained", + return_value=Model.from_pretrained(), + ) + mock.start() diff --git a/qai_hub_models/models/ffnet_54s/export.py b/qai_hub_models/models/ffnet_54s/export.py index 14761dc7..5a27ee33 100644 --- a/qai_hub_models/models/ffnet_54s/export.py +++ b/qai_hub_models/models/ffnet_54s/export.py @@ -10,7 +10,7 @@ import os import warnings from pathlib import Path -from typing import List, Optional, Tuple +from typing import Any, Dict, List, Optional, Tuple, cast import qai_hub as hub import torch @@ -109,7 +109,7 @@ def export_model( ) # Trace the model - source_model = torch.jit.trace(model, make_torch_inputs(input_spec)) + source_model = torch.jit.trace(model.to("cpu"), make_torch_inputs(input_spec)) # 2. Compile the model to an on-device asset model_compile_options = model.get_hub_compile_options( @@ -118,29 +118,37 @@ def export_model( + " --force_channel_last_input image" + " --force_channel_last_output output_0", ) - print(f"Optimizing model {model_name} to run on-device.") - compile_job = hub.submit_compile_job( + print(f"Optimizing model {model_name} to run on-device") + submitted_compile_job = hub.submit_compile_job( model=source_model, input_specs=input_spec, device=hub.Device(device), name=model_name, options=model_compile_options, ) + compile_job = cast(hub.client.CompileJob, submitted_compile_job) # 3. 
Profile the model asset on real devices - profile_job = None + profile_job: Optional[hub.client.ProfileJob] = None if not skip_profiling: + profile_options_all = model.get_hub_profile_options( + target_runtime, profile_options + ) print(f"Profiling model {model_name} on a hosted device.") - profile_job = hub.submit_profile_job( + submitted_profile_job = hub.submit_profile_job( model=compile_job.get_target_model(), device=hub.Device(device), name=model_name, - options=profile_options, + options=profile_options_all, ) + profile_job = cast(hub.client.ProfileJob, submitted_profile_job) # 4. Run inference on-device with sample inputs - inference_job = None + inference_job: Optional[hub.client.InferenceJob] = None if not skip_inferencing: + profile_options_all = model.get_hub_profile_options( + target_runtime, profile_options + ) print( f"Running inference for {model_name} on a hosted device with example inputs." ) @@ -149,30 +157,31 @@ def export_model( hub_inputs = transpose_channel_first_to_last( "image", sample_inputs, target_runtime ) - inference_job = hub.submit_inference_job( + submitted_inference_job = hub.submit_inference_job( model=compile_job.get_target_model(), inputs=hub_inputs, device=hub.Device(device), name=model_name, - options=profile_options, + options=profile_options_all, ) + inference_job = cast(hub.client.InferenceJob, submitted_inference_job) # 5. Download the model asset to a local file if not skip_downloading: os.makedirs(output_path, exist_ok=True) - target_model = compile_job.get_target_model() + target_model: hub.Model = compile_job.get_target_model() # type: ignore target_model.download(str(output_path / f"{model_name}.tflite")) # 6. Summarize the results from profiling and inference if not skip_summary and not skip_profiling: - assert profile_job.wait().success - profile_data = profile_job.download_profile() + assert profile_job is not None and profile_job.wait().success + profile_data: Dict[str, Any] = profile_job.download_profile() # type: ignore print_profile_metrics_from_job(profile_job, profile_data) if not skip_summary and not skip_inferencing: torch_out = torch_inference(model, sample_inputs) - assert inference_job.wait().success - inference_result = inference_job.download_output_data() + assert inference_job is not None and inference_job.wait().success + inference_result: hub.client.DatasetEntries = inference_job.download_output_data() # type: ignore # Convert outputs from channel last to channel first inference_result = transpose_channel_last_to_first( "output_0", inference_result, target_runtime diff --git a/qai_hub_models/models/ffnet_54s/info.yaml b/qai_hub_models/models/ffnet_54s/info.yaml index be93ffc8..846f0dd0 100644 --- a/qai_hub_models/models/ffnet_54s/info.yaml +++ b/qai_hub_models/models/ffnet_54s/info.yaml @@ -12,6 +12,7 @@ tags: [] research_paper: https://arxiv.org/abs/2206.08236 research_paper_title: Simple and Efficient Architectures for Semantic Segmentation license: https://github.com/Qualcomm-AI-research/FFNet/blob/master/LICENSE +deploy_license: https://qaihub-public-assets.s3.us-west-2.amazonaws.com/qai-hub-models/Qualcomm+AI+Hub+Proprietary+License.pdf source_repo: https://github.com/Qualcomm-AI-research/FFNet technical_details: Model checkpoint: ffnet54S_dBBB_cityscapes_state_dict_quarts @@ -30,7 +31,8 @@ form_factors: - Phone - Tablet has_static_banner: yes -has_animated_banner: yes +has_animated_banner: no license_type: bsd-3-clause +deploy_license_type: AI Model Hub License dataset: - cityscapes diff --git 
a/qai_hub_models/models/ffnet_54s/perf.yaml b/qai_hub_models/models/ffnet_54s/perf.yaml index 13243d9e..9f1617aa 100644 --- a/qai_hub_models/models/ffnet_54s/perf.yaml +++ b/qai_hub_models/models/ffnet_54s/perf.yaml @@ -17,22 +17,25 @@ aggregated: - Samsung Galaxy S23 - Samsung Galaxy S23 Ultra - Samsung Galaxy S23+ + - Samsung Galaxy S24 + - Samsung Galaxy S24 Ultra - Samsung Galaxy Tab S8 - Xiaomi 12 - Xiaomi 12 Pro supported_chipsets: - Snapdragon® 8 Gen 1 - Snapdragon® 8 Gen 2 + - Snapdragon® 8 Gen 3 - Snapdragon® 888 models: - name: FFNet-54S performance_metrics: - torchscript_onnx_tflite: - inference_time: 25261.0 - throughput: 39.58671469854717 + inference_time: 24853.0 + throughput: 40.23659115599727 estimated_peak_memory_range: - min: 2551808 - max: 4912232 + min: 2572288 + max: 4947328 primary_compute_unit: NPU precision: fp16 layer_info: @@ -40,14 +43,22 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 113 - job_id: jygzlj8z5 + job_id: j0pxv38g7 job_status: Passed + reference_device_info: + name: Samsung Galaxy S23 + os: '13' + form_factor: Phone + os_name: Android + manufacturer: Samsung + chipset: Snapdragon® 8 Gen 2 + timestamp: '2024-03-14T23:47:57.765081Z' torchscript_onnx_qnn: - inference_time: 20585.0 - throughput: 48.57906242409521 + inference_time: 19975.0 + throughput: 50.06257822277847 estimated_peak_memory_range: - min: 25206784 - max: 41071808 + min: 25214976 + max: 52299192 primary_compute_unit: NPU precision: fp16 layer_info: @@ -55,13 +66,43 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 176 - job_id: jz5wl38zp + job_id: joprkok50 + job_status: Passed + - torchscript_onnx_tflite: + inference_time: 18421.0 + throughput: 54.28586938819825 + estimated_peak_memory_range: + min: 462848 + max: 113159440 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 113 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 113 + job_id: jo5mro7gk job_status: Passed reference_device_info: - name: Samsung Galaxy S23 Ultra - os: '13' + name: Samsung Galaxy S24 + os: '14' form_factor: Phone os_name: Android manufacturer: Samsung - chipset: Snapdragon® 8 Gen 2 - timestamp: '2024-02-21T16:31:19.360420Z' + chipset: Snapdragon® 8 Gen 3 + timestamp: '2024-03-14T23:49:56.432155Z' + torchscript_onnx_qnn: + inference_time: 14570.0 + throughput: 68.63417982155113 + estimated_peak_memory_range: + min: 154132480 + max: 217703424 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 176 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 176 + job_id: jep2846p6 + job_status: Passed diff --git a/qai_hub_models/models/ffnet_54s/requirements.txt b/qai_hub_models/models/ffnet_54s/requirements.txt index 73ad8aa8..2470ac6c 100644 --- a/qai_hub_models/models/ffnet_54s/requirements.txt +++ b/qai_hub_models/models/ffnet_54s/requirements.txt @@ -1 +1 @@ -scikit-image>=0.21.0 +scikit-image==0.21.0 diff --git a/qai_hub_models/models/ffnet_54s_quantized/README.md b/qai_hub_models/models/ffnet_54s_quantized/README.md index 3b9d860f..346077ff 100644 --- a/qai_hub_models/models/ffnet_54s_quantized/README.md +++ b/qai_hub_models/models/ffnet_54s_quantized/README.md @@ -10,7 +10,7 @@ This is based on the implementation of FFNet-54S-Quantized found export suitable to run on Qualcomm® devices. More details on model performance accross various devices, can be found [here](https://aihub.qualcomm.com/models/ffnet_54s_quantized). 
-[Sign up](https://aihub.qualcomm.com/) for early access to run these models on +[Sign up](https://myaccount.qualcomm.com/signup) for early access to run these models on a hosted Qualcomm® device. @@ -25,7 +25,7 @@ python -m qai_hub_models.models.ffnet_54s_quantized.demo More details on the CLI tool can be found with the `--help` option. See [demo.py](demo.py) for sample usage of the model including pre/post processing scripts. Please refer to our [general instructions on using -models](../../#qai-hub-models) for more usage instructions. +models](../../../#getting-started) for more usage instructions. ## Export for on-device deployment @@ -41,7 +41,7 @@ script requires access to Deployment instructions for Qualcomm® AI Hub. ## License - The license for the original implementation of FFNet-54S-Quantized can be found [here](https://github.com/Qualcomm-AI-research/FFNet/blob/master/LICENSE). -- The license for the compiled assets for on-device deployment can be found [here](https://qaihub-public-assets.s3.us-west-2.amazonaws.com/qai-hub-models/Qualcomm+AI+Hub+Proprietary+License.pdf). +- The license for the compiled assets for on-device deployment can be found [here]({deploy_license_url}) ## References * [Simple and Efficient Architectures for Semantic Segmentation](https://arxiv.org/abs/2206.08236) diff --git a/qai_hub_models/models/ffnet_54s_quantized/conftest.py b/qai_hub_models/models/ffnet_54s_quantized/conftest.py new file mode 100644 index 00000000..f60efc9c --- /dev/null +++ b/qai_hub_models/models/ffnet_54s_quantized/conftest.py @@ -0,0 +1,26 @@ +# --------------------------------------------------------------------- +# Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. +# SPDX-License-Identifier: BSD-3-Clause +# --------------------------------------------------------------------- +# THIS FILE WAS AUTO-GENERATED. DO NOT EDIT MANUALLY. + +from unittest.mock import patch + +import pytest + +from qai_hub_models.models.ffnet_54s_quantized import Model +from qai_hub_models.utils.testing import skip_clone_repo_check + + +@pytest.fixture(autouse=True) +@skip_clone_repo_check +def mock_from_pretrained(): + """ + Model.from_pretrained() can be slow. Invoke it once and cache it so all invocations + across all tests return the cached instance of the model. + """ + mock = patch( + "qai_hub_models.models.ffnet_54s_quantized.Model.from_pretrained", + return_value=Model.from_pretrained(), + ) + mock.start() diff --git a/qai_hub_models/models/ffnet_54s_quantized/export.py b/qai_hub_models/models/ffnet_54s_quantized/export.py index 382d06b3..83dc6e73 100644 --- a/qai_hub_models/models/ffnet_54s_quantized/export.py +++ b/qai_hub_models/models/ffnet_54s_quantized/export.py @@ -10,7 +10,7 @@ import os import warnings from pathlib import Path -from typing import List, Optional, Tuple +from typing import Any, Dict, List, Optional, Tuple, cast import qai_hub as hub @@ -123,8 +123,8 @@ def export_model( + " --force_channel_last_input image" + " --force_channel_last_output output_0", ) - print(f"Optimizing model {model_name} to run on-device.") - compile_job = hub.submit_compile_job( + print(f"Optimizing model {model_name} to run on-device") + submitted_compile_job = hub.submit_compile_job( model=source_model, input_specs=input_spec, device=hub.Device(device), @@ -132,21 +132,29 @@ def export_model( calibration_data=quant_calibration_data, options=model_compile_options, ) + compile_job = cast(hub.client.CompileJob, submitted_compile_job) # 3. 
Profile the model asset on real devices - profile_job = None + profile_job: Optional[hub.client.ProfileJob] = None if not skip_profiling: + profile_options_all = model.get_hub_profile_options( + target_runtime, profile_options + ) print(f"Profiling model {model_name} on a hosted device.") - profile_job = hub.submit_profile_job( + submitted_profile_job = hub.submit_profile_job( model=compile_job.get_target_model(), device=hub.Device(device), name=model_name, - options=profile_options, + options=profile_options_all, ) + profile_job = cast(hub.client.ProfileJob, submitted_profile_job) # 4. Run inference on-device with sample inputs - inference_job = None + inference_job: Optional[hub.client.InferenceJob] = None if not skip_inferencing: + profile_options_all = model.get_hub_profile_options( + target_runtime, profile_options + ) print( f"Running inference for {model_name} on a hosted device with example inputs." ) @@ -158,30 +166,31 @@ def export_model( hub_inputs = transpose_channel_first_to_last( "image", sample_inputs, target_runtime ) - inference_job = hub.submit_inference_job( + submitted_inference_job = hub.submit_inference_job( model=compile_job.get_target_model(), inputs=hub_inputs, device=hub.Device(device), name=model_name, - options=profile_options, + options=profile_options_all, ) + inference_job = cast(hub.client.InferenceJob, submitted_inference_job) # 5. Download the model asset to a local file if not skip_downloading: os.makedirs(output_path, exist_ok=True) - target_model = compile_job.get_target_model() + target_model: hub.Model = compile_job.get_target_model() # type: ignore target_model.download(str(output_path / f"{model_name}.tflite")) # 6. Summarize the results from profiling and inference if not skip_summary and not skip_profiling: - assert profile_job.wait().success - profile_data = profile_job.download_profile() + assert profile_job is not None and profile_job.wait().success + profile_data: Dict[str, Any] = profile_job.download_profile() # type: ignore print_profile_metrics_from_job(profile_job, profile_data) if not skip_summary and not skip_inferencing: torch_out = torch_inference(model, sample_inputs) - assert inference_job.wait().success - inference_result = inference_job.download_output_data() + assert inference_job is not None and inference_job.wait().success + inference_result: hub.client.DatasetEntries = inference_job.download_output_data() # type: ignore # Convert outputs from channel last to channel first inference_result = transpose_channel_last_to_first( "output_0", inference_result, target_runtime diff --git a/qai_hub_models/models/ffnet_54s_quantized/info.yaml b/qai_hub_models/models/ffnet_54s_quantized/info.yaml index a97c6bbf..a7f45fd7 100644 --- a/qai_hub_models/models/ffnet_54s_quantized/info.yaml +++ b/qai_hub_models/models/ffnet_54s_quantized/info.yaml @@ -14,6 +14,7 @@ tags: research_paper: https://arxiv.org/abs/2206.08236 research_paper_title: Simple and Efficient Architectures for Semantic Segmentation license: https://github.com/Qualcomm-AI-research/FFNet/blob/master/LICENSE +deploy_license: https://qaihub-public-assets.s3.us-west-2.amazonaws.com/qai-hub-models/Qualcomm+AI+Hub+Proprietary+License.pdf source_repo: https://github.com/Qualcomm-AI-research/FFNet technical_details: Model checkpoint: ffnet54S_dBBB_cityscapes_state_dict_quarts @@ -32,7 +33,8 @@ form_factors: - Phone - Tablet has_static_banner: yes -has_animated_banner: yes +has_animated_banner: no license_type: bsd-3-clause +deploy_license_type: AI Model Hub License dataset: - cityscapes 
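Throughout these export scripts the compile job is given `--force_channel_last_input image --force_channel_last_output output_0`, sample inputs are passed through `transpose_channel_first_to_last` before inference, and on-device outputs go back through `transpose_channel_last_to_first` before being compared against the torch reference. The snippet below is only an illustration of that layout change (NCHW to NHWC and back) using numpy with made-up shapes; it is not the repository's helper implementation.

```python
# Illustration of the channel-layout conversion assumed by the
# --force_channel_last_* compile options and the transpose_* helpers in the
# export scripts: torch-style channel-first arrays become channel-last for the
# compiled asset, and outputs are converted back. Shapes here are illustrative.
import numpy as np

# Channel-first "image" input: (batch, channels, height, width)
image_nchw = np.random.rand(1, 3, 512, 1024).astype(np.float32)

# Channel-last layout expected by the compiled on-device asset:
# (batch, height, width, channels)
image_nhwc = image_nchw.transpose(0, 2, 3, 1)
assert image_nhwc.shape == (1, 512, 1024, 3)

# Channel-last "output_0" from the device, converted back to channel-first
# before comparison with the torch output.
output_nhwc = np.random.rand(1, 64, 128, 19).astype(np.float32)
output_nchw = output_nhwc.transpose(0, 3, 1, 2)
assert output_nchw.shape == (1, 19, 64, 128)
```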
diff --git a/qai_hub_models/models/ffnet_54s_quantized/perf.yaml b/qai_hub_models/models/ffnet_54s_quantized/perf.yaml index 4e723a91..1f34553f 100644 --- a/qai_hub_models/models/ffnet_54s_quantized/perf.yaml +++ b/qai_hub_models/models/ffnet_54s_quantized/perf.yaml @@ -17,31 +17,42 @@ aggregated: - Samsung Galaxy S23 - Samsung Galaxy S23 Ultra - Samsung Galaxy S23+ + - Samsung Galaxy S24 + - Samsung Galaxy S24 Ultra - Samsung Galaxy Tab S8 - Xiaomi 12 - Xiaomi 12 Pro supported_chipsets: - Snapdragon® 8 Gen 1 - Snapdragon® 8 Gen 2 + - Snapdragon® 8 Gen 3 - Snapdragon® 888 models: - name: FFNet-54S-Quantized performance_metrics: - torchscript_onnx_tflite: - inference_time: 7130.0 - throughput: 140.25245441795232 + inference_time: 7127.0 + throughput: 140.31149151115477 estimated_peak_memory_range: - min: 643072 - max: 23970880 + min: 712704 + max: 2530520 primary_compute_unit: NPU - precision: fp16 + precision: int8 layer_info: layers_on_npu: 118 layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 118 - job_id: jep2r9wmg + job_id: j7gjxeepd job_status: Passed + reference_device_info: + name: Samsung Galaxy S23 + os: '13' + form_factor: Phone + os_name: Android + manufacturer: Samsung + chipset: Snapdragon® 8 Gen 2 + timestamp: '2024-03-14T23:16:07.677264Z' torchscript_onnx_qnn: inference_time: 'null' throughput: 'null' @@ -57,11 +68,41 @@ models: total_layers: 0 job_id: '' job_status: Skipped + - torchscript_onnx_tflite: + inference_time: 5136.0 + throughput: 194.70404984423675 + estimated_peak_memory_range: + min: 16384 + max: 71676704 + primary_compute_unit: NPU + precision: int8 + layer_info: + layers_on_npu: 118 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 118 + job_id: jnp10e75q + job_status: Passed reference_device_info: - name: Samsung Galaxy S23 Ultra - os: '13' + name: Samsung Galaxy S24 + os: '14' form_factor: Phone os_name: Android manufacturer: Samsung - chipset: Snapdragon® 8 Gen 2 - timestamp: '2024-02-21T16:19:49.268425Z' + chipset: Snapdragon® 8 Gen 3 + timestamp: '2024-03-14T23:16:07.677274Z' + torchscript_onnx_qnn: + inference_time: 'null' + throughput: 'null' + estimated_peak_memory_range: + min: 0 + max: 0 + primary_compute_unit: 'null' + precision: 'null' + layer_info: + layers_on_npu: 0 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 0 + job_id: '' + job_status: Skipped diff --git a/qai_hub_models/models/ffnet_54s_quantized/test.py b/qai_hub_models/models/ffnet_54s_quantized/test.py index 0b5b9132..3202e209 100644 --- a/qai_hub_models/models/ffnet_54s_quantized/test.py +++ b/qai_hub_models/models/ffnet_54s_quantized/test.py @@ -17,5 +17,6 @@ def test_off_target_numerical(): ) +@skip_clone_repo_check def test_demo(): demo_main(is_test=True) diff --git a/qai_hub_models/models/ffnet_78s/README.md b/qai_hub_models/models/ffnet_78s/README.md index f8012df2..2b2e42f2 100644 --- a/qai_hub_models/models/ffnet_78s/README.md +++ b/qai_hub_models/models/ffnet_78s/README.md @@ -10,7 +10,7 @@ This is based on the implementation of FFNet-78S found export suitable to run on Qualcomm® devices. More details on model performance accross various devices, can be found [here](https://aihub.qualcomm.com/models/ffnet_78s). -[Sign up](https://aihub.qualcomm.com/) for early access to run these models on +[Sign up](https://myaccount.qualcomm.com/signup) for early access to run these models on a hosted Qualcomm® device. @@ -30,7 +30,7 @@ python -m qai_hub_models.models.ffnet_78s.demo More details on the CLI tool can be found with the `--help` option. 
See [demo.py](demo.py) for sample usage of the model including pre/post processing scripts. Please refer to our [general instructions on using -models](../../#qai-hub-models) for more usage instructions. +models](../../../#getting-started) for more usage instructions. ## Export for on-device deployment @@ -46,7 +46,7 @@ script requires access to Deployment instructions for Qualcomm® AI Hub. ## License - The license for the original implementation of FFNet-78S can be found [here](https://github.com/Qualcomm-AI-research/FFNet/blob/master/LICENSE). -- The license for the compiled assets for on-device deployment can be found [here](https://qaihub-public-assets.s3.us-west-2.amazonaws.com/qai-hub-models/Qualcomm+AI+Hub+Proprietary+License.pdf). +- The license for the compiled assets for on-device deployment can be found [here]({deploy_license_url}) ## References * [Simple and Efficient Architectures for Semantic Segmentation](https://arxiv.org/abs/2206.08236) diff --git a/qai_hub_models/models/ffnet_78s/conftest.py b/qai_hub_models/models/ffnet_78s/conftest.py new file mode 100644 index 00000000..e4a0bbd6 --- /dev/null +++ b/qai_hub_models/models/ffnet_78s/conftest.py @@ -0,0 +1,26 @@ +# --------------------------------------------------------------------- +# Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. +# SPDX-License-Identifier: BSD-3-Clause +# --------------------------------------------------------------------- +# THIS FILE WAS AUTO-GENERATED. DO NOT EDIT MANUALLY. + +from unittest.mock import patch + +import pytest + +from qai_hub_models.models.ffnet_78s import Model +from qai_hub_models.utils.testing import skip_clone_repo_check + + +@pytest.fixture(autouse=True) +@skip_clone_repo_check +def mock_from_pretrained(): + """ + Model.from_pretrained() can be slow. Invoke it once and cache it so all invocations + across all tests return the cached instance of the model. + """ + mock = patch( + "qai_hub_models.models.ffnet_78s.Model.from_pretrained", + return_value=Model.from_pretrained(), + ) + mock.start() diff --git a/qai_hub_models/models/ffnet_78s/export.py b/qai_hub_models/models/ffnet_78s/export.py index 9242f6bb..09e78a83 100644 --- a/qai_hub_models/models/ffnet_78s/export.py +++ b/qai_hub_models/models/ffnet_78s/export.py @@ -10,7 +10,7 @@ import os import warnings from pathlib import Path -from typing import List, Optional, Tuple +from typing import Any, Dict, List, Optional, Tuple, cast import qai_hub as hub import torch @@ -109,7 +109,7 @@ def export_model( ) # Trace the model - source_model = torch.jit.trace(model, make_torch_inputs(input_spec)) + source_model = torch.jit.trace(model.to("cpu"), make_torch_inputs(input_spec)) # 2. Compile the model to an on-device asset model_compile_options = model.get_hub_compile_options( @@ -118,29 +118,37 @@ def export_model( + " --force_channel_last_input image" + " --force_channel_last_output output_0", ) - print(f"Optimizing model {model_name} to run on-device.") - compile_job = hub.submit_compile_job( + print(f"Optimizing model {model_name} to run on-device") + submitted_compile_job = hub.submit_compile_job( model=source_model, input_specs=input_spec, device=hub.Device(device), name=model_name, options=model_compile_options, ) + compile_job = cast(hub.client.CompileJob, submitted_compile_job) # 3. 
Profile the model asset on real devices - profile_job = None + profile_job: Optional[hub.client.ProfileJob] = None if not skip_profiling: + profile_options_all = model.get_hub_profile_options( + target_runtime, profile_options + ) print(f"Profiling model {model_name} on a hosted device.") - profile_job = hub.submit_profile_job( + submitted_profile_job = hub.submit_profile_job( model=compile_job.get_target_model(), device=hub.Device(device), name=model_name, - options=profile_options, + options=profile_options_all, ) + profile_job = cast(hub.client.ProfileJob, submitted_profile_job) # 4. Run inference on-device with sample inputs - inference_job = None + inference_job: Optional[hub.client.InferenceJob] = None if not skip_inferencing: + profile_options_all = model.get_hub_profile_options( + target_runtime, profile_options + ) print( f"Running inference for {model_name} on a hosted device with example inputs." ) @@ -149,30 +157,31 @@ def export_model( hub_inputs = transpose_channel_first_to_last( "image", sample_inputs, target_runtime ) - inference_job = hub.submit_inference_job( + submitted_inference_job = hub.submit_inference_job( model=compile_job.get_target_model(), inputs=hub_inputs, device=hub.Device(device), name=model_name, - options=profile_options, + options=profile_options_all, ) + inference_job = cast(hub.client.InferenceJob, submitted_inference_job) # 5. Download the model asset to a local file if not skip_downloading: os.makedirs(output_path, exist_ok=True) - target_model = compile_job.get_target_model() + target_model: hub.Model = compile_job.get_target_model() # type: ignore target_model.download(str(output_path / f"{model_name}.tflite")) # 6. Summarize the results from profiling and inference if not skip_summary and not skip_profiling: - assert profile_job.wait().success - profile_data = profile_job.download_profile() + assert profile_job is not None and profile_job.wait().success + profile_data: Dict[str, Any] = profile_job.download_profile() # type: ignore print_profile_metrics_from_job(profile_job, profile_data) if not skip_summary and not skip_inferencing: torch_out = torch_inference(model, sample_inputs) - assert inference_job.wait().success - inference_result = inference_job.download_output_data() + assert inference_job is not None and inference_job.wait().success + inference_result: hub.client.DatasetEntries = inference_job.download_output_data() # type: ignore # Convert outputs from channel last to channel first inference_result = transpose_channel_last_to_first( "output_0", inference_result, target_runtime diff --git a/qai_hub_models/models/ffnet_78s/info.yaml b/qai_hub_models/models/ffnet_78s/info.yaml index 56fc5f75..1f1d8cf9 100644 --- a/qai_hub_models/models/ffnet_78s/info.yaml +++ b/qai_hub_models/models/ffnet_78s/info.yaml @@ -12,6 +12,7 @@ tags: [] research_paper: https://arxiv.org/abs/2206.08236 research_paper_title: Simple and Efficient Architectures for Semantic Segmentation license: https://github.com/Qualcomm-AI-research/FFNet/blob/master/LICENSE +deploy_license: https://qaihub-public-assets.s3.us-west-2.amazonaws.com/qai-hub-models/Qualcomm+AI+Hub+Proprietary+License.pdf source_repo: https://github.com/Qualcomm-AI-research/FFNet technical_details: Model checkpoint: ffnet78S_dBBB_cityscapes_state_dict_quarts @@ -30,7 +31,8 @@ form_factors: - Phone - Tablet has_static_banner: yes -has_animated_banner: yes +has_animated_banner: no license_type: bsd-3-clause +deploy_license_type: AI Model Hub License dataset: - cityscapes diff --git 
a/qai_hub_models/models/ffnet_78s/perf.yaml b/qai_hub_models/models/ffnet_78s/perf.yaml index 2db45bd8..3190c99d 100644 --- a/qai_hub_models/models/ffnet_78s/perf.yaml +++ b/qai_hub_models/models/ffnet_78s/perf.yaml @@ -17,22 +17,25 @@ aggregated: - Samsung Galaxy S23 - Samsung Galaxy S23 Ultra - Samsung Galaxy S23+ + - Samsung Galaxy S24 + - Samsung Galaxy S24 Ultra - Samsung Galaxy Tab S8 - Xiaomi 12 - Xiaomi 12 Pro supported_chipsets: - Snapdragon® 8 Gen 1 - Snapdragon® 8 Gen 2 + - Snapdragon® 8 Gen 3 - Snapdragon® 888 models: - name: FFNet-78S performance_metrics: - torchscript_onnx_tflite: - inference_time: 29611.0 - throughput: 33.77123366316572 + inference_time: 28993.0 + throughput: 34.49108405477184 estimated_peak_memory_range: - min: 2596864 - max: 5429112 + min: 2699264 + max: 4868664 primary_compute_unit: NPU precision: fp16 layer_info: @@ -40,14 +43,22 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 149 - job_id: jep2r9emg + job_id: j0pxvq9g7 job_status: Passed + reference_device_info: + name: Samsung Galaxy S23 + os: '13' + form_factor: Phone + os_name: Android + manufacturer: Samsung + chipset: Snapdragon® 8 Gen 2 + timestamp: '2024-03-14T23:58:52.803970Z' torchscript_onnx_qnn: - inference_time: 24120.0 - throughput: 41.459369817578775 + inference_time: 23765.0 + throughput: 42.07868714496108 estimated_peak_memory_range: - min: 2215936 - max: 32957000 + min: 25214976 + max: 45434792 primary_compute_unit: NPU precision: fp16 layer_info: @@ -55,13 +66,43 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 236 - job_id: jqpyojm45 + job_id: joprkre50 + job_status: Passed + - torchscript_onnx_tflite: + inference_time: 21479.0 + throughput: 46.557102285953725 + estimated_peak_memory_range: + min: 2478080 + max: 130875008 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 149 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 149 + job_id: jegn24mgo job_status: Passed reference_device_info: - name: Samsung Galaxy S23 Ultra - os: '13' + name: Samsung Galaxy S24 + os: '14' form_factor: Phone os_name: Android manufacturer: Samsung - chipset: Snapdragon® 8 Gen 2 - timestamp: '2024-02-21T16:36:14.251855Z' + chipset: Snapdragon® 8 Gen 3 + timestamp: '2024-03-15T00:00:58.594801Z' + torchscript_onnx_qnn: + inference_time: 17826.0 + throughput: 56.09783462358353 + estimated_peak_memory_range: + min: 25219072 + max: 99798224 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 236 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 236 + job_id: jep281mp6 + job_status: Passed diff --git a/qai_hub_models/models/ffnet_78s/requirements.txt b/qai_hub_models/models/ffnet_78s/requirements.txt index 73ad8aa8..2470ac6c 100644 --- a/qai_hub_models/models/ffnet_78s/requirements.txt +++ b/qai_hub_models/models/ffnet_78s/requirements.txt @@ -1 +1 @@ -scikit-image>=0.21.0 +scikit-image==0.21.0 diff --git a/qai_hub_models/models/ffnet_78s_lowres/README.md b/qai_hub_models/models/ffnet_78s_lowres/README.md index eba7805b..1575435d 100644 --- a/qai_hub_models/models/ffnet_78s_lowres/README.md +++ b/qai_hub_models/models/ffnet_78s_lowres/README.md @@ -10,7 +10,7 @@ This is based on the implementation of FFNet-78S-LowRes found export suitable to run on Qualcomm® devices. More details on model performance accross various devices, can be found [here](https://aihub.qualcomm.com/models/ffnet_78s_lowres). 
-[Sign up](https://aihub.qualcomm.com/) for early access to run these models on +[Sign up](https://myaccount.qualcomm.com/signup) for early access to run these models on a hosted Qualcomm® device. @@ -30,7 +30,7 @@ python -m qai_hub_models.models.ffnet_78s_lowres.demo More details on the CLI tool can be found with the `--help` option. See [demo.py](demo.py) for sample usage of the model including pre/post processing scripts. Please refer to our [general instructions on using -models](../../#qai-hub-models) for more usage instructions. +models](../../../#getting-started) for more usage instructions. ## Export for on-device deployment @@ -46,7 +46,7 @@ script requires access to Deployment instructions for Qualcomm® AI Hub. ## License - The license for the original implementation of FFNet-78S-LowRes can be found [here](https://github.com/Qualcomm-AI-research/FFNet/blob/master/LICENSE). -- The license for the compiled assets for on-device deployment can be found [here](https://qaihub-public-assets.s3.us-west-2.amazonaws.com/qai-hub-models/Qualcomm+AI+Hub+Proprietary+License.pdf). +- The license for the compiled assets for on-device deployment can be found [here]({deploy_license_url}) ## References * [Simple and Efficient Architectures for Semantic Segmentation](https://arxiv.org/abs/2206.08236) diff --git a/qai_hub_models/models/ffnet_78s_lowres/conftest.py b/qai_hub_models/models/ffnet_78s_lowres/conftest.py new file mode 100644 index 00000000..07c8d92a --- /dev/null +++ b/qai_hub_models/models/ffnet_78s_lowres/conftest.py @@ -0,0 +1,26 @@ +# --------------------------------------------------------------------- +# Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. +# SPDX-License-Identifier: BSD-3-Clause +# --------------------------------------------------------------------- +# THIS FILE WAS AUTO-GENERATED. DO NOT EDIT MANUALLY. + +from unittest.mock import patch + +import pytest + +from qai_hub_models.models.ffnet_78s_lowres import Model +from qai_hub_models.utils.testing import skip_clone_repo_check + + +@pytest.fixture(autouse=True) +@skip_clone_repo_check +def mock_from_pretrained(): + """ + Model.from_pretrained() can be slow. Invoke it once and cache it so all invocations + across all tests return the cached instance of the model. + """ + mock = patch( + "qai_hub_models.models.ffnet_78s_lowres.Model.from_pretrained", + return_value=Model.from_pretrained(), + ) + mock.start() diff --git a/qai_hub_models/models/ffnet_78s_lowres/export.py b/qai_hub_models/models/ffnet_78s_lowres/export.py index 00e1a3b1..08f9197f 100644 --- a/qai_hub_models/models/ffnet_78s_lowres/export.py +++ b/qai_hub_models/models/ffnet_78s_lowres/export.py @@ -10,7 +10,7 @@ import os import warnings from pathlib import Path -from typing import List, Optional, Tuple +from typing import Any, Dict, List, Optional, Tuple, cast import qai_hub as hub import torch @@ -109,7 +109,7 @@ def export_model( ) # Trace the model - source_model = torch.jit.trace(model, make_torch_inputs(input_spec)) + source_model = torch.jit.trace(model.to("cpu"), make_torch_inputs(input_spec)) # 2. 
Compile the model to an on-device asset model_compile_options = model.get_hub_compile_options( @@ -118,29 +118,37 @@ def export_model( + " --force_channel_last_input image" + " --force_channel_last_output output_0", ) - print(f"Optimizing model {model_name} to run on-device.") - compile_job = hub.submit_compile_job( + print(f"Optimizing model {model_name} to run on-device") + submitted_compile_job = hub.submit_compile_job( model=source_model, input_specs=input_spec, device=hub.Device(device), name=model_name, options=model_compile_options, ) + compile_job = cast(hub.client.CompileJob, submitted_compile_job) # 3. Profile the model asset on real devices - profile_job = None + profile_job: Optional[hub.client.ProfileJob] = None if not skip_profiling: + profile_options_all = model.get_hub_profile_options( + target_runtime, profile_options + ) print(f"Profiling model {model_name} on a hosted device.") - profile_job = hub.submit_profile_job( + submitted_profile_job = hub.submit_profile_job( model=compile_job.get_target_model(), device=hub.Device(device), name=model_name, - options=profile_options, + options=profile_options_all, ) + profile_job = cast(hub.client.ProfileJob, submitted_profile_job) # 4. Run inference on-device with sample inputs - inference_job = None + inference_job: Optional[hub.client.InferenceJob] = None if not skip_inferencing: + profile_options_all = model.get_hub_profile_options( + target_runtime, profile_options + ) print( f"Running inference for {model_name} on a hosted device with example inputs." ) @@ -149,30 +157,31 @@ def export_model( hub_inputs = transpose_channel_first_to_last( "image", sample_inputs, target_runtime ) - inference_job = hub.submit_inference_job( + submitted_inference_job = hub.submit_inference_job( model=compile_job.get_target_model(), inputs=hub_inputs, device=hub.Device(device), name=model_name, - options=profile_options, + options=profile_options_all, ) + inference_job = cast(hub.client.InferenceJob, submitted_inference_job) # 5. Download the model asset to a local file if not skip_downloading: os.makedirs(output_path, exist_ok=True) - target_model = compile_job.get_target_model() + target_model: hub.Model = compile_job.get_target_model() # type: ignore target_model.download(str(output_path / f"{model_name}.tflite")) # 6. 
Summarize the results from profiling and inference if not skip_summary and not skip_profiling: - assert profile_job.wait().success - profile_data = profile_job.download_profile() + assert profile_job is not None and profile_job.wait().success + profile_data: Dict[str, Any] = profile_job.download_profile() # type: ignore print_profile_metrics_from_job(profile_job, profile_data) if not skip_summary and not skip_inferencing: torch_out = torch_inference(model, sample_inputs) - assert inference_job.wait().success - inference_result = inference_job.download_output_data() + assert inference_job is not None and inference_job.wait().success + inference_result: hub.client.DatasetEntries = inference_job.download_output_data() # type: ignore # Convert outputs from channel last to channel first inference_result = transpose_channel_last_to_first( "output_0", inference_result, target_runtime diff --git a/qai_hub_models/models/ffnet_78s_lowres/info.yaml b/qai_hub_models/models/ffnet_78s_lowres/info.yaml index 9a4a4992..c50bd7cc 100644 --- a/qai_hub_models/models/ffnet_78s_lowres/info.yaml +++ b/qai_hub_models/models/ffnet_78s_lowres/info.yaml @@ -13,6 +13,7 @@ tags: research_paper: https://arxiv.org/abs/2206.08236 research_paper_title: Simple and Efficient Architectures for Semantic Segmentation license: https://github.com/Qualcomm-AI-research/FFNet/blob/master/LICENSE +deploy_license: https://qaihub-public-assets.s3.us-west-2.amazonaws.com/qai-hub-models/Qualcomm+AI+Hub+Proprietary+License.pdf source_repo: https://github.com/Qualcomm-AI-research/FFNet technical_details: Model checkpoint: ffnet78S_BCC_cityscapes_state_dict_quarts_pre_down @@ -31,7 +32,8 @@ form_factors: - Phone - Tablet has_static_banner: yes -has_animated_banner: yes +has_animated_banner: no license_type: bsd-3-clause +deploy_license_type: AI Model Hub License dataset: - cityscapes diff --git a/qai_hub_models/models/ffnet_78s_lowres/perf.yaml b/qai_hub_models/models/ffnet_78s_lowres/perf.yaml index 4fceeb4f..e55df80c 100644 --- a/qai_hub_models/models/ffnet_78s_lowres/perf.yaml +++ b/qai_hub_models/models/ffnet_78s_lowres/perf.yaml @@ -17,22 +17,25 @@ aggregated: - Samsung Galaxy S23 - Samsung Galaxy S23 Ultra - Samsung Galaxy S23+ + - Samsung Galaxy S24 + - Samsung Galaxy S24 Ultra - Samsung Galaxy Tab S8 - Xiaomi 12 - Xiaomi 12 Pro supported_chipsets: - Snapdragon® 8 Gen 1 - Snapdragon® 8 Gen 2 + - Snapdragon® 8 Gen 3 - Snapdragon® 888 models: - name: FFNet-78S-LowRes performance_metrics: - torchscript_onnx_tflite: - inference_time: 10833.0 - throughput: 92.31053263177328 + inference_time: 10810.0 + throughput: 92.50693802035153 estimated_peak_memory_range: - min: 671744 - max: 3588808 + min: 0 + max: 1890472 primary_compute_unit: NPU precision: fp16 layer_info: @@ -40,14 +43,22 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 149 - job_id: j0pxl6d9p + job_id: jegn2dmgo job_status: Passed + reference_device_info: + name: Samsung Galaxy S23 + os: '13' + form_factor: Phone + os_name: Android + manufacturer: Samsung + chipset: Snapdragon® 8 Gen 2 + timestamp: '2024-03-14T23:53:13.887710Z' torchscript_onnx_qnn: - inference_time: 11410.0 - throughput: 87.64241893076249 + inference_time: 11408.0 + throughput: 87.6577840112202 estimated_peak_memory_range: - min: 565248 - max: 42397168 + min: 16384 + max: 52414400 primary_compute_unit: NPU precision: fp16 layer_info: @@ -55,13 +66,43 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 237 - job_id: jegnzm7mg + job_id: jep28qmp6 + job_status: Passed + - torchscript_onnx_tflite: 
+ inference_time: 7768.0 + throughput: 128.73326467559218 + estimated_peak_memory_range: + min: 540672 + max: 52237632 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 149 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 149 + job_id: joprkme50 job_status: Passed reference_device_info: - name: Samsung Galaxy S23 Ultra - os: '13' + name: Samsung Galaxy S24 + os: '14' form_factor: Phone os_name: Android manufacturer: Samsung - chipset: Snapdragon® 8 Gen 2 - timestamp: '2024-02-21T16:29:56.988054Z' + chipset: Snapdragon® 8 Gen 3 + timestamp: '2024-03-14T23:57:44.327749Z' + torchscript_onnx_qnn: + inference_time: 8084.0 + throughput: 123.70113805047006 + estimated_peak_memory_range: + min: 6328320 + max: 72586224 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 237 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 237 + job_id: jqpyek4gy + job_status: Passed diff --git a/qai_hub_models/models/ffnet_78s_lowres/requirements.txt b/qai_hub_models/models/ffnet_78s_lowres/requirements.txt index 73ad8aa8..2470ac6c 100644 --- a/qai_hub_models/models/ffnet_78s_lowres/requirements.txt +++ b/qai_hub_models/models/ffnet_78s_lowres/requirements.txt @@ -1 +1 @@ -scikit-image>=0.21.0 +scikit-image==0.21.0 diff --git a/qai_hub_models/models/ffnet_78s_quantized/README.md b/qai_hub_models/models/ffnet_78s_quantized/README.md index 4a58d65a..e4cfbc65 100644 --- a/qai_hub_models/models/ffnet_78s_quantized/README.md +++ b/qai_hub_models/models/ffnet_78s_quantized/README.md @@ -10,7 +10,7 @@ This is based on the implementation of FFNet-78S-Quantized found export suitable to run on Qualcomm® devices. More details on model performance accross various devices, can be found [here](https://aihub.qualcomm.com/models/ffnet_78s_quantized). -[Sign up](https://aihub.qualcomm.com/) for early access to run these models on +[Sign up](https://myaccount.qualcomm.com/signup) for early access to run these models on a hosted Qualcomm® device. @@ -25,7 +25,7 @@ python -m qai_hub_models.models.ffnet_78s_quantized.demo More details on the CLI tool can be found with the `--help` option. See [demo.py](demo.py) for sample usage of the model including pre/post processing scripts. Please refer to our [general instructions on using -models](../../#qai-hub-models) for more usage instructions. +models](../../../#getting-started) for more usage instructions. ## Export for on-device deployment @@ -41,7 +41,7 @@ script requires access to Deployment instructions for Qualcomm® AI Hub. ## License - The license for the original implementation of FFNet-78S-Quantized can be found [here](https://github.com/Qualcomm-AI-research/FFNet/blob/master/LICENSE). -- The license for the compiled assets for on-device deployment can be found [here](https://qaihub-public-assets.s3.us-west-2.amazonaws.com/qai-hub-models/Qualcomm+AI+Hub+Proprietary+License.pdf). +- The license for the compiled assets for on-device deployment can be found [here]({deploy_license_url}) ## References * [Simple and Efficient Architectures for Semantic Segmentation](https://arxiv.org/abs/2206.08236) diff --git a/qai_hub_models/models/ffnet_78s_quantized/conftest.py b/qai_hub_models/models/ffnet_78s_quantized/conftest.py new file mode 100644 index 00000000..72a01635 --- /dev/null +++ b/qai_hub_models/models/ffnet_78s_quantized/conftest.py @@ -0,0 +1,26 @@ +# --------------------------------------------------------------------- +# Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. 
+# SPDX-License-Identifier: BSD-3-Clause +# --------------------------------------------------------------------- +# THIS FILE WAS AUTO-GENERATED. DO NOT EDIT MANUALLY. + +from unittest.mock import patch + +import pytest + +from qai_hub_models.models.ffnet_78s_quantized import Model +from qai_hub_models.utils.testing import skip_clone_repo_check + + +@pytest.fixture(autouse=True) +@skip_clone_repo_check +def mock_from_pretrained(): + """ + Model.from_pretrained() can be slow. Invoke it once and cache it so all invocations + across all tests return the cached instance of the model. + """ + mock = patch( + "qai_hub_models.models.ffnet_78s_quantized.Model.from_pretrained", + return_value=Model.from_pretrained(), + ) + mock.start() diff --git a/qai_hub_models/models/ffnet_78s_quantized/export.py b/qai_hub_models/models/ffnet_78s_quantized/export.py index 0e2f3a1e..49c83ae3 100644 --- a/qai_hub_models/models/ffnet_78s_quantized/export.py +++ b/qai_hub_models/models/ffnet_78s_quantized/export.py @@ -10,7 +10,7 @@ import os import warnings from pathlib import Path -from typing import List, Optional, Tuple +from typing import Any, Dict, List, Optional, Tuple, cast import qai_hub as hub @@ -123,8 +123,8 @@ def export_model( + " --force_channel_last_input image" + " --force_channel_last_output output_0", ) - print(f"Optimizing model {model_name} to run on-device.") - compile_job = hub.submit_compile_job( + print(f"Optimizing model {model_name} to run on-device") + submitted_compile_job = hub.submit_compile_job( model=source_model, input_specs=input_spec, device=hub.Device(device), @@ -132,21 +132,29 @@ def export_model( calibration_data=quant_calibration_data, options=model_compile_options, ) + compile_job = cast(hub.client.CompileJob, submitted_compile_job) # 3. Profile the model asset on real devices - profile_job = None + profile_job: Optional[hub.client.ProfileJob] = None if not skip_profiling: + profile_options_all = model.get_hub_profile_options( + target_runtime, profile_options + ) print(f"Profiling model {model_name} on a hosted device.") - profile_job = hub.submit_profile_job( + submitted_profile_job = hub.submit_profile_job( model=compile_job.get_target_model(), device=hub.Device(device), name=model_name, - options=profile_options, + options=profile_options_all, ) + profile_job = cast(hub.client.ProfileJob, submitted_profile_job) # 4. Run inference on-device with sample inputs - inference_job = None + inference_job: Optional[hub.client.InferenceJob] = None if not skip_inferencing: + profile_options_all = model.get_hub_profile_options( + target_runtime, profile_options + ) print( f"Running inference for {model_name} on a hosted device with example inputs." ) @@ -158,30 +166,31 @@ def export_model( hub_inputs = transpose_channel_first_to_last( "image", sample_inputs, target_runtime ) - inference_job = hub.submit_inference_job( + submitted_inference_job = hub.submit_inference_job( model=compile_job.get_target_model(), inputs=hub_inputs, device=hub.Device(device), name=model_name, - options=profile_options, + options=profile_options_all, ) + inference_job = cast(hub.client.InferenceJob, submitted_inference_job) # 5. Download the model asset to a local file if not skip_downloading: os.makedirs(output_path, exist_ok=True) - target_model = compile_job.get_target_model() + target_model: hub.Model = compile_job.get_target_model() # type: ignore target_model.download(str(output_path / f"{model_name}.tflite")) # 6. 
Summarize the results from profiling and inference if not skip_summary and not skip_profiling: - assert profile_job.wait().success - profile_data = profile_job.download_profile() + assert profile_job is not None and profile_job.wait().success + profile_data: Dict[str, Any] = profile_job.download_profile() # type: ignore print_profile_metrics_from_job(profile_job, profile_data) if not skip_summary and not skip_inferencing: torch_out = torch_inference(model, sample_inputs) - assert inference_job.wait().success - inference_result = inference_job.download_output_data() + assert inference_job is not None and inference_job.wait().success + inference_result: hub.client.DatasetEntries = inference_job.download_output_data() # type: ignore # Convert outputs from channel last to channel first inference_result = transpose_channel_last_to_first( "output_0", inference_result, target_runtime diff --git a/qai_hub_models/models/ffnet_78s_quantized/info.yaml b/qai_hub_models/models/ffnet_78s_quantized/info.yaml index cdb2f813..86f97e34 100644 --- a/qai_hub_models/models/ffnet_78s_quantized/info.yaml +++ b/qai_hub_models/models/ffnet_78s_quantized/info.yaml @@ -14,6 +14,7 @@ tags: research_paper: https://arxiv.org/abs/2206.08236 research_paper_title: Simple and Efficient Architectures for Semantic Segmentation license: https://github.com/Qualcomm-AI-research/FFNet/blob/master/LICENSE +deploy_license: https://qaihub-public-assets.s3.us-west-2.amazonaws.com/qai-hub-models/Qualcomm+AI+Hub+Proprietary+License.pdf source_repo: https://github.com/Qualcomm-AI-research/FFNet technical_details: Model checkpoint: ffnet78S_dBBB_cityscapes_state_dict_quarts @@ -32,7 +33,8 @@ form_factors: - Phone - Tablet has_static_banner: yes -has_animated_banner: yes +has_animated_banner: no license_type: bsd-3-clause +deploy_license_type: AI Model Hub License dataset: - cityscapes diff --git a/qai_hub_models/models/ffnet_78s_quantized/perf.yaml b/qai_hub_models/models/ffnet_78s_quantized/perf.yaml index 17c2c4c9..3201ba82 100644 --- a/qai_hub_models/models/ffnet_78s_quantized/perf.yaml +++ b/qai_hub_models/models/ffnet_78s_quantized/perf.yaml @@ -17,31 +17,42 @@ aggregated: - Samsung Galaxy S23 - Samsung Galaxy S23 Ultra - Samsung Galaxy S23+ + - Samsung Galaxy S24 + - Samsung Galaxy S24 Ultra - Samsung Galaxy Tab S8 - Xiaomi 12 - Xiaomi 12 Pro supported_chipsets: - Snapdragon® 8 Gen 1 - Snapdragon® 8 Gen 2 + - Snapdragon® 8 Gen 3 - Snapdragon® 888 models: - name: FFNet-78S-Quantized performance_metrics: - torchscript_onnx_tflite: - inference_time: 8362.0 - throughput: 119.58861516383641 + inference_time: 8368.0 + throughput: 119.50286806883365 estimated_peak_memory_range: - min: 655360 - max: 2403480 + min: 663552 + max: 2264096 primary_compute_unit: NPU - precision: fp16 + precision: int8 layer_info: layers_on_npu: 154 layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 154 - job_id: j1gly2oe5 + job_id: jegn2jmgo job_status: Passed + reference_device_info: + name: Samsung Galaxy S23 + os: '13' + form_factor: Phone + os_name: Android + manufacturer: Samsung + chipset: Snapdragon® 8 Gen 2 + timestamp: '2024-03-14T23:31:42.853131Z' torchscript_onnx_qnn: inference_time: 'null' throughput: 'null' @@ -57,11 +68,41 @@ models: total_layers: 0 job_id: '' job_status: Skipped + - torchscript_onnx_tflite: + inference_time: 6095.0 + throughput: 164.06890894175552 + estimated_peak_memory_range: + min: 16384 + max: 84212448 + primary_compute_unit: NPU + precision: int8 + layer_info: + layers_on_npu: 154 + layers_on_gpu: 0 + layers_on_cpu: 0 + 
total_layers: 154 + job_id: jep282mp6 + job_status: Passed reference_device_info: - name: Samsung Galaxy S23 Ultra - os: '13' + name: Samsung Galaxy S24 + os: '14' form_factor: Phone os_name: Android manufacturer: Samsung - chipset: Snapdragon® 8 Gen 2 - timestamp: '2024-02-21T16:13:29.270963Z' + chipset: Snapdragon® 8 Gen 3 + timestamp: '2024-03-14T23:31:42.853166Z' + torchscript_onnx_qnn: + inference_time: 'null' + throughput: 'null' + estimated_peak_memory_range: + min: 0 + max: 0 + primary_compute_unit: 'null' + precision: 'null' + layer_info: + layers_on_npu: 0 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 0 + job_id: '' + job_status: Skipped diff --git a/qai_hub_models/models/ffnet_78s_quantized/test.py b/qai_hub_models/models/ffnet_78s_quantized/test.py index 1c3c8a51..9cb6fa96 100644 --- a/qai_hub_models/models/ffnet_78s_quantized/test.py +++ b/qai_hub_models/models/ffnet_78s_quantized/test.py @@ -17,5 +17,6 @@ def test_off_target_numerical(): ) +@skip_clone_repo_check def test_demo(): demo_main(is_test=True) diff --git a/qai_hub_models/models/googlenet/README.md b/qai_hub_models/models/googlenet/README.md index ddd6d58f..32249c1d 100644 --- a/qai_hub_models/models/googlenet/README.md +++ b/qai_hub_models/models/googlenet/README.md @@ -10,7 +10,7 @@ This is based on the implementation of GoogLeNet found export suitable to run on Qualcomm® devices. More details on model performance accross various devices, can be found [here](https://aihub.qualcomm.com/models/googlenet). -[Sign up](https://aihub.qualcomm.com/) for early access to run these models on +[Sign up](https://myaccount.qualcomm.com/signup) for early access to run these models on a hosted Qualcomm® device. @@ -25,7 +25,7 @@ python -m qai_hub_models.models.googlenet.demo More details on the CLI tool can be found with the `--help` option. See [demo.py](demo.py) for sample usage of the model including pre/post processing scripts. Please refer to our [general instructions on using -models](../../#qai-hub-models) for more usage instructions. +models](../../../#getting-started) for more usage instructions. ## Export for on-device deployment @@ -41,7 +41,7 @@ script requires access to Deployment instructions for Qualcomm® AI Hub. ## License - The license for the original implementation of GoogLeNet can be found [here](https://github.com/pytorch/vision/blob/main/LICENSE). -- The license for the compiled assets for on-device deployment can be found [here](https://qaihub-public-assets.s3.us-west-2.amazonaws.com/qai-hub-models/Qualcomm+AI+Hub+Proprietary+License.pdf). +- The license for the compiled assets for on-device deployment can be found [here]({deploy_license_url}) ## References * [Going Deeper with Convolutions](https://arxiv.org/abs/1409.4842) diff --git a/qai_hub_models/models/googlenet/conftest.py b/qai_hub_models/models/googlenet/conftest.py new file mode 100644 index 00000000..30481135 --- /dev/null +++ b/qai_hub_models/models/googlenet/conftest.py @@ -0,0 +1,24 @@ +# --------------------------------------------------------------------- +# Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. +# SPDX-License-Identifier: BSD-3-Clause +# --------------------------------------------------------------------- +# THIS FILE WAS AUTO-GENERATED. DO NOT EDIT MANUALLY. + +from unittest.mock import patch + +import pytest + +from qai_hub_models.models.googlenet import Model + + +@pytest.fixture(autouse=True) +def mock_from_pretrained(): + """ + Model.from_pretrained() can be slow. 
Invoke it once and cache it so all invocations + across all tests return the cached instance of the model. + """ + mock = patch( + "qai_hub_models.models.googlenet.Model.from_pretrained", + return_value=Model.from_pretrained(), + ) + mock.start() diff --git a/qai_hub_models/models/googlenet/demo.py b/qai_hub_models/models/googlenet/demo.py index 7be6a572..954f7edd 100644 --- a/qai_hub_models/models/googlenet/demo.py +++ b/qai_hub_models/models/googlenet/demo.py @@ -3,11 +3,11 @@ # SPDX-License-Identifier: BSD-3-Clause # --------------------------------------------------------------------- from qai_hub_models.models._shared.imagenet_classifier.demo import imagenet_demo -from qai_hub_models.models.googlenet.model import GoogLeNet +from qai_hub_models.models.googlenet.model import MODEL_ID, GoogLeNet def main(is_test: bool = False): - imagenet_demo(GoogLeNet, is_test) + imagenet_demo(GoogLeNet, MODEL_ID, is_test) if __name__ == "__main__": diff --git a/qai_hub_models/models/googlenet/export.py b/qai_hub_models/models/googlenet/export.py index 38a7d779..81262c38 100644 --- a/qai_hub_models/models/googlenet/export.py +++ b/qai_hub_models/models/googlenet/export.py @@ -10,7 +10,7 @@ import os import warnings from pathlib import Path -from typing import List, Optional, Tuple +from typing import Any, Dict, List, Optional, Tuple, cast import qai_hub as hub import torch @@ -109,35 +109,43 @@ def export_model( ) # Trace the model - source_model = torch.jit.trace(model, make_torch_inputs(input_spec)) + source_model = torch.jit.trace(model.to("cpu"), make_torch_inputs(input_spec)) # 2. Compile the model to an on-device asset model_compile_options = model.get_hub_compile_options( target_runtime, compile_options + " --force_channel_last_input image_tensor" ) - print(f"Optimizing model {model_name} to run on-device.") - compile_job = hub.submit_compile_job( + print(f"Optimizing model {model_name} to run on-device") + submitted_compile_job = hub.submit_compile_job( model=source_model, input_specs=input_spec, device=hub.Device(device), name=model_name, options=model_compile_options, ) + compile_job = cast(hub.client.CompileJob, submitted_compile_job) # 3. Profile the model asset on real devices - profile_job = None + profile_job: Optional[hub.client.ProfileJob] = None if not skip_profiling: + profile_options_all = model.get_hub_profile_options( + target_runtime, profile_options + ) print(f"Profiling model {model_name} on a hosted device.") - profile_job = hub.submit_profile_job( + submitted_profile_job = hub.submit_profile_job( model=compile_job.get_target_model(), device=hub.Device(device), name=model_name, - options=profile_options, + options=profile_options_all, ) + profile_job = cast(hub.client.ProfileJob, submitted_profile_job) # 4. Run inference on-device with sample inputs - inference_job = None + inference_job: Optional[hub.client.InferenceJob] = None if not skip_inferencing: + profile_options_all = model.get_hub_profile_options( + target_runtime, profile_options + ) print( f"Running inference for {model_name} on a hosted device with example inputs." 
) @@ -146,33 +154,35 @@ def export_model( hub_inputs = transpose_channel_first_to_last( "image_tensor", sample_inputs, target_runtime ) - inference_job = hub.submit_inference_job( + submitted_inference_job = hub.submit_inference_job( model=compile_job.get_target_model(), inputs=hub_inputs, device=hub.Device(device), name=model_name, - options=profile_options, + options=profile_options_all, ) + inference_job = cast(hub.client.InferenceJob, submitted_inference_job) # 5. Download the model asset to a local file if not skip_downloading: os.makedirs(output_path, exist_ok=True) - target_model = compile_job.get_target_model() + target_model: hub.Model = compile_job.get_target_model() # type: ignore target_model.download(str(output_path / f"{model_name}.tflite")) # 6. Summarize the results from profiling and inference if not skip_summary and not skip_profiling: - assert profile_job.wait().success - profile_data = profile_job.download_profile() + assert profile_job is not None and profile_job.wait().success + profile_data: Dict[str, Any] = profile_job.download_profile() # type: ignore print_profile_metrics_from_job(profile_job, profile_data) if not skip_summary and not skip_inferencing: torch_out = torch_inference(model, sample_inputs) - assert inference_job.wait().success - inference_result = inference_job.download_output_data() + assert inference_job is not None and inference_job.wait().success + inference_result: hub.client.DatasetEntries = inference_job.download_output_data() # type: ignore print_inference_metrics(inference_job, inference_result, torch_out) - print_on_target_demo_cmd(compile_job, Path(__file__).parent.resolve(), device) + if not skip_summary: + print_on_target_demo_cmd(compile_job, Path(__file__).parent.resolve(), device) return (compile_job, profile_job, inference_job) diff --git a/qai_hub_models/models/googlenet/info.yaml b/qai_hub_models/models/googlenet/info.yaml index 047a032f..e3143397 100644 --- a/qai_hub_models/models/googlenet/info.yaml +++ b/qai_hub_models/models/googlenet/info.yaml @@ -12,6 +12,7 @@ tags: [] research_paper: https://arxiv.org/abs/1409.4842 research_paper_title: Going Deeper with Convolutions license: https://github.com/pytorch/vision/blob/main/LICENSE +deploy_license: https://qaihub-public-assets.s3.us-west-2.amazonaws.com/qai-hub-models/Qualcomm+AI+Hub+Proprietary+License.pdf source_repo: https://github.com/pytorch/vision/blob/main/torchvision/models/googlenet.py technical_details: Model checkpoint: Imagenet @@ -34,6 +35,7 @@ form_factors: has_static_banner: yes has_animated_banner: yes license_type: bsd-3-clause +deploy_license_type: AI Model Hub License dataset: - imagenet-1k - imagenet-22k diff --git a/qai_hub_models/models/googlenet/model.py b/qai_hub_models/models/googlenet/model.py index 4f1dcd5d..d4319351 100644 --- a/qai_hub_models/models/googlenet/model.py +++ b/qai_hub_models/models/googlenet/model.py @@ -14,6 +14,6 @@ class GoogLeNet(ImagenetClassifier): @classmethod - def from_pretrained(cls, weights: str = DEFAULT_WEIGHTS) -> ImagenetClassifier: - net = tv_models.googlenet(weights=weights) - return cls(net) + def from_pretrained(cls, weights: str = DEFAULT_WEIGHTS) -> GoogLeNet: + net = tv_models.googlenet(weights=weights, transform_input=False) + return cls(net, transform_input=True) diff --git a/qai_hub_models/models/googlenet/perf.yaml b/qai_hub_models/models/googlenet/perf.yaml index 5a446a3a..655972d7 100644 --- a/qai_hub_models/models/googlenet/perf.yaml +++ b/qai_hub_models/models/googlenet/perf.yaml @@ -17,51 +17,92 @@ 
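The perf.yaml entries in this patch pair an `inference_time` with a `throughput`; the numbers are consistent with inference time reported in microseconds and throughput as inferences per second, i.e. throughput = 1e6 / inference_time. A quick check against the FFNet-78S-LowRes figures above:

```python
# Reproduce the reported throughput from the reported inference time (microseconds).
for inference_time_us in (10810.0, 7768.0):   # FFNet-78S-LowRes TFLite, S23 and S24
    print(1e6 / inference_time_us)            # 92.50693802035153, 128.73326467559218
```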
aggregated: - Samsung Galaxy S23 - Samsung Galaxy S23 Ultra - Samsung Galaxy S23+ + - Samsung Galaxy S24 + - Samsung Galaxy S24 Ultra - Samsung Galaxy Tab S8 - Xiaomi 12 - Xiaomi 12 Pro supported_chipsets: - Snapdragon® 8 Gen 1 - Snapdragon® 8 Gen 2 + - Snapdragon® 8 Gen 3 - Snapdragon® 888 models: - name: GoogLeNet performance_metrics: - torchscript_onnx_tflite: - inference_time: 1471.0 - throughput: 679.8096532970768 + inference_time: 1041.0 + throughput: 960.6147934678194 estimated_peak_memory_range: - min: 16384 - max: 1850752 + min: 12288 + max: 1836376 primary_compute_unit: NPU precision: fp16 layer_info: - layers_on_npu: 94 + layers_on_npu: 84 layers_on_gpu: 0 layers_on_cpu: 0 - total_layers: 94 - job_id: jw568z3vg + total_layers: 84 + job_id: joprq3950 job_status: Passed + reference_device_info: + name: Samsung Galaxy S23 + os: '13' + form_factor: Phone + os_name: Android + manufacturer: Samsung + chipset: Snapdragon® 8 Gen 2 + timestamp: '2024-03-14T23:24:33.473846Z' torchscript_onnx_qnn: - inference_time: 1808.0 - throughput: 553.0973451327434 + inference_time: 1083.0 + throughput: 923.3610341643582 + estimated_peak_memory_range: + min: 32768 + max: 26497136 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 144 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 144 + job_id: jqpyw37gy + job_status: Passed + - torchscript_onnx_tflite: + inference_time: 647.0 + throughput: 1545.595054095827 estimated_peak_memory_range: - min: 24576 - max: 31167584 + min: 16384 + max: 45415536 primary_compute_unit: NPU precision: fp16 layer_info: - layers_on_npu: 156 + layers_on_npu: 84 layers_on_gpu: 0 layers_on_cpu: 0 - total_layers: 156 - job_id: j1p3z14x5 + total_layers: 84 + job_id: jep26y4g6 job_status: Passed reference_device_info: - name: Samsung Galaxy S23 Ultra - os: '13' + name: Samsung Galaxy S24 + os: '14' form_factor: Phone os_name: Android manufacturer: Samsung - chipset: Snapdragon® 8 Gen 2 - timestamp: '2024-02-21T16:07:34.463888Z' + chipset: Snapdragon® 8 Gen 3 + timestamp: '2024-03-14T23:28:59.659531Z' + torchscript_onnx_qnn: + inference_time: 682.0 + throughput: 1466.275659824047 + estimated_peak_memory_range: + min: 0 + max: 49977664 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 144 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 144 + job_id: j2p0q065w + job_status: Passed diff --git a/qai_hub_models/models/googlenet/test.py b/qai_hub_models/models/googlenet/test.py index 6f0b2a66..137eb7f1 100644 --- a/qai_hub_models/models/googlenet/test.py +++ b/qai_hub_models/models/googlenet/test.py @@ -2,6 +2,8 @@ # Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. 
# SPDX-License-Identifier: BSD-3-Clause # --------------------------------------------------------------------- +import pytest + from qai_hub_models.models._shared.imagenet_classifier.test_utils import ( run_imagenet_classifier_test, run_imagenet_classifier_trace_test, @@ -14,6 +16,7 @@ def test_task(): run_imagenet_classifier_test(GoogLeNet.from_pretrained(), MODEL_ID) +@pytest.mark.trace def test_trace(): run_imagenet_classifier_trace_test(GoogLeNet.from_pretrained()) diff --git a/qai_hub_models/models/googlenet_quantized/README.md b/qai_hub_models/models/googlenet_quantized/README.md index 2b6438a7..8e923612 100644 --- a/qai_hub_models/models/googlenet_quantized/README.md +++ b/qai_hub_models/models/googlenet_quantized/README.md @@ -10,7 +10,7 @@ This is based on the implementation of GoogLeNetQuantized found export suitable to run on Qualcomm® devices. More details on model performance accross various devices, can be found [here](https://aihub.qualcomm.com/models/googlenet_quantized). -[Sign up](https://aihub.qualcomm.com/) for early access to run these models on +[Sign up](https://myaccount.qualcomm.com/signup) for early access to run these models on a hosted Qualcomm® device. @@ -25,7 +25,7 @@ python -m qai_hub_models.models.googlenet_quantized.demo More details on the CLI tool can be found with the `--help` option. See [demo.py](demo.py) for sample usage of the model including pre/post processing scripts. Please refer to our [general instructions on using -models](../../#qai-hub-models) for more usage instructions. +models](../../../#getting-started) for more usage instructions. ## Export for on-device deployment @@ -41,7 +41,7 @@ script requires access to Deployment instructions for Qualcomm® AI Hub. ## License - The license for the original implementation of GoogLeNetQuantized can be found [here](https://github.com/pytorch/vision/blob/main/LICENSE). -- The license for the compiled assets for on-device deployment can be found [here](https://qaihub-public-assets.s3.us-west-2.amazonaws.com/qai-hub-models/Qualcomm+AI+Hub+Proprietary+License.pdf). +- The license for the compiled assets for on-device deployment can be found [here]({deploy_license_url}) ## References * [Going Deeper with Convolutions](https://arxiv.org/abs/1409.4842) diff --git a/qai_hub_models/models/googlenet_quantized/conftest.py b/qai_hub_models/models/googlenet_quantized/conftest.py new file mode 100644 index 00000000..4f758021 --- /dev/null +++ b/qai_hub_models/models/googlenet_quantized/conftest.py @@ -0,0 +1,24 @@ +# --------------------------------------------------------------------- +# Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. +# SPDX-License-Identifier: BSD-3-Clause +# --------------------------------------------------------------------- +# THIS FILE WAS AUTO-GENERATED. DO NOT EDIT MANUALLY. + +from unittest.mock import patch + +import pytest + +from qai_hub_models.models.googlenet_quantized import Model + + +@pytest.fixture(autouse=True) +def mock_from_pretrained(): + """ + Model.from_pretrained() can be slow. Invoke it once and cache it so all invocations + across all tests return the cached instance of the model. 
+ """ + mock = patch( + "qai_hub_models.models.googlenet_quantized.Model.from_pretrained", + return_value=Model.from_pretrained(), + ) + mock.start() diff --git a/qai_hub_models/models/googlenet_quantized/demo.py b/qai_hub_models/models/googlenet_quantized/demo.py index 5f363507..797f6fd2 100644 --- a/qai_hub_models/models/googlenet_quantized/demo.py +++ b/qai_hub_models/models/googlenet_quantized/demo.py @@ -3,11 +3,14 @@ # SPDX-License-Identifier: BSD-3-Clause # --------------------------------------------------------------------- from qai_hub_models.models._shared.imagenet_classifier.demo import imagenet_demo -from qai_hub_models.models.googlenet_quantized.model import GoogLeNetQuantizable +from qai_hub_models.models.googlenet_quantized.model import ( + MODEL_ID, + GoogLeNetQuantizable, +) def main(is_test: bool = False): - imagenet_demo(GoogLeNetQuantizable, is_test) + imagenet_demo(GoogLeNetQuantizable, MODEL_ID, is_test) if __name__ == "__main__": diff --git a/qai_hub_models/models/googlenet_quantized/export.py b/qai_hub_models/models/googlenet_quantized/export.py index 34cc8ec6..7ed01e3e 100644 --- a/qai_hub_models/models/googlenet_quantized/export.py +++ b/qai_hub_models/models/googlenet_quantized/export.py @@ -10,7 +10,7 @@ import os import warnings from pathlib import Path -from typing import List, Optional, Tuple +from typing import Any, Dict, List, Optional, Tuple, cast import qai_hub as hub @@ -120,8 +120,8 @@ def export_model( model_compile_options = model.get_hub_compile_options( target_runtime, compile_options + " --force_channel_last_input image_tensor" ) - print(f"Optimizing model {model_name} to run on-device.") - compile_job = hub.submit_compile_job( + print(f"Optimizing model {model_name} to run on-device") + submitted_compile_job = hub.submit_compile_job( model=source_model, input_specs=input_spec, device=hub.Device(device), @@ -129,21 +129,29 @@ def export_model( calibration_data=quant_calibration_data, options=model_compile_options, ) + compile_job = cast(hub.client.CompileJob, submitted_compile_job) # 3. Profile the model asset on real devices - profile_job = None + profile_job: Optional[hub.client.ProfileJob] = None if not skip_profiling: + profile_options_all = model.get_hub_profile_options( + target_runtime, profile_options + ) print(f"Profiling model {model_name} on a hosted device.") - profile_job = hub.submit_profile_job( + submitted_profile_job = hub.submit_profile_job( model=compile_job.get_target_model(), device=hub.Device(device), name=model_name, - options=profile_options, + options=profile_options_all, ) + profile_job = cast(hub.client.ProfileJob, submitted_profile_job) # 4. Run inference on-device with sample inputs - inference_job = None + inference_job: Optional[hub.client.InferenceJob] = None if not skip_inferencing: + profile_options_all = model.get_hub_profile_options( + target_runtime, profile_options + ) print( f"Running inference for {model_name} on a hosted device with example inputs." ) @@ -155,42 +163,44 @@ def export_model( hub_inputs = transpose_channel_first_to_last( "image_tensor", sample_inputs, target_runtime ) - inference_job = hub.submit_inference_job( + submitted_inference_job = hub.submit_inference_job( model=compile_job.get_target_model(), inputs=hub_inputs, device=hub.Device(device), name=model_name, - options=profile_options, + options=profile_options_all, ) + inference_job = cast(hub.client.InferenceJob, submitted_inference_job) # 5. 
Download the model asset to a local file if not skip_downloading: os.makedirs(output_path, exist_ok=True) - target_model = compile_job.get_target_model() + target_model: hub.Model = compile_job.get_target_model() # type: ignore target_model.download(str(output_path / f"{model_name}.tflite")) # 6. Summarize the results from profiling and inference if not skip_summary and not skip_profiling: - assert profile_job.wait().success - profile_data = profile_job.download_profile() + assert profile_job is not None and profile_job.wait().success + profile_data: Dict[str, Any] = profile_job.download_profile() # type: ignore print_profile_metrics_from_job(profile_job, profile_data) if not skip_summary and not skip_inferencing: torch_out = torch_inference(model, sample_inputs) - assert inference_job.wait().success - inference_result = inference_job.download_output_data() + assert inference_job is not None and inference_job.wait().success + inference_result: hub.client.DatasetEntries = inference_job.download_output_data() # type: ignore print_inference_metrics( inference_job, inference_result, torch_out, metrics="psnr,top1,top5" ) - print_on_target_demo_cmd(compile_job, Path(__file__).parent.resolve(), device) + if not skip_summary: + print_on_target_demo_cmd(compile_job, Path(__file__).parent.resolve(), device) return (compile_job, profile_job, inference_job) def main(): warnings.filterwarnings("ignore") - parser = export_parser(model_cls=Model, supports_qnn=False) + parser = export_parser(model_cls=Model) args = parser.parse_args() export_model(**vars(args)) diff --git a/qai_hub_models/models/googlenet_quantized/info.yaml b/qai_hub_models/models/googlenet_quantized/info.yaml index c7f1e45c..0d5b5538 100644 --- a/qai_hub_models/models/googlenet_quantized/info.yaml +++ b/qai_hub_models/models/googlenet_quantized/info.yaml @@ -13,12 +13,13 @@ tags: research_paper: https://arxiv.org/abs/1409.4842 research_paper_title: Going Deeper with Convolutions license: https://github.com/pytorch/vision/blob/main/LICENSE +deploy_license: https://qaihub-public-assets.s3.us-west-2.amazonaws.com/qai-hub-models/Qualcomm+AI+Hub+Proprietary+License.pdf source_repo: https://github.com/pytorch/vision/blob/main/torchvision/models/googlenet.py technical_details: Model checkpoint: Imagenet Input resolution: 224x224 Number of parameters: 6.62M - Model size: 16.0 MB + Model size: 6.55 MB applicable_scenarios: - Medical Imaging - Anomaly Detection @@ -35,6 +36,7 @@ form_factors: has_static_banner: yes has_animated_banner: yes license_type: bsd-3-clause +deploy_license_type: AI Model Hub License dataset: - imagenet-1k - imagenet-22k diff --git a/qai_hub_models/models/googlenet_quantized/model.py b/qai_hub_models/models/googlenet_quantized/model.py index 59323b6d..e100c42a 100644 --- a/qai_hub_models/models/googlenet_quantized/model.py +++ b/qai_hub_models/models/googlenet_quantized/model.py @@ -14,14 +14,17 @@ import torch from aimet_torch.cross_layer_equalization import equalize_model +from aimet_torch.model_preparer import prepare_model from aimet_torch.quantsim import QuantizationSimModel, load_encodings_to_sim from qai_hub_models.models.googlenet.model import GoogLeNet -from qai_hub_models.utils.aimet.config_loader import get_per_channel_aimet_config +from qai_hub_models.utils.aimet.config_loader import get_default_aimet_config from qai_hub_models.utils.asset_loaders import CachedWebModelAsset +from qai_hub_models.utils.base_model import SourceModelFormat, TargetRuntime +from qai_hub_models.utils.quantization_aimet import 
tie_aimet_observer_groups MODEL_ID = __name__.split(".")[-2] -MODEL_ASSET_VERSION = 1 +MODEL_ASSET_VERSION = 2 DEFAULT_ENCODINGS = "googlenet_quantized_encodings.json" @@ -37,14 +40,20 @@ def __init__( ) -> None: GoogLeNet.__init__(self, sim_model.model) AIMETQuantizableMixin.__init__( - self, sim_model, needs_onnx_direct_aimet_export=True + self, + sim_model, ) + def preferred_hub_source_model_format( + self, target_runtime: TargetRuntime + ) -> SourceModelFormat: + return SourceModelFormat.ONNX + @classmethod def from_pretrained( cls, aimet_encodings: str | None = "DEFAULT", - ) -> "GoogLeNet": + ) -> "GoogLeNetQuantizable": """ Parameters: aimet_encodings: @@ -53,17 +62,19 @@ def from_pretrained( else: Interprets as a filepath and loads the encodings stored there. """ model = GoogLeNet.from_pretrained() - input_shape = model.get_input_spec()["image_tensor"][0] + input_shape = cls.get_input_spec()["image_tensor"][0] + model = prepare_model(model) equalize_model(model, input_shape) sim = QuantizationSimModel( - model.net, + model, quant_scheme="tf_enhanced", default_param_bw=8, default_output_bw=8, - config_file=get_per_channel_aimet_config(), + config_file=get_default_aimet_config(), dummy_input=torch.rand(input_shape), ) + cls._tie_pre_concat_quantizers(sim) if aimet_encodings: if aimet_encodings == "DEFAULT": @@ -74,3 +85,36 @@ def from_pretrained( sim.model.eval() return cls(sim) + + @classmethod + def _tie_pre_concat_quantizers(cls, sim: QuantizationSimModel): + """ + This ties together the output quantizers prior to concatenations. This + prevents unnecessary re-quantization during the concatenation. + """ + blocks = [ + sim.model.net.inception3a, + sim.model.net.inception3b, + sim.model.net.inception4a, + sim.model.net.inception4b, + sim.model.net.inception4c, + sim.model.net.inception4d, + sim.model.net.inception4e, + sim.model.net.inception5a, + sim.model.net.inception5b, + ] + + idx = 3 + groups = [] + for block in blocks: + groups.append( + [ + getattr(block.branch1, f"module_relu_{idx}"), + getattr(getattr(block.branch2, "1"), f"module_relu_{idx+2}"), + getattr(getattr(block.branch3, "1"), f"module_relu_{idx+4}"), + getattr(getattr(block.branch4, "1"), f"module_relu_{idx+5}"), + ] + ) + idx += 6 + + tie_aimet_observer_groups(groups) diff --git a/qai_hub_models/models/googlenet_quantized/perf.yaml b/qai_hub_models/models/googlenet_quantized/perf.yaml index 78455c1d..e9d21985 100644 --- a/qai_hub_models/models/googlenet_quantized/perf.yaml +++ b/qai_hub_models/models/googlenet_quantized/perf.yaml @@ -17,51 +17,92 @@ aggregated: - Samsung Galaxy S23 - Samsung Galaxy S23 Ultra - Samsung Galaxy S23+ + - Samsung Galaxy S24 + - Samsung Galaxy S24 Ultra - Samsung Galaxy Tab S8 - Xiaomi 12 - Xiaomi 12 Pro supported_chipsets: - Snapdragon® 8 Gen 1 - Snapdragon® 8 Gen 2 + - Snapdragon® 8 Gen 3 - Snapdragon® 888 models: - name: GoogLeNetQuantized performance_metrics: - torchscript_onnx_tflite: - inference_time: 1026.0 - throughput: 974.6588693957115 + inference_time: 331.0 + throughput: 3021.1480362537764 estimated_peak_memory_range: - min: 20480 - max: 1771688 + min: 12288 + max: 1926544 primary_compute_unit: NPU - precision: fp16 + precision: int8 layer_info: - layers_on_npu: 183 + layers_on_npu: 87 layers_on_gpu: 0 layers_on_cpu: 0 - total_layers: 183 - job_id: j2p0m2d2g + total_layers: 87 + job_id: jnp109l5q job_status: Passed + reference_device_info: + name: Samsung Galaxy S23 + os: '13' + form_factor: Phone + os_name: Android + manufacturer: Samsung + chipset: Snapdragon® 8 Gen 2 + 
timestamp: '2024-03-14T23:54:15.624495Z' torchscript_onnx_qnn: - inference_time: 'null' - throughput: 'null' + inference_time: 365.0 + throughput: 2739.72602739726 + estimated_peak_memory_range: + min: 638976 + max: 5546832 + primary_compute_unit: NPU + precision: int8 + layer_info: + layers_on_npu: 89 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 89 + job_id: jqp4qzlgo + job_status: Passed + - torchscript_onnx_tflite: + inference_time: 248.0 + throughput: 4032.2580645161293 estimated_peak_memory_range: - min: 0 - max: 0 - primary_compute_unit: 'null' - precision: 'null' + min: 16384 + max: 32361600 + primary_compute_unit: NPU + precision: int8 layer_info: - layers_on_npu: 0 + layers_on_npu: 87 layers_on_gpu: 0 layers_on_cpu: 0 - total_layers: 0 - job_id: '' - job_status: Skipped + total_layers: 87 + job_id: jz57zqrp3 + job_status: Passed reference_device_info: - name: Samsung Galaxy S23 Ultra - os: '13' + name: Samsung Galaxy S24 + os: '14' form_factor: Phone os_name: Android manufacturer: Samsung - chipset: Snapdragon® 8 Gen 2 - timestamp: '2024-02-21T16:34:34.707459Z' + chipset: Snapdragon® 8 Gen 3 + timestamp: '2024-03-15T00:00:04.109028Z' + torchscript_onnx_qnn: + inference_time: 258.0 + throughput: 3875.968992248062 + estimated_peak_memory_range: + min: 618496 + max: 47357168 + primary_compute_unit: NPU + precision: int8 + layer_info: + layers_on_npu: 89 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 89 + job_id: j0pxvw9g7 + job_status: Passed diff --git a/qai_hub_models/models/googlenet_quantized/test.py b/qai_hub_models/models/googlenet_quantized/test.py index 65afe84d..c116898d 100644 --- a/qai_hub_models/models/googlenet_quantized/test.py +++ b/qai_hub_models/models/googlenet_quantized/test.py @@ -4,7 +4,6 @@ # --------------------------------------------------------------------- from qai_hub_models.models._shared.imagenet_classifier.test_utils import ( run_imagenet_classifier_test, - run_imagenet_classifier_trace_test, ) from qai_hub_models.models.googlenet_quantized.demo import main as demo_main from qai_hub_models.models.googlenet_quantized.model import ( @@ -25,16 +24,6 @@ def test_task(): ) -def test_trace(): - run_imagenet_classifier_trace_test( - GoogLeNetQuantizable.from_pretrained(), - diff_tol=0.01, - rtol=0.02, - atol=0.2, - is_quantized=True, - ) - - def test_demo(): # Verify demo does not crash demo_main(is_test=True) diff --git a/qai_hub_models/models/hrnet_pose/README.md b/qai_hub_models/models/hrnet_pose/README.md index 6d3e6461..5628cf13 100644 --- a/qai_hub_models/models/hrnet_pose/README.md +++ b/qai_hub_models/models/hrnet_pose/README.md @@ -10,7 +10,7 @@ This is based on the implementation of HRNetPose found export suitable to run on Qualcomm® devices. More details on model performance accross various devices, can be found [here](https://aihub.qualcomm.com/models/hrnet_pose). -[Sign up](https://aihub.qualcomm.com/) for early access to run these models on +[Sign up](https://myaccount.qualcomm.com/signup) for early access to run these models on a hosted Qualcomm® device. @@ -30,7 +30,7 @@ python -m qai_hub_models.models.hrnet_pose.demo More details on the CLI tool can be found with the `--help` option. See [demo.py](demo.py) for sample usage of the model including pre/post processing scripts. Please refer to our [general instructions on using -models](../../#qai-hub-models) for more usage instructions. +models](../../../#getting-started) for more usage instructions. 
## Export for on-device deployment @@ -46,7 +46,7 @@ script requires access to Deployment instructions for Qualcomm® AI Hub. ## License - The license for the original implementation of HRNetPose can be found [here](https://github.com/quic/aimet-model-zoo/blob/develop/LICENSE.pdf). -- The license for the compiled assets for on-device deployment can be found [here](https://qaihub-public-assets.s3.us-west-2.amazonaws.com/qai-hub-models/Qualcomm+AI+Hub+Proprietary+License.pdf). +- The license for the compiled assets for on-device deployment can be found [here]({deploy_license_url}) ## References * [Deep High-Resolution Representation Learning for Human Pose Estimation](https://arxiv.org/abs/1902.09212) diff --git a/qai_hub_models/models/hrnet_pose/conftest.py b/qai_hub_models/models/hrnet_pose/conftest.py new file mode 100644 index 00000000..1e461dec --- /dev/null +++ b/qai_hub_models/models/hrnet_pose/conftest.py @@ -0,0 +1,26 @@ +# --------------------------------------------------------------------- +# Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. +# SPDX-License-Identifier: BSD-3-Clause +# --------------------------------------------------------------------- +# THIS FILE WAS AUTO-GENERATED. DO NOT EDIT MANUALLY. + +from unittest.mock import patch + +import pytest + +from qai_hub_models.models.hrnet_pose import Model +from qai_hub_models.utils.testing import skip_clone_repo_check + + +@pytest.fixture(autouse=True) +@skip_clone_repo_check +def mock_from_pretrained(): + """ + Model.from_pretrained() can be slow. Invoke it once and cache it so all invocations + across all tests return the cached instance of the model. + """ + mock = patch( + "qai_hub_models.models.hrnet_pose.Model.from_pretrained", + return_value=Model.from_pretrained(), + ) + mock.start() diff --git a/qai_hub_models/models/hrnet_pose/demo.py b/qai_hub_models/models/hrnet_pose/demo.py index c5844da9..52f35b3e 100644 --- a/qai_hub_models/models/hrnet_pose/demo.py +++ b/qai_hub_models/models/hrnet_pose/demo.py @@ -34,10 +34,10 @@ def main(is_test: bool = False): help="image file path or URL", ) args = parser.parse_args([] if is_test else None) - validate_on_device_demo_args(args, HRNetPose.get_model_id()) + validate_on_device_demo_args(args, MODEL_ID) # Load image & model - model = demo_model_from_cli_args(HRNetPose, args) + model = demo_model_from_cli_args(HRNetPose, MODEL_ID, args) image = load_image(args.image) print("Model Loaded") diff --git a/qai_hub_models/models/hrnet_pose/export.py b/qai_hub_models/models/hrnet_pose/export.py index 2c4d9846..338f35fd 100644 --- a/qai_hub_models/models/hrnet_pose/export.py +++ b/qai_hub_models/models/hrnet_pose/export.py @@ -10,7 +10,7 @@ import os import warnings from pathlib import Path -from typing import List, Optional, Tuple +from typing import Any, Dict, List, Optional, Tuple, cast import qai_hub as hub import torch @@ -110,7 +110,7 @@ def export_model( ) # Trace the model - source_model = torch.jit.trace(model, make_torch_inputs(input_spec)) + source_model = torch.jit.trace(model.to("cpu"), make_torch_inputs(input_spec)) # 2. 
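Each regenerated export.py (including the HRNetPose one just above, see the `torch.jit.trace(model.to("cpu"), ...)` change) now moves the model to CPU before tracing, presumably so the module sits on the same device as the CPU example inputs produced for tracing. A minimal sketch of that step with a stand-in module (the real scripts trace the qai_hub_models wrapper with inputs from `make_torch_inputs`):

```python
import torch

# Stand-in module; the real scripts trace the qai_hub_models wrapper class.
model = torch.nn.Conv2d(3, 8, kernel_size=3)
example_inputs = (torch.rand(1, 3, 224, 224),)  # CPU tensors, like make_torch_inputs

# Moving the module to CPU first avoids a device mismatch with the CPU example
# inputs if the model had been placed on an accelerator earlier.
traced = torch.jit.trace(model.to("cpu"), example_inputs)
```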
Compile the model to an on-device asset model_compile_options = model.get_hub_compile_options( @@ -119,29 +119,37 @@ def export_model( + " --force_channel_last_input image_tensor" + " --force_channel_last_output output_0", ) - print(f"Optimizing model {model_name} to run on-device.") - compile_job = hub.submit_compile_job( + print(f"Optimizing model {model_name} to run on-device") + submitted_compile_job = hub.submit_compile_job( model=source_model, input_specs=input_spec, device=hub.Device(device), name=model_name, options=model_compile_options, ) + compile_job = cast(hub.client.CompileJob, submitted_compile_job) # 3. Profile the model asset on real devices - profile_job = None + profile_job: Optional[hub.client.ProfileJob] = None if not skip_profiling: + profile_options_all = model.get_hub_profile_options( + target_runtime, profile_options + ) print(f"Profiling model {model_name} on a hosted device.") - profile_job = hub.submit_profile_job( + submitted_profile_job = hub.submit_profile_job( model=compile_job.get_target_model(), device=hub.Device(device), name=model_name, - options=profile_options, + options=profile_options_all, ) + profile_job = cast(hub.client.ProfileJob, submitted_profile_job) # 4. Run inference on-device with sample inputs - inference_job = None + inference_job: Optional[hub.client.InferenceJob] = None if not skip_inferencing: + profile_options_all = model.get_hub_profile_options( + target_runtime, profile_options + ) print( f"Running inference for {model_name} on a hosted device with example inputs." ) @@ -150,37 +158,39 @@ def export_model( hub_inputs = transpose_channel_first_to_last( "image_tensor", sample_inputs, target_runtime ) - inference_job = hub.submit_inference_job( + submitted_inference_job = hub.submit_inference_job( model=compile_job.get_target_model(), inputs=hub_inputs, device=hub.Device(device), name=model_name, - options=profile_options, + options=profile_options_all, ) + inference_job = cast(hub.client.InferenceJob, submitted_inference_job) # 5. Download the model asset to a local file if not skip_downloading: os.makedirs(output_path, exist_ok=True) - target_model = compile_job.get_target_model() + target_model: hub.Model = compile_job.get_target_model() # type: ignore target_model.download(str(output_path / f"{model_name}.tflite")) # 6. 
Summarize the results from profiling and inference if not skip_summary and not skip_profiling: - assert profile_job.wait().success - profile_data = profile_job.download_profile() + assert profile_job is not None and profile_job.wait().success + profile_data: Dict[str, Any] = profile_job.download_profile() # type: ignore print_profile_metrics_from_job(profile_job, profile_data) if not skip_summary and not skip_inferencing: torch_out = torch_inference(model, sample_inputs) - assert inference_job.wait().success - inference_result = inference_job.download_output_data() + assert inference_job is not None and inference_job.wait().success + inference_result: hub.client.DatasetEntries = inference_job.download_output_data() # type: ignore # Convert outputs from channel last to channel first inference_result = transpose_channel_last_to_first( "output_0", inference_result, target_runtime ) print_inference_metrics(inference_job, inference_result, torch_out) - print_on_target_demo_cmd(compile_job, Path(__file__).parent.resolve(), device) + if not skip_summary: + print_on_target_demo_cmd(compile_job, Path(__file__).parent.resolve(), device) return (compile_job, profile_job, inference_job) diff --git a/qai_hub_models/models/hrnet_pose/info.yaml b/qai_hub_models/models/hrnet_pose/info.yaml index e70da7c1..bf4f1b15 100644 --- a/qai_hub_models/models/hrnet_pose/info.yaml +++ b/qai_hub_models/models/hrnet_pose/info.yaml @@ -11,6 +11,7 @@ research_paper: https://arxiv.org/abs/1902.09212 research_paper_title: Deep High-Resolution Representation Learning for Human Pose Estimation license: https://github.com/quic/aimet-model-zoo/blob/develop/LICENSE.pdf +deploy_license: https://qaihub-public-assets.s3.us-west-2.amazonaws.com/qai-hub-models/Qualcomm+AI+Hub+Proprietary+License.pdf source_repo: https://github.com/quic/aimet-model-zoo/tree/develop/aimet_zoo_torch/hrnet_posenet technical_details: @@ -30,4 +31,5 @@ related_models: [litehrnet, openpose] has_static_banner: yes has_animated_banner: no license_type: other +deploy_license_type: AI Model Hub License dataset: [] diff --git a/qai_hub_models/models/hrnet_pose/model.py b/qai_hub_models/models/hrnet_pose/model.py index 065ecc06..496f5e68 100644 --- a/qai_hub_models/models/hrnet_pose/model.py +++ b/qai_hub_models/models/hrnet_pose/model.py @@ -5,6 +5,7 @@ from __future__ import annotations import sys +from importlib import reload import torch import torch.nn as nn @@ -40,9 +41,20 @@ def from_pretrained(cls) -> HRNetPose: ).fetch() weights = torch.load(weights_file, map_location="cpu") with SourceAsRoot( - SOURCE_REPOSITORY, COMMIT_HASH, MODEL_ID, MODEL_ASSET_VERSION + SOURCE_REPOSITORY, + COMMIT_HASH, + MODEL_ID, + MODEL_ASSET_VERSION, + keep_sys_modules=True, ): sys.path.append("./lib") + + # This repository has a top-level "models", which is common. 
We + # explicitly reload it in case it has been loaded and cached by another + # package (or our models when executing from qai_hub_models/) + import models + + reload(models) from lib.config import cfg from models.pose_hrnet import PoseHighResolutionNet diff --git a/qai_hub_models/models/hrnet_pose/perf.yaml b/qai_hub_models/models/hrnet_pose/perf.yaml index bcbe5478..e8ac7da2 100644 --- a/qai_hub_models/models/hrnet_pose/perf.yaml +++ b/qai_hub_models/models/hrnet_pose/perf.yaml @@ -17,22 +17,25 @@ aggregated: - Samsung Galaxy S23 - Samsung Galaxy S23 Ultra - Samsung Galaxy S23+ + - Samsung Galaxy S24 + - Samsung Galaxy S24 Ultra - Samsung Galaxy Tab S8 - Xiaomi 12 - Xiaomi 12 Pro supported_chipsets: - Snapdragon® 8 Gen 1 - Snapdragon® 8 Gen 2 + - Snapdragon® 8 Gen 3 - Snapdragon® 888 models: - name: HRNetPose performance_metrics: - torchscript_onnx_tflite: - inference_time: 2574.0 - throughput: 388.5003885003885 + inference_time: 2519.0 + throughput: 396.9829297340214 estimated_peak_memory_range: - min: 16384 - max: 2027656 + min: 24576 + max: 3015464 primary_compute_unit: NPU precision: fp16 layer_info: @@ -40,14 +43,22 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 515 - job_id: jwgoln14g + job_id: jep28oxp6 job_status: Passed + reference_device_info: + name: Samsung Galaxy S23 + os: '13' + form_factor: Phone + os_name: Android + manufacturer: Samsung + chipset: Snapdragon® 8 Gen 2 + timestamp: '2024-03-14T23:57:53.421052Z' torchscript_onnx_qnn: - inference_time: 2611.0 - throughput: 382.99502106472613 + inference_time: 2608.0 + throughput: 383.4355828220859 estimated_peak_memory_range: - min: 12288 - max: 48352008 + min: 49152 + max: 58039344 primary_compute_unit: NPU precision: fp16 layer_info: @@ -55,13 +66,43 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 747 - job_id: j1pvlr175 + job_id: j1p8ojzg9 + job_status: Passed + - torchscript_onnx_tflite: + inference_time: 1878.0 + throughput: 532.4813631522896 + estimated_peak_memory_range: + min: 16384 + max: 103402912 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 515 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 515 + job_id: j2p0yo2gw job_status: Passed reference_device_info: - name: Samsung Galaxy S23 Ultra - os: '13' + name: Samsung Galaxy S24 + os: '14' form_factor: Phone os_name: Android manufacturer: Samsung - chipset: Snapdragon® 8 Gen 2 - timestamp: '2024-02-21T16:31:51.091359Z' + chipset: Snapdragon® 8 Gen 3 + timestamp: '2024-03-14T23:59:59.942614Z' + torchscript_onnx_qnn: + inference_time: 1922.0 + throughput: 520.2913631633714 + estimated_peak_memory_range: + min: 606208 + max: 178228720 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 747 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 747 + job_id: j1glnwepv + job_status: Passed diff --git a/qai_hub_models/models/hrnet_pose/requirements.txt b/qai_hub_models/models/hrnet_pose/requirements.txt index 69edf6ae..11ca0687 100644 --- a/qai_hub_models/models/hrnet_pose/requirements.txt +++ b/qai_hub_models/models/hrnet_pose/requirements.txt @@ -1,4 +1,4 @@ yacs==0.1.8 -mmpose<=1.2.0 +mmpose==1.2.0 mmcv==2.1.0 -mmdet<=3.2.0 +mmdet==3.2.0 diff --git a/qai_hub_models/models/hrnet_pose_quantized/README.md b/qai_hub_models/models/hrnet_pose_quantized/README.md index 7391c232..4697d29f 100644 --- a/qai_hub_models/models/hrnet_pose_quantized/README.md +++ b/qai_hub_models/models/hrnet_pose_quantized/README.md @@ -10,7 +10,7 @@ This is based on the implementation of HRNetPoseQuantized found 
export suitable to run on Qualcomm® devices. More details on model performance accross various devices, can be found [here](https://aihub.qualcomm.com/models/hrnet_pose_quantized). -[Sign up](https://aihub.qualcomm.com/) for early access to run these models on +[Sign up](https://myaccount.qualcomm.com/signup) for early access to run these models on a hosted Qualcomm® device. @@ -30,7 +30,7 @@ python -m qai_hub_models.models.hrnet_pose_quantized.demo More details on the CLI tool can be found with the `--help` option. See [demo.py](demo.py) for sample usage of the model including pre/post processing scripts. Please refer to our [general instructions on using -models](../../#qai-hub-models) for more usage instructions. +models](../../../#getting-started) for more usage instructions. ## Export for on-device deployment @@ -46,7 +46,7 @@ script requires access to Deployment instructions for Qualcomm® AI Hub. ## License - The license for the original implementation of HRNetPoseQuantized can be found [here](https://github.com/quic/aimet-model-zoo/blob/develop/LICENSE.pdf). -- The license for the compiled assets for on-device deployment can be found [here](https://qaihub-public-assets.s3.us-west-2.amazonaws.com/qai-hub-models/Qualcomm+AI+Hub+Proprietary+License.pdf). +- The license for the compiled assets for on-device deployment can be found [here]({deploy_license_url}) ## References * [Deep High-Resolution Representation Learning for Human Pose Estimation](https://arxiv.org/abs/1902.09212) diff --git a/qai_hub_models/models/hrnet_pose_quantized/conftest.py b/qai_hub_models/models/hrnet_pose_quantized/conftest.py new file mode 100644 index 00000000..04066f70 --- /dev/null +++ b/qai_hub_models/models/hrnet_pose_quantized/conftest.py @@ -0,0 +1,26 @@ +# --------------------------------------------------------------------- +# Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. +# SPDX-License-Identifier: BSD-3-Clause +# --------------------------------------------------------------------- +# THIS FILE WAS AUTO-GENERATED. DO NOT EDIT MANUALLY. + +from unittest.mock import patch + +import pytest + +from qai_hub_models.models.hrnet_pose_quantized import Model +from qai_hub_models.utils.testing import skip_clone_repo_check + + +@pytest.fixture(autouse=True) +@skip_clone_repo_check +def mock_from_pretrained(): + """ + Model.from_pretrained() can be slow. Invoke it once and cache it so all invocations + across all tests return the cached instance of the model. 
+ """ + mock = patch( + "qai_hub_models.models.hrnet_pose_quantized.Model.from_pretrained", + return_value=Model.from_pretrained(), + ) + mock.start() diff --git a/qai_hub_models/models/hrnet_pose_quantized/demo.py b/qai_hub_models/models/hrnet_pose_quantized/demo.py index e17f276e..a5eca7ae 100644 --- a/qai_hub_models/models/hrnet_pose_quantized/demo.py +++ b/qai_hub_models/models/hrnet_pose_quantized/demo.py @@ -35,10 +35,10 @@ def main(is_test: bool = False): ) args = parser.parse_args([] if is_test else None) - validate_on_device_demo_args(args, HRNetPoseQuantizable.get_model_id()) + validate_on_device_demo_args(args, MODEL_ID) # Load image & model - model = demo_model_from_cli_args(HRNetPoseQuantizable, args) + model = demo_model_from_cli_args(HRNetPoseQuantizable, MODEL_ID, args) image = load_image(args.image) print("Model Loaded") diff --git a/qai_hub_models/models/hrnet_pose_quantized/export.py b/qai_hub_models/models/hrnet_pose_quantized/export.py index f65c3644..57904574 100644 --- a/qai_hub_models/models/hrnet_pose_quantized/export.py +++ b/qai_hub_models/models/hrnet_pose_quantized/export.py @@ -10,7 +10,7 @@ import os import warnings from pathlib import Path -from typing import List, Optional, Tuple +from typing import Any, Dict, List, Optional, Tuple, cast import qai_hub as hub @@ -124,8 +124,8 @@ def export_model( + " --force_channel_last_input image_tensor" + " --force_channel_last_output output_0", ) - print(f"Optimizing model {model_name} to run on-device.") - compile_job = hub.submit_compile_job( + print(f"Optimizing model {model_name} to run on-device") + submitted_compile_job = hub.submit_compile_job( model=source_model, input_specs=input_spec, device=hub.Device(device), @@ -133,21 +133,29 @@ def export_model( calibration_data=quant_calibration_data, options=model_compile_options, ) + compile_job = cast(hub.client.CompileJob, submitted_compile_job) # 3. Profile the model asset on real devices - profile_job = None + profile_job: Optional[hub.client.ProfileJob] = None if not skip_profiling: + profile_options_all = model.get_hub_profile_options( + target_runtime, profile_options + ) print(f"Profiling model {model_name} on a hosted device.") - profile_job = hub.submit_profile_job( + submitted_profile_job = hub.submit_profile_job( model=compile_job.get_target_model(), device=hub.Device(device), name=model_name, - options=profile_options, + options=profile_options_all, ) + profile_job = cast(hub.client.ProfileJob, submitted_profile_job) # 4. Run inference on-device with sample inputs - inference_job = None + inference_job: Optional[hub.client.InferenceJob] = None if not skip_inferencing: + profile_options_all = model.get_hub_profile_options( + target_runtime, profile_options + ) print( f"Running inference for {model_name} on a hosted device with example inputs." ) @@ -159,37 +167,39 @@ def export_model( hub_inputs = transpose_channel_first_to_last( "image_tensor", sample_inputs, target_runtime ) - inference_job = hub.submit_inference_job( + submitted_inference_job = hub.submit_inference_job( model=compile_job.get_target_model(), inputs=hub_inputs, device=hub.Device(device), name=model_name, - options=profile_options, + options=profile_options_all, ) + inference_job = cast(hub.client.InferenceJob, submitted_inference_job) # 5. 
Download the model asset to a local file if not skip_downloading: os.makedirs(output_path, exist_ok=True) - target_model = compile_job.get_target_model() + target_model: hub.Model = compile_job.get_target_model() # type: ignore target_model.download(str(output_path / f"{model_name}.tflite")) # 6. Summarize the results from profiling and inference if not skip_summary and not skip_profiling: - assert profile_job.wait().success - profile_data = profile_job.download_profile() + assert profile_job is not None and profile_job.wait().success + profile_data: Dict[str, Any] = profile_job.download_profile() # type: ignore print_profile_metrics_from_job(profile_job, profile_data) if not skip_summary and not skip_inferencing: torch_out = torch_inference(model, sample_inputs) - assert inference_job.wait().success - inference_result = inference_job.download_output_data() + assert inference_job is not None and inference_job.wait().success + inference_result: hub.client.DatasetEntries = inference_job.download_output_data() # type: ignore # Convert outputs from channel last to channel first inference_result = transpose_channel_last_to_first( "output_0", inference_result, target_runtime ) print_inference_metrics(inference_job, inference_result, torch_out) - print_on_target_demo_cmd(compile_job, Path(__file__).parent.resolve(), device) + if not skip_summary: + print_on_target_demo_cmd(compile_job, Path(__file__).parent.resolve(), device) return (compile_job, profile_job, inference_job) diff --git a/qai_hub_models/models/hrnet_pose_quantized/info.yaml b/qai_hub_models/models/hrnet_pose_quantized/info.yaml index 0870433d..539d9ab1 100644 --- a/qai_hub_models/models/hrnet_pose_quantized/info.yaml +++ b/qai_hub_models/models/hrnet_pose_quantized/info.yaml @@ -12,6 +12,7 @@ research_paper: https://arxiv.org/abs/1902.09212 research_paper_title: Deep High-Resolution Representation Learning for Human Pose Estimation license: https://github.com/quic/aimet-model-zoo/blob/develop/LICENSE.pdf +deploy_license: https://qaihub-public-assets.s3.us-west-2.amazonaws.com/qai-hub-models/Qualcomm+AI+Hub+Proprietary+License.pdf source_repo: https://github.com/quic/aimet-model-zoo/tree/develop/aimet_zoo_torch/hrnet_posenet technical_details: @@ -31,4 +32,5 @@ related_models: [litehrnet, hrnet_pose] has_static_banner: yes has_animated_banner: no license_type: other +deploy_license_type: AI Model Hub License dataset: [] diff --git a/qai_hub_models/models/hrnet_pose_quantized/perf.yaml b/qai_hub_models/models/hrnet_pose_quantized/perf.yaml index 8e720351..db64d4b2 100644 --- a/qai_hub_models/models/hrnet_pose_quantized/perf.yaml +++ b/qai_hub_models/models/hrnet_pose_quantized/perf.yaml @@ -17,31 +17,42 @@ aggregated: - Samsung Galaxy S23 - Samsung Galaxy S23 Ultra - Samsung Galaxy S23+ + - Samsung Galaxy S24 + - Samsung Galaxy S24 Ultra - Samsung Galaxy Tab S8 - Xiaomi 12 - Xiaomi 12 Pro supported_chipsets: - Snapdragon® 8 Gen 1 - Snapdragon® 8 Gen 2 + - Snapdragon® 8 Gen 3 - Snapdragon® 888 models: - name: HRNetPoseQuantized performance_metrics: - torchscript_onnx_tflite: - inference_time: 2508.0 - throughput: 398.72408293460927 + inference_time: 2539.0 + throughput: 393.8558487593541 estimated_peak_memory_range: - min: 16384 - max: 3642928 + min: 24576 + max: 4215600 primary_compute_unit: NPU - precision: fp16 + precision: int8 layer_info: layers_on_npu: 515 layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 515 - job_id: jz57eljqp + job_id: jo5mr2wgk job_status: Passed + reference_device_info: + name: Samsung Galaxy S23 + os: '13' + 
form_factor: Phone + os_name: Android + manufacturer: Samsung + chipset: Snapdragon® 8 Gen 2 + timestamp: '2024-03-14T23:46:47.467700Z' torchscript_onnx_qnn: inference_time: 'null' throughput: 'null' @@ -57,11 +68,41 @@ models: total_layers: 0 job_id: '' job_status: Skipped + - torchscript_onnx_tflite: + inference_time: 1859.0 + throughput: 537.9236148466917 + estimated_peak_memory_range: + min: 16384 + max: 102354800 + primary_compute_unit: NPU + precision: int8 + layer_info: + layers_on_npu: 515 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 515 + job_id: jegn2yrgo + job_status: Passed reference_device_info: - name: Samsung Galaxy S23 Ultra - os: '13' + name: Samsung Galaxy S24 + os: '14' form_factor: Phone os_name: Android manufacturer: Samsung - chipset: Snapdragon® 8 Gen 2 - timestamp: '2024-02-21T16:23:32.990808Z' + chipset: Snapdragon® 8 Gen 3 + timestamp: '2024-03-14T23:46:47.467710Z' + torchscript_onnx_qnn: + inference_time: 'null' + throughput: 'null' + estimated_peak_memory_range: + min: 0 + max: 0 + primary_compute_unit: 'null' + precision: 'null' + layer_info: + layers_on_npu: 0 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 0 + job_id: '' + job_status: Skipped diff --git a/qai_hub_models/models/hrnet_pose_quantized/requirements.txt b/qai_hub_models/models/hrnet_pose_quantized/requirements.txt index 69edf6ae..11ca0687 100644 --- a/qai_hub_models/models/hrnet_pose_quantized/requirements.txt +++ b/qai_hub_models/models/hrnet_pose_quantized/requirements.txt @@ -1,4 +1,4 @@ yacs==0.1.8 -mmpose<=1.2.0 +mmpose==1.2.0 mmcv==2.1.0 -mmdet<=3.2.0 +mmdet==3.2.0 diff --git a/qai_hub_models/models/huggingface_wavlm_base_plus/README.md b/qai_hub_models/models/huggingface_wavlm_base_plus/README.md index 94a1d034..5aae1665 100644 --- a/qai_hub_models/models/huggingface_wavlm_base_plus/README.md +++ b/qai_hub_models/models/huggingface_wavlm_base_plus/README.md @@ -10,7 +10,7 @@ This is based on the implementation of HuggingFace-WavLM-Base-Plus found export suitable to run on Qualcomm® devices. More details on model performance accross various devices, can be found [here](https://aihub.qualcomm.com/models/huggingface_wavlm_base_plus). -[Sign up](https://aihub.qualcomm.com/) for early access to run these models on +[Sign up](https://myaccount.qualcomm.com/signup) for early access to run these models on a hosted Qualcomm® device. @@ -30,7 +30,7 @@ python -m qai_hub_models.models.huggingface_wavlm_base_plus.demo More details on the CLI tool can be found with the `--help` option. See [demo.py](demo.py) for sample usage of the model including pre/post processing scripts. Please refer to our [general instructions on using -models](../../#qai-hub-models) for more usage instructions. +models](../../../#getting-started) for more usage instructions. ## Export for on-device deployment @@ -46,7 +46,7 @@ script requires access to Deployment instructions for Qualcomm® AI Hub. ## License - The license for the original implementation of HuggingFace-WavLM-Base-Plus can be found [here](https://github.com/microsoft/unilm/blob/master/LICENSE). -- The license for the compiled assets for on-device deployment can be found [here](https://qaihub-public-assets.s3.us-west-2.amazonaws.com/qai-hub-models/Qualcomm+AI+Hub+Proprietary+License.pdf). 
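The auto-generated conftest.py files added throughout this patch (hrnet_pose_quantized above, and several more below) all follow the same pattern: call Model.from_pretrained() once, then patch it so every test reuses the cached instance. A self-contained sketch of the idea, using a dummy model class rather than a real qai_hub_models model:

    from unittest.mock import patch

    import pytest


    class DummyModel:
        """Stand-in for a model whose from_pretrained() is expensive."""

        loads = 0

        @classmethod
        def from_pretrained(cls) -> "DummyModel":
            cls.loads += 1          # downloads / weight loading would happen here
            return cls()


    @pytest.fixture(autouse=True)
    def mock_from_pretrained():
        cached = DummyModel.from_pretrained()
        with patch.object(DummyModel, "from_pretrained", return_value=cached):
            yield


    def test_reuses_cached_instance():
        assert DummyModel.from_pretrained() is DummyModel.from_pretrained()
        assert DummyModel.loads == 1

The generated files call mock.start() directly instead of using a context manager; the with/yield form above is only to keep the sketch self-cleaning.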
+- The license for the compiled assets for on-device deployment can be found [here]({deploy_license_url}) ## References * [WavLM: Large-Scale Self-Supervised Pre-Training for Full Stack Speech Processing](https://arxiv.org/abs/2110.13900) diff --git a/qai_hub_models/models/huggingface_wavlm_base_plus/conftest.py b/qai_hub_models/models/huggingface_wavlm_base_plus/conftest.py new file mode 100644 index 00000000..1beef838 --- /dev/null +++ b/qai_hub_models/models/huggingface_wavlm_base_plus/conftest.py @@ -0,0 +1,26 @@ +# --------------------------------------------------------------------- +# Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. +# SPDX-License-Identifier: BSD-3-Clause +# --------------------------------------------------------------------- +# THIS FILE WAS AUTO-GENERATED. DO NOT EDIT MANUALLY. + +from unittest.mock import patch + +import pytest + +from qai_hub_models.models.huggingface_wavlm_base_plus import Model +from qai_hub_models.utils.testing import skip_clone_repo_check + + +@pytest.fixture(autouse=True) +@skip_clone_repo_check +def mock_from_pretrained(): + """ + Model.from_pretrained() can be slow. Invoke it once and cache it so all invocations + across all tests return the cached instance of the model. + """ + mock = patch( + "qai_hub_models.models.huggingface_wavlm_base_plus.Model.from_pretrained", + return_value=Model.from_pretrained(), + ) + mock.start() diff --git a/qai_hub_models/models/huggingface_wavlm_base_plus/export.py b/qai_hub_models/models/huggingface_wavlm_base_plus/export.py index 66e9f8b7..5237cafc 100644 --- a/qai_hub_models/models/huggingface_wavlm_base_plus/export.py +++ b/qai_hub_models/models/huggingface_wavlm_base_plus/export.py @@ -10,7 +10,7 @@ import os import warnings from pathlib import Path -from typing import List, Optional, Tuple +from typing import Any, Dict, List, Optional, Tuple, cast import qai_hub as hub import torch @@ -107,63 +107,72 @@ def export_model( ) # Trace the model - source_model = torch.jit.trace(model, make_torch_inputs(input_spec)) + source_model = torch.jit.trace(model.to("cpu"), make_torch_inputs(input_spec)) # 2. Compile the model to an on-device asset model_compile_options = model.get_hub_compile_options( target_runtime, compile_options ) - print(f"Optimizing model {model_name} to run on-device.") - compile_job = hub.submit_compile_job( + print(f"Optimizing model {model_name} to run on-device") + submitted_compile_job = hub.submit_compile_job( model=source_model, input_specs=input_spec, device=hub.Device(device), name=model_name, options=model_compile_options, ) + compile_job = cast(hub.client.CompileJob, submitted_compile_job) # 3. Profile the model asset on real devices - profile_job = None + profile_job: Optional[hub.client.ProfileJob] = None if not skip_profiling: + profile_options_all = model.get_hub_profile_options( + target_runtime, profile_options + ) print(f"Profiling model {model_name} on a hosted device.") - profile_job = hub.submit_profile_job( + submitted_profile_job = hub.submit_profile_job( model=compile_job.get_target_model(), device=hub.Device(device), name=model_name, - options=profile_options + " --compute_unit gpu", + options=profile_options_all, ) + profile_job = cast(hub.client.ProfileJob, submitted_profile_job) # 4. 
Run inference on-device with sample inputs - inference_job = None + inference_job: Optional[hub.client.InferenceJob] = None if not skip_inferencing: + profile_options_all = model.get_hub_profile_options( + target_runtime, profile_options + ) print( f"Running inference for {model_name} on a hosted device with example inputs." ) sample_inputs = model.sample_inputs(input_spec) - inference_job = hub.submit_inference_job( + submitted_inference_job = hub.submit_inference_job( model=compile_job.get_target_model(), inputs=sample_inputs, device=hub.Device(device), name=model_name, - options=profile_options + " --compute_unit gpu", + options=profile_options_all, ) + inference_job = cast(hub.client.InferenceJob, submitted_inference_job) # 5. Download the model asset to a local file if not skip_downloading: os.makedirs(output_path, exist_ok=True) - target_model = compile_job.get_target_model() + target_model: hub.Model = compile_job.get_target_model() # type: ignore target_model.download(str(output_path / f"{model_name}.tflite")) # 6. Summarize the results from profiling and inference if not skip_summary and not skip_profiling: - assert profile_job.wait().success - profile_data = profile_job.download_profile() + assert profile_job is not None and profile_job.wait().success + profile_data: Dict[str, Any] = profile_job.download_profile() # type: ignore print_profile_metrics_from_job(profile_job, profile_data) if not skip_summary and not skip_inferencing: torch_out = torch_inference(model, sample_inputs) - assert inference_job.wait().success - inference_result = inference_job.download_output_data() + assert inference_job is not None and inference_job.wait().success + inference_result: hub.client.DatasetEntries = inference_job.download_output_data() # type: ignore print_inference_metrics(inference_job, inference_result, torch_out) return (compile_job, profile_job, inference_job) diff --git a/qai_hub_models/models/huggingface_wavlm_base_plus/info.yaml b/qai_hub_models/models/huggingface_wavlm_base_plus/info.yaml index 37a46dd9..ff447211 100644 --- a/qai_hub_models/models/huggingface_wavlm_base_plus/info.yaml +++ b/qai_hub_models/models/huggingface_wavlm_base_plus/info.yaml @@ -13,6 +13,7 @@ research_paper: https://arxiv.org/abs/2110.13900 research_paper_title: 'WavLM: Large-Scale Self-Supervised Pre-Training for Full Stack Speech Processing' license: https://github.com/microsoft/unilm/blob/master/LICENSE +deploy_license: https://qaihub-public-assets.s3.us-west-2.amazonaws.com/qai-hub-models/Qualcomm+AI+Hub+Proprietary+License.pdf source_repo: https://huggingface.co/patrickvonplaten/wavlm-libri-clean-100h-base-plus/tree/main technical_details: @@ -28,8 +29,11 @@ form_factors: - Tablet - IoT related_models: - - whisper_asr + - whisper_tiny_en + - whisper_base_en + - whisper_small_en has_static_banner: yes has_animated_banner: yes license_type: mit +deploy_license_type: AI Model Hub License dataset: [] diff --git a/qai_hub_models/models/huggingface_wavlm_base_plus/model.py b/qai_hub_models/models/huggingface_wavlm_base_plus/model.py index 6bdbbe6f..cca89a2f 100644 --- a/qai_hub_models/models/huggingface_wavlm_base_plus/model.py +++ b/qai_hub_models/models/huggingface_wavlm_base_plus/model.py @@ -11,7 +11,7 @@ from transformers import WavLMModel from transformers.models.wavlm.modeling_wavlm import WavLMGroupNormConvLayer -from qai_hub_models.utils.base_model import BaseModel +from qai_hub_models.utils.base_model import BaseModel, TargetRuntime from qai_hub_models.utils.input_spec import InputSpec 
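The export.py rewrites above (and the analogous ones for the other models in this patch) all move to the same shape: submit the compile job, cast() the returned generic Job to the concrete job type so the type checker knows which methods exist, then optionally profile and run inference on the compiled target. Stripped of the model-specific options, the flow looks roughly like this, a sketch that assumes a configured qai_hub client and reuses only the calls visible in the hunks above:

    from typing import Optional, cast

    import qai_hub as hub


    def export(source_model, input_spec, sample_inputs, device_name: str, model_name: str,
               skip_profiling: bool = False, skip_inferencing: bool = False):
        submitted = hub.submit_compile_job(
            model=source_model,
            input_specs=input_spec,
            device=hub.Device(device_name),
            name=model_name,
        )
        # submit_* returns a generic Job; cast to the concrete type for mypy.
        compile_job = cast(hub.client.CompileJob, submitted)

        profile_job: Optional[hub.client.ProfileJob] = None
        if not skip_profiling:
            profile_job = cast(
                hub.client.ProfileJob,
                hub.submit_profile_job(
                    model=compile_job.get_target_model(),
                    device=hub.Device(device_name),
                    name=model_name,
                ),
            )

        inference_job: Optional[hub.client.InferenceJob] = None
        if not skip_inferencing:
            inference_job = cast(
                hub.client.InferenceJob,
                hub.submit_inference_job(
                    model=compile_job.get_target_model(),
                    inputs=sample_inputs,
                    device=hub.Device(device_name),
                    name=model_name,
                ),
            )

        # The None checks mirror the asserts added above: they satisfy the type
        # checker now that the jobs are Optional.
        if profile_job is not None:
            assert profile_job.wait().success

        return compile_job, profile_job, inference_job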
OPENPOSE_SOURCE_REPOSITORY = ( @@ -67,8 +67,8 @@ def forward(self, input: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]: """ return self.model(input) + @staticmethod def get_input_spec( - self, batch_size: int = 1, sample_length: int = 80000, ) -> InputSpec: @@ -169,6 +169,22 @@ def forward(self, x): x = torch.concat(torch.unbind(x, axis=2), axis=-1) return x[:, :, :-1] + def get_hub_compile_options( + self, target_runtime: TargetRuntime, other_compile_options: str = "" + ) -> str: + compile_options = super().get_hub_compile_options( + target_runtime, other_compile_options + ) + return compile_options + " --compute_unit gpu" + + def get_hub_profile_options( + self, target_runtime: TargetRuntime, other_profile_options: str = "" + ) -> str: + profile_options = super().get_hub_profile_options( + target_runtime, other_profile_options + ) + return profile_options + " --compute_unit gpu" + def convert_to_wavlm_npu(model: WavLMModel): """ diff --git a/qai_hub_models/models/huggingface_wavlm_base_plus/perf.yaml b/qai_hub_models/models/huggingface_wavlm_base_plus/perf.yaml index 3391d06e..e930ee5d 100644 --- a/qai_hub_models/models/huggingface_wavlm_base_plus/perf.yaml +++ b/qai_hub_models/models/huggingface_wavlm_base_plus/perf.yaml @@ -17,31 +17,42 @@ aggregated: - Samsung Galaxy S23 - Samsung Galaxy S23 Ultra - Samsung Galaxy S23+ + - Samsung Galaxy S24 + - Samsung Galaxy S24 Ultra - Samsung Galaxy Tab S8 - Xiaomi 12 - Xiaomi 12 Pro supported_chipsets: - Snapdragon® 8 Gen 1 - Snapdragon® 8 Gen 2 + - Snapdragon® 8 Gen 3 - Snapdragon® 888 models: - name: HuggingFace-WavLM-Base-Plus performance_metrics: - torchscript_onnx_tflite: - inference_time: 463847.0 - throughput: 2.1558832977253277 + inference_time: 237767939.0 + throughput: 0.0042057815036197965 estimated_peak_memory_range: - min: 10719232 - max: 13863736 - primary_compute_unit: CPU - precision: fp32 + min: 11886592 + max: 15703120 + primary_compute_unit: NPU + precision: fp16 layer_info: - layers_on_npu: 0 - layers_on_gpu: 88 - layers_on_cpu: 748 - total_layers: 836 - job_id: jo5m06wyg + layers_on_npu: 848 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 848 + job_id: jlpe928gr job_status: Passed + reference_device_info: + name: Samsung Galaxy S23 + os: '13' + form_factor: Phone + os_name: Android + manufacturer: Samsung + chipset: Snapdragon® 8 Gen 2 + timestamp: '2024-03-15T00:13:01.265817Z' torchscript_onnx_qnn: inference_time: 'null' throughput: 'null' @@ -57,11 +68,41 @@ models: total_layers: 0 job_id: '' job_status: Skipped + - torchscript_onnx_tflite: + inference_time: 174470189.0 + throughput: 0.005731638199807303 + estimated_peak_memory_range: + min: 11321344 + max: 711668304 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 848 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 848 + job_id: jygzew4g8 + job_status: Passed reference_device_info: - name: Samsung Galaxy S23 Ultra - os: '13' + name: Samsung Galaxy S24 + os: '14' form_factor: Phone os_name: Android manufacturer: Samsung - chipset: Snapdragon® 8 Gen 2 - timestamp: '2024-02-21T16:32:02.862530Z' + chipset: Snapdragon® 8 Gen 3 + timestamp: '2024-03-15T00:13:01.265830Z' + torchscript_onnx_qnn: + inference_time: 'null' + throughput: 'null' + estimated_peak_memory_range: + min: 0 + max: 0 + primary_compute_unit: 'null' + precision: 'null' + layer_info: + layers_on_npu: 0 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 0 + job_id: '' + job_status: Skipped diff --git 
a/qai_hub_models/models/huggingface_wavlm_base_plus/requirements.txt b/qai_hub_models/models/huggingface_wavlm_base_plus/requirements.txt index 0e2962fb..657bbc9a 100644 --- a/qai_hub_models/models/huggingface_wavlm_base_plus/requirements.txt +++ b/qai_hub_models/models/huggingface_wavlm_base_plus/requirements.txt @@ -1,4 +1,4 @@ -transformers>=4.31.0 -soundfile>=0.12.1 -librosa>=0.10.1 -datasets>=2.14.5 +transformers==4.27.4 +soundfile==0.12.1 +librosa==0.10.1 +datasets==2.14.5 diff --git a/qai_hub_models/models/huggingface_wavlm_base_plus/test.py b/qai_hub_models/models/huggingface_wavlm_base_plus/test.py index f1267acd..a69d95c4 100644 --- a/qai_hub_models/models/huggingface_wavlm_base_plus/test.py +++ b/qai_hub_models/models/huggingface_wavlm_base_plus/test.py @@ -3,6 +3,7 @@ # SPDX-License-Identifier: BSD-3-Clause # --------------------------------------------------------------------- import numpy as np +import pytest import torch from datasets import load_dataset @@ -65,6 +66,7 @@ def test_task(): _test_impl(HuggingFaceWavLMBasePlusApp(HuggingFaceWavLMBasePlus.from_pretrained())) +@pytest.mark.trace @skip_clone_repo_check def test_trace(): _test_impl( diff --git a/qai_hub_models/models/inception_v3/README.md b/qai_hub_models/models/inception_v3/README.md index 63a131d6..2a8ddc93 100644 --- a/qai_hub_models/models/inception_v3/README.md +++ b/qai_hub_models/models/inception_v3/README.md @@ -10,7 +10,7 @@ This is based on the implementation of Inception-v3 found export suitable to run on Qualcomm® devices. More details on model performance accross various devices, can be found [here](https://aihub.qualcomm.com/models/inception_v3). -[Sign up](https://aihub.qualcomm.com/) for early access to run these models on +[Sign up](https://myaccount.qualcomm.com/signup) for early access to run these models on a hosted Qualcomm® device. @@ -25,7 +25,7 @@ python -m qai_hub_models.models.inception_v3.demo More details on the CLI tool can be found with the `--help` option. See [demo.py](demo.py) for sample usage of the model including pre/post processing scripts. Please refer to our [general instructions on using -models](../../#qai-hub-models) for more usage instructions. +models](../../../#getting-started) for more usage instructions. ## Export for on-device deployment @@ -41,7 +41,7 @@ script requires access to Deployment instructions for Qualcomm® AI Hub. ## License - The license for the original implementation of Inception-v3 can be found [here](https://github.com/pytorch/vision/blob/main/LICENSE). -- The license for the compiled assets for on-device deployment can be found [here](https://qaihub-public-assets.s3.us-west-2.amazonaws.com/qai-hub-models/Qualcomm+AI+Hub+Proprietary+License.pdf). +- The license for the compiled assets for on-device deployment can be found [here]({deploy_license_url}) ## References * [Rethinking the Inception Architecture for Computer Vision](http://arxiv.org/abs/1512.00567) diff --git a/qai_hub_models/models/inception_v3/conftest.py b/qai_hub_models/models/inception_v3/conftest.py new file mode 100644 index 00000000..8f63d17d --- /dev/null +++ b/qai_hub_models/models/inception_v3/conftest.py @@ -0,0 +1,24 @@ +# --------------------------------------------------------------------- +# Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. +# SPDX-License-Identifier: BSD-3-Clause +# --------------------------------------------------------------------- +# THIS FILE WAS AUTO-GENERATED. DO NOT EDIT MANUALLY. 
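The @pytest.mark.trace markers added to the WavLM test above (and to the classifier tests later in this patch) let the slow torch.jit.trace tests be selected or skipped as a group, for example with pytest -m "not trace". How the marker is registered is not part of this diff; if it is not already declared in the repo's pytest configuration, a conftest hook along the following (hypothetical) lines would register it:

    # Hypothetical registration; the repository may declare the marker in
    # pytest.ini / pyproject.toml instead.
    def pytest_configure(config):
        config.addinivalue_line(
            "markers", "trace: tests that torch.jit.trace the model (slow)"
        )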
+ +from unittest.mock import patch + +import pytest + +from qai_hub_models.models.inception_v3 import Model + + +@pytest.fixture(autouse=True) +def mock_from_pretrained(): + """ + Model.from_pretrained() can be slow. Invoke it once and cache it so all invocations + across all tests return the cached instance of the model. + """ + mock = patch( + "qai_hub_models.models.inception_v3.Model.from_pretrained", + return_value=Model.from_pretrained(), + ) + mock.start() diff --git a/qai_hub_models/models/inception_v3/demo.py b/qai_hub_models/models/inception_v3/demo.py index 487dadc1..51beedba 100644 --- a/qai_hub_models/models/inception_v3/demo.py +++ b/qai_hub_models/models/inception_v3/demo.py @@ -3,11 +3,11 @@ # SPDX-License-Identifier: BSD-3-Clause # --------------------------------------------------------------------- from qai_hub_models.models._shared.imagenet_classifier.demo import imagenet_demo -from qai_hub_models.models.inception_v3.model import InceptionNetV3 +from qai_hub_models.models.inception_v3.model import MODEL_ID, InceptionNetV3 def main(is_test: bool = False): - imagenet_demo(InceptionNetV3, is_test) + imagenet_demo(InceptionNetV3, MODEL_ID, is_test) if __name__ == "__main__": diff --git a/qai_hub_models/models/inception_v3/export.py b/qai_hub_models/models/inception_v3/export.py index f722ee3a..461d52ea 100644 --- a/qai_hub_models/models/inception_v3/export.py +++ b/qai_hub_models/models/inception_v3/export.py @@ -10,7 +10,7 @@ import os import warnings from pathlib import Path -from typing import List, Optional, Tuple +from typing import Any, Dict, List, Optional, Tuple, cast import qai_hub as hub import torch @@ -109,35 +109,43 @@ def export_model( ) # Trace the model - source_model = torch.jit.trace(model, make_torch_inputs(input_spec)) + source_model = torch.jit.trace(model.to("cpu"), make_torch_inputs(input_spec)) # 2. Compile the model to an on-device asset model_compile_options = model.get_hub_compile_options( target_runtime, compile_options + " --force_channel_last_input image_tensor" ) - print(f"Optimizing model {model_name} to run on-device.") - compile_job = hub.submit_compile_job( + print(f"Optimizing model {model_name} to run on-device") + submitted_compile_job = hub.submit_compile_job( model=source_model, input_specs=input_spec, device=hub.Device(device), name=model_name, options=model_compile_options, ) + compile_job = cast(hub.client.CompileJob, submitted_compile_job) # 3. Profile the model asset on real devices - profile_job = None + profile_job: Optional[hub.client.ProfileJob] = None if not skip_profiling: + profile_options_all = model.get_hub_profile_options( + target_runtime, profile_options + ) print(f"Profiling model {model_name} on a hosted device.") - profile_job = hub.submit_profile_job( + submitted_profile_job = hub.submit_profile_job( model=compile_job.get_target_model(), device=hub.Device(device), name=model_name, - options=profile_options, + options=profile_options_all, ) + profile_job = cast(hub.client.ProfileJob, submitted_profile_job) # 4. Run inference on-device with sample inputs - inference_job = None + inference_job: Optional[hub.client.InferenceJob] = None if not skip_inferencing: + profile_options_all = model.get_hub_profile_options( + target_runtime, profile_options + ) print( f"Running inference for {model_name} on a hosted device with example inputs." 
) @@ -146,33 +154,35 @@ def export_model( hub_inputs = transpose_channel_first_to_last( "image_tensor", sample_inputs, target_runtime ) - inference_job = hub.submit_inference_job( + submitted_inference_job = hub.submit_inference_job( model=compile_job.get_target_model(), inputs=hub_inputs, device=hub.Device(device), name=model_name, - options=profile_options, + options=profile_options_all, ) + inference_job = cast(hub.client.InferenceJob, submitted_inference_job) # 5. Download the model asset to a local file if not skip_downloading: os.makedirs(output_path, exist_ok=True) - target_model = compile_job.get_target_model() + target_model: hub.Model = compile_job.get_target_model() # type: ignore target_model.download(str(output_path / f"{model_name}.tflite")) # 6. Summarize the results from profiling and inference if not skip_summary and not skip_profiling: - assert profile_job.wait().success - profile_data = profile_job.download_profile() + assert profile_job is not None and profile_job.wait().success + profile_data: Dict[str, Any] = profile_job.download_profile() # type: ignore print_profile_metrics_from_job(profile_job, profile_data) if not skip_summary and not skip_inferencing: torch_out = torch_inference(model, sample_inputs) - assert inference_job.wait().success - inference_result = inference_job.download_output_data() + assert inference_job is not None and inference_job.wait().success + inference_result: hub.client.DatasetEntries = inference_job.download_output_data() # type: ignore print_inference_metrics(inference_job, inference_result, torch_out) - print_on_target_demo_cmd(compile_job, Path(__file__).parent.resolve(), device) + if not skip_summary: + print_on_target_demo_cmd(compile_job, Path(__file__).parent.resolve(), device) return (compile_job, profile_job, inference_job) diff --git a/qai_hub_models/models/inception_v3/info.yaml b/qai_hub_models/models/inception_v3/info.yaml index 58042435..69856a43 100644 --- a/qai_hub_models/models/inception_v3/info.yaml +++ b/qai_hub_models/models/inception_v3/info.yaml @@ -13,6 +13,7 @@ tags: research_paper: http://arxiv.org/abs/1512.00567 research_paper_title: Rethinking the Inception Architecture for Computer Vision license: https://github.com/pytorch/vision/blob/main/LICENSE +deploy_license: https://qaihub-public-assets.s3.us-west-2.amazonaws.com/qai-hub-models/Qualcomm+AI+Hub+Proprietary+License.pdf source_repo: https://github.com/pytorch/vision/blob/main/torchvision/models/inception.py technical_details: Model checkpoint: Imagenet @@ -35,6 +36,7 @@ form_factors: has_static_banner: yes has_animated_banner: yes license_type: bsd-3-clause +deploy_license_type: AI Model Hub License dataset: - imagenet-1k - imagenet-22k diff --git a/qai_hub_models/models/inception_v3/model.py b/qai_hub_models/models/inception_v3/model.py index 66e94e93..5787e9fb 100644 --- a/qai_hub_models/models/inception_v3/model.py +++ b/qai_hub_models/models/inception_v3/model.py @@ -13,5 +13,7 @@ class InceptionNetV3(ImagenetClassifier): - model_builder = tv_models.inception_v3 - DEFAULT_WEIGHTS = DEFAULT_WEIGHTS + @classmethod + def from_pretrained(cls, weights: str = DEFAULT_WEIGHTS) -> InceptionNetV3: + net = tv_models.inception_v3(weights=weights, transform_input=False) + return cls(net, transform_input=True) diff --git a/qai_hub_models/models/inception_v3/perf.yaml b/qai_hub_models/models/inception_v3/perf.yaml index cbb39be7..e4c34f61 100644 --- a/qai_hub_models/models/inception_v3/perf.yaml +++ b/qai_hub_models/models/inception_v3/perf.yaml @@ -17,51 +17,92 @@ 
aggregated: - Samsung Galaxy S23 - Samsung Galaxy S23 Ultra - Samsung Galaxy S23+ + - Samsung Galaxy S24 + - Samsung Galaxy S24 Ultra - Samsung Galaxy Tab S8 - Xiaomi 12 - Xiaomi 12 Pro supported_chipsets: - Snapdragon® 8 Gen 1 - Snapdragon® 8 Gen 2 + - Snapdragon® 8 Gen 3 - Snapdragon® 888 models: - name: Inception-v3 performance_metrics: - torchscript_onnx_tflite: - inference_time: 1944.0 - throughput: 514.40329218107 + inference_time: 1435.0 + throughput: 696.8641114982578 estimated_peak_memory_range: - min: 24576 - max: 2564456 + min: 20480 + max: 1921832 primary_compute_unit: NPU precision: fp16 layer_info: - layers_on_npu: 141 + layers_on_npu: 131 layers_on_gpu: 0 layers_on_cpu: 0 - total_layers: 141 - job_id: j1p8em8zp + total_layers: 131 + job_id: jqpyeorgy job_status: Passed + reference_device_info: + name: Samsung Galaxy S23 + os: '13' + form_factor: Phone + os_name: Android + manufacturer: Samsung + chipset: Snapdragon® 8 Gen 2 + timestamp: '2024-03-14T23:30:27.461416Z' torchscript_onnx_qnn: - inference_time: 2266.0 - throughput: 441.306266548985 + inference_time: 1475.0 + throughput: 677.9661016949152 + estimated_peak_memory_range: + min: 20480 + max: 148512392 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 220 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 220 + job_id: j1p8oezg9 + job_status: Passed + - torchscript_onnx_tflite: + inference_time: 1069.0 + throughput: 935.4536950420954 estimated_peak_memory_range: - min: 360448 - max: 133509928 + min: 12288 + max: 50854560 primary_compute_unit: NPU precision: fp16 layer_info: - layers_on_npu: 232 + layers_on_npu: 131 layers_on_gpu: 0 layers_on_cpu: 0 - total_layers: 232 - job_id: jogk2qdyg + total_layers: 131 + job_id: j2p0ym2gw job_status: Passed reference_device_info: - name: Samsung Galaxy S23 Ultra - os: '13' + name: Samsung Galaxy S24 + os: '14' form_factor: Phone os_name: Android manufacturer: Samsung - chipset: Snapdragon® 8 Gen 2 - timestamp: '2024-02-21T16:21:24.010787Z' + chipset: Snapdragon® 8 Gen 3 + timestamp: '2024-03-14T23:33:41.582505Z' + torchscript_onnx_qnn: + inference_time: 1082.0 + throughput: 924.2144177449168 + estimated_peak_memory_range: + min: 618496 + max: 68383952 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 220 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 220 + job_id: jogkz2ygd + job_status: Passed diff --git a/qai_hub_models/models/inception_v3/test.py b/qai_hub_models/models/inception_v3/test.py index 95c0dc88..8074a890 100644 --- a/qai_hub_models/models/inception_v3/test.py +++ b/qai_hub_models/models/inception_v3/test.py @@ -2,6 +2,8 @@ # Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. 
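Several of the export scripts in this patch compile with --force_channel_last_input image_tensor and then shuttle data through transpose_channel_first_to_last / transpose_channel_last_to_first. The underlying operation is just an NCHW to NHWC axis permutation; a minimal numpy illustration (the helper names above are the repo's, the snippet below is not):

    import numpy as np

    nchw = np.zeros((1, 3, 224, 224), dtype=np.float32)   # batch, channels, height, width

    nhwc = np.transpose(nchw, (0, 2, 3, 1))                # channel-first -> channel-last
    assert nhwc.shape == (1, 224, 224, 3)

    back = np.transpose(nhwc, (0, 3, 1, 2))                # channel-last -> channel-first
    assert back.shape == (1, 3, 224, 224)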
# SPDX-License-Identifier: BSD-3-Clause # --------------------------------------------------------------------- +import pytest + from qai_hub_models.models._shared.imagenet_classifier.test_utils import ( run_imagenet_classifier_test, run_imagenet_classifier_trace_test, @@ -14,6 +16,7 @@ def test_task(): run_imagenet_classifier_test(InceptionNetV3.from_pretrained(), MODEL_ID) +@pytest.mark.trace def test_trace(): run_imagenet_classifier_trace_test(InceptionNetV3.from_pretrained()) diff --git a/qai_hub_models/models/inception_v3_quantized/README.md b/qai_hub_models/models/inception_v3_quantized/README.md index 2dbca972..93531fc0 100644 --- a/qai_hub_models/models/inception_v3_quantized/README.md +++ b/qai_hub_models/models/inception_v3_quantized/README.md @@ -1,16 +1,16 @@ [![Qualcomm® AI Hub Models](https://qaihub-public-assets.s3.us-west-2.amazonaws.com/qai-hub-models/quic-logo.jpg)](../../README.md) -# [Inception-v3Quantized: Quantized Imagenet classifier and general purpose backbone](https://aihub.qualcomm.com/models/inception_v3_quantized) +# [Inception-v3-Quantized: Quantized Imagenet classifier and general purpose backbone](https://aihub.qualcomm.com/models/inception_v3_quantized) InceptionNetV3 is a machine learning model that can classify images from the Imagenet dataset. It can also be used as a backbone in building more complex models for specific use cases. This model is post-training quantized to int8 using samples from [Google's open images dataset](https://storage.googleapis.com/openimages/web/index.html). -This is based on the implementation of Inception-v3Quantized found +This is based on the implementation of Inception-v3-Quantized found [here](https://github.com/pytorch/vision/blob/main/torchvision/models/inception.py). This repository contains scripts for optimized on-device export suitable to run on Qualcomm® devices. More details on model performance accross various devices, can be found [here](https://aihub.qualcomm.com/models/inception_v3_quantized). -[Sign up](https://aihub.qualcomm.com/) for early access to run these models on +[Sign up](https://myaccount.qualcomm.com/signup) for early access to run these models on a hosted Qualcomm® device. @@ -25,7 +25,7 @@ python -m qai_hub_models.models.inception_v3_quantized.demo More details on the CLI tool can be found with the `--help` option. See [demo.py](demo.py) for sample usage of the model including pre/post processing scripts. Please refer to our [general instructions on using -models](../../#qai-hub-models) for more usage instructions. +models](../../../#getting-started) for more usage instructions. ## Export for on-device deployment @@ -39,9 +39,9 @@ Additional options are documented with the `--help` option. Note that the above script requires access to Deployment instructions for Qualcomm® AI Hub. ## License -- The license for the original implementation of Inception-v3Quantized can be found +- The license for the original implementation of Inception-v3-Quantized can be found [here](https://github.com/pytorch/vision/blob/main/LICENSE). -- The license for the compiled assets for on-device deployment can be found [here](https://qaihub-public-assets.s3.us-west-2.amazonaws.com/qai-hub-models/Qualcomm+AI+Hub+Proprietary+License.pdf). 
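The README license sections in this patch now point at [here]({deploy_license_url}) while each info.yaml gains a matching deploy_license URL and deploy_license_type. The tooling that fills that placeholder is not part of this diff; purely as an illustration of the intent, substitution from info.yaml could look like the following hypothetical helper:

    # Hypothetical illustration only: the repo's actual README tooling is not shown here.
    from pathlib import Path

    import yaml


    def fill_deploy_license(model_dir: Path) -> str:
        info = yaml.safe_load((model_dir / "info.yaml").read_text())
        readme = (model_dir / "README.md").read_text()
        return readme.replace("{deploy_license_url}", info["deploy_license"])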
+- The license for the compiled assets for on-device deployment can be found [here]({deploy_license_url}) ## References * [Rethinking the Inception Architecture for Computer Vision](http://arxiv.org/abs/1512.00567) diff --git a/qai_hub_models/models/inception_v3_quantized/conftest.py b/qai_hub_models/models/inception_v3_quantized/conftest.py new file mode 100644 index 00000000..5ab488cb --- /dev/null +++ b/qai_hub_models/models/inception_v3_quantized/conftest.py @@ -0,0 +1,24 @@ +# --------------------------------------------------------------------- +# Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. +# SPDX-License-Identifier: BSD-3-Clause +# --------------------------------------------------------------------- +# THIS FILE WAS AUTO-GENERATED. DO NOT EDIT MANUALLY. + +from unittest.mock import patch + +import pytest + +from qai_hub_models.models.inception_v3_quantized import Model + + +@pytest.fixture(autouse=True) +def mock_from_pretrained(): + """ + Model.from_pretrained() can be slow. Invoke it once and cache it so all invocations + across all tests return the cached instance of the model. + """ + mock = patch( + "qai_hub_models.models.inception_v3_quantized.Model.from_pretrained", + return_value=Model.from_pretrained(), + ) + mock.start() diff --git a/qai_hub_models/models/inception_v3_quantized/demo.py b/qai_hub_models/models/inception_v3_quantized/demo.py index e17bb3be..5443e0ef 100644 --- a/qai_hub_models/models/inception_v3_quantized/demo.py +++ b/qai_hub_models/models/inception_v3_quantized/demo.py @@ -3,11 +3,14 @@ # SPDX-License-Identifier: BSD-3-Clause # --------------------------------------------------------------------- from qai_hub_models.models._shared.imagenet_classifier.demo import imagenet_demo -from qai_hub_models.models.inception_v3_quantized.model import InceptionNetV3Quantizable +from qai_hub_models.models.inception_v3_quantized.model import ( + MODEL_ID, + InceptionNetV3Quantizable, +) def main(is_test: bool = False): - imagenet_demo(InceptionNetV3Quantizable, is_test) + imagenet_demo(InceptionNetV3Quantizable, MODEL_ID, is_test) if __name__ == "__main__": diff --git a/qai_hub_models/models/inception_v3_quantized/export.py b/qai_hub_models/models/inception_v3_quantized/export.py index 816369ff..a5b83180 100644 --- a/qai_hub_models/models/inception_v3_quantized/export.py +++ b/qai_hub_models/models/inception_v3_quantized/export.py @@ -10,7 +10,7 @@ import os import warnings from pathlib import Path -from typing import List, Optional, Tuple +from typing import Any, Dict, List, Optional, Tuple, cast import qai_hub as hub @@ -89,7 +89,7 @@ def export_model( if not can_access_qualcomm_ai_hub(): return export_without_hub_access( "inception_v3_quantized", - "Inception-v3Quantized", + "Inception-v3-Quantized", device, skip_profiling, skip_inferencing, @@ -120,8 +120,8 @@ def export_model( model_compile_options = model.get_hub_compile_options( target_runtime, compile_options + " --force_channel_last_input image_tensor" ) - print(f"Optimizing model {model_name} to run on-device.") - compile_job = hub.submit_compile_job( + print(f"Optimizing model {model_name} to run on-device") + submitted_compile_job = hub.submit_compile_job( model=source_model, input_specs=input_spec, device=hub.Device(device), @@ -129,21 +129,29 @@ def export_model( calibration_data=quant_calibration_data, options=model_compile_options, ) + compile_job = cast(hub.client.CompileJob, submitted_compile_job) # 3. 
Profile the model asset on real devices - profile_job = None + profile_job: Optional[hub.client.ProfileJob] = None if not skip_profiling: + profile_options_all = model.get_hub_profile_options( + target_runtime, profile_options + ) print(f"Profiling model {model_name} on a hosted device.") - profile_job = hub.submit_profile_job( + submitted_profile_job = hub.submit_profile_job( model=compile_job.get_target_model(), device=hub.Device(device), name=model_name, - options=profile_options, + options=profile_options_all, ) + profile_job = cast(hub.client.ProfileJob, submitted_profile_job) # 4. Run inference on-device with sample inputs - inference_job = None + inference_job: Optional[hub.client.InferenceJob] = None if not skip_inferencing: + profile_options_all = model.get_hub_profile_options( + target_runtime, profile_options + ) print( f"Running inference for {model_name} on a hosted device with example inputs." ) @@ -155,35 +163,37 @@ def export_model( hub_inputs = transpose_channel_first_to_last( "image_tensor", sample_inputs, target_runtime ) - inference_job = hub.submit_inference_job( + submitted_inference_job = hub.submit_inference_job( model=compile_job.get_target_model(), inputs=hub_inputs, device=hub.Device(device), name=model_name, - options=profile_options, + options=profile_options_all, ) + inference_job = cast(hub.client.InferenceJob, submitted_inference_job) # 5. Download the model asset to a local file if not skip_downloading: os.makedirs(output_path, exist_ok=True) - target_model = compile_job.get_target_model() + target_model: hub.Model = compile_job.get_target_model() # type: ignore target_model.download(str(output_path / f"{model_name}.tflite")) # 6. Summarize the results from profiling and inference if not skip_summary and not skip_profiling: - assert profile_job.wait().success - profile_data = profile_job.download_profile() + assert profile_job is not None and profile_job.wait().success + profile_data: Dict[str, Any] = profile_job.download_profile() # type: ignore print_profile_metrics_from_job(profile_job, profile_data) if not skip_summary and not skip_inferencing: torch_out = torch_inference(model, sample_inputs) - assert inference_job.wait().success - inference_result = inference_job.download_output_data() + assert inference_job is not None and inference_job.wait().success + inference_result: hub.client.DatasetEntries = inference_job.download_output_data() # type: ignore print_inference_metrics( inference_job, inference_result, torch_out, metrics="psnr,top1,top5" ) - print_on_target_demo_cmd(compile_job, Path(__file__).parent.resolve(), device) + if not skip_summary: + print_on_target_demo_cmd(compile_job, Path(__file__).parent.resolve(), device) return (compile_job, profile_job, inference_job) diff --git a/qai_hub_models/models/inception_v3_quantized/info.yaml b/qai_hub_models/models/inception_v3_quantized/info.yaml index e7da6c74..06f3bf87 100644 --- a/qai_hub_models/models/inception_v3_quantized/info.yaml +++ b/qai_hub_models/models/inception_v3_quantized/info.yaml @@ -1,4 +1,4 @@ -name: Inception-v3Quantized +name: Inception-v3-Quantized # id must match with the model dir name in qai_hub_models id: inception_v3_quantized status: public @@ -15,12 +15,13 @@ tags: research_paper: http://arxiv.org/abs/1512.00567 research_paper_title: Rethinking the Inception Architecture for Computer Vision license: https://github.com/pytorch/vision/blob/main/LICENSE +deploy_license: 
https://qaihub-public-assets.s3.us-west-2.amazonaws.com/qai-hub-models/Qualcomm+AI+Hub+Proprietary+License.pdf source_repo: https://github.com/pytorch/vision/blob/main/torchvision/models/inception.py technical_details: Model checkpoint: Imagenet Input resolution: 224x224 - Number of parameters: 23.8M - Model size: 65.6 MB + Number of parameters: 23.9M + Model size: 23.3 MB applicable_scenarios: - Medical Imaging - Anomaly Detection @@ -38,6 +39,7 @@ form_factors: has_static_banner: yes has_animated_banner: no license_type: bsd-3-clause +deploy_license_type: AI Model Hub License dataset: - imagenet-1k - imagenet-22k diff --git a/qai_hub_models/models/inception_v3_quantized/model.py b/qai_hub_models/models/inception_v3_quantized/model.py index cb320154..39dd2658 100644 --- a/qai_hub_models/models/inception_v3_quantized/model.py +++ b/qai_hub_models/models/inception_v3_quantized/model.py @@ -8,26 +8,29 @@ # This verifies aimet is installed, and this must be included first. from qai_hub_models.utils.quantization_aimet import ( AIMETQuantizableMixin, - HubCompileOptionsInt8Mixin, ) # isort: on import torch from aimet_torch.cross_layer_equalization import equalize_model +from aimet_torch.model_preparer import prepare_model from aimet_torch.quantsim import QuantizationSimModel, load_encodings_to_sim from qai_hub_models.models.inception_v3.model import InceptionNetV3 -from qai_hub_models.utils.aimet.config_loader import get_per_channel_aimet_config +from qai_hub_models.utils.aimet.config_loader import get_default_aimet_config from qai_hub_models.utils.asset_loaders import CachedWebModelAsset +from qai_hub_models.utils.base_model import SourceModelFormat, TargetRuntime +from qai_hub_models.utils.quantization_aimet import tie_aimet_observer_groups MODEL_ID = __name__.split(".")[-2] -MODEL_ASSET_VERSION = 3 +MODEL_ASSET_VERSION = 4 DEFAULT_ENCODINGS = "inception_v3_quantized_encodings.json" class InceptionNetV3Quantizable( - HubCompileOptionsInt8Mixin, AIMETQuantizableMixin, InceptionNetV3 + AIMETQuantizableMixin, + InceptionNetV3, ): """InceptionNetV3 with post train quantization support. @@ -40,14 +43,20 @@ def __init__( ) -> None: InceptionNetV3.__init__(self, sim_model.model) AIMETQuantizableMixin.__init__( - self, sim_model, needs_onnx_direct_aimet_export=True + self, + sim_model, ) + def preferred_hub_source_model_format( + self, target_runtime: TargetRuntime + ) -> SourceModelFormat: + return SourceModelFormat.ONNX + @classmethod def from_pretrained( cls, aimet_encodings: str | None = "DEFAULT", - ) -> "InceptionNetV3": + ) -> "InceptionNetV3Quantizable": """ Parameters: aimet_encodings: @@ -56,17 +65,19 @@ def from_pretrained( else: Interprets as a filepath and loads the encodings stored there. """ model = InceptionNetV3.from_pretrained() - input_shape = model.get_input_spec()["image_tensor"][0] + input_shape = cls.get_input_spec()["image_tensor"][0] + model = prepare_model(model) equalize_model(model, input_shape) sim = QuantizationSimModel( - model.net, + model, quant_scheme="tf_enhanced", default_param_bw=8, default_output_bw=8, - config_file=get_per_channel_aimet_config(), + config_file=get_default_aimet_config(), dummy_input=torch.rand(input_shape), ) + cls._tie_pre_concat_quantizers(sim) if aimet_encodings: if aimet_encodings == "DEFAULT": @@ -77,3 +88,119 @@ def from_pretrained( sim.model.eval() return cls(sim) + + @classmethod + def _tie_pre_concat_quantizers(cls, sim: QuantizationSimModel): + """ + This ties together the output quantizers prior to concatenations. 
This + prevents unnecessary re-quantization during the concatenation, and even + avoids fatal TFLite converter errors. + """ + + n = sim.model.net + groups = [ + [ + n.maxpool2, + n.Mixed_5b.module_avg_pool2d, + ], + [ + n.Mixed_5b.branch1x1.module_relu_5, + n.Mixed_5b.branch5x5_2.module_relu_7, + n.Mixed_5b.branch3x3dbl_3.module_relu_10, + n.Mixed_5b.branch_pool.module_relu_11, + n.Mixed_5b.module_cat, + n.Mixed_5c.module_avg_pool2d_1, + ], + [ + n.Mixed_5c.branch1x1.module_relu_12, + n.Mixed_5c.branch5x5_2.module_relu_14, + n.Mixed_5c.branch3x3dbl_3.module_relu_17, + n.Mixed_5c.branch_pool.module_relu_18, + n.Mixed_5c.module_cat_1, + n.Mixed_5d.module_avg_pool2d_2, + ], + [ + n.Mixed_5d.branch1x1.module_relu_19, + n.Mixed_5d.branch5x5_2.module_relu_21, + n.Mixed_5d.branch3x3dbl_3.module_relu_24, + n.Mixed_5d.branch_pool.module_relu_25, + n.Mixed_5d.module_cat_2, + # This group has a branch with only a max pool, + # this requires the two concat groups to merge + n.Mixed_6a.branch3x3.module_relu_26, + n.Mixed_6a.branch3x3dbl_3.module_relu_29, + n.Mixed_6a.module_max_pool2d, + n.Mixed_6a.module_cat_3, + n.Mixed_6b.module_avg_pool2d_3, + ], + [ + n.Mixed_6b.branch1x1.module_relu_30, + n.Mixed_6b.branch7x7_3.module_relu_33, + n.Mixed_6b.branch7x7dbl_5.module_relu_38, + n.Mixed_6b.branch_pool.module_relu_39, + n.Mixed_6b.module_cat_4, + n.Mixed_6c.module_avg_pool2d_4, + ], + [ + n.Mixed_6c.branch1x1.module_relu_40, + n.Mixed_6c.branch7x7_3.module_relu_43, + n.Mixed_6c.branch7x7dbl_5.module_relu_48, + n.Mixed_6c.branch_pool.module_relu_49, + n.Mixed_6c.module_cat_5, + n.Mixed_6d.module_avg_pool2d_5, + ], + [ + n.Mixed_6d.branch1x1.module_relu_50, + n.Mixed_6d.branch7x7_3.module_relu_53, + n.Mixed_6d.branch7x7dbl_5.module_relu_58, + n.Mixed_6d.branch_pool.module_relu_59, + n.Mixed_6d.module_cat_6, + n.Mixed_6e.module_avg_pool2d_6, + ], + [ + n.Mixed_6e.branch1x1.module_relu_60, + n.Mixed_6e.branch7x7_3.module_relu_63, + n.Mixed_6e.branch7x7dbl_5.module_relu_68, + n.Mixed_6e.branch_pool.module_relu_69, + n.Mixed_6e.module_cat_7, + # This group has a branch with only a max pool, + # this requires the two concat groups to merge + n.Mixed_7a.branch3x3_2.module_relu_71, + n.Mixed_7a.branch7x7x3_4.module_relu_75, + n.Mixed_7a.module_max_pool2d_1, + n.Mixed_7a.module_cat_8, + n.Mixed_7b.module_avg_pool2d_7, + ], + [ + n.Mixed_7b.branch1x1.module_relu_76, + n.Mixed_7b.branch3x3_2a.module_relu_78, + n.Mixed_7b.branch3x3_2b.module_relu_79, + n.Mixed_7b.branch3x3dbl_3a.module_relu_82, + n.Mixed_7b.branch3x3dbl_3b.module_relu_83, + n.Mixed_7b.branch_pool.module_relu_84, + n.Mixed_7b.module_cat_9, + n.Mixed_7b.module_cat_10, + n.Mixed_7b.module_cat_11, + n.Mixed_7c.module_avg_pool2d_8, + ], + [ + n.Mixed_7c.branch1x1.module_relu_85, + n.Mixed_7c.branch3x3_2a.module_relu_87, + n.Mixed_7c.branch3x3_2b.module_relu_88, + n.Mixed_7c.branch3x3dbl_3a.module_relu_91, + n.Mixed_7c.branch3x3dbl_3b.module_relu_92, + n.Mixed_7c.branch_pool.module_relu_93, + n.Mixed_7c.module_cat_12, + n.Mixed_7c.module_cat_13, + n.Mixed_7c.module_cat_14, + ], + ] + tie_aimet_observer_groups(groups) + + def get_hub_compile_options( + self, target_runtime: TargetRuntime, other_compile_options: str = "" + ) -> str: + compile_options = super().get_hub_compile_options( + target_runtime, other_compile_options + ) + return compile_options + " --quantize_full_type int8 --quantize_io" diff --git a/qai_hub_models/models/inception_v3_quantized/perf.yaml b/qai_hub_models/models/inception_v3_quantized/perf.yaml index ec2936b4..b06c4a03 100644 --- 
a/qai_hub_models/models/inception_v3_quantized/perf.yaml +++ b/qai_hub_models/models/inception_v3_quantized/perf.yaml @@ -17,31 +17,42 @@ aggregated: - Samsung Galaxy S23 - Samsung Galaxy S23 Ultra - Samsung Galaxy S23+ + - Samsung Galaxy S24 + - Samsung Galaxy S24 Ultra - Samsung Galaxy Tab S8 - Xiaomi 12 - Xiaomi 12 Pro supported_chipsets: - Snapdragon® 8 Gen 1 - Snapdragon® 8 Gen 2 + - Snapdragon® 8 Gen 3 - Snapdragon® 888 models: -- name: Inception-v3Quantized +- name: Inception-v3-Quantized performance_metrics: - torchscript_onnx_tflite: - inference_time: 755.0 - throughput: 1324.5033112582782 + inference_time: 615.0 + throughput: 1626.0162601626016 estimated_peak_memory_range: - min: 12288 - max: 1825256 + min: 36864 + max: 2508048 primary_compute_unit: NPU - precision: fp16 + precision: int8 layer_info: - layers_on_npu: 150 + layers_on_npu: 144 layers_on_gpu: 0 layers_on_cpu: 0 - total_layers: 150 - job_id: jygzlmjo5 + total_layers: 144 + job_id: jz57zj9p3 job_status: Passed + reference_device_info: + name: Samsung Galaxy S23 + os: '13' + form_factor: Phone + os_name: Android + manufacturer: Samsung + chipset: Snapdragon® 8 Gen 2 + timestamp: '2024-03-14T23:41:04.203939Z' torchscript_onnx_qnn: inference_time: 'null' throughput: 'null' @@ -57,11 +68,41 @@ models: total_layers: 0 job_id: '' job_status: Skipped + - torchscript_onnx_tflite: + inference_time: 487.0 + throughput: 2053.388090349076 + estimated_peak_memory_range: + min: 0 + max: 63551712 + primary_compute_unit: NPU + precision: int8 + layer_info: + layers_on_npu: 144 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 144 + job_id: j0pxv7lg7 + job_status: Passed reference_device_info: - name: Samsung Galaxy S23 Ultra - os: '13' + name: Samsung Galaxy S24 + os: '14' form_factor: Phone os_name: Android manufacturer: Samsung - chipset: Snapdragon® 8 Gen 2 - timestamp: '2024-02-23T04:42:57.781769Z' + chipset: Snapdragon® 8 Gen 3 + timestamp: '2024-03-14T23:41:04.203947Z' + torchscript_onnx_qnn: + inference_time: 'null' + throughput: 'null' + estimated_peak_memory_range: + min: 0 + max: 0 + primary_compute_unit: 'null' + precision: 'null' + layer_info: + layers_on_npu: 0 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 0 + job_id: '' + job_status: Skipped diff --git a/qai_hub_models/models/inception_v3_quantized/test.py b/qai_hub_models/models/inception_v3_quantized/test.py index ce2cba52..486a8cee 100644 --- a/qai_hub_models/models/inception_v3_quantized/test.py +++ b/qai_hub_models/models/inception_v3_quantized/test.py @@ -4,7 +4,6 @@ # --------------------------------------------------------------------- from qai_hub_models.models._shared.imagenet_classifier.test_utils import ( run_imagenet_classifier_test, - run_imagenet_classifier_trace_test, ) from qai_hub_models.models.inception_v3_quantized.demo import main as demo_main from qai_hub_models.models.inception_v3_quantized.model import ( @@ -25,16 +24,6 @@ def test_task(): ) -def test_trace(): - run_imagenet_classifier_trace_test( - InceptionNetV3Quantizable.from_pretrained(), - diff_tol=0.01, - rtol=0.02, - atol=0.2, - is_quantized=True, - ) - - def test_demo(): # Verify demo does not crash demo_main(is_test=True) diff --git a/qai_hub_models/models/lama_dilated/README.md b/qai_hub_models/models/lama_dilated/README.md index 082fb509..6f36b28f 100644 --- a/qai_hub_models/models/lama_dilated/README.md +++ b/qai_hub_models/models/lama_dilated/README.md @@ -10,7 +10,7 @@ This is based on the implementation of LaMa-Dilated found export suitable to run on Qualcomm® devices. 
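Consolidating the new inception_v3_quantized from_pretrained() above: the float model is prepared and cross-layer-equalized, wrapped in a QuantizationSimModel with the default AIMET config, the pre-concat quantizer groups are tied, and the published encodings are loaded. A condensed sketch of that flow, using only the calls that appear in the hunk (the config path and encodings file are placeholders):

    import torch
    from aimet_torch.cross_layer_equalization import equalize_model
    from aimet_torch.model_preparer import prepare_model
    from aimet_torch.quantsim import QuantizationSimModel, load_encodings_to_sim


    def build_quant_sim(float_model: torch.nn.Module, input_shape, aimet_config: str,
                        encodings_path: str) -> QuantizationSimModel:
        model = prepare_model(float_model)            # rewrite functionals into modules
        equalize_model(model, input_shape)            # cross-layer equalization
        sim = QuantizationSimModel(
            model,
            quant_scheme="tf_enhanced",
            default_param_bw=8,                       # int8 weights
            default_output_bw=8,                      # int8 activations
            config_file=aimet_config,
            dummy_input=torch.rand(input_shape),
        )
        # In the model above, _tie_pre_concat_quantizers(sim) runs at this point so
        # that the inputs of each concat share one set of encodings.
        load_encodings_to_sim(sim, encodings_path)
        sim.model.eval()
        return sim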
More details on model performance accross various devices, can be found [here](https://aihub.qualcomm.com/models/lama_dilated). -[Sign up](https://aihub.qualcomm.com/) for early access to run these models on +[Sign up](https://myaccount.qualcomm.com/signup) for early access to run these models on a hosted Qualcomm® device. @@ -30,7 +30,7 @@ python -m qai_hub_models.models.lama_dilated.demo More details on the CLI tool can be found with the `--help` option. See [demo.py](demo.py) for sample usage of the model including pre/post processing scripts. Please refer to our [general instructions on using -models](../../#qai-hub-models) for more usage instructions. +models](../../../#getting-started) for more usage instructions. ## Export for on-device deployment @@ -46,7 +46,7 @@ script requires access to Deployment instructions for Qualcomm® AI Hub. ## License - The license for the original implementation of LaMa-Dilated can be found [here](https://github.com/advimman/lama/blob/main/LICENSE). -- The license for the compiled assets for on-device deployment can be found [here](https://qaihub-public-assets.s3.us-west-2.amazonaws.com/qai-hub-models/Qualcomm+AI+Hub+Proprietary+License.pdf). +- The license for the compiled assets for on-device deployment can be found [here]({deploy_license_url}) ## References * [Resolution-robust Large Mask Inpainting with Fourier Convolutions](https://arxiv.org/abs/2109.07161) diff --git a/qai_hub_models/models/lama_dilated/conftest.py b/qai_hub_models/models/lama_dilated/conftest.py new file mode 100644 index 00000000..f61093bc --- /dev/null +++ b/qai_hub_models/models/lama_dilated/conftest.py @@ -0,0 +1,26 @@ +# --------------------------------------------------------------------- +# Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. +# SPDX-License-Identifier: BSD-3-Clause +# --------------------------------------------------------------------- +# THIS FILE WAS AUTO-GENERATED. DO NOT EDIT MANUALLY. + +from unittest.mock import patch + +import pytest + +from qai_hub_models.models.lama_dilated import Model +from qai_hub_models.utils.testing import skip_clone_repo_check + + +@pytest.fixture(autouse=True) +@skip_clone_repo_check +def mock_from_pretrained(): + """ + Model.from_pretrained() can be slow. Invoke it once and cache it so all invocations + across all tests return the cached instance of the model. 
+ """ + mock = patch( + "qai_hub_models.models.lama_dilated.Model.from_pretrained", + return_value=Model.from_pretrained(), + ) + mock.start() diff --git a/qai_hub_models/models/lama_dilated/demo.py b/qai_hub_models/models/lama_dilated/demo.py index adabbf3f..f9778ff1 100644 --- a/qai_hub_models/models/lama_dilated/demo.py +++ b/qai_hub_models/models/lama_dilated/demo.py @@ -19,7 +19,7 @@ def main(is_test: bool = False): - repaint_demo(LamaDilated, IMAGE_ADDRESS, MASK_ADDRESS, is_test) + repaint_demo(LamaDilated, MODEL_ID, IMAGE_ADDRESS, MASK_ADDRESS, is_test) if __name__ == "__main__": diff --git a/qai_hub_models/models/lama_dilated/export.py b/qai_hub_models/models/lama_dilated/export.py index 387f5e9c..02ae013e 100644 --- a/qai_hub_models/models/lama_dilated/export.py +++ b/qai_hub_models/models/lama_dilated/export.py @@ -10,7 +10,7 @@ import os import warnings from pathlib import Path -from typing import List, Optional, Tuple +from typing import Any, Dict, List, Optional, Tuple, cast import qai_hub as hub import torch @@ -110,7 +110,7 @@ def export_model( ) # Trace the model - source_model = torch.jit.trace(model, make_torch_inputs(input_spec)) + source_model = torch.jit.trace(model.to("cpu"), make_torch_inputs(input_spec)) # 2. Compile the model to an on-device asset model_compile_options = model.get_hub_compile_options( @@ -119,29 +119,37 @@ def export_model( + " --force_channel_last_input image,mask" + " --force_channel_last_output output_0", ) - print(f"Optimizing model {model_name} to run on-device.") - compile_job = hub.submit_compile_job( + print(f"Optimizing model {model_name} to run on-device") + submitted_compile_job = hub.submit_compile_job( model=source_model, input_specs=input_spec, device=hub.Device(device), name=model_name, options=model_compile_options, ) + compile_job = cast(hub.client.CompileJob, submitted_compile_job) # 3. Profile the model asset on real devices - profile_job = None + profile_job: Optional[hub.client.ProfileJob] = None if not skip_profiling: + profile_options_all = model.get_hub_profile_options( + target_runtime, profile_options + ) print(f"Profiling model {model_name} on a hosted device.") - profile_job = hub.submit_profile_job( + submitted_profile_job = hub.submit_profile_job( model=compile_job.get_target_model(), device=hub.Device(device), name=model_name, - options=profile_options, + options=profile_options_all, ) + profile_job = cast(hub.client.ProfileJob, submitted_profile_job) # 4. Run inference on-device with sample inputs - inference_job = None + inference_job: Optional[hub.client.InferenceJob] = None if not skip_inferencing: + profile_options_all = model.get_hub_profile_options( + target_runtime, profile_options + ) print( f"Running inference for {model_name} on a hosted device with example inputs." ) @@ -150,37 +158,39 @@ def export_model( hub_inputs = transpose_channel_first_to_last( "image,mask", sample_inputs, target_runtime ) - inference_job = hub.submit_inference_job( + submitted_inference_job = hub.submit_inference_job( model=compile_job.get_target_model(), inputs=hub_inputs, device=hub.Device(device), name=model_name, - options=profile_options, + options=profile_options_all, ) + inference_job = cast(hub.client.InferenceJob, submitted_inference_job) # 5. 
Download the model asset to a local file if not skip_downloading: os.makedirs(output_path, exist_ok=True) - target_model = compile_job.get_target_model() + target_model: hub.Model = compile_job.get_target_model() # type: ignore target_model.download(str(output_path / f"{model_name}.tflite")) # 6. Summarize the results from profiling and inference if not skip_summary and not skip_profiling: - assert profile_job.wait().success - profile_data = profile_job.download_profile() + assert profile_job is not None and profile_job.wait().success + profile_data: Dict[str, Any] = profile_job.download_profile() # type: ignore print_profile_metrics_from_job(profile_job, profile_data) if not skip_summary and not skip_inferencing: torch_out = torch_inference(model, sample_inputs) - assert inference_job.wait().success - inference_result = inference_job.download_output_data() + assert inference_job is not None and inference_job.wait().success + inference_result: hub.client.DatasetEntries = inference_job.download_output_data() # type: ignore # Convert outputs from channel last to channel first inference_result = transpose_channel_last_to_first( "output_0", inference_result, target_runtime ) print_inference_metrics(inference_job, inference_result, torch_out) - print_on_target_demo_cmd(compile_job, Path(__file__).parent.resolve(), device) + if not skip_summary: + print_on_target_demo_cmd(compile_job, Path(__file__).parent.resolve(), device) return (compile_job, profile_job, inference_job) diff --git a/qai_hub_models/models/lama_dilated/info.yaml b/qai_hub_models/models/lama_dilated/info.yaml index 54e07b78..d4ffe566 100644 --- a/qai_hub_models/models/lama_dilated/info.yaml +++ b/qai_hub_models/models/lama_dilated/info.yaml @@ -12,6 +12,7 @@ tags: research_paper: https://arxiv.org/abs/2109.07161 research_paper_title: Resolution-robust Large Mask Inpainting with Fourier Convolutions license: https://github.com/advimman/lama/blob/main/LICENSE +deploy_license: https://qaihub-public-assets.s3.us-west-2.amazonaws.com/qai-hub-models/Qualcomm+AI+Hub+Proprietary+License.pdf source_repo: https://github.com/advimman/lama technical_details: Model checkpoint: Dilated CelebAHQ @@ -28,4 +29,5 @@ form_factors: has_static_banner: yes has_animated_banner: yes license_type: apache-2.0 +deploy_license_type: AI Model Hub License dataset: [] diff --git a/qai_hub_models/models/lama_dilated/model.py b/qai_hub_models/models/lama_dilated/model.py index fff9bf32..a0cbc86d 100644 --- a/qai_hub_models/models/lama_dilated/model.py +++ b/qai_hub_models/models/lama_dilated/model.py @@ -4,6 +4,9 @@ # --------------------------------------------------------------------- from __future__ import annotations +import logging +from importlib import reload + import torch from omegaconf import OmegaConf @@ -12,6 +15,7 @@ SourceAsRoot, load_json, load_torch, + set_log_level, ) from qai_hub_models.utils.base_model import BaseModel from qai_hub_models.utils.input_spec import InputSpec @@ -71,8 +75,8 @@ def forward(self, image: torch.Tensor, mask: torch.Tensor) -> torch.Tensor: inpainted = mask * predicted_image + (1 - mask) * image return inpainted + @staticmethod def get_input_spec( - self, batch_size: int = 1, num_channels: int = 3, height: int = 512, @@ -110,6 +114,13 @@ def _load_lama_dilated_source_model_from_weights(weights_name: str) -> torch.nn. with SourceAsRoot( LAMA_SOURCE_REPOSITORY, LAMA_SOURCE_REPO_COMMIT, MODEL_ID, MODEL_ASSET_VERSION ): + # This repository has a top-level "models", which is common. 
We + # explicitly reload it in case it has been loaded and cached by another + # package (or our models when executing from qai_hub_models/) + import models + + reload(models) + # Import module from saicinpainting.training.trainers.default import ( DefaultInpaintingTrainingModule, @@ -122,7 +133,8 @@ def _load_lama_dilated_source_model_from_weights(weights_name: str) -> torch.nn. kwargs.pop("kind") kwargs["use_ddp"] = True state = load_torch(weights_url) - lama_dilated_model = DefaultInpaintingTrainingModule(config, **kwargs) + with set_log_level(logging.WARN): + lama_dilated_model = DefaultInpaintingTrainingModule(config, **kwargs) lama_dilated_model.load_state_dict(state["state_dict"], strict=False) lama_dilated_model.on_load_checkpoint(state) lama_dilated_model.freeze() diff --git a/qai_hub_models/models/lama_dilated/perf.yaml b/qai_hub_models/models/lama_dilated/perf.yaml index f951db3e..af8a440d 100644 --- a/qai_hub_models/models/lama_dilated/perf.yaml +++ b/qai_hub_models/models/lama_dilated/perf.yaml @@ -17,22 +17,25 @@ aggregated: - Samsung Galaxy S23 - Samsung Galaxy S23 Ultra - Samsung Galaxy S23+ + - Samsung Galaxy S24 + - Samsung Galaxy S24 Ultra - Samsung Galaxy Tab S8 - Xiaomi 12 - Xiaomi 12 Pro supported_chipsets: - Snapdragon® 8 Gen 1 - Snapdragon® 8 Gen 2 + - Snapdragon® 8 Gen 3 - Snapdragon® 888 models: - name: LaMa-Dilated performance_metrics: - torchscript_onnx_tflite: - inference_time: 88596.0 - throughput: 11.287191295318072 + inference_time: 88628.0 + throughput: 11.283115945299453 estimated_peak_memory_range: - min: 3289088 - max: 139215624 + min: 3252224 + max: 140731056 primary_compute_unit: NPU precision: fp16 layer_info: @@ -40,14 +43,22 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 346 - job_id: jqpyojvr5 + job_id: j2p0yv0gw job_status: Passed + reference_device_info: + name: Samsung Galaxy S23 + os: '13' + form_factor: Phone + os_name: Android + manufacturer: Samsung + chipset: Snapdragon® 8 Gen 2 + timestamp: '2024-03-14T23:53:06.506039Z' torchscript_onnx_qnn: - inference_time: 84076.0 - throughput: 11.894000666064038 + inference_time: 84164.0 + throughput: 11.881564564421843 estimated_peak_memory_range: - min: 4313088 - max: 34733320 + min: 4321280 + max: 33964280 primary_compute_unit: NPU precision: fp16 layer_info: @@ -55,13 +66,43 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 333 - job_id: j2p0m2e2g + job_id: j1gln12pv + job_status: Passed + - torchscript_onnx_tflite: + inference_time: 62025.0 + throughput: 16.12253123740427 + estimated_peak_memory_range: + min: 225280 + max: 245293744 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 346 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 346 + job_id: jogkz9vgd job_status: Passed reference_device_info: - name: Samsung Galaxy S23 Ultra - os: '13' + name: Samsung Galaxy S24 + os: '14' form_factor: Phone os_name: Android manufacturer: Samsung - chipset: Snapdragon® 8 Gen 2 - timestamp: '2024-02-21T16:27:42.653097Z' + chipset: Snapdragon® 8 Gen 3 + timestamp: '2024-03-14T23:58:58.279247Z' + torchscript_onnx_qnn: + inference_time: 58950.0 + throughput: 16.963528413910094 + estimated_peak_memory_range: + min: 78331904 + max: 243926976 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 333 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 333 + job_id: jw566dn5o + job_status: Passed diff --git a/qai_hub_models/models/lama_dilated/requirements.txt b/qai_hub_models/models/lama_dilated/requirements.txt index a21b654f..52c75e34 
100644 --- a/qai_hub_models/models/lama_dilated/requirements.txt +++ b/qai_hub_models/models/lama_dilated/requirements.txt @@ -1,10 +1,9 @@ -matplotlib -pandas +matplotlib==3.7.4 albumentations==0.5.2 pytorch-lightning==1.6.0 -webdataset +webdataset==0.2.86 easydict==1.10 kornia==0.5.0 hydra-core==1.3.0 -omegaconf==2.3.0 -scikit-learn==1.3.0 +scikit-learn==1.1.3 +tensorboard==2.13.0 diff --git a/qai_hub_models/models/lama_dilated/test.py b/qai_hub_models/models/lama_dilated/test.py index 47b13144..2dbd27d3 100644 --- a/qai_hub_models/models/lama_dilated/test.py +++ b/qai_hub_models/models/lama_dilated/test.py @@ -3,6 +3,7 @@ # SPDX-License-Identifier: BSD-3-Clause # --------------------------------------------------------------------- import numpy as np +import pytest from qai_hub_models.models._shared.repaint.app import RepaintMaskApp from qai_hub_models.models.lama_dilated.demo import IMAGE_ADDRESS, MASK_ADDRESS @@ -38,6 +39,7 @@ def test_task(): ) +@pytest.mark.trace @skip_clone_repo_check def test_trace(): net = LamaDilated.from_pretrained() diff --git a/qai_hub_models/models/litehrnet/README.md b/qai_hub_models/models/litehrnet/README.md index 0c628f39..7e786799 100644 --- a/qai_hub_models/models/litehrnet/README.md +++ b/qai_hub_models/models/litehrnet/README.md @@ -10,7 +10,7 @@ This is based on the implementation of LiteHRNet found export suitable to run on Qualcomm® devices. More details on model performance accross various devices, can be found [here](https://aihub.qualcomm.com/models/litehrnet). -[Sign up](https://aihub.qualcomm.com/) for early access to run these models on +[Sign up](https://myaccount.qualcomm.com/signup) for early access to run these models on a hosted Qualcomm® device. @@ -30,7 +30,7 @@ python -m qai_hub_models.models.litehrnet.demo More details on the CLI tool can be found with the `--help` option. See [demo.py](demo.py) for sample usage of the model including pre/post processing scripts. Please refer to our [general instructions on using -models](../../#qai-hub-models) for more usage instructions. +models](../../../#getting-started) for more usage instructions. ## Export for on-device deployment @@ -46,7 +46,7 @@ script requires access to Deployment instructions for Qualcomm® AI Hub. ## License - The license for the original implementation of LiteHRNet can be found [here](https://github.com/HRNet/Lite-HRNet/blob/hrnet/LICENSE). -- The license for the compiled assets for on-device deployment can be found [here](https://qaihub-public-assets.s3.us-west-2.amazonaws.com/qai-hub-models/Qualcomm+AI+Hub+Proprietary+License.pdf). +- The license for the compiled assets for on-device deployment can be found [here]({deploy_license_url}) ## References * [Lite-HRNet: A Lightweight High-Resolution Network](https://arxiv.org/abs/2104.06403) diff --git a/qai_hub_models/models/litehrnet/conftest.py b/qai_hub_models/models/litehrnet/conftest.py new file mode 100644 index 00000000..2ee063ae --- /dev/null +++ b/qai_hub_models/models/litehrnet/conftest.py @@ -0,0 +1,24 @@ +# --------------------------------------------------------------------- +# Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. +# SPDX-License-Identifier: BSD-3-Clause +# --------------------------------------------------------------------- +# THIS FILE WAS AUTO-GENERATED. DO NOT EDIT MANUALLY. 
+ +from unittest.mock import patch + +import pytest + +from qai_hub_models.models.litehrnet import Model + + +@pytest.fixture(autouse=True) +def mock_from_pretrained(): + """ + Model.from_pretrained() can be slow. Invoke it once and cache it so all invocations + across all tests return the cached instance of the model. + """ + mock = patch( + "qai_hub_models.models.litehrnet.Model.from_pretrained", + return_value=Model.from_pretrained(), + ) + mock.start() diff --git a/qai_hub_models/models/litehrnet/demo.py b/qai_hub_models/models/litehrnet/demo.py index 8ad0ad05..d9e9e20d 100644 --- a/qai_hub_models/models/litehrnet/demo.py +++ b/qai_hub_models/models/litehrnet/demo.py @@ -39,8 +39,8 @@ def main(is_test: bool = False): ) args = parser.parse_args([] if is_test else None) litehrnet_model = model_from_cli_args(LiteHRNet, args) - hub_model = demo_model_from_cli_args(LiteHRNet, args) - validate_on_device_demo_args(args, LiteHRNet.get_model_id()) + hub_model = demo_model_from_cli_args(LiteHRNet, MODEL_ID, args) + validate_on_device_demo_args(args, MODEL_ID) # Load image & model image = load_image(args.image) diff --git a/qai_hub_models/models/litehrnet/export.py b/qai_hub_models/models/litehrnet/export.py index b163c9d0..13f51b82 100644 --- a/qai_hub_models/models/litehrnet/export.py +++ b/qai_hub_models/models/litehrnet/export.py @@ -10,7 +10,7 @@ import os import warnings from pathlib import Path -from typing import List, Optional, Tuple +from typing import Any, Dict, List, Optional, Tuple, cast import qai_hub as hub import torch @@ -108,66 +108,76 @@ def export_model( ) # Trace the model - source_model = torch.jit.trace(model, make_torch_inputs(input_spec)) + source_model = torch.jit.trace(model.to("cpu"), make_torch_inputs(input_spec)) # 2. Compile the model to an on-device asset model_compile_options = model.get_hub_compile_options( target_runtime, compile_options ) - print(f"Optimizing model {model_name} to run on-device.") - compile_job = hub.submit_compile_job( + print(f"Optimizing model {model_name} to run on-device") + submitted_compile_job = hub.submit_compile_job( model=source_model, input_specs=input_spec, device=hub.Device(device), name=model_name, options=model_compile_options, ) + compile_job = cast(hub.client.CompileJob, submitted_compile_job) # 3. Profile the model asset on real devices - profile_job = None + profile_job: Optional[hub.client.ProfileJob] = None if not skip_profiling: + profile_options_all = model.get_hub_profile_options( + target_runtime, profile_options + ) print(f"Profiling model {model_name} on a hosted device.") - profile_job = hub.submit_profile_job( + submitted_profile_job = hub.submit_profile_job( model=compile_job.get_target_model(), device=hub.Device(device), name=model_name, - options=profile_options, + options=profile_options_all, ) + profile_job = cast(hub.client.ProfileJob, submitted_profile_job) # 4. Run inference on-device with sample inputs - inference_job = None + inference_job: Optional[hub.client.InferenceJob] = None if not skip_inferencing: + profile_options_all = model.get_hub_profile_options( + target_runtime, profile_options + ) print( f"Running inference for {model_name} on a hosted device with example inputs." 
) sample_inputs = model.sample_inputs(input_spec) - inference_job = hub.submit_inference_job( + submitted_inference_job = hub.submit_inference_job( model=compile_job.get_target_model(), inputs=sample_inputs, device=hub.Device(device), name=model_name, - options=profile_options, + options=profile_options_all, ) + inference_job = cast(hub.client.InferenceJob, submitted_inference_job) # 5. Download the model asset to a local file if not skip_downloading: os.makedirs(output_path, exist_ok=True) - target_model = compile_job.get_target_model() + target_model: hub.Model = compile_job.get_target_model() # type: ignore target_model.download(str(output_path / f"{model_name}.tflite")) # 6. Summarize the results from profiling and inference if not skip_summary and not skip_profiling: - assert profile_job.wait().success - profile_data = profile_job.download_profile() + assert profile_job is not None and profile_job.wait().success + profile_data: Dict[str, Any] = profile_job.download_profile() # type: ignore print_profile_metrics_from_job(profile_job, profile_data) if not skip_summary and not skip_inferencing: torch_out = torch_inference(model, sample_inputs) - assert inference_job.wait().success - inference_result = inference_job.download_output_data() + assert inference_job is not None and inference_job.wait().success + inference_result: hub.client.DatasetEntries = inference_job.download_output_data() # type: ignore print_inference_metrics(inference_job, inference_result, torch_out) - print_on_target_demo_cmd(compile_job, Path(__file__).parent.resolve(), device) + if not skip_summary: + print_on_target_demo_cmd(compile_job, Path(__file__).parent.resolve(), device) return (compile_job, profile_job, inference_job) diff --git a/qai_hub_models/models/litehrnet/info.yaml b/qai_hub_models/models/litehrnet/info.yaml index 9e62d950..31da35bd 100644 --- a/qai_hub_models/models/litehrnet/info.yaml +++ b/qai_hub_models/models/litehrnet/info.yaml @@ -11,6 +11,7 @@ tags: [] research_paper: https://arxiv.org/abs/2104.06403 research_paper_title: 'Lite-HRNet: A Lightweight High-Resolution Network' license: https://github.com/HRNet/Lite-HRNet/blob/hrnet/LICENSE +deploy_license: https://qaihub-public-assets.s3.us-west-2.amazonaws.com/qai-hub-models/Qualcomm+AI+Hub+Proprietary+License.pdf source_repo: https://github.com/HRNet/Lite-HRNet technical_details: Input resolution: 256x192 @@ -28,4 +29,5 @@ related_models: [openpose, hrnet_pose] has_static_banner: yes has_animated_banner: no license_type: apache-2.0 +deploy_license_type: AI Model Hub License dataset: [] diff --git a/qai_hub_models/models/litehrnet/model.py b/qai_hub_models/models/litehrnet/model.py index e2c31db2..aa2baf06 100644 --- a/qai_hub_models/models/litehrnet/model.py +++ b/qai_hub_models/models/litehrnet/model.py @@ -80,8 +80,8 @@ def forward( return keypoints, scores, heatmaps + @staticmethod def get_input_spec( - self, num_channels: int = 3, height: int = 256, width: int = 192, diff --git a/qai_hub_models/models/litehrnet/perf.yaml b/qai_hub_models/models/litehrnet/perf.yaml index 2b7cc7c9..d1a63bd8 100644 --- a/qai_hub_models/models/litehrnet/perf.yaml +++ b/qai_hub_models/models/litehrnet/perf.yaml @@ -17,22 +17,25 @@ aggregated: - Samsung Galaxy S23 - Samsung Galaxy S23 Ultra - Samsung Galaxy S23+ + - Samsung Galaxy S24 + - Samsung Galaxy S24 Ultra - Samsung Galaxy Tab S8 - Xiaomi 12 - Xiaomi 12 Pro supported_chipsets: - Snapdragon® 8 Gen 1 - Snapdragon® 8 Gen 2 + - Snapdragon® 8 Gen 3 - Snapdragon® 888 models: - name: LiteHRNet 
performance_metrics: - torchscript_onnx_tflite: - inference_time: 15966.0 - throughput: 62.63309532757109 + inference_time: 15866.0 + throughput: 63.02785831337451 estimated_peak_memory_range: - min: 6561792 - max: 13503904 + min: 6811648 + max: 10391632 primary_compute_unit: NPU precision: fp16 layer_info: @@ -40,8 +43,16 @@ models: layers_on_gpu: 0 layers_on_cpu: 10 total_layers: 1236 - job_id: jqp4ydwqp + job_id: jn5q83o57 job_status: Passed + reference_device_info: + name: Samsung Galaxy S23 + os: '13' + form_factor: Phone + os_name: Android + manufacturer: Samsung + chipset: Snapdragon® 8 Gen 2 + timestamp: '2024-03-14T23:41:50.802497Z' torchscript_onnx_qnn: inference_time: 'null' throughput: 'null' @@ -57,11 +68,41 @@ models: total_layers: 0 job_id: '' job_status: Skipped + - torchscript_onnx_tflite: + inference_time: 10704.0 + throughput: 93.42301943198804 + estimated_peak_memory_range: + min: 20480 + max: 71674208 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 1226 + layers_on_gpu: 0 + layers_on_cpu: 10 + total_layers: 1236 + job_id: j1glnkmpv + job_status: Passed reference_device_info: - name: Samsung Galaxy S23 Ultra - os: '13' + name: Samsung Galaxy S24 + os: '14' form_factor: Phone os_name: Android manufacturer: Samsung - chipset: Snapdragon® 8 Gen 2 - timestamp: '2024-02-21T16:25:31.033915Z' + chipset: Snapdragon® 8 Gen 3 + timestamp: '2024-03-14T23:41:50.802505Z' + torchscript_onnx_qnn: + inference_time: 'null' + throughput: 'null' + estimated_peak_memory_range: + min: 0 + max: 0 + primary_compute_unit: 'null' + precision: 'null' + layer_info: + layers_on_npu: 0 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 0 + job_id: '' + job_status: Skipped diff --git a/qai_hub_models/models/litehrnet/requirements.txt b/qai_hub_models/models/litehrnet/requirements.txt index 048feb99..64d03ea1 100644 --- a/qai_hub_models/models/litehrnet/requirements.txt +++ b/qai_hub_models/models/litehrnet/requirements.txt @@ -1,3 +1,3 @@ -mmpose<=1.2.0 +mmpose==1.2.0 mmcv==2.1.0 -mmdet<=3.2.0 +mmdet==3.2.0 diff --git a/qai_hub_models/models/litehrnet/test.py b/qai_hub_models/models/litehrnet/test.py index 3fe634ef..6cc5b848 100644 --- a/qai_hub_models/models/litehrnet/test.py +++ b/qai_hub_models/models/litehrnet/test.py @@ -3,6 +3,7 @@ # SPDX-License-Identifier: BSD-3-Clause # --------------------------------------------------------------------- import numpy as np +import pytest from qai_hub_models.models.litehrnet.app import LiteHRNetApp from qai_hub_models.models.litehrnet.demo import IMAGE_ADDRESS @@ -52,6 +53,7 @@ def test_task(): _test_impl(LiteHRNetApp(litehrnet, litehrnet.inferencer)) +@pytest.mark.trace def test_trace(): litehrnet = LiteHRNet.from_pretrained() _test_impl(LiteHRNetApp(litehrnet.convert_to_torchscript(), litehrnet.inferencer)) diff --git a/qai_hub_models/models/llama_v2_7b_chat_quantized/README.md b/qai_hub_models/models/llama_v2_7b_chat_quantized/README.md index a5546eed..c6955075 100644 --- a/qai_hub_models/models/llama_v2_7b_chat_quantized/README.md +++ b/qai_hub_models/models/llama_v2_7b_chat_quantized/README.md @@ -10,7 +10,7 @@ This is based on the implementation of Llama-v2-7B-Chat found export suitable to run on Qualcomm® devices. More details on model performance accross various devices, can be found [here](https://aihub.qualcomm.com/models/llama_v2_7b_chat_quantized). 
-[Sign up](https://aihub.qualcomm.com/) for early access to run these models on +[Sign up](https://myaccount.qualcomm.com/signup) for early access to run these models on a hosted Qualcomm® device. @@ -18,7 +18,7 @@ a hosted Qualcomm® device. ## License - The license for the original implementation of Llama-v2-7B-Chat can be found [here](https://github.com/facebookresearch/llama/blob/main/LICENSE). -- The license for the compiled assets for on-device deployment can be found [here](https://qaihub-public-assets.s3.us-west-2.amazonaws.com/qai-hub-models/Qualcomm+AI+Hub+Proprietary+License.pdf). +- The license for the compiled assets for on-device deployment can be found [here]({deploy_license_url}) ## References * [LLaMA: Open and Efficient Foundation Language Models](https://arxiv.org/abs/2302.13971) diff --git a/qai_hub_models/models/llama_v2_7b_chat_quantized/info.yaml b/qai_hub_models/models/llama_v2_7b_chat_quantized/info.yaml index 584f2f8b..8a9faf98 100644 --- a/qai_hub_models/models/llama_v2_7b_chat_quantized/info.yaml +++ b/qai_hub_models/models/llama_v2_7b_chat_quantized/info.yaml @@ -18,6 +18,7 @@ tags: research_paper: https://arxiv.org/abs/2302.13971 research_paper_title: "LLaMA: Open and Efficient Foundation Language Models" license: https://github.com/facebookresearch/llama/blob/main/LICENSE +deploy_license: https://github.com/facebookresearch/llama/blob/main/LICENSE source_repo: https://huggingface.co/meta-llama/Llama-2-7b-chat-hf technical_details: Number of parameters: 7B @@ -43,4 +44,5 @@ form_factors: has_static_banner: yes has_animated_banner: yes license_type: llama2 +deploy_license_type: llama2 dataset: [] diff --git a/qai_hub_models/models/mediapipe_face/README.md b/qai_hub_models/models/mediapipe_face/README.md index 2e701b97..7ff2956c 100644 --- a/qai_hub_models/models/mediapipe_face/README.md +++ b/qai_hub_models/models/mediapipe_face/README.md @@ -10,17 +10,12 @@ This is based on the implementation of MediaPipe-Face-Detection found export suitable to run on Qualcomm® devices. More details on model performance accross various devices, can be found [here](https://aihub.qualcomm.com/models/mediapipe_face). -[Sign up](https://aihub.qualcomm.com/) for early access to run these models on +[Sign up](https://myaccount.qualcomm.com/signup) for early access to run these models on a hosted Qualcomm® device. ## Example & Usage -Install the package via pip: -```bash -pip install "qai_hub_models[mediapipe_face]" -``` - Once installed, run the following simple CLI demo: @@ -30,7 +25,7 @@ python -m qai_hub_models.models.mediapipe_face.demo More details on the CLI tool can be found with the `--help` option. See [demo.py](demo.py) for sample usage of the model including pre/post processing scripts. Please refer to our [general instructions on using -models](../../#qai-hub-models) for more usage instructions. +models](../../../#getting-started) for more usage instructions. ## Export for on-device deployment @@ -46,7 +41,7 @@ script requires access to Deployment instructions for Qualcomm® AI Hub. ## License - The license for the original implementation of MediaPipe-Face-Detection can be found [here](https://github.com/zmurez/MediaPipePyTorch/blob/master/LICENSE). -- The license for the compiled assets for on-device deployment can be found [here](https://qaihub-public-assets.s3.us-west-2.amazonaws.com/qai-hub-models/Qualcomm+AI+Hub+Proprietary+License.pdf). 
+- The license for the compiled assets for on-device deployment can be found [here]({deploy_license_url}) ## References * [BlazeFace: Sub-millisecond Neural Face Detection on Mobile GPUs](https://arxiv.org/abs/1907.05047) diff --git a/qai_hub_models/models/mediapipe_face/conftest.py b/qai_hub_models/models/mediapipe_face/conftest.py new file mode 100644 index 00000000..80a9de2b --- /dev/null +++ b/qai_hub_models/models/mediapipe_face/conftest.py @@ -0,0 +1,26 @@ +# --------------------------------------------------------------------- +# Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. +# SPDX-License-Identifier: BSD-3-Clause +# --------------------------------------------------------------------- +# THIS FILE WAS AUTO-GENERATED. DO NOT EDIT MANUALLY. + +from unittest.mock import patch + +import pytest + +from qai_hub_models.models.mediapipe_face import Model +from qai_hub_models.utils.testing import skip_clone_repo_check + + +@pytest.fixture(autouse=True) +@skip_clone_repo_check +def mock_from_pretrained(): + """ + Model.from_pretrained() can be slow. Invoke it once and cache it so all invocations + across all tests return the cached instance of the model. + """ + mock = patch( + "qai_hub_models.models.mediapipe_face.Model.from_pretrained", + return_value=Model.from_pretrained(), + ) + mock.start() diff --git a/qai_hub_models/models/mediapipe_face/export.py b/qai_hub_models/models/mediapipe_face/export.py index c7f32d04..fe627869 100644 --- a/qai_hub_models/models/mediapipe_face/export.py +++ b/qai_hub_models/models/mediapipe_face/export.py @@ -10,14 +10,14 @@ import os import warnings from pathlib import Path -from typing import List, Mapping, Optional, Tuple +from typing import Any, Dict, List, Mapping, Optional, Tuple, cast import qai_hub as hub import torch from qai_hub_models.models.mediapipe_face import Model from qai_hub_models.utils.args import export_parser, get_model_kwargs -from qai_hub_models.utils.base_model import TargetRuntime +from qai_hub_models.utils.base_model import BaseModel, TargetRuntime from qai_hub_models.utils.compare import torch_inference from qai_hub_models.utils.input_spec import make_torch_inputs from qai_hub_models.utils.printing import ( @@ -90,9 +90,9 @@ def export_model( output_path = Path(output_dir or Path.cwd() / "build" / model_name) component_arg = components components = components or ALL_COMPONENTS - for component in components: - if component not in ALL_COMPONENTS: - raise ValueError(f"Invalid component {component}.") + for component_name in components: + if component_name not in ALL_COMPONENTS: + raise ValueError(f"Invalid component {component_name}.") if not can_access_qualcomm_ai_hub(): return export_without_hub_access( "mediapipe_face", @@ -111,68 +111,85 @@ def export_model( # 1. 
Initialize PyTorch model model = Model.from_pretrained(**get_model_kwargs(Model, additional_model_kwargs)) - components_dict = {} + components_dict: Dict[str, BaseModel] = {} if "MediaPipeFaceDetector" in components: - components_dict["MediaPipeFaceDetector"] = model.face_detector + components_dict["MediaPipeFaceDetector"] = model.face_detector # type: ignore if "MediaPipeFaceLandmarkDetector" in components: - components_dict["MediaPipeFaceLandmarkDetector"] = model.face_landmark_detector + components_dict["MediaPipeFaceLandmarkDetector"] = model.face_landmark_detector # type: ignore - compile_jobs = {} + compile_jobs: Dict[str, hub.client.CompileJob] = {} for component_name, component in components_dict.items(): # Trace the model input_spec = component.get_input_spec() - source_model = torch.jit.trace(component, make_torch_inputs(input_spec)) + source_model = torch.jit.trace( + component.to("cpu"), make_torch_inputs(input_spec) + ) # 2. Compile the models to an on-device asset model_compile_options = component.get_hub_compile_options( target_runtime, compile_options + " --force_channel_last_input image" ) - print(f"Optimizing model {component_name} to run on-device.") - compile_jobs[component_name] = hub.submit_compile_job( + print(f"Optimizing model {component_name} to run on-device") + submitted_compile_job = hub.submit_compile_job( model=source_model, input_specs=input_spec, device=hub.Device(device), - name=f"{component_name}", + name=f"{model_name}_{component_name}", options=model_compile_options, ) + compile_jobs[component_name] = cast( + hub.client.CompileJob, submitted_compile_job + ) # 3. Profile the model assets on real devices - profile_jobs = {} + profile_jobs: Dict[str, hub.client.ProfileJob] = {} if not skip_profiling: for component_name in components: + profile_options_all = components_dict[ + component_name + ].get_hub_profile_options(target_runtime, profile_options) print(f"Profiling model {component_name} on a hosted device.") - profile_jobs[component_name] = hub.submit_profile_job( + submitted_profile_job = hub.submit_profile_job( model=compile_jobs[component_name].get_target_model(), device=hub.Device(device), - name=f"{component_name}", - options=profile_options, + name=f"{model_name}_{component_name}", + options=profile_options_all, + ) + profile_jobs[component_name] = cast( + hub.client.ProfileJob, submitted_profile_job ) # 4. Run inference on-device with sample inputs - inference_jobs = {} + inference_jobs: Dict[str, hub.client.InferenceJob] = {} if not skip_inferencing: for component_name in components: print( f"Running inference for {component_name} on a hosted device with example inputs." ) + profile_options_all = components_dict[ + component_name + ].get_hub_profile_options(target_runtime, profile_options) sample_inputs = components_dict[component_name].sample_inputs() # Convert inputs from channel first to channel last hub_inputs = transpose_channel_first_to_last( "image", sample_inputs, target_runtime ) - inference_jobs[component_name] = hub.submit_inference_job( + submitted_inference_job = hub.submit_inference_job( model=compile_jobs[component_name].get_target_model(), inputs=hub_inputs, device=hub.Device(device), - name=f"{component_name}", - options=profile_options, + name=f"{model_name}_{component_name}", + options=profile_options_all, + ) + inference_jobs[component_name] = cast( + hub.client.InferenceJob, submitted_inference_job ) # 5. 
Download the model assets to a local file if not skip_downloading: os.makedirs(output_path, exist_ok=True) for component_name, compile_job in compile_jobs.items(): - target_model = compile_job.get_target_model() + target_model: hub.Model = compile_job.get_target_model() # type: ignore target_model.download( str(output_path / f"{model_name}_{component_name}.tflite") ) @@ -181,8 +198,8 @@ def export_model( if not skip_summary and not skip_profiling: for component_name in components: profile_job = profile_jobs[component_name] - assert profile_job.wait().success - profile_data = profile_job.download_profile() + assert profile_job is not None and profile_job.wait().success + profile_data: Dict[str, Any] = profile_job.download_profile() # type: ignore print_profile_metrics_from_job(profile_job, profile_data) if not skip_summary and not skip_inferencing: @@ -190,8 +207,8 @@ def export_model( inference_job = inference_jobs[component_name] sample_inputs = components_dict[component_name].sample_inputs() torch_out = torch_inference(components_dict[component_name], sample_inputs) - assert inference_job.wait().success - inference_result = inference_job.download_output_data() + assert inference_job is not None and inference_job.wait().success + inference_result: hub.client.DatasetEntries = inference_job.download_output_data() # type: ignore print_inference_metrics(inference_job, inference_result, torch_out) return { diff --git a/qai_hub_models/models/mediapipe_face/info.yaml b/qai_hub_models/models/mediapipe_face/info.yaml index 6f80819c..ec898a06 100644 --- a/qai_hub_models/models/mediapipe_face/info.yaml +++ b/qai_hub_models/models/mediapipe_face/info.yaml @@ -14,6 +14,7 @@ research_paper: https://arxiv.org/abs/1907.05047 research_paper_title: 'BlazeFace: Sub-millisecond Neural Face Detection on Mobile GPUs' license: https://github.com/zmurez/MediaPipePyTorch/blob/master/LICENSE +deploy_license: https://qaihub-public-assets.s3.us-west-2.amazonaws.com/qai-hub-models/Qualcomm+AI+Hub+Proprietary+License.pdf source_repo: https://github.com/zmurez/MediaPipePyTorch/ technical_details: Input resolution: 256x256 @@ -36,4 +37,5 @@ form_factors: has_static_banner: yes has_animated_banner: yes license_type: apache-2.0 +deploy_license_type: AI Model Hub License dataset: [] diff --git a/qai_hub_models/models/mediapipe_face/model.py b/qai_hub_models/models/mediapipe_face/model.py index 89844824..29b79435 100644 --- a/qai_hub_models/models/mediapipe_face/model.py +++ b/qai_hub_models/models/mediapipe_face/model.py @@ -246,7 +246,8 @@ def from_pretrained( face_detector.load_anchors(detector_anchors) return cls(face_detector, face_detector.anchors) - def get_input_spec(self, batch_size: int = BATCH_SIZE) -> InputSpec: + @staticmethod + def get_input_spec(batch_size: int = BATCH_SIZE) -> InputSpec: """ Returns the input specification (name -> (shape, type) of the face detector. This can be used to submit profiling job on Qualcomm AI Hub. @@ -274,7 +275,8 @@ def from_pretrained(cls, landmark_detector_weights: str = "blazeface_landmark.pt face_regressor.load_weights(landmark_detector_weights) return cls(face_regressor) - def get_input_spec(self, batch_size: int = BATCH_SIZE) -> InputSpec: + @staticmethod + def get_input_spec(batch_size: int = BATCH_SIZE) -> InputSpec: """ Returns the input specification (name -> (shape, type) of the face landmark detector. This can be used to submit profiling job on Qualcomm AI Hub. 
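The hunks above repeatedly change `get_input_spec` from an instance method to a `@staticmethod` on the model wrappers touched in this patch (LaMa-Dilated, LiteHRNet, and the MediaPipe-Face components). The sketch below illustrates the intent under stated assumptions: `DetectorLike` and the 1x3x256x256 shape are illustrative stand-ins rather than any real class in this repo, and the spec format follows the name -> (shape, dtype) convention these export scripts pass as `input_specs` to `hub.submit_compile_job`.

```python
from typing import Dict, Tuple

# name -> (shape, dtype string), the format handed to the compile job as input_specs.
InputSpec = Dict[str, Tuple[Tuple[int, ...], str]]


class DetectorLike:
    """Illustrative stand-in for a model wrapper such as MediaPipeFaceDetector."""

    @staticmethod
    def get_input_spec(batch_size: int = 1) -> InputSpec:
        # Shape is illustrative; each real wrapper returns its own resolution.
        return {"image": ((batch_size, 3, 256, 256), "float32")}


# Because get_input_spec is now a @staticmethod, export and profiling code can read the
# spec straight off the class, with no model instance and no weight download.
spec = DetectorLike.get_input_spec()
assert spec["image"][0] == (1, 3, 256, 256)
```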
diff --git a/qai_hub_models/models/mediapipe_face/perf.yaml b/qai_hub_models/models/mediapipe_face/perf.yaml index 3df1ee28..bf104ca8 100644 --- a/qai_hub_models/models/mediapipe_face/perf.yaml +++ b/qai_hub_models/models/mediapipe_face/perf.yaml @@ -17,22 +17,25 @@ aggregated: - Samsung Galaxy S23 - Samsung Galaxy S23 Ultra - Samsung Galaxy S23+ + - Samsung Galaxy S24 + - Samsung Galaxy S24 Ultra - Samsung Galaxy Tab S8 - Xiaomi 12 - Xiaomi 12 Pro supported_chipsets: - Snapdragon® 8 Gen 1 - Snapdragon® 8 Gen 2 + - Snapdragon® 8 Gen 3 - Snapdragon® 888 models: - name: MediaPipeFaceDetector performance_metrics: - torchscript_onnx_tflite: - inference_time: 536.0 - throughput: 1865.6716417910447 + inference_time: 532.0 + throughput: 1879.6992481203008 estimated_peak_memory_range: min: 12288 - max: 1539856 + max: 1591696 primary_compute_unit: NPU precision: fp16 layer_info: @@ -40,39 +43,77 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 111 - job_id: jqp4ydjqp + job_id: jn5q8nm57 job_status: Passed + reference_device_info: + name: Samsung Galaxy S23 + os: '13' + form_factor: Phone + os_name: Android + manufacturer: Samsung + chipset: Snapdragon® 8 Gen 2 + timestamp: '2024-03-14T23:12:50.277943Z' torchscript_onnx_qnn: - inference_time: 592.0 - throughput: 1689.1891891891892 + inference_time: 535.0 + throughput: 1869.1588785046729 + estimated_peak_memory_range: + min: 16384 + max: 4401872 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 111 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 111 + job_id: jwgoyxd58 + job_status: Passed + - torchscript_onnx_tflite: + inference_time: 380.0 + throughput: 2631.5789473684213 estimated_peak_memory_range: - min: 802816 - max: 57565728 + min: 12288 + max: 27416464 primary_compute_unit: NPU precision: fp16 layer_info: - layers_on_npu: 147 + layers_on_npu: 111 layers_on_gpu: 0 layers_on_cpu: 0 - total_layers: 147 - job_id: jo5m06vyg + total_layers: 111 + job_id: jw566x75o job_status: Passed reference_device_info: - name: Samsung Galaxy S23 Ultra - os: '13' + name: Samsung Galaxy S24 + os: '14' form_factor: Phone os_name: Android manufacturer: Samsung - chipset: Snapdragon® 8 Gen 2 - timestamp: '2024-02-21T16:08:54.792595Z' + chipset: Snapdragon® 8 Gen 3 + timestamp: '2024-03-14T23:15:59.596663Z' + torchscript_onnx_qnn: + inference_time: 381.0 + throughput: 2624.6719160104985 + estimated_peak_memory_range: + min: 12288 + max: 26948416 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 111 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 111 + job_id: j7gjx98pd + job_status: Passed - name: MediaPipeFaceLandmarkDetector performance_metrics: - torchscript_onnx_tflite: - inference_time: 209.0 - throughput: 4784.688995215311 + inference_time: 211.0 + throughput: 4739.336492890995 estimated_peak_memory_range: min: 24576 - max: 1806472 + max: 1810232 primary_compute_unit: NPU precision: fp16 layer_info: @@ -80,28 +121,66 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 100 - job_id: j0pxl6ejp + job_id: j1glndlpv job_status: Passed + reference_device_info: + name: Samsung Galaxy S23 + os: '13' + form_factor: Phone + os_name: Android + manufacturer: Samsung + chipset: Snapdragon® 8 Gen 2 + timestamp: '2024-03-14T23:24:30.398348Z' torchscript_onnx_qnn: - inference_time: 286.0 - throughput: 3496.5034965034965 + inference_time: 210.0 + throughput: 4761.9047619047615 estimated_peak_memory_range: - min: 462848 - max: 8766648 + min: 28672 + max: 1684984 primary_compute_unit: NPU precision: 
fp16 layer_info: - layers_on_npu: 106 + layers_on_npu: 100 layers_on_gpu: 0 layers_on_cpu: 0 - total_layers: 106 - job_id: jegnzmxvg + total_layers: 100 + job_id: j1pv38m5x + job_status: Passed + - torchscript_onnx_tflite: + inference_time: 159.0 + throughput: 6289.308176100629 + estimated_peak_memory_range: + min: 12288 + max: 24695408 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 100 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 100 + job_id: j1p3kdz52 job_status: Passed reference_device_info: - name: Samsung Galaxy S23 Ultra - os: '13' + name: Samsung Galaxy S24 + os: '14' form_factor: Phone os_name: Android manufacturer: Samsung - chipset: Snapdragon® 8 Gen 2 - timestamp: '2024-02-21T16:12:20.881454Z' + chipset: Snapdragon® 8 Gen 3 + timestamp: '2024-03-14T23:28:56.546828Z' + torchscript_onnx_qnn: + inference_time: 156.0 + throughput: 6410.25641025641 + estimated_peak_memory_range: + min: 16384 + max: 24996560 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 100 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 100 + job_id: jlpe9q0gr + job_status: Passed diff --git a/qai_hub_models/models/mediapipe_face/requirements.txt b/qai_hub_models/models/mediapipe_face/requirements.txt deleted file mode 100644 index 9c11ddeb..00000000 --- a/qai_hub_models/models/mediapipe_face/requirements.txt +++ /dev/null @@ -1,2 +0,0 @@ -opencv-python -requests diff --git a/qai_hub_models/models/mediapipe_hand/README.md b/qai_hub_models/models/mediapipe_hand/README.md index 8aa4b4e1..ff64fe98 100644 --- a/qai_hub_models/models/mediapipe_hand/README.md +++ b/qai_hub_models/models/mediapipe_hand/README.md @@ -10,17 +10,12 @@ This is based on the implementation of MediaPipe-Hand-Detection found export suitable to run on Qualcomm® devices. More details on model performance accross various devices, can be found [here](https://aihub.qualcomm.com/models/mediapipe_hand). -[Sign up](https://aihub.qualcomm.com/) for early access to run these models on +[Sign up](https://myaccount.qualcomm.com/signup) for early access to run these models on a hosted Qualcomm® device. ## Example & Usage -Install the package via pip: -```bash -pip install "qai_hub_models[mediapipe_hand]" -``` - Once installed, run the following simple CLI demo: @@ -30,7 +25,7 @@ python -m qai_hub_models.models.mediapipe_hand.demo More details on the CLI tool can be found with the `--help` option. See [demo.py](demo.py) for sample usage of the model including pre/post processing scripts. Please refer to our [general instructions on using -models](../../#qai-hub-models) for more usage instructions. +models](../../../#getting-started) for more usage instructions. ## Export for on-device deployment @@ -46,7 +41,7 @@ script requires access to Deployment instructions for Qualcomm® AI Hub. ## License - The license for the original implementation of MediaPipe-Hand-Detection can be found [here](https://github.com/zmurez/MediaPipePyTorch/blob/master/LICENSE). -- The license for the compiled assets for on-device deployment can be found [here](https://qaihub-public-assets.s3.us-west-2.amazonaws.com/qai-hub-models/Qualcomm+AI+Hub+Proprietary+License.pdf). 
+- The license for the compiled assets for on-device deployment can be found [here]({deploy_license_url}) ## References * [MediaPipe Hands: On-device Real-time Hand Tracking](https://arxiv.org/abs/2006.10214) diff --git a/qai_hub_models/models/mediapipe_hand/conftest.py b/qai_hub_models/models/mediapipe_hand/conftest.py new file mode 100644 index 00000000..8848bec2 --- /dev/null +++ b/qai_hub_models/models/mediapipe_hand/conftest.py @@ -0,0 +1,26 @@ +# --------------------------------------------------------------------- +# Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. +# SPDX-License-Identifier: BSD-3-Clause +# --------------------------------------------------------------------- +# THIS FILE WAS AUTO-GENERATED. DO NOT EDIT MANUALLY. + +from unittest.mock import patch + +import pytest + +from qai_hub_models.models.mediapipe_hand import Model +from qai_hub_models.utils.testing import skip_clone_repo_check + + +@pytest.fixture(autouse=True) +@skip_clone_repo_check +def mock_from_pretrained(): + """ + Model.from_pretrained() can be slow. Invoke it once and cache it so all invocations + across all tests return the cached instance of the model. + """ + mock = patch( + "qai_hub_models.models.mediapipe_hand.Model.from_pretrained", + return_value=Model.from_pretrained(), + ) + mock.start() diff --git a/qai_hub_models/models/mediapipe_hand/export.py b/qai_hub_models/models/mediapipe_hand/export.py index 7d92ac14..22ff6f03 100644 --- a/qai_hub_models/models/mediapipe_hand/export.py +++ b/qai_hub_models/models/mediapipe_hand/export.py @@ -10,14 +10,14 @@ import os import warnings from pathlib import Path -from typing import List, Mapping, Optional, Tuple +from typing import Any, Dict, List, Mapping, Optional, Tuple, cast import qai_hub as hub import torch from qai_hub_models.models.mediapipe_hand import Model from qai_hub_models.utils.args import export_parser, get_model_kwargs -from qai_hub_models.utils.base_model import TargetRuntime +from qai_hub_models.utils.base_model import BaseModel, TargetRuntime from qai_hub_models.utils.compare import torch_inference from qai_hub_models.utils.input_spec import make_torch_inputs from qai_hub_models.utils.printing import ( @@ -90,9 +90,9 @@ def export_model( output_path = Path(output_dir or Path.cwd() / "build" / model_name) component_arg = components components = components or ALL_COMPONENTS - for component in components: - if component not in ALL_COMPONENTS: - raise ValueError(f"Invalid component {component}.") + for component_name in components: + if component_name not in ALL_COMPONENTS: + raise ValueError(f"Invalid component {component_name}.") if not can_access_qualcomm_ai_hub(): return export_without_hub_access( "mediapipe_hand", @@ -111,68 +111,85 @@ def export_model( # 1. 
Initialize PyTorch model model = Model.from_pretrained(**get_model_kwargs(Model, additional_model_kwargs)) - components_dict = {} + components_dict: Dict[str, BaseModel] = {} if "MediaPipeHandDetector" in components: - components_dict["MediaPipeHandDetector"] = model.hand_detector + components_dict["MediaPipeHandDetector"] = model.hand_detector # type: ignore if "MediaPipeHandLandmarkDetector" in components: - components_dict["MediaPipeHandLandmarkDetector"] = model.hand_landmark_detector + components_dict["MediaPipeHandLandmarkDetector"] = model.hand_landmark_detector # type: ignore - compile_jobs = {} + compile_jobs: Dict[str, hub.client.CompileJob] = {} for component_name, component in components_dict.items(): # Trace the model input_spec = component.get_input_spec() - source_model = torch.jit.trace(component, make_torch_inputs(input_spec)) + source_model = torch.jit.trace( + component.to("cpu"), make_torch_inputs(input_spec) + ) # 2. Compile the models to an on-device asset model_compile_options = component.get_hub_compile_options( target_runtime, compile_options + " --force_channel_last_input image" ) - print(f"Optimizing model {component_name} to run on-device.") - compile_jobs[component_name] = hub.submit_compile_job( + print(f"Optimizing model {component_name} to run on-device") + submitted_compile_job = hub.submit_compile_job( model=source_model, input_specs=input_spec, device=hub.Device(device), - name=f"{component_name}", + name=f"{model_name}_{component_name}", options=model_compile_options, ) + compile_jobs[component_name] = cast( + hub.client.CompileJob, submitted_compile_job + ) # 3. Profile the model assets on real devices - profile_jobs = {} + profile_jobs: Dict[str, hub.client.ProfileJob] = {} if not skip_profiling: for component_name in components: + profile_options_all = components_dict[ + component_name + ].get_hub_profile_options(target_runtime, profile_options) print(f"Profiling model {component_name} on a hosted device.") - profile_jobs[component_name] = hub.submit_profile_job( + submitted_profile_job = hub.submit_profile_job( model=compile_jobs[component_name].get_target_model(), device=hub.Device(device), - name=f"{component_name}", - options=profile_options, + name=f"{model_name}_{component_name}", + options=profile_options_all, + ) + profile_jobs[component_name] = cast( + hub.client.ProfileJob, submitted_profile_job ) # 4. Run inference on-device with sample inputs - inference_jobs = {} + inference_jobs: Dict[str, hub.client.InferenceJob] = {} if not skip_inferencing: for component_name in components: print( f"Running inference for {component_name} on a hosted device with example inputs." ) + profile_options_all = components_dict[ + component_name + ].get_hub_profile_options(target_runtime, profile_options) sample_inputs = components_dict[component_name].sample_inputs() # Convert inputs from channel first to channel last hub_inputs = transpose_channel_first_to_last( "image", sample_inputs, target_runtime ) - inference_jobs[component_name] = hub.submit_inference_job( + submitted_inference_job = hub.submit_inference_job( model=compile_jobs[component_name].get_target_model(), inputs=hub_inputs, device=hub.Device(device), - name=f"{component_name}", - options=profile_options, + name=f"{model_name}_{component_name}", + options=profile_options_all, + ) + inference_jobs[component_name] = cast( + hub.client.InferenceJob, submitted_inference_job ) # 5. 
Download the model assets to a local file if not skip_downloading: os.makedirs(output_path, exist_ok=True) for component_name, compile_job in compile_jobs.items(): - target_model = compile_job.get_target_model() + target_model: hub.Model = compile_job.get_target_model() # type: ignore target_model.download( str(output_path / f"{model_name}_{component_name}.tflite") ) @@ -181,8 +198,8 @@ def export_model( if not skip_summary and not skip_profiling: for component_name in components: profile_job = profile_jobs[component_name] - assert profile_job.wait().success - profile_data = profile_job.download_profile() + assert profile_job is not None and profile_job.wait().success + profile_data: Dict[str, Any] = profile_job.download_profile() # type: ignore print_profile_metrics_from_job(profile_job, profile_data) if not skip_summary and not skip_inferencing: @@ -190,8 +207,8 @@ def export_model( inference_job = inference_jobs[component_name] sample_inputs = components_dict[component_name].sample_inputs() torch_out = torch_inference(components_dict[component_name], sample_inputs) - assert inference_job.wait().success - inference_result = inference_job.download_output_data() + assert inference_job is not None and inference_job.wait().success + inference_result: hub.client.DatasetEntries = inference_job.download_output_data() # type: ignore print_inference_metrics(inference_job, inference_result, torch_out) return { diff --git a/qai_hub_models/models/mediapipe_hand/info.yaml b/qai_hub_models/models/mediapipe_hand/info.yaml index 32d75fb9..9266f51a 100644 --- a/qai_hub_models/models/mediapipe_hand/info.yaml +++ b/qai_hub_models/models/mediapipe_hand/info.yaml @@ -12,6 +12,7 @@ tags: research_paper: https://arxiv.org/abs/2006.10214 research_paper_title: 'MediaPipe Hands: On-device Real-time Hand Tracking' license: https://github.com/zmurez/MediaPipePyTorch/blob/master/LICENSE +deploy_license: https://qaihub-public-assets.s3.us-west-2.amazonaws.com/qai-hub-models/Qualcomm+AI+Hub+Proprietary+License.pdf source_repo: https://github.com/zmurez/MediaPipePyTorch/ technical_details: Input resolution: 256x256 @@ -34,4 +35,5 @@ form_factors: has_static_banner: yes has_animated_banner: yes license_type: apache-2.0 +deploy_license_type: AI Model Hub License dataset: [] diff --git a/qai_hub_models/models/mediapipe_hand/model.py b/qai_hub_models/models/mediapipe_hand/model.py index b41542ef..25d2c4e5 100644 --- a/qai_hub_models/models/mediapipe_hand/model.py +++ b/qai_hub_models/models/mediapipe_hand/model.py @@ -142,7 +142,8 @@ def from_pretrained( hand_detector.load_anchors(detector_anchors) return cls(hand_detector, hand_detector.anchors) - def get_input_spec(self, batch_size: int = BATCH_SIZE) -> InputSpec: + @staticmethod + def get_input_spec(batch_size: int = BATCH_SIZE) -> InputSpec: """ Returns the input specification (name -> (shape, type) of the hand detector. This can be used to submit profiling job on Qualcomm AI Hub. @@ -170,7 +171,8 @@ def from_pretrained(cls, landmark_detector_weights: str = "blazehand_landmark.pt hand_regressor.load_weights(landmark_detector_weights) cls(hand_regressor) - def get_input_spec(self, batch_size: int = BATCH_SIZE) -> InputSpec: + @staticmethod + def get_input_spec(batch_size: int = BATCH_SIZE) -> InputSpec: """ Returns the input specification (name -> (shape, type) of the hand landmark detector. This can be used to submit profiling job on Qualcomm AI Hub. 
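Several of the regenerated export scripts in this patch (LaMa-Dilated earlier, and the MediaPipe components here) compile with `--force_channel_last_input image` and wrap on-device inference with `transpose_channel_first_to_last` / `transpose_channel_last_to_first`. The snippet below is only a sketch of the underlying layout change, not the actual helpers from `qai_hub_models.utils`: traced PyTorch tensors are NCHW, while the compiled asset expects NHWC.

```python
import numpy as np


def to_channel_last(x: np.ndarray) -> np.ndarray:
    # Traced PyTorch inputs are NCHW; assets compiled with
    # --force_channel_last_input expect NHWC on device.
    return np.transpose(x, (0, 2, 3, 1))


def to_channel_first(x: np.ndarray) -> np.ndarray:
    # Inverse transpose, applied to outputs listed in --force_channel_last_output.
    return np.transpose(x, (0, 3, 1, 2))


image_nchw = np.zeros((1, 3, 256, 256), dtype=np.float32)
image_nhwc = to_channel_last(image_nchw)
assert image_nhwc.shape == (1, 256, 256, 3)
assert to_channel_first(image_nhwc).shape == image_nchw.shape
```

In the real helpers the same transpose is presumably applied per named entry (e.g. `image,mask` on input, `output_0` on output) of the sample-input and inference-result dictionaries.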
diff --git a/qai_hub_models/models/mediapipe_hand/perf.yaml b/qai_hub_models/models/mediapipe_hand/perf.yaml index f79bb0af..dad8f571 100644 --- a/qai_hub_models/models/mediapipe_hand/perf.yaml +++ b/qai_hub_models/models/mediapipe_hand/perf.yaml @@ -17,22 +17,25 @@ aggregated: - Samsung Galaxy S23 - Samsung Galaxy S23 Ultra - Samsung Galaxy S23+ + - Samsung Galaxy S24 + - Samsung Galaxy S24 Ultra - Samsung Galaxy Tab S8 - Xiaomi 12 - Xiaomi 12 Pro supported_chipsets: - Snapdragon® 8 Gen 1 - Snapdragon® 8 Gen 2 + - Snapdragon® 8 Gen 3 - Snapdragon® 888 models: - name: MediaPipeHandDetector performance_metrics: - torchscript_onnx_tflite: - inference_time: 762.0 - throughput: 1312.3359580052493 + inference_time: 765.0 + throughput: 1307.18954248366 estimated_peak_memory_range: min: 12288 - max: 3281536 + max: 12061368 primary_compute_unit: NPU precision: fp16 layer_info: @@ -40,39 +43,77 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 151 - job_id: jwgolne4g + job_id: jep28dxp6 job_status: Passed + reference_device_info: + name: Samsung Galaxy S23 + os: '13' + form_factor: Phone + os_name: Android + manufacturer: Samsung + chipset: Snapdragon® 8 Gen 2 + timestamp: '2024-03-14T23:23:15.414918Z' torchscript_onnx_qnn: - inference_time: 820.0 - throughput: 1219.5121951219512 + inference_time: 763.0 + throughput: 1310.615989515072 estimated_peak_memory_range: - min: 806912 - max: 6264240 + min: 12288 + max: 1709784 primary_compute_unit: NPU precision: fp16 layer_info: - layers_on_npu: 196 + layers_on_npu: 151 layers_on_gpu: 0 layers_on_cpu: 0 - total_layers: 196 - job_id: j7gjr2k7p + total_layers: 151 + job_id: jogkz0ygd + job_status: Passed + - torchscript_onnx_tflite: + inference_time: 571.0 + throughput: 1751.3134851138354 + estimated_peak_memory_range: + min: 12288 + max: 51661744 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 151 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 151 + job_id: j2p0y92gw job_status: Passed reference_device_info: - name: Samsung Galaxy S23 Ultra - os: '13' + name: Samsung Galaxy S24 + os: '14' form_factor: Phone os_name: Android manufacturer: Samsung - chipset: Snapdragon® 8 Gen 2 - timestamp: '2024-02-21T16:08:53.710000Z' + chipset: Snapdragon® 8 Gen 3 + timestamp: '2024-03-14T23:27:44.926097Z' + torchscript_onnx_qnn: + inference_time: 547.0 + throughput: 1828.1535648994516 + estimated_peak_memory_range: + min: 12288 + max: 52066480 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 151 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 151 + job_id: j1gln8epv + job_status: Passed - name: MediaPipeHandLandmarkDetector performance_metrics: - torchscript_onnx_tflite: - inference_time: 1017.0 - throughput: 983.284169124877 + inference_time: 1047.0 + throughput: 955.1098376313277 estimated_peak_memory_range: - min: 24576 - max: 2409872 + min: 28672 + max: 2017000 primary_compute_unit: NPU precision: fp16 layer_info: @@ -80,28 +121,66 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 158 - job_id: j1pvlrz75 + job_id: jqpye2rgy job_status: Passed + reference_device_info: + name: Samsung Galaxy S23 + os: '13' + form_factor: Phone + os_name: Android + manufacturer: Samsung + chipset: Snapdragon® 8 Gen 2 + timestamp: '2024-03-14T23:35:00.465711Z' torchscript_onnx_qnn: - inference_time: 1088.0 - throughput: 919.1176470588235 + inference_time: 996.0 + throughput: 1004.0160642570281 estimated_peak_memory_range: - min: 577536 - max: 53567440 + min: 24576 + max: 10650592 primary_compute_unit: 
NPU precision: fp16 layer_info: - layers_on_npu: 209 + layers_on_npu: 158 layers_on_gpu: 0 layers_on_cpu: 0 - total_layers: 209 - job_id: jlpe7w475 + total_layers: 158 + job_id: jn5q81757 + job_status: Passed + - torchscript_onnx_tflite: + inference_time: 749.0 + throughput: 1335.1134846461948 + estimated_peak_memory_range: + min: 16384 + max: 54372320 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 158 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 158 + job_id: j1p8orzg9 job_status: Passed reference_device_info: - name: Samsung Galaxy S23 Ultra - os: '13' + name: Samsung Galaxy S24 + os: '14' form_factor: Phone os_name: Android manufacturer: Samsung - chipset: Snapdragon® 8 Gen 2 - timestamp: '2024-02-21T16:12:22.243551Z' + chipset: Snapdragon® 8 Gen 3 + timestamp: '2024-03-14T23:36:54.085694Z' + torchscript_onnx_qnn: + inference_time: 747.0 + throughput: 1338.6880856760374 + estimated_peak_memory_range: + min: 12288 + max: 53941536 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 158 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 158 + job_id: jw566mv5o + job_status: Passed diff --git a/qai_hub_models/models/mediapipe_hand/requirements.txt b/qai_hub_models/models/mediapipe_hand/requirements.txt deleted file mode 100644 index 9c11ddeb..00000000 --- a/qai_hub_models/models/mediapipe_hand/requirements.txt +++ /dev/null @@ -1,2 +0,0 @@ -opencv-python -requests diff --git a/qai_hub_models/models/mediapipe_pose/README.md b/qai_hub_models/models/mediapipe_pose/README.md index 693c634a..9a10818d 100644 --- a/qai_hub_models/models/mediapipe_pose/README.md +++ b/qai_hub_models/models/mediapipe_pose/README.md @@ -10,17 +10,12 @@ This is based on the implementation of MediaPipe-Pose-Estimation found export suitable to run on Qualcomm® devices. More details on model performance accross various devices, can be found [here](https://aihub.qualcomm.com/models/mediapipe_pose). -[Sign up](https://aihub.qualcomm.com/) for early access to run these models on +[Sign up](https://myaccount.qualcomm.com/signup) for early access to run these models on a hosted Qualcomm® device. ## Example & Usage -Install the package via pip: -```bash -pip install "qai_hub_models[mediapipe_pose]" -``` - Once installed, run the following simple CLI demo: @@ -30,7 +25,7 @@ python -m qai_hub_models.models.mediapipe_pose.demo More details on the CLI tool can be found with the `--help` option. See [demo.py](demo.py) for sample usage of the model including pre/post processing scripts. Please refer to our [general instructions on using -models](../../#qai-hub-models) for more usage instructions. +models](../../../#getting-started) for more usage instructions. ## Export for on-device deployment @@ -46,7 +41,7 @@ script requires access to Deployment instructions for Qualcomm® AI Hub. ## License - The license for the original implementation of MediaPipe-Pose-Estimation can be found [here](https://github.com/zmurez/MediaPipePyTorch/blob/master/LICENSE). -- The license for the compiled assets for on-device deployment can be found [here](https://qaihub-public-assets.s3.us-west-2.amazonaws.com/qai-hub-models/Qualcomm+AI+Hub+Proprietary+License.pdf). 
+- The license for the compiled assets for on-device deployment can be found [here]({deploy_license_url}) ## References * [BlazePose: On-device Real-time Body Pose tracking](https://arxiv.org/abs/2006.10204) diff --git a/qai_hub_models/models/mediapipe_pose/conftest.py b/qai_hub_models/models/mediapipe_pose/conftest.py new file mode 100644 index 00000000..2e30278e --- /dev/null +++ b/qai_hub_models/models/mediapipe_pose/conftest.py @@ -0,0 +1,26 @@ +# --------------------------------------------------------------------- +# Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. +# SPDX-License-Identifier: BSD-3-Clause +# --------------------------------------------------------------------- +# THIS FILE WAS AUTO-GENERATED. DO NOT EDIT MANUALLY. + +from unittest.mock import patch + +import pytest + +from qai_hub_models.models.mediapipe_pose import Model +from qai_hub_models.utils.testing import skip_clone_repo_check + + +@pytest.fixture(autouse=True) +@skip_clone_repo_check +def mock_from_pretrained(): + """ + Model.from_pretrained() can be slow. Invoke it once and cache it so all invocations + across all tests return the cached instance of the model. + """ + mock = patch( + "qai_hub_models.models.mediapipe_pose.Model.from_pretrained", + return_value=Model.from_pretrained(), + ) + mock.start() diff --git a/qai_hub_models/models/mediapipe_pose/export.py b/qai_hub_models/models/mediapipe_pose/export.py index 9d2b78aa..ec0c57a3 100644 --- a/qai_hub_models/models/mediapipe_pose/export.py +++ b/qai_hub_models/models/mediapipe_pose/export.py @@ -10,14 +10,14 @@ import os import warnings from pathlib import Path -from typing import List, Mapping, Optional, Tuple +from typing import Any, Dict, List, Mapping, Optional, Tuple, cast import qai_hub as hub import torch from qai_hub_models.models.mediapipe_pose import Model from qai_hub_models.utils.args import export_parser, get_model_kwargs -from qai_hub_models.utils.base_model import TargetRuntime +from qai_hub_models.utils.base_model import BaseModel, TargetRuntime from qai_hub_models.utils.compare import torch_inference from qai_hub_models.utils.input_spec import make_torch_inputs from qai_hub_models.utils.printing import ( @@ -90,9 +90,9 @@ def export_model( output_path = Path(output_dir or Path.cwd() / "build" / model_name) component_arg = components components = components or ALL_COMPONENTS - for component in components: - if component not in ALL_COMPONENTS: - raise ValueError(f"Invalid component {component}.") + for component_name in components: + if component_name not in ALL_COMPONENTS: + raise ValueError(f"Invalid component {component_name}.") if not can_access_qualcomm_ai_hub(): return export_without_hub_access( "mediapipe_pose", @@ -111,68 +111,85 @@ def export_model( # 1. 
Initialize PyTorch model model = Model.from_pretrained(**get_model_kwargs(Model, additional_model_kwargs)) - components_dict = {} + components_dict: Dict[str, BaseModel] = {} if "MediaPipePoseDetector" in components: - components_dict["MediaPipePoseDetector"] = model.pose_detector + components_dict["MediaPipePoseDetector"] = model.pose_detector # type: ignore if "MediaPipePoseLandmarkDetector" in components: - components_dict["MediaPipePoseLandmarkDetector"] = model.pose_landmark_detector + components_dict["MediaPipePoseLandmarkDetector"] = model.pose_landmark_detector # type: ignore - compile_jobs = {} + compile_jobs: Dict[str, hub.client.CompileJob] = {} for component_name, component in components_dict.items(): # Trace the model input_spec = component.get_input_spec() - source_model = torch.jit.trace(component, make_torch_inputs(input_spec)) + source_model = torch.jit.trace( + component.to("cpu"), make_torch_inputs(input_spec) + ) # 2. Compile the models to an on-device asset model_compile_options = component.get_hub_compile_options( target_runtime, compile_options + " --force_channel_last_input image" ) - print(f"Optimizing model {component_name} to run on-device.") - compile_jobs[component_name] = hub.submit_compile_job( + print(f"Optimizing model {component_name} to run on-device") + submitted_compile_job = hub.submit_compile_job( model=source_model, input_specs=input_spec, device=hub.Device(device), - name=f"{component_name}", + name=f"{model_name}_{component_name}", options=model_compile_options, ) + compile_jobs[component_name] = cast( + hub.client.CompileJob, submitted_compile_job + ) # 3. Profile the model assets on real devices - profile_jobs = {} + profile_jobs: Dict[str, hub.client.ProfileJob] = {} if not skip_profiling: for component_name in components: + profile_options_all = components_dict[ + component_name + ].get_hub_profile_options(target_runtime, profile_options) print(f"Profiling model {component_name} on a hosted device.") - profile_jobs[component_name] = hub.submit_profile_job( + submitted_profile_job = hub.submit_profile_job( model=compile_jobs[component_name].get_target_model(), device=hub.Device(device), - name=f"{component_name}", - options=profile_options, + name=f"{model_name}_{component_name}", + options=profile_options_all, + ) + profile_jobs[component_name] = cast( + hub.client.ProfileJob, submitted_profile_job ) # 4. Run inference on-device with sample inputs - inference_jobs = {} + inference_jobs: Dict[str, hub.client.InferenceJob] = {} if not skip_inferencing: for component_name in components: print( f"Running inference for {component_name} on a hosted device with example inputs." ) + profile_options_all = components_dict[ + component_name + ].get_hub_profile_options(target_runtime, profile_options) sample_inputs = components_dict[component_name].sample_inputs() # Convert inputs from channel first to channel last hub_inputs = transpose_channel_first_to_last( "image", sample_inputs, target_runtime ) - inference_jobs[component_name] = hub.submit_inference_job( + submitted_inference_job = hub.submit_inference_job( model=compile_jobs[component_name].get_target_model(), inputs=hub_inputs, device=hub.Device(device), - name=f"{component_name}", - options=profile_options, + name=f"{model_name}_{component_name}", + options=profile_options_all, + ) + inference_jobs[component_name] = cast( + hub.client.InferenceJob, submitted_inference_job ) # 5. 
Download the model assets to a local file if not skip_downloading: os.makedirs(output_path, exist_ok=True) for component_name, compile_job in compile_jobs.items(): - target_model = compile_job.get_target_model() + target_model: hub.Model = compile_job.get_target_model() # type: ignore target_model.download( str(output_path / f"{model_name}_{component_name}.tflite") ) @@ -181,8 +198,8 @@ def export_model( if not skip_summary and not skip_profiling: for component_name in components: profile_job = profile_jobs[component_name] - assert profile_job.wait().success - profile_data = profile_job.download_profile() + assert profile_job is not None and profile_job.wait().success + profile_data: Dict[str, Any] = profile_job.download_profile() # type: ignore print_profile_metrics_from_job(profile_job, profile_data) if not skip_summary and not skip_inferencing: @@ -190,8 +207,8 @@ def export_model( inference_job = inference_jobs[component_name] sample_inputs = components_dict[component_name].sample_inputs() torch_out = torch_inference(components_dict[component_name], sample_inputs) - assert inference_job.wait().success - inference_result = inference_job.download_output_data() + assert inference_job is not None and inference_job.wait().success + inference_result: hub.client.DatasetEntries = inference_job.download_output_data() # type: ignore print_inference_metrics(inference_job, inference_result, torch_out) return { diff --git a/qai_hub_models/models/mediapipe_pose/info.yaml b/qai_hub_models/models/mediapipe_pose/info.yaml index 23e2045b..8665455f 100644 --- a/qai_hub_models/models/mediapipe_pose/info.yaml +++ b/qai_hub_models/models/mediapipe_pose/info.yaml @@ -12,6 +12,7 @@ tags: research_paper: https://arxiv.org/abs/2006.10204 research_paper_title: 'BlazePose: On-device Real-time Body Pose tracking' license: https://github.com/zmurez/MediaPipePyTorch/blob/master/LICENSE +deploy_license: https://qaihub-public-assets.s3.us-west-2.amazonaws.com/qai-hub-models/Qualcomm+AI+Hub+Proprietary+License.pdf source_repo: https://github.com/zmurez/MediaPipePyTorch/ technical_details: Input resolution: 256x256 @@ -34,4 +35,5 @@ form_factors: has_static_banner: yes has_animated_banner: yes license_type: apache-2.0 +deploy_license_type: AI Model Hub License dataset: [] diff --git a/qai_hub_models/models/mediapipe_pose/model.py b/qai_hub_models/models/mediapipe_pose/model.py index ad32a893..7c96d6e5 100644 --- a/qai_hub_models/models/mediapipe_pose/model.py +++ b/qai_hub_models/models/mediapipe_pose/model.py @@ -138,7 +138,8 @@ def from_pretrained( pose_detector.load_anchors(detector_anchors) return cls(pose_detector, pose_detector.anchors) - def get_input_spec(self, batch_size: int = BATCH_SIZE) -> InputSpec: + @staticmethod + def get_input_spec(batch_size: int = BATCH_SIZE) -> InputSpec: """ Returns the input specification (name -> (shape, type) of the pose detector. This can be used to submit profiling job on Qualcomm AI Hub. @@ -166,7 +167,8 @@ def from_pretrained(cls, landmark_detector_weights: str = "blazepose_landmark.pt pose_regressor.load_weights(landmark_detector_weights) cls(pose_regressor) - def get_input_spec(self, batch_size: int = BATCH_SIZE) -> InputSpec: + @staticmethod + def get_input_spec(batch_size: int = BATCH_SIZE) -> InputSpec: """ Returns the input specification (name -> (shape, type) of the pose landmark detector. This can be used to submit profiling job on Qualcomm AI Hub. 
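For reference, the per-component export flow that the updated `mediapipe_pose/export.py` implements (trace on CPU, compile, then profile on a hosted device) can be summarized with the minimal sketch below. This sketch is illustrative and not part of the patch: it assumes access to Qualcomm® AI Hub, the device name and TFLite runtime are example choices, and every API call mirrors one already used in the diff above (`hub.submit_compile_job`, `get_hub_compile_options`, `get_hub_profile_options`, `make_torch_inputs`).

```python
# Minimal sketch of the per-component export flow from mediapipe_pose/export.py.
# Assumptions: Qualcomm AI Hub access is configured; the device and runtime below
# are example values, not requirements of the patch.
import qai_hub as hub
import torch

from qai_hub_models.models.mediapipe_pose import Model
from qai_hub_models.utils.base_model import TargetRuntime
from qai_hub_models.utils.input_spec import make_torch_inputs

model = Model.from_pretrained()
component = model.pose_detector  # or model.pose_landmark_detector
input_spec = component.get_input_spec()

# 1. Trace the component on CPU so the TorchScript asset is device-agnostic.
source_model = torch.jit.trace(component.to("cpu"), make_torch_inputs(input_spec))

# 2. Compile for the target runtime; the image input is forced to channel-last,
#    matching the options added in the patch.
compile_job = hub.submit_compile_job(
    model=source_model,
    input_specs=input_spec,
    device=hub.Device("Samsung Galaxy S24"),  # example device
    name="mediapipe_pose_MediaPipePoseDetector",
    options=component.get_hub_compile_options(
        TargetRuntime.TFLITE, " --force_channel_last_input image"
    ),
)

# 3. Profile the compiled asset on a hosted device using the component's own
#    profile options, as the patch now does.
profile_job = hub.submit_profile_job(
    model=compile_job.get_target_model(),
    device=hub.Device("Samsung Galaxy S24"),
    name="mediapipe_pose_MediaPipePoseDetector",
    options=component.get_hub_profile_options(TargetRuntime.TFLITE, ""),
)
assert profile_job.wait().success
```

The patch additionally wraps each submitted job in `typing.cast` (`CompileJob`, `ProfileJob`, `InferenceJob`) so static type checkers see concrete job types; the sketch omits that detail for brevity.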
diff --git a/qai_hub_models/models/mediapipe_pose/perf.yaml b/qai_hub_models/models/mediapipe_pose/perf.yaml index 7b7ebe7c..f642b95b 100644 --- a/qai_hub_models/models/mediapipe_pose/perf.yaml +++ b/qai_hub_models/models/mediapipe_pose/perf.yaml @@ -17,22 +17,25 @@ aggregated: - Samsung Galaxy S23 - Samsung Galaxy S23 Ultra - Samsung Galaxy S23+ + - Samsung Galaxy S24 + - Samsung Galaxy S24 Ultra - Samsung Galaxy Tab S8 - Xiaomi 12 - Xiaomi 12 Pro supported_chipsets: - Snapdragon® 8 Gen 1 - Snapdragon® 8 Gen 2 + - Snapdragon® 8 Gen 3 - Snapdragon® 888 models: - name: MediaPipePoseDetector performance_metrics: - torchscript_onnx_tflite: - inference_time: 807.0 - throughput: 1239.1573729863692 + inference_time: 806.0 + throughput: 1240.6947890818858 estimated_peak_memory_range: - min: 28672 - max: 1641432 + min: 24576 + max: 1736000 primary_compute_unit: NPU precision: fp16 layer_info: @@ -40,39 +43,77 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 106 - job_id: j1p3z1wz5 + job_id: jygzelzg8 job_status: Passed + reference_device_info: + name: Samsung Galaxy S23 + os: '13' + form_factor: Phone + os_name: Android + manufacturer: Samsung + chipset: Snapdragon® 8 Gen 2 + timestamp: '2024-03-14T23:54:08.518654Z' torchscript_onnx_qnn: - inference_time: 865.0 - throughput: 1156.0693641618498 + inference_time: 808.0 + throughput: 1237.6237623762377 estimated_peak_memory_range: - min: 212992 - max: 66280848 + min: 28672 + max: 4909504 primary_compute_unit: NPU precision: fp16 layer_info: - layers_on_npu: 139 + layers_on_npu: 106 layers_on_gpu: 0 layers_on_cpu: 0 - total_layers: 139 - job_id: j1pvlr9m5 + total_layers: 106 + job_id: jvgdwdk5j + job_status: Passed + - torchscript_onnx_tflite: + inference_time: 577.0 + throughput: 1733.102253032929 + estimated_peak_memory_range: + min: 65536 + max: 39641680 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 106 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 106 + job_id: jmg9vzq57 job_status: Passed reference_device_info: - name: Samsung Galaxy S23 Ultra - os: '13' + name: Samsung Galaxy S24 + os: '14' form_factor: Phone os_name: Android manufacturer: Samsung - chipset: Snapdragon® 8 Gen 2 - timestamp: '2024-02-21T16:22:09.229999Z' + chipset: Snapdragon® 8 Gen 3 + timestamp: '2024-03-14T23:58:30.195464Z' + torchscript_onnx_qnn: + inference_time: 577.0 + throughput: 1733.102253032929 + estimated_peak_memory_range: + min: 61440 + max: 40004608 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 106 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 106 + job_id: jqp4qyqgo + job_status: Passed - name: MediaPipePoseLandmarkDetector performance_metrics: - torchscript_onnx_tflite: - inference_time: 1023.0 - throughput: 977.5171065493646 + inference_time: 1052.0 + throughput: 950.5703422053232 estimated_peak_memory_range: - min: 12288 - max: 3253904 + min: 16384 + max: 2847296 primary_compute_unit: NPU precision: fp16 layer_info: @@ -80,28 +121,66 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 229 - job_id: jwgoln4dg + job_id: jz5wolzp1 job_status: Passed + reference_device_info: + name: Samsung Galaxy S23 + os: '13' + form_factor: Phone + os_name: Android + manufacturer: Samsung + chipset: Snapdragon® 8 Gen 2 + timestamp: '2024-03-15T00:04:16.645350Z' torchscript_onnx_qnn: - inference_time: 1101.0 - throughput: 908.2652134423251 + inference_time: 1063.0 + throughput: 940.7337723424271 estimated_peak_memory_range: - min: 20480 - max: 149395360 + min: 12288 + max: 2768272 
primary_compute_unit: NPU precision: fp16 layer_info: - layers_on_npu: 305 + layers_on_npu: 229 layers_on_gpu: 0 layers_on_cpu: 0 - total_layers: 305 - job_id: j7gjr2w8p + total_layers: 229 + job_id: jz57zeqp3 + job_status: Passed + - torchscript_onnx_tflite: + inference_time: 756.0 + throughput: 1322.7513227513227 + estimated_peak_memory_range: + min: 12288 + max: 84633232 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 229 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 229 + job_id: jnp10nk5q job_status: Passed reference_device_info: - name: Samsung Galaxy S23 Ultra - os: '13' + name: Samsung Galaxy S24 + os: '14' form_factor: Phone os_name: Android manufacturer: Samsung - chipset: Snapdragon® 8 Gen 2 - timestamp: '2024-02-21T16:29:24.657545Z' + chipset: Snapdragon® 8 Gen 3 + timestamp: '2024-03-15T00:06:07.166564Z' + torchscript_onnx_qnn: + inference_time: 772.0 + throughput: 1295.3367875647668 + estimated_peak_memory_range: + min: 12288 + max: 84377840 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 229 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 229 + job_id: j0pxvljg7 + job_status: Passed diff --git a/qai_hub_models/models/mediapipe_pose/requirements.txt b/qai_hub_models/models/mediapipe_pose/requirements.txt deleted file mode 100644 index 9c11ddeb..00000000 --- a/qai_hub_models/models/mediapipe_pose/requirements.txt +++ /dev/null @@ -1,2 +0,0 @@ -opencv-python -requests diff --git a/qai_hub_models/models/mediapipe_selfie/README.md b/qai_hub_models/models/mediapipe_selfie/README.md index 9c0602dc..6c8c4440 100644 --- a/qai_hub_models/models/mediapipe_selfie/README.md +++ b/qai_hub_models/models/mediapipe_selfie/README.md @@ -10,7 +10,7 @@ This is based on the implementation of MediaPipe-Selfie-Segmentation found export suitable to run on Qualcomm® devices. More details on model performance accross various devices, can be found [here](https://aihub.qualcomm.com/models/mediapipe_selfie). -[Sign up](https://aihub.qualcomm.com/) for early access to run these models on +[Sign up](https://myaccount.qualcomm.com/signup) for early access to run these models on a hosted Qualcomm® device. @@ -30,7 +30,7 @@ python -m qai_hub_models.models.mediapipe_selfie.demo More details on the CLI tool can be found with the `--help` option. See [demo.py](demo.py) for sample usage of the model including pre/post processing scripts. Please refer to our [general instructions on using -models](../../#qai-hub-models) for more usage instructions. +models](../../../#getting-started) for more usage instructions. ## Export for on-device deployment @@ -46,7 +46,7 @@ script requires access to Deployment instructions for Qualcomm® AI Hub. ## License - The license for the original implementation of MediaPipe-Selfie-Segmentation can be found [here](https://github.com/google/mediapipe/blob/master/LICENSE). -- The license for the compiled assets for on-device deployment can be found [here](https://qaihub-public-assets.s3.us-west-2.amazonaws.com/qai-hub-models/Qualcomm+AI+Hub+Proprietary+License.pdf). 
+- The license for the compiled assets for on-device deployment can be found [here]({deploy_license_url}) ## References * [Image segmentation guide](https://developers.google.com/mediapipe/solutions/vision/image_segmenter/) diff --git a/qai_hub_models/models/mediapipe_selfie/conftest.py b/qai_hub_models/models/mediapipe_selfie/conftest.py new file mode 100644 index 00000000..70353f17 --- /dev/null +++ b/qai_hub_models/models/mediapipe_selfie/conftest.py @@ -0,0 +1,24 @@ +# --------------------------------------------------------------------- +# Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. +# SPDX-License-Identifier: BSD-3-Clause +# --------------------------------------------------------------------- +# THIS FILE WAS AUTO-GENERATED. DO NOT EDIT MANUALLY. + +from unittest.mock import patch + +import pytest + +from qai_hub_models.models.mediapipe_selfie import Model + + +@pytest.fixture(autouse=True) +def mock_from_pretrained(): + """ + Model.from_pretrained() can be slow. Invoke it once and cache it so all invocations + across all tests return the cached instance of the model. + """ + mock = patch( + "qai_hub_models.models.mediapipe_selfie.Model.from_pretrained", + return_value=Model.from_pretrained(), + ) + mock.start() diff --git a/qai_hub_models/models/mediapipe_selfie/demo.py b/qai_hub_models/models/mediapipe_selfie/demo.py index 38c43043..3c611dfd 100644 --- a/qai_hub_models/models/mediapipe_selfie/demo.py +++ b/qai_hub_models/models/mediapipe_selfie/demo.py @@ -4,8 +4,6 @@ # --------------------------------------------------------------------- from __future__ import annotations -from typing import Type - from PIL.Image import fromarray from qai_hub_models.models.mediapipe_selfie.app import SelfieSegmentationApp @@ -15,13 +13,15 @@ SelfieSegmentation, ) from qai_hub_models.utils.args import ( - add_output_dir_arg, + demo_model_from_cli_args, get_model_cli_parser, - model_from_cli_args, + get_on_device_demo_parser, + validate_on_device_demo_args, ) from qai_hub_models.utils.asset_loaders import CachedWebModelAsset, load_image -from qai_hub_models.utils.base_model import BaseModel +from qai_hub_models.utils.base_model import TargetRuntime from qai_hub_models.utils.display import display_or_save_image +from qai_hub_models.utils.image_processing import pil_resize_pad, pil_undo_resize_pad IMAGE_ADDRESS = CachedWebModelAsset.from_asset_store( MODEL_ID, MODEL_ASSET_VERSION, "selfie.jpg" @@ -30,35 +30,38 @@ # Run selfie segmentation app end-to-end on a sample image. # The demo will display the predicted mask in a window. 
-def mediapipe_selfie_demo( - model_cls: Type[BaseModel], - default_image: str | CachedWebModelAsset, +def main( is_test: bool = False, ): # Demo parameters - parser = get_model_cli_parser(model_cls) + parser = get_model_cli_parser(SelfieSegmentation) + parser = get_on_device_demo_parser( + parser, available_target_runtimes=[TargetRuntime.TFLITE], add_output_dir=True + ) parser.add_argument( "--image", type=str, - default=default_image, + default=IMAGE_ADDRESS, help="File path or URL to an input image to use for the demo.", ) - add_output_dir_arg(parser) args = parser.parse_args([] if is_test else None) + validate_on_device_demo_args(args, MODEL_ID) # Load image & model - model = model_from_cli_args(model_cls, args) - print("Model loaded from pre-trained weights.") - image = load_image(args.image, verbose=True, desc="sample input image") + orig_image = load_image(args.image) + model = demo_model_from_cli_args(SelfieSegmentation, MODEL_ID, args) # Run app app = SelfieSegmentationApp(model) + (_, _, height, width) = SelfieSegmentation.get_input_spec()["image"][0] + + image, scale, padding = pil_resize_pad(orig_image, (height, width)) mask = app.predict(image) * 255.0 mask = fromarray(mask).convert("L") if not is_test: # Make sure the input image and mask are resized so the demo can visually # show the images in the same resolution. - image = image.resize(mask.size) + image = pil_undo_resize_pad(image, orig_image.size, scale, padding) display_or_save_image( image, args.output_dir, "mediapipe_selfie_image.png", "sample input image" ) @@ -67,13 +70,5 @@ def mediapipe_selfie_demo( ) -def main(is_test: bool = False): - mediapipe_selfie_demo( - SelfieSegmentation, - IMAGE_ADDRESS, - is_test, - ) - - if __name__ == "__main__": - main() + main(is_test=False) diff --git a/qai_hub_models/models/mediapipe_selfie/export.py b/qai_hub_models/models/mediapipe_selfie/export.py index b422b5e5..ce654b17 100644 --- a/qai_hub_models/models/mediapipe_selfie/export.py +++ b/qai_hub_models/models/mediapipe_selfie/export.py @@ -10,7 +10,7 @@ import os import warnings from pathlib import Path -from typing import List, Optional, Tuple +from typing import Any, Dict, List, Optional, Tuple, cast import qai_hub as hub import torch @@ -26,6 +26,7 @@ from qai_hub_models.utils.input_spec import make_torch_inputs from qai_hub_models.utils.printing import ( print_inference_metrics, + print_on_target_demo_cmd, print_profile_metrics_from_job, ) from qai_hub_models.utils.qai_hub_helpers import ( @@ -109,7 +110,7 @@ def export_model( ) # Trace the model - source_model = torch.jit.trace(model, make_torch_inputs(input_spec)) + source_model = torch.jit.trace(model.to("cpu"), make_torch_inputs(input_spec)) # 2. Compile the model to an on-device asset model_compile_options = model.get_hub_compile_options( @@ -118,29 +119,37 @@ def export_model( + " --force_channel_last_input image" + " --force_channel_last_output output_0", ) - print(f"Optimizing model {model_name} to run on-device.") - compile_job = hub.submit_compile_job( + print(f"Optimizing model {model_name} to run on-device") + submitted_compile_job = hub.submit_compile_job( model=source_model, input_specs=input_spec, device=hub.Device(device), name=model_name, options=model_compile_options, ) + compile_job = cast(hub.client.CompileJob, submitted_compile_job) # 3. 
Profile the model asset on real devices - profile_job = None + profile_job: Optional[hub.client.ProfileJob] = None if not skip_profiling: + profile_options_all = model.get_hub_profile_options( + target_runtime, profile_options + ) print(f"Profiling model {model_name} on a hosted device.") - profile_job = hub.submit_profile_job( + submitted_profile_job = hub.submit_profile_job( model=compile_job.get_target_model(), device=hub.Device(device), name=model_name, - options=profile_options, + options=profile_options_all, ) + profile_job = cast(hub.client.ProfileJob, submitted_profile_job) # 4. Run inference on-device with sample inputs - inference_job = None + inference_job: Optional[hub.client.InferenceJob] = None if not skip_inferencing: + profile_options_all = model.get_hub_profile_options( + target_runtime, profile_options + ) print( f"Running inference for {model_name} on a hosted device with example inputs." ) @@ -149,36 +158,40 @@ def export_model( hub_inputs = transpose_channel_first_to_last( "image", sample_inputs, target_runtime ) - inference_job = hub.submit_inference_job( + submitted_inference_job = hub.submit_inference_job( model=compile_job.get_target_model(), inputs=hub_inputs, device=hub.Device(device), name=model_name, - options=profile_options, + options=profile_options_all, ) + inference_job = cast(hub.client.InferenceJob, submitted_inference_job) # 5. Download the model asset to a local file if not skip_downloading: os.makedirs(output_path, exist_ok=True) - target_model = compile_job.get_target_model() + target_model: hub.Model = compile_job.get_target_model() # type: ignore target_model.download(str(output_path / f"{model_name}.tflite")) # 6. Summarize the results from profiling and inference if not skip_summary and not skip_profiling: - assert profile_job.wait().success - profile_data = profile_job.download_profile() + assert profile_job is not None and profile_job.wait().success + profile_data: Dict[str, Any] = profile_job.download_profile() # type: ignore print_profile_metrics_from_job(profile_job, profile_data) if not skip_summary and not skip_inferencing: torch_out = torch_inference(model, sample_inputs) - assert inference_job.wait().success - inference_result = inference_job.download_output_data() + assert inference_job is not None and inference_job.wait().success + inference_result: hub.client.DatasetEntries = inference_job.download_output_data() # type: ignore # Convert outputs from channel last to channel first inference_result = transpose_channel_last_to_first( "output_0", inference_result, target_runtime ) print_inference_metrics(inference_job, inference_result, torch_out) + if not skip_summary: + print_on_target_demo_cmd(compile_job, Path(__file__).parent.resolve(), device) + return (compile_job, profile_job, inference_job) diff --git a/qai_hub_models/models/mediapipe_selfie/info.yaml b/qai_hub_models/models/mediapipe_selfie/info.yaml index 3c85bfbc..f40e5ca0 100644 --- a/qai_hub_models/models/mediapipe_selfie/info.yaml +++ b/qai_hub_models/models/mediapipe_selfie/info.yaml @@ -12,6 +12,7 @@ tags: [] research_paper: https://developers.google.com/mediapipe/solutions/vision/image_segmenter/ research_paper_title: Image segmentation guide license: https://github.com/google/mediapipe/blob/master/LICENSE +deploy_license: https://qaihub-public-assets.s3.us-west-2.amazonaws.com/qai-hub-models/Qualcomm+AI+Hub+Proprietary+License.pdf source_repo: https://github.com/google/mediapipe/tree/master/mediapipe/modules/selfie_segmentation technical_details: @@ -36,4 +37,5 @@ 
form_factors: has_static_banner: yes has_animated_banner: yes license_type: apache-2.0 +deploy_license_type: AI Model Hub License dataset: [] diff --git a/qai_hub_models/models/mediapipe_selfie/model.py b/qai_hub_models/models/mediapipe_selfie/model.py index b9c65235..76bdd98f 100644 --- a/qai_hub_models/models/mediapipe_selfie/model.py +++ b/qai_hub_models/models/mediapipe_selfie/model.py @@ -197,8 +197,9 @@ def from_pretrained(cls, image_type: str = DEFAULT_IMAGE_TYPE): front_net.load_state_dict(front_state_dict, strict=True) return front_net - def get_input_spec(self, batch_size: int = 1) -> InputSpec: - if self.image_type == "square": + @staticmethod + def get_input_spec(batch_size: int = 1, image_type: str = "square") -> InputSpec: + if image_type == "square": height, width = 256, 256 else: height, width = 144, 256 diff --git a/qai_hub_models/models/mediapipe_selfie/perf.yaml b/qai_hub_models/models/mediapipe_selfie/perf.yaml index f2615794..8e81b1b0 100644 --- a/qai_hub_models/models/mediapipe_selfie/perf.yaml +++ b/qai_hub_models/models/mediapipe_selfie/perf.yaml @@ -17,22 +17,25 @@ aggregated: - Samsung Galaxy S23 - Samsung Galaxy S23 Ultra - Samsung Galaxy S23+ + - Samsung Galaxy S24 + - Samsung Galaxy S24 Ultra - Samsung Galaxy Tab S8 - Xiaomi 12 - Xiaomi 12 Pro supported_chipsets: - Snapdragon® 8 Gen 1 - Snapdragon® 8 Gen 2 + - Snapdragon® 8 Gen 3 - Snapdragon® 888 models: - name: MediaPipe-Selfie-Segmentation performance_metrics: - torchscript_onnx_tflite: - inference_time: 817.0 - throughput: 1223.9902080783354 + inference_time: 821.0 + throughput: 1218.026796589525 estimated_peak_memory_range: min: 12288 - max: 1802840 + max: 2051880 primary_compute_unit: NPU precision: fp16 layer_info: @@ -40,14 +43,22 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 118 - job_id: jygzljvz5 + job_id: j1p3kox52 job_status: Passed + reference_device_info: + name: Samsung Galaxy S23 + os: '13' + form_factor: Phone + os_name: Android + manufacturer: Samsung + chipset: Snapdragon® 8 Gen 2 + timestamp: '2024-03-15T00:06:02.750038Z' torchscript_onnx_qnn: - inference_time: 801.0 - throughput: 1248.4394506866417 + inference_time: 805.0 + throughput: 1242.2360248447205 estimated_peak_memory_range: - min: 811008 - max: 91168416 + min: 815104 + max: 4449664 primary_compute_unit: NPU precision: fp16 layer_info: @@ -55,13 +66,43 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 139 - job_id: jz5wl3mzp + job_id: j1pv3275x + job_status: Passed + - torchscript_onnx_tflite: + inference_time: 555.0 + throughput: 1801.8018018018017 + estimated_peak_memory_range: + min: 12288 + max: 22552848 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 118 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 118 + job_id: jwgoyd458 job_status: Passed reference_device_info: - name: Samsung Galaxy S23 Ultra - os: '13' + name: Samsung Galaxy S24 + os: '14' form_factor: Phone os_name: Android manufacturer: Samsung - chipset: Snapdragon® 8 Gen 2 - timestamp: '2024-02-21T16:39:49.005922Z' + chipset: Snapdragon® 8 Gen 3 + timestamp: '2024-03-15T00:09:04.960914Z' + torchscript_onnx_qnn: + inference_time: 550.0 + throughput: 1818.1818181818182 + estimated_peak_memory_range: + min: 176128 + max: 42597216 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 139 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 139 + job_id: jlpe967gr + job_status: Passed diff --git a/qai_hub_models/models/mnasnet05/README.md b/qai_hub_models/models/mnasnet05/README.md index 
4605ef09..c89a516d 100644 --- a/qai_hub_models/models/mnasnet05/README.md +++ b/qai_hub_models/models/mnasnet05/README.md @@ -10,7 +10,7 @@ This is based on the implementation of MNASNet05 found export suitable to run on Qualcomm® devices. More details on model performance accross various devices, can be found [here](https://aihub.qualcomm.com/models/mnasnet05). -[Sign up](https://aihub.qualcomm.com/) for early access to run these models on +[Sign up](https://myaccount.qualcomm.com/signup) for early access to run these models on a hosted Qualcomm® device. @@ -25,7 +25,7 @@ python -m qai_hub_models.models.mnasnet05.demo More details on the CLI tool can be found with the `--help` option. See [demo.py](demo.py) for sample usage of the model including pre/post processing scripts. Please refer to our [general instructions on using -models](../../#qai-hub-models) for more usage instructions. +models](../../../#getting-started) for more usage instructions. ## Export for on-device deployment @@ -41,7 +41,7 @@ script requires access to Deployment instructions for Qualcomm® AI Hub. ## License - The license for the original implementation of MNASNet05 can be found [here](https://github.com/pytorch/vision/blob/main/LICENSE). -- The license for the compiled assets for on-device deployment can be found [here](https://qaihub-public-assets.s3.us-west-2.amazonaws.com/qai-hub-models/Qualcomm+AI+Hub+Proprietary+License.pdf). +- The license for the compiled assets for on-device deployment can be found [here]({deploy_license_url}) ## References * [MnasNet: Platform-Aware Neural Architecture Search for Mobile](https://arxiv.org/abs/1807.11626) diff --git a/qai_hub_models/models/mnasnet05/conftest.py b/qai_hub_models/models/mnasnet05/conftest.py new file mode 100644 index 00000000..2e7f4bb5 --- /dev/null +++ b/qai_hub_models/models/mnasnet05/conftest.py @@ -0,0 +1,24 @@ +# --------------------------------------------------------------------- +# Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. +# SPDX-License-Identifier: BSD-3-Clause +# --------------------------------------------------------------------- +# THIS FILE WAS AUTO-GENERATED. DO NOT EDIT MANUALLY. + +from unittest.mock import patch + +import pytest + +from qai_hub_models.models.mnasnet05 import Model + + +@pytest.fixture(autouse=True) +def mock_from_pretrained(): + """ + Model.from_pretrained() can be slow. Invoke it once and cache it so all invocations + across all tests return the cached instance of the model. 
+ """ + mock = patch( + "qai_hub_models.models.mnasnet05.Model.from_pretrained", + return_value=Model.from_pretrained(), + ) + mock.start() diff --git a/qai_hub_models/models/mnasnet05/demo.py b/qai_hub_models/models/mnasnet05/demo.py index f674fb1c..1494a5b5 100644 --- a/qai_hub_models/models/mnasnet05/demo.py +++ b/qai_hub_models/models/mnasnet05/demo.py @@ -3,11 +3,11 @@ # SPDX-License-Identifier: BSD-3-Clause # --------------------------------------------------------------------- from qai_hub_models.models._shared.imagenet_classifier.demo import imagenet_demo -from qai_hub_models.models.mnasnet05.model import MNASNet05 +from qai_hub_models.models.mnasnet05.model import MODEL_ID, MNASNet05 def main(is_test: bool = False): - imagenet_demo(MNASNet05, is_test) + imagenet_demo(MNASNet05, MODEL_ID, is_test) if __name__ == "__main__": diff --git a/qai_hub_models/models/mnasnet05/export.py b/qai_hub_models/models/mnasnet05/export.py index 670aaac1..c500d613 100644 --- a/qai_hub_models/models/mnasnet05/export.py +++ b/qai_hub_models/models/mnasnet05/export.py @@ -10,7 +10,7 @@ import os import warnings from pathlib import Path -from typing import List, Optional, Tuple +from typing import Any, Dict, List, Optional, Tuple, cast import qai_hub as hub import torch @@ -109,35 +109,43 @@ def export_model( ) # Trace the model - source_model = torch.jit.trace(model, make_torch_inputs(input_spec)) + source_model = torch.jit.trace(model.to("cpu"), make_torch_inputs(input_spec)) # 2. Compile the model to an on-device asset model_compile_options = model.get_hub_compile_options( target_runtime, compile_options + " --force_channel_last_input image_tensor" ) - print(f"Optimizing model {model_name} to run on-device.") - compile_job = hub.submit_compile_job( + print(f"Optimizing model {model_name} to run on-device") + submitted_compile_job = hub.submit_compile_job( model=source_model, input_specs=input_spec, device=hub.Device(device), name=model_name, options=model_compile_options, ) + compile_job = cast(hub.client.CompileJob, submitted_compile_job) # 3. Profile the model asset on real devices - profile_job = None + profile_job: Optional[hub.client.ProfileJob] = None if not skip_profiling: + profile_options_all = model.get_hub_profile_options( + target_runtime, profile_options + ) print(f"Profiling model {model_name} on a hosted device.") - profile_job = hub.submit_profile_job( + submitted_profile_job = hub.submit_profile_job( model=compile_job.get_target_model(), device=hub.Device(device), name=model_name, - options=profile_options, + options=profile_options_all, ) + profile_job = cast(hub.client.ProfileJob, submitted_profile_job) # 4. Run inference on-device with sample inputs - inference_job = None + inference_job: Optional[hub.client.InferenceJob] = None if not skip_inferencing: + profile_options_all = model.get_hub_profile_options( + target_runtime, profile_options + ) print( f"Running inference for {model_name} on a hosted device with example inputs." ) @@ -146,33 +154,35 @@ def export_model( hub_inputs = transpose_channel_first_to_last( "image_tensor", sample_inputs, target_runtime ) - inference_job = hub.submit_inference_job( + submitted_inference_job = hub.submit_inference_job( model=compile_job.get_target_model(), inputs=hub_inputs, device=hub.Device(device), name=model_name, - options=profile_options, + options=profile_options_all, ) + inference_job = cast(hub.client.InferenceJob, submitted_inference_job) # 5. 
Download the model asset to a local file if not skip_downloading: os.makedirs(output_path, exist_ok=True) - target_model = compile_job.get_target_model() + target_model: hub.Model = compile_job.get_target_model() # type: ignore target_model.download(str(output_path / f"{model_name}.tflite")) # 6. Summarize the results from profiling and inference if not skip_summary and not skip_profiling: - assert profile_job.wait().success - profile_data = profile_job.download_profile() + assert profile_job is not None and profile_job.wait().success + profile_data: Dict[str, Any] = profile_job.download_profile() # type: ignore print_profile_metrics_from_job(profile_job, profile_data) if not skip_summary and not skip_inferencing: torch_out = torch_inference(model, sample_inputs) - assert inference_job.wait().success - inference_result = inference_job.download_output_data() + assert inference_job is not None and inference_job.wait().success + inference_result: hub.client.DatasetEntries = inference_job.download_output_data() # type: ignore print_inference_metrics(inference_job, inference_result, torch_out) - print_on_target_demo_cmd(compile_job, Path(__file__).parent.resolve(), device) + if not skip_summary: + print_on_target_demo_cmd(compile_job, Path(__file__).parent.resolve(), device) return (compile_job, profile_job, inference_job) diff --git a/qai_hub_models/models/mnasnet05/info.yaml b/qai_hub_models/models/mnasnet05/info.yaml index be98a382..c3ce91e0 100644 --- a/qai_hub_models/models/mnasnet05/info.yaml +++ b/qai_hub_models/models/mnasnet05/info.yaml @@ -13,6 +13,7 @@ tags: research_paper: https://arxiv.org/abs/1807.11626 research_paper_title: 'MnasNet: Platform-Aware Neural Architecture Search for Mobile' license: https://github.com/pytorch/vision/blob/main/LICENSE +deploy_license: https://qaihub-public-assets.s3.us-west-2.amazonaws.com/qai-hub-models/Qualcomm+AI+Hub+Proprietary+License.pdf source_repo: https://github.com/pytorch/vision/blob/main/torchvision/models/mnasnet.py technical_details: Model checkpoint: Imagenet @@ -35,6 +36,7 @@ form_factors: has_static_banner: yes has_animated_banner: yes license_type: bsd-3-clause +deploy_license_type: AI Model Hub License dataset: - imagenet-1k - imagenet-22k diff --git a/qai_hub_models/models/mnasnet05/model.py b/qai_hub_models/models/mnasnet05/model.py index 0562410a..1c4e4e86 100644 --- a/qai_hub_models/models/mnasnet05/model.py +++ b/qai_hub_models/models/mnasnet05/model.py @@ -14,6 +14,6 @@ class MNASNet05(ImagenetClassifier): @classmethod - def from_pretrained(cls, weights: str = DEFAULT_WEIGHTS) -> ImagenetClassifier: + def from_pretrained(cls, weights: str = DEFAULT_WEIGHTS) -> MNASNet05: net = tv_models.mnasnet0_5(weights=weights) return cls(net) diff --git a/qai_hub_models/models/mnasnet05/perf.yaml b/qai_hub_models/models/mnasnet05/perf.yaml index 8282b9b2..9536af51 100644 --- a/qai_hub_models/models/mnasnet05/perf.yaml +++ b/qai_hub_models/models/mnasnet05/perf.yaml @@ -17,22 +17,25 @@ aggregated: - Samsung Galaxy S23 - Samsung Galaxy S23 Ultra - Samsung Galaxy S23+ + - Samsung Galaxy S24 + - Samsung Galaxy S24 Ultra - Samsung Galaxy Tab S8 - Xiaomi 12 - Xiaomi 12 Pro supported_chipsets: - Snapdragon® 8 Gen 1 - Snapdragon® 8 Gen 2 + - Snapdragon® 8 Gen 3 - Snapdragon® 888 models: - name: MNASNet05 performance_metrics: - torchscript_onnx_tflite: - inference_time: 370.0 - throughput: 2702.7027027027025 + inference_time: 383.0 + throughput: 2610.9660574412533 estimated_peak_memory_range: - min: 12288 - max: 8955784 + min: 20480 + max: 1718480 
primary_compute_unit: NPU precision: fp16 layer_info: @@ -40,14 +43,22 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 69 - job_id: jmg9zyxvp + job_id: j1p8o1qg9 job_status: Passed + reference_device_info: + name: Samsung Galaxy S23 + os: '13' + form_factor: Phone + os_name: Android + manufacturer: Samsung + chipset: Snapdragon® 8 Gen 2 + timestamp: '2024-03-14T23:31:16.872390Z' torchscript_onnx_qnn: - inference_time: 367.0 - throughput: 2724.7956403269754 + inference_time: 358.0 + throughput: 2793.2960893854747 estimated_peak_memory_range: - min: 196608 - max: 36330664 + min: 634880 + max: 4722696 primary_compute_unit: NPU precision: fp16 layer_info: @@ -55,13 +66,43 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 102 - job_id: jnp1nwvlg + job_id: jn5q8ve57 + job_status: Passed + - torchscript_onnx_tflite: + inference_time: 282.0 + throughput: 3546.099290780142 + estimated_peak_memory_range: + min: 12288 + max: 44089552 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 69 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 69 + job_id: jogkz8vgd job_status: Passed reference_device_info: - name: Samsung Galaxy S23 Ultra - os: '13' + name: Samsung Galaxy S24 + os: '14' form_factor: Phone os_name: Android manufacturer: Samsung - chipset: Snapdragon® 8 Gen 2 - timestamp: '2024-02-21T16:13:59.738307Z' + chipset: Snapdragon® 8 Gen 3 + timestamp: '2024-03-14T23:34:31.653300Z' + torchscript_onnx_qnn: + inference_time: 260.0 + throughput: 3846.153846153846 + estimated_peak_memory_range: + min: 0 + max: 33635600 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 102 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 102 + job_id: j1glnl2pv + job_status: Passed diff --git a/qai_hub_models/models/mnasnet05/test.py b/qai_hub_models/models/mnasnet05/test.py index e3758c14..254e9de5 100644 --- a/qai_hub_models/models/mnasnet05/test.py +++ b/qai_hub_models/models/mnasnet05/test.py @@ -2,6 +2,8 @@ # Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. # SPDX-License-Identifier: BSD-3-Clause # --------------------------------------------------------------------- +import pytest + from qai_hub_models.models._shared.imagenet_classifier.test_utils import ( run_imagenet_classifier_test, run_imagenet_classifier_trace_test, @@ -16,6 +18,7 @@ def test_task(): ) +@pytest.mark.trace def test_trace(): run_imagenet_classifier_trace_test(MNASNet05.from_pretrained()) diff --git a/qai_hub_models/models/mobilenet_v2/README.md b/qai_hub_models/models/mobilenet_v2/README.md index 7d7910b8..65366af1 100644 --- a/qai_hub_models/models/mobilenet_v2/README.md +++ b/qai_hub_models/models/mobilenet_v2/README.md @@ -10,7 +10,7 @@ This is based on the implementation of MobileNet-v2 found export suitable to run on Qualcomm® devices. More details on model performance accross various devices, can be found [here](https://aihub.qualcomm.com/models/mobilenet_v2). -[Sign up](https://aihub.qualcomm.com/) for early access to run these models on +[Sign up](https://myaccount.qualcomm.com/signup) for early access to run these models on a hosted Qualcomm® device. @@ -25,7 +25,7 @@ python -m qai_hub_models.models.mobilenet_v2.demo More details on the CLI tool can be found with the `--help` option. See [demo.py](demo.py) for sample usage of the model including pre/post processing scripts. Please refer to our [general instructions on using -models](../../#qai-hub-models) for more usage instructions. 
+models](../../../#getting-started) for more usage instructions. ## Export for on-device deployment @@ -41,7 +41,7 @@ script requires access to Deployment instructions for Qualcomm® AI Hub. ## License - The license for the original implementation of MobileNet-v2 can be found [here](https://github.com/tonylins/pytorch-mobilenet-v2/blob/master/LICENSE). -- The license for the compiled assets for on-device deployment can be found [here](https://qaihub-public-assets.s3.us-west-2.amazonaws.com/qai-hub-models/Qualcomm+AI+Hub+Proprietary+License.pdf). +- The license for the compiled assets for on-device deployment can be found [here]({deploy_license_url}) ## References * [MobileNetV2: Inverted Residuals and Linear Bottlenecks](https://arxiv.org/abs/1801.04381) diff --git a/qai_hub_models/models/mobilenet_v2/conftest.py b/qai_hub_models/models/mobilenet_v2/conftest.py new file mode 100644 index 00000000..09f23c24 --- /dev/null +++ b/qai_hub_models/models/mobilenet_v2/conftest.py @@ -0,0 +1,26 @@ +# --------------------------------------------------------------------- +# Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. +# SPDX-License-Identifier: BSD-3-Clause +# --------------------------------------------------------------------- +# THIS FILE WAS AUTO-GENERATED. DO NOT EDIT MANUALLY. + +from unittest.mock import patch + +import pytest + +from qai_hub_models.models.mobilenet_v2 import Model +from qai_hub_models.utils.testing import skip_clone_repo_check + + +@pytest.fixture(autouse=True) +@skip_clone_repo_check +def mock_from_pretrained(): + """ + Model.from_pretrained() can be slow. Invoke it once and cache it so all invocations + across all tests return the cached instance of the model. + """ + mock = patch( + "qai_hub_models.models.mobilenet_v2.Model.from_pretrained", + return_value=Model.from_pretrained(), + ) + mock.start() diff --git a/qai_hub_models/models/mobilenet_v2/demo.py b/qai_hub_models/models/mobilenet_v2/demo.py index b2100921..b91c82ce 100644 --- a/qai_hub_models/models/mobilenet_v2/demo.py +++ b/qai_hub_models/models/mobilenet_v2/demo.py @@ -3,11 +3,11 @@ # SPDX-License-Identifier: BSD-3-Clause # --------------------------------------------------------------------- from qai_hub_models.models._shared.imagenet_classifier.demo import imagenet_demo -from qai_hub_models.models.mobilenet_v2.model import MobileNetV2 +from qai_hub_models.models.mobilenet_v2.model import MODEL_ID, MobileNetV2 def main(is_test: bool = False): - imagenet_demo(MobileNetV2, is_test) + imagenet_demo(MobileNetV2, MODEL_ID, is_test) if __name__ == "__main__": diff --git a/qai_hub_models/models/mobilenet_v2/export.py b/qai_hub_models/models/mobilenet_v2/export.py index c9c07912..d5d83eb6 100644 --- a/qai_hub_models/models/mobilenet_v2/export.py +++ b/qai_hub_models/models/mobilenet_v2/export.py @@ -10,7 +10,7 @@ import os import warnings from pathlib import Path -from typing import List, Optional, Tuple +from typing import Any, Dict, List, Optional, Tuple, cast import qai_hub as hub import torch @@ -109,35 +109,43 @@ def export_model( ) # Trace the model - source_model = torch.jit.trace(model, make_torch_inputs(input_spec)) + source_model = torch.jit.trace(model.to("cpu"), make_torch_inputs(input_spec)) # 2. 
Compile the model to an on-device asset model_compile_options = model.get_hub_compile_options( target_runtime, compile_options + " --force_channel_last_input image_tensor" ) - print(f"Optimizing model {model_name} to run on-device.") - compile_job = hub.submit_compile_job( + print(f"Optimizing model {model_name} to run on-device") + submitted_compile_job = hub.submit_compile_job( model=source_model, input_specs=input_spec, device=hub.Device(device), name=model_name, options=model_compile_options, ) + compile_job = cast(hub.client.CompileJob, submitted_compile_job) # 3. Profile the model asset on real devices - profile_job = None + profile_job: Optional[hub.client.ProfileJob] = None if not skip_profiling: + profile_options_all = model.get_hub_profile_options( + target_runtime, profile_options + ) print(f"Profiling model {model_name} on a hosted device.") - profile_job = hub.submit_profile_job( + submitted_profile_job = hub.submit_profile_job( model=compile_job.get_target_model(), device=hub.Device(device), name=model_name, - options=profile_options, + options=profile_options_all, ) + profile_job = cast(hub.client.ProfileJob, submitted_profile_job) # 4. Run inference on-device with sample inputs - inference_job = None + inference_job: Optional[hub.client.InferenceJob] = None if not skip_inferencing: + profile_options_all = model.get_hub_profile_options( + target_runtime, profile_options + ) print( f"Running inference for {model_name} on a hosted device with example inputs." ) @@ -146,33 +154,35 @@ def export_model( hub_inputs = transpose_channel_first_to_last( "image_tensor", sample_inputs, target_runtime ) - inference_job = hub.submit_inference_job( + submitted_inference_job = hub.submit_inference_job( model=compile_job.get_target_model(), inputs=hub_inputs, device=hub.Device(device), name=model_name, - options=profile_options, + options=profile_options_all, ) + inference_job = cast(hub.client.InferenceJob, submitted_inference_job) # 5. Download the model asset to a local file if not skip_downloading: os.makedirs(output_path, exist_ok=True) - target_model = compile_job.get_target_model() + target_model: hub.Model = compile_job.get_target_model() # type: ignore target_model.download(str(output_path / f"{model_name}.tflite")) # 6. 
Summarize the results from profiling and inference if not skip_summary and not skip_profiling: - assert profile_job.wait().success - profile_data = profile_job.download_profile() + assert profile_job is not None and profile_job.wait().success + profile_data: Dict[str, Any] = profile_job.download_profile() # type: ignore print_profile_metrics_from_job(profile_job, profile_data) if not skip_summary and not skip_inferencing: torch_out = torch_inference(model, sample_inputs) - assert inference_job.wait().success - inference_result = inference_job.download_output_data() + assert inference_job is not None and inference_job.wait().success + inference_result: hub.client.DatasetEntries = inference_job.download_output_data() # type: ignore print_inference_metrics(inference_job, inference_result, torch_out) - print_on_target_demo_cmd(compile_job, Path(__file__).parent.resolve(), device) + if not skip_summary: + print_on_target_demo_cmd(compile_job, Path(__file__).parent.resolve(), device) return (compile_job, profile_job, inference_job) diff --git a/qai_hub_models/models/mobilenet_v2/info.yaml b/qai_hub_models/models/mobilenet_v2/info.yaml index 977fd7ed..693da4bb 100644 --- a/qai_hub_models/models/mobilenet_v2/info.yaml +++ b/qai_hub_models/models/mobilenet_v2/info.yaml @@ -14,6 +14,7 @@ tags: research_paper: https://arxiv.org/abs/1801.04381 research_paper_title: 'MobileNetV2: Inverted Residuals and Linear Bottlenecks' license: https://github.com/tonylins/pytorch-mobilenet-v2/blob/master/LICENSE +deploy_license: https://qaihub-public-assets.s3.us-west-2.amazonaws.com/qai-hub-models/Qualcomm+AI+Hub+Proprietary+License.pdf source_repo: https://github.com/tonylins/pytorch-mobilenet-v2/tree/master technical_details: Model checkpoint: Imagenet @@ -37,6 +38,7 @@ form_factors: has_static_banner: yes has_animated_banner: yes license_type: bsd-3-clause +deploy_license_type: AI Model Hub License dataset: - imagenet-1k - imagenet-22k diff --git a/qai_hub_models/models/mobilenet_v2/model.py b/qai_hub_models/models/mobilenet_v2/model.py index b8ba2bc4..3af9c3e9 100644 --- a/qai_hub_models/models/mobilenet_v2/model.py +++ b/qai_hub_models/models/mobilenet_v2/model.py @@ -29,10 +29,10 @@ def __init__( super().__init__(mobilenet_v2_model) @classmethod - def from_pretrained(cls) -> MobileNetV2: + def from_pretrained(cls, weights: str = MOBILENETV2_WEIGHTS) -> MobileNetV2: model = _load_mobilenet_v2_source_model() checkpoint_path = CachedWebModelAsset.from_asset_store( - MODEL_ID, MODEL_ASSET_VERSION, MOBILENETV2_WEIGHTS + MODEL_ID, MODEL_ASSET_VERSION, weights ).fetch() checkpoint = torch.load(checkpoint_path, map_location=torch.device("cpu")) # rename classifier.1.weight -> classifier.weight, and bias similarly @@ -45,9 +45,7 @@ def from_pretrained(cls) -> MobileNetV2: return cls(model) -def _load_mobilenet_v2_source_model( - keep_sys_path=False, -) -> torch.nn.Module: +def _load_mobilenet_v2_source_model() -> torch.nn.Module: cfg_path = CachedWebModelAsset.from_asset_store( MODEL_ID, MODEL_ASSET_VERSION, MOBILENETV2_CFG ).fetch() @@ -58,7 +56,6 @@ def _load_mobilenet_v2_source_model( MOBILENETV2_SOURCE_REPO_COMMIT, MODEL_ID, MODEL_ASSET_VERSION, - keep_sys_path=keep_sys_path, ): # necessary import. `modeling.deeplab` comes from the DeepLabV3 repo. 
from MobileNetV2 import MobileNetV2 as _MobileNetV2 diff --git a/qai_hub_models/models/mobilenet_v2/perf.yaml b/qai_hub_models/models/mobilenet_v2/perf.yaml index 7aa2220c..eaee0237 100644 --- a/qai_hub_models/models/mobilenet_v2/perf.yaml +++ b/qai_hub_models/models/mobilenet_v2/perf.yaml @@ -17,22 +17,25 @@ aggregated: - Samsung Galaxy S23 - Samsung Galaxy S23 Ultra - Samsung Galaxy S23+ + - Samsung Galaxy S24 + - Samsung Galaxy S24 Ultra - Samsung Galaxy Tab S8 - Xiaomi 12 - Xiaomi 12 Pro supported_chipsets: - Snapdragon® 8 Gen 1 - Snapdragon® 8 Gen 2 + - Snapdragon® 8 Gen 3 - Snapdragon® 888 models: - name: MobileNet-v2 performance_metrics: - torchscript_onnx_tflite: - inference_time: 533.0 - throughput: 1876.172607879925 + inference_time: 540.0 + throughput: 1851.851851851852 estimated_peak_memory_range: - min: 20480 - max: 1466112 + min: 12288 + max: 1921936 primary_compute_unit: NPU precision: fp16 layer_info: @@ -40,14 +43,22 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 70 - job_id: jep2r9vmg + job_id: jygzeyzg8 job_status: Passed + reference_device_info: + name: Samsung Galaxy S23 + os: '13' + form_factor: Phone + os_name: Android + manufacturer: Samsung + chipset: Snapdragon® 8 Gen 2 + timestamp: '2024-03-14T23:53:26.997975Z' torchscript_onnx_qnn: - inference_time: 809.0 - throughput: 1236.0939431396787 + inference_time: 808.0 + throughput: 1237.6237623762377 estimated_peak_memory_range: - min: 618496 - max: 5733064 + min: 622592 + max: 6011376 primary_compute_unit: NPU precision: fp16 layer_info: @@ -55,13 +66,43 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 104 - job_id: jqpyoj745 + job_id: jmg9v2q57 + job_status: Passed + - torchscript_onnx_tflite: + inference_time: 393.0 + throughput: 2544.529262086514 + estimated_peak_memory_range: + min: 12288 + max: 55502880 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 70 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 70 + job_id: jz5wozzp1 job_status: Passed reference_device_info: - name: Samsung Galaxy S23 Ultra - os: '13' + name: Samsung Galaxy S24 + os: '14' form_factor: Phone os_name: Android manufacturer: Samsung - chipset: Snapdragon® 8 Gen 2 - timestamp: '2024-02-21T16:29:39.371442Z' + chipset: Snapdragon® 8 Gen 3 + timestamp: '2024-03-14T23:57:53.996541Z' + torchscript_onnx_qnn: + inference_time: 537.0 + throughput: 1862.1973929236499 + estimated_peak_memory_range: + min: 618496 + max: 37101856 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 104 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 104 + job_id: jnp101k5q + job_status: Passed diff --git a/qai_hub_models/models/mobilenet_v2/test.py b/qai_hub_models/models/mobilenet_v2/test.py index 3b688c68..9a629c73 100644 --- a/qai_hub_models/models/mobilenet_v2/test.py +++ b/qai_hub_models/models/mobilenet_v2/test.py @@ -2,6 +2,8 @@ # Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. 
# SPDX-License-Identifier: BSD-3-Clause # --------------------------------------------------------------------- +import pytest + from qai_hub_models.models._shared.imagenet_classifier.test_utils import ( run_imagenet_classifier_test, run_imagenet_classifier_trace_test, @@ -25,6 +27,7 @@ def test_task(): ) +@pytest.mark.trace @skip_clone_repo_check def test_trace(): run_imagenet_classifier_trace_test(MobileNetV2.from_pretrained()) diff --git a/qai_hub_models/models/mobilenet_v2_quantized/README.md b/qai_hub_models/models/mobilenet_v2_quantized/README.md index 541b9418..7a3bcf84 100644 --- a/qai_hub_models/models/mobilenet_v2_quantized/README.md +++ b/qai_hub_models/models/mobilenet_v2_quantized/README.md @@ -10,7 +10,7 @@ This is based on the implementation of MobileNet-v2-Quantized found export suitable to run on Qualcomm® devices. More details on model performance accross various devices, can be found [here](https://aihub.qualcomm.com/models/mobilenet_v2_quantized). -[Sign up](https://aihub.qualcomm.com/) for early access to run these models on +[Sign up](https://myaccount.qualcomm.com/signup) for early access to run these models on a hosted Qualcomm® device. @@ -25,7 +25,7 @@ python -m qai_hub_models.models.mobilenet_v2_quantized.demo More details on the CLI tool can be found with the `--help` option. See [demo.py](demo.py) for sample usage of the model including pre/post processing scripts. Please refer to our [general instructions on using -models](../../#qai-hub-models) for more usage instructions. +models](../../../#getting-started) for more usage instructions. ## Export for on-device deployment @@ -41,7 +41,7 @@ script requires access to Deployment instructions for Qualcomm® AI Hub. ## License - The license for the original implementation of MobileNet-v2-Quantized can be found [here](https://github.com/quic/aimet-model-zoo/blob/develop/LICENSE.pdf). -- The license for the compiled assets for on-device deployment can be found [here](https://qaihub-public-assets.s3.us-west-2.amazonaws.com/qai-hub-models/Qualcomm+AI+Hub+Proprietary+License.pdf). +- The license for the compiled assets for on-device deployment can be found [here]({deploy_license_url}) ## References * [MobileNetV2: Inverted Residuals and Linear Bottlenecks](https://arxiv.org/abs/1801.04381) diff --git a/qai_hub_models/models/mobilenet_v2_quantized/conftest.py b/qai_hub_models/models/mobilenet_v2_quantized/conftest.py new file mode 100644 index 00000000..0ce2d5b9 --- /dev/null +++ b/qai_hub_models/models/mobilenet_v2_quantized/conftest.py @@ -0,0 +1,26 @@ +# --------------------------------------------------------------------- +# Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. +# SPDX-License-Identifier: BSD-3-Clause +# --------------------------------------------------------------------- +# THIS FILE WAS AUTO-GENERATED. DO NOT EDIT MANUALLY. + +from unittest.mock import patch + +import pytest + +from qai_hub_models.models.mobilenet_v2_quantized import Model +from qai_hub_models.utils.testing import skip_clone_repo_check + + +@pytest.fixture(autouse=True) +@skip_clone_repo_check +def mock_from_pretrained(): + """ + Model.from_pretrained() can be slow. Invoke it once and cache it so all invocations + across all tests return the cached instance of the model. 
+ """ + mock = patch( + "qai_hub_models.models.mobilenet_v2_quantized.Model.from_pretrained", + return_value=Model.from_pretrained(), + ) + mock.start() diff --git a/qai_hub_models/models/mobilenet_v2_quantized/demo.py b/qai_hub_models/models/mobilenet_v2_quantized/demo.py index 89bb7cce..7bf265df 100644 --- a/qai_hub_models/models/mobilenet_v2_quantized/demo.py +++ b/qai_hub_models/models/mobilenet_v2_quantized/demo.py @@ -3,11 +3,14 @@ # SPDX-License-Identifier: BSD-3-Clause # --------------------------------------------------------------------- from qai_hub_models.models._shared.imagenet_classifier.demo import imagenet_demo -from qai_hub_models.models.mobilenet_v2_quantized.model import MobileNetV2Quantizable +from qai_hub_models.models.mobilenet_v2_quantized.model import ( + MODEL_ID, + MobileNetV2Quantizable, +) def main(is_test: bool = False): - imagenet_demo(MobileNetV2Quantizable, is_test) + imagenet_demo(MobileNetV2Quantizable, MODEL_ID, is_test) if __name__ == "__main__": diff --git a/qai_hub_models/models/mobilenet_v2_quantized/export.py b/qai_hub_models/models/mobilenet_v2_quantized/export.py index 63655d62..f73c0796 100644 --- a/qai_hub_models/models/mobilenet_v2_quantized/export.py +++ b/qai_hub_models/models/mobilenet_v2_quantized/export.py @@ -10,7 +10,7 @@ import os import warnings from pathlib import Path -from typing import List, Optional, Tuple +from typing import Any, Dict, List, Optional, Tuple, cast import qai_hub as hub @@ -120,8 +120,8 @@ def export_model( model_compile_options = model.get_hub_compile_options( target_runtime, compile_options + " --force_channel_last_input image_tensor" ) - print(f"Optimizing model {model_name} to run on-device.") - compile_job = hub.submit_compile_job( + print(f"Optimizing model {model_name} to run on-device") + submitted_compile_job = hub.submit_compile_job( model=source_model, input_specs=input_spec, device=hub.Device(device), @@ -129,21 +129,29 @@ def export_model( calibration_data=quant_calibration_data, options=model_compile_options, ) + compile_job = cast(hub.client.CompileJob, submitted_compile_job) # 3. Profile the model asset on real devices - profile_job = None + profile_job: Optional[hub.client.ProfileJob] = None if not skip_profiling: + profile_options_all = model.get_hub_profile_options( + target_runtime, profile_options + ) print(f"Profiling model {model_name} on a hosted device.") - profile_job = hub.submit_profile_job( + submitted_profile_job = hub.submit_profile_job( model=compile_job.get_target_model(), device=hub.Device(device), name=model_name, - options=profile_options, + options=profile_options_all, ) + profile_job = cast(hub.client.ProfileJob, submitted_profile_job) # 4. Run inference on-device with sample inputs - inference_job = None + inference_job: Optional[hub.client.InferenceJob] = None if not skip_inferencing: + profile_options_all = model.get_hub_profile_options( + target_runtime, profile_options + ) print( f"Running inference for {model_name} on a hosted device with example inputs." ) @@ -155,35 +163,37 @@ def export_model( hub_inputs = transpose_channel_first_to_last( "image_tensor", sample_inputs, target_runtime ) - inference_job = hub.submit_inference_job( + submitted_inference_job = hub.submit_inference_job( model=compile_job.get_target_model(), inputs=hub_inputs, device=hub.Device(device), name=model_name, - options=profile_options, + options=profile_options_all, ) + inference_job = cast(hub.client.InferenceJob, submitted_inference_job) # 5. 
Download the model asset to a local file if not skip_downloading: os.makedirs(output_path, exist_ok=True) - target_model = compile_job.get_target_model() + target_model: hub.Model = compile_job.get_target_model() # type: ignore target_model.download(str(output_path / f"{model_name}.tflite")) # 6. Summarize the results from profiling and inference if not skip_summary and not skip_profiling: - assert profile_job.wait().success - profile_data = profile_job.download_profile() + assert profile_job is not None and profile_job.wait().success + profile_data: Dict[str, Any] = profile_job.download_profile() # type: ignore print_profile_metrics_from_job(profile_job, profile_data) if not skip_summary and not skip_inferencing: torch_out = torch_inference(model, sample_inputs) - assert inference_job.wait().success - inference_result = inference_job.download_output_data() + assert inference_job is not None and inference_job.wait().success + inference_result: hub.client.DatasetEntries = inference_job.download_output_data() # type: ignore print_inference_metrics( inference_job, inference_result, torch_out, metrics="psnr,top1,top5" ) - print_on_target_demo_cmd(compile_job, Path(__file__).parent.resolve(), device) + if not skip_summary: + print_on_target_demo_cmd(compile_job, Path(__file__).parent.resolve(), device) return (compile_job, profile_job, inference_job) diff --git a/qai_hub_models/models/mobilenet_v2_quantized/info.yaml b/qai_hub_models/models/mobilenet_v2_quantized/info.yaml index 21ac3a89..302fcc0a 100644 --- a/qai_hub_models/models/mobilenet_v2_quantized/info.yaml +++ b/qai_hub_models/models/mobilenet_v2_quantized/info.yaml @@ -14,6 +14,7 @@ tags: research_paper: https://arxiv.org/abs/1801.04381 research_paper_title: 'MobileNetV2: Inverted Residuals and Linear Bottlenecks' license: https://github.com/quic/aimet-model-zoo/blob/develop/LICENSE.pdf +deploy_license: https://qaihub-public-assets.s3.us-west-2.amazonaws.com/qai-hub-models/Qualcomm+AI+Hub+Proprietary+License.pdf source_repo: https://github.com/quic/aimet-model-zoo/tree/develop/aimet_zoo_torch/mobilenetv2 technical_details: Model checkpoint: Imagenet @@ -37,6 +38,7 @@ form_factors: has_static_banner: yes has_animated_banner: yes license_type: bsd-3-clause +deploy_license_type: AI Model Hub License dataset: - imagenet-1k - imagenet-22k diff --git a/qai_hub_models/models/mobilenet_v2_quantized/model.py b/qai_hub_models/models/mobilenet_v2_quantized/model.py index 5c922ffe..d72efcb4 100644 --- a/qai_hub_models/models/mobilenet_v2_quantized/model.py +++ b/qai_hub_models/models/mobilenet_v2_quantized/model.py @@ -8,13 +8,13 @@ # This verifies aimet is installed, and this must be included first. 
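The export flow above now types `profile_job` and `inference_job` as `Optional`, so callers should expect `None` whenever the corresponding stage is skipped. A minimal sketch of a programmatic call, assuming Qualcomm AI Hub credentials are already configured; the device name is only an example:

```python
# Illustrative call into the generated export script; profiling and inference
# are skipped here, so the corresponding return values are None.
from qai_hub_models.models.mobilenet_v2_quantized.export import export_model

compile_job, profile_job, inference_job = export_model(
    device="Samsung Galaxy S23",
    skip_profiling=True,
    skip_inferencing=True,
)
assert profile_job is None and inference_job is None
```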
from qai_hub_models.utils.quantization_aimet import ( AIMETQuantizableMixin, - HubCompileOptionsInt8Mixin, ) # isort: on import torch from aimet_torch.cross_layer_equalization import equalize_model +from aimet_torch.model_preparer import prepare_model from aimet_torch.quantsim import QuantizationSimModel, load_encodings_to_sim from qai_hub_models.models.mobilenet_v2.model import ( @@ -24,18 +24,17 @@ from qai_hub_models.utils.aimet.config_loader import get_default_aimet_config from qai_hub_models.utils.asset_loaders import CachedWebModelAsset from qai_hub_models.utils.base_model import SourceModelFormat, TargetRuntime +from qai_hub_models.utils.quantization_aimet import convert_all_depthwise_to_per_tensor MODEL_ID = __name__.split(".")[-2] -MODEL_ASSET_VERSION = 2 +MODEL_ASSET_VERSION = 3 # Weights downloaded from https://github.com/quic/aimet-model-zoo/releases/download/phase_2_january_artifacts/torch_mobilenetv2_w8a8_state_dict.pth QUANTIZED_WEIGHTS = "torch_mobilenetv2_w8a8_state_dict.pth" -DEFAULT_ENCODINGS = "encodings.json" +DEFAULT_ENCODINGS = "mobilenet_v2_quantized_encodings.json" -class MobileNetV2Quantizable( - HubCompileOptionsInt8Mixin, AIMETQuantizableMixin, MobileNetV2 -): +class MobileNetV2Quantizable(AIMETQuantizableMixin, MobileNetV2): """MobileNetV2 with post train quantization support.""" def __init__( @@ -66,13 +65,12 @@ def from_pretrained( else: Interprets as a filepath and loads the encodings stored there. """ # Load Model - model_fp32 = _load_mobilenet_v2_source_model( - keep_sys_path=True, - ) - input_shape = MobileNetV2(None).get_input_spec()["image_tensor"][0] + model = _load_mobilenet_v2_source_model() + input_shape = cls.get_input_spec()["image_tensor"][0] # Following # https://github.com/quic/aimet-model-zoo/blob/develop/aimet_zoo_torch/mobilenetv2/model/model_definition.py#L64 - equalize_model(model_fp32, input_shape) + model = prepare_model(model) + equalize_model(model, input_shape) # Download weights and quantization parameters weights = CachedWebModelAsset.from_asset_store( @@ -80,21 +78,22 @@ def from_pretrained( ).fetch() aimet_config = get_default_aimet_config() - # Load the QAT/PTQ tuned model_fp32 weights + # Load the QAT/PTQ tuned model weights checkpoint = torch.load(weights, map_location=torch.device("cpu")) state_dict = { k.replace("classifier.1", "classifier"): v for k, v in checkpoint["state_dict"].items() } - model_fp32.load_state_dict(state_dict) + model.load_state_dict(state_dict) sim = QuantizationSimModel( - model_fp32, + model, quant_scheme="tf_enhanced", default_param_bw=8, default_output_bw=8, config_file=aimet_config, dummy_input=torch.rand(input_shape), ) + convert_all_depthwise_to_per_tensor(sim.model) if aimet_encodings: if aimet_encodings == "DEFAULT": @@ -105,3 +104,11 @@ def from_pretrained( sim.model.eval() return cls(sim) + + def get_hub_compile_options( + self, target_runtime: TargetRuntime, other_compile_options: str = "" + ) -> str: + compile_options = super().get_hub_compile_options( + target_runtime, other_compile_options + ) + return compile_options + " --quantize_full_type int8 --quantize_io" diff --git a/qai_hub_models/models/mobilenet_v2_quantized/perf.yaml b/qai_hub_models/models/mobilenet_v2_quantized/perf.yaml index 9521afec..55ab7eff 100644 --- a/qai_hub_models/models/mobilenet_v2_quantized/perf.yaml +++ b/qai_hub_models/models/mobilenet_v2_quantized/perf.yaml @@ -17,51 +17,92 @@ aggregated: - Samsung Galaxy S23 - Samsung Galaxy S23 Ultra - Samsung Galaxy S23+ + - Samsung Galaxy S24 + - Samsung Galaxy S24 Ultra - 
Samsung Galaxy Tab S8 - Xiaomi 12 - Xiaomi 12 Pro supported_chipsets: - Snapdragon® 8 Gen 1 - Snapdragon® 8 Gen 2 + - Snapdragon® 8 Gen 3 - Snapdragon® 888 models: - name: MobileNet-v2-Quantized performance_metrics: - torchscript_onnx_tflite: - inference_time: 240.0 - throughput: 4166.666666666667 + inference_time: 237.0 + throughput: 4219.4092827004215 estimated_peak_memory_range: min: 12288 - max: 1557248 + max: 1520264 primary_compute_unit: NPU - precision: fp16 + precision: int8 layer_info: layers_on_npu: 70 layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 70 - job_id: j1p8em3zp + job_id: j1p3klz52 job_status: Passed + reference_device_info: + name: Samsung Galaxy S23 + os: '13' + form_factor: Phone + os_name: Android + manufacturer: Samsung + chipset: Snapdragon® 8 Gen 2 + timestamp: '2024-03-14T23:42:55.837359Z' torchscript_onnx_qnn: - inference_time: 'null' - throughput: 'null' + inference_time: 352.0 + throughput: 2840.909090909091 estimated_peak_memory_range: - min: 0 - max: 0 - primary_compute_unit: 'null' - precision: 'null' + min: 135168 + max: 94316568 + primary_compute_unit: NPU + precision: int8 layer_info: - layers_on_npu: 0 + layers_on_npu: 69 layers_on_gpu: 0 layers_on_cpu: 0 - total_layers: 0 - job_id: '' - job_status: Skipped + total_layers: 69 + job_id: j1pv3ym5x + job_status: Passed + - torchscript_onnx_tflite: + inference_time: 168.0 + throughput: 5952.380952380952 + estimated_peak_memory_range: + min: 12288 + max: 35960128 + primary_compute_unit: NPU + precision: int8 + layer_info: + layers_on_npu: 70 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 70 + job_id: jwgoy7d58 + job_status: Passed reference_device_info: - name: Samsung Galaxy S23 Ultra - os: '13' + name: Samsung Galaxy S24 + os: '14' form_factor: Phone os_name: Android manufacturer: Samsung - chipset: Snapdragon® 8 Gen 2 - timestamp: '2024-02-21T16:15:21.382192Z' + chipset: Snapdragon® 8 Gen 3 + timestamp: '2024-03-14T23:47:22.207861Z' + torchscript_onnx_qnn: + inference_time: 253.0 + throughput: 3952.5691699604745 + estimated_peak_memory_range: + min: 163840 + max: 35983856 + primary_compute_unit: NPU + precision: int8 + layer_info: + layers_on_npu: 69 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 69 + job_id: j7gjx68pd + job_status: Passed diff --git a/qai_hub_models/models/mobilenet_v2_quantized/test.py b/qai_hub_models/models/mobilenet_v2_quantized/test.py index b6c94ba4..9837761a 100644 --- a/qai_hub_models/models/mobilenet_v2_quantized/test.py +++ b/qai_hub_models/models/mobilenet_v2_quantized/test.py @@ -4,7 +4,6 @@ # --------------------------------------------------------------------- from qai_hub_models.models._shared.imagenet_classifier.test_utils import ( run_imagenet_classifier_test, - run_imagenet_classifier_trace_test, ) from qai_hub_models.models.mobilenet_v2_quantized.demo import main as demo_main from qai_hub_models.models.mobilenet_v2_quantized.model import ( @@ -26,15 +25,6 @@ def test_task(): ) -@skip_clone_repo_check -def test_trace(): - run_imagenet_classifier_trace_test( - MobileNetV2Quantizable.from_pretrained(), - is_quantized=True, - atol=0.03, - ) - - @skip_clone_repo_check def test_demo(): # Verify demo does not crash diff --git a/qai_hub_models/models/mobilenet_v3_large/README.md b/qai_hub_models/models/mobilenet_v3_large/README.md index d36f9fa9..dc1194bc 100644 --- a/qai_hub_models/models/mobilenet_v3_large/README.md +++ b/qai_hub_models/models/mobilenet_v3_large/README.md @@ -3,14 +3,14 @@ # [MobileNet-v3-Large: Imagenet classifier and general purpose 
backbone](https://aihub.qualcomm.com/models/mobilenet_v3_large) -MobileNetV3Large is a machine learning model that can classify images from the Imagenet dataset. It can also be used as a backbone in building more complex models for specific use cases. +MobileNet-v3-Large is a machine learning model that can classify images from the Imagenet dataset. It can also be used as a backbone in building more complex models for specific use cases. This is based on the implementation of MobileNet-v3-Large found [here](https://github.com/pytorch/vision/blob/main/torchvision/models/mobilenetv3.py). This repository contains scripts for optimized on-device export suitable to run on Qualcomm® devices. More details on model performance accross various devices, can be found [here](https://aihub.qualcomm.com/models/mobilenet_v3_large). -[Sign up](https://aihub.qualcomm.com/) for early access to run these models on +[Sign up](https://myaccount.qualcomm.com/signup) for early access to run these models on a hosted Qualcomm® device. @@ -25,7 +25,7 @@ python -m qai_hub_models.models.mobilenet_v3_large.demo More details on the CLI tool can be found with the `--help` option. See [demo.py](demo.py) for sample usage of the model including pre/post processing scripts. Please refer to our [general instructions on using -models](../../#qai-hub-models) for more usage instructions. +models](../../../#getting-started) for more usage instructions. ## Export for on-device deployment @@ -41,7 +41,7 @@ script requires access to Deployment instructions for Qualcomm® AI Hub. ## License - The license for the original implementation of MobileNet-v3-Large can be found [here](https://github.com/pytorch/vision/blob/main/LICENSE). -- The license for the compiled assets for on-device deployment can be found [here](https://qaihub-public-assets.s3.us-west-2.amazonaws.com/qai-hub-models/Qualcomm+AI+Hub+Proprietary+License.pdf). +- The license for the compiled assets for on-device deployment can be found [here]({deploy_license_url}) ## References * [Searching for MobileNetV3](https://arxiv.org/abs/1905.02244) diff --git a/qai_hub_models/models/mobilenet_v3_large/conftest.py b/qai_hub_models/models/mobilenet_v3_large/conftest.py new file mode 100644 index 00000000..9733dfed --- /dev/null +++ b/qai_hub_models/models/mobilenet_v3_large/conftest.py @@ -0,0 +1,24 @@ +# --------------------------------------------------------------------- +# Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. +# SPDX-License-Identifier: BSD-3-Clause +# --------------------------------------------------------------------- +# THIS FILE WAS AUTO-GENERATED. DO NOT EDIT MANUALLY. + +from unittest.mock import patch + +import pytest + +from qai_hub_models.models.mobilenet_v3_large import Model + + +@pytest.fixture(autouse=True) +def mock_from_pretrained(): + """ + Model.from_pretrained() can be slow. Invoke it once and cache it so all invocations + across all tests return the cached instance of the model. 
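The auto-generated conftest above patches `Model.from_pretrained` with a single cached instance for the whole test session. Purely to illustrate the effect (this test does not exist in the patch):

```python
# Illustrative only: with the autouse fixture above active, every call to
# from_pretrained() returns the same cached object, so tests in this package
# never rebuild or re-download the network.
from qai_hub_models.models.mobilenet_v3_large import Model


def test_from_pretrained_is_cached():
    assert Model.from_pretrained() is Model.from_pretrained()
```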
+ """ + mock = patch( + "qai_hub_models.models.mobilenet_v3_large.Model.from_pretrained", + return_value=Model.from_pretrained(), + ) + mock.start() diff --git a/qai_hub_models/models/mobilenet_v3_large/demo.py b/qai_hub_models/models/mobilenet_v3_large/demo.py index 46a8418d..2eabd08a 100644 --- a/qai_hub_models/models/mobilenet_v3_large/demo.py +++ b/qai_hub_models/models/mobilenet_v3_large/demo.py @@ -3,11 +3,11 @@ # SPDX-License-Identifier: BSD-3-Clause # --------------------------------------------------------------------- from qai_hub_models.models._shared.imagenet_classifier.demo import imagenet_demo -from qai_hub_models.models.mobilenet_v3_large.model import MobileNetV3Large +from qai_hub_models.models.mobilenet_v3_large.model import MODEL_ID, MobileNetV3Large def main(is_test: bool = False): - imagenet_demo(MobileNetV3Large, is_test) + imagenet_demo(MobileNetV3Large, MODEL_ID, is_test) if __name__ == "__main__": diff --git a/qai_hub_models/models/mobilenet_v3_large/export.py b/qai_hub_models/models/mobilenet_v3_large/export.py index 50da73da..5bb2fffd 100644 --- a/qai_hub_models/models/mobilenet_v3_large/export.py +++ b/qai_hub_models/models/mobilenet_v3_large/export.py @@ -10,7 +10,7 @@ import os import warnings from pathlib import Path -from typing import List, Optional, Tuple +from typing import Any, Dict, List, Optional, Tuple, cast import qai_hub as hub import torch @@ -109,35 +109,43 @@ def export_model( ) # Trace the model - source_model = torch.jit.trace(model, make_torch_inputs(input_spec)) + source_model = torch.jit.trace(model.to("cpu"), make_torch_inputs(input_spec)) # 2. Compile the model to an on-device asset model_compile_options = model.get_hub_compile_options( target_runtime, compile_options + " --force_channel_last_input image_tensor" ) - print(f"Optimizing model {model_name} to run on-device.") - compile_job = hub.submit_compile_job( + print(f"Optimizing model {model_name} to run on-device") + submitted_compile_job = hub.submit_compile_job( model=source_model, input_specs=input_spec, device=hub.Device(device), name=model_name, options=model_compile_options, ) + compile_job = cast(hub.client.CompileJob, submitted_compile_job) # 3. Profile the model asset on real devices - profile_job = None + profile_job: Optional[hub.client.ProfileJob] = None if not skip_profiling: + profile_options_all = model.get_hub_profile_options( + target_runtime, profile_options + ) print(f"Profiling model {model_name} on a hosted device.") - profile_job = hub.submit_profile_job( + submitted_profile_job = hub.submit_profile_job( model=compile_job.get_target_model(), device=hub.Device(device), name=model_name, - options=profile_options, + options=profile_options_all, ) + profile_job = cast(hub.client.ProfileJob, submitted_profile_job) # 4. Run inference on-device with sample inputs - inference_job = None + inference_job: Optional[hub.client.InferenceJob] = None if not skip_inferencing: + profile_options_all = model.get_hub_profile_options( + target_runtime, profile_options + ) print( f"Running inference for {model_name} on a hosted device with example inputs." 
) @@ -146,33 +154,35 @@ def export_model( hub_inputs = transpose_channel_first_to_last( "image_tensor", sample_inputs, target_runtime ) - inference_job = hub.submit_inference_job( + submitted_inference_job = hub.submit_inference_job( model=compile_job.get_target_model(), inputs=hub_inputs, device=hub.Device(device), name=model_name, - options=profile_options, + options=profile_options_all, ) + inference_job = cast(hub.client.InferenceJob, submitted_inference_job) # 5. Download the model asset to a local file if not skip_downloading: os.makedirs(output_path, exist_ok=True) - target_model = compile_job.get_target_model() + target_model: hub.Model = compile_job.get_target_model() # type: ignore target_model.download(str(output_path / f"{model_name}.tflite")) # 6. Summarize the results from profiling and inference if not skip_summary and not skip_profiling: - assert profile_job.wait().success - profile_data = profile_job.download_profile() + assert profile_job is not None and profile_job.wait().success + profile_data: Dict[str, Any] = profile_job.download_profile() # type: ignore print_profile_metrics_from_job(profile_job, profile_data) if not skip_summary and not skip_inferencing: torch_out = torch_inference(model, sample_inputs) - assert inference_job.wait().success - inference_result = inference_job.download_output_data() + assert inference_job is not None and inference_job.wait().success + inference_result: hub.client.DatasetEntries = inference_job.download_output_data() # type: ignore print_inference_metrics(inference_job, inference_result, torch_out) - print_on_target_demo_cmd(compile_job, Path(__file__).parent.resolve(), device) + if not skip_summary: + print_on_target_demo_cmd(compile_job, Path(__file__).parent.resolve(), device) return (compile_job, profile_job, inference_job) diff --git a/qai_hub_models/models/mobilenet_v3_large/info.yaml b/qai_hub_models/models/mobilenet_v3_large/info.yaml index d733455e..be6173d9 100644 --- a/qai_hub_models/models/mobilenet_v3_large/info.yaml +++ b/qai_hub_models/models/mobilenet_v3_large/info.yaml @@ -4,7 +4,7 @@ id: mobilenet_v3_large status: public headline: Imagenet classifier and general purpose backbone. domain: Computer Vision -description: MobileNetV3Large is a machine learning model that can classify images +description: MobileNet-v3-Large is a machine learning model that can classify images from the Imagenet dataset. It can also be used as a backbone in building more complex models for specific use cases. 
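Several README diffs in this patch replace the hard-coded license link with a `{deploy_license_url}` placeholder, while each `info.yaml` gains matching `deploy_license` and `deploy_license_type` fields. The step that fills the placeholder in is not part of this patch, so the following is only a guess at how such a substitution could look; the paths and the field-to-placeholder mapping are assumptions:

```python
# Hypothetical README templating step (not shown in this patch): substitute the
# {deploy_license_url} placeholder using the deploy_license field of info.yaml.
from pathlib import Path

import yaml

model_dir = Path("qai_hub_models/models/mobilenet_v3_large")
info = yaml.safe_load((model_dir / "info.yaml").read_text())
readme = (model_dir / "README.md").read_text()
(model_dir / "README.md").write_text(
    readme.replace("{deploy_license_url}", info["deploy_license"])
)
```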
use_case: Image Classification @@ -14,6 +14,7 @@ tags: research_paper: https://arxiv.org/abs/1905.02244 research_paper_title: Searching for MobileNetV3 license: https://github.com/pytorch/vision/blob/main/LICENSE +deploy_license: https://qaihub-public-assets.s3.us-west-2.amazonaws.com/qai-hub-models/Qualcomm+AI+Hub+Proprietary+License.pdf source_repo: https://github.com/pytorch/vision/blob/main/torchvision/models/mobilenetv3.py technical_details: Model checkpoint: Imagenet @@ -36,6 +37,7 @@ form_factors: has_static_banner: yes has_animated_banner: yes license_type: bsd-3-clause +deploy_license_type: AI Model Hub License dataset: - imagenet-1k - imagenet-22k diff --git a/qai_hub_models/models/mobilenet_v3_large/model.py b/qai_hub_models/models/mobilenet_v3_large/model.py index 578f0355..a69d5547 100644 --- a/qai_hub_models/models/mobilenet_v3_large/model.py +++ b/qai_hub_models/models/mobilenet_v3_large/model.py @@ -14,6 +14,6 @@ class MobileNetV3Large(ImagenetClassifier): @classmethod - def from_pretrained(cls, weights: str = DEFAULT_WEIGHTS) -> ImagenetClassifier: + def from_pretrained(cls, weights: str = DEFAULT_WEIGHTS) -> MobileNetV3Large: net = tv_models.mobilenet_v3_large(weights=weights) return cls(net) diff --git a/qai_hub_models/models/mobilenet_v3_large/perf.yaml b/qai_hub_models/models/mobilenet_v3_large/perf.yaml index 5907cc30..a03a69e5 100644 --- a/qai_hub_models/models/mobilenet_v3_large/perf.yaml +++ b/qai_hub_models/models/mobilenet_v3_large/perf.yaml @@ -17,22 +17,25 @@ aggregated: - Samsung Galaxy S23 - Samsung Galaxy S23 Ultra - Samsung Galaxy S23+ + - Samsung Galaxy S24 + - Samsung Galaxy S24 Ultra - Samsung Galaxy Tab S8 - Xiaomi 12 - Xiaomi 12 Pro supported_chipsets: - Snapdragon® 8 Gen 1 - Snapdragon® 8 Gen 2 + - Snapdragon® 8 Gen 3 - Snapdragon® 888 models: - name: MobileNet-v3-Large performance_metrics: - torchscript_onnx_tflite: - inference_time: 600.0 - throughput: 1666.6666666666667 + inference_time: 603.0 + throughput: 1658.374792703151 estimated_peak_memory_range: - min: 32768 - max: 17746392 + min: 12288 + max: 2319320 primary_compute_unit: NPU precision: fp16 layer_info: @@ -40,8 +43,16 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 134 - job_id: j1gly2ee5 + job_id: jnp10025q job_status: Passed + reference_device_info: + name: Samsung Galaxy S23 + os: '13' + form_factor: Phone + os_name: Android + manufacturer: Samsung + chipset: Snapdragon® 8 Gen 2 + timestamp: '2024-03-14T23:10:33.448407Z' torchscript_onnx_qnn: inference_time: 'null' throughput: 'null' @@ -57,11 +68,41 @@ models: total_layers: 0 job_id: '' job_status: Skipped + - torchscript_onnx_tflite: + inference_time: 433.0 + throughput: 2309.4688221709007 + estimated_peak_memory_range: + min: 12288 + max: 60000912 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 134 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 134 + job_id: jvgdwwe5j + job_status: Passed reference_device_info: - name: Samsung Galaxy S23 Ultra - os: '13' + name: Samsung Galaxy S24 + os: '14' form_factor: Phone os_name: Android manufacturer: Samsung - chipset: Snapdragon® 8 Gen 2 - timestamp: '2024-02-21T16:19:38.868341Z' + chipset: Snapdragon® 8 Gen 3 + timestamp: '2024-03-14T23:10:33.448414Z' + torchscript_onnx_qnn: + inference_time: 'null' + throughput: 'null' + estimated_peak_memory_range: + min: 0 + max: 0 + primary_compute_unit: 'null' + precision: 'null' + layer_info: + layers_on_npu: 0 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 0 + job_id: '' + job_status: Skipped diff 
--git a/qai_hub_models/models/mobilenet_v3_large/test.py b/qai_hub_models/models/mobilenet_v3_large/test.py index 60de58c0..fb3cafa2 100644 --- a/qai_hub_models/models/mobilenet_v3_large/test.py +++ b/qai_hub_models/models/mobilenet_v3_large/test.py @@ -2,6 +2,8 @@ # Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. # SPDX-License-Identifier: BSD-3-Clause # --------------------------------------------------------------------- +import pytest + from qai_hub_models.models._shared.imagenet_classifier.test_utils import ( run_imagenet_classifier_test, run_imagenet_classifier_trace_test, @@ -14,6 +16,7 @@ def test_task(): run_imagenet_classifier_test(MobileNetV3Large.from_pretrained(), MODEL_ID) +@pytest.mark.trace def test_trace(): run_imagenet_classifier_trace_test(MobileNetV3Large.from_pretrained()) diff --git a/qai_hub_models/models/mobilenet_v3_large_quantized/README.md b/qai_hub_models/models/mobilenet_v3_large_quantized/README.md new file mode 100644 index 00000000..da95e166 --- /dev/null +++ b/qai_hub_models/models/mobilenet_v3_large_quantized/README.md @@ -0,0 +1,54 @@ +[![Qualcomm® AI Hub Models](https://qaihub-public-assets.s3.us-west-2.amazonaws.com/qai-hub-models/quic-logo.jpg)](../../README.md) + + +# [MobileNet-v3-Large-Quantized: Imagenet classifier and general purpose backbone](https://aihub.qualcomm.com/models/mobilenet_v3_large_quantized) + +MobileNet-v3-Large is a machine learning model that can classify images from the Imagenet dataset. It can also be used as a backbone in building more complex models for specific use cases. + +This is based on the implementation of MobileNet-v3-Large-Quantized found +[here](https://github.com/pytorch/vision/blob/main/torchvision/models/mobilenetv3.py). This repository contains scripts for optimized on-device +export suitable to run on Qualcomm® devices. More details on model performance +accross various devices, can be found [here](https://aihub.qualcomm.com/models/mobilenet_v3_large_quantized). + +[Sign up](https://myaccount.qualcomm.com/signup) for early access to run these models on +a hosted Qualcomm® device. + + +## Example & Usage + + +Once installed, run the following simple CLI demo: + +```bash +python -m qai_hub_models.models.mobilenet_v3_large_quantized.demo +``` +More details on the CLI tool can be found with the `--help` option. See +[demo.py](demo.py) for sample usage of the model including pre/post processing +scripts. Please refer to our [general instructions on using +models](../../../#getting-started) for more usage instructions. + +## Export for on-device deployment + +This repository contains export scripts that produce a model optimized for +on-device deployment. This can be run as follows: + +```bash +python -m qai_hub_models.models.mobilenet_v3_large_quantized.export +``` +Additional options are documented with the `--help` option. Note that the above +script requires access to Deployment instructions for Qualcomm® AI Hub. + +## License +- The license for the original implementation of MobileNet-v3-Large-Quantized can be found + [here](https://github.com/pytorch/vision/blob/main/LICENSE). 
+- The license for the compiled assets for on-device deployment can be found [here]({deploy_license_url}) + +## References +* [Searching for MobileNetV3](https://arxiv.org/abs/1905.02244) +* [Source Model Implementation](https://github.com/pytorch/vision/blob/main/torchvision/models/mobilenetv3.py) + +## Community +* Join [our AI Hub Slack community](https://join.slack.com/t/qualcomm-ai-hub/shared_invite/zt-2dgf95loi-CXHTDRR1rvPgQWPO~ZZZJg) to collaborate, post questions and learn more about on-device AI. +* For questions or feedback please [reach out to us](mailto:ai-hub-support@qti.qualcomm.com). + + diff --git a/qai_hub_models/models/mobilenet_v3_large_quantized/__init__.py b/qai_hub_models/models/mobilenet_v3_large_quantized/__init__.py new file mode 100644 index 00000000..32a17b68 --- /dev/null +++ b/qai_hub_models/models/mobilenet_v3_large_quantized/__init__.py @@ -0,0 +1,13 @@ +# --------------------------------------------------------------------- +# Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. +# SPDX-License-Identifier: BSD-3-Clause +# --------------------------------------------------------------------- +from qai_hub_models.models._shared.imagenet_classifier.app import ( # noqa: F401 + ImagenetClassifierApp as App, +) +from qai_hub_models.models.mobilenet_v3_large_quantized.model import ( # noqa: F401 + MODEL_ID, +) +from qai_hub_models.models.mobilenet_v3_large_quantized.model import ( # noqa: F401 + MobileNetV3LargeQuantizable as Model, +) diff --git a/qai_hub_models/models/mobilenet_v3_large_quantized/conftest.py b/qai_hub_models/models/mobilenet_v3_large_quantized/conftest.py new file mode 100644 index 00000000..7faea3d5 --- /dev/null +++ b/qai_hub_models/models/mobilenet_v3_large_quantized/conftest.py @@ -0,0 +1,24 @@ +# --------------------------------------------------------------------- +# Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. +# SPDX-License-Identifier: BSD-3-Clause +# --------------------------------------------------------------------- +# THIS FILE WAS AUTO-GENERATED. DO NOT EDIT MANUALLY. + +from unittest.mock import patch + +import pytest + +from qai_hub_models.models.mobilenet_v3_large_quantized import Model + + +@pytest.fixture(autouse=True) +def mock_from_pretrained(): + """ + Model.from_pretrained() can be slow. Invoke it once and cache it so all invocations + across all tests return the cached instance of the model. + """ + mock = patch( + "qai_hub_models.models.mobilenet_v3_large_quantized.Model.from_pretrained", + return_value=Model.from_pretrained(), + ) + mock.start() diff --git a/qai_hub_models/models/mobilenet_v3_large_quantized/demo.py b/qai_hub_models/models/mobilenet_v3_large_quantized/demo.py new file mode 100644 index 00000000..5577f0f4 --- /dev/null +++ b/qai_hub_models/models/mobilenet_v3_large_quantized/demo.py @@ -0,0 +1,23 @@ +# --------------------------------------------------------------------- +# Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. 
+# SPDX-License-Identifier: BSD-3-Clause +# --------------------------------------------------------------------- +from qai_hub_models.models._shared.imagenet_classifier.demo import imagenet_demo +from qai_hub_models.models.mobilenet_v3_large_quantized.model import ( + MODEL_ID, + MobileNetV3LargeQuantizable, +) +from qai_hub_models.utils.base_model import TargetRuntime + + +def main(is_test: bool = False): + imagenet_demo( + MobileNetV3LargeQuantizable, + MODEL_ID, + is_test, + available_target_runtimes=[TargetRuntime.TFLITE], + ) + + +if __name__ == "__main__": + main() diff --git a/qai_hub_models/models/mobilenet_v3_large_quantized/export.py b/qai_hub_models/models/mobilenet_v3_large_quantized/export.py new file mode 100644 index 00000000..5305d6b9 --- /dev/null +++ b/qai_hub_models/models/mobilenet_v3_large_quantized/export.py @@ -0,0 +1,202 @@ +# --------------------------------------------------------------------- +# Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. +# SPDX-License-Identifier: BSD-3-Clause +# --------------------------------------------------------------------- +# THIS FILE WAS AUTO-GENERATED. DO NOT EDIT MANUALLY. + + +from __future__ import annotations + +import os +import warnings +from pathlib import Path +from typing import Any, Dict, List, Optional, Tuple, cast + +import qai_hub as hub + +from qai_hub_models.models.mobilenet_v3_large_quantized import Model +from qai_hub_models.utils.args import ( + export_parser, + get_input_spec_kwargs, + get_model_kwargs, +) +from qai_hub_models.utils.base_model import TargetRuntime +from qai_hub_models.utils.compare import torch_inference +from qai_hub_models.utils.printing import ( + print_inference_metrics, + print_on_target_demo_cmd, + print_profile_metrics_from_job, +) +from qai_hub_models.utils.qai_hub_helpers import ( + can_access_qualcomm_ai_hub, + export_without_hub_access, +) +from qai_hub_models.utils.qnn_helpers import get_qnn_inputs + + +def export_model( + device: str = "Samsung Galaxy S23", + skip_profiling: bool = False, + skip_inferencing: bool = False, + skip_downloading: bool = False, + skip_summary: bool = False, + output_dir: Optional[str] = None, + target_runtime: TargetRuntime = TargetRuntime.TFLITE, + compile_options: str = "", + profile_options: str = "", + **additional_model_kwargs, +) -> Tuple[hub.CompileJob, Optional[hub.ProfileJob], Optional[hub.InferenceJob]] | List[ + str +]: + """ + This function accomplishes 6 main tasks: + + 1. Instantiates a PyTorch model and converts it to a traced TorchScript format. + 2. Compiles the model to an asset that can be run on device. + 3. Profiles the model performance on real devices. + 4. Inferences the model on sample inputs. + 5. Downloads the model asset to the local directory. + 6. Summarizes the results from profiling and inference. + + Each of the last four steps can be optionally skipped using the input options. + + Parameters: + device: Device for which to export the model. + Full list of available devices can be found by running `hub.get_devices()`. + Defaults to DEFAULT_DEVICE if not specified. + skip_profiling: If set, skips profiling of compiled model on real devices. + skip_inferencing: If set, skips computing on-device outputs from sample data. + skip_downloading: If set, skips downloading of compiled model. + skip_summary: If set, skips waiting for and summarizing results + from profiling and inference. + output_dir: Directory to store generated assets (e.g. compiled model). + Defaults to `/build/`. 
+ target_runtime: Which on-device runtime to target. Default is TFLite. + compile_options: Additional options to pass when submitting the compile job. + profile_options: Additional options to pass when submitting the profile job. + **additional_model_kwargs: Additional optional kwargs used to customize + `model_cls.from_pretrained` and `model.get_input_spec` + + Returns: + A 3-tuple of: + * A CompileJob object containing metadata about the compile job submitted to hub. + * A ProfileJob containing metadata about the profile job (None if profiling skipped). + * An InferenceJob containing metadata about the inference job (None if inferencing skipped). + """ + model_name = "mobilenet_v3_large_quantized" + output_path = Path(output_dir or Path.cwd() / "build" / model_name) + if not can_access_qualcomm_ai_hub(): + return export_without_hub_access( + "mobilenet_v3_large_quantized", + "MobileNet-v3-Large-Quantized", + device, + skip_profiling, + skip_inferencing, + skip_downloading, + skip_summary, + output_path, + target_runtime, + compile_options, + profile_options, + ) + + # 1. Initialize PyTorch model + model = Model.from_pretrained(**get_model_kwargs(Model, additional_model_kwargs)) + input_spec = model.get_input_spec( + **get_input_spec_kwargs(model, additional_model_kwargs) + ) + + # Trace the model + source_model = model.convert_to_hub_source_model( + target_runtime, output_path, input_spec + ) + if target_runtime == TargetRuntime.TFLITE: + quant_calibration_data = None + else: + quant_calibration_data = model.get_calibration_data(target_runtime, input_spec) + + # 2. Compile the model to an on-device asset + model_compile_options = model.get_hub_compile_options( + target_runtime, compile_options + ) + print(f"Optimizing model {model_name} to run on-device") + submitted_compile_job = hub.submit_compile_job( + model=source_model, + input_specs=input_spec, + device=hub.Device(device), + name=model_name, + calibration_data=quant_calibration_data, + options=model_compile_options, + ) + compile_job = cast(hub.client.CompileJob, submitted_compile_job) + + # 3. Profile the model asset on real devices + profile_job: Optional[hub.client.ProfileJob] = None + if not skip_profiling: + profile_options_all = model.get_hub_profile_options( + target_runtime, profile_options + ) + print(f"Profiling model {model_name} on a hosted device.") + submitted_profile_job = hub.submit_profile_job( + model=compile_job.get_target_model(), + device=hub.Device(device), + name=model_name, + options=profile_options_all, + ) + profile_job = cast(hub.client.ProfileJob, submitted_profile_job) + + # 4. Run inference on-device with sample inputs + inference_job: Optional[hub.client.InferenceJob] = None + if not skip_inferencing: + profile_options_all = model.get_hub_profile_options( + target_runtime, profile_options + ) + print( + f"Running inference for {model_name} on a hosted device with example inputs." + ) + sample_inputs = model.sample_inputs(input_spec) + hub_inputs = sample_inputs + if target_runtime == TargetRuntime.QNN: + hub_inputs = get_qnn_inputs(compile_job, sample_inputs) + submitted_inference_job = hub.submit_inference_job( + model=compile_job.get_target_model(), + inputs=hub_inputs, + device=hub.Device(device), + name=model_name, + options=profile_options_all, + ) + inference_job = cast(hub.client.InferenceJob, submitted_inference_job) + + # 5. 
Download the model asset to a local file + if not skip_downloading: + os.makedirs(output_path, exist_ok=True) + target_model: hub.Model = compile_job.get_target_model() # type: ignore + target_model.download(str(output_path / f"{model_name}.tflite")) + + # 6. Summarize the results from profiling and inference + if not skip_summary and not skip_profiling: + assert profile_job is not None and profile_job.wait().success + profile_data: Dict[str, Any] = profile_job.download_profile() # type: ignore + print_profile_metrics_from_job(profile_job, profile_data) + + if not skip_summary and not skip_inferencing: + torch_out = torch_inference(model, sample_inputs) + assert inference_job is not None and inference_job.wait().success + inference_result: hub.client.DatasetEntries = inference_job.download_output_data() # type: ignore + print_inference_metrics(inference_job, inference_result, torch_out) + + if not skip_summary: + print_on_target_demo_cmd(compile_job, Path(__file__).parent.resolve(), device) + + return (compile_job, profile_job, inference_job) + + +def main(): + warnings.filterwarnings("ignore") + parser = export_parser(model_cls=Model, supports_qnn=False) + args = parser.parse_args() + export_model(**vars(args)) + + +if __name__ == "__main__": + main() diff --git a/qai_hub_models/models/mobilenet_v3_large_quantized/info.yaml b/qai_hub_models/models/mobilenet_v3_large_quantized/info.yaml new file mode 100644 index 00000000..9232ebd6 --- /dev/null +++ b/qai_hub_models/models/mobilenet_v3_large_quantized/info.yaml @@ -0,0 +1,44 @@ +name: MobileNet-v3-Large-Quantized +# id must match with the model dir name in qai_hub_models +id: mobilenet_v3_large_quantized +status: public +headline: Imagenet classifier and general purpose backbone. +domain: Computer Vision +description: MobileNet-v3-Large is a machine learning model that can classify images + from the Imagenet dataset. It can also be used as a backbone in building more complex + models for specific use cases. +use_case: Image Classification +tags: + - quantized + - backbone + - real-time +research_paper: https://arxiv.org/abs/1905.02244 +research_paper_title: Searching for MobileNetV3 +license: https://github.com/pytorch/vision/blob/main/LICENSE +deploy_license: https://qaihub-public-assets.s3.us-west-2.amazonaws.com/qai-hub-models/Qualcomm+AI+Hub+Proprietary+License.pdf +source_repo: https://github.com/pytorch/vision/blob/main/torchvision/models/mobilenetv3.py +technical_details: + Model checkpoint: Imagenet + Input resolution: 224x224 + Number of parameters: 5.47M + Model size: 5.79 MB +applicable_scenarios: + - Medical Imaging + - Anomaly Detection + - Inventory Management +related_models: + - mobilenet_v2 + - densenet121 + - googlenet +form_factors: + - Phone + - Tablet + - IoT + - XR +has_static_banner: yes +has_animated_banner: yes +license_type: bsd-3-clause +deploy_license_type: AI Model Hub License +dataset: + - imagenet-1k + - imagenet-22k diff --git a/qai_hub_models/models/mobilenet_v3_large_quantized/model.py b/qai_hub_models/models/mobilenet_v3_large_quantized/model.py new file mode 100644 index 00000000..55b92db4 --- /dev/null +++ b/qai_hub_models/models/mobilenet_v3_large_quantized/model.py @@ -0,0 +1,85 @@ +# --------------------------------------------------------------------- +# Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. 
+# SPDX-License-Identifier: BSD-3-Clause +# --------------------------------------------------------------------- +from __future__ import annotations + +# isort: off +# This verifies aimet is installed, and this must be included first. +from qai_hub_models.utils.quantization_aimet import ( + AIMETQuantizableMixin, +) + +# isort: on + +import torch +from aimet_torch.cross_layer_equalization import equalize_model +from aimet_torch.model_preparer import prepare_model +from aimet_torch.quantsim import QuantizationSimModel, load_encodings_to_sim + +from qai_hub_models.models.mobilenet_v3_large.model import MobileNetV3Large +from qai_hub_models.utils.aimet.config_loader import get_default_aimet_config +from qai_hub_models.utils.asset_loaders import CachedWebModelAsset +from qai_hub_models.utils.base_model import SourceModelFormat, TargetRuntime + +MODEL_ID = __name__.split(".")[-2] +MODEL_ASSET_VERSION = 1 +DEFAULT_ENCODINGS = "mobilenet_v3_large_quantized_encodings.json" + + +class MobileNetV3LargeQuantizable(AIMETQuantizableMixin, MobileNetV3Large): + """MobileNetV3Large with post train quantization support. + + Supports only 8 bit weights and activations, and only loads pre-quantized checkpoints. + Support for quantizing using your own weights & data will come at a later date.""" + + def __init__( + self, + sim_model: QuantizationSimModel, + ) -> None: + MobileNetV3Large.__init__(self, sim_model.model) + AIMETQuantizableMixin.__init__( + self, + sim_model, + ) + + def preferred_hub_source_model_format( + self, target_runtime: TargetRuntime + ) -> SourceModelFormat: + return SourceModelFormat.ONNX + + @classmethod + def from_pretrained( + cls, + aimet_encodings: str | None = "DEFAULT", + ) -> "MobileNetV3LargeQuantizable": + """ + Parameters: + aimet_encodings: + if "DEFAULT": Loads the model with aimet encodings calibrated on imagenette. + elif None: Doesn't load any encodings. Used when computing encodings. + else: Interprets as a filepath and loads the encodings stored there. 
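The `from_pretrained` docstring above describes three modes for `aimet_encodings`. A small usage sketch; the encodings file name is a placeholder, not an asset shipped in this patch:

```python
# Illustrative only: load the quantizable model with locally computed AIMET
# encodings instead of the default set fetched from the asset store.
from qai_hub_models.models.mobilenet_v3_large_quantized import Model

model = Model.from_pretrained(aimet_encodings="my_mobilenet_v3_encodings.json")

# Passing None skips loading encodings entirely, e.g. when computing fresh
# encodings against a new calibration dataset.
uncalibrated = Model.from_pretrained(aimet_encodings=None)
```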
+ """ + model = MobileNetV3Large.from_pretrained() + input_shape = cls.get_input_spec()["image_tensor"][0] + + model = prepare_model(model) + equalize_model(model, input_shape) + sim = QuantizationSimModel( + model, + quant_scheme="tf_enhanced", + default_param_bw=8, + default_output_bw=8, + config_file=get_default_aimet_config(), + dummy_input=torch.rand(input_shape), + ) + + if aimet_encodings: + if aimet_encodings == "DEFAULT": + aimet_encodings = CachedWebModelAsset.from_asset_store( + MODEL_ID, MODEL_ASSET_VERSION, DEFAULT_ENCODINGS + ).fetch() + load_encodings_to_sim(sim, aimet_encodings) + + sim.model.eval() + return cls(sim) diff --git a/qai_hub_models/models/mobilenet_v3_large_quantized/perf.yaml b/qai_hub_models/models/mobilenet_v3_large_quantized/perf.yaml new file mode 100644 index 00000000..724d7aa9 --- /dev/null +++ b/qai_hub_models/models/mobilenet_v3_large_quantized/perf.yaml @@ -0,0 +1,108 @@ +aggregated: + supported_oses: + - Android + supported_devices: + - Google Pixel 3 + - Google Pixel 3a + - Google Pixel 3a XL + - Google Pixel 4 + - Google Pixel 4a + - Google Pixel 5a 5G + - Samsung Galaxy S21 + - Samsung Galaxy S21 Ultra + - Samsung Galaxy S21+ + - Samsung Galaxy S22 5G + - Samsung Galaxy S22 Ultra 5G + - Samsung Galaxy S22+ 5G + - Samsung Galaxy S23 + - Samsung Galaxy S23 Ultra + - Samsung Galaxy S23+ + - Samsung Galaxy S24 + - Samsung Galaxy S24 Ultra + - Samsung Galaxy Tab S8 + - Xiaomi 12 + - Xiaomi 12 Pro + supported_chipsets: + - Snapdragon® 8 Gen 1 + - Snapdragon® 8 Gen 2 + - Snapdragon® 8 Gen 3 + - Snapdragon® 888 +models: +- name: MobileNet-v3-Large-Quantized + performance_metrics: + - torchscript_onnx_tflite: + inference_time: 2972.0 + throughput: 336.47375504710635 + estimated_peak_memory_range: + min: 12288 + max: 3564432 + primary_compute_unit: NPU + precision: int8 + layer_info: + layers_on_npu: 136 + layers_on_gpu: 0 + layers_on_cpu: 15 + total_layers: 151 + job_id: j1pv3m75x + job_status: Passed + reference_device_info: + name: Samsung Galaxy S23 + os: '13' + form_factor: Phone + os_name: Android + manufacturer: Samsung + chipset: Snapdragon® 8 Gen 2 + timestamp: '2024-03-14T23:39:39.924043Z' + torchscript_onnx_qnn: + inference_time: 'null' + throughput: 'null' + estimated_peak_memory_range: + min: 0 + max: 0 + primary_compute_unit: 'null' + precision: 'null' + layer_info: + layers_on_npu: 0 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 0 + job_id: '' + job_status: Skipped + - torchscript_onnx_tflite: + inference_time: 2352.0 + throughput: 425.1700680272109 + estimated_peak_memory_range: + min: 0 + max: 46180704 + primary_compute_unit: NPU + precision: int8 + layer_info: + layers_on_npu: 136 + layers_on_gpu: 0 + layers_on_cpu: 15 + total_layers: 151 + job_id: jlpe9x7gr + job_status: Passed + reference_device_info: + name: Samsung Galaxy S24 + os: '14' + form_factor: Phone + os_name: Android + manufacturer: Samsung + chipset: Snapdragon® 8 Gen 3 + timestamp: '2024-03-14T23:39:39.924051Z' + torchscript_onnx_qnn: + inference_time: 'null' + throughput: 'null' + estimated_peak_memory_range: + min: 0 + max: 0 + primary_compute_unit: 'null' + precision: 'null' + layer_info: + layers_on_npu: 0 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 0 + job_id: '' + job_status: Skipped diff --git a/qai_hub_models/models/mobilenet_v3_large_quantized/test.py b/qai_hub_models/models/mobilenet_v3_large_quantized/test.py new file mode 100644 index 00000000..6767deef --- /dev/null +++ b/qai_hub_models/models/mobilenet_v3_large_quantized/test.py @@ -0,0 +1,29 
@@ +# --------------------------------------------------------------------- +# Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. +# SPDX-License-Identifier: BSD-3-Clause +# --------------------------------------------------------------------- +from qai_hub_models.models._shared.imagenet_classifier.test_utils import ( + run_imagenet_classifier_test, +) +from qai_hub_models.models.mobilenet_v3_large_quantized.demo import main as demo_main +from qai_hub_models.models.mobilenet_v3_large_quantized.model import ( + MODEL_ASSET_VERSION, + MODEL_ID, + MobileNetV3LargeQuantizable, +) + + +def test_task(): + run_imagenet_classifier_test( + MobileNetV3LargeQuantizable.from_pretrained(), + MODEL_ID, + asset_version=MODEL_ASSET_VERSION, + diff_tol=0.005, + rtol=0.02, + atol=0.2, + ) + + +def test_demo(): + # Verify demo does not crash + demo_main(is_test=True) diff --git a/qai_hub_models/models/mobilenet_v3_small/README.md b/qai_hub_models/models/mobilenet_v3_small/README.md index 41d5e976..25937f85 100644 --- a/qai_hub_models/models/mobilenet_v3_small/README.md +++ b/qai_hub_models/models/mobilenet_v3_small/README.md @@ -10,7 +10,7 @@ This is based on the implementation of MobileNet-v3-Small found export suitable to run on Qualcomm® devices. More details on model performance accross various devices, can be found [here](https://aihub.qualcomm.com/models/mobilenet_v3_small). -[Sign up](https://aihub.qualcomm.com/) for early access to run these models on +[Sign up](https://myaccount.qualcomm.com/signup) for early access to run these models on a hosted Qualcomm® device. @@ -25,7 +25,7 @@ python -m qai_hub_models.models.mobilenet_v3_small.demo More details on the CLI tool can be found with the `--help` option. See [demo.py](demo.py) for sample usage of the model including pre/post processing scripts. Please refer to our [general instructions on using -models](../../#qai-hub-models) for more usage instructions. +models](../../../#getting-started) for more usage instructions. ## Export for on-device deployment @@ -41,7 +41,7 @@ script requires access to Deployment instructions for Qualcomm® AI Hub. ## License - The license for the original implementation of MobileNet-v3-Small can be found [here](https://github.com/pytorch/vision/blob/main/LICENSE). -- The license for the compiled assets for on-device deployment can be found [here](https://qaihub-public-assets.s3.us-west-2.amazonaws.com/qai-hub-models/Qualcomm+AI+Hub+Proprietary+License.pdf). +- The license for the compiled assets for on-device deployment can be found [here]({deploy_license_url}) ## References * [Searching for MobileNetV3](https://arxiv.org/abs/1905.02244) diff --git a/qai_hub_models/models/mobilenet_v3_small/conftest.py b/qai_hub_models/models/mobilenet_v3_small/conftest.py new file mode 100644 index 00000000..e523e36a --- /dev/null +++ b/qai_hub_models/models/mobilenet_v3_small/conftest.py @@ -0,0 +1,24 @@ +# --------------------------------------------------------------------- +# Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. +# SPDX-License-Identifier: BSD-3-Clause +# --------------------------------------------------------------------- +# THIS FILE WAS AUTO-GENERATED. DO NOT EDIT MANUALLY. + +from unittest.mock import patch + +import pytest + +from qai_hub_models.models.mobilenet_v3_small import Model + + +@pytest.fixture(autouse=True) +def mock_from_pretrained(): + """ + Model.from_pretrained() can be slow. 
Invoke it once and cache it so all invocations + across all tests return the cached instance of the model. + """ + mock = patch( + "qai_hub_models.models.mobilenet_v3_small.Model.from_pretrained", + return_value=Model.from_pretrained(), + ) + mock.start() diff --git a/qai_hub_models/models/mobilenet_v3_small/demo.py b/qai_hub_models/models/mobilenet_v3_small/demo.py index b603d666..eb6b15b9 100644 --- a/qai_hub_models/models/mobilenet_v3_small/demo.py +++ b/qai_hub_models/models/mobilenet_v3_small/demo.py @@ -3,11 +3,11 @@ # SPDX-License-Identifier: BSD-3-Clause # --------------------------------------------------------------------- from qai_hub_models.models._shared.imagenet_classifier.demo import imagenet_demo -from qai_hub_models.models.mobilenet_v3_small.model import MobileNetV3Small +from qai_hub_models.models.mobilenet_v3_small.model import MODEL_ID, MobileNetV3Small def main(is_test: bool = False): - imagenet_demo(MobileNetV3Small, is_test) + imagenet_demo(MobileNetV3Small, MODEL_ID, is_test) if __name__ == "__main__": diff --git a/qai_hub_models/models/mobilenet_v3_small/export.py b/qai_hub_models/models/mobilenet_v3_small/export.py index 26c8f0f1..92037cb9 100644 --- a/qai_hub_models/models/mobilenet_v3_small/export.py +++ b/qai_hub_models/models/mobilenet_v3_small/export.py @@ -10,7 +10,7 @@ import os import warnings from pathlib import Path -from typing import List, Optional, Tuple +from typing import Any, Dict, List, Optional, Tuple, cast import qai_hub as hub import torch @@ -109,35 +109,43 @@ def export_model( ) # Trace the model - source_model = torch.jit.trace(model, make_torch_inputs(input_spec)) + source_model = torch.jit.trace(model.to("cpu"), make_torch_inputs(input_spec)) # 2. Compile the model to an on-device asset model_compile_options = model.get_hub_compile_options( target_runtime, compile_options + " --force_channel_last_input image_tensor" ) - print(f"Optimizing model {model_name} to run on-device.") - compile_job = hub.submit_compile_job( + print(f"Optimizing model {model_name} to run on-device") + submitted_compile_job = hub.submit_compile_job( model=source_model, input_specs=input_spec, device=hub.Device(device), name=model_name, options=model_compile_options, ) + compile_job = cast(hub.client.CompileJob, submitted_compile_job) # 3. Profile the model asset on real devices - profile_job = None + profile_job: Optional[hub.client.ProfileJob] = None if not skip_profiling: + profile_options_all = model.get_hub_profile_options( + target_runtime, profile_options + ) print(f"Profiling model {model_name} on a hosted device.") - profile_job = hub.submit_profile_job( + submitted_profile_job = hub.submit_profile_job( model=compile_job.get_target_model(), device=hub.Device(device), name=model_name, - options=profile_options, + options=profile_options_all, ) + profile_job = cast(hub.client.ProfileJob, submitted_profile_job) # 4. Run inference on-device with sample inputs - inference_job = None + inference_job: Optional[hub.client.InferenceJob] = None if not skip_inferencing: + profile_options_all = model.get_hub_profile_options( + target_runtime, profile_options + ) print( f"Running inference for {model_name} on a hosted device with example inputs." 
) @@ -146,33 +154,35 @@ def export_model( hub_inputs = transpose_channel_first_to_last( "image_tensor", sample_inputs, target_runtime ) - inference_job = hub.submit_inference_job( + submitted_inference_job = hub.submit_inference_job( model=compile_job.get_target_model(), inputs=hub_inputs, device=hub.Device(device), name=model_name, - options=profile_options, + options=profile_options_all, ) + inference_job = cast(hub.client.InferenceJob, submitted_inference_job) # 5. Download the model asset to a local file if not skip_downloading: os.makedirs(output_path, exist_ok=True) - target_model = compile_job.get_target_model() + target_model: hub.Model = compile_job.get_target_model() # type: ignore target_model.download(str(output_path / f"{model_name}.tflite")) # 6. Summarize the results from profiling and inference if not skip_summary and not skip_profiling: - assert profile_job.wait().success - profile_data = profile_job.download_profile() + assert profile_job is not None and profile_job.wait().success + profile_data: Dict[str, Any] = profile_job.download_profile() # type: ignore print_profile_metrics_from_job(profile_job, profile_data) if not skip_summary and not skip_inferencing: torch_out = torch_inference(model, sample_inputs) - assert inference_job.wait().success - inference_result = inference_job.download_output_data() + assert inference_job is not None and inference_job.wait().success + inference_result: hub.client.DatasetEntries = inference_job.download_output_data() # type: ignore print_inference_metrics(inference_job, inference_result, torch_out) - print_on_target_demo_cmd(compile_job, Path(__file__).parent.resolve(), device) + if not skip_summary: + print_on_target_demo_cmd(compile_job, Path(__file__).parent.resolve(), device) return (compile_job, profile_job, inference_job) diff --git a/qai_hub_models/models/mobilenet_v3_small/info.yaml b/qai_hub_models/models/mobilenet_v3_small/info.yaml index e26f6a3a..8984b9c8 100644 --- a/qai_hub_models/models/mobilenet_v3_small/info.yaml +++ b/qai_hub_models/models/mobilenet_v3_small/info.yaml @@ -14,6 +14,7 @@ tags: research_paper: https://arxiv.org/abs/1905.02244 research_paper_title: Searching for MobileNetV3 license: https://github.com/pytorch/vision/blob/main/LICENSE +deploy_license: https://qaihub-public-assets.s3.us-west-2.amazonaws.com/qai-hub-models/Qualcomm+AI+Hub+Proprietary+License.pdf source_repo: https://github.com/pytorch/vision/blob/main/torchvision/models/mobilenetv3.py technical_details: Model checkpoint: Imagenet @@ -36,6 +37,7 @@ form_factors: has_static_banner: yes has_animated_banner: yes license_type: bsd-3-clause +deploy_license_type: AI Model Hub License dataset: - imagenet-1k - imagenet-22k diff --git a/qai_hub_models/models/mobilenet_v3_small/model.py b/qai_hub_models/models/mobilenet_v3_small/model.py index 2eb733dc..f41d01c4 100644 --- a/qai_hub_models/models/mobilenet_v3_small/model.py +++ b/qai_hub_models/models/mobilenet_v3_small/model.py @@ -14,6 +14,6 @@ class MobileNetV3Small(ImagenetClassifier): @classmethod - def from_pretrained(cls, weights: str = DEFAULT_WEIGHTS) -> ImagenetClassifier: + def from_pretrained(cls, weights: str = DEFAULT_WEIGHTS) -> MobileNetV3Small: net = tv_models.mobilenet_v3_small(weights=weights) return cls(net) diff --git a/qai_hub_models/models/mobilenet_v3_small/perf.yaml b/qai_hub_models/models/mobilenet_v3_small/perf.yaml index 65b78de3..20a2652a 100644 --- a/qai_hub_models/models/mobilenet_v3_small/perf.yaml +++ b/qai_hub_models/models/mobilenet_v3_small/perf.yaml @@ -17,22 +17,25 
@@ aggregated: - Samsung Galaxy S23 - Samsung Galaxy S23 Ultra - Samsung Galaxy S23+ + - Samsung Galaxy S24 + - Samsung Galaxy S24 Ultra - Samsung Galaxy Tab S8 - Xiaomi 12 - Xiaomi 12 Pro supported_chipsets: - Snapdragon® 8 Gen 1 - Snapdragon® 8 Gen 2 + - Snapdragon® 8 Gen 3 - Snapdragon® 888 models: - name: MobileNet-v3-Small performance_metrics: - torchscript_onnx_tflite: - inference_time: 427.0 - throughput: 2341.92037470726 + inference_time: 424.0 + throughput: 2358.490566037736 estimated_peak_memory_range: - min: 12288 - max: 1724768 + min: 36864 + max: 1921728 primary_compute_unit: NPU precision: fp16 layer_info: @@ -40,8 +43,16 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 122 - job_id: j1gly20e5 + job_id: jlpe900gr job_status: Passed + reference_device_info: + name: Samsung Galaxy S23 + os: '13' + form_factor: Phone + os_name: Android + manufacturer: Samsung + chipset: Snapdragon® 8 Gen 2 + timestamp: '2024-03-14T23:22:40.354876Z' torchscript_onnx_qnn: inference_time: 'null' throughput: 'null' @@ -57,11 +68,41 @@ models: total_layers: 0 job_id: '' job_status: Skipped + - torchscript_onnx_tflite: + inference_time: 288.0 + throughput: 3472.222222222222 + estimated_peak_memory_range: + min: 12288 + max: 40067360 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 122 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 122 + job_id: jygzeq6g8 + job_status: Passed reference_device_info: - name: Samsung Galaxy S23 Ultra - os: '13' + name: Samsung Galaxy S24 + os: '14' form_factor: Phone os_name: Android manufacturer: Samsung - chipset: Snapdragon® 8 Gen 2 - timestamp: '2024-02-21T16:09:16.610887Z' + chipset: Snapdragon® 8 Gen 3 + timestamp: '2024-03-14T23:22:40.354885Z' + torchscript_onnx_qnn: + inference_time: 'null' + throughput: 'null' + estimated_peak_memory_range: + min: 0 + max: 0 + primary_compute_unit: 'null' + precision: 'null' + layer_info: + layers_on_npu: 0 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 0 + job_id: '' + job_status: Skipped diff --git a/qai_hub_models/models/mobilenet_v3_small/test.py b/qai_hub_models/models/mobilenet_v3_small/test.py index 45f656b0..4d73f95b 100644 --- a/qai_hub_models/models/mobilenet_v3_small/test.py +++ b/qai_hub_models/models/mobilenet_v3_small/test.py @@ -2,6 +2,8 @@ # Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. # SPDX-License-Identifier: BSD-3-Clause # --------------------------------------------------------------------- +import pytest + from qai_hub_models.models._shared.imagenet_classifier.test_utils import ( run_imagenet_classifier_test, run_imagenet_classifier_trace_test, @@ -14,6 +16,7 @@ def test_task(): run_imagenet_classifier_test(MobileNetV3Small.from_pretrained(), MODEL_ID) +@pytest.mark.trace def test_trace(): run_imagenet_classifier_trace_test(MobileNetV3Small.from_pretrained()) diff --git a/qai_hub_models/models/openai_clip/README.md b/qai_hub_models/models/openai_clip/README.md index 531af4a8..1212603c 100644 --- a/qai_hub_models/models/openai_clip/README.md +++ b/qai_hub_models/models/openai_clip/README.md @@ -10,7 +10,7 @@ This is based on the implementation of OpenAI-Clip found export suitable to run on Qualcomm® devices. More details on model performance accross various devices, can be found [here](https://aihub.qualcomm.com/models/openai_clip). -[Sign up](https://aihub.qualcomm.com/) for early access to run these models on +[Sign up](https://myaccount.qualcomm.com/signup) for early access to run these models on a hosted Qualcomm® device. 
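In the perf.yaml entries updated throughout this section, `throughput` appears to be derived directly from `inference_time`, which suggests the time is reported in microseconds and the throughput in inferences per second. A quick check against the MobileNet-v3-Small numbers above; this relationship is inferred from the values, not something the patch states:

```python
# Reported pairs: 424.0 us -> 2358.490566037736 (Galaxy S23) and
# 288.0 us -> 3472.222222222222 (Galaxy S24). Both match 1e6 / time_us.
for time_us in (424.0, 288.0):
    print(1_000_000 / time_us)
```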
@@ -30,7 +30,7 @@ python -m qai_hub_models.models.openai_clip.demo More details on the CLI tool can be found with the `--help` option. See [demo.py](demo.py) for sample usage of the model including pre/post processing scripts. Please refer to our [general instructions on using -models](../../#qai-hub-models) for more usage instructions. +models](../../../#getting-started) for more usage instructions. ## Export for on-device deployment @@ -46,7 +46,7 @@ script requires access to Deployment instructions for Qualcomm® AI Hub. ## License - The license for the original implementation of OpenAI-Clip can be found [here](https://github.com/openai/CLIP/blob/main/LICENSE). -- The license for the compiled assets for on-device deployment can be found [here](https://qaihub-public-assets.s3.us-west-2.amazonaws.com/qai-hub-models/Qualcomm+AI+Hub+Proprietary+License.pdf). +- The license for the compiled assets for on-device deployment can be found [here]({deploy_license_url}) ## References * [Learning Transferable Visual Models From Natural Language Supervision](https://arxiv.org/abs/2103.00020) diff --git a/qai_hub_models/models/openai_clip/app.py b/qai_hub_models/models/openai_clip/app.py index 92ffd8c3..3df122c1 100644 --- a/qai_hub_models/models/openai_clip/app.py +++ b/qai_hub_models/models/openai_clip/app.py @@ -97,8 +97,8 @@ def process_text(self, text: str) -> torch.Tensor: """ return self.tokenizer(text) + @staticmethod def get_input_spec( - self, image_size: Tuple[int, int] = (224, 224), text_size: Tuple[int, int] = (3, 77), ) -> InputSpec: diff --git a/qai_hub_models/models/openai_clip/conftest.py b/qai_hub_models/models/openai_clip/conftest.py new file mode 100644 index 00000000..bb6c08ac --- /dev/null +++ b/qai_hub_models/models/openai_clip/conftest.py @@ -0,0 +1,26 @@ +# --------------------------------------------------------------------- +# Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. +# SPDX-License-Identifier: BSD-3-Clause +# --------------------------------------------------------------------- +# THIS FILE WAS AUTO-GENERATED. DO NOT EDIT MANUALLY. + +from unittest.mock import patch + +import pytest + +from qai_hub_models.models.openai_clip import Model +from qai_hub_models.utils.testing import skip_clone_repo_check + + +@pytest.fixture(autouse=True) +@skip_clone_repo_check +def mock_from_pretrained(): + """ + Model.from_pretrained() can be slow. Invoke it once and cache it so all invocations + across all tests return the cached instance of the model. 
+ """ + mock = patch( + "qai_hub_models.models.openai_clip.Model.from_pretrained", + return_value=Model.from_pretrained(), + ) + mock.start() diff --git a/qai_hub_models/models/openai_clip/export.py b/qai_hub_models/models/openai_clip/export.py index d8d60fc1..780dff90 100644 --- a/qai_hub_models/models/openai_clip/export.py +++ b/qai_hub_models/models/openai_clip/export.py @@ -10,14 +10,14 @@ import os import warnings from pathlib import Path -from typing import List, Mapping, Optional, Tuple +from typing import Any, Dict, List, Mapping, Optional, Tuple, cast import qai_hub as hub import torch from qai_hub_models.models.openai_clip import Model from qai_hub_models.utils.args import export_parser, get_model_kwargs -from qai_hub_models.utils.base_model import TargetRuntime +from qai_hub_models.utils.base_model import BaseModel, TargetRuntime from qai_hub_models.utils.compare import torch_inference from qai_hub_models.utils.input_spec import make_torch_inputs from qai_hub_models.utils.printing import ( @@ -90,9 +90,9 @@ def export_model( output_path = Path(output_dir or Path.cwd() / "build" / model_name) component_arg = components components = components or ALL_COMPONENTS - for component in components: - if component not in ALL_COMPONENTS: - raise ValueError(f"Invalid component {component}.") + for component_name in components: + if component_name not in ALL_COMPONENTS: + raise ValueError(f"Invalid component {component_name}.") if not can_access_qualcomm_ai_hub(): return export_without_hub_access( "openai_clip", @@ -111,68 +111,85 @@ def export_model( # 1. Initialize PyTorch model model = Model.from_pretrained(**get_model_kwargs(Model, additional_model_kwargs)) - components_dict = {} + components_dict: Dict[str, BaseModel] = {} if "CLIPTextEncoder" in components: - components_dict["CLIPTextEncoder"] = model.text_encoder + components_dict["CLIPTextEncoder"] = model.text_encoder # type: ignore if "CLIPImageEncoder" in components: - components_dict["CLIPImageEncoder"] = model.image_encoder + components_dict["CLIPImageEncoder"] = model.image_encoder # type: ignore - compile_jobs = {} + compile_jobs: Dict[str, hub.client.CompileJob] = {} for component_name, component in components_dict.items(): # Trace the model input_spec = component.get_input_spec() - source_model = torch.jit.trace(component, make_torch_inputs(input_spec)) + source_model = torch.jit.trace( + component.to("cpu"), make_torch_inputs(input_spec) + ) # 2. Compile the models to an on-device asset model_compile_options = component.get_hub_compile_options( target_runtime, compile_options + " --force_channel_last_input image" ) - print(f"Optimizing model {component_name} to run on-device.") - compile_jobs[component_name] = hub.submit_compile_job( + print(f"Optimizing model {component_name} to run on-device") + submitted_compile_job = hub.submit_compile_job( model=source_model, input_specs=input_spec, device=hub.Device(device), - name=f"{component_name}", + name=f"{model_name}_{component_name}", options=model_compile_options, ) + compile_jobs[component_name] = cast( + hub.client.CompileJob, submitted_compile_job + ) # 3. 
Profile the model assets on real devices - profile_jobs = {} + profile_jobs: Dict[str, hub.client.ProfileJob] = {} if not skip_profiling: for component_name in components: + profile_options_all = components_dict[ + component_name + ].get_hub_profile_options(target_runtime, profile_options) print(f"Profiling model {component_name} on a hosted device.") - profile_jobs[component_name] = hub.submit_profile_job( + submitted_profile_job = hub.submit_profile_job( model=compile_jobs[component_name].get_target_model(), device=hub.Device(device), - name=f"{component_name}", - options=profile_options, + name=f"{model_name}_{component_name}", + options=profile_options_all, + ) + profile_jobs[component_name] = cast( + hub.client.ProfileJob, submitted_profile_job ) # 4. Run inference on-device with sample inputs - inference_jobs = {} + inference_jobs: Dict[str, hub.client.InferenceJob] = {} if not skip_inferencing: for component_name in components: print( f"Running inference for {component_name} on a hosted device with example inputs." ) + profile_options_all = components_dict[ + component_name + ].get_hub_profile_options(target_runtime, profile_options) sample_inputs = components_dict[component_name].sample_inputs() # Convert inputs from channel first to channel last hub_inputs = transpose_channel_first_to_last( "image", sample_inputs, target_runtime ) - inference_jobs[component_name] = hub.submit_inference_job( + submitted_inference_job = hub.submit_inference_job( model=compile_jobs[component_name].get_target_model(), inputs=hub_inputs, device=hub.Device(device), - name=f"{component_name}", - options=profile_options, + name=f"{model_name}_{component_name}", + options=profile_options_all, + ) + inference_jobs[component_name] = cast( + hub.client.InferenceJob, submitted_inference_job ) # 5. 
Download the model assets to a local file if not skip_downloading: os.makedirs(output_path, exist_ok=True) for component_name, compile_job in compile_jobs.items(): - target_model = compile_job.get_target_model() + target_model: hub.Model = compile_job.get_target_model() # type: ignore target_model.download( str(output_path / f"{model_name}_{component_name}.tflite") ) @@ -181,8 +198,8 @@ def export_model( if not skip_summary and not skip_profiling: for component_name in components: profile_job = profile_jobs[component_name] - assert profile_job.wait().success - profile_data = profile_job.download_profile() + assert profile_job is not None and profile_job.wait().success + profile_data: Dict[str, Any] = profile_job.download_profile() # type: ignore print_profile_metrics_from_job(profile_job, profile_data) if not skip_summary and not skip_inferencing: @@ -190,8 +207,8 @@ def export_model( inference_job = inference_jobs[component_name] sample_inputs = components_dict[component_name].sample_inputs() torch_out = torch_inference(components_dict[component_name], sample_inputs) - assert inference_job.wait().success - inference_result = inference_job.download_output_data() + assert inference_job is not None and inference_job.wait().success + inference_result: hub.client.DatasetEntries = inference_job.download_output_data() # type: ignore print_inference_metrics(inference_job, inference_result, torch_out) return { diff --git a/qai_hub_models/models/openai_clip/info.yaml b/qai_hub_models/models/openai_clip/info.yaml index 2e439649..d3145f66 100644 --- a/qai_hub_models/models/openai_clip/info.yaml +++ b/qai_hub_models/models/openai_clip/info.yaml @@ -15,6 +15,7 @@ tags: research_paper: https://arxiv.org/abs/2103.00020 research_paper_title: Learning Transferable Visual Models From Natural Language Supervision license: https://github.com/openai/CLIP/blob/main/LICENSE +deploy_license: https://qaihub-public-assets.s3.us-west-2.amazonaws.com/qai-hub-models/Qualcomm+AI+Hub+Proprietary+License.pdf source_repo: https://github.com/openai/CLIP/ technical_details: Model checkpoint: ViT-B/16 @@ -35,4 +36,5 @@ form_factors: has_static_banner: yes has_animated_banner: yes license_type: mit +deploy_license_type: AI Model Hub License dataset: [] diff --git a/qai_hub_models/models/openai_clip/model.py b/qai_hub_models/models/openai_clip/model.py index 9e4c452d..5c94b0ef 100644 --- a/qai_hub_models/models/openai_clip/model.py +++ b/qai_hub_models/models/openai_clip/model.py @@ -90,8 +90,8 @@ def forward(self, text: torch.Tensor): text_features = text_features / text_features.norm(dim=1, keepdim=True) return text_features + @staticmethod def get_input_spec( - self, batch_size: int = 1, text_length: int = 77, ) -> InputSpec: @@ -135,8 +135,8 @@ def forward(self, image: torch.Tensor): image_features = image_features / image_features.norm(dim=1, keepdim=True) return self.net.logit_scale.exp() * image_features + @staticmethod def get_input_spec( - self, height: int = 224, width: int = 224, ) -> InputSpec: diff --git a/qai_hub_models/models/openai_clip/perf.yaml b/qai_hub_models/models/openai_clip/perf.yaml index 2a6ddaed..0989352c 100644 --- a/qai_hub_models/models/openai_clip/perf.yaml +++ b/qai_hub_models/models/openai_clip/perf.yaml @@ -17,22 +17,25 @@ aggregated: - Samsung Galaxy S23 - Samsung Galaxy S23 Ultra - Samsung Galaxy S23+ + - Samsung Galaxy S24 + - Samsung Galaxy S24 Ultra - Samsung Galaxy Tab S8 - Xiaomi 12 - Xiaomi 12 Pro supported_chipsets: - Snapdragon® 8 Gen 1 - Snapdragon® 8 Gen 2 + - Snapdragon® 8 Gen 3 - 
Snapdragon® 888 models: - name: CLIPTextEncoder performance_metrics: - torchscript_onnx_tflite: - inference_time: 15528.0 - throughput: 64.39979392065945 + inference_time: 15516.0 + throughput: 64.44960041247744 estimated_peak_memory_range: - min: 40960 - max: 3106072 + min: 49152 + max: 3267008 primary_compute_unit: NPU precision: fp16 layer_info: @@ -40,39 +43,77 @@ models: layers_on_gpu: 0 layers_on_cpu: 2 total_layers: 576 - job_id: j2p0m2veg + job_id: jz5worjp1 job_status: Passed + reference_device_info: + name: Samsung Galaxy S23 + os: '13' + form_factor: Phone + os_name: Android + manufacturer: Samsung + chipset: Snapdragon® 8 Gen 2 + timestamp: '2024-03-14T23:47:17.422656Z' torchscript_onnx_qnn: - inference_time: 8149.0 - throughput: 122.71444348999877 + inference_time: 15586.0 + throughput: 64.16014371872193 estimated_peak_memory_range: - min: 40960 - max: 23728064 + min: 45056 + max: 2975720 primary_compute_unit: NPU precision: fp16 layer_info: - layers_on_npu: 377 + layers_on_npu: 574 layers_on_gpu: 0 - layers_on_cpu: 0 - total_layers: 377 - job_id: jogk2q9og + layers_on_cpu: 2 + total_layers: 576 + job_id: jz57z1rp3 + job_status: Passed + - torchscript_onnx_tflite: + inference_time: 11115.0 + throughput: 89.9685110211426 + estimated_peak_memory_range: + min: 16384 + max: 204316144 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 574 + layers_on_gpu: 0 + layers_on_cpu: 2 + total_layers: 576 + job_id: jnp10ml5q job_status: Passed reference_device_info: - name: Samsung Galaxy S23 Ultra - os: '13' + name: Samsung Galaxy S24 + os: '14' form_factor: Phone os_name: Android manufacturer: Samsung - chipset: Snapdragon® 8 Gen 2 - timestamp: '2024-02-21T16:25:08.294036Z' + chipset: Snapdragon® 8 Gen 3 + timestamp: '2024-03-14T23:49:22.781059Z' + torchscript_onnx_qnn: + inference_time: 11246.0 + throughput: 88.92050506846878 + estimated_peak_memory_range: + min: 40960 + max: 205502128 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 574 + layers_on_gpu: 0 + layers_on_cpu: 2 + total_layers: 576 + job_id: j0pxv89g7 + job_status: Passed - name: CLIPImageEncoder performance_metrics: - torchscript_onnx_tflite: - inference_time: 127729.0 - throughput: 7.829075621041424 + inference_time: 128196.0 + throughput: 7.800555399544447 estimated_peak_memory_range: - min: 159744 - max: 3867320 + min: 143360 + max: 3847064 primary_compute_unit: NPU precision: fp16 layer_info: @@ -80,28 +121,66 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 575 - job_id: j1p8em48p + job_id: jmg9vqv57 job_status: Passed + reference_device_info: + name: Samsung Galaxy S23 + os: '13' + form_factor: Phone + os_name: Android + manufacturer: Samsung + chipset: Snapdragon® 8 Gen 2 + timestamp: '2024-03-14T23:59:18.769511Z' torchscript_onnx_qnn: - inference_time: 50903.0 - throughput: 19.645207551617784 + inference_time: 127795.0 + throughput: 7.825032278258147 + estimated_peak_memory_range: + min: 180224 + max: 4074336 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 575 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 575 + job_id: jqp4q6lgo + job_status: Passed + - torchscript_onnx_tflite: + inference_time: 98556.0 + throughput: 10.14651568651325 estimated_peak_memory_range: - min: 86016 - max: 59741752 + min: 163840 + max: 781391856 primary_compute_unit: NPU precision: fp16 layer_info: - layers_on_npu: 370 + layers_on_npu: 575 layers_on_gpu: 0 layers_on_cpu: 0 - total_layers: 370 - job_id: jn5qlrmmp + total_layers: 575 + job_id: 
jvgdwml5j job_status: Passed reference_device_info: - name: Samsung Galaxy S23 Ultra - os: '13' + name: Samsung Galaxy S24 + os: '14' form_factor: Phone os_name: Android manufacturer: Samsung - chipset: Snapdragon® 8 Gen 2 - timestamp: '2024-02-21T16:30:00.084732Z' + chipset: Snapdragon® 8 Gen 3 + timestamp: '2024-03-15T00:01:23.890974Z' + torchscript_onnx_qnn: + inference_time: 97281.0 + throughput: 10.279499593959766 + estimated_peak_memory_range: + min: 237568 + max: 783870384 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 575 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 575 + job_id: jo5mr1qgk + job_status: Passed diff --git a/qai_hub_models/models/openai_clip/requirements.txt b/qai_hub_models/models/openai_clip/requirements.txt index 4245d2f9..b44c6052 100644 --- a/qai_hub_models/models/openai_clip/requirements.txt +++ b/qai_hub_models/models/openai_clip/requirements.txt @@ -1,3 +1,2 @@ -torchvision ftfy==6.1.1 regex==2023.10.3 diff --git a/qai_hub_models/models/openpose/README.md b/qai_hub_models/models/openpose/README.md index 98224c0f..34c86010 100644 --- a/qai_hub_models/models/openpose/README.md +++ b/qai_hub_models/models/openpose/README.md @@ -10,7 +10,7 @@ This is based on the implementation of OpenPose found export suitable to run on Qualcomm® devices. More details on model performance accross various devices, can be found [here](https://aihub.qualcomm.com/models/openpose). -[Sign up](https://aihub.qualcomm.com/) for early access to run these models on +[Sign up](https://myaccount.qualcomm.com/signup) for early access to run these models on a hosted Qualcomm® device. @@ -30,7 +30,7 @@ python -m qai_hub_models.models.openpose.demo More details on the CLI tool can be found with the `--help` option. See [demo.py](demo.py) for sample usage of the model including pre/post processing scripts. Please refer to our [general instructions on using -models](../../#qai-hub-models) for more usage instructions. +models](../../../#getting-started) for more usage instructions. ## Export for on-device deployment @@ -46,7 +46,7 @@ script requires access to Deployment instructions for Qualcomm® AI Hub. ## License - The license for the original implementation of OpenPose can be found [here](https://github.com/CMU-Perceptual-Computing-Lab/openpose/blob/master/LICENSE). -- The license for the compiled assets for on-device deployment can be found [here](https://qaihub-public-assets.s3.us-west-2.amazonaws.com/qai-hub-models/Qualcomm+AI+Hub+Proprietary+License.pdf). +- The license for the compiled assets for on-device deployment can be found [here]({deploy_license_url}) ## References * [OpenPose: Realtime Multi-Person 2D Pose Estimation using Part Affinity Fields](https://arxiv.org/abs/1812.08008) diff --git a/qai_hub_models/models/openpose/conftest.py b/qai_hub_models/models/openpose/conftest.py new file mode 100644 index 00000000..49e58484 --- /dev/null +++ b/qai_hub_models/models/openpose/conftest.py @@ -0,0 +1,26 @@ +# --------------------------------------------------------------------- +# Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. +# SPDX-License-Identifier: BSD-3-Clause +# --------------------------------------------------------------------- +# THIS FILE WAS AUTO-GENERATED. DO NOT EDIT MANUALLY. 
+ +from unittest.mock import patch + +import pytest + +from qai_hub_models.models.openpose import Model +from qai_hub_models.utils.testing import skip_clone_repo_check + + +@pytest.fixture(autouse=True) +@skip_clone_repo_check +def mock_from_pretrained(): + """ + Model.from_pretrained() can be slow. Invoke it once and cache it so all invocations + across all tests return the cached instance of the model. + """ + mock = patch( + "qai_hub_models.models.openpose.Model.from_pretrained", + return_value=Model.from_pretrained(), + ) + mock.start() diff --git a/qai_hub_models/models/openpose/demo.py b/qai_hub_models/models/openpose/demo.py index 885d7631..23059539 100644 --- a/qai_hub_models/models/openpose/demo.py +++ b/qai_hub_models/models/openpose/demo.py @@ -2,13 +2,17 @@ # Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. # SPDX-License-Identifier: BSD-3-Clause # --------------------------------------------------------------------- -import argparse - from qai_hub_models.models.openpose.app import OpenPoseApp from qai_hub_models.models.openpose.model import MODEL_ASSET_VERSION, MODEL_ID, OpenPose -from qai_hub_models.utils.args import add_output_dir_arg +from qai_hub_models.utils.args import ( + demo_model_from_cli_args, + get_model_cli_parser, + get_on_device_demo_parser, + validate_on_device_demo_args, +) from qai_hub_models.utils.asset_loaders import CachedWebModelAsset, load_image from qai_hub_models.utils.display import display_or_save_image +from qai_hub_models.utils.image_processing import pil_resize_pad, pil_undo_resize_pad IMAGE_ADDRESS = CachedWebModelAsset.from_asset_store( MODEL_ID, MODEL_ASSET_VERSION, "openpose_demo.png" @@ -19,21 +23,31 @@ # The demo will display the input image with circles drawn over the estimated joint positions. 
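The auto-generated conftest.py files added throughout this patch (openai_clip, openpose, quicksrnetlarge, quicksrnetlarge_quantized, and others) all follow the same pattern: an autouse fixture patches `Model.from_pretrained` so the pretrained weights are effectively loaded once and reused. A hypothetical test module (illustration only, not part of the patch) that would observe this behavior when placed next to one of these conftest.py files:

```python
# Hypothetical test module, e.g. under qai_hub_models/models/openpose/.
from qai_hub_models.models.openpose import Model


def test_from_pretrained_is_cached():
    # The autouse mock_from_pretrained fixture in conftest.py patches
    # Model.from_pretrained, so repeated calls return the same pre-loaded
    # instance instead of re-initializing the network each time.
    assert Model.from_pretrained() is Model.from_pretrained()
```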
def main(is_test: bool = False): # Demo parameters - parser = argparse.ArgumentParser() + parser = get_model_cli_parser(OpenPose) + parser = get_on_device_demo_parser(parser, add_output_dir=True) parser.add_argument( "--image", type=str, default=IMAGE_ADDRESS, help="image file path or URL.", ) - add_output_dir_arg(parser) args = parser.parse_args([] if is_test else None) + model = demo_model_from_cli_args(OpenPose, MODEL_ID, args) + validate_on_device_demo_args(args, MODEL_ID) + + # Load image + app = OpenPoseApp(model) + (_, _, height, width) = OpenPose.get_input_spec()["image"][0] + orig_image = load_image(args.image) + image, scale, padding = pil_resize_pad(orig_image, (height, width)) - # Load image & model - app = OpenPoseApp(OpenPose.from_pretrained()) - image = load_image(args.image) + # Run inference pred_image = app.estimate_pose(image) + + # Resize / unpad annotated image + pred_image = pil_undo_resize_pad(pred_image, orig_image.size, scale, padding) + if not is_test: display_or_save_image(pred_image, args.output_dir) diff --git a/qai_hub_models/models/openpose/export.py b/qai_hub_models/models/openpose/export.py index bb86fa55..98bbb750 100644 --- a/qai_hub_models/models/openpose/export.py +++ b/qai_hub_models/models/openpose/export.py @@ -10,7 +10,7 @@ import os import warnings from pathlib import Path -from typing import List, Optional, Tuple +from typing import Any, Dict, List, Optional, Tuple, cast import qai_hub as hub import torch @@ -26,6 +26,7 @@ from qai_hub_models.utils.input_spec import make_torch_inputs from qai_hub_models.utils.printing import ( print_inference_metrics, + print_on_target_demo_cmd, print_profile_metrics_from_job, ) from qai_hub_models.utils.qai_hub_helpers import ( @@ -109,7 +110,7 @@ def export_model( ) # Trace the model - source_model = torch.jit.trace(model, make_torch_inputs(input_spec)) + source_model = torch.jit.trace(model.to("cpu"), make_torch_inputs(input_spec)) # 2. Compile the model to an on-device asset model_compile_options = model.get_hub_compile_options( @@ -118,29 +119,37 @@ def export_model( + " --force_channel_last_input image" + " --force_channel_last_output output_0,output_1", ) - print(f"Optimizing model {model_name} to run on-device.") - compile_job = hub.submit_compile_job( + print(f"Optimizing model {model_name} to run on-device") + submitted_compile_job = hub.submit_compile_job( model=source_model, input_specs=input_spec, device=hub.Device(device), name=model_name, options=model_compile_options, ) + compile_job = cast(hub.client.CompileJob, submitted_compile_job) # 3. Profile the model asset on real devices - profile_job = None + profile_job: Optional[hub.client.ProfileJob] = None if not skip_profiling: + profile_options_all = model.get_hub_profile_options( + target_runtime, profile_options + ) print(f"Profiling model {model_name} on a hosted device.") - profile_job = hub.submit_profile_job( + submitted_profile_job = hub.submit_profile_job( model=compile_job.get_target_model(), device=hub.Device(device), name=model_name, - options=profile_options, + options=profile_options_all, ) + profile_job = cast(hub.client.ProfileJob, submitted_profile_job) # 4. Run inference on-device with sample inputs - inference_job = None + inference_job: Optional[hub.client.InferenceJob] = None if not skip_inferencing: + profile_options_all = model.get_hub_profile_options( + target_runtime, profile_options + ) print( f"Running inference for {model_name} on a hosted device with example inputs." 
) @@ -149,36 +158,40 @@ def export_model( hub_inputs = transpose_channel_first_to_last( "image", sample_inputs, target_runtime ) - inference_job = hub.submit_inference_job( + submitted_inference_job = hub.submit_inference_job( model=compile_job.get_target_model(), inputs=hub_inputs, device=hub.Device(device), name=model_name, - options=profile_options, + options=profile_options_all, ) + inference_job = cast(hub.client.InferenceJob, submitted_inference_job) # 5. Download the model asset to a local file if not skip_downloading: os.makedirs(output_path, exist_ok=True) - target_model = compile_job.get_target_model() + target_model: hub.Model = compile_job.get_target_model() # type: ignore target_model.download(str(output_path / f"{model_name}.tflite")) # 6. Summarize the results from profiling and inference if not skip_summary and not skip_profiling: - assert profile_job.wait().success - profile_data = profile_job.download_profile() + assert profile_job is not None and profile_job.wait().success + profile_data: Dict[str, Any] = profile_job.download_profile() # type: ignore print_profile_metrics_from_job(profile_job, profile_data) if not skip_summary and not skip_inferencing: torch_out = torch_inference(model, sample_inputs) - assert inference_job.wait().success - inference_result = inference_job.download_output_data() + assert inference_job is not None and inference_job.wait().success + inference_result: hub.client.DatasetEntries = inference_job.download_output_data() # type: ignore # Convert outputs from channel last to channel first inference_result = transpose_channel_last_to_first( "output_0,output_1", inference_result, target_runtime ) print_inference_metrics(inference_job, inference_result, torch_out) + if not skip_summary: + print_on_target_demo_cmd(compile_job, Path(__file__).parent.resolve(), device) + return (compile_job, profile_job, inference_job) diff --git a/qai_hub_models/models/openpose/info.yaml b/qai_hub_models/models/openpose/info.yaml index 1941c9ef..3ec39a9a 100644 --- a/qai_hub_models/models/openpose/info.yaml +++ b/qai_hub_models/models/openpose/info.yaml @@ -12,6 +12,7 @@ research_paper: https://arxiv.org/abs/1812.08008 research_paper_title: 'OpenPose: Realtime Multi-Person 2D Pose Estimation using Part Affinity Fields' license: https://github.com/CMU-Perceptual-Computing-Lab/openpose/blob/master/LICENSE +deploy_license: https://qaihub-public-assets.s3.us-west-2.amazonaws.com/qai-hub-models/Qualcomm+AI+Hub+Proprietary+License.pdf source_repo: https://github.com/CMU-Perceptual-Computing-Lab/openpose technical_details: Model checkpoint: body_pose_model.pth @@ -32,4 +33,5 @@ related_models: has_static_banner: yes has_animated_banner: no license_type: other +deploy_license_type: AI Model Hub License dataset: [] diff --git a/qai_hub_models/models/openpose/model.py b/qai_hub_models/models/openpose/model.py index f2749ddf..6379a270 100644 --- a/qai_hub_models/models/openpose/model.py +++ b/qai_hub_models/models/openpose/model.py @@ -98,8 +98,8 @@ def forward(self, image: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]: return paf, heatmap + @staticmethod def get_input_spec( - self, batch_size: int = 1, num_channels: int = 3, height: int = 224, diff --git a/qai_hub_models/models/openpose/perf.yaml b/qai_hub_models/models/openpose/perf.yaml index 29999a3f..6f677f3e 100644 --- a/qai_hub_models/models/openpose/perf.yaml +++ b/qai_hub_models/models/openpose/perf.yaml @@ -17,22 +17,25 @@ aggregated: - Samsung Galaxy S23 - Samsung Galaxy S23 Ultra - Samsung Galaxy S23+ + - Samsung 
Galaxy S24 + - Samsung Galaxy S24 Ultra - Samsung Galaxy Tab S8 - Xiaomi 12 - Xiaomi 12 Pro supported_chipsets: - Snapdragon® 8 Gen 1 - Snapdragon® 8 Gen 2 + - Snapdragon® 8 Gen 3 - Snapdragon® 888 models: - name: OpenPose performance_metrics: - torchscript_onnx_tflite: - inference_time: 11747.0 - throughput: 85.12811781731506 + inference_time: 11718.0 + throughput: 85.33879501621438 estimated_peak_memory_range: min: 229376 - max: 2462464 + max: 2888976 primary_compute_unit: NPU precision: fp16 layer_info: @@ -40,14 +43,22 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 103 - job_id: jnp1nw3kg + job_id: j1pvokj5x job_status: Passed + reference_device_info: + name: Samsung Galaxy S23 + os: '13' + form_factor: Phone + os_name: Android + manufacturer: Samsung + chipset: Snapdragon® 8 Gen 2 + timestamp: '2024-03-14T23:20:33.773079Z' torchscript_onnx_qnn: - inference_time: 11820.0 - throughput: 84.60236886632826 + inference_time: 11832.0 + throughput: 84.51656524678837 estimated_peak_memory_range: - min: 622592 - max: 241891488 + min: 643072 + max: 242325320 primary_compute_unit: NPU precision: fp16 layer_info: @@ -55,13 +66,43 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 187 - job_id: jvgddq0kg + job_id: jlpe1m15r + job_status: Passed + - torchscript_onnx_tflite: + inference_time: 8755.0 + throughput: 114.22044545973729 + estimated_peak_memory_range: + min: 192512 + max: 33307600 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 103 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 103 + job_id: j7gjmnxgd job_status: Passed reference_device_info: - name: Samsung Galaxy S23 Ultra - os: '13' + name: Samsung Galaxy S24 + os: '14' form_factor: Phone os_name: Android manufacturer: Samsung - chipset: Snapdragon® 8 Gen 2 - timestamp: '2024-02-21T16:07:34.029953Z' + chipset: Snapdragon® 8 Gen 3 + timestamp: '2024-03-14T23:22:29.557459Z' + torchscript_onnx_qnn: + inference_time: 8772.0 + throughput: 113.99908800729594 + estimated_peak_memory_range: + min: 618496 + max: 53437584 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 187 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 187 + job_id: jygz9dk58 + job_status: Passed diff --git a/qai_hub_models/models/openpose/requirements.txt b/qai_hub_models/models/openpose/requirements.txt index ab4e628e..e4c17c9c 100644 --- a/qai_hub_models/models/openpose/requirements.txt +++ b/qai_hub_models/models/openpose/requirements.txt @@ -1,2 +1,2 @@ -scipy -matplotlib +scipy==1.8.1 +matplotlib==3.7.4 diff --git a/qai_hub_models/models/protocols.py b/qai_hub_models/models/protocols.py new file mode 100644 index 00000000..e263482e --- /dev/null +++ b/qai_hub_models/models/protocols.py @@ -0,0 +1,194 @@ +# --------------------------------------------------------------------- +# Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. +# SPDX-License-Identifier: BSD-3-Clause +# --------------------------------------------------------------------- +""" +This file defines type helpers. Specifically, those helpers are python Protocols. + +Protocols are helpful for defining interfaces that must be implemented for specific functions. + +For example, a function may take any class that implements FromPretrained. +The parameter would be typed "FromPretrainedProtocol", as defined in this file. + +Protocols may also be inherited to declare that a class must implement said protocol. +For example, AIMETQuantizableMixin inherits HubModelProtocol. 
This informs the type +checker that the class that inherits the mixin must implement HubModelProtocol. + +These are type checked at compile time. +""" +from __future__ import annotations + +from abc import abstractmethod +from typing import Protocol, Type, TypeVar, runtime_checkable + +from qai_hub.client import DatasetEntries + +from qai_hub_models.evaluators.base_evaluators import BaseEvaluator, _DataLoader +from qai_hub_models.models.common import SampleInputsType, TargetRuntime +from qai_hub_models.utils.input_spec import InputSpec + +FromPretrainedTypeVar = TypeVar("FromPretrainedTypeVar", bound="FromPretrainedProtocol") + +FromPrecompiledTypeVar = TypeVar( + "FromPrecompiledTypeVar", bound="FromPrecompiledProtocol" +) + + +class HubModelProtocol(Protocol): + """ + All AI Hub Models must, at minimum, implement this interface. + """ + + @staticmethod + @abstractmethod + def get_input_spec(*args, **kwargs) -> InputSpec: + """ + Returns a map from `{input_name -> (shape, dtype)}` + specifying the shape and dtype for each input argument. + """ + ... + + @abstractmethod + def sample_inputs(self, input_spec: InputSpec | None = None) -> SampleInputsType: + """ + Returns a set of sample inputs for the model. + + For each input name in the model, a list of numpy arrays is provided. + If the returned set is batch N, all input names must contain exactly N numpy arrays. + + This is a default implementation that returns a single random data array + for each input name based on the shapes and dtypes in `get_input_spec`. + + A subclass may choose to override this and fetch a batch of real input data + from a data source. + """ + ... + + +class QuantizableModelProtocol(Protocol): + """ + Methods required for a model to be quantizable. + """ + + @abstractmethod + def quantize( + self, + data: _DataLoader, + num_samples: int | None = None, + evaluator: BaseEvaluator | None = None, + device: str = "cpu", + requantize_model_weights=False, + ) -> float | None: + """ + Compute quantization encodings for this model with the given dataset and model evaluator. + + This model will be updated with a new set of quantization parameters. Future calls to + forward() and export_...() will take these quantization parameters into account. + + Parameters: + data: torch DataLoader | Collection + Data loader for the dataset to use for evaluation. + If an evaluator is __NOT__ provided (see "evaluator" parameter), the iterator must return + inputs: Collection[torch.Tensor] | torch.Tensor + + otherwise, if an evaluator __IS__ provided, the iterator must return + tuple( + inputs: Collection[torch.Tensor] | torch.Tensor, + ground_truth: Collection[torch.Tensor] | torch.Tensor] + ) + + num_samples: int | None + Number of samples to use for evaluation. One sample is one iteration from iter(data). + If none, defaults to the number of samples in the dataset. + + evaluator: BaseModelEvaluator | None + Evaluator to populate while quantizing the data. + If not provided, an evaluator is not used. + + device: str + Name of device on which inference should be run. + + requantize_model_weights: bool + If a weight is quantized, recompute its quantization parameters. + + Returns: + If an evaluator is provided, returns its accuracy score. No return value otherwise. + """ + ... + + @abstractmethod + def get_calibration_data( + self, + target_runtime: TargetRuntime, + input_spec: InputSpec | None = None, + ) -> DatasetEntries | None: + """ + Calibration dataset for this model and input spec. + """ + ... 
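To make the intent of these protocols concrete, here is a minimal sketch (not part of protocols.py) of a function typed against `HubModelProtocol`. It uses OpenPose, which appears elsewhere in this patch, purely as an example, and assumes the model provides the default `sample_inputs` implementation alongside its `get_input_spec`:

```python
# Illustrative sketch: any model that implements get_input_spec()/sample_inputs()
# satisfies HubModelProtocol structurally -- no shared base class required.
from qai_hub_models.models.openpose import Model as OpenPose
from qai_hub_models.models.protocols import HubModelProtocol


def describe_inputs(model: HubModelProtocol) -> None:
    # get_input_spec() maps input_name -> (shape, dtype), per the docstring above.
    for name, (shape, dtype) in model.get_input_spec().items():
        print(f"{name}: shape={shape}, dtype={dtype}")


describe_inputs(OpenPose.from_pretrained())  # loads the pretrained checkpoint
```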
+ + +class ExecutableModelProtocol(Protocol): + """ + Classes follow this protocol if they are executable. + """ + + @abstractmethod + def __call__(self, *args, **kwargs): + """ + Execute the model and return its output. + """ + ... + + +@runtime_checkable +class EvalModelProtocol(Protocol): + """ + Models follow this protocol if they can be numerically evaluated. + """ + + @abstractmethod + def get_evaluator(self) -> BaseEvaluator: + """ + Gets a class for evaluating output of this model. + """ + ... + + +@runtime_checkable +class FromPretrainedProtocol(Protocol): + """ + Models follow this protocol if they can be initiated from a pretrained torch model. + """ + + @classmethod + @abstractmethod + def from_pretrained( + cls: Type[FromPretrainedTypeVar], *args, **kwargs + ) -> FromPretrainedTypeVar: + """ + Utility function that helps users get up and running with a default + pretrained model. While this function may take arguments, all arguments + should have default values specified, so that all classes can be invoked + with `cls.from_pretrained()` and always have it return something reasonable. + """ + ... + + +class FromPrecompiledProtocol(Protocol): + """ + Models follow this protocol if they can be initiated from a precompiled torch model. + """ + + @classmethod + @abstractmethod + def from_precompiled( + cls: Type[FromPrecompiledTypeVar], *args, **kwargs + ) -> "FromPrecompiledTypeVar": + """ + Utility function that helps users get up and running with a default + precompiled model. While this function may take arguments, all arguments + should have default values specified, so that all classes can be invoked + with `cls.from_precompiled()` and always have it return something reasonable. + """ + ... diff --git a/qai_hub_models/models/quicksrnetlarge/README.md b/qai_hub_models/models/quicksrnetlarge/README.md index dece0985..9d1f26c2 100644 --- a/qai_hub_models/models/quicksrnetlarge/README.md +++ b/qai_hub_models/models/quicksrnetlarge/README.md @@ -10,7 +10,7 @@ This is based on the implementation of QuickSRNetLarge found export suitable to run on Qualcomm® devices. More details on model performance accross various devices, can be found [here](https://aihub.qualcomm.com/models/quicksrnetlarge). -[Sign up](https://aihub.qualcomm.com/) for early access to run these models on +[Sign up](https://myaccount.qualcomm.com/signup) for early access to run these models on a hosted Qualcomm® device. @@ -25,7 +25,7 @@ python -m qai_hub_models.models.quicksrnetlarge.demo More details on the CLI tool can be found with the `--help` option. See [demo.py](demo.py) for sample usage of the model including pre/post processing scripts. Please refer to our [general instructions on using -models](../../#qai-hub-models) for more usage instructions. +models](../../../#getting-started) for more usage instructions. ## Export for on-device deployment @@ -41,7 +41,7 @@ script requires access to Deployment instructions for Qualcomm® AI Hub. ## License - The license for the original implementation of QuickSRNetLarge can be found [here](https://github.com/quic/aimet-model-zoo/blob/develop/LICENSE.pdf). -- The license for the compiled assets for on-device deployment can be found [here](https://qaihub-public-assets.s3.us-west-2.amazonaws.com/qai-hub-models/Qualcomm+AI+Hub+Proprietary+License.pdf). 
+- The license for the compiled assets for on-device deployment can be found [here]({deploy_license_url}) ## References * [QuickSRNet: Plain Single-Image Super-Resolution Architecture for Faster Inference on Mobile Platforms](https://arxiv.org/abs/2303.04336) diff --git a/qai_hub_models/models/quicksrnetlarge/conftest.py b/qai_hub_models/models/quicksrnetlarge/conftest.py new file mode 100644 index 00000000..b6f5b722 --- /dev/null +++ b/qai_hub_models/models/quicksrnetlarge/conftest.py @@ -0,0 +1,26 @@ +# --------------------------------------------------------------------- +# Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. +# SPDX-License-Identifier: BSD-3-Clause +# --------------------------------------------------------------------- +# THIS FILE WAS AUTO-GENERATED. DO NOT EDIT MANUALLY. + +from unittest.mock import patch + +import pytest + +from qai_hub_models.models.quicksrnetlarge import Model +from qai_hub_models.utils.testing import skip_clone_repo_check + + +@pytest.fixture(autouse=True) +@skip_clone_repo_check +def mock_from_pretrained(): + """ + Model.from_pretrained() can be slow. Invoke it once and cache it so all invocations + across all tests return the cached instance of the model. + """ + mock = patch( + "qai_hub_models.models.quicksrnetlarge.Model.from_pretrained", + return_value=Model.from_pretrained(), + ) + mock.start() diff --git a/qai_hub_models/models/quicksrnetlarge/demo.py b/qai_hub_models/models/quicksrnetlarge/demo.py index 77a29a77..12d688c3 100644 --- a/qai_hub_models/models/quicksrnetlarge/demo.py +++ b/qai_hub_models/models/quicksrnetlarge/demo.py @@ -20,6 +20,7 @@ def main(is_test: bool = False): super_resolution_demo( model_cls=QuickSRNetLarge, + model_id=MODEL_ID, default_image=IMAGE_ADDRESS, is_test=is_test, ) diff --git a/qai_hub_models/models/quicksrnetlarge/export.py b/qai_hub_models/models/quicksrnetlarge/export.py index 07628b5b..9e1976a4 100644 --- a/qai_hub_models/models/quicksrnetlarge/export.py +++ b/qai_hub_models/models/quicksrnetlarge/export.py @@ -10,7 +10,7 @@ import os import warnings from pathlib import Path -from typing import List, Optional, Tuple +from typing import Any, Dict, List, Optional, Tuple, cast import qai_hub as hub import torch @@ -26,6 +26,7 @@ from qai_hub_models.utils.input_spec import make_torch_inputs from qai_hub_models.utils.printing import ( print_inference_metrics, + print_on_target_demo_cmd, print_profile_metrics_from_job, ) from qai_hub_models.utils.qai_hub_helpers import ( @@ -109,7 +110,7 @@ def export_model( ) # Trace the model - source_model = torch.jit.trace(model, make_torch_inputs(input_spec)) + source_model = torch.jit.trace(model.to("cpu"), make_torch_inputs(input_spec)) # 2. Compile the model to an on-device asset model_compile_options = model.get_hub_compile_options( @@ -118,29 +119,37 @@ def export_model( + " --force_channel_last_input image" + " --force_channel_last_output output_0", ) - print(f"Optimizing model {model_name} to run on-device.") - compile_job = hub.submit_compile_job( + print(f"Optimizing model {model_name} to run on-device") + submitted_compile_job = hub.submit_compile_job( model=source_model, input_specs=input_spec, device=hub.Device(device), name=model_name, options=model_compile_options, ) + compile_job = cast(hub.client.CompileJob, submitted_compile_job) # 3. 
Profile the model asset on real devices - profile_job = None + profile_job: Optional[hub.client.ProfileJob] = None if not skip_profiling: + profile_options_all = model.get_hub_profile_options( + target_runtime, profile_options + ) print(f"Profiling model {model_name} on a hosted device.") - profile_job = hub.submit_profile_job( + submitted_profile_job = hub.submit_profile_job( model=compile_job.get_target_model(), device=hub.Device(device), name=model_name, - options=profile_options, + options=profile_options_all, ) + profile_job = cast(hub.client.ProfileJob, submitted_profile_job) # 4. Run inference on-device with sample inputs - inference_job = None + inference_job: Optional[hub.client.InferenceJob] = None if not skip_inferencing: + profile_options_all = model.get_hub_profile_options( + target_runtime, profile_options + ) print( f"Running inference for {model_name} on a hosted device with example inputs." ) @@ -149,36 +158,40 @@ def export_model( hub_inputs = transpose_channel_first_to_last( "image", sample_inputs, target_runtime ) - inference_job = hub.submit_inference_job( + submitted_inference_job = hub.submit_inference_job( model=compile_job.get_target_model(), inputs=hub_inputs, device=hub.Device(device), name=model_name, - options=profile_options, + options=profile_options_all, ) + inference_job = cast(hub.client.InferenceJob, submitted_inference_job) # 5. Download the model asset to a local file if not skip_downloading: os.makedirs(output_path, exist_ok=True) - target_model = compile_job.get_target_model() + target_model: hub.Model = compile_job.get_target_model() # type: ignore target_model.download(str(output_path / f"{model_name}.tflite")) # 6. Summarize the results from profiling and inference if not skip_summary and not skip_profiling: - assert profile_job.wait().success - profile_data = profile_job.download_profile() + assert profile_job is not None and profile_job.wait().success + profile_data: Dict[str, Any] = profile_job.download_profile() # type: ignore print_profile_metrics_from_job(profile_job, profile_data) if not skip_summary and not skip_inferencing: torch_out = torch_inference(model, sample_inputs) - assert inference_job.wait().success - inference_result = inference_job.download_output_data() + assert inference_job is not None and inference_job.wait().success + inference_result: hub.client.DatasetEntries = inference_job.download_output_data() # type: ignore # Convert outputs from channel last to channel first inference_result = transpose_channel_last_to_first( "output_0", inference_result, target_runtime ) print_inference_metrics(inference_job, inference_result, torch_out) + if not skip_summary: + print_on_target_demo_cmd(compile_job, Path(__file__).parent.resolve(), device) + return (compile_job, profile_job, inference_job) diff --git a/qai_hub_models/models/quicksrnetlarge/info.yaml b/qai_hub_models/models/quicksrnetlarge/info.yaml index 3bb825dc..b9cc532d 100644 --- a/qai_hub_models/models/quicksrnetlarge/info.yaml +++ b/qai_hub_models/models/quicksrnetlarge/info.yaml @@ -12,6 +12,7 @@ research_paper: https://arxiv.org/abs/2303.04336 research_paper_title: 'QuickSRNet: Plain Single-Image Super-Resolution Architecture for Faster Inference on Mobile Platforms' license: https://github.com/quic/aimet-model-zoo/blob/develop/LICENSE.pdf +deploy_license: https://qaihub-public-assets.s3.us-west-2.amazonaws.com/qai-hub-models/Qualcomm+AI+Hub+Proprietary+License.pdf source_repo: https://github.com/quic/aimet-model-zoo/tree/develop/aimet_zoo_torch/quicksrnet 
technical_details: Model checkpoint: quicksrnet_large_4x_checkpoint_float32 @@ -27,6 +28,7 @@ form_factors: - Tablet related_models: [xlsr, esrgan, quicksrnetlarge_quantized] has_static_banner: yes -has_animated_banner: yes +has_animated_banner: no license_type: other +deploy_license_type: AI Model Hub License dataset: [] diff --git a/qai_hub_models/models/quicksrnetlarge/model.py b/qai_hub_models/models/quicksrnetlarge/model.py index 609b9163..bfed7f6c 100644 --- a/qai_hub_models/models/quicksrnetlarge/model.py +++ b/qai_hub_models/models/quicksrnetlarge/model.py @@ -40,8 +40,6 @@ def __init__( @classmethod def from_pretrained(cls) -> QuickSRNetLarge: model = _load_quicksrnet_source_model( - MODEL_ID, - MODEL_ASSET_VERSION, SCALING_FACTOR, NUM_CHANNELS, NUM_INTERMEDIATE_LAYERS, diff --git a/qai_hub_models/models/quicksrnetlarge/perf.yaml b/qai_hub_models/models/quicksrnetlarge/perf.yaml index 1b0102ea..d7559b5d 100644 --- a/qai_hub_models/models/quicksrnetlarge/perf.yaml +++ b/qai_hub_models/models/quicksrnetlarge/perf.yaml @@ -17,22 +17,25 @@ aggregated: - Samsung Galaxy S23 - Samsung Galaxy S23 Ultra - Samsung Galaxy S23+ + - Samsung Galaxy S24 + - Samsung Galaxy S24 Ultra - Samsung Galaxy Tab S8 - Xiaomi 12 - Xiaomi 12 Pro supported_chipsets: - Snapdragon® 8 Gen 1 - Snapdragon® 8 Gen 2 + - Snapdragon® 8 Gen 3 - Snapdragon® 888 models: - name: QuickSRNetLarge performance_metrics: - torchscript_onnx_tflite: - inference_time: 2532.0 - throughput: 394.9447077409163 + inference_time: 2500.0 + throughput: 400.0 estimated_peak_memory_range: min: 16384 - max: 8035880 + max: 1492864 primary_compute_unit: NPU precision: fp16 layer_info: @@ -40,14 +43,22 @@ models: layers_on_gpu: 0 layers_on_cpu: 3 total_layers: 31 - job_id: jz57el4rp + job_id: jn5q8l757 job_status: Passed + reference_device_info: + name: Samsung Galaxy S23 + os: '13' + form_factor: Phone + os_name: Android + manufacturer: Samsung + chipset: Snapdragon® 8 Gen 2 + timestamp: '2024-03-15T00:06:24.010143Z' torchscript_onnx_qnn: - inference_time: 2106.0 - throughput: 474.8338081671415 + inference_time: 2109.0 + throughput: 474.158368895211 estimated_peak_memory_range: - min: 212992 - max: 76319976 + min: 16384 + max: 5120280 primary_compute_unit: NPU precision: fp16 layer_info: @@ -55,13 +66,43 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 32 - job_id: jqp4yd1lp + job_id: jw5668v5o + job_status: Passed + - torchscript_onnx_tflite: + inference_time: 1780.0 + throughput: 561.7977528089888 + estimated_peak_memory_range: + min: 20480 + max: 27633264 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 28 + layers_on_gpu: 0 + layers_on_cpu: 3 + total_layers: 31 + job_id: j1glnyepv job_status: Passed reference_device_info: - name: Samsung Galaxy S23 Ultra - os: '13' + name: Samsung Galaxy S24 + os: '14' form_factor: Phone os_name: Android manufacturer: Samsung - chipset: Snapdragon® 8 Gen 2 - timestamp: '2024-02-21T16:38:01.534196Z' + chipset: Snapdragon® 8 Gen 3 + timestamp: '2024-03-15T00:10:42.992618Z' + torchscript_onnx_qnn: + inference_time: 1506.0 + throughput: 664.0106241699867 + estimated_peak_memory_range: + min: 208896 + max: 18546960 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 32 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 32 + job_id: j1p3kzx52 + job_status: Passed diff --git a/qai_hub_models/models/quicksrnetlarge/test.py b/qai_hub_models/models/quicksrnetlarge/test.py index 66f6fc53..ad63526d 100644 --- 
a/qai_hub_models/models/quicksrnetlarge/test.py +++ b/qai_hub_models/models/quicksrnetlarge/test.py @@ -35,5 +35,6 @@ def test_task(): ) +@skip_clone_repo_check def test_demo(): demo_main(is_test=True) diff --git a/qai_hub_models/models/quicksrnetlarge_quantized/README.md b/qai_hub_models/models/quicksrnetlarge_quantized/README.md new file mode 100644 index 00000000..22069d4a --- /dev/null +++ b/qai_hub_models/models/quicksrnetlarge_quantized/README.md @@ -0,0 +1,54 @@ +[![Qualcomm® AI Hub Models](https://qaihub-public-assets.s3.us-west-2.amazonaws.com/qai-hub-models/quic-logo.jpg)](../../README.md) + + +# [QuickSRNetLarge-Quantized: Upscale images and remove image noise](https://aihub.qualcomm.com/models/quicksrnetlarge_quantized) + +QuickSRNet Large is designed for upscaling images on mobile platforms to sharpen in real-time. + +This is based on the implementation of QuickSRNetLarge-Quantized found +[here](https://github.com/quic/aimet-model-zoo/tree/develop/aimet_zoo_torch/quicksrnet). This repository contains scripts for optimized on-device +export suitable to run on Qualcomm® devices. More details on model performance +accross various devices, can be found [here](https://aihub.qualcomm.com/models/quicksrnetlarge_quantized). + +[Sign up](https://myaccount.qualcomm.com/signup) for early access to run these models on +a hosted Qualcomm® device. + + +## Example & Usage + + +Once installed, run the following simple CLI demo: + +```bash +python -m qai_hub_models.models.quicksrnetlarge_quantized.demo +``` +More details on the CLI tool can be found with the `--help` option. See +[demo.py](demo.py) for sample usage of the model including pre/post processing +scripts. Please refer to our [general instructions on using +models](../../../#getting-started) for more usage instructions. + +## Export for on-device deployment + +This repository contains export scripts that produce a model optimized for +on-device deployment. This can be run as follows: + +```bash +python -m qai_hub_models.models.quicksrnetlarge_quantized.export +``` +Additional options are documented with the `--help` option. Note that the above +script requires access to Deployment instructions for Qualcomm® AI Hub. + +## License +- The license for the original implementation of QuickSRNetLarge-Quantized can be found + [here](https://github.com/quic/aimet-model-zoo/blob/develop/LICENSE.pdf). +- The license for the compiled assets for on-device deployment can be found [here]({deploy_license_url}) + +## References +* [QuickSRNet: Plain Single-Image Super-Resolution Architecture for Faster Inference on Mobile Platforms](https://arxiv.org/abs/2303.04336) +* [Source Model Implementation](https://github.com/quic/aimet-model-zoo/tree/develop/aimet_zoo_torch/quicksrnet) + +## Community +* Join [our AI Hub Slack community](https://join.slack.com/t/qualcomm-ai-hub/shared_invite/zt-2dgf95loi-CXHTDRR1rvPgQWPO~ZZZJg) to collaborate, post questions and learn more about on-device AI. +* For questions or feedback please [reach out to us](mailto:ai-hub-support@qti.qualcomm.com). + + diff --git a/qai_hub_models/models/quicksrnetlarge_quantized/__init__.py b/qai_hub_models/models/quicksrnetlarge_quantized/__init__.py new file mode 100644 index 00000000..dbfdb539 --- /dev/null +++ b/qai_hub_models/models/quicksrnetlarge_quantized/__init__.py @@ -0,0 +1,10 @@ +# --------------------------------------------------------------------- +# Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. 
+# SPDX-License-Identifier: BSD-3-Clause +# --------------------------------------------------------------------- +from qai_hub_models.models._shared.super_resolution.app import ( # noqa: F401 + SuperResolutionApp as App, +) + +from .model import MODEL_ID # noqa: F401 +from .model import QuickSRNetLargeQuantizable as Model # noqa: F401 diff --git a/qai_hub_models/models/quicksrnetlarge_quantized/conftest.py b/qai_hub_models/models/quicksrnetlarge_quantized/conftest.py new file mode 100644 index 00000000..c1b1a9d9 --- /dev/null +++ b/qai_hub_models/models/quicksrnetlarge_quantized/conftest.py @@ -0,0 +1,26 @@ +# --------------------------------------------------------------------- +# Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. +# SPDX-License-Identifier: BSD-3-Clause +# --------------------------------------------------------------------- +# THIS FILE WAS AUTO-GENERATED. DO NOT EDIT MANUALLY. + +from unittest.mock import patch + +import pytest + +from qai_hub_models.models.quicksrnetlarge_quantized import Model +from qai_hub_models.utils.testing import skip_clone_repo_check + + +@pytest.fixture(autouse=True) +@skip_clone_repo_check +def mock_from_pretrained(): + """ + Model.from_pretrained() can be slow. Invoke it once and cache it so all invocations + across all tests return the cached instance of the model. + """ + mock = patch( + "qai_hub_models.models.quicksrnetlarge_quantized.Model.from_pretrained", + return_value=Model.from_pretrained(), + ) + mock.start() diff --git a/qai_hub_models/models/quicksrnetlarge_quantized/demo.py b/qai_hub_models/models/quicksrnetlarge_quantized/demo.py new file mode 100644 index 00000000..53d37094 --- /dev/null +++ b/qai_hub_models/models/quicksrnetlarge_quantized/demo.py @@ -0,0 +1,28 @@ +# --------------------------------------------------------------------- +# Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. +# SPDX-License-Identifier: BSD-3-Clause +# --------------------------------------------------------------------- +from qai_hub_models.models._shared.super_resolution.demo import super_resolution_demo +from qai_hub_models.models.quicksrnetlarge_quantized.model import ( + MODEL_ASSET_VERSION, + MODEL_ID, + QuickSRNetLargeQuantizable, +) +from qai_hub_models.utils.asset_loaders import CachedWebModelAsset + +IMAGE_ADDRESS = CachedWebModelAsset.from_asset_store( + MODEL_ID, MODEL_ASSET_VERSION, "quicksrnet_demo.jpg" +) + + +def main(is_test: bool = False): + super_resolution_demo( + QuickSRNetLargeQuantizable, + MODEL_ID, + default_image=IMAGE_ADDRESS, + is_test=is_test, + ) + + +if __name__ == "__main__": + main() diff --git a/qai_hub_models/models/quicksrnetlarge_quantized/export.py b/qai_hub_models/models/quicksrnetlarge_quantized/export.py new file mode 100644 index 00000000..ea5568eb --- /dev/null +++ b/qai_hub_models/models/quicksrnetlarge_quantized/export.py @@ -0,0 +1,215 @@ +# --------------------------------------------------------------------- +# Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. +# SPDX-License-Identifier: BSD-3-Clause +# --------------------------------------------------------------------- +# THIS FILE WAS AUTO-GENERATED. DO NOT EDIT MANUALLY. 
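Each generated model package re-exports its contents under the uniform aliases `Model` and `App` (see the `__init__.py` above), which is what lets the auto-generated export.py, conftest.py, and demo.py treat every model identically. A minimal, hypothetical usage sketch; the assumption that `App` (the shared `SuperResolutionApp`) takes the model as its only required constructor argument is mine and is not stated in this patch:

```python
# Illustrative only: uniform access to a model package through its aliases.
from qai_hub_models.models.quicksrnetlarge_quantized import App, Model

model = Model.from_pretrained()  # QuickSRNetLargeQuantizable
app = App(model)                 # assumed signature: SuperResolutionApp(model)
```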
+ + +from __future__ import annotations + +import os +import warnings +from pathlib import Path +from typing import Any, Dict, List, Optional, Tuple, cast + +import qai_hub as hub + +from qai_hub_models.models.quicksrnetlarge_quantized import Model +from qai_hub_models.utils.args import ( + export_parser, + get_input_spec_kwargs, + get_model_kwargs, +) +from qai_hub_models.utils.base_model import TargetRuntime +from qai_hub_models.utils.compare import torch_inference +from qai_hub_models.utils.printing import ( + print_inference_metrics, + print_on_target_demo_cmd, + print_profile_metrics_from_job, +) +from qai_hub_models.utils.qai_hub_helpers import ( + can_access_qualcomm_ai_hub, + export_without_hub_access, + transpose_channel_first_to_last, + transpose_channel_last_to_first, +) +from qai_hub_models.utils.qnn_helpers import get_qnn_inputs + + +def export_model( + device: str = "Samsung Galaxy S23", + skip_profiling: bool = False, + skip_inferencing: bool = False, + skip_downloading: bool = False, + skip_summary: bool = False, + output_dir: Optional[str] = None, + target_runtime: TargetRuntime = TargetRuntime.TFLITE, + compile_options: str = "", + profile_options: str = "", + **additional_model_kwargs, +) -> Tuple[hub.CompileJob, Optional[hub.ProfileJob], Optional[hub.InferenceJob]] | List[ + str +]: + """ + This function accomplishes 6 main tasks: + + 1. Instantiates a PyTorch model and converts it to a traced TorchScript format. + 2. Compiles the model to an asset that can be run on device. + 3. Profiles the model performance on real devices. + 4. Inferences the model on sample inputs. + 5. Downloads the model asset to the local directory. + 6. Summarizes the results from profiling and inference. + + Each of the last four steps can be optionally skipped using the input options. + + Parameters: + device: Device for which to export the model. + Full list of available devices can be found by running `hub.get_devices()`. + Defaults to DEFAULT_DEVICE if not specified. + skip_profiling: If set, skips profiling of compiled model on real devices. + skip_inferencing: If set, skips computing on-device outputs from sample data. + skip_downloading: If set, skips downloading of compiled model. + skip_summary: If set, skips waiting for and summarizing results + from profiling and inference. + output_dir: Directory to store generated assets (e.g. compiled model). + Defaults to `/build/`. + target_runtime: Which on-device runtime to target. Default is TFLite. + compile_options: Additional options to pass when submitting the compile job. + profile_options: Additional options to pass when submitting the profile job. + **additional_model_kwargs: Additional optional kwargs used to customize + `model_cls.from_pretrained` and `model.get_input_spec` + + Returns: + A 3-tuple of: + * A CompileJob object containing metadata about the compile job submitted to hub. + * A ProfileJob containing metadata about the profile job (None if profiling skipped). + * An InferenceJob containing metadata about the inference job (None if inferencing skipped). + """ + model_name = "quicksrnetlarge_quantized" + output_path = Path(output_dir or Path.cwd() / "build" / model_name) + if not can_access_qualcomm_ai_hub(): + return export_without_hub_access( + "quicksrnetlarge_quantized", + "QuickSRNetLarge-Quantized", + device, + skip_profiling, + skip_inferencing, + skip_downloading, + skip_summary, + output_path, + target_runtime, + compile_options, + profile_options, + ) + + # 1. 
Initialize PyTorch model + model = Model.from_pretrained(**get_model_kwargs(Model, additional_model_kwargs)) + input_spec = model.get_input_spec( + **get_input_spec_kwargs(model, additional_model_kwargs) + ) + + # Trace the model + source_model = model.convert_to_hub_source_model( + target_runtime, output_path, input_spec + ) + if target_runtime == TargetRuntime.TFLITE: + quant_calibration_data = None + else: + quant_calibration_data = model.get_calibration_data(target_runtime, input_spec) + + # 2. Compile the model to an on-device asset + model_compile_options = model.get_hub_compile_options( + target_runtime, + compile_options + + " --force_channel_last_input image" + + " --force_channel_last_output output_0", + ) + print(f"Optimizing model {model_name} to run on-device") + submitted_compile_job = hub.submit_compile_job( + model=source_model, + input_specs=input_spec, + device=hub.Device(device), + name=model_name, + calibration_data=quant_calibration_data, + options=model_compile_options, + ) + compile_job = cast(hub.client.CompileJob, submitted_compile_job) + + # 3. Profile the model asset on real devices + profile_job: Optional[hub.client.ProfileJob] = None + if not skip_profiling: + profile_options_all = model.get_hub_profile_options( + target_runtime, profile_options + ) + print(f"Profiling model {model_name} on a hosted device.") + submitted_profile_job = hub.submit_profile_job( + model=compile_job.get_target_model(), + device=hub.Device(device), + name=model_name, + options=profile_options_all, + ) + profile_job = cast(hub.client.ProfileJob, submitted_profile_job) + + # 4. Run inference on-device with sample inputs + inference_job: Optional[hub.client.InferenceJob] = None + if not skip_inferencing: + profile_options_all = model.get_hub_profile_options( + target_runtime, profile_options + ) + print( + f"Running inference for {model_name} on a hosted device with example inputs." + ) + sample_inputs = model.sample_inputs(input_spec) + hub_inputs = sample_inputs + if target_runtime == TargetRuntime.QNN: + hub_inputs = get_qnn_inputs(compile_job, sample_inputs) + # Convert inputs from channel first to channel last + hub_inputs = transpose_channel_first_to_last( + "image", sample_inputs, target_runtime + ) + submitted_inference_job = hub.submit_inference_job( + model=compile_job.get_target_model(), + inputs=hub_inputs, + device=hub.Device(device), + name=model_name, + options=profile_options_all, + ) + inference_job = cast(hub.client.InferenceJob, submitted_inference_job) + + # 5. Download the model asset to a local file + if not skip_downloading: + os.makedirs(output_path, exist_ok=True) + target_model: hub.Model = compile_job.get_target_model() # type: ignore + target_model.download(str(output_path / f"{model_name}.tflite")) + + # 6. 
Summarize the results from profiling and inference + if not skip_summary and not skip_profiling: + assert profile_job is not None and profile_job.wait().success + profile_data: Dict[str, Any] = profile_job.download_profile() # type: ignore + print_profile_metrics_from_job(profile_job, profile_data) + + if not skip_summary and not skip_inferencing: + torch_out = torch_inference(model, sample_inputs) + assert inference_job is not None and inference_job.wait().success + inference_result: hub.client.DatasetEntries = inference_job.download_output_data() # type: ignore + # Convert outputs from channel last to channel first + inference_result = transpose_channel_last_to_first( + "output_0", inference_result, target_runtime + ) + print_inference_metrics(inference_job, inference_result, torch_out) + + if not skip_summary: + print_on_target_demo_cmd(compile_job, Path(__file__).parent.resolve(), device) + + return (compile_job, profile_job, inference_job) + + +def main(): + warnings.filterwarnings("ignore") + parser = export_parser(model_cls=Model, supports_qnn=False) + args = parser.parse_args() + export_model(**vars(args)) + + +if __name__ == "__main__": + main() diff --git a/qai_hub_models/models/quicksrnetlarge_quantized/info.yaml b/qai_hub_models/models/quicksrnetlarge_quantized/info.yaml new file mode 100644 index 00000000..93c2001d --- /dev/null +++ b/qai_hub_models/models/quicksrnetlarge_quantized/info.yaml @@ -0,0 +1,35 @@ +name: QuickSRNetLarge-Quantized +# id must match with the model dir name in qai_hub_models +id: quicksrnetlarge_quantized +status: public +headline: Upscale images and remove image noise. +domain: Computer Vision +description: QuickSRNet Large is designed for upscaling images on mobile platforms + to sharpen in real-time. +use_case: Super Resolution +tags: + - quantized +research_paper: https://arxiv.org/abs/2303.04336 +research_paper_title: 'QuickSRNet: Plain Single-Image Super-Resolution Architecture + for Faster Inference on Mobile Platforms' +license: https://github.com/quic/aimet-model-zoo/blob/develop/LICENSE.pdf +deploy_license: https://qaihub-public-assets.s3.us-west-2.amazonaws.com/qai-hub-models/Qualcomm+AI+Hub+Proprietary+License.pdf +source_repo: https://github.com/quic/aimet-model-zoo/tree/develop/aimet_zoo_torch/quicksrnet +technical_details: + Model checkpoint: quicksrnet_large_4x_checkpoint_int8 + Input resolution: 128x128 + Number of parameters: 436K + Model size: 464 KB +applicable_scenarios: + - Virtual Real Estate Tours + - Gaming + - ARVR +form_factors: + - Phone + - Tablet +related_models: [xlsr, xlsr_quantized, quicksrnetlarge] +has_static_banner: yes +has_animated_banner: yes +license_type: other +deploy_license_type: AI Model Hub License +dataset: [] diff --git a/qai_hub_models/models/quicksrnetlarge_quantized/model.py b/qai_hub_models/models/quicksrnetlarge_quantized/model.py new file mode 100644 index 00000000..9bca792d --- /dev/null +++ b/qai_hub_models/models/quicksrnetlarge_quantized/model.py @@ -0,0 +1,99 @@ +# --------------------------------------------------------------------- +# Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. +# SPDX-License-Identifier: BSD-3-Clause +# --------------------------------------------------------------------- +from __future__ import annotations + +# isort: off +# This verifies aimet is installed, and this must be included first. 
+from qai_hub_models.utils.quantization_aimet import ( + AIMETQuantizableMixin, +) + +# isort: on + +import torch +from aimet_torch.cross_layer_equalization import equalize_model +from aimet_torch.quantsim import QuantizationSimModel, load_encodings_to_sim + +from qai_hub_models.models.quicksrnetlarge.model import QuickSRNetLarge +from qai_hub_models.utils.aimet.config_loader import get_default_aimet_config_legacy_v2 +from qai_hub_models.utils.asset_loaders import CachedWebModelAsset + +MODEL_ID = __name__.split(".")[-2] +MODEL_ASSET_VERSION = 2 + +# Weights and config stored in S3 are sourced from +# https://github.com/quic/aimet-model-zoo/blob/develop/aimet_zoo_torch/quicksrnet/model/model_cards/quicksrnet_large_4x_w8a8.json: +# https://github.com/quic/aimet-model-zoo/releases/download/phase_2_january_artifacts/quicksrnet_large_4x_checkpoint_int8.pth +# and +# https://raw.githubusercontent.com/quic/aimet/release-aimet-1.23/TrainingExtensions/common/src/python/aimet_common/quantsim_config/default_config_per_channel.js +# Encodings were generated with AIMET QuantSim library +QUANTIZED_WEIGHTS = "quicksrnet_large_4x_checkpoint_int8.pth" +AIMET_ENCODINGS = "aimet_quantization_encodings.json" +SCALING_FACTOR = 4 + + +class QuickSRNetLargeQuantizable(AIMETQuantizableMixin, QuickSRNetLarge): + """QuickSRNetLarge with post train quantization support. + + Supports only 8 bit weights and activations, and only loads pre-quantized checkpoints. + Support for quantizing using your own weights & data will come at a later date.""" + + def __init__( + self, + quicksrnet_model: QuantizationSimModel, + ) -> None: + QuickSRNetLarge.__init__(self, quicksrnet_model.model) + AIMETQuantizableMixin.__init__( + self, quicksrnet_model, needs_onnx_direct_aimet_export=True + ) + + @classmethod + def from_pretrained( + cls, + aimet_encodings: str | None = "DEFAULT", + ) -> "QuickSRNetLargeQuantizable": + """ + Parameters: + aimet_encodings: + if "DEFAULT": Loads the model with aimet encodings calibrated on BSD300. + elif None: Doesn't load any encodings. Used when computing encodings. + else: Interprets as a filepath and loads the encodings stored there. + """ + # Load Model + quicksrnet = QuickSRNetLarge.from_pretrained() + input_shape = quicksrnet.get_input_spec()["image"][0] + equalize_model(quicksrnet, input_shape) + + # Download weights and quantization parameters + weights = CachedWebModelAsset.from_asset_store( + MODEL_ID, MODEL_ASSET_VERSION, QUANTIZED_WEIGHTS + ).fetch() + aimet_config = get_default_aimet_config_legacy_v2() + + # Load the model weights and quantization parameters + # In this particular instance, the state_dict keys from the model are all named "model." + # where is the name of each key in the weights file - without the word model. + # We rename all the keys to add the word model + state_dict = torch.load(weights, map_location=torch.device("cpu"))["state_dict"] + new_state_dict = {"model." 
+ key: value for key, value in state_dict.items()} + quicksrnet.load_state_dict(new_state_dict) + sim = QuantizationSimModel( + quicksrnet, + quant_scheme="tf_enhanced", + default_param_bw=8, + default_output_bw=8, + config_file=aimet_config, + dummy_input=torch.rand(input_shape), + ) + if aimet_encodings: + if aimet_encodings == "DEFAULT": + aimet_encodings = CachedWebModelAsset.from_asset_store( + MODEL_ID, MODEL_ASSET_VERSION, AIMET_ENCODINGS + ).fetch() + load_encodings_to_sim(sim, aimet_encodings) + + sim.model.eval() + + return cls(sim) diff --git a/qai_hub_models/models/quicksrnetlarge_quantized/perf.yaml b/qai_hub_models/models/quicksrnetlarge_quantized/perf.yaml new file mode 100644 index 00000000..174979a6 --- /dev/null +++ b/qai_hub_models/models/quicksrnetlarge_quantized/perf.yaml @@ -0,0 +1,108 @@ +aggregated: + supported_oses: + - Android + supported_devices: + - Google Pixel 3 + - Google Pixel 3a + - Google Pixel 3a XL + - Google Pixel 4 + - Google Pixel 4a + - Google Pixel 5a 5G + - Samsung Galaxy S21 + - Samsung Galaxy S21 Ultra + - Samsung Galaxy S21+ + - Samsung Galaxy S22 5G + - Samsung Galaxy S22 Ultra 5G + - Samsung Galaxy S22+ 5G + - Samsung Galaxy S23 + - Samsung Galaxy S23 Ultra + - Samsung Galaxy S23+ + - Samsung Galaxy S24 + - Samsung Galaxy S24 Ultra + - Samsung Galaxy Tab S8 + - Xiaomi 12 + - Xiaomi 12 Pro + supported_chipsets: + - Snapdragon® 8 Gen 1 + - Snapdragon® 8 Gen 2 + - Snapdragon® 8 Gen 3 + - Snapdragon® 888 +models: +- name: QuickSRNetLarge-Quantized + performance_metrics: + - torchscript_onnx_tflite: + inference_time: 1868.0 + throughput: 535.3319057815846 + estimated_peak_memory_range: + min: 12288 + max: 1533296 + primary_compute_unit: NPU + precision: int8 + layer_info: + layers_on_npu: 30 + layers_on_gpu: 0 + layers_on_cpu: 3 + total_layers: 33 + job_id: jygze66g8 + job_status: Passed + reference_device_info: + name: Samsung Galaxy S23 + os: '13' + form_factor: Phone + os_name: Android + manufacturer: Samsung + chipset: Snapdragon® 8 Gen 2 + timestamp: '2024-03-14T23:21:00.166706Z' + torchscript_onnx_qnn: + inference_time: 'null' + throughput: 'null' + estimated_peak_memory_range: + min: 0 + max: 0 + primary_compute_unit: 'null' + precision: 'null' + layer_info: + layers_on_npu: 0 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 0 + job_id: '' + job_status: Skipped + - torchscript_onnx_tflite: + inference_time: 1484.0 + throughput: 673.8544474393531 + estimated_peak_memory_range: + min: 20480 + max: 25007104 + primary_compute_unit: NPU + precision: int8 + layer_info: + layers_on_npu: 30 + layers_on_gpu: 0 + layers_on_cpu: 3 + total_layers: 33 + job_id: jz5wokjp1 + job_status: Passed + reference_device_info: + name: Samsung Galaxy S24 + os: '14' + form_factor: Phone + os_name: Android + manufacturer: Samsung + chipset: Snapdragon® 8 Gen 3 + timestamp: '2024-03-14T23:21:00.166728Z' + torchscript_onnx_qnn: + inference_time: 'null' + throughput: 'null' + estimated_peak_memory_range: + min: 0 + max: 0 + primary_compute_unit: 'null' + precision: 'null' + layer_info: + layers_on_npu: 0 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 0 + job_id: '' + job_status: Skipped diff --git a/qai_hub_models/models/quicksrnetlarge_quantized/test.py b/qai_hub_models/models/quicksrnetlarge_quantized/test.py new file mode 100644 index 00000000..32337b60 --- /dev/null +++ b/qai_hub_models/models/quicksrnetlarge_quantized/test.py @@ -0,0 +1,89 @@ +# --------------------------------------------------------------------- +# Copyright (c) 2024 Qualcomm 
Innovation Center, Inc. All rights reserved. +# SPDX-License-Identifier: BSD-3-Clause +# --------------------------------------------------------------------- +import os +import tempfile +import zipfile + +import numpy as np +import pytest +import torch + +from qai_hub_models.models._shared.super_resolution.app import SuperResolutionApp +from qai_hub_models.models.quicksrnetlarge_quantized.demo import IMAGE_ADDRESS +from qai_hub_models.models.quicksrnetlarge_quantized.demo import main as demo_main +from qai_hub_models.models.quicksrnetlarge_quantized.model import ( + MODEL_ASSET_VERSION, + MODEL_ID, + QuickSRNetLargeQuantizable, +) +from qai_hub_models.utils.asset_loaders import CachedWebModelAsset, load_image +from qai_hub_models.utils.testing import assert_most_close, skip_clone_repo_check + +OUTPUT_IMAGE_ADDRESS = CachedWebModelAsset.from_asset_store( + MODEL_ID, MODEL_ASSET_VERSION, "quicksrnetlarge_quantized_output.png" +) + + +@skip_clone_repo_check +def test_task(): + # AIMET Quantization Simulator introduces randomness. Eliminate that for this test. + torch.manual_seed(0) + image = load_image(IMAGE_ADDRESS) + model = QuickSRNetLargeQuantizable.from_pretrained() + app = SuperResolutionApp(model=model) + app_output_image = app.predict(image)[0] + + output_image = load_image(OUTPUT_IMAGE_ADDRESS) + assert_most_close( + np.asarray(app_output_image, dtype=np.float32) / 255, + np.asarray(output_image, dtype=np.float32) / 255, + diff_tol=0.005, + rtol=0.02, + atol=0.2, + ) + + +@pytest.mark.trace +@skip_clone_repo_check +def test_trace(): + image = load_image(IMAGE_ADDRESS) + output_image = load_image(OUTPUT_IMAGE_ADDRESS) + app = SuperResolutionApp( + QuickSRNetLargeQuantizable.from_pretrained().convert_to_torchscript() + ) + app_output_image = app.predict(image)[0] + + assert_most_close( + np.asarray(app_output_image, dtype=np.float32) / 255, + np.asarray(output_image, dtype=np.float32) / 255, + diff_tol=0.005, + rtol=0.02, + atol=0.2, + ) + + +@pytest.mark.skip("https://github.com/tetraai/tetracode/issues/9606") +@skip_clone_repo_check +def test_aimet_export(): + model = QuickSRNetLargeQuantizable.from_pretrained() + name = model.__class__.__name__ + with tempfile.TemporaryDirectory() as tmpdir: + output_zip = model.convert_to_onnx_and_aimet_encodings( + tmpdir, + ) + assert os.path.exists(output_zip) + with zipfile.ZipFile(output_zip, "r") as zip: + assert zip.namelist() == [ + f"{name}.aimet/", + f"{name}.aimet/{name}.onnx", + f"{name}.aimet/{name}.encodings", + ] + + # No test of torchscipt and aimet encodings due to #8954 + + +@skip_clone_repo_check +def test_demo(): + demo_main(is_test=True) diff --git a/qai_hub_models/models/quicksrnetmedium/README.md b/qai_hub_models/models/quicksrnetmedium/README.md index 9c489c7e..b4cf8f71 100644 --- a/qai_hub_models/models/quicksrnetmedium/README.md +++ b/qai_hub_models/models/quicksrnetmedium/README.md @@ -10,7 +10,7 @@ This is based on the implementation of QuickSRNetMedium found export suitable to run on Qualcomm® devices. More details on model performance accross various devices, can be found [here](https://aihub.qualcomm.com/models/quicksrnetmedium). -[Sign up](https://aihub.qualcomm.com/) for early access to run these models on +[Sign up](https://myaccount.qualcomm.com/signup) for early access to run these models on a hosted Qualcomm® device. @@ -25,7 +25,7 @@ python -m qai_hub_models.models.quicksrnetmedium.demo More details on the CLI tool can be found with the `--help` option. 
See [demo.py](demo.py) for sample usage of the model including pre/post processing scripts. Please refer to our [general instructions on using -models](../../#qai-hub-models) for more usage instructions. +models](../../../#getting-started) for more usage instructions. ## Export for on-device deployment @@ -41,7 +41,7 @@ script requires access to Deployment instructions for Qualcomm® AI Hub. ## License - The license for the original implementation of QuickSRNetMedium can be found [here](https://github.com/quic/aimet-model-zoo/blob/develop/LICENSE.pdf). -- The license for the compiled assets for on-device deployment can be found [here](https://qaihub-public-assets.s3.us-west-2.amazonaws.com/qai-hub-models/Qualcomm+AI+Hub+Proprietary+License.pdf). +- The license for the compiled assets for on-device deployment can be found [here]({deploy_license_url}) ## References * [QuickSRNet: Plain Single-Image Super-Resolution Architecture for Faster Inference on Mobile Platforms](https://arxiv.org/abs/2303.04336) diff --git a/qai_hub_models/models/quicksrnetmedium/conftest.py b/qai_hub_models/models/quicksrnetmedium/conftest.py new file mode 100644 index 00000000..9f8c04db --- /dev/null +++ b/qai_hub_models/models/quicksrnetmedium/conftest.py @@ -0,0 +1,26 @@ +# --------------------------------------------------------------------- +# Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. +# SPDX-License-Identifier: BSD-3-Clause +# --------------------------------------------------------------------- +# THIS FILE WAS AUTO-GENERATED. DO NOT EDIT MANUALLY. + +from unittest.mock import patch + +import pytest + +from qai_hub_models.models.quicksrnetmedium import Model +from qai_hub_models.utils.testing import skip_clone_repo_check + + +@pytest.fixture(autouse=True) +@skip_clone_repo_check +def mock_from_pretrained(): + """ + Model.from_pretrained() can be slow. Invoke it once and cache it so all invocations + across all tests return the cached instance of the model. + """ + mock = patch( + "qai_hub_models.models.quicksrnetmedium.Model.from_pretrained", + return_value=Model.from_pretrained(), + ) + mock.start() diff --git a/qai_hub_models/models/quicksrnetmedium/demo.py b/qai_hub_models/models/quicksrnetmedium/demo.py index 55e5d7a1..51c1ffec 100644 --- a/qai_hub_models/models/quicksrnetmedium/demo.py +++ b/qai_hub_models/models/quicksrnetmedium/demo.py @@ -20,6 +20,7 @@ def main(is_test: bool = False): super_resolution_demo( model_cls=QuickSRNetMedium, + model_id=MODEL_ID, default_image=IMAGE_ADDRESS, is_test=is_test, ) diff --git a/qai_hub_models/models/quicksrnetmedium/export.py b/qai_hub_models/models/quicksrnetmedium/export.py index 9c9cde02..ad1016dc 100644 --- a/qai_hub_models/models/quicksrnetmedium/export.py +++ b/qai_hub_models/models/quicksrnetmedium/export.py @@ -10,7 +10,7 @@ import os import warnings from pathlib import Path -from typing import List, Optional, Tuple +from typing import Any, Dict, List, Optional, Tuple, cast import qai_hub as hub import torch @@ -26,6 +26,7 @@ from qai_hub_models.utils.input_spec import make_torch_inputs from qai_hub_models.utils.printing import ( print_inference_metrics, + print_on_target_demo_cmd, print_profile_metrics_from_job, ) from qai_hub_models.utils.qai_hub_helpers import ( @@ -109,7 +110,7 @@ def export_model( ) # Trace the model - source_model = torch.jit.trace(model, make_torch_inputs(input_spec)) + source_model = torch.jit.trace(model.to("cpu"), make_torch_inputs(input_spec)) # 2. 
Compile the model to an on-device asset model_compile_options = model.get_hub_compile_options( @@ -118,29 +119,37 @@ def export_model( + " --force_channel_last_input image" + " --force_channel_last_output output_0", ) - print(f"Optimizing model {model_name} to run on-device.") - compile_job = hub.submit_compile_job( + print(f"Optimizing model {model_name} to run on-device") + submitted_compile_job = hub.submit_compile_job( model=source_model, input_specs=input_spec, device=hub.Device(device), name=model_name, options=model_compile_options, ) + compile_job = cast(hub.client.CompileJob, submitted_compile_job) # 3. Profile the model asset on real devices - profile_job = None + profile_job: Optional[hub.client.ProfileJob] = None if not skip_profiling: + profile_options_all = model.get_hub_profile_options( + target_runtime, profile_options + ) print(f"Profiling model {model_name} on a hosted device.") - profile_job = hub.submit_profile_job( + submitted_profile_job = hub.submit_profile_job( model=compile_job.get_target_model(), device=hub.Device(device), name=model_name, - options=profile_options, + options=profile_options_all, ) + profile_job = cast(hub.client.ProfileJob, submitted_profile_job) # 4. Run inference on-device with sample inputs - inference_job = None + inference_job: Optional[hub.client.InferenceJob] = None if not skip_inferencing: + profile_options_all = model.get_hub_profile_options( + target_runtime, profile_options + ) print( f"Running inference for {model_name} on a hosted device with example inputs." ) @@ -149,36 +158,40 @@ def export_model( hub_inputs = transpose_channel_first_to_last( "image", sample_inputs, target_runtime ) - inference_job = hub.submit_inference_job( + submitted_inference_job = hub.submit_inference_job( model=compile_job.get_target_model(), inputs=hub_inputs, device=hub.Device(device), name=model_name, - options=profile_options, + options=profile_options_all, ) + inference_job = cast(hub.client.InferenceJob, submitted_inference_job) # 5. Download the model asset to a local file if not skip_downloading: os.makedirs(output_path, exist_ok=True) - target_model = compile_job.get_target_model() + target_model: hub.Model = compile_job.get_target_model() # type: ignore target_model.download(str(output_path / f"{model_name}.tflite")) # 6. 
Summarize the results from profiling and inference if not skip_summary and not skip_profiling: - assert profile_job.wait().success - profile_data = profile_job.download_profile() + assert profile_job is not None and profile_job.wait().success + profile_data: Dict[str, Any] = profile_job.download_profile() # type: ignore print_profile_metrics_from_job(profile_job, profile_data) if not skip_summary and not skip_inferencing: torch_out = torch_inference(model, sample_inputs) - assert inference_job.wait().success - inference_result = inference_job.download_output_data() + assert inference_job is not None and inference_job.wait().success + inference_result: hub.client.DatasetEntries = inference_job.download_output_data() # type: ignore # Convert outputs from channel last to channel first inference_result = transpose_channel_last_to_first( "output_0", inference_result, target_runtime ) print_inference_metrics(inference_job, inference_result, torch_out) + if not skip_summary: + print_on_target_demo_cmd(compile_job, Path(__file__).parent.resolve(), device) + return (compile_job, profile_job, inference_job) diff --git a/qai_hub_models/models/quicksrnetmedium/info.yaml b/qai_hub_models/models/quicksrnetmedium/info.yaml index aaa2691e..72ae05be 100644 --- a/qai_hub_models/models/quicksrnetmedium/info.yaml +++ b/qai_hub_models/models/quicksrnetmedium/info.yaml @@ -12,6 +12,7 @@ research_paper: https://arxiv.org/abs/2303.04336 research_paper_title: 'QuickSRNet: Plain Single-Image Super-Resolution Architecture for Faster Inference on Mobile Platforms' license: https://github.com/quic/aimet-model-zoo/blob/develop/LICENSE.pdf +deploy_license: https://qaihub-public-assets.s3.us-west-2.amazonaws.com/qai-hub-models/Qualcomm+AI+Hub+Proprietary+License.pdf source_repo: https://github.com/quic/aimet-model-zoo/tree/develop/aimet_zoo_torch/quicksrnet technical_details: Model checkpoint: quicksrnet_medium_4x_checkpoint_float32 @@ -27,6 +28,7 @@ form_factors: - Tablet related_models: [xlsr, esrgan, quicksrnetlarge] has_static_banner: yes -has_animated_banner: yes +has_animated_banner: no license_type: other +deploy_license_type: AI Model Hub License dataset: [] diff --git a/qai_hub_models/models/quicksrnetmedium/model.py b/qai_hub_models/models/quicksrnetmedium/model.py index 24c4345c..e050d160 100644 --- a/qai_hub_models/models/quicksrnetmedium/model.py +++ b/qai_hub_models/models/quicksrnetmedium/model.py @@ -40,8 +40,6 @@ def __init__( @classmethod def from_pretrained(cls) -> QuickSRNetMedium: model = _load_quicksrnet_source_model( - MODEL_ID, - MODEL_ASSET_VERSION, SCALING_FACTOR, NUM_CHANNELS, NUM_INTERMEDIATE_LAYERS, diff --git a/qai_hub_models/models/quicksrnetmedium/perf.yaml b/qai_hub_models/models/quicksrnetmedium/perf.yaml index 5851a60e..f1316110 100644 --- a/qai_hub_models/models/quicksrnetmedium/perf.yaml +++ b/qai_hub_models/models/quicksrnetmedium/perf.yaml @@ -1,76 +1,92 @@ models: - name: QuickSRNetMedium performance_metrics: - - reference_device_info: - name: Samsung Galaxy S23 - os: '13' - form_factor: Phone - os_name: Android - manufacturer: Samsung - chipset: Snapdragon® 8 Gen 2 - timestamp: '2024-02-08T22:33:17.244157Z' - torchscript_onnx_tflite: - inference_time: 1407.0 - throughput: 710.7320540156361 + - torchscript_onnx_tflite: + inference_time: 1398.0 + throughput: 715.307582260372 estimated_peak_memory_range: - min: 32768 - max: 8364248 + min: 16384 + max: 8236496 + primary_compute_unit: NPU + precision: fp16 layer_info: layers_on_npu: 14 layers_on_gpu: 0 layers_on_cpu: 3 total_layers: 17 
- precision: fp16 - primary_compute_unit: NPU - job_id: jvgd2x1z5 + job_id: jwgoy9d58 job_status: Passed + reference_device_info: + name: Samsung Galaxy S23 + os: '13' + form_factor: Phone + os_name: Android + manufacturer: Samsung + chipset: Snapdragon® 8 Gen 2 + timestamp: '2024-03-14T23:12:36.328807Z' torchscript_onnx_qnn: - inference_time: 992.0 - throughput: 1008.0645161290323 + inference_time: 989.0 + throughput: 1011.1223458038422 estimated_peak_memory_range: - min: 217088 - max: 28908792 + min: 212992 + max: 7267624 + primary_compute_unit: NPU + precision: fp16 layer_info: layers_on_npu: 18 layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 18 - precision: fp16 - primary_compute_unit: NPU - job_id: j1gllveeg + job_id: j7gjx88pd job_status: Passed - torchscript_onnx_ort_qnn_htp: - inference_time: 17078.0 - throughput: 58.55486590935707 + - torchscript_onnx_tflite: + inference_time: 935.0 + throughput: 1069.51871657754 estimated_peak_memory_range: - min: 15241216 - max: 26970304 + min: 16384 + max: 19630352 + primary_compute_unit: NPU + precision: fp16 layer_info: - layers_on_npu: 0 + layers_on_npu: 14 layers_on_gpu: 0 - layers_on_cpu: 8 - total_layers: 8 - precision: fp32 - primary_compute_unit: CPU - job_id: j0pxxkv3p + layers_on_cpu: 3 + total_layers: 17 + job_id: j1pv3nm5x job_status: Passed - torchscript_qnn: - inference_time: 'null' - throughput: 'null' + reference_device_info: + name: Samsung Galaxy S24 + os: '14' + form_factor: Phone + os_name: Android + manufacturer: Samsung + chipset: Snapdragon® 8 Gen 3 + timestamp: '2024-03-14T23:15:44.251341Z' + torchscript_onnx_qnn: + inference_time: 648.0 + throughput: 1543.20987654321 estimated_peak_memory_range: - min: 0 - max: 0 + min: 208896 + max: 14213744 + primary_compute_unit: NPU + precision: fp16 layer_info: - layers_on_npu: 'null' - layers_on_gpu: 'null' - layers_on_cpu: 'null' - total_layers: 'null' - precision: 'null' - primary_compute_unit: 'null' - job_id: 'null' - job_status: 'null' + layers_on_npu: 18 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 18 + job_id: jlpe9n0gr + job_status: Passed aggregated: + supported_oses: + - Android supported_devices: + - Google Pixel 3 + - Google Pixel 3a + - Google Pixel 3a XL + - Google Pixel 4 + - Google Pixel 4a + - Google Pixel 5a 5G - Samsung Galaxy S21 - Samsung Galaxy S21 Ultra - Samsung Galaxy S21+ @@ -80,58 +96,13 @@ aggregated: - Samsung Galaxy S23 - Samsung Galaxy S23 Ultra - Samsung Galaxy S23+ + - Samsung Galaxy S24 + - Samsung Galaxy S24 Ultra - Samsung Galaxy Tab S8 + - Xiaomi 12 - Xiaomi 12 Pro - - Xiaomi 13 - - Xiaomi 13 Pro - supported_oses: - - Android supported_chipsets: - Snapdragon® 8 Gen 1 - Snapdragon® 8 Gen 2 + - Snapdragon® 8 Gen 3 - Snapdragon® 888 - performance_metrics: - - reference_device_info: - name: Samsung Galaxy S23 - os: '13' - form_factor: Phone - os_name: Android - manufacturer: Samsung - chipset: Snapdragon® 8 Gen 2 - timestamp: '2024-02-08T22:33:19.043922Z' - torchscript_onnx_tflite: - inference_time: 1407.0 - throughput: 710.7320540156361 - estimated_peak_memory_range: - min: 32768 - max: 8364248 - precision: fp16 - primary_compute_unit: NPU - job_status: Passed - torchscript_onnx_qnn: - inference_time: 992.0 - throughput: 1008.0645161290323 - estimated_peak_memory_range: - min: 217088 - max: 28908792 - precision: fp16 - primary_compute_unit: NPU - job_status: Passed - torchscript_onnx_ort_qnn_htp: - inference_time: 17078.0 - throughput: 58.55486590935707 - estimated_peak_memory_range: - min: 15241216 - max: 26970304 - precision: fp32 - 
primary_compute_unit: CPU - job_status: Passed - torchscript_qnn: - inference_time: 'null' - throughput: 'null' - estimated_peak_memory_range: - min: 0.0 - max: 0.0 - precision: 'null' - primary_compute_unit: 'null' - job_status: 'null' diff --git a/qai_hub_models/models/quicksrnetmedium/test.py b/qai_hub_models/models/quicksrnetmedium/test.py index aca388ad..9cd04d8e 100644 --- a/qai_hub_models/models/quicksrnetmedium/test.py +++ b/qai_hub_models/models/quicksrnetmedium/test.py @@ -35,5 +35,6 @@ def test_task(): ) +@skip_clone_repo_check def test_demo(): demo_main(is_test=True) diff --git a/qai_hub_models/models/quicksrnetmedium_quantized/README.md b/qai_hub_models/models/quicksrnetmedium_quantized/README.md new file mode 100644 index 00000000..87b906ad --- /dev/null +++ b/qai_hub_models/models/quicksrnetmedium_quantized/README.md @@ -0,0 +1,54 @@ +[![Qualcomm® AI Hub Models](https://qaihub-public-assets.s3.us-west-2.amazonaws.com/qai-hub-models/quic-logo.jpg)](../../README.md) + + +# [QuickSRNetMedium-Quantized: Upscale images and remove image noise](https://aihub.qualcomm.com/models/quicksrnetmedium_quantized) + +QuickSRNet Medium is designed for upscaling images on mobile platforms to sharpen in real-time. + +This is based on the implementation of QuickSRNetMedium-Quantized found +[here](https://github.com/quic/aimet-model-zoo/tree/develop/aimet_zoo_torch/quicksrnet). This repository contains scripts for optimized on-device +export suitable to run on Qualcomm® devices. More details on model performance +across various devices can be found [here](https://aihub.qualcomm.com/models/quicksrnetmedium_quantized). + +[Sign up](https://myaccount.qualcomm.com/signup) for early access to run these models on +a hosted Qualcomm® device. + + +## Example & Usage + + +Once installed, run the following simple CLI demo: + +```bash +python -m qai_hub_models.models.quicksrnetmedium_quantized.demo +``` +More details on the CLI tool can be found with the `--help` option. See +[demo.py](demo.py) for sample usage of the model including pre/post processing +scripts. Please refer to our [general instructions on using +models](../../../#getting-started) for more usage instructions. + +## Export for on-device deployment + +This repository contains export scripts that produce a model optimized for +on-device deployment. This can be run as follows: + +```bash +python -m qai_hub_models.models.quicksrnetmedium_quantized.export +``` +Additional options are documented with the `--help` option. Note that the above +script requires access to Deployment instructions for Qualcomm® AI Hub. + +## License +- The license for the original implementation of QuickSRNetMedium-Quantized can be found + [here](https://github.com/quic/aimet-model-zoo/blob/develop/LICENSE.pdf). +- The license for the compiled assets for on-device deployment can be found [here]({deploy_license_url}) + +## References +* [QuickSRNet: Plain Single-Image Super-Resolution Architecture for Faster Inference on Mobile Platforms](https://arxiv.org/abs/2303.04336) +* [Source Model Implementation](https://github.com/quic/aimet-model-zoo/tree/develop/aimet_zoo_torch/quicksrnet) + +## Community +* Join [our AI Hub Slack community](https://join.slack.com/t/qualcomm-ai-hub/shared_invite/zt-2dgf95loi-CXHTDRR1rvPgQWPO~ZZZJg) to collaborate, post questions and learn more about on-device AI. +* For questions or feedback please [reach out to us](mailto:ai-hub-support@qti.qualcomm.com).
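Editor's note: the CLI command above wraps the `export_model` function defined in the `export.py` added later in this patch. The snippet below is a minimal sketch of calling that entry point programmatically; it uses only names that appear verbatim in this patch (`export_model`, `TargetRuntime`), while the device name, keyword choices, and output directory are illustrative examples, not part of the patch.

```python
# Illustrative sketch: call the generated export entry point directly instead of
# `python -m qai_hub_models.models.quicksrnetmedium_quantized.export`.
# Requires access to Qualcomm AI Hub; device and output_dir values are examples only.
from qai_hub_models.models.quicksrnetmedium_quantized.export import export_model
from qai_hub_models.utils.base_model import TargetRuntime

jobs = export_model(
    device="Samsung Galaxy S23",          # any device listed by hub.get_devices()
    target_runtime=TargetRuntime.TFLITE,  # default runtime in this patch
    skip_profiling=True,                  # compile, inference, and download only
    output_dir="build/quicksrnetmedium_quantized",
)
```

With TFLite as the target runtime, the compiled `.tflite` asset is written to the output directory unless `skip_downloading` is set, mirroring step 5 of `export_model` shown later in this diff.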
+ + diff --git a/qai_hub_models/models/quicksrnetmedium_quantized/__init__.py b/qai_hub_models/models/quicksrnetmedium_quantized/__init__.py new file mode 100644 index 00000000..acef8c96 --- /dev/null +++ b/qai_hub_models/models/quicksrnetmedium_quantized/__init__.py @@ -0,0 +1,10 @@ +# --------------------------------------------------------------------- +# Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. +# SPDX-License-Identifier: BSD-3-Clause +# --------------------------------------------------------------------- +from qai_hub_models.models._shared.super_resolution.app import ( # noqa: F401 + SuperResolutionApp as App, +) + +from .model import MODEL_ID # noqa: F401 +from .model import QuickSRNetMediumQuantizable as Model # noqa: F401 diff --git a/qai_hub_models/models/quicksrnetmedium_quantized/conftest.py b/qai_hub_models/models/quicksrnetmedium_quantized/conftest.py new file mode 100644 index 00000000..8ede0a24 --- /dev/null +++ b/qai_hub_models/models/quicksrnetmedium_quantized/conftest.py @@ -0,0 +1,26 @@ +# --------------------------------------------------------------------- +# Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. +# SPDX-License-Identifier: BSD-3-Clause +# --------------------------------------------------------------------- +# THIS FILE WAS AUTO-GENERATED. DO NOT EDIT MANUALLY. + +from unittest.mock import patch + +import pytest + +from qai_hub_models.models.quicksrnetmedium_quantized import Model +from qai_hub_models.utils.testing import skip_clone_repo_check + + +@pytest.fixture(autouse=True) +@skip_clone_repo_check +def mock_from_pretrained(): + """ + Model.from_pretrained() can be slow. Invoke it once and cache it so all invocations + across all tests return the cached instance of the model. + """ + mock = patch( + "qai_hub_models.models.quicksrnetmedium_quantized.Model.from_pretrained", + return_value=Model.from_pretrained(), + ) + mock.start() diff --git a/qai_hub_models/models/quicksrnetmedium_quantized/demo.py b/qai_hub_models/models/quicksrnetmedium_quantized/demo.py new file mode 100644 index 00000000..f45370ab --- /dev/null +++ b/qai_hub_models/models/quicksrnetmedium_quantized/demo.py @@ -0,0 +1,28 @@ +# --------------------------------------------------------------------- +# Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. +# SPDX-License-Identifier: BSD-3-Clause +# --------------------------------------------------------------------- +from qai_hub_models.models._shared.super_resolution.demo import super_resolution_demo +from qai_hub_models.models.quicksrnetmedium_quantized.model import ( + MODEL_ASSET_VERSION, + MODEL_ID, + QuickSRNetMediumQuantizable, +) +from qai_hub_models.utils.asset_loaders import CachedWebModelAsset + +IMAGE_ADDRESS = CachedWebModelAsset.from_asset_store( + MODEL_ID, MODEL_ASSET_VERSION, "quicksrnetmedium_demo.jpg" +) + + +def main(is_test: bool = False): + super_resolution_demo( + QuickSRNetMediumQuantizable, + MODEL_ID, + default_image=IMAGE_ADDRESS, + is_test=is_test, + ) + + +if __name__ == "__main__": + main() diff --git a/qai_hub_models/models/quicksrnetmedium_quantized/export.py b/qai_hub_models/models/quicksrnetmedium_quantized/export.py new file mode 100644 index 00000000..da516367 --- /dev/null +++ b/qai_hub_models/models/quicksrnetmedium_quantized/export.py @@ -0,0 +1,215 @@ +# --------------------------------------------------------------------- +# Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. 
+# SPDX-License-Identifier: BSD-3-Clause +# --------------------------------------------------------------------- +# THIS FILE WAS AUTO-GENERATED. DO NOT EDIT MANUALLY. + + +from __future__ import annotations + +import os +import warnings +from pathlib import Path +from typing import Any, Dict, List, Optional, Tuple, cast + +import qai_hub as hub + +from qai_hub_models.models.quicksrnetmedium_quantized import Model +from qai_hub_models.utils.args import ( + export_parser, + get_input_spec_kwargs, + get_model_kwargs, +) +from qai_hub_models.utils.base_model import TargetRuntime +from qai_hub_models.utils.compare import torch_inference +from qai_hub_models.utils.printing import ( + print_inference_metrics, + print_on_target_demo_cmd, + print_profile_metrics_from_job, +) +from qai_hub_models.utils.qai_hub_helpers import ( + can_access_qualcomm_ai_hub, + export_without_hub_access, + transpose_channel_first_to_last, + transpose_channel_last_to_first, +) +from qai_hub_models.utils.qnn_helpers import get_qnn_inputs + + +def export_model( + device: str = "Samsung Galaxy S23", + skip_profiling: bool = False, + skip_inferencing: bool = False, + skip_downloading: bool = False, + skip_summary: bool = False, + output_dir: Optional[str] = None, + target_runtime: TargetRuntime = TargetRuntime.TFLITE, + compile_options: str = "", + profile_options: str = "", + **additional_model_kwargs, +) -> Tuple[hub.CompileJob, Optional[hub.ProfileJob], Optional[hub.InferenceJob]] | List[ + str +]: + """ + This function accomplishes 6 main tasks: + + 1. Instantiates a PyTorch model and converts it to a traced TorchScript format. + 2. Compiles the model to an asset that can be run on device. + 3. Profiles the model performance on real devices. + 4. Inferences the model on sample inputs. + 5. Downloads the model asset to the local directory. + 6. Summarizes the results from profiling and inference. + + Each of the last four steps can be optionally skipped using the input options. + + Parameters: + device: Device for which to export the model. + Full list of available devices can be found by running `hub.get_devices()`. + Defaults to DEFAULT_DEVICE if not specified. + skip_profiling: If set, skips profiling of compiled model on real devices. + skip_inferencing: If set, skips computing on-device outputs from sample data. + skip_downloading: If set, skips downloading of compiled model. + skip_summary: If set, skips waiting for and summarizing results + from profiling and inference. + output_dir: Directory to store generated assets (e.g. compiled model). + Defaults to `/build/`. + target_runtime: Which on-device runtime to target. Default is TFLite. + compile_options: Additional options to pass when submitting the compile job. + profile_options: Additional options to pass when submitting the profile job. + **additional_model_kwargs: Additional optional kwargs used to customize + `model_cls.from_pretrained` and `model.get_input_spec` + + Returns: + A 3-tuple of: + * A CompileJob object containing metadata about the compile job submitted to hub. + * A ProfileJob containing metadata about the profile job (None if profiling skipped). + * An InferenceJob containing metadata about the inference job (None if inferencing skipped). 
+ """ + model_name = "quicksrnetmedium_quantized" + output_path = Path(output_dir or Path.cwd() / "build" / model_name) + if not can_access_qualcomm_ai_hub(): + return export_without_hub_access( + "quicksrnetmedium_quantized", + "QuickSRNetMedium-Quantized", + device, + skip_profiling, + skip_inferencing, + skip_downloading, + skip_summary, + output_path, + target_runtime, + compile_options, + profile_options, + ) + + # 1. Initialize PyTorch model + model = Model.from_pretrained(**get_model_kwargs(Model, additional_model_kwargs)) + input_spec = model.get_input_spec( + **get_input_spec_kwargs(model, additional_model_kwargs) + ) + + # Trace the model + source_model = model.convert_to_hub_source_model( + target_runtime, output_path, input_spec + ) + if target_runtime == TargetRuntime.TFLITE: + quant_calibration_data = None + else: + quant_calibration_data = model.get_calibration_data(target_runtime, input_spec) + + # 2. Compile the model to an on-device asset + model_compile_options = model.get_hub_compile_options( + target_runtime, + compile_options + + " --force_channel_last_input image" + + " --force_channel_last_output output_0", + ) + print(f"Optimizing model {model_name} to run on-device") + submitted_compile_job = hub.submit_compile_job( + model=source_model, + input_specs=input_spec, + device=hub.Device(device), + name=model_name, + calibration_data=quant_calibration_data, + options=model_compile_options, + ) + compile_job = cast(hub.client.CompileJob, submitted_compile_job) + + # 3. Profile the model asset on real devices + profile_job: Optional[hub.client.ProfileJob] = None + if not skip_profiling: + profile_options_all = model.get_hub_profile_options( + target_runtime, profile_options + ) + print(f"Profiling model {model_name} on a hosted device.") + submitted_profile_job = hub.submit_profile_job( + model=compile_job.get_target_model(), + device=hub.Device(device), + name=model_name, + options=profile_options_all, + ) + profile_job = cast(hub.client.ProfileJob, submitted_profile_job) + + # 4. Run inference on-device with sample inputs + inference_job: Optional[hub.client.InferenceJob] = None + if not skip_inferencing: + profile_options_all = model.get_hub_profile_options( + target_runtime, profile_options + ) + print( + f"Running inference for {model_name} on a hosted device with example inputs." + ) + sample_inputs = model.sample_inputs(input_spec) + hub_inputs = sample_inputs + if target_runtime == TargetRuntime.QNN: + hub_inputs = get_qnn_inputs(compile_job, sample_inputs) + # Convert inputs from channel first to channel last + hub_inputs = transpose_channel_first_to_last( + "image", sample_inputs, target_runtime + ) + submitted_inference_job = hub.submit_inference_job( + model=compile_job.get_target_model(), + inputs=hub_inputs, + device=hub.Device(device), + name=model_name, + options=profile_options_all, + ) + inference_job = cast(hub.client.InferenceJob, submitted_inference_job) + + # 5. Download the model asset to a local file + if not skip_downloading: + os.makedirs(output_path, exist_ok=True) + target_model: hub.Model = compile_job.get_target_model() # type: ignore + target_model.download(str(output_path / f"{model_name}.tflite")) + + # 6. 
Summarize the results from profiling and inference + if not skip_summary and not skip_profiling: + assert profile_job is not None and profile_job.wait().success + profile_data: Dict[str, Any] = profile_job.download_profile() # type: ignore + print_profile_metrics_from_job(profile_job, profile_data) + + if not skip_summary and not skip_inferencing: + torch_out = torch_inference(model, sample_inputs) + assert inference_job is not None and inference_job.wait().success + inference_result: hub.client.DatasetEntries = inference_job.download_output_data() # type: ignore + # Convert outputs from channel last to channel first + inference_result = transpose_channel_last_to_first( + "output_0", inference_result, target_runtime + ) + print_inference_metrics(inference_job, inference_result, torch_out) + + if not skip_summary: + print_on_target_demo_cmd(compile_job, Path(__file__).parent.resolve(), device) + + return (compile_job, profile_job, inference_job) + + +def main(): + warnings.filterwarnings("ignore") + parser = export_parser(model_cls=Model, supports_qnn=False) + args = parser.parse_args() + export_model(**vars(args)) + + +if __name__ == "__main__": + main() diff --git a/qai_hub_models/models/quicksrnetmedium_quantized/info.yaml b/qai_hub_models/models/quicksrnetmedium_quantized/info.yaml new file mode 100644 index 00000000..070615b3 --- /dev/null +++ b/qai_hub_models/models/quicksrnetmedium_quantized/info.yaml @@ -0,0 +1,35 @@ +name: QuickSRNetMedium-Quantized +# id must match with the model dir name in qai_hub_models +id: quicksrnetmedium_quantized +status: public +headline: Upscale images and remove image noise. +domain: Computer Vision +description: QuickSRNet Medium is designed for upscaling images on mobile platforms + to sharpen in real-time. +use_case: Super Resolution +tags: + - quantized +research_paper: https://arxiv.org/abs/2303.04336 +research_paper_title: 'QuickSRNet: Plain Single-Image Super-Resolution Architecture + for Faster Inference on Mobile Platforms' +license: https://github.com/quic/aimet-model-zoo/blob/develop/LICENSE.pdf +deploy_license: https://qaihub-public-assets.s3.us-west-2.amazonaws.com/qai-hub-models/Qualcomm+AI+Hub+Proprietary+License.pdf +source_repo: https://github.com/quic/aimet-model-zoo/tree/develop/aimet_zoo_torch/quicksrnet +technical_details: + Model checkpoint: quicksrnet_medium_4x_checkpoint_int8 + Input resolution: 128x128 + Number of parameters: 61.0K + Model size: 244 KB +applicable_scenarios: + - Virtual Real Estate Tours + - Gaming + - ARVR +form_factors: + - Phone + - Tablet +related_models: [xlsr_quantized, esrgan, quicksrnetmedium] +has_static_banner: yes +has_animated_banner: yes +license_type: other +deploy_license_type: AI Model Hub License +dataset: [] diff --git a/qai_hub_models/models/quicksrnetmedium_quantized/model.py b/qai_hub_models/models/quicksrnetmedium_quantized/model.py new file mode 100644 index 00000000..3b6cb7b6 --- /dev/null +++ b/qai_hub_models/models/quicksrnetmedium_quantized/model.py @@ -0,0 +1,98 @@ +# --------------------------------------------------------------------- +# Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. +# SPDX-License-Identifier: BSD-3-Clause +# --------------------------------------------------------------------- +from __future__ import annotations + +# isort: off +# This verifies aimet is installed, and this must be included first. 
+from qai_hub_models.utils.quantization_aimet import ( + AIMETQuantizableMixin, +) + +# isort: on + +import torch +from aimet_torch.cross_layer_equalization import equalize_model +from aimet_torch.quantsim import QuantizationSimModel, load_encodings_to_sim + +from qai_hub_models.models.quicksrnetmedium.model import QuickSRNetMedium +from qai_hub_models.utils.aimet.config_loader import get_default_aimet_config_legacy_v2 +from qai_hub_models.utils.asset_loaders import CachedWebModelAsset + +MODEL_ID = __name__.split(".")[-2] +MODEL_ASSET_VERSION = 2 + +# Weights and config stored in S3 are sourced from +# https://github.com/quic/aimet-model-zoo/blob/develop/aimet_zoo_torch/quicksrnet/model/model_cards/quicksrnet_medium_4x_w8a8.json: +# https://github.com/quic/aimet-model-zoo/releases/download/phase_2_january_artifacts/quicksrnet_medium_4x_checkpoint_int8.pth +# and +# https://raw.githubusercontent.com/quic/aimet/release-aimet-1.23/TrainingExtensions/common/src/python/aimet_common/quantsim_config/default_config_per_channel.js +# Encodings were generated with AIMET QuantSim library +QUANTIZED_WEIGHTS = "quicksrnet_medium_4x_checkpoint_int8.pth" +AIMET_ENCODINGS = "aimet_quantization_encodings.json" +SCALING_FACTOR = 4 + + +class QuickSRNetMediumQuantizable(AIMETQuantizableMixin, QuickSRNetMedium): + """QuickSRNetMedium with post train quantization support. + Supports only 8 bit weights and activations, and only loads pre-quantized checkpoints. + Support for quantizing using your own weights & data will come at a later date.""" + + def __init__( + self, + quicksrnet_model: QuantizationSimModel, + ) -> None: + QuickSRNetMedium.__init__(self, quicksrnet_model.model) + AIMETQuantizableMixin.__init__( + self, quicksrnet_model, needs_onnx_direct_aimet_export=True + ) + + @classmethod + def from_pretrained( + cls, + aimet_encodings: str | None = "DEFAULT", + ) -> "QuickSRNetMediumQuantizable": + """ + Parameters: + aimet_encodings: + if "DEFAULT": Loads the model with aimet encodings calibrated on BSD300. + elif None: Doesn't load any encodings. Used when computing encodings. + else: Interprets as a filepath and loads the encodings stored there. + """ + # Load Model + quicksrnet = QuickSRNetMedium.from_pretrained() + input_shape = quicksrnet.get_input_spec()["image"][0] + equalize_model(quicksrnet, input_shape) + + # Download weights and quantization parameters + weights = CachedWebModelAsset.from_asset_store( + MODEL_ID, MODEL_ASSET_VERSION, QUANTIZED_WEIGHTS + ).fetch() + aimet_config = get_default_aimet_config_legacy_v2() + + # Load the model weights and quantization parameters + # In this particular instance, the state_dict keys from the model are all named "model." + # where is the name of each key in the weights file - without the word model. + # We rename all the keys to add the word model + state_dict = torch.load(weights, map_location=torch.device("cpu"))["state_dict"] + new_state_dict = {"model." 
+ key: value for key, value in state_dict.items()} + quicksrnet.load_state_dict(new_state_dict) + sim = QuantizationSimModel( + quicksrnet, + quant_scheme="tf_enhanced", + default_param_bw=8, + default_output_bw=8, + config_file=aimet_config, + dummy_input=torch.rand(input_shape), + ) + if aimet_encodings: + if aimet_encodings == "DEFAULT": + aimet_encodings = CachedWebModelAsset.from_asset_store( + MODEL_ID, MODEL_ASSET_VERSION, AIMET_ENCODINGS + ).fetch() + load_encodings_to_sim(sim, aimet_encodings) + + sim.model.eval() + + return cls(sim) diff --git a/qai_hub_models/models/quicksrnetmedium_quantized/perf.yaml b/qai_hub_models/models/quicksrnetmedium_quantized/perf.yaml new file mode 100644 index 00000000..2843af27 --- /dev/null +++ b/qai_hub_models/models/quicksrnetmedium_quantized/perf.yaml @@ -0,0 +1,108 @@ +models: +- name: QuickSRNetMedium-Quantized + performance_metrics: + - torchscript_onnx_tflite: + inference_time: 1411.0 + throughput: 708.7172218284904 + estimated_peak_memory_range: + min: 28672 + max: 1545320 + primary_compute_unit: NPU + precision: int8 + layer_info: + layers_on_npu: 16 + layers_on_gpu: 0 + layers_on_cpu: 3 + total_layers: 19 + job_id: joprkj950 + job_status: Passed + reference_device_info: + name: Samsung Galaxy S23 + os: '13' + form_factor: Phone + os_name: Android + manufacturer: Samsung + chipset: Snapdragon® 8 Gen 2 + timestamp: '2024-03-14T23:24:06.170051Z' + torchscript_onnx_qnn: + inference_time: 'null' + throughput: 'null' + estimated_peak_memory_range: + min: 0 + max: 0 + primary_compute_unit: 'null' + precision: 'null' + layer_info: + layers_on_npu: 0 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 0 + job_id: '' + job_status: Skipped + - torchscript_onnx_tflite: + inference_time: 1149.0 + throughput: 870.3220191470845 + estimated_peak_memory_range: + min: 20480 + max: 20002352 + primary_compute_unit: NPU + precision: int8 + layer_info: + layers_on_npu: 16 + layers_on_gpu: 0 + layers_on_cpu: 3 + total_layers: 19 + job_id: jep28n4p6 + job_status: Passed + reference_device_info: + name: Samsung Galaxy S24 + os: '14' + form_factor: Phone + os_name: Android + manufacturer: Samsung + chipset: Snapdragon® 8 Gen 3 + timestamp: '2024-03-14T23:24:06.170059Z' + torchscript_onnx_qnn: + inference_time: 'null' + throughput: 'null' + estimated_peak_memory_range: + min: 0 + max: 0 + primary_compute_unit: 'null' + precision: 'null' + layer_info: + layers_on_npu: 0 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 0 + job_id: '' + job_status: Skipped +aggregated: + supported_oses: + - Android + supported_devices: + - Google Pixel 3 + - Google Pixel 3a + - Google Pixel 3a XL + - Google Pixel 4 + - Google Pixel 4a + - Google Pixel 5a 5G + - Samsung Galaxy S21 + - Samsung Galaxy S21 Ultra + - Samsung Galaxy S21+ + - Samsung Galaxy S22 5G + - Samsung Galaxy S22 Ultra 5G + - Samsung Galaxy S22+ 5G + - Samsung Galaxy S23 + - Samsung Galaxy S23 Ultra + - Samsung Galaxy S23+ + - Samsung Galaxy S24 + - Samsung Galaxy S24 Ultra + - Samsung Galaxy Tab S8 + - Xiaomi 12 + - Xiaomi 12 Pro + supported_chipsets: + - Snapdragon® 8 Gen 1 + - Snapdragon® 8 Gen 2 + - Snapdragon® 8 Gen 3 + - Snapdragon® 888 diff --git a/qai_hub_models/models/quicksrnetmedium_quantized/test.py b/qai_hub_models/models/quicksrnetmedium_quantized/test.py new file mode 100644 index 00000000..4da76b9d --- /dev/null +++ b/qai_hub_models/models/quicksrnetmedium_quantized/test.py @@ -0,0 +1,91 @@ +# --------------------------------------------------------------------- +# Copyright (c) 2024 Qualcomm 
Innovation Center, Inc. All rights reserved. +# SPDX-License-Identifier: BSD-3-Clause +# --------------------------------------------------------------------- +import os +import tempfile +import zipfile + +import numpy as np +import pytest +import torch + +from qai_hub_models.models._shared.super_resolution.app import SuperResolutionApp +from qai_hub_models.models.quicksrnetmedium_quantized.demo import IMAGE_ADDRESS +from qai_hub_models.models.quicksrnetmedium_quantized.demo import main as demo_main +from qai_hub_models.models.quicksrnetmedium_quantized.model import ( + MODEL_ASSET_VERSION, + MODEL_ID, + QuickSRNetMediumQuantizable, +) +from qai_hub_models.utils.asset_loaders import CachedWebModelAsset, load_image +from qai_hub_models.utils.testing import assert_most_close, skip_clone_repo_check + +OUTPUT_IMAGE_ADDRESS = CachedWebModelAsset.from_asset_store( + MODEL_ID, MODEL_ASSET_VERSION, "quicksrnetmedium_quantized_output.png" +) + + +@skip_clone_repo_check +def test_task(): + # AIMET Quantization Simulator introduces randomness. Eliminate that for this test. + torch.manual_seed(0) + image = load_image(IMAGE_ADDRESS) + model = QuickSRNetMediumQuantizable.from_pretrained() + app = SuperResolutionApp(model=model) + app_output_image = app.predict(image)[0] + + output_image = load_image(OUTPUT_IMAGE_ADDRESS) + assert_most_close( + np.asarray(app_output_image, dtype=np.float32) / 255, + np.asarray(output_image, dtype=np.float32) / 255, + diff_tol=0.005, + rtol=0.02, + atol=0.2, + ) + + +@pytest.mark.trace +@skip_clone_repo_check +def test_trace(): + image = load_image(IMAGE_ADDRESS) + output_image = load_image(OUTPUT_IMAGE_ADDRESS) + app = SuperResolutionApp( + QuickSRNetMediumQuantizable.from_pretrained().convert_to_torchscript() + ) + app_output_image = app.predict(image)[0] + + assert_most_close( + np.asarray(app_output_image, dtype=np.float32) / 255, + np.asarray(output_image, dtype=np.float32) / 255, + diff_tol=0.005, + rtol=0.02, + atol=0.2, + ) + + +@pytest.mark.skip("https://github.com/tetraai/tetracode/issues/9606") +@skip_clone_repo_check +def test_aimet_export(): + model = QuickSRNetMediumQuantizable.from_pretrained() + name = model.__class__.__name__ + with tempfile.TemporaryDirectory() as tmpdir: + output_zip = model.convert_to_onnx_and_aimet_encodings( + tmpdir, + ) + assert os.path.exists(output_zip) + with zipfile.ZipFile(output_zip, "r") as zip: + assert zip.namelist() == [ + f"{name}.aimet/", + f"{name}.aimet/{name}.onnx", + f"{name}.aimet/{name}.encodings", + ] + + +@skip_clone_repo_check +def test_demo(): + demo_main(is_test=True) + + +if __name__ == "__main__": + test_task() diff --git a/qai_hub_models/models/quicksrnetsmall/README.md b/qai_hub_models/models/quicksrnetsmall/README.md index 9b4c48ac..1c70ac12 100644 --- a/qai_hub_models/models/quicksrnetsmall/README.md +++ b/qai_hub_models/models/quicksrnetsmall/README.md @@ -10,7 +10,7 @@ This is based on the implementation of QuickSRNetSmall found export suitable to run on Qualcomm® devices. More details on model performance accross various devices, can be found [here](https://aihub.qualcomm.com/models/quicksrnetsmall). -[Sign up](https://aihub.qualcomm.com/) for early access to run these models on +[Sign up](https://myaccount.qualcomm.com/signup) for early access to run these models on a hosted Qualcomm® device. @@ -25,7 +25,7 @@ python -m qai_hub_models.models.quicksrnetsmall.demo More details on the CLI tool can be found with the `--help` option. 
See [demo.py](demo.py) for sample usage of the model including pre/post processing scripts. Please refer to our [general instructions on using -models](../../#qai-hub-models) for more usage instructions. +models](../../../#getting-started) for more usage instructions. ## Export for on-device deployment @@ -41,7 +41,7 @@ script requires access to Deployment instructions for Qualcomm® AI Hub. ## License - The license for the original implementation of QuickSRNetSmall can be found [here](https://github.com/quic/aimet-model-zoo/blob/develop/LICENSE.pdf). -- The license for the compiled assets for on-device deployment can be found [here](https://qaihub-public-assets.s3.us-west-2.amazonaws.com/qai-hub-models/Qualcomm+AI+Hub+Proprietary+License.pdf). +- The license for the compiled assets for on-device deployment can be found [here]({deploy_license_url}) ## References * [QuickSRNet: Plain Single-Image Super-Resolution Architecture for Faster Inference on Mobile Platforms](https://arxiv.org/abs/2303.04336) diff --git a/qai_hub_models/models/quicksrnetsmall/conftest.py b/qai_hub_models/models/quicksrnetsmall/conftest.py new file mode 100644 index 00000000..3d406665 --- /dev/null +++ b/qai_hub_models/models/quicksrnetsmall/conftest.py @@ -0,0 +1,26 @@ +# --------------------------------------------------------------------- +# Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. +# SPDX-License-Identifier: BSD-3-Clause +# --------------------------------------------------------------------- +# THIS FILE WAS AUTO-GENERATED. DO NOT EDIT MANUALLY. + +from unittest.mock import patch + +import pytest + +from qai_hub_models.models.quicksrnetsmall import Model +from qai_hub_models.utils.testing import skip_clone_repo_check + + +@pytest.fixture(autouse=True) +@skip_clone_repo_check +def mock_from_pretrained(): + """ + Model.from_pretrained() can be slow. Invoke it once and cache it so all invocations + across all tests return the cached instance of the model. + """ + mock = patch( + "qai_hub_models.models.quicksrnetsmall.Model.from_pretrained", + return_value=Model.from_pretrained(), + ) + mock.start() diff --git a/qai_hub_models/models/quicksrnetsmall/demo.py b/qai_hub_models/models/quicksrnetsmall/demo.py index 880f23bd..dd0e6c43 100644 --- a/qai_hub_models/models/quicksrnetsmall/demo.py +++ b/qai_hub_models/models/quicksrnetsmall/demo.py @@ -20,6 +20,7 @@ def main(is_test: bool = False): super_resolution_demo( model_cls=QuickSRNetSmall, + model_id=MODEL_ID, default_image=IMAGE_ADDRESS, is_test=is_test, ) diff --git a/qai_hub_models/models/quicksrnetsmall/export.py b/qai_hub_models/models/quicksrnetsmall/export.py index 35428879..f4ecbcca 100644 --- a/qai_hub_models/models/quicksrnetsmall/export.py +++ b/qai_hub_models/models/quicksrnetsmall/export.py @@ -10,7 +10,7 @@ import os import warnings from pathlib import Path -from typing import List, Optional, Tuple +from typing import Any, Dict, List, Optional, Tuple, cast import qai_hub as hub import torch @@ -110,7 +110,7 @@ def export_model( ) # Trace the model - source_model = torch.jit.trace(model, make_torch_inputs(input_spec)) + source_model = torch.jit.trace(model.to("cpu"), make_torch_inputs(input_spec)) # 2. 
Compile the model to an on-device asset model_compile_options = model.get_hub_compile_options( @@ -119,29 +119,37 @@ def export_model( + " --force_channel_last_input image" + " --force_channel_last_output output_0", ) - print(f"Optimizing model {model_name} to run on-device.") - compile_job = hub.submit_compile_job( + print(f"Optimizing model {model_name} to run on-device") + submitted_compile_job = hub.submit_compile_job( model=source_model, input_specs=input_spec, device=hub.Device(device), name=model_name, options=model_compile_options, ) + compile_job = cast(hub.client.CompileJob, submitted_compile_job) # 3. Profile the model asset on real devices - profile_job = None + profile_job: Optional[hub.client.ProfileJob] = None if not skip_profiling: + profile_options_all = model.get_hub_profile_options( + target_runtime, profile_options + ) print(f"Profiling model {model_name} on a hosted device.") - profile_job = hub.submit_profile_job( + submitted_profile_job = hub.submit_profile_job( model=compile_job.get_target_model(), device=hub.Device(device), name=model_name, - options=profile_options, + options=profile_options_all, ) + profile_job = cast(hub.client.ProfileJob, submitted_profile_job) # 4. Run inference on-device with sample inputs - inference_job = None + inference_job: Optional[hub.client.InferenceJob] = None if not skip_inferencing: + profile_options_all = model.get_hub_profile_options( + target_runtime, profile_options + ) print( f"Running inference for {model_name} on a hosted device with example inputs." ) @@ -150,37 +158,39 @@ def export_model( hub_inputs = transpose_channel_first_to_last( "image", sample_inputs, target_runtime ) - inference_job = hub.submit_inference_job( + submitted_inference_job = hub.submit_inference_job( model=compile_job.get_target_model(), inputs=hub_inputs, device=hub.Device(device), name=model_name, - options=profile_options, + options=profile_options_all, ) + inference_job = cast(hub.client.InferenceJob, submitted_inference_job) # 5. Download the model asset to a local file if not skip_downloading: os.makedirs(output_path, exist_ok=True) - target_model = compile_job.get_target_model() + target_model: hub.Model = compile_job.get_target_model() # type: ignore target_model.download(str(output_path / f"{model_name}.tflite")) # 6. 
Summarize the results from profiling and inference if not skip_summary and not skip_profiling: - assert profile_job.wait().success - profile_data = profile_job.download_profile() + assert profile_job is not None and profile_job.wait().success + profile_data: Dict[str, Any] = profile_job.download_profile() # type: ignore print_profile_metrics_from_job(profile_job, profile_data) if not skip_summary and not skip_inferencing: torch_out = torch_inference(model, sample_inputs) - assert inference_job.wait().success - inference_result = inference_job.download_output_data() + assert inference_job is not None and inference_job.wait().success + inference_result: hub.client.DatasetEntries = inference_job.download_output_data() # type: ignore # Convert outputs from channel last to channel first inference_result = transpose_channel_last_to_first( "output_0", inference_result, target_runtime ) print_inference_metrics(inference_job, inference_result, torch_out) - print_on_target_demo_cmd(compile_job, Path(__file__).parent.resolve(), device) + if not skip_summary: + print_on_target_demo_cmd(compile_job, Path(__file__).parent.resolve(), device) return (compile_job, profile_job, inference_job) diff --git a/qai_hub_models/models/quicksrnetsmall/info.yaml b/qai_hub_models/models/quicksrnetsmall/info.yaml index 91aff954..128750f2 100644 --- a/qai_hub_models/models/quicksrnetsmall/info.yaml +++ b/qai_hub_models/models/quicksrnetsmall/info.yaml @@ -12,6 +12,7 @@ research_paper: https://arxiv.org/abs/2303.04336 research_paper_title: 'QuickSRNet: Plain Single-Image Super-Resolution Architecture for Faster Inference on Mobile Platforms' license: https://github.com/quic/aimet-model-zoo/blob/develop/LICENSE.pdf +deploy_license: https://qaihub-public-assets.s3.us-west-2.amazonaws.com/qai-hub-models/Qualcomm+AI+Hub+Proprietary+License.pdf source_repo: https://github.com/quic/aimet-model-zoo/tree/develop/aimet_zoo_torch/quicksrnet technical_details: Model checkpoint: quicksrnet_small_4x_checkpoint_float32 @@ -27,6 +28,7 @@ form_factors: - Tablet related_models: [xlsr, esrgan, quicksrnetlarge] has_static_banner: yes -has_animated_banner: yes +has_animated_banner: no license_type: other +deploy_license_type: AI Model Hub License dataset: [] diff --git a/qai_hub_models/models/quicksrnetsmall/model.py b/qai_hub_models/models/quicksrnetsmall/model.py index 92ad52d1..54b22d82 100644 --- a/qai_hub_models/models/quicksrnetsmall/model.py +++ b/qai_hub_models/models/quicksrnetsmall/model.py @@ -40,8 +40,6 @@ def __init__( @classmethod def from_pretrained(cls) -> QuickSRNetSmall: model = _load_quicksrnet_source_model( - MODEL_ID, - MODEL_ASSET_VERSION, SCALING_FACTOR, NUM_CHANNELS, NUM_INTERMEDIATE_LAYERS, diff --git a/qai_hub_models/models/quicksrnetsmall/perf.yaml b/qai_hub_models/models/quicksrnetsmall/perf.yaml index 3cf63319..d0815180 100644 --- a/qai_hub_models/models/quicksrnetsmall/perf.yaml +++ b/qai_hub_models/models/quicksrnetsmall/perf.yaml @@ -17,22 +17,25 @@ aggregated: - Samsung Galaxy S23 - Samsung Galaxy S23 Ultra - Samsung Galaxy S23+ + - Samsung Galaxy S24 + - Samsung Galaxy S24 Ultra - Samsung Galaxy Tab S8 - Xiaomi 12 - Xiaomi 12 Pro supported_chipsets: - Snapdragon® 8 Gen 1 - Snapdragon® 8 Gen 2 + - Snapdragon® 8 Gen 3 - Snapdragon® 888 models: - name: QuickSRNetSmall performance_metrics: - torchscript_onnx_tflite: - inference_time: 1333.0 - throughput: 750.1875468867216 + inference_time: 1338.0 + throughput: 747.3841554559043 estimated_peak_memory_range: - min: 16384 - max: 8022608 + min: 24576 + max: 1376064 
primary_compute_unit: NPU precision: fp16 layer_info: @@ -40,14 +43,22 @@ models: layers_on_gpu: 0 layers_on_cpu: 3 total_layers: 11 - job_id: j1pvl9zr5 + job_id: jygzezzg8 job_status: Passed + reference_device_info: + name: Samsung Galaxy S23 + os: '13' + form_factor: Phone + os_name: Android + manufacturer: Samsung + chipset: Snapdragon® 8 Gen 2 + timestamp: '2024-03-15T00:09:24.184304Z' torchscript_onnx_qnn: - inference_time: 1017.0 - throughput: 983.284169124877 + inference_time: 1025.0 + throughput: 975.609756097561 estimated_peak_memory_range: min: 212992 - max: 64518392 + max: 37245776 primary_compute_unit: NPU precision: fp16 layer_info: @@ -55,13 +66,43 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 12 - job_id: jep2rv3qg + job_id: jnp10ok5q + job_status: Passed + - torchscript_onnx_tflite: + inference_time: 839.0 + throughput: 1191.8951132300358 + estimated_peak_memory_range: + min: 16384 + max: 17771072 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 8 + layers_on_gpu: 0 + layers_on_cpu: 3 + total_layers: 11 + job_id: jmg9voq57 job_status: Passed reference_device_info: - name: Samsung Galaxy S23 Ultra - os: '13' + name: Samsung Galaxy S24 + os: '14' form_factor: Phone os_name: Android manufacturer: Samsung - chipset: Snapdragon® 8 Gen 2 - timestamp: '2024-02-22T22:36:34.984329Z' + chipset: Snapdragon® 8 Gen 3 + timestamp: '2024-03-15T00:13:39.690790Z' + torchscript_onnx_qnn: + inference_time: 616.0 + throughput: 1623.3766233766235 + estimated_peak_memory_range: + min: 212992 + max: 14001568 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 12 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 12 + job_id: jz57zoqp3 + job_status: Passed diff --git a/qai_hub_models/models/quicksrnetsmall/test.py b/qai_hub_models/models/quicksrnetsmall/test.py index d7645b6f..87ccaee0 100644 --- a/qai_hub_models/models/quicksrnetsmall/test.py +++ b/qai_hub_models/models/quicksrnetsmall/test.py @@ -35,5 +35,6 @@ def test_task(): ) +@skip_clone_repo_check def test_demo(): demo_main(is_test=True) diff --git a/qai_hub_models/models/quicksrnetsmall_quantized/README.md b/qai_hub_models/models/quicksrnetsmall_quantized/README.md new file mode 100644 index 00000000..a8cf057f --- /dev/null +++ b/qai_hub_models/models/quicksrnetsmall_quantized/README.md @@ -0,0 +1,54 @@ +[![Qualcomm® AI Hub Models](https://qaihub-public-assets.s3.us-west-2.amazonaws.com/qai-hub-models/quic-logo.jpg)](../../README.md) + + +# [QuickSRNetSmall-Quantized: Upscale images and remove image noise](https://aihub.qualcomm.com/models/quicksrnetsmall_quantized) + +QuickSRNet Small is designed for upscaling images on mobile platforms to sharpen in real-time. + +This is based on the implementation of QuickSRNetSmall-Quantized found +[here](https://github.com/quic/aimet-model-zoo/tree/develop/aimet_zoo_torch/quicksrnet). This repository contains scripts for optimized on-device +export suitable to run on Qualcomm® devices. More details on model performance +accross various devices, can be found [here](https://aihub.qualcomm.com/models/quicksrnetsmall_quantized). + +[Sign up](https://myaccount.qualcomm.com/signup) for early access to run these models on +a hosted Qualcomm® device. + + +## Example & Usage + + +Once installed, run the following simple CLI demo: + +```bash +python -m qai_hub_models.models.quicksrnetsmall_quantized.demo +``` +More details on the CLI tool can be found with the `--help` option. 
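The quantized demo above can be driven programmatically in the same way as the float model. The sketch below mirrors the `quicksrnetsmall_quantized` tests added in this patch; it assumes the AIMET dependencies required by the quantized models are installed, and the local image path is illustrative. The `aimet_encodings` options follow the `from_pretrained` docstring in the accompanying `model.py`.

```python
# Sketch of the quantized flow, mirroring the quicksrnetsmall_quantized tests in
# this patch. Requires the AIMET packages that quantized qai_hub_models depend on.
import torch

from qai_hub_models.models._shared.super_resolution.app import SuperResolutionApp
from qai_hub_models.models.quicksrnetsmall_quantized import Model
from qai_hub_models.utils.asset_loaders import load_image

torch.manual_seed(0)  # QuantSim introduces randomness; the tests seed it for repeatable output
model = Model.from_pretrained()  # "DEFAULT": loads encodings calibrated on BSD300
# model = Model.from_pretrained(aimet_encodings=None)  # or skip encodings when computing your own
app = SuperResolutionApp(model=model)
upscaled = app.predict(load_image("low_res_input.jpg"))[0]  # hypothetical local image
```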
See +[demo.py](demo.py) for sample usage of the model including pre/post processing +scripts. Please refer to our [general instructions on using +models](../../../#getting-started) for more usage instructions. + +## Export for on-device deployment + +This repository contains export scripts that produce a model optimized for +on-device deployment. This can be run as follows: + +```bash +python -m qai_hub_models.models.quicksrnetsmall_quantized.export +``` +Additional options are documented with the `--help` option. Note that the above +script requires access to Deployment instructions for Qualcomm® AI Hub. + +## License +- The license for the original implementation of QuickSRNetSmall-Quantized can be found + [here](https://github.com/quic/aimet-model-zoo/blob/develop/LICENSE.pdf). +- The license for the compiled assets for on-device deployment can be found [here]({deploy_license_url}) + +## References +* [QuickSRNet: Plain Single-Image Super-Resolution Architecture for Faster Inference on Mobile Platforms](https://arxiv.org/abs/2303.04336) +* [Source Model Implementation](https://github.com/quic/aimet-model-zoo/tree/develop/aimet_zoo_torch/quicksrnet) + +## Community +* Join [our AI Hub Slack community](https://join.slack.com/t/qualcomm-ai-hub/shared_invite/zt-2dgf95loi-CXHTDRR1rvPgQWPO~ZZZJg) to collaborate, post questions and learn more about on-device AI. +* For questions or feedback please [reach out to us](mailto:ai-hub-support@qti.qualcomm.com). + + diff --git a/qai_hub_models/models/quicksrnetsmall_quantized/__init__.py b/qai_hub_models/models/quicksrnetsmall_quantized/__init__.py new file mode 100644 index 00000000..ef07760b --- /dev/null +++ b/qai_hub_models/models/quicksrnetsmall_quantized/__init__.py @@ -0,0 +1,10 @@ +# --------------------------------------------------------------------- +# Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. +# SPDX-License-Identifier: BSD-3-Clause +# --------------------------------------------------------------------- +from qai_hub_models.models._shared.super_resolution.app import ( # noqa: F401 + SuperResolutionApp as App, +) + +from .model import MODEL_ID # noqa: F401 +from .model import QuickSRNetSmallQuantizable as Model # noqa: F401 diff --git a/qai_hub_models/models/quicksrnetsmall_quantized/conftest.py b/qai_hub_models/models/quicksrnetsmall_quantized/conftest.py new file mode 100644 index 00000000..d24003b5 --- /dev/null +++ b/qai_hub_models/models/quicksrnetsmall_quantized/conftest.py @@ -0,0 +1,26 @@ +# --------------------------------------------------------------------- +# Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. +# SPDX-License-Identifier: BSD-3-Clause +# --------------------------------------------------------------------- +# THIS FILE WAS AUTO-GENERATED. DO NOT EDIT MANUALLY. + +from unittest.mock import patch + +import pytest + +from qai_hub_models.models.quicksrnetsmall_quantized import Model +from qai_hub_models.utils.testing import skip_clone_repo_check + + +@pytest.fixture(autouse=True) +@skip_clone_repo_check +def mock_from_pretrained(): + """ + Model.from_pretrained() can be slow. Invoke it once and cache it so all invocations + across all tests return the cached instance of the model. 
+ """ + mock = patch( + "qai_hub_models.models.quicksrnetsmall_quantized.Model.from_pretrained", + return_value=Model.from_pretrained(), + ) + mock.start() diff --git a/qai_hub_models/models/quicksrnetsmall_quantized/demo.py b/qai_hub_models/models/quicksrnetsmall_quantized/demo.py new file mode 100644 index 00000000..cb2dcd45 --- /dev/null +++ b/qai_hub_models/models/quicksrnetsmall_quantized/demo.py @@ -0,0 +1,28 @@ +# --------------------------------------------------------------------- +# Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. +# SPDX-License-Identifier: BSD-3-Clause +# --------------------------------------------------------------------- +from qai_hub_models.models._shared.super_resolution.demo import super_resolution_demo +from qai_hub_models.models.quicksrnetsmall_quantized.model import ( + MODEL_ASSET_VERSION, + MODEL_ID, + QuickSRNetSmallQuantizable, +) +from qai_hub_models.utils.asset_loaders import CachedWebModelAsset + +IMAGE_ADDRESS = CachedWebModelAsset.from_asset_store( + MODEL_ID, MODEL_ASSET_VERSION, "quicksrnet_demo.jpg" +) + + +def main(is_test: bool = False): + super_resolution_demo( + QuickSRNetSmallQuantizable, + MODEL_ID, + default_image=IMAGE_ADDRESS, + is_test=is_test, + ) + + +if __name__ == "__main__": + main() diff --git a/qai_hub_models/models/quicksrnetsmall_quantized/export.py b/qai_hub_models/models/quicksrnetsmall_quantized/export.py new file mode 100644 index 00000000..4944af8c --- /dev/null +++ b/qai_hub_models/models/quicksrnetsmall_quantized/export.py @@ -0,0 +1,215 @@ +# --------------------------------------------------------------------- +# Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. +# SPDX-License-Identifier: BSD-3-Clause +# --------------------------------------------------------------------- +# THIS FILE WAS AUTO-GENERATED. DO NOT EDIT MANUALLY. + + +from __future__ import annotations + +import os +import warnings +from pathlib import Path +from typing import Any, Dict, List, Optional, Tuple, cast + +import qai_hub as hub + +from qai_hub_models.models.quicksrnetsmall_quantized import Model +from qai_hub_models.utils.args import ( + export_parser, + get_input_spec_kwargs, + get_model_kwargs, +) +from qai_hub_models.utils.base_model import TargetRuntime +from qai_hub_models.utils.compare import torch_inference +from qai_hub_models.utils.printing import ( + print_inference_metrics, + print_on_target_demo_cmd, + print_profile_metrics_from_job, +) +from qai_hub_models.utils.qai_hub_helpers import ( + can_access_qualcomm_ai_hub, + export_without_hub_access, + transpose_channel_first_to_last, + transpose_channel_last_to_first, +) +from qai_hub_models.utils.qnn_helpers import get_qnn_inputs + + +def export_model( + device: str = "Samsung Galaxy S23", + skip_profiling: bool = False, + skip_inferencing: bool = False, + skip_downloading: bool = False, + skip_summary: bool = False, + output_dir: Optional[str] = None, + target_runtime: TargetRuntime = TargetRuntime.TFLITE, + compile_options: str = "", + profile_options: str = "", + **additional_model_kwargs, +) -> Tuple[hub.CompileJob, Optional[hub.ProfileJob], Optional[hub.InferenceJob]] | List[ + str +]: + """ + This function accomplishes 6 main tasks: + + 1. Instantiates a PyTorch model and converts it to a traced TorchScript format. + 2. Compiles the model to an asset that can be run on device. + 3. Profiles the model performance on real devices. + 4. Inferences the model on sample inputs. + 5. Downloads the model asset to the local directory. 
+ 6. Summarizes the results from profiling and inference. + + Each of the last four steps can be optionally skipped using the input options. + + Parameters: + device: Device for which to export the model. + Full list of available devices can be found by running `hub.get_devices()`. + Defaults to DEFAULT_DEVICE if not specified. + skip_profiling: If set, skips profiling of compiled model on real devices. + skip_inferencing: If set, skips computing on-device outputs from sample data. + skip_downloading: If set, skips downloading of compiled model. + skip_summary: If set, skips waiting for and summarizing results + from profiling and inference. + output_dir: Directory to store generated assets (e.g. compiled model). + Defaults to `/build/`. + target_runtime: Which on-device runtime to target. Default is TFLite. + compile_options: Additional options to pass when submitting the compile job. + profile_options: Additional options to pass when submitting the profile job. + **additional_model_kwargs: Additional optional kwargs used to customize + `model_cls.from_pretrained` and `model.get_input_spec` + + Returns: + A 3-tuple of: + * A CompileJob object containing metadata about the compile job submitted to hub. + * A ProfileJob containing metadata about the profile job (None if profiling skipped). + * An InferenceJob containing metadata about the inference job (None if inferencing skipped). + """ + model_name = "quicksrnetsmall_quantized" + output_path = Path(output_dir or Path.cwd() / "build" / model_name) + if not can_access_qualcomm_ai_hub(): + return export_without_hub_access( + "quicksrnetsmall_quantized", + "QuickSRNetSmall-Quantized", + device, + skip_profiling, + skip_inferencing, + skip_downloading, + skip_summary, + output_path, + target_runtime, + compile_options, + profile_options, + ) + + # 1. Initialize PyTorch model + model = Model.from_pretrained(**get_model_kwargs(Model, additional_model_kwargs)) + input_spec = model.get_input_spec( + **get_input_spec_kwargs(model, additional_model_kwargs) + ) + + # Trace the model + source_model = model.convert_to_hub_source_model( + target_runtime, output_path, input_spec + ) + if target_runtime == TargetRuntime.TFLITE: + quant_calibration_data = None + else: + quant_calibration_data = model.get_calibration_data(target_runtime, input_spec) + + # 2. Compile the model to an on-device asset + model_compile_options = model.get_hub_compile_options( + target_runtime, + compile_options + + " --force_channel_last_input image" + + " --force_channel_last_output output_0", + ) + print(f"Optimizing model {model_name} to run on-device") + submitted_compile_job = hub.submit_compile_job( + model=source_model, + input_specs=input_spec, + device=hub.Device(device), + name=model_name, + calibration_data=quant_calibration_data, + options=model_compile_options, + ) + compile_job = cast(hub.client.CompileJob, submitted_compile_job) + + # 3. Profile the model asset on real devices + profile_job: Optional[hub.client.ProfileJob] = None + if not skip_profiling: + profile_options_all = model.get_hub_profile_options( + target_runtime, profile_options + ) + print(f"Profiling model {model_name} on a hosted device.") + submitted_profile_job = hub.submit_profile_job( + model=compile_job.get_target_model(), + device=hub.Device(device), + name=model_name, + options=profile_options_all, + ) + profile_job = cast(hub.client.ProfileJob, submitted_profile_job) + + # 4. 
Run inference on-device with sample inputs + inference_job: Optional[hub.client.InferenceJob] = None + if not skip_inferencing: + profile_options_all = model.get_hub_profile_options( + target_runtime, profile_options + ) + print( + f"Running inference for {model_name} on a hosted device with example inputs." + ) + sample_inputs = model.sample_inputs(input_spec) + hub_inputs = sample_inputs + if target_runtime == TargetRuntime.QNN: + hub_inputs = get_qnn_inputs(compile_job, sample_inputs) + # Convert inputs from channel first to channel last + hub_inputs = transpose_channel_first_to_last( + "image", sample_inputs, target_runtime + ) + submitted_inference_job = hub.submit_inference_job( + model=compile_job.get_target_model(), + inputs=hub_inputs, + device=hub.Device(device), + name=model_name, + options=profile_options_all, + ) + inference_job = cast(hub.client.InferenceJob, submitted_inference_job) + + # 5. Download the model asset to a local file + if not skip_downloading: + os.makedirs(output_path, exist_ok=True) + target_model: hub.Model = compile_job.get_target_model() # type: ignore + target_model.download(str(output_path / f"{model_name}.tflite")) + + # 6. Summarize the results from profiling and inference + if not skip_summary and not skip_profiling: + assert profile_job is not None and profile_job.wait().success + profile_data: Dict[str, Any] = profile_job.download_profile() # type: ignore + print_profile_metrics_from_job(profile_job, profile_data) + + if not skip_summary and not skip_inferencing: + torch_out = torch_inference(model, sample_inputs) + assert inference_job is not None and inference_job.wait().success + inference_result: hub.client.DatasetEntries = inference_job.download_output_data() # type: ignore + # Convert outputs from channel last to channel first + inference_result = transpose_channel_last_to_first( + "output_0", inference_result, target_runtime + ) + print_inference_metrics(inference_job, inference_result, torch_out) + + if not skip_summary: + print_on_target_demo_cmd(compile_job, Path(__file__).parent.resolve(), device) + + return (compile_job, profile_job, inference_job) + + +def main(): + warnings.filterwarnings("ignore") + parser = export_parser(model_cls=Model, supports_qnn=False) + args = parser.parse_args() + export_model(**vars(args)) + + +if __name__ == "__main__": + main() diff --git a/qai_hub_models/models/quicksrnetsmall_quantized/info.yaml b/qai_hub_models/models/quicksrnetsmall_quantized/info.yaml new file mode 100644 index 00000000..fad05b98 --- /dev/null +++ b/qai_hub_models/models/quicksrnetsmall_quantized/info.yaml @@ -0,0 +1,35 @@ +name: QuickSRNetSmall-Quantized +# id must match with the model dir name in qai_hub_models +id: quicksrnetsmall_quantized +status: public +headline: Upscale images and remove image noise. +domain: Computer Vision +description: QuickSRNet Small is designed for upscaling images on mobile platforms + to sharpen in real-time. 
+use_case: Super Resolution +tags: + - quantized +research_paper: https://arxiv.org/abs/2303.04336 +research_paper_title: 'QuickSRNet: Plain Single-Image Super-Resolution Architecture + for Faster Inference on Mobile Platforms' +license: https://github.com/quic/aimet-model-zoo/blob/develop/LICENSE.pdf +deploy_license: https://qaihub-public-assets.s3.us-west-2.amazonaws.com/qai-hub-models/Qualcomm+AI+Hub+Proprietary+License.pdf +source_repo: https://github.com/quic/aimet-model-zoo/tree/develop/aimet_zoo_torch/quicksrnet +technical_details: + Model checkpoint: quicksrnet_small_4x_checkpoint_int8 + Input resolution: 128x128 + Number of parameters: 33.3K + Model size: 42.5 KB +applicable_scenarios: + - Virtual Real Estate Tours + - Gaming + - ARVR +form_factors: + - Phone + - Tablet +related_models: [xlsr_quantized, esrgan, quicksrnetsmall] +has_static_banner: yes +has_animated_banner: yes +license_type: other +deploy_license_type: AI Model Hub License +dataset: [] diff --git a/qai_hub_models/models/quicksrnetsmall_quantized/model.py b/qai_hub_models/models/quicksrnetsmall_quantized/model.py new file mode 100644 index 00000000..5ba7fb5e --- /dev/null +++ b/qai_hub_models/models/quicksrnetsmall_quantized/model.py @@ -0,0 +1,97 @@ +# --------------------------------------------------------------------- +# Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. +# SPDX-License-Identifier: BSD-3-Clause +# --------------------------------------------------------------------- +from __future__ import annotations + +# isort: off +# This verifies aimet is installed, and this must be included first. +from qai_hub_models.utils.quantization_aimet import ( + AIMETQuantizableMixin, +) + +# isort: on + +import torch +from aimet_torch.cross_layer_equalization import equalize_model +from aimet_torch.quantsim import QuantizationSimModel, load_encodings_to_sim + +from qai_hub_models.models.quicksrnetsmall.model import QuickSRNetSmall +from qai_hub_models.utils.aimet.config_loader import get_default_aimet_config_legacy_v2 +from qai_hub_models.utils.asset_loaders import CachedWebModelAsset + +MODEL_ID = __name__.split(".")[-2] +MODEL_ASSET_VERSION = 2 + +# Weights and config stored in S3 are sourced from +# https://github.com/quic/aimet-model-zoo/blob/develop/aimet_zoo_torch/quicksrnet/model/model_cards/quicksrnet_small_4x_w8a8.json: +# https://github.com/quic/aimet-model-zoo/releases/download/phase_2_january_artifacts/quicksrnet_small_4x_checkpoint_int8.pth +# and +# https://raw.githubusercontent.com/quic/aimet/release-aimet-1.23/TrainingExtensions/common/src/python/aimet_common/quantsim_config/default_config_per_channel.js +# Encodings were generated with AIMET QuantSim library +QUANTIZED_WEIGHTS = "quicksrnet_small_4x_checkpoint_int8.pth" +AIMET_ENCODINGS = "aimet_quantization_encodings.json" +SCALING_FACTOR = 4 + + +class QuickSRNetSmallQuantizable(AIMETQuantizableMixin, QuickSRNetSmall): + """QuickSRNetSmall with post train quantization support. + Supports only 8 bit weights and activations, and only loads pre-quantized checkpoints. 
+ Support for quantizing using your own weights & data will come at a later date.""" + + def __init__( + self, + quicksrnet_model: QuantizationSimModel, + ) -> None: + QuickSRNetSmall.__init__(self, quicksrnet_model.model) + AIMETQuantizableMixin.__init__( + self, quicksrnet_model, needs_onnx_direct_aimet_export=True + ) + + @classmethod + def from_pretrained( + cls, aimet_encodings: str | None = "DEFAULT" + ) -> "QuickSRNetSmallQuantizable": + """ + Parameters: + aimet_encodings: + if "DEFAULT": Loads the model with aimet encodings calibrated on BSD300. + elif None: Doesn't load any encodings. Used when computing encodings. + else: Interprets as a filepath and loads the encodings stored there. + """ + # Load Model + quicksrnet = QuickSRNetSmall.from_pretrained() + input_shape = quicksrnet.get_input_spec()["image"][0] + equalize_model(quicksrnet, input_shape) + + # Download weights and quantization parameters + weights = CachedWebModelAsset.from_asset_store( + MODEL_ID, MODEL_ASSET_VERSION, QUANTIZED_WEIGHTS + ).fetch() + aimet_config = get_default_aimet_config_legacy_v2() + + # Load the model weights and quantization parameters + # In this particular instance, the state_dict keys from the model are all named "model." + # where is the name of each key in the weights file - without the word model. + # We rename all the keys to add the word model + state_dict = torch.load(weights, map_location=torch.device("cpu"))["state_dict"] + new_state_dict = {"model." + key: value for key, value in state_dict.items()} + quicksrnet.load_state_dict(new_state_dict) + sim = QuantizationSimModel( + quicksrnet, + quant_scheme="tf_enhanced", + default_param_bw=8, + default_output_bw=8, + config_file=aimet_config, + dummy_input=torch.rand(input_shape), + ) + if aimet_encodings: + if aimet_encodings == "DEFAULT": + aimet_encodings = CachedWebModelAsset.from_asset_store( + MODEL_ID, MODEL_ASSET_VERSION, AIMET_ENCODINGS + ).fetch() + load_encodings_to_sim(sim, aimet_encodings) + + sim.model.eval() + + return cls(sim) diff --git a/qai_hub_models/models/quicksrnetsmall_quantized/perf.yaml b/qai_hub_models/models/quicksrnetsmall_quantized/perf.yaml new file mode 100644 index 00000000..8766199c --- /dev/null +++ b/qai_hub_models/models/quicksrnetsmall_quantized/perf.yaml @@ -0,0 +1,108 @@ +models: +- name: QuickSRNetSmall-Quantized + performance_metrics: + - torchscript_onnx_tflite: + inference_time: 1355.0 + throughput: 738.0073800738007 + estimated_peak_memory_range: + min: 20480 + max: 2224928 + primary_compute_unit: NPU + precision: int8 + layer_info: + layers_on_npu: 10 + layers_on_gpu: 0 + layers_on_cpu: 3 + total_layers: 13 + job_id: jz57zknp3 + job_status: Passed + reference_device_info: + name: Samsung Galaxy S23 + os: '13' + form_factor: Phone + os_name: Android + manufacturer: Samsung + chipset: Snapdragon® 8 Gen 2 + timestamp: '2024-03-14T23:22:40.346377Z' + torchscript_onnx_qnn: + inference_time: 'null' + throughput: 'null' + estimated_peak_memory_range: + min: 0 + max: 0 + primary_compute_unit: 'null' + precision: 'null' + layer_info: + layers_on_npu: 0 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 0 + job_id: '' + job_status: Skipped + - torchscript_onnx_tflite: + inference_time: 1099.0 + throughput: 909.9181073703367 + estimated_peak_memory_range: + min: 20480 + max: 20205264 + primary_compute_unit: NPU + precision: int8 + layer_info: + layers_on_npu: 10 + layers_on_gpu: 0 + layers_on_cpu: 3 + total_layers: 13 + job_id: jqp4qm2go + job_status: Passed + reference_device_info: + name: Samsung 
Galaxy S24 + os: '14' + form_factor: Phone + os_name: Android + manufacturer: Samsung + chipset: Snapdragon® 8 Gen 3 + timestamp: '2024-03-14T23:22:40.346384Z' + torchscript_onnx_qnn: + inference_time: 'null' + throughput: 'null' + estimated_peak_memory_range: + min: 0 + max: 0 + primary_compute_unit: 'null' + precision: 'null' + layer_info: + layers_on_npu: 0 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 0 + job_id: '' + job_status: Skipped +aggregated: + supported_oses: + - Android + supported_devices: + - Google Pixel 3 + - Google Pixel 3a + - Google Pixel 3a XL + - Google Pixel 4 + - Google Pixel 4a + - Google Pixel 5a 5G + - Samsung Galaxy S21 + - Samsung Galaxy S21 Ultra + - Samsung Galaxy S21+ + - Samsung Galaxy S22 5G + - Samsung Galaxy S22 Ultra 5G + - Samsung Galaxy S22+ 5G + - Samsung Galaxy S23 + - Samsung Galaxy S23 Ultra + - Samsung Galaxy S23+ + - Samsung Galaxy S24 + - Samsung Galaxy S24 Ultra + - Samsung Galaxy Tab S8 + - Xiaomi 12 + - Xiaomi 12 Pro + supported_chipsets: + - Snapdragon® 8 Gen 1 + - Snapdragon® 8 Gen 2 + - Snapdragon® 8 Gen 3 + - Snapdragon® 888 diff --git a/qai_hub_models/models/quicksrnetsmall_quantized/test.py b/qai_hub_models/models/quicksrnetsmall_quantized/test.py new file mode 100644 index 00000000..be878b99 --- /dev/null +++ b/qai_hub_models/models/quicksrnetsmall_quantized/test.py @@ -0,0 +1,87 @@ +# --------------------------------------------------------------------- +# Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. +# SPDX-License-Identifier: BSD-3-Clause +# --------------------------------------------------------------------- +import os +import tempfile +import zipfile + +import numpy as np +import pytest +import torch + +from qai_hub_models.models._shared.super_resolution.app import SuperResolutionApp +from qai_hub_models.models.quicksrnetsmall_quantized.demo import IMAGE_ADDRESS +from qai_hub_models.models.quicksrnetsmall_quantized.demo import main as demo_main +from qai_hub_models.models.quicksrnetsmall_quantized.model import ( + MODEL_ASSET_VERSION, + MODEL_ID, + QuickSRNetSmallQuantizable, +) +from qai_hub_models.utils.asset_loaders import CachedWebModelAsset, load_image +from qai_hub_models.utils.testing import assert_most_close, skip_clone_repo_check + +OUTPUT_IMAGE_ADDRESS = CachedWebModelAsset.from_asset_store( + MODEL_ID, MODEL_ASSET_VERSION, "quicksrnetsmall_quantized_output.png" +) + + +@skip_clone_repo_check +def test_task(): + # AIMET Quantization Simulator introduces randomness. Eliminate that for this test. 
+ torch.manual_seed(0) + image = load_image(IMAGE_ADDRESS) + model = QuickSRNetSmallQuantizable.from_pretrained() + app = SuperResolutionApp(model=model) + app_output_image = app.predict(image)[0] + + output_image = load_image(OUTPUT_IMAGE_ADDRESS) + assert_most_close( + np.asarray(app_output_image, dtype=np.float32) / 255, + np.asarray(output_image, dtype=np.float32) / 255, + diff_tol=0.005, + rtol=0.02, + atol=0.2, + ) + + +@pytest.mark.trace +@skip_clone_repo_check +def test_trace(): + image = load_image(IMAGE_ADDRESS) + output_image = load_image(OUTPUT_IMAGE_ADDRESS) + app = SuperResolutionApp( + QuickSRNetSmallQuantizable.from_pretrained().convert_to_torchscript() + ) + app_output_image = app.predict(image)[0] + + assert_most_close( + np.asarray(app_output_image, dtype=np.float32) / 255, + np.asarray(output_image, dtype=np.float32) / 255, + diff_tol=0.005, + rtol=0.02, + atol=0.2, + ) + + +@pytest.mark.skip("https://github.com/tetraai/tetracode/issues/9606") +@skip_clone_repo_check +def test_aimet_export(): + model = QuickSRNetSmallQuantizable.from_pretrained() + name = model.__class__.__name__ + with tempfile.TemporaryDirectory() as tmpdir: + output_zip = model.convert_to_onnx_and_aimet_encodings( + tmpdir, + ) + assert os.path.exists(output_zip) + with zipfile.ZipFile(output_zip, "r") as zip: + assert zip.namelist() == [ + f"{name}.aimet/", + f"{name}.aimet/{name}.onnx", + f"{name}.aimet/{name}.encodings", + ] + + +@skip_clone_repo_check +def test_demo(): + demo_main(is_test=True) diff --git a/qai_hub_models/models/real_esrgan_general_x4v3/README.md b/qai_hub_models/models/real_esrgan_general_x4v3/README.md index a3ce066c..982d103e 100644 --- a/qai_hub_models/models/real_esrgan_general_x4v3/README.md +++ b/qai_hub_models/models/real_esrgan_general_x4v3/README.md @@ -10,7 +10,7 @@ This is based on the implementation of Real-ESRGAN-General-x4v3 found export suitable to run on Qualcomm® devices. More details on model performance accross various devices, can be found [here](https://aihub.qualcomm.com/models/real_esrgan_general_x4v3). -[Sign up](https://aihub.qualcomm.com/) for early access to run these models on +[Sign up](https://myaccount.qualcomm.com/signup) for early access to run these models on a hosted Qualcomm® device. @@ -30,7 +30,7 @@ python -m qai_hub_models.models.real_esrgan_general_x4v3.demo More details on the CLI tool can be found with the `--help` option. See [demo.py](demo.py) for sample usage of the model including pre/post processing scripts. Please refer to our [general instructions on using -models](../../#qai-hub-models) for more usage instructions. +models](../../../#getting-started) for more usage instructions. ## Export for on-device deployment @@ -46,7 +46,7 @@ script requires access to Deployment instructions for Qualcomm® AI Hub. ## License - The license for the original implementation of Real-ESRGAN-General-x4v3 can be found [here](https://github.com/xinntao/Real-ESRGAN/blob/master/LICENSE). -- The license for the compiled assets for on-device deployment can be found [here](https://qaihub-public-assets.s3.us-west-2.amazonaws.com/qai-hub-models/Qualcomm+AI+Hub+Proprietary+License.pdf). 
+- The license for the compiled assets for on-device deployment can be found [here]({deploy_license_url}) ## References * [Real-ESRGAN: Training Real-World Blind Super-Resolution with Pure Synthetic Data](https://arxiv.org/abs/2107.10833) diff --git a/qai_hub_models/models/real_esrgan_general_x4v3/conftest.py b/qai_hub_models/models/real_esrgan_general_x4v3/conftest.py new file mode 100644 index 00000000..a24a89d5 --- /dev/null +++ b/qai_hub_models/models/real_esrgan_general_x4v3/conftest.py @@ -0,0 +1,26 @@ +# --------------------------------------------------------------------- +# Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. +# SPDX-License-Identifier: BSD-3-Clause +# --------------------------------------------------------------------- +# THIS FILE WAS AUTO-GENERATED. DO NOT EDIT MANUALLY. + +from unittest.mock import patch + +import pytest + +from qai_hub_models.models.real_esrgan_general_x4v3 import Model +from qai_hub_models.utils.testing import skip_clone_repo_check + + +@pytest.fixture(autouse=True) +@skip_clone_repo_check +def mock_from_pretrained(): + """ + Model.from_pretrained() can be slow. Invoke it once and cache it so all invocations + across all tests return the cached instance of the model. + """ + mock = patch( + "qai_hub_models.models.real_esrgan_general_x4v3.Model.from_pretrained", + return_value=Model.from_pretrained(), + ) + mock.start() diff --git a/qai_hub_models/models/real_esrgan_general_x4v3/demo.py b/qai_hub_models/models/real_esrgan_general_x4v3/demo.py index ae8541e7..27580a37 100644 --- a/qai_hub_models/models/real_esrgan_general_x4v3/demo.py +++ b/qai_hub_models/models/real_esrgan_general_x4v3/demo.py @@ -21,6 +21,7 @@ def main(is_test: bool = False): super_resolution_demo( model_cls=Real_ESRGAN_General_x4v3, + model_id=MODEL_ID, default_image=IMAGE_ADDRESS, is_test=is_test, ) diff --git a/qai_hub_models/models/real_esrgan_general_x4v3/export.py b/qai_hub_models/models/real_esrgan_general_x4v3/export.py index 4f86924c..7f5ce2b6 100644 --- a/qai_hub_models/models/real_esrgan_general_x4v3/export.py +++ b/qai_hub_models/models/real_esrgan_general_x4v3/export.py @@ -10,7 +10,7 @@ import os import warnings from pathlib import Path -from typing import List, Optional, Tuple +from typing import Any, Dict, List, Optional, Tuple, cast import qai_hub as hub import torch @@ -26,6 +26,7 @@ from qai_hub_models.utils.input_spec import make_torch_inputs from qai_hub_models.utils.printing import ( print_inference_metrics, + print_on_target_demo_cmd, print_profile_metrics_from_job, ) from qai_hub_models.utils.qai_hub_helpers import ( @@ -109,7 +110,7 @@ def export_model( ) # Trace the model - source_model = torch.jit.trace(model, make_torch_inputs(input_spec)) + source_model = torch.jit.trace(model.to("cpu"), make_torch_inputs(input_spec)) # 2. Compile the model to an on-device asset model_compile_options = model.get_hub_compile_options( @@ -118,29 +119,37 @@ def export_model( + " --force_channel_last_input image" + " --force_channel_last_output output_0", ) - print(f"Optimizing model {model_name} to run on-device.") - compile_job = hub.submit_compile_job( + print(f"Optimizing model {model_name} to run on-device") + submitted_compile_job = hub.submit_compile_job( model=source_model, input_specs=input_spec, device=hub.Device(device), name=model_name, options=model_compile_options, ) + compile_job = cast(hub.client.CompileJob, submitted_compile_job) # 3. 
Profile the model asset on real devices - profile_job = None + profile_job: Optional[hub.client.ProfileJob] = None if not skip_profiling: + profile_options_all = model.get_hub_profile_options( + target_runtime, profile_options + ) print(f"Profiling model {model_name} on a hosted device.") - profile_job = hub.submit_profile_job( + submitted_profile_job = hub.submit_profile_job( model=compile_job.get_target_model(), device=hub.Device(device), name=model_name, - options=profile_options, + options=profile_options_all, ) + profile_job = cast(hub.client.ProfileJob, submitted_profile_job) # 4. Run inference on-device with sample inputs - inference_job = None + inference_job: Optional[hub.client.InferenceJob] = None if not skip_inferencing: + profile_options_all = model.get_hub_profile_options( + target_runtime, profile_options + ) print( f"Running inference for {model_name} on a hosted device with example inputs." ) @@ -149,36 +158,40 @@ def export_model( hub_inputs = transpose_channel_first_to_last( "image", sample_inputs, target_runtime ) - inference_job = hub.submit_inference_job( + submitted_inference_job = hub.submit_inference_job( model=compile_job.get_target_model(), inputs=hub_inputs, device=hub.Device(device), name=model_name, - options=profile_options, + options=profile_options_all, ) + inference_job = cast(hub.client.InferenceJob, submitted_inference_job) # 5. Download the model asset to a local file if not skip_downloading: os.makedirs(output_path, exist_ok=True) - target_model = compile_job.get_target_model() + target_model: hub.Model = compile_job.get_target_model() # type: ignore target_model.download(str(output_path / f"{model_name}.tflite")) # 6. Summarize the results from profiling and inference if not skip_summary and not skip_profiling: - assert profile_job.wait().success - profile_data = profile_job.download_profile() + assert profile_job is not None and profile_job.wait().success + profile_data: Dict[str, Any] = profile_job.download_profile() # type: ignore print_profile_metrics_from_job(profile_job, profile_data) if not skip_summary and not skip_inferencing: torch_out = torch_inference(model, sample_inputs) - assert inference_job.wait().success - inference_result = inference_job.download_output_data() + assert inference_job is not None and inference_job.wait().success + inference_result: hub.client.DatasetEntries = inference_job.download_output_data() # type: ignore # Convert outputs from channel last to channel first inference_result = transpose_channel_last_to_first( "output_0", inference_result, target_runtime ) print_inference_metrics(inference_job, inference_result, torch_out) + if not skip_summary: + print_on_target_demo_cmd(compile_job, Path(__file__).parent.resolve(), device) + return (compile_job, profile_job, inference_job) diff --git a/qai_hub_models/models/real_esrgan_general_x4v3/info.yaml b/qai_hub_models/models/real_esrgan_general_x4v3/info.yaml index f2dbd297..157e0325 100644 --- a/qai_hub_models/models/real_esrgan_general_x4v3/info.yaml +++ b/qai_hub_models/models/real_esrgan_general_x4v3/info.yaml @@ -12,6 +12,7 @@ research_paper: https://arxiv.org/abs/2107.10833 research_paper_title: 'Real-ESRGAN: Training Real-World Blind Super-Resolution with Pure Synthetic Data' license: https://github.com/xinntao/Real-ESRGAN/blob/master/LICENSE +deploy_license: https://qaihub-public-assets.s3.us-west-2.amazonaws.com/qai-hub-models/Qualcomm+AI+Hub+Proprietary+License.pdf source_repo: https://github.com/xinntao/Real-ESRGAN/tree/master technical_details: Model 
checkpoint: realesr-general-x4v3 @@ -33,4 +34,5 @@ form_factors: has_static_banner: yes has_animated_banner: yes license_type: bsd-3-clause +deploy_license_type: AI Model Hub License dataset: [] diff --git a/qai_hub_models/models/real_esrgan_general_x4v3/perf.yaml b/qai_hub_models/models/real_esrgan_general_x4v3/perf.yaml index 5305963f..c79aa05c 100644 --- a/qai_hub_models/models/real_esrgan_general_x4v3/perf.yaml +++ b/qai_hub_models/models/real_esrgan_general_x4v3/perf.yaml @@ -17,22 +17,25 @@ aggregated: - Samsung Galaxy S23 - Samsung Galaxy S23 Ultra - Samsung Galaxy S23+ + - Samsung Galaxy S24 + - Samsung Galaxy S24 Ultra - Samsung Galaxy Tab S8 - Xiaomi 12 - Xiaomi 12 Pro supported_chipsets: - Snapdragon® 8 Gen 1 - Snapdragon® 8 Gen 2 + - Snapdragon® 8 Gen 3 - Snapdragon® 888 models: - name: Real-ESRGAN-General-x4v3 performance_metrics: - torchscript_onnx_tflite: - inference_time: 7168.0 - throughput: 139.50892857142858 + inference_time: 7285.0 + throughput: 137.26835964310226 estimated_peak_memory_range: - min: 15761408 - max: 27106520 + min: 15745024 + max: 20241416 primary_compute_unit: NPU precision: fp16 layer_info: @@ -40,14 +43,22 @@ models: layers_on_gpu: 0 layers_on_cpu: 3 total_layers: 72 - job_id: jmg9zy3qp + job_id: j1glno2pv job_status: Passed + reference_device_info: + name: Samsung Galaxy S23 + os: '13' + form_factor: Phone + os_name: Android + manufacturer: Samsung + chipset: Snapdragon® 8 Gen 2 + timestamp: '2024-03-14T23:12:36.720476Z' torchscript_onnx_qnn: - inference_time: 6995.0 - throughput: 142.9592566118656 + inference_time: 6983.0 + throughput: 143.20492624946297 estimated_peak_memory_range: - min: 45056 - max: 67127640 + min: 12288 + max: 10852600 primary_compute_unit: NPU precision: fp16 layer_info: @@ -55,13 +66,43 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 73 - job_id: jnp1nwdkg + job_id: j1p3kxm52 + job_status: Passed + - torchscript_onnx_tflite: + inference_time: 5660.0 + throughput: 176.67844522968198 + estimated_peak_memory_range: + min: 57344 + max: 53042192 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 69 + layers_on_gpu: 0 + layers_on_cpu: 3 + total_layers: 72 + job_id: jw566rn5o job_status: Passed reference_device_info: - name: Samsung Galaxy S23 Ultra - os: '13' + name: Samsung Galaxy S24 + os: '14' form_factor: Phone os_name: Android manufacturer: Samsung - chipset: Snapdragon® 8 Gen 2 - timestamp: '2024-02-21T16:15:20.798589Z' + chipset: Snapdragon® 8 Gen 3 + timestamp: '2024-03-14T23:17:08.081378Z' + torchscript_onnx_qnn: + inference_time: 4939.0 + throughput: 202.47013565499088 + estimated_peak_memory_range: + min: 208896 + max: 32676160 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 73 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 73 + job_id: jwgoyo158 + job_status: Passed diff --git a/qai_hub_models/models/real_esrgan_general_x4v3/requirements.txt b/qai_hub_models/models/real_esrgan_general_x4v3/requirements.txt index 80ca5630..00e6cc01 100644 --- a/qai_hub_models/models/real_esrgan_general_x4v3/requirements.txt +++ b/qai_hub_models/models/real_esrgan_general_x4v3/requirements.txt @@ -1,6 +1,3 @@ -opencv-python -PyYAML -requests -scipy -seaborn -basicsr +scipy==1.8.1 +seaborn==0.11.0 +basicsr==1.4.2 diff --git a/qai_hub_models/models/real_esrgan_x4plus/README.md b/qai_hub_models/models/real_esrgan_x4plus/README.md index 9ef627ff..b5c8fd81 100644 --- a/qai_hub_models/models/real_esrgan_x4plus/README.md +++ 
b/qai_hub_models/models/real_esrgan_x4plus/README.md @@ -10,7 +10,7 @@ This is based on the implementation of Real-ESRGAN-x4plus found export suitable to run on Qualcomm® devices. More details on model performance accross various devices, can be found [here](https://aihub.qualcomm.com/models/real_esrgan_x4plus). -[Sign up](https://aihub.qualcomm.com/) for early access to run these models on +[Sign up](https://myaccount.qualcomm.com/signup) for early access to run these models on a hosted Qualcomm® device. @@ -30,7 +30,7 @@ python -m qai_hub_models.models.real_esrgan_x4plus.demo More details on the CLI tool can be found with the `--help` option. See [demo.py](demo.py) for sample usage of the model including pre/post processing scripts. Please refer to our [general instructions on using -models](../../#qai-hub-models) for more usage instructions. +models](../../../#getting-started) for more usage instructions. ## Export for on-device deployment @@ -46,7 +46,7 @@ script requires access to Deployment instructions for Qualcomm® AI Hub. ## License - The license for the original implementation of Real-ESRGAN-x4plus can be found [here](https://github.com/xinntao/Real-ESRGAN/blob/master/LICENSE). -- The license for the compiled assets for on-device deployment can be found [here](https://qaihub-public-assets.s3.us-west-2.amazonaws.com/qai-hub-models/Qualcomm+AI+Hub+Proprietary+License.pdf). +- The license for the compiled assets for on-device deployment can be found [here]({deploy_license_url}) ## References * [Real-ESRGAN: Training Real-World Blind Super-Resolution with Pure Synthetic Data](https://arxiv.org/abs/2107.10833) diff --git a/qai_hub_models/models/real_esrgan_x4plus/conftest.py b/qai_hub_models/models/real_esrgan_x4plus/conftest.py new file mode 100644 index 00000000..c6d91d61 --- /dev/null +++ b/qai_hub_models/models/real_esrgan_x4plus/conftest.py @@ -0,0 +1,26 @@ +# --------------------------------------------------------------------- +# Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. +# SPDX-License-Identifier: BSD-3-Clause +# --------------------------------------------------------------------- +# THIS FILE WAS AUTO-GENERATED. DO NOT EDIT MANUALLY. + +from unittest.mock import patch + +import pytest + +from qai_hub_models.models.real_esrgan_x4plus import Model +from qai_hub_models.utils.testing import skip_clone_repo_check + + +@pytest.fixture(autouse=True) +@skip_clone_repo_check +def mock_from_pretrained(): + """ + Model.from_pretrained() can be slow. Invoke it once and cache it so all invocations + across all tests return the cached instance of the model. 
+ """ + mock = patch( + "qai_hub_models.models.real_esrgan_x4plus.Model.from_pretrained", + return_value=Model.from_pretrained(), + ) + mock.start() diff --git a/qai_hub_models/models/real_esrgan_x4plus/demo.py b/qai_hub_models/models/real_esrgan_x4plus/demo.py index 5eeb17e2..60e6495a 100644 --- a/qai_hub_models/models/real_esrgan_x4plus/demo.py +++ b/qai_hub_models/models/real_esrgan_x4plus/demo.py @@ -21,6 +21,7 @@ def main(is_test: bool = False): super_resolution_demo( model_cls=Real_ESRGAN_x4plus, + model_id=MODEL_ID, default_image=IMAGE_ADDRESS, is_test=is_test, ) diff --git a/qai_hub_models/models/real_esrgan_x4plus/export.py b/qai_hub_models/models/real_esrgan_x4plus/export.py index c0bade9f..73ab228c 100644 --- a/qai_hub_models/models/real_esrgan_x4plus/export.py +++ b/qai_hub_models/models/real_esrgan_x4plus/export.py @@ -10,7 +10,7 @@ import os import warnings from pathlib import Path -from typing import List, Optional, Tuple +from typing import Any, Dict, List, Optional, Tuple, cast import qai_hub as hub import torch @@ -26,6 +26,7 @@ from qai_hub_models.utils.input_spec import make_torch_inputs from qai_hub_models.utils.printing import ( print_inference_metrics, + print_on_target_demo_cmd, print_profile_metrics_from_job, ) from qai_hub_models.utils.qai_hub_helpers import ( @@ -107,65 +108,77 @@ def export_model( ) # Trace the model - source_model = torch.jit.trace(model, make_torch_inputs(input_spec)) + source_model = torch.jit.trace(model.to("cpu"), make_torch_inputs(input_spec)) # 2. Compile the model to an on-device asset model_compile_options = model.get_hub_compile_options( target_runtime, compile_options ) - print(f"Optimizing model {model_name} to run on-device.") - compile_job = hub.submit_compile_job( + print(f"Optimizing model {model_name} to run on-device") + submitted_compile_job = hub.submit_compile_job( model=source_model, input_specs=input_spec, device=hub.Device(device), name=model_name, options=model_compile_options, ) + compile_job = cast(hub.client.CompileJob, submitted_compile_job) # 3. Profile the model asset on real devices - profile_job = None + profile_job: Optional[hub.client.ProfileJob] = None if not skip_profiling: + profile_options_all = model.get_hub_profile_options( + target_runtime, profile_options + ) print(f"Profiling model {model_name} on a hosted device.") - profile_job = hub.submit_profile_job( + submitted_profile_job = hub.submit_profile_job( model=compile_job.get_target_model(), device=hub.Device(device), name=model_name, - options=profile_options, + options=profile_options_all, ) + profile_job = cast(hub.client.ProfileJob, submitted_profile_job) # 4. Run inference on-device with sample inputs - inference_job = None + inference_job: Optional[hub.client.InferenceJob] = None if not skip_inferencing: + profile_options_all = model.get_hub_profile_options( + target_runtime, profile_options + ) print( f"Running inference for {model_name} on a hosted device with example inputs." ) sample_inputs = model.sample_inputs(input_spec) - inference_job = hub.submit_inference_job( + submitted_inference_job = hub.submit_inference_job( model=compile_job.get_target_model(), inputs=sample_inputs, device=hub.Device(device), name=model_name, - options=profile_options, + options=profile_options_all, ) + inference_job = cast(hub.client.InferenceJob, submitted_inference_job) # 5. 
Download the model asset to a local file if not skip_downloading: os.makedirs(output_path, exist_ok=True) - target_model = compile_job.get_target_model() + target_model: hub.Model = compile_job.get_target_model() # type: ignore target_model.download(str(output_path / f"{model_name}.tflite")) # 6. Summarize the results from profiling and inference if not skip_summary and not skip_profiling: - assert profile_job.wait().success - profile_data = profile_job.download_profile() + assert profile_job is not None and profile_job.wait().success + profile_data: Dict[str, Any] = profile_job.download_profile() # type: ignore print_profile_metrics_from_job(profile_job, profile_data) if not skip_summary and not skip_inferencing: torch_out = torch_inference(model, sample_inputs) - assert inference_job.wait().success - inference_result = inference_job.download_output_data() + assert inference_job is not None and inference_job.wait().success + inference_result: hub.client.DatasetEntries = inference_job.download_output_data() # type: ignore print_inference_metrics(inference_job, inference_result, torch_out) + if not skip_summary: + print_on_target_demo_cmd(compile_job, Path(__file__).parent.resolve(), device) + return (compile_job, profile_job, inference_job) diff --git a/qai_hub_models/models/real_esrgan_x4plus/info.yaml b/qai_hub_models/models/real_esrgan_x4plus/info.yaml index 3b9bc903..578bb93f 100644 --- a/qai_hub_models/models/real_esrgan_x4plus/info.yaml +++ b/qai_hub_models/models/real_esrgan_x4plus/info.yaml @@ -10,6 +10,7 @@ tags: [] research_paper: https://arxiv.org/abs/2107.10833 research_paper_title: "Real-ESRGAN: Training Real-World Blind Super-Resolution with Pure Synthetic Data" license: https://github.com/xinntao/Real-ESRGAN/blob/master/LICENSE +deploy_license: https://qaihub-public-assets.s3.us-west-2.amazonaws.com/qai-hub-models/Qualcomm+AI+Hub+Proprietary+License.pdf source_repo: https://github.com/xinntao/Real-ESRGAN technical_details: Number of parameters: 16.7M @@ -27,4 +28,5 @@ related_models: ['esrgan', 'real_esrgan_general_x4v3'] has_static_banner: yes has_animated_banner: yes license_type: bsd-3-clause +deploy_license_type: AI Model Hub License dataset: [] diff --git a/qai_hub_models/models/real_esrgan_x4plus/perf.yaml b/qai_hub_models/models/real_esrgan_x4plus/perf.yaml index d059c356..3bc0350c 100644 --- a/qai_hub_models/models/real_esrgan_x4plus/perf.yaml +++ b/qai_hub_models/models/real_esrgan_x4plus/perf.yaml @@ -17,37 +17,48 @@ aggregated: - Samsung Galaxy S23 - Samsung Galaxy S23 Ultra - Samsung Galaxy S23+ + - Samsung Galaxy S24 + - Samsung Galaxy S24 Ultra - Samsung Galaxy Tab S8 - Xiaomi 12 - Xiaomi 12 Pro supported_chipsets: - Snapdragon® 8 Gen 1 - Snapdragon® 8 Gen 2 + - Snapdragon® 8 Gen 3 - Snapdragon® 888 models: - name: Real-ESRGAN-x4plus performance_metrics: - torchscript_onnx_tflite: - inference_time: 69426.0 - throughput: 14.40382565609426 + inference_time: 'null' + throughput: 'null' estimated_peak_memory_range: - min: 3272704 - max: 6458720 - primary_compute_unit: NPU - precision: fp16 + min: 0 + max: 0 + primary_compute_unit: 'null' + precision: 'null' layer_info: - layers_on_npu: 1028 + layers_on_npu: 0 layers_on_gpu: 0 layers_on_cpu: 0 - total_layers: 1028 - job_id: jygzl8665 - job_status: Passed + total_layers: 0 + job_id: '' + job_status: Skipped + reference_device_info: + name: Samsung Galaxy S23 + os: '13' + form_factor: Phone + os_name: Android + manufacturer: Samsung + chipset: Snapdragon® 8 Gen 2 + timestamp: '2024-03-14T23:12:52.767646Z' 
torchscript_onnx_qnn: - inference_time: 67244.0 - throughput: 14.87121527571233 + inference_time: 66635.0 + throughput: 15.007128385983343 estimated_peak_memory_range: - min: 102400 - max: 106071688 + min: 94208 + max: 104137800 primary_compute_unit: NPU precision: fp16 layer_info: @@ -55,13 +66,43 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 1031 - job_id: jygzljxz5 + job_id: jz57zzlp3 job_status: Passed + - torchscript_onnx_tflite: + inference_time: 'null' + throughput: 'null' + estimated_peak_memory_range: + min: 0 + max: 0 + primary_compute_unit: 'null' + precision: 'null' + layer_info: + layers_on_npu: 0 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 0 + job_id: '' + job_status: Skipped reference_device_info: - name: Samsung Galaxy S23 Ultra - os: '13' + name: Samsung Galaxy S24 + os: '14' form_factor: Phone os_name: Android manufacturer: Samsung - chipset: Snapdragon® 8 Gen 2 - timestamp: '2024-02-22T18:50:48.142201Z' + chipset: Snapdragon® 8 Gen 3 + timestamp: '2024-03-14T23:16:11.670851Z' + torchscript_onnx_qnn: + inference_time: 50978.0 + throughput: 19.61630507277649 + estimated_peak_memory_range: + min: 90112 + max: 248878432 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 1031 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 1031 + job_id: jqp4qqvgo + job_status: Passed diff --git a/qai_hub_models/models/real_esrgan_x4plus/requirements.txt b/qai_hub_models/models/real_esrgan_x4plus/requirements.txt index 6292b978..00e6cc01 100644 --- a/qai_hub_models/models/real_esrgan_x4plus/requirements.txt +++ b/qai_hub_models/models/real_esrgan_x4plus/requirements.txt @@ -1,4 +1,3 @@ -opencv-python -scipy -seaborn -basicsr +scipy==1.8.1 +seaborn==0.11.0 +basicsr==1.4.2 diff --git a/qai_hub_models/models/real_esrgan_x4plus/test.py b/qai_hub_models/models/real_esrgan_x4plus/test.py index 905f473c..e473252b 100644 --- a/qai_hub_models/models/real_esrgan_x4plus/test.py +++ b/qai_hub_models/models/real_esrgan_x4plus/test.py @@ -35,5 +35,6 @@ def test_task(): ) +@skip_clone_repo_check def test_demo(): demo_main(is_test=True) diff --git a/qai_hub_models/models/regnet/README.md b/qai_hub_models/models/regnet/README.md index a123a009..9448b7b8 100644 --- a/qai_hub_models/models/regnet/README.md +++ b/qai_hub_models/models/regnet/README.md @@ -10,7 +10,7 @@ This is based on the implementation of RegNet found export suitable to run on Qualcomm® devices. More details on model performance accross various devices, can be found [here](https://aihub.qualcomm.com/models/regnet). -[Sign up](https://aihub.qualcomm.com/) for early access to run these models on +[Sign up](https://myaccount.qualcomm.com/signup) for early access to run these models on a hosted Qualcomm® device. @@ -25,7 +25,7 @@ python -m qai_hub_models.models.regnet.demo More details on the CLI tool can be found with the `--help` option. See [demo.py](demo.py) for sample usage of the model including pre/post processing scripts. Please refer to our [general instructions on using -models](../../#qai-hub-models) for more usage instructions. +models](../../../#getting-started) for more usage instructions. ## Export for on-device deployment @@ -41,7 +41,7 @@ script requires access to Deployment instructions for Qualcomm® AI Hub. ## License - The license for the original implementation of RegNet can be found [here](https://github.com/pytorch/vision/blob/main/LICENSE). 
-- The license for the compiled assets for on-device deployment can be found [here](https://qaihub-public-assets.s3.us-west-2.amazonaws.com/qai-hub-models/Qualcomm+AI+Hub+Proprietary+License.pdf). +- The license for the compiled assets for on-device deployment can be found [here]({deploy_license_url}) ## References * [Designing Network Design Spaces](https://arxiv.org/abs/2003.13678) diff --git a/qai_hub_models/models/regnet/conftest.py b/qai_hub_models/models/regnet/conftest.py new file mode 100644 index 00000000..27aafbbe --- /dev/null +++ b/qai_hub_models/models/regnet/conftest.py @@ -0,0 +1,24 @@ +# --------------------------------------------------------------------- +# Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. +# SPDX-License-Identifier: BSD-3-Clause +# --------------------------------------------------------------------- +# THIS FILE WAS AUTO-GENERATED. DO NOT EDIT MANUALLY. + +from unittest.mock import patch + +import pytest + +from qai_hub_models.models.regnet import Model + + +@pytest.fixture(autouse=True) +def mock_from_pretrained(): + """ + Model.from_pretrained() can be slow. Invoke it once and cache it so all invocations + across all tests return the cached instance of the model. + """ + mock = patch( + "qai_hub_models.models.regnet.Model.from_pretrained", + return_value=Model.from_pretrained(), + ) + mock.start() diff --git a/qai_hub_models/models/regnet/demo.py b/qai_hub_models/models/regnet/demo.py index 3fe6310f..b4fb48c7 100644 --- a/qai_hub_models/models/regnet/demo.py +++ b/qai_hub_models/models/regnet/demo.py @@ -3,11 +3,11 @@ # SPDX-License-Identifier: BSD-3-Clause # --------------------------------------------------------------------- from qai_hub_models.models._shared.imagenet_classifier.demo import imagenet_demo -from qai_hub_models.models.regnet.model import RegNet +from qai_hub_models.models.regnet.model import MODEL_ID, RegNet def main(is_test: bool = False): - imagenet_demo(RegNet, is_test) + imagenet_demo(RegNet, MODEL_ID, is_test) if __name__ == "__main__": diff --git a/qai_hub_models/models/regnet/export.py b/qai_hub_models/models/regnet/export.py index ff916e30..2d4e54ce 100644 --- a/qai_hub_models/models/regnet/export.py +++ b/qai_hub_models/models/regnet/export.py @@ -10,7 +10,7 @@ import os import warnings from pathlib import Path -from typing import List, Optional, Tuple +from typing import Any, Dict, List, Optional, Tuple, cast import qai_hub as hub import torch @@ -109,35 +109,43 @@ def export_model( ) # Trace the model - source_model = torch.jit.trace(model, make_torch_inputs(input_spec)) + source_model = torch.jit.trace(model.to("cpu"), make_torch_inputs(input_spec)) # 2. Compile the model to an on-device asset model_compile_options = model.get_hub_compile_options( target_runtime, compile_options + " --force_channel_last_input image_tensor" ) - print(f"Optimizing model {model_name} to run on-device.") - compile_job = hub.submit_compile_job( + print(f"Optimizing model {model_name} to run on-device") + submitted_compile_job = hub.submit_compile_job( model=source_model, input_specs=input_spec, device=hub.Device(device), name=model_name, options=model_compile_options, ) + compile_job = cast(hub.client.CompileJob, submitted_compile_job) # 3. 
Profile the model asset on real devices - profile_job = None + profile_job: Optional[hub.client.ProfileJob] = None if not skip_profiling: + profile_options_all = model.get_hub_profile_options( + target_runtime, profile_options + ) print(f"Profiling model {model_name} on a hosted device.") - profile_job = hub.submit_profile_job( + submitted_profile_job = hub.submit_profile_job( model=compile_job.get_target_model(), device=hub.Device(device), name=model_name, - options=profile_options, + options=profile_options_all, ) + profile_job = cast(hub.client.ProfileJob, submitted_profile_job) # 4. Run inference on-device with sample inputs - inference_job = None + inference_job: Optional[hub.client.InferenceJob] = None if not skip_inferencing: + profile_options_all = model.get_hub_profile_options( + target_runtime, profile_options + ) print( f"Running inference for {model_name} on a hosted device with example inputs." ) @@ -146,33 +154,35 @@ def export_model( hub_inputs = transpose_channel_first_to_last( "image_tensor", sample_inputs, target_runtime ) - inference_job = hub.submit_inference_job( + submitted_inference_job = hub.submit_inference_job( model=compile_job.get_target_model(), inputs=hub_inputs, device=hub.Device(device), name=model_name, - options=profile_options, + options=profile_options_all, ) + inference_job = cast(hub.client.InferenceJob, submitted_inference_job) # 5. Download the model asset to a local file if not skip_downloading: os.makedirs(output_path, exist_ok=True) - target_model = compile_job.get_target_model() + target_model: hub.Model = compile_job.get_target_model() # type: ignore target_model.download(str(output_path / f"{model_name}.tflite")) # 6. Summarize the results from profiling and inference if not skip_summary and not skip_profiling: - assert profile_job.wait().success - profile_data = profile_job.download_profile() + assert profile_job is not None and profile_job.wait().success + profile_data: Dict[str, Any] = profile_job.download_profile() # type: ignore print_profile_metrics_from_job(profile_job, profile_data) if not skip_summary and not skip_inferencing: torch_out = torch_inference(model, sample_inputs) - assert inference_job.wait().success - inference_result = inference_job.download_output_data() + assert inference_job is not None and inference_job.wait().success + inference_result: hub.client.DatasetEntries = inference_job.download_output_data() # type: ignore print_inference_metrics(inference_job, inference_result, torch_out) - print_on_target_demo_cmd(compile_job, Path(__file__).parent.resolve(), device) + if not skip_summary: + print_on_target_demo_cmd(compile_job, Path(__file__).parent.resolve(), device) return (compile_job, profile_job, inference_job) diff --git a/qai_hub_models/models/regnet/info.yaml b/qai_hub_models/models/regnet/info.yaml index 457a281f..863f909d 100644 --- a/qai_hub_models/models/regnet/info.yaml +++ b/qai_hub_models/models/regnet/info.yaml @@ -13,6 +13,7 @@ tags: research_paper: https://arxiv.org/abs/2003.13678 research_paper_title: Designing Network Design Spaces license: https://github.com/pytorch/vision/blob/main/LICENSE +deploy_license: https://qaihub-public-assets.s3.us-west-2.amazonaws.com/qai-hub-models/Qualcomm+AI+Hub+Proprietary+License.pdf source_repo: https://github.com/pytorch/vision/blob/main/torchvision/models/regnet.py technical_details: Model checkpoint: Imagenet @@ -35,6 +36,7 @@ form_factors: has_static_banner: yes has_animated_banner: yes license_type: bsd-3-clause +deploy_license_type: AI Model Hub License 
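The README changes throughout this patch replace the hard-coded deploy-license URL with a `{deploy_license_url}` placeholder, while each `info.yaml` gains the `deploy_license` / `deploy_license_type` fields added above. A minimal sketch of how such a placeholder could be filled from the YAML using PyYAML — the actual README-generation code is not part of this diff, so the helper below is purely illustrative:

```python
# Illustrative only: fills the {deploy_license_url} placeholder used in the README
# templates from the deploy_license field added to info.yaml in this patch.
import yaml


def render_readme(readme_template_path: str, info_yaml_path: str) -> str:
    with open(info_yaml_path) as f:
        info = yaml.safe_load(f)
    with open(readme_template_path) as f:
        template = f.read()
    # deploy_license is the URL added to info.yaml in this patch.
    return template.replace("{deploy_license_url}", info["deploy_license"])
```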
dataset: - imagenet-1k - imagenet-22k diff --git a/qai_hub_models/models/regnet/perf.yaml b/qai_hub_models/models/regnet/perf.yaml index 896bd9cb..f5814c03 100644 --- a/qai_hub_models/models/regnet/perf.yaml +++ b/qai_hub_models/models/regnet/perf.yaml @@ -17,22 +17,25 @@ aggregated: - Samsung Galaxy S23 - Samsung Galaxy S23 Ultra - Samsung Galaxy S23+ + - Samsung Galaxy S24 + - Samsung Galaxy S24 Ultra - Samsung Galaxy Tab S8 - Xiaomi 12 - Xiaomi 12 Pro supported_chipsets: - Snapdragon® 8 Gen 1 - Snapdragon® 8 Gen 2 + - Snapdragon® 8 Gen 3 - Snapdragon® 888 models: - name: RegNet performance_metrics: - torchscript_onnx_tflite: - inference_time: 1921.0 - throughput: 520.5622071837585 + inference_time: 1974.0 + throughput: 506.5856129685917 estimated_peak_memory_range: - min: 16384 - max: 1931624 + min: 32768 + max: 1789416 primary_compute_unit: NPU precision: fp16 layer_info: @@ -40,14 +43,22 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 112 - job_id: jogk2q8og + job_id: jqpyey4gy job_status: Passed + reference_device_info: + name: Samsung Galaxy S23 + os: '13' + form_factor: Phone + os_name: Android + manufacturer: Samsung + chipset: Snapdragon® 8 Gen 2 + timestamp: '2024-03-15T00:08:16.252038Z' torchscript_onnx_qnn: - inference_time: 1659.0 - throughput: 602.7727546714889 + inference_time: 1675.0 + throughput: 597.0149253731344 estimated_peak_memory_range: - min: 237568 - max: 59498896 + min: 241664 + max: 59486296 primary_compute_unit: NPU precision: fp16 layer_info: @@ -55,13 +66,43 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 187 - job_id: jn5qlrvmp + job_id: j1p8ok8g9 + job_status: Passed + - torchscript_onnx_tflite: + inference_time: 1359.0 + throughput: 735.8351729212657 + estimated_peak_memory_range: + min: 16384 + max: 131931280 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 112 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 112 + job_id: j2p0yxegw job_status: Passed reference_device_info: - name: Samsung Galaxy S23 Ultra - os: '13' + name: Samsung Galaxy S24 + os: '14' form_factor: Phone os_name: Android manufacturer: Samsung - chipset: Snapdragon® 8 Gen 2 - timestamp: '2024-02-21T16:36:39.546315Z' + chipset: Snapdragon® 8 Gen 3 + timestamp: '2024-03-15T00:11:23.127753Z' + torchscript_onnx_qnn: + inference_time: 1197.0 + throughput: 835.421888053467 + estimated_peak_memory_range: + min: 618496 + max: 68520544 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 187 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 187 + job_id: jogkzkogd + job_status: Passed diff --git a/qai_hub_models/models/regnet/test.py b/qai_hub_models/models/regnet/test.py index f6ebd563..fdb34aaf 100644 --- a/qai_hub_models/models/regnet/test.py +++ b/qai_hub_models/models/regnet/test.py @@ -2,6 +2,8 @@ # Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. 
# SPDX-License-Identifier: BSD-3-Clause # --------------------------------------------------------------------- +import pytest + from qai_hub_models.models._shared.imagenet_classifier.test_utils import ( run_imagenet_classifier_test, run_imagenet_classifier_trace_test, @@ -21,6 +23,7 @@ def test_task(): ) +@pytest.mark.trace def test_trace(): run_imagenet_classifier_trace_test(RegNet.from_pretrained()) diff --git a/qai_hub_models/models/resnet101/README.md b/qai_hub_models/models/resnet101/README.md index 88189d34..415f99bb 100644 --- a/qai_hub_models/models/resnet101/README.md +++ b/qai_hub_models/models/resnet101/README.md @@ -10,7 +10,7 @@ This is based on the implementation of ResNet101 found export suitable to run on Qualcomm® devices. More details on model performance accross various devices, can be found [here](https://aihub.qualcomm.com/models/resnet101). -[Sign up](https://aihub.qualcomm.com/) for early access to run these models on +[Sign up](https://myaccount.qualcomm.com/signup) for early access to run these models on a hosted Qualcomm® device. @@ -25,7 +25,7 @@ python -m qai_hub_models.models.resnet101.demo More details on the CLI tool can be found with the `--help` option. See [demo.py](demo.py) for sample usage of the model including pre/post processing scripts. Please refer to our [general instructions on using -models](../../#qai-hub-models) for more usage instructions. +models](../../../#getting-started) for more usage instructions. ## Export for on-device deployment @@ -41,7 +41,7 @@ script requires access to Deployment instructions for Qualcomm® AI Hub. ## License - The license for the original implementation of ResNet101 can be found [here](https://github.com/pytorch/vision/blob/main/LICENSE). -- The license for the compiled assets for on-device deployment can be found [here](https://qaihub-public-assets.s3.us-west-2.amazonaws.com/qai-hub-models/Qualcomm+AI+Hub+Proprietary+License.pdf). +- The license for the compiled assets for on-device deployment can be found [here]({deploy_license_url}) ## References * [Deep Residual Learning for Image Recognition](https://arxiv.org/abs/1512.03385) diff --git a/qai_hub_models/models/resnet101/conftest.py b/qai_hub_models/models/resnet101/conftest.py new file mode 100644 index 00000000..b2281868 --- /dev/null +++ b/qai_hub_models/models/resnet101/conftest.py @@ -0,0 +1,24 @@ +# --------------------------------------------------------------------- +# Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. +# SPDX-License-Identifier: BSD-3-Clause +# --------------------------------------------------------------------- +# THIS FILE WAS AUTO-GENERATED. DO NOT EDIT MANUALLY. + +from unittest.mock import patch + +import pytest + +from qai_hub_models.models.resnet101 import Model + + +@pytest.fixture(autouse=True) +def mock_from_pretrained(): + """ + Model.from_pretrained() can be slow. Invoke it once and cache it so all invocations + across all tests return the cached instance of the model. 
+ """ + mock = patch( + "qai_hub_models.models.resnet101.Model.from_pretrained", + return_value=Model.from_pretrained(), + ) + mock.start() diff --git a/qai_hub_models/models/resnet101/demo.py b/qai_hub_models/models/resnet101/demo.py index 1b1a3524..9c59e469 100644 --- a/qai_hub_models/models/resnet101/demo.py +++ b/qai_hub_models/models/resnet101/demo.py @@ -3,11 +3,11 @@ # SPDX-License-Identifier: BSD-3-Clause # --------------------------------------------------------------------- from qai_hub_models.models._shared.imagenet_classifier.demo import imagenet_demo -from qai_hub_models.models.resnet101.model import ResNet101 +from qai_hub_models.models.resnet101.model import MODEL_ID, ResNet101 def main(is_test: bool = False): - imagenet_demo(ResNet101, is_test) + imagenet_demo(ResNet101, MODEL_ID, is_test) if __name__ == "__main__": diff --git a/qai_hub_models/models/resnet101/export.py b/qai_hub_models/models/resnet101/export.py index d9c41ba1..92c2ea4b 100644 --- a/qai_hub_models/models/resnet101/export.py +++ b/qai_hub_models/models/resnet101/export.py @@ -10,7 +10,7 @@ import os import warnings from pathlib import Path -from typing import List, Optional, Tuple +from typing import Any, Dict, List, Optional, Tuple, cast import qai_hub as hub import torch @@ -109,35 +109,43 @@ def export_model( ) # Trace the model - source_model = torch.jit.trace(model, make_torch_inputs(input_spec)) + source_model = torch.jit.trace(model.to("cpu"), make_torch_inputs(input_spec)) # 2. Compile the model to an on-device asset model_compile_options = model.get_hub_compile_options( target_runtime, compile_options + " --force_channel_last_input image_tensor" ) - print(f"Optimizing model {model_name} to run on-device.") - compile_job = hub.submit_compile_job( + print(f"Optimizing model {model_name} to run on-device") + submitted_compile_job = hub.submit_compile_job( model=source_model, input_specs=input_spec, device=hub.Device(device), name=model_name, options=model_compile_options, ) + compile_job = cast(hub.client.CompileJob, submitted_compile_job) # 3. Profile the model asset on real devices - profile_job = None + profile_job: Optional[hub.client.ProfileJob] = None if not skip_profiling: + profile_options_all = model.get_hub_profile_options( + target_runtime, profile_options + ) print(f"Profiling model {model_name} on a hosted device.") - profile_job = hub.submit_profile_job( + submitted_profile_job = hub.submit_profile_job( model=compile_job.get_target_model(), device=hub.Device(device), name=model_name, - options=profile_options, + options=profile_options_all, ) + profile_job = cast(hub.client.ProfileJob, submitted_profile_job) # 4. Run inference on-device with sample inputs - inference_job = None + inference_job: Optional[hub.client.InferenceJob] = None if not skip_inferencing: + profile_options_all = model.get_hub_profile_options( + target_runtime, profile_options + ) print( f"Running inference for {model_name} on a hosted device with example inputs." ) @@ -146,33 +154,35 @@ def export_model( hub_inputs = transpose_channel_first_to_last( "image_tensor", sample_inputs, target_runtime ) - inference_job = hub.submit_inference_job( + submitted_inference_job = hub.submit_inference_job( model=compile_job.get_target_model(), inputs=hub_inputs, device=hub.Device(device), name=model_name, - options=profile_options, + options=profile_options_all, ) + inference_job = cast(hub.client.InferenceJob, submitted_inference_job) # 5. 
Download the model asset to a local file if not skip_downloading: os.makedirs(output_path, exist_ok=True) - target_model = compile_job.get_target_model() + target_model: hub.Model = compile_job.get_target_model() # type: ignore target_model.download(str(output_path / f"{model_name}.tflite")) # 6. Summarize the results from profiling and inference if not skip_summary and not skip_profiling: - assert profile_job.wait().success - profile_data = profile_job.download_profile() + assert profile_job is not None and profile_job.wait().success + profile_data: Dict[str, Any] = profile_job.download_profile() # type: ignore print_profile_metrics_from_job(profile_job, profile_data) if not skip_summary and not skip_inferencing: torch_out = torch_inference(model, sample_inputs) - assert inference_job.wait().success - inference_result = inference_job.download_output_data() + assert inference_job is not None and inference_job.wait().success + inference_result: hub.client.DatasetEntries = inference_job.download_output_data() # type: ignore print_inference_metrics(inference_job, inference_result, torch_out) - print_on_target_demo_cmd(compile_job, Path(__file__).parent.resolve(), device) + if not skip_summary: + print_on_target_demo_cmd(compile_job, Path(__file__).parent.resolve(), device) return (compile_job, profile_job, inference_job) diff --git a/qai_hub_models/models/resnet101/info.yaml b/qai_hub_models/models/resnet101/info.yaml index 7d37336b..f1410fb4 100644 --- a/qai_hub_models/models/resnet101/info.yaml +++ b/qai_hub_models/models/resnet101/info.yaml @@ -13,6 +13,7 @@ tags: research_paper: https://arxiv.org/abs/1512.03385 research_paper_title: Deep Residual Learning for Image Recognition license: https://github.com/pytorch/vision/blob/main/LICENSE +deploy_license: https://qaihub-public-assets.s3.us-west-2.amazonaws.com/qai-hub-models/Qualcomm+AI+Hub+Proprietary+License.pdf source_repo: https://github.com/pytorch/vision/blob/main/torchvision/models/resnet.py technical_details: Model checkpoint: Imagenet @@ -35,6 +36,7 @@ form_factors: has_static_banner: yes has_animated_banner: yes license_type: bsd-3-clause +deploy_license_type: AI Model Hub License dataset: - imagenet-1k - imagenet-22k diff --git a/qai_hub_models/models/resnet101/perf.yaml b/qai_hub_models/models/resnet101/perf.yaml index 3baa5107..cd12a3b8 100644 --- a/qai_hub_models/models/resnet101/perf.yaml +++ b/qai_hub_models/models/resnet101/perf.yaml @@ -17,22 +17,25 @@ aggregated: - Samsung Galaxy S23 - Samsung Galaxy S23 Ultra - Samsung Galaxy S23+ + - Samsung Galaxy S24 + - Samsung Galaxy S24 Ultra - Samsung Galaxy Tab S8 - Xiaomi 12 - Xiaomi 12 Pro supported_chipsets: - Snapdragon® 8 Gen 1 - Snapdragon® 8 Gen 2 + - Snapdragon® 8 Gen 3 - Snapdragon® 888 models: - name: ResNet101 performance_metrics: - torchscript_onnx_tflite: - inference_time: 3008.0 - throughput: 332.4468085106383 + inference_time: 2993.0 + throughput: 334.1129301703976 estimated_peak_memory_range: min: 28672 - max: 1505496 + max: 1903408 primary_compute_unit: NPU precision: fp16 layer_info: @@ -40,14 +43,22 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 145 - job_id: jnp1nw6lg + job_id: j7gjxmxpd job_status: Passed + reference_device_info: + name: Samsung Galaxy S23 + os: '13' + form_factor: Phone + os_name: Android + manufacturer: Samsung + chipset: Snapdragon® 8 Gen 2 + timestamp: '2024-03-14T23:51:59.720577Z' torchscript_onnx_qnn: - inference_time: 2895.0 - throughput: 345.4231433506045 + inference_time: 2921.0 + throughput: 342.3485107839781 
estimated_peak_memory_range: min: 622592 - max: 226606408 + max: 226849752 primary_compute_unit: NPU precision: fp16 layer_info: @@ -55,13 +66,43 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 244 - job_id: jvgddq2lg + job_id: jygze9kg8 + job_status: Passed + - torchscript_onnx_tflite: + inference_time: 2221.0 + throughput: 450.24763619990995 + estimated_peak_memory_range: + min: 16384 + max: 103000720 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 145 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 145 + job_id: jlpe911gr job_status: Passed reference_device_info: - name: Samsung Galaxy S23 Ultra - os: '13' + name: Samsung Galaxy S24 + os: '14' form_factor: Phone os_name: Android manufacturer: Samsung - chipset: Snapdragon® 8 Gen 2 - timestamp: '2024-02-21T16:20:33.212112Z' + chipset: Snapdragon® 8 Gen 3 + timestamp: '2024-03-14T23:54:07.329383Z' + torchscript_onnx_qnn: + inference_time: 2126.0 + throughput: 470.36688617121354 + estimated_peak_memory_range: + min: 618496 + max: 71779728 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 244 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 244 + job_id: jz5won6p1 + job_status: Passed diff --git a/qai_hub_models/models/resnet101/test.py b/qai_hub_models/models/resnet101/test.py index f27696cd..ac498cf1 100644 --- a/qai_hub_models/models/resnet101/test.py +++ b/qai_hub_models/models/resnet101/test.py @@ -2,6 +2,8 @@ # Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. # SPDX-License-Identifier: BSD-3-Clause # --------------------------------------------------------------------- +import pytest + from qai_hub_models.models._shared.imagenet_classifier.test_utils import ( run_imagenet_classifier_test, run_imagenet_classifier_trace_test, @@ -21,6 +23,7 @@ def test_task(): ) +@pytest.mark.trace def test_trace(): run_imagenet_classifier_trace_test(ResNet101.from_pretrained()) diff --git a/qai_hub_models/models/resnet101_quantized/README.md b/qai_hub_models/models/resnet101_quantized/README.md index 64886381..822c7f98 100644 --- a/qai_hub_models/models/resnet101_quantized/README.md +++ b/qai_hub_models/models/resnet101_quantized/README.md @@ -10,7 +10,7 @@ This is based on the implementation of ResNet101Quantized found export suitable to run on Qualcomm® devices. More details on model performance accross various devices, can be found [here](https://aihub.qualcomm.com/models/resnet101_quantized). -[Sign up](https://aihub.qualcomm.com/) for early access to run these models on +[Sign up](https://myaccount.qualcomm.com/signup) for early access to run these models on a hosted Qualcomm® device. @@ -25,7 +25,7 @@ python -m qai_hub_models.models.resnet101_quantized.demo More details on the CLI tool can be found with the `--help` option. See [demo.py](demo.py) for sample usage of the model including pre/post processing scripts. Please refer to our [general instructions on using -models](../../#qai-hub-models) for more usage instructions. +models](../../../#getting-started) for more usage instructions. ## Export for on-device deployment @@ -41,7 +41,7 @@ script requires access to Deployment instructions for Qualcomm® AI Hub. ## License - The license for the original implementation of ResNet101Quantized can be found [here](https://github.com/pytorch/vision/blob/main/LICENSE). 
-- The license for the compiled assets for on-device deployment can be found [here](https://qaihub-public-assets.s3.us-west-2.amazonaws.com/qai-hub-models/Qualcomm+AI+Hub+Proprietary+License.pdf). +- The license for the compiled assets for on-device deployment can be found [here]({deploy_license_url}) ## References * [Deep Residual Learning for Image Recognition](https://arxiv.org/abs/1512.03385) diff --git a/qai_hub_models/models/resnet101_quantized/conftest.py b/qai_hub_models/models/resnet101_quantized/conftest.py new file mode 100644 index 00000000..f7bf84da --- /dev/null +++ b/qai_hub_models/models/resnet101_quantized/conftest.py @@ -0,0 +1,24 @@ +# --------------------------------------------------------------------- +# Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. +# SPDX-License-Identifier: BSD-3-Clause +# --------------------------------------------------------------------- +# THIS FILE WAS AUTO-GENERATED. DO NOT EDIT MANUALLY. + +from unittest.mock import patch + +import pytest + +from qai_hub_models.models.resnet101_quantized import Model + + +@pytest.fixture(autouse=True) +def mock_from_pretrained(): + """ + Model.from_pretrained() can be slow. Invoke it once and cache it so all invocations + across all tests return the cached instance of the model. + """ + mock = patch( + "qai_hub_models.models.resnet101_quantized.Model.from_pretrained", + return_value=Model.from_pretrained(), + ) + mock.start() diff --git a/qai_hub_models/models/resnet101_quantized/demo.py b/qai_hub_models/models/resnet101_quantized/demo.py index a3c5dff7..4f7f5032 100644 --- a/qai_hub_models/models/resnet101_quantized/demo.py +++ b/qai_hub_models/models/resnet101_quantized/demo.py @@ -3,11 +3,14 @@ # SPDX-License-Identifier: BSD-3-Clause # --------------------------------------------------------------------- from qai_hub_models.models._shared.imagenet_classifier.demo import imagenet_demo -from qai_hub_models.models.resnet101_quantized.model import ResNet101Quantizable +from qai_hub_models.models.resnet101_quantized.model import ( + MODEL_ID, + ResNet101Quantizable, +) def main(is_test: bool = False): - imagenet_demo(ResNet101Quantizable, is_test) + imagenet_demo(ResNet101Quantizable, MODEL_ID, is_test) if __name__ == "__main__": diff --git a/qai_hub_models/models/resnet101_quantized/export.py b/qai_hub_models/models/resnet101_quantized/export.py index 01fb1b0c..774ea807 100644 --- a/qai_hub_models/models/resnet101_quantized/export.py +++ b/qai_hub_models/models/resnet101_quantized/export.py @@ -10,7 +10,7 @@ import os import warnings from pathlib import Path -from typing import List, Optional, Tuple +from typing import Any, Dict, List, Optional, Tuple, cast import qai_hub as hub @@ -120,8 +120,8 @@ def export_model( model_compile_options = model.get_hub_compile_options( target_runtime, compile_options + " --force_channel_last_input image_tensor" ) - print(f"Optimizing model {model_name} to run on-device.") - compile_job = hub.submit_compile_job( + print(f"Optimizing model {model_name} to run on-device") + submitted_compile_job = hub.submit_compile_job( model=source_model, input_specs=input_spec, device=hub.Device(device), @@ -129,21 +129,29 @@ def export_model( calibration_data=quant_calibration_data, options=model_compile_options, ) + compile_job = cast(hub.client.CompileJob, submitted_compile_job) # 3. 
Profile the model asset on real devices - profile_job = None + profile_job: Optional[hub.client.ProfileJob] = None if not skip_profiling: + profile_options_all = model.get_hub_profile_options( + target_runtime, profile_options + ) print(f"Profiling model {model_name} on a hosted device.") - profile_job = hub.submit_profile_job( + submitted_profile_job = hub.submit_profile_job( model=compile_job.get_target_model(), device=hub.Device(device), name=model_name, - options=profile_options, + options=profile_options_all, ) + profile_job = cast(hub.client.ProfileJob, submitted_profile_job) # 4. Run inference on-device with sample inputs - inference_job = None + inference_job: Optional[hub.client.InferenceJob] = None if not skip_inferencing: + profile_options_all = model.get_hub_profile_options( + target_runtime, profile_options + ) print( f"Running inference for {model_name} on a hosted device with example inputs." ) @@ -155,42 +163,44 @@ def export_model( hub_inputs = transpose_channel_first_to_last( "image_tensor", sample_inputs, target_runtime ) - inference_job = hub.submit_inference_job( + submitted_inference_job = hub.submit_inference_job( model=compile_job.get_target_model(), inputs=hub_inputs, device=hub.Device(device), name=model_name, - options=profile_options, + options=profile_options_all, ) + inference_job = cast(hub.client.InferenceJob, submitted_inference_job) # 5. Download the model asset to a local file if not skip_downloading: os.makedirs(output_path, exist_ok=True) - target_model = compile_job.get_target_model() + target_model: hub.Model = compile_job.get_target_model() # type: ignore target_model.download(str(output_path / f"{model_name}.tflite")) # 6. Summarize the results from profiling and inference if not skip_summary and not skip_profiling: - assert profile_job.wait().success - profile_data = profile_job.download_profile() + assert profile_job is not None and profile_job.wait().success + profile_data: Dict[str, Any] = profile_job.download_profile() # type: ignore print_profile_metrics_from_job(profile_job, profile_data) if not skip_summary and not skip_inferencing: torch_out = torch_inference(model, sample_inputs) - assert inference_job.wait().success - inference_result = inference_job.download_output_data() + assert inference_job is not None and inference_job.wait().success + inference_result: hub.client.DatasetEntries = inference_job.download_output_data() # type: ignore print_inference_metrics( inference_job, inference_result, torch_out, metrics="psnr,top1,top5" ) - print_on_target_demo_cmd(compile_job, Path(__file__).parent.resolve(), device) + if not skip_summary: + print_on_target_demo_cmd(compile_job, Path(__file__).parent.resolve(), device) return (compile_job, profile_job, inference_job) def main(): warnings.filterwarnings("ignore") - parser = export_parser(model_cls=Model, supports_qnn=False) + parser = export_parser(model_cls=Model) args = parser.parse_args() export_model(**vars(args)) diff --git a/qai_hub_models/models/resnet101_quantized/info.yaml b/qai_hub_models/models/resnet101_quantized/info.yaml index e9f4491f..e25f53d7 100644 --- a/qai_hub_models/models/resnet101_quantized/info.yaml +++ b/qai_hub_models/models/resnet101_quantized/info.yaml @@ -14,6 +14,7 @@ tags: research_paper: https://arxiv.org/abs/1512.03385 research_paper_title: Deep Residual Learning for Image Recognition license: https://github.com/pytorch/vision/blob/main/LICENSE +deploy_license: 
https://qaihub-public-assets.s3.us-west-2.amazonaws.com/qai-hub-models/Qualcomm+AI+Hub+Proprietary+License.pdf source_repo: https://github.com/pytorch/vision/blob/main/torchvision/models/resnet.py technical_details: Model checkpoint: Imagenet @@ -36,6 +37,7 @@ form_factors: has_static_banner: yes has_animated_banner: yes license_type: bsd-3-clause +deploy_license_type: AI Model Hub License dataset: - imagenet-1k - imagenet-22k diff --git a/qai_hub_models/models/resnet101_quantized/model.py b/qai_hub_models/models/resnet101_quantized/model.py index 3ff4ad4c..bcfb3730 100644 --- a/qai_hub_models/models/resnet101_quantized/model.py +++ b/qai_hub_models/models/resnet101_quantized/model.py @@ -8,7 +8,6 @@ # This verifies aimet is installed, and this must be included first. from qai_hub_models.utils.quantization_aimet import ( AIMETQuantizableMixin, - HubCompileOptionsInt8Mixin, ) # isort: on @@ -18,19 +17,22 @@ equalize_bn_folded_model, fold_all_batch_norms, ) +from aimet_torch.model_preparer import prepare_model from aimet_torch.quantsim import QuantizationSimModel, load_encodings_to_sim from qai_hub_models.models.resnet101.model import ResNet101 -from qai_hub_models.utils.aimet.config_loader import get_per_channel_aimet_config +from qai_hub_models.utils.aimet.config_loader import get_default_aimet_config from qai_hub_models.utils.asset_loaders import CachedWebModelAsset +from qai_hub_models.utils.base_model import SourceModelFormat, TargetRuntime MODEL_ID = __name__.split(".")[-2] -MODEL_ASSET_VERSION = 3 +MODEL_ASSET_VERSION = 4 DEFAULT_ENCODINGS = "resnet101_quantized_encodings.json" class ResNet101Quantizable( - HubCompileOptionsInt8Mixin, AIMETQuantizableMixin, ResNet101 + AIMETQuantizableMixin, + ResNet101, ): """ResNet101 with post train quantization support. @@ -43,9 +45,15 @@ def __init__( ) -> None: ResNet101.__init__(self, sim_model.model) AIMETQuantizableMixin.__init__( - self, sim_model, needs_onnx_direct_aimet_export=False + self, + sim_model, ) + def preferred_hub_source_model_format( + self, target_runtime: TargetRuntime + ) -> SourceModelFormat: + return SourceModelFormat.ONNX + @classmethod def from_pretrained( cls, @@ -59,17 +67,18 @@ def from_pretrained( else: Interprets as a filepath and loads the encodings stored there. 
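The `from_pretrained` docstring above describes how the encodings argument is interpreted; a short usage sketch, assuming the package and its AIMET dependencies are installed (the argument name is not shown in this hunk, so only the documented default behavior is exercised):

```python
# Usage sketch for the loader described in the docstring above.
from qai_hub_models.models.resnet101_quantized.model import ResNet101Quantizable

# Default: loads the pre-quantized encodings shipped with MODEL_ASSET_VERSION 4.
model = ResNet101Quantizable.from_pretrained()

# Per the docstring, passing a filepath instead loads encodings stored there,
# and passing None skips loading (used when computing encodings from scratch).
```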
""" model = ResNet101.from_pretrained() - input_shape = model.get_input_spec()["image_tensor"][0] + input_shape = cls.get_input_spec()["image_tensor"][0] + model = prepare_model(model) dummy_input = torch.rand(input_shape) pairs = fold_all_batch_norms(model, input_shape, dummy_input) equalize_bn_folded_model(model, input_shape, pairs, dummy_input) sim = QuantizationSimModel( - model.net, + model, quant_scheme="tf_enhanced", default_param_bw=8, default_output_bw=8, - config_file=get_per_channel_aimet_config(), + config_file=get_default_aimet_config(), dummy_input=torch.rand(input_shape), ) @@ -82,3 +91,11 @@ def from_pretrained( sim.model.eval() return cls(sim) + + def get_hub_compile_options( + self, target_runtime: TargetRuntime, other_compile_options: str = "" + ) -> str: + compile_options = super().get_hub_compile_options( + target_runtime, other_compile_options + ) + return compile_options + " --quantize_full_type int8 --quantize_io" diff --git a/qai_hub_models/models/resnet101_quantized/perf.yaml b/qai_hub_models/models/resnet101_quantized/perf.yaml index b8f30516..f5bd3459 100644 --- a/qai_hub_models/models/resnet101_quantized/perf.yaml +++ b/qai_hub_models/models/resnet101_quantized/perf.yaml @@ -17,51 +17,92 @@ aggregated: - Samsung Galaxy S23 - Samsung Galaxy S23 Ultra - Samsung Galaxy S23+ + - Samsung Galaxy S24 + - Samsung Galaxy S24 Ultra - Samsung Galaxy Tab S8 - Xiaomi 12 - Xiaomi 12 Pro supported_chipsets: - Snapdragon® 8 Gen 1 - Snapdragon® 8 Gen 2 + - Snapdragon® 8 Gen 3 - Snapdragon® 888 models: - name: ResNet101Quantized performance_metrics: - torchscript_onnx_tflite: - inference_time: 74926.0 - throughput: 13.346501881856765 + inference_time: 1122.0 + throughput: 891.2655971479501 estimated_peak_memory_range: - min: 151552 - max: 2762960 + min: 12288 + max: 2141424 primary_compute_unit: NPU - precision: fp16 + precision: int8 layer_info: - layers_on_npu: 149 + layers_on_npu: 146 layers_on_gpu: 0 layers_on_cpu: 0 - total_layers: 149 - job_id: joprl2nep + total_layers: 146 + job_id: jvgdw7z5j job_status: Passed + reference_device_info: + name: Samsung Galaxy S23 + os: '13' + form_factor: Phone + os_name: Android + manufacturer: Samsung + chipset: Snapdragon® 8 Gen 2 + timestamp: '2024-03-15T00:05:31.212967Z' torchscript_onnx_qnn: - inference_time: 'null' - throughput: 'null' + inference_time: 1101.0 + throughput: 908.2652134423251 + estimated_peak_memory_range: + min: 12288 + max: 196790880 + primary_compute_unit: NPU + precision: int8 + layer_info: + layers_on_npu: 144 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 144 + job_id: jvgdw765j + job_status: Passed + - torchscript_onnx_tflite: + inference_time: 839.0 + throughput: 1191.8951132300358 estimated_peak_memory_range: - min: 0 - max: 0 - primary_compute_unit: 'null' - precision: 'null' + min: 12288 + max: 91234848 + primary_compute_unit: NPU + precision: int8 layer_info: - layers_on_npu: 0 + layers_on_npu: 146 layers_on_gpu: 0 layers_on_cpu: 0 - total_layers: 0 - job_id: '' - job_status: Skipped + total_layers: 146 + job_id: jmg9v9m57 + job_status: Passed reference_device_info: - name: Samsung Galaxy S23 Ultra - os: '13' + name: Samsung Galaxy S24 + os: '14' form_factor: Phone os_name: Android manufacturer: Samsung - chipset: Snapdragon® 8 Gen 2 - timestamp: '2024-02-21T16:31:35.238685Z' + chipset: Snapdragon® 8 Gen 3 + timestamp: '2024-03-15T00:07:20.101134Z' + torchscript_onnx_qnn: + inference_time: 830.0 + throughput: 1204.8192771084337 + estimated_peak_memory_range: + min: 167936 + max: 53969312 + 
primary_compute_unit: NPU + precision: int8 + layer_info: + layers_on_npu: 144 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 144 + job_id: jo5mrv7gk + job_status: Passed diff --git a/qai_hub_models/models/resnet101_quantized/test.py b/qai_hub_models/models/resnet101_quantized/test.py index fb9b6b7c..876ebffe 100644 --- a/qai_hub_models/models/resnet101_quantized/test.py +++ b/qai_hub_models/models/resnet101_quantized/test.py @@ -4,7 +4,6 @@ # --------------------------------------------------------------------- from qai_hub_models.models._shared.imagenet_classifier.test_utils import ( run_imagenet_classifier_test, - run_imagenet_classifier_trace_test, ) from qai_hub_models.models.resnet101_quantized.demo import main as demo_main from qai_hub_models.models.resnet101_quantized.model import ( @@ -26,16 +25,6 @@ def test_task(): ) -def test_trace(): - run_imagenet_classifier_trace_test( - ResNet101Quantizable.from_pretrained(), - is_quantized=True, - diff_tol=0.005, - rtol=0.02, - atol=0.2, - ) - - def test_demo(): # Verify demo does not crash demo_main(is_test=True) diff --git a/qai_hub_models/models/resnet18/README.md b/qai_hub_models/models/resnet18/README.md index d9466a14..0be3c986 100644 --- a/qai_hub_models/models/resnet18/README.md +++ b/qai_hub_models/models/resnet18/README.md @@ -10,7 +10,7 @@ This is based on the implementation of ResNet18 found export suitable to run on Qualcomm® devices. More details on model performance accross various devices, can be found [here](https://aihub.qualcomm.com/models/resnet18). -[Sign up](https://aihub.qualcomm.com/) for early access to run these models on +[Sign up](https://myaccount.qualcomm.com/signup) for early access to run these models on a hosted Qualcomm® device. @@ -25,7 +25,7 @@ python -m qai_hub_models.models.resnet18.demo More details on the CLI tool can be found with the `--help` option. See [demo.py](demo.py) for sample usage of the model including pre/post processing scripts. Please refer to our [general instructions on using -models](../../#qai-hub-models) for more usage instructions. +models](../../../#getting-started) for more usage instructions. ## Export for on-device deployment @@ -41,7 +41,7 @@ script requires access to Deployment instructions for Qualcomm® AI Hub. ## License - The license for the original implementation of ResNet18 can be found [here](https://github.com/pytorch/vision/blob/main/LICENSE). -- The license for the compiled assets for on-device deployment can be found [here](https://qaihub-public-assets.s3.us-west-2.amazonaws.com/qai-hub-models/Qualcomm+AI+Hub+Proprietary+License.pdf). +- The license for the compiled assets for on-device deployment can be found [here]({deploy_license_url}) ## References * [Deep Residual Learning for Image Recognition](https://arxiv.org/abs/1512.03385) diff --git a/qai_hub_models/models/resnet18/conftest.py b/qai_hub_models/models/resnet18/conftest.py new file mode 100644 index 00000000..86a5865d --- /dev/null +++ b/qai_hub_models/models/resnet18/conftest.py @@ -0,0 +1,24 @@ +# --------------------------------------------------------------------- +# Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. +# SPDX-License-Identifier: BSD-3-Clause +# --------------------------------------------------------------------- +# THIS FILE WAS AUTO-GENERATED. DO NOT EDIT MANUALLY. 
+ +from unittest.mock import patch + +import pytest + +from qai_hub_models.models.resnet18 import Model + + +@pytest.fixture(autouse=True) +def mock_from_pretrained(): + """ + Model.from_pretrained() can be slow. Invoke it once and cache it so all invocations + across all tests return the cached instance of the model. + """ + mock = patch( + "qai_hub_models.models.resnet18.Model.from_pretrained", + return_value=Model.from_pretrained(), + ) + mock.start() diff --git a/qai_hub_models/models/resnet18/demo.py b/qai_hub_models/models/resnet18/demo.py index 8d7dcc1b..ccbabc44 100644 --- a/qai_hub_models/models/resnet18/demo.py +++ b/qai_hub_models/models/resnet18/demo.py @@ -3,11 +3,11 @@ # SPDX-License-Identifier: BSD-3-Clause # --------------------------------------------------------------------- from qai_hub_models.models._shared.imagenet_classifier.demo import imagenet_demo -from qai_hub_models.models.resnet18.model import ResNet18 +from qai_hub_models.models.resnet18.model import MODEL_ID, ResNet18 def main(is_test: bool = False): - imagenet_demo(ResNet18, is_test) + imagenet_demo(ResNet18, MODEL_ID, is_test) if __name__ == "__main__": diff --git a/qai_hub_models/models/resnet18/export.py b/qai_hub_models/models/resnet18/export.py index b01f4ead..6dc6e2b2 100644 --- a/qai_hub_models/models/resnet18/export.py +++ b/qai_hub_models/models/resnet18/export.py @@ -10,7 +10,7 @@ import os import warnings from pathlib import Path -from typing import List, Optional, Tuple +from typing import Any, Dict, List, Optional, Tuple, cast import qai_hub as hub import torch @@ -109,35 +109,43 @@ def export_model( ) # Trace the model - source_model = torch.jit.trace(model, make_torch_inputs(input_spec)) + source_model = torch.jit.trace(model.to("cpu"), make_torch_inputs(input_spec)) # 2. Compile the model to an on-device asset model_compile_options = model.get_hub_compile_options( target_runtime, compile_options + " --force_channel_last_input image_tensor" ) - print(f"Optimizing model {model_name} to run on-device.") - compile_job = hub.submit_compile_job( + print(f"Optimizing model {model_name} to run on-device") + submitted_compile_job = hub.submit_compile_job( model=source_model, input_specs=input_spec, device=hub.Device(device), name=model_name, options=model_compile_options, ) + compile_job = cast(hub.client.CompileJob, submitted_compile_job) # 3. Profile the model asset on real devices - profile_job = None + profile_job: Optional[hub.client.ProfileJob] = None if not skip_profiling: + profile_options_all = model.get_hub_profile_options( + target_runtime, profile_options + ) print(f"Profiling model {model_name} on a hosted device.") - profile_job = hub.submit_profile_job( + submitted_profile_job = hub.submit_profile_job( model=compile_job.get_target_model(), device=hub.Device(device), name=model_name, - options=profile_options, + options=profile_options_all, ) + profile_job = cast(hub.client.ProfileJob, submitted_profile_job) # 4. Run inference on-device with sample inputs - inference_job = None + inference_job: Optional[hub.client.InferenceJob] = None if not skip_inferencing: + profile_options_all = model.get_hub_profile_options( + target_runtime, profile_options + ) print( f"Running inference for {model_name} on a hosted device with example inputs." 
) @@ -146,33 +154,35 @@ def export_model( hub_inputs = transpose_channel_first_to_last( "image_tensor", sample_inputs, target_runtime ) - inference_job = hub.submit_inference_job( + submitted_inference_job = hub.submit_inference_job( model=compile_job.get_target_model(), inputs=hub_inputs, device=hub.Device(device), name=model_name, - options=profile_options, + options=profile_options_all, ) + inference_job = cast(hub.client.InferenceJob, submitted_inference_job) # 5. Download the model asset to a local file if not skip_downloading: os.makedirs(output_path, exist_ok=True) - target_model = compile_job.get_target_model() + target_model: hub.Model = compile_job.get_target_model() # type: ignore target_model.download(str(output_path / f"{model_name}.tflite")) # 6. Summarize the results from profiling and inference if not skip_summary and not skip_profiling: - assert profile_job.wait().success - profile_data = profile_job.download_profile() + assert profile_job is not None and profile_job.wait().success + profile_data: Dict[str, Any] = profile_job.download_profile() # type: ignore print_profile_metrics_from_job(profile_job, profile_data) if not skip_summary and not skip_inferencing: torch_out = torch_inference(model, sample_inputs) - assert inference_job.wait().success - inference_result = inference_job.download_output_data() + assert inference_job is not None and inference_job.wait().success + inference_result: hub.client.DatasetEntries = inference_job.download_output_data() # type: ignore print_inference_metrics(inference_job, inference_result, torch_out) - print_on_target_demo_cmd(compile_job, Path(__file__).parent.resolve(), device) + if not skip_summary: + print_on_target_demo_cmd(compile_job, Path(__file__).parent.resolve(), device) return (compile_job, profile_job, inference_job) diff --git a/qai_hub_models/models/resnet18/info.yaml b/qai_hub_models/models/resnet18/info.yaml index 91be8429..8b06179f 100644 --- a/qai_hub_models/models/resnet18/info.yaml +++ b/qai_hub_models/models/resnet18/info.yaml @@ -13,6 +13,7 @@ tags: research_paper: https://arxiv.org/abs/1512.03385 research_paper_title: Deep Residual Learning for Image Recognition license: https://github.com/pytorch/vision/blob/main/LICENSE +deploy_license: https://qaihub-public-assets.s3.us-west-2.amazonaws.com/qai-hub-models/Qualcomm+AI+Hub+Proprietary+License.pdf source_repo: https://github.com/pytorch/vision/blob/main/torchvision/models/resnet.py technical_details: Model checkpoint: Imagenet @@ -35,6 +36,7 @@ form_factors: has_static_banner: yes has_animated_banner: yes license_type: bsd-3-clause +deploy_license_type: AI Model Hub License dataset: - imagenet-1k - imagenet-22k diff --git a/qai_hub_models/models/resnet18/perf.yaml b/qai_hub_models/models/resnet18/perf.yaml index f58a8d78..0bc59fb1 100644 --- a/qai_hub_models/models/resnet18/perf.yaml +++ b/qai_hub_models/models/resnet18/perf.yaml @@ -17,22 +17,25 @@ aggregated: - Samsung Galaxy S23 - Samsung Galaxy S23 Ultra - Samsung Galaxy S23+ + - Samsung Galaxy S24 + - Samsung Galaxy S24 Ultra - Samsung Galaxy Tab S8 - Xiaomi 12 - Xiaomi 12 Pro supported_chipsets: - Snapdragon® 8 Gen 1 - Snapdragon® 8 Gen 2 + - Snapdragon® 8 Gen 3 - Snapdragon® 888 models: - name: ResNet18 performance_metrics: - torchscript_onnx_tflite: - inference_time: 1054.0 - throughput: 948.7666034155598 + inference_time: 1053.0 + throughput: 949.667616334283 estimated_peak_memory_range: - min: 12288 - max: 1722456 + min: 32768 + max: 2028832 primary_compute_unit: NPU precision: fp16 layer_info: @@ -40,14 
+43,22 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 36 - job_id: j1p3z1xx5 + job_id: j2p0y8egw job_status: Passed + reference_device_info: + name: Samsung Galaxy S23 + os: '13' + form_factor: Phone + os_name: Android + manufacturer: Samsung + chipset: Snapdragon® 8 Gen 2 + timestamp: '2024-03-14T23:41:20.341762Z' torchscript_onnx_qnn: - inference_time: 980.0 - throughput: 1020.4081632653061 + inference_time: 989.0 + throughput: 1011.1223458038422 estimated_peak_memory_range: - min: 16384 - max: 84353688 + min: 12288 + max: 84688848 primary_compute_unit: NPU precision: fp16 layer_info: @@ -55,13 +66,43 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 52 - job_id: jwgolno4g + job_id: jogkzwogd + job_status: Passed + - torchscript_onnx_tflite: + inference_time: 771.0 + throughput: 1297.0168612191958 + estimated_peak_memory_range: + min: 12288 + max: 23627952 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 36 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 36 + job_id: j1p8od8g9 job_status: Passed reference_device_info: - name: Samsung Galaxy S23 Ultra - os: '13' + name: Samsung Galaxy S24 + os: '14' form_factor: Phone os_name: Android manufacturer: Samsung - chipset: Snapdragon® 8 Gen 2 - timestamp: '2024-02-21T16:25:13.005640Z' + chipset: Snapdragon® 8 Gen 3 + timestamp: '2024-03-14T23:45:48.504221Z' + torchscript_onnx_qnn: + inference_time: 717.0 + throughput: 1394.700139470014 + estimated_peak_memory_range: + min: 630784 + max: 25268288 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 52 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 52 + job_id: jn5q8xm57 + job_status: Passed diff --git a/qai_hub_models/models/resnet18/test.py b/qai_hub_models/models/resnet18/test.py index 591e93e3..eac1e51e 100644 --- a/qai_hub_models/models/resnet18/test.py +++ b/qai_hub_models/models/resnet18/test.py @@ -2,6 +2,8 @@ # Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. # SPDX-License-Identifier: BSD-3-Clause # --------------------------------------------------------------------- +import pytest + from qai_hub_models.models._shared.imagenet_classifier.test_utils import ( run_imagenet_classifier_test, run_imagenet_classifier_trace_test, @@ -21,6 +23,7 @@ def test_task(): ) +@pytest.mark.trace def test_trace(): run_imagenet_classifier_trace_test(ResNet18.from_pretrained()) diff --git a/qai_hub_models/models/resnet18_quantized/README.md b/qai_hub_models/models/resnet18_quantized/README.md index 4952a434..676945a5 100644 --- a/qai_hub_models/models/resnet18_quantized/README.md +++ b/qai_hub_models/models/resnet18_quantized/README.md @@ -10,7 +10,7 @@ This is based on the implementation of ResNet18Quantized found export suitable to run on Qualcomm® devices. More details on model performance accross various devices, can be found [here](https://aihub.qualcomm.com/models/resnet18_quantized). -[Sign up](https://aihub.qualcomm.com/) for early access to run these models on +[Sign up](https://myaccount.qualcomm.com/signup) for early access to run these models on a hosted Qualcomm® device. @@ -25,7 +25,7 @@ python -m qai_hub_models.models.resnet18_quantized.demo More details on the CLI tool can be found with the `--help` option. See [demo.py](demo.py) for sample usage of the model including pre/post processing scripts. Please refer to our [general instructions on using -models](../../#qai-hub-models) for more usage instructions. +models](../../../#getting-started) for more usage instructions. 
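The export flow updated throughout this patch now returns the submitted jobs with explicit types, and the profile and inference jobs are `None` when the corresponding steps are skipped. A minimal sketch of consuming that return value programmatically — the keyword name below is assumed from the surrounding `export.py` diffs rather than from a documented API:

```python
# Illustrative sketch; the return convention (compile_job, profile_job, inference_job)
# and the job methods used here are taken from the export.py changes in this patch.
from qai_hub_models.models.resnet18.export import export_model

compile_job, profile_job, inference_job = export_model(device="Samsung Galaxy S23")

# profile_job is None when profiling was skipped.
if profile_job is not None and profile_job.wait().success:
    profile_data = profile_job.download_profile()
    print(profile_data)
```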
## Export for on-device deployment @@ -41,7 +41,7 @@ script requires access to Deployment instructions for Qualcomm® AI Hub. ## License - The license for the original implementation of ResNet18Quantized can be found [here](https://github.com/pytorch/vision/blob/main/LICENSE). -- The license for the compiled assets for on-device deployment can be found [here](https://qaihub-public-assets.s3.us-west-2.amazonaws.com/qai-hub-models/Qualcomm+AI+Hub+Proprietary+License.pdf). +- The license for the compiled assets for on-device deployment can be found [here]({deploy_license_url}) ## References * [Deep Residual Learning for Image Recognition](https://arxiv.org/abs/1512.03385) diff --git a/qai_hub_models/models/resnet18_quantized/conftest.py b/qai_hub_models/models/resnet18_quantized/conftest.py new file mode 100644 index 00000000..7afa5897 --- /dev/null +++ b/qai_hub_models/models/resnet18_quantized/conftest.py @@ -0,0 +1,24 @@ +# --------------------------------------------------------------------- +# Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. +# SPDX-License-Identifier: BSD-3-Clause +# --------------------------------------------------------------------- +# THIS FILE WAS AUTO-GENERATED. DO NOT EDIT MANUALLY. + +from unittest.mock import patch + +import pytest + +from qai_hub_models.models.resnet18_quantized import Model + + +@pytest.fixture(autouse=True) +def mock_from_pretrained(): + """ + Model.from_pretrained() can be slow. Invoke it once and cache it so all invocations + across all tests return the cached instance of the model. + """ + mock = patch( + "qai_hub_models.models.resnet18_quantized.Model.from_pretrained", + return_value=Model.from_pretrained(), + ) + mock.start() diff --git a/qai_hub_models/models/resnet18_quantized/demo.py b/qai_hub_models/models/resnet18_quantized/demo.py index bc848d5c..4d9909dc 100644 --- a/qai_hub_models/models/resnet18_quantized/demo.py +++ b/qai_hub_models/models/resnet18_quantized/demo.py @@ -3,11 +3,11 @@ # SPDX-License-Identifier: BSD-3-Clause # --------------------------------------------------------------------- from qai_hub_models.models._shared.imagenet_classifier.demo import imagenet_demo -from qai_hub_models.models.resnet18_quantized.model import ResNet18Quantizable +from qai_hub_models.models.resnet18_quantized.model import MODEL_ID, ResNet18Quantizable def main(is_test: bool = False): - imagenet_demo(ResNet18Quantizable, is_test) + imagenet_demo(ResNet18Quantizable, MODEL_ID, is_test) if __name__ == "__main__": diff --git a/qai_hub_models/models/resnet18_quantized/export.py b/qai_hub_models/models/resnet18_quantized/export.py index 0d8d0e85..d8d39572 100644 --- a/qai_hub_models/models/resnet18_quantized/export.py +++ b/qai_hub_models/models/resnet18_quantized/export.py @@ -10,7 +10,7 @@ import os import warnings from pathlib import Path -from typing import List, Optional, Tuple +from typing import Any, Dict, List, Optional, Tuple, cast import qai_hub as hub @@ -120,8 +120,8 @@ def export_model( model_compile_options = model.get_hub_compile_options( target_runtime, compile_options + " --force_channel_last_input image_tensor" ) - print(f"Optimizing model {model_name} to run on-device.") - compile_job = hub.submit_compile_job( + print(f"Optimizing model {model_name} to run on-device") + submitted_compile_job = hub.submit_compile_job( model=source_model, input_specs=input_spec, device=hub.Device(device), @@ -129,21 +129,29 @@ def export_model( calibration_data=quant_calibration_data, options=model_compile_options, ) + 
compile_job = cast(hub.client.CompileJob, submitted_compile_job) # 3. Profile the model asset on real devices - profile_job = None + profile_job: Optional[hub.client.ProfileJob] = None if not skip_profiling: + profile_options_all = model.get_hub_profile_options( + target_runtime, profile_options + ) print(f"Profiling model {model_name} on a hosted device.") - profile_job = hub.submit_profile_job( + submitted_profile_job = hub.submit_profile_job( model=compile_job.get_target_model(), device=hub.Device(device), name=model_name, - options=profile_options, + options=profile_options_all, ) + profile_job = cast(hub.client.ProfileJob, submitted_profile_job) # 4. Run inference on-device with sample inputs - inference_job = None + inference_job: Optional[hub.client.InferenceJob] = None if not skip_inferencing: + profile_options_all = model.get_hub_profile_options( + target_runtime, profile_options + ) print( f"Running inference for {model_name} on a hosted device with example inputs." ) @@ -155,35 +163,37 @@ def export_model( hub_inputs = transpose_channel_first_to_last( "image_tensor", sample_inputs, target_runtime ) - inference_job = hub.submit_inference_job( + submitted_inference_job = hub.submit_inference_job( model=compile_job.get_target_model(), inputs=hub_inputs, device=hub.Device(device), name=model_name, - options=profile_options, + options=profile_options_all, ) + inference_job = cast(hub.client.InferenceJob, submitted_inference_job) # 5. Download the model asset to a local file if not skip_downloading: os.makedirs(output_path, exist_ok=True) - target_model = compile_job.get_target_model() + target_model: hub.Model = compile_job.get_target_model() # type: ignore target_model.download(str(output_path / f"{model_name}.tflite")) # 6. Summarize the results from profiling and inference if not skip_summary and not skip_profiling: - assert profile_job.wait().success - profile_data = profile_job.download_profile() + assert profile_job is not None and profile_job.wait().success + profile_data: Dict[str, Any] = profile_job.download_profile() # type: ignore print_profile_metrics_from_job(profile_job, profile_data) if not skip_summary and not skip_inferencing: torch_out = torch_inference(model, sample_inputs) - assert inference_job.wait().success - inference_result = inference_job.download_output_data() + assert inference_job is not None and inference_job.wait().success + inference_result: hub.client.DatasetEntries = inference_job.download_output_data() # type: ignore print_inference_metrics( inference_job, inference_result, torch_out, metrics="psnr,top1,top5" ) - print_on_target_demo_cmd(compile_job, Path(__file__).parent.resolve(), device) + if not skip_summary: + print_on_target_demo_cmd(compile_job, Path(__file__).parent.resolve(), device) return (compile_job, profile_job, inference_job) diff --git a/qai_hub_models/models/resnet18_quantized/info.yaml b/qai_hub_models/models/resnet18_quantized/info.yaml index 20fcdfc5..f132fa42 100644 --- a/qai_hub_models/models/resnet18_quantized/info.yaml +++ b/qai_hub_models/models/resnet18_quantized/info.yaml @@ -14,6 +14,7 @@ tags: research_paper: https://arxiv.org/abs/1512.03385 research_paper_title: Deep Residual Learning for Image Recognition license: https://github.com/pytorch/vision/blob/main/LICENSE +deploy_license: https://qaihub-public-assets.s3.us-west-2.amazonaws.com/qai-hub-models/Qualcomm+AI+Hub+Proprietary+License.pdf source_repo: https://github.com/pytorch/vision/blob/main/torchvision/models/resnet.py technical_details: Model checkpoint: 
Imagenet @@ -36,6 +37,7 @@ form_factors: has_static_banner: yes has_animated_banner: yes license_type: bsd-3-clause +deploy_license_type: AI Model Hub License dataset: - imagenet-1k - imagenet-22k diff --git a/qai_hub_models/models/resnet18_quantized/model.py b/qai_hub_models/models/resnet18_quantized/model.py index 7641d899..e000d973 100644 --- a/qai_hub_models/models/resnet18_quantized/model.py +++ b/qai_hub_models/models/resnet18_quantized/model.py @@ -8,26 +8,26 @@ # This verifies aimet is installed, and this must be included first. from qai_hub_models.utils.quantization_aimet import ( AIMETQuantizableMixin, - HubCompileOptionsInt8Mixin, ) # isort: on import torch from aimet_torch.cross_layer_equalization import equalize_model +from aimet_torch.model_preparer import prepare_model from aimet_torch.quantsim import QuantizationSimModel, load_encodings_to_sim from qai_hub_models.models.resnet18.model import ResNet18 -from qai_hub_models.utils.aimet.config_loader import get_aimet_config_path +from qai_hub_models.utils.aimet.config_loader import get_default_aimet_config from qai_hub_models.utils.asset_loaders import CachedWebModelAsset +from qai_hub_models.utils.base_model import SourceModelFormat, TargetRuntime MODEL_ID = __name__.split(".")[-2] -MODEL_ASSET_VERSION = 6 +MODEL_ASSET_VERSION = 7 DEFAULT_ENCODINGS = "resnet18_quantized_encodings.json" -AIMET_CONFIG = "default_config_per_channel_qnn" -class ResNet18Quantizable(HubCompileOptionsInt8Mixin, AIMETQuantizableMixin, ResNet18): +class ResNet18Quantizable(AIMETQuantizableMixin, ResNet18): """ResNet with post train quantization support. Supports only 8 bit weights and activations, and only loads pre-quantized checkpoints. @@ -39,9 +39,15 @@ def __init__( ) -> None: ResNet18.__init__(self, resnet18_model.model) AIMETQuantizableMixin.__init__( - self, resnet18_model, needs_onnx_direct_aimet_export=False + self, + resnet18_model, ) + def preferred_hub_source_model_format( + self, target_runtime: TargetRuntime + ) -> SourceModelFormat: + return SourceModelFormat.ONNX + @classmethod def from_pretrained( cls, @@ -54,16 +60,17 @@ def from_pretrained( elif None: Doesn't load any encodings. Used when computing encodings. else: Interprets as a filepath and loads the encodings stored there. 
""" - resnet18 = ResNet18.from_pretrained() - input_shape = resnet18.get_input_spec()["image_tensor"][0] + model = ResNet18.from_pretrained() + input_shape = cls.get_input_spec()["image_tensor"][0] - equalize_model(resnet18, input_shape) + model = prepare_model(model) + equalize_model(model, input_shape) sim = QuantizationSimModel( - resnet18.net, + model, quant_scheme="tf_enhanced", default_param_bw=8, default_output_bw=8, - config_file=get_aimet_config_path(AIMET_CONFIG), + config_file=get_default_aimet_config(), dummy_input=torch.rand(input_shape), ) @@ -76,3 +83,11 @@ def from_pretrained( sim.model.eval() return cls(sim) + + def get_hub_compile_options( + self, target_runtime: TargetRuntime, other_compile_options: str = "" + ) -> str: + compile_options = super().get_hub_compile_options( + target_runtime, other_compile_options + ) + return compile_options + " --quantize_full_type int8 --quantize_io" diff --git a/qai_hub_models/models/resnet18_quantized/perf.yaml b/qai_hub_models/models/resnet18_quantized/perf.yaml index 38fc614e..df4b298b 100644 --- a/qai_hub_models/models/resnet18_quantized/perf.yaml +++ b/qai_hub_models/models/resnet18_quantized/perf.yaml @@ -17,51 +17,92 @@ aggregated: - Samsung Galaxy S23 - Samsung Galaxy S23 Ultra - Samsung Galaxy S23+ + - Samsung Galaxy S24 + - Samsung Galaxy S24 Ultra - Samsung Galaxy Tab S8 - Xiaomi 12 - Xiaomi 12 Pro supported_chipsets: - Snapdragon® 8 Gen 1 - Snapdragon® 8 Gen 2 + - Snapdragon® 8 Gen 3 - Snapdragon® 888 models: - name: ResNet18Quantized performance_metrics: - torchscript_onnx_tflite: - inference_time: 375.0 - throughput: 2666.6666666666665 + inference_time: 356.0 + throughput: 2808.9887640449438 estimated_peak_memory_range: min: 12288 - max: 14684784 + max: 1529808 primary_compute_unit: NPU - precision: fp16 + precision: int8 layer_info: layers_on_npu: 37 layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 37 - job_id: j2p0mj06g + job_id: j1p3k8m52 job_status: Passed + reference_device_info: + name: Samsung Galaxy S23 + os: '13' + form_factor: Phone + os_name: Android + manufacturer: Samsung + chipset: Snapdragon® 8 Gen 2 + timestamp: '2024-03-14T23:59:49.885782Z' torchscript_onnx_qnn: - inference_time: 359.0 - throughput: 2785.515320334262 + inference_time: 354.0 + throughput: 2824.858757062147 estimated_peak_memory_range: - min: 12288 - max: 71291800 + min: 20480 + max: 62738248 primary_compute_unit: NPU - precision: fp16 + precision: int8 layer_info: layers_on_npu: 35 layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 35 - job_id: j1glyxm85 + job_id: j1pv34z5x + job_status: Passed + - torchscript_onnx_tflite: + inference_time: 301.0 + throughput: 3322.2591362126245 + estimated_peak_memory_range: + min: 12288 + max: 23414560 + primary_compute_unit: NPU + precision: int8 + layer_info: + layers_on_npu: 37 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 37 + job_id: jwgoym158 job_status: Passed reference_device_info: - name: Samsung Galaxy S23 Ultra - os: '13' + name: Samsung Galaxy S24 + os: '14' form_factor: Phone os_name: Android manufacturer: Samsung - chipset: Snapdragon® 8 Gen 2 - timestamp: '2024-02-23T04:45:17.544674Z' + chipset: Snapdragon® 8 Gen 3 + timestamp: '2024-03-15T00:03:01.789875Z' + torchscript_onnx_qnn: + inference_time: 282.0 + throughput: 3546.099290780142 + estimated_peak_memory_range: + min: 12288 + max: 21538672 + primary_compute_unit: NPU + precision: int8 + layer_info: + layers_on_npu: 35 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 35 + job_id: j7gjx11pd + job_status: Passed diff --git 
a/qai_hub_models/models/resnet18_quantized/test.py b/qai_hub_models/models/resnet18_quantized/test.py index 88db0ac9..4405e8d2 100644 --- a/qai_hub_models/models/resnet18_quantized/test.py +++ b/qai_hub_models/models/resnet18_quantized/test.py @@ -4,7 +4,6 @@ # --------------------------------------------------------------------- from qai_hub_models.models._shared.imagenet_classifier.test_utils import ( run_imagenet_classifier_test, - run_imagenet_classifier_trace_test, ) from qai_hub_models.models.resnet18_quantized.demo import main as demo_main from qai_hub_models.models.resnet18_quantized.model import ( @@ -26,16 +25,6 @@ def test_task(): ) -def test_trace(): - run_imagenet_classifier_trace_test( - ResNet18Quantizable.from_pretrained(), - diff_tol=0.007, - rtol=0.02, - atol=0.2, - is_quantized=True, - ) - - def test_demo(): # Verify demo does not crash demo_main(is_test=True) diff --git a/qai_hub_models/models/resnet50/README.md b/qai_hub_models/models/resnet50/README.md index ec7ff6e2..210c9f96 100644 --- a/qai_hub_models/models/resnet50/README.md +++ b/qai_hub_models/models/resnet50/README.md @@ -10,7 +10,7 @@ This is based on the implementation of ResNet50 found export suitable to run on Qualcomm® devices. More details on model performance accross various devices, can be found [here](https://aihub.qualcomm.com/models/resnet50). -[Sign up](https://aihub.qualcomm.com/) for early access to run these models on +[Sign up](https://myaccount.qualcomm.com/signup) for early access to run these models on a hosted Qualcomm® device. @@ -25,7 +25,7 @@ python -m qai_hub_models.models.resnet50.demo More details on the CLI tool can be found with the `--help` option. See [demo.py](demo.py) for sample usage of the model including pre/post processing scripts. Please refer to our [general instructions on using -models](../../#qai-hub-models) for more usage instructions. +models](../../../#getting-started) for more usage instructions. ## Export for on-device deployment @@ -41,7 +41,7 @@ script requires access to Deployment instructions for Qualcomm® AI Hub. ## License - The license for the original implementation of ResNet50 can be found [here](https://github.com/pytorch/vision/blob/main/LICENSE). -- The license for the compiled assets for on-device deployment can be found [here](https://qaihub-public-assets.s3.us-west-2.amazonaws.com/qai-hub-models/Qualcomm+AI+Hub+Proprietary+License.pdf). +- The license for the compiled assets for on-device deployment can be found [here]({deploy_license_url}) ## References * [Deep Residual Learning for Image Recognition](https://arxiv.org/abs/1512.03385) diff --git a/qai_hub_models/models/resnet50/conftest.py b/qai_hub_models/models/resnet50/conftest.py new file mode 100644 index 00000000..809b3143 --- /dev/null +++ b/qai_hub_models/models/resnet50/conftest.py @@ -0,0 +1,24 @@ +# --------------------------------------------------------------------- +# Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. +# SPDX-License-Identifier: BSD-3-Clause +# --------------------------------------------------------------------- +# THIS FILE WAS AUTO-GENERATED. DO NOT EDIT MANUALLY. + +from unittest.mock import patch + +import pytest + +from qai_hub_models.models.resnet50 import Model + + +@pytest.fixture(autouse=True) +def mock_from_pretrained(): + """ + Model.from_pretrained() can be slow. Invoke it once and cache it so all invocations + across all tests return the cached instance of the model. 
+ """ + mock = patch( + "qai_hub_models.models.resnet50.Model.from_pretrained", + return_value=Model.from_pretrained(), + ) + mock.start() diff --git a/qai_hub_models/models/resnet50/demo.py b/qai_hub_models/models/resnet50/demo.py index 81d4a995..8b596d3d 100644 --- a/qai_hub_models/models/resnet50/demo.py +++ b/qai_hub_models/models/resnet50/demo.py @@ -3,11 +3,11 @@ # SPDX-License-Identifier: BSD-3-Clause # --------------------------------------------------------------------- from qai_hub_models.models._shared.imagenet_classifier.demo import imagenet_demo -from qai_hub_models.models.resnet50.model import ResNet50 +from qai_hub_models.models.resnet50.model import MODEL_ID, ResNet50 def main(is_test: bool = False): - imagenet_demo(ResNet50, is_test) + imagenet_demo(ResNet50, MODEL_ID, is_test) if __name__ == "__main__": diff --git a/qai_hub_models/models/resnet50/export.py b/qai_hub_models/models/resnet50/export.py index ceacf649..e7835d92 100644 --- a/qai_hub_models/models/resnet50/export.py +++ b/qai_hub_models/models/resnet50/export.py @@ -10,7 +10,7 @@ import os import warnings from pathlib import Path -from typing import List, Optional, Tuple +from typing import Any, Dict, List, Optional, Tuple, cast import qai_hub as hub import torch @@ -109,35 +109,43 @@ def export_model( ) # Trace the model - source_model = torch.jit.trace(model, make_torch_inputs(input_spec)) + source_model = torch.jit.trace(model.to("cpu"), make_torch_inputs(input_spec)) # 2. Compile the model to an on-device asset model_compile_options = model.get_hub_compile_options( target_runtime, compile_options + " --force_channel_last_input image_tensor" ) - print(f"Optimizing model {model_name} to run on-device.") - compile_job = hub.submit_compile_job( + print(f"Optimizing model {model_name} to run on-device") + submitted_compile_job = hub.submit_compile_job( model=source_model, input_specs=input_spec, device=hub.Device(device), name=model_name, options=model_compile_options, ) + compile_job = cast(hub.client.CompileJob, submitted_compile_job) # 3. Profile the model asset on real devices - profile_job = None + profile_job: Optional[hub.client.ProfileJob] = None if not skip_profiling: + profile_options_all = model.get_hub_profile_options( + target_runtime, profile_options + ) print(f"Profiling model {model_name} on a hosted device.") - profile_job = hub.submit_profile_job( + submitted_profile_job = hub.submit_profile_job( model=compile_job.get_target_model(), device=hub.Device(device), name=model_name, - options=profile_options, + options=profile_options_all, ) + profile_job = cast(hub.client.ProfileJob, submitted_profile_job) # 4. Run inference on-device with sample inputs - inference_job = None + inference_job: Optional[hub.client.InferenceJob] = None if not skip_inferencing: + profile_options_all = model.get_hub_profile_options( + target_runtime, profile_options + ) print( f"Running inference for {model_name} on a hosted device with example inputs." ) @@ -146,33 +154,35 @@ def export_model( hub_inputs = transpose_channel_first_to_last( "image_tensor", sample_inputs, target_runtime ) - inference_job = hub.submit_inference_job( + submitted_inference_job = hub.submit_inference_job( model=compile_job.get_target_model(), inputs=hub_inputs, device=hub.Device(device), name=model_name, - options=profile_options, + options=profile_options_all, ) + inference_job = cast(hub.client.InferenceJob, submitted_inference_job) # 5. 
Download the model asset to a local file if not skip_downloading: os.makedirs(output_path, exist_ok=True) - target_model = compile_job.get_target_model() + target_model: hub.Model = compile_job.get_target_model() # type: ignore target_model.download(str(output_path / f"{model_name}.tflite")) # 6. Summarize the results from profiling and inference if not skip_summary and not skip_profiling: - assert profile_job.wait().success - profile_data = profile_job.download_profile() + assert profile_job is not None and profile_job.wait().success + profile_data: Dict[str, Any] = profile_job.download_profile() # type: ignore print_profile_metrics_from_job(profile_job, profile_data) if not skip_summary and not skip_inferencing: torch_out = torch_inference(model, sample_inputs) - assert inference_job.wait().success - inference_result = inference_job.download_output_data() + assert inference_job is not None and inference_job.wait().success + inference_result: hub.client.DatasetEntries = inference_job.download_output_data() # type: ignore print_inference_metrics(inference_job, inference_result, torch_out) - print_on_target_demo_cmd(compile_job, Path(__file__).parent.resolve(), device) + if not skip_summary: + print_on_target_demo_cmd(compile_job, Path(__file__).parent.resolve(), device) return (compile_job, profile_job, inference_job) diff --git a/qai_hub_models/models/resnet50/info.yaml b/qai_hub_models/models/resnet50/info.yaml index 5f726eb5..7b52d418 100644 --- a/qai_hub_models/models/resnet50/info.yaml +++ b/qai_hub_models/models/resnet50/info.yaml @@ -13,6 +13,7 @@ tags: research_paper: https://arxiv.org/abs/1512.03385 research_paper_title: Deep Residual Learning for Image Recognition license: https://github.com/pytorch/vision/blob/main/LICENSE +deploy_license: https://qaihub-public-assets.s3.us-west-2.amazonaws.com/qai-hub-models/Qualcomm+AI+Hub+Proprietary+License.pdf source_repo: https://github.com/pytorch/vision/blob/main/torchvision/models/resnet.py technical_details: Model checkpoint: Imagenet @@ -34,6 +35,7 @@ form_factors: has_static_banner: yes has_animated_banner: yes license_type: bsd-3-clause +deploy_license_type: AI Model Hub License dataset: - imagenet-1k - imagenet-22k diff --git a/qai_hub_models/models/resnet50/perf.yaml b/qai_hub_models/models/resnet50/perf.yaml index f070af48..0037092b 100644 --- a/qai_hub_models/models/resnet50/perf.yaml +++ b/qai_hub_models/models/resnet50/perf.yaml @@ -17,22 +17,25 @@ aggregated: - Samsung Galaxy S23 - Samsung Galaxy S23 Ultra - Samsung Galaxy S23+ + - Samsung Galaxy S24 + - Samsung Galaxy S24 Ultra - Samsung Galaxy Tab S8 - Xiaomi 12 - Xiaomi 12 Pro supported_chipsets: - Snapdragon® 8 Gen 1 - Snapdragon® 8 Gen 2 + - Snapdragon® 8 Gen 3 - Snapdragon® 888 models: - name: ResNet50 performance_metrics: - torchscript_onnx_tflite: - inference_time: 1904.0 - throughput: 525.2100840336135 + inference_time: 1898.0 + throughput: 526.8703898840885 estimated_peak_memory_range: - min: 20480 - max: 2314168 + min: 36864 + max: 2234848 primary_compute_unit: NPU precision: fp16 layer_info: @@ -40,14 +43,22 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 77 - job_id: j1p8em6zp + job_id: j2p0yk0gw job_status: Passed + reference_device_info: + name: Samsung Galaxy S23 + os: '13' + form_factor: Phone + os_name: Android + manufacturer: Samsung + chipset: Snapdragon® 8 Gen 2 + timestamp: '2024-03-14T23:27:02.086108Z' torchscript_onnx_qnn: - inference_time: 1768.0 - throughput: 565.6108597285067 + inference_time: 1790.0 + throughput: 558.659217877095 
estimated_peak_memory_range: - min: 634880 - max: 186280024 + min: 626688 + max: 186659664 primary_compute_unit: NPU precision: fp16 layer_info: @@ -55,13 +66,43 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 125 - job_id: jogk2qoyg + job_id: jogkzdvgd + job_status: Passed + - torchscript_onnx_tflite: + inference_time: 1392.0 + throughput: 718.3908045977012 + estimated_peak_memory_range: + min: 16384 + max: 68731008 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 77 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 77 + job_id: j1p8o8qg9 job_status: Passed reference_device_info: - name: Samsung Galaxy S23 Ultra - os: '13' + name: Samsung Galaxy S24 + os: '14' form_factor: Phone os_name: Android manufacturer: Samsung - chipset: Snapdragon® 8 Gen 2 - timestamp: '2024-02-21T16:07:34.762219Z' + chipset: Snapdragon® 8 Gen 3 + timestamp: '2024-03-14T23:28:58.275338Z' + torchscript_onnx_qnn: + inference_time: 1307.0 + throughput: 765.1109410864575 + estimated_peak_memory_range: + min: 0 + max: 45987408 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 125 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 125 + job_id: jn5q8we57 + job_status: Passed diff --git a/qai_hub_models/models/resnet50/test.py b/qai_hub_models/models/resnet50/test.py index ca60e960..911fc066 100644 --- a/qai_hub_models/models/resnet50/test.py +++ b/qai_hub_models/models/resnet50/test.py @@ -2,6 +2,8 @@ # Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. # SPDX-License-Identifier: BSD-3-Clause # --------------------------------------------------------------------- +import pytest + from qai_hub_models.models._shared.imagenet_classifier.test_utils import ( run_imagenet_classifier_test, run_imagenet_classifier_trace_test, @@ -21,6 +23,7 @@ def test_task(): ) +@pytest.mark.trace def test_trace(): run_imagenet_classifier_trace_test(ResNet50.from_pretrained()) diff --git a/qai_hub_models/models/resnext101/README.md b/qai_hub_models/models/resnext101/README.md index 17da6857..0b3904ee 100644 --- a/qai_hub_models/models/resnext101/README.md +++ b/qai_hub_models/models/resnext101/README.md @@ -10,7 +10,7 @@ This is based on the implementation of ResNeXt101 found export suitable to run on Qualcomm® devices. More details on model performance accross various devices, can be found [here](https://aihub.qualcomm.com/models/resnext101). -[Sign up](https://aihub.qualcomm.com/) for early access to run these models on +[Sign up](https://myaccount.qualcomm.com/signup) for early access to run these models on a hosted Qualcomm® device. @@ -25,7 +25,7 @@ python -m qai_hub_models.models.resnext101.demo More details on the CLI tool can be found with the `--help` option. See [demo.py](demo.py) for sample usage of the model including pre/post processing scripts. Please refer to our [general instructions on using -models](../../#qai-hub-models) for more usage instructions. +models](../../../#getting-started) for more usage instructions. ## Export for on-device deployment @@ -41,7 +41,7 @@ script requires access to Deployment instructions for Qualcomm® AI Hub. ## License - The license for the original implementation of ResNeXt101 can be found [here](https://github.com/pytorch/vision/blob/main/LICENSE). -- The license for the compiled assets for on-device deployment can be found [here](https://qaihub-public-assets.s3.us-west-2.amazonaws.com/qai-hub-models/Qualcomm+AI+Hub+Proprietary+License.pdf). 
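The perf.yaml records above pair each `inference_time` with a `throughput` value. The two fields are consistent if `inference_time` is read in microseconds and `throughput` as inferences per second, i.e. throughput = 1e6 / inference_time; a quick check against the ResNet50 numbers in the perf.yaml hunk above:

```python
# Sanity-check the relationship between the recorded fields
# (values taken from the ResNet50 perf.yaml hunk above).
entries = {
    1898.0: 526.8703898840885,   # Galaxy S23, torchscript_onnx_tflite
    1790.0: 558.659217877095,    # Galaxy S23, torchscript_onnx_qnn
    1392.0: 718.3908045977012,   # Galaxy S24, torchscript_onnx_tflite
}
for inference_time_us, recorded_throughput in entries.items():
    assert abs(1e6 / inference_time_us - recorded_throughput) < 1e-6
```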
+- The license for the compiled assets for on-device deployment can be found [here]({deploy_license_url}) ## References * [Aggregated Residual Transformations for Deep Neural Networks](https://arxiv.org/abs/1611.05431) diff --git a/qai_hub_models/models/resnext101/conftest.py b/qai_hub_models/models/resnext101/conftest.py new file mode 100644 index 00000000..fbfcb9c4 --- /dev/null +++ b/qai_hub_models/models/resnext101/conftest.py @@ -0,0 +1,24 @@ +# --------------------------------------------------------------------- +# Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. +# SPDX-License-Identifier: BSD-3-Clause +# --------------------------------------------------------------------- +# THIS FILE WAS AUTO-GENERATED. DO NOT EDIT MANUALLY. + +from unittest.mock import patch + +import pytest + +from qai_hub_models.models.resnext101 import Model + + +@pytest.fixture(autouse=True) +def mock_from_pretrained(): + """ + Model.from_pretrained() can be slow. Invoke it once and cache it so all invocations + across all tests return the cached instance of the model. + """ + mock = patch( + "qai_hub_models.models.resnext101.Model.from_pretrained", + return_value=Model.from_pretrained(), + ) + mock.start() diff --git a/qai_hub_models/models/resnext101/demo.py b/qai_hub_models/models/resnext101/demo.py index ffb7bc4d..c9f8003d 100644 --- a/qai_hub_models/models/resnext101/demo.py +++ b/qai_hub_models/models/resnext101/demo.py @@ -3,11 +3,11 @@ # SPDX-License-Identifier: BSD-3-Clause # --------------------------------------------------------------------- from qai_hub_models.models._shared.imagenet_classifier.demo import imagenet_demo -from qai_hub_models.models.resnext101.model import ResNeXt101 +from qai_hub_models.models.resnext101.model import MODEL_ID, ResNeXt101 def main(is_test: bool = False): - imagenet_demo(ResNeXt101, is_test) + imagenet_demo(ResNeXt101, MODEL_ID, is_test) if __name__ == "__main__": diff --git a/qai_hub_models/models/resnext101/export.py b/qai_hub_models/models/resnext101/export.py index 5d591b8f..e1b1fcd6 100644 --- a/qai_hub_models/models/resnext101/export.py +++ b/qai_hub_models/models/resnext101/export.py @@ -10,7 +10,7 @@ import os import warnings from pathlib import Path -from typing import List, Optional, Tuple +from typing import Any, Dict, List, Optional, Tuple, cast import qai_hub as hub import torch @@ -109,35 +109,43 @@ def export_model( ) # Trace the model - source_model = torch.jit.trace(model, make_torch_inputs(input_spec)) + source_model = torch.jit.trace(model.to("cpu"), make_torch_inputs(input_spec)) # 2. Compile the model to an on-device asset model_compile_options = model.get_hub_compile_options( target_runtime, compile_options + " --force_channel_last_input image_tensor" ) - print(f"Optimizing model {model_name} to run on-device.") - compile_job = hub.submit_compile_job( + print(f"Optimizing model {model_name} to run on-device") + submitted_compile_job = hub.submit_compile_job( model=source_model, input_specs=input_spec, device=hub.Device(device), name=model_name, options=model_compile_options, ) + compile_job = cast(hub.client.CompileJob, submitted_compile_job) # 3. 
Profile the model asset on real devices - profile_job = None + profile_job: Optional[hub.client.ProfileJob] = None if not skip_profiling: + profile_options_all = model.get_hub_profile_options( + target_runtime, profile_options + ) print(f"Profiling model {model_name} on a hosted device.") - profile_job = hub.submit_profile_job( + submitted_profile_job = hub.submit_profile_job( model=compile_job.get_target_model(), device=hub.Device(device), name=model_name, - options=profile_options, + options=profile_options_all, ) + profile_job = cast(hub.client.ProfileJob, submitted_profile_job) # 4. Run inference on-device with sample inputs - inference_job = None + inference_job: Optional[hub.client.InferenceJob] = None if not skip_inferencing: + profile_options_all = model.get_hub_profile_options( + target_runtime, profile_options + ) print( f"Running inference for {model_name} on a hosted device with example inputs." ) @@ -146,33 +154,35 @@ def export_model( hub_inputs = transpose_channel_first_to_last( "image_tensor", sample_inputs, target_runtime ) - inference_job = hub.submit_inference_job( + submitted_inference_job = hub.submit_inference_job( model=compile_job.get_target_model(), inputs=hub_inputs, device=hub.Device(device), name=model_name, - options=profile_options, + options=profile_options_all, ) + inference_job = cast(hub.client.InferenceJob, submitted_inference_job) # 5. Download the model asset to a local file if not skip_downloading: os.makedirs(output_path, exist_ok=True) - target_model = compile_job.get_target_model() + target_model: hub.Model = compile_job.get_target_model() # type: ignore target_model.download(str(output_path / f"{model_name}.tflite")) # 6. Summarize the results from profiling and inference if not skip_summary and not skip_profiling: - assert profile_job.wait().success - profile_data = profile_job.download_profile() + assert profile_job is not None and profile_job.wait().success + profile_data: Dict[str, Any] = profile_job.download_profile() # type: ignore print_profile_metrics_from_job(profile_job, profile_data) if not skip_summary and not skip_inferencing: torch_out = torch_inference(model, sample_inputs) - assert inference_job.wait().success - inference_result = inference_job.download_output_data() + assert inference_job is not None and inference_job.wait().success + inference_result: hub.client.DatasetEntries = inference_job.download_output_data() # type: ignore print_inference_metrics(inference_job, inference_result, torch_out) - print_on_target_demo_cmd(compile_job, Path(__file__).parent.resolve(), device) + if not skip_summary: + print_on_target_demo_cmd(compile_job, Path(__file__).parent.resolve(), device) return (compile_job, profile_job, inference_job) diff --git a/qai_hub_models/models/resnext101/info.yaml b/qai_hub_models/models/resnext101/info.yaml index e2662e67..40e7612f 100644 --- a/qai_hub_models/models/resnext101/info.yaml +++ b/qai_hub_models/models/resnext101/info.yaml @@ -13,6 +13,7 @@ tags: research_paper: https://arxiv.org/abs/1611.05431 research_paper_title: Aggregated Residual Transformations for Deep Neural Networks license: https://github.com/pytorch/vision/blob/main/LICENSE +deploy_license: https://qaihub-public-assets.s3.us-west-2.amazonaws.com/qai-hub-models/Qualcomm+AI+Hub+Proprietary+License.pdf source_repo: https://github.com/pytorch/vision/blob/main/torchvision/models/resnet.py technical_details: Model checkpoint: Imagenet @@ -34,6 +35,7 @@ form_factors: has_static_banner: yes has_animated_banner: yes license_type: bsd-3-clause 
+deploy_license_type: AI Model Hub License dataset: - imagenet-1k - imagenet-22k diff --git a/qai_hub_models/models/resnext101/perf.yaml b/qai_hub_models/models/resnext101/perf.yaml index 05f03000..d8d493ff 100644 --- a/qai_hub_models/models/resnext101/perf.yaml +++ b/qai_hub_models/models/resnext101/perf.yaml @@ -17,22 +17,25 @@ aggregated: - Samsung Galaxy S23 - Samsung Galaxy S23 Ultra - Samsung Galaxy S23+ + - Samsung Galaxy S24 + - Samsung Galaxy S24 Ultra - Samsung Galaxy Tab S8 - Xiaomi 12 - Xiaomi 12 Pro supported_chipsets: - Snapdragon® 8 Gen 1 - Snapdragon® 8 Gen 2 + - Snapdragon® 8 Gen 3 - Snapdragon® 888 models: - name: ResNeXt101 performance_metrics: - torchscript_onnx_tflite: - inference_time: 6434.0 - throughput: 155.4243083618278 + inference_time: 6315.0 + throughput: 158.3531274742676 estimated_peak_memory_range: min: 28672 - max: 2709368 + max: 2570472 primary_compute_unit: NPU precision: fp16 layer_info: @@ -40,14 +43,22 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 145 - job_id: j1pvlr475 + job_id: j2p0yrngw job_status: Passed + reference_device_info: + name: Samsung Galaxy S23 + os: '13' + form_factor: Phone + os_name: Android + manufacturer: Samsung + chipset: Snapdragon® 8 Gen 2 + timestamp: '2024-03-14T23:40:16.043830Z' torchscript_onnx_qnn: - inference_time: 6146.0 - throughput: 162.70745200130165 + inference_time: 6079.0 + throughput: 164.50074025333114 estimated_peak_memory_range: min: 16384 - max: 38657672 + max: 34444952 primary_compute_unit: NPU precision: fp16 layer_info: @@ -55,13 +66,43 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 244 - job_id: j7gjr217p + job_id: jogkzyngd + job_status: Passed + - torchscript_onnx_tflite: + inference_time: 4552.0 + throughput: 219.6836555360281 + estimated_peak_memory_range: + min: 20480 + max: 357156576 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 145 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 145 + job_id: j1p8o7og9 job_status: Passed reference_device_info: - name: Samsung Galaxy S23 Ultra - os: '13' + name: Samsung Galaxy S24 + os: '14' form_factor: Phone os_name: Android manufacturer: Samsung - chipset: Snapdragon® 8 Gen 2 - timestamp: '2024-02-21T16:21:26.759411Z' + chipset: Snapdragon® 8 Gen 3 + timestamp: '2024-03-14T23:42:21.825443Z' + torchscript_onnx_qnn: + inference_time: 4377.0 + throughput: 228.4669865204478 + estimated_peak_memory_range: + min: 618496 + max: 123852368 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 244 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 244 + job_id: jn5q82o57 + job_status: Passed diff --git a/qai_hub_models/models/resnext101/test.py b/qai_hub_models/models/resnext101/test.py index 257a15b6..dab967e5 100644 --- a/qai_hub_models/models/resnext101/test.py +++ b/qai_hub_models/models/resnext101/test.py @@ -2,6 +2,8 @@ # Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. 
# SPDX-License-Identifier: BSD-3-Clause # --------------------------------------------------------------------- +import pytest + from qai_hub_models.models._shared.imagenet_classifier.test_utils import ( run_imagenet_classifier_test, run_imagenet_classifier_trace_test, @@ -16,6 +18,7 @@ def test_task(): ) +@pytest.mark.trace def test_trace(): run_imagenet_classifier_trace_test(ResNeXt101.from_pretrained()) diff --git a/qai_hub_models/models/resnext101_quantized/README.md b/qai_hub_models/models/resnext101_quantized/README.md index 3e071fe6..e5a91213 100644 --- a/qai_hub_models/models/resnext101_quantized/README.md +++ b/qai_hub_models/models/resnext101_quantized/README.md @@ -10,7 +10,7 @@ This is based on the implementation of ResNeXt101Quantized found export suitable to run on Qualcomm® devices. More details on model performance accross various devices, can be found [here](https://aihub.qualcomm.com/models/resnext101_quantized). -[Sign up](https://aihub.qualcomm.com/) for early access to run these models on +[Sign up](https://myaccount.qualcomm.com/signup) for early access to run these models on a hosted Qualcomm® device. @@ -25,7 +25,7 @@ python -m qai_hub_models.models.resnext101_quantized.demo More details on the CLI tool can be found with the `--help` option. See [demo.py](demo.py) for sample usage of the model including pre/post processing scripts. Please refer to our [general instructions on using -models](../../#qai-hub-models) for more usage instructions. +models](../../../#getting-started) for more usage instructions. ## Export for on-device deployment @@ -41,7 +41,7 @@ script requires access to Deployment instructions for Qualcomm® AI Hub. ## License - The license for the original implementation of ResNeXt101Quantized can be found [here](https://github.com/pytorch/vision/blob/main/LICENSE). -- The license for the compiled assets for on-device deployment can be found [here](https://qaihub-public-assets.s3.us-west-2.amazonaws.com/qai-hub-models/Qualcomm+AI+Hub+Proprietary+License.pdf). +- The license for the compiled assets for on-device deployment can be found [here]({deploy_license_url}) ## References * [Aggregated Residual Transformations for Deep Neural Networks](https://arxiv.org/abs/1611.05431) diff --git a/qai_hub_models/models/resnext101_quantized/conftest.py b/qai_hub_models/models/resnext101_quantized/conftest.py new file mode 100644 index 00000000..66e0502c --- /dev/null +++ b/qai_hub_models/models/resnext101_quantized/conftest.py @@ -0,0 +1,24 @@ +# --------------------------------------------------------------------- +# Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. +# SPDX-License-Identifier: BSD-3-Clause +# --------------------------------------------------------------------- +# THIS FILE WAS AUTO-GENERATED. DO NOT EDIT MANUALLY. + +from unittest.mock import patch + +import pytest + +from qai_hub_models.models.resnext101_quantized import Model + + +@pytest.fixture(autouse=True) +def mock_from_pretrained(): + """ + Model.from_pretrained() can be slow. Invoke it once and cache it so all invocations + across all tests return the cached instance of the model. 
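The auto-generated conftest.py files added throughout this patch all follow the same pattern: an autouse fixture patches `Model.from_pretrained` so every call made during that package's tests returns one cached instance. A hypothetical test illustrating the effect while the fixture above is active (the test name is illustrative, not part of the patch):

```python
from qai_hub_models.models.resnext101_quantized import Model


def test_from_pretrained_is_cached():
    # Both calls hit the patched classmethod installed by the autouse fixture,
    # so they return the very same cached model instance.
    assert Model.from_pretrained() is Model.from_pretrained()
```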
+ """ + mock = patch( + "qai_hub_models.models.resnext101_quantized.Model.from_pretrained", + return_value=Model.from_pretrained(), + ) + mock.start() diff --git a/qai_hub_models/models/resnext101_quantized/demo.py b/qai_hub_models/models/resnext101_quantized/demo.py index 51d4cde1..140307a2 100644 --- a/qai_hub_models/models/resnext101_quantized/demo.py +++ b/qai_hub_models/models/resnext101_quantized/demo.py @@ -3,11 +3,14 @@ # SPDX-License-Identifier: BSD-3-Clause # --------------------------------------------------------------------- from qai_hub_models.models._shared.imagenet_classifier.demo import imagenet_demo -from qai_hub_models.models.resnext101_quantized.model import ResNeXt101Quantizable +from qai_hub_models.models.resnext101_quantized.model import ( + MODEL_ID, + ResNeXt101Quantizable, +) def main(is_test: bool = False): - imagenet_demo(ResNeXt101Quantizable, is_test) + imagenet_demo(ResNeXt101Quantizable, MODEL_ID, is_test) if __name__ == "__main__": diff --git a/qai_hub_models/models/resnext101_quantized/export.py b/qai_hub_models/models/resnext101_quantized/export.py index ed09dd42..865e3205 100644 --- a/qai_hub_models/models/resnext101_quantized/export.py +++ b/qai_hub_models/models/resnext101_quantized/export.py @@ -10,7 +10,7 @@ import os import warnings from pathlib import Path -from typing import List, Optional, Tuple +from typing import Any, Dict, List, Optional, Tuple, cast import qai_hub as hub @@ -120,8 +120,8 @@ def export_model( model_compile_options = model.get_hub_compile_options( target_runtime, compile_options + " --force_channel_last_input image_tensor" ) - print(f"Optimizing model {model_name} to run on-device.") - compile_job = hub.submit_compile_job( + print(f"Optimizing model {model_name} to run on-device") + submitted_compile_job = hub.submit_compile_job( model=source_model, input_specs=input_spec, device=hub.Device(device), @@ -129,21 +129,29 @@ def export_model( calibration_data=quant_calibration_data, options=model_compile_options, ) + compile_job = cast(hub.client.CompileJob, submitted_compile_job) # 3. Profile the model asset on real devices - profile_job = None + profile_job: Optional[hub.client.ProfileJob] = None if not skip_profiling: + profile_options_all = model.get_hub_profile_options( + target_runtime, profile_options + ) print(f"Profiling model {model_name} on a hosted device.") - profile_job = hub.submit_profile_job( + submitted_profile_job = hub.submit_profile_job( model=compile_job.get_target_model(), device=hub.Device(device), name=model_name, - options=profile_options, + options=profile_options_all, ) + profile_job = cast(hub.client.ProfileJob, submitted_profile_job) # 4. Run inference on-device with sample inputs - inference_job = None + inference_job: Optional[hub.client.InferenceJob] = None if not skip_inferencing: + profile_options_all = model.get_hub_profile_options( + target_runtime, profile_options + ) print( f"Running inference for {model_name} on a hosted device with example inputs." ) @@ -155,35 +163,37 @@ def export_model( hub_inputs = transpose_channel_first_to_last( "image_tensor", sample_inputs, target_runtime ) - inference_job = hub.submit_inference_job( + submitted_inference_job = hub.submit_inference_job( model=compile_job.get_target_model(), inputs=hub_inputs, device=hub.Device(device), name=model_name, - options=profile_options, + options=profile_options_all, ) + inference_job = cast(hub.client.InferenceJob, submitted_inference_job) # 5. 
Download the model asset to a local file if not skip_downloading: os.makedirs(output_path, exist_ok=True) - target_model = compile_job.get_target_model() + target_model: hub.Model = compile_job.get_target_model() # type: ignore target_model.download(str(output_path / f"{model_name}.tflite")) # 6. Summarize the results from profiling and inference if not skip_summary and not skip_profiling: - assert profile_job.wait().success - profile_data = profile_job.download_profile() + assert profile_job is not None and profile_job.wait().success + profile_data: Dict[str, Any] = profile_job.download_profile() # type: ignore print_profile_metrics_from_job(profile_job, profile_data) if not skip_summary and not skip_inferencing: torch_out = torch_inference(model, sample_inputs) - assert inference_job.wait().success - inference_result = inference_job.download_output_data() + assert inference_job is not None and inference_job.wait().success + inference_result: hub.client.DatasetEntries = inference_job.download_output_data() # type: ignore print_inference_metrics( inference_job, inference_result, torch_out, metrics="psnr,top1,top5" ) - print_on_target_demo_cmd(compile_job, Path(__file__).parent.resolve(), device) + if not skip_summary: + print_on_target_demo_cmd(compile_job, Path(__file__).parent.resolve(), device) return (compile_job, profile_job, inference_job) diff --git a/qai_hub_models/models/resnext101_quantized/info.yaml b/qai_hub_models/models/resnext101_quantized/info.yaml index 8d169468..47a2496c 100644 --- a/qai_hub_models/models/resnext101_quantized/info.yaml +++ b/qai_hub_models/models/resnext101_quantized/info.yaml @@ -14,6 +14,7 @@ tags: research_paper: https://arxiv.org/abs/1611.05431 research_paper_title: Aggregated Residual Transformations for Deep Neural Networks license: https://github.com/pytorch/vision/blob/main/LICENSE +deploy_license: https://qaihub-public-assets.s3.us-west-2.amazonaws.com/qai-hub-models/Qualcomm+AI+Hub+Proprietary+License.pdf source_repo: https://github.com/pytorch/vision/blob/main/torchvision/models/resnet.py technical_details: Model checkpoint: Imagenet @@ -36,6 +37,7 @@ form_factors: has_static_banner: yes has_animated_banner: yes license_type: bsd-3-clause +deploy_license_type: AI Model Hub License dataset: - imagenet-1k - imagenet-22k diff --git a/qai_hub_models/models/resnext101_quantized/model.py b/qai_hub_models/models/resnext101_quantized/model.py index 9a6277e1..98ea0d76 100644 --- a/qai_hub_models/models/resnext101_quantized/model.py +++ b/qai_hub_models/models/resnext101_quantized/model.py @@ -8,27 +8,26 @@ # This verifies aimet is installed, and this must be included first. 
from qai_hub_models.utils.quantization_aimet import ( AIMETQuantizableMixin, - HubCompileOptionsInt8Mixin, ) # isort: on import torch from aimet_torch.cross_layer_equalization import equalize_model +from aimet_torch.model_preparer import prepare_model from aimet_torch.quantsim import QuantizationSimModel, load_encodings_to_sim from qai_hub_models.models.resnext101.model import ResNeXt101 -from qai_hub_models.utils.aimet.config_loader import get_per_channel_aimet_config +from qai_hub_models.utils.aimet.config_loader import get_default_aimet_config from qai_hub_models.utils.asset_loaders import CachedWebModelAsset +from qai_hub_models.utils.base_model import SourceModelFormat, TargetRuntime MODEL_ID = __name__.split(".")[-2] -MODEL_ASSET_VERSION = 3 +MODEL_ASSET_VERSION = 4 DEFAULT_ENCODINGS = "resnext101_quantized_encodings.json" -class ResNeXt101Quantizable( - HubCompileOptionsInt8Mixin, AIMETQuantizableMixin, ResNeXt101 -): +class ResNeXt101Quantizable(AIMETQuantizableMixin, ResNeXt101): """ResNeXt101 with post train quantization support. Supports only 8 bit weights and activations, and only loads pre-quantized checkpoints. @@ -40,9 +39,15 @@ def __init__( ) -> None: ResNeXt101.__init__(self, sim_model.model) AIMETQuantizableMixin.__init__( - self, sim_model, needs_onnx_direct_aimet_export=False + self, + sim_model, ) + def preferred_hub_source_model_format( + self, target_runtime: TargetRuntime + ) -> SourceModelFormat: + return SourceModelFormat.ONNX + @classmethod def from_pretrained( cls, @@ -56,15 +61,16 @@ def from_pretrained( else: Interprets as a filepath and loads the encodings stored there. """ model = ResNeXt101.from_pretrained() - input_shape = model.get_input_spec()["image_tensor"][0] + input_shape = cls.get_input_spec()["image_tensor"][0] + model = prepare_model(model) equalize_model(model, input_shape) sim = QuantizationSimModel( - model.net, + model, quant_scheme="tf_enhanced", default_param_bw=8, default_output_bw=8, - config_file=get_per_channel_aimet_config(), + config_file=get_default_aimet_config(), dummy_input=torch.rand(input_shape), ) @@ -77,3 +83,11 @@ def from_pretrained( sim.model.eval() return cls(sim) + + def get_hub_compile_options( + self, target_runtime: TargetRuntime, other_compile_options: str = "" + ) -> str: + compile_options = super().get_hub_compile_options( + target_runtime, other_compile_options + ) + return compile_options + " --quantize_full_type int8 --quantize_io" diff --git a/qai_hub_models/models/resnext101_quantized/perf.yaml b/qai_hub_models/models/resnext101_quantized/perf.yaml index 0d14a5d4..ddc514fb 100644 --- a/qai_hub_models/models/resnext101_quantized/perf.yaml +++ b/qai_hub_models/models/resnext101_quantized/perf.yaml @@ -17,31 +17,42 @@ aggregated: - Samsung Galaxy S23 - Samsung Galaxy S23 Ultra - Samsung Galaxy S23+ + - Samsung Galaxy S24 + - Samsung Galaxy S24 Ultra - Samsung Galaxy Tab S8 - Xiaomi 12 - Xiaomi 12 Pro supported_chipsets: - Snapdragon® 8 Gen 1 - Snapdragon® 8 Gen 2 + - Snapdragon® 8 Gen 3 - Snapdragon® 888 models: - name: ResNeXt101Quantized performance_metrics: - torchscript_onnx_tflite: - inference_time: 76378.0 - throughput: 13.092775406530677 + inference_time: 2842.0 + throughput: 351.8648838845883 estimated_peak_memory_range: - min: 143360 - max: 3223784 + min: 16384 + max: 1739432 primary_compute_unit: NPU - precision: fp16 + precision: int8 layer_info: - layers_on_npu: 149 + layers_on_npu: 146 layers_on_gpu: 0 layers_on_cpu: 0 - total_layers: 149 - job_id: jmg9zy8qp + total_layers: 146 + job_id: jygzekkg8 
job_status: Passed + reference_device_info: + name: Samsung Galaxy S23 + os: '13' + form_factor: Phone + os_name: Android + manufacturer: Samsung + chipset: Snapdragon® 8 Gen 2 + timestamp: '2024-03-14T23:42:18.013006Z' torchscript_onnx_qnn: inference_time: 'null' throughput: 'null' @@ -57,11 +68,41 @@ models: total_layers: 0 job_id: '' job_status: Skipped + - torchscript_onnx_tflite: + inference_time: 2088.0 + throughput: 478.9272030651341 + estimated_peak_memory_range: + min: 36864 + max: 251955536 + primary_compute_unit: NPU + precision: int8 + layer_info: + layers_on_npu: 146 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 146 + job_id: jnp1lz25q + job_status: Passed reference_device_info: - name: Samsung Galaxy S23 Ultra - os: '13' + name: Samsung Galaxy S24 + os: '14' form_factor: Phone os_name: Android manufacturer: Samsung - chipset: Snapdragon® 8 Gen 2 - timestamp: '2024-02-21T16:24:55.190881Z' + chipset: Snapdragon® 8 Gen 3 + timestamp: '2024-03-14T23:42:18.013015Z' + torchscript_onnx_qnn: + inference_time: 'null' + throughput: 'null' + estimated_peak_memory_range: + min: 0 + max: 0 + primary_compute_unit: 'null' + precision: 'null' + layer_info: + layers_on_npu: 0 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 0 + job_id: '' + job_status: Skipped diff --git a/qai_hub_models/models/resnext101_quantized/test.py b/qai_hub_models/models/resnext101_quantized/test.py index 8beed1b8..1df1173a 100644 --- a/qai_hub_models/models/resnext101_quantized/test.py +++ b/qai_hub_models/models/resnext101_quantized/test.py @@ -4,7 +4,6 @@ # --------------------------------------------------------------------- from qai_hub_models.models._shared.imagenet_classifier.test_utils import ( run_imagenet_classifier_test, - run_imagenet_classifier_trace_test, ) from qai_hub_models.models.resnext101_quantized.demo import main as demo_main from qai_hub_models.models.resnext101_quantized.model import ( @@ -26,16 +25,6 @@ def test_task(): ) -def test_trace(): - run_imagenet_classifier_trace_test( - ResNeXt101Quantizable.from_pretrained(), - is_quantized=True, - diff_tol=0.007, - rtol=0.02, - atol=0.2, - ) - - def test_demo(): # Verify demo does not crash demo_main(is_test=True) diff --git a/qai_hub_models/models/resnext50/README.md b/qai_hub_models/models/resnext50/README.md index 91f1ce68..a7426562 100644 --- a/qai_hub_models/models/resnext50/README.md +++ b/qai_hub_models/models/resnext50/README.md @@ -10,7 +10,7 @@ This is based on the implementation of ResNeXt50 found export suitable to run on Qualcomm® devices. More details on model performance accross various devices, can be found [here](https://aihub.qualcomm.com/models/resnext50). -[Sign up](https://aihub.qualcomm.com/) for early access to run these models on +[Sign up](https://myaccount.qualcomm.com/signup) for early access to run these models on a hosted Qualcomm® device. @@ -25,7 +25,7 @@ python -m qai_hub_models.models.resnext50.demo More details on the CLI tool can be found with the `--help` option. See [demo.py](demo.py) for sample usage of the model including pre/post processing scripts. Please refer to our [general instructions on using -models](../../#qai-hub-models) for more usage instructions. +models](../../../#getting-started) for more usage instructions. ## Export for on-device deployment @@ -41,7 +41,7 @@ script requires access to Deployment instructions for Qualcomm® AI Hub. ## License - The license for the original implementation of ResNeXt50 can be found [here](https://github.com/pytorch/vision/blob/main/LICENSE). 
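Across this patch the float models' `test_trace` functions gain a `@pytest.mark.trace` marker (see the resnet50, resnext101, and resnext50 test.py hunks), while the quantized models drop their trace tests entirely, as in the resnext101_quantized test.py hunk above. Assuming the `trace` marker is registered in the project's pytest configuration (that registration is not shown in this section), the slower trace tests can be selected or deselected with pytest's `-m` option; a sketch using `pytest.main`:

```python
import pytest

# Run only the fast tests for one model package, deselecting anything marked
# with @pytest.mark.trace. Equivalent to `pytest -m "not trace" <path>` on the
# command line; the path below is a test file touched in the hunks above.
pytest.main(["-m", "not trace", "qai_hub_models/models/resnext101/test.py"])
```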
-- The license for the compiled assets for on-device deployment can be found [here](https://qaihub-public-assets.s3.us-west-2.amazonaws.com/qai-hub-models/Qualcomm+AI+Hub+Proprietary+License.pdf). +- The license for the compiled assets for on-device deployment can be found [here]({deploy_license_url}) ## References * [Aggregated Residual Transformations for Deep Neural Networks](https://arxiv.org/abs/1611.05431) diff --git a/qai_hub_models/models/resnext50/conftest.py b/qai_hub_models/models/resnext50/conftest.py new file mode 100644 index 00000000..abe7f903 --- /dev/null +++ b/qai_hub_models/models/resnext50/conftest.py @@ -0,0 +1,24 @@ +# --------------------------------------------------------------------- +# Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. +# SPDX-License-Identifier: BSD-3-Clause +# --------------------------------------------------------------------- +# THIS FILE WAS AUTO-GENERATED. DO NOT EDIT MANUALLY. + +from unittest.mock import patch + +import pytest + +from qai_hub_models.models.resnext50 import Model + + +@pytest.fixture(autouse=True) +def mock_from_pretrained(): + """ + Model.from_pretrained() can be slow. Invoke it once and cache it so all invocations + across all tests return the cached instance of the model. + """ + mock = patch( + "qai_hub_models.models.resnext50.Model.from_pretrained", + return_value=Model.from_pretrained(), + ) + mock.start() diff --git a/qai_hub_models/models/resnext50/demo.py b/qai_hub_models/models/resnext50/demo.py index 46e8761a..770e8c07 100644 --- a/qai_hub_models/models/resnext50/demo.py +++ b/qai_hub_models/models/resnext50/demo.py @@ -3,11 +3,11 @@ # SPDX-License-Identifier: BSD-3-Clause # --------------------------------------------------------------------- from qai_hub_models.models._shared.imagenet_classifier.demo import imagenet_demo -from qai_hub_models.models.resnext50.model import ResNeXt50 +from qai_hub_models.models.resnext50.model import MODEL_ID, ResNeXt50 def main(is_test: bool = False): - imagenet_demo(ResNeXt50, is_test) + imagenet_demo(ResNeXt50, MODEL_ID, is_test) if __name__ == "__main__": diff --git a/qai_hub_models/models/resnext50/export.py b/qai_hub_models/models/resnext50/export.py index ecdcf293..7baf5fa2 100644 --- a/qai_hub_models/models/resnext50/export.py +++ b/qai_hub_models/models/resnext50/export.py @@ -10,7 +10,7 @@ import os import warnings from pathlib import Path -from typing import List, Optional, Tuple +from typing import Any, Dict, List, Optional, Tuple, cast import qai_hub as hub import torch @@ -109,35 +109,43 @@ def export_model( ) # Trace the model - source_model = torch.jit.trace(model, make_torch_inputs(input_spec)) + source_model = torch.jit.trace(model.to("cpu"), make_torch_inputs(input_spec)) # 2. Compile the model to an on-device asset model_compile_options = model.get_hub_compile_options( target_runtime, compile_options + " --force_channel_last_input image_tensor" ) - print(f"Optimizing model {model_name} to run on-device.") - compile_job = hub.submit_compile_job( + print(f"Optimizing model {model_name} to run on-device") + submitted_compile_job = hub.submit_compile_job( model=source_model, input_specs=input_spec, device=hub.Device(device), name=model_name, options=model_compile_options, ) + compile_job = cast(hub.client.CompileJob, submitted_compile_job) # 3. 
Profile the model asset on real devices - profile_job = None + profile_job: Optional[hub.client.ProfileJob] = None if not skip_profiling: + profile_options_all = model.get_hub_profile_options( + target_runtime, profile_options + ) print(f"Profiling model {model_name} on a hosted device.") - profile_job = hub.submit_profile_job( + submitted_profile_job = hub.submit_profile_job( model=compile_job.get_target_model(), device=hub.Device(device), name=model_name, - options=profile_options, + options=profile_options_all, ) + profile_job = cast(hub.client.ProfileJob, submitted_profile_job) # 4. Run inference on-device with sample inputs - inference_job = None + inference_job: Optional[hub.client.InferenceJob] = None if not skip_inferencing: + profile_options_all = model.get_hub_profile_options( + target_runtime, profile_options + ) print( f"Running inference for {model_name} on a hosted device with example inputs." ) @@ -146,33 +154,35 @@ def export_model( hub_inputs = transpose_channel_first_to_last( "image_tensor", sample_inputs, target_runtime ) - inference_job = hub.submit_inference_job( + submitted_inference_job = hub.submit_inference_job( model=compile_job.get_target_model(), inputs=hub_inputs, device=hub.Device(device), name=model_name, - options=profile_options, + options=profile_options_all, ) + inference_job = cast(hub.client.InferenceJob, submitted_inference_job) # 5. Download the model asset to a local file if not skip_downloading: os.makedirs(output_path, exist_ok=True) - target_model = compile_job.get_target_model() + target_model: hub.Model = compile_job.get_target_model() # type: ignore target_model.download(str(output_path / f"{model_name}.tflite")) # 6. Summarize the results from profiling and inference if not skip_summary and not skip_profiling: - assert profile_job.wait().success - profile_data = profile_job.download_profile() + assert profile_job is not None and profile_job.wait().success + profile_data: Dict[str, Any] = profile_job.download_profile() # type: ignore print_profile_metrics_from_job(profile_job, profile_data) if not skip_summary and not skip_inferencing: torch_out = torch_inference(model, sample_inputs) - assert inference_job.wait().success - inference_result = inference_job.download_output_data() + assert inference_job is not None and inference_job.wait().success + inference_result: hub.client.DatasetEntries = inference_job.download_output_data() # type: ignore print_inference_metrics(inference_job, inference_result, torch_out) - print_on_target_demo_cmd(compile_job, Path(__file__).parent.resolve(), device) + if not skip_summary: + print_on_target_demo_cmd(compile_job, Path(__file__).parent.resolve(), device) return (compile_job, profile_job, inference_job) diff --git a/qai_hub_models/models/resnext50/info.yaml b/qai_hub_models/models/resnext50/info.yaml index a2e0e3c4..9ba350da 100644 --- a/qai_hub_models/models/resnext50/info.yaml +++ b/qai_hub_models/models/resnext50/info.yaml @@ -13,6 +13,7 @@ tags: research_paper: https://arxiv.org/abs/1611.05431 research_paper_title: Aggregated Residual Transformations for Deep Neural Networks license: https://github.com/pytorch/vision/blob/main/LICENSE +deploy_license: https://qaihub-public-assets.s3.us-west-2.amazonaws.com/qai-hub-models/Qualcomm+AI+Hub+Proprietary+License.pdf source_repo: https://github.com/pytorch/vision/blob/main/torchvision/models/resnet.py technical_details: Model checkpoint: Imagenet @@ -34,6 +35,7 @@ form_factors: has_static_banner: yes has_animated_banner: yes license_type: bsd-3-clause 
+deploy_license_type: AI Model Hub License dataset: - imagenet-1k - imagenet-22k diff --git a/qai_hub_models/models/resnext50/model.py b/qai_hub_models/models/resnext50/model.py index 94a5239f..5b6cd9db 100644 --- a/qai_hub_models/models/resnext50/model.py +++ b/qai_hub_models/models/resnext50/model.py @@ -14,6 +14,6 @@ class ResNeXt50(ImagenetClassifier): @classmethod - def from_pretrained(cls, weights: str = DEFAULT_WEIGHTS) -> ImagenetClassifier: + def from_pretrained(cls, weights: str = DEFAULT_WEIGHTS) -> ResNeXt50: net = tv_models.resnext50_32x4d(weights=weights) return cls(net) diff --git a/qai_hub_models/models/resnext50/perf.yaml b/qai_hub_models/models/resnext50/perf.yaml index 7801c578..806199a4 100644 --- a/qai_hub_models/models/resnext50/perf.yaml +++ b/qai_hub_models/models/resnext50/perf.yaml @@ -17,12 +17,15 @@ aggregated: - Samsung Galaxy S23 - Samsung Galaxy S23 Ultra - Samsung Galaxy S23+ + - Samsung Galaxy S24 + - Samsung Galaxy S24 Ultra - Samsung Galaxy Tab S8 - Xiaomi 12 - Xiaomi 12 Pro supported_chipsets: - Snapdragon® 8 Gen 1 - Snapdragon® 8 Gen 2 + - Snapdragon® 8 Gen 3 - Snapdragon® 888 models: - name: ResNeXt50 @@ -32,7 +35,7 @@ models: throughput: 472.14353163361665 estimated_peak_memory_range: min: 16384 - max: 2188056 + max: 2846256 primary_compute_unit: NPU precision: fp16 layer_info: @@ -40,14 +43,22 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 77 - job_id: jep2r94xg + job_id: j7gjxq1pd job_status: Passed + reference_device_info: + name: Samsung Galaxy S23 + os: '13' + form_factor: Phone + os_name: Android + manufacturer: Samsung + chipset: Snapdragon® 8 Gen 2 + timestamp: '2024-03-14T23:21:42.740361Z' torchscript_onnx_qnn: - inference_time: 2068.0 - throughput: 483.55899419729207 + inference_time: 2081.0 + throughput: 480.5382027871216 estimated_peak_memory_range: - min: 16384 - max: 67185584 + min: 12288 + max: 67945728 primary_compute_unit: NPU precision: fp16 layer_info: @@ -55,13 +66,43 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 125 - job_id: jqpyojqr5 + job_id: jygzen4g8 + job_status: Passed + - torchscript_onnx_tflite: + inference_time: 1551.0 + throughput: 644.7453255963894 + estimated_peak_memory_range: + min: 16384 + max: 161276560 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 77 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 77 + job_id: jlpe9y8gr job_status: Passed reference_device_info: - name: Samsung Galaxy S23 Ultra - os: '13' + name: Samsung Galaxy S24 + os: '14' form_factor: Phone os_name: Android manufacturer: Samsung - chipset: Snapdragon® 8 Gen 2 - timestamp: '2024-02-21T16:07:32.076107Z' + chipset: Snapdragon® 8 Gen 3 + timestamp: '2024-03-14T23:23:42.732818Z' + torchscript_onnx_qnn: + inference_time: 1518.0 + throughput: 658.7615283267457 + estimated_peak_memory_range: + min: 618496 + max: 57881488 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 125 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 125 + job_id: jz5wo44p1 + job_status: Passed diff --git a/qai_hub_models/models/resnext50/test.py b/qai_hub_models/models/resnext50/test.py index 923fbdeb..80736558 100644 --- a/qai_hub_models/models/resnext50/test.py +++ b/qai_hub_models/models/resnext50/test.py @@ -2,6 +2,8 @@ # Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. 
# SPDX-License-Identifier: BSD-3-Clause # --------------------------------------------------------------------- +import pytest + from qai_hub_models.models._shared.imagenet_classifier.test_utils import ( run_imagenet_classifier_test, run_imagenet_classifier_trace_test, @@ -14,6 +16,7 @@ def test_task(): run_imagenet_classifier_test(ResNeXt50.from_pretrained(), MODEL_ID) +@pytest.mark.trace def test_trace(): run_imagenet_classifier_trace_test(ResNeXt50.from_pretrained()) diff --git a/qai_hub_models/models/resnext50_quantized/README.md b/qai_hub_models/models/resnext50_quantized/README.md new file mode 100644 index 00000000..36fd8218 --- /dev/null +++ b/qai_hub_models/models/resnext50_quantized/README.md @@ -0,0 +1,54 @@ +[![Qualcomm® AI Hub Models](https://qaihub-public-assets.s3.us-west-2.amazonaws.com/qai-hub-models/quic-logo.jpg)](../../README.md) + + +# [ResNeXt50Quantized: Imagenet classifier and general purpose backbone](https://aihub.qualcomm.com/models/resnext50_quantized) + +ResNeXt50 is a machine learning model that can classify images from the Imagenet dataset. It can also be used as a backbone in building more complex models for specific use cases. + +This is based on the implementation of ResNeXt50Quantized found +[here](https://github.com/pytorch/vision/blob/main/torchvision/models/resnet.py). This repository contains scripts for optimized on-device +export suitable to run on Qualcomm® devices. More details on model performance +accross various devices, can be found [here](https://aihub.qualcomm.com/models/resnext50_quantized). + +[Sign up](https://myaccount.qualcomm.com/signup) for early access to run these models on +a hosted Qualcomm® device. + + +## Example & Usage + + +Once installed, run the following simple CLI demo: + +```bash +python -m qai_hub_models.models.resnext50_quantized.demo +``` +More details on the CLI tool can be found with the `--help` option. See +[demo.py](demo.py) for sample usage of the model including pre/post processing +scripts. Please refer to our [general instructions on using +models](../../../#getting-started) for more usage instructions. + +## Export for on-device deployment + +This repository contains export scripts that produce a model optimized for +on-device deployment. This can be run as follows: + +```bash +python -m qai_hub_models.models.resnext50_quantized.export +``` +Additional options are documented with the `--help` option. Note that the above +script requires access to Deployment instructions for Qualcomm® AI Hub. + +## License +- The license for the original implementation of ResNeXt50Quantized can be found + [here](https://github.com/pytorch/vision/blob/main/LICENSE). +- The license for the compiled assets for on-device deployment can be found [here]({deploy_license_url}) + +## References +* [Aggregated Residual Transformations for Deep Neural Networks](https://arxiv.org/abs/1611.05431) +* [Source Model Implementation](https://github.com/pytorch/vision/blob/main/torchvision/models/resnet.py) + +## Community +* Join [our AI Hub Slack community](https://join.slack.com/t/qualcomm-ai-hub/shared_invite/zt-2dgf95loi-CXHTDRR1rvPgQWPO~ZZZJg) to collaborate, post questions and learn more about on-device AI. +* For questions or feedback please [reach out to us](mailto:ai-hub-support@qti.qualcomm.com). 
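The README above covers the CLI entry points; the generated export script can also be driven from Python. A minimal sketch using the `export_model` signature added later in this patch (it requires `qai_hub_models` to be installed and, for the full flow, Qualcomm AI Hub access; without access the function falls back to `export_without_hub_access`). The `output_dir` value here is arbitrary:

```python
from qai_hub_models.models.resnext50_quantized.export import export_model

# Compile for the default TFLite runtime on a hosted Samsung Galaxy S23
# (the script's default device), but skip on-device inference to save time.
# With hub access this returns (compile_job, profile_job, inference_job),
# where skipped steps come back as None; the compiled .tflite asset is
# downloaded into output_dir unless skip_downloading is set.
jobs = export_model(
    device="Samsung Galaxy S23",
    skip_inferencing=True,
    output_dir="build/resnext50_quantized",
)
```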
+ + diff --git a/qai_hub_models/models/resnext50_quantized/__init__.py b/qai_hub_models/models/resnext50_quantized/__init__.py new file mode 100644 index 00000000..0e0b34c7 --- /dev/null +++ b/qai_hub_models/models/resnext50_quantized/__init__.py @@ -0,0 +1,10 @@ +# --------------------------------------------------------------------- +# Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. +# SPDX-License-Identifier: BSD-3-Clause +# --------------------------------------------------------------------- +from qai_hub_models.models._shared.imagenet_classifier.app import ( # noqa: F401 + ImagenetClassifierApp as App, +) + +from .model import MODEL_ID # noqa: F401 +from .model import ResNeXt50Quantizable as Model # noqa: F401 diff --git a/qai_hub_models/models/resnext50_quantized/conftest.py b/qai_hub_models/models/resnext50_quantized/conftest.py new file mode 100644 index 00000000..e26716d6 --- /dev/null +++ b/qai_hub_models/models/resnext50_quantized/conftest.py @@ -0,0 +1,24 @@ +# --------------------------------------------------------------------- +# Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. +# SPDX-License-Identifier: BSD-3-Clause +# --------------------------------------------------------------------- +# THIS FILE WAS AUTO-GENERATED. DO NOT EDIT MANUALLY. + +from unittest.mock import patch + +import pytest + +from qai_hub_models.models.resnext50_quantized import Model + + +@pytest.fixture(autouse=True) +def mock_from_pretrained(): + """ + Model.from_pretrained() can be slow. Invoke it once and cache it so all invocations + across all tests return the cached instance of the model. + """ + mock = patch( + "qai_hub_models.models.resnext50_quantized.Model.from_pretrained", + return_value=Model.from_pretrained(), + ) + mock.start() diff --git a/qai_hub_models/models/resnext50_quantized/demo.py b/qai_hub_models/models/resnext50_quantized/demo.py new file mode 100644 index 00000000..58d9d2b2 --- /dev/null +++ b/qai_hub_models/models/resnext50_quantized/demo.py @@ -0,0 +1,17 @@ +# --------------------------------------------------------------------- +# Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. +# SPDX-License-Identifier: BSD-3-Clause +# --------------------------------------------------------------------- +from qai_hub_models.models._shared.imagenet_classifier.demo import imagenet_demo +from qai_hub_models.models.resnext50_quantized.model import ( + MODEL_ID, + ResNeXt50Quantizable, +) + + +def main(is_test: bool = False): + imagenet_demo(ResNeXt50Quantizable, MODEL_ID, is_test) + + +if __name__ == "__main__": + main() diff --git a/qai_hub_models/models/resnext50_quantized/export.py b/qai_hub_models/models/resnext50_quantized/export.py new file mode 100644 index 00000000..7f4a6bcc --- /dev/null +++ b/qai_hub_models/models/resnext50_quantized/export.py @@ -0,0 +1,209 @@ +# --------------------------------------------------------------------- +# Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. +# SPDX-License-Identifier: BSD-3-Clause +# --------------------------------------------------------------------- +# THIS FILE WAS AUTO-GENERATED. DO NOT EDIT MANUALLY. 
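+# Usage sketch (illustrative): besides the CLI entry point
+# (`python -m qai_hub_models.models.resnext50_quantized.export`), the
+# export_model() function defined below can be called from Python, e.g.
+#
+#     from qai_hub_models.models.resnext50_quantized.export import export_model
+#
+#     compile_job, profile_job, inference_job = export_model(
+#         device="Samsung Galaxy S24",  # any device listed by hub.get_devices()
+#         skip_inferencing=True,
+#     )
+#
+# Without Qualcomm AI Hub access it falls back to export_without_hub_access()
+# and returns a list of strings rather than job handles.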
+ + +from __future__ import annotations + +import os +import warnings +from pathlib import Path +from typing import Any, Dict, List, Optional, Tuple, cast + +import qai_hub as hub + +from qai_hub_models.models.resnext50_quantized import Model +from qai_hub_models.utils.args import ( + export_parser, + get_input_spec_kwargs, + get_model_kwargs, +) +from qai_hub_models.utils.base_model import TargetRuntime +from qai_hub_models.utils.compare import torch_inference +from qai_hub_models.utils.printing import ( + print_inference_metrics, + print_on_target_demo_cmd, + print_profile_metrics_from_job, +) +from qai_hub_models.utils.qai_hub_helpers import ( + can_access_qualcomm_ai_hub, + export_without_hub_access, + transpose_channel_first_to_last, +) +from qai_hub_models.utils.qnn_helpers import get_qnn_inputs + + +def export_model( + device: str = "Samsung Galaxy S23", + skip_profiling: bool = False, + skip_inferencing: bool = False, + skip_downloading: bool = False, + skip_summary: bool = False, + output_dir: Optional[str] = None, + target_runtime: TargetRuntime = TargetRuntime.TFLITE, + compile_options: str = "", + profile_options: str = "", + **additional_model_kwargs, +) -> Tuple[hub.CompileJob, Optional[hub.ProfileJob], Optional[hub.InferenceJob]] | List[ + str +]: + """ + This function accomplishes 6 main tasks: + + 1. Instantiates a PyTorch model and converts it to a traced TorchScript format. + 2. Compiles the model to an asset that can be run on device. + 3. Profiles the model performance on real devices. + 4. Inferences the model on sample inputs. + 5. Downloads the model asset to the local directory. + 6. Summarizes the results from profiling and inference. + + Each of the last four steps can be optionally skipped using the input options. + + Parameters: + device: Device for which to export the model. + Full list of available devices can be found by running `hub.get_devices()`. + Defaults to DEFAULT_DEVICE if not specified. + skip_profiling: If set, skips profiling of compiled model on real devices. + skip_inferencing: If set, skips computing on-device outputs from sample data. + skip_downloading: If set, skips downloading of compiled model. + skip_summary: If set, skips waiting for and summarizing results + from profiling and inference. + output_dir: Directory to store generated assets (e.g. compiled model). + Defaults to `/build/`. + target_runtime: Which on-device runtime to target. Default is TFLite. + compile_options: Additional options to pass when submitting the compile job. + profile_options: Additional options to pass when submitting the profile job. + **additional_model_kwargs: Additional optional kwargs used to customize + `model_cls.from_pretrained` and `model.get_input_spec` + + Returns: + A 3-tuple of: + * A CompileJob object containing metadata about the compile job submitted to hub. + * A ProfileJob containing metadata about the profile job (None if profiling skipped). + * An InferenceJob containing metadata about the inference job (None if inferencing skipped). + """ + model_name = "resnext50_quantized" + output_path = Path(output_dir or Path.cwd() / "build" / model_name) + if not can_access_qualcomm_ai_hub(): + return export_without_hub_access( + "resnext50_quantized", + "ResNeXt50Quantized", + device, + skip_profiling, + skip_inferencing, + skip_downloading, + skip_summary, + output_path, + target_runtime, + compile_options, + profile_options, + ) + + # 1. 
Initialize PyTorch model + model = Model.from_pretrained(**get_model_kwargs(Model, additional_model_kwargs)) + input_spec = model.get_input_spec( + **get_input_spec_kwargs(model, additional_model_kwargs) + ) + + # Trace the model + source_model = model.convert_to_hub_source_model( + target_runtime, output_path, input_spec + ) + if target_runtime == TargetRuntime.TFLITE: + quant_calibration_data = None + else: + quant_calibration_data = model.get_calibration_data(target_runtime, input_spec) + + # 2. Compile the model to an on-device asset + model_compile_options = model.get_hub_compile_options( + target_runtime, compile_options + " --force_channel_last_input image_tensor" + ) + print(f"Optimizing model {model_name} to run on-device") + submitted_compile_job = hub.submit_compile_job( + model=source_model, + input_specs=input_spec, + device=hub.Device(device), + name=model_name, + calibration_data=quant_calibration_data, + options=model_compile_options, + ) + compile_job = cast(hub.client.CompileJob, submitted_compile_job) + + # 3. Profile the model asset on real devices + profile_job: Optional[hub.client.ProfileJob] = None + if not skip_profiling: + profile_options_all = model.get_hub_profile_options( + target_runtime, profile_options + ) + print(f"Profiling model {model_name} on a hosted device.") + submitted_profile_job = hub.submit_profile_job( + model=compile_job.get_target_model(), + device=hub.Device(device), + name=model_name, + options=profile_options_all, + ) + profile_job = cast(hub.client.ProfileJob, submitted_profile_job) + + # 4. Run inference on-device with sample inputs + inference_job: Optional[hub.client.InferenceJob] = None + if not skip_inferencing: + profile_options_all = model.get_hub_profile_options( + target_runtime, profile_options + ) + print( + f"Running inference for {model_name} on a hosted device with example inputs." + ) + sample_inputs = model.sample_inputs(input_spec) + hub_inputs = sample_inputs + if target_runtime == TargetRuntime.QNN: + hub_inputs = get_qnn_inputs(compile_job, sample_inputs) + # Convert inputs from channel first to channel last + hub_inputs = transpose_channel_first_to_last( + "image_tensor", sample_inputs, target_runtime + ) + submitted_inference_job = hub.submit_inference_job( + model=compile_job.get_target_model(), + inputs=hub_inputs, + device=hub.Device(device), + name=model_name, + options=profile_options_all, + ) + inference_job = cast(hub.client.InferenceJob, submitted_inference_job) + + # 5. Download the model asset to a local file + if not skip_downloading: + os.makedirs(output_path, exist_ok=True) + target_model: hub.Model = compile_job.get_target_model() # type: ignore + target_model.download(str(output_path / f"{model_name}.tflite")) + + # 6. 
Summarize the results from profiling and inference + if not skip_summary and not skip_profiling: + assert profile_job is not None and profile_job.wait().success + profile_data: Dict[str, Any] = profile_job.download_profile() # type: ignore + print_profile_metrics_from_job(profile_job, profile_data) + + if not skip_summary and not skip_inferencing: + torch_out = torch_inference(model, sample_inputs) + assert inference_job is not None and inference_job.wait().success + inference_result: hub.client.DatasetEntries = inference_job.download_output_data() # type: ignore + print_inference_metrics( + inference_job, inference_result, torch_out, metrics="psnr,top1,top5" + ) + + if not skip_summary: + print_on_target_demo_cmd(compile_job, Path(__file__).parent.resolve(), device) + + return (compile_job, profile_job, inference_job) + + +def main(): + warnings.filterwarnings("ignore") + parser = export_parser(model_cls=Model, supports_qnn=False) + args = parser.parse_args() + export_model(**vars(args)) + + +if __name__ == "__main__": + main() diff --git a/qai_hub_models/models/resnext50_quantized/info.yaml b/qai_hub_models/models/resnext50_quantized/info.yaml new file mode 100644 index 00000000..69be8e37 --- /dev/null +++ b/qai_hub_models/models/resnext50_quantized/info.yaml @@ -0,0 +1,43 @@ +name: ResNeXt50Quantized +# id must match with the model dir name in qai_hub_models +id: resnext50_quantized +status: public +headline: Imagenet classifier and general purpose backbone. +domain: Computer Vision +description: ResNeXt50 is a machine learning model that can classify images from the + Imagenet dataset. It can also be used as a backbone in building more complex models + for specific use cases. +use_case: Image Classification +tags: + - backbone + - quantized +research_paper: https://arxiv.org/abs/1611.05431 +research_paper_title: Aggregated Residual Transformations for Deep Neural Networks +license: https://github.com/pytorch/vision/blob/main/LICENSE +deploy_license: https://qaihub-public-assets.s3.us-west-2.amazonaws.com/qai-hub-models/Qualcomm+AI+Hub+Proprietary+License.pdf +source_repo: https://github.com/pytorch/vision/blob/main/torchvision/models/resnet.py +technical_details: + Model checkpoint: Imagenet + Input resolution: 224x224 + Number of parameters: 88.7M + Model size: 87.3 MB +applicable_scenarios: + - Medical Imaging + - Anomaly Detection + - Inventory Management +related_models: + - mobilenet_v2 + - densenet121 + - googlenet +form_factors: + - Phone + - Tablet + - IoT + - XR +has_static_banner: yes +has_animated_banner: yes +license_type: bsd-3-clause +deploy_license_type: AI Model Hub License +dataset: + - imagenet-1k + - imagenet-22k diff --git a/qai_hub_models/models/resnext50_quantized/model.py b/qai_hub_models/models/resnext50_quantized/model.py new file mode 100644 index 00000000..dca50076 --- /dev/null +++ b/qai_hub_models/models/resnext50_quantized/model.py @@ -0,0 +1,93 @@ +# --------------------------------------------------------------------- +# Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. +# SPDX-License-Identifier: BSD-3-Clause +# --------------------------------------------------------------------- +from __future__ import annotations + +# isort: off +# This verifies aimet is installed, and this must be included first. 
+from qai_hub_models.utils.quantization_aimet import ( + AIMETQuantizableMixin, +) + +# isort: on + +import torch +from aimet_torch.cross_layer_equalization import equalize_model +from aimet_torch.model_preparer import prepare_model +from aimet_torch.quantsim import QuantizationSimModel, load_encodings_to_sim + +from qai_hub_models.models.resnext50.model import ResNeXt50 +from qai_hub_models.utils.aimet.config_loader import get_default_aimet_config +from qai_hub_models.utils.asset_loaders import CachedWebModelAsset +from qai_hub_models.utils.base_model import SourceModelFormat, TargetRuntime + +MODEL_ID = __name__.split(".")[-2] +MODEL_ASSET_VERSION = 1 +DEFAULT_ENCODINGS = "resnext50_quantized_encodings.json" + + +class ResNeXt50Quantizable(AIMETQuantizableMixin, ResNeXt50): + """ResNeXt50 with post train quantization support. + + Supports only 8 bit weights and activations, and only loads pre-quantized checkpoints. + Support for quantizing using your own weights & data will come at a later date.""" + + def __init__( + self, + sim_model: QuantizationSimModel, + ) -> None: + ResNeXt50.__init__(self, sim_model.model) + AIMETQuantizableMixin.__init__( + self, + sim_model, + ) + + def preferred_hub_source_model_format( + self, target_runtime: TargetRuntime + ) -> SourceModelFormat: + return SourceModelFormat.ONNX + + @classmethod + def from_pretrained( + cls, + aimet_encodings: str | None = "DEFAULT", + ) -> "ResNeXt50Quantizable": + """ + Parameters: + aimet_encodings: + if "DEFAULT": Loads the model with aimet encodings calibrated on imagenette. + elif None: Doesn't load any encodings. Used when computing encodings. + else: Interprets as a filepath and loads the encodings stored there. + """ + model = ResNeXt50.from_pretrained() + input_shape = cls.get_input_spec()["image_tensor"][0] + + model = prepare_model(model) + equalize_model(model, input_shape) + sim = QuantizationSimModel( + model, + quant_scheme="tf_enhanced", + default_param_bw=8, + default_output_bw=8, + config_file=get_default_aimet_config(), + dummy_input=torch.rand(input_shape), + ) + + if aimet_encodings: + if aimet_encodings == "DEFAULT": + aimet_encodings = CachedWebModelAsset.from_asset_store( + MODEL_ID, MODEL_ASSET_VERSION, DEFAULT_ENCODINGS + ).fetch() + load_encodings_to_sim(sim, aimet_encodings) + + sim.model.eval() + return cls(sim) + + def get_hub_compile_options( + self, target_runtime: TargetRuntime, other_compile_options: str = "" + ) -> str: + compile_options = super().get_hub_compile_options( + target_runtime, other_compile_options + ) + return compile_options + " --quantize_full_type int8 --quantize_io" diff --git a/qai_hub_models/models/whisper_asr/perf.yaml b/qai_hub_models/models/resnext50_quantized/perf.yaml similarity index 71% rename from qai_hub_models/models/whisper_asr/perf.yaml rename to qai_hub_models/models/resnext50_quantized/perf.yaml index f8e81783..67cbf162 100644 --- a/qai_hub_models/models/whisper_asr/perf.yaml +++ b/qai_hub_models/models/resnext50_quantized/perf.yaml @@ -17,31 +17,42 @@ aggregated: - Samsung Galaxy S23 - Samsung Galaxy S23 Ultra - Samsung Galaxy S23+ + - Samsung Galaxy S24 + - Samsung Galaxy S24 Ultra - Samsung Galaxy Tab S8 - Xiaomi 12 - Xiaomi 12 Pro supported_chipsets: - Snapdragon® 8 Gen 1 - Snapdragon® 8 Gen 2 + - Snapdragon® 8 Gen 3 - Snapdragon® 888 models: -- name: WhisperEncoder +- name: ResNeXt50Quantized performance_metrics: - torchscript_onnx_tflite: - inference_time: 68918.0 - throughput: 14.50999738820047 + inference_time: 874.0 + throughput: 
1144.1647597254005 estimated_peak_memory_range: - min: 18612224 - max: 67240168 - primary_compute_unit: GPU - precision: fp16 + min: 12288 + max: 1920376 + primary_compute_unit: NPU + precision: int8 layer_info: - layers_on_npu: 0 - layers_on_gpu: 216 + layers_on_npu: 78 + layers_on_gpu: 0 layers_on_cpu: 0 - total_layers: 216 - job_id: j1p3z16z5 + total_layers: 78 + job_id: jegn27jgo job_status: Passed + reference_device_info: + name: Samsung Galaxy S23 + os: '13' + form_factor: Phone + os_name: Android + manufacturer: Samsung + chipset: Snapdragon® 8 Gen 2 + timestamp: '2024-03-14T23:10:31.822073Z' torchscript_onnx_qnn: inference_time: 'null' throughput: 'null' @@ -57,31 +68,29 @@ models: total_layers: 0 job_id: '' job_status: Skipped - reference_device_info: - name: Samsung Galaxy S23 Ultra - os: '13' - form_factor: Phone - os_name: Android - manufacturer: Samsung - chipset: Snapdragon® 8 Gen 2 - timestamp: '2024-02-21T16:31:20.996693Z' -- name: WhisperDecoder - performance_metrics: - torchscript_onnx_tflite: - inference_time: 7924.0 - throughput: 126.19888944977284 + inference_time: 656.0 + throughput: 1524.3902439024391 estimated_peak_memory_range: - min: 3014656 - max: 5380072 + min: 12288 + max: 96222112 primary_compute_unit: NPU - precision: fp16 + precision: int8 layer_info: - layers_on_npu: 293 + layers_on_npu: 78 layers_on_gpu: 0 layers_on_cpu: 0 - total_layers: 293 - job_id: jwgoln8dg + total_layers: 78 + job_id: joprknk50 job_status: Passed + reference_device_info: + name: Samsung Galaxy S24 + os: '14' + form_factor: Phone + os_name: Android + manufacturer: Samsung + chipset: Snapdragon® 8 Gen 3 + timestamp: '2024-03-14T23:10:31.822087Z' torchscript_onnx_qnn: inference_time: 'null' throughput: 'null' @@ -97,11 +106,3 @@ models: total_layers: 0 job_id: '' job_status: Skipped - reference_device_info: - name: Samsung Galaxy S23 Ultra - os: '13' - form_factor: Phone - os_name: Android - manufacturer: Samsung - chipset: Snapdragon® 8 Gen 2 - timestamp: '2024-02-21T16:35:36.297844Z' diff --git a/qai_hub_models/models/resnext50_quantized/test.py b/qai_hub_models/models/resnext50_quantized/test.py new file mode 100644 index 00000000..4cd1dbbd --- /dev/null +++ b/qai_hub_models/models/resnext50_quantized/test.py @@ -0,0 +1,30 @@ +# --------------------------------------------------------------------- +# Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. +# SPDX-License-Identifier: BSD-3-Clause +# --------------------------------------------------------------------- +from qai_hub_models.models._shared.imagenet_classifier.test_utils import ( + run_imagenet_classifier_test, +) +from qai_hub_models.models.resnext50_quantized.demo import main as demo_main +from qai_hub_models.models.resnext50_quantized.model import ( + MODEL_ASSET_VERSION, + MODEL_ID, + ResNeXt50Quantizable, +) + + +def test_task(): + run_imagenet_classifier_test( + ResNeXt50Quantizable.from_pretrained(), + MODEL_ID, + probability_threshold=0.46, + diff_tol=0.005, + rtol=0.02, + atol=0.2, + asset_version=MODEL_ASSET_VERSION, + ) + + +def test_demo(): + # Verify demo does not crash + demo_main(is_test=True) diff --git a/qai_hub_models/models/sam/README.md b/qai_hub_models/models/sam/README.md index d5b814a1..937df382 100644 --- a/qai_hub_models/models/sam/README.md +++ b/qai_hub_models/models/sam/README.md @@ -10,7 +10,7 @@ This is based on the implementation of Segment-Anything-Model found export suitable to run on Qualcomm® devices. 
More details on model performance accross various devices, can be found [here](https://aihub.qualcomm.com/models/sam). -[Sign up](https://aihub.qualcomm.com/) for early access to run these models on +[Sign up](https://myaccount.qualcomm.com/signup) for early access to run these models on a hosted Qualcomm® device. @@ -30,7 +30,7 @@ python -m qai_hub_models.models.sam.demo More details on the CLI tool can be found with the `--help` option. See [demo.py](demo.py) for sample usage of the model including pre/post processing scripts. Please refer to our [general instructions on using -models](../../#qai-hub-models) for more usage instructions. +models](../../../#getting-started) for more usage instructions. ## Export for on-device deployment @@ -46,7 +46,7 @@ script requires access to Deployment instructions for Qualcomm® AI Hub. ## License - The license for the original implementation of Segment-Anything-Model can be found [here](https://github.com/facebookresearch/segment-anything/blob/main/LICENSE). -- The license for the compiled assets for on-device deployment can be found [here](https://qaihub-public-assets.s3.us-west-2.amazonaws.com/qai-hub-models/Qualcomm+AI+Hub+Proprietary+License.pdf). +- The license for the compiled assets for on-device deployment can be found [here]({deploy_license_url}) ## References * [Segment Anything](https://arxiv.org/abs/2304.02643) diff --git a/qai_hub_models/models/sam/conftest.py b/qai_hub_models/models/sam/conftest.py new file mode 100644 index 00000000..7f6e737d --- /dev/null +++ b/qai_hub_models/models/sam/conftest.py @@ -0,0 +1,28 @@ +# --------------------------------------------------------------------- +# Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. +# SPDX-License-Identifier: BSD-3-Clause +# --------------------------------------------------------------------- +# THIS FILE WAS AUTO-GENERATED. DO NOT EDIT MANUALLY. + +from unittest.mock import patch + +import pytest + +from qai_hub_models.models.sam import Model +from qai_hub_models.utils.testing import skip_clone_repo_check + + +@pytest.fixture(autouse=True) +@skip_clone_repo_check +def mock_from_pretrained(): + """ + Model.from_pretrained() can be slow. Invoke it once and cache it so all invocations + across all tests return the cached instance of the model. 
+ """ + mock = patch( + "qai_hub_models.models.sam.Model.from_pretrained", + return_value=Model.from_pretrained( + model_type="vit_b", + ), + ) + mock.start() diff --git a/qai_hub_models/models/sam/export.py b/qai_hub_models/models/sam/export.py index 0ebb9f97..a5ed59d6 100644 --- a/qai_hub_models/models/sam/export.py +++ b/qai_hub_models/models/sam/export.py @@ -10,7 +10,7 @@ import os import warnings from pathlib import Path -from typing import List, Mapping, Optional, Tuple +from typing import Any, Dict, List, Mapping, Optional, Tuple, cast import qai_hub as hub import torch @@ -18,7 +18,7 @@ from qai_hub_models.models.sam import Model from qai_hub_models.utils.args import export_parser, get_model_kwargs -from qai_hub_models.utils.base_model import TargetRuntime +from qai_hub_models.utils.base_model import BaseModel, TargetRuntime from qai_hub_models.utils.compare import torch_inference from qai_hub_models.utils.input_spec import make_torch_inputs from qai_hub_models.utils.printing import ( @@ -91,9 +91,9 @@ def export_model( output_path = Path(output_dir or Path.cwd() / "build" / model_name) component_arg = components components = components or DEFAULT_COMPONENTS - for component in components: - if component not in ALL_COMPONENTS: - raise ValueError(f"Invalid component {component}.") + for component_name in components: + if component_name not in ALL_COMPONENTS: + raise ValueError(f"Invalid component {component_name}.") if not can_access_qualcomm_ai_hub(): return export_without_hub_access( "sam", @@ -112,75 +112,90 @@ def export_model( # 1. Initialize PyTorch model model = Model.from_pretrained(**get_model_kwargs(Model, additional_model_kwargs)) - components_dict = {} + components_dict: Dict[str, BaseModel] = {} if "SAMDecoder" in components: - components_dict["SAMDecoder"] = model.get_sam_decoder() + components_dict["SAMDecoder"] = model.get_sam_decoder() # type: ignore if "SAMEncoder" in components: - components_dict["SAMEncoder"] = model.get_sam_encoder() + components_dict["SAMEncoder"] = model.get_sam_encoder() # type: ignore - compile_jobs = {} + compile_jobs: Dict[str, hub.client.CompileJob] = {} for component_name, component in components_dict.items(): # Trace the model input_spec = component.get_input_spec() - source_model = torch.jit.trace(component, make_torch_inputs(input_spec)) + source_model = torch.jit.trace( + component.to("cpu"), make_torch_inputs(input_spec) + ) source_model = optimize_for_mobile( source_model, optimization_blocklist={ - MobileOptimizerType.HOIST_CONV_PACKED_PARAMS, - MobileOptimizerType.INSERT_FOLD_PREPACK_OPS, - MobileOptimizerType.CONV_BN_FUSION, + MobileOptimizerType.HOIST_CONV_PACKED_PARAMS, # type: ignore + MobileOptimizerType.INSERT_FOLD_PREPACK_OPS, # type: ignore + MobileOptimizerType.CONV_BN_FUSION, # type: ignore }, ) # 2. 
Compile the models to an on-device asset model_compile_options = component.get_hub_compile_options( - target_runtime, - compile_options - + " --force_channel_last_input image --force_channel_last_output output_0", + target_runtime, compile_options ) - print(f"Optimizing model {component_name} to run on-device.") - compile_jobs[component_name] = hub.submit_compile_job( + print(f"Optimizing model {component_name} to run on-device") + submitted_compile_job = hub.submit_compile_job( model=source_model, input_specs=input_spec, device=hub.Device(device), - name=f"{component_name}", + name=f"{model_name}_{component_name}", options=model_compile_options, ) + compile_jobs[component_name] = cast( + hub.client.CompileJob, submitted_compile_job + ) # 3. Profile the model assets on real devices - profile_jobs = {} + profile_jobs: Dict[str, hub.client.ProfileJob] = {} if not skip_profiling: for component_name in components: + profile_options_all = components_dict[ + component_name + ].get_hub_profile_options(target_runtime, profile_options) print(f"Profiling model {component_name} on a hosted device.") - profile_jobs[component_name] = hub.submit_profile_job( + submitted_profile_job = hub.submit_profile_job( model=compile_jobs[component_name].get_target_model(), device=hub.Device(device), - name=f"{component_name}", - options=profile_options, + name=f"{model_name}_{component_name}", + options=profile_options_all, + ) + profile_jobs[component_name] = cast( + hub.client.ProfileJob, submitted_profile_job ) # 4. Run inference on-device with sample inputs - inference_jobs = {} + inference_jobs: Dict[str, hub.client.InferenceJob] = {} if not skip_inferencing: for component_name in components: print( f"Running inference for {component_name} on a hosted device with example inputs." ) + profile_options_all = components_dict[ + component_name + ].get_hub_profile_options(target_runtime, profile_options) sample_inputs = components_dict[component_name].sample_inputs() - inference_jobs[component_name] = hub.submit_inference_job( + submitted_inference_job = hub.submit_inference_job( model=compile_jobs[component_name].get_target_model(), inputs=sample_inputs, device=hub.Device(device), - name=f"{component_name}", - options=profile_options, + name=f"{model_name}_{component_name}", + options=profile_options_all, + ) + inference_jobs[component_name] = cast( + hub.client.InferenceJob, submitted_inference_job ) # 5. 
Download the model assets to a local file if not skip_downloading: os.makedirs(output_path, exist_ok=True) for component_name, compile_job in compile_jobs.items(): - target_model = compile_job.get_target_model() + target_model: hub.Model = compile_job.get_target_model() # type: ignore target_model.download( str(output_path / f"{model_name}_{component_name}.tflite") ) @@ -189,8 +204,8 @@ def export_model( if not skip_summary and not skip_profiling: for component_name in components: profile_job = profile_jobs[component_name] - assert profile_job.wait().success - profile_data = profile_job.download_profile() + assert profile_job is not None and profile_job.wait().success + profile_data: Dict[str, Any] = profile_job.download_profile() # type: ignore print_profile_metrics_from_job(profile_job, profile_data) if not skip_summary and not skip_inferencing: @@ -198,8 +213,8 @@ def export_model( inference_job = inference_jobs[component_name] sample_inputs = components_dict[component_name].sample_inputs() torch_out = torch_inference(components_dict[component_name], sample_inputs) - assert inference_job.wait().success - inference_result = inference_job.download_output_data() + assert inference_job is not None and inference_job.wait().success + inference_result: hub.client.DatasetEntries = inference_job.download_output_data() # type: ignore print_inference_metrics(inference_job, inference_result, torch_out) return { diff --git a/qai_hub_models/models/sam/info.yaml b/qai_hub_models/models/sam/info.yaml index 59b6b5c5..2be861b5 100644 --- a/qai_hub_models/models/sam/info.yaml +++ b/qai_hub_models/models/sam/info.yaml @@ -15,6 +15,7 @@ tags: research_paper: https://arxiv.org/abs/2304.02643 research_paper_title: Segment Anything license: https://github.com/facebookresearch/segment-anything/blob/main/LICENSE +deploy_license: https://qaihub-public-assets.s3.us-west-2.amazonaws.com/qai-hub-models/Qualcomm+AI+Hub+Proprietary+License.pdf source_repo: https://github.com/facebookresearch/segment-anything technical_details: Model checkpoint: vit_l @@ -32,4 +33,5 @@ related_models: [] has_static_banner: yes has_animated_banner: yes license_type: apache-2.0 +deploy_license_type: AI Model Hub License dataset: [] diff --git a/qai_hub_models/models/sam/model.py b/qai_hub_models/models/sam/model.py index 1e9561c8..e6fbd483 100644 --- a/qai_hub_models/models/sam/model.py +++ b/qai_hub_models/models/sam/model.py @@ -117,20 +117,41 @@ def forward(self, image: torch.Tensor) -> torch.Tensor: """ return self.sam.image_encoder(image) - def get_input_spec( + def _get_input_spec_for_model_instance( self, - height: int = 720, - width: int = 1280, + batch_size: int = 1, + num_channels: int = 3, + ) -> InputSpec: + """ + Override for model.get_input_spec() when called on instances of this class. + + The initializer for BaseModel will automatically override get_input_spec + with this function when the class is instantiated. + """ + return self.__class__.get_input_spec( + batch_size, + num_channels, + self.sam.image_encoder.img_size, + self.sam.image_encoder.img_size, + ) + + @staticmethod + def get_input_spec( + batch_size: int = 1, + num_channels: int = 3, + encoder_img_height: int = 1024, # self.sam.image_encoder.img_size[0] + encoder_img_width: int = 1024, # self.sam.image_encoder.img_size[1] ) -> InputSpec: # Get the input specification ordered (name -> (shape, type)) pairs for this model. # # This can be used with the qai_hub python API to declare # the model input specification upon submitting a profile job. 
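+        # With the default arguments this resolves to, e.g.:
+        #   {"image": ((1, 3, 1024, 1024), "float32")}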
- - preprocessed_image = self.preprocess_input_image( - np.ones((height, width, 3), dtype=np.uint8) - ) - return {"image": (preprocessed_image.shape, "float32")} + return { + "image": ( + (batch_size, num_channels, encoder_img_height, encoder_img_width), + "float32", + ) + } def preprocess_input_image(self, input_image: np.ndarray): """Transform input image to work with SAM encoder""" @@ -206,16 +227,35 @@ def forward( image_embeddings, point_coords, point_labels, mask_input, has_mask_input ) - def get_input_spec( + def _get_input_spec_for_model_instance( self, - num_of_points=1, + num_of_points: int = 1, + ) -> InputSpec: + """ + Override for model.get_input_spec() when called on instances of this class. + + The initializer for BaseModel will automatically override get_input_spec + with this function when the class is instantiated. + """ + return self.__class__.get_input_spec( + num_of_points, + self.sam.prompt_encoder.embed_dim, + self.sam.prompt_encoder.image_embedding_size[0], + self.sam.prompt_encoder.image_embedding_size[1], + ) + + @staticmethod + def get_input_spec( + num_of_points: int = 1, + embed_dim: int = 256, + image_embedding_height: int = 64, + image_embedding_width: int = 64, ) -> InputSpec: # Get the input specification ordered (name -> (shape, type)) pairs for this model. # # This can be used with the qai_hub python API to declare # the model input specification upon submitting a profile job. - embed_dim = self.sam.prompt_encoder.embed_dim - embed_size = self.sam.prompt_encoder.image_embedding_size + embed_size = (image_embedding_height, image_embedding_width) mask_input_size = [4 * x for x in embed_size] input_spec = { diff --git a/qai_hub_models/models/sam/perf.yaml b/qai_hub_models/models/sam/perf.yaml index 7aa44891..6ea06c1e 100644 --- a/qai_hub_models/models/sam/perf.yaml +++ b/qai_hub_models/models/sam/perf.yaml @@ -2,6 +2,12 @@ aggregated: supported_oses: - Android supported_devices: + - Google Pixel 3 + - Google Pixel 3a + - Google Pixel 3a XL + - Google Pixel 4 + - Google Pixel 4a + - Google Pixel 5a 5G - Samsung Galaxy S21 - Samsung Galaxy S21 Ultra - Samsung Galaxy S21+ @@ -11,31 +17,42 @@ aggregated: - Samsung Galaxy S23 - Samsung Galaxy S23 Ultra - Samsung Galaxy S23+ + - Samsung Galaxy S24 + - Samsung Galaxy S24 Ultra - Samsung Galaxy Tab S8 - Xiaomi 12 - Xiaomi 12 Pro supported_chipsets: - Snapdragon® 8 Gen 1 - Snapdragon® 8 Gen 2 + - Snapdragon® 8 Gen 3 - Snapdragon® 888 models: - name: SAMDecoder performance_metrics: - torchscript_onnx_tflite: - inference_time: 16696.0 - throughput: 59.89458552946814 + inference_time: 16761.0 + throughput: 59.66231131794046 estimated_peak_memory_range: - min: 71995392 - max: 131856168 + min: 42115072 + max: 92806968 primary_compute_unit: GPU precision: fp16 layer_info: layers_on_npu: 0 layers_on_gpu: 356 - layers_on_cpu: 8 - total_layers: 364 - job_id: j1pvlewr5 + layers_on_cpu: 9 + total_layers: 365 + job_id: jmg9vkm57 job_status: Passed + reference_device_info: + name: Samsung Galaxy S23 + os: '13' + form_factor: Phone + os_name: Android + manufacturer: Samsung + chipset: Snapdragon® 8 Gen 2 + timestamp: '2024-03-14T23:10:36.748428Z' torchscript_onnx_qnn: inference_time: 'null' throughput: 'null' @@ -51,11 +68,41 @@ models: total_layers: 0 job_id: '' job_status: Skipped + - torchscript_onnx_tflite: + inference_time: 13794.0 + throughput: 72.4952878062926 + estimated_peak_memory_range: + min: 41951232 + max: 94062064 + primary_compute_unit: GPU + precision: fp16 + layer_info: + layers_on_npu: 0 + layers_on_gpu: 356 + 
layers_on_cpu: 9 + total_layers: 365 + job_id: jnp107n5q + job_status: Passed reference_device_info: - name: Samsung Galaxy S23 Ultra - os: '13' + name: Samsung Galaxy S24 + os: '14' form_factor: Phone os_name: Android manufacturer: Samsung - chipset: Snapdragon® 8 Gen 2 - timestamp: '2024-02-22T17:43:03.980523Z' + chipset: Snapdragon® 8 Gen 3 + timestamp: '2024-03-14T23:10:36.748439Z' + torchscript_onnx_qnn: + inference_time: 'null' + throughput: 'null' + estimated_peak_memory_range: + min: 0 + max: 0 + primary_compute_unit: 'null' + precision: 'null' + layer_info: + layers_on_npu: 0 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 0 + job_id: '' + job_status: Skipped diff --git a/qai_hub_models/models/sam/requirements.txt b/qai_hub_models/models/sam/requirements.txt index 116e68bc..c0faedcb 100644 --- a/qai_hub_models/models/sam/requirements.txt +++ b/qai_hub_models/models/sam/requirements.txt @@ -1,4 +1,2 @@ -matplotlib -opencv_python -pycocotools -requests +matplotlib==3.7.4 +pycocotools==2.0.7 diff --git a/qai_hub_models/models/sam/test.py b/qai_hub_models/models/sam/test.py index 640a4286..fd862267 100644 --- a/qai_hub_models/models/sam/test.py +++ b/qai_hub_models/models/sam/test.py @@ -6,7 +6,7 @@ import pytest import torch -from qai_hub_models.models.sam import App +from qai_hub_models.models.sam.app import SAMApp from qai_hub_models.models.sam.demo import IMAGE_ADDRESS from qai_hub_models.models.sam.demo import main as demo_main from qai_hub_models.models.sam.model import SMALL_MODEL_TYPE, SAMQAIHMWrapper @@ -36,7 +36,7 @@ def test_e2e_numerical( sam_predictor.set_image(input_image_data) # QAIHM SAMApp for segmentation - sam_app = App(sam_wrapper) + sam_app = SAMApp(sam_wrapper) # Prepare image for segmentation sam_app.prepare(input_image_data) diff --git a/qai_hub_models/models/sesr_m5/README.md b/qai_hub_models/models/sesr_m5/README.md index cc0a70a9..37483cd4 100644 --- a/qai_hub_models/models/sesr_m5/README.md +++ b/qai_hub_models/models/sesr_m5/README.md @@ -10,7 +10,7 @@ This is based on the implementation of SESR-M5 found export suitable to run on Qualcomm® devices. More details on model performance accross various devices, can be found [here](https://aihub.qualcomm.com/models/sesr_m5). -[Sign up](https://aihub.qualcomm.com/) for early access to run these models on +[Sign up](https://myaccount.qualcomm.com/signup) for early access to run these models on a hosted Qualcomm® device. @@ -25,7 +25,7 @@ python -m qai_hub_models.models.sesr_m5.demo More details on the CLI tool can be found with the `--help` option. See [demo.py](demo.py) for sample usage of the model including pre/post processing scripts. Please refer to our [general instructions on using -models](../../#qai-hub-models) for more usage instructions. +models](../../../#getting-started) for more usage instructions. ## Export for on-device deployment @@ -41,7 +41,7 @@ script requires access to Deployment instructions for Qualcomm® AI Hub. ## License - The license for the original implementation of SESR-M5 can be found [here](https://github.com/quic/aimet-model-zoo/blob/develop/LICENSE.pdf). -- The license for the compiled assets for on-device deployment can be found [here](https://qaihub-public-assets.s3.us-west-2.amazonaws.com/qai-hub-models/Qualcomm+AI+Hub+Proprietary+License.pdf). 
+- The license for the compiled assets for on-device deployment can be found [here]({deploy_license_url}) ## References * [Collapsible Linear Blocks for Super-Efficient Super Resolution](https://arxiv.org/abs/2103.09404) diff --git a/qai_hub_models/models/sesr_m5/conftest.py b/qai_hub_models/models/sesr_m5/conftest.py new file mode 100644 index 00000000..238f114e --- /dev/null +++ b/qai_hub_models/models/sesr_m5/conftest.py @@ -0,0 +1,26 @@ +# --------------------------------------------------------------------- +# Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. +# SPDX-License-Identifier: BSD-3-Clause +# --------------------------------------------------------------------- +# THIS FILE WAS AUTO-GENERATED. DO NOT EDIT MANUALLY. + +from unittest.mock import patch + +import pytest + +from qai_hub_models.models.sesr_m5 import Model +from qai_hub_models.utils.testing import skip_clone_repo_check + + +@pytest.fixture(autouse=True) +@skip_clone_repo_check +def mock_from_pretrained(): + """ + Model.from_pretrained() can be slow. Invoke it once and cache it so all invocations + across all tests return the cached instance of the model. + """ + mock = patch( + "qai_hub_models.models.sesr_m5.Model.from_pretrained", + return_value=Model.from_pretrained(), + ) + mock.start() diff --git a/qai_hub_models/models/sesr_m5/demo.py b/qai_hub_models/models/sesr_m5/demo.py index d8b0c0f9..312bbab4 100644 --- a/qai_hub_models/models/sesr_m5/demo.py +++ b/qai_hub_models/models/sesr_m5/demo.py @@ -16,6 +16,7 @@ def main(is_test: bool = False): super_resolution_demo( model_cls=SESR_M5, + model_id=MODEL_ID, default_image=IMAGE_ADDRESS, is_test=is_test, ) diff --git a/qai_hub_models/models/sesr_m5/export.py b/qai_hub_models/models/sesr_m5/export.py index c857a7c6..51517850 100644 --- a/qai_hub_models/models/sesr_m5/export.py +++ b/qai_hub_models/models/sesr_m5/export.py @@ -10,7 +10,7 @@ import os import warnings from pathlib import Path -from typing import List, Optional, Tuple +from typing import Any, Dict, List, Optional, Tuple, cast import qai_hub as hub import torch @@ -109,7 +109,7 @@ def export_model( ) # Trace the model - source_model = torch.jit.trace(model, make_torch_inputs(input_spec)) + source_model = torch.jit.trace(model.to("cpu"), make_torch_inputs(input_spec)) # 2. Compile the model to an on-device asset model_compile_options = model.get_hub_compile_options( @@ -118,29 +118,37 @@ def export_model( + " --force_channel_last_input image" + " --force_channel_last_output output_0", ) - print(f"Optimizing model {model_name} to run on-device.") - compile_job = hub.submit_compile_job( + print(f"Optimizing model {model_name} to run on-device") + submitted_compile_job = hub.submit_compile_job( model=source_model, input_specs=input_spec, device=hub.Device(device), name=model_name, options=model_compile_options, ) + compile_job = cast(hub.client.CompileJob, submitted_compile_job) # 3. Profile the model asset on real devices - profile_job = None + profile_job: Optional[hub.client.ProfileJob] = None if not skip_profiling: + profile_options_all = model.get_hub_profile_options( + target_runtime, profile_options + ) print(f"Profiling model {model_name} on a hosted device.") - profile_job = hub.submit_profile_job( + submitted_profile_job = hub.submit_profile_job( model=compile_job.get_target_model(), device=hub.Device(device), name=model_name, - options=profile_options, + options=profile_options_all, ) + profile_job = cast(hub.client.ProfileJob, submitted_profile_job) # 4. 
Run inference on-device with sample inputs - inference_job = None + inference_job: Optional[hub.client.InferenceJob] = None if not skip_inferencing: + profile_options_all = model.get_hub_profile_options( + target_runtime, profile_options + ) print( f"Running inference for {model_name} on a hosted device with example inputs." ) @@ -149,30 +157,31 @@ def export_model( hub_inputs = transpose_channel_first_to_last( "image", sample_inputs, target_runtime ) - inference_job = hub.submit_inference_job( + submitted_inference_job = hub.submit_inference_job( model=compile_job.get_target_model(), inputs=hub_inputs, device=hub.Device(device), name=model_name, - options=profile_options, + options=profile_options_all, ) + inference_job = cast(hub.client.InferenceJob, submitted_inference_job) # 5. Download the model asset to a local file if not skip_downloading: os.makedirs(output_path, exist_ok=True) - target_model = compile_job.get_target_model() + target_model: hub.Model = compile_job.get_target_model() # type: ignore target_model.download(str(output_path / f"{model_name}.tflite")) # 6. Summarize the results from profiling and inference if not skip_summary and not skip_profiling: - assert profile_job.wait().success - profile_data = profile_job.download_profile() + assert profile_job is not None and profile_job.wait().success + profile_data: Dict[str, Any] = profile_job.download_profile() # type: ignore print_profile_metrics_from_job(profile_job, profile_data) if not skip_summary and not skip_inferencing: torch_out = torch_inference(model, sample_inputs) - assert inference_job.wait().success - inference_result = inference_job.download_output_data() + assert inference_job is not None and inference_job.wait().success + inference_result: hub.client.DatasetEntries = inference_job.download_output_data() # type: ignore # Convert outputs from channel last to channel first inference_result = transpose_channel_last_to_first( "output_0", inference_result, target_runtime diff --git a/qai_hub_models/models/sesr_m5/info.yaml b/qai_hub_models/models/sesr_m5/info.yaml index 081f7b0b..06f7dae2 100644 --- a/qai_hub_models/models/sesr_m5/info.yaml +++ b/qai_hub_models/models/sesr_m5/info.yaml @@ -10,6 +10,7 @@ tags: [] research_paper: https://arxiv.org/abs/2103.09404 research_paper_title: Collapsible Linear Blocks for Super-Efficient Super Resolution license: https://github.com/quic/aimet-model-zoo/blob/develop/LICENSE.pdf +deploy_license: https://qaihub-public-assets.s3.us-west-2.amazonaws.com/qai-hub-models/Qualcomm+AI+Hub+Proprietary+License.pdf source_repo: https://github.com/quic/aimet-model-zoo/tree/develop/aimet_zoo_torch/sesr technical_details: Model checkpoint: sesr_m5_4x_checkpoint_float32 @@ -29,4 +30,5 @@ related_models: has_static_banner: yes has_animated_banner: yes license_type: other +deploy_license_type: AI Model Hub License dataset: [] diff --git a/qai_hub_models/models/sesr_m5/model.py b/qai_hub_models/models/sesr_m5/model.py index 2b0b6ec8..c7283ab9 100644 --- a/qai_hub_models/models/sesr_m5/model.py +++ b/qai_hub_models/models/sesr_m5/model.py @@ -37,8 +37,6 @@ def __init__( @classmethod def from_pretrained(cls) -> SESR_M5: model = _load_sesr_source_model( - MODEL_ID, - MODEL_ASSET_VERSION, SCALING_FACTOR, NUM_CHANNELS, NUM_LBLOCKS, diff --git a/qai_hub_models/models/sesr_m5/perf.yaml b/qai_hub_models/models/sesr_m5/perf.yaml index 95e47b7b..dc0d7cac 100644 --- a/qai_hub_models/models/sesr_m5/perf.yaml +++ b/qai_hub_models/models/sesr_m5/perf.yaml @@ -17,22 +17,25 @@ aggregated: - Samsung Galaxy S23 
- Samsung Galaxy S23 Ultra - Samsung Galaxy S23+ + - Samsung Galaxy S24 + - Samsung Galaxy S24 Ultra - Samsung Galaxy Tab S8 - Xiaomi 12 - Xiaomi 12 Pro supported_chipsets: - Snapdragon® 8 Gen 1 - Snapdragon® 8 Gen 2 + - Snapdragon® 8 Gen 3 - Snapdragon® 888 models: - name: SESR-M5 performance_metrics: - torchscript_onnx_tflite: - inference_time: 2214.0 - throughput: 451.6711833785005 + inference_time: 2245.0 + throughput: 445.43429844097994 estimated_peak_memory_range: - min: 49152 - max: 8233656 + min: 28672 + max: 9857128 primary_compute_unit: NPU precision: fp16 layer_info: @@ -40,14 +43,22 @@ models: layers_on_gpu: 0 layers_on_cpu: 3 total_layers: 25 - job_id: jz5wl394p + job_id: jwgoyjd58 job_status: Passed + reference_device_info: + name: Samsung Galaxy S23 + os: '13' + form_factor: Phone + os_name: Android + manufacturer: Samsung + chipset: Snapdragon® 8 Gen 2 + timestamp: '2024-03-15T00:02:55.130462Z' torchscript_onnx_qnn: - inference_time: 2149.0 - throughput: 465.33271288971616 + inference_time: 2136.0 + throughput: 468.1647940074906 estimated_peak_memory_range: - min: 212992 - max: 77434640 + min: 221184 + max: 3873216 primary_compute_unit: NPU precision: fp16 layer_info: @@ -55,13 +66,43 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 32 - job_id: jmg9zy4mp + job_id: j7gjxj8pd + job_status: Passed + - torchscript_onnx_tflite: + inference_time: 1533.0 + throughput: 652.3157208088714 + estimated_peak_memory_range: + min: 16384 + max: 23601872 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 22 + layers_on_gpu: 0 + layers_on_cpu: 3 + total_layers: 25 + job_id: j1pv3jm5x job_status: Passed reference_device_info: - name: Samsung Galaxy S23 Ultra - os: '13' + name: Samsung Galaxy S24 + os: '14' form_factor: Phone os_name: Android manufacturer: Samsung - chipset: Snapdragon® 8 Gen 2 - timestamp: '2024-02-21T16:36:38.760826Z' + chipset: Snapdragon® 8 Gen 3 + timestamp: '2024-03-15T00:04:43.299283Z' + torchscript_onnx_qnn: + inference_time: 1462.0 + throughput: 683.9945280437756 + estimated_peak_memory_range: + min: 208896 + max: 20706112 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 32 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 32 + job_id: jlpe9j0gr + job_status: Passed diff --git a/qai_hub_models/models/sesr_m5/test.py b/qai_hub_models/models/sesr_m5/test.py index 8412b7dc..e59f48d4 100644 --- a/qai_hub_models/models/sesr_m5/test.py +++ b/qai_hub_models/models/sesr_m5/test.py @@ -34,5 +34,6 @@ def test_task(): ) +@skip_clone_repo_check def test_demo(): demo_main(is_test=True) diff --git a/qai_hub_models/models/sesr_m5_quantized/README.md b/qai_hub_models/models/sesr_m5_quantized/README.md index 8e042ba9..dadc3fc0 100644 --- a/qai_hub_models/models/sesr_m5_quantized/README.md +++ b/qai_hub_models/models/sesr_m5_quantized/README.md @@ -10,7 +10,7 @@ This is based on the implementation of SESR-M5-Quantized found export suitable to run on Qualcomm® devices. More details on model performance accross various devices, can be found [here](https://aihub.qualcomm.com/models/sesr_m5_quantized). -[Sign up](https://aihub.qualcomm.com/) for early access to run these models on +[Sign up](https://myaccount.qualcomm.com/signup) for early access to run these models on a hosted Qualcomm® device. @@ -25,7 +25,7 @@ python -m qai_hub_models.models.sesr_m5_quantized.demo More details on the CLI tool can be found with the `--help` option. 
See [demo.py](demo.py) for sample usage of the model including pre/post processing scripts. Please refer to our [general instructions on using -models](../../#qai-hub-models) for more usage instructions. +models](../../../#getting-started) for more usage instructions. ## Export for on-device deployment @@ -41,7 +41,7 @@ script requires access to Deployment instructions for Qualcomm® AI Hub. ## License - The license for the original implementation of SESR-M5-Quantized can be found [here](https://github.com/quic/aimet-model-zoo/blob/develop/LICENSE.pdf). -- The license for the compiled assets for on-device deployment can be found [here](https://qaihub-public-assets.s3.us-west-2.amazonaws.com/qai-hub-models/Qualcomm+AI+Hub+Proprietary+License.pdf). +- The license for the compiled assets for on-device deployment can be found [here]({deploy_license_url}) ## References * [Collapsible Linear Blocks for Super-Efficient Super Resolution](https://arxiv.org/abs/2103.09404) diff --git a/qai_hub_models/models/sesr_m5_quantized/conftest.py b/qai_hub_models/models/sesr_m5_quantized/conftest.py new file mode 100644 index 00000000..36e64be6 --- /dev/null +++ b/qai_hub_models/models/sesr_m5_quantized/conftest.py @@ -0,0 +1,26 @@ +# --------------------------------------------------------------------- +# Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. +# SPDX-License-Identifier: BSD-3-Clause +# --------------------------------------------------------------------- +# THIS FILE WAS AUTO-GENERATED. DO NOT EDIT MANUALLY. + +from unittest.mock import patch + +import pytest + +from qai_hub_models.models.sesr_m5_quantized import Model +from qai_hub_models.utils.testing import skip_clone_repo_check + + +@pytest.fixture(autouse=True) +@skip_clone_repo_check +def mock_from_pretrained(): + """ + Model.from_pretrained() can be slow. Invoke it once and cache it so all invocations + across all tests return the cached instance of the model. 
+ """ + mock = patch( + "qai_hub_models.models.sesr_m5_quantized.Model.from_pretrained", + return_value=Model.from_pretrained(), + ) + mock.start() diff --git a/qai_hub_models/models/sesr_m5_quantized/demo.py b/qai_hub_models/models/sesr_m5_quantized/demo.py index cb08ed1f..4d063bdd 100644 --- a/qai_hub_models/models/sesr_m5_quantized/demo.py +++ b/qai_hub_models/models/sesr_m5_quantized/demo.py @@ -19,6 +19,7 @@ def main(is_test: bool = False): super_resolution_demo( SESR_M5Quantizable, + MODEL_ID, default_image=IMAGE_ADDRESS, is_test=is_test, available_target_runtimes=[TargetRuntime.TFLITE], diff --git a/qai_hub_models/models/sesr_m5_quantized/export.py b/qai_hub_models/models/sesr_m5_quantized/export.py index 9084e32a..180d06e3 100644 --- a/qai_hub_models/models/sesr_m5_quantized/export.py +++ b/qai_hub_models/models/sesr_m5_quantized/export.py @@ -10,7 +10,7 @@ import os import warnings from pathlib import Path -from typing import List, Optional, Tuple +from typing import Any, Dict, List, Optional, Tuple, cast import qai_hub as hub @@ -119,8 +119,8 @@ def export_model( model_compile_options = model.get_hub_compile_options( target_runtime, compile_options + " --force_channel_last_input image" ) - print(f"Optimizing model {model_name} to run on-device.") - compile_job = hub.submit_compile_job( + print(f"Optimizing model {model_name} to run on-device") + submitted_compile_job = hub.submit_compile_job( model=source_model, input_specs=input_spec, device=hub.Device(device), @@ -128,21 +128,29 @@ def export_model( calibration_data=quant_calibration_data, options=model_compile_options, ) + compile_job = cast(hub.client.CompileJob, submitted_compile_job) # 3. Profile the model asset on real devices - profile_job = None + profile_job: Optional[hub.client.ProfileJob] = None if not skip_profiling: + profile_options_all = model.get_hub_profile_options( + target_runtime, profile_options + ) print(f"Profiling model {model_name} on a hosted device.") - profile_job = hub.submit_profile_job( + submitted_profile_job = hub.submit_profile_job( model=compile_job.get_target_model(), device=hub.Device(device), name=model_name, - options=profile_options, + options=profile_options_all, ) + profile_job = cast(hub.client.ProfileJob, submitted_profile_job) # 4. Run inference on-device with sample inputs - inference_job = None + inference_job: Optional[hub.client.InferenceJob] = None if not skip_inferencing: + profile_options_all = model.get_hub_profile_options( + target_runtime, profile_options + ) print( f"Running inference for {model_name} on a hosted device with example inputs." ) @@ -154,30 +162,31 @@ def export_model( hub_inputs = transpose_channel_first_to_last( "image", sample_inputs, target_runtime ) - inference_job = hub.submit_inference_job( + submitted_inference_job = hub.submit_inference_job( model=compile_job.get_target_model(), inputs=hub_inputs, device=hub.Device(device), name=model_name, - options=profile_options, + options=profile_options_all, ) + inference_job = cast(hub.client.InferenceJob, submitted_inference_job) # 5. Download the model asset to a local file if not skip_downloading: os.makedirs(output_path, exist_ok=True) - target_model = compile_job.get_target_model() + target_model: hub.Model = compile_job.get_target_model() # type: ignore target_model.download(str(output_path / f"{model_name}.tflite")) # 6. 
Summarize the results from profiling and inference if not skip_summary and not skip_profiling: - assert profile_job.wait().success - profile_data = profile_job.download_profile() + assert profile_job is not None and profile_job.wait().success + profile_data: Dict[str, Any] = profile_job.download_profile() # type: ignore print_profile_metrics_from_job(profile_job, profile_data) if not skip_summary and not skip_inferencing: torch_out = torch_inference(model, sample_inputs) - assert inference_job.wait().success - inference_result = inference_job.download_output_data() + assert inference_job is not None and inference_job.wait().success + inference_result: hub.client.DatasetEntries = inference_job.download_output_data() # type: ignore print_inference_metrics(inference_job, inference_result, torch_out) return (compile_job, profile_job, inference_job) diff --git a/qai_hub_models/models/sesr_m5_quantized/info.yaml b/qai_hub_models/models/sesr_m5_quantized/info.yaml index b4ac2e2e..f62e37c6 100644 --- a/qai_hub_models/models/sesr_m5_quantized/info.yaml +++ b/qai_hub_models/models/sesr_m5_quantized/info.yaml @@ -10,6 +10,7 @@ tags: [quantized] research_paper: https://arxiv.org/abs/2103.09404 research_paper_title: Collapsible Linear Blocks for Super-Efficient Super Resolution license: https://github.com/quic/aimet-model-zoo/blob/develop/LICENSE.pdf +deploy_license: https://qaihub-public-assets.s3.us-west-2.amazonaws.com/qai-hub-models/Qualcomm+AI+Hub+Proprietary+License.pdf source_repo: https://github.com/quic/aimet-model-zoo/tree/develop/aimet_zoo_torch/sesr technical_details: Model checkpoint: sesr_m5_4x_checkpoint_int8 @@ -27,4 +28,5 @@ related_models: [xlsr, xlsr_quantized, quicksrnetlarge] has_static_banner: yes has_animated_banner: yes license_type: other +deploy_license_type: AI Model Hub License dataset: [] diff --git a/qai_hub_models/models/sesr_m5_quantized/model.py b/qai_hub_models/models/sesr_m5_quantized/model.py index 4e85a042..7e2bac70 100644 --- a/qai_hub_models/models/sesr_m5_quantized/model.py +++ b/qai_hub_models/models/sesr_m5_quantized/model.py @@ -60,9 +60,7 @@ def from_pretrained( aimet_encodings: str | None = "DEFAULT", ) -> SESR_M5Quantizable: # Load Model - sesr = _load_sesr_source_model( - MODEL_ID, MODEL_ASSET_VERSION, SCALING_FACTOR, NUM_CHANNELS, NUM_LBLOCKS - ) + sesr = _load_sesr_source_model(SCALING_FACTOR, NUM_CHANNELS, NUM_LBLOCKS) input_shape = SESR_M5.get_input_spec()["image"][0] equalize_model(sesr, input_shape) diff --git a/qai_hub_models/models/sesr_m5_quantized/perf.yaml b/qai_hub_models/models/sesr_m5_quantized/perf.yaml index 14a85ccd..ba9de102 100644 --- a/qai_hub_models/models/sesr_m5_quantized/perf.yaml +++ b/qai_hub_models/models/sesr_m5_quantized/perf.yaml @@ -17,31 +17,42 @@ aggregated: - Samsung Galaxy S23 - Samsung Galaxy S23 Ultra - Samsung Galaxy S23+ + - Samsung Galaxy S24 + - Samsung Galaxy S24 Ultra - Samsung Galaxy Tab S8 - Xiaomi 12 - Xiaomi 12 Pro supported_chipsets: - Snapdragon® 8 Gen 1 - Snapdragon® 8 Gen 2 + - Snapdragon® 8 Gen 3 - Snapdragon® 888 models: - name: SESR-M5-Quantized performance_metrics: - torchscript_onnx_tflite: - inference_time: 1743.0 - throughput: 573.7234652897304 + inference_time: 1749.0 + throughput: 571.7552887364208 estimated_peak_memory_range: - min: 24576 - max: 2845656 + min: 28672 + max: 6325016 primary_compute_unit: NPU - precision: fp16 + precision: int8 layer_info: layers_on_npu: 13 layers_on_gpu: 0 layers_on_cpu: 3 total_layers: 16 - job_id: jz5wl31jp + job_id: joprk1k50 job_status: Passed + 
reference_device_info: + name: Samsung Galaxy S23 + os: '13' + form_factor: Phone + os_name: Android + manufacturer: Samsung + chipset: Snapdragon® 8 Gen 2 + timestamp: '2024-03-14T23:59:28.460705Z' torchscript_onnx_qnn: inference_time: 'null' throughput: 'null' @@ -57,11 +68,41 @@ models: total_layers: 0 job_id: '' job_status: Skipped + - torchscript_onnx_tflite: + inference_time: 1403.0 + throughput: 712.7583749109052 + estimated_peak_memory_range: + min: 20480 + max: 21054176 + primary_compute_unit: NPU + precision: int8 + layer_info: + layers_on_npu: 13 + layers_on_gpu: 0 + layers_on_cpu: 3 + total_layers: 16 + job_id: jep2836p6 + job_status: Passed reference_device_info: - name: Samsung Galaxy S23 Ultra - os: '13' + name: Samsung Galaxy S24 + os: '14' form_factor: Phone os_name: Android manufacturer: Samsung - chipset: Snapdragon® 8 Gen 2 - timestamp: '2024-02-21T16:34:35.502394Z' + chipset: Snapdragon® 8 Gen 3 + timestamp: '2024-03-14T23:59:28.460714Z' + torchscript_onnx_qnn: + inference_time: 'null' + throughput: 'null' + estimated_peak_memory_range: + min: 0 + max: 0 + primary_compute_unit: 'null' + precision: 'null' + layer_info: + layers_on_npu: 0 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 0 + job_id: '' + job_status: Skipped diff --git a/qai_hub_models/models/sesr_m5_quantized/test.py b/qai_hub_models/models/sesr_m5_quantized/test.py index 46c55138..86bb6543 100644 --- a/qai_hub_models/models/sesr_m5_quantized/test.py +++ b/qai_hub_models/models/sesr_m5_quantized/test.py @@ -7,6 +7,7 @@ import zipfile import numpy as np +import pytest import torch from qai_hub_models.models._shared.super_resolution.app import SuperResolutionApp @@ -45,12 +46,13 @@ def test_task(): ) +@pytest.mark.trace @skip_clone_repo_check def test_trace(): image = load_image(IMAGE_ADDRESS) output_image = load_image(OUTPUT_IMAGE_ADDRESS) app = SuperResolutionApp( - SESR_M5Quantizable.from_pretrained().convert_to_quantized_torchscript() + SESR_M5Quantizable.from_pretrained().convert_to_torchscript() ) app_output_image = app.predict(image)[0] diff --git a/qai_hub_models/models/shufflenet_v2/README.md b/qai_hub_models/models/shufflenet_v2/README.md index d4763144..dbf1bb82 100644 --- a/qai_hub_models/models/shufflenet_v2/README.md +++ b/qai_hub_models/models/shufflenet_v2/README.md @@ -10,7 +10,7 @@ This is based on the implementation of Shufflenet-v2 found export suitable to run on Qualcomm® devices. More details on model performance accross various devices, can be found [here](https://aihub.qualcomm.com/models/shufflenet_v2). -[Sign up](https://aihub.qualcomm.com/) for early access to run these models on +[Sign up](https://myaccount.qualcomm.com/signup) for early access to run these models on a hosted Qualcomm® device. @@ -25,7 +25,7 @@ python -m qai_hub_models.models.shufflenet_v2.demo More details on the CLI tool can be found with the `--help` option. See [demo.py](demo.py) for sample usage of the model including pre/post processing scripts. Please refer to our [general instructions on using -models](../../#qai-hub-models) for more usage instructions. +models](../../../#getting-started) for more usage instructions. ## Export for on-device deployment @@ -41,7 +41,7 @@ script requires access to Deployment instructions for Qualcomm® AI Hub. ## License - The license for the original implementation of Shufflenet-v2 can be found [here](https://github.com/pytorch/vision/blob/main/LICENSE). 
-- The license for the compiled assets for on-device deployment can be found [here](https://qaihub-public-assets.s3.us-west-2.amazonaws.com/qai-hub-models/Qualcomm+AI+Hub+Proprietary+License.pdf). +- The license for the compiled assets for on-device deployment can be found [here]({deploy_license_url}) ## References * [ShuffleNet V2: Practical Guidelines for Efficient CNN Architecture Design](https://arxiv.org/abs/1807.11164) diff --git a/qai_hub_models/models/shufflenet_v2/conftest.py b/qai_hub_models/models/shufflenet_v2/conftest.py new file mode 100644 index 00000000..ce602a0e --- /dev/null +++ b/qai_hub_models/models/shufflenet_v2/conftest.py @@ -0,0 +1,24 @@ +# --------------------------------------------------------------------- +# Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. +# SPDX-License-Identifier: BSD-3-Clause +# --------------------------------------------------------------------- +# THIS FILE WAS AUTO-GENERATED. DO NOT EDIT MANUALLY. + +from unittest.mock import patch + +import pytest + +from qai_hub_models.models.shufflenet_v2 import Model + + +@pytest.fixture(autouse=True) +def mock_from_pretrained(): + """ + Model.from_pretrained() can be slow. Invoke it once and cache it so all invocations + across all tests return the cached instance of the model. + """ + mock = patch( + "qai_hub_models.models.shufflenet_v2.Model.from_pretrained", + return_value=Model.from_pretrained(), + ) + mock.start() diff --git a/qai_hub_models/models/shufflenet_v2/demo.py b/qai_hub_models/models/shufflenet_v2/demo.py index e1c488de..834dc0a1 100644 --- a/qai_hub_models/models/shufflenet_v2/demo.py +++ b/qai_hub_models/models/shufflenet_v2/demo.py @@ -3,11 +3,11 @@ # SPDX-License-Identifier: BSD-3-Clause # --------------------------------------------------------------------- from qai_hub_models.models._shared.imagenet_classifier.demo import imagenet_demo -from qai_hub_models.models.shufflenet_v2.model import ShufflenetV2 +from qai_hub_models.models.shufflenet_v2.model import MODEL_ID, ShufflenetV2 def main(is_test: bool = False): - imagenet_demo(ShufflenetV2, is_test) + imagenet_demo(ShufflenetV2, MODEL_ID, is_test) if __name__ == "__main__": diff --git a/qai_hub_models/models/shufflenet_v2/export.py b/qai_hub_models/models/shufflenet_v2/export.py index 74fd2aae..cdb14643 100644 --- a/qai_hub_models/models/shufflenet_v2/export.py +++ b/qai_hub_models/models/shufflenet_v2/export.py @@ -10,7 +10,7 @@ import os import warnings from pathlib import Path -from typing import List, Optional, Tuple +from typing import Any, Dict, List, Optional, Tuple, cast import qai_hub as hub import torch @@ -109,35 +109,43 @@ def export_model( ) # Trace the model - source_model = torch.jit.trace(model, make_torch_inputs(input_spec)) + source_model = torch.jit.trace(model.to("cpu"), make_torch_inputs(input_spec)) # 2. Compile the model to an on-device asset model_compile_options = model.get_hub_compile_options( target_runtime, compile_options + " --force_channel_last_input image_tensor" ) - print(f"Optimizing model {model_name} to run on-device.") - compile_job = hub.submit_compile_job( + print(f"Optimizing model {model_name} to run on-device") + submitted_compile_job = hub.submit_compile_job( model=source_model, input_specs=input_spec, device=hub.Device(device), name=model_name, options=model_compile_options, ) + compile_job = cast(hub.client.CompileJob, submitted_compile_job) # 3. 
Profile the model asset on real devices - profile_job = None + profile_job: Optional[hub.client.ProfileJob] = None if not skip_profiling: + profile_options_all = model.get_hub_profile_options( + target_runtime, profile_options + ) print(f"Profiling model {model_name} on a hosted device.") - profile_job = hub.submit_profile_job( + submitted_profile_job = hub.submit_profile_job( model=compile_job.get_target_model(), device=hub.Device(device), name=model_name, - options=profile_options, + options=profile_options_all, ) + profile_job = cast(hub.client.ProfileJob, submitted_profile_job) # 4. Run inference on-device with sample inputs - inference_job = None + inference_job: Optional[hub.client.InferenceJob] = None if not skip_inferencing: + profile_options_all = model.get_hub_profile_options( + target_runtime, profile_options + ) print( f"Running inference for {model_name} on a hosted device with example inputs." ) @@ -146,33 +154,35 @@ def export_model( hub_inputs = transpose_channel_first_to_last( "image_tensor", sample_inputs, target_runtime ) - inference_job = hub.submit_inference_job( + submitted_inference_job = hub.submit_inference_job( model=compile_job.get_target_model(), inputs=hub_inputs, device=hub.Device(device), name=model_name, - options=profile_options, + options=profile_options_all, ) + inference_job = cast(hub.client.InferenceJob, submitted_inference_job) # 5. Download the model asset to a local file if not skip_downloading: os.makedirs(output_path, exist_ok=True) - target_model = compile_job.get_target_model() + target_model: hub.Model = compile_job.get_target_model() # type: ignore target_model.download(str(output_path / f"{model_name}.tflite")) # 6. Summarize the results from profiling and inference if not skip_summary and not skip_profiling: - assert profile_job.wait().success - profile_data = profile_job.download_profile() + assert profile_job is not None and profile_job.wait().success + profile_data: Dict[str, Any] = profile_job.download_profile() # type: ignore print_profile_metrics_from_job(profile_job, profile_data) if not skip_summary and not skip_inferencing: torch_out = torch_inference(model, sample_inputs) - assert inference_job.wait().success - inference_result = inference_job.download_output_data() + assert inference_job is not None and inference_job.wait().success + inference_result: hub.client.DatasetEntries = inference_job.download_output_data() # type: ignore print_inference_metrics(inference_job, inference_result, torch_out) - print_on_target_demo_cmd(compile_job, Path(__file__).parent.resolve(), device) + if not skip_summary: + print_on_target_demo_cmd(compile_job, Path(__file__).parent.resolve(), device) return (compile_job, profile_job, inference_job) diff --git a/qai_hub_models/models/shufflenet_v2/info.yaml b/qai_hub_models/models/shufflenet_v2/info.yaml index b33359f4..9663eb73 100644 --- a/qai_hub_models/models/shufflenet_v2/info.yaml +++ b/qai_hub_models/models/shufflenet_v2/info.yaml @@ -13,6 +13,7 @@ research_paper: https://arxiv.org/abs/1807.11164 research_paper_title: 'ShuffleNet V2: Practical Guidelines for Efficient CNN Architecture Design' license: https://github.com/pytorch/vision/blob/main/LICENSE +deploy_license: https://qaihub-public-assets.s3.us-west-2.amazonaws.com/qai-hub-models/Qualcomm+AI+Hub+Proprietary+License.pdf source_repo: https://github.com/pytorch/vision/blob/main/torchvision/models/shufflenetv2.py technical_details: Model checkpoint: Imagenet @@ -35,6 +36,7 @@ form_factors: has_static_banner: yes has_animated_banner: yes 
license_type: bsd-3-clause +deploy_license_type: AI Model Hub License dataset: - imagenet-1k - imagenet-22k diff --git a/qai_hub_models/models/shufflenet_v2/model.py b/qai_hub_models/models/shufflenet_v2/model.py index 9a3b8e77..105ed1c2 100644 --- a/qai_hub_models/models/shufflenet_v2/model.py +++ b/qai_hub_models/models/shufflenet_v2/model.py @@ -14,6 +14,6 @@ class ShufflenetV2(ImagenetClassifier): @classmethod - def from_pretrained(cls, weights: str = DEFAULT_WEIGHTS) -> ImagenetClassifier: + def from_pretrained(cls, weights: str = DEFAULT_WEIGHTS) -> ShufflenetV2: net = tv_models.shufflenet_v2_x0_5(weights=weights) return cls(net) diff --git a/qai_hub_models/models/shufflenet_v2/perf.yaml b/qai_hub_models/models/shufflenet_v2/perf.yaml index 8c57e692..525e8689 100644 --- a/qai_hub_models/models/shufflenet_v2/perf.yaml +++ b/qai_hub_models/models/shufflenet_v2/perf.yaml @@ -17,22 +17,25 @@ aggregated: - Samsung Galaxy S23 - Samsung Galaxy S23 Ultra - Samsung Galaxy S23+ + - Samsung Galaxy S24 + - Samsung Galaxy S24 Ultra - Samsung Galaxy Tab S8 - Xiaomi 12 - Xiaomi 12 Pro supported_chipsets: - Snapdragon® 8 Gen 1 - Snapdragon® 8 Gen 2 + - Snapdragon® 8 Gen 3 - Snapdragon® 888 models: - name: Shufflenet-v2 performance_metrics: - torchscript_onnx_tflite: - inference_time: 921.0 - throughput: 1085.7763300760043 + inference_time: 919.0 + throughput: 1088.139281828074 estimated_peak_memory_range: - min: 16384 - max: 2322736 + min: 12288 + max: 2065312 primary_compute_unit: NPU precision: fp16 layer_info: @@ -40,14 +43,22 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 202 - job_id: j1gly27e5 + job_id: j2p0y1ngw job_status: Passed + reference_device_info: + name: Samsung Galaxy S23 + os: '13' + form_factor: Phone + os_name: Android + manufacturer: Samsung + chipset: Snapdragon® 8 Gen 2 + timestamp: '2024-03-15T00:00:07.928895Z' torchscript_onnx_qnn: - inference_time: 321.0 - throughput: 3115.264797507788 + inference_time: 322.0 + throughput: 3105.590062111801 estimated_peak_memory_range: - min: 622592 - max: 4181728 + min: 626688 + max: 3731328 primary_compute_unit: NPU precision: fp16 layer_info: @@ -55,13 +66,43 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 157 - job_id: jw568zvvg + job_id: jogkzlngd + job_status: Passed + - torchscript_onnx_tflite: + inference_time: 586.0 + throughput: 1706.4846416382252 + estimated_peak_memory_range: + min: 16384 + max: 32832960 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 202 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 202 + job_id: j1p8o3og9 job_status: Passed reference_device_info: - name: Samsung Galaxy S23 Ultra - os: '13' + name: Samsung Galaxy S24 + os: '14' form_factor: Phone os_name: Android manufacturer: Samsung - chipset: Snapdragon® 8 Gen 2 - timestamp: '2024-02-21T16:27:51.522582Z' + chipset: Snapdragon® 8 Gen 3 + timestamp: '2024-03-15T00:03:17.860163Z' + torchscript_onnx_qnn: + inference_time: 225.0 + throughput: 4444.444444444444 + estimated_peak_memory_range: + min: 12288 + max: 48449136 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 157 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 157 + job_id: jn5q87o57 + job_status: Passed diff --git a/qai_hub_models/models/shufflenet_v2/test.py b/qai_hub_models/models/shufflenet_v2/test.py index 1198a8ad..81498767 100644 --- a/qai_hub_models/models/shufflenet_v2/test.py +++ b/qai_hub_models/models/shufflenet_v2/test.py @@ -2,6 +2,8 @@ # Copyright (c) 2024 Qualcomm Innovation Center, Inc. 
All rights reserved. # SPDX-License-Identifier: BSD-3-Clause # --------------------------------------------------------------------- +import pytest + from qai_hub_models.models._shared.imagenet_classifier.test_utils import ( run_imagenet_classifier_test, run_imagenet_classifier_trace_test, @@ -14,6 +16,7 @@ def test_task(): run_imagenet_classifier_test(ShufflenetV2.from_pretrained(), MODEL_ID) +@pytest.mark.trace def test_trace(): run_imagenet_classifier_trace_test(ShufflenetV2.from_pretrained()) diff --git a/qai_hub_models/models/shufflenet_v2_quantized/README.md b/qai_hub_models/models/shufflenet_v2_quantized/README.md index 2eaa69b3..6d034ee8 100644 --- a/qai_hub_models/models/shufflenet_v2_quantized/README.md +++ b/qai_hub_models/models/shufflenet_v2_quantized/README.md @@ -10,7 +10,7 @@ This is based on the implementation of Shufflenet-v2Quantized found export suitable to run on Qualcomm® devices. More details on model performance accross various devices, can be found [here](https://aihub.qualcomm.com/models/shufflenet_v2_quantized). -[Sign up](https://aihub.qualcomm.com/) for early access to run these models on +[Sign up](https://myaccount.qualcomm.com/signup) for early access to run these models on a hosted Qualcomm® device. @@ -25,7 +25,7 @@ python -m qai_hub_models.models.shufflenet_v2_quantized.demo More details on the CLI tool can be found with the `--help` option. See [demo.py](demo.py) for sample usage of the model including pre/post processing scripts. Please refer to our [general instructions on using -models](../../#qai-hub-models) for more usage instructions. +models](../../../#getting-started) for more usage instructions. ## Export for on-device deployment @@ -41,7 +41,7 @@ script requires access to Deployment instructions for Qualcomm® AI Hub. ## License - The license for the original implementation of Shufflenet-v2Quantized can be found [here](https://github.com/pytorch/vision/blob/main/LICENSE). -- The license for the compiled assets for on-device deployment can be found [here](https://qaihub-public-assets.s3.us-west-2.amazonaws.com/qai-hub-models/Qualcomm+AI+Hub+Proprietary+License.pdf). +- The license for the compiled assets for on-device deployment can be found [here]({deploy_license_url}) ## References * [ShuffleNet V2: Practical Guidelines for Efficient CNN Architecture Design](https://arxiv.org/abs/1807.11164) diff --git a/qai_hub_models/models/shufflenet_v2_quantized/conftest.py b/qai_hub_models/models/shufflenet_v2_quantized/conftest.py new file mode 100644 index 00000000..4ad51ad0 --- /dev/null +++ b/qai_hub_models/models/shufflenet_v2_quantized/conftest.py @@ -0,0 +1,24 @@ +# --------------------------------------------------------------------- +# Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. +# SPDX-License-Identifier: BSD-3-Clause +# --------------------------------------------------------------------- +# THIS FILE WAS AUTO-GENERATED. DO NOT EDIT MANUALLY. + +from unittest.mock import patch + +import pytest + +from qai_hub_models.models.shufflenet_v2_quantized import Model + + +@pytest.fixture(autouse=True) +def mock_from_pretrained(): + """ + Model.from_pretrained() can be slow. Invoke it once and cache it so all invocations + across all tests return the cached instance of the model. 
+ """ + mock = patch( + "qai_hub_models.models.shufflenet_v2_quantized.Model.from_pretrained", + return_value=Model.from_pretrained(), + ) + mock.start() diff --git a/qai_hub_models/models/shufflenet_v2_quantized/demo.py b/qai_hub_models/models/shufflenet_v2_quantized/demo.py index bf864ee1..bbecc0a4 100644 --- a/qai_hub_models/models/shufflenet_v2_quantized/demo.py +++ b/qai_hub_models/models/shufflenet_v2_quantized/demo.py @@ -3,11 +3,14 @@ # SPDX-License-Identifier: BSD-3-Clause # --------------------------------------------------------------------- from qai_hub_models.models._shared.imagenet_classifier.demo import imagenet_demo -from qai_hub_models.models.shufflenet_v2_quantized.model import ShufflenetV2Quantizable +from qai_hub_models.models.shufflenet_v2_quantized.model import ( + MODEL_ID, + ShufflenetV2Quantizable, +) def main(is_test: bool = False): - imagenet_demo(ShufflenetV2Quantizable, is_test) + imagenet_demo(ShufflenetV2Quantizable, MODEL_ID, is_test) if __name__ == "__main__": diff --git a/qai_hub_models/models/shufflenet_v2_quantized/export.py b/qai_hub_models/models/shufflenet_v2_quantized/export.py index 27e330aa..d4cd288c 100644 --- a/qai_hub_models/models/shufflenet_v2_quantized/export.py +++ b/qai_hub_models/models/shufflenet_v2_quantized/export.py @@ -10,7 +10,7 @@ import os import warnings from pathlib import Path -from typing import List, Optional, Tuple +from typing import Any, Dict, List, Optional, Tuple, cast import qai_hub as hub @@ -120,8 +120,8 @@ def export_model( model_compile_options = model.get_hub_compile_options( target_runtime, compile_options + " --force_channel_last_input image_tensor" ) - print(f"Optimizing model {model_name} to run on-device.") - compile_job = hub.submit_compile_job( + print(f"Optimizing model {model_name} to run on-device") + submitted_compile_job = hub.submit_compile_job( model=source_model, input_specs=input_spec, device=hub.Device(device), @@ -129,21 +129,29 @@ def export_model( calibration_data=quant_calibration_data, options=model_compile_options, ) + compile_job = cast(hub.client.CompileJob, submitted_compile_job) # 3. Profile the model asset on real devices - profile_job = None + profile_job: Optional[hub.client.ProfileJob] = None if not skip_profiling: + profile_options_all = model.get_hub_profile_options( + target_runtime, profile_options + ) print(f"Profiling model {model_name} on a hosted device.") - profile_job = hub.submit_profile_job( + submitted_profile_job = hub.submit_profile_job( model=compile_job.get_target_model(), device=hub.Device(device), name=model_name, - options=profile_options, + options=profile_options_all, ) + profile_job = cast(hub.client.ProfileJob, submitted_profile_job) # 4. Run inference on-device with sample inputs - inference_job = None + inference_job: Optional[hub.client.InferenceJob] = None if not skip_inferencing: + profile_options_all = model.get_hub_profile_options( + target_runtime, profile_options + ) print( f"Running inference for {model_name} on a hosted device with example inputs." ) @@ -155,42 +163,44 @@ def export_model( hub_inputs = transpose_channel_first_to_last( "image_tensor", sample_inputs, target_runtime ) - inference_job = hub.submit_inference_job( + submitted_inference_job = hub.submit_inference_job( model=compile_job.get_target_model(), inputs=hub_inputs, device=hub.Device(device), name=model_name, - options=profile_options, + options=profile_options_all, ) + inference_job = cast(hub.client.InferenceJob, submitted_inference_job) # 5. 
Download the model asset to a local file if not skip_downloading: os.makedirs(output_path, exist_ok=True) - target_model = compile_job.get_target_model() + target_model: hub.Model = compile_job.get_target_model() # type: ignore target_model.download(str(output_path / f"{model_name}.tflite")) # 6. Summarize the results from profiling and inference if not skip_summary and not skip_profiling: - assert profile_job.wait().success - profile_data = profile_job.download_profile() + assert profile_job is not None and profile_job.wait().success + profile_data: Dict[str, Any] = profile_job.download_profile() # type: ignore print_profile_metrics_from_job(profile_job, profile_data) if not skip_summary and not skip_inferencing: torch_out = torch_inference(model, sample_inputs) - assert inference_job.wait().success - inference_result = inference_job.download_output_data() + assert inference_job is not None and inference_job.wait().success + inference_result: hub.client.DatasetEntries = inference_job.download_output_data() # type: ignore print_inference_metrics( inference_job, inference_result, torch_out, metrics="psnr,top1,top5" ) - print_on_target_demo_cmd(compile_job, Path(__file__).parent.resolve(), device) + if not skip_summary: + print_on_target_demo_cmd(compile_job, Path(__file__).parent.resolve(), device) return (compile_job, profile_job, inference_job) def main(): warnings.filterwarnings("ignore") - parser = export_parser(model_cls=Model, supports_qnn=False) + parser = export_parser(model_cls=Model) args = parser.parse_args() export_model(**vars(args)) diff --git a/qai_hub_models/models/shufflenet_v2_quantized/info.yaml b/qai_hub_models/models/shufflenet_v2_quantized/info.yaml index 2f7e3b8a..afd1d7e6 100644 --- a/qai_hub_models/models/shufflenet_v2_quantized/info.yaml +++ b/qai_hub_models/models/shufflenet_v2_quantized/info.yaml @@ -14,6 +14,7 @@ research_paper: https://arxiv.org/abs/1807.11164 research_paper_title: 'ShuffleNet V2: Practical Guidelines for Efficient CNN Architecture Design' license: https://github.com/pytorch/vision/blob/main/LICENSE +deploy_license: https://qaihub-public-assets.s3.us-west-2.amazonaws.com/qai-hub-models/Qualcomm+AI+Hub+Proprietary+License.pdf source_repo: https://github.com/pytorch/vision/blob/main/torchvision/models/shufflenetv2.py technical_details: Model checkpoint: Imagenet @@ -36,6 +37,7 @@ form_factors: has_static_banner: yes has_animated_banner: yes license_type: bsd-3-clause +deploy_license_type: AI Model Hub License dataset: - imagenet-1k - imagenet-22k diff --git a/qai_hub_models/models/shufflenet_v2_quantized/model.py b/qai_hub_models/models/shufflenet_v2_quantized/model.py index b6121793..ba13c2c4 100644 --- a/qai_hub_models/models/shufflenet_v2_quantized/model.py +++ b/qai_hub_models/models/shufflenet_v2_quantized/model.py @@ -8,26 +8,35 @@ # This verifies aimet is installed, and this must be included first. 
from qai_hub_models.utils.quantization_aimet import ( AIMETQuantizableMixin, - HubCompileOptionsInt8Mixin, ) # isort: on import torch -from aimet_torch.cross_layer_equalization import equalize_model +from aimet_torch.cross_layer_equalization import ( + equalize_bn_folded_model, + fold_all_batch_norms, +) +from aimet_torch.model_preparer import prepare_model from aimet_torch.quantsim import QuantizationSimModel, load_encodings_to_sim from qai_hub_models.models.shufflenet_v2.model import ShufflenetV2 -from qai_hub_models.utils.aimet.config_loader import get_per_channel_aimet_config +from qai_hub_models.utils.aimet.config_loader import get_default_aimet_config from qai_hub_models.utils.asset_loaders import CachedWebModelAsset +from qai_hub_models.utils.base_model import SourceModelFormat, TargetRuntime +from qai_hub_models.utils.quantization_aimet import ( + convert_all_depthwise_to_per_tensor, + tie_aimet_observer_groups, +) MODEL_ID = __name__.split(".")[-2] -MODEL_ASSET_VERSION = 1 +MODEL_ASSET_VERSION = 2 DEFAULT_ENCODINGS = "shufflenet_v2_quantized_encodings.json" class ShufflenetV2Quantizable( - HubCompileOptionsInt8Mixin, AIMETQuantizableMixin, ShufflenetV2 + AIMETQuantizableMixin, + ShufflenetV2, ): """ShufflenetV2 with post train quantization support. @@ -40,9 +49,15 @@ def __init__( ) -> None: ShufflenetV2.__init__(self, sim_model.model) AIMETQuantizableMixin.__init__( - self, sim_model, needs_onnx_direct_aimet_export=True + self, + sim_model, ) + def preferred_hub_source_model_format( + self, target_runtime: TargetRuntime + ) -> SourceModelFormat: + return SourceModelFormat.ONNX + @classmethod def from_pretrained( cls, @@ -56,17 +71,22 @@ def from_pretrained( else: Interprets as a filepath and loads the encodings stored there. """ model = ShufflenetV2.from_pretrained() - input_shape = model.get_input_spec()["image_tensor"][0] + input_shape = cls.get_input_spec()["image_tensor"][0] + model = prepare_model(model) + dummy_input = torch.rand(input_shape) - equalize_model(model, input_shape) + pairs = fold_all_batch_norms(model, input_shape, dummy_input) + equalize_bn_folded_model(model, input_shape, pairs, dummy_input) sim = QuantizationSimModel( - model.net, + model, quant_scheme="tf_enhanced", default_param_bw=8, default_output_bw=8, - config_file=get_per_channel_aimet_config(), - dummy_input=torch.rand(input_shape), + config_file=get_default_aimet_config(), + dummy_input=dummy_input, ) + convert_all_depthwise_to_per_tensor(sim.model) + cls._tie_pre_concat_quantizers(sim) if aimet_encodings: if aimet_encodings == "DEFAULT": @@ -77,3 +97,65 @@ def from_pretrained( sim.model.eval() return cls(sim) + + def get_hub_compile_options( + self, target_runtime: TargetRuntime, other_compile_options: str = "" + ) -> str: + compile_options = super().get_hub_compile_options( + target_runtime, other_compile_options + ) + return compile_options + " --quantize_full_type int8 --quantize_io" + + @classmethod + def _tie_pre_concat_quantizers(cls, sim: QuantizationSimModel): + """ + This ties together the output quantizers prior to concatenations. This + prevents unnecessary re-quantization during the concatenation. 
+ """ + n = sim.model.net + # Because of skip connections, the groups are large + groups = [ + [ + getattr(getattr(n.stage2, "0").branch1, "4"), + getattr(getattr(n.stage2, "0").branch2, "7"), + getattr(n.stage2, "0").module_cat, + getattr(getattr(n.stage2, "1").branch2, "7"), + getattr(n.stage2, "1").module_cat_1, + getattr(getattr(n.stage2, "2").branch2, "7"), + getattr(n.stage2, "2").module_cat_2, + getattr(getattr(n.stage2, "3").branch2, "7"), + getattr(n.stage2, "3").module_cat_3, + ], + [ + getattr(getattr(n.stage3, "0").branch1, "4"), + getattr(getattr(n.stage3, "0").branch2, "7"), + getattr(n.stage3, "0").module_cat_4, + getattr(getattr(n.stage3, "1").branch2, "7"), + getattr(n.stage3, "1").module_cat_5, + getattr(getattr(n.stage3, "2").branch2, "7"), + getattr(n.stage3, "2").module_cat_6, + getattr(getattr(n.stage3, "3").branch2, "7"), + getattr(n.stage3, "3").module_cat_7, + getattr(getattr(n.stage3, "4").branch2, "7"), + getattr(n.stage3, "4").module_cat_8, + getattr(getattr(n.stage3, "5").branch2, "7"), + getattr(n.stage3, "5").module_cat_9, + getattr(getattr(n.stage3, "6").branch2, "7"), + getattr(n.stage3, "6").module_cat_10, + getattr(getattr(n.stage3, "7").branch2, "7"), + getattr(n.stage3, "7").module_cat_11, + ], + [ + getattr(getattr(n.stage4, "0").branch1, "4"), + getattr(getattr(n.stage4, "0").branch2, "7"), + getattr(n.stage4, "0").module_cat_12, + getattr(getattr(n.stage4, "1").branch2, "7"), + getattr(n.stage4, "1").module_cat_13, + getattr(getattr(n.stage4, "2").branch2, "7"), + getattr(n.stage4, "2").module_cat_14, + getattr(getattr(n.stage4, "3").branch2, "7"), + getattr(n.stage4, "3").module_cat_15, + ], + ] + + tie_aimet_observer_groups(groups) diff --git a/qai_hub_models/models/shufflenet_v2_quantized/perf.yaml b/qai_hub_models/models/shufflenet_v2_quantized/perf.yaml index 877fce75..6459f84c 100644 --- a/qai_hub_models/models/shufflenet_v2_quantized/perf.yaml +++ b/qai_hub_models/models/shufflenet_v2_quantized/perf.yaml @@ -17,51 +17,92 @@ aggregated: - Samsung Galaxy S23 - Samsung Galaxy S23 Ultra - Samsung Galaxy S23+ + - Samsung Galaxy S24 + - Samsung Galaxy S24 Ultra - Samsung Galaxy Tab S8 - Xiaomi 12 - Xiaomi 12 Pro supported_chipsets: - Snapdragon® 8 Gen 1 - Snapdragon® 8 Gen 2 + - Snapdragon® 8 Gen 3 - Snapdragon® 888 models: - name: Shufflenet-v2Quantized performance_metrics: - torchscript_onnx_tflite: - inference_time: 30460.0 - throughput: 32.829940906106366 + inference_time: 579.0 + throughput: 1727.1157167530225 estimated_peak_memory_range: - min: 294912 - max: 4752264 + min: 16384 + max: 4558296 primary_compute_unit: NPU - precision: fp16 + precision: int8 layer_info: - layers_on_npu: 221 + layers_on_npu: 203 layers_on_gpu: 0 - layers_on_cpu: 17 - total_layers: 238 - job_id: jnp1nw8kg + layers_on_cpu: 0 + total_layers: 203 + job_id: j1p89yxg9 job_status: Passed + reference_device_info: + name: Samsung Galaxy S23 + os: '13' + form_factor: Phone + os_name: Android + manufacturer: Samsung + chipset: Snapdragon® 8 Gen 2 + timestamp: '2024-03-14T23:32:39.995361Z' torchscript_onnx_qnn: - inference_time: 355.0 - throughput: 2816.9014084507044 + inference_time: 279.0 + throughput: 3584.2293906810037 estimated_peak_memory_range: min: 0 - max: 3208840 + max: 75494608 primary_compute_unit: NPU - precision: fp16 + precision: int8 layer_info: - layers_on_npu: 122 + layers_on_npu: 120 layers_on_gpu: 0 layers_on_cpu: 0 - total_layers: 122 - job_id: jvgddqvkg + total_layers: 120 + job_id: j1glzm8pv + job_status: Passed + - torchscript_onnx_tflite: + 
inference_time: 385.0 + throughput: 2597.4025974025976 + estimated_peak_memory_range: + min: 12288 + max: 21664192 + primary_compute_unit: NPU + precision: int8 + layer_info: + layers_on_npu: 203 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 203 + job_id: jn5qkq457 job_status: Passed reference_device_info: - name: Samsung Galaxy S23 Ultra - os: '13' + name: Samsung Galaxy S24 + os: '14' form_factor: Phone os_name: Android manufacturer: Samsung - chipset: Snapdragon® 8 Gen 2 - timestamp: '2024-02-21T16:21:57.529965Z' + chipset: Snapdragon® 8 Gen 3 + timestamp: '2024-03-14T23:34:41.782968Z' + torchscript_onnx_qnn: + inference_time: 204.0 + throughput: 4901.9607843137255 + estimated_peak_memory_range: + min: 163840 + max: 41738848 + primary_compute_unit: NPU + precision: int8 + layer_info: + layers_on_npu: 120 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 120 + job_id: jw56j40po + job_status: Passed diff --git a/qai_hub_models/models/shufflenet_v2_quantized/test.py b/qai_hub_models/models/shufflenet_v2_quantized/test.py index 339d2f16..995731eb 100644 --- a/qai_hub_models/models/shufflenet_v2_quantized/test.py +++ b/qai_hub_models/models/shufflenet_v2_quantized/test.py @@ -4,7 +4,6 @@ # --------------------------------------------------------------------- from qai_hub_models.models._shared.imagenet_classifier.test_utils import ( run_imagenet_classifier_test, - run_imagenet_classifier_trace_test, ) from qai_hub_models.models.shufflenet_v2_quantized.demo import main as demo_main from qai_hub_models.models.shufflenet_v2_quantized.model import ( @@ -25,16 +24,6 @@ def test_task(): ) -def test_trace(): - run_imagenet_classifier_trace_test( - ShufflenetV2Quantizable.from_pretrained(), - diff_tol=0.01, - rtol=0.02, - atol=0.2, - is_quantized=True, - ) - - def test_demo(): # Verify demo does not crash demo_main(is_test=True) diff --git a/qai_hub_models/models/sinet/README.md b/qai_hub_models/models/sinet/README.md index a7d97cf1..5ddf002c 100644 --- a/qai_hub_models/models/sinet/README.md +++ b/qai_hub_models/models/sinet/README.md @@ -10,7 +10,7 @@ This is based on the implementation of SINet found export suitable to run on Qualcomm® devices. More details on model performance accross various devices, can be found [here](https://aihub.qualcomm.com/models/sinet). -[Sign up](https://aihub.qualcomm.com/) for early access to run these models on +[Sign up](https://myaccount.qualcomm.com/signup) for early access to run these models on a hosted Qualcomm® device. @@ -25,7 +25,7 @@ python -m qai_hub_models.models.sinet.demo More details on the CLI tool can be found with the `--help` option. See [demo.py](demo.py) for sample usage of the model including pre/post processing scripts. Please refer to our [general instructions on using -models](../../#qai-hub-models) for more usage instructions. +models](../../../#getting-started) for more usage instructions. ## Export for on-device deployment @@ -41,7 +41,7 @@ script requires access to Deployment instructions for Qualcomm® AI Hub. ## License - The license for the original implementation of SINet can be found [here](https://github.com/clovaai/ext_portrait_segmentation/blob/master/LICENSE). -- The license for the compiled assets for on-device deployment can be found [here](https://qaihub-public-assets.s3.us-west-2.amazonaws.com/qai-hub-models/Qualcomm+AI+Hub+Proprietary+License.pdf). 
+- The license for the compiled assets for on-device deployment can be found [here]({deploy_license_url}) ## References * [SINet: Extreme Lightweight Portrait Segmentation Networks with Spatial Squeeze Modules and Information Blocking Decoder](https://arxiv.org/abs/1911.09099) diff --git a/qai_hub_models/models/sinet/conftest.py b/qai_hub_models/models/sinet/conftest.py new file mode 100644 index 00000000..b5b3c585 --- /dev/null +++ b/qai_hub_models/models/sinet/conftest.py @@ -0,0 +1,26 @@ +# --------------------------------------------------------------------- +# Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. +# SPDX-License-Identifier: BSD-3-Clause +# --------------------------------------------------------------------- +# THIS FILE WAS AUTO-GENERATED. DO NOT EDIT MANUALLY. + +from unittest.mock import patch + +import pytest + +from qai_hub_models.models.sinet import Model +from qai_hub_models.utils.testing import skip_clone_repo_check + + +@pytest.fixture(autouse=True) +@skip_clone_repo_check +def mock_from_pretrained(): + """ + Model.from_pretrained() can be slow. Invoke it once and cache it so all invocations + across all tests return the cached instance of the model. + """ + mock = patch( + "qai_hub_models.models.sinet.Model.from_pretrained", + return_value=Model.from_pretrained(), + ) + mock.start() diff --git a/qai_hub_models/models/sinet/demo.py b/qai_hub_models/models/sinet/demo.py index 51972075..b4a56d63 100644 --- a/qai_hub_models/models/sinet/demo.py +++ b/qai_hub_models/models/sinet/demo.py @@ -30,8 +30,8 @@ def main(is_test: bool = False): help="image file path or URL.", ) args = parser.parse_args([] if is_test else None) - model = demo_model_from_cli_args(SINet, args) - validate_on_device_demo_args(args, SINet.get_model_id()) + model = demo_model_from_cli_args(SINet, MODEL_ID, args) + validate_on_device_demo_args(args, MODEL_ID) # load image and model image = load_image(args.image) diff --git a/qai_hub_models/models/sinet/export.py b/qai_hub_models/models/sinet/export.py index 3e9d21c7..780ac793 100644 --- a/qai_hub_models/models/sinet/export.py +++ b/qai_hub_models/models/sinet/export.py @@ -10,7 +10,7 @@ import os import warnings from pathlib import Path -from typing import List, Optional, Tuple +from typing import Any, Dict, List, Optional, Tuple, cast import qai_hub as hub import torch @@ -110,7 +110,7 @@ def export_model( ) # Trace the model - source_model = torch.jit.trace(model, make_torch_inputs(input_spec)) + source_model = torch.jit.trace(model.to("cpu"), make_torch_inputs(input_spec)) # 2. Compile the model to an on-device asset model_compile_options = model.get_hub_compile_options( @@ -119,29 +119,37 @@ def export_model( + " --force_channel_last_input image" + " --force_channel_last_output output_0", ) - print(f"Optimizing model {model_name} to run on-device.") - compile_job = hub.submit_compile_job( + print(f"Optimizing model {model_name} to run on-device") + submitted_compile_job = hub.submit_compile_job( model=source_model, input_specs=input_spec, device=hub.Device(device), name=model_name, options=model_compile_options, ) + compile_job = cast(hub.client.CompileJob, submitted_compile_job) # 3. 
Profile the model asset on real devices - profile_job = None + profile_job: Optional[hub.client.ProfileJob] = None if not skip_profiling: + profile_options_all = model.get_hub_profile_options( + target_runtime, profile_options + ) print(f"Profiling model {model_name} on a hosted device.") - profile_job = hub.submit_profile_job( + submitted_profile_job = hub.submit_profile_job( model=compile_job.get_target_model(), device=hub.Device(device), name=model_name, - options=profile_options, + options=profile_options_all, ) + profile_job = cast(hub.client.ProfileJob, submitted_profile_job) # 4. Run inference on-device with sample inputs - inference_job = None + inference_job: Optional[hub.client.InferenceJob] = None if not skip_inferencing: + profile_options_all = model.get_hub_profile_options( + target_runtime, profile_options + ) print( f"Running inference for {model_name} on a hosted device with example inputs." ) @@ -150,37 +158,39 @@ def export_model( hub_inputs = transpose_channel_first_to_last( "image", sample_inputs, target_runtime ) - inference_job = hub.submit_inference_job( + submitted_inference_job = hub.submit_inference_job( model=compile_job.get_target_model(), inputs=hub_inputs, device=hub.Device(device), name=model_name, - options=profile_options, + options=profile_options_all, ) + inference_job = cast(hub.client.InferenceJob, submitted_inference_job) # 5. Download the model asset to a local file if not skip_downloading: os.makedirs(output_path, exist_ok=True) - target_model = compile_job.get_target_model() + target_model: hub.Model = compile_job.get_target_model() # type: ignore target_model.download(str(output_path / f"{model_name}.tflite")) # 6. Summarize the results from profiling and inference if not skip_summary and not skip_profiling: - assert profile_job.wait().success - profile_data = profile_job.download_profile() + assert profile_job is not None and profile_job.wait().success + profile_data: Dict[str, Any] = profile_job.download_profile() # type: ignore print_profile_metrics_from_job(profile_job, profile_data) if not skip_summary and not skip_inferencing: torch_out = torch_inference(model, sample_inputs) - assert inference_job.wait().success - inference_result = inference_job.download_output_data() + assert inference_job is not None and inference_job.wait().success + inference_result: hub.client.DatasetEntries = inference_job.download_output_data() # type: ignore # Convert outputs from channel last to channel first inference_result = transpose_channel_last_to_first( "output_0", inference_result, target_runtime ) print_inference_metrics(inference_job, inference_result, torch_out) - print_on_target_demo_cmd(compile_job, Path(__file__).parent.resolve(), device) + if not skip_summary: + print_on_target_demo_cmd(compile_job, Path(__file__).parent.resolve(), device) return (compile_job, profile_job, inference_job) diff --git a/qai_hub_models/models/sinet/info.yaml b/qai_hub_models/models/sinet/info.yaml index 94bcbfc6..e7a6719f 100644 --- a/qai_hub_models/models/sinet/info.yaml +++ b/qai_hub_models/models/sinet/info.yaml @@ -12,6 +12,7 @@ research_paper: https://arxiv.org/abs/1911.09099 research_paper_title: 'SINet: Extreme Lightweight Portrait Segmentation Networks with Spatial Squeeze Modules and Information Blocking Decoder' license: https://github.com/clovaai/ext_portrait_segmentation/blob/master/LICENSE +deploy_license: https://qaihub-public-assets.s3.us-west-2.amazonaws.com/qai-hub-models/Qualcomm+AI+Hub+Proprietary+License.pdf source_repo: 
https://github.com/clovaai/ext_portrait_segmentation technical_details: Model checkpoint: SINet.pth @@ -32,4 +33,5 @@ form_factors: has_static_banner: yes has_animated_banner: no license_type: other +deploy_license_type: AI Model Hub License dataset: [] diff --git a/qai_hub_models/models/sinet/model.py b/qai_hub_models/models/sinet/model.py index ea82116f..76625bd3 100644 --- a/qai_hub_models/models/sinet/model.py +++ b/qai_hub_models/models/sinet/model.py @@ -5,12 +5,14 @@ from __future__ import annotations import os +from importlib import reload import torch from qai_hub_models.utils.asset_loaders import ( CachedWebModelAsset, SourceAsRoot, + find_replace_in_repo, load_torch, ) from qai_hub_models.utils.base_model import BaseModel @@ -54,8 +56,8 @@ def forward(self, image: torch.Tensor) -> torch.Tensor: """ return self.model(image) + @staticmethod def get_input_spec( - self, batch_size: int = 1, num_channels: int = 3, height: int = 224, @@ -86,7 +88,14 @@ def _load_sinet_source_model_from_weights( ) -> torch.nn.Module: with SourceAsRoot( SINET_SOURCE_REPOSITORY, SINET_SOURCE_REPO_COMMIT, MODEL_ID, MODEL_ASSET_VERSION - ): + ) as repo_root: + # This repository has a top-level "models", which is common. We + # explicitly reload it in case it has been loaded and cached by another + # package (or our models when executing from qai_hub_models/) + import models + + reload(models) + if os.path.exists(os.path.expanduser(weights_name_or_path)): weights_path = os.path.expanduser(weights_name_or_path) else: @@ -99,11 +108,7 @@ def _load_sinet_source_model_from_weights( # Perform a find and replace for .data.size() in SINet's shuffle implementation # as tracing treats this as a constant, but does not treat .shape as a constant - with open("models/SINet.py", "r") as file: - file_content = file.read() - new_content = file_content.replace(".data.size()", ".shape") - with open("models/SINet.py", "w") as file: - file.write(new_content) + find_replace_in_repo(repo_root, "models/SINet.py", ".data.size()", ".shape") # import the model arch from models.SINet import SINet diff --git a/qai_hub_models/models/sinet/perf.yaml b/qai_hub_models/models/sinet/perf.yaml index 936c588b..914dbe7c 100644 --- a/qai_hub_models/models/sinet/perf.yaml +++ b/qai_hub_models/models/sinet/perf.yaml @@ -17,22 +17,25 @@ aggregated: - Samsung Galaxy S23 - Samsung Galaxy S23 Ultra - Samsung Galaxy S23+ + - Samsung Galaxy S24 + - Samsung Galaxy S24 Ultra - Samsung Galaxy Tab S8 - Xiaomi 12 - Xiaomi 12 Pro supported_chipsets: - Snapdragon® 8 Gen 1 - Snapdragon® 8 Gen 2 + - Snapdragon® 8 Gen 3 - Snapdragon® 888 models: - name: SINet performance_metrics: - torchscript_onnx_tflite: - inference_time: 1817.0 - throughput: 550.357732526142 + inference_time: 1809.0 + throughput: 552.791597567717 estimated_peak_memory_range: - min: 434176 - max: 2872792 + min: 20480 + max: 2244048 primary_compute_unit: NPU precision: fp16 layer_info: @@ -40,14 +43,22 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 240 - job_id: jegnzmkmg + job_id: jw566wn5o job_status: Passed + reference_device_info: + name: Samsung Galaxy S23 + os: '13' + form_factor: Phone + os_name: Android + manufacturer: Samsung + chipset: Snapdragon® 8 Gen 2 + timestamp: '2024-03-14T23:17:54.436410Z' torchscript_onnx_qnn: - inference_time: 1192.0 - throughput: 838.9261744966443 + inference_time: 1193.0 + throughput: 838.2229673093043 estimated_peak_memory_range: - min: 622592 - max: 51366312 + min: 20480 + max: 25094232 primary_compute_unit: NPU precision: fp16 layer_info: @@ 
-55,13 +66,43 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 187 - job_id: joprl2wep + job_id: jwgoy8158 + job_status: Passed + - torchscript_onnx_tflite: + inference_time: 1170.0 + throughput: 854.7008547008547 + estimated_peak_memory_range: + min: 12288 + max: 24922736 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 240 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 240 + job_id: j1p3k6m52 job_status: Passed reference_device_info: - name: Samsung Galaxy S23 Ultra - os: '13' + name: Samsung Galaxy S24 + os: '14' form_factor: Phone os_name: Android manufacturer: Samsung - chipset: Snapdragon® 8 Gen 2 - timestamp: '2024-02-21T16:11:37.141843Z' + chipset: Snapdragon® 8 Gen 3 + timestamp: '2024-03-14T23:22:20.419307Z' + torchscript_onnx_qnn: + inference_time: 802.0 + throughput: 1246.8827930174564 + estimated_peak_memory_range: + min: 12288 + max: 65545232 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 187 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 187 + job_id: j1pv37z5x + job_status: Passed diff --git a/qai_hub_models/models/sinet/test.py b/qai_hub_models/models/sinet/test.py index 365aad84..4780256f 100644 --- a/qai_hub_models/models/sinet/test.py +++ b/qai_hub_models/models/sinet/test.py @@ -32,5 +32,6 @@ def test_task(): ) +@skip_clone_repo_check def test_demo(): demo_main(is_test=True) diff --git a/qai_hub_models/models/squeezenet1_1/README.md b/qai_hub_models/models/squeezenet1_1/README.md index 37e48f82..ed782b23 100644 --- a/qai_hub_models/models/squeezenet1_1/README.md +++ b/qai_hub_models/models/squeezenet1_1/README.md @@ -10,7 +10,7 @@ This is based on the implementation of SqueezeNet-1_1 found export suitable to run on Qualcomm® devices. More details on model performance accross various devices, can be found [here](https://aihub.qualcomm.com/models/squeezenet1_1). -[Sign up](https://aihub.qualcomm.com/) for early access to run these models on +[Sign up](https://myaccount.qualcomm.com/signup) for early access to run these models on a hosted Qualcomm® device. @@ -25,7 +25,7 @@ python -m qai_hub_models.models.squeezenet1_1.demo More details on the CLI tool can be found with the `--help` option. See [demo.py](demo.py) for sample usage of the model including pre/post processing scripts. Please refer to our [general instructions on using -models](../../#qai-hub-models) for more usage instructions. +models](../../../#getting-started) for more usage instructions. ## Export for on-device deployment @@ -41,7 +41,7 @@ script requires access to Deployment instructions for Qualcomm® AI Hub. ## License - The license for the original implementation of SqueezeNet-1_1 can be found [here](https://github.com/pytorch/vision/blob/main/LICENSE). -- The license for the compiled assets for on-device deployment can be found [here](https://qaihub-public-assets.s3.us-west-2.amazonaws.com/qai-hub-models/Qualcomm+AI+Hub+Proprietary+License.pdf). 
+- The license for the compiled assets for on-device deployment can be found [here]({deploy_license_url}) ## References * [SqueezeNet: AlexNet-level accuracy with 50x fewer parameters and <0.5MB model size](https://arxiv.org/abs/1602.07360) diff --git a/qai_hub_models/models/squeezenet1_1/conftest.py b/qai_hub_models/models/squeezenet1_1/conftest.py new file mode 100644 index 00000000..6f693de1 --- /dev/null +++ b/qai_hub_models/models/squeezenet1_1/conftest.py @@ -0,0 +1,24 @@ +# --------------------------------------------------------------------- +# Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. +# SPDX-License-Identifier: BSD-3-Clause +# --------------------------------------------------------------------- +# THIS FILE WAS AUTO-GENERATED. DO NOT EDIT MANUALLY. + +from unittest.mock import patch + +import pytest + +from qai_hub_models.models.squeezenet1_1 import Model + + +@pytest.fixture(autouse=True) +def mock_from_pretrained(): + """ + Model.from_pretrained() can be slow. Invoke it once and cache it so all invocations + across all tests return the cached instance of the model. + """ + mock = patch( + "qai_hub_models.models.squeezenet1_1.Model.from_pretrained", + return_value=Model.from_pretrained(), + ) + mock.start() diff --git a/qai_hub_models/models/squeezenet1_1/demo.py b/qai_hub_models/models/squeezenet1_1/demo.py index 75640bd4..2eed38c2 100644 --- a/qai_hub_models/models/squeezenet1_1/demo.py +++ b/qai_hub_models/models/squeezenet1_1/demo.py @@ -3,11 +3,11 @@ # SPDX-License-Identifier: BSD-3-Clause # --------------------------------------------------------------------- from qai_hub_models.models._shared.imagenet_classifier.demo import imagenet_demo -from qai_hub_models.models.squeezenet1_1.model import SqueezeNet +from qai_hub_models.models.squeezenet1_1.model import MODEL_ID, SqueezeNet def main(is_test: bool = False): - imagenet_demo(SqueezeNet, is_test) + imagenet_demo(SqueezeNet, MODEL_ID, is_test) if __name__ == "__main__": diff --git a/qai_hub_models/models/squeezenet1_1/export.py b/qai_hub_models/models/squeezenet1_1/export.py index 9c1da861..1586bb36 100644 --- a/qai_hub_models/models/squeezenet1_1/export.py +++ b/qai_hub_models/models/squeezenet1_1/export.py @@ -10,7 +10,7 @@ import os import warnings from pathlib import Path -from typing import List, Optional, Tuple +from typing import Any, Dict, List, Optional, Tuple, cast import qai_hub as hub import torch @@ -109,35 +109,43 @@ def export_model( ) # Trace the model - source_model = torch.jit.trace(model, make_torch_inputs(input_spec)) + source_model = torch.jit.trace(model.to("cpu"), make_torch_inputs(input_spec)) # 2. Compile the model to an on-device asset model_compile_options = model.get_hub_compile_options( target_runtime, compile_options + " --force_channel_last_input image_tensor" ) - print(f"Optimizing model {model_name} to run on-device.") - compile_job = hub.submit_compile_job( + print(f"Optimizing model {model_name} to run on-device") + submitted_compile_job = hub.submit_compile_job( model=source_model, input_specs=input_spec, device=hub.Device(device), name=model_name, options=model_compile_options, ) + compile_job = cast(hub.client.CompileJob, submitted_compile_job) # 3. 
Profile the model asset on real devices - profile_job = None + profile_job: Optional[hub.client.ProfileJob] = None if not skip_profiling: + profile_options_all = model.get_hub_profile_options( + target_runtime, profile_options + ) print(f"Profiling model {model_name} on a hosted device.") - profile_job = hub.submit_profile_job( + submitted_profile_job = hub.submit_profile_job( model=compile_job.get_target_model(), device=hub.Device(device), name=model_name, - options=profile_options, + options=profile_options_all, ) + profile_job = cast(hub.client.ProfileJob, submitted_profile_job) # 4. Run inference on-device with sample inputs - inference_job = None + inference_job: Optional[hub.client.InferenceJob] = None if not skip_inferencing: + profile_options_all = model.get_hub_profile_options( + target_runtime, profile_options + ) print( f"Running inference for {model_name} on a hosted device with example inputs." ) @@ -146,33 +154,35 @@ def export_model( hub_inputs = transpose_channel_first_to_last( "image_tensor", sample_inputs, target_runtime ) - inference_job = hub.submit_inference_job( + submitted_inference_job = hub.submit_inference_job( model=compile_job.get_target_model(), inputs=hub_inputs, device=hub.Device(device), name=model_name, - options=profile_options, + options=profile_options_all, ) + inference_job = cast(hub.client.InferenceJob, submitted_inference_job) # 5. Download the model asset to a local file if not skip_downloading: os.makedirs(output_path, exist_ok=True) - target_model = compile_job.get_target_model() + target_model: hub.Model = compile_job.get_target_model() # type: ignore target_model.download(str(output_path / f"{model_name}.tflite")) # 6. Summarize the results from profiling and inference if not skip_summary and not skip_profiling: - assert profile_job.wait().success - profile_data = profile_job.download_profile() + assert profile_job is not None and profile_job.wait().success + profile_data: Dict[str, Any] = profile_job.download_profile() # type: ignore print_profile_metrics_from_job(profile_job, profile_data) if not skip_summary and not skip_inferencing: torch_out = torch_inference(model, sample_inputs) - assert inference_job.wait().success - inference_result = inference_job.download_output_data() + assert inference_job is not None and inference_job.wait().success + inference_result: hub.client.DatasetEntries = inference_job.download_output_data() # type: ignore print_inference_metrics(inference_job, inference_result, torch_out) - print_on_target_demo_cmd(compile_job, Path(__file__).parent.resolve(), device) + if not skip_summary: + print_on_target_demo_cmd(compile_job, Path(__file__).parent.resolve(), device) return (compile_job, profile_job, inference_job) diff --git a/qai_hub_models/models/squeezenet1_1/info.yaml b/qai_hub_models/models/squeezenet1_1/info.yaml index 3763db61..dba5e172 100644 --- a/qai_hub_models/models/squeezenet1_1/info.yaml +++ b/qai_hub_models/models/squeezenet1_1/info.yaml @@ -14,6 +14,7 @@ research_paper: https://arxiv.org/abs/1602.07360 research_paper_title: 'SqueezeNet: AlexNet-level accuracy with 50x fewer parameters and <0.5MB model size' license: https://github.com/pytorch/vision/blob/main/LICENSE +deploy_license: https://qaihub-public-assets.s3.us-west-2.amazonaws.com/qai-hub-models/Qualcomm+AI+Hub+Proprietary+License.pdf source_repo: https://github.com/pytorch/vision/blob/main/torchvision/models/squeezenet.py technical_details: Model checkpoint: Imagenet @@ -35,6 +36,7 @@ form_factors: has_static_banner: yes has_animated_banner: no 
license_type: bsd-3-clause +deploy_license_type: AI Model Hub License dataset: - imagenet-1k - imagenet-22k diff --git a/qai_hub_models/models/squeezenet1_1/model.py b/qai_hub_models/models/squeezenet1_1/model.py index 6a08155f..9d97e4c0 100644 --- a/qai_hub_models/models/squeezenet1_1/model.py +++ b/qai_hub_models/models/squeezenet1_1/model.py @@ -14,6 +14,6 @@ class SqueezeNet(ImagenetClassifier): @classmethod - def from_pretrained(cls, weights: str = DEFAULT_WEIGHTS) -> ImagenetClassifier: + def from_pretrained(cls, weights: str = DEFAULT_WEIGHTS) -> SqueezeNet: net = tv_models.squeezenet1_1(weights=weights) return cls(net) diff --git a/qai_hub_models/models/squeezenet1_1/perf.yaml b/qai_hub_models/models/squeezenet1_1/perf.yaml index 6fe797b1..217d17b8 100644 --- a/qai_hub_models/models/squeezenet1_1/perf.yaml +++ b/qai_hub_models/models/squeezenet1_1/perf.yaml @@ -17,22 +17,25 @@ aggregated: - Samsung Galaxy S23 - Samsung Galaxy S23 Ultra - Samsung Galaxy S23+ + - Samsung Galaxy S24 + - Samsung Galaxy S24 Ultra - Samsung Galaxy Tab S8 - Xiaomi 12 - Xiaomi 12 Pro supported_chipsets: - Snapdragon® 8 Gen 1 - Snapdragon® 8 Gen 2 + - Snapdragon® 8 Gen 3 - Snapdragon® 888 models: - name: SqueezeNet-1_1 performance_metrics: - torchscript_onnx_tflite: - inference_time: 212.0 - throughput: 4716.981132075472 + inference_time: 225.0 + throughput: 4444.444444444444 estimated_peak_memory_range: - min: 20480 - max: 1439360 + min: 24576 + max: 1431872 primary_compute_unit: NPU precision: fp16 layer_info: @@ -40,14 +43,22 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 39 - job_id: j1pvlre75 + job_id: j1p8ol8g9 job_status: Passed + reference_device_info: + name: Samsung Galaxy S23 + os: '13' + form_factor: Phone + os_name: Android + manufacturer: Samsung + chipset: Snapdragon® 8 Gen 2 + timestamp: '2024-03-14T23:32:29.373813Z' torchscript_onnx_qnn: - inference_time: 280.0 - throughput: 3571.4285714285716 + inference_time: 278.0 + throughput: 3597.122302158273 estimated_peak_memory_range: min: 20480 - max: 12471928 + max: 53223728 primary_compute_unit: NPU precision: fp16 layer_info: @@ -55,13 +66,43 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 69 - job_id: j7gjr2o7p + job_id: jn5q8jm57 + job_status: Passed + - torchscript_onnx_tflite: + inference_time: 181.0 + throughput: 5524.861878453039 + estimated_peak_memory_range: + min: 12288 + max: 21672448 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 39 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 39 + job_id: jogkzjogd job_status: Passed reference_device_info: - name: Samsung Galaxy S23 Ultra - os: '13' + name: Samsung Galaxy S24 + os: '14' form_factor: Phone os_name: Android manufacturer: Samsung - chipset: Snapdragon® 8 Gen 2 - timestamp: '2024-02-21T16:17:05.340427Z' + chipset: Snapdragon® 8 Gen 3 + timestamp: '2024-03-14T23:34:27.758337Z' + torchscript_onnx_qnn: + inference_time: 199.0 + throughput: 5025.125628140703 + estimated_peak_memory_range: + min: 618496 + max: 28404384 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 69 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 69 + job_id: j1glnjlpv + job_status: Passed diff --git a/qai_hub_models/models/squeezenet1_1/test.py b/qai_hub_models/models/squeezenet1_1/test.py index 0b6f2e19..8d745b7f 100644 --- a/qai_hub_models/models/squeezenet1_1/test.py +++ b/qai_hub_models/models/squeezenet1_1/test.py @@ -2,6 +2,8 @@ # Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. 
# SPDX-License-Identifier: BSD-3-Clause # --------------------------------------------------------------------- +import pytest + from qai_hub_models.models._shared.imagenet_classifier.test_utils import ( run_imagenet_classifier_test, run_imagenet_classifier_trace_test, @@ -14,6 +16,7 @@ def test_task(): run_imagenet_classifier_test(SqueezeNet.from_pretrained(), MODEL_ID) +@pytest.mark.trace def test_trace(): run_imagenet_classifier_trace_test(SqueezeNet.from_pretrained()) diff --git a/qai_hub_models/models/squeezenet1_1_quantized/README.md b/qai_hub_models/models/squeezenet1_1_quantized/README.md index 89b9b080..ebb01c7c 100644 --- a/qai_hub_models/models/squeezenet1_1_quantized/README.md +++ b/qai_hub_models/models/squeezenet1_1_quantized/README.md @@ -10,7 +10,7 @@ This is based on the implementation of SqueezeNet-1_1Quantized found export suitable to run on Qualcomm® devices. More details on model performance accross various devices, can be found [here](https://aihub.qualcomm.com/models/squeezenet1_1_quantized). -[Sign up](https://aihub.qualcomm.com/) for early access to run these models on +[Sign up](https://myaccount.qualcomm.com/signup) for early access to run these models on a hosted Qualcomm® device. @@ -25,7 +25,7 @@ python -m qai_hub_models.models.squeezenet1_1_quantized.demo More details on the CLI tool can be found with the `--help` option. See [demo.py](demo.py) for sample usage of the model including pre/post processing scripts. Please refer to our [general instructions on using -models](../../#qai-hub-models) for more usage instructions. +models](../../../#getting-started) for more usage instructions. ## Export for on-device deployment @@ -41,7 +41,7 @@ script requires access to Deployment instructions for Qualcomm® AI Hub. ## License - The license for the original implementation of SqueezeNet-1_1Quantized can be found [here](https://github.com/pytorch/vision/blob/main/LICENSE). -- The license for the compiled assets for on-device deployment can be found [here](https://qaihub-public-assets.s3.us-west-2.amazonaws.com/qai-hub-models/Qualcomm+AI+Hub+Proprietary+License.pdf). +- The license for the compiled assets for on-device deployment can be found [here]({deploy_license_url}) ## References * [SqueezeNet: AlexNet-level accuracy with 50x fewer parameters and <0.5MB model size](https://arxiv.org/abs/1602.07360) diff --git a/qai_hub_models/models/squeezenet1_1_quantized/conftest.py b/qai_hub_models/models/squeezenet1_1_quantized/conftest.py new file mode 100644 index 00000000..d216d03d --- /dev/null +++ b/qai_hub_models/models/squeezenet1_1_quantized/conftest.py @@ -0,0 +1,24 @@ +# --------------------------------------------------------------------- +# Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. +# SPDX-License-Identifier: BSD-3-Clause +# --------------------------------------------------------------------- +# THIS FILE WAS AUTO-GENERATED. DO NOT EDIT MANUALLY. + +from unittest.mock import patch + +import pytest + +from qai_hub_models.models.squeezenet1_1_quantized import Model + + +@pytest.fixture(autouse=True) +def mock_from_pretrained(): + """ + Model.from_pretrained() can be slow. Invoke it once and cache it so all invocations + across all tests return the cached instance of the model. 
+ """ + mock = patch( + "qai_hub_models.models.squeezenet1_1_quantized.Model.from_pretrained", + return_value=Model.from_pretrained(), + ) + mock.start() diff --git a/qai_hub_models/models/squeezenet1_1_quantized/demo.py b/qai_hub_models/models/squeezenet1_1_quantized/demo.py index fdd8fc5d..f6b6145b 100644 --- a/qai_hub_models/models/squeezenet1_1_quantized/demo.py +++ b/qai_hub_models/models/squeezenet1_1_quantized/demo.py @@ -3,11 +3,14 @@ # SPDX-License-Identifier: BSD-3-Clause # --------------------------------------------------------------------- from qai_hub_models.models._shared.imagenet_classifier.demo import imagenet_demo -from qai_hub_models.models.squeezenet1_1_quantized.model import SqueezeNetQuantizable +from qai_hub_models.models.squeezenet1_1_quantized.model import ( + MODEL_ID, + SqueezeNetQuantizable, +) def main(is_test: bool = False): - imagenet_demo(SqueezeNetQuantizable, is_test) + imagenet_demo(SqueezeNetQuantizable, MODEL_ID, is_test) if __name__ == "__main__": diff --git a/qai_hub_models/models/squeezenet1_1_quantized/export.py b/qai_hub_models/models/squeezenet1_1_quantized/export.py index 8e42fb41..e4256985 100644 --- a/qai_hub_models/models/squeezenet1_1_quantized/export.py +++ b/qai_hub_models/models/squeezenet1_1_quantized/export.py @@ -10,7 +10,7 @@ import os import warnings from pathlib import Path -from typing import List, Optional, Tuple +from typing import Any, Dict, List, Optional, Tuple, cast import qai_hub as hub @@ -120,8 +120,8 @@ def export_model( model_compile_options = model.get_hub_compile_options( target_runtime, compile_options + " --force_channel_last_input image_tensor" ) - print(f"Optimizing model {model_name} to run on-device.") - compile_job = hub.submit_compile_job( + print(f"Optimizing model {model_name} to run on-device") + submitted_compile_job = hub.submit_compile_job( model=source_model, input_specs=input_spec, device=hub.Device(device), @@ -129,21 +129,29 @@ def export_model( calibration_data=quant_calibration_data, options=model_compile_options, ) + compile_job = cast(hub.client.CompileJob, submitted_compile_job) # 3. Profile the model asset on real devices - profile_job = None + profile_job: Optional[hub.client.ProfileJob] = None if not skip_profiling: + profile_options_all = model.get_hub_profile_options( + target_runtime, profile_options + ) print(f"Profiling model {model_name} on a hosted device.") - profile_job = hub.submit_profile_job( + submitted_profile_job = hub.submit_profile_job( model=compile_job.get_target_model(), device=hub.Device(device), name=model_name, - options=profile_options, + options=profile_options_all, ) + profile_job = cast(hub.client.ProfileJob, submitted_profile_job) # 4. Run inference on-device with sample inputs - inference_job = None + inference_job: Optional[hub.client.InferenceJob] = None if not skip_inferencing: + profile_options_all = model.get_hub_profile_options( + target_runtime, profile_options + ) print( f"Running inference for {model_name} on a hosted device with example inputs." ) @@ -155,33 +163,35 @@ def export_model( hub_inputs = transpose_channel_first_to_last( "image_tensor", sample_inputs, target_runtime ) - inference_job = hub.submit_inference_job( + submitted_inference_job = hub.submit_inference_job( model=compile_job.get_target_model(), inputs=hub_inputs, device=hub.Device(device), name=model_name, - options=profile_options, + options=profile_options_all, ) + inference_job = cast(hub.client.InferenceJob, submitted_inference_job) # 5. 
Download the model asset to a local file if not skip_downloading: os.makedirs(output_path, exist_ok=True) - target_model = compile_job.get_target_model() + target_model: hub.Model = compile_job.get_target_model() # type: ignore target_model.download(str(output_path / f"{model_name}.tflite")) # 6. Summarize the results from profiling and inference if not skip_summary and not skip_profiling: - assert profile_job.wait().success - profile_data = profile_job.download_profile() + assert profile_job is not None and profile_job.wait().success + profile_data: Dict[str, Any] = profile_job.download_profile() # type: ignore print_profile_metrics_from_job(profile_job, profile_data) if not skip_summary and not skip_inferencing: torch_out = torch_inference(model, sample_inputs) - assert inference_job.wait().success - inference_result = inference_job.download_output_data() + assert inference_job is not None and inference_job.wait().success + inference_result: hub.client.DatasetEntries = inference_job.download_output_data() # type: ignore print_inference_metrics(inference_job, inference_result, torch_out) - print_on_target_demo_cmd(compile_job, Path(__file__).parent.resolve(), device) + if not skip_summary: + print_on_target_demo_cmd(compile_job, Path(__file__).parent.resolve(), device) return (compile_job, profile_job, inference_job) diff --git a/qai_hub_models/models/squeezenet1_1_quantized/info.yaml b/qai_hub_models/models/squeezenet1_1_quantized/info.yaml index 6284192b..8daf5c4d 100644 --- a/qai_hub_models/models/squeezenet1_1_quantized/info.yaml +++ b/qai_hub_models/models/squeezenet1_1_quantized/info.yaml @@ -15,6 +15,7 @@ research_paper: https://arxiv.org/abs/1602.07360 research_paper_title: 'SqueezeNet: AlexNet-level accuracy with 50x fewer parameters and <0.5MB model size' license: https://github.com/pytorch/vision/blob/main/LICENSE +deploy_license: https://qaihub-public-assets.s3.us-west-2.amazonaws.com/qai-hub-models/Qualcomm+AI+Hub+Proprietary+License.pdf source_repo: https://github.com/pytorch/vision/blob/main/torchvision/models/squeezenet.py technical_details: Model checkpoint: Imagenet @@ -36,6 +37,7 @@ form_factors: has_static_banner: yes has_animated_banner: no license_type: bsd-3-clause +deploy_license_type: AI Model Hub License dataset: - imagenet-1k - imagenet-22k diff --git a/qai_hub_models/models/squeezenet1_1_quantized/model.py b/qai_hub_models/models/squeezenet1_1_quantized/model.py index 554e4a15..67a3f532 100644 --- a/qai_hub_models/models/squeezenet1_1_quantized/model.py +++ b/qai_hub_models/models/squeezenet1_1_quantized/model.py @@ -14,14 +14,16 @@ import torch from aimet_torch.cross_layer_equalization import equalize_model +from aimet_torch.model_preparer import prepare_model from aimet_torch.quantsim import QuantizationSimModel, load_encodings_to_sim from qai_hub_models.models.squeezenet1_1.model import SqueezeNet -from qai_hub_models.utils.aimet.config_loader import get_per_channel_aimet_config +from qai_hub_models.utils.aimet.config_loader import get_default_aimet_config from qai_hub_models.utils.asset_loaders import CachedWebModelAsset +from qai_hub_models.utils.base_model import SourceModelFormat, TargetRuntime MODEL_ID = __name__.split(".")[-2] -MODEL_ASSET_VERSION = 1 +MODEL_ASSET_VERSION = 2 DEFAULT_ENCODINGS = "squeezenet1_1_quantized_encodings.json" @@ -37,9 +39,15 @@ def __init__( ) -> None: SqueezeNet.__init__(self, sim_model.model) AIMETQuantizableMixin.__init__( - self, sim_model, needs_onnx_direct_aimet_export=True + self, + sim_model, ) + def 
preferred_hub_source_model_format( + self, target_runtime: TargetRuntime + ) -> SourceModelFormat: + return SourceModelFormat.ONNX + @classmethod def from_pretrained( cls, @@ -53,15 +61,16 @@ def from_pretrained( else: Interprets as a filepath and loads the encodings stored there. """ model = SqueezeNet.from_pretrained() - input_shape = model.get_input_spec()["image_tensor"][0] + input_shape = cls.get_input_spec()["image_tensor"][0] + model = prepare_model(model) equalize_model(model, input_shape) sim = QuantizationSimModel( - model.net, + model, quant_scheme="tf_enhanced", default_param_bw=8, default_output_bw=8, - config_file=get_per_channel_aimet_config(), + config_file=get_default_aimet_config(), dummy_input=torch.rand(input_shape), ) @@ -74,3 +83,11 @@ def from_pretrained( sim.model.eval() return cls(sim) + + def get_hub_compile_options( + self, target_runtime: TargetRuntime, other_compile_options: str = "" + ) -> str: + compile_options = super().get_hub_compile_options( + target_runtime, other_compile_options + ) + return compile_options + " --quantize_full_type int8 --quantize_io" diff --git a/qai_hub_models/models/squeezenet1_1_quantized/perf.yaml b/qai_hub_models/models/squeezenet1_1_quantized/perf.yaml index 00894c2f..fcf427fe 100644 --- a/qai_hub_models/models/squeezenet1_1_quantized/perf.yaml +++ b/qai_hub_models/models/squeezenet1_1_quantized/perf.yaml @@ -17,51 +17,92 @@ aggregated: - Samsung Galaxy S23 - Samsung Galaxy S23 Ultra - Samsung Galaxy S23+ + - Samsung Galaxy S24 + - Samsung Galaxy S24 Ultra - Samsung Galaxy Tab S8 - Xiaomi 12 - Xiaomi 12 Pro supported_chipsets: - Snapdragon® 8 Gen 1 - Snapdragon® 8 Gen 2 + - Snapdragon® 8 Gen 3 - Snapdragon® 888 models: - name: SqueezeNet-1_1Quantized performance_metrics: - torchscript_onnx_tflite: - inference_time: 215.0 - throughput: 4651.162790697675 + inference_time: 176.0 + throughput: 5681.818181818182 estimated_peak_memory_range: - min: 20480 - max: 1657648 + min: 12288 + max: 2498992 primary_compute_unit: NPU - precision: fp16 + precision: int8 + layer_info: + layers_on_npu: 39 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 39 + job_id: jnp10jk5q + job_status: Passed + reference_device_info: + name: Samsung Galaxy S23 + os: '13' + form_factor: Phone + os_name: Android + manufacturer: Samsung + chipset: Snapdragon® 8 Gen 2 + timestamp: '2024-03-14T23:48:25.505884Z' + torchscript_onnx_qnn: + inference_time: 185.0 + throughput: 5405.405405405405 + estimated_peak_memory_range: + min: 172032 + max: 55116856 + primary_compute_unit: NPU + precision: int8 layer_info: layers_on_npu: 43 layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 43 - job_id: jegnzmovg + job_id: jqp4q1qgo job_status: Passed - torchscript_onnx_qnn: - inference_time: 227.0 - throughput: 4405.286343612334 + - torchscript_onnx_tflite: + inference_time: 135.0 + throughput: 7407.407407407408 estimated_peak_memory_range: - min: 622592 - max: 62441592 + min: 12288 + max: 21511824 primary_compute_unit: NPU - precision: fp16 + precision: int8 layer_info: - layers_on_npu: 45 + layers_on_npu: 39 layers_on_gpu: 0 layers_on_cpu: 0 - total_layers: 45 - job_id: joprl2ovp + total_layers: 39 + job_id: jz57z4qp3 job_status: Passed reference_device_info: - name: Samsung Galaxy S23 Ultra - os: '13' + name: Samsung Galaxy S24 + os: '14' form_factor: Phone os_name: Android manufacturer: Samsung - chipset: Snapdragon® 8 Gen 2 - timestamp: '2024-02-21T16:29:43.800896Z' + chipset: Snapdragon® 8 Gen 3 + timestamp: '2024-03-14T23:51:35.601938Z' + torchscript_onnx_qnn: + 
inference_time: 146.0 + throughput: 6849.315068493151 + estimated_peak_memory_range: + min: 159744 + max: 18650384 + primary_compute_unit: NPU + precision: int8 + layer_info: + layers_on_npu: 43 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 43 + job_id: jo5mrmygk + job_status: Passed diff --git a/qai_hub_models/models/squeezenet1_1_quantized/test.py b/qai_hub_models/models/squeezenet1_1_quantized/test.py index bf4f2ec9..9c927cf5 100644 --- a/qai_hub_models/models/squeezenet1_1_quantized/test.py +++ b/qai_hub_models/models/squeezenet1_1_quantized/test.py @@ -4,7 +4,6 @@ # --------------------------------------------------------------------- from qai_hub_models.models._shared.imagenet_classifier.test_utils import ( run_imagenet_classifier_test, - run_imagenet_classifier_trace_test, ) from qai_hub_models.models.squeezenet1_1_quantized.demo import main as demo_main from qai_hub_models.models.squeezenet1_1_quantized.model import ( @@ -25,16 +24,6 @@ def test_task(): ) -def test_trace(): - run_imagenet_classifier_trace_test( - SqueezeNetQuantizable.from_pretrained(), - diff_tol=0.01, - rtol=0.02, - atol=0.2, - is_quantized=True, - ) - - def test_demo(): # Verify demo does not crash demo_main(is_test=True) diff --git a/qai_hub_models/models/stable_diffusion_quantized/README.md b/qai_hub_models/models/stable_diffusion_quantized/README.md index 04fbbce0..2cb891b9 100644 --- a/qai_hub_models/models/stable_diffusion_quantized/README.md +++ b/qai_hub_models/models/stable_diffusion_quantized/README.md @@ -10,7 +10,7 @@ This is based on the implementation of Stable-Diffusion found export suitable to run on Qualcomm® devices. More details on model performance accross various devices, can be found [here](https://aihub.qualcomm.com/models/stable_diffusion_quantized). -[Sign up](https://aihub.qualcomm.com/) for early access to run these models on +[Sign up](https://myaccount.qualcomm.com/signup) for early access to run these models on a hosted Qualcomm® device. @@ -30,7 +30,7 @@ python -m qai_hub_models.models.stable_diffusion_quantized.demo More details on the CLI tool can be found with the `--help` option. See [demo.py](demo.py) for sample usage of the model including pre/post processing scripts. Please refer to our [general instructions on using -models](../../#qai-hub-models) for more usage instructions. +models](../../../#getting-started) for more usage instructions. ## Export for on-device deployment @@ -46,7 +46,7 @@ script requires access to Deployment instructions for Qualcomm® AI Hub. ## License - The license for the original implementation of Stable-Diffusion can be found [here](https://github.com/CompVis/stable-diffusion/blob/main/LICENSE). -- The license for the compiled assets for on-device deployment can be found [here](https://qaihub-public-assets.s3.us-west-2.amazonaws.com/qai-hub-models/Qualcomm+AI+Hub+Proprietary+License.pdf). 
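The refreshed perf.yaml figures in this patch are internally consistent: `throughput` is the reciprocal of `inference_time`, with the latter reported in microseconds. A quick arithmetic check against two of the entries above (values copied from this hunk):

```python
# throughput (inferences/sec) = 1e6 / inference_time (microseconds)
for inference_time_us, reported_throughput in [
    (176.0, 5681.818181818182),   # SqueezeNet-1_1Quantized, TFLite, Galaxy S23
    (146.0, 6849.315068493151),   # SqueezeNet-1_1Quantized, QNN, Galaxy S24
]:
    assert abs(1e6 / inference_time_us - reported_throughput) < 1e-6
```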
+- The license for the compiled assets for on-device deployment can be found [here]({deploy_license_url}) ## References * [High-Resolution Image Synthesis with Latent Diffusion Models](https://arxiv.org/abs/2112.10752) diff --git a/qai_hub_models/models/stable_diffusion_quantized/export.py b/qai_hub_models/models/stable_diffusion_quantized/export.py index 5f9280bd..eb3af6b4 100644 --- a/qai_hub_models/models/stable_diffusion_quantized/export.py +++ b/qai_hub_models/models/stable_diffusion_quantized/export.py @@ -9,25 +9,21 @@ import warnings from pathlib import Path -from typing import List, Mapping, Optional, Tuple +from typing import Any, Dict, List, Mapping, Optional, Tuple, cast import qai_hub as hub from qai_hub_models.models.stable_diffusion_quantized import Model from qai_hub_models.utils.args import export_parser -from qai_hub_models.utils.base_model import TargetRuntime +from qai_hub_models.utils.base_model import BasePrecompiledModel, TargetRuntime from qai_hub_models.utils.printing import print_profile_metrics_from_job from qai_hub_models.utils.qai_hub_helpers import ( can_access_qualcomm_ai_hub, export_without_hub_access, ) -ALL_COMPONENTS = ["Text-Encoder-Quantized", "UNet-Quantized", "VAE-Decoder-Quantized"] -DEFAULT_COMPONENTS = [ - "Text-Encoder-Quantized", - "VAE-Decoder-Quantized", - "UNet-Quantized", -] +ALL_COMPONENTS = ["TextEncoder_Quantized", "UNet_Quantized", "VAEDecoder_Quantized"] +DEFAULT_COMPONENTS = ["TextEncoder_Quantized", "VAEDecoder_Quantized", "UNet_Quantized"] def export_model( @@ -79,9 +75,9 @@ def export_model( output_path = Path(output_dir or Path.cwd() / "build" / model_name) component_arg = components components = components or DEFAULT_COMPONENTS - for component in components: - if component not in ALL_COMPONENTS: - raise ValueError(f"Invalid component {component}.") + for component_name in components: + if component_name not in ALL_COMPONENTS: + raise ValueError(f"Invalid component {component_name}.") if not can_access_qualcomm_ai_hub(): return export_without_hub_access( "stable_diffusion_quantized", @@ -98,16 +94,17 @@ def export_model( component_arg, ) + target_runtime = TargetRuntime.TFLITE # 1. Initialize model print("Initializing model class") model = Model.from_precompiled() - components_dict = {} - if "Text-Encoder-Quantized" in components: - components_dict["Text-Encoder-Quantized"] = model.text_encoder - if "UNet-Quantized" in components: - components_dict["UNet-Quantized"] = model.unet - if "VAE-Decoder-Quantized" in components: - components_dict["VAE-Decoder-Quantized"] = model.vae_decoder + components_dict: Dict[str, BasePrecompiledModel] = {} + if "TextEncoder_Quantized" in components: + components_dict["TextEncoder_Quantized"] = model.text_encoder # type: ignore + if "UNet_Quantized" in components: + components_dict["UNet_Quantized"] = model.unet # type: ignore + if "VAEDecoder_Quantized" in components: + components_dict["VAEDecoder_Quantized"] = model.vae_decoder # type: ignore # 2. Upload model assets to hub print("Uploading model assets on hub") @@ -118,39 +115,51 @@ def export_model( ) # 3. 
Profile the model assets on real devices - profile_jobs = {} + profile_jobs: Dict[str, hub.client.ProfileJob] = {} if not skip_profiling: for component_name in components: + profile_options_all = components_dict[ + component_name + ].get_hub_profile_options(target_runtime, profile_options) print(f"Profiling model {component_name} on a hosted device.") - profile_jobs[component_name] = hub.submit_profile_job( + submitted_profile_job = hub.submit_profile_job( model=uploaded_models[component_name], device=hub.Device(device), - name=f"{component_name}", - options=profile_options, + name=f"{model_name}_{component_name}", + options=profile_options_all, + ) + profile_jobs[component_name] = cast( + hub.client.ProfileJob, submitted_profile_job ) # 4. Run inference on-device with sample inputs - inference_jobs = {} + inference_jobs: Dict[str, hub.client.InferenceJob] = {} if not skip_inferencing: for component_name in components: print( f"Running inference for {component_name} on a hosted device with example inputs." ) + profile_options_all = components_dict[ + component_name + ].get_hub_profile_options(target_runtime, profile_options) sample_inputs = components_dict[component_name].sample_inputs() - inference_jobs[component_name] = hub.submit_inference_job( + submitted_inference_job = hub.submit_inference_job( model=uploaded_models[component_name], inputs=sample_inputs, device=hub.Device(device), - name=f"{component_name}", - options=profile_options, + name=f"{model_name}_{component_name}", + options=profile_options_all, + ) + inference_jobs[component_name] = cast( + hub.client.InferenceJob, submitted_inference_job ) # 5. Summarize the results from profiling if not skip_summary and not skip_profiling: for component_name in components: profile_job = profile_jobs[component_name] - assert profile_job.wait().success - profile_data = profile_job.download_profile() + assert profile_job is not None and profile_job.wait().success + profile_data: Dict[str, Any] = profile_job.download_profile() # type: ignore print_profile_metrics_from_job(profile_job, profile_data) return { diff --git a/qai_hub_models/models/stable_diffusion_quantized/info.yaml b/qai_hub_models/models/stable_diffusion_quantized/info.yaml index 86efe1b8..ceac7d79 100644 --- a/qai_hub_models/models/stable_diffusion_quantized/info.yaml +++ b/qai_hub_models/models/stable_diffusion_quantized/info.yaml @@ -14,6 +14,7 @@ tags: research_paper: https://arxiv.org/abs/2112.10752 research_paper_title: High-Resolution Image Synthesis with Latent Diffusion Models license: https://github.com/CompVis/stable-diffusion/blob/main/LICENSE +deploy_license: https://github.com/CompVis/stable-diffusion/blob/main/LICENSE source_repo: https://github.com/CompVis/stable-diffusion/tree/main technical_details: Input: Text prompt to generate image @@ -34,4 +35,5 @@ form_factors: has_static_banner: yes has_animated_banner: yes license_type: creativeml-openrail-m +deploy_license_type: creativeml-openrail-m dataset: [] diff --git a/qai_hub_models/models/stable_diffusion_quantized/model.py b/qai_hub_models/models/stable_diffusion_quantized/model.py index 54d0144a..f9da4488 100644 --- a/qai_hub_models/models/stable_diffusion_quantized/model.py +++ b/qai_hub_models/models/stable_diffusion_quantized/model.py @@ -6,8 +6,9 @@ import os +from qai_hub_models.models.protocols import FromPrecompiledProtocol from qai_hub_models.utils.asset_loaders import CachedWebModelAsset -from qai_hub_models.utils.base_model import BasePrecompiledModel +from qai_hub_models.utils.base_model import 
BasePrecompiledModel, CollectionModel from qai_hub_models.utils.input_spec import InputSpec MODEL_ID = __name__.split(".")[-2] @@ -18,7 +19,7 @@ VAE_DECODER = os.path.join(QNN_SDK_PREFIX, "vae_decoder.serialized.bin") -class StableDiffusionQuantized: +class StableDiffusionQuantized(FromPrecompiledProtocol, CollectionModel): """ Stable Diffusion wrapper class consists of - Text Encoder @@ -51,9 +52,6 @@ class ClipVITTextEncoder(BasePrecompiledModel): and compiled into serialized binary for Qualcomm Snapdragon Gen2+. """ - def __init__(self, target_model_path) -> None: - self.target_model_path = target_model_path - @classmethod def from_precompiled(cls) -> "ClipVITTextEncoder": text_encoder_path = CachedWebModelAsset.from_asset_store( @@ -61,10 +59,8 @@ def from_precompiled(cls) -> "ClipVITTextEncoder": ).fetch() return ClipVITTextEncoder(text_encoder_path) - def get_target_model_path(self) -> str: - return self.target_model_path - - def get_input_spec(self) -> InputSpec: + @staticmethod + def get_input_spec() -> InputSpec: return {"input_1": ((1, 77), "int32")} @@ -76,9 +72,6 @@ class Unet(BasePrecompiledModel): and compiled into serialized binary for Qualcomm Snapdragon Gen2+. """ - def __init__(self, target_model_path) -> None: - self.target_model_path = target_model_path - @classmethod def from_precompiled(cls) -> "Unet": model_path = CachedWebModelAsset.from_asset_store( @@ -86,10 +79,8 @@ def from_precompiled(cls) -> "Unet": ).fetch() return Unet(model_path) - def get_target_model_path(self) -> str: - return self.target_model_path - - def get_input_spec(self) -> InputSpec: + @staticmethod + def get_input_spec() -> InputSpec: return { "input_1": ((1, 64, 64, 4), "float32"), "input_2": ((1, 1280), "float32"), @@ -105,9 +96,6 @@ class VAEDecoder(BasePrecompiledModel): and compiled into serialized binary for Qualcomm Snapdragon Gen2+. 
""" - def __init__(self, target_model_path) -> None: - self.target_model_path = target_model_path - @classmethod def from_precompiled(cls) -> "VAEDecoder": model_path = CachedWebModelAsset.from_asset_store( @@ -115,8 +103,6 @@ def from_precompiled(cls) -> "VAEDecoder": ).fetch() return VAEDecoder(model_path) - def get_target_model_path(self) -> str: - return self.target_model_path - - def get_input_spec(self) -> InputSpec: + @staticmethod + def get_input_spec() -> InputSpec: return {"input_1": ((1, 64, 64, 4), "float32")} diff --git a/qai_hub_models/models/stable_diffusion_quantized/requirements.txt b/qai_hub_models/models/stable_diffusion_quantized/requirements.txt index e21d8196..83aa3d48 100644 --- a/qai_hub_models/models/stable_diffusion_quantized/requirements.txt +++ b/qai_hub_models/models/stable_diffusion_quantized/requirements.txt @@ -1,2 +1,2 @@ -transformers==4.31.0 +transformers==4.27.4 diffusers[torch]==0.21.4 diff --git a/qai_hub_models/models/stable_diffusion_quantized/test.py b/qai_hub_models/models/stable_diffusion_quantized/test.py index b1c0b2b0..b0cc4bf5 100644 --- a/qai_hub_models/models/stable_diffusion_quantized/test.py +++ b/qai_hub_models/models/stable_diffusion_quantized/test.py @@ -8,6 +8,13 @@ from qai_hub_models.models.stable_diffusion_quantized.demo import main as demo_main from qai_hub_models.models.stable_diffusion_quantized.export import export_model +from qai_hub_models.models.stable_diffusion_quantized.model import ( + StableDiffusionQuantized, +) + + +def test_from_precompiled(): + StableDiffusionQuantized.from_precompiled() @pytest.mark.skip("#105 move slow_cloud and slow tests to nightly.") diff --git a/qai_hub_models/models/stylegan2/README.md b/qai_hub_models/models/stylegan2/README.md index 2068b45a..be4cac59 100644 --- a/qai_hub_models/models/stylegan2/README.md +++ b/qai_hub_models/models/stylegan2/README.md @@ -10,7 +10,7 @@ This is based on the implementation of StyleGAN2 found export suitable to run on Qualcomm® devices. More details on model performance accross various devices, can be found [here](https://aihub.qualcomm.com/models/stylegan2). -[Sign up](https://aihub.qualcomm.com/) for early access to run these models on +[Sign up](https://myaccount.qualcomm.com/signup) for early access to run these models on a hosted Qualcomm® device. @@ -30,7 +30,7 @@ python -m qai_hub_models.models.stylegan2.demo More details on the CLI tool can be found with the `--help` option. See [demo.py](demo.py) for sample usage of the model including pre/post processing scripts. Please refer to our [general instructions on using -models](../../#qai-hub-models) for more usage instructions. +models](../../../#getting-started) for more usage instructions. ## Export for on-device deployment @@ -46,7 +46,7 @@ script requires access to Deployment instructions for Qualcomm® AI Hub. ## License - The license for the original implementation of StyleGAN2 can be found [here](https://github.com/NVlabs/stylegan3/blob/main/LICENSE.txt). -- The license for the compiled assets for on-device deployment can be found [here](https://qaihub-public-assets.s3.us-west-2.amazonaws.com/qai-hub-models/Qualcomm+AI+Hub+Proprietary+License.pdf). 
+- The license for the compiled assets for on-device deployment can be found [here]({deploy_license_url}) ## References * [Analyzing and Improving the Image Quality of StyleGAN](http://arxiv.org/abs/1912.04958) diff --git a/qai_hub_models/models/stylegan2/conftest.py b/qai_hub_models/models/stylegan2/conftest.py new file mode 100644 index 00000000..d5441390 --- /dev/null +++ b/qai_hub_models/models/stylegan2/conftest.py @@ -0,0 +1,26 @@ +# --------------------------------------------------------------------- +# Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. +# SPDX-License-Identifier: BSD-3-Clause +# --------------------------------------------------------------------- +# THIS FILE WAS AUTO-GENERATED. DO NOT EDIT MANUALLY. + +from unittest.mock import patch + +import pytest + +from qai_hub_models.models.stylegan2 import Model +from qai_hub_models.utils.testing import skip_clone_repo_check + + +@pytest.fixture(autouse=True) +@skip_clone_repo_check +def mock_from_pretrained(): + """ + Model.from_pretrained() can be slow. Invoke it once and cache it so all invocations + across all tests return the cached instance of the model. + """ + mock = patch( + "qai_hub_models.models.stylegan2.Model.from_pretrained", + return_value=Model.from_pretrained(), + ) + mock.start() diff --git a/qai_hub_models/models/stylegan2/demo.py b/qai_hub_models/models/stylegan2/demo.py index e3d0f99e..dc28952f 100644 --- a/qai_hub_models/models/stylegan2/demo.py +++ b/qai_hub_models/models/stylegan2/demo.py @@ -5,7 +5,7 @@ import torch from qai_hub_models.models.stylegan2.app import StyleGAN2App -from qai_hub_models.models.stylegan2.model import StyleGAN2 +from qai_hub_models.models.stylegan2.model import MODEL_ID, StyleGAN2 from qai_hub_models.utils.args import ( demo_model_from_cli_args, get_model_cli_parser, @@ -46,7 +46,7 @@ def main(is_test: bool = False): # Create model and app model = model_from_cli_args(StyleGAN2, args) - inference_model = demo_model_from_cli_args(StyleGAN2, args) + inference_model = demo_model_from_cli_args(StyleGAN2, MODEL_ID, args) app = StyleGAN2App(inference_model, model.output_size, model.num_classes) # Verify model input args diff --git a/qai_hub_models/models/stylegan2/export.py b/qai_hub_models/models/stylegan2/export.py index 0ac5ef8c..520ed574 100644 --- a/qai_hub_models/models/stylegan2/export.py +++ b/qai_hub_models/models/stylegan2/export.py @@ -10,7 +10,7 @@ import os import warnings from pathlib import Path -from typing import List, Optional, Tuple +from typing import Any, Dict, List, Optional, Tuple, cast import qai_hub as hub import torch @@ -108,63 +108,72 @@ def export_model( ) # Trace the model - source_model = torch.jit.trace(model, make_torch_inputs(input_spec)) + source_model = torch.jit.trace(model.to("cpu"), make_torch_inputs(input_spec)) # 2. Compile the model to an on-device asset model_compile_options = model.get_hub_compile_options( target_runtime, compile_options + " --force_channel_last_output output_0" ) - print(f"Optimizing model {model_name} to run on-device.") - compile_job = hub.submit_compile_job( + print(f"Optimizing model {model_name} to run on-device") + submitted_compile_job = hub.submit_compile_job( model=source_model, input_specs=input_spec, device=hub.Device(device), name=model_name, options=model_compile_options, ) + compile_job = cast(hub.client.CompileJob, submitted_compile_job) # 3. 
Profile the model asset on real devices - profile_job = None + profile_job: Optional[hub.client.ProfileJob] = None if not skip_profiling: + profile_options_all = model.get_hub_profile_options( + target_runtime, profile_options + ) print(f"Profiling model {model_name} on a hosted device.") - profile_job = hub.submit_profile_job( + submitted_profile_job = hub.submit_profile_job( model=compile_job.get_target_model(), device=hub.Device(device), name=model_name, - options=profile_options + " --compute_unit gpu", + options=profile_options_all, ) + profile_job = cast(hub.client.ProfileJob, submitted_profile_job) # 4. Run inference on-device with sample inputs - inference_job = None + inference_job: Optional[hub.client.InferenceJob] = None if not skip_inferencing: + profile_options_all = model.get_hub_profile_options( + target_runtime, profile_options + ) print( f"Running inference for {model_name} on a hosted device with example inputs." ) sample_inputs = model.sample_inputs(input_spec) - inference_job = hub.submit_inference_job( + submitted_inference_job = hub.submit_inference_job( model=compile_job.get_target_model(), inputs=sample_inputs, device=hub.Device(device), name=model_name, - options=profile_options + " --compute_unit gpu", + options=profile_options_all, ) + inference_job = cast(hub.client.InferenceJob, submitted_inference_job) # 5. Download the model asset to a local file if not skip_downloading: os.makedirs(output_path, exist_ok=True) - target_model = compile_job.get_target_model() + target_model: hub.Model = compile_job.get_target_model() # type: ignore target_model.download(str(output_path / f"{model_name}.tflite")) # 6. Summarize the results from profiling and inference if not skip_summary and not skip_profiling: - assert profile_job.wait().success - profile_data = profile_job.download_profile() + assert profile_job is not None and profile_job.wait().success + profile_data: Dict[str, Any] = profile_job.download_profile() # type: ignore print_profile_metrics_from_job(profile_job, profile_data) if not skip_summary and not skip_inferencing: torch_out = torch_inference(model, sample_inputs) - assert inference_job.wait().success - inference_result = inference_job.download_output_data() + assert inference_job is not None and inference_job.wait().success + inference_result: hub.client.DatasetEntries = inference_job.download_output_data() # type: ignore # Convert outputs from channel last to channel first inference_result = transpose_channel_last_to_first( "output_0", inference_result, target_runtime diff --git a/qai_hub_models/models/stylegan2/info.yaml b/qai_hub_models/models/stylegan2/info.yaml index 4e624753..d063ff28 100644 --- a/qai_hub_models/models/stylegan2/info.yaml +++ b/qai_hub_models/models/stylegan2/info.yaml @@ -13,6 +13,7 @@ tags: research_paper: http://arxiv.org/abs/1912.04958 research_paper_title: Analyzing and Improving the Image Quality of StyleGAN license: https://github.com/NVlabs/stylegan3/blob/main/LICENSE.txt +deploy_license: https://qaihub-public-assets.s3.us-west-2.amazonaws.com/qai-hub-models/Qualcomm+AI+Hub+Proprietary+License.pdf source_repo: https://github.com/NVlabs/stylegan3 technical_details: Model checkpoint: StyleGAN2 (afhqcat dataset) @@ -29,4 +30,5 @@ form_factors: has_static_banner: yes has_animated_banner: yes license_type: other +deploy_license_type: AI Model Hub License dataset: [] diff --git a/qai_hub_models/models/stylegan2/model.py b/qai_hub_models/models/stylegan2/model.py index eddb21cd..45071a48 100644 --- 
a/qai_hub_models/models/stylegan2/model.py +++ b/qai_hub_models/models/stylegan2/model.py @@ -10,7 +10,7 @@ import torch from qai_hub_models.utils.asset_loaders import SourceAsRoot -from qai_hub_models.utils.base_model import BaseModel +from qai_hub_models.utils.base_model import BaseModel, TargetRuntime from qai_hub_models.utils.input_spec import InputSpec STYLEGAN2_SOURCE_REPOSITORY = "https://github.com/NVlabs/stylegan3" @@ -82,21 +82,29 @@ def forward(self, image_noise: torch.Tensor, classes: torch.Tensor | None = None force_fp32=True, ) - def get_input_spec(self, batch_size: int = 1) -> InputSpec: + @staticmethod + def get_input_spec( + output_size: int, num_classes: int, batch_size: int = 1 + ) -> InputSpec: """ Returns the input specification (name -> (shape, type). This can be used to submit a profiling job on Qualcomm AI Hub. """ - inputs = {"image_noise": ((batch_size, self.output_size), "float32")} - if self.num_classes != 0: - inputs["classes"] = ((batch_size, self.num_classes), "float32") - return inputs # type: ignore + inputs = {"image_noise": ((batch_size, output_size), "float32")} + if num_classes != 0: + inputs["classes"] = ((batch_size, num_classes), "float32") + return inputs + + def _get_input_spec_for_model_instance(self, batch_size: int = 1) -> InputSpec: + return self.__class__.get_input_spec( + self.output_size, self.num_classes, batch_size + ) def sample_inputs( self, input_spec: InputSpec | None = None, seed=None ) -> Dict[str, List[np.ndarray]]: if not input_spec: - input_spec = self.get_input_spec() + input_spec = self._get_input_spec_for_model_instance() inputs = { "image_noise": [ @@ -113,6 +121,22 @@ def sample_inputs( return inputs + def get_hub_compile_options( + self, target_runtime: TargetRuntime, other_compile_options: str = "" + ) -> str: + compile_options = super().get_hub_compile_options( + target_runtime, other_compile_options + ) + return compile_options + " --compute_unit gpu" + + def get_hub_profile_options( + self, target_runtime: TargetRuntime, other_profile_options: str = "" + ) -> str: + profile_options = super().get_hub_profile_options( + target_runtime, other_profile_options + ) + return profile_options + " --compute_unit gpu" + def _get_qaihm_upfirdn2d_ref(misc: Any, conv2d_gradfix: Callable, upfirdn2d: Any): """ diff --git a/qai_hub_models/models/stylegan2/perf.yaml b/qai_hub_models/models/stylegan2/perf.yaml index c762bf13..856842a6 100644 --- a/qai_hub_models/models/stylegan2/perf.yaml +++ b/qai_hub_models/models/stylegan2/perf.yaml @@ -17,31 +17,42 @@ aggregated: - Samsung Galaxy S23 - Samsung Galaxy S23 Ultra - Samsung Galaxy S23+ + - Samsung Galaxy S24 + - Samsung Galaxy S24 Ultra - Samsung Galaxy Tab S8 - Xiaomi 12 - Xiaomi 12 Pro supported_chipsets: - Snapdragon® 8 Gen 1 - Snapdragon® 8 Gen 2 + - Snapdragon® 8 Gen 3 - Snapdragon® 888 models: - name: StyleGAN2 performance_metrics: - torchscript_onnx_tflite: - inference_time: 1280066.0 - throughput: 0.7812097188738706 + inference_time: 1218362.0 + throughput: 0.8207741213202644 estimated_peak_memory_range: - min: 1790029824 - max: 2607953504 + min: 1358295040 + max: 1361471248 primary_compute_unit: CPU precision: fp32 layer_info: layers_on_npu: 0 layers_on_gpu: 89 - layers_on_cpu: 462 - total_layers: 551 - job_id: jz57elvqp + layers_on_cpu: 492 + total_layers: 581 + job_id: jlpe988gr job_status: Passed + reference_device_info: + name: Samsung Galaxy S23 + os: '13' + form_factor: Phone + os_name: Android + manufacturer: Samsung + chipset: Snapdragon® 8 Gen 2 + timestamp: 
'2024-03-15T00:03:19.171321Z' torchscript_onnx_qnn: inference_time: 'null' throughput: 'null' @@ -57,11 +68,41 @@ models: total_layers: 0 job_id: '' job_status: Skipped + - torchscript_onnx_tflite: + inference_time: 980347.0 + throughput: 1.0200469833640537 + estimated_peak_memory_range: + min: 1110478848 + max: 1142166720 + primary_compute_unit: CPU + precision: fp32 + layer_info: + layers_on_npu: 0 + layers_on_gpu: 89 + layers_on_cpu: 492 + total_layers: 581 + job_id: jz5wo84p1 + job_status: Passed reference_device_info: - name: Samsung Galaxy S23 Ultra - os: '13' + name: Samsung Galaxy S24 + os: '14' form_factor: Phone os_name: Android manufacturer: Samsung - chipset: Snapdragon® 8 Gen 2 - timestamp: '2024-02-21T16:31:56.125164Z' + chipset: Snapdragon® 8 Gen 3 + timestamp: '2024-03-15T00:03:19.171331Z' + torchscript_onnx_qnn: + inference_time: 'null' + throughput: 'null' + estimated_peak_memory_range: + min: 0 + max: 0 + primary_compute_unit: 'null' + precision: 'null' + layer_info: + layers_on_npu: 0 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 0 + job_id: '' + job_status: Skipped diff --git a/qai_hub_models/models/stylegan2/requirements.txt b/qai_hub_models/models/stylegan2/requirements.txt index b8261cd7..7317e178 100644 --- a/qai_hub_models/models/stylegan2/requirements.txt +++ b/qai_hub_models/models/stylegan2/requirements.txt @@ -1 +1 @@ -click>=8.0 +click==8.0 diff --git a/qai_hub_models/models/stylegan2/test.py b/qai_hub_models/models/stylegan2/test.py index df1f75c6..a55dbe26 100644 --- a/qai_hub_models/models/stylegan2/test.py +++ b/qai_hub_models/models/stylegan2/test.py @@ -3,6 +3,7 @@ # SPDX-License-Identifier: BSD-3-Clause # --------------------------------------------------------------------- import numpy as np +import pytest import torch from qai_hub_models.models.stylegan2.app import StyleGAN2App @@ -52,6 +53,7 @@ def test_stylegan2_app(): assert len(output_images) == 2 +@pytest.mark.trace @skip_clone_repo_check def test_stylegan2_trace(): app = StyleGAN2App(StyleGAN2.from_pretrained().convert_to_torchscript()) diff --git a/qai_hub_models/models/swin_base/README.md b/qai_hub_models/models/swin_base/README.md index d5663088..cf886066 100644 --- a/qai_hub_models/models/swin_base/README.md +++ b/qai_hub_models/models/swin_base/README.md @@ -10,7 +10,7 @@ This is based on the implementation of Swin-Base found export suitable to run on Qualcomm® devices. More details on model performance accross various devices, can be found [here](https://aihub.qualcomm.com/models/swin_base). -[Sign up](https://aihub.qualcomm.com/) for early access to run these models on +[Sign up](https://myaccount.qualcomm.com/signup) for early access to run these models on a hosted Qualcomm® device. @@ -25,7 +25,7 @@ python -m qai_hub_models.models.swin_base.demo More details on the CLI tool can be found with the `--help` option. See [demo.py](demo.py) for sample usage of the model including pre/post processing scripts. Please refer to our [general instructions on using -models](../../#qai-hub-models) for more usage instructions. +models](../../../#getting-started) for more usage instructions. ## Export for on-device deployment @@ -41,7 +41,7 @@ script requires access to Deployment instructions for Qualcomm® AI Hub. ## License - The license for the original implementation of Swin-Base can be found [here](https://github.com/pytorch/vision/blob/main/LICENSE). 
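The `@pytest.mark.trace` decorators added to the trace tests in this patch make the slow TorchScript-trace tests filterable with pytest's marker expressions. A small sketch of how that is used follows; the marker registration is an assumption about the repository's pytest configuration (e.g. `pytest.ini` or `pyproject.toml`) and is not shown in this patch.

```python
import pytest


@pytest.mark.trace  # marks a slow tracing test so it can be selected or skipped
def test_trace_example():
    assert True  # placeholder body; real trace tests run convert_to_torchscript checks


# Typical invocations, assuming the marker is registered in the pytest config:
#   pytest -m trace          # run only the trace-marked tests
#   pytest -m "not trace"    # skip them for a quick test pass
```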
-- The license for the compiled assets for on-device deployment can be found [here](https://qaihub-public-assets.s3.us-west-2.amazonaws.com/qai-hub-models/Qualcomm+AI+Hub+Proprietary+License.pdf). +- The license for the compiled assets for on-device deployment can be found [here]({deploy_license_url}) ## References * [Swin Transformer: Hierarchical Vision Transformer using Shifted Windows](https://arxiv.org/abs/2103.14030) diff --git a/qai_hub_models/models/swin_base/conftest.py b/qai_hub_models/models/swin_base/conftest.py new file mode 100644 index 00000000..d866a771 --- /dev/null +++ b/qai_hub_models/models/swin_base/conftest.py @@ -0,0 +1,24 @@ +# --------------------------------------------------------------------- +# Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. +# SPDX-License-Identifier: BSD-3-Clause +# --------------------------------------------------------------------- +# THIS FILE WAS AUTO-GENERATED. DO NOT EDIT MANUALLY. + +from unittest.mock import patch + +import pytest + +from qai_hub_models.models.swin_base import Model + + +@pytest.fixture(autouse=True) +def mock_from_pretrained(): + """ + Model.from_pretrained() can be slow. Invoke it once and cache it so all invocations + across all tests return the cached instance of the model. + """ + mock = patch( + "qai_hub_models.models.swin_base.Model.from_pretrained", + return_value=Model.from_pretrained(), + ) + mock.start() diff --git a/qai_hub_models/models/swin_base/demo.py b/qai_hub_models/models/swin_base/demo.py index bf03b593..f9d17b55 100644 --- a/qai_hub_models/models/swin_base/demo.py +++ b/qai_hub_models/models/swin_base/demo.py @@ -3,11 +3,11 @@ # SPDX-License-Identifier: BSD-3-Clause # --------------------------------------------------------------------- from qai_hub_models.models._shared.imagenet_classifier.demo import imagenet_demo -from qai_hub_models.models.swin_base.model import SwinBase +from qai_hub_models.models.swin_base.model import MODEL_ID, SwinBase def main(is_test: bool = False): - imagenet_demo(SwinBase, is_test) + imagenet_demo(SwinBase, MODEL_ID, is_test) if __name__ == "__main__": diff --git a/qai_hub_models/models/swin_base/export.py b/qai_hub_models/models/swin_base/export.py index 50aa82df..b9e657bd 100644 --- a/qai_hub_models/models/swin_base/export.py +++ b/qai_hub_models/models/swin_base/export.py @@ -10,7 +10,7 @@ import os import warnings from pathlib import Path -from typing import List, Optional, Tuple +from typing import Any, Dict, List, Optional, Tuple, cast import qai_hub as hub import torch @@ -109,35 +109,43 @@ def export_model( ) # Trace the model - source_model = torch.jit.trace(model, make_torch_inputs(input_spec)) + source_model = torch.jit.trace(model.to("cpu"), make_torch_inputs(input_spec)) # 2. Compile the model to an on-device asset model_compile_options = model.get_hub_compile_options( target_runtime, compile_options + " --force_channel_last_input image_tensor" ) - print(f"Optimizing model {model_name} to run on-device.") - compile_job = hub.submit_compile_job( + print(f"Optimizing model {model_name} to run on-device") + submitted_compile_job = hub.submit_compile_job( model=source_model, input_specs=input_spec, device=hub.Device(device), name=model_name, options=model_compile_options, ) + compile_job = cast(hub.client.CompileJob, submitted_compile_job) # 3. 
Profile the model asset on real devices - profile_job = None + profile_job: Optional[hub.client.ProfileJob] = None if not skip_profiling: + profile_options_all = model.get_hub_profile_options( + target_runtime, profile_options + ) print(f"Profiling model {model_name} on a hosted device.") - profile_job = hub.submit_profile_job( + submitted_profile_job = hub.submit_profile_job( model=compile_job.get_target_model(), device=hub.Device(device), name=model_name, - options=profile_options + " --compute_unit gpu", + options=profile_options_all, ) + profile_job = cast(hub.client.ProfileJob, submitted_profile_job) # 4. Run inference on-device with sample inputs - inference_job = None + inference_job: Optional[hub.client.InferenceJob] = None if not skip_inferencing: + profile_options_all = model.get_hub_profile_options( + target_runtime, profile_options + ) print( f"Running inference for {model_name} on a hosted device with example inputs." ) @@ -146,33 +154,35 @@ def export_model( hub_inputs = transpose_channel_first_to_last( "image_tensor", sample_inputs, target_runtime ) - inference_job = hub.submit_inference_job( + submitted_inference_job = hub.submit_inference_job( model=compile_job.get_target_model(), inputs=hub_inputs, device=hub.Device(device), name=model_name, - options=profile_options + " --compute_unit gpu", + options=profile_options_all, ) + inference_job = cast(hub.client.InferenceJob, submitted_inference_job) # 5. Download the model asset to a local file if not skip_downloading: os.makedirs(output_path, exist_ok=True) - target_model = compile_job.get_target_model() + target_model: hub.Model = compile_job.get_target_model() # type: ignore target_model.download(str(output_path / f"{model_name}.tflite")) # 6. Summarize the results from profiling and inference if not skip_summary and not skip_profiling: - assert profile_job.wait().success - profile_data = profile_job.download_profile() + assert profile_job is not None and profile_job.wait().success + profile_data: Dict[str, Any] = profile_job.download_profile() # type: ignore print_profile_metrics_from_job(profile_job, profile_data) if not skip_summary and not skip_inferencing: torch_out = torch_inference(model, sample_inputs) - assert inference_job.wait().success - inference_result = inference_job.download_output_data() + assert inference_job is not None and inference_job.wait().success + inference_result: hub.client.DatasetEntries = inference_job.download_output_data() # type: ignore print_inference_metrics(inference_job, inference_result, torch_out) - print_on_target_demo_cmd(compile_job, Path(__file__).parent.resolve(), device) + if not skip_summary: + print_on_target_demo_cmd(compile_job, Path(__file__).parent.resolve(), device) return (compile_job, profile_job, inference_job) diff --git a/qai_hub_models/models/swin_base/info.yaml b/qai_hub_models/models/swin_base/info.yaml index 15d83d1c..00a55170 100644 --- a/qai_hub_models/models/swin_base/info.yaml +++ b/qai_hub_models/models/swin_base/info.yaml @@ -14,6 +14,7 @@ research_paper: https://arxiv.org/abs/2103.14030 research_paper_title: 'Swin Transformer: Hierarchical Vision Transformer using Shifted Windows' license: https://github.com/pytorch/vision/blob/main/LICENSE +deploy_license: https://qaihub-public-assets.s3.us-west-2.amazonaws.com/qai-hub-models/Qualcomm+AI+Hub+Proprietary+License.pdf source_repo: https://github.com/pytorch/vision/blob/main/torchvision/models/swin_transformer.py technical_details: @@ -39,6 +40,7 @@ form_factors: has_static_banner: yes has_animated_banner: 
yes license_type: bsd-3-clause +deploy_license_type: AI Model Hub License dataset: - imagenet-1k - imagenet-22k diff --git a/qai_hub_models/models/swin_base/perf.yaml b/qai_hub_models/models/swin_base/perf.yaml index c27cc0d3..d28fc1c7 100644 --- a/qai_hub_models/models/swin_base/perf.yaml +++ b/qai_hub_models/models/swin_base/perf.yaml @@ -17,31 +17,42 @@ aggregated: - Samsung Galaxy S23 - Samsung Galaxy S23 Ultra - Samsung Galaxy S23+ + - Samsung Galaxy S24 + - Samsung Galaxy S24 Ultra - Samsung Galaxy Tab S8 - Xiaomi 12 - Xiaomi 12 Pro supported_chipsets: - Snapdragon® 8 Gen 1 - Snapdragon® 8 Gen 2 + - Snapdragon® 8 Gen 3 - Snapdragon® 888 models: - name: Swin-Base performance_metrics: - torchscript_onnx_tflite: - inference_time: 76852.0 - throughput: 13.012023109353041 + inference_time: 66948.0 + throughput: 14.936966003465376 estimated_peak_memory_range: - min: 12288 - max: 367871696 - primary_compute_unit: GPU + min: 28672 + max: 6112608 + primary_compute_unit: NPU precision: fp16 layer_info: - layers_on_npu: 0 - layers_on_gpu: 2006 + layers_on_npu: 1614 + layers_on_gpu: 0 layers_on_cpu: 0 - total_layers: 2006 - job_id: jw568zrvg + total_layers: 1614 + job_id: jogkzm2gd job_status: Passed + reference_device_info: + name: Samsung Galaxy S23 + os: '13' + form_factor: Phone + os_name: Android + manufacturer: Samsung + chipset: Snapdragon® 8 Gen 2 + timestamp: '2024-03-14T23:23:06.160602Z' torchscript_onnx_qnn: inference_time: 'null' throughput: 'null' @@ -57,11 +68,41 @@ models: total_layers: 0 job_id: '' job_status: Skipped + - torchscript_onnx_tflite: + inference_time: 43458.0 + throughput: 23.010722996916563 + estimated_peak_memory_range: + min: 69632 + max: 472671520 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 1614 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 1614 + job_id: jn5q8o457 + job_status: Passed reference_device_info: - name: Samsung Galaxy S23 Ultra - os: '13' + name: Samsung Galaxy S24 + os: '14' form_factor: Phone os_name: Android manufacturer: Samsung - chipset: Snapdragon® 8 Gen 2 - timestamp: '2024-02-21T16:09:41.513292Z' + chipset: Snapdragon® 8 Gen 3 + timestamp: '2024-03-14T23:23:06.160610Z' + torchscript_onnx_qnn: + inference_time: 'null' + throughput: 'null' + estimated_peak_memory_range: + min: 0 + max: 0 + primary_compute_unit: 'null' + precision: 'null' + layer_info: + layers_on_npu: 0 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 0 + job_id: '' + job_status: Skipped diff --git a/qai_hub_models/models/swin_base/test.py b/qai_hub_models/models/swin_base/test.py index 3f302ff1..dce247a4 100644 --- a/qai_hub_models/models/swin_base/test.py +++ b/qai_hub_models/models/swin_base/test.py @@ -13,7 +13,7 @@ from qai_hub_models.models.swin_base.model import MODEL_ID, SwinBase -def test_task(imagenet_sample_torch): +def test_numerical(imagenet_sample_torch): # Ensure that the optimized SwinBase matches the original one numerically x = imagenet_sample_torch model_opt = SwinBase.from_pretrained().eval() diff --git a/qai_hub_models/models/swin_small/README.md b/qai_hub_models/models/swin_small/README.md index 856bedb6..f323fa2a 100644 --- a/qai_hub_models/models/swin_small/README.md +++ b/qai_hub_models/models/swin_small/README.md @@ -10,7 +10,7 @@ This is based on the implementation of Swin-Small found export suitable to run on Qualcomm® devices. More details on model performance accross various devices, can be found [here](https://aihub.qualcomm.com/models/swin_small). 
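The renamed `test_numerical` above compares the optimized SwinBase against the original implementation on the same input. Only the start of that test is visible in the hunk, so the following is a generic sketch of such an equivalence check rather than the actual test body; the helper name and tolerance are illustrative.

```python
import torch


def assert_models_numerically_close(model_opt, model_ref, x, atol=1e-4):
    """Run both models on the same input and require element-wise agreement."""
    with torch.no_grad():
        y_opt = model_opt(x)
        y_ref = model_ref(x)
    assert torch.allclose(y_opt, y_ref, atol=atol), "optimized output drifted from reference"
```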
-[Sign up](https://aihub.qualcomm.com/) for early access to run these models on +[Sign up](https://myaccount.qualcomm.com/signup) for early access to run these models on a hosted Qualcomm® device. @@ -25,7 +25,7 @@ python -m qai_hub_models.models.swin_small.demo More details on the CLI tool can be found with the `--help` option. See [demo.py](demo.py) for sample usage of the model including pre/post processing scripts. Please refer to our [general instructions on using -models](../../#qai-hub-models) for more usage instructions. +models](../../../#getting-started) for more usage instructions. ## Export for on-device deployment @@ -41,7 +41,7 @@ script requires access to Deployment instructions for Qualcomm® AI Hub. ## License - The license for the original implementation of Swin-Small can be found [here](https://github.com/pytorch/vision/blob/main/LICENSE). -- The license for the compiled assets for on-device deployment can be found [here](https://qaihub-public-assets.s3.us-west-2.amazonaws.com/qai-hub-models/Qualcomm+AI+Hub+Proprietary+License.pdf). +- The license for the compiled assets for on-device deployment can be found [here]({deploy_license_url}) ## References * [Swin Transformer: Hierarchical Vision Transformer using Shifted Windows](https://arxiv.org/abs/2103.14030) diff --git a/qai_hub_models/models/swin_small/conftest.py b/qai_hub_models/models/swin_small/conftest.py new file mode 100644 index 00000000..73bf1779 --- /dev/null +++ b/qai_hub_models/models/swin_small/conftest.py @@ -0,0 +1,24 @@ +# --------------------------------------------------------------------- +# Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. +# SPDX-License-Identifier: BSD-3-Clause +# --------------------------------------------------------------------- +# THIS FILE WAS AUTO-GENERATED. DO NOT EDIT MANUALLY. + +from unittest.mock import patch + +import pytest + +from qai_hub_models.models.swin_small import Model + + +@pytest.fixture(autouse=True) +def mock_from_pretrained(): + """ + Model.from_pretrained() can be slow. Invoke it once and cache it so all invocations + across all tests return the cached instance of the model. 
+ """ + mock = patch( + "qai_hub_models.models.swin_small.Model.from_pretrained", + return_value=Model.from_pretrained(), + ) + mock.start() diff --git a/qai_hub_models/models/swin_small/demo.py b/qai_hub_models/models/swin_small/demo.py index dd78cca1..ec65fdc7 100644 --- a/qai_hub_models/models/swin_small/demo.py +++ b/qai_hub_models/models/swin_small/demo.py @@ -3,11 +3,11 @@ # SPDX-License-Identifier: BSD-3-Clause # --------------------------------------------------------------------- from qai_hub_models.models._shared.imagenet_classifier.demo import imagenet_demo -from qai_hub_models.models.swin_small.model import SwinSmall +from qai_hub_models.models.swin_small.model import MODEL_ID, SwinSmall def main(is_test: bool = False): - imagenet_demo(SwinSmall, is_test) + imagenet_demo(SwinSmall, MODEL_ID, is_test) if __name__ == "__main__": diff --git a/qai_hub_models/models/swin_small/export.py b/qai_hub_models/models/swin_small/export.py index 9e7505bb..ed29fb83 100644 --- a/qai_hub_models/models/swin_small/export.py +++ b/qai_hub_models/models/swin_small/export.py @@ -10,7 +10,7 @@ import os import warnings from pathlib import Path -from typing import List, Optional, Tuple +from typing import Any, Dict, List, Optional, Tuple, cast import qai_hub as hub import torch @@ -109,35 +109,43 @@ def export_model( ) # Trace the model - source_model = torch.jit.trace(model, make_torch_inputs(input_spec)) + source_model = torch.jit.trace(model.to("cpu"), make_torch_inputs(input_spec)) # 2. Compile the model to an on-device asset model_compile_options = model.get_hub_compile_options( target_runtime, compile_options + " --force_channel_last_input image_tensor" ) - print(f"Optimizing model {model_name} to run on-device.") - compile_job = hub.submit_compile_job( + print(f"Optimizing model {model_name} to run on-device") + submitted_compile_job = hub.submit_compile_job( model=source_model, input_specs=input_spec, device=hub.Device(device), name=model_name, options=model_compile_options, ) + compile_job = cast(hub.client.CompileJob, submitted_compile_job) # 3. Profile the model asset on real devices - profile_job = None + profile_job: Optional[hub.client.ProfileJob] = None if not skip_profiling: + profile_options_all = model.get_hub_profile_options( + target_runtime, profile_options + ) print(f"Profiling model {model_name} on a hosted device.") - profile_job = hub.submit_profile_job( + submitted_profile_job = hub.submit_profile_job( model=compile_job.get_target_model(), device=hub.Device(device), name=model_name, - options=profile_options + " --compute_unit gpu", + options=profile_options_all, ) + profile_job = cast(hub.client.ProfileJob, submitted_profile_job) # 4. Run inference on-device with sample inputs - inference_job = None + inference_job: Optional[hub.client.InferenceJob] = None if not skip_inferencing: + profile_options_all = model.get_hub_profile_options( + target_runtime, profile_options + ) print( f"Running inference for {model_name} on a hosted device with example inputs." ) @@ -146,33 +154,35 @@ def export_model( hub_inputs = transpose_channel_first_to_last( "image_tensor", sample_inputs, target_runtime ) - inference_job = hub.submit_inference_job( + submitted_inference_job = hub.submit_inference_job( model=compile_job.get_target_model(), inputs=hub_inputs, device=hub.Device(device), name=model_name, - options=profile_options + " --compute_unit gpu", + options=profile_options_all, ) + inference_job = cast(hub.client.InferenceJob, submitted_inference_job) # 5. 
Download the model asset to a local file if not skip_downloading: os.makedirs(output_path, exist_ok=True) - target_model = compile_job.get_target_model() + target_model: hub.Model = compile_job.get_target_model() # type: ignore target_model.download(str(output_path / f"{model_name}.tflite")) # 6. Summarize the results from profiling and inference if not skip_summary and not skip_profiling: - assert profile_job.wait().success - profile_data = profile_job.download_profile() + assert profile_job is not None and profile_job.wait().success + profile_data: Dict[str, Any] = profile_job.download_profile() # type: ignore print_profile_metrics_from_job(profile_job, profile_data) if not skip_summary and not skip_inferencing: torch_out = torch_inference(model, sample_inputs) - assert inference_job.wait().success - inference_result = inference_job.download_output_data() + assert inference_job is not None and inference_job.wait().success + inference_result: hub.client.DatasetEntries = inference_job.download_output_data() # type: ignore print_inference_metrics(inference_job, inference_result, torch_out) - print_on_target_demo_cmd(compile_job, Path(__file__).parent.resolve(), device) + if not skip_summary: + print_on_target_demo_cmd(compile_job, Path(__file__).parent.resolve(), device) return (compile_job, profile_job, inference_job) diff --git a/qai_hub_models/models/swin_small/info.yaml b/qai_hub_models/models/swin_small/info.yaml index b783fb0c..ac042fe2 100644 --- a/qai_hub_models/models/swin_small/info.yaml +++ b/qai_hub_models/models/swin_small/info.yaml @@ -14,6 +14,7 @@ research_paper: https://arxiv.org/abs/2103.14030 research_paper_title: 'Swin Transformer: Hierarchical Vision Transformer using Shifted Windows' license: https://github.com/pytorch/vision/blob/main/LICENSE +deploy_license: https://qaihub-public-assets.s3.us-west-2.amazonaws.com/qai-hub-models/Qualcomm+AI+Hub+Proprietary+License.pdf source_repo: https://github.com/pytorch/vision/blob/main/torchvision/models/swin_transformer.py technical_details: @@ -38,6 +39,7 @@ form_factors: has_static_banner: yes has_animated_banner: yes license_type: bsd-3-clause +deploy_license_type: AI Model Hub License dataset: - imagenet-1k - imagenet-22k diff --git a/qai_hub_models/models/swin_small/perf.yaml b/qai_hub_models/models/swin_small/perf.yaml index d481ee20..2e615a93 100644 --- a/qai_hub_models/models/swin_small/perf.yaml +++ b/qai_hub_models/models/swin_small/perf.yaml @@ -17,31 +17,42 @@ aggregated: - Samsung Galaxy S23 - Samsung Galaxy S23 Ultra - Samsung Galaxy S23+ + - Samsung Galaxy S24 + - Samsung Galaxy S24 Ultra - Samsung Galaxy Tab S8 - Xiaomi 12 - Xiaomi 12 Pro supported_chipsets: - Snapdragon® 8 Gen 1 - Snapdragon® 8 Gen 2 + - Snapdragon® 8 Gen 3 - Snapdragon® 888 models: - name: Swin-Small performance_metrics: - torchscript_onnx_tflite: - inference_time: 52492.0 - throughput: 19.05052198430237 + inference_time: 50143.0 + throughput: 19.94296312546118 estimated_peak_memory_range: - min: 12288 - max: 222000632 - primary_compute_unit: GPU + min: 90112 + max: 3612056 + primary_compute_unit: NPU precision: fp16 layer_info: - layers_on_npu: 0 - layers_on_gpu: 1965 + layers_on_npu: 1609 + layers_on_gpu: 0 layers_on_cpu: 0 - total_layers: 1965 - job_id: jlpe7wl05 + total_layers: 1609 + job_id: jo5mr9ygk job_status: Passed + reference_device_info: + name: Samsung Galaxy S23 + os: '13' + form_factor: Phone + os_name: Android + manufacturer: Samsung + chipset: Snapdragon® 8 Gen 2 + timestamp: '2024-03-14T23:53:07.887698Z' torchscript_onnx_qnn: 
inference_time: 'null' throughput: 'null' @@ -57,11 +68,41 @@ models: total_layers: 0 job_id: '' job_status: Skipped + - torchscript_onnx_tflite: + inference_time: 33054.0 + throughput: 30.2535245356084 + estimated_peak_memory_range: + min: 45056 + max: 454274336 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 1609 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 1609 + job_id: jegn2qvgo + job_status: Passed reference_device_info: - name: Samsung Galaxy S23 Ultra - os: '13' + name: Samsung Galaxy S24 + os: '14' form_factor: Phone os_name: Android manufacturer: Samsung - chipset: Snapdragon® 8 Gen 2 - timestamp: '2024-02-21T16:30:42.368348Z' + chipset: Snapdragon® 8 Gen 3 + timestamp: '2024-03-14T23:53:07.887705Z' + torchscript_onnx_qnn: + inference_time: 'null' + throughput: 'null' + estimated_peak_memory_range: + min: 0 + max: 0 + primary_compute_unit: 'null' + precision: 'null' + layer_info: + layers_on_npu: 0 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 0 + job_id: '' + job_status: Skipped diff --git a/qai_hub_models/models/swin_small/test.py b/qai_hub_models/models/swin_small/test.py index 3ce1e0ea..8bcacc9e 100644 --- a/qai_hub_models/models/swin_small/test.py +++ b/qai_hub_models/models/swin_small/test.py @@ -13,7 +13,7 @@ from qai_hub_models.models.swin_small.model import MODEL_ID, SwinSmall -def test_task(imagenet_sample_torch): +def test_numerical(imagenet_sample_torch): # Ensure that the optimized SwinSmall matches the original one numerically x = imagenet_sample_torch model_opt = SwinSmall.from_pretrained().eval() diff --git a/qai_hub_models/models/swin_tiny/README.md b/qai_hub_models/models/swin_tiny/README.md index 19c7a416..0c8a8564 100644 --- a/qai_hub_models/models/swin_tiny/README.md +++ b/qai_hub_models/models/swin_tiny/README.md @@ -10,7 +10,7 @@ This is based on the implementation of Swin-Tiny found export suitable to run on Qualcomm® devices. More details on model performance accross various devices, can be found [here](https://aihub.qualcomm.com/models/swin_tiny). -[Sign up](https://aihub.qualcomm.com/) for early access to run these models on +[Sign up](https://myaccount.qualcomm.com/signup) for early access to run these models on a hosted Qualcomm® device. @@ -25,7 +25,7 @@ python -m qai_hub_models.models.swin_tiny.demo More details on the CLI tool can be found with the `--help` option. See [demo.py](demo.py) for sample usage of the model including pre/post processing scripts. Please refer to our [general instructions on using -models](../../#qai-hub-models) for more usage instructions. +models](../../../#getting-started) for more usage instructions. ## Export for on-device deployment @@ -41,7 +41,7 @@ script requires access to Deployment instructions for Qualcomm® AI Hub. ## License - The license for the original implementation of Swin-Tiny can be found [here](https://github.com/pytorch/vision/blob/main/LICENSE). -- The license for the compiled assets for on-device deployment can be found [here](https://qaihub-public-assets.s3.us-west-2.amazonaws.com/qai-hub-models/Qualcomm+AI+Hub+Proprietary+License.pdf). 
+- The license for the compiled assets for on-device deployment can be found [here]({deploy_license_url}) ## References * [Swin Transformer: Hierarchical Vision Transformer using Shifted Windows](https://arxiv.org/abs/2103.14030) diff --git a/qai_hub_models/models/swin_tiny/conftest.py b/qai_hub_models/models/swin_tiny/conftest.py new file mode 100644 index 00000000..08d176fe --- /dev/null +++ b/qai_hub_models/models/swin_tiny/conftest.py @@ -0,0 +1,24 @@ +# --------------------------------------------------------------------- +# Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. +# SPDX-License-Identifier: BSD-3-Clause +# --------------------------------------------------------------------- +# THIS FILE WAS AUTO-GENERATED. DO NOT EDIT MANUALLY. + +from unittest.mock import patch + +import pytest + +from qai_hub_models.models.swin_tiny import Model + + +@pytest.fixture(autouse=True) +def mock_from_pretrained(): + """ + Model.from_pretrained() can be slow. Invoke it once and cache it so all invocations + across all tests return the cached instance of the model. + """ + mock = patch( + "qai_hub_models.models.swin_tiny.Model.from_pretrained", + return_value=Model.from_pretrained(), + ) + mock.start() diff --git a/qai_hub_models/models/swin_tiny/demo.py b/qai_hub_models/models/swin_tiny/demo.py index cd8aac95..782e73d6 100644 --- a/qai_hub_models/models/swin_tiny/demo.py +++ b/qai_hub_models/models/swin_tiny/demo.py @@ -3,11 +3,11 @@ # SPDX-License-Identifier: BSD-3-Clause # --------------------------------------------------------------------- from qai_hub_models.models._shared.imagenet_classifier.demo import imagenet_demo -from qai_hub_models.models.swin_tiny.model import SwinTiny +from qai_hub_models.models.swin_tiny.model import MODEL_ID, SwinTiny def main(is_test: bool = False): - imagenet_demo(SwinTiny, is_test) + imagenet_demo(SwinTiny, MODEL_ID, is_test) if __name__ == "__main__": diff --git a/qai_hub_models/models/swin_tiny/export.py b/qai_hub_models/models/swin_tiny/export.py index 28760804..3f43d39b 100644 --- a/qai_hub_models/models/swin_tiny/export.py +++ b/qai_hub_models/models/swin_tiny/export.py @@ -10,7 +10,7 @@ import os import warnings from pathlib import Path -from typing import List, Optional, Tuple +from typing import Any, Dict, List, Optional, Tuple, cast import qai_hub as hub import torch @@ -109,35 +109,43 @@ def export_model( ) # Trace the model - source_model = torch.jit.trace(model, make_torch_inputs(input_spec)) + source_model = torch.jit.trace(model.to("cpu"), make_torch_inputs(input_spec)) # 2. Compile the model to an on-device asset model_compile_options = model.get_hub_compile_options( target_runtime, compile_options + " --force_channel_last_input image_tensor" ) - print(f"Optimizing model {model_name} to run on-device.") - compile_job = hub.submit_compile_job( + print(f"Optimizing model {model_name} to run on-device") + submitted_compile_job = hub.submit_compile_job( model=source_model, input_specs=input_spec, device=hub.Device(device), name=model_name, options=model_compile_options, ) + compile_job = cast(hub.client.CompileJob, submitted_compile_job) # 3. 
Profile the model asset on real devices - profile_job = None + profile_job: Optional[hub.client.ProfileJob] = None if not skip_profiling: + profile_options_all = model.get_hub_profile_options( + target_runtime, profile_options + ) print(f"Profiling model {model_name} on a hosted device.") - profile_job = hub.submit_profile_job( + submitted_profile_job = hub.submit_profile_job( model=compile_job.get_target_model(), device=hub.Device(device), name=model_name, - options=profile_options + " --compute_unit gpu", + options=profile_options_all, ) + profile_job = cast(hub.client.ProfileJob, submitted_profile_job) # 4. Run inference on-device with sample inputs - inference_job = None + inference_job: Optional[hub.client.InferenceJob] = None if not skip_inferencing: + profile_options_all = model.get_hub_profile_options( + target_runtime, profile_options + ) print( f"Running inference for {model_name} on a hosted device with example inputs." ) @@ -146,33 +154,35 @@ def export_model( hub_inputs = transpose_channel_first_to_last( "image_tensor", sample_inputs, target_runtime ) - inference_job = hub.submit_inference_job( + submitted_inference_job = hub.submit_inference_job( model=compile_job.get_target_model(), inputs=hub_inputs, device=hub.Device(device), name=model_name, - options=profile_options + " --compute_unit gpu", + options=profile_options_all, ) + inference_job = cast(hub.client.InferenceJob, submitted_inference_job) # 5. Download the model asset to a local file if not skip_downloading: os.makedirs(output_path, exist_ok=True) - target_model = compile_job.get_target_model() + target_model: hub.Model = compile_job.get_target_model() # type: ignore target_model.download(str(output_path / f"{model_name}.tflite")) # 6. Summarize the results from profiling and inference if not skip_summary and not skip_profiling: - assert profile_job.wait().success - profile_data = profile_job.download_profile() + assert profile_job is not None and profile_job.wait().success + profile_data: Dict[str, Any] = profile_job.download_profile() # type: ignore print_profile_metrics_from_job(profile_job, profile_data) if not skip_summary and not skip_inferencing: torch_out = torch_inference(model, sample_inputs) - assert inference_job.wait().success - inference_result = inference_job.download_output_data() + assert inference_job is not None and inference_job.wait().success + inference_result: hub.client.DatasetEntries = inference_job.download_output_data() # type: ignore print_inference_metrics(inference_job, inference_result, torch_out) - print_on_target_demo_cmd(compile_job, Path(__file__).parent.resolve(), device) + if not skip_summary: + print_on_target_demo_cmd(compile_job, Path(__file__).parent.resolve(), device) return (compile_job, profile_job, inference_job) diff --git a/qai_hub_models/models/swin_tiny/info.yaml b/qai_hub_models/models/swin_tiny/info.yaml index ee0345a8..aee47f6a 100644 --- a/qai_hub_models/models/swin_tiny/info.yaml +++ b/qai_hub_models/models/swin_tiny/info.yaml @@ -14,6 +14,7 @@ research_paper: https://arxiv.org/abs/2103.14030 research_paper_title: 'Swin Transformer: Hierarchical Vision Transformer using Shifted Windows' license: https://github.com/pytorch/vision/blob/main/LICENSE +deploy_license: https://qaihub-public-assets.s3.us-west-2.amazonaws.com/qai-hub-models/Qualcomm+AI+Hub+Proprietary+License.pdf source_repo: https://github.com/pytorch/vision/blob/main/torchvision/models/swin_transformer.py technical_details: @@ -38,6 +39,7 @@ form_factors: has_static_banner: yes has_animated_banner: 
yes license_type: bsd-3-clause +deploy_license_type: AI Model Hub License dataset: - imagenet-1k - imagenet-22k diff --git a/qai_hub_models/models/swin_tiny/perf.yaml b/qai_hub_models/models/swin_tiny/perf.yaml index 7603ecf3..6e71c1a1 100644 --- a/qai_hub_models/models/swin_tiny/perf.yaml +++ b/qai_hub_models/models/swin_tiny/perf.yaml @@ -17,31 +17,42 @@ aggregated: - Samsung Galaxy S23 - Samsung Galaxy S23 Ultra - Samsung Galaxy S23+ + - Samsung Galaxy S24 + - Samsung Galaxy S24 Ultra - Samsung Galaxy Tab S8 - Xiaomi 12 - Xiaomi 12 Pro supported_chipsets: - Snapdragon® 8 Gen 1 - Snapdragon® 8 Gen 2 + - Snapdragon® 8 Gen 3 - Snapdragon® 888 models: - name: Swin-Tiny performance_metrics: - torchscript_onnx_tflite: - inference_time: 29469.0 - throughput: 33.93396450507313 + inference_time: 31313.0 + throughput: 31.935617794526234 estimated_peak_memory_range: - min: 0 - max: 193113472 - primary_compute_unit: GPU + min: 81920 + max: 3482152 + primary_compute_unit: NPU precision: fp16 layer_info: - layers_on_npu: 0 - layers_on_gpu: 1059 + layers_on_npu: 859 + layers_on_gpu: 0 layers_on_cpu: 0 - total_layers: 1059 - job_id: jqpyojx45 + total_layers: 859 + job_id: j0pxvv1g7 job_status: Passed + reference_device_info: + name: Samsung Galaxy S23 + os: '13' + form_factor: Phone + os_name: Android + manufacturer: Samsung + chipset: Snapdragon® 8 Gen 2 + timestamp: '2024-03-14T23:34:33.080588Z' torchscript_onnx_qnn: inference_time: 'null' throughput: 'null' @@ -57,11 +68,41 @@ models: total_layers: 0 job_id: '' job_status: Skipped + - torchscript_onnx_tflite: + inference_time: 20716.0 + throughput: 48.27186715582159 + estimated_peak_memory_range: + min: 49152 + max: 274521296 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 859 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 859 + job_id: jo5mrrwgk + job_status: Passed reference_device_info: - name: Samsung Galaxy S23 Ultra - os: '13' + name: Samsung Galaxy S24 + os: '14' form_factor: Phone os_name: Android manufacturer: Samsung - chipset: Snapdragon® 8 Gen 2 - timestamp: '2024-02-21T16:18:27.047126Z' + chipset: Snapdragon® 8 Gen 3 + timestamp: '2024-03-14T23:34:33.080597Z' + torchscript_onnx_qnn: + inference_time: 'null' + throughput: 'null' + estimated_peak_memory_range: + min: 0 + max: 0 + primary_compute_unit: 'null' + precision: 'null' + layer_info: + layers_on_npu: 0 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 0 + job_id: '' + job_status: Skipped diff --git a/qai_hub_models/models/swin_tiny/test.py b/qai_hub_models/models/swin_tiny/test.py index 574e7c1e..1b20542f 100644 --- a/qai_hub_models/models/swin_tiny/test.py +++ b/qai_hub_models/models/swin_tiny/test.py @@ -14,7 +14,7 @@ from qai_hub_models.models.swin_tiny.model import MODEL_ID, SwinTiny -def test_task(imagenet_sample_torch): +def test_numerical(imagenet_sample_torch): # Ensure that the optimized SwinTiny matches the original one numerically x = imagenet_sample_torch model_opt = SwinTiny.from_pretrained().eval() diff --git a/qai_hub_models/models/trocr/README.md b/qai_hub_models/models/trocr/README.md index 27f1a1a4..27fde033 100644 --- a/qai_hub_models/models/trocr/README.md +++ b/qai_hub_models/models/trocr/README.md @@ -10,7 +10,7 @@ This is based on the implementation of TrOCR found export suitable to run on Qualcomm® devices. More details on model performance accross various devices, can be found [here](https://aihub.qualcomm.com/models/trocr). 
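A note on the perf.yaml metrics in the hunks above: the YAML does not state units, but the numbers are consistent with `inference_time` being reported in microseconds and `throughput` in inferences per second, i.e. throughput ≈ 1e6 / inference_time. A quick sanity check against the Swin-Tiny Samsung Galaxy S24 entry (values copied from the hunk; the unit interpretation is an assumption, not something the YAML declares):

```python
# Values copied from the Swin-Tiny / Samsung Galaxy S24 perf.yaml entry above.
# Assumption (not stated in the YAML): inference_time is in microseconds.
inference_time_us = 20716.0
reported_throughput = 48.27186715582159  # inferences per second

derived_throughput = 1_000_000 / inference_time_us
assert abs(derived_throughput - reported_throughput) < 1e-6
print(f"~{derived_throughput:.2f} inferences/s")
```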
-[Sign up](https://aihub.qualcomm.com/) for early access to run these models on +[Sign up](https://myaccount.qualcomm.com/signup) for early access to run these models on a hosted Qualcomm® device. @@ -30,7 +30,7 @@ python -m qai_hub_models.models.trocr.demo More details on the CLI tool can be found with the `--help` option. See [demo.py](demo.py) for sample usage of the model including pre/post processing scripts. Please refer to our [general instructions on using -models](../../#qai-hub-models) for more usage instructions. +models](../../../#getting-started) for more usage instructions. ## Export for on-device deployment @@ -46,7 +46,7 @@ script requires access to Deployment instructions for Qualcomm® AI Hub. ## License - The license for the original implementation of TrOCR can be found [here](https://github.com/microsoft/unilm/blob/master/LICENSE). -- The license for the compiled assets for on-device deployment can be found [here](https://qaihub-public-assets.s3.us-west-2.amazonaws.com/qai-hub-models/Qualcomm+AI+Hub+Proprietary+License.pdf). +- The license for the compiled assets for on-device deployment can be found [here]({deploy_license_url}) ## References * [TrOCR: Transformer-based Optical Character Recognition with Pre-trained Models](https://arxiv.org/abs/2109.10282) diff --git a/qai_hub_models/models/trocr/conftest.py b/qai_hub_models/models/trocr/conftest.py new file mode 100644 index 00000000..574e667a --- /dev/null +++ b/qai_hub_models/models/trocr/conftest.py @@ -0,0 +1,24 @@ +# --------------------------------------------------------------------- +# Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. +# SPDX-License-Identifier: BSD-3-Clause +# --------------------------------------------------------------------- +# THIS FILE WAS AUTO-GENERATED. DO NOT EDIT MANUALLY. + +from unittest.mock import patch + +import pytest + +from qai_hub_models.models.trocr import Model + + +@pytest.fixture(autouse=True) +def mock_from_pretrained(): + """ + Model.from_pretrained() can be slow. Invoke it once and cache it so all invocations + across all tests return the cached instance of the model. 
+ """ + mock = patch( + "qai_hub_models.models.trocr.Model.from_pretrained", + return_value=Model.from_pretrained(), + ) + mock.start() diff --git a/qai_hub_models/models/trocr/export.py b/qai_hub_models/models/trocr/export.py index 8ed66055..37af35b9 100644 --- a/qai_hub_models/models/trocr/export.py +++ b/qai_hub_models/models/trocr/export.py @@ -10,14 +10,14 @@ import os import warnings from pathlib import Path -from typing import List, Mapping, Optional, Tuple +from typing import Any, Dict, List, Mapping, Optional, Tuple, cast import qai_hub as hub import torch from qai_hub_models.models.trocr import Model from qai_hub_models.utils.args import export_parser, get_model_kwargs -from qai_hub_models.utils.base_model import TargetRuntime +from qai_hub_models.utils.base_model import BaseModel, TargetRuntime from qai_hub_models.utils.compare import torch_inference from qai_hub_models.utils.input_spec import make_torch_inputs from qai_hub_models.utils.printing import ( @@ -90,9 +90,9 @@ def export_model( output_path = Path(output_dir or Path.cwd() / "build" / model_name) component_arg = components components = components or ALL_COMPONENTS - for component in components: - if component not in ALL_COMPONENTS: - raise ValueError(f"Invalid component {component}.") + for component_name in components: + if component_name not in ALL_COMPONENTS: + raise ValueError(f"Invalid component {component_name}.") if not can_access_qualcomm_ai_hub(): return export_without_hub_access( "trocr", @@ -111,68 +111,85 @@ def export_model( # 1. Initialize PyTorch model model = Model.from_pretrained(**get_model_kwargs(Model, additional_model_kwargs)) - components_dict = {} + components_dict: Dict[str, BaseModel] = {} if "TrOCREncoder" in components: - components_dict["TrOCREncoder"] = model.encoder + components_dict["TrOCREncoder"] = model.encoder # type: ignore if "TrOCRDecoder" in components: - components_dict["TrOCRDecoder"] = model.decoder + components_dict["TrOCRDecoder"] = model.decoder # type: ignore - compile_jobs = {} + compile_jobs: Dict[str, hub.client.CompileJob] = {} for component_name, component in components_dict.items(): # Trace the model input_spec = component.get_input_spec() - source_model = torch.jit.trace(component, make_torch_inputs(input_spec)) + source_model = torch.jit.trace( + component.to("cpu"), make_torch_inputs(input_spec) + ) # 2. Compile the models to an on-device asset model_compile_options = component.get_hub_compile_options( target_runtime, compile_options + " --force_channel_last_input pixel_values" ) - print(f"Optimizing model {component_name} to run on-device.") - compile_jobs[component_name] = hub.submit_compile_job( + print(f"Optimizing model {component_name} to run on-device") + submitted_compile_job = hub.submit_compile_job( model=source_model, input_specs=input_spec, device=hub.Device(device), - name=f"{component_name}", + name=f"{model_name}_{component_name}", options=model_compile_options, ) + compile_jobs[component_name] = cast( + hub.client.CompileJob, submitted_compile_job + ) # 3. 
Profile the model assets on real devices - profile_jobs = {} + profile_jobs: Dict[str, hub.client.ProfileJob] = {} if not skip_profiling: for component_name in components: + profile_options_all = components_dict[ + component_name + ].get_hub_profile_options(target_runtime, profile_options) print(f"Profiling model {component_name} on a hosted device.") - profile_jobs[component_name] = hub.submit_profile_job( + submitted_profile_job = hub.submit_profile_job( model=compile_jobs[component_name].get_target_model(), device=hub.Device(device), - name=f"{component_name}", - options=profile_options, + name=f"{model_name}_{component_name}", + options=profile_options_all, + ) + profile_jobs[component_name] = cast( + hub.client.ProfileJob, submitted_profile_job ) # 4. Run inference on-device with sample inputs - inference_jobs = {} + inference_jobs: Dict[str, hub.client.InferenceJob] = {} if not skip_inferencing: for component_name in components: print( f"Running inference for {component_name} on a hosted device with example inputs." ) + profile_options_all = components_dict[ + component_name + ].get_hub_profile_options(target_runtime, profile_options) sample_inputs = components_dict[component_name].sample_inputs() # Convert inputs from channel first to channel last hub_inputs = transpose_channel_first_to_last( "pixel_values", sample_inputs, target_runtime ) - inference_jobs[component_name] = hub.submit_inference_job( + submitted_inference_job = hub.submit_inference_job( model=compile_jobs[component_name].get_target_model(), inputs=hub_inputs, device=hub.Device(device), - name=f"{component_name}", - options=profile_options, + name=f"{model_name}_{component_name}", + options=profile_options_all, + ) + inference_jobs[component_name] = cast( + hub.client.InferenceJob, submitted_inference_job ) # 5. 
Download the model assets to a local file if not skip_downloading: os.makedirs(output_path, exist_ok=True) for component_name, compile_job in compile_jobs.items(): - target_model = compile_job.get_target_model() + target_model: hub.Model = compile_job.get_target_model() # type: ignore target_model.download( str(output_path / f"{model_name}_{component_name}.tflite") ) @@ -181,8 +198,8 @@ def export_model( if not skip_summary and not skip_profiling: for component_name in components: profile_job = profile_jobs[component_name] - assert profile_job.wait().success - profile_data = profile_job.download_profile() + assert profile_job is not None and profile_job.wait().success + profile_data: Dict[str, Any] = profile_job.download_profile() # type: ignore print_profile_metrics_from_job(profile_job, profile_data) if not skip_summary and not skip_inferencing: @@ -190,8 +207,8 @@ def export_model( inference_job = inference_jobs[component_name] sample_inputs = components_dict[component_name].sample_inputs() torch_out = torch_inference(components_dict[component_name], sample_inputs) - assert inference_job.wait().success - inference_result = inference_job.download_output_data() + assert inference_job is not None and inference_job.wait().success + inference_result: hub.client.DatasetEntries = inference_job.download_output_data() # type: ignore print_inference_metrics(inference_job, inference_result, torch_out) return { diff --git a/qai_hub_models/models/trocr/info.yaml b/qai_hub_models/models/trocr/info.yaml index e0755c8f..cd37a5e5 100644 --- a/qai_hub_models/models/trocr/info.yaml +++ b/qai_hub_models/models/trocr/info.yaml @@ -14,6 +14,7 @@ research_paper: https://arxiv.org/abs/2109.10282 research_paper_title: 'TrOCR: Transformer-based Optical Character Recognition with Pre-trained Models' license: https://github.com/microsoft/unilm/blob/master/LICENSE +deploy_license: https://qaihub-public-assets.s3.us-west-2.amazonaws.com/qai-hub-models/Qualcomm+AI+Hub+Proprietary+License.pdf source_repo: https://huggingface.co/microsoft/trocr-small-stage1 technical_details: Model checkpoint: trocr-small-stage1 @@ -33,4 +34,5 @@ related_models: [] has_static_banner: yes has_animated_banner: yes license_type: mit +deploy_license_type: AI Model Hub License dataset: [] diff --git a/qai_hub_models/models/trocr/model.py b/qai_hub_models/models/trocr/model.py index 6fc61625..ac627212 100644 --- a/qai_hub_models/models/trocr/model.py +++ b/qai_hub_models/models/trocr/model.py @@ -116,7 +116,8 @@ def forward( return (*kv_cache,) # convert list to tuple for export - def get_input_spec(self) -> InputSpec: + @staticmethod + def get_input_spec() -> InputSpec: # Get the input specification ordered (name -> (shape, type)) pairs for this model. # # This can be used with the qai_hub python API to declared @@ -216,7 +217,10 @@ def forward( *out_kv_cache, ) - def get_input_spec(self) -> InputSpec: + @staticmethod + def get_input_spec( + decoder_attention_heads: int, embeddings_per_head: int, num_decoder_layers: int + ) -> InputSpec: """ Returns the input specification (name -> (shape, type). This can be used to submit profiling job on Qualcomm AI Hub. 
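The model.py hunks above and below convert `get_input_spec` from an instance method into a `@staticmethod` that takes the decoder configuration explicitly, with a small `_get_input_spec_for_model_instance` helper forwarding an instance's own attributes. A minimal sketch of the pattern, using a hypothetical class with illustrative names and shapes rather than the patch's actual TrOCR code:

```python
from typing import Dict, Tuple

# InputSpec maps input name -> (shape, dtype), mirroring the convention used in the patch.
InputSpec = Dict[str, Tuple[Tuple[int, ...], str]]


class ExampleDecoder:
    """Illustrative stand-in for a decoder whose input spec depends on its configuration."""

    def __init__(self, attention_heads: int, embeddings_per_head: int, num_layers: int):
        self.attention_heads = attention_heads
        self.embeddings_per_head = embeddings_per_head
        self.num_layers = num_layers

    @staticmethod
    def get_input_spec(
        attention_heads: int, embeddings_per_head: int, num_layers: int
    ) -> InputSpec:
        # Static: callable from plain configuration values, without instantiating
        # the (potentially large) model, e.g. when only a spec is needed for a job.
        spec: InputSpec = {"input_ids": ((1, 1), "int32")}
        for i in range(num_layers):
            spec[f"kv_{i}_attn_key"] = (
                (1, attention_heads, 200, embeddings_per_head),  # 200: placeholder seq len
                "float32",
            )
        return spec

    def _get_input_spec_for_model_instance(self) -> InputSpec:
        # Instance convenience wrapper: forward this object's own configuration.
        return self.__class__.get_input_spec(
            self.attention_heads, self.embeddings_per_head, self.num_layers
        )


# Usage: a spec can be built from configuration alone, no weights required.
spec = ExampleDecoder.get_input_spec(attention_heads=8, embeddings_per_head=64, num_layers=6)
```

Keeping the spec construction static means callers such as profiling-job submission can build input specifications without loading model weights, while existing instances still get a spec via the wrapper.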
@@ -226,9 +230,9 @@ def get_input_spec(self) -> InputSpec: attn_cache_spec = ( ( TROCR_BATCH_SIZE, - self.decoder_attention_heads, + decoder_attention_heads, TROCR_EXPORT_SEQ_LEN, - self.embeddings_per_head, + embeddings_per_head, ), "float32", ) @@ -236,15 +240,15 @@ def get_input_spec(self) -> InputSpec: cross_attn_cache_spec = ( ( TROCR_BATCH_SIZE, - self.decoder_attention_heads, + decoder_attention_heads, 578, # TODO: Can we get this programatically? - self.embeddings_per_head, + embeddings_per_head, ), "float32", ) decoder_input_specs: InputSpec = {"input_ids": input_ids_spec} - for i in range(0, self.num_decoder_layers): + for i in range(0, num_decoder_layers): decoder_input_specs[f"kv_{i}_attn_key"] = attn_cache_spec decoder_input_specs[f"kv_{i}_attn_val"] = attn_cache_spec decoder_input_specs[f"kv_{i}_cross_attn_key"] = cross_attn_cache_spec @@ -252,6 +256,13 @@ def get_input_spec(self) -> InputSpec: return decoder_input_specs + def _get_input_spec_for_model_instance(self) -> InputSpec: + return self.__class__.get_input_spec( + self.decoder_attention_heads, + self.embeddings_per_head, + self.num_decoder_layers, + ) + @classmethod def from_pretrained(cls): return TrOCR.from_pretrained().decoder diff --git a/qai_hub_models/models/trocr/perf.yaml b/qai_hub_models/models/trocr/perf.yaml index b9cea027..df5315f1 100644 --- a/qai_hub_models/models/trocr/perf.yaml +++ b/qai_hub_models/models/trocr/perf.yaml @@ -17,22 +17,25 @@ aggregated: - Samsung Galaxy S23 - Samsung Galaxy S23 Ultra - Samsung Galaxy S23+ + - Samsung Galaxy S24 + - Samsung Galaxy S24 Ultra - Samsung Galaxy Tab S8 - Xiaomi 12 - Xiaomi 12 Pro supported_chipsets: - Snapdragon® 8 Gen 1 - Snapdragon® 8 Gen 2 + - Snapdragon® 8 Gen 3 - Snapdragon® 888 models: - name: TrOCREncoder performance_metrics: - torchscript_onnx_tflite: - inference_time: 244369.0 - throughput: 4.092172084020477 + inference_time: 243976.0 + throughput: 4.098763812834049 estimated_peak_memory_range: - min: 7294976 - max: 10455296 + min: 7221248 + max: 10173368 primary_compute_unit: NPU precision: fp16 layer_info: @@ -40,8 +43,16 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 627 - job_id: j2p0m26eg + job_id: j7gjxxxpd job_status: Passed + reference_device_info: + name: Samsung Galaxy S23 + os: '13' + form_factor: Phone + os_name: Android + manufacturer: Samsung + chipset: Snapdragon® 8 Gen 2 + timestamp: '2024-03-14T23:46:04.016709Z' torchscript_onnx_qnn: inference_time: 'null' throughput: 'null' @@ -57,22 +68,52 @@ models: total_layers: 0 job_id: '' job_status: Skipped + - torchscript_onnx_tflite: + inference_time: 182193.0 + throughput: 5.48868507571641 + estimated_peak_memory_range: + min: 20480 + max: 305620528 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 627 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 627 + job_id: jygzeekg8 + job_status: Passed reference_device_info: - name: Samsung Galaxy S23 Ultra - os: '13' + name: Samsung Galaxy S24 + os: '14' form_factor: Phone os_name: Android manufacturer: Samsung - chipset: Snapdragon® 8 Gen 2 - timestamp: '2024-02-21T16:31:39.426796Z' + chipset: Snapdragon® 8 Gen 3 + timestamp: '2024-03-14T23:46:04.016721Z' + torchscript_onnx_qnn: + inference_time: 'null' + throughput: 'null' + estimated_peak_memory_range: + min: 0 + max: 0 + primary_compute_unit: 'null' + precision: 'null' + layer_info: + layers_on_npu: 0 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 0 + job_id: '' + job_status: Skipped - name: TrOCRDecoder performance_metrics: - 
torchscript_onnx_tflite: - inference_time: 2820.0 - throughput: 354.6099290780142 + inference_time: 2810.0 + throughput: 355.87188612099646 estimated_peak_memory_range: - min: 20480 - max: 2212720 + min: 12288 + max: 2353880 primary_compute_unit: NPU precision: fp16 layer_info: @@ -80,8 +121,16 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 394 - job_id: j1p8em18p + job_id: jlpe991gr job_status: Passed + reference_device_info: + name: Samsung Galaxy S23 + os: '13' + form_factor: Phone + os_name: Android + manufacturer: Samsung + chipset: Snapdragon® 8 Gen 2 + timestamp: '2024-03-14T23:51:23.352323Z' torchscript_onnx_qnn: inference_time: 'null' throughput: 'null' @@ -97,11 +146,41 @@ models: total_layers: 0 job_id: '' job_status: Skipped + - torchscript_onnx_tflite: + inference_time: 2018.0 + throughput: 495.5401387512388 + estimated_peak_memory_range: + min: 12288 + max: 193404384 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 394 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 394 + job_id: jz5wov6p1 + job_status: Passed reference_device_info: - name: Samsung Galaxy S23 Ultra - os: '13' + name: Samsung Galaxy S24 + os: '14' form_factor: Phone os_name: Android manufacturer: Samsung - chipset: Snapdragon® 8 Gen 2 - timestamp: '2024-02-21T16:34:45.126605Z' + chipset: Snapdragon® 8 Gen 3 + timestamp: '2024-03-14T23:51:23.352351Z' + torchscript_onnx_qnn: + inference_time: 'null' + throughput: 'null' + estimated_peak_memory_range: + min: 0 + max: 0 + primary_compute_unit: 'null' + precision: 'null' + layer_info: + layers_on_npu: 0 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 0 + job_id: '' + job_status: Skipped diff --git a/qai_hub_models/models/trocr/requirements.txt b/qai_hub_models/models/trocr/requirements.txt index 3a308074..b000abd3 100644 --- a/qai_hub_models/models/trocr/requirements.txt +++ b/qai_hub_models/models/trocr/requirements.txt @@ -1,2 +1,2 @@ -transformers==4.33.2 -sentencepiece +transformers==4.27.4 +sentencepiece==0.2.0 diff --git a/qai_hub_models/models/unet_segmentation/README.md b/qai_hub_models/models/unet_segmentation/README.md index 1980acde..f47eb91c 100644 --- a/qai_hub_models/models/unet_segmentation/README.md +++ b/qai_hub_models/models/unet_segmentation/README.md @@ -10,7 +10,7 @@ This is based on the implementation of Unet-Segmentation found export suitable to run on Qualcomm® devices. More details on model performance accross various devices, can be found [here](https://aihub.qualcomm.com/models/unet_segmentation). -[Sign up](https://aihub.qualcomm.com/) for early access to run these models on +[Sign up](https://myaccount.qualcomm.com/signup) for early access to run these models on a hosted Qualcomm® device. @@ -25,7 +25,7 @@ python -m qai_hub_models.models.unet_segmentation.demo More details on the CLI tool can be found with the `--help` option. See [demo.py](demo.py) for sample usage of the model including pre/post processing scripts. Please refer to our [general instructions on using -models](../../#qai-hub-models) for more usage instructions. +models](../../../#getting-started) for more usage instructions. ## Export for on-device deployment @@ -41,7 +41,7 @@ script requires access to Deployment instructions for Qualcomm® AI Hub. ## License - The license for the original implementation of Unet-Segmentation can be found [here](https://github.com/milesial/Pytorch-UNet/blob/master/LICENSE). 
-- The license for the compiled assets for on-device deployment can be found [here](https://qaihub-public-assets.s3.us-west-2.amazonaws.com/qai-hub-models/Qualcomm+AI+Hub+Proprietary+License.pdf). +- The license for the compiled assets for on-device deployment can be found [here]({deploy_license_url}) ## References * [U-Net: Convolutional Networks for Biomedical Image Segmentation](https://arxiv.org/abs/1505.04597) diff --git a/qai_hub_models/models/unet_segmentation/conftest.py b/qai_hub_models/models/unet_segmentation/conftest.py new file mode 100644 index 00000000..45c18443 --- /dev/null +++ b/qai_hub_models/models/unet_segmentation/conftest.py @@ -0,0 +1,24 @@ +# --------------------------------------------------------------------- +# Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. +# SPDX-License-Identifier: BSD-3-Clause +# --------------------------------------------------------------------- +# THIS FILE WAS AUTO-GENERATED. DO NOT EDIT MANUALLY. + +from unittest.mock import patch + +import pytest + +from qai_hub_models.models.unet_segmentation import Model + + +@pytest.fixture(autouse=True) +def mock_from_pretrained(): + """ + Model.from_pretrained() can be slow. Invoke it once and cache it so all invocations + across all tests return the cached instance of the model. + """ + mock = patch( + "qai_hub_models.models.unet_segmentation.Model.from_pretrained", + return_value=Model.from_pretrained(), + ) + mock.start() diff --git a/qai_hub_models/models/unet_segmentation/demo.py b/qai_hub_models/models/unet_segmentation/demo.py index b1714696..b2e6666d 100644 --- a/qai_hub_models/models/unet_segmentation/demo.py +++ b/qai_hub_models/models/unet_segmentation/demo.py @@ -32,6 +32,7 @@ # The demo will display the predicted mask in a window. def unet_demo( model: Callable[..., Callable[[torch.Tensor, torch.Tensor], torch.Tensor]], + MODEL_ID, default_image: PathType, is_test: bool = False, ): @@ -45,10 +46,10 @@ def unet_demo( help="File path or URL to an input image to use for the demo.", ) args = parser.parse_args([] if is_test else None) - validate_on_device_demo_args(args, model.get_model_id()) + validate_on_device_demo_args(args, MODEL_ID) # Load image & model - model = demo_model_from_cli_args(UNet, args) + model = demo_model_from_cli_args(UNet, MODEL_ID, args) print("Model loaded from pre-trained weights.") (_, _, height, width) = UNet.get_input_spec()["image"][0] orig_image = load_image( @@ -67,6 +68,7 @@ def unet_demo( def main(is_test: bool = False): unet_demo( UNet, + MODEL_ID, IMAGE_ADDRESS, is_test, ) diff --git a/qai_hub_models/models/unet_segmentation/export.py b/qai_hub_models/models/unet_segmentation/export.py index 94d2b2c4..11489e1b 100644 --- a/qai_hub_models/models/unet_segmentation/export.py +++ b/qai_hub_models/models/unet_segmentation/export.py @@ -10,7 +10,7 @@ import os import warnings from pathlib import Path -from typing import List, Optional, Tuple +from typing import Any, Dict, List, Optional, Tuple, cast import qai_hub as hub import torch @@ -110,7 +110,7 @@ def export_model( ) # Trace the model - source_model = torch.jit.trace(model, make_torch_inputs(input_spec)) + source_model = torch.jit.trace(model.to("cpu"), make_torch_inputs(input_spec)) # 2. 
Compile the model to an on-device asset model_compile_options = model.get_hub_compile_options( @@ -119,29 +119,37 @@ def export_model( + " --force_channel_last_input image" + " --force_channel_last_output output_0", ) - print(f"Optimizing model {model_name} to run on-device.") - compile_job = hub.submit_compile_job( + print(f"Optimizing model {model_name} to run on-device") + submitted_compile_job = hub.submit_compile_job( model=source_model, input_specs=input_spec, device=hub.Device(device), name=model_name, options=model_compile_options, ) + compile_job = cast(hub.client.CompileJob, submitted_compile_job) # 3. Profile the model asset on real devices - profile_job = None + profile_job: Optional[hub.client.ProfileJob] = None if not skip_profiling: + profile_options_all = model.get_hub_profile_options( + target_runtime, profile_options + ) print(f"Profiling model {model_name} on a hosted device.") - profile_job = hub.submit_profile_job( + submitted_profile_job = hub.submit_profile_job( model=compile_job.get_target_model(), device=hub.Device(device), name=model_name, - options=profile_options, + options=profile_options_all, ) + profile_job = cast(hub.client.ProfileJob, submitted_profile_job) # 4. Run inference on-device with sample inputs - inference_job = None + inference_job: Optional[hub.client.InferenceJob] = None if not skip_inferencing: + profile_options_all = model.get_hub_profile_options( + target_runtime, profile_options + ) print( f"Running inference for {model_name} on a hosted device with example inputs." ) @@ -150,37 +158,39 @@ def export_model( hub_inputs = transpose_channel_first_to_last( "image", sample_inputs, target_runtime ) - inference_job = hub.submit_inference_job( + submitted_inference_job = hub.submit_inference_job( model=compile_job.get_target_model(), inputs=hub_inputs, device=hub.Device(device), name=model_name, - options=profile_options, + options=profile_options_all, ) + inference_job = cast(hub.client.InferenceJob, submitted_inference_job) # 5. Download the model asset to a local file if not skip_downloading: os.makedirs(output_path, exist_ok=True) - target_model = compile_job.get_target_model() + target_model: hub.Model = compile_job.get_target_model() # type: ignore target_model.download(str(output_path / f"{model_name}.tflite")) # 6. 
Summarize the results from profiling and inference if not skip_summary and not skip_profiling: - assert profile_job.wait().success - profile_data = profile_job.download_profile() + assert profile_job is not None and profile_job.wait().success + profile_data: Dict[str, Any] = profile_job.download_profile() # type: ignore print_profile_metrics_from_job(profile_job, profile_data) if not skip_summary and not skip_inferencing: torch_out = torch_inference(model, sample_inputs) - assert inference_job.wait().success - inference_result = inference_job.download_output_data() + assert inference_job is not None and inference_job.wait().success + inference_result: hub.client.DatasetEntries = inference_job.download_output_data() # type: ignore # Convert outputs from channel last to channel first inference_result = transpose_channel_last_to_first( "output_0", inference_result, target_runtime ) print_inference_metrics(inference_job, inference_result, torch_out) - print_on_target_demo_cmd(compile_job, Path(__file__).parent.resolve(), device) + if not skip_summary: + print_on_target_demo_cmd(compile_job, Path(__file__).parent.resolve(), device) return (compile_job, profile_job, inference_job) diff --git a/qai_hub_models/models/unet_segmentation/info.yaml b/qai_hub_models/models/unet_segmentation/info.yaml index b835940a..f53c724e 100644 --- a/qai_hub_models/models/unet_segmentation/info.yaml +++ b/qai_hub_models/models/unet_segmentation/info.yaml @@ -15,6 +15,7 @@ tags: research_paper: https://arxiv.org/abs/1505.04597 research_paper_title: 'U-Net: Convolutional Networks for Biomedical Image Segmentation' license: https://github.com/milesial/Pytorch-UNet/blob/master/LICENSE +deploy_license: https://github.com/milesial/Pytorch-UNet/blob/master/LICENSE source_repo: https://github.com/milesial/Pytorch-UNet technical_details: Model checkpoint: unet_carvana_scale1.0_epoch2 @@ -35,4 +36,5 @@ form_factors: has_static_banner: yes has_animated_banner: no license_type: gpl-3.0 +deploy_license_type: gpl-3.0 dataset: [] diff --git a/qai_hub_models/models/unet_segmentation/perf.yaml b/qai_hub_models/models/unet_segmentation/perf.yaml index 42a0fc85..8ff61006 100644 --- a/qai_hub_models/models/unet_segmentation/perf.yaml +++ b/qai_hub_models/models/unet_segmentation/perf.yaml @@ -17,22 +17,25 @@ aggregated: - Samsung Galaxy S23 - Samsung Galaxy S23 Ultra - Samsung Galaxy S23+ + - Samsung Galaxy S24 + - Samsung Galaxy S24 Ultra - Samsung Galaxy Tab S8 - Xiaomi 12 - Xiaomi 12 Pro supported_chipsets: - Snapdragon® 8 Gen 1 - Snapdragon® 8 Gen 2 + - Snapdragon® 8 Gen 3 - Snapdragon® 888 models: - name: Unet-Segmentation performance_metrics: - torchscript_onnx_tflite: - inference_time: 7708.0 - throughput: 129.73533990659055 + inference_time: 160694.0 + throughput: 6.223007704083538 estimated_peak_memory_range: - min: 442368 - max: 29540072 + min: 6688768 + max: 229291048 primary_compute_unit: NPU precision: fp16 layer_info: @@ -40,14 +43,22 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 31 - job_id: j7gjr207p + job_id: jlpe9rvgr job_status: Passed + reference_device_info: + name: Samsung Galaxy S23 + os: '13' + form_factor: Phone + os_name: Android + manufacturer: Samsung + chipset: Snapdragon® 8 Gen 2 + timestamp: '2024-03-15T00:14:31.434457Z' torchscript_onnx_qnn: - inference_time: 7735.0 - throughput: 129.2824822236587 + inference_time: 146509.0 + throughput: 6.825519251377049 estimated_peak_memory_range: - min: 421888 - max: 282981312 + min: 10952704 + max: 44981480 primary_compute_unit: NPU precision: fp16 
layer_info: @@ -55,13 +66,43 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 52 - job_id: jlpe7wr75 + job_id: jmg9v3857 + job_status: Passed + - torchscript_onnx_tflite: + inference_time: 118830.0 + throughput: 8.415383320710259 + estimated_peak_memory_range: + min: 6234112 + max: 344093584 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 31 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 31 + job_id: jz5wodmp1 job_status: Passed reference_device_info: - name: Samsung Galaxy S23 Ultra - os: '13' + name: Samsung Galaxy S24 + os: '14' form_factor: Phone os_name: Android manufacturer: Samsung - chipset: Snapdragon® 8 Gen 2 - timestamp: '2024-02-21T16:43:41.073611Z' + chipset: Snapdragon® 8 Gen 3 + timestamp: '2024-03-15T00:16:26.988161Z' + torchscript_onnx_qnn: + inference_time: 110459.0 + throughput: 9.053132836618111 + estimated_peak_memory_range: + min: 328994816 + max: 420473984 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 52 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 52 + job_id: jnp10d75q + job_status: Passed diff --git a/qai_hub_models/models/vit/README.md b/qai_hub_models/models/vit/README.md index 3dc2bac9..8d5812f7 100644 --- a/qai_hub_models/models/vit/README.md +++ b/qai_hub_models/models/vit/README.md @@ -10,7 +10,7 @@ This is based on the implementation of VIT found export suitable to run on Qualcomm® devices. More details on model performance accross various devices, can be found [here](https://aihub.qualcomm.com/models/vit). -[Sign up](https://aihub.qualcomm.com/) for early access to run these models on +[Sign up](https://myaccount.qualcomm.com/signup) for early access to run these models on a hosted Qualcomm® device. @@ -25,7 +25,7 @@ python -m qai_hub_models.models.vit.demo More details on the CLI tool can be found with the `--help` option. See [demo.py](demo.py) for sample usage of the model including pre/post processing scripts. Please refer to our [general instructions on using -models](../../#qai-hub-models) for more usage instructions. +models](../../../#getting-started) for more usage instructions. ## Export for on-device deployment @@ -41,7 +41,7 @@ script requires access to Deployment instructions for Qualcomm® AI Hub. ## License - The license for the original implementation of VIT can be found [here](https://github.com/pytorch/vision/blob/main/LICENSE). -- The license for the compiled assets for on-device deployment can be found [here](https://qaihub-public-assets.s3.us-west-2.amazonaws.com/qai-hub-models/Qualcomm+AI+Hub+Proprietary+License.pdf). +- The license for the compiled assets for on-device deployment can be found [here]({deploy_license_url}) ## References * [An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale](https://arxiv.org/abs/2010.11929) diff --git a/qai_hub_models/models/vit/conftest.py b/qai_hub_models/models/vit/conftest.py new file mode 100644 index 00000000..de0fda92 --- /dev/null +++ b/qai_hub_models/models/vit/conftest.py @@ -0,0 +1,24 @@ +# --------------------------------------------------------------------- +# Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. +# SPDX-License-Identifier: BSD-3-Clause +# --------------------------------------------------------------------- +# THIS FILE WAS AUTO-GENERATED. DO NOT EDIT MANUALLY. 
+ +from unittest.mock import patch + +import pytest + +from qai_hub_models.models.vit import Model + + +@pytest.fixture(autouse=True) +def mock_from_pretrained(): + """ + Model.from_pretrained() can be slow. Invoke it once and cache it so all invocations + across all tests return the cached instance of the model. + """ + mock = patch( + "qai_hub_models.models.vit.Model.from_pretrained", + return_value=Model.from_pretrained(), + ) + mock.start() diff --git a/qai_hub_models/models/vit/demo.py b/qai_hub_models/models/vit/demo.py index 53e6806c..69d7d74b 100644 --- a/qai_hub_models/models/vit/demo.py +++ b/qai_hub_models/models/vit/demo.py @@ -3,11 +3,11 @@ # SPDX-License-Identifier: BSD-3-Clause # --------------------------------------------------------------------- from qai_hub_models.models._shared.imagenet_classifier.demo import imagenet_demo -from qai_hub_models.models.vit.model import VIT +from qai_hub_models.models.vit.model import MODEL_ID, VIT def main(is_test: bool = False): - imagenet_demo(VIT, is_test) + imagenet_demo(VIT, MODEL_ID, is_test) if __name__ == "__main__": diff --git a/qai_hub_models/models/vit/export.py b/qai_hub_models/models/vit/export.py index 31a1ccfc..bc43f4ae 100644 --- a/qai_hub_models/models/vit/export.py +++ b/qai_hub_models/models/vit/export.py @@ -10,7 +10,7 @@ import os import warnings from pathlib import Path -from typing import List, Optional, Tuple +from typing import Any, Dict, List, Optional, Tuple, cast import qai_hub as hub import torch @@ -110,36 +110,44 @@ def export_model( # Trace the model source_model = torch.jit.trace( - model, make_torch_inputs(input_spec), check_trace=False + model.to("cpu"), make_torch_inputs(input_spec), check_trace=False ) # 2. Compile the model to an on-device asset model_compile_options = model.get_hub_compile_options( target_runtime, compile_options + " --force_channel_last_input image_tensor" ) - print(f"Optimizing model {model_name} to run on-device.") - compile_job = hub.submit_compile_job( + print(f"Optimizing model {model_name} to run on-device") + submitted_compile_job = hub.submit_compile_job( model=source_model, input_specs=input_spec, device=hub.Device(device), name=model_name, options=model_compile_options, ) + compile_job = cast(hub.client.CompileJob, submitted_compile_job) # 3. Profile the model asset on real devices - profile_job = None + profile_job: Optional[hub.client.ProfileJob] = None if not skip_profiling: + profile_options_all = model.get_hub_profile_options( + target_runtime, profile_options + ) print(f"Profiling model {model_name} on a hosted device.") - profile_job = hub.submit_profile_job( + submitted_profile_job = hub.submit_profile_job( model=compile_job.get_target_model(), device=hub.Device(device), name=model_name, - options=profile_options, + options=profile_options_all, ) + profile_job = cast(hub.client.ProfileJob, submitted_profile_job) # 4. Run inference on-device with sample inputs - inference_job = None + inference_job: Optional[hub.client.InferenceJob] = None if not skip_inferencing: + profile_options_all = model.get_hub_profile_options( + target_runtime, profile_options + ) print( f"Running inference for {model_name} on a hosted device with example inputs." 
) @@ -148,33 +156,35 @@ def export_model( hub_inputs = transpose_channel_first_to_last( "image_tensor", sample_inputs, target_runtime ) - inference_job = hub.submit_inference_job( + submitted_inference_job = hub.submit_inference_job( model=compile_job.get_target_model(), inputs=hub_inputs, device=hub.Device(device), name=model_name, - options=profile_options, + options=profile_options_all, ) + inference_job = cast(hub.client.InferenceJob, submitted_inference_job) # 5. Download the model asset to a local file if not skip_downloading: os.makedirs(output_path, exist_ok=True) - target_model = compile_job.get_target_model() + target_model: hub.Model = compile_job.get_target_model() # type: ignore target_model.download(str(output_path / f"{model_name}.tflite")) # 6. Summarize the results from profiling and inference if not skip_summary and not skip_profiling: - assert profile_job.wait().success - profile_data = profile_job.download_profile() + assert profile_job is not None and profile_job.wait().success + profile_data: Dict[str, Any] = profile_job.download_profile() # type: ignore print_profile_metrics_from_job(profile_job, profile_data) if not skip_summary and not skip_inferencing: torch_out = torch_inference(model, sample_inputs) - assert inference_job.wait().success - inference_result = inference_job.download_output_data() + assert inference_job is not None and inference_job.wait().success + inference_result: hub.client.DatasetEntries = inference_job.download_output_data() # type: ignore print_inference_metrics(inference_job, inference_result, torch_out) - print_on_target_demo_cmd(compile_job, Path(__file__).parent.resolve(), device) + if not skip_summary: + print_on_target_demo_cmd(compile_job, Path(__file__).parent.resolve(), device) return (compile_job, profile_job, inference_job) diff --git a/qai_hub_models/models/vit/info.yaml b/qai_hub_models/models/vit/info.yaml index ec8bbafc..6667f41f 100644 --- a/qai_hub_models/models/vit/info.yaml +++ b/qai_hub_models/models/vit/info.yaml @@ -14,6 +14,7 @@ research_paper: https://arxiv.org/abs/2010.11929 research_paper_title: 'An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale' license: https://github.com/pytorch/vision/blob/main/LICENSE +deploy_license: https://qaihub-public-assets.s3.us-west-2.amazonaws.com/qai-hub-models/Qualcomm+AI+Hub+Proprietary+License.pdf source_repo: https://github.com/pytorch/vision/blob/main/torchvision/models/vision_transformer.py technical_details: @@ -37,6 +38,7 @@ form_factors: has_static_banner: yes has_animated_banner: yes license_type: bsd-3-clause +deploy_license_type: AI Model Hub License dataset: - imagenet-1k - imagenet-22k diff --git a/qai_hub_models/models/vit/model.py b/qai_hub_models/models/vit/model.py index b25fba95..aa608719 100644 --- a/qai_hub_models/models/vit/model.py +++ b/qai_hub_models/models/vit/model.py @@ -14,6 +14,6 @@ class VIT(ImagenetClassifier): @classmethod - def from_pretrained(cls, weights: str = DEFAULT_WEIGHTS) -> ImagenetClassifier: + def from_pretrained(cls, weights: str = DEFAULT_WEIGHTS) -> VIT: net = tv_models.vit_b_16(weights=weights) return cls(net) diff --git a/qai_hub_models/models/vit/perf.yaml b/qai_hub_models/models/vit/perf.yaml index 3eebfd79..6a9f6f8d 100644 --- a/qai_hub_models/models/vit/perf.yaml +++ b/qai_hub_models/models/vit/perf.yaml @@ -17,22 +17,25 @@ aggregated: - Samsung Galaxy S23 - Samsung Galaxy S23 Ultra - Samsung Galaxy S23+ + - Samsung Galaxy S24 + - Samsung Galaxy S24 Ultra - Samsung Galaxy Tab S8 - Xiaomi 12 - Xiaomi 12 Pro 
supported_chipsets: - Snapdragon® 8 Gen 1 - Snapdragon® 8 Gen 2 + - Snapdragon® 8 Gen 3 - Snapdragon® 888 models: - name: VIT performance_metrics: - torchscript_onnx_tflite: - inference_time: 135762.0 - throughput: 7.365831381388017 + inference_time: 136110.0 + throughput: 7.346998751010212 estimated_peak_memory_range: - min: 147456 - max: 3331880 + min: 86016 + max: 3893632 primary_compute_unit: NPU precision: fp16 layer_info: @@ -40,8 +43,16 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 557 - job_id: j1gly2ll5 + job_id: j1gln9lpv job_status: Passed + reference_device_info: + name: Samsung Galaxy S23 + os: '13' + form_factor: Phone + os_name: Android + manufacturer: Samsung + chipset: Snapdragon® 8 Gen 2 + timestamp: '2024-03-14T23:40:01.517909Z' torchscript_onnx_qnn: inference_time: 'null' throughput: 'null' @@ -57,11 +68,41 @@ models: total_layers: 0 job_id: '' job_status: Skipped + - torchscript_onnx_tflite: + inference_time: 100287.0 + throughput: 9.971382133277494 + estimated_peak_memory_range: + min: 163840 + max: 401162112 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 557 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 557 + job_id: jw566975o + job_status: Passed reference_device_info: - name: Samsung Galaxy S23 Ultra - os: '13' + name: Samsung Galaxy S24 + os: '14' form_factor: Phone os_name: Android manufacturer: Samsung - chipset: Snapdragon® 8 Gen 2 - timestamp: '2024-02-21T16:21:41.057280Z' + chipset: Snapdragon® 8 Gen 3 + timestamp: '2024-03-14T23:40:01.517918Z' + torchscript_onnx_qnn: + inference_time: 'null' + throughput: 'null' + estimated_peak_memory_range: + min: 0 + max: 0 + primary_compute_unit: 'null' + precision: 'null' + layer_info: + layers_on_npu: 0 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 0 + job_id: '' + job_status: Skipped diff --git a/qai_hub_models/models/whisper_asr/info.yaml b/qai_hub_models/models/whisper_asr/info.yaml deleted file mode 100644 index cf001a33..00000000 --- a/qai_hub_models/models/whisper_asr/info.yaml +++ /dev/null @@ -1,38 +0,0 @@ -name: Whisper-Base -# id must match with the model dir name in qai_hub_models -id: whisper_asr -status: public -headline: Automatic speech recognition (ASR) model for multilingual transcription - as well as translation. -domain: Audio -description: State-of-art model encoder-decoder transformer. The encoder takes an - audio chunk (around 30 second) converted to a log-Mel spectrogram. The decoder - predicts the corresponding text caption intermixed with special tokens that can - be used to direct the single model to perform various speech tasks. 
-use_case: Speech Recognition -tags: - - foundation -research_paper: https://cdn.openai.com/papers/whisper.pdf -research_paper_title: Robust Speech Recognition via Large-Scale Weak Supervision -license: https://github.com/openai/whisper/blob/main/LICENSE -source_repo: https://github.com/openai/whisper/tree/main -technical_details: - Model checkpoint: Tiny En - Input resolution: 80x3000 - Number of parameters (WhisperEncoder): 9.39M - Model size (WhisperEncoder): 35.9 MB - Number of parameters (WhisperDecoder): 28.2M - Model size (WhisperDecoder): 108 MB -applicable_scenarios: - - Smart Home - - Accessibility -related_models: - - huggingface_wavlm_base_plus -form_factors: - - Phone - - Tablet - - IoT -has_static_banner: yes -has_animated_banner: yes -license_type: mit -dataset: [] diff --git a/qai_hub_models/models/whisper_asr/README.md b/qai_hub_models/models/whisper_base_en/README.md similarity index 58% rename from qai_hub_models/models/whisper_asr/README.md rename to qai_hub_models/models/whisper_base_en/README.md index 88403a8d..f5a7658f 100644 --- a/qai_hub_models/models/whisper_asr/README.md +++ b/qai_hub_models/models/whisper_base_en/README.md @@ -1,16 +1,16 @@ [![Qualcomm® AI Hub Models](https://qaihub-public-assets.s3.us-west-2.amazonaws.com/qai-hub-models/quic-logo.jpg)](../../README.md) -# [Whisper-Base: Automatic speech recognition (ASR) model for multilingual transcription as well as translation](https://aihub.qualcomm.com/models/whisper_asr) +# [Whisper-Base-En: Automatic speech recognition (ASR) model for English transcription as well as translation](https://aihub.qualcomm.com/models/whisper_base_en) -State-of-art model encoder-decoder transformer. The encoder takes an audio chunk (around 30 second) converted to a log-Mel spectrogram. The decoder predicts the corresponding text caption intermixed with special tokens that can be used to direct the single model to perform various speech tasks. +OpenAI’s Whisper ASR (Automatic Speech Recognition) model is a state-of-the-art system designed for transcribing spoken language into written text. It exhibits robust performance in realistic, noisy environments, making it highly reliable for real-world applications. Specifically, it excels in long-form transcription, capable of accurately transcribing audio clips up to 30 seconds long. Time to the first token is the encoder's latency, while time to each additional token is decoder's latency, where we assume a mean decoded length specified below. -This is based on the implementation of Whisper-Base found +This is based on the implementation of Whisper-Base-En found [here](https://github.com/openai/whisper/tree/main). This repository contains scripts for optimized on-device export suitable to run on Qualcomm® devices. More details on model performance -accross various devices, can be found [here](https://aihub.qualcomm.com/models/whisper_asr). +accross various devices, can be found [here](https://aihub.qualcomm.com/models/whisper_base_en). -[Sign up](https://aihub.qualcomm.com/) for early access to run these models on +[Sign up](https://myaccount.qualcomm.com/signup) for early access to run these models on a hosted Qualcomm® device. @@ -18,19 +18,19 @@ a hosted Qualcomm® device. 
Install the package via pip: ```bash -pip install "qai_hub_models[whisper_asr]" +pip install "qai_hub_models[whisper_base_en]" ``` Once installed, run the following simple CLI demo: ```bash -python -m qai_hub_models.models.whisper_asr.demo +python -m qai_hub_models.models.whisper_base_en.demo ``` More details on the CLI tool can be found with the `--help` option. See [demo.py](demo.py) for sample usage of the model including pre/post processing scripts. Please refer to our [general instructions on using -models](../../#qai-hub-models) for more usage instructions. +models](../../../#getting-started) for more usage instructions. ## Export for on-device deployment @@ -38,15 +38,15 @@ This repository contains export scripts that produce a model optimized for on-device deployment. This can be run as follows: ```bash -python -m qai_hub_models.models.whisper_asr.export +python -m qai_hub_models.models.whisper_base_en.export ``` Additional options are documented with the `--help` option. Note that the above script requires access to Deployment instructions for Qualcomm® AI Hub. ## License -- The license for the original implementation of Whisper-Base can be found +- The license for the original implementation of Whisper-Base-En can be found [here](https://github.com/openai/whisper/blob/main/LICENSE). -- The license for the compiled assets for on-device deployment can be found [here](https://qaihub-public-assets.s3.us-west-2.amazonaws.com/qai-hub-models/Qualcomm+AI+Hub+Proprietary+License.pdf). +- The license for the compiled assets for on-device deployment can be found [here]({deploy_license_url}) ## References * [Robust Speech Recognition via Large-Scale Weak Supervision](https://cdn.openai.com/papers/whisper.pdf) diff --git a/qai_hub_models/models/whisper_asr/__init__.py b/qai_hub_models/models/whisper_base_en/__init__.py similarity index 67% rename from qai_hub_models/models/whisper_asr/__init__.py rename to qai_hub_models/models/whisper_base_en/__init__.py index 3f49ff9d..bac04dc1 100644 --- a/qai_hub_models/models/whisper_asr/__init__.py +++ b/qai_hub_models/models/whisper_base_en/__init__.py @@ -2,6 +2,7 @@ # Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. # SPDX-License-Identifier: BSD-3-Clause # --------------------------------------------------------------------- -from .app import WhisperApp as App # noqa: F401 +from qai_hub_models.models._shared.whisper.app import WhisperApp as App # noqa: F401 + from .model import MODEL_ID # noqa: F401 -from .model import Whisper as Model # noqa: F401 +from .model import WhisperBaseEn as Model # noqa: F401 diff --git a/qai_hub_models/models/whisper_base_en/conftest.py b/qai_hub_models/models/whisper_base_en/conftest.py new file mode 100644 index 00000000..b0406dce --- /dev/null +++ b/qai_hub_models/models/whisper_base_en/conftest.py @@ -0,0 +1,24 @@ +# --------------------------------------------------------------------- +# Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. +# SPDX-License-Identifier: BSD-3-Clause +# --------------------------------------------------------------------- +# THIS FILE WAS AUTO-GENERATED. DO NOT EDIT MANUALLY. + +from unittest.mock import patch + +import pytest + +from qai_hub_models.models.whisper_base_en import Model + + +@pytest.fixture(autouse=True) +def mock_from_pretrained(): + """ + Model.from_pretrained() can be slow. Invoke it once and cache it so all invocations + across all tests return the cached instance of the model. 
+ """ + mock = patch( + "qai_hub_models.models.whisper_base_en.Model.from_pretrained", + return_value=Model.from_pretrained(), + ) + mock.start() diff --git a/qai_hub_models/models/whisper_base_en/demo.py b/qai_hub_models/models/whisper_base_en/demo.py new file mode 100644 index 00000000..9bbf714a --- /dev/null +++ b/qai_hub_models/models/whisper_base_en/demo.py @@ -0,0 +1,14 @@ +# --------------------------------------------------------------------- +# Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. +# SPDX-License-Identifier: BSD-3-Clause +# --------------------------------------------------------------------- +from qai_hub_models.models._shared.whisper.demo import whisper_demo +from qai_hub_models.models.whisper_base_en.model import WhisperBaseEn + + +def main(): + whisper_demo(WhisperBaseEn) + + +if __name__ == "__main__": + main() diff --git a/qai_hub_models/models/whisper_asr/export.py b/qai_hub_models/models/whisper_base_en/export.py similarity index 74% rename from qai_hub_models/models/whisper_asr/export.py rename to qai_hub_models/models/whisper_base_en/export.py index ece839b1..5eaddcb4 100644 --- a/qai_hub_models/models/whisper_asr/export.py +++ b/qai_hub_models/models/whisper_base_en/export.py @@ -10,14 +10,14 @@ import os import warnings from pathlib import Path -from typing import List, Mapping, Optional, Tuple +from typing import Any, Dict, List, Mapping, Optional, Tuple, cast import qai_hub as hub import torch -from qai_hub_models.models.whisper_asr import Model +from qai_hub_models.models.whisper_base_en import Model from qai_hub_models.utils.args import export_parser, get_model_kwargs -from qai_hub_models.utils.base_model import TargetRuntime +from qai_hub_models.utils.base_model import BaseModel, TargetRuntime from qai_hub_models.utils.compare import torch_inference from qai_hub_models.utils.input_spec import make_torch_inputs from qai_hub_models.utils.printing import ( @@ -85,17 +85,17 @@ def export_model( * A ProfileJob containing metadata about the profile job (None if profiling skipped). * An InferenceJob containing metadata about the inference job (None if inferencing skipped). """ - model_name = "whisper_asr" + model_name = "whisper_base_en" output_path = Path(output_dir or Path.cwd() / "build" / model_name) component_arg = components components = components or ALL_COMPONENTS - for component in components: - if component not in ALL_COMPONENTS: - raise ValueError(f"Invalid component {component}.") + for component_name in components: + if component_name not in ALL_COMPONENTS: + raise ValueError(f"Invalid component {component_name}.") if not can_access_qualcomm_ai_hub(): return export_without_hub_access( - "whisper_asr", - "Whisper-Base", + "whisper_base_en", + "Whisper-Base-En", device, skip_profiling, skip_inferencing, @@ -110,64 +110,81 @@ def export_model( # 1. 
Initialize PyTorch model model = Model.from_pretrained(**get_model_kwargs(Model, additional_model_kwargs)) - components_dict = {} + components_dict: Dict[str, BaseModel] = {} if "WhisperEncoder" in components: - components_dict["WhisperEncoder"] = model.encoder + components_dict["WhisperEncoder"] = model.encoder # type: ignore if "WhisperDecoder" in components: - components_dict["WhisperDecoder"] = model.decoder + components_dict["WhisperDecoder"] = model.decoder # type: ignore - compile_jobs = {} + compile_jobs: Dict[str, hub.client.CompileJob] = {} for component_name, component in components_dict.items(): # Trace the model input_spec = component.get_input_spec() - source_model = torch.jit.trace(component, make_torch_inputs(input_spec)) + source_model = torch.jit.trace( + component.to("cpu"), make_torch_inputs(input_spec) + ) # 2. Compile the models to an on-device asset model_compile_options = component.get_hub_compile_options( target_runtime, compile_options ) - print(f"Optimizing model {component_name} to run on-device.") - compile_jobs[component_name] = hub.submit_compile_job( + print(f"Optimizing model {component_name} to run on-device") + submitted_compile_job = hub.submit_compile_job( model=source_model, input_specs=input_spec, device=hub.Device(device), - name=f"{component_name}", + name=f"{model_name}_{component_name}", options=model_compile_options, ) + compile_jobs[component_name] = cast( + hub.client.CompileJob, submitted_compile_job + ) # 3. Profile the model assets on real devices - profile_jobs = {} + profile_jobs: Dict[str, hub.client.ProfileJob] = {} if not skip_profiling: for component_name in components: + profile_options_all = components_dict[ + component_name + ].get_hub_profile_options(target_runtime, profile_options) print(f"Profiling model {component_name} on a hosted device.") - profile_jobs[component_name] = hub.submit_profile_job( + submitted_profile_job = hub.submit_profile_job( model=compile_jobs[component_name].get_target_model(), device=hub.Device(device), - name=f"{component_name}", - options=profile_options, + name=f"{model_name}_{component_name}", + options=profile_options_all, + ) + profile_jobs[component_name] = cast( + hub.client.ProfileJob, submitted_profile_job ) # 4. Run inference on-device with sample inputs - inference_jobs = {} + inference_jobs: Dict[str, hub.client.InferenceJob] = {} if not skip_inferencing: for component_name in components: print( f"Running inference for {component_name} on a hosted device with example inputs." ) + profile_options_all = components_dict[ + component_name + ].get_hub_profile_options(target_runtime, profile_options) sample_inputs = components_dict[component_name].sample_inputs() - inference_jobs[component_name] = hub.submit_inference_job( + submitted_inference_job = hub.submit_inference_job( model=compile_jobs[component_name].get_target_model(), inputs=sample_inputs, device=hub.Device(device), - name=f"{component_name}", - options=profile_options, + name=f"{model_name}_{component_name}", + options=profile_options_all, + ) + inference_jobs[component_name] = cast( + hub.client.InferenceJob, submitted_inference_job ) # 5. 
Download the model assets to a local file if not skip_downloading: os.makedirs(output_path, exist_ok=True) for component_name, compile_job in compile_jobs.items(): - target_model = compile_job.get_target_model() + target_model: hub.Model = compile_job.get_target_model() # type: ignore target_model.download( str(output_path / f"{model_name}_{component_name}.tflite") ) @@ -176,8 +193,8 @@ def export_model( if not skip_summary and not skip_profiling: for component_name in components: profile_job = profile_jobs[component_name] - assert profile_job.wait().success - profile_data = profile_job.download_profile() + assert profile_job is not None and profile_job.wait().success + profile_data: Dict[str, Any] = profile_job.download_profile() # type: ignore print_profile_metrics_from_job(profile_job, profile_data) if not skip_summary and not skip_inferencing: @@ -185,8 +202,8 @@ def export_model( inference_job = inference_jobs[component_name] sample_inputs = components_dict[component_name].sample_inputs() torch_out = torch_inference(components_dict[component_name], sample_inputs) - assert inference_job.wait().success - inference_result = inference_job.download_output_data() + assert inference_job is not None and inference_job.wait().success + inference_result: hub.client.DatasetEntries = inference_job.download_output_data() # type: ignore print_inference_metrics(inference_job, inference_result, torch_out) return { diff --git a/qai_hub_models/models/whisper_base_en/info.yaml b/qai_hub_models/models/whisper_base_en/info.yaml new file mode 100644 index 00000000..320d726c --- /dev/null +++ b/qai_hub_models/models/whisper_base_en/info.yaml @@ -0,0 +1,40 @@ +name: Whisper-Base-En +# id must match with the model dir name in qai_hub_models +id: whisper_base_en +status: public +headline: Automatic speech recognition (ASR) model for English transcription as well + as translation. +domain: Audio +description: OpenAI’s Whisper ASR (Automatic Speech Recognition) model is a state-of-the-art system designed for transcribing spoken language into written text. It exhibits robust performance in realistic, noisy environments, making it highly reliable for real-world applications. Specifically, it excels in long-form transcription, capable of accurately transcribing audio clips up to 30 seconds long. Time to the first token is the encoder's latency, while time to each additional token is decoder's latency, where we assume a mean decoded length specified below. 
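A rough consequence of the latency model described above: end-to-end time for one 30-second chunk is approximately the encoder time plus the mean decoded length times the per-token decoder time. A back-of-the-envelope sketch using the Samsung Galaxy S23 TFLite figures reported in the technical_details and perf.yaml below (these numbers are assumed from this patch; real results vary by device and clip):

```python
# Rough latency estimate for whisper_base_en on Samsung Galaxy S23 (TFLite),
# using the figures from info.yaml / perf.yaml below. Illustrative only.
encoder_time_us = 154_406          # time to first token (WhisperEncoder)
decoder_time_us = 14_139           # time per additional token (WhisperDecoder)
mean_decoded_tokens = 112          # mean decoded sequence length (info.yaml)

total_us = encoder_time_us + mean_decoded_tokens * decoder_time_us
print(f"~{total_us / 1e6:.2f} s to transcribe a 30 s chunk")  # ~1.74 s
```

On that device the decoder loop dominates, accounting for roughly 1.6 s of the ~1.7 s total.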
+use_case: Speech Recognition +tags: + - foundation +research_paper: https://cdn.openai.com/papers/whisper.pdf +research_paper_title: Robust Speech Recognition via Large-Scale Weak Supervision +license: https://github.com/openai/whisper/blob/main/LICENSE +deploy_license: https://qaihub-public-assets.s3.us-west-2.amazonaws.com/qai-hub-models/Qualcomm+AI+Hub+Proprietary+License.pdf +source_repo: https://github.com/openai/whisper/tree/main +technical_details: + Model checkpoint: base.en + Input resolution: 80x3000 (30 seconds audio) + Mean decoded sequence length: 112 tokens + Number of parameters (WhisperEncoder): 23.7M + Model size (WhisperEncoder): 90.6 MB + Number of parameters (WhisperDecoder): 48.6M + Model size (WhisperDecoder): 186 MB +applicable_scenarios: + - Smart Home + - Accessibility +related_models: + - whisper_tiny_en + - whisper_small_en + - huggingface_wavlm_base_plus +form_factors: + - Phone + - Tablet + - IoT +has_static_banner: yes +has_animated_banner: yes +license_type: mit +deploy_license_type: AI Model Hub License +dataset: [] diff --git a/qai_hub_models/models/whisper_base_en/model.py b/qai_hub_models/models/whisper_base_en/model.py new file mode 100644 index 00000000..fca5be00 --- /dev/null +++ b/qai_hub_models/models/whisper_base_en/model.py @@ -0,0 +1,16 @@ +# --------------------------------------------------------------------- +# Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. +# SPDX-License-Identifier: BSD-3-Clause +# --------------------------------------------------------------------- +from __future__ import annotations + +from qai_hub_models.models._shared.whisper.model import Whisper + +MODEL_ID = __name__.split(".")[-2] +WHISPER_VERSION = "base.en" + + +class WhisperBaseEn(Whisper): + @classmethod + def from_pretrained(cls): + return Whisper.from_pretrained(WHISPER_VERSION) diff --git a/qai_hub_models/models/whisper_base_en/perf.yaml b/qai_hub_models/models/whisper_base_en/perf.yaml new file mode 100644 index 00000000..5831fb37 --- /dev/null +++ b/qai_hub_models/models/whisper_base_en/perf.yaml @@ -0,0 +1,186 @@ +aggregated: + supported_oses: + - Android + supported_devices: + - Google Pixel 3 + - Google Pixel 3a + - Google Pixel 3a XL + - Google Pixel 4 + - Google Pixel 4a + - Google Pixel 5a 5G + - Samsung Galaxy S21 + - Samsung Galaxy S21 Ultra + - Samsung Galaxy S21+ + - Samsung Galaxy S22 5G + - Samsung Galaxy S22 Ultra 5G + - Samsung Galaxy S22+ 5G + - Samsung Galaxy S23 + - Samsung Galaxy S23 Ultra + - Samsung Galaxy S23+ + - Samsung Galaxy S24 + - Samsung Galaxy S24 Ultra + - Samsung Galaxy Tab S8 + - Xiaomi 12 + - Xiaomi 12 Pro + supported_chipsets: + - Snapdragon® 8 Gen 1 + - Snapdragon® 8 Gen 2 + - Snapdragon® 8 Gen 3 + - Snapdragon® 888 +models: +- name: WhisperEncoder + performance_metrics: + - torchscript_onnx_tflite: + inference_time: 154406.0 + throughput: 6.476432262994962 + estimated_peak_memory_range: + min: 36892672 + max: 232224176 + primary_compute_unit: GPU + precision: fp16 + layer_info: + layers_on_npu: 0 + layers_on_gpu: 315 + layers_on_cpu: 0 + total_layers: 315 + job_id: jqp4q0vgo + job_status: Passed + reference_device_info: + name: Samsung Galaxy S23 + os: '13' + form_factor: Phone + os_name: Android + manufacturer: Samsung + chipset: Snapdragon® 8 Gen 2 + timestamp: '2024-03-14T23:10:43.748935Z' + torchscript_onnx_qnn: + inference_time: 'null' + throughput: 'null' + estimated_peak_memory_range: + min: 0 + max: 0 + primary_compute_unit: 'null' + precision: 'null' + layer_info: + layers_on_npu: 0 + 
layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 0 + job_id: '' + job_status: Skipped + - torchscript_onnx_tflite: + inference_time: 120437.0 + throughput: 8.303096224582147 + estimated_peak_memory_range: + min: 36777984 + max: 66087104 + primary_compute_unit: GPU + precision: fp16 + layer_info: + layers_on_npu: 0 + layers_on_gpu: 315 + layers_on_cpu: 0 + total_layers: 315 + job_id: jo5mrywgk + job_status: Passed + reference_device_info: + name: Samsung Galaxy S24 + os: '14' + form_factor: Phone + os_name: Android + manufacturer: Samsung + chipset: Snapdragon® 8 Gen 3 + timestamp: '2024-03-14T23:10:43.748943Z' + torchscript_onnx_qnn: + inference_time: 'null' + throughput: 'null' + estimated_peak_memory_range: + min: 0 + max: 0 + primary_compute_unit: 'null' + precision: 'null' + layer_info: + layers_on_npu: 0 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 0 + job_id: '' + job_status: Skipped +- name: WhisperDecoder + performance_metrics: + - torchscript_onnx_tflite: + inference_time: 14139.0 + throughput: 70.72635971426551 + estimated_peak_memory_range: + min: 3051520 + max: 5712920 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 433 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 433 + job_id: j0pxv21g7 + job_status: Passed + reference_device_info: + name: Samsung Galaxy S23 + os: '13' + form_factor: Phone + os_name: Android + manufacturer: Samsung + chipset: Snapdragon® 8 Gen 2 + timestamp: '2024-03-14T23:16:05.499826Z' + torchscript_onnx_qnn: + inference_time: 'null' + throughput: 'null' + estimated_peak_memory_range: + min: 0 + max: 0 + primary_compute_unit: 'null' + precision: 'null' + layer_info: + layers_on_npu: 0 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 0 + job_id: '' + job_status: Skipped + - torchscript_onnx_tflite: + inference_time: 10614.0 + throughput: 94.21518748822311 + estimated_peak_memory_range: + min: 2019328 + max: 96045024 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 433 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 433 + job_id: jegn28rgo + job_status: Passed + reference_device_info: + name: Samsung Galaxy S24 + os: '14' + form_factor: Phone + os_name: Android + manufacturer: Samsung + chipset: Snapdragon® 8 Gen 3 + timestamp: '2024-03-14T23:16:05.499836Z' + torchscript_onnx_qnn: + inference_time: 'null' + throughput: 'null' + estimated_peak_memory_range: + min: 0 + max: 0 + primary_compute_unit: 'null' + precision: 'null' + layer_info: + layers_on_npu: 0 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 0 + job_id: '' + job_status: Skipped diff --git a/qai_hub_models/models/whisper_asr/requirements.txt b/qai_hub_models/models/whisper_base_en/requirements.txt similarity index 100% rename from qai_hub_models/models/whisper_asr/requirements.txt rename to qai_hub_models/models/whisper_base_en/requirements.txt diff --git a/qai_hub_models/models/whisper_base_en/test.py b/qai_hub_models/models/whisper_base_en/test.py new file mode 100644 index 00000000..aeb74e53 --- /dev/null +++ b/qai_hub_models/models/whisper_base_en/test.py @@ -0,0 +1,22 @@ +# --------------------------------------------------------------------- +# Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. 
+# SPDX-License-Identifier: BSD-3-Clause +# --------------------------------------------------------------------- +from qai_hub_models.models._shared.whisper.test_utils import ( + run_test_transcribe, + run_test_wrapper_numerics, +) +from qai_hub_models.models.whisper_tiny_en.demo import main as demo_main +from qai_hub_models.models.whisper_tiny_en.model import WHISPER_VERSION + + +def test_numerics(): + run_test_wrapper_numerics(WHISPER_VERSION) + + +def test_transcribe(): + run_test_transcribe(WHISPER_VERSION) + + +def test_demo(): + demo_main() diff --git a/qai_hub_models/models/whisper_small_en/README.md b/qai_hub_models/models/whisper_small_en/README.md new file mode 100644 index 00000000..c1f21275 --- /dev/null +++ b/qai_hub_models/models/whisper_small_en/README.md @@ -0,0 +1,59 @@ +[![Qualcomm® AI Hub Models](https://qaihub-public-assets.s3.us-west-2.amazonaws.com/qai-hub-models/quic-logo.jpg)](../../README.md) + + +# [Whisper-Small-En: Automatic speech recognition (ASR) model for English transcription as well as translation](https://aihub.qualcomm.com/models/whisper_small_en) + +OpenAI’s Whisper ASR (Automatic Speech Recognition) model is a state-of-the-art system designed for transcribing spoken language into written text. It exhibits robust performance in realistic, noisy environments, making it highly reliable for real-world applications. Specifically, it excels in long-form transcription, capable of accurately transcribing audio clips up to 30 seconds long. Time to the first token is the encoder's latency, while time to each additional token is decoder's latency, where we assume a mean decoded length specified below. + +This is based on the implementation of Whisper-Small-En found +[here](https://github.com/openai/whisper/tree/main). This repository contains scripts for optimized on-device +export suitable to run on Qualcomm® devices. More details on model performance +accross various devices, can be found [here](https://aihub.qualcomm.com/models/whisper_small_en). + +[Sign up](https://myaccount.qualcomm.com/signup) for early access to run these models on +a hosted Qualcomm® device. + + +## Example & Usage + +Install the package via pip: +```bash +pip install "qai_hub_models[whisper_small_en]" +``` + + +Once installed, run the following simple CLI demo: + +```bash +python -m qai_hub_models.models.whisper_small_en.demo +``` +More details on the CLI tool can be found with the `--help` option. See +[demo.py](demo.py) for sample usage of the model including pre/post processing +scripts. Please refer to our [general instructions on using +models](../../../#getting-started) for more usage instructions. + +## Export for on-device deployment + +This repository contains export scripts that produce a model optimized for +on-device deployment. This can be run as follows: + +```bash +python -m qai_hub_models.models.whisper_small_en.export +``` +Additional options are documented with the `--help` option. Note that the above +script requires access to Deployment instructions for Qualcomm® AI Hub. + +## License +- The license for the original implementation of Whisper-Small-En can be found + [here](https://github.com/openai/whisper/blob/main/LICENSE). 
+- The license for the compiled assets for on-device deployment can be found [here]({deploy_license_url}) + +## References +* [Robust Speech Recognition via Large-Scale Weak Supervision](https://cdn.openai.com/papers/whisper.pdf) +* [Source Model Implementation](https://github.com/openai/whisper/tree/main) + +## Community +* Join [our AI Hub Slack community](https://join.slack.com/t/qualcomm-ai-hub/shared_invite/zt-2dgf95loi-CXHTDRR1rvPgQWPO~ZZZJg) to collaborate, post questions and learn more about on-device AI. +* For questions or feedback please [reach out to us](mailto:ai-hub-support@qti.qualcomm.com). + + diff --git a/qai_hub_models/models/whisper_small_en/__init__.py b/qai_hub_models/models/whisper_small_en/__init__.py new file mode 100644 index 00000000..ac4b17de --- /dev/null +++ b/qai_hub_models/models/whisper_small_en/__init__.py @@ -0,0 +1,8 @@ +# --------------------------------------------------------------------- +# Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. +# SPDX-License-Identifier: BSD-3-Clause +# --------------------------------------------------------------------- +from qai_hub_models.models._shared.whisper.app import WhisperApp as App # noqa: F401 + +from .model import MODEL_ID # noqa: F401 +from .model import WhisperSmallEn as Model # noqa: F401 diff --git a/qai_hub_models/models/whisper_small_en/conftest.py b/qai_hub_models/models/whisper_small_en/conftest.py new file mode 100644 index 00000000..a7507db2 --- /dev/null +++ b/qai_hub_models/models/whisper_small_en/conftest.py @@ -0,0 +1,24 @@ +# --------------------------------------------------------------------- +# Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. +# SPDX-License-Identifier: BSD-3-Clause +# --------------------------------------------------------------------- +# THIS FILE WAS AUTO-GENERATED. DO NOT EDIT MANUALLY. + +from unittest.mock import patch + +import pytest + +from qai_hub_models.models.whisper_small_en import Model + + +@pytest.fixture(autouse=True) +def mock_from_pretrained(): + """ + Model.from_pretrained() can be slow. Invoke it once and cache it so all invocations + across all tests return the cached instance of the model. + """ + mock = patch( + "qai_hub_models.models.whisper_small_en.Model.from_pretrained", + return_value=Model.from_pretrained(), + ) + mock.start() diff --git a/qai_hub_models/models/whisper_small_en/demo.py b/qai_hub_models/models/whisper_small_en/demo.py new file mode 100644 index 00000000..c3100f59 --- /dev/null +++ b/qai_hub_models/models/whisper_small_en/demo.py @@ -0,0 +1,14 @@ +# --------------------------------------------------------------------- +# Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. +# SPDX-License-Identifier: BSD-3-Clause +# --------------------------------------------------------------------- +from qai_hub_models.models._shared.whisper.demo import whisper_demo +from qai_hub_models.models.whisper_small_en.model import WhisperSmallEn + + +def main(): + whisper_demo(WhisperSmallEn) + + +if __name__ == "__main__": + main() diff --git a/qai_hub_models/models/whisper_small_en/export.py b/qai_hub_models/models/whisper_small_en/export.py new file mode 100644 index 00000000..348716d4 --- /dev/null +++ b/qai_hub_models/models/whisper_small_en/export.py @@ -0,0 +1,229 @@ +# --------------------------------------------------------------------- +# Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. 
+# SPDX-License-Identifier: BSD-3-Clause +# --------------------------------------------------------------------- +# THIS FILE WAS AUTO-GENERATED. DO NOT EDIT MANUALLY. + + +from __future__ import annotations + +import os +import warnings +from pathlib import Path +from typing import Any, Dict, List, Mapping, Optional, Tuple, cast + +import qai_hub as hub +import torch + +from qai_hub_models.models.whisper_small_en import Model +from qai_hub_models.utils.args import export_parser, get_model_kwargs +from qai_hub_models.utils.base_model import BaseModel, TargetRuntime +from qai_hub_models.utils.compare import torch_inference +from qai_hub_models.utils.input_spec import make_torch_inputs +from qai_hub_models.utils.printing import ( + print_inference_metrics, + print_profile_metrics_from_job, +) +from qai_hub_models.utils.qai_hub_helpers import ( + can_access_qualcomm_ai_hub, + export_without_hub_access, +) + +ALL_COMPONENTS = ["WhisperEncoder", "WhisperDecoder"] + + +def export_model( + device: str = "Samsung Galaxy S23", + components: Optional[List[str]] = None, + skip_profiling: bool = False, + skip_inferencing: bool = False, + skip_downloading: bool = False, + skip_summary: bool = False, + output_dir: Optional[str] = None, + target_runtime: TargetRuntime = TargetRuntime.TFLITE, + compile_options: str = "", + profile_options: str = "", + **additional_model_kwargs, +) -> Mapping[ + str, Tuple[hub.CompileJob, Optional[hub.ProfileJob], Optional[hub.InferenceJob]] +] | List[str]: + """ + This function accomplishes 6 main tasks: + + 1. Instantiates a PyTorch model and converts it to a traced TorchScript format. + 2. Compiles the model to an asset that can be run on device. + 3. Profiles the model performance on real devices. + 4. Inferences the model on sample inputs. + 5. Downloads the model asset to the local directory. + 6. Summarizes the results from profiling and inference. + + Each of the last four steps can be optionally skipped using the input options. + + Parameters: + device: Device for which to export the model. + Full list of available devices can be found by running `hub.get_devices()`. + Defaults to DEFAULT_DEVICE if not specified. + components: List of sub-components of the model that will be exported. + Each component is compiled and profiled separately. + Defaults to ALL_COMPONENTS if not specified. + skip_profiling: If set, skips profiling of compiled model on real devices. + skip_inferencing: If set, skips computing on-device outputs from sample data. + skip_downloading: If set, skips downloading of compiled model. + skip_summary: If set, skips waiting for and summarizing results + from profiling and inference. + output_dir: Directory to store generated assets (e.g. compiled model). + Defaults to `/build/`. + target_runtime: Which on-device runtime to target. Default is TFLite. + compile_options: Additional options to pass when submitting the compile job. + profile_options: Additional options to pass when submitting the profile job. + **additional_model_kwargs: Additional optional kwargs used to customize + `model_cls.from_pretrained` + + Returns: + A Mapping from component_name to a 3-tuple of: + * A CompileJob object containing metadata about the compile job submitted to hub. + * A ProfileJob containing metadata about the profile job (None if profiling skipped). + * An InferenceJob containing metadata about the inference job (None if inferencing skipped). 
+ """ + model_name = "whisper_small_en" + output_path = Path(output_dir or Path.cwd() / "build" / model_name) + component_arg = components + components = components or ALL_COMPONENTS + for component_name in components: + if component_name not in ALL_COMPONENTS: + raise ValueError(f"Invalid component {component_name}.") + if not can_access_qualcomm_ai_hub(): + return export_without_hub_access( + "whisper_small_en", + "Whisper-Small-En", + device, + skip_profiling, + skip_inferencing, + skip_downloading, + skip_summary, + output_path, + target_runtime, + compile_options, + profile_options, + component_arg, + ) + + # 1. Initialize PyTorch model + model = Model.from_pretrained(**get_model_kwargs(Model, additional_model_kwargs)) + components_dict: Dict[str, BaseModel] = {} + if "WhisperEncoder" in components: + components_dict["WhisperEncoder"] = model.encoder # type: ignore + if "WhisperDecoder" in components: + components_dict["WhisperDecoder"] = model.decoder # type: ignore + + compile_jobs: Dict[str, hub.client.CompileJob] = {} + for component_name, component in components_dict.items(): + # Trace the model + input_spec = component.get_input_spec() + source_model = torch.jit.trace( + component.to("cpu"), make_torch_inputs(input_spec) + ) + + # 2. Compile the models to an on-device asset + model_compile_options = component.get_hub_compile_options( + target_runtime, compile_options + ) + print(f"Optimizing model {component_name} to run on-device") + submitted_compile_job = hub.submit_compile_job( + model=source_model, + input_specs=input_spec, + device=hub.Device(device), + name=f"{model_name}_{component_name}", + options=model_compile_options, + ) + compile_jobs[component_name] = cast( + hub.client.CompileJob, submitted_compile_job + ) + + # 3. Profile the model assets on real devices + profile_jobs: Dict[str, hub.client.ProfileJob] = {} + if not skip_profiling: + for component_name in components: + profile_options_all = components_dict[ + component_name + ].get_hub_profile_options(target_runtime, profile_options) + print(f"Profiling model {component_name} on a hosted device.") + submitted_profile_job = hub.submit_profile_job( + model=compile_jobs[component_name].get_target_model(), + device=hub.Device(device), + name=f"{model_name}_{component_name}", + options=profile_options_all, + ) + profile_jobs[component_name] = cast( + hub.client.ProfileJob, submitted_profile_job + ) + + # 4. Run inference on-device with sample inputs + inference_jobs: Dict[str, hub.client.InferenceJob] = {} + if not skip_inferencing: + for component_name in components: + print( + f"Running inference for {component_name} on a hosted device with example inputs." + ) + profile_options_all = components_dict[ + component_name + ].get_hub_profile_options(target_runtime, profile_options) + sample_inputs = components_dict[component_name].sample_inputs() + submitted_inference_job = hub.submit_inference_job( + model=compile_jobs[component_name].get_target_model(), + inputs=sample_inputs, + device=hub.Device(device), + name=f"{model_name}_{component_name}", + options=profile_options_all, + ) + inference_jobs[component_name] = cast( + hub.client.InferenceJob, submitted_inference_job + ) + + # 5. 
Download the model assets to a local file + if not skip_downloading: + os.makedirs(output_path, exist_ok=True) + for component_name, compile_job in compile_jobs.items(): + target_model: hub.Model = compile_job.get_target_model() # type: ignore + target_model.download( + str(output_path / f"{model_name}_{component_name}.tflite") + ) + + # 6. Summarize the results from profiling and inference + if not skip_summary and not skip_profiling: + for component_name in components: + profile_job = profile_jobs[component_name] + assert profile_job is not None and profile_job.wait().success + profile_data: Dict[str, Any] = profile_job.download_profile() # type: ignore + print_profile_metrics_from_job(profile_job, profile_data) + + if not skip_summary and not skip_inferencing: + for component_name in components: + inference_job = inference_jobs[component_name] + sample_inputs = components_dict[component_name].sample_inputs() + torch_out = torch_inference(components_dict[component_name], sample_inputs) + assert inference_job is not None and inference_job.wait().success + inference_result: hub.client.DatasetEntries = inference_job.download_output_data() # type: ignore + print_inference_metrics(inference_job, inference_result, torch_out) + + return { + component_name: ( + compile_jobs[component_name], + profile_jobs.get(component_name, None), + inference_jobs.get(component_name, None), + ) + for component_name in components + } + + +def main(): + warnings.filterwarnings("ignore") + parser = export_parser( + model_cls=Model, components=ALL_COMPONENTS, supports_qnn=False + ) + args = parser.parse_args() + export_model(**vars(args)) + + +if __name__ == "__main__": + main() diff --git a/qai_hub_models/models/whisper_small_en/info.yaml b/qai_hub_models/models/whisper_small_en/info.yaml new file mode 100644 index 00000000..7227fbbb --- /dev/null +++ b/qai_hub_models/models/whisper_small_en/info.yaml @@ -0,0 +1,40 @@ +name: Whisper-Small-En +# id must match with the model dir name in qai_hub_models +id: whisper_small_en +status: public +headline: Automatic speech recognition (ASR) model for English transcription as well + as translation. +domain: Audio +description: OpenAI’s Whisper ASR (Automatic Speech Recognition) model is a state-of-the-art system designed for transcribing spoken language into written text. It exhibits robust performance in realistic, noisy environments, making it highly reliable for real-world applications. Specifically, it excels in long-form transcription, capable of accurately transcribing audio clips up to 30 seconds long. Time to the first token is the encoder's latency, while time to each additional token is decoder's latency, where we assume a mean decoded length specified below. 
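Besides the `python -m qai_hub_models.models.whisper_small_en.export` CLI shown in the README, the `export_model` entry point defined above can be driven programmatically. A minimal sketch, assuming Qualcomm AI Hub access is configured; the device string and component selection below are just example values taken from the defaults documented above:

```python
from qai_hub_models.models.whisper_small_en.export import export_model

# Compile and profile only the decoder, skipping on-device inference and download.
jobs = export_model(
    device="Samsung Galaxy S23",
    components=["WhisperDecoder"],
    skip_inferencing=True,
    skip_downloading=True,
)

# Without Hub access, a list of strings is returned instead of job objects.
if isinstance(jobs, dict):
    compile_job, profile_job, inference_job = jobs["WhisperDecoder"]
    print(compile_job, profile_job, inference_job)  # inference_job is None here
```

The return value mirrors the docstring above: a mapping from component name to its compile, profile, and inference jobs, with `None` for any skipped stage.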
+use_case: Speech Recognition +tags: + - foundation +research_paper: https://cdn.openai.com/papers/whisper.pdf +research_paper_title: Robust Speech Recognition via Large-Scale Weak Supervision +license: https://github.com/openai/whisper/blob/main/LICENSE +deploy_license: https://qaihub-public-assets.s3.us-west-2.amazonaws.com/qai-hub-models/Qualcomm+AI+Hub+Proprietary+License.pdf +source_repo: https://github.com/openai/whisper/tree/main +technical_details: + Model checkpoint: small.en + Input resolution: 80x3000 (30 seconds audio) + Mean decoded sequence length: 112 tokens + Number of parameters (WhisperEncoder): 102M + Model size (WhisperEncoder): 390 MB + Number of parameters (WhisperDecoder): 139M + Model size (WhisperDecoder): 531 MB +applicable_scenarios: + - Smart Home + - Accessibility +related_models: + - whisper_tiny_en + - whisper_base_en + - huggingface_wavlm_base_plus +form_factors: + - Phone + - Tablet + - IoT +has_static_banner: yes +has_animated_banner: yes +license_type: mit +deploy_license_type: AI Model Hub License +dataset: [] diff --git a/qai_hub_models/models/whisper_small_en/model.py b/qai_hub_models/models/whisper_small_en/model.py new file mode 100644 index 00000000..54433dc6 --- /dev/null +++ b/qai_hub_models/models/whisper_small_en/model.py @@ -0,0 +1,16 @@ +# --------------------------------------------------------------------- +# Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. +# SPDX-License-Identifier: BSD-3-Clause +# --------------------------------------------------------------------- +from __future__ import annotations + +from qai_hub_models.models._shared.whisper.model import Whisper + +MODEL_ID = __name__.split(".")[-2] +WHISPER_VERSION = "small.en" + + +class WhisperSmallEn(Whisper): + @classmethod + def from_pretrained(cls): + return Whisper.from_pretrained(WHISPER_VERSION) diff --git a/qai_hub_models/models/whisper_small_en/perf.yaml b/qai_hub_models/models/whisper_small_en/perf.yaml new file mode 100644 index 00000000..549cec62 --- /dev/null +++ b/qai_hub_models/models/whisper_small_en/perf.yaml @@ -0,0 +1,186 @@ +aggregated: + supported_oses: + - Android + supported_devices: + - Google Pixel 3 + - Google Pixel 3a + - Google Pixel 3a XL + - Google Pixel 4 + - Google Pixel 4a + - Google Pixel 5a 5G + - Samsung Galaxy S21 + - Samsung Galaxy S21 Ultra + - Samsung Galaxy S21+ + - Samsung Galaxy S22 5G + - Samsung Galaxy S22 Ultra 5G + - Samsung Galaxy S22+ 5G + - Samsung Galaxy S23 + - Samsung Galaxy S23 Ultra + - Samsung Galaxy S23+ + - Samsung Galaxy S24 + - Samsung Galaxy S24 Ultra + - Samsung Galaxy Tab S8 + - Xiaomi 12 + - Xiaomi 12 Pro + supported_chipsets: + - Snapdragon® 8 Gen 1 + - Snapdragon® 8 Gen 2 + - Snapdragon® 8 Gen 3 + - Snapdragon® 888 +models: +- name: WhisperEncoder + performance_metrics: + - torchscript_onnx_tflite: + inference_time: 602022.0 + throughput: 1.6610688645929883 + estimated_peak_memory_range: + min: 12288 + max: 448965896 + primary_compute_unit: GPU + precision: fp16 + layer_info: + layers_on_npu: 0 + layers_on_gpu: 609 + layers_on_cpu: 0 + total_layers: 609 + job_id: jvgdw4k5j + job_status: Passed + reference_device_info: + name: Samsung Galaxy S23 + os: '13' + form_factor: Phone + os_name: Android + manufacturer: Samsung + chipset: Snapdragon® 8 Gen 2 + timestamp: '2024-03-14T23:29:10.773412Z' + torchscript_onnx_qnn: + inference_time: 'null' + throughput: 'null' + estimated_peak_memory_range: + min: 0 + max: 0 + primary_compute_unit: 'null' + precision: 'null' + layer_info: + layers_on_npu: 0 + 
layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 0 + job_id: '' + job_status: Skipped + - torchscript_onnx_tflite: + inference_time: 461601.0 + throughput: 2.1663731231084853 + estimated_peak_memory_range: + min: 14163968 + max: 46674320 + primary_compute_unit: GPU + precision: fp16 + layer_info: + layers_on_npu: 0 + layers_on_gpu: 609 + layers_on_cpu: 0 + total_layers: 609 + job_id: jnp101l5q + job_status: Passed + reference_device_info: + name: Samsung Galaxy S24 + os: '14' + form_factor: Phone + os_name: Android + manufacturer: Samsung + chipset: Snapdragon® 8 Gen 3 + timestamp: '2024-03-14T23:29:10.773421Z' + torchscript_onnx_qnn: + inference_time: 'null' + throughput: 'null' + estimated_peak_memory_range: + min: 0 + max: 0 + primary_compute_unit: 'null' + precision: 'null' + layer_info: + layers_on_npu: 0 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 0 + job_id: '' + job_status: Skipped +- name: WhisperDecoder + performance_metrics: + - torchscript_onnx_tflite: + inference_time: 45479.0 + throughput: 21.988170364343983 + estimated_peak_memory_range: + min: 8577024 + max: 12019040 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 853 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 853 + job_id: jz5wozjp1 + job_status: Passed + reference_device_info: + name: Samsung Galaxy S23 + os: '13' + form_factor: Phone + os_name: Android + manufacturer: Samsung + chipset: Snapdragon® 8 Gen 2 + timestamp: '2024-03-14T23:33:07.115194Z' + torchscript_onnx_qnn: + inference_time: 'null' + throughput: 'null' + estimated_peak_memory_range: + min: 0 + max: 0 + primary_compute_unit: 'null' + precision: 'null' + layer_info: + layers_on_npu: 0 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 0 + job_id: '' + job_status: Skipped + - torchscript_onnx_tflite: + inference_time: 'null' + throughput: 'null' + estimated_peak_memory_range: + min: 0 + max: 0 + primary_compute_unit: 'null' + precision: 'null' + layer_info: + layers_on_npu: 0 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 0 + job_id: jvgdw4l5j + job_status: Failed + reference_device_info: + name: Samsung Galaxy S24 + os: '14' + form_factor: Phone + os_name: Android + manufacturer: Samsung + chipset: Snapdragon® 8 Gen 3 + timestamp: '2024-03-14T23:33:07.115203Z' + torchscript_onnx_qnn: + inference_time: 'null' + throughput: 'null' + estimated_peak_memory_range: + min: 0 + max: 0 + primary_compute_unit: 'null' + precision: 'null' + layer_info: + layers_on_npu: 0 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 0 + job_id: '' + job_status: Skipped diff --git a/qai_hub_models/models/whisper_small_en/requirements.txt b/qai_hub_models/models/whisper_small_en/requirements.txt new file mode 100644 index 00000000..fa34d4f8 --- /dev/null +++ b/qai_hub_models/models/whisper_small_en/requirements.txt @@ -0,0 +1,2 @@ +openai-whisper==20230314 +scipy==1.8.1 diff --git a/qai_hub_models/models/whisper_small_en/test.py b/qai_hub_models/models/whisper_small_en/test.py new file mode 100644 index 00000000..aeb74e53 --- /dev/null +++ b/qai_hub_models/models/whisper_small_en/test.py @@ -0,0 +1,22 @@ +# --------------------------------------------------------------------- +# Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. 
+# SPDX-License-Identifier: BSD-3-Clause +# --------------------------------------------------------------------- +from qai_hub_models.models._shared.whisper.test_utils import ( + run_test_transcribe, + run_test_wrapper_numerics, +) +from qai_hub_models.models.whisper_tiny_en.demo import main as demo_main +from qai_hub_models.models.whisper_tiny_en.model import WHISPER_VERSION + + +def test_numerics(): + run_test_wrapper_numerics(WHISPER_VERSION) + + +def test_transcribe(): + run_test_transcribe(WHISPER_VERSION) + + +def test_demo(): + demo_main() diff --git a/qai_hub_models/models/whisper_small_multi/code-gen.yaml b/qai_hub_models/models/whisper_small_multi/code-gen.yaml new file mode 100644 index 00000000..39d2d995 --- /dev/null +++ b/qai_hub_models/models/whisper_small_multi/code-gen.yaml @@ -0,0 +1,4 @@ +components: + WhisperEncoder: model.encoder + WhisperDecoder: model.decoder +qnn_export_failure_reason: "Compilation fails https://dev.aihub.qualcomm.com/jobs/jegnklrvg/ https://dev.aihub.qualcomm.com/jobs/joprw81v5 " diff --git a/qai_hub_models/models/whisper_small_multi/demo.py b/qai_hub_models/models/whisper_small_multi/demo.py new file mode 100644 index 00000000..0dbdb990 --- /dev/null +++ b/qai_hub_models/models/whisper_small_multi/demo.py @@ -0,0 +1,14 @@ +# --------------------------------------------------------------------- +# Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. +# SPDX-License-Identifier: BSD-3-Clause +# --------------------------------------------------------------------- +from qai_hub_models.models._shared.whisper.demo import whisper_demo +from qai_hub_models.models.whisper_small_multi.model import WhisperSmallMulti + + +def main(): + whisper_demo(WhisperSmallMulti) + + +if __name__ == "__main__": + main() diff --git a/qai_hub_models/models/whisper_tiny_en/README.md b/qai_hub_models/models/whisper_tiny_en/README.md new file mode 100644 index 00000000..f92501ec --- /dev/null +++ b/qai_hub_models/models/whisper_tiny_en/README.md @@ -0,0 +1,59 @@ +[![Qualcomm® AI Hub Models](https://qaihub-public-assets.s3.us-west-2.amazonaws.com/qai-hub-models/quic-logo.jpg)](../../README.md) + + +# [Whisper-Tiny-En: Automatic speech recognition (ASR) model for English transcription as well as translation](https://aihub.qualcomm.com/models/whisper_tiny_en) + +OpenAI’s Whisper ASR (Automatic Speech Recognition) model is a state-of-the-art system designed for transcribing spoken language into written text. It exhibits robust performance in realistic, noisy environments, making it highly reliable for real-world applications. Specifically, it excels in long-form transcription, capable of accurately transcribing audio clips up to 30 seconds long. Time to the first token is the encoder's latency, while time to each additional token is decoder's latency, where we assume a mean decoded length specified below. + +This is based on the implementation of Whisper-Tiny-En found +[here](https://github.com/openai/whisper/tree/main). This repository contains scripts for optimized on-device +export suitable to run on Qualcomm® devices. More details on model performance +accross various devices, can be found [here](https://aihub.qualcomm.com/models/whisper_tiny_en). + +[Sign up](https://myaccount.qualcomm.com/signup) for early access to run these models on +a hosted Qualcomm® device. 
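Under the hood the model is exported as two components, `WhisperEncoder` and `WhisperDecoder`, which are traced separately before compilation. A minimal sketch of that step, mirroring what the export script below does (for local experimentation only; `export.py` remains the supported path):

```python
import torch

from qai_hub_models.models.whisper_tiny_en import Model
from qai_hub_models.utils.input_spec import make_torch_inputs

# Load the pretrained tiny.en checkpoint.
model = Model.from_pretrained()

# Trace encoder and decoder separately, as the export script does.
for name, component in (("WhisperEncoder", model.encoder), ("WhisperDecoder", model.decoder)):
    input_spec = component.get_input_spec()
    traced = torch.jit.trace(component.to("cpu"), make_torch_inputs(input_spec))
    print(f"Traced {name}: {type(traced).__name__}")
```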
+ + +## Example & Usage + +Install the package via pip: +```bash +pip install "qai_hub_models[whisper_tiny_en]" +``` + + +Once installed, run the following simple CLI demo: + +```bash +python -m qai_hub_models.models.whisper_tiny_en.demo +``` +More details on the CLI tool can be found with the `--help` option. See +[demo.py](demo.py) for sample usage of the model including pre/post processing +scripts. Please refer to our [general instructions on using +models](../../../#getting-started) for more usage instructions. + +## Export for on-device deployment + +This repository contains export scripts that produce a model optimized for +on-device deployment. This can be run as follows: + +```bash +python -m qai_hub_models.models.whisper_tiny_en.export +``` +Additional options are documented with the `--help` option. Note that the above +script requires access to Deployment instructions for Qualcomm® AI Hub. + +## License +- The license for the original implementation of Whisper-Tiny-En can be found + [here](https://github.com/openai/whisper/blob/main/LICENSE). +- The license for the compiled assets for on-device deployment can be found [here]({deploy_license_url}) + +## References +* [Robust Speech Recognition via Large-Scale Weak Supervision](https://cdn.openai.com/papers/whisper.pdf) +* [Source Model Implementation](https://github.com/openai/whisper/tree/main) + +## Community +* Join [our AI Hub Slack community](https://join.slack.com/t/qualcomm-ai-hub/shared_invite/zt-2dgf95loi-CXHTDRR1rvPgQWPO~ZZZJg) to collaborate, post questions and learn more about on-device AI. +* For questions or feedback please [reach out to us](mailto:ai-hub-support@qti.qualcomm.com). + + diff --git a/qai_hub_models/models/whisper_tiny_en/__init__.py b/qai_hub_models/models/whisper_tiny_en/__init__.py new file mode 100644 index 00000000..d454567c --- /dev/null +++ b/qai_hub_models/models/whisper_tiny_en/__init__.py @@ -0,0 +1,8 @@ +# --------------------------------------------------------------------- +# Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. +# SPDX-License-Identifier: BSD-3-Clause +# --------------------------------------------------------------------- +from qai_hub_models.models._shared.whisper.app import WhisperApp as App # noqa: F401 + +from .model import MODEL_ID # noqa: F401 +from .model import WhisperTinyEn as Model # noqa: F401 diff --git a/qai_hub_models/models/whisper_tiny_en/conftest.py b/qai_hub_models/models/whisper_tiny_en/conftest.py new file mode 100644 index 00000000..fdfc63e0 --- /dev/null +++ b/qai_hub_models/models/whisper_tiny_en/conftest.py @@ -0,0 +1,24 @@ +# --------------------------------------------------------------------- +# Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. +# SPDX-License-Identifier: BSD-3-Clause +# --------------------------------------------------------------------- +# THIS FILE WAS AUTO-GENERATED. DO NOT EDIT MANUALLY. + +from unittest.mock import patch + +import pytest + +from qai_hub_models.models.whisper_tiny_en import Model + + +@pytest.fixture(autouse=True) +def mock_from_pretrained(): + """ + Model.from_pretrained() can be slow. Invoke it once and cache it so all invocations + across all tests return the cached instance of the model. 
+ """ + mock = patch( + "qai_hub_models.models.whisper_tiny_en.Model.from_pretrained", + return_value=Model.from_pretrained(), + ) + mock.start() diff --git a/qai_hub_models/models/whisper_tiny_en/demo.py b/qai_hub_models/models/whisper_tiny_en/demo.py new file mode 100644 index 00000000..073ab120 --- /dev/null +++ b/qai_hub_models/models/whisper_tiny_en/demo.py @@ -0,0 +1,14 @@ +# --------------------------------------------------------------------- +# Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. +# SPDX-License-Identifier: BSD-3-Clause +# --------------------------------------------------------------------- +from qai_hub_models.models._shared.whisper.demo import whisper_demo +from qai_hub_models.models.whisper_tiny_en.model import WhisperTinyEn + + +def main(): + whisper_demo(WhisperTinyEn) + + +if __name__ == "__main__": + main() diff --git a/qai_hub_models/models/whisper_tiny_en/export.py b/qai_hub_models/models/whisper_tiny_en/export.py new file mode 100644 index 00000000..57b7c76b --- /dev/null +++ b/qai_hub_models/models/whisper_tiny_en/export.py @@ -0,0 +1,229 @@ +# --------------------------------------------------------------------- +# Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. +# SPDX-License-Identifier: BSD-3-Clause +# --------------------------------------------------------------------- +# THIS FILE WAS AUTO-GENERATED. DO NOT EDIT MANUALLY. + + +from __future__ import annotations + +import os +import warnings +from pathlib import Path +from typing import Any, Dict, List, Mapping, Optional, Tuple, cast + +import qai_hub as hub +import torch + +from qai_hub_models.models.whisper_tiny_en import Model +from qai_hub_models.utils.args import export_parser, get_model_kwargs +from qai_hub_models.utils.base_model import BaseModel, TargetRuntime +from qai_hub_models.utils.compare import torch_inference +from qai_hub_models.utils.input_spec import make_torch_inputs +from qai_hub_models.utils.printing import ( + print_inference_metrics, + print_profile_metrics_from_job, +) +from qai_hub_models.utils.qai_hub_helpers import ( + can_access_qualcomm_ai_hub, + export_without_hub_access, +) + +ALL_COMPONENTS = ["WhisperEncoder", "WhisperDecoder"] + + +def export_model( + device: str = "Samsung Galaxy S23", + components: Optional[List[str]] = None, + skip_profiling: bool = False, + skip_inferencing: bool = False, + skip_downloading: bool = False, + skip_summary: bool = False, + output_dir: Optional[str] = None, + target_runtime: TargetRuntime = TargetRuntime.TFLITE, + compile_options: str = "", + profile_options: str = "", + **additional_model_kwargs, +) -> Mapping[ + str, Tuple[hub.CompileJob, Optional[hub.ProfileJob], Optional[hub.InferenceJob]] +] | List[str]: + """ + This function accomplishes 6 main tasks: + + 1. Instantiates a PyTorch model and converts it to a traced TorchScript format. + 2. Compiles the model to an asset that can be run on device. + 3. Profiles the model performance on real devices. + 4. Inferences the model on sample inputs. + 5. Downloads the model asset to the local directory. + 6. Summarizes the results from profiling and inference. + + Each of the last four steps can be optionally skipped using the input options. + + Parameters: + device: Device for which to export the model. + Full list of available devices can be found by running `hub.get_devices()`. + Defaults to DEFAULT_DEVICE if not specified. + components: List of sub-components of the model that will be exported. 
+ Each component is compiled and profiled separately. + Defaults to ALL_COMPONENTS if not specified. + skip_profiling: If set, skips profiling of compiled model on real devices. + skip_inferencing: If set, skips computing on-device outputs from sample data. + skip_downloading: If set, skips downloading of compiled model. + skip_summary: If set, skips waiting for and summarizing results + from profiling and inference. + output_dir: Directory to store generated assets (e.g. compiled model). + Defaults to `/build/`. + target_runtime: Which on-device runtime to target. Default is TFLite. + compile_options: Additional options to pass when submitting the compile job. + profile_options: Additional options to pass when submitting the profile job. + **additional_model_kwargs: Additional optional kwargs used to customize + `model_cls.from_pretrained` + + Returns: + A Mapping from component_name to a 3-tuple of: + * A CompileJob object containing metadata about the compile job submitted to hub. + * A ProfileJob containing metadata about the profile job (None if profiling skipped). + * An InferenceJob containing metadata about the inference job (None if inferencing skipped). + """ + model_name = "whisper_tiny_en" + output_path = Path(output_dir or Path.cwd() / "build" / model_name) + component_arg = components + components = components or ALL_COMPONENTS + for component_name in components: + if component_name not in ALL_COMPONENTS: + raise ValueError(f"Invalid component {component_name}.") + if not can_access_qualcomm_ai_hub(): + return export_without_hub_access( + "whisper_tiny_en", + "Whisper-Tiny-En", + device, + skip_profiling, + skip_inferencing, + skip_downloading, + skip_summary, + output_path, + target_runtime, + compile_options, + profile_options, + component_arg, + ) + + # 1. Initialize PyTorch model + model = Model.from_pretrained(**get_model_kwargs(Model, additional_model_kwargs)) + components_dict: Dict[str, BaseModel] = {} + if "WhisperEncoder" in components: + components_dict["WhisperEncoder"] = model.encoder # type: ignore + if "WhisperDecoder" in components: + components_dict["WhisperDecoder"] = model.decoder # type: ignore + + compile_jobs: Dict[str, hub.client.CompileJob] = {} + for component_name, component in components_dict.items(): + # Trace the model + input_spec = component.get_input_spec() + source_model = torch.jit.trace( + component.to("cpu"), make_torch_inputs(input_spec) + ) + + # 2. Compile the models to an on-device asset + model_compile_options = component.get_hub_compile_options( + target_runtime, compile_options + ) + print(f"Optimizing model {component_name} to run on-device") + submitted_compile_job = hub.submit_compile_job( + model=source_model, + input_specs=input_spec, + device=hub.Device(device), + name=f"{model_name}_{component_name}", + options=model_compile_options, + ) + compile_jobs[component_name] = cast( + hub.client.CompileJob, submitted_compile_job + ) + + # 3. 
Profile the model assets on real devices + profile_jobs: Dict[str, hub.client.ProfileJob] = {} + if not skip_profiling: + for component_name in components: + profile_options_all = components_dict[ + component_name + ].get_hub_profile_options(target_runtime, profile_options) + print(f"Profiling model {component_name} on a hosted device.") + submitted_profile_job = hub.submit_profile_job( + model=compile_jobs[component_name].get_target_model(), + device=hub.Device(device), + name=f"{model_name}_{component_name}", + options=profile_options_all, + ) + profile_jobs[component_name] = cast( + hub.client.ProfileJob, submitted_profile_job + ) + + # 4. Run inference on-device with sample inputs + inference_jobs: Dict[str, hub.client.InferenceJob] = {} + if not skip_inferencing: + for component_name in components: + print( + f"Running inference for {component_name} on a hosted device with example inputs." + ) + profile_options_all = components_dict[ + component_name + ].get_hub_profile_options(target_runtime, profile_options) + sample_inputs = components_dict[component_name].sample_inputs() + submitted_inference_job = hub.submit_inference_job( + model=compile_jobs[component_name].get_target_model(), + inputs=sample_inputs, + device=hub.Device(device), + name=f"{model_name}_{component_name}", + options=profile_options_all, + ) + inference_jobs[component_name] = cast( + hub.client.InferenceJob, submitted_inference_job + ) + + # 5. Download the model assets to a local file + if not skip_downloading: + os.makedirs(output_path, exist_ok=True) + for component_name, compile_job in compile_jobs.items(): + target_model: hub.Model = compile_job.get_target_model() # type: ignore + target_model.download( + str(output_path / f"{model_name}_{component_name}.tflite") + ) + + # 6. 
Summarize the results from profiling and inference + if not skip_summary and not skip_profiling: + for component_name in components: + profile_job = profile_jobs[component_name] + assert profile_job is not None and profile_job.wait().success + profile_data: Dict[str, Any] = profile_job.download_profile() # type: ignore + print_profile_metrics_from_job(profile_job, profile_data) + + if not skip_summary and not skip_inferencing: + for component_name in components: + inference_job = inference_jobs[component_name] + sample_inputs = components_dict[component_name].sample_inputs() + torch_out = torch_inference(components_dict[component_name], sample_inputs) + assert inference_job is not None and inference_job.wait().success + inference_result: hub.client.DatasetEntries = inference_job.download_output_data() # type: ignore + print_inference_metrics(inference_job, inference_result, torch_out) + + return { + component_name: ( + compile_jobs[component_name], + profile_jobs.get(component_name, None), + inference_jobs.get(component_name, None), + ) + for component_name in components + } + + +def main(): + warnings.filterwarnings("ignore") + parser = export_parser( + model_cls=Model, components=ALL_COMPONENTS, supports_qnn=False + ) + args = parser.parse_args() + export_model(**vars(args)) + + +if __name__ == "__main__": + main() diff --git a/qai_hub_models/models/whisper_tiny_en/info.yaml b/qai_hub_models/models/whisper_tiny_en/info.yaml new file mode 100644 index 00000000..4fb672ed --- /dev/null +++ b/qai_hub_models/models/whisper_tiny_en/info.yaml @@ -0,0 +1,40 @@ +name: Whisper-Tiny-En +# id must match with the model dir name in qai_hub_models +id: whisper_tiny_en +status: public +headline: Automatic speech recognition (ASR) model for English transcription + as well as translation. +domain: Audio +description: OpenAI’s Whisper ASR (Automatic Speech Recognition) model is a state-of-the-art system designed for transcribing spoken language into written text. It exhibits robust performance in realistic, noisy environments, making it highly reliable for real-world applications. Specifically, it excels in long-form transcription, capable of accurately transcribing audio clips up to 30 seconds long. Time to the first token is the encoder's latency, while time to each additional token is decoder's latency, where we assume a mean decoded length specified below. 
+use_case: Speech Recognition +tags: + - foundation +research_paper: https://cdn.openai.com/papers/whisper.pdf +research_paper_title: Robust Speech Recognition via Large-Scale Weak Supervision +license: https://github.com/openai/whisper/blob/main/LICENSE +deploy_license: https://qaihub-public-assets.s3.us-west-2.amazonaws.com/qai-hub-models/Qualcomm+AI+Hub+Proprietary+License.pdf +source_repo: https://github.com/openai/whisper/tree/main +technical_details: + Model checkpoint: tiny.en + Input resolution: 80x3000 (30 seconds audio) + Mean decoded sequence length: 112 tokens + Number of parameters (WhisperEncoder): 9.39M + Model size (WhisperEncoder): 35.9 MB + Number of parameters (WhisperDecoder): 28.2M + Model size (WhisperDecoder): 108 MB +applicable_scenarios: + - Smart Home + - Accessibility +related_models: + - whisper_base_en + - whisper_small_en + - huggingface_wavlm_base_plus +form_factors: + - Phone + - Tablet + - IoT +has_static_banner: yes +has_animated_banner: yes +license_type: mit +deploy_license_type: AI Model Hub License +dataset: [] diff --git a/qai_hub_models/models/whisper_tiny_en/model.py b/qai_hub_models/models/whisper_tiny_en/model.py new file mode 100644 index 00000000..b430fe61 --- /dev/null +++ b/qai_hub_models/models/whisper_tiny_en/model.py @@ -0,0 +1,16 @@ +# --------------------------------------------------------------------- +# Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. +# SPDX-License-Identifier: BSD-3-Clause +# --------------------------------------------------------------------- +from __future__ import annotations + +from qai_hub_models.models._shared.whisper.model import Whisper + +MODEL_ID = __name__.split(".")[-2] +WHISPER_VERSION = "tiny.en" + + +class WhisperTinyEn(Whisper): + @classmethod + def from_pretrained(cls): + return Whisper.from_pretrained(WHISPER_VERSION) diff --git a/qai_hub_models/models/whisper_tiny_en/perf.yaml b/qai_hub_models/models/whisper_tiny_en/perf.yaml new file mode 100644 index 00000000..a00f22a5 --- /dev/null +++ b/qai_hub_models/models/whisper_tiny_en/perf.yaml @@ -0,0 +1,186 @@ +aggregated: + supported_oses: + - Android + supported_devices: + - Google Pixel 3 + - Google Pixel 3a + - Google Pixel 3a XL + - Google Pixel 4 + - Google Pixel 4a + - Google Pixel 5a 5G + - Samsung Galaxy S21 + - Samsung Galaxy S21 Ultra + - Samsung Galaxy S21+ + - Samsung Galaxy S22 5G + - Samsung Galaxy S22 Ultra 5G + - Samsung Galaxy S22+ 5G + - Samsung Galaxy S23 + - Samsung Galaxy S23 Ultra + - Samsung Galaxy S23+ + - Samsung Galaxy S24 + - Samsung Galaxy S24 Ultra + - Samsung Galaxy Tab S8 + - Xiaomi 12 + - Xiaomi 12 Pro + supported_chipsets: + - Snapdragon® 8 Gen 1 + - Snapdragon® 8 Gen 2 + - Snapdragon® 8 Gen 3 + - Snapdragon® 888 +models: +- name: WhisperEncoder + performance_metrics: + - torchscript_onnx_tflite: + inference_time: 67350.0 + throughput: 14.847809948032666 + estimated_peak_memory_range: + min: 11608064 + max: 57976544 + primary_compute_unit: GPU + precision: fp16 + layer_info: + layers_on_npu: 0 + layers_on_gpu: 217 + layers_on_cpu: 0 + total_layers: 217 + job_id: jz57zx9p3 + job_status: Passed + reference_device_info: + name: Samsung Galaxy S23 + os: '13' + form_factor: Phone + os_name: Android + manufacturer: Samsung + chipset: Snapdragon® 8 Gen 2 + timestamp: '2024-03-15T00:03:16.946141Z' + torchscript_onnx_qnn: + inference_time: 'null' + throughput: 'null' + estimated_peak_memory_range: + min: 0 + max: 0 + primary_compute_unit: 'null' + precision: 'null' + layer_info: + layers_on_npu: 0 + 
layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 0 + job_id: '' + job_status: Skipped + - torchscript_onnx_tflite: + inference_time: 53449.0 + throughput: 18.709423936836984 + estimated_peak_memory_range: + min: 0 + max: 27656928 + primary_compute_unit: GPU + precision: fp16 + layer_info: + layers_on_npu: 0 + layers_on_gpu: 217 + layers_on_cpu: 0 + total_layers: 217 + job_id: jegn23qgo + job_status: Passed + reference_device_info: + name: Samsung Galaxy S24 + os: '14' + form_factor: Phone + os_name: Android + manufacturer: Samsung + chipset: Snapdragon® 8 Gen 3 + timestamp: '2024-03-15T00:03:16.946150Z' + torchscript_onnx_qnn: + inference_time: 'null' + throughput: 'null' + estimated_peak_memory_range: + min: 0 + max: 0 + primary_compute_unit: 'null' + precision: 'null' + layer_info: + layers_on_npu: 0 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 0 + job_id: '' + job_status: Skipped +- name: WhisperDecoder + performance_metrics: + - torchscript_onnx_tflite: + inference_time: 7423.0 + throughput: 134.71642193183348 + estimated_peak_memory_range: + min: 1634304 + max: 4170776 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 293 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 293 + job_id: jqp4qv1go + job_status: Passed + reference_device_info: + name: Samsung Galaxy S23 + os: '13' + form_factor: Phone + os_name: Android + manufacturer: Samsung + chipset: Snapdragon® 8 Gen 2 + timestamp: '2024-03-15T00:09:31.853789Z' + torchscript_onnx_qnn: + inference_time: 'null' + throughput: 'null' + estimated_peak_memory_range: + min: 0 + max: 0 + primary_compute_unit: 'null' + precision: 'null' + layer_info: + layers_on_npu: 0 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 0 + job_id: '' + job_status: Skipped + - torchscript_onnx_tflite: + inference_time: 5570.0 + throughput: 179.53321364452424 + estimated_peak_memory_range: + min: 466944 + max: 230273920 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 293 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 293 + job_id: joprke750 + job_status: Passed + reference_device_info: + name: Samsung Galaxy S24 + os: '14' + form_factor: Phone + os_name: Android + manufacturer: Samsung + chipset: Snapdragon® 8 Gen 3 + timestamp: '2024-03-15T00:09:31.853814Z' + torchscript_onnx_qnn: + inference_time: 'null' + throughput: 'null' + estimated_peak_memory_range: + min: 0 + max: 0 + primary_compute_unit: 'null' + precision: 'null' + layer_info: + layers_on_npu: 0 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 0 + job_id: '' + job_status: Skipped diff --git a/qai_hub_models/models/whisper_tiny_en/requirements.txt b/qai_hub_models/models/whisper_tiny_en/requirements.txt new file mode 100644 index 00000000..75b1cf12 --- /dev/null +++ b/qai_hub_models/models/whisper_tiny_en/requirements.txt @@ -0,0 +1,2 @@ +openai-whisper==20230314 +scipy diff --git a/qai_hub_models/models/whisper_tiny_en/test.py b/qai_hub_models/models/whisper_tiny_en/test.py new file mode 100644 index 00000000..aeb74e53 --- /dev/null +++ b/qai_hub_models/models/whisper_tiny_en/test.py @@ -0,0 +1,22 @@ +# --------------------------------------------------------------------- +# Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. 
+# SPDX-License-Identifier: BSD-3-Clause +# --------------------------------------------------------------------- +from qai_hub_models.models._shared.whisper.test_utils import ( + run_test_transcribe, + run_test_wrapper_numerics, +) +from qai_hub_models.models.whisper_tiny_en.demo import main as demo_main +from qai_hub_models.models.whisper_tiny_en.model import WHISPER_VERSION + + +def test_numerics(): + run_test_wrapper_numerics(WHISPER_VERSION) + + +def test_transcribe(): + run_test_transcribe(WHISPER_VERSION) + + +def test_demo(): + demo_main() diff --git a/qai_hub_models/models/wideresnet50/README.md b/qai_hub_models/models/wideresnet50/README.md index f8b69f8c..33f834c9 100644 --- a/qai_hub_models/models/wideresnet50/README.md +++ b/qai_hub_models/models/wideresnet50/README.md @@ -10,7 +10,7 @@ This is based on the implementation of WideResNet50 found export suitable to run on Qualcomm® devices. More details on model performance accross various devices, can be found [here](https://aihub.qualcomm.com/models/wideresnet50). -[Sign up](https://aihub.qualcomm.com/) for early access to run these models on +[Sign up](https://myaccount.qualcomm.com/signup) for early access to run these models on a hosted Qualcomm® device. @@ -25,7 +25,7 @@ python -m qai_hub_models.models.wideresnet50.demo More details on the CLI tool can be found with the `--help` option. See [demo.py](demo.py) for sample usage of the model including pre/post processing scripts. Please refer to our [general instructions on using -models](../../#qai-hub-models) for more usage instructions. +models](../../../#getting-started) for more usage instructions. ## Export for on-device deployment @@ -41,7 +41,7 @@ script requires access to Deployment instructions for Qualcomm® AI Hub. ## License - The license for the original implementation of WideResNet50 can be found [here](https://github.com/pytorch/vision/blob/main/LICENSE). -- The license for the compiled assets for on-device deployment can be found [here](https://qaihub-public-assets.s3.us-west-2.amazonaws.com/qai-hub-models/Qualcomm+AI+Hub+Proprietary+License.pdf). +- The license for the compiled assets for on-device deployment can be found [here]({deploy_license_url}) ## References * [Wide Residual Networks](https://arxiv.org/abs/1605.07146) diff --git a/qai_hub_models/models/wideresnet50/conftest.py b/qai_hub_models/models/wideresnet50/conftest.py new file mode 100644 index 00000000..c4c08578 --- /dev/null +++ b/qai_hub_models/models/wideresnet50/conftest.py @@ -0,0 +1,24 @@ +# --------------------------------------------------------------------- +# Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. +# SPDX-License-Identifier: BSD-3-Clause +# --------------------------------------------------------------------- +# THIS FILE WAS AUTO-GENERATED. DO NOT EDIT MANUALLY. + +from unittest.mock import patch + +import pytest + +from qai_hub_models.models.wideresnet50 import Model + + +@pytest.fixture(autouse=True) +def mock_from_pretrained(): + """ + Model.from_pretrained() can be slow. Invoke it once and cache it so all invocations + across all tests return the cached instance of the model. 
+ """ + mock = patch( + "qai_hub_models.models.wideresnet50.Model.from_pretrained", + return_value=Model.from_pretrained(), + ) + mock.start() diff --git a/qai_hub_models/models/wideresnet50/demo.py b/qai_hub_models/models/wideresnet50/demo.py index e0fc917e..04487876 100644 --- a/qai_hub_models/models/wideresnet50/demo.py +++ b/qai_hub_models/models/wideresnet50/demo.py @@ -3,11 +3,11 @@ # SPDX-License-Identifier: BSD-3-Clause # --------------------------------------------------------------------- from qai_hub_models.models._shared.imagenet_classifier.demo import imagenet_demo -from qai_hub_models.models.wideresnet50.model import WideResNet50 +from qai_hub_models.models.wideresnet50.model import MODEL_ID, WideResNet50 def main(is_test: bool = False): - imagenet_demo(WideResNet50, is_test) + imagenet_demo(WideResNet50, MODEL_ID, is_test) if __name__ == "__main__": diff --git a/qai_hub_models/models/wideresnet50/export.py b/qai_hub_models/models/wideresnet50/export.py index 023f1afc..c86de65a 100644 --- a/qai_hub_models/models/wideresnet50/export.py +++ b/qai_hub_models/models/wideresnet50/export.py @@ -10,7 +10,7 @@ import os import warnings from pathlib import Path -from typing import List, Optional, Tuple +from typing import Any, Dict, List, Optional, Tuple, cast import qai_hub as hub import torch @@ -109,35 +109,43 @@ def export_model( ) # Trace the model - source_model = torch.jit.trace(model, make_torch_inputs(input_spec)) + source_model = torch.jit.trace(model.to("cpu"), make_torch_inputs(input_spec)) # 2. Compile the model to an on-device asset model_compile_options = model.get_hub_compile_options( target_runtime, compile_options + " --force_channel_last_input image_tensor" ) - print(f"Optimizing model {model_name} to run on-device.") - compile_job = hub.submit_compile_job( + print(f"Optimizing model {model_name} to run on-device") + submitted_compile_job = hub.submit_compile_job( model=source_model, input_specs=input_spec, device=hub.Device(device), name=model_name, options=model_compile_options, ) + compile_job = cast(hub.client.CompileJob, submitted_compile_job) # 3. Profile the model asset on real devices - profile_job = None + profile_job: Optional[hub.client.ProfileJob] = None if not skip_profiling: + profile_options_all = model.get_hub_profile_options( + target_runtime, profile_options + ) print(f"Profiling model {model_name} on a hosted device.") - profile_job = hub.submit_profile_job( + submitted_profile_job = hub.submit_profile_job( model=compile_job.get_target_model(), device=hub.Device(device), name=model_name, - options=profile_options, + options=profile_options_all, ) + profile_job = cast(hub.client.ProfileJob, submitted_profile_job) # 4. Run inference on-device with sample inputs - inference_job = None + inference_job: Optional[hub.client.InferenceJob] = None if not skip_inferencing: + profile_options_all = model.get_hub_profile_options( + target_runtime, profile_options + ) print( f"Running inference for {model_name} on a hosted device with example inputs." ) @@ -146,33 +154,35 @@ def export_model( hub_inputs = transpose_channel_first_to_last( "image_tensor", sample_inputs, target_runtime ) - inference_job = hub.submit_inference_job( + submitted_inference_job = hub.submit_inference_job( model=compile_job.get_target_model(), inputs=hub_inputs, device=hub.Device(device), name=model_name, - options=profile_options, + options=profile_options_all, ) + inference_job = cast(hub.client.InferenceJob, submitted_inference_job) # 5. 
Download the model asset to a local file if not skip_downloading: os.makedirs(output_path, exist_ok=True) - target_model = compile_job.get_target_model() + target_model: hub.Model = compile_job.get_target_model() # type: ignore target_model.download(str(output_path / f"{model_name}.tflite")) # 6. Summarize the results from profiling and inference if not skip_summary and not skip_profiling: - assert profile_job.wait().success - profile_data = profile_job.download_profile() + assert profile_job is not None and profile_job.wait().success + profile_data: Dict[str, Any] = profile_job.download_profile() # type: ignore print_profile_metrics_from_job(profile_job, profile_data) if not skip_summary and not skip_inferencing: torch_out = torch_inference(model, sample_inputs) - assert inference_job.wait().success - inference_result = inference_job.download_output_data() + assert inference_job is not None and inference_job.wait().success + inference_result: hub.client.DatasetEntries = inference_job.download_output_data() # type: ignore print_inference_metrics(inference_job, inference_result, torch_out) - print_on_target_demo_cmd(compile_job, Path(__file__).parent.resolve(), device) + if not skip_summary: + print_on_target_demo_cmd(compile_job, Path(__file__).parent.resolve(), device) return (compile_job, profile_job, inference_job) diff --git a/qai_hub_models/models/wideresnet50/info.yaml b/qai_hub_models/models/wideresnet50/info.yaml index 3e10e96e..abeab0e0 100644 --- a/qai_hub_models/models/wideresnet50/info.yaml +++ b/qai_hub_models/models/wideresnet50/info.yaml @@ -13,6 +13,7 @@ tags: research_paper: https://arxiv.org/abs/1605.07146 research_paper_title: Wide Residual Networks license: https://github.com/pytorch/vision/blob/main/LICENSE +deploy_license: https://qaihub-public-assets.s3.us-west-2.amazonaws.com/qai-hub-models/Qualcomm+AI+Hub+Proprietary+License.pdf source_repo: https://github.com/pytorch/vision/blob/main/torchvision/models/resnet.py technical_details: Model checkpoint: Imagenet @@ -35,6 +36,7 @@ form_factors: has_static_banner: yes has_animated_banner: no license_type: bsd-3-clause +deploy_license_type: AI Model Hub License dataset: - imagenet-1k - imagenet-22k diff --git a/qai_hub_models/models/wideresnet50/model.py b/qai_hub_models/models/wideresnet50/model.py index f8d7d130..9e3358bd 100644 --- a/qai_hub_models/models/wideresnet50/model.py +++ b/qai_hub_models/models/wideresnet50/model.py @@ -14,6 +14,6 @@ class WideResNet50(ImagenetClassifier): @classmethod - def from_pretrained(cls, weights: str = DEFAULT_WEIGHTS) -> ImagenetClassifier: + def from_pretrained(cls, weights: str = DEFAULT_WEIGHTS) -> WideResNet50: net = tv_models.wide_resnet50_2(weights=weights) return cls(net) diff --git a/qai_hub_models/models/wideresnet50/perf.yaml b/qai_hub_models/models/wideresnet50/perf.yaml index 9c9625ba..6dbe8a6c 100644 --- a/qai_hub_models/models/wideresnet50/perf.yaml +++ b/qai_hub_models/models/wideresnet50/perf.yaml @@ -17,22 +17,25 @@ aggregated: - Samsung Galaxy S23 - Samsung Galaxy S23 Ultra - Samsung Galaxy S23+ + - Samsung Galaxy S24 + - Samsung Galaxy S24 Ultra - Samsung Galaxy Tab S8 - Xiaomi 12 - Xiaomi 12 Pro supported_chipsets: - Snapdragon® 8 Gen 1 - Snapdragon® 8 Gen 2 + - Snapdragon® 8 Gen 3 - Snapdragon® 888 models: - name: WideResNet50 performance_metrics: - torchscript_onnx_tflite: - inference_time: 4393.0 - throughput: 227.6348736626451 + inference_time: 4401.0 + throughput: 227.22108611679164 estimated_peak_memory_range: - min: 24576 - max: 1816072 + min: 20480 + max: 
2132848 primary_compute_unit: NPU precision: fp16 layer_info: @@ -40,14 +43,22 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 77 - job_id: jz57el9rp + job_id: jwgoyr458 job_status: Passed + reference_device_info: + name: Samsung Galaxy S23 + os: '13' + form_factor: Phone + os_name: Android + manufacturer: Samsung + chipset: Snapdragon® 8 Gen 2 + timestamp: '2024-03-14T23:12:38.602998Z' torchscript_onnx_qnn: - inference_time: 4605.0 - throughput: 217.15526601520088 + inference_time: 4580.0 + throughput: 218.34061135371178 estimated_peak_memory_range: - min: 0 - max: 313348064 + min: 618496 + max: 323904968 primary_compute_unit: NPU precision: fp16 layer_info: @@ -55,13 +66,43 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 125 - job_id: jqp4yd3lp + job_id: j7gjx77pd + job_status: Passed + - torchscript_onnx_tflite: + inference_time: 3306.0 + throughput: 302.48033877797945 + estimated_peak_memory_range: + min: 16384 + max: 94385296 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 77 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 77 + job_id: j1pv3d75x job_status: Passed reference_device_info: - name: Samsung Galaxy S23 Ultra - os: '13' + name: Samsung Galaxy S24 + os: '14' form_factor: Phone os_name: Android manufacturer: Samsung - chipset: Snapdragon® 8 Gen 2 - timestamp: '2024-02-21T16:11:06.129828Z' + chipset: Snapdragon® 8 Gen 3 + timestamp: '2024-03-14T23:15:48.204812Z' + torchscript_onnx_qnn: + inference_time: 3413.0 + throughput: 292.99736302373276 + estimated_peak_memory_range: + min: 618496 + max: 52379088 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 125 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 125 + job_id: jlpe9z7gr + job_status: Passed diff --git a/qai_hub_models/models/wideresnet50/test.py b/qai_hub_models/models/wideresnet50/test.py index 09976a8e..a089afd8 100644 --- a/qai_hub_models/models/wideresnet50/test.py +++ b/qai_hub_models/models/wideresnet50/test.py @@ -2,6 +2,8 @@ # Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. # SPDX-License-Identifier: BSD-3-Clause # --------------------------------------------------------------------- +import pytest + from qai_hub_models.models._shared.imagenet_classifier.test_utils import ( run_imagenet_classifier_test, run_imagenet_classifier_trace_test, @@ -14,6 +16,7 @@ def test_task(): run_imagenet_classifier_test(WideResNet50.from_pretrained(), MODEL_ID) +@pytest.mark.trace def test_trace(): run_imagenet_classifier_trace_test(WideResNet50.from_pretrained()) diff --git a/qai_hub_models/models/wideresnet50_quantized/README.md b/qai_hub_models/models/wideresnet50_quantized/README.md index d8e51c8b..0f4c4c23 100644 --- a/qai_hub_models/models/wideresnet50_quantized/README.md +++ b/qai_hub_models/models/wideresnet50_quantized/README.md @@ -10,7 +10,7 @@ This is based on the implementation of WideResNet50-Quantized found export suitable to run on Qualcomm® devices. More details on model performance accross various devices, can be found [here](https://aihub.qualcomm.com/models/wideresnet50_quantized). -[Sign up](https://aihub.qualcomm.com/) for early access to run these models on +[Sign up](https://myaccount.qualcomm.com/signup) for early access to run these models on a hosted Qualcomm® device. @@ -25,7 +25,7 @@ python -m qai_hub_models.models.wideresnet50_quantized.demo More details on the CLI tool can be found with the `--help` option. 
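For example, the reworked export flow that this patch applies across the per-model `export.py` scripts can also be driven directly from Python. The sketch below is a hypothetical illustration, not part of the patch: the keyword names (`device`, `skip_profiling`, `skip_inferencing`, `skip_downloading`, `skip_summary`) are taken from the `export_model()` bodies in the diff, while the defaults of any parameters not shown here are assumed.

```python
# Hypothetical usage sketch (not part of this patch). It mirrors the export_model()
# flow in the export.py diffs: compile on Qualcomm AI Hub, optionally profile and
# run inference on a hosted device, then download the compiled asset.
from qai_hub_models.models.wideresnet50_quantized.export import export_model

compile_job, profile_job, inference_job = export_model(
    device="Samsung Galaxy S23",  # hosted device, as used in the perf.yaml entries
    skip_profiling=True,          # skip step 3 (profiling on a hosted device)
    skip_inferencing=True,        # skip step 4 (on-device inference with sample inputs)
    skip_downloading=False,       # still run step 5 and download <model_name>.tflite
    skip_summary=True,            # skip step 6 (metrics printout)
)

# Skipped stages are returned as None, so profile_job and inference_job are None here;
# compile_job is the qai_hub CompileJob whose compiled target model was downloaded.
```

Each model's `export.py` also exposes this flow through `main()`, which builds the same arguments from the command line via `export_parser`.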
See [demo.py](demo.py) for sample usage of the model including pre/post processing scripts. Please refer to our [general instructions on using -models](../../#qai-hub-models) for more usage instructions. +models](../../../#getting-started) for more usage instructions. ## Export for on-device deployment @@ -41,7 +41,7 @@ script requires access to Deployment instructions for Qualcomm® AI Hub. ## License - The license for the original implementation of WideResNet50-Quantized can be found [here](https://github.com/pytorch/vision/blob/main/LICENSE). -- The license for the compiled assets for on-device deployment can be found [here](https://qaihub-public-assets.s3.us-west-2.amazonaws.com/qai-hub-models/Qualcomm+AI+Hub+Proprietary+License.pdf). +- The license for the compiled assets for on-device deployment can be found [here]({deploy_license_url}) ## References * [Wide Residual Networks](https://arxiv.org/abs/1605.07146) diff --git a/qai_hub_models/models/wideresnet50_quantized/conftest.py b/qai_hub_models/models/wideresnet50_quantized/conftest.py new file mode 100644 index 00000000..bd7b0f33 --- /dev/null +++ b/qai_hub_models/models/wideresnet50_quantized/conftest.py @@ -0,0 +1,24 @@ +# --------------------------------------------------------------------- +# Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. +# SPDX-License-Identifier: BSD-3-Clause +# --------------------------------------------------------------------- +# THIS FILE WAS AUTO-GENERATED. DO NOT EDIT MANUALLY. + +from unittest.mock import patch + +import pytest + +from qai_hub_models.models.wideresnet50_quantized import Model + + +@pytest.fixture(autouse=True) +def mock_from_pretrained(): + """ + Model.from_pretrained() can be slow. Invoke it once and cache it so all invocations + across all tests return the cached instance of the model. 
+ """ + mock = patch( + "qai_hub_models.models.wideresnet50_quantized.Model.from_pretrained", + return_value=Model.from_pretrained(), + ) + mock.start() diff --git a/qai_hub_models/models/wideresnet50_quantized/demo.py b/qai_hub_models/models/wideresnet50_quantized/demo.py index c124c6fa..92feb1a1 100644 --- a/qai_hub_models/models/wideresnet50_quantized/demo.py +++ b/qai_hub_models/models/wideresnet50_quantized/demo.py @@ -3,11 +3,14 @@ # SPDX-License-Identifier: BSD-3-Clause # --------------------------------------------------------------------- from qai_hub_models.models._shared.imagenet_classifier.demo import imagenet_demo -from qai_hub_models.models.wideresnet50_quantized.model import WideResNet50Quantizable +from qai_hub_models.models.wideresnet50_quantized.model import ( + MODEL_ID, + WideResNet50Quantizable, +) def main(is_test: bool = False): - imagenet_demo(WideResNet50Quantizable, is_test) + imagenet_demo(WideResNet50Quantizable, MODEL_ID, is_test) if __name__ == "__main__": diff --git a/qai_hub_models/models/wideresnet50_quantized/export.py b/qai_hub_models/models/wideresnet50_quantized/export.py index 4bad440d..ad05928c 100644 --- a/qai_hub_models/models/wideresnet50_quantized/export.py +++ b/qai_hub_models/models/wideresnet50_quantized/export.py @@ -10,7 +10,7 @@ import os import warnings from pathlib import Path -from typing import List, Optional, Tuple +from typing import Any, Dict, List, Optional, Tuple, cast import qai_hub as hub @@ -120,8 +120,8 @@ def export_model( model_compile_options = model.get_hub_compile_options( target_runtime, compile_options + " --force_channel_last_input image_tensor" ) - print(f"Optimizing model {model_name} to run on-device.") - compile_job = hub.submit_compile_job( + print(f"Optimizing model {model_name} to run on-device") + submitted_compile_job = hub.submit_compile_job( model=source_model, input_specs=input_spec, device=hub.Device(device), @@ -129,21 +129,29 @@ def export_model( calibration_data=quant_calibration_data, options=model_compile_options, ) + compile_job = cast(hub.client.CompileJob, submitted_compile_job) # 3. Profile the model asset on real devices - profile_job = None + profile_job: Optional[hub.client.ProfileJob] = None if not skip_profiling: + profile_options_all = model.get_hub_profile_options( + target_runtime, profile_options + ) print(f"Profiling model {model_name} on a hosted device.") - profile_job = hub.submit_profile_job( + submitted_profile_job = hub.submit_profile_job( model=compile_job.get_target_model(), device=hub.Device(device), name=model_name, - options=profile_options, + options=profile_options_all, ) + profile_job = cast(hub.client.ProfileJob, submitted_profile_job) # 4. Run inference on-device with sample inputs - inference_job = None + inference_job: Optional[hub.client.InferenceJob] = None if not skip_inferencing: + profile_options_all = model.get_hub_profile_options( + target_runtime, profile_options + ) print( f"Running inference for {model_name} on a hosted device with example inputs." ) @@ -155,33 +163,35 @@ def export_model( hub_inputs = transpose_channel_first_to_last( "image_tensor", sample_inputs, target_runtime ) - inference_job = hub.submit_inference_job( + submitted_inference_job = hub.submit_inference_job( model=compile_job.get_target_model(), inputs=hub_inputs, device=hub.Device(device), name=model_name, - options=profile_options, + options=profile_options_all, ) + inference_job = cast(hub.client.InferenceJob, submitted_inference_job) # 5. 
Download the model asset to a local file if not skip_downloading: os.makedirs(output_path, exist_ok=True) - target_model = compile_job.get_target_model() + target_model: hub.Model = compile_job.get_target_model() # type: ignore target_model.download(str(output_path / f"{model_name}.tflite")) # 6. Summarize the results from profiling and inference if not skip_summary and not skip_profiling: - assert profile_job.wait().success - profile_data = profile_job.download_profile() + assert profile_job is not None and profile_job.wait().success + profile_data: Dict[str, Any] = profile_job.download_profile() # type: ignore print_profile_metrics_from_job(profile_job, profile_data) if not skip_summary and not skip_inferencing: torch_out = torch_inference(model, sample_inputs) - assert inference_job.wait().success - inference_result = inference_job.download_output_data() + assert inference_job is not None and inference_job.wait().success + inference_result: hub.client.DatasetEntries = inference_job.download_output_data() # type: ignore print_inference_metrics(inference_job, inference_result, torch_out) - print_on_target_demo_cmd(compile_job, Path(__file__).parent.resolve(), device) + if not skip_summary: + print_on_target_demo_cmd(compile_job, Path(__file__).parent.resolve(), device) return (compile_job, profile_job, inference_job) diff --git a/qai_hub_models/models/wideresnet50_quantized/info.yaml b/qai_hub_models/models/wideresnet50_quantized/info.yaml index e1b9b755..ec14612f 100644 --- a/qai_hub_models/models/wideresnet50_quantized/info.yaml +++ b/qai_hub_models/models/wideresnet50_quantized/info.yaml @@ -14,6 +14,7 @@ tags: research_paper: https://arxiv.org/abs/1605.07146 research_paper_title: Wide Residual Networks license: https://github.com/pytorch/vision/blob/main/LICENSE +deploy_license: https://qaihub-public-assets.s3.us-west-2.amazonaws.com/qai-hub-models/Qualcomm+AI+Hub+Proprietary+License.pdf source_repo: https://github.com/pytorch/vision/blob/main/torchvision/models/resnet.py technical_details: Model checkpoint: Imagenet @@ -36,6 +37,7 @@ form_factors: has_static_banner: yes has_animated_banner: no license_type: bsd-3-clause +deploy_license_type: AI Model Hub License dataset: - imagenet-1k - imagenet-22k diff --git a/qai_hub_models/models/wideresnet50_quantized/model.py b/qai_hub_models/models/wideresnet50_quantized/model.py index 86bdd679..9b1086a0 100644 --- a/qai_hub_models/models/wideresnet50_quantized/model.py +++ b/qai_hub_models/models/wideresnet50_quantized/model.py @@ -13,15 +13,20 @@ # isort: on import torch -from aimet_torch.cross_layer_equalization import equalize_model +from aimet_torch.cross_layer_equalization import ( + equalize_bn_folded_model, + fold_all_batch_norms, +) +from aimet_torch.model_preparer import prepare_model from aimet_torch.quantsim import QuantizationSimModel, load_encodings_to_sim from qai_hub_models.models.wideresnet50.model import WideResNet50 -from qai_hub_models.utils.aimet.config_loader import get_per_channel_aimet_config +from qai_hub_models.utils.aimet.config_loader import get_default_aimet_config from qai_hub_models.utils.asset_loaders import CachedWebModelAsset +from qai_hub_models.utils.base_model import SourceModelFormat, TargetRuntime MODEL_ID = __name__.split(".")[-2] -MODEL_ASSET_VERSION = 1 +MODEL_ASSET_VERSION = 2 DEFAULT_ENCODINGS = "wideresnet50_quantized_encodings.json" @@ -37,9 +42,15 @@ def __init__( ) -> None: WideResNet50.__init__(self, sim_model.model) AIMETQuantizableMixin.__init__( - self, sim_model, 
needs_onnx_direct_aimet_export=True + self, + sim_model, ) + def preferred_hub_source_model_format( + self, target_runtime: TargetRuntime + ) -> SourceModelFormat: + return SourceModelFormat.ONNX + @classmethod def from_pretrained( cls, @@ -53,16 +64,19 @@ def from_pretrained( else: Interprets as a filepath and loads the encodings stored there. """ model = WideResNet50.from_pretrained() - input_shape = model.get_input_spec()["image_tensor"][0] + input_shape = cls.get_input_spec()["image_tensor"][0] + model = prepare_model(model) + dummy_input = torch.rand(input_shape) - equalize_model(model, input_shape) + pairs = fold_all_batch_norms(model, input_shape, dummy_input) + equalize_bn_folded_model(model, input_shape, pairs, dummy_input) sim = QuantizationSimModel( - model.net, + model, quant_scheme="tf_enhanced", default_param_bw=8, default_output_bw=8, - config_file=get_per_channel_aimet_config(), - dummy_input=torch.rand(input_shape), + config_file=get_default_aimet_config(), + dummy_input=dummy_input, ) if aimet_encodings: @@ -74,3 +88,11 @@ def from_pretrained( sim.model.eval() return cls(sim) + + def get_hub_compile_options( + self, target_runtime: TargetRuntime, other_compile_options: str = "" + ) -> str: + compile_options = super().get_hub_compile_options( + target_runtime, other_compile_options + ) + return compile_options + " --quantize_full_type int8 --quantize_io" diff --git a/qai_hub_models/models/wideresnet50_quantized/perf.yaml b/qai_hub_models/models/wideresnet50_quantized/perf.yaml index b9121909..b597a1f7 100644 --- a/qai_hub_models/models/wideresnet50_quantized/perf.yaml +++ b/qai_hub_models/models/wideresnet50_quantized/perf.yaml @@ -17,51 +17,92 @@ aggregated: - Samsung Galaxy S23 - Samsung Galaxy S23 Ultra - Samsung Galaxy S23+ + - Samsung Galaxy S24 + - Samsung Galaxy S24 Ultra - Samsung Galaxy Tab S8 - Xiaomi 12 - Xiaomi 12 Pro supported_chipsets: - Snapdragon® 8 Gen 1 - Snapdragon® 8 Gen 2 + - Snapdragon® 8 Gen 3 - Snapdragon® 888 models: - name: WideResNet50-Quantized performance_metrics: - torchscript_onnx_tflite: - inference_time: 1833.0 - throughput: 545.5537370430987 + inference_time: 1767.0 + throughput: 565.9309564233164 estimated_peak_memory_range: - min: 28672 - max: 1710680 + min: 24576 + max: 1759936 primary_compute_unit: NPU - precision: fp16 + precision: int8 layer_info: - layers_on_npu: 80 + layers_on_npu: 78 layers_on_gpu: 0 layers_on_cpu: 0 - total_layers: 80 - job_id: jz5wl34jp + total_layers: 78 + job_id: jz5wo4zp1 job_status: Passed + reference_device_info: + name: Samsung Galaxy S23 + os: '13' + form_factor: Phone + os_name: Android + manufacturer: Samsung + chipset: Snapdragon® 8 Gen 2 + timestamp: '2024-03-14T23:12:41.207435Z' torchscript_onnx_qnn: - inference_time: 1756.0 - throughput: 569.4760820045558 + inference_time: 1707.0 + throughput: 585.8230814294083 + estimated_peak_memory_range: + min: 28672 + max: 479496224 + primary_compute_unit: NPU + precision: int8 + layer_info: + layers_on_npu: 76 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 76 + job_id: j0pxvxjg7 + job_status: Passed + - torchscript_onnx_tflite: + inference_time: 1322.0 + throughput: 756.4296520423601 estimated_peak_memory_range: - min: 520192 - max: 152789048 + min: 16384 + max: 54559456 primary_compute_unit: NPU - precision: fp16 + precision: int8 layer_info: layers_on_npu: 78 layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 78 - job_id: jmg9zydvp + job_id: jvgdw2k5j job_status: Passed reference_device_info: - name: Samsung Galaxy S23 Ultra - os: '13' + name: Samsung 
Galaxy S24 + os: '14' form_factor: Phone os_name: Android manufacturer: Samsung - chipset: Snapdragon® 8 Gen 2 - timestamp: '2024-02-21T16:11:48.964511Z' + chipset: Snapdragon® 8 Gen 3 + timestamp: '2024-03-14T23:17:15.136644Z' + torchscript_onnx_qnn: + inference_time: 1291.0 + throughput: 774.5933384972889 + estimated_peak_memory_range: + min: 167936 + max: 41865680 + primary_compute_unit: NPU + precision: int8 + layer_info: + layers_on_npu: 76 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 76 + job_id: jogkz4ygd + job_status: Passed diff --git a/qai_hub_models/models/wideresnet50_quantized/test.py b/qai_hub_models/models/wideresnet50_quantized/test.py index cb4bac98..fbe14f34 100644 --- a/qai_hub_models/models/wideresnet50_quantized/test.py +++ b/qai_hub_models/models/wideresnet50_quantized/test.py @@ -4,7 +4,6 @@ # --------------------------------------------------------------------- from qai_hub_models.models._shared.imagenet_classifier.test_utils import ( run_imagenet_classifier_test, - run_imagenet_classifier_trace_test, ) from qai_hub_models.models.wideresnet50_quantized.demo import main as demo_main from qai_hub_models.models.wideresnet50_quantized.model import ( @@ -26,16 +25,6 @@ def test_task(): ) -def test_trace(): - run_imagenet_classifier_trace_test( - WideResNet50Quantizable.from_pretrained(), - diff_tol=0.01, - rtol=0.02, - atol=0.2, - is_quantized=True, - ) - - def test_demo(): # Verify demo does not crash demo_main(is_test=True) diff --git a/qai_hub_models/models/xlsr/README.md b/qai_hub_models/models/xlsr/README.md index d8b89438..47d468ec 100644 --- a/qai_hub_models/models/xlsr/README.md +++ b/qai_hub_models/models/xlsr/README.md @@ -10,7 +10,7 @@ This is based on the implementation of XLSR found export suitable to run on Qualcomm® devices. More details on model performance accross various devices, can be found [here](https://aihub.qualcomm.com/models/xlsr). -[Sign up](https://aihub.qualcomm.com/) for early access to run these models on +[Sign up](https://myaccount.qualcomm.com/signup) for early access to run these models on a hosted Qualcomm® device. @@ -25,7 +25,7 @@ python -m qai_hub_models.models.xlsr.demo More details on the CLI tool can be found with the `--help` option. See [demo.py](demo.py) for sample usage of the model including pre/post processing scripts. Please refer to our [general instructions on using -models](../../#qai-hub-models) for more usage instructions. +models](../../../#getting-started) for more usage instructions. ## Export for on-device deployment @@ -41,7 +41,7 @@ script requires access to Deployment instructions for Qualcomm® AI Hub. ## License - The license for the original implementation of XLSR can be found [here](https://github.com/quic/aimet-model-zoo/blob/develop/LICENSE.pdf). -- The license for the compiled assets for on-device deployment can be found [here](https://qaihub-public-assets.s3.us-west-2.amazonaws.com/qai-hub-models/Qualcomm+AI+Hub+Proprietary+License.pdf). 
+- The license for the compiled assets for on-device deployment can be found [here]({deploy_license_url}) ## References * [Extremely Lightweight Quantization Robust Real-Time Single-Image Super Resolution for Mobile Devices](https://arxiv.org/abs/2105.10288) diff --git a/qai_hub_models/models/xlsr/conftest.py b/qai_hub_models/models/xlsr/conftest.py new file mode 100644 index 00000000..0b729d36 --- /dev/null +++ b/qai_hub_models/models/xlsr/conftest.py @@ -0,0 +1,26 @@ +# --------------------------------------------------------------------- +# Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. +# SPDX-License-Identifier: BSD-3-Clause +# --------------------------------------------------------------------- +# THIS FILE WAS AUTO-GENERATED. DO NOT EDIT MANUALLY. + +from unittest.mock import patch + +import pytest + +from qai_hub_models.models.xlsr import Model +from qai_hub_models.utils.testing import skip_clone_repo_check + + +@pytest.fixture(autouse=True) +@skip_clone_repo_check +def mock_from_pretrained(): + """ + Model.from_pretrained() can be slow. Invoke it once and cache it so all invocations + across all tests return the cached instance of the model. + """ + mock = patch( + "qai_hub_models.models.xlsr.Model.from_pretrained", + return_value=Model.from_pretrained(), + ) + mock.start() diff --git a/qai_hub_models/models/xlsr/demo.py b/qai_hub_models/models/xlsr/demo.py index 9d531a77..942a23f3 100644 --- a/qai_hub_models/models/xlsr/demo.py +++ b/qai_hub_models/models/xlsr/demo.py @@ -12,7 +12,7 @@ def main(is_test: bool = False): - super_resolution_demo(XLSR, IMAGE_ADDRESS, is_test) + super_resolution_demo(XLSR, MODEL_ID, IMAGE_ADDRESS, is_test) if __name__ == "__main__": diff --git a/qai_hub_models/models/xlsr/export.py b/qai_hub_models/models/xlsr/export.py index b760df96..c45d8d65 100644 --- a/qai_hub_models/models/xlsr/export.py +++ b/qai_hub_models/models/xlsr/export.py @@ -10,7 +10,7 @@ import os import warnings from pathlib import Path -from typing import List, Optional, Tuple +from typing import Any, Dict, List, Optional, Tuple, cast import qai_hub as hub import torch @@ -109,7 +109,7 @@ def export_model( ) # Trace the model - source_model = torch.jit.trace(model, make_torch_inputs(input_spec)) + source_model = torch.jit.trace(model.to("cpu"), make_torch_inputs(input_spec)) # 2. Compile the model to an on-device asset model_compile_options = model.get_hub_compile_options( @@ -118,29 +118,37 @@ def export_model( + " --force_channel_last_input image" + " --force_channel_last_output output_0", ) - print(f"Optimizing model {model_name} to run on-device.") - compile_job = hub.submit_compile_job( + print(f"Optimizing model {model_name} to run on-device") + submitted_compile_job = hub.submit_compile_job( model=source_model, input_specs=input_spec, device=hub.Device(device), name=model_name, options=model_compile_options, ) + compile_job = cast(hub.client.CompileJob, submitted_compile_job) # 3. 
Profile the model asset on real devices - profile_job = None + profile_job: Optional[hub.client.ProfileJob] = None if not skip_profiling: + profile_options_all = model.get_hub_profile_options( + target_runtime, profile_options + ) print(f"Profiling model {model_name} on a hosted device.") - profile_job = hub.submit_profile_job( + submitted_profile_job = hub.submit_profile_job( model=compile_job.get_target_model(), device=hub.Device(device), name=model_name, - options=profile_options, + options=profile_options_all, ) + profile_job = cast(hub.client.ProfileJob, submitted_profile_job) # 4. Run inference on-device with sample inputs - inference_job = None + inference_job: Optional[hub.client.InferenceJob] = None if not skip_inferencing: + profile_options_all = model.get_hub_profile_options( + target_runtime, profile_options + ) print( f"Running inference for {model_name} on a hosted device with example inputs." ) @@ -149,30 +157,31 @@ def export_model( hub_inputs = transpose_channel_first_to_last( "image", sample_inputs, target_runtime ) - inference_job = hub.submit_inference_job( + submitted_inference_job = hub.submit_inference_job( model=compile_job.get_target_model(), inputs=hub_inputs, device=hub.Device(device), name=model_name, - options=profile_options, + options=profile_options_all, ) + inference_job = cast(hub.client.InferenceJob, submitted_inference_job) # 5. Download the model asset to a local file if not skip_downloading: os.makedirs(output_path, exist_ok=True) - target_model = compile_job.get_target_model() + target_model: hub.Model = compile_job.get_target_model() # type: ignore target_model.download(str(output_path / f"{model_name}.tflite")) # 6. Summarize the results from profiling and inference if not skip_summary and not skip_profiling: - assert profile_job.wait().success - profile_data = profile_job.download_profile() + assert profile_job is not None and profile_job.wait().success + profile_data: Dict[str, Any] = profile_job.download_profile() # type: ignore print_profile_metrics_from_job(profile_job, profile_data) if not skip_summary and not skip_inferencing: torch_out = torch_inference(model, sample_inputs) - assert inference_job.wait().success - inference_result = inference_job.download_output_data() + assert inference_job is not None and inference_job.wait().success + inference_result: hub.client.DatasetEntries = inference_job.download_output_data() # type: ignore # Convert outputs from channel last to channel first inference_result = transpose_channel_last_to_first( "output_0", inference_result, target_runtime diff --git a/qai_hub_models/models/xlsr/info.yaml b/qai_hub_models/models/xlsr/info.yaml index 1751d48f..cec3ec6d 100644 --- a/qai_hub_models/models/xlsr/info.yaml +++ b/qai_hub_models/models/xlsr/info.yaml @@ -11,6 +11,7 @@ research_paper: https://arxiv.org/abs/2105.10288 research_paper_title: Extremely Lightweight Quantization Robust Real-Time Single-Image Super Resolution for Mobile Devices license: https://github.com/quic/aimet-model-zoo/blob/develop/LICENSE.pdf +deploy_license: https://qaihub-public-assets.s3.us-west-2.amazonaws.com/qai-hub-models/Qualcomm+AI+Hub+Proprietary+License.pdf source_repo: https://github.com/quic/aimet-model-zoo/tree/develop/aimet_zoo_torch/xlsr technical_details: Model checkpoint: xlsr_4x_checkpoint_float32 @@ -28,4 +29,5 @@ related_models: [esrgan, real_esrgan_general_x4v3] has_static_banner: yes has_animated_banner: yes license_type: other +deploy_license_type: AI Model Hub License dataset: [] diff --git 
a/qai_hub_models/models/xlsr/model.py b/qai_hub_models/models/xlsr/model.py index 5f4a2ffd..aaee1928 100644 --- a/qai_hub_models/models/xlsr/model.py +++ b/qai_hub_models/models/xlsr/model.py @@ -8,7 +8,8 @@ from qai_hub_models.evaluators.base_evaluators import BaseEvaluator from qai_hub_models.evaluators.superres_evaluator import SuperResolutionOutputEvaluator -from qai_hub_models.utils.asset_loaders import CachedWebModelAsset, SourceAsRoot +from qai_hub_models.utils.aimet.repo import aimet_zoo_as_root +from qai_hub_models.utils.asset_loaders import CachedWebModelAsset from qai_hub_models.utils.base_model import BaseModel from qai_hub_models.utils.input_spec import InputSpec @@ -81,12 +82,7 @@ def get_input_spec( def _load_xlsr_source_model() -> torch.nn.Module: # Load XLSR model from the source repository using the given weights. # Returns .utils.super_resolution.models.XLSRRelease - with SourceAsRoot( - XLSR_SOURCE_REPOSITORY, - XLSR_SOURCE_REPO_COMMIT, - MODEL_ID, - MODEL_ASSET_VERSION, - ): + with aimet_zoo_as_root(): # necessary import. `modeling.deeplab` comes from the XLSR repo. from aimet_zoo_torch.common.super_resolution.models import XLSRRelease diff --git a/qai_hub_models/models/xlsr/perf.yaml b/qai_hub_models/models/xlsr/perf.yaml index 91c2f707..3f267734 100644 --- a/qai_hub_models/models/xlsr/perf.yaml +++ b/qai_hub_models/models/xlsr/perf.yaml @@ -17,22 +17,25 @@ aggregated: - Samsung Galaxy S23 - Samsung Galaxy S23 Ultra - Samsung Galaxy S23+ + - Samsung Galaxy S24 + - Samsung Galaxy S24 Ultra - Samsung Galaxy Tab S8 - Xiaomi 12 - Xiaomi 12 Pro supported_chipsets: - Snapdragon® 8 Gen 1 - Snapdragon® 8 Gen 2 + - Snapdragon® 8 Gen 3 - Snapdragon® 888 models: - name: XLSR performance_metrics: - torchscript_onnx_tflite: - inference_time: 2523.0 - throughput: 396.3535473642489 + inference_time: 2508.0 + throughput: 398.72408293460927 estimated_peak_memory_range: - min: 24576 - max: 1686120 + min: 16384 + max: 9569248 primary_compute_unit: NPU precision: fp16 layer_info: @@ -40,14 +43,22 @@ models: layers_on_gpu: 0 layers_on_cpu: 3 total_layers: 16 - job_id: jogk2qlyg + job_id: jz57z6np3 job_status: Passed + reference_device_info: + name: Samsung Galaxy S23 + os: '13' + form_factor: Phone + os_name: Android + manufacturer: Samsung + chipset: Snapdragon® 8 Gen 2 + timestamp: '2024-03-15T00:14:58.320277Z' torchscript_onnx_qnn: - inference_time: 1068.0 - throughput: 936.3295880149813 + inference_time: 987.0 + throughput: 1013.1712259371834 estimated_peak_memory_range: - min: 217088 - max: 63076024 + min: 2121728 + max: 10203592 primary_compute_unit: NPU precision: fp16 layer_info: @@ -55,13 +66,43 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 22 - job_id: jn5qlr77p + job_id: j0pxvm8g7 + job_status: Passed + - torchscript_onnx_tflite: + inference_time: 1996.0 + throughput: 501.00200400801606 + estimated_peak_memory_range: + min: 16384 + max: 19879696 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 13 + layers_on_gpu: 0 + layers_on_cpu: 3 + total_layers: 16 + job_id: jqp4q82go job_status: Passed reference_device_info: - name: Samsung Galaxy S23 Ultra - os: '13' + name: Samsung Galaxy S24 + os: '14' form_factor: Phone os_name: Android manufacturer: Samsung - chipset: Snapdragon® 8 Gen 2 - timestamp: '2024-02-21T16:37:57.776098Z' + chipset: Snapdragon® 8 Gen 3 + timestamp: '2024-03-15T00:16:54.078428Z' + torchscript_onnx_qnn: + inference_time: 631.0 + throughput: 1584.7860538827258 + estimated_peak_memory_range: + min: 225280 + max: 18045792 + 
primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 22 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 22 + job_id: jegn2xjgo + job_status: Passed diff --git a/qai_hub_models/models/xlsr_quantized/README.md b/qai_hub_models/models/xlsr_quantized/README.md index 03bba459..106e48b4 100644 --- a/qai_hub_models/models/xlsr_quantized/README.md +++ b/qai_hub_models/models/xlsr_quantized/README.md @@ -10,7 +10,7 @@ This is based on the implementation of XLSR-Quantized found export suitable to run on Qualcomm® devices. More details on model performance accross various devices, can be found [here](https://aihub.qualcomm.com/models/xlsr_quantized). -[Sign up](https://aihub.qualcomm.com/) for early access to run these models on +[Sign up](https://myaccount.qualcomm.com/signup) for early access to run these models on a hosted Qualcomm® device. @@ -25,7 +25,7 @@ python -m qai_hub_models.models.xlsr_quantized.demo More details on the CLI tool can be found with the `--help` option. See [demo.py](demo.py) for sample usage of the model including pre/post processing scripts. Please refer to our [general instructions on using -models](../../#qai-hub-models) for more usage instructions. +models](../../../#getting-started) for more usage instructions. ## Export for on-device deployment @@ -41,7 +41,7 @@ script requires access to Deployment instructions for Qualcomm® AI Hub. ## License - The license for the original implementation of XLSR-Quantized can be found [here](https://github.com/quic/aimet-model-zoo/blob/develop/LICENSE.pdf). -- The license for the compiled assets for on-device deployment can be found [here](https://qaihub-public-assets.s3.us-west-2.amazonaws.com/qai-hub-models/Qualcomm+AI+Hub+Proprietary+License.pdf). +- The license for the compiled assets for on-device deployment can be found [here]({deploy_license_url}) ## References * [Extremely Lightweight Quantization Robust Real-Time Single-Image Super Resolution for Mobile Devices](https://arxiv.org/abs/2105.10288) diff --git a/qai_hub_models/models/xlsr_quantized/conftest.py b/qai_hub_models/models/xlsr_quantized/conftest.py new file mode 100644 index 00000000..c2a4915a --- /dev/null +++ b/qai_hub_models/models/xlsr_quantized/conftest.py @@ -0,0 +1,26 @@ +# --------------------------------------------------------------------- +# Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. +# SPDX-License-Identifier: BSD-3-Clause +# --------------------------------------------------------------------- +# THIS FILE WAS AUTO-GENERATED. DO NOT EDIT MANUALLY. + +from unittest.mock import patch + +import pytest + +from qai_hub_models.models.xlsr_quantized import Model +from qai_hub_models.utils.testing import skip_clone_repo_check + + +@pytest.fixture(autouse=True) +@skip_clone_repo_check +def mock_from_pretrained(): + """ + Model.from_pretrained() can be slow. Invoke it once and cache it so all invocations + across all tests return the cached instance of the model. 
+ """ + mock = patch( + "qai_hub_models.models.xlsr_quantized.Model.from_pretrained", + return_value=Model.from_pretrained(), + ) + mock.start() diff --git a/qai_hub_models/models/xlsr_quantized/demo.py b/qai_hub_models/models/xlsr_quantized/demo.py index fde391bd..af51277d 100644 --- a/qai_hub_models/models/xlsr_quantized/demo.py +++ b/qai_hub_models/models/xlsr_quantized/demo.py @@ -19,6 +19,7 @@ def main(is_test: bool = False): super_resolution_demo( XLSRQuantizable, + MODEL_ID, IMAGE_ADDRESS, is_test, available_target_runtimes=[TargetRuntime.TFLITE], diff --git a/qai_hub_models/models/xlsr_quantized/export.py b/qai_hub_models/models/xlsr_quantized/export.py index 9ae48155..d8b04a25 100644 --- a/qai_hub_models/models/xlsr_quantized/export.py +++ b/qai_hub_models/models/xlsr_quantized/export.py @@ -10,7 +10,7 @@ import os import warnings from pathlib import Path -from typing import List, Optional, Tuple +from typing import Any, Dict, List, Optional, Tuple, cast import qai_hub as hub @@ -123,8 +123,8 @@ def export_model( + " --force_channel_last_input image" + " --force_channel_last_output output_0", ) - print(f"Optimizing model {model_name} to run on-device.") - compile_job = hub.submit_compile_job( + print(f"Optimizing model {model_name} to run on-device") + submitted_compile_job = hub.submit_compile_job( model=source_model, input_specs=input_spec, device=hub.Device(device), @@ -132,21 +132,29 @@ def export_model( calibration_data=quant_calibration_data, options=model_compile_options, ) + compile_job = cast(hub.client.CompileJob, submitted_compile_job) # 3. Profile the model asset on real devices - profile_job = None + profile_job: Optional[hub.client.ProfileJob] = None if not skip_profiling: + profile_options_all = model.get_hub_profile_options( + target_runtime, profile_options + ) print(f"Profiling model {model_name} on a hosted device.") - profile_job = hub.submit_profile_job( + submitted_profile_job = hub.submit_profile_job( model=compile_job.get_target_model(), device=hub.Device(device), name=model_name, - options=profile_options, + options=profile_options_all, ) + profile_job = cast(hub.client.ProfileJob, submitted_profile_job) # 4. Run inference on-device with sample inputs - inference_job = None + inference_job: Optional[hub.client.InferenceJob] = None if not skip_inferencing: + profile_options_all = model.get_hub_profile_options( + target_runtime, profile_options + ) print( f"Running inference for {model_name} on a hosted device with example inputs." ) @@ -158,30 +166,31 @@ def export_model( hub_inputs = transpose_channel_first_to_last( "image", sample_inputs, target_runtime ) - inference_job = hub.submit_inference_job( + submitted_inference_job = hub.submit_inference_job( model=compile_job.get_target_model(), inputs=hub_inputs, device=hub.Device(device), name=model_name, - options=profile_options, + options=profile_options_all, ) + inference_job = cast(hub.client.InferenceJob, submitted_inference_job) # 5. Download the model asset to a local file if not skip_downloading: os.makedirs(output_path, exist_ok=True) - target_model = compile_job.get_target_model() + target_model: hub.Model = compile_job.get_target_model() # type: ignore target_model.download(str(output_path / f"{model_name}.tflite")) # 6. 
Summarize the results from profiling and inference if not skip_summary and not skip_profiling: - assert profile_job.wait().success - profile_data = profile_job.download_profile() + assert profile_job is not None and profile_job.wait().success + profile_data: Dict[str, Any] = profile_job.download_profile() # type: ignore print_profile_metrics_from_job(profile_job, profile_data) if not skip_summary and not skip_inferencing: torch_out = torch_inference(model, sample_inputs) - assert inference_job.wait().success - inference_result = inference_job.download_output_data() + assert inference_job is not None and inference_job.wait().success + inference_result: hub.client.DatasetEntries = inference_job.download_output_data() # type: ignore # Convert outputs from channel last to channel first inference_result = transpose_channel_last_to_first( "output_0", inference_result, target_runtime diff --git a/qai_hub_models/models/xlsr_quantized/info.yaml b/qai_hub_models/models/xlsr_quantized/info.yaml index 55059211..38920617 100644 --- a/qai_hub_models/models/xlsr_quantized/info.yaml +++ b/qai_hub_models/models/xlsr_quantized/info.yaml @@ -12,6 +12,7 @@ research_paper: https://arxiv.org/abs/2105.10288 research_paper_title: Extremely Lightweight Quantization Robust Real-Time Single-Image Super Resolution for Mobile Devices license: https://github.com/quic/aimet-model-zoo/blob/develop/LICENSE.pdf +deploy_license: https://qaihub-public-assets.s3.us-west-2.amazonaws.com/qai-hub-models/Qualcomm+AI+Hub+Proprietary+License.pdf source_repo: https://github.com/quic/aimet-model-zoo/tree/develop/aimet_zoo_torch/xlsr technical_details: Model checkpoint: xlsr_4x_checkpoint_w8a8 @@ -29,4 +30,5 @@ related_models: [esrgan, real_esrgan_general_x4v3, xlsr] has_static_banner: yes has_animated_banner: yes license_type: other +deploy_license_type: AI Model Hub License dataset: [] diff --git a/qai_hub_models/models/xlsr_quantized/perf.yaml b/qai_hub_models/models/xlsr_quantized/perf.yaml index e76f30ab..9bf56e2f 100644 --- a/qai_hub_models/models/xlsr_quantized/perf.yaml +++ b/qai_hub_models/models/xlsr_quantized/perf.yaml @@ -17,31 +17,42 @@ aggregated: - Samsung Galaxy S23 - Samsung Galaxy S23 Ultra - Samsung Galaxy S23+ + - Samsung Galaxy S24 + - Samsung Galaxy S24 Ultra - Samsung Galaxy Tab S8 - Xiaomi 12 - Xiaomi 12 Pro supported_chipsets: - Snapdragon® 8 Gen 1 - Snapdragon® 8 Gen 2 + - Snapdragon® 8 Gen 3 - Snapdragon® 888 models: - name: XLSR-Quantized performance_metrics: - torchscript_onnx_tflite: - inference_time: 1298.0 - throughput: 770.4160246533128 + inference_time: 1349.0 + throughput: 741.2898443291327 estimated_peak_memory_range: - min: 24576 - max: 1426056 + min: 28672 + max: 1726904 primary_compute_unit: NPU - precision: fp16 + precision: int8 layer_info: layers_on_npu: 16 layers_on_gpu: 0 layers_on_cpu: 3 total_layers: 19 - job_id: jo5m064yg + job_id: j1p3k3l52 job_status: Passed + reference_device_info: + name: Samsung Galaxy S23 + os: '13' + form_factor: Phone + os_name: Android + manufacturer: Samsung + chipset: Snapdragon® 8 Gen 2 + timestamp: '2024-03-14T23:58:09.460010Z' torchscript_onnx_qnn: inference_time: 'null' throughput: 'null' @@ -57,11 +68,41 @@ models: total_layers: 0 job_id: '' job_status: Skipped + - torchscript_onnx_tflite: + inference_time: 1084.0 + throughput: 922.509225092251 + estimated_peak_memory_range: + min: 20480 + max: 21010912 + primary_compute_unit: NPU + precision: int8 + layer_info: + layers_on_npu: 16 + layers_on_gpu: 0 + layers_on_cpu: 3 + total_layers: 19 + job_id: 
jwgoy0x58 + job_status: Passed reference_device_info: - name: Samsung Galaxy S23 Ultra - os: '13' + name: Samsung Galaxy S24 + os: '14' form_factor: Phone os_name: Android manufacturer: Samsung - chipset: Snapdragon® 8 Gen 2 - timestamp: '2024-02-21T16:31:32.010687Z' + chipset: Snapdragon® 8 Gen 3 + timestamp: '2024-03-14T23:58:09.460020Z' + torchscript_onnx_qnn: + inference_time: 'null' + throughput: 'null' + estimated_peak_memory_range: + min: 0 + max: 0 + primary_compute_unit: 'null' + precision: 'null' + layer_info: + layers_on_npu: 0 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 0 + job_id: '' + job_status: Skipped diff --git a/qai_hub_models/models/yolov6/README.md b/qai_hub_models/models/yolov6/README.md index 0b2e5623..3cf265f5 100644 --- a/qai_hub_models/models/yolov6/README.md +++ b/qai_hub_models/models/yolov6/README.md @@ -10,7 +10,7 @@ This is based on the implementation of Yolo-v6 found export suitable to run on Qualcomm® devices. More details on model performance accross various devices, can be found [here](https://aihub.qualcomm.com/models/yolov6). -[Sign up](https://aihub.qualcomm.com/) for early access to run these models on +[Sign up](https://myaccount.qualcomm.com/signup) for early access to run these models on a hosted Qualcomm® device. @@ -25,7 +25,7 @@ python -m qai_hub_models.models.yolov6.demo More details on the CLI tool can be found with the `--help` option. See [demo.py](demo.py) for sample usage of the model including pre/post processing scripts. Please refer to our [general instructions on using -models](../../#qai-hub-models) for more usage instructions. +models](../../../#getting-started) for more usage instructions. ## Export for on-device deployment @@ -41,7 +41,7 @@ script requires access to Deployment instructions for Qualcomm® AI Hub. ## License - The license for the original implementation of Yolo-v6 can be found [here](https://github.com/meituan/YOLOv6/blob/47625514e7480706a46ff3c0cd0252907ac12f22/LICENSE). -- The license for the compiled assets for on-device deployment can be found [here](https://qaihub-public-assets.s3.us-west-2.amazonaws.com/qai-hub-models/Qualcomm+AI+Hub+Proprietary+License.pdf). +- The license for the compiled assets for on-device deployment can be found [here]({deploy_license_url}) ## References * [YOLOv6: A Single-Stage Object Detection Framework for Industrial Applications](https://arxiv.org/abs/2209.02976) diff --git a/qai_hub_models/models/yolov6/conftest.py b/qai_hub_models/models/yolov6/conftest.py new file mode 100644 index 00000000..6055e321 --- /dev/null +++ b/qai_hub_models/models/yolov6/conftest.py @@ -0,0 +1,26 @@ +# --------------------------------------------------------------------- +# Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. +# SPDX-License-Identifier: BSD-3-Clause +# --------------------------------------------------------------------- +# THIS FILE WAS AUTO-GENERATED. DO NOT EDIT MANUALLY. + +from unittest.mock import patch + +import pytest + +from qai_hub_models.models.yolov6 import Model +from qai_hub_models.utils.testing import skip_clone_repo_check + + +@pytest.fixture(autouse=True) +@skip_clone_repo_check +def mock_from_pretrained(): + """ + Model.from_pretrained() can be slow. Invoke it once and cache it so all invocations + across all tests return the cached instance of the model. 
+ """ + mock = patch( + "qai_hub_models.models.yolov6.Model.from_pretrained", + return_value=Model.from_pretrained(), + ) + mock.start() diff --git a/qai_hub_models/models/yolov6/demo.py b/qai_hub_models/models/yolov6/demo.py index 9f2ba92b..b1d136c1 100644 --- a/qai_hub_models/models/yolov6/demo.py +++ b/qai_hub_models/models/yolov6/demo.py @@ -18,6 +18,7 @@ def main(is_test: bool = False): yolo_detection_demo( YoloV6, + MODEL_ID, YoloV6DetectionApp, IMAGE_ADDRESS, YoloV6.STRIDE_MULTIPLE, diff --git a/qai_hub_models/models/yolov6/export.py b/qai_hub_models/models/yolov6/export.py index a425cd58..1fd64724 100644 --- a/qai_hub_models/models/yolov6/export.py +++ b/qai_hub_models/models/yolov6/export.py @@ -10,7 +10,7 @@ import os import warnings from pathlib import Path -from typing import List, Optional, Tuple +from typing import Any, Dict, List, Optional, Tuple, cast import qai_hub as hub import torch @@ -109,35 +109,43 @@ def export_model( ) # Trace the model - source_model = torch.jit.trace(model, make_torch_inputs(input_spec)) + source_model = torch.jit.trace(model.to("cpu"), make_torch_inputs(input_spec)) # 2. Compile the model to an on-device asset model_compile_options = model.get_hub_compile_options( target_runtime, compile_options + " --force_channel_last_input image" ) - print(f"Optimizing model {model_name} to run on-device.") - compile_job = hub.submit_compile_job( + print(f"Optimizing model {model_name} to run on-device") + submitted_compile_job = hub.submit_compile_job( model=source_model, input_specs=input_spec, device=hub.Device(device), name=model_name, options=model_compile_options, ) + compile_job = cast(hub.client.CompileJob, submitted_compile_job) # 3. Profile the model asset on real devices - profile_job = None + profile_job: Optional[hub.client.ProfileJob] = None if not skip_profiling: + profile_options_all = model.get_hub_profile_options( + target_runtime, profile_options + ) print(f"Profiling model {model_name} on a hosted device.") - profile_job = hub.submit_profile_job( + submitted_profile_job = hub.submit_profile_job( model=compile_job.get_target_model(), device=hub.Device(device), name=model_name, - options=profile_options, + options=profile_options_all, ) + profile_job = cast(hub.client.ProfileJob, submitted_profile_job) # 4. Run inference on-device with sample inputs - inference_job = None + inference_job: Optional[hub.client.InferenceJob] = None if not skip_inferencing: + profile_options_all = model.get_hub_profile_options( + target_runtime, profile_options + ) print( f"Running inference for {model_name} on a hosted device with example inputs." ) @@ -146,35 +154,37 @@ def export_model( hub_inputs = transpose_channel_first_to_last( "image", sample_inputs, target_runtime ) - inference_job = hub.submit_inference_job( + submitted_inference_job = hub.submit_inference_job( model=compile_job.get_target_model(), inputs=hub_inputs, device=hub.Device(device), name=model_name, - options=profile_options, + options=profile_options_all, ) + inference_job = cast(hub.client.InferenceJob, submitted_inference_job) # 5. Download the model asset to a local file if not skip_downloading: os.makedirs(output_path, exist_ok=True) - target_model = compile_job.get_target_model() + target_model: hub.Model = compile_job.get_target_model() # type: ignore target_model.download(str(output_path / f"{model_name}.tflite")) # 6. 
Summarize the results from profiling and inference if not skip_summary and not skip_profiling: - assert profile_job.wait().success - profile_data = profile_job.download_profile() + assert profile_job is not None and profile_job.wait().success + profile_data: Dict[str, Any] = profile_job.download_profile() # type: ignore print_profile_metrics_from_job(profile_job, profile_data) if not skip_summary and not skip_inferencing: torch_out = torch_inference(model, sample_inputs) - assert inference_job.wait().success - inference_result = inference_job.download_output_data() + assert inference_job is not None and inference_job.wait().success + inference_result: hub.client.DatasetEntries = inference_job.download_output_data() # type: ignore print_inference_metrics( inference_job, inference_result, torch_out, outputs_to_skip=[2] ) - print_on_target_demo_cmd(compile_job, Path(__file__).parent.resolve(), device) + if not skip_summary: + print_on_target_demo_cmd(compile_job, Path(__file__).parent.resolve(), device) return (compile_job, profile_job, inference_job) diff --git a/qai_hub_models/models/yolov6/info.yaml b/qai_hub_models/models/yolov6/info.yaml index 9195123b..419842fd 100644 --- a/qai_hub_models/models/yolov6/info.yaml +++ b/qai_hub_models/models/yolov6/info.yaml @@ -14,6 +14,8 @@ research_paper_title: 'YOLOv6: A Single-Stage Object Detection Framework for Ind Applications' license: https://github.com/meituan/YOLOv6/blob/47625514e7480706a46ff3c0cd0252907ac12f22/LICENSE +deploy_license: + https://github.com/meituan/YOLOv6/blob/47625514e7480706a46ff3c0cd0252907ac12f22/LICENSE source_repo: https://github.com/meituan/YOLOv6/ technical_details: Model checkpoint: YoloV6-N @@ -35,4 +37,5 @@ form_factors: has_static_banner: yes has_animated_banner: yes license_type: gpl-3.0 +deploy_license_type: gpl-3.0 dataset: [] diff --git a/qai_hub_models/models/yolov6/perf.yaml b/qai_hub_models/models/yolov6/perf.yaml index 82e39b9b..93dc9a2a 100644 --- a/qai_hub_models/models/yolov6/perf.yaml +++ b/qai_hub_models/models/yolov6/perf.yaml @@ -17,22 +17,25 @@ aggregated: - Samsung Galaxy S23 - Samsung Galaxy S23 Ultra - Samsung Galaxy S23+ + - Samsung Galaxy S24 + - Samsung Galaxy S24 Ultra - Samsung Galaxy Tab S8 - Xiaomi 12 - Xiaomi 12 Pro supported_chipsets: - Snapdragon® 8 Gen 1 - Snapdragon® 8 Gen 2 + - Snapdragon® 8 Gen 3 - Snapdragon® 888 models: - name: Yolo-v6 performance_metrics: - torchscript_onnx_tflite: - inference_time: 7848.0 - throughput: 127.420998980632 + inference_time: 8480.0 + throughput: 117.9245283018868 estimated_peak_memory_range: - min: 32768 - max: 7233136 + min: 24576 + max: 3130456 primary_compute_unit: NPU precision: fp16 layer_info: @@ -40,14 +43,22 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 182 - job_id: jqpyoj4r5 + job_id: jz5wo0jp1 job_status: Passed + reference_device_info: + name: Samsung Galaxy S23 + os: '13' + form_factor: Phone + os_name: Android + manufacturer: Samsung + chipset: Snapdragon® 8 Gen 2 + timestamp: '2024-03-14T23:29:36.439969Z' torchscript_onnx_qnn: - inference_time: 7283.0 - throughput: 137.3060551970342 + inference_time: 7275.0 + throughput: 137.4570446735395 estimated_peak_memory_range: - min: 4931584 - max: 17461520 + min: 4939776 + max: 18286232 primary_compute_unit: NPU precision: fp16 layer_info: @@ -55,13 +66,43 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 230 - job_id: j2p0m212g + job_id: jnp10kl5q + job_status: Passed + - torchscript_onnx_tflite: + inference_time: 6051.0 + throughput: 165.26194017517767 + 
estimated_peak_memory_range: + min: 16384 + max: 74357488 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 182 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 182 + job_id: jmg9v7v57 job_status: Passed reference_device_info: - name: Samsung Galaxy S23 Ultra - os: '13' + name: Samsung Galaxy S24 + os: '14' form_factor: Phone os_name: Android manufacturer: Samsung - chipset: Snapdragon® 8 Gen 2 - timestamp: '2024-02-21T16:12:26.065342Z' + chipset: Snapdragon® 8 Gen 3 + timestamp: '2024-03-14T23:32:50.147901Z' + torchscript_onnx_qnn: + inference_time: 5175.0 + throughput: 193.23671497584542 + estimated_peak_memory_range: + min: 4931584 + max: 94425040 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 230 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 230 + job_id: jz57zmrp3 + job_status: Passed diff --git a/qai_hub_models/models/yolov6/test.py b/qai_hub_models/models/yolov6/test.py index d3d13d82..4239dec7 100644 --- a/qai_hub_models/models/yolov6/test.py +++ b/qai_hub_models/models/yolov6/test.py @@ -46,5 +46,6 @@ def test_task(): assert np.allclose(source_out_postprocessed[i], qaihm_out_postprocessed[i]) +@skip_clone_repo_check def test_demo(): demo_main(is_test=True) diff --git a/qai_hub_models/models/yolov7/README.md b/qai_hub_models/models/yolov7/README.md index 73cdec65..2db3ff95 100644 --- a/qai_hub_models/models/yolov7/README.md +++ b/qai_hub_models/models/yolov7/README.md @@ -10,7 +10,7 @@ This is based on the implementation of Yolo-v7 found export suitable to run on Qualcomm® devices. More details on model performance accross various devices, can be found [here](https://aihub.qualcomm.com/models/yolov7). -[Sign up](https://aihub.qualcomm.com/) for early access to run these models on +[Sign up](https://myaccount.qualcomm.com/signup) for early access to run these models on a hosted Qualcomm® device. @@ -30,7 +30,7 @@ python -m qai_hub_models.models.yolov7.demo More details on the CLI tool can be found with the `--help` option. See [demo.py](demo.py) for sample usage of the model including pre/post processing scripts. Please refer to our [general instructions on using -models](../../#qai-hub-models) for more usage instructions. +models](../../../#getting-started) for more usage instructions. ## Export for on-device deployment @@ -46,7 +46,7 @@ script requires access to Deployment instructions for Qualcomm® AI Hub. ## License - The license for the original implementation of Yolo-v7 can be found [here](https://github.com/WongKinYiu/yolov7/blob/main/LICENSE.md). -- The license for the compiled assets for on-device deployment can be found [here](https://qaihub-public-assets.s3.us-west-2.amazonaws.com/qai-hub-models/Qualcomm+AI+Hub+Proprietary+License.pdf). +- The license for the compiled assets for on-device deployment can be found [here]({deploy_license_url}) ## References * [YOLOv7: Trainable bag-of-freebies sets new state-of-the-art for real-time object detectors](https://arxiv.org/abs/2207.02696) diff --git a/qai_hub_models/models/yolov7/conftest.py b/qai_hub_models/models/yolov7/conftest.py new file mode 100644 index 00000000..d2efde67 --- /dev/null +++ b/qai_hub_models/models/yolov7/conftest.py @@ -0,0 +1,26 @@ +# --------------------------------------------------------------------- +# Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. +# SPDX-License-Identifier: BSD-3-Clause +# --------------------------------------------------------------------- +# THIS FILE WAS AUTO-GENERATED. 
DO NOT EDIT MANUALLY. + +from unittest.mock import patch + +import pytest + +from qai_hub_models.models.yolov7 import Model +from qai_hub_models.utils.testing import skip_clone_repo_check + + +@pytest.fixture(autouse=True) +@skip_clone_repo_check +def mock_from_pretrained(): + """ + Model.from_pretrained() can be slow. Invoke it once and cache it so all invocations + across all tests return the cached instance of the model. + """ + mock = patch( + "qai_hub_models.models.yolov7.Model.from_pretrained", + return_value=Model.from_pretrained(), + ) + mock.start() diff --git a/qai_hub_models/models/yolov7/demo.py b/qai_hub_models/models/yolov7/demo.py index 23b01552..65c4bff0 100644 --- a/qai_hub_models/models/yolov7/demo.py +++ b/qai_hub_models/models/yolov7/demo.py @@ -15,6 +15,7 @@ def main(is_test: bool = False): yolo_detection_demo( YoloV7, + MODEL_ID, YoloV7DetectionApp, IMAGE_ADDRESS, YoloV7.STRIDE_MULTIPLE, diff --git a/qai_hub_models/models/yolov7/export.py b/qai_hub_models/models/yolov7/export.py index e58ad576..4a68cc79 100644 --- a/qai_hub_models/models/yolov7/export.py +++ b/qai_hub_models/models/yolov7/export.py @@ -10,7 +10,7 @@ import os import warnings from pathlib import Path -from typing import List, Optional, Tuple +from typing import Any, Dict, List, Optional, Tuple, cast import qai_hub as hub import torch @@ -109,35 +109,43 @@ def export_model( ) # Trace the model - source_model = torch.jit.trace(model, make_torch_inputs(input_spec)) + source_model = torch.jit.trace(model.to("cpu"), make_torch_inputs(input_spec)) # 2. Compile the model to an on-device asset model_compile_options = model.get_hub_compile_options( target_runtime, compile_options + " --force_channel_last_input image" ) - print(f"Optimizing model {model_name} to run on-device.") - compile_job = hub.submit_compile_job( + print(f"Optimizing model {model_name} to run on-device") + submitted_compile_job = hub.submit_compile_job( model=source_model, input_specs=input_spec, device=hub.Device(device), name=model_name, options=model_compile_options, ) + compile_job = cast(hub.client.CompileJob, submitted_compile_job) # 3. Profile the model asset on real devices - profile_job = None + profile_job: Optional[hub.client.ProfileJob] = None if not skip_profiling: + profile_options_all = model.get_hub_profile_options( + target_runtime, profile_options + ) print(f"Profiling model {model_name} on a hosted device.") - profile_job = hub.submit_profile_job( + submitted_profile_job = hub.submit_profile_job( model=compile_job.get_target_model(), device=hub.Device(device), name=model_name, - options=profile_options, + options=profile_options_all, ) + profile_job = cast(hub.client.ProfileJob, submitted_profile_job) # 4. Run inference on-device with sample inputs - inference_job = None + inference_job: Optional[hub.client.InferenceJob] = None if not skip_inferencing: + profile_options_all = model.get_hub_profile_options( + target_runtime, profile_options + ) print( f"Running inference for {model_name} on a hosted device with example inputs." ) @@ -146,35 +154,37 @@ def export_model( hub_inputs = transpose_channel_first_to_last( "image", sample_inputs, target_runtime ) - inference_job = hub.submit_inference_job( + submitted_inference_job = hub.submit_inference_job( model=compile_job.get_target_model(), inputs=hub_inputs, device=hub.Device(device), name=model_name, - options=profile_options, + options=profile_options_all, ) + inference_job = cast(hub.client.InferenceJob, submitted_inference_job) # 5. 
Download the model asset to a local file if not skip_downloading: os.makedirs(output_path, exist_ok=True) - target_model = compile_job.get_target_model() + target_model: hub.Model = compile_job.get_target_model() # type: ignore target_model.download(str(output_path / f"{model_name}.tflite")) # 6. Summarize the results from profiling and inference if not skip_summary and not skip_profiling: - assert profile_job.wait().success - profile_data = profile_job.download_profile() + assert profile_job is not None and profile_job.wait().success + profile_data: Dict[str, Any] = profile_job.download_profile() # type: ignore print_profile_metrics_from_job(profile_job, profile_data) if not skip_summary and not skip_inferencing: torch_out = torch_inference(model, sample_inputs) - assert inference_job.wait().success - inference_result = inference_job.download_output_data() + assert inference_job is not None and inference_job.wait().success + inference_result: hub.client.DatasetEntries = inference_job.download_output_data() # type: ignore print_inference_metrics( inference_job, inference_result, torch_out, outputs_to_skip=[2] ) - print_on_target_demo_cmd(compile_job, Path(__file__).parent.resolve(), device) + if not skip_summary: + print_on_target_demo_cmd(compile_job, Path(__file__).parent.resolve(), device) return (compile_job, profile_job, inference_job) diff --git a/qai_hub_models/models/yolov7/info.yaml b/qai_hub_models/models/yolov7/info.yaml index 5cf326e6..88298456 100644 --- a/qai_hub_models/models/yolov7/info.yaml +++ b/qai_hub_models/models/yolov7/info.yaml @@ -13,6 +13,7 @@ research_paper: https://arxiv.org/abs/2207.02696 research_paper_title: 'YOLOv7: Trainable bag-of-freebies sets new state-of-the-art for real-time object detectors' license: https://github.com/WongKinYiu/yolov7/blob/main/LICENSE.md +deploy_license: https://github.com/WongKinYiu/yolov7/blob/main/LICENSE.md source_repo: https://github.com/WongKinYiu/yolov7/ technical_details: Model checkpoint: YoloV7 Tiny @@ -34,4 +35,5 @@ form_factors: has_static_banner: yes has_animated_banner: yes license_type: gpl-3.0 +deploy_license_type: gpl-3.0 dataset: [] diff --git a/qai_hub_models/models/yolov7/model.py b/qai_hub_models/models/yolov7/model.py index 63e56e72..f46bed59 100644 --- a/qai_hub_models/models/yolov7/model.py +++ b/qai_hub_models/models/yolov7/model.py @@ -13,8 +13,9 @@ detect_postprocess, yolo_sample_inputs, ) +from qai_hub_models.models.common import SampleInputsType from qai_hub_models.utils.asset_loaders import SourceAsRoot -from qai_hub_models.utils.base_model import BaseModel, InputsType +from qai_hub_models.utils.base_model import BaseModel from qai_hub_models.utils.input_spec import InputSpec YOLOV7_SOURCE_REPOSITORY = "https://github.com/WongKinYiu/yolov7" @@ -102,7 +103,7 @@ def get_input_spec( """ return {"image": ((batch_size, num_channels, height, width), "float32")} - def sample_inputs(self, input_spec: InputSpec | None = None) -> InputsType: + def sample_inputs(self, input_spec: InputSpec | None = None) -> SampleInputsType: if input_spec is not None and input_spec != YoloV7.get_input_spec(): raise ValueError("Sample input has a fixed size that cannot be changed") diff --git a/qai_hub_models/models/yolov7/perf.yaml b/qai_hub_models/models/yolov7/perf.yaml index 36dfb9a3..d9dc602f 100644 --- a/qai_hub_models/models/yolov7/perf.yaml +++ b/qai_hub_models/models/yolov7/perf.yaml @@ -17,22 +17,25 @@ aggregated: - Samsung Galaxy S23 - Samsung Galaxy S23 Ultra - Samsung Galaxy S23+ + - Samsung Galaxy S24 + - Samsung 
Galaxy S24 Ultra - Samsung Galaxy Tab S8 - Xiaomi 12 - Xiaomi 12 Pro supported_chipsets: - Snapdragon® 8 Gen 1 - Snapdragon® 8 Gen 2 + - Snapdragon® 8 Gen 3 - Snapdragon® 888 models: - name: Yolo-v7 performance_metrics: - torchscript_onnx_tflite: - inference_time: 22349.0 - throughput: 44.74473130788849 + inference_time: 24023.0 + throughput: 41.626774341256294 estimated_peak_memory_range: - min: 9764864 - max: 12574848 + min: 9568256 + max: 12076232 primary_compute_unit: NPU precision: fp16 layer_info: @@ -40,8 +43,16 @@ models: layers_on_gpu: 0 layers_on_cpu: 21 total_layers: 307 - job_id: jvgddqzlg + job_id: jqpye94gy job_status: Passed + reference_device_info: + name: Samsung Galaxy S23 + os: '13' + form_factor: Phone + os_name: Android + manufacturer: Samsung + chipset: Snapdragon® 8 Gen 2 + timestamp: '2024-03-14T23:22:25.772406Z' torchscript_onnx_qnn: inference_time: 'null' throughput: 'null' @@ -57,11 +68,41 @@ models: total_layers: 0 job_id: '' job_status: Skipped + - torchscript_onnx_tflite: + inference_time: 17674.0 + throughput: 56.580287427860135 + estimated_peak_memory_range: + min: 327680 + max: 113867968 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 286 + layers_on_gpu: 0 + layers_on_cpu: 21 + total_layers: 307 + job_id: j2p0ynegw + job_status: Passed reference_device_info: - name: Samsung Galaxy S23 Ultra - os: '13' + name: Samsung Galaxy S24 + os: '14' form_factor: Phone os_name: Android manufacturer: Samsung - chipset: Snapdragon® 8 Gen 2 - timestamp: '2024-02-21T16:10:34.471023Z' + chipset: Snapdragon® 8 Gen 3 + timestamp: '2024-03-14T23:22:25.772414Z' + torchscript_onnx_qnn: + inference_time: 'null' + throughput: 'null' + estimated_peak_memory_range: + min: 0 + max: 0 + primary_compute_unit: 'null' + precision: 'null' + layer_info: + layers_on_npu: 0 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 0 + job_id: '' + job_status: Skipped diff --git a/qai_hub_models/models/yolov7/requirements.txt b/qai_hub_models/models/yolov7/requirements.txt index 8e95168e..2978ccea 100644 --- a/qai_hub_models/models/yolov7/requirements.txt +++ b/qai_hub_models/models/yolov7/requirements.txt @@ -1,6 +1,3 @@ -matplotlib -opencv-python -PyYAML -requests -scipy -seaborn +matplotlib==3.7.4 +scipy==1.8.1 +seaborn==0.11.0 diff --git a/qai_hub_models/models/yolov7/test.py b/qai_hub_models/models/yolov7/test.py index b2d84594..b8f204e1 100644 --- a/qai_hub_models/models/yolov7/test.py +++ b/qai_hub_models/models/yolov7/test.py @@ -45,6 +45,7 @@ def test_task(): assert np.allclose(source_out_postprocessed[i], qaihm_out_postprocessed[i]) +@skip_clone_repo_check def test_yolov7_app(): image = load_image(IMAGE_ADDRESS) output_image = load_image(OUTPUT_IMAGE_ADDRESS).convert("RGB") @@ -52,5 +53,6 @@ def test_yolov7_app(): assert np.allclose(app.predict_boxes_from_image(image)[0], np.asarray(output_image)) +@skip_clone_repo_check def test_demo(): demo_main(is_test=True) diff --git a/qai_hub_models/models/yolov8_det/README.md b/qai_hub_models/models/yolov8_det/README.md index dbf1242c..91edcac5 100644 --- a/qai_hub_models/models/yolov8_det/README.md +++ b/qai_hub_models/models/yolov8_det/README.md @@ -1,16 +1,16 @@ [![Qualcomm® AI Hub Models](https://qaihub-public-assets.s3.us-west-2.amazonaws.com/qai-hub-models/quic-logo.jpg)](../../README.md) -# [Yolo-v8-Detection: Real-time object detection optimized for mobile and edge](https://aihub.qualcomm.com/models/yolov8_det) +# [YOLOv8-Detection: Real-time object detection optimized for mobile and edge by 
Ultralytics](https://aihub.qualcomm.com/models/yolov8_det) -YoloV8 is a machine learning model that predicts bounding boxes and classes of objects in an image. +Ultralytics YOLOv8 is a machine learning model that predicts bounding boxes and classes of objects in an image. -This is based on the implementation of Yolo-v8-Detection found +This is based on the implementation of YOLOv8-Detection found [here](https://github.com/ultralytics/ultralytics/tree/main/ultralytics/models/yolo/detect). This repository contains scripts for optimized on-device export suitable to run on Qualcomm® devices. More details on model performance accross various devices, can be found [here](https://aihub.qualcomm.com/models/yolov8_det). -[Sign up](https://aihub.qualcomm.com/) for early access to run these models on +[Sign up](https://myaccount.qualcomm.com/signup) for early access to run these models on a hosted Qualcomm® device. @@ -30,7 +30,7 @@ python -m qai_hub_models.models.yolov8_det.demo More details on the CLI tool can be found with the `--help` option. See [demo.py](demo.py) for sample usage of the model including pre/post processing scripts. Please refer to our [general instructions on using -models](../../#qai-hub-models) for more usage instructions. +models](../../../#getting-started) for more usage instructions. ## Export for on-device deployment @@ -44,9 +44,9 @@ Additional options are documented with the `--help` option. Note that the above script requires access to Deployment instructions for Qualcomm® AI Hub. ## License -- The license for the original implementation of Yolo-v8-Detection can be found +- The license for the original implementation of YOLOv8-Detection can be found [here](https://github.com/ultralytics/ultralytics/blob/main/LICENSE). -- The license for the compiled assets for on-device deployment can be found [here](https://qaihub-public-assets.s3.us-west-2.amazonaws.com/qai-hub-models/Qualcomm+AI+Hub+Proprietary+License.pdf). +- The license for the compiled assets for on-device deployment can be found [here]({deploy_license_url}) ## References * [Real-Time Flying Object Detection with YOLOv8](https://arxiv.org/abs/2305.09972) diff --git a/qai_hub_models/models/yolov8_det/conftest.py b/qai_hub_models/models/yolov8_det/conftest.py new file mode 100644 index 00000000..0e32c6fa --- /dev/null +++ b/qai_hub_models/models/yolov8_det/conftest.py @@ -0,0 +1,24 @@ +# --------------------------------------------------------------------- +# Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. +# SPDX-License-Identifier: BSD-3-Clause +# --------------------------------------------------------------------- +# THIS FILE WAS AUTO-GENERATED. DO NOT EDIT MANUALLY. + +from unittest.mock import patch + +import pytest + +from qai_hub_models.models.yolov8_det import Model + + +@pytest.fixture(autouse=True) +def mock_from_pretrained(): + """ + Model.from_pretrained() can be slow. Invoke it once and cache it so all invocations + across all tests return the cached instance of the model. 
+ """ + mock = patch( + "qai_hub_models.models.yolov8_det.Model.from_pretrained", + return_value=Model.from_pretrained(), + ) + mock.start() diff --git a/qai_hub_models/models/yolov8_det/demo.py b/qai_hub_models/models/yolov8_det/demo.py index 3e766a25..434a87f0 100644 --- a/qai_hub_models/models/yolov8_det/demo.py +++ b/qai_hub_models/models/yolov8_det/demo.py @@ -19,6 +19,7 @@ def main(is_test: bool = False): yolo_detection_demo( YoloV8Detector, + MODEL_ID, YoloV8DetectionApp, IMAGE_ADDRESS, is_test=is_test, diff --git a/qai_hub_models/models/yolov8_det/export.py b/qai_hub_models/models/yolov8_det/export.py index e26be3de..2b207ddb 100644 --- a/qai_hub_models/models/yolov8_det/export.py +++ b/qai_hub_models/models/yolov8_det/export.py @@ -10,7 +10,7 @@ import os import warnings from pathlib import Path -from typing import List, Optional, Tuple +from typing import Any, Dict, List, Optional, Tuple, cast import qai_hub as hub import torch @@ -90,7 +90,7 @@ def export_model( if not can_access_qualcomm_ai_hub(): return export_without_hub_access( "yolov8_det", - "Yolo-v8-Detection", + "YOLOv8-Detection", device, skip_profiling, skip_inferencing, @@ -110,36 +110,44 @@ def export_model( # Trace the model source_model = torch.jit.trace( - model, make_torch_inputs(input_spec), check_trace=False + model.to("cpu"), make_torch_inputs(input_spec), check_trace=False ) # 2. Compile the model to an on-device asset model_compile_options = model.get_hub_compile_options( target_runtime, compile_options + " --force_channel_last_input image" ) - print(f"Optimizing model {model_name} to run on-device.") - compile_job = hub.submit_compile_job( + print(f"Optimizing model {model_name} to run on-device") + submitted_compile_job = hub.submit_compile_job( model=source_model, input_specs=input_spec, device=hub.Device(device), name=model_name, options=model_compile_options, ) + compile_job = cast(hub.client.CompileJob, submitted_compile_job) # 3. Profile the model asset on real devices - profile_job = None + profile_job: Optional[hub.client.ProfileJob] = None if not skip_profiling: + profile_options_all = model.get_hub_profile_options( + target_runtime, profile_options + ) print(f"Profiling model {model_name} on a hosted device.") - profile_job = hub.submit_profile_job( + submitted_profile_job = hub.submit_profile_job( model=compile_job.get_target_model(), device=hub.Device(device), name=model_name, - options=profile_options, + options=profile_options_all, ) + profile_job = cast(hub.client.ProfileJob, submitted_profile_job) # 4. Run inference on-device with sample inputs - inference_job = None + inference_job: Optional[hub.client.InferenceJob] = None if not skip_inferencing: + profile_options_all = model.get_hub_profile_options( + target_runtime, profile_options + ) print( f"Running inference for {model_name} on a hosted device with example inputs." ) @@ -148,35 +156,37 @@ def export_model( hub_inputs = transpose_channel_first_to_last( "image", sample_inputs, target_runtime ) - inference_job = hub.submit_inference_job( + submitted_inference_job = hub.submit_inference_job( model=compile_job.get_target_model(), inputs=hub_inputs, device=hub.Device(device), name=model_name, - options=profile_options, + options=profile_options_all, ) + inference_job = cast(hub.client.InferenceJob, submitted_inference_job) # 5. 
Download the model asset to a local file if not skip_downloading: os.makedirs(output_path, exist_ok=True) - target_model = compile_job.get_target_model() + target_model: hub.Model = compile_job.get_target_model() # type: ignore target_model.download(str(output_path / f"{model_name}.tflite")) # 6. Summarize the results from profiling and inference if not skip_summary and not skip_profiling: - assert profile_job.wait().success - profile_data = profile_job.download_profile() + assert profile_job is not None and profile_job.wait().success + profile_data: Dict[str, Any] = profile_job.download_profile() # type: ignore print_profile_metrics_from_job(profile_job, profile_data) if not skip_summary and not skip_inferencing: torch_out = torch_inference(model, sample_inputs) - assert inference_job.wait().success - inference_result = inference_job.download_output_data() + assert inference_job is not None and inference_job.wait().success + inference_result: hub.client.DatasetEntries = inference_job.download_output_data() # type: ignore print_inference_metrics( inference_job, inference_result, torch_out, outputs_to_skip=[2] ) - print_on_target_demo_cmd(compile_job, Path(__file__).parent.resolve(), device) + if not skip_summary: + print_on_target_demo_cmd(compile_job, Path(__file__).parent.resolve(), device) return (compile_job, profile_job, inference_job) diff --git a/qai_hub_models/models/yolov8_det/info.yaml b/qai_hub_models/models/yolov8_det/info.yaml index bff88dc3..a987a227 100644 --- a/qai_hub_models/models/yolov8_det/info.yaml +++ b/qai_hub_models/models/yolov8_det/info.yaml @@ -1,21 +1,22 @@ -name: Yolo-v8-Detection +name: YOLOv8-Detection # id must match with the model dir name in qai_hub_models id: yolov8_det status: public -headline: Real-time object detection optimized for mobile and edge. +headline: Real-time object detection optimized for mobile and edge by Ultralytics. domain: Computer Vision use_case: Object Detection -description: YoloV8 is a machine learning model that predicts bounding boxes and classes +description: Ultralytics YOLOv8 is a machine learning model that predicts bounding boxes and classes of objects in an image. 
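A note on the export.py hunks repeated above for each YOLO variant: every job handle is now declared `Optional`, the freshly submitted job is narrowed with `typing.cast`, and each `.wait()`/download call is preceded by an explicit `None` check so the `skip_*` flags stay type-safe. A minimal sketch of that pattern follows; the helper names and the omitted `options` argument are illustrative only, while the `qai_hub` calls themselves (`submit_profile_job`, `get_target_model`, `wait`, `download_profile`) are the ones already used in the generated scripts.

```python
from typing import Any, Dict, Optional, cast

import qai_hub as hub


def profile_if_requested(
    compile_job: hub.client.CompileJob,
    device_name: str,
    model_name: str,
    skip_profiling: bool,
) -> Optional[hub.client.ProfileJob]:
    # The handle stays Optional so the profiling step can be skipped entirely.
    profile_job: Optional[hub.client.ProfileJob] = None
    if not skip_profiling:
        submitted_profile_job = hub.submit_profile_job(
            model=compile_job.get_target_model(),
            device=hub.Device(device_name),
            name=model_name,
        )
        # The submit call is typed loosely, so narrow the handle explicitly.
        profile_job = cast(hub.client.ProfileJob, submitted_profile_job)
    return profile_job


def summarize_profile(profile_job: Optional[hub.client.ProfileJob]) -> Dict[str, Any]:
    # Mirror the generated scripts: check for None before waiting on the job.
    assert profile_job is not None and profile_job.wait().success
    return profile_job.download_profile()  # type: ignore
```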
tags: - real-time research_paper: https://arxiv.org/abs/2305.09972 research_paper_title: Real-Time Flying Object Detection with YOLOv8 license: https://github.com/ultralytics/ultralytics/blob/main/LICENSE +deploy_license: https://github.com/ultralytics/ultralytics/blob/main/LICENSE source_repo: https://github.com/ultralytics/ultralytics/tree/main/ultralytics/models/yolo/detect technical_details: - Model checkpoint: YoloV8-N + Model checkpoint: YOLOv8-N Input resolution: 640x640 Number of parameters: 3.18M Model size: 12.2 MB @@ -34,4 +35,5 @@ form_factors: has_static_banner: yes has_animated_banner: yes license_type: agpl-3.0 +deploy_license_type: agpl-3.0 dataset: [] diff --git a/qai_hub_models/models/yolov8_det/perf.yaml b/qai_hub_models/models/yolov8_det/perf.yaml index bbaddd57..ecd1b9a2 100644 --- a/qai_hub_models/models/yolov8_det/perf.yaml +++ b/qai_hub_models/models/yolov8_det/perf.yaml @@ -17,22 +17,25 @@ aggregated: - Samsung Galaxy S23 - Samsung Galaxy S23 Ultra - Samsung Galaxy S23+ + - Samsung Galaxy S24 + - Samsung Galaxy S24 Ultra - Samsung Galaxy Tab S8 - Xiaomi 12 - Xiaomi 12 Pro supported_chipsets: - Snapdragon® 8 Gen 1 - Snapdragon® 8 Gen 2 + - Snapdragon® 8 Gen 3 - Snapdragon® 888 models: -- name: Yolo-v8-Detection +- name: YOLOv8-Detection performance_metrics: - torchscript_onnx_tflite: - inference_time: 9251.0 - throughput: 108.09642200843152 + inference_time: 9217.0 + throughput: 108.49517196484756 estimated_peak_memory_range: - min: 233472 - max: 2649168 + min: 262144 + max: 19308896 primary_compute_unit: NPU precision: fp16 layer_info: @@ -40,14 +43,22 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 300 - job_id: j7gjr2q8p + job_id: jo5mrw9gk job_status: Passed + reference_device_info: + name: Samsung Galaxy S23 + os: '13' + form_factor: Phone + os_name: Android + manufacturer: Samsung + chipset: Snapdragon® 8 Gen 2 + timestamp: '2024-03-14T23:28:16.047386Z' torchscript_onnx_qnn: - inference_time: 7043.0 - throughput: 141.9849495953429 + inference_time: 7039.0 + throughput: 142.06563432305725 estimated_peak_memory_range: - min: 4939776 - max: 19565584 + min: 4984832 + max: 18803744 primary_compute_unit: NPU precision: fp16 layer_info: @@ -55,13 +66,43 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 294 - job_id: jlpe7wy05 + job_id: joprk4750 + job_status: Passed + - torchscript_onnx_tflite: + inference_time: 6502.0 + throughput: 153.79883112888342 + estimated_peak_memory_range: + min: 24576 + max: 83870080 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 300 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 300 + job_id: jegn29qgo job_status: Passed reference_device_info: - name: Samsung Galaxy S23 Ultra - os: '13' + name: Samsung Galaxy S24 + os: '14' form_factor: Phone os_name: Android manufacturer: Samsung - chipset: Snapdragon® 8 Gen 2 - timestamp: '2024-02-21T16:08:50.678067Z' + chipset: Snapdragon® 8 Gen 3 + timestamp: '2024-03-14T23:30:19.085764Z' + torchscript_onnx_qnn: + inference_time: 4840.0 + throughput: 206.61157024793388 + estimated_peak_memory_range: + min: 4947968 + max: 123420640 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 294 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 294 + job_id: jep287qp6 + job_status: Passed diff --git a/qai_hub_models/models/yolov8_det/requirements.txt b/qai_hub_models/models/yolov8_det/requirements.txt index 5d6e5cf5..94980b0d 100644 --- a/qai_hub_models/models/yolov8_det/requirements.txt +++ 
b/qai_hub_models/models/yolov8_det/requirements.txt @@ -1 +1,3 @@ +seaborn==0.11.0 +thop==0.1.1.post2209072238 ultralytics==8.0.193 diff --git a/qai_hub_models/models/yolov8_det/test.py b/qai_hub_models/models/yolov8_det/test.py index 13614261..a98da871 100644 --- a/qai_hub_models/models/yolov8_det/test.py +++ b/qai_hub_models/models/yolov8_det/test.py @@ -17,7 +17,6 @@ ) from qai_hub_models.utils.asset_loaders import CachedWebModelAsset, load_image from qai_hub_models.utils.image_processing import preprocess_PIL_image -from qai_hub_models.utils.testing import skip_clone_repo_check OUTPUT_IMAGE_ADDRESS = CachedWebModelAsset.from_asset_store( MODEL_ID, MODEL_ASSET_VERSION, "test_images/output_image.png" @@ -25,7 +24,6 @@ WEIGHTS = "yolov8n.pt" -@skip_clone_repo_check def test_task(): """Verify that raw (numeric) outputs of both (QAIHM and non-qaihm) networks are the same.""" processed_sample_image = preprocess_PIL_image(load_image(IMAGE_ADDRESS)) diff --git a/qai_hub_models/models/yolov8_seg/README.md b/qai_hub_models/models/yolov8_seg/README.md index b3c82eec..f29edea1 100644 --- a/qai_hub_models/models/yolov8_seg/README.md +++ b/qai_hub_models/models/yolov8_seg/README.md @@ -1,16 +1,16 @@ [![Qualcomm® AI Hub Models](https://qaihub-public-assets.s3.us-west-2.amazonaws.com/qai-hub-models/quic-logo.jpg)](../../README.md) -# [Yolo-v8-Segmentation: Real-time object segmentation optimized for mobile and edge](https://aihub.qualcomm.com/models/yolov8_seg) +# [YOLOv8-Segmentation: Real-time object segmentation optimized for mobile and edge by Ultralytics](https://aihub.qualcomm.com/models/yolov8_seg) -YoloV8 is a machine learning model that predicts bounding boxes, segmentation masks and classes of objects in an image. +Ultralytics YOLOv8 is a machine learning model that predicts bounding boxes, segmentation masks and classes of objects in an image. -This is based on the implementation of Yolo-v8-Segmentation found +This is based on the implementation of YOLOv8-Segmentation found [here](https://github.com/ultralytics/ultralytics/tree/main/ultralytics/models/yolo/segment). This repository contains scripts for optimized on-device export suitable to run on Qualcomm® devices. More details on model performance accross various devices, can be found [here](https://aihub.qualcomm.com/models/yolov8_seg). -[Sign up](https://aihub.qualcomm.com/) for early access to run these models on +[Sign up](https://myaccount.qualcomm.com/signup) for early access to run these models on a hosted Qualcomm® device. @@ -30,7 +30,7 @@ python -m qai_hub_models.models.yolov8_seg.demo More details on the CLI tool can be found with the `--help` option. See [demo.py](demo.py) for sample usage of the model including pre/post processing scripts. Please refer to our [general instructions on using -models](../../#qai-hub-models) for more usage instructions. +models](../../../#getting-started) for more usage instructions. ## Export for on-device deployment @@ -44,9 +44,9 @@ Additional options are documented with the `--help` option. Note that the above script requires access to Deployment instructions for Qualcomm® AI Hub. ## License -- The license for the original implementation of Yolo-v8-Segmentation can be found +- The license for the original implementation of YOLOv8-Segmentation can be found [here](https://github.com/ultralytics/ultralytics/blob/main/LICENSE). 
-- The license for the compiled assets for on-device deployment can be found [here](https://qaihub-public-assets.s3.us-west-2.amazonaws.com/qai-hub-models/Qualcomm+AI+Hub+Proprietary+License.pdf). +- The license for the compiled assets for on-device deployment can be found [here]({deploy_license_url}) ## References * [Real-Time Flying Object Detection with YOLOv8](https://arxiv.org/abs/2305.09972) diff --git a/qai_hub_models/models/yolov8_seg/conftest.py b/qai_hub_models/models/yolov8_seg/conftest.py new file mode 100644 index 00000000..273c44c1 --- /dev/null +++ b/qai_hub_models/models/yolov8_seg/conftest.py @@ -0,0 +1,24 @@ +# --------------------------------------------------------------------- +# Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. +# SPDX-License-Identifier: BSD-3-Clause +# --------------------------------------------------------------------- +# THIS FILE WAS AUTO-GENERATED. DO NOT EDIT MANUALLY. + +from unittest.mock import patch + +import pytest + +from qai_hub_models.models.yolov8_seg import Model + + +@pytest.fixture(autouse=True) +def mock_from_pretrained(): + """ + Model.from_pretrained() can be slow. Invoke it once and cache it so all invocations + across all tests return the cached instance of the model. + """ + mock = patch( + "qai_hub_models.models.yolov8_seg.Model.from_pretrained", + return_value=Model.from_pretrained(), + ) + mock.start() diff --git a/qai_hub_models/models/yolov8_seg/demo.py b/qai_hub_models/models/yolov8_seg/demo.py index bd194c22..fc39d386 100644 --- a/qai_hub_models/models/yolov8_seg/demo.py +++ b/qai_hub_models/models/yolov8_seg/demo.py @@ -68,7 +68,7 @@ def yolov8_seg_demo( help="Intersection over Union (IoU) threshold for NonMaximumSuppression", ) args = parser.parse_args([] if is_test else None) - validate_on_device_demo_args(args, model_type.get_model_id()) + validate_on_device_demo_args(args, MODEL_ID) if args.image is None: image_path = default_image.fetch() @@ -76,7 +76,7 @@ def yolov8_seg_demo( image_path = args.image # Load image & model - model = demo_model_from_cli_args(model_type, args, check_trace=False) + model = demo_model_from_cli_args(model_type, MODEL_ID, args) app = YoloV8SegmentationApp(model, args.score_threshold, args.iou_threshold) print("Model Loaded") diff --git a/qai_hub_models/models/yolov8_seg/export.py b/qai_hub_models/models/yolov8_seg/export.py index 366b63ea..227c0a81 100644 --- a/qai_hub_models/models/yolov8_seg/export.py +++ b/qai_hub_models/models/yolov8_seg/export.py @@ -10,7 +10,7 @@ import os import warnings from pathlib import Path -from typing import List, Optional, Tuple +from typing import Any, Dict, List, Optional, Tuple, cast import qai_hub as hub import torch @@ -90,7 +90,7 @@ def export_model( if not can_access_qualcomm_ai_hub(): return export_without_hub_access( "yolov8_seg", - "Yolo-v8-Segmentation", + "YOLOv8-Segmentation", device, skip_profiling, skip_inferencing, @@ -110,36 +110,44 @@ def export_model( # Trace the model source_model = torch.jit.trace( - model, make_torch_inputs(input_spec), check_trace=False + model.to("cpu"), make_torch_inputs(input_spec), check_trace=False ) # 2. 
Compile the model to an on-device asset model_compile_options = model.get_hub_compile_options( target_runtime, compile_options + " --force_channel_last_input image" ) - print(f"Optimizing model {model_name} to run on-device.") - compile_job = hub.submit_compile_job( + print(f"Optimizing model {model_name} to run on-device") + submitted_compile_job = hub.submit_compile_job( model=source_model, input_specs=input_spec, device=hub.Device(device), name=model_name, options=model_compile_options, ) + compile_job = cast(hub.client.CompileJob, submitted_compile_job) # 3. Profile the model asset on real devices - profile_job = None + profile_job: Optional[hub.client.ProfileJob] = None if not skip_profiling: + profile_options_all = model.get_hub_profile_options( + target_runtime, profile_options + ) print(f"Profiling model {model_name} on a hosted device.") - profile_job = hub.submit_profile_job( + submitted_profile_job = hub.submit_profile_job( model=compile_job.get_target_model(), device=hub.Device(device), name=model_name, - options=profile_options, + options=profile_options_all, ) + profile_job = cast(hub.client.ProfileJob, submitted_profile_job) # 4. Run inference on-device with sample inputs - inference_job = None + inference_job: Optional[hub.client.InferenceJob] = None if not skip_inferencing: + profile_options_all = model.get_hub_profile_options( + target_runtime, profile_options + ) print( f"Running inference for {model_name} on a hosted device with example inputs." ) @@ -148,35 +156,37 @@ def export_model( hub_inputs = transpose_channel_first_to_last( "image", sample_inputs, target_runtime ) - inference_job = hub.submit_inference_job( + submitted_inference_job = hub.submit_inference_job( model=compile_job.get_target_model(), inputs=hub_inputs, device=hub.Device(device), name=model_name, - options=profile_options, + options=profile_options_all, ) + inference_job = cast(hub.client.InferenceJob, submitted_inference_job) # 5. Download the model asset to a local file if not skip_downloading: os.makedirs(output_path, exist_ok=True) - target_model = compile_job.get_target_model() + target_model: hub.Model = compile_job.get_target_model() # type: ignore target_model.download(str(output_path / f"{model_name}.tflite")) # 6. 
Summarize the results from profiling and inference if not skip_summary and not skip_profiling: - assert profile_job.wait().success - profile_data = profile_job.download_profile() + assert profile_job is not None and profile_job.wait().success + profile_data: Dict[str, Any] = profile_job.download_profile() # type: ignore print_profile_metrics_from_job(profile_job, profile_data) if not skip_summary and not skip_inferencing: torch_out = torch_inference(model, sample_inputs) - assert inference_job.wait().success - inference_result = inference_job.download_output_data() + assert inference_job is not None and inference_job.wait().success + inference_result: hub.client.DatasetEntries = inference_job.download_output_data() # type: ignore print_inference_metrics( inference_job, inference_result, torch_out, outputs_to_skip=[3] ) - print_on_target_demo_cmd(compile_job, Path(__file__).parent.resolve(), device) + if not skip_summary: + print_on_target_demo_cmd(compile_job, Path(__file__).parent.resolve(), device) return (compile_job, profile_job, inference_job) diff --git a/qai_hub_models/models/yolov8_seg/info.yaml b/qai_hub_models/models/yolov8_seg/info.yaml index 7397936c..f20e4138 100644 --- a/qai_hub_models/models/yolov8_seg/info.yaml +++ b/qai_hub_models/models/yolov8_seg/info.yaml @@ -1,21 +1,22 @@ -name: Yolo-v8-Segmentation +name: YOLOv8-Segmentation # id must match with the model dir name in qai_hub_models id: yolov8_seg status: public -headline: Real-time object segmentation optimized for mobile and edge. +headline: Real-time object segmentation optimized for mobile and edge by Ultralytics. domain: Computer Vision use_case: Semantic Segmentation -description: YoloV8 is a machine learning model that predicts bounding boxes, segmentation +description: Ultralytics YOLOv8 is a machine learning model that predicts bounding boxes, segmentation masks and classes of objects in an image. 
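These export scripts compile with `--force_channel_last_input image` and shuttle data through `transpose_channel_first_to_last` / `transpose_channel_last_to_first` because the PyTorch sources are channel-first (NCHW) while the compiled TFLite assets take channel-last (NHWC) tensors. A rough numpy illustration of that reordering (not the library helpers themselves) is below; the round-trip assert just shows the two transposes are inverses.

```python
import numpy as np


def to_channel_last(batch_nchw: np.ndarray) -> np.ndarray:
    # (N, C, H, W) -> (N, H, W, C): the layout the compiled TFLite asset expects.
    return np.transpose(batch_nchw, (0, 2, 3, 1))


def to_channel_first(batch_nhwc: np.ndarray) -> np.ndarray:
    # Inverse reordering for outputs returned from the device.
    return np.transpose(batch_nhwc, (0, 3, 1, 2))


image = np.zeros((1, 3, 640, 640), dtype=np.float32)  # YOLO-style 640x640 input
assert to_channel_first(to_channel_last(image)).shape == image.shape
```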
tags: - real-time research_paper: https://arxiv.org/abs/2305.09972 research_paper_title: Real-Time Flying Object Detection with YOLOv8 license: https://github.com/ultralytics/ultralytics/blob/main/LICENSE +deploy_license: https://github.com/ultralytics/ultralytics/blob/main/LICENSE source_repo: https://github.com/ultralytics/ultralytics/tree/main/ultralytics/models/yolo/segment technical_details: - Model checkpoint: YoloV8N-Seg + Model checkpoint: YOLOv8N-Seg Input resolution: 640x640 Number of parameters: 3.43M Model size: 13.2 MB @@ -38,4 +39,5 @@ form_factors: has_static_banner: yes has_animated_banner: no license_type: agpl-3.0 +deploy_license_type: agpl-3.0 dataset: [] diff --git a/qai_hub_models/models/yolov8_seg/model.py b/qai_hub_models/models/yolov8_seg/model.py index ff3ac466..bbd5fee2 100644 --- a/qai_hub_models/models/yolov8_seg/model.py +++ b/qai_hub_models/models/yolov8_seg/model.py @@ -72,8 +72,8 @@ def forward(self, image: torch.Tensor): boxes, scores, masks, classes = yolov8_segment_postprocess(predictions[0]) return boxes, scores, masks, classes, predictions[1][-1] + @staticmethod def get_input_spec( - self, batch_size: int = 1, num_channels: int = 3, height: int = 640, diff --git a/qai_hub_models/models/yolov8_seg/perf.yaml b/qai_hub_models/models/yolov8_seg/perf.yaml index 6e25b242..1f87df00 100644 --- a/qai_hub_models/models/yolov8_seg/perf.yaml +++ b/qai_hub_models/models/yolov8_seg/perf.yaml @@ -17,22 +17,25 @@ aggregated: - Samsung Galaxy S23 - Samsung Galaxy S23 Ultra - Samsung Galaxy S23+ + - Samsung Galaxy S24 + - Samsung Galaxy S24 Ultra - Samsung Galaxy Tab S8 - Xiaomi 12 - Xiaomi 12 Pro supported_chipsets: - Snapdragon® 8 Gen 1 - Snapdragon® 8 Gen 2 + - Snapdragon® 8 Gen 3 - Snapdragon® 888 models: -- name: Yolo-v8-Segmentation +- name: YOLOv8-Segmentation performance_metrics: - torchscript_onnx_tflite: - inference_time: 10686.0 - throughput: 93.58038555118847 + inference_time: 10665.0 + throughput: 93.76465072667604 estimated_peak_memory_range: min: 4616192 - max: 6819472 + max: 6990768 primary_compute_unit: NPU precision: fp16 layer_info: @@ -40,8 +43,16 @@ models: layers_on_gpu: 0 layers_on_cpu: 0 total_layers: 337 - job_id: jz57el6qp + job_id: j1glnxepv job_status: Passed + reference_device_info: + name: Samsung Galaxy S23 + os: '13' + form_factor: Phone + os_name: Android + manufacturer: Samsung + chipset: Snapdragon® 8 Gen 2 + timestamp: '2024-03-15T00:08:48.972058Z' torchscript_onnx_qnn: inference_time: 'null' throughput: 'null' @@ -57,11 +68,41 @@ models: total_layers: 0 job_id: '' job_status: Skipped + - torchscript_onnx_tflite: + inference_time: 7417.0 + throughput: 134.8254011055683 + estimated_peak_memory_range: + min: 53248 + max: 91611328 + primary_compute_unit: NPU + precision: fp16 + layer_info: + layers_on_npu: 337 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 337 + job_id: jw5667v5o + job_status: Passed reference_device_info: - name: Samsung Galaxy S23 Ultra - os: '13' + name: Samsung Galaxy S24 + os: '14' form_factor: Phone os_name: Android manufacturer: Samsung - chipset: Snapdragon® 8 Gen 2 - timestamp: '2024-02-21T16:36:07.212007Z' + chipset: Snapdragon® 8 Gen 3 + timestamp: '2024-03-15T00:08:48.972071Z' + torchscript_onnx_qnn: + inference_time: 'null' + throughput: 'null' + estimated_peak_memory_range: + min: 0 + max: 0 + primary_compute_unit: 'null' + precision: 'null' + layer_info: + layers_on_npu: 0 + layers_on_gpu: 0 + layers_on_cpu: 0 + total_layers: 0 + job_id: '' + job_status: Skipped diff --git 
a/qai_hub_models/models/yolov8_seg/requirements.txt b/qai_hub_models/models/yolov8_seg/requirements.txt index 5d6e5cf5..94980b0d 100644 --- a/qai_hub_models/models/yolov8_seg/requirements.txt +++ b/qai_hub_models/models/yolov8_seg/requirements.txt @@ -1 +1,3 @@ +seaborn==0.11.0 +thop==0.1.1.post2209072238 ultralytics==8.0.193 diff --git a/qai_hub_models/models/yolov8_seg/test.py b/qai_hub_models/models/yolov8_seg/test.py index f477d402..46c927d7 100644 --- a/qai_hub_models/models/yolov8_seg/test.py +++ b/qai_hub_models/models/yolov8_seg/test.py @@ -3,6 +3,7 @@ # SPDX-License-Identifier: BSD-3-Clause # --------------------------------------------------------------------- import numpy as np +import pytest import torch from ultralytics import YOLO as ultralytics_YOLO @@ -40,6 +41,7 @@ def test_task(): assert np.allclose(source_out_postprocessed[i], qaihm_out_postprocessed[i]) +@pytest.mark.trace def test_trace(): net = YoloV8Segmentor.from_pretrained(WEIGHTS) input_spec = net.get_input_spec() diff --git a/qai_hub_models/requirements-dev.txt b/qai_hub_models/requirements-dev.txt index 578fca5f..be5243dc 100644 --- a/qai_hub_models/requirements-dev.txt +++ b/qai_hub_models/requirements-dev.txt @@ -1,20 +1,18 @@ -boto3 -botocore +boto3==1.34.40 +botocore==1.34.40 coverage==6.5.0 -huggingface-hub==0.20.3 +imageio[ffmpeg]==2.31.5 jinja2==3.0.3 mypy==0.991 -protobuf==3.20.3 +pre-commit==3.5.0 pytest-cov==4.1.0 pytest-xdist==3.3.1 -pyyaml==6.0.1 -ruamel-yaml +ruamel-yaml==0.18.6 schema==0.7.5 -scikit-image>=0.21.0 -tensorflow-cpu==2.13.0; sys_platform != 'darwin' -tensorflow-macos==2.13.0; sys_platform == 'darwin' -types-PyYAML -types-pillow -types-tabulate -types-requests +scikit-image==0.21.0 +tflite==2.10.0 +types-PyYAML==6.0.12.12 +types-pillow==10.2.0.20240213 +types-tabulate==0.9.0.20240106 +types-requests==2.31.0.6 keyrings.envvars; python_version >= '3.9' # used only by CI diff --git a/qai_hub_models/requirements.txt b/qai_hub_models/requirements.txt index 9e263481..f25dc8e9 100644 --- a/qai_hub_models/requirements.txt +++ b/qai_hub_models/requirements.txt @@ -1,19 +1,25 @@ Pillow==10.0.1 +deprecation==2.1.0 +fsspec==2023.6.0 gdown==4.7.1 -gitpython -huggingface_hub -ipython +gitpython==3.1.42 +huggingface_hub==0.20.3 +ipython==8.12.3 numpy==1.23.1 opencv-python==4.8.1.78 -pandas -prettytable +packaging==23.2 +pandas==1.5.3 +prettytable==3.9.0 +protobuf==3.20.2 pytest==7.4.2 -pyyaml -qai_hub>=0.9.0 -requests -requests_toolbelt -schema -tabulate +pyyaml==6.0.1 +requests_toolbelt==1.0.0 +schema==0.7.5 +scipy==1.8.1 +tabulate==0.9.0 torch==1.13.1 -torchvision<=0.14.1 -urllib3<2 +torchvision==0.14.1 +typing-extensions==4.5.0 +tqdm==4.66.2 +urllib3==1.26.18 +qai_hub>=0.9.0 diff --git a/qai_hub_models/test/e2e/test_aimet_compile.py b/qai_hub_models/test/e2e/test_aimet_compile.py index 0bd090c5..f3d52c36 100644 --- a/qai_hub_models/test/e2e/test_aimet_compile.py +++ b/qai_hub_models/test/e2e/test_aimet_compile.py @@ -6,7 +6,7 @@ import pytest import qai_hub as hub -from qai_hub_models.models.mobilenet_v2_quantized.model import MobileNetV2Quantizable +from qai_hub_models.models.squeezenet1_1_quantized.model import SqueezeNetQuantizable from qai_hub_models.utils.base_model import SourceModelFormat, TargetRuntime from qai_hub_models.utils.inference import compile_zoo_model_to_hub from qai_hub_models.utils.measurement import get_model_size_mb @@ -16,16 +16,15 @@ @pytest.mark.parametrize( "source_model_format,target_runtime,expected_size_mb", [ - (SourceModelFormat.ONNX, TargetRuntime.TFLITE, 3.4), - 
(SourceModelFormat.TORCHSCRIPT, TargetRuntime.TFLITE, 3.4), - (SourceModelFormat.ONNX, TargetRuntime.QNN, 3.8), - (SourceModelFormat.TORCHSCRIPT, TargetRuntime.QNN, 3.8), + (SourceModelFormat.ONNX, TargetRuntime.TFLITE, 1.3), + (SourceModelFormat.TORCHSCRIPT, TargetRuntime.TFLITE, 1.3), + (SourceModelFormat.ONNX, TargetRuntime.QNN, 1.6), ], ) def test_compile_aimet( source_model_format, target_runtime, expected_size_mb, skip_clone_repo_check_fixture ): - model = MobileNetV2Quantizable.from_pretrained() + model = SqueezeNetQuantizable.from_pretrained() calibration_data = model.get_calibration_data(target_runtime) diff --git a/qai_hub_models/utils/aimet/config_loader.py b/qai_hub_models/utils/aimet/config_loader.py index b18bcf0e..dadc6012 100644 --- a/qai_hub_models/utils/aimet/config_loader.py +++ b/qai_hub_models/utils/aimet/config_loader.py @@ -5,13 +5,19 @@ from pathlib import Path -def get_default_aimet_config() -> str: - path = Path(__file__).parent / "default_config.json" +def get_default_aimet_config_legacy_v1() -> str: + path = Path(__file__).parent / "default_config_legacy_v1.json" return str(path.resolve()) -def get_per_channel_aimet_config() -> str: - path = Path(__file__).parent / "default_config_per_channel.json" +def get_default_aimet_config_legacy_v2() -> str: + # Introduced per-channel weights + path = Path(__file__).parent / "default_config_legacy_v2.json" + return str(path.resolve()) + + +def get_default_aimet_config() -> str: + path = Path(__file__).parent / "default_config.json" return str(path.resolve()) diff --git a/qai_hub_models/utils/aimet/default_config.json b/qai_hub_models/utils/aimet/default_config.json index f616005d..c7b13e15 100644 --- a/qai_hub_models/utils/aimet/default_config.json +++ b/qai_hub_models/utils/aimet/default_config.json @@ -11,14 +11,15 @@ "is_symmetric": "True" }, "strict_symmetric": "False", - "per_channel_quantization": "False" + "unsigned_symmetric": "False", + "per_channel_quantization": "True" }, "params": { "bias": { - "is_quantized": "True" + "is_quantized": "False" } }, @@ -26,19 +27,19 @@ { "Squeeze": { - "is_output_quantized": "False" + "is_output_quantized": "True" }, "Pad": { - "is_output_quantized": "False" + "is_output_quantized": "True" }, "Mean": { "is_output_quantized": "False" }, - "Gather": + "Gemm": { - "is_output_quantized": "False" + "per_channel_quantization": "False" } }, @@ -47,11 +48,11 @@ { "op_list": ["Conv", "Relu"] }, - { - "op_list": ["ConvTranspose", "Relu"] - }, - { + { "op_list": ["Conv", "Clip"] + }, + { + "op_list": ["Conv", "BatchNormalization", "Relu"] }, { "op_list": ["Add", "Relu"] diff --git a/qai_hub_models/utils/aimet/default_config_legacy_v1.json b/qai_hub_models/utils/aimet/default_config_legacy_v1.json new file mode 100644 index 00000000..f616005d --- /dev/null +++ b/qai_hub_models/utils/aimet/default_config_legacy_v1.json @@ -0,0 +1,71 @@ +{ + "defaults": + { + "ops": + { + "is_output_quantized": "True" + }, + "params": + { + "is_quantized": "True", + "is_symmetric": "True" + }, + "strict_symmetric": "False", + "per_channel_quantization": "False" + }, + + "params": + { + "bias": + { + "is_quantized": "True" + } + }, + + "op_type": + { + "Squeeze": + { + "is_output_quantized": "False" + }, + "Pad": + { + "is_output_quantized": "False" + }, + "Mean": + { + "is_output_quantized": "False" + }, + "Gather": + { + "is_output_quantized": "False" + } + }, + + "supergroups": + [ + { + "op_list": ["Conv", "Relu"] + }, + { + "op_list": ["ConvTranspose", "Relu"] + }, + { + "op_list": ["Conv", "Clip"] + }, + 
{ + "op_list": ["Add", "Relu"] + }, + { + "op_list": ["Gemm", "Relu"] + } + ], + + "model_input": + { + "is_input_quantized": "True" + }, + + "model_output": + {} +} diff --git a/qai_hub_models/utils/aimet/default_config_per_channel.json b/qai_hub_models/utils/aimet/default_config_legacy_v2.json similarity index 100% rename from qai_hub_models/utils/aimet/default_config_per_channel.json rename to qai_hub_models/utils/aimet/default_config_legacy_v2.json diff --git a/qai_hub_models/utils/aimet/repo.py b/qai_hub_models/utils/aimet/repo.py new file mode 100644 index 00000000..1d7a6ab5 --- /dev/null +++ b/qai_hub_models/utils/aimet/repo.py @@ -0,0 +1,32 @@ +# --------------------------------------------------------------------- +# Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. +# SPDX-License-Identifier: BSD-3-Clause +# --------------------------------------------------------------------- +from contextlib import contextmanager + +from qai_hub_models.utils.asset_loaders import SourceAsRoot, find_replace_in_repo + +AIMET_ZOO_SOURCE_REPOSITORY = "https://github.com/quic/aimet-model-zoo" +AIMET_ZOO_SOURCE_REPO_COMMIT = "d09d2b0404d10f71a7640a87e9d5e5257b028802" +REPO_ASSET_VERSION = 1 + + +@contextmanager +def aimet_zoo_as_root(): + with SourceAsRoot( + AIMET_ZOO_SOURCE_REPOSITORY, + AIMET_ZOO_SOURCE_REPO_COMMIT, + source_repo_name="aimet_zoo", + source_repo_version=REPO_ASSET_VERSION, + keep_sys_modules=True, + ) as repo_root: + # Remove import of model_definition.py as it has an import error itself, + # but we don't need anything from that file here + find_replace_in_repo( + repo_root, + "aimet_zoo_torch/quicksrnet/__init__.py", + "from .model.model_definition import QuickSRNet", + " ", + ) + + yield repo_root diff --git a/qai_hub_models/utils/args.py b/qai_hub_models/utils/args.py index cbddf19c..8b60a63d 100644 --- a/qai_hub_models/utils/args.py +++ b/qai_hub_models/utils/args.py @@ -18,14 +18,12 @@ import qai_hub as hub -from qai_hub_models.utils.base_model import ( - BaseModel, +from qai_hub_models.models.protocols import ( FromPrecompiledTypeVar, - FromPretrainedMixin, + FromPretrainedProtocol, FromPretrainedTypeVar, - InputSpec, - TargetRuntime, ) +from qai_hub_models.utils.base_model import BaseModel, InputSpec, TargetRuntime from qai_hub_models.utils.inference import HubModel from qai_hub_models.utils.qai_hub_helpers import _AIHUB_NAME, can_access_qualcomm_ai_hub @@ -61,6 +59,9 @@ def add_output_dir_arg(parser: argparse.ArgumentParser) -> argparse.ArgumentPars def add_target_runtime_arg( parser: argparse.ArgumentParser, help: str, + available_target_runtimes: List[TargetRuntime] = list( + TargetRuntime.__members__.values() + ), default: TargetRuntime = TargetRuntime.TFLITE, ) -> argparse.ArgumentParser: parser.add_argument( @@ -68,7 +69,7 @@ def add_target_runtime_arg( type=str, action=partial(ParseEnumAction, enum_type=TargetRuntime), # type: ignore default=default, - choices=[name.lower() for name in TargetRuntime._member_names_], + choices=[rt.name.lower() for rt in available_target_runtimes], help=help, ) return parser @@ -124,6 +125,7 @@ def get_on_device_demo_parser( parser, help="The runtime to demo (if --on-device is specified).", default=default_runtime, + available_target_runtimes=available_target_runtimes, ) return parser @@ -139,7 +141,7 @@ def validate_on_device_demo_args(args: argparse.Namespace, model_name: str): if args.on_device and not can_access_qualcomm_ai_hub(): print( "On-device demos are not available without Qualcomm® AI Hub access.", - 
"Please sign up for Qualcomm® AI Hub at https://aihub.qualcomm.com/.", + "Please sign up for Qualcomm® AI Hub at https://myaccount.qualcomm.com/signup .", sep=os.linesep, ) sys.exit(1) @@ -210,8 +212,8 @@ def model_from_cli_args( def demo_model_from_cli_args( model_cls: Type[FromPretrainedTypeVar], + model_id: str, cli_args: argparse.Namespace, - check_trace: bool = True, ) -> FromPretrainedTypeVar | HubModel: """ Create this model from an argparse namespace. @@ -219,27 +221,23 @@ def demo_model_from_cli_args( If the model is a BaseModel and an on-device demo is requested, the BaseModel will be wrapped in a HubModel. """ - model = model_from_cli_args( - model_cls, cli_args - ) # TODO(9494): This should be replaced by static input spec is_on_device = "on_device" in cli_args and cli_args.on_device inference_model: FromPretrainedTypeVar | HubModel - if is_on_device and isinstance(model, BaseModel): + if is_on_device and issubclass(model_cls, BaseModel): device = hub.Device(cli_args.device, cli_args.device_os) if cli_args.hub_model_id: model_from_hub = hub.get_model(cli_args.hub_model_id) inference_model = HubModel( model_from_hub, - list(model.get_input_spec().keys()), + list(model_cls.get_input_spec().keys()), device, cli_args.inference_options, ) else: - model_cls = model_cls - export_file = f"qai_hub_models.models.{model.get_model_id()}.export" + export_file = f"qai_hub_models.models.{model_id}.export" export_module = import_module(export_file) compile_job: hub.CompileJob - print(f"Compiling on-device model asset for {model.get_model_id()}.") + print(f"Compiling on-device model asset for {model_id}.") print( f"Running python -m {export_file} --device {device.name} --target-runtime {cli_args.target_runtime.name.lower()}\n" ) @@ -262,7 +260,7 @@ def demo_model_from_cli_args( target_model = compile_job.get_target_model() assert target_model is not None - input_names = list(model.get_input_spec().keys()) + input_names = list(model_cls.get_input_spec().keys()) inference_model = HubModel( target_model, input_names, @@ -271,7 +269,7 @@ def demo_model_from_cli_args( ) print(f"Exported asset: {inference_model.model.name}\n") else: - inference_model = model + inference_model = model_from_cli_args(model_cls, cli_args) return inference_model @@ -419,7 +417,7 @@ def export_parser( help="Which components of the model to be exported.", ) - if issubclass(model_cls, FromPretrainedMixin): + if issubclass(model_cls, FromPretrainedProtocol): # Skip adding CLI from model for compiled model # TODO: #9408 Refactor BaseModel, BasePrecompiledModel to fetch # parameters from compiled model diff --git a/qai_hub_models/utils/asset_loaders.py b/qai_hub_models/utils/asset_loaders.py index 70eabe0a..652d259c 100644 --- a/qai_hub_models/utils/asset_loaders.py +++ b/qai_hub_models/utils/asset_loaders.py @@ -6,6 +6,7 @@ import fileinput import json +import logging import os import shutil import sys @@ -55,6 +56,17 @@ def always_answer_prompts(answer): _always_answer = old_value +@contextmanager +def set_log_level(log_level: int): + logger = logging.getLogger() + old_level = logger.level + try: + logger.setLevel(log_level) + yield + finally: + logger.setLevel(old_level) + + class QAIHM_WEB_ASSET(Enum): STATIC_IMG = 0 ANIMATED_MOV = 1 @@ -412,12 +424,20 @@ def load_torch(pt: PathType) -> Any: return _load_file(pt, partial(torch.load, map_location="cpu")) -def load_json(json_file: PathType) -> Dict: +def load_json(json_filepath: PathType) -> Dict: def _load_json_helper(file_path) -> Any: with open(file_path, "r") as 
json_file: return json.load(json_file) - return _load_file(json_file, _load_json_helper) + return _load_file(json_filepath, _load_json_helper) + + +def load_yaml(yaml_filepath: PathType) -> Dict: + def _load_yaml_helper(file_path) -> Any: + with open(file_path, "r") as yaml_file: + return yaml.safe_load(yaml_file) + + return _load_file(yaml_filepath, _load_yaml_helper) def load_path(file: PathType, tmpdir: tempfile.TemporaryDirectory | str) -> str | Path: @@ -439,7 +459,7 @@ def SourceAsRoot( source_repo_name: str, source_repo_version: int | str, source_repo_patches: List[str] = [], - keep_sys_path: bool = False, + keep_sys_modules: bool = False, ): """ Context manager that runs code with: @@ -457,21 +477,36 @@ def SourceAsRoot( patches=source_repo_patches, ) SOURCE_AS_ROOT_LOCK.acquire() - cwd = os.getcwd() original_path = list(sys.path) + original_modules = dict(sys.modules) + cwd = os.getcwd() try: + # If repo path already in sys.path from previous load, + # delete it and put it first + if repository_path in sys.path: + sys.path.remove(repository_path) # Patch path for this load only, since the model source # code references modules via a global scope. # Insert with highest priority (see #7666) sys.path.insert(0, repository_path) os.chdir(repository_path) - yield repository_path finally: # Be careful editing these lines (failure means partial clean-up) os.chdir(cwd) - if not keep_sys_path: - sys.path = original_path + sys.path = original_path + if not keep_sys_modules: + # When you call something like `import models`, it loads the `models` module + # into sys.modules so all future `import models` point to that module. + # + # We want all imports done within the sub-repo to be either deleted from + # sys.modules or restored to the previous module if one was overwritten. + for name, module in list(sys.modules.items()): + if (getattr(module, "__file__", "") or "").startswith(repository_path): + if name in original_modules: + sys.modules[name] = original_modules[name] + else: + del sys.modules[name] SOURCE_AS_ROOT_LOCK.release() diff --git a/qai_hub_models/utils/base_model.py b/qai_hub_models/utils/base_model.py index eea39fac..5a0ec84e 100644 --- a/qai_hub_models/utils/base_model.py +++ b/qai_hub_models/utils/base_model.py @@ -4,107 +4,105 @@ # --------------------------------------------------------------------- from __future__ import annotations -import os -from abc import ABC, ABCMeta, abstractmethod -from enum import Enum -from inspect import getmodule -from typing import Any, Dict, List, Type, TypeVar +from pathlib import Path +from typing import Any -import numpy as np import torch from qai_hub.client import SourceModel -from qai_hub_models.evaluators.base_evaluators import BaseEvaluator +from qai_hub_models.models.common import ( + SampleInputsType, + SourceModelFormat, + TargetRuntime, +) +from qai_hub_models.models.protocols import ( + ExecutableModelProtocol, + FromPrecompiledProtocol, + FromPretrainedProtocol, + HubModelProtocol, +) from qai_hub_models.utils.input_spec import InputSpec, make_torch_inputs -InputsType = Dict[str, List[np.ndarray]] - - -class TargetRuntime(Enum): - TFLITE = 0 - QNN = 1 - - def __str__(self): - return self.name.lower() +class CollectionModel: + """ + Model that glues together several BaseModels + """ -class SourceModelFormat(Enum): - ONNX = 0 - TORCHSCRIPT = 1 + pass -class DocstringInheritorMeta(ABCMeta): +class HubModel(HubModelProtocol): """ - Ensures that all subclasses retain the `forward` function's docstring. 
+ Base interface for AI Hub models. """ - def __new__(cls, name, bases, dct): - new_class = super().__new__(cls, name, bases, dct) - if hasattr(new_class, "forward"): - parent_method = getattr(bases[0], "forward", None) - if parent_method and new_class.forward.__doc__ is None: # type: ignore - new_class.forward.__doc__ = parent_method.__doc__ # type: ignore - return new_class - + def __init__(self): + # Change self.get_input_spec() to call _get_input_spec_for_model_instance() instead. + # + # _get_input_spec_for_model_instance() is an override that allows get_input_spec() + # to access instance variables. This may be used in case input shape is "hard-coded" + # based on parameters passed to the model upon initialization. + # + self.get_input_spec = self._get_input_spec_for_model_instance -# Use this for typehints that take in a class and output an instance of the class. -FromPretrainedTypeVar = TypeVar("FromPretrainedTypeVar", bound="FromPretrainedMixin") -FromPrecompiledTypeVar = TypeVar("FromPrecompiledTypeVar", bound="FromPrecompiledMixin") + def _get_input_spec_for_model_instance(self, *args, **kwargs) -> InputSpec: + """ + Get the input specifications for an instance of this model. + Typically this will pre-fill inputs of get_input_spec + with values determined by instance members of the model class. -class FromPretrainedMixin(ABC): - @classmethod - @abstractmethod - def from_pretrained( - cls: Type[FromPretrainedTypeVar], *args, **kwargs - ) -> FromPretrainedTypeVar: + The initializer for BaseModel will automatically override get_input_spec + with this function when the class is instantiated. """ - Utility function that helps users get up and running with a default - pretrained model. While this function may take arguments, all arguments - should have default values specified, so that all classes can be invoked - with `cls.from_pretrained()` and always have it return something reasonable. - """ - pass - + return self.__class__.get_input_spec(*args, **kwargs) -class CollectionModel(FromPretrainedMixin): - """ - Model that glues together several BaseModels - """ + def sample_inputs(self, input_spec: InputSpec | None = None) -> SampleInputsType: + """ + Returns a set of sample inputs for the model. - pass + For each input name in the model, a list of numpy arrays is provided. + If the returned set is batch N, all input names must contain exactly N numpy arrays. + This is a default implementation that returns a single random data array + for each input name based on the shapes and dtypes in `get_input_spec`. -class BaseModel( - torch.nn.Module, FromPretrainedMixin, ABC, metaclass=DocstringInheritorMeta -): - @abstractmethod - def get_input_spec(self, *args, **kwargs) -> InputSpec: - """ - Returns a map from `{input_name -> (shape, dtype)}` - specifying the shape and dtype for each input argument. + A subclass may choose to override this and fetch a batch of real input data + from a data source. """ - pass + if not input_spec: + input_spec = self.get_input_spec() + inputs_dict = {} + inputs_list = make_torch_inputs(input_spec) + for i, input_name in enumerate(input_spec.keys()): + inputs_dict[input_name] = [inputs_list[i].numpy()] + return inputs_dict - @classmethod - def get_model_id(cls) -> str: + def get_hub_profile_options( + self, + target_runtime: TargetRuntime, + other_profile_options: str = "", + ) -> str: """ - Return model ID for this model. - The model ID is the same as the folder name for the model under qai_hub_models/models/... 
+ AI Hub profile options recommended for the model. """ - module = getmodule(cls) - if not module or not module.__file__: - raise ValueError(f"Unable to get model ID for {cls.__name__}") + return other_profile_options - # Module path is always .../qai_hub_models/models//model.py - # Extract model ID from that path. - return os.path.basename(os.path.dirname(module.__file__)) - def get_evaluator(self) -> BaseEvaluator: - """ - Gets default model output evaluator for this model. - """ - raise NotImplementedError("This model does not define a default evaluator.") +class BaseModel( + torch.nn.Module, + HubModel, + FromPretrainedProtocol, + ExecutableModelProtocol, +): + """ + A pre-trained PyTorch model with helpers for submission to AI Hub. + """ + + def __init__(self): + torch.nn.Module.__init__(self) # Initialize Torch Module + HubModel.__init__(self) # Initialize Hub Model def convert_to_torchscript( self, input_spec: InputSpec | None = None, check_trace: bool = True @@ -125,7 +123,7 @@ def convert_to_torchscript( def convert_to_hub_source_model( self, target_runtime: TargetRuntime, - output_path: str, + output_path: str | Path, input_spec: InputSpec | None = None, check_trace: bool = True, ) -> SourceModel: @@ -135,7 +133,6 @@ def convert_to_hub_source_model( # Local import to prevent circular dependency from qai_hub_models.utils.inference import prepare_compile_zoo_model_to_hub - assert isinstance(self, BaseModel) source_model, _ = prepare_compile_zoo_model_to_hub( self, source_model_format=self.preferred_hub_source_model_format(target_runtime), @@ -152,7 +149,7 @@ def get_hub_compile_options( other_compile_options: str = "", ) -> str: """ - Convert to a AI Hub source model appropriate for the export method. + AI Hub compile options recommended for the model. """ compile_options = "" if target_runtime == TargetRuntime.QNN: @@ -164,71 +161,21 @@ def get_hub_compile_options( def preferred_hub_source_model_format( self, target_runtime: TargetRuntime ) -> SourceModelFormat: - return SourceModelFormat.TORCHSCRIPT - - def sample_inputs(self, input_spec: InputSpec | None = None) -> InputsType: """ - Returns a set of sample inputs for the model. - - For each input name in the model, a list of numpy arrays is provided. - If the returned set is batch N, all input names must contain exactly N numpy arrays. - - This is a default implementation that returns a single random data array - for each input name based on the shapes and dtypes in `get_input_spec`. - - A subclass may choose to override this and fetch a batch of real input data - from a data source. + Source model format preferred for conversion on AI Hub. """ - if not input_spec: - input_spec = self.get_input_spec() - inputs_dict = {} - inputs_list = make_torch_inputs(input_spec) - for i, input_name in enumerate(input_spec.keys()): - inputs_dict[input_name] = [inputs_list[i].numpy()] - return inputs_dict - - -class FromPrecompiledMixin(ABC): - @classmethod - @abstractmethod - def from_precompiled( - cls: Type[FromPrecompiledTypeVar], *args, **kwargs - ) -> "FromPrecompiledTypeVar": - """ - Utility function that helps users get up and running with a default - precompiled model. While this function may take arguments, all arguments - should have default values specified, so that all classes can be invoked - with `cls.from_precompiled()` and always have it return something reasonable. 
- """ - pass - - -class BasePrecompiledModel(FromPrecompiledMixin): - @abstractmethod - def get_input_spec(self, *args, **kwargs) -> InputSpec: - """ - Returns a map from `{input_name -> (shape, dtype)}` - specifying the shape and dtype for each input argument. - """ - pass + return SourceModelFormat.TORCHSCRIPT - def sample_inputs(self, input_spec: InputSpec | None = None) -> InputsType: - """ - Returns a set of sample inputs for the model. - For each input name in the model, a list of numpy arrays is provided. - If the returned set is batch N, all input names must contain exactly N numpy arrays. +class BasePrecompiledModel(HubModel, FromPrecompiledProtocol): + """ + A pre-compiled hub model. + Model PyTorch source is not available, but compiled assets are available. + """ - This is a default implementation that returns a single random data array - for each input name based on the shapes and dtypes in `get_input_spec`. + def __init__(self, target_model_path: str): + self.target_model_path = target_model_path - A subclass may choose to override this and fetch a batch of real input data - from a data source. - """ - if not input_spec: - input_spec = self.get_input_spec() - inputs_dict = {} - inputs_list = make_torch_inputs(input_spec) - for i, input_name in enumerate(input_spec.keys()): - inputs_dict[input_name] = [inputs_list[i].numpy()] - return inputs_dict + def get_target_model_path(self) -> str: + """Get the path to the compiled asset for this model on disk.""" + return self.target_model_path diff --git a/qai_hub_models/utils/compare.py b/qai_hub_models/utils/compare.py index 8d210580..06bd37c8 100644 --- a/qai_hub_models/utils/compare.py +++ b/qai_hub_models/utils/compare.py @@ -29,7 +29,9 @@ def torch_inference( for i in range(len(list(sample_inputs.values())[0])): inputs = {} for input_name in input_names: - inputs[input_name] = torch.from_numpy(sample_inputs[input_name][i]) + inputs[input_name] = torch.from_numpy(sample_inputs[input_name][i]).to( + "cpu" + ) with torch.no_grad(): out = model(**inputs) out_tuple = (out,) if isinstance(out, torch.Tensor) else out diff --git a/qai_hub_models/utils/config_loaders.py b/qai_hub_models/utils/config_loaders.py index b0d68f0d..06be8ba7 100644 --- a/qai_hub_models/utils/config_loaders.py +++ b/qai_hub_models/utils/config_loaders.py @@ -8,7 +8,7 @@ from dataclasses import dataclass from enum import Enum from pathlib import Path -from typing import Dict, List, Optional, Tuple, Union +from typing import Any, Dict, List, Optional, Tuple, Union import requests import yaml @@ -482,6 +482,7 @@ def __init__( name: str, id: str, status: MODEL_STATUS, + status_reason: str | None, headline: str, domain: MODEL_DOMAIN, description: str, @@ -490,6 +491,7 @@ def __init__( research_paper: str, research_paper_title: str, license: str, + deploy_license: str, source_repo: str, applicable_scenarios: List[str], related_models: List[str], @@ -498,12 +500,14 @@ def __init__( has_animated_banner: bool, code_gen_config: Dict[str, str | bool], license_type: str, + deploy_license_type: str, dataset: List[str], technical_details: Dict[str, str], ) -> None: self.name = name self.id = id self.status = status + self.status_reason = status_reason self.headline = headline self.domain = domain self.description = description @@ -512,7 +516,9 @@ def __init__( self.research_paper = research_paper self.research_paper_title = research_paper_title self.license = license + self.deploy_license = deploy_license self.license_type = license_type + self.deploy_license_type = 
deploy_license_type self.dataset = dataset self.source_repo = source_repo self.applicable_scenarios = applicable_scenarios @@ -569,9 +575,22 @@ def validate(self) -> Tuple[bool, Optional[str]]: if self.license_type not in HF_AVAILABLE_LICENSES: return False, f"license can be one of these: {HF_AVAILABLE_LICENSES}" - # Web assets exist - if self.status == MODEL_STATUS.PUBLIC and not self.has_static_banner: - return False, "All public models must have a static banner." + if not self.deploy_license: + return False, "deploy_license cannot be empty" + if not self.deploy_license_type: + return False, "deploy_license_type cannot be empty" + + # Status Reason + if self.status == MODEL_STATUS.PRIVATE and not self.status_reason: + return ( + False, + "Private models must set `status_reason` in info.yaml with a link to the related issue.", + ) + if self.status == MODEL_STATUS.PUBLIC and self.status_reason: + return ( + False, + "`status_reason` in info.yaml should not be set for public models.", + ) # Required assets exist if self.status == MODEL_STATUS.PUBLIC: @@ -686,6 +705,7 @@ def from_yaml(info_path: str | Path, code_gen_path: str | Path | None = None): info_yaml["name"], info_yaml["id"], MODEL_STATUS.from_string(info_yaml["status"]), + info_yaml.get("status_reason", None), info_yaml["headline"], MODEL_DOMAIN.from_string(info_yaml["domain"]), info_yaml["description"], @@ -694,6 +714,7 @@ def from_yaml(info_path: str | Path, code_gen_path: str | Path | None = None): info_yaml["research_paper"], info_yaml["research_paper_title"], info_yaml["license"], + info_yaml["deploy_license"], info_yaml["source_repo"], info_yaml["applicable_scenarios"], info_yaml["related_models"], @@ -702,34 +723,40 @@ def from_yaml(info_path: str | Path, code_gen_path: str | Path | None = None): info_yaml["has_animated_banner"], code_gen_config, info_yaml["license_type"], + info_yaml["deploy_license_type"], info_yaml["dataset"], info_yaml["technical_details"], ) # Schema for info.yaml INFO_YAML_SCHEMA = Schema( - { - "name": And(str), - "id": And(str), - "status": And(str), - "headline": And(str), - "domain": And(str), - "description": And(str), - "use_case": And(str), - "tags": And(lambda s: len(s) >= 0), - "research_paper": And(str), - "research_paper_title": And(str), - "license": And(str), - "source_repo": And(str), - "technical_details": And(dict), - "applicable_scenarios": And(lambda s: len(s) >= 0), - "related_models": And(lambda s: len(s) >= 0), - "form_factors": And(lambda s: len(s) >= 0), - "has_static_banner": And(bool), - "has_animated_banner": And(bool), - "license_type": And(str), - "dataset": And(list), - } + And( + { + "name": str, + "id": str, + "status": str, + OptionalSchema("status_reason", default=None): str, + "headline": str, + "domain": str, + "description": str, + "use_case": str, + "tags": lambda s: len(s) >= 0, + "research_paper": str, + "research_paper_title": str, + "license": str, + "deploy_license": str, + "source_repo": str, + "technical_details": dict, + "applicable_scenarios": lambda s: len(s) >= 0, + "related_models": lambda s: len(s) >= 0, + "form_factors": lambda s: len(s) >= 0, + "has_static_banner": bool, + "has_animated_banner": bool, + "license_type": str, + "deploy_license_type": str, + "dataset": list, + } + ) ) # Schema for code-gen.yaml @@ -743,8 +770,6 @@ def from_yaml(info_path: str | Path, code_gen_path: str | Path | None = None): OptionalSchema("tflite_export_failure_reason", default=""): str, OptionalSchema("has_demo", default=True): bool, OptionalSchema("check_trace", 
default=True): bool, - OptionalSchema("default_profile_options", default=""): str, - OptionalSchema("default_compile_options", default=""): str, OptionalSchema("channel_last_input", default=""): str, OptionalSchema("channel_last_output", default=""): str, OptionalSchema("outputs_to_skip_validation", default=[]): list, @@ -754,6 +779,7 @@ def from_yaml(info_path: str | Path, code_gen_path: str | Path | None = None): OptionalSchema("skip_tests", default=False): bool, OptionalSchema("is_precompiled", default=False): bool, OptionalSchema("no_assets", default=False): bool, + OptionalSchema("global_requirements_incompatible", default=False): bool, OptionalSchema("torchscript_opt", default=[]): list, OptionalSchema("inference_metrics", default="psnr"): str, } @@ -761,7 +787,7 @@ def from_yaml(info_path: str | Path, code_gen_path: str | Path | None = None): ) @staticmethod - def load_info_yaml(path: str | Path): + def load_info_yaml(path: str | Path) -> Dict[str, Any]: with open(path) as f: data = yaml.safe_load(f) try: diff --git a/qai_hub_models/utils/inference.py b/qai_hub_models/utils/inference.py index f28ca846..ef927a52 100644 --- a/qai_hub_models/utils/inference.py +++ b/qai_hub_models/utils/inference.py @@ -6,13 +6,15 @@ import os import tempfile -from typing import List, Tuple +from pathlib import Path +from typing import List, Mapping, Tuple import numpy as np import qai_hub as hub import torch from qai_hub.public_rest_api import DatasetEntries +from qai_hub_models.models.protocols import ExecutableModelProtocol from qai_hub_models.utils.asset_loaders import ModelZooAssetConfig from qai_hub_models.utils.base_model import BaseModel, SourceModelFormat, TargetRuntime from qai_hub_models.utils.input_spec import InputSpec @@ -32,7 +34,7 @@ def prepare_compile_zoo_model_to_hub( model: BaseModel, source_model_format: SourceModelFormat, target_runtime: TargetRuntime, - output_path: str = "", + output_path: str | Path = "", input_spec: InputSpec | None = None, check_trace: bool = True, prepare_compile_options_only: bool = False, @@ -98,7 +100,7 @@ def export_model_func(): ): def export_model_func(): - traced_model = model.convert_to_quantized_torchscript( + traced_model = model.convert_to_torchscript( input_spec=input_spec, check_trace=check_trace ) model_path = os.path.join(output_path, model_name + ".pt") @@ -193,7 +195,7 @@ def compile_zoo_model_to_hub( ) -class HubModel: +class HubModel(ExecutableModelProtocol): """ Class that behaves like a pytorch model except when called, it runs an inference job on hub and returns a torch output. 
@@ -224,26 +226,27 @@ def __init__( def __call__( self, - *input_tensors: torch.Tensor - | List[torch.Tensor] + *args: torch.Tensor + | np.ndarray + | List[torch.Tensor | np.ndarray] + | hub.Dataset + | DatasetEntries, + ) -> torch.Tensor | Tuple[torch.Tensor, ...]: + return self.forward(*args) + + def forward( + self, + *args: torch.Tensor + | np.ndarray + | List[torch.Tensor | np.ndarray] | hub.Dataset | DatasetEntries, ) -> torch.Tensor | Tuple[torch.Tensor, ...]: - inputs: hub.Dataset | DatasetEntries - if len(input_tensors) == 1 and isinstance(input_tensors[0], hub.Dataset): - inputs = input_tensors[0] - else: - # Upload dataset - inputs = {} - for name, tensor in zip(self.input_names, input_tensors): - if isinstance(tensor, (list, tuple)): - inputs[name] = [t.detach().numpy() for t in tensor] # type: ignore - else: - inputs[name] = [tensor.detach().numpy()] # type: ignore target_runtime = ( TargetRuntime.QNN if is_qnn_hub_model(self.model) else TargetRuntime.TFLITE ) + # Determine whether I/O is channel last channel_last_input, channel_last_output = "", "" if self.model.producer is not None: model_options = self.model.producer.options.strip().split() @@ -252,14 +255,40 @@ def __call__( channel_last_input = model_options[option_num + 1] if model_options[option_num] == "--force_channel_last_output": channel_last_output = model_options[option_num + 1] - if channel_last_input != "": - inputs = transpose_channel_first_to_last( - channel_last_input, inputs, target_runtime - ) + + assert len(args) > 0, "At least 1 input should be provided for inference." + + dataset: hub.Dataset | DatasetEntries + if isinstance(args[0], hub.Dataset) or isinstance(args[0], Mapping): + # Use the existing provided dataset + assert len(args) == 1, "Only 1 dataset can be provided for inference." 
+ dataset = args[0] + else: + # Create dataset from input tensors + dataset = {} + for name, inputs in zip(self.input_names, args): + if not isinstance(inputs, (list, tuple)): + inputs = [inputs] # type: ignore + + converted_inputs = [] + for input in inputs: + if isinstance(input, np.ndarray): + converted_inputs.append(input) + elif isinstance(input, torch.Tensor): + converted_inputs.append(input.detach().numpy()) + else: + raise NotImplementedError(f"Unknown input type: {str(inputs)}") + dataset[name] = converted_inputs + + # Transpose dataset I/O if necessary to fit with the on-device model format + if channel_last_input: + dataset = transpose_channel_first_to_last( + channel_last_input, dataset, target_runtime + ) inference_job = hub.submit_inference_job( model=self.model, - inputs=inputs, + inputs=dataset, device=self.device, name=f"{self.model.name}_demo_inference", options=self.inference_options, @@ -273,7 +302,7 @@ def __call__( assert output_ds_handle is not None output_dataset = output_ds_handle.download() - if channel_last_output != "": + if channel_last_output: output_dataset = transpose_channel_last_to_first( channel_last_output, output_dataset, # type: ignore diff --git a/qai_hub_models/utils/input_spec.py b/qai_hub_models/utils/input_spec.py index 0944724f..7d325dc7 100644 --- a/qai_hub_models/utils/input_spec.py +++ b/qai_hub_models/utils/input_spec.py @@ -24,7 +24,7 @@ def make_torch_inputs(spec: InputSpec, seed: Optional[int] = 42) -> List[torch.T torch_input = [] generator = None if seed is not None: - generator = torch.Generator() + generator = torch.Generator(device="cpu") generator.manual_seed(seed) for sp in spec.values(): torch_dtype = str_to_torch_dtype(sp[1]) diff --git a/qai_hub_models/utils/measurement.py b/qai_hub_models/utils/measurement.py index cf51c776..2c4a8f21 100644 --- a/qai_hub_models/utils/measurement.py +++ b/qai_hub_models/utils/measurement.py @@ -11,6 +11,7 @@ import numpy as np import qai_hub as hub +from tflite import Model as TFModel # type: ignore def display_with_sig_figs(num: float, num_sig_figs: int = 3) -> str: @@ -74,23 +75,22 @@ def get_tflite_unique_parameters( tensors that point to the same buffers. So, we keep track of all buffers we have counted through tensors. 
""" - from tensorflow.lite.python import schema_py_generated as schema_fb - with open(model_path, "rb") as f: tflite_model = f.read() - model_obj = schema_fb.Model.GetRootAsModel(tflite_model, 0) - model = schema_fb.ModelT.InitFromObj(model_obj) + model = TFModel.GetRootAs(tflite_model, 0) parameter_cnt = 0 buffers_counted = set() - for graph in model.subgraphs: - for tensor in graph.tensors: - buf_index = tensor.buffer - - buffer = model.buffers[buf_index] - if buffer.data is not None: + for i in range(model.SubgraphsLength()): + graph = model.Subgraphs(i) + for j in range(graph.TensorsLength()): + tensor = graph.Tensors(j) + buf_index = tensor.Buffer() + + buffer = model.Buffers(buf_index) + if not buffer.DataIsNone(): if buf_index not in buffers_counted: - parameter_cnt += np.prod(tensor.shape) + parameter_cnt += np.prod(tensor.ShapeAsNumpy()) buffers_counted.add(buf_index) if not as_str: diff --git a/qai_hub_models/utils/model_card.py b/qai_hub_models/utils/model_card.py index 91682146..d9831a77 100644 --- a/qai_hub_models/utils/model_card.py +++ b/qai_hub_models/utils/model_card.py @@ -48,6 +48,7 @@ class ModelRun: model_id: str profile_job_id: str runtime: MODEL_CARD_RUNTIMES + device_type: str def chipset(self) -> Optional[str]: """Chipset the job was run on.""" @@ -62,7 +63,9 @@ def chipset(self) -> Optional[str]: def profile_job(self): """Get the hub.ProfileJob object.""" if len(self.profile_job_id) > 0: - return hub.get_job(self.profile_job_id) + job = hub.get_job(self.profile_job_id) + job.wait() + return job return None def job_status(self) -> str: @@ -77,7 +80,12 @@ def job_status(self) -> str: @property def quantized(self) -> str: """Quantized models are marked so precision can be correctly recorded.""" - return "Yes" if self.model_id.endswith("_quantized") else "No" + return ( + "Yes" + if self.model_id.endswith("Quantized") + or self.model_id.endswith("Quantizable") + else "No" + ) @property def profile_results(self): @@ -163,18 +171,82 @@ def precision(self) -> str: return "fp16" return "null" + def performance_metrics(self) -> Dict[str, Any]: + return dict( + inference_time=self.get_inference_time(), + throughput=self.get_throughput(), + estimated_peak_memory_range=self.get_peak_memory_range(), + primary_compute_unit=self.primary_compute_unit(), + precision=self.precision(), + layer_info=dict( + layers_on_npu=self.npu(), + layers_on_gpu=self.gpu(), + layers_on_cpu=self.cpu(), + total_layers=self.total(), + ), + job_id=self.profile_job_id, + job_status=self.job_status(), + ) + + def reference_device_info(self) -> Dict[str, str]: + """Return a reference ID.""" + REF_DEVICE_MAP = { + "s23": ("qualcomm-snapdragon-8gen2", "Samsung Galaxy S23"), + "s24": ("qualcomm-snapdragon-8gen3", "Samsung Galaxy S24"), + } + chipset = REF_DEVICE_MAP[self.device_type][0] + hub_device = hub.get_devices(REF_DEVICE_MAP[self.device_type][1])[0] + device_name = hub_device.name + os_version = hub_device.os + os_name, form_factor, manufacturer = "", "", "" + for attr in hub_device.attributes: + if attr.startswith("vendor"): + manufacturer = attr.split(":")[-1] + if attr.startswith("format"): + form_factor = attr.split(":")[-1] + if attr.startswith("os"): + os_name = attr.split(":")[-1].capitalize() + chipset = chipset_marketting_name(chipset) + device_info = dict( + name=device_name, + os=os_version, + form_factor=form_factor.capitalize(), + os_name=os_name, + manufacturer=manufacturer.capitalize(), + chipset=chipset, + ) + return device_info + @dataclass class ModelPerf: model_runs: List[ModelRun] 
- def supported_chipsets(self, chips) -> List[str]: + def supported_chipsets(self, chips: List[str]) -> List[str]: """Return all the supported chipsets given the chipset it works on.""" - supported_chips = chips + + # Don't assign "chips" directly to supported_chips. + # The lists will share the same pointer, and hence the for + # loop below will break. + supported_chips = [] + supported_chips.extend(chips) + for chip in chips: + if chip == "qualcomm-snapdragon-8gen3": + supported_chips.extend( + [ + "qualcomm-snapdragon-8gen2", + "qualcomm-snapdragon-8gen1", + "qualcomm-snapdragon-888", + ] + ) if chip == "qualcomm-snapdragon-8gen2": supported_chips.extend( - ["qualcomm-snapdragon-8gen1", "qualcomm-snapdragon-888"] + [ + "qualcomm-snapdragon-8gen3", + "qualcomm-snapdragon-8gen1", + "qualcomm-snapdragon-888", + ] ) if chip == "qualcomm-snapdragon-855": supported_chips.extend( @@ -222,31 +294,6 @@ def supported_oses(self) -> List[str]: """Return all the supported operating systems.""" return ["Android"] - def reference_device_info(self) -> Dict[str, str]: - """Return a reference ID.""" - chipset = "qualcomm-snapdragon-8gen2" - hub_device = hub.get_devices("Samsung Galaxy S23 Ultra")[0] - device_name = hub_device.name - os_version = hub_device.os - os_name, form_factor, manufacturer = "", "", "" - for attr in hub_device.attributes: - if attr.startswith("vendor"): - manufacturer = attr.split(":")[-1] - if attr.startswith("format"): - form_factor = attr.split(":")[-1] - if attr.startswith("os"): - os_name = attr.split(":")[-1].capitalize() - chipset = chipset_marketting_name(chipset) - device_info = dict( - name=device_name, - os=os_version, - form_factor=form_factor.capitalize(), - os_name=os_name, - manufacturer=manufacturer.capitalize(), - chipset=chipset, - ) - return device_info - def performance_metrics(self): """Performance metrics as per model card.""" perf_card = dict() @@ -254,11 +301,14 @@ def performance_metrics(self): # Figure out unique models in various baselines unique_model_ids = [] chips = [] + devices = [] for run in self.model_runs: if run.model_id not in unique_model_ids: unique_model_ids.append(run.model_id) if run.chipset not in chips: chips.append(run.chipset()) + if run.device_type not in devices: + devices.append(run.device_type) perf_card["aggregated"] = dict( supported_oses=self.supported_oses(), @@ -269,36 +319,32 @@ def performance_metrics(self): perf_per_model = [] for mid in unique_model_ids: - perf_per_device = [] # Calculate per data per runtime - perf_per_runtime = dict() + perf_per_device = dict() for run in self.model_runs: if run.model_id == mid: - runtime_name = run.runtime.name.lower() - perf_per_runtime[runtime_name] = dict( - inference_time=run.get_inference_time(), - throughput=run.get_throughput(), - estimated_peak_memory_range=run.get_peak_memory_range(), - primary_compute_unit=run.primary_compute_unit(), - precision=run.precision(), - layer_info=dict( - layers_on_npu=run.npu(), - layers_on_gpu=run.gpu(), - layers_on_cpu=run.cpu(), - total_layers=run.total(), - ), - job_id=run.profile_job_id, - job_status=run.job_status(), - ) - - # Per model, the device used and timestamp for model card - perf_per_runtime["reference_device_info"] = self.reference_device_info() - perf_per_runtime["timestamp"] = datetime.datetime.utcnow().isoformat() + "Z" - - perf_per_device.append(perf_per_runtime) - - perf_model = dict(name=mid, performance_metrics=perf_per_device) - perf_model["name"] = mid + for dev in devices: + if run.device_type == dev: + # perf_per_runtime = 
dict() + if dev not in perf_per_device: + perf_per_device[dev] = dict() + runtime_name = run.runtime.name.lower() + perf_per_device[dev][ + runtime_name + ] = run.performance_metrics() + # Per model, the device used and timestamp for model card + if "reference_device_info" not in perf_per_device[dev]: + perf_per_device[dev][ + "reference_device_info" + ] = run.reference_device_info() + + perf_per_device[dev]["timestamp"] = ( + datetime.datetime.utcnow().isoformat() + "Z" + ) + + perf_model = dict( + name=mid, performance_metrics=list(perf_per_device.values()) + ) perf_per_model.append(perf_model) # Perf card with multiple models diff --git a/qai_hub_models/utils/path_helpers.py b/qai_hub_models/utils/path_helpers.py index e1157e1b..2dc4a50f 100644 --- a/qai_hub_models/utils/path_helpers.py +++ b/qai_hub_models/utils/path_helpers.py @@ -5,11 +5,13 @@ from pathlib import Path from typing import Optional +from qai_hub_models.utils.asset_loaders import load_yaml + MODELS_PACKAGE_NAME = "models" QAIHM_PACKAGE_NAME = "qai_hub_models" -def get_all_models(): +def get_all_models(public_only: bool = False): zoo_root = get_qaihm_models_root() all_models = [] for subdir in zoo_root.iterdir(): @@ -17,6 +19,11 @@ def get_all_models(): continue # Heuristic to see if this is a model we should generate export.py for. if (subdir / "model.py").exists() and (subdir / "test.py").exists(): + if public_only: + if not (subdir / "info.yaml").exists(): + continue + if load_yaml(subdir / "info.yaml").get("status") != "public": + continue all_models.append(subdir.name) return all_models diff --git a/qai_hub_models/utils/perf_summary.py b/qai_hub_models/utils/perf_summary.py index 38202f39..13edb3c6 100644 --- a/qai_hub_models/utils/perf_summary.py +++ b/qai_hub_models/utils/perf_summary.py @@ -70,91 +70,100 @@ def update_summary(self, model_id: str, previous_report, new_report): new_perf_metrics = {} # Create chipset to perf metric - for i in range(len(previous_report["models"])): - for j in range(len(new_report["models"])): - if ( - previous_report["models"][i]["name"] - == new_report["models"][j]["name"] - ): - for prev_metric in previous_report["models"][i][ - "performance_metrics" - ]: - if "chipset" in prev_metric["reference_device_info"]: - ref_device = prev_metric["reference_device_info"]["chipset"] - prev_perf_metrics[ref_device] = prev_metric - - for new_metric in new_report["models"][j]["performance_metrics"]: - if "chipset" in new_metric["reference_device_info"]: - ref_device = new_metric["reference_device_info"]["chipset"] - new_perf_metrics[ref_device] = new_metric - - if len(prev_perf_metrics) == 0 or len(new_perf_metrics) == 0: - self.empty_perf_report.append((model_id,)) - - for device in prev_perf_metrics.keys(): - device_info = prev_perf_metrics[device]["reference_device_info"] - if device_info["os_name"] not in self.tracked_oses: - continue - - # Case 3: Chipset is missing in new data - if device not in new_perf_metrics: - self.missing_devices.append((model_id, device)) - continue - - for runtime_type in RUNTIMES_TO_COMPARE: - prev_inference_time = prev_perf_metrics[device][runtime_type][ - "inference_time" - ] - new_inference_time = new_perf_metrics[device][runtime_type][ - "inference_time" - ] - if new_inference_time == prev_inference_time: + if previous_report is not None and new_report is not None: + for i in range(len(previous_report["models"])): + for j in range(len(new_report["models"])): + if ( + previous_report["models"][i]["name"] + == new_report["models"][j]["name"] + ): + for 
prev_metric in previous_report["models"][i][ + "performance_metrics" + ]: + if "chipset" in prev_metric["reference_device_info"]: + ref_device = prev_metric["reference_device_info"][ + "chipset" + ] + prev_perf_metrics[ref_device] = prev_metric + + for new_metric in new_report["models"][j][ + "performance_metrics" + ]: + if "chipset" in new_metric["reference_device_info"]: + ref_device = new_metric["reference_device_info"][ + "chipset" + ] + new_perf_metrics[ref_device] = new_metric + + if len(prev_perf_metrics) == 0 or len(new_perf_metrics) == 0: + self.empty_perf_report.append((model_id,)) + + for device in prev_perf_metrics.keys(): + device_info = prev_perf_metrics[device]["reference_device_info"] + if device_info["os_name"] not in self.tracked_oses: continue - if new_inference_time == "null" or prev_inference_time == "null": - # Case 1: Model either failed to infer or had a successful run - summary_entry = ( - model_id, - runtime_type, - "inf", - self._format_speedup(new_inference_time), - self._format_speedup(prev_inference_time), - device_info["chipset"], - device_info["os"], - ) - - if new_inference_time == "null": - self.regressions["inf"].append(summary_entry) - else: - self.progressions["inf"].append(summary_entry) + # Case 3: Chipset is missing in new data + if device not in new_perf_metrics: + self.missing_devices.append((model_id, device)) continue - # Case 2: Bucketize speedup difference - progression_speedup = float(prev_inference_time) / float( - new_inference_time - ) - regression_speedup = float(new_inference_time) / float( - prev_inference_time - ) - is_progression = progression_speedup >= 1 - speedup = progression_speedup if is_progression else regression_speedup - - for bucket in self.perf_buckets[1:]: - if bucket <= speedup: - summary = ( + for runtime_type in RUNTIMES_TO_COMPARE: + prev_inference_time = prev_perf_metrics[device][runtime_type][ + "inference_time" + ] + new_inference_time = new_perf_metrics[device][runtime_type][ + "inference_time" + ] + if new_inference_time == prev_inference_time: + continue + + if new_inference_time == "null" or prev_inference_time == "null": + # Case 1: Model either failed to infer or had a successful run + summary_entry = ( model_id, runtime_type, - self._format_speedup(speedup), + "inf", self._format_speedup(new_inference_time), self._format_speedup(prev_inference_time), device_info["chipset"], device_info["os"], ) - if is_progression: - self.progressions[bucket].append(summary) + + if new_inference_time == "null": + self.regressions["inf"].append(summary_entry) else: - self.regressions[bucket].append(summary) - break + self.progressions["inf"].append(summary_entry) + continue + + # Case 2: Bucketize speedup difference + progression_speedup = float(prev_inference_time) / float( + new_inference_time + ) + regression_speedup = float(new_inference_time) / float( + prev_inference_time + ) + is_progression = progression_speedup >= 1 + speedup = ( + progression_speedup if is_progression else regression_speedup + ) + + for bucket in self.perf_buckets[1:]: + if bucket <= speedup: + summary = ( + model_id, + runtime_type, + self._format_speedup(speedup), + self._format_speedup(new_inference_time), + self._format_speedup(prev_inference_time), + device_info["chipset"], + device_info["os"], + ) + if is_progression: + self.progressions[bucket].append(summary) + else: + self.regressions[bucket].append(summary) + break def _get_summary_table(self, bucket_id, get_progressions=True): """ diff --git a/qai_hub_models/utils/printing.py 
b/qai_hub_models/utils/printing.py index 2e20827f..5efd51ec 100644 --- a/qai_hub_models/utils/printing.py +++ b/qai_hub_models/utils/printing.py @@ -9,7 +9,7 @@ import numpy as np import qai_hub as hub from prettytable import PrettyTable -from qai_hub.client import SourceModelType +from qai_hub.client import DatasetEntries, SourceModelType from tabulate import tabulate from qai_hub_models.utils.base_model import TargetRuntime @@ -22,7 +22,7 @@ def print_inference_metrics( inference_job: hub.InferenceJob, - inference_result: Dict[str, List[np.ndarray]], + inference_result: DatasetEntries, torch_out: List[np.ndarray], outputs_to_skip: Optional[List[int]] = None, metrics: str = "psnr", diff --git a/qai_hub_models/utils/qai_hub_helpers.py b/qai_hub_models/utils/qai_hub_helpers.py index c44f6085..89deb1d6 100644 --- a/qai_hub_models/utils/qai_hub_helpers.py +++ b/qai_hub_models/utils/qai_hub_helpers.py @@ -5,7 +5,8 @@ from __future__ import annotations import os -from typing import Any, Dict, List, Union +from pathlib import Path +from typing import Dict, List import numpy as np import qai_hub as hub @@ -20,8 +21,8 @@ def transpose_channel( io_names: str, - inputs: Union[hub.Dataset, Dict[str, Any]], - target_runtime: "TargetRuntime", + inputs: hub.client.DatasetEntries, + target_runtime: TargetRuntime, first_to_last: bool, ): @@ -29,7 +30,6 @@ def transpose_channel( io_names_list = io_names.strip().split(",") target = dict() - assert isinstance(inputs, dict) for name, array in inputs.items(): if len(array[0].shape) < min_dim or len(array[0].shape) > 5: target[name] = array @@ -47,16 +47,16 @@ def transpose_channel( def transpose_channel_first_to_last( io_names: str, - sample_inputs: Union[hub.Dataset, Dict[str, Any]], - target_runtime: "TargetRuntime", + sample_inputs: hub.client.DatasetEntries, + target_runtime: TargetRuntime, ) -> Dict[str, List[np.ndarray]]: return transpose_channel(io_names, sample_inputs, target_runtime, True) def transpose_channel_last_to_first( io_names: str, - job_outputs: Union[hub.Dataset, Dict[str, Any]], - target_runtime: "TargetRuntime", + job_outputs: hub.client.DatasetEntries, + target_runtime: TargetRuntime, ) -> Dict[str, List[np.ndarray]]: return transpose_channel(io_names, job_outputs, target_runtime, False) @@ -85,12 +85,12 @@ def export_without_hub_access( skip_inferencing: bool, skip_downloading: bool, skip_summary: bool, - output_path: str, + output_path: str | Path, target_runtime: TargetRuntime, compile_options: str, profile_options: str, components: List[str] | None = None, -) -> List[str] | None: +) -> List[str]: print(_WARNING_DASH) print( f"Unable to find a valid API token for {_AIHUB_NAME}. 
Using results from a previous job run on the same device.\n" diff --git a/qai_hub_models/utils/qnn_helpers.py b/qai_hub_models/utils/qnn_helpers.py index 51d9e255..d437e6df 100644 --- a/qai_hub_models/utils/qnn_helpers.py +++ b/qai_hub_models/utils/qnn_helpers.py @@ -6,11 +6,12 @@ import json from pathlib import Path -from typing import Dict, List +from typing import Dict -import torch from qai_hub.client import Job, Model, SourceModelType +from qai_hub_models.models.common import SampleInputsType + def onnx_elem_type_to_str(elem_type: int) -> str: if elem_type == 1: @@ -33,7 +34,7 @@ def load_encodings(output_path: Path, model_name: str) -> Dict: return encodings["activation_encodings"] -def get_qnn_inputs(compile_job: Job, sample_inputs: Dict[str, List[torch.Tensor]]): +def get_qnn_inputs(compile_job: Job, sample_inputs: SampleInputsType): compile_job.target_shapes return dict(zip(compile_job.target_shapes.keys(), sample_inputs.values())) diff --git a/qai_hub_models/utils/quantization_aimet.py b/qai_hub_models/utils/quantization_aimet.py index 40842fa1..56517e76 100644 --- a/qai_hub_models/utils/quantization_aimet.py +++ b/qai_hub_models/utils/quantization_aimet.py @@ -2,12 +2,24 @@ # Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. # SPDX-License-Identifier: BSD-3-Clause # --------------------------------------------------------------------- +""" +Items defined in this file require that AIMET be installed. +""" from __future__ import annotations +import logging +import os + try: + from aimet_common.utils import AimetLogger # type: ignore from aimet_torch import onnx_utils from aimet_torch.qc_quantize_op import QcQuantizeWrapper from aimet_torch.quantsim import QuantizationSimModel + from aimet_torch.tensor_quantizer import StaticGridPerTensorQuantizer + + # Suppress aimet info logs within zoo + if not os.environ.get("SHOW_AIMET_LOGS"): + AimetLogger.set_level_for_all_areas(logging.WARN) except (ImportError, ModuleNotFoundError): raise NotImplementedError( "AIMET must be installed to load quantized models. " @@ -16,11 +28,10 @@ "https://quic.github.io/aimet-pages/releases/latest/install/index.html" ) -import os import shutil import tempfile from pathlib import Path -from typing import Any +from typing import Any, List from zipfile import ZipFile import torch @@ -31,26 +42,80 @@ _DataLoader, _for_each_batch, ) -from qai_hub_models.utils.base_model import ( - BaseModel, - InputSpec, - SourceModelFormat, - TargetRuntime, +from qai_hub_models.models._shared.common import apply_module_function_recursively +from qai_hub_models.models.common import SourceModelFormat, TargetRuntime +from qai_hub_models.models.protocols import ( + EvalModelProtocol, + HubModelProtocol, + QuantizableModelProtocol, ) -from qai_hub_models.utils.input_spec import make_torch_inputs +from qai_hub_models.utils.input_spec import InputSpec, make_torch_inputs + + +def tie_aimet_observer_groups(groups: List[List[Any]]): + """ + This defines groups of ops that all should use the same output + quantizer observer. The input groups is a list of lists, where the + inner lists contain op references that should all use the same output + quantizer. Each op should have an `output_quantizers` member. 
+
+    Example:
+
+        groups = [
+            [
+                sim.model.net.maxpool2,
+                sim.model.net.Mixed_5b.module_avg_pool2d,
+            ],
+        ]
+        tie_aimet_observer_groups(groups)
+    """
+    for group in groups:
+        output_quantizer = group[0].output_quantizers[0]
+        for op in group[1:]:
+            op.output_quantizers[0] = output_quantizer
+
+
+def convert_all_depthwise_to_per_tensor(module):
+    """
+    This recursively iterates a PyTorch module (that has been prepared by
+    AIMET for quantization) and replaces the weight quantizers with a
+    per-tensor quantizer for all depthwise convolutions. All parameters
+    (bitwidth, round_mode, etc.) are copied over from the existing quantizer.
+    """
+    # Please see #9842 for context
+    def convert_depthwise_to_per_tensor(op, parent_module, name):
+        # Only convert depthwise
+        if op.groups > 1 and op.out_channels == op.groups:
+            quantizers = parent_module.param_quantizers
+            for key in ["weight", "bias"]:
+                quantizer = quantizers[key]
+                quantizers[key] = StaticGridPerTensorQuantizer(
+                    bitwidth=quantizer.bitwidth,
+                    round_mode=quantizer.round_mode,
+                    quant_scheme=quantizer.quant_scheme,
+                    use_symmetric_encodings=quantizer.use_symmetric_encodings,
+                    enabled_by_default=quantizer.enabled,
+                )
+
+    apply_module_function_recursively(
+        module, torch.nn.Conv2d, convert_depthwise_to_per_tensor
+    )

-class AIMETQuantizableMixin:
+
+class AIMETQuantizableMixin(HubModelProtocol, QuantizableModelProtocol):
     """
-    This mixin provides quantization support with Qualcomm's AIMET package.
+    Mixin that allows a model to be quantized & exported to disk using AIMET.
+
+    Inheritor must implement HubModel for this mixin to function.
     """
 
     def __init__(
         self,
-        sim_model: QuantizationSimModel,
+        quant_sim: QuantizationSimModel,
         needs_onnx_direct_aimet_export: bool = False,
     ):
-        self.quant_sim = sim_model
+        self.quant_sim = quant_sim
         self.needs_onnx_direct_aimet_export = needs_onnx_direct_aimet_export
 
     def preferred_hub_source_model_format(
@@ -70,7 +135,7 @@ def quantize(
         requantize_model_weights=False,
     ) -> float | None:
         """
-        Re-compute quantization encodings for this model with the given dataset and model evaluator.
+        Compute quantization encodings for this model with the given dataset and model evaluator.
 
         This model will be updated with a new set of quantization parameters. Future calls to
         forward() and export_...() will take these quantization parameters into account.
@@ -104,8 +169,7 @@ def quantize(
         Returns:
             If an evaluator is provided, returns its accuracy score. No return value otherwise.
         """
-        assert isinstance(self, BaseModel)
-        if not evaluator:
+        if not evaluator and isinstance(self, EvalModelProtocol):
             evaluator = self.get_evaluator()
 
         # Enable or disable quantization for model parameters (model weights).
@@ -149,45 +213,36 @@ def convert_to_torchscript_and_aimet_encodings(
         input_spec: InputSpec | None = None,
         model_name: str | None = None,
     ) -> str:
-        """
-        Converts the torch module to a zip file containing an
-        unquantized torchscript trace and an aimet quantization encodings file.
- """ if model_name is None: model_name = self.__class__.__name__ if not input_spec: - input_spec = self._get_input_spec_ts() + input_spec = self.get_input_spec() os.makedirs(output_dir, exist_ok=True) zip_path = os.path.join(output_dir, f"{model_name}.aimet.zip") base_dir = Path(f"{model_name}.aimet") - base_path = Path(output_dir) / base_dir - if base_path.exists(): - shutil.rmtree(base_path) - os.makedirs(base_path) - self.quant_sim.export( - str(base_path), - model_name, - tuple(make_torch_inputs(input_spec)), - export_to_torchscript=True, - ) - - # AIMET exports GraphModule. Convert it to ScriptModule - fx_graph_path = base_path / f"{model_name}.pth" - fx_graph = torch.load(fx_graph_path) - script_module = torch.jit.trace(fx_graph, tuple(make_torch_inputs(input_spec))) - torch.jit.save(script_module, base_path / f"{model_name}.pt") - - with ZipFile(zip_path, "w") as zip_object: - zip_object.write(base_path, base_dir) - zip_object.write( - base_path / f"{model_name}.pt", base_dir / f"{model_name}.pt" - ) - zip_object.write( - base_path / f"{model_name}_torch.encodings", - base_dir / f"{model_name}_torch.encodings", + + with tempfile.TemporaryDirectory() as tmpdir: + base_path = Path(tmpdir) / base_dir + os.makedirs(base_path) + self.quant_sim.export( + str(base_path), + model_name, + tuple(make_torch_inputs(input_spec)), + export_to_torchscript=True, ) + with ZipFile(zip_path, "w") as zip_object: + zip_object.write(base_path, base_dir) + zip_object.write( + base_path / f"{model_name}.torchscript.pth", + base_dir / f"{model_name}.pt", + ) + zip_object.write( + base_path / f"{model_name}_torch.encodings", + base_dir / f"{model_name}_torch.encodings", + ) + return zip_path def convert_to_onnx_and_aimet_encodings( @@ -203,52 +258,45 @@ def convert_to_onnx_and_aimet_encodings( if model_name is None: model_name = self.__class__.__name__ if not input_spec: - input_spec = self._get_input_spec_ts() + input_spec = self.get_input_spec() os.makedirs(output_dir, exist_ok=True) zip_path = os.path.join(output_dir, f"{model_name}.aimet.zip") base_dir = Path(f"{model_name}.aimet") - base_path = Path(output_dir) / base_dir - if base_path.exists(): - shutil.rmtree(base_path) - os.makedirs(base_path) - - onnx_utils.EXPORT_TO_ONNX_DIRECT = self.needs_onnx_direct_aimet_export - self.quant_sim.export( - str(base_path), - model_name, - tuple(make_torch_inputs(input_spec)), - onnx_export_args=dict(input_names=[name for name in input_spec]), - ) - - onnx_file_name = f"{model_name}.onnx" - encodings_file_name = f"{model_name}.encodings" - with ZipFile(zip_path, "w") as zip_object: - zip_object.write(base_path, base_dir) - zip_object.write( - base_path / onnx_file_name, os.path.join(base_dir, onnx_file_name) - ) - zip_object.write( - base_path / encodings_file_name, - os.path.join(base_dir, encodings_file_name), + + with tempfile.TemporaryDirectory() as tmpdir: + base_path = Path(tmpdir) / base_dir + if base_path.exists(): + shutil.rmtree(base_path) + os.makedirs(base_path) + + onnx_utils.EXPORT_TO_ONNX_DIRECT = self.needs_onnx_direct_aimet_export + self.quant_sim.export( + str(base_path), + model_name, + tuple(make_torch_inputs(input_spec)), + onnx_export_args=dict(input_names=[name for name in input_spec]), ) - return zip_path + onnx_file_name = f"{model_name}.onnx" + encodings_file_name = f"{model_name}.encodings" + with ZipFile(zip_path, "w") as zip_object: + zip_object.write(base_path, base_dir) + zip_object.write( + base_path / onnx_file_name, os.path.join(base_dir, onnx_file_name) + ) + zip_object.write( + 
base_path / encodings_file_name, + os.path.join(base_dir, encodings_file_name), + ) - def convert_to_torchscript(*args, **kwargs): - """Block users from calling convert_to_torchscript() on quantized models, since python will call both parent classes.""" - raise NotImplementedError( - "This model is quantized. Use `model.convert_to_quantized_torchscript` instead!" - ) + return zip_path - def convert_to_quantized_torchscript( + def convert_to_torchscript( self, input_spec: InputSpec | None = None, check_trace: bool = True ) -> Any: - """ - Converts the torch module to a quantized torchscript trace. - """ if not input_spec: - input_spec = self._get_input_spec_ts() + input_spec = self.get_input_spec() with tempfile.TemporaryDirectory() as tempdir: self.quant_sim.export( @@ -267,26 +315,8 @@ def get_calibration_data( ) -> DatasetEntries | None: """ Calibration dataset for this model and input spec. - Default behavior is randomized input in range [0, 1]. """ if not input_spec: - input_spec = self._get_input_spec_ts() + input_spec = self.get_input_spec() inputs = make_torch_inputs(input_spec) return {k: v.numpy() for k, v in zip(input_spec.keys(), inputs)} - - def _get_input_spec_ts(self, *args, **kwargs) -> InputSpec: - """Type safe version of get_input_spec.""" - assert isinstance(self, BaseModel) - return self.get_input_spec(*args, **kwargs) - - -class HubCompileOptionsInt8Mixin: - def get_hub_compile_options( - self, - target_runtime: TargetRuntime, - other_compile_options: str = "", - ) -> str: - compile_options = super().get_hub_compile_options( # type: ignore - target_runtime, other_compile_options - ) - return compile_options + " --quantize_full_type int8 --quantize_io" diff --git a/scripts/build_and_test.py b/scripts/build_and_test.py index 78a7a6af..28dc8875 100755 --- a/scripts/build_and_test.py +++ b/scripts/build_and_test.py @@ -311,8 +311,19 @@ def test_changed_models( # but no model definitions actually changed. That means this was a mass-change # to the export scripts. # - # Just use 1 model as a sample to test the export. This makes CI significantly faster. - export_models = set([next(iter(export_changed_models))]) + # Test a representative set of models. + # One regular model, one aimet, one components, and one non-image input. + # These are among the smallest instances of each of these. + # If none of these models were changed, test one model. 
+ representative_set = [ + "sinet", + "quicksrnetsmall_quantized", + "mediapipe_face", + "facebook_denoiser", + ] + export_models = export_changed_models & set(representative_set) + if len(export_models) == 0: + export_models = set([next(iter(export_changed_models))]) else: export_models = set() @@ -333,6 +344,7 @@ def test_changed_models( self.venv_path, venv_for_each_model=False, use_shared_cache=True, + test_trace=False, ), ) @@ -373,6 +385,18 @@ def test_all_models(self, plan: Plan, step_id: str = "test_all_models") -> str: ), ) + @public_task("Generate perf.yamls.") + @depends(["install_deps"]) + def create_perfs(self, plan: Plan, step_id: str = "generate_perfs") -> str: + return plan.add_step( + step_id, + RunCommandsWithVenvTask( + group_name=None, + venv=self.venv_path, + commands=["python qai_hub_models/scripts/generate_perf_yaml.py --all"], + ), + ) + @public_task("Run profile jobs for all models in Model Zoo.") @depends(["install_deps"]) def test_profile_all_models( @@ -480,6 +504,21 @@ def release(self, plan: Plan, step_id: str = "release") -> str: ), ) + @public_task("Push QAIHM Code (build repo & wheel, push repo)") + @depends(["install_deps"]) + def release_code(self, plan: Plan, step_id: str = "release_code") -> str: + return plan.add_step( + step_id, + ReleaseTask( + self.venv_path, + self.python_executable, + build_repository=True, + push_repository=True, + build_wheel=False, + publish_wheel=False, + ), + ) + @public_task("Mock Release QAIHM (build repo & wheel, but do not push them)") @depends(["install_deps"]) def mock_release(self, plan: Plan, step_id: str = "mock_release") -> str: diff --git a/scripts/ci/git-credential-helper.sh b/scripts/ci/git-credential-helper.sh index 1a294a88..1baff120 100644 --- a/scripts/ci/git-credential-helper.sh +++ b/scripts/ci/git-credential-helper.sh @@ -1,3 +1,7 @@ #!/bin/bash +# --------------------------------------------------------------------- +# Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. +# SPDX-License-Identifier: BSD-3-Clause +# --------------------------------------------------------------------- echo username="$GIT_USER" echo password="$GIT_PASSWORD" diff --git a/scripts/quantize_ffnet.py b/scripts/examples/quantize_ffnet.py similarity index 100% rename from scripts/quantize_ffnet.py rename to scripts/examples/quantize_ffnet.py diff --git a/scripts/examples/quantize_imagenet_classifier.py b/scripts/examples/quantize_imagenet_classifier.py index 45fb88ce..79d1e063 100644 --- a/scripts/examples/quantize_imagenet_classifier.py +++ b/scripts/examples/quantize_imagenet_classifier.py @@ -8,13 +8,43 @@ This script assumes the model is added to QAIHM, but is missing quantization parameters. 
""" import argparse -import importlib from pathlib import Path import torch from torch.utils.data import DataLoader from qai_hub_models.datasets.imagenette import ImagenetteDataset +from qai_hub_models.models.googlenet_quantized.model import GoogLeNetQuantizable +from qai_hub_models.models.inception_v3_quantized.model import InceptionNetV3Quantizable +from qai_hub_models.models.mobilenet_v2_quantized.model import MobileNetV2Quantizable +from qai_hub_models.models.mobilenet_v3_large_quantized.model import ( + MobileNetV3LargeQuantizable, +) +from qai_hub_models.models.regnet_quantized.model import RegNetQuantizable +from qai_hub_models.models.resnet18_quantized.model import ResNet18Quantizable +from qai_hub_models.models.resnet50_quantized.model import ResNet50Quantizable +from qai_hub_models.models.resnet101_quantized.model import ResNet101Quantizable +from qai_hub_models.models.resnext50_quantized.model import ResNeXt50Quantizable +from qai_hub_models.models.resnext101_quantized.model import ResNeXt101Quantizable +from qai_hub_models.models.shufflenet_v2_quantized.model import ShufflenetV2Quantizable +from qai_hub_models.models.squeezenet1_1_quantized.model import SqueezeNetQuantizable +from qai_hub_models.models.wideresnet50_quantized.model import WideResNet50Quantizable + +CLASSIFIERS = { + "googlenet": GoogLeNetQuantizable, + "inception_v3": InceptionNetV3Quantizable, + "mobilenet_v2": MobileNetV2Quantizable, + "mobilenet_v3_large": MobileNetV3LargeQuantizable, + "regnet": RegNetQuantizable, + "resnet101": ResNet101Quantizable, + "resnet18": ResNet18Quantizable, + "resnet50": ResNet50Quantizable, + "resnext50": ResNeXt50Quantizable, + "resnext101": ResNeXt101Quantizable, + "shufflenet_v2": ShufflenetV2Quantizable, + "squeezenet1_1": SqueezeNetQuantizable, + "wideresnet50": WideResNet50Quantizable, +} if __name__ == "__main__": parser = argparse.ArgumentParser() @@ -42,8 +72,9 @@ parser.add_argument( "--model", type=str, + choices=CLASSIFIERS.keys(), required=True, - help="Name of the model folder to compute encodings.", + help="Name of the model to quantize.", ) parser.add_argument( "--seed", @@ -52,17 +83,17 @@ help="Manual seed to ensure reproducibility for quantization.", ) args = parser.parse_args() - module = importlib.import_module(f"qai_hub_models.models.{args.model}") + ImageNetClassifier_cls = CLASSIFIERS[args.model] dataset = ImagenetteDataset() torch.manual_seed(args.seed) dataloader = DataLoader(dataset, batch_size=args.batch_size, shuffle=True) - model = module.Model.from_pretrained(aimet_encodings=None) + model = ImageNetClassifier_cls.from_pretrained(aimet_encodings=None) accuracy = model.quantize(dataloader, args.num_iter, model.get_evaluator()) print(f"Accuracy: {accuracy * 100:.3g}%") output_path = args.output_dir or str(Path() / "build") - output_name = args.output_name or f"{module.MODEL_ID}_encodings" + output_name = args.output_name or f"{args.model}_quantized_encodings" model.quant_sim.save_encodings_to_json(output_path, output_name) diff --git a/scripts/examples/test_numerics_imagenet_classifier_quantized.py b/scripts/examples/test_numerics_imagenet_classifier_quantized.py new file mode 100644 index 00000000..3416382a --- /dev/null +++ b/scripts/examples/test_numerics_imagenet_classifier_quantized.py @@ -0,0 +1,306 @@ +# --------------------------------------------------------------------- +# Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. 
+# SPDX-License-Identifier: BSD-3-Clause +# --------------------------------------------------------------------- +""" +Run it with pytest --on-device +""" +from typing import Tuple + +import numpy as np +import pytest +import qai_hub as hub +import torch +from torch.utils.data import DataLoader, random_split +from tqdm import tqdm + +from qai_hub_models.datasets.imagenette import ImagenetteDataset +from qai_hub_models.models._shared.imagenet_classifier.model import ImagenetClassifier +from qai_hub_models.models.mobilenet_v2_quantized.model import MobileNetV2Quantizable +from qai_hub_models.models.mobilenet_v3_large_quantized.model import ( + MobileNetV3LargeQuantizable, +) +from qai_hub_models.models.regnet_quantized.model import RegNetQuantizable +from qai_hub_models.models.resnet18_quantized.model import ResNet18Quantizable +from qai_hub_models.models.resnet50_quantized.model import ResNet50Quantizable +from qai_hub_models.models.resnet101_quantized.model import ResNet101Quantizable +from qai_hub_models.models.resnext50_quantized.model import ResNeXt50Quantizable +from qai_hub_models.models.resnext101_quantized.model import ResNeXt101Quantizable +from qai_hub_models.models.shufflenet_v2_quantized.model import ShufflenetV2Quantizable +from qai_hub_models.models.squeezenet1_1_quantized.model import SqueezeNetQuantizable +from qai_hub_models.models.wideresnet50_quantized.model import WideResNet50Quantizable +from qai_hub_models.utils.base_model import SourceModelFormat, TargetRuntime +from qai_hub_models.utils.inference import compile_zoo_model_to_hub +from qai_hub_models.utils.measurement import get_model_size_mb + + +def on_device(func): + # Skip tests if '--on-device' is not in the command line arguments + return pytest.mark.skipif( + "'--on-device' not in sys.argv", reason="needs --on-device option to run" + )(func) + + +@pytest.fixture(scope="module") +def data_loaders(): + dataset = ImagenetteDataset() + calib_len = int(0.1 * len(dataset)) + test_len = len(dataset) - calib_len + # Deterministic random split + calib_dataset, test_dataset = random_split( + dataset, [calib_len, test_len], generator=torch.Generator().manual_seed(42) + ) + calib_loader = DataLoader(calib_dataset, batch_size=32, shuffle=False) + test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False) + return calib_loader, test_loader + + +@pytest.fixture(scope="module") +def test_data(data_loaders) -> Tuple[torch.Tensor, torch.Tensor, hub.Dataset]: + calib_loader, test_loader = data_loaders + num_test = 1000 + + img_batches, label_batches = [], [] + total_samples = 0 + for images, labels in tqdm(test_loader): + img_batches.append(images) + label_batches.append(labels) + total_samples += images.size(0) + if total_samples >= 1000: + break + img_test = torch.cat(img_batches, dim=0)[:num_test] + label_test = torch.cat(label_batches, dim=0)[:num_test] + input_name = list(ImagenetClassifier.get_input_spec().keys())[0] + data_entries = {input_name: np.split(img_test.numpy(), img_test.shape[0])} + hub_ds = hub.upload_dataset(data_entries) + return img_test, label_test, hub_ds + + +def test_dataloader_is_deterministic(data_loaders): + """Test that the calibration-test split and the loading are deterministic""" + calib_loader, test_loader = data_loaders + img, labels = next(iter(calib_loader)) + expected_calib_labels = [701, 569, 482, 571, 482] + assert labels[:5].tolist() == expected_calib_labels + + expected_test_labels = [569, 0, 217, 571, 701] + img, labels = next(iter(test_loader)) + assert labels[:5].tolist() == 
expected_test_labels + + +@pytest.fixture( + scope="module", + params=[ + # Class, Calibration accuracy, AIMET accuracy + (MobileNetV2Quantizable, 0.8021, 0.8100), + (MobileNetV3LargeQuantizable, 0.8438, 0.8550), + (ResNet18Quantizable, 0.8021, 0.8010), + (ResNet50Quantizable, 0.8229, 0.8520), + (ResNet101Quantizable, 0.8125, 0.8530), + (ResNeXt50Quantizable, 0.8333, 0.8880), + (ResNeXt101Quantizable, 0.8542, 0.9250), + (SqueezeNetQuantizable, 0.6042, 0.6410), + (RegNetQuantizable, 0.8229, 0.8750), + (WideResNet50Quantizable, 0.8958, 0.9190), + (ShufflenetV2Quantizable, 0.7083, 0.6740), + ], +) +def quantized_model(request, data_loaders, test_data): + """ + Create encoding from calibration data and returned quantized model with + validated off-target accuracy computed on QuantSim + """ + img_test, label_test, hub_dataset = test_data + calib_loader, test_loader = data_loaders + model_cls, target_calib_acc, target_sim_acc = request.param + model = model_cls.from_pretrained(aimet_encodings=None) + + # Calibration in quantization + num_calib_batches = 3 + calib_accuracy = model.quantize( + calib_loader, num_calib_batches, evaluator=model.get_evaluator() + ) + print(f"{model_cls=}, {calib_accuracy=}") + np.testing.assert_allclose(target_calib_acc, calib_accuracy, atol=0.01) + + # QuantSim evaluation on eval set + evaluator = model.get_evaluator() + + batch_size = 32 + for i in tqdm(list(range(0, img_test.size(0), batch_size)), desc="QuantSim eval"): + img_batch = img_test[i : i + batch_size] + label_batch = label_test[i : i + batch_size] + + sim_out = model(img_batch).detach() + evaluator.add_batch(sim_out, label_batch) + + sim_acc = evaluator.get_accuracy_score() + print(f"{model_cls=}, {sim_acc=}") + np.testing.assert_allclose(target_sim_acc, sim_acc, atol=0.01) + return model + + +@on_device +@pytest.mark.parametrize( + "source_model_format,target_runtime,hub_needs_calib_data", + [ + (SourceModelFormat.ONNX, TargetRuntime.TFLITE, False), + (SourceModelFormat.ONNX, TargetRuntime.QNN, False), + ], +) +def test_make_encoding_w8a8_accuracy( + source_model_format, + target_runtime, + hub_needs_calib_data, + test_data, + quantized_model, + data_loaders, +): + """ + 1. Export and compile quantized_model on Hub. + 2. Run inference on Hub on test. + + Note: We don't run profile job to get perf here but leave that to the score card. 
+ """ + model = quantized_model + + expected_size_mb_and_acc = { + (SourceModelFormat.ONNX, TargetRuntime.TFLITE, MobileNetV2Quantizable): ( + 3.64, + 0.784, + ), + (SourceModelFormat.ONNX, TargetRuntime.QNN, MobileNetV2Quantizable): ( + 4.02, + 0.790, + ), + (SourceModelFormat.ONNX, TargetRuntime.TFLITE, MobileNetV3LargeQuantizable): ( + 5.79, + 0.859, + ), + (SourceModelFormat.ONNX, TargetRuntime.QNN, MobileNetV3LargeQuantizable): ( + None, # Fails to convert (AISW-87206) + None, + ), + (SourceModelFormat.ONNX, TargetRuntime.TFLITE, ResNet18Quantizable): ( + 11.30, + 0.778, + ), + (SourceModelFormat.ONNX, TargetRuntime.QNN, ResNet18Quantizable): ( + 11.61, + 0.789, + ), + (SourceModelFormat.ONNX, TargetRuntime.TFLITE, ResNet50Quantizable): ( + 25.09, + 0.837, + ), + (SourceModelFormat.ONNX, TargetRuntime.QNN, ResNet50Quantizable): ( + 25.33, + 0.834, + ), + (SourceModelFormat.ONNX, TargetRuntime.TFLITE, ResNet101Quantizable): ( + 43.89, + 0.827, + ), + (SourceModelFormat.ONNX, TargetRuntime.QNN, ResNet101Quantizable): ( + 44.08, + 0.831, + ), + (SourceModelFormat.ONNX, TargetRuntime.TFLITE, ResNeXt50Quantizable): ( + 24.77, + 0.888, + ), + (SourceModelFormat.ONNX, TargetRuntime.QNN, ResNeXt50Quantizable): ( + 24.96, + 0.888, + ), + (SourceModelFormat.ONNX, TargetRuntime.TFLITE, ResNeXt101Quantizable): ( + 87.29, + 0.906, + ), + (SourceModelFormat.ONNX, TargetRuntime.QNN, ResNeXt101Quantizable): ( + 87.11, + None, # Fails to infer (#9827) + ), + (SourceModelFormat.ONNX, TargetRuntime.TFLITE, SqueezeNetQuantizable): ( + 1.30, + 0.609, + ), + (SourceModelFormat.ONNX, TargetRuntime.QNN, SqueezeNetQuantizable): ( + 1.66, + 0.609, + ), + (SourceModelFormat.ONNX, TargetRuntime.TFLITE, RegNetQuantizable): ( + 15.43, + 0.859, + ), + (SourceModelFormat.ONNX, TargetRuntime.QNN, RegNetQuantizable): ( + 15.77, + 0.859, + ), + (SourceModelFormat.ONNX, TargetRuntime.TFLITE, WideResNet50Quantizable): ( + 66.59, + 0.900, + ), + (SourceModelFormat.ONNX, TargetRuntime.QNN, WideResNet50Quantizable): ( + 66.78, + 0.897, + ), + (SourceModelFormat.ONNX, TargetRuntime.TFLITE, ShufflenetV2Quantizable): ( + 1.47, + 0.661, + ), + (SourceModelFormat.ONNX, TargetRuntime.QNN, ShufflenetV2Quantizable): ( + 1.90, + 0.661, + ), + } + expected_size_mb, expected_acc = expected_size_mb_and_acc[ + (source_model_format, target_runtime, model.__class__) + ] + if expected_size_mb is None: + pytest.skip("Fails to compile") + + img_test, label_test, hub_dataset = test_data + calib_loader, test_loader = data_loaders + + # calibration data + calibration_data = None + if hub_needs_calib_data: + # AIMET export has missing encoding and needs calibration data + num_calib_batches = 3 + calib_imgs = [] + for b, (img_calib, labels) in enumerate(iter(calib_loader)): + if b >= num_calib_batches: + break + img_np = img_calib.numpy() + calib_imgs.extend(np.split(img_np, img_np.shape[0])) + calibration_data = {list(model.get_input_spec().keys())[0]: calib_imgs} + + # On-device inference + device = hub.Device("Samsung Galaxy S23") + hub_model = compile_zoo_model_to_hub( + model=model, + source_model_format=source_model_format, + device=device, + target_runtime=target_runtime, + calibration_data=calibration_data, + ) + + # Make sure model is quantized + tgt_model_size_mb = get_model_size_mb(hub_model.model) + model_cls = quantized_model.__class__ + print( + f"{model_cls=}, {source_model_format=}, {target_runtime=}, {tgt_model_size_mb=}" + ) + np.testing.assert_allclose(expected_size_mb, tgt_model_size_mb, rtol=0.1) + + if expected_acc is None: 
+ pytest.skip("Fails to infer") + + # Check on-device accuracy + hub_out = hub_model(hub_dataset) + evaluator = model.get_evaluator() + evaluator.add_batch(hub_out, label_test) + hub_acc = evaluator.get_accuracy_score() + print(f"{model_cls=}, {source_model_format=}, {target_runtime=}, {hub_acc=}") + np.testing.assert_allclose(expected_acc, hub_acc, atol=0.01) diff --git a/scripts/examples/test_numerics_mobilenet_v2_quantized.py b/scripts/examples/test_numerics_mobilenet_v2_quantized.py deleted file mode 100644 index 55ba699d..00000000 --- a/scripts/examples/test_numerics_mobilenet_v2_quantized.py +++ /dev/null @@ -1,177 +0,0 @@ -# --------------------------------------------------------------------- -# Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. -# SPDX-License-Identifier: BSD-3-Clause -# --------------------------------------------------------------------- -""" -Run it with pytest --on-device -""" -from typing import Tuple - -import numpy as np -import pytest -import qai_hub as hub -import torch -from torch.utils.data import DataLoader, random_split -from tqdm import tqdm - -from qai_hub_models.datasets.imagenette import ImagenetteDataset -from qai_hub_models.models.mobilenet_v2_quantized.model import MobileNetV2Quantizable -from qai_hub_models.utils.inference import compile_zoo_model_to_hub -from qai_hub_models.utils.measurement import get_model_size_mb - - -def on_device(func): - # Skip tests if '--on-device' is not in the command line arguments - return pytest.mark.skipif( - "'--on-device' not in sys.argv", reason="needs --on-device option to run" - )(func) - - -@pytest.fixture(scope="module") -def data_loaders(): - dataset = ImagenetteDataset() - calib_len = int(0.1 * len(dataset)) - test_len = len(dataset) - calib_len - # Deterministic random split - calib_dataset, test_dataset = random_split( - dataset, [calib_len, test_len], generator=torch.Generator().manual_seed(42) - ) - calib_loader = DataLoader(calib_dataset, batch_size=32, shuffle=False) - test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False) - return calib_loader, test_loader - - -@pytest.fixture(scope="module") -def test_data(data_loaders) -> Tuple[torch.Tensor, torch.Tensor, hub.Dataset]: - calib_loader, test_loader = data_loaders - num_test = 1000 - - img_batches, label_batches = [], [] - total_samples = 0 - for images, labels in tqdm(test_loader): - img_batches.append(images) - label_batches.append(labels) - total_samples += images.size(0) - if total_samples >= 1000: - break - img_test = torch.cat(img_batches, dim=0)[:num_test] - label_test = torch.cat(label_batches, dim=0)[:num_test] - input_name = list( - MobileNetV2Quantizable.from_pretrained(aimet_encodings=None) - .get_input_spec() - .keys() - )[0] - data_entries = {input_name: np.split(img_test.numpy(), img_test.shape[0])} - hub_ds = hub.upload_dataset(data_entries) - return img_test, label_test, hub_ds - - -def test_dataloader_is_deterministic(data_loaders): - """Test that the calibration-test split and the loading are deterministic""" - calib_loader, test_loader = data_loaders - img, labels = next(iter(calib_loader)) - expected_calib_labels = [701, 569, 482, 571, 482] - assert labels[:5].tolist() == expected_calib_labels - - expected_test_labels = [569, 0, 217, 571, 701] - img, labels = next(iter(test_loader)) - assert labels[:5].tolist() == expected_test_labels - - -@pytest.fixture(scope="module") -def quantized_model(data_loaders, test_data): - """ - Create encoding from calibration data and returned quantized model 
with - validated off-target accuracy computed on QuantSim - """ - img_test, label_test, hub_dataset = test_data - calib_loader, test_loader = data_loaders - model = MobileNetV2Quantizable.from_pretrained(aimet_encodings=None) - - # Calibration in quantization - num_calib_batches = 3 - calib_accuracy = model.quantize( - calib_loader, num_calib_batches, evaluator=model.get_evaluator() - ) - np.testing.assert_allclose(0.76, calib_accuracy, atol=0.01) - - # QuantSim evaluation on eval set - evaluator = model.get_evaluator() - - batch_size = 32 - for i in tqdm(list(range(0, img_test.size(0), batch_size)), desc="QuantSim eval"): - img_batch = img_test[i : i + batch_size] - label_batch = label_test[i : i + batch_size] - - sim_out = model(img_batch).detach() - evaluator.add_batch(sim_out, label_batch) - - sim_acc = evaluator.get_accuracy_score() - print(f"{sim_acc=}") - np.testing.assert_allclose(0.78125, sim_acc, atol=0.01) - return model - - -@on_device -@pytest.mark.parametrize( - "target_runtime,hub_needs_calib_data,expected_size_mb,expected_acc", - [ - ("onnx-tflite", False, 3.806, 0), - ("torch-tflite", False, 7.0891, 0.719), - ("onnx-qnn", False, 3.844, 0.76), - ("torch-qnn", True, 3.82, 0.7618), - ], -) -def test_make_encoding_w8a8_accuracy( - quantized_model, - data_loaders, - target_runtime, - hub_needs_calib_data, - expected_size_mb, - expected_acc, - test_data, -): - """ - 1. Export and compile quantized_model on Hub. - 2. Run inference on Hub on test. - - Note: We don't run profile job to get perf here but leave that to the score card. - """ - model = quantized_model - - img_test, label_test, hub_dataset = test_data - calib_loader, test_loader = data_loaders - - # calibration data - calibration_data = None - if hub_needs_calib_data: - # AIMET export has missing encoding and needs calibration data - num_calib_batches = 3 - calib_imgs = [] - for b, (img_calib, labels) in enumerate(iter(calib_loader)): - if b >= num_calib_batches: - break - img_np = img_calib.numpy() - calib_imgs.extend(np.split(img_np, img_np.shape[0])) - calibration_data = {list(model.get_input_spec().keys())[0]: calib_imgs} - - # On-device inference - device = hub.Device("Samsung Galaxy S23") - hub_model = compile_zoo_model_to_hub( - model=model, - device=device, - target_runtime=target_runtime, - calibration_data=calibration_data, - ) - - # Make sure model is quantized - tgt_model_size_mb = get_model_size_mb(hub_model.model) - np.testing.assert_allclose(expected_size_mb, tgt_model_size_mb, rtol=0.1) - - # Check on-device accuracy - hub_out = hub_model(hub_dataset) - evaluator = model.get_evaluator() - evaluator.add_batch(hub_out, label_test) - hub_acc = evaluator.get_accuracy_score() - print(f"{target_runtime=}, {hub_acc=}") - np.testing.assert_allclose(expected_acc, hub_acc, atol=0.01) diff --git a/scripts/github/create-aws-profile.sh b/scripts/github/create-aws-profile.sh new file mode 100755 index 00000000..4e71e75a --- /dev/null +++ b/scripts/github/create-aws-profile.sh @@ -0,0 +1,18 @@ +#!/usr/bin/env bash + +# --------------------------------------------------------------------- +# Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. 
+# SPDX-License-Identifier: BSD-3-Clause +# --------------------------------------------------------------------- +set -euo pipefail + +LOCAL_AWS_ACCESS_KEY_ID="$1" +LOCAL_AWS_SECRET_ACCESS_KEY="$2" +LOCAL_AWS_DEFAULT_REGION="$3" +LOCAL_AWS_PROFILE="$4" + +aws configure set aws_access_key_id "$LOCAL_AWS_ACCESS_KEY_ID" --profile "$LOCAL_AWS_PROFILE" +aws configure set aws_secret_access_key "$LOCAL_AWS_SECRET_ACCESS_KEY" --profile "$LOCAL_AWS_PROFILE" +aws configure set region "$LOCAL_AWS_DEFAULT_REGION" --profile "$LOCAL_AWS_PROFILE" + +aws sts get-caller-identity --profile "$LOCAL_AWS_PROFILE" diff --git a/scripts/tasks/changes.py b/scripts/tasks/changes.py index dbbf2fcb..fe5aec9d 100644 --- a/scripts/tasks/changes.py +++ b/scripts/tasks/changes.py @@ -3,7 +3,7 @@ # SPDX-License-Identifier: BSD-3-Clause # --------------------------------------------------------------------- import os -from typing import Iterable, Set +from typing import Iterable from .constants import ( PY_PACKAGE_MODELS_ROOT, @@ -34,7 +34,7 @@ def get_python_import_expression(filepath: str) -> str: def resolve_affected_models( - changed_files, + changed_files: Iterable[str], include_model: bool = True, include_demo: bool = True, include_export: bool = True, @@ -55,11 +55,11 @@ def resolve_affected_models( changed_files: List of filepaths to files that changed. Paths are relative to the root of this repository. """ - seen: Set[str] = set() + changed_files = list(changed_files) + seen = set(changed_files) while len(changed_files) > 0: # Pop off stack curr_file = changed_files.pop() - seen.add(curr_file) file_import = get_python_import_expression(curr_file) grep_out = run_and_get_output( @@ -80,23 +80,30 @@ def resolve_affected_models( # Add new nodes to stack for dependent_file in dependent_files: if dependent_file not in seen: + seen.add(dependent_file) changed_files.append(dependent_file) changed_models = set() for f in seen: if f.startswith(PY_PACKAGE_RELATIVE_MODELS_ROOT): - if not include_model and os.path.basename(f) == "model.py": + basename = os.path.basename(f) + if basename not in [ + "model.py", + "export.py", + "test.py", + "test_generated.py", + "demo.py", + ]: continue - if not include_export and os.path.basename(f) == "export.py": + if not include_model and basename == "model.py": continue - if not include_tests and os.path.basename(f) == "test.py": + if not include_export and basename == "export.py": continue - if ( - not include_generated_tests - and os.path.basename(f) == "test_generated.py" - ): + if not include_tests and basename == "test.py": + continue + if not include_generated_tests and basename == "test_generated.py": continue - if not include_demo and os.path.basename(f) == "demo.py": + if not include_demo and basename == "demo.py": continue model_name = f[len(PY_PACKAGE_RELATIVE_MODELS_ROOT) :].split("/")[1] @@ -115,17 +122,17 @@ def get_changed_files_in_package() -> Iterable[str]: os.makedirs("build/model-zoo/", exist_ok=True) changed_files_path = "build/changed-qaihm-files.txt" if not on_github(): - run( - f"git diff $(git merge-base --fork-point origin/main) --name-only > {changed_files_path}" - ) + run(f"git diff origin/main --name-only > {changed_files_path}") if os.path.exists(changed_files_path): with open(changed_files_path, "r") as f: - return [ + changed_files = [ file for file in f.read().split("\n") if file.startswith(PY_PACKAGE_RELATIVE_SRC_ROOT) and file.endswith(".py") ] + # Weed out duplicates + return list(set(changed_files)) return [] diff --git a/scripts/tasks/release.py 
b/scripts/tasks/release.py index 00289045..86ede149 100644 --- a/scripts/tasks/release.py +++ b/scripts/tasks/release.py @@ -156,6 +156,7 @@ def __init__(self): "git reset origin/main", # this checks out main "symbolically" (no on-disk source tree changes) "git add -u", # Remove any deleted files from the index "git add -f *", + "git add -f .", # https://stackoverflow.com/questions/26042390/ """git commit -m "$QAIHM_TAG Signed-off-by: $QAIHM_REPO_GH_SIGN_OFF_NAME <$QAIHM_REPO_GH_EMAIL>" """, diff --git a/scripts/tasks/test.py b/scripts/tasks/test.py index 84246a4c..324de146 100644 --- a/scripts/tasks/test.py +++ b/scripts/tasks/test.py @@ -5,6 +5,7 @@ from __future__ import annotations import os +from pathlib import Path from tempfile import TemporaryDirectory from typing import Iterable, Optional @@ -18,6 +19,7 @@ from .util import can_support_aimet, model_needs_aimet from .venv import ( CreateVenvTask, + RunCommandsWithVenvTask, SyncLocalQAIHMVenvTask, SyncModelRequirementsVenvTask, SyncModelVenvTask, @@ -189,7 +191,10 @@ def __init__( use_shared_cache: bool = False, # Use the global QAIHM cache rather than a temporary one for tests. export_func: str = "compile", skip_standard_unit_test: bool = False, + test_trace: bool = True, ): + if len(models_for_testing) == 0 and len(models_to_test_export) == 0: + return super().__init__("All Per-Model Tests (Skipped)", []) tasks = [] # Whether or not export tests will be run asynchronously @@ -214,7 +219,52 @@ def __init__( SyncLocalQAIHMVenvTask(base_test_venv, ["dev"], include_aimet=False) ) - print(f"Tests to be run for directories: {models_for_testing}") + print(f"Tests to be run for models: {models_for_testing}") + if not venv_for_each_model: + non_global_models = [] + global_models = [] + for model_name in models_for_testing: + yaml_path = Path(PY_PACKAGE_MODELS_ROOT) / model_name / "code-gen.yaml" + global_incompatible = False + if yaml_path.exists(): + with open(yaml_path, "r") as f: + if "global_requirements_incompatible" in f.read(): + global_incompatible = True + if global_incompatible: + non_global_models.append(model_name) + else: + global_models.append(model_name) + + if len(global_models) > 0: + globals_path = Path(PY_PACKAGE_SRC_ROOT) / "global_requirements.txt" + tasks.append( + RunCommandsWithVenvTask( + group_name="Install Global Requirements", + venv=base_test_venv, + commands=[f'pip install -r "{globals_path}"'], + ) + ) + + trace_tag = " or trace" if test_trace else "" + for model_name in sorted(global_models): + files_to_test = [] + model_dir = Path(PY_PACKAGE_MODELS_ROOT) / model_name + files_to_test.append(str(model_dir / "test.py")) + if model_name in models_to_test_export: + generated_test_path = str(model_dir / "test_generated.py") + if os.path.exists(generated_test_path): + files_to_test.append(generated_test_path) + tasks.append( + PyTestTask( + group_name=f"Test model: {model_name}", + venv=base_test_venv, + report_name=f"changed-models-{model_name}", + files_or_dirs=" ".join(files_to_test), + parallel=False, + extra_args=f'-s -m "unmarked or {export_func}{trace_tag}"', + ) + ) + models_for_testing = non_global_models for model_name in models_for_testing: # Run standard test suite for this model. 
tasks.append( @@ -222,7 +272,7 @@ def __init__( model_name, python_executable, model_name in models_to_test_export, - venv=None if venv_for_each_model else base_test_venv, + venv=None, use_shared_cache=use_shared_cache, export_func=export_func, skip_standard_unit_test=skip_standard_unit_test, diff --git a/scripts/tasks/venv.py b/scripts/tasks/venv.py index d3634f5d..b1c08eb0 100644 --- a/scripts/tasks/venv.py +++ b/scripts/tasks/venv.py @@ -12,7 +12,6 @@ PY_PACKAGE_INSTALL_ROOT, PY_PACKAGE_MODELS_ROOT, PY_PACKAGE_SRC_ROOT, - QAI_HUB_LATEST_PATH, REPO_ROOT, ) from .task import CompositeTask, RunCommandsTask, RunCommandsWithVenvTask @@ -29,6 +28,8 @@ def __init__(self, venv_path: str, python_executable: str) -> None: def is_package_installed(package_name: str, venv_path: str | None = None) -> bool: if venv_path is not None: + if not os.path.exists(venv_path): + return False command = f'. {venv_path}/bin/activate && python -c "import {package_name}"' else: command = f'python -c "import {package_name}"' @@ -51,7 +52,17 @@ def __init__( ) -> None: tasks = [] - # Install AIMET first to avoid installing two versions of torch (one from AIMET, one from QAIHM). + extras_str = f"[{','.join(extras)}]" if extras else "" + tasks.append( + RunCommandsWithVenvTask( + group_name=f"Install QAIHM{extras_str}", + venv=venv_path, + commands=[ + f'pip install -e "{PY_PACKAGE_INSTALL_ROOT}{extras_str}" -f https://download.openmmlab.com/mmcv/dist/cpu/torch1.13/index.html', + ], + ) + ) + if include_aimet: if can_support_aimet(): if is_package_installed("aimet_torch", venv_path): @@ -84,33 +95,6 @@ def __init__( ) ) - qai_hub_wheel_url = os.environ.get("QAI_HUB_WHEEL_URL", None) - if not is_package_installed("qai_hub", venv_path): - if qai_hub_wheel_url is None: - if os.path.exists(QAI_HUB_LATEST_PATH): - qai_hub_wheel_url = QAI_HUB_LATEST_PATH - - if qai_hub_wheel_url: - # Install local QAI Hub wheel if it exists, instead of pulling it from PyPi. - tasks.append( - RunCommandsWithVenvTask( - group_name="Install QAI Hub (Pre-Release)", - venv=venv_path, - commands=[f'pip install "{qai_hub_wheel_url}"'], - ) - ) - - extras_str = f"[{','.join(extras)}]" if extras else "" - tasks.append( - RunCommandsWithVenvTask( - group_name=f"Install QAIHM{extras_str}", - venv=venv_path, - commands=[ - f'pip install -e "{PY_PACKAGE_INSTALL_ROOT}{extras_str}" -f https://download.openmmlab.com/mmcv/dist/cpu/torch1.13/index.html' - ], - ) - ) - super().__init__( f"Create Local QAIHM{extras_str} Virtual Environment at {venv_path}", [task for task in tasks], diff --git a/scripts/util/common.sh b/scripts/util/common.sh index 0c7ac1d2..088fbaee 100644 --- a/scripts/util/common.sh +++ b/scripts/util/common.sh @@ -1,3 +1,7 @@ +# --------------------------------------------------------------------- +# Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. +# SPDX-License-Identifier: BSD-3-Clause +# --------------------------------------------------------------------- # Common utilities # shellcheck disable=SC2034 # various definitions appear unused in this included source. diff --git a/scripts/util/env_create.sh b/scripts/util/env_create.sh index b315026e..71b85de8 100755 --- a/scripts/util/env_create.sh +++ b/scripts/util/env_create.sh @@ -1,3 +1,7 @@ +# --------------------------------------------------------------------- +# Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. 
+# SPDX-License-Identifier: BSD-3-Clause +# --------------------------------------------------------------------- # shellcheck source=/dev/null # we are statically sourcing a script. # This can be sourced and hence does not specify an interpreter. diff --git a/scripts/util/env_sync.sh b/scripts/util/env_sync.sh index 4c6fb60d..a0165e11 100644 --- a/scripts/util/env_sync.sh +++ b/scripts/util/env_sync.sh @@ -1,3 +1,7 @@ +# --------------------------------------------------------------------- +# Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. +# SPDX-License-Identifier: BSD-3-Clause +# --------------------------------------------------------------------- # This should be sourced and hence does not specify an interpreter. REPO_ROOT=$(git rev-parse --show-toplevel) diff --git a/scripts/util/github.sh b/scripts/util/github.sh index 619c2baa..5b7cee29 100644 --- a/scripts/util/github.sh +++ b/scripts/util/github.sh @@ -1,3 +1,7 @@ +# --------------------------------------------------------------------- +# Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. +# SPDX-License-Identifier: BSD-3-Clause +# --------------------------------------------------------------------- REPO_ROOT=$(git rev-parse --show-toplevel) . "${REPO_ROOT}/scripts/util/common.sh" diff --git a/scripts/util/pytest_with_coverage.sh b/scripts/util/pytest_with_coverage.sh index 7863f986..7b141a57 100755 --- a/scripts/util/pytest_with_coverage.sh +++ b/scripts/util/pytest_with_coverage.sh @@ -1,5 +1,9 @@ #!/usr/bin/env bash +# --------------------------------------------------------------------- +# Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. +# SPDX-License-Identifier: BSD-3-Clause +# --------------------------------------------------------------------- REPO_ROOT=$(git rev-parse --show-toplevel) # Load helpers diff --git a/scripts/util/run_mypy.sh b/scripts/util/run_mypy.sh index 5c4d8f98..416eaec9 100755 --- a/scripts/util/run_mypy.sh +++ b/scripts/util/run_mypy.sh @@ -1,4 +1,8 @@ #!/usr/bin/env bash +# --------------------------------------------------------------------- +# Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. +# SPDX-License-Identifier: BSD-3-Clause +# --------------------------------------------------------------------- # shellcheck source=/dev/null REPO_ROOT=$(git rev-parse --show-toplevel) @@ -18,5 +22,5 @@ paths=(qai_hub_models) for path in "${paths[@]}"; do pathToCheck="${path}" echo "Running mypy on ${pathToCheck}" - mypy --warn-unused-configs --config-file="${REPO_ROOT}/mypy.ini" "${pathToCheck}" + mypy --ignore-missing-imports --warn-unused-configs --config-file="${REPO_ROOT}/mypy.ini" "${pathToCheck}" done diff --git a/setup.py b/setup.py index 989a9ca0..a40bd1ae 100644 --- a/setup.py +++ b/setup.py @@ -68,5 +68,5 @@ def get_data_files() -> List[str]: include_package_data=True, install_requires=[line.strip() for line in open(requirements_path).readlines()], extras_require=extras_require, - license="MIT", + license="BSD-3", )