Add LLama2 Updated Performance Numbers

quic · Jul 2, 2024 · 3918be7 · 3918be7
1 parent 809a06c
commit 3918be7
Show file tree

Hide file tree

Showing 2 changed files with 26 additions and 25 deletions.
diff --git a/qai_hub_models/models/llama_v2_7b_chat_quantized/perf.yaml b/qai_hub_models/models/llama_v2_7b_chat_quantized/perf.yaml
@@ -8,18 +8,18 @@ models:
       os_name: Android
       manufacturer: Samsung
       chipset: Snapdragon® 8 Gen 3
-    timestamp: '2024-05-23T00:34:02.549319Z'
+    timestamp: '2024-07-01T19:11:33.087816Z'
     torchscript_onnx_qnn:
-      inference_time: 90268
-      throughput: 11.07
+      inference_time: 88438
+      throughput: 11.307
       estimated_peak_memory_range:
-        min: 66715648
-        max: 4562679888
+        min: 95744000
+        max: 4468197056
       layer_info:
-        layers_on_npu: 34842
+        layers_on_npu: 33818
         layers_on_gpu: 0
         layers_on_cpu: 0
-        total_layers: 34842
+        total_layers: 33818
       precision: uint16
       primary_compute_unit: NPU
       job_id: "null"
@@ -31,18 +31,18 @@ models:
       os_name: Windows
       manufacturer: Qualcomm
       chipset: Snapdragon® X Elite
-    timestamp: '2024-05-23T00:34:02.549319Z'
+    timestamp: '2024-07-01T19:09:26.083951Z'
     torchscript_onnx_qnn:
-      inference_time: 118139
-      throughput: 8.46
+      inference_time: 95960
+      throughput: 10.421
       estimated_peak_memory_range:
-        min: 68124672
-        max: 68124672
+        min: 68235264
+        max: 68235264
       layer_info:
-        layers_on_npu: 34842
+        layers_on_npu: 33818
         layers_on_gpu: 0
         layers_on_cpu: 0
-        total_layers: 34842
+        total_layers: 33818
       precision: uint16
       primary_compute_unit: NPU
       job_id: "null"
@@ -56,13 +56,13 @@ models:
       os_name: Android
       manufacturer: Samsung
       chipset: Snapdragon® 8 Gen 3
-    timestamp: '2024-05-23T00:34:02.549319Z'
+    timestamp: '2024-07-01T20:53:21.204302Z'
     torchscript_onnx_qnn:
-      inference_time: 1917811
-      throughput: 533.94
+      inference_time: 1484949
+      throughput: 689.5859
       estimated_peak_memory_range:
-        min: 20480
-        max: 1078248176
+        min: 8421376
+        max: 1809446256
       layer_info:
         layers_on_npu: 31766
         layers_on_gpu: 0
@@ -79,13 +79,13 @@ models:
       os_name: Windows
       manufacturer: Qualcomm
       chipset: Snapdragon® X Elite
-    timestamp: '2024-05-23T00:34:02.549319Z'
+    timestamp: '2024-07-02T00:17:42.777637Z'
     torchscript_onnx_qnn:
-      inference_time: 2302575
-      throughput: 445.21
+      inference_time: 1889092
+      throughput: 542.059
       estimated_peak_memory_range:
-        min: 10788864
-        max: 10788864
+        min: 10784768
+        max: 10784768
       layer_info:
         layers_on_npu: 31766
         layers_on_gpu: 0
@@ -124,4 +124,4 @@ aggregated:
       precision: uint16
       primary_compute_unit: NPU
       job_id: ""
-      job_status: Passed
+      job_status: Passed
diff --git a/qai_hub_models/models/llama_v2_7b_chat_quantized/requirements.txt b/qai_hub_models/models/llama_v2_7b_chat_quantized/requirements.txt
@@ -1,2 +1,3 @@
 transformers==4.41.1
 sentencepiece==0.2.0
+psutil