Fix the issue with supporting new evaluationresult asset types (#3544)

* Update the model evaluation environment with the modelling_llama patch.
* Add cost, quality, and performance tags for evaluationresult.
* Revert the changes for the transformers patch.
* Update the changelog and bump the version.
* Fix the issue with supporting new evaluationresult assets.
* Update the changelog file.
* Update the test cases.
* Update the test cases.
* Fix the unit tests.
* Fix the changelog file.

---------

Co-authored-by: Chandra Sekhar Gupta Aravapalli <[email protected]>
Azure · Nov 4, 2024 · 4f98351 · 4f98351
1 parent 81f80b9
commit 4f98351
Show file tree

Hide file tree

Showing 20 changed files with 326 additions and 8 deletions.
diff --git a/scripts/azureml-assets/CHANGELOG.md b/scripts/azureml-assets/CHANGELOG.md
@@ -1,7 +1,10 @@
 ## 1.17.0 (Unreleased)
 ### 🚀 New Features
 
+
+## 1.16.65 (2024-11-04)
 ### 🐛 Bugs Fixed
+- [#3544](https://github.com/Azure/azureml-assets/pull/3544) Fix validate assets for new evaluationresult asset tags
 
 
 ## 1.16.64 (2024-10-31)

diff --git a/scripts/azureml-assets/azureml/assets/config/evaluationresult/tag_values_shared.yaml b/scripts/azureml-assets/azureml/assets/config/evaluationresult/tag_values_shared.yaml
@@ -5,7 +5,7 @@ evaluation_type:
   values:
     - text_generation
     - text_embeddings
-    - vision
     - text_cost
     - text_performance
     - text_quality
+    - vision
diff --git a/scripts/azureml-assets/azureml/assets/config/evaluationresult/tag_values_text_cost.yaml b/scripts/azureml-assets/azureml/assets/config/evaluationresult/tag_values_text_cost.yaml
@@ -0,0 +1,6 @@
+# Valid tag values that can be applied to evaluation results whose evaluation_type is `text_cost`.
+index_metric_key:
+  required: True
+  allow_multiple: False
+  values:
+  - total_cost_per_1M_tokens
diff --git a/...ts/azureml-assets/azureml/assets/config/evaluationresult/tag_values_text_performance.yaml b/...ts/azureml-assets/azureml/assets/config/evaluationresult/tag_values_text_performance.yaml
@@ -0,0 +1,6 @@
+# Valid tag values that can be applied to evaluation results whose evaluation_type is `text_performance`.
+index_metric_key:
+  required: True
+  allow_multiple: False
+  values:
+  - throughput_gtps_token_count
diff --git a/scripts/azureml-assets/azureml/assets/config/evaluationresult/tag_values_text_quality.yaml b/scripts/azureml-assets/azureml/assets/config/evaluationresult/tag_values_text_quality.yaml
@@ -0,0 +1,6 @@
+# Valid tag values that can be applied to evaluation results whose evaluation_type is `text_quality`.
+index_metric_key:
+  required: True
+  allow_multiple: False
+  values:
+  - index_metric
diff --git a/scripts/azureml-assets/azureml/assets/validate_assets.py b/scripts/azureml-assets/azureml/assets/validate_assets.py
@@ -1077,12 +1077,17 @@ def validate_assets(input_dirs: List[Path],
                 asset_spec = asset_config._spec._yaml
                 evaluation_type = asset_spec.get('tags', {}).get('evaluation_type', None)
 
-                if evaluation_type == 'text_generation':
-                    error_count += validate_tags(asset_config, 'evaluationresult/tag_values_text_generation.yaml')
-                elif evaluation_type == 'text_embeddings':
-                    error_count += validate_tags(asset_config, 'evaluationresult/tag_values_text_embeddings.yaml')
-                elif evaluation_type == 'vision':
-                    error_count += validate_tags(asset_config, 'evaluationresult/tag_values_vision.yaml')
+                evaluation_tag_files = {
+                    'text_generation': 'evaluationresult/tag_values_text_generation.yaml',
+                    'text_embeddings': 'evaluationresult/tag_values_text_embeddings.yaml',
+                    'vision': 'evaluationresult/tag_values_vision.yaml',
+                    'text_quality': 'evaluationresult/tag_values_text_quality.yaml',
+                    'text_performance': 'evaluationresult/tag_values_text_performance.yaml',
+                    'text_cost': 'evaluationresult/tag_values_text_cost.yaml'
+                }
+
+                if evaluation_type in evaluation_tag_files:
+                    error_count += validate_tags(asset_config, evaluation_tag_files[evaluation_type])
                 else:
                     _log_error(
                         asset_config.file_name_with_path,

diff --git a/scripts/azureml-assets/setup.py b/scripts/azureml-assets/setup.py
@@ -7,7 +7,7 @@
 
 setup(
    name="azureml-assets",
-   version="1.16.64",
+   version="1.16.65",
    description="Utilities for publishing assets to Azure Machine Learning system registries.",
    author="Microsoft Corp",
    packages=find_packages(),

diff --git a/test/resources/validate/evaluationresult/text_cost_correct/asset.yaml b/test/resources/validate/evaluationresult/text_cost_correct/asset.yaml
@@ -0,0 +1,5 @@
+type: evaluationresult
+spec: spec.yaml
+categories:
+- EvaluationResult
+
diff --git a/test/resources/validate/evaluationresult/text_cost_correct/spec.yaml b/test/resources/validate/evaluationresult/text_cost_correct/spec.yaml
@@ -0,0 +1,37 @@
+type: evaluationresult
+name: model1-16k_cost
+version: 21.10.24
+display_name: model1
+description: Cost benchmark results for model1
+model_name: model1
+model_version: '1'
+model_asset_id: azureml://registries/azureml/models/model1/versions/1
+
+dataset_family: synthetic
+dataset_name: synthetic
+relationships:
+    - relationshipType: Source
+      assetId: azureml://registries/azureml/models/model1/versions/1
+tags:
+  evaluation_type: text_cost
+  index_metric_key: total_cost_per_1M_tokens
+
+metrics:
+  input_token_cost_per_1M_tokens: 3.0
+  output_token_cost_per_1M_tokens: 4.0
+  total_cost_per_1M_tokens: 3.25
+
+properties:
+  deployment_category: azureml
+  disclaimer: Cost Calculation is indicative and may vary based on the actual usage
+    and configuration.
+  additional_info: https://azure.microsoft.com/en-us/pricing/details/cognitive-services/openai-service/
+  currency: USD
+  input_output_token_ratio: '3:1'
+  input_token_weightage_per_million: 750000
+  output_token_weightage_per_million: 250000
+  deployment_type: global
+  region: eastus
+  input_meter_id: input_meter_id
+  output_meter_id: output_meter_id
+
diff --git a/test/resources/validate/evaluationresult/text_cost_incorrect/asset.yaml b/test/resources/validate/evaluationresult/text_cost_incorrect/asset.yaml
@@ -0,0 +1,5 @@
+type: evaluationresult
+spec: spec.yaml
+categories:
+- EvaluationResult
+
diff --git a/test/resources/validate/evaluationresult/text_cost_incorrect/spec.yaml b/test/resources/validate/evaluationresult/text_cost_incorrect/spec.yaml
@@ -0,0 +1,37 @@
+type: evaluationresult
+name: model1-16k_cost
+version: 21.10.24
+display_name: model1
+description: Cost benchmark results for model1
+model_name: model1
+model_version: '1'
+model_asset_id: azureml://registries/azureml/models/model1/versions/1
+
+dataset_family: synthetic
+dataset_name: synthetic
+relationships:
+    - relationshipType: Source
+      assetId: azureml://registries/azureml/models/model1/versions/1
+tags:
+  evaluation_type: text_cost
+  index_metric_key: random_key
+
+metrics:
+  input_token_cost_per_1M_tokens: 3.0
+  output_token_cost_per_1M_tokens: 4.0
+  total_cost_per_1M_tokens: 3.25
+
+properties:
+  deployment_category: azureml
+  disclaimer: Cost Calculation is indicative and may vary based on the actual usage
+    and configuration.
+  additional_info: https://azure.microsoft.com/en-us/pricing/details/cognitive-services/openai-service/
+  currency: USD
+  input_output_token_ratio: '3:1'
+  input_token_weightage_per_million: 750000
+  output_token_weightage_per_million: 250000
+  deployment_type: global
+  region: eastus
+  input_meter_id: input_meter_id
+  output_meter_id: output_meter_id
+
diff --git a/test/resources/validate/evaluationresult/text_performance_correct/asset.yaml b/test/resources/validate/evaluationresult/text_performance_correct/asset.yaml
@@ -0,0 +1,5 @@
+type: evaluationresult
+spec: spec.yaml
+categories:
+- EvaluationResult
+
diff --git a/test/resources/validate/evaluationresult/text_performance_correct/spec.yaml b/test/resources/validate/evaluationresult/text_performance_correct/spec.yaml
@@ -0,0 +1,54 @@
+type: evaluationresult
+name: synthetic_model_perf
+version: 10.30.24
+display_name: synthetic_model
+description: Performance benchmark results for model on synthetic data
+
+dataset_family: synthetic
+dataset_name: synthetic
+
+model_name: model1
+model_version: '1'
+model_asset_id: azureml://registries/azureml/models/model1/versions/1
+relationships:
+    - relationshipType: Source
+      assetId: azureml://registries/azureml/models/model1/versions/1
+
+tags:
+  evaluation_type: text_performance
+  index_metric_value: generated_tokens_per_sec
+  index_metric_key: throughput_gtps_token_count
+  azure_registry_name: azureml
+  azure_model_name: model1
+  azure_latest_model_version: 1
+  azure_latest_model_asset_id: azureml://registries/azureml/models/model1/versions/1
+
+metrics:
+  throughput_gtps_token_count: 25.3
+  throughput_ttps_token_count: 145.43
+  throughput_rps_request_count: 0.14
+  latency_p50_secs: 7.13
+  latency_p90_secs: 7.4
+  latency_p95_secs: 7.52
+  latency_p99_secs: 8.2
+  latency_mean_secs: 7.17
+  latency_ttft_secs: 1.37
+  time_between_tokens_secs: 0.29
+  index_metric: 25.3
+
+properties:
+  deployment_category: azure_openai
+  deployment_type: standard
+  tokens_rate_limit: 30k
+  total_token_length_per_request: 1000
+  prompt_token_generated_token_ratio: '80:20'
+  input_prompt_tokens: 800
+  output_generated_tokens: 200
+  num_of_inference_requests: 2
+  num_of_inference_aggregations: 336
+  payload_task_type: chat_completion
+  num_parallel_inference_requests: '1'
+  stream: true
+  tokenizer: gpt-4-0314
+  region: uksouth
+
diff --git a/test/resources/validate/evaluationresult/text_performance_incorrect/asset.yaml b/test/resources/validate/evaluationresult/text_performance_incorrect/asset.yaml
@@ -0,0 +1,5 @@
+type: evaluationresult
+spec: spec.yaml
+categories:
+- EvaluationResult
+
diff --git a/test/resources/validate/evaluationresult/text_performance_incorrect/spec.yaml b/test/resources/validate/evaluationresult/text_performance_incorrect/spec.yaml
@@ -0,0 +1,54 @@
+type: evaluationresult
+name: synthetic_model_perf
+version: 10.30.24
+display_name: synthetic_model
+description: Performance benchmark results for model on synthetic data
+
+dataset_family: synthetic
+dataset_name: synthetic
+
+model_name: model1
+model_version: '1'
+model_asset_id: azureml://registries/azureml/models/model1/versions/1
+relationships:
+    - relationshipType: Source
+      assetId: azureml://registries/azureml/models/model1/versions/1
+
+tags:
+  evaluation_type: text_performance
+  index_metric_value: generated_tokens_per_sec
+  index_metric_key: random_key
+  azure_registry_name: azureml
+  azure_model_name: model1
+  azure_latest_model_version: 1
+  azure_latest_model_asset_id: azureml://registries/azureml/models/model1/versions/1
+
+metrics:
+  throughput_gtps_token_count: 25.3
+  throughput_ttps_token_count: 145.43
+  throughput_rps_request_count: 0.14
+  latency_p50_secs: 7.13
+  latency_p90_secs: 7.4
+  latency_p95_secs: 7.52
+  latency_p99_secs: 8.2
+  latency_mean_secs: 7.17
+  latency_ttft_secs: 1.37
+  time_between_tokens_secs: 0.29
+  index_metric: 25.3
+
+properties:
+  deployment_category: azure_openai
+  deployment_type: standard
+  tokens_rate_limit: 30k
+  total_token_length_per_request: 1000
+  prompt_token_generated_token_ratio: '80:20'
+  input_prompt_tokens: 800
+  output_generated_tokens: 200
+  num_of_inference_requests: 2
+  num_of_inference_aggregations: 336
+  payload_task_type: chat_completion
+  num_parallel_inference_requests: '1'
+  stream: true
+  tokenizer: gpt-4-0314
+  region: uksouth
+
diff --git a/test/resources/validate/evaluationresult/text_quality_correct/asset.yaml b/test/resources/validate/evaluationresult/text_quality_correct/asset.yaml
@@ -0,0 +1,5 @@
+type: evaluationresult
+spec: spec.yaml
+categories:
+- EvaluationResult
+
diff --git a/test/resources/validate/evaluationresult/text_quality_correct/spec.yaml b/test/resources/validate/evaluationresult/text_quality_correct/spec.yaml
@@ -0,0 +1,37 @@
+type: evaluationresult
+name: model1-0613_quality_index
+version: 10.30.24
+display_name: model1_quality_index
+description: aggregated quality benchmark results for model1
+
+model_name: model1
+model_version: '1'
+model_asset_id: azureml://registries/azureml/models/model1/versions/1
+
+dataset_family: aggregate
+dataset_name: aggregate
+
+relationships:
+    - relationshipType: Source
+      assetId: azureml://registries/azureml/models/model1/versions/1
+
+tags:
+  evaluation_type: text_quality
+  index_metric_key: index_metric
+  azure_registry_name: azureml
+  azure_model_name: model1
+  azure_latest_model_version: 1
+  azure_latest_model_asset_id: azureml://registries/azureml/models/model1/versions/1
+
+metrics:
+  accuracy: 0.873692
+  coherence: 4.882209
+  fluency: 4.924
+  GPTSimilarity: 3.916613
+  groundedness: 4.296203
+  relevance: 4.333895
+  index_metric: 0.85442
+
+properties:
+  total_datasets: 15
+
diff --git a/test/resources/validate/evaluationresult/text_quality_incorrect/asset.yaml b/test/resources/validate/evaluationresult/text_quality_incorrect/asset.yaml
@@ -0,0 +1,5 @@
+type: evaluationresult
+spec: spec.yaml
+categories:
+- EvaluationResult
+
diff --git a/test/resources/validate/evaluationresult/text_quality_incorrect/spec.yaml b/test/resources/validate/evaluationresult/text_quality_incorrect/spec.yaml
@@ -0,0 +1,37 @@
+type: evaluationresult
+name: model1-0613_quality_index
+version: 10.30.24
+display_name: model1_quality_index
+description: aggregated quality benchmark results for model1
+
+model_name: model1
+model_version: '1'
+model_asset_id: azureml://registries/azureml/models/model1/versions/1
+
+dataset_family: aggregate
+dataset_name: aggregate
+
+relationships:
+    - relationshipType: Source
+      assetId: azureml://registries/azureml/models/model1/versions/1
+
+tags:
+  evaluation_type: text_quality
+  index_metric_key: random_key
+  azure_registry_name: azureml
+  azure_model_name: model1
+  azure_latest_model_version: 1
+  azure_latest_model_asset_id: azureml://registries/azureml/models/model1/versions/1
+
+metrics:
+  accuracy: 0.873692
+  coherence: 4.882209
+  fluency: 4.924
+  GPTSimilarity: 3.916613
+  groundedness: 4.296203
+  relevance: 4.333895
+  index_metric: 0.85442
+
+properties:
+  total_datasets: 15
+
diff --git a/test/test_validate_assets.py b/test/test_validate_assets.py
@@ -51,6 +51,12 @@
         ("evaluationresult/text_generation_incorrect", False, True, None, False),
         ("evaluationresult/vision_correct", False, True, None, True),
         ("evaluationresult/vision_incorrect", False, True, None, False),
+        ("evaluationresult/text_cost_correct", False, True, None, True),
+        ("evaluationresult/text_cost_incorrect", False, True, None, False),
+        ("evaluationresult/text_quality_correct", False, True, None, True),
+        ("evaluationresult/text_quality_incorrect", False, True, None, False),
+        ("evaluationresult/text_performance_correct", False, True, None, True),
+        ("evaluationresult/text_performance_incorrect", False, True, None, False),
     ]
 )
 def test_validate_assets(test_subdir: str, check_images: bool, check_names: bool,