diff --git a/scripts/azureml-assets/CHANGELOG.md b/scripts/azureml-assets/CHANGELOG.md
index b0055d139e..4865bce8e8 100644
--- a/scripts/azureml-assets/CHANGELOG.md
+++ b/scripts/azureml-assets/CHANGELOG.md
@@ -1,7 +1,10 @@
 ## 1.17.0 (Unreleased)
 ### 🚀 New Features
 
+
+## 1.16.65 (2024-11-04)
 ### 🐛 Bugs Fixed
+- [#3544](https://github.com/Azure/azureml-assets/pull/3544) Fix validate assets for new evaluationresult asset tags
 
 ## 1.16.64 (2024-10-31)
 
diff --git a/scripts/azureml-assets/azureml/assets/config/evaluationresult/tag_values_shared.yaml b/scripts/azureml-assets/azureml/assets/config/evaluationresult/tag_values_shared.yaml
index 3a95a702e8..f956622c28 100644
--- a/scripts/azureml-assets/azureml/assets/config/evaluationresult/tag_values_shared.yaml
+++ b/scripts/azureml-assets/azureml/assets/config/evaluationresult/tag_values_shared.yaml
@@ -5,7 +5,7 @@ evaluation_type:
   values:
   - text_generation
   - text_embeddings
-  - vision
   - text_cost
   - text_performance
   - text_quality
+  - vision
diff --git a/scripts/azureml-assets/azureml/assets/config/evaluationresult/tag_values_text_cost.yaml b/scripts/azureml-assets/azureml/assets/config/evaluationresult/tag_values_text_cost.yaml
new file mode 100644
index 0000000000..47ccd6ef5f
--- /dev/null
+++ b/scripts/azureml-assets/azureml/assets/config/evaluationresult/tag_values_text_cost.yaml
@@ -0,0 +1,6 @@
+# Valid tag values that can be applied to evaluation results whose evaluation_type is `text_cost`.
+index_metric_key:
+  required: True
+  allow_multiple: False
+  values:
+  - total_cost_per_1M_tokens
diff --git a/scripts/azureml-assets/azureml/assets/config/evaluationresult/tag_values_text_performance.yaml b/scripts/azureml-assets/azureml/assets/config/evaluationresult/tag_values_text_performance.yaml
new file mode 100644
index 0000000000..d639b202fa
--- /dev/null
+++ b/scripts/azureml-assets/azureml/assets/config/evaluationresult/tag_values_text_performance.yaml
@@ -0,0 +1,6 @@
+# Valid tag values that can be applied to evaluation results whose evaluation_type is `text_performance`.
+index_metric_key:
+  required: True
+  allow_multiple: False
+  values:
+  - throughput_gtps_token_count
diff --git a/scripts/azureml-assets/azureml/assets/config/evaluationresult/tag_values_text_quality.yaml b/scripts/azureml-assets/azureml/assets/config/evaluationresult/tag_values_text_quality.yaml
new file mode 100644
index 0000000000..0ce53df01d
--- /dev/null
+++ b/scripts/azureml-assets/azureml/assets/config/evaluationresult/tag_values_text_quality.yaml
@@ -0,0 +1,6 @@
+# Valid tag values that can be applied to evaluation results whose evaluation_type is `text_quality`.
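For context on how these tag_values_*.yaml files are read: each top-level key names a tag, and required / allow_multiple / values constrain its presence, cardinality, and allowed values. The repo's actual validate_tags implementation is not part of this patch; the sketch below is one hypothetical reading of the schema, where check_tags and its comma-splitting of multi-valued tags are both assumptions made for illustration.

```python
# Hypothetical reading of a tag_values_*.yaml schema; `check_tags` is an
# illustration, not the repo's validate_tags. Treating a multi-valued tag as
# comma-separated is an assumption made only for this sketch.
import yaml

def check_tags(tags: dict, schema_text: str) -> list:
    """Return error strings for tags that violate the schema."""
    schema = yaml.safe_load(schema_text)
    errors = []
    for tag_name, rules in schema.items():
        value = tags.get(tag_name)
        if value is None:
            if rules.get("required"):
                errors.append(f"missing required tag '{tag_name}'")
            continue
        candidates = value.split(",") if rules.get("allow_multiple") else [value]
        allowed = rules.get("values", [])
        for v in candidates:
            if v not in allowed:
                errors.append(f"tag '{tag_name}': '{v}' not in {allowed}")
    return errors

SCHEMA = """\
index_metric_key:
  required: True
  allow_multiple: False
  values:
  - total_cost_per_1M_tokens
"""
assert check_tags({"index_metric_key": "total_cost_per_1M_tokens"}, SCHEMA) == []
assert check_tags({"index_metric_key": "random_key"}, SCHEMA) != []
```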
+index_metric_key:
+  required: True
+  allow_multiple: False
+  values:
+  - index_metric
diff --git a/scripts/azureml-assets/azureml/assets/validate_assets.py b/scripts/azureml-assets/azureml/assets/validate_assets.py
index 3ef4662143..52c590336e 100644
--- a/scripts/azureml-assets/azureml/assets/validate_assets.py
+++ b/scripts/azureml-assets/azureml/assets/validate_assets.py
@@ -1077,12 +1077,17 @@ def validate_assets(input_dirs: List[Path],
             asset_spec = asset_config._spec._yaml
             evaluation_type = asset_spec.get('tags', {}).get('evaluation_type', None)
 
-            if evaluation_type == 'text_generation':
-                error_count += validate_tags(asset_config, 'evaluationresult/tag_values_text_generation.yaml')
-            elif evaluation_type == 'text_embeddings':
-                error_count += validate_tags(asset_config, 'evaluationresult/tag_values_text_embeddings.yaml')
-            elif evaluation_type == 'vision':
-                error_count += validate_tags(asset_config, 'evaluationresult/tag_values_vision.yaml')
+            evaluation_tag_files = {
+                'text_generation': 'evaluationresult/tag_values_text_generation.yaml',
+                'text_embeddings': 'evaluationresult/tag_values_text_embeddings.yaml',
+                'vision': 'evaluationresult/tag_values_vision.yaml',
+                'text_quality': 'evaluationresult/tag_values_text_quality.yaml',
+                'text_performance': 'evaluationresult/tag_values_text_performance.yaml',
+                'text_cost': 'evaluationresult/tag_values_text_cost.yaml'
+            }
+
+            if evaluation_type in evaluation_tag_files:
+                error_count += validate_tags(asset_config, evaluation_tag_files[evaluation_type])
             else:
                 _log_error(
                     asset_config.file_name_with_path,
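The change above replaces a growing if/elif chain with a lookup table, so supporting a new evaluation_type becomes a one-line mapping entry plus its schema file. A self-contained sketch of the same pattern, using local stand-in names rather than the repo's API:

```python
# Self-contained illustration of the table-dispatch pattern adopted above;
# names here are stand-ins for the patch's variables, not the repo's API.
from typing import Optional

evaluation_tag_files = {
    "text_generation": "evaluationresult/tag_values_text_generation.yaml",
    "text_embeddings": "evaluationresult/tag_values_text_embeddings.yaml",
    "vision": "evaluationresult/tag_values_vision.yaml",
    "text_quality": "evaluationresult/tag_values_text_quality.yaml",
    "text_performance": "evaluationresult/tag_values_text_performance.yaml",
    "text_cost": "evaluationresult/tag_values_text_cost.yaml",
}

def tag_file_for(evaluation_type: Optional[str]) -> Optional[str]:
    """Map an evaluation_type tag to its schema file, or None if unknown."""
    return evaluation_tag_files.get(evaluation_type)

assert tag_file_for("text_cost") == "evaluationresult/tag_values_text_cost.yaml"
assert tag_file_for("bogus") is None  # the caller logs an error in this case
```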
diff --git a/scripts/azureml-assets/setup.py b/scripts/azureml-assets/setup.py
index 1fe4a0d206..7982ec0231 100644
--- a/scripts/azureml-assets/setup.py
+++ b/scripts/azureml-assets/setup.py
@@ -7,7 +7,7 @@
 
 setup(
     name="azureml-assets",
-    version="1.16.64",
+    version="1.16.65",
     description="Utilities for publishing assets to Azure Machine Learning system registries.",
     author="Microsoft Corp",
     packages=find_packages(),
diff --git a/test/resources/validate/evaluationresult/text_cost_correct/asset.yaml b/test/resources/validate/evaluationresult/text_cost_correct/asset.yaml
new file mode 100644
index 0000000000..8da19ed534
--- /dev/null
+++ b/test/resources/validate/evaluationresult/text_cost_correct/asset.yaml
@@ -0,0 +1,5 @@
+type: evaluationresult
+spec: spec.yaml
+categories:
+- EvaluationResult
+
diff --git a/test/resources/validate/evaluationresult/text_cost_correct/spec.yaml b/test/resources/validate/evaluationresult/text_cost_correct/spec.yaml
new file mode 100644
index 0000000000..118f67b6d4
--- /dev/null
+++ b/test/resources/validate/evaluationresult/text_cost_correct/spec.yaml
@@ -0,0 +1,37 @@
+type: evaluationresult
+name: model1-16k_cost
+version: 21.10.24
+display_name: model1
+description: Cost benchmark results for model1
+model_name: model1
+model_version: '1'
+model_asset_id: azureml://registries/azureml/models/model1/versions/1
+
+dataset_family: synthetic
+dataset_name: synthetic
+relationships:
+  - relationshipType: Source
+    assetId: azureml://registries/azureml/models/model1/versions/1
+tags:
+  evaluation_type: text_cost
+  index_metric_key: total_cost_per_1M_tokens
+
+metrics:
+  input_token_cost_per_1M_tokens: 3.0
+  output_token_cost_per_1M_tokens: 4.0
+  total_cost_per_1M_tokens: 3.25
+
+properties:
+  deployment_category: azureml
+  disclaimer: Cost Calculation is indicative and may vary based on the actual usage
+    and configuration.
+  additional_info: https://azure.microsoft.com/en-us/pricing/details/cognitive-services/openai-service/
+  currency: USD
+  input_output_token_ratio: '3:1'
+  input_token_weightage_per_million: 750000
+  output_token_weightage_per_million: 250000
+  deployment_type: global
+  region: eastus
+  input_meter_id: input_meter_id
+  output_meter_id: output_meter_id
+
diff --git a/test/resources/validate/evaluationresult/text_cost_incorrect/asset.yaml b/test/resources/validate/evaluationresult/text_cost_incorrect/asset.yaml
new file mode 100644
index 0000000000..8da19ed534
--- /dev/null
+++ b/test/resources/validate/evaluationresult/text_cost_incorrect/asset.yaml
@@ -0,0 +1,5 @@
+type: evaluationresult
+spec: spec.yaml
+categories:
+- EvaluationResult
+
diff --git a/test/resources/validate/evaluationresult/text_cost_incorrect/spec.yaml b/test/resources/validate/evaluationresult/text_cost_incorrect/spec.yaml
new file mode 100644
index 0000000000..3333492414
--- /dev/null
+++ b/test/resources/validate/evaluationresult/text_cost_incorrect/spec.yaml
@@ -0,0 +1,37 @@
+type: evaluationresult
+name: model1-16k_cost
+version: 21.10.24
+display_name: model1
+description: Cost benchmark results for model1
+model_name: model1
+model_version: '1'
+model_asset_id: azureml://registries/azureml/models/model1/versions/1
+
+dataset_family: synthetic
+dataset_name: synthetic
+relationships:
+  - relationshipType: Source
+    assetId: azureml://registries/azureml/models/model1/versions/1
+tags:
+  evaluation_type: text_cost
+  index_metric_key: random_key
+
+metrics:
+  input_token_cost_per_1M_tokens: 3.0
+  output_token_cost_per_1M_tokens: 4.0
+  total_cost_per_1M_tokens: 3.25
+
+properties:
+  deployment_category: azureml
+  disclaimer: Cost Calculation is indicative and may vary based on the actual usage
+    and configuration.
+  additional_info: https://azure.microsoft.com/en-us/pricing/details/cognitive-services/openai-service/
+  currency: USD
+  input_output_token_ratio: '3:1'
+  input_token_weightage_per_million: 750000
+  output_token_weightage_per_million: 250000
+  deployment_type: global
+  region: eastus
+  input_meter_id: input_meter_id
+  output_meter_id: output_meter_id
+
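The correct text_cost fixture above is internally consistent: the '3:1' input_output_token_ratio corresponds to the 750,000 / 250,000 token weightages, and blending the per-million costs with those weights reproduces total_cost_per_1M_tokens. A quick check using only values taken from the spec:

```python
# Reproduce total_cost_per_1M_tokens from the fixture's own values: a 3:1
# input/output ratio means 750k input + 250k output tokens per million.
input_cost_per_1m = 3.0    # input_token_cost_per_1M_tokens
output_cost_per_1m = 4.0   # output_token_cost_per_1M_tokens
input_share = 750_000 / 1_000_000    # input_token_weightage_per_million
output_share = 250_000 / 1_000_000   # output_token_weightage_per_million

total = input_cost_per_1m * input_share + output_cost_per_1m * output_share
assert total == 3.25  # matches total_cost_per_1M_tokens in the spec
```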
diff --git a/test/resources/validate/evaluationresult/text_performance_correct/asset.yaml b/test/resources/validate/evaluationresult/text_performance_correct/asset.yaml
new file mode 100644
index 0000000000..8da19ed534
--- /dev/null
+++ b/test/resources/validate/evaluationresult/text_performance_correct/asset.yaml
@@ -0,0 +1,5 @@
+type: evaluationresult
+spec: spec.yaml
+categories:
+- EvaluationResult
+
diff --git a/test/resources/validate/evaluationresult/text_performance_correct/spec.yaml b/test/resources/validate/evaluationresult/text_performance_correct/spec.yaml
new file mode 100644
index 0000000000..666b338fb5
--- /dev/null
+++ b/test/resources/validate/evaluationresult/text_performance_correct/spec.yaml
@@ -0,0 +1,54 @@
+type: evaluationresult
+name: synthetic_model_perf
+version: 10.30.24
+display_name: synthetic_model
+description: Performance benchmark results for model on synthetic data
+
+dataset_family: synthetic
+dataset_name: synthetic
+
+model_name: model1
+model_version: '1'
+model_asset_id: azureml://registries/azureml/models/model1/versions/1
+relationships:
+  - relationshipType: Source
+    assetId: azureml://registries/azureml/models/model1/versions/1
+
+tags:
+  evaluation_type: text_performance
+  index_metric_value: generated_tokens_per_sec
+  index_metric_key: throughput_gtps_token_count
+  azure_registry_name: azureml
+  azure_model_name: model1
+  azure_latest_model_version: 1
+  azure_latest_model_asset_id: azureml://registries/azureml/models/model1/versions/1
+
+metrics:
+  throughput_gtps_token_count: 25.3
+  throughput_ttps_token_count: 145.43
+  throughput_rps_request_count: 0.14
+  latency_p50_secs: 7.13
+  latency_p90_secs: 7.4
+  latency_p95_secs: 7.52
+  latency_p99_secs: 8.2
+  latency_mean_secs: 7.17
+  latency_ttft_secs: 1.37
+  time_between_tokens_secs: 0.29
+  index_metric: 25.3
+
+properties:
+  deployment_category: azure_openai
+  deployment_type: standard
+  tokens_rate_limit: 30k
+  total_token_length_per_request: 1000
+  prompt_token_generated_token_ratio: '80:20'
+  input_prompt_tokens: 800
+  output_generated_tokens: 200
+  num_of_inference_requests: 2
+  num_of_inference_aggregations: 336
+  payload_task_type: chat_completion
+  num_parallel_inference_requests: '1'
+  stream: true
+  tokenizer: gpt-4-0314
+  region: uksouth
+
diff --git a/test/resources/validate/evaluationresult/text_performance_incorrect/asset.yaml b/test/resources/validate/evaluationresult/text_performance_incorrect/asset.yaml
new file mode 100644
index 0000000000..8da19ed534
--- /dev/null
+++ b/test/resources/validate/evaluationresult/text_performance_incorrect/asset.yaml
@@ -0,0 +1,5 @@
+type: evaluationresult
+spec: spec.yaml
+categories:
+- EvaluationResult
+
diff --git a/test/resources/validate/evaluationresult/text_performance_incorrect/spec.yaml b/test/resources/validate/evaluationresult/text_performance_incorrect/spec.yaml
new file mode 100644
index 0000000000..dd429980c1
--- /dev/null
+++ b/test/resources/validate/evaluationresult/text_performance_incorrect/spec.yaml
@@ -0,0 +1,54 @@
+type: evaluationresult
+name: synthetic_model_perf
+version: 10.30.24
+display_name: synthetic_model
+description: Performance benchmark results for model on synthetic data
+
+dataset_family: synthetic
+dataset_name: synthetic
+
+model_name: model1
+model_version: '1'
+model_asset_id: azureml://registries/azureml/models/model1/versions/1
+relationships:
+  - relationshipType: Source
+    assetId: azureml://registries/azureml/models/model1/versions/1
+
+tags:
+  evaluation_type: text_performance
+  index_metric_value: generated_tokens_per_sec
+  index_metric_key: random_key
+  azure_registry_name: azureml
+  azure_model_name: model1
+  azure_latest_model_version: 1
+  azure_latest_model_asset_id: azureml://registries/azureml/models/model1/versions/1
+
+metrics:
+  throughput_gtps_token_count: 25.3
+  throughput_ttps_token_count: 145.43
+  throughput_rps_request_count: 0.14
+  latency_p50_secs: 7.13
+  latency_p90_secs: 7.4
+  latency_p95_secs: 7.52
+  latency_p99_secs: 8.2
+  latency_mean_secs: 7.17
+  latency_ttft_secs: 1.37
+  time_between_tokens_secs: 0.29
+  index_metric: 25.3
+
+properties:
+  deployment_category: azure_openai
+  deployment_type: standard
+  tokens_rate_limit: 30k
+  total_token_length_per_request: 1000
+  prompt_token_generated_token_ratio: '80:20'
+  input_prompt_tokens: 800
+  output_generated_tokens: 200
+  num_of_inference_requests: 2
+  num_of_inference_aggregations: 336
+  payload_task_type: chat_completion
+  num_parallel_inference_requests: '1'
+  stream: true
+  tokenizer: gpt-4-0314
+  region: uksouth
+
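One constraint these schema files cannot express is that index_metric_key should name a metric actually present in the metrics block, as throughput_gtps_token_count is in the performance fixtures above. If that cross-check were ever wanted, it is only a few lines; the following is a suggestion sketch, not something this patch implements:

```python
# Suggestion sketch (not part of this patch): verify the tagged index metric
# is actually reported in the spec's metrics block.
def index_metric_is_reported(spec: dict) -> bool:
    """Check that tags.index_metric_key names a key in the metrics block."""
    key = spec.get("tags", {}).get("index_metric_key")
    return key is not None and key in spec.get("metrics", {})

# Values taken from the text_performance_correct fixture above:
spec = {
    "tags": {"index_metric_key": "throughput_gtps_token_count"},
    "metrics": {"throughput_gtps_token_count": 25.3, "latency_mean_secs": 7.17},
}
assert index_metric_is_reported(spec)

spec["tags"]["index_metric_key"] = "random_key"  # as in the incorrect fixture
assert not index_metric_is_reported(spec)
```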
diff --git a/test/resources/validate/evaluationresult/text_quality_correct/asset.yaml b/test/resources/validate/evaluationresult/text_quality_correct/asset.yaml
new file mode 100644
index 0000000000..8da19ed534
--- /dev/null
+++ b/test/resources/validate/evaluationresult/text_quality_correct/asset.yaml
@@ -0,0 +1,5 @@
+type: evaluationresult
+spec: spec.yaml
+categories:
+- EvaluationResult
+
diff --git a/test/resources/validate/evaluationresult/text_quality_correct/spec.yaml b/test/resources/validate/evaluationresult/text_quality_correct/spec.yaml
new file mode 100644
index 0000000000..10026dabdd
--- /dev/null
+++ b/test/resources/validate/evaluationresult/text_quality_correct/spec.yaml
@@ -0,0 +1,37 @@
+type: evaluationresult
+name: model1-0613_quality_index
+version: 10.30.24
+display_name: model1_quality_index
+description: aggregated quality benchmark results for model1
+
+model_name: model1
+model_version: '1'
+model_asset_id: azureml://registries/azureml/models/model1/versions/1
+
+dataset_family: aggregate
+dataset_name: aggregate
+
+relationships:
+  - relationshipType: Source
+    assetId: azureml://registries/azureml/models/model1/versions/1
+
+tags:
+  evaluation_type: text_quality
+  index_metric_key: index_metric
+  azure_registry_name: azureml
+  azure_model_name: model1
+  azure_latest_model_version: 1
+  azure_latest_model_asset_id: azureml://registries/azureml/models/model1/versions/1
+
+metrics:
+  accuracy: 0.873692
+  coherence: 4.882209
+  fluency: 4.924
+  GPTSimilarity: 3.916613
+  groundedness: 4.296203
+  relevance: 4.333895
+  index_metric: 0.85442
+
+properties:
+  total_datasets: 15
+
diff --git a/test/resources/validate/evaluationresult/text_quality_incorrect/asset.yaml b/test/resources/validate/evaluationresult/text_quality_incorrect/asset.yaml
new file mode 100644
index 0000000000..8da19ed534
--- /dev/null
+++ b/test/resources/validate/evaluationresult/text_quality_incorrect/asset.yaml
@@ -0,0 +1,5 @@
+type: evaluationresult
+spec: spec.yaml
+categories:
+- EvaluationResult
+
diff --git a/test/resources/validate/evaluationresult/text_quality_incorrect/spec.yaml b/test/resources/validate/evaluationresult/text_quality_incorrect/spec.yaml
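Each new evaluation type ships as a correct/incorrect fixture pair that differs only in the index_metric_key tag ('index_metric' vs. 'random_key' for text_quality). A minimal way to confirm that, assuming a checkout run from the repo root with the test/resources layout introduced here:

```python
# Print the only field that differs between the paired fixtures; paths assume
# the test/resources layout added in this patch, run from the repo root.
from pathlib import Path
import yaml

base = Path("test/resources/validate/evaluationresult")
for subdir in ("text_quality_correct", "text_quality_incorrect"):
    spec = yaml.safe_load((base / subdir / "spec.yaml").read_text())
    print(subdir, "->", spec["tags"]["index_metric_key"])
# expected output:
#   text_quality_correct -> index_metric
#   text_quality_incorrect -> random_key
```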
new file mode 100644
index 0000000000..165432ff33
--- /dev/null
+++ b/test/resources/validate/evaluationresult/text_quality_incorrect/spec.yaml
@@ -0,0 +1,37 @@
+type: evaluationresult
+name: model1-0613_quality_index
+version: 10.30.24
+display_name: model1_quality_index
+description: aggregated quality benchmark results for model1
+
+model_name: model1
+model_version: '1'
+model_asset_id: azureml://registries/azureml/models/model1/versions/1
+
+dataset_family: aggregate
+dataset_name: aggregate
+
+relationships:
+  - relationshipType: Source
+    assetId: azureml://registries/azureml/models/model1/versions/1
+
+tags:
+  evaluation_type: text_quality
+  index_metric_key: random_key
+  azure_registry_name: azureml
+  azure_model_name: model1
+  azure_latest_model_version: 1
+  azure_latest_model_asset_id: azureml://registries/azureml/models/model1/versions/1
+
+metrics:
+  accuracy: 0.873692
+  coherence: 4.882209
+  fluency: 4.924
+  GPTSimilarity: 3.916613
+  groundedness: 4.296203
+  relevance: 4.333895
+  index_metric: 0.85442
+
+properties:
+  total_datasets: 15
+
diff --git a/test/test_validate_assets.py b/test/test_validate_assets.py
index b50179eaff..9e59dd5975 100644
--- a/test/test_validate_assets.py
+++ b/test/test_validate_assets.py
@@ -51,6 +51,12 @@
         ("evaluationresult/text_generation_incorrect", False, True, None, False),
         ("evaluationresult/vision_correct", False, True, None, True),
         ("evaluationresult/vision_incorrect", False, True, None, False),
+        ("evaluationresult/text_cost_correct", False, True, None, True),
+        ("evaluationresult/text_cost_incorrect", False, True, None, False),
+        ("evaluationresult/text_quality_correct", False, True, None, True),
+        ("evaluationresult/text_quality_incorrect", False, True, None, False),
+        ("evaluationresult/text_performance_correct", False, True, None, True),
+        ("evaluationresult/text_performance_incorrect", False, True, None, False),
     ]
 )
 def test_validate_assets(test_subdir: str, check_images: bool, check_names: bool,
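Judging from the existing vision_correct/vision_incorrect rows, the final element of each parametrize tuple is the expected validation outcome, so the six new rows pair each fixture with pass (True) or fail (False) according to its directory suffix:

```python
# Expected outcomes for the six new cases, inferred from the existing rows:
# *_correct fixtures should validate, *_incorrect fixtures should not.
new_cases = [
    ("evaluationresult/text_cost_correct", True),
    ("evaluationresult/text_cost_incorrect", False),
    ("evaluationresult/text_quality_correct", True),
    ("evaluationresult/text_quality_incorrect", False),
    ("evaluationresult/text_performance_correct", True),
    ("evaluationresult/text_performance_incorrect", False),
]
for subdir, expected_ok in new_cases:
    # the naming convention encodes the expectation
    assert subdir.endswith("_incorrect") == (not expected_ok)
```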