diff --git a/assets/model_monitoring/components/data_drift/data_drift_signal_monitor/spec.yaml b/assets/model_monitoring/components/data_drift/data_drift_signal_monitor/spec.yaml index 57e0172b7f..c61bebe1bb 100644 --- a/assets/model_monitoring/components/data_drift/data_drift_signal_monitor/spec.yaml +++ b/assets/model_monitoring/components/data_drift/data_drift_signal_monitor/spec.yaml @@ -70,7 +70,7 @@ jobs: type: aml_token feature_selection: type: spark - component: azureml://registries/azureml/components/model_monitor_feature_selector/versions/0.3.1 + component: azureml://registries/azureml/components/model_monitor_feature_selector/versions/0.3.2 inputs: input_data_1: type: mltable diff --git a/assets/model_monitoring/components/data_quality/data_quality_signal_monitor/spec.yaml b/assets/model_monitoring/components/data_quality/data_quality_signal_monitor/spec.yaml index 9988f8b4bc..a0e8fe93b8 100644 --- a/assets/model_monitoring/components/data_quality/data_quality_signal_monitor/spec.yaml +++ b/assets/model_monitoring/components/data_quality/data_quality_signal_monitor/spec.yaml @@ -70,7 +70,7 @@ jobs: type: aml_token feature_selection: type: spark - component: azureml://registries/azureml/components/model_monitor_feature_selector/versions/0.3.1 + component: azureml://registries/azureml/components/model_monitor_feature_selector/versions/0.3.2 inputs: input_data_1: type: mltable diff --git a/assets/model_monitoring/components/model_monitor/model_monitor_feature_selector/spec.yaml b/assets/model_monitoring/components/model_monitor/model_monitor_feature_selector/spec.yaml index 5587a65d42..cb119a64a3 100644 --- a/assets/model_monitoring/components/model_monitor/model_monitor_feature_selector/spec.yaml +++ b/assets/model_monitoring/components/model_monitor/model_monitor_feature_selector/spec.yaml @@ -4,7 +4,7 @@ type: spark name: model_monitor_feature_selector display_name: Model Monitor - Feature Selector description: Selects features to compute signal metrics 
on. -version: 0.3.1 +version: 0.3.2 is_deterministic: true code: ../../src diff --git a/assets/model_monitoring/components/prediction_drift/prediction_drift_signal_monitor/spec.yaml b/assets/model_monitoring/components/prediction_drift/prediction_drift_signal_monitor/spec.yaml index 8979e6ba0c..f20f8f605b 100644 --- a/assets/model_monitoring/components/prediction_drift/prediction_drift_signal_monitor/spec.yaml +++ b/assets/model_monitoring/components/prediction_drift/prediction_drift_signal_monitor/spec.yaml @@ -47,7 +47,7 @@ outputs: jobs: feature_selection: type: spark - component: azureml://registries/azureml/components/model_monitor_feature_selector/versions/0.3.1 + component: azureml://registries/azureml/components/model_monitor_feature_selector/versions/0.3.2 inputs: input_data_1: type: mltable diff --git a/assets/model_monitoring/components/src/model_monitor_feature_selector/selectors/feature_selector_all.py b/assets/model_monitoring/components/src/model_monitor_feature_selector/selectors/feature_selector_all.py index c947491439..e66c705aa6 100644 --- a/assets/model_monitoring/components/src/model_monitor_feature_selector/selectors/feature_selector_all.py +++ b/assets/model_monitoring/components/src/model_monitor_feature_selector/selectors/feature_selector_all.py @@ -43,5 +43,9 @@ def select( spark = init_spark() features = spark.createDataFrame(data=rows, schema=schema) features.show() + if features.isEmpty(): + raise Exception( + "Could not generate features set correctly. Found no common columns between input datasets." 
+ ) return features diff --git a/assets/model_monitoring/components/tests/e2e/resources/mltable_iris_preprocessed_model_inputs_no_common_columns/MLTable b/assets/model_monitoring/components/tests/e2e/resources/mltable_iris_preprocessed_model_inputs_no_common_columns/MLTable new file mode 100644 index 0000000000..6c2bd8c423 --- /dev/null +++ b/assets/model_monitoring/components/tests/e2e/resources/mltable_iris_preprocessed_model_inputs_no_common_columns/MLTable @@ -0,0 +1,9 @@ +type: mltable + +paths: + - pattern: ./*.csv +transformations: + - read_delimited: + delimiter: ',' + encoding: ascii + header: all_files_same_headers \ No newline at end of file diff --git a/assets/model_monitoring/components/tests/e2e/resources/mltable_iris_preprocessed_model_inputs_no_common_columns/iris_baseline.csv b/assets/model_monitoring/components/tests/e2e/resources/mltable_iris_preprocessed_model_inputs_no_common_columns/iris_baseline.csv new file mode 100644 index 0000000000..b1c25f4876 --- /dev/null +++ b/assets/model_monitoring/components/tests/e2e/resources/mltable_iris_preprocessed_model_inputs_no_common_columns/iris_baseline.csv @@ -0,0 +1,76 @@ +sepal_lengths,sepal_widths,petal_lengths,petal_widths +5,3,1.6,0.2 +5,3.4,1.6,0.4 +5.2,3.5,1.5,0.2 +5.2,3.4,1.4,0.2 +4.7,3.2,1.6,0.2 +4.8,3.1,1.6,0.2 +5.4,3.4,1.5,0.4 +5.2,4.1,1.5,0.1 +5.5,4.2,1.4,0.2 +4.9,3.1,1.5,0.1 +5,3.2,1.2,0.2 +5.5,3.5,1.3,0.2 +4.9,3.1,1.5,0.1 +4.4,3,1.3,0.2 +5.1,3.4,1.5,0.2 +5,3.5,1.3,0.3 +4.5,2.3,1.3,0.3 +4.4,3.2,1.3,0.2 +5,3.5,1.6,0.6 +5.1,3.8,1.9,0.4 +4.8,3,1.4,0.3 +5.1,3.8,1.6,0.2 +4.6,3.2,1.4,0.2 +5.3,3.7,1.5,0.2 +5,3.3,1.4,0.2 +6.6,3,4.4,1.4 +6.8,2.8,4.8,1.4 +6.7,3,5,1.7 +6,2.9,4.5,1.5 +5.7,2.6,3.5,1 +5.5,2.4,3.8,1.1 +5.5,2.4,3.7,1 +5.8,2.7,3.9,1.2 +6,2.7,5.1,1.6 +5.4,3,4.5,1.5 +6,3.4,4.5,1.6 +6.7,3.1,4.7,1.5 +6.3,2.3,4.4,1.3 +5.6,3,4.1,1.3 +5.5,2.5,4,1.3 +5.5,2.6,4.4,1.2 +6.1,3,4.6,1.4 +5.8,2.6,4,1.2 +5,2.3,3.3,1 +5.6,2.7,4.2,1.3 +5.7,3,4.2,1.2 +5.7,2.9,4.2,1.3 +6.2,2.9,4.3,1.3 +5.1,2.5,3,1.1 
+5.7,2.8,4.1,1.3 +7.2,3.2,6,1.8 +6.2,2.8,4.8,1.8 +6.1,3,4.9,1.8 +6.4,2.8,5.6,2.1 +7.2,3,5.8,1.6 +7.4,2.8,6.1,1.9 +7.9,3.8,6.4,2 +6.4,2.8,5.6,2.2 +6.3,2.8,5.1,1.5 +6.1,2.6,5.6,1.4 +7.7,3,6.1,2.3 +6.3,3.4,5.6,2.4 +6.4,3.1,5.5,1.8 +6,3,4.8,1.8 +6.9,3.1,5.4,2.1 +6.7,3.1,5.6,2.4 +6.9,3.1,5.1,2.3 +5.8,2.7,5.1,1.9 +6.8,3.2,5.9,2.3 +6.7,3.3,5.7,2.5 +6.7,3,5.2,2.3 +6.3,2.5,5,1.9 +6.5,3,5.2,2 +6.2,3.4,5.4,2.3 +5.9,3,5.1,1.8 diff --git a/assets/model_monitoring/components/tests/e2e/test_data_drift_signal_monitor_e2e.py b/assets/model_monitoring/components/tests/e2e/test_data_drift_signal_monitor_e2e.py index 3ea18f75ae..8f816a78ad 100644 --- a/assets/model_monitoring/components/tests/e2e/test_data_drift_signal_monitor_e2e.py +++ b/assets/model_monitoring/components/tests/e2e/test_data_drift_signal_monitor_e2e.py @@ -13,7 +13,8 @@ DATA_ASSET_IRIS_PREPROCESSED_MODEL_INPUTS_NO_DRIFT, DATA_ASSET_EMPTY, DATA_ASSET_IRIS_BASELINE_INT_DATA_TYPE, - DATA_ASSET_IRIS_PREPROCESSED_MODEL_INPUTS_NO_DRIFT_INT_DATA + DATA_ASSET_IRIS_PREPROCESSED_MODEL_INPUTS_NO_DRIFT_INT_DATA, + DATA_ASSET_IRIS_PREPROCESSED_MODEL_INPUTS_NO_COMMON_COLUMNS ) @@ -92,6 +93,22 @@ def test_monitoring_run_empty_production_data_failed( # empty production data should fail the job assert pipeline_job.status == "Failed" + def test_monitoring_run_no_common_features_production_data_failed( + self, ml_client: MLClient, get_component, download_job_output, + test_suite_name + ): + """Test the scenario where the production data has no common features with baseline.""" + pipeline_job = _submit_data_drift_model_monitor_job( + ml_client, + get_component, + test_suite_name, + DATA_ASSET_IRIS_BASELINE_DATA, + DATA_ASSET_IRIS_PREPROCESSED_MODEL_INPUTS_NO_COMMON_COLUMNS, + ) + + # No common columns should fail the job in the feature selector step. 
+ assert pipeline_job.status == "Failed" + def test_monitoring_run_use_int_data_has_no_drift_successful( self, ml_client: MLClient, get_component, download_job_output, test_suite_name diff --git a/assets/model_monitoring/components/tests/e2e/utils/constants.py b/assets/model_monitoring/components/tests/e2e/utils/constants.py index 6b2d767b61..f611199957 100644 --- a/assets/model_monitoring/components/tests/e2e/utils/constants.py +++ b/assets/model_monitoring/components/tests/e2e/utils/constants.py @@ -78,6 +78,10 @@ "azureml:mltable_empty:1" ) +DATA_ASSET_IRIS_PREPROCESSED_MODEL_INPUTS_NO_COMMON_COLUMNS = ( + "azureml:mltable_iris_preprocessed_model_inputs_no_common_columns:1" +) + # MDC-generated target dataset of an iris model which contains both the input features as well as the inferred results. # The data contains no drift. Output logs have been generated for 2023/01/01/00 and 2023/02/01/00. DATA_ASSET_IRIS_MODEL_INPUTS_OUTPUTS_WITH_NO_DRIFT = ( diff --git a/assets/model_monitoring/components/tests/unit/test_feature_selector_all.py b/assets/model_monitoring/components/tests/unit/test_feature_selector_all.py new file mode 100644 index 0000000000..03f5c4d7f0 --- /dev/null +++ b/assets/model_monitoring/components/tests/unit/test_feature_selector_all.py @@ -0,0 +1,44 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT License. 
"""This file contains unit tests for the FeatureSelectorAll class."""

import pytest
from pyspark.sql.types import StructType, StructField, DoubleType, FloatType
from src.model_monitor_feature_selector.selectors.feature_selector_all import FeatureSelectorAll
from tests.e2e.utils.io_utils import create_pyspark_dataframe


@pytest.mark.unit
class TestFeatureSelectorAll:
    """Unit tests for FeatureSelectorAll.select."""

    def test_feature_selector_all_select_expect_succeed(self):
        """Select features when both inputs share a column with different numeric types.

        The baseline declares "target" as FloatType and the production data
        declares it as DoubleType; the selector is expected to return exactly
        one feature row for that shared column.
        """
        feature_selector = FeatureSelectorAll()

        # Same column name and values on both sides; only the declared Spark
        # data type differs (float vs. double).
        float_data = [(3.55,), (6.88,), (7.99,)]
        float_schema = StructType([
            StructField("target", FloatType(), True)])
        baseline_df = create_pyspark_dataframe(float_data, float_schema)

        double_data = [(3.55,), (6.88,), (7.99,)]
        double_schema = StructType([
            StructField("target", DoubleType(), True)])
        production_df = create_pyspark_dataframe(double_data, double_schema)

        features = feature_selector.select(baseline_df, production_df)
        assert features.count() == 1

    def test_feature_selector_all_select_no_common_columns_expect_failure(self):
        """Fail selection when the two inputs have no column names in common."""
        feature_selector = FeatureSelectorAll()

        # Test with two dataframes that have no common columns
        baseline_df = create_pyspark_dataframe([(1, "a"), (2, "b")],
                                               ["id", "name"])
        production_df = create_pyspark_dataframe([(3, "c"), (4, "d")],
                                                 ["age", "gender"])

        # pytest.raises fails the test when no exception is raised at all; a
        # bare try/except with the assertion in the handler would silently
        # pass in that case. The selector raises a plain Exception, so that is
        # the narrowest type that can be expected here.
        with pytest.raises(Exception) as exc_info:
            feature_selector.select(baseline_df, production_df)
        assert "Found no common columns between input datasets." in exc_info.value.args[0]