Add common features validation in Feature Selector component (#2073)

* add validation for no common columns
* use spark functions on dataframe
* add unit test for feature selector
* remove whitespace
* add feature selector test in data drift signal e2e test
* spacing
* fix style changes
* fix comments
* update spec version
Azure · Jan 5, 2024 · 55474f4 · 55474f4
1 parent 7d89997
commit 55474f4
Show file tree

Hide file tree

Showing 10 changed files with 159 additions and 5 deletions.
diff --git a/assets/model_monitoring/components/data_drift/data_drift_signal_monitor/spec.yaml b/assets/model_monitoring/components/data_drift/data_drift_signal_monitor/spec.yaml
@@ -70,7 +70,7 @@ jobs:
       type: aml_token
   feature_selection:
     type: spark
-    component: azureml://registries/azureml/components/model_monitor_feature_selector/versions/0.3.1
+    component: azureml://registries/azureml/components/model_monitor_feature_selector/versions/0.3.2
     inputs:
       input_data_1:
         type: mltable

diff --git a/assets/model_monitoring/components/data_quality/data_quality_signal_monitor/spec.yaml b/assets/model_monitoring/components/data_quality/data_quality_signal_monitor/spec.yaml
@@ -70,7 +70,7 @@ jobs:
       type: aml_token
   feature_selection:
     type: spark
-    component: azureml://registries/azureml/components/model_monitor_feature_selector/versions/0.3.1
+    component: azureml://registries/azureml/components/model_monitor_feature_selector/versions/0.3.2
     inputs:
       input_data_1:
         type: mltable

diff --git a/assets/model_monitoring/components/model_monitor/model_monitor_feature_selector/spec.yaml b/assets/model_monitoring/components/model_monitor/model_monitor_feature_selector/spec.yaml
@@ -4,7 +4,7 @@ type: spark
 name: model_monitor_feature_selector
 display_name: Model Monitor - Feature Selector
 description: Selects features to compute signal metrics on.
-version: 0.3.1
+version: 0.3.2
 is_deterministic: true
 
 code: ../../src

diff --git a/...ts/model_monitoring/components/prediction_drift/prediction_drift_signal_monitor/spec.yaml b/...ts/model_monitoring/components/prediction_drift/prediction_drift_signal_monitor/spec.yaml
@@ -47,7 +47,7 @@ outputs:
 jobs:
   feature_selection:
     type: spark
-    component: azureml://registries/azureml/components/model_monitor_feature_selector/versions/0.3.1
+    component: azureml://registries/azureml/components/model_monitor_feature_selector/versions/0.3.2
     inputs:
       input_data_1:
         type: mltable

diff --git a/...onitoring/components/src/model_monitor_feature_selector/selectors/feature_selector_all.py b/...onitoring/components/src/model_monitor_feature_selector/selectors/feature_selector_all.py
@@ -43,5 +43,9 @@ def select(
         spark = init_spark()
         features = spark.createDataFrame(data=rows, schema=schema)
         features.show()
+        if features.isEmpty():
+            raise Exception(
+                "Could not generate features set correctly. Found no common columns between input datasets."
+            )
 
         return features
diff --git a/...ents/tests/e2e/resources/mltable_iris_preprocessed_model_inputs_no_common_columns/MLTable b/...ents/tests/e2e/resources/mltable_iris_preprocessed_model_inputs_no_common_columns/MLTable
@@ -0,0 +1,9 @@
+type: mltable
+
+paths:
+  - pattern: ./*.csv
+transformations:
+  - read_delimited:
+      delimiter: ','
+      encoding: ascii
+      header: all_files_same_headers
diff --git a/.../e2e/resources/mltable_iris_preprocessed_model_inputs_no_common_columns/iris_baseline.csv b/.../e2e/resources/mltable_iris_preprocessed_model_inputs_no_common_columns/iris_baseline.csv
@@ -0,0 +1,76 @@
+sepal_lengths,sepal_widths,petal_lengths,petal_widths
+5,3,1.6,0.2
+5,3.4,1.6,0.4
+5.2,3.5,1.5,0.2
+5.2,3.4,1.4,0.2
+4.7,3.2,1.6,0.2
+4.8,3.1,1.6,0.2
+5.4,3.4,1.5,0.4
+5.2,4.1,1.5,0.1
+5.5,4.2,1.4,0.2
+4.9,3.1,1.5,0.1
+5,3.2,1.2,0.2
+5.5,3.5,1.3,0.2
+4.9,3.1,1.5,0.1
+4.4,3,1.3,0.2
+5.1,3.4,1.5,0.2
+5,3.5,1.3,0.3
+4.5,2.3,1.3,0.3
+4.4,3.2,1.3,0.2
+5,3.5,1.6,0.6
+5.1,3.8,1.9,0.4
+4.8,3,1.4,0.3
+5.1,3.8,1.6,0.2
+4.6,3.2,1.4,0.2
+5.3,3.7,1.5,0.2
+5,3.3,1.4,0.2
+6.6,3,4.4,1.4
+6.8,2.8,4.8,1.4
+6.7,3,5,1.7
+6,2.9,4.5,1.5
+5.7,2.6,3.5,1
+5.5,2.4,3.8,1.1
+5.5,2.4,3.7,1
+5.8,2.7,3.9,1.2
+6,2.7,5.1,1.6
+5.4,3,4.5,1.5
+6,3.4,4.5,1.6
+6.7,3.1,4.7,1.5
+6.3,2.3,4.4,1.3
+5.6,3,4.1,1.3
+5.5,2.5,4,1.3
+5.5,2.6,4.4,1.2
+6.1,3,4.6,1.4
+5.8,2.6,4,1.2
+5,2.3,3.3,1
+5.6,2.7,4.2,1.3
+5.7,3,4.2,1.2
+5.7,2.9,4.2,1.3
+6.2,2.9,4.3,1.3
+5.1,2.5,3,1.1
+5.7,2.8,4.1,1.3
+7.2,3.2,6,1.8
+6.2,2.8,4.8,1.8
+6.1,3,4.9,1.8
+6.4,2.8,5.6,2.1
+7.2,3,5.8,1.6
+7.4,2.8,6.1,1.9
+7.9,3.8,6.4,2
+6.4,2.8,5.6,2.2
+6.3,2.8,5.1,1.5
+6.1,2.6,5.6,1.4
+7.7,3,6.1,2.3
+6.3,3.4,5.6,2.4
+6.4,3.1,5.5,1.8
+6,3,4.8,1.8
+6.9,3.1,5.4,2.1
+6.7,3.1,5.6,2.4
+6.9,3.1,5.1,2.3
+5.8,2.7,5.1,1.9
+6.8,3.2,5.9,2.3
+6.7,3.3,5.7,2.5
+6.7,3,5.2,2.3
+6.3,2.5,5,1.9
+6.5,3,5.2,2
+6.2,3.4,5.4,2.3
+5.9,3,5.1,1.8
diff --git a/assets/model_monitoring/components/tests/e2e/test_data_drift_signal_monitor_e2e.py b/assets/model_monitoring/components/tests/e2e/test_data_drift_signal_monitor_e2e.py
@@ -13,7 +13,8 @@
     DATA_ASSET_IRIS_PREPROCESSED_MODEL_INPUTS_NO_DRIFT,
     DATA_ASSET_EMPTY,
     DATA_ASSET_IRIS_BASELINE_INT_DATA_TYPE,
-    DATA_ASSET_IRIS_PREPROCESSED_MODEL_INPUTS_NO_DRIFT_INT_DATA
+    DATA_ASSET_IRIS_PREPROCESSED_MODEL_INPUTS_NO_DRIFT_INT_DATA,
+    DATA_ASSET_IRIS_PREPROCESSED_MODEL_INPUTS_NO_COMMON_COLUMNS
 )
 
 
@@ -92,6 +93,22 @@ def test_monitoring_run_empty_production_data_failed(
         # empty production data should fail the job
         assert pipeline_job.status == "Failed"
 
+    def test_monitoring_run_no_common_features_production_data_failed(
+        self, ml_client: MLClient, get_component, download_job_output,
+        test_suite_name
+    ):
+        """Test the scenario where the production data has no common features with baseline."""
+        pipeline_job = _submit_data_drift_model_monitor_job(
+            ml_client,
+            get_component,
+            test_suite_name,
+            DATA_ASSET_IRIS_BASELINE_DATA,
+            DATA_ASSET_IRIS_PREPROCESSED_MODEL_INPUTS_NO_COMMON_COLUMNS,
+        )
+
+        # No common columns should fail the job in the feature selector step.
+        assert pipeline_job.status == "Failed"
+
     def test_monitoring_run_use_int_data_has_no_drift_successful(
         self, ml_client: MLClient, get_component, download_job_output,
         test_suite_name

diff --git a/assets/model_monitoring/components/tests/e2e/utils/constants.py b/assets/model_monitoring/components/tests/e2e/utils/constants.py
@@ -78,6 +78,10 @@
     "azureml:mltable_empty:1"
 )
 
+# Iris model-inputs dataset used by the "no common columns" failure test.
+# NOTE(review): presumably backed by the mltable_iris_preprocessed_model_inputs_no_common_columns
+# test resource, whose CSV header uses pluralized column names — confirm.
+DATA_ASSET_IRIS_PREPROCESSED_MODEL_INPUTS_NO_COMMON_COLUMNS = (
+    "azureml:mltable_iris_preprocessed_model_inputs_no_common_columns:1"
+)
+
 # MDC-generated target dataset of an iris model which contains both the input features as well as the inferred results.
 # The data contains no drift. Output logs have been generated for 2023/01/01/00 and 2023/02/01/00.
 DATA_ASSET_IRIS_MODEL_INPUTS_OUTPUTS_WITH_NO_DRIFT = (

diff --git a/assets/model_monitoring/components/tests/unit/test_feature_selector_all.py b/assets/model_monitoring/components/tests/unit/test_feature_selector_all.py
@@ -0,0 +1,44 @@
+# Copyright (c) Microsoft Corporation.
+# Licensed under the MIT License.
+
+"""This file contains unit tests for the FeatureSelectorAll class."""
+
+import pytest
+from pyspark.sql.types import StructType, StructField, DoubleType, FloatType
+from src.model_monitor_feature_selector.selectors.feature_selector_all import FeatureSelectorAll
+from tests.e2e.utils.io_utils import create_pyspark_dataframe
+
+
+@pytest.mark.unit
+class TestFeatureSelectorAll:
+    """Test class for feature selector component."""
+
+    def test_feature_selector_all_select_expect_succeed(self):
+        """Test feature selector scenarios."""
+        feature_selector = FeatureSelectorAll()
+
+        # Test with two dataframes that have common columns but datatype is not in the same type
+        float_data = [(3.55,), (6.88,), (7.99,)]
+        schema = StructType([
+            StructField("target", FloatType(), True)])
+        baseline_df = create_pyspark_dataframe(float_data, schema)
+        double_data = [(3.55,), (6.88,), (7.99,)]
+        schema = StructType([
+            StructField("target", DoubleType(), True)])
+        production_df = create_pyspark_dataframe(double_data, schema)
+        features = feature_selector.select(baseline_df, production_df)
+        assert features.count() == 1
+
+    def test_feature_selector_all_select_no_common_columns_expect_failure(self):
+        """Test feature selector scenarios."""
+        feature_selector = FeatureSelectorAll()
+
+        # Test with two dataframes that have no common columns
+        baseline_df = create_pyspark_dataframe([(1, "a"), (2, "b")],
+                                               ["id", "name"])
+        production_df = create_pyspark_dataframe([(3, "c"), (4, "d")],
+                                                 ["age", "gender"])
+        try:
+            feature_selector.select(baseline_df, production_df)
+        except Exception as e:
+            assert "Found no common columns between input datasets." in e.args[0]