Skip to content

Commit

Permalink
Add common features validation in Feature Selector component (#2073)
Browse files Browse the repository at this point in the history
* add validation for no common columns

* use spark functions on dataframe

* add unit test for feature selector

* remove whitespace

* add feature selector test in data drift signal e2e test

* spacing

* fix style changes

* fix comments

* update spec version
  • Loading branch information
alanpo1 authored Jan 5, 2024
1 parent 7d89997 commit 55474f4
Show file tree
Hide file tree
Showing 10 changed files with 159 additions and 5 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -70,7 +70,7 @@ jobs:
type: aml_token
feature_selection:
type: spark
component: azureml://registries/azureml/components/model_monitor_feature_selector/versions/0.3.1
component: azureml://registries/azureml/components/model_monitor_feature_selector/versions/0.3.2
inputs:
input_data_1:
type: mltable
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -70,7 +70,7 @@ jobs:
type: aml_token
feature_selection:
type: spark
component: azureml://registries/azureml/components/model_monitor_feature_selector/versions/0.3.1
component: azureml://registries/azureml/components/model_monitor_feature_selector/versions/0.3.2
inputs:
input_data_1:
type: mltable
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ type: spark
name: model_monitor_feature_selector
display_name: Model Monitor - Feature Selector
description: Selects features to compute signal metrics on.
version: 0.3.1
version: 0.3.2
is_deterministic: true

code: ../../src
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,7 @@ outputs:
jobs:
feature_selection:
type: spark
component: azureml://registries/azureml/components/model_monitor_feature_selector/versions/0.3.1
component: azureml://registries/azureml/components/model_monitor_feature_selector/versions/0.3.2
inputs:
input_data_1:
type: mltable
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -43,5 +43,9 @@ def select(
spark = init_spark()
features = spark.createDataFrame(data=rows, schema=schema)
features.show()
if features.isEmpty():
raise Exception(
"Could not generate features set correctly. Found no common columns between input datasets."
)

return features
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
# MLTable definition for a test data asset made of delimited text files.
type: mltable

# Pick up every CSV file that sits next to this definition.
paths:
- pattern: ./*.csv
# Parse the matched files as comma-delimited ASCII text.
transformations:
- read_delimited:
delimiter: ','
encoding: ascii
# NOTE(review): presumably means every matched file carries the same header row - confirm against the MLTable docs.
header: all_files_same_headers
Original file line number Diff line number Diff line change
@@ -0,0 +1,76 @@
sepal_lengths,sepal_widths,petal_lengths,petal_widths
5,3,1.6,0.2
5,3.4,1.6,0.4
5.2,3.5,1.5,0.2
5.2,3.4,1.4,0.2
4.7,3.2,1.6,0.2
4.8,3.1,1.6,0.2
5.4,3.4,1.5,0.4
5.2,4.1,1.5,0.1
5.5,4.2,1.4,0.2
4.9,3.1,1.5,0.1
5,3.2,1.2,0.2
5.5,3.5,1.3,0.2
4.9,3.1,1.5,0.1
4.4,3,1.3,0.2
5.1,3.4,1.5,0.2
5,3.5,1.3,0.3
4.5,2.3,1.3,0.3
4.4,3.2,1.3,0.2
5,3.5,1.6,0.6
5.1,3.8,1.9,0.4
4.8,3,1.4,0.3
5.1,3.8,1.6,0.2
4.6,3.2,1.4,0.2
5.3,3.7,1.5,0.2
5,3.3,1.4,0.2
6.6,3,4.4,1.4
6.8,2.8,4.8,1.4
6.7,3,5,1.7
6,2.9,4.5,1.5
5.7,2.6,3.5,1
5.5,2.4,3.8,1.1
5.5,2.4,3.7,1
5.8,2.7,3.9,1.2
6,2.7,5.1,1.6
5.4,3,4.5,1.5
6,3.4,4.5,1.6
6.7,3.1,4.7,1.5
6.3,2.3,4.4,1.3
5.6,3,4.1,1.3
5.5,2.5,4,1.3
5.5,2.6,4.4,1.2
6.1,3,4.6,1.4
5.8,2.6,4,1.2
5,2.3,3.3,1
5.6,2.7,4.2,1.3
5.7,3,4.2,1.2
5.7,2.9,4.2,1.3
6.2,2.9,4.3,1.3
5.1,2.5,3,1.1
5.7,2.8,4.1,1.3
7.2,3.2,6,1.8
6.2,2.8,4.8,1.8
6.1,3,4.9,1.8
6.4,2.8,5.6,2.1
7.2,3,5.8,1.6
7.4,2.8,6.1,1.9
7.9,3.8,6.4,2
6.4,2.8,5.6,2.2
6.3,2.8,5.1,1.5
6.1,2.6,5.6,1.4
7.7,3,6.1,2.3
6.3,3.4,5.6,2.4
6.4,3.1,5.5,1.8
6,3,4.8,1.8
6.9,3.1,5.4,2.1
6.7,3.1,5.6,2.4
6.9,3.1,5.1,2.3
5.8,2.7,5.1,1.9
6.8,3.2,5.9,2.3
6.7,3.3,5.7,2.5
6.7,3,5.2,2.3
6.3,2.5,5,1.9
6.5,3,5.2,2
6.2,3.4,5.4,2.3
5.9,3,5.1,1.8
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,8 @@
DATA_ASSET_IRIS_PREPROCESSED_MODEL_INPUTS_NO_DRIFT,
DATA_ASSET_EMPTY,
DATA_ASSET_IRIS_BASELINE_INT_DATA_TYPE,
DATA_ASSET_IRIS_PREPROCESSED_MODEL_INPUTS_NO_DRIFT_INT_DATA
DATA_ASSET_IRIS_PREPROCESSED_MODEL_INPUTS_NO_DRIFT_INT_DATA,
DATA_ASSET_IRIS_PREPROCESSED_MODEL_INPUTS_NO_COMMON_COLUMNS
)


Expand Down Expand Up @@ -92,6 +93,22 @@ def test_monitoring_run_empty_production_data_failed(
# empty production data should fail the job
assert pipeline_job.status == "Failed"

def test_monitoring_run_no_common_features_production_data_failed(
    self, ml_client: MLClient, get_component, download_job_output,
    test_suite_name
):
    """Verify the data drift monitor job fails when production and baseline data share no features."""
    # Submit the monitor with a production asset that has no columns in common with the baseline asset.
    job = _submit_data_drift_model_monitor_job(
        ml_client,
        get_component,
        test_suite_name,
        DATA_ASSET_IRIS_BASELINE_DATA,
        DATA_ASSET_IRIS_PREPROCESSED_MODEL_INPUTS_NO_COMMON_COLUMNS,
    )

    # The feature selector step is expected to reject inputs without common columns,
    # which should surface as a failed pipeline job.
    status = job.status
    assert status == "Failed"

def test_monitoring_run_use_int_data_has_no_drift_successful(
self, ml_client: MLClient, get_component, download_job_output,
test_suite_name
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -78,6 +78,10 @@
"azureml:mltable_empty:1"
)

# Preprocessed iris model inputs used as production data in the "no common features" e2e test.
# NOTE(review): the asset is expected to share no column names with the iris baseline data - confirm against
# the registered data asset.
DATA_ASSET_IRIS_PREPROCESSED_MODEL_INPUTS_NO_COMMON_COLUMNS = (
"azureml:mltable_iris_preprocessed_model_inputs_no_common_columns:1"
)

# MDC-generated target dataset of an iris model which contains both the input features as well as the inferred results.
# The data contains no drift. Output logs have been generated for 2023/01/01/00 and 2023/02/01/00.
DATA_ASSET_IRIS_MODEL_INPUTS_OUTPUTS_WITH_NO_DRIFT = (
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT License.

"""This file contains unit tests for the FeatureSelectorAll class."""

import pytest
from pyspark.sql.types import StructType, StructField, DoubleType, FloatType
from src.model_monitor_feature_selector.selectors.feature_selector_all import FeatureSelectorAll
from tests.e2e.utils.io_utils import create_pyspark_dataframe


@pytest.mark.unit
class TestFeatureSelectorAll:
    """Unit tests for FeatureSelectorAll.select."""

    def test_feature_selector_all_select_expect_succeed(self):
        """Select succeeds when the shared column differs only in its numeric type."""
        feature_selector = FeatureSelectorAll()

        # Both dataframes expose the same "target" column, but the baseline
        # declares it as float while production declares it as double.
        float_data = [(3.55,), (6.88,), (7.99,)]
        baseline_schema = StructType([
            StructField("target", FloatType(), True)])
        baseline_df = create_pyspark_dataframe(float_data, baseline_schema)

        double_data = [(3.55,), (6.88,), (7.99,)]
        production_schema = StructType([
            StructField("target", DoubleType(), True)])
        production_df = create_pyspark_dataframe(double_data, production_schema)

        features = feature_selector.select(baseline_df, production_df)

        # The selector is expected to return a single row for the one shared column.
        assert features.count() == 1

    def test_feature_selector_all_select_no_common_columns_expect_failure(self):
        """Select raises when the two input datasets share no column names."""
        feature_selector = FeatureSelectorAll()

        # Two dataframes whose column names do not overlap at all.
        baseline_df = create_pyspark_dataframe([(1, "a"), (2, "b")],
                                               ["id", "name"])
        production_df = create_pyspark_dataframe([(3, "c"), (4, "d")],
                                                 ["age", "gender"])

        # pytest.raises fails the test when no exception is raised; a bare
        # try/except would silently pass if select() returned normally.
        with pytest.raises(Exception) as exc_info:
            feature_selector.select(baseline_df, production_df)

        assert "Found no common columns between input datasets." in str(exc_info.value)

0 comments on commit 55474f4

Please sign in to comment.