From a1f9dfd42b22b4de9a754a010485ea2757bb9a5a Mon Sep 17 00:00:00 2001 From: alanpo1 <143033175+alanpo1@users.noreply.github.com> Date: Thu, 11 Jan 2024 16:34:12 -0800 Subject: [PATCH] Create correct histogram buckets when data column has a single distinct value (#2100) * working on new testcase for histogram failure * dataset col numerical * check for buckets with single values. update spec * style and documentation edits * syntax * syntax fix * fix column to not be inferred as bool * fix column to not be inferred as bool * consider edge-case min_value is 0 * handle negative min values for delta * add histogram UT * fix linter issues * linter * test expected values in single bucket * fix ut * revert spec version * revert compute histogram spec version * fix UT * fix UT --- .../data_drift_signal_monitor/spec.yaml | 4 +- .../spec.yaml | 2 +- .../src/shared_utilities/histogram_utils.py | 13 +- .../iris_baseline.csv | 2 +- .../MLTable | 10 + .../iris_baseline.csv | 14 ++ .../MLTable | 9 + .../iris_baseline.csv | 11 + .../e2e/test_data_drift_signal_monitor_e2e.py | 19 +- .../components/tests/e2e/utils/constants.py | 8 + .../tests/unit/test_histogram_utils.py | 207 ++++++++++++++++++ 11 files changed, 291 insertions(+), 8 deletions(-) create mode 100644 assets/model_monitoring/components/tests/e2e/resources/mltable_iris_baseline_int_single_value_histogram/MLTable create mode 100644 assets/model_monitoring/components/tests/e2e/resources/mltable_iris_baseline_int_single_value_histogram/iris_baseline.csv create mode 100644 assets/model_monitoring/components/tests/e2e/resources/mltable_iris_preprocessed_model_inputs_int_single_value_histogram/MLTable create mode 100644 assets/model_monitoring/components/tests/e2e/resources/mltable_iris_preprocessed_model_inputs_int_single_value_histogram/iris_baseline.csv create mode 100644 assets/model_monitoring/components/tests/unit/test_histogram_utils.py diff --git 
a/assets/model_monitoring/components/data_drift/data_drift_signal_monitor/spec.yaml b/assets/model_monitoring/components/data_drift/data_drift_signal_monitor/spec.yaml index 00ec3263ef..6ce6d7e776 100644 --- a/assets/model_monitoring/components/data_drift/data_drift_signal_monitor/spec.yaml +++ b/assets/model_monitoring/components/data_drift/data_drift_signal_monitor/spec.yaml @@ -4,7 +4,7 @@ type: pipeline name: data_drift_signal_monitor display_name: Data Drift - Signal Monitor description: Computes the data drift between a baseline and production data assets. -version: 0.3.22 +version: 0.3.23 is_deterministic: true inputs: @@ -116,7 +116,7 @@ jobs: type: aml_token compute_histogram_buckets: type: spark - component: azureml://registries/azureml/components/model_monitor_compute_histogram_buckets/versions/0.3.3 + component: azureml://registries/azureml/components/model_monitor_compute_histogram_buckets/versions/0.3.4 inputs: input_data_1: type: mltable diff --git a/assets/model_monitoring/components/model_monitor/model_monitor_compute_histogram_buckets/spec.yaml b/assets/model_monitoring/components/model_monitor/model_monitor_compute_histogram_buckets/spec.yaml index 033c210d6e..df8d7e2a96 100644 --- a/assets/model_monitoring/components/model_monitor/model_monitor_compute_histogram_buckets/spec.yaml +++ b/assets/model_monitoring/components/model_monitor/model_monitor_compute_histogram_buckets/spec.yaml @@ -4,7 +4,7 @@ type: spark name: model_monitor_compute_histogram_buckets display_name: Model Monitor - Compute Histogram Buckets description: Compute histogram buckets given up to two datasets. 
-version: 0.3.3 +version: 0.3.4 is_deterministic: true code: ../../src diff --git a/assets/model_monitoring/components/src/shared_utilities/histogram_utils.py b/assets/model_monitoring/components/src/shared_utilities/histogram_utils.py index eb2201b98b..f55d50466a 100644 --- a/assets/model_monitoring/components/src/shared_utilities/histogram_utils.py +++ b/assets/model_monitoring/components/src/shared_utilities/histogram_utils.py @@ -11,8 +11,8 @@ def _get_smaller_df(baseline_df, production_df, baseline_count, production_count return baseline_df if baseline_count < production_count else production_df -def _get_bin_width(baseline_df, production_df, baseline_count, production_count): - """Calculate bin width using struges alogorithm.""" +def _get_optimal_number_of_bins(baseline_df, production_df, baseline_count, production_count): + """Calculate number of bins for histogram using Sturges algorithm.""" # TODO: Unnecessary calculation, use count from summary and remove _get_smaller_df() smaller_df = _get_smaller_df( baseline_df, production_df, baseline_count, production_count @@ -25,7 +25,7 @@ def get_dual_histogram_bin_edges( baseline_df, production_df, baseline_count, production_count, numerical_columns ): """Get histogram edges using fixed bin width.""" - num_bins = _get_bin_width( + num_bins = _get_optimal_number_of_bins( baseline_df, production_df, baseline_count, production_count ) all_bin_edges = {} @@ -43,6 +43,13 @@ bin_width = (max_value - min_value) / num_bins + # If histogram has only one value then we only need a single bucket and + # should skip the for-loop below.
+ if min_value == max_value: + delta = 0.005 if min_value == 0 else abs(min_value * 0.005) + all_bin_edges[col] = [min_value - delta, min_value + delta] + continue + edges = [] for bin in range(num_bins): bin = bin + 1 diff --git a/assets/model_monitoring/components/tests/e2e/resources/mltable_iris_baseline_int_data_type/iris_baseline.csv b/assets/model_monitoring/components/tests/e2e/resources/mltable_iris_baseline_int_data_type/iris_baseline.csv index 0a30dddb46..f60d7f09f5 100644 --- a/assets/model_monitoring/components/tests/e2e/resources/mltable_iris_baseline_int_data_type/iris_baseline.csv +++ b/assets/model_monitoring/components/tests/e2e/resources/mltable_iris_baseline_int_data_type/iris_baseline.csv @@ -11,4 +11,4 @@ sepal_length,sepal_width,petal_length,petal_width,target 4,3,1,5,virginica 5,3,1,6,versicolor 4,3,1,5,virginica -5,3,1,6,versicolor \ No newline at end of file +5,3,1.0,6,versicolor \ No newline at end of file diff --git a/assets/model_monitoring/components/tests/e2e/resources/mltable_iris_baseline_int_single_value_histogram/MLTable b/assets/model_monitoring/components/tests/e2e/resources/mltable_iris_baseline_int_single_value_histogram/MLTable new file mode 100644 index 0000000000..9eb500ca6a --- /dev/null +++ b/assets/model_monitoring/components/tests/e2e/resources/mltable_iris_baseline_int_single_value_histogram/MLTable @@ -0,0 +1,10 @@ +type: mltable + +paths: + - pattern: ./*.csv +transformations: + - read_delimited: + delimiter: ',' + encoding: ascii + header: all_files_same_headers + \ No newline at end of file diff --git a/assets/model_monitoring/components/tests/e2e/resources/mltable_iris_baseline_int_single_value_histogram/iris_baseline.csv b/assets/model_monitoring/components/tests/e2e/resources/mltable_iris_baseline_int_single_value_histogram/iris_baseline.csv new file mode 100644 index 0000000000..2d8bd04ef0 --- /dev/null +++ 
b/assets/model_monitoring/components/tests/e2e/resources/mltable_iris_baseline_int_single_value_histogram/iris_baseline.csv @@ -0,0 +1,14 @@ +sepal_length,sepal_width,petal_length,petal_width,target,pickup_month +5,3,1,4,setosa,1 +5,3,1,1,setosa,1 +2,3,1,2,setosa,1 +5,3,1,3,setosa,1 +4,3,1,4,setosa,1 +4,3,1,5,virginica,1 +5,3,1,6,versicolor,1 +4,3,1,5,virginica,1 +5,3,1,6,versicolor,1 +4,3,1,5,virginica,1 +5,3,1,6,versicolor,1 +4,3,1,5,virginica,1 +5,3,1.0,6,versicolor,1.0 \ No newline at end of file diff --git a/assets/model_monitoring/components/tests/e2e/resources/mltable_iris_preprocessed_model_inputs_int_single_value_histogram/MLTable b/assets/model_monitoring/components/tests/e2e/resources/mltable_iris_preprocessed_model_inputs_int_single_value_histogram/MLTable new file mode 100644 index 0000000000..6c2bd8c423 --- /dev/null +++ b/assets/model_monitoring/components/tests/e2e/resources/mltable_iris_preprocessed_model_inputs_int_single_value_histogram/MLTable @@ -0,0 +1,9 @@ +type: mltable + +paths: + - pattern: ./*.csv +transformations: + - read_delimited: + delimiter: ',' + encoding: ascii + header: all_files_same_headers \ No newline at end of file diff --git a/assets/model_monitoring/components/tests/e2e/resources/mltable_iris_preprocessed_model_inputs_int_single_value_histogram/iris_baseline.csv b/assets/model_monitoring/components/tests/e2e/resources/mltable_iris_preprocessed_model_inputs_int_single_value_histogram/iris_baseline.csv new file mode 100644 index 0000000000..2a9fa559b3 --- /dev/null +++ b/assets/model_monitoring/components/tests/e2e/resources/mltable_iris_preprocessed_model_inputs_int_single_value_histogram/iris_baseline.csv @@ -0,0 +1,11 @@ +sepal_length,sepal_width,petal_length,petal_width,pickup_month +5,3,1.6,0.2,1 +5,3.4,1.6,0.4,1 +5,3.5,1.5,0.2,1 +5,3.4,1.4,0.2,1 +4,3.2,1.6,0.2,1 +4,3.1,1.6,0.2,1 +5,3.4,1.5,0.4,1 +5,4.1,1.5,0.1,1 +5,4.2,1.4,0.2,1 +4,3.1,1.5,0.1,1.0 \ No newline at end of file diff --git 
a/assets/model_monitoring/components/tests/e2e/test_data_drift_signal_monitor_e2e.py b/assets/model_monitoring/components/tests/e2e/test_data_drift_signal_monitor_e2e.py index 8f816a78ad..9772f45919 100644 --- a/assets/model_monitoring/components/tests/e2e/test_data_drift_signal_monitor_e2e.py +++ b/assets/model_monitoring/components/tests/e2e/test_data_drift_signal_monitor_e2e.py @@ -14,7 +14,9 @@ DATA_ASSET_EMPTY, DATA_ASSET_IRIS_BASELINE_INT_DATA_TYPE, DATA_ASSET_IRIS_PREPROCESSED_MODEL_INPUTS_NO_DRIFT_INT_DATA, - DATA_ASSET_IRIS_PREPROCESSED_MODEL_INPUTS_NO_COMMON_COLUMNS + DATA_ASSET_IRIS_PREPROCESSED_MODEL_INPUTS_NO_COMMON_COLUMNS, + DATA_ASSET_IRIS_BASELINE_INT_SINGLE_VALUE_HISTOGRAM, + DATA_ASSET_IRIS_PREPROCESSED_MODEL_INPUTS_INT_SINGLE_VALUE_HISTOGRAM ) @@ -139,3 +141,18 @@ def test_monitoring_run_empty_production_and_baseline_data( # empty production and target data should fail the job assert pipeline_job.status == "Failed" + + def test_monitoring_run_int_single_distinct_value_histogram( + self, ml_client: MLClient, get_component, download_job_output, + test_suite_name + ): + """Test the scenario where the production data has a column with only one distinct value.""" + pipeline_job = _submit_data_drift_model_monitor_job( + ml_client, + get_component, + test_suite_name, + DATA_ASSET_IRIS_BASELINE_INT_SINGLE_VALUE_HISTOGRAM, + DATA_ASSET_IRIS_PREPROCESSED_MODEL_INPUTS_INT_SINGLE_VALUE_HISTOGRAM, + ) + + assert pipeline_job.status == "Completed" diff --git a/assets/model_monitoring/components/tests/e2e/utils/constants.py b/assets/model_monitoring/components/tests/e2e/utils/constants.py index 867f942ce3..2233e749cf 100644 --- a/assets/model_monitoring/components/tests/e2e/utils/constants.py +++ b/assets/model_monitoring/components/tests/e2e/utils/constants.py @@ -82,6 +82,14 @@ "azureml:mltable_iris_preprocessed_model_inputs_no_common_columns:1" ) +# used for checking against histogram regressions where a numerical data-column has a single distinct value 
+DATA_ASSET_IRIS_PREPROCESSED_MODEL_INPUTS_INT_SINGLE_VALUE_HISTOGRAM = ( + "azureml:mltable_iris_preprocessed_model_inputs_int_single_value_histogram:1" +) +DATA_ASSET_IRIS_BASELINE_INT_SINGLE_VALUE_HISTOGRAM = ( + "azureml:mltable_iris_baseline_int_single_value_histogram:1" +) + # MDC-generated target dataset of an iris model which contains both the input features as well as the inferred results. # The data contains no drift. Output logs have been generated for 2023/01/01/00 and 2023/02/01/00. DATA_ASSET_IRIS_MODEL_INPUTS_OUTPUTS_WITH_NO_DRIFT = ( diff --git a/assets/model_monitoring/components/tests/unit/test_histogram_utils.py b/assets/model_monitoring/components/tests/unit/test_histogram_utils.py new file mode 100644 index 0000000000..ca6de42791 --- /dev/null +++ b/assets/model_monitoring/components/tests/unit/test_histogram_utils.py @@ -0,0 +1,207 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT License. +"""This file contains unit tests for the histogram utilities.""" + +from pyspark.sql import SparkSession, DataFrame +import pyspark.sql.functions as pyspark_f +from src.shared_utilities.df_utils import get_numerical_cols_with_df +from src.shared_utilities.histogram_utils import ( + get_dual_histogram_bin_edges + ) +import math +import pandas as pd +import pytest + + +@pytest.mark.unit +class TestDFUtils: + """Test class for histogram utilities.""" + + def _num_bins_by_struges_algorithm(self, df: DataFrame) -> int: + """For testing suite, calculate number of bins for a dataset using Sturges algorithm.""" + num_bins = math.log2(df.count()) + 1 + return math.ceil(num_bins) + + def test_get_dual_histogram_bin_edges(self): + """Test with mixed columns; expect success.""" + column_dtype_map = { + 'col1': 'int', + 'col2': 'float', + 'col3': 'double', + 'col4': 'decimal', + 'col5': 'string' + } + baseline_df = pd.DataFrame({ + 'col1': [1, 2, 3, 4, 5], + 'col2': [1.1, 2.2, 3.3, 4.4, 5.5], + 'col3': [1.11, 2.22, 3.33, 4.44, 5.55], + 'col4': [1.111, 2.222,
3.333, 4.444, 5.555] + }) + production_df = pd.DataFrame({ + 'col1': [1, 2, 3, 4, 5, 6, 7], + 'col2': [1.1, 2.2, 3.3, 4.4, 5.5, 6.6, 7.7], + 'col3': [1.11, 2.22, 3.33, 4.44, 5.55, 6.66, 7.77], + 'col4': [1.111, 2.222, 3.333, 4.444, 5.555, 6.666, 7.777] + }) + baseline_df = self.init_spark().createDataFrame(baseline_df) + production_df = self.init_spark().createDataFrame(production_df) + numerical_columns = get_numerical_cols_with_df(column_dtype_map, baseline_df) + + all_edges = get_dual_histogram_bin_edges( + baseline_df, production_df, baseline_df.count(), production_df.count(), numerical_columns + ) + + assert all_edges is not None + for col in numerical_columns: + assert all_edges.get(col, None) is not None + assert len(all_edges[col]) == self._num_bins_by_struges_algorithm(baseline_df) + 1 + + calculate_distinct_values_df = pd.DataFrame({col: all_edges[col]}) + distinct_df = self.init_spark().createDataFrame(calculate_distinct_values_df) + assert distinct_df.distinct().count() == len(all_edges[col]) + + def test_get_dual_histogram_bin_edges_single_distinct_value_bucket(self): + """Test scenario where we have a single bucket.""" + column_dtype_map = { + 'col1': 'int', + 'col2': 'float', + 'col3': 'double', + 'col4': 'decimal', + 'col5': 'string' + } + baseline_df = pd.DataFrame({ + 'col1': [1, 1, 1, 1, 1], + 'col2': [1.1, 2.2, 3.3, 4.4, 5.5], + 'col3': [1.11, 2.22, 3.33, 4.44, 5.55], + 'col4': [1.111, 2.222, 3.333, 4.444, 5.555] + }) + production_df = pd.DataFrame({ + 'col1': [1, 1, 1, 1, 1], + 'col2': [1.1, 2.2, 3.3, 4.4, 5.5], + 'col3': [1.11, 2.22, 3.33, 4.44, 5.55], + 'col4': [1.111, 2.222, 3.333, 4.444, 5.555] + }) + baseline_df = self.init_spark().createDataFrame(baseline_df) + production_df = self.init_spark().createDataFrame(production_df) + numerical_columns = get_numerical_cols_with_df(column_dtype_map, + baseline_df) + + all_edges = get_dual_histogram_bin_edges( + baseline_df, production_df, baseline_df.count(), production_df.count(), 
numerical_columns + ) + + assert all_edges is not None + for col in numerical_columns: + assert all_edges.get(col, None) is not None + + if col == 'col1': + assert len(all_edges[col]) == 2 + min_value = min(baseline_df.agg(pyspark_f.min(col)).collect()[0]) + expected_delta = min_value * 0.005 + assert all_edges[col][0] == (min_value - expected_delta) + assert all_edges[col][1] == (min_value + expected_delta) + else: + assert len(all_edges[col]) == self._num_bins_by_struges_algorithm(baseline_df) + 1 + + calculate_distinct_values_df = pd.DataFrame({col: all_edges[col]}) + distinct_df = self.init_spark().createDataFrame(calculate_distinct_values_df) + assert distinct_df.distinct().count() == len(all_edges[col]) + + def test_get_dual_histogram_bin_edges_single_distinct_value_bucket_negative(self): + """Test scenario where we have a single bucket with a negative value.""" + column_dtype_map = { + 'col1': 'int', + 'col2': 'float', + 'col3': 'double', + 'col4': 'decimal', + 'col5': 'string' + } + baseline_df = pd.DataFrame({ + 'col1': [-31, -31, -31, -31, -31], + 'col2': [1.1, 2.2, 3.3, 4.4, 5.5], + 'col3': [1.11, 2.22, 3.33, 4.44, 5.55], + 'col4': [1.111, 2.222, 3.333, 4.444, 5.555] + }) + production_df = pd.DataFrame({ + 'col1': [-31, -31, -31, -31, -31], + 'col2': [1.1, 2.2, 3.3, 4.4, 5.5], + 'col3': [1.11, 2.22, 3.33, 4.44, 5.55], + 'col4': [1.111, 2.222, 3.333, 4.444, 5.555] + }) + baseline_df = self.init_spark().createDataFrame(baseline_df) + production_df = self.init_spark().createDataFrame(production_df) + numerical_columns = get_numerical_cols_with_df(column_dtype_map, + baseline_df) + + all_edges = get_dual_histogram_bin_edges( + baseline_df, production_df, baseline_df.count(), production_df.count(), numerical_columns + ) + + assert all_edges is not None + for col in numerical_columns: + assert all_edges.get(col, None) is not None + + if col == 'col1': + assert len(all_edges[col]) == 2 + min_value = min(baseline_df.agg(pyspark_f.min(col)).collect()[0]) + 
expected_delta = abs(min_value * 0.005) + assert all_edges[col][0] == (min_value - expected_delta) + assert all_edges[col][1] == (min_value + expected_delta) + else: + assert len(all_edges[col]) == self._num_bins_by_struges_algorithm(baseline_df) + 1 + + calculate_distinct_values_df = pd.DataFrame({col: all_edges[col]}) + distinct_df = self.init_spark().createDataFrame(calculate_distinct_values_df) + assert distinct_df.distinct().count() == len(all_edges[col]) + + def test_get_dual_histogram_bin_edges_single_distinct_value_bucket_zero(self): + """Test scenario where we have a single bucket with the value as zero.""" + column_dtype_map = { + 'col1': 'int', + 'col2': 'float', + 'col3': 'double', + 'col4': 'decimal', + 'col5': 'string' + } + baseline_df = pd.DataFrame({ + 'col1': [0, 0, 0, 0, 0], + 'col2': [1.1, 2.2, 3.3, 4.4, 5.5], + 'col3': [1.11, 2.22, 3.33, 4.44, 5.55], + 'col4': [1.111, 2.222, 3.333, 4.444, 5.555] + }) + production_df = pd.DataFrame({ + 'col1': [0, 0, 0, 0, 0], + 'col2': [1.1, 2.2, 3.3, 4.4, 5.5], + 'col3': [1.11, 2.22, 3.33, 4.44, 5.55], + 'col4': [1.111, 2.222, 3.333, 4.444, 5.555] + }) + baseline_df = self.init_spark().createDataFrame(baseline_df) + production_df = self.init_spark().createDataFrame(production_df) + numerical_columns = get_numerical_cols_with_df(column_dtype_map, + baseline_df) + + all_edges = get_dual_histogram_bin_edges( + baseline_df, production_df, baseline_df.count(), production_df.count(), numerical_columns + ) + + assert all_edges is not None + for col in numerical_columns: + assert all_edges.get(col, None) is not None + + if col == 'col1': + assert len(all_edges[col]) == 2 + min_value = min(baseline_df.agg(pyspark_f.min(col)).collect()[0]) + expected_delta = 0.005 + assert all_edges[col][0] == (min_value - expected_delta) + assert all_edges[col][1] == (min_value + expected_delta) + else: + assert len(all_edges[col]) == self._num_bins_by_struges_algorithm(baseline_df) + 1 + + calculate_distinct_values_df = 
pd.DataFrame({col: all_edges[col]}) + distinct_df = self.init_spark().createDataFrame(calculate_distinct_values_df) + assert distinct_df.distinct().count() == len(all_edges[col]) + + def init_spark(self): + """Get or create spark session.""" + spark = SparkSession.builder.appName("test").getOrCreate() + return spark