Create correct histogram buckets when data column has a single distinct value (#2100)

* working on new testcase for histogram failure
* dataset col numerical
* check for buckets with single values. update spec
* style and documentation edits
* syntax
* syntax fix
* fix column to not be inferred as bool
* fix column to not be inferred as bool
* consider edge-case min_value is 0
* handle negative min values for delta
* add histogram UT
* fix linter issues
* linter
* test expected values in single bucket
* fix ut
* revert spec version
* revert compute histogram spec version
* fix UT
* fix UT
Showing 11 changed files with 291 additions and 8 deletions.
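Based on the commit message and the new unit tests below, the fix gives a column whose min equals max a single synthetic bucket instead of degenerate zero-width bin edges. A minimal sketch of that rule, inferred from the test expectations (the real logic lives in src/shared_utilities/histogram_utils.py's get_dual_histogram_bin_edges; the function names here are illustrative, not the component's API):

import math


def single_value_bin_edges(min_value: float) -> list:
    # Hypothetical sketch: spread two edges 0.5% of |min_value| around the
    # value, falling back to a flat 0.005 when the value is 0 so the two
    # edges stay distinct (matches the zero and negative test cases below).
    delta = abs(min_value * 0.005) if min_value != 0 else 0.005
    return [min_value - delta, min_value + delta]


def sturges_bin_count(row_count: int) -> int:
    # Sturges' formula, used by the normal multi-value path in the tests.
    return math.ceil(math.log2(row_count) + 1)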
10 changes: 10 additions & 0 deletions
...g/components/tests/e2e/resources/mltable_iris_baseline_int_single_value_histogram/MLTable
@@ -0,0 +1,10 @@
type: mltable

paths:
  - pattern: ./*.csv
transformations:
  - read_delimited:
      delimiter: ','
      encoding: ascii
      header: all_files_same_headers
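For reference, a resource directory like this one can typically be loaded with the mltable SDK; a hedged example (the placeholder stands in for the truncated resource path above):

import mltable

tbl = mltable.load("<path to mltable_iris_baseline_int_single_value_histogram>")
df = tbl.to_pandas_dataframe()  # applies the read_delimited transformation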
14 changes: 14 additions & 0 deletions
...ts/tests/e2e/resources/mltable_iris_baseline_int_single_value_histogram/iris_baseline.csv
@@ -0,0 +1,14 @@
sepal_length,sepal_width,petal_length,petal_width,target,pickup_month
5,3,1,4,setosa,1
5,3,1,1,setosa,1
2,3,1,2,setosa,1
5,3,1,3,setosa,1
4,3,1,4,setosa,1
4,3,1,5,virginica,1
5,3,1,6,versicolor,1
4,3,1,5,virginica,1
5,3,1,6,versicolor,1
4,3,1,5,virginica,1
5,3,1,6,versicolor,1
4,3,1,5,virginica,1
5,3,1.0,6,versicolor,1.0
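For context, a rough PySpark equivalent of the read_delimited settings declared in the MLTable above (path and options are illustrative; this is not the component's actual loading code):

from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("example").getOrCreate()
# delimiter=',' and header: all_files_same_headers map roughly to sep and header here
df = spark.read.csv("<resource dir>/iris_baseline.csv", sep=",", header=True, inferSchema=True)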
9 changes: 9 additions & 0 deletions
...s/e2e/resources/mltable_iris_preprocessed_model_inputs_int_single_value_histogram/MLTable
@@ -0,0 +1,9 @@
type: mltable

paths:
  - pattern: ./*.csv
transformations:
  - read_delimited:
      delimiter: ','
      encoding: ascii
      header: all_files_same_headers
11 changes: 11 additions & 0 deletions
...urces/mltable_iris_preprocessed_model_inputs_int_single_value_histogram/iris_baseline.csv
@@ -0,0 +1,11 @@
sepal_length,sepal_width,petal_length,petal_width,pickup_month
5,3,1.6,0.2,1
5,3.4,1.6,0.4,1
5,3.5,1.5,0.2,1
5,3.4,1.4,0.2,1
4,3.2,1.6,0.2,1
4,3.1,1.6,0.2,1
5,3.4,1.5,0.4,1
5,4.1,1.5,0.1,1
5,4.2,1.4,0.2,1
4,3.1,1.5,0.1,1.0
207 changes: 207 additions & 0 deletions
assets/model_monitoring/components/tests/unit/test_histogram_utils.py
@@ -0,0 +1,207 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT License.
"""This file contains unit tests for the histogram utilities."""

from pyspark.sql import SparkSession, DataFrame
import pyspark.sql.functions as pyspark_f
from src.shared_utilities.df_utils import get_numerical_cols_with_df
from src.shared_utilities.histogram_utils import (
    get_dual_histogram_bin_edges
)
import math
import pandas as pd
import pytest


@pytest.mark.unit
class TestDFUtils:
    """Test class for histogram utilities."""

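    # The tests below treat get_dual_histogram_bin_edges as returning a dict
    # that maps each numerical column name to the list of bin edges shared by
    # the baseline and production DataFrames.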
    def _num_bins_by_struges_algorithm(self, df: DataFrame) -> int:
        """Calculate the number of bins for a dataset using Sturges' formula (test-suite helper)."""
        num_bins = math.log2(df.count()) + 1
        return math.ceil(num_bins)

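    # For the 5-row baseline frames used below, Sturges' formula gives
    # math.ceil(math.log2(5) + 1) == 4 bins, i.e. 5 bin edges per column.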
    def test_get_dual_histogram_bin_edges(self):
        """Test computing bin edges for a mix of column types; expect success."""
        column_dtype_map = {
            'col1': 'int',
            'col2': 'float',
            'col3': 'double',
            'col4': 'decimal',
            'col5': 'string'
        }
        baseline_df = pd.DataFrame({
            'col1': [1, 2, 3, 4, 5],
            'col2': [1.1, 2.2, 3.3, 4.4, 5.5],
            'col3': [1.11, 2.22, 3.33, 4.44, 5.55],
            'col4': [1.111, 2.222, 3.333, 4.444, 5.555]
        })
        production_df = pd.DataFrame({
            'col1': [1, 2, 3, 4, 5, 6, 7],
            'col2': [1.1, 2.2, 3.3, 4.4, 5.5, 6.6, 7.7],
            'col3': [1.11, 2.22, 3.33, 4.44, 5.55, 6.66, 7.77],
            'col4': [1.111, 2.222, 3.333, 4.444, 5.555, 6.666, 7.777]
        })
        baseline_df = self.init_spark().createDataFrame(baseline_df)
        production_df = self.init_spark().createDataFrame(production_df)
        numerical_columns = get_numerical_cols_with_df(column_dtype_map, baseline_df)

        all_edges = get_dual_histogram_bin_edges(
            baseline_df, production_df, baseline_df.count(), production_df.count(), numerical_columns
        )

        assert all_edges is not None
        for col in numerical_columns:
            assert all_edges.get(col, None) is not None
            assert len(all_edges[col]) == self._num_bins_by_struges_algorithm(baseline_df) + 1

            calculate_distinct_values_df = pd.DataFrame({col: all_edges[col]})
            distinct_df = self.init_spark().createDataFrame(calculate_distinct_values_df)
            assert distinct_df.distinct().count() == len(all_edges[col])

    def test_get_dual_histogram_bin_edges_single_distinct_value_bucket(self):
        """Test scenario where we have a single bucket."""
        column_dtype_map = {
            'col1': 'int',
            'col2': 'float',
            'col3': 'double',
            'col4': 'decimal',
            'col5': 'string'
        }
        baseline_df = pd.DataFrame({
            'col1': [1, 1, 1, 1, 1],
            'col2': [1.1, 2.2, 3.3, 4.4, 5.5],
            'col3': [1.11, 2.22, 3.33, 4.44, 5.55],
            'col4': [1.111, 2.222, 3.333, 4.444, 5.555]
        })
        production_df = pd.DataFrame({
            'col1': [1, 1, 1, 1, 1],
            'col2': [1.1, 2.2, 3.3, 4.4, 5.5],
            'col3': [1.11, 2.22, 3.33, 4.44, 5.55],
            'col4': [1.111, 2.222, 3.333, 4.444, 5.555]
        })
        baseline_df = self.init_spark().createDataFrame(baseline_df)
        production_df = self.init_spark().createDataFrame(production_df)
        numerical_columns = get_numerical_cols_with_df(column_dtype_map, baseline_df)

        all_edges = get_dual_histogram_bin_edges(
            baseline_df, production_df, baseline_df.count(), production_df.count(), numerical_columns
        )

        assert all_edges is not None
        for col in numerical_columns:
            assert all_edges.get(col, None) is not None

            if col == 'col1':
                assert len(all_edges[col]) == 2
                min_value = min(baseline_df.agg(pyspark_f.min(col)).collect()[0])
                expected_delta = min_value * 0.005
                assert all_edges[col][0] == (min_value - expected_delta)
                assert all_edges[col][1] == (min_value + expected_delta)
            else:
                assert len(all_edges[col]) == self._num_bins_by_struges_algorithm(baseline_df) + 1

            calculate_distinct_values_df = pd.DataFrame({col: all_edges[col]})
            distinct_df = self.init_spark().createDataFrame(calculate_distinct_values_df)
            assert distinct_df.distinct().count() == len(all_edges[col])

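    # Single-distinct-value edge rule exercised above and in the two tests
    # below: delta = |min| * 0.005 (a flat 0.005 when min == 0), giving edges
    # [min - delta, min + delta]. E.g. min=1 -> [0.995, 1.005],
    # min=-31 -> [-31.155, -30.845], min=0 -> [-0.005, 0.005].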
    def test_get_dual_histogram_bin_edges_single_distinct_value_bucket_negative(self):
        """Test scenario where we have a single bucket with a negative value."""
        column_dtype_map = {
            'col1': 'int',
            'col2': 'float',
            'col3': 'double',
            'col4': 'decimal',
            'col5': 'string'
        }
        baseline_df = pd.DataFrame({
            'col1': [-31, -31, -31, -31, -31],
            'col2': [1.1, 2.2, 3.3, 4.4, 5.5],
            'col3': [1.11, 2.22, 3.33, 4.44, 5.55],
            'col4': [1.111, 2.222, 3.333, 4.444, 5.555]
        })
        production_df = pd.DataFrame({
            'col1': [-31, -31, -31, -31, -31],
            'col2': [1.1, 2.2, 3.3, 4.4, 5.5],
            'col3': [1.11, 2.22, 3.33, 4.44, 5.55],
            'col4': [1.111, 2.222, 3.333, 4.444, 5.555]
        })
        baseline_df = self.init_spark().createDataFrame(baseline_df)
        production_df = self.init_spark().createDataFrame(production_df)
        numerical_columns = get_numerical_cols_with_df(column_dtype_map, baseline_df)

        all_edges = get_dual_histogram_bin_edges(
            baseline_df, production_df, baseline_df.count(), production_df.count(), numerical_columns
        )

        assert all_edges is not None
        for col in numerical_columns:
            assert all_edges.get(col, None) is not None

            if col == 'col1':
                assert len(all_edges[col]) == 2
                min_value = min(baseline_df.agg(pyspark_f.min(col)).collect()[0])
                expected_delta = abs(min_value * 0.005)
                assert all_edges[col][0] == (min_value - expected_delta)
                assert all_edges[col][1] == (min_value + expected_delta)
            else:
                assert len(all_edges[col]) == self._num_bins_by_struges_algorithm(baseline_df) + 1

            calculate_distinct_values_df = pd.DataFrame({col: all_edges[col]})
            distinct_df = self.init_spark().createDataFrame(calculate_distinct_values_df)
            assert distinct_df.distinct().count() == len(all_edges[col])

    def test_get_dual_histogram_bin_edges_single_distinct_value_bucket_zero(self):
        """Test scenario where we have a single bucket with the value zero."""
        column_dtype_map = {
            'col1': 'int',
            'col2': 'float',
            'col3': 'double',
            'col4': 'decimal',
            'col5': 'string'
        }
        baseline_df = pd.DataFrame({
            'col1': [0, 0, 0, 0, 0],
            'col2': [1.1, 2.2, 3.3, 4.4, 5.5],
            'col3': [1.11, 2.22, 3.33, 4.44, 5.55],
            'col4': [1.111, 2.222, 3.333, 4.444, 5.555]
        })
        production_df = pd.DataFrame({
            'col1': [0, 0, 0, 0, 0],
            'col2': [1.1, 2.2, 3.3, 4.4, 5.5],
            'col3': [1.11, 2.22, 3.33, 4.44, 5.55],
            'col4': [1.111, 2.222, 3.333, 4.444, 5.555]
        })
        baseline_df = self.init_spark().createDataFrame(baseline_df)
        production_df = self.init_spark().createDataFrame(production_df)
        numerical_columns = get_numerical_cols_with_df(column_dtype_map, baseline_df)

        all_edges = get_dual_histogram_bin_edges(
            baseline_df, production_df, baseline_df.count(), production_df.count(), numerical_columns
        )

        assert all_edges is not None
        for col in numerical_columns:
            assert all_edges.get(col, None) is not None

            if col == 'col1':
                assert len(all_edges[col]) == 2
                min_value = min(baseline_df.agg(pyspark_f.min(col)).collect()[0])
                expected_delta = 0.005
                assert all_edges[col][0] == (min_value - expected_delta)
                assert all_edges[col][1] == (min_value + expected_delta)
            else:
                assert len(all_edges[col]) == self._num_bins_by_struges_algorithm(baseline_df) + 1

            calculate_distinct_values_df = pd.DataFrame({col: all_edges[col]})
            distinct_df = self.init_spark().createDataFrame(calculate_distinct_values_df)
            assert distinct_df.distinct().count() == len(all_edges[col])

    def init_spark(self):
        """Get or create spark session."""
        spark = SparkSession.builder.appName("test").getOrCreate()
        return spark
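These tests carry the pytest.mark.unit marker, so with a standard pytest setup at the components directory something like the following should select just this file (the exact invocation depends on the repo's test configuration):

pytest -m unit tests/unit/test_histogram_utils.py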