From f214350b838803c2d6bbe0fed7c586fb1e0ddd70 Mon Sep 17 00:00:00 2001
From: albrja
Date: Wed, 22 Nov 2023 16:38:39 -0800
Subject: [PATCH] Add SSA weighted proportion and started tests

---
 src/pseudopeople/configuration/generator.py |  1 +
 src/pseudopeople/configuration/validator.py | 91 ++++++++++++++-------
 src/pseudopeople/interface.py               |  7 +-
 tests/unit/test_configuration.py            | 50 ++++++++++-
 4 files changed, 114 insertions(+), 35 deletions(-)

diff --git a/src/pseudopeople/configuration/generator.py b/src/pseudopeople/configuration/generator.py
index fca2992f..da5284a3 100644
--- a/src/pseudopeople/configuration/generator.py
+++ b/src/pseudopeople/configuration/generator.py
@@ -98,6 +98,7 @@ def _generate_configuration(is_no_noise: bool) -> ConfigTree:
         "baseline",
         "default",
         "user",
+        "max_noise_level",
     ]
     noising_configuration = ConfigTree(layers=default_config_layers)
     # Instantiate the configuration file with baseline values
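The new "max_noise_level" layer is appended after "user", and the clamping below relies on later layers taking precedence (as the baseline -> default -> user ordering suggests). A minimal sketch of that precedence; the import path and the toy key name are assumptions and should be matched to whatever generator.py actually uses:

from vivarium.config_tree import ConfigTree  # import path assumed; mirror generator.py

config = ConfigTree(layers=["baseline", "default", "user", "max_noise_level"])
config.update({"cell_probability": 0.01}, layer="baseline")
config.update({"cell_probability": 0.95}, layer="user")  # user asks for more noise than the data supports
config.update({"cell_probability": 0.30}, layer="max_noise_level")  # validator clamps it
print(config["cell_probability"])  # 0.3 -- the max_noise_level layer wins over the user layer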
" - ) - configuration_tree[row["dataset"]][Keys.COLUMN_NOISE][row["column"]][ - row["noise_type"] - ] = max_noise_level + if dataset.name == metadata.DatasetNames.SSA: + dataset_noise_proportions = dataset_proportions.loc[ + (dataset_proportions["state"] == state) & (dataset_proportions["year"] <= year) + ] + dataset_noise_proportions = dataset_proportions.groupby( + ["column", "noise_type"] + ).apply(lambda x: ((x.proportion * x.number_of_rows).sum()) / x.number_of_rows.sum()) + dataset_noise_proportions = dataset_noise_proportions.rename( + "proportion" + ).reset_index() + dataset_noise_proportions["dataset"] = metadata.DatasetNames.SSA + else: + dataset_noise_proportions = dataset_proportions.loc[ + (dataset_proportions["state"] == state) & (dataset_proportions["year"] == year) + ] + + # If there is no data for a queried dataset, we want the user's to hit the correct error that there + # is no data available so we do not throw an error here. + if dataset_noise_proportions.empty: + return configuration_tree + else: + # Go through each row in the queried dataset noise proportions to validate the noise levels + for i in range(len(dataset_noise_proportions)): + row = dataset_noise_proportions.iloc[i] + if row["column"] not in [col.name for col in dataset.columns]: + continue + max_noise_level = row["proportion"] + config_noise_level = configuration_tree[row["dataset"]][Keys.COLUMN_NOISE][ + row["column"] + ][row["noise_type"]][Keys.CELL_PROBABILITY] + if config_noise_level > max_noise_level: + logger.warning( + f"The configured '{row['noise_type']}' noise level for column '{row['column']}' is {config_noise_level}, " + f"but the maximum allowable noise level is {max_noise_level}. " + f"The maximum allowable noise level will be used instead of the configured value. " + f"This value is based on the provided data for '{row['dataset']}'. " + ) + # Should we update here in validator or pass values back to interface? 
+ configuration_tree.update( + { + row["dataset"]: { + Keys.COLUMN_NOISE: { + row["column"]: { + row["noise_type"]: { + Keys.CELL_PROBABILITY: max_noise_level + } + } + } + } + }, + layer="max_noise_level", + ) + return configuration_tree DEFAULT_PARAMETER_CONFIG_VALIDATOR_MAP = { diff --git a/src/pseudopeople/interface.py b/src/pseudopeople/interface.py index 85079473..48bfddd1 100644 --- a/src/pseudopeople/interface.py +++ b/src/pseudopeople/interface.py @@ -55,10 +55,11 @@ def _generate_dataset( """ configure_logging_to_terminal(verbose) configuration_tree = get_configuration(config) - breakpoint() - # Validate confgiuration noise levels with possible metadata noise level proportions - # configuration_tree = validate_noise_level_proportions(configuration_tree, dataset, user_filters) + configuration_tree = validate_noise_level_proportions( + configuration_tree, dataset, user_filters + ) + if source is None: source = paths.SAMPLE_DATA_ROOT else: diff --git a/tests/unit/test_configuration.py b/tests/unit/test_configuration.py index 0c7644e8..89b5b974 100644 --- a/tests/unit/test_configuration.py +++ b/tests/unit/test_configuration.py @@ -1,12 +1,16 @@ import itertools +import pandas as pd import pytest import yaml from pseudopeople.configuration import NO_NOISE, Keys, get_configuration from pseudopeople.configuration.generator import DEFAULT_NOISE_VALUES from pseudopeople.configuration.interface import get_config -from pseudopeople.configuration.validator import ConfigurationError +from pseudopeople.configuration.validator import ( + ConfigurationError, + validate_noise_level_proportions, +) from pseudopeople.entity_types import ColumnNoiseType, RowNoiseType from pseudopeople.noise_entities import NOISE_TYPES from pseudopeople.schema_entities import COLUMNS, DATASETS @@ -28,7 +32,7 @@ def test_get_default_configuration(mocker): """Tests that the default configuration can be retrieved.""" mock = mocker.patch("pseudopeople.configuration.generator.ConfigTree") _ = get_configuration() - mock.assert_called_once_with(layers=["baseline", "default", "user"]) + mock.assert_called_once_with(layers=["baseline", "default", "user", "max_noise_level"]) def test_default_configuration_structure(): @@ -122,7 +126,7 @@ def test_get_configuration_with_user_override(mocker): } } _ = get_configuration(config) - mock.assert_called_once_with(layers=["baseline", "default", "user"]) + mock.assert_called_once_with(layers=["baseline", "default", "user", "max_noise_level"]) update_calls = [ call for call in mock.mock_calls @@ -557,3 +561,43 @@ def test_no_noise(): column_noise_dict = dataset_column_dict[column] for column_noise_type in column_noise_dict.keys(): assert column_noise_dict[column_noise_type][Keys.CELL_PROBABILITY] == 0.0 + + +def test_get_configuration_with_max_level_overrides(mocker): + """Tests that the default configuration get updated when a user provides a value + for specific noise types and we update with the maximum allowable value in the + max level override layer.""" + mock = mocker.patch("pseudopeople.configuration.generator.ConfigTree") + config = { + DATASETS.acs.name: { + Keys.COLUMN_NOISE: { + "first_name": {NOISE_TYPES.use_nickname.name: {Keys.CELL_PROBABILITY: 0.95}} + }, + } + } + user_filters = [ + ( + DATASETS.acs.date_column_name, + ">=", + pd.Timestamp(year=2020, month=1, day=1), + ), + ( + DATASETS.acs.date_column_name, + "<=", + pd.Timestamp(year=2020, month=12, day=31), + ), + (DATASETS.acs.state_column_name, "==", "WA"), + ] + # TODO: need to get full configuratio nand pass it 
+    _ = validate_noise_level_proportions(config, DATASETS.acs, user_filters)
+    mock.assert_called_once_with(layers=["baseline", "default", "user", "max_noise_level"])
+    update_calls = [
+        call
+        for call in mock.mock_calls
+        if ".update({" in str(call) and "layer='max_noise_level'" in str(call)
+    ]
+    assert len(update_calls) == 1
+
+
+# TODO: add test that a logger warning is thrown if the user provides a value that is too high
+# and it gets adjusted to the max level
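A possible shape for that last TODO, sketched as a starting point rather than a finished test. It reuses the imports already present in test_configuration.py, and it assumes the bundled metadata proportions cap the first_name/use_nickname combination below the configured 0.95 for these filters, and that patching the loguru logger used in validator.py is an acceptable way to observe the warning:

def test_noise_level_proportions_warns_and_clamps(mocker):
    # Patch the module-level loguru logger so the warning call can be observed.
    mock_logger = mocker.patch("pseudopeople.configuration.validator.logger")
    config_tree = get_configuration(
        {
            DATASETS.acs.name: {
                Keys.COLUMN_NOISE: {
                    "first_name": {
                        NOISE_TYPES.use_nickname.name: {Keys.CELL_PROBABILITY: 0.95}
                    }
                },
            }
        }
    )
    user_filters = [
        (DATASETS.acs.date_column_name, ">=", pd.Timestamp(year=2020, month=1, day=1)),
        (DATASETS.acs.date_column_name, "<=", pd.Timestamp(year=2020, month=12, day=31)),
        (DATASETS.acs.state_column_name, "==", "WA"),
    ]
    config_tree = validate_noise_level_proportions(config_tree, DATASETS.acs, user_filters)
    # The configured 0.95 is assumed to exceed the metadata maximum, so a warning fires
    # and the value is clamped down in the max_noise_level layer.
    assert mock_logger.warning.called
    clamped = config_tree[DATASETS.acs.name][Keys.COLUMN_NOISE]["first_name"][
        NOISE_TYPES.use_nickname.name
    ][Keys.CELL_PROBABILITY]
    assert clamped < 0.95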