Skip to content

Commit

Permalink
Mic 4671/user warnings (#353)
Browse files Browse the repository at this point in the history
Mic 4671/user warnings

Updates user warnings when invalid noise levels are provided by users.
- *Category*: Feature
- *JIRA issue*: [MIC-4671](https://jira.ihme.washington.edu/browse/MIC-4671)

- Adds metadata proportions file to validate noise levels against
- Throws a warning to users when they provide a value larger than the maximum noise level calculated from the metadata proportions file
-Updates tests to reflect feature updates

Testing
All tests pass
  • Loading branch information
albrja authored Dec 1, 2023
1 parent c01df9a commit 4170aa7
Show file tree
Hide file tree
Showing 16 changed files with 34,786 additions and 89 deletions.
31 changes: 25 additions & 6 deletions src/pseudopeople/configuration/generator.py
Original file line number Diff line number Diff line change
@@ -1,15 +1,19 @@
from pathlib import Path
from typing import Dict, Optional, Union
from typing import Dict, List, Optional, Tuple, Union

import pandas as pd
import yaml
from vivarium.config_tree import ConfigTree

from pseudopeople.configuration import NO_NOISE, Keys
from pseudopeople.configuration.validator import validate_overrides
from pseudopeople.configuration.validator import (
validate_noise_level_proportions,
validate_overrides,
)
from pseudopeople.constants.data_values import DEFAULT_DO_NOT_RESPOND_ROW_PROBABILITY
from pseudopeople.exceptions import ConfigurationError
from pseudopeople.noise_entities import NOISE_TYPES
from pseudopeople.schema_entities import COLUMNS, DATASETS
from pseudopeople.schema_entities import COLUMNS, DATASETS, Dataset

# Define non-baseline default items
# NOTE: default values are defined in entity_types.RowNoiseType and entity_types.ColumnNoiseType
Expand Down Expand Up @@ -76,7 +80,11 @@
}


def get_configuration(overrides: Optional[Union[Path, str, Dict]] = None) -> ConfigTree:
def get_configuration(
overrides: Optional[Union[Path, str, Dict]] = None,
dataset: Dataset = None,
user_filters: List[Tuple[Union[str, int, pd.Timestamp]]] = None,
) -> ConfigTree:
"""
Gets a noising configuration ConfigTree, optionally overridden by a user-provided YAML.
Expand All @@ -95,7 +103,7 @@ def get_configuration(overrides: Optional[Union[Path, str, Dict]] = None) -> Con
is_no_noise = False
noising_configuration = _generate_configuration(is_no_noise)
if overrides is not None:
add_overrides(noising_configuration, overrides)
add_overrides(noising_configuration, overrides, dataset, user_filters)

return noising_configuration

Expand Down Expand Up @@ -164,10 +172,21 @@ def _generate_configuration(is_no_noise: bool) -> ConfigTree:
return noising_configuration


def add_overrides(noising_configuration: ConfigTree, overrides: Dict) -> None:
def add_overrides(
    noising_configuration: ConfigTree,
    overrides: Dict,
    dataset: Optional[Dataset] = None,
    user_filters: Optional[List[Tuple[Union[str, int, pd.Timestamp]]]] = None,
) -> None:
    """
    Validate and apply user-provided overrides to the noising configuration.

    Parameters
    ----------
    noising_configuration
        Baseline configuration tree; updated in place on the "user" layer.
    overrides
        User-provided configuration overrides (validated before being applied).
    dataset
        Dataset being noised. Must be provided together with ``user_filters``
        to trigger noise level proportion validation.
    user_filters
        Row filters from the generate_XXX functions, used to scope the
        proportion validation.
    """
    validate_overrides(overrides, noising_configuration)
    overrides = _format_overrides(noising_configuration, overrides)
    noising_configuration.update(overrides, layer="user")
    # Note: dataset and user_filters should both be None when using the get_config wrapper
    # or both be inputs from generate_XXX functions.
    if (dataset is not None) and (user_filters is not None):
        # TODO: refactor validate_noise_level_proportions to take overrides as arg and live in validate overrides
        # Note: validate_noise_level_proportions must happen after user layer configuration update
        validate_noise_level_proportions(noising_configuration, dataset, user_filters)


def _format_overrides(default_config: ConfigTree, user_dict: Dict) -> Dict:
Expand Down
100 changes: 82 additions & 18 deletions src/pseudopeople/configuration/validator.py
Original file line number Diff line number Diff line change
@@ -1,14 +1,16 @@
from typing import Callable, Dict, List, Union
from typing import Callable, Dict, List, Tuple, Union

import numpy as np
import pandas as pd
from loguru import logger
from vivarium.config_tree import ConfigTree, ConfigurationKeyError

from pseudopeople.configuration import Keys
from pseudopeople.constants import metadata
from pseudopeople.constants import metadata, paths
from pseudopeople.exceptions import ConfigurationError
from pseudopeople.noise_entities import NOISE_TYPES
from pseudopeople.noise_scaling import get_options_for_column
from pseudopeople.schema_entities import Dataset


def validate_overrides(overrides: Dict, default_config: ConfigTree) -> None:
Expand Down Expand Up @@ -88,9 +90,6 @@ def validate_overrides(overrides: Dict, default_config: ConfigTree) -> None:
default_column_config, noise_type, "noise type", dataset, column
)
parameter_config_validator_map = {
NOISE_TYPES.use_nickname.name: {
Keys.CELL_PROBABILITY: _validate_nickname_probability
},
NOISE_TYPES.choose_wrong_option.name: {
Keys.CELL_PROBABILITY: lambda *args, **kwargs: _validate_choose_wrong_option_probability(
*args, **kwargs, column=column
Expand Down Expand Up @@ -241,19 +240,6 @@ def _validate_probability(
)


def _validate_nickname_probability(
noise_type_config: Union[int, float], parameter: str, base_error_message: str
) -> None:
_validate_probability(noise_type_config, parameter, base_error_message)
if noise_type_config > metadata.NICKNAMES_PROPORTION:
logger.warning(
base_error_message
+ f"The configured '{parameter}' is {noise_type_config}, but only approximately "
f"{metadata.NICKNAMES_PROPORTION:.2%} of names have a nickname. "
f"Replacing as many names with nicknames as possible."
)


def _validate_choose_wrong_option_probability(
noise_type_config: Union[int, float], parameter: str, base_error_message: str, column: str
):
Expand All @@ -272,6 +258,84 @@ def _validate_choose_wrong_option_probability(
)


def validate_noise_level_proportions(
    configuration_tree: ConfigTree, dataset: Dataset, user_filters: List[Tuple]
) -> None:
    """
    Validates that the noise levels provided do not exceed the allowable proportions from the
    metadata proportions file. If a configured noise level is higher than the maximum
    proportion observed in the data, warn the user; noising will then apply to as many rows
    as possible.

    Parameters
    ----------
    configuration_tree
        Fully-built noising configuration (user layer already applied).
    dataset
        The dataset being noised; selects the relevant metadata rows.
    user_filters
        Filter tuples of the form (column_name, operator, value); used to infer
        the queried state and year.
    """
    # TODO: update file and filepath
    metadata_proportions = pd.read_csv(paths.METADATA_PROPORTIONS)
    dataset_proportions = metadata_proportions.loc[
        metadata_proportions["dataset"] == dataset.name
    ]
    # Set default values for state and year
    if dataset.name == metadata.DatasetNames.SSA:
        state = "USA"
    else:
        # Note: This is a shortcoming of our current approach to user warnings and will be fixed
        # with a future release/our next data upload. We do not have a way to get state in a
        # case where the user is not filtering on state because they either are using the sample
        # data or state (Rhode Island) data.
        unique_states = dataset_proportions["state"].unique()
        state = unique_states[0] if len(unique_states) == 1 else "USA"
    year = metadata.YEAR_AGGREGATION_VALUE
    # Get the state and year from the user filters
    for user_filter in user_filters:
        if user_filter[0] == dataset.state_column_name:
            state = user_filter[2]
            break
    for user_filter in user_filters:
        if user_filter[0] == dataset.date_column_name:
            filter_value = user_filter[2]
            # Date filters may be a Timestamp or a bare year value
            year = filter_value.year if isinstance(filter_value, pd.Timestamp) else filter_value
            break

    # Weight SSA since year filter is queried and all preceding years
    if dataset.name == metadata.DatasetNames.SSA:
        ssa_proportions = dataset_proportions.loc[
            (dataset_proportions["state"] == state) & (dataset_proportions["year"] <= year)
        ]
        # Fix: aggregate the state/year-filtered rows. Previously the unfiltered
        # dataset_proportions frame was grouped here, silently discarding the
        # filter computed on the line above.
        dataset_noise_proportions = ssa_proportions.groupby(["column", "noise_type"]).apply(
            lambda x: ((x.proportion * x.number_of_rows).sum()) / x.number_of_rows.sum()
        )
        dataset_noise_proportions = dataset_noise_proportions.rename(
            "proportion"
        ).reset_index()
        dataset_noise_proportions["dataset"] = metadata.DatasetNames.SSA
    else:
        dataset_noise_proportions = dataset_proportions.loc[
            (dataset_proportions["state"] == state) & (dataset_proportions["year"] == year)
        ]

    # If there is no data for a queried dataset, we want users to hit the correct error that
    # there is no data available, so we do not throw an error here.
    if dataset_noise_proportions.empty:
        return
    # Go through each row in the queried dataset noise proportions to validate the noise levels
    dataset_column_names = {col.name for col in dataset.columns}  # build once, O(1) lookups
    for _, row in dataset_noise_proportions.iterrows():
        if row["column"] not in dataset_column_names:
            continue
        max_noise_level = row["proportion"]
        config_noise_level = configuration_tree[row["dataset"]][Keys.COLUMN_NOISE][
            row["column"]
        ][row["noise_type"]][Keys.CELL_PROBABILITY]
        if config_noise_level > max_noise_level:
            logger.warning(
                f"The configured '{row['noise_type']}' noise level for column '{row['column']}' is {config_noise_level}, "
                f"which is higher than the maximum possible value based on the provided data for '{row['dataset']}'. "
                "Noising as many rows as possible. "
            )


DEFAULT_PARAMETER_CONFIG_VALIDATOR_MAP = {
Keys.POSSIBLE_AGE_DIFFERENCES: _validate_possible_age_differences,
Keys.ZIPCODE_DIGIT_PROBABILITIES: _validate_zipcode_digit_probabilities,
Expand Down
2 changes: 1 addition & 1 deletion src/pseudopeople/constants/metadata.py
Original file line number Diff line number Diff line change
Expand Up @@ -98,4 +98,4 @@ class __DateFormats:

INT_COLUMNS = ["age", "wages", "mailing_address_po_box"]

DATA_VERSION = "2.0.0"
YEAR_AGGREGATION_VALUE = 3000 # value for all years in a dataset for metadata proportions
2 changes: 2 additions & 0 deletions src/pseudopeople/constants/paths.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,3 +12,5 @@
PHONETIC_ERRORS_DATA = DATA_ROOT / "phonetic_variations.csv"

SAMPLE_DATA_ROOT = DATA_ROOT / "sample_datasets"

METADATA_PROPORTIONS = SAMPLE_DATA_ROOT / "metadata_proportions.csv"
Binary file not shown.
Binary file not shown.
Binary file not shown.
1 change: 0 additions & 1 deletion src/pseudopeople/data/sample_datasets/metadata.yaml

This file was deleted.

Loading

0 comments on commit 4170aa7

Please sign in to comment.