Skip to content

Commit

Permalink
Mic 4671/user warnings (#353)
Browse files Browse the repository at this point in the history
Mic 4671/user warnings

Updates user warnings when invalid noise levels are provided by users.
- *Category*: Feature
- *JIRA issue*: [MIC-4671](https://jira.ihme.washington.edu/browse/MIC-4671)

- Adds metadata proportions file to validate noise levels against
- Throws a warning to users when they provide a value larger than the maximum noise level calculated from the metadata proportions file
-Updates tests to reflect feature updates

Testing
All tests pass
  • Loading branch information
albrja authored Dec 1, 2023
1 parent c01df9a commit 4170aa7
Show file tree
Hide file tree
Showing 16 changed files with 34,786 additions and 89 deletions.
31 changes: 25 additions & 6 deletions src/pseudopeople/configuration/generator.py
Original file line number Diff line number Diff line change
@@ -1,15 +1,19 @@
from pathlib import Path
from typing import Dict, Optional, Union
from typing import Dict, List, Optional, Tuple, Union

import pandas as pd
import yaml
from vivarium.config_tree import ConfigTree

from pseudopeople.configuration import NO_NOISE, Keys
from pseudopeople.configuration.validator import validate_overrides
from pseudopeople.configuration.validator import (
validate_noise_level_proportions,
validate_overrides,
)
from pseudopeople.constants.data_values import DEFAULT_DO_NOT_RESPOND_ROW_PROBABILITY
from pseudopeople.exceptions import ConfigurationError
from pseudopeople.noise_entities import NOISE_TYPES
from pseudopeople.schema_entities import COLUMNS, DATASETS
from pseudopeople.schema_entities import COLUMNS, DATASETS, Dataset

# Define non-baseline default items
# NOTE: default values are defined in entity_types.RowNoiseType and entity_types.ColumnNoiseType
Expand Down Expand Up @@ -76,7 +80,11 @@
}


def get_configuration(overrides: Optional[Union[Path, str, Dict]] = None) -> ConfigTree:
def get_configuration(
overrides: Optional[Union[Path, str, Dict]] = None,
dataset: Dataset = None,
user_filters: List[Tuple[Union[str, int, pd.Timestamp]]] = None,
) -> ConfigTree:
"""
Gets a noising configuration ConfigTree, optionally overridden by a user-provided YAML.
Expand All @@ -95,7 +103,7 @@ def get_configuration(overrides: Optional[Union[Path, str, Dict]] = None) -> Con
is_no_noise = False
noising_configuration = _generate_configuration(is_no_noise)
if overrides is not None:
add_overrides(noising_configuration, overrides)
add_overrides(noising_configuration, overrides, dataset, user_filters)

return noising_configuration

Expand Down Expand Up @@ -164,10 +172,21 @@ def _generate_configuration(is_no_noise: bool) -> ConfigTree:
return noising_configuration


def add_overrides(noising_configuration: ConfigTree, overrides: Dict) -> None:
def add_overrides(
    noising_configuration: ConfigTree,
    overrides: Dict,
    dataset: Optional[Dataset] = None,
    user_filters: Optional[List[Tuple[Union[str, int, pd.Timestamp]]]] = None,
) -> None:
    """
    Validate and apply user-provided overrides to the noising configuration.

    Parameters
    ----------
    noising_configuration
        Baseline configuration tree; updated in place on the "user" layer.
    overrides
        User-provided configuration overrides (validated before being applied).
    dataset
        Dataset being noised. Must be provided together with ``user_filters``
        to trigger noise level proportion validation.
    user_filters
        Row filters from the generate_XXX functions, used to scope the
        proportion validation.
    """
    validate_overrides(overrides, noising_configuration)
    overrides = _format_overrides(noising_configuration, overrides)
    noising_configuration.update(overrides, layer="user")
    # Note: dataset and user_filters should both be None when using the get_config wrapper
    # or both be inputs from generate_XXX functions.
    if (dataset is not None) and (user_filters is not None):
        # TODO: refactor validate_noise_level_proportions to take overrides as arg and live in validate overrides
        # Note: validate_noise_level_proportions must happen after user layer configuration update
        validate_noise_level_proportions(noising_configuration, dataset, user_filters)


def _format_overrides(default_config: ConfigTree, user_dict: Dict) -> Dict:
Expand Down
100 changes: 82 additions & 18 deletions src/pseudopeople/configuration/validator.py
Original file line number Diff line number Diff line change
@@ -1,14 +1,16 @@
from typing import Callable, Dict, List, Union
from typing import Callable, Dict, List, Tuple, Union

import numpy as np
import pandas as pd
from loguru import logger
from vivarium.config_tree import ConfigTree, ConfigurationKeyError

from pseudopeople.configuration import Keys
from pseudopeople.constants import metadata
from pseudopeople.constants import metadata, paths
from pseudopeople.exceptions import ConfigurationError
from pseudopeople.noise_entities import NOISE_TYPES
from pseudopeople.noise_scaling import get_options_for_column
from pseudopeople.schema_entities import Dataset


def validate_overrides(overrides: Dict, default_config: ConfigTree) -> None:
Expand Down Expand Up @@ -88,9 +90,6 @@ def validate_overrides(overrides: Dict, default_config: ConfigTree) -> None:
default_column_config, noise_type, "noise type", dataset, column
)
parameter_config_validator_map = {
NOISE_TYPES.use_nickname.name: {
Keys.CELL_PROBABILITY: _validate_nickname_probability
},
NOISE_TYPES.choose_wrong_option.name: {
Keys.CELL_PROBABILITY: lambda *args, **kwargs: _validate_choose_wrong_option_probability(
*args, **kwargs, column=column
Expand Down Expand Up @@ -241,19 +240,6 @@ def _validate_probability(
)


def _validate_nickname_probability(
noise_type_config: Union[int, float], parameter: str, base_error_message: str
) -> None:
_validate_probability(noise_type_config, parameter, base_error_message)
if noise_type_config > metadata.NICKNAMES_PROPORTION:
logger.warning(
base_error_message
+ f"The configured '{parameter}' is {noise_type_config}, but only approximately "
f"{metadata.NICKNAMES_PROPORTION:.2%} of names have a nickname. "
f"Replacing as many names with nicknames as possible."
)


def _validate_choose_wrong_option_probability(
noise_type_config: Union[int, float], parameter: str, base_error_message: str, column: str
):
Expand All @@ -272,6 +258,84 @@ def _validate_choose_wrong_option_probability(
)


def validate_noise_level_proportions(
    configuration_tree: ConfigTree, dataset: Dataset, user_filters: List[Tuple]
) -> None:
    """
    Validates that the noise levels provided do not exceed the allowable proportions from the
    metadata proportions file. If a configured noise level is higher than the maximum
    proportion observed in the data, warn the user; noising will then apply to as many rows
    as possible.

    Parameters
    ----------
    configuration_tree
        Fully-built noising configuration (user layer already applied).
    dataset
        The dataset being noised; selects the relevant metadata rows.
    user_filters
        Filter tuples of the form (column_name, operator, value); used to infer
        the queried state and year.
    """
    # TODO: update file and filepath
    metadata_proportions = pd.read_csv(paths.METADATA_PROPORTIONS)
    dataset_proportions = metadata_proportions.loc[
        metadata_proportions["dataset"] == dataset.name
    ]
    # Set default values for state and year
    if dataset.name == metadata.DatasetNames.SSA:
        state = "USA"
    else:
        # Note: This is a shortcoming of our current approach to user warnings and will be fixed
        # with a future release/our next data upload. We do not have a way to get state in a
        # case where the user is not filtering on state because they either are using the sample
        # data or state (Rhode Island) data.
        unique_states = dataset_proportions["state"].unique()
        state = unique_states[0] if len(unique_states) == 1 else "USA"
    year = metadata.YEAR_AGGREGATION_VALUE
    # Get the state and year from the user filters
    for user_filter in user_filters:
        if user_filter[0] == dataset.state_column_name:
            state = user_filter[2]
            break
    for user_filter in user_filters:
        if user_filter[0] == dataset.date_column_name:
            filter_value = user_filter[2]
            # Date filters may be a Timestamp or a bare year value
            year = filter_value.year if isinstance(filter_value, pd.Timestamp) else filter_value
            break

    # Weight SSA since year filter is queried and all preceding years
    if dataset.name == metadata.DatasetNames.SSA:
        ssa_proportions = dataset_proportions.loc[
            (dataset_proportions["state"] == state) & (dataset_proportions["year"] <= year)
        ]
        # Fix: aggregate the state/year-filtered rows. Previously the unfiltered
        # dataset_proportions frame was grouped here, silently discarding the
        # filter computed on the line above.
        dataset_noise_proportions = ssa_proportions.groupby(["column", "noise_type"]).apply(
            lambda x: ((x.proportion * x.number_of_rows).sum()) / x.number_of_rows.sum()
        )
        dataset_noise_proportions = dataset_noise_proportions.rename(
            "proportion"
        ).reset_index()
        dataset_noise_proportions["dataset"] = metadata.DatasetNames.SSA
    else:
        dataset_noise_proportions = dataset_proportions.loc[
            (dataset_proportions["state"] == state) & (dataset_proportions["year"] == year)
        ]

    # If there is no data for a queried dataset, we want users to hit the correct error that
    # there is no data available, so we do not throw an error here.
    if dataset_noise_proportions.empty:
        return
    # Go through each row in the queried dataset noise proportions to validate the noise levels
    dataset_column_names = {col.name for col in dataset.columns}  # build once, O(1) lookups
    for _, row in dataset_noise_proportions.iterrows():
        if row["column"] not in dataset_column_names:
            continue
        max_noise_level = row["proportion"]
        config_noise_level = configuration_tree[row["dataset"]][Keys.COLUMN_NOISE][
            row["column"]
        ][row["noise_type"]][Keys.CELL_PROBABILITY]
        if config_noise_level > max_noise_level:
            logger.warning(
                f"The configured '{row['noise_type']}' noise level for column '{row['column']}' is {config_noise_level}, "
                f"which is higher than the maximum possible value based on the provided data for '{row['dataset']}'. "
                "Noising as many rows as possible. "
            )


DEFAULT_PARAMETER_CONFIG_VALIDATOR_MAP = {
Keys.POSSIBLE_AGE_DIFFERENCES: _validate_possible_age_differences,
Keys.ZIPCODE_DIGIT_PROBABILITIES: _validate_zipcode_digit_probabilities,
Expand Down
2 changes: 1 addition & 1 deletion src/pseudopeople/constants/metadata.py
Original file line number Diff line number Diff line change
Expand Up @@ -98,4 +98,4 @@ class __DateFormats:

INT_COLUMNS = ["age", "wages", "mailing_address_po_box"]

DATA_VERSION = "2.0.0"
YEAR_AGGREGATION_VALUE = 3000 # value for all years in a dataset for metadata proportions
2 changes: 2 additions & 0 deletions src/pseudopeople/constants/paths.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,3 +12,5 @@
PHONETIC_ERRORS_DATA = DATA_ROOT / "phonetic_variations.csv"

SAMPLE_DATA_ROOT = DATA_ROOT / "sample_datasets"

METADATA_PROPORTIONS = SAMPLE_DATA_ROOT / "metadata_proportions.csv"
Binary file not shown.
Binary file not shown.
Binary file not shown.
1 change: 0 additions & 1 deletion src/pseudopeople/data/sample_datasets/metadata.yaml

This file was deleted.

Loading

0 comments on commit 4170aa7

Please sign in to comment.