Add SSA weighted proportion and started tests
albrja committed Nov 23, 2023
1 parent e62cb19 commit f214350
Showing 4 changed files with 114 additions and 35 deletions.
1 change: 1 addition & 0 deletions src/pseudopeople/configuration/generator.py
@@ -98,6 +98,7 @@ def _generate_configuration(is_no_noise: bool) -> ConfigTree:
"baseline",
"default",
"user",
"max_noise_level",
]
noising_configuration = ConfigTree(layers=default_config_layers)
# Instantiate the configuration file with baseline values
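The new `max_noise_level` layer sits on top of the `user` layer, so values the validator writes into it take precedence over user-supplied ones. A minimal, self-contained sketch of that layer-precedence idea (a toy stand-in, not vivarium's actual ConfigTree API):

# Toy illustration of layer precedence; the real code uses vivarium's ConfigTree.
LAYERS = ["baseline", "default", "user", "max_noise_level"]  # later layers win

def resolve(values_by_layer: dict) -> float:
    """Return the value from the highest-precedence layer that set one."""
    for layer in reversed(LAYERS):
        if layer in values_by_layer:
            return values_by_layer[layer]
    raise KeyError("no layer set a value")

# A user requests 0.95, but the validator writes a cap into 'max_noise_level':
print(resolve({"baseline": 0.01, "user": 0.95, "max_noise_level": 0.5}))  # 0.5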
91 changes: 62 additions & 29 deletions src/pseudopeople/configuration/validator.py
@@ -90,9 +90,10 @@ def validate_overrides(overrides: Dict, default_config: ConfigTree) -> None:
default_column_config, noise_type, "noise type", dataset, column
)
parameter_config_validator_map = {
- NOISE_TYPES.use_nickname.name: {
-     Keys.CELL_PROBABILITY: _validate_nickname_probability
- },
+ # Do we want to also override these values or handle them differently?
+ # NOISE_TYPES.use_nickname.name: {
+ #     Keys.CELL_PROBABILITY: _validate_nickname_probability
+ # },
NOISE_TYPES.choose_wrong_option.name: {
Keys.CELL_PROBABILITY: lambda *args, **kwargs: _validate_choose_wrong_option_probability(
*args, **kwargs, column=column
@@ -284,13 +285,15 @@ def validate_noise_level_proportions(
"""

metadata_proportions = pd.read_csv(paths.METADATA_PROPORTIONS)

+ dataset_proportions = metadata_proportions.loc[
+     metadata_proportions["dataset"] == dataset.name
+ ]
# Set default values for state and year
if dataset.name == metadata.DatasetNames.SSA:
state = "USA"
else:
- if len(metadata_proportions["state"].unique()) == 1:
-     state = metadata_proportions["state"].unique()[0]
+ if len(dataset_proportions["state"].unique()) == 1:
+     state = dataset_proportions["state"].unique()[0]
else:
state = "USA"
year = metadata.YEAR_AGGREGATION_VALUE
@@ -307,29 +310,59 @@
break

- # TODO: weight SSA proportions
- dataset_noise_proportions = metadata_proportions.loc[
-     (metadata_proportions["dataset"] == dataset.name)
-     & (metadata_proportions["state"] == state)
-     & (metadata_proportions["year"] == year)
- ]
- l = len(dataset_noise_proportions)
- # Go through each row in the queried dataset noise proportions to validate the noise levels
- for i in range(len(l)):
-     row = dataset_noise_proportions.iloc[i]
-     max_noise_level = row["proportion"]
-     config_noise_level = configuration_tree[row["dataset"]][Keys.COLUMN_NOISE][
-         row["column"]
-     ][row["noise_type"]]
-     if config_noise_level > max_noise_level:
-         logger.warning(
-             f"The configured '{row['noise_type']}' noise level for column '{row['column']}' is {config_noise_level}, "
-             f"but the maximum allowable noise level is {max_noise_level}. "
-             f"The maximum allowable noise level will be used instead of the configured value. "
-             f"This value is based on the provided data for '{row['dataset']}'. "
-         )
-     configuration_tree[row["dataset"]][Keys.COLUMN_NOISE][row["column"]][
-         row["noise_type"]
-     ] = max_noise_level
+ if dataset.name == metadata.DatasetNames.SSA:
+     dataset_noise_proportions = dataset_proportions.loc[
+         (dataset_proportions["state"] == state) & (dataset_proportions["year"] <= year)
+     ]
+     # Weight each year's proportion by its row count when aggregating across years
+     dataset_noise_proportions = dataset_noise_proportions.groupby(
+         ["column", "noise_type"]
+     ).apply(lambda x: (x.proportion * x.number_of_rows).sum() / x.number_of_rows.sum())
+     dataset_noise_proportions = dataset_noise_proportions.rename(
+         "proportion"
+     ).reset_index()
+     dataset_noise_proportions["dataset"] = metadata.DatasetNames.SSA
+ else:
+     dataset_noise_proportions = dataset_proportions.loc[
+         (dataset_proportions["state"] == state) & (dataset_proportions["year"] == year)
+     ]
+
+ # If there is no data for a queried dataset, we want the user to hit the standard
+ # "no data available" error downstream, so we do not raise one here.
+ if dataset_noise_proportions.empty:
+     return configuration_tree
+ else:
+     # Go through each row in the queried dataset noise proportions to validate the noise levels
+     for i in range(len(dataset_noise_proportions)):
+         row = dataset_noise_proportions.iloc[i]
+         if row["column"] not in [col.name for col in dataset.columns]:
+             continue
+         max_noise_level = row["proportion"]
+         config_noise_level = configuration_tree[row["dataset"]][Keys.COLUMN_NOISE][
+             row["column"]
+         ][row["noise_type"]][Keys.CELL_PROBABILITY]
+         if config_noise_level > max_noise_level:
+             logger.warning(
+                 f"The configured '{row['noise_type']}' noise level for column '{row['column']}' is {config_noise_level}, "
+                 f"but the maximum allowable noise level is {max_noise_level}. "
+                 f"The maximum allowable noise level will be used instead of the configured value. "
+                 f"This value is based on the provided data for '{row['dataset']}'. "
+             )
+             # Should we update here in validator or pass values back to interface?
+             configuration_tree.update(
+                 {
+                     row["dataset"]: {
+                         Keys.COLUMN_NOISE: {
+                             row["column"]: {
+                                 row["noise_type"]: {
+                                     Keys.CELL_PROBABILITY: max_noise_level
+                                 }
+                             }
+                         }
+                     }
+                 },
+                 layer="max_noise_level",
+             )
+     return configuration_tree


DEFAULT_PARAMETER_CONFIG_VALIDATOR_MAP = {
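The SSA branch above aggregates proportions across years with a row-count-weighted mean, sum(proportion_i * rows_i) / sum(rows_i), per (column, noise_type) group. A standalone sketch with made-up numbers (the column layout is assumed to mirror METADATA_PROPORTIONS):

import pandas as pd

# Hypothetical rows mimicking the metadata_proportions layout used above.
df = pd.DataFrame(
    {
        "column": ["first_name"] * 3,
        "noise_type": ["use_nickname"] * 3,
        "proportion": [0.10, 0.20, 0.40],
        "number_of_rows": [100, 300, 600],
    }
)

# Weighted mean per group: sum(p_i * n_i) / sum(n_i)
weighted = (
    df.groupby(["column", "noise_type"])
    .apply(lambda g: (g.proportion * g.number_of_rows).sum() / g.number_of_rows.sum())
    .rename("proportion")
    .reset_index()
)
print(weighted)  # proportion == (10 + 60 + 240) / 1000 == 0.31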
7 changes: 4 additions & 3 deletions src/pseudopeople/interface.py
@@ -55,10 +55,11 @@ def _generate_dataset(
"""
configure_logging_to_terminal(verbose)
configuration_tree = get_configuration(config)

- # Validate confgiuration noise levels with possible metadata noise level proportions
- # configuration_tree = validate_noise_level_proportions(configuration_tree, dataset, user_filters)
+ # Validate configuration noise levels against possible metadata noise level proportions
+ configuration_tree = validate_noise_level_proportions(
+     configuration_tree, dataset, user_filters
+ )

if source is None:
source = paths.SAMPLE_DATA_ROOT
else:
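With the call wired into _generate_dataset, every user-supplied config now passes through validate_noise_level_proportions before noising. A hypothetical end-to-end call (config values illustrative; the cap only applies if 0.95 exceeds the metadata-derived maximum):

import pseudopeople as psp

# Request an aggressive nickname probability; if the metadata-derived maximum
# is lower, the validator caps it and logs a warning before noising runs.
df = psp.generate_american_community_survey(
    config={
        "american_community_survey": {
            "column_noise": {
                "first_name": {"use_nickname": {"cell_probability": 0.95}}
            }
        }
    }
)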
50 changes: 47 additions & 3 deletions tests/unit/test_configuration.py
@@ -1,12 +1,16 @@
import itertools

+ import pandas as pd
import pytest
import yaml

from pseudopeople.configuration import NO_NOISE, Keys, get_configuration
from pseudopeople.configuration.generator import DEFAULT_NOISE_VALUES
from pseudopeople.configuration.interface import get_config
- from pseudopeople.configuration.validator import ConfigurationError
+ from pseudopeople.configuration.validator import (
+     ConfigurationError,
+     validate_noise_level_proportions,
+ )
from pseudopeople.entity_types import ColumnNoiseType, RowNoiseType
from pseudopeople.noise_entities import NOISE_TYPES
from pseudopeople.schema_entities import COLUMNS, DATASETS
@@ -28,7 +32,7 @@ def test_get_default_configuration(mocker):
"""Tests that the default configuration can be retrieved."""
mock = mocker.patch("pseudopeople.configuration.generator.ConfigTree")
_ = get_configuration()
mock.assert_called_once_with(layers=["baseline", "default", "user"])
mock.assert_called_once_with(layers=["baseline", "default", "user", "max_noise_level"])


def test_default_configuration_structure():
@@ -122,7 +126,7 @@ def test_get_configuration_with_user_override(mocker):
}
}
_ = get_configuration(config)
mock.assert_called_once_with(layers=["baseline", "default", "user"])
mock.assert_called_once_with(layers=["baseline", "default", "user", "max_noise_level"])
update_calls = [
call
for call in mock.mock_calls
@@ -557,3 +561,43 @@ def test_no_noise():
column_noise_dict = dataset_column_dict[column]
for column_noise_type in column_noise_dict.keys():
assert column_noise_dict[column_noise_type][Keys.CELL_PROBABILITY] == 0.0


+ def test_get_configuration_with_max_level_overrides(mocker):
+     """Tests that the default configuration gets updated when a user provides a value
+     for specific noise types and we override it with the maximum allowable value in
+     the max level override layer."""
+     mock = mocker.patch("pseudopeople.configuration.generator.ConfigTree")
+     config = {
+         DATASETS.acs.name: {
+             Keys.COLUMN_NOISE: {
+                 "first_name": {NOISE_TYPES.use_nickname.name: {Keys.CELL_PROBABILITY: 0.95}}
+             },
+         }
+     }
+     user_filters = [
+         (
+             DATASETS.acs.date_column_name,
+             ">=",
+             pd.Timestamp(year=2020, month=1, day=1),
+         ),
+         (
+             DATASETS.acs.date_column_name,
+             "<=",
+             pd.Timestamp(year=2020, month=12, day=31),
+         ),
+         (DATASETS.acs.state_column_name, "==", "WA"),
+     ]
+     # TODO: need to get the full configuration and pass it instead
+     _ = validate_noise_level_proportions(config, DATASETS.acs, user_filters)
+     mock.assert_called_once_with(layers=["baseline", "default", "user", "max_noise_level"])
+     update_calls = [
+         call
+         for call in mock.mock_calls
+         if ".update({" in str(call) and "layer='max_noise_level'" in str(call)
+     ]
+     assert len(update_calls) == 1


+ # TODO: add a test that a logger warning is thrown if the user provides a value that
+ # is too high and it gets adjusted to the max level (see the sketch below)
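One possible shape for that test, assuming the validator's loguru logger can be patched with pytest-mock (filter values illustrative):

def test_max_noise_level_warning(mocker):
    # Sketch only: patches the module-level logger assumed to live at
    # pseudopeople.configuration.validator.logger.
    mock_warning = mocker.patch("pseudopeople.configuration.validator.logger.warning")
    config = get_configuration(
        {
            DATASETS.acs.name: {
                Keys.COLUMN_NOISE: {
                    "first_name": {
                        NOISE_TYPES.use_nickname.name: {Keys.CELL_PROBABILITY: 0.95}
                    }
                },
            }
        }
    )
    user_filters = [(DATASETS.acs.state_column_name, "==", "WA")]
    _ = validate_noise_level_proportions(config, DATASETS.acs, user_filters)
    # Expect a warning (and a capped level) when 0.95 exceeds the metadata maximum.
    assert mock_warning.called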
