From f214350b838803c2d6bbe0fed7c586fb1e0ddd70 Mon Sep 17 00:00:00 2001
From: albrja
Date: Wed, 22 Nov 2023 16:38:39 -0800
Subject: [PATCH] Add SSA weighted proportion and started tests

---
 src/pseudopeople/configuration/generator.py |  1 +
 src/pseudopeople/configuration/validator.py | 91 ++++++++++++++-------
 src/pseudopeople/interface.py               |  7 +-
 tests/unit/test_configuration.py            | 50 ++++++++++-
 4 files changed, 114 insertions(+), 35 deletions(-)

diff --git a/src/pseudopeople/configuration/generator.py b/src/pseudopeople/configuration/generator.py
index fca2992f..da5284a3 100644
--- a/src/pseudopeople/configuration/generator.py
+++ b/src/pseudopeople/configuration/generator.py
@@ -98,6 +98,7 @@ def _generate_configuration(is_no_noise: bool) -> ConfigTree:
         "baseline",
         "default",
         "user",
+        "max_noise_level",
     ]
     noising_configuration = ConfigTree(layers=default_config_layers)
     # Instantiate the configuration file with baseline values
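The new "max_noise_level" layer is appended after "user", and the clamping below relies on later layers taking precedence (as the baseline -> default -> user ordering suggests). A minimal sketch of that precedence; the import path and the toy key name are assumptions and should be matched to whatever generator.py actually uses:

from vivarium.config_tree import ConfigTree  # import path assumed; mirror generator.py

config = ConfigTree(layers=["baseline", "default", "user", "max_noise_level"])
config.update({"cell_probability": 0.01}, layer="baseline")
config.update({"cell_probability": 0.95}, layer="user")  # user asks for more noise than the data supports
config.update({"cell_probability": 0.30}, layer="max_noise_level")  # validator clamps it
print(config["cell_probability"])  # 0.3 -- the max_noise_level layer wins over the user layer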
" - ) - configuration_tree[row["dataset"]][Keys.COLUMN_NOISE][row["column"]][ - row["noise_type"] - ] = max_noise_level + if dataset.name == metadata.DatasetNames.SSA: + dataset_noise_proportions = dataset_proportions.loc[ + (dataset_proportions["state"] == state) & (dataset_proportions["year"] <= year) + ] + dataset_noise_proportions = dataset_proportions.groupby( + ["column", "noise_type"] + ).apply(lambda x: ((x.proportion * x.number_of_rows).sum()) / x.number_of_rows.sum()) + dataset_noise_proportions = dataset_noise_proportions.rename( + "proportion" + ).reset_index() + dataset_noise_proportions["dataset"] = metadata.DatasetNames.SSA + else: + dataset_noise_proportions = dataset_proportions.loc[ + (dataset_proportions["state"] == state) & (dataset_proportions["year"] == year) + ] + + # If there is no data for a queried dataset, we want the user's to hit the correct error that there + # is no data available so we do not throw an error here. + if dataset_noise_proportions.empty: + return configuration_tree + else: + # Go through each row in the queried dataset noise proportions to validate the noise levels + for i in range(len(dataset_noise_proportions)): + row = dataset_noise_proportions.iloc[i] + if row["column"] not in [col.name for col in dataset.columns]: + continue + max_noise_level = row["proportion"] + config_noise_level = configuration_tree[row["dataset"]][Keys.COLUMN_NOISE][ + row["column"] + ][row["noise_type"]][Keys.CELL_PROBABILITY] + if config_noise_level > max_noise_level: + logger.warning( + f"The configured '{row['noise_type']}' noise level for column '{row['column']}' is {config_noise_level}, " + f"but the maximum allowable noise level is {max_noise_level}. " + f"The maximum allowable noise level will be used instead of the configured value. " + f"This value is based on the provided data for '{row['dataset']}'. " + ) + # Should we update here in validator or pass values back to interface? 
+ configuration_tree.update( + { + row["dataset"]: { + Keys.COLUMN_NOISE: { + row["column"]: { + row["noise_type"]: { + Keys.CELL_PROBABILITY: max_noise_level + } + } + } + } + }, + layer="max_noise_level", + ) + return configuration_tree DEFAULT_PARAMETER_CONFIG_VALIDATOR_MAP = { diff --git a/src/pseudopeople/interface.py b/src/pseudopeople/interface.py index 85079473..48bfddd1 100644 --- a/src/pseudopeople/interface.py +++ b/src/pseudopeople/interface.py @@ -55,10 +55,11 @@ def _generate_dataset( """ configure_logging_to_terminal(verbose) configuration_tree = get_configuration(config) - breakpoint() - # Validate confgiuration noise levels with possible metadata noise level proportions - # configuration_tree = validate_noise_level_proportions(configuration_tree, dataset, user_filters) + configuration_tree = validate_noise_level_proportions( + configuration_tree, dataset, user_filters + ) + if source is None: source = paths.SAMPLE_DATA_ROOT else: diff --git a/tests/unit/test_configuration.py b/tests/unit/test_configuration.py index 0c7644e8..89b5b974 100644 --- a/tests/unit/test_configuration.py +++ b/tests/unit/test_configuration.py @@ -1,12 +1,16 @@ import itertools +import pandas as pd import pytest import yaml from pseudopeople.configuration import NO_NOISE, Keys, get_configuration from pseudopeople.configuration.generator import DEFAULT_NOISE_VALUES from pseudopeople.configuration.interface import get_config -from pseudopeople.configuration.validator import ConfigurationError +from pseudopeople.configuration.validator import ( + ConfigurationError, + validate_noise_level_proportions, +) from pseudopeople.entity_types import ColumnNoiseType, RowNoiseType from pseudopeople.noise_entities import NOISE_TYPES from pseudopeople.schema_entities import COLUMNS, DATASETS @@ -28,7 +32,7 @@ def test_get_default_configuration(mocker): """Tests that the default configuration can be retrieved.""" mock = mocker.patch("pseudopeople.configuration.generator.ConfigTree") _ = get_configuration() - mock.assert_called_once_with(layers=["baseline", "default", "user"]) + mock.assert_called_once_with(layers=["baseline", "default", "user", "max_noise_level"]) def test_default_configuration_structure(): @@ -122,7 +126,7 @@ def test_get_configuration_with_user_override(mocker): } } _ = get_configuration(config) - mock.assert_called_once_with(layers=["baseline", "default", "user"]) + mock.assert_called_once_with(layers=["baseline", "default", "user", "max_noise_level"]) update_calls = [ call for call in mock.mock_calls @@ -557,3 +561,43 @@ def test_no_noise(): column_noise_dict = dataset_column_dict[column] for column_noise_type in column_noise_dict.keys(): assert column_noise_dict[column_noise_type][Keys.CELL_PROBABILITY] == 0.0 + + +def test_get_configuration_with_max_level_overrides(mocker): + """Tests that the default configuration get updated when a user provides a value + for specific noise types and we update with the maximum allowable value in the + max level override layer.""" + mock = mocker.patch("pseudopeople.configuration.generator.ConfigTree") + config = { + DATASETS.acs.name: { + Keys.COLUMN_NOISE: { + "first_name": {NOISE_TYPES.use_nickname.name: {Keys.CELL_PROBABILITY: 0.95}} + }, + } + } + user_filters = [ + ( + DATASETS.acs.date_column_name, + ">=", + pd.Timestamp(year=2020, month=1, day=1), + ), + ( + DATASETS.acs.date_column_name, + "<=", + pd.Timestamp(year=2020, month=12, day=31), + ), + (DATASETS.acs.state_column_name, "==", "WA"), + ] + # TODO: need to get full configuratio nand pass it 
+    _ = validate_noise_level_proportions(config, DATASETS.acs, user_filters)
+    mock.assert_called_once_with(layers=["baseline", "default", "user", "max_noise_level"])
+    update_calls = [
+        call
+        for call in mock.mock_calls
+        if ".update({" in str(call) and "layer='max_noise_level'" in str(call)
+    ]
+    assert len(update_calls) == 1
+
+
+# TODO: add test that a logger warning is thrown if the user provides a value that is too high
+# and it gets adjusted to the max level
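A possible shape for that last TODO, sketched as a starting point rather than a finished test. It reuses the imports already present in test_configuration.py, and it assumes the bundled metadata proportions cap the first_name/use_nickname combination below the configured 0.95 for these filters, and that patching the loguru logger used in validator.py is an acceptable way to observe the warning:

def test_noise_level_proportions_warns_and_clamps(mocker):
    # Patch the module-level loguru logger so the warning call can be observed.
    mock_logger = mocker.patch("pseudopeople.configuration.validator.logger")
    config_tree = get_configuration(
        {
            DATASETS.acs.name: {
                Keys.COLUMN_NOISE: {
                    "first_name": {
                        NOISE_TYPES.use_nickname.name: {Keys.CELL_PROBABILITY: 0.95}
                    }
                },
            }
        }
    )
    user_filters = [
        (DATASETS.acs.date_column_name, ">=", pd.Timestamp(year=2020, month=1, day=1)),
        (DATASETS.acs.date_column_name, "<=", pd.Timestamp(year=2020, month=12, day=31)),
        (DATASETS.acs.state_column_name, "==", "WA"),
    ]
    config_tree = validate_noise_level_proportions(config_tree, DATASETS.acs, user_filters)
    # The configured 0.95 is assumed to exceed the metadata maximum, so a warning fires
    # and the value is clamped down in the max_noise_level layer.
    assert mock_logger.warning.called
    clamped = config_tree[DATASETS.acs.name][Keys.COLUMN_NOISE]["first_name"][
        NOISE_TYPES.use_nickname.name
    ][Keys.CELL_PROBABILITY]
    assert clamped < 0.95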