From 1bb522ee9815aa34fd190010539ae11ea158d1e4 Mon Sep 17 00:00:00 2001 From: albrja Date: Mon, 20 Nov 2023 16:40:20 -0800 Subject: [PATCH 1/8] Parse user filters and validate noise levels --- src/pseudopeople/interface.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/pseudopeople/interface.py b/src/pseudopeople/interface.py index 12cdb87d..1c5f487d 100644 --- a/src/pseudopeople/interface.py +++ b/src/pseudopeople/interface.py @@ -52,6 +52,8 @@ def _generate_dataset( configure_logging_to_terminal(verbose) configuration_tree = get_configuration(config, dataset, user_filters) + # Validate confgiuration noise levels with possible metadata noise level proportions + # configuration_tree = validate_noise_level_proportions(configuration_tree, dataset, user_filters) if source is None: source = paths.SAMPLE_DATA_ROOT else: From 22a42d9f4c9daee31f1eecda99a1121efb8da4d2 Mon Sep 17 00:00:00 2001 From: albrja Date: Wed, 22 Nov 2023 16:38:39 -0800 Subject: [PATCH 2/8] Add SSA weighted proportion and started tests --- src/pseudopeople/configuration/generator.py | 1 + src/pseudopeople/interface.py | 5 ++++- tests/unit/test_configuration.py | 4 ++-- 3 files changed, 7 insertions(+), 3 deletions(-) diff --git a/src/pseudopeople/configuration/generator.py b/src/pseudopeople/configuration/generator.py index d00c34e0..f6f8d198 100644 --- a/src/pseudopeople/configuration/generator.py +++ b/src/pseudopeople/configuration/generator.py @@ -113,6 +113,7 @@ def _generate_configuration(is_no_noise: bool) -> ConfigTree: "baseline", "default", "user", + "max_noise_level", ] noising_configuration = ConfigTree(layers=default_config_layers) # Instantiate the configuration file with baseline values diff --git a/src/pseudopeople/interface.py b/src/pseudopeople/interface.py index 1c5f487d..528ee863 100644 --- a/src/pseudopeople/interface.py +++ b/src/pseudopeople/interface.py @@ -53,7 +53,10 @@ def _generate_dataset( configuration_tree = get_configuration(config, dataset, user_filters) # Validate confgiuration noise levels with possible metadata noise level proportions - # configuration_tree = validate_noise_level_proportions(configuration_tree, dataset, user_filters) + configuration_tree = validate_noise_level_proportions( + configuration_tree, dataset, user_filters + ) + if source is None: source = paths.SAMPLE_DATA_ROOT else: diff --git a/tests/unit/test_configuration.py b/tests/unit/test_configuration.py index f620dc31..5b394965 100644 --- a/tests/unit/test_configuration.py +++ b/tests/unit/test_configuration.py @@ -32,7 +32,7 @@ def test_get_default_configuration(mocker): """Tests that the default configuration can be retrieved.""" mock = mocker.patch("pseudopeople.configuration.generator.ConfigTree") _ = get_configuration() - mock.assert_called_once_with(layers=["baseline", "default", "user"]) + mock.assert_called_once_with(layers=["baseline", "default", "user", "max_noise_level"]) def test_default_configuration_structure(): @@ -126,7 +126,7 @@ def test_get_configuration_with_user_override(mocker): } } _ = get_configuration(config) - mock.assert_called_once_with(layers=["baseline", "default", "user"]) + mock.assert_called_once_with(layers=["baseline", "default", "user", "max_noise_level"]) update_calls = [ call for call in mock.mock_calls From 003db1661aaf3cf6aef63ac0e40ff8262482e73a Mon Sep 17 00:00:00 2001 From: albrja Date: Mon, 27 Nov 2023 16:55:31 -0800 Subject: [PATCH 3/8] Update after talking to RT. TODO fix tests --- src/pseudopeople/configuration/generator.py | 1 - src/pseudopeople/interface.py | 4 +--- tests/unit/test_configuration.py | 4 ++-- 3 files changed, 3 insertions(+), 6 deletions(-) diff --git a/src/pseudopeople/configuration/generator.py b/src/pseudopeople/configuration/generator.py index f6f8d198..d00c34e0 100644 --- a/src/pseudopeople/configuration/generator.py +++ b/src/pseudopeople/configuration/generator.py @@ -113,7 +113,6 @@ def _generate_configuration(is_no_noise: bool) -> ConfigTree: "baseline", "default", "user", - "max_noise_level", ] noising_configuration = ConfigTree(layers=default_config_layers) # Instantiate the configuration file with baseline values diff --git a/src/pseudopeople/interface.py b/src/pseudopeople/interface.py index 528ee863..8e3baca2 100644 --- a/src/pseudopeople/interface.py +++ b/src/pseudopeople/interface.py @@ -53,9 +53,7 @@ def _generate_dataset( configuration_tree = get_configuration(config, dataset, user_filters) # Validate confgiuration noise levels with possible metadata noise level proportions - configuration_tree = validate_noise_level_proportions( - configuration_tree, dataset, user_filters - ) + validate_noise_level_proportions(configuration_tree, dataset, user_filters) if source is None: source = paths.SAMPLE_DATA_ROOT diff --git a/tests/unit/test_configuration.py b/tests/unit/test_configuration.py index 5b394965..f620dc31 100644 --- a/tests/unit/test_configuration.py +++ b/tests/unit/test_configuration.py @@ -32,7 +32,7 @@ def test_get_default_configuration(mocker): """Tests that the default configuration can be retrieved.""" mock = mocker.patch("pseudopeople.configuration.generator.ConfigTree") _ = get_configuration() - mock.assert_called_once_with(layers=["baseline", "default", "user", "max_noise_level"]) + mock.assert_called_once_with(layers=["baseline", "default", "user"]) def test_default_configuration_structure(): @@ -126,7 +126,7 @@ def test_get_configuration_with_user_override(mocker): } } _ = get_configuration(config) - mock.assert_called_once_with(layers=["baseline", "default", "user", "max_noise_level"]) + mock.assert_called_once_with(layers=["baseline", "default", "user"]) update_calls = [ call for call in mock.mock_calls From 0279e2126a85df9cab202ff1eeb6a2e528181194 Mon Sep 17 00:00:00 2001 From: albrja Date: Tue, 28 Nov 2023 13:41:37 -0800 Subject: [PATCH 4/8] Update test for proportions and remove metadata yaml --- tests/unit/test_configuration.py | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/unit/test_configuration.py b/tests/unit/test_configuration.py index f620dc31..275dc63e 100644 --- a/tests/unit/test_configuration.py +++ b/tests/unit/test_configuration.py @@ -539,7 +539,6 @@ def test_validate_noise_level_proportions(caplog, column, noise_type, noise_leve Tests that a warning is thrown when a user provides configuration overrides that are higher than the calculated metadata proportions for that column noise type pairing. """ - census = DATASETS.get_dataset("decennial_census") user_filters = [ (census.date_column_name, "==", 2020), From ee684595479f499b6d700f867beb6728d8216dbf Mon Sep 17 00:00:00 2001 From: albrja Date: Wed, 29 Nov 2023 15:14:46 -0800 Subject: [PATCH 5/8] Coerce empty strings to np.nan for mailing address street cols --- src/pseudopeople/constants/metadata.py | 5 +++++ src/pseudopeople/interface.py | 21 ++++++++++++++++----- 2 files changed, 21 insertions(+), 5 deletions(-) diff --git a/src/pseudopeople/constants/metadata.py b/src/pseudopeople/constants/metadata.py index 54eb61bd..b1aa273d 100644 --- a/src/pseudopeople/constants/metadata.py +++ b/src/pseudopeople/constants/metadata.py @@ -97,5 +97,10 @@ class __DateFormats: INT_COLUMNS = ["age", "wages", "mailing_address_po_box"] +MAILING_ADDRESS_STREET_COLUMNS = [ + "mailing_address_street_name", + "mailing_address_street_number", + "mailing_address_unit_number", +] YEAR_AGGREGATION_VALUE = 3000 # value for all years in a dataset for metadata proportions diff --git a/src/pseudopeople/interface.py b/src/pseudopeople/interface.py index 8e3baca2..e3588d0a 100644 --- a/src/pseudopeople/interface.py +++ b/src/pseudopeople/interface.py @@ -1,17 +1,20 @@ from pathlib import Path from typing import Dict, List, Optional, Tuple, Union +import numpy as np import pandas as pd -import yaml from loguru import logger from packaging.version import parse from tqdm import tqdm from pseudopeople import __version__ as psp_version from pseudopeople.configuration import get_configuration -from pseudopeople.configuration.validator import validate_noise_level_proportions from pseudopeople.constants import paths -from pseudopeople.constants.metadata import COPY_HOUSEHOLD_MEMBER_COLS, INT_COLUMNS +from pseudopeople.constants.metadata import ( + COPY_HOUSEHOLD_MEMBER_COLS, + INT_COLUMNS, + MAILING_ADDRESS_STREET_COLUMNS, +) from pseudopeople.exceptions import DataSourceError from pseudopeople.loader import load_standard_dataset_file from pseudopeople.noise import noise_dataset @@ -99,7 +102,9 @@ def _generate_dataset( # Known pandas bug: pd.concat does not preserve category dtypes so we coerce # again after concat (https://github.com/pandas-dev/pandas/issues/51362) - noised_dataset = _coerce_dtypes(noised_dataset, dataset, cleanse_int_cols=True) + noised_dataset = _coerce_dtypes( + noised_dataset, dataset, cleanse_int_cols=True, cleanse_address_cols=True + ) logger.debug("*** Finished ***") @@ -139,13 +144,19 @@ def _get_data_changelog_version(changelog): def _coerce_dtypes( - data: pd.DataFrame, dataset: Dataset, cleanse_int_cols: bool = False + data: pd.DataFrame, + dataset: Dataset, + cleanse_int_cols: bool = False, + cleanse_address_cols: bool = False, ) -> pd.DataFrame: # Coerce dtypes prior to noising to catch issues early as well as # get most columns away from dtype 'category' and into 'object' (strings) for col in dataset.columns: if cleanse_int_cols and col.name in INT_COLUMNS: data[col.name] = cleanse_integer_columns(data[col.name]) + # Coerce emtpy strings to NaNs for mailing address columns that have PO boxes + if cleanse_address_cols and col.name in MAILING_ADDRESS_STREET_COLUMNS: + data[col.name] = data[col.name].replace("", np.nan) if col.dtype_name != data[col.name].dtype.name: data[col.name] = data[col.name].astype(col.dtype_name) From 009f95dfb29dffb480928688e707e8827ba63ca1 Mon Sep 17 00:00:00 2001 From: albrja Date: Thu, 30 Nov 2023 17:01:57 -0800 Subject: [PATCH 6/8] Lint --- src/pseudopeople/interface.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/pseudopeople/interface.py b/src/pseudopeople/interface.py index e3588d0a..f5461d38 100644 --- a/src/pseudopeople/interface.py +++ b/src/pseudopeople/interface.py @@ -55,9 +55,6 @@ def _generate_dataset( configure_logging_to_terminal(verbose) configuration_tree = get_configuration(config, dataset, user_filters) - # Validate confgiuration noise levels with possible metadata noise level proportions - validate_noise_level_proportions(configuration_tree, dataset, user_filters) - if source is None: source = paths.SAMPLE_DATA_ROOT else: @@ -103,7 +100,10 @@ def _generate_dataset( # Known pandas bug: pd.concat does not preserve category dtypes so we coerce # again after concat (https://github.com/pandas-dev/pandas/issues/51362) noised_dataset = _coerce_dtypes( - noised_dataset, dataset, cleanse_int_cols=True, cleanse_address_cols=True + noised_dataset, + dataset, + cleanse_int_cols=True, + cleanse_address_cols=True, ) logger.debug("*** Finished ***") From 6f69c73683a08be101c73173543a189d015616a8 Mon Sep 17 00:00:00 2001 From: albrja Date: Fri, 1 Dec 2023 11:40:21 -0800 Subject: [PATCH 7/8] Reduce code and coerce all empty strings not in int columns --- src/pseudopeople/constants/metadata.py | 5 ----- src/pseudopeople/interface.py | 10 ++-------- 2 files changed, 2 insertions(+), 13 deletions(-) diff --git a/src/pseudopeople/constants/metadata.py b/src/pseudopeople/constants/metadata.py index b1aa273d..54eb61bd 100644 --- a/src/pseudopeople/constants/metadata.py +++ b/src/pseudopeople/constants/metadata.py @@ -97,10 +97,5 @@ class __DateFormats: INT_COLUMNS = ["age", "wages", "mailing_address_po_box"] -MAILING_ADDRESS_STREET_COLUMNS = [ - "mailing_address_street_name", - "mailing_address_street_number", - "mailing_address_unit_number", -] YEAR_AGGREGATION_VALUE = 3000 # value for all years in a dataset for metadata proportions diff --git a/src/pseudopeople/interface.py b/src/pseudopeople/interface.py index f5461d38..35ed5123 100644 --- a/src/pseudopeople/interface.py +++ b/src/pseudopeople/interface.py @@ -10,11 +10,7 @@ from pseudopeople import __version__ as psp_version from pseudopeople.configuration import get_configuration from pseudopeople.constants import paths -from pseudopeople.constants.metadata import ( - COPY_HOUSEHOLD_MEMBER_COLS, - INT_COLUMNS, - MAILING_ADDRESS_STREET_COLUMNS, -) +from pseudopeople.constants.metadata import COPY_HOUSEHOLD_MEMBER_COLS, INT_COLUMNS from pseudopeople.exceptions import DataSourceError from pseudopeople.loader import load_standard_dataset_file from pseudopeople.noise import noise_dataset @@ -103,7 +99,6 @@ def _generate_dataset( noised_dataset, dataset, cleanse_int_cols=True, - cleanse_address_cols=True, ) logger.debug("*** Finished ***") @@ -147,7 +142,6 @@ def _coerce_dtypes( data: pd.DataFrame, dataset: Dataset, cleanse_int_cols: bool = False, - cleanse_address_cols: bool = False, ) -> pd.DataFrame: # Coerce dtypes prior to noising to catch issues early as well as # get most columns away from dtype 'category' and into 'object' (strings) @@ -155,7 +149,7 @@ def _coerce_dtypes( if cleanse_int_cols and col.name in INT_COLUMNS: data[col.name] = cleanse_integer_columns(data[col.name]) # Coerce emtpy strings to NaNs for mailing address columns that have PO boxes - if cleanse_address_cols and col.name in MAILING_ADDRESS_STREET_COLUMNS: + if cleanse_int_cols and col.name not in INT_COLUMNS: data[col.name] = data[col.name].replace("", np.nan) if col.dtype_name != data[col.name].dtype.name: data[col.name] = data[col.name].astype(col.dtype_name) From 301fcddc75f753307ddc8690399a9775156128d3 Mon Sep 17 00:00:00 2001 From: albrja Date: Fri, 1 Dec 2023 12:43:13 -0800 Subject: [PATCH 8/8] Update comment --- src/pseudopeople/interface.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/pseudopeople/interface.py b/src/pseudopeople/interface.py index 35ed5123..31f1fd26 100644 --- a/src/pseudopeople/interface.py +++ b/src/pseudopeople/interface.py @@ -148,7 +148,7 @@ def _coerce_dtypes( for col in dataset.columns: if cleanse_int_cols and col.name in INT_COLUMNS: data[col.name] = cleanse_integer_columns(data[col.name]) - # Coerce emtpy strings to NaNs for mailing address columns that have PO boxes + # Coerce empty strings to nans if cleanse_int_cols and col.name not in INT_COLUMNS: data[col.name] = data[col.name].replace("", np.nan) if col.dtype_name != data[col.name].dtype.name: