From b05d20d3d7bee6b8729d1f04f5f98dcadb112c57 Mon Sep 17 00:00:00 2001 From: albrja <37345113+albrja@users.noreply.github.com> Date: Mon, 3 Apr 2023 17:30:05 -0700 Subject: [PATCH 1/6] Numeric miswriting (#26) Numeric miswriting noise function Implementation and tests for numeric miswriting noise function - *Category*: Feature - *JIRA issue*: [MIC-3907](https://jira.ihme.washington.edu/browse/MIC-3907) -Adds numeric miswriting noise function -Adds tests for numeric miswriting noise function Testing Test suites pass with no failures --- src/pseudopeople/default_configuration.yaml | 37 +++++++ src/pseudopeople/noise_functions.py | 43 ++++++-- tests/unit/test_column_noise.py | 111 +++++++++++++++++++- 3 files changed, 179 insertions(+), 12 deletions(-) diff --git a/src/pseudopeople/default_configuration.yaml b/src/pseudopeople/default_configuration.yaml index 771ef3d7..0a4cfa0f 100644 --- a/src/pseudopeople/default_configuration.yaml +++ b/src/pseudopeople/default_configuration.yaml @@ -41,6 +41,9 @@ decennial_census: row_noise_level: 0.01 token_noise_level: 0.1 include_original_token_level: 0.1 + numeric_miswriting: + row_noise_level: 0.01 + token_noise_level: 0.1 street_number: missing_data: row_noise_level: 0.01 @@ -48,6 +51,9 @@ decennial_census: row_noise_level: 0.01 token_noise_level: 0.1 include_original_token_level: 0.1 + numeric_miswriting: + row_noise_level: 0.01 + token_noise_level: 0.1 street_name: missing_data: row_noise_level: 0.01 @@ -62,6 +68,9 @@ decennial_census: row_noise_level: 0.01 token_noise_level: 0.1 include_original_token_level: 0.1 + numeric_miswriting: + row_noise_level: 0.01 + token_noise_level: 0.1 city: missing_data: row_noise_level: 0.01 @@ -106,12 +115,18 @@ taxes_w2_and_1099: date_of_birth: missing_data: row_noise_level: 0.01 + numeric_miswriting: + row_noise_level: 0.01 + token_noise_level: 0.1 employer_city: missing_data: row_noise_level: 0.01 employer_id: missing_data: row_noise_level: 0.01 + numeric_miswriting: + row_noise_level: 0.01 + token_noise_level: 0.1 employer_name: missing_data: row_noise_level: 0.01 @@ -126,9 +141,15 @@ taxes_w2_and_1099: employer_street_number: missing_data: row_noise_level: 0.01 + numeric_miswriting: + row_noise_level: 0.01 + token_noise_level: 0.1 employer_unit_number: missing_data: row_noise_level: 0.01 + numeric_miswriting: + row_noise_level: 0.01 + token_noise_level: 0.1 employer_zipcode: missing_data: row_noise_level: 0.01 @@ -138,6 +159,9 @@ taxes_w2_and_1099: income: missing_data: row_noise_level: 0.01 + numeric_miswriting: + row_noise_level: 0.01 + token_noise_level: 0.1 tax_form: missing_data: row_noise_level: 0.01 @@ -160,9 +184,19 @@ taxes_w2_and_1099: mailing_address_street_number: missing_data: row_noise_level: 0.01 + numeric_miswriting: + row_noise_level: 0.01 + token_noise_level: 0.1 mailing_address_unit_number: missing_data: row_noise_level: 0.01 + numeric_miswriting: + row_noise_level: 0.01 + token_noise_level: 0.1 + mailing_address_po_box: + missing_data: + row_noise_level: 0.01 + token_noise_level: 0.1 mailing_address_zipcode: missing_data: row_noise_level: 0.01 @@ -172,3 +206,6 @@ taxes_w2_and_1099: ssn: missing_data: row_noise_level: 0.01 + numeric_miswriting: + row_noise_level: 0.01 + token_noise_level: 0.1 diff --git a/src/pseudopeople/noise_functions.py b/src/pseudopeople/noise_functions.py index ba5ba25f..2a2d6f97 100644 --- a/src/pseudopeople/noise_functions.py +++ b/src/pseudopeople/noise_functions.py @@ -152,21 +152,46 @@ def miswrite_ages( def miswrite_numerics( - form_data: pd.DataFrame, - configuration: float, + column: pd.Series, + configuration: ConfigTree, randomness_stream: RandomnessStream, additional_key: Any, -) -> pd.DataFrame: +) -> pd.Series: """ + Function that noises numeric characters in a series. - :param form_data: - :param configuration: - :param randomness_stream: + :param column: A pd.Series + :param configuration: ConfigTree object containing noise level + :param randomness_stream: RandomnessStream for CRN framework. :param additional_key: Key for RandomnessStream - :return: + + returns: pd.Series with some numeric values experiencing noise. """ - # todo actually duplicate rows - return form_data + + # This is a fix to not replacing the original token for noise options + token_noise_level = configuration.token_noise_level / 0.9 + rng = np.random.default_rng(randomness_stream.seed) + column = column.astype(str) + longest_str = column.str.len().max() + same_len_col = column.str.pad(longest_str, side="right") + is_number = pd.concat( + [same_len_col.str[i].str.isdigit() for i in range(longest_str)], axis=1 + ) + + replace = (rng.random(is_number.shape) < token_noise_level) & is_number + random_digits = rng.choice(list("0123456789"), is_number.shape) + + # Choose and replace values for a noised series + noised_column = pd.Series("", index=column.index) + digits = [] + for i in range(len(is_number.columns)): + digit = np.where(replace.iloc[:, i], random_digits[:, i], same_len_col.str[i]) + digit = pd.Series(digit, index=column.index, name=column.name) + digits.append(digit) + noised_column = noised_column + digits[i] + noised_column.str.strip() + + return noised_column def generate_nicknames( diff --git a/tests/unit/test_column_noise.py b/tests/unit/test_column_noise.py index c8c0802d..1c4871bf 100644 --- a/tests/unit/test_column_noise.py +++ b/tests/unit/test_column_noise.py @@ -49,6 +49,14 @@ def categorical_series(): ) +@pytest.fixture(scope="module") +def string_series(): + return pd.Series( + ["Unit 1A", "1234", "12/31/2020", "a1b2c3", "100000.00", "123-45-6789", ""] * 100_000, + name="random_strings", + ) + + @pytest.fixture(scope="module") def default_configuration(): return get_configuration() @@ -130,9 +138,106 @@ def test_miswrite_ages(): pass -@pytest.mark.skip(reason="TODO") -def test_miswrite_numerics(): - pass +def test_miswrite_numerics(string_series): + """ + Validates that only numeric characters are noised in a series at a provided noise level. + """ + config = get_configuration() + config.update( + { + "decennial_census": { + "street_number": { + "numeric_miswriting": { + "row_noise_level": 0.4, + "token_noise_level": 0.5, + }, + }, + }, + } + ) + config = config["decennial_census"]["street_number"]["numeric_miswriting"] + p_row_noise = config.row_noise_level + p_token_noise = config.token_noise_level + data = string_series + noised_data = _validate_seed_and_noise_data( + noise_type=NOISE_TYPES.NUMERIC_MISWRITING, column=data, config=config + ) + + # Get masks for helper groups, each string in categorical string purpose is to mimic possible string types + empty_str = data == "" + unit_number = data == "Unit 1A" + id_number = data == "1234" + alt_str = data == "a1b2c3" + income = data == "100000.00" + date_of_birth = data == "12/31/2020" + ssn = data == "123-45-6789" + expected_noise = p_row_noise * p_token_noise + + # Check empty strings havent changed + assert (noised_data[empty_str] == "").all() + + for i in range(4): # "1234" + assert np.isclose( + expected_noise, + (data[id_number].str[i] != noised_data[id_number].str[i]).mean(), + rtol=0.02, + ) + assert (noised_data[id_number].str[i].str.isdigit()).all() + + for i in range(6): # "a1b2c3" + if i % 2 == 0: + assert (data[alt_str].str[i] == noised_data[alt_str].str[i]).all() + else: + assert np.isclose( + expected_noise, + (data[alt_str].str[i] != noised_data[alt_str].str[i]).mean(), + rtol=0.02, + ) + assert (noised_data[alt_str].str[i].str.isdigit()).all() + + for i in range(7): # "Unit 1A" + if i == 5: + assert np.isclose( + expected_noise, + (data[unit_number].str[i] != noised_data[unit_number].str[i]).mean(), + rtol=0.02, + ) + assert (noised_data[unit_number].str[i].str.isdigit()).all() + else: + assert (data[unit_number].str[i] == noised_data[unit_number].str[i]).all() + + for i in range(9): # "100000.00" + if i == 6: + assert (data[income].str[i] == noised_data[income].str[i]).all() + else: + assert np.isclose( + expected_noise, + (data[income].str[i] != noised_data[income].str[i]).mean(), + rtol=0.02, + ) + assert (noised_data[income].str[i].str.isdigit()).all() + + for i in range(10): # "12/31/2020" + if i in [2, 5]: + assert (data[date_of_birth].str[i] == noised_data[date_of_birth].str[i]).all() + else: + assert np.isclose( + expected_noise, + (data[date_of_birth].str[i] != noised_data[date_of_birth].str[i]).mean(), + rtol=0.02, + ) + assert (noised_data[date_of_birth].str[i].str.isdigit()).all() + + for i in range(11): # "123-45-6789" + if i in [3, 6]: + assert (data[ssn].str[i] == noised_data[ssn].str[i]).all() + else: + assert np.isclose( + expected_noise, + (data[ssn].str[i] != noised_data[ssn].str[i]).mean(), + rtol=0.02, + ) + assert (noised_data[ssn].str[i].str.isdigit()).all() @pytest.mark.skip(reason="TODO") From 3cd699c532dec4e236c18fb7876dacb835d7832f Mon Sep 17 00:00:00 2001 From: Matthew Kappel Date: Tue, 4 Apr 2023 10:16:38 -0700 Subject: [PATCH 2/6] Implement all non-1040 forms (#28) - *Category*: feature - *JIRA issue*: [MIC-3882](https://jira.ihme.washington.edu/browse/MIC-3882) Changes - Adds ACS, CPS, SSA, and WIC forms interfaces and default configuration - Adds ability to pass in DataFrame as source data and pass in a dict as configuration Testing Manual testing, ran each noising function against each sample data. Noising succeeded with noising being seen in diffs between source and noising output. --- src/pseudopeople/__init__.py | 9 +- src/pseudopeople/default_configuration.yaml | 383 +++++++++++++++++++- src/pseudopeople/interface.py | 126 ++++++- src/pseudopeople/schema_entities.py | 4 + src/pseudopeople/utilities.py | 10 +- tests/integration/test_interface.py | 6 +- tests/unit/test_noise_form.py | 2 +- 7 files changed, 512 insertions(+), 28 deletions(-) diff --git a/src/pseudopeople/__init__.py b/src/pseudopeople/__init__.py index ac749c28..02b60370 100644 --- a/src/pseudopeople/__init__.py +++ b/src/pseudopeople/__init__.py @@ -8,4 +8,11 @@ __uri__, __version__, ) -from pseudopeople.interface import generate_decennial_census, generate_w2 +from pseudopeople.interface import ( + generate_american_communities_survey, + generate_current_population_survey, + generate_decennial_census, + generate_social_security, + generate_taxes_w2_and_1099, + generate_women_infants_and_children, +) diff --git a/src/pseudopeople/default_configuration.yaml b/src/pseudopeople/default_configuration.yaml index 0a4cfa0f..416f7132 100644 --- a/src/pseudopeople/default_configuration.yaml +++ b/src/pseudopeople/default_configuration.yaml @@ -107,29 +107,49 @@ decennial_census: row_noise_level: 0.01 taxes_w2_and_1099: - omission: 0.0145 + omission: 0.0 duplication: 0.05 age: missing_data: row_noise_level: 0.01 + typographic: + row_noise_level: 0.01 + token_noise_level: 0.1 + include_original_token_level: 0.1 date_of_birth: missing_data: row_noise_level: 0.01 + typographic: + row_noise_level: 0.01 + token_noise_level: 0.1 + include_original_token_level: 0.1 numeric_miswriting: row_noise_level: 0.01 token_noise_level: 0.1 employer_city: missing_data: row_noise_level: 0.01 + typographic: + row_noise_level: 0.01 + token_noise_level: 0.1 + include_original_token_level: 0.1 employer_id: missing_data: row_noise_level: 0.01 + typographic: + row_noise_level: 0.01 + token_noise_level: 0.1 + include_original_token_level: 0.1 numeric_miswriting: row_noise_level: 0.01 token_noise_level: 0.1 employer_name: missing_data: row_noise_level: 0.01 + typographic: + row_noise_level: 0.01 + token_noise_level: 0.1 + include_original_token_level: 0.1 employer_state: missing_data: row_noise_level: 0.01 @@ -138,27 +158,51 @@ taxes_w2_and_1099: employer_street_name: missing_data: row_noise_level: 0.01 + typographic: + row_noise_level: 0.01 + token_noise_level: 0.1 + include_original_token_level: 0.1 employer_street_number: missing_data: row_noise_level: 0.01 + typographic: + row_noise_level: 0.01 + token_noise_level: 0.1 + include_original_token_level: 0.1 numeric_miswriting: row_noise_level: 0.01 token_noise_level: 0.1 employer_unit_number: missing_data: row_noise_level: 0.01 + typographic: + row_noise_level: 0.01 + token_noise_level: 0.1 + include_original_token_level: 0.1 numeric_miswriting: row_noise_level: 0.01 token_noise_level: 0.1 employer_zipcode: missing_data: row_noise_level: 0.01 + typographic: + row_noise_level: 0.01 + token_noise_level: 0.1 + include_original_token_level: 0.1 first_name: missing_data: row_noise_level: 0.01 + typographic: + row_noise_level: 0.01 + token_noise_level: 0.1 + include_original_token_level: 0.1 income: missing_data: row_noise_level: 0.01 + typographic: + row_noise_level: 0.01 + token_noise_level: 0.1 + include_original_token_level: 0.1 numeric_miswriting: row_noise_level: 0.01 token_noise_level: 0.1 @@ -170,9 +214,17 @@ taxes_w2_and_1099: last_name: missing_data: row_noise_level: 0.01 + typographic: + row_noise_level: 0.01 + token_noise_level: 0.1 + include_original_token_level: 0.1 mailing_address_city: missing_data: row_noise_level: 0.01 + typographic: + row_noise_level: 0.01 + token_noise_level: 0.1 + include_original_token_level: 0.1 mailing_address_state: missing_data: row_noise_level: 0.01 @@ -181,15 +233,27 @@ taxes_w2_and_1099: mailing_address_street_name: missing_data: row_noise_level: 0.01 + typographic: + row_noise_level: 0.01 + token_noise_level: 0.1 + include_original_token_level: 0.1 mailing_address_street_number: missing_data: row_noise_level: 0.01 + typographic: + row_noise_level: 0.01 + token_noise_level: 0.1 + include_original_token_level: 0.1 numeric_miswriting: row_noise_level: 0.01 token_noise_level: 0.1 mailing_address_unit_number: missing_data: row_noise_level: 0.01 + typographic: + row_noise_level: 0.01 + token_noise_level: 0.1 + include_original_token_level: 0.1 numeric_miswriting: row_noise_level: 0.01 token_noise_level: 0.1 @@ -200,12 +264,329 @@ taxes_w2_and_1099: mailing_address_zipcode: missing_data: row_noise_level: 0.01 + typographic: + row_noise_level: 0.01 + token_noise_level: 0.1 + include_original_token_level: 0.1 middle_initial: missing_data: row_noise_level: 0.01 + typographic: + row_noise_level: 0.01 + token_noise_level: 0.1 + include_original_token_level: 0.1 ssn: missing_data: row_noise_level: 0.01 + typographic: + row_noise_level: 0.01 + token_noise_level: 0.1 + include_original_token_level: 0.1 +american_communities_survey: + omission: 0.0145 + duplication: 0.05 + age: + missing_data: + row_noise_level: 0.01 + typographic: + row_noise_level: 0.01 + token_noise_level: 0.1 + include_original_token_level: 0.1 + city: + missing_data: + row_noise_level: 0.01 + typographic: + row_noise_level: 0.01 + token_noise_level: 0.1 + include_original_token_level: 0.1 + date_of_birth: + missing_data: + row_noise_level: 0.01 + typographic: + row_noise_level: 0.01 + token_noise_level: 0.1 + include_original_token_level: 0.1 + first_name: + missing_data: + row_noise_level: 0.01 + typographic: + row_noise_level: 0.01 + token_noise_level: 0.1 + include_original_token_level: 0.1 + last_name: + missing_data: + row_noise_level: 0.01 + typographic: + row_noise_level: 0.01 + token_noise_level: 0.1 + include_original_token_level: 0.1 + middle_initial: + missing_data: + row_noise_level: 0.01 + typographic: + row_noise_level: 0.01 + token_noise_level: 0.1 + include_original_token_level: 0.1 + sex: + missing_data: + row_noise_level: 0.01 + incorrect_selection: + row_noise_level: 0.01 + state: + missing_data: + row_noise_level: 0.01 + incorrect_selection: + row_noise_level: 0.01 + street_name: + missing_data: + row_noise_level: 0.01 + typographic: + row_noise_level: 0.01 + token_noise_level: 0.1 + include_original_token_level: 0.1 + street_number: + missing_data: + row_noise_level: 0.01 + typographic: + row_noise_level: 0.01 + token_noise_level: 0.1 + include_original_token_level: 0.1 + unit_number: + missing_data: + row_noise_level: 0.01 + typographic: + row_noise_level: 0.01 + token_noise_level: 0.1 + include_original_token_level: 0.1 + zipcode: + missing_data: + row_noise_level: 0.01 + typographic: + row_noise_level: 0.01 + token_noise_level: 0.1 + include_original_token_level: 0.1 +current_population_survey: + omission: 0.2905 + duplication: 0.05 + age: + missing_data: + row_noise_level: 0.01 + typographic: + row_noise_level: 0.01 + token_noise_level: 0.1 + include_original_token_level: 0.1 + city: + missing_data: + row_noise_level: 0.01 + typographic: + row_noise_level: 0.01 + token_noise_level: 0.1 + include_original_token_level: 0.1 + date_of_birth: + missing_data: + row_noise_level: 0.01 + typographic: + row_noise_level: 0.01 + token_noise_level: 0.1 + include_original_token_level: 0.1 + first_name: + missing_data: + row_noise_level: 0.01 + typographic: + row_noise_level: 0.01 + token_noise_level: 0.1 + include_original_token_level: 0.1 + last_name: + missing_data: + row_noise_level: 0.01 + typographic: + row_noise_level: 0.01 + token_noise_level: 0.1 + include_original_token_level: 0.1 + middle_initial: + missing_data: + row_noise_level: 0.01 + typographic: + row_noise_level: 0.01 + token_noise_level: 0.1 + include_original_token_level: 0.1 + sex: + missing_data: + row_noise_level: 0.01 + incorrect_selection: + row_noise_level: 0.01 + state: + missing_data: + row_noise_level: 0.01 + incorrect_selection: + row_noise_level: 0.01 + street_name: + missing_data: + row_noise_level: 0.01 + typographic: + row_noise_level: 0.01 + token_noise_level: 0.1 + include_original_token_level: 0.1 + street_number: + missing_data: + row_noise_level: 0.01 + typographic: + row_noise_level: 0.01 + token_noise_level: 0.1 + include_original_token_level: 0.1 + unit_number: + missing_data: + row_noise_level: 0.01 + typographic: + row_noise_level: 0.01 + token_noise_level: 0.1 + include_original_token_level: 0.1 + zipcode: + missing_data: + row_noise_level: 0.01 + typographic: + row_noise_level: 0.01 + token_noise_level: 0.1 + include_original_token_level: 0.1 + +women_infants_and_children: + omission: 0.0 + duplication: 0.05 + age: + missing_data: + row_noise_level: 0.01 + typographic: + row_noise_level: 0.01 + token_noise_level: 0.1 + include_original_token_level: 0.1 + city: + missing_data: + row_noise_level: 0.01 + typographic: + row_noise_level: 0.01 + token_noise_level: 0.1 + include_original_token_level: 0.1 + date_of_birth: + missing_data: + row_noise_level: 0.01 + typographic: + row_noise_level: 0.01 + token_noise_level: 0.1 + include_original_token_level: 0.1 + first_name: + missing_data: + row_noise_level: 0.01 + typographic: + row_noise_level: 0.01 + token_noise_level: 0.1 + include_original_token_level: 0.1 + last_name: + missing_data: + row_noise_level: 0.01 + typographic: + row_noise_level: 0.01 + token_noise_level: 0.1 + include_original_token_level: 0.1 + middle_initial: + missing_data: + row_noise_level: 0.01 + typographic: + row_noise_level: 0.01 + token_noise_level: 0.1 + include_original_token_level: 0.1 + race_ethnicity: + missing_data: + row_noise_level: 0.01 + incorrect_selection: + row_noise_level: 0.01 + sex: + missing_data: + row_noise_level: 0.01 + incorrect_selection: + row_noise_level: 0.01 + state: + missing_data: + row_noise_level: 0.01 + incorrect_selection: + row_noise_level: 0.01 + street_name: + missing_data: + row_noise_level: 0.01 + typographic: + row_noise_level: 0.01 + token_noise_level: 0.1 + include_original_token_level: 0.1 + street_number: + missing_data: + row_noise_level: 0.01 + typographic: + row_noise_level: 0.01 + token_noise_level: 0.1 + include_original_token_level: 0.1 + unit_number: + missing_data: + row_noise_level: 0.01 + typographic: + row_noise_level: 0.01 + token_noise_level: 0.1 + include_original_token_level: 0.1 + zipcode: + missing_data: + row_noise_level: 0.01 + typographic: + row_noise_level: 0.01 + token_noise_level: 0.1 + include_original_token_level: 0.1 +social_security: + omission: 0.0 + duplication: 0.05 + date_of_birth: + missing_data: + row_noise_level: 0.01 + typographic: + row_noise_level: 0.01 + token_noise_level: 0.1 + include_original_token_level: 0.1 + event_date: + missing_data: + row_noise_level: 0.01 + typographic: + row_noise_level: 0.01 + token_noise_level: 0.1 + include_original_token_level: 0.1 + event_type: + missing_data: + row_noise_level: 0.01 + incorrect_selection: + row_noise_level: 0.01 + first_name: + missing_data: + row_noise_level: 0.01 + typographic: + row_noise_level: 0.01 + token_noise_level: 0.1 + include_original_token_level: 0.1 + last_name: + missing_data: + row_noise_level: 0.01 + typographic: + row_noise_level: 0.01 + token_noise_level: 0.1 + include_original_token_level: 0.1 + middle_initial: + missing_data: + row_noise_level: 0.01 + typographic: + row_noise_level: 0.01 + token_noise_level: 0.1 + include_original_token_level: 0.1 + ssn: + missing_data: + row_noise_level: 0.01 + typographic: + row_noise_level: 0.01 + token_noise_level: 0.1 + include_original_token_level: 0.1 numeric_miswriting: row_noise_level: 0.01 token_noise_level: 0.1 diff --git a/src/pseudopeople/interface.py b/src/pseudopeople/interface.py index d767bc26..08dfb293 100644 --- a/src/pseudopeople/interface.py +++ b/src/pseudopeople/interface.py @@ -9,40 +9,132 @@ from pseudopeople.utilities import get_configuration +def _generate_form( + form: Form, + source: Union[Path, str, pd.DataFrame], + seed: int, + configuration: Union[Path, str, dict], +): + """ + Helper for generating noised forms from clean data. + + :param form: + Form needing to be noised + :param source: + Clean data input which needs to be noised + :param seed: + Seed for controlling randomness + :param configuration: + Object to configure noise levels + :return: + Noised form data + """ + configuration_tree = get_configuration(configuration) + if isinstance(source, pd.DataFrame): + data = source + else: + data = pd.read_csv(source, dtype=str, keep_default_na=False) + return noise_form(form, data, configuration_tree, seed) + + # TODO: add year as parameter to select the year of the decennial census to generate (MIC-3909) # TODO: add default path: have the package install the small data in a known location and then # to make this parameter optional, with the default being the location of the small data that # is installed with the package (MIC-3884) def generate_decennial_census( - path: Union[Path, str], seed: int = 0, configuration: Union[Path, str] = None + source: Union[Path, str, pd.DataFrame], + seed: int = 0, + configuration: Union[Path, str, dict] = None, ): """ - Generates a noised decennial census data from un-noised data. + Generates noised decennial census data from un-noised data. - :param path: A path to the un-noised source census data + :param source: A path to or pd.DataFrame of the un-noised source census data :param seed: An integer seed for randomness - :param configuration: (optional) A path to a configuration YAML file to modify default values + :param configuration: (optional) A path to a configuration YAML file or a dictionary to override the default configuration :return: A pd.DataFrame of noised census data """ - configuration_tree = get_configuration(configuration) - data = pd.read_csv(path, dtype=str, keep_default_na=False) - return noise_form(Form.CENSUS, data, configuration_tree, seed) + return _generate_form(Form.CENSUS, source, seed, configuration) -def generate_w2( - path: Union[Path, str], seed: int = 0, configuration: Union[Path, str] = None +def generate_american_communities_survey( + source: Union[Path, str, pd.DataFrame], + seed: int = 0, + configuration: Union[Path, str, dict] = None, ): """ - Generates a noised W2 data from un-noised data. + Generates noised American Communities Survey (ACS) data from un-noised data. - :param path: A path to the un-noised source W2 data + :param source: A path to or pd.DataFrame of the un-noised source ACS data :param seed: An integer seed for randomness - :param configuration: (optional) A path to a configuration YAML file to modify default values - :return: A pd.DataFrame of noised W2 data + :param configuration: (optional) A path to a configuration YAML file or a dictionary to override the default configuration + :return: A pd.DataFrame of noised ACS data """ - configuration_tree = get_configuration(configuration) - data = pd.read_csv(path, dtype=str, keep_default_na=False) - return noise_form(Form.TAX_W2_1099, data, configuration_tree, seed) + return _generate_form(Form.ACS, source, seed, configuration) + + +def generate_current_population_survey( + source: Union[Path, str, pd.DataFrame], + seed: int = 0, + configuration: Union[Path, str, dict] = None, +): + """ + Generates noised Current Population Survey (CPS) data from un-noised data. + + :param source: A path to or pd.DataFrame of the un-noised source CPS data + :param seed: An integer seed for randomness + :param configuration: (optional) A path to a configuration YAML file or a dictionary to override the default configuration + :return: A pd.DataFrame of noised CPS data + """ + return _generate_form(Form.CPS, source, seed, configuration) + + +def generate_taxes_w2_and_1099( + source: Union[Path, str, pd.DataFrame], + seed: int = 0, + configuration: Union[Path, str, dict] = None, +): + """ + Generates noised W2 and 1099 data from un-noised data. + + :param source: A path to or pd.DataFrame of the un-noised source W2 and 1099 data + :param seed: An integer seed for randomness + :param configuration: (optional) A path to a configuration YAML file or a dictionary to override the default configuration + :return: A pd.DataFrame of noised W2 and 1099 data + """ + return _generate_form(Form.TAX_W2_1099, source, seed, configuration) + + +def generate_women_infants_and_children( + source: Union[Path, str, pd.DataFrame], + seed: int = 0, + configuration: Union[Path, str, dict] = None, +): + """ + Generates noised Women Infants and Children (WIC) data from un-noised data. + + :param source: A path to or pd.DataFrame of the un-noised source WIC data + :param seed: An integer seed for randomness + :param configuration: (optional) A path to a configuration YAML file or a dictionary to override the default configuration + :return: A pd.DataFrame of noised WIC data + """ + return _generate_form(Form.WIC, source, seed, configuration) + + +def generate_social_security( + source: Union[Path, str, pd.DataFrame], + seed: int = 0, + configuration: Union[Path, str, dict] = None, +): + """ + Generates noised Social Security (SSA) data from un-noised data. + + :param source: A path to or pd.DataFrame of the un-noised source SSA data + :param seed: An integer seed for randomness + :param configuration: (optional) A path to a configuration YAML file or a dictionary to override the default configuration + :return: A pd.DataFrame of noised SSA data + """ + return _generate_form(Form.SSA, source, seed, configuration) # Manual testing helper @@ -51,7 +143,7 @@ def generate_w2( if len(args) == 1: my_path = Path(args[0]) src = pd.read_csv(my_path, dtype=str, keep_default_na=False) - out = generate_w2(my_path) + out = generate_taxes_w2_and_1099(my_path) diff = src[ ~src.astype(str).apply(tuple, 1).isin(out.astype(str).apply(tuple, 1)) ] # get all changed rows diff --git a/src/pseudopeople/schema_entities.py b/src/pseudopeople/schema_entities.py index a2e584cf..64f75946 100644 --- a/src/pseudopeople/schema_entities.py +++ b/src/pseudopeople/schema_entities.py @@ -4,6 +4,10 @@ # todo: is "form" the right word? Ask RT class Form(Enum): + """ + Enum containing all supported forms. + """ + CENSUS = "decennial_census" ACS = "american_communities_survey" CPS = "current_population_survey" diff --git a/src/pseudopeople/utilities.py b/src/pseudopeople/utilities.py index 8d779d62..ded1f07a 100644 --- a/src/pseudopeople/utilities.py +++ b/src/pseudopeople/utilities.py @@ -13,11 +13,11 @@ def get_randomness_stream(form: Form, seed: int) -> RandomnessStream: return RandomnessStream(form.value, lambda: pd.Timestamp("2020-04-01"), seed) -def get_configuration(user_yaml_path: Union[Path, str] = None) -> ConfigTree: +def get_configuration(user_configuration: Union[Path, str, dict] = None) -> ConfigTree: """ Gets a noising configuration ConfigTree, optionally overridden by a user-provided YAML. - :param user_yaml_path: A path to the YAML file defining user overrides for the defaults + :param user_configuration: A dictionary or path to the YAML file defining user overrides for the defaults :return: a ConfigTree object of the noising configuration """ import pseudopeople @@ -30,8 +30,8 @@ def get_configuration(user_yaml_path: Union[Path, str] = None) -> ConfigTree: data=Path(pseudopeople.__file__).resolve().parent / "default_configuration.yaml", layers=default_config_layers, ) - if user_yaml_path: - noising_configuration.update(user_yaml_path, layer="user") + if user_configuration: + noising_configuration.update(user_configuration, layer="user") return noising_configuration @@ -45,7 +45,7 @@ def vectorized_choice( ): """ Function that takes a list of options and uses Vivarium common random numbers framework to make a given number - of razndom choice selections. + of random choice selections. :param options: List and series of possible values to choose :param n_to_choose: Number of choices to make, the length of the returned array of values diff --git a/tests/integration/test_interface.py b/tests/integration/test_interface.py index 294f9d81..605c1d0c 100644 --- a/tests/integration/test_interface.py +++ b/tests/integration/test_interface.py @@ -15,13 +15,13 @@ def test_generate_decennial_census( # TODO: Refactor this check into a separate test noised_data = generate_decennial_census( - path=decennial_census_data_path, seed=0, configuration=user_config_path + source=decennial_census_data_path, seed=0, configuration=user_config_path ) noised_data_same_seed = generate_decennial_census( - path=decennial_census_data_path, seed=0, configuration=user_config_path + source=decennial_census_data_path, seed=0, configuration=user_config_path ) noised_data_different_seed = generate_decennial_census( - path=decennial_census_data_path, seed=1, configuration=user_config_path + source=decennial_census_data_path, seed=1, configuration=user_config_path ) assert noised_data.equals(noised_data_same_seed) diff --git a/tests/unit/test_noise_form.py b/tests/unit/test_noise_form.py index ee099f35..bedefab1 100644 --- a/tests/unit/test_noise_form.py +++ b/tests/unit/test_noise_form.py @@ -169,7 +169,7 @@ def test_correct_forms_are_used(func, form, mocker): if func == "todo": pytest.skip(reason=f"TODO: implement function for {form.value} form") mock = mocker.patch("pseudopeople.interface.noise_form") - mocker.patch("pseudopeople.interface.pd") + mocker.patch("pseudopeople.interface.pd.read_csv") _ = func("dummy/path") assert mock.call_args[0][0] == form From 480d43ccf59177667845ae981d57bf9910171f19 Mon Sep 17 00:00:00 2001 From: Steve Bachmeier <23350991+stevebachmeier@users.noreply.github.com> Date: Tue, 4 Apr 2023 13:29:26 -0600 Subject: [PATCH 3/6] implement age miswriting (#24) * implement age noising * update default_configuration.yaml --- src/pseudopeople/default_configuration.yaml | 184 ++++++++++++++++---- src/pseudopeople/noise_functions.py | 81 +++++---- src/pseudopeople/utilities.py | 95 +++++++++- tests/integration/conftest.py | 10 +- tests/unit/test_column_noise.py | 159 +++++++++++++++-- tests/unit/test_configuration.py | 78 ++++++++- 6 files changed, 512 insertions(+), 95 deletions(-) diff --git a/src/pseudopeople/default_configuration.yaml b/src/pseudopeople/default_configuration.yaml index 416f7132..44340c8a 100644 --- a/src/pseudopeople/default_configuration.yaml +++ b/src/pseudopeople/default_configuration.yaml @@ -6,54 +6,75 @@ decennial_census: omission: 0.0145 duplication: 0.05 - first_name: + age: missing_data: row_noise_level: 0.01 typographic: row_noise_level: 0.01 token_noise_level: 0.1 include_original_token_level: 0.1 - middle_initial: + age_miswriting: + row_noise_level: 0.01 + token_noise_level: 0.1 + possible_perturbations: {1: 0.5, -1: 0.5} + city: missing_data: row_noise_level: 0.01 typographic: row_noise_level: 0.01 token_noise_level: 0.1 include_original_token_level: 0.1 - last_name: + date_of_birth: missing_data: row_noise_level: 0.01 typographic: row_noise_level: 0.01 token_noise_level: 0.1 include_original_token_level: 0.1 - age: + numeric_miswriting: + row_noise_level: 0.01 + token_noise_level: 0.1 + first_name: missing_data: row_noise_level: 0.01 typographic: row_noise_level: 0.01 token_noise_level: 0.1 include_original_token_level: 0.1 - date_of_birth: + last_name: missing_data: row_noise_level: 0.01 typographic: row_noise_level: 0.01 token_noise_level: 0.1 include_original_token_level: 0.1 - numeric_miswriting: - row_noise_level: 0.01 - token_noise_level: 0.1 - street_number: + middle_initial: missing_data: row_noise_level: 0.01 typographic: row_noise_level: 0.01 token_noise_level: 0.1 include_original_token_level: 0.1 - numeric_miswriting: + race_ethnicity: + missing_data: + row_noise_level: 0.01 + incorrect_selection: + row_noise_level: 0.01 + relation_to_household_head: + missing_data: + row_noise_level: 0.01 + incorrect_selection: + row_noise_level: 0.01 + sex: + missing_data: + row_noise_level: 0.01 + incorrect_selection: + row_noise_level: 0.01 + state: + missing_data: + row_noise_level: 0.01 + incorrect_selection: row_noise_level: 0.01 - token_noise_level: 0.1 street_name: missing_data: row_noise_level: 0.01 @@ -61,7 +82,7 @@ decennial_census: row_noise_level: 0.01 token_noise_level: 0.1 include_original_token_level: 0.1 - unit_number: + street_number: missing_data: row_noise_level: 0.01 typographic: @@ -71,18 +92,16 @@ decennial_census: numeric_miswriting: row_noise_level: 0.01 token_noise_level: 0.1 - city: + unit_number: missing_data: row_noise_level: 0.01 typographic: row_noise_level: 0.01 token_noise_level: 0.1 include_original_token_level: 0.1 - state: - missing_data: - row_noise_level: 0.01 - incorrect_selection: + numeric_miswriting: row_noise_level: 0.01 + token_noise_level: 0.1 zipcode: missing_data: row_noise_level: 0.01 @@ -90,21 +109,6 @@ decennial_census: row_noise_level: 0.01 token_noise_level: 0.1 include_original_token_level: 0.1 - relation_to_household_head: - missing_data: - row_noise_level: 0.01 - incorrect_selection: - row_noise_level: 0.01 - sex: - missing_data: - row_noise_level: 0.01 - incorrect_selection: - row_noise_level: 0.01 - race_ethnicity: - missing_data: - row_noise_level: 0.01 - incorrect_selection: - row_noise_level: 0.01 taxes_w2_and_1099: omission: 0.0 @@ -116,6 +120,10 @@ taxes_w2_and_1099: row_noise_level: 0.01 token_noise_level: 0.1 include_original_token_level: 0.1 + age_miswriting: + row_noise_level: 0.01 + token_noise_level: 0.1 + possible_perturbations: {1: 0.5, -1: 0.5} date_of_birth: missing_data: row_noise_level: 0.01 @@ -206,11 +214,6 @@ taxes_w2_and_1099: numeric_miswriting: row_noise_level: 0.01 token_noise_level: 0.1 - tax_form: - missing_data: - row_noise_level: 0.01 - incorrect_selection: - row_noise_level: 0.01 last_name: missing_data: row_noise_level: 0.01 @@ -261,6 +264,13 @@ taxes_w2_and_1099: missing_data: row_noise_level: 0.01 token_noise_level: 0.1 + typographic: + row_noise_level: 0.01 + token_noise_level: 0.1 + include_original_token_level: 0.1 + numeric_miswriting: + row_noise_level: 0.01 + token_noise_level: 0.1 mailing_address_zipcode: missing_data: row_noise_level: 0.01 @@ -282,6 +292,15 @@ taxes_w2_and_1099: row_noise_level: 0.01 token_noise_level: 0.1 include_original_token_level: 0.1 + numeric_miswriting: + row_noise_level: 0.01 + token_noise_level: 0.1 + tax_form: + missing_data: + row_noise_level: 0.01 + incorrect_selection: + row_noise_level: 0.01 + american_communities_survey: omission: 0.0145 duplication: 0.05 @@ -292,6 +311,10 @@ american_communities_survey: row_noise_level: 0.01 token_noise_level: 0.1 include_original_token_level: 0.1 + age_miswriting: + row_noise_level: 0.01 + token_noise_level: 0.1 + possible_perturbations: {1: 0.5, -1: 0.5} city: missing_data: row_noise_level: 0.01 @@ -306,6 +329,9 @@ american_communities_survey: row_noise_level: 0.01 token_noise_level: 0.1 include_original_token_level: 0.1 + numeric_miswriting: + row_noise_level: 0.01 + token_noise_level: 0.1 first_name: missing_data: row_noise_level: 0.01 @@ -320,6 +346,17 @@ american_communities_survey: row_noise_level: 0.01 token_noise_level: 0.1 include_original_token_level: 0.1 + mailing_address_po_box: + missing_data: + row_noise_level: 0.01 + token_noise_level: 0.1 + typographic: + row_noise_level: 0.01 + token_noise_level: 0.1 + include_original_token_level: 0.1 + numeric_miswriting: + row_noise_level: 0.01 + token_noise_level: 0.1 middle_initial: missing_data: row_noise_level: 0.01 @@ -351,6 +388,9 @@ american_communities_survey: row_noise_level: 0.01 token_noise_level: 0.1 include_original_token_level: 0.1 + numeric_miswriting: + row_noise_level: 0.01 + token_noise_level: 0.1 unit_number: missing_data: row_noise_level: 0.01 @@ -358,6 +398,9 @@ american_communities_survey: row_noise_level: 0.01 token_noise_level: 0.1 include_original_token_level: 0.1 + numeric_miswriting: + row_noise_level: 0.01 + token_noise_level: 0.1 zipcode: missing_data: row_noise_level: 0.01 @@ -365,6 +408,7 @@ american_communities_survey: row_noise_level: 0.01 token_noise_level: 0.1 include_original_token_level: 0.1 + current_population_survey: omission: 0.2905 duplication: 0.05 @@ -375,6 +419,10 @@ current_population_survey: row_noise_level: 0.01 token_noise_level: 0.1 include_original_token_level: 0.1 + age_miswriting: + row_noise_level: 0.01 + token_noise_level: 0.1 + possible_perturbations: {1: 0.5, -1: 0.5} city: missing_data: row_noise_level: 0.01 @@ -389,6 +437,9 @@ current_population_survey: row_noise_level: 0.01 token_noise_level: 0.1 include_original_token_level: 0.1 + numeric_miswriting: + row_noise_level: 0.01 + token_noise_level: 0.1 first_name: missing_data: row_noise_level: 0.01 @@ -403,6 +454,17 @@ current_population_survey: row_noise_level: 0.01 token_noise_level: 0.1 include_original_token_level: 0.1 + mailing_address_po_box: + missing_data: + row_noise_level: 0.01 + token_noise_level: 0.1 + typographic: + row_noise_level: 0.01 + token_noise_level: 0.1 + include_original_token_level: 0.1 + numeric_miswriting: + row_noise_level: 0.01 + token_noise_level: 0.1 middle_initial: missing_data: row_noise_level: 0.01 @@ -434,6 +496,9 @@ current_population_survey: row_noise_level: 0.01 token_noise_level: 0.1 include_original_token_level: 0.1 + numeric_miswriting: + row_noise_level: 0.01 + token_noise_level: 0.1 unit_number: missing_data: row_noise_level: 0.01 @@ -441,6 +506,9 @@ current_population_survey: row_noise_level: 0.01 token_noise_level: 0.1 include_original_token_level: 0.1 + numeric_miswriting: + row_noise_level: 0.01 + token_noise_level: 0.1 zipcode: missing_data: row_noise_level: 0.01 @@ -459,6 +527,10 @@ women_infants_and_children: row_noise_level: 0.01 token_noise_level: 0.1 include_original_token_level: 0.1 + age_miswriting: + row_noise_level: 0.01 + token_noise_level: 0.1 + possible_perturbations: {1: 0.5, -1: 0.5} city: missing_data: row_noise_level: 0.01 @@ -473,6 +545,9 @@ women_infants_and_children: row_noise_level: 0.01 token_noise_level: 0.1 include_original_token_level: 0.1 + numeric_miswriting: + row_noise_level: 0.01 + token_noise_level: 0.1 first_name: missing_data: row_noise_level: 0.01 @@ -487,6 +562,17 @@ women_infants_and_children: row_noise_level: 0.01 token_noise_level: 0.1 include_original_token_level: 0.1 + mailing_address_po_box: + missing_data: + row_noise_level: 0.01 + token_noise_level: 0.1 + typographic: + row_noise_level: 0.01 + token_noise_level: 0.1 + include_original_token_level: 0.1 + numeric_miswriting: + row_noise_level: 0.01 + token_noise_level: 0.1 middle_initial: missing_data: row_noise_level: 0.01 @@ -523,6 +609,9 @@ women_infants_and_children: row_noise_level: 0.01 token_noise_level: 0.1 include_original_token_level: 0.1 + numeric_miswriting: + row_noise_level: 0.01 + token_noise_level: 0.1 unit_number: missing_data: row_noise_level: 0.01 @@ -530,6 +619,9 @@ women_infants_and_children: row_noise_level: 0.01 token_noise_level: 0.1 include_original_token_level: 0.1 + numeric_miswriting: + row_noise_level: 0.01 + token_noise_level: 0.1 zipcode: missing_data: row_noise_level: 0.01 @@ -537,9 +629,21 @@ women_infants_and_children: row_noise_level: 0.01 token_noise_level: 0.1 include_original_token_level: 0.1 + social_security: omission: 0.0 duplication: 0.05 + age: + missing_data: + row_noise_level: 0.01 + typographic: + row_noise_level: 0.01 + token_noise_level: 0.1 + include_original_token_level: 0.1 + age_miswriting: + row_noise_level: 0.01 + token_noise_level: 0.1 + possible_perturbations: {1: 0.5, -1: 0.5} date_of_birth: missing_data: row_noise_level: 0.01 @@ -547,6 +651,9 @@ social_security: row_noise_level: 0.01 token_noise_level: 0.1 include_original_token_level: 0.1 + numeric_miswriting: + row_noise_level: 0.01 + token_noise_level: 0.1 event_date: missing_data: row_noise_level: 0.01 @@ -554,6 +661,9 @@ social_security: row_noise_level: 0.01 token_noise_level: 0.1 include_original_token_level: 0.1 + numeric_miswriting: + row_noise_level: 0.01 + token_noise_level: 0.1 event_type: missing_data: row_noise_level: 0.01 diff --git a/src/pseudopeople/noise_functions.py b/src/pseudopeople/noise_functions.py index 2a2d6f97..e92a58ce 100644 --- a/src/pseudopeople/noise_functions.py +++ b/src/pseudopeople/noise_functions.py @@ -11,8 +11,8 @@ def omit_rows( - form_data: pd.DataFrame, - configuration: float, + form_data: float, + configuration: ConfigTree, randomness_stream: RandomnessStream, ) -> pd.DataFrame: """ @@ -27,8 +27,8 @@ def omit_rows( def duplicate_rows( - form_data: pd.DataFrame, - configuration: float, + form_data: float, + configuration: ConfigTree, randomness_stream: RandomnessStream, ) -> pd.DataFrame: """ @@ -80,75 +80,89 @@ def generate_incorrect_selections( def generate_within_household_copies( - form_data: pd.DataFrame, - configuration: float, + column: pd.Series, + configuration: ConfigTree, randomness_stream: RandomnessStream, additional_key: Any, -) -> pd.DataFrame: +) -> pd.Series: """ - :param form_data: + :param column: :param configuration: :param randomness_stream: :param additional_key: Key for RandomnessStream :return: """ # todo actually duplicate rows - return form_data + return column def swap_months_and_days( - form_data: pd.DataFrame, - configuration: float, + column: pd.Series, + configuration: ConfigTree, randomness_stream: RandomnessStream, additional_key: Any, -) -> pd.DataFrame: +) -> pd.Series: """ - :param form_data: + :param column: :param configuration: :param randomness_stream: :param additional_key: Key for RandomnessStream :return: """ # todo actually duplicate rows - return form_data + return column def miswrite_zipcodes( - form_data: pd.DataFrame, - configuration: float, + column: pd.Series, + configuration: ConfigTree, randomness_stream: RandomnessStream, additional_key: Any, -) -> pd.DataFrame: +) -> pd.Series: """ - :param form_data: + :param column: :param configuration: :param randomness_stream: :param additional_key: Key for RandomnessStream :return: """ # todo actually duplicate rows - return form_data + return column def miswrite_ages( - form_data: pd.DataFrame, - configuration: float, + column: pd.Series, + configuration: ConfigTree, randomness_stream: RandomnessStream, additional_key: Any, -) -> pd.DataFrame: - """ +) -> pd.Series: + """Function to mis-write ages based on perturbation parameters included in + the config file. - :param form_data: - :param configuration: - :param randomness_stream: - :param additional_key: Key for RandomnessStream + :param column: pd.Series of ages + :param configuration: ConfigTree + :param randomness_stream: Vivarium RandomnessStream + :param additional_key: additional key used for randomness_stream calls :return: """ - # todo actually duplicate rows - return form_data + possible_perturbations = configuration.possible_perturbations.to_dict() + perturbations = vectorized_choice( + options=list(possible_perturbations.keys()), + weights=list(possible_perturbations.values()), + n_to_choose=len(column), + randomness_stream=randomness_stream, + additional_key=f"{additional_key}_{column.name}_miswrite_ages", + ) + new_values = column.astype(float).astype(int) + perturbations + # Reflect negative values to positive + new_values[new_values < 0] *= -1 + # If new age == original age, subtract 1 + new_values[new_values == column.astype(int)] -= 1 + + return new_values.astype(str) def miswrite_numerics( @@ -266,7 +280,7 @@ def generate_typographical_errors( additional_key: Any, ) -> pd.Series: """Function that takes a column and applies noise to the string values - representative of keyboard mis-typing. + representative of keyboard mistyping. :param column: pd.Series of data :param configuration: ConfigTree object containing noising parameters @@ -279,7 +293,12 @@ def generate_typographical_errors( qwerty_errors = yaml.full_load(f) def keyboard_corrupt(truth, corrupted_pr, addl_pr, rng): - """Abie's implementation of typographical noising""" + """For each string, loop through each character and determine if + it is to be corrupted. If so, uniformly choose from the appropriate + values to mistype. Also determine which mistyped characters should + include the original value and, if it does, include the original value + after the mistyped value + """ err = "" i = 0 while i < len(truth): diff --git a/src/pseudopeople/utilities.py b/src/pseudopeople/utilities.py index ded1f07a..10297b30 100644 --- a/src/pseudopeople/utilities.py +++ b/src/pseudopeople/utilities.py @@ -1,8 +1,9 @@ from pathlib import Path -from typing import Any, Union +from typing import Any, Dict, Union import numpy as np import pandas as pd +import yaml from vivarium.framework.configuration import ConfigTree from vivarium.framework.randomness import RandomnessStream, random @@ -13,11 +14,11 @@ def get_randomness_stream(form: Form, seed: int) -> RandomnessStream: return RandomnessStream(form.value, lambda: pd.Timestamp("2020-04-01"), seed) -def get_configuration(user_configuration: Union[Path, str, dict] = None) -> ConfigTree: +def get_configuration(user_configuration: Union[Path, str, Dict] = None) -> ConfigTree: """ Gets a noising configuration ConfigTree, optionally overridden by a user-provided YAML. - :param user_configuration: A dictionary or path to the YAML file defining user overrides for the defaults + :param user_configuration: A path to the YAML file or a dictionary defining user overrides for the defaults :return: a ConfigTree object of the noising configuration """ import pseudopeople @@ -31,10 +32,96 @@ def get_configuration(user_configuration: Union[Path, str, dict] = None) -> Conf layers=default_config_layers, ) if user_configuration: + if isinstance(user_configuration, (Path, str)): + with open(user_configuration, "r") as f: + user_configuration = yaml.full_load(f) + user_configuration = format_user_configuration( + user_configuration, noising_configuration + ) noising_configuration.update(user_configuration, layer="user") + + validate_noising_configuration(noising_configuration) + return noising_configuration +def format_user_configuration(user_dict: Dict, default_config) -> Dict: + """Formats the user's configuration file as necessary so it can properly + update noising configuration to be used + """ + user_dict = _format_age_miswriting_perturbations(user_dict, default_config) + + return user_dict + + +def _format_age_miswriting_perturbations(user_dict: Dict, default_config: ConfigTree) -> Dict: + # Format any age perturbation lists as a dictionary with uniform probabilites + for form in user_dict: + user_perturbations = ( + user_dict[form] + .get("age", {}) + .get("age_miswriting", {}) + .get("possible_perturbations", {}) + ) + if not user_perturbations: + continue + formatted = {} + default_perturbations = default_config[form]["age"]["age_miswriting"][ + "possible_perturbations" + ] + # Replace default configuration with 0 probabilities + for perturbation in default_perturbations: + formatted[perturbation] = 0 + if isinstance(user_perturbations, list): + # Add user perturbations with uniform probabilities + uniform_prob = 1 / len(user_perturbations) + for perturbation in user_perturbations: + formatted[perturbation] = uniform_prob + elif isinstance(user_perturbations, dict): + for perturbation, prob in user_perturbations.items(): + formatted[perturbation] = prob + else: + raise NotImplementedError( + "age.age_miswriting.possible_perturbations can only be a list or dict, " + f"received type {type(user_perturbations)}" + ) + user_dict[form]["age"]["age_miswriting"]["possible_perturbations"] = formatted + + return user_dict + + +def validate_noising_configuration(config: ConfigTree) -> None: + """Perform various validation checks on the final noising ConfigTree object""" + _validate_age_miswriting(config) + + +def _validate_age_miswriting(config: ConfigTree) -> None: + possible_perturbations = extract_values(config, "possible_perturbations") + for form_perturbations in possible_perturbations: + form_perturbations_dict = form_perturbations.to_dict() + if 0 in form_perturbations_dict: + # TODO: Find a way to report specific location in config file + raise ValueError("Cannot include 0 in age_miswriting.possible_perturbations") + if sum(form_perturbations_dict.values()) != 1: + raise ValueError( + "The provided possible_perturbation probabilities must sum to 1 but they " + f"currently sum to {sum(form_perturbations_dict.values())}: {form_perturbations_dict}", + ) + + +def extract_values(config: Union[ConfigTree, Dict], key: str): + """Extract values with a specific key from a dict or configtree""" + results = [] + for k, v in config.items(): + if k == key: + results.append(v) + if isinstance(v, (dict, ConfigTree)): + for result in extract_values(v, key): + results.append(result) + + return results + + def vectorized_choice( options: Union[list, pd.Series], n_to_choose: int, @@ -65,6 +152,8 @@ def vectorized_choice( if weights is None: n = len(options) weights = np.ones(n) / n + if isinstance(weights, list): + weights = np.array(weights) # for each of n_to_choose, sample uniformly between 0 and 1 index = pd.Index(np.arange(n_to_choose)) if randomness_stream is None: diff --git a/tests/integration/conftest.py b/tests/integration/conftest.py index 060a5174..c8dfe389 100644 --- a/tests/integration/conftest.py +++ b/tests/integration/conftest.py @@ -113,11 +113,7 @@ def decennial_census_data_path(tmp_path_factory): data = pd.DataFrame( { "housing_type": [random.choice(HOUSING_TYPES) for _ in range(num_rows)], - # TODO: Currently ages are actually floats but a followup pr will ensure ints - "age": [ - str(random.randint(1, 100) + round(random.random(), 6)) - for _ in range(num_rows) - ], + "age": [str(random.randint(1, 100)) for _ in range(num_rows)], "year": [random.choice(["2020", "2030"]) for _ in range(num_rows)], "race_ethnicity": [random.choice(RACE_ETHNICITIES) for _ in range(num_rows)], "guardian_1": [ @@ -134,9 +130,7 @@ def decennial_census_data_path(tmp_path_factory): "relation_to_household_head": [ random.choice(RELATIONS_TO_HOUSEHOLD_HEAD) for _ in range(num_rows) ], - # TODO: currently zipcodes are floats (and thus not zero-padded); - # a followup PR will convert to 5-digit integer strings - "zipcode": [str(random.randint(1, 99999)) + ".0" for _ in range(num_rows)], + "zipcode": [str(random.randint(1, 99999)).zfill(5) for _ in range(num_rows)], "date_of_birth": [ time.strftime( "%Y-%m-%d", diff --git a/tests/unit/test_column_noise.py b/tests/unit/test_column_noise.py index 1c4871bf..bd50b21a 100644 --- a/tests/unit/test_column_noise.py +++ b/tests/unit/test_column_noise.py @@ -20,7 +20,7 @@ @pytest.fixture(scope="module") def dummy_dataset(): # Add a column of integer strings - num_simulants = 100_000 + num_simulants = 1_000_000 dummy_idx = pd.Index(range(num_simulants)) integer_series = pd.Series([str(x) for x in range(num_simulants)]) # Add missing data from `generate_missing_data` function @@ -57,13 +57,8 @@ def string_series(): ) -@pytest.fixture(scope="module") -def default_configuration(): - return get_configuration() - - def test_generate_missing_data(dummy_dataset): - config = get_configuration() + config = get_configuration()["decennial_census"]["zipcode"]["missing_data"] config.update( { "decennial_census": { @@ -75,7 +70,6 @@ def test_generate_missing_data(dummy_dataset): }, } ) - config = config["decennial_census"]["zipcode"]["missing_data"] data = dummy_dataset["numbers"] noised_data = _validate_seed_and_noise_data( noise_type=NOISE_TYPES.MISSING_DATA, column=data, config=config @@ -98,8 +92,8 @@ def test_generate_missing_data(dummy_dataset): assert (data[not_noised_idx] == noised_data[not_noised_idx]).all() -def test_incorrect_selection(categorical_series, default_configuration): - config = default_configuration["decennial_census"]["state"]["incorrect_selection"] +def test_incorrect_selection(categorical_series): + config = get_configuration()["decennial_census"]["state"]["incorrect_selection"] noised_data = _validate_seed_and_noise_data( noise_type=NOISE_TYPES.INCORRECT_SELECTION, column=categorical_series, config=config ) @@ -133,9 +127,148 @@ def test_miswrite_zipcodes(): pass -@pytest.mark.skip(reason="TODO") -def test_miswrite_ages(): - pass +def test_miswrite_ages_default_config(dummy_dataset): + """Test that miswritten ages are appropriately handled, including + no perturbation probabilities defaults to uniform distribution, + perturbation probabilities""" + config = get_configuration()["decennial_census"]["age"]["age_miswriting"] + data = dummy_dataset.rename(columns={"numbers": "age"})["age"] + + # Convert to realistic age + maximum_age = 120 + data = data.apply(pd.to_numeric, args=("coerce",)) + data = data / data.max() * (maximum_age + 1) + data[data.isna()] = -1 # temp nan + data = data.astype(int).astype(str) + data[data == "-1"] = "" + noised_data = _validate_seed_and_noise_data( + noise_type=NOISE_TYPES.AGE_MISWRITING, column=data, config=config + ) + + # Check for expected noise level + not_missing_idx = data.index[data != ""] + expected_noise = config["row_noise_level"] + actual_noise = (noised_data[not_missing_idx] != data[not_missing_idx]).mean() + # NOTE: we increase the relative tolerance a bit here because the expected + # noise calculated above does not account for the fact that if a perturbed + # age ends up being the same as the original age, then 1 is subtracted. + assert np.isclose(expected_noise, actual_noise, rtol=0.03) + + # Check that missing data remains missing + original_missing_idx = data.index[data == ""] + noised_missing_idx = noised_data.index[noised_data == ""] + pd.testing.assert_index_equal(original_missing_idx, noised_missing_idx) + + # Check that there are no negative ages generated + assert noised_data[not_missing_idx].astype(int).min() >= 0 + + +def test_miswrite_ages_uniform_probabilities(): + """Test that a list of perturbations passed in results in uniform probabilities""" + num_rows = 100_000 + original_age = 25 + perturbations = [-2, -1, 1] + + config = get_configuration( + { + "decennial_census": { + "age": { + "age_miswriting": { + "row_noise_level": 1, + "possible_perturbations": perturbations, + }, + }, + }, + }, + )["decennial_census"]["age"]["age_miswriting"] + + data = pd.Series([str(original_age)] * num_rows) + noised_data = NOISE_TYPES.AGE_MISWRITING(data, config, RANDOMNESS0, "test") + expected_noise = 1 / len(perturbations) + for perturbation in perturbations: + actual_noise = (noised_data.astype(int) - original_age == perturbation).mean() + assert np.isclose(actual_noise, expected_noise, rtol=0.01) + + +def test_miswrite_ages_provided_probabilities(): + """Test that provided age perturation probabilites are handled""" + num_rows = 100_000 + original_age = 25 + perturbations = {-1: 0.1, 1: 0.9} + + config = get_configuration( + { + "decennial_census": { + "age": { + "age_miswriting": { + "row_noise_level": 1, + "possible_perturbations": perturbations, + }, + }, + }, + }, + )["decennial_census"]["age"]["age_miswriting"] + + data = pd.Series([str(original_age)] * num_rows) + noised_data = NOISE_TYPES.AGE_MISWRITING(data, config, RANDOMNESS0, "test") + for perturbation in perturbations: + expected_noise = perturbations[perturbation] + actual_noise = (noised_data.astype(int) - original_age == perturbation).mean() + assert np.isclose(actual_noise, expected_noise, rtol=0.01) + + +def test_miswrite_ages_handles_perturbation_to_same_age(): + """Tests an edge case. It's possible that after an age is perturbed it ends + up being the original age. In that case, subtract 1. eg, an age of 1 that is + perturbed -2 becomes -1. But we cannot have negative so we flip the sign to +1. + But that's the same as the original age and so should become 1-1=0. + """ + num_rows = 100 + age = 1 + perturbations = [-2] # This will cause -1 which will be flipped to +1 + + config = get_configuration( + { + "decennial_census": { + "age": { + "age_miswriting": { + "row_noise_level": 1, + "possible_perturbations": perturbations, + }, + }, + }, + }, + )["decennial_census"]["age"]["age_miswriting"] + + data = pd.Series([str(age)] * num_rows) + noised_data = NOISE_TYPES.AGE_MISWRITING(data, config, RANDOMNESS0, "test") + + assert (noised_data == "0").all() + + +def test_miswrite_ages_flips_negative_to_positive(): + """Test that any ages perturbed to <0 are reflected to positive values""" + num_rows = 100 + age = 3 + perturbations = [-7] # This will cause -4 and should flip to +4 + + config = get_configuration( + { + "decennial_census": { + "age": { + "age_miswriting": { + "row_noise_level": 1, + "possible_perturbations": perturbations, + }, + }, + }, + }, + )["decennial_census"]["age"]["age_miswriting"] + + data = pd.Series([str(age)] * num_rows) + noised_data = NOISE_TYPES.AGE_MISWRITING(data, config, RANDOMNESS0, "test") + + assert (noised_data == "4").all() def test_miswrite_numerics(string_series): diff --git a/tests/unit/test_configuration.py b/tests/unit/test_configuration.py index 5ca16d13..8d5d31fc 100644 --- a/tests/unit/test_configuration.py +++ b/tests/unit/test_configuration.py @@ -1,11 +1,18 @@ from pathlib import Path +import pandas as pd import pytest import yaml +from vivarium.config_tree import ConfigTree +from vivarium.framework.randomness import RandomnessStream import pseudopeople from pseudopeople.utilities import get_configuration +RANDOMNESS0 = RandomnessStream( + key="test_column_noise", clock=lambda: pd.Timestamp("2020-09-01"), seed=0 +) + @pytest.fixture def user_configuration_yaml(tmp_path): @@ -42,8 +49,73 @@ def test_get_configuration_with_user_override(user_configuration_yaml, mocker): update_calls = [ call for call in mock.mock_calls - if "update" in str(call) - and "user" in str(call) - and str(user_configuration_yaml) in str(call) + if ".update({" in str(call) and "layer='user'" in str(call) ] assert len(update_calls) == 1 + + +def test_validate_miswrite_ages_fails_if_includes_0(): + """Test that a runtime error is thrown if the user includes 0 as a possible perturbation""" + perturbations = [-1, 0, 1] + with pytest.raises(ValueError, match="Cannot include 0"): + get_configuration( + { + "decennial_census": { + "age": { + "age_miswriting": { + "row_noise_level": 1, + "possible_perturbations": perturbations, + }, + }, + }, + }, + ) + + +def test_validate_miswrite_ages_if_probabilities_do_not_add_to_1(): + """Test that runtimerrors if probs do not add up to 1""" + perturbations = {-1: 0.1, 1: 0.8} # does not sum to 1 + + with pytest.raises(ValueError, match="must sum to 1"): + get_configuration( + { + "decennial_census": { + "age": { + "age_miswriting": { + "possible_perturbations": perturbations, + }, + }, + }, + }, + ) + + +@pytest.mark.parametrize("user_config_type", ["dict", "path"]) +def test_format_miswrite_ages(user_config_type, tmp_path): + """Test that user-supplied dictionary properly updates ConfigTree object. + This includes zero-ing out default values that don't exist in the user config + """ + user_config = { + "decennial_census": { + "age": { + "age_miswriting": { + "possible_perturbations": [-2, -1, 2], + }, + }, + }, + } + if user_config_type == "path": + filepath = tmp_path / "user_dict.yaml" + with open(filepath, "w") as file: + yaml.dump(user_config, file) + user_config = filepath + + new_dict = get_configuration(user_config).decennial_census.age.age_miswriting.to_dict() + default_dict = get_configuration().decennial_census.age.age_miswriting.to_dict() + assert default_dict["row_noise_level"] == new_dict["row_noise_level"] + assert default_dict["token_noise_level"] == new_dict["token_noise_level"] + # check that 1 got replaced with 0 probability + assert new_dict["possible_perturbations"][1] == 0 + # check that others have 1/3 probability + for p in [-2, -1, 2]: + assert new_dict["possible_perturbations"][p] == 1 / 3 From 113c1d4ac44d1b90c77cbc4363b5fb86d76b7eea Mon Sep 17 00:00:00 2001 From: Steve Bachmeier <23350991+stevebachmeier@users.noreply.github.com> Date: Tue, 4 Apr 2023 17:37:57 -0600 Subject: [PATCH 4/6] refactor seed-check method into their own pytests (#30) --- tests/unit/test_column_noise.py | 125 +++++++++++++++++++++----------- 1 file changed, 83 insertions(+), 42 deletions(-) diff --git a/tests/unit/test_column_noise.py b/tests/unit/test_column_noise.py index bd50b21a..2e05796a 100644 --- a/tests/unit/test_column_noise.py +++ b/tests/unit/test_column_noise.py @@ -19,13 +19,15 @@ @pytest.fixture(scope="module") def dummy_dataset(): - # Add a column of integer strings num_simulants = 1_000_000 dummy_idx = pd.Index(range(num_simulants)) + + # Add a column of integer strings integer_series = pd.Series([str(x) for x in range(num_simulants)]) # Add missing data from `generate_missing_data` function missing_idx = pd.Index([x for x in dummy_idx if x % 3 == 0]) integer_series.loc[missing_idx] = "" + # Add a column of character strings str_length = 6 character_series = pd.Series( @@ -39,7 +41,42 @@ def dummy_dataset(): # Add missing data from `generate_missing_data` function character_series.loc[missing_idx] = "" - return pd.DataFrame({"numbers": integer_series, "characters": character_series}) + # Add a categorical series state column + states_list = ["CA", "WA", "FL", "OR", "CO", "TX", "NY", "VA", "AZ", "''"] + states = pd.Series(states_list * int(num_simulants / len(states_list))) + + # Add age col by converting integer_series + maximum_age = 120 + ages = integer_series.apply(pd.to_numeric, args=("coerce",)) + ages = ages / ages.max() * (maximum_age + 1) + ages[ages.isna()] = -1 # temp nan + ages = ages.astype(int).astype(str) + ages[ages == "-1"] = "" + + # Add a string_series column of mixed letters and numbers + string_list = [ + "foo1", + "bar2", + "baz3", + "Unit 1A", + "1234", + "12/31/2020", + "a1b2c3", + "100000.00", + "123-45-6789", + "", + ] + string_series = pd.Series(string_list * int(num_simulants / len(string_list))) + + return pd.DataFrame( + { + "numbers": integer_series, + "characters": character_series, + "state": states, + "age": ages, + "string_series": string_series, + } + ) @pytest.fixture(scope="module") @@ -71,9 +108,7 @@ def test_generate_missing_data(dummy_dataset): } ) data = dummy_dataset["numbers"] - noised_data = _validate_seed_and_noise_data( - noise_type=NOISE_TYPES.MISSING_DATA, column=data, config=config - ) + noised_data = NOISE_TYPES.MISSING_DATA(data, config, RANDOMNESS0, "test") # Calculate newly missing data, ie data that didn't come in as already missing orig_non_missing_idx = data.index[(data.notna()) & (data != "")] @@ -94,8 +129,8 @@ def test_generate_missing_data(dummy_dataset): def test_incorrect_selection(categorical_series): config = get_configuration()["decennial_census"]["state"]["incorrect_selection"] - noised_data = _validate_seed_and_noise_data( - noise_type=NOISE_TYPES.INCORRECT_SELECTION, column=categorical_series, config=config + noised_data = NOISE_TYPES.INCORRECT_SELECTION( + categorical_series, config, RANDOMNESS0, "test" ) # Check for expected noise level @@ -132,18 +167,8 @@ def test_miswrite_ages_default_config(dummy_dataset): no perturbation probabilities defaults to uniform distribution, perturbation probabilities""" config = get_configuration()["decennial_census"]["age"]["age_miswriting"] - data = dummy_dataset.rename(columns={"numbers": "age"})["age"] - - # Convert to realistic age - maximum_age = 120 - data = data.apply(pd.to_numeric, args=("coerce",)) - data = data / data.max() * (maximum_age + 1) - data[data.isna()] = -1 # temp nan - data = data.astype(int).astype(str) - data[data == "-1"] = "" - noised_data = _validate_seed_and_noise_data( - noise_type=NOISE_TYPES.AGE_MISWRITING, column=data, config=config - ) + data = dummy_dataset["age"] + noised_data = NOISE_TYPES.AGE_MISWRITING(data, config, RANDOMNESS0, "test") # Check for expected noise level not_missing_idx = data.index[data != ""] @@ -292,9 +317,7 @@ def test_miswrite_numerics(string_series): p_row_noise = config.row_noise_level p_token_noise = config.token_noise_level data = string_series - noised_data = _validate_seed_and_noise_data( - noise_type=NOISE_TYPES.NUMERIC_MISWRITING, column=data, config=config - ) + noised_data = NOISE_TYPES.NUMERIC_MISWRITING(data, config, RANDOMNESS0, "test") # Get masks for helper groups, each string in categorical string purpose is to mimic possible string types empty_str = data == "" @@ -417,9 +440,7 @@ def test_generate_typographical_errors(dummy_dataset, column): } ) config = config["decennial_census"][column]["typographic"] - noised_data = _validate_seed_and_noise_data( - noise_type=NOISE_TYPES.TYPOGRAPHIC, column=data, config=config - ) + noised_data = NOISE_TYPES.TYPOGRAPHIC(data, config, RANDOMNESS0, "test") not_missing_idx = data.index[(data.notna()) & (data != "")] check_original = data.loc[not_missing_idx] @@ -455,22 +476,42 @@ def test_generate_typographical_errors(dummy_dataset, column): ).all() -#################### -# HELPER FUNCTIONS # -#################### - - -# TODO: refactor this into its own test parameterized by noise functions -def _validate_seed_and_noise_data(noise_type, column, config): - """Confirms randomness stream behavior and returns the noised data""" - noised_data = noise_type(column, config, RANDOMNESS0, f"test_{noise_type.name}") - noised_data_same_seed = noise_type(column, config, RANDOMNESS0, f"test_{noise_type.name}") - noised_data_different_seed = noise_type( - column, config, RANDOMNESS1, f"test_{noise_type.name}" - ) - - assert (noised_data != column).any() +@pytest.mark.parametrize( + "noise_type, data_col, form, form_col", + [ + (NOISE_TYPES.MISSING_DATA, "numbers", "decennial_census", "zipcode"), + (NOISE_TYPES.INCORRECT_SELECTION, "state", "decennial_census", "state"), + (NOISE_TYPES.COPY_FROM_WITHIN_HOUSEHOLD, "todo", "todo", "todo"), + (NOISE_TYPES.MONTH_DAY_SWAP, "todo", "todo", "todo"), + (NOISE_TYPES.ZIP_CODE_MISWRITING, "todo", "todo", "todo"), + (NOISE_TYPES.AGE_MISWRITING, "age", "decennial_census", "age"), + ( + NOISE_TYPES.NUMERIC_MISWRITING, + "string_series", + "decennial_census", + "street_number", + ), + (NOISE_TYPES.NICKNAME, "todo", "todo", "todo"), + (NOISE_TYPES.FAKE_NAME, "todo", "todo", "todo"), + (NOISE_TYPES.PHONETIC, "todo", "todo", "todo"), + (NOISE_TYPES.OCR, "todo", "todo", "todo"), + (NOISE_TYPES.TYPOGRAPHIC, "numbers", "decennial_census", "zipcode"), + (NOISE_TYPES.TYPOGRAPHIC, "characters", "decennial_census", "street_name"), + ], +) +def test_seeds_behave_as_expected(noise_type, data_col, form, form_col, dummy_dataset): + """Tests that different seeds produce different results and the same seed + produces the same results + """ + noise = noise_type.name + if data_col == "todo": + pytest.skip(reason=f"TODO: implement for function {noise}") + config = get_configuration()[form][form_col][noise] + data = dummy_dataset[data_col] + noised_data = noise_type(data, config, RANDOMNESS0, f"test_{noise}") + noised_data_same_seed = noise_type(data, config, RANDOMNESS0, f"test_{noise}") + noised_data_different_seed = noise_type(data, config, RANDOMNESS1, f"test_{noise}") + + assert (noised_data != data).any() assert (noised_data == noised_data_same_seed).all() assert (noised_data != noised_data_different_seed).any() - - return noised_data From 0c4290aa8136db282472dec46c6d48390dc23ee9 Mon Sep 17 00:00:00 2001 From: Rajan Mudambi <11376379+rmudambi@users.noreply.github.com> Date: Tue, 4 Apr 2023 17:38:58 -0700 Subject: [PATCH 5/6] read data from an hdf rather than a csv (#29) --- src/pseudopeople/data/incorrect_select_options.csv | 4 ++-- src/pseudopeople/entity_types.py | 6 +++--- src/pseudopeople/interface.py | 4 +++- src/pseudopeople/noise_functions.py | 3 ++- tests/integration/conftest.py | 4 ++-- tests/integration/test_interface.py | 2 +- tests/unit/test_column_noise.py | 11 +++++++---- tests/unit/test_noise_form.py | 4 ++-- 8 files changed, 22 insertions(+), 16 deletions(-) diff --git a/src/pseudopeople/data/incorrect_select_options.csv b/src/pseudopeople/data/incorrect_select_options.csv index e4939387..67fd1629 100644 --- a/src/pseudopeople/data/incorrect_select_options.csv +++ b/src/pseudopeople/data/incorrect_select_options.csv @@ -3,7 +3,7 @@ AL,Reference person,Female,White,W2,creation AK,Opp-sex spouse,Male,Black,1099,death AZ,Opp-sex partner,,Asian,, AR,Same-sex spouse,,AIAN,, -CA,Same-sex partne,,NHOPI,, +CA,Same-sex partner,,NHOPI,, CO,Biological child,,Multiracial or Other,, CT,Adopted child,,Latino,, DE,Stepchild,,,, @@ -16,7 +16,7 @@ IN,Other relative,,,, IA,Roommate,,,, KS,Foster child,,,, KY,Other nonrelative,,,, -LA,Institutionalized GQ po,,,, +LA,Institutionalized GQ pop,,,, ME,Noninstitutionalized GQ pop,,,, MD,,,,, MA,,,,, diff --git a/src/pseudopeople/entity_types.py b/src/pseudopeople/entity_types.py index 500cf27c..7749ce7b 100644 --- a/src/pseudopeople/entity_types.py +++ b/src/pseudopeople/entity_types.py @@ -23,7 +23,7 @@ class RowNoiseType: """ name: str - noise_function: Callable[[pd.DataFrame, float, RandomnessStream, str], pd.DataFrame] + noise_function: Callable[[pd.DataFrame, float, RandomnessStream], pd.DataFrame] def __call__( self, @@ -63,8 +63,8 @@ def __call__( to_noise_idx = get_index_to_noise( column, noise_level, randomness_stream, f"{self.name}_{additional_key}" ) - column.loc[to_noise_idx] = self.noise_function( + noised_data = self.noise_function( column.loc[to_noise_idx], configuration, randomness_stream, additional_key ) - + column.loc[to_noise_idx] = noised_data return column diff --git a/src/pseudopeople/interface.py b/src/pseudopeople/interface.py index 08dfb293..7a543478 100644 --- a/src/pseudopeople/interface.py +++ b/src/pseudopeople/interface.py @@ -33,7 +33,9 @@ def _generate_form( if isinstance(source, pd.DataFrame): data = source else: - data = pd.read_csv(source, dtype=str, keep_default_na=False) + data = pd.read_hdf(source) + if not isinstance(data, pd.DataFrame): + raise TypeError(f"File located at {source} must contain a pandas DataFrame.") return noise_form(form, data, configuration_tree, seed) diff --git a/src/pseudopeople/noise_functions.py b/src/pseudopeople/noise_functions.py index e92a58ce..03826d95 100644 --- a/src/pseudopeople/noise_functions.py +++ b/src/pseudopeople/noise_functions.py @@ -270,7 +270,7 @@ def generate_missing_data(column: pd.Series, *_: Any) -> pd.Series: :returns: pd.Series of empty strings with the index of column. """ - return pd.Series("", index=column.index) + return pd.Series(pd.NA, index=column.index) def generate_typographical_errors( @@ -322,6 +322,7 @@ def keyboard_corrupt(truth, corrupted_pr, addl_pr, rng): include_original_token_level = configuration.include_original_token_level rng = np.random.default_rng(seed=randomness_stream.seed) + column = column.astype(str) for idx in column.index: noised_value = keyboard_corrupt( column[idx], diff --git a/tests/integration/conftest.py b/tests/integration/conftest.py index c8dfe389..c20a62bc 100644 --- a/tests/integration/conftest.py +++ b/tests/integration/conftest.py @@ -163,7 +163,7 @@ def decennial_census_data_path(tmp_path_factory): } ) - data_path = tmp_path_factory.getbasetemp() / "dummy_data.csv" - data.to_csv(data_path, index=False) + data_path = tmp_path_factory.getbasetemp() / "dummy_data.hdf" + data.to_hdf(data_path, "data") return data_path diff --git a/tests/integration/test_interface.py b/tests/integration/test_interface.py index 605c1d0c..e2fda8ca 100644 --- a/tests/integration/test_interface.py +++ b/tests/integration/test_interface.py @@ -11,7 +11,7 @@ def test_generate_decennial_census( decennial_census_data_path: Union[Path, str], user_config_path: Union[Path, str] ): - data = pd.read_csv(decennial_census_data_path, dtype=str, keep_default_na=False) + data = pd.read_hdf(decennial_census_data_path) # TODO: Refactor this check into a separate test noised_data = generate_decennial_census( diff --git a/tests/unit/test_column_noise.py b/tests/unit/test_column_noise.py index 2e05796a..0a1b95bc 100644 --- a/tests/unit/test_column_noise.py +++ b/tests/unit/test_column_noise.py @@ -113,7 +113,7 @@ def test_generate_missing_data(dummy_dataset): # Calculate newly missing data, ie data that didn't come in as already missing orig_non_missing_idx = data.index[(data.notna()) & (data != "")] newly_missing_idx = noised_data.index[ - (noised_data.index.isin(orig_non_missing_idx)) & (noised_data == "") + (noised_data.index.isin(orig_non_missing_idx)) & (noised_data.isna()) ] # Check for expected noise level @@ -122,8 +122,7 @@ def test_generate_missing_data(dummy_dataset): assert np.isclose(expected_noise, actual_noise, rtol=0.02) # Check that un-noised values are unchanged - not_noised_idx = noised_data.index[noised_data != ""] - assert "" not in noised_data[not_noised_idx].values + not_noised_idx = noised_data.index[noised_data.notna()] assert (data[not_noised_idx] == noised_data[not_noised_idx]).all() @@ -513,5 +512,9 @@ def test_seeds_behave_as_expected(noise_type, data_col, form, form_col, dummy_da noised_data_different_seed = noise_type(data, config, RANDOMNESS1, f"test_{noise}") assert (noised_data != data).any() - assert (noised_data == noised_data_same_seed).all() + assert (noised_data.isna() == noised_data_same_seed.isna()).all() + assert ( + noised_data[noised_data.notna()] + == noised_data_same_seed[noised_data_same_seed.notna()] + ).all() assert (noised_data != noised_data_different_seed).any() diff --git a/tests/unit/test_noise_form.py b/tests/unit/test_noise_form.py index bedefab1..0459ea91 100644 --- a/tests/unit/test_noise_form.py +++ b/tests/unit/test_noise_form.py @@ -7,7 +7,7 @@ import pytest from vivarium.config_tree import ConfigTree -from pseudopeople.entity_types import ColumnNoiseType, RowNoiseType +from pseudopeople.entity_types import ColumnNoiseType from pseudopeople.interface import generate_decennial_census from pseudopeople.noise import noise_form from pseudopeople.noise_entities import NOISE_TYPES @@ -169,7 +169,7 @@ def test_correct_forms_are_used(func, form, mocker): if func == "todo": pytest.skip(reason=f"TODO: implement function for {form.value} form") mock = mocker.patch("pseudopeople.interface.noise_form") - mocker.patch("pseudopeople.interface.pd.read_csv") + mocker.patch("pseudopeople.interface.pd.read_hdf", return_value=pd.DataFrame()) _ = func("dummy/path") assert mock.call_args[0][0] == form From 80cd56b87d4076ec172c3e5cc9f46c1729ffc83d Mon Sep 17 00:00:00 2001 From: Rajan Mudambi <11376379+rmudambi@users.noreply.github.com> Date: Tue, 4 Apr 2023 17:47:27 -0700 Subject: [PATCH 6/6] update changelog and version (#31) --- CHANGELOG.rst | 7 +++++++ src/pseudopeople/__about__.py | 2 +- 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.rst b/CHANGELOG.rst index c3b38b91..e9b15155 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -1,3 +1,10 @@ +**0.3.0 - 04/04/23** + + - Implement numeric miswriting noise function + - Implement age miswriting noise function + - Implement additional forms: ACS, CPS, WIC, and SSA + - Read data in from HDF files instead of CSV files + **0.2.1 - 03/31/23** - Fix bug preventing generation of W2/1099 forms diff --git a/src/pseudopeople/__about__.py b/src/pseudopeople/__about__.py index 8167c4ad..b145723e 100644 --- a/src/pseudopeople/__about__.py +++ b/src/pseudopeople/__about__.py @@ -13,7 +13,7 @@ __summary__ = "pseudopeople is package which adds noise to simulated census-scale data using standard scientific Python tools." __uri__ = "https://github.com/ihmeuw/pseudopeople" -__version__ = "0.2.1" +__version__ = "0.3.0" __author__ = "The pseudopeople developers" __email__ = "vivarium.dev@gmail.com"