From b05d20d3d7bee6b8729d1f04f5f98dcadb112c57 Mon Sep 17 00:00:00 2001
From: albrja <37345113+albrja@users.noreply.github.com>
Date: Mon, 3 Apr 2023 17:30:05 -0700
Subject: [PATCH 1/6] Numeric miswriting (#26)

Numeric miswriting noise function

Implementation and tests for numeric miswriting noise function
- *Category*: Feature
- *JIRA issue*: [MIC-3907](https://jira.ihme.washington.edu/browse/MIC-3907)

- Adds numeric miswriting noise function
- Adds tests for numeric miswriting noise function

Testing
Test suites pass with no failures
---
 src/pseudopeople/default_configuration.yaml |  37 +++++++
 src/pseudopeople/noise_functions.py         |  43 ++++++--
 tests/unit/test_column_noise.py             | 111 +++++++++++++++++++-
 3 files changed, 179 insertions(+), 12 deletions(-)

diff --git a/src/pseudopeople/default_configuration.yaml b/src/pseudopeople/default_configuration.yaml
index 771ef3d7..0a4cfa0f 100644
--- a/src/pseudopeople/default_configuration.yaml
+++ b/src/pseudopeople/default_configuration.yaml
@@ -41,6 +41,9 @@ decennial_census:
             row_noise_level: 0.01
             token_noise_level: 0.1
             include_original_token_level: 0.1
+        numeric_miswriting:
+            row_noise_level: 0.01
+            token_noise_level: 0.1
     street_number:
         missing_data:
             row_noise_level: 0.01
@@ -48,6 +51,9 @@ decennial_census:
             row_noise_level: 0.01
             token_noise_level: 0.1
             include_original_token_level: 0.1
+        numeric_miswriting:
+            row_noise_level: 0.01
+            token_noise_level: 0.1
     street_name:
         missing_data:
             row_noise_level: 0.01
@@ -62,6 +68,9 @@ decennial_census:
             row_noise_level: 0.01
             token_noise_level: 0.1
             include_original_token_level: 0.1
+        numeric_miswriting:
+            row_noise_level: 0.01
+            token_noise_level: 0.1
     city:
         missing_data:
             row_noise_level: 0.01
@@ -106,12 +115,18 @@ taxes_w2_and_1099:
     date_of_birth:
         missing_data:
             row_noise_level: 0.01
+        numeric_miswriting:
+            row_noise_level: 0.01
+            token_noise_level: 0.1
     employer_city:
         missing_data:
             row_noise_level: 0.01
     employer_id:
         missing_data:
             row_noise_level: 0.01
+        numeric_miswriting:
+            row_noise_level: 0.01
+            token_noise_level: 0.1
     employer_name:
         missing_data:
             row_noise_level: 0.01
@@ -126,9 +141,15 @@ taxes_w2_and_1099:
     employer_street_number:
         missing_data:
             row_noise_level: 0.01
+        numeric_miswriting:
+            row_noise_level: 0.01
+            token_noise_level: 0.1
     employer_unit_number:
         missing_data:
             row_noise_level: 0.01
+        numeric_miswriting:
+            row_noise_level: 0.01
+            token_noise_level: 0.1
     employer_zipcode:
         missing_data:
             row_noise_level: 0.01
@@ -138,6 +159,9 @@ taxes_w2_and_1099:
     income:
         missing_data:
             row_noise_level: 0.01
+        numeric_miswriting:
+            row_noise_level: 0.01
+            token_noise_level: 0.1
     tax_form:
         missing_data:
             row_noise_level: 0.01
@@ -160,9 +184,19 @@ taxes_w2_and_1099:
     mailing_address_street_number:
         missing_data:
             row_noise_level: 0.01
+        numeric_miswriting:
+            row_noise_level: 0.01
+            token_noise_level: 0.1
     mailing_address_unit_number:
         missing_data:
             row_noise_level: 0.01
+        numeric_miswriting:
+            row_noise_level: 0.01
+            token_noise_level: 0.1
+    mailing_address_po_box:
+        missing_data:
+            row_noise_level: 0.01
+            token_noise_level: 0.1
     mailing_address_zipcode:
         missing_data:
             row_noise_level: 0.01
@@ -172,3 +206,6 @@ taxes_w2_and_1099:
     ssn:
         missing_data:
             row_noise_level: 0.01
+        numeric_miswriting:
+            row_noise_level: 0.01
+            token_noise_level: 0.1
diff --git a/src/pseudopeople/noise_functions.py b/src/pseudopeople/noise_functions.py
index ba5ba25f..2a2d6f97 100644
--- a/src/pseudopeople/noise_functions.py
+++ b/src/pseudopeople/noise_functions.py
@@ -152,21 +152,46 @@ def miswrite_ages(
 
 
 def miswrite_numerics(
-    form_data: pd.DataFrame,
-    configuration: float,
+    column: pd.Series,
+    configuration: ConfigTree,
     randomness_stream: RandomnessStream,
     additional_key: Any,
-) -> pd.DataFrame:
+) -> pd.Series:
     """
+    Function that noises numeric characters in a series.
 
-    :param form_data:
-    :param configuration:
-    :param randomness_stream:
+    :param column: A pd.Series
+    :param configuration: ConfigTree object containing noise level
+    :param randomness_stream: RandomnessStream for CRN framework.
     :param additional_key: Key for RandomnessStream
-    :return:
+
+    returns: pd.Series with some numeric values experiencing noise.
     """
-    # todo actually duplicate rows
-    return form_data
+
+    # This is a fix to not replacing the original token for noise options
+    token_noise_level = configuration.token_noise_level / 0.9
+    rng = np.random.default_rng(randomness_stream.seed)
+    column = column.astype(str)
+    longest_str = column.str.len().max()
+    same_len_col = column.str.pad(longest_str, side="right")
+    is_number = pd.concat(
+        [same_len_col.str[i].str.isdigit() for i in range(longest_str)], axis=1
+    )
+
+    replace = (rng.random(is_number.shape) < token_noise_level) & is_number
+    random_digits = rng.choice(list("0123456789"), is_number.shape)
+
+    # Choose and replace values for a noised series
+    noised_column = pd.Series("", index=column.index)
+    digits = []
+    for i in range(len(is_number.columns)):
+        digit = np.where(replace.iloc[:, i], random_digits[:, i], same_len_col.str[i])
+        digit = pd.Series(digit, index=column.index, name=column.name)
+        digits.append(digit)
+        noised_column = noised_column + digits[i]
+    noised_column.str.strip()
+
+    return noised_column
 
 
 def generate_nicknames(
diff --git a/tests/unit/test_column_noise.py b/tests/unit/test_column_noise.py
index c8c0802d..1c4871bf 100644
--- a/tests/unit/test_column_noise.py
+++ b/tests/unit/test_column_noise.py
@@ -49,6 +49,14 @@ def categorical_series():
     )
 
 
+@pytest.fixture(scope="module")
+def string_series():
+    return pd.Series(
+        ["Unit 1A", "1234", "12/31/2020", "a1b2c3", "100000.00", "123-45-6789", ""] * 100_000,
+        name="random_strings",
+    )
+
+
 @pytest.fixture(scope="module")
 def default_configuration():
     return get_configuration()
@@ -130,9 +138,106 @@ def test_miswrite_ages():
     pass
 
 
-@pytest.mark.skip(reason="TODO")
-def test_miswrite_numerics():
-    pass
+def test_miswrite_numerics(string_series):
+    """
+    Validates that only numeric characters are noised in a series at a provided noise level.
+    """
+    config = get_configuration()
+    config.update(
+        {
+            "decennial_census": {
+                "street_number": {
+                    "numeric_miswriting": {
+                        "row_noise_level": 0.4,
+                        "token_noise_level": 0.5,
+                    },
+                },
+            },
+        }
+    )
+    config = config["decennial_census"]["street_number"]["numeric_miswriting"]
+    p_row_noise = config.row_noise_level
+    p_token_noise = config.token_noise_level
+    data = string_series
+    noised_data = _validate_seed_and_noise_data(
+        noise_type=NOISE_TYPES.NUMERIC_MISWRITING, column=data, config=config
+    )
+
+    # Get masks for helper groups, each string in categorical string purpose is to mimic possible string types
+    empty_str = data == ""
+    unit_number = data == "Unit 1A"
+    id_number = data == "1234"
+    alt_str = data == "a1b2c3"
+    income = data == "100000.00"
+    date_of_birth = data == "12/31/2020"
+    ssn = data == "123-45-6789"
+    expected_noise = p_row_noise * p_token_noise
+
+    # Check empty strings havent changed
+    assert (noised_data[empty_str] == "").all()
+
+    for i in range(4):  # "1234"
+        assert np.isclose(
+            expected_noise,
+            (data[id_number].str[i] != noised_data[id_number].str[i]).mean(),
+            rtol=0.02,
+        )
+        assert (noised_data[id_number].str[i].str.isdigit()).all()
+
+    for i in range(6):  # "a1b2c3"
+        if i % 2 == 0:
+            assert (data[alt_str].str[i] == noised_data[alt_str].str[i]).all()
+        else:
+            assert np.isclose(
+                expected_noise,
+                (data[alt_str].str[i] != noised_data[alt_str].str[i]).mean(),
+                rtol=0.02,
+            )
+            assert (noised_data[alt_str].str[i].str.isdigit()).all()
+
+    for i in range(7):  # "Unit 1A"
+        if i == 5:
+            assert np.isclose(
+                expected_noise,
+                (data[unit_number].str[i] != noised_data[unit_number].str[i]).mean(),
+                rtol=0.02,
+            )
+            assert (noised_data[unit_number].str[i].str.isdigit()).all()
+        else:
+            assert (data[unit_number].str[i] == noised_data[unit_number].str[i]).all()
+
+    for i in range(9):  # "100000.00"
+        if i == 6:
+            assert (data[income].str[i] == noised_data[income].str[i]).all()
+        else:
+            assert np.isclose(
+                expected_noise,
+                (data[income].str[i] != noised_data[income].str[i]).mean(),
+                rtol=0.02,
+            )
+            assert (noised_data[income].str[i].str.isdigit()).all()
+
+    for i in range(10):  # "12/31/2020"
+        if i in [2, 5]:
+            assert (data[date_of_birth].str[i] == noised_data[date_of_birth].str[i]).all()
+        else:
+            assert np.isclose(
+                expected_noise,
+                (data[date_of_birth].str[i] != noised_data[date_of_birth].str[i]).mean(),
+                rtol=0.02,
+            )
+            assert (noised_data[date_of_birth].str[i].str.isdigit()).all()
+
+    for i in range(11):  # "123-45-6789"
+        if i in [3, 6]:
+            assert (data[ssn].str[i] == noised_data[ssn].str[i]).all()
+        else:
+            assert np.isclose(
+                expected_noise,
+                (data[ssn].str[i] != noised_data[ssn].str[i]).mean(),
+                rtol=0.02,
+            )
+            assert (noised_data[ssn].str[i].str.isdigit()).all()
 
 
 @pytest.mark.skip(reason="TODO")

From 3cd699c532dec4e236c18fb7876dacb835d7832f Mon Sep 17 00:00:00 2001
From: Matthew Kappel <mkappel@uw.edu>
Date: Tue, 4 Apr 2023 10:16:38 -0700
Subject: [PATCH 2/6] Implement all non-1040 forms (#28)

- *Category*: feature
- *JIRA issue*: [MIC-3882](https://jira.ihme.washington.edu/browse/MIC-3882)

Changes
- Adds ACS, CPS, SSA, and WIC form interfaces and default configuration
- Adds ability to pass in DataFrame as source data and pass in a dict as configuration


Testing
Manual testing: ran each noising function against each sample dataset. Noising succeeded, with the noise visible in diffs between the source data and the noised output.
---
 src/pseudopeople/__init__.py                |   9 +-
 src/pseudopeople/default_configuration.yaml | 383 +++++++++++++++++++-
 src/pseudopeople/interface.py               | 126 ++++++-
 src/pseudopeople/schema_entities.py         |   4 +
 src/pseudopeople/utilities.py               |  10 +-
 tests/integration/test_interface.py         |   6 +-
 tests/unit/test_noise_form.py               |   2 +-
 7 files changed, 512 insertions(+), 28 deletions(-)

diff --git a/src/pseudopeople/__init__.py b/src/pseudopeople/__init__.py
index ac749c28..02b60370 100644
--- a/src/pseudopeople/__init__.py
+++ b/src/pseudopeople/__init__.py
@@ -8,4 +8,11 @@
     __uri__,
     __version__,
 )
-from pseudopeople.interface import generate_decennial_census, generate_w2
+from pseudopeople.interface import (
+    generate_american_communities_survey,
+    generate_current_population_survey,
+    generate_decennial_census,
+    generate_social_security,
+    generate_taxes_w2_and_1099,
+    generate_women_infants_and_children,
+)
diff --git a/src/pseudopeople/default_configuration.yaml b/src/pseudopeople/default_configuration.yaml
index 0a4cfa0f..416f7132 100644
--- a/src/pseudopeople/default_configuration.yaml
+++ b/src/pseudopeople/default_configuration.yaml
@@ -107,29 +107,49 @@ decennial_census:
             row_noise_level: 0.01
 
 taxes_w2_and_1099:
-    omission: 0.0145
+    omission: 0.0
     duplication: 0.05
     age:
         missing_data:
             row_noise_level: 0.01
+        typographic:
+            row_noise_level: 0.01
+            token_noise_level: 0.1
+            include_original_token_level: 0.1
     date_of_birth:
         missing_data:
             row_noise_level: 0.01
+        typographic:
+            row_noise_level: 0.01
+            token_noise_level: 0.1
+            include_original_token_level: 0.1
         numeric_miswriting:
             row_noise_level: 0.01
             token_noise_level: 0.1
     employer_city:
         missing_data:
             row_noise_level: 0.01
+        typographic:
+            row_noise_level: 0.01
+            token_noise_level: 0.1
+            include_original_token_level: 0.1
     employer_id:
         missing_data:
             row_noise_level: 0.01
+        typographic:
+            row_noise_level: 0.01
+            token_noise_level: 0.1
+            include_original_token_level: 0.1
         numeric_miswriting:
             row_noise_level: 0.01
             token_noise_level: 0.1
     employer_name:
         missing_data:
             row_noise_level: 0.01
+        typographic:
+            row_noise_level: 0.01
+            token_noise_level: 0.1
+            include_original_token_level: 0.1
     employer_state:
         missing_data:
             row_noise_level: 0.01
@@ -138,27 +158,51 @@ taxes_w2_and_1099:
     employer_street_name:
         missing_data:
             row_noise_level: 0.01
+        typographic:
+            row_noise_level: 0.01
+            token_noise_level: 0.1
+            include_original_token_level: 0.1
     employer_street_number:
         missing_data:
             row_noise_level: 0.01
+        typographic:
+            row_noise_level: 0.01
+            token_noise_level: 0.1
+            include_original_token_level: 0.1
         numeric_miswriting:
             row_noise_level: 0.01
             token_noise_level: 0.1
     employer_unit_number:
         missing_data:
             row_noise_level: 0.01
+        typographic:
+            row_noise_level: 0.01
+            token_noise_level: 0.1
+            include_original_token_level: 0.1
         numeric_miswriting:
             row_noise_level: 0.01
             token_noise_level: 0.1
     employer_zipcode:
         missing_data:
             row_noise_level: 0.01
+        typographic:
+            row_noise_level: 0.01
+            token_noise_level: 0.1
+            include_original_token_level: 0.1
     first_name:
         missing_data:
             row_noise_level: 0.01
+        typographic:
+            row_noise_level: 0.01
+            token_noise_level: 0.1
+            include_original_token_level: 0.1
     income:
         missing_data:
             row_noise_level: 0.01
+        typographic:
+            row_noise_level: 0.01
+            token_noise_level: 0.1
+            include_original_token_level: 0.1
         numeric_miswriting:
             row_noise_level: 0.01
             token_noise_level: 0.1
@@ -170,9 +214,17 @@ taxes_w2_and_1099:
     last_name:
         missing_data:
             row_noise_level: 0.01
+        typographic:
+            row_noise_level: 0.01
+            token_noise_level: 0.1
+            include_original_token_level: 0.1
     mailing_address_city:
         missing_data:
             row_noise_level: 0.01
+        typographic:
+            row_noise_level: 0.01
+            token_noise_level: 0.1
+            include_original_token_level: 0.1
     mailing_address_state:
         missing_data:
             row_noise_level: 0.01
@@ -181,15 +233,27 @@ taxes_w2_and_1099:
     mailing_address_street_name:
         missing_data:
             row_noise_level: 0.01
+        typographic:
+            row_noise_level: 0.01
+            token_noise_level: 0.1
+            include_original_token_level: 0.1
     mailing_address_street_number:
         missing_data:
             row_noise_level: 0.01
+        typographic:
+            row_noise_level: 0.01
+            token_noise_level: 0.1
+            include_original_token_level: 0.1
         numeric_miswriting:
             row_noise_level: 0.01
             token_noise_level: 0.1
     mailing_address_unit_number:
         missing_data:
             row_noise_level: 0.01
+        typographic:
+            row_noise_level: 0.01
+            token_noise_level: 0.1
+            include_original_token_level: 0.1
         numeric_miswriting:
             row_noise_level: 0.01
             token_noise_level: 0.1
@@ -200,12 +264,329 @@ taxes_w2_and_1099:
     mailing_address_zipcode:
         missing_data:
             row_noise_level: 0.01
+        typographic:
+            row_noise_level: 0.01
+            token_noise_level: 0.1
+            include_original_token_level: 0.1
     middle_initial:
         missing_data:
             row_noise_level: 0.01
+        typographic:
+            row_noise_level: 0.01
+            token_noise_level: 0.1
+            include_original_token_level: 0.1
     ssn:
         missing_data:
             row_noise_level: 0.01
+        typographic:
+            row_noise_level: 0.01
+            token_noise_level: 0.1
+            include_original_token_level: 0.1
+american_communities_survey:
+    omission: 0.0145
+    duplication: 0.05
+    age:
+        missing_data:
+            row_noise_level: 0.01
+        typographic:
+            row_noise_level: 0.01
+            token_noise_level: 0.1
+            include_original_token_level: 0.1
+    city:
+        missing_data:
+            row_noise_level: 0.01
+        typographic:
+            row_noise_level: 0.01
+            token_noise_level: 0.1
+            include_original_token_level: 0.1
+    date_of_birth:
+        missing_data:
+            row_noise_level: 0.01
+        typographic:
+            row_noise_level: 0.01
+            token_noise_level: 0.1
+            include_original_token_level: 0.1
+    first_name:
+        missing_data:
+            row_noise_level: 0.01
+        typographic:
+            row_noise_level: 0.01
+            token_noise_level: 0.1
+            include_original_token_level: 0.1
+    last_name:
+        missing_data:
+            row_noise_level: 0.01
+        typographic:
+            row_noise_level: 0.01
+            token_noise_level: 0.1
+            include_original_token_level: 0.1
+    middle_initial:
+        missing_data:
+            row_noise_level: 0.01
+        typographic:
+            row_noise_level: 0.01
+            token_noise_level: 0.1
+            include_original_token_level: 0.1
+    sex:
+        missing_data:
+            row_noise_level: 0.01
+        incorrect_selection:
+            row_noise_level: 0.01
+    state:
+        missing_data:
+            row_noise_level: 0.01
+        incorrect_selection:
+            row_noise_level: 0.01
+    street_name:
+        missing_data:
+            row_noise_level: 0.01
+        typographic:
+            row_noise_level: 0.01
+            token_noise_level: 0.1
+            include_original_token_level: 0.1
+    street_number:
+        missing_data:
+            row_noise_level: 0.01
+        typographic:
+            row_noise_level: 0.01
+            token_noise_level: 0.1
+            include_original_token_level: 0.1
+    unit_number:
+        missing_data:
+            row_noise_level: 0.01
+        typographic:
+            row_noise_level: 0.01
+            token_noise_level: 0.1
+            include_original_token_level: 0.1
+    zipcode:
+        missing_data:
+            row_noise_level: 0.01
+        typographic:
+            row_noise_level: 0.01
+            token_noise_level: 0.1
+            include_original_token_level: 0.1
+current_population_survey:
+    omission: 0.2905
+    duplication: 0.05
+    age:
+        missing_data:
+            row_noise_level: 0.01
+        typographic:
+            row_noise_level: 0.01
+            token_noise_level: 0.1
+            include_original_token_level: 0.1
+    city:
+        missing_data:
+            row_noise_level: 0.01
+        typographic:
+            row_noise_level: 0.01
+            token_noise_level: 0.1
+            include_original_token_level: 0.1
+    date_of_birth:
+        missing_data:
+            row_noise_level: 0.01
+        typographic:
+            row_noise_level: 0.01
+            token_noise_level: 0.1
+            include_original_token_level: 0.1
+    first_name:
+        missing_data:
+            row_noise_level: 0.01
+        typographic:
+            row_noise_level: 0.01
+            token_noise_level: 0.1
+            include_original_token_level: 0.1
+    last_name:
+        missing_data:
+            row_noise_level: 0.01
+        typographic:
+            row_noise_level: 0.01
+            token_noise_level: 0.1
+            include_original_token_level: 0.1
+    middle_initial:
+        missing_data:
+            row_noise_level: 0.01
+        typographic:
+            row_noise_level: 0.01
+            token_noise_level: 0.1
+            include_original_token_level: 0.1
+    sex:
+        missing_data:
+            row_noise_level: 0.01
+        incorrect_selection:
+            row_noise_level: 0.01
+    state:
+        missing_data:
+            row_noise_level: 0.01
+        incorrect_selection:
+            row_noise_level: 0.01
+    street_name:
+        missing_data:
+            row_noise_level: 0.01
+        typographic:
+            row_noise_level: 0.01
+            token_noise_level: 0.1
+            include_original_token_level: 0.1
+    street_number:
+        missing_data:
+            row_noise_level: 0.01
+        typographic:
+            row_noise_level: 0.01
+            token_noise_level: 0.1
+            include_original_token_level: 0.1
+    unit_number:
+        missing_data:
+            row_noise_level: 0.01
+        typographic:
+            row_noise_level: 0.01
+            token_noise_level: 0.1
+            include_original_token_level: 0.1
+    zipcode:
+        missing_data:
+            row_noise_level: 0.01
+        typographic:
+            row_noise_level: 0.01
+            token_noise_level: 0.1
+            include_original_token_level: 0.1
+
+women_infants_and_children:
+    omission: 0.0
+    duplication: 0.05
+    age:
+        missing_data:
+            row_noise_level: 0.01
+        typographic:
+            row_noise_level: 0.01
+            token_noise_level: 0.1
+            include_original_token_level: 0.1
+    city:
+        missing_data:
+            row_noise_level: 0.01
+        typographic:
+            row_noise_level: 0.01
+            token_noise_level: 0.1
+            include_original_token_level: 0.1
+    date_of_birth:
+        missing_data:
+            row_noise_level: 0.01
+        typographic:
+            row_noise_level: 0.01
+            token_noise_level: 0.1
+            include_original_token_level: 0.1
+    first_name:
+        missing_data:
+            row_noise_level: 0.01
+        typographic:
+            row_noise_level: 0.01
+            token_noise_level: 0.1
+            include_original_token_level: 0.1
+    last_name:
+        missing_data:
+            row_noise_level: 0.01
+        typographic:
+            row_noise_level: 0.01
+            token_noise_level: 0.1
+            include_original_token_level: 0.1
+    middle_initial:
+        missing_data:
+            row_noise_level: 0.01
+        typographic:
+            row_noise_level: 0.01
+            token_noise_level: 0.1
+            include_original_token_level: 0.1
+    race_ethnicity:
+        missing_data:
+            row_noise_level: 0.01
+        incorrect_selection:
+            row_noise_level: 0.01
+    sex:
+        missing_data:
+            row_noise_level: 0.01
+        incorrect_selection:
+            row_noise_level: 0.01
+    state:
+        missing_data:
+            row_noise_level: 0.01
+        incorrect_selection:
+            row_noise_level: 0.01
+    street_name:
+        missing_data:
+            row_noise_level: 0.01
+        typographic:
+            row_noise_level: 0.01
+            token_noise_level: 0.1
+            include_original_token_level: 0.1
+    street_number:
+        missing_data:
+            row_noise_level: 0.01
+        typographic:
+            row_noise_level: 0.01
+            token_noise_level: 0.1
+            include_original_token_level: 0.1
+    unit_number:
+        missing_data:
+            row_noise_level: 0.01
+        typographic:
+            row_noise_level: 0.01
+            token_noise_level: 0.1
+            include_original_token_level: 0.1
+    zipcode:
+        missing_data:
+            row_noise_level: 0.01
+        typographic:
+            row_noise_level: 0.01
+            token_noise_level: 0.1
+            include_original_token_level: 0.1
+social_security:
+    omission: 0.0
+    duplication: 0.05
+    date_of_birth:
+        missing_data:
+            row_noise_level: 0.01
+        typographic:
+            row_noise_level: 0.01
+            token_noise_level: 0.1
+            include_original_token_level: 0.1
+    event_date:
+        missing_data:
+            row_noise_level: 0.01
+        typographic:
+            row_noise_level: 0.01
+            token_noise_level: 0.1
+            include_original_token_level: 0.1
+    event_type:
+        missing_data:
+            row_noise_level: 0.01
+        incorrect_selection:
+            row_noise_level: 0.01
+    first_name:
+        missing_data:
+            row_noise_level: 0.01
+        typographic:
+            row_noise_level: 0.01
+            token_noise_level: 0.1
+            include_original_token_level: 0.1
+    last_name:
+        missing_data:
+            row_noise_level: 0.01
+        typographic:
+            row_noise_level: 0.01
+            token_noise_level: 0.1
+            include_original_token_level: 0.1
+    middle_initial:
+        missing_data:
+            row_noise_level: 0.01
+        typographic:
+            row_noise_level: 0.01
+            token_noise_level: 0.1
+            include_original_token_level: 0.1
+    ssn:
+        missing_data:
+            row_noise_level: 0.01
+        typographic:
+            row_noise_level: 0.01
+            token_noise_level: 0.1
+            include_original_token_level: 0.1
         numeric_miswriting:
             row_noise_level: 0.01
             token_noise_level: 0.1
diff --git a/src/pseudopeople/interface.py b/src/pseudopeople/interface.py
index d767bc26..08dfb293 100644
--- a/src/pseudopeople/interface.py
+++ b/src/pseudopeople/interface.py
@@ -9,40 +9,132 @@
 from pseudopeople.utilities import get_configuration
 
 
+def _generate_form(
+    form: Form,
+    source: Union[Path, str, pd.DataFrame],
+    seed: int,
+    configuration: Union[Path, str, dict],
+):
+    """
+    Helper for generating noised forms from clean data.
+
+    :param form:
+        Form needing to be noised
+    :param source:
+        Clean data input which needs to be noised
+    :param seed:
+        Seed for controlling randomness
+    :param configuration:
+        Object to configure noise levels
+    :return:
+        Noised form data
+    """
+    configuration_tree = get_configuration(configuration)
+    if isinstance(source, pd.DataFrame):
+        data = source
+    else:
+        data = pd.read_csv(source, dtype=str, keep_default_na=False)
+    return noise_form(form, data, configuration_tree, seed)
+
+
 # TODO: add year as parameter to select the year of the decennial census to generate (MIC-3909)
 # TODO: add default path: have the package install the small data in a known location and then
 #  to make this parameter optional, with the default being the location of the small data that
 #  is installed with the package (MIC-3884)
 def generate_decennial_census(
-    path: Union[Path, str], seed: int = 0, configuration: Union[Path, str] = None
+    source: Union[Path, str, pd.DataFrame],
+    seed: int = 0,
+    configuration: Union[Path, str, dict] = None,
 ):
     """
-    Generates a noised decennial census data from un-noised data.
+    Generates noised decennial census data from un-noised data.
 
-    :param path: A path to the un-noised source census data
+    :param source: A path to or pd.DataFrame of the un-noised source census data
     :param seed: An integer seed for randomness
-    :param configuration: (optional) A path to a configuration YAML file to modify default values
+    :param configuration: (optional) A path to a configuration YAML file or a dictionary to override the default configuration
     :return: A pd.DataFrame of noised census data
     """
-    configuration_tree = get_configuration(configuration)
-    data = pd.read_csv(path, dtype=str, keep_default_na=False)
-    return noise_form(Form.CENSUS, data, configuration_tree, seed)
+    return _generate_form(Form.CENSUS, source, seed, configuration)
 
 
-def generate_w2(
-    path: Union[Path, str], seed: int = 0, configuration: Union[Path, str] = None
+def generate_american_communities_survey(
+    source: Union[Path, str, pd.DataFrame],
+    seed: int = 0,
+    configuration: Union[Path, str, dict] = None,
 ):
     """
-    Generates a noised W2 data from un-noised data.
+    Generates noised American Communities Survey (ACS) data from un-noised data.
 
-    :param path: A path to the un-noised source W2 data
+    :param source: A path to or pd.DataFrame of the un-noised source ACS data
     :param seed: An integer seed for randomness
-    :param configuration: (optional) A path to a configuration YAML file to modify default values
-    :return: A pd.DataFrame of noised W2 data
+    :param configuration: (optional) A path to a configuration YAML file or a dictionary to override the default configuration
+    :return: A pd.DataFrame of noised ACS data
     """
-    configuration_tree = get_configuration(configuration)
-    data = pd.read_csv(path, dtype=str, keep_default_na=False)
-    return noise_form(Form.TAX_W2_1099, data, configuration_tree, seed)
+    return _generate_form(Form.ACS, source, seed, configuration)
+
+
+def generate_current_population_survey(
+    source: Union[Path, str, pd.DataFrame],
+    seed: int = 0,
+    configuration: Union[Path, str, dict] = None,
+):
+    """
+    Generates noised Current Population Survey (CPS) data from un-noised data.
+
+    :param source: A path to or pd.DataFrame of the un-noised source CPS data
+    :param seed: An integer seed for randomness
+    :param configuration: (optional) A path to a configuration YAML file or a dictionary to override the default configuration
+    :return: A pd.DataFrame of noised CPS data
+    """
+    return _generate_form(Form.CPS, source, seed, configuration)
+
+
+def generate_taxes_w2_and_1099(
+    source: Union[Path, str, pd.DataFrame],
+    seed: int = 0,
+    configuration: Union[Path, str, dict] = None,
+):
+    """
+    Generates noised W2 and 1099 data from un-noised data.
+
+    :param source: A path to or pd.DataFrame of the un-noised source W2 and 1099 data
+    :param seed: An integer seed for randomness
+    :param configuration: (optional) A path to a configuration YAML file or a dictionary to override the default configuration
+    :return: A pd.DataFrame of noised W2 and 1099 data
+    """
+    return _generate_form(Form.TAX_W2_1099, source, seed, configuration)
+
+
+def generate_women_infants_and_children(
+    source: Union[Path, str, pd.DataFrame],
+    seed: int = 0,
+    configuration: Union[Path, str, dict] = None,
+):
+    """
+    Generates noised Women Infants and Children (WIC) data from un-noised data.
+
+    :param source: A path to or pd.DataFrame of the un-noised source WIC data
+    :param seed: An integer seed for randomness
+    :param configuration: (optional) A path to a configuration YAML file or a dictionary to override the default configuration
+    :return: A pd.DataFrame of noised WIC data
+    """
+    return _generate_form(Form.WIC, source, seed, configuration)
+
+
+def generate_social_security(
+    source: Union[Path, str, pd.DataFrame],
+    seed: int = 0,
+    configuration: Union[Path, str, dict] = None,
+):
+    """
+    Generates noised Social Security (SSA) data from un-noised data.
+
+    :param source: A path to or pd.DataFrame of the un-noised source SSA data
+    :param seed: An integer seed for randomness
+    :param configuration: (optional) A path to a configuration YAML file or a dictionary to override the default configuration
+    :return: A pd.DataFrame of noised SSA data
+    """
+    return _generate_form(Form.SSA, source, seed, configuration)
 
 
 # Manual testing helper
@@ -51,7 +143,7 @@ def generate_w2(
     if len(args) == 1:
         my_path = Path(args[0])
         src = pd.read_csv(my_path, dtype=str, keep_default_na=False)
-        out = generate_w2(my_path)
+        out = generate_taxes_w2_and_1099(my_path)
         diff = src[
             ~src.astype(str).apply(tuple, 1).isin(out.astype(str).apply(tuple, 1))
         ]  # get all changed rows
diff --git a/src/pseudopeople/schema_entities.py b/src/pseudopeople/schema_entities.py
index a2e584cf..64f75946 100644
--- a/src/pseudopeople/schema_entities.py
+++ b/src/pseudopeople/schema_entities.py
@@ -4,6 +4,10 @@
 
 # todo: is "form" the right word? Ask RT
 class Form(Enum):
+    """
+    Enum containing all supported forms.
+    """
+
     CENSUS = "decennial_census"
     ACS = "american_communities_survey"
     CPS = "current_population_survey"
diff --git a/src/pseudopeople/utilities.py b/src/pseudopeople/utilities.py
index 8d779d62..ded1f07a 100644
--- a/src/pseudopeople/utilities.py
+++ b/src/pseudopeople/utilities.py
@@ -13,11 +13,11 @@ def get_randomness_stream(form: Form, seed: int) -> RandomnessStream:
     return RandomnessStream(form.value, lambda: pd.Timestamp("2020-04-01"), seed)
 
 
-def get_configuration(user_yaml_path: Union[Path, str] = None) -> ConfigTree:
+def get_configuration(user_configuration: Union[Path, str, dict] = None) -> ConfigTree:
     """
     Gets a noising configuration ConfigTree, optionally overridden by a user-provided YAML.
 
-    :param user_yaml_path: A path to the YAML file defining user overrides for the defaults
+    :param user_configuration: A dictionary or path to the YAML file defining user overrides for the defaults
     :return: a ConfigTree object of the noising configuration
     """
     import pseudopeople
@@ -30,8 +30,8 @@ def get_configuration(user_yaml_path: Union[Path, str] = None) -> ConfigTree:
         data=Path(pseudopeople.__file__).resolve().parent / "default_configuration.yaml",
         layers=default_config_layers,
     )
-    if user_yaml_path:
-        noising_configuration.update(user_yaml_path, layer="user")
+    if user_configuration:
+        noising_configuration.update(user_configuration, layer="user")
     return noising_configuration
 
 
@@ -45,7 +45,7 @@ def vectorized_choice(
 ):
     """
     Function that takes a list of options and uses Vivarium common random numbers framework to make a given number
-    of razndom choice selections.
+    of random choice selections.
 
     :param options: List and series of possible values to choose
     :param n_to_choose: Number of choices to make, the length of the returned array of values
diff --git a/tests/integration/test_interface.py b/tests/integration/test_interface.py
index 294f9d81..605c1d0c 100644
--- a/tests/integration/test_interface.py
+++ b/tests/integration/test_interface.py
@@ -15,13 +15,13 @@ def test_generate_decennial_census(
 
     # TODO: Refactor this check into a separate test
     noised_data = generate_decennial_census(
-        path=decennial_census_data_path, seed=0, configuration=user_config_path
+        source=decennial_census_data_path, seed=0, configuration=user_config_path
     )
     noised_data_same_seed = generate_decennial_census(
-        path=decennial_census_data_path, seed=0, configuration=user_config_path
+        source=decennial_census_data_path, seed=0, configuration=user_config_path
     )
     noised_data_different_seed = generate_decennial_census(
-        path=decennial_census_data_path, seed=1, configuration=user_config_path
+        source=decennial_census_data_path, seed=1, configuration=user_config_path
     )
 
     assert noised_data.equals(noised_data_same_seed)
diff --git a/tests/unit/test_noise_form.py b/tests/unit/test_noise_form.py
index ee099f35..bedefab1 100644
--- a/tests/unit/test_noise_form.py
+++ b/tests/unit/test_noise_form.py
@@ -169,7 +169,7 @@ def test_correct_forms_are_used(func, form, mocker):
     if func == "todo":
         pytest.skip(reason=f"TODO: implement function for {form.value} form")
     mock = mocker.patch("pseudopeople.interface.noise_form")
-    mocker.patch("pseudopeople.interface.pd")
+    mocker.patch("pseudopeople.interface.pd.read_csv")
     _ = func("dummy/path")
 
     assert mock.call_args[0][0] == form

From 480d43ccf59177667845ae981d57bf9910171f19 Mon Sep 17 00:00:00 2001
From: Steve Bachmeier <23350991+stevebachmeier@users.noreply.github.com>
Date: Tue, 4 Apr 2023 13:29:26 -0600
Subject: [PATCH 3/6] implement age miswriting (#24)

* implement age noising
* update default_configuration.yaml
---
 src/pseudopeople/default_configuration.yaml | 184 ++++++++++++++++----
 src/pseudopeople/noise_functions.py         |  81 +++++----
 src/pseudopeople/utilities.py               |  95 +++++++++-
 tests/integration/conftest.py               |  10 +-
 tests/unit/test_column_noise.py             | 159 +++++++++++++++--
 tests/unit/test_configuration.py            |  78 ++++++++-
 6 files changed, 512 insertions(+), 95 deletions(-)

diff --git a/src/pseudopeople/default_configuration.yaml b/src/pseudopeople/default_configuration.yaml
index 416f7132..44340c8a 100644
--- a/src/pseudopeople/default_configuration.yaml
+++ b/src/pseudopeople/default_configuration.yaml
@@ -6,54 +6,75 @@
 decennial_census:
     omission: 0.0145
     duplication: 0.05
-    first_name:
+    age:
         missing_data:
             row_noise_level: 0.01
         typographic:
             row_noise_level: 0.01
             token_noise_level: 0.1
             include_original_token_level: 0.1
-    middle_initial:
+        age_miswriting:
+            row_noise_level: 0.01
+            token_noise_level: 0.1
+            possible_perturbations: {1: 0.5, -1: 0.5}
+    city:
         missing_data:
             row_noise_level: 0.01
         typographic:
             row_noise_level: 0.01
             token_noise_level: 0.1
             include_original_token_level: 0.1
-    last_name:
+    date_of_birth:
         missing_data:
             row_noise_level: 0.01
         typographic:
             row_noise_level: 0.01
             token_noise_level: 0.1
             include_original_token_level: 0.1
-    age:
+        numeric_miswriting:
+            row_noise_level: 0.01
+            token_noise_level: 0.1
+    first_name:
         missing_data:
             row_noise_level: 0.01
         typographic:
             row_noise_level: 0.01
             token_noise_level: 0.1
             include_original_token_level: 0.1
-    date_of_birth:
+    last_name:
         missing_data:
             row_noise_level: 0.01
         typographic:
             row_noise_level: 0.01
             token_noise_level: 0.1
             include_original_token_level: 0.1
-        numeric_miswriting:
-            row_noise_level: 0.01
-            token_noise_level: 0.1
-    street_number:
+    middle_initial:
         missing_data:
             row_noise_level: 0.01
         typographic:
             row_noise_level: 0.01
             token_noise_level: 0.1
             include_original_token_level: 0.1
-        numeric_miswriting:
+    race_ethnicity:
+        missing_data:
+            row_noise_level: 0.01
+        incorrect_selection:
+            row_noise_level: 0.01
+    relation_to_household_head:
+        missing_data:
+            row_noise_level: 0.01
+        incorrect_selection:
+            row_noise_level: 0.01
+    sex:
+        missing_data:
+            row_noise_level: 0.01
+        incorrect_selection:
+            row_noise_level: 0.01
+    state:
+        missing_data:
+            row_noise_level: 0.01
+        incorrect_selection:
             row_noise_level: 0.01
-            token_noise_level: 0.1
     street_name:
         missing_data:
             row_noise_level: 0.01
@@ -61,7 +82,7 @@ decennial_census:
             row_noise_level: 0.01
             token_noise_level: 0.1
             include_original_token_level: 0.1
-    unit_number:
+    street_number:
         missing_data:
             row_noise_level: 0.01
         typographic:
@@ -71,18 +92,16 @@ decennial_census:
         numeric_miswriting:
             row_noise_level: 0.01
             token_noise_level: 0.1
-    city:
+    unit_number:
         missing_data:
             row_noise_level: 0.01
         typographic:
             row_noise_level: 0.01
             token_noise_level: 0.1
             include_original_token_level: 0.1
-    state:
-        missing_data:
-            row_noise_level: 0.01
-        incorrect_selection:
+        numeric_miswriting:
             row_noise_level: 0.01
+            token_noise_level: 0.1
     zipcode:
         missing_data:
             row_noise_level: 0.01
@@ -90,21 +109,6 @@ decennial_census:
             row_noise_level: 0.01
             token_noise_level: 0.1
             include_original_token_level: 0.1
-    relation_to_household_head:
-        missing_data:
-            row_noise_level: 0.01
-        incorrect_selection:
-            row_noise_level: 0.01
-    sex:
-        missing_data:
-            row_noise_level: 0.01
-        incorrect_selection:
-            row_noise_level: 0.01
-    race_ethnicity:
-        missing_data:
-            row_noise_level: 0.01
-        incorrect_selection:
-            row_noise_level: 0.01
 
 taxes_w2_and_1099:
     omission: 0.0
@@ -116,6 +120,10 @@ taxes_w2_and_1099:
             row_noise_level: 0.01
             token_noise_level: 0.1
             include_original_token_level: 0.1
+        age_miswriting:
+            row_noise_level: 0.01
+            token_noise_level: 0.1
+            possible_perturbations: {1: 0.5, -1: 0.5}
     date_of_birth:
         missing_data:
             row_noise_level: 0.01
@@ -206,11 +214,6 @@ taxes_w2_and_1099:
         numeric_miswriting:
             row_noise_level: 0.01
             token_noise_level: 0.1
-    tax_form:
-        missing_data:
-            row_noise_level: 0.01
-        incorrect_selection:
-            row_noise_level: 0.01
     last_name:
         missing_data:
             row_noise_level: 0.01
@@ -261,6 +264,13 @@ taxes_w2_and_1099:
         missing_data:
             row_noise_level: 0.01
             token_noise_level: 0.1
+        typographic:
+            row_noise_level: 0.01
+            token_noise_level: 0.1
+            include_original_token_level: 0.1
+        numeric_miswriting:
+            row_noise_level: 0.01
+            token_noise_level: 0.1
     mailing_address_zipcode:
         missing_data:
             row_noise_level: 0.01
@@ -282,6 +292,15 @@ taxes_w2_and_1099:
             row_noise_level: 0.01
             token_noise_level: 0.1
             include_original_token_level: 0.1
+        numeric_miswriting:
+            row_noise_level: 0.01
+            token_noise_level: 0.1
+    tax_form:
+        missing_data:
+            row_noise_level: 0.01
+        incorrect_selection:
+            row_noise_level: 0.01
+
 american_communities_survey:
     omission: 0.0145
     duplication: 0.05
@@ -292,6 +311,10 @@ american_communities_survey:
             row_noise_level: 0.01
             token_noise_level: 0.1
             include_original_token_level: 0.1
+        age_miswriting:
+            row_noise_level: 0.01
+            token_noise_level: 0.1
+            possible_perturbations: {1: 0.5, -1: 0.5}
     city:
         missing_data:
             row_noise_level: 0.01
@@ -306,6 +329,9 @@ american_communities_survey:
             row_noise_level: 0.01
             token_noise_level: 0.1
             include_original_token_level: 0.1
+        numeric_miswriting:
+            row_noise_level: 0.01
+            token_noise_level: 0.1
     first_name:
         missing_data:
             row_noise_level: 0.01
@@ -320,6 +346,17 @@ american_communities_survey:
             row_noise_level: 0.01
             token_noise_level: 0.1
             include_original_token_level: 0.1
+    mailing_address_po_box:
+        missing_data:
+            row_noise_level: 0.01
+            token_noise_level: 0.1
+        typographic:
+            row_noise_level: 0.01
+            token_noise_level: 0.1
+            include_original_token_level: 0.1
+        numeric_miswriting:
+            row_noise_level: 0.01
+            token_noise_level: 0.1
     middle_initial:
         missing_data:
             row_noise_level: 0.01
@@ -351,6 +388,9 @@ american_communities_survey:
             row_noise_level: 0.01
             token_noise_level: 0.1
             include_original_token_level: 0.1
+        numeric_miswriting:
+            row_noise_level: 0.01
+            token_noise_level: 0.1
     unit_number:
         missing_data:
             row_noise_level: 0.01
@@ -358,6 +398,9 @@ american_communities_survey:
             row_noise_level: 0.01
             token_noise_level: 0.1
             include_original_token_level: 0.1
+        numeric_miswriting:
+            row_noise_level: 0.01
+            token_noise_level: 0.1
     zipcode:
         missing_data:
             row_noise_level: 0.01
@@ -365,6 +408,7 @@ american_communities_survey:
             row_noise_level: 0.01
             token_noise_level: 0.1
             include_original_token_level: 0.1
+
 current_population_survey:
     omission: 0.2905
     duplication: 0.05
@@ -375,6 +419,10 @@ current_population_survey:
             row_noise_level: 0.01
             token_noise_level: 0.1
             include_original_token_level: 0.1
+        age_miswriting:
+            row_noise_level: 0.01
+            token_noise_level: 0.1
+            possible_perturbations: {1: 0.5, -1: 0.5}
     city:
         missing_data:
             row_noise_level: 0.01
@@ -389,6 +437,9 @@ current_population_survey:
             row_noise_level: 0.01
             token_noise_level: 0.1
             include_original_token_level: 0.1
+        numeric_miswriting:
+            row_noise_level: 0.01
+            token_noise_level: 0.1
     first_name:
         missing_data:
             row_noise_level: 0.01
@@ -403,6 +454,17 @@ current_population_survey:
             row_noise_level: 0.01
             token_noise_level: 0.1
             include_original_token_level: 0.1
+    mailing_address_po_box:
+        missing_data:
+            row_noise_level: 0.01
+            token_noise_level: 0.1
+        typographic:
+            row_noise_level: 0.01
+            token_noise_level: 0.1
+            include_original_token_level: 0.1
+        numeric_miswriting:
+            row_noise_level: 0.01
+            token_noise_level: 0.1
     middle_initial:
         missing_data:
             row_noise_level: 0.01
@@ -434,6 +496,9 @@ current_population_survey:
             row_noise_level: 0.01
             token_noise_level: 0.1
             include_original_token_level: 0.1
+        numeric_miswriting:
+            row_noise_level: 0.01
+            token_noise_level: 0.1
     unit_number:
         missing_data:
             row_noise_level: 0.01
@@ -441,6 +506,9 @@ current_population_survey:
             row_noise_level: 0.01
             token_noise_level: 0.1
             include_original_token_level: 0.1
+        numeric_miswriting:
+            row_noise_level: 0.01
+            token_noise_level: 0.1
     zipcode:
         missing_data:
             row_noise_level: 0.01
@@ -459,6 +527,10 @@ women_infants_and_children:
             row_noise_level: 0.01
             token_noise_level: 0.1
             include_original_token_level: 0.1
+        age_miswriting:
+            row_noise_level: 0.01
+            token_noise_level: 0.1
+            possible_perturbations: {1: 0.5, -1: 0.5}
     city:
         missing_data:
             row_noise_level: 0.01
@@ -473,6 +545,9 @@ women_infants_and_children:
             row_noise_level: 0.01
             token_noise_level: 0.1
             include_original_token_level: 0.1
+        numeric_miswriting:
+            row_noise_level: 0.01
+            token_noise_level: 0.1
     first_name:
         missing_data:
             row_noise_level: 0.01
@@ -487,6 +562,17 @@ women_infants_and_children:
             row_noise_level: 0.01
             token_noise_level: 0.1
             include_original_token_level: 0.1
+    mailing_address_po_box:
+        missing_data:
+            row_noise_level: 0.01
+            token_noise_level: 0.1
+        typographic:
+            row_noise_level: 0.01
+            token_noise_level: 0.1
+            include_original_token_level: 0.1
+        numeric_miswriting:
+            row_noise_level: 0.01
+            token_noise_level: 0.1
     middle_initial:
         missing_data:
             row_noise_level: 0.01
@@ -523,6 +609,9 @@ women_infants_and_children:
             row_noise_level: 0.01
             token_noise_level: 0.1
             include_original_token_level: 0.1
+        numeric_miswriting:
+            row_noise_level: 0.01
+            token_noise_level: 0.1
     unit_number:
         missing_data:
             row_noise_level: 0.01
@@ -530,6 +619,9 @@ women_infants_and_children:
             row_noise_level: 0.01
             token_noise_level: 0.1
             include_original_token_level: 0.1
+        numeric_miswriting:
+            row_noise_level: 0.01
+            token_noise_level: 0.1
     zipcode:
         missing_data:
             row_noise_level: 0.01
@@ -537,9 +629,21 @@ women_infants_and_children:
             row_noise_level: 0.01
             token_noise_level: 0.1
             include_original_token_level: 0.1
+
 social_security:
     omission: 0.0
     duplication: 0.05
+    age:
+        missing_data:
+            row_noise_level: 0.01
+        typographic:
+            row_noise_level: 0.01
+            token_noise_level: 0.1
+            include_original_token_level: 0.1
+        age_miswriting:
+            row_noise_level: 0.01
+            token_noise_level: 0.1
+            possible_perturbations: {1: 0.5, -1: 0.5}
     date_of_birth:
         missing_data:
             row_noise_level: 0.01
@@ -547,6 +651,9 @@ social_security:
             row_noise_level: 0.01
             token_noise_level: 0.1
             include_original_token_level: 0.1
+        numeric_miswriting:
+            row_noise_level: 0.01
+            token_noise_level: 0.1
     event_date:
         missing_data:
             row_noise_level: 0.01
@@ -554,6 +661,9 @@ social_security:
             row_noise_level: 0.01
             token_noise_level: 0.1
             include_original_token_level: 0.1
+        numeric_miswriting:
+            row_noise_level: 0.01
+            token_noise_level: 0.1
     event_type:
         missing_data:
             row_noise_level: 0.01
diff --git a/src/pseudopeople/noise_functions.py b/src/pseudopeople/noise_functions.py
index 2a2d6f97..e92a58ce 100644
--- a/src/pseudopeople/noise_functions.py
+++ b/src/pseudopeople/noise_functions.py
@@ -11,8 +11,8 @@
 
 
 def omit_rows(
-    form_data: pd.DataFrame,
-    configuration: float,
+    form_data: float,
+    configuration: ConfigTree,
     randomness_stream: RandomnessStream,
 ) -> pd.DataFrame:
     """
@@ -27,8 +27,8 @@ def omit_rows(
 
 
 def duplicate_rows(
-    form_data: pd.DataFrame,
-    configuration: float,
+    form_data: float,
+    configuration: ConfigTree,
     randomness_stream: RandomnessStream,
 ) -> pd.DataFrame:
     """
@@ -80,75 +80,89 @@ def generate_incorrect_selections(
 
 
 def generate_within_household_copies(
-    form_data: pd.DataFrame,
-    configuration: float,
+    column: pd.Series,
+    configuration: ConfigTree,
     randomness_stream: RandomnessStream,
     additional_key: Any,
-) -> pd.DataFrame:
+) -> pd.Series:
     """
 
-    :param form_data:
+    :param column:
     :param configuration:
     :param randomness_stream:
     :param additional_key: Key for RandomnessStream
     :return:
     """
     # todo actually duplicate rows
-    return form_data
+    return column
 
 
 def swap_months_and_days(
-    form_data: pd.DataFrame,
-    configuration: float,
+    column: pd.Series,
+    configuration: ConfigTree,
     randomness_stream: RandomnessStream,
     additional_key: Any,
-) -> pd.DataFrame:
+) -> pd.Series:
     """
 
-    :param form_data:
+    :param column:
     :param configuration:
     :param randomness_stream:
     :param additional_key: Key for RandomnessStream
     :return:
     """
     # todo actually duplicate rows
-    return form_data
+    return column
 
 
 def miswrite_zipcodes(
-    form_data: pd.DataFrame,
-    configuration: float,
+    column: pd.Series,
+    configuration: ConfigTree,
     randomness_stream: RandomnessStream,
     additional_key: Any,
-) -> pd.DataFrame:
+) -> pd.Series:
     """
 
-    :param form_data:
+    :param column:
     :param configuration:
     :param randomness_stream:
     :param additional_key: Key for RandomnessStream
     :return:
     """
     # todo actually duplicate rows
-    return form_data
+    return column
 
 
 def miswrite_ages(
-    form_data: pd.DataFrame,
-    configuration: float,
+    column: pd.Series,
+    configuration: ConfigTree,
     randomness_stream: RandomnessStream,
     additional_key: Any,
-) -> pd.DataFrame:
-    """
+) -> pd.Series:
+    """Function to mis-write ages based on perturbation parameters included in
+    the config file.
 
-    :param form_data:
-    :param configuration:
-    :param randomness_stream:
-    :param additional_key: Key for RandomnessStream
+    :param column: pd.Series of ages
+    :param configuration: ConfigTree
+    :param randomness_stream: Vivarium RandomnessStream
+    :param additional_key: additional key used for randomness_stream calls
     :return:
     """
-    # todo actually duplicate rows
-    return form_data
+    possible_perturbations = configuration.possible_perturbations.to_dict()
+    perturbations = vectorized_choice(
+        options=list(possible_perturbations.keys()),
+        weights=list(possible_perturbations.values()),
+        n_to_choose=len(column),
+        randomness_stream=randomness_stream,
+        additional_key=f"{additional_key}_{column.name}_miswrite_ages",
+    )
+    new_values = column.astype(float).astype(int) + perturbations
+    # Reflect negative values to positive
+    new_values[new_values < 0] *= -1
+    # If new age == original age, subtract 1
+    new_values[new_values == column.astype(int)] -= 1
+
+    return new_values.astype(str)
 
 
 def miswrite_numerics(
@@ -266,7 +280,7 @@ def generate_typographical_errors(
     additional_key: Any,
 ) -> pd.Series:
     """Function that takes a column and applies noise to the string values
-    representative of keyboard mis-typing.
+    representative of keyboard mistyping.
 
     :param column:  pd.Series of data
     :param configuration: ConfigTree object containing noising parameters
@@ -279,7 +293,12 @@ def generate_typographical_errors(
         qwerty_errors = yaml.full_load(f)
 
     def keyboard_corrupt(truth, corrupted_pr, addl_pr, rng):
-        """Abie's implementation of typographical noising"""
+        """For each string, loop through each character and determine if
+        it is to be corrupted. If so, uniformly choose from the appropriate
+        values to mistype. Also determine which mistyped characters should
+        include the original value and, if it does, include the original value
+        after the mistyped value
+        """
         err = ""
         i = 0
         while i < len(truth):
diff --git a/src/pseudopeople/utilities.py b/src/pseudopeople/utilities.py
index ded1f07a..10297b30 100644
--- a/src/pseudopeople/utilities.py
+++ b/src/pseudopeople/utilities.py
@@ -1,8 +1,9 @@
 from pathlib import Path
-from typing import Any, Union
+from typing import Any, Dict, Union
 
 import numpy as np
 import pandas as pd
+import yaml
 from vivarium.framework.configuration import ConfigTree
 from vivarium.framework.randomness import RandomnessStream, random
 
@@ -13,11 +14,11 @@ def get_randomness_stream(form: Form, seed: int) -> RandomnessStream:
     return RandomnessStream(form.value, lambda: pd.Timestamp("2020-04-01"), seed)
 
 
-def get_configuration(user_configuration: Union[Path, str, dict] = None) -> ConfigTree:
+def get_configuration(user_configuration: Union[Path, str, Dict] = None) -> ConfigTree:
     """
     Gets a noising configuration ConfigTree, optionally overridden by a user-provided YAML.
 
-    :param user_configuration: A dictionary or path to the YAML file defining user overrides for the defaults
+    :param user_configuration: A path to the YAML file or a dictionary defining user overrides for the defaults
     :return: a ConfigTree object of the noising configuration
     """
     import pseudopeople
@@ -31,10 +32,96 @@ def get_configuration(user_configuration: Union[Path, str, dict] = None) -> Conf
         layers=default_config_layers,
     )
     if user_configuration:
+        if isinstance(user_configuration, (Path, str)):
+            with open(user_configuration, "r") as f:
+                user_configuration = yaml.full_load(f)
+        user_configuration = format_user_configuration(
+            user_configuration, noising_configuration
+        )
         noising_configuration.update(user_configuration, layer="user")
+
+    validate_noising_configuration(noising_configuration)
+
     return noising_configuration
 
 
+def format_user_configuration(user_dict: Dict, default_config) -> Dict:
+    """Formats the user's configuration file as necessary so it can properly
+    update noising configuration to be used
+    """
+    user_dict = _format_age_miswriting_perturbations(user_dict, default_config)
+
+    return user_dict
+
+
+def _format_age_miswriting_perturbations(user_dict: Dict, default_config: ConfigTree) -> Dict:
+    # Format any age perturbation lists as a dictionary with uniform probabilities
+    for form in user_dict:
+        user_perturbations = (
+            user_dict[form]
+            .get("age", {})
+            .get("age_miswriting", {})
+            .get("possible_perturbations", {})
+        )
+        if not user_perturbations:
+            continue
+        formatted = {}
+        default_perturbations = default_config[form]["age"]["age_miswriting"][
+            "possible_perturbations"
+        ]
+        # Replace default configuration with 0 probabilities
+        for perturbation in default_perturbations:
+            formatted[perturbation] = 0
+        if isinstance(user_perturbations, list):
+            # Add user perturbations with uniform probabilities
+            uniform_prob = 1 / len(user_perturbations)
+            for perturbation in user_perturbations:
+                formatted[perturbation] = uniform_prob
+        elif isinstance(user_perturbations, dict):
+            for perturbation, prob in user_perturbations.items():
+                formatted[perturbation] = prob
+        else:
+            raise NotImplementedError(
+                "age.age_miswriting.possible_perturbations can only be a list or dict, "
+                f"received type {type(user_perturbations)}"
+            )
+        user_dict[form]["age"]["age_miswriting"]["possible_perturbations"] = formatted
+
+    return user_dict
+
+
+def validate_noising_configuration(config: ConfigTree) -> None:
+    """Perform various validation checks on the final noising ConfigTree object"""
+    _validate_age_miswriting(config)
+
+
+def _validate_age_miswriting(config: ConfigTree) -> None:
+    """Reject possible_perturbations containing 0 or with weights not summing to 1"""
+    for form_perturbations in extract_values(config, "possible_perturbations"):
+        form_perturbations_dict = form_perturbations.to_dict()
+        if 0 in form_perturbations_dict:
+            # TODO: Find a way to report specific location in config file
+            raise ValueError("Cannot include 0 in age_miswriting.possible_perturbations")
+        if not np.isclose(sum(form_perturbations_dict.values()), 1):  # float-tolerant
+            raise ValueError(
+                "The provided possible_perturbation probabilities must sum to 1 but they "
+                f"currently sum to {sum(form_perturbations_dict.values())}: {form_perturbations_dict}",
+            )
+
+
+def extract_values(config: Union[ConfigTree, Dict], key: str):
+    """Extract values with a specific key from a dict or configtree"""
+    results = []
+    for k, v in config.items():
+        if k == key:
+            results.append(v)
+        if isinstance(v, (dict, ConfigTree)):
+            for result in extract_values(v, key):
+                results.append(result)
+
+    return results
+
+
 def vectorized_choice(
     options: Union[list, pd.Series],
     n_to_choose: int,
@@ -65,6 +152,8 @@ def vectorized_choice(
     if weights is None:
         n = len(options)
         weights = np.ones(n) / n
+    if isinstance(weights, list):
+        weights = np.array(weights)
     # for each of n_to_choose, sample uniformly between 0 and 1
     index = pd.Index(np.arange(n_to_choose))
     if randomness_stream is None:
diff --git a/tests/integration/conftest.py b/tests/integration/conftest.py
index 060a5174..c8dfe389 100644
--- a/tests/integration/conftest.py
+++ b/tests/integration/conftest.py
@@ -113,11 +113,7 @@ def decennial_census_data_path(tmp_path_factory):
     data = pd.DataFrame(
         {
             "housing_type": [random.choice(HOUSING_TYPES) for _ in range(num_rows)],
-            # TODO: Currently ages are actually floats but a followup pr will ensure ints
-            "age": [
-                str(random.randint(1, 100) + round(random.random(), 6))
-                for _ in range(num_rows)
-            ],
+            "age": [str(random.randint(1, 100)) for _ in range(num_rows)],
             "year": [random.choice(["2020", "2030"]) for _ in range(num_rows)],
             "race_ethnicity": [random.choice(RACE_ETHNICITIES) for _ in range(num_rows)],
             "guardian_1": [
@@ -134,9 +130,7 @@ def decennial_census_data_path(tmp_path_factory):
             "relation_to_household_head": [
                 random.choice(RELATIONS_TO_HOUSEHOLD_HEAD) for _ in range(num_rows)
             ],
-            # TODO: currently zipcodes are floats (and thus not zero-padded);
-            # a followup PR will convert to 5-digit integer strings
-            "zipcode": [str(random.randint(1, 99999)) + ".0" for _ in range(num_rows)],
+            "zipcode": [str(random.randint(1, 99999)).zfill(5) for _ in range(num_rows)],
             "date_of_birth": [
                 time.strftime(
                     "%Y-%m-%d",
diff --git a/tests/unit/test_column_noise.py b/tests/unit/test_column_noise.py
index 1c4871bf..bd50b21a 100644
--- a/tests/unit/test_column_noise.py
+++ b/tests/unit/test_column_noise.py
@@ -20,7 +20,7 @@
 @pytest.fixture(scope="module")
 def dummy_dataset():
     # Add a column of integer strings
-    num_simulants = 100_000
+    num_simulants = 1_000_000
     dummy_idx = pd.Index(range(num_simulants))
     integer_series = pd.Series([str(x) for x in range(num_simulants)])
     # Add missing data from `generate_missing_data` function
@@ -57,13 +57,8 @@ def string_series():
     )
 
 
-@pytest.fixture(scope="module")
-def default_configuration():
-    return get_configuration()
-
-
 def test_generate_missing_data(dummy_dataset):
-    config = get_configuration()
+    config = get_configuration()["decennial_census"]["zipcode"]["missing_data"]
     config.update(
         {
             "decennial_census": {
@@ -75,7 +70,6 @@ def test_generate_missing_data(dummy_dataset):
             },
         }
     )
-    config = config["decennial_census"]["zipcode"]["missing_data"]
     data = dummy_dataset["numbers"]
     noised_data = _validate_seed_and_noise_data(
         noise_type=NOISE_TYPES.MISSING_DATA, column=data, config=config
@@ -98,8 +92,8 @@ def test_generate_missing_data(dummy_dataset):
     assert (data[not_noised_idx] == noised_data[not_noised_idx]).all()
 
 
-def test_incorrect_selection(categorical_series, default_configuration):
-    config = default_configuration["decennial_census"]["state"]["incorrect_selection"]
+def test_incorrect_selection(categorical_series):
+    config = get_configuration()["decennial_census"]["state"]["incorrect_selection"]
     noised_data = _validate_seed_and_noise_data(
         noise_type=NOISE_TYPES.INCORRECT_SELECTION, column=categorical_series, config=config
     )
@@ -133,9 +127,148 @@ def test_miswrite_zipcodes():
     pass
 
 
-@pytest.mark.skip(reason="TODO")
-def test_miswrite_ages():
-    pass
+def test_miswrite_ages_default_config(dummy_dataset):
+    """Test that miswritten ages are appropriately handled, including
+    no perturbation probabilities defaults to uniform distribution,
+    perturbation probabilities"""
+    config = get_configuration()["decennial_census"]["age"]["age_miswriting"]
+    data = dummy_dataset.rename(columns={"numbers": "age"})["age"]
+
+    # Convert to realistic age
+    maximum_age = 120
+    data = data.apply(pd.to_numeric, args=("coerce",))
+    data = data / data.max() * (maximum_age + 1)
+    data[data.isna()] = -1  # temp nan
+    data = data.astype(int).astype(str)
+    data[data == "-1"] = ""
+    noised_data = _validate_seed_and_noise_data(
+        noise_type=NOISE_TYPES.AGE_MISWRITING, column=data, config=config
+    )
+
+    # Check for expected noise level
+    not_missing_idx = data.index[data != ""]
+    expected_noise = config["row_noise_level"]
+    actual_noise = (noised_data[not_missing_idx] != data[not_missing_idx]).mean()
+    # NOTE: we increase the relative tolerance a bit here because the expected
+    # noise calculated above does not account for the fact that if a perturbed
+    # age ends up being the same as the original age, then 1 is subtracted.
+    assert np.isclose(expected_noise, actual_noise, rtol=0.03)
+
+    # Check that missing data remains missing
+    original_missing_idx = data.index[data == ""]
+    noised_missing_idx = noised_data.index[noised_data == ""]
+    pd.testing.assert_index_equal(original_missing_idx, noised_missing_idx)
+
+    # Check that there are no negative ages generated
+    assert noised_data[not_missing_idx].astype(int).min() >= 0
+
+
+def test_miswrite_ages_uniform_probabilities():
+    """Test that a list of perturbations passed in results in uniform probabilities"""
+    num_rows = 100_000
+    original_age = 25
+    perturbations = [-2, -1, 1]
+
+    config = get_configuration(
+        {
+            "decennial_census": {
+                "age": {
+                    "age_miswriting": {
+                        "row_noise_level": 1,
+                        "possible_perturbations": perturbations,
+                    },
+                },
+            },
+        },
+    )["decennial_census"]["age"]["age_miswriting"]
+
+    data = pd.Series([str(original_age)] * num_rows)
+    noised_data = NOISE_TYPES.AGE_MISWRITING(data, config, RANDOMNESS0, "test")
+    expected_noise = 1 / len(perturbations)
+    for perturbation in perturbations:
+        actual_noise = (noised_data.astype(int) - original_age == perturbation).mean()
+        assert np.isclose(actual_noise, expected_noise, rtol=0.01)
+
+
+def test_miswrite_ages_provided_probabilities():
+    """Test that provided age perturbation probabilities are handled"""
+    num_rows = 100_000
+    original_age = 25
+    perturbations = {-1: 0.1, 1: 0.9}
+
+    config = get_configuration(
+        {
+            "decennial_census": {
+                "age": {
+                    "age_miswriting": {
+                        "row_noise_level": 1,
+                        "possible_perturbations": perturbations,
+                    },
+                },
+            },
+        },
+    )["decennial_census"]["age"]["age_miswriting"]
+
+    data = pd.Series([str(original_age)] * num_rows)
+    noised_data = NOISE_TYPES.AGE_MISWRITING(data, config, RANDOMNESS0, "test")
+    for perturbation in perturbations:
+        expected_noise = perturbations[perturbation]
+        actual_noise = (noised_data.astype(int) - original_age == perturbation).mean()
+        assert np.isclose(actual_noise, expected_noise, rtol=0.01)
+
+
+def test_miswrite_ages_handles_perturbation_to_same_age():
+    """Tests an edge case. It's possible that after an age is perturbed it ends
+    up being the original age. In that case, subtract 1. eg, an age of 1 that is
+    perturbed -2 becomes -1. But we cannot have negative so we flip the sign to +1.
+    But that's the same as the original age and so should become 1-1=0.
+    """
+    num_rows = 100
+    age = 1
+    perturbations = [-2]  # This will cause -1 which will be flipped to +1
+
+    config = get_configuration(
+        {
+            "decennial_census": {
+                "age": {
+                    "age_miswriting": {
+                        "row_noise_level": 1,
+                        "possible_perturbations": perturbations,
+                    },
+                },
+            },
+        },
+    )["decennial_census"]["age"]["age_miswriting"]
+
+    data = pd.Series([str(age)] * num_rows)
+    noised_data = NOISE_TYPES.AGE_MISWRITING(data, config, RANDOMNESS0, "test")
+
+    assert (noised_data == "0").all()
+
+
+def test_miswrite_ages_flips_negative_to_positive():
+    """Test that any ages perturbed to <0 are reflected to positive values"""
+    num_rows = 100
+    age = 3
+    perturbations = [-7]  # This will cause -4 and should flip to +4
+
+    config = get_configuration(
+        {
+            "decennial_census": {
+                "age": {
+                    "age_miswriting": {
+                        "row_noise_level": 1,
+                        "possible_perturbations": perturbations,
+                    },
+                },
+            },
+        },
+    )["decennial_census"]["age"]["age_miswriting"]
+
+    data = pd.Series([str(age)] * num_rows)
+    noised_data = NOISE_TYPES.AGE_MISWRITING(data, config, RANDOMNESS0, "test")
+
+    assert (noised_data == "4").all()
 
 
 def test_miswrite_numerics(string_series):
diff --git a/tests/unit/test_configuration.py b/tests/unit/test_configuration.py
index 5ca16d13..8d5d31fc 100644
--- a/tests/unit/test_configuration.py
+++ b/tests/unit/test_configuration.py
@@ -1,11 +1,18 @@
 from pathlib import Path
 
+import pandas as pd
 import pytest
 import yaml
+from vivarium.config_tree import ConfigTree
+from vivarium.framework.randomness import RandomnessStream
 
 import pseudopeople
 from pseudopeople.utilities import get_configuration
 
+RANDOMNESS0 = RandomnessStream(
+    key="test_column_noise", clock=lambda: pd.Timestamp("2020-09-01"), seed=0
+)
+
 
 @pytest.fixture
 def user_configuration_yaml(tmp_path):
@@ -42,8 +49,73 @@ def test_get_configuration_with_user_override(user_configuration_yaml, mocker):
     update_calls = [
         call
         for call in mock.mock_calls
-        if "update" in str(call)
-        and "user" in str(call)
-        and str(user_configuration_yaml) in str(call)
+        if ".update({" in str(call) and "layer='user'" in str(call)
     ]
     assert len(update_calls) == 1
+
+
+def test_validate_miswrite_ages_fails_if_includes_0():
+    """Test that a ValueError is raised if the user includes 0 as a possible perturbation"""
+    perturbations = [-1, 0, 1]
+    with pytest.raises(ValueError, match="Cannot include 0"):
+        get_configuration(
+            {
+                "decennial_census": {
+                    "age": {
+                        "age_miswriting": {
+                            "row_noise_level": 1,
+                            "possible_perturbations": perturbations,
+                        },
+                    },
+                },
+            },
+        )
+
+
+def test_validate_miswrite_ages_if_probabilities_do_not_add_to_1():
+    """Test that a ValueError is raised if the probabilities do not sum to 1"""
+    perturbations = {-1: 0.1, 1: 0.8}  # does not sum to 1
+
+    with pytest.raises(ValueError, match="must sum to 1"):
+        get_configuration(
+            {
+                "decennial_census": {
+                    "age": {
+                        "age_miswriting": {
+                            "possible_perturbations": perturbations,
+                        },
+                    },
+                },
+            },
+        )
+
+
+@pytest.mark.parametrize("user_config_type", ["dict", "path"])
+def test_format_miswrite_ages(user_config_type, tmp_path):
+    """Test that user-supplied dictionary properly updates ConfigTree object.
+    This includes zero-ing out default values that don't exist in the user config
+    """
+    user_config = {
+        "decennial_census": {
+            "age": {
+                "age_miswriting": {
+                    "possible_perturbations": [-2, -1, 2],
+                },
+            },
+        },
+    }
+    if user_config_type == "path":
+        filepath = tmp_path / "user_dict.yaml"
+        with open(filepath, "w") as file:
+            yaml.dump(user_config, file)
+        user_config = filepath
+
+    new_dict = get_configuration(user_config).decennial_census.age.age_miswriting.to_dict()
+    default_dict = get_configuration().decennial_census.age.age_miswriting.to_dict()
+    assert default_dict["row_noise_level"] == new_dict["row_noise_level"]
+    assert default_dict["token_noise_level"] == new_dict["token_noise_level"]
+    # check that 1 got replaced with 0 probability
+    assert new_dict["possible_perturbations"][1] == 0
+    # check that others have 1/3 probability
+    for p in [-2, -1, 2]:
+        assert new_dict["possible_perturbations"][p] == 1 / 3

From 113c1d4ac44d1b90c77cbc4363b5fb86d76b7eea Mon Sep 17 00:00:00 2001
From: Steve Bachmeier <23350991+stevebachmeier@users.noreply.github.com>
Date: Tue, 4 Apr 2023 17:37:57 -0600
Subject: [PATCH 4/6] refactor seed-check method into their own pytests (#30)

---
 tests/unit/test_column_noise.py | 125 +++++++++++++++++++++-----------
 1 file changed, 83 insertions(+), 42 deletions(-)

diff --git a/tests/unit/test_column_noise.py b/tests/unit/test_column_noise.py
index bd50b21a..2e05796a 100644
--- a/tests/unit/test_column_noise.py
+++ b/tests/unit/test_column_noise.py
@@ -19,13 +19,15 @@
 
 @pytest.fixture(scope="module")
 def dummy_dataset():
-    # Add a column of integer strings
     num_simulants = 1_000_000
     dummy_idx = pd.Index(range(num_simulants))
+
+    # Add a column of integer strings
     integer_series = pd.Series([str(x) for x in range(num_simulants)])
     # Add missing data from `generate_missing_data` function
     missing_idx = pd.Index([x for x in dummy_idx if x % 3 == 0])
     integer_series.loc[missing_idx] = ""
+
     # Add a column of character strings
     str_length = 6
     character_series = pd.Series(
@@ -39,7 +41,42 @@ def dummy_dataset():
     # Add missing data from `generate_missing_data` function
     character_series.loc[missing_idx] = ""
 
-    return pd.DataFrame({"numbers": integer_series, "characters": character_series})
+    # Add a categorical series state column
+    states_list = ["CA", "WA", "FL", "OR", "CO", "TX", "NY", "VA", "AZ", "''"]
+    states = pd.Series(states_list * int(num_simulants / len(states_list)))
+
+    # Add age col by converting integer_series
+    maximum_age = 120
+    ages = integer_series.apply(pd.to_numeric, args=("coerce",))
+    ages = ages / ages.max() * (maximum_age + 1)
+    ages[ages.isna()] = -1  # temp nan
+    ages = ages.astype(int).astype(str)
+    ages[ages == "-1"] = ""
+
+    # Add a string_series column of mixed letters and numbers
+    string_list = [
+        "foo1",
+        "bar2",
+        "baz3",
+        "Unit 1A",
+        "1234",
+        "12/31/2020",
+        "a1b2c3",
+        "100000.00",
+        "123-45-6789",
+        "",
+    ]
+    string_series = pd.Series(string_list * int(num_simulants / len(string_list)))
+
+    return pd.DataFrame(
+        {
+            "numbers": integer_series,
+            "characters": character_series,
+            "state": states,
+            "age": ages,
+            "string_series": string_series,
+        }
+    )
 
 
 @pytest.fixture(scope="module")
@@ -71,9 +108,7 @@ def test_generate_missing_data(dummy_dataset):
         }
     )
     data = dummy_dataset["numbers"]
-    noised_data = _validate_seed_and_noise_data(
-        noise_type=NOISE_TYPES.MISSING_DATA, column=data, config=config
-    )
+    noised_data = NOISE_TYPES.MISSING_DATA(data, config, RANDOMNESS0, "test")
 
     # Calculate newly missing data, ie data that didn't come in as already missing
     orig_non_missing_idx = data.index[(data.notna()) & (data != "")]
@@ -94,8 +129,8 @@ def test_generate_missing_data(dummy_dataset):
 
 def test_incorrect_selection(categorical_series):
     config = get_configuration()["decennial_census"]["state"]["incorrect_selection"]
-    noised_data = _validate_seed_and_noise_data(
-        noise_type=NOISE_TYPES.INCORRECT_SELECTION, column=categorical_series, config=config
+    noised_data = NOISE_TYPES.INCORRECT_SELECTION(
+        categorical_series, config, RANDOMNESS0, "test"
     )
 
     # Check for expected noise level
@@ -132,18 +167,8 @@ def test_miswrite_ages_default_config(dummy_dataset):
     no perturbation probabilities defaults to uniform distribution,
     perturbation probabilities"""
     config = get_configuration()["decennial_census"]["age"]["age_miswriting"]
-    data = dummy_dataset.rename(columns={"numbers": "age"})["age"]
-
-    # Convert to realistic age
-    maximum_age = 120
-    data = data.apply(pd.to_numeric, args=("coerce",))
-    data = data / data.max() * (maximum_age + 1)
-    data[data.isna()] = -1  # temp nan
-    data = data.astype(int).astype(str)
-    data[data == "-1"] = ""
-    noised_data = _validate_seed_and_noise_data(
-        noise_type=NOISE_TYPES.AGE_MISWRITING, column=data, config=config
-    )
+    data = dummy_dataset["age"]
+    noised_data = NOISE_TYPES.AGE_MISWRITING(data, config, RANDOMNESS0, "test")
 
     # Check for expected noise level
     not_missing_idx = data.index[data != ""]
@@ -292,9 +317,7 @@ def test_miswrite_numerics(string_series):
     p_row_noise = config.row_noise_level
     p_token_noise = config.token_noise_level
     data = string_series
-    noised_data = _validate_seed_and_noise_data(
-        noise_type=NOISE_TYPES.NUMERIC_MISWRITING, column=data, config=config
-    )
+    noised_data = NOISE_TYPES.NUMERIC_MISWRITING(data, config, RANDOMNESS0, "test")
 
     # Get masks for helper groups, each string in categorical string purpose is to mimic possible string types
     empty_str = data == ""
@@ -417,9 +440,7 @@ def test_generate_typographical_errors(dummy_dataset, column):
         }
     )
     config = config["decennial_census"][column]["typographic"]
-    noised_data = _validate_seed_and_noise_data(
-        noise_type=NOISE_TYPES.TYPOGRAPHIC, column=data, config=config
-    )
+    noised_data = NOISE_TYPES.TYPOGRAPHIC(data, config, RANDOMNESS0, "test")
 
     not_missing_idx = data.index[(data.notna()) & (data != "")]
     check_original = data.loc[not_missing_idx]
@@ -455,22 +476,42 @@ def test_generate_typographical_errors(dummy_dataset, column):
     ).all()
 
 
-####################
-# HELPER FUNCTIONS #
-####################
-
-
-# TODO: refactor this into its own test parameterized by noise functions
-def _validate_seed_and_noise_data(noise_type, column, config):
-    """Confirms randomness stream behavior and returns the noised data"""
-    noised_data = noise_type(column, config, RANDOMNESS0, f"test_{noise_type.name}")
-    noised_data_same_seed = noise_type(column, config, RANDOMNESS0, f"test_{noise_type.name}")
-    noised_data_different_seed = noise_type(
-        column, config, RANDOMNESS1, f"test_{noise_type.name}"
-    )
-
-    assert (noised_data != column).any()
+@pytest.mark.parametrize(
+    "noise_type, data_col, form, form_col",
+    [
+        (NOISE_TYPES.MISSING_DATA, "numbers", "decennial_census", "zipcode"),
+        (NOISE_TYPES.INCORRECT_SELECTION, "state", "decennial_census", "state"),
+        (NOISE_TYPES.COPY_FROM_WITHIN_HOUSEHOLD, "todo", "todo", "todo"),
+        (NOISE_TYPES.MONTH_DAY_SWAP, "todo", "todo", "todo"),
+        (NOISE_TYPES.ZIP_CODE_MISWRITING, "todo", "todo", "todo"),
+        (NOISE_TYPES.AGE_MISWRITING, "age", "decennial_census", "age"),
+        (
+            NOISE_TYPES.NUMERIC_MISWRITING,
+            "string_series",
+            "decennial_census",
+            "street_number",
+        ),
+        (NOISE_TYPES.NICKNAME, "todo", "todo", "todo"),
+        (NOISE_TYPES.FAKE_NAME, "todo", "todo", "todo"),
+        (NOISE_TYPES.PHONETIC, "todo", "todo", "todo"),
+        (NOISE_TYPES.OCR, "todo", "todo", "todo"),
+        (NOISE_TYPES.TYPOGRAPHIC, "numbers", "decennial_census", "zipcode"),
+        (NOISE_TYPES.TYPOGRAPHIC, "characters", "decennial_census", "street_name"),
+    ],
+)
+def test_seeds_behave_as_expected(noise_type, data_col, form, form_col, dummy_dataset):
+    """Tests that different seeds produce different results and the same seed
+    produces the same results
+    """
+    noise = noise_type.name
+    if data_col == "todo":
+        pytest.skip(reason=f"TODO: implement for function {noise}")
+    config = get_configuration()[form][form_col][noise]
+    data = dummy_dataset[data_col]
+    noised_data = noise_type(data, config, RANDOMNESS0, f"test_{noise}")
+    noised_data_same_seed = noise_type(data, config, RANDOMNESS0, f"test_{noise}")
+    noised_data_different_seed = noise_type(data, config, RANDOMNESS1, f"test_{noise}")
+
+    assert (noised_data != data).any()
     assert (noised_data == noised_data_same_seed).all()
     assert (noised_data != noised_data_different_seed).any()
-
-    return noised_data

From 0c4290aa8136db282472dec46c6d48390dc23ee9 Mon Sep 17 00:00:00 2001
From: Rajan Mudambi <11376379+rmudambi@users.noreply.github.com>
Date: Tue, 4 Apr 2023 17:38:58 -0700
Subject: [PATCH 5/6] read data from an hdf rather than a csv (#29)

---
 src/pseudopeople/data/incorrect_select_options.csv |  4 ++--
 src/pseudopeople/entity_types.py                   |  6 +++---
 src/pseudopeople/interface.py                      |  4 +++-
 src/pseudopeople/noise_functions.py                |  3 ++-
 tests/integration/conftest.py                      |  4 ++--
 tests/integration/test_interface.py                |  2 +-
 tests/unit/test_column_noise.py                    | 11 +++++++----
 tests/unit/test_noise_form.py                      |  4 ++--
 8 files changed, 22 insertions(+), 16 deletions(-)

diff --git a/src/pseudopeople/data/incorrect_select_options.csv b/src/pseudopeople/data/incorrect_select_options.csv
index e4939387..67fd1629 100644
--- a/src/pseudopeople/data/incorrect_select_options.csv
+++ b/src/pseudopeople/data/incorrect_select_options.csv
@@ -3,7 +3,7 @@ AL,Reference person,Female,White,W2,creation
 AK,Opp-sex spouse,Male,Black,1099,death
 AZ,Opp-sex partner,,Asian,,
 AR,Same-sex spouse,,AIAN,,
-CA,Same-sex partne,,NHOPI,,
+CA,Same-sex partner,,NHOPI,,
 CO,Biological child,,Multiracial or Other,,
 CT,Adopted child,,Latino,,
 DE,Stepchild,,,,
@@ -16,7 +16,7 @@ IN,Other relative,,,,
 IA,Roommate,,,,
 KS,Foster child,,,,
 KY,Other nonrelative,,,,
-LA,Institutionalized GQ po,,,,
+LA,Institutionalized GQ pop,,,,
 ME,Noninstitutionalized GQ pop,,,,
 MD,,,,,
 MA,,,,,
diff --git a/src/pseudopeople/entity_types.py b/src/pseudopeople/entity_types.py
index 500cf27c..7749ce7b 100644
--- a/src/pseudopeople/entity_types.py
+++ b/src/pseudopeople/entity_types.py
@@ -23,7 +23,7 @@ class RowNoiseType:
     """
 
     name: str
-    noise_function: Callable[[pd.DataFrame, float, RandomnessStream, str], pd.DataFrame]
+    noise_function: Callable[[pd.DataFrame, float, RandomnessStream], pd.DataFrame]
 
     def __call__(
         self,
@@ -63,8 +63,8 @@ def __call__(
         to_noise_idx = get_index_to_noise(
             column, noise_level, randomness_stream, f"{self.name}_{additional_key}"
         )
-        column.loc[to_noise_idx] = self.noise_function(
+        noised_data = self.noise_function(
             column.loc[to_noise_idx], configuration, randomness_stream, additional_key
         )
-
+        column.loc[to_noise_idx] = noised_data
         return column
diff --git a/src/pseudopeople/interface.py b/src/pseudopeople/interface.py
index 08dfb293..7a543478 100644
--- a/src/pseudopeople/interface.py
+++ b/src/pseudopeople/interface.py
@@ -33,7 +33,9 @@ def _generate_form(
     if isinstance(source, pd.DataFrame):
         data = source
     else:
-        data = pd.read_csv(source, dtype=str, keep_default_na=False)
+        data = pd.read_hdf(source)
+        if not isinstance(data, pd.DataFrame):
+            raise TypeError(f"File located at {source} must contain a pandas DataFrame.")
     return noise_form(form, data, configuration_tree, seed)
 
 
diff --git a/src/pseudopeople/noise_functions.py b/src/pseudopeople/noise_functions.py
index e92a58ce..03826d95 100644
--- a/src/pseudopeople/noise_functions.py
+++ b/src/pseudopeople/noise_functions.py
@@ -270,7 +270,7 @@ def generate_missing_data(column: pd.Series, *_: Any) -> pd.Series:
     :returns: pd.Series of empty strings with the index of column.
     """
 
-    return pd.Series("", index=column.index)
+    return pd.Series(pd.NA, index=column.index)
 
 
 def generate_typographical_errors(
@@ -322,6 +322,7 @@ def keyboard_corrupt(truth, corrupted_pr, addl_pr, rng):
     include_original_token_level = configuration.include_original_token_level
 
     rng = np.random.default_rng(seed=randomness_stream.seed)
+    column = column.astype(str)
     for idx in column.index:
         noised_value = keyboard_corrupt(
             column[idx],
diff --git a/tests/integration/conftest.py b/tests/integration/conftest.py
index c8dfe389..c20a62bc 100644
--- a/tests/integration/conftest.py
+++ b/tests/integration/conftest.py
@@ -163,7 +163,7 @@ def decennial_census_data_path(tmp_path_factory):
         }
     )
 
-    data_path = tmp_path_factory.getbasetemp() / "dummy_data.csv"
-    data.to_csv(data_path, index=False)
+    data_path = tmp_path_factory.getbasetemp() / "dummy_data.hdf"
+    data.to_hdf(data_path, "data")
 
     return data_path
diff --git a/tests/integration/test_interface.py b/tests/integration/test_interface.py
index 605c1d0c..e2fda8ca 100644
--- a/tests/integration/test_interface.py
+++ b/tests/integration/test_interface.py
@@ -11,7 +11,7 @@
 def test_generate_decennial_census(
     decennial_census_data_path: Union[Path, str], user_config_path: Union[Path, str]
 ):
-    data = pd.read_csv(decennial_census_data_path, dtype=str, keep_default_na=False)
+    data = pd.read_hdf(decennial_census_data_path)
 
     # TODO: Refactor this check into a separate test
     noised_data = generate_decennial_census(
diff --git a/tests/unit/test_column_noise.py b/tests/unit/test_column_noise.py
index 2e05796a..0a1b95bc 100644
--- a/tests/unit/test_column_noise.py
+++ b/tests/unit/test_column_noise.py
@@ -113,7 +113,7 @@ def test_generate_missing_data(dummy_dataset):
     # Calculate newly missing data, ie data that didn't come in as already missing
     orig_non_missing_idx = data.index[(data.notna()) & (data != "")]
     newly_missing_idx = noised_data.index[
-        (noised_data.index.isin(orig_non_missing_idx)) & (noised_data == "")
+        (noised_data.index.isin(orig_non_missing_idx)) & (noised_data.isna())
     ]
 
     # Check for expected noise level
@@ -122,8 +122,7 @@ def test_generate_missing_data(dummy_dataset):
     assert np.isclose(expected_noise, actual_noise, rtol=0.02)
 
     # Check that un-noised values are unchanged
-    not_noised_idx = noised_data.index[noised_data != ""]
-    assert "" not in noised_data[not_noised_idx].values
+    not_noised_idx = noised_data.index[noised_data.notna()]
     assert (data[not_noised_idx] == noised_data[not_noised_idx]).all()
 
 
@@ -513,5 +512,9 @@ def test_seeds_behave_as_expected(noise_type, data_col, form, form_col, dummy_da
     noised_data_different_seed = noise_type(data, config, RANDOMNESS1, f"test_{noise}")
 
     assert (noised_data != data).any()
-    assert (noised_data == noised_data_same_seed).all()
+    assert (noised_data.isna() == noised_data_same_seed.isna()).all()
+    assert (
+        noised_data[noised_data.notna()]
+        == noised_data_same_seed[noised_data_same_seed.notna()]
+    ).all()
     assert (noised_data != noised_data_different_seed).any()
diff --git a/tests/unit/test_noise_form.py b/tests/unit/test_noise_form.py
index bedefab1..0459ea91 100644
--- a/tests/unit/test_noise_form.py
+++ b/tests/unit/test_noise_form.py
@@ -7,7 +7,7 @@
 import pytest
 from vivarium.config_tree import ConfigTree
 
-from pseudopeople.entity_types import ColumnNoiseType, RowNoiseType
+from pseudopeople.entity_types import ColumnNoiseType
 from pseudopeople.interface import generate_decennial_census
 from pseudopeople.noise import noise_form
 from pseudopeople.noise_entities import NOISE_TYPES
@@ -169,7 +169,7 @@ def test_correct_forms_are_used(func, form, mocker):
     if func == "todo":
         pytest.skip(reason=f"TODO: implement function for {form.value} form")
     mock = mocker.patch("pseudopeople.interface.noise_form")
-    mocker.patch("pseudopeople.interface.pd.read_csv")
+    mocker.patch("pseudopeople.interface.pd.read_hdf", return_value=pd.DataFrame())
     _ = func("dummy/path")
 
     assert mock.call_args[0][0] == form

From 80cd56b87d4076ec172c3e5cc9f46c1729ffc83d Mon Sep 17 00:00:00 2001
From: Rajan Mudambi <11376379+rmudambi@users.noreply.github.com>
Date: Tue, 4 Apr 2023 17:47:27 -0700
Subject: [PATCH 6/6] update changelog and version (#31)

---
 CHANGELOG.rst                 | 7 +++++++
 src/pseudopeople/__about__.py | 2 +-
 2 files changed, 8 insertions(+), 1 deletion(-)

diff --git a/CHANGELOG.rst b/CHANGELOG.rst
index c3b38b91..e9b15155 100644
--- a/CHANGELOG.rst
+++ b/CHANGELOG.rst
@@ -1,3 +1,10 @@
+**0.3.0 - 04/04/23**
+
+ - Implement numeric miswriting noise function
+ - Implement age miswriting noise function
+ - Implement additional forms: ACS, CPS, WIC, and SSA
+ - Read data in from HDF files instead of CSV files
+
 **0.2.1 - 03/31/23**
 
  - Fix bug preventing generation of W2/1099 forms
diff --git a/src/pseudopeople/__about__.py b/src/pseudopeople/__about__.py
index 8167c4ad..b145723e 100644
--- a/src/pseudopeople/__about__.py
+++ b/src/pseudopeople/__about__.py
@@ -13,7 +13,7 @@
 __summary__ = "pseudopeople is package which adds noise to simulated census-scale data using standard scientific Python tools."
 __uri__ = "https://github.com/ihmeuw/pseudopeople"
 
-__version__ = "0.2.1"
+__version__ = "0.3.0"
 
 __author__ = "The pseudopeople developers"
 __email__ = "vivarium.dev@gmail.com"