Skip to content

Commit

Permalink
Fake names (#38)
Browse files Browse the repository at this point in the history
Fake names noise function

Implementation of fake names noise function
- *Category*: Feature
- *JIRA issue*: [MIC-3878](https://jira.ihme.washington.edu/browse/MIC-3878)

- Adds implementation of the fake names noise function and its unit tests
- Adds a fake names module to the data directory to hold the raw name lists

Testing
All tests pass.
  • Loading branch information
albrja authored Apr 7, 2023
1 parent a77b937 commit ea2bf70
Show file tree
Hide file tree
Showing 4 changed files with 311 additions and 11 deletions.
188 changes: 188 additions & 0 deletions src/pseudopeople/data/fake_names.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,188 @@
"""
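This module includes lists of fake first and last names (placeholder entries
such as "MOM" or "UNKNOWN" that appear where a real name would be expected),
copied from a .pdf version of a NORC report on PVS.
TODO: Find and cite a link to the NORC report Abie used.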
"""

# Placeholder strings used as replacement values for "first_name" columns by
# generate_fake_names in pseudopeople.noise_functions.
# NOTE(review): the ordering (G.., M.., A.., repeating) looks like a
# three-column PDF table read row by row -- presumably the transcription order
# of the source report; confirm before re-sorting, since options are passed to
# vectorized_choice as an ordered list.
# NOTE(review): odd spellings such as "GENTELMAN" are presumably verbatim from
# the report -- confirm before "correcting" them.
fake_first_names = [
"GIRL",
"MOM",
"A",
"GOH",
"MOTHER",
"ADULT",
"GRANDCHILD",
"MR",
"ADULT MALE",
"GRANDDAUGHTER",
"MRS",
"B",
"GRANDSON",
"MS",
"BABY",
"H",
"N",
"BOY",
"HIJA",
"NEPHEW",
"BROTHER",
"HIJO",
"NINO",
"C",
"HOUSE",
"O",
"CHILD",
"HUSBAND",
"OLDEST",
"CHILD F",
"INMATE",
"ONE",
"COH",
"J",
"P",
"D",
"K",
"PERSON",
"DAD",
"KID",
"R",
"DAU",
"L",
"RESIDENT",
"DAUGHTER",
"LADY",
"RESPONDENT",
"DAUGHTER OF",
"LADY IN THE",
"S",
"DOH",
"LADY OF",
"SENOR",
"E",
"LADY OF HOUSE",
"SENORA",
"F",
"LADY OF THE",
"SISTER",
"FATHER",
"LOH",
"SOH",
"FEMALE",
"M",
"SON",
"FEMALE CHILD",
"MALE",
"SON OF",
"FRIEND",
"MALE CHILD",
"T",
"G",
"MAN",
"V",
"GENT",
"MAN IN THE",
"W",
"GENTELMAN",
"MAN OF",
"WIFE",
"GENTLE",
"MAN OF THE",
"WOMAN",
"GENTLEMAN",
"MINOR",
"YOUNGEST",
"GENTLEMAN OF",
"MISS",
"GENTLEMEN",
"MOH",
]

# Placeholder strings used as replacement values for "last_name" columns by
# generate_fake_names in pseudopeople.noise_functions.
# NOTE(review): the ordering looks like a multi-column PDF table read row by
# row -- presumably the transcription order of the source report; confirm
# before re-sorting, since options are passed to vectorized_choice as an
# ordered list.
# NOTE(review): odd spellings such as "RESPONDANT" and "DONT KNOW" are
# presumably verbatim from the report -- confirm before "correcting" them.
fake_last_names = [
"HH",
"OF THE HOUSE",
"A",
"HHM",
"ONE",
"ADULT",
"HOME",
"OWNER",
"ANON",
"HOUSE",
"P",
"ANONYMOUS",
"HOUSEHOLD",
"PARENT",
"APELLIDO",
"HOUSEHOLDER",
"PERSON",
"B",
"HUSBAND",
"R",
"BOY",
"J",
"REF",
"C",
"K",
"REFUSE",
"CASA",
"L",
"RESIDENT",
"CHILD",
"LADY",
"RESP",
"COH",
"LADY OF HOUSE",
"RESPONDANT",
"D",
"LADY OF THE HOUSE",
"RESPONDENT",
"DAUGHTER",
"LAST NAME",
"S",
"DE CASA",
"LOH",
"SOH",
"DE LA CASA",
"M",
"SON",
"DECLINED",
"MALE",
"T",
"DOE",
"MAN",
"THE HOUSE",
"DOH",
"MAN OF THE HOUSE",
"THREE",
"DONT KNOW",
"MOH",
"TWO",
"E",
"N",
"UNK",
"F",
"NA",
"UNKNOWN",
"FEMALE",
"NO",
"W",
"FOUR",
"NO LAST NAME",
"WIFE",
"FRIEND",
"NO NAME",
"X",
"G",
"NONE",
"XXX",
"GIRL",
"O",
"Y",
"GOH",
"OCCUPANT",
"YOUNGER",
"H",
"OF HOUSE",
"H AGE",
"OF THE HOME",
]
24 changes: 24 additions & 0 deletions src/pseudopeople/default_configuration.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -41,13 +41,17 @@ decennial_census:
row_noise_level: 0.01
token_noise_level: 0.1
include_original_token_level: 0.1
fake_names:
row_noise_level: 0.01
last_name:
missing_data:
row_noise_level: 0.01
typographic:
row_noise_level: 0.01
token_noise_level: 0.1
include_original_token_level: 0.1
fake_names:
row_noise_level: 0.01
middle_initial:
missing_data:
row_noise_level: 0.01
Expand Down Expand Up @@ -214,6 +218,8 @@ taxes_w2_and_1099:
row_noise_level: 0.01
token_noise_level: 0.1
include_original_token_level: 0.1
fake_names:
row_noise_level: 0.01
income:
missing_data:
row_noise_level: 0.01
Expand All @@ -231,6 +237,8 @@ taxes_w2_and_1099:
row_noise_level: 0.01
token_noise_level: 0.1
include_original_token_level: 0.1
fake_names:
row_noise_level: 0.01
mailing_address_city:
missing_data:
row_noise_level: 0.01
Expand Down Expand Up @@ -354,13 +362,17 @@ american_communities_survey:
row_noise_level: 0.01
token_noise_level: 0.1
include_original_token_level: 0.1
fake_names:
row_noise_level: 0.01
last_name:
missing_data:
row_noise_level: 0.01
typographic:
row_noise_level: 0.01
token_noise_level: 0.1
include_original_token_level: 0.1
fake_names:
row_noise_level: 0.01
mailing_address_po_box:
missing_data:
row_noise_level: 0.01
Expand Down Expand Up @@ -467,13 +479,17 @@ current_population_survey:
row_noise_level: 0.01
token_noise_level: 0.1
include_original_token_level: 0.1
fake_names:
row_noise_level: 0.01
last_name:
missing_data:
row_noise_level: 0.01
typographic:
row_noise_level: 0.01
token_noise_level: 0.1
include_original_token_level: 0.1
fake_names:
row_noise_level: 0.01
mailing_address_po_box:
missing_data:
row_noise_level: 0.01
Expand Down Expand Up @@ -580,13 +596,17 @@ women_infants_and_children:
row_noise_level: 0.01
token_noise_level: 0.1
include_original_token_level: 0.1
fake_names:
row_noise_level: 0.01
last_name:
missing_data:
row_noise_level: 0.01
typographic:
row_noise_level: 0.01
token_noise_level: 0.1
include_original_token_level: 0.1
fake_names:
row_noise_level: 0.01
mailing_address_po_box:
missing_data:
row_noise_level: 0.01
Expand Down Expand Up @@ -706,13 +726,17 @@ social_security:
row_noise_level: 0.01
token_noise_level: 0.1
include_original_token_level: 0.1
fake_names:
row_noise_level: 0.01
last_name:
missing_data:
row_noise_level: 0.01
typographic:
row_noise_level: 0.01
token_noise_level: 0.1
include_original_token_level: 0.1
fake_names:
row_noise_level: 0.01
middle_initial:
missing_data:
row_noise_level: 0.01
Expand Down
26 changes: 19 additions & 7 deletions src/pseudopeople/noise_functions.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
from vivarium.framework.randomness import RandomnessStream

from pseudopeople.constants import paths
from pseudopeople.data.fake_names import fake_first_names, fake_last_names
from pseudopeople.utilities import vectorized_choice


Expand Down Expand Up @@ -73,7 +74,7 @@ def generate_incorrect_selections(
options=options,
n_to_choose=len(column),
randomness_stream=randomness_stream,
additional_key=f"{additional_key}_{column.name}_incorrect_select_choice",
additional_key=f"{additional_key}_incorrect_select_choice",
).to_numpy()

return pd.Series(new_values, index=column.index)
Expand Down Expand Up @@ -258,20 +259,31 @@ def generate_nicknames(

def generate_fake_names(
    column: pd.Series,
    _: ConfigTree,
    randomness_stream: RandomnessStream,
    additional_key: Any,
) -> pd.Series:
    """
    Replace every value of a name column with an entry from a fake-name list.

    The list of candidate replacements is selected by the column's name:
    "first_name" uses fake_first_names and "last_name" uses fake_last_names.
    One replacement per row is obtained from vectorized_choice.

    :param column: pd.Series of names; ``column.name`` must be "first_name"
        or "last_name"
    :param _: ConfigTree object with noise level values (not read by this
        function; kept so the signature matches the other noise functions)
    :param randomness_stream: RandomnessStream instance of vivarium
    :param additional_key: Key for RandomnessStream
    :return: pd.Series of replacement names with the same index as ``column``
    :raises KeyError: if ``column.name`` has no associated fake-name list
    """
    options_by_column = {
        "first_name": fake_first_names,
        "last_name": fake_last_names,
    }
    try:
        options = options_by_column[column.name]
    except KeyError:
        # Same exception type as a bare dict lookup, but say what went wrong.
        raise KeyError(
            f"No fake names are defined for column '{column.name}'; "
            f"expected one of {sorted(options_by_column)}."
        ) from None

    new_values = vectorized_choice(
        options=options,
        n_to_choose=len(column),
        randomness_stream=randomness_stream,
        additional_key=f"{additional_key}_fake_names",
    )
    # Re-attach the caller's index so the result lines up with the input rows.
    return pd.Series(new_values, index=column.index)


def generate_phonetic_errors(
Expand Down
Loading

0 comments on commit ea2bf70

Please sign in to comment.