Skip to content

Commit

Permalink
Mic-4566/omit-row-update (#352)
Browse files Browse the repository at this point in the history
Mic-4566/omit-row-update

Add omit row to census and household surveys
- *Category*: Feature
- *JIRA issue*: [MIC-4566](https://jira.ihme.washington.edu/browse/MIC-4566)

Updates
-adds omit row to census and household surveys (ACS and CPS) -> docs: https://vivarium-research.readthedocs.io/en/latest/models/concept_models/vivarium_census_synthdata/concept_model.html#row-noise

Testing
-Updated tests because do_not_respond and omit_row are no longer mutually exclusive. All tests pass.
  • Loading branch information
albrja committed Feb 12, 2024
1 parent 529b88d commit 8d528ca
Show file tree
Hide file tree
Showing 5 changed files with 18 additions and 16 deletions.
7 changes: 7 additions & 0 deletions src/pseudopeople/configuration/generator.py
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,13 @@
},
},
},
DATASETS.wic.name: {
Keys.ROW_NOISE: {
NOISE_TYPES.omit_row.name: {
Keys.ROW_PROBABILITY: 0.005,
},
},
},
# No noise of any kind for SSN in the SSA observer
DATASETS.ssa.name: {
Keys.COLUMN_NOISE: {
Expand Down
4 changes: 2 additions & 2 deletions src/pseudopeople/noise_entities.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@

class __NoiseTypes(NamedTuple):
"""Container for all noise types in the order in which they should be applied:
omit_row, do_not_respond, duplicate_row, leave_blank, choose_wrong_option,
do_not_respond, omit_row, duplicate_row, leave_blank, choose_wrong_option,
copy_from_household_member, swap_month_and_day, write_wrong_zipcode_digits,
misreport_age, write_wrong_digits, use_nickname, use_fake_name,
make_phonetic_errors, make_ocr_errors, make_typos
Expand All @@ -16,10 +16,10 @@ class __NoiseTypes(NamedTuple):
in the "baseline" ConfigTree layer.
"""

omit_row: RowNoiseType = RowNoiseType("omit_row", noise_functions.omit_rows)
do_not_respond: RowNoiseType = RowNoiseType(
"do_not_respond", noise_functions.apply_do_not_respond
)
omit_row: RowNoiseType = RowNoiseType("omit_row", noise_functions.omit_rows)
# duplicate_row: RowNoiseType = RowNoiseType("duplicate_row", noise_functions.duplicate_rows)
leave_blank: ColumnNoiseType = ColumnNoiseType(
"leave_blank",
Expand Down
3 changes: 3 additions & 0 deletions src/pseudopeople/schema_entities.py
Original file line number Diff line number Diff line change
Expand Up @@ -611,6 +611,7 @@ class __Datasets(NamedTuple):
state_column_name=COLUMNS.state.name,
row_noise_types=(
NOISE_TYPES.do_not_respond,
NOISE_TYPES.omit_row,
# NOISE_TYPES.duplication,
),
date_format=DATEFORMATS.MM_DD_YYYY,
Expand Down Expand Up @@ -641,6 +642,7 @@ class __Datasets(NamedTuple):
state_column_name=COLUMNS.state.name,
row_noise_types=(
NOISE_TYPES.do_not_respond,
NOISE_TYPES.omit_row,
# NOISE_TYPES.duplication,
),
date_format=DATEFORMATS.MM_DD_YYYY,
Expand Down Expand Up @@ -669,6 +671,7 @@ class __Datasets(NamedTuple):
state_column_name=COLUMNS.state.name,
row_noise_types=(
NOISE_TYPES.do_not_respond,
NOISE_TYPES.omit_row,
# NOISE_TYPES.duplication,
),
date_format=DATEFORMATS.MM_DD_YYYY,
Expand Down
7 changes: 6 additions & 1 deletion tests/integration/test_interface.py
Original file line number Diff line number Diff line change
Expand Up @@ -270,7 +270,12 @@ def test_row_noising_omit_row_or_do_not_respond(dataset_name: str, config, reque
noise_type = [
n for n in config if n in [NOISE_TYPES.omit_row.name, NOISE_TYPES.do_not_respond.name]
]
assert len(noise_type) < 2 # omit_row and do_not_respond should be mutually exclusive
if dataset_name in [DATASETS.census.name, DATASETS.acs.name, DATASETS.cps.name]:
# Census and household surveys have do_not_respond and omit_row.
# For all other datasets they are mutually exclusive
assert len(noise_type) == 2
else:
assert len(noise_type) < 2
if not noise_type: # Check that there are no missing indexes
assert noised_data.index.symmetric_difference(data.index).empty
else: # Check that there are some omissions
Expand Down
13 changes: 0 additions & 13 deletions tests/unit/test_configuration.py
Original file line number Diff line number Diff line change
Expand Up @@ -468,19 +468,6 @@ def test_get_config(caplog):
assert column_noise_dict[column_noise][Keys.CELL_PROBABILITY] == 0.0


def test_omit_rows_do_not_respond_mutex_default_configuration():
"""Test that omit_rows and do_not_respond are not both defined in the default configuration"""
config = get_configuration()
for dataset in DATASETS:
has_omit_rows = (
NOISE_TYPES.omit_row.name in config[dataset.name][Keys.ROW_NOISE].keys()
)
has_do_not_respond = (
NOISE_TYPES.do_not_respond.name in config[dataset.name][Keys.ROW_NOISE].keys()
)
assert not has_do_not_respond or not has_omit_rows


def test_validate_nickname_configuration(caplog):
"""
Tests that warning is thrown if cell probability is higher than nickname proportion. Also tests noise leve
Expand Down

0 comments on commit 8d528ca

Please sign in to comment.