diff --git a/src/pseudopeople/configuration/generator.py b/src/pseudopeople/configuration/generator.py index fca2992f..bbeb4736 100644 --- a/src/pseudopeople/configuration/generator.py +++ b/src/pseudopeople/configuration/generator.py @@ -55,6 +55,13 @@ }, }, }, + DATASETS.wic.name: { + Keys.ROW_NOISE: { + NOISE_TYPES.omit_row.name: { + Keys.ROW_PROBABILITY: 0.005, + }, + }, + }, # No noise of any kind for SSN in the SSA observer DATASETS.ssa.name: { Keys.COLUMN_NOISE: { diff --git a/src/pseudopeople/noise_entities.py b/src/pseudopeople/noise_entities.py index 8ff94558..e9631a57 100644 --- a/src/pseudopeople/noise_entities.py +++ b/src/pseudopeople/noise_entities.py @@ -7,7 +7,7 @@ class __NoiseTypes(NamedTuple): """Container for all noise types in the order in which they should be applied: - omit_row, do_not_respond, duplicate_row, leave_blank, choose_wrong_option, + do_not_respond, omit_row, duplicate_row, leave_blank, choose_wrong_option, copy_from_household_member, swap_month_and_day, write_wrong_zipcode_digits, misreport_age, write_wrong_digits, use_nickname, use_fake_name, make_phonetic_errors, make_ocr_errors, make_typos @@ -16,10 +16,10 @@ class __NoiseTypes(NamedTuple): in the "baseline" ConfigTree layer. """ - omit_row: RowNoiseType = RowNoiseType("omit_row", noise_functions.omit_rows) do_not_respond: RowNoiseType = RowNoiseType( "do_not_respond", noise_functions.apply_do_not_respond ) + omit_row: RowNoiseType = RowNoiseType("omit_row", noise_functions.omit_rows) # duplicate_row: RowNoiseType = RowNoiseType("duplicate_row", noise_functions.duplicate_rows) leave_blank: ColumnNoiseType = ColumnNoiseType( "leave_blank", diff --git a/src/pseudopeople/schema_entities.py b/src/pseudopeople/schema_entities.py index 64009ecd..f47d7356 100644 --- a/src/pseudopeople/schema_entities.py +++ b/src/pseudopeople/schema_entities.py @@ -611,6 +611,7 @@ class __Datasets(NamedTuple): state_column_name=COLUMNS.state.name, row_noise_types=( NOISE_TYPES.do_not_respond, + NOISE_TYPES.omit_row, # NOISE_TYPES.duplication, ), date_format=DATEFORMATS.MM_DD_YYYY, @@ -641,6 +642,7 @@ class __Datasets(NamedTuple): state_column_name=COLUMNS.state.name, row_noise_types=( NOISE_TYPES.do_not_respond, + NOISE_TYPES.omit_row, # NOISE_TYPES.duplication, ), date_format=DATEFORMATS.MM_DD_YYYY, @@ -669,6 +671,7 @@ class __Datasets(NamedTuple): state_column_name=COLUMNS.state.name, row_noise_types=( NOISE_TYPES.do_not_respond, + NOISE_TYPES.omit_row, # NOISE_TYPES.duplication, ), date_format=DATEFORMATS.MM_DD_YYYY, diff --git a/tests/integration/test_interface.py b/tests/integration/test_interface.py index 4332651f..fb89be08 100644 --- a/tests/integration/test_interface.py +++ b/tests/integration/test_interface.py @@ -270,7 +270,12 @@ def test_row_noising_omit_row_or_do_not_respond(dataset_name: str, config, reque noise_type = [ n for n in config if n in [NOISE_TYPES.omit_row.name, NOISE_TYPES.do_not_respond.name] ] - assert len(noise_type) < 2 # omit_row and do_not_respond should be mutually exclusive + if dataset_name in [DATASETS.census.name, DATASETS.acs.name, DATASETS.cps.name]: + # Census and household surveys have do_not_respond and omit_row. + # For all other datasets they are mutually exclusive + assert len(noise_type) == 2 + else: + assert len(noise_type) < 2 if not noise_type: # Check that there are no missing indexes assert noised_data.index.symmetric_difference(data.index).empty else: # Check that there are some omissions diff --git a/tests/unit/test_configuration.py b/tests/unit/test_configuration.py index 0c7644e8..23f9b0db 100644 --- a/tests/unit/test_configuration.py +++ b/tests/unit/test_configuration.py @@ -468,19 +468,6 @@ def test_get_config(caplog): assert column_noise_dict[column_noise][Keys.CELL_PROBABILITY] == 0.0 -def test_omit_rows_do_not_respond_mutex_default_configuration(): - """Test that omit_rows and do_not_respond are not both defined in the default configuration""" - config = get_configuration() - for dataset in DATASETS: - has_omit_rows = ( - NOISE_TYPES.omit_row.name in config[dataset.name][Keys.ROW_NOISE].keys() - ) - has_do_not_respond = ( - NOISE_TYPES.do_not_respond.name in config[dataset.name][Keys.ROW_NOISE].keys() - ) - assert not has_do_not_respond or not has_omit_rows - - def test_validate_nickname_configuration(caplog): """ Tests that warning is thrown if cell probability is higher than nickname proportion. Also tests noise leve