Skip to content

Commit

Permalink
initialize sample data
Browse files Browse the repository at this point in the history
  • Loading branch information
hussain-jafari committed Dec 6, 2024
1 parent 8345b20 commit ddba32e
Show file tree
Hide file tree
Showing 7 changed files with 29 additions and 24 deletions.
11 changes: 2 additions & 9 deletions tests/integration/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@
from pseudopeople.noise_entities import NOISE_TYPES
from pseudopeople.schema_entities import COLUMNS, DATASET_SCHEMAS
from pseudopeople.utilities import coerce_dtypes
from tests.utilities import initialize_dataset_with_sample

ROW_PROBABILITY = 0.05
CELL_PROBABILITY = 0.25
Expand Down Expand Up @@ -209,19 +210,11 @@ def noised_sample_data_taxes_1040(config: dict[str, Any]) -> pd.DataFrame:


def get_unnoised_data(dataset_name: str) -> Dataset:
    """Return the unnoised sample dataset for ``dataset_name``.

    Loads the packaged sample data via ``initialize_dataset_with_sample`` and
    coerces the resulting frame's dtypes to match the dataset schema before
    returning it.

    Note: the scraped diff interleaved the old and new call here, leaving a
    dead call to the removed ``_initialize_dataset_with_sample`` whose result
    was immediately overwritten; only the surviving call is kept.
    """
    result = initialize_dataset_with_sample(dataset_name)
    # Align dtypes with the schema so comparisons against noised data are valid.
    result.data = coerce_dtypes(result.data, result.dataset_schema)
    return result


def _initialize_dataset_with_sample(dataset_name: str) -> Dataset:
    """Build a ``Dataset`` from the bundled sample parquet for ``dataset_name``."""
    schema = DATASET_SCHEMAS.get_dataset_schema(dataset_name)
    sample_path = paths.SAMPLE_DATA_ROOT / dataset_name / f"{dataset_name}.parquet"
    return Dataset(schema, pd.read_parquet(sample_path), SEED)


def _get_common_datasets(
unnoised_dataset: Dataset, noised_dataset: pd.DataFrame
) -> tuple[pd.DataFrame, pd.DataFrame, pd.Index[int]]:
Expand Down
4 changes: 2 additions & 2 deletions tests/integration/test_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
from pseudopeople.entity_types import ColumnNoiseType, RowNoiseType
from pseudopeople.noise_entities import NOISE_TYPES
from pseudopeople.schema_entities import DATASET_SCHEMAS
from tests.integration.conftest import _initialize_dataset_with_sample
from tests.utilities import initialize_dataset_with_sample


@pytest.mark.parametrize(
Expand All @@ -22,7 +22,7 @@
)
def test_dataset_missingness(dataset_name: str) -> None:
"""Tests that missingness is accurate with dataset.data."""
dataset = _initialize_dataset_with_sample(dataset_name)
dataset = initialize_dataset_with_sample(dataset_name)
# We must manually clean the data for noising since we are recreating our main noising loop
dataset._clean_input_data()
dataset._reformat_dates_for_noising()
Expand Down
7 changes: 3 additions & 4 deletions tests/integration/test_interface.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,10 +32,9 @@
SEED,
STATE,
_get_common_datasets,
_initialize_dataset_with_sample,
get_unnoised_data,
)
from tests.utilities import run_column_noising_tests, run_omit_row_or_do_not_respond_tests, validate_column_noise_level
from tests.utilities import initialize_dataset_with_sample, run_column_noising_tests, run_omit_row_or_do_not_respond_tests, validate_column_noise_level


DATASET_GENERATION_FUNCS: dict[str, Callable[..., Any]] = {
Expand Down Expand Up @@ -84,7 +83,7 @@ def test_generate_dataset_from_multiple_shards(
pytest.skip(reason=dataset_name)
mocker.patch("pseudopeople.interface.validate_source_compatibility")
generation_function = DATASET_GENERATION_FUNCS[dataset_name]
original = _initialize_dataset_with_sample(dataset_name)
original = initialize_dataset_with_sample(dataset_name)
noised_sample = request.getfixturevalue(f"noised_sample_data_{dataset_name}")

noised_dataset = generation_function(
Expand Down Expand Up @@ -273,7 +272,7 @@ def test_column_noising(
"""Tests that columns are noised as expected"""
if "TODO" in dataset_name:
pytest.skip(reason=dataset_name)
original = _initialize_dataset_with_sample(dataset_name)
original = initialize_dataset_with_sample(dataset_name)
if engine == "dask":
generation_function = DATASET_GENERATION_FUNCS[dataset_name]
noised_data = generation_function(
Expand Down
4 changes: 2 additions & 2 deletions tests/integration/test_schema.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,8 +4,8 @@
from pseudopeople.schema_entities import COLUMNS, DATASET_SCHEMAS
from tests.integration.conftest import (
_get_common_datasets,
_initialize_dataset_with_sample,
)
from tests.utilities import initialize_dataset_with_sample


@pytest.mark.parametrize(
Expand All @@ -29,7 +29,7 @@ def test_unnoised_id_cols(dataset_name: str, request: FixtureRequest) -> None:
unnoised_id_cols = [COLUMNS.simulant_id.name]
if dataset_name != DATASET_SCHEMAS.ssa.name:
unnoised_id_cols.append(COLUMNS.household_id.name)
original = _initialize_dataset_with_sample(dataset_name)
original = initialize_dataset_with_sample(dataset_name)
noised_data = request.getfixturevalue(f"noised_sample_data_{dataset_name}")
check_noised, check_original, _ = _get_common_datasets(original, noised_data)
assert (
Expand Down
10 changes: 6 additions & 4 deletions tests/release/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@
)
from pseudopeople.noise_entities import NOISE_TYPES
from pseudopeople.schema_entities import COLUMNS, DATASET_SCHEMAS
from tests.utilities import initialize_dataset_with_sample


DATASET_GENERATION_FUNCS: dict[str, Callable[..., Any]] = {
Expand Down Expand Up @@ -157,10 +158,12 @@ def unnoised_dataset(
request: pytest.FixtureRequest,
config: dict[str, Any],
) -> pd.DataFrame:
population = request.config.getoption('--population', default='sample')
if population == 'sample':
# get sample data
dataset_arg, dataset_func, source, year, state, engine = dataset_params
dataset_name = DATASET_ARG_TO_FULL_NAME_MAPPER[dataset_arg]

if source is None:
return initialize_dataset_with_sample(dataset_name)

no_noise_config = get_configuration("no_noise").to_dict()

if dataset_func == generate_social_security:
Expand All @@ -172,7 +175,6 @@ def unnoised_dataset(
source=source, year=year, state=state, engine=engine, config=no_noise_config
)

dataset_name = DATASET_ARG_TO_FULL_NAME_MAPPER[dataset_arg]
dataset_schema = DATASET_SCHEMAS.get_dataset_schema(dataset_name)
return Dataset(dataset_schema, unnoised_data, SEED)

Expand Down
4 changes: 2 additions & 2 deletions tests/release/test_release.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,10 +29,10 @@
from tests.integration.conftest import (
IDX_COLS,
_get_common_datasets,
_initialize_dataset_with_sample,
get_unnoised_data,
)
from tests.utilities import (
initialize_dataset_with_sample,
run_column_noising_tests,
run_omit_row_or_do_not_respond_tests,
)
Expand Down Expand Up @@ -102,7 +102,7 @@ def test_unnoised_id_cols(dataset_name: str, request: FixtureRequest) -> None:
unnoised_id_cols = [COLUMNS.simulant_id.name]
if dataset_name != DATASET_SCHEMAS.ssa.name:
unnoised_id_cols.append(COLUMNS.household_id.name)
original = _initialize_dataset_with_sample(dataset_name)
original = initialize_dataset_with_sample(dataset_name)
noised_data = request.getfixturevalue("data")
check_noised, check_original, _ = _get_common_datasets(original, noised_data)
assert (
Expand Down
13 changes: 12 additions & 1 deletion tests/utilities.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,8 @@

from pseudopeople.configuration import Keys, get_configuration
from pseudopeople.configuration.noise_configuration import NoiseConfiguration
from pseudopeople.constants import paths
from pseudopeople.dataset import Dataset
from pseudopeople.noise_entities import NOISE_TYPES
from pseudopeople.schema_entities import COLUMNS, DATASET_SCHEMAS, Column
from pseudopeople.utilities import (
Expand Down Expand Up @@ -194,4 +196,13 @@ def validate_column_noise_level(
observed_denominator=len(check_data.loc[check_idx, col.name]),
target_proportion=expected_noise,
name_additional=f"{dataset_name}_{col.name}_{col_noise_type.name}",
)
)


def initialize_dataset_with_sample(dataset_name: str) -> Dataset:
    """Load the packaged sample parquet for ``dataset_name`` into a ``Dataset``.

    Uses a fixed seed so repeated initializations are deterministic.
    """
    seed = 0  # fixed seed for reproducible sample datasets
    schema = DATASET_SCHEMAS.get_dataset_schema(dataset_name)
    sample_path = paths.SAMPLE_DATA_ROOT / dataset_name / f"{dataset_name}.parquet"
    return Dataset(schema, pd.read_parquet(sample_path), seed)

0 comments on commit ddba32e

Please sign in to comment.