Skip to content

Commit

Permalink
initialize sample data
Browse files Browse the repository at this point in the history
  • Loading branch information
hussain-jafari committed Dec 6, 2024
1 parent 8345b20 commit ddba32e
Show file tree
Hide file tree
Showing 7 changed files with 29 additions and 24 deletions.
11 changes: 2 additions & 9 deletions tests/integration/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@
from pseudopeople.noise_entities import NOISE_TYPES
from pseudopeople.schema_entities import COLUMNS, DATASET_SCHEMAS
from pseudopeople.utilities import coerce_dtypes
from tests.utilities import initialize_dataset_with_sample

ROW_PROBABILITY = 0.05
CELL_PROBABILITY = 0.25
Expand Down Expand Up @@ -209,19 +210,11 @@ def noised_sample_data_taxes_1040(config: dict[str, Any]) -> pd.DataFrame:


def get_unnoised_data(dataset_name: str) -> Dataset:
    """Return the unnoised sample dataset for ``dataset_name``.

    Loads the packaged sample data via ``initialize_dataset_with_sample`` and
    coerces the resulting frame's dtypes to match the dataset schema before
    returning it.

    Note: the scraped diff interleaved the old and new call here, leaving a
    dead call to the removed ``_initialize_dataset_with_sample`` whose result
    was immediately overwritten; only the surviving call is kept.
    """
    result = initialize_dataset_with_sample(dataset_name)
    # Align dtypes with the schema so comparisons against noised data are valid.
    result.data = coerce_dtypes(result.data, result.dataset_schema)
    return result


def _initialize_dataset_with_sample(dataset_name: str) -> Dataset:
    """Build a ``Dataset`` from the bundled sample parquet for ``dataset_name``."""
    schema = DATASET_SCHEMAS.get_dataset_schema(dataset_name)
    sample_path = paths.SAMPLE_DATA_ROOT / dataset_name / f"{dataset_name}.parquet"
    return Dataset(schema, pd.read_parquet(sample_path), SEED)


def _get_common_datasets(
unnoised_dataset: Dataset, noised_dataset: pd.DataFrame
) -> tuple[pd.DataFrame, pd.DataFrame, pd.Index[int]]:
Expand Down
4 changes: 2 additions & 2 deletions tests/integration/test_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
from pseudopeople.entity_types import ColumnNoiseType, RowNoiseType
from pseudopeople.noise_entities import NOISE_TYPES
from pseudopeople.schema_entities import DATASET_SCHEMAS
from tests.integration.conftest import _initialize_dataset_with_sample
from tests.utilities import initialize_dataset_with_sample


@pytest.mark.parametrize(
Expand All @@ -22,7 +22,7 @@
)
def test_dataset_missingness(dataset_name: str) -> None:
"""Tests that missingness is accurate with dataset.data."""
dataset = _initialize_dataset_with_sample(dataset_name)
dataset = initialize_dataset_with_sample(dataset_name)
# We must manually clean the data for noising since we are recreating our main noising loop
dataset._clean_input_data()
dataset._reformat_dates_for_noising()
Expand Down
7 changes: 3 additions & 4 deletions tests/integration/test_interface.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,10 +32,9 @@
SEED,
STATE,
_get_common_datasets,
_initialize_dataset_with_sample,
get_unnoised_data,
)
from tests.utilities import run_column_noising_tests, run_omit_row_or_do_not_respond_tests, validate_column_noise_level
from tests.utilities import initialize_dataset_with_sample, run_column_noising_tests, run_omit_row_or_do_not_respond_tests, validate_column_noise_level


DATASET_GENERATION_FUNCS: dict[str, Callable[..., Any]] = {
Expand Down Expand Up @@ -84,7 +83,7 @@ def test_generate_dataset_from_multiple_shards(
pytest.skip(reason=dataset_name)
mocker.patch("pseudopeople.interface.validate_source_compatibility")
generation_function = DATASET_GENERATION_FUNCS[dataset_name]
original = _initialize_dataset_with_sample(dataset_name)
original = initialize_dataset_with_sample(dataset_name)
noised_sample = request.getfixturevalue(f"noised_sample_data_{dataset_name}")

noised_dataset = generation_function(
Expand Down Expand Up @@ -273,7 +272,7 @@ def test_column_noising(
"""Tests that columns are noised as expected"""
if "TODO" in dataset_name:
pytest.skip(reason=dataset_name)
original = _initialize_dataset_with_sample(dataset_name)
original = initialize_dataset_with_sample(dataset_name)
if engine == "dask":
generation_function = DATASET_GENERATION_FUNCS[dataset_name]
noised_data = generation_function(
Expand Down
4 changes: 2 additions & 2 deletions tests/integration/test_schema.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,8 +4,8 @@
from pseudopeople.schema_entities import COLUMNS, DATASET_SCHEMAS
from tests.integration.conftest import (
_get_common_datasets,
_initialize_dataset_with_sample,
)
from tests.utilities import initialize_dataset_with_sample


@pytest.mark.parametrize(
Expand All @@ -29,7 +29,7 @@ def test_unnoised_id_cols(dataset_name: str, request: FixtureRequest) -> None:
unnoised_id_cols = [COLUMNS.simulant_id.name]
if dataset_name != DATASET_SCHEMAS.ssa.name:
unnoised_id_cols.append(COLUMNS.household_id.name)
original = _initialize_dataset_with_sample(dataset_name)
original = initialize_dataset_with_sample(dataset_name)
noised_data = request.getfixturevalue(f"noised_sample_data_{dataset_name}")
check_noised, check_original, _ = _get_common_datasets(original, noised_data)
assert (
Expand Down
10 changes: 6 additions & 4 deletions tests/release/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@
)
from pseudopeople.noise_entities import NOISE_TYPES
from pseudopeople.schema_entities import COLUMNS, DATASET_SCHEMAS
from tests.utilities import initialize_dataset_with_sample


DATASET_GENERATION_FUNCS: dict[str, Callable[..., Any]] = {
Expand Down Expand Up @@ -157,10 +158,12 @@ def unnoised_dataset(
request: pytest.FixtureRequest,
config: dict[str, Any],
) -> pd.DataFrame:
population = request.config.getoption('--population', default='sample')
if population == 'sample':
# get sample data
dataset_arg, dataset_func, source, year, state, engine = dataset_params
dataset_name = DATASET_ARG_TO_FULL_NAME_MAPPER[dataset_arg]

if source is None:
return initialize_dataset_with_sample(dataset_name)

no_noise_config = get_configuration("no_noise").to_dict()

if dataset_func == generate_social_security:
Expand All @@ -172,7 +175,6 @@ def unnoised_dataset(
source=source, year=year, state=state, engine=engine, config=no_noise_config
)

dataset_name = DATASET_ARG_TO_FULL_NAME_MAPPER[dataset_arg]
dataset_schema = DATASET_SCHEMAS.get_dataset_schema(dataset_name)
return Dataset(dataset_schema, unnoised_data, SEED)

Expand Down
4 changes: 2 additions & 2 deletions tests/release/test_release.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,10 +29,10 @@
from tests.integration.conftest import (
IDX_COLS,
_get_common_datasets,
_initialize_dataset_with_sample,
get_unnoised_data,
)
from tests.utilities import (
initialize_dataset_with_sample,
run_column_noising_tests,
run_omit_row_or_do_not_respond_tests,
)
Expand Down Expand Up @@ -102,7 +102,7 @@ def test_unnoised_id_cols(dataset_name: str, request: FixtureRequest) -> None:
unnoised_id_cols = [COLUMNS.simulant_id.name]
if dataset_name != DATASET_SCHEMAS.ssa.name:
unnoised_id_cols.append(COLUMNS.household_id.name)
original = _initialize_dataset_with_sample(dataset_name)
original = initialize_dataset_with_sample(dataset_name)
noised_data = request.getfixturevalue("data")
check_noised, check_original, _ = _get_common_datasets(original, noised_data)
assert (
Expand Down
13 changes: 12 additions & 1 deletion tests/utilities.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,8 @@

from pseudopeople.configuration import Keys, get_configuration
from pseudopeople.configuration.noise_configuration import NoiseConfiguration
from pseudopeople.constants import paths
from pseudopeople.dataset import Dataset
from pseudopeople.noise_entities import NOISE_TYPES
from pseudopeople.schema_entities import COLUMNS, DATASET_SCHEMAS, Column
from pseudopeople.utilities import (
Expand Down Expand Up @@ -194,4 +196,13 @@ def validate_column_noise_level(
observed_denominator=len(check_data.loc[check_idx, col.name]),
target_proportion=expected_noise,
name_additional=f"{dataset_name}_{col.name}_{col_noise_type.name}",
)
)


def initialize_dataset_with_sample(dataset_name: str) -> Dataset:
    """Load the packaged sample parquet for ``dataset_name`` into a ``Dataset``.

    Uses a fixed seed so repeated initializations are deterministic.
    """
    seed = 0  # fixed seed for reproducible sample datasets
    schema = DATASET_SCHEMAS.get_dataset_schema(dataset_name)
    sample_path = paths.SAMPLE_DATA_ROOT / dataset_name / f"{dataset_name}.parquet"
    return Dataset(schema, pd.read_parquet(sample_path), seed)

0 comments on commit ddba32e

Please sign in to comment.