Skip to content

Commit

Permalink
intermediate push
Browse files Browse the repository at this point in the history
  • Loading branch information
Hussain Jafari committed Jan 22, 2025
1 parent e313609 commit 7fed824
Show file tree
Hide file tree
Showing 5 changed files with 41 additions and 20 deletions.
5 changes: 4 additions & 1 deletion src/pseudopeople/dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -145,7 +145,10 @@ def _reformat_dates_for_noising(self) -> None:
# https://github.com/pandas-dev/pandas/issues/44764
# Year is already guaranteed to be 4-digit: https://pandas.pydata.org/pandas-docs/stable/user_guide/timeseries.html#timeseries-timestamp-limits
is_na = data[column].isna()
data_column = data.loc[~is_na, column]
try:
data_column = pd.to_datetime(data.loc[~is_na, column])
except:
breakpoint()
year_string = data_column.dt.year.astype(str)
month_string = _zfill_fast(data_column.dt.month.astype(str), 2)
day_string = _zfill_fast(data_column.dt.day.astype(str), 2)
Expand Down
4 changes: 2 additions & 2 deletions tests/integration/release/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,14 +39,14 @@
"cps": "current_population_survey",
"census": "decennial_census",
"ssa": "social_security",
"taxes_1040": "taxes_1040",
"tax_1040": "taxes_1040",
"taxes_w2_and_1099": "taxes_w2_and_1099",
"wic": "women_infants_and_children",
}

SEED = 0
CLI_DEFAULT_DATASET = "acs"
CLI_DEFAULT_POP = "sample"
CLI_DEFAULT_POP = "USA"
CLI_DEFAULT_YEAR = 2020
CLI_DEFAULT_STATE = None
CLI_DEFAULT_ENGINE = "pandas"
Expand Down
17 changes: 14 additions & 3 deletions tests/integration/release/test_release.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@

from pseudopeople.dataset import Dataset
from pseudopeople.schema_entities import COLUMNS, DATASET_SCHEMAS
from pytest_check import check
from tests.constants import DATASET_GENERATION_FUNCS
from tests.integration.conftest import IDX_COLS, _get_common_datasets, get_unnoised_data
from tests.utilities import (
Expand Down Expand Up @@ -36,18 +37,18 @@ def test_column_noising(


def test_row_noising_omit_row_or_do_not_respond(
unnoised_dataset: Dataset,
noised_data: pd.DataFrame,
dataset_name: str,
config: dict[str, Any],
request: FixtureRequest,
) -> None:
"""Tests that omit_row and do_not_respond row noising are being applied"""
idx_cols = IDX_COLS.get(dataset_name)
original = get_unnoised_data(dataset_name)
original_data = original.data.set_index(idx_cols)
unnoised_data = unnoised_dataset.data.set_index(idx_cols)
noised_data = noised_data.set_index(idx_cols)

run_omit_row_or_do_not_respond_tests(dataset_name, config, original_data, noised_data)
run_omit_row_or_do_not_respond_tests(dataset_name, config, unnoised_data, noised_data)


def test_unnoised_id_cols(dataset_name: str, request: FixtureRequest) -> None:
Expand All @@ -68,3 +69,13 @@ def test_unnoised_id_cols(dataset_name: str, request: FixtureRequest) -> None:
.all()
.all()
)


def test_example():
a = 1
b = 2
c = [2, 4, 6]
check.greater(a, b)
check.less_equal(b, a)
check.is_in(a, c, "Is 1 in the list")
check.is_not_in(b, c, "make sure 2 isn't in list")
9 changes: 5 additions & 4 deletions tests/integration/release/test_runner.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,12 +8,12 @@
@pytest.mark.parametrize(
"pytest_args",
[
([]),
(["--dataset", "acs"]),
(["--dataset", "cps"]),
(["--population", "sample"]),
(["--dataset", "acs", "--population", "sample"]),
#(["--dataset", "cps"]),
# (["--dataset", "acs", "--population", "USA"]),
# (["--dataset", "acs", "--population", "USA", "--state", "RI"]),
(["--dataset", "wic", "--year", "2015"]),
#(["--dataset", "wic", "--year", "2015"]),
# (["--dataset", "wic", "--population", "USA", "--state", "RI", "--year", "2015"]),
],
)
Expand All @@ -22,6 +22,7 @@ def test_release_tests(pytest_args: list[str]) -> None:
base_cmd = ["pytest", "--release", "test_release.py"]
cmd = base_cmd + pytest_args
result = subprocess.run(cmd, capture_output=True, text=True)
breakpoint()
assert result.returncode == 0


Expand Down
26 changes: 16 additions & 10 deletions tests/utilities.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
import numpy as np
import numpy.typing as npt
import pandas as pd
from pytest_check import check
from vivarium_testing_utils import FuzzyChecker

from pseudopeople.configuration import Keys, get_configuration
Expand Down Expand Up @@ -43,7 +44,9 @@ def run_column_noising_tests(
check_original.loc[to_compare_idx, col.name].values
!= check_noised.loc[to_compare_idx, col.name].values
)
assert different_check.any()
#with check:
# assert different_check.any()
check.is_true(different_check.any())

noise_level = different_check.sum()

Expand Down Expand Up @@ -97,7 +100,6 @@ def run_omit_row_or_do_not_respond_tests(
assert noised_data.index.difference(original_data.index).empty
assert not original_data.index.difference(noised_data.index).empty


def validate_column_noise_level(
dataset_name: str,
check_data: pd.DataFrame,
Expand Down Expand Up @@ -169,14 +171,18 @@ def validate_column_noise_level(
)

expected_noise = 1 - not_noised
# Fuzzy checker
validator.fuzzy_assert_proportion(
name=fuzzy_name,
observed_numerator=noise_level,
observed_denominator=len(check_data.loc[check_idx, col.name]),
target_proportion=expected_noise,
name_additional=f"{dataset_name}_{col.name}_{col_noise_type.name}",
)

try:
# Fuzzy checker
validator.fuzzy_assert_proportion(
name=fuzzy_name,
observed_numerator=noise_level,
observed_denominator=len(check_data.loc[check_idx, col.name]),
target_proportion=expected_noise,
name_additional=f"{dataset_name}_{col.name}_{col_noise_type.name}",
)
except:
print(f"{dataset_name} and {col.name} have expected {expected_noise} and actual {noise_level / len(check_data.loc[check_idx, col.name])}")


def initialize_dataset_with_sample(dataset_name: str) -> Dataset:
Expand Down

0 comments on commit 7fed824

Please sign in to comment.