Mic-716/Coerce empty strings to np.nan for PO Box addresses (#354)

Mic-716/Coerce empty strings to np.nan for PO box addresses. Coerces null values from empty strings to np.nan for addresses with a PO box. - *Category*: Bugfix - *JIRA issue*: [MIC-4716](https://jira.ihme.washington.edu/browse/MIC-4717) - Coerces null values from empty strings to np.nan for addresses with a PO box. This issue was present for the full USA dataset. Note that the new full USA data did not have this bug, so I suspect the recent pandas 2.1 update may have fixed this issue. Testing: Tested on two different full USA datasets; both output correct null values for the mailing address columns.
ihmeuw · Dec 1, 2023 · ffcad1e · ffcad1e
1 parent 4170aa7
commit ffcad1e
Show file tree

Hide file tree

Showing 2 changed files with 12 additions and 5 deletions.
diff --git a/src/pseudopeople/interface.py b/src/pseudopeople/interface.py
@@ -1,15 +1,14 @@
 from pathlib import Path
 from typing import Dict, List, Optional, Tuple, Union
 
+import numpy as np
 import pandas as pd
-import yaml
 from loguru import logger
 from packaging.version import parse
 from tqdm import tqdm
 
 from pseudopeople import __version__ as psp_version
 from pseudopeople.configuration import get_configuration
-from pseudopeople.configuration.validator import validate_noise_level_proportions
 from pseudopeople.constants import paths
 from pseudopeople.constants.metadata import COPY_HOUSEHOLD_MEMBER_COLS, INT_COLUMNS
 from pseudopeople.exceptions import DataSourceError
@@ -96,7 +95,11 @@ def _generate_dataset(
 
     # Known pandas bug: pd.concat does not preserve category dtypes so we coerce
     # again after concat (https://github.com/pandas-dev/pandas/issues/51362)
-    noised_dataset = _coerce_dtypes(noised_dataset, dataset, cleanse_int_cols=True)
+    noised_dataset = _coerce_dtypes(
+        noised_dataset,
+        dataset,
+        cleanse_int_cols=True,
+    )
 
     logger.debug("*** Finished ***")
 
@@ -136,13 +139,18 @@ def _get_data_changelog_version(changelog):
 
 
 def _coerce_dtypes(
-    data: pd.DataFrame, dataset: Dataset, cleanse_int_cols: bool = False
+    data: pd.DataFrame,
+    dataset: Dataset,
+    cleanse_int_cols: bool = False,
 ) -> pd.DataFrame:
     # Coerce dtypes prior to noising to catch issues early as well as
     # get most columns away from dtype 'category' and into 'object' (strings)
     for col in dataset.columns:
         if cleanse_int_cols and col.name in INT_COLUMNS:
             data[col.name] = cleanse_integer_columns(data[col.name])
+        # Coerce empty strings to nans
+        if cleanse_int_cols and col.name not in INT_COLUMNS:
+            data[col.name] = data[col.name].replace("", np.nan)
         if col.dtype_name != data[col.name].dtype.name:
             data[col.name] = data[col.name].astype(col.dtype_name)
 

diff --git a/tests/unit/test_configuration.py b/tests/unit/test_configuration.py
@@ -539,7 +539,6 @@ def test_validate_noise_level_proportions(caplog, column, noise_type, noise_leve
     Tests that a warning is thrown when a user provides configuration overrides that are higher
     than the calculated metadata proportions for that column noise type pairing.
     """
-
     census = DATASETS.get_dataset("decennial_census")
     user_filters = [
         (census.date_column_name, "==", 2020),