fix: Normalize VCE RARE spellings of great lakes. (#4029)

* fix: Normalize VCE RARE spellings of great lakes. Fixes #4007
* Add lake_hurron and lake_st_clair to unexpected counties check
* Switch to checking row counts by report year instead of by job name
* Finish up release notes for #4007 fix
* [pre-commit.ci] auto fixes from pre-commit.com hooks

  For more information, see https://pre-commit.ci

---------

Co-authored-by: Kathryn Mazaitis <[email protected]>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
catalyst-cooperative · Feb 6, 2025 · 18a19b7 · 18a19b7
1 parent fd4b1c6
commit 18a19b7
Show file tree

Hide file tree

Showing 2 changed files with 55 additions and 15 deletions.
diff --git a/docs/release_notes.rst b/docs/release_notes.rst
@@ -61,6 +61,9 @@ Bug Fixes
   this fix.
 * Added preliminary data validation checks for several FERC 1 tables that were
   missing it :pr:`3860`.
+* Fix spelling of Lake Huron and Lake Saint Clair in
+  :ref:`out_vcerare__hourly_available_capacity_factor` and related tables. See issue
+  :issue:`4007` and PR :pr:`4029`.
 
 Major Dependency Updates
 ^^^^^^^^^^^^^^^^^^^^^^^^

diff --git a/src/pudl/transform/vcerare.py b/src/pudl/transform/vcerare.py
@@ -55,6 +55,7 @@ def _prep_lat_long_fips_df(raw_vcerare__lat_lon_fips: pd.DataFrame) -> pd.DataFr
         .assign(
             county_state_names=lambda x: x.county_state_names.str.lower()
             .replace({r"\.": "", "-": "_"}, regex=True)
+            .pipe(_spot_fix_great_lakes_values)
             .astype("category")
         )
         # Fix FIPS codes with no leading zeros
@@ -92,16 +93,19 @@ def _prep_lat_long_fips_df(raw_vcerare__lat_lon_fips: pd.DataFrame) -> pd.DataFr
         # Remove state FIPS code column in favor of the newly added state column.
         .drop(columns=["state_id_fips", "fips", "subdivision_code"])
     )
+
+    logger.info("Spot check: fixed typos in great lakes.")
+
     logger.info("Nulling FIPS IDs for non-county regions.")
     lake_county_state_names = [
         "lake_erie_ohio",
-        "lake_hurron_michigan",
+        "lake_huron_michigan",
         "lake_michigan_illinois",
         "lake_michigan_indiana",
         "lake_michigan_michigan",
         "lake_michigan_wisconsin",
         "lake_ontario_new_york",
-        "lake_st_clair_michigan",
+        "lake_saint_clair_michigan",
         "lake_superior_minnesota",
         "lake_superior_michigan",
         "lake_superior_wisconsin",
@@ -261,6 +265,23 @@ def _get_parquet_path():
     return PudlPaths().parquet_path("out_vcerare__hourly_available_capacity_factor")
 
 
+def _spot_fix_great_lakes_values(sr: pd.Series) -> pd.Series:
+    """Normalize spelling of great lakes in cell values.
+
+    Only cells whose entire value equals one of the misspelled names are
+    changed: ``lake_hurron_michigan`` becomes ``lake_huron_michigan`` and
+    ``lake_st_clair_michigan`` becomes ``lake_saint_clair_michigan``. No
+    regex or substring matching is involved, so every other value passes
+    through untouched.
+
+    Args:
+        sr: Series whose values may include the misspelled lake names. It is
+            applied to the lowercased ``county_state_names`` column in
+            ``_prep_lat_long_fips_df``.
+
+    Returns:
+        A Series with the two misspelled values replaced.
+    """
+    return sr.replace("lake_hurron_michigan", "lake_huron_michigan").replace(
+        "lake_st_clair_michigan", "lake_saint_clair_michigan"
+    )
+
+
+def _spot_fix_great_lakes_columns(df: pd.DataFrame) -> pd.DataFrame:
+    """Normalize spelling of great lakes in column names.
+
+    Renames the column ``lake_hurron_michigan`` to ``lake_huron_michigan`` and
+    ``lake_st_clair_michigan`` to ``lake_saint_clair_michigan``. This is the
+    column-label counterpart of ``_spot_fix_great_lakes_values``, which applies
+    the same two corrections to cell values.
+
+    Args:
+        df: DataFrame whose column labels may include the misspelled lake
+            names.
+
+    Returns:
+        The DataFrame returned by ``df.rename`` with the corrected labels.
+    """
+    # DataFrame.rename defaults to errors="ignore", so a frame that lacks
+    # either misspelled column is returned with its labels unchanged.
+    return df.rename(
+        columns={
+            "lake_hurron_michigan": "lake_huron_michigan",
+            "lake_st_clair_michigan": "lake_saint_clair_michigan",
+        }
+    )
+
+
 def one_year_hourly_available_capacity_factor(
     year: int,
     raw_vcerare__lat_lon_fips: pd.DataFrame,
@@ -287,7 +308,8 @@ def one_year_hourly_available_capacity_factor(
         "onshore_wind": raw_vcerare__onshore_wind_power_100m,
     }
     clean_dict = {
-        df_name: _check_for_valid_counties(df, fips_df, df_name)
+        df_name: _spot_fix_great_lakes_columns(df)
+        .pipe(_check_for_valid_counties, fips_df, df_name)
         .pipe(_add_time_cols, df_name)
         .pipe(_drop_city_cols, df_name)
         .pipe(_make_cap_fac_frac, df_name)
@@ -363,21 +385,30 @@ def check_rows(context: AssetCheckExecutionContext) -> AssetCheckResult:
     """Check rows."""
     logger.info("Check VCE RARE hourly table is the expected length")
 
-    # Define row counts for fast/full etl
-    # TODO 2024-12-27: make this check row counts per year instead of having
-    # two different counts based on job name - less brittle.
-    row_counts = {
-        "etl_full": 136437000,
-        "etl_fast": 27287400,
+    # Define row counts for report years
+    row_counts_by_year = {
+        2019: 27287400,
+        2020: 27287400,
+        2021: 27287400,
+        2022: 27287400,
+        2023: 27287400,
     }
 
     vce = _load_duckdb_table()  # noqa: F841
-    (length,) = duckdb.query("SELECT COUNT(*) FROM vce").fetchone()
-    if (expected_length := row_counts[context.op_execution_context.job_name]) != length:
+    errors = []
+    for report_year, length in duckdb.query(
+        "SELECT report_year, COUNT(*) FROM vce GROUP BY ALL"
+    ).fetchall():
+        if (expected_length := row_counts_by_year[report_year]) != length:
+            errors.append(
+                f"Expected {expected_length} for report year {report_year}, found {length}"
+            )
+    if errors:
+        logger.warning(errors)
         return AssetCheckResult(
             passed=False,
-            description="Table unexpected length",
-            metadata={"table_length": length, "expected_length": expected_length},
+            description="One or more report years have unexpected length",
+            metadata={"errors": errors},
         )
     return AssetCheckResult(passed=True)
 
@@ -550,7 +581,10 @@ def check_unexpected_counties() -> AssetCheckResult:
     )
     unexpected_counties = duckdb.query(
         "SELECT * FROM vce "
-        "WHERE county_or_lake_name = 'bedford_city' or county_or_lake_name = 'clifton_forge_city'"
+        "WHERE county_or_lake_name in ("
+        "'bedford_city','clifton_forge_city',"
+        "'lake_hurron','lake_st_clair'"
+        ")"
     ).fetchall()
     if len(unexpected_counties) > 0:
         return AssetCheckResult(
@@ -575,7 +609,10 @@ def check_duplicate_county_id_fips() -> AssetCheckResult:
         "FROM vce WHERE county_id_fips "
         "IS NOT NULL GROUP BY ALL HAVING COUNT(*) > 1"
     ).fetchall()
-    if len(duplicate_county_ids) > 0:
+    if (dupecount := len(duplicate_county_ids)) > 0:
+        logger.error(
+            f"Found {dupecount} duplicate county_id_fips values; first ten: {duplicate_county_ids[:10]}"
+        )
         return AssetCheckResult(
             passed=False,
             description="Found duplicate county_id_fips values",