Skip to content

Commit

Permalink
fix: Normalize VCE RARE spellings of Great Lakes. (#4029)
Browse files Browse the repository at this point in the history
* fix: Normalize VCE RARE spellings of Great Lakes.

Fixes #4007

* Add lake_hurron and lake_st_clair to unexpected counties check

* Switch to checking row counts by report year instead of by job name

* Finish up release notes for #4007 fix

* [pre-commit.ci] auto fixes from pre-commit.com hooks

For more information, see https://pre-commit.ci

---------

Co-authored-by: Kathryn Mazaitis <[email protected]>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
  • Loading branch information
3 people authored Feb 6, 2025
1 parent fd4b1c6 commit 18a19b7
Show file tree
Hide file tree
Showing 2 changed files with 55 additions and 15 deletions.
3 changes: 3 additions & 0 deletions docs/release_notes.rst
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,9 @@ Bug Fixes
this fix.
* Added preliminary data validation checks for several FERC 1 tables that were
missing it :pr:`3860`.
* Fix spelling of Lake Huron and Lake Saint Clair in
:ref:`out_vcerare__hourly_available_capacity_factor` and related tables. See issue
:issue:`4007` and PR :pr:`4029`.

Major Dependency Updates
^^^^^^^^^^^^^^^^^^^^^^^^
Expand Down
67 changes: 52 additions & 15 deletions src/pudl/transform/vcerare.py
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,7 @@ def _prep_lat_long_fips_df(raw_vcerare__lat_lon_fips: pd.DataFrame) -> pd.DataFr
.assign(
county_state_names=lambda x: x.county_state_names.str.lower()
.replace({r"\.": "", "-": "_"}, regex=True)
.pipe(_spot_fix_great_lakes_values)
.astype("category")
)
# Fix FIPS codes with no leading zeros
Expand Down Expand Up @@ -92,16 +93,19 @@ def _prep_lat_long_fips_df(raw_vcerare__lat_lon_fips: pd.DataFrame) -> pd.DataFr
# Remove state FIPS code column in favor of the newly added state column.
.drop(columns=["state_id_fips", "fips", "subdivision_code"])
)

logger.info("Spot check: fixed typos in great lakes.")

logger.info("Nulling FIPS IDs for non-county regions.")
lake_county_state_names = [
"lake_erie_ohio",
"lake_hurron_michigan",
"lake_huron_michigan",
"lake_michigan_illinois",
"lake_michigan_indiana",
"lake_michigan_michigan",
"lake_michigan_wisconsin",
"lake_ontario_new_york",
"lake_st_clair_michigan",
"lake_saint_clair_michigan",
"lake_superior_minnesota",
"lake_superior_michigan",
"lake_superior_wisconsin",
Expand Down Expand Up @@ -261,6 +265,23 @@ def _get_parquet_path():
return PudlPaths().parquet_path("out_vcerare__hourly_available_capacity_factor")


def _spot_fix_great_lakes_values(sr: pd.Series) -> pd.Series:
    """Normalize the spelling of Great Lakes names in cell values.

    Two misspelled region names are rewritten to their corrected forms:

    * ``lake_hurron_michigan`` -> ``lake_huron_michigan``
    * ``lake_st_clair_michigan`` -> ``lake_saint_clair_michigan``

    Args:
        sr: Series of region-name strings to clean.

    Returns:
        A new Series with the two misspellings replaced. Only cells whose
        entire value equals one of the misspellings are changed; every other
        value is passed through untouched. The input Series is not modified.
    """
    # A single mapping-based replace handles both fixes in one call instead of
    # chaining two separate whole-Series replacements.
    return sr.replace(
        {
            "lake_hurron_michigan": "lake_huron_michigan",
            "lake_st_clair_michigan": "lake_saint_clair_michigan",
        }
    )


def _spot_fix_great_lakes_columns(df: pd.DataFrame) -> pd.DataFrame:
    """Normalize the spelling of Great Lakes names in column labels.

    Two misspelled column labels are renamed to their corrected forms:

    * ``lake_hurron_michigan`` -> ``lake_huron_michigan``
    * ``lake_st_clair_michigan`` -> ``lake_saint_clair_michigan``

    Args:
        df: DataFrame whose column labels may contain the misspellings.

    Returns:
        A new DataFrame with the affected columns renamed. Columns that are
        not listed in the mapping keep their labels, and a frame containing
        neither misspelling is returned with its labels unchanged. The input
        DataFrame is not modified.
    """
    spelling_fixes = {
        "lake_hurron_michigan": "lake_huron_michigan",
        "lake_st_clair_michigan": "lake_saint_clair_michigan",
    }
    return df.rename(columns=spelling_fixes)


def one_year_hourly_available_capacity_factor(
year: int,
raw_vcerare__lat_lon_fips: pd.DataFrame,
Expand All @@ -287,7 +308,8 @@ def one_year_hourly_available_capacity_factor(
"onshore_wind": raw_vcerare__onshore_wind_power_100m,
}
clean_dict = {
df_name: _check_for_valid_counties(df, fips_df, df_name)
df_name: _spot_fix_great_lakes_columns(df)
.pipe(_check_for_valid_counties, fips_df, df_name)
.pipe(_add_time_cols, df_name)
.pipe(_drop_city_cols, df_name)
.pipe(_make_cap_fac_frac, df_name)
Expand Down Expand Up @@ -363,21 +385,30 @@ def check_rows(context: AssetCheckExecutionContext) -> AssetCheckResult:
"""Check rows."""
logger.info("Check VCE RARE hourly table is the expected length")

# Define row counts for fast/full etl
# TODO 2024-12-27: make this check row counts per year instead of having
# two different counts based on job name - less brittle.
row_counts = {
"etl_full": 136437000,
"etl_fast": 27287400,
# Define row counts for report years
row_counts_by_year = {
2019: 27287400,
2020: 27287400,
2021: 27287400,
2022: 27287400,
2023: 27287400,
}

vce = _load_duckdb_table() # noqa: F841
(length,) = duckdb.query("SELECT COUNT(*) FROM vce").fetchone()
if (expected_length := row_counts[context.op_execution_context.job_name]) != length:
errors = []
for report_year, length in duckdb.query(
"SELECT report_year, COUNT(*) FROM vce GROUP BY ALL"
).fetchall():
if (expected_length := row_counts_by_year[report_year]) != length:
errors.append(
f"Expected {expected_length} for report year {report_year}, found {length}"
)
if errors:
logger.warning(errors)
return AssetCheckResult(
passed=False,
description="Table unexpected length",
metadata={"table_length": length, "expected_length": expected_length},
description="One or more report years have unexpected length",
metadata={"errors": errors},
)
return AssetCheckResult(passed=True)

Expand Down Expand Up @@ -550,7 +581,10 @@ def check_unexpected_counties() -> AssetCheckResult:
)
unexpected_counties = duckdb.query(
"SELECT * FROM vce "
"WHERE county_or_lake_name = 'bedford_city' or county_or_lake_name = 'clifton_forge_city'"
"WHERE county_or_lake_name in ("
"'bedford_city','clifton_forge_city',"
"'lake_hurron','lake_st_clair'"
")"
).fetchall()
if len(unexpected_counties) > 0:
return AssetCheckResult(
Expand All @@ -575,7 +609,10 @@ def check_duplicate_county_id_fips() -> AssetCheckResult:
"FROM vce WHERE county_id_fips "
"IS NOT NULL GROUP BY ALL HAVING COUNT(*) > 1"
).fetchall()
if len(duplicate_county_ids) > 0:
if (dupecount := len(duplicate_county_ids)) > 0:
logger.error(
f"Found {dupecount} duplicate county_id_fips values; first ten: {duplicate_county_ids[:10]}"
)
return AssetCheckResult(
passed=False,
description="Found duplicate county_id_fips values",
Expand Down

0 comments on commit 18a19b7

Please sign in to comment.