From 50365eab6067479dad1c5e42a53eb523a83e3ae6 Mon Sep 17 00:00:00 2001 From: Ben RdO Date: Wed, 29 Nov 2023 10:25:14 -0800 Subject: [PATCH 1/8] feat: write pyproject.toml enclosing default configuration for ruff --- pyproject.toml | 68 ++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 68 insertions(+) create mode 100644 pyproject.toml diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 00000000..dfe594eb --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,68 @@ +[project] +name = "open-grid-emissions" +requires-python = ">3.10" +readme = "README.md" + +[tool.ruff] +# Exclude a variety of commonly ignored directories. +exclude = [ + ".git", + ".github", + ".pytest_cache", + ".ruff_cache", + ".tox", + ".venv", + "__pypackages__", + "_build", + "build", + "venv", +] + +# All paths are relative to the project root, which is the directory containing the pyproject.toml. +src = ["src"] + +# Same as Black. +line-length = 88 +indent-width = 4 + +# Assume Python 3.8 +target-version = "py310" + +# Built-in for Jupyter +extend-include = ["*.ipynb"] + +[tool.ruff.lint] +# Enable Pyflakes (`F`) and a subset of the pycodestyle (`E`) codes by default. +# Unlike Flake8, Ruff doesn't enable pycodestyle warnings (`W`) or +# McCabe complexity (`C901`) by default. +select = ["E4", "E7", "E9", "F"] +ignore = [] + +# Allow fix for all enabled rules (when `--fix`) is provided. +fixable = ["ALL"] +unfixable = [] + +# Allow unused variables when underscore-prefixed. +dummy-variable-rgx = "^(_+|(_+[a-zA-Z0-9_]*[a-zA-Z0-9]+?))$" + +# Exclude notebooks +exclude = ["*.ipynb"] + +[tool.ruff.lint.flake8-quotes] +docstring-quotes = "double" + +[tool.ruff.format] +# Like Black, use double quotes for strings. +quote-style = "double" + +# Like Black, indent with spaces, rather than tabs. +indent-style = "space" + +# Like Black, respect magic trailing commas. +skip-magic-trailing-comma = false + +# Like Black, automatically detect the appropriate line ending. 
+line-ending = "auto" + +[tool.ruff.lint.pydocstyle] +convention = "google" From 174132d3d34eadc129134e6ff75de4c046f4bd49 Mon Sep 17 00:00:00 2001 From: Ben RdO Date: Wed, 29 Nov 2023 10:26:31 -0800 Subject: [PATCH 2/8] chore: format notebook --- .../explore_data/compare_data_sources.ipynb | 68 +- .../explore_annually_reported_eia_data.ipynb | 92 +- .../explore_intermediate_outputs.ipynb | 64 +- notebooks/explore_data/gens_not_in_cems.ipynb | 73 +- ...import_uncontrolled_emission_factors.ipynb | 191 +- .../calculate_residual_net_generation.ipynb | 1670 +++++++++-------- .../national_wind_solar_correlations.ipynb | 17 +- .../default_fuel_sulfur_content.ipynb | 27 +- .../export_fuel_heat_content.ipynb | 7 +- .../identify_eia930_time_lags.ipynb | 495 +++-- .../manually_identify_crosswalk_updates.ipynb | 118 +- .../manually_update_OTH_fuel_code.ipynb | 19 +- .../manually_update_ba_reference.ipynb | 13 +- .../update_utility_name_ba_map.ipynb | 21 +- notebooks/manual_data/zip_data.ipynb | 11 +- notebooks/validation/data_validation.ipynb | 708 +++++-- .../validation/diff_output_versions.ipynb | 105 +- notebooks/validation/hourly_validation.ipynb | 647 +++++-- .../validation/validate_data_cleaning.ipynb | 89 +- .../validation/validate_hourly_profiles.ipynb | 129 +- .../validate_negative_profiles.ipynb | 150 +- notebooks/validation/validate_vs_egrid.ipynb | 83 +- .../visualization/map_visualization.ipynb | 1405 ++++++++------ .../visualization/plot_timeseries_data.ipynb | 19 +- .../work_in_progress/GH102_test_dask.ipynb | 55 +- .../GH153_fill_missing_nox_so2_cems.ipynb | 35 +- .../GH240_eia930_physics_reconciliation.ipynb | 44 +- .../clean_cems_outliers.ipynb | 161 +- .../work_in_progress/issue230_spikes.ipynb | 55 +- notebooks/work_in_progress/sandbox.ipynb | 4 +- .../uncertainty_analysis.ipynb | 307 ++- 31 files changed, 4346 insertions(+), 2536 deletions(-) diff --git a/notebooks/explore_data/compare_data_sources.ipynb b/notebooks/explore_data/compare_data_sources.ipynb 
index 886855af..41ee76c2 100644 --- a/notebooks/explore_data/compare_data_sources.ipynb +++ b/notebooks/explore_data/compare_data_sources.ipynb @@ -11,7 +11,7 @@ "# Depending on how your jupyter handles working directories, this may not be needed.\n", "import sys\n", "\n", - "sys.path.append(\"../../../open-grid-emissions/\")\n" + "sys.path.append(\"../../../open-grid-emissions/\")" ] }, { @@ -57,7 +57,7 @@ "metadata": {}, "outputs": [], "source": [ - "year = 2019\n" + "year = 2019" ] }, { @@ -142,7 +142,7 @@ " col.replace(\"Net Generation (MW) from \", \"\") for col in eia_930.columns\n", "]\n", "\n", - "eia_930.head(3)\n" + "eia_930.head(3)" ] }, { @@ -171,7 +171,7 @@ " \"Natural Gas\",\n", " \"All Petroleum Products\",\n", " \"Other Fuel Sources\",\n", - "]\n" + "]" ] }, { @@ -185,7 +185,7 @@ "\n", "# replace negative values with NaN\n", "for col in emitting_gen_columns:\n", - " eia_930.loc[eia_930[col] < 0, col] = np.nan\n" + " eia_930.loc[eia_930[col] < 0, col] = np.nan" ] }, { @@ -201,7 +201,7 @@ "# Calculate hourly EIA-930 non-renewable generation\n", "eia_930[\"total_net_generation\"] = eia_930[net_gen_columns].sum(axis=1)\n", "\n", - "eia_930.head(3)\n" + "eia_930.head(3)" ] }, { @@ -215,7 +215,7 @@ " eia_930.groupby(\"Balancing Authority\").sum()[\"emitting_net_generation\"]\n", ")\n", "annual_930 = annual_930.rename(columns={\"emitting_net_generation\": \"EIA-930\"})\n", - "annual_930.head()\n" + "annual_930.head()" ] }, { @@ -247,7 +247,7 @@ " AND report_date <= '{year}-12-30'\",\n", " pudl_engine,\n", ")\n", - "gen_fuel_923.head()\n" + "gen_fuel_923.head()" ] }, { @@ -264,7 +264,7 @@ "fuel_code_dict_pudl = pd.Series(\n", " fuel_code_dict_pudl.fuel_type_code_pudl.values,\n", " index=fuel_code_dict_pudl.energy_source_code,\n", - ").to_dict()\n" + ").to_dict()" ] }, { @@ -336,7 +336,7 @@ "# Add ba code to generation_fuel_eia923\n", "gen_fuel_923 = gen_fuel_923.merge(plants_ba, how=\"left\", on=\"plant_id_eia\")\n", "\n", - "gen_fuel_923.head()\n" + 
"gen_fuel_923.head()" ] }, { @@ -350,7 +350,7 @@ " gen_fuel_923.groupby(\"balancing_authority_code_eia\").sum()[\"net_generation_mwh\"]\n", ")\n", "annual_923 = annual_923.rename(columns={\"net_generation_mwh\": \"EIA-923\"})\n", - "annual_923.head()\n" + "annual_923.head()" ] }, { @@ -362,7 +362,7 @@ "source": [ "compare_annual_923_930 = annual_923.merge(\n", " annual_930, how=\"outer\", left_index=True, right_index=True\n", - ")\n" + ")" ] }, { @@ -373,7 +373,7 @@ "outputs": [], "source": [ "# identify which BAs are missing from one or another dataset\n", - "compare_annual_923_930[compare_annual_923_930.isna().any(axis=1)]\n" + "compare_annual_923_930[compare_annual_923_930.isna().any(axis=1)]" ] }, { @@ -384,7 +384,7 @@ "outputs": [], "source": [ "# let's compare each BA\n", - "px.scatter(compare_annual_923_930)\n" + "px.scatter(compare_annual_923_930)" ] }, { @@ -404,7 +404,7 @@ " percent_diff_923_930,\n", " title=\"percent difference between emitting net generation in EIA-930 and EIA-923\",\n", " labels={\"value\": \"% change from EIA-923\"},\n", - ")\n" + ")" ] }, { @@ -430,7 +430,7 @@ "annual_923_by_fuel = annual_923_by_fuel.rename(\n", " columns={\"net_generation_mwh\": \"EIA-923\"}\n", ")\n", - "annual_923_by_fuel.head(3)\n" + "annual_923_by_fuel.head(3)" ] }, { @@ -457,7 +457,7 @@ "annual_930_by_fuel = annual_930_by_fuel.rename(\n", " columns={\"Balancing Authority\": \"balancing_authority_code_eia\"}\n", ")\n", - "annual_930_by_fuel\n" + "annual_930_by_fuel" ] }, { @@ -481,7 +481,7 @@ " facet_col_wrap=1,\n", " height=1000,\n", " title=\"comparison of net generation by fuel type for each BA\",\n", - ")\n" + ")" ] }, { @@ -498,7 +498,7 @@ "percent_error[\"percent_error\"] = (\n", " percent_error[\"EIA-930\"] - percent_error[\"EIA-923\"]\n", ") / percent_error[\"EIA-923\"]\n", - "percent_error\n" + "percent_error" ] }, { @@ -548,7 +548,7 @@ "hourly_net_emissions = pd.read_csv(\n", " \"../data/outputs/hourly_net_emission.csv\", index_col=0, 
parse_dates=True\n", ")\n", - "hourly_emission_rate.head()\n" + "hourly_emission_rate.head()" ] }, { @@ -558,7 +558,7 @@ "metadata": {}, "outputs": [], "source": [ - "eia_930.head()\n" + "eia_930.head()" ] }, { @@ -589,7 +589,7 @@ ")\n", "ax2.set_title(ba)\n", "ax2.legend()\n", - "ax2.set_xlim(parse_dt(\"2019-08-01\"), parse_dt(\"2019-08-10\"))\n" + "ax2.set_xlim(parse_dt(\"2019-08-01\"), parse_dt(\"2019-08-10\"))" ] }, { @@ -610,7 +610,7 @@ ")\n", "data_for_plot = data_for_plot.rename(columns={ba: \"CEMS\"})\n", "\n", - "px.line(data_for_plot, title=f\"Net generation in {ba} EIA-930 vs CEMS\")\n" + "px.line(data_for_plot, title=f\"Net generation in {ba} EIA-930 vs CEMS\")" ] }, { @@ -632,7 +632,7 @@ "source": [ "# For annual comparison graphs, see below with eGRID\n", "annual_eia_930 = eia_930.groupby(\"Balancing Authority\").sum()[\"emitting_net_generation\"]\n", - "annual_eia_930.head()\n" + "annual_eia_930.head()" ] }, { @@ -663,7 +663,7 @@ " AND report_date <= '{year}-12-30'\",\n", " pudl_engine,\n", ")\n", - "gen_923.head()\n" + "gen_923.head()" ] }, { @@ -682,7 +682,7 @@ "otherway = np.setdiff1d(\n", " gen_fuel_923[\"plant_id_eia\"].unique(), gen_923[\"plant_id_eia\"].unique()\n", ")\n", - "print(f\"{len(oneway)} plants in generation_fuel_eia923 are not in generation_eia923\")\n" + "print(f\"{len(oneway)} plants in generation_fuel_eia923 are not in generation_eia923\")" ] }, { @@ -731,7 +731,7 @@ " header=1,\n", " index_col=\"BACODE\",\n", ")\n", - "egrid.head()\n" + "egrid.head()" ] }, { @@ -753,7 +753,7 @@ " (egrid_data_code_to_name.loc[0, name], name)\n", " for name in egrid_data_code_to_name.columns\n", " ]\n", - ")\n" + ")" ] }, { @@ -767,7 +767,7 @@ "annual_generation = hourly_net_generation.sum(axis=0).rename(\"hourly\")\n", "annual_generation = egrid.merge(\n", " annual_generation, how=\"right\", left_index=True, right_index=True\n", - ").loc[:, [\"hourly\", \"BAGENACY\"]]\n" + ").loc[:, [\"hourly\", \"BAGENACY\"]]" ] }, { @@ -783,7 +783,7 @@ ")\n", 
"annual_generation = annual_generation.merge(\n", " annual_923, how=\"left\", left_index=True, right_index=True\n", - ")\n" + ")" ] }, { @@ -793,7 +793,7 @@ "metadata": {}, "outputs": [], "source": [ - "annual_generation.head()\n" + "annual_generation.head()" ] }, { @@ -860,7 +860,7 @@ "ax3.hlines([1.0], -0.5, len(annual_generation) - 0.5, color=\"r\")\n", "ax3.set_ylabel(\"Fraction of 930 generation captured by CEMS\")\n", "ax3.set_ylim(0, 2)\n", - "ax3.set_title(\"EIA-930 vs OGEI\")\n" + "ax3.set_title(\"EIA-930 vs OGEI\")" ] }, { @@ -872,7 +872,7 @@ "source": [ "# Many small BAs have terrible coverage of EIA-930 data, resulting in low annually aggregated 930 numbers.\n", "# The worst discrepencies between 930 and eGRID:\n", - "annual_generation.loc[[\"FPL\", \"IPCO\", \"NEVP\", \"SC\", \"TEC\", \"TVA\"], :]\n" + "annual_generation.loc[[\"FPL\", \"IPCO\", \"NEVP\", \"SC\", \"TEC\", \"TVA\"], :]" ] }, { @@ -900,7 +900,7 @@ "ax.set_xticklabels(labels=annual_generation.index, rotation=90)\n", "ax.hlines([1.0], -0.5, len(annual_generation) - 0.5, color=\"r\")\n", "ax.set_ylabel(\"Fraction of eGRID non-renewable generation captured by 923\")\n", - "ax.set_title(\"923 vs eGRID\")\n" + "ax.set_title(\"923 vs eGRID\")" ] }, { diff --git a/notebooks/explore_data/explore_annually_reported_eia_data.ipynb b/notebooks/explore_data/explore_annually_reported_eia_data.ipynb index af672e8c..b2b32a2c 100644 --- a/notebooks/explore_data/explore_annually_reported_eia_data.ipynb +++ b/notebooks/explore_data/explore_annually_reported_eia_data.ipynb @@ -14,7 +14,8 @@ "\n", "# # Tell python where to look for modules.\n", "import sys\n", - "sys.path.append('../../../open-grid-emissions/src/')\n", + "\n", + "sys.path.append(\"../../../open-grid-emissions/src/\")\n", "\n", "from column_checks import get_dtypes\n", "import load_data\n", @@ -39,11 +40,17 @@ "source": [ "pudl_out = load_data.initialize_pudl_out(year)\n", "\n", - "plant_frequency = 
pudl_out.plants_eia860()[[\"plant_id_eia\",\"respondent_frequency\"]]\n", + "plant_frequency = pudl_out.plants_eia860()[[\"plant_id_eia\", \"respondent_frequency\"]]\n", "\n", "# load the allocated EIA data\n", - "eia923_allocated = pd.read_csv(f'{outputs_folder()}{path_prefix}/eia923_allocated_{year}.csv', dtype=get_dtypes(), parse_dates=['report_date'])\n", - "eia923_allocated = eia923_allocated.merge(plant_frequency, how=\"left\", on=\"plant_id_eia\", validate=\"m:1\")" + "eia923_allocated = pd.read_csv(\n", + " f\"{outputs_folder()}{path_prefix}/eia923_allocated_{year}.csv\",\n", + " dtype=get_dtypes(),\n", + " parse_dates=[\"report_date\"],\n", + ")\n", + "eia923_allocated = eia923_allocated.merge(\n", + " plant_frequency, how=\"left\", on=\"plant_id_eia\", validate=\"m:1\"\n", + ")" ] }, { @@ -52,7 +59,15 @@ "metadata": {}, "outputs": [], "source": [ - "data_from_annual = eia923_allocated.groupby([\"respondent_frequency\"], dropna=False)[[\"fuel_consumed_mmbtu\", \"net_generation_mwh\",\"co2_mass_lb\"]].sum() / eia923_allocated[[\"fuel_consumed_mmbtu\", \"net_generation_mwh\",\"co2_mass_lb\"]].sum() * 100\n", + "data_from_annual = (\n", + " eia923_allocated.groupby([\"respondent_frequency\"], dropna=False)[\n", + " [\"fuel_consumed_mmbtu\", \"net_generation_mwh\", \"co2_mass_lb\"]\n", + " ].sum()\n", + " / eia923_allocated[\n", + " [\"fuel_consumed_mmbtu\", \"net_generation_mwh\", \"co2_mass_lb\"]\n", + " ].sum()\n", + " * 100\n", + ")\n", "data_from_annual.loc[\"Total Percent\"] = data_from_annual.sum()\n", "data_from_annual" ] @@ -63,7 +78,7 @@ "metadata": {}, "outputs": [], "source": [ - "data_from_annual.loc[\"A\",:].rename(\"% of EIA-923 input data from EIA annual reporters\")" + "data_from_annual.loc[\"A\", :].rename(\"% of EIA-923 input data from EIA annual reporters\")" ] }, { @@ -80,7 +95,17 @@ "metadata": {}, "outputs": [], "source": [ - "annual_eia_used = eia923_allocated[eia923_allocated[\"hourly_data_source\"] != 
\"cems\"].groupby([\"respondent_frequency\"], dropna=False)[[\"fuel_consumed_mmbtu\", \"net_generation_mwh\",\"co2_mass_lb\"]].sum() / eia923_allocated[[\"fuel_consumed_mmbtu\", \"net_generation_mwh\",\"co2_mass_lb\"]].sum() * 100\n", + "annual_eia_used = (\n", + " eia923_allocated[eia923_allocated[\"hourly_data_source\"] != \"cems\"]\n", + " .groupby([\"respondent_frequency\"], dropna=False)[\n", + " [\"fuel_consumed_mmbtu\", \"net_generation_mwh\", \"co2_mass_lb\"]\n", + " ]\n", + " .sum()\n", + " / eia923_allocated[\n", + " [\"fuel_consumed_mmbtu\", \"net_generation_mwh\", \"co2_mass_lb\"]\n", + " ].sum()\n", + " * 100\n", + ")\n", "annual_eia_used.loc[\"Total Percent\"] = annual_eia_used.sum()\n", "annual_eia_used" ] @@ -91,7 +116,7 @@ "metadata": {}, "outputs": [], "source": [ - "annual_eia_used.loc[\"A\",:].rename(\"% of output data from EIA annual reporters\")" + "annual_eia_used.loc[\"A\", :].rename(\"% of output data from EIA annual reporters\")" ] }, { @@ -108,15 +133,19 @@ "metadata": {}, "outputs": [], "source": [ - "multi_source_subplants = eia923_allocated[\n", - " [\"plant_id_eia\", \"subplant_id\", \"hourly_data_source\"]\n", - "].drop_duplicates().drop(columns=\"hourly_data_source\")\n", + "multi_source_subplants = (\n", + " eia923_allocated[[\"plant_id_eia\", \"subplant_id\", \"hourly_data_source\"]]\n", + " .drop_duplicates()\n", + " .drop(columns=\"hourly_data_source\")\n", + ")\n", "\n", "multi_source_subplants = multi_source_subplants[\n", - " multi_source_subplants.duplicated(\n", - " subset=[\"plant_id_eia\", \"subplant_id\"])]\n", + " multi_source_subplants.duplicated(subset=[\"plant_id_eia\", \"subplant_id\"])\n", + "]\n", "\n", - "multi_source_subplants = eia923_allocated.merge(multi_source_subplants, how=\"inner\", on=[\"plant_id_eia\", \"subplant_id\"])\n" + "multi_source_subplants = eia923_allocated.merge(\n", + " multi_source_subplants, how=\"inner\", on=[\"plant_id_eia\", \"subplant_id\"]\n", + ")" ] }, { @@ -126,7 +155,15 @@ 
"outputs": [], "source": [ "# what percent of the total EIA-923 data comes from subplants with annually-reported data and multiple sources?\n", - "multi_source_summary = (multi_source_subplants.groupby([\"respondent_frequency\"], dropna=False)[[\"fuel_consumed_mmbtu\", \"net_generation_mwh\",\"co2_mass_lb\"]].sum() / eia923_allocated[[\"fuel_consumed_mmbtu\", \"net_generation_mwh\",\"co2_mass_lb\"]].sum() * 100)\n", + "multi_source_summary = (\n", + " multi_source_subplants.groupby([\"respondent_frequency\"], dropna=False)[\n", + " [\"fuel_consumed_mmbtu\", \"net_generation_mwh\", \"co2_mass_lb\"]\n", + " ].sum()\n", + " / eia923_allocated[\n", + " [\"fuel_consumed_mmbtu\", \"net_generation_mwh\", \"co2_mass_lb\"]\n", + " ].sum()\n", + " * 100\n", + ")\n", "multi_source_summary.loc[\"Total Percent\"] = multi_source_summary.sum()\n", "multi_source_summary" ] @@ -137,7 +174,9 @@ "metadata": {}, "outputs": [], "source": [ - "multi_source_summary.loc[\"A\",:].rename(\"% of output data mixing CEMS and annually-reported EIA data\")" + "multi_source_summary.loc[\"A\", :].rename(\n", + " \"% of output data mixing CEMS and annually-reported EIA data\"\n", + ")" ] }, { @@ -146,7 +185,26 @@ "metadata": {}, "outputs": [], "source": [ - "pd.concat([pd.DataFrame(data_from_annual.loc[\"A\",:].rename(\"% of EIA-923 input data from EIA annual reporters\").round(2)).T, pd.DataFrame(annual_eia_used.loc[\"A\",:].rename(\"% of output data from EIA annual reporters\").round(2)).T, pd.DataFrame(multi_source_summary.loc[\"A\",:].rename(\"% of output data mixing CEMS and annually-reported EIA data\").round(2)).T], axis=0)" + "pd.concat(\n", + " [\n", + " pd.DataFrame(\n", + " data_from_annual.loc[\"A\", :]\n", + " .rename(\"% of EIA-923 input data from EIA annual reporters\")\n", + " .round(2)\n", + " ).T,\n", + " pd.DataFrame(\n", + " annual_eia_used.loc[\"A\", :]\n", + " .rename(\"% of output data from EIA annual reporters\")\n", + " .round(2)\n", + " ).T,\n", + " pd.DataFrame(\n", + " 
multi_source_summary.loc[\"A\", :]\n", + " .rename(\"% of output data mixing CEMS and annually-reported EIA data\")\n", + " .round(2)\n", + " ).T,\n", + " ],\n", + " axis=0,\n", + ")" ] } ], diff --git a/notebooks/explore_data/explore_intermediate_outputs.ipynb b/notebooks/explore_data/explore_intermediate_outputs.ipynb index e76796cd..ac41bb30 100644 --- a/notebooks/explore_data/explore_intermediate_outputs.ipynb +++ b/notebooks/explore_data/explore_intermediate_outputs.ipynb @@ -14,7 +14,8 @@ "\n", "# # Tell python where to look for modules.\n", "import sys\n", - "sys.path.append('../../../open-grid-emissions/src/')\n", + "\n", + "sys.path.append(\"../../../open-grid-emissions/src/\")\n", "\n", "from column_checks import get_dtypes\n", "from filepaths import *\n", @@ -40,11 +41,30 @@ "year = 2020\n", "path_prefix = f\"{year}/\"\n", "\n", - "cems = pd.read_csv(outputs_folder(f\"{path_prefix}/cems_subplant_{year}.csv\"), dtype=get_dtypes(), parse_dates=['datetime_utc', 'report_date'])\n", - "partial_cems_plant = pd.read_csv(outputs_folder(f\"{path_prefix}/partial_cems_plant_{year}.csv\"), dtype=get_dtypes(), parse_dates=['datetime_utc', 'report_date'])\n", - "partial_cems_subplant = pd.read_csv(outputs_folder(f\"{path_prefix}/partial_cems_subplant_{year}.csv\"), dtype=get_dtypes(), parse_dates=['datetime_utc', 'report_date'])\n", - "eia923_allocated = pd.read_csv(outputs_folder(f\"{path_prefix}/eia923_allocated_{year}.csv\"), dtype=get_dtypes(), parse_dates=['report_date'])\n", - "plant_attributes = pd.read_csv(outputs_folder(f\"{path_prefix}/plant_static_attributes_{year}.csv\"), dtype=get_dtypes())" + "cems = pd.read_csv(\n", + " outputs_folder(f\"{path_prefix}/cems_subplant_{year}.csv\"),\n", + " dtype=get_dtypes(),\n", + " parse_dates=[\"datetime_utc\", \"report_date\"],\n", + ")\n", + "partial_cems_plant = pd.read_csv(\n", + " outputs_folder(f\"{path_prefix}/partial_cems_plant_{year}.csv\"),\n", + " dtype=get_dtypes(),\n", + " parse_dates=[\"datetime_utc\", 
\"report_date\"],\n", + ")\n", + "partial_cems_subplant = pd.read_csv(\n", + " outputs_folder(f\"{path_prefix}/partial_cems_subplant_{year}.csv\"),\n", + " dtype=get_dtypes(),\n", + " parse_dates=[\"datetime_utc\", \"report_date\"],\n", + ")\n", + "eia923_allocated = pd.read_csv(\n", + " outputs_folder(f\"{path_prefix}/eia923_allocated_{year}.csv\"),\n", + " dtype=get_dtypes(),\n", + " parse_dates=[\"report_date\"],\n", + ")\n", + "plant_attributes = pd.read_csv(\n", + " outputs_folder(f\"{path_prefix}/plant_static_attributes_{year}.csv\"),\n", + " dtype=get_dtypes(),\n", + ")" ] }, { @@ -69,7 +89,7 @@ "source": [ "data = cems[cems[\"plant_id_eia\"] == 673]\n", "\n", - "data.groupby([\"plant_id_eia\",\"unitid\",\"report_date\"]).sum()" + "data.groupby([\"plant_id_eia\", \"unitid\", \"report_date\"]).sum()" ] }, { @@ -100,7 +120,9 @@ "\n", "all_data = []\n", "for ba in os.listdir(results_folder(f\"2021/power_sector_data/{resolution}/us_units\")):\n", - " df = pd.read_csv(results_folder(f\"2021/power_sector_data/{resolution}/us_units/{ba}\"))\n", + " df = pd.read_csv(\n", + " results_folder(f\"2021/power_sector_data/{resolution}/us_units/{ba}\")\n", + " )\n", " df[\"ba_code\"] = ba.split(\".\")[0]\n", " all_data.append(df)\n", "\n", @@ -120,10 +142,13 @@ "all_data[\"nox_mass_lb_for_electricity\"] / all_data[\"net_generation_mwh\"]\n", "all_data[\"so2_mass_lb_for_electricity\"] / all_data[\"net_generation_mwh\"]\n", "\n", - "for pol in [\"co2\",\"nox\",\"so2\"]:\n", - " for fuel in [\"biomass\", \"geothermal\",\"waste\", \"other\"]:\n", - " calc = all_data.loc[fuel,f\"{pol}_mass_lb_for_electricity\"] / all_data.loc[\"total\",f\"{pol}_mass_lb_for_electricity\"]\n", - " print(f\"{pol} {fuel}: {calc}\")\n" + "for pol in [\"co2\", \"nox\", \"so2\"]:\n", + " for fuel in [\"biomass\", \"geothermal\", \"waste\", \"other\"]:\n", + " calc = (\n", + " all_data.loc[fuel, f\"{pol}_mass_lb_for_electricity\"]\n", + " / all_data.loc[\"total\", f\"{pol}_mass_lb_for_electricity\"]\n", 
+ " )\n", + " print(f\"{pol} {fuel}: {calc}\")" ] }, { @@ -139,10 +164,21 @@ "metadata": {}, "outputs": [], "source": [ - "all_data[\"pctdiff\"] = (all_data.generated_co2_rate_lb_per_mwh_for_electricity_adjusted - all_data.generated_co2_rate_lb_per_mwh_for_electricity) / all_data.generated_co2_rate_lb_per_mwh_for_electricity\n", + "all_data[\"pctdiff\"] = (\n", + " all_data.generated_co2_rate_lb_per_mwh_for_electricity_adjusted\n", + " - all_data.generated_co2_rate_lb_per_mwh_for_electricity\n", + ") / all_data.generated_co2_rate_lb_per_mwh_for_electricity\n", "\n", "\n", - "all_data.loc[all_data[\"fuel_category\"] == \"total\", [\"ba_code\",\"pctdiff\",\"generated_co2_rate_lb_per_mwh_for_electricity_adjusted\",\"generated_co2_rate_lb_per_mwh_for_electricity\"]].sort_values(by=\"pctdiff\").head(25)" + "all_data.loc[\n", + " all_data[\"fuel_category\"] == \"total\",\n", + " [\n", + " \"ba_code\",\n", + " \"pctdiff\",\n", + " \"generated_co2_rate_lb_per_mwh_for_electricity_adjusted\",\n", + " \"generated_co2_rate_lb_per_mwh_for_electricity\",\n", + " ],\n", + "].sort_values(by=\"pctdiff\").head(25)" ] } ], diff --git a/notebooks/explore_data/gens_not_in_cems.ipynb b/notebooks/explore_data/gens_not_in_cems.ipynb index fbc76b6b..8b08bac8 100644 --- a/notebooks/explore_data/gens_not_in_cems.ipynb +++ b/notebooks/explore_data/gens_not_in_cems.ipynb @@ -17,7 +17,8 @@ "\n", "# # Tell python where to look for modules.\n", "import sys\n", - "sys.path.append('../../../open-grid-emissions/src/')\n", + "\n", + "sys.path.append(\"../../../open-grid-emissions/src/\")\n", "\n", "import download_data\n", "import load_data\n", @@ -42,13 +43,32 @@ "outputs": [], "source": [ "# load inputs to function\n", - "cems = pd.read_csv(outputs_folder(f\"{path_prefix}/cems_subplant_{year}.csv\"), dtype=get_dtypes(), parse_dates=['datetime_utc', 'report_date'])\n", - "partial_cems_plant = pd.read_csv(outputs_folder(f\"{path_prefix}/partial_cems_plant_{year}.csv\"), dtype=get_dtypes(), 
parse_dates=['datetime_utc', 'report_date'])\n", - "partial_cems_subplant = pd.read_csv(outputs_folder(f\"{path_prefix}/partial_cems_subplant_{year}.csv\"), dtype=get_dtypes(), parse_dates=['datetime_utc', 'report_date'])\n", - "eia923_allocated = pd.read_csv(outputs_folder(f\"{path_prefix}/eia923_allocated_{year}.csv\"), dtype=get_dtypes(), parse_dates=['report_date'])\n", - "plant_attributes = pd.read_csv(outputs_folder(f\"{path_prefix}/plant_static_attributes_{year}.csv\"), dtype=get_dtypes())\n", + "cems = pd.read_csv(\n", + " outputs_folder(f\"{path_prefix}/cems_subplant_{year}.csv\"),\n", + " dtype=get_dtypes(),\n", + " parse_dates=[\"datetime_utc\", \"report_date\"],\n", + ")\n", + "partial_cems_plant = pd.read_csv(\n", + " outputs_folder(f\"{path_prefix}/partial_cems_plant_{year}.csv\"),\n", + " dtype=get_dtypes(),\n", + " parse_dates=[\"datetime_utc\", \"report_date\"],\n", + ")\n", + "partial_cems_subplant = pd.read_csv(\n", + " outputs_folder(f\"{path_prefix}/partial_cems_subplant_{year}.csv\"),\n", + " dtype=get_dtypes(),\n", + " parse_dates=[\"datetime_utc\", \"report_date\"],\n", + ")\n", + "eia923_allocated = pd.read_csv(\n", + " outputs_folder(f\"{path_prefix}/eia923_allocated_{year}.csv\"),\n", + " dtype=get_dtypes(),\n", + " parse_dates=[\"report_date\"],\n", + ")\n", + "plant_attributes = pd.read_csv(\n", + " outputs_folder(f\"{path_prefix}/plant_static_attributes_{year}.csv\"),\n", + " dtype=get_dtypes(),\n", + ")\n", "\n", - "# select eia only data \n", + "# select eia only data\n", "eia_only_data = eia923_allocated[\n", " (eia923_allocated[\"hourly_data_source\"] == \"eia\")\n", " & ~(eia923_allocated[\"fuel_consumed_mmbtu\"].isna())\n", @@ -84,7 +104,7 @@ " how=\"left\",\n", " on=\"plant_id_eia\",\n", " validate=\"m:1\",\n", - ")\n" + ")" ] }, { @@ -121,7 +141,9 @@ "metadata": {}, "outputs": [], "source": [ - "eia_caiso[\"nox_rate\"] = eia_caiso[\"nox_mass_lb_for_electricity\"] / eia_caiso[\"net_generation_mwh\"]\n", + 
"eia_caiso[\"nox_rate\"] = (\n", + " eia_caiso[\"nox_mass_lb_for_electricity\"] / eia_caiso[\"net_generation_mwh\"]\n", + ")\n", "eia_caiso[\"nox_rate\"] = eia_caiso[\"nox_rate\"].replace(np.inf, np.nan)" ] }, @@ -131,7 +153,12 @@ "metadata": {}, "outputs": [], "source": [ - "eia_caiso.groupby([\"prime_mover_code\",\"energy_source_code\",])[\"nox_mass_lb_for_electricity\"].sum()" + "eia_caiso.groupby(\n", + " [\n", + " \"prime_mover_code\",\n", + " \"energy_source_code\",\n", + " ]\n", + ")[\"nox_mass_lb_for_electricity\"].sum()" ] }, { @@ -177,7 +204,7 @@ "subplant_nameplate = gross_to_net_generation.calculate_subplant_nameplate_capacity(year)\n", "\n", "pudl_out = load_data.initialize_pudl_out(year)\n", - "gen_cap = pudl_out.gens_eia860()[[\"plant_id_eia\",\"generator_id\",\"capacity_mw\"]]" + "gen_cap = pudl_out.gens_eia860()[[\"plant_id_eia\", \"generator_id\", \"capacity_mw\"]]" ] }, { @@ -186,8 +213,12 @@ "metadata": {}, "outputs": [], "source": [ - "eia_cf = eia_only_data.merge(gen_cap, how=\"left\", on=[\"plant_id_eia\",\"generator_id\"], validate=\"m:1\")\n", - "eia_cf[\"capfac\"] = eia_cf.net_generation_mwh / (eia_cf.report_date.dt.days_in_month * 24 * eia_cf.capacity_mw)\n", + "eia_cf = eia_only_data.merge(\n", + " gen_cap, how=\"left\", on=[\"plant_id_eia\", \"generator_id\"], validate=\"m:1\"\n", + ")\n", + "eia_cf[\"capfac\"] = eia_cf.net_generation_mwh / (\n", + " eia_cf.report_date.dt.days_in_month * 24 * eia_cf.capacity_mw\n", + ")\n", "eia_cf.loc[eia_cf[\"capfac\"] > 1.2, \"capfac\"] = np.NaN\n", "eia_cf.loc[eia_cf[\"capfac\"] < 0, \"capfac\"] = np.NaN\n", "eia_cf" @@ -199,7 +230,9 @@ "metadata": {}, "outputs": [], "source": [ - "px.histogram(eia_cf, x=\"capfac\", nbins=15, histnorm=\"percent\", width=500).update_xaxes(dtick=0.05)" + "px.histogram(eia_cf, x=\"capfac\", nbins=15, histnorm=\"percent\", width=500).update_xaxes(\n", + " dtick=0.05\n", + ")" ] }, { @@ -208,10 +241,12 @@ "metadata": {}, "outputs": [], "source": [ - "cems_cf = 
cems.merge(subplant_nameplate, how=\"left\", on=[\"plant_id_eia\",\"subplant_id\"])\n", - "cems_cf = cems_cf.groupby([\"plant_id_eia\",\"subplant_id\"])[[\"net_generation_mwh\",\"capacity_mw\"]].sum()\n", + "cems_cf = cems.merge(subplant_nameplate, how=\"left\", on=[\"plant_id_eia\", \"subplant_id\"])\n", + "cems_cf = cems_cf.groupby([\"plant_id_eia\", \"subplant_id\"])[\n", + " [\"net_generation_mwh\", \"capacity_mw\"]\n", + "].sum()\n", "cems_cf = cems_cf[cems_cf[\"capacity_mw\"] > 0]\n", - "cems_cf['capfac'] = cems_cf['net_generation_mwh'] / cems_cf['capacity_mw']\n", + "cems_cf[\"capfac\"] = cems_cf[\"net_generation_mwh\"] / cems_cf[\"capacity_mw\"]\n", "cems_cf.loc[cems_cf[\"capfac\"] > 1.2, \"capfac\"] = np.NaN\n", "cems_cf.loc[cems_cf[\"capfac\"] < 0, \"capfac\"] = np.NaN\n", "cems_cf" @@ -223,7 +258,9 @@ "metadata": {}, "outputs": [], "source": [ - "px.histogram(cems_cf, x=\"capfac\", nbins=15, histnorm=\"percent\", width=500).update_xaxes(dtick=0.05)" + "px.histogram(cems_cf, x=\"capfac\", nbins=15, histnorm=\"percent\", width=500).update_xaxes(\n", + " dtick=0.05\n", + ")" ] } ], diff --git a/notebooks/explore_data/manually_import_uncontrolled_emission_factors.ipynb b/notebooks/explore_data/manually_import_uncontrolled_emission_factors.ipynb index 89945fde..c288b336 100644 --- a/notebooks/explore_data/manually_import_uncontrolled_emission_factors.ipynb +++ b/notebooks/explore_data/manually_import_uncontrolled_emission_factors.ipynb @@ -40,31 +40,31 @@ " https://catalystcoop-pudl.readthedocs.io/en/latest/data_dictionaries/codes_and_labels.html\n", " \"\"\"\n", " column_names = [\n", - " 'readable_fuel_name',\n", - " 'eia_fuel_code',\n", - " 'source_and_tables',\n", - " 'emissions_units',\n", - " 'cyclone_firing_boiler',\n", - " 'fluidized_bed_firing_boiler',\n", - " 'stoker_boiler',\n", - " 'tangential_firing_boiler',\n", - " 'all_other_boiler_types',\n", - " 'combustion_turbine',\n", - " 'internal_combustion_engine'\n", + " \"readable_fuel_name\",\n", + " 
\"eia_fuel_code\",\n", + " \"source_and_tables\",\n", + " \"emissions_units\",\n", + " \"cyclone_firing_boiler\",\n", + " \"fluidized_bed_firing_boiler\",\n", + " \"stoker_boiler\",\n", + " \"tangential_firing_boiler\",\n", + " \"all_other_boiler_types\",\n", + " \"combustion_turbine\",\n", + " \"internal_combustion_engine\",\n", " ]\n", "\n", " column_dtypes = {\n", - " 'readable_fuel_name': 'str',\n", - " 'eia_fuel_code': 'str',\n", - " 'source_and_tables': 'str',\n", - " 'emissions_units': 'str',\n", + " \"readable_fuel_name\": \"str\",\n", + " \"eia_fuel_code\": \"str\",\n", + " \"source_and_tables\": \"str\",\n", + " \"emissions_units\": \"str\",\n", " }\n", - " \n", + "\n", " # Every other column is a float.\n", " for column_name in column_names:\n", " if column_name not in column_dtypes:\n", - " column_dtypes[column_name] = 'float64'\n", - " \n", + " column_dtypes[column_name] = \"float64\"\n", + "\n", " # NOTE(milo): Header starts on 2 for this one!\n", " df = pd.read_excel(\n", " io=path_to_xlsx,\n", @@ -89,35 +89,35 @@ "def load_nox_uncontrolled_efs(path_to_xlsx):\n", " \"\"\"\n", " https://catalystcoop-pudl.readthedocs.io/en/latest/data_dictionaries/codes_and_labels.html\n", - " \"\"\" \n", + " \"\"\"\n", " column_names = [\n", - " 'readable_fuel_name',\n", - " 'eia_fuel_code',\n", - " 'source_and_tables',\n", - " 'emissions_units',\n", - " 'cyclone_firing_boiler',\n", - " 'fluidized_bed_firing_boiler',\n", - " 'stoker_boiler',\n", - " 'tangential_firing_boiler_dry_bottom',\n", - " 'tangential_firing_boiler_wet_bottom',\n", - " 'all_other_boiler_types_dry_bottom',\n", - " 'all_other_boiler_types_wet_bottom',\n", - " 'combustion_turbine',\n", - " 'internal_combustion_engine'\n", + " \"readable_fuel_name\",\n", + " \"eia_fuel_code\",\n", + " \"source_and_tables\",\n", + " \"emissions_units\",\n", + " \"cyclone_firing_boiler\",\n", + " \"fluidized_bed_firing_boiler\",\n", + " \"stoker_boiler\",\n", + " \"tangential_firing_boiler_dry_bottom\",\n", + " 
\"tangential_firing_boiler_wet_bottom\",\n", + " \"all_other_boiler_types_dry_bottom\",\n", + " \"all_other_boiler_types_wet_bottom\",\n", + " \"combustion_turbine\",\n", + " \"internal_combustion_engine\",\n", " ]\n", - " \n", + "\n", " column_dtypes = {\n", - " 'readable_fuel_name': 'str',\n", - " 'eia_fuel_code': 'str',\n", - " 'source_and_tables': 'str',\n", - " 'emissions_units': 'str',\n", + " \"readable_fuel_name\": \"str\",\n", + " \"eia_fuel_code\": \"str\",\n", + " \"source_and_tables\": \"str\",\n", + " \"emissions_units\": \"str\",\n", " }\n", "\n", " # Every other column is a float.\n", " for column_name in column_names:\n", " if column_name not in column_dtypes:\n", - " column_dtypes[column_name] = 'float64'\n", - " \n", + " column_dtypes[column_name] = \"float64\"\n", + "\n", " # NOTE(milo): Header starts on 3 for this one!\n", " df = pd.read_excel(\n", " io=path_to_xlsx,\n", @@ -139,9 +139,13 @@ "metadata": {}, "outputs": [], "source": [ - "base_folder = filepaths.manual_folder('eia_electric_power_annual')\n", - "so2_uncontrolled_efs_path = os.path.join(base_folder, 'epa_a_01_so2_uncontrolled_efs.xlsx')\n", - "nox_uncontrolled_efs_path = os.path.join(base_folder, 'epa_a_02_nox_uncontrolled_efs.xlsx')" + "base_folder = filepaths.manual_folder(\"eia_electric_power_annual\")\n", + "so2_uncontrolled_efs_path = os.path.join(\n", + " base_folder, \"epa_a_01_so2_uncontrolled_efs.xlsx\"\n", + ")\n", + "nox_uncontrolled_efs_path = os.path.join(\n", + " base_folder, \"epa_a_02_nox_uncontrolled_efs.xlsx\"\n", + ")" ] }, { @@ -165,7 +169,7 @@ "def make_so2_emission_factor_rows(input_df):\n", " \"\"\"\n", " Convert the EIA emission factors excel format into the one we store in emission_factors_for_so2.csv.\n", - " \n", + "\n", " Columns:\n", " prime_mover_code,\n", " energy_source_code,\n", @@ -176,73 +180,72 @@ " multiply_by_sulfur_content\n", " \"\"\"\n", " min_column_idx = 4\n", - " \n", + "\n", " map_eia_units_to_ours = {\n", - " 'Lbs per MG': {\n", - " 
'emission_factor_numerator': 'lb',\n", - " 'emission_factor_denominator': 'thousand gallons',\n", + " \"Lbs per MG\": {\n", + " \"emission_factor_numerator\": \"lb\",\n", + " \"emission_factor_denominator\": \"thousand gallons\",\n", + " },\n", + " \"Lbs per MMCF\": {\n", + " \"emission_factor_numerator\": \"lb\",\n", + " \"emission_factor_denominator\": \"Mcf\",\n", " },\n", - " 'Lbs per MMCF': {\n", - " 'emission_factor_numerator': 'lb',\n", - " 'emission_factor_denominator': 'Mcf',\n", + " \"Lbs per ton\": {\n", + " \"emission_factor_numerator\": \"lb\",\n", + " \"emission_factor_denominator\": \"short ton\",\n", " },\n", - " 'Lbs per ton': {\n", - " 'emission_factor_numerator': 'lb',\n", - " 'emission_factor_denominator': 'short ton',\n", - " }\n", " }\n", - " \n", + "\n", " map_eia_combustion_system_to_ours = {\n", - " 'cyclone_firing_boiler' : {\n", - " 'prime_mover_code': TODO\n", - " 'boiler_bottom_type': 'N/A',\n", - " 'boiler_firing_type': 'STOKER',\n", - " 'multiple_by_sulfur_content': 0,\n", + " \"cyclone_firing_boiler\": {\n", + " \"prime_mover_code\": \"TODO\",\n", + " \"boiler_bottom_type\": \"N/A\",\n", + " \"boiler_firing_type\": \"STOKER\",\n", + " \"multiple_by_sulfur_content\": 0,\n", " },\n", - " 'fluidized_bed_firing_boiler' : {\n", - " 'prime_mover_code': TODO\n", - " 'boiler_bottom_type': 'N/A',\n", - " 'boiler_firing_type': 'FLUIDIZED',\n", - " 'multiple_by_sulfur_content': 0,\n", + " \"fluidized_bed_firing_boiler\": {\n", + " \"prime_mover_code\": \"TODO\",\n", + " \"boiler_bottom_type\": \"N/A\",\n", + " \"boiler_firing_type\": \"FLUIDIZED\",\n", + " \"multiple_by_sulfur_content\": 0,\n", " },\n", - " 'stoker_boiler' : {\n", - " 'prime_mover_code': 'ST' # Steam\n", - " 'boiler_bottom_type': 'N/A',\n", - " 'boiler_firing_type': 'STOKER',\n", - " 'multiple_by_sulfur_content': 0,\n", + " \"stoker_boiler\": {\n", + " \"prime_mover_code\": \"ST\", # Steam\n", + " \"boiler_bottom_type\": \"N/A\",\n", + " \"boiler_firing_type\": \"STOKER\",\n", 
+ " \"multiple_by_sulfur_content\": 0,\n", " },\n", - " 'tangential_firing_boiler' : {\n", - " 'prime_mover_code': 'ST' # Steam\n", - " 'boiler_bottom_type': 'N/A',\n", - " 'boiler_firing_type': 'TANGENTIAL',\n", - " 'multiple_by_sulfur_content': 0,\n", + " \"tangential_firing_boiler\": {\n", + " \"prime_mover_code\": \"ST\", # Steam\n", + " \"boiler_bottom_type\": \"N/A\",\n", + " \"boiler_firing_type\": \"TANGENTIAL\",\n", + " \"multiple_by_sulfur_content\": 0,\n", " },\n", - " 'all_other_boiler_types' : {\n", - " 'prime_mover_code': 'UNK',\n", - " 'boiler_firing_type': 'N/A',\n", - " 'multiply_by_sulfur_content': 0 \n", + " \"all_other_boiler_types\": {\n", + " \"prime_mover_code\": \"UNK\",\n", + " \"boiler_firing_type\": \"N/A\",\n", + " \"multiply_by_sulfur_content\": 0,\n", " },\n", - " 'combustion_turbine' : {\n", - " 'prime_mover_code': 'GT', # Gas combustion turbine.\n", - " 'boiler_firing_type': 'N/A',\n", - " 'multiply_by_sulfur_content': 0\n", + " \"combustion_turbine\": {\n", + " \"prime_mover_code\": \"GT\", # Gas combustion turbine.\n", + " \"boiler_firing_type\": \"N/A\",\n", + " \"multiply_by_sulfur_content\": 0,\n", + " },\n", + " \"internal_combustion_engine\": {\n", + " \"prime_mover_code\": \"IC\",\n", + " \"boiler_firing_type\": \"N/A\",\n", + " \"multiply_by_sulfur_content\": 0,\n", " },\n", - " 'internal_combustion_engine' : {\n", - " 'prime_mover_code': 'IC',\n", - " 'boiler_firing_type': 'N/A',\n", - " 'multiply_by_sulfur_content': 0\n", - " }\n", " }\n", - " \n", + "\n", " for i in range(len(input_df)):\n", " row = input_df.iloc[i]\n", - " units = row['emissions_units'].replace(' **', '') # Remove asterisks.\n", + " units = row[\"emissions_units\"].replace(\" **\", \"\") # Remove asterisks.\n", " print(units)\n", " mapped_units_dict = map_eia_units_to_ours[units]\n", " for colname in input_df.columns[min_column_idx:]:\n", " emission_factor = row[colname]\n", - " print(colname, ':', emission_factor)\n", - " " + " print(colname, \":\", 
emission_factor)" ] }, { @@ -266,7 +269,7 @@ ], "metadata": { "kernelspec": { - "display_name": "Python 3.10.4 ('open_grid_emissions')", + "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, @@ -280,7 +283,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.10.9 | packaged by conda-forge | (main, Jan 11 2023, 15:15:40) [MSC v.1916 64 bit (AMD64)]" + "version": "3.10.4" }, "vscode": { "interpreter": { diff --git a/notebooks/explore_methods/calculate_residual_net_generation.ipynb b/notebooks/explore_methods/calculate_residual_net_generation.ipynb index a7d3e415..f0373380 100644 --- a/notebooks/explore_methods/calculate_residual_net_generation.ipynb +++ b/notebooks/explore_methods/calculate_residual_net_generation.ipynb @@ -1,835 +1,841 @@ { - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Cleaning of 930, analyze how cleaning affects residual profile" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# data source https://gridemissions.s3.us-east-2.amazonaws.com/EBA_elec.csv.gz\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import numpy as np\n", - "import pandas as pd\n", - "\n", - "import plotly.express as px\n", - "import plotly.graph_objects as go\n", - "\n", - "import datetime as dt\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "%reload_ext autoreload\n", - "%autoreload 2\n", - "\n", - "# Tell python where to look for modules. 
\n", - "# Depending on how your jupyter handles working directories, this may not be needed.\n", - "import sys\n", - "sys.path.append('../../open-grid-emissions/')\n", - "\n", - "from src.visualization import day_hour_heatmap\n", - "from src.eia930 import fuel_code_map, reformat_chalendar, load_chalendar, load_chalendar_for_pipeline\n", - "from src.download_data import download_chalendar_files\n", - "from src.data_cleaning import distribute_monthly_eia_data_to_hourly\n", - "from src.impute_hourly_profiles import aggregate_for_residual, calculate_residual" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "year = 2020\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Download data if not exists\n", - "download_chalendar_files()\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "original = load_chalendar_for_pipeline(\n", - " \"../data/eia930/chalendar/EBA_adjusted_rolling.csv\", year=year\n", - ")\n", - "original.head(5)\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "cleaned = load_chalendar_for_pipeline(\n", - " \"../data/eia930/chalendar/EBA_adjusted_elec.csv\", year=year\n", - ")\n", - "cleaned.head(5)\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Name column same as CEMS. 
TODO: make eia930 output use this name\n", - "cleaned = cleaned.rename(columns={\"datetime_utc\": \"datetime_utc\"})\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# load hourly CEMS data outputted from main data pipeline\n", - "cems = pd.read_csv(\n", - " f\"../data/outputs/cems_{year}.csv\",\n", - " parse_dates=[\"datetime_utc\"],\n", - ")\n", - "cems.head(5)\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Aggregate CEMS data and merge with EIA-930 data" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# # combine original and cleaned EIA-930 data\n", - "# combined_data = cleaned.merge(\n", - "# original[[\"ba_code\", \"fuel_category\", \"datetime_utc\", \"net_generation_mwh_930\"]],\n", - "# how=\"left\",\n", - "# on=[\"ba_code\", \"fuel_category\", \"datetime_utc\"],\n", - "# suffixes=(\"_clean\", \"_orig\"),\n", - "# )\n", - "# combined_data\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Load plant attributes (including BA codes)\n", - "plant_attributes = pd.read_csv(\"../data/outputs/plant_static_attributes.csv\")\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "plant_attributes.head()\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "cems = cems.merge(plant_attributes, how=\"left\", on=\"plant_id_eia\")\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "aggregate_for_residual(cems, \"datetime_utc\", \"ba_code_physical\", transmission=True)\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# perform different groupby operations so that we can compare 
different ways of aggregating the cems data\n", - "\n", - "# aggregate all generation by commercial BA\n", - "cems_bac_all = (\n", - " cems.groupby([\"ba_code\", \"fuel_category_eia930\", \"datetime_utc\"])[\n", - " \"net_generation_mwh\"\n", - " ]\n", - " .sum()\n", - " .reset_index()\n", - " .rename(\n", - " columns={\n", - " \"fuel_category_eia930\": \"fuel_category\",\n", - " \"net_generation_mwh\": \"net_generation_mwh_bac_all\",\n", - " }\n", - " )\n", - ")\n", - "\n", - "# aggregate all generation by physical BA\n", - "cems_bap_all = (\n", - " cems.groupby([\"ba_code_physical\", \"fuel_category_eia930\", \"datetime_utc\"])[\n", - " \"net_generation_mwh\"\n", - " ]\n", - " .sum()\n", - " .reset_index()\n", - " .rename(\n", - " columns={\n", - " \"fuel_category_eia930\": \"fuel_category\",\n", - " \"ba_code_physical\": \"ba_code\",\n", - " \"net_generation_mwh\": \"net_generation_mwh_bap_all\",\n", - " }\n", - " )\n", - ")\n", - "\n", - "# Aggregate transmission-connected generation by commercial BA\n", - "cems_bac_trans = (\n", - " cems[cems[\"distribution_flag\"] is False]\n", - " .groupby([\"ba_code\", \"fuel_category_eia930\", \"datetime_utc\"])[\"net_generation_mwh\"]\n", - " .sum()\n", - " .reset_index()\n", - " .rename(\n", - " columns={\n", - " \"fuel_category_eia930\": \"fuel_category\",\n", - " \"net_generation_mwh\": \"net_generation_mwh_bac_trans\",\n", - " }\n", - " )\n", - ")\n", - "\n", - "# Aggregate transmission-connected generation by physical BA\n", - "cems_bap_trans = (\n", - " cems[cems[\"distribution_flag\"] == False]\n", - " .groupby([\"ba_code_physical\", \"fuel_category_eia930\", \"datetime_utc\"])[\n", - " \"net_generation_mwh\"\n", - " ]\n", - " .sum()\n", - " .reset_index()\n", - " .rename(\n", - " columns={\n", - " \"fuel_category_eia930\": \"fuel_category\",\n", - " \"ba_code_physical\": \"ba_code\",\n", - " \"net_generation_mwh\": \"net_generation_mwh_bap_trans\",\n", - " }\n", - " )\n", - ")\n" - ] - }, - { - "cell_type": 
"code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# merge the aggregated data into the EIA data\n", - "combined_data = cleaned.merge(\n", - " cems_bac_all, how=\"left\", on=[\"ba_code\", \"fuel_category\", \"datetime_utc\"]\n", - ").fillna(0)\n", - "combined_data = combined_data.merge(\n", - " cems_bap_all, how=\"left\", on=[\"ba_code\", \"fuel_category\", \"datetime_utc\"]\n", - ").fillna(0)\n", - "combined_data = combined_data.merge(\n", - " cems_bac_trans, how=\"left\", on=[\"ba_code\", \"fuel_category\", \"datetime_utc\"]\n", - ").fillna(0)\n", - "combined_data = combined_data.merge(\n", - " cems_bap_trans, how=\"left\", on=[\"ba_code\", \"fuel_category\", \"datetime_utc\"]\n", - ").fillna(0)\n", - "combined_data\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "cleaned\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# only keep rows where local datetime is in the current year\n", - "combined_data = combined_data[\n", - " combined_data[\"datetime_local\"].apply(lambda x: x.year) == year\n", - "]\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Evaluate BA mappings\n", - "\n", - "Mapping options: \n", - "Physical or commercial BA; include or exclude distribution-connected generation \n", - "\n", - "Metric: 923 data aggregated to BA should be close to 930 data aggregated to month. \n", - "For each BA, which aggregation metric minimizes difference? \n", - "How different are the aggreagtion metrics? 
" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "eia923 = pd.read_csv(f\"../data/outputs/eia923_allocated_{year}.csv\")\n", - "eia923.report_date = pd.to_datetime(\n", - " eia923.report_date\n", - ") # TODO why is this not a date already?\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "eia923 = eia923.merge(plant_attributes, how=\"left\", on=\"plant_id_eia\")\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "eia930_agg = (\n", - " cleaned.groupby([\"ba_code\", \"fuel_category\", \"report_date\"]).sum().reset_index()\n", - ")\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "eia923.head()\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "eia930_agg.head()\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "eia923_agg = eia930_agg.copy()\n", - "# aggregate all generation by commercial BA\n", - "for ba_key in [\"ba_code\", \"ba_code_physical\"]:\n", - " for transmission_only in [\"transmission\", \"all\"]:\n", - " transmission_key = transmission_only == \"transmission\"\n", - " aggregated_gen = aggregate_for_residual(\n", - " eia923, time_key=\"report_date\", ba_key=ba_key, transmission=transmission_key\n", - " )\n", - " aggregated_gen.rename(\n", - " columns={\"net_generation_mwh\": f\"mwh_{ba_key}_{transmission_only}\"},\n", - " inplace=True,\n", - " )\n", - " aggregated_gen[f\"difference_{ba_key}_{transmission_only}\"] = (\n", - " eia923_agg[\"net_generation_mwh_930\"]\n", - " - aggregated_gen[f\"mwh_{ba_key}_{transmission_only}\"]\n", - " )\n", - " eia923_agg = eia923_agg.merge(\n", - " aggregated_gen, how=\"left\", on=[\"ba_code\", \"fuel_category\", 
\"report_date\"]\n", - " )\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "eia923_agg.head()\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "fig = px.histogram(\n", - " eia923_agg,\n", - " x=[\n", - " \"difference_ba_code_transmission\",\n", - " \"difference_ba_code_all\",\n", - " \"difference_ba_code_physical_transmission\",\n", - " \"difference_ba_code_physical_all\",\n", - " ],\n", - ")\n", - "\n", - "# Overlay both histograms\n", - "fig.update_layout(barmode=\"overlay\")\n", - "# Reduce opacity to see both histograms\n", - "fig.update_traces(opacity=0.25)\n", - "fig.show()\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Visualize net generation data from each source in a single BA\n", - "Only visualize non-renewable data" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "ba = \"MISO\"\n", - "\n", - "data_to_visualize = combined_data[\n", - " (combined_data[\"ba_code\"] == ba)\n", - " & (~combined_data[\"fuel_category\"].isin([\"hydro\", \"solar\", \"wind\"]))\n", - "]\n", - "\n", - "px.line(\n", - " data_to_visualize,\n", - " x=\"datetime_local\",\n", - " y=[\n", - " \"net_generation_mwh_930_clean\",\n", - " \"net_generation_mwh_930_orig\",\n", - " \"net_generation_mwh_bac_all\",\n", - " \"net_generation_mwh_bap_all\",\n", - " \"net_generation_mwh_bac_trans\",\n", - " \"net_generation_mwh_bap_trans\",\n", - " ],\n", - " facet_col=\"fuel_category\",\n", - " height=1000,\n", - " facet_col_wrap=1,\n", - ").update_yaxes(matches=None)\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Calculate the residual based on a single CEMS aggregation" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# for now, let's pick a single cems aggregation to 
use to calculate a residual\n", - "cems_data_column = \"net_generation_mwh_bac_all\"\n", - "\n", - "combined_data[\"residual\"] = (\n", - " combined_data[\"net_generation_mwh_930_clean\"] - combined_data[cems_data_column]\n", - ")\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Visualize residual for the BA" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "data_to_visualize = combined_data[\n", - " (combined_data[\"ba_code\"] == ba)\n", - " & (~combined_data[\"fuel_category\"].isin([\"hydro\", \"solar\", \"wind\"]))\n", - "]\n", - "\n", - "px.line(\n", - " data_to_visualize,\n", - " x=\"datetime_local\",\n", - " y=[\"net_generation_mwh_930_clean\", cems_data_column, \"residual\"],\n", - " facet_col=\"fuel_category\",\n", - " height=1000,\n", - " facet_col_wrap=1,\n", - ").update_yaxes(matches=None)\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Test Scaling Strategy\n", - "If the residual is ever negative, we want to scale the cems net generation data to always be less than or equal to the 930 net generation. \n", - "\n", - "To do this, we'll try scaling the data as a percentage:\n", - "1. For each hour, calculate the ratio between 930 NG and CEMS NG.\n", - "2. For each BA-fuel, find the minimum ratio. If the minimum ratio is >= 1, it means that 930 is always greater than CEMS and doesn't need to be scaled. For any BA-fuels where the ratio is < 1, we will use this as a scaling factor to scale the CEMS data such that the scaled data is always <= the 930 data\n", - "3. 
Multiply all hourly CEMS values by the scaling factor" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# only keep data where the cems data is greater than zero\n", - "scaling_factors = combined_data.copy()[combined_data[cems_data_column] != 0]\n", - "\n", - "# calculate the ratio of 930 net generation to cems net generation\n", - "# if correct, ratio should be >=1\n", - "scaling_factors[\"scaling_factor\"] = (\n", - " scaling_factors[\"net_generation_mwh_930_clean\"] / scaling_factors[cems_data_column]\n", - ")\n", - "\n", - "# find the minimum ratio for each ba-fuel\n", - "scaling_factors = (\n", - " scaling_factors.groupby([\"ba_code\", \"fuel_category\"])[\"scaling_factor\"]\n", - " .min()\n", - " .reset_index()\n", - ")\n", - "scaling_factors\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# merge the scaling factor into the combined data\n", - "# for any BA-fuels without a scaling factor, fill with 1 (scale to 100% of the origina data)\n", - "combined_data = combined_data.merge(\n", - " scaling_factors, how=\"left\", on=[\"ba_code\", \"fuel_category\"]\n", - ").fillna(1)\n", - "\n", - "# calculate the scaled cems data\n", - "combined_data[\"cems_scaled\"] = (\n", - " combined_data[cems_data_column] * combined_data[\"scaling_factor\"]\n", - ")\n", - "\n", - "# calculate a scaled residual\n", - "combined_data[\"residual_scaled\"] = (\n", - " combined_data[\"net_generation_mwh_930_clean\"] - combined_data[\"cems_scaled\"]\n", - ")\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Plot scaled residuals" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "ba = \"PJM\"\n", - "\n", - "data_to_visualize = combined_data[\n", - " (combined_data[\"ba_code\"] == ba)\n", - " & (~combined_data[\"fuel_category\"].isin([\"hydro\", \"solar\", 
\"wind\"]))\n", - "]\n", - "\n", - "px.line(\n", - " data_to_visualize,\n", - " x=\"datetime_local\",\n", - " y=[\n", - " \"net_generation_mwh_930_clean\",\n", - " cems_data_column,\n", - " \"cems_scaled\",\n", - " \"residual\",\n", - " \"residual_scaled\",\n", - " ],\n", - " facet_col=\"fuel_category\",\n", - " height=1000,\n", - " facet_col_wrap=1,\n", - ").update_yaxes(matches=None)\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "combined_data = combined_data.reset_index()\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# broken\n", - "\n", - "ba = \"MISO\"\n", - "fuel = \"natural_gas\"\n", - "\n", - "data_to_visualize = combined_data.copy()[\n", - " (combined_data[\"ba_code\"] == ba) & (combined_data[\"fuel_category\"] == fuel)\n", - "]\n", - "data_to_visualize[\"datetime_local\"] = pd.to_datetime(\n", - " data_to_visualize[\"datetime_local\"]\n", - ")\n", - "data_to_visualize[\"date\"] = data_to_visualize[\"datetime_local\"].dt.date\n", - "data_to_visualize[\"hour\"] = data_to_visualize[\"datetime_local\"].dt.hour\n", - "\n", - "# data_to_visualize = data_to_visualize.pivot(index='hour', columns='date', values='residual_scaled')\n", - "\n", - "# px.imshow(data_to_visualize, color_continuous_scale=\"RdBu\", width=1000, height=400, color_continuous_midpoint=0,)\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Export the profile" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "data_to_export = combined_data[\n", - " [\n", - " \"ba_code\",\n", - " \"fuel_category\",\n", - " \"datetime_utc\",\n", - " \"datetime_local\",\n", - " \"report_date\",\n", - " \"residual_scaled\",\n", - " ]\n", - "]\n", - "data_to_export.to_csv(\"../data/output/residual_profiles.csv\", index=False)\n" - ] - }, - { - "cell_type": "markdown", - "metadata": 
{}, - "source": [ - "# Evaluate profile quality\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "with open(\"../data/outputs/residual_profiles.csv\") as f:\n", - " line = f.readline()\n", - " print(line.split(\",\"))\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "cems.head()\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Produced by data_pipeline\n", - "eia = pd.read_csv(\"../data/output/eia923_for_residual.csv\")\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Produced by plant_class_differences\n", - "# TODO use output plant data to find smallest plants after cleaning -- some of these are in \"no cems\" data categories\n", - "validation_plants = pd.read_csv(\"../data/output/validation_plants.csv\")\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "to_distribute = eia[eia.plant_id_eia.isin(validation_plants.plant_id_eia)]\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "to_distribute.head()\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "distributed = distribute_monthly_eia_data_to_hourly(\n", - " to_distribute, combined_data, \"residual_scaled\"\n", - ")\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "cleaned = load_chalendar_for_pipeline(\n", - " \"../data/eia930/chalendar/EBA_adjusted_elec.csv\", year=year\n", - ")\n", - "cems = pd.read_csv(\n", - " f\"../data/outputs/cems_{year}.csv\",\n", - " parse_dates=[\"datetime_utc\"],\n", - ")\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - 
"metadata": {}, - "outputs": [], - "source": [ - "cleaned = cleaned.rename(columns={\"datetime_utc\": \"datetime_utc\"})\n", - "cems = cems.rename(columns={\"datetime_utc\": \"datetime_utc\"})\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "plant_attributes = pd.read_csv(\"../data/outputs/plant_static_attributes.csv\")\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "cleaned.head()\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "calculate_residual(cems, cleaned, plant_attributes, 2020)\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "cems\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3.10.5 ('hourly_egrid')", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.10.5" - }, - "orig_nbformat": 4, - "vscode": { - "interpreter": { - "hash": "4103f3cd497821eca917ea303dbe10c590d787eb7d2dc3fd4e15dec0356e7931" - } - } + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Cleaning of 930, analyze how cleaning affects residual profile" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# data source https://gridemissions.s3.us-east-2.amazonaws.com/EBA_elec.csv.gz\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import numpy as np\n", + "import pandas as pd\n", + "\n", + 
"import plotly.express as px\n", + "import plotly.graph_objects as go\n", + "\n", + "import datetime as dt" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%reload_ext autoreload\n", + "%autoreload 2\n", + "\n", + "# Tell python where to look for modules.\n", + "# Depending on how your jupyter handles working directories, this may not be needed.\n", + "import sys\n", + "\n", + "sys.path.append(\"../../open-grid-emissions/\")\n", + "\n", + "from src.visualization import day_hour_heatmap\n", + "from src.eia930 import (\n", + " fuel_code_map,\n", + " reformat_chalendar,\n", + " load_chalendar,\n", + " load_chalendar_for_pipeline,\n", + ")\n", + "from src.download_data import download_chalendar_files\n", + "from src.data_cleaning import distribute_monthly_eia_data_to_hourly\n", + "from src.impute_hourly_profiles import aggregate_for_residual, calculate_residual" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "year = 2020" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Download data if not exists\n", + "download_chalendar_files()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "original = load_chalendar_for_pipeline(\n", + " \"../data/eia930/chalendar/EBA_adjusted_rolling.csv\", year=year\n", + ")\n", + "original.head(5)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "cleaned = load_chalendar_for_pipeline(\n", + " \"../data/eia930/chalendar/EBA_adjusted_elec.csv\", year=year\n", + ")\n", + "cleaned.head(5)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Name column same as CEMS. 
TODO: make eia930 output use this name\n", + "cleaned = cleaned.rename(columns={\"datetime_utc\": \"datetime_utc\"})" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# load hourly CEMS data outputted from main data pipeline\n", + "cems = pd.read_csv(\n", + " f\"../data/outputs/cems_{year}.csv\",\n", + " parse_dates=[\"datetime_utc\"],\n", + ")\n", + "cems.head(5)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Aggregate CEMS data and merge with EIA-930 data" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# # combine original and cleaned EIA-930 data\n", + "# combined_data = cleaned.merge(\n", + "# original[[\"ba_code\", \"fuel_category\", \"datetime_utc\", \"net_generation_mwh_930\"]],\n", + "# how=\"left\",\n", + "# on=[\"ba_code\", \"fuel_category\", \"datetime_utc\"],\n", + "# suffixes=(\"_clean\", \"_orig\"),\n", + "# )\n", + "# combined_data\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Load plant attributes (including BA codes)\n", + "plant_attributes = pd.read_csv(\"../data/outputs/plant_static_attributes.csv\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "plant_attributes.head()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "cems = cems.merge(plant_attributes, how=\"left\", on=\"plant_id_eia\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "aggregate_for_residual(cems, \"datetime_utc\", \"ba_code_physical\", transmission=True)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# perform different groupby operations so that we can compare different ways 
of aggregating the cems data\n", + "\n", + "# aggregate all generation by commercial BA\n", + "cems_bac_all = (\n", + " cems.groupby([\"ba_code\", \"fuel_category_eia930\", \"datetime_utc\"])[\n", + " \"net_generation_mwh\"\n", + " ]\n", + " .sum()\n", + " .reset_index()\n", + " .rename(\n", + " columns={\n", + " \"fuel_category_eia930\": \"fuel_category\",\n", + " \"net_generation_mwh\": \"net_generation_mwh_bac_all\",\n", + " }\n", + " )\n", + ")\n", + "\n", + "# aggregate all generation by physical BA\n", + "cems_bap_all = (\n", + " cems.groupby([\"ba_code_physical\", \"fuel_category_eia930\", \"datetime_utc\"])[\n", + " \"net_generation_mwh\"\n", + " ]\n", + " .sum()\n", + " .reset_index()\n", + " .rename(\n", + " columns={\n", + " \"fuel_category_eia930\": \"fuel_category\",\n", + " \"ba_code_physical\": \"ba_code\",\n", + " \"net_generation_mwh\": \"net_generation_mwh_bap_all\",\n", + " }\n", + " )\n", + ")\n", + "\n", + "# Aggregate transmission-connected generation by commercial BA\n", + "cems_bac_trans = (\n", + " cems[cems[\"distribution_flag\"] is False]\n", + " .groupby([\"ba_code\", \"fuel_category_eia930\", \"datetime_utc\"])[\"net_generation_mwh\"]\n", + " .sum()\n", + " .reset_index()\n", + " .rename(\n", + " columns={\n", + " \"fuel_category_eia930\": \"fuel_category\",\n", + " \"net_generation_mwh\": \"net_generation_mwh_bac_trans\",\n", + " }\n", + " )\n", + ")\n", + "\n", + "# Aggregate transmission-connected generation by physical BA\n", + "cems_bap_trans = (\n", + " cems[cems[\"distribution_flag\"] == False]\n", + " .groupby([\"ba_code_physical\", \"fuel_category_eia930\", \"datetime_utc\"])[\n", + " \"net_generation_mwh\"\n", + " ]\n", + " .sum()\n", + " .reset_index()\n", + " .rename(\n", + " columns={\n", + " \"fuel_category_eia930\": \"fuel_category\",\n", + " \"ba_code_physical\": \"ba_code\",\n", + " \"net_generation_mwh\": \"net_generation_mwh_bap_trans\",\n", + " }\n", + " )\n", + ")" + ] + }, + { + "cell_type": "code", + 
"execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# merge the aggregated data into the EIA data\n", + "combined_data = cleaned.merge(\n", + " cems_bac_all, how=\"left\", on=[\"ba_code\", \"fuel_category\", \"datetime_utc\"]\n", + ").fillna(0)\n", + "combined_data = combined_data.merge(\n", + " cems_bap_all, how=\"left\", on=[\"ba_code\", \"fuel_category\", \"datetime_utc\"]\n", + ").fillna(0)\n", + "combined_data = combined_data.merge(\n", + " cems_bac_trans, how=\"left\", on=[\"ba_code\", \"fuel_category\", \"datetime_utc\"]\n", + ").fillna(0)\n", + "combined_data = combined_data.merge(\n", + " cems_bap_trans, how=\"left\", on=[\"ba_code\", \"fuel_category\", \"datetime_utc\"]\n", + ").fillna(0)\n", + "combined_data" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "cleaned" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# only keep rows where local datetime is in the current year\n", + "combined_data = combined_data[\n", + " combined_data[\"datetime_local\"].apply(lambda x: x.year) == year\n", + "]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Evaluate BA mappings\n", + "\n", + "Mapping options: \n", + "Physical or commercial BA; include or exclude distribution-connected generation \n", + "\n", + "Metric: 923 data aggregated to BA should be close to 930 data aggregated to month. \n", + "For each BA, which aggregation metric minimizes difference? \n", + "How different are the aggregation metrics? " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "eia923 = pd.read_csv(f\"../data/outputs/eia923_allocated_{year}.csv\")\n", + "eia923.report_date = pd.to_datetime(\n", + " eia923.report_date\n", + ") # TODO why is this not a date already?"
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "eia923 = eia923.merge(plant_attributes, how=\"left\", on=\"plant_id_eia\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "eia930_agg = (\n", + " cleaned.groupby([\"ba_code\", \"fuel_category\", \"report_date\"]).sum().reset_index()\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "eia923.head()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "eia930_agg.head()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "eia923_agg = eia930_agg.copy()\n", + "# aggregate all generation by commercial BA\n", + "for ba_key in [\"ba_code\", \"ba_code_physical\"]:\n", + " for transmission_only in [\"transmission\", \"all\"]:\n", + " transmission_key = transmission_only == \"transmission\"\n", + " aggregated_gen = aggregate_for_residual(\n", + " eia923, time_key=\"report_date\", ba_key=ba_key, transmission=transmission_key\n", + " )\n", + " aggregated_gen.rename(\n", + " columns={\"net_generation_mwh\": f\"mwh_{ba_key}_{transmission_only}\"},\n", + " inplace=True,\n", + " )\n", + " aggregated_gen[f\"difference_{ba_key}_{transmission_only}\"] = (\n", + " eia923_agg[\"net_generation_mwh_930\"]\n", + " - aggregated_gen[f\"mwh_{ba_key}_{transmission_only}\"]\n", + " )\n", + " eia923_agg = eia923_agg.merge(\n", + " aggregated_gen, how=\"left\", on=[\"ba_code\", \"fuel_category\", \"report_date\"]\n", + " )" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "eia923_agg.head()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "fig = px.histogram(\n", + " eia923_agg,\n", + " 
x=[\n", + " \"difference_ba_code_transmission\",\n", + " \"difference_ba_code_all\",\n", + " \"difference_ba_code_physical_transmission\",\n", + " \"difference_ba_code_physical_all\",\n", + " ],\n", + ")\n", + "\n", + "# Overlay both histograms\n", + "fig.update_layout(barmode=\"overlay\")\n", + "# Reduce opacity to see both histograms\n", + "fig.update_traces(opacity=0.25)\n", + "fig.show()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Visualize net generation data from each source in a single BA\n", + "Only visualize non-renewable data" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "ba = \"MISO\"\n", + "\n", + "data_to_visualize = combined_data[\n", + " (combined_data[\"ba_code\"] == ba)\n", + " & (~combined_data[\"fuel_category\"].isin([\"hydro\", \"solar\", \"wind\"]))\n", + "]\n", + "\n", + "px.line(\n", + " data_to_visualize,\n", + " x=\"datetime_local\",\n", + " y=[\n", + " \"net_generation_mwh_930_clean\",\n", + " \"net_generation_mwh_930_orig\",\n", + " \"net_generation_mwh_bac_all\",\n", + " \"net_generation_mwh_bap_all\",\n", + " \"net_generation_mwh_bac_trans\",\n", + " \"net_generation_mwh_bap_trans\",\n", + " ],\n", + " facet_col=\"fuel_category\",\n", + " height=1000,\n", + " facet_col_wrap=1,\n", + ").update_yaxes(matches=None)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Calculate the residual based on a single CEMS aggregation" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# for now, let's pick a single cems aggregation to use to calculate a residual\n", + "cems_data_column = \"net_generation_mwh_bac_all\"\n", + "\n", + "combined_data[\"residual\"] = (\n", + " combined_data[\"net_generation_mwh_930_clean\"] - combined_data[cems_data_column]\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Visualize residual for 
the BA" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "data_to_visualize = combined_data[\n", + " (combined_data[\"ba_code\"] == ba)\n", + " & (~combined_data[\"fuel_category\"].isin([\"hydro\", \"solar\", \"wind\"]))\n", + "]\n", + "\n", + "px.line(\n", + " data_to_visualize,\n", + " x=\"datetime_local\",\n", + " y=[\"net_generation_mwh_930_clean\", cems_data_column, \"residual\"],\n", + " facet_col=\"fuel_category\",\n", + " height=1000,\n", + " facet_col_wrap=1,\n", + ").update_yaxes(matches=None)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Test Scaling Strategy\n", + "If the residual is ever negative, we want to scale the cems net generation data to always be less than or equal to the 930 net generation. \n", + "\n", + "To do this, we'll try scaling the data as a percentage:\n", + "1. For each hour, calculate the ratio between 930 NG and CEMS NG.\n", + "2. For each BA-fuel, find the minimum ratio. If the minimum ratio is >= 1, it means that 930 is always greater than CEMS and doesn't need to be scaled. For any BA-fuels where the ratio is < 1, we will use this as a scaling factor to scale the CEMS data such that the scaled data is always <= the 930 data\n", + "3. 
Multiply all hourly CEMS values by the scaling factor" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# only keep data where the cems data is nonzero\n", + "scaling_factors = combined_data.copy()[combined_data[cems_data_column] != 0]\n", + "\n", + "# calculate the ratio of 930 net generation to cems net generation\n", + "# if correct, ratio should be >=1\n", + "scaling_factors[\"scaling_factor\"] = (\n", + " scaling_factors[\"net_generation_mwh_930_clean\"] / scaling_factors[cems_data_column]\n", + ")\n", + "\n", + "# find the minimum ratio for each ba-fuel\n", + "scaling_factors = (\n", + " scaling_factors.groupby([\"ba_code\", \"fuel_category\"])[\"scaling_factor\"]\n", + " .min()\n", + " .reset_index()\n", + ")\n", + "scaling_factors" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# merge the scaling factor into the combined data\n", + "# for any BA-fuels without a scaling factor, fill with 1 (scale to 100% of the original data)\n", + "combined_data = combined_data.merge(\n", + " scaling_factors, how=\"left\", on=[\"ba_code\", \"fuel_category\"]\n", + ").fillna(1)\n", + "\n", + "# calculate the scaled cems data\n", + "combined_data[\"cems_scaled\"] = (\n", + " combined_data[cems_data_column] * combined_data[\"scaling_factor\"]\n", + ")\n", + "\n", + "# calculate a scaled residual\n", + "combined_data[\"residual_scaled\"] = (\n", + " combined_data[\"net_generation_mwh_930_clean\"] - combined_data[\"cems_scaled\"]\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Plot scaled residuals" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "ba = \"PJM\"\n", + "\n", + "data_to_visualize = combined_data[\n", + " (combined_data[\"ba_code\"] == ba)\n", + " & (~combined_data[\"fuel_category\"].isin([\"hydro\", \"solar\",
\"wind\"]))\n", + "]\n", + "\n", + "px.line(\n", + " data_to_visualize,\n", + " x=\"datetime_local\",\n", + " y=[\n", + " \"net_generation_mwh_930_clean\",\n", + " cems_data_column,\n", + " \"cems_scaled\",\n", + " \"residual\",\n", + " \"residual_scaled\",\n", + " ],\n", + " facet_col=\"fuel_category\",\n", + " height=1000,\n", + " facet_col_wrap=1,\n", + ").update_yaxes(matches=None)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "combined_data = combined_data.reset_index()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# broken\n", + "\n", + "ba = \"MISO\"\n", + "fuel = \"natural_gas\"\n", + "\n", + "data_to_visualize = combined_data.copy()[\n", + " (combined_data[\"ba_code\"] == ba) & (combined_data[\"fuel_category\"] == fuel)\n", + "]\n", + "data_to_visualize[\"datetime_local\"] = pd.to_datetime(\n", + " data_to_visualize[\"datetime_local\"]\n", + ")\n", + "data_to_visualize[\"date\"] = data_to_visualize[\"datetime_local\"].dt.date\n", + "data_to_visualize[\"hour\"] = data_to_visualize[\"datetime_local\"].dt.hour\n", + "\n", + "# data_to_visualize = data_to_visualize.pivot(index='hour', columns='date', values='residual_scaled')\n", + "\n", + "# px.imshow(data_to_visualize, color_continuous_scale=\"RdBu\", width=1000, height=400, color_continuous_midpoint=0,)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Export the profile" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "data_to_export = combined_data[\n", + " [\n", + " \"ba_code\",\n", + " \"fuel_category\",\n", + " \"datetime_utc\",\n", + " \"datetime_local\",\n", + " \"report_date\",\n", + " \"residual_scaled\",\n", + " ]\n", + "]\n", + "data_to_export.to_csv(\"../data/output/residual_profiles.csv\", index=False)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + 
"source": [ + "# Evaluate profile quality\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "with open(\"../data/outputs/residual_profiles.csv\") as f:\n", + " line = f.readline()\n", + " print(line.split(\",\"))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "cems.head()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Produced by data_pipeline\n", + "eia = pd.read_csv(\"../data/output/eia923_for_residual.csv\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Produced by plant_class_differences\n", + "# TODO use output plant data to find smallest plants after cleaning -- some of these are in \"no cems\" data categories\n", + "validation_plants = pd.read_csv(\"../data/output/validation_plants.csv\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "to_distribute = eia[eia.plant_id_eia.isin(validation_plants.plant_id_eia)]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "to_distribute.head()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "distributed = distribute_monthly_eia_data_to_hourly(\n", + " to_distribute, combined_data, \"residual_scaled\"\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "cleaned = load_chalendar_for_pipeline(\n", + " \"../data/eia930/chalendar/EBA_adjusted_elec.csv\", year=year\n", + ")\n", + "cems = pd.read_csv(\n", + " f\"../data/outputs/cems_{year}.csv\",\n", + " parse_dates=[\"datetime_utc\"],\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": 
[], + "source": [ + "cleaned = cleaned.rename(columns={\"datetime_utc\": \"datetime_utc\"})\n", + "cems = cems.rename(columns={\"datetime_utc\": \"datetime_utc\"})" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "plant_attributes = pd.read_csv(\"../data/outputs/plant_static_attributes.csv\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "cleaned.head()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "calculate_residual(cems, cleaned, plant_attributes, 2020)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "cems" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3.10.5 ('hourly_egrid')", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 }, - "nbformat": 4, - "nbformat_minor": 2 + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.5" + }, + "orig_nbformat": 4, + "vscode": { + "interpreter": { + "hash": "4103f3cd497821eca917ea303dbe10c590d787eb7d2dc3fd4e15dec0356e7931" + } + } + }, + "nbformat": 4, + "nbformat_minor": 2 } diff --git a/notebooks/explore_methods/national_wind_solar_correlations.ipynb b/notebooks/explore_methods/national_wind_solar_correlations.ipynb index 1b6a7669..99b4ff37 100644 --- a/notebooks/explore_methods/national_wind_solar_correlations.ipynb +++ b/notebooks/explore_methods/national_wind_solar_correlations.ipynb @@ -27,7 +27,8 @@ "\n", "# # Tell python where to look for modules.\n", "import sys\n", - "sys.path.append('../../../open-grid-emissions/src/')\n", + "\n", + 
"sys.path.append(\"../../../open-grid-emissions/src/\")\n", "\n", "\n", "import eia930\n", @@ -47,7 +48,7 @@ "# load eia930 data\n", "\n", "# If running small, we didn't clean the whole year, so need to use the Chalender file to build residual profiles.\n", - "clean_930_file = (f\"{outputs_folder()}{path_prefix}/eia930/eia930_elec.csv\")\n", + "clean_930_file = f\"{outputs_folder()}{path_prefix}/eia930/eia930_elec.csv\"\n", "eia930_data = eia930.load_chalendar_for_pipeline(clean_930_file, year=year)\n", "# until we can fix the physics reconciliation, we need to apply some post-processing steps\n", "eia930_data = eia930.remove_imputed_ones(eia930_data)\n", @@ -67,8 +68,8 @@ "metadata": {}, "outputs": [], "source": [ - "fuel=\"wind\"\n", - "report_date=\"2020-11-01\"\n", + "fuel = \"wind\"\n", + "report_date = \"2020-11-01\"\n", "\n", "df_temporary = eia930_data.copy()[\n", " (eia930_data[\"fuel_category_eia930\"] == fuel)\n", @@ -89,7 +90,9 @@ "outputs": [], "source": [ "# how well correlated are profiles across utc time\n", - "df_temporary.pivot(index=\"datetime_utc\", columns=\"ba_code\", values=\"net_generation_mwh_930\").corr().mean().mean()" + "df_temporary.pivot(\n", + " index=\"datetime_utc\", columns=\"ba_code\", values=\"net_generation_mwh_930\"\n", + ").corr().mean().mean()" ] }, { @@ -99,7 +102,9 @@ "outputs": [], "source": [ "# how well correlated are profiles across local time\n", - "df_temporary.pivot(index=\"datetime_local\", columns=\"ba_code\", values=\"net_generation_mwh_930\").corr().mean().mean()" + "df_temporary.pivot(\n", + " index=\"datetime_local\", columns=\"ba_code\", values=\"net_generation_mwh_930\"\n", + ").corr().mean().mean()" ] } ], diff --git a/notebooks/manual_data/default_fuel_sulfur_content.ipynb b/notebooks/manual_data/default_fuel_sulfur_content.ipynb index d70ac65d..48779e24 100644 --- a/notebooks/manual_data/default_fuel_sulfur_content.ipynb +++ b/notebooks/manual_data/default_fuel_sulfur_content.ipynb @@ -30,20 +30,25 @@ 
"metadata": {}, "outputs": [], "source": [ - "for year in [2015,2016,2017,2018,2019,2020]:\n", + "for year in [2015, 2016, 2017, 2018, 2019, 2020]:\n", + " pudl_out = load_data.initialize_pudl_out(year)\n", "\n", - " pudl_out = load_data.initialize_pudl_out(year)\n", + " (\n", + " plant_specific_fuel_sulfur_content,\n", + " national_avg_fuel_sulfur_content,\n", + " annual_avg_fuel_sulfur_content,\n", + " ) = emissions.return_monthly_plant_fuel_sulfur_content(pudl_out)\n", "\n", - " (plant_specific_fuel_sulfur_content,\n", - " national_avg_fuel_sulfur_content,\n", - " annual_avg_fuel_sulfur_content) = emissions.return_monthly_plant_fuel_sulfur_content(pudl_out)\n", + " annual_avg_fuel_sulfur_content = annual_avg_fuel_sulfur_content.rename(\n", + " columns={\"sulfur_content_pct\": f\"sulfur_content_pct_{year}\"}\n", + " )\n", "\n", - " annual_avg_fuel_sulfur_content = annual_avg_fuel_sulfur_content.rename(columns={\"sulfur_content_pct\":f\"sulfur_content_pct_{year}\"})\n", - "\n", - " if year == 2015:\n", - " result = annual_avg_fuel_sulfur_content.copy()\n", - " else:\n", - " result = result.merge(annual_avg_fuel_sulfur_content, how=\"outer\", on=\"energy_source_code\")\n", + " if year == 2015:\n", + " result = annual_avg_fuel_sulfur_content.copy()\n", + " else:\n", + " result = result.merge(\n", + " annual_avg_fuel_sulfur_content, how=\"outer\", on=\"energy_source_code\"\n", + " )\n", "\n", "result" ] diff --git a/notebooks/manual_data/export_fuel_heat_content.ipynb b/notebooks/manual_data/export_fuel_heat_content.ipynb index 62dee1bf..6a01315e 100644 --- a/notebooks/manual_data/export_fuel_heat_content.ipynb +++ b/notebooks/manual_data/export_fuel_heat_content.ipynb @@ -15,11 +15,12 @@ "\n", "# # Tell python where to look for modules.\n", "import sys\n", - "sys.path.append('../../../open-grid-emissions/src/')\n", + "\n", + "sys.path.append(\"../../../open-grid-emissions/src/\")\n", "\n", "import load_data\n", "from filepaths import *\n", - "import emissions\n" + 
"import emissions" ] }, { @@ -57,7 +58,7 @@ "\n", "fuel_heat_content.to_csv(\n", " outputs_folder(\"annual_average_fuel_heat_content.csv\"), index=False\n", - ")\n" + ")" ] } ], diff --git a/notebooks/manual_data/identify_eia930_time_lags.ipynb b/notebooks/manual_data/identify_eia930_time_lags.ipynb index 06cfd726..04d54e5b 100644 --- a/notebooks/manual_data/identify_eia930_time_lags.ipynb +++ b/notebooks/manual_data/identify_eia930_time_lags.ipynb @@ -57,7 +57,8 @@ "\n", "# # Tell python where to look for modules.\n", "import sys\n", - "sys.path.append('../../../open-grid-emissions/src/')\n", + "\n", + "sys.path.append(\"../../../open-grid-emissions/src/\")\n", "\n", "import download_data\n", "import load_data\n", @@ -78,8 +79,8 @@ "metadata": {}, "outputs": [], "source": [ - "# Data before and after shifts \n", - "# Note: this is very slow! (~30min) because it's pivoting large files. \n", + "# Data before and after shifts\n", + "# Note: this is very slow! (~30min) because it's pivoting large files.\n", "lraw = []\n", "lshift = []\n", "\n", @@ -92,16 +93,16 @@ " s = eia930.reformat_chalendar(s)\n", " r = eia930.reformat_chalendar(r)\n", "\n", - " s = s[s.fuel.isin([\"COL\",\"NG\",\"OIL\"])]\n", + " s = s[s.fuel.isin([\"COL\", \"NG\", \"OIL\"])]\n", " s = s.rename(columns={\"UTC Time at End of Hour\": \"datetime_utc\"})\n", - " s = s.groupby([\"datetime_utc\",\"BA\"]).sum()[\"generation\"].reset_index()\n", - " s = s[s.datetime_utc.dt.year == year] # filter for year\n", + " s = s.groupby([\"datetime_utc\", \"BA\"]).sum()[\"generation\"].reset_index()\n", + " s = s[s.datetime_utc.dt.year == year] # filter for year\n", "\n", " # Filter for fossil fuels, sum by BA\n", - " r = r[r.fuel.isin([\"COL\",\"NG\",\"OIL\"])]\n", + " r = r[r.fuel.isin([\"COL\", \"NG\", \"OIL\"])]\n", " r = r.rename(columns={\"UTC Time at End of Hour\": \"datetime_utc\"})\n", - " r = r.groupby([\"datetime_utc\",\"BA\"]).sum()[\"generation\"].reset_index()\n", - " r = r[r.datetime_utc.dt.year == 
year] # filter for year\n", + " r = r.groupby([\"datetime_utc\", \"BA\"]).sum()[\"generation\"].reset_index()\n", + " r = r[r.datetime_utc.dt.year == year] # filter for year\n", " lraw.append(r)\n", " lshift.append(s)" ] @@ -122,14 +123,18 @@ "metadata": {}, "outputs": [], "source": [ - "# Load data after shifting and rolling filter \n", + "# Load data after shifting and rolling filter\n", "\n", "all_rolled = []\n", "for y in [2019, 2020, 2021]:\n", - " rolled_930 = pd.read_csv(f\"../../data/outputs/{y}/eia930/eia930_rolling.csv\", index_col=0, parse_dates=True)\n", + " rolled_930 = pd.read_csv(\n", + " f\"../../data/outputs/{y}/eia930/eia930_rolling.csv\",\n", + " index_col=0,\n", + " parse_dates=True,\n", + " )\n", " rolled_930 = rolled_930[rolled_930.index.year == y]\n", " all_rolled.append(rolled_930)\n", - "rolled_930 = eia930.reformat_chalendar(pd.concat(all_rolled))\n" + "rolled_930 = eia930.reformat_chalendar(pd.concat(all_rolled))" ] }, { @@ -138,9 +143,14 @@ "metadata": {}, "outputs": [], "source": [ - "##### Remove renewables before summing 930 \n", + "##### Remove renewables before summing 930\n", "\n", - "rolled_930 = rolled_930[rolled_930.fuel.isin([\"COL\",\"NG\",\"OIL\"])].groupby([\"datetime_utc\",\"BA\"]).sum().reset_index()" + "rolled_930 = (\n", + " rolled_930[rolled_930.fuel.isin([\"COL\", \"NG\", \"OIL\"])]\n", + " .groupby([\"datetime_utc\", \"BA\"])\n", + " .sum()\n", + " .reset_index()\n", + ")" ] }, { @@ -152,19 +162,26 @@ "# Load files\n", "# Aggregate by BA during loading to cut down on space\n", "cems = pd.DataFrame()\n", - "for y in [2019, 2020, 2021]: \n", + "for y in [2019, 2020, 2021]:\n", " print(f\"loading {y}\")\n", " file = f\"{data_folder()}/outputs/{y}/cems_cleaned_{y}.csv\"\n", " plant_meta = pd.read_csv(f\"../../data/outputs/{y}/plant_static_attributes_{y}.csv\")\n", - " c = pd.read_csv(file, index_col=0, parse_dates=['datetime_utc'])\n", - " c = c.rename(columns={\"datetime_utc\":\"datetime_utc\"})\n", - " c = 
c.merge(plant_meta[['plant_id_eia', 'plant_primary_fuel', 'ba_code']], how='left', left_index=True, right_on='plant_id_eia')\n", + " c = pd.read_csv(file, index_col=0, parse_dates=[\"datetime_utc\"])\n", + " c = c.rename(columns={\"datetime_utc\": \"datetime_utc\"})\n", + " c = c.merge(\n", + " plant_meta[[\"plant_id_eia\", \"plant_primary_fuel\", \"ba_code\"]],\n", + " how=\"left\",\n", + " left_index=True,\n", + " right_on=\"plant_id_eia\",\n", + " )\n", " # exclude solar power for CEMS, since we're just going to look at COL + OIL + NG in the 930 data\n", " c = c[c[\"plant_primary_fuel\"] != \"SUN\"]\n", " print(\"Aggregating\")\n", - " if y == 2021: \n", - " c = c.rename(columns={\"gross_generation_mwh\":\"net_generation_mwh\"})\n", - " cems_aggregated = c.groupby([\"datetime_utc\",\"ba_code\"]).sum()[\"net_generation_mwh\"].reset_index()\n", + " if y == 2021:\n", + " c = c.rename(columns={\"gross_generation_mwh\": \"net_generation_mwh\"})\n", + " cems_aggregated = (\n", + " c.groupby([\"datetime_utc\", \"ba_code\"]).sum()[\"net_generation_mwh\"].reset_index()\n", + " )\n", " cems = pd.concat([cems, cems_aggregated])\n", "\n", "cems.head()" @@ -176,7 +193,9 @@ "metadata": {}, "outputs": [], "source": [ - "plant_attributes = pd.read_csv(outputs_folder(f\"{year}/plant_static_attributes_{year}.csv\"), dtype=get_dtypes())" + "plant_attributes = pd.read_csv(\n", + " outputs_folder(f\"{year}/plant_static_attributes_{year}.csv\"), dtype=get_dtypes()\n", + ")" ] }, { @@ -202,7 +221,9 @@ "metadata": {}, "outputs": [], "source": [ - "print(f\"shared BAs: {len(bas)} out of {len(raw.BA.unique())} 930 BAs and {len(cems.ba_code.unique())} CEMS BAs.\")\n", + "print(\n", + " f\"shared BAs: {len(bas)} out of {len(raw.BA.unique())} 930 BAs and {len(cems.ba_code.unique())} CEMS BAs.\"\n", + ")\n", "\n", "missing_cems = set(raw.BA.unique()).difference(set(cems.ba_code.unique()))\n", "missing_930 = set(cems.ba_code.unique()).difference(set(raw.BA.unique()))\n", @@ -217,26 +238,27 @@ 
"outputs": [], "source": [ "def find_best_cor(cems, df_eia930):\n", - " cems = cems.pivot(columns=\"ba_code\", index=\"datetime_utc\", values=\"net_generation_mwh\")\n", + " cems = cems.pivot(\n", + " columns=\"ba_code\", index=\"datetime_utc\", values=\"net_generation_mwh\"\n", + " )\n", " df_eia930 = df_eia930.pivot(columns=\"BA\", index=\"datetime_utc\", values=\"generation\")\n", "\n", " bas = set(cems.columns).intersection(set(df_eia930.columns))\n", "\n", - " correlations = pd.DataFrame(index=bas, columns=range(-12,12), dtype=float)\n", + " correlations = pd.DataFrame(index=bas, columns=range(-12, 12), dtype=float)\n", "\n", " for ba in correlations.index:\n", " for lag in correlations.columns:\n", - " # prepare 930: select BA \n", - " #eia = df_eia930[df_eia930.BA==ba][\"generation\"]\n", + " # prepare 930: select BA\n", + " # eia = df_eia930[df_eia930.BA==ba][\"generation\"]\n", " # prepare CEMS: select BA\n", - " #c = cems[cems.ba_code==ba][\"net_generation_mwh\"]\n", - " # calculate \n", - " correlations.loc[ba,lag] = cems[ba]\\\n", - " .corr(df_eia930[ba].shift(lag))\n", + " # c = cems[cems.ba_code==ba][\"net_generation_mwh\"]\n", + " # calculate\n", + " correlations.loc[ba, lag] = cems[ba].corr(df_eia930[ba].shift(lag))\n", "\n", " best = correlations.apply(lambda s: s.index[s.argmax()], axis=1).rename(\"best\")\n", "\n", - " correlations = pd.concat([best, correlations], axis='columns')\n", + " correlations = pd.concat([best, correlations], axis=\"columns\")\n", " return correlations" ] }, @@ -246,8 +268,8 @@ "metadata": {}, "outputs": [], "source": [ - "cems.drop_duplicates(subset=[\"datetime_utc\",\"ba_code\"], inplace=True)\n", - "#rolled_930.drop_duplicates(subset=[\"datetime_utc\",\"BA\"], inplace=True)" + "cems.drop_duplicates(subset=[\"datetime_utc\", \"ba_code\"], inplace=True)\n", + "# rolled_930.drop_duplicates(subset=[\"datetime_utc\",\"BA\"], inplace=True)" ] }, { @@ -258,18 +280,43 @@ "source": [ "# Calculate best correlations for shifted 
(no EBA cleaning) data\n", "\n", - "cems_930_cors = pd.concat([find_best_cor(cems, shifted).best.rename(\"all_years\"),\\\n", - " find_best_cor(cems[cems.datetime_utc.dt.year==2019],shifted[shifted.datetime_utc.dt.year==2019]).best.rename(\"2019\"),\n", - " find_best_cor(cems[cems.datetime_utc.dt.year==2020],shifted[shifted.datetime_utc.dt.year==2020]).best.rename(\"2020\"),\n", - " find_best_cor(cems[cems.datetime_utc.dt.year==2021],shifted[shifted.datetime_utc.dt.year==2021]).best.rename(\"2021\"),\n", - " find_best_cor(cems[(cems.datetime_utc.dt.month>=4)&(cems.datetime_utc.dt.month<=9)],\n", - " shifted[(shifted.datetime_utc.dt.month>=4)&(shifted.datetime_utc.dt.month<=9)]).best.rename(\"daylight time\"),\n", - " find_best_cor(cems[(cems.datetime_utc.dt.month>=11)|(cems.datetime_utc.dt.month<=2)],\n", - " shifted[(shifted.datetime_utc.dt.month>=11)|(shifted.datetime_utc.dt.month<=2)]).best.rename(\"standard time\")],\n", - " axis='columns')\n", + "cems_930_cors = pd.concat(\n", + " [\n", + " find_best_cor(cems, shifted).best.rename(\"all_years\"),\n", + " find_best_cor(\n", + " cems[cems.datetime_utc.dt.year == 2019],\n", + " shifted[shifted.datetime_utc.dt.year == 2019],\n", + " ).best.rename(\"2019\"),\n", + " find_best_cor(\n", + " cems[cems.datetime_utc.dt.year == 2020],\n", + " shifted[shifted.datetime_utc.dt.year == 2020],\n", + " ).best.rename(\"2020\"),\n", + " find_best_cor(\n", + " cems[cems.datetime_utc.dt.year == 2021],\n", + " shifted[shifted.datetime_utc.dt.year == 2021],\n", + " ).best.rename(\"2021\"),\n", + " find_best_cor(\n", + " cems[(cems.datetime_utc.dt.month >= 4) & (cems.datetime_utc.dt.month <= 9)],\n", + " shifted[\n", + " (shifted.datetime_utc.dt.month >= 4)\n", + " & (shifted.datetime_utc.dt.month <= 9)\n", + " ],\n", + " ).best.rename(\"daylight time\"),\n", + " find_best_cor(\n", + " cems[\n", + " (cems.datetime_utc.dt.month >= 11) | (cems.datetime_utc.dt.month <= 2)\n", + " ],\n", + " shifted[\n", + " 
(shifted.datetime_utc.dt.month >= 11)\n", + " | (shifted.datetime_utc.dt.month <= 2)\n", + " ],\n", + " ).best.rename(\"standard time\"),\n", + " ],\n", + " axis=\"columns\",\n", + ")\n", "\n", "cems_930_cors.to_csv(\"../../data/outputs/2021/cems_SHIFTEDeia930_cor_lags.csv\")\n", - "#cems_930_cors" + "# cems_930_cors" ] }, { @@ -280,15 +327,34 @@ "source": [ "# Calculate best correlations for raw data\n", "\n", - "cems_930_cors = pd.concat([find_best_cor(cems, raw).best.rename(\"all_years\"),\\\n", - " find_best_cor(cems[cems.datetime_utc.dt.year==2019],raw[raw.datetime_utc.dt.year==2019]).best.rename(\"2019\"),\n", - " find_best_cor(cems[cems.datetime_utc.dt.year==2020],raw[raw.datetime_utc.dt.year==2020]).best.rename(\"2020\"),\n", - " find_best_cor(cems[cems.datetime_utc.dt.year==2021],raw[raw.datetime_utc.dt.year==2021]).best.rename(\"2021\"),\n", - " find_best_cor(cems[(cems.datetime_utc.dt.month>=4)&(cems.datetime_utc.dt.month<=9)],\n", - " raw[(raw.datetime_utc.dt.month>=4)&(raw.datetime_utc.dt.month<=9)]).best.rename(\"daylight time\"),\n", - " find_best_cor(cems[(cems.datetime_utc.dt.month>=11)|(cems.datetime_utc.dt.month<=2)],\n", - " raw[(raw.datetime_utc.dt.month>=11)|(raw.datetime_utc.dt.month<=2)]).best.rename(\"standard time\")],\n", - " axis='columns')\n", + "cems_930_cors = pd.concat(\n", + " [\n", + " find_best_cor(cems, raw).best.rename(\"all_years\"),\n", + " find_best_cor(\n", + " cems[cems.datetime_utc.dt.year == 2019],\n", + " raw[raw.datetime_utc.dt.year == 2019],\n", + " ).best.rename(\"2019\"),\n", + " find_best_cor(\n", + " cems[cems.datetime_utc.dt.year == 2020],\n", + " raw[raw.datetime_utc.dt.year == 2020],\n", + " ).best.rename(\"2020\"),\n", + " find_best_cor(\n", + " cems[cems.datetime_utc.dt.year == 2021],\n", + " raw[raw.datetime_utc.dt.year == 2021],\n", + " ).best.rename(\"2021\"),\n", + " find_best_cor(\n", + " cems[(cems.datetime_utc.dt.month >= 4) & (cems.datetime_utc.dt.month <= 9)],\n", + " raw[(raw.datetime_utc.dt.month >= 
4) & (raw.datetime_utc.dt.month <= 9)],\n", + " ).best.rename(\"daylight time\"),\n", + " find_best_cor(\n", + " cems[\n", + " (cems.datetime_utc.dt.month >= 11) | (cems.datetime_utc.dt.month <= 2)\n", + " ],\n", + " raw[(raw.datetime_utc.dt.month >= 11) | (raw.datetime_utc.dt.month <= 2)],\n", + " ).best.rename(\"standard time\"),\n", + " ],\n", + " axis=\"columns\",\n", + ")\n", "\n", "cems_930_cors.to_csv(\"../../data/outputs/2021/cems_RAWeia930_cor_lags.csv\")\n", "cems_930_cors" @@ -300,17 +366,42 @@ "metadata": {}, "outputs": [], "source": [ - "## Calculate correlations using different subsets of 930 data \n", - "\n", - "cems_930_cors = pd.concat([find_best_cor(cems, rolled_930).best.rename(\"all_years\"),\\\n", - " find_best_cor(cems[cems.datetime_utc.dt.year==2019],rolled_930[rolled_930.datetime_utc.dt.year==2019]).best.rename(\"2019\"),\n", - " find_best_cor(cems[cems.datetime_utc.dt.year==2020],rolled_930[rolled_930.datetime_utc.dt.year==2020]).best.rename(\"2020\"),\n", - " find_best_cor(cems[cems.datetime_utc.dt.year==2021],rolled_930[rolled_930.datetime_utc.dt.year==2021]).best.rename(\"2021\"),\n", - " find_best_cor(cems[(cems.datetime_utc.dt.month>=4)&(cems.datetime_utc.dt.month<=9)],\n", - " rolled_930[(rolled_930.datetime_utc.dt.month>=4)&(rolled_930.datetime_utc.dt.month<=9)]).best.rename(\"daylight time\"),\n", - " find_best_cor(cems[(cems.datetime_utc.dt.month>=11)|(cems.datetime_utc.dt.month<=2)],\n", - " rolled_930[(rolled_930.datetime_utc.dt.month>=11)|(rolled_930.datetime_utc.dt.month<=2)]).best.rename(\"standard time\")],\n", - " axis='columns')\n", + "## Calculate correlations using different subsets of 930 data\n", + "\n", + "cems_930_cors = pd.concat(\n", + " [\n", + " find_best_cor(cems, rolled_930).best.rename(\"all_years\"),\n", + " find_best_cor(\n", + " cems[cems.datetime_utc.dt.year == 2019],\n", + " rolled_930[rolled_930.datetime_utc.dt.year == 2019],\n", + " ).best.rename(\"2019\"),\n", + " find_best_cor(\n", + " 
cems[cems.datetime_utc.dt.year == 2020],\n", + " rolled_930[rolled_930.datetime_utc.dt.year == 2020],\n", + " ).best.rename(\"2020\"),\n", + " find_best_cor(\n", + " cems[cems.datetime_utc.dt.year == 2021],\n", + " rolled_930[rolled_930.datetime_utc.dt.year == 2021],\n", + " ).best.rename(\"2021\"),\n", + " find_best_cor(\n", + " cems[(cems.datetime_utc.dt.month >= 4) & (cems.datetime_utc.dt.month <= 9)],\n", + " rolled_930[\n", + " (rolled_930.datetime_utc.dt.month >= 4)\n", + " & (rolled_930.datetime_utc.dt.month <= 9)\n", + " ],\n", + " ).best.rename(\"daylight time\"),\n", + " find_best_cor(\n", + " cems[\n", + " (cems.datetime_utc.dt.month >= 11) | (cems.datetime_utc.dt.month <= 2)\n", + " ],\n", + " rolled_930[\n", + " (rolled_930.datetime_utc.dt.month >= 11)\n", + " | (rolled_930.datetime_utc.dt.month <= 2)\n", + " ],\n", + " ).best.rename(\"standard time\"),\n", + " ],\n", + " axis=\"columns\",\n", + ")\n", "\n", "cems_930_cors.to_csv(\"../../data/outputs/2021/cems_RAWeia930_cor_lags.csv\")\n", "cems_930_cors" @@ -326,19 +417,27 @@ "\n", "ba = \"SC\"\n", "\n", - "to_plot_930 = shifted[shifted.BA==ba].groupby(\"datetime_utc\").sum()\n", + "to_plot_930 = shifted[shifted.BA == ba].groupby(\"datetime_utc\").sum()\n", "\n", "print(f\"correlations for {ba}\")\n", "print(cems_930_cors.loc[ba])\n", "\n", "fig = go.Figure()\n", - "fig.add_trace(go.Scatter(x=cems[cems.ba_code==ba].datetime_utc, y=cems[cems.ba_code==ba].net_generation_mwh, name=\"CEMS\"))\n", - "fig.add_trace(go.Scatter(x=to_plot_930.index, y=to_plot_930.generation, name=\"EIA 930 (after adjustment and rolling cleaning)\"))\n", - "fig.update_layout(\n", - " title=ba,\n", - " xaxis_title=\"Date\",\n", - " yaxis_title=\"Generation\"\n", - ")" + "fig.add_trace(\n", + " go.Scatter(\n", + " x=cems[cems.ba_code == ba].datetime_utc,\n", + " y=cems[cems.ba_code == ba].net_generation_mwh,\n", + " name=\"CEMS\",\n", + " )\n", + ")\n", + "fig.add_trace(\n", + " go.Scatter(\n", + " x=to_plot_930.index,\n", + " 
y=to_plot_930.generation,\n", + " name=\"EIA 930 (after adjustment and rolling cleaning)\",\n", + " )\n", + ")\n", + "fig.update_layout(title=ba, xaxis_title=\"Date\", yaxis_title=\"Generation\")" ] }, { @@ -354,11 +453,17 @@ "metadata": {}, "outputs": [], "source": [ - "#interchange = pd.read_csv(\"../data/eia930/chalendar/EBA_rolling.csv\",index_col=0, parse_dates=True)\n", + "# interchange = pd.read_csv(\"../data/eia930/chalendar/EBA_rolling.csv\",index_col=0, parse_dates=True)\n", "interchanges = []\n", - "for year in [2019, 2020, 2021]: \n", - " interchange = pd.read_csv(f\"../../data/outputs/{year}/eia930/eia930_raw.csv\",index_col=0, parse_dates=True)\n", - " interchange = interchange[interchange.index.year == year] # limit to after gen was reported by fuel type\n", + "for year in [2019, 2020, 2021]:\n", + " interchange = pd.read_csv(\n", + " f\"../../data/outputs/{year}/eia930/eia930_raw.csv\",\n", + " index_col=0,\n", + " parse_dates=True,\n", + " )\n", + " interchange = interchange[\n", + " interchange.index.year == year\n", + " ] # limit to after gen was reported by fuel type\n", " interchanges.append(interchange)" ] }, @@ -377,7 +482,7 @@ "metadata": {}, "outputs": [], "source": [ - "bas930 = {re.split(r\"[-.]\",c)[1] for c in interchange.columns}" + "bas930 = {re.split(r\"[-.]\", c)[1] for c in interchange.columns}" ] }, { @@ -388,47 +493,61 @@ "source": [ "# given a df where columns are interchange data, add best correlation between matching BAs to interchange_cors dict\n", "# optionally, write markdown to {file}.md and csvs at {file}_{ba}.csv\n", - "def interchange_cor(interchange, interchange_cors:dict={}, file=\"\", name:str=\"cors\"):\n", + "def interchange_cor(\n", + " interchange, interchange_cors: dict = {}, file=\"\", name: str = \"cors\"\n", + "):\n", " # Delete file\n", " if file != \"\":\n", - " hs = open(file+\".md\",\"w\")\n", + " hs = open(file + \".md\", \"w\")\n", " hs.write(\"\\n\\n\")\n", - " hs.close() \n", + " hs.close()\n", "\n", 
" for ba in bas930:\n", " print(ba, end=\"...\")\n", - " other_cols = [c for c in interchange.columns \\\n", - " if re.split(r\"[-.]\",c)[1]==ba \\\n", - " and re.split(r\"[-.]\",c)[2]!=\"ALL\"]\n", - " other_bas = [re.split(r\"[-.]\",c)[2] for c in other_cols]\n", - " #print(f\"{ba} connects to {other_bas}\")\n", - "\n", - " out = pd.DataFrame(index=other_bas, columns=range(-12,12), dtype=float)\n", + " other_cols = [\n", + " c\n", + " for c in interchange.columns\n", + " if re.split(r\"[-.]\", c)[1] == ba and re.split(r\"[-.]\", c)[2] != \"ALL\"\n", + " ]\n", + " other_bas = [re.split(r\"[-.]\", c)[2] for c in other_cols]\n", + " # print(f\"{ba} connects to {other_bas}\")\n", + "\n", + " out = pd.DataFrame(index=other_bas, columns=range(-12, 12), dtype=float)\n", " for o_ba in out.index:\n", " this_way = f\"EBA.{o_ba}-{ba}.ID.H\"\n", " other_way = f\"EBA.{ba}-{o_ba}.ID.H\"\n", - " if other_way not in interchange.columns or this_way not in interchange.columns: \n", + " if (\n", + " other_way not in interchange.columns\n", + " or this_way not in interchange.columns\n", + " ):\n", " continue\n", " for lag in out.columns:\n", - " out.loc[o_ba,lag] = abs(interchange[this_way]\\\n", - " .corr(-1*interchange[other_way].shift(lag)))\n", - " \n", + " out.loc[o_ba, lag] = abs(\n", + " interchange[this_way].corr(-1 * interchange[other_way].shift(lag))\n", + " )\n", + "\n", " # where is correlation the best?\n", - " out = pd.concat([out, out.apply(lambda s: s.index[s.argmax()], axis=1).rename(\"best\")], axis='columns')\n", + " out = pd.concat(\n", + " [out, out.apply(lambda s: s.index[s.argmax()], axis=1).rename(\"best\")],\n", + " axis=\"columns\",\n", + " )\n", "\n", " if file != \"\":\n", " # add new lines for proper markdown syntax\n", - " hs = open(file+\".md\",\"a\")\n", + " hs = open(file + \".md\", \"a\")\n", " hs.write(f\"\\n\\n# {ba}\\n\\n\")\n", - " hs.close() \n", + " hs.close()\n", "\n", - " out.to_markdown(file+\".md\",mode=\"a\")\n", + " out.to_markdown(file 
+ \".md\", mode=\"a\")\n", "\n", - " out.to_csv(f\"{file}_{ba}\"+\".csv\")\n", + " out.to_csv(f\"{file}_{ba}\" + \".csv\")\n", "\n", - " interchange_cors[ba] = pd.concat([interchange_cors.get(ba, pd.DataFrame()), out.best.rename(name)], axis='columns')\n", + " interchange_cors[ba] = pd.concat(\n", + " [interchange_cors.get(ba, pd.DataFrame()), out.best.rename(name)],\n", + " axis=\"columns\",\n", + " )\n", "\n", - " return interchange_cors\n" + " return interchange_cors" ] }, { @@ -438,11 +557,25 @@ "outputs": [], "source": [ "int_cors = interchange_cor(interchange, interchange_cors={}, name=\"all_years\")\n", - "int_cors = interchange_cor(interchange[\"2019-01-01T00:00\":\"2019-12-30T00:00\"], int_cors, name=\"2019\")\n", - "int_cors = interchange_cor(interchange[\"2020-01-01T00:00\":\"2020-12-30T00:00\"], int_cors, name=\"2020\")\n", - "int_cors = interchange_cor(interchange[\"2021-01-01T00:00\":\"2021-12-30T00:00\"], int_cors, name=\"2021\")\n", - "int_cors = interchange_cor(interchange[(interchange.index.month >= 4)&(interchange.index.month <=9)], int_cors, name=\"daylight savings\")\n", - "int_cors = interchange_cor(interchange[(interchange.index.month >= 11)|(interchange.index.month <=2)], int_cors, name=\"standard time\")\n" + "int_cors = interchange_cor(\n", + " interchange[\"2019-01-01T00:00\":\"2019-12-30T00:00\"], int_cors, name=\"2019\"\n", + ")\n", + "int_cors = interchange_cor(\n", + " interchange[\"2020-01-01T00:00\":\"2020-12-30T00:00\"], int_cors, name=\"2020\"\n", + ")\n", + "int_cors = interchange_cor(\n", + " interchange[\"2021-01-01T00:00\":\"2021-12-30T00:00\"], int_cors, name=\"2021\"\n", + ")\n", + "int_cors = interchange_cor(\n", + " interchange[(interchange.index.month >= 4) & (interchange.index.month <= 9)],\n", + " int_cors,\n", + " name=\"daylight savings\",\n", + ")\n", + "int_cors = interchange_cor(\n", + " interchange[(interchange.index.month >= 11) | (interchange.index.month <= 2)],\n", + " int_cors,\n", + " name=\"standard 
time\",\n", + ")" ] }, { @@ -465,18 +598,17 @@ "# Output to md file because that's an easy way to manually scan through BAs and look for anomalies\n", "\n", "file = \"../../data/outputs/2021/interchange_corr_summary_adjusted.md\"\n", - "hs = open(file,\"w\")\n", + "hs = open(file, \"w\")\n", "hs.write(\"\\n\\n\")\n", - "hs.close() \n", - "\n", - "for (ba,out) in int_cors.items():\n", + "hs.close()\n", "\n", + "for ba, out in int_cors.items():\n", " # add new lines for proper markdown syntax\n", - " hs = open(file,\"a\")\n", - " hs.write(f\"\\n\\n# {ba}\\n\\n\")\n", - " hs.close() \n", + " hs = open(file, \"a\")\n", + " hs.write(f\"\\n\\n# {ba}\\n\\n\")\n", + " hs.close()\n", "\n", - " out.to_markdown(file,mode=\"a\")" + " out.to_markdown(file, mode=\"a\")" ] }, { @@ -496,7 +628,13 @@ "ba2 = \"MISO\"\n", "\n", "fig = px.line(interchange[f\"EBA.{ba1}-{ba2}.ID.H\"])\n", - "fig.add_trace(go.Scatter(x=interchange.index, y=interchange[f\"EBA.{ba2}-{ba1}.ID.H\"], name=f\"EBA.{ba2}-{ba1}.ID.H\"))" + "fig.add_trace(\n", + " go.Scatter(\n", + " x=interchange.index,\n", + " y=interchange[f\"EBA.{ba2}-{ba1}.ID.H\"],\n", + " name=f\"EBA.{ba2}-{ba1}.ID.H\",\n", + " )\n", + ")" ] }, { @@ -507,26 +645,28 @@ "source": [ "ba = \"PJM\"\n", "\n", - "# find cols of mappings in both directions \n", - "other_cols = [c for c in interchange.columns \\\n", - " if re.split(r\"[-.]\",c)[1]==ba \\\n", - " and re.split(r\"[-.]\",c)[2]!=\"ALL\"]\n", - "other_bas = [re.split(r\"[-.]\",c)[2] for c in other_cols]\n", + "# find cols of mappings in both directions\n", + "other_cols = [\n", + " c\n", + " for c in interchange.columns\n", + " if re.split(r\"[-.]\", c)[1] == ba and re.split(r\"[-.]\", c)[2] != \"ALL\"\n", + "]\n", + "other_bas = [re.split(r\"[-.]\", c)[2] for c in other_cols]\n", "\n", "these_cols = [f\"EBA.{o_ba}-{ba}.ID.H\" for o_ba in other_bas]\n", "\n", "# make long version with just cols of interest, adding BA column and to/from column\n", "toplot = pd.DataFrame()\n", - "for i in 
range(len(other_bas)): \n", + "for i in range(len(other_bas)):\n", " to_add = (interchange[other_cols[i]]).rename(\"interchange\").to_frame()\n", " to_add[\"source\"] = ba\n", " to_add[\"BA\"] = other_bas[i]\n", "\n", - " to_add_2 = (interchange[these_cols[i]]*(-1)).rename(\"interchange\").to_frame()\n", + " to_add_2 = (interchange[these_cols[i]] * (-1)).rename(\"interchange\").to_frame()\n", " to_add_2[\"source\"] = \"other BA\"\n", " to_add_2[\"BA\"] = other_bas[i]\n", "\n", - " toplot = pd.concat([toplot, to_add, to_add_2], axis='index')\n" + " toplot = pd.concat([toplot, to_add, to_add_2], axis=\"index\")" ] }, { @@ -535,14 +675,21 @@ "metadata": {}, "outputs": [], "source": [ - "fig = px.line(toplot, x=toplot.index, y=\"interchange\", facet_col=\"BA\", facet_col_wrap=2, color=\"source\")\n", + "fig = px.line(\n", + " toplot,\n", + " x=toplot.index,\n", + " y=\"interchange\",\n", + " facet_col=\"BA\",\n", + " facet_col_wrap=2,\n", + " color=\"source\",\n", + ")\n", "fig.update_layout(\n", " title=f\"Interchange from {ba}\",\n", " xaxis_title=\"Date\",\n", " yaxis_title=\"Interchange\",\n", - " legend_title=\"Source for
interchange data\"\n", + " legend_title=\"Source for
interchange data\",\n", ")\n", - "fig.for_each_annotation(lambda a: a.update(text=\"Other \"+a.text))" + "fig.for_each_annotation(lambda a: a.update(text=\"Other \" + a.text))" ] }, { @@ -551,16 +698,24 @@ "metadata": {}, "outputs": [], "source": [ - "first=\"PJM\"\n", - "second=\"MISO\"\n", - "\n", - "fig = px.line(interchange, x=interchange.index, y=[f\"EBA.{first}-{second}.ID.H\",f\"EBA.{second}-{first}.ID.H\", f\"EBA.{first}-ALL.TI.H\"])\n", + "first = \"PJM\"\n", + "second = \"MISO\"\n", + "\n", + "fig = px.line(\n", + " interchange,\n", + " x=interchange.index,\n", + " y=[\n", + " f\"EBA.{first}-{second}.ID.H\",\n", + " f\"EBA.{second}-{first}.ID.H\",\n", + " f\"EBA.{first}-ALL.TI.H\",\n", + " ],\n", + ")\n", "\n", "fig.update_layout(\n", " title=f\"{first}/{second} interchange\",\n", " xaxis_title=\"Date\",\n", " yaxis_title=\"Interchange\",\n", - " legend_title=\"Series\"\n", + " legend_title=\"Series\",\n", ")" ] }, @@ -573,14 +728,18 @@ "ba = \"CFE\"\n", "\n", "fig = go.Figure()\n", - "fig.add_trace(go.Scatter(x=interchange.index, \n", - " y=interchange[f\"EBA.{ba}-ALL.D.H\"]-interchange[f\"EBA.{ba}-ALL.NG.H\"]))\n", + "fig.add_trace(\n", + " go.Scatter(\n", + " x=interchange.index,\n", + " y=interchange[f\"EBA.{ba}-ALL.D.H\"] - interchange[f\"EBA.{ba}-ALL.NG.H\"],\n", + " )\n", + ")\n", "\n", "fig.update_layout(\n", " title=f\"{ba} demand - generation\",\n", " xaxis_title=\"Date\",\n", " yaxis_title=\"Demand - generation\",\n", - " legend_title=\"Series\"\n", + " legend_title=\"Series\",\n", ")" ] }, @@ -601,31 +760,36 @@ "source": [ "# given a df where columns are interchange data, add best correlation between matching BAs to interchange_cors dict\n", "# optionally, write markdown to {file}.md and csvs at {file}_{ba}.csv\n", - "def interchange_sign(interchange, i_sign:dict={}, file=\"\", name:str=\"cors\"):\n", + "def interchange_sign(interchange, i_sign: dict = {}, file=\"\", name: str = \"cors\"):\n", " for ba in bas930:\n", " print(ba, 
end=\"...\")\n", - " other_cols = [c for c in interchange.columns \\\n", - " if re.split(r\"[-.]\",c)[1]==ba \\\n", - " and re.split(r\"[-.]\",c)[2]!=\"ALL\"]\n", - " other_bas = [re.split(r\"[-.]\",c)[2] for c in other_cols]\n", - " #print(f\"{ba} connects to {other_bas}\")\n", - "\n", - " out = pd.DataFrame(index=other_bas, columns=range(-12,12), dtype=float)\n", + " other_cols = [\n", + " c\n", + " for c in interchange.columns\n", + " if re.split(r\"[-.]\", c)[1] == ba and re.split(r\"[-.]\", c)[2] != \"ALL\"\n", + " ]\n", + " other_bas = [re.split(r\"[-.]\", c)[2] for c in other_cols]\n", + " # print(f\"{ba} connects to {other_bas}\")\n", + "\n", + " out = pd.DataFrame(index=other_bas, columns=range(-12, 12), dtype=float)\n", " for o_ba in out.index:\n", " this_way = f\"EBA.{o_ba}-{ba}.ID.H\"\n", " other_way = f\"EBA.{ba}-{o_ba}.ID.H\"\n", - " if other_way not in interchange or this_way not in interchange: \n", - " continue \n", + " if other_way not in interchange or this_way not in interchange:\n", + " continue\n", " for lag in out.columns:\n", - " out.loc[o_ba,lag] = interchange[this_way]\\\n", - " .corr(-1*interchange[other_way].shift(lag))\n", - " \n", + " out.loc[o_ba, lag] = interchange[this_way].corr(\n", + " -1 * interchange[other_way].shift(lag)\n", + " )\n", + "\n", " # where is correlation the best?\n", " out = out.apply(lambda s: s.iloc[abs(s).argmax()], axis=1)\n", "\n", - " i_sign[ba] = pd.concat([i_sign.get(ba, pd.DataFrame()), out.rename(name)], axis='columns')\n", + " i_sign[ba] = pd.concat(\n", + " [i_sign.get(ba, pd.DataFrame()), out.rename(name)], axis=\"columns\"\n", + " )\n", "\n", - " return i_sign\n" + " return i_sign" ] }, { @@ -635,11 +799,25 @@ "outputs": [], "source": [ "int_sign = interchange_sign(interchange, {}, name=\"all_years\")\n", - "int_sign = interchange_sign(interchange[\"2019-01-01T00:00\":\"2019-12-30T00:00\"], int_sign, name=\"2019\")\n", - "int_sign = 
interchange_sign(interchange[\"2020-01-01T00:00\":\"2020-12-30T00:00\"], int_sign, name=\"2020\")\n", - "int_sign = interchange_sign(interchange[\"2020-01-01T00:00\":\"2020-12-30T00:00\"], int_sign, name=\"2021\")\n", - "int_sign = interchange_sign(interchange[(interchange.index.month >= 4)&(interchange.index.month <=9)], int_sign, name=\"daylight savings\")\n", - "int_sign = interchange_sign(interchange[(interchange.index.month >= 11)|(interchange.index.month <=2)], int_sign, name=\"standard time\")" + "int_sign = interchange_sign(\n", + " interchange[\"2019-01-01T00:00\":\"2019-12-30T00:00\"], int_sign, name=\"2019\"\n", + ")\n", + "int_sign = interchange_sign(\n", + " interchange[\"2020-01-01T00:00\":\"2020-12-30T00:00\"], int_sign, name=\"2020\"\n", + ")\n", + "int_sign = interchange_sign(\n", + " interchange[\"2020-01-01T00:00\":\"2020-12-30T00:00\"], int_sign, name=\"2021\"\n", + ")\n", + "int_sign = interchange_sign(\n", + " interchange[(interchange.index.month >= 4) & (interchange.index.month <= 9)],\n", + " int_sign,\n", + " name=\"daylight savings\",\n", + ")\n", + "int_sign = interchange_sign(\n", + " interchange[(interchange.index.month >= 11) | (interchange.index.month <= 2)],\n", + " int_sign,\n", + " name=\"standard time\",\n", + ")" ] }, { @@ -649,18 +827,17 @@ "outputs": [], "source": [ "file = f\"{outputs_folder('2021')}/interchange_cors_sign.md\"\n", - "hs = open(file,\"w\")\n", + "hs = open(file, \"w\")\n", "hs.write(\"\\n\\n\")\n", - "hs.close() \n", - "\n", - "for (ba,out) in int_sign.items():\n", + "hs.close()\n", "\n", + "for ba, out in int_sign.items():\n", " # add new lines for proper markdown syntax\n", - " hs = open(file,\"a\")\n", - " hs.write(f\"\\n\\n# {ba}\\n\\n\")\n", - " hs.close() \n", + " hs = open(file, \"a\")\n", + " hs.write(f\"\\n\\n# {ba}\\n\\n\")\n", + " hs.close()\n", "\n", - " out.to_markdown(file,mode=\"a\")" + " out.to_markdown(file, mode=\"a\")" ] } ], diff --git 
a/notebooks/manual_data/manually_identify_crosswalk_updates.ipynb b/notebooks/manual_data/manually_identify_crosswalk_updates.ipynb index 51bd809c..8e4de0c9 100644 --- a/notebooks/manual_data/manually_identify_crosswalk_updates.ipynb +++ b/notebooks/manual_data/manually_identify_crosswalk_updates.ipynb @@ -16,7 +16,7 @@ "outputs": [], "source": [ "import pandas as pd\n", - "import sqlalchemy as sa \n", + "import sqlalchemy as sa\n", "import pudl.output" ] }, @@ -26,7 +26,7 @@ "metadata": {}, "outputs": [], "source": [ - "year =2020" + "year = 2020" ] }, { @@ -38,11 +38,11 @@ "# load raw cems data\n", "cems_path = f\"../data/downloads/pudl/pudl_data/parquet/epacems/year={year}\"\n", "cems = pd.read_parquet(cems_path).rename(\n", - " columns={\n", - " \"plant_id_eia\": \"plant_id_epa\",\n", - " \"heat_content_mmbtu\": \"fuel_consumed_mmbtu\",\n", - " }\n", - " )\n", + " columns={\n", + " \"plant_id_eia\": \"plant_id_epa\",\n", + " \"heat_content_mmbtu\": \"fuel_consumed_mmbtu\",\n", + " }\n", + ")\n", "\n", "# load crosswalk data\n", "crosswalk = pudl.output.epacems.epa_crosswalk()\n", @@ -51,8 +51,8 @@ "pudl_db = \"sqlite:///../data/downloads/pudl/pudl_data/sqlite/pudl.sqlite\"\n", "pudl_engine = sa.create_engine(pudl_db)\n", "pudl_out = pudl.output.pudltabl.PudlTabl(\n", - " pudl_engine, freq=\"MS\", start_date=f\"{year}-01-01\", end_date=f\"{year}-12-31\"\n", - " )\n", + " pudl_engine, freq=\"MS\", start_date=f\"{year}-01-01\", end_date=f\"{year}-12-31\"\n", + ")\n", "gens_860 = pudl_out.gens_eia860()" ] }, @@ -83,7 +83,7 @@ " pd.read_csv(\n", " \"../data/manual/egrid_static_tables/table_4-2_plants_not_connected_to_grid.csv\"\n", " )[\"Plant ID\"]\n", - ")\n" + ")" ] }, { @@ -92,7 +92,7 @@ "metadata": {}, "outputs": [], "source": [ - "ids[ids['plant_id_epa'].isin(ngc_plants)].to_clipboard()" + "ids[ids[\"plant_id_epa\"].isin(ngc_plants)].to_clipboard()" ] }, { @@ -108,7 +108,9 @@ "metadata": {}, "outputs": [], "source": [ - "missing_eia = 
crosswalk[~crosswalk['CAMD_PLANT_ID'].isna() & crosswalk['EIA_PLANT_ID'].isna()]\n", + "missing_eia = crosswalk[\n", + " ~crosswalk[\"CAMD_PLANT_ID\"].isna() & crosswalk[\"EIA_PLANT_ID\"].isna()\n", + "]\n", "missing_eia" ] }, @@ -118,22 +120,47 @@ "metadata": {}, "outputs": [], "source": [ - "missing_ids = missing_eia[['CAMD_PLANT_ID','CAMD_UNIT_ID','CAMD_GENERATOR_ID']].drop_duplicates()\n", + "missing_ids = missing_eia[\n", + " [\"CAMD_PLANT_ID\", \"CAMD_UNIT_ID\", \"CAMD_GENERATOR_ID\"]\n", + "].drop_duplicates()\n", "\n", - "missing_ids = missing_ids.merge(gens_860[['plant_id_eia','generator_id']], how='left', left_on=['CAMD_PLANT_ID','CAMD_UNIT_ID'], right_on=['plant_id_eia','generator_id'])\n", - "missing_ids = missing_ids.merge(gens_860[['plant_id_eia','generator_id']], how='left', left_on=['CAMD_PLANT_ID','CAMD_GENERATOR_ID'], right_on=['plant_id_eia','generator_id'], suffixes=('_u','_g'))\n", + "missing_ids = missing_ids.merge(\n", + " gens_860[[\"plant_id_eia\", \"generator_id\"]],\n", + " how=\"left\",\n", + " left_on=[\"CAMD_PLANT_ID\", \"CAMD_UNIT_ID\"],\n", + " right_on=[\"plant_id_eia\", \"generator_id\"],\n", + ")\n", + "missing_ids = missing_ids.merge(\n", + " gens_860[[\"plant_id_eia\", \"generator_id\"]],\n", + " how=\"left\",\n", + " left_on=[\"CAMD_PLANT_ID\", \"CAMD_GENERATOR_ID\"],\n", + " right_on=[\"plant_id_eia\", \"generator_id\"],\n", + " suffixes=(\"_u\", \"_g\"),\n", + ")\n", "\n", "# identify the source of the data\n", - "missing_ids['source'] = ''\n", - "missing_ids.loc[~missing_ids['plant_id_eia_u'].isna(), 'source'] = \"CAMD_UNIT_ID matches EIA_GENERATOR_ID\"\n", - "missing_ids.loc[~missing_ids['plant_id_eia_g'].isna(), 'source'] = \"CAMD_GENERATOR_ID matches EIA_GENERATOR_ID\"\n", + "missing_ids[\"source\"] = \"\"\n", + "missing_ids.loc[\n", + " ~missing_ids[\"plant_id_eia_u\"].isna(), \"source\"\n", + "] = \"CAMD_UNIT_ID matches EIA_GENERATOR_ID\"\n", + "missing_ids.loc[\n", + " ~missing_ids[\"plant_id_eia_g\"].isna(), 
\"source\"\n", + "] = \"CAMD_GENERATOR_ID matches EIA_GENERATOR_ID\"\n", "\n", "# fill nas in each group so that we can identify where values match\n", - "missing_ids['generator_id_u'] = missing_ids['generator_id_u'].fillna(missing_ids['generator_id_g'])\n", - "missing_ids['generator_id_g'] = missing_ids['generator_id_g'].fillna(missing_ids['generator_id_u'])\n", + "missing_ids[\"generator_id_u\"] = missing_ids[\"generator_id_u\"].fillna(\n", + " missing_ids[\"generator_id_g\"]\n", + ")\n", + "missing_ids[\"generator_id_g\"] = missing_ids[\"generator_id_g\"].fillna(\n", + " missing_ids[\"generator_id_u\"]\n", + ")\n", "\n", - "missing_ids['plant_id_eia_u'] = missing_ids['plant_id_eia_u'].fillna(missing_ids['plant_id_eia_g'])\n", - "missing_ids['plant_id_eia_g'] = missing_ids['plant_id_eia_g'].fillna(missing_ids['plant_id_eia_u'])\n" + "missing_ids[\"plant_id_eia_u\"] = missing_ids[\"plant_id_eia_u\"].fillna(\n", + " missing_ids[\"plant_id_eia_g\"]\n", + ")\n", + "missing_ids[\"plant_id_eia_g\"] = missing_ids[\"plant_id_eia_g\"].fillna(\n", + " missing_ids[\"plant_id_eia_u\"]\n", + ")" ] }, { @@ -143,7 +170,13 @@ "outputs": [], "source": [ "# identify where we identified a consistent generator match\n", - "unit_manual_match = (missing_ids[missing_ids['generator_id_u'] == missing_ids['generator_id_g']]).drop(columns=['plant_id_eia_g','generator_id_g']).rename(columns={'plant_id_eia_u':'plant_id_eia','generator_id_u':'generator_id'})\n", + "unit_manual_match = (\n", + " (missing_ids[missing_ids[\"generator_id_u\"] == missing_ids[\"generator_id_g\"]])\n", + " .drop(columns=[\"plant_id_eia_g\", \"generator_id_g\"])\n", + " .rename(\n", + " columns={\"plant_id_eia_u\": \"plant_id_eia\", \"generator_id_u\": \"generator_id\"}\n", + " )\n", + ")\n", "unit_manual_match" ] }, @@ -153,7 +186,7 @@ "metadata": {}, "outputs": [], "source": [ - "unit_manual_match.to_csv('../data/outputs/crosswalk_unit_manual_matches.csv')" + 
"unit_manual_match.to_csv(\"../data/outputs/crosswalk_unit_manual_matches.csv\")" ] }, { @@ -163,7 +196,9 @@ "outputs": [], "source": [ "# identify where the two matching methods returned different generator matches\n", - "multi_match = missing_ids[missing_ids['generator_id_u'] != missing_ids['generator_id_g']]\n", + "multi_match = missing_ids[\n", + " missing_ids[\"generator_id_u\"] != missing_ids[\"generator_id_g\"]\n", + "]\n", "multi_match" ] }, @@ -173,7 +208,7 @@ "metadata": {}, "outputs": [], "source": [ - "multi_match.to_csv('../data/outputs/crosswalk_unit_manual_matches_multi.csv')" + "multi_match.to_csv(\"../data/outputs/crosswalk_unit_manual_matches_multi.csv\")" ] }, { @@ -182,7 +217,7 @@ "metadata": {}, "outputs": [], "source": [ - "gens_860[gens_860['plant_id_eia'] == 3443]" + "gens_860[gens_860[\"plant_id_eia\"] == 3443]" ] }, { @@ -198,8 +233,13 @@ "metadata": {}, "outputs": [], "source": [ - "missing_from_cw = ids.merge(crosswalk[['CAMD_PLANT_ID','CAMD_UNIT_ID']], how='left', left_on=['plant_id_epa','emissions_unit_id_epa'], right_on=['CAMD_PLANT_ID','CAMD_UNIT_ID'])\n", - "missing_from_cw = missing_from_cw[missing_from_cw['CAMD_UNIT_ID'].isna()]" + "missing_from_cw = ids.merge(\n", + " crosswalk[[\"CAMD_PLANT_ID\", \"CAMD_UNIT_ID\"]],\n", + " how=\"left\",\n", + " left_on=[\"plant_id_epa\", \"emissions_unit_id_epa\"],\n", + " right_on=[\"CAMD_PLANT_ID\", \"CAMD_UNIT_ID\"],\n", + ")\n", + "missing_from_cw = missing_from_cw[missing_from_cw[\"CAMD_UNIT_ID\"].isna()]" ] }, { @@ -208,10 +248,17 @@ "metadata": {}, "outputs": [], "source": [ - "missing_from_cw = missing_from_cw.merge(gens_860[['plant_id_eia','generator_id']], how='left', left_on=['plant_id_epa','emissions_unit_id_epa'], right_on=['plant_id_eia','generator_id'])\n", + "missing_from_cw = missing_from_cw.merge(\n", + " gens_860[[\"plant_id_eia\", \"generator_id\"]],\n", + " how=\"left\",\n", + " left_on=[\"plant_id_epa\", \"emissions_unit_id_epa\"],\n", + " right_on=[\"plant_id_eia\", 
\"generator_id\"],\n", + ")\n", "\n", - "missing_from_cw['source'] = ''\n", - "missing_from_cw.loc[~missing_from_cw['generator_id'].isna(), 'source'] = \"CAMD_UNIT_ID matches EIA_GENERATOR_ID\"\n", + "missing_from_cw[\"source\"] = \"\"\n", + "missing_from_cw.loc[\n", + " ~missing_from_cw[\"generator_id\"].isna(), \"source\"\n", + "] = \"CAMD_UNIT_ID matches EIA_GENERATOR_ID\"\n", "\n", "missing_from_cw" ] @@ -222,7 +269,7 @@ "metadata": {}, "outputs": [], "source": [ - "missing_from_cw.to_csv('../data/outputs/missing_from_crosswalk.csv', index=False)" + "missing_from_cw.to_csv(\"../data/outputs/missing_from_crosswalk.csv\", index=False)" ] }, { @@ -231,7 +278,10 @@ "metadata": {}, "outputs": [], "source": [ - "gens_860.loc[gens_860['plant_id_eia'] == 55641, ['plant_id_eia','generator_id','prime_mover_code']]" + "gens_860.loc[\n", + " gens_860[\"plant_id_eia\"] == 55641,\n", + " [\"plant_id_eia\", \"generator_id\", \"prime_mover_code\"],\n", + "]" ] } ], diff --git a/notebooks/manual_data/manually_update_OTH_fuel_code.ipynb b/notebooks/manual_data/manually_update_OTH_fuel_code.ipynb index 0472d9e2..af7ec875 100644 --- a/notebooks/manual_data/manually_update_OTH_fuel_code.ipynb +++ b/notebooks/manual_data/manually_update_OTH_fuel_code.ipynb @@ -15,7 +15,8 @@ "import plotly.express as px\n", "\n", "import sys\n", - "sys.path.append('../../../open-grid-emissions/src/')\n", + "\n", + "sys.path.append(\"../../../open-grid-emissions/src/\")\n", "\n", "import load_data\n", "import data_cleaning" @@ -28,7 +29,7 @@ "outputs": [], "source": [ "year = 2021\n", - "pudl_out = load_data.initialize_pudl_out(year=year)\n" + "pudl_out = load_data.initialize_pudl_out(year=year)" ] }, { @@ -49,8 +50,8 @@ " y=\"fuel_mmbtu_per_unit\",\n", " title=f\"Range of heat contents for each fuel reported in {year}\",\n", " width=1000,\n", - " height=600\n", - ")\n" + " height=600,\n", + ")" ] }, { @@ -59,7 +60,9 @@ "metadata": {}, "outputs": [], "source": [ - "plants_with_oth = 
fuel_heat_content[fuel_heat_content[\"energy_source_code\"] == \"OTH\"].copy()\n", + "plants_with_oth = fuel_heat_content[\n", + " fuel_heat_content[\"energy_source_code\"] == \"OTH\"\n", + "].copy()\n", "plants_with_oth = plants_with_oth.groupby(\"plant_id_eia\").mean()\n", "plants_with_oth" ] @@ -70,7 +73,7 @@ "metadata": {}, "outputs": [], "source": [ - "fuel_heat_content[fuel_heat_content[\"plant_id_eia\"] == 902]\n" + "fuel_heat_content[fuel_heat_content[\"plant_id_eia\"] == 902]" ] }, { @@ -81,7 +84,7 @@ "source": [ "# load EPA fuel type data\n", "epa_fuel_types = data_cleaning.get_epa_unit_fuel_types(year)\n", - "epa_fuel_types[epa_fuel_types[\"energy_source_code\"] == \"OTH\"]\n" + "epa_fuel_types[epa_fuel_types[\"energy_source_code\"] == \"OTH\"]" ] }, { @@ -92,7 +95,7 @@ "source": [ "# Load EIA-860 to examine whether a plant is retired\n", "gens_860 = pudl_out.gens_eia860()\n", - "gens_860[gens_860[\"plant_id_eia\"] == 60670]\n" + "gens_860[gens_860[\"plant_id_eia\"] == 60670]" ] } ], diff --git a/notebooks/manual_data/manually_update_ba_reference.ipynb b/notebooks/manual_data/manually_update_ba_reference.ipynb index b63ceb80..ab7f108f 100644 --- a/notebooks/manual_data/manually_update_ba_reference.ipynb +++ b/notebooks/manual_data/manually_update_ba_reference.ipynb @@ -22,7 +22,7 @@ "import os\n", "import requests\n", "import pandas as pd\n", - "import numpy as np\n" + "import numpy as np" ] }, { @@ -36,7 +36,12 @@ "\n", "# merge the ferc data into the manual table\n", "ba_reference_updated = ba_reference.merge(\n", - " ferc_bas, how=\"outer\", on=\"ba_code\", indicator=\"source\", suffixes=(None, \"_ferc\"), validate=\"1:1\"\n", + " ferc_bas,\n", + " how=\"outer\",\n", + " on=\"ba_code\",\n", + " indicator=\"source\",\n", + " suffixes=(None, \"_ferc\"),\n", + " validate=\"1:1\",\n", ")\n", "\n", "# fill any missing data in the manual table with the data from ferc\n", @@ -64,7 +69,7 @@ "ba_reference_updated = 
ba_reference_updated.drop(columns=[\"us_ba_ferc\"])\n", "\n", "\n", - "ba_reference_updated\n" + "ba_reference_updated" ] }, { @@ -73,7 +78,7 @@ "metadata": {}, "outputs": [], "source": [ - "ba_reference_updated.to_csv(\"../../data/manual/ba_reference_updated.csv\", index=False)\n" + "ba_reference_updated.to_csv(\"../../data/manual/ba_reference_updated.csv\", index=False)" ] } ], diff --git a/notebooks/manual_data/update_utility_name_ba_map.ipynb b/notebooks/manual_data/update_utility_name_ba_map.ipynb index ffddb998..ccc4a527 100644 --- a/notebooks/manual_data/update_utility_name_ba_map.ipynb +++ b/notebooks/manual_data/update_utility_name_ba_map.ipynb @@ -17,7 +17,8 @@ "\n", "# # Tell python where to look for modules.\n", "import sys\n", - "sys.path.append('../../../open-grid-emissions/src/')\n", + "\n", + "sys.path.append(\"../../../open-grid-emissions/src/\")\n", "\n", "import load_data\n", "from column_checks import get_dtypes\n", @@ -85,7 +86,11 @@ "outputs": [], "source": [ "# what are all the utility names not mapped to a BA?\n", - "list(plant_ba.loc[plant_ba[\"balancing_authority_code_eia\"].isna(), \"utility_name_eia\"].unique())" + "list(\n", + " plant_ba.loc[\n", + " plant_ba[\"balancing_authority_code_eia\"].isna(), \"utility_name_eia\"\n", + " ].unique()\n", + ")" ] }, { @@ -95,7 +100,12 @@ "outputs": [], "source": [ "# what are all the utility names not mapped to a BA?\n", - "list(plant_ba.loc[plant_ba[\"balancing_authority_code_eia\"].isna(), \"transmission_distribution_owner_name\"].unique())" + "list(\n", + " plant_ba.loc[\n", + " plant_ba[\"balancing_authority_code_eia\"].isna(),\n", + " \"transmission_distribution_owner_name\",\n", + " ].unique()\n", + ")" ] }, { @@ -104,7 +114,10 @@ "metadata": {}, "outputs": [], "source": [ - "plant_ba[plant_ba[\"balancing_authority_code_eia\"].isna() & (plant_ba[\"utility_name_eia\"] == \"Pacific Gas & Electric Co\")]" + "plant_ba[\n", + " plant_ba[\"balancing_authority_code_eia\"].isna()\n", + " & 
(plant_ba[\"utility_name_eia\"] == \"Pacific Gas & Electric Co\")\n", + "]" ] } ], diff --git a/notebooks/manual_data/zip_data.ipynb b/notebooks/manual_data/zip_data.ipynb index d67b546b..1660e436 100644 --- a/notebooks/manual_data/zip_data.ipynb +++ b/notebooks/manual_data/zip_data.ipynb @@ -18,9 +18,10 @@ "\n", "# # Tell python where to look for modules.\n", "import sys\n", - "sys.path.append('../../../open-grid-emissions/src/')\n", "\n", - "import output_data\n" + "sys.path.append(\"../../../open-grid-emissions/src/\")\n", + "\n", + "import output_data" ] }, { @@ -29,7 +30,7 @@ "metadata": {}, "outputs": [], "source": [ - "years = [2019,2020,2021]\n", + "years = [2019, 2020, 2021]\n", "\n", "output_data.prepare_files_for_upload(years)" ] @@ -40,7 +41,7 @@ "metadata": {}, "outputs": [], "source": [ - "for year in [2019,2020,2021]:\n", + "for year in [2019, 2020, 2021]:\n", " output_data.zip_results_for_s3(year)" ] }, @@ -50,7 +51,7 @@ "metadata": {}, "outputs": [], "source": [ - "for year in [2019,2020,2021]:\n", + "for year in [2019, 2020, 2021]:\n", " output_data.zip_data_for_zenodo(year)" ] } diff --git a/notebooks/validation/data_validation.ipynb b/notebooks/validation/data_validation.ipynb index 557a1543..60cecf2e 100644 --- a/notebooks/validation/data_validation.ipynb +++ b/notebooks/validation/data_validation.ipynb @@ -22,9 +22,10 @@ "%reload_ext autoreload\n", "%autoreload 2\n", "\n", - "# Tell python where to look for modules. 
\n", + "# Tell python where to look for modules.\n", "import sys\n", - "sys.path.append('../../../open-grid-emissions/src/')\n", + "\n", + "sys.path.append(\"../../../open-grid-emissions/src/\")\n", "\n", "# import local modules\n", "import load_data\n", @@ -49,8 +50,13 @@ "outputs": [], "source": [ "year = 2020\n", - "cems = pd.read_csv(f'../data/outputs/cems_subplant_{year}.csv', parse_dates=['datetime_utc','report_date'])\n", - "eia923_allocated = pd.read_csv(f'../data/outputs/eia923_allocated_{year}.csv', parse_dates=['report_date'])" + "cems = pd.read_csv(\n", + " f\"../data/outputs/cems_subplant_{year}.csv\",\n", + " parse_dates=[\"datetime_utc\", \"report_date\"],\n", + ")\n", + "eia923_allocated = pd.read_csv(\n", + " f\"../data/outputs/eia923_allocated_{year}.csv\", parse_dates=[\"report_date\"]\n", + ")" ] }, { @@ -61,7 +67,12 @@ "source": [ "# what percent of emissions is reported in CEMS vs EIA\n", "# NOTE: This does not include emissions only reported by CEMS, so the % may be higher\n", - "(eia923_allocated.groupby('hourly_data_source')['co2_mass_lb_adjusted'].sum() / eia923_allocated.groupby('hourly_data_source')['co2_mass_lb_adjusted'].sum().sum(axis=0)).round(3)" + "(\n", + " eia923_allocated.groupby(\"hourly_data_source\")[\"co2_mass_lb_adjusted\"].sum()\n", + " / eia923_allocated.groupby(\"hourly_data_source\")[\"co2_mass_lb_adjusted\"]\n", + " .sum()\n", + " .sum(axis=0)\n", + ").round(3)" ] }, { @@ -84,10 +95,21 @@ "source": [ "# perform checks on allocated data\n", "# fuel consumption and co2 emissions should be positive\n", - "negative_test = validation.test_for_negative_values(eia923_allocated, ['fuel_consumed_mmbtu','fuel_consumed_for_electricity_mmbtu','co2_mass_lb','co2_mass_lb_for_electricity','co2_mass_lb_adjusted'])\n", + "negative_test = validation.test_for_negative_values(\n", + " eia923_allocated,\n", + " [\n", + " \"fuel_consumed_mmbtu\",\n", + " \"fuel_consumed_for_electricity_mmbtu\",\n", + " \"co2_mass_lb\",\n", + " 
\"co2_mass_lb_for_electricity\",\n", + " \"co2_mass_lb_adjusted\",\n", + " ],\n", + ")\n", "\n", "# if net generation is positive, fuel consumption should be non zero\n", - "missing_fuel_test = validation.test_for_missing_fuel(eia923_allocated, 'net_generation_mwh')\n", + "missing_fuel_test = validation.test_for_missing_fuel(\n", + " eia923_allocated, \"net_generation_mwh\"\n", + ")\n", "\n", "# fuel consumed for electricity should be less than fuel consumed\n", "chp_allocation_test = validation.test_chp_allocation(eia923_allocated)\n", @@ -96,16 +118,38 @@ "missing_co2_test = validation.test_for_missing_co2(eia923_allocated)\n", "\n", "# check for generators with no data\n", - "missing_data_test = validation.test_for_missing_data(eia923_allocated, ['net_generation_mwh','fuel_consumed_mmbtu','fuel_consumed_for_electricity_mmbtu','co2_mass_lb','co2_mass_lb_for_electricity','co2_mass_lb_adjusted'])\n", + "missing_data_test = validation.test_for_missing_data(\n", + " eia923_allocated,\n", + " [\n", + " \"net_generation_mwh\",\n", + " \"fuel_consumed_mmbtu\",\n", + " \"fuel_consumed_for_electricity_mmbtu\",\n", + " \"co2_mass_lb\",\n", + " \"co2_mass_lb_for_electricity\",\n", + " \"co2_mass_lb_adjusted\",\n", + " ],\n", + ")\n", "\n", "# check for generators with all data = 0\n", - "zero_data_test = validation.test_for_zero_data(eia923_allocated, ['net_generation_mwh','fuel_consumed_mmbtu','fuel_consumed_for_electricity_mmbtu','co2_mass_lb','co2_mass_lb_for_electricity','co2_mass_lb_adjusted'])\n", + "zero_data_test = validation.test_for_zero_data(\n", + " eia923_allocated,\n", + " [\n", + " \"net_generation_mwh\",\n", + " \"fuel_consumed_mmbtu\",\n", + " \"fuel_consumed_for_electricity_mmbtu\",\n", + " \"co2_mass_lb\",\n", + " \"co2_mass_lb_for_electricity\",\n", + " \"co2_mass_lb_adjusted\",\n", + " ],\n", + ")\n", "\n", "# check for missing energy source code\n", "missing_esc_test = validation.test_for_missing_energy_source_code(eia923_allocated)\n", "\n", "# check 
for missing and incorrect prime movers\n", - "incorrect_pm_test, missing_pm_test = validation.test_for_missing_incorrect_prime_movers(eia923_allocated, year)\n", + "incorrect_pm_test, missing_pm_test = validation.test_for_missing_incorrect_prime_movers(\n", + " eia923_allocated, year\n", + ")\n", "\n", "# check for missing subplant ids\n", "eia_missing_subplant_test = validation.test_for_missing_subplant_id(eia923_allocated)\n", @@ -120,7 +164,7 @@ "metadata": {}, "outputs": [], "source": [ - "heat_rate_test.sort_values(by='heat_rate')" + "heat_rate_test.sort_values(by=\"heat_rate\")" ] }, { @@ -137,10 +181,19 @@ "outputs": [], "source": [ "# fuel consumption and co2 emissions should be positive\n", - "cems_negative_test = validation.test_for_negative_values(cems, ['fuel_consumed_mmbtu','fuel_consumed_for_electricity_mmbtu','co2_mass_lb','co2_mass_lb_adjusted', 'gross_generation_mwh'])\n", + "cems_negative_test = validation.test_for_negative_values(\n", + " cems,\n", + " [\n", + " \"fuel_consumed_mmbtu\",\n", + " \"fuel_consumed_for_electricity_mmbtu\",\n", + " \"co2_mass_lb\",\n", + " \"co2_mass_lb_adjusted\",\n", + " \"gross_generation_mwh\",\n", + " ],\n", + ")\n", "\n", "# if net generation is positive, fuel consumption should be non zero\n", - "cems_missing_fuel_test = validation.test_for_missing_fuel(cems,'gross_generation_mwh')\n", + "cems_missing_fuel_test = validation.test_for_missing_fuel(cems, \"gross_generation_mwh\")\n", "\n", "# fuel consumed for electricity should be less than fuel consumed\n", "cems_chp_allocation_test = validation.test_chp_allocation(cems)\n", @@ -155,7 +208,7 @@ "cems_missing_subplant_test = validation.test_for_missing_subplant_id(cems)\n", "\n", "# test to see if there are any net generation values greater than gross generation\n", - "gtn_test = validation.test_gtn_results(cems)\n" + "gtn_test = validation.test_gtn_results(cems)" ] }, { @@ -164,7 +217,7 @@ "metadata": {}, "outputs": [], "source": [ - 
"cems_missing_subplant_test[['plant_id_eia','emissions_unit_id_epa']].drop_duplicates()" + "cems_missing_subplant_test[[\"plant_id_eia\", \"emissions_unit_id_epa\"]].drop_duplicates()" ] }, { @@ -173,7 +226,7 @@ "metadata": {}, "outputs": [], "source": [ - "cems_missing_esc_test[['plant_id_eia','emissions_unit_id_epa']].drop_duplicates()" + "cems_missing_esc_test[[\"plant_id_eia\", \"emissions_unit_id_epa\"]].drop_duplicates()" ] }, { @@ -197,12 +250,20 @@ "outputs": [], "source": [ "year = 2020\n", - "cems = pd.read_csv(f'../data/outputs/{year}/cems_{year}.csv', dtype=get_dtypes())\n", - "partial_cems_scaled = pd.read_csv(f'../data/outputs/{year}/partial_cems_scaled_{year}.csv', dtype=get_dtypes())\n", - "eia923_allocated = pd.read_csv(f'../data/outputs/{year}/eia923_allocated_{year}.csv', dtype=get_dtypes())\n", + "cems = pd.read_csv(f\"../data/outputs/{year}/cems_{year}.csv\", dtype=get_dtypes())\n", + "partial_cems_scaled = pd.read_csv(\n", + " f\"../data/outputs/{year}/partial_cems_scaled_{year}.csv\", dtype=get_dtypes()\n", + ")\n", + "eia923_allocated = pd.read_csv(\n", + " f\"../data/outputs/{year}/eia923_allocated_{year}.csv\", dtype=get_dtypes()\n", + ")\n", "\n", - "plant_attributes = pd.read_csv(f\"../data/outputs/{year}/plant_static_attributes_{year}.csv\")\n", - "eia923_allocated = eia923_allocated.merge(plant_attributes, how=\"left\", on=\"plant_id_eia\")\n", + "plant_attributes = pd.read_csv(\n", + " f\"../data/outputs/{year}/plant_static_attributes_{year}.csv\"\n", + ")\n", + "eia923_allocated = eia923_allocated.merge(\n", + " plant_attributes, how=\"left\", on=\"plant_id_eia\"\n", + ")\n", "cems = cems.merge(plant_attributes, how=\"left\", on=\"plant_id_eia\")" ] }, @@ -212,7 +273,9 @@ "metadata": {}, "outputs": [], "source": [ - "partial_cems_scaled = partial_cems_scaled.merge(plant_attributes, how=\"left\", on=\"plant_id_eia\")" + "partial_cems_scaled = partial_cems_scaled.merge(\n", + " plant_attributes, how=\"left\", on=\"plant_id_eia\"\n", + 
")" ] }, { @@ -223,9 +286,14 @@ "source": [ "ba = \"CISO\"\n", "fuel = \"natural_gas\"\n", - "test_eia = eia923_allocated[(eia923_allocated[\"ba_code\"] == ba) & (eia923_allocated[\"fuel_category\"] == fuel)]\n", + "test_eia = eia923_allocated[\n", + " (eia923_allocated[\"ba_code\"] == ba) & (eia923_allocated[\"fuel_category\"] == fuel)\n", + "]\n", "test_cems = cems[(cems[\"ba_code\"] == ba) & (cems[\"fuel_category\"] == fuel)]\n", - "test_pc = partial_cems_scaled[(partial_cems_scaled[\"ba_code\"] == ba) & (partial_cems_scaled[\"fuel_category\"] == fuel)]" + "test_pc = partial_cems_scaled[\n", + " (partial_cems_scaled[\"ba_code\"] == ba)\n", + " & (partial_cems_scaled[\"fuel_category\"] == fuel)\n", + "]" ] }, { @@ -234,7 +302,7 @@ "metadata": {}, "outputs": [], "source": [ - "test_eia.groupby('hourly_data_source').sum()['net_generation_mwh']" + "test_eia.groupby(\"hourly_data_source\").sum()[\"net_generation_mwh\"]" ] }, { @@ -243,7 +311,7 @@ "metadata": {}, "outputs": [], "source": [ - "test_cems[[\"gross_generation_mwh\",'net_generation_mwh']].sum()" + "test_cems[[\"gross_generation_mwh\", \"net_generation_mwh\"]].sum()" ] }, { @@ -252,7 +320,7 @@ "metadata": {}, "outputs": [], "source": [ - "test_pc[['net_generation_mwh']].sum()" + "test_pc[[\"net_generation_mwh\"]].sum()" ] }, { @@ -261,7 +329,7 @@ "metadata": {}, "outputs": [], "source": [ - "test_pc = test_pc.drop(columns='source')" + "test_pc = test_pc.drop(columns=\"source\")" ] }, { @@ -270,7 +338,7 @@ "metadata": {}, "outputs": [], "source": [ - "test_cems = test_cems.drop(columns='source')" + "test_cems = test_cems.drop(columns=\"source\")" ] }, { @@ -300,7 +368,7 @@ "metadata": {}, "outputs": [], "source": [ - "filtered_cems[[\"gross_generation_mwh\",'net_generation_mwh']].sum()" + "filtered_cems[[\"gross_generation_mwh\", \"net_generation_mwh\"]].sum()" ] }, { @@ -309,9 +377,16 @@ "metadata": {}, "outputs": [], "source": [ - "subplants_ided_as_cems = test_eia.loc[test_eia[\"hourly_data_source\"] == 
'cems', [\"plant_id_eia\",\"subplant_id\"]].drop_duplicates()\n", - "subplants_in_cems = filtered_cems[[\"plant_id_eia\",\"subplant_id\"]].drop_duplicates()\n", - "cems_overlap = subplants_ided_as_cems.merge(subplants_in_cems, how=\"outer\", on=[\"plant_id_eia\",\"subplant_id\"], indicator=\"source\")\n", + "subplants_ided_as_cems = test_eia.loc[\n", + " test_eia[\"hourly_data_source\"] == \"cems\", [\"plant_id_eia\", \"subplant_id\"]\n", + "].drop_duplicates()\n", + "subplants_in_cems = filtered_cems[[\"plant_id_eia\", \"subplant_id\"]].drop_duplicates()\n", + "cems_overlap = subplants_ided_as_cems.merge(\n", + " subplants_in_cems,\n", + " how=\"outer\",\n", + " on=[\"plant_id_eia\", \"subplant_id\"],\n", + " indicator=\"source\",\n", + ")\n", "cems_overlap" ] }, @@ -321,9 +396,13 @@ "metadata": {}, "outputs": [], "source": [ - "subplants_ided_as_pc = test_eia.loc[test_eia[\"hourly_data_source\"] == 'partial_cems', [\"plant_id_eia\",\"subplant_id\"]].drop_duplicates()\n", - "subplants_in_pc = test_pc[[\"plant_id_eia\",\"subplant_id\"]].drop_duplicates()\n", - "pc_overlap = subplants_ided_as_pc.merge(subplants_in_pc, how=\"outer\", on=[\"plant_id_eia\",\"subplant_id\"], indicator=\"source\")\n", + "subplants_ided_as_pc = test_eia.loc[\n", + " test_eia[\"hourly_data_source\"] == \"partial_cems\", [\"plant_id_eia\", \"subplant_id\"]\n", + "].drop_duplicates()\n", + "subplants_in_pc = test_pc[[\"plant_id_eia\", \"subplant_id\"]].drop_duplicates()\n", + "pc_overlap = subplants_ided_as_pc.merge(\n", + " subplants_in_pc, how=\"outer\", on=[\"plant_id_eia\", \"subplant_id\"], indicator=\"source\"\n", + ")\n", "pc_overlap" ] }, @@ -333,7 +412,7 @@ "metadata": {}, "outputs": [], "source": [ - "test_cems.loc[test_cems['plant_id_eia'] == 55748, \"net_generation_mwh\"].sum()" + "test_cems.loc[test_cems[\"plant_id_eia\"] == 55748, \"net_generation_mwh\"].sum()" ] }, { @@ -350,15 +429,53 @@ "outputs": [], "source": [ "# for plants where there is data reported in cems, see how 
off it is from data reported in eia\n", - "cems_plant_monthly = cems.groupby(['plant_id_eia','subplant_id','report_date'], dropna=False).sum()[['gross_generation_mwh','net_generation_mwh','fuel_consumed_mmbtu','fuel_consumed_for_electricity_mmbtu','co2_mass_lb','co2_mass_lb_adjusted']].reset_index()\n", - "gf_plant_monthly = eia923_allocated.groupby(['plant_id_eia','subplant_id','report_date'], dropna=False).sum().reset_index()\n", - "compare_cems_eia = gf_plant_monthly.merge(cems_plant_monthly, how='inner', on=['plant_id_eia','subplant_id','report_date'], suffixes=(\"_eia\",'_cems'))\n", - "\n", + "cems_plant_monthly = (\n", + " cems.groupby([\"plant_id_eia\", \"subplant_id\", \"report_date\"], dropna=False)\n", + " .sum()[\n", + " [\n", + " \"gross_generation_mwh\",\n", + " \"net_generation_mwh\",\n", + " \"fuel_consumed_mmbtu\",\n", + " \"fuel_consumed_for_electricity_mmbtu\",\n", + " \"co2_mass_lb\",\n", + " \"co2_mass_lb_adjusted\",\n", + " ]\n", + " ]\n", + " .reset_index()\n", + ")\n", + "gf_plant_monthly = (\n", + " eia923_allocated.groupby(\n", + " [\"plant_id_eia\", \"subplant_id\", \"report_date\"], dropna=False\n", + " )\n", + " .sum()\n", + " .reset_index()\n", + ")\n", + "compare_cems_eia = gf_plant_monthly.merge(\n", + " cems_plant_monthly,\n", + " how=\"inner\",\n", + " on=[\"plant_id_eia\", \"subplant_id\", \"report_date\"],\n", + " suffixes=(\"_eia\", \"_cems\"),\n", + ")\n", "\n", - "for column in ['net_generation_mwh','fuel_consumed_mmbtu','fuel_consumed_for_electricity_mmbtu','co2_mass_lb','co2_mass_lb_adjusted']:\n", - " compare_cems_eia[f'{column}_pctdiff'] = ((compare_cems_eia[f'{column}_cems'].replace(0,0.1) - compare_cems_eia[f'{column}_eia'].replace(0,0.1)) / compare_cems_eia[f'{column}_eia'].replace(0,0.1)).round(3)\n", "\n", - "compare_cems_eia = compare_cems_eia.set_index(['plant_id_eia','subplant_id','report_date'])\n", + "for column in [\n", + " \"net_generation_mwh\",\n", + " \"fuel_consumed_mmbtu\",\n", + " 
\"fuel_consumed_for_electricity_mmbtu\",\n", + " \"co2_mass_lb\",\n", + " \"co2_mass_lb_adjusted\",\n", + "]:\n", + " compare_cems_eia[f\"{column}_pctdiff\"] = (\n", + " (\n", + " compare_cems_eia[f\"{column}_cems\"].replace(0, 0.1)\n", + " - compare_cems_eia[f\"{column}_eia\"].replace(0, 0.1)\n", + " )\n", + " / compare_cems_eia[f\"{column}_eia\"].replace(0, 0.1)\n", + " ).round(3)\n", + "\n", + "compare_cems_eia = compare_cems_eia.set_index(\n", + " [\"plant_id_eia\", \"subplant_id\", \"report_date\"]\n", + ")\n", "compare_cems_eia = compare_cems_eia.reindex(sorted(compare_cems_eia.columns), axis=1)" ] }, @@ -369,10 +486,10 @@ "outputs": [], "source": [ "# identify where there are differences between reported CEMS and EIA values for the same subplant-month\n", - "value = 'net_generation_mwh'\n", + "value = \"net_generation_mwh\"\n", "\n", - "comparison = compare_cems_eia[[f'{value}_cems', f'{value}_eia', f'{value}_pctdiff']]\n", - "comparison[(~comparison[f'{value}_pctdiff'].between(-0.05,0.05))]" + "comparison = compare_cems_eia[[f\"{value}_cems\", f\"{value}_eia\", f\"{value}_pctdiff\"]]\n", + "comparison[(~comparison[f\"{value}_pctdiff\"].between(-0.05, 0.05))]" ] }, { @@ -389,10 +506,15 @@ "outputs": [], "source": [ "# filter the data for which we only have EIA data\n", - "monthly_eia_data_to_distribute = eia923_allocated[(eia923_allocated['hourly_data_source'] == 'eia') & ~(eia923_allocated['fuel_consumed_mmbtu'].isna())]\n", + "monthly_eia_data_to_distribute = eia923_allocated[\n", + " (eia923_allocated[\"hourly_data_source\"] == \"eia\")\n", + " & ~(eia923_allocated[\"fuel_consumed_mmbtu\"].isna())\n", + "]\n", "\n", "# assign ba codes to the data\n", - "monthly_eia_data_to_distribute = assign_ba_code_to_plant(monthly_eia_data_to_distribute, year)\n", + "monthly_eia_data_to_distribute = assign_ba_code_to_plant(\n", + " monthly_eia_data_to_distribute, year\n", + ")\n", "cems = assign_ba_code_to_plant(cems, year)" ] }, @@ -406,14 +528,45 @@ 
"###################################\n", "\n", "# Aggregate cems and eia data by plant id, then combine\n", - "cems_plant_annual = cems.groupby(['ba_code','state','plant_id_eia'], dropna=False).sum()[['net_generation_mwh','fuel_consumed_mmbtu','co2_mass_lb','co2_mass_lb_adjusted']].reset_index()\n", - "eia_plant_annual = monthly_eia_data_to_distribute.groupby(['ba_code','state','plant_id_eia'], dropna=False).sum()[['net_generation_mwh','fuel_consumed_mmbtu','fuel_consumed_for_electricity_mmbtu','co2_mass_lb','co2_mass_lb_adjusted']].reset_index()\n", - "plant_annual_total = pd.concat([cems_plant_annual,eia_plant_annual], axis=0)\n", + "cems_plant_annual = (\n", + " cems.groupby([\"ba_code\", \"state\", \"plant_id_eia\"], dropna=False)\n", + " .sum()[\n", + " [\n", + " \"net_generation_mwh\",\n", + " \"fuel_consumed_mmbtu\",\n", + " \"co2_mass_lb\",\n", + " \"co2_mass_lb_adjusted\",\n", + " ]\n", + " ]\n", + " .reset_index()\n", + ")\n", + "eia_plant_annual = (\n", + " monthly_eia_data_to_distribute.groupby(\n", + " [\"ba_code\", \"state\", \"plant_id_eia\"], dropna=False\n", + " )\n", + " .sum()[\n", + " [\n", + " \"net_generation_mwh\",\n", + " \"fuel_consumed_mmbtu\",\n", + " \"fuel_consumed_for_electricity_mmbtu\",\n", + " \"co2_mass_lb\",\n", + " \"co2_mass_lb_adjusted\",\n", + " ]\n", + " ]\n", + " .reset_index()\n", + ")\n", + "plant_annual_total = pd.concat([cems_plant_annual, eia_plant_annual], axis=0)\n", "# group any plants that have records from both datasets\n", - "plant_annual_total = plant_annual_total.groupby(['ba_code','state','plant_id_eia'], dropna=False).sum().reset_index()\n", + "plant_annual_total = (\n", + " plant_annual_total.groupby([\"ba_code\", \"state\", \"plant_id_eia\"], dropna=False)\n", + " .sum()\n", + " .reset_index()\n", + ")\n", "\n", "# add a egrid id\n", - "plant_annual_total = validation.add_egrid_plant_id(plant_annual_total, from_id='eia', to_id='egrid')\n", + "plant_annual_total = validation.add_egrid_plant_id(\n", + " 
plant_annual_total, from_id=\"eia\", to_id=\"egrid\"\n", + ")\n", "\n", "# Load the eGRID plant table\n", "egrid_plant = validation.load_egrid_plant_file(year)" @@ -435,14 +588,21 @@ "outputs": [], "source": [ "# identify any plants that are in egrid but not our totals, and any plants that are in our totals, but not egrid\n", - "plant_not_in_calc = list(set(egrid_plant['plant_id_eia'].unique()) - set(plant_annual_total['plant_id_eia'].unique()))\n", + "plant_not_in_calc = list(\n", + " set(egrid_plant[\"plant_id_eia\"].unique())\n", + " - set(plant_annual_total[\"plant_id_eia\"].unique())\n", + ")\n", "\n", "# Which plants are included in eGRID but are missing from our calculations?\n", - "missing_from_calc = egrid_plant[egrid_plant['plant_id_egrid'].isin(plant_not_in_calc)]\n", + "missing_from_calc = egrid_plant[egrid_plant[\"plant_id_egrid\"].isin(plant_not_in_calc)]\n", "\n", "# see if any of these plants are retired\n", - "generators_eia860 = load_data.load_pudl_table('generators_eia860', year=year)\n", - "missing_from_calc.merge(generators_eia860.groupby('plant_id_eia')['retirement_date'].unique().reset_index(), how='left', on='plant_id_eia')" + "generators_eia860 = load_data.load_pudl_table(\"generators_eia860\", year=year)\n", + "missing_from_calc.merge(\n", + " generators_eia860.groupby(\"plant_id_eia\")[\"retirement_date\"].unique().reset_index(),\n", + " how=\"left\",\n", + " on=\"plant_id_eia\",\n", + ")" ] }, { @@ -459,10 +619,17 @@ "outputs": [], "source": [ "# Which plants are in our calculations, but are missing from eGRID?\n", - "plants_not_in_egrid = list(set(plant_annual_total['plant_id_egrid'].unique()) - set(egrid_plant['plant_id_egrid'].unique()))\n", + "plants_not_in_egrid = list(\n", + " set(plant_annual_total[\"plant_id_egrid\"].unique())\n", + " - set(egrid_plant[\"plant_id_egrid\"].unique())\n", + ")\n", "\n", - "plant_names = load_data.load_pudl_table('plants_entity_eia')[['plant_id_eia','plant_name_eia','sector_name_eia']]\n", - 
"missing_from_egrid = plant_annual_total[plant_annual_total['plant_id_egrid'].isin(plants_not_in_egrid)].merge(plant_names, how='left', on='plant_id_eia')\n", + "plant_names = load_data.load_pudl_table(\"plants_entity_eia\")[\n", + " [\"plant_id_eia\", \"plant_name_eia\", \"sector_name_eia\"]\n", + "]\n", + "missing_from_egrid = plant_annual_total[\n", + " plant_annual_total[\"plant_id_egrid\"].isin(plants_not_in_egrid)\n", + "].merge(plant_names, how=\"left\", on=\"plant_id_eia\")\n", "\n", "missing_from_egrid" ] @@ -474,7 +641,7 @@ "outputs": [], "source": [ "# how many of the plants missing from egrid have non-zero data\n", - "missing_from_egrid[missing_from_egrid['fuel_consumed_mmbtu'] > 1].count()" + "missing_from_egrid[missing_from_egrid[\"fuel_consumed_mmbtu\"] > 1].count()" ] }, { @@ -491,11 +658,23 @@ "outputs": [], "source": [ "# identify where there is a single egrid plant id for multiple eia plant ids\n", - "double_ids = plant_annual_total[plant_annual_total['plant_id_egrid'].duplicated(keep=False)]\n", - "double_ids = double_ids.groupby('plant_id_egrid').sum()['net_generation_mwh'].reset_index() # focus on net generation for now\n", + "double_ids = plant_annual_total[\n", + " plant_annual_total[\"plant_id_egrid\"].duplicated(keep=False)\n", + "]\n", + "double_ids = (\n", + " double_ids.groupby(\"plant_id_egrid\").sum()[\"net_generation_mwh\"].reset_index()\n", + ") # focus on net generation for now\n", "# merge the egrid data\n", - "double_ids = double_ids.merge(egrid_plant[['plant_id_egrid','net_generation_mwh']], how='left', on='plant_id_egrid', suffixes=('_calc','_egrid'))\n", - "double_ids['percent_diff'] = ((double_ids['net_generation_mwh_calc'] - double_ids['net_generation_mwh_egrid']) / double_ids['net_generation_mwh_egrid']).round(3)\n", + "double_ids = double_ids.merge(\n", + " egrid_plant[[\"plant_id_egrid\", \"net_generation_mwh\"]],\n", + " how=\"left\",\n", + " on=\"plant_id_egrid\",\n", + " suffixes=(\"_calc\", \"_egrid\"),\n", + ")\n", + 
"double_ids[\"percent_diff\"] = (\n", + " (double_ids[\"net_generation_mwh_calc\"] - double_ids[\"net_generation_mwh_egrid\"])\n", + " / double_ids[\"net_generation_mwh_egrid\"]\n", + ").round(3)\n", "double_ids" ] }, @@ -512,10 +691,18 @@ "metadata": {}, "outputs": [], "source": [ - "ba_code_match = egrid_plant.set_index('plant_id_eia')[['plant_name','ba_code']].merge(plant_annual_total.set_index('plant_id_eia')[['ba_code']], how='inner', left_index=True, right_index=True, suffixes=(\"_egrid\",'_calc'))\n", + "ba_code_match = egrid_plant.set_index(\"plant_id_eia\")[[\"plant_name\", \"ba_code\"]].merge(\n", + " plant_annual_total.set_index(\"plant_id_eia\")[[\"ba_code\"]],\n", + " how=\"inner\",\n", + " left_index=True,\n", + " right_index=True,\n", + " suffixes=(\"_egrid\", \"_calc\"),\n", + ")\n", "\n", "# plants with missing ba code\n", - "ba_code_match[(ba_code_match['ba_code_calc'].isna()) & ~(ba_code_match['ba_code_egrid'].isna())]" + "ba_code_match[\n", + " (ba_code_match[\"ba_code_calc\"].isna()) & ~(ba_code_match[\"ba_code_egrid\"].isna())\n", + "]" ] }, { @@ -525,7 +712,10 @@ "outputs": [], "source": [ "# plants with incorrect ba code\n", - "ba_code_match[(ba_code_match['ba_code_calc'] != ba_code_match['ba_code_egrid']) & ~(ba_code_match['ba_code_calc'].isna())]" + "ba_code_match[\n", + " (ba_code_match[\"ba_code_calc\"] != ba_code_match[\"ba_code_egrid\"])\n", + " & ~(ba_code_match[\"ba_code_calc\"].isna())\n", + "]" ] }, { @@ -598,9 +788,7 @@ ") / egrid_eia_comparison[f\"{metric}_eia923\"]\n", "egrid_eia_comparison.loc[\n", " egrid_eia_comparison[\"difference\"] == 0, \"percent_difference\"\n", - "] = 0\n", - "\n", - "\n" + "] = 0" ] }, { @@ -610,13 +798,20 @@ "outputs": [], "source": [ "# add cems data\n", - "cems_total = cems.copy()[['plant_id_eia',metric]]\n", + "cems_total = cems.copy()[[\"plant_id_eia\", metric]]\n", "cems_total[\"plant_id_egrid\"] = cems_total[\"plant_id_eia\"]\n", 
"cems_total[\"plant_id_egrid\"].update(cems_total[\"plant_id_egrid\"].map(eia_to_egrid_id))\n", - "cems_total = cems_total.groupby('plant_id_egrid').sum()[metric].reset_index().rename(columns={metric:f\"{metric}_cems\"})\n", + "cems_total = (\n", + " cems_total.groupby(\"plant_id_egrid\")\n", + " .sum()[metric]\n", + " .reset_index()\n", + " .rename(columns={metric: f\"{metric}_cems\"})\n", + ")\n", "\n", - "# merge cems data into egrid \n", - "egrid_eia_comparison = egrid_eia_comparison.merge(cems_total, how='outer', on='plant_id_egrid')" + "# merge cems data into egrid\n", + "egrid_eia_comparison = egrid_eia_comparison.merge(\n", + " cems_total, how=\"outer\", on=\"plant_id_egrid\"\n", + ")" ] }, { @@ -641,7 +836,7 @@ "metadata": {}, "outputs": [], "source": [ - "egrid_eia_comparison[egrid_eia_comparison['source'] == 'left_only']" + "egrid_eia_comparison[egrid_eia_comparison[\"source\"] == \"left_only\"]" ] }, { @@ -651,7 +846,9 @@ "outputs": [], "source": [ "# egrid seems to be missing fuel consumption data for most nuclear power plants\n", - "missing_nuclear = egrid_eia_comparison[egrid_eia_comparison['energy_source_code'] == 'NUC']\n", + "missing_nuclear = egrid_eia_comparison[\n", + " egrid_eia_comparison[\"energy_source_code\"] == \"NUC\"\n", + "]\n", "missing_nuclear.sum()" ] }, @@ -661,7 +858,7 @@ "metadata": {}, "outputs": [], "source": [ - "egrid_eia_comparison[(egrid_eia_comparison['percent_difference'] < - 0.01)]" + "egrid_eia_comparison[(egrid_eia_comparison[\"percent_difference\"] < -0.01)]" ] }, { @@ -671,7 +868,10 @@ "outputs": [], "source": [ "# where is egrid missing data?\n", - "egrid_eia_comparison[(egrid_eia_comparison['percent_difference'] < -0.01) & (egrid_eia_comparison['energy_source_code'] != 'NUC')]#.sort_values(by='percent_difference').head(20)" + "egrid_eia_comparison[\n", + " (egrid_eia_comparison[\"percent_difference\"] < -0.01)\n", + " & (egrid_eia_comparison[\"energy_source_code\"] != \"NUC\")\n", + "] # 
.sort_values(by='percent_difference').head(20)" ] }, { @@ -682,12 +882,26 @@ "source": [ "# how much emissions does this account for?\n", "# group by fuel code\n", - "missing_emissions = egrid_eia_comparison[(egrid_eia_comparison['percent_difference'] < -0.01) & (egrid_eia_comparison['energy_source_code'] != 'NUC')].groupby('energy_source_code').sum().reset_index()\n", + "missing_emissions = (\n", + " egrid_eia_comparison[\n", + " (egrid_eia_comparison[\"percent_difference\"] < -0.01)\n", + " & (egrid_eia_comparison[\"energy_source_code\"] != \"NUC\")\n", + " ]\n", + " .groupby(\"energy_source_code\")\n", + " .sum()\n", + " .reset_index()\n", + ")\n", "\n", "# get emission factors\n", - "emission_factors = load_data.load_ghg_emission_factors()[['energy_source_code', 'co2_lb_per_mmbtu']]\n", - "missing_emissions = missing_emissions.merge(emission_factors, how='left', on='energy_source_code')\n", - "missing_emissions['co2_mass_lb'] = missing_emissions['difference'] * missing_emissions['co2_lb_per_mmbtu']\n", + "emission_factors = load_data.load_ghg_emission_factors()[\n", + " [\"energy_source_code\", \"co2_lb_per_mmbtu\"]\n", + "]\n", + "missing_emissions = missing_emissions.merge(\n", + " emission_factors, how=\"left\", on=\"energy_source_code\"\n", + ")\n", + "missing_emissions[\"co2_mass_lb\"] = (\n", + " missing_emissions[\"difference\"] * missing_emissions[\"co2_lb_per_mmbtu\"]\n", + ")\n", "missing_emissions.sum()" ] }, @@ -705,67 +919,196 @@ "outputs": [], "source": [ "# standardize column names and index so that the two dfs can be divided\n", - "calculated_to_compare = plant_annual_total.groupby('plant_id_egrid').sum().drop(columns=['plant_id_eia'])\n", + "calculated_to_compare = (\n", + " plant_annual_total.groupby(\"plant_id_egrid\").sum().drop(columns=[\"plant_id_eia\"])\n", + ")\n", "\n", "# drop the plants that have no data in eGRID\n", - "plants_with_no_data_in_egrid = 
list(egrid_plant[egrid_plant[['net_generation_mwh','fuel_consumed_mmbtu','fuel_consumed_for_electricity_mmbtu','co2_mass_lb','co2_mass_lb_adjusted']].sum(axis=1) == 0]['plant_id_egrid'])\n", - "egrid_plant = egrid_plant[~egrid_plant['plant_id_eia'].isin(plants_with_no_data_in_egrid)]\n", + "plants_with_no_data_in_egrid = list(\n", + " egrid_plant[\n", + " egrid_plant[\n", + " [\n", + " \"net_generation_mwh\",\n", + " \"fuel_consumed_mmbtu\",\n", + " \"fuel_consumed_for_electricity_mmbtu\",\n", + " \"co2_mass_lb\",\n", + " \"co2_mass_lb_adjusted\",\n", + " ]\n", + " ].sum(axis=1)\n", + " == 0\n", + " ][\"plant_id_egrid\"]\n", + ")\n", + "egrid_plant = egrid_plant[\n", + " ~egrid_plant[\"plant_id_eia\"].isin(plants_with_no_data_in_egrid)\n", + "]\n", "\n", - "egrid_to_compare = egrid_plant.set_index(['plant_id_egrid']).drop(columns=['ba_code','state','plant_name','plant_id_eia'])\n", + "egrid_to_compare = egrid_plant.set_index([\"plant_id_egrid\"]).drop(\n", + " columns=[\"ba_code\", \"state\", \"plant_name\", \"plant_id_eia\"]\n", + ")\n", "\n", "# divide calculated value by egrid value\n", - "compared = calculated_to_compare.div(egrid_to_compare).merge(egrid_plant[['plant_id_egrid','plant_name','ba_code', 'state']], how='left', left_index=True, right_on='plant_id_egrid').set_index('plant_id_egrid')\n", - "compared['plant_name'] = compared['plant_name'].fillna('unknown')\n", + "compared = (\n", + " calculated_to_compare.div(egrid_to_compare)\n", + " .merge(\n", + " egrid_plant[[\"plant_id_egrid\", \"plant_name\", \"ba_code\", \"state\"]],\n", + " how=\"left\",\n", + " left_index=True,\n", + " right_on=\"plant_id_egrid\",\n", + " )\n", + " .set_index(\"plant_id_egrid\")\n", + ")\n", + "compared[\"plant_name\"] = compared[\"plant_name\"].fillna(\"unknown\")\n", "\n", "# create a dataframe that merges the two sources of data together\n", - "compared_merged = calculated_to_compare.merge(egrid_to_compare, how='outer', on='plant_id_egrid', suffixes=('_calc','_egrid'))\n", 
+ "compared_merged = calculated_to_compare.merge(\n", + " egrid_to_compare, how=\"outer\", on=\"plant_id_egrid\", suffixes=(\"_calc\", \"_egrid\")\n", + ")\n", "\n", "# for each column, change missing values to zero if both values are zero (only nan b/c divide by zero)\n", - "for col in ['net_generation_mwh','fuel_consumed_mmbtu','fuel_consumed_for_electricity_mmbtu','co2_mass_lb_adjusted','co2_mass_lb']:\n", + "for col in [\n", + " \"net_generation_mwh\",\n", + " \"fuel_consumed_mmbtu\",\n", + " \"fuel_consumed_for_electricity_mmbtu\",\n", + " \"co2_mass_lb_adjusted\",\n", + " \"co2_mass_lb\",\n", + "]:\n", " # identify plants with zero values for both\n", - " plant_ids = list(compared_merged[(compared_merged[f'{col}_calc'] == 0) & (compared_merged[f'{col}_egrid'] == 0)].index)\n", + " plant_ids = list(\n", + " compared_merged[\n", + " (compared_merged[f\"{col}_calc\"] == 0)\n", + " & (compared_merged[f\"{col}_egrid\"] == 0)\n", + " ].index\n", + " )\n", " compared.loc[compared.index.isin(plant_ids), col] = 1\n", "\n", "# for each column, categorize the data based on how far it is off from egrid\n", - "for col in ['net_generation_mwh','fuel_consumed_mmbtu','fuel_consumed_for_electricity_mmbtu','co2_mass_lb_adjusted','co2_mass_lb']:\n", + "for col in [\n", + " \"net_generation_mwh\",\n", + " \"fuel_consumed_mmbtu\",\n", + " \"fuel_consumed_for_electricity_mmbtu\",\n", + " \"co2_mass_lb_adjusted\",\n", + " \"co2_mass_lb\",\n", + "]:\n", " # add a new column\n", - " compared[f'{col}_status'] = pd.cut(x=compared[col], \n", - " bins=[-999999999,0,0.5,0.9,0.99,0.9999,1,1.0001,1.01,1.1,1.5,999999999], \n", - " labels=['negative','<50%','+/-50%','+/-10%','+/-1%','!exact','!exact','+/-1%','+/-10%','+/-50%','>50%'], \n", - " ordered=False)\n", + " compared[f\"{col}_status\"] = pd.cut(\n", + " x=compared[col],\n", + " bins=[\n", + " -999999999,\n", + " 0,\n", + " 0.5,\n", + " 0.9,\n", + " 0.99,\n", + " 0.9999,\n", + " 1,\n", + " 1.0001,\n", + " 1.01,\n", + " 1.1,\n", + " 
1.5,\n", + " 999999999,\n", + " ],\n", + " labels=[\n", + " \"negative\",\n", + " \"<50%\",\n", + " \"+/-50%\",\n", + " \"+/-10%\",\n", + " \"+/-1%\",\n", + " \"!exact\",\n", + " \"!exact\",\n", + " \"+/-1%\",\n", + " \"+/-10%\",\n", + " \"+/-50%\",\n", + " \">50%\",\n", + " ],\n", + " ordered=False,\n", + " )\n", " # replace any missing values with missing\n", - " compared[f'{col}_status'] = compared[f'{col}_status'].astype(str) \n", - " compared[f'{col}_status'] = compared[f'{col}_status'].fillna('missing')\n", - " compared[f'{col}_status'] = compared[f'{col}_status'].replace('nan','missing')\n", - " compared.loc[(compared.index.isin(plants_not_in_egrid)),f'{col}_status'] = 'not_in_egrid'\n", + " compared[f\"{col}_status\"] = compared[f\"{col}_status\"].astype(str)\n", + " compared[f\"{col}_status\"] = compared[f\"{col}_status\"].fillna(\"missing\")\n", + " compared[f\"{col}_status\"] = compared[f\"{col}_status\"].replace(\"nan\", \"missing\")\n", + " compared.loc[\n", + " (compared.index.isin(plants_not_in_egrid)), f\"{col}_status\"\n", + " ] = \"not_in_egrid\"\n", "\n", "# identify which plants are missing from egrid vs calculated values\n", - "for col in ['net_generation_mwh','fuel_consumed_mmbtu','fuel_consumed_for_electricity_mmbtu','co2_mass_lb_adjusted','co2_mass_lb']:\n", + "for col in [\n", + " \"net_generation_mwh\",\n", + " \"fuel_consumed_mmbtu\",\n", + " \"fuel_consumed_for_electricity_mmbtu\",\n", + " \"co2_mass_lb_adjusted\",\n", + " \"co2_mass_lb\",\n", + "]:\n", " # identify plants that are missing in egrid\n", - " plants_missing_egrid = list(compared_merged[(compared_merged[f'{col}_calc'] > 0) & (compared_merged[f'{col}_egrid'].isna())].index)\n", - " compared.loc[compared.index.isin(plants_missing_egrid), f'{col}_status'] = 'missing_in_egrid'\n", + " plants_missing_egrid = list(\n", + " compared_merged[\n", + " (compared_merged[f\"{col}_calc\"] > 0)\n", + " & (compared_merged[f\"{col}_egrid\"].isna())\n", + " ].index\n", + " )\n", + " 
compared.loc[\n", + " compared.index.isin(plants_missing_egrid), f\"{col}_status\"\n", + " ] = \"missing_in_egrid\"\n", " # identify plants that are missing from our calculations\n", - " plants_missing_calc = list(compared_merged[(compared_merged[f'{col}_calc'].isna()) & (compared_merged[f'{col}_egrid'] > 0)].index)\n", - " compared.loc[compared.index.isin(plants_missing_calc), f'{col}_status'] = 'missing_in_calc'\n", + " plants_missing_calc = list(\n", + " compared_merged[\n", + " (compared_merged[f\"{col}_calc\"].isna())\n", + " & (compared_merged[f\"{col}_egrid\"] > 0)\n", + " ].index\n", + " )\n", + " compared.loc[\n", + " compared.index.isin(plants_missing_calc), f\"{col}_status\"\n", + " ] = \"missing_in_calc\"\n", " # identify where our calculations are missing a zero value\n", - " plants_missing_zero_calc = list(compared_merged[(compared_merged[f'{col}_calc'].isna()) & (compared_merged[f'{col}_egrid'] == 0)].index)\n", - " compared.loc[compared.index.isin(plants_missing_zero_calc), f'{col}_status'] = 'missing_zero_in_calc'\n", + " plants_missing_zero_calc = list(\n", + " compared_merged[\n", + " (compared_merged[f\"{col}_calc\"].isna())\n", + " & (compared_merged[f\"{col}_egrid\"] == 0)\n", + " ].index\n", + " )\n", + " compared.loc[\n", + " compared.index.isin(plants_missing_zero_calc), f\"{col}_status\"\n", + " ] = \"missing_zero_in_calc\"\n", " # identify where egrid has a missing value instead of a zero\n", - " plants_missing_zero_egrid = list(compared_merged[(compared_merged[f'{col}_calc'] == 0) & (compared_merged[f'{col}_egrid'].isna())].index)\n", - " compared.loc[compared.index.isin(plants_missing_zero_egrid), f'{col}_status'] = 'missing_zero_in_egrid'\n", + " plants_missing_zero_egrid = list(\n", + " compared_merged[\n", + " (compared_merged[f\"{col}_calc\"] == 0)\n", + " & (compared_merged[f\"{col}_egrid\"].isna())\n", + " ].index\n", + " )\n", + " compared.loc[\n", + " compared.index.isin(plants_missing_zero_egrid), f\"{col}_status\"\n", + " ] = 
\"missing_zero_in_egrid\"\n", " # identify where egrid has a zero value where we have a positive value\n", - " plants_incorrect_zero_egrid = list(compared_merged[(compared_merged[f'{col}_calc'] > 0) & (compared_merged[f'{col}_egrid'] == 0)].index)\n", - " compared.loc[compared.index.isin(plants_incorrect_zero_egrid), f'{col}_status'] = '>50%'\n", + " plants_incorrect_zero_egrid = list(\n", + " compared_merged[\n", + " (compared_merged[f\"{col}_calc\"] > 0)\n", + " & (compared_merged[f\"{col}_egrid\"] == 0)\n", + " ].index\n", + " )\n", + " compared.loc[\n", + " compared.index.isin(plants_incorrect_zero_egrid), f\"{col}_status\"\n", + " ] = \">50%\"\n", "\n", "# create a dataframe that counts how many plants are in each category\n", "comparison_count = []\n", - "for col in ['net_generation_mwh','fuel_consumed_mmbtu','fuel_consumed_for_electricity_mmbtu','co2_mass_lb_adjusted','co2_mass_lb']:\n", - " count = compared.groupby(f'{col}_status', dropna=False).count()['plant_name'].rename(col)\n", - " count.index = count.index.rename('status')\n", + "for col in [\n", + " \"net_generation_mwh\",\n", + " \"fuel_consumed_mmbtu\",\n", + " \"fuel_consumed_for_electricity_mmbtu\",\n", + " \"co2_mass_lb_adjusted\",\n", + " \"co2_mass_lb\",\n", + "]:\n", + " count = (\n", + " compared.groupby(f\"{col}_status\", dropna=False)\n", + " .count()[\"plant_name\"]\n", + " .rename(col)\n", + " )\n", + " count.index = count.index.rename(\"status\")\n", " comparison_count.append(count)\n", "\n", "comparison_count = pd.concat(comparison_count, axis=1).fillna(0).astype(int)\n", - "comparison_count = pd.concat([comparison_count, pd.DataFrame(comparison_count.sum().rename('Total')).T], axis=0) \n", + "comparison_count = pd.concat(\n", + " [comparison_count, pd.DataFrame(comparison_count.sum().rename(\"Total\")).T], axis=0\n", + ")\n", "comparison_count" ] }, @@ -800,28 +1143,59 @@ " 'BANGENAN':'net_generation_mwh',\n", " 'BACO2AN':'co2_mass_lb'})\"\"\"\n", "\n", - "data_columns = 
['net_generation_mwh','fuel_consumed_mmbtu','fuel_consumed_for_electricity_mmbtu','co2_mass_lb','co2_mass_lb_adjusted']\n", + "data_columns = [\n", + " \"net_generation_mwh\",\n", + " \"fuel_consumed_mmbtu\",\n", + " \"fuel_consumed_for_electricity_mmbtu\",\n", + " \"co2_mass_lb\",\n", + " \"co2_mass_lb_adjusted\",\n", + "]\n", "\n", - "#aggregate the plant data up to the BA level\n", - "egrid_ba = egrid_plant.groupby(['ba_code']).sum()[data_columns].reset_index()\n", + "# aggregate the plant data up to the BA level\n", + "egrid_ba = egrid_plant.groupby([\"ba_code\"]).sum()[data_columns].reset_index()\n", "\n", "# divide our calculation by the BA totals from eGRID\n", "# if there are 0 values, replace with 0.1, so that div by zero doesn't return missing value\n", - "ba_metric = plant_annual_total.groupby('ba_code', dropna=False).sum().drop(columns='plant_id_eia').replace(0,0.1).div(egrid_ba.set_index('ba_code').replace(0,0.1)).sort_values(by='co2_mass_lb').round(3)\n", + "ba_metric = (\n", + " plant_annual_total.groupby(\"ba_code\", dropna=False)\n", + " .sum()\n", + " .drop(columns=\"plant_id_eia\")\n", + " .replace(0, 0.1)\n", + " .div(egrid_ba.set_index(\"ba_code\").replace(0, 0.1))\n", + " .sort_values(by=\"co2_mass_lb\")\n", + " .round(3)\n", + ")\n", "\n", - "total = pd.DataFrame(plant_annual_total[data_columns].sum().div(egrid_ba[data_columns].sum()).rename('Total')).T\n", + "total = pd.DataFrame(\n", + " plant_annual_total[data_columns]\n", + " .sum()\n", + " .div(egrid_ba[data_columns].sum())\n", + " .rename(\"Total\")\n", + ").T\n", "\n", "# calculate the difference in the number of plants in each region\n", - "plant_count = (plant_annual_total.groupby('ba_code', dropna=False).count()['plant_id_egrid'] - egrid_plant.groupby('ba_code', dropna=False).count()['plant_id_egrid']).rename('num_plants')\n", - "ba_metric = ba_metric.merge(plant_count, how='left', left_index=True, right_index=True).drop(columns=['plant_id_egrid']).sort_index()\n", + "plant_count = 
(\n", + " plant_annual_total.groupby(\"ba_code\", dropna=False).count()[\"plant_id_egrid\"]\n", + " - egrid_plant.groupby(\"ba_code\", dropna=False).count()[\"plant_id_egrid\"]\n", + ").rename(\"num_plants\")\n", + "ba_metric = (\n", + " ba_metric.merge(plant_count, how=\"left\", left_index=True, right_index=True)\n", + " .drop(columns=[\"plant_id_egrid\"])\n", + " .sort_index()\n", + ")\n", "\n", - "ba_metric = pd.concat([ba_metric, total], axis=0).round(2) \n", + "ba_metric = pd.concat([ba_metric, total], axis=0).round(2)\n", "\n", - "ba_metric = ba_metric[data_columns + ['num_plants']]\n", + "ba_metric = ba_metric[data_columns + [\"num_plants\"]]\n", "\n", - "columns_to_check = ['net_generation_mwh','fuel_consumed_mmbtu','fuel_consumed_for_electricity_mmbtu','co2_mass_lb']\n", + "columns_to_check = [\n", + " \"net_generation_mwh\",\n", + " \"fuel_consumed_mmbtu\",\n", + " \"fuel_consumed_for_electricity_mmbtu\",\n", + " \"co2_mass_lb\",\n", + "]\n", "\n", - "with pd.option_context('display.max_rows', None, 'display.max_columns', None):\n", + "with pd.option_context(\"display.max_rows\", None, \"display.max_columns\", None):\n", " display(ba_metric[~(ba_metric[columns_to_check] == 1).all(axis=1)])" ] }, @@ -832,10 +1206,17 @@ "outputs": [], "source": [ "# how much co2 comes from CEMS vs EIA\n", - "data_source_by_ba = pd.pivot_table(eia923_allocated, values='co2_mass_lb', index='ba_code', columns='data_source', dropna=False, aggfunc=np.sum).replace(0,0.001)\n", + "data_source_by_ba = pd.pivot_table(\n", + " eia923_allocated,\n", + " values=\"co2_mass_lb\",\n", + " index=\"ba_code\",\n", + " columns=\"data_source\",\n", + " dropna=False,\n", + " aggfunc=np.sum,\n", + ").replace(0, 0.001)\n", "data_source_by_ba = data_source_by_ba.div(data_source_by_ba.sum(axis=1), axis=0)\n", - "with pd.option_context('display.max_rows', None, 'display.max_columns', None):\n", - " display(data_source_by_ba.round(3).fillna(0).sort_values(by='cems'))" + "with 
pd.option_context(\"display.max_rows\", None, \"display.max_columns\", None):\n", + " display(data_source_by_ba.round(3).fillna(0).sort_values(by=\"cems\"))" ] }, { @@ -889,12 +1270,12 @@ "outputs": [], "source": [ "# examine specific plants in a category\n", - "value = 'fuel_consumed_mmbtu'\n", - "status = '>50%'\n", + "value = \"fuel_consumed_mmbtu\"\n", + "status = \">50%\"\n", "\n", - "#compared_merged.loc[64877,:]\n", + "# compared_merged.loc[64877,:]\n", "\n", - "compared[compared[f'{value}_status'] == status].sort_values(by=value)" + "compared[compared[f\"{value}_status\"] == status].sort_values(by=value)" ] }, { @@ -903,7 +1284,7 @@ "metadata": {}, "outputs": [], "source": [ - "compared[compared[f'{value}_status'] == status].sort_values(by=value).sample(10)" + "compared[compared[f\"{value}_status\"] == status].sort_values(by=value).sample(10)" ] }, { @@ -921,7 +1302,7 @@ "metadata": {}, "outputs": [], "source": [ - "egrid_plant[egrid_plant['plant_id_eia'] == plant_to_explore]" + "egrid_plant[egrid_plant[\"plant_id_eia\"] == plant_to_explore]" ] }, { @@ -930,7 +1311,7 @@ "metadata": {}, "outputs": [], "source": [ - "plant_annual_total[plant_annual_total['plant_id_eia'] == plant_to_explore]" + "plant_annual_total[plant_annual_total[\"plant_id_eia\"] == plant_to_explore]" ] }, { @@ -939,7 +1320,7 @@ "metadata": {}, "outputs": [], "source": [ - "eia923_allocated[eia923_allocated['plant_id_eia'] == plant_to_explore].sum()" + "eia923_allocated[eia923_allocated[\"plant_id_eia\"] == plant_to_explore].sum()" ] }, { @@ -948,7 +1329,11 @@ "metadata": {}, "outputs": [], "source": [ - "cems_unit_monthly = cems.groupby(['plant_id_eia','emissions_unit_id_epa','report_date']).sum().reset_index()\n" + "cems_unit_monthly = (\n", + " cems.groupby([\"plant_id_eia\", \"emissions_unit_id_epa\", \"report_date\"])\n", + " .sum()\n", + " .reset_index()\n", + ")" ] }, { @@ -957,7 +1342,7 @@ "metadata": {}, "outputs": [], "source": [ - 
"cems_unit_monthly[cems_unit_monthly['plant_id_eia'] == plant_to_explore].sum()" + "cems_unit_monthly[cems_unit_monthly[\"plant_id_eia\"] == plant_to_explore].sum()" ] }, { @@ -966,7 +1351,7 @@ "metadata": {}, "outputs": [], "source": [ - "cems[(cems['plant_id_eia'] == plant_to_explore) & (cems['operating_time_hours'] > 0)]" + "cems[(cems[\"plant_id_eia\"] == plant_to_explore) & (cems[\"operating_time_hours\"] > 0)]" ] }, { @@ -975,7 +1360,7 @@ "metadata": {}, "outputs": [], "source": [ - "cems_plant_annual[cems_plant_annual['plant_id_eia'] == plant_to_explore]" + "cems_plant_annual[cems_plant_annual[\"plant_id_eia\"] == plant_to_explore]" ] }, { @@ -986,7 +1371,7 @@ "source": [ "# there are some plants that report heat input and co2 in CEMS, but are missing net generation data\n", "# TODO: we should maybe try and fill net generation data using EIA-923?\n", - "cems_plant_annual[cems_plant_annual['net_generation_mwh'] == 0]" + "cems_plant_annual[cems_plant_annual[\"net_generation_mwh\"] == 0]" ] }, { @@ -1004,8 +1389,16 @@ "metadata": {}, "outputs": [], "source": [ - "ba_plant_match = plant_annual_total[plant_annual_total['ba_code'].isna()].merge(egrid_plant[egrid_plant['ba_code'].isna()], how='left', on='plant_id_egrid', suffixes=(\"_calc\",'_egrid'))\n", - "ba_plant_match[ba_plant_match['net_generation_mwh_calc'].round(0) != ba_plant_match['net_generation_mwh_egrid'].round(0)]" + "ba_plant_match = plant_annual_total[plant_annual_total[\"ba_code\"].isna()].merge(\n", + " egrid_plant[egrid_plant[\"ba_code\"].isna()],\n", + " how=\"left\",\n", + " on=\"plant_id_egrid\",\n", + " suffixes=(\"_calc\", \"_egrid\"),\n", + ")\n", + "ba_plant_match[\n", + " ba_plant_match[\"net_generation_mwh_calc\"].round(0)\n", + " != ba_plant_match[\"net_generation_mwh_egrid\"].round(0)\n", + "]" ] }, { @@ -1014,7 +1407,7 @@ "metadata": {}, "outputs": [], "source": [ - "egrid_plant[egrid_plant['ba_code'] == 'CPLE']" + "egrid_plant[egrid_plant[\"ba_code\"] == \"CPLE\"]" ] }, { @@ -1023,7 
+1416,9 @@ "metadata": {}, "outputs": [], "source": [ - "compare_plants_in_ba = egrid_plant.merge(plant_annual_total, how='outer', on='plant_id_egrid', suffixes=('_egrid','_calc'))" + "compare_plants_in_ba = egrid_plant.merge(\n", + " plant_annual_total, how=\"outer\", on=\"plant_id_egrid\", suffixes=(\"_egrid\", \"_calc\")\n", + ")" ] }, { @@ -1032,10 +1427,19 @@ "metadata": {}, "outputs": [], "source": [ - "ba = 'CPLE'\n", - "metric = 'fuel_consumed_mmbtu'\n", + "ba = \"CPLE\"\n", + "metric = \"fuel_consumed_mmbtu\"\n", "\n", - "compare_plants_in_ba[((compare_plants_in_ba['ba_code_egrid'] == ba) | (compare_plants_in_ba['ba_code_calc'] == ba)) & (compare_plants_in_ba[f'{metric}_egrid'].round(0) != compare_plants_in_ba[f'{metric}_calc'].round(0))]" + "compare_plants_in_ba[\n", + " (\n", + " (compare_plants_in_ba[\"ba_code_egrid\"] == ba)\n", + " | (compare_plants_in_ba[\"ba_code_calc\"] == ba)\n", + " )\n", + " & (\n", + " compare_plants_in_ba[f\"{metric}_egrid\"].round(0)\n", + " != compare_plants_in_ba[f\"{metric}_calc\"].round(0)\n", + " )\n", + "]" ] }, { @@ -1044,7 +1448,7 @@ "metadata": {}, "outputs": [], "source": [ - "plant_annual_total[plant_annual_total['ba_code'] == 'AMPL']" + "plant_annual_total[plant_annual_total[\"ba_code\"] == \"AMPL\"]" ] } ], diff --git a/notebooks/validation/diff_output_versions.ipynb b/notebooks/validation/diff_output_versions.ipynb index b6c8d200..7ba28be0 100644 --- a/notebooks/validation/diff_output_versions.ipynb +++ b/notebooks/validation/diff_output_versions.ipynb @@ -16,7 +16,8 @@ "\n", "# # Tell python where to look for modules.\n", "import sys\n", - "sys.path.append('../../../open-grid-emissions/src/')\n", + "\n", + "sys.path.append(\"../../../open-grid-emissions/src/\")\n", "\n", "import load_data\n", "from column_checks import get_dtypes\n", @@ -61,30 +62,62 @@ "# unzip archived data\n", "if not os.path.exists(data_folder(\"diff\")):\n", " os.mkdir(data_folder(\"diff\"))\n", - "with 
zipfile.ZipFile(data_folder(f\"s3_upload/{year}_{data_type}_{resolution}_us_units.zip\"), \"r\") as zip_to_unzip:\n", - " zip_to_unzip.extractall(data_folder(f\"diff/{year}_{data_type}_{resolution}_us_units\"))\n", + "with zipfile.ZipFile(\n", + " data_folder(f\"s3_upload/{year}_{data_type}_{resolution}_us_units.zip\"), \"r\"\n", + ") as zip_to_unzip:\n", + " zip_to_unzip.extractall(\n", + " data_folder(f\"diff/{year}_{data_type}_{resolution}_us_units\")\n", + " )\n", "\n", "# load archived data\n", - "prev_data = pd.read_csv(data_folder(f\"diff/{year}_{data_type}_{resolution}_us_units/plant_data.csv\"), dtype=get_dtypes()).round(0)\n", + "prev_data = pd.read_csv(\n", + " data_folder(f\"diff/{year}_{data_type}_{resolution}_us_units/plant_data.csv\"),\n", + " dtype=get_dtypes(),\n", + ").round(0)\n", "\n", "# load new data\n", - "new_data = pd.read_csv(results_folder(f\"{year}/{data_type}/{resolution}/us_units/plant_data.csv\"), dtype=get_dtypes()).round(0)\n", + "new_data = pd.read_csv(\n", + " results_folder(f\"{year}/{data_type}/{resolution}/us_units/plant_data.csv\"),\n", + " dtype=get_dtypes(),\n", + ").round(0)\n", "\n", "# load plant attributes\n", - "plant_attributes = pd.read_csv(outputs_folder(f\"{year}/plant_static_attributes_{year}.csv\"), dtype=get_dtypes())\n", + "plant_attributes = pd.read_csv(\n", + " outputs_folder(f\"{year}/plant_static_attributes_{year}.csv\"), dtype=get_dtypes()\n", + ")\n", "\n", - "prev_data = prev_data.merge(plant_attributes[[\"plant_id_eia\",\"ba_code\",\"fuel_category\"]], how=\"left\", on=\"plant_id_eia\")\n", - "new_data = new_data.merge(plant_attributes[[\"plant_id_eia\",\"ba_code\",\"fuel_category\"]], how=\"left\", on=\"plant_id_eia\")\n", + "prev_data = prev_data.merge(\n", + " plant_attributes[[\"plant_id_eia\", \"ba_code\", \"fuel_category\"]],\n", + " how=\"left\",\n", + " on=\"plant_id_eia\",\n", + ")\n", + "new_data = new_data.merge(\n", + " plant_attributes[[\"plant_id_eia\", \"ba_code\", \"fuel_category\"]],\n", 
+ " how=\"left\",\n", + " on=\"plant_id_eia\",\n", + ")\n", "\n", - "key_cols = [\"plant_id_eia\",\"ba_code\",\"fuel_category\"]\n", - "comparison = prev_data.set_index(key_cols).compare(new_data.set_index(key_cols), result_names=(\"previous\",\"new\"))\n", + "key_cols = [\"plant_id_eia\", \"ba_code\", \"fuel_category\"]\n", + "comparison = prev_data.set_index(key_cols).compare(\n", + " new_data.set_index(key_cols), result_names=(\"previous\", \"new\")\n", + ")\n", "\n", "# get difference\n", - "diff = comparison.groupby(level=0, axis=1).diff().rename(columns={\"new\":\"pct_diff\"}).drop(columns=[\"previous\"], level=1)\n", - "comparison = pd.concat([comparison, diff], axis=1).sort_index(axis=1, level=0, ascending=True, sort_remaining=False)\n", - "comparison.iloc[:, comparison.columns.get_level_values(1)=='pct_diff'] = (comparison.iloc[:, comparison.columns.get_level_values(1)=='pct_diff'].values / comparison.iloc[:, comparison.columns.get_level_values(1)=='previous'].values).round(2)\n", + "diff = (\n", + " comparison.groupby(level=0, axis=1)\n", + " .diff()\n", + " .rename(columns={\"new\": \"pct_diff\"})\n", + " .drop(columns=[\"previous\"], level=1)\n", + ")\n", + "comparison = pd.concat([comparison, diff], axis=1).sort_index(\n", + " axis=1, level=0, ascending=True, sort_remaining=False\n", + ")\n", + "comparison.iloc[:, comparison.columns.get_level_values(1) == \"pct_diff\"] = (\n", + " comparison.iloc[:, comparison.columns.get_level_values(1) == \"pct_diff\"].values\n", + " / comparison.iloc[:, comparison.columns.get_level_values(1) == \"previous\"].values\n", + ").round(2)\n", "\n", - "comparison\n" + "comparison" ] }, { @@ -93,7 +126,9 @@ "metadata": {}, "outputs": [], "source": [ - "comparison[comparison.loc[:,(\"co2_mass_lb_for_electricity\",\"pct_diff\")] > 0.001]#.groupby(\"ba_code\").sum().sum()" + "comparison[\n", + " comparison.loc[:, (\"co2_mass_lb_for_electricity\", \"pct_diff\")] > 0.001\n", + "] # .groupby(\"ba_code\").sum().sum()" ] }, { @@ 
-117,13 +152,20 @@ "# unzip archived data\n", "if not os.path.exists(data_folder(\"diff\")):\n", " os.mkdir(data_folder(\"diff\"))\n", - "with zipfile.ZipFile(data_folder(f\"s3_upload/{year}_{data_type}_{resolution}_us_units.zip\"), \"r\") as zip_to_unzip:\n", - " zip_to_unzip.extractall(data_folder(f\"diff/{year}_{data_type}_{resolution}_us_units\"))\n", + "with zipfile.ZipFile(\n", + " data_folder(f\"s3_upload/{year}_{data_type}_{resolution}_us_units.zip\"), \"r\"\n", + ") as zip_to_unzip:\n", + " zip_to_unzip.extractall(\n", + " data_folder(f\"diff/{year}_{data_type}_{resolution}_us_units\")\n", + " )\n", "\n", "# load archived data\n", "prev_data = []\n", "for ba in os.listdir(data_folder(f\"diff/{year}_{data_type}_{resolution}_us_units\")):\n", - " df = pd.read_csv(data_folder(f\"diff/{year}_{data_type}_{resolution}_us_units/{ba}\"), dtype=get_dtypes())\n", + " df = pd.read_csv(\n", + " data_folder(f\"diff/{year}_{data_type}_{resolution}_us_units/{ba}\"),\n", + " dtype=get_dtypes(),\n", + " )\n", " df[\"ba_code\"] = ba.split(\".\")[0]\n", " prev_data.append(df)\n", "\n", @@ -132,15 +174,20 @@ "# load data\n", "new_data = []\n", "for ba in os.listdir(results_folder(f\"{year}/{data_type}/{resolution}/us_units\")):\n", - " df = pd.read_csv(results_folder(f\"{year}/{data_type}/{resolution}/us_units/{ba}\"), dtype=get_dtypes())\n", + " df = pd.read_csv(\n", + " results_folder(f\"{year}/{data_type}/{resolution}/us_units/{ba}\"),\n", + " dtype=get_dtypes(),\n", + " )\n", " df[\"ba_code\"] = ba.split(\".\")[0]\n", " new_data.append(df)\n", "\n", "new_data = pd.concat(new_data, axis=0).reset_index(drop=True)\n", "\n", "key_cols = [\"ba_code\", \"fuel_category\"]\n", - "comparison = prev_data.set_index(key_cols).compare(new_data.set_index(key_cols), result_names=(\"previous\",\"new\"))\n", - "comparison\n" + "comparison = prev_data.set_index(key_cols).compare(\n", + " new_data.set_index(key_cols), result_names=(\"previous\", \"new\")\n", + ")\n", + "comparison" ] }, { 
@@ -159,22 +206,28 @@ "source": [ "# load archived data\n", "file = \"cems_cleaned\"\n", - "key_cols = [\"plant_id_eia\",\"emissions_unit_id_epa\",\"datetime_utc\"]\n", + "key_cols = [\"plant_id_eia\", \"emissions_unit_id_epa\", \"datetime_utc\"]\n", "\n", "# unzip archived data\n", "if not os.path.exists(data_folder(f\"diff/outputs_{year}\")):\n", " os.mkdir(data_folder(f\"diff/outputs_{year}\"))\n", - " with zipfile.ZipFile(data_folder(f\"zenodo/outputs_{year}.zip\"), \"r\") as zip_to_unzip:\n", + " with zipfile.ZipFile(\n", + " data_folder(f\"zenodo/outputs_{year}.zip\"), \"r\"\n", + " ) as zip_to_unzip:\n", " zip_to_unzip.extractall(data_folder(f\"diff/outputs_{year}\"))\n", "\n", "# load archived data\n", - "prev_data = pd.read_csv(data_folder(f\"diff/outputs_{year}/{file}_{year}.csv\"), dtype=get_dtypes())\n", + "prev_data = pd.read_csv(\n", + " data_folder(f\"diff/outputs_{year}/{file}_{year}.csv\"), dtype=get_dtypes()\n", + ")\n", "\n", "# load new data\n", "new_data = pd.read_csv(outputs_folder(f\"{year}/{file}_{year}.csv\"), dtype=get_dtypes())\n", "\n", - "comparison = prev_data.set_index(key_cols).compare(new_data.set_index(key_cols), result_names=(\"previous\",\"new\"))\n", - "comparison\n" + "comparison = prev_data.set_index(key_cols).compare(\n", + " new_data.set_index(key_cols), result_names=(\"previous\", \"new\")\n", + ")\n", + "comparison" ] } ], diff --git a/notebooks/validation/hourly_validation.ipynb b/notebooks/validation/hourly_validation.ipynb index a52c7a73..c5c1d9d0 100644 --- a/notebooks/validation/hourly_validation.ipynb +++ b/notebooks/validation/hourly_validation.ipynb @@ -35,6 +35,7 @@ "outputs": [], "source": [ "import sys\n", + "\n", "sys.path.append(\"../../src\")\n", "\n", "import filepaths" @@ -56,7 +57,11 @@ "outputs": [], "source": [ "# EIA-930 data after timestamp adjustments but no cleaning\n", - "raw = pd.read_csv(f\"{filepaths.data_folder()}/outputs/2020/eia930/eia930_raw.csv\", index_col=0, parse_dates=True)" + "raw = 
pd.read_csv(\n", + " f\"{filepaths.data_folder()}/outputs/2020/eia930/eia930_raw.csv\",\n", + " index_col=0,\n", + " parse_dates=True,\n", + ")" ] }, { @@ -74,15 +79,15 @@ " ba = ba_f.replace(\".csv\", \"\")\n", " print(ba, end=\"...\")\n", " col_name = GEN_ID.format(ba)\n", - " if col_name not in raw.columns: \n", + " if col_name not in raw.columns:\n", " continue\n", " else:\n", - " dat = pd.read_csv(path+ba_f, parse_dates=[\"datetime_utc\"])\n", - " dat = dat[dat.fuel_category==\"total\"]\n", - " dat = dat.merge(raw[ col_name], left_on=\"datetime_utc\", right_index=True)\n", - " c = dat[[\"net_generation_mwh\", col_name]].corr().to_numpy()[0,1]\n", + " dat = pd.read_csv(path + ba_f, parse_dates=[\"datetime_utc\"])\n", + " dat = dat[dat.fuel_category == \"total\"]\n", + " dat = dat.merge(raw[col_name], left_on=\"datetime_utc\", right_index=True)\n", + " c = dat[[\"net_generation_mwh\", col_name]].corr().to_numpy()[0, 1]\n", " cors[ba] = c\n", - " difs = (dat[col_name]-dat[\"net_generation_mwh\"])/dat[\"net_generation_mwh\"]\n", + " difs = (dat[col_name] - dat[\"net_generation_mwh\"]) / dat[\"net_generation_mwh\"]\n", " difs = difs.replace(np.inf, np.nan)\n", " percent_difs[ba] = difs.median()\n", " annual_gen[ba] = dat[\"net_generation_mwh\"].sum()" @@ -94,11 +99,22 @@ "metadata": {}, "outputs": [], "source": [ - "os.makedirs(f\"{filepaths.data_folder()}/outputs/{year}/validation_metrics/us_units\", exist_ok=True)\n", + "os.makedirs(\n", + " f\"{filepaths.data_folder()}/outputs/{year}/validation_metrics/us_units\",\n", + " exist_ok=True,\n", + ")\n", "\n", - "out = pd.DataFrame(data={\"Difference as percent of hourly-egrid\":percent_difs, \"Correlation\":cors, \"Annual BA generation\":annual_gen})\n", + "out = pd.DataFrame(\n", + " data={\n", + " \"Difference as percent of hourly-egrid\": percent_difs,\n", + " \"Correlation\": cors,\n", + " \"Annual BA generation\": annual_gen,\n", + " }\n", + ")\n", "out = out.sort_values(\"Annual BA generation\", 
ascending=False)\n", - "out.to_csv(f\"{filepaths.data_folder()}/outputs/{year}/validation_metrics/us_units/compare_930_hourlyegrid.csv\")" + "out.to_csv(\n", + " f\"{filepaths.data_folder()}/outputs/{year}/validation_metrics/us_units/compare_930_hourlyegrid.csv\"\n", + ")" ] }, { @@ -116,9 +132,9 @@ "source": [ "ba = \"NYIS\"\n", "col_name = GEN_ID.format(ba)\n", - "dat = pd.read_csv(path+ba+\".csv\", parse_dates=[\"datetime_utc\"])\n", - "dat = dat[dat.fuel_category==\"total\"]\n", - "dat = dat.merge(raw[ col_name], left_on=\"datetime_utc\", right_index=True)\n", + "dat = pd.read_csv(path + ba + \".csv\", parse_dates=[\"datetime_utc\"])\n", + "dat = dat[dat.fuel_category == \"total\"]\n", + "dat = dat.merge(raw[col_name], left_on=\"datetime_utc\", right_index=True)\n", "\n", "px.line(dat, x=\"datetime_utc\", y=[\"net_generation_mwh\", col_name])" ] @@ -136,7 +152,11 @@ "metadata": {}, "outputs": [], "source": [ - "eia930 = pd.read_csv(f\"{filepaths.data_folder()}/outputs/{year}/eia930/eia930_rolling.csv\", parse_dates=True, index_col=0)" + "eia930 = pd.read_csv(\n", + " f\"{filepaths.data_folder()}/outputs/{year}/eia930/eia930_rolling.csv\",\n", + " parse_dates=True,\n", + " index_col=0,\n", + ")" ] }, { @@ -148,21 +168,21 @@ "## Load factors from Singularity API\n", "\n", "# Use last year's egrid because that's all we have in real time\n", - "# TODO: could expand to other pollutants if we use eGRID download \n", - "url = f\"https://api.singularity.energy/v1/emissions/\" \n", - "egrid_year = str(year-1) # use last year as eGRID year\n", + "# TODO: could expand to other pollutants if we use eGRID download\n", + "url = f\"https://api.singularity.energy/v1/emissions/\"\n", + "egrid_year = str(year - 1) # use last year as eGRID year\n", "\n", "headers = {\n", - " 'X-Api-Key': os.environ['SINGULARITY_API_KEY'],\n", + " \"X-Api-Key\": os.environ[\"SINGULARITY_API_KEY\"],\n", "}\n", "\n", "factors = {}\n", "\n", - "for adjustment in [\"adjusted\", \"unadjusted\"]: \n", + 
"for adjustment in [\"adjusted\", \"unadjusted\"]:\n", " adjusted = adjustment == \"adjusted\"\n", " key = f\"EGRID_{egrid_year}\" if adjusted else f\"EGRID_u{egrid_year}\"\n", - " response = requests.request(\"GET\", url+key, headers=headers)\n", - " factors[adjustment] = json.loads(response.content)[\"data\"]\n" + " response = requests.request(\"GET\", url + key, headers=headers)\n", + " factors[adjustment] = json.loads(response.content)[\"data\"]" ] }, { @@ -171,7 +191,7 @@ "metadata": {}, "outputs": [], "source": [ - "# Default factors: coal factor is missing in FPC, PACW; so need national factor \n", + "# Default factors: coal factor is missing in FPC, PACW; so need national factor\n", "default_factors = {}\n", "default_factors[\"adjusted\"] = {}\n", "default_factors[\"unadjusted\"] = {}\n", @@ -195,13 +215,13 @@ "outputs": [], "source": [ "EIA_REGIONS = {\n", - " 'BPAT',\n", - " 'CISO',\n", - " 'ISNE',\n", - " 'MISO',\n", - " 'NYIS',\n", - " 'PJM',\n", - " 'SWPP',\n", + " \"BPAT\",\n", + " \"CISO\",\n", + " \"ISNE\",\n", + " \"MISO\",\n", + " \"NYIS\",\n", + " \"PJM\",\n", + " \"SWPP\",\n", "}" ] }, @@ -211,18 +231,23 @@ "metadata": {}, "outputs": [], "source": [ - "## For each BA, use singularity factors to calculate emission rate \n", - "bas_to_calc = [ba.replace(\".csv\", \"\") for ba in os.listdir(f\"{filepaths.results_folder()}/2020/power_sector_data/hourly/us_units/\")]\n", + "## For each BA, use singularity factors to calculate emission rate\n", + "bas_to_calc = [\n", + " ba.replace(\".csv\", \"\")\n", + " for ba in os.listdir(\n", + " f\"{filepaths.results_folder()}/2020/power_sector_data/hourly/us_units/\"\n", + " )\n", + "]\n", "\n", "fuel_categories = {\n", - " \"coal\":\"COL\",\n", - " \"natural_gas\":\"NG\",\n", - " \"other\":\"OTH\",\n", - " \"hydro\":\"WAT\",\n", - " \"wind\":\"WND\",\n", - " \"solar\":\"SUN\",\n", - " \"nuclear\":\"NUC\",\n", - " \"petroleum\":\"OIL\"\n", + " \"coal\": \"COL\",\n", + " \"natural_gas\": \"NG\",\n", + " 
\"other\": \"OTH\",\n", + " \"hydro\": \"WAT\",\n", + " \"wind\": \"WND\",\n", + " \"solar\": \"SUN\",\n", + " \"nuclear\": \"NUC\",\n", + " \"petroleum\": \"OIL\",\n", "}\n", "\n", "for ba in bas_to_calc:\n", @@ -231,24 +256,41 @@ " print(f\"missing ba {singularity_ba}\")\n", " continue\n", "\n", - " out = pd.DataFrame(index=eia930.index, columns=[\"adjusted_carbon\",\"unajusted_carbon\", \"adjusted_rate\", \"unadjusted_rate\"])\n", + " out = pd.DataFrame(\n", + " index=eia930.index,\n", + " columns=[\n", + " \"adjusted_carbon\",\n", + " \"unajusted_carbon\",\n", + " \"adjusted_rate\",\n", + " \"unadjusted_rate\",\n", + " ],\n", + " )\n", "\n", " for adjustment in [\"adjusted\", \"unadjusted\"]:\n", " s_fuels = list(factors[adjustment][singularity_ba].keys())\n", - " s_factors = [factors[adjustment][singularity_ba][f]['value'] for f in s_fuels]\n", - " # Add default factors for missing fuel types \n", + " s_factors = [factors[adjustment][singularity_ba][f][\"value\"] for f in s_fuels]\n", + " # Add default factors for missing fuel types\n", " for f in default_factors[adjustment].keys():\n", - " if f not in s_fuels: \n", + " if f not in s_fuels:\n", " s_fuels.append(f)\n", " s_factors.append(default_factors[adjustment][f])\n", " fuels = [fuel_categories[f] for f in s_fuels]\n", " generation_labels = [f\"EBA.{ba}-ALL.NG.{f}.H\" for f in fuels]\n", "\n", - " out.loc[:,f\"{adjustment}_carbon\"] = eia930[generation_labels].mul(s_factors, axis='columns').sum(axis='columns')\n", - " out.loc[:,f\"{adjustment}_rate\"] = out.loc[:,f\"{adjustment}_carbon\"] / eia930.loc[:,f\"EBA.{ba}-ALL.NG.H\"]\n", + " out.loc[:, f\"{adjustment}_carbon\"] = (\n", + " eia930[generation_labels].mul(s_factors, axis=\"columns\").sum(axis=\"columns\")\n", + " )\n", + " out.loc[:, f\"{adjustment}_rate\"] = (\n", + " out.loc[:, f\"{adjustment}_carbon\"] / eia930.loc[:, f\"EBA.{ba}-ALL.NG.H\"]\n", + " )\n", "\n", - " 
os.makedirs(f\"{filepaths.data_folder()}/outputs/{year}/validation/real_time_rate/\", exist_ok=True)\n", - " out.to_csv(f\"{filepaths.data_folder()}/outputs/{year}/validation/real_time_rate/{ba}.csv\")\n" + " os.makedirs(\n", + " f\"{filepaths.data_folder()}/outputs/{year}/validation/real_time_rate/\",\n", + " exist_ok=True,\n", + " )\n", + " out.to_csv(\n", + " f\"{filepaths.data_folder()}/outputs/{year}/validation/real_time_rate/{ba}.csv\"\n", + " )" ] }, { @@ -266,8 +308,12 @@ "metadata": {}, "outputs": [], "source": [ - "gen_path = f\"{filepaths.data_folder()}/results/{year}/power_sector_data/hourly/us_units/\"\n", - "consumed_path = f\"{filepaths.data_folder()}/results/{year}/carbon_accounting/hourly/us_units/\"" + "gen_path = (\n", + " f\"{filepaths.data_folder()}/results/{year}/power_sector_data/hourly/us_units/\"\n", + ")\n", + "consumed_path = (\n", + " f\"{filepaths.data_folder()}/results/{year}/carbon_accounting/hourly/us_units/\"\n", + ")" ] }, { @@ -290,39 +336,60 @@ "med_rate = {}\n", "cors = {}\n", "max_difs = {}\n", - "for ba in os.listdir(f\"{filepaths.data_folder()}/outputs/{year}/validation/real_time_rate/\"):\n", - " if ba == \".DS_Store\": # just some os stuff\n", - " continue \n", + "for ba in os.listdir(\n", + " f\"{filepaths.data_folder()}/outputs/{year}/validation/real_time_rate/\"\n", + "):\n", + " if ba == \".DS_Store\": # just some os stuff\n", + " continue\n", " ba = ba.replace(\".csv\", \"\")\n", - " singularity_dat = pd.read_csv(f\"{filepaths.data_folder()}/outputs/{year}/validation/real_time_rate/{ba}.csv\", index_col=0, parse_dates=True)\n", + " singularity_dat = pd.read_csv(\n", + " f\"{filepaths.data_folder()}/outputs/{year}/validation/real_time_rate/{ba}.csv\",\n", + " index_col=0,\n", + " parse_dates=True,\n", + " )\n", " # hourly_consumed = pd.read_csv(consumed_path+ba+\".csv\",\n", - " # usecols=[\"datetime_utc\", \"consumed_co2_rate_lb_per_mwh_for_electricity\", \"consumed_co2_rate_lb_per_mwh_adjusted\"], \n", + " # 
usecols=[\"datetime_utc\", \"consumed_co2_rate_lb_per_mwh_for_electricity\", \"consumed_co2_rate_lb_per_mwh_adjusted\"],\n", " # index_col=\"datetime_utc\", parse_dates=True)\n", - " hourly_generated = pd.read_csv(gen_path+ba+\".csv\", \n", - " usecols=[\"datetime_utc\", \"generated_co2_rate_lb_per_mwh_for_electricity\", \"generated_co2_rate_lb_per_mwh_for_electricity_adjusted\", \"co2_mass_lb\", \"fuel_category\"], \n", - " index_col=\"datetime_utc\", parse_dates=True)\n", - " hourly_generated = hourly_generated.loc[hourly_generated.fuel_category==\"total\"]\n", + " hourly_generated = pd.read_csv(\n", + " gen_path + ba + \".csv\",\n", + " usecols=[\n", + " \"datetime_utc\",\n", + " \"generated_co2_rate_lb_per_mwh_for_electricity\",\n", + " \"generated_co2_rate_lb_per_mwh_for_electricity_adjusted\",\n", + " \"co2_mass_lb\",\n", + " \"fuel_category\",\n", + " ],\n", + " index_col=\"datetime_utc\",\n", + " parse_dates=True,\n", + " )\n", + " hourly_generated = hourly_generated.loc[hourly_generated.fuel_category == \"total\"]\n", " hourly_generated = hourly_generated.sort_index()\n", - " all_dat = pd.concat([singularity_dat, hourly_generated], axis='columns')\n", + " all_dat = pd.concat([singularity_dat, hourly_generated], axis=\"columns\")\n", "\n", " dat_key = \"generated_co2_rate_lb_per_mwh_for_electricity_adjusted\"\n", "\n", " # Patch fix for PJM, see https://github.com/singularity-energy/open-grid-emissions/issues/230\n", - " if ba==\"PJM\":\n", + " if ba == \"PJM\":\n", " all_dat.loc[all_dat[dat_key] < 100, dat_key] = np.nan\n", " all_dat = all_dat[\"2020-02-01T00:00\":]\n", "\n", - " # Patch fix for FPL real-time issue not caught by rolling filter \n", - " if ba==\"FPL\":\n", + " # Patch fix for FPL real-time issue not caught by rolling filter\n", + " if ba == \"FPL\":\n", " all_dat.loc[all_dat[\"adjusted_rate\"] > 5000, \"adjusted_rate\"] = np.nan\n", "\n", - "\n", " all_dat = all_dat.sort_index()\n", - " cors[ba] = all_dat[[dat_key, 
\"adjusted_rate\"]].corr().to_numpy()[0,1]\n", - " percent_difs[ba] = ((all_dat[\"adjusted_rate\"] - all_dat[dat_key])/all_dat[dat_key]).median()\n", - " max_difs[ba] = ((all_dat[\"adjusted_rate\"] - all_dat[dat_key])/all_dat[dat_key]).abs().replace(1.0, np.nan).max()\n", - " abs_difs[ba] = ((all_dat[\"adjusted_rate\"] - all_dat[dat_key])).median()\n", - " med_rate[ba] = all_dat[\"adjusted_rate\"].median()\n" + " cors[ba] = all_dat[[dat_key, \"adjusted_rate\"]].corr().to_numpy()[0, 1]\n", + " percent_difs[ba] = (\n", + " (all_dat[\"adjusted_rate\"] - all_dat[dat_key]) / all_dat[dat_key]\n", + " ).median()\n", + " max_difs[ba] = (\n", + " ((all_dat[\"adjusted_rate\"] - all_dat[dat_key]) / all_dat[dat_key])\n", + " .abs()\n", + " .replace(1.0, np.nan)\n", + " .max()\n", + " )\n", + " abs_difs[ba] = (all_dat[\"adjusted_rate\"] - all_dat[dat_key]).median()\n", + " med_rate[ba] = all_dat[\"adjusted_rate\"].median()" ] }, { @@ -331,35 +398,48 @@ "metadata": {}, "outputs": [], "source": [ - "out = pd.DataFrame(data={\n", - " \"Median rate difference\":abs_difs,\n", - " \"Difference as percent of OGE\":percent_difs,\n", - " \"Correlation\":cors, \n", - " \"Annual BA generation\":annual_gen,\n", - " \"Median rate\":med_rate,\n", - " })\n", + "out = pd.DataFrame(\n", + " data={\n", + " \"Median rate difference\": abs_difs,\n", + " \"Difference as percent of OGE\": percent_difs,\n", + " \"Correlation\": cors,\n", + " \"Annual BA generation\": annual_gen,\n", + " \"Median rate\": med_rate,\n", + " }\n", + ")\n", "out = out.sort_values(\"Annual BA generation\", ascending=False)\n", "\n", - "# Exclude BAs for which we couldn't calculate a real-time rate \n", - "todrop = [b for b in out.index if (b not in factors[\"adjusted\"].keys()) and (\"EIA.\"+b not in factors[\"adjusted\"].keys())]\n", - "print(f\"dropping {todrop} because they aren't included in Singularity's emission rate API\")\n", + "# Exclude BAs for which we couldn't calculate a real-time rate\n", + "todrop = [\n", + 
" b\n", + " for b in out.index\n", + " if (b not in factors[\"adjusted\"].keys())\n", + " and (\"EIA.\" + b not in factors[\"adjusted\"].keys())\n", + "]\n", + "print(\n", + " f\"dropping {todrop} because they aren't included in Singularity's emission rate API\"\n", + ")\n", "out = out.drop(labels=todrop)\n", "# exclude BAs for which rate is always zero (Hydro-only BAs)\n", "zero_rates = []\n", - "for ba in out.index: \n", - " if (out.loc[ba, \"Median rate\"] == 0) and (out.loc[ba, \"Median rate difference\"] == 0):\n", + "for ba in out.index:\n", + " if (out.loc[ba, \"Median rate\"] == 0) and (\n", + " out.loc[ba, \"Median rate difference\"] == 0\n", + " ):\n", " zero_rates.append(ba)\n", "print(f\"Note {zero_rates} have zero rates in OGE data\")\n", - "#out = out.drop(labels=todrop)\n", + "# out = out.drop(labels=todrop)\n", "# exclude BAs with zero net gen according to our data\n", "zero_gen = []\n", - "for ba in out.index: \n", - " if (out.loc[ba, \"Annual BA generation\"] == 0):\n", + "for ba in out.index:\n", + " if out.loc[ba, \"Annual BA generation\"] == 0:\n", " zero_gen.append(ba)\n", "print(f\"Dropping {zero_gen} because they have zero generation in OGE data\")\n", "out = out.drop(labels=zero_gen)\n", "\n", - "out.to_csv(f\"{filepaths.data_folder()}/outputs/{year}/validation_metrics/us_units/compare_real_time_rates.csv\")" + "out.to_csv(\n", + " f\"{filepaths.data_folder()}/outputs/{year}/validation_metrics/us_units/compare_real_time_rates.csv\"\n", + ")" ] }, { @@ -377,11 +457,15 @@ "metadata": {}, "outputs": [], "source": [ - "out_tbl = out.copy()#.round(2)\n", - "out_tbl[\"Annual BA generation\"] = out_tbl[\"Annual BA generation\"]/1000000 # convert to millions\n", - "out_tbl[\"Difference as percent of OGE\"] = out_tbl[\"Difference as percent of OGE\"]*100 # convert to %\n", + "out_tbl = out.copy() # .round(2)\n", + "out_tbl[\"Annual BA generation\"] = (\n", + " out_tbl[\"Annual BA generation\"] / 1000000\n", + ") # convert to millions\n", + 
"out_tbl[\"Difference as percent of OGE\"] = (\n", + " out_tbl[\"Difference as percent of OGE\"] * 100\n", + ") # convert to %\n", "out_tbl = out_tbl.round(2)\n", - "for line in out_tbl.to_markdown().split(\"/n\"): \n", + "for line in out_tbl.to_markdown().split(\"/n\"):\n", " print(line)" ] }, @@ -411,47 +495,80 @@ "ba_of_interest = \"BPAT\"\n", "\n", "\n", - "\n", - "real_time = pd.read_csv(f\"{filepaths.data_folder()}/outputs/{year}/validation/real_time_rate/{ba_of_interest}.csv\", index_col=0, parse_dates=True)\n", + "real_time = pd.read_csv(\n", + " f\"{filepaths.data_folder()}/outputs/{year}/validation/real_time_rate/{ba_of_interest}.csv\",\n", + " index_col=0,\n", + " parse_dates=True,\n", + ")\n", "real_time = real_time[\"2020-01-01T00:00\":]\n", "if ba_of_interest == \"NYIS\":\n", - " # NYIS has a hole in the EIA data that's not there in ISO data: fill it \n", - " nyis_hole = pd.Series(data=[313, 287.79, 262.215], index=[\"2020-03-30T01:00+00\", \"2020-03-30T02:00+00\", \"2020-03-30T03:00+00\"])\n", + " # NYIS has a hole in the EIA data that's not there in ISO data: fill it\n", + " nyis_hole = pd.Series(\n", + " data=[313, 287.79, 262.215],\n", + " index=[\"2020-03-30T01:00+00\", \"2020-03-30T02:00+00\", \"2020-03-30T03:00+00\"],\n", + " )\n", " real_time.loc[nyis_hole.index, \"adjusted_rate\"] = nyis_hole\n", "\n", - "hourly_consumed = pd.read_csv(consumed_path+ba_of_interest+\".csv\",\n", - " usecols=[\"datetime_utc\", \"consumed_co2_rate_lb_per_mwh_for_electricity\", \"consumed_co2_rate_lb_per_mwh_for_electricity_adjusted\"], \n", - " index_col=\"datetime_utc\", parse_dates=True)\n", - "hourly_generated = pd.read_csv(gen_path+ba_of_interest+\".csv\", \n", - " usecols=[\"datetime_utc\", \"generated_co2_rate_lb_per_mwh_for_electricity\", \"generated_co2_rate_lb_per_mwh_for_electricity_adjusted\", \"co2_mass_lb\", \"fuel_category\"], \n", - " index_col=\"datetime_utc\", parse_dates=True)\n", + "hourly_consumed = pd.read_csv(\n", + " consumed_path + 
ba_of_interest + \".csv\",\n", + " usecols=[\n", + " \"datetime_utc\",\n", + " \"consumed_co2_rate_lb_per_mwh_for_electricity\",\n", + " \"consumed_co2_rate_lb_per_mwh_for_electricity_adjusted\",\n", + " ],\n", + " index_col=\"datetime_utc\",\n", + " parse_dates=True,\n", + ")\n", + "hourly_generated = pd.read_csv(\n", + " gen_path + ba_of_interest + \".csv\",\n", + " usecols=[\n", + " \"datetime_utc\",\n", + " \"generated_co2_rate_lb_per_mwh_for_electricity\",\n", + " \"generated_co2_rate_lb_per_mwh_for_electricity_adjusted\",\n", + " \"co2_mass_lb\",\n", + " \"fuel_category\",\n", + " ],\n", + " index_col=\"datetime_utc\",\n", + " parse_dates=True,\n", + ")\n", "\n", - "all_dat = pd.concat([real_time, hourly_consumed, hourly_generated.loc[hourly_generated.fuel_category==\"total\"]], axis='columns')\n", + "all_dat = pd.concat(\n", + " [\n", + " real_time,\n", + " hourly_consumed,\n", + " hourly_generated.loc[hourly_generated.fuel_category == \"total\"],\n", + " ],\n", + " axis=\"columns\",\n", + ")\n", "all_dat = all_dat.sort_index()\n", "\n", - "all_dat[\"percent_difs\"] = (all_dat[\"adjusted_rate\"] - all_dat[\"generated_co2_rate_lb_per_mwh_for_electricity_adjusted\"])/all_dat[\"generated_co2_rate_lb_per_mwh_for_electricity_adjusted\"]\n", + "all_dat[\"percent_difs\"] = (\n", + " all_dat[\"adjusted_rate\"]\n", + " - all_dat[\"generated_co2_rate_lb_per_mwh_for_electricity_adjusted\"]\n", + ") / all_dat[\"generated_co2_rate_lb_per_mwh_for_electricity_adjusted\"]\n", "\n", - "#all_dat = all_dat.loc[parse_dt(\"2020-07-19T00:00+00\"):parse_dt(\"2020-08-06T00:00+00\")]\n", - "#all_dat = all_dat.loc[parse_dt(\"2020-02-10T00:00+00\"):parse_dt(\"2020-02-28T00:00+00\")]\n", + "# all_dat = all_dat.loc[parse_dt(\"2020-07-19T00:00+00\"):parse_dt(\"2020-08-06T00:00+00\")]\n", + "# all_dat = all_dat.loc[parse_dt(\"2020-02-10T00:00+00\"):parse_dt(\"2020-02-28T00:00+00\")]\n", "\n", - "fig = px.line(all_dat, x=all_dat.index, 
y=[\"generated_co2_rate_lb_per_mwh_for_electricity_adjusted\", \"adjusted_rate\"], \n", + "fig = px.line(\n", + " all_dat,\n", + " x=all_dat.index,\n", + " y=[\"generated_co2_rate_lb_per_mwh_for_electricity_adjusted\", \"adjusted_rate\"],\n", " title=f\"Real time accuracy in {ba_of_interest}\",\n", - " labels={\n", - " \"value\":\"CO2 emission rate (lb/mwh)\",\n", - " \"index\":\"Hour (UTC)\"\n", - " }, \n", - " template='plotly_white',\n", + " labels={\"value\": \"CO2 emission rate (lb/mwh)\", \"index\": \"Hour (UTC)\"},\n", + " template=\"plotly_white\",\n", ")\n", "\n", "newnames = {\n", - " 'generated_co2_rate_lb_per_mwh_for_electricity_adjusted': 'Historical benchmark',\n", - " 'adjusted_rate': 'Real-time data'}\n", - "fig.for_each_trace(lambda t: t.update(name = newnames[t.name]))\n", - "fig.update_layout(legend_title_text='')\n", + " \"generated_co2_rate_lb_per_mwh_for_electricity_adjusted\": \"Historical benchmark\",\n", + " \"adjusted_rate\": \"Real-time data\",\n", + "}\n", + "fig.for_each_trace(lambda t: t.update(name=newnames[t.name]))\n", + "fig.update_layout(legend_title_text=\"\")\n", "fig.show()\n", "\n", "os.makedirs(f\"{filepaths.data_folder()}/outputs/viz/\", exist_ok=True)\n", - "#pio.write_image(fig, f\"{filepaths.data_folder()}/outputs/viz/{ba_of_interest}_aug_sm.jpg\", width=1000*(2/3), height=500*(2/3), scale=3)" + "# pio.write_image(fig, f\"{filepaths.data_folder()}/outputs/viz/{ba_of_interest}_aug_sm.jpg\", width=1000*(2/3), height=500*(2/3), scale=3)" ] }, { @@ -469,34 +586,60 @@ "metadata": {}, "outputs": [], "source": [ - "### Plot natural gas emission rate: does this explain larger gap in summer? 
\n", + "### Plot natural gas emission rate: does this explain larger gap in summer?\n", "\n", - "hourly_rate = pd.read_csv(gen_path+ba_of_interest+\".csv\", \n", - " usecols=[\"datetime_utc\", \"generated_co2_rate_lb_per_mwh_for_electricity\", \"generated_co2_rate_lb_per_mwh_for_electricity_adjusted\", \"co2_mass_lb\", \"fuel_category\"], \n", - " index_col=\"datetime_utc\", parse_dates=True)\n", + "hourly_rate = pd.read_csv(\n", + " gen_path + ba_of_interest + \".csv\",\n", + " usecols=[\n", + " \"datetime_utc\",\n", + " \"generated_co2_rate_lb_per_mwh_for_electricity\",\n", + " \"generated_co2_rate_lb_per_mwh_for_electricity_adjusted\",\n", + " \"co2_mass_lb\",\n", + " \"fuel_category\",\n", + " ],\n", + " index_col=\"datetime_utc\",\n", + " parse_dates=True,\n", + ")\n", "hourly_rate = hourly_rate[hourly_rate.fuel_category == \"natural_gas\"]\n", "\n", "fig = go.Figure()\n", - "fig.add_trace(go.Scatter(x=hourly_rate.index, y=hourly_rate[\"generated_co2_rate_lb_per_mwh_for_electricity_adjusted\"], name=\"Hourly emission rate\"))\n", - "fig.add_trace(go.Scatter(x=[parse_dt(\"2020-01-01T00:00\"), parse_dt(\"2021-01-01T00:00\")], \n", - " y=[factors[\"adjusted\"][\"EIA.\"+ba_of_interest][\"natural_gas\"][\"value\"], factors[\"adjusted\"][\"EIA.\"+ba_of_interest][\"natural_gas\"][\"value\"]], \n", - " name=\"eGRID annual emission rate\", mode=\"lines\"\n", - "))\n", + "fig.add_trace(\n", + " go.Scatter(\n", + " x=hourly_rate.index,\n", + " y=hourly_rate[\"generated_co2_rate_lb_per_mwh_for_electricity_adjusted\"],\n", + " name=\"Hourly emission rate\",\n", + " )\n", + ")\n", + "fig.add_trace(\n", + " go.Scatter(\n", + " x=[parse_dt(\"2020-01-01T00:00\"), parse_dt(\"2021-01-01T00:00\")],\n", + " y=[\n", + " factors[\"adjusted\"][\"EIA.\" + ba_of_interest][\"natural_gas\"][\"value\"],\n", + " factors[\"adjusted\"][\"EIA.\" + ba_of_interest][\"natural_gas\"][\"value\"],\n", + " ],\n", + " name=\"eGRID annual emission rate\",\n", + " mode=\"lines\",\n", + " )\n", + ")\n", 
"\n", "fig.update_xaxes(range=(parse_dt(\"2020-01-01T00:00\"), parse_dt(\"2021-01-01T00:00\")))\n", - "fig.update_layout(template=\"plotly_white\", title=f\"Natural gas emission rates in {ba_of_interest}O\",\n", - "legend=dict(\n", - " yanchor=\"top\",\n", - " y=0.99,\n", - " xanchor=\"left\",\n", - " x=0.01\n", - "))\n", + "fig.update_layout(\n", + " template=\"plotly_white\",\n", + " title=f\"Natural gas emission rates in {ba_of_interest}O\",\n", + " legend=dict(yanchor=\"top\", y=0.99, xanchor=\"left\", x=0.01),\n", + ")\n", "\n", - "fig.update_yaxes(title_text='Natural gas emission rate
(lb CO2/MWh)')\n", + "fig.update_yaxes(title_text=\"Natural gas emission rate
(lb CO2/MWh)\")\n", "\n", "fig.show()\n", "\n", - "pio.write_image(fig, f\"{filepaths.data_folder()}/outputs/viz/gas_rate_{ba_of_interest}.jpg\", width=1000*(4/5), height=500*(4/5), scale=3)" + "pio.write_image(\n", + " fig,\n", + " f\"{filepaths.data_folder()}/outputs/viz/gas_rate_{ba_of_interest}.jpg\",\n", + " width=1000 * (4 / 5),\n", + " height=500 * (4 / 5),\n", + " scale=3,\n", + ")" ] }, { @@ -505,21 +648,28 @@ "metadata": {}, "outputs": [], "source": [ - "oge_generation = pd.read_csv(gen_path+ba_of_interest+\".csv\", \n", - " usecols=[\"datetime_utc\", \"fuel_category\", \"net_generation_mwh\"], \n", - " index_col=\"datetime_utc\", parse_dates=True)\n", - "oge_generation = oge_generation.pivot(columns=\"fuel_category\", values=\"net_generation_mwh\")\n", + "oge_generation = pd.read_csv(\n", + " gen_path + ba_of_interest + \".csv\",\n", + " usecols=[\"datetime_utc\", \"fuel_category\", \"net_generation_mwh\"],\n", + " index_col=\"datetime_utc\",\n", + " parse_dates=True,\n", + ")\n", + "oge_generation = oge_generation.pivot(\n", + " columns=\"fuel_category\", values=\"net_generation_mwh\"\n", + ")\n", "\n", - "# plot real-time and OGE per-fuel generation in FPC to identify source of neg correlation \n", - "eiacols = [f'EBA.{ba_of_interest}-ALL.NG.COL.H',\n", - " f'EBA.{ba_of_interest}-ALL.NG.NG.H',\n", - " f'EBA.{ba_of_interest}-ALL.NG.NUC.H',\n", - " f'EBA.{ba_of_interest}-ALL.NG.OIL.H',\n", - " f'EBA.{ba_of_interest}-ALL.NG.OTH.H',\n", - " f'EBA.{ba_of_interest}-ALL.NG.SUN.H',\n", - " f'EBA.{ba_of_interest}-ALL.NG.UNK.H',\n", - " f'EBA.{ba_of_interest}-ALL.NG.WAT.H',\n", - " f'EBA.{ba_of_interest}-ALL.NG.WND.H']\n", + "# plot real-time and OGE per-fuel generation in FPC to identify source of neg correlation\n", + "eiacols = [\n", + " f\"EBA.{ba_of_interest}-ALL.NG.COL.H\",\n", + " f\"EBA.{ba_of_interest}-ALL.NG.NG.H\",\n", + " f\"EBA.{ba_of_interest}-ALL.NG.NUC.H\",\n", + " f\"EBA.{ba_of_interest}-ALL.NG.OIL.H\",\n", + " 
f\"EBA.{ba_of_interest}-ALL.NG.OTH.H\",\n", + " f\"EBA.{ba_of_interest}-ALL.NG.SUN.H\",\n", + " f\"EBA.{ba_of_interest}-ALL.NG.UNK.H\",\n", + " f\"EBA.{ba_of_interest}-ALL.NG.WAT.H\",\n", + " f\"EBA.{ba_of_interest}-ALL.NG.WND.H\",\n", + "]\n", "\n", "toplot = pd.concat([eia930[eiacols], oge_generation])" ] @@ -539,29 +689,29 @@ "metadata": {}, "outputs": [], "source": [ - "# plot real-time and OGE per-fuel generation in FPC to identify source of neg correlation \n", + "# plot real-time and OGE per-fuel generation in FPC to identify source of neg correlation\n", "plotcols = [\n", - " #f'EBA.{ba_of_interest}-ALL.NG.COL.H',\n", - " #f'EBA.{ba_of_interest}-ALL.NG.NG.H',\n", - " #f'EBA.{ba_of_interest}-ALL.NG.NUC.H',\n", - " #f'EBA.{ba_of_interest}-ALL.NG.OIL.H',\n", - " f'EBA.{ba_of_interest}-ALL.NG.OTH.H',\n", - " #f'EBA.{ba_of_interest}-ALL.NG.SUN.H',\n", - " f'EBA.{ba_of_interest}-ALL.NG.UNK.H',\n", - " f'EBA.{ba_of_interest}-ALL.NG.WAT.H',\n", - " #f'EBA.{ba_of_interest}-ALL.NG.WND.H',\n", - " #\"biomass\",\n", - " #\"natural_gas\",\n", - " #\"petroleum\",\n", - " #\"solar\",\n", - " #\"total\",\n", - " #\"waste\",\n", - " #\"geothermal\", \n", - " \"hydro\",\n", - " #\"wind\",\n", - " ]\n", + " # f'EBA.{ba_of_interest}-ALL.NG.COL.H',\n", + " # f'EBA.{ba_of_interest}-ALL.NG.NG.H',\n", + " # f'EBA.{ba_of_interest}-ALL.NG.NUC.H',\n", + " # f'EBA.{ba_of_interest}-ALL.NG.OIL.H',\n", + " f\"EBA.{ba_of_interest}-ALL.NG.OTH.H\",\n", + " # f'EBA.{ba_of_interest}-ALL.NG.SUN.H',\n", + " f\"EBA.{ba_of_interest}-ALL.NG.UNK.H\",\n", + " f\"EBA.{ba_of_interest}-ALL.NG.WAT.H\",\n", + " # f'EBA.{ba_of_interest}-ALL.NG.WND.H',\n", + " # \"biomass\",\n", + " # \"natural_gas\",\n", + " # \"petroleum\",\n", + " # \"solar\",\n", + " # \"total\",\n", + " # \"waste\",\n", + " # \"geothermal\",\n", + " \"hydro\",\n", + " # \"wind\",\n", + "]\n", "\n", - "px.line(toplot[plotcols])\n" + "px.line(toplot[plotcols])" ] }, { @@ -570,7 +720,7 @@ "metadata": {}, "outputs": [], "source": [ - "# 
What plants " + "# What plants" ] }, { @@ -579,7 +729,11 @@ "metadata": {}, "outputs": [], "source": [ - "px.histogram(all_dat, x=\"percent_difs\", title=\"NYIS hourly difference between benchmark and real-time
as percent of benchmark \")" + "px.histogram(\n", + " all_dat,\n", + " x=\"percent_difs\",\n", + " title=\"NYIS hourly difference between benchmark and real-time
as percent of benchmark \",\n", + ")" ] }, { @@ -605,7 +759,7 @@ "metadata": {}, "outputs": [], "source": [ - "(55539223793.10689 - 57691924000)/57691924000" + "(55539223793.10689 - 57691924000) / 57691924000" ] }, { @@ -633,7 +787,13 @@ "metadata": {}, "outputs": [], "source": [ - "px.scatter(out, x=\"Difference as percent of OGE\", y=\"Correlation\", size=\"Annual BA generation\", template=\"plotly_white\")#, text=out.index)" + "px.scatter(\n", + " out,\n", + " x=\"Difference as percent of OGE\",\n", + " y=\"Correlation\",\n", + " size=\"Annual BA generation\",\n", + " template=\"plotly_white\",\n", + ") # , text=out.index)" ] }, { @@ -642,17 +802,38 @@ "metadata": {}, "outputs": [], "source": [ - "#fig = px.scatter(out, x=\"Annual BA generation\", y=\"Correlation\", template=\"plotly_white\")#, text=out.index)\n", + "# fig = px.scatter(out, x=\"Annual BA generation\", y=\"Correlation\", template=\"plotly_white\")#, text=out.index)\n", "fig = go.Figure()\n", "\n", - "fig.add_trace(go.Scatter(y=[-3000000,805000000], x=[1,1], line={\"width\":2, \"color\":\"lightslategrey\"}, mode=\"lines\"))\n", - "fig.add_trace( go.Scatter(y=out[\"Annual BA generation\"], x=out[\"Correlation\"], text=out.index, mode=\"markers\", marker={\"color\":\"rgb(17, 119, 51)\"})) #, color=\"Median rate\")#, text=out.index)\n", - "fig.update_yaxes(range=(-3000000,805000000))\n", + "fig.add_trace(\n", + " go.Scatter(\n", + " y=[-3000000, 805000000],\n", + " x=[1, 1],\n", + " line={\"width\": 2, \"color\": \"lightslategrey\"},\n", + " mode=\"lines\",\n", + " )\n", + ")\n", + "fig.add_trace(\n", + " go.Scatter(\n", + " y=out[\"Annual BA generation\"],\n", + " x=out[\"Correlation\"],\n", + " text=out.index,\n", + " mode=\"markers\",\n", + " marker={\"color\": \"rgb(17, 119, 51)\"},\n", + " )\n", + ") # , color=\"Median rate\")#, text=out.index)\n", + "fig.update_yaxes(range=(-3000000, 805000000))\n", "fig.update_layout(template=\"plotly_white\", showlegend=False)\n", "\n", - 
"fig.update_xaxes(dtick=.250)\n", + "fig.update_xaxes(dtick=0.250)\n", "fig.show()\n", - "pio.write_image(fig, f\"{filepaths.data_folder()}/outputs/viz/cor_ba_gen.jpg\", width=800*(1/2), height=900*(1/2), scale=4)" + "pio.write_image(\n", + " fig,\n", + " f\"{filepaths.data_folder()}/outputs/viz/cor_ba_gen.jpg\",\n", + " width=800 * (1 / 2),\n", + " height=900 * (1 / 2),\n", + " scale=4,\n", + ")" ] }, { @@ -661,7 +842,7 @@ "metadata": {}, "outputs": [], "source": [ - "#px.scatter(out, x=\"Annual BA generation\", y=\"Difference as percent of OGE\")#, text=out.index)" + "# px.scatter(out, x=\"Annual BA generation\", y=\"Difference as percent of OGE\")#, text=out.index)" ] }, { @@ -672,13 +853,34 @@ "source": [ "fig = go.Figure()\n", "\n", - "fig.add_trace(go.Scatter(y=[-3000000,805000000], x=[0,0], line={\"width\":2, \"color\":\"lightslategrey\"}, mode=\"lines\"))\n", - "fig.add_trace( go.Scatter(y=out[\"Annual BA generation\"], x=out[\"Median rate difference\"], text=out.index, mode=\"markers\", marker={\"color\":\"rgb(17, 119, 51)\"})) #, color=\"Median rate\")#, text=out.index)\n", - "fig.update_yaxes(range=(-3000000,805000000))\n", + "fig.add_trace(\n", + " go.Scatter(\n", + " y=[-3000000, 805000000],\n", + " x=[0, 0],\n", + " line={\"width\": 2, \"color\": \"lightslategrey\"},\n", + " mode=\"lines\",\n", + " )\n", + ")\n", + "fig.add_trace(\n", + " go.Scatter(\n", + " y=out[\"Annual BA generation\"],\n", + " x=out[\"Median rate difference\"],\n", + " text=out.index,\n", + " mode=\"markers\",\n", + " marker={\"color\": \"rgb(17, 119, 51)\"},\n", + " )\n", + ") # , color=\"Median rate\")#, text=out.index)\n", + "fig.update_yaxes(range=(-3000000, 805000000))\n", "fig.update_layout(template=\"plotly_white\", showlegend=False)\n", "fig.update_xaxes(dtick=500)\n", "fig.show()\n", - "pio.write_image(fig, f\"{filepaths.data_folder()}/outputs/viz/dif_ba_gen.jpg\", width=800*(1/2), height=900*(1/2), scale=4)" + "pio.write_image(\n", + " fig,\n", + " 
f\"{filepaths.data_folder()}/outputs/viz/dif_ba_gen.jpg\",\n", + " width=800 * (1 / 2),\n", + " height=900 * (1 / 2),\n", + " scale=4,\n", + ")" ] }, { @@ -703,7 +905,7 @@ "metadata": {}, "outputs": [], "source": [ - "dat = " + "# dat =" ] }, { @@ -719,9 +921,9 @@ "metadata": {}, "outputs": [], "source": [ - "good = len(out[out[\"Difference as percent of OGE\"].abs() <= .1])\n", - "bad = len(out[out[\"Difference as percent of OGE\"].abs() > .1])\n", - "print(good/(bad+good))" + "good = len(out[out[\"Difference as percent of OGE\"].abs() <= 0.1])\n", + "bad = len(out[out[\"Difference as percent of OGE\"].abs() > 0.1])\n", + "print(good / (bad + good))" ] }, { @@ -730,9 +932,9 @@ "metadata": {}, "outputs": [], "source": [ - "for col in out.columns: \n", + "for col in out.columns:\n", " out = out.replace(np.inf, np.nan)\n", - " out = out.replace(-1*np.inf, np.nan)\n", + " out = out.replace(-1 * np.inf, np.nan)\n", " non_nan_out = out.dropna(subset=col)\n", " a = np.average(non_nan_out[col].abs(), weights=non_nan_out[\"Annual BA generation\"])\n", " print(f\"{col} = {a}\")" @@ -751,32 +953,64 @@ "metadata": {}, "outputs": [], "source": [ - "# Plot and save all BAs \n", - "for ba_of_interest in os.listdir(f\"{filepaths.data_folder()}/outputs/2020/validation/real_time_rate/\"):\n", + "# Plot and save all BAs\n", + "for ba_of_interest in os.listdir(\n", + " f\"{filepaths.data_folder()}/outputs/2020/validation/real_time_rate/\"\n", + "):\n", " ba_of_interest = ba_of_interest.replace(\".csv\", \"\")\n", " if \".DS_\" in ba_of_interest:\n", " continue\n", - " \n", - " real_time = pd.read_csv(f\"{filepaths.data_folder()}/outputs/{year}/validation/real_time_rate/{ba_of_interest}.csv\", index_col=0, parse_dates=True)\n", + "\n", + " real_time = pd.read_csv(\n", + " f\"{filepaths.data_folder()}/outputs/{year}/validation/real_time_rate/{ba_of_interest}.csv\",\n", + " index_col=0,\n", + " parse_dates=True,\n", + " )\n", " real_time = real_time[\"2020-01-01T00:00\":]\n", "\n", - " 
hourly_generated = pd.read_csv(gen_path+ba_of_interest+\".csv\", \n", - " usecols=[\"datetime_utc\", \"generated_co2_rate_lb_per_mwh_for_electricity\", \"generated_co2_rate_lb_per_mwh_for_electricity_adjusted\", \"co2_mass_lb\", \"fuel_category\"], \n", - " index_col=\"datetime_utc\", parse_dates=True)\n", + " hourly_generated = pd.read_csv(\n", + " gen_path + ba_of_interest + \".csv\",\n", + " usecols=[\n", + " \"datetime_utc\",\n", + " \"generated_co2_rate_lb_per_mwh_for_electricity\",\n", + " \"generated_co2_rate_lb_per_mwh_for_electricity_adjusted\",\n", + " \"co2_mass_lb\",\n", + " \"fuel_category\",\n", + " ],\n", + " index_col=\"datetime_utc\",\n", + " parse_dates=True,\n", + " )\n", "\n", - " all_dat = pd.concat([real_time, hourly_consumed, hourly_generated.loc[hourly_generated.fuel_category==\"total\"]], axis='columns')\n", + " all_dat = pd.concat(\n", + " [\n", + " real_time,\n", + " hourly_consumed,\n", + " hourly_generated.loc[hourly_generated.fuel_category == \"total\"],\n", + " ],\n", + " axis=\"columns\",\n", + " )\n", " all_dat = all_dat.sort_index()\n", "\n", - " fig = px.line(all_dat, x=all_dat.index, y=[\"generated_co2_rate_lb_per_mwh_for_electricity\", \"adjusted_rate\"], \n", + " fig = px.line(\n", + " all_dat,\n", + " x=all_dat.index,\n", + " y=[\"generated_co2_rate_lb_per_mwh_for_electricity\", \"adjusted_rate\"],\n", " title=f\"{ba_of_interest} rate comparison\",\n", - " labels={\n", - " \"value\":\"Adjsuted CO2 emission rate (lb/mwh)\",\n", - " \"index\":\"Hour\"\n", - " })\n", + " labels={\"value\": \"Adjsuted CO2 emission rate (lb/mwh)\", \"index\": \"Hour\"},\n", + " )\n", "\n", - " newnames = {'generated_co2_rate_lb_per_mwh_for_electricity': 'Our data', 'adjusted_rate': 'Real-time data'}\n", - " fig.for_each_trace(lambda t: t.update(name = newnames[t.name]))\n", - " pio.write_image(fig, f\"{filepaths.data_folder()}/outputs/viz/{ba_of_interest}.jpg\", width=1000, height=400, scale=3)" + " newnames = {\n", + " 
\"generated_co2_rate_lb_per_mwh_for_electricity\": \"Our data\",\n", + " \"adjusted_rate\": \"Real-time data\",\n", + " }\n", + " fig.for_each_trace(lambda t: t.update(name=newnames[t.name]))\n", + " pio.write_image(\n", + " fig,\n", + " f\"{filepaths.data_folder()}/outputs/viz/{ba_of_interest}.jpg\",\n", + " width=1000,\n", + " height=400,\n", + " scale=3,\n", + " )" ] }, { @@ -789,7 +1023,7 @@ ], "metadata": { "kernelspec": { - "display_name": "oge_update", + "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, @@ -803,9 +1037,8 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.10.8 | packaged by conda-forge | (main, Nov 24 2022, 14:07:00) [MSC v.1916 64 bit (AMD64)]" + "version": "3.10.4" }, - "orig_nbformat": 4, "vscode": { "interpreter": { "hash": "a6b598b31fc646bdc0acd5c2004810e407b47ff3b1550e1d461e8498c70ba381" @@ -813,5 +1046,5 @@ } }, "nbformat": 4, - "nbformat_minor": 2 + "nbformat_minor": 4 } diff --git a/notebooks/validation/validate_data_cleaning.ipynb b/notebooks/validation/validate_data_cleaning.ipynb index 149f5f00..ebdd650e 100644 --- a/notebooks/validation/validate_data_cleaning.ipynb +++ b/notebooks/validation/validate_data_cleaning.ipynb @@ -12,9 +12,10 @@ "%reload_ext autoreload\n", "%autoreload 2\n", "\n", - "# Tell python where to look for modules. 
\n", + "# Tell python where to look for modules.\n", "import sys\n", - "sys.path.append('../../../open-grid-emissions/')\n", + "\n", + "sys.path.append(\"../../../open-grid-emissions/\")\n", "\n", "# import local modules\n", "import src.validation as validation\n", @@ -22,7 +23,7 @@ "from src.column_checks import get_dtypes\n", "\n", "year = 2020\n", - "path_prefix = ''\n", + "path_prefix = \"\"\n", "path_prefix = f\"{path_prefix}{year}\"" ] }, @@ -44,7 +45,11 @@ "metadata": {}, "outputs": [], "source": [ - "eia923_allocated = pd.read_csv(f'../data/outputs/{path_prefix}/eia923_allocated_{year}.csv', dtype=get_dtypes(), parse_dates=['report_date'])" + "eia923_allocated = pd.read_csv(\n", + " f\"../data/outputs/{path_prefix}/eia923_allocated_{year}.csv\",\n", + " dtype=get_dtypes(),\n", + " parse_dates=[\"report_date\"],\n", + ")" ] }, { @@ -55,10 +60,21 @@ "source": [ "# perform checks on allocated data\n", "# fuel consumption and co2 emissions should be positive\n", - "negative_test = validation.test_for_negative_values(eia923_allocated, ['fuel_consumed_mmbtu','fuel_consumed_for_electricity_mmbtu','co2_mass_lb','co2_mass_lb_for_electricity','co2_mass_lb_adjusted'])\n", + "negative_test = validation.test_for_negative_values(\n", + " eia923_allocated,\n", + " [\n", + " \"fuel_consumed_mmbtu\",\n", + " \"fuel_consumed_for_electricity_mmbtu\",\n", + " \"co2_mass_lb\",\n", + " \"co2_mass_lb_for_electricity\",\n", + " \"co2_mass_lb_adjusted\",\n", + " ],\n", + ")\n", "\n", "# if net generation is positive, fuel consumption should be non zero\n", - "missing_fuel_test = validation.test_for_missing_fuel(eia923_allocated, 'net_generation_mwh')\n", + "missing_fuel_test = validation.test_for_missing_fuel(\n", + " eia923_allocated, \"net_generation_mwh\"\n", + ")\n", "\n", "# fuel consumed for electricity should be less than fuel consumed\n", "chp_allocation_test = validation.test_chp_allocation(eia923_allocated)\n", @@ -67,16 +83,38 @@ "missing_co2_test = 
validation.test_for_missing_co2(eia923_allocated)\n", "\n", "# check for generators with no data\n", - "missing_data_test = validation.test_for_missing_data(eia923_allocated, ['net_generation_mwh','fuel_consumed_mmbtu','fuel_consumed_for_electricity_mmbtu','co2_mass_lb','co2_mass_lb_for_electricity','co2_mass_lb_adjusted'])\n", + "missing_data_test = validation.test_for_missing_data(\n", + " eia923_allocated,\n", + " [\n", + " \"net_generation_mwh\",\n", + " \"fuel_consumed_mmbtu\",\n", + " \"fuel_consumed_for_electricity_mmbtu\",\n", + " \"co2_mass_lb\",\n", + " \"co2_mass_lb_for_electricity\",\n", + " \"co2_mass_lb_adjusted\",\n", + " ],\n", + ")\n", "\n", "# check for generators with all data = 0\n", - "zero_data_test = validation.test_for_zero_data(eia923_allocated, ['net_generation_mwh','fuel_consumed_mmbtu','fuel_consumed_for_electricity_mmbtu','co2_mass_lb','co2_mass_lb_for_electricity','co2_mass_lb_adjusted'])\n", + "zero_data_test = validation.test_for_zero_data(\n", + " eia923_allocated,\n", + " [\n", + " \"net_generation_mwh\",\n", + " \"fuel_consumed_mmbtu\",\n", + " \"fuel_consumed_for_electricity_mmbtu\",\n", + " \"co2_mass_lb\",\n", + " \"co2_mass_lb_for_electricity\",\n", + " \"co2_mass_lb_adjusted\",\n", + " ],\n", + ")\n", "\n", "# check for missing energy source code\n", "missing_esc_test = validation.test_for_missing_energy_source_code(eia923_allocated)\n", "\n", "# check for missing and incorrect prime movers\n", - "incorrect_pm_test, missing_pm_test = validation.test_for_missing_incorrect_prime_movers(eia923_allocated, year)\n", + "incorrect_pm_test, missing_pm_test = validation.test_for_missing_incorrect_prime_movers(\n", + " eia923_allocated, year\n", + ")\n", "\n", "# check for missing subplant ids\n", "eia_missing_subplant_test = validation.test_for_missing_subplant_id(eia923_allocated)" @@ -109,7 +147,26 @@ "source": [ "# what percent of emissions is reported in CEMS vs EIA\n", "# NOTE: This does not include emissions only reported by 
CEMS, so the % may be higher\n", - "(eia923_allocated.groupby('hourly_data_source')[[\"net_generation_mwh\",\"fuel_consumed_mmbtu\", 'co2_mass_lb',\"co2_mass_lb_for_electricity\"]].sum() / eia923_allocated.groupby('hourly_data_source')[[\"net_generation_mwh\",\"fuel_consumed_mmbtu\", 'co2_mass_lb',\"co2_mass_lb_for_electricity\"]].sum().sum(axis=0)).round(3)" + "(\n", + " eia923_allocated.groupby(\"hourly_data_source\")[\n", + " [\n", + " \"net_generation_mwh\",\n", + " \"fuel_consumed_mmbtu\",\n", + " \"co2_mass_lb\",\n", + " \"co2_mass_lb_for_electricity\",\n", + " ]\n", + " ].sum()\n", + " / eia923_allocated.groupby(\"hourly_data_source\")[\n", + " [\n", + " \"net_generation_mwh\",\n", + " \"fuel_consumed_mmbtu\",\n", + " \"co2_mass_lb\",\n", + " \"co2_mass_lb_for_electricity\",\n", + " ]\n", + " ]\n", + " .sum()\n", + " .sum(axis=0)\n", + ").round(3)" ] }, { @@ -125,7 +182,11 @@ "metadata": {}, "outputs": [], "source": [ - "cems = pd.read_csv(f'../data/outputs/{path_prefix}/cems_subplant_{year}.csv', dtype=get_dtypes(), parse_dates=['report_date'])" + "cems = pd.read_csv(\n", + " f\"../data/outputs/{path_prefix}/cems_subplant_{year}.csv\",\n", + " dtype=get_dtypes(),\n", + " parse_dates=[\"report_date\"],\n", + ")" ] }, { @@ -138,7 +199,7 @@ "cems_negative_test = validation.test_for_negative_values(cems)\n", "\n", "# if net generation is positive, fuel consumption should be non zero\n", - "cems_missing_fuel_test = validation.test_for_missing_fuel(cems,'gross_generation_mwh')\n", + "cems_missing_fuel_test = validation.test_for_missing_fuel(cems, \"gross_generation_mwh\")\n", "\n", "# fuel consumed for electricity should be less than fuel consumed\n", "cems_chp_allocation_test = validation.test_chp_allocation(cems)\n", @@ -147,13 +208,13 @@ "cems_missing_co2_test = validation.test_for_missing_co2(cems)\n", "\n", "# check for missing energy source code\n", - "#cems_missing_esc_test = validation.test_for_missing_energy_source_code(cems)\n", + "# 
cems_missing_esc_test = validation.test_for_missing_energy_source_code(cems)\n", "\n", "# test to make sure that there is a complete subplant mapping\n", "cems_missing_subplant_test = validation.test_for_missing_subplant_id(cems)\n", "\n", "# test to see if there are any net generation values greater than gross generation\n", - "gtn_test = validation.test_gtn_results(cems)\n" + "gtn_test = validation.test_gtn_results(cems)" ] }, { diff --git a/notebooks/validation/validate_hourly_profiles.ipynb b/notebooks/validation/validate_hourly_profiles.ipynb index 1564a00a..ec589bc7 100644 --- a/notebooks/validation/validate_hourly_profiles.ipynb +++ b/notebooks/validation/validate_hourly_profiles.ipynb @@ -10,7 +10,7 @@ "import sys\n", "import plotly.express as px\n", "\n", - "sys.path.append('../../../open-grid-emissions/')\n", + "sys.path.append(\"../../../open-grid-emissions/\")\n", "%reload_ext autoreload\n", "%autoreload 2\n", "from src.column_checks import get_dtypes, apply_dtypes\n", @@ -23,7 +23,7 @@ "\n", "# load data from csv\n", "year = 2020\n", - "path_prefix = ''\n", + "path_prefix = \"\"\n", "\n", "path_prefix = f\"{path_prefix}{year}\"" ] @@ -53,9 +53,27 @@ "metadata": {}, "outputs": [], "source": [ - "eia930_data_raw = eia930.load_chalendar_for_pipeline(f\"../data/outputs/{path_prefix}/eia930/eia930_raw.csv\", year=year).pipe(eia930.remove_imputed_ones).pipe(eia930.remove_months_with_zero_data)\n", - "eia930_data_roll = eia930.load_chalendar_for_pipeline(f\"../data/outputs/{path_prefix}/eia930/eia930_rolling.csv\", year=year).pipe(eia930.remove_imputed_ones).pipe(eia930.remove_months_with_zero_data)\n", - "eia930_data_cleaned = eia930.load_chalendar_for_pipeline(f\"../data/outputs/{path_prefix}/eia930/eia930_elec.csv\", year=year).pipe(eia930.remove_imputed_ones).pipe(eia930.remove_months_with_zero_data)" + "eia930_data_raw = (\n", + " eia930.load_chalendar_for_pipeline(\n", + " f\"../data/outputs/{path_prefix}/eia930/eia930_raw.csv\", year=year\n", + " )\n", 
+ " .pipe(eia930.remove_imputed_ones)\n", + " .pipe(eia930.remove_months_with_zero_data)\n", + ")\n", + "eia930_data_roll = (\n", + " eia930.load_chalendar_for_pipeline(\n", + " f\"../data/outputs/{path_prefix}/eia930/eia930_rolling.csv\", year=year\n", + " )\n", + " .pipe(eia930.remove_imputed_ones)\n", + " .pipe(eia930.remove_months_with_zero_data)\n", + ")\n", + "eia930_data_cleaned = (\n", + " eia930.load_chalendar_for_pipeline(\n", + " f\"../data/outputs/{path_prefix}/eia930/eia930_elec.csv\", year=year\n", + " )\n", + " .pipe(eia930.remove_imputed_ones)\n", + " .pipe(eia930.remove_months_with_zero_data)\n", + ")" ] }, { @@ -64,11 +82,27 @@ "metadata": {}, "outputs": [], "source": [ - "cems = pd.read_csv(f'../data/outputs/{path_prefix}/cems_subplant_{year}.csv', dtype=get_dtypes(), parse_dates=['datetime_utc', 'report_date'])\n", - "partial_cems_scaled = pd.read_csv(f'../data/outputs/{path_prefix}/partial_cems_scaled_{year}.csv', dtype=get_dtypes(), parse_dates=['datetime_utc', 'report_date'])\n", - "eia923_allocated = pd.read_csv(f'../data/outputs/{path_prefix}/eia923_allocated_{year}.csv', dtype=get_dtypes(), parse_dates=['report_date'])\n", - "plant_attributes = pd.read_csv(f\"../data/outputs/{path_prefix}/plant_static_attributes_{year}.csv\")\n", - "primary_fuel_table = plant_attributes.drop_duplicates(subset=\"plant_id_eia\")[[\"plant_id_eia\", \"plant_primary_fuel\"]]\n", + "cems = pd.read_csv(\n", + " f\"../data/outputs/{path_prefix}/cems_subplant_{year}.csv\",\n", + " dtype=get_dtypes(),\n", + " parse_dates=[\"datetime_utc\", \"report_date\"],\n", + ")\n", + "partial_cems_scaled = pd.read_csv(\n", + " f\"../data/outputs/{path_prefix}/partial_cems_scaled_{year}.csv\",\n", + " dtype=get_dtypes(),\n", + " parse_dates=[\"datetime_utc\", \"report_date\"],\n", + ")\n", + "eia923_allocated = pd.read_csv(\n", + " f\"../data/outputs/{path_prefix}/eia923_allocated_{year}.csv\",\n", + " dtype=get_dtypes(),\n", + " parse_dates=[\"report_date\"],\n", + ")\n", + 
"plant_attributes = pd.read_csv(\n", + " f\"../data/outputs/{path_prefix}/plant_static_attributes_{year}.csv\"\n", + ")\n", + "primary_fuel_table = plant_attributes.drop_duplicates(subset=\"plant_id_eia\")[\n", + " [\"plant_id_eia\", \"plant_primary_fuel\"]\n", + "]\n", "\n", "# aggregate cems data to subplant level\n", "cems = data_cleaning.aggregate_cems_to_subplant(cems)\n", @@ -80,7 +114,7 @@ "monthly_eia_data_to_shape = eia923_allocated[\n", " (eia923_allocated[\"hourly_data_source\"] == \"eia\")\n", " & ~(eia923_allocated[\"fuel_consumed_mmbtu\"].isna())\n", - "]\n" + "]" ] }, { @@ -89,7 +123,10 @@ "metadata": {}, "outputs": [], "source": [ - "data_to_graph = eia930_data_roll[(eia930_data_roll[\"fuel_category_eia930\"] == \"solar\") & (eia930_data_roll[\"report_date\"] == \"2020-07-01\")]\n", + "data_to_graph = eia930_data_roll[\n", + " (eia930_data_roll[\"fuel_category_eia930\"] == \"solar\")\n", + " & (eia930_data_roll[\"report_date\"] == \"2020-07-01\")\n", + "]\n", "\n", "px.line(data_to_graph, x=\"datetime_local\", y=\"net_generation_mwh_930\", color=\"ba_code\")" ] @@ -144,7 +181,11 @@ "metadata": {}, "outputs": [], "source": [ - "hourly_profiles_raw[[\"ba_code\", \"fuel_category\", \"report_date\", \"profile_method\"]].drop_duplicates().drop(columns=[\"ba_code\"]).pivot_table(index=\"fuel_category\", columns=\"profile_method\", aggfunc=\"count\").fillna(0).astype(int)" + "hourly_profiles_raw[\n", + " [\"ba_code\", \"fuel_category\", \"report_date\", \"profile_method\"]\n", + "].drop_duplicates().drop(columns=[\"ba_code\"]).pivot_table(\n", + " index=\"fuel_category\", columns=\"profile_method\", aggfunc=\"count\"\n", + ").fillna(0).astype(int)" ] }, { @@ -163,7 +204,9 @@ "metadata": {}, "outputs": [], "source": [ - "national_validation = validation.validate_national_imputation_method(hourly_profiles_raw, year)\n", + "national_validation = validation.validate_national_imputation_method(\n", + " hourly_profiles_raw, year\n", + ")\n", 
"national_validation.groupby([\"fuel_category\"]).mean()[\"imputed_profile\"]" ] }, @@ -180,7 +223,11 @@ "metadata": {}, "outputs": [], "source": [ - "hourly_profiles_roll[[\"ba_code\", \"fuel_category\", \"report_date\", \"profile_method\"]].drop_duplicates().drop(columns=[\"ba_code\"]).pivot_table(index=\"fuel_category\", columns=\"profile_method\", aggfunc=\"count\").fillna(0).astype(int)" + "hourly_profiles_roll[\n", + " [\"ba_code\", \"fuel_category\", \"report_date\", \"profile_method\"]\n", + "].drop_duplicates().drop(columns=[\"ba_code\"]).pivot_table(\n", + " index=\"fuel_category\", columns=\"profile_method\", aggfunc=\"count\"\n", + ").fillna(0).astype(int)" ] }, { @@ -199,7 +246,9 @@ "metadata": {}, "outputs": [], "source": [ - "national_validation = validation.validate_national_imputation_method(hourly_profiles_roll, year)\n", + "national_validation = validation.validate_national_imputation_method(\n", + " hourly_profiles_roll, year\n", + ")\n", "national_validation.groupby([\"fuel_category\"]).mean()[\"imputed_profile\"]" ] }, @@ -216,7 +265,11 @@ "metadata": {}, "outputs": [], "source": [ - "hourly_profiles_cleaned[[\"ba_code\", \"fuel_category\", \"report_date\", \"profile_method\"]].drop_duplicates().drop(columns=[\"ba_code\"]).pivot_table(index=\"fuel_category\", columns=\"profile_method\", aggfunc=\"count\").fillna(0).astype(int)" + "hourly_profiles_cleaned[\n", + " [\"ba_code\", \"fuel_category\", \"report_date\", \"profile_method\"]\n", + "].drop_duplicates().drop(columns=[\"ba_code\"]).pivot_table(\n", + " index=\"fuel_category\", columns=\"profile_method\", aggfunc=\"count\"\n", + ").fillna(0).astype(int)" ] }, { @@ -225,7 +278,9 @@ "metadata": {}, "outputs": [], "source": [ - "diba_validation = validation.validate_diba_imputation_method(hourly_profiles_cleaned, year)\n", + "diba_validation = validation.validate_diba_imputation_method(\n", + " hourly_profiles_cleaned, year\n", + ")\n", 
"diba_validation.groupby([\"fuel_category\"]).mean()[\"imputed_profile\"]" ] }, @@ -235,7 +290,9 @@ "metadata": {}, "outputs": [], "source": [ - "national_validation = validation.validate_national_imputation_method(hourly_profiles_cleaned, year)\n", + "national_validation = validation.validate_national_imputation_method(\n", + " hourly_profiles_cleaned, year\n", + ")\n", "national_validation.groupby([\"fuel_category\"]).mean()[\"imputed_profile\"]" ] }, @@ -252,8 +309,12 @@ "metadata": {}, "outputs": [], "source": [ - "hourly_profiles = pd.read_csv(f\"../data/outputs/{path_prefix}/hourly_profiles_{year}.csv\")\n", - "shaped_eia923_data = pd.read_csv(f'../data/outputs/{path_prefix}/shaped_eia923_data_{year}.csv', dtype=get_dtypes())" + "hourly_profiles = pd.read_csv(\n", + " f\"../data/outputs/{path_prefix}/hourly_profiles_{year}.csv\"\n", + ")\n", + "shaped_eia923_data = pd.read_csv(\n", + " f\"../data/outputs/{path_prefix}/shaped_eia923_data_{year}.csv\", dtype=get_dtypes()\n", + ")" ] }, { @@ -271,9 +332,16 @@ "metadata": {}, "outputs": [], "source": [ - "data_to_graph = hourly_profiles[(hourly_profiles[\"fuel_category\"] == \"natural_gas\") & (hourly_profiles[\"ba_code\"] == \"ERCO\")]\n", + "data_to_graph = hourly_profiles[\n", + " (hourly_profiles[\"fuel_category\"] == \"natural_gas\")\n", + " & (hourly_profiles[\"ba_code\"] == \"ERCO\")\n", + "]\n", "\n", - "px.line(data_to_graph, x=\"datetime_local\", y=[\"eia930_profile\",\"cems_profile\",\"residual_profile\",\"scaled_residual_profile\"])" + "px.line(\n", + " data_to_graph,\n", + " x=\"datetime_local\",\n", + " y=[\"eia930_profile\", \"cems_profile\", \"residual_profile\", \"scaled_residual_profile\"],\n", + ")" ] }, { @@ -282,7 +350,10 @@ "metadata": {}, "outputs": [], "source": [ - "data_to_graph = shaped_eia923_data[(shaped_eia923_data[\"fuel_category\"] == \"natural_gas\") & (shaped_eia923_data[\"ba_code\"] == \"ERCO\")]\n", + "data_to_graph = shaped_eia923_data[\n", + " 
(shaped_eia923_data[\"fuel_category\"] == \"natural_gas\")\n", + " & (shaped_eia923_data[\"ba_code\"] == \"ERCO\")\n", + "]\n", "\n", "px.line(data_to_graph, x=\"datetime_utc\", y=[\"net_generation_mwh\"])" ] @@ -303,7 +374,9 @@ "metadata": {}, "outputs": [], "source": [ - "national_validation = validation.validate_national_imputation_method(hourly_profiles, year)\n", + "national_validation = validation.validate_national_imputation_method(\n", + " hourly_profiles, year\n", + ")\n", "national_validation.groupby([\"fuel_category\"]).mean()[\"imputed_profile\"]" ] }, @@ -318,11 +391,7 @@ " [\"ba_code\", \"fuel_category\", \"report_date\", \"profile_method\"]\n", "].drop_duplicates().drop(columns=[\"ba_code\"]).pivot_table(\n", " index=\"fuel_category\", columns=\"profile_method\", aggfunc=\"count\"\n", - ").fillna(\n", - " 0\n", - ").astype(\n", - " int\n", - ")" + ").fillna(0).astype(int)" ] } ], diff --git a/notebooks/validation/validate_negative_profiles.ipynb b/notebooks/validation/validate_negative_profiles.ipynb index 486b1322..27c44e04 100644 --- a/notebooks/validation/validate_negative_profiles.ipynb +++ b/notebooks/validation/validate_negative_profiles.ipynb @@ -17,7 +17,7 @@ "# # Tell python where to look for modules.\n", "import sys\n", "\n", - "sys.path.append('../../../open-grid-emissions/')\n", + "sys.path.append(\"../../../open-grid-emissions/\")\n", "\n", "# import local modules\n", "import src.load_data as load_data\n", @@ -39,12 +39,26 @@ "source": [ "# load data from csv\n", "year = 2020\n", - "path_prefix = ''\n", + "path_prefix = \"\"\n", "\n", - "cems = pd.read_csv(f'../data/outputs/{path_prefix}{year}/cems_{year}.csv', dtype=get_dtypes(), parse_dates=['datetime_utc', 'report_date'])\n", - "partial_cems_scaled = pd.read_csv(f'../data/outputs/{path_prefix}{year}/partial_cems_scaled_{year}.csv', dtype=get_dtypes(), parse_dates=['datetime_utc', 'report_date'])\n", - "eia923_allocated = 
pd.read_csv(f'../data/outputs/{path_prefix}{year}/eia923_allocated_{year}.csv', dtype=get_dtypes(), parse_dates=['report_date'])\n", - "plant_attributes = pd.read_csv(f\"../data/outputs/{path_prefix}{year}/plant_static_attributes_{year}.csv\")" + "cems = pd.read_csv(\n", + " f\"../data/outputs/{path_prefix}{year}/cems_{year}.csv\",\n", + " dtype=get_dtypes(),\n", + " parse_dates=[\"datetime_utc\", \"report_date\"],\n", + ")\n", + "partial_cems_scaled = pd.read_csv(\n", + " f\"../data/outputs/{path_prefix}{year}/partial_cems_scaled_{year}.csv\",\n", + " dtype=get_dtypes(),\n", + " parse_dates=[\"datetime_utc\", \"report_date\"],\n", + ")\n", + "eia923_allocated = pd.read_csv(\n", + " f\"../data/outputs/{path_prefix}{year}/eia923_allocated_{year}.csv\",\n", + " dtype=get_dtypes(),\n", + " parse_dates=[\"report_date\"],\n", + ")\n", + "plant_attributes = pd.read_csv(\n", + " f\"../data/outputs/{path_prefix}{year}/plant_static_attributes_{year}.csv\"\n", + ")" ] }, { @@ -64,16 +78,41 @@ "# validate method\n", "\n", "# merge together monthly subplant totals from EIA and calculated from CEMS\n", - "eia_netgen = eia923_allocated.groupby(['plant_id_eia',\"subplant_id\",\"report_date\"], dropna=False).sum(min_count=1)['net_generation_mwh'].reset_index().dropna(subset=\"net_generation_mwh\")\n", - "calculated_netgen = cems.groupby(['plant_id_eia',\"subplant_id\",\"report_date\"], dropna=False).sum()['net_generation_mwh'].reset_index()\n", - "validated_ng = eia_netgen.merge(calculated_netgen, how=\"inner\", on=['plant_id_eia',\"subplant_id\",\"report_date\"], suffixes=(\"_eia\",\"_calc\"))\n", + "eia_netgen = (\n", + " eia923_allocated.groupby(\n", + " [\"plant_id_eia\", \"subplant_id\", \"report_date\"], dropna=False\n", + " )\n", + " .sum(min_count=1)[\"net_generation_mwh\"]\n", + " .reset_index()\n", + " .dropna(subset=\"net_generation_mwh\")\n", + ")\n", + "calculated_netgen = (\n", + " cems.groupby([\"plant_id_eia\", \"subplant_id\", \"report_date\"], dropna=False)\n", + 
" .sum()[\"net_generation_mwh\"]\n", + " .reset_index()\n", + ")\n", + "validated_ng = eia_netgen.merge(\n", + " calculated_netgen,\n", + " how=\"inner\",\n", + " on=[\"plant_id_eia\", \"subplant_id\", \"report_date\"],\n", + " suffixes=(\"_eia\", \"_calc\"),\n", + ")\n", "\n", - "validated_ng = validated_ng.groupby(\"plant_id_eia\").sum().reset_index().drop(columns=[\"subplant_id\"])\n", + "validated_ng = (\n", + " validated_ng.groupby(\"plant_id_eia\")\n", + " .sum()\n", + " .reset_index()\n", + " .drop(columns=[\"subplant_id\"])\n", + ")\n", "\n", "validated_ng = validated_ng.round(3)\n", - "validated_ng = validated_ng[validated_ng[[\"net_generation_mwh_eia\",\"net_generation_mwh_calc\"]].sum(axis=1) != 0]\n", + "validated_ng = validated_ng[\n", + " validated_ng[[\"net_generation_mwh_eia\", \"net_generation_mwh_calc\"]].sum(axis=1) != 0\n", + "]\n", "\n", - "validated_ng['pct_error'] = (validated_ng['net_generation_mwh_calc'] - validated_ng['net_generation_mwh_eia']) / validated_ng['net_generation_mwh_eia']\n", + "validated_ng[\"pct_error\"] = (\n", + " validated_ng[\"net_generation_mwh_calc\"] - validated_ng[\"net_generation_mwh_eia\"]\n", + ") / validated_ng[\"net_generation_mwh_eia\"]\n", "validated_ng.sort_values(by=\"pct_error\")" ] }, @@ -91,9 +130,13 @@ "outputs": [], "source": [ "# what is the most negative\n", - "negative_hourly = eia923_allocated.copy()[[\"plant_id_eia\",\"generator_id\",\"report_date\",\"net_generation_mwh\"]].dropna()\n", + "negative_hourly = eia923_allocated.copy()[\n", + " [\"plant_id_eia\", \"generator_id\", \"report_date\", \"net_generation_mwh\"]\n", + "].dropna()\n", "negative_hourly[\"hours\"] = negative_hourly.report_date.dt.daysinmonth * 24\n", - "negative_hourly[\"avg_generation_mw\"] = negative_hourly[\"net_generation_mwh\"] / negative_hourly[\"hours\"]\n", + "negative_hourly[\"avg_generation_mw\"] = (\n", + " negative_hourly[\"net_generation_mwh\"] / negative_hourly[\"hours\"]\n", + ")\n", 
"negative_hourly.sort_values(by=\"avg_generation_mw\")" ] }, @@ -127,24 +170,20 @@ " \"co2_mass_lb_adjusted\",\n", "]\n", "all_data = all_data.assign(\n", - " reported_eia923=lambda x: np.where(\n", - " x[columns_to_test].notnull().all(axis=1), 1, 0\n", - " )\n", + " reported_eia923=lambda x: np.where(x[columns_to_test].notnull().all(axis=1), 1, 0)\n", ")\n", "\n", "cems_monthly = (\n", - " cems_test.groupby(\n", - " [\"plant_id_eia\", \"subplant_id\",\"report_date\"], dropna=False\n", - " )\n", - " .count()[\"datetime_utc\"].reset_index())\n", + " cems_test.groupby([\"plant_id_eia\", \"subplant_id\", \"report_date\"], dropna=False)\n", + " .count()[\"datetime_utc\"]\n", + " .reset_index()\n", + ")\n", "\n", "cems_monthly[\"hourly_data_source\"] = \"cems\"\n", "\n", "# merge in the data source column from CEMS\n", "all_data = all_data.merge(\n", - " cems_monthly[\n", - " [\"plant_id_eia\", \"subplant_id\", \"report_date\", \"hourly_data_source\"]\n", - " ],\n", + " cems_monthly[[\"plant_id_eia\", \"subplant_id\", \"report_date\", \"hourly_data_source\"]],\n", " how=\"left\",\n", " on=[\"plant_id_eia\", \"subplant_id\", \"report_date\"],\n", " validate=\"m:1\",\n", @@ -153,7 +192,11 @@ "# for the remaining plants, identify the hourly data source as EIA\n", "all_data[\"hourly_data_source\"] = all_data[\"hourly_data_source\"].fillna(\"eia\")\n", "\n", - "all_data.loc[(all_data[\"hourly_data_source\"] == \"cems\") & (~all_data[\"net_generation_mwh\"].isna()), \"hourly_data_source\"] = \"both\"\n", + "all_data.loc[\n", + " (all_data[\"hourly_data_source\"] == \"cems\")\n", + " & (~all_data[\"net_generation_mwh\"].isna()),\n", + " \"hourly_data_source\",\n", + "] = \"both\"\n", "\n", "all_data" ] @@ -164,7 +207,9 @@ "metadata": {}, "outputs": [], "source": [ - "gtn_conversions = gross_to_net_generation.calculate_gross_to_net_conversion_factors(cems_test, eia_test, plant_attributes, year)\n", + "gtn_conversions = 
gross_to_net_generation.calculate_gross_to_net_conversion_factors(\n", + " cems_test, eia_test, plant_attributes, year\n", + ")\n", "\n", "factors_to_use = gross_to_net_generation.filter_gtn_conversion_factors(gtn_conversions)\n", "\n", @@ -211,17 +256,23 @@ " cems_test[\"gross_generation_mwh\"] + cems_test[\"annual_plant_shift_mw\"]\n", ")\n", "\n", - "cems_test.loc[cems_test[\"net_generation_mwh\"].isna(), \"gtn_method\"] = \"4_annual_plant_ratio\"\n", + "cems_test.loc[\n", + " cems_test[\"net_generation_mwh\"].isna(), \"gtn_method\"\n", + "] = \"4_annual_plant_ratio\"\n", "cems_test[\"net_generation_mwh\"] = cems_test[\"net_generation_mwh\"].fillna(\n", " cems_test[\"gross_generation_mwh\"] * cems_test[\"annual_plant_ratio\"]\n", ")\n", "\n", - "cems_test.loc[cems_test[\"net_generation_mwh\"].isna(), \"gtn_method\"] = \"5_annual_fuel_ratio\"\n", + "cems_test.loc[\n", + " cems_test[\"net_generation_mwh\"].isna(), \"gtn_method\"\n", + "] = \"5_annual_fuel_ratio\"\n", "cems_test[\"net_generation_mwh\"] = cems_test[\"net_generation_mwh\"].fillna(\n", " cems_test[\"gross_generation_mwh\"] * cems_test[\"annual_fuel_ratio\"]\n", ")\n", "\n", - "cems_test.loc[cems_test[\"net_generation_mwh\"].isna(), \"gtn_method\"] = \"6_gross_equals_net\"\n", + "cems_test.loc[\n", + " cems_test[\"net_generation_mwh\"].isna(), \"gtn_method\"\n", + "] = \"6_gross_equals_net\"\n", "cems_test[\"net_generation_mwh\"] = cems_test[\"net_generation_mwh\"].fillna(\n", " cems_test[\"gross_generation_mwh\"]\n", ")\n", @@ -229,7 +280,7 @@ "# drop intermediate columns\n", "cems_test = cems_test.drop(\n", " columns=[\n", - " #\"data_source\",\n", + " # \"data_source\",\n", " \"annual_subplant_shift_mw\",\n", " \"annual_plant_shift_mw\",\n", " \"annual_subplant_ratio\",\n", @@ -254,7 +305,9 @@ "metadata": {}, "outputs": [], "source": [ - "cems_test.groupby([\"data_source\",\"subplant_id\"], dropna=False).sum()[[\"gross_generation_mwh\",\"net_generation_mwh\"]].reset_index()" + 
"cems_test.groupby([\"data_source\", \"subplant_id\"], dropna=False).sum()[\n", + " [\"gross_generation_mwh\", \"net_generation_mwh\"]\n", + "].reset_index()" ] }, { @@ -263,7 +316,9 @@ "metadata": {}, "outputs": [], "source": [ - "eia_test.groupby([\"hourly_data_source\",\"subplant_id\"], dropna=False).sum()[\"net_generation_mwh\"].reset_index()" + "eia_test.groupby([\"hourly_data_source\", \"subplant_id\"], dropna=False).sum()[\n", + " \"net_generation_mwh\"\n", + "].reset_index()" ] }, { @@ -272,7 +327,9 @@ "metadata": {}, "outputs": [], "source": [ - "factors_to_use.groupby([\"data_source\",\"subplant_id\"], dropna=False).sum()[\"net_generation_mwh\"].reset_index()" + "factors_to_use.groupby([\"data_source\", \"subplant_id\"], dropna=False).sum()[\n", + " \"net_generation_mwh\"\n", + "].reset_index()" ] }, { @@ -297,9 +354,24 @@ "metadata": {}, "outputs": [], "source": [ - "data_to_graph = hourly_profiles[(hourly_profiles[\"fuel_category\"] == \"coal\") & (hourly_profiles[\"ba_code\"] == \"MISO\")]\n", + "data_to_graph = hourly_profiles[\n", + " (hourly_profiles[\"fuel_category\"] == \"coal\")\n", + " & (hourly_profiles[\"ba_code\"] == \"MISO\")\n", + "]\n", "\n", - "px.line(data_to_graph, x=\"datetime_utc\", y=[\"eia930_profile\",\"cems_profile\",\"residual_profile\",\"scaled_residual_profile\",\"shifted_residual_profile\",\"imputed_profile\",\"profile\"])" + "px.line(\n", + " data_to_graph,\n", + " x=\"datetime_utc\",\n", + " y=[\n", + " \"eia930_profile\",\n", + " \"cems_profile\",\n", + " \"residual_profile\",\n", + " \"scaled_residual_profile\",\n", + " \"shifted_residual_profile\",\n", + " \"imputed_profile\",\n", + " \"profile\",\n", + " ],\n", + ")" ] }, { @@ -311,9 +383,13 @@ "plant_to_test = 3399\n", "subplant = 1\n", "\n", - "cems_to_graph = cems[(cems[\"plant_id_eia\"] == plant_to_test) & (cems[\"subplant_id\"] == subplant)]\n", + "cems_to_graph = cems[\n", + " (cems[\"plant_id_eia\"] == plant_to_test) & (cems[\"subplant_id\"] == subplant)\n", + 
"]\n", "\n", - "px.line(cems_to_graph, x=\"datetime_utc\", y=[\"gross_generation_mwh\",\"net_generation_mwh\"])" + "px.line(\n", + " cems_to_graph, x=\"datetime_utc\", y=[\"gross_generation_mwh\", \"net_generation_mwh\"]\n", + ")" ] } ], diff --git a/notebooks/validation/validate_vs_egrid.ipynb b/notebooks/validation/validate_vs_egrid.ipynb index 70ec8518..8e9bb8da 100644 --- a/notebooks/validation/validate_vs_egrid.ipynb +++ b/notebooks/validation/validate_vs_egrid.ipynb @@ -16,9 +16,10 @@ "%reload_ext autoreload\n", "%autoreload 2\n", "\n", - "# Tell python where to look for modules. \n", + "# Tell python where to look for modules.\n", "import sys\n", - "sys.path.append('../../../open-grid-emissions/src/')\n", + "\n", + "sys.path.append(\"../../../open-grid-emissions/src/\")\n", "\n", "# import local modules\n", "import load_data\n", @@ -75,7 +76,7 @@ ")\n", "\n", "# Load the eGRID plant table\n", - "egrid_plant = validation.load_egrid_plant_file(year)\n" + "egrid_plant = validation.load_egrid_plant_file(year)" ] }, { @@ -97,7 +98,7 @@ ") = validation.identify_plants_missing_from_our_calculations(\n", " egrid_plant, annual_plant_results, year\n", ")\n", - "missing_from_calc\n" + "missing_from_calc" ] }, { @@ -129,7 +130,7 @@ " \"fuel_consumed_for_electricity_mmbtu\",\n", " \"fuel_consumed_mmbtu\",\n", " ],\n", - "]\n" + "]" ] }, { @@ -163,7 +164,7 @@ " (double_ids[\"net_generation_mwh_calc\"] - double_ids[\"net_generation_mwh_egrid\"])\n", " / double_ids[\"net_generation_mwh_egrid\"]\n", ").round(3)\n", - "double_ids\n" + "double_ids" ] }, { @@ -175,7 +176,7 @@ "# compare egrid vs eia plant ids\n", "annual_plant_results[\n", " annual_plant_results[\"plant_id_egrid\"].duplicated(keep=False)\n", - "].groupby([\"plant_id_egrid\", \"plant_id_eia\"]).sum()\n" + "].groupby([\"plant_id_egrid\", \"plant_id_eia\"]).sum()" ] }, { @@ -191,7 +192,9 @@ "metadata": {}, "outputs": [], "source": [ - "ba_code_match = egrid_plant.set_index(\"plant_id_eia\")[[\"plant_name_eia\", 
\"ba_code\"]].merge(\n", + "ba_code_match = egrid_plant.set_index(\"plant_id_eia\")[\n", + " [\"plant_name_eia\", \"ba_code\"]\n", + "].merge(\n", " annual_plant_results.set_index(\"plant_id_eia\")[[\"ba_code\"]],\n", " how=\"inner\",\n", " left_index=True,\n", @@ -201,7 +204,7 @@ "\n", "# plants with missing ba code\n", "# ba_code_match[(ba_code_match['ba_code_calc'].isna()) & ~(ba_code_match['ba_code_egrid'].isna())]\n", - "ba_code_match[ba_code_match[\"ba_code_calc\"] != ba_code_match[\"ba_code_egrid\"]]\n" + "ba_code_match[ba_code_match[\"ba_code_calc\"] != ba_code_match[\"ba_code_egrid\"]]" ] }, { @@ -214,7 +217,7 @@ "ba_code_match[\n", " (ba_code_match[\"ba_code_calc\"] != ba_code_match[\"ba_code_egrid\"])\n", " & ~(ba_code_match[\"ba_code_egrid\"].isna())\n", - "]\n" + "]" ] }, { @@ -242,7 +245,7 @@ "\n", "fuel_match[\n", " fuel_match[\"plant_primary_fuel_egrid\"] != fuel_match[\"plant_primary_fuel_calc\"]\n", - "]\n" + "]" ] }, { @@ -314,7 +317,7 @@ " eia923_allocated,\n", " pudl_out,\n", " PLANTS_MISSING_FROM_EGRID,\n", - ")\n" + ")" ] }, { @@ -350,7 +353,7 @@ "comparison_count, compared = validation.compare_plant_level_results_to_egrid(\n", " segment_to_compare, egrid_plant, PLANTS_MISSING_FROM_EGRID\n", ")\n", - "comparison_count\n" + "comparison_count" ] }, { @@ -359,7 +362,7 @@ "metadata": {}, "outputs": [], "source": [ - "validation.compare_egrid_fuel_total(segment_to_compare, egrid_plant).sum()\n" + "validation.compare_egrid_fuel_total(segment_to_compare, egrid_plant).sum()" ] }, { @@ -379,7 +382,7 @@ "comparison_count, compared = validation.compare_plant_level_results_to_egrid(\n", " segment_to_compare, egrid_plant, PLANTS_MISSING_FROM_EGRID\n", ")\n", - "comparison_count\n" + "comparison_count" ] }, { @@ -413,12 +416,32 @@ "metric = \"so2_mass_lb\"\n", "status = \"<50%\"\n", "\n", - "comparison_df = comparison_df.merge(egrid_plant.set_index(\"plant_id_egrid\")[[metric]], how=\"left\", left_index=True, right_index=True, 
suffixes=(None,\"_egrid\"))\n", - "comparison_df = comparison_df.merge(annual_plant_results.set_index(\"plant_id_egrid\")[[metric]], how=\"left\", left_index=True, right_index=True, suffixes=(None,\"_calc\"))\n", + "comparison_df = comparison_df.merge(\n", + " egrid_plant.set_index(\"plant_id_egrid\")[[metric]],\n", + " how=\"left\",\n", + " left_index=True,\n", + " right_index=True,\n", + " suffixes=(None, \"_egrid\"),\n", + ")\n", + "comparison_df = comparison_df.merge(\n", + " annual_plant_results.set_index(\"plant_id_egrid\")[[metric]],\n", + " how=\"left\",\n", + " left_index=True,\n", + " right_index=True,\n", + " suffixes=(None, \"_calc\"),\n", + ")\n", "\n", "# show the data\n", - "columns_to_show = [\"plant_name_eia\", \"ba_code\", \"state\", metric, f\"{metric}_status\", f\"{metric}_egrid\", f\"{metric}_calc\"]\n", - "comparison_df.loc[(comparison_df[f\"{metric}_status\"] == status), columns_to_show]\n" + "columns_to_show = [\n", + " \"plant_name_eia\",\n", + " \"ba_code\",\n", + " \"state\",\n", + " metric,\n", + " f\"{metric}_status\",\n", + " f\"{metric}_egrid\",\n", + " f\"{metric}_calc\",\n", + "]\n", + "comparison_df.loc[(comparison_df[f\"{metric}_status\"] == status), columns_to_show]" ] }, { @@ -459,7 +482,9 @@ "# aggregate the plant data up to the BA level\n", "egrid_plant_ba_agg = egrid_plant.groupby([\"ba_code\"]).sum()[DATA_COLUMNS].reset_index()\n", "\n", - "egrid_plant_ba_agg[\"generated_co2_rate_lb_per_mwh\"] = egrid_plant_ba_agg[\"co2_mass_lb\"] / egrid_plant_ba_agg[\"net_generation_mwh\"]" + "egrid_plant_ba_agg[\"generated_co2_rate_lb_per_mwh\"] = (\n", + " egrid_plant_ba_agg[\"co2_mass_lb\"] / egrid_plant_ba_agg[\"net_generation_mwh\"]\n", + ")" ] }, { @@ -488,7 +513,9 @@ "\n", "calculated_ba = pd.concat(calculated_ba, axis=0)\n", "\n", - "calculated_ba[\"generated_co2_rate_lb_per_mwh\"] = calculated_ba[\"co2_mass_lb\"] / calculated_ba[\"net_generation_mwh\"]\n" + "calculated_ba[\"generated_co2_rate_lb_per_mwh\"] = (\n", + " 
calculated_ba[\"co2_mass_lb\"] / calculated_ba[\"net_generation_mwh\"]\n", + ")" ] }, { @@ -503,7 +530,7 @@ " - egrid_plant_ba_agg.set_index(\"ba_code\").replace(0, 0.1)\n", " )\n", " / egrid_plant_ba_agg.set_index(\"ba_code\").replace(0, 0.1)\n", - ").round(2)\n" + ").round(2)" ] }, { @@ -583,7 +610,7 @@ "]\n", "\n", "with pd.option_context(\"display.max_rows\", None, \"display.max_columns\", None):\n", - " display(ba_metric[~(ba_metric[columns_to_check] == 0).all(axis=1)])\n" + " display(ba_metric[~(ba_metric[columns_to_check] == 0).all(axis=1)])" ] }, { @@ -600,7 +627,7 @@ "metadata": {}, "outputs": [], "source": [ - "plant_to_explore = 58223\n" + "plant_to_explore = 58223" ] }, { @@ -609,7 +636,7 @@ "metadata": {}, "outputs": [], "source": [ - "egrid_plant[egrid_plant[\"plant_id_eia\"] == plant_to_explore]\n" + "egrid_plant[egrid_plant[\"plant_id_eia\"] == plant_to_explore]" ] }, { @@ -618,7 +645,7 @@ "metadata": {}, "outputs": [], "source": [ - "annual_plant_results[annual_plant_results[\"plant_id_eia\"] == plant_to_explore]\n" + "annual_plant_results[annual_plant_results[\"plant_id_eia\"] == plant_to_explore]" ] }, { @@ -627,7 +654,7 @@ "metadata": {}, "outputs": [], "source": [ - "eia923_allocated[eia923_allocated[\"plant_id_eia\"] == plant_to_explore]\n" + "eia923_allocated[eia923_allocated[\"plant_id_eia\"] == plant_to_explore]" ] }, { @@ -639,7 +666,7 @@ "eia923_allocated.loc[\n", " eia923_allocated[\"plant_id_eia\"] == plant_to_explore,\n", " [\"generator_id\", \"subplant_id\"],\n", - "].drop_duplicates()\n" + "].drop_duplicates()" ] } ], diff --git a/notebooks/visualization/map_visualization.ipynb b/notebooks/visualization/map_visualization.ipynb index 73284fbf..0f5c85e1 100644 --- a/notebooks/visualization/map_visualization.ipynb +++ b/notebooks/visualization/map_visualization.ipynb @@ -1,623 +1,788 @@ { - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Maps for announcement blog post\n", - "\n", - "Visualization 2: The 
carbon intensity of consumed electricity differs from generated electricity\n", - "* Show a static carbon flow map focused on a single BA plus all directly-interconnected BAs\n", - "* Pick an hour when there is some particularly dirty electricity getting imported\n", - "* Each BA would be represented by a bubble, where the color changes based on carbon intensity, and carbon flows would be represented by colored arrows between the bubbles. \n", - "* To illustrate the difference between produced and consumed, we might want to have a pair of bubbles for each BA - one that shows the produced CI and one that shows the consumed CI. If we do this, we probably don’t want to vary the size of each bubble based on total generation. Or maybe we could do a split bubble - the top half shows produced CI and the bottom shows consumed CI.\n", - "\n", - "\n", - "Visualization 3: Animating hourly and consumed emissions for the whole county\n", - "* This animation should put the previous two concepts together and show how carbon flows and how CI changes across the entire country for a single day (or a week?)\n", - "* We could also potentially have two maps side by side: one that shows annual averages in bubbles (with no carbon flow), and one that shows the animated hourly flow (to really draw the distinction between annual and hourly datasets)\n", - "\n", - "\n", - "### Ref for making gif: \n", - "`https://stackoverflow.com/questions/753190/programmatically-generate-video-or-animated-gif-in-python`" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import plotly.express as px\n", - "import plotly.graph_objects as go\n", - "from plotly.colors import * \n", - "import plotly.io as pio\n", - "import os\n", - "import pandas as pd\n", - "import numpy as np\n", - "from PIL import Image\n", - "import imageio" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - 
"%reload_ext autoreload\n", - "%autoreload 2\n", - "\n", - "# # Tell python where to look for modules.\n", - "import sys\n", - "\n", - "sys.path.append(\"../../src/\")\n", - "\n", - "import output_data" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "ba_coords = pd.read_csv(\"resources/ba_coords.csv\", index_col=0, dtype={\"cx\":np.float64, \"cy\":np.float64})\n", - "ba_meta = pd.read_csv(\"../../data/manual/ba_reference.csv\", index_col=0)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Note: 150+ BAs are not in ba_coords\n", - "ba_list = ba_meta[(ba_meta.ba_category != \"misellaneous\") & (ba_meta.us_ba) & (ba_meta.index.isin(ba_coords.index))].index\n", - "ba_list = [ba for ba in ba_list if (f\"{ba}.csv\" in os.listdir(\"../../data/results/2020/power_sector_data/hourly/us_units/\"))]" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "cleaned_io = pd.read_csv(\"../../data/outputs/2020/eia930/eia930_elec.csv\", index_col=0, parse_dates=True)\n", - "cleaned_io = cleaned_io[[c for c in cleaned_io.columns if \".ID.\" in c]]" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "all = []\n", - "for ba in ba_list:\n", - " produced = pd.read_csv(f\"../../data/results/2020/power_sector_data/hourly/us_units/{ba}.csv\", index_col=\"datetime_utc\", parse_dates=True, usecols=[\"datetime_utc\",\"fuel_category\", \"net_generation_mwh\", \"generated_co2_rate_lb_per_mwh_for_electricity\"])\n", - " produced = produced[produced.fuel_category == \"total\"]\n", - " produced = produced.drop(columns=[\"fuel_category\"])\n", - " \n", - " if ba_meta.loc[ba,\"ba_category\"] == \"generation_only\":\n", - " consumed = pd.DataFrame(index=produced.index, columns=[[\"consumed_co2_rate_lb_per_mwh_for_electricity\"]], 
dtype=np.float64)\n", - " else:\n", - " consumed = pd.read_csv(f\"../../data/results/2020/carbon_accounting/hourly/us_units/{ba}.csv\", index_col=\"datetime_utc\", parse_dates=True, usecols=[\"datetime_utc\", \"consumed_co2_rate_lb_per_mwh_for_electricity\"])\n", - " consumed.columns = consumed.columns.get_level_values(0)\n", - "\n", - " both = pd.concat([produced, consumed], axis='columns')\n", - " #both = both.loc[range_start:range_end]\n", - " both = both.reset_index()\n", - " both[\"BA\"] = ba\n", - " all.append(both)\n", - "\n", - "all = pd.concat(all)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Add coordinates\n", - "all = all.merge(ba_coords, how='left', validate='many_to_one', left_on=\"BA\", right_index=True)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# src: https://community.plotly.com/t/how-to-include-a-colorscale-for-color-of-line-graphs/38002/3 \n", - "from ast import literal_eval\n", - "def get_color_for_val(val, vmin, vmax, pl_colors):\n", - " if pl_colors[0][:3] != 'rgb':\n", - " raise ValueError('This function works only with Plotly rgb-colorscales')\n", - " if vmin >= vmax:\n", - " raise ValueError('vmin should be < vmax')\n", - "\n", - " scale = [round(k / (len(pl_colors)), 3) for k in range(len(pl_colors) + 1)]\n", - "\n", - " colors_01 = np.array([literal_eval(color[3:]) for color in pl_colors]) / 255 # color codes in [0,1]\n", - "\n", - " v = (val - vmin) / (vmax - vmin) # val is mapped to v in [0,1]\n", - " # find two consecutive values in plotly_scale such that v is in the corresponding interval\n", - " idx = 0\n", - "\n", - " while idx < (len(scale)-2) and (v > scale[idx + 1]):\n", - " idx += 1\n", - "\n", - " vv = (v - scale[idx]) / (scale[idx + 1] - scale[idx])\n", - "\n", - " # get [0,1]-valued color code representing the rgb color corresponding to val\n", - " if idx == len(pl_colors)-1: # 
Make this work when some values exceed range\n", - " val_color01 = colors_01[idx] # color by last color \n", - " else: \n", - " val_color01 = colors_01[idx] + vv * (colors_01[idx + 1] - colors_01[idx])\n", - "\n", - " val_color_0255 = (255 * val_color01 + 0.5).astype(int)\n", - " return f'rgb{str(tuple(val_color_0255))}'" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "range_start = \"2020-08-01T04:00+00\"\n", - "range_end = \"2020-08-3T04:00+00\"\n", - "\n", - "#range_start = \"2020-07-21T12:00+00\"\n", - "#range_end = \"2020-07-23T12:00+00\"" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "dir_name = \"maps\"\n", - "\n", - "c_max = 2200 #np.floor(all.generated_co2_rate_lb_per_mwh_for_electricity.max() + 100)\n", - "\n", - "hours = all.datetime_utc[(all.datetime_utc < pd.to_datetime(range_end)) & (all.datetime_utc > pd.to_datetime(range_start)) & (all.BA == \"CISO\")]\n", - "for hour in hours:\n", - " print(hour, end=\"...\")\n", - " io_toplot = cleaned_io.loc[hour]\n", - " toplot = all[all.datetime_utc == hour]\n", - " fig = go.Figure()\n", - "\n", - " toplot.loc[toplot.net_generation_mwh < 1, \"net_generation_mwh\"] = 1\n", - " sizes = np.log(toplot.net_generation_mwh)/np.log(1.5)\n", - " offset = sizes/2.5\n", - "\n", - " colorscale = diverging.RdYlGn_r\n", - " #colorscale = cmocean.solar_r\n", - "\n", - " ### From when \n", - " # max_width = io_toplot.max()\n", - " # width_factor = 8/max_width\n", - " #width_factor = 1/200\n", - " for name, val in io_toplot.iteritems():\n", - " if val <= 0: \n", - " continue \n", - " bas = name.split(\".\")[1].split(\"-\")\n", - " (ba1, ba2) = bas\n", - "\n", - " next = False\n", - " for ba in bas: \n", - " if ba not in ba_coords.index:\n", - " next=True\n", - " if ba not in toplot.BA.unique():\n", - " next=True\n", - " if next:\n", - " continue\n", - "\n", - " color = toplot.loc[toplot.BA == 
ba1, \"generated_co2_rate_lb_per_mwh_for_electricity\"].to_numpy()[0]\n", - "\n", - " fig.add_trace(\n", - " go.Scatter(x = ba_coords.loc[bas,\"cx\"], y = ba_coords.loc[bas,\"cy\"], opacity=1.0,\n", - " mode=\"lines\", line = dict(color=get_color_for_val(color, 0, c_max, colorscale), width=2), showlegend=False\n", - " )\n", - " )\n", - "\n", - " ################################# Plot BAs \n", - " toplot.loc[toplot.net_generation_mwh < 1, \"net_generation_mwh\"] = 1\n", - " sizes = np.log(toplot.net_generation_mwh)/np.log(1.5)\n", - " offset = sizes/1.6\n", - "\n", - " # Zero-generation BAs: plot under BAs with non-zero gen \n", - " zero_gen_bas = (toplot.net_generation_mwh == 1) # (we set to 1 above to make log work)\n", - " fig.add_trace(\n", - " go.Scatter(x=toplot.loc[zero_gen_bas,\"cx\"], y=toplot.loc[zero_gen_bas,\"cy\"]+(2/2.5), mode=\"markers\", \n", - " hoverinfo=\"text\", text=toplot.loc[zero_gen_bas,\"BA\"], \n", - " marker_symbol=\"triangle-up\",\n", - " marker=dict(color='lightgrey', \n", - " line=dict(width=1, color='DarkSlateGrey'),\n", - " size=7, opacity=1.0,\n", - " sizemode='diameter'),\n", - " showlegend=False \n", - " )\n", - " )\n", - "\n", - " fig.add_trace(\n", - " go.Scatter(x=toplot.cx, y=toplot.cy, mode=\"markers\", hoverinfo=\"text\", text=toplot.BA, \n", - " marker_symbol=\"triangle-up\", \n", - " opacity=1.0,\n", - " marker=dict(color=toplot.generated_co2_rate_lb_per_mwh_for_electricity, size=sizes,\n", - " sizemode='diameter', cmin=0, cmax=c_max, opacity=1.0,\n", - " line=dict(width=1, color='DarkSlateGrey'),\n", - " colorscale=\"rdylgn_r\"),\n", - " name=\"Generated\", \n", - " showlegend=False\n", - " )\n", - " )\n", - " consumed_toplot = ~toplot.consumed_co2_rate_lb_per_mwh_for_electricity.isna()\n", - " fig.add_trace(\n", - " go.Scatter(x=toplot.loc[consumed_toplot,\"cx\"], y=toplot.loc[consumed_toplot,\"cy\"]+offset[consumed_toplot], mode=\"markers\", \n", - " hoverinfo=\"text\", text=toplot.loc[consumed_toplot,\"BA\"], \n", - " 
marker_symbol=\"triangle-down\",\n", - " marker=dict(color=toplot.loc[consumed_toplot,\"consumed_co2_rate_lb_per_mwh_for_electricity\"], \n", - " size=sizes[consumed_toplot], opacity=1.0,\n", - " line=dict(width=1, color='DarkSlateGrey'),\n", - " sizemode='diameter', cmin=0, cmax=c_max, \n", - " colorbar=dict(\n", - " title=\"Emission rate
(lbs/MWh)\", orientation='v', len=.8, \n", - " thickness=20, yanchor='bottom', y=0, xpad=20\n", - " ),\n", - " colorscale=\"rdylgn_r\"\n", - " ),\n", - " name=\"Consumed\", \n", - " showlegend=False \n", - " )\n", - " )\n", - "\n", - " # Legends: don't want colored markers\n", - " fig.add_trace(\n", - " go.Scatter(x=[-10], y=[-10], mode=\"markers\", \n", - " marker_symbol=\"triangle-up\",\n", - " marker=dict(color='white', line=dict(width=2, color='DarkSlateGrey'), size=10),\n", - " name=\"Generated\", \n", - " )\n", - " )\n", - " fig.add_trace(\n", - " go.Scatter(x=[-10], y=[-10], mode=\"markers\", \n", - " marker_symbol=\"triangle-down\",\n", - " marker=dict(color='white', line=dict(width=2, color='DarkSlateGrey'), size=10),\n", - " name=\"Consumed\", \n", - " )\n", - " )\n", - " fig.update_yaxes(range=(550,0)) # autorange=\"reversed\")\n", - " fig.update_xaxes(range=(0,800))\n", - "\n", - " # Add images\n", - " fig.add_layout_image(\n", - " dict(\n", - " source=Image.open(\"resources/usa.png\"),\n", - " xref=\"x\",\n", - " yref=\"y\",\n", - " x=10,\n", - " y=0,\n", - " sizex=790,\n", - " sizey=550,\n", - " sizing=\"stretch\",\n", - " opacity=0.5,\n", - " layer=\"below\")\n", - " )\n", - "\n", - " # Add images\n", - " fig.add_layout_image(\n", - " dict(\n", - " source=Image.open(\"resources/legend_bottom_smaller.png\"),\n", - " xref=\"x\",\n", - " yref=\"y\",\n", - " x=-20,\n", - " y=400,\n", - " sizex=260,\n", - " sizey=200,\n", - " sizing=\"contain\",\n", - " opacity=1.0,\n", - " layer=\"below\")\n", - " )\n", - "\n", - " # Set templates\n", - " fig.update_layout(template=\"plotly_white\", width=800, height=600, \n", - " yaxis_visible=False, xaxis_visible=False,\n", - " title=hour.tz_convert(\"US/Eastern\").strftime(\"%B %-d, %Y - %-I:00 %p ET\"))\n", - " #fig.show()\n", - " os.makedirs(f\"outputs/{dir_name}/\", exist_ok=True)\n", - " fig.write_image(f\"outputs/{dir_name}/{hour}.png\", scale=2)\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - 
"metadata": {}, - "outputs": [], - "source": [ - "## Make gif \n", - "images = []\n", - "files = [f for f in os.listdir(f\"outputs/{dir_name}/\") if \".png\" in f]\n", - "files.sort()\n", - "for f in files:\n", - " images.append(imageio.imread(f\"outputs/{dir_name}/\"+f))\n", - "imageio.mimsave(f\"outputs/movie_{dir_name}.gif\", images)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Now just CISO and neighbors. \n", - "\n", - "# Get list of CISO neighbors \n", - "ciso_interchanges = [c for c in cleaned_io.columns if \"CISO\" in c]\n", - "ciso_bas = []\n", - "for ci in ciso_interchanges: \n", - " ba1, ba2 = ci.split(\".\")[1].split(\"-\")\n", - " if ba1 not in ciso_bas: \n", - " ciso_bas.append(ba1)\n", - " if ba2 not in ciso_bas:\n", - " ciso_bas.append(ba2)\n", - "\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Identify day with max rate difference in CISO \n", - "tester = (all[all.BA==\"CISO\"]).copy()\n", - "tester[\"difference\"] = tester.consumed_co2_rate_lb_per_mwh_for_electricity - tester.generated_co2_rate_lb_per_mwh_for_electricity\n", - "tester.difference.abs().max()\n", - "tester[tester.difference == tester.difference.abs().max()]" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Hour with max CISO difference generated / consumed \n", - "hour = '2020-09-25 12:00:00+00:00'\n", - "\n", - "io_toplot = cleaned_io.loc[hour, ciso_interchanges]\n", - "toplot = all[all.datetime_utc == hour]\n", - "toplot = toplot[toplot.BA.isin(ciso_bas)]\n", - "fig = go.Figure()\n", - "\n", - "colorscale = diverging.RdYlGn_r\n", - "#colorscale = cmocean.solar_r\n", - "\n", - "c_max = np.floor(toplot.generated_co2_rate_lb_per_mwh_for_electricity.max() + 100)\n", - "\n", - "### From when \n", - "# max_width = io_toplot.max()\n", - "# width_factor = 8/max_width\n", - 
"#width_factor = 1/200\n", - "for name, val in io_toplot.iteritems():\n", - " if val <= 0: \n", - " continue \n", - " bas = name.split(\".\")[1].split(\"-\")\n", - " (ba1, ba2) = bas\n", - "\n", - " next = False\n", - " for ba in bas: \n", - " if ba not in ba_coords.index:\n", - " next=True\n", - " if ba not in toplot.BA.unique():\n", - " next=True\n", - " if next:\n", - " continue\n", - "\n", - " color = toplot.loc[toplot.BA == ba1, \"generated_co2_rate_lb_per_mwh_for_electricity\"].to_numpy()[0]\n", - " print(color)\n", - "\n", - " fig.add_trace(\n", - " go.Scatter(x = ba_coords.loc[bas,\"cx\"], y = ba_coords.loc[bas,\"cy\"], opacity=1.0,\n", - " mode=\"lines\", line = dict(color=get_color_for_val(color, 0, c_max, colorscale), width=2), showlegend=False\n", - " )\n", - " )\n", - "\n", - "################################# Plot BAs \n", - "toplot.loc[toplot.net_generation_mwh < 1, \"net_generation_mwh\"] = 1\n", - "sizes = np.log(toplot.net_generation_mwh)/np.log(1.5)\n", - "offset = sizes/1.6\n", - "fig.add_trace(\n", - " go.Scatter(x=toplot.cx, y=toplot.cy-offset, mode=\"markers\", hoverinfo=\"text\", text=toplot.BA, \n", - " marker_symbol=\"triangle-up\", \n", - " opacity=1.0,\n", - " marker=dict(line=dict(width=1, color='DarkSlateGrey'),\n", - " color=toplot.generated_co2_rate_lb_per_mwh_for_electricity, size=sizes,\n", - " sizemode='diameter', cmin=0, cmax=c_max, opacity=1.0,\n", - " colorscale=\"rdylgn_r\"),\n", - " name=\"Generated\", \n", - " showlegend=False\n", - " )\n", - ")\n", - "fig.add_trace(\n", - " go.Scatter(x=toplot.cx, y=toplot.cy, mode=\"markers\", hoverinfo=\"text\", text=toplot.BA, \n", - " marker_symbol=\"triangle-down\",\n", - " marker=dict(color=toplot.consumed_co2_rate_lb_per_mwh_for_electricity, size=sizes, opacity=1.0,\n", - " line=dict(width=1, color='DarkSlateGrey'),\n", - " sizemode='diameter', cmin=0, cmax=c_max, colorbar=dict(\n", - " title=\"Emission rate
(lbs/MWh)\", orientation='v', len=.8, thickness=20, yanchor='bottom', y=0, xpad=20\n", - " ),\n", - " colorscale=\"rdylgn_r\"),\n", - " name=\"Consumed\", \n", - " showlegend=False \n", - " )\n", - ")\n", - "\n", - "\n", - "# Legends: don't want colored markers\n", - "# Legends: don't want colored markers\n", - "fig.add_trace(\n", - " go.Scatter(x=[-10], y=[-10], mode=\"markers\", \n", - " marker_symbol=\"triangle-up\",\n", - " marker=dict(color='white', line=dict(width=2, color='DarkSlateGrey'), size=10),\n", - " name=\"Generated\", \n", - " )\n", - ")\n", - "fig.add_trace(\n", - " go.Scatter(x=[-10], y=[-10], mode=\"markers\", \n", - " marker_symbol=\"triangle-down\",\n", - " marker=dict(color='white', line=dict(width=2, color='DarkSlateGrey'), size=10),\n", - " name=\"Consumed\", \n", - " )\n", - ")\n", - "fig.update_yaxes(range=(500,0)) # autorange=\"reversed\")\n", - "fig.update_xaxes(range=(0,200))\n", - "\n", - "## loop through the labels and add them as annotations\n", - "for x in zip(toplot.BA, toplot.cx, toplot.cy):\n", - " left_bas = [\"BANC\",\"TIDC\",\"CISO\",\"LDWP\",\"IID\"]\n", - " delta = (-12 if x[0] in left_bas else 12)\n", - " fig.add_annotation(\n", - " x=x[1] + delta,\n", - " y=x[2],\n", - " text=x[0],\n", - " showarrow=False,\n", - " xanchor=('right' if x[0] in left_bas else 'left')\n", - " )\n", - "\n", - "# Add images\n", - "fig.add_layout_image(\n", - " dict(\n", - " source=Image.open(\"resources/usa.png\"),\n", - " xref=\"x\",\n", - " yref=\"y\",\n", - " x=10,\n", - " y=0,\n", - " sizex=790,\n", - " sizey=550,\n", - " sizing=\"stretch\",\n", - " opacity=0.5,\n", - " layer=\"below\")\n", - ")\n", - "\n", - "# Add images\n", - "fig.add_layout_image(\n", - " dict(\n", - " source=Image.open(\"resources/legend_bottom_smaller.png\"),\n", - " xref=\"x\",\n", - " yref=\"y\",\n", - " x=-20,\n", - " y=360,\n", - " sizex=220,\n", - " sizey=144,\n", - " sizing=\"contain\",\n", - " opacity=1.0,\n", - " layer=\"below\")\n", - ")\n", - "\n", - "# Set 
templates\n", - "fig.update_layout(template=\"plotly_white\", width=400, height=550,\n", - " yaxis_visible=False, xaxis_visible=False,\n", - " title=pd.to_datetime(hour).tz_convert(\"US/Pacific\").strftime(\"%B %-d, %Y - %-I:00 %p PT\")\n", - ")\n", - "fig.show()\n", - "fig.write_image(f\"outputs/viz2_legend.png\", scale=3) " - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "toplot[\"difference\"] = (toplot.generated_co2_rate_lb_per_mwh_for_electricity - toplot.consumed_co2_rate_lb_per_mwh_for_electricity)/toplot.generated_co2_rate_lb_per_mwh_for_electricity\n", - "toplot" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "all[\"difference\"] = ((all.generated_co2_rate_lb_per_mwh_for_electricity - all.consumed_co2_rate_lb_per_mwh_for_electricity).abs())* all.net_generation_mwh\n", - "px.line(all.groupby(\"datetime_utc\").mean()[\"difference\"])" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "all[all.datetime_utc==\"12-01-2020 T05:00+00:00\"].to_csv(\"outputs/problem_date.csv\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "### Old code for arrows\n", - " # # Arrows have to be added separately\n", - " # line_size = val*width_factor\n", - " # fig.add_annotation(\n", - " # x=ba_coords.loc[ba2,\"cx\"], # arrows' head\n", - " # y=ba_coords.loc[ba2,\"cy\"], # arrows' head\n", - " # ax=ba_coords.loc[ba1,\"cx\"], # arrows' tail\n", - " # ay=ba_coords.loc[ba1,\"cy\"], # arrows' tail\n", - " # xref='x',\n", - " # yref='y',\n", - " # axref='x',\n", - " # ayref='y',\n", - " # text='', # if you want only the arrow\n", - " # showarrow=True,\n", - " # arrowhead=1,\n", - " # arrowsize=1, #max(.3, line_size),\n", - " # arrowwidth=1,\n", - " # arrowcolor='royalblue'\n", - " # )" - ] - }, - { - "cell_type": "code", - 
"execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3.10.5 ('hourly_egrid')", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.10.5" - }, - "orig_nbformat": 4, - "vscode": { - "interpreter": { - "hash": "65c02dfd2dc2ef471c0b5088763a28c1faaa7cad28937ca42fadf51e669fd8e8" - } - } + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Maps for announcement blog post\n", + "\n", + "Visualization 2: The carbon intensity of consumed electricity differs from generated electricity\n", + "* Show a static carbon flow map focused on a single BA plus all directly-interconnected BAs\n", + "* Pick an hour when there is some particularly dirty electricity getting imported\n", + "* Each BA would be represented by a bubble, where the color changes based on carbon intensity, and carbon flows would be represented by colored arrows between the bubbles. \n", + "* To illustrate the difference between produced and consumed, we might want to have a pair of bubbles for each BA - one that shows the produced CI and one that shows the consumed CI. If we do this, we probably don’t want to vary the size of each bubble based on total generation. 
Or maybe we could do a split bubble - the top half shows produced CI and the bottom shows consumed CI.\n", + "\n", + "\n", + "Visualization 3: Animating hourly and consumed emissions for the whole county\n", + "* This animation should put the previous two concepts together and show how carbon flows and how CI changes across the entire country for a single day (or a week?)\n", + "* We could also potentially have two maps side by side: one that shows annual averages in bubbles (with no carbon flow), and one that shows the animated hourly flow (to really draw the distinction between annual and hourly datasets)\n", + "\n", + "\n", + "### Ref for making gif: \n", + "`https://stackoverflow.com/questions/753190/programmatically-generate-video-or-animated-gif-in-python`" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import plotly.express as px\n", + "import plotly.graph_objects as go\n", + "from plotly.colors import *\n", + "import plotly.io as pio\n", + "import os\n", + "import pandas as pd\n", + "import numpy as np\n", + "from PIL import Image\n", + "import imageio" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%reload_ext autoreload\n", + "%autoreload 2\n", + "\n", + "# # Tell python where to look for modules.\n", + "import sys\n", + "\n", + "sys.path.append(\"../../src/\")\n", + "\n", + "import output_data" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "ba_coords = pd.read_csv(\n", + " \"resources/ba_coords.csv\", index_col=0, dtype={\"cx\": np.float64, \"cy\": np.float64}\n", + ")\n", + "ba_meta = pd.read_csv(\"../../data/manual/ba_reference.csv\", index_col=0)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Note: 150+ BAs are not in ba_coords\n", + "ba_list = ba_meta[\n", + " (ba_meta.ba_category 
!= \"misellaneous\")\n", + " & (ba_meta.us_ba)\n", + " & (ba_meta.index.isin(ba_coords.index))\n", + "].index\n", + "ba_list = [\n", + " ba\n", + " for ba in ba_list\n", + " if (\n", + " f\"{ba}.csv\"\n", + " in os.listdir(\"../../data/results/2020/power_sector_data/hourly/us_units/\")\n", + " )\n", + "]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "cleaned_io = pd.read_csv(\n", + " \"../../data/outputs/2020/eia930/eia930_elec.csv\", index_col=0, parse_dates=True\n", + ")\n", + "cleaned_io = cleaned_io[[c for c in cleaned_io.columns if \".ID.\" in c]]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "all = []\n", + "for ba in ba_list:\n", + " produced = pd.read_csv(\n", + " f\"../../data/results/2020/power_sector_data/hourly/us_units/{ba}.csv\",\n", + " index_col=\"datetime_utc\",\n", + " parse_dates=True,\n", + " usecols=[\n", + " \"datetime_utc\",\n", + " \"fuel_category\",\n", + " \"net_generation_mwh\",\n", + " \"generated_co2_rate_lb_per_mwh_for_electricity\",\n", + " ],\n", + " )\n", + " produced = produced[produced.fuel_category == \"total\"]\n", + " produced = produced.drop(columns=[\"fuel_category\"])\n", + "\n", + " if ba_meta.loc[ba, \"ba_category\"] == \"generation_only\":\n", + " consumed = pd.DataFrame(\n", + " index=produced.index,\n", + " columns=[[\"consumed_co2_rate_lb_per_mwh_for_electricity\"]],\n", + " dtype=np.float64,\n", + " )\n", + " else:\n", + " consumed = pd.read_csv(\n", + " f\"../../data/results/2020/carbon_accounting/hourly/us_units/{ba}.csv\",\n", + " index_col=\"datetime_utc\",\n", + " parse_dates=True,\n", + " usecols=[\"datetime_utc\", \"consumed_co2_rate_lb_per_mwh_for_electricity\"],\n", + " )\n", + " consumed.columns = consumed.columns.get_level_values(0)\n", + "\n", + " both = pd.concat([produced, consumed], axis=\"columns\")\n", + " # both = both.loc[range_start:range_end]\n", + " both = 
both.reset_index()\n", + " both[\"BA\"] = ba\n", + " all.append(both)\n", + "\n", + "all = pd.concat(all)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Add coordinates\n", + "all = all.merge(\n", + " ba_coords, how=\"left\", validate=\"many_to_one\", left_on=\"BA\", right_index=True\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# src: https://community.plotly.com/t/how-to-include-a-colorscale-for-color-of-line-graphs/38002/3\n", + "from ast import literal_eval\n", + "\n", + "\n", + "def get_color_for_val(val, vmin, vmax, pl_colors):\n", + " if pl_colors[0][:3] != \"rgb\":\n", + " raise ValueError(\"This function works only with Plotly rgb-colorscales\")\n", + " if vmin >= vmax:\n", + " raise ValueError(\"vmin should be < vmax\")\n", + "\n", + " scale = [round(k / (len(pl_colors)), 3) for k in range(len(pl_colors) + 1)]\n", + "\n", + " colors_01 = (\n", + " np.array([literal_eval(color[3:]) for color in pl_colors]) / 255\n", + " ) # color codes in [0,1]\n", + "\n", + " v = (val - vmin) / (vmax - vmin) # val is mapped to v in [0,1]\n", + " # find two consecutive values in plotly_scale such that v is in the corresponding interval\n", + " idx = 0\n", + "\n", + " while idx < (len(scale) - 2) and (v > scale[idx + 1]):\n", + " idx += 1\n", + "\n", + " vv = (v - scale[idx]) / (scale[idx + 1] - scale[idx])\n", + "\n", + " # get [0,1]-valued color code representing the rgb color corresponding to val\n", + " if idx == len(pl_colors) - 1: # Make this work when some values exceed range\n", + " val_color01 = colors_01[idx] # color by last color\n", + " else:\n", + " val_color01 = colors_01[idx] + vv * (colors_01[idx + 1] - colors_01[idx])\n", + "\n", + " val_color_0255 = (255 * val_color01 + 0.5).astype(int)\n", + " return f\"rgb{str(tuple(val_color_0255))}\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + 
"metadata": {}, + "outputs": [], + "source": [ + "range_start = \"2020-08-01T04:00+00\"\n", + "range_end = \"2020-08-3T04:00+00\"\n", + "\n", + "# range_start = \"2020-07-21T12:00+00\"\n", + "# range_end = \"2020-07-23T12:00+00\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "dir_name = \"maps\"\n", + "\n", + "c_max = 2200 # np.floor(all.generated_co2_rate_lb_per_mwh_for_electricity.max() + 100)\n", + "\n", + "hours = all.datetime_utc[\n", + " (all.datetime_utc < pd.to_datetime(range_end))\n", + " & (all.datetime_utc > pd.to_datetime(range_start))\n", + " & (all.BA == \"CISO\")\n", + "]\n", + "for hour in hours:\n", + " print(hour, end=\"...\")\n", + " io_toplot = cleaned_io.loc[hour]\n", + " toplot = all[all.datetime_utc == hour]\n", + " fig = go.Figure()\n", + "\n", + " toplot.loc[toplot.net_generation_mwh < 1, \"net_generation_mwh\"] = 1\n", + " sizes = np.log(toplot.net_generation_mwh) / np.log(1.5)\n", + " offset = sizes / 2.5\n", + "\n", + " colorscale = diverging.RdYlGn_r\n", + " # colorscale = cmocean.solar_r\n", + "\n", + " ### From when\n", + " # max_width = io_toplot.max()\n", + " # width_factor = 8/max_width\n", + " # width_factor = 1/200\n", + " for name, val in io_toplot.iteritems():\n", + " if val <= 0:\n", + " continue\n", + " bas = name.split(\".\")[1].split(\"-\")\n", + " (ba1, ba2) = bas\n", + "\n", + " next = False\n", + " for ba in bas:\n", + " if ba not in ba_coords.index:\n", + " next = True\n", + " if ba not in toplot.BA.unique():\n", + " next = True\n", + " if next:\n", + " continue\n", + "\n", + " color = toplot.loc[\n", + " toplot.BA == ba1, \"generated_co2_rate_lb_per_mwh_for_electricity\"\n", + " ].to_numpy()[0]\n", + "\n", + " fig.add_trace(\n", + " go.Scatter(\n", + " x=ba_coords.loc[bas, \"cx\"],\n", + " y=ba_coords.loc[bas, \"cy\"],\n", + " opacity=1.0,\n", + " mode=\"lines\",\n", + " line=dict(\n", + " color=get_color_for_val(color, 0, c_max, colorscale), 
width=2\n", + " ),\n", + " showlegend=False,\n", + " )\n", + " )\n", + "\n", + " ################################# Plot BAs\n", + " toplot.loc[toplot.net_generation_mwh < 1, \"net_generation_mwh\"] = 1\n", + " sizes = np.log(toplot.net_generation_mwh) / np.log(1.5)\n", + " offset = sizes / 1.6\n", + "\n", + " # Zero-generation BAs: plot under BAs with non-zero gen\n", + " zero_gen_bas = (\n", + " toplot.net_generation_mwh == 1\n", + " ) # (we set to 1 above to make log work)\n", + " fig.add_trace(\n", + " go.Scatter(\n", + " x=toplot.loc[zero_gen_bas, \"cx\"],\n", + " y=toplot.loc[zero_gen_bas, \"cy\"] + (2 / 2.5),\n", + " mode=\"markers\",\n", + " hoverinfo=\"text\",\n", + " text=toplot.loc[zero_gen_bas, \"BA\"],\n", + " marker_symbol=\"triangle-up\",\n", + " marker=dict(\n", + " color=\"lightgrey\",\n", + " line=dict(width=1, color=\"DarkSlateGrey\"),\n", + " size=7,\n", + " opacity=1.0,\n", + " sizemode=\"diameter\",\n", + " ),\n", + " showlegend=False,\n", + " )\n", + " )\n", + "\n", + " fig.add_trace(\n", + " go.Scatter(\n", + " x=toplot.cx,\n", + " y=toplot.cy,\n", + " mode=\"markers\",\n", + " hoverinfo=\"text\",\n", + " text=toplot.BA,\n", + " marker_symbol=\"triangle-up\",\n", + " opacity=1.0,\n", + " marker=dict(\n", + " color=toplot.generated_co2_rate_lb_per_mwh_for_electricity,\n", + " size=sizes,\n", + " sizemode=\"diameter\",\n", + " cmin=0,\n", + " cmax=c_max,\n", + " opacity=1.0,\n", + " line=dict(width=1, color=\"DarkSlateGrey\"),\n", + " colorscale=\"rdylgn_r\",\n", + " ),\n", + " name=\"Generated\",\n", + " showlegend=False,\n", + " )\n", + " )\n", + " consumed_toplot = ~toplot.consumed_co2_rate_lb_per_mwh_for_electricity.isna()\n", + " fig.add_trace(\n", + " go.Scatter(\n", + " x=toplot.loc[consumed_toplot, \"cx\"],\n", + " y=toplot.loc[consumed_toplot, \"cy\"] + offset[consumed_toplot],\n", + " mode=\"markers\",\n", + " hoverinfo=\"text\",\n", + " text=toplot.loc[consumed_toplot, \"BA\"],\n", + " marker_symbol=\"triangle-down\",\n", + " 
marker=dict(\n", + " color=toplot.loc[\n", + " consumed_toplot, \"consumed_co2_rate_lb_per_mwh_for_electricity\"\n", + " ],\n", + " size=sizes[consumed_toplot],\n", + " opacity=1.0,\n", + " line=dict(width=1, color=\"DarkSlateGrey\"),\n", + " sizemode=\"diameter\",\n", + " cmin=0,\n", + " cmax=c_max,\n", + " colorbar=dict(\n", + " title=\"Emission rate
(lbs/MWh)\",\n", + " orientation=\"v\",\n", + " len=0.8,\n", + " thickness=20,\n", + " yanchor=\"bottom\",\n", + " y=0,\n", + " xpad=20,\n", + " ),\n", + " colorscale=\"rdylgn_r\",\n", + " ),\n", + " name=\"Consumed\",\n", + " showlegend=False,\n", + " )\n", + " )\n", + "\n", + " # Legends: don't want colored markers\n", + " fig.add_trace(\n", + " go.Scatter(\n", + " x=[-10],\n", + " y=[-10],\n", + " mode=\"markers\",\n", + " marker_symbol=\"triangle-up\",\n", + " marker=dict(\n", + " color=\"white\", line=dict(width=2, color=\"DarkSlateGrey\"), size=10\n", + " ),\n", + " name=\"Generated\",\n", + " )\n", + " )\n", + " fig.add_trace(\n", + " go.Scatter(\n", + " x=[-10],\n", + " y=[-10],\n", + " mode=\"markers\",\n", + " marker_symbol=\"triangle-down\",\n", + " marker=dict(\n", + " color=\"white\", line=dict(width=2, color=\"DarkSlateGrey\"), size=10\n", + " ),\n", + " name=\"Consumed\",\n", + " )\n", + " )\n", + " fig.update_yaxes(range=(550, 0)) # autorange=\"reversed\")\n", + " fig.update_xaxes(range=(0, 800))\n", + "\n", + " # Add images\n", + " fig.add_layout_image(\n", + " dict(\n", + " source=Image.open(\"resources/usa.png\"),\n", + " xref=\"x\",\n", + " yref=\"y\",\n", + " x=10,\n", + " y=0,\n", + " sizex=790,\n", + " sizey=550,\n", + " sizing=\"stretch\",\n", + " opacity=0.5,\n", + " layer=\"below\",\n", + " )\n", + " )\n", + "\n", + " # Add images\n", + " fig.add_layout_image(\n", + " dict(\n", + " source=Image.open(\"resources/legend_bottom_smaller.png\"),\n", + " xref=\"x\",\n", + " yref=\"y\",\n", + " x=-20,\n", + " y=400,\n", + " sizex=260,\n", + " sizey=200,\n", + " sizing=\"contain\",\n", + " opacity=1.0,\n", + " layer=\"below\",\n", + " )\n", + " )\n", + "\n", + " # Set templates\n", + " fig.update_layout(\n", + " template=\"plotly_white\",\n", + " width=800,\n", + " height=600,\n", + " yaxis_visible=False,\n", + " xaxis_visible=False,\n", + " title=hour.tz_convert(\"US/Eastern\").strftime(\"%B %-d, %Y - %-I:00 %p ET\"),\n", + " )\n", + " # 
fig.show()\n", + " os.makedirs(f\"outputs/{dir_name}/\", exist_ok=True)\n", + " fig.write_image(f\"outputs/{dir_name}/{hour}.png\", scale=2)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "## Make gif\n", + "images = []\n", + "files = [f for f in os.listdir(f\"outputs/{dir_name}/\") if \".png\" in f]\n", + "files.sort()\n", + "for f in files:\n", + " images.append(imageio.imread(f\"outputs/{dir_name}/\" + f))\n", + "imageio.mimsave(f\"outputs/movie_{dir_name}.gif\", images)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Now just CISO and neighbors.\n", + "\n", + "# Get list of CISO neighbors\n", + "ciso_interchanges = [c for c in cleaned_io.columns if \"CISO\" in c]\n", + "ciso_bas = []\n", + "for ci in ciso_interchanges:\n", + " ba1, ba2 = ci.split(\".\")[1].split(\"-\")\n", + " if ba1 not in ciso_bas:\n", + " ciso_bas.append(ba1)\n", + " if ba2 not in ciso_bas:\n", + " ciso_bas.append(ba2)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Identify day with max rate difference in CISO\n", + "tester = (all[all.BA == \"CISO\"]).copy()\n", + "tester[\"difference\"] = (\n", + " tester.consumed_co2_rate_lb_per_mwh_for_electricity\n", + " - tester.generated_co2_rate_lb_per_mwh_for_electricity\n", + ")\n", + "tester.difference.abs().max()\n", + "tester[tester.difference == tester.difference.abs().max()]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Hour with max CISO difference generated / consumed\n", + "hour = \"2020-09-25 12:00:00+00:00\"\n", + "\n", + "io_toplot = cleaned_io.loc[hour, ciso_interchanges]\n", + "toplot = all[all.datetime_utc == hour]\n", + "toplot = toplot[toplot.BA.isin(ciso_bas)]\n", + "fig = go.Figure()\n", + "\n", + "colorscale = diverging.RdYlGn_r\n", + "# colorscale = 
cmocean.solar_r\n", + "\n", + "c_max = np.floor(toplot.generated_co2_rate_lb_per_mwh_for_electricity.max() + 100)\n", + "\n", + "### From when\n", + "# max_width = io_toplot.max()\n", + "# width_factor = 8/max_width\n", + "# width_factor = 1/200\n", + "for name, val in io_toplot.iteritems():\n", + " if val <= 0:\n", + " continue\n", + " bas = name.split(\".\")[1].split(\"-\")\n", + " (ba1, ba2) = bas\n", + "\n", + " next = False\n", + " for ba in bas:\n", + " if ba not in ba_coords.index:\n", + " next = True\n", + " if ba not in toplot.BA.unique():\n", + " next = True\n", + " if next:\n", + " continue\n", + "\n", + " color = toplot.loc[\n", + " toplot.BA == ba1, \"generated_co2_rate_lb_per_mwh_for_electricity\"\n", + " ].to_numpy()[0]\n", + " print(color)\n", + "\n", + " fig.add_trace(\n", + " go.Scatter(\n", + " x=ba_coords.loc[bas, \"cx\"],\n", + " y=ba_coords.loc[bas, \"cy\"],\n", + " opacity=1.0,\n", + " mode=\"lines\",\n", + " line=dict(color=get_color_for_val(color, 0, c_max, colorscale), width=2),\n", + " showlegend=False,\n", + " )\n", + " )\n", + "\n", + "################################# Plot BAs\n", + "toplot.loc[toplot.net_generation_mwh < 1, \"net_generation_mwh\"] = 1\n", + "sizes = np.log(toplot.net_generation_mwh) / np.log(1.5)\n", + "offset = sizes / 1.6\n", + "fig.add_trace(\n", + " go.Scatter(\n", + " x=toplot.cx,\n", + " y=toplot.cy - offset,\n", + " mode=\"markers\",\n", + " hoverinfo=\"text\",\n", + " text=toplot.BA,\n", + " marker_symbol=\"triangle-up\",\n", + " opacity=1.0,\n", + " marker=dict(\n", + " line=dict(width=1, color=\"DarkSlateGrey\"),\n", + " color=toplot.generated_co2_rate_lb_per_mwh_for_electricity,\n", + " size=sizes,\n", + " sizemode=\"diameter\",\n", + " cmin=0,\n", + " cmax=c_max,\n", + " opacity=1.0,\n", + " colorscale=\"rdylgn_r\",\n", + " ),\n", + " name=\"Generated\",\n", + " showlegend=False,\n", + " )\n", + ")\n", + "fig.add_trace(\n", + " go.Scatter(\n", + " x=toplot.cx,\n", + " y=toplot.cy,\n", + " 
mode=\"markers\",\n", + " hoverinfo=\"text\",\n", + " text=toplot.BA,\n", + " marker_symbol=\"triangle-down\",\n", + " marker=dict(\n", + " color=toplot.consumed_co2_rate_lb_per_mwh_for_electricity,\n", + " size=sizes,\n", + " opacity=1.0,\n", + " line=dict(width=1, color=\"DarkSlateGrey\"),\n", + " sizemode=\"diameter\",\n", + " cmin=0,\n", + " cmax=c_max,\n", + " colorbar=dict(\n", + " title=\"Emission rate
(lbs/MWh)\",\n", + " orientation=\"v\",\n", + " len=0.8,\n", + " thickness=20,\n", + " yanchor=\"bottom\",\n", + " y=0,\n", + " xpad=20,\n", + " ),\n", + " colorscale=\"rdylgn_r\",\n", + " ),\n", + " name=\"Consumed\",\n", + " showlegend=False,\n", + " )\n", + ")\n", + "\n", + "\n", + "# Legends: don't want colored markers\n", + "# Legends: don't want colored markers\n", + "fig.add_trace(\n", + " go.Scatter(\n", + " x=[-10],\n", + " y=[-10],\n", + " mode=\"markers\",\n", + " marker_symbol=\"triangle-up\",\n", + " marker=dict(color=\"white\", line=dict(width=2, color=\"DarkSlateGrey\"), size=10),\n", + " name=\"Generated\",\n", + " )\n", + ")\n", + "fig.add_trace(\n", + " go.Scatter(\n", + " x=[-10],\n", + " y=[-10],\n", + " mode=\"markers\",\n", + " marker_symbol=\"triangle-down\",\n", + " marker=dict(color=\"white\", line=dict(width=2, color=\"DarkSlateGrey\"), size=10),\n", + " name=\"Consumed\",\n", + " )\n", + ")\n", + "fig.update_yaxes(range=(500, 0)) # autorange=\"reversed\")\n", + "fig.update_xaxes(range=(0, 200))\n", + "\n", + "## loop through the labels and add them as annotations\n", + "for x in zip(toplot.BA, toplot.cx, toplot.cy):\n", + " left_bas = [\"BANC\", \"TIDC\", \"CISO\", \"LDWP\", \"IID\"]\n", + " delta = -12 if x[0] in left_bas else 12\n", + " fig.add_annotation(\n", + " x=x[1] + delta,\n", + " y=x[2],\n", + " text=x[0],\n", + " showarrow=False,\n", + " xanchor=(\"right\" if x[0] in left_bas else \"left\"),\n", + " )\n", + "\n", + "# Add images\n", + "fig.add_layout_image(\n", + " dict(\n", + " source=Image.open(\"resources/usa.png\"),\n", + " xref=\"x\",\n", + " yref=\"y\",\n", + " x=10,\n", + " y=0,\n", + " sizex=790,\n", + " sizey=550,\n", + " sizing=\"stretch\",\n", + " opacity=0.5,\n", + " layer=\"below\",\n", + " )\n", + ")\n", + "\n", + "# Add images\n", + "fig.add_layout_image(\n", + " dict(\n", + " source=Image.open(\"resources/legend_bottom_smaller.png\"),\n", + " xref=\"x\",\n", + " yref=\"y\",\n", + " x=-20,\n", + " y=360,\n", + " 
sizex=220,\n", + " sizey=144,\n", + " sizing=\"contain\",\n", + " opacity=1.0,\n", + " layer=\"below\",\n", + " )\n", + ")\n", + "\n", + "# Set templates\n", + "fig.update_layout(\n", + " template=\"plotly_white\",\n", + " width=400,\n", + " height=550,\n", + " yaxis_visible=False,\n", + " xaxis_visible=False,\n", + " title=pd.to_datetime(hour)\n", + " .tz_convert(\"US/Pacific\")\n", + " .strftime(\"%B %-d, %Y - %-I:00 %p PT\"),\n", + ")\n", + "fig.show()\n", + "fig.write_image(f\"outputs/viz2_legend.png\", scale=3)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "toplot[\"difference\"] = (\n", + " toplot.generated_co2_rate_lb_per_mwh_for_electricity\n", + " - toplot.consumed_co2_rate_lb_per_mwh_for_electricity\n", + ") / toplot.generated_co2_rate_lb_per_mwh_for_electricity\n", + "toplot" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "all[\"difference\"] = (\n", + " (\n", + " all.generated_co2_rate_lb_per_mwh_for_electricity\n", + " - all.consumed_co2_rate_lb_per_mwh_for_electricity\n", + " ).abs()\n", + ") * all.net_generation_mwh\n", + "px.line(all.groupby(\"datetime_utc\").mean()[\"difference\"])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "all[all.datetime_utc == \"12-01-2020 T05:00+00:00\"].to_csv(\"outputs/problem_date.csv\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "### Old code for arrows\n", + "# # Arrows have to be added separately\n", + "# line_size = val*width_factor\n", + "# fig.add_annotation(\n", + "# x=ba_coords.loc[ba2,\"cx\"], # arrows' head\n", + "# y=ba_coords.loc[ba2,\"cy\"], # arrows' head\n", + "# ax=ba_coords.loc[ba1,\"cx\"], # arrows' tail\n", + "# ay=ba_coords.loc[ba1,\"cy\"], # arrows' tail\n", + "# xref='x',\n", + "# yref='y',\n", + "# axref='x',\n", + "# 
ayref='y',\n", + "# text='', # if you want only the arrow\n", + "# showarrow=True,\n", + "# arrowhead=1,\n", + "# arrowsize=1, #max(.3, line_size),\n", + "# arrowwidth=1,\n", + "# arrowcolor='royalblue'\n", + "# )" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3.10.5 ('hourly_egrid')", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 }, - "nbformat": 4, - "nbformat_minor": 2 + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.5" + }, + "orig_nbformat": 4, + "vscode": { + "interpreter": { + "hash": "65c02dfd2dc2ef471c0b5088763a28c1faaa7cad28937ca42fadf51e669fd8e8" + } + } + }, + "nbformat": 4, + "nbformat_minor": 2 } diff --git a/notebooks/visualization/plot_timeseries_data.ipynb b/notebooks/visualization/plot_timeseries_data.ipynb index baf8e715..0228b61d 100644 --- a/notebooks/visualization/plot_timeseries_data.ipynb +++ b/notebooks/visualization/plot_timeseries_data.ipynb @@ -15,7 +15,8 @@ "\n", "# # Tell python where to look for modules.\n", "import sys\n", - "sys.path.append('../../../open-grid-emissions/src/')\n", + "\n", + "sys.path.append(\"../../../open-grid-emissions/src/\")\n", "\n", "from filepaths import *\n", "import validation\n", @@ -41,7 +42,9 @@ "pollutant = \"co2\"\n", "rate_type = \"for_electricity\"\n", "\n", - "data_to_graph = visualization.load_ba_ef_data_to_graph(ba, year, pollutant, rate_type, show_egrid)\n", + "data_to_graph = visualization.load_ba_ef_data_to_graph(\n", + " ba, year, pollutant, rate_type, show_egrid\n", + ")\n", "data_to_graph" ] }, @@ -77,7 +80,7 @@ " labels={\"value\": \"lbCO2e/MWh\"},\n", " template=\"plotly_white\",\n", " title=ba,\n", - ")\n" + ")" ] }, { @@ -117,7 +120,7 @@ ")\n", "\n", "\n", - 
"fig.show()\n" + "fig.show()" ] }, { @@ -145,7 +148,9 @@ "ba = \"CISO\"\n", "year = 2020\n", "\n", - "power_sector_data = pd.read_csv(results_folder(f\"{year}/power_sector_data/hourly/us_units/{ba}.csv\"))" + "power_sector_data = pd.read_csv(\n", + " results_folder(f\"{year}/power_sector_data/hourly/us_units/{ba}.csv\")\n", + ")" ] }, { @@ -160,7 +165,7 @@ " column_name=\"net_generation_mwh\",\n", " fuel_category_name=\"fuel_category\",\n", " plot_type=\"area\",\n", - ")\n" + ")" ] }, { @@ -175,7 +180,7 @@ " column_name=\"co2_mass_lb\",\n", " fuel_category_name=\"fuel_category\",\n", " plot_type=\"area\",\n", - ")\n" + ")" ] } ], diff --git a/notebooks/work_in_progress/GH102_test_dask.ipynb b/notebooks/work_in_progress/GH102_test_dask.ipynb index db4d08ad..5d0553aa 100644 --- a/notebooks/work_in_progress/GH102_test_dask.ipynb +++ b/notebooks/work_in_progress/GH102_test_dask.ipynb @@ -28,8 +28,11 @@ "outputs": [], "source": [ "# Start client so can see worker mem use\n", - "from dask.distributed import Client \n", - "client = Client(n_workers=2, threads_per_worker=2, memory_limit='1GB') # limit worker size to create issues even on --small dataset\n", + "from dask.distributed import Client\n", + "\n", + "client = Client(\n", + " n_workers=2, threads_per_worker=2, memory_limit=\"1GB\"\n", + ") # limit worker size to create issues even on --small dataset\n", "client" ] }, @@ -45,7 +48,7 @@ "# # Tell python where to look for modules.\n", "import sys\n", "\n", - "sys.path.append('../../open-grid-emissions/')\n", + "sys.path.append(\"../../open-grid-emissions/\")\n", "\n", "# import local modules\n", "import src.data_cleaning as data_cleaning" @@ -57,12 +60,21 @@ "metadata": {}, "outputs": [], "source": [ - "o_shaped_eia_data = pd.read_csv(PATH_TO_LOCAL_REPO + 'data/outputs/small/shaped_eia923_data_2020.csv', parse_dates=['datetime_utc'])\n", - "o_partial_cems_scaled = pd.read_csv(PATH_TO_LOCAL_REPO + 'data/outputs/small/partial_cems_scaled_2020.csv', 
parse_dates=['datetime_utc']) #NOT FINAL VERSION \n", - "o_cems = pd.read_csv(PATH_TO_LOCAL_REPO + 'data/outputs/small/cems_2020.csv', parse_dates=['datetime_utc']) # NOT FINAL VERSION\n", + "o_shaped_eia_data = pd.read_csv(\n", + " PATH_TO_LOCAL_REPO + \"data/outputs/small/shaped_eia923_data_2020.csv\",\n", + " parse_dates=[\"datetime_utc\"],\n", + ")\n", + "o_partial_cems_scaled = pd.read_csv(\n", + " PATH_TO_LOCAL_REPO + \"data/outputs/small/partial_cems_scaled_2020.csv\",\n", + " parse_dates=[\"datetime_utc\"],\n", + ") # NOT FINAL VERSION\n", + "o_cems = pd.read_csv(\n", + " PATH_TO_LOCAL_REPO + \"data/outputs/small/cems_2020.csv\",\n", + " parse_dates=[\"datetime_utc\"],\n", + ") # NOT FINAL VERSION\n", "\n", "# shaped_eia_data = pd.read_csv(PATH_TO_LOCAL_REPO + 'data/outputs/shaped_eia923_data_2020.csv')\n", - "# partial_cems_scaled = pd.read_csv(PATH_TO_LOCAL_REPO + 'data/outputs/partial_cems_scaled_2020.csv') #NOT FINAL VERSION \n", + "# partial_cems_scaled = pd.read_csv(PATH_TO_LOCAL_REPO + 'data/outputs/partial_cems_scaled_2020.csv') #NOT FINAL VERSION\n", "# cems = pd.read_csv(PATH_TO_LOCAL_REPO + 'data/outputs/cems_2020.csv') # NOT FINAL VERSION" ] }, @@ -72,10 +84,12 @@ "metadata": {}, "outputs": [], "source": [ - "# most of the stuff done to partial_cems_scaled and cems in data_pipeline is in plant_static_attributes: \n", - "#plant_static_attributes = pd.read_csv(PATH_TO_LOCAL_REPO + 'data/results/plant_data/plant_static_attributes.csv')\n", + "# most of the stuff done to partial_cems_scaled and cems in data_pipeline is in plant_static_attributes:\n", + "# plant_static_attributes = pd.read_csv(PATH_TO_LOCAL_REPO + 'data/results/plant_data/plant_static_attributes.csv')\n", "\n", - "plant_static_attributes = pd.read_csv(PATH_TO_LOCAL_REPO + 'data/results/small/plant_data/plant_static_attributes.csv')" + "plant_static_attributes = pd.read_csv(\n", + " PATH_TO_LOCAL_REPO + \"data/results/small/plant_data/plant_static_attributes.csv\"\n", + ")" ] }, { @@ 
-84,8 +98,10 @@ "metadata": {}, "outputs": [], "source": [ - "o_partial_cems_scaled = o_partial_cems_scaled.merge(plant_static_attributes, how='left', on='plant_id_eia')\n", - "o_cems = o_cems.merge(plant_static_attributes, how='left', on='plant_id_eia')" + "o_partial_cems_scaled = o_partial_cems_scaled.merge(\n", + " plant_static_attributes, how=\"left\", on=\"plant_id_eia\"\n", + ")\n", + "o_cems = o_cems.merge(plant_static_attributes, how=\"left\", on=\"plant_id_eia\")" ] }, { @@ -98,7 +114,7 @@ "# On full dataset, these break with worker size 16GB (ie Gailin's laptop)\n", "\n", "combined_plant_data = data_cleaning.combine_subplant_data(\n", - " o_cems, o_partial_cems_scaled, o_shaped_eia_data\n", + " o_cems, o_partial_cems_scaled, o_shaped_eia_data\n", ")\n", "\n", "# 12. Aggregate CEMS data to BA-fuel and write power sector results\n", @@ -113,8 +129,7 @@ "metadata": {}, "outputs": [], "source": [ - "\n", - "#plants = combined_plant_data.groupby('plant_id_eia').sum().compute()" + "# plants = combined_plant_data.groupby('plant_id_eia').sum().compute()" ] }, { @@ -139,7 +154,7 @@ "metadata": {}, "outputs": [], "source": [ - "#dt_shaped['datetime_utc'] = dt_shaped.datetime_utc.dt.to_datetime64()\n" + "# dt_shaped['datetime_utc'] = dt_shaped.datetime_utc.dt.to_datetime64()\n" ] }, { @@ -148,7 +163,7 @@ "metadata": {}, "outputs": [], "source": [ - "dt_shaped = o_shaped_eia_data.set_index('datetime_utc')\n", + "dt_shaped = o_shaped_eia_data.set_index(\"datetime_utc\")\n", "shaped_eia_dask = dd.from_pandas(dt_shaped, npartitions=50)" ] }, @@ -167,7 +182,7 @@ "metadata": {}, "outputs": [], "source": [ - "smol = shaped_eia_dask[['net_generation_mwh','fuel_consumed_mmbtu']]" + "smol = shaped_eia_dask[[\"net_generation_mwh\", \"fuel_consumed_mmbtu\"]]" ] }, { @@ -176,8 +191,8 @@ "metadata": {}, "outputs": [], "source": [ - "# even this breaks. If can get this to work, maybe that solution works for rest? 
\n", - "smol.groupby('datetime_utc').sum().compute()" + "# even this breaks. If can get this to work, maybe that solution works for rest?\n", + "smol.groupby(\"datetime_utc\").sum().compute()" ] }, { diff --git a/notebooks/work_in_progress/GH153_fill_missing_nox_so2_cems.ipynb b/notebooks/work_in_progress/GH153_fill_missing_nox_so2_cems.ipynb index d9c1c94b..1196e6a6 100644 --- a/notebooks/work_in_progress/GH153_fill_missing_nox_so2_cems.ipynb +++ b/notebooks/work_in_progress/GH153_fill_missing_nox_so2_cems.ipynb @@ -122,7 +122,9 @@ "outputs": [], "source": [ "epa_eia_crosswalk = load_data.load_epa_eia_crosswalk(year)\n", - "boiler_to_unit_crosswalk = epa_eia_crosswalk[[\"plant_id_eia\",\"emissions_unit_id_epa\",\"boiler_id\"]].drop_duplicates()\n", + "boiler_to_unit_crosswalk = epa_eia_crosswalk[\n", + " [\"plant_id_eia\", \"emissions_unit_id_epa\", \"boiler_id\"]\n", + "].drop_duplicates()\n", "boiler_to_unit_crosswalk" ] }, @@ -175,8 +177,7 @@ " \"boiler_bottom_type\",\n", " \"boiler_firing_type\",\n", " ]\n", - ")\n", - "\n" + ")" ] }, { @@ -201,7 +202,9 @@ "outputs": [], "source": [ "epa_eia_crosswalk = load_data.load_epa_eia_crosswalk(year)\n", - "boiler_to_unit_crosswalk = epa_eia_crosswalk[[\"plant_id_eia\",\"emissions_unit_id_epa\",\"boiler_id\"]].drop_duplicates()" + "boiler_to_unit_crosswalk = epa_eia_crosswalk[\n", + " [\"plant_id_eia\", \"emissions_unit_id_epa\", \"boiler_id\"]\n", + "].drop_duplicates()" ] }, { @@ -211,13 +214,31 @@ "outputs": [], "source": [ "# merge in all possible fuel-specific emission factors for the pm boiler design parameters\n", - "boiler_factors = boiler_firing_type.merge(nox_emission_factors, how=\"left\", on=[\"prime_mover_code\",\"boiler_bottom_type\",\"boiler_firing_type\"])\n", + "boiler_factors = boiler_firing_type.merge(\n", + " nox_emission_factors,\n", + " how=\"left\",\n", + " on=[\"prime_mover_code\", \"boiler_bottom_type\", \"boiler_firing_type\"],\n", + ")\n", "\n", "# associate a unit\n", - "boiler_factors = 
boiler_factors.merge(boiler_to_unit_crosswalk, how=\"left\", on=[\"plant_id_eia\",\"boiler_id\"])\n", + "boiler_factors = boiler_factors.merge(\n", + " boiler_to_unit_crosswalk, how=\"left\", on=[\"plant_id_eia\", \"boiler_id\"]\n", + ")\n", "\n", "# aggregate by unit fuel\n", - "boiler_factors = boiler_factors.groupby([\"plant_id_eia\",\"emissions_unit_id_epa\",\"energy_source_code\",\"emission_factor_denominator\"], dropna=False)[\"emission_factor\"].mean().reset_index()\n", + "boiler_factors = (\n", + " boiler_factors.groupby(\n", + " [\n", + " \"plant_id_eia\",\n", + " \"emissions_unit_id_epa\",\n", + " \"energy_source_code\",\n", + " \"emission_factor_denominator\",\n", + " ],\n", + " dropna=False,\n", + " )[\"emission_factor\"]\n", + " .mean()\n", + " .reset_index()\n", + ")\n", "\n", "boiler_factors" ] diff --git a/notebooks/work_in_progress/GH240_eia930_physics_reconciliation.ipynb b/notebooks/work_in_progress/GH240_eia930_physics_reconciliation.ipynb index b67df60b..92cc342c 100644 --- a/notebooks/work_in_progress/GH240_eia930_physics_reconciliation.ipynb +++ b/notebooks/work_in_progress/GH240_eia930_physics_reconciliation.ipynb @@ -33,7 +33,8 @@ "\n", "# # Tell python where to look for modules.\n", "import sys\n", - "sys.path.append('../../../open-grid-emissions/src/')\n", + "\n", + "sys.path.append(\"../../../open-grid-emissions/src/\")\n", "\n", "import download_data\n", "import load_data\n", @@ -64,7 +65,18 @@ "eia930_raw = eia930.load_chalendar_for_pipeline(raw_930_file, year=year)\n", "eia930_data = eia930.load_chalendar_for_pipeline(clean_930_file, year=year)\n", "\n", - "eia930_merged = eia930_raw.merge(eia930_data, how=\"left\", on=[\"ba_code\",\"fuel_category_eia930\",\"datetime_utc\",\"datetime_local\",\"report_date\"], suffixes=(\"_raw\",\"_cleaned\"))" + "eia930_merged = eia930_raw.merge(\n", + " eia930_data,\n", + " how=\"left\",\n", + " on=[\n", + " \"ba_code\",\n", + " \"fuel_category_eia930\",\n", + " \"datetime_utc\",\n", + " 
\"datetime_local\",\n", + " \"report_date\",\n", + " ],\n", + " suffixes=(\"_raw\", \"_cleaned\"),\n", + ")" ] }, { @@ -74,10 +86,18 @@ "outputs": [], "source": [ "# calculate how well correlated the raw and cleaned data is\n", - "correlations = eia930_merged.groupby([\"ba_code\",\"fuel_category_eia930\",\"report_date\"], dropna=False)[[\"net_generation_mwh_930_raw\",\"net_generation_mwh_930_cleaned\"]].corr().reset_index()\n", + "correlations = (\n", + " eia930_merged.groupby(\n", + " [\"ba_code\", \"fuel_category_eia930\", \"report_date\"], dropna=False\n", + " )[[\"net_generation_mwh_930_raw\", \"net_generation_mwh_930_cleaned\"]]\n", + " .corr()\n", + " .reset_index()\n", + ")\n", "correlations = correlations[correlations[\"level_3\"] == \"net_generation_mwh_930_raw\"]\n", - "correlations = correlations.drop(columns=[\"level_3\",\"net_generation_mwh_930_raw\"])\n", - "correlations = correlations.rename(columns={\"net_generation_mwh_930_cleaned\":\"correlation_with_raw\"})\n", + "correlations = correlations.drop(columns=[\"level_3\", \"net_generation_mwh_930_raw\"])\n", + "correlations = correlations.rename(\n", + " columns={\"net_generation_mwh_930_cleaned\": \"correlation_with_raw\"}\n", + ")\n", "correlations = correlations[correlations[\"report_date\"].dt.year == 2020]\n", "correlations" ] @@ -91,7 +111,9 @@ "ba = \"PJM\"\n", "fuel = \"coal\"\n", "\n", - "correlations[(correlations[\"ba_code\"] == ba) & (correlations[\"fuel_category_eia930\"] == fuel)]" + "correlations[\n", + " (correlations[\"ba_code\"] == ba) & (correlations[\"fuel_category_eia930\"] == fuel)\n", + "]" ] }, { @@ -112,9 +134,15 @@ "ba = \"BPAT\"\n", "fuel = \"nuclear\"\n", "\n", - "data_to_plot = eia930_merged[(eia930_merged[\"ba_code\"] == ba) & (eia930_merged[\"fuel_category_eia930\"] == fuel)]\n", + "data_to_plot = eia930_merged[\n", + " (eia930_merged[\"ba_code\"] == ba) & (eia930_merged[\"fuel_category_eia930\"] == fuel)\n", + "]\n", "\n", - "px.line(data_to_plot, x=\"datetime_local\", 
y=[\"net_generation_mwh_930_raw\",\"net_generation_mwh_930_cleaned\"])" + "px.line(\n", + " data_to_plot,\n", + " x=\"datetime_local\",\n", + " y=[\"net_generation_mwh_930_raw\", \"net_generation_mwh_930_cleaned\"],\n", + ")" ] } ], diff --git a/notebooks/work_in_progress/clean_cems_outliers.ipynb b/notebooks/work_in_progress/clean_cems_outliers.ipynb index 9df519cf..5c4a565d 100644 --- a/notebooks/work_in_progress/clean_cems_outliers.ipynb +++ b/notebooks/work_in_progress/clean_cems_outliers.ipynb @@ -59,7 +59,9 @@ "metadata": {}, "outputs": [], "source": [ - "cems = pd.read_csv(f\"{outputs_folder()}/{year}/cems_cleaned_{year}.csv\", dtype=get_dtypes())" + "cems = pd.read_csv(\n", + " f\"{outputs_folder()}/{year}/cems_cleaned_{year}.csv\", dtype=get_dtypes()\n", + ")" ] }, { @@ -68,8 +70,13 @@ "metadata": {}, "outputs": [], "source": [ - "# example CEMS data \n", - "px.line(cems[cems.plant_id_eia==3], x=\"datetime_utc\", y=\"net_generation_mwh\", color=\"subplant_id\")" + "# example CEMS data\n", + "px.line(\n", + " cems[cems.plant_id_eia == 3],\n", + " x=\"datetime_utc\",\n", + " y=\"net_generation_mwh\",\n", + " color=\"subplant_id\",\n", + ")" ] }, { @@ -108,12 +115,12 @@ "metadata": {}, "outputs": [], "source": [ - "# Table is unique by plant ID, generator ID. 
\n", + "# Table is unique by plant ID, generator ID.\n", "# For each plant and generator, find the maximum of the three capacity values (summer, winter, nameplate)\n", "gens = gens_pudl.copy(deep=True)\n", "gens[\"net_capacity_mw\"] = gens.winter_capacity_mw.combine(gens.summer_capacity_mw, max)\n", "gens[\"net_capacity_mw\"] = gens.net_capacity_mw.combine(gens.capacity_mw, max)\n", - "gens = gens.loc[:,[\"plant_id_eia\",\"generator_id\",\"net_capacity_mw\"]]" + "gens = gens.loc[:, [\"plant_id_eia\", \"generator_id\", \"net_capacity_mw\"]]" ] }, { @@ -132,7 +139,9 @@ "metadata": {}, "outputs": [], "source": [ - "gens_unit = gens.merge(subplant_crosswalk, how='left', on=['plant_id_eia','generator_id'])\n", + "gens_unit = gens.merge(\n", + " subplant_crosswalk, how=\"left\", on=[\"plant_id_eia\", \"generator_id\"]\n", + ")\n", "print(f\"Setting {sum(gens_unit.subplant_id.isna())} NaN subplants to 1 in 860 data\")\n", "gens_unit.loc[gens_unit.subplant_id.isna()] = 1" ] @@ -143,7 +152,7 @@ "metadata": {}, "outputs": [], "source": [ - "# Group gens by subplant \n", + "# Group gens by subplant\n", "gens_per_sub = gens_unit.groupby([\"plant_id_eia\", \"subplant_id\"]).sum().reset_index()" ] }, @@ -172,8 +181,8 @@ "metadata": {}, "outputs": [], "source": [ - "# Now that nans are gone, we can switch from the weird pandas int dtype to numpy dtype, which is required for merge \n", - "cems = cems.astype(dtype={\"subplant_id\":np.int32})" + "# Now that nans are gone, we can switch from the weird pandas int dtype to numpy dtype, which is required for merge\n", + "cems = cems.astype(dtype={\"subplant_id\": np.int32})" ] }, { @@ -182,7 +191,11 @@ "metadata": {}, "outputs": [], "source": [ - "cems_cap = cems.merge(gens_per_sub[[\"plant_id_eia\", \"subplant_id\", \"net_capacity_mw\"]], how='left', on=[\"plant_id_eia\", \"subplant_id\"])" + "cems_cap = cems.merge(\n", + " gens_per_sub[[\"plant_id_eia\", \"subplant_id\", \"net_capacity_mw\"]],\n", + " how=\"left\",\n", + " 
on=[\"plant_id_eia\", \"subplant_id\"],\n", + ")" ] }, { @@ -191,10 +204,37 @@ "metadata": {}, "outputs": [], "source": [ - "# TODO: here we're assuming that all columns are bad if net gen is bad, and that all bad rows have bad net gen. \n", - "dat_cols = ['gross_generation_mwh', 'steam_load_1000_lb', 'fuel_consumed_mmbtu', 'co2_mass_lb', 'nox_mass_lb', 'so2_mass_lb', 'plant_id_epa', 'co2_mass_measurement_code', 'nox_mass_measurement_code', 'so2_mass_measurement_code', 'report_date', 'energy_source_code', 'ch4_mass_lb', 'n2o_mass_lb', 'fuel_consumed_for_electricity_mmbtu', 'co2_mass_lb_for_electricity', 'ch4_mass_lb_for_electricity', 'n2o_mass_lb_for_electricity', 'nox_mass_lb_for_electricity', 'so2_mass_lb_for_electricity', 'co2_mass_lb_adjusted', 'ch4_mass_lb_adjusted', 'n2o_mass_lb_adjusted', 'nox_mass_lb_adjusted', 'so2_mass_lb_adjusted','net_generation_mwh']\n", + "# TODO: here we're assuming that all columns are bad if net gen is bad, and that all bad rows have bad net gen.\n", + "dat_cols = [\n", + " \"gross_generation_mwh\",\n", + " \"steam_load_1000_lb\",\n", + " \"fuel_consumed_mmbtu\",\n", + " \"co2_mass_lb\",\n", + " \"nox_mass_lb\",\n", + " \"so2_mass_lb\",\n", + " \"plant_id_epa\",\n", + " \"co2_mass_measurement_code\",\n", + " \"nox_mass_measurement_code\",\n", + " \"so2_mass_measurement_code\",\n", + " \"report_date\",\n", + " \"energy_source_code\",\n", + " \"ch4_mass_lb\",\n", + " \"n2o_mass_lb\",\n", + " \"fuel_consumed_for_electricity_mmbtu\",\n", + " \"co2_mass_lb_for_electricity\",\n", + " \"ch4_mass_lb_for_electricity\",\n", + " \"n2o_mass_lb_for_electricity\",\n", + " \"nox_mass_lb_for_electricity\",\n", + " \"so2_mass_lb_for_electricity\",\n", + " \"co2_mass_lb_adjusted\",\n", + " \"ch4_mass_lb_adjusted\",\n", + " \"n2o_mass_lb_adjusted\",\n", + " \"nox_mass_lb_adjusted\",\n", + " \"so2_mass_lb_adjusted\",\n", + " \"net_generation_mwh\",\n", + "]\n", "bad = cems_cap.net_generation_mwh > cems_cap.net_capacity_mw\n", - 
"cems_cap.loc[bad,dat_cols] = np.nan" + "cems_cap.loc[bad, dat_cols] = np.nan" ] }, { @@ -212,8 +252,8 @@ "metadata": {}, "outputs": [], "source": [ - "## What proportion of CEMS data was ID'ed as bad using capacity filter? \n", - "sum(bad)/len(cems)" + "## What proportion of CEMS data was ID'ed as bad using capacity filter?\n", + "sum(bad) / len(cems)" ] }, { @@ -224,9 +264,19 @@ "source": [ "plant = 2410\n", "\n", - "print(gens_unit.loc[gens_unit.plant_id_eia==plant,[\"plant_id_eia\", \"subplant_id\", \"generator_id\",\"net_capacity_mw\"]])\n", + "print(\n", + " gens_unit.loc[\n", + " gens_unit.plant_id_eia == plant,\n", + " [\"plant_id_eia\", \"subplant_id\", \"generator_id\", \"net_capacity_mw\"],\n", + " ]\n", + ")\n", "\n", - "px.line(cems_cap[cems_cap.plant_id_eia==plant], x=\"datetime_utc\", y=\"net_generation_mwh\", color=\"subplant_id\")" + "px.line(\n", + " cems_cap[cems_cap.plant_id_eia == plant],\n", + " x=\"datetime_utc\",\n", + " y=\"net_generation_mwh\",\n", + " color=\"subplant_id\",\n", + ")" ] }, { @@ -235,9 +285,20 @@ "metadata": {}, "outputs": [], "source": [ - "print(gens_unit.loc[gens_unit.plant_id_eia==plant,[\"plant_id_eia\", \"subplant_id\", \"generator_id\",\"net_capacity_mw\"]])\n", + "print(\n", + " gens_unit.loc[\n", + " gens_unit.plant_id_eia == plant,\n", + " [\"plant_id_eia\", \"subplant_id\", \"generator_id\", \"net_capacity_mw\"],\n", + " ]\n", + ")\n", "\n", - "px.line(cems[cems.plant_id_eia==plant], x=\"datetime_utc\", y=\"net_generation_mwh\", color=\"subplant_id\", title=f\"plant id = {plant}\")" + "px.line(\n", + " cems[cems.plant_id_eia == plant],\n", + " x=\"datetime_utc\",\n", + " y=\"net_generation_mwh\",\n", + " color=\"subplant_id\",\n", + " title=f\"plant id = {plant}\",\n", + ")" ] }, { @@ -256,9 +317,41 @@ "metadata": {}, "outputs": [], "source": [ - "# Get per-plant, per-variable median and IQR \n", - "numeric_cols = ['gross_generation_mwh', 'steam_load_1000_lb', 'fuel_consumed_mmbtu', 'co2_mass_lb', 'ch4_mass_lb', 
'n2o_mass_lb', 'nox_mass_lb', 'so2_mass_lb', 'co2_mass_lb_adjusted', 'ch4_mass_lb_adjusted', 'n2o_mass_lb_adjusted', 'nox_mass_lb_adjusted', 'so2_mass_lb_adjusted', 'net_generation_mwh', 'fuel_consumed_for_electricity_mmbtu', 'co2_mass_lb_for_electricity', 'co2_mass_lb_for_electricity_adjusted', 'ch4_mass_lb_for_electricity', 'ch4_mass_lb_for_electricity_adjusted', 'n2o_mass_lb_for_electricity', 'n2o_mass_lb_for_electricity_adjusted', 'nox_mass_lb_for_electricity', 'nox_mass_lb_for_electricity_adjusted', 'so2_mass_lb_for_electricity', 'so2_mass_lb_for_electricity_adjusted', 'co2e_mass_lb', 'co2e_mass_lb_adjusted', 'co2e_mass_lb_for_electricity', 'co2e_mass_lb_for_electricity_adjusted']\n", - "iqr = cems.groupby([\"plant_id_eia\", \"subplant_id\"])[numeric_cols].quantile(.75) - cems.groupby([\"plant_id_eia\",\"subplant_id\"])[numeric_cols].quantile(.25)" + "# Get per-plant, per-variable median and IQR\n", + "numeric_cols = [\n", + " \"gross_generation_mwh\",\n", + " \"steam_load_1000_lb\",\n", + " \"fuel_consumed_mmbtu\",\n", + " \"co2_mass_lb\",\n", + " \"ch4_mass_lb\",\n", + " \"n2o_mass_lb\",\n", + " \"nox_mass_lb\",\n", + " \"so2_mass_lb\",\n", + " \"co2_mass_lb_adjusted\",\n", + " \"ch4_mass_lb_adjusted\",\n", + " \"n2o_mass_lb_adjusted\",\n", + " \"nox_mass_lb_adjusted\",\n", + " \"so2_mass_lb_adjusted\",\n", + " \"net_generation_mwh\",\n", + " \"fuel_consumed_for_electricity_mmbtu\",\n", + " \"co2_mass_lb_for_electricity\",\n", + " \"co2_mass_lb_for_electricity_adjusted\",\n", + " \"ch4_mass_lb_for_electricity\",\n", + " \"ch4_mass_lb_for_electricity_adjusted\",\n", + " \"n2o_mass_lb_for_electricity\",\n", + " \"n2o_mass_lb_for_electricity_adjusted\",\n", + " \"nox_mass_lb_for_electricity\",\n", + " \"nox_mass_lb_for_electricity_adjusted\",\n", + " \"so2_mass_lb_for_electricity\",\n", + " \"so2_mass_lb_for_electricity_adjusted\",\n", + " \"co2e_mass_lb\",\n", + " \"co2e_mass_lb_adjusted\",\n", + " \"co2e_mass_lb_for_electricity\",\n", + " 
\"co2e_mass_lb_for_electricity_adjusted\",\n", + "]\n", + "iqr = cems.groupby([\"plant_id_eia\", \"subplant_id\"])[numeric_cols].quantile(\n", + " 0.75\n", + ") - cems.groupby([\"plant_id_eia\", \"subplant_id\"])[numeric_cols].quantile(0.25)" ] }, { @@ -267,7 +360,7 @@ "metadata": {}, "outputs": [], "source": [ - "median = cems.groupby([\"plant_id_eia\",\"subplant_id\"]).median()" + "median = cems.groupby([\"plant_id_eia\", \"subplant_id\"]).median()" ] }, { @@ -287,8 +380,7 @@ "outputs": [], "source": [ "cems_filtered = cems.copy()\n", - "#for plant in cems_filtered.plant_id_eia.unique():\n", - " " + "# for plant in cems_filtered.plant_id_eia.unique():" ] }, { @@ -297,7 +389,7 @@ "metadata": {}, "outputs": [], "source": [ - "cems_filtered = cems_filtered.set_index([\"plant_id_eia\",\"subplant_id\"])" + "cems_filtered = cems_filtered.set_index([\"plant_id_eia\", \"subplant_id\"])" ] }, { @@ -306,7 +398,7 @@ "metadata": {}, "outputs": [], "source": [ - "cems_filtered.loc[(3,1)]" + "cems_filtered.loc[(3, 1)]" ] }, { @@ -315,7 +407,9 @@ "metadata": {}, "outputs": [], "source": [ - "checked = (cems_filtered.loc[(3,1), lower_bound.columns] < lower_bound.loc[(3,1)]) | (cems_filtered.loc[(3,1), lower_bound.columns] > upper_bound.loc[(3, 1)])" + "checked = (cems_filtered.loc[(3, 1), lower_bound.columns] < lower_bound.loc[(3, 1)]) | (\n", + " cems_filtered.loc[(3, 1), lower_bound.columns] > upper_bound.loc[(3, 1)]\n", + ")" ] }, { @@ -333,8 +427,13 @@ "metadata": {}, "outputs": [], "source": [ - "toplot = cems_filtered.loc[(3,1)]\n", - "px.scatter(toplot, x=\"datetime_utc\", y=\"fuel_consumed_mmbtu\", color=checked[\"fuel_consumed_mmbtu\"])" + "toplot = cems_filtered.loc[(3, 1)]\n", + "px.scatter(\n", + " toplot,\n", + " x=\"datetime_utc\",\n", + " y=\"fuel_consumed_mmbtu\",\n", + " color=checked[\"fuel_consumed_mmbtu\"],\n", + ")" ] }, { @@ -343,7 +442,7 @@ "metadata": {}, "outputs": [], "source": [ - "median.loc[(3,1)]" + "median.loc[(3, 1)]" ] } ], diff --git 
a/notebooks/work_in_progress/issue230_spikes.ipynb b/notebooks/work_in_progress/issue230_spikes.ipynb index 5a3468ed..7a98e8e2 100644 --- a/notebooks/work_in_progress/issue230_spikes.ipynb +++ b/notebooks/work_in_progress/issue230_spikes.ipynb @@ -24,7 +24,7 @@ "metadata": {}, "outputs": [], "source": [ - "import pandas as pd \n", + "import pandas as pd\n", "import plotly.express as px" ] }, @@ -39,11 +39,12 @@ "\n", "# # Tell python where to look for modules.\n", "import sys\n", - "sys.path.append('../../src/')\n", + "\n", + "sys.path.append(\"../../src/\")\n", "\n", "import impute_hourly_profiles\n", "from filepaths import outputs_folder, results_folder\n", - "from column_checks import get_dtypes\n" + "from column_checks import get_dtypes" ] }, { @@ -52,7 +53,9 @@ "metadata": {}, "outputs": [], "source": [ - "problem_profiles = pd.read_csv(f\"{outputs_folder()}/2020/hourly_profiles_2020.csv\", dtype=get_dtypes())" + "problem_profiles = pd.read_csv(\n", + " f\"{outputs_folder()}/2020/hourly_profiles_2020.csv\", dtype=get_dtypes()\n", + ")" ] }, { @@ -61,7 +64,9 @@ "metadata": {}, "outputs": [], "source": [ - "problem_profiles = problem_profiles[(problem_profiles.ba_code==\"PJM\") & (problem_profiles.fuel_category==\"nuclear\")]" + "problem_profiles = problem_profiles[\n", + " (problem_profiles.ba_code == \"PJM\") & (problem_profiles.fuel_category == \"nuclear\")\n", + "]" ] }, { @@ -72,7 +77,16 @@ "source": [ "problem_profiles.head()\n", "\n", - "px.line(problem_profiles, x=\"datetime_utc\", y=[\"residual_profile\",\"scaled_residual_profile\",\"shifted_residual_profile\",\"profile\"])" + "px.line(\n", + " problem_profiles,\n", + " x=\"datetime_utc\",\n", + " y=[\n", + " \"residual_profile\",\n", + " \"scaled_residual_profile\",\n", + " \"shifted_residual_profile\",\n", + " \"profile\",\n", + " ],\n", + ")" ] }, { @@ -81,7 +95,7 @@ "metadata": {}, "outputs": [], "source": [ - "problem_profiles.profile_method.unique()\n" + "problem_profiles.profile_method.unique()" 
] }, { @@ -90,7 +104,10 @@ "metadata": {}, "outputs": [], "source": [ - "problem_profiles[(problem_profiles.datetime_utc > \"2020-04-15\") & (problem_profiles.datetime_utc < \"2020-04-16\")]" + "problem_profiles[\n", + " (problem_profiles.datetime_utc > \"2020-04-15\")\n", + " & (problem_profiles.datetime_utc < \"2020-04-16\")\n", + "]" ] }, { @@ -99,8 +116,10 @@ "metadata": {}, "outputs": [], "source": [ - "### Look for the issue in the next output \n", - "shaped = pd.read_csv(f\"{outputs_folder()}/2020/shaped_eia923_data_2020.csv\", dtype=get_dtypes())" + "### Look for the issue in the next output\n", + "shaped = pd.read_csv(\n", + " f\"{outputs_folder()}/2020/shaped_eia923_data_2020.csv\", dtype=get_dtypes()\n", + ")" ] }, { @@ -109,7 +128,7 @@ "metadata": {}, "outputs": [], "source": [ - "shaped = shaped[(shaped.ba_code==\"PJM\")]" + "shaped = shaped[(shaped.ba_code == \"PJM\")]" ] }, { @@ -118,7 +137,13 @@ "metadata": {}, "outputs": [], "source": [ - "px.line(shaped, x=\"datetime_utc\", y=\"net_generation_mwh\", line_group=\"fuel_category\", color=\"fuel_category\")" + "px.line(\n", + " shaped,\n", + " x=\"datetime_utc\",\n", + " y=\"net_generation_mwh\",\n", + " line_group=\"fuel_category\",\n", + " color=\"fuel_category\",\n", + ")" ] }, { @@ -127,8 +152,10 @@ "metadata": {}, "outputs": [], "source": [ - "#### Ok, issue is not in 923 shaped data, so it must just be in CEMS data \n", - "plant_level = pd.read_csv(f\"{results_folder()}/2020/plant_data/hourly/us_units/individual_plant_data.csv\")" + "#### Ok, issue is not in 923 shaped data, so it must just be in CEMS data\n", + "plant_level = pd.read_csv(\n", + " f\"{results_folder()}/2020/plant_data/hourly/us_units/individual_plant_data.csv\"\n", + ")" ] }, { diff --git a/notebooks/work_in_progress/sandbox.ipynb b/notebooks/work_in_progress/sandbox.ipynb index b69f7db2..37714709 100644 --- a/notebooks/work_in_progress/sandbox.ipynb +++ b/notebooks/work_in_progress/sandbox.ipynb @@ -17,7 +17,8 @@ "\n", "# # 
Tell python where to look for modules.\n", "import sys\n", - "sys.path.append('../../../open-grid-emissions/src/')\n", + "\n", + "sys.path.append(\"../../../open-grid-emissions/src/\")\n", "\n", "import download_data\n", "import load_data\n", @@ -31,6 +32,7 @@ "import gross_to_net_generation\n", "import eia930\n", "from logging_util import get_logger, configure_root_logger\n", + "\n", "configure_root_logger()\n", "logger = get_logger(\"test\")\n", "\n", diff --git a/notebooks/work_in_progress/uncertainty_analysis.ipynb b/notebooks/work_in_progress/uncertainty_analysis.ipynb index e327b28a..79f519d4 100644 --- a/notebooks/work_in_progress/uncertainty_analysis.ipynb +++ b/notebooks/work_in_progress/uncertainty_analysis.ipynb @@ -18,7 +18,7 @@ "metadata": {}, "outputs": [], "source": [ - "# Calculate min and max possible at each hour \n", + "# Calculate min and max possible at each hour\n", "import pandas as pd\n", "import plotly.express as px\n", "import plotly.graph_objects as go\n", @@ -34,10 +34,11 @@ "%reload_ext autoreload\n", "%autoreload 2\n", "\n", - "# Tell python where to look for modules. 
\n", + "# Tell python where to look for modules.\n", "# Depending on how your jupyter handles working directories, this may not be needed.\n", "import sys\n", - "sys.path.append('../../hourly-egrid/')\n", + "\n", + "sys.path.append(\"../../hourly-egrid/\")\n", "\n", "import src.load_data as load_data\n", "import src.column_checks as column_checks" @@ -49,7 +50,7 @@ "metadata": {}, "outputs": [], "source": [ - "year = 2020 " + "year = 2020" ] }, { @@ -74,8 +75,14 @@ "metadata": {}, "outputs": [], "source": [ - "flat = pd.read_csv(f\"../data/results/flat/{year}/power_sector_data/hourly/us_units/{ba}.csv\", parse_dates=[\"datetime_utc\"])\n", - "base = pd.read_csv(f\"../data/results/{year}/power_sector_data/hourly/us_units/{ba}.csv\", parse_dates=[\"datetime_utc\"])\n" + "flat = pd.read_csv(\n", + " f\"../data/results/flat/{year}/power_sector_data/hourly/us_units/{ba}.csv\",\n", + " parse_dates=[\"datetime_utc\"],\n", + ")\n", + "base = pd.read_csv(\n", + " f\"../data/results/{year}/power_sector_data/hourly/us_units/{ba}.csv\",\n", + " parse_dates=[\"datetime_utc\"],\n", + ")" ] }, { @@ -84,8 +91,8 @@ "metadata": {}, "outputs": [], "source": [ - "flat = flat[flat.fuel_category==\"total\"]\n", - "base = base[base.fuel_category==\"total\"]" + "flat = flat[flat.fuel_category == \"total\"]\n", + "base = base[base.fuel_category == \"total\"]" ] }, { @@ -94,7 +101,7 @@ "metadata": {}, "outputs": [], "source": [ - "both = flat.merge(base, how='left', on='datetime_utc', suffixes=(\"_flat\", \"_base\"))" + "both = flat.merge(base, how=\"left\", on=\"datetime_utc\", suffixes=(\"_flat\", \"_base\"))" ] }, { @@ -112,10 +119,18 @@ "metadata": {}, "outputs": [], "source": [ - "fig = px.line(both, x=\"datetime_utc\", y=['generated_co2e_rate_lb_per_mwh_for_electricity_adjusted_flat','generated_co2e_rate_lb_per_mwh_for_electricity_adjusted_base'], title=f\"{ba}, carbon intensity using flat vs. 
base generation\")\n", + "fig = px.line(\n", + " both,\n", + " x=\"datetime_utc\",\n", + " y=[\n", + " \"generated_co2e_rate_lb_per_mwh_for_electricity_adjusted_flat\",\n", + " \"generated_co2e_rate_lb_per_mwh_for_electricity_adjusted_base\",\n", + " ],\n", + " title=f\"{ba}, carbon intensity using flat vs. base generation\",\n", + ")\n", "fig.show()\n", - "#newnames = {'generated_co2_rate_lb_per_mwh_adjusted': 'Our data', 'generated_co2_rate_lb_per_mwh_for_electricity_adjusted': 'Real-time data'}\n", - "#fig.for_each_trace(lambda t: t.update(name = newnames[t.name]))" + "# newnames = {'generated_co2_rate_lb_per_mwh_adjusted': 'Our data', 'generated_co2_rate_lb_per_mwh_for_electricity_adjusted': 'Real-time data'}\n", + "# fig.for_each_trace(lambda t: t.update(name = newnames[t.name]))" ] }, { @@ -131,7 +146,10 @@ "metadata": {}, "outputs": [], "source": [ - "cems = pd.read_csv(\"../data/results/2020/plant_data/hourly/us_units/individual_plant_data.csv\", parse_dates=[\"datetime_utc\", \"report_date\"])\n", + "cems = pd.read_csv(\n", + " \"../data/results/2020/plant_data/hourly/us_units/individual_plant_data.csv\",\n", + " parse_dates=[\"datetime_utc\", \"report_date\"],\n", + ")\n", "plant_meta = pd.read_csv(\"../data/results/2020/plant_data/plant_static_attributes.csv\")" ] }, @@ -141,7 +159,7 @@ "metadata": {}, "outputs": [], "source": [ - "cems = cems.merge(plant_meta, how='left', on='plant_id_eia')" + "cems = cems.merge(plant_meta, how=\"left\", on=\"plant_id_eia\")" ] }, { @@ -178,8 +196,12 @@ "metadata": {}, "outputs": [], "source": [ - "caps = pudl_reader.gens_eia860()[[\"plant_id_eia\", \"capacity_mw\", \"summer_capacity_mw\", \"winter_capacity_mw\"]].copy()\n", - "caps[\"max_capacity\"] = caps[[\"capacity_mw\", \"summer_capacity_mw\", \"winter_capacity_mw\"]].max(axis=1)\n", + "caps = pudl_reader.gens_eia860()[\n", + " [\"plant_id_eia\", \"capacity_mw\", \"summer_capacity_mw\", \"winter_capacity_mw\"]\n", + "].copy()\n", + "caps[\"max_capacity\"] = 
caps[\n", + " [\"capacity_mw\", \"summer_capacity_mw\", \"winter_capacity_mw\"]\n", + "].max(axis=1)\n", "caps = caps.groupby(\"plant_id_eia\").sum()[\"max_capacity\"]" ] }, @@ -189,7 +211,10 @@ "metadata": {}, "outputs": [], "source": [ - "monthly_rates = pd.read_csv(\"../data/results/2020/plant_data/monthly/us_units/plant_data.csv\", parse_dates=[\"report_date\"])" + "monthly_rates = pd.read_csv(\n", + " \"../data/results/2020/plant_data/monthly/us_units/plant_data.csv\",\n", + " parse_dates=[\"report_date\"],\n", + ")" ] }, { @@ -198,8 +223,10 @@ "metadata": {}, "outputs": [], "source": [ - "monthly_rates = monthly_rates.merge(caps, how='left', left_on='plant_id_eia', right_index=True)\n", - "monthly_rates = monthly_rates.merge(plant_meta, how='left', on='plant_id_eia')" + "monthly_rates = monthly_rates.merge(\n", + " caps, how=\"left\", left_on=\"plant_id_eia\", right_index=True\n", + ")\n", + "monthly_rates = monthly_rates.merge(plant_meta, how=\"left\", on=\"plant_id_eia\")" ] }, { @@ -208,12 +235,15 @@ "metadata": {}, "outputs": [], "source": [ - "# Calculate per-plant, per-month maximum net generation and emissions: \n", + "# Calculate per-plant, per-month maximum net generation and emissions:\n", "# max net gen = capacity\n", "# max emissions = capacity * monthly rate\n", "pols = [\"co2\", \"ch4\", \"n2o\", \"co2e\", \"nox\", \"so2\"]\n", "for pol in pols:\n", - " monthly_rates[f\"max_hourly_{pol}\"] = monthly_rates[\"max_capacity\"] * (monthly_rates[f\"{pol}_mass_lb_for_electricity_adjusted\"]/monthly_rates[\"net_generation_mwh\"])" + " monthly_rates[f\"max_hourly_{pol}\"] = monthly_rates[\"max_capacity\"] * (\n", + " monthly_rates[f\"{pol}_mass_lb_for_electricity_adjusted\"]\n", + " / monthly_rates[\"net_generation_mwh\"]\n", + " )" ] }, { @@ -224,10 +254,17 @@ "source": [ "# Filter for non-CEMS plants\n", "# Filter for plants with non-zero generation in this month: we know that hourly generation shouldn't include the capacity of these plants\n", - "# 
Group by BA and month to get monthly max hourly emissions \n", + "# Group by BA and month to get monthly max hourly emissions\n", "pol_cols = [f\"max_hourly_{p}\" for p in pols] + [\"max_capacity\"]\n", - "filtered_monthly_rates = monthly_rates[(monthly_rates.data_availability == \"eia_only\") & (monthly_rates.net_generation_mwh > 0)]\n", - "non_cems_maxes = filtered_monthly_rates.groupby([\"report_date\", \"ba_code\"])[pol_cols].sum().reset_index()" + "filtered_monthly_rates = monthly_rates[\n", + " (monthly_rates.data_availability == \"eia_only\")\n", + " & (monthly_rates.net_generation_mwh > 0)\n", + "]\n", + "non_cems_maxes = (\n", + " filtered_monthly_rates.groupby([\"report_date\", \"ba_code\"])[pol_cols]\n", + " .sum()\n", + " .reset_index()\n", + ")" ] }, { @@ -236,13 +273,16 @@ "metadata": {}, "outputs": [], "source": [ - "# Identify plants with negative net generation. \n", + "# Identify plants with negative net generation.\n", "# These will be included in net generation minimum, emissions maximum\n", - "negative_monthly = monthly_rates[(monthly_rates.data_availability == \"eia_only\") & (monthly_rates.net_generation_mwh < 0)]\n", + "negative_monthly = monthly_rates[\n", + " (monthly_rates.data_availability == \"eia_only\")\n", + " & (monthly_rates.net_generation_mwh < 0)\n", + "]\n", "assert len(negative_monthly) == 0\n", "\n", "\n", - "# ########### FAKE NEGATIVE PLANT FOR TESTING \n", + "# ########### FAKE NEGATIVE PLANT FOR TESTING\n", "# negative_monthly = monthly_rates[(monthly_rates.plant_id_eia == 3)].copy()\n", "# negative_monthly.loc[:,\"net_generation_mwh\"] = -300" ] @@ -264,8 +304,12 @@ "outputs": [], "source": [ "# Add report date back in (dropped during groupby)\n", - "report_dates = cems.groupby([\"datetime_utc\",\"ba_code\"])[\"report_date\"].first().reset_index()\n", - "minimum_bound = minimum_bound.merge(report_dates, how='left', on=[\"datetime_utc\",\"ba_code\"])" + "report_dates = (\n", + " cems.groupby([\"datetime_utc\", 
\"ba_code\"])[\"report_date\"].first().reset_index()\n", + ")\n", + "minimum_bound = minimum_bound.merge(\n", + " report_dates, how=\"left\", on=[\"datetime_utc\", \"ba_code\"]\n", + ")" ] }, { @@ -274,8 +318,10 @@ "metadata": {}, "outputs": [], "source": [ - "# Calculate maximum by merging hourly max emissions and generation into minimum, then adding \n", - "maximum_bound = minimum_bound.merge(non_cems_maxes, how='left', on=[\"report_date\", \"ba_code\"])" + "# Calculate maximum by merging hourly max emissions and generation into minimum, then adding\n", + "maximum_bound = minimum_bound.merge(\n", + " non_cems_maxes, how=\"left\", on=[\"report_date\", \"ba_code\"]\n", + ")" ] }, { @@ -285,8 +331,13 @@ "outputs": [], "source": [ "for pol in pols:\n", - " maximum_bound[f\"{pol}_mass_lb_for_electricity_adjusted\"] = maximum_bound[f\"{pol}_mass_lb_for_electricity_adjusted\"] + maximum_bound[f\"max_hourly_{pol}\"]\n", - "maximum_bound[\"net_generation_mwh\"] = maximum_bound[\"net_generation_mwh\"] + maximum_bound[\"max_capacity\"]\n", + " maximum_bound[f\"{pol}_mass_lb_for_electricity_adjusted\"] = (\n", + " maximum_bound[f\"{pol}_mass_lb_for_electricity_adjusted\"]\n", + " + maximum_bound[f\"max_hourly_{pol}\"]\n", + " )\n", + "maximum_bound[\"net_generation_mwh\"] = (\n", + " maximum_bound[\"net_generation_mwh\"] + maximum_bound[\"max_capacity\"]\n", + ")\n", "maximum_bound = maximum_bound[cols + [\"datetime_utc\", \"ba_code\"]]" ] }, @@ -312,8 +363,14 @@ "metadata": {}, "outputs": [], "source": [ - "flat = pd.read_csv(f\"../data/results/flat/2020/power_sector_data/hourly/us_units/{ba}.csv\", parse_dates=[\"datetime_utc\"])\n", - "base = pd.read_csv(f\"../data/results/2020/power_sector_data/hourly/us_units/{ba}.csv\", parse_dates=[\"datetime_utc\"])" + "flat = pd.read_csv(\n", + " f\"../data/results/flat/2020/power_sector_data/hourly/us_units/{ba}.csv\",\n", + " parse_dates=[\"datetime_utc\"],\n", + ")\n", + "base = pd.read_csv(\n", + " 
f\"../data/results/2020/power_sector_data/hourly/us_units/{ba}.csv\",\n", + " parse_dates=[\"datetime_utc\"],\n", + ")" ] }, { @@ -322,7 +379,7 @@ "metadata": {}, "outputs": [], "source": [ - "# Only take totals \n", + "# Only take totals\n", "flat = flat[flat.fuel_category == \"total\"]\n", "base = base[base.fuel_category == \"total\"]" ] @@ -333,30 +390,50 @@ "metadata": {}, "outputs": [], "source": [ - "this_max = maximum_bound[maximum_bound.ba_code==ba]\n", - "this_min = minimum_bound[minimum_bound.ba_code==ba]\n", + "this_max = maximum_bound[maximum_bound.ba_code == ba]\n", + "this_min = minimum_bound[minimum_bound.ba_code == ba]\n", "\n", "fig = go.Figure()\n", - "fig.add_trace(go.Scatter(x=this_min.datetime_utc, y=this_min.net_generation_mwh,\n", - " fill=None,\n", - " mode='lines',\n", - " line_color='indigo', showlegend=False\n", - " ))\n", - "fig.add_trace(go.Scatter(\n", - " x=this_max.datetime_utc,\n", - " y=this_max.net_generation_mwh,\n", - " fill='tonexty', # fill area between trace0 and trace1\n", - " mode='lines', line_color='indigo', name=\"Min/max possible\"))\n", + "fig.add_trace(\n", + " go.Scatter(\n", + " x=this_min.datetime_utc,\n", + " y=this_min.net_generation_mwh,\n", + " fill=None,\n", + " mode=\"lines\",\n", + " line_color=\"indigo\",\n", + " showlegend=False,\n", + " )\n", + ")\n", + "fig.add_trace(\n", + " go.Scatter(\n", + " x=this_max.datetime_utc,\n", + " y=this_max.net_generation_mwh,\n", + " fill=\"tonexty\", # fill area between trace0 and trace1\n", + " mode=\"lines\",\n", + " line_color=\"indigo\",\n", + " name=\"Min/max possible\",\n", + " )\n", + ")\n", "\n", - "fig.add_trace(go.Scatter(\n", - " x=flat.datetime_utc,\n", - " y=flat.net_generation_mwh,\n", - " mode='lines', line_color='brown', name=\"Flat\"))\n", + "fig.add_trace(\n", + " go.Scatter(\n", + " x=flat.datetime_utc,\n", + " y=flat.net_generation_mwh,\n", + " mode=\"lines\",\n", + " line_color=\"brown\",\n", + " name=\"Flat\",\n", + " )\n", + ")\n", "\n", - 
"fig.add_trace(go.Scatter(\n", - " x=base.datetime_utc,\n", - " y=base.net_generation_mwh,\n", - " mode='lines', line_color='blue', name=\"Best guess\"))\n", + "fig.add_trace(\n", + " go.Scatter(\n", + " x=base.datetime_utc,\n", + " y=base.net_generation_mwh,\n", + " mode=\"lines\",\n", + " line_color=\"blue\",\n", + " name=\"Best guess\",\n", + " )\n", + ")\n", "\n", "fig.update_layout(\n", " title=f\"{ba} total net generation estimates\",\n", @@ -365,7 +442,7 @@ " legend_title=\"Estimate type\",\n", ")\n", "\n", - "fig.show()\n" + "fig.show()" ] }, { @@ -374,30 +451,50 @@ "metadata": {}, "outputs": [], "source": [ - "this_max = maximum_bound[maximum_bound.ba_code==ba]\n", - "this_min = minimum_bound[minimum_bound.ba_code==ba]\n", + "this_max = maximum_bound[maximum_bound.ba_code == ba]\n", + "this_min = minimum_bound[minimum_bound.ba_code == ba]\n", "\n", "fig = go.Figure()\n", - "fig.add_trace(go.Scatter(x=this_min.datetime_utc, y=this_min.co2_mass_lb_for_electricity_adjusted,\n", - " fill=None,\n", - " mode='lines',\n", - " line_color='indigo', showlegend=False\n", - " ))\n", - "fig.add_trace(go.Scatter(\n", - " x=this_max.datetime_utc,\n", - " y=this_max.co2_mass_lb_for_electricity_adjusted,\n", - " fill='tonexty', # fill area between trace0 and trace1\n", - " mode='lines', line_color='indigo', name=\"Min/max possible\"))\n", + "fig.add_trace(\n", + " go.Scatter(\n", + " x=this_min.datetime_utc,\n", + " y=this_min.co2_mass_lb_for_electricity_adjusted,\n", + " fill=None,\n", + " mode=\"lines\",\n", + " line_color=\"indigo\",\n", + " showlegend=False,\n", + " )\n", + ")\n", + "fig.add_trace(\n", + " go.Scatter(\n", + " x=this_max.datetime_utc,\n", + " y=this_max.co2_mass_lb_for_electricity_adjusted,\n", + " fill=\"tonexty\", # fill area between trace0 and trace1\n", + " mode=\"lines\",\n", + " line_color=\"indigo\",\n", + " name=\"Min/max possible\",\n", + " )\n", + ")\n", "\n", - "fig.add_trace(go.Scatter(\n", - " x=flat.datetime_utc,\n", - " 
y=flat.co2_mass_lb_for_electricity_adjusted,\n", - " mode='lines', line_color='brown', name=\"Flat\"))\n", + "fig.add_trace(\n", + " go.Scatter(\n", + " x=flat.datetime_utc,\n", + " y=flat.co2_mass_lb_for_electricity_adjusted,\n", + " mode=\"lines\",\n", + " line_color=\"brown\",\n", + " name=\"Flat\",\n", + " )\n", + ")\n", "\n", - "fig.add_trace(go.Scatter(\n", - " x=base.datetime_utc,\n", - " y=base.co2_mass_lb_for_electricity_adjusted,\n", - " mode='lines', line_color='blue', name=\"Best guess\"))\n", + "fig.add_trace(\n", + " go.Scatter(\n", + " x=base.datetime_utc,\n", + " y=base.co2_mass_lb_for_electricity_adjusted,\n", + " mode=\"lines\",\n", + " line_color=\"blue\",\n", + " name=\"Best guess\",\n", + " )\n", + ")\n", "\n", "fig.update_layout(\n", " title=f\"{ba} total carbon emissions\",\n", @@ -406,7 +503,7 @@ " legend_title=\"Estimate type\",\n", ")\n", "\n", - "fig.show()\n" + "fig.show()" ] }, { @@ -427,23 +524,51 @@ "\n", "issues = []\n", "for ba in os.listdir(\"../data/results/2020/power_sector_data/hourly/us_units/\"):\n", - " if \".DS_Store\" in ba: \n", + " if \".DS_Store\" in ba:\n", " continue\n", - " our_guess = pd.read_csv(f\"../data/results/2020/power_sector_data/hourly/us_units/{ba}\", parse_dates=[\"datetime_utc\"])\n", + " our_guess = pd.read_csv(\n", + " f\"../data/results/2020/power_sector_data/hourly/us_units/{ba}\",\n", + " parse_dates=[\"datetime_utc\"],\n", + " )\n", " our_guess = our_guess[our_guess.fuel_category == \"total\"]\n", " ba = ba.replace(\".csv\", \"\")\n", - " to_compare = our_guess[[\"datetime_utc\", \"co2_mass_lb_for_electricity_adjusted\", \"net_generation_mwh\"]].merge(minimum_bound.loc[minimum_bound.ba_code==ba,[\"datetime_utc\", \"co2_mass_lb_for_electricity_adjusted\", \"net_generation_mwh\"]],\n", - " how='left',\n", - " on='datetime_utc',\n", - " suffixes=(\"\",\"_min\")\n", + " to_compare = our_guess[\n", + " [\"datetime_utc\", \"co2_mass_lb_for_electricity_adjusted\", \"net_generation_mwh\"]\n", + " 
].merge(\n", + " minimum_bound.loc[\n", + " minimum_bound.ba_code == ba,\n", + " [\n", + " \"datetime_utc\",\n", + " \"co2_mass_lb_for_electricity_adjusted\",\n", + " \"net_generation_mwh\",\n", + " ],\n", + " ],\n", + " how=\"left\",\n", + " on=\"datetime_utc\",\n", + " suffixes=(\"\", \"_min\"),\n", " )\n", - " to_compare = to_compare.merge(maximum_bound.loc[maximum_bound.ba_code==ba,[\"datetime_utc\", \"co2_mass_lb_for_electricity_adjusted\", \"net_generation_mwh\"]],\n", - " how='left',\n", - " on='datetime_utc',\n", - " suffixes=(\"_best\",\"_max\")\n", + " to_compare = to_compare.merge(\n", + " maximum_bound.loc[\n", + " maximum_bound.ba_code == ba,\n", + " [\n", + " \"datetime_utc\",\n", + " \"co2_mass_lb_for_electricity_adjusted\",\n", + " \"net_generation_mwh\",\n", + " ],\n", + " ],\n", + " how=\"left\",\n", + " on=\"datetime_utc\",\n", + " suffixes=(\"_best\", \"_max\"),\n", " )\n", - " to_compare[\"ok\"] = (to_compare[f\"{col_to_check}_min\"] > to_compare[f\"{col_to_check}_best\"]) | (to_compare[f\"{col_to_check}_best\"] > to_compare[f\"{col_to_check}_max\"])\n", - " issues.append(to_compare[[\"datetime_utc\", \"ok\"]].set_index(\"datetime_utc\").squeeze().rename(ba))\n" + " to_compare[\"ok\"] = (\n", + " to_compare[f\"{col_to_check}_min\"] > to_compare[f\"{col_to_check}_best\"]\n", + " ) | (to_compare[f\"{col_to_check}_best\"] > to_compare[f\"{col_to_check}_max\"])\n", + " issues.append(\n", + " to_compare[[\"datetime_utc\", \"ok\"]]\n", + " .set_index(\"datetime_utc\")\n", + " .squeeze()\n", + " .rename(ba)\n", + " )" ] }, { @@ -470,7 +595,7 @@ "metadata": {}, "outputs": [], "source": [ - "monthly_rates[monthly_rates.ba_code==\"DEAA\"]" + "monthly_rates[monthly_rates.ba_code == \"DEAA\"]" ] }, { From f1a8e3acdf24afd97bea4ff2d0edddf882a7e95c Mon Sep 17 00:00:00 2001 From: Ben RdO Date: Wed, 29 Nov 2023 10:27:27 -0800 Subject: [PATCH 3/8] chore: format __init__ files --- src/__init__.py | 1 + test/__init__.py | 3 ++- 2 files changed, 3 insertions(+), 1 
deletion(-) diff --git a/src/__init__.py b/src/__init__.py index efd6d259..ef0055b3 100644 --- a/src/__init__.py +++ b/src/__init__.py @@ -2,4 +2,5 @@ import logging from .logging_util import configure_root_logger from .filepaths import outputs_folder + configure_root_logger(outputs_folder("logfile.txt"), logging.INFO) diff --git a/test/__init__.py b/test/__init__.py index 3ccc99fe..af248428 100644 --- a/test/__init__.py +++ b/test/__init__.py @@ -1,3 +1,4 @@ # Tell path where our code is import sys -sys.path.append('../src') + +sys.path.append("../src") From cb1c4090362a778c7a8b65d03ce91f1aa997f0d1 Mon Sep 17 00:00:00 2001 From: Ben RdO Date: Mon, 11 Dec 2023 13:16:17 -0800 Subject: [PATCH 4/8] chore: format modules --- src/consumed.py | 10 +++++----- src/data_cleaning.py | 12 +++--------- src/eia930.py | 4 +--- src/impute_hourly_profiles.py | 4 +--- src/validation.py | 22 ++++++---------------- 5 files changed, 16 insertions(+), 36 deletions(-) diff --git a/src/consumed.py b/src/consumed.py index 5ff7cce8..ba75b609 100644 --- a/src/consumed.py +++ b/src/consumed.py @@ -439,12 +439,12 @@ def _load_rates(self): ) # Cut off emissions at 9 hours after UTC year - emissions = emissions[:f"{self.year+1}-01-01 09:00:00+00:00"] - rates[((adj, pol))] = emissions + emissions = emissions[: f"{self.year+1}-01-01 09:00:00+00:00"] + rates[(adj, pol)] = emissions # Make generation data frame generation = pd.DataFrame(data=gens) - generation = generation[:f"{self.year+1}-01-01 09:00:00+00:00"] + generation = generation[: f"{self.year+1}-01-01 09:00:00+00:00"] return rates, generation @@ -462,7 +462,7 @@ def build_matrices(self, pol: str, adj: str, date): # Build generation array, using 930 for import-only regions G = np.zeros(len(self.regions)) - for (i, r) in enumerate(self.regions): + for i, r in enumerate(self.regions): if r in self.import_regions: G[i] = self.eia930.df.loc[date, KEYS["E"]["NG"] % r] else: @@ -513,7 +513,7 @@ def run(self): consumed_emissions = 
np.full(len(self.regions), np.nan) # Export - for (i, r) in enumerate(self.regions): + for i, r in enumerate(self.regions): self.results[r].loc[date, col] = consumed_emissions[i] if total_failed > 0: logger.warning( diff --git a/src/data_cleaning.py b/src/data_cleaning.py index e5c70a14..1589f019 100644 --- a/src/data_cleaning.py +++ b/src/data_cleaning.py @@ -88,9 +88,7 @@ def generate_subplant_ids(start_year, end_year, cems_ids): filtered_crosswalk = epacamd_eia.filter_crosswalk(crosswalk, cems_ids) # use graph analysis to identify subplants - crosswalk_with_subplant_ids = make_subplant_ids( - filtered_crosswalk - ) + crosswalk_with_subplant_ids = make_subplant_ids(filtered_crosswalk) # change the eia plant id to int crosswalk_with_subplant_ids["plant_id_eia"] = crosswalk_with_subplant_ids[ @@ -462,9 +460,7 @@ def clean_eia923( "fuel_consumed_mmbtu", "fuel_consumed_for_electricity_mmbtu", ], - ].round( - 1 - ) + ].round(1) validation.test_for_missing_energy_source_code(gen_fuel_allocated) validation.test_for_negative_values(gen_fuel_allocated) @@ -637,9 +633,7 @@ def create_primary_fuel_table(gen_fuel_allocated, add_subplant_id, year): ascending=True, ).drop_duplicates( subset=["plant_id_eia", "subplant_id", "generator_id"], keep="last" - )[ - ["plant_id_eia", "subplant_id", "generator_id", "energy_source_code"] - ] + )[["plant_id_eia", "subplant_id", "generator_id", "energy_source_code"]] if not add_subplant_id: gen_primary_fuel = gen_primary_fuel.drop(columns=["subplant_id"]) diff --git a/src/eia930.py b/src/eia930.py index a4892f64..88da923d 100644 --- a/src/eia930.py +++ b/src/eia930.py @@ -459,9 +459,7 @@ def manual_930_adjust(raw: pd.DataFrame): & (raw.index < "2022-06-16 07:00:00+00") ), cols, - ].shift( - 1, freq="H" - ) + ].shift(1, freq="H") raw = raw.drop(columns=cols) raw = pd.concat([raw, new], axis="columns") diff --git a/src/impute_hourly_profiles.py b/src/impute_hourly_profiles.py index d796b013..102192cd 100644 --- a/src/impute_hourly_profiles.py 
+++ b/src/impute_hourly_profiles.py @@ -425,9 +425,7 @@ def calculate_residual( ~combined_data["eia930_profile"].isna(), "cems_profile" ] = combined_data.loc[ ~combined_data["eia930_profile"].isna(), "cems_profile" - ].fillna( - 0 - ) + ].fillna(0) combined_data = calculate_scaled_residual(combined_data) combined_data = calculate_shifted_residual(combined_data) diff --git a/src/validation.py b/src/validation.py index 2302762c..beb91e2e 100644 --- a/src/validation.py +++ b/src/validation.py @@ -340,7 +340,7 @@ def check_missing_or_zero_generation_matches(combined_gen_data): # identify when there is zero or NA gross generation associated with positive net generation missing_gross_gen = combined_gen_data[ (combined_gen_data["net_generation_mwh"] > 0) - & ((combined_gen_data["gross_generation_mwh"] == 0)) + & (combined_gen_data["gross_generation_mwh"] == 0) ] # identify when there is zero or NA net generation associated with nonzero gross generation @@ -620,9 +620,7 @@ def ensure_non_overlapping_data_from_all_sources( ["in_eia", "in_cems", "in_partial_cems_subplant", "in_partial_cems_plant"] ] = data_overlap[ ["in_eia", "in_cems", "in_partial_cems_subplant", "in_partial_cems_plant"] - ].fillna( - 0 - ) + ].fillna(0) data_overlap["number_of_locations"] = ( data_overlap["in_eia"] + data_overlap["in_cems"] @@ -1257,9 +1255,7 @@ def summarize_cems_measurement_quality(cems): "so2_mass_measurement_code", "nox_mass_measurement_code", ] - ].astype( - str - ) + ].astype(str) # replace the CEMS mass measurement codes with two categories measurement_code_map = { "Measured": "Measured", @@ -1282,9 +1278,7 @@ def summarize_cems_measurement_quality(cems): "so2_mass_measurement_code", "nox_mass_measurement_code", ] - ].replace( - measurement_code_map - ) + ].replace(measurement_code_map) cems_quality_summary = [] # calculate the percent of mass for each pollutant that is measured or imputed @@ -1797,16 +1791,12 @@ def load_egrid_plant_file(year): ] = egrid_plant.loc[ 
egrid_plant["plant_primary_fuel"].isin(CLEAN_FUELS), "co2_mass_lb_for_electricity_adjusted", - ].fillna( - 0 - ) + ].fillna(0) egrid_plant.loc[ egrid_plant["plant_primary_fuel"].isin(CLEAN_FUELS), "co2_mass_lb" ] = egrid_plant.loc[ egrid_plant["plant_primary_fuel"].isin(CLEAN_FUELS), "co2_mass_lb" - ].fillna( - 0 - ) + ].fillna(0) # reorder the columns egrid_plant = egrid_plant[ From 8fa55c0751f50840f01addf40d95204d93c89647 Mon Sep 17 00:00:00 2001 From: Ben RdO Date: Mon, 11 Dec 2023 13:24:44 -0800 Subject: [PATCH 5/8] chore: lint modules --- src/data_cleaning.py | 4 +--- src/eia930.py | 2 +- src/gross_to_net_generation.py | 3 +-- src/load_data.py | 1 - src/output_data.py | 2 +- src/validation.py | 2 +- 6 files changed, 5 insertions(+), 9 deletions(-) diff --git a/src/data_cleaning.py b/src/data_cleaning.py index 1589f019..c2bf1d24 100644 --- a/src/data_cleaning.py +++ b/src/data_cleaning.py @@ -1,7 +1,5 @@ import pandas as pd import numpy as np -import os -import sqlalchemy as sa import pudl.analysis.allocate_gen_fuel as allocate_gen_fuel import pudl.analysis.epacamd_eia as epacamd_eia @@ -12,7 +10,7 @@ import emissions from emissions import CLEAN_FUELS from column_checks import get_dtypes, apply_dtypes -from filepaths import manual_folder, outputs_folder, downloads_folder +from filepaths import manual_folder, outputs_folder from logging_util import get_logger logger = get_logger(__name__) diff --git a/src/eia930.py b/src/eia930.py index 88da923d..2e49a15a 100644 --- a/src/eia930.py +++ b/src/eia930.py @@ -12,7 +12,7 @@ # Tell gridemissions where to find config before we load gridemissions os.environ["GRIDEMISSIONS_CONFIG_FILE_PATH"] = top_folder("config/gridemissions.json") -from gridemissions.workflows import make_dataset +from gridemissions.workflows import make_dataset # noqa E402 logger = get_logger(__name__) diff --git a/src/gross_to_net_generation.py b/src/gross_to_net_generation.py index ad5aae58..f1cd861e 100644 --- a/src/gross_to_net_generation.py +++ 
b/src/gross_to_net_generation.py @@ -2,7 +2,6 @@ import os import pandas as pd import statsmodels.formula.api as smf -import sqlalchemy as sa import warnings # import pudl packages @@ -13,7 +12,7 @@ import data_cleaning import validation from column_checks import get_dtypes -from filepaths import outputs_folder, downloads_folder +from filepaths import outputs_folder from logging_util import get_logger logger = get_logger(__name__) diff --git a/src/load_data.py b/src/load_data.py index b1f9725e..43f6ab63 100644 --- a/src/load_data.py +++ b/src/load_data.py @@ -1,6 +1,5 @@ import pandas as pd import numpy as np -import os import sqlalchemy as sa import warnings from pathlib import Path diff --git a/src/output_data.py b/src/output_data.py index f8137731..4591e8bb 100644 --- a/src/output_data.py +++ b/src/output_data.py @@ -462,7 +462,7 @@ def write_power_sector_results(ba_fuel_data, path_prefix, skip_outputs): if not skip_outputs: for ba in list(ba_fuel_data.ba_code.unique()): - if type(ba) is not str: + if not isinstance(ba, str): logger.warning( f"not aggregating {sum(ba_fuel_data.ba_code.isna())} plants with numeric BA {ba}" ) diff --git a/src/validation.py b/src/validation.py index beb91e2e..7c9f9016 100644 --- a/src/validation.py +++ b/src/validation.py @@ -220,7 +220,7 @@ def check_for_orphaned_cc_part_in_subplant(subplant_crosswalk): "prime_mover_code" ].agg(["unique"]) cc_subplants["unique_cc_pms"] = [ - ",".join(map(str, l)) for l in cc_subplants["unique"] + ",".join(map(str, L)) for L in cc_subplants["unique"] ] cc_subplants = cc_subplants.drop(columns="unique") # identify where there are subplants that only contain a single CC part From f1675fafeb73474df2599b435ba57a52688c334d Mon Sep 17 00:00:00 2001 From: Ben RdO Date: Mon, 11 Dec 2023 13:35:29 -0800 Subject: [PATCH 6/8] chore: format test modules --- test/test_eia.py | 1 + test/test_logging.py | 24 ++++++++++++------------ 2 files changed, 13 insertions(+), 12 deletions(-) diff --git a/test/test_eia.py 
b/test/test_eia.py index 21642077..0203722a 100644 --- a/test/test_eia.py +++ b/test/test_eia.py @@ -7,6 +7,7 @@ from src.eia import EIA + # File open mocking modeled on https://stackoverflow.com/questions/1289894/how-do-i-mock-an-open-used-in-a-with-statement-using-the-mock-framework-in-pyth @pytest.fixture @patch( diff --git a/test/test_logging.py b/test/test_logging.py index f905175f..5758cca5 100644 --- a/test/test_logging.py +++ b/test/test_logging.py @@ -3,8 +3,8 @@ import pandas as pd -sys.path.append('../src') -sys.path.append('..') +sys.path.append("../src") +sys.path.append("..") import src.eia930 as eia930 from src.filepaths import top_folder @@ -13,21 +13,21 @@ pudl_logger = logging.getLogger(name="catalystcoop.pudl") -configure_root_logger(logfile=top_folder('test/test_logfile.txt'), level=logging.INFO) +configure_root_logger(logfile=top_folder("test/test_logfile.txt"), level=logging.INFO) # If you call this again, nothing bad should happen. Logging statements should # still only show up once. 
-configure_root_logger(logfile=top_folder('test/test_logfile.txt'), level=logging.INFO) -logger = get_logger('test') +configure_root_logger(logfile=top_folder("test/test_logfile.txt"), level=logging.INFO) +logger = get_logger("test") def main(): - """These statements should each be printed once in a nice format.""" - logger.info('This is the OGE logger') - pudl_logger.info('This is the PUDL logger') + """These statements should each be printed once in a nice format.""" + logger.info("This is the OGE logger") + pudl_logger.info("This is the PUDL logger") - df = pd.DataFrame({"a": [1,2,3], "b": [4,5,6]}) - logger.info("\n" + df.to_string()) + df = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}) + logger.info("\n" + df.to_string()) -if __name__ == '__main__': - main() +if __name__ == "__main__": + main() From 6a9e473d75a6d6d04bf261b9a64001c774b6bd47 Mon Sep 17 00:00:00 2001 From: Ben RdO Date: Mon, 11 Dec 2023 13:38:34 -0800 Subject: [PATCH 7/8] chore: lint test modules --- test/test_logging.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/test/test_logging.py b/test/test_logging.py index 5758cca5..c97b82d1 100644 --- a/test/test_logging.py +++ b/test/test_logging.py @@ -6,10 +6,9 @@ sys.path.append("../src") sys.path.append("..") -import src.eia930 as eia930 -from src.filepaths import top_folder +from src.filepaths import top_folder # noqa E402 -from src.logging_util import get_logger, configure_root_logger +from src.logging_util import get_logger, configure_root_logger # noqa E402 pudl_logger = logging.getLogger(name="catalystcoop.pudl") From 7580cc77dc3896a90c25054277f0264aa87da214 Mon Sep 17 00:00:00 2001 From: Ben RdO Date: Wed, 29 Nov 2023 10:49:40 -0800 Subject: [PATCH 8/8] chore: add lint github workflow --- .github/workflows/lint.yml | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) create mode 100644 .github/workflows/lint.yml diff --git a/.github/workflows/lint.yml b/.github/workflows/lint.yml new file mode 100644 index 
00000000..5c7c50a6 --- /dev/null +++ b/.github/workflows/lint.yml @@ -0,0 +1,19 @@ +name: Lint + +on: + push: + pull_request: + branches: + - development + - main + +jobs: + formatting: + if: github.event_name != 'pull_request' || github.event.pull_request.head.repo.full_name != github.event.pull_request.base.repo.full_name + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v3 + - uses: actions/setup-python@v4 + - run: python -m pip install --upgrade ruff + - run: ruff check . --diff + - run: ruff format . --diff