Skip to content

Commit

Permalink
Merge pull request #542 from catalyst-cooperative/eiamecs-udpate
Browse files Browse the repository at this point in the history
Make a multi-year EIA MECS archive
cmgosnell authored Jan 29, 2025
2 parents bdbfa3e + 23cfd00 commit 94cc3ab
Showing 3 changed files with 88 additions and 21 deletions.
104 changes: 84 additions & 20 deletions src/pudl_archiver/archivers/eia/eiamecs.py
Original file line number Diff line number Diff line change
@@ -8,46 +8,110 @@
ArchiveAwaitable,
ResourceInfo,
)
from pudl_archiver.frictionless import ZipLayout

BASE_URL = "https://www.eia.gov/consumption/manufacturing/data"
logger = logging.getLogger(f"catalystcoop.{__name__}")

TABLE_LINK_PATTERNS: dict[str | int, str] = {
"recent": r"(RSE|)[Tt]able(\d{1,2}|\d{1.1})_(\d{1,2})(.xlsx|.xls)",
2002: r"(RSE|)[Tt]able(\d{1,2}).(\d{1,2})_\d{1,2}(.xlsx|.xls)",
# These earlier years the pattern is functional but not actually very informative.
# so we will just use the original name by making the whole pattern a match
1998: r"((d|e)\d{2}([a-z]\d{1,2})_(\d{1,2})(.xlsx|.xls))",
1994: r"((rse|)m\d{2}_(\d{2})([a-d]|)(.xlsx|.xls))",
1991: r"((rse|)mecs(\d{2})([a-z]|)(.xlsx|.xls))",
}
"""Dictionary of years or "recent" as keys and table link patterns as values.
From 2006 and forward the link pattern is the same but all of the older years
have bespoke table link patterns. The groups to match in the regex patterns
will be used to rename the files for the archives. The order of those match
groups indicate various things:
* first group: whether the file contains only Relative Standard Errors (RSE)
* second group: the major table number
* third group: the minor table number
* forth group: the file extension
The years from 1998 and back have table link patterns that could be used in this
same format with 4 match groups, but the major and minor table numbers are not
actually stored in the file name. So for these older years we've turned the whole
pattern into a group and use that (the original file name) as the stored name in
the archive.
"""


class EiaMECSArchiver(AbstractDatasetArchiver):
"""EIA MECS archiver."""

name = "eiamecs"
concurrency_limit = 5 # Number of files to concurrently download

async def get_resources(self) -> ArchiveAwaitable:
"""Download EIA-MECS resources."""
for year in [2018]:
yield self.get_year_resources(year)
years_url = "https://www.eia.gov/consumption/data.php#mfg"
year_link_pattern = re.compile(r"(manufacturing/data/)(\d{4})/$")
for link in await self.get_hyperlinks(years_url, year_link_pattern):
match = year_link_pattern.search(link)
year = match.groups()[1]
if self.valid_year(year):
yield self.get_year_resources(year)

async def get_year_resources(self, year: int) -> list[ResourceInfo]:
"""Download all excel tables for a year."""
table_link_pattern = re.compile(r"[Tt]able(\d{1,2})_(\d{1,2}).xlsx")
logger.info(f"Attempting to find resources for: {year}")
data_paths_in_archive = set()
year_url = f"{BASE_URL}/{year}"
zip_path = self.download_directory / f"eiamecs-{year}.zip"
max_old_year = max(
[year for year in TABLE_LINK_PATTERNS if isinstance(year, int)]
)
if int(year) > max_old_year:
table_link_pattern = re.compile(TABLE_LINK_PATTERNS["recent"])
else:
table_link_pattern = re.compile(TABLE_LINK_PATTERNS[int(year)])

# Loop through all download links for tables
tables = []
year_url = f"{BASE_URL}/{year}"
for table_link in await self.get_hyperlinks(year_url, table_link_pattern):
table_link = f"{year_url}/{table_link}"
logger.info(f"Fetching {table_link}")
# Get table major/minor number from links
# We are going to rename the files in a standard format by extracting
# patterns from the table_link_pattern
# From 1998 and before there are a bunch of letters in the file names
# in patterns that are probably parsable somehow, but for now we are
# just going to keep the original file names
match = table_link_pattern.search(table_link)
major_num, minor_num = match.group(1), match.group(2)

# Download file
download_path = (
self.download_directory
/ f"eia-mecs-{year}-table-{major_num}-{minor_num}.xlsx"
)
await self.download_zipfile(table_link, download_path)

tables.append(
ResourceInfo(
local_path=download_path,
partitions={"year": year, "table": f"{major_num}_{minor_num}"},
filename = match.group(1)
if int(year) > 1998:
is_rse = match.group(1)
# There are several ways the file names indicate whether a file holds
# "data" or "rse". We append a suffix to the end of the file name, but
# only for rse, because for many years the data and the rse are together.
rse_map = {"": "", "d": "", "RSE": "-rse", "e": "-rse"}
rse = rse_map[is_rse]
major_num = match.group(2)
minor_num = match.group(3)
extension = match.group(4)
# Download filename
filename = (
f"eia-mecs-{year}-table-{major_num}-{minor_num}{rse}{extension}"
)
download_path = self.download_directory / filename
await self.download_file(table_link, download_path)
self.add_to_archive(
zip_path=zip_path,
filename=filename,
blob=download_path.open("rb"),
)
return tables
data_paths_in_archive.add(filename)
# Don't want to leave multiple downloaded spreadsheets on disk, so delete
# each one immediately after it is safely stored in the ZIP
download_path.unlink()

resource_info = ResourceInfo(
local_path=zip_path,
partitions={"year": year},
layout=ZipLayout(file_paths=data_paths_in_archive),
)
return resource_info
2 changes: 1 addition & 1 deletion src/pudl_archiver/cli.py
Original file line number Diff line number Diff line change
@@ -29,7 +29,7 @@ def parse_main(args=None):
nargs="*",
help="Years to download data for. Supported datasets: censusdp1tract, censuspep, "
"eia176, eia191, eia757a, eia860, eia860m, eia861, eia923, eia930, eia_bulk_elec, "
"eiaaeo, eiawater, epacamd_eia, epacems, ferc1, ferc2, ferc6, ferc60, ferc714, "
"eiaaeo, eiamecs, eiawater, epacamd_eia, epacems, ferc1, ferc2, ferc6, ferc60, ferc714, "
"mshamines, nrelatb, phmsagas",
type=int,
)
3 changes: 3 additions & 0 deletions src/pudl_archiver/package_data/zenodo_doi.yaml
Original file line number Diff line number Diff line change
@@ -37,6 +37,9 @@ eiaaeo:
eia_bulk_elec:
production_doi: 10.5281/zenodo.7067366
sandbox_doi: 10.5072/zenodo.2356
eiamecs:
production_doi: 10.5281/zenodo.14749820
sandbox_doi: 10.5072/zenodo.158873
eiawater:
production_doi: 10.5281/zenodo.7683135
sandbox_doi: 10.5072/zenodo.3160

0 comments on commit 94cc3ab

Please sign in to comment.