Optionally skip regenerating geoparquet assets #24

Draft: wants to merge 4 commits into main

Changes from 1 commit
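This draft adds an optional geoparquet_hrefs argument to stac.create_item so that previously generated geoparquet files can be reused instead of regenerated from the netCDF source. A minimal usage sketch, mirroring the new test added below (paths point at the repository's test fixtures):

    from stactools.goes_glm import stac

    asset_href = "./tests/data-files/OR_GLM-L2-LCFA_G16_s20181591447400_e20181591448000_c20181591448028.nc"
    geoparquet_hrefs = {
        "geoparquet_events": "./tests/data-files/OR_GLM-L2-LCFA_G16_s20181591447400_e20181591448000_c20181591448028-events.parquet",
        "geoparquet_flashes": "./tests/data-files/OR_GLM-L2-LCFA_G16_s20181591447400_e20181591448000_c20181591448028-flashes.parquet",
        "geoparquet_groups": "./tests/data-files/OR_GLM-L2-LCFA_G16_s20181591447400_e20181591448000_c20181591448028-groups.parquet",
    }

    # Only the parquet footers are read (for row counts); the parquet assets are not rewritten.
    item = stac.create_item(asset_href, geoparquet_hrefs=geoparquet_hrefs)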
128 changes: 128 additions & 0 deletions src/stactools/goes_glm/constants.py
@@ -175,3 +175,131 @@ class OrbitalSlot(str, enum.Enum):
]
],
}

Collaborator

The columns can't be constants, because they vary slightly between files. For example, sometimes there are columns containing ...frame_time_offset..., and sometimes they are missing (here it seems they are missing). So you need to read the files to get the actual content...

Member Author @TomAugspurger (Oct 27, 2022)

Well, that's unfortunate! But thanks for the note. I'll get that info from the parquet file / netcdf file then.
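(A minimal sketch of what deriving the column list from a geoparquet file itself could look like, assuming pyarrow's ParquetFile schema is sufficient and that descriptions/units would still come from the netCDF attributes; this is an illustration, not the implementation used in this PR:)

    import pyarrow.parquet

    def table_columns_from_parquet(href: str) -> list[dict]:
        # Only the file footer is read; no data pages are loaded.
        pf = pyarrow.parquet.ParquetFile(href)
        return [
            {"name": field.name, "type": str(field.type)}
            for field in pf.schema_arrow
        ]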

Member Author

Ignore my latest commit, which really mangles parquet.create_asset. I intend to clean it up.

PARQUET_TABLE_COLUMNS_EVENTS = [
    {"name": "geometry", "type": "point"},
    {
        "name": "id",
        "type": "int32",
        "description": "product-unique lightning event identifier",
    },
    {"name": "time", "type": "datetime"},
    {
        "name": "time_offset",
        "type": "int16",
        "description": "GLM L2+ Lightning Detection: event's time of occurrence",
        "unit": "seconds since 2020-12-31 23:59:40.000",
    },
    {
        "name": "energy",
        "type": "int16",
        "description": "GLM L2+ Lightning Detection: event radiant energy",
        "unit": "J",
    },
    {
        "name": "parent_group_id",
        "type": "int32",
        "description": "product-unique lightning group identifier for one or more events",
    },
]

PARQUET_TABLE_COLUMNS_FLASHES = [
    {"name": "geometry", "type": "point"},
    {
        "name": "id",
        "type": "int16",
        "description": "product-unique lightning flash identifier",
    },
    {"name": "time_of_first_event", "type": "datetime"},
    {
        "name": "time_offset_of_first_event",
        "type": "int16",
        "description": "GLM L2+ Lightning Detection: time of occurrence of first constituent event in flash",
        "unit": "seconds since 2020-12-31 23:59:40.000",
    },
    {"name": "time_of_last_event", "type": "datetime"},
    {
        "name": "time_offset_of_last_event",
        "type": "int16",
        "description": "GLM L2+ Lightning Detection: time of occurrence of last constituent event in flash",
        "unit": "seconds since 2020-12-31 23:59:40.000",
    },
    {"name": "frame_time_of_first_event", "type": "datetime"},
    {
        "name": "frame_time_offset_of_first_event",
        "type": "int16",
        "description": "GLM L2+ Lightning Detection: time of occurrence of first constituent event in flash",
        "unit": "seconds since 2020-12-31 23:59:40.000",
    },
    {"name": "frame_time_of_last_event", "type": "datetime"},
    {
        "name": "frame_time_offset_of_last_event",
        "type": "int16",
        "description": "GLM L2+ Lightning Detection: time of occurrence of last constituent event in flash",
        "unit": "seconds since 2020-12-31 23:59:40.000",
    },
    {
        "name": "area",
        "type": "int16",
        "description": "GLM L2+ Lightning Detection: flash area coverage (pixels containing at least one constituent event only)",
        "unit": "m2",
    },
    {
        "name": "energy",
        "type": "int16",
        "description": "GLM L2+ Lightning Detection: flash radiant energy",
        "unit": "J",
    },
    {
        "name": "quality_flag",
        "type": "int16",
        "description": "GLM L2+ Lightning Detection: flash data quality flags",
    },
]

PARQUET_TABLE_COLUMNS_GROUPS = [
    {"name": "geometry", "type": "point"},
    {
        "name": "id",
        "type": "int32",
        "description": "product-unique lightning group identifier",
    },
    {"name": "time", "type": "datetime"},
    {
        "name": "time_offset",
        "type": "int16",
        "description": "GLM L2+ Lightning Detection: mean time of group's constituent events' times of occurrence",
        "unit": "seconds since 2020-12-31 23:59:40.000",
    },
    {"name": "frame_time", "type": "datetime"},
    {
        "name": "frame_time_offset",
        "type": "int16",
        "description": "GLM L2+ Lightning Detection: mean time of group's constituent events' times of occurrence",
        "unit": "seconds since 2020-12-31 23:59:40.000",
    },
    {
        "name": "area",
        "type": "int16",
        "description": "GLM L2+ Lightning Detection: group area coverage (pixels containing at least one constituent event only)",
        "unit": "m2",
    },
    {
        "name": "energy",
        "type": "int16",
        "description": "GLM L2+ Lightning Detection: group radiant energy",
        "unit": "J",
    },
    {
        "name": "quality_flag",
        "type": "int16",
        "description": "GLM L2+ Lightning Detection: group data quality flags",
    },
    {
        "name": "parent_flash_id",
        "type": "int16",
        "description": "product-unique lightning flash identifier for one or more groups",
    },
]
7 changes: 7 additions & 0 deletions src/stactools/goes_glm/parquet.py
@@ -7,6 +7,7 @@
from geopandas import GeoDataFrame, GeoSeries
from netCDF4 import Dataset
from shapely.geometry import Point
import pyarrow.parquet

from . import constants

@@ -236,6 +237,12 @@ def create_asset(
    return create_asset_metadata(title, file, table_cols, count)


def create_asset_from_geoparquet(
    pf: pyarrow.parquet.ParquetFile,
    href: str,
    title: str,
    table_columns: List[Dict[str, Any]],
) -> Dict[str, Any]:
    return create_asset_metadata(title, href, table_columns, pf.metadata.num_rows)


def create_asset_metadata(
    title: str,
    href: Optional[str] = None,
31 changes: 29 additions & 2 deletions src/stactools/goes_glm/stac.py
@@ -7,6 +7,7 @@

from dateutil.parser import isoparse
from netCDF4 import Dataset
import pyarrow.parquet
from pystac import (
    Asset,
    CatalogType,
@@ -165,6 +166,7 @@ def create_item(
    nonetcdf: bool = False,
    fixnetcdf: bool = False,
    appendctime: bool = False,
    geoparquet_hrefs: Optional[dict[str, str]] = None,
) -> Item:
"""Create a STAC Item

@@ -308,8 +310,33 @@
    proj.centroid = centroid

    if not nogeoparquet:
        if geoparquet_hrefs:
            events_href = geoparquet_hrefs[constants.PARQUET_KEY_EVENTS]
            flashes_href = geoparquet_hrefs[constants.PARQUET_KEY_FLASHES]
            groups_href = geoparquet_hrefs[constants.PARQUET_KEY_GROUPS]
            assets = {
                constants.PARQUET_KEY_EVENTS: parquet.create_asset_from_geoparquet(
                    pyarrow.parquet.ParquetFile(events_href),
                    events_href,
                    constants.PARQUET_TITLE_EVENTS,
                    constants.PARQUET_TABLE_COLUMNS_EVENTS,
                ),
                constants.PARQUET_KEY_FLASHES: parquet.create_asset_from_geoparquet(
                    pyarrow.parquet.ParquetFile(flashes_href),
                    flashes_href,
                    constants.PARQUET_TITLE_FLASHES,
                    constants.PARQUET_TABLE_COLUMNS_FLASHES,
                ),
                constants.PARQUET_KEY_GROUPS: parquet.create_asset_from_geoparquet(
                    pyarrow.parquet.ParquetFile(groups_href),
                    groups_href,
                    constants.PARQUET_TITLE_GROUPS,
                    constants.PARQUET_TABLE_COLUMNS_GROUPS,
                ),
            }
        else:
            target_folder = os.path.dirname(asset_href)
            assets = parquet.convert(dataset, target_folder)
        for key, asset_dict in assets.items():
            asset = Asset.from_dict(asset_dict)
            item.add_asset(key, asset)
Binary file not shown.
Binary file not shown.
Binary file not shown.
17 changes: 17 additions & 0 deletions tests/test_stac.py
@@ -275,3 +275,20 @@ def test_create_item(self) -> None:
self.assertTrue("dimensions" in var)
self.assertTrue("type" in var)
self.assertTrue("description" in var)


def test_geoparquet_hrefs() -> None:
    asset_href = "./tests/data-files/OR_GLM-L2-LCFA_G16_s20181591447400_e20181591448000_c20181591448028.nc"
    geoparquet_hrefs = {
        "geoparquet_flashes": "./tests/data-files/OR_GLM-L2-LCFA_G16_s20181591447400_e20181591448000_c20181591448028-flashes.parquet",
        "geoparquet_groups": "./tests/data-files/OR_GLM-L2-LCFA_G16_s20181591447400_e20181591448000_c20181591448028-groups.parquet",
        "geoparquet_events": "./tests/data-files/OR_GLM-L2-LCFA_G16_s20181591447400_e20181591448000_c20181591448028-events.parquet",
    }

    # Item built from the pre-generated geoparquet files should match the item
    # built by regenerating them from the netCDF source.
    result = stac.create_item(asset_href, geoparquet_hrefs=geoparquet_hrefs)
    expected = stac.create_item(asset_href)

    # The asset hrefs differ (existing files vs. freshly written ones), so
    # normalize them before comparing the full item dictionaries.
    for k, v in expected.assets.items():
        v.href = result.assets[k].href

    assert result.to_dict() == expected.to_dict()