Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Don't use pixetl for geojson creation in resample script #610

Merged
merged 23 commits into from
Dec 17, 2024
Merged
Show file tree
Hide file tree
Changes from 19 commits
Commits
Show all changes
23 commits
Select commit Hold shift + click to select a range
5236433
Replace call to pixetl's create_geojsons in resample script with a ne…
dmannarino Nov 30, 2024
44bed61
Fix (make relative) imports
dmannarino Nov 30, 2024
821dc28
Upload to full key, not just prefix
dmannarino Nov 30, 2024
6ee4704
Print geojsons for debugging
dmannarino Nov 30, 2024
d5a28d9
WIP: Fix resample script uploading to wrong place
dmannarino Dec 1, 2024
f93b536
Sanitize geojsons for NaN values
dmannarino Dec 1, 2024
b4ddb5b
Set max simul. gdalinfo processes to 16
dmannarino Dec 2, 2024
a4f5124
Merge branch 'dont_use_pixetl_for_geojsons' of github.com:wri/gfw-dat…
dmannarino Dec 2, 2024
a487993
Always write coords in tiles.geojson in EPSG:4326 for legacy reasons
dmannarino Dec 3, 2024
d4126c3
Transform extent too, add note
dmannarino Dec 3, 2024
961d266
Remove extra fields from tiles.geojson
dmannarino Dec 3, 2024
96dca40
Change a few var and fcn names
dmannarino Dec 9, 2024
0ffabfc
Move writing geojsons into resample script; use existing run_gdal_sub…
dmannarino Dec 10, 2024
40ed0c1
Remove unneeded filenames from generate_geojsons call
dmannarino Dec 10, 2024
36631ca
Change behavior of get_aws_files to return s3:// URIs, rather than GDAL
dmannarino Dec 10, 2024
f662f0e
Fix imports
dmannarino Dec 10, 2024
90b12c4
switch apply_colormap to use local geojsons code too
dmannarino Dec 10, 2024
b6ffcf4
Fix geojsons logging
dmannarino Dec 10, 2024
0f920e3
Remove extra geotiff from geojsons prefix
dmannarino Dec 10, 2024
6862049
Don't indent geojsons to minimize changes in format
dmannarino Dec 11, 2024
b70d59d
Restore removed metadata fields
dmannarino Dec 12, 2024
ab99a97
Run data types through from_gdal_data_type to match pixetl output
dmannarino Dec 17, 2024
bbd833a
Improve note about coords in tiles.geojson
dmannarino Dec 17, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .isort.cfg
Original file line number Diff line number Diff line change
Expand Up @@ -2,4 +2,4 @@
line_length = 88
multi_line_output = 3
include_trailing_comma = True
known_third_party = _pytest,aenum,affine,alembic,asgi_lifespan,async_lru,asyncpg,aws_utils,boto3,botocore,click,docker,ee,errors,fastapi,fiona,gdal_utils,geoalchemy2,geojson,gfw_pixetl,gino,gino_starlette,google,httpx,httpx_auth,logger,logging_utils,moto,numpy,orjson,osgeo,pandas,pendulum,pglast,psutil,psycopg2,pydantic,pyproj,pytest,pytest_asyncio,rasterio,shapely,sqlalchemy,sqlalchemy_utils,starlette,tileputty,typer
known_third_party = _pytest,aenum,affine,alembic,asgi_lifespan,async_lru,asyncpg,aws_utils,boto3,botocore,click,docker,ee,errors,fastapi,fiona,gdal_utils,geoalchemy2,geojson,gfw_pixetl,gino,gino_starlette,google,httpx,httpx_auth,logger,logging_utils,moto,numpy,orjson,osgeo,pandas,pendulum,pglast,psutil,psycopg2,pydantic,pyproj,pytest,pytest_asyncio,rasterio,shapely,sqlalchemy,sqlalchemy_utils,starlette,tileputty,tiles_geojson,typer
44 changes: 33 additions & 11 deletions batch/python/apply_colormap.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,11 +15,12 @@
import rasterio

# Use relative imports because these modules get copied into container
from aws_utils import get_s3_client, get_s3_path_parts
from aws_utils import get_aws_files, get_s3_client, get_s3_path_parts, upload_s3
from errors import GDALError, SubprocessKilledError
from gdal_utils import from_vsi_path, run_gdal_subcommand
from gdal_utils import from_vsi_path, run_gdal_subcommand, to_vsi_path
from logging_utils import listener_configurer, log_client_configurer, log_listener
from pydantic import BaseModel, Extra, Field, StrictInt
from tiles_geojson import generate_geojsons
from typer import Option, run

NUM_PROCESSES = int(
Expand Down Expand Up @@ -267,16 +268,37 @@ def apply_symbology(
for tile_id in executor.map(create_rgb_tile, process_args):
logger.log(logging.INFO, f"Finished processing tile {tile_id}")

# Now run pixetl_prep.create_geojsons to generate a tiles.geojson and
# extent.geojson in the target prefix. But that code appends /geotiff
# to the prefix so remove it first
create_geojsons_prefix = target_prefix.split(f"{dataset}/{version}/")[1].replace(
"/geotiff", ""
)
logger.log(logging.INFO, "Uploading tiles.geojson to {create_geojsons_prefix}")
from gfw_pixetl.pixetl_prep import create_geojsons
# Now generate a tiles.geojson and extent.geojson and upload to the target prefix.
bucket, _ = get_s3_path_parts(source_uri)
tile_paths = [to_vsi_path(uri) for uri in get_aws_files(bucket, target_prefix)]

tiles_output_file = "tiles.geojson"
extent_output_file = "extent.geojson"

logger.log(logging.INFO, "Generating geojsons")
tiles_fc, extent_fc = generate_geojsons(tile_paths, min(16, NUM_PROCESSES))
logger.log(logging.INFO, "Finished generating geojsons")

tiles_txt = json.dumps(tiles_fc, indent=2)
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I noticed that in existing tiles.geojson, the json is just written out with only spaces and no newlines (i.e. not prettifying with indents, etc.). Do we also want to do that (not indent), or do you think it is worth it for debugging to leave it in pretty format?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It's a good point. I like being able to look at it for debugging, but I'll remove the indent for now in the interests of changing as little as possible.

with open(tiles_output_file, "w") as f:
print(tiles_txt, file=f)

create_geojsons(list(), dataset, version, create_geojsons_prefix, True)
extent_txt = json.dumps(extent_fc, indent=2)
with open(extent_output_file, "w") as f:
print(extent_txt, file=f)

logger.log(logging.INFO, f"Uploading geojsons to {target_prefix}")
upload_s3(
tiles_output_file,
bucket,
os.path.join(target_prefix, tiles_output_file),
)
upload_s3(
extent_output_file,
bucket,
os.path.join(target_prefix, extent_output_file),
)
logger.log(logging.INFO, f"Finished uploading geojsons to {target_prefix}")

log_queue.put_nowait(None)
listener.join()
Expand Down
32 changes: 31 additions & 1 deletion batch/python/aws_utils.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
import os
from typing import Tuple
from typing import List, Sequence, Tuple, Dict, Any

import boto3

Expand Down Expand Up @@ -29,3 +29,33 @@ def exists_in_s3(target_bucket, target_key):
for obj in response.get("Contents", []):
if obj["Key"] == target_key:
return obj["Size"] > 0


def get_aws_files(
    bucket: str, prefix: str, extensions: Sequence[str] = (".tif",)
) -> List[str]:
    """Return s3:// URIs of all objects under a prefix with matching suffixes.

    :param bucket: S3 bucket name
    :param prefix: Key prefix to list under
    :param extensions: Filename suffixes to keep (defaults to GeoTIFFs)
    :return: List of "s3://bucket/key" URIs
    """
    files: List[str] = []

    s3_client = get_s3_client()
    paginator = s3_client.get_paginator("list_objects_v2")

    for page in paginator.paginate(Bucket=bucket, Prefix=prefix):
        # A page without "Contents" means there are no (more) matching objects
        try:
            contents = page["Contents"]
        except KeyError:
            break

        for obj in contents:
            key = str(obj["Key"])
            if any(key.endswith(ext) for ext in extensions):
                files.append(f"s3://{bucket}/{key}")

    return files


def upload_s3(path: str, bucket: str, dst: str) -> None:
    """Upload a local file to s3://bucket/dst.

    NOTE: boto3's ``upload_file`` returns None, so there is no meaningful
    value to propagate (the previous ``Dict[str, Any]`` annotation was wrong).

    :param path: Local filesystem path of the file to upload
    :param bucket: Destination S3 bucket name
    :param dst: Destination key within the bucket
    """
    s3_client = get_s3_client()
    return s3_client.upload_file(path, bucket, dst)
13 changes: 13 additions & 0 deletions batch/python/gdal_utils.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
import os
import subprocess
from typing import Dict, List, Optional, Tuple
from urllib.parse import urlparse

from errors import GDALError, SubprocessKilledError

Expand All @@ -21,6 +22,18 @@ def from_vsi_path(file_name: str) -> str:
return vsi


def to_vsi_path(file_name: str) -> str:
    """Convert an s3:// or gs:// URI to the corresponding GDAL /vsi path.

    :param file_name: Cloud storage URI (s3://bucket/key or gs://bucket/key)
    :return: GDAL virtual filesystem path (/vsis3/... or /vsigs/...)
    :raises ValueError: if the URI scheme is neither "s3" nor "gs"
    """
    prefix = {"s3": "vsis3", "gs": "vsigs"}

    parts = urlparse(file_name)
    try:
        path = f"/{prefix[parts.scheme]}/{parts.netloc}{parts.path}"
    except KeyError:
        # Suppress the internal KeyError: the unknown scheme is the real error
        raise ValueError(f"Unknown protocol: {parts.scheme}") from None

    return path


def run_gdal_subcommand(cmd: List[str], env: Optional[Dict] = None) -> Tuple[str, str]:
"""Run GDAL as sub command and catch common errors."""

Expand Down
48 changes: 40 additions & 8 deletions batch/python/resample.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,15 +18,21 @@
import rasterio

# Use relative imports because these modules get copied into container
from aws_utils import exists_in_s3, get_s3_client, get_s3_path_parts
from aws_utils import (
exists_in_s3,
get_aws_files,
get_s3_client,
get_s3_path_parts,
upload_s3,
)
from errors import SubprocessKilledError
from gdal_utils import from_vsi_path
from gdal_utils import from_vsi_path, to_vsi_path
from gfw_pixetl.grids import grid_factory
from gfw_pixetl.pixetl_prep import create_geojsons
from logging_utils import listener_configurer, log_client_configurer, log_listener
from pyproj import CRS, Transformer
from shapely.geometry import MultiPolygon, Polygon, shape
from shapely.ops import unary_union
from tiles_geojson import generate_geojsons
from typer import Option, run

# Use at least 1 process
Expand Down Expand Up @@ -656,12 +662,38 @@ def resample(
for tile_id in executor.map(process_tile, process_tile_args):
logger.log(logging.INFO, f"Finished processing tile {tile_id}")

# Now run pixetl_prep.create_geojsons to generate a tiles.geojson and
# extent.geojson in the target prefix.
create_geojsons_prefix = target_prefix.split(f"{dataset}/{version}/")[1]
logger.log(logging.INFO, f"Uploading tiles.geojson to {create_geojsons_prefix}")
# Now generate a tiles.geojson and extent.geojson and upload to the target prefix.
tile_paths = [to_vsi_path(uri) for uri in get_aws_files(bucket, target_prefix)]

tiles_output_file = "tiles.geojson"
extent_output_file = "extent.geojson"

logger.log(logging.INFO, "Generating geojsons")
tiles_fc, extent_fc = generate_geojsons(tile_paths, min(16, NUM_PROCESSES))
logger.log(logging.INFO, "Finished generating geojsons")

tiles_txt = json.dumps(tiles_fc, indent=2)
with open(tiles_output_file, "w") as f:
print(tiles_txt, file=f)

create_geojsons(list(), dataset, version, create_geojsons_prefix, True)
extent_txt = json.dumps(extent_fc, indent=2)
with open(extent_output_file, "w") as f:
print(extent_txt, file=f)

geojsons_prefix = os.path.join(target_prefix, "geotiff")

logger.log(logging.INFO, f"Uploading geojsons to {geojsons_prefix}")
upload_s3(
tiles_output_file,
bucket,
os.path.join(geojsons_prefix, tiles_output_file),
)
upload_s3(
extent_output_file,
bucket,
os.path.join(geojsons_prefix, extent_output_file),
)
logger.log(logging.INFO, f"Finished uploading geojsons to {geojsons_prefix}")

log_queue.put_nowait(None)
listener.join()
Expand Down
96 changes: 96 additions & 0 deletions batch/python/tiles_geojson.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,96 @@
import json
from concurrent.futures import ProcessPoolExecutor, as_completed
from typing import Any, Dict, List, Optional, Tuple

from geojson import Feature, FeatureCollection
from pyproj import CRS, Transformer
from shapely.geometry import Polygon
from shapely.ops import unary_union

from errors import GDALError
from gdal_utils import run_gdal_subcommand


def to_4326(crs: CRS, x: float, y: float) -> Tuple[float, float]:
    """Reproject a single (x, y) coordinate from *crs* into EPSG:4326."""
    wgs84 = CRS.from_epsg(4326)
    transformer = Transformer.from_crs(crs, wgs84, always_xy=True)
    return transformer.transform(x, y)


def extract_metadata_from_gdalinfo(gdalinfo_json: Dict[str, Any]) -> Dict[str, Any]:
    """Extract necessary metadata from the gdalinfo JSON output."""
    corners = gdalinfo_json["cornerCoordinates"]
    crs: CRS = CRS.from_string(gdalinfo_json["coordinateSystem"]["wkt"])

    # NOTE: pixetl seems to always write features in tiles.geojson in
    # epsg:4326 coordinates (even when the tiles themselves are
    # epsg:3857). Reproduce that behavior for compatibility. If that
    # ever changes, remove the call to to_4326 here.
    lower_left = to_4326(crs, *corners["lowerLeft"])
    upper_right = to_4326(crs, *corners["upperRight"])

    metadata = {
        "extent": [*lower_left, *upper_right],
        "name": gdalinfo_json["description"],
    }

    return metadata


def process_file(file_path: str) -> Dict[str, Any]:
    """Run gdalinfo and extract metadata for a single file.

    :param file_path: GDAL-readable path (e.g. /vsis3/...) of a raster tile
    :return: Metadata dict for the tile (extent in EPSG:4326 plus name)
    :raises RuntimeError: if gdalinfo fails on the file
    """
    print(f"Running gdalinfo on {file_path}")
    try:
        stdout, stderr = run_gdal_subcommand(
            ["gdalinfo", "-json", file_path],
        )
    except GDALError as e:
        # Chain the original GDAL error so the root cause stays in the traceback
        raise RuntimeError(f"Failed to run gdalinfo on {file_path}: {e}") from e

    gdalinfo_json: Dict = json.loads(stdout)
    return extract_metadata_from_gdalinfo(gdalinfo_json)


def generate_geojsons(
    geotiffs: List[str],
    max_workers: Optional[int] = None,
) -> Tuple[FeatureCollection, FeatureCollection]:
    """Generate tiles.geojson and extent.geojson feature collections.

    Runs gdalinfo on each GeoTIFF in parallel, building one feature per tile
    (tiles_fc) and a single unioned footprint of all tiles (extent_fc).

    :param geotiffs: GDAL-readable paths of the tiles to process
    :param max_workers: Max parallel gdalinfo processes (None = CPU count)
    :return: Tuple of (tiles FeatureCollection, extent FeatureCollection)
    :raises RuntimeError: if metadata extraction fails for any file
    """
    features = []
    polygons = []

    with ProcessPoolExecutor(max_workers=max_workers) as executor:
        future_to_file = {executor.submit(process_file, file): file for file in geotiffs}
        for future in as_completed(future_to_file):
            file = future_to_file[future]
            try:
                metadata = future.result()
                extent = metadata["extent"]
                # Create a Polygon from the extent; first and last
                # coordinates are identical to close the ring
                polygon_coords = [
                    [extent[0], extent[1]],
                    [extent[0], extent[3]],
                    [extent[2], extent[3]],
                    [extent[2], extent[1]],
                    [extent[0], extent[1]],
                ]
                polygon = Polygon(polygon_coords)

                # Add to GeoJSON features
                feature = Feature(geometry=polygon.__geo_interface__, properties=metadata)
                features.append(feature)

                # Collect for union
                polygons.append(polygon)
            except Exception as e:
                # Chain the worker exception so its traceback isn't lost
                raise RuntimeError(f"Error processing file {file}: {e}") from e

    tiles_fc = FeatureCollection(features)

    # Dissolve all tile footprints into a single extent geometry
    union_geometry = unary_union(polygons)
    extent_fc = FeatureCollection([
        Feature(geometry=union_geometry.__geo_interface__, properties={})
    ])

    return tiles_fc, extent_fc
33 changes: 0 additions & 33 deletions batch/scripts/run_pixetl_prep.sh

This file was deleted.

Loading