From 6eb19933282bfcf8b15448712988ad4851dcc8fc Mon Sep 17 00:00:00 2001 From: Josh Cunningham Date: Thu, 5 Sep 2024 17:06:27 -0500 Subject: [PATCH] Merge in main fixes (#42) * bugfix - check input dates within available zarr data * Gage crosswalk (#41) * improve gage to cat crosswalk * set geopandas minimum version to 1.0.0 * fix gage cli * add build on pr * fix build_only workflow naming * Create LICENSE * fix geopandas without fiona --- .github/workflows/build_only.yml | 27 +++++++++++++++ LICENSE | 21 ++++++++++++ modules/data_processing/gpkg_utils.py | 38 ++++++++++++++++++---- modules/data_processing/zarr_utils.py | 22 ++++++++++++- modules/map_app/views.py | 47 ++++++++------------------- pyproject.toml | 2 +- 6 files changed, 115 insertions(+), 42 deletions(-) create mode 100644 .github/workflows/build_only.yml create mode 100644 LICENSE diff --git a/.github/workflows/build_only.yml b/.github/workflows/build_only.yml new file mode 100644 index 0000000..ed4dce8 --- /dev/null +++ b/.github/workflows/build_only.yml @@ -0,0 +1,27 @@ +name: Upload Python Package to PyPI when a Release is Created + +on: + pull_request: + types: [opened, synchronize, reopened] + branches: + - main +jobs: + pip-build: + name: build pip package withouth publishing + runs-on: ubuntu-latest + permissions: + id-token: write + steps: + - uses: actions/checkout@v4 + - name: Set up Python + uses: actions/setup-python@v4 + with: + python-version: "3.x" + - name: Install dependencies + run: | + python -m pip install --upgrade pip + pip install setuptools wheel build + - name: Build package + run: | + python -m build + diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000..6f9a4b4 --- /dev/null +++ b/LICENSE @@ -0,0 +1,21 @@ +MIT License + +Copyright (c) 2024 Alabama Water Institute + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. diff --git a/modules/data_processing/gpkg_utils.py b/modules/data_processing/gpkg_utils.py index 5f3284f..b49a9b7 100644 --- a/modules/data_processing/gpkg_utils.py +++ b/modules/data_processing/gpkg_utils.py @@ -8,6 +8,8 @@ import struct import geopandas as gpd from pathlib import Path +import pyproj +from shapely.ops import transform logger = logging.getLogger(__name__) @@ -38,7 +40,6 @@ def verify_indices(gpkg: str = file_paths.conus_hydrofabric()) -> None: con.commit() con.close() - def create_empty_gpkg(gpkg: str) -> None: """ Create an empty geopackage with the necessary tables and indices. @@ -121,6 +122,19 @@ def blob_to_centre_point(blob: bytes) -> Point: return Point(x, y) +def convert_to_5070(shapely_geometry): + # convert to web mercator + if shapely_geometry.is_empty: + return shapely_geometry + source_crs = pyproj.CRS("EPSG:4326") + target_crs = pyproj.CRS("EPSG:5070") + project = pyproj.Transformer.from_crs(source_crs, target_crs, always_xy=True).transform + new_geometry = transform(project, shapely_geometry) + logger.debug(f" new geometry: {new_geometry}") + logger.debug(f"old geometry: {shapely_geometry}") + return new_geometry + + def get_catid_from_point(coords): """ Retrieves the watershed boundary ID (catid) of the watershed that contains the given point. @@ -138,12 +152,24 @@ def get_catid_from_point(coords): """ logger.info(f"Getting catid for {coords}") q = file_paths.conus_hydrofabric() - d = {"col1": ["point"], "geometry": [Point(coords["lng"], coords["lat"])]} - point = gpd.GeoDataFrame(d, crs="EPSG:4326") - df = gpd.read_file(q, format="GPKG", layer="divides", mask=point) - if df.empty: + point = Point(coords["lng"], coords["lat"]) + point = convert_to_5070(point) + with sqlite3.connect(q) as con: + sql = f"""SELECT DISTINCT d.divide_id, d.geom + FROM divides d + JOIN rtree_divides_geom r ON d.fid = r.id + WHERE r.minx <= {point.x} AND r.maxx >= {point.x} + AND r.miny <= {point.y} AND r.maxy >= {point.y}""" + results = con.execute(sql).fetchall() + if len(results) == 0: raise IndexError(f"No watershed boundary found for {coords}") - return df["id"].values[0] + if len(results) > 1: + # check the geometries to see which one contains the point + for result in results: + geom = blob_to_geometry(result[1]) + if geom.contains(point): + return result[0] + return results[0][0] def create_rTree_table(table: str, con: sqlite3.Connection) -> None: diff --git a/modules/data_processing/zarr_utils.py b/modules/data_processing/zarr_utils.py index 69c9fa1..42e592f 100644 --- a/modules/data_processing/zarr_utils.py +++ b/modules/data_processing/zarr_utils.py @@ -30,10 +30,29 @@ def load_zarr_datasets() -> xr.Dataset: dataset = xr.open_mfdataset(s3_stores, parallel=True, engine="zarr") return dataset + +def validate_time_range(dataset: xr.Dataset, start_time: str, end_time: str) -> Tuple[str, str]: + end_time_in_dataset = dataset.time[-1].values + start_time_in_dataset = dataset.time[0].values + if np.datetime64(start_time) < start_time_in_dataset: + logger.warning( + f"provided start {start_time} is before the start of the dataset {start_time_in_dataset}, selecting from {start_time_in_dataset}" + ) + start_time = start_time_in_dataset + if np.datetime64(end_time) > end_time_in_dataset: + logger.warning( + f"provided end {end_time} is after the end of the dataset {end_time_in_dataset}, selecting until {end_time_in_dataset}" + ) + end_time = end_time_in_dataset + return start_time, end_time + + def clip_dataset_to_bounds( dataset: xr.Dataset, bounds: Tuple[float, float, float, float], start_time: str, end_time: str ) -> xr.Dataset: """Clip the dataset to specified geographical bounds.""" + # check time range here in case just this function is imported and not the whole module + start_time, end_time = validate_time_range(dataset, start_time, end_time) dataset = dataset.sel( x=slice(bounds[0], bounds[2]), y=slice(bounds[1], bounds[3]), @@ -67,10 +86,11 @@ def get_forcing_data( cached_data = xr.open_mfdataset( forcing_paths.cached_nc_file(), parallel=True, engine="h5netcdf" ) + start_time, end_time = validate_time_range(cached_data, start_time, end_time) if cached_data.time[0].values <= np.datetime64(start_time) and cached_data.time[ -1 ].values >= np.datetime64(end_time): - logger.info("Time range is correct") + logger.info("Time range is within cached data") logger.debug(f"Opened cached nc file: [{forcing_paths.cached_nc_file()}]") merged_data = clip_dataset_to_bounds( cached_data, gdf.total_bounds, start_time, end_time diff --git a/modules/map_app/views.py b/modules/map_app/views.py index 1af4811..cb5e573 100644 --- a/modules/map_app/views.py +++ b/modules/map_app/views.py @@ -39,19 +39,6 @@ def index(): return render_template("index.html") -def get_catid_from_point(coords): - # inpute coords are EPSG:4326 - logger.info(coords) - # takes a point and returns the catid of the watershed it is in - # create a geometry mask for the point - # load the watershed boundaries - q = Path(__file__).parent.parent / "data_sources" / "conus.gpkg" - d = {"col1": ["point"], "geometry": [Point(coords["lng"], coords["lat"])]} - point = gpd.GeoDataFrame(d, crs="EPSG:4326") - df = gpd.read_file(q, format="GPKG", layer="divides", mask=point) - return df["divide_id"].values[0] - - @main.route("/handle_map_interaction", methods=["POST"]) def handle_map_interaction(): data = request.get_json() @@ -99,28 +86,20 @@ def get_map_data(): def catids_to_geojson(cat_dict): + # if the cat id is added to the dict by clicking on the map, it will have coodinates + # if it was added using the select by cat_id box, the coordinates are 0 0 + # if we just use the name this doesn't matter for k, v in cat_dict.items(): - if v[0] != 0 and v[1] != 0: - # assuming this was called by clicking on the map - cat_dict[k] = Point(v[1], v[0]) - else: - # this was called without knowing the coordinates - # use sql to get the geometry - conn = sqlite3.connect(file_paths.conus_hydrofabric()) - c = conn.cursor() - c.execute(f"SELECT geom FROM divides WHERE id = '{k}'") - result = c.fetchone() - if result is not None: - cat_dict[k] = convert_to_4326(blob_to_geometry(result[0])).buffer(-0.0001) - - d = {"col1": cat_dict.keys(), "geometry": cat_dict.values()} - points = gpd.GeoDataFrame(d, crs="EPSG:4326") - logger.debug(points) - q = Path(__file__).parent.parent / "data_sources" / "conus.gpkg" - df = gpd.read_file(q, format="GPKG", layer="divides", mask=points) - # convert crs to 4326 - df = df.to_crs(epsg=4326) - return df.to_json() + # use sql to get the geometry + conn = sqlite3.connect(file_paths.conus_hydrofabric()) + c = conn.cursor() + c.execute(f"SELECT geom FROM divides WHERE divide_id = '{k}'") + result = c.fetchone() + if result is not None: + cat_dict[k] = convert_to_4326(blob_to_geometry(result[0])).buffer(-0.0001) + df = {"col1": cat_dict.keys(), "geometry": cat_dict.values()} + gdf = gpd.GeoDataFrame(df, crs="EPSG:4326") + return gdf.to_json() @main.route("/get_geojson_from_catids", methods=["POST"]) diff --git a/pyproject.toml b/pyproject.toml index 57bba3f..d41b8b0 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -30,7 +30,7 @@ dependencies = [ "pyproj==3.6.1", "Flask==3.0.2", "Flask-Cors==4.0.1", - "geopandas==0.14.3", + "geopandas>=1.0.0", "requests==2.32.2", "igraph==0.11.4", "s3fs==2024.3.1",