From 6eb19933282bfcf8b15448712988ad4851dcc8fc Mon Sep 17 00:00:00 2001
From: Josh Cunningham <josh.cu@gmail.com>
Date: Thu, 5 Sep 2024 17:06:27 -0500
Subject: [PATCH] Merge in main fixes (#42)

* bugfix - check input dates within available zarr data

* Gage crosswalk (#41)

* improve gage to cat crosswalk

* set geopandas minimum version to 1.0.0

* fix gage cli

* add build on pr

* fix build_only workflow naming

* Create LICENSE

* fix geopandas without fiona
---
 .github/workflows/build_only.yml      | 27 +++++++++++++++
 LICENSE                               | 21 ++++++++++++
 modules/data_processing/gpkg_utils.py | 38 ++++++++++++++++++----
 modules/data_processing/zarr_utils.py | 22 ++++++++++++-
 modules/map_app/views.py              | 47 ++++++++-------------------
 pyproject.toml                        |  2 +-
 6 files changed, 115 insertions(+), 42 deletions(-)
 create mode 100644 .github/workflows/build_only.yml
 create mode 100644 LICENSE

diff --git a/.github/workflows/build_only.yml b/.github/workflows/build_only.yml
new file mode 100644
index 0000000..ed4dce8
--- /dev/null
+++ b/.github/workflows/build_only.yml
@@ -0,0 +1,27 @@
+name: Upload Python Package to PyPI when a Release is Created
+
+on:
+  pull_request:
+    types: [opened, synchronize, reopened]
+    branches:
+      - main
+jobs:
+  pip-build:
+    name: build pip package withouth publishing
+    runs-on: ubuntu-latest
+    permissions:
+      id-token: write
+    steps:
+      - uses: actions/checkout@v4
+      - name: Set up Python
+        uses: actions/setup-python@v4
+        with:
+          python-version: "3.x"
+      - name: Install dependencies
+        run: |
+          python -m pip install --upgrade pip
+          pip install setuptools wheel build
+      - name: Build package
+        run: |
+          python -m build
+
diff --git a/LICENSE b/LICENSE
new file mode 100644
index 0000000..6f9a4b4
--- /dev/null
+++ b/LICENSE
@@ -0,0 +1,21 @@
+MIT License
+
+Copyright (c) 2024 Alabama  Water Institute
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
diff --git a/modules/data_processing/gpkg_utils.py b/modules/data_processing/gpkg_utils.py
index 5f3284f..b49a9b7 100644
--- a/modules/data_processing/gpkg_utils.py
+++ b/modules/data_processing/gpkg_utils.py
@@ -8,6 +8,8 @@
 import struct
 import geopandas as gpd
 from pathlib import Path
+import pyproj
+from shapely.ops import transform
 
 logger = logging.getLogger(__name__)
 
@@ -38,7 +40,6 @@ def verify_indices(gpkg: str = file_paths.conus_hydrofabric()) -> None:
             con.commit()
     con.close()
 
-
 def create_empty_gpkg(gpkg: str) -> None:
     """
     Create an empty geopackage with the necessary tables and indices.
@@ -121,6 +122,19 @@ def blob_to_centre_point(blob: bytes) -> Point:
     return Point(x, y)
 
 
+def convert_to_5070(shapely_geometry):
+    # convert to web mercator
+    if shapely_geometry.is_empty:
+        return shapely_geometry
+    source_crs = pyproj.CRS("EPSG:4326")
+    target_crs = pyproj.CRS("EPSG:5070")
+    project = pyproj.Transformer.from_crs(source_crs, target_crs, always_xy=True).transform
+    new_geometry = transform(project, shapely_geometry)
+    logger.debug(f" new geometry: {new_geometry}")
+    logger.debug(f"old geometry: {shapely_geometry}")
+    return new_geometry
+
+
 def get_catid_from_point(coords):
     """
     Retrieves the watershed boundary ID (catid) of the watershed that contains the given point.
@@ -138,12 +152,24 @@ def get_catid_from_point(coords):
     """
     logger.info(f"Getting catid for {coords}")
     q = file_paths.conus_hydrofabric()
-    d = {"col1": ["point"], "geometry": [Point(coords["lng"], coords["lat"])]}
-    point = gpd.GeoDataFrame(d, crs="EPSG:4326")
-    df = gpd.read_file(q, format="GPKG", layer="divides", mask=point)
-    if df.empty:
+    point = Point(coords["lng"], coords["lat"])
+    point = convert_to_5070(point)
+    with sqlite3.connect(q) as con:
+        sql = f"""SELECT DISTINCT d.divide_id, d.geom
+                FROM divides d
+                JOIN rtree_divides_geom r ON d.fid = r.id
+                WHERE r.minx <= {point.x} AND r.maxx >= {point.x}
+                AND r.miny <= {point.y} AND r.maxy >= {point.y}"""
+        results = con.execute(sql).fetchall()
+    if len(results) == 0:
         raise IndexError(f"No watershed boundary found for {coords}")
-    return df["id"].values[0]
+    if len(results) > 1:
+        # check the geometries to see which one contains the point
+        for result in results:
+            geom = blob_to_geometry(result[1])
+            if geom.contains(point):
+                return result[0]
+    return results[0][0]
 
 
 def create_rTree_table(table: str, con: sqlite3.Connection) -> None:
diff --git a/modules/data_processing/zarr_utils.py b/modules/data_processing/zarr_utils.py
index 69c9fa1..42e592f 100644
--- a/modules/data_processing/zarr_utils.py
+++ b/modules/data_processing/zarr_utils.py
@@ -30,10 +30,29 @@ def load_zarr_datasets() -> xr.Dataset:
     dataset = xr.open_mfdataset(s3_stores, parallel=True, engine="zarr")
     return dataset
 
+
+def validate_time_range(dataset: xr.Dataset, start_time: str, end_time: str) -> Tuple[str, str]:
+    end_time_in_dataset = dataset.time[-1].values
+    start_time_in_dataset = dataset.time[0].values
+    if np.datetime64(start_time) < start_time_in_dataset:
+        logger.warning(
+            f"provided start {start_time} is before the start of the dataset {start_time_in_dataset}, selecting from {start_time_in_dataset}"
+        )
+        start_time = start_time_in_dataset
+    if np.datetime64(end_time) > end_time_in_dataset:
+        logger.warning(
+            f"provided end {end_time} is after the end of the dataset {end_time_in_dataset}, selecting until {end_time_in_dataset}"
+        )
+        end_time = end_time_in_dataset
+    return start_time, end_time
+
+
 def clip_dataset_to_bounds(
     dataset: xr.Dataset, bounds: Tuple[float, float, float, float], start_time: str, end_time: str
 ) -> xr.Dataset:
     """Clip the dataset to specified geographical bounds."""
+    # check time range here in case just this function is imported and not the whole module
+    start_time, end_time = validate_time_range(dataset, start_time, end_time)
     dataset = dataset.sel(
         x=slice(bounds[0], bounds[2]),
         y=slice(bounds[1], bounds[3]),
@@ -67,10 +86,11 @@ def get_forcing_data(
         cached_data = xr.open_mfdataset(
             forcing_paths.cached_nc_file(), parallel=True, engine="h5netcdf"
         )
+        start_time, end_time = validate_time_range(cached_data, start_time, end_time)
         if cached_data.time[0].values <= np.datetime64(start_time) and cached_data.time[
             -1
         ].values >= np.datetime64(end_time):
-            logger.info("Time range is correct")
+            logger.info("Time range is within cached data")
             logger.debug(f"Opened cached nc file: [{forcing_paths.cached_nc_file()}]")
             merged_data = clip_dataset_to_bounds(
                 cached_data, gdf.total_bounds, start_time, end_time
diff --git a/modules/map_app/views.py b/modules/map_app/views.py
index 1af4811..cb5e573 100644
--- a/modules/map_app/views.py
+++ b/modules/map_app/views.py
@@ -39,19 +39,6 @@ def index():
     return render_template("index.html")
 
 
-def get_catid_from_point(coords):
-    # inpute coords are EPSG:4326
-    logger.info(coords)
-    # takes a point and returns the catid of the watershed it is in
-    # create a geometry mask for the point
-    # load the watershed boundaries
-    q = Path(__file__).parent.parent / "data_sources" / "conus.gpkg"
-    d = {"col1": ["point"], "geometry": [Point(coords["lng"], coords["lat"])]}
-    point = gpd.GeoDataFrame(d, crs="EPSG:4326")
-    df = gpd.read_file(q, format="GPKG", layer="divides", mask=point)
-    return df["divide_id"].values[0]
-
-
 @main.route("/handle_map_interaction", methods=["POST"])
 def handle_map_interaction():
     data = request.get_json()
@@ -99,28 +86,20 @@ def get_map_data():
 
 
 def catids_to_geojson(cat_dict):
+    # if the cat id is added to the dict by clicking on the map, it will have coodinates
+    # if it was added using the select by cat_id box, the coordinates are 0 0
+    # if we just use the name this doesn't matter
     for k, v in cat_dict.items():
-        if v[0] != 0 and v[1] != 0:
-            # assuming this was called by clicking on the map
-            cat_dict[k] = Point(v[1], v[0])
-        else:
-            # this was called without knowing the coordinates
-            # use sql to get the geometry
-            conn = sqlite3.connect(file_paths.conus_hydrofabric())
-            c = conn.cursor()
-            c.execute(f"SELECT geom FROM divides WHERE id = '{k}'")
-            result = c.fetchone()
-            if result is not None:
-                cat_dict[k] = convert_to_4326(blob_to_geometry(result[0])).buffer(-0.0001)
-
-    d = {"col1": cat_dict.keys(), "geometry": cat_dict.values()}
-    points = gpd.GeoDataFrame(d, crs="EPSG:4326")
-    logger.debug(points)
-    q = Path(__file__).parent.parent / "data_sources" / "conus.gpkg"
-    df = gpd.read_file(q, format="GPKG", layer="divides", mask=points)
-    # convert crs to 4326
-    df = df.to_crs(epsg=4326)
-    return df.to_json()
+        # use sql to get the geometry
+        conn = sqlite3.connect(file_paths.conus_hydrofabric())
+        c = conn.cursor()
+        c.execute(f"SELECT geom FROM divides WHERE divide_id = '{k}'")
+        result = c.fetchone()
+        if result is not None:
+            cat_dict[k] = convert_to_4326(blob_to_geometry(result[0])).buffer(-0.0001)
+    df = {"col1": cat_dict.keys(), "geometry": cat_dict.values()}
+    gdf = gpd.GeoDataFrame(df, crs="EPSG:4326")
+    return gdf.to_json()
 
 
 @main.route("/get_geojson_from_catids", methods=["POST"])
diff --git a/pyproject.toml b/pyproject.toml
index 57bba3f..d41b8b0 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -30,7 +30,7 @@ dependencies = [
     "pyproj==3.6.1",
     "Flask==3.0.2",
     "Flask-Cors==4.0.1",
-    "geopandas==0.14.3",
+    "geopandas>=1.0.0",
     "requests==2.32.2",
     "igraph==0.11.4",
     "s3fs==2024.3.1",