Bugs/6092 fix dockerfile #8

Merged (9 commits) on Jan 13, 2025
2 changes: 1 addition & 1 deletion .github/workflows/test.yml
@@ -26,7 +26,7 @@ jobs:
version: "latest"
- name: Install dependencies
run: |
poetry lock --no-update
poetry lock
poetry install
- name: Run pre-commit checks
run: |
3 changes: 2 additions & 1 deletion Dockerfile
@@ -3,7 +3,8 @@ FROM python:3.10-slim
WORKDIR /app
RUN useradd -l -m -s /bin/bash appuser

COPY pyproject.toml poetry.lock ./
COPY pyproject.toml poetry.lock README.md ./
COPY data_access_service ./data_access_service

# For Docker build to understand the possible env
RUN apt update && \
24 changes: 23 additions & 1 deletion README.md
@@ -55,12 +55,34 @@ aodn_cloud_optimised = { git = "https://github.com/aodn/aodn_cloud_optimised.git
```

2. **Install dependencies using Poetry:**

```bash
# after cloning the repo with `git clone`
$ cd data-access-service
$ poetry install
```
```bash
# You should not need to install these Arrow libraries locally if your Python version is correct (see the quick check after this block).
# https://arrow.apache.org/install/
sudo apt update
sudo apt install -y -V ca-certificates lsb-release wget
wget https://apache.jfrog.io/artifactory/arrow/$(lsb_release --id --short | tr 'A-Z' 'a-z')/apache-arrow-apt-source-latest-$(lsb_release --codename --short).deb
sudo apt install -y -V ./apache-arrow-apt-source-latest-$(lsb_release --codename --short).deb
sudo apt update
sudo apt install -y -V libarrow-dev # For C++
sudo apt install -y -V libarrow-glib-dev # For GLib (C)
sudo apt install -y -V libarrow-dataset-dev # For Apache Arrow Dataset C++
sudo apt install -y -V libarrow-dataset-glib-dev # For Apache Arrow Dataset GLib (C)
sudo apt install -y -V libarrow-acero-dev # For Apache Arrow Acero
sudo apt install -y -V libarrow-flight-dev # For Apache Arrow Flight C++
sudo apt install -y -V libarrow-flight-glib-dev # For Apache Arrow Flight GLib (C)
sudo apt install -y -V libarrow-flight-sql-dev # For Apache Arrow Flight SQL C++
sudo apt install -y -V libarrow-flight-sql-glib-dev # For Apache Arrow Flight SQL GLib (C)
sudo apt install -y -V libgandiva-dev # For Gandiva C++
sudo apt install -y -V libgandiva-glib-dev # For Gandiva GLib (C)
sudo apt install -y -V libparquet-dev # For Apache Parquet C++
sudo apt install -y -V libparquet-glib-dev # For Apache Parquet GLib (C)
sudo apt install -y ninja-build
```
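
If you want to confirm that the system-wide Arrow packages above are indeed unnecessary, a quick sanity check is to import the `pyarrow` wheel that `poetry install` is assumed to pull in; this is a suggested check, not part of the project setup:

```python
# If this import succeeds, the pyarrow wheel (which bundles its own Arrow C++
# libraries) is available and the apt packages above should not be needed.
import pyarrow

print(pyarrow.__version__)
```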

3. **Run the app:**

50 changes: 30 additions & 20 deletions data_access_service/core/api.py
@@ -2,10 +2,10 @@
import pandas as pd
import logging

from datetime import timedelta, datetime
from datetime import timedelta, datetime, timezone
from io import BytesIO
from typing import Optional
from aodn_cloud_optimised import ParquetDataQuery
from aodn_cloud_optimised import DataQuery
from data_access_service.core.descriptor import Depth, Descriptor

log = logging.getLogger(__name__)
@@ -37,7 +37,7 @@ def __init__(self):

# UUID-to-metadata mapper, initialised here; a scheduler needs to
# keep it updated over time
self._instance = ParquetDataQuery.GetAodn()
self._instance = DataQuery.GetAodn()
self._metadata = self._instance.get_metadata()
self._create_uuid_dataset_map()

@@ -78,26 +78,26 @@ def get_raw_meta_data(self, uuid: str):
else:
return None

"""
Given a time range, we find if this uuid temporal cover the whole range
"""

def has_data(self, uuid: str, start_date: datetime, end_date: datetime):
md: Descriptor = self._cached.get(uuid)
if md is not None:
ds: ParquetDataQuery.Dataset = self._instance.get_dataset(md.dname)
while start_date <= end_date:

# currently use 366 days as a period, to make sure 1 query can cover 1 year
period_end = start_date + timedelta(days=366)
log.info(f"Checking data for {start_date} to {period_end}")
if period_end > end_date:
period_end = end_date

if not ds.get_data(
start_date, period_end, None, None, None, None, None
).empty:
return True
else:
start_date = period_end + timedelta(days=1)
ds: DataQuery.Dataset = self._instance.get_dataset(md.dname)
te = ds.get_temporal_extent()
return start_date <= te[0] and te[1] <= end_date
return False

def get_temporal_extent(self, uuid: str) -> (datetime, datetime):
md: Descriptor = self._cached.get(uuid)
if md is not None:
ds: DataQuery.Dataset = self._instance.get_dataset(md.dname)
return ds.get_temporal_extent()
else:
return ()

def get_dataset_data(
self,
uuid: str,
@@ -112,11 +112,21 @@ def get_dataset_data(
md: Descriptor = self._cached.get(uuid)

if md is not None:
ds: ParquetDataQuery.Dataset = self._instance.get_dataset(md.dname)
ds: DataQuery.Dataset = self._instance.get_dataset(md.dname)

# Default get 10 days of data
if date_start is None:
date_start = datetime.now() - timedelta(days=10)
date_start = datetime.now(timezone.utc) - timedelta(days=10)

# get_data compares timestamps via pyarrow, which only works with timezone-naive datetimes.
# Convert any timezone-aware datetime to UTC first, then strip the timezone info, since
# pyarrow cannot compare naive timestamps against timezone-aware ones.
if date_start.tzinfo is not None:
date_start = date_start.astimezone(timezone.utc).replace(tzinfo=None)

if date_end.tzinfo is not None:
date_end = date_end.astimezone(timezone.utc).replace(tzinfo=None)

return ds.get_data(
date_start, date_end, lat_min, lat_max, lon_min, lon_max, scalar_filter
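
The timezone handling above is the subtle part of this change: the dataset timestamps compared by pyarrow are timezone-naive, so aware inputs have to be normalised to UTC and stripped before `ds.get_data` is called. A minimal standalone sketch of that normalisation follows; the helper name is illustrative and not part of the module:

```python
from datetime import datetime, timedelta, timezone


def to_naive_utc(dt: datetime) -> datetime:
    # Normalise a possibly timezone-aware datetime to naive UTC,
    # mirroring what get_dataset_data now does before calling ds.get_data.
    if dt.tzinfo is not None:
        return dt.astimezone(timezone.utc).replace(tzinfo=None)
    return dt


# An aware timestamp (UTC+11) and the default "10 days ago" both end up as naive UTC.
print(to_naive_utc(datetime(2025, 1, 13, 9, 30, tzinfo=timezone(timedelta(hours=11)))))
print(to_naive_utc(datetime.now(timezone.utc) - timedelta(days=10)))
```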
51 changes: 36 additions & 15 deletions data_access_service/core/restapi.py
@@ -26,6 +26,8 @@
log = logging.getLogger(__name__)

RECORD_PER_PARTITION: Optional[int] = 1000
DATE_FORMAT = "%Y-%m-%dT%H:%M:%S%z"
MIN_DATE = "1970-01-01T00:00:00Z"


# Make all non-numeric and str field to str so that json do not throw serializable error
@@ -206,54 +208,73 @@ def get_raw_metadata(uuid):
return app.api.get_raw_meta_data(uuid)


@restapi.route("data/<string:uuid>/has_data", methods=["GET"])
def data_check(uuid):
@restapi.route("/data/<string:uuid>/has_data", methods=["GET"])
def has_data(uuid):
start_date = _verify_datatime_param(
"start_date", request.args.get("start_date", default=None, type=str)
"start_date", request.args.get("start_date", default=MIN_DATE)
)
end_date = _verify_datatime_param(
"end_date", request.args.get("end_date", default=None, type=str)
"end_date",
request.args.get(
"end_date",
default=datetime.datetime.now(datetime.timezone.utc).strftime(DATE_FORMAT),
),
)
has_data = str(app.api.has_data(uuid, start_date, end_date)).lower()
return Response(has_data, mimetype="application/json")
result = str(app.api.has_data(uuid, start_date, end_date)).lower()
return Response(result, mimetype="application/json")


@restapi.route("/data/<string:uuid>/temporal_extent", methods=["GET"])
def get_temporal_extent(uuid):
temp: (datetime, datetime) = app.api.get_temporal_extent(uuid)
result = [
{
"start_date": temp[0].strftime(DATE_FORMAT),
"end_date": temp[1].strftime(DATE_FORMAT),
}
]
return Response(json.dumps(result), mimetype="application/json")


@restapi.route("/data/<string:uuid>", methods=["GET"])
def get_data(uuid):
log.info("Request details: %s", json.dumps(request.args.to_dict(), indent=2))
start_date = _verify_datatime_param(
"start_date", request.args.get("start_date", default=None, type=str)
"start_date", request.args.get("start_date", default=MIN_DATE)
)
end_date = _verify_datatime_param(
"end_date", request.args.get("end_date", default=None, type=str)
"end_date",
request.args.get(
"end_date",
default=datetime.datetime.now(datetime.timezone.utc).strftime(DATE_FORMAT),
),
)

result: Optional[pd.DataFrame] = app.api.get_dataset_data(
uuid=uuid, date_start=start_date, date_end=end_date
)

start_depth = _verify_depth_param(
"start_depth", request.args.get("start_depth", default=None, type=numpy.double)
"start_depth", numpy.double(request.args.get("start_depth", default=-1.0))
)
end_depth = _verify_depth_param(
"end_depth", request.args.get("end_depth", default=None, type=numpy.double)
"end_depth", numpy.double(request.args.get("end_depth", default=-1.0))
)

is_to_index = _verify_to_index_flag_param(
request.args.get("is_to_index", default=None, type=str)
request.args.get("is_to_index", default=None)
)

# The cloud-optimised format is fast to query when an index exists; filtering on non-indexed
# fields gains nothing. The indexed fields are site_code, timestamp and polygon.

# Depth is measured below sea level (zero), so the comparison logic is slightly different
if start_depth is not None and end_depth is not None:
if start_depth > 0 and end_depth > 0:
filtered = result[
(result["DEPTH"] <= start_depth) & (result["DEPTH"] >= end_depth)
]
elif start_depth is not None:
elif start_depth > 0:
filtered = result[(result["DEPTH"] <= start_depth)]
elif end_depth is not None:
elif end_depth > 0:
filtered = result[result["DEPTH"] >= end_depth]
else:
filtered = result
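
For reference, a hedged usage sketch of the two endpoints touched here, assuming a local development server and a placeholder collection UUID; the exact URL prefix depends on how the restapi blueprint is registered in your deployment:

```python
import requests

BASE_URL = "http://localhost:5000"  # assumed local dev host; adjust for your deployment
UUID = "00000000-0000-0000-0000-000000000000"  # placeholder collection UUID

# has_data: omitted params now default to 1970-01-01T00:00:00Z and "now" in UTC.
resp = requests.get(
    f"{BASE_URL}/data/{UUID}/has_data",
    params={"start_date": "2024-01-01T00:00:00Z", "end_date": "2025-01-13T00:00:00Z"},
)
print(resp.text)  # "true" or "false"

# temporal_extent: returns [{"start_date": ..., "end_date": ...}] in %Y-%m-%dT%H:%M:%S%z format.
resp = requests.get(f"{BASE_URL}/data/{UUID}/temporal_extent")
print(resp.json())
```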
2 changes: 1 addition & 1 deletion environment.yml
@@ -3,7 +3,7 @@ channels:
- conda-forge
- defaults
dependencies:
- python>=3.10.14
- python=3.10.14
- pip<24.1
- pip:
- poetry