Bugs/6092 fix dockerfile #8

Merged (9 commits) on Jan 13, 2025
2 changes: 1 addition & 1 deletion .github/workflows/test.yml
@@ -26,7 +26,7 @@ jobs:
version: "latest"
- name: Install dependencies
run: |
poetry lock --no-update
poetry lock
poetry install
- name: Run pre-commit checks
run: |
3 changes: 2 additions & 1 deletion Dockerfile
@@ -3,7 +3,8 @@ FROM python:3.10-slim
WORKDIR /app
RUN useradd -l -m -s /bin/bash appuser

COPY pyproject.toml poetry.lock ./
COPY pyproject.toml poetry.lock README.md ./
COPY data_access_service ./data_access_service

# For Docker build to understand the possible env
RUN apt update && \
24 changes: 23 additions & 1 deletion README.md
@@ -55,12 +55,34 @@ aodn_cloud_optimised = { git = "https://github.com/aodn/aodn_cloud_optimised.git
```

2. **Install dependencies using Poetry:**

```bash
# after cloning the repo with `git clone`
$ cd data-access-service
$ poetry install
```
```bash
# You should not need to install these Arrow libraries locally if your Python version is correct (see the quick check after this block).
# https://arrow.apache.org/install/
sudo apt update
sudo apt install -y -V ca-certificates lsb-release wget
wget https://apache.jfrog.io/artifactory/arrow/$(lsb_release --id --short | tr 'A-Z' 'a-z')/apache-arrow-apt-source-latest-$(lsb_release --codename --short).deb
sudo apt install -y -V ./apache-arrow-apt-source-latest-$(lsb_release --codename --short).deb
sudo apt update
sudo apt install -y -V libarrow-dev # For C++
sudo apt install -y -V libarrow-glib-dev # For GLib (C)
sudo apt install -y -V libarrow-dataset-dev # For Apache Arrow Dataset C++
sudo apt install -y -V libarrow-dataset-glib-dev # For Apache Arrow Dataset GLib (C)
sudo apt install -y -V libarrow-acero-dev # For Apache Arrow Acero
sudo apt install -y -V libarrow-flight-dev # For Apache Arrow Flight C++
sudo apt install -y -V libarrow-flight-glib-dev # For Apache Arrow Flight GLib (C)
sudo apt install -y -V libarrow-flight-sql-dev # For Apache Arrow Flight SQL C++
sudo apt install -y -V libarrow-flight-sql-glib-dev # For Apache Arrow Flight SQL GLib (C)
sudo apt install -y -V libgandiva-dev # For Gandiva C++
sudo apt install -y -V libgandiva-glib-dev # For Gandiva GLib (C)
sudo apt install -y -V libparquet-dev # For Apache Parquet C++
sudo apt install -y -V libparquet-glib-dev # For Apache Parquet GLib (C)
sudo apt install -y ninja-build
```
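
If you want to confirm that the system-wide Arrow packages above are indeed unnecessary, a quick sanity check is to import the `pyarrow` wheel that `poetry install` is assumed to pull in; this is a suggested check, not part of the project setup:

```python
# If this import succeeds, the pyarrow wheel (which bundles its own Arrow C++
# libraries) is available and the apt packages above should not be needed.
import pyarrow

print(pyarrow.__version__)
```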

3. **Run the app:**

50 changes: 30 additions & 20 deletions data_access_service/core/api.py
@@ -2,10 +2,10 @@
import pandas as pd
import logging

from datetime import timedelta, datetime
from datetime import timedelta, datetime, timezone
from io import BytesIO
from typing import Optional
from aodn_cloud_optimised import ParquetDataQuery
from aodn_cloud_optimised import DataQuery
from data_access_service.core.descriptor import Depth, Descriptor

log = logging.getLogger(__name__)
@@ -37,7 +37,7 @@ def __init__(self):

# UUID-to-metadata mapper, initialised here; a scheduler needs to
# keep it updated over time
self._instance = ParquetDataQuery.GetAodn()
self._instance = DataQuery.GetAodn()
self._metadata = self._instance.get_metadata()
self._create_uuid_dataset_map()

@@ -78,26 +78,26 @@ def get_raw_meta_data(self, uuid: str):
else:
return None

"""
Given a time range, we find if this uuid temporal cover the whole range
"""

def has_data(self, uuid: str, start_date: datetime, end_date: datetime):
md: Descriptor = self._cached.get(uuid)
if md is not None:
ds: ParquetDataQuery.Dataset = self._instance.get_dataset(md.dname)
while start_date <= end_date:

# currently use 366 days as a period, to make sure 1 query can cover 1 year
period_end = start_date + timedelta(days=366)
log.info(f"Checking data for {start_date} to {period_end}")
if period_end > end_date:
period_end = end_date

if not ds.get_data(
start_date, period_end, None, None, None, None, None
).empty:
return True
else:
start_date = period_end + timedelta(days=1)
ds: DataQuery.Dataset = self._instance.get_dataset(md.dname)
te = ds.get_temporal_extent()
return start_date <= te[0] and te[1] <= end_date
return False

def get_temporal_extent(self, uuid: str) -> (datetime, datetime):
md: Descriptor = self._cached.get(uuid)
if md is not None:
ds: DataQuery.Dataset = self._instance.get_dataset(md.dname)
return ds.get_temporal_extent()
else:
return ()

def get_dataset_data(
self,
uuid: str,
@@ -112,11 +112,21 @@ def get_dataset_data(
md: Descriptor = self._cached.get(uuid)

if md is not None:
ds: ParquetDataQuery.Dataset = self._instance.get_dataset(md.dname)
ds: DataQuery.Dataset = self._instance.get_dataset(md.dname)

# Default get 10 days of data
if date_start is None:
date_start = datetime.now() - timedelta(days=10)
date_start = datetime.now(timezone.utc) - timedelta(days=10)

# get_data compares timestamps via pyarrow, which only works with timezone-naive datetimes.
# Convert any timezone-aware datetime to UTC first, then strip the timezone info, since
# pyarrow cannot compare naive timestamps against timezone-aware ones.
if date_start.tzinfo is not None:
date_start = date_start.astimezone(timezone.utc).replace(tzinfo=None)

if date_end.tzinfo is not None:
date_end = date_end.astimezone(timezone.utc).replace(tzinfo=None)

return ds.get_data(
date_start, date_end, lat_min, lat_max, lon_min, lon_max, scalar_filter
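
The timezone handling above is the subtle part of this change: the dataset timestamps compared by pyarrow are timezone-naive, so aware inputs have to be normalised to UTC and stripped before `ds.get_data` is called. A minimal standalone sketch of that normalisation follows; the helper name is illustrative and not part of the module:

```python
from datetime import datetime, timedelta, timezone


def to_naive_utc(dt: datetime) -> datetime:
    # Normalise a possibly timezone-aware datetime to naive UTC,
    # mirroring what get_dataset_data now does before calling ds.get_data.
    if dt.tzinfo is not None:
        return dt.astimezone(timezone.utc).replace(tzinfo=None)
    return dt


# An aware timestamp (UTC+11) and the default "10 days ago" both end up as naive UTC.
print(to_naive_utc(datetime(2025, 1, 13, 9, 30, tzinfo=timezone(timedelta(hours=11)))))
print(to_naive_utc(datetime.now(timezone.utc) - timedelta(days=10)))
```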
51 changes: 36 additions & 15 deletions data_access_service/core/restapi.py
@@ -26,6 +26,8 @@
log = logging.getLogger(__name__)

RECORD_PER_PARTITION: Optional[int] = 1000
DATE_FORMAT = "%Y-%m-%dT%H:%M:%S%z"
MIN_DATE = "1970-01-01T00:00:00Z"


# Make all non-numeric and str field to str so that json do not throw serializable error
@@ -206,54 +208,73 @@ def get_raw_metadata(uuid):
return app.api.get_raw_meta_data(uuid)


@restapi.route("data/<string:uuid>/has_data", methods=["GET"])
def data_check(uuid):
@restapi.route("/data/<string:uuid>/has_data", methods=["GET"])
def has_data(uuid):
start_date = _verify_datatime_param(
"start_date", request.args.get("start_date", default=None, type=str)
"start_date", request.args.get("start_date", default=MIN_DATE)
)
end_date = _verify_datatime_param(
"end_date", request.args.get("end_date", default=None, type=str)
"end_date",
request.args.get(
"end_date",
default=datetime.datetime.now(datetime.timezone.utc).strftime(DATE_FORMAT),
),
)
has_data = str(app.api.has_data(uuid, start_date, end_date)).lower()
return Response(has_data, mimetype="application/json")
result = str(app.api.has_data(uuid, start_date, end_date)).lower()
return Response(result, mimetype="application/json")


@restapi.route("/data/<string:uuid>/temporal_extent", methods=["GET"])
def get_temporal_extent(uuid):
temp: (datetime, datetime) = app.api.get_temporal_extent(uuid)
result = [
{
"start_date": temp[0].strftime(DATE_FORMAT),
"end_date": temp[1].strftime(DATE_FORMAT),
}
]
return Response(json.dumps(result), mimetype="application/json")


@restapi.route("/data/<string:uuid>", methods=["GET"])
def get_data(uuid):
log.info("Request details: %s", json.dumps(request.args.to_dict(), indent=2))
start_date = _verify_datatime_param(
"start_date", request.args.get("start_date", default=None, type=str)
"start_date", request.args.get("start_date", default=MIN_DATE)
)
end_date = _verify_datatime_param(
"end_date", request.args.get("end_date", default=None, type=str)
"end_date",
request.args.get(
"end_date",
default=datetime.datetime.now(datetime.timezone.utc).strftime(DATE_FORMAT),
),
)

result: Optional[pd.DataFrame] = app.api.get_dataset_data(
uuid=uuid, date_start=start_date, date_end=end_date
)

start_depth = _verify_depth_param(
"start_depth", request.args.get("start_depth", default=None, type=numpy.double)
"start_depth", numpy.double(request.args.get("start_depth", default=-1.0))
)
end_depth = _verify_depth_param(
"end_depth", request.args.get("end_depth", default=None, type=numpy.double)
"end_depth", numpy.double(request.args.get("end_depth", default=-1.0))
)

is_to_index = _verify_to_index_flag_param(
request.args.get("is_to_index", default=None, type=str)
request.args.get("is_to_index", default=None)
)

# The cloud-optimised format is fast to query when an index exists; filtering on non-indexed
# fields gains nothing. The indexed fields are site_code, timestamp and polygon.

# Depth is measured below sea level (zero), so the comparison logic is slightly different
if start_depth is not None and end_depth is not None:
if start_depth > 0 and end_depth > 0:
filtered = result[
(result["DEPTH"] <= start_depth) & (result["DEPTH"] >= end_depth)
]
elif start_depth is not None:
elif start_depth > 0:
filtered = result[(result["DEPTH"] <= start_depth)]
elif end_depth is not None:
elif end_depth > 0:
filtered = result[result["DEPTH"] >= end_depth]
else:
filtered = result
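
For reference, a hedged usage sketch of the two endpoints touched here, assuming a local development server and a placeholder collection UUID; the exact URL prefix depends on how the restapi blueprint is registered in your deployment:

```python
import requests

BASE_URL = "http://localhost:5000"  # assumed local dev host; adjust for your deployment
UUID = "00000000-0000-0000-0000-000000000000"  # placeholder collection UUID

# has_data: omitted params now default to 1970-01-01T00:00:00Z and "now" in UTC.
resp = requests.get(
    f"{BASE_URL}/data/{UUID}/has_data",
    params={"start_date": "2024-01-01T00:00:00Z", "end_date": "2025-01-13T00:00:00Z"},
)
print(resp.text)  # "true" or "false"

# temporal_extent: returns [{"start_date": ..., "end_date": ...}] in %Y-%m-%dT%H:%M:%S%z format.
resp = requests.get(f"{BASE_URL}/data/{UUID}/temporal_extent")
print(resp.json())
```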
2 changes: 1 addition & 1 deletion environment.yml
@@ -3,7 +3,7 @@ channels:
- conda-forge
- defaults
dependencies:
- python>=3.10.14
- python=3.10.14
- pip<24.1
- pip:
- poetry