NOAA-GSL · esheehan-gsl · Oct 31, 2023 · Oct 30, 2023 · Oct 30, 2023 · Oct 30, 2023
diff --git a/src/unified_graphics/diag.py b/src/unified_graphics/diag.py
@@ -2,7 +2,6 @@
 from collections import namedtuple
 from dataclasses import dataclass
 from enum import Enum
-from pathlib import Path
 from typing import Generator, List, Union
 from urllib.parse import urlparse
 
@@ -461,13 +460,10 @@ def history(
     loop: MinimLoop,
     filters: MultiDict,
 ) -> pd.DataFrame:
-    # FIXME: This fails when diag_zarr is a file:// URL. Pandas ends up trying to use
-    # urlopen to read the file, but it's a directory. For now, we strip file://, but
-    # this is a hack.
-    parquet_file = (
-        Path(parquet_path.replace("file://", ""))
-        / "_".join((model, background, system, domain, frequency))
-        / variable.value
+    parquet_file = os.path.join(
+        parquet_path,
+        "_".join((model, background, system, domain, frequency)),
+        variable.value,
     )
 
     df = pd.read_parquet(

diff --git a/tests/test_diag.py b/tests/test_diag.py
@@ -2,6 +2,7 @@
 from functools import partial
 
 import numpy as np
+import pandas as pd
 import pytest
 import xarray as xr
 from botocore.session import Session
@@ -290,3 +291,141 @@ def test_lower_bounds(self, result, expected):
 
     def test_upper_bounds(self, result, expected):
         assert (result[0][2] == expected[0][2]).all()
+
+
+def test_history(tmp_path, test_dataset, diag_parquet):  # history() via a file:// URL
+    run_list = [
+        {
+            "initialization_time": "2022-05-16T04:00",
+            "observation": [10, 20],
+            "forecast_unadjusted": [5, 10],
+            "is_used": [True, True],
+            # O - F = observation - forecast_unadjusted = [5, 10]
+        },
+        {
+            "initialization_time": "2022-05-16T07:00",
+            "observation": [1, 2, 3],
+            "forecast_unadjusted": [5, 10, 3],
+            "longitude": [0, 0, 0],
+            "latitude": [0, 0, 0],
+            "is_used": [True, True, True],
+            # O - F = observation - forecast_unadjusted = [-4, -8, 0]
+        },
+    ]
+
+    for run in run_list:
+        data = test_dataset(
+            model="RTMA",
+            system="WCOSS",
+            domain="CONUS",
+            background="RRFS",
+            frequency="REALTIME",
+            variable="ps",
+            loop="ges",
+            **run,
+        )
+        diag_parquet(data)  # presumably writes under tmp_path; confirm in conftest
+
+    result = diag.history(
+        f"file://{tmp_path}/",  # URL form, not a bare filesystem path
+        "RTMA",
+        "WCOSS",
+        "CONUS",
+        "RRFS",
+        "REALTIME",
+        diag.Variable.PRESSURE,
+        diag.MinimLoop.GUESS,
+        MultiDict(),
+    )
+
+    pd.testing.assert_frame_equal(
+        result,
+        pd.DataFrame(  # one row per run; values match the O - F lists above
+            {
+                "initialization_time": ["2022-05-16T04:00", "2022-05-16T07:00"],
+                "min": [5.0, -8.0],
+                "max": [10.0, 0.0],
+                "mean": [7.5, -4.0],
+                "count": [2.0, 3.0],
+            }
+        ),
+    )
+
+
+def test_history_s3(aws_credentials, moto_server, s3_client, test_dataset, monkeypatch):
+    bucket = "test_history_s3"
+    store = f"s3://{bucket}/"  # root URL later passed to diag.history
+    s3_client.create_bucket(Bucket=bucket)
+
+    storage_options = {"client_kwargs": {"endpoint_url": moto_server}}
+    monkeypatch.setattr(  # make diag's read_parquet use the endpoint override
+        diag.pd,
+        "read_parquet",
+        partial(pd.read_parquet, storage_options=storage_options),
+    )
+
+    run_list = [
+        {
+            "initialization_time": "2022-05-16T04:00",
+            "observation": [10, 20],
+            "forecast_unadjusted": [5, 10],
+            "is_used": [True, True],
+            # O - F = observation - forecast_unadjusted = [5, 10]
+        },
+        {
+            "initialization_time": "2022-05-16T07:00",
+            "observation": [1, 2, 3],
+            "forecast_unadjusted": [5, 10, 3],
+            "longitude": [0, 0, 0],
+            "latitude": [0, 0, 0],
+            "is_used": [True, True, True],
+            # O - F = observation - forecast_unadjusted = [-4, -8, 0]
+        },
+    ]
+
+    for run in run_list:
+        data = test_dataset(
+            model="RTMA",
+            system="WCOSS",
+            domain="CONUS",
+            background="RRFS",
+            frequency="REALTIME",
+            variable="ps",
+            loop="ges",
+            **run,
+        ).to_dataframe()  # tabular form, written with to_parquet below
+        data["loop"] = "ges"  # used as the partition column below
+        data["initialization_time"] = run["initialization_time"]
+
+        data.to_parquet(
+            f"s3://{bucket}/RTMA_RRFS_WCOSS_CONUS_REALTIME/ps",
+            partition_cols=["loop"],
+            index=True,
+            engine="pyarrow",
+            storage_options=storage_options,
+        )
+
+    result = diag.history(
+        store,  # s3:// URL; the file:// case is covered by test_history
+        "RTMA",
+        "WCOSS",
+        "CONUS",
+        "RRFS",
+        "REALTIME",
+        diag.Variable.PRESSURE,
+        diag.MinimLoop.GUESS,
+        MultiDict(),
+    )
+
+    pd.testing.assert_frame_equal(
+        result,
+        pd.DataFrame(  # one row per run; values match the O - F lists above
+            {
+                "initialization_time": ["2022-05-16T04:00", "2022-05-16T07:00"],
+                "min": [5.0, -8.0],
+                "max": [10.0, 0.0],
+                "mean": [7.5, -4.0],
+                "count": [2.0, 3.0],
+            }
+        ),
+    )