Commit 03c1014
Use resolution-dependent default units for lazy time encoding (pydata#10017)
When lazily encoding non-nanosecond times, the optimal integer encoding units depend on the resolution of the data. This PR updates our encoding pipeline accordingly.

Note that due to our internal reliance on pandas for date string parsing, we are still unable to round-trip times outside the range -9999-01-01 to 9999-12-31 with pandas / NumPy, but this should at least pick more natural default units than nanoseconds for chunked arrays of non-nanosecond-precision times. This gives users another way of addressing pydata#9154 (i.e. using non-nanosecond time arrays).
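
As a quick sketch of the intended behavior (illustrative only, not part of this diff; the dates and chunking are hypothetical, and it assumes pandas >= 2.0 for the `unit` argument to `pd.date_range` plus an installed dask):

```python
import dask.array
import pandas as pd

from xarray.coding.times import encode_cf_datetime

# Second-resolution times, chunked so the lazy encoding path is taken.
times = pd.date_range("2000-01-01", periods=3, freq="D", unit="s")
chunked = dask.array.from_array(times, chunks=1)

# With no units or dtype specified, the default units now follow the
# data's resolution instead of always being nanoseconds.
encoded, units, calendar = encode_cf_datetime(chunked, None, None, None)
print(units)  # expected: "seconds since 1970-01-01"
```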
spencerkclark authored Feb 5, 2025
1 parent 160cced commit 03c1014
Showing 3 changed files with 30 additions and 10 deletions.
6 changes: 6 additions & 0 deletions doc/whats-new.rst
@@ -33,6 +33,12 @@ Deprecations

 Bug fixes
 ~~~~~~~~~
+- Default to resolution-dependent optimal integer encoding units when saving
+  chunked non-nanosecond :py:class:`numpy.datetime64` or
+  :py:class:`numpy.timedelta64` arrays to disk. Previously units of
+  "nanoseconds" were chosen by default, which are optimal for
+  nanosecond-resolution times, but not for times with coarser resolution. By
+  `Spencer Clark <https://github.com/spencerkclark>`_ (:pull:`10017`).


 Documentation
11 changes: 9 additions & 2 deletions xarray/coding/times.py
@@ -153,6 +153,12 @@ def _numpy_to_netcdf_timeunit(units: NPDatetimeUnitOptions) -> str:
     }[units]


+def _numpy_dtype_to_netcdf_timeunit(dtype: np.dtype) -> str:
+    unit, _ = np.datetime_data(dtype)
+    unit = cast(NPDatetimeUnitOptions, unit)
+    return _numpy_to_netcdf_timeunit(unit)
+
+
 def _ensure_padded_year(ref_date: str) -> str:
     # Reference dates without a padded year (e.g. since 1-1-1 or since 2-3-4)
     # are ambiguous (is it YMD or DMY?). This can lead to some very odd
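
For reference, the new helper leans on `np.datetime_data`, which extracts the unit code stored in a datetime64/timedelta64 dtype; a small illustration (not part of the diff):

```python
import numpy as np

# np.datetime_data returns the (unit, count) pair encoded in the dtype.
print(np.datetime_data(np.dtype("datetime64[s]")))    # ('s', 1)
print(np.datetime_data(np.dtype("timedelta64[ms]")))  # ('ms', 1)

# _numpy_to_netcdf_timeunit then maps these codes to CF-style names,
# e.g. "s" -> "seconds", "ms" -> "milliseconds", "us" -> "microseconds",
# "ns" -> "nanoseconds".
```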
@@ -1143,7 +1149,8 @@ def _lazily_encode_cf_datetime(
             units = "microseconds since 1970-01-01"
             dtype = np.dtype("int64")
         else:
-            units = "nanoseconds since 1970-01-01"
+            netcdf_unit = _numpy_dtype_to_netcdf_timeunit(dates.dtype)
+            units = f"{netcdf_unit} since 1970-01-01"
             dtype = np.dtype("int64")

     if units is None or dtype is None:
@@ -1249,7 +1256,7 @@ def _lazily_encode_cf_timedelta(
     timedeltas: T_ChunkedArray, units: str | None = None, dtype: np.dtype | None = None
 ) -> tuple[T_ChunkedArray, str]:
     if units is None and dtype is None:
-        units = "nanoseconds"
+        units = _numpy_dtype_to_netcdf_timeunit(timedeltas.dtype)
         dtype = np.dtype("int64")

     if units is None or dtype is None:
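
Analogous to the datetime case, a minimal sketch of the timedelta path (illustrative only; assumes pandas >= 2.0 for `TimedeltaIndex.as_unit` and an installed dask):

```python
import dask.array
import pandas as pd

from xarray.coding.times import encode_cf_timedelta

# Millisecond-resolution timedeltas, chunked to trigger the lazy path.
deltas = pd.timedelta_range(start="0D", freq="D", periods=3).as_unit("ms")
chunked = dask.array.from_array(deltas, chunks=1)

# With no units or dtype specified, the default units match the resolution.
encoded, units = encode_cf_timedelta(chunked, None, None)
print(units)  # expected: "milliseconds"
```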
23 changes: 15 additions & 8 deletions xarray/tests/test_coding_times.py
@@ -1620,10 +1620,10 @@ def test_roundtrip_float_times(fill_value, times, units, encoded_values) -> None
     _ENCODE_DATETIME64_VIA_DASK_TESTS.values(),
     ids=_ENCODE_DATETIME64_VIA_DASK_TESTS.keys(),
 )
-def test_encode_cf_datetime_datetime64_via_dask(freq, units, dtype) -> None:
+def test_encode_cf_datetime_datetime64_via_dask(freq, units, dtype, time_unit) -> None:
     import dask.array

-    times_pd = pd.date_range(start="1700", freq=freq, periods=3)
+    times_pd = pd.date_range(start="1700", freq=freq, periods=3, unit=time_unit)
     times = dask.array.from_array(times_pd, chunks=1)
     encoded_times, encoding_units, encoding_calendar = encode_cf_datetime(
         times, units, None, dtype
@@ -1636,13 +1636,17 @@ def test_encode_cf_datetime_datetime64_via_dask(freq, units, dtype) -> None:
         assert encoding_units == units
         assert encoded_times.dtype == dtype
     else:
-        assert encoding_units == "nanoseconds since 1970-01-01"
+        expected_netcdf_time_unit = _numpy_to_netcdf_timeunit(time_unit)
+        assert encoding_units == f"{expected_netcdf_time_unit} since 1970-01-01"
         assert encoded_times.dtype == np.dtype("int64")

     assert encoding_calendar == "proleptic_gregorian"

-    decoded_times = decode_cf_datetime(encoded_times, encoding_units, encoding_calendar)
+    decoded_times = decode_cf_datetime(
+        encoded_times, encoding_units, encoding_calendar, time_unit=time_unit
+    )
     np.testing.assert_equal(decoded_times, times)
+    assert decoded_times.dtype == times.dtype


 @requires_dask
@@ -1749,11 +1753,11 @@ def test_encode_cf_datetime_casting_overflow_error(use_cftime, use_dask, dtype)
     ("units", "dtype"), [("days", np.dtype("int32")), (None, None)]
 )
 def test_encode_cf_timedelta_via_dask(
-    units: str | None, dtype: np.dtype | None
+    units: str | None, dtype: np.dtype | None, time_unit: PDDatetimeUnitOptions
 ) -> None:
     import dask.array

-    times_pd = pd.timedelta_range(start="0D", freq="D", periods=3)
+    times_pd = pd.timedelta_range(start="0D", freq="D", periods=3, unit=time_unit)  # type: ignore[call-arg]
     times = dask.array.from_array(times_pd, chunks=1)
     encoded_times, encoding_units = encode_cf_timedelta(times, units, dtype)

@@ -1764,11 +1768,14 @@ def test_encode_cf_datetime_casting_overflow_error(use_cftime, use_dask, dtype)
         assert encoding_units == units
         assert encoded_times.dtype == dtype
     else:
-        assert encoding_units == "nanoseconds"
+        assert encoding_units == _numpy_to_netcdf_timeunit(time_unit)
         assert encoded_times.dtype == np.dtype("int64")

-    decoded_times = decode_cf_timedelta(encoded_times, encoding_units)
+    decoded_times = decode_cf_timedelta(
+        encoded_times, encoding_units, time_unit=time_unit
+    )
     np.testing.assert_equal(decoded_times, times)
+    assert decoded_times.dtype == times.dtype


 @pytest.mark.parametrize("use_dask", [False, pytest.param(True, marks=requires_dask)])
