From f53af87dc55e8ee94cc6cd44dd8a7a13d1416b78 Mon Sep 17 00:00:00 2001 From: Pieter Roggemans Date: Thu, 23 Jan 2025 02:33:23 +0100 Subject: [PATCH] Improve tests --- pyogrio/tests/test_geopandas_io.py | 161 ++++++++++++++++------------- 1 file changed, 92 insertions(+), 69 deletions(-) diff --git a/pyogrio/tests/test_geopandas_io.py b/pyogrio/tests/test_geopandas_io.py index bf9f70dc..015c99c6 100644 --- a/pyogrio/tests/test_geopandas_io.py +++ b/pyogrio/tests/test_geopandas_io.py @@ -39,7 +39,7 @@ import geopandas as gp import pandas as pd from geopandas.array import from_wkt - from pandas.api.types import is_datetime64_dtype + from pandas.api.types import is_datetime64_dtype, is_object_dtype import shapely # if geopandas is present, shapely is expected to be present from shapely.geometry import Point @@ -296,6 +296,35 @@ def test_read_datetime(datetime_file, use_arrow): assert df.col.dtype.name == "datetime64[ns]" +@pytest.mark.parametrize("ext", [ext for ext in ALL_EXTS if ext != ".shp"]) +@pytest.mark.requires_arrow_write_api +def test_write_read_datetime_no_tz(tmp_path, ext, use_arrow): + """Test writing/reading a column with naive datetimes (no timezone information).""" + dates_raw = ["2020-01-01 09:00:00.123", "2020-01-01 10:00:00", None] + if PANDAS_GE_20: + dates = pd.to_datetime(dates_raw, format="ISO8601").as_unit("ms") + else: + dates = pd.to_datetime(dates_raw) + df = gp.GeoDataFrame( + {"dates": dates, "geometry": [Point(1, 1)] * 3}, crs="EPSG:4326" + ) + + fpath = tmp_path / f"test{ext}" + write_dataframe(df, fpath, use_arrow=use_arrow) + result = read_dataframe(fpath, use_arrow=use_arrow) + + if use_arrow and ext == ".gpkg" and __gdal_version__ < (3, 11, 0): + # With GDAL < 3.11 with arrow, columns with naive datetimes are written + # correctly, but when read they are wrongly interpreted as being in UTC. + # The reason is complicated, but more info can be found e.g. here: + # https://github.com/geopandas/pyogrio/issues/487#issuecomment-2423762807 + assert_series_equal(result.dates, df.dates.dt.tz_localize("UTC")) + pytest.xfail("naive datetimes read wrong in GPKG with GDAL < 3.11 via arrow") + + assert is_datetime64_dtype(result.dates.dtype) + assert_geodataframe_equal(result, df) + + @pytest.mark.parametrize("ext", [ext for ext in ALL_EXTS if ext != ".shp"]) @pytest.mark.filterwarnings("ignore: Non-conformant content for record 1 in column ") @pytest.mark.requires_arrow_write_api @@ -310,7 +339,7 @@ def test_write_read_datetime_tz(tmp_path, ext, use_arrow): # This was fixed in https://github.com/OSGeo/gdal/pull/11049 pytest.xfail("Wrong datetimes read in GeoJSON with GDAL < 3.10 via arrow") - dates_raw = ["2020-01-01T09:00:00.123-05:00", "2020-01-01T10:00:00-05:00"] + dates_raw = ["2020-01-01T09:00:00.123-05:00", "2020-01-01T10:00:00-05:00", None] if PANDAS_GE_20: dates = pd.to_datetime(dates_raw, format="ISO8601").as_unit("ms") else: @@ -319,8 +348,8 @@ def test_write_read_datetime_tz(tmp_path, ext, use_arrow): # Make the index non-consecutive to test this case as well. Added for issue # https://github.com/geopandas/pyogrio/issues/324 df = gp.GeoDataFrame( - {"dates": dates, "geometry": [Point(1, 1), Point(1, 1)]}, - index=[0, 2], + {"dates": dates, "geometry": [Point(1, 1)] * 3}, + index=[0, 2, 3], crs="EPSG:4326", ) assert isinstance(df.dates.dtype, pd.DatetimeTZDtype) @@ -330,45 +359,58 @@ def test_write_read_datetime_tz(tmp_path, ext, use_arrow): result = read_dataframe(fpath, use_arrow=use_arrow) # With some older versions, the offset is represented slightly differently - if str(result.dates.dtype).endswith(", pytz.FixedOffset(-300)]"): + if result.dates.dtype.name.endswith(", pytz.FixedOffset(-300)]"): result.dates = result.dates.astype(df.dates.dtype) - if use_arrow and __gdal_version__ < (3, 11, 0): - if ext in (".fgb", ".gpkg"): - # With GDAL < 3.11 with arrow, datetime columns are written as string type - # columns - df.dates = df.dates.map(lambda x: x.isoformat()) + if use_arrow and ext in (".fgb", ".gpkg") and __gdal_version__ < (3, 11, 0): + # With GDAL < 3.11 with arrow, datetime columns are written as string type + df_exp = df.copy() + df_exp.dates = df_exp[df_exp.dates.notna()].dates.astype(str) + assert_series_equal(result.dates, df_exp.dates, check_index=False) + pytest.xfail("datetime columns written as string with GDAL < 3.11 via arrow") + assert isinstance(df.dates.dtype, pd.DatetimeTZDtype) assert_series_equal(result.dates, df.dates, check_index=False) +@pytest.mark.parametrize("ext", [ext for ext in ALL_EXTS if ext != ".shp"]) @pytest.mark.filterwarnings( "ignore: Non-conformant content for record 1 in column dates" ) @pytest.mark.requires_arrow_write_api -def test_write_read_datetime_localized_mixed_offset(tmp_path, use_arrow): +def test_write_read_datetime_tz_localized_mixed_offset(tmp_path, ext, use_arrow): """Test with localized dates across a different summer/winter timezone offset.""" # Australian Summer Time AEDT (GMT+11), Standard Time AEST (GMT+10) - dates_raw = ["2023-01-01 11:00:01.111", "2023-06-01 10:00:01.111"] + dates_raw = ["2023-01-01 11:00:01.111", "2023-06-01 10:00:01.111", None] dates_naive = pd.Series(pd.to_datetime(dates_raw), name="dates") dates_local = dates_naive.dt.tz_localize("Australia/Sydney") - dates_local_offsets_str = dates_local.map(pd.Timestamp.isoformat) + dates_local_offsets_str = dates_local.astype("string").astype("O") dates_exp = dates_local_offsets_str.map(pd.Timestamp) df = gp.GeoDataFrame( - {"dates": dates_local, "geometry": [Point(1, 1), Point(1, 1)]}, - crs="EPSG:4326", + {"dates": dates_local, "geometry": [Point(1, 1)] * 3}, crs="EPSG:4326" ) - fpath = tmp_path / "test.gpkg" + fpath = tmp_path / f"test{ext}" write_dataframe(df, fpath, use_arrow=use_arrow) result = read_dataframe(fpath, use_arrow=use_arrow) if use_arrow and __gdal_version__ < (3, 11, 0): - # With GDAL < 3.11 with arrow, datetime columns written as string type columns - dates_exp = dates_local_offsets_str + if ext in (".geojson", ".geojsonl"): + # With GDAL < 3.11 with arrow, GDAL converts mixed timezone datetimes to UTC + # when read as the arrow datetime column type does not support mixed tz. + dates_utc = dates_local.dt.tz_convert("UTC") + if PANDAS_GE_20: + dates_utc = dates_utc.dt.as_unit("ms") + assert_series_equal(result.dates, dates_utc) + pytest.xfail("mixed tz datetimes converted to UTC with GDAL < 3.11 + arrow") + elif ext in (".gpkg", ".fgb"): + # With GDAL < 3.11 with arrow, datetime columns written as string type + assert_series_equal(result.dates, dates_local_offsets_str) + pytest.xfail("datetime columns written as string with GDAL < 3.11 + arrow") # GDAL tz only encodes offsets, not timezones - assert_series_equal(result["dates"], dates_exp) + assert is_object_dtype(result.dates.dtype) + assert_series_equal(result.dates, dates_exp) @pytest.mark.parametrize("ext", [ext for ext in ALL_EXTS if ext != ".shp"]) @@ -376,16 +418,18 @@ def test_write_read_datetime_localized_mixed_offset(tmp_path, use_arrow): "ignore: Non-conformant content for record 1 in column dates" ) @pytest.mark.requires_arrow_write_api -def test_write_read_datetime_mixed_offsets(tmp_path, ext, use_arrow): +def test_write_read_datetime_tz_mixed_offsets(tmp_path, ext, use_arrow): """Test with dates with mixed timezone offsets.""" # Pandas datetime64 column types doesn't support mixed timezone offsets, so # it needs to be a list of pandas.Timestamp objects instead. - dates_raw = ["2023-01-01 11:00:01.111+01:00", "2023-06-01 10:00:01.111+05:00"] - dates_ts = list(map(pd.Timestamp, dates_raw)) + dates = [ + pd.Timestamp("2023-01-01 11:00:01.111+01:00"), + pd.Timestamp("2023-06-01 10:00:01.111+05:00"), + None, + ] df = gp.GeoDataFrame( - {"dates": dates_ts, "geometry": [Point(1, 1), Point(1, 1)]}, - crs="EPSG:4326", + {"dates": dates, "geometry": [Point(1, 1)] * 3}, crs="EPSG:4326" ) fpath = tmp_path / f"test{ext}" write_dataframe(df, fpath, use_arrow=use_arrow) @@ -395,44 +439,21 @@ def test_write_read_datetime_mixed_offsets(tmp_path, ext, use_arrow): if ext in (".geojson", ".geojsonl"): # With GDAL < 3.11 with arrow, GDAL converts mixed timezone datetimes to UTC # when read as the arrow datetime column type does not support mixed tz. + df_exp = df.copy() + df_exp.dates = pd.to_datetime(dates, utc=True) if PANDAS_GE_20: - df.dates = pd.to_datetime(dates_ts, utc=True).as_unit("ms") - else: - df.dates = pd.to_datetime(dates_ts, utc=True) + df_exp.dates = df_exp.dates.dt.as_unit("ms") + assert_geodataframe_equal(result, df_exp) + pytest.xfail("mixed tz datetimes converted to UTC with GDAL < 3.11 + arrow") elif ext in (".gpkg", ".fgb"): # With arrow and GDAL < 3.11, mixed timezone datetimes are written as string # type columns, so no proper roundtrip possible. - df.dates = df.dates.map(pd.Timestamp.isoformat) + df_exp = df.copy() + df_exp.dates = df_exp.dates.astype("string").astype("O") + assert_geodataframe_equal(result, df_exp) + pytest.xfail("mixed tz datetimes converted to UTC with GDAL < 3.11 + arrow") - assert_geodataframe_equal(result, df) - - -@pytest.mark.parametrize("ext", [ext for ext in ALL_EXTS if ext != ".shp"]) -@pytest.mark.requires_arrow_write_api -def test_write_read_datetime_no_tz(tmp_path, ext, use_arrow): - """Test writing/reading a column with naive datetimes (no timezone information).""" - dates_raw = ["2020-01-01 09:00:00.123", "2020-01-01 10:00:00"] - if PANDAS_GE_20: - dates = pd.to_datetime(dates_raw, format="ISO8601").as_unit("ms") - else: - dates = pd.to_datetime(dates_raw) - df = gp.GeoDataFrame( - {"dates": dates, "geometry": [Point(1, 1), Point(1, 1)]}, crs="EPSG:4326" - ) - - fpath = tmp_path / f"test{ext}" - write_dataframe(df, fpath, use_arrow=use_arrow) - result = read_dataframe(fpath, use_arrow=use_arrow) - - if use_arrow and ext == ".gpkg" and __gdal_version__ < (3, 11, 0): - # With GDAL < 3.11 with arrow, columns with naive datetimes are written - # correctly, but when read they are wrongly interpreted as being in UTC. - # The reason is complicated, but more info can be found e.g. here: - # https://github.com/geopandas/pyogrio/issues/487#issuecomment-2423762807 - assert_series_equal(result.dates, df.dates.dt.tz_localize("UTC")) - pytest.xfail("naive datetimes read wrong in GPKG with GDAL < 3.11 via arrow") - - assert is_datetime64_dtype(result.dates.dtype) + assert is_object_dtype(result.dates.dtype) assert_geodataframe_equal(result, df) @@ -456,8 +477,8 @@ def test_write_read_datetime_no_tz(tmp_path, ext, use_arrow): "ignore: Non-conformant content for record 1 in column dates" ) @pytest.mark.requires_arrow_write_api -def test_write_read_datetime_objects_with_nulls(tmp_path, dates_raw, ext, use_arrow): - """Datetime objects with null values and the equal offset are read as datetime64.""" +def test_write_read_datetime_tz_objects(tmp_path, dates_raw, ext, use_arrow): + """Datetime objects with equal offsets are read as datetime64.""" if use_arrow and __gdal_version__ < (3, 10, 0) and ext in (".geojson", ".geojsonl"): # With GDAL < 3.10 with arrow, the timezone offset was applied to the datetime # as well as retaining the timezone. @@ -466,9 +487,9 @@ def test_write_read_datetime_objects_with_nulls(tmp_path, dates_raw, ext, use_ar dates = pd.Series(dates_raw, dtype="O") df = gp.GeoDataFrame( - {"dates": dates, "geometry": [Point(1, 1), Point(1, 1), Point(1, 1)]}, - crs="EPSG:4326", + {"dates": dates, "geometry": [Point(1, 1)] * 3}, crs="EPSG:4326" ) + if PANDAS_GE_20: dates_exp = pd.to_datetime(dates_raw, format="ISO8601").as_unit("ms") else: @@ -481,17 +502,18 @@ def test_write_read_datetime_objects_with_nulls(tmp_path, dates_raw, ext, use_ar result = read_dataframe(fpath, use_arrow=use_arrow) # With some older versions, the offset is represented slightly differently - if str(result.dates.dtype).endswith(", pytz.FixedOffset(-300)]"): + if result.dates.dtype.name.endswith(", pytz.FixedOffset(-300)]"): result.dates = result.dates.astype(exp_df.dates.dtype) if use_arrow and __gdal_version__ < (3, 11, 0): if ext in (".fgb", ".gpkg"): # With GDAL < 3.11 with arrow, datetime columns are written as string type - # columns - exp_df.dates = exp_df.dates.map( - lambda x: x.isoformat() if x is not pd.NaT else None - ) + exp2_df = exp_df.copy() + exp2_df.dates = exp2_df.dates.astype("string").astype("O") + assert_geodataframe_equal(result, exp2_df) + pytest.xfail("datetime columns written as string with GDAL < 3.11 + arrow") + assert isinstance(result.dates.dtype, pd.DatetimeTZDtype) assert_geodataframe_equal(result, exp_df) @@ -499,14 +521,15 @@ def test_write_read_datetime_objects_with_nulls(tmp_path, dates_raw, ext, use_ar @pytest.mark.requires_arrow_write_api def test_write_read_datetime_utc(tmp_path, ext, use_arrow): """Test writing/reading a column with UTC datetimes.""" - dates_raw = ["2020-01-01 09:00:00.123Z", "2020-01-01 10:00:00Z"] + dates_raw = ["2020-01-01 09:00:00.123Z", "2020-01-01 10:00:00Z", None] if PANDAS_GE_20: dates = pd.to_datetime(dates_raw, format="ISO8601").as_unit("ms") else: dates = pd.to_datetime(dates_raw) df = gp.GeoDataFrame( - {"dates": dates, "geometry": [Point(1, 1), Point(1, 1)]}, crs="EPSG:4326" + {"dates": dates, "geometry": [Point(1, 1)] * 3}, crs="EPSG:4326" ) + assert df.dates.dtype.name == "datetime64[ms, UTC]" fpath = tmp_path / f"test{ext}" write_dataframe(df, fpath, use_arrow=use_arrow) @@ -517,7 +540,7 @@ def test_write_read_datetime_utc(tmp_path, ext, use_arrow): assert_series_equal(result.dates, df.dates.dt.tz_localize(None)) pytest.xfail("UTC datetimes read wrong in .fgb with GDAL < 3.11 via arrow") - assert str(result.dates.dtype) == "datetime64[ms, UTC]" + assert result.dates.dtype.name == "datetime64[ms, UTC]" assert_geodataframe_equal(result, df)