From 302631a3ba3344280eca84c44f786c3e79fc79b9 Mon Sep 17 00:00:00 2001 From: Pieter Roggemans Date: Mon, 2 Oct 2023 21:16:21 +0200 Subject: [PATCH 01/16] TST add use_arrow for geopandas tests with sql statement --- pyogrio/tests/test_geopandas_io.py | 82 ++++++++++++++++++++++-------- 1 file changed, 61 insertions(+), 21 deletions(-) diff --git a/pyogrio/tests/test_geopandas_io.py b/pyogrio/tests/test_geopandas_io.py index fc7cf16e..00e5947e 100644 --- a/pyogrio/tests/test_geopandas_io.py +++ b/pyogrio/tests/test_geopandas_io.py @@ -456,13 +456,17 @@ def test_read_sql(naturalearth_lowres_all_ext, use_arrow): # The geometry column cannot be specified when using the # default OGRSQL dialect but is returned nonetheless, so 4 columns. sql = "SELECT iso_a3 AS iso_a3_renamed, name, pop_est FROM naturalearth_lowres" - df = read_dataframe(naturalearth_lowres_all_ext, sql=sql, sql_dialect="OGRSQL") + df = read_dataframe( + naturalearth_lowres_all_ext, sql=sql, sql_dialect="OGRSQL", use_arrow=use_arrow + ) assert len(df.columns) == 4 assert len(df) == 177 # Should return single row sql = "SELECT * FROM naturalearth_lowres WHERE iso_a3 = 'CAN'" - df = read_dataframe(naturalearth_lowres_all_ext, sql=sql, sql_dialect="OGRSQL") + df = read_dataframe( + naturalearth_lowres_all_ext, sql=sql, sql_dialect="OGRSQL", use_arrow=use_arrow + ) assert len(df) == 1 assert len(df.columns) == 6 assert df.iloc[0].iso_a3 == "CAN" @@ -470,7 +474,9 @@ def test_read_sql(naturalearth_lowres_all_ext, use_arrow): sql = """SELECT * FROM naturalearth_lowres WHERE iso_a3 IN ('CAN', 'USA', 'MEX')""" - df = read_dataframe(naturalearth_lowres_all_ext, sql=sql, sql_dialect="OGRSQL") + df = read_dataframe( + naturalearth_lowres_all_ext, sql=sql, sql_dialect="OGRSQL", use_arrow=use_arrow + ) assert len(df.columns) == 6 assert len(df) == 3 assert df.iso_a3.tolist() == ["CAN", "USA", "MEX"] @@ -479,7 +485,9 @@ def test_read_sql(naturalearth_lowres_all_ext, use_arrow): FROM naturalearth_lowres WHERE iso_a3 IN ('CAN', 'USA', 'MEX') ORDER BY name""" - df = read_dataframe(naturalearth_lowres_all_ext, sql=sql, sql_dialect="OGRSQL") + df = read_dataframe( + naturalearth_lowres_all_ext, sql=sql, sql_dialect="OGRSQL", use_arrow=use_arrow + ) assert len(df.columns) == 6 assert len(df) == 3 assert df.iso_a3.tolist() == ["CAN", "MEX", "USA"] @@ -488,7 +496,9 @@ def test_read_sql(naturalearth_lowres_all_ext, use_arrow): sql = """SELECT * FROM naturalearth_lowres WHERE POP_EST >= 10000000 AND POP_EST < 100000000""" - df = read_dataframe(naturalearth_lowres_all_ext, sql=sql, sql_dialect="OGRSQL") + df = read_dataframe( + naturalearth_lowres_all_ext, sql=sql, sql_dialect="OGRSQL", use_arrow=use_arrow + ) assert len(df) == 75 assert len(df.columns) == 6 assert df.pop_est.min() >= 10000000 @@ -496,25 +506,36 @@ def test_read_sql(naturalearth_lowres_all_ext, use_arrow): # Should match no items. sql = "SELECT * FROM naturalearth_lowres WHERE ISO_A3 = 'INVALID'" - df = read_dataframe(naturalearth_lowres_all_ext, sql=sql, sql_dialect="OGRSQL") + df = read_dataframe( + naturalearth_lowres_all_ext, sql=sql, sql_dialect="OGRSQL", use_arrow=use_arrow + ) assert len(df) == 0 -def test_read_sql_invalid(naturalearth_lowres_all_ext): +def test_read_sql_invalid(naturalearth_lowres_all_ext, use_arrow): if naturalearth_lowres_all_ext.suffix == ".gpkg": with pytest.raises(Exception, match="In ExecuteSQL().*"): - read_dataframe(naturalearth_lowres_all_ext, sql="invalid") + read_dataframe( + naturalearth_lowres_all_ext, sql="invalid", use_arrow=use_arrow + ) else: with pytest.raises(Exception, match="SQL Expression Parsing Error"): - read_dataframe(naturalearth_lowres_all_ext, sql="invalid") + read_dataframe( + naturalearth_lowres_all_ext, sql="invalid", use_arrow=use_arrow + ) with pytest.raises( ValueError, match="'sql' paramater cannot be combined with 'layer'" ): - read_dataframe(naturalearth_lowres_all_ext, sql="whatever", layer="invalid") + read_dataframe( + naturalearth_lowres_all_ext, + sql="whatever", + layer="invalid", + use_arrow=use_arrow, + ) -def test_read_sql_columns_where(naturalearth_lowres_all_ext): +def test_read_sql_columns_where(naturalearth_lowres_all_ext, use_arrow): sql = "SELECT iso_a3 AS iso_a3_renamed, name, pop_est FROM naturalearth_lowres" df = read_dataframe( naturalearth_lowres_all_ext, @@ -522,13 +543,14 @@ def test_read_sql_columns_where(naturalearth_lowres_all_ext): sql_dialect="OGRSQL", columns=["iso_a3_renamed", "name"], where="iso_a3_renamed IN ('CAN', 'USA', 'MEX')", + use_arrow=use_arrow, ) assert len(df.columns) == 3 assert len(df) == 3 assert df.iso_a3_renamed.tolist() == ["CAN", "USA", "MEX"] -def test_read_sql_columns_where_bbox(naturalearth_lowres_all_ext): +def test_read_sql_columns_where_bbox(naturalearth_lowres_all_ext, use_arrow): sql = "SELECT iso_a3 AS iso_a3_renamed, name, pop_est FROM naturalearth_lowres" df = read_dataframe( naturalearth_lowres_all_ext, @@ -537,13 +559,14 @@ def test_read_sql_columns_where_bbox(naturalearth_lowres_all_ext): columns=["iso_a3_renamed", "name"], where="iso_a3_renamed IN ('CRI', 'PAN')", bbox=(-85, 8, -80, 10), + use_arrow=use_arrow, ) assert len(df.columns) == 3 assert len(df) == 2 assert df.iso_a3_renamed.tolist() == ["PAN", "CRI"] -def test_read_sql_skip_max(naturalearth_lowres_all_ext): +def test_read_sql_skip_max(naturalearth_lowres_all_ext, use_arrow): sql = """SELECT * FROM naturalearth_lowres WHERE iso_a3 IN ('CAN', 'MEX', 'USA') @@ -554,6 +577,7 @@ def test_read_sql_skip_max(naturalearth_lowres_all_ext): skip_features=1, max_features=1, sql_dialect="OGRSQL", + use_arrow=use_arrow, ) assert len(df.columns) == 6 assert len(df) == 1 @@ -561,13 +585,21 @@ def test_read_sql_skip_max(naturalearth_lowres_all_ext): sql = "SELECT * FROM naturalearth_lowres LIMIT 1" df = read_dataframe( - naturalearth_lowres_all_ext, sql=sql, max_features=3, sql_dialect="OGRSQL" + naturalearth_lowres_all_ext, + sql=sql, + max_features=3, + sql_dialect="OGRSQL", + use_arrow=use_arrow, ) assert len(df) == 1 sql = "SELECT * FROM naturalearth_lowres LIMIT 1" df = read_dataframe( - naturalearth_lowres_all_ext, sql=sql, skip_features=1, sql_dialect="OGRSQL" + naturalearth_lowres_all_ext, + sql=sql, + skip_features=1, + sql_dialect="OGRSQL", + use_arrow=use_arrow, ) assert len(df) == 0 @@ -578,10 +610,12 @@ def test_read_sql_skip_max(naturalearth_lowres_all_ext): [ext for ext in ALL_EXTS if ext != ".gpkg"], indirect=["naturalearth_lowres"], ) -def test_read_sql_dialect_sqlite_nogpkg(naturalearth_lowres): +def test_read_sql_dialect_sqlite_nogpkg(naturalearth_lowres, use_arrow): # Should return singular item sql = "SELECT * FROM naturalearth_lowres WHERE iso_a3 = 'CAN'" - df = read_dataframe(naturalearth_lowres, sql=sql, sql_dialect="SQLITE") + df = read_dataframe( + naturalearth_lowres, sql=sql, sql_dialect="SQLITE", use_arrow=use_arrow + ) assert len(df) == 1 assert len(df.columns) == 6 assert df.iloc[0].iso_a3 == "CAN" @@ -591,7 +625,9 @@ def test_read_sql_dialect_sqlite_nogpkg(naturalearth_lowres): sql = """SELECT ST_Buffer(geometry, 5) AS geometry, name, pop_est, iso_a3 FROM naturalearth_lowres WHERE ISO_A3 = 'CAN'""" - df = read_dataframe(naturalearth_lowres, sql=sql, sql_dialect="SQLITE") + df = read_dataframe( + naturalearth_lowres, sql=sql, sql_dialect="SQLITE", use_arrow=use_arrow + ) assert len(df) == 1 assert len(df.columns) == 4 assert df.iloc[0].geometry.area > area_canada @@ -601,12 +637,14 @@ def test_read_sql_dialect_sqlite_nogpkg(naturalearth_lowres): @pytest.mark.parametrize( "naturalearth_lowres", [".gpkg"], indirect=["naturalearth_lowres"] ) -def test_read_sql_dialect_sqlite_gpkg(naturalearth_lowres): +def test_read_sql_dialect_sqlite_gpkg(naturalearth_lowres, use_arrow): # "INDIRECT_SQL" prohibits GDAL from passing the SQL statement to sqlite. # Because the statement is processed within GDAL it is possible to use # spatialite functions even if sqlite isn't built with spatialite support. sql = "SELECT * FROM naturalearth_lowres WHERE iso_a3 = 'CAN'" - df = read_dataframe(naturalearth_lowres, sql=sql, sql_dialect="INDIRECT_SQLITE") + df = read_dataframe( + naturalearth_lowres, sql=sql, sql_dialect="INDIRECT_SQLITE", use_arrow=use_arrow + ) assert len(df) == 1 assert len(df.columns) == 6 assert df.iloc[0].iso_a3 == "CAN" @@ -616,7 +654,9 @@ def test_read_sql_dialect_sqlite_gpkg(naturalearth_lowres): sql = """SELECT ST_Buffer(geom, 5) AS geometry, name, pop_est, iso_a3 FROM naturalearth_lowres WHERE ISO_A3 = 'CAN'""" - df = read_dataframe(naturalearth_lowres, sql=sql, sql_dialect="INDIRECT_SQLITE") + df = read_dataframe( + naturalearth_lowres, sql=sql, sql_dialect="INDIRECT_SQLITE", use_arrow=use_arrow + ) assert len(df) == 1 assert len(df.columns) == 4 assert df.iloc[0].geometry.area > area_canada From e1a060110a842ee2385be00a96c7047df9f0650a Mon Sep 17 00:00:00 2001 From: Pieter Roggemans Date: Tue, 3 Oct 2023 00:18:42 +0200 Subject: [PATCH 02/16] Try fixing skip_features for gdal>=3.8 --- pyogrio/_io.pyx | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/pyogrio/_io.pyx b/pyogrio/_io.pyx index 71e389b5..a0b5f96d 100644 --- a/pyogrio/_io.pyx +++ b/pyogrio/_io.pyx @@ -1353,16 +1353,16 @@ def ogr_open_arrow( # make sure layer is read from beginning OGR_L_ResetReading(ogr_layer) - if not OGR_L_GetArrowStream(ogr_layer, &stream, options): - raise RuntimeError("Failed to open ArrowArrayStream from Layer") - - stream_ptr = &stream - if skip_features: - # only supported for GDAL >= 3.8.0; have to do this after getting + # only supported for GDAL >= 3.8.0; have to do this before getting # the Arrow stream OGR_L_SetNextByIndex(ogr_layer, skip_features) + if not OGR_L_GetArrowStream(ogr_layer, &stream, options): + raise RuntimeError("Failed to open ArrowArrayStream from Layer") + + stream_ptr = &stream + # stream has to be consumed before the Dataset is closed import pyarrow as pa reader = pa.RecordBatchStreamReader._import_from_c(stream_ptr) From 04e27a95e0af2c7fd413a138b5f7cbea3e9799b8 Mon Sep 17 00:00:00 2001 From: Pieter Roggemans Date: Tue, 3 Oct 2023 08:29:49 +0200 Subject: [PATCH 03/16] Try if a default sql + skip behaves different than LIMIT --- pyogrio/tests/test_geopandas_io.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyogrio/tests/test_geopandas_io.py b/pyogrio/tests/test_geopandas_io.py index 00e5947e..05c82590 100644 --- a/pyogrio/tests/test_geopandas_io.py +++ b/pyogrio/tests/test_geopandas_io.py @@ -593,7 +593,7 @@ def test_read_sql_skip_max(naturalearth_lowres_all_ext, use_arrow): ) assert len(df) == 1 - sql = "SELECT * FROM naturalearth_lowres LIMIT 1" + sql = "SELECT * FROM naturalearth_lowres WHERE iso_a3 = 'CAN'" df = read_dataframe( naturalearth_lowres_all_ext, sql=sql, From d74ccfcb892039d75c45f45cd97f36dc0f109469 Mon Sep 17 00:00:00 2001 From: Pieter Roggemans Date: Sun, 15 Oct 2023 09:35:57 +0200 Subject: [PATCH 04/16] Revert change to sql_skip_max test --- pyogrio/tests/test_geopandas_io.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyogrio/tests/test_geopandas_io.py b/pyogrio/tests/test_geopandas_io.py index 98058d81..af3b3fad 100644 --- a/pyogrio/tests/test_geopandas_io.py +++ b/pyogrio/tests/test_geopandas_io.py @@ -652,7 +652,7 @@ def test_read_sql_skip_max(naturalearth_lowres_all_ext, use_arrow): ) assert len(df) == 1 - sql = "SELECT * FROM naturalearth_lowres WHERE iso_a3 = 'CAN'" + sql = "SELECT * FROM naturalearth_lowres LIMIT 1" df = read_dataframe( naturalearth_lowres_all_ext, sql=sql, From 21757aa4cefb368d36de508bfd4c57504ff68ef7 Mon Sep 17 00:00:00 2001 From: Pieter Roggemans Date: Sun, 15 Oct 2023 10:14:40 +0200 Subject: [PATCH 05/16] Try SQLITE dialect in test --- pyogrio/tests/test_geopandas_io.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyogrio/tests/test_geopandas_io.py b/pyogrio/tests/test_geopandas_io.py index af3b3fad..dce6c38c 100644 --- a/pyogrio/tests/test_geopandas_io.py +++ b/pyogrio/tests/test_geopandas_io.py @@ -657,7 +657,7 @@ def test_read_sql_skip_max(naturalearth_lowres_all_ext, use_arrow): naturalearth_lowres_all_ext, sql=sql, skip_features=1, - sql_dialect="OGRSQL", + sql_dialect="SQLITE", use_arrow=use_arrow, ) assert len(df) == 0 From dcaf3846cd9dff26967c00387095a2c6989285c9 Mon Sep 17 00:00:00 2001 From: Pieter Roggemans Date: Sun, 15 Oct 2023 10:22:15 +0200 Subject: [PATCH 06/16] Try if sql_dialect is not specified --- pyogrio/tests/test_geopandas_io.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyogrio/tests/test_geopandas_io.py b/pyogrio/tests/test_geopandas_io.py index dce6c38c..659c7740 100644 --- a/pyogrio/tests/test_geopandas_io.py +++ b/pyogrio/tests/test_geopandas_io.py @@ -657,7 +657,7 @@ def test_read_sql_skip_max(naturalearth_lowres_all_ext, use_arrow): naturalearth_lowres_all_ext, sql=sql, skip_features=1, - sql_dialect="SQLITE", + # sql_dialect="SQLITE", use_arrow=use_arrow, ) assert len(df) == 0 From cbe063704407d450bdc114f509813c26aad49535 Mon Sep 17 00:00:00 2001 From: Pieter Roggemans Date: Sun, 15 Oct 2023 11:31:32 +0200 Subject: [PATCH 07/16] Check if offset and skip are combined in gdal3.8 --- pyogrio/tests/test_geopandas_io.py | 19 ++++++++++++++++--- 1 file changed, 16 insertions(+), 3 deletions(-) diff --git a/pyogrio/tests/test_geopandas_io.py b/pyogrio/tests/test_geopandas_io.py index 659c7740..83adf16b 100644 --- a/pyogrio/tests/test_geopandas_io.py +++ b/pyogrio/tests/test_geopandas_io.py @@ -652,15 +652,28 @@ def test_read_sql_skip_max(naturalearth_lowres_all_ext, use_arrow): ) assert len(df) == 1 + sql = "SELECT * FROM naturalearth_lowres LIMIT 1 OFFSET 1" + df_offset = read_dataframe( + naturalearth_lowres_all_ext, + sql=sql, + sql_dialect="OGRSQL", + use_arrow=use_arrow, + ) + sql = "SELECT * FROM naturalearth_lowres LIMIT 1" - df = read_dataframe( + df_skip = read_dataframe( naturalearth_lowres_all_ext, sql=sql, + sql_dialect="OGRSQL", skip_features=1, - # sql_dialect="SQLITE", use_arrow=use_arrow, ) - assert len(df) == 0 + if len(df_skip) == 0: + assert len(df_skip) == 0 + else: + assert len(df_skip) == len(df_offset) + assert len(df_skip) == 1 + assert df_skip.iso_a3 == df_offset.iso_a3 @requires_gdal_geos From 1aca3d670fb3956e85cb98f1ffdea2966a5bfb46 Mon Sep 17 00:00:00 2001 From: Pieter Roggemans Date: Sun, 15 Oct 2023 11:53:43 +0200 Subject: [PATCH 08/16] Update test_geopandas_io.py --- pyogrio/tests/test_geopandas_io.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyogrio/tests/test_geopandas_io.py b/pyogrio/tests/test_geopandas_io.py index 83adf16b..739f1583 100644 --- a/pyogrio/tests/test_geopandas_io.py +++ b/pyogrio/tests/test_geopandas_io.py @@ -673,7 +673,7 @@ def test_read_sql_skip_max(naturalearth_lowres_all_ext, use_arrow): else: assert len(df_skip) == len(df_offset) assert len(df_skip) == 1 - assert df_skip.iso_a3 == df_offset.iso_a3 + assert df_skip.iso_a3.to_list() == df_offset.iso_a3.to_list() @requires_gdal_geos From b15d116e4236ca62c4b6925f585ec8d35f0883a3 Mon Sep 17 00:00:00 2001 From: Pieter Roggemans Date: Sun, 15 Oct 2023 12:12:18 +0200 Subject: [PATCH 09/16] Some more experimental testing --- pyogrio/tests/test_geopandas_io.py | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/pyogrio/tests/test_geopandas_io.py b/pyogrio/tests/test_geopandas_io.py index 739f1583..8fa29bf6 100644 --- a/pyogrio/tests/test_geopandas_io.py +++ b/pyogrio/tests/test_geopandas_io.py @@ -660,6 +660,22 @@ def test_read_sql_skip_max(naturalearth_lowres_all_ext, use_arrow): use_arrow=use_arrow, ) + sql = "SELECT * FROM naturalearth_lowres LIMIT 1 OFFSET 5" + df_offset5 = read_dataframe( + naturalearth_lowres_all_ext, + sql=sql, + sql_dialect="OGRSQL", + use_arrow=use_arrow, + ) + sql = "SELECT * FROM naturalearth_lowres LIMIT 1 OFFSET 5" + df_offset5skip1 = read_dataframe( + naturalearth_lowres_all_ext, + sql=sql, + sql_dialect="OGRSQL", + skip_features=1, + use_arrow=use_arrow, + ) + sql = "SELECT * FROM naturalearth_lowres LIMIT 1" df_skip = read_dataframe( naturalearth_lowres_all_ext, @@ -674,6 +690,8 @@ def test_read_sql_skip_max(naturalearth_lowres_all_ext, use_arrow): assert len(df_skip) == len(df_offset) assert len(df_skip) == 1 assert df_skip.iso_a3.to_list() == df_offset.iso_a3.to_list() + assert df_skip.iso_a3.to_list() == df_offset5skip1.iso_a3.to_list() + assert df_skip.iso_a3.to_list() == df_offset5.iso_a3.to_list() @requires_gdal_geos From 1e5bd5893265639804e5aa954a244fd66ac25bef Mon Sep 17 00:00:00 2001 From: Pieter Roggemans Date: Sun, 15 Oct 2023 12:23:25 +0200 Subject: [PATCH 10/16] Another experimental gdal3.8 test --- pyogrio/tests/test_geopandas_io.py | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/pyogrio/tests/test_geopandas_io.py b/pyogrio/tests/test_geopandas_io.py index 8fa29bf6..3f1f54ed 100644 --- a/pyogrio/tests/test_geopandas_io.py +++ b/pyogrio/tests/test_geopandas_io.py @@ -675,7 +675,14 @@ def test_read_sql_skip_max(naturalearth_lowres_all_ext, use_arrow): skip_features=1, use_arrow=use_arrow, ) - + sql = "SELECT * FROM naturalearth_lowres LIMIT 1 OFFSET 1" + df_offset1skip5 = read_dataframe( + naturalearth_lowres_all_ext, + sql=sql, + sql_dialect="OGRSQL", + skip_features=5, + use_arrow=use_arrow, + ) sql = "SELECT * FROM naturalearth_lowres LIMIT 1" df_skip = read_dataframe( naturalearth_lowres_all_ext, @@ -690,6 +697,7 @@ def test_read_sql_skip_max(naturalearth_lowres_all_ext, use_arrow): assert len(df_skip) == len(df_offset) assert len(df_skip) == 1 assert df_skip.iso_a3.to_list() == df_offset.iso_a3.to_list() + assert df_skip.iso_a3.to_list() == df_offset1skip5.iso_a3.to_list() assert df_skip.iso_a3.to_list() == df_offset5skip1.iso_a3.to_list() assert df_skip.iso_a3.to_list() == df_offset5.iso_a3.to_list() From 4311baf0d424b3b1ed69b2d65c0e04021a34e0e2 Mon Sep 17 00:00:00 2001 From: Pieter Roggemans Date: Sun, 15 Oct 2023 12:45:34 +0200 Subject: [PATCH 11/16] Test diff if skip after open arrow --- pyogrio/_io.pyx | 8 ++++---- pyogrio/tests/test_geopandas_io.py | 30 ++++++++++++++++++++++++++---- 2 files changed, 30 insertions(+), 8 deletions(-) diff --git a/pyogrio/_io.pyx b/pyogrio/_io.pyx index acc65c60..d7e3ad7e 100644 --- a/pyogrio/_io.pyx +++ b/pyogrio/_io.pyx @@ -1367,14 +1367,14 @@ def ogr_open_arrow( # make sure layer is read from beginning OGR_L_ResetReading(ogr_layer) + if not OGR_L_GetArrowStream(ogr_layer, &stream, options): + raise RuntimeError("Failed to open ArrowArrayStream from Layer") + if skip_features: - # only supported for GDAL >= 3.8.0; have to do this before getting + # only supported for GDAL >= 3.8.0; have to do this after getting # the Arrow stream OGR_L_SetNextByIndex(ogr_layer, skip_features) - if not OGR_L_GetArrowStream(ogr_layer, &stream, options): - raise RuntimeError("Failed to open ArrowArrayStream from Layer") - stream_ptr = &stream # stream has to be consumed before the Dataset is closed diff --git a/pyogrio/tests/test_geopandas_io.py b/pyogrio/tests/test_geopandas_io.py index 3f1f54ed..ae0d0d42 100644 --- a/pyogrio/tests/test_geopandas_io.py +++ b/pyogrio/tests/test_geopandas_io.py @@ -642,6 +642,14 @@ def test_read_sql_skip_max(naturalearth_lowres_all_ext, use_arrow): assert len(df) == 1 assert df.iso_a3.tolist() == ["MEX"] + sql = "SELECT * FROM naturalearth_lowres" + df_all = read_dataframe( + naturalearth_lowres_all_ext, + sql=sql, + sql_dialect="OGRSQL", + use_arrow=use_arrow, + ) + sql = "SELECT * FROM naturalearth_lowres LIMIT 1" df = read_dataframe( naturalearth_lowres_all_ext, @@ -683,6 +691,14 @@ def test_read_sql_skip_max(naturalearth_lowres_all_ext, use_arrow): skip_features=5, use_arrow=use_arrow, ) + sql = "SELECT * FROM naturalearth_lowres LIMIT 1 OFFSET 0" + df_offset0skip1 = read_dataframe( + naturalearth_lowres_all_ext, + sql=sql, + sql_dialect="OGRSQL", + skip_features=1, + use_arrow=use_arrow, + ) sql = "SELECT * FROM naturalearth_lowres LIMIT 1" df_skip = read_dataframe( naturalearth_lowres_all_ext, @@ -696,10 +712,16 @@ def test_read_sql_skip_max(naturalearth_lowres_all_ext, use_arrow): else: assert len(df_skip) == len(df_offset) assert len(df_skip) == 1 - assert df_skip.iso_a3.to_list() == df_offset.iso_a3.to_list() - assert df_skip.iso_a3.to_list() == df_offset1skip5.iso_a3.to_list() - assert df_skip.iso_a3.to_list() == df_offset5skip1.iso_a3.to_list() - assert df_skip.iso_a3.to_list() == df_offset5.iso_a3.to_list() + df_skip.iso_a3.to_list() == df_offset.iso_a3.to_list() + + message = ( + f"More info: df_all.iso_a3: {df_all.iso_a3.to_list()}, " + f"df_offset1skip5.iso_a3: {df_offset1skip5.iso_a3.to_list()}, " + f"df_offset5skip1.iso_a3: {df_offset5skip1.iso_a3.to_list()}, " + f"df_offset0skip1.iso_a3: {df_offset0skip1.iso_a3.to_list()}, " + f"df_offset5.iso_a3: {df_offset5.iso_a3.to_list()}, " + ) + raise Exception(message) @requires_gdal_geos From 4c42649b20c49753a2d28ef263fa7d96648d545c Mon Sep 17 00:00:00 2001 From: Pieter Roggemans Date: Sun, 15 Oct 2023 15:42:37 +0200 Subject: [PATCH 12/16] Revert experimental changes to test test_read_sql_skip_max --- pyogrio/tests/test_geopandas_io.py | 65 +----------------------------- 1 file changed, 2 insertions(+), 63 deletions(-) diff --git a/pyogrio/tests/test_geopandas_io.py b/pyogrio/tests/test_geopandas_io.py index ae0d0d42..20d55bb7 100644 --- a/pyogrio/tests/test_geopandas_io.py +++ b/pyogrio/tests/test_geopandas_io.py @@ -642,14 +642,6 @@ def test_read_sql_skip_max(naturalearth_lowres_all_ext, use_arrow): assert len(df) == 1 assert df.iso_a3.tolist() == ["MEX"] - sql = "SELECT * FROM naturalearth_lowres" - df_all = read_dataframe( - naturalearth_lowres_all_ext, - sql=sql, - sql_dialect="OGRSQL", - use_arrow=use_arrow, - ) - sql = "SELECT * FROM naturalearth_lowres LIMIT 1" df = read_dataframe( naturalearth_lowres_all_ext, @@ -660,68 +652,15 @@ def test_read_sql_skip_max(naturalearth_lowres_all_ext, use_arrow): ) assert len(df) == 1 - sql = "SELECT * FROM naturalearth_lowres LIMIT 1 OFFSET 1" - df_offset = read_dataframe( - naturalearth_lowres_all_ext, - sql=sql, - sql_dialect="OGRSQL", - use_arrow=use_arrow, - ) - - sql = "SELECT * FROM naturalearth_lowres LIMIT 1 OFFSET 5" - df_offset5 = read_dataframe( - naturalearth_lowres_all_ext, - sql=sql, - sql_dialect="OGRSQL", - use_arrow=use_arrow, - ) - sql = "SELECT * FROM naturalearth_lowres LIMIT 1 OFFSET 5" - df_offset5skip1 = read_dataframe( - naturalearth_lowres_all_ext, - sql=sql, - sql_dialect="OGRSQL", - skip_features=1, - use_arrow=use_arrow, - ) - sql = "SELECT * FROM naturalearth_lowres LIMIT 1 OFFSET 1" - df_offset1skip5 = read_dataframe( - naturalearth_lowres_all_ext, - sql=sql, - sql_dialect="OGRSQL", - skip_features=5, - use_arrow=use_arrow, - ) - sql = "SELECT * FROM naturalearth_lowres LIMIT 1 OFFSET 0" - df_offset0skip1 = read_dataframe( - naturalearth_lowres_all_ext, - sql=sql, - sql_dialect="OGRSQL", - skip_features=1, - use_arrow=use_arrow, - ) sql = "SELECT * FROM naturalearth_lowres LIMIT 1" - df_skip = read_dataframe( + df = read_dataframe( naturalearth_lowres_all_ext, sql=sql, sql_dialect="OGRSQL", skip_features=1, use_arrow=use_arrow, ) - if len(df_skip) == 0: - assert len(df_skip) == 0 - else: - assert len(df_skip) == len(df_offset) - assert len(df_skip) == 1 - df_skip.iso_a3.to_list() == df_offset.iso_a3.to_list() - - message = ( - f"More info: df_all.iso_a3: {df_all.iso_a3.to_list()}, " - f"df_offset1skip5.iso_a3: {df_offset1skip5.iso_a3.to_list()}, " - f"df_offset5skip1.iso_a3: {df_offset5skip1.iso_a3.to_list()}, " - f"df_offset0skip1.iso_a3: {df_offset0skip1.iso_a3.to_list()}, " - f"df_offset5.iso_a3: {df_offset5.iso_a3.to_list()}, " - ) - raise Exception(message) + assert len(df) == 0 @requires_gdal_geos From 9dcdbc13038762dcaceefd964fd2520edcf12f1b Mon Sep 17 00:00:00 2001 From: Pieter Roggemans Date: Sun, 15 Oct 2023 16:33:35 +0200 Subject: [PATCH 13/16] local skip filtering if sql stmt not based on gdal version --- pyogrio/_io.pyx | 17 ++++++++--------- pyogrio/raw.py | 19 ++++++++++--------- pyogrio/tests/test_arrow.py | 26 ++++++++++++++++---------- 3 files changed, 34 insertions(+), 28 deletions(-) diff --git a/pyogrio/_io.pyx b/pyogrio/_io.pyx index d7e3ad7e..bbba082a 100644 --- a/pyogrio/_io.pyx +++ b/pyogrio/_io.pyx @@ -1266,11 +1266,11 @@ def ogr_open_arrow( if fids is not None: raise ValueError("reading by FID is not supported for Arrow") - IF CTE_GDAL_VERSION < (3, 8, 0): - if skip_features: - raise ValueError( - "specifying 'skip_features' is not supported for Arrow for GDAL<3.8.0" - ) + if skip_features and (sql is not None or where is not None): + raise ValueError( + "specifying 'skip_features' is not supported for Arrow in combination with " + "'sql' or 'where'. You can use the LIMIT and OFFSET clauses in the query." + ) if skip_features < 0: raise ValueError("'skip_features' must be >= 0") @@ -1370,13 +1370,12 @@ def ogr_open_arrow( if not OGR_L_GetArrowStream(ogr_layer, &stream, options): raise RuntimeError("Failed to open ArrowArrayStream from Layer") + stream_ptr = &stream + if skip_features: - # only supported for GDAL >= 3.8.0; have to do this after getting - # the Arrow stream + # For GDAL < 3.8.0 this needs to be set after getting the Arrow stream OGR_L_SetNextByIndex(ogr_layer, skip_features) - stream_ptr = &stream - # stream has to be consumed before the Dataset is closed import pyarrow as pa reader = pa.RecordBatchStreamReader._import_from_c(stream_ptr) diff --git a/pyogrio/raw.py b/pyogrio/raw.py index b19484fc..346a74a0 100644 --- a/pyogrio/raw.py +++ b/pyogrio/raw.py @@ -265,10 +265,11 @@ def read_arrow( if max_features is not None and max_features < batch_size: batch_size = max_features - # handle skip_features internally within open_arrow if GDAL >= 3.8.0 - gdal_skip_features = 0 - if get_gdal_version() >= (3, 8, 0): - gdal_skip_features = skip_features + # if where or sql statement specified, gdal has issues applying skip features in + # some cases, so handle it locally. + local_skip_features = 0 + if where is not None or sql is not None: + local_skip_features = skip_features skip_features = 0 with open_arrow( @@ -285,7 +286,7 @@ def read_arrow( sql=sql, sql_dialect=sql_dialect, return_fids=return_fids, - skip_features=gdal_skip_features, + skip_features=skip_features, batch_size=batch_size, **kwargs, ) as source: @@ -300,7 +301,7 @@ def read_arrow( batches.append(batch) count += len(batch) - if count >= (skip_features + max_features): + if count >= (local_skip_features + max_features): break except StopIteration: @@ -310,12 +311,12 @@ def read_arrow( # too many features table = ( Table.from_batches(batches, schema=reader.schema) - .slice(skip_features, max_features) + .slice(local_skip_features, max_features) .combine_chunks() ) - elif skip_features > 0: - table = reader.read_all().slice(skip_features).combine_chunks() + elif local_skip_features > 0: + table = reader.read_all().slice(local_skip_features).combine_chunks() else: table = reader.read_all() diff --git a/pyogrio/tests/test_arrow.py b/pyogrio/tests/test_arrow.py index dc62bff1..410a54d8 100644 --- a/pyogrio/tests/test_arrow.py +++ b/pyogrio/tests/test_arrow.py @@ -4,7 +4,7 @@ import pytest -from pyogrio import __gdal_version__, read_dataframe +from pyogrio import read_dataframe from pyogrio.raw import open_arrow, read_arrow from pyogrio.tests.conftest import requires_arrow_api @@ -152,19 +152,25 @@ def test_open_arrow_batch_size(naturalearth_lowres): assert len(tables[0]) == batch_size, "First table should match the batch size" -@pytest.mark.skipif( - __gdal_version__ >= (3, 8, 0), - reason="skip_features supported by Arrow stream API for GDAL>=3.8.0", -) @pytest.mark.parametrize("skip_features", [10, 200]) -def test_open_arrow_skip_features_unsupported(naturalearth_lowres, skip_features): - """skip_features are not supported for the Arrow stream interface for - GDAL < 3.8.0""" +@pytest.mark.parametrize( + "sql, where", [('SELECT * FROM "naturalearth_lowres"', None), (None, "1=1")] +) +def test_open_arrow_skip_features_unsupported( + naturalearth_lowres, sql, where, skip_features +): + """skip_features are not supported for the Arrow stream interface in combination + with 'sql' or 'where'.""" with pytest.raises( ValueError, - match="specifying 'skip_features' is not supported for Arrow for GDAL<3.8.0", + match=( + "specifying 'skip_features' is not supported for Arrow in combination with " + "'sql' or 'where'" + ), ): - with open_arrow(naturalearth_lowres, skip_features=skip_features) as ( + with open_arrow( + naturalearth_lowres, where=where, sql=sql, skip_features=skip_features + ) as ( meta, reader, ): From 64236cbd9d685f9f87343be5cf82dfe821bdbdf7 Mon Sep 17 00:00:00 2001 From: Pieter Roggemans Date: Wed, 25 Oct 2023 14:02:55 +0200 Subject: [PATCH 14/16] Revert changes to when filtering is done by gdal now the sql offset + skip bug is fixed in gdal --- pyogrio/_io.pyx | 10 +++++----- pyogrio/raw.py | 20 +++++++++----------- pyogrio/tests/test_arrow.py | 26 ++++++++++---------------- 3 files changed, 24 insertions(+), 32 deletions(-) diff --git a/pyogrio/_io.pyx b/pyogrio/_io.pyx index 358d50ad..9ae6456f 100644 --- a/pyogrio/_io.pyx +++ b/pyogrio/_io.pyx @@ -1283,11 +1283,11 @@ def ogr_open_arrow( if fids is not None: raise ValueError("reading by FID is not supported for Arrow") - if skip_features and (sql is not None or where is not None): - raise ValueError( - "specifying 'skip_features' is not supported for Arrow in combination with " - "'sql' or 'where'. You can use the LIMIT and OFFSET clauses in the query." - ) + IF CTE_GDAL_VERSION < (3, 8, 0): + if skip_features: + raise ValueError( + "specifying 'skip_features' is not supported for Arrow for GDAL<3.8.0" + ) if skip_features < 0: raise ValueError("'skip_features' must be >= 0") diff --git a/pyogrio/raw.py b/pyogrio/raw.py index 68b4c90e..01e5e55d 100644 --- a/pyogrio/raw.py +++ b/pyogrio/raw.py @@ -273,12 +273,10 @@ def read_arrow( if max_features is not None and max_features < batch_size: batch_size = max_features - # if where or sql statement specified, gdal has issues applying skip features in - # some cases, so handle it locally. - local_skip_features = 0 - if where is not None or sql is not None: - local_skip_features = skip_features - skip_features = 0 + # handle skip_features internally within open_arrow if GDAL >= 3.8.0 + gdal_skip_features = 0 + if get_gdal_version() >= (3, 8, 0): + gdal_skip_features = skip_features with open_arrow( path_or_buffer, @@ -294,7 +292,7 @@ def read_arrow( sql=sql, sql_dialect=sql_dialect, return_fids=return_fids, - skip_features=skip_features, + skip_features=gdal_skip_features, batch_size=batch_size, **kwargs, ) as source: @@ -309,7 +307,7 @@ def read_arrow( batches.append(batch) count += len(batch) - if count >= (local_skip_features + max_features): + if count >= (skip_features + max_features): break except StopIteration: @@ -319,12 +317,12 @@ def read_arrow( # too many features table = ( Table.from_batches(batches, schema=reader.schema) - .slice(local_skip_features, max_features) + .slice(skip_features, max_features) .combine_chunks() ) - elif local_skip_features > 0: - table = reader.read_all().slice(local_skip_features).combine_chunks() + elif skip_features > 0: + table = reader.read_all().slice(skip_features).combine_chunks() else: table = reader.read_all() diff --git a/pyogrio/tests/test_arrow.py b/pyogrio/tests/test_arrow.py index 410a54d8..dc62bff1 100644 --- a/pyogrio/tests/test_arrow.py +++ b/pyogrio/tests/test_arrow.py @@ -4,7 +4,7 @@ import pytest -from pyogrio import read_dataframe +from pyogrio import __gdal_version__, read_dataframe from pyogrio.raw import open_arrow, read_arrow from pyogrio.tests.conftest import requires_arrow_api @@ -152,25 +152,19 @@ def test_open_arrow_batch_size(naturalearth_lowres): assert len(tables[0]) == batch_size, "First table should match the batch size" -@pytest.mark.parametrize("skip_features", [10, 200]) -@pytest.mark.parametrize( - "sql, where", [('SELECT * FROM "naturalearth_lowres"', None), (None, "1=1")] +@pytest.mark.skipif( + __gdal_version__ >= (3, 8, 0), + reason="skip_features supported by Arrow stream API for GDAL>=3.8.0", ) -def test_open_arrow_skip_features_unsupported( - naturalearth_lowres, sql, where, skip_features -): - """skip_features are not supported for the Arrow stream interface in combination - with 'sql' or 'where'.""" +@pytest.mark.parametrize("skip_features", [10, 200]) +def test_open_arrow_skip_features_unsupported(naturalearth_lowres, skip_features): + """skip_features are not supported for the Arrow stream interface for + GDAL < 3.8.0""" with pytest.raises( ValueError, - match=( - "specifying 'skip_features' is not supported for Arrow in combination with " - "'sql' or 'where'" - ), + match="specifying 'skip_features' is not supported for Arrow for GDAL<3.8.0", ): - with open_arrow( - naturalearth_lowres, where=where, sql=sql, skip_features=skip_features - ) as ( + with open_arrow(naturalearth_lowres, skip_features=skip_features) as ( meta, reader, ): From 659631b7e2d3f8af5ac7beb99f004e589ad659ab Mon Sep 17 00:00:00 2001 From: Pieter Roggemans Date: Wed, 25 Oct 2023 14:04:09 +0200 Subject: [PATCH 15/16] part 2 --- pyogrio/raw.py | 1 + 1 file changed, 1 insertion(+) diff --git a/pyogrio/raw.py b/pyogrio/raw.py index 01e5e55d..4384ee9a 100644 --- a/pyogrio/raw.py +++ b/pyogrio/raw.py @@ -277,6 +277,7 @@ def read_arrow( gdal_skip_features = 0 if get_gdal_version() >= (3, 8, 0): gdal_skip_features = skip_features + skip_features = 0 with open_arrow( path_or_buffer, From 0f64f8662807b1120dda9287d794501e0c1d45ac Mon Sep 17 00:00:00 2001 From: Pieter Roggemans Date: Tue, 23 Jan 2024 16:49:01 +0100 Subject: [PATCH 16/16] Rollback changes to comment --- pyogrio/_io.pyx | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pyogrio/_io.pyx b/pyogrio/_io.pyx index 9ae6456f..12fb85a3 100644 --- a/pyogrio/_io.pyx +++ b/pyogrio/_io.pyx @@ -1390,7 +1390,8 @@ def ogr_open_arrow( stream_ptr = &stream if skip_features: - # For GDAL < 3.8.0 this needs to be set after getting the Arrow stream + # only supported for GDAL >= 3.8.0; have to do this after getting + # the Arrow stream OGR_L_SetNextByIndex(ogr_layer, skip_features) # stream has to be consumed before the Dataset is closed