diff --git a/.github/workflows/Rest.yml b/.github/workflows/Rest.yml
index 11560e9..cb30450 100644
--- a/.github/workflows/Rest.yml
+++ b/.github/workflows/Rest.yml
@@ -50,10 +50,6 @@ jobs:
       run: |
         make release
 
-    - name: Start Rest Catalog
-      run: |
-        make start-rest-catalog
-
     - name: Generate data
       run: |
         make data
diff --git a/scripts/data_generators/generate_spark_local/lineitem_001_deletes/lineitem_001_deletes/q00.sql b/scripts/data_generators/generate_spark_local/lineitem_001_deletes/lineitem_001_deletes/q00.sql
new file mode 100644
index 0000000..959157d
--- /dev/null
+++ b/scripts/data_generators/generate_spark_local/lineitem_001_deletes/lineitem_001_deletes/q00.sql
@@ -0,0 +1,6 @@
+CREATE or REPLACE TABLE iceberg_catalog.lineitem_001_deletes
+  TBLPROPERTIES (
+    'format-version'='2',
+    'write.update.mode'='merge-on-read'
+  )
+AS SELECT * FROM parquet_file_view;
\ No newline at end of file
diff --git a/scripts/data_generators/generate_spark_local/lineitem_001_deletes/lineitem_001_deletes/q01.sql b/scripts/data_generators/generate_spark_local/lineitem_001_deletes/lineitem_001_deletes/q01.sql
new file mode 100644
index 0000000..2894b39
--- /dev/null
+++ b/scripts/data_generators/generate_spark_local/lineitem_001_deletes/lineitem_001_deletes/q01.sql
@@ -0,0 +1,11 @@
+update iceberg_catalog.lineitem_001_deletes
+set l_orderkey=NULL,
+    l_partkey=NULL,
+    l_suppkey=NULL,
+    l_linenumber=NULL,
+    l_quantity=NULL,
+    l_extendedprice=NULL,
+    l_discount=NULL,
+    l_shipdate=NULL,
+    l_comment=NULL
+where l_partkey % 2 = 0;
\ No newline at end of file
diff --git a/scripts/data_generators/generate_spark_local/lineitem_001_deletes/lineitem_001_deletes/setup.py b/scripts/data_generators/generate_spark_local/lineitem_001_deletes/lineitem_001_deletes/setup.py
new file mode 100644
index 0000000..df29dff
--- /dev/null
+++ b/scripts/data_generators/generate_spark_local/lineitem_001_deletes/lineitem_001_deletes/setup.py
@@ -0,0 +1,8 @@
+import duckdb
+import os
+
+PARQUET_SRC_FILE = os.getenv('PARQUET_SRC_FILE')
+
+duckdb_con = duckdb.connect()
+duckdb_con.execute("call dbgen(sf=0.01)")
+duckdb_con.execute(f"copy lineitem to '{PARQUET_SRC_FILE}' (FORMAT PARQUET)")
\ No newline at end of file
diff --git a/scripts/data_generators/generate_spark_local/lineitem_001_deletes/q00.sql b/scripts/data_generators/generate_spark_local/lineitem_001_deletes/q00.sql
new file mode 100644
index 0000000..959157d
--- /dev/null
+++ b/scripts/data_generators/generate_spark_local/lineitem_001_deletes/q00.sql
@@ -0,0 +1,6 @@
+CREATE or REPLACE TABLE iceberg_catalog.lineitem_001_deletes
+  TBLPROPERTIES (
+    'format-version'='2',
+    'write.update.mode'='merge-on-read'
+  )
+AS SELECT * FROM parquet_file_view;
\ No newline at end of file
diff --git a/scripts/data_generators/generate_spark_local/lineitem_001_deletes/q01.sql b/scripts/data_generators/generate_spark_local/lineitem_001_deletes/q01.sql
new file mode 100644
index 0000000..2894b39
--- /dev/null
+++ b/scripts/data_generators/generate_spark_local/lineitem_001_deletes/q01.sql
@@ -0,0 +1,11 @@
+update iceberg_catalog.lineitem_001_deletes
+set l_orderkey=NULL,
+    l_partkey=NULL,
+    l_suppkey=NULL,
+    l_linenumber=NULL,
+    l_quantity=NULL,
+    l_extendedprice=NULL,
+    l_discount=NULL,
+    l_shipdate=NULL,
+    l_comment=NULL
+where l_partkey % 2 = 0;
\ No newline at end of file
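Each table under scripts/data_generators/ follows the same layout: setup.py materializes TPC-H lineitem data as a parquet file at the path given by PARQUET_SRC_FILE, and the q*.sql files then run in order against Spark to create the Iceberg table (q00) and apply merge-on-read updates or deletes (q01). A rough sketch of how a driver could consume one of these directories follows; the spark-sql invocation and the registration of parquet_file_view are assumptions about the surrounding harness, not part of this diff:

    import glob
    import os
    import subprocess
    import tempfile

    def generate_table(table_dir: str) -> None:
        # Hypothetical driver; the real harness lives elsewhere in scripts/data_generators.
        parquet_file = os.path.join(tempfile.mkdtemp(), "source.parquet")
        env = {**os.environ, "PARQUET_SRC_FILE": parquet_file}
        # setup.py writes the TPC-H lineitem data to PARQUET_SRC_FILE.
        subprocess.run(["python3", os.path.join(table_dir, "setup.py")], env=env, check=True)
        # q00.sql, q01.sql, ... run in order; parquet_file_view is assumed to be
        # registered over PARQUET_SRC_FILE by the harness before q00.sql executes.
        for sql_file in sorted(glob.glob(os.path.join(table_dir, "q*.sql"))):
            subprocess.run(["spark-sql", "-f", sql_file], env=env, check=True)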
diff --git a/scripts/data_generators/generate_spark_local/lineitem_001_deletes/setup.py b/scripts/data_generators/generate_spark_local/lineitem_001_deletes/setup.py
new file mode 100644
index 0000000..df29dff
--- /dev/null
+++ b/scripts/data_generators/generate_spark_local/lineitem_001_deletes/setup.py
@@ -0,0 +1,8 @@
+import duckdb
+import os
+
+PARQUET_SRC_FILE = os.getenv('PARQUET_SRC_FILE')
+
+duckdb_con = duckdb.connect()
+duckdb_con.execute("call dbgen(sf=0.01)")
+duckdb_con.execute(f"copy lineitem to '{PARQUET_SRC_FILE}' (FORMAT PARQUET)")
\ No newline at end of file
diff --git a/scripts/data_generators/generate_spark_local/lineitem_partitioned_l_shipmode/q00.sql b/scripts/data_generators/generate_spark_local/lineitem_partitioned_l_shipmode/q00.sql
new file mode 100644
index 0000000..58388c9
--- /dev/null
+++ b/scripts/data_generators/generate_spark_local/lineitem_partitioned_l_shipmode/q00.sql
@@ -0,0 +1,8 @@
+CREATE OR REPLACE TABLE iceberg_catalog.lineitem_partitioned_l_shipmode
+USING iceberg
+PARTITIONED BY (l_shipmode)
+TBLPROPERTIES (
+  'format-version'='2',
+  'write.update.mode'='merge-on-read'
+)
+as select * from parquet_file_view;
diff --git a/scripts/data_generators/generate_spark_local/lineitem_partitioned_l_shipmode/q01.sql b/scripts/data_generators/generate_spark_local/lineitem_partitioned_l_shipmode/q01.sql
new file mode 100644
index 0000000..8a5e90b
--- /dev/null
+++ b/scripts/data_generators/generate_spark_local/lineitem_partitioned_l_shipmode/q01.sql
@@ -0,0 +1 @@
+delete from iceberg_catalog.lineitem_partitioned_l_shipmode where l_shipmode = 'TRUCK';
\ No newline at end of file
diff --git a/scripts/data_generators/generate_spark_local/lineitem_partitioned_l_shipmode/setup.py b/scripts/data_generators/generate_spark_local/lineitem_partitioned_l_shipmode/setup.py
new file mode 100644
index 0000000..df29dff
--- /dev/null
+++ b/scripts/data_generators/generate_spark_local/lineitem_partitioned_l_shipmode/setup.py
@@ -0,0 +1,8 @@
+import duckdb
+import os
+
+PARQUET_SRC_FILE = os.getenv('PARQUET_SRC_FILE')
+
+duckdb_con = duckdb.connect()
+duckdb_con.execute("call dbgen(sf=0.01)")
+duckdb_con.execute(f"copy lineitem to '{PARQUET_SRC_FILE}' (FORMAT PARQUET)")
\ No newline at end of file
diff --git a/scripts/data_generators/generate_spark_local/lineitem_partitioned_l_shipmode_deletes/q00.sql b/scripts/data_generators/generate_spark_local/lineitem_partitioned_l_shipmode_deletes/q00.sql
new file mode 100644
index 0000000..dc7d4b0
--- /dev/null
+++ b/scripts/data_generators/generate_spark_local/lineitem_partitioned_l_shipmode_deletes/q00.sql
@@ -0,0 +1,8 @@
+CREATE OR REPLACE TABLE iceberg_catalog.lineitem_partitioned_l_shipmode_deletes
+USING iceberg
+PARTITIONED BY (l_shipmode)
+TBLPROPERTIES (
+  'format-version'='2',
+  'write.update.mode'='merge-on-read'
+)
+as select * from parquet_file_view;
diff --git a/scripts/data_generators/generate_spark_local/lineitem_partitioned_l_shipmode_deletes/q01.sql b/scripts/data_generators/generate_spark_local/lineitem_partitioned_l_shipmode_deletes/q01.sql
new file mode 100644
index 0000000..9d28acf
--- /dev/null
+++ b/scripts/data_generators/generate_spark_local/lineitem_partitioned_l_shipmode_deletes/q01.sql
@@ -0,0 +1,6 @@
+UPDATE iceberg_catalog.lineitem_partitioned_l_shipmode_deletes
+Set l_comment=NULL,
+    l_quantity=NULL,
+    l_discount=NULL,
+    l_linestatus=NULL
+where l_linenumber = 3 or l_linenumber = 4 or l_linenumber = 5;
\ No newline at end of file
diff --git a/scripts/data_generators/generate_spark_local/lineitem_partitioned_l_shipmode_deletes/setup.py b/scripts/data_generators/generate_spark_local/lineitem_partitioned_l_shipmode_deletes/setup.py
new file mode 100644
index 0000000..df29dff
--- /dev/null
+++ b/scripts/data_generators/generate_spark_local/lineitem_partitioned_l_shipmode_deletes/setup.py
@@ -0,0 +1,8 @@
+import duckdb
+import os
+
+PARQUET_SRC_FILE = os.getenv('PARQUET_SRC_FILE')
+
+duckdb_con = duckdb.connect()
+duckdb_con.execute("call dbgen(sf=0.01)")
+duckdb_con.execute(f"copy lineitem to '{PARQUET_SRC_FILE}' (FORMAT PARQUET)")
\ No newline at end of file
diff --git a/scripts/data_generators/generate_spark_local/lineitem_sf1_deletes/q00.sql b/scripts/data_generators/generate_spark_local/lineitem_sf1_deletes/q00.sql
new file mode 100644
index 0000000..d9eae47
--- /dev/null
+++ b/scripts/data_generators/generate_spark_local/lineitem_sf1_deletes/q00.sql
@@ -0,0 +1,6 @@
+CREATE or REPLACE TABLE iceberg_catalog.lineitem_sf1_deletes
+  TBLPROPERTIES (
+    'format-version'='2',
+    'write.update.mode'='merge-on-read'
+  )
+AS SELECT * FROM parquet_file_view;
\ No newline at end of file
diff --git a/scripts/data_generators/generate_spark_local/lineitem_sf1_deletes/q01.sql b/scripts/data_generators/generate_spark_local/lineitem_sf1_deletes/q01.sql
new file mode 100644
index 0000000..6b28e70
--- /dev/null
+++ b/scripts/data_generators/generate_spark_local/lineitem_sf1_deletes/q01.sql
@@ -0,0 +1,11 @@
+update iceberg_catalog.lineitem_sf1_deletes
+set l_orderkey=NULL,
+    l_partkey=NULL,
+    l_suppkey=NULL,
+    l_linenumber=NULL,
+    l_quantity=NULL,
+    l_extendedprice=NULL,
+    l_discount=NULL,
+    l_shipdate=NULL,
+    l_comment=NULL
+where l_partkey % 2 = 0;
\ No newline at end of file
diff --git a/scripts/data_generators/generate_spark_local/lineitem_sf1_deletes/setup.py b/scripts/data_generators/generate_spark_local/lineitem_sf1_deletes/setup.py
new file mode 100644
index 0000000..6fc4355
--- /dev/null
+++ b/scripts/data_generators/generate_spark_local/lineitem_sf1_deletes/setup.py
@@ -0,0 +1,8 @@
+import duckdb
+import os
+
+PARQUET_SRC_FILE = os.getenv('PARQUET_SRC_FILE')
+
+duckdb_con = duckdb.connect()
+duckdb_con.execute("call dbgen(sf=1)")
+duckdb_con.execute(f"copy lineitem to '{PARQUET_SRC_FILE}' (FORMAT PARQUET)")
\ No newline at end of file
diff --git a/scripts/data_generators/generate_spark_local/lineitem_sf_01_1_delete/q00.sql b/scripts/data_generators/generate_spark_local/lineitem_sf_01_1_delete/q00.sql
new file mode 100644
index 0000000..96c1d14
--- /dev/null
+++ b/scripts/data_generators/generate_spark_local/lineitem_sf_01_1_delete/q00.sql
@@ -0,0 +1,6 @@
+CREATE or REPLACE TABLE iceberg_catalog.lineitem_sf_01_1_delete
+  TBLPROPERTIES (
+    'format-version'='2',
+    'write.update.mode'='merge-on-read'
+  )
+AS SELECT * FROM parquet_file_view;
\ No newline at end of file
diff --git a/scripts/data_generators/generate_spark_local/lineitem_sf_01_1_delete/q01.sql b/scripts/data_generators/generate_spark_local/lineitem_sf_01_1_delete/q01.sql
new file mode 100644
index 0000000..ecb6a3f
--- /dev/null
+++ b/scripts/data_generators/generate_spark_local/lineitem_sf_01_1_delete/q01.sql
@@ -0,0 +1 @@
+delete from iceberg_catalog.lineitem_sf_01_1_delete where l_orderkey=10053 and l_partkey = 77;
\ No newline at end of file
diff --git a/scripts/data_generators/generate_spark_local/lineitem_sf_01_1_delete/setup.py b/scripts/data_generators/generate_spark_local/lineitem_sf_01_1_delete/setup.py
new file mode 100644
index 0000000..df29dff
--- /dev/null
+++ b/scripts/data_generators/generate_spark_local/lineitem_sf_01_1_delete/setup.py
@@ -0,0 +1,8 @@
+import duckdb
+import os
+
+PARQUET_SRC_FILE = os.getenv('PARQUET_SRC_FILE')
+
+duckdb_con = duckdb.connect()
+duckdb_con.execute("call dbgen(sf=0.01)")
+duckdb_con.execute(f"copy lineitem to '{PARQUET_SRC_FILE}' (FORMAT PARQUET)") \ No newline at end of file diff --git a/scripts/data_generators/generate_spark_local/lineitem_sf_01_no_deletes/q00.sql b/scripts/data_generators/generate_spark_local/lineitem_sf_01_no_deletes/q00.sql new file mode 100644 index 0000000..9e02481 --- /dev/null +++ b/scripts/data_generators/generate_spark_local/lineitem_sf_01_no_deletes/q00.sql @@ -0,0 +1,6 @@ +CREATE or REPLACE TABLE iceberg_catalog.lineitem_sf_01_no_deletes + TBLPROPERTIES ( + 'format-version'='2', + 'write.update.mode'='merge-on-read' + ) +AS SELECT * FROM parquet_file_view; \ No newline at end of file diff --git a/scripts/data_generators/generate_spark_local/lineitem_sf_01_no_deletes/setup.py b/scripts/data_generators/generate_spark_local/lineitem_sf_01_no_deletes/setup.py new file mode 100644 index 0000000..df29dff --- /dev/null +++ b/scripts/data_generators/generate_spark_local/lineitem_sf_01_no_deletes/setup.py @@ -0,0 +1,8 @@ +import duckdb +import os + +PARQUET_SRC_FILE = os.getenv('PARQUET_SRC_FILE') + +duckdb_con = duckdb.connect() +duckdb_con.execute("call dbgen(sf=0.01)") +duckdb_con.execute(f"copy lineitem to '{PARQUET_SRC_FILE}' (FORMAT PARQUET)") \ No newline at end of file diff --git a/scripts/data_generators/generate_spark_local/table_with_deletes/q00.sql b/scripts/data_generators/generate_spark_local/table_with_deletes/q00.sql new file mode 100644 index 0000000..bfb84c2 --- /dev/null +++ b/scripts/data_generators/generate_spark_local/table_with_deletes/q00.sql @@ -0,0 +1,6 @@ +CREATE or REPLACE TABLE iceberg_catalog.table_with_deletes + TBLPROPERTIES ( + 'format-version'='2', + 'write.update.mode'='merge-on-read' + ) +AS SELECT * FROM parquet_file_view; \ No newline at end of file diff --git a/scripts/data_generators/generate_spark_local/table_with_deletes/q01.sql b/scripts/data_generators/generate_spark_local/table_with_deletes/q01.sql new file mode 100644 index 0000000..6e73313 --- /dev/null +++ b/scripts/data_generators/generate_spark_local/table_with_deletes/q01.sql @@ -0,0 +1,11 @@ +update iceberg_catalog.table_with_deletes +set l_orderkey=NULL, + l_partkey=NULL, + l_suppkey=NULL, + l_linenumber=NULL, + l_quantity=NULL, + l_extendedprice=NULL, + l_discount=NULL, + l_shipdate=NULL, + l_comment=NULL +where l_partkey % 2 = 0; \ No newline at end of file diff --git a/scripts/data_generators/generate_spark_local/table_with_deletes/setup.py b/scripts/data_generators/generate_spark_local/table_with_deletes/setup.py new file mode 100644 index 0000000..df29dff --- /dev/null +++ b/scripts/data_generators/generate_spark_local/table_with_deletes/setup.py @@ -0,0 +1,8 @@ +import duckdb +import os + +PARQUET_SRC_FILE = os.getenv('PARQUET_SRC_FILE') + +duckdb_con = duckdb.connect() +duckdb_con.execute("call dbgen(sf=0.01)") +duckdb_con.execute(f"copy lineitem to '{PARQUET_SRC_FILE}' (FORMAT PARQUET)") \ No newline at end of file diff --git a/scripts/data_generators/generate_spark_rest/lineitem_001_deletes/q00.sql b/scripts/data_generators/generate_spark_rest/lineitem_001_deletes/q00.sql new file mode 100644 index 0000000..c581d86 --- /dev/null +++ b/scripts/data_generators/generate_spark_rest/lineitem_001_deletes/q00.sql @@ -0,0 +1,6 @@ +CREATE or REPLACE TABLE default.lineitem_001_deletes + TBLPROPERTIES ( + 'format-version'='2', + 'write.update.mode'='merge-on-read' + ) +AS SELECT * FROM parquet_file_view; \ No newline at end of file diff --git a/scripts/data_generators/generate_spark_rest/lineitem_001_deletes/q01.sql 
diff --git a/scripts/data_generators/generate_spark_rest/lineitem_001_deletes/q01.sql b/scripts/data_generators/generate_spark_rest/lineitem_001_deletes/q01.sql
new file mode 100644
index 0000000..472849d
--- /dev/null
+++ b/scripts/data_generators/generate_spark_rest/lineitem_001_deletes/q01.sql
@@ -0,0 +1,11 @@
+update default.lineitem_001_deletes
+set l_orderkey=NULL,
+    l_partkey=NULL,
+    l_suppkey=NULL,
+    l_linenumber=NULL,
+    l_quantity=NULL,
+    l_extendedprice=NULL,
+    l_discount=NULL,
+    l_shipdate=NULL,
+    l_comment=NULL
+where l_partkey % 2 = 0;
\ No newline at end of file
diff --git a/scripts/data_generators/generate_spark_rest/lineitem_001_deletes/setup.py b/scripts/data_generators/generate_spark_rest/lineitem_001_deletes/setup.py
new file mode 100644
index 0000000..df29dff
--- /dev/null
+++ b/scripts/data_generators/generate_spark_rest/lineitem_001_deletes/setup.py
@@ -0,0 +1,8 @@
+import duckdb
+import os
+
+PARQUET_SRC_FILE = os.getenv('PARQUET_SRC_FILE')
+
+duckdb_con = duckdb.connect()
+duckdb_con.execute("call dbgen(sf=0.01)")
+duckdb_con.execute(f"copy lineitem to '{PARQUET_SRC_FILE}' (FORMAT PARQUET)")
\ No newline at end of file
diff --git a/scripts/data_generators/generate_spark_rest/lineitem_partitioned_l_shipmode/q00.sql b/scripts/data_generators/generate_spark_rest/lineitem_partitioned_l_shipmode/q00.sql
new file mode 100644
index 0000000..fa22eb3
--- /dev/null
+++ b/scripts/data_generators/generate_spark_rest/lineitem_partitioned_l_shipmode/q00.sql
@@ -0,0 +1,8 @@
+CREATE OR REPLACE TABLE default.lineitem_partitioned_l_shipmode
+USING iceberg
+PARTITIONED BY (l_shipmode)
+TBLPROPERTIES (
+  'format-version'='2',
+  'write.update.mode'='merge-on-read'
+)
+as select * from parquet_file_view;
diff --git a/scripts/data_generators/generate_spark_rest/lineitem_partitioned_l_shipmode/q01.sql b/scripts/data_generators/generate_spark_rest/lineitem_partitioned_l_shipmode/q01.sql
new file mode 100644
index 0000000..c7047b5
--- /dev/null
+++ b/scripts/data_generators/generate_spark_rest/lineitem_partitioned_l_shipmode/q01.sql
@@ -0,0 +1 @@
+delete from default.lineitem_partitioned_l_shipmode where l_shipmode = 'TRUCK';
\ No newline at end of file
diff --git a/scripts/data_generators/generate_spark_rest/lineitem_partitioned_l_shipmode/setup.py b/scripts/data_generators/generate_spark_rest/lineitem_partitioned_l_shipmode/setup.py
new file mode 100644
index 0000000..df29dff
--- /dev/null
+++ b/scripts/data_generators/generate_spark_rest/lineitem_partitioned_l_shipmode/setup.py
@@ -0,0 +1,8 @@
+import duckdb
+import os
+
+PARQUET_SRC_FILE = os.getenv('PARQUET_SRC_FILE')
+
+duckdb_con = duckdb.connect()
+duckdb_con.execute("call dbgen(sf=0.01)")
+duckdb_con.execute(f"copy lineitem to '{PARQUET_SRC_FILE}' (FORMAT PARQUET)")
\ No newline at end of file
diff --git a/scripts/data_generators/generate_spark_rest/lineitem_partitioned_l_shipmode_deletes/q00.sql b/scripts/data_generators/generate_spark_rest/lineitem_partitioned_l_shipmode_deletes/q00.sql
new file mode 100644
index 0000000..e1efd19
--- /dev/null
+++ b/scripts/data_generators/generate_spark_rest/lineitem_partitioned_l_shipmode_deletes/q00.sql
@@ -0,0 +1,8 @@
+CREATE OR REPLACE TABLE default.lineitem_partitioned_l_shipmode_deletes
+USING iceberg
+PARTITIONED BY (l_shipmode)
+TBLPROPERTIES (
+  'format-version'='2',
+  'write.update.mode'='merge-on-read'
+)
+as select * from parquet_file_view;
diff --git a/scripts/data_generators/generate_spark_rest/lineitem_partitioned_l_shipmode_deletes/q01.sql b/scripts/data_generators/generate_spark_rest/lineitem_partitioned_l_shipmode_deletes/q01.sql
new file mode 100644
index 0000000..8116388
--- /dev/null
+++ b/scripts/data_generators/generate_spark_rest/lineitem_partitioned_l_shipmode_deletes/q01.sql
@@ -0,0 +1,6 @@
+UPDATE default.lineitem_partitioned_l_shipmode_deletes
+Set l_comment=NULL,
+    l_quantity=NULL,
+    l_discount=NULL,
+    l_linestatus=NULL
+where l_linenumber = 3 or l_linenumber = 4 or l_linenumber = 5;
\ No newline at end of file
diff --git a/scripts/data_generators/generate_spark_rest/lineitem_partitioned_l_shipmode_deletes/setup.py b/scripts/data_generators/generate_spark_rest/lineitem_partitioned_l_shipmode_deletes/setup.py
new file mode 100644
index 0000000..df29dff
--- /dev/null
+++ b/scripts/data_generators/generate_spark_rest/lineitem_partitioned_l_shipmode_deletes/setup.py
@@ -0,0 +1,8 @@
+import duckdb
+import os
+
+PARQUET_SRC_FILE = os.getenv('PARQUET_SRC_FILE')
+
+duckdb_con = duckdb.connect()
+duckdb_con.execute("call dbgen(sf=0.01)")
+duckdb_con.execute(f"copy lineitem to '{PARQUET_SRC_FILE}' (FORMAT PARQUET)")
\ No newline at end of file
diff --git a/scripts/data_generators/generate_spark_rest/lineitem_sf1_deletes/q00.sql b/scripts/data_generators/generate_spark_rest/lineitem_sf1_deletes/q00.sql
new file mode 100644
index 0000000..4112dad
--- /dev/null
+++ b/scripts/data_generators/generate_spark_rest/lineitem_sf1_deletes/q00.sql
@@ -0,0 +1,6 @@
+CREATE or REPLACE TABLE default.lineitem_sf1_deletes
+  TBLPROPERTIES (
+    'format-version'='2',
+    'write.update.mode'='merge-on-read'
+  )
+AS SELECT * FROM parquet_file_view;
\ No newline at end of file
diff --git a/scripts/data_generators/generate_spark_rest/lineitem_sf1_deletes/q01.sql b/scripts/data_generators/generate_spark_rest/lineitem_sf1_deletes/q01.sql
new file mode 100644
index 0000000..4f789bd
--- /dev/null
+++ b/scripts/data_generators/generate_spark_rest/lineitem_sf1_deletes/q01.sql
@@ -0,0 +1,11 @@
+update default.lineitem_sf1_deletes
+set l_orderkey=NULL,
+    l_partkey=NULL,
+    l_suppkey=NULL,
+    l_linenumber=NULL,
+    l_quantity=NULL,
+    l_extendedprice=NULL,
+    l_discount=NULL,
+    l_shipdate=NULL,
+    l_comment=NULL
+where l_partkey % 2 = 0;
\ No newline at end of file
diff --git a/scripts/data_generators/generate_spark_rest/lineitem_sf1_deletes/setup.py b/scripts/data_generators/generate_spark_rest/lineitem_sf1_deletes/setup.py
new file mode 100644
index 0000000..6fc4355
--- /dev/null
+++ b/scripts/data_generators/generate_spark_rest/lineitem_sf1_deletes/setup.py
@@ -0,0 +1,8 @@
+import duckdb
+import os
+
+PARQUET_SRC_FILE = os.getenv('PARQUET_SRC_FILE')
+
+duckdb_con = duckdb.connect()
+duckdb_con.execute("call dbgen(sf=1)")
+duckdb_con.execute(f"copy lineitem to '{PARQUET_SRC_FILE}' (FORMAT PARQUET)")
\ No newline at end of file
diff --git a/scripts/data_generators/generate_spark_rest/lineitem_sf_01_1_delete/q00.sql b/scripts/data_generators/generate_spark_rest/lineitem_sf_01_1_delete/q00.sql
new file mode 100644
index 0000000..7893470
--- /dev/null
+++ b/scripts/data_generators/generate_spark_rest/lineitem_sf_01_1_delete/q00.sql
@@ -0,0 +1,6 @@
+CREATE or REPLACE TABLE default.lineitem_sf_01_1_delete
+  TBLPROPERTIES (
+    'format-version'='2',
+    'write.update.mode'='merge-on-read'
+  )
+AS SELECT * FROM parquet_file_view;
\ No newline at end of file
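The generate_spark_rest generators mirror the spark-local set, writing into the REST catalog's `default` namespace instead of `iceberg_catalog`. Because every table sets 'write.update.mode'='merge-on-read', the q01 statements produce positional delete files alongside the original data files rather than rewriting them, which is exactly the scanner code path the new tests exercise. A minimal sanity check from DuckDB, sketched under the assumption that the iceberg extension is installed and the generated spark-local data exists at this path:

    import duckdb

    con = duckdb.connect()
    con.execute("LOAD iceberg")  # assumes the extension is already installed
    # The row removed by q01 (l_orderkey=10053, l_partkey=77) should be masked
    # by the positional delete file that merge-on-read produced.
    count = con.execute("""
        SELECT count(*)
        FROM iceberg_scan('data/generated/iceberg/spark-local/lineitem_sf_01_1_delete')
        WHERE l_orderkey = 10053 AND l_partkey = 77
    """).fetchone()[0]
    assert count == 0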
diff --git a/scripts/data_generators/generate_spark_rest/lineitem_sf_01_1_delete/q01.sql b/scripts/data_generators/generate_spark_rest/lineitem_sf_01_1_delete/q01.sql
new file mode 100644
index 0000000..8fcc58e
--- /dev/null
+++ b/scripts/data_generators/generate_spark_rest/lineitem_sf_01_1_delete/q01.sql
@@ -0,0 +1 @@
+delete from default.lineitem_sf_01_1_delete where l_orderkey=10053 and l_partkey = 77;
\ No newline at end of file
diff --git a/scripts/data_generators/generate_spark_rest/lineitem_sf_01_1_delete/setup.py b/scripts/data_generators/generate_spark_rest/lineitem_sf_01_1_delete/setup.py
new file mode 100644
index 0000000..df29dff
--- /dev/null
+++ b/scripts/data_generators/generate_spark_rest/lineitem_sf_01_1_delete/setup.py
@@ -0,0 +1,8 @@
+import duckdb
+import os
+
+PARQUET_SRC_FILE = os.getenv('PARQUET_SRC_FILE')
+
+duckdb_con = duckdb.connect()
+duckdb_con.execute("call dbgen(sf=0.01)")
+duckdb_con.execute(f"copy lineitem to '{PARQUET_SRC_FILE}' (FORMAT PARQUET)")
\ No newline at end of file
diff --git a/scripts/data_generators/generate_spark_rest/lineitem_sf_01_no_deletes/q00.sql b/scripts/data_generators/generate_spark_rest/lineitem_sf_01_no_deletes/q00.sql
new file mode 100644
index 0000000..b21fa20
--- /dev/null
+++ b/scripts/data_generators/generate_spark_rest/lineitem_sf_01_no_deletes/q00.sql
@@ -0,0 +1,6 @@
+CREATE or REPLACE TABLE default.lineitem_sf_01_no_deletes
+  TBLPROPERTIES (
+    'format-version'='2',
+    'write.update.mode'='merge-on-read'
+  )
+AS SELECT * FROM parquet_file_view;
\ No newline at end of file
diff --git a/scripts/data_generators/generate_spark_rest/lineitem_sf_01_no_deletes/setup.py b/scripts/data_generators/generate_spark_rest/lineitem_sf_01_no_deletes/setup.py
new file mode 100644
index 0000000..df29dff
--- /dev/null
+++ b/scripts/data_generators/generate_spark_rest/lineitem_sf_01_no_deletes/setup.py
@@ -0,0 +1,8 @@
+import duckdb
+import os
+
+PARQUET_SRC_FILE = os.getenv('PARQUET_SRC_FILE')
+
+duckdb_con = duckdb.connect()
+duckdb_con.execute("call dbgen(sf=0.01)")
+duckdb_con.execute(f"copy lineitem to '{PARQUET_SRC_FILE}' (FORMAT PARQUET)")
\ No newline at end of file
diff --git a/test/sql/local/iceberg_scans/iceberg_metadata.test b/test/sql/local/iceberg_scans/iceberg_metadata.test
new file mode 100644
index 0000000..182c7c0
--- /dev/null
+++ b/test/sql/local/iceberg_scans/iceberg_metadata.test
@@ -0,0 +1,98 @@
+# name: test/sql/local/iceberg_scans/iceberg_metadata.test
+# description: test iceberg metadata function
+# group: [iceberg]
+
+# Before we load the extension, this will fail
+statement error
+SELECT * FROM ICEBERG_METADATA('data/persistent/iceberg/lineitem_iceberg');
+----
+Catalog Error
+
+require parquet
+
+require iceberg
+
+query IIIIIIII
+SELECT * FROM ICEBERG_METADATA('data/persistent/iceberg/lineitem_iceberg', ALLOW_MOVED_PATHS=TRUE);
+----
+lineitem_iceberg/metadata/10eaca8a-1e1c-421e-ad6d-b232e5ee23d3-m1.avro 2 DATA ADDED EXISTING lineitem_iceberg/data/00041-414-f3c73457-bbd6-4b92-9c15-17b241171b16-00001.parquet PARQUET 51793
+lineitem_iceberg/metadata/10eaca8a-1e1c-421e-ad6d-b232e5ee23d3-m0.avro 2 DATA DELETED EXISTING lineitem_iceberg/data/00000-411-0792dcfe-4e25-4ca3-8ada-175286069a47-00001.parquet PARQUET 60175
+
+query IIIIIIII
+SELECT * FROM ICEBERG_METADATA('data/persistent/iceberg/lineitem_iceberg', ALLOW_MOVED_PATHS=TRUE, version='1');
+----
+lineitem_iceberg/metadata/cf3d0be5-cf70-453d-ad8f-48fdc412e608-m0.avro 1 DATA ADDED EXISTING lineitem_iceberg/data/00000-411-0792dcfe-4e25-4ca3-8ada-175286069a47-00001.parquet PARQUET 60175
+
+query IIIIIIII
+SELECT * FROM ICEBERG_METADATA('data/persistent/iceberg/lineitem_iceberg', ALLOW_MOVED_PATHS=TRUE, version_name_format='v%s%s.metadata.json');
+----
+lineitem_iceberg/metadata/10eaca8a-1e1c-421e-ad6d-b232e5ee23d3-m1.avro 2 DATA ADDED EXISTING lineitem_iceberg/data/00041-414-f3c73457-bbd6-4b92-9c15-17b241171b16-00001.parquet PARQUET 51793
+lineitem_iceberg/metadata/10eaca8a-1e1c-421e-ad6d-b232e5ee23d3-m0.avro 2 DATA DELETED EXISTING lineitem_iceberg/data/00000-411-0792dcfe-4e25-4ca3-8ada-175286069a47-00001.parquet PARQUET 60175
+
+query IIIIIIII
+SELECT * FROM ICEBERG_METADATA('data/persistent/iceberg/lineitem_iceberg', ALLOW_MOVED_PATHS=TRUE, version='2', version_name_format='v%s%s.metadata.json');
+----
+lineitem_iceberg/metadata/10eaca8a-1e1c-421e-ad6d-b232e5ee23d3-m1.avro 2 DATA ADDED EXISTING lineitem_iceberg/data/00041-414-f3c73457-bbd6-4b92-9c15-17b241171b16-00001.parquet PARQUET 51793
+lineitem_iceberg/metadata/10eaca8a-1e1c-421e-ad6d-b232e5ee23d3-m0.avro 2 DATA DELETED EXISTING lineitem_iceberg/data/00000-411-0792dcfe-4e25-4ca3-8ada-175286069a47-00001.parquet PARQUET 60175
+
+statement error
+SELECT * FROM ICEBERG_METADATA('data/persistent/iceberg/lineitem_iceberg_gz', ALLOW_MOVED_PATHS=TRUE);
+----
+IO Error: Iceberg metadata file not found for table version '2' using 'none' compression and format(s): 'v%s%s.metadata.json,%s%s.metadata.json'
+
+statement error
+SELECT * FROM ICEBERG_METADATA('data/persistent/iceberg/lineitem_iceberg_gz', ALLOW_MOVED_PATHS=TRUE, METADATA_COMPRESSION_CODEC="blarg", version_name_format='blat%s%s');
+----
+IO Error: Iceberg metadata file not found for table version '2' using 'blarg' compression and format(s): 'blat%s%s'
+
+query IIIIIIII
+SELECT * FROM ICEBERG_METADATA('data/persistent/iceberg/lineitem_iceberg_gz', ALLOW_MOVED_PATHS=TRUE, METADATA_COMPRESSION_CODEC="gzip");
+----
+lineitem_iceberg_gz/metadata/23f9dbea-1e7f-4694-a82c-dc3c9a94953e-m0.avro 0 DATA ADDED EXISTING lineitem_iceberg_gz/data/00000-2-371a340c-ded5-4e85-aa49-9c788d6f21cd-00001.parquet PARQUET 111968
+
+statement error
+SELECT * FROM ICEBERG_METADATA('data/persistent/iceberg/lineitem_iceberg_nonexistent');
+----
+IO Error: Failed to read iceberg table. No version was provided and no version-hint could be found,
+
+statement error
+SELECT * FROM ICEBERG_METADATA('data/persistent/iceberg/lineitem_iceberg_no_hint', ALLOW_MOVED_PATHS=TRUE);
+----
+:.*SET unsafe_enable_version_guessing.*
+
+statement ok
+SET unsafe_enable_version_guessing = true;
+
+query IIIIIIII
+SELECT * FROM ICEBERG_METADATA('data/persistent/iceberg/lineitem_iceberg_no_hint', ALLOW_MOVED_PATHS=TRUE);
+----
+lineitem_iceberg/metadata/10eaca8a-1e1c-421e-ad6d-b232e5ee23d3-m1.avro 2 DATA ADDED EXISTING lineitem_iceberg/data/00041-414-f3c73457-bbd6-4b92-9c15-17b241171b16-00001.parquet PARQUET 51793
+lineitem_iceberg/metadata/10eaca8a-1e1c-421e-ad6d-b232e5ee23d3-m0.avro 2 DATA DELETED EXISTING lineitem_iceberg/data/00000-411-0792dcfe-4e25-4ca3-8ada-175286069a47-00001.parquet PARQUET 60175
+
+query IIIIIIII
+SELECT * FROM ICEBERG_METADATA('data/persistent/iceberg/lineitem_iceberg_no_hint', ALLOW_MOVED_PATHS=TRUE, version='1');
+----
+lineitem_iceberg/metadata/cf3d0be5-cf70-453d-ad8f-48fdc412e608-m0.avro 1 DATA ADDED EXISTING lineitem_iceberg/data/00000-411-0792dcfe-4e25-4ca3-8ada-175286069a47-00001.parquet PARQUET 60175
+
+query IIIIIIII
+SELECT * FROM ICEBERG_METADATA('data/persistent/iceberg/lineitem_iceberg_no_hint', ALLOW_MOVED_PATHS=TRUE, version_name_format='v%s%s.metadata.json');
+----
+lineitem_iceberg/metadata/10eaca8a-1e1c-421e-ad6d-b232e5ee23d3-m1.avro 2 DATA ADDED EXISTING lineitem_iceberg/data/00041-414-f3c73457-bbd6-4b92-9c15-17b241171b16-00001.parquet PARQUET 51793
+lineitem_iceberg/metadata/10eaca8a-1e1c-421e-ad6d-b232e5ee23d3-m0.avro 2 DATA DELETED EXISTING lineitem_iceberg/data/00000-411-0792dcfe-4e25-4ca3-8ada-175286069a47-00001.parquet PARQUET 60175
+
+query IIIIIIII
+SELECT * FROM ICEBERG_METADATA('data/persistent/iceberg/lineitem_iceberg_no_hint', ALLOW_MOVED_PATHS=TRUE, version='?', version_name_format='v%s%s.metadata.json');
+----
+lineitem_iceberg/metadata/10eaca8a-1e1c-421e-ad6d-b232e5ee23d3-m1.avro 2 DATA ADDED EXISTING lineitem_iceberg/data/00041-414-f3c73457-bbd6-4b92-9c15-17b241171b16-00001.parquet PARQUET 51793
+lineitem_iceberg/metadata/10eaca8a-1e1c-421e-ad6d-b232e5ee23d3-m0.avro 2 DATA DELETED EXISTING lineitem_iceberg/data/00000-411-0792dcfe-4e25-4ca3-8ada-175286069a47-00001.parquet PARQUET 60175
+
+query IIIIIIII
+SELECT * FROM ICEBERG_METADATA('data/persistent/iceberg/lineitem_iceberg_gz_no_hint', ALLOW_MOVED_PATHS=TRUE, METADATA_COMPRESSION_CODEC='gzip', version='?');
+----
+lineitem_iceberg_gz/metadata/23f9dbea-1e7f-4694-a82c-dc3c9a94953e-m0.avro 0 DATA ADDED EXISTING lineitem_iceberg_gz/data/00000-2-371a340c-ded5-4e85-aa49-9c788d6f21cd-00001.parquet PARQUET 111968
+
+query IIIIIIII
+SELECT * FROM ICEBERG_METADATA('data/persistent/iceberg/lineitem_iceberg_gz_no_hint', ALLOW_MOVED_PATHS=TRUE, METADATA_COMPRESSION_CODEC='gzip');
+----
+lineitem_iceberg_gz/metadata/23f9dbea-1e7f-4694-a82c-dc3c9a94953e-m0.avro 0 DATA ADDED EXISTING lineitem_iceberg_gz/data/00000-2-371a340c-ded5-4e85-aa49-9c788d6f21cd-00001.parquet PARQUET 111968
+
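The *_no_hint cases above exercise tables whose metadata directory lacks a version-hint file, so DuckDB has to guess the newest metadata version; that is why they are gated behind a deliberately scary-sounding setting. A condensed sketch of the same flow through the Python client, assuming the persistent test data is present at the paths used in the tests:

    import duckdb

    con = duckdb.connect()
    con.execute("LOAD iceberg")  # assumes the extension is installed
    # Without this setting, scanning a hint-less table raises an IO Error that
    # points at unsafe_enable_version_guessing.
    con.execute("SET unsafe_enable_version_guessing = true")
    manifests = con.execute("""
        SELECT manifest_path
        FROM iceberg_metadata('data/persistent/iceberg/lineitem_iceberg_no_hint',
                              allow_moved_paths = true)
    """).fetchall()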
diff --git a/test/sql/local/iceberg_scans/iceberg_scan.test b/test/sql/local/iceberg_scans/iceberg_scan.test
new file mode 100644
index 0000000..f39c658
--- /dev/null
+++ b/test/sql/local/iceberg_scans/iceberg_scan.test
@@ -0,0 +1,181 @@
+# name: test/sql/local/iceberg_scans/iceberg_scan.test
+# description: test iceberg extension
+# group: [iceberg]
+
+# Before we load the extension, this will fail
+statement error
+SELECT * FROM ICEBERG_SCAN('data/iceberg/lineitem_iceberg');
+----
+Catalog Error
+
+require parquet
+
+require iceberg
+
+### Scanning latest snapshot
+query I
+SELECT count(*) FROM ICEBERG_SCAN('data/persistent/iceberg/lineitem_iceberg', ALLOW_MOVED_PATHS=TRUE);
+----
+51793
+
+# Scanning 1st snapshot
+query I
+SELECT count(*) FROM ICEBERG_SCAN('data/persistent/iceberg/lineitem_iceberg', snapshot_from_id=3776207205136740581, ALLOW_MOVED_PATHS=TRUE);
+----
+60175
+
+# Scanning 2nd snapshot
+query I
+SELECT count(*) FROM ICEBERG_SCAN('data/persistent/iceberg/lineitem_iceberg', snapshot_from_id=7635660646343998149, ALLOW_MOVED_PATHS=TRUE);
+----
+51793
+
+# Scanning 2nd snapshot by table version
+query I
+SELECT count(*) FROM ICEBERG_SCAN('data/persistent/iceberg/lineitem_iceberg', version='2', ALLOW_MOVED_PATHS=TRUE);
+----
+51793
+
+# Scanning latest snapshot at specific moment in time
+# note in the data we have:
+# 1 = 2023-02-15 15:07:54.504
+# 2 = 2023-02-15 15:08:14.73
+query I
+SELECT count(*) FROM ICEBERG_SCAN('data/persistent/iceberg/lineitem_iceberg', snapshot_from_timestamp='2023-02-15 15:07:54.504'::TIMESTAMP, ALLOW_MOVED_PATHS=TRUE);
+----
+60175
+
+query I
+SELECT count(*) FROM ICEBERG_SCAN('data/persistent/iceberg/lineitem_iceberg', snapshot_from_timestamp='2023-02-15 15:07:54.729'::TIMESTAMP, ALLOW_MOVED_PATHS=TRUE);
+----
+60175
+
+query I
+SELECT count(*) FROM ICEBERG_SCAN('data/persistent/iceberg/lineitem_iceberg', snapshot_from_timestamp='2023-02-15 15:08:14.73'::TIMESTAMP, ALLOW_MOVED_PATHS=TRUE);
+----
+51793
+
+statement error
+FROM ICEBERG_SCAN('data/persistent/iceberg/lineitem_iceberg', snapshot_from_timestamp='2023-02-15 15:07:54.503'::TIMESTAMP, ALLOW_MOVED_PATHS=TRUE);
+----
+IO Error: Could not find latest snapshots for timestamp 2023-02-15 15:07:54.503
+
+statement error
+SELECT * FROM ICEBERG_SCAN('data/persistent/iceberg/lineitem_iceberg_gz', ALLOW_MOVED_PATHS=TRUE);
+----
+IO Error: Iceberg metadata file not found for table version '2' using 'none' compression and format(s): 'v%s%s.metadata.json,%s%s.metadata.json'
+
+query I
+SELECT count(*) FROM ICEBERG_SCAN('data/persistent/iceberg/lineitem_iceberg_gz', ALLOW_MOVED_PATHS=TRUE, METADATA_COMPRESSION_CODEC="gzip");
+----
+111968
+
+statement error
+SELECT count(*) FROM ICEBERG_SCAN('data/persistent/iceberg/lineitem_iceberg_gz', ALLOW_MOVED_PATHS=TRUE, METADATA_COMPRESSION_CODEC="gzip", version='1');
+----
+IO Error: No snapshots found
+
+query I
+SELECT count(*) FROM ICEBERG_SCAN('data/persistent/iceberg/lineitem_iceberg_gz', ALLOW_MOVED_PATHS=TRUE, METADATA_COMPRESSION_CODEC="gzip", version='2', version_name_format='v%s%s.metadata.json');
+----
+111968
+
+statement error
+SELECT count(*) FROM ICEBERG_SCAN('data/persistent/iceberg/lineitem_iceberg_no_hint', ALLOW_MOVED_PATHS=TRUE);
+----
+:.*SET unsafe_enable_version_guessing.*
+
+statement ok
+SET unsafe_enable_version_guessing=true;
+
+query I
+SELECT count(*) FROM ICEBERG_SCAN('data/persistent/iceberg/lineitem_iceberg_no_hint', snapshot_from_timestamp='2023-02-15 15:07:54.504'::TIMESTAMP, ALLOW_MOVED_PATHS=TRUE);
+----
+60175
+
+query I
+SELECT count(*) FROM ICEBERG_SCAN('data/persistent/iceberg/lineitem_iceberg_no_hint', snapshot_from_timestamp='2023-02-15 15:07:54.729'::TIMESTAMP, ALLOW_MOVED_PATHS=TRUE);
+----
+60175
+
+query I
+SELECT count(*) FROM ICEBERG_SCAN('data/persistent/iceberg/lineitem_iceberg_no_hint', snapshot_from_timestamp='2023-02-15 15:08:14.73'::TIMESTAMP, ALLOW_MOVED_PATHS=TRUE);
+----
+51793
+
+statement error
+FROM ICEBERG_SCAN('data/persistent/iceberg/lineitem_iceberg_no_hint', snapshot_from_timestamp='2023-02-15 15:07:54.503'::TIMESTAMP, ALLOW_MOVED_PATHS=TRUE);
+----
+IO Error: Could not find latest snapshots for timestamp 2023-02-15 15:07:54.503
+
+query I
+SELECT count(*) FROM ICEBERG_SCAN('data/persistent/iceberg/lineitem_iceberg_gz_no_hint', ALLOW_MOVED_PATHS=TRUE, METADATA_COMPRESSION_CODEC="gzip");
+----
+111968
+
+query I
+SELECT count(*) FROM ICEBERG_SCAN('data/persistent/iceberg/lineitem_iceberg_gz_no_hint', ALLOW_MOVED_PATHS=TRUE, METADATA_COMPRESSION_CODEC="gzip", version='2', version_name_format='v%s%s.metadata.json');
+----
+111968
+
+require-env DUCKDB_ICEBERG_HAVE_GENERATED_DATA
+
+query I
+SELECT typeof(l_orderkey_bool) FROM ICEBERG_SCAN('data/generated/iceberg/spark-local/pyspark_iceberg_table_v1', ALLOW_MOVED_PATHS=TRUE) LIMIT 1;
+----
+BOOLEAN
+
+query I
+SELECT typeof(l_partkey_int) FROM ICEBERG_SCAN('data/generated/iceberg/spark-local/pyspark_iceberg_table_v1', ALLOW_MOVED_PATHS=TRUE) LIMIT 1;
+----
+INTEGER
+
+query I
+SELECT typeof(l_suppkey_long) FROM ICEBERG_SCAN('data/generated/iceberg/spark-local/pyspark_iceberg_table_v1', ALLOW_MOVED_PATHS=TRUE) LIMIT 1;
+----
+BIGINT
+
+query I
+SELECT typeof(l_extendedprice_float) FROM ICEBERG_SCAN('data/generated/iceberg/spark-local/pyspark_iceberg_table_v1', ALLOW_MOVED_PATHS=TRUE) LIMIT 1;
+----
+FLOAT
+
+query I
+SELECT typeof(l_extendedprice_double) FROM ICEBERG_SCAN('data/generated/iceberg/spark-local/pyspark_iceberg_table_v1', ALLOW_MOVED_PATHS=TRUE) LIMIT 1;
+----
+DOUBLE
+
+query I
+SELECT typeof(l_extendedprice_dec9_2) FROM ICEBERG_SCAN('data/generated/iceberg/spark-local/pyspark_iceberg_table_v1', ALLOW_MOVED_PATHS=TRUE) LIMIT 1;
+----
+DECIMAL(9,2)
+
+query I
+SELECT typeof(l_shipdate_date) FROM ICEBERG_SCAN('data/generated/iceberg/spark-local/pyspark_iceberg_table_v1', ALLOW_MOVED_PATHS=TRUE) LIMIT 1;
+----
+DATE
+
+# query I
+# SELECT typeof(l_partkey_time) FROM ICEBERG_SCAN('data/generated/iceberg/spark-local/pyspark_iceberg_table_v1', ALLOW_MOVED_PATHS=TRUE) LIMIT 1;
+# ----
+# INTEGER
+
+query I
+SELECT typeof(l_commitdate_timestamp) FROM ICEBERG_SCAN('data/generated/iceberg/spark-local/pyspark_iceberg_table_v1', ALLOW_MOVED_PATHS=TRUE) LIMIT 1;
+----
+TIMESTAMP
+
+query I
+SELECT typeof(l_commitdate_timestamp_tz) FROM ICEBERG_SCAN('data/generated/iceberg/spark-local/pyspark_iceberg_table_v1', ALLOW_MOVED_PATHS=TRUE) LIMIT 1;
+----
+TIMESTAMP WITH TIME ZONE
+
+query I
+SELECT typeof(l_comment_string) FROM ICEBERG_SCAN('data/generated/iceberg/spark-local/pyspark_iceberg_table_v1', ALLOW_MOVED_PATHS=TRUE) LIMIT 1;
+----
+VARCHAR
+
+query I
+SELECT typeof(l_comment_blob) FROM ICEBERG_SCAN('data/generated/iceberg/spark-local/pyspark_iceberg_table_v1', ALLOW_MOVED_PATHS=TRUE) LIMIT 1;
+----
+BLOB
diff --git a/test/sql/local/iceberg_scans/iceberg_scan_generated_data_0_001.test b/test/sql/local/iceberg_scans/iceberg_scan_generated_data_0_001.test
new file mode 100644
index 0000000..400c24d
--- /dev/null
+++ b/test/sql/local/iceberg_scans/iceberg_scan_generated_data_0_001.test
@@ -0,0 +1,124 @@
+# name: test/sql/local/iceberg_scans/iceberg_scan_generated_data_0_001.test
+# description: test iceberg extension with the sf0.001 generated test set
+# group: [iceberg]
+
+require parquet
+
+require iceberg
+
+### Invalid iceberg metadata leads to failed statement
+statement error
+SELECT count(*) FROM ICEBERG_SCAN('data/persistent/bad_data/bad_iceberg_metadata.json');
+----
+Invalid Input Error: Fails to parse iceberg metadata from data/persistent/bad_data/bad_iceberg_metadata.json
+
+### Iceberg spec v1
+
+require-env DUCKDB_ICEBERG_HAVE_GENERATED_DATA
+
+# Check count matches the count of the final state parquet file
+query I nosort table_v1_count
+SELECT count(*) FROM ICEBERG_SCAN('data/generated/iceberg/spark-local/pyspark_iceberg_table_v1');
+----
+
+query I nosort table_v1_count
+SELECT count(*) FROM PARQUET_SCAN('data/generated/intermediates/spark-local/pyspark_iceberg_table_v1/last/data.parquet/*.parquet');
+----
+
+# Check schema is identical, sorting by uuid to guarantee unique order
+query I nosort q1-schema
+DESCRIBE SELECT * FROM ICEBERG_SCAN('data/generated/iceberg/spark-local/pyspark_iceberg_table_v1') ORDER BY uuid;
+----
+
+query I nosort q1-schema
+DESCRIBE SELECT * FROM ICEBERG_SCAN('data/generated/iceberg/spark-local/pyspark_iceberg_table_v1/metadata/v9.metadata.json') ORDER BY uuid;
+----
+
+query I nosort q1-schema
+DESCRIBE SELECT * FROM PARQUET_SCAN('data/generated/intermediates/spark-local/pyspark_iceberg_table_v1/last/data.parquet/*.parquet') ORDER BY uuid;
+----
+
+# Check data is identical, sorting by uuid to guarantee unique order
+query I nosort q1-data
+SELECT * FROM ICEBERG_SCAN('data/generated/iceberg/spark-local/pyspark_iceberg_table_v1') ORDER BY uuid;
+----
+
+query I nosort q1-data
+SELECT * FROM ICEBERG_SCAN('data/generated/iceberg/spark-local/pyspark_iceberg_table_v1/metadata/v9.metadata.json') ORDER BY uuid;
+----
+
+query I nosort q1-data
+SELECT * FROM PARQUET_SCAN('data/generated/intermediates/spark-local/pyspark_iceberg_table_v1/last/data.parquet/*.parquet') ORDER BY uuid;
+----
+
+# Confirm the type matches that of the iceberg schema
+query IIIIII
+DESCRIBE SELECT schema_evol_added_col_1 FROM ICEBERG_SCAN('data/generated/iceberg/spark-local/pyspark_iceberg_table_v1') ORDER BY uuid;
+----
+schema_evol_added_col_1 BIGINT YES NULL NULL NULL
+
+### Iceberg spec v2
+
+# Check count matches
+query I nosort count_match_r1
+SELECT count(*) FROM ICEBERG_SCAN('data/generated/iceberg/spark-local/pyspark_iceberg_table_v2');
+----
+
+# We should also be able to scan the metadata file directly
+query I nosort count_match_r1
+SELECT count(*) FROM ICEBERG_SCAN('data/generated/iceberg/spark-local/pyspark_iceberg_table_v2/metadata/v9.metadata.json');
+----
+
+# Check schema is identical, sorting by uuid to guarantee unique order
+query I nosort q2-schema
+DESCRIBE SELECT * FROM ICEBERG_SCAN('data/generated/iceberg/spark-local/pyspark_iceberg_table_v2') ORDER BY uuid;
+----
+
+query I nosort q2-schema
+DESCRIBE SELECT * FROM ICEBERG_SCAN('data/generated/iceberg/spark-local/pyspark_iceberg_table_v2/metadata/v9.metadata.json') ORDER BY uuid;
+----
+
+query I nosort q2-schema
+DESCRIBE SELECT * FROM PARQUET_SCAN('data/generated/intermediates/spark-local/pyspark_iceberg_table_v2/last/data.parquet/*.parquet') ORDER BY uuid;
+----
+
+# Check data is identical, sorting by uuid to guarantee unique order
+query I nosort q2-data
+SELECT * FROM ICEBERG_SCAN('data/generated/iceberg/spark-local/pyspark_iceberg_table_v2') ORDER BY uuid;
+----
+
+query I nosort q2-data
+SELECT * FROM ICEBERG_SCAN('data/generated/iceberg/spark-local/pyspark_iceberg_table_v2/metadata/v9.metadata.json') ORDER BY uuid;
+----
+
+query I nosort q2-data
+SELECT * FROM PARQUET_SCAN('data/generated/intermediates/spark-local/pyspark_iceberg_table_v2/last/data.parquet/*.parquet') ORDER BY uuid;
+----
+
+### Test schema evolution
+
+# Latest metadata version has correct type
+query IIIIII
+DESCRIBE SELECT schema_evol_added_col_1 FROM ICEBERG_SCAN('data/generated/iceberg/spark-local/pyspark_iceberg_table_v2/metadata/v9.metadata.json') ORDER BY uuid;
+----
+schema_evol_added_col_1 BIGINT YES NULL NULL NULL
+
+# One before has the old type
+query IIIIII
+DESCRIBE SELECT schema_evol_added_col_1 FROM ICEBERG_SCAN('data/generated/iceberg/spark-local/pyspark_iceberg_table_v2/metadata/v8.metadata.json') ORDER BY uuid;
+----
+schema_evol_added_col_1 INTEGER YES NULL NULL NULL
+
+# Even older: it did not exist yet
+statement error
+DESCRIBE SELECT schema_evol_added_col_1 FROM ICEBERG_SCAN('data/generated/iceberg/spark-local/pyspark_iceberg_table_v2/metadata/v6.metadata.json') ORDER BY uuid;
+----
+Binder Error
+
+# Check that the estimated cardinality is injected into the plan
+query II
+EXPLAIN SELECT count(*) FROM ICEBERG_SCAN('data/generated/iceberg/spark-local/pyspark_iceberg_table_v2');
+----
+physical_plan :.*ICEBERG_SCAN.*Rows.*
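The EXPLAIN check above asserts that the scan injects an estimated cardinality into the physical plan, so the optimizer can see row counts without opening any data files. Roughly what that assertion amounts to outside sqllogictest; the plan text here is illustrative, and the regex in the test only requires that ICEBERG_SCAN and Rows both appear:

    import duckdb

    con = duckdb.connect()
    con.execute("LOAD iceberg")  # assumes the extension is installed
    plan = con.execute("""
        EXPLAIN SELECT count(*)
        FROM iceberg_scan('data/generated/iceberg/spark-local/pyspark_iceberg_table_v2')
    """).fetchall()
    # EXPLAIN returns (key, value) rows; the physical plan should show an
    # ICEBERG_SCAN operator annotated with an estimated row count.
    assert any("ICEBERG_SCAN" in value and "Rows" in value for _, value in plan)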
diff --git a/test/sql/local/iceberg_scans/iceberg_scan_generated_data_1.test_slow b/test/sql/local/iceberg_scans/iceberg_scan_generated_data_1.test_slow
new file mode 100644
index 0000000..a9a9f47
--- /dev/null
+++ b/test/sql/local/iceberg_scans/iceberg_scan_generated_data_1.test_slow
@@ -0,0 +1,43 @@
+# name: test/sql/local/iceberg_scans/iceberg_scan_generated_data_1.test_slow
+# description: test iceberg extension with the sf1 generated test set
+# group: [iceberg]
+
+require parquet
+
+require iceberg
+
+require-env DUCKDB_ICEBERG_HAVE_GENERATED_DATA
+
+# Check count matches
+query I nosort count_iceberg_scan
+SELECT count(*) FROM ICEBERG_SCAN('data/generated/iceberg/spark-local/pyspark_iceberg_table_v1');
+
+
+query I nosort count_iceberg_scan
+select count(*) from read_parquet('data/generated/intermediates/spark-local/pyspark_iceberg_table_v1/last/data.parquet/*.parquet');
+
+# Check counts and aggregates match, then compare the full data sorted by uuid
+query I nosort q1
+SELECT COUNT(*) FROM ICEBERG_SCAN('data/generated/iceberg/spark-local/pyspark_iceberg_table_v1');
+----
+
+query I nosort q1
+SELECT COUNT(*) FROM PARQUET_SCAN('data/generated/intermediates/spark-local/pyspark_iceberg_table_v1/last/data.parquet/*.parquet');
+----
+
+query IIII nosort q2
+SELECT COUNT(*), MIN(l_suppkey_long), MAX(l_suppkey_long), SUM(l_suppkey_long) FROM ICEBERG_SCAN('data/generated/iceberg/spark-local/pyspark_iceberg_table_v1');
+----
+
+query IIII nosort q2
+SELECT COUNT(*), MIN(l_suppkey_long), MAX(l_suppkey_long), SUM(l_suppkey_long) FROM PARQUET_SCAN('data/generated/intermediates/spark-local/pyspark_iceberg_table_v1/last/data.parquet/*.parquet');
+----
+
+# Full table compare: very slow
+query I nosort q3
+SELECT * FROM ICEBERG_SCAN('data/generated/iceberg/spark-local/pyspark_iceberg_table_v1') WHERE uuid IS NOT NULL ORDER BY uuid;
+----
+
+query I nosort q3
+SELECT * FROM PARQUET_SCAN('data/generated/intermediates/spark-local/pyspark_iceberg_table_v1/last/data.parquet/*.parquet') WHERE uuid IS NOT NULL ORDER BY uuid;
+----
diff --git a/test/sql/local/iceberg_scans/iceberg_snapshots.test b/test/sql/local/iceberg_scans/iceberg_snapshots.test
new file mode 100644
index 0000000..4accb3c
--- /dev/null
+++ b/test/sql/local/iceberg_scans/iceberg_snapshots.test
@@ -0,0 +1,87 @@
+# name: test/sql/local/iceberg_scans/iceberg_snapshots.test
+# description: test iceberg snapshots function
+# group: [iceberg]
+
+# Before we load the extension, this will fail
+statement error
+SELECT * FROM ICEBERG_SNAPSHOTS('data/persistent/iceberg/lineitem_iceberg');
+----
+Catalog Error
+
+require notwindows
+
+require parquet
+
+require iceberg
+
+query IIII
+SELECT * FROM ICEBERG_SNAPSHOTS('data/persistent/iceberg/lineitem_iceberg');
+----
+1 3776207205136740581 2023-02-15 15:07:54.504 lineitem_iceberg/metadata/snap-3776207205136740581-1-cf3d0be5-cf70-453d-ad8f-48fdc412e608.avro
+2 7635660646343998149 2023-02-15 15:08:14.73 lineitem_iceberg/metadata/snap-7635660646343998149-1-10eaca8a-1e1c-421e-ad6d-b232e5ee23d3.avro
+
+query IIII
+SELECT * FROM ICEBERG_SNAPSHOTS('data/persistent/iceberg/lineitem_iceberg', version='1');
+----
+1 3776207205136740581 2023-02-15 15:07:54.504 lineitem_iceberg/metadata/snap-3776207205136740581-1-cf3d0be5-cf70-453d-ad8f-48fdc412e608.avro
+
+statement error
+SELECT * FROM ICEBERG_SNAPSHOTS('data/persistent/iceberg/lineitem_iceberg', version="1", version_name_format='v%s%s.metadata.gz');
+----
+IO Error: Iceberg metadata file not found for table version '1' using 'none' compression and format(s): 'v%s%s.metadata.gz'
+
+query IIII
+SELECT * FROM ICEBERG_SNAPSHOTS('data/persistent/iceberg/lineitem_iceberg', version="1", version_name_format='v%s%s.metadata.json');
+----
+1 3776207205136740581 2023-02-15 15:07:54.504 lineitem_iceberg/metadata/snap-3776207205136740581-1-cf3d0be5-cf70-453d-ad8f-48fdc412e608.avro
+
+query IIII
+SELECT * FROM ICEBERG_SNAPSHOTS('data/persistent/iceberg/lineitem_iceberg', version='1');
+----
+1 3776207205136740581 2023-02-15 15:07:54.504 lineitem_iceberg/metadata/snap-3776207205136740581-1-cf3d0be5-cf70-453d-ad8f-48fdc412e608.avro
+
+statement error
+SELECT * FROM ICEBERG_SNAPSHOTS('data/persistent/iceberg/lineitem_iceberg_nonexistent');
+----
+IO Error: Failed to read iceberg table. No version was provided and no version-hint could be found,
+
+statement error
+SELECT * FROM ICEBERG_SNAPSHOTS('data/persistent/iceberg/lineitem_iceberg_gz');
+----
+IO Error: Iceberg metadata file not found for table version '2' using 'none' compression and format(s): 'v%s%s.metadata.json,%s%s.metadata.json'
+
+query IIII
+SELECT * FROM ICEBERG_SNAPSHOTS('data/persistent/iceberg/lineitem_iceberg_gz', metadata_compression_codec="gzip");
+----
+0 4468019210336628573 2024-03-13 18:38:58.602 lineitem_iceberg_gz/metadata/snap-4468019210336628573-1-23f9dbea-1e7f-4694-a82c-dc3c9a94953e.avro
+
+query IIII
+SELECT * FROM ICEBERG_SNAPSHOTS('data/persistent/iceberg/lineitem_iceberg_gz', metadata_compression_codec="gzip", version='2');
+----
+0 4468019210336628573 2024-03-13 18:38:58.602 lineitem_iceberg_gz/metadata/snap-4468019210336628573-1-23f9dbea-1e7f-4694-a82c-dc3c9a94953e.avro
+
+statement ok
+SET unsafe_enable_version_guessing=true;
+
+query IIII
+SELECT * FROM ICEBERG_SNAPSHOTS('data/persistent/iceberg/lineitem_iceberg_no_hint');
+----
+1 3776207205136740581 2023-02-15 15:07:54.504 lineitem_iceberg/metadata/snap-3776207205136740581-1-cf3d0be5-cf70-453d-ad8f-48fdc412e608.avro
+2 7635660646343998149 2023-02-15 15:08:14.73 lineitem_iceberg/metadata/snap-7635660646343998149-1-10eaca8a-1e1c-421e-ad6d-b232e5ee23d3.avro
+
+query IIII
+SELECT * FROM ICEBERG_SNAPSHOTS('data/persistent/iceberg/lineitem_iceberg_no_hint', version='1');
+----
+1 3776207205136740581 2023-02-15 15:07:54.504 lineitem_iceberg/metadata/snap-3776207205136740581-1-cf3d0be5-cf70-453d-ad8f-48fdc412e608.avro
+
+query IIII
+SELECT * FROM ICEBERG_SNAPSHOTS('data/persistent/iceberg/lineitem_iceberg_no_hint', version="?");
+----
+1 3776207205136740581 2023-02-15 15:07:54.504 lineitem_iceberg/metadata/snap-3776207205136740581-1-cf3d0be5-cf70-453d-ad8f-48fdc412e608.avro
+2 7635660646343998149 2023-02-15 15:08:14.73 lineitem_iceberg/metadata/snap-7635660646343998149-1-10eaca8a-1e1c-421e-ad6d-b232e5ee23d3.avro
+
+query IIII
+SELECT * FROM ICEBERG_SNAPSHOTS('data/persistent/iceberg/lineitem_iceberg_gz_no_hint', metadata_compression_codec="gzip");
+----
+0 4468019210336628573 2024-03-13 18:38:58.602 lineitem_iceberg_gz/metadata/snap-4468019210336628573-1-23f9dbea-1e7f-4694-a82c-dc3c9a94953e.avro
+
diff --git a/test/sql/local/iceberg_scans/iceberge_read_deletes.test b/test/sql/local/iceberg_scans/iceberge_read_deletes.test
new file mode 100644
index 0000000..e505813
--- /dev/null
+++ b/test/sql/local/iceberg_scans/iceberge_read_deletes.test
@@ -0,0 +1,184 @@
+# name: test/sql/local/iceberg_scans/iceberge_read_deletes.test
+# description: test reading iceberg tables with deletes
+# group: [iceberg]
+
+require-env DUCKDB_ICEBERG_HAVE_GENERATED_DATA
+
+require parquet
+
+require iceberg
+
+statement ok
+set enable_logging=true;
+
+# TODO verify the catalog has deletes (rest catalog stores data differently from local catalog)
+query III nosort results_1
+select sum(l_suppkey), min(l_suppkey), max(l_suppkey) from ICEBERG_SCAN('data/generated/iceberg/spark-local/table_with_deletes');
+
+query III nosort results_1
+select sum(l_suppkey), min(l_suppkey), max(l_suppkey) from read_parquet('data/generated/intermediates/spark-local/table_with_deletes/last/data.parquet/*.parquet');
+
+# query I
+# select message::INT > 0 from duckdb_logs() where type like '%duckdb-iceberg.finalize_deletes%';
+# ----
+# true
+
+# Verify parquet scans on tables with delete vectors do not mess with results
+query I nosort table_filter_result
+select l_partkey from ICEBERG_SCAN('data/generated/iceberg/spark-local/lineitem_001_deletes') where l_partkey > 5 and l_partkey < 20000;
+----
+
+query I nosort table_filter_result
+select l_partkey from read_parquet('data/generated/intermediates/spark-local/lineitem_001_deletes/last/data.parquet/*.parquet') where l_partkey > 5 and l_partkey < 20000;
+----
+
+# Verify Deletes
+# joins with a table that has deletes.
+# Projecting l_partkey, joining on l_partkey
+query II nosort join_results
+select l1_deletes.l_partkey, count(*) count from
+  ICEBERG_SCAN('data/generated/iceberg/spark-local/lineitem_sf1_deletes') l1_deletes,
+  ICEBERG_SCAN('data/generated/iceberg/spark-local/lineitem_sf_01_no_deletes') l2_no_deletes
+where l1_deletes.l_partkey = l2_no_deletes.l_partkey
+group by l1_deletes.l_partkey
+order by l1_deletes.l_partkey, count
+----
+
+# query I
+# select message::INT > 0 from duckdb_logs() where type like '%duckdb-iceberg.finalize_deletes%' order by timestamp desc limit 1;
+# ----
+# true
+
+query II nosort join_results
+select l1_deletes.l_partkey, count(*) count from
+ICEBERG_SCAN('data/generated/iceberg/spark-local/lineitem_sf1_deletes') l1_deletes,
+ICEBERG_SCAN('data/generated/iceberg/spark-local/lineitem_sf_01_no_deletes') l2_no_deletes
+where l1_deletes.l_partkey = l2_no_deletes.l_partkey
+group by l1_deletes.l_partkey
+order by l1_deletes.l_partkey, count
+----
+
+
+# Verify Deletes
+# joins with a table that has deletes.
+# Projecting l_orderkey, joining on l_partkey
+query II nosort join_results_2_orderkey
+select l1_deletes.l_orderkey, count(*) count from
+  ICEBERG_SCAN('data/generated/iceberg/spark-local/lineitem_sf1_deletes') l1_deletes,
+  ICEBERG_SCAN('data/generated/iceberg/spark-local/lineitem_sf_01_no_deletes') l2_no_deletes
+where l1_deletes.l_partkey = l2_no_deletes.l_partkey
+group by l1_deletes.l_orderkey
+order by l1_deletes.l_orderkey, count
+----
+
+# query I
+# select message::INT > 0 from duckdb_logs() where type like '%duckdb-iceberg.finalize_deletes%' order by timestamp desc limit 1;
+# ----
+# true
+
+query II nosort join_results_2_orderkey
+select l1_deletes.l_orderkey, count(*) count from
+ICEBERG_SCAN('data/generated/iceberg/spark-local/lineitem_sf1_deletes') l1_deletes,
+ICEBERG_SCAN('data/generated/iceberg/spark-local/lineitem_sf_01_no_deletes') l2_no_deletes
+where l1_deletes.l_partkey = l2_no_deletes.l_partkey
+group by l1_deletes.l_orderkey
+order by l1_deletes.l_orderkey, count
+----
+
+
+# Verify a single delete
+query IIII nosort single_delete_result
+select l_orderkey,
+       l_partkey,
+       l_suppkey,
+       l_quantity
+from ICEBERG_SCAN('data/generated/iceberg/spark-local/lineitem_sf_01_1_delete')
+order by l_partkey, l_orderkey limit 10;
+----
+
+query IIII nosort single_delete_result
+select l_orderkey,
+       l_partkey,
+       l_suppkey,
+       l_quantity
+from read_parquet('data/generated/intermediates/spark-rest/lineitem_sf_01_1_delete/last/data.parquet/*.parquet')
+order by l_partkey, l_orderkey limit 10;
+----
+
+query I
+select count(*)
+from ICEBERG_SCAN('data/generated/iceberg/spark-local/lineitem_sf_01_1_delete')
+where l_orderkey=10053 and l_partkey = 77;
+----
+0
+
+query I
+select count(*)
+from read_parquet('data/generated/intermediates/spark-rest/lineitem_sf_01_1_delete/last/data.parquet/*.parquet')
+where l_orderkey=10053 and l_partkey = 77;
+----
+0
+
+
+# verify partitioned table read
+# add tests for partitioned tables.
+query II nosort result_4_partitioned
+select l_shipmode, count(*) count
+from ICEBERG_SCAN('data/generated/iceberg/spark-local/lineitem_partitioned_l_shipmode')
+group by l_shipmode order by count;
+----
+
+query II nosort result_4_partitioned
+select l_shipmode, count(*) count
+from read_parquet('data/generated/intermediates/spark-rest/lineitem_partitioned_l_shipmode/last/data.parquet/*.parquet')
+group by l_shipmode order by count;
+----
+
+# verify partitioned table read with table filters
+# add tests for partitioned tables.
+query II nosort result_4_partitioned_table_filters
+select l_shipmode, count(*) count
+from ICEBERG_SCAN('data/generated/iceberg/spark-local/lineitem_partitioned_l_shipmode')
+where l_partkey > 50
+group by l_shipmode order by count;
+----
+
+query II nosort result_4_partitioned_table_filters
+select l_shipmode, count(*) count
+from read_parquet('data/generated/intermediates/spark-rest/lineitem_partitioned_l_shipmode/last/data.parquet/*.parquet')
+where l_partkey > 50
+group by l_shipmode order by count;
+----
+
+# verify delete from partitioned table
+# create table lineitem_partitioned_l_shipmode_deletes as select * from lineitem (merge-on-read, partition by l_shipmode)
+# select count(*), l_shipmode from lineitem where l_linenumber in (3,4,5,6) group by l_shipmode ;
+# add tests for partitioned tables.
+query II nosort result_5
+select l_shipmode, count(*) count
+from ICEBERG_SCAN('data/generated/iceberg/spark-local/lineitem_partitioned_l_shipmode_deletes')
+group by l_shipmode order by count;
+----
+
+query II nosort result_5
+select l_shipmode, count(*) count
+from read_parquet('data/generated/intermediates/spark-rest/lineitem_partitioned_l_shipmode_deletes/last/data.parquet/*.parquet')
+group by l_shipmode order by count;
+----
+
+# verify select on partitioned table deletes with table_filters
+query II nosort result_5_table_filter
+select l_shipmode, count(*) count
+from ICEBERG_SCAN('data/generated/iceberg/spark-local/lineitem_partitioned_l_shipmode_deletes')
+where l_partkey > 100
+group by l_shipmode order by count;
+----
+
+query II nosort result_5_table_filter
+select l_shipmode, count(*) count
+from read_parquet('data/generated/intermediates/spark-rest/lineitem_partitioned_l_shipmode_deletes/last/data.parquet/*.parquet')
+where l_partkey > 100
+group by l_shipmode order by count;
+----
+
+
diff --git a/test/sql/local/irc/iceberg_catalog_read.test b/test/sql/local/irc/iceberg_catalog_read.test
new file mode 100644
index 0000000..1b903ca
--- /dev/null
+++ b/test/sql/local/irc/iceberg_catalog_read.test
@@ -0,0 +1,115 @@
+# name: test/sql/local/irc/iceberg_catalog_read.test
+# description: test integration with iceberg catalog read
+# group: [iceberg]
+
+require-env ICEBERG_SERVER_AVAILABLE
+
+require parquet
+
+require iceberg
+
+require httpfs
+
+statement ok
+CREATE SECRET (
+    TYPE ICEBERG,
+    ENDPOINT 'http://127.0.0.1:8181'
+  );
+
+
+statement ok
+CREATE SECRET (
+    TYPE S3,
+    KEY_ID 'admin',
+    SECRET 'password',
+    ENDPOINT '127.0.0.1:9000',
+    URL_STYLE 'path',
+    USE_SSL 0
+  );
+
+
+statement ok
+ATTACH '' AS my_datalake (TYPE ICEBERG);
+
+query IIIIII
+Show all tables;
+----
+my_datalake default pyspark_iceberg_table_v1 [__] [INTEGER] false
+my_datalake default pyspark_iceberg_table_v2 [__] [INTEGER] false
+my_datalake default table_more_deletes [__] [INTEGER] false
+my_datalake default table_partitioned [__] [INTEGER] false
+my_datalake default table_unpartitioned [__] [INTEGER] false
+
+statement error
+select * from table_unpartitioned
+----
+:.*table_unpartitioned does not exist.*
+
+statement error
+select * from table_unpartitioned
+----
+:.*Did you mean.*my_datalake.default.table_unpartitioned.*
+
+query III
+select * from my_datalake.default.table_unpartitioned order by all;
+----
+2023-03-01 1 a
+2023-03-02 2 b
+2023-03-03 3 c
+2023-03-04 4 d
+2023-03-05 5 e
+2023-03-06 6 f
+2023-03-07 7 g
+2023-03-08 8 h
+2023-03-09 9 i
+2023-03-10 10 j
+2023-03-11 11 k
+2023-03-12 12 l
+
+# test deletes (see generate_iceberg_spark_rest.py for where deletes occur)
+query III
+select * from my_datalake.default.table_more_deletes order by all;
+----
+2023-03-01 1 a
+2023-03-02 2 b
+2023-03-03 3 c
+2023-03-10 10 j
+2023-03-11 11 k
+2023-03-12 12 l
+
+
+query I
+select sum(l_suppkey_long) from my_datalake.default.pyspark_iceberg_table_v2;
+----
+15602826438
+
+statement error
+update my_datalake.default.table_unpartitioned set number = 5 where number < 5;
+----
+:.*Not implemented Error.*
+
+statement error
+delete from my_datalake.default.table_unpartitioned where number < 5;
+----
+:.*Not implemented Error.*
+
+statement error
+insert into my_datalake.default.table_unpartitioned values ('2023-03-13', 13, 'm');
+----
+:.*Not implemented Error.*
+
+statement error
+Create table my_datalake.default.new_table (a int, b varchar);
+----
+:.*Not implemented Error.*
+
+statement error
+Alter table my_datalake.default.table_more_deletes add column new_column INTEGER default 10;
+----
+:.*Not implemented Error.*
+
+statement error
+Drop table my_datalake.default.table_more_deletes;
+----
+:.*Not implemented Error.*
+
diff --git a/test/sql/local/irc/irc_catalog_read_deletes.test b/test/sql/local/irc/irc_catalog_read_deletes.test
new file mode 100644
index 0000000..184044f
--- /dev/null
+++ b/test/sql/local/irc/irc_catalog_read_deletes.test
@@ -0,0 +1,120 @@
+# name: test/sql/local/irc/irc_catalog_read_deletes.test
+# description: test reading deletes through the iceberg rest catalog
+# group: [iceberg]
+
+require-env ICEBERG_SERVER_AVAILABLE
+
+require parquet
+
+require iceberg
+
+require httpfs
+
+statement ok
+CREATE SECRET (
+    TYPE ICEBERG,
+    ENDPOINT 'http://127.0.0.1:8181'
+  );
+
+
+statement ok
+CREATE SECRET (
+    TYPE S3,
+    KEY_ID 'admin',
+    SECRET 'password',
+    ENDPOINT '127.0.0.1:9000',
+    URL_STYLE 'path',
+    USE_SSL 0
+  );
+
+
+statement ok
+ATTACH '' AS my_datalake (TYPE ICEBERG);
+
+
+statement ok
+set enable_logging=true;
+# verify deletes
+
+# TODO verify the catalog has deletes (rest catalog stores data differently from local catalog)
+query III nosort results_1
+select sum(l_suppkey), min(l_suppkey), max(l_suppkey) from my_datalake.default.table_with_deletes;
+
+query III nosort results_1
+select sum(l_suppkey), min(l_suppkey), max(l_suppkey) from read_parquet('data/generated/intermediates/spark-rest/table_with_deletes/last/data.parquet/*.parquet');
+
+query I
+select message::INT > 0 from duckdb_logs() where type like '%duckdb-iceberg.finalize_deletes%';
+----
+true
+
+mode skip
+
+# Verify Deletes
+# joins with a table that has deletes.
+query II nosort results_2
+select l1_deletes.l_orderkey, count(*) count from
+  my_datalake.default.lineitem_sf1_deletes l1_deletes,
+  my_datalake.default.lineitem_sf_01_no_deletes l2_no_deletes
+where l1_deletes.l_partkey = l2_no_deletes.l_partkey
+group by l1_deletes.l_orderkey
+order by l1_deletes.l_orderkey, count
+limit 10;
+----
+
+
+query II nosort results_2
+select l1_deletes.l_orderkey, count(*) count from
+  read_parquet('data/generated/intermediates/spark-rest/lineitem_sf1_deletes/last/data.parquet/*.parquet') l1_deletes,
+  read_parquet('data/generated/intermediates/spark-rest/lineitem_sf_01_no_deletes/last/data.parquet/*.parquet') l2_no_deletes
+where l1_deletes.l_partkey = l2_no_deletes.l_partkey
+group by l1_deletes.l_orderkey
+order by l1_deletes.l_orderkey, count
+limit 10
+;
+
+# Verify a single delete
+query IIII nosort result_3
+select l_orderkey, l_partkey, l_suppkey, l_quantity from my_datalake.default.lineitem_sf_01_1_delete order by l_partkey, l_orderkey limit 10;
+----
+
+query IIII nosort result_3
+select l_orderkey, l_partkey, l_suppkey, l_quantity from read_parquet('data/generated/intermediates/spark-rest/lineitem_sf_01_1_delete/last/data.parquet/*.parquet') order by l_partkey, l_orderkey limit 10;
+----
+
+query I
+select count(*) from my_datalake.default.lineitem_sf_01_1_delete where l_orderkey=10053 and l_partkey = 77;
+----
+0
+
+query I
+select count(*) from read_parquet('data/generated/intermediates/spark-rest/lineitem_sf_01_1_delete/last/data.parquet/*.parquet') where l_orderkey=10053 and l_partkey = 77;
+----
+0
+
+
+# Verify reading from large partitioned table
+# add tests for partitioned tables.
+query II nosort result_4
+select l_shipmode, count(*) count from my_datalake.default.lineitem_partitioned_l_shipmode group by l_shipmode order by count;
+----
+
+query II nosort result_4
+select l_shipmode, count(*) count from read_parquet('data/generated/intermediates/spark-rest/lineitem_partitioned_l_shipmode/last/data.parquet/*.parquet') group by l_shipmode order by count;
+----
+
+
+# Verify reading from large partitioned table with deletes
+query II nosort result_5
+select l_shipmode, count(*) count from my_datalake.default.lineitem_partitioned_l_shipmode_deletes group by l_shipmode order by count;
+----
+
+query II nosort result_5
+select l_shipmode, count(*) count from read_parquet('data/generated/intermediates/spark-rest/lineitem_partitioned_l_shipmode_deletes/last/data.parquet/*.parquet') group by l_shipmode order by count;
+----
+
+query I
+select message::INT > 0 from duckdb_logs() where type like '%duckdb-iceberg.finalize_deletes%' order by timestamp desc limit 1;
+----
+1
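Several blocks in these two files validate against duckdb_logs() to prove the delete-file path actually ran rather than being silently skipped. Outside the test harness the same check could look roughly like this, assuming the generated data exists and a log type matching the '%finalize_deletes%' pattern used above:

    import duckdb

    con = duckdb.connect()
    con.execute("LOAD iceberg")  # assumes the extension is installed
    con.execute("SET enable_logging = true")
    # Scan a table that is known to carry positional delete files.
    con.execute("""
        SELECT sum(l_suppkey)
        FROM iceberg_scan('data/generated/iceberg/spark-local/table_with_deletes')
    """).fetchall()
    # The extension logs the number of delete rows it applied; a positive
    # count means the delete files were actually consumed during the scan.
    applied = con.execute("""
        SELECT message FROM duckdb_logs()
        WHERE type LIKE '%duckdb-iceberg.finalize_deletes%'
    """).fetchall()
    assert applied and all(int(message) > 0 for (message,) in applied)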