Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Update the iceberg_scan internals to make use of the MultiFileReader API #101

Open
wants to merge 28 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from 24 commits
Commits
Show all changes
28 commits
Select commit Hold shift + click to select a range
311dc60
rework the internals of the iceberg extension to use the MultiFileRea…
Tishj Feb 13, 2025
6d4101e
bring back accidentally deleted changes
Tishj Feb 13, 2025
8572345
default table version
Tishj Feb 13, 2025
4cb7285
apply changes to test
Tishj Feb 14, 2025
2fba666
fix crash in 'StructColumnReader::GetChildReader', for the 'file_row_…
Tishj Feb 14, 2025
57aebb9
update duckdb submodule to 1.2
Tishj Feb 14, 2025
5d56ce9
change irc to work with refactored iceberg_scan, deletion vectors sti…
samansmink Feb 17, 2025
d50fbe7
'Mytherin/multifilereaderrework' switches the order of CreateMultiFil…
Tishj Feb 19, 2025
f53dbb9
positional delete files optionally have a third column, 'row', have t…
Tishj Feb 20, 2025
17310a9
don't move to the next manifest directly, only move once the current m…
Tishj Feb 21, 2025
c531742
data probably had the same problem, this should fix it
Tishj Feb 21, 2025
7340270
don't blindly assume that rowids are only sequential, there could be g…
Tishj Feb 24, 2025
1e6b32f
simplify and fix the Apply method
Tishj Feb 24, 2025
59e234c
fixed Apply thanks to Tom, reminder to make this performant later..
Tishj Feb 24, 2025
d71eb18
add more tests, especially tests that test delete vectors
Tmonster Feb 24, 2025
71c1429
revert tests so they pass on current CI
Tmonster Feb 24, 2025
9647ed5
remove untested files
Tmonster Feb 25, 2025
d0ad861
fix last few tests so they pass
Tmonster Feb 25, 2025
a204f78
Merge branch 'multi_file_reader' into add_more_tests_to_verify_readin…
Tishj Feb 25, 2025
f0b6976
update test result
Tishj Feb 25, 2025
439ae8f
remove mode skips and enable logging code
Tmonster Feb 26, 2025
5e56306
Merge branch 'add_more_tests_to_verify_reading_tables_with_deletes' i…
Tishj Feb 26, 2025
1aa1085
Merge remote-tracking branch 'upstream/main' into multi_file_reader
Tishj Feb 27, 2025
ee475a2
load parquet before iceberg
Tishj Feb 27, 2025
573ac59
point duckdb at the v1.2.0 tag
Tishj Feb 28, 2025
c15d301
exploit the sorted property of (positional) delete files, reducing th…
Tishj Feb 28, 2025
9e351c7
return nullptr for ComplexFilterPushdown for now
Tishj Feb 28, 2025
1c8a1ac
fix up error messages
Tishj Feb 28, 2025
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 0 additions & 4 deletions .github/workflows/Rest.yml
Original file line number Diff line number Diff line change
Expand Up @@ -50,10 +50,6 @@ jobs:
run: |
make release

- name: Start Rest Catalog
run: |
make start-rest-catalog

- name: Generate data
run: |
make data
Expand Down
1 change: 1 addition & 0 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@
src/common/utils.cpp
src/common/schema.cpp
src/common/iceberg.cpp
src/iceberg_functions/iceberg_multi_file_reader.cpp
src/iceberg_functions/iceberg_snapshots.cpp
src/iceberg_functions/iceberg_scan.cpp
src/iceberg_functions/iceberg_metadata.cpp
Expand Down
2 changes: 1 addition & 1 deletion duckdb
Submodule duckdb updated 1106 files
2 changes: 1 addition & 1 deletion extension_config.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -6,4 +6,4 @@ duckdb_extension_load(iceberg
LOAD_TESTS
)

duckdb_extension_load(tpch)
duckdb_extension_load(tpch)
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
CREATE or REPLACE TABLE iceberg_catalog.lineitem_001_deletes
TBLPROPERTIES (
'format-version'='2',
'write.update.mode'='merge-on-read'
)
AS SELECT * FROM parquet_file_view;
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
update iceberg_catalog.lineitem_001_deletes
set l_orderkey=NULL,
l_partkey=NULL,
l_suppkey=NULL,
l_linenumber=NULL,
l_quantity=NULL,
l_extendedprice=NULL,
l_discount=NULL,
l_shipdate=NULL,
l_comment=NULL
where l_partkey % 2 = 0;
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
import duckdb
import os

PARQUET_SRC_FILE = os.getenv('PARQUET_SRC_FILE')

duckdb_con = duckdb.connect()
duckdb_con.execute("call dbgen(sf=0.01)")
duckdb_con.execute(f"copy lineitem to '{PARQUET_SRC_FILE}' (FORMAT PARQUET)")
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
CREATE or REPLACE TABLE iceberg_catalog.lineitem_001_deletes
TBLPROPERTIES (
'format-version'='2',
'write.update.mode'='merge-on-read'
)
AS SELECT * FROM parquet_file_view;
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
update iceberg_catalog.lineitem_001_deletes
set l_orderkey=NULL,
l_partkey=NULL,
l_suppkey=NULL,
l_linenumber=NULL,
l_quantity=NULL,
l_extendedprice=NULL,
l_discount=NULL,
l_shipdate=NULL,
l_comment=NULL
where l_partkey % 2 = 0;
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
import duckdb
import os

PARQUET_SRC_FILE = os.getenv('PARQUET_SRC_FILE')

duckdb_con = duckdb.connect()
duckdb_con.execute("call dbgen(sf=0.01)")
duckdb_con.execute(f"copy lineitem to '{PARQUET_SRC_FILE}' (FORMAT PARQUET)")
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
CREATE OR REPLACE TABLE iceberg_catalog.lineitem_partitioned_l_shipmode
USING iceberg
PARTITIONED BY (l_shipmode)
TBLPROPERTIES (
'format-version'='2',
'write.update.mode'='merge-on-read'
)
as select * from parquet_file_view;
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
delete from iceberg_catalog.lineitem_partitioned_l_shipmode where l_shipmode = 'TRUCK';
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
import duckdb
import os

PARQUET_SRC_FILE = os.getenv('PARQUET_SRC_FILE')

duckdb_con = duckdb.connect()
duckdb_con.execute("call dbgen(sf=0.01)")
duckdb_con.execute(f"copy lineitem to '{PARQUET_SRC_FILE}' (FORMAT PARQUET)")
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
CREATE OR REPLACE TABLE iceberg_catalog.lineitem_partitioned_l_shipmode_deletes
USING iceberg
PARTITIONED BY (l_shipmode)
TBLPROPERTIES (
'format-version'='2',
'write.update.mode'='merge-on-read'
)
as select * from parquet_file_view;
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
UPDATE iceberg_catalog.lineitem_partitioned_l_shipmode_deletes
Set l_comment=NULL,
l_quantity=NULL,
l_discount=NULL,
l_linestatus=NULL
where l_linenumber = 3 or l_linenumber = 4 or l_linenumber = 5;
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
import duckdb
import os

PARQUET_SRC_FILE = os.getenv('PARQUET_SRC_FILE')

duckdb_con = duckdb.connect()
duckdb_con.execute("call dbgen(sf=0.01)")
duckdb_con.execute(f"copy lineitem to '{PARQUET_SRC_FILE}' (FORMAT PARQUET)")
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
CREATE or REPLACE TABLE iceberg_catalog.lineitem_sf1_deletes
TBLPROPERTIES (
'format-version'='2',
'write.update.mode'='merge-on-read'
)
AS SELECT * FROM parquet_file_view;
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
update iceberg_catalog.lineitem_sf1_deletes
set l_orderkey=NULL,
l_partkey=NULL,
l_suppkey=NULL,
l_linenumber=NULL,
l_quantity=NULL,
l_extendedprice=NULL,
l_discount=NULL,
l_shipdate=NULL,
l_comment=NULL
where l_partkey % 2 = 0;
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
import duckdb
import os

PARQUET_SRC_FILE = os.getenv('PARQUET_SRC_FILE')

duckdb_con = duckdb.connect()
duckdb_con.execute("call dbgen(sf=1)")
duckdb_con.execute(f"copy lineitem to '{PARQUET_SRC_FILE}' (FORMAT PARQUET)")
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
CREATE or REPLACE TABLE iceberg_catalog.lineitem_sf_01_1_delete
TBLPROPERTIES (
'format-version'='2',
'write.update.mode'='merge-on-read'
)
AS SELECT * FROM parquet_file_view;
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
delete from iceberg_catalog.lineitem_sf_01_1_delete where l_orderkey=10053 and l_partkey = 77;
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
import duckdb
import os

PARQUET_SRC_FILE = os.getenv('PARQUET_SRC_FILE')

duckdb_con = duckdb.connect()
duckdb_con.execute("call dbgen(sf=0.01)")
duckdb_con.execute(f"copy lineitem to '{PARQUET_SRC_FILE}' (FORMAT PARQUET)")
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
CREATE or REPLACE TABLE iceberg_catalog.lineitem_sf_01_no_deletes
TBLPROPERTIES (
'format-version'='2',
'write.update.mode'='merge-on-read'
)
AS SELECT * FROM parquet_file_view;
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
import duckdb
import os

PARQUET_SRC_FILE = os.getenv('PARQUET_SRC_FILE')

duckdb_con = duckdb.connect()
duckdb_con.execute("call dbgen(sf=0.01)")
duckdb_con.execute(f"copy lineitem to '{PARQUET_SRC_FILE}' (FORMAT PARQUET)")
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
CREATE or REPLACE TABLE iceberg_catalog.table_with_deletes
TBLPROPERTIES (
'format-version'='2',
'write.update.mode'='merge-on-read'
)
AS SELECT * FROM parquet_file_view;
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
update iceberg_catalog.table_with_deletes
set l_orderkey=NULL,
l_partkey=NULL,
l_suppkey=NULL,
l_linenumber=NULL,
l_quantity=NULL,
l_extendedprice=NULL,
l_discount=NULL,
l_shipdate=NULL,
l_comment=NULL
where l_partkey % 2 = 0;
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
import duckdb
import os

PARQUET_SRC_FILE = os.getenv('PARQUET_SRC_FILE')

duckdb_con = duckdb.connect()
duckdb_con.execute("call dbgen(sf=0.01)")
duckdb_con.execute(f"copy lineitem to '{PARQUET_SRC_FILE}' (FORMAT PARQUET)")
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
CREATE or REPLACE TABLE default.lineitem_001_deletes
TBLPROPERTIES (
'format-version'='2',
'write.update.mode'='merge-on-read'
)
AS SELECT * FROM parquet_file_view;
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
update default.lineitem_001_deletes
set l_orderkey=NULL,
l_partkey=NULL,
l_suppkey=NULL,
l_linenumber=NULL,
l_quantity=NULL,
l_extendedprice=NULL,
l_discount=NULL,
l_shipdate=NULL,
l_comment=NULL
where l_partkey % 2 = 0;
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
import duckdb
import os

PARQUET_SRC_FILE = os.getenv('PARQUET_SRC_FILE')

duckdb_con = duckdb.connect()
duckdb_con.execute("call dbgen(sf=0.01)")
duckdb_con.execute(f"copy lineitem to '{PARQUET_SRC_FILE}' (FORMAT PARQUET)")
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
CREATE OR REPLACE TABLE default.lineitem_partitioned_l_shipmode
USING iceberg
PARTITIONED BY (l_shipmode)
TBLPROPERTIES (
'format-version'='2',
'write.update.mode'='merge-on-read'
)
as select * from parquet_file_view;
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
delete from default.lineitem_partitioned_l_shipmode where l_shipmode = 'TRUCK';
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
import duckdb
import os

PARQUET_SRC_FILE = os.getenv('PARQUET_SRC_FILE')

duckdb_con = duckdb.connect()
duckdb_con.execute("call dbgen(sf=0.01)")
duckdb_con.execute(f"copy lineitem to '{PARQUET_SRC_FILE}' (FORMAT PARQUET)")
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
CREATE OR REPLACE TABLE default.lineitem_partitioned_l_shipmode_deletes
USING iceberg
PARTITIONED BY (l_shipmode)
TBLPROPERTIES (
'format-version'='2',
'write.update.mode'='merge-on-read'
)
as select * from parquet_file_view;
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
UPDATE default.lineitem_partitioned_l_shipmode_deletes
Set l_comment=NULL,
l_quantity=NULL,
l_discount=NULL,
l_linestatus=NULL
where l_linenumber = 3 or l_linenumber = 4 or l_linenumber = 5;
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
import duckdb
import os

PARQUET_SRC_FILE = os.getenv('PARQUET_SRC_FILE')

duckdb_con = duckdb.connect()
duckdb_con.execute("call dbgen(sf=0.01)")
duckdb_con.execute(f"copy lineitem to '{PARQUET_SRC_FILE}' (FORMAT PARQUET)")
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
CREATE or REPLACE TABLE default.lineitem_sf1_deletes
TBLPROPERTIES (
'format-version'='2',
'write.update.mode'='merge-on-read'
)
AS SELECT * FROM parquet_file_view;
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
update default.lineitem_sf1_deletes
set l_orderkey=NULL,
l_partkey=NULL,
l_suppkey=NULL,
l_linenumber=NULL,
l_quantity=NULL,
l_extendedprice=NULL,
l_discount=NULL,
l_shipdate=NULL,
l_comment=NULL
where l_partkey % 2 = 0;
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
import duckdb
import os

PARQUET_SRC_FILE = os.getenv('PARQUET_SRC_FILE')

duckdb_con = duckdb.connect()
duckdb_con.execute("call dbgen(sf=1)")
duckdb_con.execute(f"copy lineitem to '{PARQUET_SRC_FILE}' (FORMAT PARQUET)")
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
CREATE or REPLACE TABLE default.lineitem_sf_01_1_delete
TBLPROPERTIES (
'format-version'='2',
'write.update.mode'='merge-on-read'
)
AS SELECT * FROM parquet_file_view;
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
delete from default.lineitem_sf_01_1_delete where l_orderkey=10053 and l_partkey = 77;
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
import duckdb
import os

PARQUET_SRC_FILE = os.getenv('PARQUET_SRC_FILE')

duckdb_con = duckdb.connect()
duckdb_con.execute("call dbgen(sf=0.01)")
duckdb_con.execute(f"copy lineitem to '{PARQUET_SRC_FILE}' (FORMAT PARQUET)")
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
CREATE or REPLACE TABLE default.lineitem_sf_01_no_deletes
TBLPROPERTIES (
'format-version'='2',
'write.update.mode'='merge-on-read'
)
AS SELECT * FROM parquet_file_view;
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
import duckdb
import os

PARQUET_SRC_FILE = os.getenv('PARQUET_SRC_FILE')

duckdb_con = duckdb.connect()
duckdb_con.execute("call dbgen(sf=0.01)")
duckdb_con.execute(f"copy lineitem to '{PARQUET_SRC_FILE}' (FORMAT PARQUET)")
Binary file modified scripts/data_generators/tmp_data/tmp.parquet
Binary file not shown.
Loading
Loading