Skip to content

Commit

Permalink
fix partitioned arrow tables reading
Browse files (browse the repository at this point in the history)
  • Loading branch information
korowa committed Mar 3, 2024
1 parent 7be4bf8 commit 33814f9
Show file tree
Hide file tree
Showing 6 changed files with 77 additions and 4 deletions.
2 changes: 1 addition & 1 deletion datafusion/core/src/datasource/physical_plan/arrow_file.rs
Original file line number Diff line number Diff line change
Expand Up @@ -177,7 +177,7 @@ impl ExecutionPlan for ArrowExec {

let opener = ArrowOpener {
object_store,
projection: self.base_config.projection.clone(),
projection: self.base_config.file_column_projection_indices(),
};
let stream =
FileStream::new(&self.base_config, partition, opener, &self.metrics)?;
Expand Down
Binary file not shown.
Binary file not shown.
71 changes: 71 additions & 0 deletions datafusion/sqllogictest/test_files/arrow_files.slt
Original file line number Diff line number Diff line change
Expand Up @@ -42,3 +42,74 @@ SELECT * FROM arrow_simple
2 bar NULL
3 baz false
4 NULL true

# ARROW partitioned table
# Registers a directory of Arrow files as an external table partitioned by
# `part`. The EXPLAIN plan at the end of this section shows a
# `part=456/data.arrow` path, i.e. the partition value is encoded in the
# directory name, while f0/f1/f2 are the remaining declared columns.
statement ok
CREATE EXTERNAL TABLE arrow_partitioned (
part Int,
f0 Bigint,
f1 String,
f2 Boolean
)
STORED AS ARROW
LOCATION '../core/tests/data/partitioned_table_arrow/'
PARTITIONED BY (part);

# select wildcard
# Although `part` is declared first in the DDL, the wildcard returns it as
# the last column (f0, f1, f2, part) -- hence the ITBI type string.
query ITBI
SELECT * FROM arrow_partitioned ORDER BY f0;
----
1 foo true 123
2 bar false 123
3 baz true 456
4 NULL NULL 456

# select all fields
# An explicit column list controls the output order, so `part` comes first here.
query IITB
SELECT part, f0, f1, f2 FROM arrow_partitioned ORDER BY f0;
----
123 1 foo true
123 2 bar false
456 3 baz true
456 4 NULL NULL

# select without partition column
# Projects a non-contiguous subset (f0, f2) of the non-partition columns.
query IB
SELECT f0, f2 FROM arrow_partitioned ORDER BY f0
----
1 true
2 false
3 true
4 NULL

# select only partition column
# No non-partition column is projected; one `part` value is expected per row.
query I
SELECT part FROM arrow_partitioned ORDER BY part
----
123
123
456
456

# select without any table-related columns in projection
# The projection references no table column at all; the constant must still
# be produced once per row (4 rows in total).
query I
SELECT 1 FROM arrow_partitioned
----
1
1
1
1

# select with partition filter
# Only the rows whose partition value is 123 are expected (f0 = 1 and 2).
query I
SELECT f0 FROM arrow_partitioned WHERE part = 123 ORDER BY f0
----
1
2

# select with partition filter should scan only one directory
# The physical plan must list a single file group containing only the
# part=456 file, with `part` absent from the scan projection.
query TT
EXPLAIN SELECT f0 FROM arrow_partitioned WHERE part = 456
----
logical_plan TableScan: arrow_partitioned projection=[f0], full_filters=[arrow_partitioned.part = Int32(456)]
physical_plan ArrowExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/core/tests/data/partitioned_table_arrow/part=456/data.arrow]]}, projection=[f0]
6 changes: 4 additions & 2 deletions datafusion/sqllogictest/test_files/insert_to_external.slt
Original file line number Diff line number Diff line change
Expand Up @@ -99,9 +99,11 @@ select * from dictionary_encoded_arrow_test_readback;
----
b

# https://github.com/apache/arrow-datafusion/issues/7816
query error DataFusion error: Arrow error: Schema error: project index 1 out of bounds, max field 1
query TT
select * from dictionary_encoded_arrow_partitioned order by (a);
----
a foo
b bar


# test_insert_into
Expand Down
2 changes: 1 addition & 1 deletion datafusion/sqllogictest/test_files/json.slt
Original file line number Diff line number Diff line change
Expand Up @@ -117,7 +117,7 @@ SELECT part FROM json_partitioned_test ORDER BY part
2
2

# select without any table-relates columns in projection
# select without any table-related columns in projection
query T
SELECT 'x' FROM json_partitioned_test
----
Expand Down

0 comments on commit 33814f9

Please sign in to comment.