Making sure parquet column readers return the expected number of rows, fixes duckdb#4903
hannes committed Oct 7, 2022
1 parent 307477b commit 45389c4
Showing 3 changed files with 18 additions and 8 deletions.
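
In short, the change makes the Parquet scan verify that each column reader produced exactly as many rows as the scan requested for the current output vector, and raises an InvalidInputException on any mismatch; previously a short read from a corrupt file could go unnoticed. Below is a minimal, self-contained C++ sketch of that check. The names (ColumnChunkReader, ReadExactly) are hypothetical stand-ins, not DuckDB's actual API; the real change is in the extension/parquet/parquet_reader.cpp diff that follows.

// Sketch only: request a fixed number of values from a column reader and treat
// any shortfall as a malformed file instead of silently returning a short chunk.
#include <algorithm>
#include <cstdint>
#include <stdexcept>
#include <string>
#include <vector>

// Stand-in for a per-column reader; a real reader would decode Parquet pages.
struct ColumnChunkReader {
	std::vector<int64_t> values; // values remaining in this column chunk
	size_t offset = 0;

	// Returns how many values it could actually produce, which may be fewer
	// than requested if the chunk is truncated or its metadata is wrong.
	size_t Read(size_t requested, std::vector<int64_t> &out) {
		size_t available = values.size() - offset;
		size_t produced = std::min(requested, available);
		out.assign(values.begin() + offset, values.begin() + offset + produced);
		offset += produced;
		return produced;
	}
};

std::vector<int64_t> ReadExactly(ColumnChunkReader &reader, size_t expected_rows) {
	std::vector<int64_t> out;
	size_t rows_read = reader.Read(expected_rows, out);
	if (rows_read != expected_rows) {
		// Same idea as the new check in ParquetReader::ScanInternal: a short
		// read means the file is malformed, so fail loudly rather than continue.
		throw std::runtime_error("Mismatch in parquet read: expected " +
		                         std::to_string(expected_rows) + " rows, got " +
		                         std::to_string(rows_read));
	}
	return out;
}

The diff below applies the same pattern in place: the return value of ColumnReader::Read is captured into rows_read and compared against result.size().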
Binary file added data/parquet-testing/bug4903.parquet
17 changes: 9 additions & 8 deletions extension/parquet/parquet_reader.cpp
@@ -258,7 +258,7 @@ unique_ptr<ColumnReader> ParquetReader::CreateReaderRecursive(const FileMetaData

 	if (!s_ele.__isset.type) { // inner node
 		if (s_ele.num_children == 0) {
-			throw std::runtime_error("Node has no children but should");
+			throw InvalidInputException("Node has no children but should");
 		}
 		child_list_t<LogicalType> child_types;
 		vector<unique_ptr<ColumnReader>> child_readers;
@@ -575,7 +575,7 @@ uint64_t ParquetReader::GetGroupCompressedSize(ParquetReaderScanState &state) {

 	if (total_compressed_size != 0 && calc_compressed_size != 0 &&
 	    (idx_t)total_compressed_size != calc_compressed_size) {
-		throw std::runtime_error("mismatch between calculated compressed size and reported compressed size");
+		throw InvalidInputException("mismatch between calculated compressed size and reported compressed size");
 	}

 	return total_compressed_size ? total_compressed_size : calc_compressed_size;
@@ -918,7 +918,7 @@ bool ParquetReader::ScanInternal(ParquetReaderScanState &state, DataChunk &resul
 			double scan_percentage = (double)(to_scan_compressed_bytes) / total_row_group_span;

 			if (to_scan_compressed_bytes > total_row_group_span) {
-				throw std::runtime_error(
+				throw InvalidInputException(
 				    "Malformed parquet file: sum of total compressed bytes of columns seems incorrect");
 			}

@@ -1042,11 +1042,12 @@ bool ParquetReader::ScanInternal(ParquetReaderScanState &state, DataChunk &resul
 				continue;
 			}

-			// std::cout << "Reading nofilter for col " <<
-			// root_reader->GetChildReader(file_col_idx)->Schema().name
-			//<< "\n";
-			root_reader->GetChildReader(file_col_idx)
-			    ->Read(result.size(), filter_mask, define_ptr, repeat_ptr, result.data[out_col_idx]);
+			auto rows_read = root_reader->GetChildReader(file_col_idx)
+			                     ->Read(result.size(), filter_mask, define_ptr, repeat_ptr, result.data[out_col_idx]);
+			if (rows_read != result.size()) {
+				throw InvalidInputException("Mismatch in parquet read for column %llu, expected %llu rows, got %llu",
+				                            file_col_idx, result.size(), rows_read);
+			}
 		}
 	}

9 changes: 9 additions & 0 deletions test/sql/copy/parquet/parquet_4903.test
@@ -0,0 +1,9 @@
+# name: test/sql/copy/parquet/parquet_4903.test
+# description: Issue #4442: Parquet reader converts timestamp to i64 *sometimes*
+# group: [parquet]
+
+require parquet
+
+# file is corrupt
+statement error
+SELECT type_param_constraints FROM 'data/parquet-testing/bug4903.parquet' limit 10
