From 45389c4e48ff44bd924f3b02690b2c28d7dac320 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Hannes=20M=C3=BChleisen?= Date: Fri, 7 Oct 2022 11:10:02 +0200 Subject: [PATCH] Making sure parquet column readers return the expected amount of rows, fixes #4903 --- data/parquet-testing/bug4903.parquet | Bin 0 -> 4576 bytes extension/parquet/parquet_reader.cpp | 17 +++++++++-------- test/sql/copy/parquet/parquet_4903.test | 9 +++++++++ 3 files changed, 18 insertions(+), 8 deletions(-) create mode 100644 data/parquet-testing/bug4903.parquet create mode 100644 test/sql/copy/parquet/parquet_4903.test diff --git a/data/parquet-testing/bug4903.parquet b/data/parquet-testing/bug4903.parquet new file mode 100644 index 0000000000000000000000000000000000000000..2979df6993832e482683962e12f40573e777000f GIT binary patch literal 4576 zcmds5O=ufO6kcuDvTVt+Vtb}*5Umc?;({?P#D*GCQW{bdN)V;=P)gLbcdcx)UOBrf z6Pq@N5`xJghZJ(?p@-t;V1f><&n3C^5PZnNgj{k? z`FY>>erD#)ljq(nDAbtv+maHE=<6N<=y6>#mdhT@FH$$i8QG5H&07c~+ zW?>H65nvSBf+%5=_VWh{M3}s(ZBieAk&R@Ud;nfS??^Ug>UyS5fwTpXQjOV=fp(p~ zk~-hIv;{iJo9PsNNhcX3AAswqEa}uh2;k(6HQa5nJ`5FRseXmMle)(!I8VTIM~OZR zMgEd$G6C=+nh?4d00hsl&U_^E1ymCA%!*XovQylu7;FLDfakTC?m583uhe)HPm)zs z6*a@-5`%Z+@_!MFt=Vg_)UkurjdtPy&7$|D9iBT+mJ}YHx9B0}JuwbK_;YqrB2e^= z)ah5%CCzWOLS!gNt*Uz7@j~18Ot;}!aI9ct#B!Q;5c2-GV+En_c+E!8UR`yrRN;UU zIn6WIY|oN}luDu4^v4wlc5kF%ThZn0_-rql|KBrPcOA=EDS? 
z<-A}!=q1h-BUnR{&NC&B>x7$3Y>2uYMP%HxEQeed5KYSr&1k$t$<|HZTx-Mt^2DlZ zHUnZkmZVo&9>m^+H42c5vdTPM#;y~DNIgpO-*glV9H*Yxux;qF{1<^08m0JY`h*y_ z>{YYvh5?DW1`9aE;)%8C^7N~>ja(V_{(Ls4OviRSQsFM!!dH!4O|Ej-9VMzDI8w&- zOc@or6yk_bf*(};qY47GWC4>Lq1=D3^f7kc5ITF0A=UgPJ`X}QQrpmUz-k=hJ5Au| zF{Q%4XYhjz%+>V1l?*kk?+kH&e@H|ycG!6c_lF>cN7gmQgRHPiP7jo06<-8_8dNsD z#f`ByER@Gn2-m(h4BACSL!%+}i3`6-n;<3WP`VAg1+qW~p!8eRxH}?@ie((0PPlP* zM}SR~JT%f$pBT40T1HBDd-Y9hl&pqNfn;rg6-34%fV$gyCT2pEy#ki(DE1=u05H(4 z$9cX+DV%+hCRMPAKQtcaYuyXYxLp`T)RSa6$?XCsIT%PHCuPoXAQ*0q4KjlzagMz; z#EJTby2+ar z9#m>KgF6|cgKMHV7cF{2rIhanZ(>cES*)A><+dHpowLKrhV2JXSyrlFutv2qv)p!E it5W^M3Fkg-dCiYoYimxpWCj ParquetReader::CreateReaderRecursive(const FileMetaData if (!s_ele.__isset.type) { // inner node if (s_ele.num_children == 0) { - throw std::runtime_error("Node has no children but should"); + throw InvalidInputException("Node has no children but should"); } child_list_t child_types; vector> child_readers; @@ -575,7 +575,7 @@ uint64_t ParquetReader::GetGroupCompressedSize(ParquetReaderScanState &state) { if (total_compressed_size != 0 && calc_compressed_size != 0 && (idx_t)total_compressed_size != calc_compressed_size) { - throw std::runtime_error("mismatch between calculated compressed size and reported compressed size"); + throw InvalidInputException("mismatch between calculated compressed size and reported compressed size"); } return total_compressed_size ? 
total_compressed_size : calc_compressed_size; @@ -918,7 +918,7 @@ bool ParquetReader::ScanInternal(ParquetReaderScanState &state, DataChunk &resul double scan_percentage = (double)(to_scan_compressed_bytes) / total_row_group_span; if (to_scan_compressed_bytes > total_row_group_span) { - throw std::runtime_error( + throw InvalidInputException( "Malformed parquet file: sum of total compressed bytes of columns seems incorrect"); } @@ -1042,11 +1042,12 @@ bool ParquetReader::ScanInternal(ParquetReaderScanState &state, DataChunk &resul continue; } - // std::cout << "Reading nofilter for col " << - // root_reader->GetChildReader(file_col_idx)->Schema().name - //<< "\n"; - root_reader->GetChildReader(file_col_idx) - ->Read(result.size(), filter_mask, define_ptr, repeat_ptr, result.data[out_col_idx]); + auto rows_read = root_reader->GetChildReader(file_col_idx) + ->Read(result.size(), filter_mask, define_ptr, repeat_ptr, result.data[out_col_idx]); + if (rows_read != result.size()) { + throw InvalidInputException("Mismatch in parquet read for column %llu, expected %llu rows, got %llu", + file_col_idx, result.size(), rows_read); + } } } diff --git a/test/sql/copy/parquet/parquet_4903.test b/test/sql/copy/parquet/parquet_4903.test new file mode 100644 index 000000000000..778576be11ed --- /dev/null +++ b/test/sql/copy/parquet/parquet_4903.test @@ -0,0 +1,9 @@ +# name: test/sql/copy/parquet/parquet_4903.test +# description: Issue #4903: Parquet column readers must return the expected number of rows +# group: [parquet] + +require parquet + +# file is corrupt +statement error +SELECT type_param_constraints FROM 'data/parquet-testing/bug4903.parquet' limit 10