From 9b584547c768fb09b2e33b4ad8797cf45c3b3b97 Mon Sep 17 00:00:00 2001 From: mwish Date: Wed, 7 Aug 2024 20:01:04 +0800 Subject: [PATCH] GH-43427: [C++][Parquet] Deprecate ColumnChunk::file_offset field and no longer write Metadata at end of Chunk (#43428) ### Rationale for this change See github issue ### What changes are included in this PR? Force `ColumnChunk::file_offset` tobe 0 ### Are these changes tested? No ### Are there any user-facing changes? Might affect user using legacy `ColumnChunk::file_offset` * GitHub Issue: #43427 Authored-by: mwish Signed-off-by: mwish --- c_glib/test/parquet/test-column-chunk-metadata.rb | 2 +- cpp/src/parquet/column_writer.cc | 5 ----- cpp/src/parquet/metadata.cc | 6 +++--- cpp/src/parquet/metadata.h | 7 ++++++- python/pyarrow/tests/parquet/test_metadata.py | 2 +- 5 files changed, 11 insertions(+), 11 deletions(-) diff --git a/c_glib/test/parquet/test-column-chunk-metadata.rb b/c_glib/test/parquet/test-column-chunk-metadata.rb index f0012f0124577..4612e5bf0cc59 100644 --- a/c_glib/test/parquet/test-column-chunk-metadata.rb +++ b/c_glib/test/parquet/test-column-chunk-metadata.rb @@ -77,7 +77,7 @@ def setup test("#file_offset") do assert do - @metadata.file_offset > 0 + @metadata.file_offset == 0 end end diff --git a/cpp/src/parquet/column_writer.cc b/cpp/src/parquet/column_writer.cc index 90e0102b422bb..f859ec9653f78 100644 --- a/cpp/src/parquet/column_writer.cc +++ b/cpp/src/parquet/column_writer.cc @@ -353,8 +353,6 @@ class SerializedPageWriter : public PageWriter { total_compressed_size_, total_uncompressed_size_, has_dictionary, fallback, dict_encoding_stats_, data_encoding_stats_, meta_encryptor_); - // Write metadata at end of column chunk - metadata_->WriteTo(sink_.get()); } /** @@ -667,9 +665,6 @@ class BufferedPageWriter : public PageWriter { has_dictionary, fallback, pager_->dict_encoding_stats_, pager_->data_encoding_stats_, pager_->meta_encryptor_); - // Write metadata at end of column chunk - metadata_->WriteTo(in_memory_sink_.get()); - // Buffered page writer needs to adjust page offsets. pager_->FinishPageIndexes(final_position); diff --git a/cpp/src/parquet/metadata.cc b/cpp/src/parquet/metadata.cc index fe16f5b76bd09..10c8afaf37507 100644 --- a/cpp/src/parquet/metadata.cc +++ b/cpp/src/parquet/metadata.cc @@ -1536,10 +1536,10 @@ class ColumnChunkMetaDataBuilder::ColumnChunkMetaDataBuilderImpl { const std::shared_ptr& encryptor) { if (dictionary_page_offset > 0) { column_chunk_->meta_data.__set_dictionary_page_offset(dictionary_page_offset); - column_chunk_->__set_file_offset(dictionary_page_offset + compressed_size); - } else { - column_chunk_->__set_file_offset(data_page_offset + compressed_size); } + // The `file_offset` field is deprecated and should be set to 0. + // See https://github.com/apache/parquet-format/pull/440 for detail. + column_chunk_->__set_file_offset(0); column_chunk_->__isset.meta_data = true; column_chunk_->meta_data.__set_num_values(num_values); if (index_page_offset >= 0) { diff --git a/cpp/src/parquet/metadata.h b/cpp/src/parquet/metadata.h index e02d2e7c852f0..e46297540ba6e 100644 --- a/cpp/src/parquet/metadata.h +++ b/cpp/src/parquet/metadata.h @@ -146,7 +146,12 @@ class PARQUET_EXPORT ColumnChunkMetaData { bool Equals(const ColumnChunkMetaData& other) const; - // column chunk + // Byte offset of `ColumnMetaData` in `file_path()`. + // + // Note that the meaning of this field has been inconsistent among implementations + // so its use has since been deprecated in the Parquet specification. Modern + // implementations will set this to `0` to indicate that the `ColumnMetaData` is solely + // contained in the `ColumnChunk` struct. int64_t file_offset() const; // parameter is only used when a dataset is spread across multiple files diff --git a/python/pyarrow/tests/parquet/test_metadata.py b/python/pyarrow/tests/parquet/test_metadata.py index 52ab59a961b3e..528cf0110dd95 100644 --- a/python/pyarrow/tests/parquet/test_metadata.py +++ b/python/pyarrow/tests/parquet/test_metadata.py @@ -122,7 +122,7 @@ def test_parquet_metadata_api(): col_meta = rg_meta.column(ncols + 2) col_meta = rg_meta.column(0) - assert col_meta.file_offset > 0 + assert col_meta.file_offset == 0 assert col_meta.file_path == '' # created from BytesIO assert col_meta.physical_type == 'BOOLEAN' assert col_meta.num_values == 10000