diff --git a/cpp/src/parquet/column_chunker_test.cc b/cpp/src/parquet/column_chunker_test.cc index c4332c882d402..a1682ec1027c7 100644 --- a/cpp/src/parquet/column_chunker_test.cc +++ b/cpp/src/parquet/column_chunker_test.cc @@ -7,8 +7,18 @@ // with the License. You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. #include +#include +#include +#include #include "arrow/array.h" #include "arrow/array/builder_binary.h" @@ -68,14 +78,20 @@ std::shared_ptr ConcatAndCombine( Result> WriteTableToBuffer(const std::shared_ptr
& table, uint64_t min_chunk_size, uint64_t max_chunk_size, + bool enable_dictionary = false, + int64_t row_group_size = 1024 * 1024) { auto sink = CreateOutputStream(); - auto write_props = WriterProperties::Builder() - .disable_dictionary() - ->enable_cdc() - ->cdc_size_range(min_chunk_size, max_chunk_size) - ->build(); + auto builder = WriterProperties::Builder(); + // enable content defined chunking + builder.enable_cdc()->cdc_size_range(min_chunk_size, max_chunk_size); + if (enable_dictionary) { + builder.enable_dictionary(); + } else { + builder.disable_dictionary(); + } + auto write_props = builder.build(); auto arrow_props = default_arrow_writer_properties(); RETURN_NOT_OK(WriteTable(*table, default_memory_pool(), sink, row_group_size, write_props, arrow_props)); @@ -118,9 +134,12 @@ std::vector GetColumnPageLengths(const std::shared_ptr& data, Result> WriteAndGetPageLengths(const std::shared_ptr
& table, uint64_t min_chunk_size, uint64_t max_chunk_size, + + bool enable_dictionary = false, int column_index = 0) { - ARROW_ASSIGN_OR_RAISE(auto buffer, - WriteTableToBuffer(table, min_chunk_size, max_chunk_size)); + ARROW_ASSIGN_OR_RAISE( + auto buffer, + WriteTableToBuffer(table, min_chunk_size, max_chunk_size, enable_dictionary)); ARROW_ASSIGN_OR_RAISE(auto readback, ReadTableFromBuffer(buffer)); RETURN_NOT_OK(readback->ValidateFull()); @@ -129,53 +148,213 @@ Result> WriteAndGetPageLengths(const std::shared_ptr
& values, uint64_t min, uint64_t max) { +void AssertAllBetween(const std::vector& values, uint64_t min, uint64_t max, + bool expect_dictionary_fallback = false) { // expect the last chunk since it is not guaranteed to be within the range - for (size_t i = 0; i < values.size() - 1; i++) { - ASSERT_GE(values[i], min); - ASSERT_LE(values[i], max); + if (expect_dictionary_fallback) { + // if dictionary encoding is enabled, the writer can fallback to plain + // encoding splitting within a content defined chunk, so we can't + // guarantee that all chunks are within the range in this case, but we + // know that there can be at most 2 pages smaller than the min_chunk_size + size_t smaller_count = 0; + for (size_t i = 0; i < values.size() - 1; i++) { + if (values[i] < min) { + smaller_count++; + } else { + ASSERT_LE(values[i], max); + } + } + ASSERT_LE(smaller_count, 2); + } else { + for (size_t i = 0; i < values.size() - 1; i++) { + ASSERT_GE(values[i], min); + ASSERT_LE(values[i], max); + } } ASSERT_LE(values.back(), max); } +std::vector, std::vector>> FindDifferences( + const std::vector& first, const std::vector& second) { + auto n = first.size(), m = second.size(); + + // Build DP table for LCS. + std::vector> dp(n + 1, std::vector(m + 1, 0)); + for (size_t i = 0; i < n; ++i) { + for (size_t j = 0; j < m; ++j) { + dp[i + 1][j + 1] = + (first[i] == second[j]) ? dp[i][j] + 1 : std::max(dp[i + 1][j], dp[i][j + 1]); + } + } + + // Backtrack to recover LCS indices. + std::vector> common; + for (auto i = n, j = m; i > 0 && j > 0;) { + if (first[i - 1] == second[j - 1]) { + common.emplace_back(i - 1, j - 1); + --i, --j; + } else if (dp[i - 1][j] >= dp[i][j - 1]) { + --i; + } else { + --j; + } + } + std::reverse(common.begin(), common.end()); + + // Extract differences using the common indices as anchors. + std::vector, std::vector>> result; + size_t last_i = 0, last_j = 0; + for (auto [ci, cj] : common) { + std::vector diff1(first.begin() + last_i, first.begin() + ci); + std::vector diff2(second.begin() + last_j, second.begin() + cj); + if (!diff1.empty() || !diff2.empty()) { + result.emplace_back(std::move(diff1), std::move(diff2)); + } + last_i = ci + 1; + last_j = cj + 1; + } + // Add any remaining elements after the last common index. + std::vector diff1(first.begin() + last_i, first.end()); + std::vector diff2(second.begin() + last_j, second.end()); + if (!diff1.empty() || !diff2.empty()) { + result.emplace_back(std::move(diff1), std::move(diff2)); + } + + return result; +} + +TEST(TestFindDifferences, Basic) { + std::vector first = {1, 2, 3, 4, 5}; + std::vector second = {1, 7, 8, 4, 5}; + + auto diffs = FindDifferences(first, second); + + ASSERT_EQ(diffs.size(), 1); + ASSERT_EQ(diffs[0].first, std::vector({2, 3})); + ASSERT_EQ(diffs[0].second, std::vector({7, 8})); +} + +TEST(TestFindDifferences, MultipleDifferences) { + std::vector first = {1, 2, 3, 4, 5, 6, 7}; + std::vector second = {1, 8, 9, 4, 10, 6, 11}; + auto diffs = FindDifferences(first, second); + + ASSERT_EQ(diffs.size(), 3); + + ASSERT_EQ(diffs[0].first, std::vector({2, 3})); + ASSERT_EQ(diffs[0].second, std::vector({8, 9})); + + ASSERT_EQ(diffs[1].first, std::vector({5})); + ASSERT_EQ(diffs[1].second, std::vector({10})); + + ASSERT_EQ(diffs[2].first, std::vector({7})); + ASSERT_EQ(diffs[2].second, std::vector({11})); +} + +TEST(TestFindDifferences, DifferentLengths) { + std::vector first = {1, 2, 3}; + std::vector second = {1, 2, 3, 4, 5}; + auto diffs = FindDifferences(first, second); + + ASSERT_EQ(diffs.size(), 1); + ASSERT_TRUE(diffs[0].first.empty()); + ASSERT_EQ(diffs[0].second, std::vector({4, 5})); +} + +TEST(TestFindDifferences, EmptyArrays) { + std::vector first = {}; + std::vector second = {}; + auto diffs = FindDifferences(first, second); + ASSERT_TRUE(diffs.empty()); +} + +TEST(TestFindDifferences, LongSequenceWithSingleDifference) { + std::vector first = { + 1994, 2193, 2700, 1913, 2052, + }; + std::vector second = {2048, 43, 2080, 2700, 1913, 2052}; + auto diffs = FindDifferences(first, second); + + ASSERT_EQ(diffs.size(), 1); + ASSERT_EQ(diffs[0].first, std::vector({1994, 2193})); + ASSERT_EQ(diffs[0].second, std::vector({2048, 43, 2080})); + + // Verify that elements after the difference are identical + for (size_t i = 3; i < second.size(); i++) { + ASSERT_EQ(first[i - 1], second[i]); + } +} + +TEST(TestFindDifferences, LongSequenceWithMiddleChanges) { + std::vector first = {2169, 1976, 2180, 2147, 1934, 1772, + 1914, 2075, 2154, 1940, 1934, 1970}; + std::vector second = {2169, 1976, 2180, 2147, 2265, 1804, + 1717, 1925, 2122, 1940, 1934, 1970}; + auto diffs = FindDifferences(first, second); + + ASSERT_EQ(diffs.size(), 1); + ASSERT_EQ(diffs[0].first, std::vector({1934, 1772, 1914, 2075, 2154})); + ASSERT_EQ(diffs[0].second, std::vector({2265, 1804, 1717, 1925, 2122})); + + // Verify elements before and after the difference are identical + for (size_t i = 0; i < 4; i++) { + ASSERT_EQ(first[i], second[i]); + } + for (size_t i = 9; i < first.size(); i++) { + ASSERT_EQ(first[i], second[i]); + } +} + void AssertUpdateCase(const std::vector& original, - const std::vector& modified) { - ASSERT_EQ(original.size(), modified.size()); - for (size_t i = 0; i < original.size(); i++) { - ASSERT_EQ(original[i], modified[i]); + const std::vector& modified, uint8_t n_modifications) { + auto diffs = FindDifferences(original, modified); + ASSERT_LE(diffs.size(), n_modifications); + + for (const auto& diff : diffs) { + uint64_t left_sum = 0, right_sum = 0; + for (const auto& val : diff.first) left_sum += val; + for (const auto& val : diff.second) right_sum += val; + ASSERT_EQ(left_sum, right_sum); + ASSERT_LE(diff.first.size(), 2); + ASSERT_LE(diff.second.size(), 2); + } + + if (diffs.size() == 0) { + // no differences found, the arrays are equal + ASSERT_TRUE(original == modified); } } void AssertDeleteCase(const std::vector& original, - const std::vector& modified, - uint8_t n_modifications = 1) { - ASSERT_EQ(original.size(), modified.size()); - size_t smaller_count = 0; - for (size_t i = 0; i < original.size(); i++) { - if (modified[i] < original[i]) { - smaller_count++; - ASSERT_LT(modified[i], original[i]); - } else { - ASSERT_EQ(modified[i], original[i]); - } + const std::vector& modified, uint8_t n_modifications, + uint64_t edit_length) { + auto diffs = FindDifferences(original, modified); + ASSERT_EQ(diffs.size(), n_modifications); + + for (const auto& diff : diffs) { + uint64_t left_sum = 0, right_sum = 0; + for (const auto& val : diff.first) left_sum += val; + for (const auto& val : diff.second) right_sum += val; + ASSERT_EQ(left_sum, right_sum + edit_length); + ASSERT_LE(diff.first.size(), 2); + ASSERT_LE(diff.second.size(), 2); } - ASSERT_EQ(smaller_count, n_modifications); } void AssertInsertCase(const std::vector& original, - const std::vector& modified, - uint8_t n_modifications = 1) { - ASSERT_EQ(original.size(), modified.size()); - size_t larger_count = 0; - for (size_t i = 0; i < original.size(); i++) { - if (modified[i] > original[i]) { - larger_count++; - ASSERT_GT(modified[i], original[i]); - } else { - ASSERT_EQ(modified[i], original[i]); - } + const std::vector& modified, uint8_t n_modifications, + uint64_t edit_length) { + auto diffs = FindDifferences(original, modified); + ASSERT_EQ(diffs.size(), n_modifications); + + for (const auto& diff : diffs) { + uint64_t left_sum = 0, right_sum = 0; + for (const auto& val : diff.first) left_sum += val; + for (const auto& val : diff.second) right_sum += val; + ASSERT_EQ(left_sum + edit_length, right_sum); + ASSERT_LE(diff.first.size(), 2); + ASSERT_LE(diff.second.size(), 2); } - ASSERT_EQ(larger_count, n_modifications); } void AssertAppendCase(const std::vector& original, @@ -189,229 +368,269 @@ void AssertAppendCase(const std::vector& original, uint64_t ElementCount(uint64_t size, int32_t byte_width, bool nullable) { if (nullable) { + // in case of nullable types the def_levels are also fed through the chunker + // to identify changes in the null bitmap, this will increase the byte width + // and decrease the number of elements per chunk byte_width += 2; } return size / byte_width; } -constexpr uint64_t kMinChunkSize = 128 * 1024; -constexpr uint64_t kMaxChunkSize = 256 * 1024; +constexpr uint64_t kMinChunkSize = 32 * 1024; +constexpr uint64_t kMaxChunkSize = 128 * 1024; +constexpr uint64_t kPartLength = 128 * 1024; +constexpr uint64_t kEditLength = 32; // TODO: -// - test nullable types // - test nested types -// - test dictionary encoding // - test multiple row groups -class TestColumnChunker : public ::testing::TestWithParam< - std::tuple, bool>> {}; +class TestContentDefinedChunker + : public ::testing::TestWithParam< + std::tuple, bool, bool>> {}; -TEST_P(TestColumnChunker, DeleteOnce) { +TEST_P(TestContentDefinedChunker, DeleteOnce) { auto dtype = std::get<0>(GetParam()); auto nullable = std::get<1>(GetParam()); + auto enable_dictionary = std::get<2>(GetParam()); auto field = ::arrow::field("f0", dtype, nullable); - auto part1 = GenerateTable({field}, 128 * 1024); - auto part2 = GenerateTable({field}, 32, /*seed=*/1); - auto part3 = GenerateTable({field}, 128 * 1024); + auto part1 = GenerateTable({field}, kPartLength, /*seed=*/1); + auto part2 = GenerateTable({field}, kEditLength, /*seed=*/2); + auto part3 = GenerateTable({field}, kPartLength, /*seed=*/3); auto base = ConcatAndCombine({part1, part2, part3}); auto modified = ConcatAndCombine({part1, part3}); + ASSERT_FALSE(base->Equals(*modified)); auto min_length = ElementCount(kMinChunkSize, dtype->byte_width(), nullable); auto max_length = ElementCount(kMaxChunkSize, dtype->byte_width(), nullable); ASSERT_OK_AND_ASSIGN(auto base_lengths, - WriteAndGetPageLengths(base, kMinChunkSize, kMaxChunkSize)); + WriteAndGetPageLengths(base, kMinChunkSize, kMaxChunkSize, + /*enable_dictionary=*/enable_dictionary)); ASSERT_OK_AND_ASSIGN(auto modified_lengths, - WriteAndGetPageLengths(modified, kMinChunkSize, kMaxChunkSize)); + WriteAndGetPageLengths(modified, kMinChunkSize, kMaxChunkSize, + /*enable_dictionary=*/enable_dictionary)); + + AssertAllBetween(base_lengths, min_length, max_length, + /*expect_dictionary_fallback=*/enable_dictionary); + AssertAllBetween(modified_lengths, min_length, max_length, + /*expect_dictionary_fallback=*/enable_dictionary); - AssertAllBetween(base_lengths, min_length, max_length); - AssertAllBetween(modified_lengths, min_length, max_length); - AssertDeleteCase(base_lengths, modified_lengths, 1); + AssertDeleteCase(base_lengths, modified_lengths, 1, kEditLength); } -TEST_P(TestColumnChunker, DeleteTwice) { +TEST_P(TestContentDefinedChunker, DeleteTwice) { auto dtype = std::get<0>(GetParam()); auto nullable = std::get<1>(GetParam()); + auto enable_dictionary = std::get<2>(GetParam()); auto field = ::arrow::field("f0", dtype, nullable); - auto part1 = GenerateTable({field}, 128 * 1024); - auto part2 = GenerateTable({field}, 32, /*seed=*/1); - auto part3 = GenerateTable({field}, 128 * 1024); - auto part4 = GenerateTable({field}, 32, /*seed=*/2); - auto part5 = GenerateTable({field}, 128 * 1024); + auto part1 = GenerateTable({field}, kPartLength, /*seed=*/1); + auto part2 = GenerateTable({field}, kEditLength, /*seed=*/2); + auto part3 = GenerateTable({field}, kPartLength, /*seed=*/3); + auto part4 = GenerateTable({field}, kEditLength, /*seed=*/4); + auto part5 = GenerateTable({field}, kPartLength, /*seed=*/5); auto base = ConcatAndCombine({part1, part2, part3, part4, part5}); auto modified = ConcatAndCombine({part1, part3, part5}); + ASSERT_FALSE(base->Equals(*modified)); auto min_length = ElementCount(kMinChunkSize, dtype->byte_width(), nullable); auto max_length = ElementCount(kMaxChunkSize, dtype->byte_width(), nullable); ASSERT_OK_AND_ASSIGN(auto base_lengths, - WriteAndGetPageLengths(base, kMinChunkSize, kMaxChunkSize)); + WriteAndGetPageLengths(base, kMinChunkSize, kMaxChunkSize, + /*enable_dictionary=*/enable_dictionary)); ASSERT_OK_AND_ASSIGN(auto modified_lengths, - WriteAndGetPageLengths(modified, kMinChunkSize, kMaxChunkSize)); - - AssertAllBetween(base_lengths, min_length, max_length); - AssertAllBetween(modified_lengths, min_length, max_length); - AssertDeleteCase(base_lengths, modified_lengths, 2); + WriteAndGetPageLengths(modified, kMinChunkSize, kMaxChunkSize, + /*enable_dictionary=*/enable_dictionary)); + + AssertAllBetween(base_lengths, min_length, max_length, + /*expect_dictionary_fallback=*/enable_dictionary); + AssertAllBetween(modified_lengths, min_length, max_length, + /*expect_dictionary_fallback=*/enable_dictionary); + AssertDeleteCase(base_lengths, modified_lengths, 2, kEditLength); } -TEST_P(TestColumnChunker, UpdateOnce) { +TEST_P(TestContentDefinedChunker, UpdateOnce) { auto dtype = std::get<0>(GetParam()); auto nullable = std::get<1>(GetParam()); + auto enable_dictionary = std::get<2>(GetParam()); auto field = ::arrow::field("f0", dtype, nullable); - auto part1 = GenerateTable({field}, 128 * 1024); - auto part2 = GenerateTable({field}, 32, /*seed=*/1); - auto part3 = GenerateTable({field}, 128 * 1024); - auto part4 = GenerateTable({field}, 32, /*seed=*/2); + auto part1 = GenerateTable({field}, kPartLength, /*seed=*/1); + auto part2 = GenerateTable({field}, kEditLength, /*seed=*/2); + auto part3 = GenerateTable({field}, kPartLength, /*seed=*/3); + auto part4 = GenerateTable({field}, kEditLength, /*seed=*/4); auto base = ConcatAndCombine({part1, part2, part3}); auto modified = ConcatAndCombine({part1, part4, part3}); + ASSERT_FALSE(base->Equals(*modified)); auto min_length = ElementCount(kMinChunkSize, dtype->byte_width(), nullable); auto max_length = ElementCount(kMaxChunkSize, dtype->byte_width(), nullable); ASSERT_OK_AND_ASSIGN(auto base_lengths, - WriteAndGetPageLengths(base, kMinChunkSize, kMaxChunkSize)); + WriteAndGetPageLengths(base, kMinChunkSize, kMaxChunkSize, + /*enable_dictionary=*/enable_dictionary)); ASSERT_OK_AND_ASSIGN(auto modified_lengths, - WriteAndGetPageLengths(modified, kMinChunkSize, kMaxChunkSize)); - - AssertAllBetween(base_lengths, min_length, max_length); - AssertAllBetween(modified_lengths, min_length, max_length); - AssertUpdateCase(base_lengths, modified_lengths); + WriteAndGetPageLengths(modified, kMinChunkSize, kMaxChunkSize, + /*enable_dictionary=*/enable_dictionary)); + + AssertAllBetween(base_lengths, min_length, max_length, + /*expect_dictionary_fallback=*/enable_dictionary); + AssertAllBetween(modified_lengths, min_length, max_length, + /*expect_dictionary_fallback=*/enable_dictionary); + AssertUpdateCase(base_lengths, modified_lengths, 1); } -TEST_P(TestColumnChunker, UpdateTwice) { +TEST_P(TestContentDefinedChunker, UpdateTwice) { auto dtype = std::get<0>(GetParam()); auto nullable = std::get<1>(GetParam()); + auto enable_dictionary = std::get<2>(GetParam()); auto field = ::arrow::field("f0", dtype, nullable); - auto part1 = GenerateTable({field}, 128 * 1024); - auto part2 = GenerateTable({field}, 32, /*seed=*/1); - auto part3 = GenerateTable({field}, 128 * 1024); - auto part4 = GenerateTable({field}, 32, /*seed=*/2); - auto part5 = GenerateTable({field}, 128 * 1024); - auto part6 = GenerateTable({field}, 32, /*seed=*/3); - auto part7 = GenerateTable({field}, 32, /*seed=*/4); + auto part1 = GenerateTable({field}, kPartLength, /*seed=*/1); + auto part2 = GenerateTable({field}, kEditLength, /*seed=*/2); + auto part3 = GenerateTable({field}, kPartLength, /*seed=*/3); + auto part4 = GenerateTable({field}, kEditLength, /*seed=*/4); + auto part5 = GenerateTable({field}, kPartLength, /*seed=*/5); + auto part6 = GenerateTable({field}, kEditLength, /*seed=*/6); + auto part7 = GenerateTable({field}, kEditLength, /*seed=*/7); auto base = ConcatAndCombine({part1, part2, part3, part4, part5}); auto modified = ConcatAndCombine({part1, part6, part3, part7, part5}); + ASSERT_FALSE(base->Equals(*modified)); auto min_length = ElementCount(kMinChunkSize, dtype->byte_width(), nullable); auto max_length = ElementCount(kMaxChunkSize, dtype->byte_width(), nullable); ASSERT_OK_AND_ASSIGN(auto base_lengths, - WriteAndGetPageLengths(base, kMinChunkSize, kMaxChunkSize)); + WriteAndGetPageLengths(base, kMinChunkSize, kMaxChunkSize, + /*enable_dictionary=*/enable_dictionary)); ASSERT_OK_AND_ASSIGN(auto modified_lengths, - WriteAndGetPageLengths(modified, kMinChunkSize, kMaxChunkSize)); - - AssertAllBetween(base_lengths, min_length, max_length); - AssertAllBetween(modified_lengths, min_length, max_length); - AssertUpdateCase(base_lengths, modified_lengths); + WriteAndGetPageLengths(modified, kMinChunkSize, kMaxChunkSize, + /*enable_dictionary=*/enable_dictionary)); + + AssertAllBetween(base_lengths, min_length, max_length, + /*expect_dictionary_fallback=*/enable_dictionary); + AssertAllBetween(modified_lengths, min_length, max_length, + /*expect_dictionary_fallback=*/enable_dictionary); + AssertUpdateCase(base_lengths, modified_lengths, 2); } -TEST_P(TestColumnChunker, InsertOnce) { +TEST_P(TestContentDefinedChunker, InsertOnce) { auto dtype = std::get<0>(GetParam()); auto nullable = std::get<1>(GetParam()); + auto enable_dictionary = std::get<2>(GetParam()); auto field = ::arrow::field("f0", dtype, nullable); - auto part1 = GenerateTable({field}, 128 * 1024); - auto part2 = GenerateTable({field}, 32, /*seed=*/1); - auto part3 = GenerateTable({field}, 128 * 1024); - auto part4 = GenerateTable({field}, 64); + auto part1 = GenerateTable({field}, kPartLength, /*seed=*/1); + auto part2 = GenerateTable({field}, kEditLength, /*seed=*/2); + auto part3 = GenerateTable({field}, kPartLength, /*seed=*/3); - auto base = ConcatAndCombine({part1, part2, part3}); - auto modified = ConcatAndCombine({part1, part2, part4, part3}); + auto base = ConcatAndCombine({part1, part3}); + auto modified = ConcatAndCombine({part1, part2, part3}); + ASSERT_FALSE(base->Equals(*modified)); auto min_length = ElementCount(kMinChunkSize, dtype->byte_width(), nullable); auto max_length = ElementCount(kMaxChunkSize, dtype->byte_width(), nullable); ASSERT_OK_AND_ASSIGN(auto base_lengths, - WriteAndGetPageLengths(base, kMinChunkSize, kMaxChunkSize)); + WriteAndGetPageLengths(base, kMinChunkSize, kMaxChunkSize, + /*enable_dictionary=*/enable_dictionary)); ASSERT_OK_AND_ASSIGN(auto modified_lengths, - WriteAndGetPageLengths(modified, kMinChunkSize, kMaxChunkSize)); - - AssertAllBetween(base_lengths, min_length, max_length); - AssertAllBetween(modified_lengths, min_length, max_length); - AssertInsertCase(base_lengths, modified_lengths, 1); + WriteAndGetPageLengths(modified, kMinChunkSize, kMaxChunkSize, + /*enable_dictionary=*/enable_dictionary)); + + AssertAllBetween(base_lengths, min_length, max_length, + /*expect_dictionary_fallback=*/enable_dictionary); + AssertAllBetween(modified_lengths, min_length, max_length, + /*expect_dictionary_fallback=*/enable_dictionary); + AssertInsertCase(base_lengths, modified_lengths, 1, kEditLength); } -TEST_P(TestColumnChunker, InsertTwice) { +TEST_P(TestContentDefinedChunker, InsertTwice) { auto dtype = std::get<0>(GetParam()); auto nullable = std::get<1>(GetParam()); + auto enable_dictionary = std::get<2>(GetParam()); + enable_dictionary = false; auto field = ::arrow::field("f0", dtype, nullable); - auto part1 = GenerateTable({field}, 128 * 1024); - auto part2 = GenerateTable({field}, 32, /*seed=*/1); - auto part3 = GenerateTable({field}, 128 * 1024); - auto part4 = GenerateTable({field}, 32, /*seed=*/2); - auto part5 = GenerateTable({field}, 128 * 1024); - auto part6 = GenerateTable({field}, 64); - auto part7 = GenerateTable({field}, 64); + auto part1 = GenerateTable({field}, kPartLength, /*seed=*/1); + auto part2 = GenerateTable({field}, kEditLength, /*seed=*/2); + auto part3 = GenerateTable({field}, kPartLength, /*seed=*/3); + auto part4 = GenerateTable({field}, kEditLength, /*seed=*/4); + auto part5 = GenerateTable({field}, kPartLength, /*seed=*/5); - auto base = ConcatAndCombine({part1, part2, part3, part4, part5}); - auto modified = ConcatAndCombine({part1, part2, part6, part3, part4, part7, part5}); + auto base = ConcatAndCombine({part1, part3, part5}); + auto modified = ConcatAndCombine({part1, part2, part3, part4, part5}); + ASSERT_FALSE(base->Equals(*modified)); auto min_length = ElementCount(kMinChunkSize, dtype->byte_width(), nullable); auto max_length = ElementCount(kMaxChunkSize, dtype->byte_width(), nullable); ASSERT_OK_AND_ASSIGN(auto base_lengths, - WriteAndGetPageLengths(base, kMinChunkSize, kMaxChunkSize)); + WriteAndGetPageLengths(base, kMinChunkSize, kMaxChunkSize, + /*enable_dictionary=*/enable_dictionary)); ASSERT_OK_AND_ASSIGN(auto modified_lengths, - WriteAndGetPageLengths(modified, kMinChunkSize, kMaxChunkSize)); - - AssertAllBetween(base_lengths, min_length, max_length); - AssertAllBetween(modified_lengths, min_length, max_length); - AssertInsertCase(base_lengths, modified_lengths, 2); + WriteAndGetPageLengths(modified, kMinChunkSize, kMaxChunkSize, + /*enable_dictionary=*/enable_dictionary)); + + AssertAllBetween(base_lengths, min_length, max_length, + /*expect_dictionary_fallback=*/enable_dictionary); + AssertAllBetween(modified_lengths, min_length, max_length, + /*expect_dictionary_fallback=*/enable_dictionary); + AssertInsertCase(base_lengths, modified_lengths, 2, kEditLength); } -TEST_P(TestColumnChunker, Append) { +TEST_P(TestContentDefinedChunker, Append) { auto dtype = std::get<0>(GetParam()); auto nullable = std::get<1>(GetParam()); + auto enable_dictionary = std::get<2>(GetParam()); auto field = ::arrow::field("f0", dtype, nullable); - auto part1 = GenerateTable({field}, 128 * 1024); - auto part2 = GenerateTable({field}, 32, /*seed=*/1); - auto part3 = GenerateTable({field}, 128 * 1024); - auto part4 = GenerateTable({field}, 32 * 1024); + auto part1 = GenerateTable({field}, kPartLength, /*seed=*/1); + auto part2 = GenerateTable({field}, kEditLength, /*seed=*/2); + auto part3 = GenerateTable({field}, kPartLength, /*seed=*/3); + auto part4 = GenerateTable({field}, kEditLength, /*seed=*/4); auto base = ConcatAndCombine({part1, part2, part3}); auto modified = ConcatAndCombine({part1, part2, part3, part4}); + ASSERT_FALSE(base->Equals(*modified)); auto min_length = ElementCount(kMinChunkSize, dtype->byte_width(), nullable); auto max_length = ElementCount(kMaxChunkSize, dtype->byte_width(), nullable); ASSERT_OK_AND_ASSIGN(auto base_lengths, - WriteAndGetPageLengths(base, kMinChunkSize, kMaxChunkSize)); + WriteAndGetPageLengths(base, kMinChunkSize, kMaxChunkSize, + /*enable_dictionary=*/enable_dictionary)); ASSERT_OK_AND_ASSIGN(auto modified_lengths, - WriteAndGetPageLengths(modified, kMinChunkSize, kMaxChunkSize)); + WriteAndGetPageLengths(modified, kMinChunkSize, kMaxChunkSize, + /*enable_dictionary=*/enable_dictionary)); - AssertAllBetween(base_lengths, min_length, max_length); - AssertAllBetween(modified_lengths, min_length, max_length); + AssertAllBetween(base_lengths, min_length, max_length, + /*expect_dictionary_fallback=*/enable_dictionary); + AssertAllBetween(modified_lengths, min_length, max_length, + /*expect_dictionary_fallback=*/enable_dictionary); AssertAppendCase(base_lengths, modified_lengths); } INSTANTIATE_TEST_SUITE_P( - TypeRoundtrip, TestColumnChunker, + FixedSizedTypes, TestContentDefinedChunker, Combine(Values(::arrow::uint8(), ::arrow::uint16(), ::arrow::uint32(), ::arrow::uint64(), ::arrow::int8(), ::arrow::int16(), ::arrow::int32(), ::arrow::int64(), ::arrow::float16(), ::arrow::float32(), ::arrow::float64()), - Bool())); + Bool(), Bool())); } // namespace parquet - -// - check that the state is maintained across rowgroups, so the edits should be -// consistent -// - check that the edits are consistent between writes -// - some smoke testing like approach would be nice to test several arrow types