From 3da7fa883cf6ecdb3a31e538fca2588c5465f64b Mon Sep 17 00:00:00 2001 From: Chongfeng Hu Date: Fri, 10 Jan 2025 11:43:37 -0800 Subject: [PATCH 1/2] Add new Tablet API to expose optionalSections metadata (#125) Summary: This new API will unlock use cases such as enumerating all optional sections in the Nimble file. It will also empower `nimble_dump` to be able to provide relevant information about the optional sections. Reviewed By: helfman Differential Revision: D67949242 --- dwio/nimble/tablet/TabletReader.cpp | 19 ++++++------ dwio/nimble/tablet/TabletReader.h | 37 +++++++++++++++++++++--- dwio/nimble/tablet/tests/TabletTests.cpp | 19 ++++++++++++ 3 files changed, 62 insertions(+), 13 deletions(-) diff --git a/dwio/nimble/tablet/TabletReader.cpp b/dwio/nimble/tablet/TabletReader.cpp index aa65878..1b52e46 100644 --- a/dwio/nimble/tablet/TabletReader.cpp +++ b/dwio/nimble/tablet/TabletReader.cpp @@ -18,6 +18,7 @@ #include "dwio/nimble/common/Buffer.h" #include "dwio/nimble/common/EncodingPrimitives.h" #include "dwio/nimble/common/Exceptions.h" +#include "dwio/nimble/common/Types.h" #include "dwio/nimble/tablet/Compression.h" #include "dwio/nimble/tablet/Constants.h" #include "dwio/nimble/tablet/FooterGenerated.h" @@ -383,11 +384,11 @@ TabletReader::TabletReader( for (auto i = 0; i < optionalSections->names()->size(); ++i) { optionalSections_.insert(std::make_pair( optionalSections->names()->GetAsString(i)->str(), - std::make_tuple( + MetadataSection{ optionalSections->offsets()->Get(i), optionalSections->sizes()->Get(i), static_cast( - optionalSections->compression_types()->Get(i))))); + optionalSections->compression_types()->Get(i))})); } } @@ -399,9 +400,9 @@ TabletReader::TabletReader( continue; } - const auto sectionOffset = std::get<0>(it->second); - const auto sectionSize = std::get<1>(it->second); - const auto sectionCompressionType = std::get<2>(it->second); + const auto sectionOffset = it->second.offset(); + const auto sectionSize = it->second.size(); + 
const auto sectionCompressionType = it->second.compressionType(); if (sectionOffset < offset) { // Section was not read yet. Need to read from file. @@ -427,7 +428,7 @@ TabletReader::TabletReader( auto iobuf = std::move(result[i]); const std::string preload{mustRead[i].label}; auto metadata = std::make_unique( - memoryPool_, iobuf, std::get<2>(optionalSections_[preload])); + memoryPool_, iobuf, optionalSections_.at(preload).compressionType()); optionalSectionsCache_.wlock()->insert({preload, std::move(metadata)}); } } @@ -670,9 +671,9 @@ std::optional
TabletReader::loadOptionalSection( return std::nullopt; } - const auto offset = std::get<0>(it->second); - const auto size = std::get<1>(it->second); - const auto compressionType = std::get<2>(it->second); + const auto offset = it->second.offset(); + const auto size = it->second.size(); + const auto compressionType = it->second.compressionType(); velox::common::Region region{offset, size, name}; folly::IOBuf iobuf; diff --git a/dwio/nimble/tablet/TabletReader.h b/dwio/nimble/tablet/TabletReader.h index 93ab654..807e8f8 100644 --- a/dwio/nimble/tablet/TabletReader.h +++ b/dwio/nimble/tablet/TabletReader.h @@ -17,6 +17,7 @@ #include #include "dwio/nimble/common/Checksum.h" +#include "dwio/nimble/common/Types.h" #include "dwio/nimble/common/Vector.h" #include "folly/Range.h" #include "folly/Synchronized.h" @@ -85,6 +86,32 @@ class Section { MetadataBuffer buffer_; }; +class MetadataSection { + public: + MetadataSection( + uint64_t offset, + uint32_t size, + CompressionType compressionType) + : offset_{offset}, size_{size}, compressionType_{compressionType} {} + + uint64_t offset() const { + return offset_; + } + + uint32_t size() const { + return size_; + } + + CompressionType compressionType() const { + return compressionType_; + } + + private: + uint64_t offset_; + uint32_t size_; + CompressionType compressionType_; +}; + class Postscript { public: uint32_t footerSize() const { @@ -251,6 +278,11 @@ class TabletReader { const StripeIdentifier& stripe, std::span streamIdentifiers) const; + const std::unordered_map& optionalSections() + const { + return optionalSections_; + } + std::optional
loadOptionalSection( const std::string& name, bool keepCache = false) const; @@ -349,10 +381,7 @@ class TabletReader { uint32_t stripeCount_{0}; const uint32_t* stripeRowCounts_{nullptr}; const uint64_t* stripeOffsets_{nullptr}; - std::unordered_map< - std::string, - std::tuple> - optionalSections_; + std::unordered_map optionalSections_; mutable folly::Synchronized< std::unordered_map>> optionalSectionsCache_; diff --git a/dwio/nimble/tablet/tests/TabletTests.cpp b/dwio/nimble/tablet/tests/TabletTests.cpp index be01698..0e16fdb 100644 --- a/dwio/nimble/tablet/tests/TabletTests.cpp +++ b/dwio/nimble/tablet/tests/TabletTests.cpp @@ -561,6 +561,23 @@ TEST(TabletTests, OptionalSections) { file, useChaniedBuffers); nimble::TabletReader tablet{*pool, &readFile}; + ASSERT_EQ(tablet.optionalSections().size(), 3); + ASSERT_TRUE(tablet.optionalSections().contains("section1")); + ASSERT_EQ( + tablet.optionalSections().at("section1").compressionType(), + nimble::CompressionType::Uncompressed); + ASSERT_EQ(tablet.optionalSections().at("section1").size(), random.size()); + ASSERT_TRUE(tablet.optionalSections().contains("section2")); + ASSERT_EQ( + tablet.optionalSections().at("section2").compressionType(), + nimble::CompressionType::Uncompressed); + ASSERT_EQ(tablet.optionalSections().at("section2").size(), zeroes.size()); + ASSERT_TRUE(tablet.optionalSections().contains("section3")); + ASSERT_EQ( + tablet.optionalSections().at("section3").compressionType(), + nimble::CompressionType::Uncompressed); + ASSERT_EQ(tablet.optionalSections().at("section3").size(), 0); + auto check1 = [&]() { auto section = tablet.loadOptionalSection("section1"); ASSERT_TRUE(section.has_value()); @@ -607,6 +624,8 @@ TEST(TabletTests, OptionalSectionsEmpty) { file, useChaniedBuffers); nimble::TabletReader tablet{*pool, &readFile}; + ASSERT_TRUE(tablet.optionalSections().empty()); + auto section = tablet.loadOptionalSection("section1"); ASSERT_FALSE(section.has_value()); } From 
fb7a9ea18139b4c0db5c4bb8f6c23e64ac7fe4c5 Mon Sep 17 00:00:00 2001 From: Chongfeng Hu Date: Fri, 10 Jan 2025 11:43:37 -0800 Subject: [PATCH 2/2] Add 2 new TabletReader APIs: stripesMetadata and stripeGroupsMetadata (#126) Summary: These 2 new APIs will allow clients to get insights into stripes and stripe groups metadata, such as offset and size. This information can be useful in use cases like `nimble_dump` where we want to know the sizes of these sections in the Nimble file. Reviewed By: helfman Differential Revision: D67957498 --- dwio/nimble/tablet/TabletReader.cpp | 39 ++++++++++++++++++++++++++++- dwio/nimble/tablet/TabletReader.h | 8 ++++++ 2 files changed, 46 insertions(+), 1 deletion(-) diff --git a/dwio/nimble/tablet/TabletReader.cpp b/dwio/nimble/tablet/TabletReader.cpp index 1b52e46..01439fc 100644 --- a/dwio/nimble/tablet/TabletReader.cpp +++ b/dwio/nimble/tablet/TabletReader.cpp @@ -26,7 +26,9 @@ #include "folly/io/Cursor.h" #include +#include #include +#include #include #include #include @@ -48,7 +50,7 @@ namespace facebook::nimble { // 4 bytes footer size + 1 byte footer compression type + // 1 byte checksum type + 8 bytes checksum + // 2 bytes major version + 2 bytes minor version + -// 4 bytes magic number. +// 2 bytes magic number. 
namespace { template @@ -648,6 +650,41 @@ uint64_t TabletReader::getTotalStreamSize( return streamSizeSum; } +std::optional TabletReader::stripesMetadata() const { + auto footerRoot = + asFlatBuffersRoot(footer_->content()); + auto stripes = footerRoot->stripes(); + if (!stripes) { + return std::nullopt; + } + return MetadataSection{ + stripes->offset(), + stripes->size(), + static_cast(stripes->compression_type())}; +} + +std::vector TabletReader::stripeGroupsMetadata() const { + std::vector groupsMetadata; + auto footerRoot = + asFlatBuffersRoot(footer_->content()); + auto stripeGroups = footerRoot->stripe_groups(); + if (!stripeGroups) { + return groupsMetadata; + } + groupsMetadata.reserve(stripeGroups->size()); + std::transform( + stripeGroups->cbegin(), + stripeGroups->cend(), + std::back_inserter(groupsMetadata), + [](const auto& stripeGroup) { + return MetadataSection{ + stripeGroup->offset(), + stripeGroup->size(), + static_cast(stripeGroup->compression_type())}; + }); + return groupsMetadata; +} + std::optional
TabletReader::loadOptionalSection( const std::string& name, bool keepCache) const { diff --git a/dwio/nimble/tablet/TabletReader.h b/dwio/nimble/tablet/TabletReader.h index 807e8f8..dcbf43c 100644 --- a/dwio/nimble/tablet/TabletReader.h +++ b/dwio/nimble/tablet/TabletReader.h @@ -14,7 +14,11 @@ * limitations under the License. */ #pragma once + +#include +#include #include +#include #include "dwio/nimble/common/Checksum.h" #include "dwio/nimble/common/Types.h" @@ -278,6 +282,10 @@ class TabletReader { const StripeIdentifier& stripe, std::span streamIdentifiers) const; + std::optional stripesMetadata() const; + + std::vector stripeGroupsMetadata() const; + const std::unordered_map& optionalSections() const { return optionalSections_;