From 0611eb5b9df04f76b9d763b44763349b067b6c48 Mon Sep 17 00:00:00 2001 From: Nick Brekhus Date: Wed, 18 Sep 2024 13:27:44 -0700 Subject: [PATCH 01/17] Fix orphaned files in SstFileManager (#13015) Summary: Pull Request resolved: https://github.com/facebook/rocksdb/pull/13015 `Close()`ing a database now releases tracked files in `SstFileManager`. Previously this space would be leaked until the database was later reopened. Reviewed By: jowlyzhang Differential Revision: D62590773 fbshipit-source-id: 5461bd253d974ac4967ad52fee92e2650f8a9a28 --- db/db_impl/db_impl.cc | 68 +++++++++++++++++++ db/db_impl/db_impl.h | 10 +++ db/db_impl/db_impl_open.cc | 41 +---------- db/db_sst_test.cc | 24 +++++++ file/sst_file_manager_impl.cc | 10 +++ file/sst_file_manager_impl.h | 4 +- .../sst_file_manager_untrack_close.md | 2 + 7 files changed, 118 insertions(+), 41 deletions(-) create mode 100644 unreleased_history/behavior_changes/sst_file_manager_untrack_close.md diff --git a/db/db_impl/db_impl.cc b/db/db_impl/db_impl.cc index 5b54b4d2d..e95561efa 100644 --- a/db/db_impl/db_impl.cc +++ b/db/db_impl/db_impl.cc @@ -530,6 +530,11 @@ Status DBImpl::MaybeReleaseTimestampedSnapshotsAndCheck() { return Status::OK(); } +void DBImpl::UntrackDataFiles() { + TrackOrUntrackFiles(/*existing_data_files=*/{}, + /*track=*/false); +} + Status DBImpl::CloseHelper() { // Guarantee that there is no background error recovery in progress before // continuing with the shutdown @@ -669,6 +674,13 @@ Status DBImpl::CloseHelper() { delete txn_entry.second; } + // Return an unowned SstFileManager to a consistent state + if (immutable_db_options_.sst_file_manager && !own_sfm_) { + mutex_.Unlock(); + UntrackDataFiles(); + mutex_.Lock(); + } + // versions need to be destroyed before table_cache since it can hold // references to table_cache. { @@ -6747,6 +6759,62 @@ void DBImpl::RecordSeqnoToTimeMapping(uint64_t populate_historical_seconds) { } } +void DBImpl::TrackOrUntrackFiles( + const std::vector& existing_data_files, bool track) { + auto sfm = static_cast_with_check( + immutable_db_options_.sst_file_manager.get()); + assert(sfm); + std::vector metadata; + GetAllColumnFamilyMetaData(&metadata); + auto action = [&](const std::string& file_path, + std::optional size) { + if (track) { + if (size) { + sfm->OnAddFile(file_path, *size).PermitUncheckedError(); + } else { + sfm->OnAddFile(file_path).PermitUncheckedError(); + } + } else { + sfm->OnUntrackFile(file_path).PermitUncheckedError(); + } + }; + + std::unordered_set referenced_files; + for (const auto& md : metadata) { + for (const auto& lmd : md.levels) { + for (const auto& fmd : lmd.files) { + // We're assuming that each sst file name exists in at most one of + // the paths. + std::string file_path = + fmd.directory + kFilePathSeparator + fmd.relative_filename; + action(file_path, fmd.size); + referenced_files.insert(file_path); + } + } + for (const auto& bmd : md.blob_files) { + std::string name = bmd.blob_file_name; + // The BlobMetaData.blob_file_name may start with "/". + if (!name.empty() && name[0] == kFilePathSeparator) { + name = name.substr(1); + } + // We're assuming that each blob file name exists in at most one of + // the paths. + std::string file_path = bmd.blob_file_path + kFilePathSeparator + name; + action(file_path, bmd.blob_file_size); + referenced_files.insert(file_path); + } + } + + for (const auto& file_path : existing_data_files) { + if (referenced_files.find(file_path) != referenced_files.end()) { + continue; + } + // There shouldn't be any duplicated files. 
In case there is, SstFileManager + // will take care of deduping it. + action(file_path, /*size=*/std::nullopt); + } +} + void DBImpl::InstallSeqnoToTimeMappingInSV( std::vector* sv_contexts) { mutex_.AssertHeld(); diff --git a/db/db_impl/db_impl.h b/db/db_impl/db_impl.h index da0625ae7..66b324825 100644 --- a/db/db_impl/db_impl.h +++ b/db/db_impl/db_impl.h @@ -1618,9 +1618,15 @@ class DBImpl : public DB { // vast majority of all files), since it already has the file size // on record, we don't need to query the file system. Otherwise, we query the // file system for the size of an unreferenced file. + // REQUIRES: mutex unlocked void TrackExistingDataFiles( const std::vector& existing_data_files); + // Untrack data files in sst manager. This is only called during DB::Close on + // an unowned SstFileManager, to return it to a consistent state. + // REQUIRES: mutex unlocked + void UntrackDataFiles(); + // SetDbSessionId() should be called in the constuctor DBImpl() // to ensure that db_session_id_ gets updated every time the DB is opened void SetDbSessionId(); @@ -2190,6 +2196,10 @@ class DBImpl : public DB { JobContext* job_context, LogBuffer* log_buffer, CompactionJobInfo* compaction_job_info); + // REQUIRES: mutex unlocked + void TrackOrUntrackFiles(const std::vector& existing_data_files, + bool track); + ColumnFamilyData* GetColumnFamilyDataByName(const std::string& cf_name); void MaybeScheduleFlushOrCompaction(); diff --git a/db/db_impl/db_impl_open.cc b/db/db_impl/db_impl_open.cc index 0697cc20f..4ce2e82e8 100644 --- a/db/db_impl/db_impl_open.cc +++ b/db/db_impl/db_impl_open.cc @@ -1988,46 +1988,7 @@ IOStatus DBImpl::CreateWAL(const WriteOptions& write_options, void DBImpl::TrackExistingDataFiles( const std::vector& existing_data_files) { - auto sfm = static_cast( - immutable_db_options_.sst_file_manager.get()); - assert(sfm); - std::vector metadata; - GetAllColumnFamilyMetaData(&metadata); - - std::unordered_set referenced_files; - for (const auto& md : metadata) { - for (const auto& lmd : md.levels) { - for (const auto& fmd : lmd.files) { - // We're assuming that each sst file name exists in at most one of - // the paths. - std::string file_path = - fmd.directory + kFilePathSeparator + fmd.relative_filename; - sfm->OnAddFile(file_path, fmd.size).PermitUncheckedError(); - referenced_files.insert(file_path); - } - } - for (const auto& bmd : md.blob_files) { - std::string name = bmd.blob_file_name; - // The BlobMetaData.blob_file_name may start with "/". - if (!name.empty() && name[0] == kFilePathSeparator) { - name = name.substr(1); - } - // We're assuming that each blob file name exists in at most one of - // the paths. - std::string file_path = bmd.blob_file_path + kFilePathSeparator + name; - sfm->OnAddFile(file_path, bmd.blob_file_size).PermitUncheckedError(); - referenced_files.insert(file_path); - } - } - - for (const auto& file_path : existing_data_files) { - if (referenced_files.find(file_path) != referenced_files.end()) { - continue; - } - // There shouldn't be any duplicated files. In case there is, SstFileManager - // will take care of deduping it. 
- sfm->OnAddFile(file_path).PermitUncheckedError(); - } + TrackOrUntrackFiles(existing_data_files, /*track=*/true); } Status DBImpl::Open(const DBOptions& db_options, const std::string& dbname, diff --git a/db/db_sst_test.cc b/db/db_sst_test.cc index 5499d58f5..14ec8e55c 100644 --- a/db/db_sst_test.cc +++ b/db/db_sst_test.cc @@ -383,12 +383,16 @@ TEST_F(DBSSTTest, DBWithSstFileManager) { ASSERT_EQ(files_moved, 0); Close(); + ASSERT_EQ(sfm->GetTrackedFiles().size(), 0) << "sfm should be empty"; + ASSERT_EQ(sfm->GetTotalSize(), 0) << "sfm should be empty"; Reopen(options); ASSERT_EQ(sfm->GetTrackedFiles(), files_in_db); ASSERT_EQ(sfm->GetTotalSize(), total_files_size); // Verify that we track all the files again after the DB is closed and opened Close(); + ASSERT_EQ(sfm->GetTrackedFiles().size(), 0) << "sfm should be empty"; + ASSERT_EQ(sfm->GetTotalSize(), 0) << "sfm should be empty"; sst_file_manager.reset(NewSstFileManager(env_)); options.sst_file_manager = sst_file_manager; sfm = static_cast(sst_file_manager.get()); @@ -439,6 +443,11 @@ TEST_F(DBSSTTest, DBWithSstFileManagerForBlobFiles) { ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( "SstFileManagerImpl::OnMoveFile", [&](void* /*arg*/) { files_moved++; }); + + int64_t untracked_files = 0; + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "SstFileManagerImpl::OnUntrackFile", + [&](void* /*arg*/) { ++untracked_files; }); ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); Options options = CurrentOptions(); @@ -485,6 +494,10 @@ TEST_F(DBSSTTest, DBWithSstFileManagerForBlobFiles) { } ASSERT_EQ(sfm->GetTotalSize(), total_files_size); Close(); + ASSERT_EQ(untracked_files, files_in_db.size()); + untracked_files = 0; + ASSERT_EQ(sfm->GetTrackedFiles().size(), 0) << "sfm should be empty"; + ASSERT_EQ(sfm->GetTotalSize(), 0) << "sfm should be empty"; Reopen(options); ASSERT_EQ(sfm->GetTrackedFiles(), files_in_db); @@ -492,6 +505,10 @@ TEST_F(DBSSTTest, DBWithSstFileManagerForBlobFiles) { // Verify that we track all the files again after the DB is closed and opened. 
Close(); + ASSERT_EQ(untracked_files, files_in_db.size()); + untracked_files = 0; + ASSERT_EQ(sfm->GetTrackedFiles().size(), 0) << "sfm should be empty"; + ASSERT_EQ(sfm->GetTotalSize(), 0) << "sfm should be empty"; sst_file_manager.reset(NewSstFileManager(env_)); options.sst_file_manager = sst_file_manager; @@ -507,6 +524,10 @@ TEST_F(DBSSTTest, DBWithSstFileManagerForBlobFiles) { ASSERT_EQ(files_deleted, 0); ASSERT_EQ(files_scheduled_to_delete, 0); Close(); + ASSERT_EQ(untracked_files, files_in_db.size()); + untracked_files = 0; + ASSERT_EQ(sfm->GetTrackedFiles().size(), 0) << "sfm should be empty"; + ASSERT_EQ(sfm->GetTotalSize(), 0) << "sfm should be empty"; ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( "SstFileManagerImpl::ScheduleUnaccountedFileDeletion", [&](void* arg) { assert(arg); @@ -666,6 +687,9 @@ TEST_F(DBSSTTest, DBWithSstFileManagerForBlobFilesWithGC) { } Close(); + ASSERT_EQ(sfm->GetTrackedFiles().size(), 0) << "sfm should be empty"; + ASSERT_EQ(sfm->GetTotalSize(), 0) << "sfm should be empty"; + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( "SstFileManagerImpl::ScheduleUnaccountedFileDeletion", [&](void* arg) { assert(arg); diff --git a/file/sst_file_manager_impl.cc b/file/sst_file_manager_impl.cc index 3f4f48e28..0ecb5d5eb 100644 --- a/file/sst_file_manager_impl.cc +++ b/file/sst_file_manager_impl.cc @@ -118,6 +118,16 @@ Status SstFileManagerImpl::OnMoveFile(const std::string& old_path, return Status::OK(); } +Status SstFileManagerImpl::OnUntrackFile(const std::string& file_path) { + { + MutexLock l(&mu_); + OnDeleteFileImpl(file_path); + } + TEST_SYNC_POINT_CALLBACK("SstFileManagerImpl::OnUntrackFile", + const_cast(&file_path)); + return Status::OK(); +} + void SstFileManagerImpl::SetMaxAllowedSpaceUsage(uint64_t max_allowed_space) { MutexLock l(&mu_); max_allowed_space_ = max_allowed_space; diff --git a/file/sst_file_manager_impl.h b/file/sst_file_manager_impl.h index 47a2b5935..96ec271ee 100644 --- a/file/sst_file_manager_impl.h +++ b/file/sst_file_manager_impl.h @@ -50,6 +50,9 @@ class SstFileManagerImpl : public SstFileManager { Status OnMoveFile(const std::string& old_path, const std::string& new_path, uint64_t* file_size = nullptr); + // DB will call OnUntrackFile when closing with an unowned SstFileManager. + Status OnUntrackFile(const std::string& file_path); + // Update the maximum allowed space that should be used by RocksDB, if // the total size of the SST and blob files exceeds max_allowed_space, writes // to RocksDB will fail. @@ -217,4 +220,3 @@ class SstFileManagerImpl : public SstFileManager { }; } // namespace ROCKSDB_NAMESPACE - diff --git a/unreleased_history/behavior_changes/sst_file_manager_untrack_close.md b/unreleased_history/behavior_changes/sst_file_manager_untrack_close.md new file mode 100644 index 000000000..7251b547e --- /dev/null +++ b/unreleased_history/behavior_changes/sst_file_manager_untrack_close.md @@ -0,0 +1,2 @@ +DB::Close now untracks files in SstFileManager, making avaialble any space used +by them. Prior to this change they would be orphaned until the DB is re-opened. From 10984e8c26ae961144231b8c6f8dd3058e55a169 Mon Sep 17 00:00:00 2001 From: Peter Dillinger Date: Wed, 18 Sep 2024 15:26:37 -0700 Subject: [PATCH 02/17] Fix and generalize framework for filtering range queries, etc. 
(#13005) Summary: There was a subtle design/contract bug in the previous version of range filtering in experimental.h If someone implemented a key segments extractor with "all or nothing" fixed size segments, that could result in unsafe range filtering. For example, with two segments of width 3: ``` x = 0x|12 34 56|78 9A 00| y = 0x|12 34 56||78 9B z = 0x|12 34 56|78 9C 00| ``` Segment 1 of y (empty) is out of order with segment 1 of x and z. I have re-worked the contract to make it clear what does work, and implemented a standard extractor for fixed-size segments, CappedKeySegmentsExtractor. The safe approach for filtering is to consume as much as is available for a segment in the case of a short key. I have also added support for min-max filtering with reverse byte-wise comparator, which is probably the 2nd most common comparator for RocksDB users (because of MySQL). It might seem that a min-max filter doesn't care about forward or reverse ordering, but it does when trying to determine whether in input range from segment values v1 to v2, where it so happens that v2 is byte-wise less than v1, is an empty forward interval or a non-empty reverse interval. At least in the current setup, we don't have that context. A new unit test (with some refactoring) tests CappedKeySegmentsExtractor, reverse byte-wise comparator, and the corresponding min-max filter. I have also (contractually / mathematically) generalized the framework to comparators other than the byte-wise comparator, and made other generalizations to make the extractor limitations more explicitly connected to the particular filters and filtering used--at least in description. Pull Request resolved: https://github.com/facebook/rocksdb/pull/13005 Test Plan: added unit tests as described Reviewed By: jowlyzhang Differential Revision: D62769784 Pulled By: pdillinger fbshipit-source-id: 0d41f0d0273586bdad55e4aa30381ebc861f7044 --- db/db_bloom_filter_test.cc | 240 +++++++++++++++++---- db/experimental.cc | 198 +++++++++++++++--- include/rocksdb/comparator.h | 15 +- include/rocksdb/experimental.h | 370 +++++++++++++++++++++++++-------- 4 files changed, 658 insertions(+), 165 deletions(-) diff --git a/db/db_bloom_filter_test.cc b/db/db_bloom_filter_test.cc index 4943e99c4..7b42f4ee5 100644 --- a/db/db_bloom_filter_test.cc +++ b/db/db_bloom_filter_test.cc @@ -3734,14 +3734,33 @@ TEST_F(DBBloomFilterTest, WeirdPrefixExtractorWithFilter3) { } } -TEST_F(DBBloomFilterTest, SstQueryFilter) { - using experimental::KeySegmentsExtractor; - using experimental::MakeSharedBytewiseMinMaxSQFC; - using experimental::SelectKeySegment; - using experimental::SstQueryFilterConfigs; - using experimental::SstQueryFilterConfigsManager; - using KeyCategorySet = KeySegmentsExtractor::KeyCategorySet; +using experimental::KeySegmentsExtractor; +using experimental::MakeSharedBytewiseMinMaxSQFC; +using experimental::MakeSharedCappedKeySegmentsExtractor; +using experimental::MakeSharedReverseBytewiseMinMaxSQFC; +using experimental::SelectKeySegment; +using experimental::SstQueryFilterConfigs; +using experimental::SstQueryFilterConfigsManager; +using KeyCategorySet = KeySegmentsExtractor::KeyCategorySet; + +static std::vector RangeQueryKeys( + SstQueryFilterConfigsManager::Factory& factory, DB& db, const Slice& lb, + const Slice& ub) { + ReadOptions ro; + ro.iterate_lower_bound = &lb; + ro.iterate_upper_bound = &ub; + ro.table_filter = factory.GetTableFilterForRangeQuery(lb, ub); + auto it = db.NewIterator(ro); + std::vector ret; + for (it->Seek(lb); it->Valid(); 
it->Next()) { + ret.push_back(it->key().ToString()); + } + EXPECT_OK(it->status()); + delete it; + return ret; +}; +TEST_F(DBBloomFilterTest, SstQueryFilter) { struct MySegmentExtractor : public KeySegmentsExtractor { char min_first_char; char max_first_char; @@ -3890,101 +3909,86 @@ TEST_F(DBBloomFilterTest, SstQueryFilter) { ASSERT_OK(Flush()); using Keys = std::vector; - auto RangeQueryKeys = + auto RangeQuery = [factory, db = db_]( std::string lb, std::string ub, std::shared_ptr alt_factory = nullptr) { - Slice lb_slice = lb; - Slice ub_slice = ub; - - ReadOptions ro; - ro.iterate_lower_bound = &lb_slice; - ro.iterate_upper_bound = &ub_slice; - ro.table_filter = (alt_factory ? alt_factory : factory) - ->GetTableFilterForRangeQuery(lb_slice, ub_slice); - auto it = db->NewIterator(ro); - Keys ret; - for (it->Seek(lb_slice); it->Valid(); it->Next()) { - ret.push_back(it->key().ToString()); - } - EXPECT_OK(it->status()); - delete it; - return ret; + return RangeQueryKeys(alt_factory ? *alt_factory : *factory, *db, lb, + ub); }; // Control 1: range is not filtered but min/max filter is checked // because of common prefix leading up to 2nd segment // TODO/future: statistics for when filter is checked vs. not applicable - EXPECT_EQ(RangeQueryKeys("abc_150", "abc_249"), + EXPECT_EQ(RangeQuery("abc_150", "abc_249"), Keys({"abc_156_987", "abc_234", "abc_245_567"})); EXPECT_EQ(TestGetAndResetTickerCount(options, NON_LAST_LEVEL_SEEK_DATA), 2); // Test 1: range is filtered to just lowest level, fully containing the // segments in that category - EXPECT_EQ(RangeQueryKeys("abc_100", "abc_179"), + EXPECT_EQ(RangeQuery("abc_100", "abc_179"), Keys({"abc_123", "abc_13", "abc_156_987"})); EXPECT_EQ(TestGetAndResetTickerCount(options, NON_LAST_LEVEL_SEEK_DATA), 1); // Test 2: range is filtered to just lowest level, partial overlap - EXPECT_EQ(RangeQueryKeys("abc_1500_x_y", "abc_16QQ"), Keys({"abc_156_987"})); + EXPECT_EQ(RangeQuery("abc_1500_x_y", "abc_16QQ"), Keys({"abc_156_987"})); EXPECT_EQ(TestGetAndResetTickerCount(options, NON_LAST_LEVEL_SEEK_DATA), 1); // Test 3: range is filtered to just highest level, fully containing the // segments in that category but would be overlapping the range for the other // file if the filter included all categories - EXPECT_EQ(RangeQueryKeys("abc_200", "abc_300"), + EXPECT_EQ(RangeQuery("abc_200", "abc_300"), Keys({"abc_234", "abc_245_567", "abc_25"})); EXPECT_EQ(TestGetAndResetTickerCount(options, NON_LAST_LEVEL_SEEK_DATA), 1); // Test 4: range is filtered to just highest level, partial overlap (etc.) 
- EXPECT_EQ(RangeQueryKeys("abc_200", "abc_249"), - Keys({"abc_234", "abc_245_567"})); + EXPECT_EQ(RangeQuery("abc_200", "abc_249"), Keys({"abc_234", "abc_245_567"})); EXPECT_EQ(TestGetAndResetTickerCount(options, NON_LAST_LEVEL_SEEK_DATA), 1); // Test 5: range is filtered from both levels, because of category scope - EXPECT_EQ(RangeQueryKeys("abc_300", "abc_400"), Keys({})); + EXPECT_EQ(RangeQuery("abc_300", "abc_400"), Keys({})); EXPECT_EQ(TestGetAndResetTickerCount(options, NON_LAST_LEVEL_SEEK_DATA), 0); // Control 2: range is not filtered because association between 1st and // 2nd segment is not represented - EXPECT_EQ(RangeQueryKeys("abc_170", "abc_190"), Keys({})); + EXPECT_EQ(RangeQuery("abc_170", "abc_190"), Keys({})); EXPECT_EQ(TestGetAndResetTickerCount(options, NON_LAST_LEVEL_SEEK_DATA), 2); // Control 3: range is not filtered because there's no (bloom) filter on // 1st segment (like prefix filtering) - EXPECT_EQ(RangeQueryKeys("baa_170", "baa_190"), Keys({})); + EXPECT_EQ(RangeQuery("baa_170", "baa_190"), Keys({})); EXPECT_EQ(TestGetAndResetTickerCount(options, NON_LAST_LEVEL_SEEK_DATA), 2); // Control 4: range is not filtered because difference in segments leading // up to 2nd segment - EXPECT_EQ(RangeQueryKeys("abc_500", "abd_501"), Keys({})); + EXPECT_EQ(RangeQuery("abc_500", "abd_501"), Keys({})); EXPECT_EQ(TestGetAndResetTickerCount(options, NON_LAST_LEVEL_SEEK_DATA), 2); // TODO: exclusive upper bound tests // ======= Testing 3rd segment (cross-category filter) ======= // Control 5: not filtered because of segment range overlap - EXPECT_EQ(RangeQueryKeys(" z__700", " z__750"), Keys({})); + EXPECT_EQ(RangeQuery(" z__700", " z__750"), Keys({})); EXPECT_EQ(TestGetAndResetTickerCount(options, NON_LAST_LEVEL_SEEK_DATA), 2); // Test 6: filtered on both levels - EXPECT_EQ(RangeQueryKeys(" z__100", " z__300"), Keys({})); + EXPECT_EQ(RangeQuery(" z__100", " z__300"), Keys({})); EXPECT_EQ(TestGetAndResetTickerCount(options, NON_LAST_LEVEL_SEEK_DATA), 0); // Control 6: finding something, with 2nd segment filter helping - EXPECT_EQ(RangeQueryKeys("abc_156_9", "abc_156_99"), Keys({"abc_156_987"})); + EXPECT_EQ(RangeQuery("abc_156_9", "abc_156_99"), Keys({"abc_156_987"})); EXPECT_EQ(TestGetAndResetTickerCount(options, NON_LAST_LEVEL_SEEK_DATA), 1); - EXPECT_EQ(RangeQueryKeys("abc_245_56", "abc_245_57"), Keys({"abc_245_567"})); + EXPECT_EQ(RangeQuery("abc_245_56", "abc_245_57"), Keys({"abc_245_567"})); EXPECT_EQ(TestGetAndResetTickerCount(options, NON_LAST_LEVEL_SEEK_DATA), 1); // Test 6: filtered on both levels, for different segments - EXPECT_EQ(RangeQueryKeys("abc_245_900", "abc_245_999"), Keys({})); + EXPECT_EQ(RangeQuery("abc_245_900", "abc_245_999"), Keys({})); EXPECT_EQ(TestGetAndResetTickerCount(options, NON_LAST_LEVEL_SEEK_DATA), 0); // ======= Testing extractor read portability ======= - EXPECT_EQ(RangeQueryKeys("abc_300", "abc_400"), Keys({})); + EXPECT_EQ(RangeQuery("abc_300", "abc_400"), Keys({})); EXPECT_EQ(TestGetAndResetTickerCount(options, NON_LAST_LEVEL_SEEK_DATA), 0); // Only modifies how filters are written @@ -3992,20 +3996,168 @@ TEST_F(DBBloomFilterTest, SstQueryFilter) { ASSERT_EQ(factory->GetFilteringVersion(), 0U); ASSERT_EQ(factory->GetConfigs().IsEmptyNotFound(), true); - EXPECT_EQ(RangeQueryKeys("abc_300", "abc_400"), Keys({})); + EXPECT_EQ(RangeQuery("abc_300", "abc_400"), Keys({})); EXPECT_EQ(TestGetAndResetTickerCount(options, NON_LAST_LEVEL_SEEK_DATA), 0); // Even a different config name with different extractor can read - 
EXPECT_EQ(RangeQueryKeys("abc_300", "abc_400", MakeFactory("bar", 43)), - Keys({})); + EXPECT_EQ(RangeQuery("abc_300", "abc_400", MakeFactory("bar", 43)), Keys({})); EXPECT_EQ(TestGetAndResetTickerCount(options, NON_LAST_LEVEL_SEEK_DATA), 0); // Or a "not found" config name - EXPECT_EQ(RangeQueryKeys("abc_300", "abc_400", MakeFactory("blah", 43)), + EXPECT_EQ(RangeQuery("abc_300", "abc_400", MakeFactory("blah", 43)), Keys({})); EXPECT_EQ(TestGetAndResetTickerCount(options, NON_LAST_LEVEL_SEEK_DATA), 0); } +static std::vector ExtractedSizes(const KeySegmentsExtractor& ex, + const Slice& k) { + KeySegmentsExtractor::Result r; + ex.Extract(k, KeySegmentsExtractor::kFullUserKey, &r); + std::vector ret; + uint32_t last = 0; + for (const auto i : r.segment_ends) { + ret.push_back(static_cast(i - last)); + last = i; + } + return ret; +} + +TEST_F(DBBloomFilterTest, FixedWidthSegments) { + // Unit tests for + auto extractor_none = MakeSharedCappedKeySegmentsExtractor({}); + auto extractor0b = MakeSharedCappedKeySegmentsExtractor({0}); + auto extractor1b = MakeSharedCappedKeySegmentsExtractor({1}); + auto extractor4b = MakeSharedCappedKeySegmentsExtractor({4}); + auto extractor4b0b = MakeSharedCappedKeySegmentsExtractor({4, 0}); + auto extractor4b0b4b = MakeSharedCappedKeySegmentsExtractor({4, 0, 4}); + auto extractor1b3b0b4b = MakeSharedCappedKeySegmentsExtractor({1, 3, 0, 4}); + + ASSERT_EQ(extractor_none->GetId(), "CappedKeySegmentsExtractor"); + ASSERT_EQ(extractor0b->GetId(), "CappedKeySegmentsExtractor0b"); + ASSERT_EQ(extractor1b->GetId(), "CappedKeySegmentsExtractor1b"); + ASSERT_EQ(extractor4b->GetId(), "CappedKeySegmentsExtractor4b"); + ASSERT_EQ(extractor4b0b->GetId(), "CappedKeySegmentsExtractor4b0b"); + ASSERT_EQ(extractor4b0b4b->GetId(), "CappedKeySegmentsExtractor4b0b4b"); + ASSERT_EQ(extractor1b3b0b4b->GetId(), "CappedKeySegmentsExtractor1b3b0b4b"); + + using V = std::vector; + ASSERT_EQ(V({}), ExtractedSizes(*extractor_none, {})); + ASSERT_EQ(V({0}), ExtractedSizes(*extractor0b, {})); + ASSERT_EQ(V({0}), ExtractedSizes(*extractor1b, {})); + ASSERT_EQ(V({0}), ExtractedSizes(*extractor4b, {})); + ASSERT_EQ(V({0, 0}), ExtractedSizes(*extractor4b0b, {})); + ASSERT_EQ(V({0, 0, 0}), ExtractedSizes(*extractor4b0b4b, {})); + ASSERT_EQ(V({0, 0, 0, 0}), ExtractedSizes(*extractor1b3b0b4b, {})); + + ASSERT_EQ(V({3}), ExtractedSizes(*extractor4b, "bla")); + ASSERT_EQ(V({3, 0}), ExtractedSizes(*extractor4b0b, "bla")); + ASSERT_EQ(V({1, 2, 0, 0}), ExtractedSizes(*extractor1b3b0b4b, "bla")); + + ASSERT_EQ(V({}), ExtractedSizes(*extractor_none, "blah")); + ASSERT_EQ(V({0}), ExtractedSizes(*extractor0b, "blah")); + ASSERT_EQ(V({1}), ExtractedSizes(*extractor1b, "blah")); + ASSERT_EQ(V({4}), ExtractedSizes(*extractor4b, "blah")); + ASSERT_EQ(V({4, 0}), ExtractedSizes(*extractor4b0b, "blah")); + ASSERT_EQ(V({4, 0, 0}), ExtractedSizes(*extractor4b0b4b, "blah")); + ASSERT_EQ(V({1, 3, 0, 0}), ExtractedSizes(*extractor1b3b0b4b, "blah")); + + ASSERT_EQ(V({4, 0}), ExtractedSizes(*extractor4b0b, "blah1")); + ASSERT_EQ(V({4, 0, 1}), ExtractedSizes(*extractor4b0b4b, "blah1")); + ASSERT_EQ(V({4, 0, 2}), ExtractedSizes(*extractor4b0b4b, "blah12")); + ASSERT_EQ(V({4, 0, 3}), ExtractedSizes(*extractor4b0b4b, "blah123")); + ASSERT_EQ(V({1, 3, 0, 3}), ExtractedSizes(*extractor1b3b0b4b, "blah123")); + + ASSERT_EQ(V({4}), ExtractedSizes(*extractor4b, "blah1234")); + ASSERT_EQ(V({4, 0}), ExtractedSizes(*extractor4b0b, "blah1234")); + ASSERT_EQ(V({4, 0, 4}), ExtractedSizes(*extractor4b0b4b, "blah1234")); + 
ASSERT_EQ(V({1, 3, 0, 4}), ExtractedSizes(*extractor1b3b0b4b, "blah1234")); + + ASSERT_EQ(V({4, 0, 4}), ExtractedSizes(*extractor4b0b4b, "blah12345")); + ASSERT_EQ(V({1, 3, 0, 4}), ExtractedSizes(*extractor1b3b0b4b, "blah12345")); + + // Filter config for second and fourth segment + auto filter1 = + MakeSharedReverseBytewiseMinMaxSQFC(experimental::SelectKeySegment(1)); + auto filter3 = + MakeSharedReverseBytewiseMinMaxSQFC(experimental::SelectKeySegment(3)); + SstQueryFilterConfigs configs1 = {{filter1, filter3}, extractor1b3b0b4b}; + SstQueryFilterConfigsManager::Data data = {{42, {{"foo", configs1}}}}; + std::shared_ptr configs_manager; + ASSERT_OK(SstQueryFilterConfigsManager::MakeShared(data, &configs_manager)); + std::shared_ptr f; + ASSERT_OK(configs_manager->MakeSharedFactory("foo", 42, &f)); + + ASSERT_EQ(f->GetConfigsName(), "foo"); + ASSERT_EQ(f->GetConfigs().IsEmptyNotFound(), false); + + Options options = CurrentOptions(); + options.statistics = CreateDBStatistics(); + options.table_properties_collector_factories.push_back(f); + // Next most common comparator after bytewise + options.comparator = ReverseBytewiseComparator(); + + DestroyAndReopen(options); + + ASSERT_OK(Put("abcd1234", "val0")); + ASSERT_OK(Put("abcd1245", "val1")); + ASSERT_OK(Put("abcd99", "val2")); // short key + ASSERT_OK(Put("aqua1200", "val3")); + ASSERT_OK(Put("aqua1230", "val4")); + ASSERT_OK(Put("zen", "val5")); // very short key + ASSERT_OK(Put("azur1220", "val6")); + ASSERT_OK(Put("blah", "val7")); + ASSERT_OK(Put("blah2", "val8")); + + ASSERT_OK(Flush()); + + using Keys = std::vector; + + // Range is not filtered but segment 1 min/max filter is checked + EXPECT_EQ(RangeQueryKeys(*f, *db_, "aczz0000", "acdc0000"), Keys({})); + EXPECT_EQ(1, TestGetAndResetTickerCount(options, NON_LAST_LEVEL_SEEK_DATA)); + + // Found (can't use segment 3 filter) + EXPECT_EQ(RangeQueryKeys(*f, *db_, "aqzz0000", "aqdc0000"), + Keys({"aqua1230", "aqua1200"})); + EXPECT_EQ(1, TestGetAndResetTickerCount(options, NON_LAST_LEVEL_SEEK_DATA)); + + // Filtered because of segment 1 min-max not intersecting [aaa, abb] + EXPECT_EQ(RangeQueryKeys(*f, *db_, "zabb9999", "zaaa0000"), Keys({})); + EXPECT_EQ(0, TestGetAndResetTickerCount(options, NON_LAST_LEVEL_SEEK_DATA)); + + // Found + EXPECT_EQ(RangeQueryKeys(*f, *db_, "aqua1200ZZZ", "aqua1000ZZZ"), + Keys({"aqua1200"})); + EXPECT_EQ(1, TestGetAndResetTickerCount(options, NON_LAST_LEVEL_SEEK_DATA)); + + // Found despite short key + EXPECT_EQ(RangeQueryKeys(*f, *db_, "aqua121", "aqua1"), Keys({"aqua1200"})); + EXPECT_EQ(1, TestGetAndResetTickerCount(options, NON_LAST_LEVEL_SEEK_DATA)); + + // Filtered because of segment 3 min-max not intersecting [1000, 1100] + // Note that the empty string is tracked outside of the min-max range. 
+ EXPECT_EQ(RangeQueryKeys(*f, *db_, "aqua1100ZZZ", "aqua1000ZZZ"), Keys({})); + EXPECT_EQ(0, TestGetAndResetTickerCount(options, NON_LAST_LEVEL_SEEK_DATA)); + + // Also filtered despite short key + EXPECT_EQ(RangeQueryKeys(*f, *db_, "aqua11", "aqua1"), Keys({})); + EXPECT_EQ(0, TestGetAndResetTickerCount(options, NON_LAST_LEVEL_SEEK_DATA)); + + // Found + EXPECT_EQ(RangeQueryKeys(*f, *db_, "blah21", "blag"), + Keys({"blah2", "blah"})); + EXPECT_EQ(1, TestGetAndResetTickerCount(options, NON_LAST_LEVEL_SEEK_DATA)); + + // Found + EXPECT_EQ(RangeQueryKeys(*f, *db_, "blah0", "blag"), Keys({"blah"})); + EXPECT_EQ(1, TestGetAndResetTickerCount(options, NON_LAST_LEVEL_SEEK_DATA)); + + // Filtered because of segment 3 min-max not intersecting [0, 1] + // Note that the empty string is tracked outside of the min-max range. + EXPECT_EQ(RangeQueryKeys(*f, *db_, "blah1", "blah0"), Keys({})); + EXPECT_EQ(0, TestGetAndResetTickerCount(options, NON_LAST_LEVEL_SEEK_DATA)); +} + } // namespace ROCKSDB_NAMESPACE int main(int argc, char** argv) { diff --git a/db/experimental.cc b/db/experimental.cc index 8abe45978..f5f14ec61 100644 --- a/db/experimental.cc +++ b/db/experimental.cc @@ -152,6 +152,85 @@ Status UpdateManifestForFilesState( // EXPERIMENTAL new filtering features namespace { +template +class SemiStaticCappedKeySegmentsExtractor : public KeySegmentsExtractor { + public: + SemiStaticCappedKeySegmentsExtractor(const uint32_t* byte_widths) { + id_ = kName(); + uint32_t prev_end = 0; + if constexpr (N > 0) { // Suppress a compiler warning + for (size_t i = 0; i < N; ++i) { + prev_end = prev_end + byte_widths[i]; + ideal_ends_[i] = prev_end; + id_ += std::to_string(byte_widths[i]) + "b"; + } + } + } + + static const char* kName() { return "CappedKeySegmentsExtractor"; } + + const char* Name() const override { return kName(); } + + std::string GetId() const override { return id_; } + + void Extract(const Slice& key_or_bound, KeyKind /*kind*/, + Result* result) const override { + // Optimistic assignment + result->segment_ends.assign(ideal_ends_.begin(), ideal_ends_.end()); + if constexpr (N > 0) { // Suppress a compiler warning + uint32_t key_size = static_cast(key_or_bound.size()); + if (key_size < ideal_ends_.back()) { + // Need to fix up (should be rare) + for (size_t i = 0; i < N; ++i) { + result->segment_ends[i] = std::min(key_size, result->segment_ends[i]); + } + } + } + } + + private: + std::array ideal_ends_; + std::string id_; +}; + +class DynamicCappedKeySegmentsExtractor : public KeySegmentsExtractor { + public: + DynamicCappedKeySegmentsExtractor(const std::vector& byte_widths) { + id_ = kName(); + uint32_t prev_end = 0; + for (size_t i = 0; i < byte_widths.size(); ++i) { + prev_end = prev_end + byte_widths[i]; + ideal_ends_[i] = prev_end; + id_ += std::to_string(byte_widths[i]) + "b"; + } + final_ideal_end_ = prev_end; + } + + static const char* kName() { return "CappedKeySegmentsExtractor"; } + + const char* Name() const override { return kName(); } + + std::string GetId() const override { return id_; } + + void Extract(const Slice& key_or_bound, KeyKind /*kind*/, + Result* result) const override { + // Optimistic assignment + result->segment_ends = ideal_ends_; + uint32_t key_size = static_cast(key_or_bound.size()); + if (key_size < final_ideal_end_) { + // Need to fix up (should be rare) + for (size_t i = 0; i < ideal_ends_.size(); ++i) { + result->segment_ends[i] = std::min(key_size, result->segment_ends[i]); + } + } + } + + private: + std::vector ideal_ends_; + uint32_t 
final_ideal_end_; + std::string id_; +}; + void GetFilterInput(FilterInput select, const Slice& key, const KeySegmentsExtractor::Result& extracted, Slice* out_input, Slice* out_leadup) { @@ -211,12 +290,6 @@ void GetFilterInput(FilterInput select, const Slice& key, assert(false); return Slice(); } - - Slice operator()(SelectValue) { - // TODO - assert(false); - return Slice(); - } }; Slice input = std::visit(FilterInputGetter(key, extracted), select); @@ -256,9 +329,6 @@ const char* DeserializeFilterInput(const char* p, const char* limit, case 3: *out = SelectColumnName{}; return p; - case 4: - *out = SelectValue{}; - return p; default: // Reserved for future use return nullptr; @@ -315,7 +385,6 @@ void SerializeFilterInput(std::string* out, const FilterInput& select) { void operator()(SelectLegacyKeyPrefix) { out->push_back(1); } void operator()(SelectUserTimestamp) { out->push_back(2); } void operator()(SelectColumnName) { out->push_back(3); } - void operator()(SelectValue) { out->push_back(4); } void operator()(SelectKeySegment select) { // TODO: expand supported cases assert(select.segment_index < 16); @@ -372,6 +441,7 @@ enum BuiltinSstQueryFilters : char { // and filtered independently because it might be a special case that is // not representative of the minimum in a spread of values. kBytewiseMinMaxFilter = 0x10, + kRevBytewiseMinMaxFilter = 0x11, }; class SstQueryFilterBuilder { @@ -459,7 +529,10 @@ class CategoryScopeFilterWrapperBuilder : public SstQueryFilterBuilder { class BytewiseMinMaxSstQueryFilterConfig : public SstQueryFilterConfigImpl { public: - using SstQueryFilterConfigImpl::SstQueryFilterConfigImpl; + explicit BytewiseMinMaxSstQueryFilterConfig( + const FilterInput& input, + const KeySegmentsExtractor::KeyCategorySet& categories, bool reverse) + : SstQueryFilterConfigImpl(input, categories), reverse_(reverse) {} std::unique_ptr NewBuilder( bool sanity_checks) const override { @@ -477,11 +550,13 @@ class BytewiseMinMaxSstQueryFilterConfig : public SstQueryFilterConfigImpl { const KeySegmentsExtractor::Result& lower_bound_extracted, const Slice& upper_bound_excl, const KeySegmentsExtractor::Result& upper_bound_extracted) { - assert(!filter.empty() && filter[0] == kBytewiseMinMaxFilter); + assert(!filter.empty() && (filter[0] == kBytewiseMinMaxFilter || + filter[0] == kRevBytewiseMinMaxFilter)); if (filter.size() <= 4) { // Missing some data return true; } + bool reverse = (filter[0] == kRevBytewiseMinMaxFilter); bool empty_included = (filter[1] & kEmptySeenFlag) != 0; const char* p = filter.data() + 2; const char* limit = filter.data() + filter.size(); @@ -528,8 +603,13 @@ class BytewiseMinMaxSstQueryFilterConfig : public SstQueryFilterConfigImpl { // May match if both the upper bound and lower bound indicate there could // be overlap - return upper_bound_input.compare(smallest) >= 0 && - lower_bound_input.compare(largest) <= 0; + if (reverse) { + return upper_bound_input.compare(smallest) <= 0 && + lower_bound_input.compare(largest) >= 0; + } else { + return upper_bound_input.compare(smallest) >= 0 && + lower_bound_input.compare(largest) <= 0; + } } protected: @@ -551,19 +631,11 @@ class BytewiseMinMaxSstQueryFilterConfig : public SstQueryFilterConfigImpl { &prev_leadup); int compare = prev_leadup.compare(leadup); - if (compare > 0) { - status = Status::Corruption( - "Ordering invariant violated from 0x" + - prev_key->ToString(/*hex=*/true) + " with prefix 0x" + - prev_leadup.ToString(/*hex=*/true) + " to 0x" + - key.ToString(/*hex=*/true) + " with prefix 0x" + - 
leadup.ToString(/*hex=*/true)); - return; - } else if (compare == 0) { + if (compare == 0) { // On the same prefix leading up to the segment, the segments must // not be out of order. compare = prev_input.compare(input); - if (compare > 0) { + if (parent.reverse_ ? compare < 0 : compare > 0) { status = Status::Corruption( "Ordering invariant violated from 0x" + prev_key->ToString(/*hex=*/true) + " with segment 0x" + @@ -573,6 +645,9 @@ class BytewiseMinMaxSstQueryFilterConfig : public SstQueryFilterConfigImpl { return; } } + // NOTE: it is not strictly required that the leadup be ordered, just + // satisfy the "common segment prefix property" which would be + // expensive to check } // Now actually update state for the filter inputs @@ -598,7 +673,8 @@ class BytewiseMinMaxSstQueryFilterConfig : public SstQueryFilterConfigImpl { return 0; } return 2 + GetFilterInputSerializedLength(parent.input_) + - VarintLength(smallest.size()) + smallest.size() + largest.size(); + VarintLength(parent.reverse_ ? largest.size() : smallest.size()) + + smallest.size() + largest.size(); } void Finish(std::string& append_to) override { @@ -610,23 +686,27 @@ class BytewiseMinMaxSstQueryFilterConfig : public SstQueryFilterConfigImpl { } size_t old_append_to_size = append_to.size(); append_to.reserve(old_append_to_size + encoded_length); - append_to.push_back(kBytewiseMinMaxFilter); + append_to.push_back(parent.reverse_ ? kRevBytewiseMinMaxFilter + : kBytewiseMinMaxFilter); append_to.push_back(empty_seen ? kEmptySeenFlag : 0); SerializeFilterInput(&append_to, parent.input_); - PutVarint32(&append_to, static_cast(smallest.size())); - append_to.append(smallest); - // The end of `largest` is given by the end of the filter - append_to.append(largest); + auto& minv = parent.reverse_ ? largest : smallest; + auto& maxv = parent.reverse_ ? smallest : largest; + PutVarint32(&append_to, static_cast(minv.size())); + append_to.append(minv); + // The end of `maxv` is given by the end of the filter + append_to.append(maxv); assert(append_to.size() == old_append_to_size + encoded_length); } const BytewiseMinMaxSstQueryFilterConfig& parent; const bool sanity_checks; // Smallest and largest segment seen, excluding the empty segment which - // is tracked separately + // is tracked separately. "Reverse" from parent is only applied at + // serialization time, for efficiency. std::string smallest; std::string largest; bool empty_seen = false; @@ -635,6 +715,8 @@ class BytewiseMinMaxSstQueryFilterConfig : public SstQueryFilterConfigImpl { Status status; }; + bool reverse_; + private: static constexpr char kEmptySeenFlag = 0x1; }; @@ -1036,6 +1118,7 @@ class SstQueryFilterConfigsManagerImpl : public SstQueryFilterConfigsManager { may_match = MayMatch_CategoryScopeFilterWrapper(filter, *state); break; case kBytewiseMinMaxFilter: + case kRevBytewiseMinMaxFilter: if (state == nullptr) { // TODO? 
Report problem // No filtering @@ -1189,14 +1272,63 @@ const std::string SstQueryFilterConfigsManagerImpl::kTablePropertyName = "rocksdb.sqfc"; } // namespace +std::shared_ptr +MakeSharedCappedKeySegmentsExtractor(const std::vector& byte_widths) { + std::vector byte_widths_checked; + byte_widths_checked.resize(byte_widths.size()); + size_t final_end = 0; + for (size_t i = 0; i < byte_widths.size(); ++i) { + final_end += byte_widths[i]; + if (byte_widths[i] > UINT32_MAX / 2 || final_end > UINT32_MAX) { + // Better to crash than to proceed unsafely + return nullptr; + } + byte_widths_checked[i] = static_cast(byte_widths[i]); + } + + switch (byte_widths_checked.size()) { + case 0: + return std::make_shared>( + byte_widths_checked.data()); + case 1: + return std::make_shared>( + byte_widths_checked.data()); + case 2: + return std::make_shared>( + byte_widths_checked.data()); + case 3: + return std::make_shared>( + byte_widths_checked.data()); + case 4: + return std::make_shared>( + byte_widths_checked.data()); + case 5: + return std::make_shared>( + byte_widths_checked.data()); + case 6: + return std::make_shared>( + byte_widths_checked.data()); + default: + return std::make_shared( + byte_widths_checked); + } +} + bool SstQueryFilterConfigs::IsEmptyNotFound() const { return this == &kEmptyNotFoundSQFC; } std::shared_ptr MakeSharedBytewiseMinMaxSQFC( FilterInput input, KeySegmentsExtractor::KeyCategorySet categories) { - return std::make_shared(input, - categories); + return std::make_shared( + input, categories, + /*reverse=*/false); +} + +std::shared_ptr MakeSharedReverseBytewiseMinMaxSQFC( + FilterInput input, KeySegmentsExtractor::KeyCategorySet categories) { + return std::make_shared(input, categories, + /*reverse=*/true); } Status SstQueryFilterConfigsManager::MakeShared( diff --git a/include/rocksdb/comparator.h b/include/rocksdb/comparator.h index c1ea96b59..d727d38c1 100644 --- a/include/rocksdb/comparator.h +++ b/include/rocksdb/comparator.h @@ -179,13 +179,18 @@ class Comparator : public Customizable, public CompareInterface { size_t timestamp_size_; }; -// Return a builtin comparator that uses lexicographic byte-wise -// ordering. The result remains the property of this module and -// must not be deleted. +// Return a builtin comparator that uses lexicographic ordering +// on unsigned bytes, so the empty string is ordered before everything +// else and a sufficiently long string of \xFF orders after anything. +// CanKeysWithDifferentByteContentsBeEqual() == false +// Returns an immortal pointer that must not be deleted by the caller. const Comparator* BytewiseComparator(); -// Return a builtin comparator that uses reverse lexicographic byte-wise -// ordering. +// Return a builtin comparator that is the reverse ordering of +// BytewiseComparator(), so the empty string is ordered after everything +// else and a sufficiently long string of \xFF orders before anything. +// CanKeysWithDifferentByteContentsBeEqual() == false +// Returns an immortal pointer that must not be deleted by the caller. 
const Comparator* ReverseBytewiseComparator(); // Returns a builtin comparator that enables user-defined timestamps (formatted diff --git a/include/rocksdb/experimental.h b/include/rocksdb/experimental.h index e3da33aaf..68380f42f 100644 --- a/include/rocksdb/experimental.h +++ b/include/rocksdb/experimental.h @@ -61,83 +61,89 @@ Status UpdateManifestForFilesState( // EXPERIMENTAL new filtering features // **************************************************************************** -// A class for splitting a key into meaningful pieces, or "segments" for -// filtering purposes. Keys can also be put in "categories" to simplify -// some configuration and handling. To simplify satisfying some filtering -// requirements, the segments must encompass a complete key prefix (or the whole -// key) and segments cannot overlap. -// -// Once in production, the behavior associated with a particular Name() -// cannot change. Introduce a new Name() when introducing new behaviors. +// KeySegmentsExtractor - A class for splitting a key into meaningful pieces, or +// "segments" for filtering purposes. We say the first key segment has segment +// ordinal 0, the second has segment ordinal 1, etc. To simplify satisfying some +// filtering requirements, the segments must encompass a complete key prefix (or +// the whole key). There cannot be gaps between segments (though segments are +// allowed to be essentially unused), and segments cannot overlap. +// +// Keys can also be put in "categories" to simplify some configuration and +// handling. A "legal" key or bound is one that does not return an error (as a +// special, unused category) from the extractor. It is also allowed for all +// keys in a category to return an empty sequence of segments. +// +// To eliminate a confusing distinction between a segment that is empty vs. +// "not present" for a particular key, each key is logically assiciated with +// an infinite sequence of segments, including some infinite tail of 0-length +// segments. In practice, we only represent a finite sequence that (at least) +// covers the non-trivial segments. +// +// Once in production, the behavior associated with a particular GetId() +// cannot change. Introduce a new GetId() when introducing new behaviors. // See also SstQueryFilterConfigsManager below. // -// OTHER CURRENT LIMITATIONS (maybe relaxed in the future for segments only -// needing point query or WHERE filtering): -// * Assumes the (default) byte-wise comparator is used. -// * Assumes the category contiguousness property: that each category is -// contiguous in comparator order. In other words, any key between two keys of -// category c must also be in category c. -// * Assumes the (weak) segment ordering property (described below) always -// holds. (For byte-wise comparator, this is implied by the segment prefix -// property, also described below.) -// * Not yet compatible with user timestamp feature -// -// SEGMENT ORDERING PROPERTY: For maximum use in filters, especially for -// filtering key range queries, we must have a correspondence between -// the lexicographic ordering of key segments and the ordering of keys -// they are extracted from. In other words, if we took the segmented keys -// and ordered them primarily by (byte-wise) order on segment 0, then -// on segment 1, etc., then key order of the original keys would not be -// violated. This is the WEAK form of the property, where multiple keys -// might generate the same segments, but such keys must be contiguous in -// key order. 
(The STRONG form of the property is potentially more useful, -// but for bytewise comparator, it can be inferred from segments satisfying -// the weak property by assuming another segment that extends to the end of -// the key, which would be empty if the segments already extend to the end -// of the key.) -// -// The segment ordering property is hard to think about directly, but for -// bytewise comparator, it is implied by a simpler property to reason about: -// the segment prefix property (see below). (NOTE: an example way to satisfy -// the segment ordering property while breaking the segment prefix property -// is to have a segment delimited by any byte smaller than a certain value, -// and not include the delimiter with the segment leading up to the delimiter. -// For example, the space character is ordered before other printable -// characters, so breaking "foo bar" into "foo", " ", and "bar" would be -// legal, but not recommended.) -// -// SEGMENT PREFIX PROPERTY: If a key generates segments s0, ..., sn (possibly -// more beyond sn) and sn does not extend to the end of the key, then all keys -// starting with bytes s0+...+sn (concatenated) also generate the same segments -// (possibly more). For example, if a key has segment s0 which is less than the -// whole key and another key starts with the bytes of s0--or only has the bytes -// of s0--then the other key must have the same segment s0. In other words, any -// prefix of segments that might not extend to the end of the key must form an -// unambiguous prefix code. See -// https://en.wikipedia.org/wiki/Prefix_code In other other words, parsing -// a key into segments cannot use even a single byte of look-ahead. Upon -// processing each byte, the extractor decides whether to cut a segment that -// ends with that byte, but not one that ends before that byte. The only -// exception is that upon reaching the end of the key, the extractor can choose -// whether to make a segment that ends at the end of the key. +// This feature hasn't yet been validated with user timestamp. +// +// = A SIMPLIFIED MODEL = +// Let us start with the easiest set of contraints to satisfy with a key +// segments extractor that generally allows for correct point and range +// filtering, and add complexity from there. Here we first assume +// * The column family is using the byte-wise comparator, or reverse byte-wise +// * A single category is assigned to all keys (by the extractor) +// * Using simplified criteria for legal segment extraction, the "segment +// maximal prefix property" +// +// SEGMENT MAXIMAL PREFIX PROPERTY: The segment that a byte is assigned to can +// only depend on the bytes that come before it, not on the byte itself nor +// anything later including the full length of the key or bound. +// +// Equivalently, two keys or bounds must agree on the segment assignment of +// position i if the two keys share a common byte-wise prefix up to at least +// position i - 1 (and i is within bounds of both keys). +// +// This specifically excludes "all or nothing" segments where it is only +// included if it reaches a particular width or delimiter. A segment resembling +// the FixedPrefixTransform would be illegal (without other assumptions); it +// must be like CappedPrefixTransform. +// +// This basically matches the notion of parsing prefix codes (see +// https://en.wikipedia.org/wiki/Prefix_code) except we have to include any +// partial segment (code word) at the end whenever an extension to that key +// might produce a full segment. 
An example would be parsing UTF-8 into +// segments corresponding to encoded code points, where any incomplete code +// at the end must be part of a trailing segment. Note a three-way +// correspondence between +// (a) byte-wise ordering of encoded code points, e.g. +// { D0 98 E2 82 AC } +// { E2 82 AC D0 98 } +// (b) lexicographic-then-byte-wise ordering of segments that are each an +// encoded code point, e.g. +// {{ D0 98 } { E2 82 AC }} +// {{ E2 82 AC } { D0 98 }} +// and (c) lexicographic ordering of the decoded code points, e.g. +// { U+0418 U+20AC } +// { U+20AC U+0418 } +// The correspondence between (a) and (b) is a result of the segment maximal +// prefix property and is critical for correct application of filters to +// range queries. The correspondence with (c) is a handy attribute of UTF-8 +// (with no over-long encodings) and might be useful to the application. // // Example types of key segments that can be freely mixed in any order: -// * Some fixed number of bytes or codewords. -// * Ends in a delimiter byte or codeword. (Not including the delimiter as -// part of the segment leading up to it would very likely violate the segment -// prefix property.) -// * Length-encoded sequence of bytes or codewords. The length could even -// come from a preceding segment. +// * Capped number of bytes or codewords. The number cap for the segment +// could be the same for all keys or encoded earlier in the key. +// * Up to *and including* a delimiter byte or codeword. // * Any/all remaining bytes to the end of the key, though this implies all // subsequent segments will be empty. -// For each kind of segment, it should be determined before parsing the segment -// whether an incomplete/short parse will be treated as a segment extending to -// the end of the key or as an empty segment. +// As part of the segment maximal prefix property, if the segments do not +// extend to the end of the key, that must be implied by the bytes that are +// in segments, NOT because the potential contents of a segment were considered +// incomplete. // // For example, keys might consist of // * Segment 0: Any sequence of bytes up to and including the first ':' // character, or the whole key if no ':' is present. -// * Segment 1: The next four bytes, all or nothing (in case of short key). +// * Segment 1: The next four bytes, or less if we reach end of key. // * Segment 2: An unsigned byte indicating the number of additional bytes in // the segment, and then that many bytes (or less up to the end of the key). // * Segment 3: Any/all remaining bytes in the key @@ -145,22 +151,208 @@ Status UpdateManifestForFilesState( // For an example of what can go wrong, consider using '4' as a delimiter // but not including it with the segment leading up to it. Suppose we have // these keys and corresponding first segments: -// "123456" -> "123" -// "124536" -> "12" -// "125436" -> "125" +// "123456" -> "123" (in file 1) +// "124536" -> "12" (in file 2) +// "125436" -> "125" (in file 1) // Notice how byte-wise comparator ordering of the segments does not follow // the ordering of the keys. This means we cannot safely use a filter with -// a range of segment values for filtering key range queries. +// a range of segment values for filtering key range queries. For example, +// we might get a range query for ["123", "125Z") and miss that key "124536" +// in file 2 is in range because its first segment "12" is out of the range +// of the first segments on the bounds, "123" and "125". 
We cannot even safely +// use this for prefix-like range querying with a Bloom filter on the segments. +// For a query ["12", "124Z"), segment "12" would likely not match the Bloom +// filter in file 1 and miss "123456". // -// Also note that it is legal for all keys in a category (or many categories) -// to return an empty sequence of segments. +// CATEGORIES: The KeySegmentsExtractor is allowed to place keys in categories +// so that different parts of the key space can use different filtering +// strategies. The following property is generally recommended for safe filter +// applicability +// * CATEGORY CONTIGUOUSNESS PROPERTY: each category is contiguous in +// comparator order. In other words, any key between two keys of category c +// must also be in category c. +// An alternative to categories when distinct kinds of keys are interspersed +// is to leave some segments empty when they do not apply to that key. +// Filters are generally set up to handle an empty segment specially so that +// it doesn't interfere with tracking accurate ranges on non-empty occurrences +// of the segment. // -// To eliminate a confusing distinction between a segment that is empty vs. -// "not present" for a particular key, each key is logically assiciated with -// an infinite sequence of segments, including some infinite tail of 0-length -// segments. In practice, we only represent a finite sequence that (at least) -// covers the non-trivial segments. +// = BEYOND THE SIMPLIFIED MODEL = +// +// DETAILED GENERAL REQUIREMENTS (incl OTHER COMPARATORS): The exact +// requirements on a key segments extractor depend on whether and how we use +// filters to answer queries that they cannot answer directly. To understand +// this, we describe +// (A) the types of filters in terms of data they represent and can directly +// answer queries about, +// (B) the types of read queries that we want to use filters for, and +// (C) the assumptions that need to be satisfied to connect those two. +// +// TYPES OF FILTERS: Although not exhaustive, here are some useful categories +// of filter data: +// * Equivalence class filtering - Represents or over-approximates a set of +// equivalence classes on keys. The size of the representation is roughly +// proportional to the number of equivalence classes added. Bloom and ribbon +// filters are examples. +// * Order-based filtering - Represents one or more subranges of a key space or +// key segment space. A filter query only requires application of the CF +// comparator. The size of the representation is roughly proportional to the +// number of subranges and to the key or segment size. For example, we call a +// simple filter representing a minimum and a maximum value for a segment a +// min-max filter. // +// TYPES OF READ QUERIES and their DIRECT FILTERS: +// * Point query - Whether there {definitely isn't, might be} an entry for a +// particular key in an SST file (or partition, etc.). +// The DIRECT FILTER for a point query is an equivalence class filter on the +// whole key. +// * Range query - Whether there {definitely isn't, might be} any entries +// within a lower and upper key bound, in an SST file (or partition, etc.). +// NOTE: For this disucssion, we ignore the detail of inclusive vs. +// exclusive bounds by assuming a generalized notion of "bound" (vs. key) +// that conveniently represents spaces between keys. 
For details, see +// https://github.com/facebook/rocksdb/pull/11434 +// The DIRECT FILTER for a range query is an order-based filter on the whole +// key (non-empty intersection of bounds/keys). Simple minimum and maximum +// keys for each SST file are automatically provided by metadata and used in +// the read path for filtering (as well as binary search indexing). +// PARTITIONING NOTE: SST metadata partitions do not have recorded minimum +// and maximum keys, so require some special handling for range query +// filtering. See https://github.com/facebook/rocksdb/pull/12872 etc. +// * Where clauses - Additional constraints that can be put on range queries. +// Specifically, a where clause is a tuple representing that the +// concatenated sequence of segments from i to j (inclusive) compares between +// b1 and b2 according to comparator c. +// EXAMPLE: To represent that segment of ordinal i is equal to s, that would +// be . +// NOTE: To represent something like segment has a particular prefix, you +// would need to split the key into more segments appropriately. There is +// little loss of generality because we can combine adjacent segments for +// specifying where clauses and implementing filters. +// The DIRECT FILTER for a where clause is an order-based filter on the same +// sequence of segments and comparator (non-empty intersection of bounds/keys), +// or in the special case of an equality clause (see example), an equivalence +// class filter on the sequence of segments. +// +// GENERALIZING FILTERS (INDIRECT): +// * Point queries can utilize essentially any kind of filter by extracting +// applicable segments of the query key (if not using whole key) and querying +// the corresponding equivalence class or trivial range. +// NOTE: There is NO requirement e.g. that the comparator used by the filter +// match the CF key comparator or similar. The extractor simply needs to be +// a pure function that does not return "out of bounds" segments. +// FOR EXAMPLE, a min-max filter on the 4th segment of keys can also be +// used for filtering point queries (Get/MultiGet) and could be as +// effective and much more space efficient than a Bloom filter, depending +// on the workload. +// +// Beyond point queries, we generally expect the key comparator to be a +// lexicographic / big endian ordering at a high level (or the reverse of that +// ordering), while each segment can use an arbitrary comparator. +// FOR EXAMPLE, with a custom key comparator and segments extractor, +// segment 0 could be a 4-byte unsigned little-endian integer, +// segment 1 could be an 8-byte signed big-endian integer. This framework +// requires segment 0 to come before segment 1 in the key and to take +// precedence in key ordering (i.e. segment 1 order is only consulted when +// keys are equal in segment 0). +// +// * Equivalence class filters can apply to range queries under conditions +// resembling legacy prefix filtering (prefix_extractor). An equivalence class +// filter on segments i through j and category set s is applicable to a range +// query from lb to ub if +// * All segments through j extracted from lb and ub are equal. +// NOTE: being in the same filtering equivalence class is insufficient, as +// that could be unrelated inputs with a hash collision. Here we are +// omitting details that would formally accommodate comparators in which +// different bytes can be considered equal. +// * The categories of lb and ub are in the category set s. 
+// * COMMON SEGMENT PREFIX PROPERTY (for all x, y, z; params j, s): if
+// * Keys x and z have equal segments up through ordinal j, and
+// * Keys x and z are in categories in category set s, and
+// * Key y is ordered x < y < z according to the CF comparator,
+// then both
+// * Key y has equal segments up through ordinal j (compared to x and z)
+// * Key y is in a category in category set s
+// (This is implied by the SEGMENT MAXIMAL PREFIX PROPERTY in the simplified
+// model.)
+//
+// * Order-based filters on segments (rather than whole key) can apply to range
+// queries (with "whole key" bounds). Specifically, an order-based filter on
+// segments i through j and category set s is applicable to a range query from
+// lb to ub if
+// * All segments through i-1 extracted from lb and ub are equal
+// * The categories of lb and ub are in the category set s.
+// * SEGMENT ORDERING PROPERTY for ordinal i through j, segments
+// comparator c, category set s, for all x, y, and z: if
+// * Keys x and z have equal segments up through ordinal i-1, and
+// * Keys x and z are in categories in category set s, and
+// * Key y is ordered x < y < z according to the CF comparator,
+// then both
+// * The common segment prefix property is satisfied through ordinal i-1
+// and with category set s
+// * x_i..j <= y_i..j <= z_i..j according to segment comparator c, where
+// x_i..j is the concatenation of segments i through j of key x (etc.).
+// (This is implied by the SEGMENT MAXIMAL PREFIX PROPERTY in the simplified
+// model.)
+//
+// INTERESTING EXAMPLES:
+// Consider a segment encoding called BadVarInt1 in which a byte with
+// highest-order bit 1 means "start a new segment". Also consider BadVarInt0
+// which starts a new segment on highest-order bit 0.
+//
+// Configuration: bytewise comp, BadVarInt1 format for segments 0-3 with
+// segment 3 also continuing to the end of the key
+// x = 0x 20 21|82 23|||
+// y = 0x 20 21|82 23 24|85||
+// z = 0x 20 21|82 23|84 25||
+//
+// For i=j=1, this set of keys violates the common segment prefix property and
+// segment ordering property, so can lead to incorrect equivalence class
+// filtering or order-based filtering.
+//
+// Suppose we modify the configuration so that "short" keys (empty in segment
+// 2) are placed in an unfiltered category. In that case, x above doesn't meet
+// the precondition for being limited by segment properties. Consider these
+// keys instead:
+// x = 0x 20 21|82 23 24|85||
+// y = 0x 20 21|82 23 24|85 26|87|
+// z = 0x 20 21|82 23 24|85|86|
+// m = 0x 20 21|82 23 25|85|86|
+// n = 0x 20 21|82 23|84 25||
+//
+// Although segment 1 values might be out of order with key order,
+// re-categorizing the short keys has allowed satisfying the common segment
+// prefix property with j=1 (and with j=0), so we can use equivalence class
+// filters on segment 1, or 0, or 0 to 1. However, violation of the segment
+// ordering property on i=j=1 (see z, m, n) means we can't use order-based.
+//
+// p = 0x 20 21|82 23|84 25 26||
+// q = 0x 20 21|82 23|84 25|86|
+//
+// But keys can still be short from segment 2 to 3, and thus we are violating
+// the common segment prefix property for segment 2 (see n, p, q).
+//
+// Configuration: bytewise comp, BadVarInt0 format for segments 0-3 with
+// segment 3 also continuing to the end of the key. No short key category.
+// x = 0x 80 81|22 83||| +// y = 0x 80 81|22 83|24 85|| +// z = 0x 80 81|22 83 84|25|| +// m = 0x 80 82|22 83||| +// n = 0x 80 83|22 84|24 85|| +// +// Even though this violates the segment maximal prefix property of the +// simplified model, the common segment prefix property and segment ordering +// property are satisfied for the various segment ordinals. In broader terms, +// the usual rule of the delimiter going with the segment before it can be +// violated if every byte value below some threshold starts a segment. (This +// has not been formally verified and is not recommended.) +// +// Suppose that we are paranoid, however, and decide to place short keys +// (empty in segment 2) into an unfiltered category. This is potentially a +// dangerous decision because loss of continuity at least affects the +// ability to filter on segment 0 (common segment prefix property violated +// with i=j=0; see z, m, n; m not in category set). Thus, excluding short keys +// with categories is not a recommended solution either. class KeySegmentsExtractor { public: // The extractor assigns keys to categories so that it is easier to @@ -269,6 +461,14 @@ class KeySegmentsExtractor { Result* result) const = 0; }; +// Constructs a KeySegmentsExtractor for fixed-width key segments that safely +// handles short keys by truncating segments at the end of the input key. +// See comments on KeySegmentsExtractor for why this is much safer for +// filtering than "all or nothing" fixed-size segments. This is essentially +// a generalization of (New)CappedPrefixTransform. +std::shared_ptr +MakeSharedCappedKeySegmentsExtractor(const std::vector& byte_widths); + // Alternatives for filtering inputs // An individual key segment. @@ -305,13 +505,13 @@ struct SelectUserTimestamp {}; struct SelectColumnName {}; -struct SelectValue {}; - -// Note: more variants might be added in the future. +// NOTE: more variants might be added in the future. +// NOTE2: filtering on values is not supported because it could easily break +// overwrite semantics. (Filter out SST with newer, non-matching value but +// see obsolete value that does match.) using FilterInput = std::variant; + SelectLegacyKeyPrefix, SelectUserTimestamp, SelectColumnName>; // Base class for individual filtering schemes in terms of chosen // FilterInputs, but not tied to a particular KeySegmentsExtractor. @@ -336,6 +536,10 @@ std::shared_ptr MakeSharedBytewiseMinMaxSQFC( FilterInput select, KeySegmentsExtractor::KeyCategorySet categories = KeySegmentsExtractor::KeyCategorySet::All()); +std::shared_ptr MakeSharedReverseBytewiseMinMaxSQFC( + FilterInput select, KeySegmentsExtractor::KeyCategorySet categories = + KeySegmentsExtractor::KeyCategorySet::All()); + // TODO: more kinds of filters, eventually including Bloom/ribbon filters // and replacing the old filter configuration APIs From 1238120fe65171df7a47540eef1357de6592755d Mon Sep 17 00:00:00 2001 From: Yu Zhang Date: Wed, 18 Sep 2024 17:48:18 -0700 Subject: [PATCH 03/17] Add an option to dump wal seqno gaps (#13014) Summary: Add an option `--only_print_seqno_gaps` for wal dump to help with debugging. This option will check the continuity of sequence numbers in WAL logs, assuming `seq_per_batch` is false. `--walfile` option now also takes a directory, and it will check all WAL logs in the directory in chronological order. When a gap is found, we can further check if it's related to operations like external file ingestion. 
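For reference, the continuity condition being checked is that each write batch's starting sequence number equals the previous batch's starting sequence number plus the previous batch's entry count. Below is a minimal, illustrative C++ sketch of that check only; the `BatchInfo` struct and `FindSeqnoGaps` helper are hypothetical names for illustration and are not part of the ldb implementation:

```cpp
#include <cstdint>
#include <optional>
#include <utility>
#include <vector>

struct BatchInfo {
  uint64_t start_seqno = 0;  // sequence number of the first entry in the batch
  uint32_t count = 0;        // number of entries in the batch
};

// Returns each adjacent pair of batches between which a sequence number gap
// exists, i.e. where prev.start_seqno + prev.count != next.start_seqno.
// This mirrors the continuity check (valid only when seq_per_batch is false).
std::vector<std::pair<BatchInfo, BatchInfo>> FindSeqnoGaps(
    const std::vector<BatchInfo>& batches) {
  std::vector<std::pair<BatchInfo, BatchInfo>> gaps;
  std::optional<BatchInfo> prev;
  for (const BatchInfo& cur : batches) {
    if (prev.has_value() &&
        prev->start_seqno + prev->count != cur.start_seqno) {
      gaps.emplace_back(*prev, cur);
    }
    prev = cur;
  }
  return gaps;
}
```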
Pull Request resolved: https://github.com/facebook/rocksdb/pull/13014 Test Plan: Manually tested Reviewed By: ltamasi Differential Revision: D62989115 Pulled By: jowlyzhang fbshipit-source-id: 22e3326344e7969ff9d5091d21fec2935770fbc7 --- tools/ldb_cmd.cc | 204 +++++++++++++++++++++++++++++++++++++++---- tools/ldb_cmd_impl.h | 2 + tools/ldb_test.py | 2 +- 3 files changed, 190 insertions(+), 18 deletions(-) diff --git a/tools/ldb_cmd.cc b/tools/ldb_cmd.cc index e0b1b5471..e954e7007 100644 --- a/tools/ldb_cmd.cc +++ b/tools/ldb_cmd.cc @@ -114,11 +114,83 @@ const std::string LDBCommand::ARG_READ_TIMESTAMP = "read_timestamp"; const char* LDBCommand::DELIM = " ==> "; namespace { +// Helper class to iterate WAL logs in a directory in chronological order. +class WALFileIterator { + public: + explicit WALFileIterator(const std::string& parent_dir, + const std::vector& filenames); + // REQUIRES Valid() == true + std::string GetNextWAL(); + bool Valid() const { return wal_file_iter_ != log_files_.end(); } -void DumpWalFile(Options options, std::string wal_file, bool print_header, - bool print_values, bool is_write_committed, + private: + // WAL log file names(s) + std::string parent_dir_; + std::vector log_files_; + std::vector::const_iterator wal_file_iter_; +}; + +WALFileIterator::WALFileIterator(const std::string& parent_dir, + const std::vector& filenames) + : parent_dir_(parent_dir) { + // populate wal logs + assert(!filenames.empty()); + for (const auto& fname : filenames) { + uint64_t file_num = 0; + FileType file_type; + bool parse_ok = ParseFileName(fname, &file_num, &file_type); + if (parse_ok && file_type == kWalFile) { + log_files_.push_back(fname); + } + } + + std::sort(log_files_.begin(), log_files_.end(), + [](const std::string& lhs, const std::string& rhs) { + uint64_t num1 = 0; + uint64_t num2 = 0; + FileType type1; + FileType type2; + bool parse_ok1 = ParseFileName(lhs, &num1, &type1); + bool parse_ok2 = ParseFileName(rhs, &num2, &type2); +#ifndef NDEBUG + assert(parse_ok1); + assert(parse_ok2); +#else + (void)parse_ok1; + (void)parse_ok2; +#endif + return num1 < num2; + }); + wal_file_iter_ = log_files_.begin(); +} + +std::string WALFileIterator::GetNextWAL() { + assert(Valid()); + std::string ret; + if (wal_file_iter_ != log_files_.end()) { + ret.assign(parent_dir_); + if (ret.back() != kFilePathSeparator) { + ret.push_back(kFilePathSeparator); + } + ret.append(*wal_file_iter_); + ++wal_file_iter_; + } + return ret; +} + +void DumpWalFiles(Options options, const std::string& dir_or_file, + bool print_header, bool print_values, + bool only_print_seqno_gaps, bool is_write_committed, + const std::map& ucmps, + LDBCommandExecuteResult* exec_state); + +void DumpWalFile(Options options, const std::string& wal_file, + bool print_header, bool print_values, + bool only_print_seqno_gaps, bool is_write_committed, const std::map& ucmps, - LDBCommandExecuteResult* exec_state); + LDBCommandExecuteResult* exec_state, + std::optional* prev_batch_seqno, + std::optional* prev_batch_count); void DumpSstFile(Options options, std::string filename, bool output_hex, bool show_properties, bool decode_blob_index, @@ -2213,9 +2285,10 @@ void DBDumperCommand::DoCommand() { switch (type) { case kWalFile: // TODO(myabandeh): allow configuring is_write_commited - DumpWalFile(options_, path_, /* print_header_ */ true, - /* print_values_ */ true, true /* is_write_commited */, - ucmps_, &exec_state_); + DumpWalFiles(options_, path_, /* print_header_ */ true, + /* print_values_ */ true, + /* only_print_seqno_gaps 
*/ false, + true /* is_write_commited */, ucmps_, &exec_state_); break; case kTableFile: DumpSstFile(options_, path_, is_key_hex_, /* show_properties */ true, @@ -2842,10 +2915,70 @@ class InMemoryHandler : public WriteBatch::Handler { const std::map ucmps_; }; -void DumpWalFile(Options options, std::string wal_file, bool print_header, - bool print_values, bool is_write_committed, +void DumpWalFiles(Options options, const std::string& dir_or_file, + bool print_header, bool print_values, + bool only_print_seqno_gaps, bool is_write_committed, + const std::map& ucmps, + LDBCommandExecuteResult* exec_state) { + std::vector filenames; + ROCKSDB_NAMESPACE::Env* env = options.env; + ROCKSDB_NAMESPACE::Status st = env->GetChildren(dir_or_file, &filenames); + std::optional prev_batch_seqno; + std::optional prev_batch_count; + if (!st.ok() || filenames.empty()) { + // dir_or_file does not exist or does not contain children + // Check its existence first + Status s = env->FileExists(dir_or_file); + // dir_or_file does not exist + if (!s.ok()) { + if (exec_state) { + *exec_state = LDBCommandExecuteResult::Failed( + dir_or_file + ": No such file or directory"); + } + return; + } + // If it exists and doesn't have children, it should be a log file. + if (dir_or_file.length() <= 4 || + dir_or_file.rfind(".log") != dir_or_file.length() - 4) { + if (exec_state) { + *exec_state = LDBCommandExecuteResult::Failed( + dir_or_file + ": Invalid log file name"); + } + return; + } + DumpWalFile(options, dir_or_file, print_header, print_values, + only_print_seqno_gaps, is_write_committed, ucmps, exec_state, + &prev_batch_seqno, &prev_batch_count); + } else { + WALFileIterator wal_file_iter(dir_or_file, filenames); + if (!wal_file_iter.Valid()) { + if (exec_state) { + *exec_state = LDBCommandExecuteResult::Failed( + dir_or_file + ": No valid wal logs found"); + } + return; + } + std::string wal_file = wal_file_iter.GetNextWAL(); + while (!wal_file.empty()) { + std::cout << "Checking wal file: " << wal_file << std::endl; + DumpWalFile(options, wal_file, print_header, print_values, + only_print_seqno_gaps, is_write_committed, ucmps, exec_state, + &prev_batch_seqno, &prev_batch_count); + if (exec_state->IsFailed() || !wal_file_iter.Valid()) { + return; + } + wal_file = wal_file_iter.GetNextWAL(); + } + } +} + +void DumpWalFile(Options options, const std::string& wal_file, + bool print_header, bool print_values, + bool only_print_seqno_gaps, bool is_write_committed, const std::map& ucmps, - LDBCommandExecuteResult* exec_state) { + LDBCommandExecuteResult* exec_state, + std::optional* prev_batch_seqno, + std::optional* prev_batch_count) { const auto& fs = options.env->GetFileSystem(); FileOptions soptions(options); std::unique_ptr wal_file_reader; @@ -2948,8 +3081,32 @@ void DumpWalFile(Options options, std::string wal_file, bool print_header, break; } } - row << WriteBatchInternal::Sequence(&batch) << ","; - row << WriteBatchInternal::Count(&batch) << ","; + SequenceNumber sequence_number = WriteBatchInternal::Sequence(&batch); + uint32_t batch_count = WriteBatchInternal::Count(&batch); + assert(prev_batch_seqno); + assert(prev_batch_count); + assert(prev_batch_seqno->has_value() == prev_batch_count->has_value()); + // TODO(yuzhangyu): handle pessimistic transactions case. 
+ if (only_print_seqno_gaps) { + if (!prev_batch_seqno->has_value() || + !prev_batch_count->has_value() || + prev_batch_seqno->value() + prev_batch_count->value() == + sequence_number) { + *prev_batch_seqno = sequence_number; + *prev_batch_count = batch_count; + continue; + } else if (prev_batch_seqno->has_value() && + prev_batch_count->has_value()) { + row << "Prev batch sequence number: " << prev_batch_seqno->value() + << ", prev batch count: " << prev_batch_count->value() << ", "; + *prev_batch_seqno = sequence_number; + *prev_batch_count = batch_count; + } + } + row << sequence_number << ","; + row << batch_count << ","; + *prev_batch_seqno = sequence_number; + *prev_batch_count = batch_count; row << WriteBatchInternal::ByteSize(&batch) << ","; row << reader.LastRecordOffset() << ","; ColumnFamilyCollector cf_collector; @@ -3003,6 +3160,8 @@ const std::string WALDumperCommand::ARG_WAL_FILE = "walfile"; const std::string WALDumperCommand::ARG_WRITE_COMMITTED = "write_committed"; const std::string WALDumperCommand::ARG_PRINT_VALUE = "print_value"; const std::string WALDumperCommand::ARG_PRINT_HEADER = "header"; +const std::string WALDumperCommand::ARG_ONLY_PRINT_SEQNO_GAPS = + "only_print_seqno_gaps"; WALDumperCommand::WALDumperCommand( const std::vector& /*params*/, @@ -3010,9 +3169,11 @@ WALDumperCommand::WALDumperCommand( const std::vector& flags) : LDBCommand(options, flags, true, BuildCmdLineOptions({ARG_WAL_FILE, ARG_DB, ARG_WRITE_COMMITTED, - ARG_PRINT_HEADER, ARG_PRINT_VALUE})), + ARG_PRINT_HEADER, ARG_PRINT_VALUE, + ARG_ONLY_PRINT_SEQNO_GAPS})), print_header_(false), print_values_(false), + only_print_seqno_gaps_(false), is_write_committed_(false) { wal_file_.clear(); @@ -3023,6 +3184,7 @@ WALDumperCommand::WALDumperCommand( print_header_ = IsFlagPresent(flags, ARG_PRINT_HEADER); print_values_ = IsFlagPresent(flags, ARG_PRINT_VALUE); + only_print_seqno_gaps_ = IsFlagPresent(flags, ARG_ONLY_PRINT_SEQNO_GAPS); is_write_committed_ = ParseBooleanOption(options, ARG_WRITE_COMMITTED, true); if (wal_file_.empty()) { @@ -3038,18 +3200,22 @@ WALDumperCommand::WALDumperCommand( void WALDumperCommand::Help(std::string& ret) { ret.append(" "); ret.append(WALDumperCommand::Name()); - ret.append(" --" + ARG_WAL_FILE + "="); + ret.append(" --" + ARG_WAL_FILE + + "="); ret.append(" [--" + ARG_DB + "=]"); ret.append(" [--" + ARG_PRINT_HEADER + "] "); ret.append(" [--" + ARG_PRINT_VALUE + "] "); + ret.append(" [--" + ARG_ONLY_PRINT_SEQNO_GAPS + + "] (only correct if not using pessimistic transactions)"); ret.append(" [--" + ARG_WRITE_COMMITTED + "=true|false] "); ret.append("\n"); } void WALDumperCommand::DoCommand() { PrepareOptions(); - DumpWalFile(options_, wal_file_, print_header_, print_values_, - is_write_committed_, ucmps_, &exec_state_); + DumpWalFiles(options_, wal_file_, print_header_, print_values_, + only_print_seqno_gaps_, is_write_committed_, ucmps_, + &exec_state_); } // ---------------------------------------------------------------------------- @@ -4540,13 +4706,17 @@ void DBFileDumperCommand::DoCommand() { } else { wal_dir = NormalizePath(options_.wal_dir + "/"); } + std::optional prev_batch_seqno; + std::optional prev_batch_count; for (auto& wal : wal_files) { // TODO(qyang): option.wal_dir should be passed into ldb command std::string filename = wal_dir + wal->PathName(); std::cout << filename << std::endl; // TODO(myabandeh): allow configuring is_write_commited - DumpWalFile(options_, filename, true, true, true /* is_write_commited */, - ucmps_, &exec_state_); + DumpWalFile( 
+        options_, filename, true /* print_header */, true /* print_values */,
+        false /* only_print_seqno_gaps */, true /* is_write_commited */,
+        ucmps_, &exec_state_, &prev_batch_seqno, &prev_batch_count);
  }
 }
}
diff --git a/tools/ldb_cmd_impl.h b/tools/ldb_cmd_impl.h
index 32bf4da59..73130401e 100644
--- a/tools/ldb_cmd_impl.h
+++ b/tools/ldb_cmd_impl.h
@@ -379,6 +379,7 @@ class WALDumperCommand : public LDBCommand {
   bool print_header_;
   std::string wal_file_;
   bool print_values_;
+  bool only_print_seqno_gaps_;
   bool is_write_committed_;  // default will be set to true
   bool no_db_open_ = true;
@@ -386,6 +387,7 @@ class WALDumperCommand : public LDBCommand {
   static const std::string ARG_WRITE_COMMITTED;
   static const std::string ARG_PRINT_HEADER;
   static const std::string ARG_PRINT_VALUE;
+  static const std::string ARG_ONLY_PRINT_SEQNO_GAPS;
 };

 class GetCommand : public LDBCommand {
diff --git a/tools/ldb_test.py b/tools/ldb_test.py
index 0d6b125f0..09ab9b799 100644
--- a/tools/ldb_test.py
+++ b/tools/ldb_test.py
@@ -562,7 +562,7 @@ def testMiscAdminTask(self):
             0
             == run_err_null(
                 "./ldb dump_wal --db=%s --walfile=%s --header"
-                % (origDbPath, os.path.join(origDbPath, "LOG"))
+                % (origDbPath, origDbPath)
             )
         )
         self.assertRunOK("scan", "x1 ==> y1\nx2 ==> y2\nx3 ==> y3\nx4 ==> y4")

From 98c33cb8e39412ffdc4aa9b0ec97204dbd28acb9 Mon Sep 17 00:00:00 2001
From: Peter Dillinger
Date: Thu, 19 Sep 2024 14:05:21 -0700
Subject: [PATCH 04/17] Steps toward making IDENTITY file obsolete (#13019)

Summary:
* Set write_dbid_to_manifest=true by default
* Add new option write_identity_file (default true) that allows us to opt in to future behavior without identity file
* Refactor related DB open code to minimize code duplication

_Recommend hiding whitespace changes for review_

Intended follow-up: add support to ldb for reading and even replacing the DB identity in the manifest. Could be a variant of `update_manifest` command or based on it.
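As an illustrative usage sketch (not part of this patch), an application could opt in to the anticipated future behavior as shown below once the option is available; the database path is arbitrary and error handling is abbreviated:

```cpp
#include <iostream>
#include <string>

#include "rocksdb/db.h"
#include "rocksdb/options.h"

int main() {
  rocksdb::Options options;
  options.create_if_missing = true;
  // New default with this change: the DB ID is recorded in the MANIFEST.
  options.write_dbid_to_manifest = true;
  // Opt in to the anticipated future behavior of not writing an IDENTITY
  // file. At least one of the two options must remain true.
  options.write_identity_file = false;

  rocksdb::DB* db = nullptr;
  rocksdb::Status s = rocksdb::DB::Open(options, "/tmp/dbid_example", &db);
  if (!s.ok()) {
    std::cerr << s.ToString() << std::endl;
    return 1;
  }

  // The DB ID remains available through the public API, now read from the
  // MANIFEST rather than the IDENTITY file.
  std::string db_id;
  s = db->GetDbIdentity(db_id);
  if (s.ok()) {
    std::cout << "DB ID: " << db_id << std::endl;
  }

  delete db;
  return 0;
}
```

Note that configuring both `write_dbid_to_manifest` and `write_identity_file` as false is rejected as an invalid combination, since the DB ID has to be persisted somewhere.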
Pull Request resolved: https://github.com/facebook/rocksdb/pull/13019 Test Plan: unit tests and stress test updated for new functionality Reviewed By: anand1976 Differential Revision: D62898229 Pulled By: pdillinger fbshipit-source-id: c08b25cf790610b034e51a9de0dc78b921abbcf0 --- db/c.cc | 9 + db/c_test.c | 13 +- db/compaction/compaction_job_test.cc | 15 +- db/db_basic_test.cc | 162 ++++++++++-------- db/db_impl/db_impl.h | 9 +- db/db_impl/db_impl_files.cc | 63 +++---- db/db_impl/db_impl_open.cc | 48 ++++-- db/flush_job_test.cc | 13 +- db/version_set.cc | 1 + db/version_set_test.cc | 6 +- db_stress_tool/db_stress_common.h | 1 + db_stress_tool/db_stress_gflags.cc | 4 + db_stress_tool/db_stress_test_base.cc | 1 + include/rocksdb/c.h | 4 + include/rocksdb/options.h | 28 +-- .../test/java/org/rocksdb/DBOptionsTest.java | 4 +- .../test/java/org/rocksdb/OptionsTest.java | 4 +- .../test/java/org/rocksdb/RocksDBTest.java | 2 +- options/db_options.cc | 7 + options/db_options.h | 1 + test_util/testutil.cc | 2 + test_util/testutil.h | 2 + tools/db_crashtest.py | 7 + unreleased_history/behavior_changes/db_id.md | 1 + 24 files changed, 254 insertions(+), 153 deletions(-) create mode 100644 unreleased_history/behavior_changes/db_id.md diff --git a/db/c.cc b/db/c.cc index 0858568af..9b6db666d 100644 --- a/db/c.cc +++ b/db/c.cc @@ -4075,6 +4075,15 @@ void rocksdb_options_set_write_dbid_to_manifest( opt->rep.write_dbid_to_manifest = write_dbid_to_manifest; } +unsigned char rocksdb_options_get_write_identity_file(rocksdb_options_t* opt) { + return opt->rep.write_identity_file; +} + +void rocksdb_options_set_write_identity_file( + rocksdb_options_t* opt, unsigned char write_identity_file) { + opt->rep.write_identity_file = write_identity_file; +} + unsigned char rocksdb_options_get_track_and_verify_wals_in_manifest( rocksdb_options_t* opt) { return opt->rep.track_and_verify_wals_in_manifest; diff --git a/db/c_test.c b/db/c_test.c index ebf39cb0a..eb9886ba4 100644 --- a/db/c_test.c +++ b/db/c_test.c @@ -772,6 +772,8 @@ int main(int argc, char** argv) { rocksdb_options_set_write_buffer_size(options, 100000); rocksdb_options_set_paranoid_checks(options, 1); rocksdb_options_set_max_open_files(options, 10); + /* Compatibility with how test was written */ + rocksdb_options_set_write_dbid_to_manifest(options, 0); table_options = rocksdb_block_based_options_create(); rocksdb_block_based_options_set_block_cache(table_options, cache); @@ -962,15 +964,24 @@ int main(int argc, char** argv) { rocksdb_options_t* options_dbid_in_manifest = rocksdb_options_create(); rocksdb_options_set_create_if_missing(options_dbid_in_manifest, 1); + rocksdb_options_set_write_dbid_to_manifest(options_dbid_in_manifest, false); unsigned char write_to_manifest = rocksdb_options_get_write_dbid_to_manifest(options_dbid_in_manifest); CheckCondition(!write_to_manifest); rocksdb_options_set_write_dbid_to_manifest(options_dbid_in_manifest, true); - CheckCondition(!write_to_manifest); write_to_manifest = rocksdb_options_get_write_dbid_to_manifest(options_dbid_in_manifest); CheckCondition(write_to_manifest); + rocksdb_options_set_write_identity_file(options_dbid_in_manifest, true); + unsigned char write_identity_file = + rocksdb_options_get_write_identity_file(options_dbid_in_manifest); + CheckCondition(write_identity_file); + rocksdb_options_set_write_identity_file(options_dbid_in_manifest, false); + write_identity_file = + rocksdb_options_get_write_identity_file(options_dbid_in_manifest); + CheckCondition(!write_identity_file); + db = 
rocksdb_open(options_dbid_in_manifest, dbbackupname, &err); CheckNoError(err); diff --git a/db/compaction/compaction_job_test.cc b/db/compaction/compaction_job_test.cc index bbc0fe4cf..8e85a9f96 100644 --- a/db/compaction/compaction_job_test.cc +++ b/db/compaction/compaction_job_test.cc @@ -545,15 +545,14 @@ class CompactionJobTestBase : public testing::Test { ASSERT_OK(s); db_options_.info_log = info_log; - versions_.reset(new VersionSet( - dbname_, &db_options_, env_options_, table_cache_.get(), - &write_buffer_manager_, &write_controller_, - /*block_cache_tracer=*/nullptr, /*io_tracer=*/nullptr, - /*db_id=*/"", /*db_session_id=*/"", /*daily_offpeak_time_utc=*/"", - /*error_handler=*/nullptr, /*read_only=*/false)); + versions_.reset( + new VersionSet(dbname_, &db_options_, env_options_, table_cache_.get(), + &write_buffer_manager_, &write_controller_, + /*block_cache_tracer=*/nullptr, /*io_tracer=*/nullptr, + test::kUnitTestDbId, /*db_session_id=*/"", + /*daily_offpeak_time_utc=*/"", + /*error_handler=*/nullptr, /*read_only=*/false)); compaction_job_stats_.Reset(); - ASSERT_OK( - SetIdentityFile(WriteOptions(), env_, dbname_, Temperature::kUnknown)); VersionEdit new_db; new_db.SetLogNumber(0); diff --git a/db/db_basic_test.cc b/db/db_basic_test.cc index bb8a132ae..ef497c114 100644 --- a/db/db_basic_test.cc +++ b/db/db_basic_test.cc @@ -688,76 +688,100 @@ TEST_F(DBBasicTest, IdentityAcrossRestarts) { constexpr size_t kMinIdSize = 10; do { for (bool with_manifest : {false, true}) { - std::string idfilename = IdentityFileName(dbname_); - std::string id1, tmp; - ASSERT_OK(db_->GetDbIdentity(id1)); - ASSERT_GE(id1.size(), kMinIdSize); - - Options options = CurrentOptions(); - options.write_dbid_to_manifest = with_manifest; - Reopen(options); - std::string id2; - ASSERT_OK(db_->GetDbIdentity(id2)); - // id2 should match id1 because identity was not regenerated - ASSERT_EQ(id1, id2); - ASSERT_OK(ReadFileToString(env_, idfilename, &tmp)); - ASSERT_EQ(tmp, id2); - - // Recover from deleted/missing IDENTITY - ASSERT_OK(env_->DeleteFile(idfilename)); - Reopen(options); - std::string id3; - ASSERT_OK(db_->GetDbIdentity(id3)); - if (with_manifest) { - // id3 should match id1 because identity was restored from manifest - ASSERT_EQ(id1, id3); - } else { - // id3 should NOT match id1 because identity was regenerated - ASSERT_NE(id1, id3); - ASSERT_GE(id3.size(), kMinIdSize); - } - ASSERT_OK(ReadFileToString(env_, idfilename, &tmp)); - ASSERT_EQ(tmp, id3); - - // Recover from truncated IDENTITY - { - std::unique_ptr w; - ASSERT_OK(env_->NewWritableFile(idfilename, &w, EnvOptions())); - ASSERT_OK(w->Close()); - } - Reopen(options); - std::string id4; - ASSERT_OK(db_->GetDbIdentity(id4)); - if (with_manifest) { - // id4 should match id1 because identity was restored from manifest - ASSERT_EQ(id1, id4); - } else { - // id4 should NOT match id1 because identity was regenerated - ASSERT_NE(id1, id4); - ASSERT_GE(id4.size(), kMinIdSize); - } - ASSERT_OK(ReadFileToString(env_, idfilename, &tmp)); - ASSERT_EQ(tmp, id4); - - // Recover from overwritten IDENTITY - std::string silly_id = "asdf123456789"; - { - std::unique_ptr w; - ASSERT_OK(env_->NewWritableFile(idfilename, &w, EnvOptions())); - ASSERT_OK(w->Append(silly_id)); - ASSERT_OK(w->Close()); - } - Reopen(options); - std::string id5; - ASSERT_OK(db_->GetDbIdentity(id5)); - if (with_manifest) { - // id4 should match id1 because identity was restored from manifest - ASSERT_EQ(id1, id5); - } else { - ASSERT_EQ(id5, silly_id); + for (bool write_file : {false, 
true}) { + std::string idfilename = IdentityFileName(dbname_); + std::string id1, tmp; + ASSERT_OK(db_->GetDbIdentity(id1)); + ASSERT_GE(id1.size(), kMinIdSize); + + Options options = CurrentOptions(); + options.write_dbid_to_manifest = with_manifest; + options.write_identity_file = true; // initially + Reopen(options); + std::string id2; + ASSERT_OK(db_->GetDbIdentity(id2)); + // id2 should match id1 because identity was not regenerated + ASSERT_EQ(id1, id2); + ASSERT_OK(ReadFileToString(env_, idfilename, &tmp)); + ASSERT_EQ(tmp, id2); + + if (write_file) { + // Recover from deleted/missing IDENTITY + ASSERT_OK(env_->DeleteFile(idfilename)); + } else { + // Transition to no IDENTITY file + options.write_identity_file = false; + if (!with_manifest) { + // Incompatible options, should fail + ASSERT_NOK(TryReopen(options)); + // Back to a usable config and continue + options.write_identity_file = true; + Reopen(options); + continue; + } + } + Reopen(options); + std::string id3; + ASSERT_OK(db_->GetDbIdentity(id3)); + if (with_manifest) { + // id3 should match id1 because identity was restored from manifest + ASSERT_EQ(id1, id3); + } else { + // id3 should NOT match id1 because identity was regenerated + ASSERT_NE(id1, id3); + ASSERT_GE(id3.size(), kMinIdSize); + } + if (write_file) { + ASSERT_OK(ReadFileToString(env_, idfilename, &tmp)); + ASSERT_EQ(tmp, id3); + + // Recover from truncated IDENTITY + std::unique_ptr w; + ASSERT_OK(env_->NewWritableFile(idfilename, &w, EnvOptions())); + ASSERT_OK(w->Close()); + } else { + ASSERT_TRUE(env_->FileExists(idfilename).IsNotFound()); + } + Reopen(options); + std::string id4; + ASSERT_OK(db_->GetDbIdentity(id4)); + if (with_manifest) { + // id4 should match id1 because identity was restored from manifest + ASSERT_EQ(id1, id4); + } else { + // id4 should NOT match id1 because identity was regenerated + ASSERT_NE(id1, id4); + ASSERT_GE(id4.size(), kMinIdSize); + } + std::string silly_id = "asdf123456789"; + if (write_file) { + ASSERT_OK(ReadFileToString(env_, idfilename, &tmp)); + ASSERT_EQ(tmp, id4); + + // Recover from overwritten IDENTITY + std::unique_ptr w; + ASSERT_OK(env_->NewWritableFile(idfilename, &w, EnvOptions())); + ASSERT_OK(w->Append(silly_id)); + ASSERT_OK(w->Close()); + } else { + ASSERT_TRUE(env_->FileExists(idfilename).IsNotFound()); + } + Reopen(options); + std::string id5; + ASSERT_OK(db_->GetDbIdentity(id5)); + if (with_manifest) { + // id4 should match id1 because identity was restored from manifest + ASSERT_EQ(id1, id5); + } else { + ASSERT_EQ(id5, silly_id); + } + if (write_file) { + ASSERT_OK(ReadFileToString(env_, idfilename, &tmp)); + ASSERT_EQ(tmp, id5); + } else { + ASSERT_TRUE(env_->FileExists(idfilename).IsNotFound()); + } } - ASSERT_OK(ReadFileToString(env_, idfilename, &tmp)); - ASSERT_EQ(tmp, id5); } } while (ChangeCompactOptions()); } diff --git a/db/db_impl/db_impl.h b/db/db_impl/db_impl.h index 66b324825..218c1851e 100644 --- a/db/db_impl/db_impl.h +++ b/db/db_impl/db_impl.h @@ -1584,11 +1584,12 @@ class DBImpl : public DB { virtual bool OwnTablesAndLogs() const { return true; } - // Setup DB identity file, and write DB ID to manifest if necessary. + // Read/create DB identity file (as appropriate), and write DB ID to + // version_edit if provided. Status SetupDBId(const WriteOptions& write_options, bool read_only, - RecoveryContext* recovery_ctx); - // Assign db_id_ and write DB ID to manifest if necessary. 
- void SetDBId(std::string&& id, bool read_only, RecoveryContext* recovery_ctx); + bool is_new_db, VersionEdit* version_edit); + // Assign db_id_ and write DB ID to version_edit if provided. + void SetDBId(std::string&& id, bool read_only, VersionEdit* version_edit); // Collect a deduplicated collection of paths used by this DB, including // dbname_, DBOptions.db_paths, ColumnFamilyOptions.cf_paths. diff --git a/db/db_impl/db_impl_files.cc b/db/db_impl/db_impl_files.cc index bb0ff3985..3812e1fcb 100644 --- a/db/db_impl/db_impl_files.cc +++ b/db/db_impl/db_impl_files.cc @@ -953,59 +953,60 @@ uint64_t PrecomputeMinLogNumberToKeep2PC( } void DBImpl::SetDBId(std::string&& id, bool read_only, - RecoveryContext* recovery_ctx) { + VersionEdit* version_edit) { assert(db_id_.empty()); assert(!id.empty()); db_id_ = std::move(id); - if (!read_only && immutable_db_options_.write_dbid_to_manifest) { - assert(recovery_ctx != nullptr); + if (!read_only && version_edit) { + assert(version_edit != nullptr); assert(versions_->GetColumnFamilySet() != nullptr); - VersionEdit edit; - edit.SetDBId(db_id_); + version_edit->SetDBId(db_id_); versions_->db_id_ = db_id_; - recovery_ctx->UpdateVersionEdits( - versions_->GetColumnFamilySet()->GetDefault(), edit); } } Status DBImpl::SetupDBId(const WriteOptions& write_options, bool read_only, - RecoveryContext* recovery_ctx) { + bool is_new_db, VersionEdit* version_edit) { Status s; - // Check for the IDENTITY file and create it if not there or - // broken or not matching manifest - std::string db_id_in_file; - s = fs_->FileExists(IdentityFileName(dbname_), IOOptions(), nullptr); - if (s.ok()) { - s = GetDbIdentityFromIdentityFile(&db_id_in_file); - if (s.ok() && !db_id_in_file.empty()) { - if (db_id_.empty()) { - // Loaded from file and wasn't already known from manifest - SetDBId(std::move(db_id_in_file), read_only, recovery_ctx); - return s; - } else if (db_id_ == db_id_in_file) { - // Loaded from file and matches manifest - return s; + if (!is_new_db) { + // Check for the IDENTITY file and create it if not there or + // broken or not matching manifest + std::string db_id_in_file; + s = fs_->FileExists(IdentityFileName(dbname_), IOOptions(), nullptr); + if (s.ok()) { + s = GetDbIdentityFromIdentityFile(&db_id_in_file); + if (s.ok() && !db_id_in_file.empty()) { + if (db_id_.empty()) { + // Loaded from file and wasn't already known from manifest + SetDBId(std::move(db_id_in_file), read_only, version_edit); + return s; + } else if (db_id_ == db_id_in_file) { + // Loaded from file and matches manifest + return s; + } } } - } - if (s.IsNotFound()) { - s = Status::OK(); - } - if (!s.ok()) { - assert(s.IsIOError()); - return s; + if (s.IsNotFound()) { + s = Status::OK(); + } + if (!s.ok()) { + assert(s.IsIOError()); + return s; + } } // Otherwise IDENTITY file is missing or no good. 
// Generate new id if needed if (db_id_.empty()) { - SetDBId(env_->GenerateUniqueId(), read_only, recovery_ctx); + SetDBId(env_->GenerateUniqueId(), read_only, version_edit); } // Persist it to IDENTITY file if allowed - if (!read_only) { + if (!read_only && immutable_db_options_.write_identity_file) { s = SetIdentityFile(write_options, env_, dbname_, immutable_db_options_.metadata_write_temperature, db_id_); } + // NOTE: an obsolete IDENTITY file with write_identity_file=false is handled + // elsewhere, so that it's only deleted after successful recovery return s; } diff --git a/db/db_impl/db_impl_open.cc b/db/db_impl/db_impl_open.cc index 4ce2e82e8..4fb100876 100644 --- a/db/db_impl/db_impl_open.cc +++ b/db/db_impl/db_impl_open.cc @@ -289,28 +289,25 @@ Status DBImpl::ValidateOptions(const DBOptions& db_options) { "start_time and end_time cannot be the same"); } } + + if (!db_options.write_dbid_to_manifest && !db_options.write_identity_file) { + return Status::InvalidArgument( + "write_dbid_to_manifest and write_identity_file cannot both be false"); + } return Status::OK(); } Status DBImpl::NewDB(std::vector* new_filenames) { - VersionEdit new_db; + VersionEdit new_db_edit; const WriteOptions write_options(Env::IOActivity::kDBOpen); - Status s = SetIdentityFile(write_options, env_, dbname_, - immutable_db_options_.metadata_write_temperature); + Status s = SetupDBId(write_options, /*read_only=*/false, /*is_new_db=*/true, + &new_db_edit); if (!s.ok()) { return s; } - if (immutable_db_options_.write_dbid_to_manifest) { - std::string temp_db_id; - s = GetDbIdentityFromIdentityFile(&temp_db_id); - if (!s.ok()) { - return s; - } - new_db.SetDBId(temp_db_id); - } - new_db.SetLogNumber(0); - new_db.SetNextFile(2); - new_db.SetLastSequence(0); + new_db_edit.SetLogNumber(0); + new_db_edit.SetNextFile(2); + new_db_edit.SetLastSequence(0); ROCKS_LOG_INFO(immutable_db_options_.info_log, "Creating manifest 1 \n"); const std::string manifest = DescriptorFileName(dbname_, 1); @@ -342,7 +339,7 @@ Status DBImpl::NewDB(std::vector* new_filenames) { tmp_set.Contains(FileType::kDescriptorFile))); log::Writer log(std::move(file_writer), 0, false); std::string record; - new_db.EncodeTo(&record); + new_db_edit.EncodeTo(&record); s = log.AddRecord(write_options, record); if (s.ok()) { s = SyncManifest(&immutable_db_options_, write_options, log.file()); @@ -528,7 +525,7 @@ Status DBImpl::Recover( } assert(s.ok()); } - assert(db_id_.empty()); + assert(is_new_db || db_id_.empty()); Status s; bool missing_table_file = false; if (!immutable_db_options_.best_efforts_recovery) { @@ -674,7 +671,17 @@ Status DBImpl::Recover( } } } - s = SetupDBId(write_options, read_only, recovery_ctx); + if (is_new_db) { + // Already set up DB ID in NewDB + } else if (immutable_db_options_.write_dbid_to_manifest && recovery_ctx) { + VersionEdit edit; + s = SetupDBId(write_options, read_only, is_new_db, &edit); + recovery_ctx->UpdateVersionEdits( + versions_->GetColumnFamilySet()->GetDefault(), edit); + } else { + s = SetupDBId(write_options, read_only, is_new_db, nullptr); + } + assert(!s.ok() || !db_id_.empty()); ROCKS_LOG_INFO(immutable_db_options_.info_log, "DB ID: %s\n", db_id_.c_str()); if (s.ok() && !read_only) { s = MaybeUpdateNextFileNumber(recovery_ctx); @@ -2132,6 +2139,13 @@ Status DBImpl::Open(const DBOptions& db_options, const std::string& dbname, s = impl->LogAndApplyForRecovery(recovery_ctx); } + if (s.ok() && !impl->immutable_db_options_.write_identity_file) { + // On successful recovery, delete an obsolete IDENTITY file 
to avoid DB ID + // inconsistency + impl->env_->DeleteFile(IdentityFileName(impl->dbname_)) + .PermitUncheckedError(); + } + if (s.ok() && impl->immutable_db_options_.persist_stats_to_disk) { impl->mutex_.AssertHeld(); s = impl->InitPersistStatsColumnFamily(); diff --git a/db/flush_job_test.cc b/db/flush_job_test.cc index d407e4815..ce21ec648 100644 --- a/db/flush_job_test.cc +++ b/db/flush_job_test.cc @@ -142,12 +142,13 @@ class FlushJobTestBase : public testing::Test { column_families.emplace_back(cf_name, cf_options_); } - versions_.reset(new VersionSet( - dbname_, &db_options_, env_options_, table_cache_.get(), - &write_buffer_manager_, &write_controller_, - /*block_cache_tracer=*/nullptr, /*io_tracer=*/nullptr, - /*db_id=*/"", /*db_session_id=*/"", /*daily_offpeak_time_utc=*/"", - /*error_handler=*/nullptr, /*read_only=*/false)); + versions_.reset( + new VersionSet(dbname_, &db_options_, env_options_, table_cache_.get(), + &write_buffer_manager_, &write_controller_, + /*block_cache_tracer=*/nullptr, /*io_tracer=*/nullptr, + test::kUnitTestDbId, /*db_session_id=*/"", + /*daily_offpeak_time_utc=*/"", + /*error_handler=*/nullptr, /*read_only=*/false)); EXPECT_OK(versions_->Recover(column_families, false)); } diff --git a/db/version_set.cc b/db/version_set.cc index c2fa7aad7..c42a50cba 100644 --- a/db/version_set.cc +++ b/db/version_set.cc @@ -5143,6 +5143,7 @@ VersionSet::VersionSet( fs_(_db_options->fs, io_tracer), clock_(_db_options->clock), dbname_(dbname), + db_id_(db_id), db_options_(_db_options), next_file_number_(2), manifest_file_number_(0), // Filled by Recover() diff --git a/db/version_set_test.cc b/db/version_set_test.cc index 4f3665fba..4bc9806e2 100644 --- a/db/version_set_test.cc +++ b/db/version_set_test.cc @@ -1282,6 +1282,8 @@ class VersionSetTestBase { assert(column_families != nullptr); assert(last_seqno != nullptr); assert(log_writer != nullptr); + ASSERT_OK( + SetIdentityFile(WriteOptions(), env_, dbname_, Temperature::kUnknown)); VersionEdit new_db; if (db_options_.write_dbid_to_manifest) { DBOptions tmp_db_options; @@ -1426,8 +1428,6 @@ class VersionSetTestBase { void NewDB() { SequenceNumber last_seqno; std::unique_ptr log_writer; - ASSERT_OK( - SetIdentityFile(WriteOptions(), env_, dbname_, Temperature::kUnknown)); PrepareManifest(&column_families_, &last_seqno, &log_writer); log_writer.reset(); CreateCurrentFile(); @@ -3890,6 +3890,8 @@ class VersionSetTestMissingFiles : public VersionSetTestBase, assert(column_families != nullptr); assert(last_seqno != nullptr); assert(log_writer != nullptr); + ASSERT_OK( + SetIdentityFile(WriteOptions(), env_, dbname_, Temperature::kUnknown)); const std::string manifest = DescriptorFileName(dbname_, 1); const auto& fs = env_->GetFileSystem(); std::unique_ptr file_writer; diff --git a/db_stress_tool/db_stress_common.h b/db_stress_tool/db_stress_common.h index 67f82808f..8ae432cc7 100644 --- a/db_stress_tool/db_stress_common.h +++ b/db_stress_tool/db_stress_common.h @@ -260,6 +260,7 @@ DECLARE_bool(use_full_merge_v1); DECLARE_int32(sync_wal_one_in); DECLARE_bool(avoid_unnecessary_blocking_io); DECLARE_bool(write_dbid_to_manifest); +DECLARE_bool(write_identity_file); DECLARE_bool(avoid_flush_during_recovery); DECLARE_uint64(max_write_batch_group_size_bytes); DECLARE_bool(level_compaction_dynamic_level_bytes); diff --git a/db_stress_tool/db_stress_gflags.cc b/db_stress_tool/db_stress_gflags.cc index bb2d9d453..c23a163a8 100644 --- a/db_stress_tool/db_stress_gflags.cc +++ b/db_stress_tool/db_stress_gflags.cc @@ -993,6 +993,10 @@ 
DEFINE_bool(write_dbid_to_manifest, ROCKSDB_NAMESPACE::Options().write_dbid_to_manifest, "Write DB_ID to manifest"); +DEFINE_bool(write_identity_file, + ROCKSDB_NAMESPACE::Options().write_identity_file, + "Write DB_ID to IDENTITY file"); + DEFINE_bool(avoid_flush_during_recovery, ROCKSDB_NAMESPACE::Options().avoid_flush_during_recovery, "Avoid flush during recovery"); diff --git a/db_stress_tool/db_stress_test_base.cc b/db_stress_tool/db_stress_test_base.cc index b8ab0cc4f..5d4839df8 100644 --- a/db_stress_tool/db_stress_test_base.cc +++ b/db_stress_tool/db_stress_test_base.cc @@ -4044,6 +4044,7 @@ void InitializeOptionsFromFlags( options.manual_wal_flush = FLAGS_manual_wal_flush_one_in > 0 ? true : false; options.avoid_unnecessary_blocking_io = FLAGS_avoid_unnecessary_blocking_io; options.write_dbid_to_manifest = FLAGS_write_dbid_to_manifest; + options.write_identity_file = FLAGS_write_identity_file; options.avoid_flush_during_recovery = FLAGS_avoid_flush_during_recovery; options.max_write_batch_group_size_bytes = FLAGS_max_write_batch_group_size_bytes; diff --git a/include/rocksdb/c.h b/include/rocksdb/c.h index 9f91f7643..ed403a6ea 100644 --- a/include/rocksdb/c.h +++ b/include/rocksdb/c.h @@ -1644,6 +1644,10 @@ extern ROCKSDB_LIBRARY_API unsigned char rocksdb_options_get_write_dbid_to_manifest(rocksdb_options_t*); extern ROCKSDB_LIBRARY_API void rocksdb_options_set_write_dbid_to_manifest( rocksdb_options_t*, unsigned char); +extern ROCKSDB_LIBRARY_API unsigned char +rocksdb_options_get_write_identity_file(rocksdb_options_t*); +extern ROCKSDB_LIBRARY_API void rocksdb_options_set_write_identity_file( + rocksdb_options_t*, unsigned char); extern ROCKSDB_LIBRARY_API unsigned char rocksdb_options_get_track_and_verify_wals_in_manifest(rocksdb_options_t*); diff --git a/include/rocksdb/options.h b/include/rocksdb/options.h index fb272663b..507f9bab8 100644 --- a/include/rocksdb/options.h +++ b/include/rocksdb/options.h @@ -1380,16 +1380,24 @@ struct DBOptions { // ReadOptions::background_purge_on_iterator_cleanup. bool avoid_unnecessary_blocking_io = false; - // Historically DB ID has always been stored in Identity File in DB folder. - // If this flag is true, the DB ID is written to Manifest file in addition - // to the Identity file. By doing this 2 problems are solved - // 1. We don't checksum the Identity file where as Manifest file is. - // 2. Since the source of truth for DB is Manifest file DB ID will sit with - // the source of truth. Previously the Identity file could be copied - // independent of Manifest and that can result in wrong DB ID. - // We recommend setting this flag to true. - // Default: false - bool write_dbid_to_manifest = false; + // The DB unique ID can be saved in the DB manifest (preferred, this option) + // or an IDENTITY file (historical, deprecated), or both. If this option is + // set to false (old behavior), then write_identity_file must be set to true. + // The manifest is preferred because + // 1. The IDENTITY file is not checksummed, so it is not as safe against + // corruption. + // 2. The IDENTITY file may or may not be copied with the DB (e.g. not + // copied by BackupEngine), so is not reliable for the provenance of a DB. + // This option might eventually be obsolete and removed as Identity files + // are phased out. + bool write_dbid_to_manifest = true; + + // It is expected that the Identity file will be obsoleted by recording + // DB ID in the manifest (see write_dbid_to_manifest). 
Setting this to true + // maintains the historical behavior of writing an Identity file, while + // setting to false is expected to be the future default. This option might + // eventually be obsolete and removed as Identity files are phased out. + bool write_identity_file = true; // The number of bytes to prefetch when reading the log. This is mostly useful // for reading a remotely located log, as it can save the number of diff --git a/java/src/test/java/org/rocksdb/DBOptionsTest.java b/java/src/test/java/org/rocksdb/DBOptionsTest.java index 189acdb4a..bf71704cb 100644 --- a/java/src/test/java/org/rocksdb/DBOptionsTest.java +++ b/java/src/test/java/org/rocksdb/DBOptionsTest.java @@ -793,9 +793,9 @@ public void persistStatsToDisk() { @Test public void writeDbidToManifest() { try (final DBOptions options = new DBOptions()) { - assertThat(options.writeDbidToManifest()).isEqualTo(false); - assertThat(options.setWriteDbidToManifest(true)).isEqualTo(options); assertThat(options.writeDbidToManifest()).isEqualTo(true); + assertThat(options.setWriteDbidToManifest(false)).isEqualTo(options); + assertThat(options.writeDbidToManifest()).isEqualTo(false); } } diff --git a/java/src/test/java/org/rocksdb/OptionsTest.java b/java/src/test/java/org/rocksdb/OptionsTest.java index 9d2316737..38f8243fd 100644 --- a/java/src/test/java/org/rocksdb/OptionsTest.java +++ b/java/src/test/java/org/rocksdb/OptionsTest.java @@ -1379,9 +1379,9 @@ public void persistStatsToDisk() { @Test public void writeDbidToManifest() { try (final Options options = new Options()) { - assertThat(options.writeDbidToManifest()).isEqualTo(false); - assertThat(options.setWriteDbidToManifest(true)).isEqualTo(options); assertThat(options.writeDbidToManifest()).isEqualTo(true); + assertThat(options.setWriteDbidToManifest(false)).isEqualTo(options); + assertThat(options.writeDbidToManifest()).isEqualTo(false); } } diff --git a/java/src/test/java/org/rocksdb/RocksDBTest.java b/java/src/test/java/org/rocksdb/RocksDBTest.java index 1459f03b0..42bb4ab3c 100644 --- a/java/src/test/java/org/rocksdb/RocksDBTest.java +++ b/java/src/test/java/org/rocksdb/RocksDBTest.java @@ -1641,7 +1641,7 @@ public void getLiveFiles() throws RocksDBException { try (final RocksDB db = RocksDB.open(options, dbPath)) { final RocksDB.LiveFiles livefiles = db.getLiveFiles(true); assertThat(livefiles).isNotNull(); - assertThat(livefiles.manifestFileSize).isEqualTo(70); + assertThat(livefiles.manifestFileSize).isEqualTo(116); assertThat(livefiles.files.size()).isEqualTo(3); assertThat(livefiles.files.get(0)).isEqualTo("/CURRENT"); assertThat(livefiles.files.get(1)).isEqualTo("/MANIFEST-000005"); diff --git a/options/db_options.cc b/options/db_options.cc index 2678bb5a7..0b880f4b9 100644 --- a/options/db_options.cc +++ b/options/db_options.cc @@ -407,6 +407,10 @@ static std::unordered_map {offsetof(struct ImmutableDBOptions, write_dbid_to_manifest), OptionType::kBoolean, OptionVerificationType::kNormal, OptionTypeFlags::kNone}}, + {"write_identity_file", + {offsetof(struct ImmutableDBOptions, write_identity_file), + OptionType::kBoolean, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, {"log_readahead_size", {offsetof(struct ImmutableDBOptions, log_readahead_size), OptionType::kSizeT, OptionVerificationType::kNormal, @@ -772,6 +776,7 @@ ImmutableDBOptions::ImmutableDBOptions(const DBOptions& options) avoid_unnecessary_blocking_io(options.avoid_unnecessary_blocking_io), persist_stats_to_disk(options.persist_stats_to_disk), 
write_dbid_to_manifest(options.write_dbid_to_manifest), + write_identity_file(options.write_identity_file), log_readahead_size(options.log_readahead_size), file_checksum_gen_factory(options.file_checksum_gen_factory), best_efforts_recovery(options.best_efforts_recovery), @@ -947,6 +952,8 @@ void ImmutableDBOptions::Dump(Logger* log) const { persist_stats_to_disk); ROCKS_LOG_HEADER(log, " Options.write_dbid_to_manifest: %d", write_dbid_to_manifest); + ROCKS_LOG_HEADER(log, " Options.write_identity_file: %d", + write_identity_file); ROCKS_LOG_HEADER( log, " Options.log_readahead_size: %" ROCKSDB_PRIszt, log_readahead_size); diff --git a/options/db_options.h b/options/db_options.h index 7e0752626..842fefca8 100644 --- a/options/db_options.h +++ b/options/db_options.h @@ -89,6 +89,7 @@ struct ImmutableDBOptions { bool avoid_unnecessary_blocking_io; bool persist_stats_to_disk; bool write_dbid_to_manifest; + bool write_identity_file; size_t log_readahead_size; std::shared_ptr file_checksum_gen_factory; bool best_efforts_recovery; diff --git a/test_util/testutil.cc b/test_util/testutil.cc index 40531586a..59a07e4a2 100644 --- a/test_util/testutil.cc +++ b/test_util/testutil.cc @@ -773,4 +773,6 @@ void RegisterTestLibrary(const std::string& arg) { ObjectRegistry::Default()->AddLibrary("test", RegisterTestObjects, arg); } } + +const std::string kUnitTestDbId = "UnitTest"; } // namespace ROCKSDB_NAMESPACE::test diff --git a/test_util/testutil.h b/test_util/testutil.h index 9e0499cb3..2d693b5f2 100644 --- a/test_util/testutil.h +++ b/test_util/testutil.h @@ -904,5 +904,7 @@ struct ReadOptionsNoIo : public ReadOptions { }; extern const ReadOptionsNoIo kReadOptionsNoIo; +extern const std::string kUnitTestDbId; + } // namespace test } // namespace ROCKSDB_NAMESPACE diff --git a/tools/db_crashtest.py b/tools/db_crashtest.py index 038e5fd53..f88c0be19 100644 --- a/tools/db_crashtest.py +++ b/tools/db_crashtest.py @@ -209,6 +209,7 @@ "use_write_buffer_manager": lambda: random.randint(0, 1), "avoid_unnecessary_blocking_io": random.randint(0, 1), "write_dbid_to_manifest": random.randint(0, 1), + "write_identity_file": random.randint(0, 1), "avoid_flush_during_recovery": lambda: random.choice( [1 if t == 0 else 0 for t in range(0, 8)] ), @@ -959,6 +960,12 @@ def finalize_and_sanitize(src_params): and dest_params.get("use_timed_put_one_in") == 1 ): dest_params["use_timed_put_one_in"] = 3 + if ( + dest_params.get("write_dbid_to_manifest") == 0 + and dest_params.get("write_identity_file") == 0 + ): + # At least one must be true + dest_params["write_dbid_to_manifest"] = 1 return dest_params diff --git a/unreleased_history/behavior_changes/db_id.md b/unreleased_history/behavior_changes/db_id.md new file mode 100644 index 000000000..5536f3ce5 --- /dev/null +++ b/unreleased_history/behavior_changes/db_id.md @@ -0,0 +1 @@ +* Set `write_dbid_to_manifest=true` by default. This means DB ID will now be preserved through backups, checkpoints, etc. by default. Also add `write_identity_file` option which can be set to false for anticipated future behavior. 
From 54ace7f34083090dcefa51c0fd9381436ccb8fa0 Mon Sep 17 00:00:00 2001
From: Levi Tamasi
Date: Thu, 19 Sep 2024 15:47:13 -0700
Subject: [PATCH 05/17] Change the semantics of blob_garbage_collection_force_threshold to provide better control over space amp (#13022)

Summary:
Pull Request resolved: https://github.com/facebook/rocksdb/pull/13022

Currently, `blob_garbage_collection_force_threshold` applies to the oldest batch of blob files, which is typically only a small subset of the blob files currently eligible for garbage collection. This can result in a form of head-of-line blocking: no GC-triggered compactions will be scheduled if the oldest batch does not currently exceed the threshold, even if a lot of higher-numbered blob files do. This can in turn lead to high space amplification that exceeds the soft bound implicit in the force threshold (e.g. 50% would suggest a space amp of <2 and 75% would imply a space amp of <4).

The patch changes the semantics of this configuration threshold to apply to the entire set of blob files that are eligible for garbage collection based on `blob_garbage_collection_age_cutoff`. This provides more intuitive semantics for the option and can provide a better write amp/space amp trade-off. (Note that GC-triggered compactions still pick the same SST files as before, so triggered GC still targets the oldest blob files.)

Reviewed By: jowlyzhang

Differential Revision: D62977860

fbshipit-source-id: a999f31fe9cdda313de513f0e7a6fc707424d4a3
---
 db/version_set.cc | 39 +++-------
 db/version_set_test.cc | 78 +------------------
 db_stress_tool/db_stress_gflags.cc | 2 +-
 include/rocksdb/advanced_options.h | 13 ++--
 ...edMutableColumnFamilyOptionsInterface.java | 9 +--
 .../java/org/rocksdb/ColumnFamilyOptions.java | 9 +--
 tools/db_bench_tool.cc | 2 +-
 ...blob_garbage_collection_force_threshold.md | 1 +
 8 files changed, 32 insertions(+), 121 deletions(-)
 create mode 100644 unreleased_history/behavior_changes/blob_garbage_collection_force_threshold.md

diff --git a/db/version_set.cc b/db/version_set.cc
index c42a50cba..d28b2e2d9 100644
--- a/db/version_set.cc
+++ b/db/version_set.cc
@@ -3772,14 +3772,17 @@ void VersionStorageInfo::ComputeFilesMarkedForForcedBlobGC(
     return;
   }
-  // Compute the sum of total and garbage bytes over the oldest batch of blob
-  // files. The oldest batch is defined as the set of blob files which are
-  // kept alive by the same SSTs as the very oldest one. Here is a toy example.
-  // Let's assume we have three SSTs 1, 2, and 3, and four blob files 10, 11,
-  // 12, and 13. Also, let's say SSTs 1 and 2 both rely on blob file 10 and
-  // potentially some higher-numbered ones, while SST 3 relies on blob file 12
-  // and potentially some higher-numbered ones. Then, the SST to oldest blob
-  // file mapping is as follows:
+  // Compute the sum of total and garbage bytes over the batch of blob files
+  // currently eligible for garbage collection based on
+  // blob_garbage_collection_age_cutoff, and if the garbage ratio exceeds
+  // blob_garbage_collection_force_threshold, schedule compaction for the
+  // SST files that reference the oldest batch of blob files. Here is a toy
+  // example. Let's assume we have three SSTs 1, 2, and 3, and four blob files
+  // 10, 11, 12, and 13, which correspond to the range that is eligible for GC
+  // and satisfy the garbage ratio threshold.
Also, let's say SSTs 1 and 2 both + // rely on blob file 10 and potentially some higher-numbered ones, while SST 3 + // relies on blob file 12 and potentially some higher-numbered ones. Then, the + // SST to oldest blob file mapping is as follows: // // SST file number Oldest blob file number // 1 10 @@ -3797,11 +3800,6 @@ void VersionStorageInfo::ComputeFilesMarkedForForcedBlobGC( // // Then, the oldest batch of blob files consists of blob files 10 and 11, // and we can get rid of them by forcing the compaction of SSTs 1 and 2. - // - // Note that the overall ratio of garbage computed for the batch has to exceed - // blob_garbage_collection_force_threshold and the entire batch has to be - // eligible for GC according to blob_garbage_collection_age_cutoff in order - // for us to schedule any compactions. const auto& oldest_meta = blob_files_.front(); assert(oldest_meta); @@ -3818,25 +3816,10 @@ void VersionStorageInfo::ComputeFilesMarkedForForcedBlobGC( const auto& meta = blob_files_[count]; assert(meta); - if (!meta->GetLinkedSsts().empty()) { - // Found the beginning of the next batch of blob files - break; - } - sum_total_blob_bytes += meta->GetTotalBlobBytes(); sum_garbage_blob_bytes += meta->GetGarbageBlobBytes(); } - if (count < blob_files_.size()) { - const auto& meta = blob_files_[count]; - assert(meta); - - if (meta->GetLinkedSsts().empty()) { - // Some files in the oldest batch are not eligible for GC - return; - } - } - if (sum_garbage_blob_bytes < blob_garbage_collection_force_threshold * sum_total_blob_bytes) { return; diff --git a/db/version_set_test.cc b/db/version_set_test.cc index 4bc9806e2..a483ccf0e 100644 --- a/db/version_set_test.cc +++ b/db/version_set_test.cc @@ -727,20 +727,7 @@ TEST_F(VersionStorageInfoTest, ForcedBlobGCSingleBatch) { ASSERT_TRUE(vstorage_.FilesMarkedForForcedBlobGC().empty()); } - // Part of the oldest batch of blob files (specifically, #12 and #13) is - // ineligible for GC due to the age cutoff - - { - constexpr double age_cutoff = 0.5; - constexpr double force_threshold = 0.0; - vstorage_.ComputeFilesMarkedForForcedBlobGC( - age_cutoff, force_threshold, /*enable_blob_garbage_collection=*/true); - - ASSERT_TRUE(vstorage_.FilesMarkedForForcedBlobGC().empty()); - } - - // Oldest batch is eligible based on age cutoff but its overall garbage ratio - // is below threshold + // Overall garbage ratio of eligible files is below threshold { constexpr double age_cutoff = 1.0; @@ -751,8 +738,7 @@ TEST_F(VersionStorageInfoTest, ForcedBlobGCSingleBatch) { ASSERT_TRUE(vstorage_.FilesMarkedForForcedBlobGC().empty()); } - // Oldest batch is eligible based on age cutoff and its overall garbage ratio - // meets threshold + // Overall garbage ratio of eligible files meets threshold { constexpr double age_cutoff = 1.0; @@ -878,20 +864,7 @@ TEST_F(VersionStorageInfoTest, ForcedBlobGCMultipleBatches) { ASSERT_TRUE(vstorage_.FilesMarkedForForcedBlobGC().empty()); } - // Part of the oldest batch of blob files (specifically, the second file) is - // ineligible for GC due to the age cutoff - - { - constexpr double age_cutoff = 0.25; - constexpr double force_threshold = 0.0; - vstorage_.ComputeFilesMarkedForForcedBlobGC( - age_cutoff, force_threshold, /*enable_blob_garbage_collection=*/true); - - ASSERT_TRUE(vstorage_.FilesMarkedForForcedBlobGC().empty()); - } - - // Oldest batch is eligible based on age cutoff but its overall garbage ratio - // is below threshold + // Overall garbage ratio of eligible files is below threshold { constexpr double age_cutoff = 0.5; @@ 
-902,8 +875,7 @@ TEST_F(VersionStorageInfoTest, ForcedBlobGCMultipleBatches) { ASSERT_TRUE(vstorage_.FilesMarkedForForcedBlobGC().empty()); } - // Oldest batch is eligible based on age cutoff and its overall garbage ratio - // meets threshold + // Overall garbage ratio of eligible files meets threshold { constexpr double age_cutoff = 0.5; @@ -929,48 +901,6 @@ TEST_F(VersionStorageInfoTest, ForcedBlobGCMultipleBatches) { ASSERT_EQ(ssts_to_be_compacted[0], expected_ssts_to_be_compacted[0]); ASSERT_EQ(ssts_to_be_compacted[1], expected_ssts_to_be_compacted[1]); } - - // Now try the last two cases again with a greater than necessary age cutoff - - // Oldest batch is eligible based on age cutoff but its overall garbage ratio - // is below threshold - - { - constexpr double age_cutoff = 0.75; - constexpr double force_threshold = 0.6; - vstorage_.ComputeFilesMarkedForForcedBlobGC( - age_cutoff, force_threshold, /*enable_blob_garbage_collection=*/true); - - ASSERT_TRUE(vstorage_.FilesMarkedForForcedBlobGC().empty()); - } - - // Oldest batch is eligible based on age cutoff and its overall garbage ratio - // meets threshold - - { - constexpr double age_cutoff = 0.75; - constexpr double force_threshold = 0.5; - vstorage_.ComputeFilesMarkedForForcedBlobGC( - age_cutoff, force_threshold, /*enable_blob_garbage_collection=*/true); - - auto ssts_to_be_compacted = vstorage_.FilesMarkedForForcedBlobGC(); - ASSERT_EQ(ssts_to_be_compacted.size(), 2); - - std::sort(ssts_to_be_compacted.begin(), ssts_to_be_compacted.end(), - [](const std::pair& lhs, - const std::pair& rhs) { - assert(lhs.second); - assert(rhs.second); - return lhs.second->fd.GetNumber() < rhs.second->fd.GetNumber(); - }); - - const autovector> - expected_ssts_to_be_compacted{{level, level_files[0]}, - {level, level_files[1]}}; - - ASSERT_EQ(ssts_to_be_compacted[0], expected_ssts_to_be_compacted[0]); - ASSERT_EQ(ssts_to_be_compacted[1], expected_ssts_to_be_compacted[1]); - } } class VersionStorageInfoTimestampTest : public VersionStorageInfoTestBase { diff --git a/db_stress_tool/db_stress_gflags.cc b/db_stress_tool/db_stress_gflags.cc index c23a163a8..9f165cf97 100644 --- a/db_stress_tool/db_stress_gflags.cc +++ b/db_stress_tool/db_stress_gflags.cc @@ -497,7 +497,7 @@ DEFINE_double(blob_garbage_collection_force_threshold, ROCKSDB_NAMESPACE::AdvancedColumnFamilyOptions() .blob_garbage_collection_force_threshold, "[Integrated BlobDB] The threshold for the ratio of garbage in " - "the oldest blob files for forcing garbage collection."); + "the eligible blob files for forcing garbage collection."); DEFINE_uint64(blob_compaction_readahead_size, ROCKSDB_NAMESPACE::AdvancedColumnFamilyOptions() diff --git a/include/rocksdb/advanced_options.h b/include/rocksdb/advanced_options.h index 309d0c510..0805e2aab 100644 --- a/include/rocksdb/advanced_options.h +++ b/include/rocksdb/advanced_options.h @@ -929,13 +929,12 @@ struct AdvancedColumnFamilyOptions { // Dynamically changeable through the SetOptions() API double blob_garbage_collection_age_cutoff = 0.25; - // If the ratio of garbage in the oldest blob files exceeds this threshold, - // targeted compactions are scheduled in order to force garbage collecting - // the blob files in question, assuming they are all eligible based on the - // value of blob_garbage_collection_age_cutoff above. This option is - // currently only supported with leveled compactions. - // Note that enable_blob_garbage_collection has to be set in order for this - // option to have any effect. 
+ // If the ratio of garbage in the blob files currently eligible for garbage + // collection exceeds this threshold, targeted compactions are scheduled in + // order to force garbage collecting the oldest blob files. This option is + // currently only supported with leveled compactions. Note that + // enable_blob_garbage_collection has to be set in order for this option to + // have any effect. // // Default: 1.0 // diff --git a/java/src/main/java/org/rocksdb/AdvancedMutableColumnFamilyOptionsInterface.java b/java/src/main/java/org/rocksdb/AdvancedMutableColumnFamilyOptionsInterface.java index c8fc84173..44e61c6d7 100644 --- a/java/src/main/java/org/rocksdb/AdvancedMutableColumnFamilyOptionsInterface.java +++ b/java/src/main/java/org/rocksdb/AdvancedMutableColumnFamilyOptionsInterface.java @@ -733,11 +733,10 @@ T setReportBgIoStats( double blobGarbageCollectionAgeCutoff(); /** - * If the ratio of garbage in the oldest blob files exceeds this threshold, - * targeted compactions are scheduled in order to force garbage collecting - * the blob files in question, assuming they are all eligible based on the - * value of {@link #blobGarbageCollectionAgeCutoff} above. This option is - * currently only supported with leveled compactions. + * If the ratio of garbage in the blob files currently eligible for garbage + * collection exceeds this threshold, targeted compactions are scheduled in + * order to force garbage collecting the oldest blob files. This option is + * currently only supported with leveled compactions. *
<p>
* Note that {@link #enableBlobGarbageCollection} has to be set in order for this * option to have any effect. diff --git a/java/src/main/java/org/rocksdb/ColumnFamilyOptions.java b/java/src/main/java/org/rocksdb/ColumnFamilyOptions.java index bb458078c..3af4d2a8e 100644 --- a/java/src/main/java/org/rocksdb/ColumnFamilyOptions.java +++ b/java/src/main/java/org/rocksdb/ColumnFamilyOptions.java @@ -1204,11 +1204,10 @@ public double blobGarbageCollectionAgeCutoff() { } /** - * If the ratio of garbage in the oldest blob files exceeds this threshold, - * targeted compactions are scheduled in order to force garbage collecting - * the blob files in question, assuming they are all eligible based on the - * value of {@link #blobGarbageCollectionAgeCutoff} above. This option is - * currently only supported with leveled compactions. + * If the ratio of garbage in the blob files currently eligible for garbage + * collection exceeds this threshold, targeted compactions are scheduled in + * order to force garbage collecting the oldest blob files. This option is + * currently only supported with leveled compactions. *
<p>
* Note that {@link #enableBlobGarbageCollection} has to be set in order for this * option to have any effect. diff --git a/tools/db_bench_tool.cc b/tools/db_bench_tool.cc index d51dbf30a..ebb558351 100644 --- a/tools/db_bench_tool.cc +++ b/tools/db_bench_tool.cc @@ -1104,7 +1104,7 @@ DEFINE_double(blob_garbage_collection_force_threshold, ROCKSDB_NAMESPACE::AdvancedColumnFamilyOptions() .blob_garbage_collection_force_threshold, "[Integrated BlobDB] The threshold for the ratio of garbage in " - "the oldest blob files for forcing garbage collection."); + "the eligible blob files for forcing garbage collection."); DEFINE_uint64(blob_compaction_readahead_size, ROCKSDB_NAMESPACE::AdvancedColumnFamilyOptions() diff --git a/unreleased_history/behavior_changes/blob_garbage_collection_force_threshold.md b/unreleased_history/behavior_changes/blob_garbage_collection_force_threshold.md new file mode 100644 index 000000000..0c4b8bba2 --- /dev/null +++ b/unreleased_history/behavior_changes/blob_garbage_collection_force_threshold.md @@ -0,0 +1 @@ +Changed the semantics of the BlobDB configuration option `blob_garbage_collection_force_threshold` to define a threshold for the overall garbage ratio of all blob files currently eligible for garbage collection (according to `blob_garbage_collection_age_cutoff`). This can provide better control over space amplification at the cost of slightly higher write amplification. From 71e38dbe25253dc19e0ca0214ee8244d6731ac76 Mon Sep 17 00:00:00 2001 From: Changyu Bi Date: Thu, 19 Sep 2024 15:50:41 -0700 Subject: [PATCH 06/17] Compact one file at a time for FIFO temperature change compactions (#13018) Summary: Per customer request, we should not merge multiple SST files together during temperature change compaction, since this can cause FIFO TTL compactions to be delayed. This PR changes the compaction picking logic to pick one file at a time. Pull Request resolved: https://github.com/facebook/rocksdb/pull/13018 Test Plan: * updated some existing unit tests to test this new behavior. 
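For reference, a minimal sketch (not part of this patch) of the configuration this change affects. The initializer follows the `{temperature, age}` form used in the unit tests; the ages below are made-up values:

```
#include "rocksdb/options.h"
using namespace ROCKSDB_NAMESPACE;

Options options;
options.compaction_style = kCompactionStyleFIFO;
// Compact files whose data is older than 1 hour to kWarm, and files older
// than 1 day to kCold. With this change, qualifying files are picked for
// temperature-change compaction one at a time, oldest first, instead of
// being merged together.
options.compaction_options_fifo.file_temperature_age_thresholds = {
    {Temperature::kWarm, 60 * 60}, {Temperature::kCold, 24 * 60 * 60}};
```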
Reviewed By: jowlyzhang Differential Revision: D62883292 Pulled By: cbi42 fbshipit-source-id: 6a9fc8c296b5d9b17168ef6645f25153241c8b93 --- db/compaction/compaction_picker_fifo.cc | 47 ++++++--------- db/compaction/compaction_picker_fifo.h | 3 +- db/compaction/compaction_picker_test.cc | 57 +++---------------- db/db_compaction_test.cc | 5 +- include/rocksdb/advanced_options.h | 5 +- .../behavior_changes/fifo-temp-compaction.md | 1 + 6 files changed, 33 insertions(+), 85 deletions(-) create mode 100644 unreleased_history/behavior_changes/fifo-temp-compaction.md diff --git a/db/compaction/compaction_picker_fifo.cc b/db/compaction/compaction_picker_fifo.cc index d898b5126..29f4462d4 100644 --- a/db/compaction/compaction_picker_fifo.cc +++ b/db/compaction/compaction_picker_fifo.cc @@ -294,7 +294,7 @@ Compaction* FIFOCompactionPicker::PickSizeCompaction( Compaction* FIFOCompactionPicker::PickTemperatureChangeCompaction( const std::string& cf_name, const MutableCFOptions& mutable_cf_options, const MutableDBOptions& mutable_db_options, VersionStorageInfo* vstorage, - LogBuffer* log_buffer) { + LogBuffer* log_buffer) const { const std::vector& ages = mutable_cf_options.compaction_options_fifo .file_temperature_age_thresholds; @@ -344,12 +344,10 @@ Compaction* FIFOCompactionPicker::PickTemperatureChangeCompaction( Temperature compaction_target_temp = Temperature::kLastTemperature; if (current_time > min_age) { uint64_t create_time_threshold = current_time - min_age; - uint64_t compaction_size = 0; // We will ideally identify a file qualifying for temperature change by // knowing the timestamp for the youngest entry in the file. However, right // now we don't have the information. We infer it by looking at timestamp of // the previous file's (which is just younger) oldest entry's timestamp. 
- Temperature cur_target_temp; // avoid index underflow assert(level_files.size() >= 1); for (size_t index = level_files.size() - 1; index >= 1; --index) { @@ -374,7 +372,7 @@ Compaction* FIFOCompactionPicker::PickTemperatureChangeCompaction( // cur_file is too fresh break; } - cur_target_temp = ages[0].temperature; + Temperature cur_target_temp = ages[0].temperature; for (size_t i = 1; i < ages.size(); ++i) { if (current_time >= ages[i].age && oldest_ancestor_time <= current_time - ages[i].age) { @@ -382,35 +380,20 @@ Compaction* FIFOCompactionPicker::PickTemperatureChangeCompaction( } } if (cur_file->temperature == cur_target_temp) { - if (inputs[0].empty()) { - continue; - } else { - break; - } + continue; } // cur_file needs to change temperature - if (compaction_target_temp == Temperature::kLastTemperature) { - assert(inputs[0].empty()); - compaction_target_temp = cur_target_temp; - } else if (cur_target_temp != compaction_target_temp) { - assert(!inputs[0].empty()); - break; - } - if (inputs[0].empty() || compaction_size + cur_file->fd.GetFileSize() <= - mutable_cf_options.max_compaction_bytes) { - inputs[0].files.push_back(cur_file); - compaction_size += cur_file->fd.GetFileSize(); - ROCKS_LOG_BUFFER( - log_buffer, - "[%s] FIFO compaction: picking file %" PRIu64 - " with next file's oldest time %" PRIu64 " for temperature %s.", - cf_name.c_str(), cur_file->fd.GetNumber(), oldest_ancestor_time, - temperature_to_string[cur_target_temp].c_str()); - } - if (compaction_size > mutable_cf_options.max_compaction_bytes) { - break; - } + assert(compaction_target_temp == Temperature::kLastTemperature); + compaction_target_temp = cur_target_temp; + inputs[0].files.push_back(cur_file); + ROCKS_LOG_BUFFER( + log_buffer, + "[%s] FIFO compaction: picking file %" PRIu64 + " with next file's oldest time %" PRIu64 " for temperature %s.", + cf_name.c_str(), cur_file->fd.GetNumber(), oldest_ancestor_time, + temperature_to_string[cur_target_temp].c_str()); + break; } } @@ -418,7 +401,9 @@ Compaction* FIFOCompactionPicker::PickTemperatureChangeCompaction( return nullptr; } assert(compaction_target_temp != Temperature::kLastTemperature); - + // Only compact one file at a time. + assert(inputs.size() == 1); + assert(inputs[0].size() == 1); Compaction* c = new Compaction( vstorage, ioptions_, mutable_cf_options, mutable_db_options, std::move(inputs), 0, 0 /* output file size limit */, diff --git a/db/compaction/compaction_picker_fifo.h b/db/compaction/compaction_picker_fifo.h index 5badc491c..cd7e56e8b 100644 --- a/db/compaction/compaction_picker_fifo.h +++ b/db/compaction/compaction_picker_fifo.h @@ -53,9 +53,10 @@ class FIFOCompactionPicker : public CompactionPicker { VersionStorageInfo* version, LogBuffer* log_buffer); + // Will pick one file to compact at a time, starting from the oldest file. 
Compaction* PickTemperatureChangeCompaction( const std::string& cf_name, const MutableCFOptions& mutable_cf_options, const MutableDBOptions& mutable_db_options, VersionStorageInfo* vstorage, - LogBuffer* log_buffer); + LogBuffer* log_buffer) const; }; } // namespace ROCKSDB_NAMESPACE diff --git a/db/compaction/compaction_picker_test.cc b/db/compaction/compaction_picker_test.cc index b58f5d010..bec8e9182 100644 --- a/db/compaction/compaction_picker_test.cc +++ b/db/compaction/compaction_picker_test.cc @@ -1119,48 +1119,6 @@ TEST_F(CompactionPickerTest, FIFOToCold1) { ASSERT_EQ(3U, compaction->input(0, 0)->fd.GetNumber()); } -TEST_F(CompactionPickerTest, FIFOToCold2) { - NewVersionStorage(1, kCompactionStyleFIFO); - const uint64_t kFileSize = 100000; - const uint64_t kMaxSize = kFileSize * 100000; - uint64_t kColdThreshold = 2000; - - fifo_options_.max_table_files_size = kMaxSize; - fifo_options_.file_temperature_age_thresholds = { - {Temperature::kCold, kColdThreshold}}; - mutable_cf_options_.compaction_options_fifo = fifo_options_; - mutable_cf_options_.level0_file_num_compaction_trigger = 100; - mutable_cf_options_.max_compaction_bytes = kFileSize * 100; - FIFOCompactionPicker fifo_compaction_picker(ioptions_, &icmp_); - - int64_t current_time = 0; - ASSERT_OK(Env::Default()->GetCurrentTime(¤t_time)); - uint64_t threshold_time = - static_cast(current_time) - kColdThreshold; - Add(0, 6U, "240", "290", 2 * kFileSize, 0, 2900, 3000, 0, true, - Temperature::kUnknown, static_cast(current_time) - 100); - Add(0, 4U, "260", "300", 1 * kFileSize, 0, 2500, 2600, 0, true, - Temperature::kUnknown, threshold_time); - // The following two files qualify for compaction to kCold. - Add(0, 3U, "200", "300", 4 * kFileSize, 0, 2300, 2400, 0, true, - Temperature::kUnknown, threshold_time - 3000); - Add(0, 2U, "200", "300", 4 * kFileSize, 0, 2100, 2200, 0, true, - Temperature::kUnknown, threshold_time - 4000); - UpdateVersionStorageInfo(); - - ASSERT_EQ(fifo_compaction_picker.NeedsCompaction(vstorage_.get()), true); - std::unique_ptr compaction(fifo_compaction_picker.PickCompaction( - cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(), - &log_buffer_)); - ASSERT_TRUE(compaction.get() != nullptr); - ASSERT_EQ(compaction->compaction_reason(), - CompactionReason::kChangeTemperature); - ASSERT_EQ(compaction->output_temperature(), Temperature::kCold); - ASSERT_EQ(2U, compaction->num_input_files(0)); - ASSERT_EQ(2U, compaction->input(0, 0)->fd.GetNumber()); - ASSERT_EQ(3U, compaction->input(0, 1)->fd.GetNumber()); -} - TEST_F(CompactionPickerTest, FIFOToColdMaxCompactionSize) { NewVersionStorage(1, kCompactionStyleFIFO); const uint64_t kFileSize = 100000; @@ -1202,10 +1160,10 @@ TEST_F(CompactionPickerTest, FIFOToColdMaxCompactionSize) { ASSERT_TRUE(compaction.get() != nullptr); ASSERT_EQ(compaction->compaction_reason(), CompactionReason::kChangeTemperature); + // Compaction picker picks older files first and picks one file at a time. 
ASSERT_EQ(compaction->output_temperature(), Temperature::kCold); - ASSERT_EQ(2U, compaction->num_input_files(0)); + ASSERT_EQ(1U, compaction->num_input_files(0)); ASSERT_EQ(1U, compaction->input(0, 0)->fd.GetNumber()); - ASSERT_EQ(2U, compaction->input(0, 1)->fd.GetNumber()); } TEST_F(CompactionPickerTest, FIFOToColdWithExistingCold) { @@ -1248,10 +1206,10 @@ TEST_F(CompactionPickerTest, FIFOToColdWithExistingCold) { ASSERT_TRUE(compaction.get() != nullptr); ASSERT_EQ(compaction->compaction_reason(), CompactionReason::kChangeTemperature); + // Compaction picker picks older files first and picks one file at a time. ASSERT_EQ(compaction->output_temperature(), Temperature::kCold); + ASSERT_EQ(1U, compaction->num_input_files(0)); ASSERT_EQ(2U, compaction->input(0, 0)->fd.GetNumber()); - ASSERT_EQ(2U, compaction->num_input_files(0)); - ASSERT_EQ(3U, compaction->input(0, 1)->fd.GetNumber()); } TEST_F(CompactionPickerTest, FIFOToColdWithHotBetweenCold) { @@ -1299,7 +1257,7 @@ TEST_F(CompactionPickerTest, FIFOToColdWithHotBetweenCold) { ASSERT_EQ(2U, compaction->input(0, 0)->fd.GetNumber()); } -TEST_F(CompactionPickerTest, FIFOToColdAndWarm) { +TEST_F(CompactionPickerTest, FIFOToHotAndWarm) { NewVersionStorage(1, kCompactionStyleFIFO); const uint64_t kFileSize = 100000; const uint64_t kMaxSize = kFileSize * 100000; @@ -1344,11 +1302,10 @@ TEST_F(CompactionPickerTest, FIFOToColdAndWarm) { ASSERT_TRUE(compaction.get() != nullptr); ASSERT_EQ(compaction->compaction_reason(), CompactionReason::kChangeTemperature); - // Assumes compaction picker picks older files first. + // Compaction picker picks older files first and picks one file at a time. ASSERT_EQ(compaction->output_temperature(), Temperature::kWarm); - ASSERT_EQ(2U, compaction->num_input_files(0)); + ASSERT_EQ(1U, compaction->num_input_files(0)); ASSERT_EQ(1U, compaction->input(0, 0)->fd.GetNumber()); - ASSERT_EQ(2U, compaction->input(0, 1)->fd.GetNumber()); } TEST_F(CompactionPickerTest, CompactionPriMinOverlapping1) { diff --git a/db/db_compaction_test.cc b/db/db_compaction_test.cc index ed918f0b9..2a8e19e7b 100644 --- a/db/db_compaction_test.cc +++ b/db/db_compaction_test.cc @@ -9357,12 +9357,13 @@ TEST_F(DBCompactionTest, FIFOChangeTemperature) { ASSERT_OK(Flush()); ASSERT_OK(Put(Key(0), "value1")); - env_->MockSleepForSeconds(800); ASSERT_OK(Put(Key(2), "value2")); ASSERT_OK(Flush()); + // First two L0 files both become eligible for temperature change compaction + // They should be compacted one-by-one. ASSERT_OK(Put(Key(0), "value1")); - env_->MockSleepForSeconds(800); + env_->MockSleepForSeconds(1200); ASSERT_OK(Put(Key(2), "value2")); ASSERT_OK(Flush()); ASSERT_OK(dbfull()->TEST_WaitForCompact()); diff --git a/include/rocksdb/advanced_options.h b/include/rocksdb/advanced_options.h index 0805e2aab..aaaf1100c 100644 --- a/include/rocksdb/advanced_options.h +++ b/include/rocksdb/advanced_options.h @@ -89,7 +89,10 @@ struct CompactionOptionsFIFO { // Age (in seconds) threshold for different file temperatures. // When not empty, each element specifies an age threshold `age` and a // temperature such that if all the data in a file is older than `age`, - // RocksDB will compact the file to the specified `temperature`. + // RocksDB will compact the file to the specified `temperature`. Oldest file + // will be considered first. Only one file is compacted at a time, + // so multiple files qualifying to be compacted to be same temperature + // won't be merged together. // // Note: // - Flushed files will always have temperature kUnknown. 
diff --git a/unreleased_history/behavior_changes/fifo-temp-compaction.md b/unreleased_history/behavior_changes/fifo-temp-compaction.md new file mode 100644 index 000000000..7ba5f643d --- /dev/null +++ b/unreleased_history/behavior_changes/fifo-temp-compaction.md @@ -0,0 +1 @@ +* In FIFO compaction, compactions for changing file temperature (configured by option `file_temperature_age_thresholds`) will compact one file at a time, instead of merging multiple eligible file together (#13018). \ No newline at end of file From 6549b117148f01a5c35287d5de9754183dd3b781 Mon Sep 17 00:00:00 2001 From: anand76 Date: Fri, 20 Sep 2024 12:13:19 -0700 Subject: [PATCH 07/17] Make Cache a customizable class (#13024) Summary: This PR allows a Cache object to be created using the object registry. Pull Request resolved: https://github.com/facebook/rocksdb/pull/13024 Reviewed By: pdillinger Differential Revision: D63043233 Pulled By: anand1976 fbshipit-source-id: 5bc3f7c29b35ad62638ff8205451303e2cecea9d --- cache/cache.cc | 26 +++++++++------- include/rocksdb/advanced_cache.h | 4 ++- options/customizable_test.cc | 31 +++++++++++++++++++ tools/db_bench_tool.cc | 11 ++++++- .../new_features/customizable_cache.md | 1 + 5 files changed, 60 insertions(+), 13 deletions(-) create mode 100644 unreleased_history/new_features/customizable_cache.md diff --git a/cache/cache.cc b/cache/cache.cc index 3dbea128e..cf5febb70 100644 --- a/cache/cache.cc +++ b/cache/cache.cc @@ -133,19 +133,23 @@ Status Cache::CreateFromString(const ConfigOptions& config_options, std::shared_ptr* result) { Status status; std::shared_ptr cache; - if (value.find('=') == std::string::npos) { - cache = NewLRUCache(ParseSizeT(value)); - } else { - LRUCacheOptions cache_opts; - status = OptionTypeInfo::ParseStruct(config_options, "", - &lru_cache_options_type_info, "", - value, &cache_opts); + if (value.find("://") == std::string::npos) { + if (value.find('=') == std::string::npos) { + cache = NewLRUCache(ParseSizeT(value)); + } else { + LRUCacheOptions cache_opts; + status = OptionTypeInfo::ParseStruct(config_options, "", + &lru_cache_options_type_info, "", + value, &cache_opts); + if (status.ok()) { + cache = NewLRUCache(cache_opts); + } + } if (status.ok()) { - cache = NewLRUCache(cache_opts); + result->swap(cache); } - } - if (status.ok()) { - result->swap(cache); + } else { + status = LoadSharedObject(config_options, value, result); } return status; } diff --git a/include/rocksdb/advanced_cache.h b/include/rocksdb/advanced_cache.h index ab9f722a0..d8eeb7d2e 100644 --- a/include/rocksdb/advanced_cache.h +++ b/include/rocksdb/advanced_cache.h @@ -41,7 +41,7 @@ class Statistics; // // INTERNAL: See typed_cache.h for convenient wrappers on top of this API. // New virtual functions must also be added to CacheWrapper below. -class Cache { +class Cache : public Customizable { public: // types hidden from API client // Opaque handle to an entry stored in the cache. struct Handle {}; @@ -190,6 +190,8 @@ class Cache { // Destroys all remaining entries by calling the associated "deleter" virtual ~Cache() {} + static const char* Type() { return "Cache"; } + // Creates a new Cache based on the input value string and returns the result. 
// Currently, this method can be used to create LRUCaches only // @param config_options diff --git a/options/customizable_test.cc b/options/customizable_test.cc index 696f1b25e..1ec2564a4 100644 --- a/options/customizable_test.cc +++ b/options/customizable_test.cc @@ -1396,6 +1396,19 @@ class MockFilterPolicy : public FilterPolicy { } }; +class MockCache : public CacheWrapper { + public: + static const char* kClassName() { return "MockCache"; } + const char* Name() const override { return kClassName(); } + + MockCache() + : CacheWrapper(NewLRUCache(LRUCacheOptions(100, 0, false, 0.0))) {} + + bool IsInstanceOf(const std::string& name) const override { + return name.find(Name()) == 0; + } +}; + static int RegisterLocalObjects(ObjectLibrary& library, const std::string& /*arg*/) { size_t num_types; @@ -1519,6 +1532,15 @@ static int RegisterLocalObjects(ObjectLibrary& library, return guard->get(); }); + library.AddFactory( + ObjectLibrary::PatternEntry(MockCache::kClassName()) + .AddSeparator("://", /*at_least_one=*/false), + [](const std::string& /*uri*/, std::unique_ptr* guard, + std::string* /* errmsg */) { + guard->reset(new MockCache()); + return guard->get(); + }); + return static_cast(library.GetFactoryCount(&num_types)); } } // namespace @@ -2111,6 +2133,15 @@ TEST_F(LoadCustomizableTest, LoadFlushBlockPolicyFactoryTest) { } } +TEST_F(LoadCustomizableTest, LoadCacheTest) { + if (RegisterTests("Test")) { + std::string uri(MockCache::kClassName()); + uri.append("://"); + auto cache = ExpectCreateShared(uri); + ASSERT_TRUE(cache->IsInstanceOf(MockCache::kClassName())); + } +} + } // namespace ROCKSDB_NAMESPACE int main(int argc, char** argv) { ::testing::InitGoogleTest(&argc, argv); diff --git a/tools/db_bench_tool.cc b/tools/db_bench_tool.cc index ebb558351..713aaaa41 100644 --- a/tools/db_bench_tool.cc +++ b/tools/db_bench_tool.cc @@ -1753,6 +1753,7 @@ DEFINE_bool(read_with_latest_user_timestamp, true, "If true, always use the current latest timestamp for read. If " "false, choose a random timestamp from the past."); +DEFINE_string(cache_uri, "", "Full URI for creating a custom cache object"); DEFINE_string(secondary_cache_uri, "", "Full URI for creating a custom secondary cache object"); static class std::shared_ptr secondary_cache; @@ -3138,7 +3139,15 @@ class Benchmark { } std::shared_ptr block_cache; - if (FLAGS_cache_type == "clock_cache") { + if (!FLAGS_cache_uri.empty()) { + Status s = Cache::CreateFromString(ConfigOptions(), FLAGS_cache_uri, + &block_cache); + if (block_cache == nullptr) { + fprintf(stderr, "No cache registered matching string: %s status=%s\n", + FLAGS_cache_uri.c_str(), s.ToString().c_str()); + exit(1); + } + } else if (FLAGS_cache_type == "clock_cache") { fprintf(stderr, "Old clock cache implementation has been removed.\n"); exit(1); } else if (EndsWith(FLAGS_cache_type, "hyper_clock_cache")) { diff --git a/unreleased_history/new_features/customizable_cache.md b/unreleased_history/new_features/customizable_cache.md new file mode 100644 index 000000000..b50d03c24 --- /dev/null +++ b/unreleased_history/new_features/customizable_cache.md @@ -0,0 +1 @@ +Make Cache a customizable class that can be instantiated by the object registry. 
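For illustration, a hedged sketch (not from the patch) of using this interface; "MyCache" stands in for a hypothetical Cache implementation registered with the ObjectRegistry, while the non-URI form relies on the existing LRUCacheOptions string parsing kept in cache.cc:

```
#include "rocksdb/cache.h"
#include "rocksdb/convenience.h"  // ConfigOptions
using namespace ROCKSDB_NAMESPACE;

ConfigOptions config_options;
std::shared_ptr<Cache> cache;
// An LRUCacheOptions string (or a plain size) still creates an LRUCache:
Status s = Cache::CreateFromString(
    config_options, "capacity=8388608;num_shard_bits=4", &cache);
// A value containing "://" is now routed to LoadSharedObject(), so a custom
// Cache registered with the object registry can be instantiated by name:
s = Cache::CreateFromString(config_options, "MyCache://", &cache);
```

The same kind of string can be passed to db_bench via the new -cache_uri flag.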
From 5f4a8c3da41327e9844d8d14793d34a0ec3453b9 Mon Sep 17 00:00:00 2001 From: Jay Huh Date: Fri, 20 Sep 2024 13:26:02 -0700 Subject: [PATCH 08/17] Load latest options from OPTIONS file in Remote host (#13025) Summary: We've been serializing and deserializing DBOptions and CFOptions (and other CF info) as part of `CompactionServiceInput`. These are all readily available in the OPTIONS file and the remote worker can read the OPTIONS file to obtain the same information. This helps reduce the size of the payload significantly. In a very rare scenario, if the OPTIONS file is purged due to an options change by the primary host while the remote host is loading the latest options, the load may fail. In this case, we just retry once. This also solves the problem where we had to open the default CF with the CFOptions from another CF if the remote compaction is for a non-default column family. (TODO comment in /db_impl_secondary.cc) Pull Request resolved: https://github.com/facebook/rocksdb/pull/13025 Test Plan: Unit Tests ``` ./compaction_service_test ``` ``` ./compaction_job_test ``` Also tested with Meta's internal Offload Infra Reviewed By: anand1976, cbi42 Differential Revision: D63100109 Pulled By: jaykorean fbshipit-source-id: b7162695e31e2c5a920daa7f432842163a5b156d --- db/compaction/compaction_job.h | 7 +- db/compaction/compaction_job_test.cc | 16 +--- db/compaction/compaction_service_job.cc | 68 ++++---------- db/db_impl/db_impl_secondary.cc | 113 ++++++++++++++++-------- include/rocksdb/options.h | 3 - 5 files changed, 94 insertions(+), 113 deletions(-) diff --git a/db/compaction/compaction_job.h b/db/compaction/compaction_job.h index 224f4e46f..dd3b53737 100644 --- a/db/compaction/compaction_job.h +++ b/db/compaction/compaction_job.h @@ -377,9 +377,7 @@ class CompactionJob { // doesn't contain the LSM tree information, which is passed though MANIFEST // file.
struct CompactionServiceInput { - ColumnFamilyDescriptor column_family; - - DBOptions db_options; + std::string cf_name; std::vector snapshots; @@ -402,9 +400,6 @@ struct CompactionServiceInput { static Status Read(const std::string& data_str, CompactionServiceInput* obj); Status Write(std::string* output); - // Initialize a dummy ColumnFamilyDescriptor - CompactionServiceInput() : column_family("", ColumnFamilyOptions()) {} - #ifndef NDEBUG bool TEST_Equals(CompactionServiceInput* other); bool TEST_Equals(CompactionServiceInput* other, std::string* mismatch); diff --git a/db/compaction/compaction_job_test.cc b/db/compaction/compaction_job_test.cc index 8e85a9f96..e286817e6 100644 --- a/db/compaction/compaction_job_test.cc +++ b/db/compaction/compaction_job_test.cc @@ -1568,17 +1568,7 @@ TEST_F(CompactionJobTest, InputSerialization) { const int kStrMaxLen = 1000; Random rnd(static_cast(time(nullptr))); Random64 rnd64(time(nullptr)); - input.column_family.name = rnd.RandomString(rnd.Uniform(kStrMaxLen)); - input.column_family.options.comparator = ReverseBytewiseComparator(); - input.column_family.options.max_bytes_for_level_base = - rnd64.Uniform(UINT64_MAX); - input.column_family.options.disable_auto_compactions = rnd.OneIn(2); - input.column_family.options.compression = kZSTD; - input.column_family.options.compression_opts.level = 4; - input.db_options.max_background_flushes = 10; - input.db_options.paranoid_checks = rnd.OneIn(2); - input.db_options.statistics = CreateDBStatistics(); - input.db_options.env = env_; + input.cf_name = rnd.RandomString(rnd.Uniform(kStrMaxLen)); while (!rnd.OneIn(10)) { input.snapshots.emplace_back(rnd64.Uniform(UINT64_MAX)); } @@ -1606,10 +1596,10 @@ TEST_F(CompactionJobTest, InputSerialization) { ASSERT_TRUE(deserialized1.TEST_Equals(&input)); // Test mismatch - deserialized1.db_options.max_background_flushes += 10; + deserialized1.output_level += 10; std::string mismatch; ASSERT_FALSE(deserialized1.TEST_Equals(&input, &mismatch)); - ASSERT_EQ(mismatch, "db_options.max_background_flushes"); + ASSERT_EQ(mismatch, "output_level"); // Test unknown field CompactionServiceInput deserialized2; diff --git a/db/compaction/compaction_service_job.cc b/db/compaction/compaction_service_job.cc index a923e4fcc..8a8db3362 100644 --- a/db/compaction/compaction_service_job.cc +++ b/db/compaction/compaction_service_job.cc @@ -39,12 +39,8 @@ CompactionJob::ProcessKeyValueCompactionWithCompactionService( MakeTableFileName(file->fd.GetNumber())); } } - compaction_input.column_family.name = - compaction->column_family_data()->GetName(); - compaction_input.column_family.options = - compaction->column_family_data()->GetLatestCFOptions(); - compaction_input.db_options = - BuildDBOptions(db_options_, mutable_db_options_copy_); + + compaction_input.cf_name = compaction->column_family_data()->GetName(); compaction_input.snapshots = existing_snapshots_; compaction_input.has_begin = sub_compact->start.has_value(); compaction_input.begin = @@ -70,7 +66,7 @@ CompactionJob::ProcessKeyValueCompactionWithCompactionService( ROCKS_LOG_INFO( db_options_.info_log, "[%s] [JOB %d] Starting remote compaction (output level: %d): %s", - compaction_input.column_family.name.c_str(), job_id_, + compaction->column_family_data()->GetName().c_str(), job_id_, compaction_input.output_level, input_files_oss.str().c_str()); CompactionServiceJobInfo info(dbname_, db_id_, db_session_id_, GetCompactionId(sub_compact), thread_pri_); @@ -84,13 +80,14 @@ CompactionJob::ProcessKeyValueCompactionWithCompactionService( 
"CompactionService failed to schedule a remote compaction job."); ROCKS_LOG_WARN(db_options_.info_log, "[%s] [JOB %d] Remote compaction failed to start.", - compaction_input.column_family.name.c_str(), job_id_); + compaction->column_family_data()->GetName().c_str(), + job_id_); return response.status; case CompactionServiceJobStatus::kUseLocal: ROCKS_LOG_INFO( db_options_.info_log, "[%s] [JOB %d] Remote compaction fallback to local by API (Schedule)", - compaction_input.column_family.name.c_str(), job_id_); + compaction->column_family_data()->GetName().c_str(), job_id_); return response.status; default: assert(false); // unknown status @@ -99,7 +96,7 @@ CompactionJob::ProcessKeyValueCompactionWithCompactionService( ROCKS_LOG_INFO(db_options_.info_log, "[%s] [JOB %d] Waiting for remote compaction...", - compaction_input.column_family.name.c_str(), job_id_); + compaction->column_family_data()->GetName().c_str(), job_id_); std::string compaction_result_binary; CompactionServiceJobStatus compaction_status = db_options_.compaction_service->Wait(response.scheduled_job_id, @@ -109,7 +106,7 @@ CompactionJob::ProcessKeyValueCompactionWithCompactionService( ROCKS_LOG_INFO( db_options_.info_log, "[%s] [JOB %d] Remote compaction fallback to local by API (Wait)", - compaction_input.column_family.name.c_str(), job_id_); + compaction->column_family_data()->GetName().c_str(), job_id_); return compaction_status; } @@ -134,9 +131,9 @@ CompactionJob::ProcessKeyValueCompactionWithCompactionService( "result is returned)."); compaction_result.status.PermitUncheckedError(); } - ROCKS_LOG_WARN(db_options_.info_log, - "[%s] [JOB %d] Remote compaction failed.", - compaction_input.column_family.name.c_str(), job_id_); + ROCKS_LOG_WARN( + db_options_.info_log, "[%s] [JOB %d] Remote compaction failed.", + compaction->column_family_data()->GetName().c_str(), job_id_); return compaction_status; } @@ -162,7 +159,7 @@ CompactionJob::ProcessKeyValueCompactionWithCompactionService( db_options_.info_log, "[%s] [JOB %d] Received remote compaction result, output path: " "%s, files: %s", - compaction_input.column_family.name.c_str(), job_id_, + compaction->column_family_data()->GetName().c_str(), job_id_, compaction_result.output_path.c_str(), output_files_oss.str().c_str()); // Installation Starts @@ -264,8 +261,8 @@ Status CompactionServiceCompactionJob::Run() { const VersionStorageInfo* storage_info = c->input_version()->storage_info(); assert(storage_info); assert(storage_info->NumLevelFiles(compact_->compaction->level()) > 0); - write_hint_ = storage_info->CalculateSSTWriteHint(c->output_level()); + bottommost_level_ = c->bottommost_level(); Slice begin = compaction_input_.begin; @@ -404,42 +401,9 @@ static std::unordered_map cfd_type_info = { }; static std::unordered_map cs_input_type_info = { - {"column_family", - OptionTypeInfo::Struct( - "column_family", &cfd_type_info, - offsetof(struct CompactionServiceInput, column_family), - OptionVerificationType::kNormal, OptionTypeFlags::kNone)}, - {"db_options", - {offsetof(struct CompactionServiceInput, db_options), - OptionType::kConfigurable, OptionVerificationType::kNormal, - OptionTypeFlags::kNone, - [](const ConfigOptions& opts, const std::string& /*name*/, - const std::string& value, void* addr) { - auto options = static_cast(addr); - return GetDBOptionsFromString(opts, DBOptions(), value, options); - }, - [](const ConfigOptions& opts, const std::string& /*name*/, - const void* addr, std::string* value) { - const auto options = static_cast(addr); - std::string result; - 
auto status = GetStringFromDBOptions(opts, *options, &result); - *value = "{" + result + "}"; - return status; - }, - [](const ConfigOptions& opts, const std::string& name, const void* addr1, - const void* addr2, std::string* mismatch) { - const auto this_one = static_cast(addr1); - const auto that_one = static_cast(addr2); - auto this_conf = DBOptionsAsConfigurable(*this_one); - auto that_conf = DBOptionsAsConfigurable(*that_one); - std::string mismatch_opt; - bool result = - this_conf->AreEquivalent(opts, that_conf.get(), &mismatch_opt); - if (!result) { - *mismatch = name + "." + mismatch_opt; - } - return result; - }}}, + {"cf_name", + {offsetof(struct CompactionServiceInput, cf_name), + OptionType::kEncodedString}}, {"snapshots", OptionTypeInfo::Vector( offsetof(struct CompactionServiceInput, snapshots), OptionVerificationType::kNormal, OptionTypeFlags::kNone, diff --git a/db/db_impl/db_impl_secondary.cc b/db/db_impl/db_impl_secondary.cc index 92944d118..fb7ea1110 100644 --- a/db/db_impl/db_impl_secondary.cc +++ b/db/db_impl/db_impl_secondary.cc @@ -12,7 +12,8 @@ #include "logging/auto_roll_logger.h" #include "logging/logging.h" #include "monitoring/perf_context_imp.h" -#include "rocksdb/configurable.h" +#include "rocksdb/convenience.h" +#include "rocksdb/utilities/options_util.h" #include "util/cast_util.h" #include "util/write_batch_util.h" @@ -938,69 +939,103 @@ Status DB::OpenAndCompact( const std::string& output_directory, const std::string& input, std::string* output, const CompactionServiceOptionsOverride& override_options) { + // Check for cancellation if (options.canceled && options.canceled->load(std::memory_order_acquire)) { return Status::Incomplete(Status::SubCode::kManualCompactionPaused); } + + // 1. Deserialize Compaction Input CompactionServiceInput compaction_input; Status s = CompactionServiceInput::Read(input, &compaction_input); if (!s.ok()) { return s; } - compaction_input.db_options.max_open_files = -1; - compaction_input.db_options.compaction_service = nullptr; - if (compaction_input.db_options.statistics) { - compaction_input.db_options.statistics.reset(); + // 2. Load the options from latest OPTIONS file + DBOptions db_options; + ConfigOptions config_options; + config_options.env = override_options.env; + std::vector all_column_families; + s = LoadLatestOptions(config_options, name, &db_options, + &all_column_families); + // In a very rare scenario, loading options may fail if the options changed by + // the primary host at the same time. Just retry once for now. 
+ if (!s.ok()) { + s = LoadLatestOptions(config_options, name, &db_options, + &all_column_families); + if (!s.ok()) { + return s; + } } - compaction_input.db_options.env = override_options.env; - compaction_input.db_options.file_checksum_gen_factory = - override_options.file_checksum_gen_factory; - compaction_input.db_options.statistics = override_options.statistics; - compaction_input.column_family.options.comparator = - override_options.comparator; - compaction_input.column_family.options.merge_operator = - override_options.merge_operator; - compaction_input.column_family.options.compaction_filter = - override_options.compaction_filter; - compaction_input.column_family.options.compaction_filter_factory = - override_options.compaction_filter_factory; - compaction_input.column_family.options.prefix_extractor = - override_options.prefix_extractor; - compaction_input.column_family.options.table_factory = - override_options.table_factory; - compaction_input.column_family.options.sst_partitioner_factory = - override_options.sst_partitioner_factory; - compaction_input.column_family.options.table_properties_collector_factories = - override_options.table_properties_collector_factories; - compaction_input.db_options.listeners = override_options.listeners; + // 3. Override pointer configurations in DBOptions with + // CompactionServiceOptionsOverride + db_options.env = override_options.env; + db_options.file_checksum_gen_factory = + override_options.file_checksum_gen_factory; + db_options.statistics = override_options.statistics; + db_options.listeners = override_options.listeners; + db_options.compaction_service = nullptr; + // We will close the DB after the compaction anyway. + // Open as many files as needed for the compaction. + db_options.max_open_files = -1; + + // 4. Filter CFs that are needed for OpenAndCompact() + // We do not need to open all column families for the remote compaction. + // Only open default CF + target CF. If target CF == default CF, we will open + // just the default CF (Due to current limitation, DB cannot open without the + // default CF) std::vector column_families; - column_families.push_back(compaction_input.column_family); - // TODO: we have to open default CF, because of an implementation limitation, - // currently we just use the same CF option from input, which is not collect - // and open may fail. - if (compaction_input.column_family.name != kDefaultColumnFamilyName) { - column_families.emplace_back(kDefaultColumnFamilyName, - compaction_input.column_family.options); + for (auto& cf : all_column_families) { + if (cf.name == compaction_input.cf_name) { + cf.options.comparator = override_options.comparator; + cf.options.merge_operator = override_options.merge_operator; + cf.options.compaction_filter = override_options.compaction_filter; + cf.options.compaction_filter_factory = + override_options.compaction_filter_factory; + cf.options.prefix_extractor = override_options.prefix_extractor; + cf.options.table_factory = override_options.table_factory; + cf.options.sst_partitioner_factory = + override_options.sst_partitioner_factory; + cf.options.table_properties_collector_factories = + override_options.table_properties_collector_factories; + column_families.emplace_back(cf); + } else if (cf.name == kDefaultColumnFamilyName) { + column_families.emplace_back(cf); + } } + // 5. 
Open db As Secondary DB* db; std::vector handles; - - s = DB::OpenAsSecondary(compaction_input.db_options, name, output_directory, - column_families, &handles, &db); + s = DB::OpenAsSecondary(db_options, name, output_directory, column_families, + &handles, &db); if (!s.ok()) { return s; } + assert(db); + + // 6. Find the handle of the Column Family that this will compact + ColumnFamilyHandle* cfh = nullptr; + for (auto* handle : handles) { + if (compaction_input.cf_name == handle->GetName()) { + cfh = handle; + break; + } + } + assert(cfh); + // 7. Run the compaction without installation. + // Output will be stored in the directory specified by output_directory CompactionServiceResult compaction_result; DBImplSecondary* db_secondary = static_cast_with_check(db); - assert(handles.size() > 0); - s = db_secondary->CompactWithoutInstallation( - options, handles[0], compaction_input, &compaction_result); + s = db_secondary->CompactWithoutInstallation(options, cfh, compaction_input, + &compaction_result); + // 8. Serialize the result Status serialization_status = compaction_result.Write(output); + // 9. Close the db and return for (auto& handle : handles) { delete handle; } diff --git a/include/rocksdb/options.h b/include/rocksdb/options.h index 507f9bab8..27feadb80 100644 --- a/include/rocksdb/options.h +++ b/include/rocksdb/options.h @@ -2295,9 +2295,6 @@ struct SizeApproximationOptions { }; struct CompactionServiceOptionsOverride { - // Currently pointer configurations are not passed to compaction service - // compaction so the user needs to set it. It will be removed once pointer - // configuration passing is supported. Env* env = Env::Default(); std::shared_ptr file_checksum_gen_factory = nullptr; From a1a102ffcefab83e75b9d026d204395d00f2a582 Mon Sep 17 00:00:00 2001 From: Peter Dillinger Date: Fri, 20 Sep 2024 15:54:19 -0700 Subject: [PATCH 09/17] Steps toward deprecating implicit prefix seek, related fixes (#13026) Summary: With some new use cases onboarding to prefix extractors/seek/filters, one of the risks is existing iterator code, e.g. for maintenance tasks, being unintentionally subject to prefix seek semantics. This is a longstanding known design flaw with prefix seek, and `prefix_same_as_start` and `auto_prefix_mode` were steps in the direction of making that obsolete. However, we can't just immediately set `total_order_seek` to true by default, because that would impact so much code instantly. Here we add a new DB option, `prefix_seek_opt_in_only` that basically allows users to transition to the future behavior when they are ready. When set to true, all iterators will be treated as if `total_order_seek=true` and then the only ways to get prefix seek semantics are with `prefix_same_as_start` or `auto_prefix_mode`. Related fixes / changes: * Make sure that `prefix_same_as_start` and `auto_prefix_mode` are compatible with (or override) `total_order_seek` (depending on your interpretation). * Fix a bug in which a new iterator after dynamically changing the prefix extractor might mix different prefix semantics between memtable and SSTs. Both should use the latest extractor semantics, which means iterators ignoring memtable prefix filters with an old extractor. And that means passing the latest prefix extractor to new memtable iterators that might use prefix seek. (Without the fix, the test added for this fails in many ways.) Suggested follow-up: * Investigate a FIXME where a MergeIteratorBuilder is created in db_impl.cc. 
No unit test detects a change in value that should impact correctness. * Make memtable prefix bloom compatible with `auto_prefix_mode`, which might require involving the memtablereps because we don't know at iterator creation time (only seek time) whether an auto_prefix_mode seek will be a prefix seek. * Add `prefix_same_as_start` testing to db_stress Pull Request resolved: https://github.com/facebook/rocksdb/pull/13026 Test Plan: tests updated, added. Add combination of `total_order_seek=true` and `auto_prefix_mode=true` to stress test. Ran `make blackbox_crash_test` for a long while. Manually ran tests with `prefix_seek_opt_in_only=true` as default, looking for unexpected issues. I inspected most of the results and migrated many tests to be ready for such a change (but not all). Reviewed By: ltamasi Differential Revision: D63147378 Pulled By: pdillinger fbshipit-source-id: 1f4477b730683d43b4be7e933338583702d3c25e --- db/arena_wrapped_db_iter.cc | 21 +- db/column_family.cc | 4 +- db/db_bloom_filter_test.cc | 202 ++++++++++++++---- db/db_impl/db_impl.cc | 17 +- db/db_impl/db_impl_open.cc | 3 +- db/db_iter.cc | 8 +- db/db_test2.cc | 58 +++-- db/db_with_timestamp_basic_test.cc | 2 + db/flush_job.cc | 8 +- db/forward_iterator.cc | 16 +- db/memtable.cc | 38 ++-- db/memtable.h | 3 +- db/memtable_list.cc | 11 +- db/memtable_list.h | 2 + db/repair.cc | 3 +- db/write_batch_test.cc | 2 +- db_stress_tool/db_stress_test_base.cc | 2 +- include/rocksdb/options.h | 19 +- java/rocksjni/write_batch_test.cc | 3 +- options/db_options.cc | 7 + options/db_options.h | 1 + table/block_based/block_based_table_reader.cc | 8 +- table/plain/plain_table_reader.cc | 6 +- table/table_test.cc | 7 +- .../bug_fixes/memtable_prefix.md | 1 + .../new_features/prefix_seek_opt_in_only.md | 1 + 26 files changed, 325 insertions(+), 128 deletions(-) create mode 100644 unreleased_history/bug_fixes/memtable_prefix.md create mode 100644 unreleased_history/new_features/prefix_seek_opt_in_only.md diff --git a/db/arena_wrapped_db_iter.cc b/db/arena_wrapped_db_iter.cc index 83cc3cfa5..22be567fd 100644 --- a/db/arena_wrapped_db_iter.cc +++ b/db/arena_wrapped_db_iter.cc @@ -45,20 +45,23 @@ void ArenaWrappedDBIter::Init( const SequenceNumber& sequence, uint64_t max_sequential_skip_in_iteration, uint64_t version_number, ReadCallback* read_callback, ColumnFamilyHandleImpl* cfh, bool expose_blob_index, bool allow_refresh) { - auto mem = arena_.AllocateAligned(sizeof(DBIter)); - db_iter_ = new (mem) DBIter( - env, read_options, ioptions, mutable_cf_options, ioptions.user_comparator, - /* iter */ nullptr, version, sequence, true, - max_sequential_skip_in_iteration, read_callback, cfh, expose_blob_index); - sv_number_ = version_number; read_options_ = read_options; - allow_refresh_ = allow_refresh; - memtable_range_tombstone_iter_ = nullptr; - if (!CheckFSFeatureSupport(env->GetFileSystem().get(), FSSupportedOps::kAsyncIO)) { read_options_.async_io = false; } + read_options_.total_order_seek |= ioptions.prefix_seek_opt_in_only; + + auto mem = arena_.AllocateAligned(sizeof(DBIter)); + db_iter_ = new (mem) DBIter(env, read_options_, ioptions, mutable_cf_options, + ioptions.user_comparator, + /* iter */ nullptr, version, sequence, true, + max_sequential_skip_in_iteration, read_callback, + cfh, expose_blob_index); + + sv_number_ = version_number; + allow_refresh_ = allow_refresh; + memtable_range_tombstone_iter_ = nullptr; } Status ArenaWrappedDBIter::Refresh() { return Refresh(nullptr); } diff --git a/db/column_family.cc b/db/column_family.cc 
index 2b611fda7..8abad2941 100644 --- a/db/column_family.cc +++ b/db/column_family.cc @@ -1201,8 +1201,10 @@ Status ColumnFamilyData::RangesOverlapWithMemtables( read_opts.total_order_seek = true; MergeIteratorBuilder merge_iter_builder(&internal_comparator_, &arena); merge_iter_builder.AddIterator(super_version->mem->NewIterator( - read_opts, /*seqno_to_time_mapping=*/nullptr, &arena)); + read_opts, /*seqno_to_time_mapping=*/nullptr, &arena, + /*prefix_extractor=*/nullptr)); super_version->imm->AddIterators(read_opts, /*seqno_to_time_mapping=*/nullptr, + /*prefix_extractor=*/nullptr, &merge_iter_builder, false /* add_range_tombstone_iter */); ScopedArenaPtr memtable_iter(merge_iter_builder.Finish()); diff --git a/db/db_bloom_filter_test.cc b/db/db_bloom_filter_test.cc index 7b42f4ee5..f0b3c4066 100644 --- a/db/db_bloom_filter_test.cc +++ b/db/db_bloom_filter_test.cc @@ -171,7 +171,7 @@ TEST_P(DBBloomFilterTestDefFormatVersion, KeyMayExist) { options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics(); CreateAndReopenWithCF({"pikachu"}, options); - ASSERT_TRUE(!db_->KeyMayExist(ropts, handles_[1], "a", &value)); + ASSERT_FALSE(db_->KeyMayExist(ropts, handles_[1], "a", &value)); ASSERT_OK(Put(1, "a", "b")); bool value_found = false; @@ -187,7 +187,7 @@ TEST_P(DBBloomFilterTestDefFormatVersion, KeyMayExist) { uint64_t cache_added = TestGetTickerCount(options, BLOCK_CACHE_ADD); ASSERT_TRUE( db_->KeyMayExist(ropts, handles_[1], "a", &value, &value_found)); - ASSERT_TRUE(!value_found); + ASSERT_FALSE(value_found); // assert that no new files were opened and no new blocks were // read into block cache. ASSERT_EQ(numopen, TestGetTickerCount(options, NO_FILE_OPENS)); @@ -197,7 +197,7 @@ TEST_P(DBBloomFilterTestDefFormatVersion, KeyMayExist) { numopen = TestGetTickerCount(options, NO_FILE_OPENS); cache_added = TestGetTickerCount(options, BLOCK_CACHE_ADD); - ASSERT_TRUE(!db_->KeyMayExist(ropts, handles_[1], "a", &value)); + ASSERT_FALSE(db_->KeyMayExist(ropts, handles_[1], "a", &value)); ASSERT_EQ(numopen, TestGetTickerCount(options, NO_FILE_OPENS)); ASSERT_EQ(cache_added, TestGetTickerCount(options, BLOCK_CACHE_ADD)); @@ -207,7 +207,7 @@ TEST_P(DBBloomFilterTestDefFormatVersion, KeyMayExist) { numopen = TestGetTickerCount(options, NO_FILE_OPENS); cache_added = TestGetTickerCount(options, BLOCK_CACHE_ADD); - ASSERT_TRUE(!db_->KeyMayExist(ropts, handles_[1], "a", &value)); + ASSERT_FALSE(db_->KeyMayExist(ropts, handles_[1], "a", &value)); ASSERT_EQ(numopen, TestGetTickerCount(options, NO_FILE_OPENS)); ASSERT_EQ(cache_added, TestGetTickerCount(options, BLOCK_CACHE_ADD)); @@ -215,7 +215,7 @@ TEST_P(DBBloomFilterTestDefFormatVersion, KeyMayExist) { numopen = TestGetTickerCount(options, NO_FILE_OPENS); cache_added = TestGetTickerCount(options, BLOCK_CACHE_ADD); - ASSERT_TRUE(!db_->KeyMayExist(ropts, handles_[1], "c", &value)); + ASSERT_FALSE(db_->KeyMayExist(ropts, handles_[1], "c", &value)); ASSERT_EQ(numopen, TestGetTickerCount(options, NO_FILE_OPENS)); ASSERT_EQ(cache_added, TestGetTickerCount(options, BLOCK_CACHE_ADD)); @@ -2177,24 +2177,146 @@ TEST_F(DBBloomFilterTest, MemtableWholeKeyBloomFilterMultiGet) { db_->ReleaseSnapshot(snapshot); } +namespace { +std::pair GetBloomStat(const Options& options, bool sst) { + if (sst) { + return {options.statistics->getAndResetTickerCount( + NON_LAST_LEVEL_SEEK_FILTER_MATCH), + options.statistics->getAndResetTickerCount( + NON_LAST_LEVEL_SEEK_FILTERED)}; + } else { + auto hit = std::exchange(get_perf_context()->bloom_memtable_hit_count, 0); + auto miss = 
std::exchange(get_perf_context()->bloom_memtable_miss_count, 0); + return {hit, miss}; + } +} + +std::pair HitAndMiss(uint64_t hits, uint64_t misses) { + return {hits, misses}; +} +} // namespace -TEST_F(DBBloomFilterTest, MemtablePrefixBloomOutOfDomain) { - constexpr size_t kPrefixSize = 8; - const std::string kKey = "key"; - assert(kKey.size() < kPrefixSize); +TEST_F(DBBloomFilterTest, MemtablePrefixBloom) { Options options = CurrentOptions(); - options.prefix_extractor.reset(NewFixedPrefixTransform(kPrefixSize)); + options.prefix_extractor.reset(NewFixedPrefixTransform(4)); options.memtable_prefix_bloom_size_ratio = 0.25; Reopen(options); - ASSERT_OK(Put(kKey, "v")); - ASSERT_EQ("v", Get(kKey)); - std::unique_ptr iter(dbfull()->NewIterator(ReadOptions())); - iter->Seek(kKey); + ASSERT_FALSE(options.prefix_extractor->InDomain("key")); + ASSERT_OK(Put("key", "v")); + ASSERT_OK(Put("goat1", "g1")); + ASSERT_OK(Put("goat2", "g2")); + + // Reset from other tests + GetBloomStat(options, false); + + // Out of domain (Get) + ASSERT_EQ("v", Get("key")); + ASSERT_EQ(HitAndMiss(0, 0), GetBloomStat(options, false)); + + // In domain (Get) + ASSERT_EQ("g1", Get("goat1")); + ASSERT_EQ(HitAndMiss(1, 0), GetBloomStat(options, false)); + ASSERT_EQ("NOT_FOUND", Get("goat9")); + ASSERT_EQ(HitAndMiss(1, 0), GetBloomStat(options, false)); + ASSERT_EQ("NOT_FOUND", Get("goan1")); + ASSERT_EQ(HitAndMiss(0, 1), GetBloomStat(options, false)); + + ReadOptions ropts; + if (options.prefix_seek_opt_in_only) { + ropts.prefix_same_as_start = true; + } + std::unique_ptr iter(db_->NewIterator(ropts)); + // Out of domain (scan) + iter->Seek("ke"); + ASSERT_OK(iter->status()); ASSERT_TRUE(iter->Valid()); - ASSERT_EQ(kKey, iter->key()); - iter->SeekForPrev(kKey); + ASSERT_EQ("key", iter->key()); + iter->SeekForPrev("kez"); + ASSERT_OK(iter->status()); ASSERT_TRUE(iter->Valid()); - ASSERT_EQ(kKey, iter->key()); + ASSERT_EQ("key", iter->key()); + ASSERT_EQ(HitAndMiss(0, 0), GetBloomStat(options, false)); + + // In domain (scan) + iter->Seek("goan"); + ASSERT_OK(iter->status()); + ASSERT_FALSE(iter->Valid()); + ASSERT_EQ(HitAndMiss(0, 1), GetBloomStat(options, false)); + iter->Seek("goat"); + ASSERT_OK(iter->status()); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ("goat1", iter->key()); + ASSERT_EQ(HitAndMiss(1, 0), GetBloomStat(options, false)); + + // Changing prefix extractor should affect prefix query semantics + // and bypass the existing memtable Bloom filter + ASSERT_OK(db_->SetOptions({{"prefix_extractor", "fixed:5"}})); + iter.reset(db_->NewIterator(ropts)); + // Now out of domain (scan) + iter->Seek("goan"); + ASSERT_OK(iter->status()); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ("goat1", iter->key()); + ASSERT_EQ(HitAndMiss(0, 0), GetBloomStat(options, false)); + // In domain (scan) + iter->Seek("goat2"); + ASSERT_OK(iter->status()); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ("goat2", iter->key()); + ASSERT_EQ(HitAndMiss(0, 0), GetBloomStat(options, false)); + // In domain (scan) + if (ropts.prefix_same_as_start) { + iter->Seek("goat0"); + ASSERT_OK(iter->status()); + ASSERT_FALSE(iter->Valid()); + ASSERT_EQ(HitAndMiss(0, 0), GetBloomStat(options, false)); + } else { + // NOTE: legacy prefix Seek may return keys outside of prefix + } + + // Start a fresh new memtable, using new prefix extractor + ASSERT_OK(SingleDelete("key")); + ASSERT_OK(SingleDelete("goat1")); + ASSERT_OK(SingleDelete("goat2")); + ASSERT_OK(Flush()); + + ASSERT_OK(Put("key", "_v")); + ASSERT_OK(Put("goat1", "_g1")); + ASSERT_OK(Put("goat2", "_g2")); + 
+ iter.reset(db_->NewIterator(ropts)); + + // Still out of domain (Get) + ASSERT_EQ("_v", Get("key")); + ASSERT_EQ(HitAndMiss(0, 0), GetBloomStat(options, false)); + + // Still in domain (Get) + ASSERT_EQ("_g1", Get("goat1")); + ASSERT_EQ(HitAndMiss(1, 0), GetBloomStat(options, false)); + ASSERT_EQ("NOT_FOUND", Get("goat11")); + ASSERT_EQ(HitAndMiss(1, 0), GetBloomStat(options, false)); + ASSERT_EQ("NOT_FOUND", Get("goat9")); + ASSERT_EQ(HitAndMiss(0, 1), GetBloomStat(options, false)); + ASSERT_EQ("NOT_FOUND", Get("goan1")); + ASSERT_EQ(HitAndMiss(0, 1), GetBloomStat(options, false)); + + // Now out of domain (scan) + iter->Seek("goan"); + ASSERT_OK(iter->status()); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ("goat1", iter->key()); + ASSERT_EQ(HitAndMiss(0, 0), GetBloomStat(options, false)); + // In domain (scan) + iter->Seek("goat2"); + ASSERT_OK(iter->status()); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ("goat2", iter->key()); + ASSERT_EQ(HitAndMiss(1, 0), GetBloomStat(options, false)); + // In domain (scan) + iter->Seek("goat0"); + ASSERT_OK(iter->status()); + ASSERT_FALSE(iter->Valid()); + ASSERT_EQ(HitAndMiss(0, 1), GetBloomStat(options, false)); } class DBBloomFilterTestVaryPrefixAndFormatVer @@ -2507,7 +2629,11 @@ TEST_P(BloomStatsTestWithParam, BloomStatsTestWithIter) { ASSERT_OK(Put(key1, value1, WriteOptions())); ASSERT_OK(Put(key3, value3, WriteOptions())); - std::unique_ptr iter(dbfull()->NewIterator(ReadOptions())); + ReadOptions ropts; + if (options_.prefix_seek_opt_in_only) { + ropts.prefix_same_as_start = true; + } + std::unique_ptr iter(dbfull()->NewIterator(ropts)); // check memtable bloom stats iter->Seek(key1); @@ -2526,13 +2652,13 @@ TEST_P(BloomStatsTestWithParam, BloomStatsTestWithIter) { iter->Seek(key2); ASSERT_OK(iter->status()); - ASSERT_TRUE(!iter->Valid()); + ASSERT_FALSE(iter->Valid()); ASSERT_EQ(1, get_perf_context()->bloom_memtable_miss_count); ASSERT_EQ(2, get_perf_context()->bloom_memtable_hit_count); ASSERT_OK(Flush()); - iter.reset(dbfull()->NewIterator(ReadOptions())); + iter.reset(dbfull()->NewIterator(ropts)); // Check SST bloom stats iter->Seek(key1); @@ -2550,7 +2676,7 @@ TEST_P(BloomStatsTestWithParam, BloomStatsTestWithIter) { iter->Seek(key2); ASSERT_OK(iter->status()); - ASSERT_TRUE(!iter->Valid()); + ASSERT_FALSE(iter->Valid()); ASSERT_EQ(1, get_perf_context()->bloom_sst_miss_count); ASSERT_EQ(expected_hits, get_perf_context()->bloom_sst_hit_count); } @@ -2659,9 +2785,14 @@ TEST_F(DBBloomFilterTest, PrefixScan) { PrefixScanInit(this); count = 0; env_->random_read_counter_.Reset(); - iter = db_->NewIterator(ReadOptions()); + ReadOptions ropts; + if (options.prefix_seek_opt_in_only) { + ropts.prefix_same_as_start = true; + } + iter = db_->NewIterator(ropts); for (iter->Seek(prefix); iter->Valid(); iter->Next()) { if (!iter->key().starts_with(prefix)) { + ASSERT_FALSE(ropts.prefix_same_as_start); break; } count++; @@ -3397,23 +3528,6 @@ class FixedSuffix4Transform : public SliceTransform { bool InDomain(const Slice& src) const override { return src.size() >= 4; } }; - -std::pair GetBloomStat(const Options& options, bool sst) { - if (sst) { - return {options.statistics->getAndResetTickerCount( - NON_LAST_LEVEL_SEEK_FILTER_MATCH), - options.statistics->getAndResetTickerCount( - NON_LAST_LEVEL_SEEK_FILTERED)}; - } else { - auto hit = std::exchange(get_perf_context()->bloom_memtable_hit_count, 0); - auto miss = std::exchange(get_perf_context()->bloom_memtable_miss_count, 0); - return {hit, miss}; - } -} - -std::pair HitAndMiss(uint64_t hits, uint64_t 
misses) { - return {hits, misses}; -} } // anonymous namespace // This uses a prefix_extractor + comparator combination that violates @@ -3520,9 +3634,8 @@ TEST_F(DBBloomFilterTest, WeirdPrefixExtractorWithFilter2) { if (flushed) { // TODO: support auto_prefix_mode in memtable? read_options.auto_prefix_mode = true; } else { - // TODO: why needed? - get_perf_context()->bloom_memtable_hit_count = 0; - get_perf_context()->bloom_memtable_miss_count = 0; + // Reset from other tests + GetBloomStat(options, flushed); } EXPECT_EQ(GetBloomStat(options, flushed), HitAndMiss(0, 0)); { @@ -3664,9 +3777,8 @@ TEST_F(DBBloomFilterTest, WeirdPrefixExtractorWithFilter3) { if (flushed) { // TODO: support auto_prefix_mode in memtable? read_options.auto_prefix_mode = true; } else { - // TODO: why needed? - get_perf_context()->bloom_memtable_hit_count = 0; - get_perf_context()->bloom_memtable_miss_count = 0; + // Reset from other tests + GetBloomStat(options, flushed); } EXPECT_EQ(GetBloomStat(options, flushed), HitAndMiss(0, 0)); { diff --git a/db/db_impl/db_impl.cc b/db/db_impl/db_impl.cc index e95561efa..b218bd0ba 100644 --- a/db/db_impl/db_impl.cc +++ b/db/db_impl/db_impl.cc @@ -2066,15 +2066,19 @@ InternalIterator* DBImpl::NewInternalIterator( bool allow_unprepared_value, ArenaWrappedDBIter* db_iter) { InternalIterator* internal_iter; assert(arena != nullptr); + auto prefix_extractor = + super_version->mutable_cf_options.prefix_extractor.get(); // Need to create internal iterator from the arena. MergeIteratorBuilder merge_iter_builder( &cfd->internal_comparator(), arena, - !read_options.total_order_seek && - super_version->mutable_cf_options.prefix_extractor != nullptr, + // FIXME? It's not clear what interpretation of prefix seek is needed + // here, and no unit test cares about the value provided here. 
+ !read_options.total_order_seek && prefix_extractor != nullptr, read_options.iterate_upper_bound); // Collect iterator for mutable memtable auto mem_iter = super_version->mem->NewIterator( - read_options, super_version->GetSeqnoToTimeMapping(), arena); + read_options, super_version->GetSeqnoToTimeMapping(), arena, + super_version->mutable_cf_options.prefix_extractor.get()); Status s; if (!read_options.ignore_range_deletions) { std::unique_ptr mem_tombstone_iter; @@ -2098,6 +2102,7 @@ InternalIterator* DBImpl::NewInternalIterator( if (s.ok()) { super_version->imm->AddIterators( read_options, super_version->GetSeqnoToTimeMapping(), + super_version->mutable_cf_options.prefix_extractor.get(), &merge_iter_builder, !read_options.ignore_range_deletions); } TEST_SYNC_POINT_CALLBACK("DBImpl::NewInternalIterator:StatusCallback", &s); @@ -3843,6 +3848,9 @@ Iterator* DBImpl::NewIterator(const ReadOptions& _read_options, } } if (read_options.tailing) { + read_options.total_order_seek |= + immutable_db_options_.prefix_seek_opt_in_only; + auto iter = new ForwardIterator(this, read_options, cfd, sv, /* allow_unprepared_value */ true); result = NewDBIterator( @@ -4044,6 +4052,9 @@ Status DBImpl::NewIterators( assert(cf_sv_pairs.size() == column_families.size()); if (read_options.tailing) { + read_options.total_order_seek |= + immutable_db_options_.prefix_seek_opt_in_only; + for (const auto& cf_sv_pair : cf_sv_pairs) { auto iter = new ForwardIterator(this, read_options, cf_sv_pair.cfd, cf_sv_pair.super_version, diff --git a/db/db_impl/db_impl_open.cc b/db/db_impl/db_impl_open.cc index 4fb100876..dec61c050 100644 --- a/db/db_impl/db_impl_open.cc +++ b/db/db_impl/db_impl_open.cc @@ -1669,7 +1669,8 @@ Status DBImpl::WriteLevel0TableForRecovery(int job_id, ColumnFamilyData* cfd, TableProperties table_properties; { ScopedArenaPtr iter( - mem->NewIterator(ro, /*seqno_to_time_mapping=*/nullptr, &arena)); + mem->NewIterator(ro, /*seqno_to_time_mapping=*/nullptr, &arena, + /*prefix_extractor=*/nullptr)); ROCKS_LOG_DEBUG(immutable_db_options_.info_log, "[%s] [WriteLevel0TableForRecovery]" " Level-0 table #%" PRIu64 ": started", diff --git a/db/db_iter.cc b/db/db_iter.cc index b42acc4bc..e02586377 100644 --- a/db/db_iter.cc +++ b/db/db_iter.cc @@ -65,9 +65,8 @@ DBIter::DBIter(Env* _env, const ReadOptions& read_options, valid_(false), current_entry_is_merged_(false), is_key_seqnum_zero_(false), - prefix_same_as_start_(mutable_cf_options.prefix_extractor - ? read_options.prefix_same_as_start - : false), + prefix_same_as_start_( + prefix_extractor_ ? 
read_options.prefix_same_as_start : false), pin_thru_lifetime_(read_options.pin_data), expect_total_order_inner_iter_(prefix_extractor_ == nullptr || read_options.total_order_seek || @@ -93,6 +92,9 @@ DBIter::DBIter(Env* _env, const ReadOptions& read_options, status_.PermitUncheckedError(); assert(timestamp_size_ == user_comparator_.user_comparator()->timestamp_size()); + // prefix_seek_opt_in_only should force total_order_seek whereever the caller + // is duplicating the original ReadOptions + assert(!ioptions.prefix_seek_opt_in_only || read_options.total_order_seek); } Status DBIter::GetProperty(std::string prop_name, std::string* prop) { diff --git a/db/db_test2.cc b/db/db_test2.cc index 92211ad42..fbe66a986 100644 --- a/db/db_test2.cc +++ b/db/db_test2.cc @@ -5597,32 +5597,45 @@ TEST_F(DBTest2, PrefixBloomFilteredOut) { bbto.filter_policy.reset(NewBloomFilterPolicy(10, false)); bbto.whole_key_filtering = false; options.table_factory.reset(NewBlockBasedTableFactory(bbto)); - DestroyAndReopen(options); - // Construct two L1 files with keys: - // f1:[aaa1 ccc1] f2:[ddd0] - ASSERT_OK(Put("aaa1", "")); - ASSERT_OK(Put("ccc1", "")); - ASSERT_OK(Flush()); - ASSERT_OK(Put("ddd0", "")); - ASSERT_OK(Flush()); - CompactRangeOptions cro; - cro.bottommost_level_compaction = BottommostLevelCompaction::kSkip; - ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr)); + // This test is also the primary test for prefix_seek_opt_in_only + for (bool opt_in : {false, true}) { + options.prefix_seek_opt_in_only = opt_in; + DestroyAndReopen(options); - Iterator* iter = db_->NewIterator(ReadOptions()); - ASSERT_OK(iter->status()); + // Construct two L1 files with keys: + // f1:[aaa1 ccc1] f2:[ddd0] + ASSERT_OK(Put("aaa1", "")); + ASSERT_OK(Put("ccc1", "")); + ASSERT_OK(Flush()); + ASSERT_OK(Put("ddd0", "")); + ASSERT_OK(Flush()); + CompactRangeOptions cro; + cro.bottommost_level_compaction = BottommostLevelCompaction::kSkip; + ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr)); - // Bloom filter is filterd out by f1. - // This is just one of several valid position following the contract. - // Postioning to ccc1 or ddd0 is also valid. This is just to validate - // the behavior of the current implementation. If underlying implementation - // changes, the test might fail here. - iter->Seek("bbb1"); - ASSERT_OK(iter->status()); - ASSERT_FALSE(iter->Valid()); + ReadOptions ropts; + for (bool same : {false, true}) { + ropts.prefix_same_as_start = same; + std::unique_ptr iter(db_->NewIterator(ropts)); + ASSERT_OK(iter->status()); - delete iter; + iter->Seek("bbb1"); + ASSERT_OK(iter->status()); + if (opt_in && !same) { + // Unbounded total order seek + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ(iter->key(), "ccc1"); + } else { + // Bloom filter is filterd out by f1. When same == false, this is just + // one valid position following the contract. Postioning to ccc1 or ddd0 + // is also valid. This is just to validate the behavior of the current + // implementation. If underlying implementation changes, the test might + // fail here. + ASSERT_FALSE(iter->Valid()); + } + } + } } TEST_F(DBTest2, RowCacheSnapshot) { @@ -5987,6 +6000,7 @@ TEST_F(DBTest2, ChangePrefixExtractor) { // create a DB with block prefix index BlockBasedTableOptions table_options; Options options = CurrentOptions(); + options.prefix_seek_opt_in_only = false; // Use legacy prefix seek // Sometimes filter is checked based on upper bound. Assert counters // for that case. Otherwise, only check data correctness. 
diff --git a/db/db_with_timestamp_basic_test.cc b/db/db_with_timestamp_basic_test.cc index acb1c05c1..60441da0b 100644 --- a/db/db_with_timestamp_basic_test.cc +++ b/db/db_with_timestamp_basic_test.cc @@ -832,6 +832,7 @@ TEST_P(DBBasicTestWithTimestampTableOptions, GetAndMultiGet) { TEST_P(DBBasicTestWithTimestampTableOptions, SeekWithPrefixLessThanKey) { Options options = CurrentOptions(); + options.prefix_seek_opt_in_only = false; // Use legacy prefix seek options.env = env_; options.create_if_missing = true; options.prefix_extractor.reset(NewFixedPrefixTransform(3)); @@ -1009,6 +1010,7 @@ TEST_F(DBBasicTestWithTimestamp, ChangeIterationDirection) { TestComparator test_cmp(kTimestampSize); options.comparator = &test_cmp; options.prefix_extractor.reset(NewFixedPrefixTransform(1)); + options.prefix_seek_opt_in_only = false; // Use legacy prefix seek options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics(); DestroyAndReopen(options); const std::vector timestamps = {Timestamp(1, 1), Timestamp(0, 2), diff --git a/db/flush_job.cc b/db/flush_job.cc index 6bd71dd56..8206bd298 100644 --- a/db/flush_job.cc +++ b/db/flush_job.cc @@ -421,8 +421,8 @@ Status FlushJob::MemPurge() { std::vector> range_del_iters; for (MemTable* m : mems_) { - memtables.push_back( - m->NewIterator(ro, /*seqno_to_time_mapping=*/nullptr, &arena)); + memtables.push_back(m->NewIterator(ro, /*seqno_to_time_mapping=*/nullptr, + &arena, /*prefix_extractor=*/nullptr)); auto* range_del_iter = m->NewRangeTombstoneIterator( ro, kMaxSequenceNumber, true /* immutable_memtable */); if (range_del_iter != nullptr) { @@ -893,8 +893,8 @@ Status FlushJob::WriteLevel0Table() { db_options_.info_log, "[%s] [JOB %d] Flushing memtable with next log file: %" PRIu64 "\n", cfd_->GetName().c_str(), job_context_->job_id, m->GetNextLogNumber()); - memtables.push_back( - m->NewIterator(ro, /*seqno_to_time_mapping=*/nullptr, &arena)); + memtables.push_back(m->NewIterator(ro, /*seqno_to_time_mapping=*/nullptr, + &arena, /*prefix_extractor=*/nullptr)); auto* range_del_iter = m->NewRangeTombstoneIterator( ro, kMaxSequenceNumber, true /* immutable_memtable */); if (range_del_iter != nullptr) { diff --git a/db/forward_iterator.cc b/db/forward_iterator.cc index a4cbdb466..0bf7c15ab 100644 --- a/db/forward_iterator.cc +++ b/db/forward_iterator.cc @@ -712,9 +712,11 @@ void ForwardIterator::RebuildIterators(bool refresh_sv) { UnownedPtr seqno_to_time_mapping = sv_->GetSeqnoToTimeMapping(); mutable_iter_ = - sv_->mem->NewIterator(read_options_, seqno_to_time_mapping, &arena_); - sv_->imm->AddIterators(read_options_, seqno_to_time_mapping, &imm_iters_, - &arena_); + sv_->mem->NewIterator(read_options_, seqno_to_time_mapping, &arena_, + sv_->mutable_cf_options.prefix_extractor.get()); + sv_->imm->AddIterators(read_options_, seqno_to_time_mapping, + sv_->mutable_cf_options.prefix_extractor.get(), + &imm_iters_, &arena_); if (!read_options_.ignore_range_deletions) { std::unique_ptr range_del_iter( sv_->mem->NewRangeTombstoneIterator( @@ -781,9 +783,11 @@ void ForwardIterator::RenewIterators() { UnownedPtr seqno_to_time_mapping = svnew->GetSeqnoToTimeMapping(); mutable_iter_ = - svnew->mem->NewIterator(read_options_, seqno_to_time_mapping, &arena_); - svnew->imm->AddIterators(read_options_, seqno_to_time_mapping, &imm_iters_, - &arena_); + svnew->mem->NewIterator(read_options_, seqno_to_time_mapping, &arena_, + svnew->mutable_cf_options.prefix_extractor.get()); + svnew->imm->AddIterators(read_options_, seqno_to_time_mapping, + 
svnew->mutable_cf_options.prefix_extractor.get(), + &imm_iters_, &arena_); ReadRangeDelAggregator range_del_agg(&cfd_->internal_comparator(), kMaxSequenceNumber /* upper_bound */); if (!read_options_.ignore_range_deletions) { diff --git a/db/memtable.cc b/db/memtable.cc index ef1184ded..5ba0a0dac 100644 --- a/db/memtable.cc +++ b/db/memtable.cc @@ -365,9 +365,12 @@ const char* EncodeKey(std::string* scratch, const Slice& target) { class MemTableIterator : public InternalIterator { public: - MemTableIterator(const MemTable& mem, const ReadOptions& read_options, - UnownedPtr seqno_to_time_mapping, - Arena* arena, bool use_range_del_table = false) + enum Kind { kPointEntries, kRangeDelEntries }; + MemTableIterator( + Kind kind, const MemTable& mem, const ReadOptions& read_options, + UnownedPtr seqno_to_time_mapping = nullptr, + Arena* arena = nullptr, + const SliceTransform* cf_prefix_extractor = nullptr) : bloom_(nullptr), prefix_extractor_(mem.prefix_extractor_), comparator_(mem.comparator_), @@ -382,14 +385,21 @@ class MemTableIterator : public InternalIterator { arena_mode_(arena != nullptr), paranoid_memory_checks_(mem.moptions_.paranoid_memory_checks), allow_data_in_error(mem.moptions_.allow_data_in_errors) { - if (use_range_del_table) { + if (kind == kRangeDelEntries) { iter_ = mem.range_del_table_->GetIterator(arena); - } else if (prefix_extractor_ != nullptr && !read_options.total_order_seek && - !read_options.auto_prefix_mode) { + } else if (prefix_extractor_ != nullptr && + // NOTE: checking extractor equivalence when not pointer + // equivalent is arguably too expensive for memtable + prefix_extractor_ == cf_prefix_extractor && + (read_options.prefix_same_as_start || + (!read_options.total_order_seek && + !read_options.auto_prefix_mode))) { // Auto prefix mode is not implemented in memtable yet. 
+ assert(kind == kPointEntries); bloom_ = mem.bloom_filter_.get(); iter_ = mem.table_->GetDynamicPrefixIterator(arena); } else { + assert(kind == kPointEntries); iter_ = mem.table_->GetIterator(arena); } status_.PermitUncheckedError(); @@ -433,8 +443,8 @@ class MemTableIterator : public InternalIterator { // iterator should only use prefix bloom filter Slice user_k_without_ts(ExtractUserKeyAndStripTimestamp(k, ts_sz_)); if (prefix_extractor_->InDomain(user_k_without_ts)) { - if (!bloom_->MayContain( - prefix_extractor_->Transform(user_k_without_ts))) { + Slice prefix = prefix_extractor_->Transform(user_k_without_ts); + if (!bloom_->MayContain(prefix)) { PERF_COUNTER_ADD(bloom_memtable_miss_count, 1); valid_ = false; return; @@ -594,11 +604,13 @@ class MemTableIterator : public InternalIterator { InternalIterator* MemTable::NewIterator( const ReadOptions& read_options, - UnownedPtr seqno_to_time_mapping, Arena* arena) { + UnownedPtr seqno_to_time_mapping, Arena* arena, + const SliceTransform* prefix_extractor) { assert(arena != nullptr); auto mem = arena->AllocateAligned(sizeof(MemTableIterator)); return new (mem) - MemTableIterator(*this, read_options, seqno_to_time_mapping, arena); + MemTableIterator(MemTableIterator::kPointEntries, *this, read_options, + seqno_to_time_mapping, arena, prefix_extractor); } FragmentedRangeTombstoneIterator* MemTable::NewRangeTombstoneIterator( @@ -633,8 +645,7 @@ FragmentedRangeTombstoneIterator* MemTable::NewRangeTombstoneIteratorInternal( cache->reader_mutex.lock(); if (!cache->tombstones) { auto* unfragmented_iter = new MemTableIterator( - *this, read_options, nullptr /* seqno_to_time_mapping= */, - nullptr /* arena */, true /* use_range_del_table */); + MemTableIterator::kRangeDelEntries, *this, read_options); cache->tombstones.reset(new FragmentedRangeTombstoneList( std::unique_ptr(unfragmented_iter), comparator_.comparator)); @@ -655,8 +666,7 @@ void MemTable::ConstructFragmentedRangeTombstones() { if (!is_range_del_table_empty_.load(std::memory_order_relaxed)) { // TODO: plumb Env::IOActivity, Env::IOPriority auto* unfragmented_iter = new MemTableIterator( - *this, ReadOptions(), nullptr /*seqno_to_time_mapping=*/, - nullptr /* arena */, true /* use_range_del_table */); + MemTableIterator::kRangeDelEntries, *this, ReadOptions()); fragmented_range_tombstone_list_ = std::make_unique( diff --git a/db/memtable.h b/db/memtable.h index ca0652bc0..194b4543c 100644 --- a/db/memtable.h +++ b/db/memtable.h @@ -210,7 +210,8 @@ class MemTable { // data, currently only needed for iterators serving user reads. InternalIterator* NewIterator( const ReadOptions& read_options, - UnownedPtr seqno_to_time_mapping, Arena* arena); + UnownedPtr seqno_to_time_mapping, Arena* arena, + const SliceTransform* prefix_extractor); // Returns an iterator that yields the range tombstones of the memtable. 
// The caller must ensure that the underlying MemTable remains live diff --git a/db/memtable_list.cc b/db/memtable_list.cc index c3612656e..c81c096b5 100644 --- a/db/memtable_list.cc +++ b/db/memtable_list.cc @@ -214,20 +214,23 @@ Status MemTableListVersion::AddRangeTombstoneIterators( void MemTableListVersion::AddIterators( const ReadOptions& options, UnownedPtr seqno_to_time_mapping, + const SliceTransform* prefix_extractor, std::vector* iterator_list, Arena* arena) { for (auto& m : memlist_) { - iterator_list->push_back( - m->NewIterator(options, seqno_to_time_mapping, arena)); + iterator_list->push_back(m->NewIterator(options, seqno_to_time_mapping, + arena, prefix_extractor)); } } void MemTableListVersion::AddIterators( const ReadOptions& options, UnownedPtr seqno_to_time_mapping, + const SliceTransform* prefix_extractor, MergeIteratorBuilder* merge_iter_builder, bool add_range_tombstone_iter) { for (auto& m : memlist_) { - auto mem_iter = m->NewIterator(options, seqno_to_time_mapping, - merge_iter_builder->GetArena()); + auto mem_iter = + m->NewIterator(options, seqno_to_time_mapping, + merge_iter_builder->GetArena(), prefix_extractor); if (!add_range_tombstone_iter || options.ignore_range_deletions) { merge_iter_builder->AddIterator(mem_iter); } else { diff --git a/db/memtable_list.h b/db/memtable_list.h index dd439de55..390b4137d 100644 --- a/db/memtable_list.h +++ b/db/memtable_list.h @@ -113,11 +113,13 @@ class MemTableListVersion { void AddIterators(const ReadOptions& options, UnownedPtr seqno_to_time_mapping, + const SliceTransform* prefix_extractor, std::vector* iterator_list, Arena* arena); void AddIterators(const ReadOptions& options, UnownedPtr seqno_to_time_mapping, + const SliceTransform* prefix_extractor, MergeIteratorBuilder* merge_iter_builder, bool add_range_tombstone_iter); diff --git a/db/repair.cc b/db/repair.cc index 114d36a6a..f7f4fbafb 100644 --- a/db/repair.cc +++ b/db/repair.cc @@ -443,7 +443,8 @@ class Repairer { ro.total_order_seek = true; Arena arena; ScopedArenaPtr iter( - mem->NewIterator(ro, /*seqno_to_time_mapping=*/nullptr, &arena)); + mem->NewIterator(ro, /*seqno_to_time_mapping=*/nullptr, &arena, + /*prefix_extractor=*/nullptr)); int64_t _current_time = 0; immutable_db_options_.clock->GetCurrentTime(&_current_time) .PermitUncheckedError(); // ignore error diff --git a/db/write_batch_test.cc b/db/write_batch_test.cc index 8db8c32a0..d09dd0167 100644 --- a/db/write_batch_test.cc +++ b/db/write_batch_test.cc @@ -59,7 +59,7 @@ static std::string PrintContents(WriteBatch* b, InternalIterator* iter; if (i == 0) { iter = mem->NewIterator(ReadOptions(), /*seqno_to_time_mapping=*/nullptr, - &arena); + &arena, /*prefix_extractor=*/nullptr); arena_iter_guard.reset(iter); } else { iter = mem->NewRangeTombstoneIterator(ReadOptions(), diff --git a/db_stress_tool/db_stress_test_base.cc b/db_stress_tool/db_stress_test_base.cc index 5d4839df8..5a4c310c4 100644 --- a/db_stress_tool/db_stress_test_base.cc +++ b/db_stress_tool/db_stress_test_base.cc @@ -1592,7 +1592,7 @@ Status StressTest::TestIterateImpl(ThreadState* thread, ro.total_order_seek = true; expect_total_order = true; } else if (thread->rand.OneIn(4)) { - ro.total_order_seek = false; + ro.total_order_seek = thread->rand.OneIn(2); ro.auto_prefix_mode = true; expect_total_order = true; } else if (options_.prefix_extractor.get() == nullptr) { diff --git a/include/rocksdb/options.h b/include/rocksdb/options.h index 27feadb80..e272e3a69 100644 --- a/include/rocksdb/options.h +++ b/include/rocksdb/options.h @@ 
-1399,6 +1399,17 @@ struct DBOptions { // eventually be obsolete and removed as Identity files are phased out. bool write_identity_file = true; + // Historically, when prefix_extractor != nullptr, iterators have an + // unfortunate default semantics of *possibly* only returning data + // within the same prefix. To avoid "spooky action at a distance," iterator + // bounds should come from the instantiation or seeking of the iterator, + // not from a mutable column family option. + // + // When set to true, it is as if every iterator is created with + // total_order_seek=true and only auto_prefix_mode=true and + // prefix_same_as_start=true can take advantage of prefix seek optimizations. + bool prefix_seek_opt_in_only = false; + // The number of bytes to prefetch when reading the log. This is mostly useful // for reading a remotely located log, as it can save the number of // round-trips. If 0, then the prefetching is disabled. @@ -1848,10 +1859,10 @@ struct ReadOptions { bool auto_prefix_mode = false; // Enforce that the iterator only iterates over the same prefix as the seek. - // This option is effective only for prefix seeks, i.e. prefix_extractor is - // non-null for the column family and total_order_seek is false. Unlike - // iterate_upper_bound, prefix_same_as_start only works within a prefix - // but in both directions. + // This makes the iterator bounds dependent on the column family's current + // prefix_extractor, which is mutable. When SST files have been built with + // the same prefix extractor, prefix filtering optimizations will be used + // for both Seek and SeekForPrev. bool prefix_same_as_start = false; // Keep the blocks loaded by the iterator pinned in memory as long as the diff --git a/java/rocksjni/write_batch_test.cc b/java/rocksjni/write_batch_test.cc index 53f10998c..ed456acc3 100644 --- a/java/rocksjni/write_batch_test.cc +++ b/java/rocksjni/write_batch_test.cc @@ -60,7 +60,8 @@ jbyteArray Java_org_rocksdb_WriteBatchTest_getContents(JNIEnv* env, ROCKSDB_NAMESPACE::Arena arena; ROCKSDB_NAMESPACE::ScopedArenaPtr iter( mem->NewIterator(ROCKSDB_NAMESPACE::ReadOptions(), - /*seqno_to_time_mapping=*/nullptr, &arena)); + /*seqno_to_time_mapping=*/nullptr, &arena, + /*prefix_extractor=*/nullptr)); for (iter->SeekToFirst(); iter->Valid(); iter->Next()) { ROCKSDB_NAMESPACE::ParsedInternalKey ikey; ikey.clear(); diff --git a/options/db_options.cc b/options/db_options.cc index 0b880f4b9..e9e8fc5b7 100644 --- a/options/db_options.cc +++ b/options/db_options.cc @@ -403,6 +403,10 @@ static std::unordered_map {offsetof(struct ImmutableDBOptions, avoid_unnecessary_blocking_io), OptionType::kBoolean, OptionVerificationType::kNormal, OptionTypeFlags::kNone}}, + {"prefix_seek_opt_in_only", + {offsetof(struct ImmutableDBOptions, prefix_seek_opt_in_only), + OptionType::kBoolean, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, {"write_dbid_to_manifest", {offsetof(struct ImmutableDBOptions, write_dbid_to_manifest), OptionType::kBoolean, OptionVerificationType::kNormal, @@ -774,6 +778,7 @@ ImmutableDBOptions::ImmutableDBOptions(const DBOptions& options) background_close_inactive_wals(options.background_close_inactive_wals), atomic_flush(options.atomic_flush), avoid_unnecessary_blocking_io(options.avoid_unnecessary_blocking_io), + prefix_seek_opt_in_only(options.prefix_seek_opt_in_only), persist_stats_to_disk(options.persist_stats_to_disk), write_dbid_to_manifest(options.write_dbid_to_manifest), write_identity_file(options.write_identity_file), @@ -948,6 +953,8 @@ void 
ImmutableDBOptions::Dump(Logger* log) const { ROCKS_LOG_HEADER(log, " Options.avoid_unnecessary_blocking_io: %d", avoid_unnecessary_blocking_io); + ROCKS_LOG_HEADER(log, " Options.prefix_seek_opt_in_only: %d", + prefix_seek_opt_in_only); ROCKS_LOG_HEADER(log, " Options.persist_stats_to_disk: %u", persist_stats_to_disk); ROCKS_LOG_HEADER(log, " Options.write_dbid_to_manifest: %d", diff --git a/options/db_options.h b/options/db_options.h index 842fefca8..ac76ea40d 100644 --- a/options/db_options.h +++ b/options/db_options.h @@ -87,6 +87,7 @@ struct ImmutableDBOptions { bool background_close_inactive_wals; bool atomic_flush; bool avoid_unnecessary_blocking_io; + bool prefix_seek_opt_in_only; bool persist_stats_to_disk; bool write_dbid_to_manifest; bool write_identity_file; diff --git a/table/block_based/block_based_table_reader.cc b/table/block_based/block_based_table_reader.cc index fe45224d0..0dfe3e38a 100644 --- a/table/block_based/block_based_table_reader.cc +++ b/table/block_based/block_based_table_reader.cc @@ -2063,7 +2063,9 @@ InternalIterator* BlockBasedTable::NewIterator( if (arena == nullptr) { return new BlockBasedTableIterator( this, read_options, rep_->internal_comparator, std::move(index_iter), - !skip_filters && !read_options.total_order_seek && + !skip_filters && + (!read_options.total_order_seek || read_options.auto_prefix_mode || + read_options.prefix_same_as_start) && prefix_extractor != nullptr, need_upper_bound_check, prefix_extractor, caller, compaction_readahead_size, allow_unprepared_value); @@ -2071,7 +2073,9 @@ InternalIterator* BlockBasedTable::NewIterator( auto* mem = arena->AllocateAligned(sizeof(BlockBasedTableIterator)); return new (mem) BlockBasedTableIterator( this, read_options, rep_->internal_comparator, std::move(index_iter), - !skip_filters && !read_options.total_order_seek && + !skip_filters && + (!read_options.total_order_seek || read_options.auto_prefix_mode || + read_options.prefix_same_as_start) && prefix_extractor != nullptr, need_upper_bound_check, prefix_extractor, caller, compaction_readahead_size, allow_unprepared_value); diff --git a/table/plain/plain_table_reader.cc b/table/plain/plain_table_reader.cc index d3c968f73..9d4e8eccf 100644 --- a/table/plain/plain_table_reader.cc +++ b/table/plain/plain_table_reader.cc @@ -201,8 +201,10 @@ InternalIterator* PlainTableReader::NewIterator( assert(table_properties_); // Auto prefix mode is not implemented in PlainTable. 
- bool use_prefix_seek = !IsTotalOrderMode() && !options.total_order_seek && - !options.auto_prefix_mode; + bool use_prefix_seek = + !IsTotalOrderMode() && + (options.prefix_same_as_start || + (!options.total_order_seek && !options.auto_prefix_mode)); if (arena == nullptr) { return new PlainTableIterator(this, use_prefix_seek); } else { diff --git a/table/table_test.cc b/table/table_test.cc index 9eee26761..bb0b70222 100644 --- a/table/table_test.cc +++ b/table/table_test.cc @@ -534,7 +534,7 @@ class MemTableConstructor : public Constructor { const SliceTransform* /*prefix_extractor*/) const override { return new KeyConvertingIterator( memtable_->NewIterator(ReadOptions(), /*seqno_to_time_mapping=*/nullptr, - &arena_), + &arena_, /*prefix_extractor=*/nullptr), true); } @@ -4904,8 +4904,9 @@ TEST_F(MemTableTest, Simple) { std::unique_ptr iter_guard; InternalIterator* iter; if (i == 0) { - iter = GetMemTable()->NewIterator( - ReadOptions(), /*seqno_to_time_mapping=*/nullptr, &arena); + iter = GetMemTable()->NewIterator(ReadOptions(), + /*seqno_to_time_mapping=*/nullptr, + &arena, /*prefix_extractor=*/nullptr); arena_iter_guard.reset(iter); } else { iter = GetMemTable()->NewRangeTombstoneIterator( diff --git a/unreleased_history/bug_fixes/memtable_prefix.md b/unreleased_history/bug_fixes/memtable_prefix.md new file mode 100644 index 000000000..d7b45c65e --- /dev/null +++ b/unreleased_history/bug_fixes/memtable_prefix.md @@ -0,0 +1 @@ +* Fix handling of dynamic change of `prefix_extractor` with memtable prefix filter. Previously, prefix seek could mix different prefix interpretations between memtable and SST files. Now the latest `prefix_extractor` at the time of iterator creation or refresh is respected. diff --git a/unreleased_history/new_features/prefix_seek_opt_in_only.md b/unreleased_history/new_features/prefix_seek_opt_in_only.md new file mode 100644 index 000000000..71c3cd689 --- /dev/null +++ b/unreleased_history/new_features/prefix_seek_opt_in_only.md @@ -0,0 +1 @@ +* Add new option `prefix_seek_opt_in_only` that makes iterators generally safer when you might set a `prefix_extractor`. When `prefix_seek_opt_in_only=true`, which is expected to be the future default, prefix seek is only used when `prefix_same_as_start` or `auto_prefix_mode` are set. Also, `prefix_same_as_start` and `auto_prefix_mode` now allow prefix filtering even with `total_order_seek=true`. 
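A minimal usage sketch of the opt-in mode described above, assuming a DB opened with a three-byte fixed prefix extractor (the path, keys, and prefix length are illustrative, not taken from the patch):

```cpp
#include <cassert>
#include <memory>

#include "rocksdb/db.h"
#include "rocksdb/options.h"
#include "rocksdb/slice_transform.h"

int main() {
  rocksdb::Options options;
  options.create_if_missing = true;
  // Opt-in mode: iterators behave as if total_order_seek=true unless the
  // ReadOptions explicitly request prefix seek.
  options.prefix_seek_opt_in_only = true;
  options.prefix_extractor.reset(rocksdb::NewFixedPrefixTransform(3));

  rocksdb::DB* db = nullptr;
  rocksdb::Status s =
      rocksdb::DB::Open(options, "/tmp/prefix_opt_in_demo", &db);
  assert(s.ok());
  s = db->Put(rocksdb::WriteOptions(), "aaa1", "v1");
  assert(s.ok());
  s = db->Put(rocksdb::WriteOptions(), "bbb1", "v2");
  assert(s.ok());

  // Default ReadOptions: total-order iteration, no prefix filtering applied.
  {
    std::unique_ptr<rocksdb::Iterator> it(
        db->NewIterator(rocksdb::ReadOptions()));
    for (it->SeekToFirst(); it->Valid(); it->Next()) {
      // Visits every key, regardless of prefix.
    }
    assert(it->status().ok());
  }

  // Explicit opt-in for this iterator only: iteration is bounded to keys
  // sharing the prefix of the seek target ("aaa").
  {
    rocksdb::ReadOptions ro;
    ro.prefix_same_as_start = true;
    std::unique_ptr<rocksdb::Iterator> it(db->NewIterator(ro));
    for (it->Seek("aaa1"); it->Valid(); it->Next()) {
      // Only "aaa"-prefixed keys are returned.
    }
    assert(it->status().ok());
  }

  delete db;
  return 0;
}
```

With `prefix_seek_opt_in_only=false` (the current default), the first iterator's behavior would instead depend on the column family's mutable `prefix_extractor`, which is exactly the "spooky action at a distance" the new option is meant to avoid.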
From fbbb08770fc46117d1f056d423cd12cab6fdacdb Mon Sep 17 00:00:00 2001 From: Levi Tamasi Date: Fri, 20 Sep 2024 19:19:06 -0700 Subject: [PATCH 10/17] Update HISTORY.md, version.h, and the format compatibility check script for the 9.7 release (#13027) Summary: Pull Request resolved: https://github.com/facebook/rocksdb/pull/13027 Reviewed By: jowlyzhang Differential Revision: D63158344 Pulled By: ltamasi fbshipit-source-id: e650a0024155d52c7aa2afd0f242b8363071279a --- HISTORY.md | 20 +++++++++++++++++++ include/rocksdb/version.h | 2 +- tools/check_format_compatible.sh | 2 +- ...blob_garbage_collection_force_threshold.md | 1 - unreleased_history/behavior_changes/db_id.md | 1 - .../behavior_changes/fifo-temp-compaction.md | 1 - .../ingest-live-file-with-move.md | 1 - .../behavior_changes/link-file-ingest.md | 1 - .../sst_file_manager_untrack_close.md | 2 -- .../bug_fixes/bug-refit-level.md | 1 - .../bug_fixes/memtable_prefix.md | 1 - .../bug_fixes/wal-error-recovery.md | 1 - .../new_features/customizable_cache.md | 1 - .../new_features/prefix_seek_opt_in_only.md | 1 - .../new_features/tp_largest_seqno.md | 1 - 15 files changed, 22 insertions(+), 15 deletions(-) delete mode 100644 unreleased_history/behavior_changes/blob_garbage_collection_force_threshold.md delete mode 100644 unreleased_history/behavior_changes/db_id.md delete mode 100644 unreleased_history/behavior_changes/fifo-temp-compaction.md delete mode 100644 unreleased_history/behavior_changes/ingest-live-file-with-move.md delete mode 100644 unreleased_history/behavior_changes/link-file-ingest.md delete mode 100644 unreleased_history/behavior_changes/sst_file_manager_untrack_close.md delete mode 100644 unreleased_history/bug_fixes/bug-refit-level.md delete mode 100644 unreleased_history/bug_fixes/memtable_prefix.md delete mode 100644 unreleased_history/bug_fixes/wal-error-recovery.md delete mode 100644 unreleased_history/new_features/customizable_cache.md delete mode 100644 unreleased_history/new_features/prefix_seek_opt_in_only.md delete mode 100644 unreleased_history/new_features/tp_largest_seqno.md diff --git a/HISTORY.md b/HISTORY.md index f3ad192da..d4fc843a2 100644 --- a/HISTORY.md +++ b/HISTORY.md @@ -1,6 +1,26 @@ # Rocksdb Change Log > NOTE: Entries for next release do not go here. Follow instructions in `unreleased_history/README.txt` +## 9.7.0 (09/20/2024) +### New Features +* Make Cache a customizable class that can be instantiated by the object registry. +* Add new option `prefix_seek_opt_in_only` that makes iterators generally safer when you might set a `prefix_extractor`. When `prefix_seek_opt_in_only=true`, which is expected to be the future default, prefix seek is only used when `prefix_same_as_start` or `auto_prefix_mode` are set. Also, `prefix_same_as_start` and `auto_prefix_mode` now allow prefix filtering even with `total_order_seek=true`. +* Add a new table property "rocksdb.key.largest.seqno" which records the largest sequence number of all keys in file. It is verified to be zero during SST file ingestion. + +### Behavior Changes +* Changed the semantics of the BlobDB configuration option `blob_garbage_collection_force_threshold` to define a threshold for the overall garbage ratio of all blob files currently eligible for garbage collection (according to `blob_garbage_collection_age_cutoff`). This can provide better control over space amplification at the cost of slightly higher write amplification. +* Set `write_dbid_to_manifest=true` by default. 
This means DB ID will now be preserved through backups, checkpoints, etc. by default. Also add `write_identity_file` option which can be set to false for anticipated future behavior. +* In FIFO compaction, compactions for changing file temperature (configured by option `file_temperature_age_thresholds`) will compact one file at a time, instead of merging multiple eligible file together (#13018). +* Support ingesting db generated files using hard link, i.e. IngestExternalFileOptions::move_files/link_files and IngestExternalFileOptions::allow_db_generated_files. +* Add a new file ingestion option `IngestExternalFileOptions::link_files` to hard link input files and preserve original files links after ingestion. +* DB::Close now untracks files in SstFileManager, making avaialble any space used +by them. Prior to this change they would be orphaned until the DB is re-opened. + +### Bug Fixes +* Fix a bug in CompactRange() where result files may not be compacted in any future compaction. This can only happen when users configure CompactRangeOptions::change_level to true and the change level step of manual compaction fails (#13009). +* Fix handling of dynamic change of `prefix_extractor` with memtable prefix filter. Previously, prefix seek could mix different prefix interpretations between memtable and SST files. Now the latest `prefix_extractor` at the time of iterator creation or refresh is respected. +* Fix a bug with manual_wal_flush and auto error recovery from WAL failure that may cause CFs to be inconsistent (#12995). The fix will set potential WAL write failure as fatal error when manual_wal_flush is true, and disables auto error recovery from these errors. + ## 9.6.0 (08/19/2024) ### New Features * *Best efforts recovery supports recovering to incomplete Version with a clean seqno cut that presents a valid point in time view from the user's perspective, if versioning history doesn't include atomic flush. diff --git a/include/rocksdb/version.h b/include/rocksdb/version.h index fbbd9765c..2a9b1aff3 100644 --- a/include/rocksdb/version.h +++ b/include/rocksdb/version.h @@ -12,7 +12,7 @@ // NOTE: in 'main' development branch, this should be the *next* // minor or major version number planned for release. #define ROCKSDB_MAJOR 9 -#define ROCKSDB_MINOR 7 +#define ROCKSDB_MINOR 8 #define ROCKSDB_PATCH 0 // Do not use these. 
We made the mistake of declaring macros starting with diff --git a/tools/check_format_compatible.sh b/tools/check_format_compatible.sh index 98b8ca094..16309f18c 100755 --- a/tools/check_format_compatible.sh +++ b/tools/check_format_compatible.sh @@ -125,7 +125,7 @@ EOF # To check for DB forward compatibility with loading options (old version # reading data from new), as well as backward compatibility -declare -a db_forward_with_options_refs=("8.6.fb" "8.7.fb" "8.8.fb" "8.9.fb" "8.10.fb" "8.11.fb" "9.0.fb" "9.1.fb" "9.2.fb" "9.3.fb" "9.4.fb" "9.5.fb" "9.6.fb") +declare -a db_forward_with_options_refs=("8.6.fb" "8.7.fb" "8.8.fb" "8.9.fb" "8.10.fb" "8.11.fb" "9.0.fb" "9.1.fb" "9.2.fb" "9.3.fb" "9.4.fb" "9.5.fb" "9.6.fb" "9.7.fb") # To check for DB forward compatibility without loading options (in addition # to the "with loading options" set), as well as backward compatibility declare -a db_forward_no_options_refs=() # N/A at the moment diff --git a/unreleased_history/behavior_changes/blob_garbage_collection_force_threshold.md b/unreleased_history/behavior_changes/blob_garbage_collection_force_threshold.md deleted file mode 100644 index 0c4b8bba2..000000000 --- a/unreleased_history/behavior_changes/blob_garbage_collection_force_threshold.md +++ /dev/null @@ -1 +0,0 @@ -Changed the semantics of the BlobDB configuration option `blob_garbage_collection_force_threshold` to define a threshold for the overall garbage ratio of all blob files currently eligible for garbage collection (according to `blob_garbage_collection_age_cutoff`). This can provide better control over space amplification at the cost of slightly higher write amplification. diff --git a/unreleased_history/behavior_changes/db_id.md b/unreleased_history/behavior_changes/db_id.md deleted file mode 100644 index 5536f3ce5..000000000 --- a/unreleased_history/behavior_changes/db_id.md +++ /dev/null @@ -1 +0,0 @@ -* Set `write_dbid_to_manifest=true` by default. This means DB ID will now be preserved through backups, checkpoints, etc. by default. Also add `write_identity_file` option which can be set to false for anticipated future behavior. diff --git a/unreleased_history/behavior_changes/fifo-temp-compaction.md b/unreleased_history/behavior_changes/fifo-temp-compaction.md deleted file mode 100644 index 7ba5f643d..000000000 --- a/unreleased_history/behavior_changes/fifo-temp-compaction.md +++ /dev/null @@ -1 +0,0 @@ -* In FIFO compaction, compactions for changing file temperature (configured by option `file_temperature_age_thresholds`) will compact one file at a time, instead of merging multiple eligible file together (#13018). \ No newline at end of file diff --git a/unreleased_history/behavior_changes/ingest-live-file-with-move.md b/unreleased_history/behavior_changes/ingest-live-file-with-move.md deleted file mode 100644 index 444a7a45e..000000000 --- a/unreleased_history/behavior_changes/ingest-live-file-with-move.md +++ /dev/null @@ -1 +0,0 @@ -* Support ingesting db generated files using hard link, i.e. IngestExternalFileOptions::move_files/link_files and IngestExternalFileOptions::allow_db_generated_files. 
\ No newline at end of file diff --git a/unreleased_history/behavior_changes/link-file-ingest.md b/unreleased_history/behavior_changes/link-file-ingest.md deleted file mode 100644 index e9f909eee..000000000 --- a/unreleased_history/behavior_changes/link-file-ingest.md +++ /dev/null @@ -1 +0,0 @@ -* Add a new file ingestion option `IngestExternalFileOptions::link_files` to hard link input files and preserve original files links after ingestion. \ No newline at end of file diff --git a/unreleased_history/behavior_changes/sst_file_manager_untrack_close.md b/unreleased_history/behavior_changes/sst_file_manager_untrack_close.md deleted file mode 100644 index 7251b547e..000000000 --- a/unreleased_history/behavior_changes/sst_file_manager_untrack_close.md +++ /dev/null @@ -1,2 +0,0 @@ -DB::Close now untracks files in SstFileManager, making avaialble any space used -by them. Prior to this change they would be orphaned until the DB is re-opened. diff --git a/unreleased_history/bug_fixes/bug-refit-level.md b/unreleased_history/bug_fixes/bug-refit-level.md deleted file mode 100644 index f41e699e6..000000000 --- a/unreleased_history/bug_fixes/bug-refit-level.md +++ /dev/null @@ -1 +0,0 @@ -* Fix a bug in CompactRange() where result files may not be compacted in any future compaction. This can only happen when users configure CompactRangeOptions::change_level to true and the change level step of manual compaction fails (#13009). \ No newline at end of file diff --git a/unreleased_history/bug_fixes/memtable_prefix.md b/unreleased_history/bug_fixes/memtable_prefix.md deleted file mode 100644 index d7b45c65e..000000000 --- a/unreleased_history/bug_fixes/memtable_prefix.md +++ /dev/null @@ -1 +0,0 @@ -* Fix handling of dynamic change of `prefix_extractor` with memtable prefix filter. Previously, prefix seek could mix different prefix interpretations between memtable and SST files. Now the latest `prefix_extractor` at the time of iterator creation or refresh is respected. diff --git a/unreleased_history/bug_fixes/wal-error-recovery.md b/unreleased_history/bug_fixes/wal-error-recovery.md deleted file mode 100644 index 07d87473d..000000000 --- a/unreleased_history/bug_fixes/wal-error-recovery.md +++ /dev/null @@ -1 +0,0 @@ -* Fix a bug with manual_wal_flush and auto error recovery from WAL failure that may cause CFs to be inconsistent (#12995). The fix will set potential WAL write failure as fatal error when manual_wal_flush is true, and disables auto error recovery from these errors. \ No newline at end of file diff --git a/unreleased_history/new_features/customizable_cache.md b/unreleased_history/new_features/customizable_cache.md deleted file mode 100644 index b50d03c24..000000000 --- a/unreleased_history/new_features/customizable_cache.md +++ /dev/null @@ -1 +0,0 @@ -Make Cache a customizable class that can be instantiated by the object registry. diff --git a/unreleased_history/new_features/prefix_seek_opt_in_only.md b/unreleased_history/new_features/prefix_seek_opt_in_only.md deleted file mode 100644 index 71c3cd689..000000000 --- a/unreleased_history/new_features/prefix_seek_opt_in_only.md +++ /dev/null @@ -1 +0,0 @@ -* Add new option `prefix_seek_opt_in_only` that makes iterators generally safer when you might set a `prefix_extractor`. When `prefix_seek_opt_in_only=true`, which is expected to be the future default, prefix seek is only used when `prefix_same_as_start` or `auto_prefix_mode` are set. 
Also, `prefix_same_as_start` and `auto_prefix_mode` now allow prefix filtering even with `total_order_seek=true`. diff --git a/unreleased_history/new_features/tp_largest_seqno.md b/unreleased_history/new_features/tp_largest_seqno.md deleted file mode 100644 index 6776cfcf2..000000000 --- a/unreleased_history/new_features/tp_largest_seqno.md +++ /dev/null @@ -1 +0,0 @@ -* Add a new table property "rocksdb.key.largest.seqno" which records the largest sequence number of all keys in file. It is verified to be zero during SST file ingestion. \ No newline at end of file From 2a5ff78c12d89c7e23df7ee0899cd9152a6ade7a Mon Sep 17 00:00:00 2001 From: Jay Huh Date: Wed, 25 Sep 2024 10:26:15 -0700 Subject: [PATCH 11/17] More info in CompactionServiceJobInfo and CompactionJobStats (#13029) Summary: Add the following to the `CompactionServiceJobInfo` - compaction_reason - is_full_compaction - is_manual_compaction - bottommost_level Added `is_remote_compaction` to the `CompactionJobStats` and set initial values to avoid UB for uninitialized values. Pull Request resolved: https://github.com/facebook/rocksdb/pull/13029 Test Plan: ``` ./compaction_service_test --gtest_filter="*CompactionInfo*" ``` Reviewed By: anand1976 Differential Revision: D63322878 Pulled By: jaykorean fbshipit-source-id: f02a66ca45e660b9d354a43837d8ec6beb7621fb --- db/compaction/compaction_job_test.cc | 3 +- db/compaction/compaction_service_job.cc | 12 ++- db/compaction/compaction_service_test.cc | 104 ++++++++++++++++++++++- include/rocksdb/compaction_job_stats.h | 59 +++++++------ include/rocksdb/options.h | 17 +++- util/compaction_job_stats_impl.cc | 1 + 6 files changed, 159 insertions(+), 37 deletions(-) diff --git a/db/compaction/compaction_job_test.cc b/db/compaction/compaction_job_test.cc index e286817e6..f8a78bc9b 100644 --- a/db/compaction/compaction_job_test.cc +++ b/db/compaction/compaction_job_test.cc @@ -50,7 +50,8 @@ void VerifyInitializationOfCompactionJobStats( ASSERT_EQ(compaction_job_stats.num_output_records, 0U); ASSERT_EQ(compaction_job_stats.num_output_files, 0U); - ASSERT_EQ(compaction_job_stats.is_manual_compaction, true); + ASSERT_TRUE(compaction_job_stats.is_manual_compaction); + ASSERT_FALSE(compaction_job_stats.is_remote_compaction); ASSERT_EQ(compaction_job_stats.total_input_bytes, 0U); ASSERT_EQ(compaction_job_stats.total_output_bytes, 0U); diff --git a/db/compaction/compaction_service_job.cc b/db/compaction/compaction_service_job.cc index 8a8db3362..b34c7e662 100644 --- a/db/compaction/compaction_service_job.cc +++ b/db/compaction/compaction_service_job.cc @@ -68,8 +68,11 @@ CompactionJob::ProcessKeyValueCompactionWithCompactionService( "[%s] [JOB %d] Starting remote compaction (output level: %d): %s", compaction->column_family_data()->GetName().c_str(), job_id_, compaction_input.output_level, input_files_oss.str().c_str()); - CompactionServiceJobInfo info(dbname_, db_id_, db_session_id_, - GetCompactionId(sub_compact), thread_pri_); + CompactionServiceJobInfo info( + dbname_, db_id_, db_session_id_, GetCompactionId(sub_compact), + thread_pri_, compaction->compaction_reason(), + compaction->is_full_compaction(), compaction->is_manual_compaction(), + compaction->bottommost_level()); CompactionServiceScheduleResponse response = db_options_.compaction_service->Schedule(info, compaction_input_binary); switch (response.status) { @@ -333,6 +336,7 @@ Status CompactionServiceCompactionJob::Run() { // Build compaction result compaction_result_->output_level = compact_->compaction->output_level(); 
compaction_result_->output_path = output_path_; + compaction_result_->stats.is_remote_compaction = true; for (const auto& output_file : sub_compact->GetOutputs()) { auto& meta = output_file.meta; compaction_result_->output_files.emplace_back( @@ -527,6 +531,10 @@ static std::unordered_map {offsetof(struct CompactionJobStats, is_manual_compaction), OptionType::kBoolean, OptionVerificationType::kNormal, OptionTypeFlags::kNone}}, + {"is_remote_compaction", + {offsetof(struct CompactionJobStats, is_remote_compaction), + OptionType::kBoolean, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, {"total_input_bytes", {offsetof(struct CompactionJobStats, total_input_bytes), OptionType::kUInt64T, OptionVerificationType::kNormal, diff --git a/db/compaction/compaction_service_test.cc b/db/compaction/compaction_service_test.cc index 8aacf2b6d..bb53a4029 100644 --- a/db/compaction/compaction_service_test.cc +++ b/db/compaction/compaction_service_test.cc @@ -21,8 +21,10 @@ class MyTestCompactionService : public CompactionService { : db_path_(std::move(db_path)), options_(options), statistics_(statistics), - start_info_("na", "na", "na", 0, Env::TOTAL), - wait_info_("na", "na", "na", 0, Env::TOTAL), + start_info_("na", "na", "na", 0, Env::TOTAL, CompactionReason::kUnknown, + false, false, false), + wait_info_("na", "na", "na", 0, Env::TOTAL, CompactionReason::kUnknown, + false, false, false), listeners_(listeners), table_properties_collector_factories_( std::move(table_properties_collector_factories)) {} @@ -97,8 +99,12 @@ class MyTestCompactionService : public CompactionService { Status s = DB::OpenAndCompact(options, db_path_, db_path_ + "/" + scheduled_job_id, compaction_input, result, options_override); - if (is_override_wait_result_) { - *result = override_wait_result_; + { + InstrumentedMutexLock l(&mutex_); + if (is_override_wait_result_) { + *result = override_wait_result_; + } + result_ = *result; } compaction_num_.fetch_add(1); if (s.ok()) { @@ -141,6 +147,10 @@ class MyTestCompactionService : public CompactionService { void SetCanceled(bool canceled) { canceled_ = canceled; } + void GetResult(CompactionServiceResult* deserialized) { + CompactionServiceResult::Read(result_, deserialized).PermitUncheckedError(); + } + CompactionServiceJobStatus GetFinalCompactionServiceJobStatus() { return final_updated_status_.load(); } @@ -162,6 +172,7 @@ class MyTestCompactionService : public CompactionService { CompactionServiceJobStatus override_wait_status_ = CompactionServiceJobStatus::kFailure; bool is_override_wait_result_ = false; + std::string result_; std::string override_wait_result_; std::vector> listeners_; std::vector> @@ -331,6 +342,14 @@ TEST_F(CompactionServiceTest, BasicCompactions) { ReopenWithColumnFamilies({kDefaultColumnFamilyName, "cf_1", "cf_2", "cf_3"}, options); ASSERT_GT(verify_passed, 0); + CompactionServiceResult result; + my_cs->GetResult(&result); + if (s.IsAborted()) { + ASSERT_NOK(result.status); + } else { + ASSERT_OK(result.status); + } + ASSERT_TRUE(result.stats.is_remote_compaction); Close(); } @@ -369,6 +388,12 @@ TEST_F(CompactionServiceTest, ManualCompaction) { ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr)); ASSERT_GE(my_cs->GetCompactionNum(), comp_num + 1); VerifyTestData(); + + CompactionServiceResult result; + my_cs->GetResult(&result); + ASSERT_OK(result.status); + ASSERT_TRUE(result.stats.is_manual_compaction); + ASSERT_TRUE(result.stats.is_remote_compaction); } TEST_F(CompactionServiceTest, CancelCompactionOnRemoteSide) { @@ -601,11 
+626,20 @@ TEST_F(CompactionServiceTest, CompactionInfo) { {file.db_path + "/" + file.name}, 2)); info = my_cs->GetCompactionInfoForStart(); ASSERT_EQ(Env::USER, info.priority); + ASSERT_EQ(CompactionReason::kManualCompaction, info.compaction_reason); + ASSERT_EQ(true, info.is_manual_compaction); + ASSERT_EQ(false, info.is_full_compaction); + ASSERT_EQ(true, info.bottommost_level); info = my_cs->GetCompactionInfoForWait(); ASSERT_EQ(Env::USER, info.priority); + ASSERT_EQ(CompactionReason::kManualCompaction, info.compaction_reason); + ASSERT_EQ(true, info.is_manual_compaction); + ASSERT_EQ(false, info.is_full_compaction); + ASSERT_EQ(true, info.bottommost_level); // Test priority BOTTOM env_->SetBackgroundThreads(1, Env::BOTTOM); + // This will set bottommost_level = true but is_full_compaction = false options.num_levels = 2; ReopenWithCompactionService(&options); my_cs = @@ -628,9 +662,71 @@ TEST_F(CompactionServiceTest, CompactionInfo) { } ASSERT_OK(dbfull()->TEST_WaitForCompact()); info = my_cs->GetCompactionInfoForStart(); + ASSERT_EQ(CompactionReason::kLevelL0FilesNum, info.compaction_reason); + ASSERT_EQ(false, info.is_manual_compaction); + ASSERT_EQ(false, info.is_full_compaction); + ASSERT_EQ(true, info.bottommost_level); ASSERT_EQ(Env::BOTTOM, info.priority); info = my_cs->GetCompactionInfoForWait(); ASSERT_EQ(Env::BOTTOM, info.priority); + ASSERT_EQ(CompactionReason::kLevelL0FilesNum, info.compaction_reason); + ASSERT_EQ(false, info.is_manual_compaction); + ASSERT_EQ(false, info.is_full_compaction); + ASSERT_EQ(true, info.bottommost_level); + + // Test Non-Bottommost Level + options.num_levels = 4; + ReopenWithCompactionService(&options); + my_cs = + static_cast_with_check(GetCompactionService()); + + for (int i = 0; i < options.level0_file_num_compaction_trigger; i++) { + for (int j = 0; j < 10; j++) { + int key_id = i * 10 + j; + ASSERT_OK(Put(Key(key_id), "value_new_new" + std::to_string(key_id))); + } + ASSERT_OK(Flush()); + } + + ASSERT_OK(dbfull()->TEST_WaitForCompact()); + info = my_cs->GetCompactionInfoForStart(); + ASSERT_EQ(false, info.is_manual_compaction); + ASSERT_EQ(false, info.is_full_compaction); + ASSERT_EQ(false, info.bottommost_level); + info = my_cs->GetCompactionInfoForWait(); + ASSERT_EQ(false, info.is_manual_compaction); + ASSERT_EQ(false, info.is_full_compaction); + ASSERT_EQ(false, info.bottommost_level); + + // Test Full Compaction + Bottommost Level + options.num_levels = 6; + ReopenWithCompactionService(&options); + my_cs = + static_cast_with_check(GetCompactionService()); + + for (int i = 0; i < 20; i++) { + for (int j = 0; j < 10; j++) { + int key_id = i * 10 + j; + ASSERT_OK(Put(Key(key_id), "value_new_new" + std::to_string(key_id))); + } + ASSERT_OK(Flush()); + } + + CompactRangeOptions cro; + cro.bottommost_level_compaction = BottommostLevelCompaction::kForce; + ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr)); + + ASSERT_OK(dbfull()->TEST_WaitForCompact()); + info = my_cs->GetCompactionInfoForStart(); + ASSERT_EQ(true, info.is_manual_compaction); + ASSERT_EQ(true, info.is_full_compaction); + ASSERT_EQ(true, info.bottommost_level); + ASSERT_EQ(CompactionReason::kManualCompaction, info.compaction_reason); + info = my_cs->GetCompactionInfoForWait(); + ASSERT_EQ(true, info.is_manual_compaction); + ASSERT_EQ(true, info.is_full_compaction); + ASSERT_EQ(true, info.bottommost_level); + ASSERT_EQ(CompactionReason::kManualCompaction, info.compaction_reason); } TEST_F(CompactionServiceTest, FallbackLocalAuto) { diff --git 
a/include/rocksdb/compaction_job_stats.h b/include/rocksdb/compaction_job_stats.h index 7e8153044..8ddb6a330 100644 --- a/include/rocksdb/compaction_job_stats.h +++ b/include/rocksdb/compaction_job_stats.h @@ -19,80 +19,83 @@ struct CompactionJobStats { void Add(const CompactionJobStats& stats); // the elapsed time of this compaction in microseconds. - uint64_t elapsed_micros; + uint64_t elapsed_micros = 0; // the elapsed CPU time of this compaction in microseconds. - uint64_t cpu_micros; + uint64_t cpu_micros = 0; // Used internally indicating whether a subcompaction's // `num_input_records` is accurate. - bool has_num_input_records; + bool has_num_input_records = false; // the number of compaction input records. - uint64_t num_input_records; + uint64_t num_input_records = 0; // the number of blobs read from blob files - uint64_t num_blobs_read; + uint64_t num_blobs_read = 0; // the number of compaction input files (table files) - size_t num_input_files; + size_t num_input_files = 0; // the number of compaction input files at the output level (table files) - size_t num_input_files_at_output_level; + size_t num_input_files_at_output_level = 0; // the number of compaction output records. - uint64_t num_output_records; + uint64_t num_output_records = 0; // the number of compaction output files (table files) - size_t num_output_files; + size_t num_output_files = 0; // the number of compaction output files (blob files) - size_t num_output_files_blob; + size_t num_output_files_blob = 0; // true if the compaction is a full compaction (all live SST files input) - bool is_full_compaction; + bool is_full_compaction = false; // true if the compaction is a manual compaction - bool is_manual_compaction; + bool is_manual_compaction = false; + // true if the compaction ran in a remote worker + bool is_remote_compaction = false; // the total size of table files in the compaction input - uint64_t total_input_bytes; + uint64_t total_input_bytes = 0; // the total size of blobs read from blob files - uint64_t total_blob_bytes_read; + uint64_t total_blob_bytes_read = 0; // the total size of table files in the compaction output - uint64_t total_output_bytes; + uint64_t total_output_bytes = 0; // the total size of blob files in the compaction output - uint64_t total_output_bytes_blob; + uint64_t total_output_bytes_blob = 0; + ; // number of records being replaced by newer record associated with same key. // this could be a new value or a deletion entry for that key so this field // sums up all updated and deleted keys - uint64_t num_records_replaced; + uint64_t num_records_replaced = 0; // the sum of the uncompressed input keys in bytes. - uint64_t total_input_raw_key_bytes; + uint64_t total_input_raw_key_bytes = 0; // the sum of the uncompressed input values in bytes. - uint64_t total_input_raw_value_bytes; + uint64_t total_input_raw_value_bytes = 0; // the number of deletion entries before compaction. Deletion entries // can disappear after compaction because they expired - uint64_t num_input_deletion_records; + uint64_t num_input_deletion_records = 0; // number of deletion records that were found obsolete and discarded // because it is not possible to delete any more keys with this entry // (i.e. all possible deletions resulting from it have been completed) - uint64_t num_expired_deletion_records; + uint64_t num_expired_deletion_records = 0; // number of corrupt keys (ParseInternalKey returned false when applied to // the key) encountered and written out. 
- uint64_t num_corrupt_keys; + uint64_t num_corrupt_keys = 0; // Following counters are only populated if // options.report_bg_io_stats = true; // Time spent on file's Append() call. - uint64_t file_write_nanos; + uint64_t file_write_nanos = 0; // Time spent on sync file range. - uint64_t file_range_sync_nanos; + uint64_t file_range_sync_nanos = 0; // Time spent on file fsync. - uint64_t file_fsync_nanos; + uint64_t file_fsync_nanos = 0; // Time spent on preparing file write (fallocate, etc) - uint64_t file_prepare_write_nanos; + uint64_t file_prepare_write_nanos = 0; // 0-terminated strings storing the first 8 bytes of the smallest and // largest key in the output. @@ -102,10 +105,10 @@ struct CompactionJobStats { std::string largest_output_key_prefix; // number of single-deletes which do not meet a put - uint64_t num_single_del_fallthru; + uint64_t num_single_del_fallthru = 0; // number of single-deletes which meet something other than a put - uint64_t num_single_del_mismatch; + uint64_t num_single_del_mismatch = 0; // TODO: Add output_to_penultimate_level output information }; diff --git a/include/rocksdb/options.h b/include/rocksdb/options.h index e272e3a69..9700e25af 100644 --- a/include/rocksdb/options.h +++ b/include/rocksdb/options.h @@ -466,14 +466,27 @@ struct CompactionServiceJobInfo { Env::Priority priority; + // Additional Compaction Details that can be useful in the CompactionService + CompactionReason compaction_reason; + bool is_full_compaction; + bool is_manual_compaction; + bool bottommost_level; + CompactionServiceJobInfo(std::string db_name_, std::string db_id_, std::string db_session_id_, uint64_t job_id_, - Env::Priority priority_) + Env::Priority priority_, + CompactionReason compaction_reason_, + bool is_full_compaction_, bool is_manual_compaction_, + bool bottommost_level_) : db_name(std::move(db_name_)), db_id(std::move(db_id_)), db_session_id(std::move(db_session_id_)), job_id(job_id_), - priority(priority_) {} + priority(priority_), + compaction_reason(compaction_reason_), + is_full_compaction(is_full_compaction_), + is_manual_compaction(is_manual_compaction_), + bottommost_level(bottommost_level_) {} }; struct CompactionServiceScheduleResponse { diff --git a/util/compaction_job_stats_impl.cc b/util/compaction_job_stats_impl.cc index cdb591f23..37e39987e 100644 --- a/util/compaction_job_stats_impl.cc +++ b/util/compaction_job_stats_impl.cc @@ -24,6 +24,7 @@ void CompactionJobStats::Reset() { is_full_compaction = false; is_manual_compaction = false; + is_remote_compaction = false; total_input_bytes = 0; total_blob_bytes_read = 0; From d02f63cc5458231aaf4ff11266a3055127219e60 Mon Sep 17 00:00:00 2001 From: anand76 Date: Wed, 25 Sep 2024 11:45:51 -0700 Subject: [PATCH 12/17] Respect lowest_used_cache_tier option for compressed blocks (#13030) Summary: If the lowest_used_cache_tier DB option is set to kVolatileTier, skip insertion of compressed blocks into the secondary cache. Previously, these were always inserted into the secondary cache via the InsertSaved() method, leading to pollution of the secondary cache with blocks that would never be read. 
Pull Request resolved: https://github.com/facebook/rocksdb/pull/13030 Test Plan: Add a new unit test Reviewed By: pdillinger Differential Revision: D63329841 Pulled By: anand1976 fbshipit-source-id: 14d2fce2ed309401d9ad4d2e7c356218b6673f7b --- cache/secondary_cache_adapter.cc | 3 +- cache/tiered_secondary_cache_test.cc | 48 +++++++++++++++++++ .../skip_insertion_tiered_sec_cache.md | 1 + 3 files changed, 51 insertions(+), 1 deletion(-) create mode 100644 unreleased_history/bug_fixes/skip_insertion_tiered_sec_cache.md diff --git a/cache/secondary_cache_adapter.cc b/cache/secondary_cache_adapter.cc index 9c1b96010..57a77bc7f 100644 --- a/cache/secondary_cache_adapter.cc +++ b/cache/secondary_cache_adapter.cc @@ -271,7 +271,8 @@ Status CacheWithSecondaryAdapter::Insert(const Slice& key, ObjectPtr value, // Warm up the secondary cache with the compressed block. The secondary // cache may choose to ignore it based on the admission policy. if (value != nullptr && !compressed_value.empty() && - adm_policy_ == TieredAdmissionPolicy::kAdmPolicyThreeQueue) { + adm_policy_ == TieredAdmissionPolicy::kAdmPolicyThreeQueue && + helper->IsSecondaryCacheCompatible()) { Status status = secondary_cache_->InsertSaved(key, compressed_value, type); assert(status.ok() || status.IsNotSupported()); } diff --git a/cache/tiered_secondary_cache_test.cc b/cache/tiered_secondary_cache_test.cc index e4b420226..f38a358ad 100644 --- a/cache/tiered_secondary_cache_test.cc +++ b/cache/tiered_secondary_cache_test.cc @@ -765,6 +765,54 @@ TEST_F(DBTieredSecondaryCacheTest, IterateTest) { Destroy(options); } +TEST_F(DBTieredSecondaryCacheTest, VolatileTierTest) { + if (!LZ4_Supported()) { + ROCKSDB_GTEST_SKIP("This test requires LZ4 support."); + return; + } + + BlockBasedTableOptions table_options; + // We want a block cache of size 5KB, and a compressed secondary cache of + // size 5KB. However, we specify a block cache size of 256KB here in order + // to take into account the cache reservation in the block cache on + // behalf of the compressed cache. The unit of cache reservation is 256KB. + // The effective block cache capacity will be calculated as 256 + 5 = 261KB, + // and 256KB will be reserved for the compressed cache, leaving 5KB for + // the primary block cache. We only have to worry about this here because + // the cache size is so small. + table_options.block_cache = NewCache(256 * 1024, 5 * 1024, 256 * 1024); + table_options.block_size = 4 * 1024; + table_options.cache_index_and_filter_blocks = false; + Options options = GetDefaultOptions(); + options.create_if_missing = true; + options.compression = kLZ4Compression; + options.table_factory.reset(NewBlockBasedTableFactory(table_options)); + + // Disable paranoid_file_checks so that flush will not read back the newly + // written file + options.paranoid_file_checks = false; + options.lowest_used_cache_tier = CacheTier::kVolatileTier; + DestroyAndReopen(options); + Random rnd(301); + const int N = 256; + for (int i = 0; i < N; i++) { + std::string p_v; + test::CompressibleString(&rnd, 0.5, 1007, &p_v); + ASSERT_OK(Put(Key(i), p_v)); + } + + ASSERT_OK(Flush()); + + // Since lowest_used_cache_tier is the volatile tier, nothing should be + // inserted in the secondary cache. 
+ std::string v = Get(Key(0)); + ASSERT_EQ(1007, v.size()); + ASSERT_EQ(nvm_sec_cache()->num_insert_saved(), 0u); + ASSERT_EQ(nvm_sec_cache()->num_misses(), 0u); + + Destroy(options); +} + class DBTieredAdmPolicyTest : public DBTieredSecondaryCacheTest, public testing::WithParamInterface {}; diff --git a/unreleased_history/bug_fixes/skip_insertion_tiered_sec_cache.md b/unreleased_history/bug_fixes/skip_insertion_tiered_sec_cache.md new file mode 100644 index 000000000..7dcbe099f --- /dev/null +++ b/unreleased_history/bug_fixes/skip_insertion_tiered_sec_cache.md @@ -0,0 +1 @@ +Skip insertion of compressed blocks in the secondary cache if the lowest_used_cache_tier DB option is kVolatileTier. From d327d560812be6da0fde11ada256f7088775516d Mon Sep 17 00:00:00 2001 From: Jay Huh Date: Wed, 25 Sep 2024 15:32:22 -0700 Subject: [PATCH 13/17] Remove unnecessary semi-colon (#13034) Summary: As title Pull Request resolved: https://github.com/facebook/rocksdb/pull/13034 Reviewed By: ltamasi Differential Revision: D63413712 Pulled By: jaykorean fbshipit-source-id: 0070761b0d9de1f50fe0baf235643d36aeb9f7f5 --- include/rocksdb/compaction_job_stats.h | 1 - 1 file changed, 1 deletion(-) diff --git a/include/rocksdb/compaction_job_stats.h b/include/rocksdb/compaction_job_stats.h index 8ddb6a330..f9c1bb3ec 100644 --- a/include/rocksdb/compaction_job_stats.h +++ b/include/rocksdb/compaction_job_stats.h @@ -58,7 +58,6 @@ struct CompactionJobStats { uint64_t total_output_bytes = 0; // the total size of blob files in the compaction output uint64_t total_output_bytes_blob = 0; - ; // number of records being replaced by newer record associated with same key. // this could be a new value or a deletion entry for that key so this field From 22105366d93ce3e6290282eec60529cde2f3e8d3 Mon Sep 17 00:00:00 2001 From: anand76 Date: Wed, 25 Sep 2024 17:47:40 -0700 Subject: [PATCH 14/17] More accurate accounting of compressed cache memory (#13032) Summary: When an item is inserted into the compressed secondary cache, this PR calculates the charge using the malloc_usable_size of the allocated memory, as well as the unique pointer allocation. 
Pull Request resolved: https://github.com/facebook/rocksdb/pull/13032 Test Plan: New unit test Reviewed By: pdillinger Differential Revision: D63418493 Pulled By: anand1976 fbshipit-source-id: 1db2835af6867442bb8cf6d9bf412e120ddd3824 --- cache/compressed_secondary_cache.cc | 46 +++++++++++++++---- cache/compressed_secondary_cache.h | 2 + cache/compressed_secondary_cache_test.cc | 4 ++ .../compressed_secondary_cache_account.md | 1 + 4 files changed, 45 insertions(+), 8 deletions(-) create mode 100644 unreleased_history/bug_fixes/compressed_secondary_cache_account.md diff --git a/cache/compressed_secondary_cache.cc b/cache/compressed_secondary_cache.cc index ef2417f8d..4d3d0a2cd 100644 --- a/cache/compressed_secondary_cache.cc +++ b/cache/compressed_secondary_cache.cc @@ -79,7 +79,11 @@ std::unique_ptr CompressedSecondaryCache::Lookup( data_ptr = GetVarint32Ptr(data_ptr, data_ptr + 1, static_cast(&source_32)); source = static_cast(source_32); - handle_value_charge -= (data_ptr - ptr->get()); + uint64_t data_size = 0; + data_ptr = GetVarint64Ptr(data_ptr, ptr->get() + handle_value_charge, + static_cast(&data_size)); + assert(handle_value_charge > data_size); + handle_value_charge = data_size; } MemoryAllocator* allocator = cache_options_.memory_allocator.get(); @@ -169,13 +173,15 @@ Status CompressedSecondaryCache::InsertInternal( } auto internal_helper = GetHelper(cache_options_.enable_custom_split_merge); - char header[10]; + char header[20]; char* payload = header; payload = EncodeVarint32(payload, static_cast(type)); payload = EncodeVarint32(payload, static_cast(source)); + size_t data_size = (*helper->size_cb)(value); + char* data_size_ptr = payload; + payload = EncodeVarint64(payload, data_size); size_t header_size = payload - header; - size_t data_size = (*helper->size_cb)(value); size_t total_size = data_size + header_size; CacheAllocationPtr ptr = AllocateBlock(total_size, cache_options_.memory_allocator.get()); @@ -210,6 +216,8 @@ Status CompressedSecondaryCache::InsertInternal( val = Slice(compressed_val); data_size = compressed_val.size(); + payload = EncodeVarint64(data_size_ptr, data_size); + header_size = payload - header; total_size = header_size + data_size; PERF_COUNTER_ADD(compressed_sec_cache_compressed_bytes, data_size); @@ -222,14 +230,21 @@ Status CompressedSecondaryCache::InsertInternal( PERF_COUNTER_ADD(compressed_sec_cache_insert_real_count, 1); if (cache_options_.enable_custom_split_merge) { - size_t charge{0}; - CacheValueChunk* value_chunks_head = - SplitValueIntoChunks(val, cache_options_.compression_type, charge); - return cache_->Insert(key, value_chunks_head, internal_helper, charge); + size_t split_charge{0}; + CacheValueChunk* value_chunks_head = SplitValueIntoChunks( + val, cache_options_.compression_type, split_charge); + return cache_->Insert(key, value_chunks_head, internal_helper, + split_charge); } else { +#ifdef ROCKSDB_MALLOC_USABLE_SIZE + size_t charge = malloc_usable_size(ptr.get()); +#else + size_t charge = total_size; +#endif std::memcpy(ptr.get(), header, header_size); CacheAllocationPtr* buf = new CacheAllocationPtr(std::move(ptr)); - return cache_->Insert(key, buf, internal_helper, total_size); + charge += sizeof(CacheAllocationPtr); + return cache_->Insert(key, buf, internal_helper, charge); } } @@ -398,6 +413,21 @@ const Cache::CacheItemHelper* CompressedSecondaryCache::GetHelper( } } +size_t CompressedSecondaryCache::TEST_GetCharge(const Slice& key) { + Cache::Handle* lru_handle = cache_->Lookup(key); + if (lru_handle == nullptr) { + return 
0; + } + + size_t charge = cache_->GetCharge(lru_handle); + if (cache_->Value(lru_handle) != nullptr && + !cache_options_.enable_custom_split_merge) { + charge -= 10; + } + cache_->Release(lru_handle, /*erase_if_last_ref=*/false); + return charge; +} + std::shared_ptr CompressedSecondaryCacheOptions::MakeSharedSecondaryCache() const { return std::make_shared(*this); diff --git a/cache/compressed_secondary_cache.h b/cache/compressed_secondary_cache.h index 90e134fcf..45eab656e 100644 --- a/cache/compressed_secondary_cache.h +++ b/cache/compressed_secondary_cache.h @@ -139,6 +139,8 @@ class CompressedSecondaryCache : public SecondaryCache { const Cache::CacheItemHelper* helper, CompressionType type, CacheTier source); + size_t TEST_GetCharge(const Slice& key); + // TODO: clean up to use cleaner interfaces in typed_cache.h const Cache::CacheItemHelper* GetHelper(bool enable_custom_split_merge) const; std::shared_ptr cache_; diff --git a/cache/compressed_secondary_cache_test.cc b/cache/compressed_secondary_cache_test.cc index 058a80dd7..071601783 100644 --- a/cache/compressed_secondary_cache_test.cc +++ b/cache/compressed_secondary_cache_test.cc @@ -39,6 +39,8 @@ class CompressedSecondaryCacheTestBase : public testing::Test, protected: void BasicTestHelper(std::shared_ptr sec_cache, bool sec_cache_is_compressed) { + CompressedSecondaryCache* comp_sec_cache = + static_cast(sec_cache.get()); get_perf_context()->Reset(); bool kept_in_sec_cache{true}; // Lookup an non-existent key. @@ -66,6 +68,8 @@ class CompressedSecondaryCacheTestBase : public testing::Test, ASSERT_OK(sec_cache->Insert(key1, &item1, GetHelper(), false)); ASSERT_EQ(get_perf_context()->compressed_sec_cache_insert_real_count, 1); + ASSERT_GT(comp_sec_cache->TEST_GetCharge(key1), 1000); + std::unique_ptr handle1_2 = sec_cache->Lookup(key1, GetHelper(), this, true, /*advise_erase=*/true, /*stats=*/nullptr, kept_in_sec_cache); diff --git a/unreleased_history/bug_fixes/compressed_secondary_cache_account.md b/unreleased_history/bug_fixes/compressed_secondary_cache_account.md new file mode 100644 index 000000000..07c73b85e --- /dev/null +++ b/unreleased_history/bug_fixes/compressed_secondary_cache_account.md @@ -0,0 +1 @@ +Fix under counting of allocated memory in the compressed secondary cache due to looking at the compressed block size rather than the actual memory allocated, which could be larger due to internal fragmentation. From 79790cf2a80fb5e5b6799ebd69d3fb2ebe71d612 Mon Sep 17 00:00:00 2001 From: Peter Dillinger Date: Thu, 26 Sep 2024 14:36:29 -0700 Subject: [PATCH 15/17] Bug fix and test BuildDBOptions (#13038) Summary: The following DBOptions were not being propagated through BuildDBOptions, which could at least lead to settings being lost through `GetOptionsFromString()`, possibly elsewhere as well: * background_close_inactive_wals * write_dbid_to_manifest * write_identity_file * prefix_seek_opt_in_only Pull Request resolved: https://github.com/facebook/rocksdb/pull/13038 Test Plan: This problem was not being caught by OptionsSettableTest.DBOptionsAllFieldsSettable when the option was omitted from both options_helper.cc and options_settable_test.cc. I have added to the test to catch future instances (and the updated test was how I found three of the four missing options). The same kind of bug seems to be caught by ColumnFamilyOptionsAllFieldsSettable, and AFAIK analogous code does not exist for BlockBasedTableOptions. 
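Illustrative sketch, not part of this patch, of the detection idea behind DBOptionsAllFieldsSettable: fill the raw storage of an options struct with a sentinel byte, run the code that is supposed to assign every field, and count the sentinel bytes that survive. The ToyOptions struct, its field names, and the 0xBE sentinel below are stand-ins invented for the example; the real test additionally excludes padding and non-trivially-copyable members, which this toy sidesteps by using only uint64_t fields.

#include <cstdint>
#include <cstdio>
#include <cstring>
#include <new>

struct ToyOptions {  // stand-in for DBOptions; all fields 8 bytes, no padding
  uint64_t write_dbid_to_manifest = 0;
  uint64_t write_identity_file = 0;
  uint64_t prefix_seek_opt_in_only = 0;  // deliberately "forgotten" below
};

static size_t CountSentinelBytes(const unsigned char* p, size_t n,
                                 unsigned char sentinel) {
  size_t count = 0;
  for (size_t i = 0; i < n; ++i) {
    if (p[i] == sentinel) {
      ++count;
    }
  }
  return count;
}

int main() {
  constexpr unsigned char kSentinel = 0xBE;
  alignas(ToyOptions) unsigned char storage[sizeof(ToyOptions)];
  ToyOptions* opts = new (storage) ToyOptions;
  // Like FillWithSpecialChar: blast the object's bytes with the sentinel.
  std::memset(storage, kSentinel, sizeof(storage));

  // Stand-in for BuildDBOptions / options-string parsing that forgets to
  // assign prefix_seek_opt_in_only.
  opts->write_dbid_to_manifest = 1;
  opts->write_identity_file = 1;

  size_t leftover = CountSentinelBytes(storage, sizeof(storage), kSentinel);
  std::printf("unset bytes: %zu\n", leftover);  // prints 8: the missed field
  return leftover == 0 ? 0 : 1;
}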
Reviewed By: ltamasi Differential Revision: D63483779 Pulled By: pdillinger fbshipit-source-id: a5d5f6e434174bacb8e5d251b767e81e62b7225a --- options/options_helper.cc | 19 +++++++++++++++---- options/options_helper.h | 4 ++++ options/options_settable_test.cc | 12 +++++++++++- .../bug_fixes/build_db_options.md | 1 + 4 files changed, 31 insertions(+), 5 deletions(-) create mode 100644 unreleased_history/bug_fixes/build_db_options.md diff --git a/options/options_helper.cc b/options/options_helper.cc index 011f47b98..232b3f3bd 100644 --- a/options/options_helper.cc +++ b/options/options_helper.cc @@ -55,7 +55,13 @@ Status ValidateOptions(const DBOptions& db_opts, DBOptions BuildDBOptions(const ImmutableDBOptions& immutable_db_options, const MutableDBOptions& mutable_db_options) { DBOptions options; + BuildDBOptions(immutable_db_options, mutable_db_options, options); + return options; +} +void BuildDBOptions(const ImmutableDBOptions& immutable_db_options, + const MutableDBOptions& mutable_db_options, + DBOptions& options) { options.create_if_missing = immutable_db_options.create_if_missing; options.create_missing_column_families = immutable_db_options.create_missing_column_families; @@ -88,9 +94,6 @@ DBOptions BuildDBOptions(const ImmutableDBOptions& immutable_db_options, options.max_background_jobs = mutable_db_options.max_background_jobs; options.max_background_compactions = mutable_db_options.max_background_compactions; - options.bytes_per_sync = mutable_db_options.bytes_per_sync; - options.wal_bytes_per_sync = mutable_db_options.wal_bytes_per_sync; - options.strict_bytes_per_sync = mutable_db_options.strict_bytes_per_sync; options.max_subcompactions = mutable_db_options.max_subcompactions; options.max_background_flushes = mutable_db_options.max_background_flushes; options.max_log_file_size = immutable_db_options.max_log_file_size; @@ -127,6 +130,9 @@ DBOptions BuildDBOptions(const ImmutableDBOptions& immutable_db_options, options.writable_file_max_buffer_size = mutable_db_options.writable_file_max_buffer_size; options.use_adaptive_mutex = immutable_db_options.use_adaptive_mutex; + options.bytes_per_sync = mutable_db_options.bytes_per_sync; + options.wal_bytes_per_sync = mutable_db_options.wal_bytes_per_sync; + options.strict_bytes_per_sync = mutable_db_options.strict_bytes_per_sync; options.listeners = immutable_db_options.listeners; options.enable_thread_tracking = immutable_db_options.enable_thread_tracking; options.delayed_write_rate = mutable_db_options.delayed_write_rate; @@ -161,9 +167,15 @@ DBOptions BuildDBOptions(const ImmutableDBOptions& immutable_db_options, options.two_write_queues = immutable_db_options.two_write_queues; options.manual_wal_flush = immutable_db_options.manual_wal_flush; options.wal_compression = immutable_db_options.wal_compression; + options.background_close_inactive_wals = + immutable_db_options.background_close_inactive_wals; options.atomic_flush = immutable_db_options.atomic_flush; options.avoid_unnecessary_blocking_io = immutable_db_options.avoid_unnecessary_blocking_io; + options.write_dbid_to_manifest = immutable_db_options.write_dbid_to_manifest; + options.write_identity_file = immutable_db_options.write_identity_file; + options.prefix_seek_opt_in_only = + immutable_db_options.prefix_seek_opt_in_only; options.log_readahead_size = immutable_db_options.log_readahead_size; options.file_checksum_gen_factory = immutable_db_options.file_checksum_gen_factory; @@ -189,7 +201,6 @@ DBOptions BuildDBOptions(const ImmutableDBOptions& immutable_db_options, 
options.metadata_write_temperature = immutable_db_options.metadata_write_temperature; options.wal_write_temperature = immutable_db_options.wal_write_temperature; - return options; } ColumnFamilyOptions BuildColumnFamilyOptions( diff --git a/options/options_helper.h b/options/options_helper.h index 679a1a7ee..7519b627d 100644 --- a/options/options_helper.h +++ b/options/options_helper.h @@ -44,6 +44,10 @@ Status ValidateOptions(const DBOptions& db_opts, DBOptions BuildDBOptions(const ImmutableDBOptions& immutable_db_options, const MutableDBOptions& mutable_db_options); +// Overwrites `options` +void BuildDBOptions(const ImmutableDBOptions& immutable_db_options, + const MutableDBOptions& mutable_db_options, + DBOptions& options); ColumnFamilyOptions BuildColumnFamilyOptions( const ColumnFamilyOptions& ioptions, diff --git a/options/options_settable_test.cc b/options/options_settable_test.cc index 67aab055e..80021444f 100644 --- a/options/options_settable_test.cc +++ b/options/options_settable_test.cc @@ -271,6 +271,12 @@ TEST_F(OptionsSettableTest, DBOptionsAllFieldsSettable) { ASSERT_GT(unset_bytes_base, 0); options->~DBOptions(); + // Now also check that BuildDBOptions populates everything + FillWithSpecialChar(options_ptr, sizeof(DBOptions), kDBOptionsExcluded); + BuildDBOptions({}, {}, *options); + ASSERT_EQ(unset_bytes_base, + NumUnsetBytes(options_ptr, sizeof(DBOptions), kDBOptionsExcluded)); + options = new (options_ptr) DBOptions(); FillWithSpecialChar(options_ptr, sizeof(DBOptions), kDBOptionsExcluded); @@ -372,7 +378,11 @@ TEST_F(OptionsSettableTest, DBOptionsAllFieldsSettable) { "follower_catchup_retry_count=456;" "follower_catchup_retry_wait_ms=789;" "metadata_write_temperature=kCold;" - "wal_write_temperature=kHot;", + "wal_write_temperature=kHot;" + "background_close_inactive_wals=true;" + "write_dbid_to_manifest=true;" + "write_identity_file=true;" + "prefix_seek_opt_in_only=true;", new_options)); ASSERT_EQ(unset_bytes_base, NumUnsetBytes(new_options_ptr, sizeof(DBOptions), diff --git a/unreleased_history/bug_fixes/build_db_options.md b/unreleased_history/bug_fixes/build_db_options.md new file mode 100644 index 000000000..6994ea719 --- /dev/null +++ b/unreleased_history/bug_fixes/build_db_options.md @@ -0,0 +1 @@ +* Several DB option settings could be lost through `GetOptionsFromString()`, possibly elsewhere as well. Affected options, now fixed:`background_close_inactive_wals`, `write_dbid_to_manifest`, `write_identity_file`, `prefix_seek_opt_in_only` From 2c2776f1f396e0f23dd4d484b871144a2972d787 Mon Sep 17 00:00:00 2001 From: Yu Zhang Date: Fri, 27 Sep 2024 14:53:53 -0700 Subject: [PATCH 16/17] Fix some missing values in stress test (#13039) Summary: When `avoid_flush_during_shutdown` is false, DB will flush the memtables if there is some unpersisted data: https://github.com/facebook/rocksdb/blob/79790cf2a80fb5e5b6799ebd69d3fb2ebe71d612/db/db_impl/db_impl.cc#L505-L510 `has_unpersisted_data_` is a flag that is only turned on for when WAL is disabled, for example: https://github.com/facebook/rocksdb/blob/79790cf2a80fb5e5b6799ebd69d3fb2ebe71d612/db/db_impl/db_impl_write.cc#L525-L528 In other cases, it just has its default false value. So if disableWAL is false, and avoid_flush_during_shutdown is false, close won't flush memtables. Stress test is also not flush wal/sync wal. There could be missing data, while reopen in stress test doesn't tolerate missing data. To make the test simpler, this changes it to always flush/sync wal during reopen. 
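Illustrative sketch, not part of this patch, of the reopen discipline the change adopts: when the WAL is in use, flush and sync it before closing so a subsequent open cannot observe lost writes, even if nothing forces a memtable flush at Close(). The helper name CloseForReopen and its manual_wal_flush parameter are assumptions for the example; they mirror the FLAGS_disable_wal and FLAGS_manual_wal_flush_one_in checks in the stress test.

#include "rocksdb/db.h"

rocksdb::Status CloseForReopen(rocksdb::DB* db, bool wal_disabled,
                               bool manual_wal_flush) {
  if (!wal_disabled) {
    // FlushWAL(true) covers the manual-WAL-flush case (buffered WAL writes
    // are pushed to the file and synced); otherwise SyncWAL() is enough.
    rocksdb::Status s =
        manual_wal_flush ? db->FlushWAL(/*sync=*/true) : db->SyncWAL();
    if (!s.ok()) {
      return s;
    }
  }
  return db->Close();
}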
Pull Request resolved: https://github.com/facebook/rocksdb/pull/13039 Reviewed By: hx235 Differential Revision: D63494695 Pulled By: jowlyzhang fbshipit-source-id: 8f0fd9ed50a482a3955abc0882257ecc2e95926d --- db_stress_tool/db_stress_test_base.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/db_stress_tool/db_stress_test_base.cc b/db_stress_tool/db_stress_test_base.cc index 5a4c310c4..2b6db414f 100644 --- a/db_stress_tool/db_stress_test_base.cc +++ b/db_stress_tool/db_stress_test_base.cc @@ -3674,7 +3674,7 @@ void StressTest::Reopen(ThreadState* thread) { // crash-recovery verification does. Therefore it always expects no data loss // and we should ensure no data loss in testing. // TODO(hx235): eliminate the FlushWAL(true /* sync */)/SyncWAL() below - if (!FLAGS_disable_wal && FLAGS_avoid_flush_during_shutdown) { + if (!FLAGS_disable_wal) { Status s; if (FLAGS_manual_wal_flush_one_in > 0) { s = db_->FlushWAL(/*sync=*/true); From 389e66bef56b4f81d3c9683469acb5affc32bd7f Mon Sep 17 00:00:00 2001 From: Changyu Bi Date: Mon, 30 Sep 2024 10:27:45 -0700 Subject: [PATCH 17/17] Add comment for memory usage in BeginTransaction() and WriteBatch::Clear() (#13042) Summary: ... to note that memory may not be freed when reusing a transaction. This means reusing a large transaction can cause excessive memory usage and it may be better to destruct the transaction object in some cases. Pull Request resolved: https://github.com/facebook/rocksdb/pull/13042 Test Plan: no code change. Reviewed By: jowlyzhang Differential Revision: D63570612 Pulled By: cbi42 fbshipit-source-id: f19ff556f76d54831fb94715e8808035d07e25fa --- include/rocksdb/utilities/transaction_db.h | 5 ++++- include/rocksdb/write_batch.h | 2 ++ 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/include/rocksdb/utilities/transaction_db.h b/include/rocksdb/utilities/transaction_db.h index 702266640..78ff0816f 100644 --- a/include/rocksdb/utilities/transaction_db.h +++ b/include/rocksdb/utilities/transaction_db.h @@ -460,7 +460,10 @@ class TransactionDB : public StackableDB { // // If old_txn is not null, BeginTransaction will reuse this Transaction // handle instead of allocating a new one. This is an optimization to avoid - // extra allocations when repeatedly creating transactions. + // extra allocations when repeatedly creating transactions. **Note that this + // may not free all the allocated memory by the previous transaction (see + // WriteBatch::Clear()). To ensure that all allocated memory is freed, users + // must destruct the transaction object. virtual Transaction* BeginTransaction( const WriteOptions& write_options, const TransactionOptions& txn_options = TransactionOptions(), diff --git a/include/rocksdb/write_batch.h b/include/rocksdb/write_batch.h index df7048af3..7cdca233a 100644 --- a/include/rocksdb/write_batch.h +++ b/include/rocksdb/write_batch.h @@ -209,6 +209,8 @@ class WriteBatch : public WriteBatchBase { using WriteBatchBase::Clear; // Clear all updates buffered in this batch. + // Internally, it calls resize() on the string buffer. So allocated memory + // capacity may not be freed. void Clear() override; // Records the state of the batch for future calls to RollbackToSavePoint().
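Illustrative usage sketch for the comment added above, not part of this patch: reusing a Transaction handle avoids an allocation, but because WriteBatch::Clear() keeps the underlying buffer's capacity, a handle that once carried a very large batch can pin that memory. Destroying the handle after an oversized transaction releases it. The helper name CommitAndMaybeRecycle and the 1 MB threshold are arbitrary choices for the example, not RocksDB guidance.

#include "rocksdb/utilities/transaction.h"
#include "rocksdb/utilities/transaction_db.h"
#include "rocksdb/utilities/write_batch_with_index.h"

// Commits *txn_slot and replaces it with a fresh transaction handle, either
// by recycling the old handle (cheap, but keeps its buffer capacity) or by
// deleting it first (frees the memory of an unusually large batch).
void CommitAndMaybeRecycle(rocksdb::TransactionDB* txn_db,
                           rocksdb::Transaction** txn_slot,
                           const rocksdb::WriteOptions& write_opts) {
  rocksdb::Transaction* txn = *txn_slot;
  size_t batch_bytes = txn->GetWriteBatch()->GetWriteBatch()->GetDataSize();
  rocksdb::Status s = txn->Commit();
  if (!s.ok()) {
    // A real caller would propagate the error; keep the handle as-is here.
    return;
  }
  if (batch_bytes > (1u << 20)) {
    // Large batch: drop the handle so its buffer is actually freed.
    delete txn;
    *txn_slot = txn_db->BeginTransaction(write_opts);
  } else {
    // Typical batch: reuse the handle to avoid reallocating it.
    *txn_slot = txn_db->BeginTransaction(write_opts,
                                         rocksdb::TransactionOptions(), txn);
  }
}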