From 9286ce2b5be1b8c5052475c36fcc4421df5857e4 Mon Sep 17 00:00:00 2001 From: ArneMayer Date: Fri, 29 Jun 2018 20:03:11 +0200 Subject: [PATCH] B-Tree Index (#929) * add btree index * Btree index now implements BaseIndex * add cpp-btree submodule * btree include * btree include * started refactoring * btree refactoring * remove trailing whitespace * fixed linter issues * add cpp-btree subdirectory * clang disable werror test * another test * disable btree warnings for clang * fix gcc * Incorporated PR comments * Added a test for BTreeIndex * lint * Added BTreeIndex to IndexScanTest and IndexJoinTest --- .gitmodules | 3 + DEPENDENCIES.md | 1 + src/CMakeLists.txt | 1 + src/lib/CMakeLists.txt | 4 + .../adaptive_radix_tree_index.cpp | 4 +- src/lib/storage/index/b_tree/b_tree_index.cpp | 36 +++++++ src/lib/storage/index/b_tree/b_tree_index.hpp | 35 +++++++ .../index/b_tree/b_tree_index_impl.cpp | 96 +++++++++++++++++++ .../index/b_tree/b_tree_index_impl.hpp | 75 +++++++++++++++ src/lib/storage/index/column_index_type.hpp | 6 +- .../group_key/composite_group_key_index.cpp | 6 +- .../index/group_key/group_key_index.cpp | 4 +- src/test/CMakeLists.txt | 1 + src/test/operators/index_scan_test.cpp | 6 +- src/test/operators/join_index_test.cpp | 4 +- src/test/storage/btree_index_test.cpp | 68 +++++++++++++ third_party/cpp-btree | 1 + 17 files changed, 339 insertions(+), 12 deletions(-) create mode 100644 src/lib/storage/index/b_tree/b_tree_index.cpp create mode 100644 src/lib/storage/index/b_tree/b_tree_index.hpp create mode 100644 src/lib/storage/index/b_tree/b_tree_index_impl.cpp create mode 100644 src/lib/storage/index/b_tree/b_tree_index_impl.hpp create mode 100644 src/test/storage/btree_index_test.cpp create mode 160000 third_party/cpp-btree diff --git a/.gitmodules b/.gitmodules index fc925c4c34..ddbb04a3a0 100644 --- a/.gitmodules +++ b/.gitmodules @@ -16,3 +16,6 @@ [submodule "third_party/libpqxx"] path = third_party/libpqxx url = https://github.com/jtv/libpqxx.git +[submodule "third_party/cpp-btree"] + path = third_party/cpp-btree + url = https://github.com/algorithm-ninja/cpp-btree diff --git a/DEPENDENCIES.md b/DEPENDENCIES.md index 98ac100ea9..585eda82f1 100644 --- a/DEPENDENCIES.md +++ b/DEPENDENCIES.md @@ -33,3 +33,4 @@ - libpqxx (https://github.com/jtv/libpqxx) - sql-parser (https://github.com/hyrise/sql-parser) - pgasus (https://github.com/kateyy/pgasus) +- cpp-btree (https://github.com/algorithm-ninja/cpp-btree) diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 4b337e4d3e..333209d1d9 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -68,6 +68,7 @@ include_directories( ${PROJECT_SOURCE_DIR}/third_party/cxxopts/include ${PROJECT_SOURCE_DIR}/third_party/json ${PROJECT_SOURCE_DIR}/third_party/sql-parser/src + ${PROJECT_SOURCE_DIR}/third_party/cpp-btree ${PROJECT_SOURCE_DIR}/src/benchmarklib/ ${PROJECT_SOURCE_DIR}/src/lib/ diff --git a/src/lib/CMakeLists.txt b/src/lib/CMakeLists.txt index 4e4958a84b..5055f7530e 100644 --- a/src/lib/CMakeLists.txt +++ b/src/lib/CMakeLists.txt @@ -326,6 +326,10 @@ set( storage/index/adaptive_radix_tree/adaptive_radix_tree_index.hpp storage/index/adaptive_radix_tree/adaptive_radix_tree_nodes.cpp storage/index/adaptive_radix_tree/adaptive_radix_tree_nodes.hpp + storage/index/b_tree/b_tree_index.cpp + storage/index/b_tree/b_tree_index.hpp + storage/index/b_tree/b_tree_index_impl.cpp + storage/index/b_tree/b_tree_index_impl.hpp storage/index/base_index.cpp storage/index/base_index.hpp storage/index/column_index_type.hpp diff --git a/src/lib/storage/index/adaptive_radix_tree/adaptive_radix_tree_index.cpp b/src/lib/storage/index/adaptive_radix_tree/adaptive_radix_tree_index.cpp index 540d638e63..ca6c9dc639 100644 --- a/src/lib/storage/index/adaptive_radix_tree/adaptive_radix_tree_index.cpp +++ b/src/lib/storage/index/adaptive_radix_tree/adaptive_radix_tree_index.cpp @@ -19,8 +19,8 @@ namespace opossum { AdaptiveRadixTreeIndex::AdaptiveRadixTreeIndex(const std::vector>& index_columns) : BaseIndex{get_index_type_of()}, _index_column(std::dynamic_pointer_cast(index_columns.front())) { - DebugAssert(static_cast(_index_column), "AdaptiveRadixTree only works with dictionary columns for now"); - DebugAssert((index_columns.size() == 1), "AdaptiveRadixTree only works with a single column"); + Assert(static_cast(_index_column), "AdaptiveRadixTree only works with dictionary columns for now"); + Assert((index_columns.size() == 1), "AdaptiveRadixTree only works with a single column"); // For each value ID in the attribute vector, create a pair consisting of a BinaryComparable of // this value ID and its ChunkOffset (needed for bulk-inserting). diff --git a/src/lib/storage/index/b_tree/b_tree_index.cpp b/src/lib/storage/index/b_tree/b_tree_index.cpp new file mode 100644 index 0000000000..ddb7ccb151 --- /dev/null +++ b/src/lib/storage/index/b_tree/b_tree_index.cpp @@ -0,0 +1,36 @@ +#include "b_tree_index.hpp" + +#include "storage/index/column_index_type.hpp" +#include "resolve_type.hpp" + +namespace opossum { + + BTreeIndex::BTreeIndex(const std::vector> index_columns) + : BaseIndex{get_index_type_of()}, _index_column(index_columns[0]) { + Assert((index_columns.size() == 1), "BTreeIndex only works with a single column."); + _impl = make_shared_by_data_type(_index_column->data_type(), _index_column); + } + + uint64_t BTreeIndex::memory_consumption() const { + return _impl->memory_consumption(); + } + + BTreeIndex::Iterator BTreeIndex::_lower_bound(const std::vector& values) const { + return _impl->lower_bound(values); + } + + BTreeIndex::Iterator BTreeIndex::_upper_bound(const std::vector& values) const { + return _impl->upper_bound(values); + } + + BTreeIndex::Iterator BTreeIndex::_cbegin() const { + return _impl->cbegin(); + } + + BTreeIndex::Iterator BTreeIndex::_cend() const { + return _impl->cend(); + } + + std::vector> BTreeIndex::_get_index_columns() const { return {_index_column}; } + +} // namespace opossum diff --git a/src/lib/storage/index/b_tree/b_tree_index.hpp b/src/lib/storage/index/b_tree/b_tree_index.hpp new file mode 100644 index 0000000000..b1e9c3f0a3 --- /dev/null +++ b/src/lib/storage/index/b_tree/b_tree_index.hpp @@ -0,0 +1,35 @@ +#pragma once + +#include "types.hpp" +#include "all_type_variant.hpp" +#include "storage/index/base_index.hpp" +#include "storage/base_column.hpp" +#include "b_tree_index_impl.hpp" + +namespace opossum { + +class BTreeIndexTest; + +class BTreeIndex : public BaseIndex { + friend BTreeIndexTest; + + public: + using Iterator = std::vector::const_iterator; + + BTreeIndex() = delete; + explicit BTreeIndex(const std::vector> index_columns); + + virtual uint64_t memory_consumption() const; + + protected: + Iterator _lower_bound(const std::vector&) const override; + Iterator _upper_bound(const std::vector&) const override; + Iterator _cbegin() const override; + Iterator _cend() const override; + std::vector> _get_index_columns() const override; + + std::shared_ptr _index_column; + std::shared_ptr _impl; +}; + +} // namespace opossum diff --git a/src/lib/storage/index/b_tree/b_tree_index_impl.cpp b/src/lib/storage/index/b_tree/b_tree_index_impl.cpp new file mode 100644 index 0000000000..bb6d42131d --- /dev/null +++ b/src/lib/storage/index/b_tree/b_tree_index_impl.cpp @@ -0,0 +1,96 @@ +#include "b_tree_index_impl.hpp" + +#include "storage/index/base_index.hpp" +#include "types.hpp" +#include "resolve_type.hpp" +#include "utils/assert.hpp" +#include "storage/create_iterable_from_column.hpp" + +namespace opossum { + +template +BTreeIndexImpl::BTreeIndexImpl(std::shared_ptr index_column) { + _bulk_insert(index_column); +} + +template +BaseBTreeIndexImpl::Iterator BTreeIndexImpl::lower_bound(const std::vector& values) const { + return lower_bound(type_cast(values[0])); +} + +template +BaseBTreeIndexImpl::Iterator BTreeIndexImpl::upper_bound(const std::vector& values) const { + return upper_bound(type_cast(values[0])); +} + +template +BaseBTreeIndexImpl::Iterator BTreeIndexImpl::cbegin() const { + return _chunk_offsets.begin(); +} + +template +BaseBTreeIndexImpl::Iterator BTreeIndexImpl::cend() const { + return _chunk_offsets.end(); +} + +template +BaseBTreeIndexImpl::Iterator BTreeIndexImpl::lower_bound(DataType value) const { + auto result = _btree.lower_bound(value); + if (result == _btree.end()) { + return _chunk_offsets.end(); + } else { + return _chunk_offsets.begin() + result->second; + } +} + +template +BaseBTreeIndexImpl::Iterator BTreeIndexImpl::upper_bound(DataType value) const { + auto result = _btree.upper_bound(value); + if (result == _btree.end()) { + return _chunk_offsets.end(); + } else { + return _chunk_offsets.begin() + result->second; + } +} + +template +uint64_t BTreeIndexImpl::memory_consumption() const { + return sizeof(std::vector) + + sizeof(ChunkOffset) * _chunk_offsets.size() + + _btree.bytes_used(); +} + +template +void BTreeIndexImpl::_bulk_insert(const std::shared_ptr column) { + std::vector> values; + + // Materialize + resolve_column_type(*column, [&](const auto& typed_column) { + auto iterable_left = create_iterable_from_column(typed_column); + iterable_left.for_each([&](const auto& value) { + if (value.is_null()) return; + values.push_back(std::make_pair(value.chunk_offset(), value.value())); + }); + }); + + // Sort + std::sort(values.begin(), values.end(), [](const auto& a, const auto& b){ return a.second < b.second; }); + _chunk_offsets.resize(values.size()); + for (size_t i = 0; i < values.size(); i++) { + _chunk_offsets[i] = values[i].first; + } + + // Build index + DataType current_value = values[0].second; + _btree[current_value] = 0; + for (size_t i = 0; i < values.size(); i++) { + if (values[i].second != current_value) { + current_value = values[i].second; + _btree[current_value] = i; + } + } +} + +EXPLICITLY_INSTANTIATE_DATA_TYPES(BTreeIndexImpl); + +} // namespace opossum diff --git a/src/lib/storage/index/b_tree/b_tree_index_impl.hpp b/src/lib/storage/index/b_tree/b_tree_index_impl.hpp new file mode 100644 index 0000000000..aca9d5022f --- /dev/null +++ b/src/lib/storage/index/b_tree/b_tree_index_impl.hpp @@ -0,0 +1,75 @@ +#pragma once + +#ifdef __clang__ + #pragma clang diagnostic ignored "-Wall" + #include + #pragma clang diagnostic pop +#elif __GNUC__ + #pragma GCC system_header + #include +#endif + + +#include "types.hpp" +#include "all_type_variant.hpp" +#include "storage/base_column.hpp" + +namespace opossum { + +class BTreeIndexTest; + +class BaseBTreeIndexImpl { + friend BTreeIndexTest; + + public: + BaseBTreeIndexImpl() = default; + BaseBTreeIndexImpl(BaseBTreeIndexImpl&&) = default; + BaseBTreeIndexImpl& operator=(BaseBTreeIndexImpl&&) = default; + virtual ~BaseBTreeIndexImpl() = default; + + using Iterator = std::vector::const_iterator; + virtual uint64_t memory_consumption() const = 0; + virtual Iterator lower_bound(const std::vector&) const = 0; + virtual Iterator upper_bound(const std::vector&) const = 0; + virtual Iterator cbegin() const = 0; + virtual Iterator cend() const = 0; + + protected: + std::vector _chunk_offsets; +}; + +/** +* Implementation: https://code.google.com/archive/p/cpp-btree/ +* Note: does not support null values right now. +*/ +template +class BTreeIndexImpl : public BaseBTreeIndexImpl { + friend BTreeIndexTest; + + public: + BTreeIndexImpl() = delete; + explicit BTreeIndexImpl(std::shared_ptr index_column); + + BTreeIndexImpl(const BTreeIndexImpl&) = delete; + BTreeIndexImpl& operator=(const BTreeIndexImpl&) = delete; + + BTreeIndexImpl(BTreeIndexImpl&&) = default; + BTreeIndexImpl& operator=(BTreeIndexImpl&&) = default; + + uint64_t memory_consumption() const override; + + Iterator lower_bound(DataType value) const; + Iterator upper_bound(DataType value) const; + + Iterator lower_bound(const std::vector&) const override; + Iterator upper_bound(const std::vector&) const override; + Iterator cbegin() const override; + Iterator cend() const override; + + protected: + void _bulk_insert(const std::shared_ptr); + + btree::btree_map _btree; +}; + +} // namespace opossum diff --git a/src/lib/storage/index/column_index_type.hpp b/src/lib/storage/index/column_index_type.hpp index 142376bc00..89962e0845 100644 --- a/src/lib/storage/index/column_index_type.hpp +++ b/src/lib/storage/index/column_index_type.hpp @@ -10,18 +10,20 @@ namespace opossum { namespace hana = boost::hana; -enum class ColumnIndexType : uint8_t { Invalid, GroupKey, CompositeGroupKey, AdaptiveRadixTree }; +enum class ColumnIndexType : uint8_t { Invalid, GroupKey, CompositeGroupKey, AdaptiveRadixTree, BTree }; class GroupKeyIndex; class CompositeGroupKeyIndex; class AdaptiveRadixTreeIndex; +class BTreeIndex; namespace detail { constexpr auto column_index_map = hana::make_map(hana::make_pair(hana::type_c, ColumnIndexType::GroupKey), hana::make_pair(hana::type_c, ColumnIndexType::CompositeGroupKey), - hana::make_pair(hana::type_c, ColumnIndexType::AdaptiveRadixTree)); + hana::make_pair(hana::type_c, ColumnIndexType::AdaptiveRadixTree), + hana::make_pair(hana::type_c, ColumnIndexType::BTree)); } // namespace detail diff --git a/src/lib/storage/index/group_key/composite_group_key_index.cpp b/src/lib/storage/index/group_key/composite_group_key_index.cpp index 87b939ecb2..e927c14b29 100644 --- a/src/lib/storage/index/group_key/composite_group_key_index.cpp +++ b/src/lib/storage/index/group_key/composite_group_key_index.cpp @@ -21,7 +21,7 @@ namespace opossum { CompositeGroupKeyIndex::CompositeGroupKeyIndex(const std::vector>& indexed_columns) : BaseIndex{get_index_type_of()} { - DebugAssert(!indexed_columns.empty(), "CompositeGroupKeyIndex requires at least one column to be indexed."); + Assert(!indexed_columns.empty(), "CompositeGroupKeyIndex requires at least one column to be indexed."); if (IS_DEBUG) { auto first_size = indexed_columns.front()->size(); @@ -37,8 +37,8 @@ CompositeGroupKeyIndex::CompositeGroupKeyIndex(const std::vector(column); - DebugAssert(static_cast(dict_column), "CompositeGroupKeyIndex only works with dictionary columns."); - DebugAssert(is_fixed_size_byte_aligned(dict_column->compressed_vector_type()), + Assert(static_cast(dict_column), "CompositeGroupKeyIndex only works with dictionary columns."); + Assert(is_fixed_size_byte_aligned(dict_column->compressed_vector_type()), "CompositeGroupKeyIndex only works with fixed-size byte-aligned compressed attribute vectors."); _indexed_columns.emplace_back(dict_column); } diff --git a/src/lib/storage/index/group_key/group_key_index.cpp b/src/lib/storage/index/group_key/group_key_index.cpp index dd7c05d80e..dd0464a4f5 100644 --- a/src/lib/storage/index/group_key/group_key_index.cpp +++ b/src/lib/storage/index/group_key/group_key_index.cpp @@ -11,8 +11,8 @@ namespace opossum { GroupKeyIndex::GroupKeyIndex(const std::vector> index_columns) : BaseIndex{get_index_type_of()}, _index_column(std::dynamic_pointer_cast(index_columns[0])) { - DebugAssert(static_cast(_index_column), "GroupKeyIndex only works with dictionary columns."); - DebugAssert((index_columns.size() == 1), "GroupKeyIndex only works with a single column."); + Assert(static_cast(_index_column), "GroupKeyIndex only works with dictionary columns."); + Assert((index_columns.size() == 1), "GroupKeyIndex only works with a single column."); // 1) Initialize the index structures // 1a) Set the index_offset to size of the dictionary + 1 (plus one to mark the ending position) diff --git a/src/test/CMakeLists.txt b/src/test/CMakeLists.txt index 5141e365fd..3cb538490b 100644 --- a/src/test/CMakeLists.txt +++ b/src/test/CMakeLists.txt @@ -120,6 +120,7 @@ set( storage/encoding_test.hpp storage/encoded_column_test.cpp storage/group_key_index_test.cpp + storage/btree_index_test.cpp storage/iterables_test.cpp storage/materialize_test.cpp storage/multi_column_index_test.cpp diff --git a/src/test/operators/index_scan_test.cpp b/src/test/operators/index_scan_test.cpp index 8752724829..7b28b40a88 100644 --- a/src/test/operators/index_scan_test.cpp +++ b/src/test/operators/index_scan_test.cpp @@ -13,6 +13,7 @@ #include "storage/index/adaptive_radix_tree/adaptive_radix_tree_index.hpp" #include "storage/index/group_key/composite_group_key_index.hpp" #include "storage/index/group_key/group_key_index.hpp" +#include "storage/index/b_tree/b_tree_index.hpp" #include "storage/table.hpp" #include "types.hpp" @@ -87,8 +88,9 @@ class OperatorsIndexScanTest : public BaseTest { ColumnIndexType _index_type; }; -typedef ::testing::Types - DerivedIndices; +typedef ::testing::Types DerivedIndices; + TYPED_TEST_CASE(OperatorsIndexScanTest, DerivedIndices); TYPED_TEST(OperatorsIndexScanTest, SingleColumnScanOnDataTable) { diff --git a/src/test/operators/join_index_test.cpp b/src/test/operators/join_index_test.cpp index 79438277f0..96ec75e8d8 100644 --- a/src/test/operators/join_index_test.cpp +++ b/src/test/operators/join_index_test.cpp @@ -15,6 +15,7 @@ #include "storage/index/adaptive_radix_tree/adaptive_radix_tree_index.hpp" #include "storage/index/group_key/composite_group_key_index.hpp" #include "storage/index/group_key/group_key_index.hpp" +#include "storage/index/b_tree/b_tree_index.hpp" #include "storage/table.hpp" #include "types.hpp" @@ -100,7 +101,8 @@ class JoinIndexTest : public BaseTest { _table_wrapper_k, _table_wrapper_l, _table_wrapper_m, _table_wrapper_n; }; -typedef ::testing::Types DerivedIndices; +typedef ::testing::Types DerivedIndices; TYPED_TEST_CASE(JoinIndexTest, DerivedIndices); diff --git a/src/test/storage/btree_index_test.cpp b/src/test/storage/btree_index_test.cpp new file mode 100644 index 0000000000..023d48dda8 --- /dev/null +++ b/src/test/storage/btree_index_test.cpp @@ -0,0 +1,68 @@ +#include +#include +#include +#include +#include +#include + +#include "../base_test.hpp" +#include "gtest/gtest.h" + +#include "../lib/storage/base_column.hpp" +#include "../lib/storage/chunk.hpp" +#include "../lib/storage/index/b_tree/b_tree_index.hpp" +#include "../lib/types.hpp" + +namespace opossum { + +class BTreeIndexTest : public BaseTest { + protected: + void SetUp() override { + values = {"hotel", "delta", "frank", "delta", "apple", "charlie", "charlie", "inbox"}; + column = std::make_shared>(values); + sorted = {"apple", "charlie", "charlie", "delta", "delta", "frank", "hotel", "inbox"}; + index = std::make_shared(std::vector>({column})); + + chunk_offsets = &(index->_impl->_chunk_offsets); + } + + std::vector values; + std::vector sorted; + std::shared_ptr index = nullptr; + std::shared_ptr> column = nullptr; + + /** + * Use pointers to inner data structures of BTreeIndex in order to bypass the + * private scope. Since the variable is set in setup() references are not possible. + */ + std::vector* chunk_offsets; +}; + +TEST_F(BTreeIndexTest, ChunkOffsets) { + for (size_t i = 0; i < values.size(); i++) { + EXPECT_EQ(values[chunk_offsets->at(i)], sorted[i]); + } +} + +TEST_F(BTreeIndexTest, IndexProbes) { + auto begin = index->cbegin(); + EXPECT_EQ(index->lower_bound({"apple"}) - begin, 0); + EXPECT_EQ(index->upper_bound({"apple"}) - begin, 1); + + EXPECT_EQ(index->lower_bound({"charlie"}) - begin, 1); + EXPECT_EQ(index->upper_bound({"charlie"}) - begin, 3); + + EXPECT_EQ(index->lower_bound({"delta"}) - begin, 3); + EXPECT_EQ(index->upper_bound({"delta"}) - begin, 5); + + EXPECT_EQ(index->lower_bound({"frank"}) - begin, 5); + EXPECT_EQ(index->upper_bound({"frank"}) - begin, 6); + + EXPECT_EQ(index->lower_bound({"hotel"}) - begin, 6); + EXPECT_EQ(index->upper_bound({"hotel"}) - begin, 7); + + EXPECT_EQ(index->lower_bound({"inbox"}) - begin, 7); + EXPECT_EQ(index->upper_bound({"inbox"}) - begin, 8); +} + +} // namespace opossum diff --git a/third_party/cpp-btree b/third_party/cpp-btree new file mode 160000 index 0000000000..92ec61e4b8 --- /dev/null +++ b/third_party/cpp-btree @@ -0,0 +1 @@ +Subproject commit 92ec61e4b8bf182c5c49ebf6540dac62d569d090