Skip to content

Commit

Permalink
Counting Quotient Filter (hyrise#1075)
Browse files Browse the repository at this point in the history
* Add CQF submodule

* start adapting new cqf version

* more cqf integration

* removed original cqf module

* new cqf module

* started cqf test

* cqf test

* integrating cqf wrapper

* delete old cqf clone

* Reset to older cqf version

* lint

* fix dependencies

* move cqf to filters

* add test for can_prune and fix can_prune

* lint

* Test more CQF configurations

* remainder size as enum

* added int cqf tests

* Now using boost::variant for cqf

* lint

* memory_consumptionn -> memory_consumption

* update cqf

* resolve merge issues

* Another misaligned bytes problem fix

* clang tidy fixes

* forgot to rename variables

* copy and move constructors

* new cqf false positive rate tests

* integrating pr comments

* remove second insert method

* update submodules to master

* moar

* adjust to hyrise coding standards

* should be clean now

* higher allowed fpr

* add comment

* moar tests

* lint
  • Loading branch information
ArneMayer authored and Moritz Eyssen committed Oct 21, 2018
1 parent 1f673f4 commit 60f0424
Show file tree
Hide file tree
Showing 10 changed files with 388 additions and 0 deletions.
3 changes: 3 additions & 0 deletions .gitmodules
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,9 @@
[submodule "third_party/cpp-btree"]
path = third_party/cpp-btree
url = https://github.com/algorithm-ninja/cpp-btree
[submodule "third_party/cqf"]
path = third_party/cqf
url = https://github.com/ArneMayer/cqf.git
[submodule "flat_hash_map"]
path = third_party/flat_hash_map
url = https://github.com/skarupke/flat_hash_map.git
1 change: 1 addition & 0 deletions DEPENDENCIES.md
Original file line number Diff line number Diff line change
Expand Up @@ -36,3 +36,4 @@
- sql-parser (https://github.com/hyrise/sql-parser)
- pgasus (https://github.com/kateyy/pgasus)
- cpp-btree (https://github.com/algorithm-ninja/cpp-btree)
- cqf (https://github.com/ArneMayer/cqf)
1 change: 1 addition & 0 deletions src/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -71,6 +71,7 @@ include_directories(
${PROJECT_SOURCE_DIR}/third_party/json
${PROJECT_SOURCE_DIR}/third_party/sql-parser/src
${PROJECT_SOURCE_DIR}/third_party/cpp-btree
${PROJECT_SOURCE_DIR}/third_party/cqf/include
${PROJECT_SOURCE_DIR}/third_party/flat_hash_map

${PROJECT_SOURCE_DIR}/src/benchmarklib/
Expand Down
3 changes: 3 additions & 0 deletions src/lib/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -367,6 +367,8 @@ set(
statistics/chunk_statistics/range_filter.hpp
statistics/chunk_statistics/segment_statistics.cpp
statistics/chunk_statistics/segment_statistics.hpp
statistics/chunk_statistics/counting_quotient_filter.hpp
statistics/chunk_statistics/counting_quotient_filter.cpp
statistics/column_statistics.cpp
statistics/column_statistics.cpp
statistics/generate_column_statistics.cpp
Expand Down Expand Up @@ -559,6 +561,7 @@ set(
LIBRARIES
pthread
sqlparser
cqf
uninitialized_vector
${Boost_CONTAINER_LIBRARY}
${TBB_LIBRARY}
Expand Down
122 changes: 122 additions & 0 deletions src/lib/statistics/chunk_statistics/counting_quotient_filter.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,122 @@
#include "counting_quotient_filter.hpp"

#include <cmath>
#include <iostream>
#include <string>

#include "resolve_type.hpp"
#include "storage/create_iterable_from_segment.hpp"
#include "storage/storage_manager.hpp"
#include "storage/table.hpp"
#include "types.hpp"

using namespace gqf2; // NOLINT
using namespace gqf4; // NOLINT
using namespace gqf8; // NOLINT
using namespace gqf16; // NOLINT
using namespace gqf32; // NOLINT

namespace opossum {

/**
 * Creates an empty filter and initializes the underlying quotient filter via qf_init.
 *
 * @param quotient_size   Determines the number of slots (2^quotient_size). Must be greater than zero.
 * @param remainder_size  Selects the quotient filter variant. Must be 2, 4, 8, 16 or 32.
 *
 * The sum of both sizes is the number of hash bits used by the filter and must not exceed 64.
 */
template <typename ElementType>
CountingQuotientFilter<ElementType>::CountingQuotientFilter(const size_t quotient_size, const size_t remainder_size)
    : _hash_bits(quotient_size + remainder_size) {
  Assert(quotient_size > 0, "Quotient size can not be zero.");
  Assert(_hash_bits <= 64u, "Hash length can not exceed 64 bits.");

  // Pick the variant alternative that matches the requested remainder size.
  if (remainder_size == 2) {
    _quotient_filter = gqf2::quotient_filter{};
  } else if (remainder_size == 4) {
    _quotient_filter = gqf4::quotient_filter{};
  } else if (remainder_size == 8) {
    _quotient_filter = gqf8::quotient_filter{};
  } else if (remainder_size == 16) {
    _quotient_filter = gqf16::quotient_filter{};
  } else if (remainder_size == 32) {
    _quotient_filter = gqf32::quotient_filter{};
  } else {
    Fail("Invalid remainder remainder_size");
  }

  // Compute the slot count with integer arithmetic instead of std::pow, which yields a double that would be
  // implicitly converted at the call below. The checks above limit quotient_size to at most 62 (hash bits <= 64 and
  // remainder size >= 2), so the shift cannot overflow.
  const auto number_of_slots = uint64_t{1} << quotient_size;
  boost::apply_visitor([&](auto& filter) { qf_init(&filter, number_of_slots, _hash_bits, 0); }, _quotient_filter);
}

// Hands whichever filter variant is currently held to qf_destroy.
template <typename ElementType>
CountingQuotientFilter<ElementType>::~CountingQuotientFilter() {
  const auto destroy_filter = [](auto& filter) { qf_destroy(&filter); };
  boost::apply_visitor(destroy_filter, _quotient_filter);
}

/**
 * Inserts `count` occurrences of `value` into the filter.
 *
 * Only the lowest _hash_bits bits of the value's hash are passed on to the underlying quotient filter.
 */
template <typename ElementType>
void CountingQuotientFilter<ElementType>::insert(ElementType value, size_t count) {
  // Build the mask with integer arithmetic. 2^64 does not fit into a 64-bit integer, so converting
  // std::pow(2, 64) to an integer is undefined; a shift by 64 would be undefined as well, hence the special case
  // for filters that use all 64 hash bits.
  const auto bitmask = _hash_bits >= 64 ? ~uint64_t{0} : (uint64_t{1} << _hash_bits) - 1;
  const auto hash = bitmask & _hash(value);

  // Resolve the variant once rather than once per inserted occurrence.
  boost::apply_visitor(
      [&](auto& filter) {
        for (size_t idx = 0; idx < count; ++idx) {
          qf_insert(&filter, hash, 0, 1);
        }
      },
      _quotient_filter);
}

// Variant overload: unwraps the AllTypeVariant into an ElementType and forwards to the typed count().
// The variant has to hold exactly ElementType; this is only verified through DebugAssert.
template <typename ElementType>
size_t CountingQuotientFilter<ElementType>::count(const AllTypeVariant& value) const {
  DebugAssert(value.type() == typeid(ElementType), "Value does not have the same type as the filter elements");

  const auto typed_value = type_cast<ElementType>(value);
  return count(typed_value);
}

/**
 * Decides whether a predicate can be ruled out based on the filter contents.
 *
 * @param predicate_type  Only PredicateCondition::Equals can be answered by this filter.
 * @param value           The value the predicate compares against.
 * @param variant_value2  Must be empty; two-value predicates are not supported.
 * @return true only for an equality predicate whose value has a count of zero in the filter, false otherwise.
 */
template <typename ElementType>
bool CountingQuotientFilter<ElementType>::can_prune(const PredicateCondition predicate_type,
                                                    const AllTypeVariant& value,
                                                    const std::optional<AllTypeVariant>& variant_value2) const {
  DebugAssert(predicate_type == PredicateCondition::Equals && !variant_value2, "CQF only supports equality predicates");

  // NOTE(review): DebugAssert is presumably compiled out of release builds. Without this guard, a non-equality
  // predicate (e.g. NotEquals) on a value that is absent from the filter would be reported as prunable even though
  // the predicate may match rows. Not pruning is always the safe answer.
  if (predicate_type != PredicateCondition::Equals || variant_value2) {
    return false;
  }

  return count(value) == 0;
}

/**
 * Returns the count the underlying quotient filter reports for `value`.
 *
 * Only the lowest _hash_bits bits of the value's hash are used for the lookup, mirroring insert().
 */
template <typename ElementType>
size_t CountingQuotientFilter<ElementType>::count(const ElementType& value) const {
  // Build the mask with integer arithmetic. 2^64 does not fit into a 64-bit integer, so converting
  // std::pow(2, 64) to an integer is undefined; a shift by 64 would be undefined as well, hence the special case
  // for filters that use all 64 hash bits.
  const auto bitmask = _hash_bits >= 64 ? ~uint64_t{0} : (uint64_t{1} << _hash_bits) - 1;
  const auto hash = bitmask & _hash(value);

  auto count = size_t{0};
  boost::apply_visitor([&](auto& filter) { count = qf_count_key_value(&filter, hash, 0); }, _quotient_filter);
  return count;
}

// Hashes a value with std::hash. The return type is uint64_t to match the declaration in the class definition;
// size_t and uint64_t are distinct types on some platforms, where a mismatching definition does not compile.
template <typename ElementType>
uint64_t CountingQuotientFilter<ElementType>::_hash(const ElementType& value) const {
  return static_cast<uint64_t>(std::hash<ElementType>{}(value));
}

// Inserts every non-NULL value of the given segment into the filter, once per occurrence.
template <typename ElementType>
void CountingQuotientFilter<ElementType>::populate(const std::shared_ptr<const BaseSegment>& segment) {
  resolve_segment_type<ElementType>(*segment, [&](const auto& typed_segment) {
    auto iterable = create_iterable_from_segment<ElementType>(typed_segment);
    iterable.for_each([&](const auto& position) {
      // NULLs are not tracked by the filter.
      if (!position.is_null()) {
        insert(position.value());
      }
    });
  });
}

// Returns whatever qf_memory_consumption reports for the currently held filter variant.
template <typename ElementType>
size_t CountingQuotientFilter<ElementType>::memory_consumption() const {
  auto reported_consumption = size_t{0};
  boost::apply_visitor(
      [&reported_consumption](auto& filter) { reported_consumption = qf_memory_consumption(filter); },
      _quotient_filter);
  return reported_consumption;
}

// Ratio of occupied slots to total slots of the currently held filter variant.
template <typename ElementType>
float CountingQuotientFilter<ElementType>::load_factor() const {
  auto occupied_ratio = 0.f;
  boost::apply_visitor(
      [&occupied_ratio](auto& filter) {
        const auto total_slots = static_cast<float>(filter.nslots);
        occupied_ratio = filter.noccupied_slots / total_slots;
      },
      _quotient_filter);
  return occupied_ratio;
}

// The filter is considered full once more than 99% of its slots are occupied.
template <typename ElementType>
bool CountingQuotientFilter<ElementType>::is_full() const {
  constexpr auto full_threshold = 0.99f;
  return load_factor() > full_threshold;
}

EXPLICITLY_INSTANTIATE_DATA_TYPES(CountingQuotientFilter);

} // namespace opossum
67 changes: 67 additions & 0 deletions src/lib/statistics/chunk_statistics/counting_quotient_filter.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,67 @@
#pragma once

#include <string>
#include <vector>

#include "cqf16.hpp"
#include "cqf2.hpp"
#include "cqf32.hpp"
#include "cqf4.hpp"
#include "cqf8.hpp"
#include "types.hpp"

#include "abstract_filter.hpp"
#include "storage/base_segment.hpp"

namespace opossum {

/* Counting Quotient Filters allow you to keep track of which values are present in a segment and how often. The
filter is approximate: if a membership query yields a positive result, the value is probably present, but there is
a chance of a false positive. If the query yields a negative result, the value is guaranteed not to be contained.
In the same way, values can be overcounted but never undercounted.
CQF can be configured with quotient size, which determines the number of slots, and the remainder size, which
corresponds to the slot size. At this time, the remainder size must be 2, 4, 8, 16 or 32.
"When you configure the CQF the number of slots in the CQF must be at least the number of the distinct elements in your
input dataset plus the sum of logs of all the counts divided by the remainder size. You must have some estimate of the
number distinct elements in your dataset to configure the CQF correctly. For example, if in a dataset there are `M`
integers and `N` distinct integers. And let's assume each integer appears M/N times. Then the number of slots `S` you
would need would be `S = N*(1 + log(M/N)/r)`. Since the number of slots can only be a power of two, we choose the
smallest number greater than `S` that is a power of 2 as the number of slots." - Prashant Pandey
*/

// Wraps one of the gqf2/4/8/16/32 quotient filter variants (selected by the remainder size) behind the
// AbstractFilter interface. ElementType is the type of the values that are inserted and queried.
template <typename ElementType>
class CountingQuotientFilter : public AbstractFilter, public Noncopyable {
 public:
  // quotient_size must be greater than zero, remainder_size must be 2, 4, 8, 16 or 32, and their sum (the number of
  // hash bits used) must not exceed 64.
  CountingQuotientFilter(const size_t quotient_size, const size_t remainder_size);
  ~CountingQuotientFilter() override;

  // Inserts `count` occurrences of `value`.
  void insert(ElementType value, size_t count = 1);

  // Inserts all non-NULL values of the segment.
  void populate(const std::shared_ptr<const BaseSegment>& segment);

  // Returns the count the filter reports for the value. The AllTypeVariant overload expects the variant to hold an
  // ElementType.
  size_t count(const ElementType& value) const;
  size_t count(const AllTypeVariant& value) const;

  // Memory consumption as reported by the underlying quotient filter.
  size_t memory_consumption() const;

  // Ratio of occupied slots to total slots.
  float load_factor() const;

  // True once the load factor exceeds 0.99.
  bool is_full() const;

  // Only equality predicates are supported; returns true if the filter reports a count of zero for `value`.
  bool can_prune(const PredicateCondition predicate_type, const AllTypeVariant& value,
                 const std::optional<AllTypeVariant>& variant_value2 = std::nullopt) const override;

  // Can't copy or move CountingQuotientFilter: the destructor calls qf_destroy on the wrapped filter, so a second
  // owner would presumably destroy the same filter twice. The deleted operations use the canonical signatures
  // (const reference parameter for copies, reference return type for assignments).
  CountingQuotientFilter(const CountingQuotientFilter&) = delete;
  CountingQuotientFilter(CountingQuotientFilter&&) = delete;
  CountingQuotientFilter& operator=(const CountingQuotientFilter&) = delete;
  CountingQuotientFilter& operator=(CountingQuotientFilter&&) = delete;

 private:
  uint64_t _hash(const ElementType& value) const;

  // Holds the filter variant matching the remainder size passed to the constructor.
  boost::variant<gqf2::QF, gqf4::QF, gqf8::QF, gqf16::QF, gqf32::QF> _quotient_filter;

  // Number of hash bits used by the filter: quotient size + remainder size.
  const size_t _hash_bits;
};

} // namespace opossum
1 change: 1 addition & 0 deletions src/test/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -134,6 +134,7 @@ set(
statistics/chunk_statistics/histograms/equal_width_histogram_test.cpp
statistics/chunk_statistics/histograms/histogram_utils_test.cpp
statistics/chunk_statistics/min_max_filter_test.cpp
statistics/chunk_statistics/counting_quotient_filter_test.cpp
statistics/chunk_statistics/range_filter_test.cpp
statistics/column_statistics_test.cpp
statistics/generate_table_statistics_test.cpp
Expand Down
Loading

0 comments on commit 60f0424

Please sign in to comment.