forked from hyrise/hyrise
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Counting Quotient Filter (hyrise#1075)
* Add CQF submodule * start adapting new cqf version * more cqf integration * removed original cqf module * new cqf module * started cqf test * cqf test * integrating cqf wrapper * delete old cqf clone * Reset to older cqf version * lint * fix dependencies * move cqf to filters * add test for can_prune and fix can_prune * lint * Test more CQF configurations * remainder size as enum * added int cqf tests * Now using boost::variant for cqf * lint * memory_consumptionn -> memory_consumption * update cqf * resolve merge issues * Another misaligned bytes problem fix * clang tidy fixes * forgot to rename variables * copy and move constructors * new cqf false positive rate tests * integrating pr comments * remove second insert method * update submodules to master * moar * adjust to hyrise coding standards * should be clean now * higher allowed fpr * add comment * moar tests * lint
- Loading branch information
ArneMayer
authored and
Moritz Eyssen
committed
Oct 21, 2018
1 parent
1f673f4
commit 60f0424
Showing
10 changed files
with
388 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
122 changes: 122 additions & 0 deletions
122
src/lib/statistics/chunk_statistics/counting_quotient_filter.cpp
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,122 @@ | ||
#include "counting_quotient_filter.hpp" | ||
|
||
#include <cmath> | ||
#include <iostream> | ||
#include <string> | ||
|
||
#include "resolve_type.hpp" | ||
#include "storage/create_iterable_from_segment.hpp" | ||
#include "storage/storage_manager.hpp" | ||
#include "storage/table.hpp" | ||
#include "types.hpp" | ||
|
||
using namespace gqf2; // NOLINT | ||
using namespace gqf4; // NOLINT | ||
using namespace gqf8; // NOLINT | ||
using namespace gqf16; // NOLINT | ||
using namespace gqf32; // NOLINT | ||
|
||
namespace opossum { | ||
|
||
template <typename ElementType> | ||
CountingQuotientFilter<ElementType>::CountingQuotientFilter(const size_t quotient_size, const size_t remainder_size) | ||
: _hash_bits(quotient_size + remainder_size) { | ||
Assert(quotient_size > 0, "Quotient size can not be zero."); | ||
Assert(_hash_bits <= 64u, "Hash length can not exceed 64 bits."); | ||
|
||
if (remainder_size == 2) { | ||
_quotient_filter = gqf2::quotient_filter{}; | ||
} else if (remainder_size == 4) { | ||
_quotient_filter = gqf4::quotient_filter{}; | ||
} else if (remainder_size == 8) { | ||
_quotient_filter = gqf8::quotient_filter{}; | ||
} else if (remainder_size == 16) { | ||
_quotient_filter = gqf16::quotient_filter{}; | ||
} else if (remainder_size == 32) { | ||
_quotient_filter = gqf32::quotient_filter{}; | ||
} else { | ||
Fail("Invalid remainder remainder_size"); | ||
} | ||
|
||
const auto number_of_slots = std::pow(2, quotient_size); | ||
boost::apply_visitor([&](auto& filter) { qf_init(&filter, number_of_slots, _hash_bits, 0); }, _quotient_filter); | ||
} | ||
|
||
template <typename ElementType> | ||
CountingQuotientFilter<ElementType>::~CountingQuotientFilter() { | ||
boost::apply_visitor([&](auto& filter) { qf_destroy(&filter); }, _quotient_filter); | ||
} | ||
|
||
template <typename ElementType> | ||
void CountingQuotientFilter<ElementType>::insert(ElementType value, size_t count) { | ||
const auto bitmask = static_cast<size_t>(std::pow(2, _hash_bits)) - 1; | ||
const auto hash = bitmask & _hash(value); | ||
for (size_t idx = 0; idx < count; ++idx) { | ||
boost::apply_visitor([&](auto& filter) { qf_insert(&filter, hash, 0, 1); }, _quotient_filter); | ||
} | ||
} | ||
|
||
template <typename ElementType> | ||
size_t CountingQuotientFilter<ElementType>::count(const AllTypeVariant& value) const { | ||
DebugAssert(value.type() == typeid(ElementType), "Value does not have the same type as the filter elements"); | ||
return count(type_cast<ElementType>(value)); | ||
} | ||
|
||
template <typename ElementType> | ||
bool CountingQuotientFilter<ElementType>::can_prune(const PredicateCondition predicate_type, | ||
const AllTypeVariant& value, | ||
const std::optional<AllTypeVariant>& variant_value2) const { | ||
DebugAssert(predicate_type == PredicateCondition::Equals && !variant_value2, "CQF only supports equality predicates"); | ||
return count(value) == 0; | ||
} | ||
|
||
template <typename ElementType> | ||
size_t CountingQuotientFilter<ElementType>::count(const ElementType& value) const { | ||
const auto bitmask = static_cast<uint64_t>(std::pow(2, _hash_bits)) - 1; | ||
const auto hash = bitmask & _hash(value); | ||
|
||
auto count = size_t{0}; | ||
boost::apply_visitor([&](auto& filter) { count = qf_count_key_value(&filter, hash, 0); }, _quotient_filter); | ||
return count; | ||
} | ||
|
||
template <typename ElementType> | ||
size_t CountingQuotientFilter<ElementType>::_hash(const ElementType& value) const { | ||
auto hash = std::hash<ElementType>{}(value); | ||
return static_cast<size_t>(hash); | ||
} | ||
|
||
template <typename ElementType> | ||
void CountingQuotientFilter<ElementType>::populate(const std::shared_ptr<const BaseSegment>& segment) { | ||
resolve_segment_type<ElementType>(*segment, [&](const auto& typed_segment) { | ||
auto segment_iterable = create_iterable_from_segment<ElementType>(typed_segment); | ||
segment_iterable.for_each([&](const auto& value) { | ||
if (value.is_null()) return; | ||
insert(value.value()); | ||
}); | ||
}); | ||
} | ||
|
||
template <typename ElementType> | ||
size_t CountingQuotientFilter<ElementType>::memory_consumption() const { | ||
size_t consumption = 0; | ||
boost::apply_visitor([&](auto& filter) { consumption = qf_memory_consumption(filter); }, _quotient_filter); | ||
return consumption; | ||
} | ||
|
||
template <typename ElementType> | ||
float CountingQuotientFilter<ElementType>::load_factor() const { | ||
auto load_factor = 0.f; | ||
boost::apply_visitor([&](auto& filter) { load_factor = filter.noccupied_slots / static_cast<float>(filter.nslots); }, | ||
_quotient_filter); | ||
return load_factor; | ||
} | ||
|
||
template <typename ElementType> | ||
bool CountingQuotientFilter<ElementType>::is_full() const { | ||
return load_factor() > 0.99f; | ||
} | ||
|
||
// The member templates above are defined in this translation unit, so they are instantiated here via a project | ||
// macro defined outside this file - presumably once per supported data type; TODO confirm against the macro. | ||
EXPLICITLY_INSTANTIATE_DATA_TYPES(CountingQuotientFilter); | ||
|
||
} // namespace opossum |
67 changes: 67 additions & 0 deletions
67
src/lib/statistics/chunk_statistics/counting_quotient_filter.hpp
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,67 @@ | ||
#pragma once | ||
|
||
#include <string> | ||
#include <vector> | ||
|
||
#include "cqf16.hpp" | ||
#include "cqf2.hpp" | ||
#include "cqf32.hpp" | ||
#include "cqf4.hpp" | ||
#include "cqf8.hpp" | ||
#include "types.hpp" | ||
|
||
#include "abstract_filter.hpp" | ||
#include "storage/base_segment.hpp" | ||
|
||
namespace opossum { | ||
|
||
/* Counting Quotient Filters allow you to keep track of which values are present in a segment and how often. Filters | ||
work approximately. If a membership query yields a positive result, the value is probably present but there is | ||
a chance of a false positive. If the query delivers a negative result, the item is guaranteed to not be contained. | ||
In the same way, items can be over counted but not under counted. | ||
CQF can be configured with quotient size, which determines the number of slots, and the remainder size, which | ||
corresponds to the slot size. At this time, the remainder size must be 2, 4, 8, 16 or 32. | ||
"When you configure the CQF the number of slots in the CQF must be at least the number of the distinct elements in your | ||
input dataset plus the sum of logs of all the counts divided by the remainder size. You must have some estimate of the | ||
number distinct elements in your dataset to configure the CQF correctly. For example, if in a dataset there are `M` | ||
integers and `N` distinct integers. And let's assume each integer appears M/N times. Then the number of slots `S` you | ||
would need would be `S = N*(1 + log(M/N)/r)`. Since the number of slots can only be a power of two, we choose the | ||
smallest number greater than `S` that is a power of 2 as the number of slots." - Prashant Pandey | ||
*/ | ||
|
||
template <typename ElementType> | ||
class CountingQuotientFilter : public AbstractFilter, public Noncopyable { | ||
public: | ||
CountingQuotientFilter(const size_t quotient_size, const size_t remainder_size); | ||
~CountingQuotientFilter() override; | ||
|
||
void insert(ElementType value, size_t count = 1); | ||
void populate(const std::shared_ptr<const BaseSegment>& segment); | ||
|
||
size_t count(const ElementType& value) const; | ||
size_t count(const AllTypeVariant& value) const; | ||
|
||
size_t memory_consumption() const; | ||
|
||
float load_factor() const; | ||
|
||
bool is_full() const; | ||
|
||
bool can_prune(const PredicateCondition predicate_type, const AllTypeVariant& value, | ||
const std::optional<AllTypeVariant>& variant_value2 = std::nullopt) const override; | ||
|
||
// Can't copy CountingQuotientFilter | ||
CountingQuotientFilter(CountingQuotientFilter&) = delete; | ||
CountingQuotientFilter(CountingQuotientFilter&&) = delete; | ||
CountingQuotientFilter operator=(CountingQuotientFilter&) = delete; | ||
CountingQuotientFilter operator=(CountingQuotientFilter&&) = delete; | ||
|
||
private: | ||
uint64_t _hash(const ElementType& value) const; | ||
|
||
boost::variant<gqf2::QF, gqf4::QF, gqf8::QF, gqf16::QF, gqf32::QF> _quotient_filter; | ||
const size_t _hash_bits; | ||
}; | ||
|
||
} // namespace opossum |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.