diff --git a/.gitmodules b/.gitmodules index 02419ff689..0dc6fd60b2 100644 --- a/.gitmodules +++ b/.gitmodules @@ -25,6 +25,9 @@ [submodule "flat_hash_map"] path = third_party/flat_hash_map url = https://github.com/skarupke/flat_hash_map.git +[submodule "third_party/lz4"] + path = third_party/lz4 + url = https://github.com/lz4/lz4 [submodule "third_party/join-order-benchmark"] path = third_party/join-order-benchmark url = https://github.com/gregrahn/join-order-benchmark.git diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 2c3f562c30..1a839b0dcd 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -86,6 +86,7 @@ include_directories( ${PROJECT_SOURCE_DIR}/third_party/cxxopts/include ${PROJECT_SOURCE_DIR}/third_party/flat_hash_map ${PROJECT_SOURCE_DIR}/third_party/json + ${PROJECT_SOURCE_DIR}/third_party/lz4 ) if (${ENABLE_JIT_SUPPORT}) diff --git a/src/lib/CMakeLists.txt b/src/lib/CMakeLists.txt index b303f4b88f..7935895f2e 100644 --- a/src/lib/CMakeLists.txt +++ b/src/lib/CMakeLists.txt @@ -442,6 +442,10 @@ set( storage/index/segment_index_type.hpp storage/lqp_view.cpp storage/lqp_view.hpp + storage/lz4/lz4_encoder.hpp + storage/lz4/lz4_iterable.hpp + storage/lz4_segment.cpp + storage/lz4_segment.hpp storage/materialize.hpp storage/mvcc_data.cpp storage/mvcc_data.hpp @@ -578,6 +582,7 @@ set( sqlparser cqf uninitialized_vector + lz4 custom_jemalloc ${FILESYSTEM_LIBRARY} ${Boost_CONTAINER_LIBRARY} diff --git a/src/lib/constant_mappings.cpp b/src/lib/constant_mappings.cpp index 870d22871a..2a58782c12 100644 --- a/src/lib/constant_mappings.cpp +++ b/src/lib/constant_mappings.cpp @@ -76,6 +76,7 @@ const boost::bimap encoding_type_to_string = make_bim {EncodingType::RunLength, "RunLength"}, {EncodingType::FixedStringDictionary, "FixedStringDictionary"}, {EncodingType::FrameOfReference, "FrameOfReference"}, + {EncodingType::LZ4, "LZ4"}, {EncodingType::Unencoded, "Unencoded"}, }); diff --git a/src/lib/operators/print.cpp b/src/lib/operators/print.cpp index 2eb79fda14..8412fc98d9 100644 --- a/src/lib/operators/print.cpp +++ b/src/lib/operators/print.cpp @@ -194,6 +194,10 @@ std::string Print::_segment_type(const std::shared_ptr& segment) co segment_type += "FoR"; break; } + case EncodingType::LZ4: { + segment_type += "LZ4"; + break; + } } if (encoded_segment->compressed_vector_type()) { switch (*encoded_segment->compressed_vector_type()) { diff --git a/src/lib/storage/create_iterable_from_segment.hpp b/src/lib/storage/create_iterable_from_segment.hpp index 9c8a30f5f4..3eb8976637 100644 --- a/src/lib/storage/create_iterable_from_segment.hpp +++ b/src/lib/storage/create_iterable_from_segment.hpp @@ -2,6 +2,7 @@ #include "storage/dictionary_segment/dictionary_segment_iterable.hpp" #include "storage/frame_of_reference/frame_of_reference_iterable.hpp" +#include "storage/lz4/lz4_iterable.hpp" #include "storage/run_length_segment/run_length_segment_iterable.hpp" #include "storage/segment_iterables/any_segment_iterable.hpp" #include "storage/value_segment/value_segment_iterable.hpp" @@ -72,6 +73,16 @@ auto create_iterable_from_segment(const FrameOfReferenceSegment& segment) { } } +template +auto create_iterable_from_segment(const LZ4Segment& segment) { + // We always erase the type here because LZ4 is too slow to notice a difference anyway. 
+ if constexpr (EraseSegmentType) { + return create_any_segment_iterable(segment); + } else { + return LZ4Iterable{segment}; + } +} + /** * This function must be forward-declared because ReferenceSegmentIterable * includes this file leading to a circular dependency diff --git a/src/lib/storage/encoding_type.hpp b/src/lib/storage/encoding_type.hpp index c2b43e5bf5..ced4028d07 100644 --- a/src/lib/storage/encoding_type.hpp +++ b/src/lib/storage/encoding_type.hpp @@ -17,11 +17,12 @@ namespace opossum { namespace hana = boost::hana; -enum class EncodingType : uint8_t { Unencoded, Dictionary, RunLength, FixedStringDictionary, FrameOfReference }; +enum class EncodingType : uint8_t { Unencoded, Dictionary, RunLength, FixedStringDictionary, FrameOfReference, LZ4 }; inline static std::vector encoding_type_enum_values{ - EncodingType::Unencoded, EncodingType::Dictionary, EncodingType::RunLength, EncodingType::FixedStringDictionary, - EncodingType::FrameOfReference}; + EncodingType::Unencoded, EncodingType::Dictionary, + EncodingType::RunLength, EncodingType::FixedStringDictionary, + EncodingType::FrameOfReference, EncodingType::LZ4}; /** * @brief Maps each encoding type to its supported data types @@ -36,7 +37,8 @@ constexpr auto supported_data_types_for_encoding_type = hana::make_map( hana::make_pair(enum_c, data_types), hana::make_pair(enum_c, data_types), hana::make_pair(enum_c, hana::tuple_t), - hana::make_pair(enum_c, hana::tuple_t)); + hana::make_pair(enum_c, hana::tuple_t), + hana::make_pair(enum_c, data_types)); /** * @return an integral constant implicitly convertible to bool diff --git a/src/lib/storage/lz4/lz4_encoder.hpp b/src/lib/storage/lz4/lz4_encoder.hpp new file mode 100644 index 0000000000..069e72880f --- /dev/null +++ b/src/lib/storage/lz4/lz4_encoder.hpp @@ -0,0 +1,170 @@ +#pragma once + +#include + +#include +#include +#include +#include +#include + +#include "storage/base_segment_encoder.hpp" +#include "storage/lz4_segment.hpp" +#include "storage/value_segment.hpp" +#include "storage/value_segment/value_segment_iterable.hpp" +#include "storage/vector_compression/vector_compression.hpp" +#include "types.hpp" +#include "utils/assert.hpp" +#include "utils/enum_constant.hpp" + +namespace opossum { + +class LZ4Encoder : public SegmentEncoder { + public: + static constexpr auto _encoding_type = enum_c; + static constexpr auto _uses_vector_compression = false; + + template + std::shared_ptr _on_encode(const std::shared_ptr>& value_segment) { + const auto alloc = value_segment->values().get_allocator(); + const auto num_elements = value_segment->size(); + + // TODO(anyone): when value segments switch to using pmr_vectors, the data can be copied directly instead of + // copying it element by element + auto values = pmr_vector{alloc}; + values.resize(num_elements); + auto null_values = pmr_vector{alloc}; + null_values.resize(num_elements); + + // copy values and null flags from value segment + auto iterable = ValueSegmentIterable{*value_segment}; + iterable.with_iterators([&](auto it, auto end) { + // iterate over the segment to access the values and increment the row index to write to the values and null + // values vectors + for (size_t row_index = 0u; it != end; ++it, ++row_index) { + auto segment_value = *it; + values[row_index] = segment_value.value(); + null_values[row_index] = segment_value.is_null(); + } + }); + + /** + * Use the LZ4 high compression API to compress the copied values. As C-library LZ4 needs raw pointers as input + * and output. 
To avoid directly handling raw pointers we use std::vectors as input and output. The input vector + * contains the data that needs to be compressed and the output vector is allocated enough memory to contain + * the compression result. Via the .data() call we can supply LZ4 with raw pointers to the memory the vectors use. + * These are cast to char-pointers since LZ4 expects char pointers. + */ + DebugAssert(values.size() * sizeof(T) <= std::numeric_limits::max(), + "Input of LZ4 encoder contains too many bytes to fit into a 32-bit signed integer sized vector that is" + " used by the LZ4 library."); + + const auto input_size = values.size() * sizeof(T); + // estimate the (maximum) output size + const auto output_size = LZ4_compressBound(static_cast(input_size)); + auto compressed_data = pmr_vector{alloc}; + compressed_data.resize(static_cast(output_size)); + const int compression_result = LZ4_compress_HC(reinterpret_cast(values.data()), compressed_data.data(), + static_cast(input_size), output_size, LZ4HC_CLEVEL_MAX); + Assert(compression_result > 0, "LZ4 compression failed"); + + // shrink the vector to the actual size of the compressed result + compressed_data.resize(static_cast(compression_result)); + compressed_data.shrink_to_fit(); + + return std::allocate_shared>(alloc, std::move(compressed_data), std::move(null_values), input_size); + } + + std::shared_ptr _on_encode(const std::shared_ptr>& value_segment) { + const auto alloc = value_segment->values().get_allocator(); + const auto num_elements = value_segment->size(); + + /** + * First iterate over the values for two reasons. + * 1) If all the strings are empty LZ4 will try to compress an empty vector which will cause a segmentation fault. + * In this case we can and need to do an early exit. + * 2) Sum the length of the strings to improve the performance when copying the data to the char vector. + */ + size_t num_chars = 0u; + ValueSegmentIterable{*value_segment}.with_iterators([&](auto it, auto end) { + for (size_t row_index = 0; it != end; ++it, ++row_index) { + if (!it->is_null()) { + num_chars += it->value().size(); + } + } + }); + + // copy values and null flags from value segment + auto values = pmr_vector{alloc}; + values.reserve(num_chars); + auto null_values = pmr_vector{alloc}; + null_values.resize(num_elements); + + /** + * These offsets mark the beginning of strings (and therefore end of the previous string) in the data vector. + * These offsets are character offsets. The string at position 0 starts at the offset stored at position 0, which + * will always be 0. + * Its exclusive end is the offset stored at position 1 (i.e. offsets[1] - 1 is the last character of the string + * at position 0). + * In case of the last string its end is determined by the end of the data vector. 
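+ * For example, the three strings "ab", "" and "cde" would be stored as the character data "abcde" with offsets {0, 2, 2}.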
+ */ + auto offsets = pmr_vector{alloc}; + offsets.resize(num_elements); + + auto iterable = ValueSegmentIterable{*value_segment}; + iterable.with_iterators([&](auto it, auto end) { + size_t offset = 0u; + bool is_null; + // iterate over the iterator to access the values and increment the row index to write to the values and null + // values vectors + for (size_t row_index = 0; it != end; ++it, ++row_index) { + auto segment_value = *it; + is_null = segment_value.is_null(); + null_values[row_index] = is_null; + offsets[row_index] = offset; + if (!is_null) { + auto data = segment_value.value(); + values.insert(values.cend(), data.begin(), data.end()); + offset += data.size(); + } + } + }); + + /** + * If the input only contained null values and/or empty strings we don't need to compress anything (and LZ4 will + * cause an error). Therefore we can return the encoded segment already. + */ + if (!num_chars) { + return std::allocate_shared>(alloc, pmr_vector{alloc}, std::move(null_values), + std::move(offsets), 0u); + } + + /** + * Use the LZ4 high compression API to compress the copied values. As C-library LZ4 needs raw pointers as input + * and output. To avoid directly handling raw pointers we use std::vectors as input and output. The input vector + * contains the data that needs to be compressed and the output vector is allocated enough memory to contain + * the compression result. Via the .data() call we can supply LZ4 with raw pointers to the memory the vectors use. + * These are cast to char-pointers since LZ4 expects char pointers. + */ + DebugAssert(values.size() <= std::numeric_limits::max(), + "String input of LZ4 encoder contains too many characters to fit into a 32-bit signed integer sized " + "vector that is used by the LZ4 library."); + const auto input_size = values.size(); + // estimate the (maximum) output size + const auto output_size = LZ4_compressBound(static_cast(input_size)); + auto compressed_data = pmr_vector{alloc}; + compressed_data.resize(static_cast(output_size)); + const int compression_result = LZ4_compress_HC(values.data(), compressed_data.data(), static_cast(input_size), + output_size, LZ4HC_CLEVEL_MAX); + Assert(compression_result > 0, "LZ4 compression failed"); + + // shrink the vector to the actual size of the compressed result + compressed_data.resize(static_cast(compression_result)); + compressed_data.shrink_to_fit(); + + return std::allocate_shared>(alloc, std::move(compressed_data), std::move(null_values), + std::move(offsets), input_size); + } +}; + +} // namespace opossum diff --git a/src/lib/storage/lz4/lz4_iterable.hpp b/src/lib/storage/lz4/lz4_iterable.hpp new file mode 100644 index 0000000000..bdd51650ef --- /dev/null +++ b/src/lib/storage/lz4/lz4_iterable.hpp @@ -0,0 +1,133 @@ +#pragma once + +#include + +#include "storage/segment_iterables.hpp" + +#include "storage/lz4_segment.hpp" +#include "storage/vector_compression/resolve_compressed_vector_type.hpp" + +namespace opossum { + +template +class LZ4Iterable : public PointAccessibleSegmentIterable> { + public: + using ValueType = T; + + explicit LZ4Iterable(const LZ4Segment& segment) : _segment{segment} {} + + template + void _on_with_iterators(const Functor& functor) const { + auto decompressed_segment = _segment.decompress(); + // alias the data type of the constant iterator over the decompressed data + using ValueIterator = decltype(decompressed_segment.cbegin()); + + // create iterator instances for the begin and end + auto begin = Iterator{decompressed_segment.cbegin(), 
_segment.null_values().cbegin()}; + auto end = Iterator{decompressed_segment.cend(), _segment.null_values().cend()}; + + // call the functor on the iterators (until the begin iterator equals the end iterator) + functor(begin, end); + } + + /** + * For now this point access iterator decompresses the whole segment. + */ + template + void _on_with_iterators(const std::shared_ptr& position_filter, const Functor& functor) const { + auto decompressed_segment = _segment.decompress(); + // alias the data type of the constant iterator over the decompressed data + using ValueIterator = decltype(decompressed_segment.cbegin()); + + // create point access iterator instances for the begin and end + auto begin = PointAccessIterator{decompressed_segment, _segment.null_values(), + position_filter->cbegin(), position_filter->cbegin()}; + auto end = PointAccessIterator{decompressed_segment, _segment.null_values(), + position_filter->cbegin(), position_filter->cend()}; + + // call the functor on the iterators (until the begin iterator equals the end iterator) + functor(begin, end); + } + + size_t _on_size() const { return _segment.size(); } + + private: + const LZ4Segment& _segment; + + private: + template + class Iterator : public BaseSegmentIterator, SegmentPosition> { + public: + using ValueType = T; + using IterableType = LZ4Iterable; + using NullValueIterator = typename pmr_vector::const_iterator; + + public: + // Begin and End Iterator + explicit Iterator(ValueIterator data_it, const NullValueIterator null_value_it) + : _chunk_offset{0u}, _data_it{data_it}, _null_value_it{null_value_it} {} + + private: + friend class boost::iterator_core_access; // grants the boost::iterator_facade access to the private interface + + void increment() { + ++_chunk_offset; + ++_data_it; + ++_null_value_it; + } + + void advance(std::ptrdiff_t n) { + DebugAssert(n >= 0, "Rewinding iterators is not implemented"); + // The easy way for now + for (std::ptrdiff_t i = 0; i < n; ++i) { + increment(); + } + } + + bool equal(const Iterator& other) const { return _data_it == other._data_it; } + + std::ptrdiff_t distance_to(const Iterator& other) const { + return std::ptrdiff_t{other._chunk_offset} - std::ptrdiff_t{_chunk_offset}; + } + + SegmentPosition dereference() const { return SegmentPosition{*_data_it, *_null_value_it, _chunk_offset}; } + + private: + ChunkOffset _chunk_offset; + ValueIterator _data_it; + NullValueIterator _null_value_it; + }; + + template + class PointAccessIterator + : public BasePointAccessSegmentIterator, SegmentPosition> { + public: + using ValueType = T; + using IterableType = LZ4Iterable; + + // Begin Iterator + PointAccessIterator(const std::vector data, const pmr_vector& null_values, + const PosList::const_iterator position_filter_begin, PosList::const_iterator position_filter_it) + : BasePointAccessSegmentIterator, + SegmentPosition>{std::move(position_filter_begin), + std::move(position_filter_it)}, + _data{data}, + _null_values{null_values} {} + + private: + friend class boost::iterator_core_access; // grants the boost::iterator_facade access to the private interface + + SegmentPosition dereference() const { + const auto& chunk_offsets = this->chunk_offsets(); + const auto value = _data[chunk_offsets.offset_in_referenced_chunk]; + const auto is_null = _null_values[chunk_offsets.offset_in_referenced_chunk]; + return SegmentPosition{value, is_null, chunk_offsets.offset_in_poslist}; + } + + private: + const std::vector _data; + const pmr_vector& _null_values; + }; +}; + +} // namespace opossum diff 
--git a/src/lib/storage/lz4_segment.cpp b/src/lib/storage/lz4_segment.cpp new file mode 100644 index 0000000000..9dc0b72ea8 --- /dev/null +++ b/src/lib/storage/lz4_segment.cpp @@ -0,0 +1,158 @@ +#include "lz4_segment.hpp" + +#include + +#include "resolve_type.hpp" +#include "storage/vector_compression/base_compressed_vector.hpp" +#include "utils/assert.hpp" +#include "utils/performance_warning.hpp" + +namespace opossum { + +template +LZ4Segment::LZ4Segment(pmr_vector&& compressed_data, pmr_vector&& null_values, + pmr_vector&& offsets, const size_t decompressed_size) + : BaseEncodedSegment{data_type_from_type()}, + _compressed_data{std::move(compressed_data)}, + _null_values{std::move(null_values)}, + _offsets{std::move(offsets)}, + _decompressed_size{decompressed_size} {} + +template +LZ4Segment::LZ4Segment(pmr_vector&& compressed_data, pmr_vector&& null_values, + const size_t decompressed_size) + : BaseEncodedSegment{data_type_from_type()}, + _compressed_data{std::move(compressed_data)}, + _null_values{std::move(null_values)}, + _offsets{std::nullopt}, + _decompressed_size{decompressed_size} {} + +template +const AllTypeVariant LZ4Segment::operator[](const ChunkOffset chunk_offset) const { + PerformanceWarning("operator[] used"); + DebugAssert(chunk_offset < size(), "Passed chunk offset must be valid."); + + const auto typed_value = get_typed_value(chunk_offset); + if (!typed_value.has_value()) { + return NULL_VALUE; + } + return *typed_value; +} + +template +const std::optional LZ4Segment::get_typed_value(const ChunkOffset chunk_offset) const { + PerformanceWarning("LZ4::get_typed_value: decompressing the whole LZ4 segment"); + auto decompressed_segment = decompress(); + + const auto is_null = _null_values[chunk_offset]; + if (is_null) { + return std::nullopt; + } + + return decompressed_segment[chunk_offset]; +} + +template +const pmr_vector& LZ4Segment::null_values() const { + return _null_values; +} + +template +const std::optional> LZ4Segment::offsets() const { + return _offsets; +} + +template +size_t LZ4Segment::size() const { + return _null_values.size(); +} + +template +std::vector LZ4Segment::decompress() const { + auto decompressed_data = std::vector(_decompressed_size / sizeof(T)); + auto compressed_size = static_cast(_compressed_data.size()); + const int decompressed_result = + LZ4_decompress_safe(_compressed_data.data(), reinterpret_cast(decompressed_data.data()), compressed_size, + static_cast(_decompressed_size)); + Assert(decompressed_result > 0, "LZ4 decompression failed"); + + return decompressed_data; +} + +template <> +std::vector LZ4Segment::decompress() const { + /** + * If the input segment only contained empty strings the original size is 0. That can't be decompressed and instead + * we can just return as many empty strings as the input contained. + */ + if (!_decompressed_size) { + return std::vector(_null_values.size()); + } + + auto decompressed_data = std::vector(_decompressed_size); + auto compressed_size = static_cast(_compressed_data.size()); + const int decompressed_result = LZ4_decompress_safe(_compressed_data.data(), decompressed_data.data(), + compressed_size, static_cast(_decompressed_size)); + Assert(decompressed_result > 0, "LZ4 decompression failed"); + + /** + * Decode the previously encoded string data. These strings are all appended and separated along the stored offsets. + * Each offset corresponds to a single string. The stored offset itself is the character offset of the first character + * of the string. 
The end offset is the first character behind the string that is NOT part of the string (i.e., an + * exclusive offset). It is usually the next offset in the vector. In the case of the last offset the end offset is + * indicated by the end of the data vector. + */ + auto decompressed_strings = std::vector(); + for (auto it = _offsets->cbegin(); it != _offsets->cend(); ++it) { + auto start_char_offset = *it; + size_t end_char_offset; + if (it + 1 == _offsets->cend()) { + end_char_offset = _decompressed_size; + } else { + end_char_offset = *(it + 1); + } + + const auto start_offset_it = decompressed_data.cbegin() + start_char_offset; + const auto end_offset_it = decompressed_data.cbegin() + end_char_offset; + decompressed_strings.emplace_back(start_offset_it, end_offset_it); + } + + return decompressed_strings; +} + +template +std::shared_ptr LZ4Segment::copy_using_allocator(const PolymorphicAllocator& alloc) const { + auto new_compressed_data = pmr_vector{_compressed_data, alloc}; + auto new_null_values = pmr_vector{_null_values, alloc}; + + if (_offsets.has_value()) { + auto new_offsets = pmr_vector(*_offsets, alloc); + return std::allocate_shared(alloc, std::move(new_compressed_data), std::move(new_null_values), + std::move(new_offsets), _decompressed_size); + } else { + return std::allocate_shared(alloc, std::move(new_compressed_data), std::move(new_null_values), + _decompressed_size); + } +} + +template +size_t LZ4Segment::estimate_memory_usage() const { + auto bool_size = _null_values.size() * sizeof(bool); + // _offsets is used only for strings + auto offset_size = (_offsets.has_value() ? _offsets->size() * sizeof(size_t) : 0u); + return sizeof(*this) + _compressed_data.size() + bool_size + offset_size; +} + +template +EncodingType LZ4Segment::encoding_type() const { + return EncodingType::LZ4; +} + +template +std::optional LZ4Segment::compressed_vector_type() const { + return std::nullopt; +} + +EXPLICITLY_INSTANTIATE_DATA_TYPES(LZ4Segment); + +} // namespace opossum diff --git a/src/lib/storage/lz4_segment.hpp b/src/lib/storage/lz4_segment.hpp new file mode 100644 index 0000000000..2de0f85130 --- /dev/null +++ b/src/lib/storage/lz4_segment.hpp @@ -0,0 +1,83 @@ +#pragma once + +#include +#include +#include + +#include + +#include +#include + +#include "base_encoded_segment.hpp" +#include "storage/vector_compression/base_compressed_vector.hpp" +#include "types.hpp" + +namespace opossum { + +class BaseCompressedVector; + +template +class LZ4Segment : public BaseEncodedSegment { + public: + /* + * This is a container for an LZ4 compressed segment. It contains the compressed data, the necessary + * metadata and the ability to decompress the data again. + * + * @param compressed_data The char vector that contains the LZ4 compressed segment data as binary blob. + * @param null_values Boolean vector that contains the information which row is null and which is not null. + * @param offsets If this segment is not a pmr_string segment this will be a std::nullopt (see the other constructor). + * Otherwise it contains the offsets for the compressed strings. The offset at position 0 is the + * character index of the string at index 0. Its (exclusive) end is at the offset at position 1. The + * last string ends at the end of the compressed data (since there is no offset after it that specifies + * the end offset). Since these offsets are used, the stored strings are not null-terminated + * (and may contain null bytes).
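+ * For example, offsets {0, 2} with the decompressed character data "abc" describe the strings "ab" and "c"; the end of the last string is given by the end of the data.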
+ * @param compressed_size The size of the compressed data vector (the return value of LZ4) + * @param decompressed_size The size in bytes of the decompressed data vector. + */ + explicit LZ4Segment(pmr_vector&& compressed_data, pmr_vector&& null_values, pmr_vector&& offsets, + const size_t decompressed_size); + + explicit LZ4Segment(pmr_vector&& compressed_data, pmr_vector&& null_values, + const size_t decompressed_size); + + const pmr_vector& null_values() const; + const std::optional> offsets() const; + + /** + * @defgroup BaseSegment interface + * @{ + */ + + const AllTypeVariant operator[](const ChunkOffset chunk_offset) const final; + + const std::optional get_typed_value(const ChunkOffset chunk_offset) const; + + size_t size() const final; + + std::vector decompress() const; + + std::shared_ptr copy_using_allocator(const PolymorphicAllocator& alloc) const final; + + size_t estimate_memory_usage() const final; + + /**@}*/ + + /** + * @defgroup BaseEncodedSegment interface + * @{ + */ + + EncodingType encoding_type() const final; + std::optional compressed_vector_type() const final; + + /**@}*/ + + private: + const pmr_vector _compressed_data; + const pmr_vector _null_values; + const std::optional> _offsets; + const size_t _decompressed_size; +}; + +} // namespace opossum diff --git a/src/lib/storage/resolve_encoded_segment_type.hpp b/src/lib/storage/resolve_encoded_segment_type.hpp index 9da74fb2ba..0049ba915b 100644 --- a/src/lib/storage/resolve_encoded_segment_type.hpp +++ b/src/lib/storage/resolve_encoded_segment_type.hpp @@ -11,6 +11,7 @@ #include "storage/dictionary_segment.hpp" #include "storage/fixed_string_dictionary_segment.hpp" #include "storage/frame_of_reference_segment.hpp" +#include "storage/lz4_segment.hpp" #include "storage/run_length_segment.hpp" #include "storage/encoding_type.hpp" @@ -32,7 +33,8 @@ constexpr auto encoded_segment_for_type = hana::make_map( hana::make_pair(enum_c, template_c), hana::make_pair(enum_c, template_c), - hana::make_pair(enum_c, template_c)); + hana::make_pair(enum_c, template_c), + hana::make_pair(enum_c, template_c)); /** * @brief Resolves the type of an encoded segment. 
diff --git a/src/lib/storage/segment_encoding_utils.cpp b/src/lib/storage/segment_encoding_utils.cpp index f6399cd3f3..738c9a0b4a 100644 --- a/src/lib/storage/segment_encoding_utils.cpp +++ b/src/lib/storage/segment_encoding_utils.cpp @@ -5,6 +5,7 @@ #include "storage/dictionary_segment/dictionary_encoder.hpp" #include "storage/frame_of_reference/frame_of_reference_encoder.hpp" +#include "storage/lz4/lz4_encoder.hpp" #include "storage/run_length_segment/run_length_encoder.hpp" #include "storage/base_value_segment.hpp" @@ -24,7 +25,8 @@ const auto encoder_for_type = std::map>()}, {EncodingType::RunLength, std::make_shared()}, {EncodingType::FixedStringDictionary, std::make_shared>()}, - {EncodingType::FrameOfReference, std::make_shared()}}; + {EncodingType::FrameOfReference, std::make_shared()}, + {EncodingType::LZ4, std::make_shared()}}; } // namespace diff --git a/src/test/CMakeLists.txt b/src/test/CMakeLists.txt index 1a40f4d443..8549baf0ab 100644 --- a/src/test/CMakeLists.txt +++ b/src/test/CMakeLists.txt @@ -161,6 +161,7 @@ set( storage/fixed_string_vector_test.cpp storage/group_key_index_test.cpp storage/iterables_test.cpp + storage/lz4_segment_test.cpp storage/materialize_test.cpp storage/multi_segment_index_test.cpp storage/prepared_plan_test.cpp diff --git a/src/test/storage/encoded_segment_test.cpp b/src/test/storage/encoded_segment_test.cpp index 64cae05688..885e0a36a9 100644 --- a/src/test/storage/encoded_segment_test.cpp +++ b/src/test/storage/encoded_segment_test.cpp @@ -125,7 +125,7 @@ INSTANTIATE_TEST_CASE_P( SegmentEncodingSpec{EncodingType::Dictionary, VectorCompressionType::FixedSizeByteAligned}, SegmentEncodingSpec{EncodingType::FrameOfReference, VectorCompressionType::SimdBp128}, SegmentEncodingSpec{EncodingType::FrameOfReference, VectorCompressionType::FixedSizeByteAligned}, - SegmentEncodingSpec{EncodingType::RunLength}), + SegmentEncodingSpec{EncodingType::RunLength}, SegmentEncodingSpec{EncodingType::LZ4}), formatter); TEST_P(EncodedSegmentTest, SequentiallyReadNotNullableIntSegment) { diff --git a/src/test/storage/encoding_test.hpp b/src/test/storage/encoding_test.hpp index 19867f7b2b..fbdc37d3f0 100644 --- a/src/test/storage/encoding_test.hpp +++ b/src/test/storage/encoding_test.hpp @@ -25,6 +25,8 @@ const SegmentEncodingSpec all_segment_encoding_specs[]{ {EncodingType::Unencoded}, {EncodingType::Dictionary, VectorCompressionType::FixedSizeByteAligned}, {EncodingType::Dictionary, VectorCompressionType::SimdBp128}, - {EncodingType::RunLength}}; + {EncodingType::RunLength}, + {EncodingType::LZ4}, +}; } // namespace opossum diff --git a/src/test/storage/lz4_segment_test.cpp b/src/test/storage/lz4_segment_test.cpp new file mode 100644 index 0000000000..e3fbb8cfa3 --- /dev/null +++ b/src/test/storage/lz4_segment_test.cpp @@ -0,0 +1,149 @@ +#include +#include +#include + +#include "base_test.hpp" +#include "gtest/gtest.h" + +#include "storage/chunk_encoder.hpp" +#include "storage/lz4_segment.hpp" +#include "storage/segment_encoding_utils.hpp" +#include "storage/value_segment.hpp" + +namespace opossum { + +class StorageLZ4SegmentTest : public BaseTest { + protected: + std::shared_ptr> vs_str = std::make_shared>(true); +}; + +TEST_F(StorageLZ4SegmentTest, CompressNullableSegmentString) { + vs_str->append("Alex"); + vs_str->append("Peter"); + vs_str->append("Ralf"); + vs_str->append("Hans"); + vs_str->append(NULL_VALUE); + vs_str->append("Anna"); + + auto segment = encode_segment(EncodingType::LZ4, DataType::String, vs_str); + auto lz4_segment = 
std::dynamic_pointer_cast>(segment); + + // Test segment size + EXPECT_EQ(lz4_segment->size(), 6u); + + // Test compressed values + auto decompressed_data = lz4_segment->decompress(); + EXPECT_EQ(decompressed_data[0], "Alex"); + EXPECT_EQ(decompressed_data[1], "Peter"); + + auto& null_values = lz4_segment->null_values(); + EXPECT_EQ(null_values.size(), 6u); + auto expected_null_values = std::vector{false, false, false, false, true, false}; + + auto offsets = lz4_segment->offsets(); + EXPECT_TRUE(offsets.has_value()); + EXPECT_EQ(offsets->size(), 6u); + auto expected_offsets = std::vector{0, 4, 9, 13, 17, 17}; + + for (auto index = 0u; index < lz4_segment->size(); ++index) { + // Test null values + EXPECT_TRUE(null_values[index] == expected_null_values[index]); + + // Test offsets + EXPECT_TRUE((*offsets)[index] == expected_offsets[index]); + } +} + +TEST_F(StorageLZ4SegmentTest, CompressNullableAndEmptySegmentString) { + vs_str->append("Alex"); + vs_str->append("Peter"); + vs_str->append("Ralf"); + vs_str->append(""); + vs_str->append(NULL_VALUE); + vs_str->append("Anna"); + + auto segment = encode_segment(EncodingType::LZ4, DataType::String, vs_str); + auto lz4_segment = std::dynamic_pointer_cast>(segment); + + // Test segment size + EXPECT_EQ(lz4_segment->size(), 6u); + + // The empty string should not be a null value + auto& null_values = lz4_segment->null_values(); + EXPECT_EQ(null_values.size(), 6u); + auto expected_null_values = std::vector{false, false, false, false, true, false}; + + auto offsets = lz4_segment->offsets(); + EXPECT_TRUE(offsets.has_value()); + EXPECT_EQ(offsets->size(), 6u); + auto expected_offsets = std::vector{0, 4, 9, 13, 13, 13}; + + for (auto index = 0u; index < lz4_segment->size(); ++index) { + // Test null values + EXPECT_TRUE(null_values[index] == expected_null_values[index]); + + // Test offsets + EXPECT_TRUE((*offsets)[index] == expected_offsets[index]); + } +} + +TEST_F(StorageLZ4SegmentTest, CompressEmptySegmentString) { + for (int i = 0; i < 6; ++i) { + vs_str->append(""); + } + + auto segment = encode_segment(EncodingType::LZ4, DataType::String, vs_str); + auto lz4_segment = std::dynamic_pointer_cast>(segment); + + // Test segment size + EXPECT_EQ(lz4_segment->size(), 6u); + + // Test compressed values + auto decompressed_data = lz4_segment->decompress(); + EXPECT_EQ(decompressed_data.size(), 6u); + for (const auto& elem : decompressed_data) { + EXPECT_EQ(elem, ""); + } + + // Test offsets + auto offsets = lz4_segment->offsets(); + EXPECT_TRUE(offsets.has_value()); + EXPECT_EQ(offsets->size(), 6u); + for (auto offset : (*offsets)) { + EXPECT_EQ(offset, 0); + } +} + +TEST_F(StorageLZ4SegmentTest, CompressSingleCharSegmentString) { + for (int i = 0; i < 5; ++i) { + vs_str->append(""); + } + vs_str->append("a"); + + auto segment = encode_segment(EncodingType::LZ4, DataType::String, vs_str); + auto lz4_segment = std::dynamic_pointer_cast>(segment); + + // Test segment size + EXPECT_EQ(lz4_segment->size(), 6u); + + auto decompressed_data = lz4_segment->decompress(); + auto offsets = lz4_segment->offsets(); + EXPECT_TRUE(offsets.has_value()); + EXPECT_EQ(decompressed_data.size(), 6u); + EXPECT_EQ(offsets->size(), 6u); + + for (auto index = 0u; index < lz4_segment->size() - 1; ++index) { + // Test compressed values + EXPECT_EQ(decompressed_data[index], ""); + + // Test offsets + EXPECT_EQ((*offsets)[index], 0); + } + + // Test last element + EXPECT_EQ(decompressed_data[5], "a"); + // This offset is also 0 since the elements before it don't have any content + 
EXPECT_EQ((*offsets)[5], 0); +} + +} // namespace opossum diff --git a/third_party/CMakeLists.txt b/third_party/CMakeLists.txt index f3c9b50e56..d32a71d574 100644 --- a/third_party/CMakeLists.txt +++ b/third_party/CMakeLists.txt @@ -99,6 +99,24 @@ if(${NUMA_FOUND}) add_subdirectory(pgasus) endif() +## Build lz4 +set(LZ4_LIBRARY_DIR lz4/lib) + +add_library (lz4 + ${LZ4_LIBRARY_DIR}/lz4.c + ${LZ4_LIBRARY_DIR}/lz4hc.c + + ${LZ4_LIBRARY_DIR}/lz4.h + ${LZ4_LIBRARY_DIR}/lz4hc.h +) + +target_include_directories( + lz4 + + PUBLIC + ${LZ4_LIBRARY_DIR} +) + # TODO(lawben): There is currently a PR open for CMake support in libpqxx. # Once that is merged, this should be updated to add_subdirectory(libpqxx) add_library( diff --git a/third_party/lz4 b/third_party/lz4 new file mode 160000 index 0000000000..641b453d9d --- /dev/null +++ b/third_party/lz4 @@ -0,0 +1 @@ +Subproject commit 641b453d9db536ee020851bfcb1dc39f61006f0a