[DYOD18/19] Add LZ4 Encoding Scheme (hyrise#1467)
* Add lz4 submodule

* Copy frame of reference files to lz4 files and change constants

* Add import

* Fix copy-and-paste errors

* Implement simple base access lz4 iterator

* Add lz4 to cmake project

* Implement lz4 segment

* Implement lz4 encoder

* Use decompress call in iterable

* Fix lz4 segment compiler error

* Use shared pointer for decompressed data

* add zstd submodule

* add zstd dictBuilder includes

* Handle string segments and copy concurrent vector to stl

* Auto stash before merge of "lz4_integration" and "origin/lz4_integration"

* extern c and add explicit casting

* add LZ4 EncodingType to more places

* fix merge issues

* comment non-working code

* build lz4 as well

* Fix lz4 library linking

* Fix linker error

* Enable encoding tests for lz4

* Refactor lz4 segment

* Remove old code

* refactor lz4 encoder

* Remove old code

* Refactor lz4 iterable

* Fix string decompression

* Add debugassert, fails and remove dead code

* Use Fail and cast

* indentation

* formatter

* cleanup

* remove zstd submodule

* fix estimate memory usage in lz4 segment

* remove lz4 iterable inlining

* commas

* add lz4 segment parameter docstring

* move include

* assert instead of debug

* emplace back instead of push back

* reserve->resize and offset documentation

* don't use c_strings in string compression

* lz4 in caps

* Revert "remove lz4 iterable inlining"

This reverts commit 03c7385.

* add missing debug assert in encoder

* remove lz4 deprecation compiler setting

* remove explicit nullpointer instantiation

* decompress string data in a more elegant way

* remove lz4 compile definitions

* remove comments

* shrink vector after compression to actual size

* remove num_elements

* use resize over reserve

* lz4 compression comment

* improve encoder debug asserts

* use const

* comment

* remove T suffix

* fix encoder

* fix size call

* docstring for string decompression

* remove null values if else

* remove _compressed_size member from lz4 segment

* remove _compressed_size member from lz4 segment

* remove accessors

* null termination comment

* lz4 docstring

* remove shared pointer from data and null values

* keep offset pointer in encoder

* remove reference

* re-add offset accessor

* return null value reference

* refactor copy_using_allocator

* also call shrink_to_fit

* make decompressed_size a size_t

* add LZ4 segment test

* improve string copy performance and handle edge case of empty strings

* fix test

* fix empty vector argument

* fix test

* handle empty strings in decompression

* format

* add single char test case

* fix linter

* fix indices and add more expects

* fix offset

* fix test

* move third party includes

* Add comment

* remove std::move without effect

* Remove const references and use std::move

* add advance and distance to sequential iterator

* inlining

* universal reference

* make offsets optional

* maybe fix test

* add second constructor

* fix copy

* fix remaining tests

* format

* remove const

* use pmr_string instead of std::string

* english

* format
janehmueller authored and mrks committed Feb 28, 2019
1 parent e24c7a1 commit 61c53bc
Showing 19 changed files with 754 additions and 8 deletions.
3 changes: 3 additions & 0 deletions .gitmodules
@@ -25,6 +25,9 @@
[submodule "flat_hash_map"]
path = third_party/flat_hash_map
url = https://github.com/skarupke/flat_hash_map.git
[submodule "third_party/lz4"]
path = third_party/lz4
url = https://github.com/lz4/lz4
[submodule "third_party/join-order-benchmark"]
path = third_party/join-order-benchmark
url = https://github.com/gregrahn/join-order-benchmark.git
1 change: 1 addition & 0 deletions src/CMakeLists.txt
@@ -86,6 +86,7 @@ include_directories(
${PROJECT_SOURCE_DIR}/third_party/cxxopts/include
${PROJECT_SOURCE_DIR}/third_party/flat_hash_map
${PROJECT_SOURCE_DIR}/third_party/json
${PROJECT_SOURCE_DIR}/third_party/lz4
)

if (${ENABLE_JIT_SUPPORT})
5 changes: 5 additions & 0 deletions src/lib/CMakeLists.txt
@@ -442,6 +442,10 @@ set(
storage/index/segment_index_type.hpp
storage/lqp_view.cpp
storage/lqp_view.hpp
storage/lz4/lz4_encoder.hpp
storage/lz4/lz4_iterable.hpp
storage/lz4_segment.cpp
storage/lz4_segment.hpp
storage/materialize.hpp
storage/mvcc_data.cpp
storage/mvcc_data.hpp
@@ -578,6 +582,7 @@ set(
sqlparser
cqf
uninitialized_vector
lz4
custom_jemalloc
${FILESYSTEM_LIBRARY}
${Boost_CONTAINER_LIBRARY}
1 change: 1 addition & 0 deletions src/lib/constant_mappings.cpp
@@ -76,6 +76,7 @@ const boost::bimap<EncodingType, std::string> encoding_type_to_string = make_bim
{EncodingType::RunLength, "RunLength"},
{EncodingType::FixedStringDictionary, "FixedStringDictionary"},
{EncodingType::FrameOfReference, "FrameOfReference"},
{EncodingType::LZ4, "LZ4"},
{EncodingType::Unencoded, "Unencoded"},
});

4 changes: 4 additions & 0 deletions src/lib/operators/print.cpp
@@ -194,6 +194,10 @@ std::string Print::_segment_type(const std::shared_ptr<BaseSegment>& segment) co
segment_type += "FoR";
break;
}
case EncodingType::LZ4: {
segment_type += "LZ4";
break;
}
}
if (encoded_segment->compressed_vector_type()) {
switch (*encoded_segment->compressed_vector_type()) {
11 changes: 11 additions & 0 deletions src/lib/storage/create_iterable_from_segment.hpp
@@ -2,6 +2,7 @@

#include "storage/dictionary_segment/dictionary_segment_iterable.hpp"
#include "storage/frame_of_reference/frame_of_reference_iterable.hpp"
#include "storage/lz4/lz4_iterable.hpp"
#include "storage/run_length_segment/run_length_segment_iterable.hpp"
#include "storage/segment_iterables/any_segment_iterable.hpp"
#include "storage/value_segment/value_segment_iterable.hpp"
@@ -72,6 +73,16 @@ auto create_iterable_from_segment(const FrameOfReferenceSegment<T>& segment) {
}
}

template <typename T, bool EraseSegmentType = true>
auto create_iterable_from_segment(const LZ4Segment<T>& segment) {
// We always erase the type here because LZ4 is too slow to notice a difference anyway.
if constexpr (EraseSegmentType) {
return create_any_segment_iterable<T>(segment);
} else {
return LZ4Iterable<T>{segment};
}
}

/**
* This function must be forward-declared because ReferenceSegmentIterable
* includes this file leading to a circular dependency
10 changes: 6 additions & 4 deletions src/lib/storage/encoding_type.hpp
@@ -17,11 +17,12 @@ namespace opossum {

namespace hana = boost::hana;

enum class EncodingType : uint8_t { Unencoded, Dictionary, RunLength, FixedStringDictionary, FrameOfReference };
enum class EncodingType : uint8_t { Unencoded, Dictionary, RunLength, FixedStringDictionary, FrameOfReference, LZ4 };

inline static std::vector<EncodingType> encoding_type_enum_values{
EncodingType::Unencoded, EncodingType::Dictionary, EncodingType::RunLength, EncodingType::FixedStringDictionary,
EncodingType::FrameOfReference};
EncodingType::Unencoded, EncodingType::Dictionary,
EncodingType::RunLength, EncodingType::FixedStringDictionary,
EncodingType::FrameOfReference, EncodingType::LZ4};

/**
* @brief Maps each encoding type to its supported data types
@@ -36,7 +37,8 @@ constexpr auto supported_data_types_for_encoding_type = hana::make_map(
hana::make_pair(enum_c<EncodingType, EncodingType::Dictionary>, data_types),
hana::make_pair(enum_c<EncodingType, EncodingType::RunLength>, data_types),
hana::make_pair(enum_c<EncodingType, EncodingType::FixedStringDictionary>, hana::tuple_t<pmr_string>),
hana::make_pair(enum_c<EncodingType, EncodingType::FrameOfReference>, hana::tuple_t<int32_t, int64_t>));
hana::make_pair(enum_c<EncodingType, EncodingType::FrameOfReference>, hana::tuple_t<int32_t, int64_t>),
hana::make_pair(enum_c<EncodingType, EncodingType::LZ4>, data_types));

/**
* @return an integral constant implicitly convertible to bool
170 changes: 170 additions & 0 deletions src/lib/storage/lz4/lz4_encoder.hpp
@@ -0,0 +1,170 @@
#pragma once

#include <lz4hc.h>

#include <algorithm>
#include <array>
#include <limits>
#include <memory>
#include <string>

#include "storage/base_segment_encoder.hpp"
#include "storage/lz4_segment.hpp"
#include "storage/value_segment.hpp"
#include "storage/value_segment/value_segment_iterable.hpp"
#include "storage/vector_compression/vector_compression.hpp"
#include "types.hpp"
#include "utils/assert.hpp"
#include "utils/enum_constant.hpp"

namespace opossum {

class LZ4Encoder : public SegmentEncoder<LZ4Encoder> {
public:
static constexpr auto _encoding_type = enum_c<EncodingType, EncodingType::LZ4>;
static constexpr auto _uses_vector_compression = false;

template <typename T>
std::shared_ptr<BaseEncodedSegment> _on_encode(const std::shared_ptr<const ValueSegment<T>>& value_segment) {
const auto alloc = value_segment->values().get_allocator();
const auto num_elements = value_segment->size();

// TODO(anyone): when value segments switch to using pmr_vectors, the data can be copied directly instead of
// copying it element by element
auto values = pmr_vector<T>{alloc};
values.resize(num_elements);
auto null_values = pmr_vector<bool>{alloc};
null_values.resize(num_elements);

// copy values and null flags from value segment
auto iterable = ValueSegmentIterable<T>{*value_segment};
iterable.with_iterators([&](auto it, auto end) {
// iterate over the segment to access the values and increment the row index to write to the values and null
// values vectors
for (size_t row_index = 0u; it != end; ++it, ++row_index) {
auto segment_value = *it;
values[row_index] = segment_value.value();
null_values[row_index] = segment_value.is_null();
}
});

/**
* Use the LZ4 high compression API to compress the copied values. As C-library LZ4 needs raw pointers as input
* and output. To avoid directly handling raw pointers we use std::vectors as input and output. The input vector
* contains the data that needs to be compressed and the output vector is allocated enough memory to contain
* the compression result. Via the .data() call we can supply LZ4 with raw pointers to the memory the vectors use.
* These are cast to char-pointers since LZ4 expects char pointers.
*/
DebugAssert(values.size() * sizeof(T) <= std::numeric_limits<int>::max(),
"Input of LZ4 encoder contains too many bytes to fit into a 32-bit signed integer sized vector that is"
" used by the LZ4 library.");

const auto input_size = values.size() * sizeof(T);
// estimate the (maximum) output size
const auto output_size = LZ4_compressBound(static_cast<int>(input_size));
auto compressed_data = pmr_vector<char>{alloc};
compressed_data.resize(static_cast<size_t>(output_size));
const int compression_result = LZ4_compress_HC(reinterpret_cast<char*>(values.data()), compressed_data.data(),
static_cast<int>(input_size), output_size, LZ4HC_CLEVEL_MAX);
Assert(compression_result > 0, "LZ4 compression failed");

// shrink the vector to the actual size of the compressed result
compressed_data.resize(static_cast<size_t>(compression_result));
compressed_data.shrink_to_fit();

return std::allocate_shared<LZ4Segment<T>>(alloc, std::move(compressed_data), std::move(null_values), input_size);
}

std::shared_ptr<BaseEncodedSegment> _on_encode(const std::shared_ptr<const ValueSegment<pmr_string>>& value_segment) {
const auto alloc = value_segment->values().get_allocator();
const auto num_elements = value_segment->size();

/**
* First iterate over the values for two reasons.
* 1) If all the strings are empty LZ4 will try to compress an empty vector which will cause a segmentation fault.
* In this case we can and need to do an early exit.
* 2) Sum the length of the strings to improve the performance when copying the data to the char vector.
*/
size_t num_chars = 0u;
ValueSegmentIterable<pmr_string>{*value_segment}.with_iterators([&](auto it, auto end) {
for (size_t row_index = 0; it != end; ++it, ++row_index) {
if (!it->is_null()) {
num_chars += it->value().size();
}
}
});

// copy values and null flags from value segment
auto values = pmr_vector<char>{alloc};
values.reserve(num_chars);
auto null_values = pmr_vector<bool>{alloc};
null_values.resize(num_elements);

/**
* These offsets mark the beginning of strings (and therefore end of the previous string) in the data vector.
* These offsets are character offsets. The string at position 0 starts at the offset stored at position 0, which
* will always be 0.
* Its exclusive end is the offset stored at position 1 (i.e. offsets[1] - 1 is the last character of the string
* at position 0).
* In case of the last string its end is determined by the end of the data vector.
*/
auto offsets = pmr_vector<size_t>{alloc};
offsets.resize(num_elements);

auto iterable = ValueSegmentIterable<pmr_string>{*value_segment};
iterable.with_iterators([&](auto it, auto end) {
size_t offset = 0u;
bool is_null;
// iterate over the iterator to access the values and increment the row index to write to the values and null
// values vectors
for (size_t row_index = 0; it != end; ++it, ++row_index) {
auto segment_value = *it;
is_null = segment_value.is_null();
null_values[row_index] = is_null;
offsets[row_index] = offset;
if (!is_null) {
auto data = segment_value.value();
values.insert(values.cend(), data.begin(), data.end());
offset += data.size();
}
}
});

/**
* If the input only contained null values and/or empty strings we don't need to compress anything (and LZ4 will
* cause an error). Therefore we can return the encoded segment already.
*/
if (!num_chars) {
return std::allocate_shared<LZ4Segment<pmr_string>>(alloc, pmr_vector<char>{alloc}, std::move(null_values),
std::move(offsets), 0u);
}

/**
* Use the LZ4 high compression API to compress the copied values. As C-library LZ4 needs raw pointers as input
* and output. To avoid directly handling raw pointers we use std::vectors as input and output. The input vector
* contains the data that needs to be compressed and the output vector is allocated enough memory to contain
* the compression result. Via the .data() call we can supply LZ4 with raw pointers to the memory the vectors use.
* These are cast to char-pointers since LZ4 expects char pointers.
*/
DebugAssert(values.size() <= std::numeric_limits<int>::max(),
"String input of LZ4 encoder contains too many characters to fit into a 32-bit signed integer sized "
"vector that is used by the LZ4 library.");
const auto input_size = values.size();
// estimate the (maximum) output size
const auto output_size = LZ4_compressBound(static_cast<int>(input_size));
auto compressed_data = pmr_vector<char>{alloc};
compressed_data.resize(static_cast<size_t>(output_size));
const int compression_result = LZ4_compress_HC(values.data(), compressed_data.data(), static_cast<int>(input_size),
output_size, LZ4HC_CLEVEL_MAX);
Assert(compression_result > 0, "LZ4 compression failed");

// shrink the vector to the actual size of the compressed result
compressed_data.resize(static_cast<size_t>(compression_result));
compressed_data.shrink_to_fit();

return std::allocate_shared<LZ4Segment<pmr_string>>(alloc, std::move(compressed_data), std::move(null_values),
std::move(offsets), input_size);
}
};

} // namespace opossum