From d6f5343595733ed30272ef6cf9eed2fe0b673729 Mon Sep 17 00:00:00 2001
From: Jan Ehmueller
Date: Fri, 29 Mar 2019 19:14:49 +0100
Subject: [PATCH] [DYOD18/19] Add LZ4 Point Access (also fixes #1522 and fixes #1516) (#1521)

* fix test
* handle empty strings in decompression
* format
* add single char test case
* fix linter
* fix indices and add more expects
* fix offset
* fix test
* move third party includes
* Add comment
* remove std::move without effect
* Remove const references and use std::move
* add advance and distance to sequential iterator
* string lz4segment point access
* inlining
* universal reference
* make offsets optional
* maybe fix test
* add second constructor
* fix copy
* fix remaining tests
* format
* remove const
* use pmr_string instead of std::string
* english
* format
* merge lz4 encoder
* merge lz4 segment
* merge lz4 iterators
* comment out string code
* Remove malicious semicolon
* fix compile errors
* make constant constexpression
* remove std::string
* rename offset function in tests
* add point access string decompression
* add string segment decompression
* debug
* typo
* possible fix
* debug
* handle string decompression edge case
* debug
* fix empty last block
* remove debug output
* fix indent lint errors and make debugassert an assert
* format
* add segment docstrings
* add string segment test for all segments
* more comments
* fix row count calls
* fix test case class
* fix class name
* fix uint
* fix
* add debug
* debug
* fix multi block string
* more debug
* more debug
* fix block index access
* remove debug
* more debug
* try different decompress method
* try larger input block size
* assert for max block size
* Fix string decode error
* remove old method call
* fix if clause
* add empty non string segment test
* remove code in empty loop
* Add extra test case
* multi block tests
* fix index in test
* proper string dictionary learning
* debug
* debug
* debug
* debug
* debug
* debug
* debug
* debug
* debug
* debug
* debug
* debug
* debug
* debug
* debug
* skip empty int segment test
* debug
* debug
* debug
* debug
* debug
* debug
* debug
* debug
* debug
* debug
* debug
* debug
* debug
* debug
* debug
* debug
* debug
* debug
* debug
* debug
* debug
* debug
* debug
* debug
* debug
* debug
* debug
* debug
* debug
* debug
* try to increase small dictionariess otherwise nullopt
* debug
* debug
* debug
* a little refactoring
* dictionary decompression method
* better lz4 use
* add dictionary abort for small value vectors
* single char test
* return empty dict on error
* finish single char test
* remove duplicate test case
* debug
* introduce second type param to decompress method
* better zero one test
* Revert "introduce second type param to decompress method"
  This reverts commit 338e19c6b9b26b949b4ac3704b72b47cf3437d61.
* docstring
* refactor dictionary gen
* generate dict with more data
* docstring
* Revert "debug"
  This reverts commit 1dfb74d5e0faf7a947ff69767504e6c5dc537ccd.
* Revert "debug"
  This reverts commit b38ece91eb9d35668983fba0f3b534c289f6d237.
* Revert "debug"
  This reverts commit 0150fe75db6b04d0ff00626f08f6f53b368c85e9.
* Revert "try to increase small dictionariess otherwise nullopt"
  This reverts commit 069b5cf5a08c1ca86780d2c9c28de8ed12799006.
* fix #1516: null vector size in value segment
* lz4 estimate memory usage
* refactor dictionary generation and docstring
* move compress
* calculate metadata
* remove const
* remove unused variables
* refactor and remove duplicate code
* point access docstring
* format
* linter
* fix docstring
* more docstring
* Skip only failing test instances
* Fix LZ4 and RunLength encoding for empty Segments
* fullci
* add simple caching
* fix ternary operator
* docstring
* add nolint for std::pair unzip in variable assignment
* remove random data from string segment test
* better comment for test skipping
* caching with char vector
* fix method signature
* string caching
* wrap caching method for simple string decompression
* move wrapper methods below caching implementations
* more code deduplication
* format
* generate -> train
* typos & ternary operator to std::max
* more typo fixing
* refactor & typos
* refactor encoding empty segment test
* re-add empty loop for empty segment test
* remove duplicate empty segment test in encoded string segment test
* don't store offsets in empty string segment
* fix typo
* fix typo
* fix simdbp128 on empty segments
* comment
* size_t initialization
* remove test skipping
* Use constant for number of bits in a byte
* remove this in tests
* remove dictionary padding
* refactor lz4 iterable
* change size_t construction
* fix typo and implicit bool
* change pair constructor
* update dependencies.md
* rename previous block to cached block
* more size_t construction
* remove random null values
* fix shrinking comment
* remove repeated comment in constructor
* fix name shadowing
* improve dictionary training comment
* add general comment explaining zstd dictionary to encoder
* add comment explaining string use_caching variable
* add comment for skipping of dictionary
* make bool usage explicit
* use lz4segment::size instead of null_values.size
* use simdbp128 vector compression for string offsets
* format
* rename vector_decompressor to offset_decompressor
* clarified decompression comment
* add comment explaining block size
* only store null values vector when there are null values
* Use proper vector compression interface
* refactor lz4 test
* comments and extra test case
* refactor optional access to use value()
* format
* reset lz4 stream decoder (fix pointer overflow)
* fix another comment
* fix
* add const
* NULL to nullptr
* change debugassert to assert
* null values in iterator
* optional code style
* format
* more code style
* multi block string test
* more code style
---
 .gitmodules | 3 +
 DEPENDENCIES.md | 2 +
 src/CMakeLists.txt | 1 +
 src/lib/CMakeLists.txt | 1 +
 src/lib/storage/lz4/lz4_encoder.hpp | 361 +++++++++++++---
 src/lib/storage/lz4/lz4_iterable.hpp | 40 +-
 src/lib/storage/lz4_segment.cpp | 404 +++++++++++++++---
 src/lib/storage/lz4_segment.hpp | 177 +++++++-
 .../run_length_segment/run_length_encoder.hpp | 5 +
 src/lib/storage/value_segment.cpp | 10 +-
 .../simd_bp128/simd_bp128_iterator.cpp | 2 +-
 src/test/CMakeLists.txt | 1 +
 src/test/storage/encoded_segment_test.cpp | 64 ++-
 .../storage/encoded_string_segment_test.cpp | 297 +++++++++++++
 src/test/storage/encoding_test.hpp | 3 +-
 src/test/storage/lz4_segment_test.cpp | 226 +++++++---
 src/test/storage/simd_bp128_test.cpp | 15 +-
 third_party/CMakeLists.txt | 38 ++
 third_party/zstd | 1 +
 19 files changed, 1428 insertions(+), 223 deletions(-)
 create mode 100644 src/test/storage/encoded_string_segment_test.cpp
 create mode 160000 third_party/zstd

diff --git a/.gitmodules b/.gitmodules
index 0dc6fd60b2..c70e4f1ea4 100644
--- a/.gitmodules
+++ b/.gitmodules
@@ -31,6 +31,9 @@
 [submodule "third_party/join-order-benchmark"]
 	path = third_party/join-order-benchmark
 	url = https://github.com/gregrahn/join-order-benchmark.git
+[submodule "third_party/zstd"]
+	path = third_party/zstd
+	url = https://github.com/facebook/zstd.git
 [submodule "third_party/jemalloc"]
 	path = third_party/jemalloc
 	url = https://github.com/jemalloc/jemalloc.git
diff --git
a/DEPENDENCIES.md b/DEPENDENCIES.md
index 0de48355c5..0ccff4ff11 100644
--- a/DEPENDENCIES.md
+++ b/DEPENDENCIES.md
@@ -33,8 +33,10 @@
 - cxxopts (https://github.com/jarro2783/cxxopts.git)
 - googletest (https://github.com/google/googletest)
 - libpqxx (https://github.com/jtv/libpqxx)
+- lz4 (https://github.com/lz4/lz4)
 - sql-parser (https://github.com/hyrise/sql-parser)
 - pgasus (https://github.com/kateyy/pgasus)
 - cpp-btree (https://github.com/algorithm-ninja/cpp-btree)
 - cqf (https://github.com/ArneMayer/cqf)
 - jemalloc (https://github.com/jemalloc/jemalloc)
+- zstd (https://github.com/facebook/zstd)
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index 1a839b0dcd..62416fd36a 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -87,6 +87,7 @@ include_directories(
     ${PROJECT_SOURCE_DIR}/third_party/flat_hash_map
     ${PROJECT_SOURCE_DIR}/third_party/json
     ${PROJECT_SOURCE_DIR}/third_party/lz4
+    ${PROJECT_SOURCE_DIR}/third_party/zstd
 )

 if (${ENABLE_JIT_SUPPORT})
diff --git a/src/lib/CMakeLists.txt b/src/lib/CMakeLists.txt
index 5c2a3f2470..ede491ffcb 100644
--- a/src/lib/CMakeLists.txt
+++ b/src/lib/CMakeLists.txt
@@ -592,6 +592,7 @@ set(
     cqf
     uninitialized_vector
     lz4
+    zstd
     custom_jemalloc
     ${FILESYSTEM_LIBRARY}
     ${Boost_CONTAINER_LIBRARY}
diff --git a/src/lib/storage/lz4/lz4_encoder.hpp b/src/lib/storage/lz4/lz4_encoder.hpp
index 069e72880f..a9573b1a14 100644
--- a/src/lib/storage/lz4/lz4_encoder.hpp
+++ b/src/lib/storage/lz4/lz4_encoder.hpp
@@ -1,5 +1,6 @@
 #pragma once

+#include
 #include
 #include

@@ -12,6 +13,8 @@
 #include "storage/lz4_segment.hpp"
 #include "storage/value_segment.hpp"
 #include "storage/value_segment/value_segment_iterable.hpp"
+#include "storage/vector_compression/simd_bp128/simd_bp128_compressor.hpp"
+#include "storage/vector_compression/simd_bp128/simd_bp128_vector.hpp"
 #include "storage/vector_compression/vector_compression.hpp"
 #include "types.hpp"
 #include "utils/assert.hpp"
@@ -19,10 +22,45 @@

 namespace opossum {

+/**
+ * This encoder
compresses a value segment with the LZ4 library. LZ4 offers two different modes: block and stream
+ * compression.
+ *
+ * Block compression compresses the segment into one large blob. To access any value of the segment, the
+ * whole blob has to be decompressed first. This causes a large overhead for random access into the segment.
+ *
+ * Stream compression is made for compressing large files as a stream of blocks, while still taking advantage of
+ * information redundancy in the whole file - single-block compression can't do that and thus cannot compress with
+ * ratios that are as good. LZ4 builds up a dictionary internally that is extended with every block that is
+ * compressed. As a consequence, those blocks can only be decompressed in the same order in which they were
+ * compressed, making stream compression useless for our point access case, where we only want to decompress the one
+ * block containing the requested element (or several blocks for strings larger than one block size).
+ * To circumvent that, we can use a pre-trained dictionary: instead of building up a dictionary internally, it is
+ * already provided for compression and decompression. That makes it possible to decompress a block independently.
+ * In the case that the input data fits into a single block, this block is compressed without training a dictionary.
+ */
 class LZ4Encoder : public SegmentEncoder {
  public:
  static constexpr auto _encoding_type = enum_c;
-  static constexpr auto _uses_vector_compression = false;
+  static constexpr auto _uses_vector_compression = true;
+  /**
+   * A block size of 16 KB was chosen, since the recommended minimal amount of data to train a zstd dictionary is
+   * around 20 KB. Therefore, there is no point in trying to train a dictionary with less data than that (and the
+   * training sometimes fails in the edge case of two blocks in the segment whose total amount of data is still less
+   * than 20 KB).
+   * With only one block in a segment there is no need for a zstd dictionary. With multiple blocks but no dictionary
+   * (due to not enough data) the compression ratio suffers, since LZ4 can only view and compress small amounts of
+   * data at once (refer to the comment above).
+   */
+  static constexpr auto _block_size = size_t{16384u};
+  static_assert(_block_size <= std::numeric_limits::max(),
+                "LZ4 block size can't be larger than the maximum value of a 32 bit signed int");

  template
  std::shared_ptr _on_encode(const std::shared_ptr>& value_segment) {
@@ -35,44 +73,58 @@ class LZ4Encoder : public SegmentEncoder {
     values.resize(num_elements);

     auto null_values = pmr_vector{alloc};
     null_values.resize(num_elements);
+    /**
+     * If the null value vector only contains the value false, then the value segment does not have any row value
+     * that is null. In that case, we don't store the null value vector to reduce the LZ4 segment's memory footprint.
+ */ + auto contains_null_value = false; // copy values and null flags from value segment auto iterable = ValueSegmentIterable{*value_segment}; iterable.with_iterators([&](auto it, auto end) { // iterate over the segment to access the values and increment the row index to write to the values and null // values vectors - for (size_t row_index = 0u; it != end; ++it, ++row_index) { + for (auto row_index = size_t{0u}; it != end; ++it, ++row_index) { auto segment_value = *it; values[row_index] = segment_value.value(); null_values[row_index] = segment_value.is_null(); + contains_null_value = contains_null_value || segment_value.is_null(); } }); + auto optional_null_values = contains_null_value ? std::optional>{null_values} : std::nullopt; + /** - * Use the LZ4 high compression API to compress the copied values. As C-library LZ4 needs raw pointers as input - * and output. To avoid directly handling raw pointers we use std::vectors as input and output. The input vector - * contains the data that needs to be compressed and the output vector is allocated enough memory to contain - * the compression result. Via the .data() call we can supply LZ4 with raw pointers to the memory the vectors use. - * These are cast to char-pointers since LZ4 expects char pointers. + * Pre-compute a zstd dictionary if the input data is split among multiple blocks. This dictionary allows + * independent compression of the blocks, while maintaining a good compression ratio. + * If the input data fits into a single block, training of a dictionary is skipped. 
*/ - DebugAssert(values.size() * sizeof(T) <= std::numeric_limits::max(), - "Input of LZ4 encoder contains too many bytes to fit into a 32-bit signed integer sized vector that is" - " used by the LZ4 library."); - const auto input_size = values.size() * sizeof(T); - // estimate the (maximum) output size - const auto output_size = LZ4_compressBound(static_cast(input_size)); - auto compressed_data = pmr_vector{alloc}; - compressed_data.resize(static_cast(output_size)); - const int compression_result = LZ4_compress_HC(reinterpret_cast(values.data()), compressed_data.data(), - static_cast(input_size), output_size, LZ4HC_CLEVEL_MAX); - Assert(compression_result > 0, "LZ4 compression failed"); - - // shrink the vector to the actual size of the compressed result - compressed_data.resize(static_cast(compression_result)); - compressed_data.shrink_to_fit(); - - return std::allocate_shared>(alloc, std::move(compressed_data), std::move(null_values), input_size); + auto dictionary = pmr_vector{}; + if (input_size > _block_size) { + dictionary = _train_dictionary(values); + } + + /** + * Compress the data and calculate the last block size (which may vary from the block size of the previous blocks) + * and the total compressed size. The size of the last block is needed for decompression. The total compressed + * size is pre-calculated instead of iterating over all blocks when the memory consumption of the LZ4 segment is + * estimated. + */ + auto lz4_blocks = pmr_vector>{alloc}; + auto total_compressed_size = size_t{0u}; + auto last_block_size = size_t{0u}; + if (!values.empty()) { + _compress(values, lz4_blocks, dictionary); + last_block_size = input_size % _block_size != 0 ? 
input_size % _block_size : _block_size; + for (const auto& compressed_block : lz4_blocks) { + total_compressed_size += compressed_block.size(); + } + } + + return std::allocate_shared>(alloc, std::move(lz4_blocks), std::move(optional_null_values), + std::move(dictionary), _block_size, last_block_size, + total_compressed_size, num_elements); } std::shared_ptr _on_encode(const std::shared_ptr>& value_segment) { @@ -82,10 +134,10 @@ class LZ4Encoder : public SegmentEncoder { /** * First iterate over the values for two reasons. * 1) If all the strings are empty LZ4 will try to compress an empty vector which will cause a segmentation fault. - * In this case we can and need to do an early exit. + * In this case we can and need to do an early exit. * 2) Sum the length of the strings to improve the performance when copying the data to the char vector. */ - size_t num_chars = 0u; + auto num_chars = size_t{0u}; ValueSegmentIterable{*value_segment}.with_iterators([&](auto it, auto end) { for (size_t row_index = 0; it != end; ++it, ++row_index) { if (!it->is_null()) { @@ -99,21 +151,35 @@ class LZ4Encoder : public SegmentEncoder { values.reserve(num_chars); auto null_values = pmr_vector{alloc}; null_values.resize(num_elements); + /** + * If the null value vector only contains the value false, then the value segment does not have any row value that + * is null. In that case, we don't store the null value vector to reduce the LZ4 segment's memory footprint. + */ + auto contains_null_value = false; /** * These offsets mark the beginning of strings (and therefore end of the previous string) in the data vector. * These offsets are character offsets. The string at position 0 starts at the offset stored at position 0, which * will always be 0. - * Its exclusive end is the offset stored at position 1 (i.e. 
offsets[1] - 1 is the last character of the string + * Its exclusive end is the offset stored at position 1 (i.e., offsets[1] - 1 is the last character of the string * at position 0). * In case of the last string its end is determined by the end of the data vector. + * + * The offsets are stored as 32 bit unsigned integer as opposed to 64 bit (size_t) so that they can later be + * compressed via vector compression. */ - auto offsets = pmr_vector{alloc}; + auto offsets = pmr_vector{alloc}; offsets.resize(num_elements); + /** + * These are the lengths of each string. They are needed to train the zstd dictionary. + */ + auto sample_sizes = pmr_vector{alloc}; + sample_sizes.resize(num_elements); + auto iterable = ValueSegmentIterable{*value_segment}; iterable.with_iterators([&](auto it, auto end) { - size_t offset = 0u; + auto offset = uint32_t{0u}; bool is_null; // iterate over the iterator to access the values and increment the row index to write to the values and null // values vectors @@ -121,49 +187,232 @@ class LZ4Encoder : public SegmentEncoder { auto segment_value = *it; is_null = segment_value.is_null(); null_values[row_index] = is_null; + contains_null_value = contains_null_value || is_null; offsets[row_index] = offset; + auto sample_size = size_t{0u}; if (!is_null) { auto data = segment_value.value(); values.insert(values.cend(), data.begin(), data.end()); - offset += data.size(); + Assert(data.size() <= std::numeric_limits::max(), + "The size of string row value exceeds the maximum of uint32 in LZ4 encoding."); + offset += static_cast(data.size()); + sample_size = data.size(); } + sample_sizes[row_index] = sample_size; } }); + auto optional_null_values = contains_null_value ? std::optional>{null_values} : std::nullopt; + /** * If the input only contained null values and/or empty strings we don't need to compress anything (and LZ4 will - * cause an error). Therefore we can return the encoded segment already. + * cause an error). 
We can also throw away the offsets, since they won't be used for decompression. + * We can do an early exit and return the (not encoded) segment. */ - if (!num_chars) { - return std::allocate_shared>(alloc, pmr_vector{alloc}, std::move(null_values), - std::move(offsets), 0u); + if (num_chars == 0) { + auto empty_blocks = pmr_vector>{alloc}; + auto empty_dictionary = pmr_vector{}; + return std::allocate_shared>(alloc, std::move(empty_blocks), + std::move(optional_null_values), std::move(empty_dictionary), + nullptr, _block_size, 0u, 0u, num_elements); } + // Compress the offsets with a vector compression method to reduce the memory footprint of the LZ4 segment. + auto compressed_offsets = compress_vector(offsets, vector_compression_type(), alloc, {offsets.back()}); + /** - * Use the LZ4 high compression API to compress the copied values. As C-library LZ4 needs raw pointers as input - * and output. To avoid directly handling raw pointers we use std::vectors as input and output. The input vector - * contains the data that needs to be compressed and the output vector is allocated enough memory to contain - * the compression result. Via the .data() call we can supply LZ4 with raw pointers to the memory the vectors use. - * These are cast to char-pointers since LZ4 expects char pointers. + * Pre-compute a zstd dictionary if the input data is split among multiple blocks. This dictionary allows + * independent compression of the blocks, while maintaining a good compression ratio. + * If the input data fits into a single block, training of a dictionary is skipped. 
*/ - DebugAssert(values.size() <= std::numeric_limits::max(), - "String input of LZ4 encoder contains too many characters to fit into a 32-bit signed integer sized " - "vector that is used by the LZ4 library."); const auto input_size = values.size(); - // estimate the (maximum) output size - const auto output_size = LZ4_compressBound(static_cast(input_size)); - auto compressed_data = pmr_vector{alloc}; - compressed_data.resize(static_cast(output_size)); - const int compression_result = LZ4_compress_HC(values.data(), compressed_data.data(), static_cast(input_size), - output_size, LZ4HC_CLEVEL_MAX); - Assert(compression_result > 0, "LZ4 compression failed"); - - // shrink the vector to the actual size of the compressed result - compressed_data.resize(static_cast(compression_result)); - compressed_data.shrink_to_fit(); - - return std::allocate_shared>(alloc, std::move(compressed_data), std::move(null_values), - std::move(offsets), input_size); + auto dictionary = pmr_vector{alloc}; + if (input_size > _block_size) { + dictionary = _train_dictionary(values, sample_sizes); + } + + /** + * Compress the data and calculate the last block size (which may vary from the block size of the previous blocks) + * and the total compressed size. The size of the last block is needed for decompression. The total compressed size + * is pre-calculated instead of iterating over all blocks when the memory consumption of the LZ4 segment is + * estimated. + */ + auto lz4_blocks = pmr_vector>{alloc}; + _compress(values, lz4_blocks, dictionary); + + auto last_block_size = input_size % _block_size != 0 ? 
input_size % _block_size : _block_size; + + auto total_compressed_size = size_t{0u}; + for (const auto& compressed_block : lz4_blocks) { + total_compressed_size += compressed_block.size(); + } + + return std::allocate_shared>( + alloc, std::move(lz4_blocks), std::move(optional_null_values), std::move(dictionary), + std::move(compressed_offsets), _block_size, last_block_size, total_compressed_size, num_elements); + } + + private: + static constexpr auto _minimum_dictionary_size = size_t{1000u}; + static constexpr auto _minimum_value_size = size_t{20000u}; + + /** + * Use the LZ4 high compression stream API to compress the input values. The data is separated into different + * blocks that are compressed independently. To maintain a high compression ratio and independence of these blocks + * we use the dictionary trained via zstd. LZ4 can use the dictionary "learned" on the column to compress the data + * in blocks independently while maintaining a good compression ratio. + * + * The C-library LZ4 needs raw pointers as input and output. To avoid directly handling raw pointers, we use + * std::vectors as input and output. The input vector contains the block that needs to be compressed and the + * output vector has allocated enough memory to contain the compression result. Via the .data() call we can supply + * LZ4 with raw pointers to the memory the vectors use. These are cast to char-pointers since LZ4 expects char + * pointers. + * + * @tparam T The type of the input data. In the case of non-string-segments, this is the segment type. In the case of + * string-segments, this will be char. + * @param values The values that are compressed. + * @param lz4_blocks The vector to which the generated LZ4 blocks are appended. + * @param dictionary The dictionary trained via zstd. If this dictionary is empty, the blocks are still compressed + * independently but the compression ratio might suffer. 
+ */ + template + void _compress(pmr_vector& values, pmr_vector>& lz4_blocks, const pmr_vector& dictionary) { + /** + * Here begins the LZ4 compression. The library provides a function to create a stream. The stream is used with + * every new block that is to be compressed, but the stream returns a raw pointer to an internal structure. + * The stream memory is freed with another call to a library function after compression is done. + */ + auto lz4_stream = LZ4_createStreamHC(); + // We use the maximum high compression level available in LZ4 for best compression ratios. + LZ4_resetStreamHC(lz4_stream, LZ4HC_CLEVEL_MAX); + + const auto input_size = values.size() * sizeof(T); + auto num_blocks = input_size / _block_size; + // Only add the last not-full block if the data doesn't perfectly fit into the block size. + if (input_size % _block_size != 0) { + num_blocks++; + } + lz4_blocks.reserve(num_blocks); + + for (auto block_index = size_t{0u}; block_index < num_blocks; ++block_index) { + auto decompressed_block_size = _block_size; + // The last block's uncompressed size varies. + if (block_index + 1 == num_blocks) { + decompressed_block_size = input_size - (block_index * _block_size); + } + // LZ4_compressBound returns an upper bound for the size of the compressed data + const auto block_bound = static_cast(LZ4_compressBound(static_cast(decompressed_block_size))); + auto compressed_block = pmr_vector{values.get_allocator()}; + compressed_block.resize(block_bound); + + /** + * If we previously learned a dictionary, we use it to initialize LZ4. Otherwise LZ4 uses the previously + * compressed block instead, which would cause the blocks to depend on one another. + * If we have no dictionary present and compress at least a second block (i.e., block_index > 0), then we reset + * the LZ4 stream to maintain the independence of the blocks. 
This only happens when the column does not contain + * enough data to produce a zstd dictionary (i.e., a column of single character strings). + */ + if (!dictionary.empty()) { + LZ4_loadDictHC(lz4_stream, dictionary.data(), static_cast(dictionary.size())); + } else if (block_index) { + LZ4_resetStreamHC(lz4_stream, LZ4HC_CLEVEL_MAX); + } + + // The offset in the source data where the current block starts. + const auto value_offset = block_index * _block_size; + // move pointer to start position and pass to the actual compression method + const int compression_result = LZ4_compress_HC_continue( + lz4_stream, reinterpret_cast(values.data()) + value_offset, compressed_block.data(), + static_cast(decompressed_block_size), static_cast(block_bound)); + + Assert(compression_result > 0, "LZ4 stream compression failed"); + + // shrink the block vector to the actual size of the compressed result + compressed_block.resize(static_cast(compression_result)); + compressed_block.shrink_to_fit(); + + lz4_blocks.emplace_back(std::move(compressed_block)); + } + + // Finally, release the LZ4 stream memory. + LZ4_freeStreamHC(lz4_stream); + } + + /** + * Train a zstd dictionary. This dictionary is used to compress different lz4 blocks independently, while maintaining + * a high compression ratio. The independent compression of the blocks is necessary to decompress them independently, + * which allows for efficient random access. + * + * This method should be called for non-string data, since each value has the same size. The zstd dictionary is + * originally intended for string data, in which each sample can have a different size. + * Non-string data types have a constant size and can be smaller than the minimum sample size of 8 bytes. Therefore, + * the sample sizes are the same for each sample and if the size of the data type is less than the minimum sample + * size, multiple values are a single sample (e.g., 2 values for 32 bit integers). 
+   *
+   * @tparam T The data type of the value segment. This method should only be called for non-string-segments.
+   * @param values All values of the segment. They are the input data to train the dictionary.
+   * @return The trained dictionary, or in the case of failure, an empty vector.
+   */
+  template
+  pmr_vector _train_dictionary(const pmr_vector& values) {
+    const auto min_sample_size = size_t{8u};
+    const auto values_size = values.size() * sizeof(T);
+    const auto sample_size = std::max(sizeof(T), min_sample_size);
+    const auto num_samples = values_size / sample_size;
+    const auto sample_sizes = pmr_vector(num_samples, sample_size);
+
+    return _train_dictionary(values, sample_sizes);
+  }
+
+  /**
+   * Train a zstd dictionary. In the case of non-string data, the sample sizes are all the same. In the case of
+   * string data, each row element is a sample (with differing sizes).
+   * If the dictionary training fails, a dictionary can't be used for compression. Since the blocks have to be
+   * compressed independently for independent access, the compression ratio will suffer.
+   *
+   * @tparam T The data type of the value segment.
+   * @param values The input data that will be compressed (i.e., all rows concatenated).
+   * @param sample_sizes A vector of sample lengths. In the case of strings, each length corresponds to a substring
+   *                     in the values vector. These should correspond to the length of each row's value.
+   *                     In the case of non-strings these will all have the same value and correspond to byte
+   *                     blocks, possibly containing more than one row value.
+   * @return The trained dictionary, or in the case of failure, an empty vector.
+   */
+  template
+  pmr_vector _train_dictionary(const pmr_vector& values, const pmr_vector& sample_sizes) {
+    /**
+     * The recommended dictionary size is about 1/100th of the size of all samples combined, but the size also has
+     * to be at least 1 KB. Smaller dictionaries won't work.
+ */ + auto max_dictionary_size = values.size() / 100; + max_dictionary_size = std::max(max_dictionary_size, _minimum_dictionary_size); + + auto dictionary = pmr_vector{values.get_allocator()}; + size_t dictionary_size; + + // If the input does not contain enough values, it won't be possible to train a dictionary for it. + if (values.size() < _minimum_value_size) { + return dictionary; + } + + dictionary.resize(max_dictionary_size); + dictionary_size = ZDICT_trainFromBuffer(dictionary.data(), max_dictionary_size, values.data(), sample_sizes.data(), + static_cast(sample_sizes.size())); + + // If the generation failed, then compress without a dictionary (the compression ratio will suffer). + if (ZDICT_isError(dictionary_size)) { + return pmr_vector{}; + } + + DebugAssert(dictionary_size <= max_dictionary_size, + "Generated ZSTD dictionary in LZ4 compression is larger than " + "the memory allocated for it."); + + // Shrink the allocated dictionary size to the actual size. + dictionary.resize(dictionary_size); + dictionary.shrink_to_fit(); + + return dictionary; } }; diff --git a/src/lib/storage/lz4/lz4_iterable.hpp b/src/lib/storage/lz4/lz4_iterable.hpp index a3076c0432..075a7c2d52 100644 --- a/src/lib/storage/lz4/lz4_iterable.hpp +++ b/src/lib/storage/lz4/lz4_iterable.hpp @@ -22,24 +22,40 @@ class LZ4Iterable : public PointAccessibleSegmentIterable> { auto decompressed_segment = _segment.decompress(); - auto begin = Iterator{decompressed_segment.cbegin(), _segment.null_values().cbegin()}; - auto end = Iterator{decompressed_segment.cend(), _segment.null_values().cend()}; + /** + * If the null value vector doesn't exist, then the segment does not have any row value that is null. In that case, + * we can just use a default initialized boolean vector. + */ + const auto null_values = _segment.null_values() ? 
*_segment.null_values() : pmr_vector<bool>(_segment.size()); + + auto begin = Iterator{decompressed_segment.cbegin(), null_values.cbegin()}; + auto end = Iterator{decompressed_segment.cend(), null_values.cend()}; functor(begin, end); } /** - * For now this point access iterator decompresses the whole segment. + * For the point access, we first retrieve the values for all chunk offsets in the position list and then save + * the decompressed values in a vector. The first value in that vector (index 0) is the value for the chunk offset + * at index 0 in the position list. */ template <typename Functor> void _on_with_iterators(const std::shared_ptr<const PosList>& position_filter, const Functor& functor) const { using ValueIterator = typename std::vector<T>::const_iterator; - const auto decompressed_segment = std::make_shared<std::vector<T>>(_segment.decompress()); + auto decompressed_filtered_segment = std::vector<T>(position_filter->size()); + auto cached_block = std::vector<char>{}; + auto cached_block_index = std::optional<size_t>{}; + for (auto index = size_t{0u}; index < position_filter->size(); ++index) { + const auto& position = (*position_filter)[index]; + auto [value, block_index] = _segment.decompress(position.chunk_offset, cached_block_index, cached_block); // NOLINT + decompressed_filtered_segment[index] = std::move(value); + cached_block_index = block_index; + } - auto begin = PointAccessIterator{decompressed_segment, &_segment.null_values(), + auto begin = PointAccessIterator{decompressed_filtered_segment, &_segment.null_values(), position_filter->cbegin(), position_filter->cbegin()}; - auto end = PointAccessIterator{decompressed_segment, &_segment.null_values(), + auto end = PointAccessIterator{decompressed_filtered_segment, &_segment.null_values(), position_filter->cbegin(), position_filter->cend()}; functor(begin, end); @@ -113,7 +129,7 @@ class LZ4Iterable : public PointAccessibleSegmentIterable<LZ4Iterable<T>> { using IterableType = LZ4Iterable<T>; // Begin Iterator - PointAccessIterator(const std::shared_ptr<std::vector<T>>& data, const pmr_vector<bool>* 
null_values, + PointAccessIterator(const std::vector& data, const std::optional>* null_values, const PosList::const_iterator position_filter_begin, PosList::const_iterator position_filter_it) : BasePointAccessSegmentIterator, SegmentPosition>{std::move(position_filter_begin), @@ -126,16 +142,14 @@ class LZ4Iterable : public PointAccessibleSegmentIterable> { SegmentPosition dereference() const { const auto& chunk_offsets = this->chunk_offsets(); - const auto value = (*_data)[chunk_offsets.offset_in_referenced_chunk]; - const auto is_null = (*_null_values)[chunk_offsets.offset_in_referenced_chunk]; + const auto value = _data[chunk_offsets.offset_in_poslist]; + const auto is_null = *_null_values && (**_null_values)[chunk_offsets.offset_in_referenced_chunk]; return SegmentPosition{value, is_null, chunk_offsets.offset_in_poslist}; } private: - // LZ4 PointAccessIterators share the materialized segment - std::shared_ptr> _data; - - const pmr_vector* _null_values; + const std::vector _data; + const std::optional>* _null_values; }; }; diff --git a/src/lib/storage/lz4_segment.cpp b/src/lib/storage/lz4_segment.cpp index 8ed0e53d0b..a6fd36fd15 100644 --- a/src/lib/storage/lz4_segment.cpp +++ b/src/lib/storage/lz4_segment.cpp @@ -2,30 +2,46 @@ #include +#include +#include +#include + #include "resolve_type.hpp" #include "storage/vector_compression/base_compressed_vector.hpp" +#include "storage/vector_compression/base_vector_decompressor.hpp" #include "utils/assert.hpp" #include "utils/performance_warning.hpp" namespace opossum { template -LZ4Segment::LZ4Segment(pmr_vector&& compressed_data, pmr_vector&& null_values, - pmr_vector&& offsets, const size_t decompressed_size) +LZ4Segment::LZ4Segment(pmr_vector>&& lz4_blocks, std::optional>&& null_values, + pmr_vector&& dictionary, const size_t block_size, const size_t last_block_size, + const size_t compressed_size, const size_t num_elements) : BaseEncodedSegment{data_type_from_type()}, - 
_compressed_data{std::move(compressed_data)}, + _lz4_blocks{std::move(lz4_blocks)}, _null_values{std::move(null_values)}, - _offsets{std::move(offsets)}, - _decompressed_size{decompressed_size} {} + _dictionary{std::move(dictionary)}, + _string_offsets{std::nullopt}, + _block_size{block_size}, + _last_block_size{last_block_size}, + _compressed_size{compressed_size}, + _num_elements{num_elements} {} template -LZ4Segment::LZ4Segment(pmr_vector&& compressed_data, pmr_vector&& null_values, - const size_t decompressed_size) +LZ4Segment::LZ4Segment(pmr_vector>&& lz4_blocks, std::optional>&& null_values, + pmr_vector&& dictionary, std::unique_ptr&& string_offsets, + const size_t block_size, const size_t last_block_size, const size_t compressed_size, + const size_t num_elements) : BaseEncodedSegment{data_type_from_type()}, - _compressed_data{std::move(compressed_data)}, + _lz4_blocks{std::move(lz4_blocks)}, _null_values{std::move(null_values)}, - _offsets{std::nullopt}, - _decompressed_size{decompressed_size} {} + _dictionary{std::move(dictionary)}, + _string_offsets{std::move(string_offsets)}, + _block_size{block_size}, + _last_block_size{last_block_size}, + _compressed_size{compressed_size}, + _num_elements{num_elements} {} template const AllTypeVariant LZ4Segment::operator[](const ChunkOffset chunk_offset) const { @@ -41,59 +57,71 @@ const AllTypeVariant LZ4Segment::operator[](const ChunkOffset chunk_offset) c template const std::optional LZ4Segment::get_typed_value(const ChunkOffset chunk_offset) const { - PerformanceWarning("LZ4::get_typed_value: decompressing the whole LZ4 segment"); - auto decompressed_segment = decompress(); - - const auto is_null = _null_values[chunk_offset]; - if (is_null) { + if (_null_values && (*_null_values)[chunk_offset]) { return std::nullopt; } - return decompressed_segment[chunk_offset]; + return decompress(chunk_offset); } template -const pmr_vector& LZ4Segment::null_values() const { +const std::optional>& LZ4Segment::null_values() const 
{ return _null_values; } template -const std::optional> LZ4Segment::offsets() const { - return _offsets; +const std::optional> LZ4Segment::string_offset_decompressor() const { + if (_string_offsets && *_string_offsets) { + return (*_string_offsets)->create_base_decompressor(); + } else { + return std::nullopt; + } +} + +template +const pmr_vector& LZ4Segment::dictionary() const { + return _dictionary; } template size_t LZ4Segment::size() const { - return _null_values.size(); + return _num_elements; } template std::vector LZ4Segment::decompress() const { - auto decompressed_data = std::vector(_decompressed_size / sizeof(T)); - auto compressed_size = static_cast(_compressed_data.size()); - const int decompressed_result = - LZ4_decompress_safe(_compressed_data.data(), reinterpret_cast(decompressed_data.data()), compressed_size, - static_cast(_decompressed_size)); - Assert(decompressed_result > 0, "LZ4 decompression failed"); + auto decompressed_data = std::vector(size()); + + const auto num_blocks = _lz4_blocks.size(); + for (auto block_index = size_t{0u}; block_index < num_blocks; ++block_index) { + // This offset is needed to write directly into the decompressed data vector. + const auto decompression_offset = block_index * _block_size / sizeof(T); + _decompress_block(block_index, decompressed_data, decompression_offset); + } return decompressed_data; } template <> std::vector LZ4Segment::decompress() const { /** - * If the input segment only contained empty strings the original size is 0. That can't be decompressed and instead - * we can just return as many empty strings as the input contained. + * If the input segment only contained empty strings, the original size is 0. The segment can't be decompressed, and + * instead we can just return as many empty strings as the input contained. 
*/ - if (!_decompressed_size) { - return std::vector(_null_values.size()); + if (_lz4_blocks.empty()) { + return std::vector(size()); } - auto decompressed_data = std::vector(_decompressed_size); - auto compressed_size = static_cast(_compressed_data.size()); - const int decompressed_result = LZ4_decompress_safe(_compressed_data.data(), decompressed_data.data(), - compressed_size, static_cast(_decompressed_size)); - Assert(decompressed_result > 0, "LZ4 decompression failed"); + const auto decompressed_size = (_lz4_blocks.size() - 1) * _block_size + _last_block_size; + auto decompressed_data = std::vector(decompressed_size); + + const auto num_blocks = _lz4_blocks.size(); + + for (auto block_index = size_t{0u}; block_index < num_blocks; ++block_index) { + // This offset is needed to write directly into the decompressed data vector. + const auto decompression_offset = block_index * _block_size; + _decompress_block_to_bytes(block_index, decompressed_data, decompression_offset); + } /** * Decode the previously encoded string data. These strings are all appended and separated along the stored offsets. @@ -101,15 +129,17 @@ std::vector LZ4Segment::decompress() const { * of the string. The end offset is the first character behind the string that is NOT part of the string (i.e., an * exclusive offset). It is usually the next offset in the vector. In the case of the last offset the end offset is * indicated by the end of the data vector. + * The offsets are stored in a compressed vector and accessed via the vector decompression interface. 
*/ + auto offset_decompressor = (*_string_offsets)->create_base_decompressor(); auto decompressed_strings = std::vector(); - for (auto it = _offsets->cbegin(); it != _offsets->cend(); ++it) { - auto start_char_offset = *it; + for (auto offset_index = size_t{0u}; offset_index < offset_decompressor->size(); ++offset_index) { + auto start_char_offset = offset_decompressor->get(offset_index); size_t end_char_offset; - if (it + 1 == _offsets->cend()) { - end_char_offset = _decompressed_size; + if (offset_index + 1 == offset_decompressor->size()) { + end_char_offset = decompressed_size; } else { - end_char_offset = *(it + 1); + end_char_offset = offset_decompressor->get(offset_index + 1); } const auto start_offset_it = decompressed_data.cbegin() + start_char_offset; @@ -120,27 +150,297 @@ std::vector LZ4Segment::decompress() const { return decompressed_strings; } +template +void LZ4Segment::_decompress_block(const size_t block_index, std::vector& decompressed_data, + const size_t write_offset) const { + const auto decompressed_block_size = block_index + 1 != _lz4_blocks.size() ? _block_size : _last_block_size; + auto& compressed_block = _lz4_blocks[block_index]; + const auto compressed_block_size = compressed_block.size(); + + int decompressed_result; + if (_dictionary.empty()) { + /** + * If the dictionary is empty, we either have only a single block or had not enough data for a dictionary. + * When decoding without a dictionary LZ4 needs a stream decode pointer (which would be used to decode the + * following blocks). + * A new decoder needs to be created for every block (in the case of multiple blocks being compressed without a + * dictionary) since the blocks were compressed independently. + * This decoder needs to be reset via LZ4_setStreamDecode since LZ4 reuses the previous state instead. 
+ */ + LZ4_streamDecode_t lz4_stream_decoder; + auto lz4_stream_decoder_ptr = std::make_unique<LZ4_streamDecode_t>(lz4_stream_decoder); + const auto reset_decoder_status = LZ4_setStreamDecode(lz4_stream_decoder_ptr.get(), nullptr, 0); + Assert(reset_decoder_status == 1, "LZ4 decompression failed to reset stream decoder."); + + decompressed_result = LZ4_decompress_safe_continue(lz4_stream_decoder_ptr.get(), compressed_block.data(), + reinterpret_cast<char*>(decompressed_data.data()) + write_offset, + static_cast<int>(compressed_block_size), + static_cast<int>(decompressed_block_size)); + } else { + decompressed_result = LZ4_decompress_safe_usingDict( + compressed_block.data(), reinterpret_cast<char*>(decompressed_data.data()) + write_offset, + static_cast<int>(compressed_block_size), static_cast<int>(decompressed_block_size), _dictionary.data(), + static_cast<int>(_dictionary.size())); + } + + Assert(decompressed_result > 0, "LZ4 stream decompression failed"); + DebugAssert(static_cast<size_t>(decompressed_result) == decompressed_block_size, + "Decompressed LZ4 block has different size than the initial source data."); +} + +template <typename T> +void LZ4Segment<T>::_decompress_block_to_bytes(const size_t block_index, std::vector<char>& decompressed_data) const { + // Ensure that the decompressed data fits into the vector. + if (decompressed_data.size() != _block_size) { + decompressed_data.resize(_block_size); + } + + // We use the string method since we handle a char-vector (even though the data is not necessarily string data). + _decompress_block_to_bytes(block_index, decompressed_data, 0u); + + /** + * In the case of the last block, the decompressed data is possibly smaller than _block_size (its size equals + * _last_block_size). However, when decompressing that block into a buffer of the size _last_block_size, the + * LZ4 decompression fails. Therefore, the block is decompressed into a buffer of size _block_size and resized to + * the smaller _last_block_size afterwards.
+ */ + if (block_index + 1 == _lz4_blocks.size()) { + decompressed_data.resize(_last_block_size); + } +} + +template +void LZ4Segment::_decompress_block_to_bytes(const size_t block_index, std::vector& decompressed_data, + const size_t write_offset) const { + const auto decompressed_block_size = block_index + 1 != _lz4_blocks.size() ? _block_size : _last_block_size; + auto& compressed_block = _lz4_blocks[block_index]; + const auto compressed_block_size = compressed_block.size(); + + int decompressed_result; + if (_dictionary.empty()) { + /** + * If the dictionary is empty, we either have only a single block or had not enough data for a dictionary. + * When decoding without a dictionary LZ4 needs a stream decode pointer (which would be used to decode the + * following blocks). + * A new decoder needs to be created for every block (in the case of multiple blocks being compressed without a + * dictionary) since the blocks were compressed independently. + * This decoder needs to be reset via LZ4_setStreamDecode since LZ4 reuses the previous state instead. 
+ */ + LZ4_streamDecode_t lz4_stream_decoder; + auto lz4_stream_decoder_ptr = std::make_unique(lz4_stream_decoder); + const auto reset_decoder_status = LZ4_setStreamDecode(lz4_stream_decoder_ptr.get(), nullptr, 0); + Assert(reset_decoder_status == 1, "LZ4 decompression failed to reset stream decoder."); + + decompressed_result = LZ4_decompress_safe_continue( + lz4_stream_decoder_ptr.get(), compressed_block.data(), decompressed_data.data() + write_offset, + static_cast(compressed_block_size), static_cast(decompressed_block_size)); + } else { + decompressed_result = LZ4_decompress_safe_usingDict( + compressed_block.data(), decompressed_data.data() + write_offset, static_cast(compressed_block_size), + static_cast(decompressed_block_size), _dictionary.data(), static_cast(_dictionary.size())); + } + + Assert(decompressed_result > 0, "LZ4 stream decompression failed"); + DebugAssert(static_cast(decompressed_result) == decompressed_block_size, + "Decompressed LZ4 block has different size than the initial source data."); +} + +template +std::pair LZ4Segment::decompress(const ChunkOffset& chunk_offset, + const std::optional cached_block_index, + std::vector& cached_block) const { + const auto memory_offset = chunk_offset * sizeof(T); + const auto block_index = memory_offset / _block_size; + + /** + * If the previously decompressed block was a different block than the one accessed now, overwrite it with the now + * decompressed block. 
+ */ + if (!cached_block_index || block_index != *cached_block_index) { + _decompress_block_to_bytes(block_index, cached_block); + } + + const auto value_offset = (memory_offset % _block_size) / sizeof(T); + const T value = *(reinterpret_cast(cached_block.data()) + value_offset); + return std::pair{value, block_index}; +} + +template <> +std::pair LZ4Segment::decompress(const ChunkOffset& chunk_offset, + const std::optional cached_block_index, + std::vector& cached_block) const { + /** + * If the input segment only contained empty strings, the original size is 0. The segment can't be decompressed, and + * instead we can just return as many empty strings as the input contained. + */ + if (_lz4_blocks.empty()) { + return std::pair{pmr_string{""}, 0u}; + } + + /** + * Calculate character begin and end offsets. This range may span more than one block. If this is the case, multiple + * blocks need to be decompressed. + * The offsets are stored in a compressed vector and accessed via the vector decompression interface. + */ + auto offset_decompressor = (*_string_offsets)->create_base_decompressor(); + auto start_offset = offset_decompressor->get(chunk_offset); + size_t end_offset; + if (chunk_offset + 1 == offset_decompressor->size()) { + end_offset = (_lz4_blocks.size() - 1) * _block_size + _last_block_size; + } else { + end_offset = offset_decompressor->get(chunk_offset + 1); + } + + /** + * Find the block range in which the string is. If it is only in a single block, then the decompression is simple. + * Otherwise multiple blocks need to be decompressed. + */ + const auto start_block = start_offset / _block_size; + const auto end_block = end_offset / _block_size; + + // Only one block needs to be decompressed. + if (start_block == end_block) { + /** + * If the previously decompressed block was a different block than the one accessed now, overwrite it with the now + * decompressed block. 
+ */ + if (!cached_block_index || start_block != *cached_block_index) { + _decompress_block_to_bytes(start_block, cached_block); + } + + // Extract the string from the block via the offsets. + const auto block_start_offset = start_offset % _block_size; + const auto block_end_offset = end_offset % _block_size; + const auto start_offset_it = cached_block.cbegin() + block_start_offset; + const auto end_offset_it = cached_block.cbegin() + block_end_offset; + + return std::pair{pmr_string{start_offset_it, end_offset_it}, start_block}; + } else { + /** + * Multiple blocks need to be decompressed. Iterate over all relevant blocks and append the result to this string + * stream. + */ + std::stringstream result_string; + + // These are the character offsets that need to be read in every block. + size_t block_start_offset = start_offset % _block_size; + size_t block_end_offset = _block_size; + + /** + * This is true if there is a block cached and it is one of the blocks that has to be accessed to decompress the + * current element. + * If it is true, there are two cases: + * 1) The first block that has to be accessed is cached. This is trivial, and afterwards the data can be + * overwritten. + * 2) The cached block is not the first but a later block. In that case, the cached block is copied. The original + * buffer is overwritten when decompressing the other blocks. When the cached block needs to be accessed, the copy + * is used. + */ + const auto use_caching = + cached_block_index && *cached_block_index >= start_block && *cached_block_index <= end_block; + + /** + * If the cached block is not the first block, keep a copy so that the blocks can still be decompressed into the + * passed char array and the last decompressed block will be cached afterwards. + */ + auto cached_block_copy = std::vector<char>{}; + if (use_caching && *cached_block_index != start_block) { + cached_block_copy = std::vector<char>{cached_block}; + } + + /** + * Store the index of the last decompressed block.
The blocks are decompressed into the cache buffer. If the cached + * block is the last block of the string, it is copied and used. As a result, the cache contains the last decompressed + * block (i.e., the block before the cached block). + * In that case, this index equals end_block - 1. Otherwise, it will equal end_block. + */ + auto new_cached_block_index = size_t{0u}; + + for (size_t block_index = start_block; block_index <= end_block; ++block_index) { + // Only decompress the current block if it's not cached. + if (!(use_caching && block_index == *cached_block_index)) { + _decompress_block_to_bytes(block_index, cached_block); + new_cached_block_index = block_index; + } + + // Set the offset for the end of the string. + if (block_index == end_block) { + block_end_offset = end_offset % _block_size; + } + + /** + * Extract the string from the current block via the offsets and append it to the result string stream. + * If the cached block is not the start block, the data is retrieved from the copy. + */ + pmr_string partial_result; + if (use_caching && block_index == *cached_block_index && block_index != start_block) { + const auto start_offset_it = cached_block_copy.cbegin() + block_start_offset; + const auto end_offset_it = cached_block_copy.cbegin() + block_end_offset; + partial_result = pmr_string{start_offset_it, end_offset_it}; + } else { + const auto start_offset_it = cached_block.cbegin() + block_start_offset; + const auto end_offset_it = cached_block.cbegin() + block_end_offset; + partial_result = pmr_string{start_offset_it, end_offset_it}; + } + result_string << partial_result; + + // After the first iteration, this is set to 0 since only the first block's start offset can be non-zero.
+ block_start_offset = 0u; + } + return std::pair{pmr_string{result_string.str()}, new_cached_block_index}; + } +} + +template +T LZ4Segment::decompress(const ChunkOffset& chunk_offset) const { + auto decompressed_block = std::vector(_block_size); + return decompress(chunk_offset, std::nullopt, decompressed_block).first; +} + template std::shared_ptr LZ4Segment::copy_using_allocator(const PolymorphicAllocator& alloc) const { - auto new_compressed_data = pmr_vector{_compressed_data, alloc}; - auto new_null_values = pmr_vector{_null_values, alloc}; + auto new_lz4_blocks = pmr_vector>{alloc}; + for (const auto& block : _lz4_blocks) { + new_lz4_blocks.emplace_back(pmr_vector{block, alloc}); + } - if (_offsets) { - auto new_offsets = pmr_vector(*_offsets, alloc); - return std::allocate_shared(alloc, std::move(new_compressed_data), std::move(new_null_values), - std::move(new_offsets), _decompressed_size); + auto new_null_values = + _null_values ? std::optional>{pmr_vector{*_null_values, alloc}} : std::nullopt; + auto new_dictionary = pmr_vector{_dictionary, alloc}; + + if (_string_offsets) { + auto new_string_offsets = *_string_offsets ? (*_string_offsets)->copy_using_allocator(alloc) : nullptr; + return std::allocate_shared(alloc, std::move(new_lz4_blocks), std::move(new_null_values), + std::move(new_dictionary), std::move(new_string_offsets), _block_size, + _last_block_size, _compressed_size, _num_elements); } else { - return std::allocate_shared(alloc, std::move(new_compressed_data), std::move(new_null_values), - _decompressed_size); + return std::allocate_shared(alloc, std::move(new_lz4_blocks), std::move(new_null_values), + std::move(new_dictionary), _block_size, _last_block_size, _compressed_size, + _num_elements); } } template size_t LZ4Segment::estimate_memory_usage() const { - auto bool_size = _null_values.size() * sizeof(bool); - // _offsets is used only for strings - auto offset_size = (_offsets ? 
_offsets->size() * sizeof(size_t) : 0u); - return sizeof(*this) + _compressed_data.size() + bool_size + offset_size; + // The null value vector is only stored if there is at least 1 null value in the segment. + auto bool_size = size_t{0u}; + if (_null_values) { + bool_size = _null_values->size() * sizeof(bool); + // Integer ceiling, since sizeof(bool) equals 1 but boolean vectors are optimized. + bool_size = _null_values->size() % CHAR_BIT ? bool_size / CHAR_BIT + 1 : bool_size / CHAR_BIT; + } + + // The overhead of storing each block in a separate vector. + auto block_vector_size = _lz4_blocks.size() * sizeof(pmr_vector); + + /** + * _string_offsets is used only for string segments and is a nullptr if the string segment does not contain any data + * (i.e., no rows or only rows with empty strings). + */ + auto offset_size = size_t{0}; + if (_string_offsets && *_string_offsets) { + offset_size = (*_string_offsets)->data_size(); + } + return sizeof(*this) + _compressed_size + bool_size + offset_size + _dictionary.size() + block_vector_size; } template diff --git a/src/lib/storage/lz4_segment.hpp b/src/lib/storage/lz4_segment.hpp index 2de0f85130..43aac63524 100644 --- a/src/lib/storage/lz4_segment.hpp +++ b/src/lib/storage/lz4_segment.hpp @@ -10,7 +10,9 @@ #include #include "base_encoded_segment.hpp" +#include "storage/pos_list.hpp" #include "storage/vector_compression/base_compressed_vector.hpp" +#include "storage/vector_compression/base_vector_decompressor.hpp" #include "types.hpp" namespace opossum { @@ -20,29 +22,83 @@ class BaseCompressedVector; template class LZ4Segment : public BaseEncodedSegment { public: - /* - * This is a container for an LZ4 compressed segment. It contains the compressed data, the necessary + /** + * This is a container for an LZ4 compressed segment. It contains the compressed data in blocks, the necessary * metadata and the ability to decompress the data again. 
* - * @param compressed_data The char vector that contains the LZ4 compressed segment data as binary blob. - * @param null_values Boolean vector that contains the information which row is null and which is not null. - * @param offsets If this segment is not a pmr_string segment this will be a std::nullopt (see the other constructor). - * Otherwise it contains the offsets for the compressed strings. The offset at position 0 is the - * character index of the string at index 0. Its (exclusive) end is at the offset at position 1. The - * last string ends at the end of the compressed data (since there is offset after it that specifies - * the end offset). Since these offsets are used the stored strings are not null-terminated - * (and may contain null bytes). - * @param compressed_size The size of the compressed data vector (the return value of LZ4) - * @param decompressed_size The size in bytes of the decompressed data vector. + * This constructor is used for non pmr_string segments. In those, the size of the data type in bytes is a + * power of two. That means that the row values perfectly fit into a block (whose size is also a power-of-two) and no + * value is split across two blocks. This makes decompression very convenient. + * + * @param lz4_blocks A vector that contains every LZ4 block separately (i.e., this is a vector of vectors). The blocks + * are stored in this data format since they are created independently and are also accessed + * independently. The decompressed size of the first n - 1 blocks is "block_size" and the + * decompressed size of the last vector is equal to "last_block_size". + * @param null_values Boolean vector that contains the information which row is null and which is not null. If no + * value in the segment is null, std::nullopt is passed instead to reduce the memory footprint of + * the vector. + * @param dictionary This dictionary should be generated via the zstd library. 
It is used to initialize the LZ4 + * stream compression algorithm. Doing that makes the compression of separate blocks independent of + * each other (by default, the blocks would depend on the previous blocks). If the segment only has + * a single block, the passed dictionary will be empty since it is not needed for independent + * decompression. + * @param block_size The decompressed size of each full block in bytes. This can be at most + * std::numeric_limits<int>::max(). + * @param last_block_size The size of the last block in bytes. It is a separate value since the last block is not + * necessarily full. + * @param compressed_size The sum of the compressed size of all blocks. This is a separate argument, so that + * there is no need to iterate over all blocks when estimating the memory usage. + * @param num_elements The number of elements in this segment. This needs to be stored in its own variable, since + * the other variables might not be set or stored to reduce the memory footprint. E.g., a string + * segment with only empty strings as elements would have no other way to know how many rows there + * are. */ - explicit LZ4Segment(pmr_vector<char>&& compressed_data, pmr_vector<bool>&& null_values, pmr_vector<size_t>&& offsets, - const size_t decompressed_size); + explicit LZ4Segment(pmr_vector<pmr_vector<char>>&& lz4_blocks, std::optional<pmr_vector<bool>>&& null_values, + pmr_vector<char>&& dictionary, const size_t block_size, const size_t last_block_size, + const size_t compressed_size, const size_t num_elements); - explicit LZ4Segment(pmr_vector<char>&& compressed_data, pmr_vector<bool>&& null_values, - const size_t decompressed_size); + /** + * This constructor is used only for pmr_string segments. In those, the size of each row value varies. This means that + * a row value can be split into multiple blocks (even more than two if the value is larger than the block size). That + * makes the decompression slightly more complex. + * + * @param lz4_blocks A vector that contains every LZ4 block separately (i.e., this is a vector of vectors).
The blocks + * are stored in this data format since they are created independently and are also accessed + * independently. The decompressed size of the first n - 1 blocks is "block_size" and the + * decompressed size of the last block is equal to "last_block_size". + * @param null_values Boolean vector that contains the information which row is null and which is not null. If no + * value in the segment is null, std::nullopt is passed instead to reduce the memory footprint of + * the vector. + * @param dictionary This dictionary should be generated via the zstd library. It is used to initialize the LZ4 + * stream compression algorithm. Doing that makes the compression of separate blocks independent of + * each other (by default, the blocks would depend on the previous blocks). If the segment only has + * a single block, the passed dictionary will be empty since it is not needed for independent + * decompression. + * @param string_offsets These offsets are only needed if this segment is a pmr_string segment. + * For other segments, this is set to std::nullopt (see the other constructor). + * It contains the offsets for the compressed strings. The offset at position 0 is the + * character index of the string at index 0. Its (exclusive) end is at the offset at position 1. + * The last string ends at the end of the compressed data (since there is no offset after it + * that specifies the end offset). Since these offsets are used, the stored strings are not + * null-terminated (and may contain null bytes). + * The offsets are compressed using a vector compression method to reduce their memory footprint. + * @param block_size The decompressed size of each full block in bytes. This can be at most + * std::numeric_limits<int>::max(). + * @param last_block_size The size of the last block in bytes. It is a separate value since the last block is not + * necessarily full. + * @param compressed_size The sum of the compressed size of all blocks.
This is a separate argument so that + * there is no need to iterate over all blocks when estimating the memory usage. + * @param num_elements The number of elements in this segment. This needs to be stored in its own variable, since + * the other variables might not be set or stored to reduce the memory footprint. E.g., a string + * segment with only empty strings as elements would have no other way to know how many rows there + * are. + */ + explicit LZ4Segment(pmr_vector<pmr_vector<char>>&& lz4_blocks, std::optional<pmr_vector<bool>>&& null_values, + pmr_vector<char>&& dictionary, std::unique_ptr<const BaseCompressedVector>&& string_offsets, + const size_t block_size, const size_t last_block_size, const size_t compressed_size, + const size_t num_elements); - const pmr_vector<bool>& null_values() const; - const std::optional<pmr_vector<size_t>> offsets() const; + const std::optional<pmr_vector<bool>>& null_values() const; + const std::optional<std::unique_ptr<BaseVectorDecompressor>> string_offset_decompressor() const; + const pmr_vector<char>& dictionary() const; /** * @defgroup BaseSegment interface @@ -55,8 +111,45 @@ class LZ4Segment : public BaseEncodedSegment { size_t size() const final; + /** + * Decompresses the whole segment at once into a single vector. + * + * @return A vector containing all the decompressed values in order. + */ std::vector<T> decompress() const; + /** + * Retrieves a single value by only decompressing the block it resides in. Each call of this method causes the + * decompression of a block. + * + * @param chunk_offset The chunk offset identifies a single value in the segment. + * @return The decompressed value. + */ + T decompress(const ChunkOffset& chunk_offset) const; + + /** + * Retrieves a single value by only decompressing the block it resides in. This method also accepts a previously + * decompressed block (and its block index) to check if the queried value also resides in that block. If that is the + * case, the value is retrieved directly instead of decompressing the block again. + * If the passed block is a different block, it is overwritten with the newly decompressed block.
+ * This block is stored (and passed) as char-vector instead of type T to maintain compatibility with string-segments, + * since those don't compress a string-vector but a char-vector. In the case of non-string-segments, the data will be + * cast to type T. In the case of string-segments, the char-vector can be used directly. + * + * @param chunk_offset The chunk offset identifies a single value in the segment. + * @param cached_block_index The index of the passed decompressed block. Passing a nullopt indicates that there is + * no previous block that was decompressed. In that case the newly decompressed block is + * written to the passed vector. This is only the case for the first decompression, when + * resolving a position list in the point access iterator. + * @param cached_block Vector that contains a previously decompressed block. If this method needs to access a + * different block, the data is overwritten. + * @return A pair of the decompressed value and the index of the block it resides in. This index is the same as the + * input index if no new block had to be decompressed. Otherwise it is the index of the block that was written + * to the passed vector. 
+ */ + std::pair<T, size_t> decompress(const ChunkOffset& chunk_offset, const std::optional<size_t> cached_block_index, + std::vector<char>& cached_block) const; + std::shared_ptr<BaseSegment> copy_using_allocator(const PolymorphicAllocator<size_t>& alloc) const final; size_t estimate_memory_usage() const final; @@ -74,10 +167,50 @@ class LZ4Segment : public BaseEncodedSegment { /**@}*/ private: - const pmr_vector<char> _compressed_data; - const pmr_vector<bool> _null_values; - const std::optional<pmr_vector<size_t>> _offsets; - const size_t _decompressed_size; + const pmr_vector<pmr_vector<char>> _lz4_blocks; + const std::optional<pmr_vector<bool>> _null_values; + const pmr_vector<char> _dictionary; + const std::optional<std::unique_ptr<const BaseCompressedVector>> _string_offsets; + const size_t _block_size; + const size_t _last_block_size; + const size_t _compressed_size; + const size_t _num_elements; + + /** + * Decompress a single block into the provided buffer (the vector). This method writes to the buffer at the given + * offset, i.e., the buffer can be larger than a single block. + * + * @param block_index Index of the block in _lz4_blocks that is decompressed. + * @param decompressed_data The buffer to which the decompressed data is written. + * @param write_offset Byte offset from the beginning of the decompressed_data vector. This is useful when + * decompressing multiple blocks into the same buffer. + */ + void _decompress_block(const size_t block_index, std::vector<T>& decompressed_data, const size_t write_offset) const; + + /** + * Decompresses a single block into a char vector. This method resizes the input vector if the decompressed data + * would not fit into it. It is used for string-segments as well as non-string-segments. + * This allows a uniform interface in the decompress method for caching. For non-string-segments the decompressed + * values have to be further cast to type T, while string-segments can use the char-vector directly. + * + * @param block_index Index of the block that is decompressed. + * @param decompressed_data Vector to which the decompressed data is written.
This data is written in bytes (i.e. + * char) and needs to be cast to type T to get the proper values. + */ + void _decompress_block_to_bytes(const size_t block_index, std::vector<char>& decompressed_data) const; + + /** + * Decompress a single block into bytes. For strings the bytes equal the chars of the strings. This method writes + * into the passed char-vector at the passed offset, i.e., it allows decompressing multiple blocks + * into the same vector. + * + * @param block_index Index of the block that is decompressed. + * @param decompressed_data Vector to which the decompressed data is written. Its size needs to be at least equal to + * the write offset + the decompressed size of the block. + * @param write_offset The byte offset at which the data is written in the passed vector. + */ + void _decompress_block_to_bytes(const size_t block_index, std::vector<char>& decompressed_data, + const size_t write_offset) const; }; } // namespace opossum diff --git a/src/lib/storage/run_length_segment/run_length_encoder.hpp b/src/lib/storage/run_length_segment/run_length_encoder.hpp index 8354c52f59..2ac49d0e5c 100644 --- a/src/lib/storage/run_length_segment/run_length_encoder.hpp +++ b/src/lib/storage/run_length_segment/run_length_encoder.hpp @@ -28,6 +28,11 @@ class RunLengthEncoder : public SegmentEncoder { auto iterable = ValueSegmentIterable<T>{*value_segment}; iterable.with_iterators([&](auto it, auto end) { + // Early out for empty segments; the code below assumes a non-empty segment + if (it == end) { + return; + } + // Init is_current_null such that it does not equal the first entry auto current_value = T{}; auto is_current_null = !it->is_null(); diff --git a/src/lib/storage/value_segment.cpp b/src/lib/storage/value_segment.cpp index a75625839c..a09ae917ba 100644 --- a/src/lib/storage/value_segment.cpp +++ b/src/lib/storage/value_segment.cpp @@ -1,5 +1,6 @@ #include "value_segment.hpp" +#include <climits> #include #include #include @@ -169,7 +170,14 @@ std::shared_ptr
ValueSegment::copy_using_allocator(const Polymor template <typename T> size_t ValueSegment<T>::estimate_memory_usage() const { - return sizeof(*this) + _values.size() * sizeof(T) + (_null_values ? _null_values->size() * sizeof(bool) : 0u); + size_t bool_size = 0u; + if (_null_values) { + bool_size = _null_values->size() * sizeof(bool); + // Integer ceiling division, since sizeof(bool) equals 1 but boolean vectors are bit-packed. + bool_size = _null_values->size() % CHAR_BIT ? bool_size / CHAR_BIT + 1 : bool_size / CHAR_BIT; + } + + return sizeof(*this) + _values.size() * sizeof(T) + bool_size; } EXPLICITLY_INSTANTIATE_DATA_TYPES(ValueSegment); diff --git a/src/lib/storage/vector_compression/simd_bp128/simd_bp128_iterator.cpp b/src/lib/storage/vector_compression/simd_bp128/simd_bp128_iterator.cpp index 44512a3d6a..16c96692a8 100644 --- a/src/lib/storage/vector_compression/simd_bp128/simd_bp128_iterator.cpp +++ b/src/lib/storage/vector_compression/simd_bp128/simd_bp128_iterator.cpp @@ -9,7 +9,7 @@ SimdBp128Iterator::SimdBp128Iterator(const pmr_vector<uint128_t>* data, size_t s _absolute_index{absolute_index}, _current_meta_block{std::make_unique>()}, _current_meta_block_index{0u} { - if (data) { + if (data && !data->empty()) { _unpack_next_meta_block(); } } diff --git a/src/test/CMakeLists.txt b/src/test/CMakeLists.txt index 4e5e4a56b9..2d90598752 100644 --- a/src/test/CMakeLists.txt +++ b/src/test/CMakeLists.txt @@ -160,6 +160,7 @@ set( storage/compressed_vector_test.cpp storage/dictionary_segment_test.cpp storage/encoded_segment_test.cpp + storage/encoded_string_segment_test.cpp storage/encoding_test.hpp storage/fixed_string_dictionary_segment_test.cpp storage/fixed_string_vector_test.cpp diff --git a/src/test/storage/encoded_segment_test.cpp b/src/test/storage/encoded_segment_test.cpp index 885e0a36a9..d3936cbe7f 100644 --- a/src/test/storage/encoded_segment_test.cpp +++ b/src/test/storage/encoded_segment_test.cpp @@ -53,7 +53,7 @@ class EncodedSegmentTest : public BaseTestWithParam<SegmentEncodingSpec> { return
std::make_shared<ValueSegment<int32_t>>(std::move(values)); } - std::shared_ptr<ValueSegment<int32_t>> create_int_w_null_value_segment() { + std::shared_ptr<ValueSegment<int32_t>> create_int_with_null_value_segment() { auto values = pmr_concurrent_vector<int32_t>(row_count()); auto null_values = pmr_concurrent_vector<bool>(row_count()); @@ -125,12 +125,33 @@ INSTANTIATE_TEST_CASE_P( SegmentEncodingSpec{EncodingType::Dictionary, VectorCompressionType::FixedSizeByteAligned}, SegmentEncodingSpec{EncodingType::FrameOfReference, VectorCompressionType::SimdBp128}, SegmentEncodingSpec{EncodingType::FrameOfReference, VectorCompressionType::FixedSizeByteAligned}, - SegmentEncodingSpec{EncodingType::RunLength}, SegmentEncodingSpec{EncodingType::LZ4}), + SegmentEncodingSpec{EncodingType::RunLength}, + SegmentEncodingSpec{EncodingType::LZ4, VectorCompressionType::SimdBp128}, + SegmentEncodingSpec{EncodingType::LZ4, VectorCompressionType::FixedSizeByteAligned}), formatter); +TEST_P(EncodedSegmentTest, EncodeEmptyIntSegment) { + auto value_segment = std::make_shared<ValueSegment<int32_t>>(pmr_concurrent_vector<int32_t>{}); + auto base_encoded_segment = encode_value_segment(DataType::Int, value_segment); + + EXPECT_EQ(value_segment->size(), base_encoded_segment->size()); + + // Trying to iterate over the empty segments should not cause any errors or crashes.
+ resolve_encoded_segment_type(*base_encoded_segment, [&](const auto& encoded_segment) { + auto value_segment_iterable = create_iterable_from_segment(*value_segment); + auto encoded_segment_iterable = create_iterable_from_segment(encoded_segment); + + value_segment_iterable.with_iterators([&](auto value_segment_it, auto value_segment_end) { + encoded_segment_iterable.with_iterators([&](auto encoded_segment_it, auto encoded_segment_end) { + // Nothing happens here since the segments are empty + }); + }); + }); +} + TEST_P(EncodedSegmentTest, SequentiallyReadNotNullableIntSegment) { - auto value_segment = this->create_int_value_segment(); - auto base_encoded_segment = this->encode_value_segment(DataType::Int, value_segment); + auto value_segment = create_int_value_segment(); + auto base_encoded_segment = encode_value_segment(DataType::Int, value_segment); EXPECT_EQ(value_segment->size(), base_encoded_segment->size()); @@ -149,8 +170,8 @@ TEST_P(EncodedSegmentTest, SequentiallyReadNotNullableIntSegment) { } TEST_P(EncodedSegmentTest, SequentiallyReadNullableIntSegment) { - auto value_segment = this->create_int_w_null_value_segment(); - auto base_encoded_segment = this->encode_value_segment(DataType::Int, value_segment); + auto value_segment = create_int_with_null_value_segment(); + auto base_encoded_segment = encode_value_segment(DataType::Int, value_segment); EXPECT_EQ(value_segment->size(), base_encoded_segment->size()); @@ -160,7 +181,7 @@ TEST_P(EncodedSegmentTest, SequentiallyReadNullableIntSegment) { value_segment_iterable.with_iterators([&](auto value_segment_it, auto value_segment_end) { encoded_segment_iterable.with_iterators([&](auto encoded_segment_it, auto encoded_segment_end) { - auto row_idx = 0; + auto row_idx = 0u; for (; encoded_segment_it != encoded_segment_end; ++encoded_segment_it, ++value_segment_it, ++row_idx) { // This covers `EncodedSegment::operator[]` if (variant_is_null((*value_segment)[row_idx])) { @@ -182,12 +203,12 @@ 
TEST_P(EncodedSegmentTest, SequentiallyReadNullableIntSegment) { } TEST_P(EncodedSegmentTest, SequentiallyReadNullableIntSegmentWithChunkOffsetsList) { - auto value_segment = this->create_int_w_null_value_segment(); - auto base_encoded_segment = this->encode_value_segment(DataType::Int, value_segment); + auto value_segment = create_int_with_null_value_segment(); + auto base_encoded_segment = encode_value_segment(DataType::Int, value_segment); EXPECT_EQ(value_segment->size(), base_encoded_segment->size()); - auto position_filter = this->create_sequential_position_filter(); + auto position_filter = create_sequential_position_filter(); resolve_encoded_segment_type(*base_encoded_segment, [&](const auto& encoded_segment) { auto value_segment_iterable = create_iterable_from_segment(*value_segment); @@ -208,12 +229,12 @@ TEST_P(EncodedSegmentTest, SequentiallyReadNullableIntSegmentWithChunkOffsetsLis } TEST_P(EncodedSegmentTest, SequentiallyReadNullableIntSegmentWithShuffledChunkOffsetsList) { - auto value_segment = this->create_int_w_null_value_segment(); - auto base_encoded_segment = this->encode_value_segment(DataType::Int, value_segment); + auto value_segment = create_int_with_null_value_segment(); + auto base_encoded_segment = encode_value_segment(DataType::Int, value_segment); EXPECT_EQ(value_segment->size(), base_encoded_segment->size()); - auto position_filter = this->create_random_access_position_filter(); + auto position_filter = create_random_access_position_filter(); resolve_encoded_segment_type(*base_encoded_segment, [&](const auto& encoded_segment) { auto value_segment_iterable = create_iterable_from_segment(*value_segment); @@ -233,4 +254,21 @@ TEST_P(EncodedSegmentTest, SequentiallyReadNullableIntSegmentWithShuffledChunkOf }); } +TEST_P(EncodedSegmentTest, SequentiallyReadEmptyIntSegment) { + auto value_segment = std::make_shared>(pmr_concurrent_vector{}); + auto base_encoded_segment = encode_value_segment(DataType::Int, value_segment); + + 
EXPECT_EQ(value_segment->size(), base_encoded_segment->size()); + + // Even if no actual reading happens here, iterators are created and we can test that they do not crash on empty + // segments + resolve_encoded_segment_type(*base_encoded_segment, [&](const auto& encoded_segment) { + auto encoded_segment_iterable = create_iterable_from_segment(encoded_segment); + + encoded_segment_iterable.with_iterators([&](auto encoded_segment_it, auto encoded_segment_end) { + // Nothing happens here since the segments are empty + }); + }); +} + } // namespace opossum diff --git a/src/test/storage/encoded_string_segment_test.cpp b/src/test/storage/encoded_string_segment_test.cpp new file mode 100644 index 0000000000..ca332940a4 --- /dev/null +++ b/src/test/storage/encoded_string_segment_test.cpp @@ -0,0 +1,297 @@ +#include + +#include +#include +#include +#include + +#include "base_test.hpp" +#include "gtest/gtest.h" + +#include "constant_mappings.hpp" +#include "storage/chunk_encoder.hpp" +#include "storage/create_iterable_from_segment.hpp" +#include "storage/encoding_type.hpp" +#include "storage/resolve_encoded_segment_type.hpp" +#include "storage/segment_encoding_utils.hpp" +#include "storage/value_segment.hpp" + +#include "types.hpp" +#include "utils/enum_constant.hpp" + +namespace opossum { + +class EncodedStringSegmentTest : public BaseTestWithParam<SegmentEncodingSpec> { + protected: + static constexpr auto max_length = 32; + static constexpr auto row_count = size_t{1u} << 10; + + protected: + std::shared_ptr<ValueSegment<pmr_string>> create_empty_string_value_segment() { + auto values = pmr_concurrent_vector<pmr_string>(row_count); + return std::make_shared<ValueSegment<pmr_string>>(std::move(values)); + } + + std::shared_ptr<ValueSegment<pmr_string>> create_empty_string_with_null_value_segment() { + auto values = pmr_concurrent_vector<pmr_string>(row_count); + auto null_values = pmr_concurrent_vector<bool>(row_count); + + for (auto index = size_t{0u}; index < row_count; ++index) { + null_values[index] = index % 4 == 0; + } + + return std::make_shared<ValueSegment<pmr_string>>(std::move(values), 
std::move(null_values)); + } + + std::shared_ptr<ValueSegment<pmr_string>> create_string_value_segment() { + auto values = pmr_concurrent_vector<pmr_string>(row_count); + + for (auto index = size_t{0u}; index < row_count; ++index) { + if (index % 3 == 0) { + values[index] = "Hello world!!1!12345"; + } else if (index % 3 == 1) { + values[index] = "This IS A ..."; + } else { + values[index] = "0987654312poiuytrewq"; + } + } + + return std::make_shared<ValueSegment<pmr_string>>(std::move(values)); + } + + std::shared_ptr<ValueSegment<pmr_string>> create_string_with_null_value_segment() { + auto values = pmr_concurrent_vector<pmr_string>(row_count); + auto null_values = pmr_concurrent_vector<bool>(row_count); + + for (auto index = 0u; index < row_count; ++index) { + null_values[index] = index % 4 == 0; + if (index % 3 == 0) { + values[index] = "Hello world!!1!12345"; + } else if (index % 3 == 1) { + values[index] = "This IS A ..."; + } else { + values[index] = "0987654312poiuytrewq"; + } + } + + return std::make_shared<ValueSegment<pmr_string>>(std::move(values), std::move(null_values)); + } + + std::shared_ptr<PosList> create_sequential_position_filter() { + auto list = std::make_shared<PosList>(); + list->guarantee_single_chunk(); + + std::default_random_engine engine{}; + std::bernoulli_distribution bernoulli_dist{0.5}; + + for (auto offset_in_referenced_chunk = 0u; offset_in_referenced_chunk < row_count; ++offset_in_referenced_chunk) { + if (bernoulli_dist(engine)) { + list->push_back(RowID{ChunkID{0}, offset_in_referenced_chunk}); + } + } + + return list; + } + + std::shared_ptr<PosList> create_random_access_position_filter() { + auto list = create_sequential_position_filter(); + + auto random_device = std::random_device{}; + std::default_random_engine engine{random_device()}; + std::shuffle(list->begin(), list->end(), engine); + + return list; + } + + template <typename T> + std::shared_ptr<BaseEncodedSegment> encode_value_segment(DataType data_type, + const std::shared_ptr<ValueSegment<T>>& value_segment) { + const auto segment_encoding_spec = GetParam(); + return encode_segment(segment_encoding_spec.encoding_type, data_type, value_segment, + 
segment_encoding_spec.vector_compression_type); + } +}; + +auto formatter = [](const ::testing::TestParamInfo info) { + const auto spec = info.param; + + auto stream = std::stringstream{}; + stream << encoding_type_to_string.left.at(spec.encoding_type); + if (spec.vector_compression_type) { + stream << "-" << vector_compression_type_to_string.left.at(*spec.vector_compression_type); + } + + auto string = stream.str(); + string.erase(std::remove_if(string.begin(), string.end(), [](char c) { return !std::isalnum(c); }), string.end()); + + return string; +}; + +INSTANTIATE_TEST_CASE_P( + SegmentEncodingSpecs, EncodedStringSegmentTest, + ::testing::Values(SegmentEncodingSpec{EncodingType::Dictionary, VectorCompressionType::SimdBp128}, + SegmentEncodingSpec{EncodingType::Dictionary, VectorCompressionType::FixedSizeByteAligned}, + SegmentEncodingSpec{EncodingType::RunLength}, + SegmentEncodingSpec{EncodingType::LZ4, VectorCompressionType::SimdBp128}, + SegmentEncodingSpec{EncodingType::LZ4, VectorCompressionType::FixedSizeByteAligned}), + formatter); + +TEST_P(EncodedStringSegmentTest, SequentiallyReadNotNullableEmptyStringSegment) { + auto value_segment = create_empty_string_value_segment(); + auto base_encoded_segment = encode_value_segment(DataType::String, value_segment); + + EXPECT_EQ(value_segment->size(), base_encoded_segment->size()); + + resolve_encoded_segment_type(*base_encoded_segment, [&](const auto& encoded_segment) { + auto value_segment_iterable = create_iterable_from_segment(*value_segment); + auto encoded_segment_iterable = create_iterable_from_segment(encoded_segment); + + value_segment_iterable.with_iterators([&](auto value_segment_it, auto value_segment_end) { + encoded_segment_iterable.with_iterators([&](auto encoded_segment_it, auto encoded_segment_end) { + for (; encoded_segment_it != encoded_segment_end; ++encoded_segment_it, ++value_segment_it) { + EXPECT_EQ(value_segment_it->value(), encoded_segment_it->value()); + } + }); + }); + }); +} + 
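The string-offset layout described in the `LZ4Segment` constructor docstring (offset `i` marks the first character of string `i`; a string ends at the next offset, and the last string runs to the end of the decompressed data) can be illustrated with plain offset arithmetic. The snippet below is a standalone sketch, not Hyrise code; `extract_strings` is a hypothetical helper, and the offsets `{0, 4, 9, 13, 17, 17}` mirror the expected offsets used in the tests in this patch.

```cpp
#include <cassert>
#include <cstddef>
#include <string>
#include <vector>

// Hypothetical helper: rebuild strings from a concatenated character buffer
// plus start offsets. The strings are not null-terminated; string i spans
// [offsets[i], offsets[i + 1]) and the last string runs to the end of the
// buffer. An empty range (equal consecutive offsets) yields an empty string,
// which is how both "" values and NULL rows appear in the character data.
std::vector<std::string> extract_strings(const std::vector<char>& data,
                                         const std::vector<size_t>& offsets) {
  std::vector<std::string> result;
  for (size_t i = 0; i < offsets.size(); ++i) {
    const size_t start = offsets[i];
    const size_t end = (i + 1 < offsets.size()) ? offsets[i + 1] : data.size();
    result.emplace_back(data.data() + start, end - start);
  }
  return result;
}
```

With the buffer `"AlexPeterRalfHansAnna"` and offsets `{0, 4, 9, 13, 17, 17}`, index 4 (the NULL row) decodes as an empty string and index 5 as `"Anna"`, matching the segment tests above.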
+TEST_P(EncodedStringSegmentTest, SequentiallyReadNullableEmptyStringSegment) { + auto value_segment = create_empty_string_with_null_value_segment(); + auto base_encoded_segment = encode_value_segment(DataType::String, value_segment); + + EXPECT_EQ(value_segment->size(), base_encoded_segment->size()); + + resolve_encoded_segment_type(*base_encoded_segment, [&](const auto& encoded_segment) { + auto value_segment_iterable = create_iterable_from_segment(*value_segment); + auto encoded_segment_iterable = create_iterable_from_segment(encoded_segment); + + value_segment_iterable.with_iterators([&](auto value_segment_it, auto value_segment_end) { + encoded_segment_iterable.with_iterators([&](auto encoded_segment_it, auto encoded_segment_end) { + auto row_idx = 0u; + for (; encoded_segment_it != encoded_segment_end; ++encoded_segment_it, ++value_segment_it, ++row_idx) { + // This covers `EncodedSegment::operator[]` + if (variant_is_null((*value_segment)[row_idx])) { + EXPECT_TRUE(variant_is_null(encoded_segment[row_idx])); + } else { + EXPECT_EQ((*value_segment)[row_idx], encoded_segment[row_idx]); + } + + // This covers the point access iterator + EXPECT_EQ(value_segment_it->is_null(), encoded_segment_it->is_null()); + + if (!value_segment_it->is_null()) { + EXPECT_EQ(value_segment_it->value(), encoded_segment_it->value()); + } + } + }); + }); + }); +} + +TEST_P(EncodedStringSegmentTest, SequentiallyReadNotNullableStringSegment) { + auto value_segment = create_string_value_segment(); + auto base_encoded_segment = encode_value_segment(DataType::String, value_segment); + + EXPECT_EQ(value_segment->size(), base_encoded_segment->size()); + + resolve_encoded_segment_type(*base_encoded_segment, [&](const auto& encoded_segment) { + auto value_segment_iterable = create_iterable_from_segment(*value_segment); + auto encoded_segment_iterable = create_iterable_from_segment(encoded_segment); + + value_segment_iterable.with_iterators([&](auto value_segment_it, auto value_segment_end) { 
+ encoded_segment_iterable.with_iterators([&](auto encoded_segment_it, auto encoded_segment_end) { + for (; encoded_segment_it != encoded_segment_end; ++encoded_segment_it, ++value_segment_it) { + EXPECT_EQ(value_segment_it->value(), encoded_segment_it->value()); + } + }); + }); + }); +} + +TEST_P(EncodedStringSegmentTest, SequentiallyReadNullableStringSegment) { + auto value_segment = create_string_with_null_value_segment(); + auto base_encoded_segment = encode_value_segment(DataType::String, value_segment); + + EXPECT_EQ(value_segment->size(), base_encoded_segment->size()); + + resolve_encoded_segment_type(*base_encoded_segment, [&](const auto& encoded_segment) { + auto value_segment_iterable = create_iterable_from_segment(*value_segment); + auto encoded_segment_iterable = create_iterable_from_segment(encoded_segment); + + value_segment_iterable.with_iterators([&](auto value_segment_it, auto value_segment_end) { + encoded_segment_iterable.with_iterators([&](auto encoded_segment_it, auto encoded_segment_end) { + auto row_idx = 0u; + for (; encoded_segment_it != encoded_segment_end; ++encoded_segment_it, ++value_segment_it, ++row_idx) { + // This covers `EncodedSegment::operator[]` + if (variant_is_null((*value_segment)[row_idx])) { + EXPECT_TRUE(variant_is_null(encoded_segment[row_idx])); + } else { + EXPECT_EQ((*value_segment)[row_idx], encoded_segment[row_idx]); + } + + // This covers the point access iterator + EXPECT_EQ(value_segment_it->is_null(), encoded_segment_it->is_null()); + + if (!value_segment_it->is_null()) { + EXPECT_EQ(value_segment_it->value(), encoded_segment_it->value()); + } + } + }); + }); + }); +} + +TEST_P(EncodedStringSegmentTest, SequentiallyReadNullableStringSegmentWithChunkOffsetsList) { + auto value_segment = create_string_with_null_value_segment(); + auto base_encoded_segment = encode_value_segment(DataType::String, value_segment); + + EXPECT_EQ(value_segment->size(), base_encoded_segment->size()); + + auto position_filter = 
create_sequential_position_filter(); + + resolve_encoded_segment_type(*base_encoded_segment, [&](const auto& encoded_segment) { + auto value_segment_iterable = create_iterable_from_segment(*value_segment); + auto encoded_segment_iterable = create_iterable_from_segment(encoded_segment); + + value_segment_iterable.with_iterators(position_filter, [&](auto value_segment_it, auto value_segment_end) { + encoded_segment_iterable.with_iterators(position_filter, [&](auto encoded_segment_it, auto encoded_segment_end) { + for (; encoded_segment_it != encoded_segment_end; ++encoded_segment_it, ++value_segment_it) { + EXPECT_EQ(value_segment_it->is_null(), encoded_segment_it->is_null()); + + if (!value_segment_it->is_null()) { + EXPECT_EQ(value_segment_it->value(), encoded_segment_it->value()); + } + } + }); + }); + }); +} + +TEST_P(EncodedStringSegmentTest, SequentiallyReadNullableStringSegmentWithShuffledChunkOffsetsList) { + auto value_segment = create_string_with_null_value_segment(); + auto base_encoded_segment = encode_value_segment(DataType::String, value_segment); + + EXPECT_EQ(value_segment->size(), base_encoded_segment->size()); + + auto position_filter = create_random_access_position_filter(); + + resolve_encoded_segment_type(*base_encoded_segment, [&](const auto& encoded_segment) { + auto value_segment_iterable = create_iterable_from_segment(*value_segment); + auto encoded_segment_iterable = create_iterable_from_segment(encoded_segment); + + value_segment_iterable.with_iterators(position_filter, [&](auto value_segment_it, auto value_segment_end) { + encoded_segment_iterable.with_iterators(position_filter, [&](auto encoded_segment_it, auto encoded_segment_end) { + for (; encoded_segment_it != encoded_segment_end; ++encoded_segment_it, ++value_segment_it) { + EXPECT_EQ(value_segment_it->is_null(), encoded_segment_it->is_null()); + + if (!value_segment_it->is_null()) { + EXPECT_EQ(value_segment_it->value(), encoded_segment_it->value()); + } + } + }); + }); + }); +} + 
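For fixed-size value types, the point-access path added in the `LZ4Segment` header boils down to mapping a chunk offset to a block index and a position inside that block, plus the cache check from the second `decompress()` overload. The following is a simplified model of that arithmetic under assumed semantics (fixed `block_size` in bytes, values that never straddle block boundaries); it is not the actual Hyrise implementation.

```cpp
#include <cassert>
#include <cstddef>
#include <optional>

// Where a value lives: which block, and which slot inside that block.
struct BlockLocation {
  size_t block_index;
  size_t index_in_block;
};

// A full block holds block_size bytes, i.e. block_size / value_size values,
// so the location follows from integer division and remainder.
inline BlockLocation locate(size_t chunk_offset, size_t block_size, size_t value_size) {
  const size_t values_per_block = block_size / value_size;
  return {chunk_offset / values_per_block, chunk_offset % values_per_block};
}

// Cache decision of the cached decompress() overload: reuse the previously
// decompressed block if its index matches, otherwise decompress anew (and
// overwrite the cache buffer with the new block).
inline bool needs_decompression(size_t required_block, std::optional<size_t> cached_block_index) {
  return !cached_block_index || *cached_block_index != required_block;
}
```

With an illustrative block size of 4096 bytes and 4-byte values, chunk offset 5000 falls into block 4 at in-block index 904; a cached block index of 4 then avoids a second decompression.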
+} // namespace opossum diff --git a/src/test/storage/encoding_test.hpp b/src/test/storage/encoding_test.hpp index 14614646d8..a8f9af11b1 100644 --- a/src/test/storage/encoding_test.hpp +++ b/src/test/storage/encoding_test.hpp @@ -36,7 +36,8 @@ const SegmentEncodingSpec all_segment_encoding_specs[]{ {EncodingType::Dictionary, VectorCompressionType::FixedSizeByteAligned}, {EncodingType::Dictionary, VectorCompressionType::SimdBp128}, {EncodingType::FrameOfReference}, - {EncodingType::LZ4}, + {EncodingType::LZ4, VectorCompressionType::FixedSizeByteAligned}, + {EncodingType::LZ4, VectorCompressionType::SimdBp128}, {EncodingType::RunLength}}; } // namespace opossum diff --git a/src/test/storage/lz4_segment_test.cpp b/src/test/storage/lz4_segment_test.cpp index 8e491d8500..0e9188980c 100644 --- a/src/test/storage/lz4_segment_test.cpp +++ b/src/test/storage/lz4_segment_test.cpp @@ -6,6 +6,7 @@ #include "gtest/gtest.h" #include "storage/chunk_encoder.hpp" +#include "storage/lz4/lz4_encoder.hpp" #include "storage/lz4_segment.hpp" #include "storage/segment_encoding_utils.hpp" #include "storage/value_segment.hpp" @@ -14,19 +15,60 @@ namespace opossum { class StorageLZ4SegmentTest : public BaseTest { protected: + static constexpr auto row_count = LZ4Encoder::_block_size + size_t{1000u}; std::shared_ptr<ValueSegment<pmr_string>> vs_str = std::make_shared<ValueSegment<pmr_string>>(true); }; -TEST_F(StorageLZ4SegmentTest, CompressNullableSegmentString) { +template <typename T> +std::shared_ptr<LZ4Segment<T>> compress(std::shared_ptr<ValueSegment<T>> segment, DataType data_type) { + auto encoded_segment = encode_segment(EncodingType::LZ4, data_type, segment); + return std::dynamic_pointer_cast<LZ4Segment<T>>(encoded_segment); +} + +TEST_F(StorageLZ4SegmentTest, HandleOptionalOffsetsAndNullValues) { + auto empty_int_segment = compress(std::make_shared<ValueSegment<int32_t>>(true), DataType::Int); + EXPECT_FALSE(empty_int_segment->string_offset_decompressor()); + EXPECT_FALSE(empty_int_segment->null_values()); + + auto empty_str_segment = compress(std::make_shared<ValueSegment<pmr_string>>(true), DataType::String); + 
EXPECT_FALSE(empty_str_segment->string_offset_decompressor()); + EXPECT_FALSE(empty_str_segment->null_values()); + + vs_str->append("Alex"); + vs_str->append("Peter"); + auto str_segment = compress(vs_str, DataType::String); + EXPECT_TRUE(str_segment->string_offset_decompressor()); + EXPECT_NE(*(str_segment->string_offset_decompressor()), nullptr); + EXPECT_FALSE(str_segment->null_values()); +} + +TEST_F(StorageLZ4SegmentTest, CompressEmptyStringNotNullNullableSegment) { + for (auto index = size_t{0u}; index < row_count; ++index) { + vs_str->append(""); + } + auto lz4_segment = compress(vs_str, DataType::String); + + // Test segment size + EXPECT_EQ(lz4_segment->size(), row_count); + + // Test compressed values + auto decompressed_data = lz4_segment->decompress(); + + auto& null_values = lz4_segment->null_values(); + EXPECT_FALSE(null_values); + + const auto offset_decompressor = lz4_segment->string_offset_decompressor(); + EXPECT_FALSE(offset_decompressor); +} + +TEST_F(StorageLZ4SegmentTest, CompressNullableStringSegment) { vs_str->append("Alex"); vs_str->append("Peter"); vs_str->append("Ralf"); vs_str->append("Hans"); vs_str->append(NULL_VALUE); vs_str->append("Anna"); - - auto segment = encode_segment(EncodingType::LZ4, DataType::String, vs_str); - auto lz4_segment = std::dynamic_pointer_cast>(segment); + auto lz4_segment = compress(vs_str, DataType::String); // Test segment size EXPECT_EQ(lz4_segment->size(), 6u); @@ -37,113 +79,171 @@ TEST_F(StorageLZ4SegmentTest, CompressNullableSegmentString) { EXPECT_EQ(decompressed_data[1], "Peter"); auto& null_values = lz4_segment->null_values(); - EXPECT_EQ(null_values.size(), 6u); + EXPECT_TRUE(null_values); + EXPECT_EQ(null_values->size(), 6u); auto expected_null_values = std::vector{false, false, false, false, true, false}; - auto offsets = lz4_segment->offsets(); - EXPECT_TRUE(offsets); - EXPECT_EQ(offsets->size(), 6u); - auto expected_offsets = std::vector{0, 4, 9, 13, 17, 17}; + const auto offset_decompressor = 
lz4_segment->string_offset_decompressor();
+  EXPECT_TRUE(offset_decompressor);
+  EXPECT_EQ((*offset_decompressor)->size(), 6u);

-  for (auto index = 0u; index < lz4_segment->size(); ++index) {
+  auto expected_offsets = std::vector{0, 4, 9, 13, 17, 17};
+  for (auto index = size_t{0u}; index < lz4_segment->size(); ++index) {
     // Test null values
-    EXPECT_TRUE(null_values[index] == expected_null_values[index]);
+    EXPECT_TRUE((*null_values)[index] == expected_null_values[index]);
     // Test offsets
-    EXPECT_TRUE((*offsets)[index] == expected_offsets[index]);
+    EXPECT_TRUE((*offset_decompressor)->get(index) == expected_offsets[index]);
   }
 }

-TEST_F(StorageLZ4SegmentTest, CompressNullableAndEmptySegmentString) {
+TEST_F(StorageLZ4SegmentTest, CompressNullableAndEmptyStringSegment) {
   vs_str->append("Alex");
   vs_str->append("Peter");
   vs_str->append("Ralf");
   vs_str->append("");
   vs_str->append(NULL_VALUE);
   vs_str->append("Anna");
-
-  auto segment = encode_segment(EncodingType::LZ4, DataType::String, vs_str);
-  auto lz4_segment = std::dynamic_pointer_cast<LZ4Segment<pmr_string>>(segment);
+  auto lz4_segment = compress(vs_str, DataType::String);

   // Test segment size
   EXPECT_EQ(lz4_segment->size(), 6u);

   // The empty string should not be a null value
   auto& null_values = lz4_segment->null_values();
-  EXPECT_EQ(null_values.size(), 6u);
+  EXPECT_TRUE(null_values);
+  EXPECT_EQ(null_values->size(), 6u);
   auto expected_null_values = std::vector{false, false, false, false, true, false};

-  auto offsets = lz4_segment->offsets();
-  EXPECT_TRUE(offsets);
-  EXPECT_EQ(offsets->size(), 6u);
-  auto expected_offsets = std::vector{0, 4, 9, 13, 13, 13};
+  const auto offset_decompressor = lz4_segment->string_offset_decompressor();
+  EXPECT_TRUE(offset_decompressor);
+  EXPECT_EQ((*offset_decompressor)->size(), 6u);

-  for (auto index = 0u; index < lz4_segment->size(); ++index) {
+  auto expected_offsets = std::vector{0, 4, 9, 13, 13, 13};
+  for (auto index = size_t{0u}; index < lz4_segment->size(); ++index) {
     // Test null values
-    EXPECT_TRUE(null_values[index] == expected_null_values[index]);
+    EXPECT_TRUE((*null_values)[index] == expected_null_values[index]);
     // Test offsets
-    EXPECT_TRUE((*offsets)[index] == expected_offsets[index]);
+    EXPECT_TRUE((*offset_decompressor)->get(index) == expected_offsets[index]);
   }
 }

-TEST_F(StorageLZ4SegmentTest, CompressEmptySegmentString) {
-  for (int i = 0; i < 6; ++i) {
+TEST_F(StorageLZ4SegmentTest, CompressSingleCharStringSegment) {
+  for (auto index = size_t{0u}; index < row_count; ++index) {
     vs_str->append("");
   }
-
-  auto segment = encode_segment(EncodingType::LZ4, DataType::String, vs_str);
-  auto lz4_segment = std::dynamic_pointer_cast<LZ4Segment<pmr_string>>(segment);
+  vs_str->append("a");
+  auto lz4_segment = compress(vs_str, DataType::String);

   // Test segment size
-  EXPECT_EQ(lz4_segment->size(), 6u);
+  EXPECT_EQ(lz4_segment->size(), row_count + 1);

-  // Test compressed values
   auto decompressed_data = lz4_segment->decompress();
-  EXPECT_EQ(decompressed_data.size(), 6u);
-  for (const auto& elem : decompressed_data) {
-    EXPECT_EQ(elem, "");
-  }
+  EXPECT_EQ(decompressed_data.size(), row_count + 1);
+
+  const auto offset_decompressor = lz4_segment->string_offset_decompressor();
+  EXPECT_TRUE(offset_decompressor);
+  EXPECT_EQ((*offset_decompressor)->size(), row_count + 1);

-  // Test offsets
-  auto offsets = lz4_segment->offsets();
-  EXPECT_TRUE(offsets);
-  EXPECT_EQ(offsets->size(), 6u);
-  for (auto offset : (*offsets)) {
-    EXPECT_EQ(offset, 0);
+  for (auto index = size_t{0u}; index < lz4_segment->size() - 1; ++index) {
+    // Test compressed values
+    EXPECT_EQ(decompressed_data[index], "");
+
+    // Test offsets
+    EXPECT_EQ((*offset_decompressor)->get(index), 0);
   }
+
+  // Test last element
+  EXPECT_EQ(decompressed_data[row_count], "a");
+  // This offset is also 0 since the elements before it don't have any content
+  EXPECT_EQ((*offset_decompressor)->get(row_count), 0);
 }

-TEST_F(StorageLZ4SegmentTest, CompressSingleCharSegmentString) {
-  for (int i = 0; i < 5; ++i) {
-    vs_str->append("");
+TEST_F(StorageLZ4SegmentTest, CompressZeroOneStringSegment) {
+  for (auto index = size_t{0u}; index < row_count; ++index) {
+    vs_str->append(index % 2 ? "0" : "1");
   }
-  vs_str->append("a");
-
-  auto segment = encode_segment(EncodingType::LZ4, DataType::String, vs_str);
-  auto lz4_segment = std::dynamic_pointer_cast<LZ4Segment<pmr_string>>(segment);
+  auto lz4_segment = compress(vs_str, DataType::String);

   // Test segment size
-  EXPECT_EQ(lz4_segment->size(), 6u);
+  EXPECT_EQ(lz4_segment->size(), row_count);
+  EXPECT_TRUE(lz4_segment->dictionary().empty());

   auto decompressed_data = lz4_segment->decompress();
-  auto offsets = lz4_segment->offsets();
-  EXPECT_TRUE(offsets);
-  EXPECT_EQ(decompressed_data.size(), 6u);
-  EXPECT_EQ(offsets->size(), 6u);
-
-  for (auto index = 0u; index < lz4_segment->size() - 1; ++index) {
-    // Test compressed values
-    EXPECT_EQ(decompressed_data[index], "");
+  EXPECT_EQ(decompressed_data.size(), row_count);

-    // Test offsets
-    EXPECT_EQ((*offsets)[index], 0);
+  // Test element values
+  for (auto index = size_t{0u}; index < lz4_segment->size(); ++index) {
+    EXPECT_EQ(decompressed_data[index], index % 2 ? "0" : "1");
   }
+}

-  // Test last element
-  EXPECT_EQ(decompressed_data[5], "a");
-  // This offset is also 0 since the elements before it don't have any content
-  EXPECT_EQ((*offsets)[5], 0);
+TEST_F(StorageLZ4SegmentTest, CompressMultiBlockStringSegment) {
+  const auto block_size = LZ4Encoder::_block_size;
+  const auto size_diff = size_t{1000u};
+  static_assert(block_size > size_diff, "LZ4 block size is too small");
+
+  // Nearly fills the first block.
+  const auto string1 = pmr_string(block_size - size_diff, 'a');
+  vs_str->append(string1);
+  // Starts in the first block, completely fills the second block and reaches the third block.
+  const auto string2 = pmr_string(block_size + (2 * size_diff), 'b');
+  vs_str->append(string2);
+  // Stays in the third block.
+  const auto string3 = pmr_string(size_diff, 'c');
+  vs_str->append(string3);
+  const auto third_block_size = (string1.size() + string2.size() + string3.size()) % block_size;
+
+  auto lz4_segment = compress(vs_str, DataType::String);
+
+  // Test segment size.
+  EXPECT_EQ(lz4_segment->size(), 3u);
+
+  // Test element-wise decompression without caching.
+  EXPECT_EQ(lz4_segment->decompress(ChunkOffset{1u}), string2);
+  EXPECT_EQ(lz4_segment->decompress(ChunkOffset{2u}), string3);
+  EXPECT_EQ(lz4_segment->decompress(ChunkOffset{0u}), string1);
+
+  // Test element-wise decompression with cache.
+  auto cache = std::vector<char>{};
+  std::pair<pmr_string, size_t> result;
+
+  // First access the third block (cache miss).
+  result = lz4_segment->decompress(ChunkOffset{2u}, std::nullopt, cache);
+  EXPECT_EQ(cache.size(), third_block_size);
+  EXPECT_EQ(result.first, string3);
+  EXPECT_EQ(result.second, 2u);
+
+  /**
+   * Access the first, second and third block. The cache should be used for the third block. As a result, the buffer
+   * used for decompression will contain the second block since it needed to be decompressed.
+   */
+  result = lz4_segment->decompress(ChunkOffset{1u}, result.second, cache);
+  EXPECT_EQ(cache.size(), block_size);
+  EXPECT_EQ(result.first, string2);
+  EXPECT_EQ(result.second, 1u);
+
+  // Access the same blocks again. Now, the cache should be used for the second block and then contain the third block.
+  result = lz4_segment->decompress(ChunkOffset{1u}, result.second, cache);
+  EXPECT_EQ(cache.size(), third_block_size);
+  EXPECT_EQ(result.first, string2);
+  EXPECT_EQ(result.second, 2u);
+
+  // Access the first block (cache miss).
+  result = lz4_segment->decompress(ChunkOffset{0u}, result.second, cache);
+  EXPECT_EQ(cache.size(), block_size);
+  EXPECT_EQ(result.first, string1);
+  EXPECT_EQ(result.second, 0u);
+
+  /**
+   * Access the first, second and third block again. The cache should now be used for the first block and afterwards be
+   * overwritten with the third block.
+   */
+  result = lz4_segment->decompress(ChunkOffset{1u}, result.second, cache);
+  EXPECT_EQ(cache.size(), third_block_size);
+  EXPECT_EQ(result.first, string2);
+  EXPECT_EQ(result.second, 2u);
 }

 }  // namespace opossum
diff --git a/src/test/storage/simd_bp128_test.cpp b/src/test/storage/simd_bp128_test.cpp
index 6e16ceed45..2deb5a994b 100644
--- a/src/test/storage/simd_bp128_test.cpp
+++ b/src/test/storage/simd_bp128_test.cpp
@@ -77,7 +77,6 @@ INSTANTIATE_TEST_CASE_P(BitSizes, SimdBp128Test, ::testing::Range(uint8_t{1}, ui
 TEST_P(SimdBp128Test, DecompressSequenceUsingIterators) {
   const auto sequence = generate_sequence(420);
   const auto compressed_sequence_base = compress(sequence);
-
   auto compressed_sequence = dynamic_cast<const SimdBp128Vector*>(compressed_sequence_base.get());
   EXPECT_NE(compressed_sequence, nullptr);
@@ -103,4 +102,18 @@ TEST_P(SimdBp128Test, DecompressSequenceUsingDecompressor) {
   }
 }

+TEST_P(SimdBp128Test, CompressEmptySequence) {
+  const auto sequence = generate_sequence(0);
+  const auto compressed_sequence_base = compress(sequence);
+
+  ASSERT_EQ(compressed_sequence_base->size(), 0u);
+  ASSERT_EQ(compressed_sequence_base->data_size(), 0u);
+
+  auto compressed_sequence = dynamic_cast<const SimdBp128Vector*>(compressed_sequence_base.get());
+  EXPECT_NE(compressed_sequence, nullptr);
+
+  auto decompressor = compressed_sequence->create_base_decompressor();
+  ASSERT_EQ(decompressor->size(), 0u);
+}
+
 }  // namespace opossum
diff --git a/third_party/CMakeLists.txt b/third_party/CMakeLists.txt
index d32a71d574..4bab4389e3 100644
--- a/third_party/CMakeLists.txt
+++ b/third_party/CMakeLists.txt
@@ -117,6 +117,44 @@ target_include_directories(
     ${LZ4_LIBRARY_DIR}
 )

+## Build zstd
+set(ZSTD_LIBRARY_DIR zstd/lib)
+
+add_library (zstd
+    ${ZSTD_LIBRARY_DIR}/compress/zstd_lazy.c
+    ${ZSTD_LIBRARY_DIR}/compress/zstdmt_compress.c
+    ${ZSTD_LIBRARY_DIR}/compress/zstd_double_fast.c
+    ${ZSTD_LIBRARY_DIR}/compress/zstd_fast.c
+    ${ZSTD_LIBRARY_DIR}/compress/hist.c
+    ${ZSTD_LIBRARY_DIR}/compress/fse_compress.c
+    ${ZSTD_LIBRARY_DIR}/compress/zstd_opt.c
+    ${ZSTD_LIBRARY_DIR}/compress/zstd_compress.c
+    ${ZSTD_LIBRARY_DIR}/compress/huf_compress.c
+    ${ZSTD_LIBRARY_DIR}/compress/zstd_ldm.c
+    ${ZSTD_LIBRARY_DIR}/common/xxhash.c
+    ${ZSTD_LIBRARY_DIR}/common/fse_decompress.c
+    ${ZSTD_LIBRARY_DIR}/common/pool.c
+    ${ZSTD_LIBRARY_DIR}/common/zstd_common.c
+    ${ZSTD_LIBRARY_DIR}/common/error_private.c
+    ${ZSTD_LIBRARY_DIR}/common/debug.c
+    ${ZSTD_LIBRARY_DIR}/common/threading.c
+    ${ZSTD_LIBRARY_DIR}/common/entropy_common.c
+    ${ZSTD_LIBRARY_DIR}/dictBuilder/zdict.c
+    ${ZSTD_LIBRARY_DIR}/dictBuilder/divsufsort.c
+    ${ZSTD_LIBRARY_DIR}/dictBuilder/cover.c
+    ${ZSTD_LIBRARY_DIR}/dictBuilder/fastcover.c
+)
+
+target_include_directories(
+    zstd
+
+    PRIVATE
+    ${ZSTD_LIBRARY_DIR}
+    ${ZSTD_LIBRARY_DIR}/common
+    ${ZSTD_LIBRARY_DIR}/compress
+    ${ZSTD_LIBRARY_DIR}/dictBuilder
+)
+
 # TODO(lawben): There is currently a PR open for CMake support in libpqxx.
 # Once that is merged, this should be updated to add_subdirectory(libpqxx)
 add_library(
diff --git a/third_party/zstd b/third_party/zstd
new file mode 160000
index 0000000000..470344d33e
--- /dev/null
+++ b/third_party/zstd
@@ -0,0 +1 @@
+Subproject commit 470344d33e1d52a2ada75d278466da8d4ee2faf6
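The `CompressMultiBlockStringSegment` test above exercises the single-block decompression cache: a value spanning several blocks reuses the cached block where possible, and afterwards the cache holds the last block that actually had to be decompressed. As a rough, self-contained sketch of that caching protocol (plain byte copying stands in for real LZ4 decompression; `BlockedStringSegment` and all its members are hypothetical names, not the Hyrise API):

```cpp
#include <algorithm>
#include <cassert>
#include <cstddef>
#include <optional>
#include <string>
#include <utility>
#include <vector>

// Hypothetical stand-in for a block-wise string segment: all values are concatenated
// and cut into fixed-size blocks. "Decompressing" a block is just a copy here, since
// the point is the caching protocol, not LZ4 itself.
struct BlockedStringSegment {
  static constexpr std::size_t block_size = 16;  // tiny for illustration

  std::string data;                  // concatenation of all appended values
  std::vector<std::size_t> offsets;  // start offset of each value within `data`

  void append(const std::string& value) {
    offsets.push_back(data.size());
    data += value;
  }

  // Models the expensive per-block decompression step.
  void load_block(std::size_t block, std::string& buffer) const {
    buffer = data.substr(block * block_size, block_size);
  }

  // Returns the value at `index` and the index of the block left in `cache`.
  // `cached_block` says which block `cache` currently holds (if any).
  std::pair<std::string, std::size_t> decompress(std::size_t index,
                                                 std::optional<std::size_t> cached_block,
                                                 std::string& cache) const {
    const auto begin = offsets[index];
    const auto end = index + 1 < offsets.size() ? offsets[index + 1] : data.size();
    std::string value;
    std::string buffer;            // most recently decompressed block
    auto resident = cached_block;  // block that will sit in `cache` afterwards
    for (auto block = begin / block_size; block * block_size < end; ++block) {
      const std::string* source = nullptr;
      if (cached_block && *cached_block == block) {
        source = &cache;           // cache hit: reuse the previous decompression
      } else {
        load_block(block, buffer); // cache miss: decompress into the working buffer
        resident = block;
        source = &buffer;
      }
      const auto block_begin = block * block_size;
      const auto from = begin > block_begin ? begin - block_begin : std::size_t{0};
      const auto to = std::min(end - block_begin, source->size());
      value += source->substr(from, to - from);
    }
    if (resident != cached_block) cache = buffer;  // keep the last decompressed block
    assert(resident);  // some block was either decompressed or already cached
    return {value, *resident};
  }
};
```

With a 12-byte, a 24-byte, and a 4-byte value and 16-byte blocks, repeated accesses reproduce the cache-hit/miss sequence the test asserts: reading the middle value with block 2 cached decompresses blocks 0 and 1 and leaves block 1 cached; reading it again then hits block 1 and leaves block 2 cached.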