From b93fde4d4e85ab4784b04b7acfcd9988eda91b97 Mon Sep 17 00:00:00 2001
From: Felix Meisen <fesemeisen@outlook.de>
Date: Sat, 28 Dec 2024 01:52:18 +0100
Subject: [PATCH 01/15] Extra classes for Words- and Docsfile parsing

---
 src/global/IndexTypes.h               |  1 +
 src/index/IndexImpl.Text.cpp          | 47 ++++++++++---------
 src/index/IndexImpl.h                 |  4 +-
 src/parser/CMakeLists.txt             |  2 +-
 src/parser/ContextFileParser.cpp      | 46 -------------------
 src/parser/ContextFileParser.h        | 45 ------------------
 src/parser/WordsAndDocsFileParser.cpp | 61 +++++++++++++++++++++++++
 src/parser/WordsAndDocsFileParser.h   | 63 +++++++++++++++++++++++++
 test/CMakeLists.txt                   |  2 +-
 test/ContextFileParserTest.cpp        | 59 ------------------------
 test/WordsAndDocsFileParserTest.cpp   | 66 +++++++++++++++++++++++++++
 11 files changed, 218 insertions(+), 178 deletions(-)
 delete mode 100644 src/parser/ContextFileParser.cpp
 delete mode 100644 src/parser/ContextFileParser.h
 create mode 100644 src/parser/WordsAndDocsFileParser.cpp
 create mode 100644 src/parser/WordsAndDocsFileParser.h
 delete mode 100644 test/ContextFileParserTest.cpp
 create mode 100644 test/WordsAndDocsFileParserTest.cpp

diff --git a/src/global/IndexTypes.h b/src/global/IndexTypes.h
index 08ee960d00..4868e59694 100644
--- a/src/global/IndexTypes.h
+++ b/src/global/IndexTypes.h
@@ -16,3 +16,4 @@ using LocalVocabIndex = const LocalVocabEntry*;
 using TextRecordIndex = ad_utility::TypedIndex<uint64_t, "TextRecordIndex">;
 using WordVocabIndex = ad_utility::TypedIndex<uint64_t, "WordVocabIndex">;
 using BlankNodeIndex = ad_utility::TypedIndex<uint64_t, "BlankNodeIndex">;
+using DocumentIndex = ad_utility::TypedIndex<uint64_t, "DocumentIndex">;
diff --git a/src/index/IndexImpl.Text.cpp b/src/index/IndexImpl.Text.cpp
index 76c0015974..022c49b8cd 100644
--- a/src/index/IndexImpl.Text.cpp
+++ b/src/index/IndexImpl.Text.cpp
@@ -17,7 +17,7 @@
 #include "backports/algorithm.h"
 #include "engine/CallFixedSize.h"
 #include "index/FTSAlgorithms.h"
-#include "parser/ContextFileParser.h"
+#include "parser/WordsAndDocsFileParser.h"
 #include "util/Conversions.h"
 #include "util/Simple8bCode.h"
 
@@ -38,18 +38,17 @@ struct LiteralsTokenizationDelimiter {
 } // namespace
 
 // _____________________________________________________________________________
-cppcoro::generator<ContextFileParser::Line> IndexImpl::wordsInTextRecords(
+cppcoro::generator<WordsFileLine> IndexImpl::wordsInTextRecords(
     const std::string& contextFile, bool addWordsFromLiterals) {
   auto localeManager = textVocab_.getLocaleManager();
   // ROUND 1: If context file aka wordsfile is not empty, read words from there.
   // Remember the last context id for the (optional) second round.
TextRecordIndex contextId = TextRecordIndex::make(0); if (!contextFile.empty()) { - ContextFileParser::Line line; - ContextFileParser p(contextFile, localeManager); + WordsFileParser p(contextFile, localeManager); ad_utility::HashSet items; - while (p.getLine(line)) { - contextId = line._contextId; + for (auto line : p) { + contextId = line.contextId_; co_yield line; } if (contextId > TextRecordIndex::make(0)) { @@ -65,7 +64,7 @@ cppcoro::generator IndexImpl::wordsInTextRecords( if (!isLiteral(text)) { continue; } - ContextFileParser::Line entityLine{text, true, contextId, 1, true}; + WordsFileLine entityLine{text, true, contextId, 1, true}; co_yield entityLine; std::string_view textView = text; textView = textView.substr(0, textView.rfind('"')); @@ -73,7 +72,7 @@ cppcoro::generator IndexImpl::wordsInTextRecords( for (auto word : absl::StrSplit(textView, LiteralsTokenizationDelimiter{}, absl::SkipEmpty{})) { auto wordNormalized = localeManager.getLowercaseUtf8(word); - ContextFileParser::Line wordLine{wordNormalized, false, contextId, 1}; + WordsFileLine wordLine{wordNormalized, false, contextId, 1}; co_yield wordLine; } contextId = contextId.incremented(); @@ -214,12 +213,12 @@ size_t IndexImpl::processWordsForVocabulary(string const& contextFile, for (auto line : wordsInTextRecords(contextFile, addWordsFromLiterals)) { ++numLines; // LOG(INFO) << "LINE: " - // << std::setw(50) << line._word << " " - // << line._isEntity << "\t" - // << line._contextId.get() << "\t" - // << line._score << std::endl; - if (!line._isEntity) { - distinctWords.insert(line._word); + // << std::setw(50) << line.word_ << " " + // << line.isEntity_ << "\t" + // << line.contextId_.get() << "\t" + // << line.score_ << std::endl; + if (!line.isEntity_) { + distinctWords.insert(line.word_); } } textVocab_.createFromSet(distinctWords, onDiskBase_ + ".text.vocabulary"); @@ -243,29 +242,29 @@ void IndexImpl::processWordsForInvertedLists(const string& contextFile, size_t nofLiterals = 0; for (auto line : wordsInTextRecords(contextFile, addWordsFromLiterals)) { - if (line._contextId != currentContext) { + if (line.contextId_ != currentContext) { ++nofContexts; addContextToVector(writer, currentContext, wordsInContext, entitiesInContext); - currentContext = line._contextId; + currentContext = line.contextId_; wordsInContext.clear(); entitiesInContext.clear(); } - if (line._isEntity) { + if (line.isEntity_) { ++nofEntityPostings; // TODO Currently only IRIs and strings from the vocabulary can // be tagged entities in the text index (no doubles, ints, etc). VocabIndex eid; - if (getVocab().getId(line._word, &eid)) { + if (getVocab().getId(line.word_, &eid)) { // Note that `entitiesInContext` is a HashMap, so the `Id`s don't have // to be contiguous. - entitiesInContext[Id::makeFromVocabIndex(eid)] += line._score; - if (line._isLiteralEntity) { + entitiesInContext[Id::makeFromVocabIndex(eid)] += line.score_; + if (line.isLiteralEntity_) { ++nofLiterals; } } else { if (entityNotFoundErrorMsgCount < 20) { - LOG(WARN) << "Entity from text not in KB: " << line._word << '\n'; + LOG(WARN) << "Entity from text not in KB: " << line.word_ << '\n'; if (++entityNotFoundErrorMsgCount == 20) { LOG(WARN) << "There are more entities not in the KB..." << " suppressing further warnings...\n"; @@ -278,14 +277,14 @@ void IndexImpl::processWordsForInvertedLists(const string& contextFile, ++nofWordPostings; // TODO Let the `textVocab_` return a `WordIndex` directly. 
WordVocabIndex vid; - bool ret = textVocab_.getId(line._word, &vid); + bool ret = textVocab_.getId(line.word_, &vid); WordIndex wid = vid.get(); if (!ret) { - LOG(ERROR) << "ERROR: word \"" << line._word << "\" " + LOG(ERROR) << "ERROR: word \"" << line.word_ << "\" " << "not found in textVocab. Terminating\n"; AD_FAIL(); } - wordsInContext[wid] += line._score; + wordsInContext[wid] += line.score_; } } if (entityNotFoundErrorMsgCount > 0) { diff --git a/src/index/IndexImpl.h b/src/index/IndexImpl.h index b98d0d5788..2fab089280 100644 --- a/src/index/IndexImpl.h +++ b/src/index/IndexImpl.h @@ -29,9 +29,9 @@ #include "index/TextMetaData.h" #include "index/Vocabulary.h" #include "index/VocabularyMerger.h" -#include "parser/ContextFileParser.h" #include "parser/RdfParser.h" #include "parser/TripleComponent.h" +#include "parser/WordsAndDocsFileParser.h" #include "util/BufferedVector.h" #include "util/CancellationHandle.h" #include "util/File.h" @@ -515,7 +515,7 @@ class IndexImpl { // TODO: So far, this is limited to the internal vocabulary (still in the // testing phase, once it works, it should be easy to include the IRIs and // literals from the external vocabulary as well). - cppcoro::generator wordsInTextRecords( + cppcoro::generator wordsInTextRecords( const std::string& contextFile, bool addWordsFromLiterals); size_t processWordsForVocabulary(const string& contextFile, diff --git a/src/parser/CMakeLists.txt b/src/parser/CMakeLists.txt index be4b3db44c..6fa123a793 100644 --- a/src/parser/CMakeLists.txt +++ b/src/parser/CMakeLists.txt @@ -10,7 +10,7 @@ add_library(parser ParsedQuery.cpp RdfParser.cpp Tokenizer.cpp - ContextFileParser.cpp + WordsAndDocsFileParser.cpp TurtleTokenId.h ParallelBuffer.cpp SparqlParserHelpers.cpp diff --git a/src/parser/ContextFileParser.cpp b/src/parser/ContextFileParser.cpp deleted file mode 100644 index 523bde486b..0000000000 --- a/src/parser/ContextFileParser.cpp +++ /dev/null @@ -1,46 +0,0 @@ -// Copyright 2015, University of Freiburg, -// Chair of Algorithms and Data Structures. -// Author: Björn Buchhold (buchhold@informatik.uni-freiburg.de) - -#include "./ContextFileParser.h" - -#include - -#include "../util/Exception.h" -#include "../util/StringUtils.h" - -// _____________________________________________________________________________ -ContextFileParser::ContextFileParser(const string& contextFile, - LocaleManager localeManager) - : _in(contextFile), _localeManager(std::move(localeManager)) {} - -// _____________________________________________________________________________ -ContextFileParser::~ContextFileParser() { _in.close(); } - -// _____________________________________________________________________________ -bool ContextFileParser::getLine(ContextFileParser::Line& line) { - string l; - if (std::getline(_in, l)) { - size_t i = l.find('\t'); - assert(i != string::npos); - size_t j = i + 2; - assert(j + 3 < l.size()); - size_t k = l.find('\t', j + 2); - assert(k != string::npos); - line._isEntity = (l[i + 1] == '1'); - line._word = - (line._isEntity ? 
l.substr(0, i) - : _localeManager.getLowercaseUtf8(l.substr(0, i))); - line._contextId = - TextRecordIndex::make(atol(l.substr(j + 1, k - j - 1).c_str())); - line._score = static_cast(atol(l.substr(k + 1).c_str())); -#ifndef NDEBUG - if (_lastCId > line._contextId) { - AD_THROW("ContextFile has to be sorted by context Id."); - } - _lastCId = line._contextId; -#endif - return true; - } - return false; -} diff --git a/src/parser/ContextFileParser.h b/src/parser/ContextFileParser.h deleted file mode 100644 index ba8d7bac9c..0000000000 --- a/src/parser/ContextFileParser.h +++ /dev/null @@ -1,45 +0,0 @@ -// Copyright 2015, University of Freiburg, -// Chair of Algorithms and Data Structures. -// Author: Björn Buchhold (buchhold@informatik.uni-freiburg.de) - -#pragma once - -#include - -#include -#include - -#include "../global/Id.h" -#include "../index/StringSortComparator.h" - -using std::string; - -class ContextFileParser { - public: - struct Line { - string _word; - bool _isEntity; - TextRecordIndex _contextId; - Score _score; - bool _isLiteralEntity = false; - }; - - explicit ContextFileParser(const string& contextFile, - LocaleManager localeManager); - ~ContextFileParser(); - // Don't allow copy & assignment - explicit ContextFileParser(const ContextFileParser& other) = delete; - ContextFileParser& operator=(const ContextFileParser& other) = delete; - - // Get the next line from the file. - // Returns true if something was stored. - bool getLine(Line&); - - private: - std::ifstream _in; -#ifndef NDEBUG - // Only used for sanity checks in debug builds - TextRecordIndex _lastCId = TextRecordIndex::make(0); -#endif - LocaleManager _localeManager; -}; diff --git a/src/parser/WordsAndDocsFileParser.cpp b/src/parser/WordsAndDocsFileParser.cpp new file mode 100644 index 0000000000..e45f47c53a --- /dev/null +++ b/src/parser/WordsAndDocsFileParser.cpp @@ -0,0 +1,61 @@ +// Copyright 2015, University of Freiburg, +// Chair of Algorithms and Data Structures. +// Author: Björn Buchhold (buchhold@informatik.uni-freiburg.de) + +#include "parser/WordsAndDocsFileParser.h" + +#include + +#include "../util/Exception.h" +#include "../util/StringUtils.h" + +// _____________________________________________________________________________ +WordsAndDocsFileParser::WordsAndDocsFileParser(const string& wordsOrDocsFile, + LocaleManager localeManager) + : in_(wordsOrDocsFile), localeManager_(std::move(localeManager)) {} + +// _____________________________________________________________________________ +WordsAndDocsFileParser::~WordsAndDocsFileParser() { in_.close(); } + +// _____________________________________________________________________________ +ad_utility::InputRangeFromGet::Storage WordsFileParser::get() { + WordsFileLine line; + string l; + if (std::getline(in_, l)) { + size_t i = l.find('\t'); + assert(i != string::npos); + size_t j = i + 2; + assert(j + 3 < l.size()); + size_t k = l.find('\t', j + 2); + assert(k != string::npos); + line.isEntity_ = (l[i + 1] == '1'); + line.word_ = + (line.isEntity_ ? 
l.substr(0, i) + : localeManager_.getLowercaseUtf8(l.substr(0, i))); + line.contextId_ = + TextRecordIndex::make(atol(l.substr(j + 1, k - j - 1).c_str())); + line.score_ = static_cast(atol(l.substr(k + 1).c_str())); +#ifndef NDEBUG + if (lastCId_ > line.contextId_) { + AD_THROW("ContextFile has to be sorted by context Id."); + } + lastCId_ = line.contextId_; +#endif + return line; + } + return std::nullopt; +} + +// _____________________________________________________________________________ +ad_utility::InputRangeFromGet::Storage DocsFileParser::get() { + DocsFileLine line; + string l; + if (std::getline(in_, l)) { + size_t i = l.find('\t'); + assert(i != string::npos); + line.docId_ = DocumentIndex::make(atol(l.substr(0, i).c_str())); + line.docContent_ = l.substr(i + 1); + return line; + } + return std::nullopt; +} diff --git a/src/parser/WordsAndDocsFileParser.h b/src/parser/WordsAndDocsFileParser.h new file mode 100644 index 0000000000..c72bf24d61 --- /dev/null +++ b/src/parser/WordsAndDocsFileParser.h @@ -0,0 +1,63 @@ +// Copyright 2015, University of Freiburg, +// Chair of Algorithms and Data Structures. +// Author: Björn Buchhold (buchhold@informatik.uni-freiburg.de) + +#pragma once + +#include + +#include +#include + +#include "global/Id.h" +#include "index/StringSortComparator.h" +#include "util/Iterators.h" + +using std::string; + +struct WordsFileLine { + string word_; + bool isEntity_; + TextRecordIndex contextId_; + Score score_; + bool isLiteralEntity_ = false; +}; + +struct DocsFileLine { + string docContent_; + DocumentIndex docId_; +}; + +class WordsAndDocsFileParser { + public: + explicit WordsAndDocsFileParser(const string& wordsOrDocsFile, + LocaleManager localeManager); + ~WordsAndDocsFileParser(); + explicit WordsAndDocsFileParser(const WordsAndDocsFileParser& other) = delete; + WordsAndDocsFileParser& operator=(const WordsAndDocsFileParser& other) = + delete; + + protected: + std::ifstream in_; + LocaleManager localeManager_; +}; + +class WordsFileParser : public WordsAndDocsFileParser, + public ad_utility::InputRangeFromGet { + public: + using WordsAndDocsFileParser::WordsAndDocsFileParser; + Storage get() override; + + private: +#ifndef NDEBUG + // Only used for sanity checks in debug builds + TextRecordIndex lastCId_ = TextRecordIndex::make(0); +#endif +}; + +class DocsFileParser : public WordsAndDocsFileParser, + public ad_utility::InputRangeFromGet { + public: + using WordsAndDocsFileParser::WordsAndDocsFileParser; + Storage get() override; +}; diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index 46a83d5b7a..a8cd128e5c 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -137,7 +137,7 @@ addLinkAndDiscoverTestSerial(FileTest) addLinkAndDiscoverTest(Simple8bTest) -addLinkAndDiscoverTest(ContextFileParserTest parser) +addLinkAndDiscoverTest(WordsAndDocsFileParserTest parser) addLinkAndDiscoverTest(IndexMetaDataTest index) diff --git a/test/ContextFileParserTest.cpp b/test/ContextFileParserTest.cpp deleted file mode 100644 index 2b27c0f34d..0000000000 --- a/test/ContextFileParserTest.cpp +++ /dev/null @@ -1,59 +0,0 @@ -// Copyright 2015, University of Freiburg, -// Chair of Algorithms and Data Structures. 
-// Author: Björn Buchhold (buchhold@informatik.uni-freiburg.de) - -#include - -#include -#include - -#include "../src/parser/ContextFileParser.h" - -TEST(ContextFileParserTest, getLineTest) { - char* locale = setlocale(LC_CTYPE, ""); - std::cout << "Set locale LC_CTYPE to: " << locale << std::endl; - - std::fstream f("_testtmp.contexts.tsv", std::ios_base::out); - f << "Foo\t0\t0\t2\n" - "foo\t0\t0\t2\n" - "Bär\t1\t0\t1\n" - "Äü\t0\t0\t1\n" - "X\t0\t1\t1\n"; - - f.close(); - ContextFileParser p("_testtmp.contexts.tsv", - LocaleManager("en", "US", false)); - ContextFileParser::Line a; - ASSERT_TRUE(p.getLine(a)); - ASSERT_EQ("foo", a._word); - ASSERT_FALSE(a._isEntity); - ASSERT_EQ(0u, a._contextId.get()); - ASSERT_EQ(2u, a._score); - - ASSERT_TRUE(p.getLine(a)); - ASSERT_EQ("foo", a._word); - ASSERT_FALSE(a._isEntity); - ASSERT_EQ(0u, a._contextId.get()); - ASSERT_EQ(2u, a._score); - - ASSERT_TRUE(p.getLine(a)); - ASSERT_EQ("Bär", a._word); - ASSERT_TRUE(a._isEntity); - ASSERT_EQ(0u, a._contextId.get()); - ASSERT_EQ(1u, a._score); - - ASSERT_TRUE(p.getLine(a)); - ASSERT_EQ("äü", a._word); - ASSERT_FALSE(a._isEntity); - ASSERT_EQ(0u, a._contextId.get()); - ASSERT_EQ(1u, a._score); - - ASSERT_TRUE(p.getLine(a)); - ASSERT_EQ("x", a._word); - ASSERT_FALSE(a._isEntity); - ASSERT_EQ(1u, a._contextId.get()); - ASSERT_EQ(1u, a._score); - - ASSERT_FALSE(p.getLine(a)); - remove("_testtmp.contexts.tsv"); -}; diff --git a/test/WordsAndDocsFileParserTest.cpp b/test/WordsAndDocsFileParserTest.cpp new file mode 100644 index 0000000000..bc3d1950a5 --- /dev/null +++ b/test/WordsAndDocsFileParserTest.cpp @@ -0,0 +1,66 @@ +// Copyright 2015, University of Freiburg, +// Chair of Algorithms and Data Structures. +// Author: Björn Buchhold (buchhold@informatik.uni-freiburg.de) + +#include + +#include +#include + +#include "parser/WordsAndDocsFileParser.h" + +TEST(WordsAndDocsFileParserTest, wordsFileGetLineTest) { + char* locale = setlocale(LC_CTYPE, ""); + std::cout << "Set locale LC_CTYPE to: " << locale << std::endl; + + std::fstream f("_testtmp.contexts.tsv", std::ios_base::out); + f << "Foo\t0\t0\t2\n" + "foo\t0\t0\t2\n" + "Bär\t1\t0\t1\n" + "Äü\t0\t0\t1\n" + "X\t0\t1\t1\n"; + + f.close(); + WordsFileParser p("_testtmp.contexts.tsv", LocaleManager("en", "US", false)); + size_t i = 0; + for (auto line : p) { + switch (i) { + case 0: + ASSERT_EQ("foo", line.word_); + ASSERT_FALSE(line.isEntity_); + ASSERT_EQ(0u, line.contextId_.get()); + ASSERT_EQ(2u, line.score_); + break; + case 1: + ASSERT_EQ("foo", line.word_); + ASSERT_FALSE(line.isEntity_); + ASSERT_EQ(0u, line.contextId_.get()); + ASSERT_EQ(2u, line.score_); + break; + case 2: + ASSERT_EQ("Bär", line.word_); + ASSERT_TRUE(line.isEntity_); + ASSERT_EQ(0u, line.contextId_.get()); + ASSERT_EQ(1u, line.score_); + break; + case 3: + ASSERT_EQ("äü", line.word_); + ASSERT_FALSE(line.isEntity_); + ASSERT_EQ(0u, line.contextId_.get()); + ASSERT_EQ(1u, line.score_); + break; + case 4: + ASSERT_EQ("x", line.word_); + ASSERT_FALSE(line.isEntity_); + ASSERT_EQ(1u, line.contextId_.get()); + ASSERT_EQ(1u, line.score_); + break; + default: + // shouldn't be reached + ASSERT_TRUE(false); + } + ++i; + } + ASSERT_EQ(i, 5); + remove("_testtmp.contexts.tsv"); +}; From 9c40084f65a30eebeb4c5669dab1f28547c3028f Mon Sep 17 00:00:00 2001 From: Felix Meisen Date: Sat, 28 Dec 2024 02:55:16 +0100 Subject: [PATCH 02/15] Added method to tokenize and normalize at the same time. 
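
The new helper wraps the existing LiteralsTokenizationDelimiter-based
absl::StrSplit call in a coroutine generator, so tokens are still produced
lazily. A minimal usage sketch (assuming a LocaleManager `localeManager`, e.g.
the one obtained from the text vocabulary in `wordsInTextRecords`):

  // Yields "some", then "text": the input is split on every
  // non-alphanumeric character and each token is lowercased.
  for (auto word : tokenizeAndNormalizeTextLine("Some TEXT!", localeManager)) {
    // ... use the normalized word ...
  }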
--- src/index/IndexImpl.Text.cpp | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/src/index/IndexImpl.Text.cpp b/src/index/IndexImpl.Text.cpp index 022c49b8cd..1bc6101c7c 100644 --- a/src/index/IndexImpl.Text.cpp +++ b/src/index/IndexImpl.Text.cpp @@ -35,6 +35,16 @@ struct LiteralsTokenizationDelimiter { } }; +cppcoro::generator tokenizeAndNormalizeTextLine( + std::string_view lineView, LocaleManager localeManager) { + // Currently it is not possible to use std::views or std::ranges with the + // splitter object returned by absl::StrSplit. Every solution I have seen + // will remove the lazy nature of StrSplit and views/ranges. (2024-12-28) + for (auto word : absl::StrSplit(lineView, LiteralsTokenizationDelimiter{}, + absl::SkipEmpty{})) { + co_yield localeManager.getLowercaseUtf8(word); + } +} } // namespace // _____________________________________________________________________________ @@ -69,10 +79,8 @@ cppcoro::generator IndexImpl::wordsInTextRecords( std::string_view textView = text; textView = textView.substr(0, textView.rfind('"')); textView.remove_prefix(1); - for (auto word : absl::StrSplit(textView, LiteralsTokenizationDelimiter{}, - absl::SkipEmpty{})) { - auto wordNormalized = localeManager.getLowercaseUtf8(word); - WordsFileLine wordLine{wordNormalized, false, contextId, 1}; + for (auto word : tokenizeAndNormalizeTextLine(textView, localeManager)) { + WordsFileLine wordLine{word, false, contextId, 1}; co_yield wordLine; } contextId = contextId.incremented(); From c365935f1a54e4106dcebd8bbfaa5330b85b0030 Mon Sep 17 00:00:00 2001 From: Felix Meisen Date: Sat, 28 Dec 2024 05:02:10 +0100 Subject: [PATCH 03/15] Added the tokenization to the ql_utility namespace --- src/index/IndexImpl.Text.cpp | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/src/index/IndexImpl.Text.cpp b/src/index/IndexImpl.Text.cpp index 1bc6101c7c..42d7bd7496 100644 --- a/src/index/IndexImpl.Text.cpp +++ b/src/index/IndexImpl.Text.cpp @@ -21,7 +21,7 @@ #include "util/Conversions.h" #include "util/Simple8bCode.h" -namespace { +namespace ql_utility { // Custom delimiter class for tokenization of literals using `absl::StrSplit`. // The `Find` function returns the next delimiter in `text` after the given @@ -45,7 +45,7 @@ cppcoro::generator tokenizeAndNormalizeTextLine( co_yield localeManager.getLowercaseUtf8(word); } } -} // namespace +} // namespace ql_utility // _____________________________________________________________________________ cppcoro::generator IndexImpl::wordsInTextRecords( @@ -79,7 +79,8 @@ cppcoro::generator IndexImpl::wordsInTextRecords( std::string_view textView = text; textView = textView.substr(0, textView.rfind('"')); textView.remove_prefix(1); - for (auto word : tokenizeAndNormalizeTextLine(textView, localeManager)) { + for (auto word : + ql_utility::tokenizeAndNormalizeTextLine(textView, localeManager)) { WordsFileLine wordLine{word, false, contextId, 1}; co_yield wordLine; } From 479b763534f7aec0bafd20348188fa0e0c6f6df7 Mon Sep 17 00:00:00 2001 From: Felix Meisen Date: Sat, 28 Dec 2024 05:06:47 +0100 Subject: [PATCH 04/15] Revert "Added the tokenization to the ql_utility namespace" This reverts commit c365935f1a54e4106dcebd8bbfaa5330b85b0030. 
--- src/index/IndexImpl.Text.cpp | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/src/index/IndexImpl.Text.cpp b/src/index/IndexImpl.Text.cpp index 42d7bd7496..1bc6101c7c 100644 --- a/src/index/IndexImpl.Text.cpp +++ b/src/index/IndexImpl.Text.cpp @@ -21,7 +21,7 @@ #include "util/Conversions.h" #include "util/Simple8bCode.h" -namespace ql_utility { +namespace { // Custom delimiter class for tokenization of literals using `absl::StrSplit`. // The `Find` function returns the next delimiter in `text` after the given @@ -45,7 +45,7 @@ cppcoro::generator tokenizeAndNormalizeTextLine( co_yield localeManager.getLowercaseUtf8(word); } } -} // namespace ql_utility +} // namespace // _____________________________________________________________________________ cppcoro::generator IndexImpl::wordsInTextRecords( @@ -79,8 +79,7 @@ cppcoro::generator IndexImpl::wordsInTextRecords( std::string_view textView = text; textView = textView.substr(0, textView.rfind('"')); textView.remove_prefix(1); - for (auto word : - ql_utility::tokenizeAndNormalizeTextLine(textView, localeManager)) { + for (auto word : tokenizeAndNormalizeTextLine(textView, localeManager)) { WordsFileLine wordLine{word, false, contextId, 1}; co_yield wordLine; } From d0ec708744a9f634bd53c54a8d8e563d4dd8c215 Mon Sep 17 00:00:00 2001 From: Felix Meisen Date: Thu, 2 Jan 2025 11:22:24 +0100 Subject: [PATCH 05/15] Used the custom InputRangeMixin to lazily tokenize and normalize words of texts --- src/index/IndexImpl.Text.cpp | 29 ++------------- src/parser/WordsAndDocsFileParser.cpp | 19 ++++++++++ src/parser/WordsAndDocsFileParser.h | 52 +++++++++++++++++++++++++++ 3 files changed, 73 insertions(+), 27 deletions(-) diff --git a/src/index/IndexImpl.Text.cpp b/src/index/IndexImpl.Text.cpp index 1bc6101c7c..9f9e11ec68 100644 --- a/src/index/IndexImpl.Text.cpp +++ b/src/index/IndexImpl.Text.cpp @@ -21,32 +21,6 @@ #include "util/Conversions.h" #include "util/Simple8bCode.h" -namespace { - -// Custom delimiter class for tokenization of literals using `absl::StrSplit`. -// The `Find` function returns the next delimiter in `text` after the given -// `pos` or an empty substring if there is no next delimiter. -struct LiteralsTokenizationDelimiter { - absl::string_view Find(absl::string_view text, size_t pos) { - auto isWordChar = [](char c) -> bool { return std::isalnum(c); }; - auto found = std::find_if_not(text.begin() + pos, text.end(), isWordChar); - if (found == text.end()) return text.substr(text.size()); - return {found, found + 1}; - } -}; - -cppcoro::generator tokenizeAndNormalizeTextLine( - std::string_view lineView, LocaleManager localeManager) { - // Currently it is not possible to use std::views or std::ranges with the - // splitter object returned by absl::StrSplit. Every solution I have seen - // will remove the lazy nature of StrSplit and views/ranges. 
(2024-12-28) - for (auto word : absl::StrSplit(lineView, LiteralsTokenizationDelimiter{}, - absl::SkipEmpty{})) { - co_yield localeManager.getLowercaseUtf8(word); - } -} -} // namespace - // _____________________________________________________________________________ cppcoro::generator IndexImpl::wordsInTextRecords( const std::string& contextFile, bool addWordsFromLiterals) { @@ -79,7 +53,8 @@ cppcoro::generator IndexImpl::wordsInTextRecords( std::string_view textView = text; textView = textView.substr(0, textView.rfind('"')); textView.remove_prefix(1); - for (auto word : tokenizeAndNormalizeTextLine(textView, localeManager)) { + TokenizeAndNormalizeText normalizedWords(textView, localeManager); + for (auto word : normalizedWords) { WordsFileLine wordLine{word, false, contextId, 1}; co_yield wordLine; } diff --git a/src/parser/WordsAndDocsFileParser.cpp b/src/parser/WordsAndDocsFileParser.cpp index e45f47c53a..b66932fbfe 100644 --- a/src/parser/WordsAndDocsFileParser.cpp +++ b/src/parser/WordsAndDocsFileParser.cpp @@ -9,6 +9,25 @@ #include "../util/Exception.h" #include "../util/StringUtils.h" +// _____________________________________________________________________________ +void TokenizeAndNormalizeText::start() { + if (current_ != end_) { + currentValue_ = normalizeToken(*current_); + } else { + currentValue_ = std::nullopt; + } +} + +// _____________________________________________________________________________ +void TokenizeAndNormalizeText::next() { + ++current_; + if (current_ != end_) { + currentValue_ = normalizeToken(*current_); + } else { + currentValue_ = std::nullopt; + } +} + // _____________________________________________________________________________ WordsAndDocsFileParser::WordsAndDocsFileParser(const string& wordsOrDocsFile, LocaleManager localeManager) diff --git a/src/parser/WordsAndDocsFileParser.h b/src/parser/WordsAndDocsFileParser.h index c72bf24d61..5216a29e13 100644 --- a/src/parser/WordsAndDocsFileParser.h +++ b/src/parser/WordsAndDocsFileParser.h @@ -4,6 +4,7 @@ #pragma once +#include #include #include @@ -28,6 +29,57 @@ struct DocsFileLine { DocumentIndex docId_; }; +// Custom delimiter class for tokenization of literals using `absl::StrSplit`. +// The `Find` function returns the next delimiter in `text` after the given +// `pos` or an empty substring if there is no next delimiter. 
+struct LiteralsTokenizationDelimiter { + absl::string_view Find(absl::string_view text, size_t pos) { + auto isWordChar = [](char c) -> bool { return std::isalnum(c); }; + auto found = std::find_if_not(text.begin() + pos, text.end(), isWordChar); + if (found == text.end()) return text.substr(text.size()); + return {found, found + 1}; + } +}; + +class TokenizeAndNormalizeText + : public ad_utility::InputRangeMixin { + public: + using StorageType = std::string; + explicit TokenizeAndNormalizeText(std::string_view text, + LocaleManager localeManager) + : splitter_{absl::StrSplit(text, LiteralsTokenizationDelimiter{}, + absl::SkipEmpty{})}, + current_{splitter_.begin()}, + end_{splitter_.end()}, + localeManager_(localeManager){}; + + // Delete unsafe constructors + TokenizeAndNormalizeText() = delete; + TokenizeAndNormalizeText(const TokenizeAndNormalizeText&) = delete; + TokenizeAndNormalizeText& operator=(const TokenizeAndNormalizeText&) = delete; + + private: + using Splitter = decltype(absl::StrSplit( + std::string_view{}, LiteralsTokenizationDelimiter{}, absl::SkipEmpty{})); + Splitter splitter_; + Splitter::const_iterator current_; + Splitter::const_iterator end_; + + std::optional currentValue_; + + LocaleManager localeManager_; + + std::string normalizeToken(std::string_view token) { + return localeManager_.getLowercaseUtf8(token); + } + + public: + void start(); + bool isFinished() const { return !currentValue_.has_value(); }; + const StorageType& get() const { return *currentValue_; }; + void next(); +}; + class WordsAndDocsFileParser { public: explicit WordsAndDocsFileParser(const string& wordsOrDocsFile, From f129ecdd442bc4db1ef77d215f95e95392ca8e0e Mon Sep 17 00:00:00 2001 From: Felix Meisen Date: Wed, 8 Jan 2025 15:33:32 +0100 Subject: [PATCH 06/15] Added comments and necessary tests to WordsAndDocsFileParser --- src/parser/WordsAndDocsFileParser.h | 27 ++++- test/WordsAndDocsFileLineCreator.h | 19 ++++ test/WordsAndDocsFileParserTest.cpp | 124 +++++++++++++++++++++-- test/engine/TextIndexScanForWordTest.cpp | 83 +++++++-------- test/engine/TextIndexScanTestHelpers.h | 14 --- 5 files changed, 204 insertions(+), 63 deletions(-) create mode 100644 test/WordsAndDocsFileLineCreator.h diff --git a/src/parser/WordsAndDocsFileParser.h b/src/parser/WordsAndDocsFileParser.h index 5216a29e13..775768569c 100644 --- a/src/parser/WordsAndDocsFileParser.h +++ b/src/parser/WordsAndDocsFileParser.h @@ -16,6 +16,8 @@ using std::string; +// string word_, bool isEntity_, TextRecordIndex contextId_, Score score_, +// bool isLiteralEntity_ struct WordsFileLine { string word_; bool isEntity_; @@ -24,6 +26,7 @@ struct WordsFileLine { bool isLiteralEntity_ = false; }; +// string docContent_, DocumentIndex docId_ struct DocsFileLine { string docContent_; DocumentIndex docId_; @@ -41,6 +44,13 @@ struct LiteralsTokenizationDelimiter { } }; +// This class constructs an object that can be iterated to get the normalized +// words of the text given. The text gets split into tokens using the +// LiteralsTokenizationDelimiter and those tokens get normalized using +// the localeManager. 
You can use the constructed object like
+// obj = TokenizeAndNormalizeText{text, localeManager}
+// for (auto normalizedWord : obj) { code }
+// The type of the value returned when iterating is std::string
 class TokenizeAndNormalizeText
     : public ad_utility::InputRangeMixin<TokenizeAndNormalizeText> {
  public:
@@ -51,7 +61,7 @@ class TokenizeAndNormalizeText
           absl::SkipEmpty{})},
         current_{splitter_.begin()},
         end_{splitter_.end()},
-        localeManager_(localeManager){};
+        localeManager_(std::move(localeManager)){};
 
   // Delete unsafe constructors
   TokenizeAndNormalizeText() = delete;
@@ -80,6 +90,9 @@ class TokenizeAndNormalizeText
   void next();
 };
 
+// This class is the parent class of WordsFileParser and DocsFileParser, and
+// it exists to reduce code duplication, since the only difference between the
+// child classes is the line type returned.
 class WordsAndDocsFileParser {
  public:
   explicit WordsAndDocsFileParser(const string& wordsOrDocsFile,
@@ -94,6 +107,13 @@ class WordsAndDocsFileParser {
   LocaleManager localeManager_;
 };
 
+// This class takes a pathToWordsFile and a localeManager. It can then be used
+// to iterate the wordsFile while already normalizing the words using the
+// localeManager. (If a word is an entity, it is not normalized.)
+// An object of this class can be iterated as follows:
+// obj = WordsFileParser{wordsFile, localeManager}
+// for (auto wordsFileLine : obj) { code }
+// The type of the value returned when iterating is WordsFileLine
 class WordsFileParser : public WordsAndDocsFileParser,
                         public ad_utility::InputRangeFromGet<WordsFileLine> {
  public:
@@ -106,7 +126,10 @@ class WordsFileParser : public WordsAndDocsFileParser,
   TextRecordIndex lastCId_ = TextRecordIndex::make(0);
 #endif
 };
-
+// Works similarly to WordsFileParser, but it parses a docsFile instead and
+// doesn't normalize the text found in the docsFile. To parse the returned
+// docContent_ of a DocsFileLine, please refer to the TokenizeAndNormalizeText
+// class.
 class DocsFileParser : public WordsAndDocsFileParser,
                        public ad_utility::InputRangeFromGet<DocsFileLine> {
  public:
diff --git a/test/WordsAndDocsFileLineCreator.h b/test/WordsAndDocsFileLineCreator.h
new file mode 100644
index 0000000000..6675be2905
--- /dev/null
+++ b/test/WordsAndDocsFileLineCreator.h
@@ -0,0 +1,19 @@
+// Copyright 2024, University of Freiburg,
+// Chair of Algorithms and Data Structures.
+// Author: Felix Meisen (fesemeisen@outlook.de)
+
+#pragma once
+
+inline std::string inlineSeparator = "\t";
+inline std::string lineSeparator = "\n";
+
+inline std::string createWordsFileLine(std::string word, bool isEntity,
+                                       size_t contextId, size_t score) {
+  return word + inlineSeparator + (isEntity ?
"1" : "0") + inlineSeparator + + std::to_string(contextId) + inlineSeparator + std::to_string(score) + + lineSeparator; +}; + +inline std::string createDocsFileLine(size_t docId, std::string docContent) { + return std::to_string(docId) + inlineSeparator + docContent + lineSeparator; +}; diff --git a/test/WordsAndDocsFileParserTest.cpp b/test/WordsAndDocsFileParserTest.cpp index bc3d1950a5..8a82c5eefd 100644 --- a/test/WordsAndDocsFileParserTest.cpp +++ b/test/WordsAndDocsFileParserTest.cpp @@ -7,18 +7,19 @@ #include #include +#include "./WordsAndDocsFileLineCreator.h" #include "parser/WordsAndDocsFileParser.h" -TEST(WordsAndDocsFileParserTest, wordsFileGetLineTest) { +TEST(WordsAndDocsFileParserTest, wordsFileParserTest) { char* locale = setlocale(LC_CTYPE, ""); std::cout << "Set locale LC_CTYPE to: " << locale << std::endl; std::fstream f("_testtmp.contexts.tsv", std::ios_base::out); - f << "Foo\t0\t0\t2\n" - "foo\t0\t0\t2\n" - "Bär\t1\t0\t1\n" - "Äü\t0\t0\t1\n" - "X\t0\t1\t1\n"; + f << createWordsFileLine("Foo", false, 0, 2) + << createWordsFileLine("foo", false, 0, 2) + << createWordsFileLine("Bär", true, 0, 1) + << createWordsFileLine("Äü", false, 0, 1) + << createWordsFileLine("X", false, 1, 1); f.close(); WordsFileParser p("_testtmp.contexts.tsv", LocaleManager("en", "US", false)); @@ -64,3 +65,114 @@ TEST(WordsAndDocsFileParserTest, wordsFileGetLineTest) { ASSERT_EQ(i, 5); remove("_testtmp.contexts.tsv"); }; + +TEST(WordsAndDocsFileParser, docsFileParserTest) { + char* locale = setlocale(LC_CTYPE, ""); + std::cout << "Set locale LC_CTYPE to: " << locale << std::endl; + + std::fstream f("_testtmp.documents.tsv", std::ios_base::out); + f << createDocsFileLine(4, "This TeSt is OnlyCharcters") + << createDocsFileLine(7, "Wh4t h4pp3ns t0 num83rs") + << createDocsFileLine(8, "An( sp@ci*l ch.ar,:act=_er+s") + << createDocsFileLine(190293, "Large docId"); + f.close(); + DocsFileParser p("_testtmp.documents.tsv", LocaleManager("en", "US", false)); + size_t i = 0; + for (auto line : p) { + switch (i) { + case 0: + ASSERT_EQ(4u, line.docId_.get()); + ASSERT_EQ("This TeSt is OnlyCharcters", line.docContent_); + break; + case 1: + ASSERT_EQ(7u, line.docId_.get()); + ASSERT_EQ("Wh4t h4pp3ns t0 num83rs", line.docContent_); + break; + case 2: + ASSERT_EQ(8u, line.docId_.get()); + ASSERT_EQ("An( sp@ci*l ch.ar,:act=_er+s", line.docContent_); + break; + case 3: + ASSERT_EQ(190293u, line.docId_.get()); + ASSERT_EQ("Large docId", line.docContent_); + break; + default: + // shouldn't be reached + ASSERT_TRUE(false); + } + ++i; + } + ASSERT_EQ(i, 4); + remove("_testtmp.documents.tsv"); +} + +TEST(TokenizeAndNormalizeText, tokenizeAndNormalizeTextTest) { + char* locale = setlocale(LC_CTYPE, ""); + std::cout << "Set locale LC_CTYPE to: " << locale << std::endl; + using TestVec = std::vector; + char separator = ' '; + auto testVecToText = [separator](TestVec input) { + std::string output = ""; + for (auto word : input) { + output.append(word + separator); + } + if (output.size() >= 1) { + output.pop_back(); + } + return output; + }; + + TestVec wordsTest1 = {"already", "normalized", "text"}; + TestVec normalizedWordsTest1 = wordsTest1; + + std::string textTest1 = testVecToText(wordsTest1); + + TokenizeAndNormalizeText test1(textTest1, LocaleManager("en", "US", false)); + size_t i1 = 0; + for (auto normalizedWord : test1) { + ASSERT_EQ(normalizedWordsTest1.at(i1), normalizedWord); + ++i1; + } + ASSERT_EQ(i1, 3); + + TestVec wordsTest2 = {"TeXt", "WITH", "UpperCASe"}; + TestVec normalizedWordsTest2 = {"text", 
"with", "uppercase"}; + + std::string textTest2 = testVecToText(wordsTest2); + + TokenizeAndNormalizeText test2(textTest2, LocaleManager("en", "US", false)); + size_t i2 = 0; + for (auto normalizedWord : test2) { + ASSERT_EQ(normalizedWordsTest2.at(i2), normalizedWord); + ++i2; + } + ASSERT_EQ(i2, 3); + + TestVec wordsTest3 = {"41ph4num3r1c", "t3xt"}; + TestVec normalizedWordsTest3 = {"41ph4num3r1c", "t3xt"}; + + std::string textTest3 = testVecToText(wordsTest3); + + TokenizeAndNormalizeText test3(textTest3, LocaleManager("en", "US", false)); + size_t i3 = 0; + for (auto normalizedWord : test3) { + ASSERT_EQ(normalizedWordsTest3.at(i3), normalizedWord); + ++i3; + } + ASSERT_EQ(i3, 2); + + TestVec wordsTest4 = {"test\t", "with\n", "different,", + "separators.", "here", ",.\t"}; + TestVec normalizedWordsTest4 = {"test", "with", "different", "separators", + "here"}; + + std::string textTest4 = testVecToText(wordsTest4); + + TokenizeAndNormalizeText test4(textTest4, LocaleManager("en", "US", false)); + size_t i4 = 0; + for (auto normalizedWord : test4) { + ASSERT_EQ(normalizedWordsTest4.at(i4), normalizedWord); + ++i4; + } + ASSERT_EQ(i4, 5); +} diff --git a/test/engine/TextIndexScanForWordTest.cpp b/test/engine/TextIndexScanForWordTest.cpp index eac3cb0d2f..c62d1ef136 100644 --- a/test/engine/TextIndexScanForWordTest.cpp +++ b/test/engine/TextIndexScanForWordTest.cpp @@ -5,6 +5,7 @@ #include #include +#include "../WordsAndDocsFileLineCreator.h" #include "../printers/VariablePrinters.h" #include "../util/GTestHelpers.h" #include "../util/IdTableHelpers.h" @@ -26,45 +27,45 @@ std::string kg = ". . . ."; std::string wordsFileContent = - h::createWordsFileLine("astronomer", false, 1, 1) + - h::createWordsFileLine("", true, 1, 0) + - h::createWordsFileLine("scientist", false, 1, 1) + - h::createWordsFileLine("field", false, 1, 1) + - h::createWordsFileLine("astronomy", false, 1, 1) + - h::createWordsFileLine("astronomer", false, 2, 0) + - h::createWordsFileLine("", true, 2, 0) + - h::createWordsFileLine(":s:firstsentence", false, 2, 0) + - h::createWordsFileLine("scientist", false, 2, 0) + - h::createWordsFileLine("field", false, 2, 0) + - h::createWordsFileLine("astronomy", false, 2, 0) + - h::createWordsFileLine("astronomy", false, 3, 1) + - h::createWordsFileLine("concentrates", false, 3, 1) + - h::createWordsFileLine("studies", false, 3, 1) + - h::createWordsFileLine("specific", false, 3, 1) + - h::createWordsFileLine("question", false, 3, 1) + - h::createWordsFileLine("outside", false, 3, 1) + - h::createWordsFileLine("scope", false, 3, 1) + - h::createWordsFileLine("earth", false, 3, 1) + - h::createWordsFileLine("astronomy", false, 4, 1) + - h::createWordsFileLine("concentrates", false, 4, 1) + - h::createWordsFileLine("studies", false, 4, 1) + - h::createWordsFileLine("field", false, 4, 1) + - h::createWordsFileLine("outside", false, 4, 1) + - h::createWordsFileLine("scope", false, 4, 1) + - h::createWordsFileLine("earth", false, 4, 1) + - h::createWordsFileLine("tester", false, 5, 1) + - h::createWordsFileLine("rockets", false, 5, 1) + - h::createWordsFileLine("astronomer", false, 5, 1) + - h::createWordsFileLine("", true, 5, 0) + - h::createWordsFileLine("although", false, 5, 1) + - h::createWordsFileLine("astronomer", false, 6, 0) + - h::createWordsFileLine("", true, 6, 0) + - h::createWordsFileLine("although", false, 6, 0) + - h::createWordsFileLine("", true, 6, 0) + - h::createWordsFileLine("space", false, 6, 1) + - h::createWordsFileLine("", true, 7, 0) + - 
h::createWordsFileLine("space", false, 7, 0) + - h::createWordsFileLine("earth", false, 7, 1); + createWordsFileLine("astronomer", false, 1, 1) + + createWordsFileLine("", true, 1, 0) + + createWordsFileLine("scientist", false, 1, 1) + + createWordsFileLine("field", false, 1, 1) + + createWordsFileLine("astronomy", false, 1, 1) + + createWordsFileLine("astronomer", false, 2, 0) + + createWordsFileLine("", true, 2, 0) + + createWordsFileLine(":s:firstsentence", false, 2, 0) + + createWordsFileLine("scientist", false, 2, 0) + + createWordsFileLine("field", false, 2, 0) + + createWordsFileLine("astronomy", false, 2, 0) + + createWordsFileLine("astronomy", false, 3, 1) + + createWordsFileLine("concentrates", false, 3, 1) + + createWordsFileLine("studies", false, 3, 1) + + createWordsFileLine("specific", false, 3, 1) + + createWordsFileLine("question", false, 3, 1) + + createWordsFileLine("outside", false, 3, 1) + + createWordsFileLine("scope", false, 3, 1) + + createWordsFileLine("earth", false, 3, 1) + + createWordsFileLine("astronomy", false, 4, 1) + + createWordsFileLine("concentrates", false, 4, 1) + + createWordsFileLine("studies", false, 4, 1) + + createWordsFileLine("field", false, 4, 1) + + createWordsFileLine("outside", false, 4, 1) + + createWordsFileLine("scope", false, 4, 1) + + createWordsFileLine("earth", false, 4, 1) + + createWordsFileLine("tester", false, 5, 1) + + createWordsFileLine("rockets", false, 5, 1) + + createWordsFileLine("astronomer", false, 5, 1) + + createWordsFileLine("", true, 5, 0) + + createWordsFileLine("although", false, 5, 1) + + createWordsFileLine("astronomer", false, 6, 0) + + createWordsFileLine("", true, 6, 0) + + createWordsFileLine("although", false, 6, 0) + + createWordsFileLine("", true, 6, 0) + + createWordsFileLine("space", false, 6, 1) + + createWordsFileLine("", true, 7, 0) + + createWordsFileLine("space", false, 7, 0) + + createWordsFileLine("earth", false, 7, 1); std::string firstDocText = "An astronomer is a scientist in the field of " @@ -77,8 +78,8 @@ std::string secondDocText = "too although they might not be in space but on " "earth."; -std::string docsFileContent = h::createDocsFileLine(4, firstDocText) + - h::createDocsFileLine(7, secondDocText); +std::string docsFileContent = + createDocsFileLine(4, firstDocText) + createDocsFileLine(7, secondDocText); std::pair contentsOfWordsFileAndDocsFile = { wordsFileContent, docsFileContent}; diff --git a/test/engine/TextIndexScanTestHelpers.h b/test/engine/TextIndexScanTestHelpers.h index 83a72ddea4..6ba1b8c6de 100644 --- a/test/engine/TextIndexScanTestHelpers.h +++ b/test/engine/TextIndexScanTestHelpers.h @@ -66,18 +66,4 @@ inline string combineToString(const string& text, const string& word) { ss << "Text: " << text << ", Word: " << word << std::endl; return ss.str(); } - -inline std::string inlineSeparator = "\t"; -inline std::string lineSeparator = "\n"; - -inline string createWordsFileLine(std::string word, bool isEntity, - size_t contextId, size_t score) { - return word + inlineSeparator + (isEntity ? 
"1" : "0") + inlineSeparator + - std::to_string(contextId) + inlineSeparator + std::to_string(score) + - lineSeparator; -}; - -inline string createDocsFileLine(size_t docId, std::string docContent) { - return std::to_string(docId) + inlineSeparator + docContent + lineSeparator; -}; } // namespace textIndexScanTestHelpers From 8c8a1a1de6c811d3aa20ee884c9d2983c5684554 Mon Sep 17 00:00:00 2001 From: Felix Meisen Date: Thu, 9 Jan 2025 16:46:18 +0100 Subject: [PATCH 07/15] Added comments to WordsAndDcosFileParser.h. Improved useability of tests in WordsAndDocsFileParserTest.cpp. Renamed methods in WordsAndDocsFileLineCreator.h to reduce ambiguity. Incorporated requested small changes of PR. --- src/index/IndexImpl.Text.cpp | 2 +- src/parser/WordsAndDocsFileParser.cpp | 66 +++--- src/parser/WordsAndDocsFileParser.h | 66 +++++- test/WordsAndDocsFileLineCreator.h | 21 +- test/WordsAndDocsFileParserTest.cpp | 260 +++++++++++------------ test/engine/TextIndexScanForWordTest.cpp | 82 +++---- 6 files changed, 266 insertions(+), 231 deletions(-) diff --git a/src/index/IndexImpl.Text.cpp b/src/index/IndexImpl.Text.cpp index 9f9e11ec68..b8bc919e39 100644 --- a/src/index/IndexImpl.Text.cpp +++ b/src/index/IndexImpl.Text.cpp @@ -31,7 +31,7 @@ cppcoro::generator IndexImpl::wordsInTextRecords( if (!contextFile.empty()) { WordsFileParser p(contextFile, localeManager); ad_utility::HashSet items; - for (auto line : p) { + for (auto& line : p) { contextId = line.contextId_; co_yield line; } diff --git a/src/parser/WordsAndDocsFileParser.cpp b/src/parser/WordsAndDocsFileParser.cpp index b66932fbfe..dbc638cb7a 100644 --- a/src/parser/WordsAndDocsFileParser.cpp +++ b/src/parser/WordsAndDocsFileParser.cpp @@ -1,13 +1,14 @@ // Copyright 2015, University of Freiburg, // Chair of Algorithms and Data Structures. // Author: Björn Buchhold (buchhold@informatik.uni-freiburg.de) +// Felix Meisen (fesemeisen@outlook.de) #include "parser/WordsAndDocsFileParser.h" #include -#include "../util/Exception.h" -#include "../util/StringUtils.h" +#include "util/Exception.h" +#include "util/StringUtils.h" // _____________________________________________________________________________ void TokenizeAndNormalizeText::start() { @@ -33,48 +34,45 @@ WordsAndDocsFileParser::WordsAndDocsFileParser(const string& wordsOrDocsFile, LocaleManager localeManager) : in_(wordsOrDocsFile), localeManager_(std::move(localeManager)) {} -// _____________________________________________________________________________ -WordsAndDocsFileParser::~WordsAndDocsFileParser() { in_.close(); } - // _____________________________________________________________________________ ad_utility::InputRangeFromGet::Storage WordsFileParser::get() { WordsFileLine line; string l; - if (std::getline(in_, l)) { - size_t i = l.find('\t'); - assert(i != string::npos); - size_t j = i + 2; - assert(j + 3 < l.size()); - size_t k = l.find('\t', j + 2); - assert(k != string::npos); - line.isEntity_ = (l[i + 1] == '1'); - line.word_ = - (line.isEntity_ ? l.substr(0, i) - : localeManager_.getLowercaseUtf8(l.substr(0, i))); - line.contextId_ = - TextRecordIndex::make(atol(l.substr(j + 1, k - j - 1).c_str())); - line.score_ = static_cast(atol(l.substr(k + 1).c_str())); + if (!std::getline(in_, l)) { + return std::nullopt; + }; + size_t i = l.find('\t'); + assert(i != string::npos); + size_t j = i + 2; + assert(j + 3 < l.size()); + size_t k = l.find('\t', j + 2); + assert(k != string::npos); + line.isEntity_ = (l[i + 1] == '1'); + line.word_ = + (line.isEntity_ ? 
l.substr(0, i) + : localeManager_.getLowercaseUtf8(l.substr(0, i))); + line.contextId_ = + TextRecordIndex::make(atol(l.substr(j + 1, k - j - 1).c_str())); + line.score_ = static_cast(atol(l.substr(k + 1).c_str())); #ifndef NDEBUG - if (lastCId_ > line.contextId_) { - AD_THROW("ContextFile has to be sorted by context Id."); - } - lastCId_ = line.contextId_; -#endif - return line; + if (lastCId_ > line.contextId_) { + AD_THROW("ContextFile has to be sorted by context Id."); } - return std::nullopt; + lastCId_ = line.contextId_; +#endif + return line; } // _____________________________________________________________________________ ad_utility::InputRangeFromGet::Storage DocsFileParser::get() { DocsFileLine line; string l; - if (std::getline(in_, l)) { - size_t i = l.find('\t'); - assert(i != string::npos); - line.docId_ = DocumentIndex::make(atol(l.substr(0, i).c_str())); - line.docContent_ = l.substr(i + 1); - return line; - } - return std::nullopt; + if (!std::getline(in_, l)) { + return std::nullopt; + }; + size_t i = l.find('\t'); + assert(i != string::npos); + line.docId_ = DocumentIndex::make(atol(l.substr(0, i).c_str())); + line.docContent_ = l.substr(i + 1); + return line; } diff --git a/src/parser/WordsAndDocsFileParser.h b/src/parser/WordsAndDocsFileParser.h index 775768569c..c8f632d458 100644 --- a/src/parser/WordsAndDocsFileParser.h +++ b/src/parser/WordsAndDocsFileParser.h @@ -1,6 +1,7 @@ // Copyright 2015, University of Freiburg, // Chair of Algorithms and Data Structures. // Author: Björn Buchhold (buchhold@informatik.uni-freiburg.de) +// Felix Meisen (fesemeisen@outlook.de) #pragma once @@ -15,9 +16,44 @@ #include "util/Iterators.h" using std::string; - -// string word_, bool isEntity_, TextRecordIndex contextId_, Score score_, -// bool isLiteralEntity_ +// Represents a line from the wordsfile.tsv, which stores everything given in +// the file line and extra information with isLiteralEntity. Also used to add +// literals to the text index through emulating wordsfile lines. +// +// The Fields are ordered in the same way the values follow in a line. +// Short field overview: string word_, bool isEntity, TextRecordIndex contextId, +// Score score_, bool isLiteralEntity (not found in +// wordsfile) +// +// Fields: +// - string word_: The string of the word, if it is an entity it will be +// . bool isEntity_: True if the given word is an +// entity, false if it's a word. +// - TextRecordIndex contextId_: When creating the wordsfile docs from the +// docsfile get split into so called contexts. +// Those contexts overlap, meaning words and +// entities are covered multiple times. Each +// contextId corresponds to the next bigger or +// equal docId. +// - Score score_: Either 1 or 0 if isEntity is false. 0, 1, 100, 150 if +// isEntity is true. (this info is only constructed on the +// scientists.wordsfile.tsv) The score in the wordsfile is only +// relevant for the counting scoring metric. Because of the +// overlap of contexts the score is 1 if the word really has +// been seen for the first time and 0 if not. If a doc contains +// multiple mentions of a word there should be exactly as many +// wordsfile lines of that word with score 1 as there are +// mentions. The score for entities seems rather random and +// since no clear explanation of the creation of wordsfiles +// has been found yet they will stay rather random. +// - bool isLiteralEntity_: This does not directly stem from the wordsfile. 
+// When building the text index with literals, for +// every literal there will be WordsFileLines for all +// words in that literal. Additionally the whole +// literal itself will be added as word with isEntity +// being true. The need to count this comes only from +// a trick used in testing right now. To be specific +// the method getTextRecordFromResultTable struct WordsFileLine { string word_; bool isEntity_; @@ -26,10 +62,28 @@ struct WordsFileLine { bool isLiteralEntity_ = false; }; -// string docContent_, DocumentIndex docId_ +// Represents a line from the docsfile.tsv, which stores everything given in +// the file line. +// +// The Fields are ordered in the same way the values follow in a line. +// Short field overview: DocumentIndex docId_, string docContent_ +// +// Fields: +// +// - DocumentIndex docId_: The docId is needed to built inverted indices for +// Scoring and building of the docsDB. It is also used +// to return actual texts when searching for a word. +// The word (and entity) search returns a table with +// TextRecordIndex as type of one col. Those get mapped +// to the next bigger or equal docId which is then +// used to extract the text from the docsDB. +// TODO: check if this behaviour is consintently +// implemented +// - string docContent_: The whole text given after the first tab of a line of +// docsfile. struct DocsFileLine { - string docContent_; DocumentIndex docId_; + string docContent_; }; // Custom delimiter class for tokenization of literals using `absl::StrSplit`. @@ -97,7 +151,7 @@ class WordsAndDocsFileParser { public: explicit WordsAndDocsFileParser(const string& wordsOrDocsFile, LocaleManager localeManager); - ~WordsAndDocsFileParser(); + ~WordsAndDocsFileParser() = default; explicit WordsAndDocsFileParser(const WordsAndDocsFileParser& other) = delete; WordsAndDocsFileParser& operator=(const WordsAndDocsFileParser& other) = delete; diff --git a/test/WordsAndDocsFileLineCreator.h b/test/WordsAndDocsFileLineCreator.h index 6675be2905..cb151216fd 100644 --- a/test/WordsAndDocsFileLineCreator.h +++ b/test/WordsAndDocsFileLineCreator.h @@ -4,16 +4,19 @@ #pragma once -inline std::string inlineSeparator = "\t"; -inline std::string lineSeparator = "\n"; +#include -inline std::string createWordsFileLine(std::string word, bool isEntity, - size_t contextId, size_t score) { - return word + inlineSeparator + (isEntity ? 
"1" : "0") + inlineSeparator + - std::to_string(contextId) + inlineSeparator + std::to_string(score) + - lineSeparator; +constexpr std::string_view inlineSeparator = "\t"; +constexpr std::string_view lineSeparator = "\n"; + +inline std::string createWordsFileLineAsString(std::string_view word, + bool isEntity, size_t contextId, + size_t score) { + return absl::StrCat(word, inlineSeparator, isEntity, inlineSeparator, + contextId, inlineSeparator, score, lineSeparator); }; -inline std::string createDocsFileLine(size_t docId, std::string docContent) { - return std::to_string(docId) + inlineSeparator + docContent + lineSeparator; +inline std::string createDocsFileLineAsString(size_t docId, + std::string_view docContent) { + return absl::StrCat(docId, inlineSeparator, docContent, lineSeparator); }; diff --git a/test/WordsAndDocsFileParserTest.cpp b/test/WordsAndDocsFileParserTest.cpp index 8a82c5eefd..0672064f72 100644 --- a/test/WordsAndDocsFileParserTest.cpp +++ b/test/WordsAndDocsFileParserTest.cpp @@ -10,59 +10,109 @@ #include "./WordsAndDocsFileLineCreator.h" #include "parser/WordsAndDocsFileParser.h" +// All lambdas and type aliases used in this file contained here +namespace { + +/// Type aliases + +// Word, isEntity, contextId, score +using WordLine = std::tuple; +using WordLineVec = std::vector; + +// docId, docContent +using DocLine = std::tuple; +using DocLineVec = std::vector; + +using StringVec = std::vector; + +/// Lambdas + +auto getLocaleManager = []() { return LocaleManager("en", "US", false); }; + +auto wordsFileLineToWordLine = [](WordsFileLine wordsFileLine) -> WordLine { + return std::make_tuple(wordsFileLine.word_, wordsFileLine.isEntity_, + static_cast(wordsFileLine.contextId_.get()), + static_cast(wordsFileLine.score_)); +}; + +// Lambda that takes in a path to wordsFile to initialize the Parser and an +// expectedResult that is compared against the parsers outputs. 
+auto testWordsFileParser = [](std::string wordsFilePath, + WordLineVec expectedResult) { + WordsFileParser p(wordsFilePath, getLocaleManager()); + size_t i = 0; + for (auto wordsFileLine : p) { + ASSERT_TRUE(i < expectedResult.size()); + WordLine testLine = wordsFileLineToWordLine(wordsFileLine); + + // Not testing the whole tuples against each other to have a cleaner + // indication what exactly caused the assertion to fail + ASSERT_EQ(std::get<0>(testLine), std::get<0>(expectedResult.at(i))); + ASSERT_EQ(std::get<1>(testLine), std::get<1>(expectedResult.at(i))); + ASSERT_EQ(std::get<2>(testLine), std::get<2>(expectedResult.at(i))); + ASSERT_EQ(std::get<3>(testLine), std::get<3>(expectedResult.at(i))); + + ++i; + } + ASSERT_EQ(i, expectedResult.size()); +}; + +auto docsFileLineToDocLine = [](DocsFileLine docsFileLine) -> DocLine { + return std::make_tuple(static_cast(docsFileLine.docId_.get()), + docsFileLine.docContent_); +}; + +// Same as testWordsFileParser but for docsFile +auto testDocsFileParser = [](std::string docsFilePath, + DocLineVec expectedResult) { + DocsFileParser p(docsFilePath, getLocaleManager()); + size_t i = 0; + for (auto docsFileLine : p) { + ASSERT_TRUE(i < expectedResult.size()); + DocLine testLine = docsFileLineToDocLine(docsFileLine); + + // Not testing the whole tuples against each other to have a cleaner + // indication what exactly caused the assertion to fail + ASSERT_EQ(std::get<0>(testLine), std::get<0>(expectedResult.at(i))); + ASSERT_EQ(std::get<1>(testLine), std::get<1>(expectedResult.at(i))); + + ++i; + } +}; + +auto testTokenizeAndNormalizeText = [](std::string testText, + StringVec normalizedTextAsVec) { + TokenizeAndNormalizeText testTokenizer(testText, getLocaleManager()); + size_t i = 0; + for (auto normalizedWord : testTokenizer) { + ASSERT_TRUE(i < normalizedTextAsVec.size()); + ASSERT_EQ(normalizedWord, normalizedTextAsVec.at(i)); + + ++i; + } + ASSERT_EQ(i, normalizedTextAsVec.size()); +}; + +} // namespace + TEST(WordsAndDocsFileParserTest, wordsFileParserTest) { char* locale = setlocale(LC_CTYPE, ""); std::cout << "Set locale LC_CTYPE to: " << locale << std::endl; std::fstream f("_testtmp.contexts.tsv", std::ios_base::out); - f << createWordsFileLine("Foo", false, 0, 2) - << createWordsFileLine("foo", false, 0, 2) - << createWordsFileLine("Bär", true, 0, 1) - << createWordsFileLine("Äü", false, 0, 1) - << createWordsFileLine("X", false, 1, 1); - + f << createWordsFileLineAsString("Foo", false, 0, 2) + << createWordsFileLineAsString("foo", false, 0, 2) + << createWordsFileLineAsString("Bär", true, 0, 1) + << createWordsFileLineAsString("Äü", false, 0, 1) + << createWordsFileLineAsString("X", false, 1, 1); f.close(); - WordsFileParser p("_testtmp.contexts.tsv", LocaleManager("en", "US", false)); - size_t i = 0; - for (auto line : p) { - switch (i) { - case 0: - ASSERT_EQ("foo", line.word_); - ASSERT_FALSE(line.isEntity_); - ASSERT_EQ(0u, line.contextId_.get()); - ASSERT_EQ(2u, line.score_); - break; - case 1: - ASSERT_EQ("foo", line.word_); - ASSERT_FALSE(line.isEntity_); - ASSERT_EQ(0u, line.contextId_.get()); - ASSERT_EQ(2u, line.score_); - break; - case 2: - ASSERT_EQ("Bär", line.word_); - ASSERT_TRUE(line.isEntity_); - ASSERT_EQ(0u, line.contextId_.get()); - ASSERT_EQ(1u, line.score_); - break; - case 3: - ASSERT_EQ("äü", line.word_); - ASSERT_FALSE(line.isEntity_); - ASSERT_EQ(0u, line.contextId_.get()); - ASSERT_EQ(1u, line.score_); - break; - case 4: - ASSERT_EQ("x", line.word_); - ASSERT_FALSE(line.isEntity_); - ASSERT_EQ(1u, 
-        ASSERT_EQ(1u, line.contextId_.get());
-        ASSERT_EQ(1u, line.score_);
-        break;
-      default:
-        // shouldn't be reached
-        ASSERT_TRUE(false);
-    }
-    ++i;
-  }
-  ASSERT_EQ(i, 5);
+
+  WordLineVec expected = {
+      std::make_tuple("foo", false, 0, 2), std::make_tuple("foo", false, 0, 2),
+      std::make_tuple("Bär", true, 0, 1), std::make_tuple("äü", false, 0, 1),
+      std::make_tuple("x", false, 1, 1)};
+
+  testWordsFileParser("_testtmp.contexts.tsv", expected);
   remove("_testtmp.contexts.tsv");
 };
 
@@ -71,108 +121,38 @@ TEST(WordsAndDocsFileParser, docsFileParserTest) {
   std::cout << "Set locale LC_CTYPE to: " << locale << std::endl;
 
   std::fstream f("_testtmp.documents.tsv", std::ios_base::out);
-  f << createDocsFileLine(4, "This TeSt is OnlyCharcters")
-    << createDocsFileLine(7, "Wh4t h4pp3ns t0 num83rs")
-    << createDocsFileLine(8, "An( sp@ci*l ch.ar,:act=_er+s")
-    << createDocsFileLine(190293, "Large docId");
+  f << createDocsFileLineAsString(4, "This TeSt is OnlyCharcters")
+    << createDocsFileLineAsString(7, "Wh4t h4pp3ns t0 num83rs")
+    << createDocsFileLineAsString(8, "An( sp@ci*l ch.ar,:act=_er+s")
+    << createDocsFileLineAsString(190293, "Large docId");
   f.close();
 
-  DocsFileParser p("_testtmp.documents.tsv", LocaleManager("en", "US", false));
-  size_t i = 0;
-  for (auto line : p) {
-    switch (i) {
-      case 0:
-        ASSERT_EQ(4u, line.docId_.get());
-        ASSERT_EQ("This TeSt is OnlyCharcters", line.docContent_);
-        break;
-      case 1:
-        ASSERT_EQ(7u, line.docId_.get());
-        ASSERT_EQ("Wh4t h4pp3ns t0 num83rs", line.docContent_);
-        break;
-      case 2:
-        ASSERT_EQ(8u, line.docId_.get());
-        ASSERT_EQ("An( sp@ci*l ch.ar,:act=_er+s", line.docContent_);
-        break;
-      case 3:
-        ASSERT_EQ(190293u, line.docId_.get());
-        ASSERT_EQ("Large docId", line.docContent_);
-        break;
-      default:
-        // shouldn't be reached
-        ASSERT_TRUE(false);
-    }
-    ++i;
-  }
-  ASSERT_EQ(i, 4);
+
+  DocLineVec expected = {std::make_pair(4, "This TeSt is OnlyCharcters"),
+                         std::make_pair(7, "Wh4t h4pp3ns t0 num83rs"),
+                         std::make_pair(8, "An( sp@ci*l ch.ar,:act=_er+s"),
+                         std::make_pair(190293, "Large docId")};
+
+  testDocsFileParser("_testtmp.documents.tsv", expected);
   remove("_testtmp.documents.tsv");
 }
 
 TEST(TokenizeAndNormalizeText, tokenizeAndNormalizeTextTest) {
   char* locale = setlocale(LC_CTYPE, "");
   std::cout << "Set locale LC_CTYPE to: " << locale << std::endl;
-  using TestVec = std::vector<std::string>;
 
-  char separator = ' ';
-  auto testVecToText = [separator](TestVec input) {
-    std::string output = "";
-    for (auto word : input) {
-      output.append(word + separator);
-    }
-    if (output.size() >= 1) {
-      output.pop_back();
-    }
-    return output;
-  };
-
-  TestVec wordsTest1 = {"already", "normalized", "text"};
-  TestVec normalizedWordsTest1 = wordsTest1;
-
-  std::string textTest1 = testVecToText(wordsTest1);
-
-  TokenizeAndNormalizeText test1(textTest1, LocaleManager("en", "US", false));
-  size_t i1 = 0;
-  for (auto normalizedWord : test1) {
-    ASSERT_EQ(normalizedWordsTest1.at(i1), normalizedWord);
-    ++i1;
-  }
-  ASSERT_EQ(i1, 3);
-
-  TestVec wordsTest2 = {"TeXt", "WITH", "UpperCASe"};
-  TestVec normalizedWordsTest2 = {"text", "with", "uppercase"};
+  // Test 1
+  testTokenizeAndNormalizeText("already normalized text",
+                               {"already", "normalized", "text"});
 
-  std::string textTest2 = testVecToText(wordsTest2);
+  // Test 2
+  testTokenizeAndNormalizeText("TeXt WITH UpperCASe",
{"text", "with", "uppercase"}); - TestVec wordsTest3 = {"41ph4num3r1c", "t3xt"}; - TestVec normalizedWordsTest3 = {"41ph4num3r1c", "t3xt"}; + // Test 3 + testTokenizeAndNormalizeText("41ph4num3r1c t3xt", {"41ph4num3r1c", "t3xt"}); - std::string textTest3 = testVecToText(wordsTest3); - - TokenizeAndNormalizeText test3(textTest3, LocaleManager("en", "US", false)); - size_t i3 = 0; - for (auto normalizedWord : test3) { - ASSERT_EQ(normalizedWordsTest3.at(i3), normalizedWord); - ++i3; - } - ASSERT_EQ(i3, 2); - - TestVec wordsTest4 = {"test\t", "with\n", "different,", - "separators.", "here", ",.\t"}; - TestVec normalizedWordsTest4 = {"test", "with", "different", "separators", - "here"}; - - std::string textTest4 = testVecToText(wordsTest4); - - TokenizeAndNormalizeText test4(textTest4, LocaleManager("en", "US", false)); - size_t i4 = 0; - for (auto normalizedWord : test4) { - ASSERT_EQ(normalizedWordsTest4.at(i4), normalizedWord); - ++i4; - } - ASSERT_EQ(i4, 5); + // Test 4 + testTokenizeAndNormalizeText( + "test\twith\ndifferent,separators.here ,.\t", + {"test", "with", "different", "separators", "here"}); } diff --git a/test/engine/TextIndexScanForWordTest.cpp b/test/engine/TextIndexScanForWordTest.cpp index c62d1ef136..cc9b685ec8 100644 --- a/test/engine/TextIndexScanForWordTest.cpp +++ b/test/engine/TextIndexScanForWordTest.cpp @@ -27,45 +27,45 @@ std::string kg = ". . . ."; std::string wordsFileContent = - createWordsFileLine("astronomer", false, 1, 1) + - createWordsFileLine("", true, 1, 0) + - createWordsFileLine("scientist", false, 1, 1) + - createWordsFileLine("field", false, 1, 1) + - createWordsFileLine("astronomy", false, 1, 1) + - createWordsFileLine("astronomer", false, 2, 0) + - createWordsFileLine("", true, 2, 0) + - createWordsFileLine(":s:firstsentence", false, 2, 0) + - createWordsFileLine("scientist", false, 2, 0) + - createWordsFileLine("field", false, 2, 0) + - createWordsFileLine("astronomy", false, 2, 0) + - createWordsFileLine("astronomy", false, 3, 1) + - createWordsFileLine("concentrates", false, 3, 1) + - createWordsFileLine("studies", false, 3, 1) + - createWordsFileLine("specific", false, 3, 1) + - createWordsFileLine("question", false, 3, 1) + - createWordsFileLine("outside", false, 3, 1) + - createWordsFileLine("scope", false, 3, 1) + - createWordsFileLine("earth", false, 3, 1) + - createWordsFileLine("astronomy", false, 4, 1) + - createWordsFileLine("concentrates", false, 4, 1) + - createWordsFileLine("studies", false, 4, 1) + - createWordsFileLine("field", false, 4, 1) + - createWordsFileLine("outside", false, 4, 1) + - createWordsFileLine("scope", false, 4, 1) + - createWordsFileLine("earth", false, 4, 1) + - createWordsFileLine("tester", false, 5, 1) + - createWordsFileLine("rockets", false, 5, 1) + - createWordsFileLine("astronomer", false, 5, 1) + - createWordsFileLine("", true, 5, 0) + - createWordsFileLine("although", false, 5, 1) + - createWordsFileLine("astronomer", false, 6, 0) + - createWordsFileLine("", true, 6, 0) + - createWordsFileLine("although", false, 6, 0) + - createWordsFileLine("", true, 6, 0) + - createWordsFileLine("space", false, 6, 1) + - createWordsFileLine("", true, 7, 0) + - createWordsFileLine("space", false, 7, 0) + - createWordsFileLine("earth", false, 7, 1); + createWordsFileLineAsString("astronomer", false, 1, 1) + + createWordsFileLineAsString("", true, 1, 0) + + createWordsFileLineAsString("scientist", false, 1, 1) + + createWordsFileLineAsString("field", false, 1, 1) + + createWordsFileLineAsString("astronomy", false, 1, 1) + 
+ createWordsFileLineAsString("astronomer", false, 2, 0) + + createWordsFileLineAsString("", true, 2, 0) + + createWordsFileLineAsString(":s:firstsentence", false, 2, 0) + + createWordsFileLineAsString("scientist", false, 2, 0) + + createWordsFileLineAsString("field", false, 2, 0) + + createWordsFileLineAsString("astronomy", false, 2, 0) + + createWordsFileLineAsString("astronomy", false, 3, 1) + + createWordsFileLineAsString("concentrates", false, 3, 1) + + createWordsFileLineAsString("studies", false, 3, 1) + + createWordsFileLineAsString("specific", false, 3, 1) + + createWordsFileLineAsString("question", false, 3, 1) + + createWordsFileLineAsString("outside", false, 3, 1) + + createWordsFileLineAsString("scope", false, 3, 1) + + createWordsFileLineAsString("earth", false, 3, 1) + + createWordsFileLineAsString("astronomy", false, 4, 1) + + createWordsFileLineAsString("concentrates", false, 4, 1) + + createWordsFileLineAsString("studies", false, 4, 1) + + createWordsFileLineAsString("field", false, 4, 1) + + createWordsFileLineAsString("outside", false, 4, 1) + + createWordsFileLineAsString("scope", false, 4, 1) + + createWordsFileLineAsString("earth", false, 4, 1) + + createWordsFileLineAsString("tester", false, 5, 1) + + createWordsFileLineAsString("rockets", false, 5, 1) + + createWordsFileLineAsString("astronomer", false, 5, 1) + + createWordsFileLineAsString("", true, 5, 0) + + createWordsFileLineAsString("although", false, 5, 1) + + createWordsFileLineAsString("astronomer", false, 6, 0) + + createWordsFileLineAsString("", true, 6, 0) + + createWordsFileLineAsString("although", false, 6, 0) + + createWordsFileLineAsString("", true, 6, 0) + + createWordsFileLineAsString("space", false, 6, 1) + + createWordsFileLineAsString("", true, 7, 0) + + createWordsFileLineAsString("space", false, 7, 0) + + createWordsFileLineAsString("earth", false, 7, 1); std::string firstDocText = "An astronomer is a scientist in the field of " @@ -78,8 +78,8 @@ std::string secondDocText = "too although they might not be in space but on " "earth."; -std::string docsFileContent = - createDocsFileLine(4, firstDocText) + createDocsFileLine(7, secondDocText); +std::string docsFileContent = createDocsFileLineAsString(4, firstDocText) + + createDocsFileLineAsString(7, secondDocText); std::pair contentsOfWordsFileAndDocsFile = { wordsFileContent, docsFileContent}; From 0369de68b9ae4f95eaa0d3469dec952a7da1c940 Mon Sep 17 00:00:00 2001 From: Johannes Kalmbach Date: Fri, 10 Jan 2025 10:30:06 +0100 Subject: [PATCH 08/15] Rewrite the tokenizer as a view. 
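The `TokenizeAndNormalizeText` input range required hand-written iteration
state (`start()`, `next()`, and a stored `std::optional`). Replace it with a
free function `tokenizeAndNormalizeText` that splits the text once and
returns a lazy transform view which lowercases each token on demand. Callers
simply iterate the returned view; roughly (with `textView` and
`localeManager` as in the adapted call sites):

    for (auto word : tokenizeAndNormalizeText(textView, localeManager)) {
      // `word` is the next token of `textView`, normalized via
      // `localeManager.getLowercaseUtf8`.
    }

The returned view only references the underlying text and the locale manager,
so both have to stay alive while it is iterated.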
Signed-off-by: Johannes Kalmbach
---
 src/index/IndexImpl.Text.cpp          |  2 +-
 src/parser/WordsAndDocsFileParser.cpp | 19 ----------
 src/parser/WordsAndDocsFileParser.h   | 53 ++++++++-------------------
 test/WordsAndDocsFileParserTest.cpp   |  2 +-
 4 files changed, 17 insertions(+), 59 deletions(-)

diff --git a/src/index/IndexImpl.Text.cpp b/src/index/IndexImpl.Text.cpp
index b8bc919e39..502063a920 100644
--- a/src/index/IndexImpl.Text.cpp
+++ b/src/index/IndexImpl.Text.cpp
@@ -53,7 +53,7 @@ cppcoro::generator<WordsFileLine> IndexImpl::wordsInTextRecords(
       std::string_view textView = text;
       textView = textView.substr(0, textView.rfind('"'));
       textView.remove_prefix(1);
-      TokenizeAndNormalizeText normalizedWords(textView, localeManager);
+      auto normalizedWords = tokenizeAndNormalizeText(textView, localeManager);
       for (auto word : normalizedWords) {
         WordsFileLine wordLine{word, false, contextId, 1};
         co_yield wordLine;
diff --git a/src/parser/WordsAndDocsFileParser.cpp b/src/parser/WordsAndDocsFileParser.cpp
index dbc638cb7a..35bd955700 100644
--- a/src/parser/WordsAndDocsFileParser.cpp
+++ b/src/parser/WordsAndDocsFileParser.cpp
@@ -10,25 +10,6 @@
 #include "util/Exception.h"
 #include "util/StringUtils.h"
 
-// _____________________________________________________________________________
-void TokenizeAndNormalizeText::start() {
-  if (current_ != end_) {
-    currentValue_ = normalizeToken(*current_);
-  } else {
-    currentValue_ = std::nullopt;
-  }
-}
-
-// _____________________________________________________________________________
-void TokenizeAndNormalizeText::next() {
-  ++current_;
-  if (current_ != end_) {
-    currentValue_ = normalizeToken(*current_);
-  } else {
-    currentValue_ = std::nullopt;
-  }
-}
-
 // _____________________________________________________________________________
 WordsAndDocsFileParser::WordsAndDocsFileParser(const string& wordsOrDocsFile,
                                                LocaleManager localeManager)
diff --git a/src/parser/WordsAndDocsFileParser.h b/src/parser/WordsAndDocsFileParser.h
index c8f632d458..9b6c73400a 100644
--- a/src/parser/WordsAndDocsFileParser.h
+++ b/src/parser/WordsAndDocsFileParser.h
@@ -14,6 +14,7 @@
 #include "global/Id.h"
 #include "index/StringSortComparator.h"
 #include "util/Iterators.h"
+#include "util/Views.h"
 
 using std::string;
 // Represents a line from the wordsfile.tsv, which stores everything given in
@@ -105,44 +106,20 @@ struct LiteralsTokenizationDelimiter {
 // obj = TokenizeAndNormalizeText{text, localeManager}
 // for (auto normalizedWord : obj) { code }
 // The type of the value returned when iterating is std::string
-class TokenizeAndNormalizeText
-    : public ad_utility::InputRangeMixin<TokenizeAndNormalizeText> {
- public:
-  using StorageType = std::string;
-  explicit TokenizeAndNormalizeText(std::string_view text,
-                                    LocaleManager localeManager)
-      : splitter_{absl::StrSplit(text, LiteralsTokenizationDelimiter{},
-                                 absl::SkipEmpty{})},
-        current_{splitter_.begin()},
-        end_{splitter_.end()},
-        localeManager_(std::move(localeManager)){};
-
-  // Delete unsafe constructors
-  TokenizeAndNormalizeText() = delete;
-  TokenizeAndNormalizeText(const TokenizeAndNormalizeText&) = delete;
-  TokenizeAndNormalizeText& operator=(const TokenizeAndNormalizeText&) = delete;
-
- private:
-  using Splitter = decltype(absl::StrSplit(
-      std::string_view{}, LiteralsTokenizationDelimiter{}, absl::SkipEmpty{}));
-  Splitter splitter_;
-  Splitter::const_iterator current_;
-  Splitter::const_iterator end_;
-
-  std::optional<std::string> currentValue_;
-
-  LocaleManager localeManager_;
-
-  std::string normalizeToken(std::string_view token) {
-    return localeManager_.getLowercaseUtf8(token);
-  }
-
- public:
-  void start();
-  bool isFinished() const { return !currentValue_.has_value(); };
-  const StorageType& get() const { return *currentValue_; };
-  void next();
-};
+// TODO Adapt the comment (it is now a function, and you call it a
+// little bit differently)
+// TODO Also comment about the lifetime (the `text` and the
+// `localeManager` have to be kept alive while the tokenizer is being used;
+// the tokenizer only uses references.)
+inline auto tokenizeAndNormalizeText(std::string_view text,
+                                     const LocaleManager& localeManager) {
+  std::vector<std::string_view> split{
+      absl::StrSplit(text, LiteralsTokenizationDelimiter{}, absl::SkipEmpty{})};
+  return ql::views::transform(ad_utility::OwningView{std::move(split)},
+                              [&localeManager](const auto& str) {
+                                return localeManager.getLowercaseUtf8(str);
+                              });
+}
 
 // This class is the parent class of WordsFileParser and DocsFileParser and
 // it exists to reduce code duplication since the only difference between the
 // child classes is the line type returned
diff --git a/test/WordsAndDocsFileParserTest.cpp b/test/WordsAndDocsFileParserTest.cpp
index 0672064f72..8a31887d70 100644
--- a/test/WordsAndDocsFileParserTest.cpp
+++ b/test/WordsAndDocsFileParserTest.cpp
@@ -82,7 +82,7 @@ auto testDocsFileParser = [](std::string docsFilePath,
 
 auto testTokenizeAndNormalizeText = [](std::string testText,
                                        StringVec normalizedTextAsVec) {
-  TokenizeAndNormalizeText testTokenizer(testText, getLocaleManager());
+  auto testTokenizer = tokenizeAndNormalizeText(testText, getLocaleManager());
   size_t i = 0;
   for (auto normalizedWord : testTokenizer) {

From c412983e148fab5ad3a44f596c9b8adb31504ed4 Mon Sep 17 00:00:00 2001
From: Felix Meisen
Date: Fri, 10 Jan 2025 14:12:00 +0100
Subject: [PATCH 09/15] Improved comment, addressed small requested changes
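Replace the plain block comments in `WordsAndDocsFileParser.h` with
doxygen-style comments (`@brief`, `@details`, `@warning`, `@param`), resolve
the two TODOs on `tokenizeAndNormalizeText` by documenting its usage and
lifetime requirements, take the test lambdas' parameters by const reference,
and construct the parsers inline where they are iterated. The documented
iteration pattern for both parsers is, sketched with placeholder names:

    // `wordsFile` is a path to a wordsfile.tsv (analogous for DocsFileParser).
    for (auto line : WordsFileParser{wordsFile, localeManager}) {
      // `line` is a WordsFileLine: word_, isEntity_, contextId_, score_.
    }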
---
 src/index/IndexImpl.Text.cpp        |   3 +-
 src/parser/WordsAndDocsFileParser.h | 191 +++++++++++++++-------------
 test/WordsAndDocsFileParserTest.cpp |  45 ++++---
 3 files changed, 131 insertions(+), 108 deletions(-)

diff --git a/src/index/IndexImpl.Text.cpp b/src/index/IndexImpl.Text.cpp
index 502063a920..b793189d6a 100644
--- a/src/index/IndexImpl.Text.cpp
+++ b/src/index/IndexImpl.Text.cpp
@@ -53,8 +53,7 @@ cppcoro::generator<WordsFileLine> IndexImpl::wordsInTextRecords(
       std::string_view textView = text;
       textView = textView.substr(0, textView.rfind('"'));
       textView.remove_prefix(1);
-      auto normalizedWords = tokenizeAndNormalizeText(textView, localeManager);
-      for (auto word : normalizedWords) {
+      for (auto word : tokenizeAndNormalizeText(textView, localeManager)) {
         WordsFileLine wordLine{word, false, contextId, 1};
         co_yield wordLine;
diff --git a/src/parser/WordsAndDocsFileParser.h b/src/parser/WordsAndDocsFileParser.h
index 9b6c73400a..8365783a19 100644
--- a/src/parser/WordsAndDocsFileParser.h
+++ b/src/parser/WordsAndDocsFileParser.h
@@ -17,44 +17,50 @@
 #include "util/Views.h"
 
 using std::string;
-// Represents a line from the wordsfile.tsv, which stores everything given in
-// the file line and extra information with isLiteralEntity. Also used to add
-// literals to the text index through emulating wordsfile lines.
-//
-// The Fields are ordered in the same way the values follow in a line.
-// Short field overview: string word_, bool isEntity, TextRecordIndex contextId,
-//                       Score score_, bool isLiteralEntity (not found in
-//                       wordsfile)
-//
-// Fields:
-// - string word_: The string of the word, if it is an entity it will be
-//                 <entity>.
-// - bool isEntity_: True if the given word is an
-//                   entity, false if it's a word.
-// - TextRecordIndex contextId_: When creating the wordsfile docs from the
-//                               docsfile get split into so called contexts.
-//                               Those contexts overlap, meaning words and
-//                               entities are covered multiple times. Each
-//                               contextId corresponds to the next bigger or
-//                               equal docId.
-// - Score score_: Either 1 or 0 if isEntity is false. 0, 1, 100, 150 if
-//                 isEntity is true. (this info is only constructed on the
-//                 scientists.wordsfile.tsv) The score in the wordsfile is only
-//                 relevant for the counting scoring metric. Because of the
-//                 overlap of contexts the score is 1 if the word really has
-//                 been seen for the first time and 0 if not. If a doc contains
-//                 multiple mentions of a word there should be exactly as many
-//                 wordsfile lines of that word with score 1 as there are
-//                 mentions. The score for entities seems rather random and
-//                 since no clear explanation of the creation of wordsfiles
-//                 has been found yet they will stay rather random.
-// - bool isLiteralEntity_: This does not directly stem from the wordsfile.
-//                          When building the text index with literals, for
-//                          every literal there will be WordsFileLines for all
-//                          words in that literal. Additionally the whole
-//                          literal itself will be added as word with isEntity
-//                          being true. The need to count this comes only from
-//                          a trick used in testing right now. To be specific
-//                          the method getTextRecordFromResultTable
+
+/**
+ * @brief Represents a line in the words file.
+ *
+ * This struct holds information about a word or entity as it appears in the
+ * words file.
+ *
+ * The Fields are ordered in the same way the values follow in a line.
+ * Short field overview: string word_, bool isEntity_, TextRecordIndex
+ *                       contextId_, Score score_, bool isLiteralEntity_
+ *                       (not found in wordsfile)
+ *
+ * @details
+ *
+ * Fields:
+ * - string word_: The string of the word, if it is an entity it will be
+ *                 <entity>.
+ * - bool isEntity_: True if the given word is an entity, false if it's a word.
+ * - TextRecordIndex contextId_: When creating the wordsfile docs from the
+ *                               docsfile get split into so called contexts.
+ *                               Those contexts overlap, meaning words and
+ *                               entities are covered multiple times. Each
+ *                               contextId corresponds to the next bigger or
+ *                               equal docId.
+ * - Score score_: Either 1 or 0 if isEntity_ is false. 0, 1, 100, 150 if
+ *                 isEntity_ is true. (This info is only constructed on the
+ *                 scientists.wordsfile.tsv.) The score in the wordsfile is only
+ *                 relevant for the counting scoring metric. Because of the
+ *                 overlap of contexts the score is 1 if the word really has
+ *                 been seen for the first time and 0 if not. If a doc contains
+ *                 multiple mentions of a word there should be exactly as many
+ *                 wordsfile lines of that word with score 1 as there are
+ *                 mentions. The score for entities seems rather random and
+ *                 since no clear explanation of the creation of wordsfiles
+ *                 has been found yet they will stay rather random.
+ * - bool isLiteralEntity_: This does not directly stem from the wordsfile.
+ *                          When building the text index with literals, for
+ *                          every literal there will be WordsFileLines for all
+ *                          words in that literal. Additionally the whole
+ *                          literal itself will be added as a word with
+ *                          isEntity_ being true. The need to count this comes
+ *                          only from a trick used in testing right now.
+ *                          To be specific, the method
+ *                          getTextRecordFromResultTable.
+ */
 struct WordsFileLine {
   string word_;
   bool isEntity_;
@@ -63,25 +69,29 @@ struct WordsFileLine {
   bool isLiteralEntity_ = false;
 };
 
-// Represents a line from the docsfile.tsv, which stores everything given in
-// the file line.
-//
-// The Fields are ordered in the same way the values follow in a line.
-// Short field overview: DocumentIndex docId_, string docContent_
-//
-// Fields:
-//
-// - DocumentIndex docId_: The docId is needed to built inverted indices for
-//                         Scoring and building of the docsDB. It is also used
-//                         to return actual texts when searching for a word.
-//                         The word (and entity) search returns a table with
-//                         TextRecordIndex as type of one col. Those get mapped
-//                         to the next bigger or equal docId which is then
-//                         used to extract the text from the docsDB.
-//                         TODO: check if this behaviour is consintently
-//                         implemented
-// - string docContent_: The whole text given after the first tab of a line of
-//                       docsfile.
+/**
+ * @brief Represents a line from the docsfile.tsv.
+ *
+ * This struct stores everything given in a line of the docsfile.tsv.
+ *
+ * The Fields are ordered in the same way the values follow in a line.
+ * Short field overview: DocumentIndex docId_, string docContent_
+ *
+ * @details
+ *
+ * Fields:
+ * - DocumentIndex docId_: The docId is needed to build inverted indices for
+ *                         scoring and building of the docsDB. It is also used
+ *                         to return actual texts when searching for a word.
+ *                         The word (and entity) search returns a table with
+ *                         TextRecordIndex as type of one column. Those get
+ *                         mapped to the next bigger or equal docId which is
+ *                         then used to extract the text from the docsDB.
+ *                         TODO: check if this behaviour is consistently
+ *                         implemented.
+ * - string docContent_: The whole text given after the first tab of a line of
+ *                       docsfile.
+ */
 struct DocsFileLine {
   DocumentIndex docId_;
   string docContent_;
 
-// This class constructs an object that can be iterated to get the normalized
-// words of the text given. The text gets split into tokens using the
-// LiteralsTokenizationDelimiter and those tokens get normalized using
-// the localeManager. You can use the constructed object like
-// obj = TokenizeAndNormalizeText{text, localeManager}
-// for (auto normalizedWord : obj) { code }
-// The type of the value returned when iterating is std::string
-// TODO Adapt the comment (it is now a function, and you call it a
-// little bit differently)
-// TODO Also comment about the lifetime (the `text` and the
-// `localeManager` have to be kept alive while the tokenizer is being used;
-// the tokenizer only uses references.)
+/**
+ * @brief A function that can be used to tokenize and normalize a given text.
+ * @warning Both params are taken by reference, so the original objects have
+ * to be kept alive during the usage of the returned object.
+ * @param text The text to be tokenized and normalized.
+ * @param localeManager The localeManager to be used for normalization.
+ * @details This function can be used in the following way:
+ * for (auto normalizedWord : tokenizeAndNormalizeText(text, localeManager)) {
+ *   code;
+ * }
+ */
 inline auto tokenizeAndNormalizeText(std::string_view text,
                                      const LocaleManager& localeManager) {
   std::vector<std::string_view> split{
@@ -120,15 +129,16 @@ inline auto tokenizeAndNormalizeText(std::string_view text,
                                 return localeManager.getLowercaseUtf8(str);
                               });
 }
-
-// This class is the parent class of WordsFileParser and DocsFileParser and
-// it exists to reduce code duplication since the only difference between the
-// child classes is the line type returned
+/**
+ * @brief This class is the parent class of WordsFileParser and DocsFileParser
+ *
+ * @details It exists to reduce code duplication since the only difference
+ * between the child classes is the line type returned.
+ */
 class WordsAndDocsFileParser {
  public:
   explicit WordsAndDocsFileParser(const string& wordsOrDocsFile,
                                   LocaleManager localeManager);
-  ~WordsAndDocsFileParser() = default;
   explicit WordsAndDocsFileParser(const WordsAndDocsFileParser& other) = delete;
   WordsAndDocsFileParser& operator=(const WordsAndDocsFileParser& other) =
       delete;
@@ -138,13 +148,17 @@ class WordsAndDocsFileParser {
   LocaleManager localeManager_;
 };
 
-// This class takes in the a pathToWordsFile and a localeManager. It then can
-// be used to iterate the wordsFile while already normalizing the words using
-// the localeManager. (If words are entities it doesn't normalize them)
-// An object of this class can be iterated as follows:
-// obj = WordsFileParser{wordsFile, localeManager}
-// for (auto wordsFileLine : obj) { code }
-// The type of the value returned when iterating is WordsFileLine
+/**
+ * @brief This class takes in a pathToWordsFile and a localeManager. It then
+ * can be used to iterate the wordsFile while already normalizing the words
+ * using the localeManager. (If words are entities it doesn't normalize them.)
+ *
+ * @details An object of this class can be iterated as follows:
+ * for (auto wordsFileLine : WordsFileParser{wordsFile, localeManager}) {
+ *   code;
+ * }
+ * The type of the value returned when iterating is WordsFileLine
+ */
 class WordsFileParser : public WordsAndDocsFileParser,
                         public ad_utility::InputRangeFromGet<WordsFileLine> {
  public:
   using WordsAndDocsFileParser::WordsAndDocsFileParser;
   Storage get() override;
 
  private:
 #ifndef NDEBUG
   // Only used for sanity checks in debug builds
   TextRecordIndex lastCId_ = TextRecordIndex::make(0);
 #endif
 };
-// Works similar to WordsFileParser but it instead parses a docsFile and
-// doesn't normalize the text found in docsFile. To parse the returned
-// docContent_ of a DocsFileLine please refer to the TokenizeAndNormalizeText
-// class
+
+/**
+ * @brief This class takes in a pathToDocsFile and a localeManager. It then
+ * can be used to iterate over the docsFile to get the lines.
+ *
+ * @details An object of this class can be iterated as follows:
+ * for (auto docsFileLine : DocsFileParser{docsFile, localeManager}) {
+ *   code;
+ * }
+ * The type of the value returned when iterating is DocsFileLine
+ */
 class DocsFileParser : public WordsAndDocsFileParser,
                        public ad_utility::InputRangeFromGet<DocsFileLine> {
  public:
diff --git a/test/WordsAndDocsFileParserTest.cpp b/test/WordsAndDocsFileParserTest.cpp
index 8a31887d70..35ba0c4ee8 100644
--- a/test/WordsAndDocsFileParserTest.cpp
+++ b/test/WordsAndDocsFileParserTest.cpp
@@ -29,7 +29,8 @@ using StringVec = std::vector<std::string>;
 
 auto getLocaleManager = []() { return LocaleManager("en", "US", false); };
 
-auto wordsFileLineToWordLine = [](WordsFileLine wordsFileLine) -> WordLine {
+auto wordsFileLineToWordLine =
+    [](const WordsFileLine& wordsFileLine) -> WordLine {
   return std::make_tuple(wordsFileLine.word_, wordsFileLine.isEntity_,
                          static_cast<size_t>(wordsFileLine.contextId_.get()),
                          static_cast<size_t>(wordsFileLine.score_));
@@ -37,11 +38,11 @@ auto wordsFileLineToWordLine =
 
 // Lambda that takes in a path to a wordsFile to initialize the parser and an
 // expectedResult that is compared against the parser's output.
-auto testWordsFileParser = [](std::string wordsFilePath,
-                              WordLineVec expectedResult) {
-  WordsFileParser p(wordsFilePath, getLocaleManager());
+auto testWordsFileParser = [](const std::string& wordsFilePath,
+                              const WordLineVec& expectedResult) {
   size_t i = 0;
-  for (auto wordsFileLine : p) {
+  for (auto wordsFileLine :
+       WordsFileParser{wordsFilePath, getLocaleManager()}) {
     ASSERT_TRUE(i < expectedResult.size());
     WordLine testLine = wordsFileLineToWordLine(wordsFileLine);
 
@@ -57,17 +58,16 @@ auto testWordsFileParser = [](const std::string& wordsFilePath,
   ASSERT_EQ(i, expectedResult.size());
 };
 
-auto docsFileLineToDocLine = [](DocsFileLine docsFileLine) -> DocLine {
+auto docsFileLineToDocLine = [](const DocsFileLine& docsFileLine) -> DocLine {
   return std::make_tuple(static_cast<size_t>(docsFileLine.docId_.get()),
                          docsFileLine.docContent_);
 };
 
 // Same as testWordsFileParser but for docsFile
-auto testDocsFileParser = [](std::string docsFilePath,
-                             DocLineVec expectedResult) {
-  DocsFileParser p(docsFilePath, getLocaleManager());
+auto testDocsFileParser = [](const std::string& docsFilePath,
+                             const DocLineVec& expectedResult) {
   size_t i = 0;
-  for (auto docsFileLine : p) {
+  for (auto docsFileLine : DocsFileParser{docsFilePath, getLocaleManager()}) {
     ASSERT_TRUE(i < expectedResult.size());
     DocLine testLine = docsFileLineToDocLine(docsFileLine);
 
@@ -80,11 +80,13 @@ auto testDocsFileParser = [](const std::string& docsFilePath,
 }
 
+// Passing the testText as a copy to make sure it stays alive during the usage
+// of the tokenizer
 auto testTokenizeAndNormalizeText = [](std::string testText,
-                                       StringVec normalizedTextAsVec) {
-  auto testTokenizer = tokenizeAndNormalizeText(testText, getLocaleManager());
+                                       const StringVec& normalizedTextAsVec) {
   size_t i = 0;
-  for (auto normalizedWord : testTokenizer) {
+  for (auto normalizedWord :
+       tokenizeAndNormalizeText(testText, getLocaleManager())) {
     ASSERT_TRUE(i < normalizedTextAsVec.size());
     ASSERT_EQ(normalizedWord, normalizedTextAsVec.at(i));
 
@@ -107,10 +109,11 @@ TEST(WordsAndDocsFileParserTest, wordsFileParserTest) {
     << createWordsFileLineAsString("X", false, 1, 1);
   f.close();
 
-  WordLineVec expected = {
-      std::make_tuple("foo", false, 0, 2), std::make_tuple("foo", false, 0, 2),
-      std::make_tuple("Bär", true, 0, 1), std::make_tuple("äü", false, 0, 1),
-      std::make_tuple("x", false, 1, 1)};
+  WordLineVec expected = {{"foo", false, 0, 2},
+                          {"foo", false, 0, 2},
+                          {"Bär", true, 0, 1},
+                          {"äü", false, 0, 1},
+                          {"x", false, 1, 1}};
 
   testWordsFileParser("_testtmp.contexts.tsv", expected);
   remove("_testtmp.contexts.tsv");
@@ -127,10 +130,10 @@ TEST(WordsAndDocsFileParser, docsFileParserTest) {
     << createDocsFileLineAsString(190293, "Large docId");
   f.close();
 
-  DocLineVec expected = {std::make_pair(4, "This TeSt is OnlyCharcters"),
-                         std::make_pair(7, "Wh4t h4pp3ns t0 num83rs"),
-                         std::make_pair(8, "An( sp@ci*l ch.ar,:act=_er+s"),
-                         std::make_pair(190293, "Large docId")};
+  DocLineVec expected = {{4, "This TeSt is OnlyCharcters"},
+                         {7, "Wh4t h4pp3ns t0 num83rs"},
+                         {8, "An( sp@ci*l ch.ar,:act=_er+s"},
+                         {190293, "Large docId"}};
 
   testDocsFileParser("_testtmp.documents.tsv", expected);
   remove("_testtmp.documents.tsv");

From 46fbb980ccebc2888745041ebbe748ea6e4764e9 Mon Sep 17 00:00:00 2001
From: Felix Meisen
Date: Fri, 10 Jan 2025 15:33:25 +0100
Subject: [PATCH 10/15] Addressed sonar issues
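Const-correctness and encapsulation fixes suggested by Sonar:
`wordsInTextRecords` and `LiteralsTokenizationDelimiter::Find` become const,
`WordsFileParser::get` parses the line through a `std::string_view` instead
of allocating `std::string::substr` copies, and the stream and locale members
of `WordsAndDocsFileParser` move behind `getInputStream()` /
`getLocaleManager()` accessors. For orientation, the tab-separated wordsfile
record that `get()` parses looks like this (sketch; `\t` stands for a tab
character and `<someEntity>` is a made-up placeholder for an entity):

    foo\t0\t0\t2
    <someEntity>\t1\t0\t1

The fields are word, isEntity flag, contextId, and score, in that order.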
---
 src/index/IndexImpl.Text.cpp          |  2 +-
 src/index/IndexImpl.h                 |  2 +-
 src/parser/WordsAndDocsFileParser.cpp | 28 ++++++++++++++-------------
 src/parser/WordsAndDocsFileParser.h   | 10 +++++++---
 4 files changed, 24 insertions(+), 18 deletions(-)

diff --git a/src/index/IndexImpl.Text.cpp b/src/index/IndexImpl.Text.cpp
index b793189d6a..08afa9a112 100644
--- a/src/index/IndexImpl.Text.cpp
+++ b/src/index/IndexImpl.Text.cpp
@@ -23,7 +23,7 @@
 
 // _____________________________________________________________________________
 cppcoro::generator<WordsFileLine> IndexImpl::wordsInTextRecords(
-    const std::string& contextFile, bool addWordsFromLiterals) {
+    const std::string& contextFile, bool addWordsFromLiterals) const {
   auto localeManager = textVocab_.getLocaleManager();
   // ROUND 1: If context file aka wordsfile is not empty, read words from there.
   // Remember the last context id for the (optional) second round.
diff --git a/src/index/IndexImpl.h b/src/index/IndexImpl.h
index 5ce5eeaea3..1b491b04ab 100644
--- a/src/index/IndexImpl.h
+++ b/src/index/IndexImpl.h
@@ -522,7 +522,7 @@ class IndexImpl {
   // testing phase, once it works, it should be easy to include the IRIs and
   // literals from the external vocabulary as well).
   cppcoro::generator<WordsFileLine> wordsInTextRecords(
-      const std::string& contextFile, bool addWordsFromLiterals);
+      const std::string& contextFile, bool addWordsFromLiterals) const;
 
   size_t processWordsForVocabulary(const string& contextFile,
                                    bool addWordsFromLiterals);
diff --git a/src/parser/WordsAndDocsFileParser.cpp b/src/parser/WordsAndDocsFileParser.cpp
index 35bd955700..d5756e01d4 100644
--- a/src/parser/WordsAndDocsFileParser.cpp
+++ b/src/parser/WordsAndDocsFileParser.cpp
@@ -11,30 +11,32 @@
 #include "util/StringUtils.h"
 
 // _____________________________________________________________________________
-WordsAndDocsFileParser::WordsAndDocsFileParser(const string& wordsOrDocsFile,
-                                               LocaleManager localeManager)
-    : in_(wordsOrDocsFile), localeManager_(std::move(localeManager)) {}
+WordsAndDocsFileParser::WordsAndDocsFileParser(
+    const string& wordsOrDocsFile, const LocaleManager& localeManager)
+    : in_(wordsOrDocsFile), localeManager_(localeManager) {}
 
 // _____________________________________________________________________________
 ad_utility::InputRangeFromGet<WordsFileLine>::Storage WordsFileParser::get() {
   WordsFileLine line;
   string l;
-  if (!std::getline(in_, l)) {
+  if (!std::getline(getInputStream(), l)) {
     return std::nullopt;
   };
-  size_t i = l.find('\t');
+  std::string_view lineView(l);
+  size_t i = lineView.find('\t');
   assert(i != string::npos);
   size_t j = i + 2;
-  assert(j + 3 < l.size());
-  size_t k = l.find('\t', j + 2);
+  assert(j + 3 < lineView.size());
+  size_t k = lineView.find('\t', j + 2);
   assert(k != string::npos);
-  line.isEntity_ = (l[i + 1] == '1');
+  line.isEntity_ = (lineView[i + 1] == '1');
   line.word_ =
-      (line.isEntity_ ? l.substr(0, i)
-                      : localeManager_.getLowercaseUtf8(l.substr(0, i)));
+      (line.isEntity_
+           ? lineView.substr(0, i)
+           : getLocaleManager().getLowercaseUtf8(lineView.substr(0, i)));
   line.contextId_ =
-      TextRecordIndex::make(atol(l.substr(j + 1, k - j - 1).c_str()));
-  line.score_ = static_cast<Score>(atol(l.substr(k + 1).c_str()));
+      TextRecordIndex::make(atol(lineView.substr(j + 1, k - j - 1).data()));
+  line.score_ = static_cast<Score>(atol(lineView.substr(k + 1).data()));
 #ifndef NDEBUG
   if (lastCId_ > line.contextId_) {
     AD_THROW("ContextFile has to be sorted by context Id.");
   }
   lastCId_ = line.contextId_;
 #endif
   return line;
 }
 
 // _____________________________________________________________________________
 ad_utility::InputRangeFromGet<DocsFileLine>::Storage DocsFileParser::get() {
   DocsFileLine line;
   string l;
-  if (!std::getline(in_, l)) {
+  if (!std::getline(getInputStream(), l)) {
     return std::nullopt;
   };
   size_t i = l.find('\t');
   assert(i != string::npos);
   line.docId_ = DocumentIndex::make(atol(l.substr(0, i).c_str()));
diff --git a/src/parser/WordsAndDocsFileParser.h b/src/parser/WordsAndDocsFileParser.h
index 8365783a19..ebacfc2ebe 100644
--- a/src/parser/WordsAndDocsFileParser.h
+++ b/src/parser/WordsAndDocsFileParser.h
@@ -101,7 +101,7 @@ struct DocsFileLine {
 // The `Find` function returns the next delimiter in `text` after the given
 // `pos` or an empty substring if there is no next delimiter.
struct LiteralsTokenizationDelimiter { - absl::string_view Find(absl::string_view text, size_t pos) { + absl::string_view Find(absl::string_view text, size_t pos) const { auto isWordChar = [](char c) -> bool { return std::isalnum(c); }; auto found = std::find_if_not(text.begin() + pos, text.end(), isWordChar); if (found == text.end()) return text.substr(text.size()); @@ -138,12 +138,16 @@ inline auto tokenizeAndNormalizeText(std::string_view text, class WordsAndDocsFileParser { public: explicit WordsAndDocsFileParser(const string& wordsOrDocsFile, - LocaleManager localeManager); + const LocaleManager& localeManager); explicit WordsAndDocsFileParser(const WordsAndDocsFileParser& other) = delete; WordsAndDocsFileParser& operator=(const WordsAndDocsFileParser& other) = delete; protected: + std::ifstream& getInputStream() { return in_; } + const LocaleManager& getLocaleManager() { return localeManager_; } + + private: std::ifstream in_; LocaleManager localeManager_; }; @@ -165,8 +169,8 @@ class WordsFileParser : public WordsAndDocsFileParser, using WordsAndDocsFileParser::WordsAndDocsFileParser; Storage get() override; - private: #ifndef NDEBUG + private: // Only used for sanity checks in debug builds TextRecordIndex lastCId_ = TextRecordIndex::make(0); #endif From 1e0fc147c207234821248a0afae2434c57874c79 Mon Sep 17 00:00:00 2001 From: Felix Meisen Date: Fri, 10 Jan 2025 16:21:35 +0100 Subject: [PATCH 11/15] Removed the temporary localeManagers in WordsAndDocsFileParserTest.cpp --- test/WordsAndDocsFileParserTest.cpp | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/test/WordsAndDocsFileParserTest.cpp b/test/WordsAndDocsFileParserTest.cpp index 35ba0c4ee8..de7216ada7 100644 --- a/test/WordsAndDocsFileParserTest.cpp +++ b/test/WordsAndDocsFileParserTest.cpp @@ -27,7 +27,9 @@ using StringVec = std::vector; /// Lambdas -auto getLocaleManager = []() { return LocaleManager("en", "US", false); }; +auto getLocaleManager = []() -> LocaleManager { + return LocaleManager("en", "US", false); +}; auto wordsFileLineToWordLine = [](const WordsFileLine& wordsFileLine) -> WordLine { @@ -41,8 +43,8 @@ auto wordsFileLineToWordLine = auto testWordsFileParser = [](const std::string& wordsFilePath, const WordLineVec& expectedResult) { size_t i = 0; - for (auto wordsFileLine : - WordsFileParser{wordsFilePath, getLocaleManager()}) { + LocaleManager localeManager = getLocaleManager(); + for (auto wordsFileLine : WordsFileParser{wordsFilePath, localeManager}) { ASSERT_TRUE(i < expectedResult.size()); WordLine testLine = wordsFileLineToWordLine(wordsFileLine); @@ -67,7 +69,8 @@ auto docsFileLineToDocLine = [](const DocsFileLine& docsFileLine) -> DocLine { auto testDocsFileParser = [](const std::string& docsFilePath, const DocLineVec& expectedResult) { size_t i = 0; - for (auto docsFileLine : DocsFileParser{docsFilePath, getLocaleManager()}) { + LocaleManager localeManager = getLocaleManager(); + for (auto docsFileLine : DocsFileParser{docsFilePath, localeManager}) { ASSERT_TRUE(i < expectedResult.size()); DocLine testLine = docsFileLineToDocLine(docsFileLine); @@ -85,8 +88,9 @@ auto testDocsFileParser = [](const std::string& docsFilePath, auto testTokenizeAndNormalizeText = [](std::string testText, const StringVec& normalizedTextAsVec) { size_t i = 0; + LocaleManager localeManager = getLocaleManager(); for (auto normalizedWord : - tokenizeAndNormalizeText(testText, getLocaleManager())) { + tokenizeAndNormalizeText(testText, localeManager)) { ASSERT_TRUE(i < normalizedTextAsVec.size()); 
     ASSERT_EQ(normalizedWord, normalizedTextAsVec.at(i));
 
From 9f9738c76d294482d57c250c464873eca6b4c983 Mon Sep 17 00:00:00 2001
From: Felix Meisen
Date: Sat, 11 Jan 2025 12:46:24 +0100
Subject: [PATCH 12/15] Addressed more SonarQube problems
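Extract the entity and word handling of the big loop in
`processWordsForInvertedLists` into the helpers
`processEntityCaseDuringInvertedListProcessing`,
`processWordCaseDuringInvertedListProcessing`, and `logEntityNotFound`, and
take `contextFile` by value in the coroutine `wordsInTextRecords`
(presumably because a coroutine should not hold a reference to a caller-owned
string across suspension points). The loop body now reduces to roughly:

    if (line.isEntity_) {
      ++nofEntityPostings;
      processEntityCaseDuringInvertedListProcessing(
          line, entitiesInContext, nofLiterals, entityNotFoundErrorMsgCount);
    } else {
      ++nofWordPostings;
      processWordCaseDuringInvertedListProcessing(line, wordsInContext);
    }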
---
 src/index/IndexImpl.Text.cpp          | 86 +++++++++++++++++----------
 src/index/IndexImpl.h                 | 14 ++++-
 src/parser/WordsAndDocsFileParser.cpp |  6 +-
 src/parser/WordsAndDocsFileParser.h   |  4 +-
 4 files changed, 71 insertions(+), 39 deletions(-)

diff --git a/src/index/IndexImpl.Text.cpp b/src/index/IndexImpl.Text.cpp
index 08afa9a112..a27a5a7817 100644
--- a/src/index/IndexImpl.Text.cpp
+++ b/src/index/IndexImpl.Text.cpp
@@ -23,7 +23,7 @@
 
 // _____________________________________________________________________________
 cppcoro::generator<WordsFileLine> IndexImpl::wordsInTextRecords(
-    const std::string& contextFile, bool addWordsFromLiterals) const {
+    std::string contextFile, bool addWordsFromLiterals) const {
   auto localeManager = textVocab_.getLocaleManager();
   // ROUND 1: If context file aka wordsfile is not empty, read words from there.
   // Remember the last context id for the (optional) second round.
@@ -62,6 +62,56 @@ cppcoro::generator<WordsFileLine> IndexImpl::wordsInTextRecords(
   }
 }
 
+// _____________________________________________________________________________
+void IndexImpl::processEntityCaseDuringInvertedListProcessing(
+    const WordsFileLine& line,
+    ad_utility::HashMap<Id, Score>& entitiesInContext, size_t& nofLiterals,
+    size_t& entityNotFoundErrorMsgCount) const {
+  VocabIndex eid;
+  // TODO Currently only IRIs and strings from the vocabulary can
+  // be tagged entities in the text index (no doubles, ints, etc).
+  if (getVocab().getId(line.word_, &eid)) {
+    // Note that `entitiesInContext` is a HashMap, so the `Id`s don't have
+    // to be contiguous.
+    entitiesInContext[Id::makeFromVocabIndex(eid)] += line.score_;
+    if (line.isLiteralEntity_) {
+      ++nofLiterals;
+    }
+  } else {
+    logEntityNotFound(line.word_, entityNotFoundErrorMsgCount);
+  }
+}
+
+// _____________________________________________________________________________
+void IndexImpl::processWordCaseDuringInvertedListProcessing(
+    const WordsFileLine& line,
+    ad_utility::HashMap<WordIndex, Score>& wordsInContext) const {
+  // TODO Let the `textVocab_` return a `WordIndex` directly.
+  WordVocabIndex vid;
+  bool ret = textVocab_.getId(line.word_, &vid);
+  WordIndex wid = vid.get();
+  if (!ret) {
+    LOG(ERROR) << "ERROR: word \"" << line.word_ << "\" "
+               << "not found in textVocab. Terminating\n";
+    AD_FAIL();
+  }
+  wordsInContext[wid] += line.score_;
+}
+
+// _____________________________________________________________________________
+void IndexImpl::logEntityNotFound(const string& word,
+                                  size_t& entityNotFoundErrorMsgCount) const {
+  if (entityNotFoundErrorMsgCount < 20) {
+    LOG(WARN) << "Entity from text not in KB: " << word << '\n';
+    if (++entityNotFoundErrorMsgCount == 20) {
+      LOG(WARN) << "There are more entities not in the KB..."
+                << " suppressing further warnings...\n";
+    }
+  } else {
+    entityNotFoundErrorMsgCount++;
+  }
+}
+
 // _____________________________________________________________________________
 void IndexImpl::addTextFromContextFile(const string& contextFile,
                                        bool addWordsFromLiterals) {
@@ -234,39 +284,11 @@ void IndexImpl::processWordsForInvertedLists(const string& contextFile,
     }
     if (line.isEntity_) {
       ++nofEntityPostings;
-      // TODO Currently only IRIs and strings from the vocabulary can
-      // be tagged entities in the text index (no doubles, ints, etc).
-      VocabIndex eid;
-      if (getVocab().getId(line.word_, &eid)) {
-        // Note that `entitiesInContext` is a HashMap, so the `Id`s don't have
-        // to be contiguous.
-        entitiesInContext[Id::makeFromVocabIndex(eid)] += line.score_;
-        if (line.isLiteralEntity_) {
-          ++nofLiterals;
-        }
-      } else {
-        if (entityNotFoundErrorMsgCount < 20) {
-          LOG(WARN) << "Entity from text not in KB: " << line.word_ << '\n';
-          if (++entityNotFoundErrorMsgCount == 20) {
-            LOG(WARN) << "There are more entities not in the KB..."
-                      << " suppressing further warnings...\n";
-          }
-        } else {
-          entityNotFoundErrorMsgCount++;
-        }
-      }
+      processEntityCaseDuringInvertedListProcessing(
+          line, entitiesInContext, nofLiterals, entityNotFoundErrorMsgCount);
     } else {
       ++nofWordPostings;
-      // TODO Let the `textVocab_` return a `WordIndex` directly.
-      WordVocabIndex vid;
-      bool ret = textVocab_.getId(line.word_, &vid);
-      WordIndex wid = vid.get();
-      if (!ret) {
-        LOG(ERROR) << "ERROR: word \"" << line.word_ << "\" "
-                   << "not found in textVocab. Terminating\n";
-        AD_FAIL();
-      }
-      wordsInContext[wid] += line.score_;
+      processWordCaseDuringInvertedListProcessing(line, wordsInContext);
     }
   }
   if (entityNotFoundErrorMsgCount > 0) {
diff --git a/src/index/IndexImpl.h b/src/index/IndexImpl.h
index 1b491b04ab..ac0003db87 100644
--- a/src/index/IndexImpl.h
+++ b/src/index/IndexImpl.h
@@ -522,7 +522,19 @@ class IndexImpl {
   // testing phase, once it works, it should be easy to include the IRIs and
   // literals from the external vocabulary as well).
   cppcoro::generator<WordsFileLine> wordsInTextRecords(
-      const std::string& contextFile, bool addWordsFromLiterals) const;
+      std::string contextFile, bool addWordsFromLiterals) const;
+
+  void processEntityCaseDuringInvertedListProcessing(
+      const WordsFileLine& line,
+      ad_utility::HashMap<Id, Score>& entitiesInContext, size_t& nofLiterals,
+      size_t& entityNotFoundErrorMsgCount) const;
+
+  void processWordCaseDuringInvertedListProcessing(
+      const WordsFileLine& line,
+      ad_utility::HashMap<WordIndex, Score>& wordsInContext) const;
+
+  void logEntityNotFound(const string& word,
+                         size_t& entityNotFoundErrorMsgCount) const;
 
   size_t processWordsForVocabulary(const string& contextFile,
                                    bool addWordsFromLiterals);
diff --git a/src/parser/WordsAndDocsFileParser.cpp b/src/parser/WordsAndDocsFileParser.cpp
index d5756e01d4..e7d36974c6 100644
--- a/src/parser/WordsAndDocsFileParser.cpp
+++ b/src/parser/WordsAndDocsFileParser.cpp
@@ -21,7 +21,7 @@ ad_utility::InputRangeFromGet<WordsFileLine>::Storage WordsFileParser::get() {
   string l;
   if (!std::getline(getInputStream(), l)) {
     return std::nullopt;
-  };
+  }
   std::string_view lineView(l);
   size_t i = lineView.find('\t');
   assert(i != string::npos);
@@ -48,11 +48,11 @@ ad_utility::InputRangeFromGet<DocsFileLine>::Storage DocsFileParser::get() {
-  DocsFileLine line;
   string l;
   if (!std::getline(getInputStream(), l)) {
     return std::nullopt;
-  };
+  }
+  DocsFileLine line;
   size_t i = l.find('\t');
   assert(i != string::npos);
   line.docId_ = DocumentIndex::make(atol(l.substr(0, i).c_str()));
diff --git a/src/parser/WordsAndDocsFileParser.h b/src/parser/WordsAndDocsFileParser.h
index ebacfc2ebe..1fc80523ff 100644
--- a/src/parser/WordsAndDocsFileParser.h
+++ b/src/parser/WordsAndDocsFileParser.h
@@ -87,8 +87,6 @@ struct WordsFileLine {
 *                         The word (and entity) search returns a table with
 *                         TextRecordIndex as type of one column. Those get
 *                         mapped to the next bigger or equal docId which is
 *                         then used to extract the text from the docsDB.
- * TODO: check if this behaviour is consistently - * implemented. * - string docContent_: The whole text given after the first tab of a line of * docsfile. */ @@ -145,7 +143,7 @@ class WordsAndDocsFileParser { protected: std::ifstream& getInputStream() { return in_; } - const LocaleManager& getLocaleManager() { return localeManager_; } + const LocaleManager& getLocaleManager() const { return localeManager_; } private: std::ifstream in_; From a55f2be2ba377e2f1f8c8b3c0e90b419a6fbb55a Mon Sep 17 00:00:00 2001 From: Felix Meisen Date: Sat, 11 Jan 2025 14:48:14 +0100 Subject: [PATCH 13/15] For now excluding helper functions from code coverage since they could be outsourced in further refactorings --- src/index/IndexImpl.Text.cpp | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/index/IndexImpl.Text.cpp b/src/index/IndexImpl.Text.cpp index a27a5a7817..bde9440aa6 100644 --- a/src/index/IndexImpl.Text.cpp +++ b/src/index/IndexImpl.Text.cpp @@ -62,6 +62,7 @@ cppcoro::generator IndexImpl::wordsInTextRecords( } } +// LCOV_EXCL_START // _____________________________________________________________________________ void IndexImpl::processEntityCaseDuringInvertedListProcessing( const WordsFileLine& line, @@ -111,6 +112,7 @@ void IndexImpl::logEntityNotFound(const string& word, entityNotFoundErrorMsgCount++; } } +// LCOV_EXCL_STOP // _____________________________________________________________________________ void IndexImpl::addTextFromContextFile(const string& contextFile, From bea5936cc0c5ea7468cdd41757920a08155a3951 Mon Sep 17 00:00:00 2001 From: Felix Meisen Date: Sat, 11 Jan 2025 15:39:33 +0100 Subject: [PATCH 14/15] Reverting last commit --- src/index/IndexImpl.Text.cpp | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/index/IndexImpl.Text.cpp b/src/index/IndexImpl.Text.cpp index bde9440aa6..a27a5a7817 100644 --- a/src/index/IndexImpl.Text.cpp +++ b/src/index/IndexImpl.Text.cpp @@ -62,7 +62,6 @@ cppcoro::generator IndexImpl::wordsInTextRecords( } } -// LCOV_EXCL_START // _____________________________________________________________________________ void IndexImpl::processEntityCaseDuringInvertedListProcessing( const WordsFileLine& line, @@ -112,7 +111,6 @@ void IndexImpl::logEntityNotFound(const string& word, entityNotFoundErrorMsgCount++; } } -// LCOV_EXCL_STOP // _____________________________________________________________________________ void IndexImpl::addTextFromContextFile(const string& contextFile, From 349be6d4518148ad207f90881b3cec2329fa7676 Mon Sep 17 00:00:00 2001 From: Felix Meisen Date: Wed, 15 Jan 2025 14:14:49 +0100 Subject: [PATCH 15/15] Small improvement --- src/index/IndexImpl.Text.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/index/IndexImpl.Text.cpp b/src/index/IndexImpl.Text.cpp index a27a5a7817..d1e6a70777 100644 --- a/src/index/IndexImpl.Text.cpp +++ b/src/index/IndexImpl.Text.cpp @@ -54,7 +54,7 @@ cppcoro::generator IndexImpl::wordsInTextRecords( textView = textView.substr(0, textView.rfind('"')); textView.remove_prefix(1); for (auto word : tokenizeAndNormalizeText(textView, localeManager)) { - WordsFileLine wordLine{word, false, contextId, 1}; + WordsFileLine wordLine{std::move(word), false, contextId, 1}; co_yield wordLine; } contextId = contextId.incremented();