From b93fde4d4e85ab4784b04b7acfcd9988eda91b97 Mon Sep 17 00:00:00 2001
From: Felix Meisen <fesemeisen@outlook.de>
Date: Sat, 28 Dec 2024 01:52:18 +0100
Subject: [PATCH 01/15] Extra classes for Words- and Docsfile parsing

---
 src/global/IndexTypes.h               |  1 +
 src/index/IndexImpl.Text.cpp          | 47 ++++++++++---------
 src/index/IndexImpl.h                 |  4 +-
 src/parser/CMakeLists.txt             |  2 +-
 src/parser/ContextFileParser.cpp      | 46 -------------------
 src/parser/ContextFileParser.h        | 45 ------------------
 src/parser/WordsAndDocsFileParser.cpp | 61 +++++++++++++++++++++++++
 src/parser/WordsAndDocsFileParser.h   | 63 +++++++++++++++++++++++++
 test/CMakeLists.txt                   |  2 +-
 test/ContextFileParserTest.cpp        | 59 ------------------------
 test/WordsAndDocsFileParserTest.cpp   | 66 +++++++++++++++++++++++++++
 11 files changed, 218 insertions(+), 178 deletions(-)
 delete mode 100644 src/parser/ContextFileParser.cpp
 delete mode 100644 src/parser/ContextFileParser.h
 create mode 100644 src/parser/WordsAndDocsFileParser.cpp
 create mode 100644 src/parser/WordsAndDocsFileParser.h
 delete mode 100644 test/ContextFileParserTest.cpp
 create mode 100644 test/WordsAndDocsFileParserTest.cpp

diff --git a/src/global/IndexTypes.h b/src/global/IndexTypes.h
index 08ee960d00..4868e59694 100644
--- a/src/global/IndexTypes.h
+++ b/src/global/IndexTypes.h
@@ -16,3 +16,4 @@ using LocalVocabIndex = const LocalVocabEntry*;
 using TextRecordIndex = ad_utility::TypedIndex<uint64_t, "TextRecordIndex">;
 using WordVocabIndex = ad_utility::TypedIndex<uint64_t, "WordVocabIndex">;
 using BlankNodeIndex = ad_utility::TypedIndex<uint64_t, "BlankNodeIndex">;
+using DocumentIndex = ad_utility::TypedIndex<uint64_t, "DocumentIndex">;
diff --git a/src/index/IndexImpl.Text.cpp b/src/index/IndexImpl.Text.cpp
index 76c0015974..022c49b8cd 100644
--- a/src/index/IndexImpl.Text.cpp
+++ b/src/index/IndexImpl.Text.cpp
@@ -17,7 +17,7 @@
 #include "backports/algorithm.h"
 #include "engine/CallFixedSize.h"
 #include "index/FTSAlgorithms.h"
-#include "parser/ContextFileParser.h"
+#include "parser/WordsAndDocsFileParser.h"
 #include "util/Conversions.h"
 #include "util/Simple8bCode.h"
 
@@ -38,18 +38,17 @@ struct LiteralsTokenizationDelimiter {
 } // namespace
 
 // _____________________________________________________________________________
-cppcoro::generator<ContextFileParser::Line> IndexImpl::wordsInTextRecords(
+cppcoro::generator<WordsFileLine> IndexImpl::wordsInTextRecords(
     const std::string& contextFile, bool addWordsFromLiterals) {
   auto localeManager = textVocab_.getLocaleManager();
   // ROUND 1: If context file aka wordsfile is not empty, read words from there.
   // Remember the last context id for the (optional) second round.
TextRecordIndex contextId = TextRecordIndex::make(0); if (!contextFile.empty()) { - ContextFileParser::Line line; - ContextFileParser p(contextFile, localeManager); + WordsFileParser p(contextFile, localeManager); ad_utility::HashSet items; - while (p.getLine(line)) { - contextId = line._contextId; + for (auto line : p) { + contextId = line.contextId_; co_yield line; } if (contextId > TextRecordIndex::make(0)) { @@ -65,7 +64,7 @@ cppcoro::generator IndexImpl::wordsInTextRecords( if (!isLiteral(text)) { continue; } - ContextFileParser::Line entityLine{text, true, contextId, 1, true}; + WordsFileLine entityLine{text, true, contextId, 1, true}; co_yield entityLine; std::string_view textView = text; textView = textView.substr(0, textView.rfind('"')); @@ -73,7 +72,7 @@ cppcoro::generator IndexImpl::wordsInTextRecords( for (auto word : absl::StrSplit(textView, LiteralsTokenizationDelimiter{}, absl::SkipEmpty{})) { auto wordNormalized = localeManager.getLowercaseUtf8(word); - ContextFileParser::Line wordLine{wordNormalized, false, contextId, 1}; + WordsFileLine wordLine{wordNormalized, false, contextId, 1}; co_yield wordLine; } contextId = contextId.incremented(); @@ -214,12 +213,12 @@ size_t IndexImpl::processWordsForVocabulary(string const& contextFile, for (auto line : wordsInTextRecords(contextFile, addWordsFromLiterals)) { ++numLines; // LOG(INFO) << "LINE: " - // << std::setw(50) << line._word << " " - // << line._isEntity << "\t" - // << line._contextId.get() << "\t" - // << line._score << std::endl; - if (!line._isEntity) { - distinctWords.insert(line._word); + // << std::setw(50) << line.word_ << " " + // << line.isEntity_ << "\t" + // << line.contextId_.get() << "\t" + // << line.score_ << std::endl; + if (!line.isEntity_) { + distinctWords.insert(line.word_); } } textVocab_.createFromSet(distinctWords, onDiskBase_ + ".text.vocabulary"); @@ -243,29 +242,29 @@ void IndexImpl::processWordsForInvertedLists(const string& contextFile, size_t nofLiterals = 0; for (auto line : wordsInTextRecords(contextFile, addWordsFromLiterals)) { - if (line._contextId != currentContext) { + if (line.contextId_ != currentContext) { ++nofContexts; addContextToVector(writer, currentContext, wordsInContext, entitiesInContext); - currentContext = line._contextId; + currentContext = line.contextId_; wordsInContext.clear(); entitiesInContext.clear(); } - if (line._isEntity) { + if (line.isEntity_) { ++nofEntityPostings; // TODO Currently only IRIs and strings from the vocabulary can // be tagged entities in the text index (no doubles, ints, etc). VocabIndex eid; - if (getVocab().getId(line._word, &eid)) { + if (getVocab().getId(line.word_, &eid)) { // Note that `entitiesInContext` is a HashMap, so the `Id`s don't have // to be contiguous. - entitiesInContext[Id::makeFromVocabIndex(eid)] += line._score; - if (line._isLiteralEntity) { + entitiesInContext[Id::makeFromVocabIndex(eid)] += line.score_; + if (line.isLiteralEntity_) { ++nofLiterals; } } else { if (entityNotFoundErrorMsgCount < 20) { - LOG(WARN) << "Entity from text not in KB: " << line._word << '\n'; + LOG(WARN) << "Entity from text not in KB: " << line.word_ << '\n'; if (++entityNotFoundErrorMsgCount == 20) { LOG(WARN) << "There are more entities not in the KB..." << " suppressing further warnings...\n"; @@ -278,14 +277,14 @@ void IndexImpl::processWordsForInvertedLists(const string& contextFile, ++nofWordPostings; // TODO Let the `textVocab_` return a `WordIndex` directly. 
WordVocabIndex vid; - bool ret = textVocab_.getId(line._word, &vid); + bool ret = textVocab_.getId(line.word_, &vid); WordIndex wid = vid.get(); if (!ret) { - LOG(ERROR) << "ERROR: word \"" << line._word << "\" " + LOG(ERROR) << "ERROR: word \"" << line.word_ << "\" " << "not found in textVocab. Terminating\n"; AD_FAIL(); } - wordsInContext[wid] += line._score; + wordsInContext[wid] += line.score_; } } if (entityNotFoundErrorMsgCount > 0) { diff --git a/src/index/IndexImpl.h b/src/index/IndexImpl.h index b98d0d5788..2fab089280 100644 --- a/src/index/IndexImpl.h +++ b/src/index/IndexImpl.h @@ -29,9 +29,9 @@ #include "index/TextMetaData.h" #include "index/Vocabulary.h" #include "index/VocabularyMerger.h" -#include "parser/ContextFileParser.h" #include "parser/RdfParser.h" #include "parser/TripleComponent.h" +#include "parser/WordsAndDocsFileParser.h" #include "util/BufferedVector.h" #include "util/CancellationHandle.h" #include "util/File.h" @@ -515,7 +515,7 @@ class IndexImpl { // TODO: So far, this is limited to the internal vocabulary (still in the // testing phase, once it works, it should be easy to include the IRIs and // literals from the external vocabulary as well). - cppcoro::generator wordsInTextRecords( + cppcoro::generator wordsInTextRecords( const std::string& contextFile, bool addWordsFromLiterals); size_t processWordsForVocabulary(const string& contextFile, diff --git a/src/parser/CMakeLists.txt b/src/parser/CMakeLists.txt index be4b3db44c..6fa123a793 100644 --- a/src/parser/CMakeLists.txt +++ b/src/parser/CMakeLists.txt @@ -10,7 +10,7 @@ add_library(parser ParsedQuery.cpp RdfParser.cpp Tokenizer.cpp - ContextFileParser.cpp + WordsAndDocsFileParser.cpp TurtleTokenId.h ParallelBuffer.cpp SparqlParserHelpers.cpp diff --git a/src/parser/ContextFileParser.cpp b/src/parser/ContextFileParser.cpp deleted file mode 100644 index 523bde486b..0000000000 --- a/src/parser/ContextFileParser.cpp +++ /dev/null @@ -1,46 +0,0 @@ -// Copyright 2015, University of Freiburg, -// Chair of Algorithms and Data Structures. -// Author: Björn Buchhold (buchhold@informatik.uni-freiburg.de) - -#include "./ContextFileParser.h" - -#include - -#include "../util/Exception.h" -#include "../util/StringUtils.h" - -// _____________________________________________________________________________ -ContextFileParser::ContextFileParser(const string& contextFile, - LocaleManager localeManager) - : _in(contextFile), _localeManager(std::move(localeManager)) {} - -// _____________________________________________________________________________ -ContextFileParser::~ContextFileParser() { _in.close(); } - -// _____________________________________________________________________________ -bool ContextFileParser::getLine(ContextFileParser::Line& line) { - string l; - if (std::getline(_in, l)) { - size_t i = l.find('\t'); - assert(i != string::npos); - size_t j = i + 2; - assert(j + 3 < l.size()); - size_t k = l.find('\t', j + 2); - assert(k != string::npos); - line._isEntity = (l[i + 1] == '1'); - line._word = - (line._isEntity ? 
l.substr(0, i) - : _localeManager.getLowercaseUtf8(l.substr(0, i))); - line._contextId = - TextRecordIndex::make(atol(l.substr(j + 1, k - j - 1).c_str())); - line._score = static_cast(atol(l.substr(k + 1).c_str())); -#ifndef NDEBUG - if (_lastCId > line._contextId) { - AD_THROW("ContextFile has to be sorted by context Id."); - } - _lastCId = line._contextId; -#endif - return true; - } - return false; -} diff --git a/src/parser/ContextFileParser.h b/src/parser/ContextFileParser.h deleted file mode 100644 index ba8d7bac9c..0000000000 --- a/src/parser/ContextFileParser.h +++ /dev/null @@ -1,45 +0,0 @@ -// Copyright 2015, University of Freiburg, -// Chair of Algorithms and Data Structures. -// Author: Björn Buchhold (buchhold@informatik.uni-freiburg.de) - -#pragma once - -#include - -#include -#include - -#include "../global/Id.h" -#include "../index/StringSortComparator.h" - -using std::string; - -class ContextFileParser { - public: - struct Line { - string _word; - bool _isEntity; - TextRecordIndex _contextId; - Score _score; - bool _isLiteralEntity = false; - }; - - explicit ContextFileParser(const string& contextFile, - LocaleManager localeManager); - ~ContextFileParser(); - // Don't allow copy & assignment - explicit ContextFileParser(const ContextFileParser& other) = delete; - ContextFileParser& operator=(const ContextFileParser& other) = delete; - - // Get the next line from the file. - // Returns true if something was stored. - bool getLine(Line&); - - private: - std::ifstream _in; -#ifndef NDEBUG - // Only used for sanity checks in debug builds - TextRecordIndex _lastCId = TextRecordIndex::make(0); -#endif - LocaleManager _localeManager; -}; diff --git a/src/parser/WordsAndDocsFileParser.cpp b/src/parser/WordsAndDocsFileParser.cpp new file mode 100644 index 0000000000..e45f47c53a --- /dev/null +++ b/src/parser/WordsAndDocsFileParser.cpp @@ -0,0 +1,61 @@ +// Copyright 2015, University of Freiburg, +// Chair of Algorithms and Data Structures. +// Author: Björn Buchhold (buchhold@informatik.uni-freiburg.de) + +#include "parser/WordsAndDocsFileParser.h" + +#include + +#include "../util/Exception.h" +#include "../util/StringUtils.h" + +// _____________________________________________________________________________ +WordsAndDocsFileParser::WordsAndDocsFileParser(const string& wordsOrDocsFile, + LocaleManager localeManager) + : in_(wordsOrDocsFile), localeManager_(std::move(localeManager)) {} + +// _____________________________________________________________________________ +WordsAndDocsFileParser::~WordsAndDocsFileParser() { in_.close(); } + +// _____________________________________________________________________________ +ad_utility::InputRangeFromGet::Storage WordsFileParser::get() { + WordsFileLine line; + string l; + if (std::getline(in_, l)) { + size_t i = l.find('\t'); + assert(i != string::npos); + size_t j = i + 2; + assert(j + 3 < l.size()); + size_t k = l.find('\t', j + 2); + assert(k != string::npos); + line.isEntity_ = (l[i + 1] == '1'); + line.word_ = + (line.isEntity_ ? 
l.substr(0, i) + : localeManager_.getLowercaseUtf8(l.substr(0, i))); + line.contextId_ = + TextRecordIndex::make(atol(l.substr(j + 1, k - j - 1).c_str())); + line.score_ = static_cast(atol(l.substr(k + 1).c_str())); +#ifndef NDEBUG + if (lastCId_ > line.contextId_) { + AD_THROW("ContextFile has to be sorted by context Id."); + } + lastCId_ = line.contextId_; +#endif + return line; + } + return std::nullopt; +} + +// _____________________________________________________________________________ +ad_utility::InputRangeFromGet::Storage DocsFileParser::get() { + DocsFileLine line; + string l; + if (std::getline(in_, l)) { + size_t i = l.find('\t'); + assert(i != string::npos); + line.docId_ = DocumentIndex::make(atol(l.substr(0, i).c_str())); + line.docContent_ = l.substr(i + 1); + return line; + } + return std::nullopt; +} diff --git a/src/parser/WordsAndDocsFileParser.h b/src/parser/WordsAndDocsFileParser.h new file mode 100644 index 0000000000..c72bf24d61 --- /dev/null +++ b/src/parser/WordsAndDocsFileParser.h @@ -0,0 +1,63 @@ +// Copyright 2015, University of Freiburg, +// Chair of Algorithms and Data Structures. +// Author: Björn Buchhold (buchhold@informatik.uni-freiburg.de) + +#pragma once + +#include + +#include +#include + +#include "global/Id.h" +#include "index/StringSortComparator.h" +#include "util/Iterators.h" + +using std::string; + +struct WordsFileLine { + string word_; + bool isEntity_; + TextRecordIndex contextId_; + Score score_; + bool isLiteralEntity_ = false; +}; + +struct DocsFileLine { + string docContent_; + DocumentIndex docId_; +}; + +class WordsAndDocsFileParser { + public: + explicit WordsAndDocsFileParser(const string& wordsOrDocsFile, + LocaleManager localeManager); + ~WordsAndDocsFileParser(); + explicit WordsAndDocsFileParser(const WordsAndDocsFileParser& other) = delete; + WordsAndDocsFileParser& operator=(const WordsAndDocsFileParser& other) = + delete; + + protected: + std::ifstream in_; + LocaleManager localeManager_; +}; + +class WordsFileParser : public WordsAndDocsFileParser, + public ad_utility::InputRangeFromGet { + public: + using WordsAndDocsFileParser::WordsAndDocsFileParser; + Storage get() override; + + private: +#ifndef NDEBUG + // Only used for sanity checks in debug builds + TextRecordIndex lastCId_ = TextRecordIndex::make(0); +#endif +}; + +class DocsFileParser : public WordsAndDocsFileParser, + public ad_utility::InputRangeFromGet { + public: + using WordsAndDocsFileParser::WordsAndDocsFileParser; + Storage get() override; +}; diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index 46a83d5b7a..a8cd128e5c 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -137,7 +137,7 @@ addLinkAndDiscoverTestSerial(FileTest) addLinkAndDiscoverTest(Simple8bTest) -addLinkAndDiscoverTest(ContextFileParserTest parser) +addLinkAndDiscoverTest(WordsAndDocsFileParserTest parser) addLinkAndDiscoverTest(IndexMetaDataTest index) diff --git a/test/ContextFileParserTest.cpp b/test/ContextFileParserTest.cpp deleted file mode 100644 index 2b27c0f34d..0000000000 --- a/test/ContextFileParserTest.cpp +++ /dev/null @@ -1,59 +0,0 @@ -// Copyright 2015, University of Freiburg, -// Chair of Algorithms and Data Structures. 
-// Author: Björn Buchhold (buchhold@informatik.uni-freiburg.de) - -#include - -#include -#include - -#include "../src/parser/ContextFileParser.h" - -TEST(ContextFileParserTest, getLineTest) { - char* locale = setlocale(LC_CTYPE, ""); - std::cout << "Set locale LC_CTYPE to: " << locale << std::endl; - - std::fstream f("_testtmp.contexts.tsv", std::ios_base::out); - f << "Foo\t0\t0\t2\n" - "foo\t0\t0\t2\n" - "Bär\t1\t0\t1\n" - "Äü\t0\t0\t1\n" - "X\t0\t1\t1\n"; - - f.close(); - ContextFileParser p("_testtmp.contexts.tsv", - LocaleManager("en", "US", false)); - ContextFileParser::Line a; - ASSERT_TRUE(p.getLine(a)); - ASSERT_EQ("foo", a._word); - ASSERT_FALSE(a._isEntity); - ASSERT_EQ(0u, a._contextId.get()); - ASSERT_EQ(2u, a._score); - - ASSERT_TRUE(p.getLine(a)); - ASSERT_EQ("foo", a._word); - ASSERT_FALSE(a._isEntity); - ASSERT_EQ(0u, a._contextId.get()); - ASSERT_EQ(2u, a._score); - - ASSERT_TRUE(p.getLine(a)); - ASSERT_EQ("Bär", a._word); - ASSERT_TRUE(a._isEntity); - ASSERT_EQ(0u, a._contextId.get()); - ASSERT_EQ(1u, a._score); - - ASSERT_TRUE(p.getLine(a)); - ASSERT_EQ("äü", a._word); - ASSERT_FALSE(a._isEntity); - ASSERT_EQ(0u, a._contextId.get()); - ASSERT_EQ(1u, a._score); - - ASSERT_TRUE(p.getLine(a)); - ASSERT_EQ("x", a._word); - ASSERT_FALSE(a._isEntity); - ASSERT_EQ(1u, a._contextId.get()); - ASSERT_EQ(1u, a._score); - - ASSERT_FALSE(p.getLine(a)); - remove("_testtmp.contexts.tsv"); -}; diff --git a/test/WordsAndDocsFileParserTest.cpp b/test/WordsAndDocsFileParserTest.cpp new file mode 100644 index 0000000000..bc3d1950a5 --- /dev/null +++ b/test/WordsAndDocsFileParserTest.cpp @@ -0,0 +1,66 @@ +// Copyright 2015, University of Freiburg, +// Chair of Algorithms and Data Structures. +// Author: Björn Buchhold (buchhold@informatik.uni-freiburg.de) + +#include + +#include +#include + +#include "parser/WordsAndDocsFileParser.h" + +TEST(WordsAndDocsFileParserTest, wordsFileGetLineTest) { + char* locale = setlocale(LC_CTYPE, ""); + std::cout << "Set locale LC_CTYPE to: " << locale << std::endl; + + std::fstream f("_testtmp.contexts.tsv", std::ios_base::out); + f << "Foo\t0\t0\t2\n" + "foo\t0\t0\t2\n" + "Bär\t1\t0\t1\n" + "Äü\t0\t0\t1\n" + "X\t0\t1\t1\n"; + + f.close(); + WordsFileParser p("_testtmp.contexts.tsv", LocaleManager("en", "US", false)); + size_t i = 0; + for (auto line : p) { + switch (i) { + case 0: + ASSERT_EQ("foo", line.word_); + ASSERT_FALSE(line.isEntity_); + ASSERT_EQ(0u, line.contextId_.get()); + ASSERT_EQ(2u, line.score_); + break; + case 1: + ASSERT_EQ("foo", line.word_); + ASSERT_FALSE(line.isEntity_); + ASSERT_EQ(0u, line.contextId_.get()); + ASSERT_EQ(2u, line.score_); + break; + case 2: + ASSERT_EQ("Bär", line.word_); + ASSERT_TRUE(line.isEntity_); + ASSERT_EQ(0u, line.contextId_.get()); + ASSERT_EQ(1u, line.score_); + break; + case 3: + ASSERT_EQ("äü", line.word_); + ASSERT_FALSE(line.isEntity_); + ASSERT_EQ(0u, line.contextId_.get()); + ASSERT_EQ(1u, line.score_); + break; + case 4: + ASSERT_EQ("x", line.word_); + ASSERT_FALSE(line.isEntity_); + ASSERT_EQ(1u, line.contextId_.get()); + ASSERT_EQ(1u, line.score_); + break; + default: + // shouldn't be reached + ASSERT_TRUE(false); + } + ++i; + } + ASSERT_EQ(i, 5); + remove("_testtmp.contexts.tsv"); +}; From 9c40084f65a30eebeb4c5669dab1f28547c3028f Mon Sep 17 00:00:00 2001 From: Felix Meisen Date: Sat, 28 Dec 2024 02:55:16 +0100 Subject: [PATCH 02/15] Added method to tokenize and normalize at the same time. 
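
The new helper wraps the existing LiteralsTokenizationDelimiter-based
absl::StrSplit call in a coroutine generator, so tokens are still produced
lazily. A minimal usage sketch (assuming a LocaleManager `localeManager`, e.g.
the one obtained from the text vocabulary in `wordsInTextRecords`):

  // Yields "some", then "text": the input is split on every
  // non-alphanumeric character and each token is lowercased.
  for (auto word : tokenizeAndNormalizeTextLine("Some TEXT!", localeManager)) {
    // ... use the normalized word ...
  }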
--- src/index/IndexImpl.Text.cpp | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/src/index/IndexImpl.Text.cpp b/src/index/IndexImpl.Text.cpp index 022c49b8cd..1bc6101c7c 100644 --- a/src/index/IndexImpl.Text.cpp +++ b/src/index/IndexImpl.Text.cpp @@ -35,6 +35,16 @@ struct LiteralsTokenizationDelimiter { } }; +cppcoro::generator tokenizeAndNormalizeTextLine( + std::string_view lineView, LocaleManager localeManager) { + // Currently it is not possible to use std::views or std::ranges with the + // splitter object returned by absl::StrSplit. Every solution I have seen + // will remove the lazy nature of StrSplit and views/ranges. (2024-12-28) + for (auto word : absl::StrSplit(lineView, LiteralsTokenizationDelimiter{}, + absl::SkipEmpty{})) { + co_yield localeManager.getLowercaseUtf8(word); + } +} } // namespace // _____________________________________________________________________________ @@ -69,10 +79,8 @@ cppcoro::generator IndexImpl::wordsInTextRecords( std::string_view textView = text; textView = textView.substr(0, textView.rfind('"')); textView.remove_prefix(1); - for (auto word : absl::StrSplit(textView, LiteralsTokenizationDelimiter{}, - absl::SkipEmpty{})) { - auto wordNormalized = localeManager.getLowercaseUtf8(word); - WordsFileLine wordLine{wordNormalized, false, contextId, 1}; + for (auto word : tokenizeAndNormalizeTextLine(textView, localeManager)) { + WordsFileLine wordLine{word, false, contextId, 1}; co_yield wordLine; } contextId = contextId.incremented(); From c365935f1a54e4106dcebd8bbfaa5330b85b0030 Mon Sep 17 00:00:00 2001 From: Felix Meisen Date: Sat, 28 Dec 2024 05:02:10 +0100 Subject: [PATCH 03/15] Added the tokenization to the ql_utility namespace --- src/index/IndexImpl.Text.cpp | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/src/index/IndexImpl.Text.cpp b/src/index/IndexImpl.Text.cpp index 1bc6101c7c..42d7bd7496 100644 --- a/src/index/IndexImpl.Text.cpp +++ b/src/index/IndexImpl.Text.cpp @@ -21,7 +21,7 @@ #include "util/Conversions.h" #include "util/Simple8bCode.h" -namespace { +namespace ql_utility { // Custom delimiter class for tokenization of literals using `absl::StrSplit`. // The `Find` function returns the next delimiter in `text` after the given @@ -45,7 +45,7 @@ cppcoro::generator tokenizeAndNormalizeTextLine( co_yield localeManager.getLowercaseUtf8(word); } } -} // namespace +} // namespace ql_utility // _____________________________________________________________________________ cppcoro::generator IndexImpl::wordsInTextRecords( @@ -79,7 +79,8 @@ cppcoro::generator IndexImpl::wordsInTextRecords( std::string_view textView = text; textView = textView.substr(0, textView.rfind('"')); textView.remove_prefix(1); - for (auto word : tokenizeAndNormalizeTextLine(textView, localeManager)) { + for (auto word : + ql_utility::tokenizeAndNormalizeTextLine(textView, localeManager)) { WordsFileLine wordLine{word, false, contextId, 1}; co_yield wordLine; } From 479b763534f7aec0bafd20348188fa0e0c6f6df7 Mon Sep 17 00:00:00 2001 From: Felix Meisen Date: Sat, 28 Dec 2024 05:06:47 +0100 Subject: [PATCH 04/15] Revert "Added the tokenization to the ql_utility namespace" This reverts commit c365935f1a54e4106dcebd8bbfaa5330b85b0030. 
--- src/index/IndexImpl.Text.cpp | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/src/index/IndexImpl.Text.cpp b/src/index/IndexImpl.Text.cpp index 42d7bd7496..1bc6101c7c 100644 --- a/src/index/IndexImpl.Text.cpp +++ b/src/index/IndexImpl.Text.cpp @@ -21,7 +21,7 @@ #include "util/Conversions.h" #include "util/Simple8bCode.h" -namespace ql_utility { +namespace { // Custom delimiter class for tokenization of literals using `absl::StrSplit`. // The `Find` function returns the next delimiter in `text` after the given @@ -45,7 +45,7 @@ cppcoro::generator tokenizeAndNormalizeTextLine( co_yield localeManager.getLowercaseUtf8(word); } } -} // namespace ql_utility +} // namespace // _____________________________________________________________________________ cppcoro::generator IndexImpl::wordsInTextRecords( @@ -79,8 +79,7 @@ cppcoro::generator IndexImpl::wordsInTextRecords( std::string_view textView = text; textView = textView.substr(0, textView.rfind('"')); textView.remove_prefix(1); - for (auto word : - ql_utility::tokenizeAndNormalizeTextLine(textView, localeManager)) { + for (auto word : tokenizeAndNormalizeTextLine(textView, localeManager)) { WordsFileLine wordLine{word, false, contextId, 1}; co_yield wordLine; } From d0ec708744a9f634bd53c54a8d8e563d4dd8c215 Mon Sep 17 00:00:00 2001 From: Felix Meisen Date: Thu, 2 Jan 2025 11:22:24 +0100 Subject: [PATCH 05/15] Used the custom InputRangeMixin to lazily tokenize and normalize words of texts --- src/index/IndexImpl.Text.cpp | 29 ++------------- src/parser/WordsAndDocsFileParser.cpp | 19 ++++++++++ src/parser/WordsAndDocsFileParser.h | 52 +++++++++++++++++++++++++++ 3 files changed, 73 insertions(+), 27 deletions(-) diff --git a/src/index/IndexImpl.Text.cpp b/src/index/IndexImpl.Text.cpp index 1bc6101c7c..9f9e11ec68 100644 --- a/src/index/IndexImpl.Text.cpp +++ b/src/index/IndexImpl.Text.cpp @@ -21,32 +21,6 @@ #include "util/Conversions.h" #include "util/Simple8bCode.h" -namespace { - -// Custom delimiter class for tokenization of literals using `absl::StrSplit`. -// The `Find` function returns the next delimiter in `text` after the given -// `pos` or an empty substring if there is no next delimiter. -struct LiteralsTokenizationDelimiter { - absl::string_view Find(absl::string_view text, size_t pos) { - auto isWordChar = [](char c) -> bool { return std::isalnum(c); }; - auto found = std::find_if_not(text.begin() + pos, text.end(), isWordChar); - if (found == text.end()) return text.substr(text.size()); - return {found, found + 1}; - } -}; - -cppcoro::generator tokenizeAndNormalizeTextLine( - std::string_view lineView, LocaleManager localeManager) { - // Currently it is not possible to use std::views or std::ranges with the - // splitter object returned by absl::StrSplit. Every solution I have seen - // will remove the lazy nature of StrSplit and views/ranges. 
(2024-12-28) - for (auto word : absl::StrSplit(lineView, LiteralsTokenizationDelimiter{}, - absl::SkipEmpty{})) { - co_yield localeManager.getLowercaseUtf8(word); - } -} -} // namespace - // _____________________________________________________________________________ cppcoro::generator IndexImpl::wordsInTextRecords( const std::string& contextFile, bool addWordsFromLiterals) { @@ -79,7 +53,8 @@ cppcoro::generator IndexImpl::wordsInTextRecords( std::string_view textView = text; textView = textView.substr(0, textView.rfind('"')); textView.remove_prefix(1); - for (auto word : tokenizeAndNormalizeTextLine(textView, localeManager)) { + TokenizeAndNormalizeText normalizedWords(textView, localeManager); + for (auto word : normalizedWords) { WordsFileLine wordLine{word, false, contextId, 1}; co_yield wordLine; } diff --git a/src/parser/WordsAndDocsFileParser.cpp b/src/parser/WordsAndDocsFileParser.cpp index e45f47c53a..b66932fbfe 100644 --- a/src/parser/WordsAndDocsFileParser.cpp +++ b/src/parser/WordsAndDocsFileParser.cpp @@ -9,6 +9,25 @@ #include "../util/Exception.h" #include "../util/StringUtils.h" +// _____________________________________________________________________________ +void TokenizeAndNormalizeText::start() { + if (current_ != end_) { + currentValue_ = normalizeToken(*current_); + } else { + currentValue_ = std::nullopt; + } +} + +// _____________________________________________________________________________ +void TokenizeAndNormalizeText::next() { + ++current_; + if (current_ != end_) { + currentValue_ = normalizeToken(*current_); + } else { + currentValue_ = std::nullopt; + } +} + // _____________________________________________________________________________ WordsAndDocsFileParser::WordsAndDocsFileParser(const string& wordsOrDocsFile, LocaleManager localeManager) diff --git a/src/parser/WordsAndDocsFileParser.h b/src/parser/WordsAndDocsFileParser.h index c72bf24d61..5216a29e13 100644 --- a/src/parser/WordsAndDocsFileParser.h +++ b/src/parser/WordsAndDocsFileParser.h @@ -4,6 +4,7 @@ #pragma once +#include #include #include @@ -28,6 +29,57 @@ struct DocsFileLine { DocumentIndex docId_; }; +// Custom delimiter class for tokenization of literals using `absl::StrSplit`. +// The `Find` function returns the next delimiter in `text` after the given +// `pos` or an empty substring if there is no next delimiter. 
+struct LiteralsTokenizationDelimiter { + absl::string_view Find(absl::string_view text, size_t pos) { + auto isWordChar = [](char c) -> bool { return std::isalnum(c); }; + auto found = std::find_if_not(text.begin() + pos, text.end(), isWordChar); + if (found == text.end()) return text.substr(text.size()); + return {found, found + 1}; + } +}; + +class TokenizeAndNormalizeText + : public ad_utility::InputRangeMixin { + public: + using StorageType = std::string; + explicit TokenizeAndNormalizeText(std::string_view text, + LocaleManager localeManager) + : splitter_{absl::StrSplit(text, LiteralsTokenizationDelimiter{}, + absl::SkipEmpty{})}, + current_{splitter_.begin()}, + end_{splitter_.end()}, + localeManager_(localeManager){}; + + // Delete unsafe constructors + TokenizeAndNormalizeText() = delete; + TokenizeAndNormalizeText(const TokenizeAndNormalizeText&) = delete; + TokenizeAndNormalizeText& operator=(const TokenizeAndNormalizeText&) = delete; + + private: + using Splitter = decltype(absl::StrSplit( + std::string_view{}, LiteralsTokenizationDelimiter{}, absl::SkipEmpty{})); + Splitter splitter_; + Splitter::const_iterator current_; + Splitter::const_iterator end_; + + std::optional currentValue_; + + LocaleManager localeManager_; + + std::string normalizeToken(std::string_view token) { + return localeManager_.getLowercaseUtf8(token); + } + + public: + void start(); + bool isFinished() const { return !currentValue_.has_value(); }; + const StorageType& get() const { return *currentValue_; }; + void next(); +}; + class WordsAndDocsFileParser { public: explicit WordsAndDocsFileParser(const string& wordsOrDocsFile, From f129ecdd442bc4db1ef77d215f95e95392ca8e0e Mon Sep 17 00:00:00 2001 From: Felix Meisen Date: Wed, 8 Jan 2025 15:33:32 +0100 Subject: [PATCH 06/15] Added comments and necessary tests to WordsAndDocsFileParser --- src/parser/WordsAndDocsFileParser.h | 27 ++++- test/WordsAndDocsFileLineCreator.h | 19 ++++ test/WordsAndDocsFileParserTest.cpp | 124 +++++++++++++++++++++-- test/engine/TextIndexScanForWordTest.cpp | 83 +++++++-------- test/engine/TextIndexScanTestHelpers.h | 14 --- 5 files changed, 204 insertions(+), 63 deletions(-) create mode 100644 test/WordsAndDocsFileLineCreator.h diff --git a/src/parser/WordsAndDocsFileParser.h b/src/parser/WordsAndDocsFileParser.h index 5216a29e13..775768569c 100644 --- a/src/parser/WordsAndDocsFileParser.h +++ b/src/parser/WordsAndDocsFileParser.h @@ -16,6 +16,8 @@ using std::string; +// string word_, bool isEntity_, TextRecordIndex contextId_, Score score_, +// bool isLiteralEntity_ struct WordsFileLine { string word_; bool isEntity_; @@ -24,6 +26,7 @@ struct WordsFileLine { bool isLiteralEntity_ = false; }; +// string docContent_, DocumentIndex docId_ struct DocsFileLine { string docContent_; DocumentIndex docId_; @@ -41,6 +44,13 @@ struct LiteralsTokenizationDelimiter { } }; +// This class constructs an object that can be iterated to get the normalized +// words of the text given. The text gets split into tokens using the +// LiteralsTokenizationDelimiter and those tokens get normalized using +// the localeManager. 
You can use the constructed object like
+// obj = TokenizeAndNormalizeText{text, localeManager}
+// for (auto normalizedWord : obj) { code }
+// The type of the value returned when iterating is std::string
 class TokenizeAndNormalizeText
     : public ad_utility::InputRangeMixin<TokenizeAndNormalizeText> {
  public:
@@ -51,7 +61,7 @@ class TokenizeAndNormalizeText
           absl::SkipEmpty{})},
         current_{splitter_.begin()},
         end_{splitter_.end()},
-        localeManager_(localeManager){};
+        localeManager_(std::move(localeManager)){};
 
   // Delete unsafe constructors
   TokenizeAndNormalizeText() = delete;
@@ -80,6 +90,9 @@ class TokenizeAndNormalizeText
   void next();
 };
 
+// This class is the parent class of WordsFileParser and DocsFileParser, and
+// it exists to reduce code duplication, since the only difference between the
+// child classes is the line type returned.
 class WordsAndDocsFileParser {
  public:
   explicit WordsAndDocsFileParser(const string& wordsOrDocsFile,
@@ -94,6 +107,13 @@ class WordsAndDocsFileParser {
   LocaleManager localeManager_;
 };
 
+// This class takes a pathToWordsFile and a localeManager. It can then be used
+// to iterate the wordsFile while already normalizing the words using the
+// localeManager. (If a word is an entity, it is not normalized.)
+// An object of this class can be iterated as follows:
+// obj = WordsFileParser{wordsFile, localeManager}
+// for (auto wordsFileLine : obj) { code }
+// The type of the value returned when iterating is WordsFileLine
 class WordsFileParser : public WordsAndDocsFileParser,
                         public ad_utility::InputRangeFromGet<WordsFileLine> {
  public:
@@ -106,7 +126,10 @@ class WordsFileParser : public WordsAndDocsFileParser,
   TextRecordIndex lastCId_ = TextRecordIndex::make(0);
 #endif
 };
-
+// Works similarly to WordsFileParser, but it parses a docsFile instead and
+// doesn't normalize the text found in the docsFile. To parse the returned
+// docContent_ of a DocsFileLine, please refer to the TokenizeAndNormalizeText
+// class.
 class DocsFileParser : public WordsAndDocsFileParser,
                        public ad_utility::InputRangeFromGet<DocsFileLine> {
  public:
diff --git a/test/WordsAndDocsFileLineCreator.h b/test/WordsAndDocsFileLineCreator.h
new file mode 100644
index 0000000000..6675be2905
--- /dev/null
+++ b/test/WordsAndDocsFileLineCreator.h
@@ -0,0 +1,19 @@
+// Copyright 2024, University of Freiburg,
+// Chair of Algorithms and Data Structures.
+// Author: Felix Meisen (fesemeisen@outlook.de)
+
+#pragma once
+
+inline std::string inlineSeparator = "\t";
+inline std::string lineSeparator = "\n";
+
+inline std::string createWordsFileLine(std::string word, bool isEntity,
+                                       size_t contextId, size_t score) {
+  return word + inlineSeparator + (isEntity ?
"1" : "0") + inlineSeparator + + std::to_string(contextId) + inlineSeparator + std::to_string(score) + + lineSeparator; +}; + +inline std::string createDocsFileLine(size_t docId, std::string docContent) { + return std::to_string(docId) + inlineSeparator + docContent + lineSeparator; +}; diff --git a/test/WordsAndDocsFileParserTest.cpp b/test/WordsAndDocsFileParserTest.cpp index bc3d1950a5..8a82c5eefd 100644 --- a/test/WordsAndDocsFileParserTest.cpp +++ b/test/WordsAndDocsFileParserTest.cpp @@ -7,18 +7,19 @@ #include #include +#include "./WordsAndDocsFileLineCreator.h" #include "parser/WordsAndDocsFileParser.h" -TEST(WordsAndDocsFileParserTest, wordsFileGetLineTest) { +TEST(WordsAndDocsFileParserTest, wordsFileParserTest) { char* locale = setlocale(LC_CTYPE, ""); std::cout << "Set locale LC_CTYPE to: " << locale << std::endl; std::fstream f("_testtmp.contexts.tsv", std::ios_base::out); - f << "Foo\t0\t0\t2\n" - "foo\t0\t0\t2\n" - "Bär\t1\t0\t1\n" - "Äü\t0\t0\t1\n" - "X\t0\t1\t1\n"; + f << createWordsFileLine("Foo", false, 0, 2) + << createWordsFileLine("foo", false, 0, 2) + << createWordsFileLine("Bär", true, 0, 1) + << createWordsFileLine("Äü", false, 0, 1) + << createWordsFileLine("X", false, 1, 1); f.close(); WordsFileParser p("_testtmp.contexts.tsv", LocaleManager("en", "US", false)); @@ -64,3 +65,114 @@ TEST(WordsAndDocsFileParserTest, wordsFileGetLineTest) { ASSERT_EQ(i, 5); remove("_testtmp.contexts.tsv"); }; + +TEST(WordsAndDocsFileParser, docsFileParserTest) { + char* locale = setlocale(LC_CTYPE, ""); + std::cout << "Set locale LC_CTYPE to: " << locale << std::endl; + + std::fstream f("_testtmp.documents.tsv", std::ios_base::out); + f << createDocsFileLine(4, "This TeSt is OnlyCharcters") + << createDocsFileLine(7, "Wh4t h4pp3ns t0 num83rs") + << createDocsFileLine(8, "An( sp@ci*l ch.ar,:act=_er+s") + << createDocsFileLine(190293, "Large docId"); + f.close(); + DocsFileParser p("_testtmp.documents.tsv", LocaleManager("en", "US", false)); + size_t i = 0; + for (auto line : p) { + switch (i) { + case 0: + ASSERT_EQ(4u, line.docId_.get()); + ASSERT_EQ("This TeSt is OnlyCharcters", line.docContent_); + break; + case 1: + ASSERT_EQ(7u, line.docId_.get()); + ASSERT_EQ("Wh4t h4pp3ns t0 num83rs", line.docContent_); + break; + case 2: + ASSERT_EQ(8u, line.docId_.get()); + ASSERT_EQ("An( sp@ci*l ch.ar,:act=_er+s", line.docContent_); + break; + case 3: + ASSERT_EQ(190293u, line.docId_.get()); + ASSERT_EQ("Large docId", line.docContent_); + break; + default: + // shouldn't be reached + ASSERT_TRUE(false); + } + ++i; + } + ASSERT_EQ(i, 4); + remove("_testtmp.documents.tsv"); +} + +TEST(TokenizeAndNormalizeText, tokenizeAndNormalizeTextTest) { + char* locale = setlocale(LC_CTYPE, ""); + std::cout << "Set locale LC_CTYPE to: " << locale << std::endl; + using TestVec = std::vector; + char separator = ' '; + auto testVecToText = [separator](TestVec input) { + std::string output = ""; + for (auto word : input) { + output.append(word + separator); + } + if (output.size() >= 1) { + output.pop_back(); + } + return output; + }; + + TestVec wordsTest1 = {"already", "normalized", "text"}; + TestVec normalizedWordsTest1 = wordsTest1; + + std::string textTest1 = testVecToText(wordsTest1); + + TokenizeAndNormalizeText test1(textTest1, LocaleManager("en", "US", false)); + size_t i1 = 0; + for (auto normalizedWord : test1) { + ASSERT_EQ(normalizedWordsTest1.at(i1), normalizedWord); + ++i1; + } + ASSERT_EQ(i1, 3); + + TestVec wordsTest2 = {"TeXt", "WITH", "UpperCASe"}; + TestVec normalizedWordsTest2 = {"text", 
"with", "uppercase"}; + + std::string textTest2 = testVecToText(wordsTest2); + + TokenizeAndNormalizeText test2(textTest2, LocaleManager("en", "US", false)); + size_t i2 = 0; + for (auto normalizedWord : test2) { + ASSERT_EQ(normalizedWordsTest2.at(i2), normalizedWord); + ++i2; + } + ASSERT_EQ(i2, 3); + + TestVec wordsTest3 = {"41ph4num3r1c", "t3xt"}; + TestVec normalizedWordsTest3 = {"41ph4num3r1c", "t3xt"}; + + std::string textTest3 = testVecToText(wordsTest3); + + TokenizeAndNormalizeText test3(textTest3, LocaleManager("en", "US", false)); + size_t i3 = 0; + for (auto normalizedWord : test3) { + ASSERT_EQ(normalizedWordsTest3.at(i3), normalizedWord); + ++i3; + } + ASSERT_EQ(i3, 2); + + TestVec wordsTest4 = {"test\t", "with\n", "different,", + "separators.", "here", ",.\t"}; + TestVec normalizedWordsTest4 = {"test", "with", "different", "separators", + "here"}; + + std::string textTest4 = testVecToText(wordsTest4); + + TokenizeAndNormalizeText test4(textTest4, LocaleManager("en", "US", false)); + size_t i4 = 0; + for (auto normalizedWord : test4) { + ASSERT_EQ(normalizedWordsTest4.at(i4), normalizedWord); + ++i4; + } + ASSERT_EQ(i4, 5); +} diff --git a/test/engine/TextIndexScanForWordTest.cpp b/test/engine/TextIndexScanForWordTest.cpp index eac3cb0d2f..c62d1ef136 100644 --- a/test/engine/TextIndexScanForWordTest.cpp +++ b/test/engine/TextIndexScanForWordTest.cpp @@ -5,6 +5,7 @@ #include #include +#include "../WordsAndDocsFileLineCreator.h" #include "../printers/VariablePrinters.h" #include "../util/GTestHelpers.h" #include "../util/IdTableHelpers.h" @@ -26,45 +27,45 @@ std::string kg = ". . . ."; std::string wordsFileContent = - h::createWordsFileLine("astronomer", false, 1, 1) + - h::createWordsFileLine("", true, 1, 0) + - h::createWordsFileLine("scientist", false, 1, 1) + - h::createWordsFileLine("field", false, 1, 1) + - h::createWordsFileLine("astronomy", false, 1, 1) + - h::createWordsFileLine("astronomer", false, 2, 0) + - h::createWordsFileLine("", true, 2, 0) + - h::createWordsFileLine(":s:firstsentence", false, 2, 0) + - h::createWordsFileLine("scientist", false, 2, 0) + - h::createWordsFileLine("field", false, 2, 0) + - h::createWordsFileLine("astronomy", false, 2, 0) + - h::createWordsFileLine("astronomy", false, 3, 1) + - h::createWordsFileLine("concentrates", false, 3, 1) + - h::createWordsFileLine("studies", false, 3, 1) + - h::createWordsFileLine("specific", false, 3, 1) + - h::createWordsFileLine("question", false, 3, 1) + - h::createWordsFileLine("outside", false, 3, 1) + - h::createWordsFileLine("scope", false, 3, 1) + - h::createWordsFileLine("earth", false, 3, 1) + - h::createWordsFileLine("astronomy", false, 4, 1) + - h::createWordsFileLine("concentrates", false, 4, 1) + - h::createWordsFileLine("studies", false, 4, 1) + - h::createWordsFileLine("field", false, 4, 1) + - h::createWordsFileLine("outside", false, 4, 1) + - h::createWordsFileLine("scope", false, 4, 1) + - h::createWordsFileLine("earth", false, 4, 1) + - h::createWordsFileLine("tester", false, 5, 1) + - h::createWordsFileLine("rockets", false, 5, 1) + - h::createWordsFileLine("astronomer", false, 5, 1) + - h::createWordsFileLine("", true, 5, 0) + - h::createWordsFileLine("although", false, 5, 1) + - h::createWordsFileLine("astronomer", false, 6, 0) + - h::createWordsFileLine("", true, 6, 0) + - h::createWordsFileLine("although", false, 6, 0) + - h::createWordsFileLine("", true, 6, 0) + - h::createWordsFileLine("space", false, 6, 1) + - h::createWordsFileLine("", true, 7, 0) + - 
h::createWordsFileLine("space", false, 7, 0) + - h::createWordsFileLine("earth", false, 7, 1); + createWordsFileLine("astronomer", false, 1, 1) + + createWordsFileLine("", true, 1, 0) + + createWordsFileLine("scientist", false, 1, 1) + + createWordsFileLine("field", false, 1, 1) + + createWordsFileLine("astronomy", false, 1, 1) + + createWordsFileLine("astronomer", false, 2, 0) + + createWordsFileLine("", true, 2, 0) + + createWordsFileLine(":s:firstsentence", false, 2, 0) + + createWordsFileLine("scientist", false, 2, 0) + + createWordsFileLine("field", false, 2, 0) + + createWordsFileLine("astronomy", false, 2, 0) + + createWordsFileLine("astronomy", false, 3, 1) + + createWordsFileLine("concentrates", false, 3, 1) + + createWordsFileLine("studies", false, 3, 1) + + createWordsFileLine("specific", false, 3, 1) + + createWordsFileLine("question", false, 3, 1) + + createWordsFileLine("outside", false, 3, 1) + + createWordsFileLine("scope", false, 3, 1) + + createWordsFileLine("earth", false, 3, 1) + + createWordsFileLine("astronomy", false, 4, 1) + + createWordsFileLine("concentrates", false, 4, 1) + + createWordsFileLine("studies", false, 4, 1) + + createWordsFileLine("field", false, 4, 1) + + createWordsFileLine("outside", false, 4, 1) + + createWordsFileLine("scope", false, 4, 1) + + createWordsFileLine("earth", false, 4, 1) + + createWordsFileLine("tester", false, 5, 1) + + createWordsFileLine("rockets", false, 5, 1) + + createWordsFileLine("astronomer", false, 5, 1) + + createWordsFileLine("", true, 5, 0) + + createWordsFileLine("although", false, 5, 1) + + createWordsFileLine("astronomer", false, 6, 0) + + createWordsFileLine("", true, 6, 0) + + createWordsFileLine("although", false, 6, 0) + + createWordsFileLine("", true, 6, 0) + + createWordsFileLine("space", false, 6, 1) + + createWordsFileLine("", true, 7, 0) + + createWordsFileLine("space", false, 7, 0) + + createWordsFileLine("earth", false, 7, 1); std::string firstDocText = "An astronomer is a scientist in the field of " @@ -77,8 +78,8 @@ std::string secondDocText = "too although they might not be in space but on " "earth."; -std::string docsFileContent = h::createDocsFileLine(4, firstDocText) + - h::createDocsFileLine(7, secondDocText); +std::string docsFileContent = + createDocsFileLine(4, firstDocText) + createDocsFileLine(7, secondDocText); std::pair contentsOfWordsFileAndDocsFile = { wordsFileContent, docsFileContent}; diff --git a/test/engine/TextIndexScanTestHelpers.h b/test/engine/TextIndexScanTestHelpers.h index 83a72ddea4..6ba1b8c6de 100644 --- a/test/engine/TextIndexScanTestHelpers.h +++ b/test/engine/TextIndexScanTestHelpers.h @@ -66,18 +66,4 @@ inline string combineToString(const string& text, const string& word) { ss << "Text: " << text << ", Word: " << word << std::endl; return ss.str(); } - -inline std::string inlineSeparator = "\t"; -inline std::string lineSeparator = "\n"; - -inline string createWordsFileLine(std::string word, bool isEntity, - size_t contextId, size_t score) { - return word + inlineSeparator + (isEntity ? 
"1" : "0") + inlineSeparator + - std::to_string(contextId) + inlineSeparator + std::to_string(score) + - lineSeparator; -}; - -inline string createDocsFileLine(size_t docId, std::string docContent) { - return std::to_string(docId) + inlineSeparator + docContent + lineSeparator; -}; } // namespace textIndexScanTestHelpers From 8c8a1a1de6c811d3aa20ee884c9d2983c5684554 Mon Sep 17 00:00:00 2001 From: Felix Meisen Date: Thu, 9 Jan 2025 16:46:18 +0100 Subject: [PATCH 07/15] Added comments to WordsAndDcosFileParser.h. Improved useability of tests in WordsAndDocsFileParserTest.cpp. Renamed methods in WordsAndDocsFileLineCreator.h to reduce ambiguity. Incorporated requested small changes of PR. --- src/index/IndexImpl.Text.cpp | 2 +- src/parser/WordsAndDocsFileParser.cpp | 66 +++--- src/parser/WordsAndDocsFileParser.h | 66 +++++- test/WordsAndDocsFileLineCreator.h | 21 +- test/WordsAndDocsFileParserTest.cpp | 260 +++++++++++------------ test/engine/TextIndexScanForWordTest.cpp | 82 +++---- 6 files changed, 266 insertions(+), 231 deletions(-) diff --git a/src/index/IndexImpl.Text.cpp b/src/index/IndexImpl.Text.cpp index 9f9e11ec68..b8bc919e39 100644 --- a/src/index/IndexImpl.Text.cpp +++ b/src/index/IndexImpl.Text.cpp @@ -31,7 +31,7 @@ cppcoro::generator IndexImpl::wordsInTextRecords( if (!contextFile.empty()) { WordsFileParser p(contextFile, localeManager); ad_utility::HashSet items; - for (auto line : p) { + for (auto& line : p) { contextId = line.contextId_; co_yield line; } diff --git a/src/parser/WordsAndDocsFileParser.cpp b/src/parser/WordsAndDocsFileParser.cpp index b66932fbfe..dbc638cb7a 100644 --- a/src/parser/WordsAndDocsFileParser.cpp +++ b/src/parser/WordsAndDocsFileParser.cpp @@ -1,13 +1,14 @@ // Copyright 2015, University of Freiburg, // Chair of Algorithms and Data Structures. // Author: Björn Buchhold (buchhold@informatik.uni-freiburg.de) +// Felix Meisen (fesemeisen@outlook.de) #include "parser/WordsAndDocsFileParser.h" #include -#include "../util/Exception.h" -#include "../util/StringUtils.h" +#include "util/Exception.h" +#include "util/StringUtils.h" // _____________________________________________________________________________ void TokenizeAndNormalizeText::start() { @@ -33,48 +34,45 @@ WordsAndDocsFileParser::WordsAndDocsFileParser(const string& wordsOrDocsFile, LocaleManager localeManager) : in_(wordsOrDocsFile), localeManager_(std::move(localeManager)) {} -// _____________________________________________________________________________ -WordsAndDocsFileParser::~WordsAndDocsFileParser() { in_.close(); } - // _____________________________________________________________________________ ad_utility::InputRangeFromGet::Storage WordsFileParser::get() { WordsFileLine line; string l; - if (std::getline(in_, l)) { - size_t i = l.find('\t'); - assert(i != string::npos); - size_t j = i + 2; - assert(j + 3 < l.size()); - size_t k = l.find('\t', j + 2); - assert(k != string::npos); - line.isEntity_ = (l[i + 1] == '1'); - line.word_ = - (line.isEntity_ ? l.substr(0, i) - : localeManager_.getLowercaseUtf8(l.substr(0, i))); - line.contextId_ = - TextRecordIndex::make(atol(l.substr(j + 1, k - j - 1).c_str())); - line.score_ = static_cast(atol(l.substr(k + 1).c_str())); + if (!std::getline(in_, l)) { + return std::nullopt; + }; + size_t i = l.find('\t'); + assert(i != string::npos); + size_t j = i + 2; + assert(j + 3 < l.size()); + size_t k = l.find('\t', j + 2); + assert(k != string::npos); + line.isEntity_ = (l[i + 1] == '1'); + line.word_ = + (line.isEntity_ ? 
l.substr(0, i) + : localeManager_.getLowercaseUtf8(l.substr(0, i))); + line.contextId_ = + TextRecordIndex::make(atol(l.substr(j + 1, k - j - 1).c_str())); + line.score_ = static_cast(atol(l.substr(k + 1).c_str())); #ifndef NDEBUG - if (lastCId_ > line.contextId_) { - AD_THROW("ContextFile has to be sorted by context Id."); - } - lastCId_ = line.contextId_; -#endif - return line; + if (lastCId_ > line.contextId_) { + AD_THROW("ContextFile has to be sorted by context Id."); } - return std::nullopt; + lastCId_ = line.contextId_; +#endif + return line; } // _____________________________________________________________________________ ad_utility::InputRangeFromGet::Storage DocsFileParser::get() { DocsFileLine line; string l; - if (std::getline(in_, l)) { - size_t i = l.find('\t'); - assert(i != string::npos); - line.docId_ = DocumentIndex::make(atol(l.substr(0, i).c_str())); - line.docContent_ = l.substr(i + 1); - return line; - } - return std::nullopt; + if (!std::getline(in_, l)) { + return std::nullopt; + }; + size_t i = l.find('\t'); + assert(i != string::npos); + line.docId_ = DocumentIndex::make(atol(l.substr(0, i).c_str())); + line.docContent_ = l.substr(i + 1); + return line; } diff --git a/src/parser/WordsAndDocsFileParser.h b/src/parser/WordsAndDocsFileParser.h index 775768569c..c8f632d458 100644 --- a/src/parser/WordsAndDocsFileParser.h +++ b/src/parser/WordsAndDocsFileParser.h @@ -1,6 +1,7 @@ // Copyright 2015, University of Freiburg, // Chair of Algorithms and Data Structures. // Author: Björn Buchhold (buchhold@informatik.uni-freiburg.de) +// Felix Meisen (fesemeisen@outlook.de) #pragma once @@ -15,9 +16,44 @@ #include "util/Iterators.h" using std::string; - -// string word_, bool isEntity_, TextRecordIndex contextId_, Score score_, -// bool isLiteralEntity_ +// Represents a line from the wordsfile.tsv, which stores everything given in +// the file line and extra information with isLiteralEntity. Also used to add +// literals to the text index through emulating wordsfile lines. +// +// The Fields are ordered in the same way the values follow in a line. +// Short field overview: string word_, bool isEntity, TextRecordIndex contextId, +// Score score_, bool isLiteralEntity (not found in +// wordsfile) +// +// Fields: +// - string word_: The string of the word, if it is an entity it will be +// . bool isEntity_: True if the given word is an +// entity, false if it's a word. +// - TextRecordIndex contextId_: When creating the wordsfile docs from the +// docsfile get split into so called contexts. +// Those contexts overlap, meaning words and +// entities are covered multiple times. Each +// contextId corresponds to the next bigger or +// equal docId. +// - Score score_: Either 1 or 0 if isEntity is false. 0, 1, 100, 150 if +// isEntity is true. (this info is only constructed on the +// scientists.wordsfile.tsv) The score in the wordsfile is only +// relevant for the counting scoring metric. Because of the +// overlap of contexts the score is 1 if the word really has +// been seen for the first time and 0 if not. If a doc contains +// multiple mentions of a word there should be exactly as many +// wordsfile lines of that word with score 1 as there are +// mentions. The score for entities seems rather random and +// since no clear explanation of the creation of wordsfiles +// has been found yet they will stay rather random. +// - bool isLiteralEntity_: This does not directly stem from the wordsfile. 
+// When building the text index with literals, for +// every literal there will be WordsFileLines for all +// words in that literal. Additionally the whole +// literal itself will be added as word with isEntity +// being true. The need to count this comes only from +// a trick used in testing right now. To be specific +// the method getTextRecordFromResultTable struct WordsFileLine { string word_; bool isEntity_; @@ -26,10 +62,28 @@ struct WordsFileLine { bool isLiteralEntity_ = false; }; -// string docContent_, DocumentIndex docId_ +// Represents a line from the docsfile.tsv, which stores everything given in +// the file line. +// +// The Fields are ordered in the same way the values follow in a line. +// Short field overview: DocumentIndex docId_, string docContent_ +// +// Fields: +// +// - DocumentIndex docId_: The docId is needed to built inverted indices for +// Scoring and building of the docsDB. It is also used +// to return actual texts when searching for a word. +// The word (and entity) search returns a table with +// TextRecordIndex as type of one col. Those get mapped +// to the next bigger or equal docId which is then +// used to extract the text from the docsDB. +// TODO: check if this behaviour is consintently +// implemented +// - string docContent_: The whole text given after the first tab of a line of +// docsfile. struct DocsFileLine { - string docContent_; DocumentIndex docId_; + string docContent_; }; // Custom delimiter class for tokenization of literals using `absl::StrSplit`. @@ -97,7 +151,7 @@ class WordsAndDocsFileParser { public: explicit WordsAndDocsFileParser(const string& wordsOrDocsFile, LocaleManager localeManager); - ~WordsAndDocsFileParser(); + ~WordsAndDocsFileParser() = default; explicit WordsAndDocsFileParser(const WordsAndDocsFileParser& other) = delete; WordsAndDocsFileParser& operator=(const WordsAndDocsFileParser& other) = delete; diff --git a/test/WordsAndDocsFileLineCreator.h b/test/WordsAndDocsFileLineCreator.h index 6675be2905..cb151216fd 100644 --- a/test/WordsAndDocsFileLineCreator.h +++ b/test/WordsAndDocsFileLineCreator.h @@ -4,16 +4,19 @@ #pragma once -inline std::string inlineSeparator = "\t"; -inline std::string lineSeparator = "\n"; +#include -inline std::string createWordsFileLine(std::string word, bool isEntity, - size_t contextId, size_t score) { - return word + inlineSeparator + (isEntity ? 
"1" : "0") + inlineSeparator + - std::to_string(contextId) + inlineSeparator + std::to_string(score) + - lineSeparator; +constexpr std::string_view inlineSeparator = "\t"; +constexpr std::string_view lineSeparator = "\n"; + +inline std::string createWordsFileLineAsString(std::string_view word, + bool isEntity, size_t contextId, + size_t score) { + return absl::StrCat(word, inlineSeparator, isEntity, inlineSeparator, + contextId, inlineSeparator, score, lineSeparator); }; -inline std::string createDocsFileLine(size_t docId, std::string docContent) { - return std::to_string(docId) + inlineSeparator + docContent + lineSeparator; +inline std::string createDocsFileLineAsString(size_t docId, + std::string_view docContent) { + return absl::StrCat(docId, inlineSeparator, docContent, lineSeparator); }; diff --git a/test/WordsAndDocsFileParserTest.cpp b/test/WordsAndDocsFileParserTest.cpp index 8a82c5eefd..0672064f72 100644 --- a/test/WordsAndDocsFileParserTest.cpp +++ b/test/WordsAndDocsFileParserTest.cpp @@ -10,59 +10,109 @@ #include "./WordsAndDocsFileLineCreator.h" #include "parser/WordsAndDocsFileParser.h" +// All lambdas and type aliases used in this file contained here +namespace { + +/// Type aliases + +// Word, isEntity, contextId, score +using WordLine = std::tuple; +using WordLineVec = std::vector; + +// docId, docContent +using DocLine = std::tuple; +using DocLineVec = std::vector; + +using StringVec = std::vector; + +/// Lambdas + +auto getLocaleManager = []() { return LocaleManager("en", "US", false); }; + +auto wordsFileLineToWordLine = [](WordsFileLine wordsFileLine) -> WordLine { + return std::make_tuple(wordsFileLine.word_, wordsFileLine.isEntity_, + static_cast(wordsFileLine.contextId_.get()), + static_cast(wordsFileLine.score_)); +}; + +// Lambda that takes in a path to wordsFile to initialize the Parser and an +// expectedResult that is compared against the parsers outputs. 
+auto testWordsFileParser = [](std::string wordsFilePath, + WordLineVec expectedResult) { + WordsFileParser p(wordsFilePath, getLocaleManager()); + size_t i = 0; + for (auto wordsFileLine : p) { + ASSERT_TRUE(i < expectedResult.size()); + WordLine testLine = wordsFileLineToWordLine(wordsFileLine); + + // Not testing the whole tuples against each other to have a cleaner + // indication what exactly caused the assertion to fail + ASSERT_EQ(std::get<0>(testLine), std::get<0>(expectedResult.at(i))); + ASSERT_EQ(std::get<1>(testLine), std::get<1>(expectedResult.at(i))); + ASSERT_EQ(std::get<2>(testLine), std::get<2>(expectedResult.at(i))); + ASSERT_EQ(std::get<3>(testLine), std::get<3>(expectedResult.at(i))); + + ++i; + } + ASSERT_EQ(i, expectedResult.size()); +}; + +auto docsFileLineToDocLine = [](DocsFileLine docsFileLine) -> DocLine { + return std::make_tuple(static_cast(docsFileLine.docId_.get()), + docsFileLine.docContent_); +}; + +// Same as testWordsFileParser but for docsFile +auto testDocsFileParser = [](std::string docsFilePath, + DocLineVec expectedResult) { + DocsFileParser p(docsFilePath, getLocaleManager()); + size_t i = 0; + for (auto docsFileLine : p) { + ASSERT_TRUE(i < expectedResult.size()); + DocLine testLine = docsFileLineToDocLine(docsFileLine); + + // Not testing the whole tuples against each other to have a cleaner + // indication what exactly caused the assertion to fail + ASSERT_EQ(std::get<0>(testLine), std::get<0>(expectedResult.at(i))); + ASSERT_EQ(std::get<1>(testLine), std::get<1>(expectedResult.at(i))); + + ++i; + } +}; + +auto testTokenizeAndNormalizeText = [](std::string testText, + StringVec normalizedTextAsVec) { + TokenizeAndNormalizeText testTokenizer(testText, getLocaleManager()); + size_t i = 0; + for (auto normalizedWord : testTokenizer) { + ASSERT_TRUE(i < normalizedTextAsVec.size()); + ASSERT_EQ(normalizedWord, normalizedTextAsVec.at(i)); + + ++i; + } + ASSERT_EQ(i, normalizedTextAsVec.size()); +}; + +} // namespace + TEST(WordsAndDocsFileParserTest, wordsFileParserTest) { char* locale = setlocale(LC_CTYPE, ""); std::cout << "Set locale LC_CTYPE to: " << locale << std::endl; std::fstream f("_testtmp.contexts.tsv", std::ios_base::out); - f << createWordsFileLine("Foo", false, 0, 2) - << createWordsFileLine("foo", false, 0, 2) - << createWordsFileLine("Bär", true, 0, 1) - << createWordsFileLine("Äü", false, 0, 1) - << createWordsFileLine("X", false, 1, 1); - + f << createWordsFileLineAsString("Foo", false, 0, 2) + << createWordsFileLineAsString("foo", false, 0, 2) + << createWordsFileLineAsString("Bär", true, 0, 1) + << createWordsFileLineAsString("Äü", false, 0, 1) + << createWordsFileLineAsString("X", false, 1, 1); f.close(); - WordsFileParser p("_testtmp.contexts.tsv", LocaleManager("en", "US", false)); - size_t i = 0; - for (auto line : p) { - switch (i) { - case 0: - ASSERT_EQ("foo", line.word_); - ASSERT_FALSE(line.isEntity_); - ASSERT_EQ(0u, line.contextId_.get()); - ASSERT_EQ(2u, line.score_); - break; - case 1: - ASSERT_EQ("foo", line.word_); - ASSERT_FALSE(line.isEntity_); - ASSERT_EQ(0u, line.contextId_.get()); - ASSERT_EQ(2u, line.score_); - break; - case 2: - ASSERT_EQ("Bär", line.word_); - ASSERT_TRUE(line.isEntity_); - ASSERT_EQ(0u, line.contextId_.get()); - ASSERT_EQ(1u, line.score_); - break; - case 3: - ASSERT_EQ("äü", line.word_); - ASSERT_FALSE(line.isEntity_); - ASSERT_EQ(0u, line.contextId_.get()); - ASSERT_EQ(1u, line.score_); - break; - case 4: - ASSERT_EQ("x", line.word_); - ASSERT_FALSE(line.isEntity_); - ASSERT_EQ(1u, 
-        ASSERT_EQ(1u, line.contextId_.get());
-        ASSERT_EQ(1u, line.score_);
-        break;
-      default:
-        // shouldn't be reached
-        ASSERT_TRUE(false);
-    }
-    ++i;
-  }
-  ASSERT_EQ(i, 5);
+
+  WordLineVec expected = {
+      std::make_tuple("foo", false, 0, 2), std::make_tuple("foo", false, 0, 2),
+      std::make_tuple("Bär", true, 0, 1), std::make_tuple("äü", false, 0, 1),
+      std::make_tuple("x", false, 1, 1)};
+
+  testWordsFileParser("_testtmp.contexts.tsv", expected);
   remove("_testtmp.contexts.tsv");
 };
 
@@ -71,108 +121,38 @@ TEST(WordsAndDocsFileParser, docsFileParserTest) {
   std::cout << "Set locale LC_CTYPE to: " << locale << std::endl;
 
   std::fstream f("_testtmp.documents.tsv", std::ios_base::out);
-  f << createDocsFileLine(4, "This TeSt is OnlyCharcters")
-    << createDocsFileLine(7, "Wh4t h4pp3ns t0 num83rs")
-    << createDocsFileLine(8, "An( sp@ci*l ch.ar,:act=_er+s")
-    << createDocsFileLine(190293, "Large docId");
+  f << createDocsFileLineAsString(4, "This TeSt is OnlyCharcters")
+    << createDocsFileLineAsString(7, "Wh4t h4pp3ns t0 num83rs")
+    << createDocsFileLineAsString(8, "An( sp@ci*l ch.ar,:act=_er+s")
+    << createDocsFileLineAsString(190293, "Large docId");
   f.close();
 
-  DocsFileParser p("_testtmp.documents.tsv", LocaleManager("en", "US", false));
-  size_t i = 0;
-  for (auto line : p) {
-    switch (i) {
-      case 0:
-        ASSERT_EQ(4u, line.docId_.get());
-        ASSERT_EQ("This TeSt is OnlyCharcters", line.docContent_);
-        break;
-      case 1:
-        ASSERT_EQ(7u, line.docId_.get());
-        ASSERT_EQ("Wh4t h4pp3ns t0 num83rs", line.docContent_);
-        break;
-      case 2:
-        ASSERT_EQ(8u, line.docId_.get());
-        ASSERT_EQ("An( sp@ci*l ch.ar,:act=_er+s", line.docContent_);
-        break;
-      case 3:
-        ASSERT_EQ(190293u, line.docId_.get());
-        ASSERT_EQ("Large docId", line.docContent_);
-        break;
-      default:
-        // shouldn't be reached
-        ASSERT_TRUE(false);
-    }
-    ++i;
-  }
-  ASSERT_EQ(i, 4);
+
+  DocLineVec expected = {std::make_pair(4, "This TeSt is OnlyCharcters"),
+                         std::make_pair(7, "Wh4t h4pp3ns t0 num83rs"),
+                         std::make_pair(8, "An( sp@ci*l ch.ar,:act=_er+s"),
+                         std::make_pair(190293, "Large docId")};
+
+  testDocsFileParser("_testtmp.documents.tsv", expected);
   remove("_testtmp.documents.tsv");
 }
 
 TEST(TokenizeAndNormalizeText, tokenizeAndNormalizeTextTest) {
   char* locale = setlocale(LC_CTYPE, "");
   std::cout << "Set locale LC_CTYPE to: " << locale << std::endl;
-  using TestVec = std::vector<std::string>;
 
-  char separator = ' ';
-  auto testVecToText = [separator](TestVec input) {
-    std::string output = "";
-    for (auto word : input) {
-      output.append(word + separator);
-    }
-    if (output.size() >= 1) {
-      output.pop_back();
-    }
-    return output;
-  };
-
-  TestVec wordsTest1 = {"already", "normalized", "text"};
-  TestVec normalizedWordsTest1 = wordsTest1;
-
-  std::string textTest1 = testVecToText(wordsTest1);
-
-  TokenizeAndNormalizeText test1(textTest1, LocaleManager("en", "US", false));
-  size_t i1 = 0;
-  for (auto normalizedWord : test1) {
-    ASSERT_EQ(normalizedWordsTest1.at(i1), normalizedWord);
-    ++i1;
-  }
-  ASSERT_EQ(i1, 3);
-
-  TestVec wordsTest2 = {"TeXt", "WITH", "UpperCASe"};
-  TestVec normalizedWordsTest2 = {"text", "with", "uppercase"};
+  // Test 1
+  testTokenizeAndNormalizeText("already normalized text",
+                               {"already", "normalized", "text"});
 
-  std::string textTest2 = testVecToText(wordsTest2);
+  // Test 2
+  testTokenizeAndNormalizeText("TeXt WITH UpperCASe",
{"text", "with", "uppercase"}); - TestVec wordsTest3 = {"41ph4num3r1c", "t3xt"}; - TestVec normalizedWordsTest3 = {"41ph4num3r1c", "t3xt"}; + // Test 3 + testTokenizeAndNormalizeText("41ph4num3r1c t3xt", {"41ph4num3r1c", "t3xt"}); - std::string textTest3 = testVecToText(wordsTest3); - - TokenizeAndNormalizeText test3(textTest3, LocaleManager("en", "US", false)); - size_t i3 = 0; - for (auto normalizedWord : test3) { - ASSERT_EQ(normalizedWordsTest3.at(i3), normalizedWord); - ++i3; - } - ASSERT_EQ(i3, 2); - - TestVec wordsTest4 = {"test\t", "with\n", "different,", - "separators.", "here", ",.\t"}; - TestVec normalizedWordsTest4 = {"test", "with", "different", "separators", - "here"}; - - std::string textTest4 = testVecToText(wordsTest4); - - TokenizeAndNormalizeText test4(textTest4, LocaleManager("en", "US", false)); - size_t i4 = 0; - for (auto normalizedWord : test4) { - ASSERT_EQ(normalizedWordsTest4.at(i4), normalizedWord); - ++i4; - } - ASSERT_EQ(i4, 5); + // Test 4 + testTokenizeAndNormalizeText( + "test\twith\ndifferent,separators.here ,.\t", + {"test", "with", "different", "separators", "here"}); } diff --git a/test/engine/TextIndexScanForWordTest.cpp b/test/engine/TextIndexScanForWordTest.cpp index c62d1ef136..cc9b685ec8 100644 --- a/test/engine/TextIndexScanForWordTest.cpp +++ b/test/engine/TextIndexScanForWordTest.cpp @@ -27,45 +27,45 @@ std::string kg = ". . . ."; std::string wordsFileContent = - createWordsFileLine("astronomer", false, 1, 1) + - createWordsFileLine("", true, 1, 0) + - createWordsFileLine("scientist", false, 1, 1) + - createWordsFileLine("field", false, 1, 1) + - createWordsFileLine("astronomy", false, 1, 1) + - createWordsFileLine("astronomer", false, 2, 0) + - createWordsFileLine("", true, 2, 0) + - createWordsFileLine(":s:firstsentence", false, 2, 0) + - createWordsFileLine("scientist", false, 2, 0) + - createWordsFileLine("field", false, 2, 0) + - createWordsFileLine("astronomy", false, 2, 0) + - createWordsFileLine("astronomy", false, 3, 1) + - createWordsFileLine("concentrates", false, 3, 1) + - createWordsFileLine("studies", false, 3, 1) + - createWordsFileLine("specific", false, 3, 1) + - createWordsFileLine("question", false, 3, 1) + - createWordsFileLine("outside", false, 3, 1) + - createWordsFileLine("scope", false, 3, 1) + - createWordsFileLine("earth", false, 3, 1) + - createWordsFileLine("astronomy", false, 4, 1) + - createWordsFileLine("concentrates", false, 4, 1) + - createWordsFileLine("studies", false, 4, 1) + - createWordsFileLine("field", false, 4, 1) + - createWordsFileLine("outside", false, 4, 1) + - createWordsFileLine("scope", false, 4, 1) + - createWordsFileLine("earth", false, 4, 1) + - createWordsFileLine("tester", false, 5, 1) + - createWordsFileLine("rockets", false, 5, 1) + - createWordsFileLine("astronomer", false, 5, 1) + - createWordsFileLine("", true, 5, 0) + - createWordsFileLine("although", false, 5, 1) + - createWordsFileLine("astronomer", false, 6, 0) + - createWordsFileLine("", true, 6, 0) + - createWordsFileLine("although", false, 6, 0) + - createWordsFileLine("", true, 6, 0) + - createWordsFileLine("space", false, 6, 1) + - createWordsFileLine("", true, 7, 0) + - createWordsFileLine("space", false, 7, 0) + - createWordsFileLine("earth", false, 7, 1); + createWordsFileLineAsString("astronomer", false, 1, 1) + + createWordsFileLineAsString("", true, 1, 0) + + createWordsFileLineAsString("scientist", false, 1, 1) + + createWordsFileLineAsString("field", false, 1, 1) + + createWordsFileLineAsString("astronomy", false, 1, 1) + 
+ createWordsFileLineAsString("astronomer", false, 2, 0) + + createWordsFileLineAsString("", true, 2, 0) + + createWordsFileLineAsString(":s:firstsentence", false, 2, 0) + + createWordsFileLineAsString("scientist", false, 2, 0) + + createWordsFileLineAsString("field", false, 2, 0) + + createWordsFileLineAsString("astronomy", false, 2, 0) + + createWordsFileLineAsString("astronomy", false, 3, 1) + + createWordsFileLineAsString("concentrates", false, 3, 1) + + createWordsFileLineAsString("studies", false, 3, 1) + + createWordsFileLineAsString("specific", false, 3, 1) + + createWordsFileLineAsString("question", false, 3, 1) + + createWordsFileLineAsString("outside", false, 3, 1) + + createWordsFileLineAsString("scope", false, 3, 1) + + createWordsFileLineAsString("earth", false, 3, 1) + + createWordsFileLineAsString("astronomy", false, 4, 1) + + createWordsFileLineAsString("concentrates", false, 4, 1) + + createWordsFileLineAsString("studies", false, 4, 1) + + createWordsFileLineAsString("field", false, 4, 1) + + createWordsFileLineAsString("outside", false, 4, 1) + + createWordsFileLineAsString("scope", false, 4, 1) + + createWordsFileLineAsString("earth", false, 4, 1) + + createWordsFileLineAsString("tester", false, 5, 1) + + createWordsFileLineAsString("rockets", false, 5, 1) + + createWordsFileLineAsString("astronomer", false, 5, 1) + + createWordsFileLineAsString("", true, 5, 0) + + createWordsFileLineAsString("although", false, 5, 1) + + createWordsFileLineAsString("astronomer", false, 6, 0) + + createWordsFileLineAsString("", true, 6, 0) + + createWordsFileLineAsString("although", false, 6, 0) + + createWordsFileLineAsString("", true, 6, 0) + + createWordsFileLineAsString("space", false, 6, 1) + + createWordsFileLineAsString("", true, 7, 0) + + createWordsFileLineAsString("space", false, 7, 0) + + createWordsFileLineAsString("earth", false, 7, 1); std::string firstDocText = "An astronomer is a scientist in the field of " @@ -78,8 +78,8 @@ std::string secondDocText = "too although they might not be in space but on " "earth."; -std::string docsFileContent = - createDocsFileLine(4, firstDocText) + createDocsFileLine(7, secondDocText); +std::string docsFileContent = createDocsFileLineAsString(4, firstDocText) + + createDocsFileLineAsString(7, secondDocText); std::pair contentsOfWordsFileAndDocsFile = { wordsFileContent, docsFileContent}; From 0369de68b9ae4f95eaa0d3469dec952a7da1c940 Mon Sep 17 00:00:00 2001 From: Johannes Kalmbach Date: Fri, 10 Jan 2025 10:30:06 +0100 Subject: [PATCH 08/15] Rewrite the tokenizer as a view. 
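The `TokenizeAndNormalizeText` input range required hand-written iteration
state (`start()`, `next()`, and a stored `std::optional`). Replace it with a
free function `tokenizeAndNormalizeText` that splits the text once and
returns a lazy transform view which lowercases each token on demand. Callers
simply iterate the returned view; roughly (with `textView` and
`localeManager` as in the adapted call sites):

    for (auto word : tokenizeAndNormalizeText(textView, localeManager)) {
      // `word` is the next token of `textView`, normalized via
      // `localeManager.getLowercaseUtf8`.
    }

The returned view only references the underlying text and the locale manager,
so both have to stay alive while it is iterated.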
Signed-off-by: Johannes Kalmbach
---
 src/index/IndexImpl.Text.cpp          |  2 +-
 src/parser/WordsAndDocsFileParser.cpp | 19 ----------
 src/parser/WordsAndDocsFileParser.h   | 53 ++++++++-------------------
 test/WordsAndDocsFileParserTest.cpp   |  2 +-
 4 files changed, 17 insertions(+), 59 deletions(-)

diff --git a/src/index/IndexImpl.Text.cpp b/src/index/IndexImpl.Text.cpp
index b8bc919e39..502063a920 100644
--- a/src/index/IndexImpl.Text.cpp
+++ b/src/index/IndexImpl.Text.cpp
@@ -53,7 +53,7 @@ cppcoro::generator<WordsFileLine> IndexImpl::wordsInTextRecords(
       std::string_view textView = text;
       textView = textView.substr(0, textView.rfind('"'));
       textView.remove_prefix(1);
-      TokenizeAndNormalizeText normalizedWords(textView, localeManager);
+      auto normalizedWords = tokenizeAndNormalizeText(textView, localeManager);
       for (auto word : normalizedWords) {
         WordsFileLine wordLine{word, false, contextId, 1};
         co_yield wordLine;
diff --git a/src/parser/WordsAndDocsFileParser.cpp b/src/parser/WordsAndDocsFileParser.cpp
index dbc638cb7a..35bd955700 100644
--- a/src/parser/WordsAndDocsFileParser.cpp
+++ b/src/parser/WordsAndDocsFileParser.cpp
@@ -10,25 +10,6 @@
 #include "util/Exception.h"
 #include "util/StringUtils.h"
 
-// _____________________________________________________________________________
-void TokenizeAndNormalizeText::start() {
-  if (current_ != end_) {
-    currentValue_ = normalizeToken(*current_);
-  } else {
-    currentValue_ = std::nullopt;
-  }
-}
-
-// _____________________________________________________________________________
-void TokenizeAndNormalizeText::next() {
-  ++current_;
-  if (current_ != end_) {
-    currentValue_ = normalizeToken(*current_);
-  } else {
-    currentValue_ = std::nullopt;
-  }
-}
-
 // _____________________________________________________________________________
 WordsAndDocsFileParser::WordsAndDocsFileParser(const string& wordsOrDocsFile,
                                                LocaleManager localeManager)
diff --git a/src/parser/WordsAndDocsFileParser.h b/src/parser/WordsAndDocsFileParser.h
index c8f632d458..9b6c73400a 100644
--- a/src/parser/WordsAndDocsFileParser.h
+++ b/src/parser/WordsAndDocsFileParser.h
@@ -14,6 +14,7 @@
 #include "global/Id.h"
 #include "index/StringSortComparator.h"
 #include "util/Iterators.h"
+#include "util/Views.h"
 
 using std::string;
 // Represents a line from the wordsfile.tsv, which stores everything given in
@@ -105,44 +106,20 @@ struct LiteralsTokenizationDelimiter {
 // obj = TokenizeAndNormalizeText{text, localeManager}
 // for (auto normalizedWord : obj) { code }
 // The type of the value returned when iterating is std::string
-class TokenizeAndNormalizeText
-    : public ad_utility::InputRangeMixin<TokenizeAndNormalizeText> {
- public:
-  using StorageType = std::string;
-  explicit TokenizeAndNormalizeText(std::string_view text,
-                                    LocaleManager localeManager)
-      : splitter_{absl::StrSplit(text, LiteralsTokenizationDelimiter{},
-                                 absl::SkipEmpty{})},
-        current_{splitter_.begin()},
-        end_{splitter_.end()},
-        localeManager_(std::move(localeManager)){};
-
-  // Delete unsafe constructors
-  TokenizeAndNormalizeText() = delete;
-  TokenizeAndNormalizeText(const TokenizeAndNormalizeText&) = delete;
-  TokenizeAndNormalizeText& operator=(const TokenizeAndNormalizeText&) = delete;
-
- private:
-  using Splitter = decltype(absl::StrSplit(
-      std::string_view{}, LiteralsTokenizationDelimiter{}, absl::SkipEmpty{}));
-  Splitter splitter_;
-  Splitter::const_iterator current_;
-  Splitter::const_iterator end_;
-
-  std::optional<std::string> currentValue_;
-
-  LocaleManager localeManager_;
-
-  std::string normalizeToken(std::string_view token) {
-    return localeManager_.getLowercaseUtf8(token);
-  }
-
- public:
-  void start();
-  bool isFinished() const { return !currentValue_.has_value(); };
-  const StorageType& get() const { return *currentValue_; };
-  void next();
-};
+// TODO Adapt the comment (it is now a function, and you call it a
+// little bit differently)
+// TODO Also comment about the lifetime (the `text` and the
+// `localeManager` have to be kept alive while the tokenizer is being used;
+// the tokenizer only uses references.)
+inline auto tokenizeAndNormalizeText(std::string_view text,
+                                     const LocaleManager& localeManager) {
+  std::vector<std::string_view> split{
+      absl::StrSplit(text, LiteralsTokenizationDelimiter{}, absl::SkipEmpty{})};
+  return ql::views::transform(ad_utility::OwningView{std::move(split)},
+                              [&localeManager](const auto& str) {
+                                return localeManager.getLowercaseUtf8(str);
+                              });
+}
 
 // This class is the parent class of WordsFileParser and DocsFileParser and
 // it exists to reduce code duplication since the only difference between the
 // child classes is the line type returned
diff --git a/test/WordsAndDocsFileParserTest.cpp b/test/WordsAndDocsFileParserTest.cpp
index 0672064f72..8a31887d70 100644
--- a/test/WordsAndDocsFileParserTest.cpp
+++ b/test/WordsAndDocsFileParserTest.cpp
@@ -82,7 +82,7 @@ auto testDocsFileParser = [](std::string docsFilePath,
 
 auto testTokenizeAndNormalizeText = [](std::string testText,
                                        StringVec normalizedTextAsVec) {
-  TokenizeAndNormalizeText testTokenizer(testText, getLocaleManager());
+  auto testTokenizer = tokenizeAndNormalizeText(testText, getLocaleManager());
   size_t i = 0;
   for (auto normalizedWord : testTokenizer) {

From c412983e148fab5ad3a44f596c9b8adb31504ed4 Mon Sep 17 00:00:00 2001
From: Felix Meisen
Date: Fri, 10 Jan 2025 14:12:00 +0100
Subject: [PATCH 09/15] Improved comment, addressed small requested changes
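Replace the plain block comments in `WordsAndDocsFileParser.h` with
doxygen-style comments (`@brief`, `@details`, `@warning`, `@param`), resolve
the two TODOs on `tokenizeAndNormalizeText` by documenting its usage and
lifetime requirements, take the test lambdas' parameters by const reference,
and construct the parsers inline where they are iterated. The documented
iteration pattern for both parsers is, sketched with placeholder names:

    // `wordsFile` is a path to a wordsfile.tsv (analogous for DocsFileParser).
    for (auto line : WordsFileParser{wordsFile, localeManager}) {
      // `line` is a WordsFileLine: word_, isEntity_, contextId_, score_.
    }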
---
 src/index/IndexImpl.Text.cpp        |   3 +-
 src/parser/WordsAndDocsFileParser.h | 191 +++++++++++++++-------------
 test/WordsAndDocsFileParserTest.cpp |  45 ++++---
 3 files changed, 131 insertions(+), 108 deletions(-)

diff --git a/src/index/IndexImpl.Text.cpp b/src/index/IndexImpl.Text.cpp
index 502063a920..b793189d6a 100644
--- a/src/index/IndexImpl.Text.cpp
+++ b/src/index/IndexImpl.Text.cpp
@@ -53,8 +53,7 @@ cppcoro::generator<WordsFileLine> IndexImpl::wordsInTextRecords(
       std::string_view textView = text;
       textView = textView.substr(0, textView.rfind('"'));
       textView.remove_prefix(1);
-      auto normalizedWords = tokenizeAndNormalizeText(textView, localeManager);
-      for (auto word : normalizedWords) {
+      for (auto word : tokenizeAndNormalizeText(textView, localeManager)) {
         WordsFileLine wordLine{word, false, contextId, 1};
         co_yield wordLine;
diff --git a/src/parser/WordsAndDocsFileParser.h b/src/parser/WordsAndDocsFileParser.h
index 9b6c73400a..8365783a19 100644
--- a/src/parser/WordsAndDocsFileParser.h
+++ b/src/parser/WordsAndDocsFileParser.h
@@ -17,44 +17,50 @@
 #include "util/Views.h"
 
 using std::string;
-// Represents a line from the wordsfile.tsv, which stores everything given in
-// the file line and extra information with isLiteralEntity. Also used to add
-// literals to the text index through emulating wordsfile lines.
-//
-// The Fields are ordered in the same way the values follow in a line.
-// Short field overview: string word_, bool isEntity, TextRecordIndex contextId,
-//                       Score score_, bool isLiteralEntity (not found in
-//                       wordsfile)
-//
-// Fields:
-// - string word_: The string of the word, if it is an entity it will be
-//                 <entity>.
-// - bool isEntity_: True if the given word is an
-//                   entity, false if it's a word.
-// - TextRecordIndex contextId_: When creating the wordsfile docs from the
-//                               docsfile get split into so called contexts.
-//                               Those contexts overlap, meaning words and
-//                               entities are covered multiple times. Each
-//                               contextId corresponds to the next bigger or
-//                               equal docId.
-// - Score score_: Either 1 or 0 if isEntity is false. 0, 1, 100, 150 if
-//                 isEntity is true. (this info is only constructed on the
-//                 scientists.wordsfile.tsv) The score in the wordsfile is only
-//                 relevant for the counting scoring metric. Because of the
-//                 overlap of contexts the score is 1 if the word really has
-//                 been seen for the first time and 0 if not. If a doc contains
-//                 multiple mentions of a word there should be exactly as many
-//                 wordsfile lines of that word with score 1 as there are
-//                 mentions. The score for entities seems rather random and
-//                 since no clear explanation of the creation of wordsfiles
-//                 has been found yet they will stay rather random.
-// - bool isLiteralEntity_: This does not directly stem from the wordsfile.
-//                          When building the text index with literals, for
-//                          every literal there will be WordsFileLines for all
-//                          words in that literal. Additionally the whole
-//                          literal itself will be added as word with isEntity
-//                          being true. The need to count this comes only from
-//                          a trick used in testing right now. To be specific
-//                          the method getTextRecordFromResultTable
+
+/**
+ * @brief Represents a line in the words file.
+ *
+ * This struct holds information about a word or entity as it appears in the
+ * words file.
+ *
+ * The Fields are ordered in the same way the values follow in a line.
+ * Short field overview: string word_, bool isEntity_, TextRecordIndex
+ *                       contextId_, Score score_, bool isLiteralEntity_
+ *                       (not found in wordsfile)
+ *
+ * @details
+ *
+ * Fields:
+ * - string word_: The string of the word, if it is an entity it will be
+ *                 <entity>.
+ * - bool isEntity_: True if the given word is an entity, false if it's a word.
+ * - TextRecordIndex contextId_: When creating the wordsfile docs from the
+ *                               docsfile get split into so called contexts.
+ *                               Those contexts overlap, meaning words and
+ *                               entities are covered multiple times. Each
+ *                               contextId corresponds to the next bigger or
+ *                               equal docId.
+ * - Score score_: Either 1 or 0 if isEntity_ is false. 0, 1, 100, 150 if
+ *                 isEntity_ is true. (This info is only constructed on the
+ *                 scientists.wordsfile.tsv.) The score in the wordsfile is only
+ *                 relevant for the counting scoring metric. Because of the
+ *                 overlap of contexts the score is 1 if the word really has
+ *                 been seen for the first time and 0 if not. If a doc contains
+ *                 multiple mentions of a word there should be exactly as many
+ *                 wordsfile lines of that word with score 1 as there are
+ *                 mentions. The score for entities seems rather random and
+ *                 since no clear explanation of the creation of wordsfiles
+ *                 has been found yet they will stay rather random.
+ * - bool isLiteralEntity_: This does not directly stem from the wordsfile.
+ *                          When building the text index with literals, for
+ *                          every literal there will be WordsFileLines for all
+ *                          words in that literal. Additionally the whole
+ *                          literal itself will be added as a word with
+ *                          isEntity_ being true. The need to count this comes
+ *                          only from a trick used in testing right now.
+ *                          To be specific, the method
+ *                          getTextRecordFromResultTable.
+ */
 struct WordsFileLine {
   string word_;
   bool isEntity_;
@@ -63,25 +69,29 @@ struct WordsFileLine {
   bool isLiteralEntity_ = false;
 };
 
-// Represents a line from the docsfile.tsv, which stores everything given in
-// the file line.
-//
-// The Fields are ordered in the same way the values follow in a line.
-// Short field overview: DocumentIndex docId_, string docContent_
-//
-// Fields:
-//
-// - DocumentIndex docId_: The docId is needed to built inverted indices for
-//                         Scoring and building of the docsDB. It is also used
-//                         to return actual texts when searching for a word.
-//                         The word (and entity) search returns a table with
-//                         TextRecordIndex as type of one col. Those get mapped
-//                         to the next bigger or equal docId which is then
-//                         used to extract the text from the docsDB.
-//                         TODO: check if this behaviour is consintently
-//                         implemented
-// - string docContent_: The whole text given after the first tab of a line of
-//                       docsfile.
+/**
+ * @brief Represents a line from the docsfile.tsv.
+ *
+ * This struct stores everything given in a line of the docsfile.tsv.
+ *
+ * The Fields are ordered in the same way the values follow in a line.
+ * Short field overview: DocumentIndex docId_, string docContent_
+ *
+ * @details
+ *
+ * Fields:
+ * - DocumentIndex docId_: The docId is needed to build inverted indices for
+ *                         scoring and building of the docsDB. It is also used
+ *                         to return actual texts when searching for a word.
+ *                         The word (and entity) search returns a table with
+ *                         TextRecordIndex as type of one column. Those get
+ *                         mapped to the next bigger or equal docId which is
+ *                         then used to extract the text from the docsDB.
+ *                         TODO: check if this behaviour is consistently
+ *                         implemented.
+ * - string docContent_: The whole text given after the first tab of a line of
+ *                       docsfile.
+ */
 struct DocsFileLine {
   DocumentIndex docId_;
   string docContent_;
 
-// This class constructs an object that can be iterated to get the normalized
-// words of the text given. The text gets split into tokens using the
-// LiteralsTokenizationDelimiter and those tokens get normalized using
-// the localeManager. You can use the constructed object like
-// obj = TokenizeAndNormalizeText{text, localeManager}
-// for (auto normalizedWord : obj) { code }
-// The type of the value returned when iterating is std::string
-// TODO Adapt the comment (it is now a function, and you call it a
-// little bit differently)
-// TODO Also comment about the lifetime (the `text` and the
-// `localeManager` have to be kept alive while the tokenizer is being used;
-// the tokenizer only uses references.)
+/**
+ * @brief A function that can be used to tokenize and normalize a given text.
+ * @warning Both params are taken by reference, so the original objects have
+ * to be kept alive during the usage of the returned object.
+ * @param text The text to be tokenized and normalized.
+ * @param localeManager The localeManager to be used for normalization.
+ * @details This function can be used in the following way:
+ * for (auto normalizedWord : tokenizeAndNormalizeText(text, localeManager)) {
+ *   code;
+ * }
+ */
 inline auto tokenizeAndNormalizeText(std::string_view text,
                                      const LocaleManager& localeManager) {
   std::vector<std::string_view> split{
@@ -120,15 +129,16 @@ inline auto tokenizeAndNormalizeText(std::string_view text,
                                 return localeManager.getLowercaseUtf8(str);
                               });
 }
-
-// This class is the parent class of WordsFileParser and DocsFileParser and
-// it exists to reduce code duplication since the only difference between the
-// child classes is the line type returned
+/**
+ * @brief This class is the parent class of WordsFileParser and DocsFileParser
+ *
+ * @details It exists to reduce code duplication since the only difference
+ * between the child classes is the line type returned.
+ */
 class WordsAndDocsFileParser {
  public:
   explicit WordsAndDocsFileParser(const string& wordsOrDocsFile,
                                   LocaleManager localeManager);
-  ~WordsAndDocsFileParser() = default;
   explicit WordsAndDocsFileParser(const WordsAndDocsFileParser& other) = delete;
   WordsAndDocsFileParser& operator=(const WordsAndDocsFileParser& other) =
       delete;
@@ -138,13 +148,17 @@ class WordsAndDocsFileParser {
   LocaleManager localeManager_;
 };
 
-// This class takes in the a pathToWordsFile and a localeManager. It then can
-// be used to iterate the wordsFile while already normalizing the words using
-// the localeManager. (If words are entities it doesn't normalize them)
-// An object of this class can be iterated as follows:
-// obj = WordsFileParser{wordsFile, localeManager}
-// for (auto wordsFileLine : obj) { code }
-// The type of the value returned when iterating is WordsFileLine
+/**
+ * @brief This class takes in a pathToWordsFile and a localeManager. It then
+ * can be used to iterate the wordsFile while already normalizing the words
+ * using the localeManager. (If words are entities it doesn't normalize them.)
+ *
+ * @details An object of this class can be iterated as follows:
+ * for (auto wordsFileLine : WordsFileParser{wordsFile, localeManager}) {
+ *   code;
+ * }
+ * The type of the value returned when iterating is WordsFileLine
+ */
 class WordsFileParser : public WordsAndDocsFileParser,
                         public ad_utility::InputRangeFromGet<WordsFileLine> {
  public:
   using WordsAndDocsFileParser::WordsAndDocsFileParser;
   Storage get() override;
 
  private:
 #ifndef NDEBUG
   // Only used for sanity checks in debug builds
   TextRecordIndex lastCId_ = TextRecordIndex::make(0);
 #endif
 };
-// Works similar to WordsFileParser but it instead parses a docsFile and
-// doesn't normalize the text found in docsFile. To parse the returned
-// docContent_ of a DocsFileLine please refer to the TokenizeAndNormalizeText
-// class
+
+/**
+ * @brief This class takes in a pathToDocsFile and a localeManager. It then
+ * can be used to iterate over the docsFile to get the lines.
+ *
+ * @details An object of this class can be iterated as follows:
+ * for (auto docsFileLine : DocsFileParser{docsFile, localeManager}) {
+ *   code;
+ * }
+ * The type of the value returned when iterating is DocsFileLine
+ */
 class DocsFileParser : public WordsAndDocsFileParser,
                        public ad_utility::InputRangeFromGet<DocsFileLine> {
  public:
diff --git a/test/WordsAndDocsFileParserTest.cpp b/test/WordsAndDocsFileParserTest.cpp
index 8a31887d70..35ba0c4ee8 100644
--- a/test/WordsAndDocsFileParserTest.cpp
+++ b/test/WordsAndDocsFileParserTest.cpp
@@ -29,7 +29,8 @@ using StringVec = std::vector<std::string>;
 
 auto getLocaleManager = []() { return LocaleManager("en", "US", false); };
 
-auto wordsFileLineToWordLine = [](WordsFileLine wordsFileLine) -> WordLine {
+auto wordsFileLineToWordLine =
+    [](const WordsFileLine& wordsFileLine) -> WordLine {
   return std::make_tuple(wordsFileLine.word_, wordsFileLine.isEntity_,
                          static_cast<size_t>(wordsFileLine.contextId_.get()),
                          static_cast<size_t>(wordsFileLine.score_));
@@ -37,11 +38,11 @@ auto wordsFileLineToWordLine =
 
 // Lambda that takes in a path to a wordsFile to initialize the parser and an
 // expectedResult that is compared against the parser's output.
-auto testWordsFileParser = [](std::string wordsFilePath,
-                              WordLineVec expectedResult) {
-  WordsFileParser p(wordsFilePath, getLocaleManager());
+auto testWordsFileParser = [](const std::string& wordsFilePath,
+                              const WordLineVec& expectedResult) {
   size_t i = 0;
-  for (auto wordsFileLine : p) {
+  for (auto wordsFileLine :
+       WordsFileParser{wordsFilePath, getLocaleManager()}) {
     ASSERT_TRUE(i < expectedResult.size());
     WordLine testLine = wordsFileLineToWordLine(wordsFileLine);
 
@@ -57,17 +58,16 @@ auto testWordsFileParser = [](const std::string& wordsFilePath,
   ASSERT_EQ(i, expectedResult.size());
 };
 
-auto docsFileLineToDocLine = [](DocsFileLine docsFileLine) -> DocLine {
+auto docsFileLineToDocLine = [](const DocsFileLine& docsFileLine) -> DocLine {
   return std::make_tuple(static_cast<size_t>(docsFileLine.docId_.get()),
                          docsFileLine.docContent_);
 };
 
 // Same as testWordsFileParser but for docsFile
-auto testDocsFileParser = [](std::string docsFilePath,
-                             DocLineVec expectedResult) {
-  DocsFileParser p(docsFilePath, getLocaleManager());
+auto testDocsFileParser = [](const std::string& docsFilePath,
+                             const DocLineVec& expectedResult) {
   size_t i = 0;
-  for (auto docsFileLine : p) {
+  for (auto docsFileLine : DocsFileParser{docsFilePath, getLocaleManager()}) {
     ASSERT_TRUE(i < expectedResult.size());
     DocLine testLine = docsFileLineToDocLine(docsFileLine);
 
@@ -80,11 +80,13 @@ auto testDocsFileParser = [](const std::string& docsFilePath,
 }
 
+// Passing the testText as a copy to make sure it stays alive during the usage
+// of the tokenizer
 auto testTokenizeAndNormalizeText = [](std::string testText,
-                                       StringVec normalizedTextAsVec) {
-  auto testTokenizer = tokenizeAndNormalizeText(testText, getLocaleManager());
+                                       const StringVec& normalizedTextAsVec) {
   size_t i = 0;
-  for (auto normalizedWord : testTokenizer) {
+  for (auto normalizedWord :
+       tokenizeAndNormalizeText(testText, getLocaleManager())) {
     ASSERT_TRUE(i < normalizedTextAsVec.size());
     ASSERT_EQ(normalizedWord, normalizedTextAsVec.at(i));
 
@@ -107,10 +109,11 @@ TEST(WordsAndDocsFileParserTest, wordsFileParserTest) {
     << createWordsFileLineAsString("X", false, 1, 1);
   f.close();
 
-  WordLineVec expected = {
-      std::make_tuple("foo", false, 0, 2), std::make_tuple("foo", false, 0, 2),
-      std::make_tuple("Bär", true, 0, 1), std::make_tuple("äü", false, 0, 1),
-      std::make_tuple("x", false, 1, 1)};
+  WordLineVec expected = {{"foo", false, 0, 2},
+                          {"foo", false, 0, 2},
+                          {"Bär", true, 0, 1},
+                          {"äü", false, 0, 1},
+                          {"x", false, 1, 1}};
 
   testWordsFileParser("_testtmp.contexts.tsv", expected);
   remove("_testtmp.contexts.tsv");
@@ -127,10 +130,10 @@ TEST(WordsAndDocsFileParser, docsFileParserTest) {
     << createDocsFileLineAsString(190293, "Large docId");
   f.close();
 
-  DocLineVec expected = {std::make_pair(4, "This TeSt is OnlyCharcters"),
-                         std::make_pair(7, "Wh4t h4pp3ns t0 num83rs"),
-                         std::make_pair(8, "An( sp@ci*l ch.ar,:act=_er+s"),
-                         std::make_pair(190293, "Large docId")};
+  DocLineVec expected = {{4, "This TeSt is OnlyCharcters"},
+                         {7, "Wh4t h4pp3ns t0 num83rs"},
+                         {8, "An( sp@ci*l ch.ar,:act=_er+s"},
+                         {190293, "Large docId"}};
 
   testDocsFileParser("_testtmp.documents.tsv", expected);
   remove("_testtmp.documents.tsv");

From 46fbb980ccebc2888745041ebbe748ea6e4764e9 Mon Sep 17 00:00:00 2001
From: Felix Meisen
Date: Fri, 10 Jan 2025 15:33:25 +0100
Subject: [PATCH 10/15] Addressed sonar issues
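Const-correctness and encapsulation fixes suggested by Sonar:
`wordsInTextRecords` and `LiteralsTokenizationDelimiter::Find` become const,
`WordsFileParser::get` parses the line through a `std::string_view` instead
of allocating `std::string::substr` copies, and the stream and locale members
of `WordsAndDocsFileParser` move behind `getInputStream()` /
`getLocaleManager()` accessors. For orientation, the tab-separated wordsfile
record that `get()` parses looks like this (sketch; `\t` stands for a tab
character and `<someEntity>` is a made-up placeholder for an entity):

    foo\t0\t0\t2
    <someEntity>\t1\t0\t1

The fields are word, isEntity flag, contextId, and score, in that order.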
---
 src/index/IndexImpl.Text.cpp          |  2 +-
 src/index/IndexImpl.h                 |  2 +-
 src/parser/WordsAndDocsFileParser.cpp | 28 ++++++++++++++-------------
 src/parser/WordsAndDocsFileParser.h   | 10 +++++++---
 4 files changed, 24 insertions(+), 18 deletions(-)

diff --git a/src/index/IndexImpl.Text.cpp b/src/index/IndexImpl.Text.cpp
index b793189d6a..08afa9a112 100644
--- a/src/index/IndexImpl.Text.cpp
+++ b/src/index/IndexImpl.Text.cpp
@@ -23,7 +23,7 @@
 
 // _____________________________________________________________________________
 cppcoro::generator<WordsFileLine> IndexImpl::wordsInTextRecords(
-    const std::string& contextFile, bool addWordsFromLiterals) {
+    const std::string& contextFile, bool addWordsFromLiterals) const {
   auto localeManager = textVocab_.getLocaleManager();
   // ROUND 1: If context file aka wordsfile is not empty, read words from there.
   // Remember the last context id for the (optional) second round.
diff --git a/src/index/IndexImpl.h b/src/index/IndexImpl.h
index 5ce5eeaea3..1b491b04ab 100644
--- a/src/index/IndexImpl.h
+++ b/src/index/IndexImpl.h
@@ -522,7 +522,7 @@ class IndexImpl {
   // testing phase, once it works, it should be easy to include the IRIs and
   // literals from the external vocabulary as well).
   cppcoro::generator<WordsFileLine> wordsInTextRecords(
-      const std::string& contextFile, bool addWordsFromLiterals);
+      const std::string& contextFile, bool addWordsFromLiterals) const;
 
   size_t processWordsForVocabulary(const string& contextFile,
                                    bool addWordsFromLiterals);
diff --git a/src/parser/WordsAndDocsFileParser.cpp b/src/parser/WordsAndDocsFileParser.cpp
index 35bd955700..d5756e01d4 100644
--- a/src/parser/WordsAndDocsFileParser.cpp
+++ b/src/parser/WordsAndDocsFileParser.cpp
@@ -11,30 +11,32 @@
 #include "util/StringUtils.h"
 
 // _____________________________________________________________________________
-WordsAndDocsFileParser::WordsAndDocsFileParser(const string& wordsOrDocsFile,
-                                               LocaleManager localeManager)
-    : in_(wordsOrDocsFile), localeManager_(std::move(localeManager)) {}
+WordsAndDocsFileParser::WordsAndDocsFileParser(
+    const string& wordsOrDocsFile, const LocaleManager& localeManager)
+    : in_(wordsOrDocsFile), localeManager_(localeManager) {}
 
 // _____________________________________________________________________________
 ad_utility::InputRangeFromGet<WordsFileLine>::Storage WordsFileParser::get() {
   WordsFileLine line;
   string l;
-  if (!std::getline(in_, l)) {
+  if (!std::getline(getInputStream(), l)) {
     return std::nullopt;
   };
-  size_t i = l.find('\t');
+  std::string_view lineView(l);
+  size_t i = lineView.find('\t');
   assert(i != string::npos);
   size_t j = i + 2;
-  assert(j + 3 < l.size());
-  size_t k = l.find('\t', j + 2);
+  assert(j + 3 < lineView.size());
+  size_t k = lineView.find('\t', j + 2);
   assert(k != string::npos);
-  line.isEntity_ = (l[i + 1] == '1');
+  line.isEntity_ = (lineView[i + 1] == '1');
   line.word_ =
-      (line.isEntity_ ? l.substr(0, i)
-                      : localeManager_.getLowercaseUtf8(l.substr(0, i)));
+      (line.isEntity_
+           ? lineView.substr(0, i)
+           : getLocaleManager().getLowercaseUtf8(lineView.substr(0, i)));
   line.contextId_ =
-      TextRecordIndex::make(atol(l.substr(j + 1, k - j - 1).c_str()));
-  line.score_ = static_cast<Score>(atol(l.substr(k + 1).c_str()));
+      TextRecordIndex::make(atol(lineView.substr(j + 1, k - j - 1).data()));
+  line.score_ = static_cast<Score>(atol(lineView.substr(k + 1).data()));
 #ifndef NDEBUG
   if (lastCId_ > line.contextId_) {
     AD_THROW("ContextFile has to be sorted by context Id.");
   }
   lastCId_ = line.contextId_;
 #endif
   return line;
 }
 
 // _____________________________________________________________________________
 ad_utility::InputRangeFromGet<DocsFileLine>::Storage DocsFileParser::get() {
   DocsFileLine line;
   string l;
-  if (!std::getline(in_, l)) {
+  if (!std::getline(getInputStream(), l)) {
     return std::nullopt;
   };
   size_t i = l.find('\t');
   assert(i != string::npos);
   line.docId_ = DocumentIndex::make(atol(l.substr(0, i).c_str()));
diff --git a/src/parser/WordsAndDocsFileParser.h b/src/parser/WordsAndDocsFileParser.h
index 8365783a19..ebacfc2ebe 100644
--- a/src/parser/WordsAndDocsFileParser.h
+++ b/src/parser/WordsAndDocsFileParser.h
@@ -101,7 +101,7 @@ struct DocsFileLine {
 // The `Find` function returns the next delimiter in `text` after the given
 // `pos` or an empty substring if there is no next delimiter.
struct LiteralsTokenizationDelimiter { - absl::string_view Find(absl::string_view text, size_t pos) { + absl::string_view Find(absl::string_view text, size_t pos) const { auto isWordChar = [](char c) -> bool { return std::isalnum(c); }; auto found = std::find_if_not(text.begin() + pos, text.end(), isWordChar); if (found == text.end()) return text.substr(text.size()); @@ -138,12 +138,16 @@ inline auto tokenizeAndNormalizeText(std::string_view text, class WordsAndDocsFileParser { public: explicit WordsAndDocsFileParser(const string& wordsOrDocsFile, - LocaleManager localeManager); + const LocaleManager& localeManager); explicit WordsAndDocsFileParser(const WordsAndDocsFileParser& other) = delete; WordsAndDocsFileParser& operator=(const WordsAndDocsFileParser& other) = delete; protected: + std::ifstream& getInputStream() { return in_; } + const LocaleManager& getLocaleManager() { return localeManager_; } + + private: std::ifstream in_; LocaleManager localeManager_; }; @@ -165,8 +169,8 @@ class WordsFileParser : public WordsAndDocsFileParser, using WordsAndDocsFileParser::WordsAndDocsFileParser; Storage get() override; - private: #ifndef NDEBUG + private: // Only used for sanity checks in debug builds TextRecordIndex lastCId_ = TextRecordIndex::make(0); #endif From 1e0fc147c207234821248a0afae2434c57874c79 Mon Sep 17 00:00:00 2001 From: Felix Meisen Date: Fri, 10 Jan 2025 16:21:35 +0100 Subject: [PATCH 11/15] Removed the temporary localeManagers in WordsAndDocsFileParserTest.cpp --- test/WordsAndDocsFileParserTest.cpp | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/test/WordsAndDocsFileParserTest.cpp b/test/WordsAndDocsFileParserTest.cpp index 35ba0c4ee8..de7216ada7 100644 --- a/test/WordsAndDocsFileParserTest.cpp +++ b/test/WordsAndDocsFileParserTest.cpp @@ -27,7 +27,9 @@ using StringVec = std::vector; /// Lambdas -auto getLocaleManager = []() { return LocaleManager("en", "US", false); }; +auto getLocaleManager = []() -> LocaleManager { + return LocaleManager("en", "US", false); +}; auto wordsFileLineToWordLine = [](const WordsFileLine& wordsFileLine) -> WordLine { @@ -41,8 +43,8 @@ auto wordsFileLineToWordLine = auto testWordsFileParser = [](const std::string& wordsFilePath, const WordLineVec& expectedResult) { size_t i = 0; - for (auto wordsFileLine : - WordsFileParser{wordsFilePath, getLocaleManager()}) { + LocaleManager localeManager = getLocaleManager(); + for (auto wordsFileLine : WordsFileParser{wordsFilePath, localeManager}) { ASSERT_TRUE(i < expectedResult.size()); WordLine testLine = wordsFileLineToWordLine(wordsFileLine); @@ -67,7 +69,8 @@ auto docsFileLineToDocLine = [](const DocsFileLine& docsFileLine) -> DocLine { auto testDocsFileParser = [](const std::string& docsFilePath, const DocLineVec& expectedResult) { size_t i = 0; - for (auto docsFileLine : DocsFileParser{docsFilePath, getLocaleManager()}) { + LocaleManager localeManager = getLocaleManager(); + for (auto docsFileLine : DocsFileParser{docsFilePath, localeManager}) { ASSERT_TRUE(i < expectedResult.size()); DocLine testLine = docsFileLineToDocLine(docsFileLine); @@ -85,8 +88,9 @@ auto testDocsFileParser = [](const std::string& docsFilePath, auto testTokenizeAndNormalizeText = [](std::string testText, const StringVec& normalizedTextAsVec) { size_t i = 0; + LocaleManager localeManager = getLocaleManager(); for (auto normalizedWord : - tokenizeAndNormalizeText(testText, getLocaleManager())) { + tokenizeAndNormalizeText(testText, localeManager)) { ASSERT_TRUE(i < normalizedTextAsVec.size()); 
     ASSERT_EQ(normalizedWord, normalizedTextAsVec.at(i));
 
From 9f9738c76d294482d57c250c464873eca6b4c983 Mon Sep 17 00:00:00 2001
From: Felix Meisen
Date: Sat, 11 Jan 2025 12:46:24 +0100
Subject: [PATCH 12/15] Addressed more SonarQube problems
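Extract the entity and word handling of the big loop in
`processWordsForInvertedLists` into the helpers
`processEntityCaseDuringInvertedListProcessing`,
`processWordCaseDuringInvertedListProcessing`, and `logEntityNotFound`, and
take `contextFile` by value in the coroutine `wordsInTextRecords`
(presumably because a coroutine should not hold a reference to a caller-owned
string across suspension points). The loop body now reduces to roughly:

    if (line.isEntity_) {
      ++nofEntityPostings;
      processEntityCaseDuringInvertedListProcessing(
          line, entitiesInContext, nofLiterals, entityNotFoundErrorMsgCount);
    } else {
      ++nofWordPostings;
      processWordCaseDuringInvertedListProcessing(line, wordsInContext);
    }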
---
 src/index/IndexImpl.Text.cpp          | 86 +++++++++++++++++----------
 src/index/IndexImpl.h                 | 14 ++++-
 src/parser/WordsAndDocsFileParser.cpp |  6 +-
 src/parser/WordsAndDocsFileParser.h   |  4 +-
 4 files changed, 71 insertions(+), 39 deletions(-)

diff --git a/src/index/IndexImpl.Text.cpp b/src/index/IndexImpl.Text.cpp
index 08afa9a112..a27a5a7817 100644
--- a/src/index/IndexImpl.Text.cpp
+++ b/src/index/IndexImpl.Text.cpp
@@ -23,7 +23,7 @@
 
 // _____________________________________________________________________________
 cppcoro::generator<WordsFileLine> IndexImpl::wordsInTextRecords(
-    const std::string& contextFile, bool addWordsFromLiterals) const {
+    std::string contextFile, bool addWordsFromLiterals) const {
   auto localeManager = textVocab_.getLocaleManager();
   // ROUND 1: If context file aka wordsfile is not empty, read words from there.
   // Remember the last context id for the (optional) second round.
@@ -62,6 +62,56 @@ cppcoro::generator<WordsFileLine> IndexImpl::wordsInTextRecords(
   }
 }
 
+// _____________________________________________________________________________
+void IndexImpl::processEntityCaseDuringInvertedListProcessing(
+    const WordsFileLine& line,
+    ad_utility::HashMap<Id, Score>& entitiesInContext, size_t& nofLiterals,
+    size_t& entityNotFoundErrorMsgCount) const {
+  VocabIndex eid;
+  // TODO Currently only IRIs and strings from the vocabulary can
+  // be tagged entities in the text index (no doubles, ints, etc).
+  if (getVocab().getId(line.word_, &eid)) {
+    // Note that `entitiesInContext` is a HashMap, so the `Id`s don't have
+    // to be contiguous.
+    entitiesInContext[Id::makeFromVocabIndex(eid)] += line.score_;
+    if (line.isLiteralEntity_) {
+      ++nofLiterals;
+    }
+  } else {
+    logEntityNotFound(line.word_, entityNotFoundErrorMsgCount);
+  }
+}
+
+// _____________________________________________________________________________
+void IndexImpl::processWordCaseDuringInvertedListProcessing(
+    const WordsFileLine& line,
+    ad_utility::HashMap<WordIndex, Score>& wordsInContext) const {
+  // TODO Let the `textVocab_` return a `WordIndex` directly.
+  WordVocabIndex vid;
+  bool ret = textVocab_.getId(line.word_, &vid);
+  WordIndex wid = vid.get();
+  if (!ret) {
+    LOG(ERROR) << "ERROR: word \"" << line.word_ << "\" "
+               << "not found in textVocab. Terminating\n";
+    AD_FAIL();
+  }
+  wordsInContext[wid] += line.score_;
+}
+
+// _____________________________________________________________________________
+void IndexImpl::logEntityNotFound(const string& word,
+                                  size_t& entityNotFoundErrorMsgCount) const {
+  if (entityNotFoundErrorMsgCount < 20) {
+    LOG(WARN) << "Entity from text not in KB: " << word << '\n';
+    if (++entityNotFoundErrorMsgCount == 20) {
+      LOG(WARN) << "There are more entities not in the KB..."
+                << " suppressing further warnings...\n";
+    }
+  } else {
+    entityNotFoundErrorMsgCount++;
+  }
+}
+
 // _____________________________________________________________________________
 void IndexImpl::addTextFromContextFile(const string& contextFile,
                                        bool addWordsFromLiterals) {
@@ -234,39 +284,11 @@ void IndexImpl::processWordsForInvertedLists(const string& contextFile,
     }
     if (line.isEntity_) {
       ++nofEntityPostings;
-      // TODO Currently only IRIs and strings from the vocabulary can
-      // be tagged entities in the text index (no doubles, ints, etc).
-      VocabIndex eid;
-      if (getVocab().getId(line.word_, &eid)) {
-        // Note that `entitiesInContext` is a HashMap, so the `Id`s don't have
-        // to be contiguous.
-        entitiesInContext[Id::makeFromVocabIndex(eid)] += line.score_;
-        if (line.isLiteralEntity_) {
-          ++nofLiterals;
-        }
-      } else {
-        if (entityNotFoundErrorMsgCount < 20) {
-          LOG(WARN) << "Entity from text not in KB: " << line.word_ << '\n';
-          if (++entityNotFoundErrorMsgCount == 20) {
-            LOG(WARN) << "There are more entities not in the KB..."
-                      << " suppressing further warnings...\n";
-          }
-        } else {
-          entityNotFoundErrorMsgCount++;
-        }
-      }
+      processEntityCaseDuringInvertedListProcessing(
+          line, entitiesInContext, nofLiterals, entityNotFoundErrorMsgCount);
     } else {
       ++nofWordPostings;
-      // TODO Let the `textVocab_` return a `WordIndex` directly.
-      WordVocabIndex vid;
-      bool ret = textVocab_.getId(line.word_, &vid);
-      WordIndex wid = vid.get();
-      if (!ret) {
-        LOG(ERROR) << "ERROR: word \"" << line.word_ << "\" "
-                   << "not found in textVocab. Terminating\n";
-        AD_FAIL();
-      }
-      wordsInContext[wid] += line.score_;
+      processWordCaseDuringInvertedListProcessing(line, wordsInContext);
     }
   }
   if (entityNotFoundErrorMsgCount > 0) {
diff --git a/src/index/IndexImpl.h b/src/index/IndexImpl.h
index 1b491b04ab..ac0003db87 100644
--- a/src/index/IndexImpl.h
+++ b/src/index/IndexImpl.h
@@ -522,7 +522,19 @@ class IndexImpl {
   // testing phase, once it works, it should be easy to include the IRIs and
   // literals from the external vocabulary as well).
   cppcoro::generator<WordsFileLine> wordsInTextRecords(
-      const std::string& contextFile, bool addWordsFromLiterals) const;
+      std::string contextFile, bool addWordsFromLiterals) const;
+
+  void processEntityCaseDuringInvertedListProcessing(
+      const WordsFileLine& line,
+      ad_utility::HashMap<Id, Score>& entitiesInContext, size_t& nofLiterals,
+      size_t& entityNotFoundErrorMsgCount) const;
+
+  void processWordCaseDuringInvertedListProcessing(
+      const WordsFileLine& line,
+      ad_utility::HashMap<WordIndex, Score>& wordsInContext) const;
+
+  void logEntityNotFound(const string& word,
+                         size_t& entityNotFoundErrorMsgCount) const;
 
   size_t processWordsForVocabulary(const string& contextFile,
                                    bool addWordsFromLiterals);
diff --git a/src/parser/WordsAndDocsFileParser.cpp b/src/parser/WordsAndDocsFileParser.cpp
index d5756e01d4..e7d36974c6 100644
--- a/src/parser/WordsAndDocsFileParser.cpp
+++ b/src/parser/WordsAndDocsFileParser.cpp
@@ -21,7 +21,7 @@ ad_utility::InputRangeFromGet<WordsFileLine>::Storage WordsFileParser::get() {
   string l;
   if (!std::getline(getInputStream(), l)) {
     return std::nullopt;
-  };
+  }
   std::string_view lineView(l);
   size_t i = lineView.find('\t');
   assert(i != string::npos);
@@ -48,11 +48,11 @@ ad_utility::InputRangeFromGet<DocsFileLine>::Storage DocsFileParser::get() {
-  DocsFileLine line;
   string l;
   if (!std::getline(getInputStream(), l)) {
     return std::nullopt;
-  };
+  }
+  DocsFileLine line;
   size_t i = l.find('\t');
   assert(i != string::npos);
   line.docId_ = DocumentIndex::make(atol(l.substr(0, i).c_str()));
diff --git a/src/parser/WordsAndDocsFileParser.h b/src/parser/WordsAndDocsFileParser.h
index ebacfc2ebe..1fc80523ff 100644
--- a/src/parser/WordsAndDocsFileParser.h
+++ b/src/parser/WordsAndDocsFileParser.h
@@ -87,8 +87,6 @@ struct WordsFileLine {
 *                         The word (and entity) search returns a table with
 *                         TextRecordIndex as type of one column. Those get
 *                         mapped to the next bigger or equal docId which is
 *                         then used to extract the text from the docsDB.
- * TODO: check if this behaviour is consistently - * implemented. * - string docContent_: The whole text given after the first tab of a line of * docsfile. */ @@ -145,7 +143,7 @@ class WordsAndDocsFileParser { protected: std::ifstream& getInputStream() { return in_; } - const LocaleManager& getLocaleManager() { return localeManager_; } + const LocaleManager& getLocaleManager() const { return localeManager_; } private: std::ifstream in_; From a55f2be2ba377e2f1f8c8b3c0e90b419a6fbb55a Mon Sep 17 00:00:00 2001 From: Felix Meisen Date: Sat, 11 Jan 2025 14:48:14 +0100 Subject: [PATCH 13/15] For now excluding helper functions from code coverage since they could be outsourced in further refactorings --- src/index/IndexImpl.Text.cpp | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/index/IndexImpl.Text.cpp b/src/index/IndexImpl.Text.cpp index a27a5a7817..bde9440aa6 100644 --- a/src/index/IndexImpl.Text.cpp +++ b/src/index/IndexImpl.Text.cpp @@ -62,6 +62,7 @@ cppcoro::generator IndexImpl::wordsInTextRecords( } } +// LCOV_EXCL_START // _____________________________________________________________________________ void IndexImpl::processEntityCaseDuringInvertedListProcessing( const WordsFileLine& line, @@ -111,6 +112,7 @@ void IndexImpl::logEntityNotFound(const string& word, entityNotFoundErrorMsgCount++; } } +// LCOV_EXCL_STOP // _____________________________________________________________________________ void IndexImpl::addTextFromContextFile(const string& contextFile, From bea5936cc0c5ea7468cdd41757920a08155a3951 Mon Sep 17 00:00:00 2001 From: Felix Meisen Date: Sat, 11 Jan 2025 15:39:33 +0100 Subject: [PATCH 14/15] Reverting last commit --- src/index/IndexImpl.Text.cpp | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/index/IndexImpl.Text.cpp b/src/index/IndexImpl.Text.cpp index bde9440aa6..a27a5a7817 100644 --- a/src/index/IndexImpl.Text.cpp +++ b/src/index/IndexImpl.Text.cpp @@ -62,7 +62,6 @@ cppcoro::generator IndexImpl::wordsInTextRecords( } } -// LCOV_EXCL_START // _____________________________________________________________________________ void IndexImpl::processEntityCaseDuringInvertedListProcessing( const WordsFileLine& line, @@ -112,7 +111,6 @@ void IndexImpl::logEntityNotFound(const string& word, entityNotFoundErrorMsgCount++; } } -// LCOV_EXCL_STOP // _____________________________________________________________________________ void IndexImpl::addTextFromContextFile(const string& contextFile, From 349be6d4518148ad207f90881b3cec2329fa7676 Mon Sep 17 00:00:00 2001 From: Felix Meisen Date: Wed, 15 Jan 2025 14:14:49 +0100 Subject: [PATCH 15/15] Small improvement --- src/index/IndexImpl.Text.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/index/IndexImpl.Text.cpp b/src/index/IndexImpl.Text.cpp index a27a5a7817..d1e6a70777 100644 --- a/src/index/IndexImpl.Text.cpp +++ b/src/index/IndexImpl.Text.cpp @@ -54,7 +54,7 @@ cppcoro::generator IndexImpl::wordsInTextRecords( textView = textView.substr(0, textView.rfind('"')); textView.remove_prefix(1); for (auto word : tokenizeAndNormalizeText(textView, localeManager)) { - WordsFileLine wordLine{word, false, contextId, 1}; + WordsFileLine wordLine{std::move(word), false, contextId, 1}; co_yield wordLine; } contextId = contextId.incremented();