Commit
Improved comment, addressed small requested changes
Flixtastic committed Jan 10, 2025
1 parent 0369de6 commit c412983
Showing 3 changed files with 131 additions and 108 deletions.
3 changes: 1 addition & 2 deletions src/index/IndexImpl.Text.cpp
@@ -53,8 +53,7 @@ cppcoro::generator<WordsFileLine> IndexImpl::wordsInTextRecords(
std::string_view textView = text;
textView = textView.substr(0, textView.rfind('"'));
textView.remove_prefix(1);
- auto normalizedWords = tokenizeAndNormalizeText(textView, localeManager);
- for (auto word : normalizedWords) {
+ for (auto word : tokenizeAndNormalizeText(textView, localeManager)) {
WordsFileLine wordLine{word, false, contextId, 1};
co_yield wordLine;
}
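The change above drops the named intermediate `normalizedWords` and iterates the returned range directly. A minimal sketch of the resulting pattern, assuming a LocaleManager constructed as in the tests further down (the input literal is illustrative):

    // #include "parser/WordsAndDocsFileParser.h"
    LocaleManager localeManager("en", "US", false);
    std::string_view textView = "some literal text";
    // The temporary range returned by tokenizeAndNormalizeText is bound to a
    // reference by the range-based for loop and lives until the loop ends;
    // textView and localeManager are only referenced by it and must outlive
    // the loop themselves.
    for (auto word : tokenizeAndNormalizeText(textView, localeManager)) {
      // word is a tokenized and normalized (lowercased) piece of textView.
    }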
191 changes: 106 additions & 85 deletions src/parser/WordsAndDocsFileParser.h
@@ -17,44 +17,50 @@
#include "util/Views.h"

using std::string;
- // Represents a line from the wordsfile.tsv, which stores everything given in
- // the file line and extra information with isLiteralEntity. Also used to add
- // literals to the text index through emulating wordsfile lines.
- //
- // The Fields are ordered in the same way the values follow in a line.
- // Short field overview: string word_, bool isEntity, TextRecordIndex contextId,
- //                       Score score_, bool isLiteralEntity (not found in
- //                       wordsfile)
- //
- // Fields:
- // - string word_: The string of the word, if it is an entity it will be
- //                 <Entity_Name>. bool isEntity_: True if the given word is an
- //                 entity, false if it's a word.
- // - TextRecordIndex contextId_: When creating the wordsfile docs from the
- //                               docsfile get split into so called contexts.
- //                               Those contexts overlap, meaning words and
- //                               entities are covered multiple times. Each
- //                               contextId corresponds to the next bigger or
- //                               equal docId.
- // - Score score_: Either 1 or 0 if isEntity is false. 0, 1, 100, 150 if
- //                 isEntity is true. (this info is only constructed on the
- //                 scientists.wordsfile.tsv) The score in the wordsfile is only
- //                 relevant for the counting scoring metric. Because of the
- //                 overlap of contexts the score is 1 if the word really has
- //                 been seen for the first time and 0 if not. If a doc contains
- //                 multiple mentions of a word there should be exactly as many
- //                 wordsfile lines of that word with score 1 as there are
- //                 mentions. The score for entities seems rather random and
- //                 since no clear explanation of the creation of wordsfiles
- //                 has been found yet they will stay rather random.
- // - bool isLiteralEntity_: This does not directly stem from the wordsfile.
- //                          When building the text index with literals, for
- //                          every literal there will be WordsFileLines for all
- //                          words in that literal. Additionally the whole
- //                          literal itself will be added as word with isEntity
- //                          being true. The need to count this comes only from
- //                          a trick used in testing right now. To be specific
- //                          the method getTextRecordFromResultTable

+ /**
+  * @brief Represents a line in the words file.
+  *
+  * This struct holds information about a word or entity as it appears in the
+  * words file.
+  *
+  * The fields are ordered in the same way the values follow in a line.
+  * Short field overview: string word_, bool isEntity_, TextRecordIndex
+  *                       contextId_, Score score_, bool isLiteralEntity_
+  *                       (not found in the wordsfile)
+  *
+  * @details
+  *
+  * Fields:
+  * - string word_: The string of the word. If it is an entity, it will be
+  *                 <Entity_Name>.
+  * - bool isEntity_: True if the given word is an entity, false if it's a word.
+  * - TextRecordIndex contextId_: When creating the wordsfile, docs from the
+  *                               docsfile get split into so-called contexts.
+  *                               Those contexts overlap, meaning words and
+  *                               entities are covered multiple times. Each
+  *                               contextId corresponds to the next bigger or
+  *                               equal docId.
+  * - Score score_: Either 1 or 0 if isEntity_ is false; 0, 1, 100, or 150 if
+  *                 isEntity_ is true. (So far this has only been observed on
+  *                 the scientists.wordsfile.tsv.) The score in the wordsfile
+  *                 is only relevant for the counting scoring metric. Because
+  *                 of the overlap of contexts, the score is 1 if the word
+  *                 really has been seen for the first time and 0 if not. If a
+  *                 doc contains multiple mentions of a word, there should be
+  *                 exactly as many wordsfile lines of that word with score 1
+  *                 as there are mentions. The scores for entities seem rather
+  *                 arbitrary, and since no clear explanation of how wordsfiles
+  *                 are created has been found yet, they will stay that way.
+  * - bool isLiteralEntity_: This does not directly stem from the wordsfile.
+  *                          When building the text index with literals, there
+  *                          will be WordsFileLines for all words in each
+  *                          literal. Additionally, the whole literal itself
+  *                          will be added as a word with isEntity_ being true.
+  *                          The need to count this comes only from a trick
+  *                          currently used in testing, specifically in the
+  *                          method getTextRecordFromResultTable.
+  */
struct WordsFileLine {
string word_;
bool isEntity_;
@@ -63,25 +69,29 @@ struct WordsFileLine {
bool isLiteralEntity_ = false;
};
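To make the field layout concrete, a hedged example (the values are invented, not taken from a real wordsfile): a tab-separated wordsfile line such as

    <Albert_Einstein>    1    5    1

would be parsed into a WordsFileLine with word_ == "<Albert_Einstein>", isEntity_ == true, contextId_ == 5 and score_ == 1; isLiteralEntity_ keeps its default of false, since it is not part of the file format.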

- // Represents a line from the docsfile.tsv, which stores everything given in
- // the file line.
- //
- // The Fields are ordered in the same way the values follow in a line.
- // Short field overview: DocumentIndex docId_, string docContent_
- //
- // Fields:
- //
- // - DocumentIndex docId_: The docId is needed to built inverted indices for
- //                         Scoring and building of the docsDB. It is also used
- //                         to return actual texts when searching for a word.
- //                         The word (and entity) search returns a table with
- //                         TextRecordIndex as type of one col. Those get mapped
- //                         to the next bigger or equal docId which is then
- //                         used to extract the text from the docsDB.
- //                         TODO: check if this behaviour is consintently
- //                         implemented
- // - string docContent_: The whole text given after the first tab of a line of
- //                       docsfile.
+ /**
+  * @brief Represents a line from the docsfile.tsv.
+  *
+  * This struct stores everything given in a line of the docsfile.tsv.
+  *
+  * The fields are ordered in the same way the values follow in a line.
+  * Short field overview: DocumentIndex docId_, string docContent_
+  *
+  * @details
+  *
+  * Fields:
+  * - DocumentIndex docId_: The docId is needed to build inverted indices for
+  *                         scoring and building of the docsDB. It is also used
+  *                         to return actual texts when searching for a word.
+  *                         The word (and entity) search returns a table with
+  *                         TextRecordIndex as the type of one column. Those
+  *                         get mapped to the next bigger or equal docId, which
+  *                         is then used to extract the text from the docsDB.
+  *                         TODO: check if this behaviour is consistently
+  *                         implemented.
+  * - string docContent_: The whole text given after the first tab of a line of
+  *                       the docsfile.
+  */
struct DocsFileLine {
DocumentIndex docId_;
string docContent_;
@@ -99,18 +109,17 @@ struct LiteralsTokenizationDelimiter {
}
};

- // This class constructs an object that can be iterated to get the normalized
- // words of the text given. The text gets split into tokens using the
- // LiteralsTokenizationDelimiter and those tokens get normalized using
- // the localeManager. You can use the constructed object like
- // obj = TokenizeAndNormalizeText{text, localeManager}
- // for (auto normalizedWord : obj) { code }
- // The type of the value returned when iterating is std::string
- // TODO<flixtastic> Adapt the comment (it is now a function, and you call it a
- //                  little bit differently)
- // TODO<flixtastic> Also comment about the lifetime (the `text` and the
- //                  `localeManager` have to be kept alive while the tokenizer
- //                  is being used, the tokenizer only uses references.
+ /**
+  * @brief A function that can be used to tokenize and normalize a given text.
+  * @warning The string viewed by text and the localeManager are only
+  *          referenced; both underlying objects have to be kept alive while
+  *          the returned range is in use.
+  * @param text The text to be tokenized and normalized.
+  * @param localeManager The LocaleManager to be used for normalization.
+  * @details This function can be used in the following way:
+  * for (auto normalizedWord : tokenizeAndNormalizeText(text, localeManager)) {
+  *   code;
+  * }
+  */
inline auto tokenizeAndNormalizeText(std::string_view text,
const LocaleManager& localeManager) {
std::vector<std::string_view> split{
@@ -120,15 +129,16 @@ inline auto tokenizeAndNormalizeText(std::string_view text,
return localeManager.getLowercaseUtf8(str);
});
}
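A short usage sketch for the function above; the input string and the expected tokens are illustrative assumptions, not taken from the repository:

    LocaleManager localeManager("en", "US", false);
    std::string text = "Comparing Apples and Oranges";
    std::vector<std::string> words;
    // Tokens are split at delimiter characters and lowercased via the
    // LocaleManager, so this should yield {"comparing", "apples", "and",
    // "oranges"}.
    for (auto word : tokenizeAndNormalizeText(text, localeManager)) {
      words.push_back(word);
    }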

- // This class is the parent class of WordsFileParser and DocsFileParser and
- // it exists to reduce code duplication since the only difference between the
- // child classes is the line type returned
+ /**
+  * @brief This class is the parent class of WordsFileParser and DocsFileParser.
+  *
+  * @details It exists to reduce code duplication, since the only difference
+  * between the child classes is the line type returned.
+  */
class WordsAndDocsFileParser {
public:
explicit WordsAndDocsFileParser(const string& wordsOrDocsFile,
LocaleManager localeManager);
- ~WordsAndDocsFileParser() = default;
explicit WordsAndDocsFileParser(const WordsAndDocsFileParser& other) = delete;
WordsAndDocsFileParser& operator=(const WordsAndDocsFileParser& other) =
delete;
@@ -138,13 +148,17 @@ class WordsAndDocsFileParser {
LocaleManager localeManager_;
};

- // This class takes in the a pathToWordsFile and a localeManager. It then can
- // be used to iterate the wordsFile while already normalizing the words using
- // the localeManager. (If words are entities it doesn't normalize them)
- // An object of this class can be iterated as follows:
- // obj = WordsFileParser{wordsFile, localeManager}
- // for (auto wordsFileLine : obj) { code }
- // The type of the value returned when iterating is WordsFileLine
+ /**
+  * @brief This class takes in a pathToWordsFile and a localeManager. It can
+  * then be used to iterate the wordsFile while already normalizing the words
+  * using the localeManager. (If words are entities, it doesn't normalize
+  * them.)
+  *
+  * @details An object of this class can be iterated as follows:
+  * for (auto wordsFileLine : WordsFileParser{wordsFile, localeManager}) {
+  *   code;
+  * }
+  * The type of the value returned when iterating is WordsFileLine.
+  */
class WordsFileParser : public WordsAndDocsFileParser,
public ad_utility::InputRangeFromGet<WordsFileLine> {
public:
@@ -157,10 +171,17 @@ class WordsFileParser : public WordsAndDocsFileParser,
TextRecordIndex lastCId_ = TextRecordIndex::make(0);
#endif
};
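A sketch of how the parser might be driven, following the usage pattern from its comment (the file name is hypothetical):

    for (auto line : WordsFileParser{"example.wordsfile.tsv",
                                     LocaleManager("en", "US", false)}) {
      if (line.isEntity_) {
        // line.word_ holds an <Entity_Name> here and was not normalized.
      }
    }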
- // Works similar to WordsFileParser but it instead parses a docsFile and
- // doesn't normalize the text found in docsFile. To parse the returned
- // docContent_ of a DocsFileLine please refer to the TokenizeAndNormalizeText
- // class

+ /**
+  * @brief This class takes in a pathToDocsFile and a localeManager. It can
+  * then be used to iterate over the docsFile to get its lines.
+  *
+  * @details An object of this class can be iterated as follows:
+  * for (auto docsFileLine : DocsFileParser{docsFile, localeManager}) {
+  *   code;
+  * }
+  * The type of the value returned when iterating is DocsFileLine.
+  */
class DocsFileParser : public WordsAndDocsFileParser,
public ad_utility::InputRangeFromGet<DocsFileLine> {
public:
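The removed comment's hint still applies: DocsFileParser does not normalize docContent_, so callers tokenize it themselves. A hedged sketch of that combination (the file name is hypothetical):

    LocaleManager localeManager("en", "US", false);
    for (auto docsFileLine :
         DocsFileParser{"example.docsfile.tsv", localeManager}) {
      // Tokenize and normalize the raw document text with the same machinery
      // that is used for the words.
      for (auto word :
           tokenizeAndNormalizeText(docsFileLine.docContent_, localeManager)) {
        // ...
      }
    }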
45 changes: 24 additions & 21 deletions test/WordsAndDocsFileParserTest.cpp
@@ -29,19 +29,20 @@ using StringVec = std::vector<std::string>;

auto getLocaleManager = []() { return LocaleManager("en", "US", false); };

- auto wordsFileLineToWordLine = [](WordsFileLine wordsFileLine) -> WordLine {
+ auto wordsFileLineToWordLine =
+     [](const WordsFileLine& wordsFileLine) -> WordLine {
return std::make_tuple(wordsFileLine.word_, wordsFileLine.isEntity_,
static_cast<size_t>(wordsFileLine.contextId_.get()),
static_cast<size_t>(wordsFileLine.score_));
};

// Lambda that takes in a path to a wordsFile to initialize the parser and an
// expectedResult that is compared against the parser's output.
- auto testWordsFileParser = [](std::string wordsFilePath,
-                               WordLineVec expectedResult) {
-   WordsFileParser p(wordsFilePath, getLocaleManager());
+ auto testWordsFileParser = [](const std::string& wordsFilePath,
+                               const WordLineVec& expectedResult) {
size_t i = 0;
-   for (auto wordsFileLine : p) {
+   for (auto wordsFileLine :
+        WordsFileParser{wordsFilePath, getLocaleManager()}) {
ASSERT_TRUE(i < expectedResult.size());
WordLine testLine = wordsFileLineToWordLine(wordsFileLine);

@@ -57,17 +58,16 @@ auto testWordsFileParser = [](std::string wordsFilePath,
ASSERT_EQ(i, expectedResult.size());
};

- auto docsFileLineToDocLine = [](DocsFileLine docsFileLine) -> DocLine {
+ auto docsFileLineToDocLine = [](const DocsFileLine& docsFileLine) -> DocLine {
return std::make_tuple(static_cast<size_t>(docsFileLine.docId_.get()),
docsFileLine.docContent_);
};

// Same as testWordsFileParser but for docsFile
- auto testDocsFileParser = [](std::string docsFilePath,
-                              DocLineVec expectedResult) {
-   DocsFileParser p(docsFilePath, getLocaleManager());
+ auto testDocsFileParser = [](const std::string& docsFilePath,
+                              const DocLineVec& expectedResult) {
size_t i = 0;
-   for (auto docsFileLine : p) {
+   for (auto docsFileLine : DocsFileParser{docsFilePath, getLocaleManager()}) {
ASSERT_TRUE(i < expectedResult.size());
DocLine testLine = docsFileLineToDocLine(docsFileLine);

@@ -80,11 +80,13 @@ auto testDocsFileParser = [](std::string docsFilePath,
}
};

+ // Passing the testText as copy to make sure it stays alive during the usage
+ // of the tokenizer.
auto testTokenizeAndNormalizeText = [](std::string testText,
-                                        StringVec normalizedTextAsVec) {
-   auto testTokenizer = tokenizeAndNormalizeText(testText, getLocaleManager());
+                                        const StringVec& normalizedTextAsVec) {
size_t i = 0;
-   for (auto normalizedWord : testTokenizer) {
+   for (auto normalizedWord :
+        tokenizeAndNormalizeText(testText, getLocaleManager())) {
ASSERT_TRUE(i < normalizedTextAsVec.size());
ASSERT_EQ(normalizedWord, normalizedTextAsVec.at(i));

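Given the helper above, a call might look like this (the input and the expected tokens are invented for illustration; splitting at non-word characters and lowercasing via getLowercaseUtf8 are assumed):

    testTokenizeAndNormalizeText("Two wOrds", {"two", "words"});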
@@ -107,10 +109,11 @@ TEST(WordsAndDocsFileParserTest, wordsFileParserTest) {
<< createWordsFileLineAsString("X", false, 1, 1);
f.close();

- WordLineVec expected = {
-     std::make_tuple("foo", false, 0, 2), std::make_tuple("foo", false, 0, 2),
-     std::make_tuple("Bär", true, 0, 1), std::make_tuple("äü", false, 0, 1),
-     std::make_tuple("x", false, 1, 1)};
+ WordLineVec expected = {{"foo", false, 0, 2},
+                         {"foo", false, 0, 2},
+                         {"Bär", true, 0, 1},
+                         {"äü", false, 0, 1},
+                         {"x", false, 1, 1}};

testWordsFileParser("_testtmp.contexts.tsv", expected);
remove("_testtmp.contexts.tsv");
@@ -127,10 +130,10 @@ TEST(WordsAndDocsFileParser, docsFileParserTest) {
<< createDocsFileLineAsString(190293, "Large docId");
f.close();

- DocLineVec expected = {std::make_pair(4, "This TeSt is OnlyCharcters"),
-                        std::make_pair(7, "Wh4t h4pp3ns t0 num83rs"),
-                        std::make_pair(8, "An( sp@ci*l ch.ar,:act=_er+s"),
-                        std::make_pair(190293, "Large docId")};
+ DocLineVec expected = {{4, "This TeSt is OnlyCharcters"},
+                        {7, "Wh4t h4pp3ns t0 num83rs"},
+                        {8, "An( sp@ci*l ch.ar,:act=_er+s"},
+                        {190293, "Large docId"}};

testDocsFileParser("_testtmp.documents.tsv", expected);
remove("_testtmp.documents.tsv");
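For reference, createDocsFileLineAsString presumably emits one tab-separated line per call, so the file written above would look roughly like this (reconstructed, not copied from the repository):

    4         This TeSt is OnlyCharcters
    7         Wh4t h4pp3ns t0 num83rs
    8         An( sp@ci*l ch.ar,:act=_er+s
    190293    Large docId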
