diff --git a/src/index/IndexImpl.Text.cpp b/src/index/IndexImpl.Text.cpp index b793189d6a..08afa9a112 100644 --- a/src/index/IndexImpl.Text.cpp +++ b/src/index/IndexImpl.Text.cpp @@ -23,7 +23,7 @@ // _____________________________________________________________________________ cppcoro::generator IndexImpl::wordsInTextRecords( - const std::string& contextFile, bool addWordsFromLiterals) { + const std::string& contextFile, bool addWordsFromLiterals) const { auto localeManager = textVocab_.getLocaleManager(); // ROUND 1: If context file aka wordsfile is not empty, read words from there. // Remember the last context id for the (optional) second round. diff --git a/src/index/IndexImpl.h b/src/index/IndexImpl.h index 5ce5eeaea3..1b491b04ab 100644 --- a/src/index/IndexImpl.h +++ b/src/index/IndexImpl.h @@ -522,7 +522,7 @@ class IndexImpl { // testing phase, once it works, it should be easy to include the IRIs and // literals from the external vocabulary as well). cppcoro::generator wordsInTextRecords( - const std::string& contextFile, bool addWordsFromLiterals); + const std::string& contextFile, bool addWordsFromLiterals) const; size_t processWordsForVocabulary(const string& contextFile, bool addWordsFromLiterals); diff --git a/src/parser/WordsAndDocsFileParser.cpp b/src/parser/WordsAndDocsFileParser.cpp index 35bd955700..d5756e01d4 100644 --- a/src/parser/WordsAndDocsFileParser.cpp +++ b/src/parser/WordsAndDocsFileParser.cpp @@ -11,30 +11,32 @@ #include "util/StringUtils.h" // _____________________________________________________________________________ -WordsAndDocsFileParser::WordsAndDocsFileParser(const string& wordsOrDocsFile, - LocaleManager localeManager) - : in_(wordsOrDocsFile), localeManager_(std::move(localeManager)) {} +WordsAndDocsFileParser::WordsAndDocsFileParser( + const string& wordsOrDocsFile, const LocaleManager& localeManager) + : in_(wordsOrDocsFile), localeManager_(localeManager) {} // _____________________________________________________________________________ ad_utility::InputRangeFromGet::Storage WordsFileParser::get() { WordsFileLine line; string l; - if (!std::getline(in_, l)) { + if (!std::getline(getInputStream(), l)) { return std::nullopt; }; - size_t i = l.find('\t'); + std::string_view lineView(l); + size_t i = lineView.find('\t'); assert(i != string::npos); size_t j = i + 2; - assert(j + 3 < l.size()); - size_t k = l.find('\t', j + 2); + assert(j + 3 < lineView.size()); + size_t k = lineView.find('\t', j + 2); assert(k != string::npos); - line.isEntity_ = (l[i + 1] == '1'); + line.isEntity_ = (lineView[i + 1] == '1'); line.word_ = - (line.isEntity_ ? l.substr(0, i) - : localeManager_.getLowercaseUtf8(l.substr(0, i))); + (line.isEntity_ + ? lineView.substr(0, i) + : getLocaleManager().getLowercaseUtf8(lineView.substr(0, i))); line.contextId_ = - TextRecordIndex::make(atol(l.substr(j + 1, k - j - 1).c_str())); - line.score_ = static_cast(atol(l.substr(k + 1).c_str())); + TextRecordIndex::make(atol(lineView.substr(j + 1, k - j - 1).data())); + line.score_ = static_cast(atol(lineView.substr(k + 1).data())); #ifndef NDEBUG if (lastCId_ > line.contextId_) { AD_THROW("ContextFile has to be sorted by context Id."); @@ -48,7 +50,7 @@ ad_utility::InputRangeFromGet::Storage WordsFileParser::get() { ad_utility::InputRangeFromGet::Storage DocsFileParser::get() { DocsFileLine line; string l; - if (!std::getline(in_, l)) { + if (!std::getline(getInputStream(), l)) { return std::nullopt; }; size_t i = l.find('\t'); diff --git a/src/parser/WordsAndDocsFileParser.h b/src/parser/WordsAndDocsFileParser.h index 8365783a19..ebacfc2ebe 100644 --- a/src/parser/WordsAndDocsFileParser.h +++ b/src/parser/WordsAndDocsFileParser.h @@ -101,7 +101,7 @@ struct DocsFileLine { // The `Find` function returns the next delimiter in `text` after the given // `pos` or an empty substring if there is no next delimiter. struct LiteralsTokenizationDelimiter { - absl::string_view Find(absl::string_view text, size_t pos) { + absl::string_view Find(absl::string_view text, size_t pos) const { auto isWordChar = [](char c) -> bool { return std::isalnum(c); }; auto found = std::find_if_not(text.begin() + pos, text.end(), isWordChar); if (found == text.end()) return text.substr(text.size()); @@ -138,12 +138,16 @@ inline auto tokenizeAndNormalizeText(std::string_view text, class WordsAndDocsFileParser { public: explicit WordsAndDocsFileParser(const string& wordsOrDocsFile, - LocaleManager localeManager); + const LocaleManager& localeManager); explicit WordsAndDocsFileParser(const WordsAndDocsFileParser& other) = delete; WordsAndDocsFileParser& operator=(const WordsAndDocsFileParser& other) = delete; protected: + std::ifstream& getInputStream() { return in_; } + const LocaleManager& getLocaleManager() { return localeManager_; } + + private: std::ifstream in_; LocaleManager localeManager_; }; @@ -165,8 +169,8 @@ class WordsFileParser : public WordsAndDocsFileParser, using WordsAndDocsFileParser::WordsAndDocsFileParser; Storage get() override; - private: #ifndef NDEBUG + private: // Only used for sanity checks in debug builds TextRecordIndex lastCId_ = TextRecordIndex::make(0); #endif