From e913c6cf89486ec10afa0d53c8b4032f0ed1b441 Mon Sep 17 00:00:00 2001 From: RobinTF <83676088+RobinTF@users.noreply.github.com> Date: Wed, 5 Feb 2025 17:46:04 +0100 Subject: [PATCH 01/13] Update ctre dependency --- CMakeLists.txt | 2 +- src/util/StringUtilsImpl.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 3679de4c51..0d541d8a32 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -231,7 +231,7 @@ set(CMAKE_C_FLAGS_DEBUG "${CMAKE_C_FLAGS_DEBUG}") FetchContent_Declare( ctre GIT_REPOSITORY https://github.com/hanickadot/compile-time-regular-expressions.git - GIT_TAG b3d7788b559e34d985c8530c3e0e7260b67505a6 # v3.8.1 + GIT_TAG eb9577aae3515d14e6c5564f9aeb046d2e7c1124 # v3.9.0 ) ################################ diff --git a/src/util/StringUtilsImpl.h b/src/util/StringUtilsImpl.h index 87db3aa645..1e677bf729 100644 --- a/src/util/StringUtilsImpl.h +++ b/src/util/StringUtilsImpl.h @@ -74,7 +74,7 @@ std::string insertThousandSeparator(const std::string_view str, "])(?\\d{4,})"}; auto parseIterator = std::begin(str); ql::ranges::for_each( - ctre::range(str), + ctre::search_all(str), [&parseIterator, &ostream, &insertSeparator](const auto& match) { /* The digit sequence, that must be transformed. Note: The string view From 2ac535401083d1d8638db6616ad4394aa5c41cfb Mon Sep 17 00:00:00 2001 From: RobinTF <83676088+RobinTF@users.noreply.github.com> Date: Wed, 5 Feb 2025 18:03:11 +0100 Subject: [PATCH 02/13] Implement unicode unescape before parsing --- src/parser/SparqlParserHelpers.cpp | 39 ++++++++++++++++++++++++++++-- src/parser/SparqlParserHelpers.h | 4 +++ 2 files changed, 41 insertions(+), 2 deletions(-) diff --git a/src/parser/SparqlParserHelpers.cpp b/src/parser/SparqlParserHelpers.cpp index ed7717f35b..9a9f462019 100644 --- a/src/parser/SparqlParserHelpers.cpp +++ b/src/parser/SparqlParserHelpers.cpp @@ -4,6 +4,11 @@ #include "SparqlParserHelpers.h" +#include + +#include +#include + #include "sparqlParser/generated/SparqlAutomaticLexer.h" namespace sparqlParserHelpers { @@ -13,7 +18,8 @@ using std::string; ParserAndVisitor::ParserAndVisitor( std::string input, SparqlQleverVisitor::DisableSomeChecksOnlyForTesting disableSomeChecks) - : input_{std::move(input)}, visitor_{{}, disableSomeChecks} { + : input_{unescapeUnicodeSequences(std::move(input))}, + visitor_{{}, disableSomeChecks} { // The default in ANTLR is to log all errors to the console and to continue // the parsing. We need to turn parse errors into exceptions instead to // propagate them to the user. @@ -27,7 +33,36 @@ ParserAndVisitor::ParserAndVisitor( ParserAndVisitor::ParserAndVisitor( std::string input, SparqlQleverVisitor::PrefixMap prefixes, SparqlQleverVisitor::DisableSomeChecksOnlyForTesting disableSomeChecks) - : ParserAndVisitor{std::move(input), disableSomeChecks} { + : ParserAndVisitor{unescapeUnicodeSequences(std::move(input)), + disableSomeChecks} { visitor_.setPrefixMapManually(std::move(prefixes)); } + +// _____________________________________________________________________________ +std::string ParserAndVisitor::unescapeUnicodeSequences(std::string input) { + std::u8string_view utf8View{reinterpret_cast(input.data()), + input.size()}; + std::string output; + size_t lastPos = 0; + + for (auto match : + ctre::search_all(utf8View)) { + output += input.substr(lastPos, match.data() - (utf8View.data() + lastPos)); + lastPos = match.data() + match.size() - utf8View.data(); + + auto hexValue = match.to_view(); + + UChar32 codePoint; + const char* startPointer = reinterpret_cast(hexValue.data()); + auto result = std::from_chars( + startPointer + 2, startPointer + hexValue.size(), codePoint, 16); + AD_CORRECTNESS_CHECK(result.ec == std::errc{}); + + icu::UnicodeString helper{codePoint}; + helper.toUTF8String(output); + } + + output += input.substr(lastPos); + return output; +} } // namespace sparqlParserHelpers diff --git a/src/parser/SparqlParserHelpers.h b/src/parser/SparqlParserHelpers.h index 5db09cc27e..55d597d0eb 100644 --- a/src/parser/SparqlParserHelpers.h +++ b/src/parser/SparqlParserHelpers.h @@ -35,6 +35,10 @@ struct ParserAndVisitor { ad_utility::antlr_utility::ThrowingErrorListener errorListener_{}; + // Unescapes unicode sequences like \U01234567 and \u0123 in the input string + // before beginning with actual parsing as the SPARQL standard mandates. + static std::string unescapeUnicodeSequences(std::string input); + public: SparqlAutomaticParser parser_{&tokens_}; SparqlQleverVisitor visitor_; From d425f27ba7fe0150490bf85c76f6d262589e1c60 Mon Sep 17 00:00:00 2001 From: RobinTF <83676088+RobinTF@users.noreply.github.com> Date: Wed, 5 Feb 2025 18:25:44 +0100 Subject: [PATCH 03/13] Implement check for partial surrogates --- src/parser/SparqlParserHelpers.cpp | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/parser/SparqlParserHelpers.cpp b/src/parser/SparqlParserHelpers.cpp index 9a9f462019..1035f87a91 100644 --- a/src/parser/SparqlParserHelpers.cpp +++ b/src/parser/SparqlParserHelpers.cpp @@ -58,6 +58,10 @@ std::string ParserAndVisitor::unescapeUnicodeSequences(std::string input) { startPointer + 2, startPointer + hexValue.size(), codePoint, 16); AD_CORRECTNESS_CHECK(result.ec == std::errc{}); + AD_CORRECTNESS_CHECK(codePoint < 0xD800 || codePoint > 0xDFFF, + "High or low surrogate escape sequence detected, " + "please use a valid Unicode code point instead."); + icu::UnicodeString helper{codePoint}; helper.toUTF8String(output); } From d3b533144bb0e545ea9027866d20fdcf92e2ff0a Mon Sep 17 00:00:00 2001 From: RobinTF <83676088+RobinTF@users.noreply.github.com> Date: Thu, 6 Feb 2025 10:21:04 +0100 Subject: [PATCH 04/13] Implement surrogate decoding correctly --- src/parser/SparqlParserHelpers.cpp | 33 +++++++++++++++++++++++++++--- 1 file changed, 30 insertions(+), 3 deletions(-) diff --git a/src/parser/SparqlParserHelpers.cpp b/src/parser/SparqlParserHelpers.cpp index 1035f87a91..c2e33f2faf 100644 --- a/src/parser/SparqlParserHelpers.cpp +++ b/src/parser/SparqlParserHelpers.cpp @@ -44,6 +44,7 @@ std::string ParserAndVisitor::unescapeUnicodeSequences(std::string input) { input.size()}; std::string output; size_t lastPos = 0; + UChar32 highSurrogate = '\0'; for (auto match : ctre::search_all(utf8View)) { @@ -58,14 +59,40 @@ std::string ParserAndVisitor::unescapeUnicodeSequences(std::string input) { startPointer + 2, startPointer + hexValue.size(), codePoint, 16); AD_CORRECTNESS_CHECK(result.ec == std::errc{}); - AD_CORRECTNESS_CHECK(codePoint < 0xD800 || codePoint > 0xDFFF, - "High or low surrogate escape sequence detected, " - "please use a valid Unicode code point instead."); + bool isFullCodePoint = hexValue.size() == 10; + + if (U16_IS_LEAD(codePoint)) { + AD_CORRECTNESS_CHECK(!isFullCodePoint, + "Surrogates should not be encoded " + "as full code points."); + AD_CORRECTNESS_CHECK(highSurrogate == '\0', + "A high surrogate cannot be " + "followed by another high " + "surrogate."); + highSurrogate = codePoint; + continue; + } else if (U16_IS_TRAIL(codePoint)) { + AD_CORRECTNESS_CHECK(!isFullCodePoint, + "Surrogates should not be encoded " + "as full code points."); + AD_CORRECTNESS_CHECK(highSurrogate != '\0', + "A low surrogate cannot " + "be the first surrogate."); + codePoint = U16_GET_SUPPLEMENTARY(highSurrogate, codePoint); + highSurrogate = '\0'; + } else { + AD_CORRECTNESS_CHECK(highSurrogate == '\0', + "A high surrogate cannot be " + "followed by a code point."); + } icu::UnicodeString helper{codePoint}; helper.toUTF8String(output); } + AD_CORRECTNESS_CHECK(highSurrogate == '\0', + "A high surrogate must be followed by a low surrogate."); + output += input.substr(lastPos); return output; } From fdb6c173c545223c78a3ed8ba855755a5725370b Mon Sep 17 00:00:00 2001 From: RobinTF <83676088+RobinTF@users.noreply.github.com> Date: Thu, 6 Feb 2025 10:37:13 +0100 Subject: [PATCH 05/13] Add user-friendly error messages --- src/parser/SparqlParserHelpers.cpp | 46 ++++++++++++++++++------------ 1 file changed, 28 insertions(+), 18 deletions(-) diff --git a/src/parser/SparqlParserHelpers.cpp b/src/parser/SparqlParserHelpers.cpp index c2e33f2faf..bf3690f837 100644 --- a/src/parser/SparqlParserHelpers.cpp +++ b/src/parser/SparqlParserHelpers.cpp @@ -43,12 +43,23 @@ std::string ParserAndVisitor::unescapeUnicodeSequences(std::string input) { std::u8string_view utf8View{reinterpret_cast(input.data()), input.size()}; std::string output; + size_t currentPos = 0; size_t lastPos = 0; UChar32 highSurrogate = '\0'; + auto throwError = [&input, ¤tPos](bool condition, + const std::string& message) { + if (!condition) { + throw InvalidSparqlQueryException{ + absl::StrCat("Error in unicode escape sequence in input: ", input, + " at position ", currentPos, ": ", message)}; + } + }; + for (auto match : ctre::search_all(utf8View)) { output += input.substr(lastPos, match.data() - (utf8View.data() + lastPos)); + currentPos = match.data() - utf8View.data(); lastPos = match.data() + match.size() - utf8View.data(); auto hexValue = match.to_view(); @@ -60,38 +71,37 @@ std::string ParserAndVisitor::unescapeUnicodeSequences(std::string input) { AD_CORRECTNESS_CHECK(result.ec == std::errc{}); bool isFullCodePoint = hexValue.size() == 10; + throwError( + hexValue.size() == 10 || hexValue.size() == 6, + "Unicode escape sequences must be either 8 or 4 characters long."); if (U16_IS_LEAD(codePoint)) { - AD_CORRECTNESS_CHECK(!isFullCodePoint, - "Surrogates should not be encoded " - "as full code points."); - AD_CORRECTNESS_CHECK(highSurrogate == '\0', - "A high surrogate cannot be " - "followed by another high " - "surrogate."); + throwError(!isFullCodePoint, + "Surrogates should not be encoded as full code points."); + throwError( + highSurrogate == '\0', + "A high surrogate cannot be followed by another high surrogate."); highSurrogate = codePoint; continue; } else if (U16_IS_TRAIL(codePoint)) { - AD_CORRECTNESS_CHECK(!isFullCodePoint, - "Surrogates should not be encoded " - "as full code points."); - AD_CORRECTNESS_CHECK(highSurrogate != '\0', - "A low surrogate cannot " - "be the first surrogate."); + throwError(!isFullCodePoint, + "Surrogates should not be encoded as full code points."); + throwError(highSurrogate != '\0', + "A low surrogate cannot be the first surrogate."); codePoint = U16_GET_SUPPLEMENTARY(highSurrogate, codePoint); highSurrogate = '\0'; } else { - AD_CORRECTNESS_CHECK(highSurrogate == '\0', - "A high surrogate cannot be " - "followed by a code point."); + throwError(highSurrogate == '\0', + "A high surrogate cannot be followed by a code point."); } icu::UnicodeString helper{codePoint}; helper.toUTF8String(output); } - AD_CORRECTNESS_CHECK(highSurrogate == '\0', - "A high surrogate must be followed by a low surrogate."); + currentPos = lastPos; + throwError(highSurrogate == '\0', + "A high surrogate must be followed by a low surrogate."); output += input.substr(lastPos); return output; From 1fd94cf57a4bfb47de94b4c3269c1a76c5787440 Mon Sep 17 00:00:00 2001 From: RobinTF <83676088+RobinTF@users.noreply.github.com> Date: Thu, 6 Feb 2025 12:04:52 +0100 Subject: [PATCH 06/13] Simplify code and add unit tests --- src/parser/SparqlParserHelpers.cpp | 18 +++---- test/SparqlParserTest.cpp | 79 ++++++++++++++++++++++++++++++ 2 files changed, 86 insertions(+), 11 deletions(-) diff --git a/src/parser/SparqlParserHelpers.cpp b/src/parser/SparqlParserHelpers.cpp index bf3690f837..82d2b3c608 100644 --- a/src/parser/SparqlParserHelpers.cpp +++ b/src/parser/SparqlParserHelpers.cpp @@ -43,23 +43,19 @@ std::string ParserAndVisitor::unescapeUnicodeSequences(std::string input) { std::u8string_view utf8View{reinterpret_cast(input.data()), input.size()}; std::string output; - size_t currentPos = 0; size_t lastPos = 0; UChar32 highSurrogate = '\0'; - auto throwError = [&input, ¤tPos](bool condition, - const std::string& message) { + auto throwError = [](bool condition, const std::string& message) { if (!condition) { throw InvalidSparqlQueryException{ - absl::StrCat("Error in unicode escape sequence in input: ", input, - " at position ", currentPos, ": ", message)}; + absl::StrCat("Error in unicode escape sequence. ", message)}; } }; for (auto match : ctre::search_all(utf8View)) { output += input.substr(lastPos, match.data() - (utf8View.data() + lastPos)); - currentPos = match.data() - utf8View.data(); lastPos = match.data() + match.size() - utf8View.data(); auto hexValue = match.to_view(); @@ -71,9 +67,9 @@ std::string ParserAndVisitor::unescapeUnicodeSequences(std::string input) { AD_CORRECTNESS_CHECK(result.ec == std::errc{}); bool isFullCodePoint = hexValue.size() == 10; - throwError( + AD_CORRECTNESS_CHECK( hexValue.size() == 10 || hexValue.size() == 6, - "Unicode escape sequences must be either 8 or 4 characters long."); + "Unicode escape sequences must be either 8 or 4 characters long."); if (U16_IS_LEAD(codePoint)) { throwError(!isFullCodePoint, @@ -91,15 +87,15 @@ std::string ParserAndVisitor::unescapeUnicodeSequences(std::string input) { codePoint = U16_GET_SUPPLEMENTARY(highSurrogate, codePoint); highSurrogate = '\0'; } else { - throwError(highSurrogate == '\0', - "A high surrogate cannot be followed by a code point."); + throwError( + highSurrogate == '\0', + "A high surrogate cannot be followed by a regular code point."); } icu::UnicodeString helper{codePoint}; helper.toUTF8String(output); } - currentPos = lastPos; throwError(highSurrogate == '\0', "A high surrogate must be followed by a low surrogate."); diff --git a/test/SparqlParserTest.cpp b/test/SparqlParserTest.cpp index d3f2352b1e..6b23526578 100644 --- a/test/SparqlParserTest.cpp +++ b/test/SparqlParserTest.cpp @@ -1246,3 +1246,82 @@ TEST(ParserTest, LanguageFilterPostProcessing) { triples[2]); } } + +namespace { +std::string getFirstTriple(const ParsedQuery& q) { + return q._rootGraphPattern._graphPatterns.at(0) + .getBasic() + ._triples.at(0) + .asString(); +} +} // namespace + +TEST(ParserTest, HandlesBasicUnicodeEscapeSequences) { + ParsedQuery q1 = SparqlParser::parseQuery( + R"(SELECT * WHERE { ?s '\u0080\u07FF\u0800\u0FFF\u1000\uCFFF\uD000\uD7FF\uE000\uFFFD\U00010000\U0003FFFD\U00040000\U000FFFFD\U00100000\U0010FFFD'})"); + EXPECT_EQ(getFirstTriple(q1), + "{s: ?s, p: , o: " + "\"\u0080\u07FF\u0800\u0FFF\u1000\uCFFF\uD000\uD7FF\uE000\uFFFD" + "\U00010000\U0003FFFD\U00040000\U000FFFFD\U00100000\U0010FFFD\"}"); + + ParsedQuery q2 = + SparqlParser::parseQuery(R"(SELECT * WHERE { ?s ?p "\U0001f46a" . })"); + EXPECT_EQ(getFirstTriple(q2), "{s: ?s, p: ?p, o: \"\U0001f46a\"}"); + + ParsedQuery q3 = SparqlParser::parseQuery( + R"(PREFIX \u03B1: SELECT * WHERE { ?s ?p α\u003Aba . })"); + EXPECT_EQ(getFirstTriple(q3), + "{s: ?s, p: ?p, o: }"); + + ParsedQuery q4 = SparqlParser::parseQuery( + R"(SELECT * WHERE { ?p\u00201. })"); + EXPECT_EQ(getFirstTriple(q4), + "{s: , p: ?p, o: 1}"); +} + +TEST(ParserTest, HandlesSurrogatesCorrectly) { + using SP = SparqlParser; + using ::testing::HasSubstr; + ParsedQuery q = SP::parseQuery( + R"(SELECT * WHERE { "\uD83E\udD37\uD83C\uDFFD\u200D\u2642\uFE0F" ?p 1. })"); + EXPECT_EQ(getFirstTriple(q), "{s: \"🤷🏽‍♂️\", p: ?p, o: 1}"); + + AD_EXPECT_THROW_WITH_MESSAGE_AND_TYPE( + SP::parseQuery(R"(SELECT * WHERE { ?s ?p '\uD800' })"), + HasSubstr("A high surrogate must be followed by a low surrogate."), + InvalidSparqlQueryException); + + AD_EXPECT_THROW_WITH_MESSAGE_AND_TYPE( + SP::parseQuery(R"(SELECT * WHERE { ?s ?p '\U0000D800' })"), + HasSubstr("Surrogates should not be encoded as full code points."), + InvalidSparqlQueryException); + + AD_EXPECT_THROW_WITH_MESSAGE_AND_TYPE( + SP::parseQuery(R"(SELECT * WHERE { ?s ?p '\uD800\uD800' })"), + HasSubstr( + "A high surrogate cannot be followed by another high surrogate."), + InvalidSparqlQueryException); + + AD_EXPECT_THROW_WITH_MESSAGE_AND_TYPE( + SP::parseQuery(R"(SELECT * WHERE { ?s ?p '\U0000DFFD' })"), + HasSubstr("Surrogates should not be encoded as full code points."), + InvalidSparqlQueryException); + + AD_EXPECT_THROW_WITH_MESSAGE_AND_TYPE( + SP::parseQuery(R"(SELECT * WHERE { ?s ?p '\uDFFD' })"), + HasSubstr("A low surrogate cannot be the first surrogate."), + InvalidSparqlQueryException); + + AD_EXPECT_THROW_WITH_MESSAGE_AND_TYPE( + SP::parseQuery(R"(SELECT * WHERE { ?s ?p '\uD800\u0020' })"), + HasSubstr("A high surrogate cannot be followed by a regular code point."), + InvalidSparqlQueryException); + + // Note: We don't allow mixing escaped and unescape surrogates, that's just + // weird and the C++ compiler rightfully won't compile strings like these: + // SELECT * WHERE { ?s ?p '\\uD83C\uDFFD' } + // SELECT * WHERE { ?s ?p '\uD83C\\uDFFD' } + + // So writing unit tests for these cases is not possible without creating + // semi-invalid UTF-8 strings. +} From f42bdf1d519c2a27ea669b5732742e28577ac8df Mon Sep 17 00:00:00 2001 From: RobinTF <83676088+RobinTF@users.noreply.github.com> Date: Thu, 6 Feb 2025 12:15:17 +0100 Subject: [PATCH 07/13] Use regular ints --- src/parser/SparqlParserHelpers.cpp | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/src/parser/SparqlParserHelpers.cpp b/src/parser/SparqlParserHelpers.cpp index 82d2b3c608..b9ecae205d 100644 --- a/src/parser/SparqlParserHelpers.cpp +++ b/src/parser/SparqlParserHelpers.cpp @@ -44,7 +44,7 @@ std::string ParserAndVisitor::unescapeUnicodeSequences(std::string input) { input.size()}; std::string output; size_t lastPos = 0; - UChar32 highSurrogate = '\0'; + UChar32 highSurrogate = 0; auto throwError = [](bool condition, const std::string& message) { if (!condition) { @@ -75,20 +75,20 @@ std::string ParserAndVisitor::unescapeUnicodeSequences(std::string input) { throwError(!isFullCodePoint, "Surrogates should not be encoded as full code points."); throwError( - highSurrogate == '\0', + highSurrogate == 0, "A high surrogate cannot be followed by another high surrogate."); highSurrogate = codePoint; continue; } else if (U16_IS_TRAIL(codePoint)) { throwError(!isFullCodePoint, "Surrogates should not be encoded as full code points."); - throwError(highSurrogate != '\0', + throwError(highSurrogate != 0, "A low surrogate cannot be the first surrogate."); codePoint = U16_GET_SUPPLEMENTARY(highSurrogate, codePoint); - highSurrogate = '\0'; + highSurrogate = 0; } else { throwError( - highSurrogate == '\0', + highSurrogate == 0, "A high surrogate cannot be followed by a regular code point."); } @@ -96,7 +96,7 @@ std::string ParserAndVisitor::unescapeUnicodeSequences(std::string input) { helper.toUTF8String(output); } - throwError(highSurrogate == '\0', + throwError(highSurrogate == 0, "A high surrogate must be followed by a low surrogate."); output += input.substr(lastPos); From 947ad6a22be5e72c37b1fa3fa390bb157fa3a65e Mon Sep 17 00:00:00 2001 From: RobinTF <83676088+RobinTF@users.noreply.github.com> Date: Thu, 6 Feb 2025 12:18:58 +0100 Subject: [PATCH 08/13] Don't escape twice --- src/parser/SparqlParserHelpers.cpp | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/parser/SparqlParserHelpers.cpp b/src/parser/SparqlParserHelpers.cpp index b9ecae205d..b6e7b7c96d 100644 --- a/src/parser/SparqlParserHelpers.cpp +++ b/src/parser/SparqlParserHelpers.cpp @@ -33,8 +33,7 @@ ParserAndVisitor::ParserAndVisitor( ParserAndVisitor::ParserAndVisitor( std::string input, SparqlQleverVisitor::PrefixMap prefixes, SparqlQleverVisitor::DisableSomeChecksOnlyForTesting disableSomeChecks) - : ParserAndVisitor{unescapeUnicodeSequences(std::move(input)), - disableSomeChecks} { + : ParserAndVisitor{std::move(input), disableSomeChecks} { visitor_.setPrefixMapManually(std::move(prefixes)); } From f1c8834f06e7b9b0d32250da039e00f4edeb2e41 Mon Sep 17 00:00:00 2001 From: RobinTF <83676088+RobinTF@users.noreply.github.com> Date: Thu, 6 Feb 2025 15:08:48 +0100 Subject: [PATCH 09/13] Address sonarcloud issues --- src/parser/SparqlParserHelpers.cpp | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/src/parser/SparqlParserHelpers.cpp b/src/parser/SparqlParserHelpers.cpp index b6e7b7c96d..614d2d42be 100644 --- a/src/parser/SparqlParserHelpers.cpp +++ b/src/parser/SparqlParserHelpers.cpp @@ -39,7 +39,7 @@ ParserAndVisitor::ParserAndVisitor( // _____________________________________________________________________________ std::string ParserAndVisitor::unescapeUnicodeSequences(std::string input) { - std::u8string_view utf8View{reinterpret_cast(input.data()), + std::u8string_view utf8View{std::bit_cast(input.data()), input.size()}; std::string output; size_t lastPos = 0; @@ -52,6 +52,8 @@ std::string ParserAndVisitor::unescapeUnicodeSequences(std::string input) { } }; + constexpr size_t prefixLength = std::string_view{"\\U"}.size(); + for (auto match : ctre::search_all(utf8View)) { output += input.substr(lastPos, match.data() - (utf8View.data() + lastPos)); @@ -60,14 +62,16 @@ std::string ParserAndVisitor::unescapeUnicodeSequences(std::string input) { auto hexValue = match.to_view(); UChar32 codePoint; - const char* startPointer = reinterpret_cast(hexValue.data()); - auto result = std::from_chars( - startPointer + 2, startPointer + hexValue.size(), codePoint, 16); + auto* startPointer = std::bit_cast(hexValue.data()); + auto result = + std::from_chars(startPointer + prefixLength, + startPointer + hexValue.size(), codePoint, 16); AD_CORRECTNESS_CHECK(result.ec == std::errc{}); bool isFullCodePoint = hexValue.size() == 10; AD_CORRECTNESS_CHECK( - hexValue.size() == 10 || hexValue.size() == 6, + hexValue.size() == 8 + prefixLength || + hexValue.size() == 4 + prefixLength, "Unicode escape sequences must be either 8 or 4 characters long."); if (U16_IS_LEAD(codePoint)) { From 8a9f7309c57472169dbf6eeb2c2835b4eac75a69 Mon Sep 17 00:00:00 2001 From: RobinTF <83676088+RobinTF@users.noreply.github.com> Date: Thu, 6 Feb 2025 17:33:00 +0100 Subject: [PATCH 10/13] Simplify code and avoid redundant copies --- src/parser/SparqlParserHelpers.cpp | 27 +++++++++++---------------- 1 file changed, 11 insertions(+), 16 deletions(-) diff --git a/src/parser/SparqlParserHelpers.cpp b/src/parser/SparqlParserHelpers.cpp index 614d2d42be..51f0ceb696 100644 --- a/src/parser/SparqlParserHelpers.cpp +++ b/src/parser/SparqlParserHelpers.cpp @@ -39,39 +39,34 @@ ParserAndVisitor::ParserAndVisitor( // _____________________________________________________________________________ std::string ParserAndVisitor::unescapeUnicodeSequences(std::string input) { - std::u8string_view utf8View{std::bit_cast(input.data()), - input.size()}; + std::string_view view{input}; std::string output; size_t lastPos = 0; UChar32 highSurrogate = 0; - auto throwError = [](bool condition, const std::string& message) { + auto throwError = [](bool condition, std::string_view message) { if (!condition) { throw InvalidSparqlQueryException{ absl::StrCat("Error in unicode escape sequence. ", message)}; } }; - constexpr size_t prefixLength = std::string_view{"\\U"}.size(); - for (auto match : - ctre::search_all(utf8View)) { - output += input.substr(lastPos, match.data() - (utf8View.data() + lastPos)); - lastPos = match.data() + match.size() - utf8View.data(); + ctre::search_all(view)) { + output += view.substr(lastPos, match.data() - (view.data() + lastPos)); + lastPos = match.data() + match.size() - view.data(); auto hexValue = match.to_view(); + hexValue.remove_prefix(std::string_view{"\\U"}.size()); UChar32 codePoint; - auto* startPointer = std::bit_cast(hexValue.data()); - auto result = - std::from_chars(startPointer + prefixLength, - startPointer + hexValue.size(), codePoint, 16); + auto result = std::from_chars( + hexValue.data(), hexValue.data() + hexValue.size(), codePoint, 16); AD_CORRECTNESS_CHECK(result.ec == std::errc{}); - bool isFullCodePoint = hexValue.size() == 10; + bool isFullCodePoint = hexValue.size() == 8; AD_CORRECTNESS_CHECK( - hexValue.size() == 8 + prefixLength || - hexValue.size() == 4 + prefixLength, + hexValue.size() == 8 || hexValue.size() == 4, "Unicode escape sequences must be either 8 or 4 characters long."); if (U16_IS_LEAD(codePoint)) { @@ -102,7 +97,7 @@ std::string ParserAndVisitor::unescapeUnicodeSequences(std::string input) { throwError(highSurrogate == 0, "A high surrogate must be followed by a low surrogate."); - output += input.substr(lastPos); + output += view.substr(lastPos); return output; } } // namespace sparqlParserHelpers From b44d77cc323d9ed7259d7171031fe0eb3a07cad9 Mon Sep 17 00:00:00 2001 From: RobinTF <83676088+RobinTF@users.noreply.github.com> Date: Thu, 6 Feb 2025 17:49:26 +0100 Subject: [PATCH 11/13] Fix bugs and optimize code --- src/parser/SparqlParserHelpers.cpp | 27 +++++++++++++++++++++++---- 1 file changed, 23 insertions(+), 4 deletions(-) diff --git a/src/parser/SparqlParserHelpers.cpp b/src/parser/SparqlParserHelpers.cpp index 51f0ceb696..541725ec39 100644 --- a/src/parser/SparqlParserHelpers.cpp +++ b/src/parser/SparqlParserHelpers.cpp @@ -41,6 +41,7 @@ ParserAndVisitor::ParserAndVisitor( std::string ParserAndVisitor::unescapeUnicodeSequences(std::string input) { std::string_view view{input}; std::string output; + bool noEscapeSequenceFound = true; size_t lastPos = 0; UChar32 highSurrogate = 0; @@ -51,9 +52,20 @@ std::string ParserAndVisitor::unescapeUnicodeSequences(std::string input) { } }; - for (auto match : + for (const auto& match : ctre::search_all(view)) { - output += view.substr(lastPos, match.data() - (view.data() + lastPos)); + if (noEscapeSequenceFound) { + output.reserve(input.size()); + noEscapeSequenceFound = false; + } + auto inBetweenPart = + view.substr(lastPos, match.data() - (view.data() + lastPos)); + + throwError( + inBetweenPart.empty() || highSurrogate == 0, + "A high surrogate must be directly followed by a low surrogate."); + + output += inBetweenPart; lastPos = match.data() + match.size() - view.data(); auto hexValue = match.to_view(); @@ -63,12 +75,14 @@ std::string ParserAndVisitor::unescapeUnicodeSequences(std::string input) { auto result = std::from_chars( hexValue.data(), hexValue.data() + hexValue.size(), codePoint, 16); AD_CORRECTNESS_CHECK(result.ec == std::errc{}); - - bool isFullCodePoint = hexValue.size() == 8; AD_CORRECTNESS_CHECK( hexValue.size() == 8 || hexValue.size() == 4, "Unicode escape sequences must be either 8 or 4 characters long."); + bool isFullCodePoint = hexValue.size() == 8; + + // See https://symbl.cc/en/unicode/blocks/high-surrogates/ for more + // information. if (U16_IS_LEAD(codePoint)) { throwError(!isFullCodePoint, "Surrogates should not be encoded as full code points."); @@ -94,6 +108,11 @@ std::string ParserAndVisitor::unescapeUnicodeSequences(std::string input) { helper.toUTF8String(output); } + // Avoid redundant copy if no escape sequences were found. + if (noEscapeSequenceFound) { + return input; + } + throwError(highSurrogate == 0, "A high surrogate must be followed by a low surrogate."); From 24d66202ed1d450e5c34436d6be09c7ee51e84c4 Mon Sep 17 00:00:00 2001 From: RobinTF <83676088+RobinTF@users.noreply.github.com> Date: Thu, 6 Feb 2025 18:07:24 +0100 Subject: [PATCH 12/13] Add additional unit tests --- test/SparqlParserTest.cpp | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/test/SparqlParserTest.cpp b/test/SparqlParserTest.cpp index 6b23526578..c232f2186a 100644 --- a/test/SparqlParserTest.cpp +++ b/test/SparqlParserTest.cpp @@ -23,6 +23,7 @@ auto lit = ad_utility::testing::tripleComponentLiteral; auto iri = ad_utility::testing::iri; } // namespace +// _____________________________________________________________________________ TEST(ParserTest, testParse) { { auto pq = SparqlParser::parseQuery("SELECT ?x WHERE {?x ?y ?z}"); @@ -698,6 +699,7 @@ TEST(ParserTest, testParse) { } } +// _____________________________________________________________________________ TEST(ParserTest, testFilterWithoutDot) { ParsedQuery pq = SparqlParser::parseQuery( "PREFIX fb: \n" @@ -726,6 +728,7 @@ TEST(ParserTest, testFilterWithoutDot) { ASSERT_EQ("(?1 != fb:m.018mts)", filters[2].expression_.getDescriptor()); } +// _____________________________________________________________________________ TEST(ParserTest, testExpandPrefixes) { ParsedQuery pq = SparqlParser::parseQuery( "PREFIX : \n" @@ -754,6 +757,7 @@ TEST(ParserTest, testExpandPrefixes) { ASSERT_EQ(0, pq._limitOffset._offset); } +// _____________________________________________________________________________ TEST(ParserTest, testLiterals) { ParsedQuery pq = SparqlParser::parseQuery( "PREFIX xsd: SELECT * WHERE { " @@ -772,6 +776,7 @@ TEST(ParserTest, testLiterals) { ASSERT_EQ(DateYearOrDuration{Date(2000, 1, 1, -1)}, c._triples[1].o_); } +// _____________________________________________________________________________ TEST(ParserTest, testSolutionModifiers) { { ParsedQuery pq = @@ -993,6 +998,7 @@ TEST(ParserTest, testSolutionModifiers) { } } +// _____________________________________________________________________________ TEST(ParserTest, testGroupByAndAlias) { ParsedQuery pq = SparqlParser::parseQuery( "SELECT (COUNT(?a) as ?count) WHERE { ?b ?a } GROUP BY ?b"); @@ -1008,6 +1014,7 @@ TEST(ParserTest, testGroupByAndAlias) { EXPECT_THAT(pq, m::GroupByVariables({Var{"?b"}})); } +// _____________________________________________________________________________ TEST(ParserTest, Bind) { ParsedQuery pq = SparqlParser::parseQuery("SELECT ?a WHERE { BIND (10 - 5 as ?a) . }"); @@ -1020,6 +1027,7 @@ TEST(ParserTest, Bind) { ASSERT_EQ(bind._expression.getDescriptor(), "10 - 5"); } +// _____________________________________________________________________________ TEST(ParserTest, Order) { { ParsedQuery pq = @@ -1101,6 +1109,7 @@ TEST(ParserTest, Order) { */ } +// _____________________________________________________________________________ TEST(ParserTest, Group) { { ParsedQuery pq = SparqlParser::parseQuery( @@ -1165,6 +1174,7 @@ TEST(ParserTest, Group) { } } +// _____________________________________________________________________________ TEST(ParserTest, LanguageFilterPostProcessing) { { ParsedQuery q = SparqlParser::parseQuery( @@ -1247,6 +1257,7 @@ TEST(ParserTest, LanguageFilterPostProcessing) { } } +// _____________________________________________________________________________ namespace { std::string getFirstTriple(const ParsedQuery& q) { return q._rootGraphPattern._graphPatterns.at(0) @@ -1256,6 +1267,7 @@ std::string getFirstTriple(const ParsedQuery& q) { } } // namespace +// _____________________________________________________________________________ TEST(ParserTest, HandlesBasicUnicodeEscapeSequences) { ParsedQuery q1 = SparqlParser::parseQuery( R"(SELECT * WHERE { ?s '\u0080\u07FF\u0800\u0FFF\u1000\uCFFF\uD000\uD7FF\uE000\uFFFD\U00010000\U0003FFFD\U00040000\U000FFFFD\U00100000\U0010FFFD'})"); @@ -1277,8 +1289,14 @@ TEST(ParserTest, HandlesBasicUnicodeEscapeSequences) { R"(SELECT * WHERE { ?p\u00201. })"); EXPECT_EQ(getFirstTriple(q4), "{s: , p: ?p, o: 1}"); + + // Ensure we don't double-unescape, \u sequences are not allowed in literals + EXPECT_THROW( + SparqlParser::parseQuery(R"(SELECT * WHERE { "\u005Cu2764" ?p 1. })"), + InvalidSparqlQueryException); } +// _____________________________________________________________________________ TEST(ParserTest, HandlesSurrogatesCorrectly) { using SP = SparqlParser; using ::testing::HasSubstr; @@ -1286,6 +1304,12 @@ TEST(ParserTest, HandlesSurrogatesCorrectly) { R"(SELECT * WHERE { "\uD83E\udD37\uD83C\uDFFD\u200D\u2642\uFE0F" ?p 1. })"); EXPECT_EQ(getFirstTriple(q), "{s: \"🤷🏽‍♂️\", p: ?p, o: 1}"); + AD_EXPECT_THROW_WITH_MESSAGE_AND_TYPE( + SP::parseQuery(R"(SELECT * WHERE { ?s ?p '\uD83C \uDFFD' })"), + HasSubstr( + "A high surrogate must be directly followed by a low surrogate."), + InvalidSparqlQueryException); + AD_EXPECT_THROW_WITH_MESSAGE_AND_TYPE( SP::parseQuery(R"(SELECT * WHERE { ?s ?p '\uD800' })"), HasSubstr("A high surrogate must be followed by a low surrogate."), From 777224de8c38a47e06818d1acb752c08250ade1d Mon Sep 17 00:00:00 2001 From: RobinTF <83676088+RobinTF@users.noreply.github.com> Date: Thu, 6 Feb 2025 18:18:59 +0100 Subject: [PATCH 13/13] Ignore URLs in codespell --- .codespellrc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.codespellrc b/.codespellrc index a1e8aa7c0f..0132eac027 100644 --- a/.codespellrc +++ b/.codespellrc @@ -3,6 +3,6 @@ skip = .git*,.codespellrc,*.pdf,generated check-hidden = true # Ignore mixedCase variables, lines with latin, lines with codespell-ignore pragma, etc -ignore-regex = \b([A-Z]*[a-z]+[A-Z][a-zA-Z]*)\b|.*(Lorem ipsum|eleifend|feugait|codespell-ignore).* +ignore-regex = \b([A-Z]*[a-z]+[A-Z][a-zA-Z]*)\b|.*(Lorem ipsum|eleifend|feugait|codespell-ignore).*|https?://\S+ # alph - is used frequently in tests, just ignore altogether ignore-words-list = ser,alph,inbetween,interm