From e913c6cf89486ec10afa0d53c8b4032f0ed1b441 Mon Sep 17 00:00:00 2001
From: RobinTF <83676088+RobinTF@users.noreply.github.com>
Date: Wed, 5 Feb 2025 17:46:04 +0100
Subject: [PATCH 01/13] Update ctre dependency

---
 CMakeLists.txt             | 2 +-
 src/util/StringUtilsImpl.h | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 3679de4c51..0d541d8a32 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -231,7 +231,7 @@ set(CMAKE_C_FLAGS_DEBUG "${CMAKE_C_FLAGS_DEBUG}")
 FetchContent_Declare(
         ctre
         GIT_REPOSITORY https://github.com/hanickadot/compile-time-regular-expressions.git
-        GIT_TAG b3d7788b559e34d985c8530c3e0e7260b67505a6 # v3.8.1
+        GIT_TAG eb9577aae3515d14e6c5564f9aeb046d2e7c1124 # v3.9.0
 )
 
 ################################
diff --git a/src/util/StringUtilsImpl.h b/src/util/StringUtilsImpl.h
index 87db3aa645..1e677bf729 100644
--- a/src/util/StringUtilsImpl.h
+++ b/src/util/StringUtilsImpl.h
@@ -74,7 +74,7 @@ std::string insertThousandSeparator(const std::string_view str,
       "])(?<digit>\\d{4,})"};
   auto parseIterator = std::begin(str);
   ql::ranges::for_each(
-      ctre::range<regexPatDigitSequence>(str),
+      ctre::search_all<regexPatDigitSequence>(str),
       [&parseIterator, &ostream, &insertSeparator](const auto& match) {
         /*
         The digit sequence, that must be transformed. Note: The string view

From 2ac535401083d1d8638db6616ad4394aa5c41cfb Mon Sep 17 00:00:00 2001
From: RobinTF <83676088+RobinTF@users.noreply.github.com>
Date: Wed, 5 Feb 2025 18:03:11 +0100
Subject: [PATCH 02/13] Implement unicode unescape before parsing

---
 src/parser/SparqlParserHelpers.cpp | 39 ++++++++++++++++++++++++++++--
 src/parser/SparqlParserHelpers.h   |  4 +++
 2 files changed, 41 insertions(+), 2 deletions(-)

diff --git a/src/parser/SparqlParserHelpers.cpp b/src/parser/SparqlParserHelpers.cpp
index ed7717f35b..9a9f462019 100644
--- a/src/parser/SparqlParserHelpers.cpp
+++ b/src/parser/SparqlParserHelpers.cpp
@@ -4,6 +4,11 @@
 
 #include "SparqlParserHelpers.h"
 
+#include <unicode/unistr.h>
+
+#include <charconv>
+#include <ctre-unicode.hpp>
+
 #include "sparqlParser/generated/SparqlAutomaticLexer.h"
 
 namespace sparqlParserHelpers {
@@ -13,7 +18,8 @@ using std::string;
 ParserAndVisitor::ParserAndVisitor(
     std::string input,
     SparqlQleverVisitor::DisableSomeChecksOnlyForTesting disableSomeChecks)
-    : input_{std::move(input)}, visitor_{{}, disableSomeChecks} {
+    : input_{unescapeUnicodeSequences(std::move(input))},
+      visitor_{{}, disableSomeChecks} {
   // The default in ANTLR is to log all errors to the console and to continue
   // the parsing. We need to turn parse errors into exceptions instead to
   // propagate them to the user.
@@ -27,7 +33,36 @@ ParserAndVisitor::ParserAndVisitor(
 ParserAndVisitor::ParserAndVisitor(
     std::string input, SparqlQleverVisitor::PrefixMap prefixes,
     SparqlQleverVisitor::DisableSomeChecksOnlyForTesting disableSomeChecks)
-    : ParserAndVisitor{std::move(input), disableSomeChecks} {
+    : ParserAndVisitor{unescapeUnicodeSequences(std::move(input)),
+                       disableSomeChecks} {
   visitor_.setPrefixMapManually(std::move(prefixes));
 }
+
+// _____________________________________________________________________________
+std::string ParserAndVisitor::unescapeUnicodeSequences(std::string input) {
+  std::u8string_view utf8View{reinterpret_cast<char8_t*>(input.data()),
+                              input.size()};
+  std::string output;
+  size_t lastPos = 0;
+
+  for (auto match :
+       ctre::search_all<R"(\\U[0-9A-Fa-f]{8}|\\u[0-9A-Fa-f]{4})">(utf8View)) {
+    output += input.substr(lastPos, match.data() - (utf8View.data() + lastPos));
+    lastPos = match.data() + match.size() - utf8View.data();
+
+    auto hexValue = match.to_view();
+
+    UChar32 codePoint;
+    const char* startPointer = reinterpret_cast<const char*>(hexValue.data());
+    auto result = std::from_chars(
+        startPointer + 2, startPointer + hexValue.size(), codePoint, 16);
+    AD_CORRECTNESS_CHECK(result.ec == std::errc{});
+
+    icu::UnicodeString helper{codePoint};
+    helper.toUTF8String(output);
+  }
+
+  output += input.substr(lastPos);
+  return output;
+}
 }  // namespace sparqlParserHelpers
diff --git a/src/parser/SparqlParserHelpers.h b/src/parser/SparqlParserHelpers.h
index 5db09cc27e..55d597d0eb 100644
--- a/src/parser/SparqlParserHelpers.h
+++ b/src/parser/SparqlParserHelpers.h
@@ -35,6 +35,10 @@ struct ParserAndVisitor {
   ad_utility::antlr_utility::ThrowingErrorListener<InvalidSparqlQueryException>
       errorListener_{};
 
+  // Unescapes unicode sequences like \U01234567 and \u0123 in the input string
+  // before beginning with actual parsing as the SPARQL standard mandates.
+  static std::string unescapeUnicodeSequences(std::string input);
+
  public:
   SparqlAutomaticParser parser_{&tokens_};
   SparqlQleverVisitor visitor_;

From d425f27ba7fe0150490bf85c76f6d262589e1c60 Mon Sep 17 00:00:00 2001
From: RobinTF <83676088+RobinTF@users.noreply.github.com>
Date: Wed, 5 Feb 2025 18:25:44 +0100
Subject: [PATCH 03/13] Implement check for partial surrogates

---
 src/parser/SparqlParserHelpers.cpp | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/src/parser/SparqlParserHelpers.cpp b/src/parser/SparqlParserHelpers.cpp
index 9a9f462019..1035f87a91 100644
--- a/src/parser/SparqlParserHelpers.cpp
+++ b/src/parser/SparqlParserHelpers.cpp
@@ -58,6 +58,10 @@ std::string ParserAndVisitor::unescapeUnicodeSequences(std::string input) {
         startPointer + 2, startPointer + hexValue.size(), codePoint, 16);
     AD_CORRECTNESS_CHECK(result.ec == std::errc{});
 
+    AD_CORRECTNESS_CHECK(codePoint < 0xD800 || codePoint > 0xDFFF,
+                         "High or low surrogate escape sequence detected, "
+                         "please use a valid Unicode code point instead.");
+
     icu::UnicodeString helper{codePoint};
     helper.toUTF8String(output);
   }

From d3b533144bb0e545ea9027866d20fdcf92e2ff0a Mon Sep 17 00:00:00 2001
From: RobinTF <83676088+RobinTF@users.noreply.github.com>
Date: Thu, 6 Feb 2025 10:21:04 +0100
Subject: [PATCH 04/13] Implement surrogate decoding correctly

---
 src/parser/SparqlParserHelpers.cpp | 33 +++++++++++++++++++++++++++---
 1 file changed, 30 insertions(+), 3 deletions(-)

diff --git a/src/parser/SparqlParserHelpers.cpp b/src/parser/SparqlParserHelpers.cpp
index 1035f87a91..c2e33f2faf 100644
--- a/src/parser/SparqlParserHelpers.cpp
+++ b/src/parser/SparqlParserHelpers.cpp
@@ -44,6 +44,7 @@ std::string ParserAndVisitor::unescapeUnicodeSequences(std::string input) {
                               input.size()};
   std::string output;
   size_t lastPos = 0;
+  UChar32 highSurrogate = '\0';
 
   for (auto match :
        ctre::search_all<R"(\\U[0-9A-Fa-f]{8}|\\u[0-9A-Fa-f]{4})">(utf8View)) {
@@ -58,14 +59,40 @@ std::string ParserAndVisitor::unescapeUnicodeSequences(std::string input) {
         startPointer + 2, startPointer + hexValue.size(), codePoint, 16);
     AD_CORRECTNESS_CHECK(result.ec == std::errc{});
 
-    AD_CORRECTNESS_CHECK(codePoint < 0xD800 || codePoint > 0xDFFF,
-                         "High or low surrogate escape sequence detected, "
-                         "please use a valid Unicode code point instead.");
+    bool isFullCodePoint = hexValue.size() == 10;
+
+    if (U16_IS_LEAD(codePoint)) {
+      AD_CORRECTNESS_CHECK(!isFullCodePoint,
+                           "Surrogates should not be encoded "
+                           "as full code points.");
+      AD_CORRECTNESS_CHECK(highSurrogate == '\0',
+                           "A high surrogate cannot be "
+                           "followed by another high "
+                           "surrogate.");
+      highSurrogate = codePoint;
+      continue;
+    } else if (U16_IS_TRAIL(codePoint)) {
+      AD_CORRECTNESS_CHECK(!isFullCodePoint,
+                           "Surrogates should not be encoded "
+                           "as full code points.");
+      AD_CORRECTNESS_CHECK(highSurrogate != '\0',
+                           "A low surrogate cannot "
+                           "be the first surrogate.");
+      codePoint = U16_GET_SUPPLEMENTARY(highSurrogate, codePoint);
+      highSurrogate = '\0';
+    } else {
+      AD_CORRECTNESS_CHECK(highSurrogate == '\0',
+                           "A high surrogate cannot be "
+                           "followed by a code point.");
+    }
 
     icu::UnicodeString helper{codePoint};
     helper.toUTF8String(output);
   }
 
+  AD_CORRECTNESS_CHECK(highSurrogate == '\0',
+                       "A high surrogate must be followed by a low surrogate.");
+
   output += input.substr(lastPos);
   return output;
 }

From fdb6c173c545223c78a3ed8ba855755a5725370b Mon Sep 17 00:00:00 2001
From: RobinTF <83676088+RobinTF@users.noreply.github.com>
Date: Thu, 6 Feb 2025 10:37:13 +0100
Subject: [PATCH 05/13] Add user-friendly error messages

---
 src/parser/SparqlParserHelpers.cpp | 46 ++++++++++++++++++------------
 1 file changed, 28 insertions(+), 18 deletions(-)

diff --git a/src/parser/SparqlParserHelpers.cpp b/src/parser/SparqlParserHelpers.cpp
index c2e33f2faf..bf3690f837 100644
--- a/src/parser/SparqlParserHelpers.cpp
+++ b/src/parser/SparqlParserHelpers.cpp
@@ -43,12 +43,23 @@ std::string ParserAndVisitor::unescapeUnicodeSequences(std::string input) {
   std::u8string_view utf8View{reinterpret_cast<char8_t*>(input.data()),
                               input.size()};
   std::string output;
+  size_t currentPos = 0;
   size_t lastPos = 0;
   UChar32 highSurrogate = '\0';
 
+  auto throwError = [&input, &currentPos](bool condition,
+                                          const std::string& message) {
+    if (!condition) {
+      throw InvalidSparqlQueryException{
+          absl::StrCat("Error in unicode escape sequence in input: ", input,
+                       " at position ", currentPos, ": ", message)};
+    }
+  };
+
   for (auto match :
        ctre::search_all<R"(\\U[0-9A-Fa-f]{8}|\\u[0-9A-Fa-f]{4})">(utf8View)) {
     output += input.substr(lastPos, match.data() - (utf8View.data() + lastPos));
+    currentPos = match.data() - utf8View.data();
     lastPos = match.data() + match.size() - utf8View.data();
 
     auto hexValue = match.to_view();
@@ -60,38 +71,37 @@ std::string ParserAndVisitor::unescapeUnicodeSequences(std::string input) {
     AD_CORRECTNESS_CHECK(result.ec == std::errc{});
 
     bool isFullCodePoint = hexValue.size() == 10;
+    throwError(
+        hexValue.size() == 10 || hexValue.size() == 6,
+        "Unicode escape sequences must be either 8 or 4  characters long.");
 
     if (U16_IS_LEAD(codePoint)) {
-      AD_CORRECTNESS_CHECK(!isFullCodePoint,
-                           "Surrogates should not be encoded "
-                           "as full code points.");
-      AD_CORRECTNESS_CHECK(highSurrogate == '\0',
-                           "A high surrogate cannot be "
-                           "followed by another high "
-                           "surrogate.");
+      throwError(!isFullCodePoint,
+                 "Surrogates should not be encoded as full code points.");
+      throwError(
+          highSurrogate == '\0',
+          "A high surrogate cannot be followed by another high surrogate.");
       highSurrogate = codePoint;
       continue;
     } else if (U16_IS_TRAIL(codePoint)) {
-      AD_CORRECTNESS_CHECK(!isFullCodePoint,
-                           "Surrogates should not be encoded "
-                           "as full code points.");
-      AD_CORRECTNESS_CHECK(highSurrogate != '\0',
-                           "A low surrogate cannot "
-                           "be the first surrogate.");
+      throwError(!isFullCodePoint,
+                 "Surrogates should not be encoded as full code points.");
+      throwError(highSurrogate != '\0',
+                 "A low surrogate cannot be the first surrogate.");
       codePoint = U16_GET_SUPPLEMENTARY(highSurrogate, codePoint);
       highSurrogate = '\0';
     } else {
-      AD_CORRECTNESS_CHECK(highSurrogate == '\0',
-                           "A high surrogate cannot be "
-                           "followed by a code point.");
+      throwError(highSurrogate == '\0',
+                 "A high surrogate cannot be followed by a code point.");
     }
 
     icu::UnicodeString helper{codePoint};
     helper.toUTF8String(output);
   }
 
-  AD_CORRECTNESS_CHECK(highSurrogate == '\0',
-                       "A high surrogate must be followed by a low surrogate.");
+  currentPos = lastPos;
+  throwError(highSurrogate == '\0',
+             "A high surrogate must be followed by a low surrogate.");
 
   output += input.substr(lastPos);
   return output;

From 1fd94cf57a4bfb47de94b4c3269c1a76c5787440 Mon Sep 17 00:00:00 2001
From: RobinTF <83676088+RobinTF@users.noreply.github.com>
Date: Thu, 6 Feb 2025 12:04:52 +0100
Subject: [PATCH 06/13] Simplify code and add unit tests

---
 src/parser/SparqlParserHelpers.cpp | 18 +++----
 test/SparqlParserTest.cpp          | 79 ++++++++++++++++++++++++++++++
 2 files changed, 86 insertions(+), 11 deletions(-)

diff --git a/src/parser/SparqlParserHelpers.cpp b/src/parser/SparqlParserHelpers.cpp
index bf3690f837..82d2b3c608 100644
--- a/src/parser/SparqlParserHelpers.cpp
+++ b/src/parser/SparqlParserHelpers.cpp
@@ -43,23 +43,19 @@ std::string ParserAndVisitor::unescapeUnicodeSequences(std::string input) {
   std::u8string_view utf8View{reinterpret_cast<char8_t*>(input.data()),
                               input.size()};
   std::string output;
-  size_t currentPos = 0;
   size_t lastPos = 0;
   UChar32 highSurrogate = '\0';
 
-  auto throwError = [&input, &currentPos](bool condition,
-                                          const std::string& message) {
+  auto throwError = [](bool condition, const std::string& message) {
     if (!condition) {
       throw InvalidSparqlQueryException{
-          absl::StrCat("Error in unicode escape sequence in input: ", input,
-                       " at position ", currentPos, ": ", message)};
+          absl::StrCat("Error in unicode escape sequence. ", message)};
     }
   };
 
   for (auto match :
        ctre::search_all<R"(\\U[0-9A-Fa-f]{8}|\\u[0-9A-Fa-f]{4})">(utf8View)) {
     output += input.substr(lastPos, match.data() - (utf8View.data() + lastPos));
-    currentPos = match.data() - utf8View.data();
     lastPos = match.data() + match.size() - utf8View.data();
 
     auto hexValue = match.to_view();
@@ -71,9 +67,9 @@ std::string ParserAndVisitor::unescapeUnicodeSequences(std::string input) {
     AD_CORRECTNESS_CHECK(result.ec == std::errc{});
 
     bool isFullCodePoint = hexValue.size() == 10;
-    throwError(
+    AD_CORRECTNESS_CHECK(
         hexValue.size() == 10 || hexValue.size() == 6,
-        "Unicode escape sequences must be either 8 or 4  characters long.");
+        "Unicode escape sequences must be either 8 or 4 characters long.");
 
     if (U16_IS_LEAD(codePoint)) {
       throwError(!isFullCodePoint,
@@ -91,15 +87,15 @@ std::string ParserAndVisitor::unescapeUnicodeSequences(std::string input) {
       codePoint = U16_GET_SUPPLEMENTARY(highSurrogate, codePoint);
       highSurrogate = '\0';
     } else {
-      throwError(highSurrogate == '\0',
-                 "A high surrogate cannot be followed by a code point.");
+      throwError(
+          highSurrogate == '\0',
+          "A high surrogate cannot be followed by a regular code point.");
     }
 
     icu::UnicodeString helper{codePoint};
     helper.toUTF8String(output);
   }
 
-  currentPos = lastPos;
   throwError(highSurrogate == '\0',
              "A high surrogate must be followed by a low surrogate.");
 
diff --git a/test/SparqlParserTest.cpp b/test/SparqlParserTest.cpp
index d3f2352b1e..6b23526578 100644
--- a/test/SparqlParserTest.cpp
+++ b/test/SparqlParserTest.cpp
@@ -1246,3 +1246,82 @@ TEST(ParserTest, LanguageFilterPostProcessing) {
         triples[2]);
   }
 }
+
+namespace {
+std::string getFirstTriple(const ParsedQuery& q) {
+  return q._rootGraphPattern._graphPatterns.at(0)
+      .getBasic()
+      ._triples.at(0)
+      .asString();
+}
+}  // namespace
+
+TEST(ParserTest, HandlesBasicUnicodeEscapeSequences) {
+  ParsedQuery q1 = SparqlParser::parseQuery(
+      R"(SELECT * WHERE { ?s <http://a.example/p1> '\u0080\u07FF\u0800\u0FFF\u1000\uCFFF\uD000\uD7FF\uE000\uFFFD\U00010000\U0003FFFD\U00040000\U000FFFFD\U00100000\U0010FFFD'})");
+  EXPECT_EQ(getFirstTriple(q1),
+            "{s: ?s, p: <http://a.example/p1>, o: "
+            "\"\u0080\u07FF\u0800\u0FFF\u1000\uCFFF\uD000\uD7FF\uE000\uFFFD"
+            "\U00010000\U0003FFFD\U00040000\U000FFFFD\U00100000\U0010FFFD\"}");
+
+  ParsedQuery q2 =
+      SparqlParser::parseQuery(R"(SELECT * WHERE { ?s ?p "\U0001f46a" . })");
+  EXPECT_EQ(getFirstTriple(q2), "{s: ?s, p: ?p, o: \"\U0001f46a\"}");
+
+  ParsedQuery q3 = SparqlParser::parseQuery(
+      R"(PREFIX \u03B1: <http://example.com/\u00E9fg> SELECT * WHERE { ?s ?p α\u003Aba . })");
+  EXPECT_EQ(getFirstTriple(q3),
+            "{s: ?s, p: ?p, o: <http://example.com/éfgba>}");
+
+  ParsedQuery q4 = SparqlParser::parseQuery(
+      R"(SELECT * WHERE { <http://example.com/\U0001F937\U0001F3FD\u200D\U00002642\ufe0F> ?p\u00201. })");
+  EXPECT_EQ(getFirstTriple(q4),
+            "{s: <http://example.com/🤷🏽‍♂️>, p: ?p, o: 1}");
+}
+
+TEST(ParserTest, HandlesSurrogatesCorrectly) {
+  using SP = SparqlParser;
+  using ::testing::HasSubstr;
+  ParsedQuery q = SP::parseQuery(
+      R"(SELECT * WHERE { "\uD83E\udD37\uD83C\uDFFD\u200D\u2642\uFE0F" ?p 1. })");
+  EXPECT_EQ(getFirstTriple(q), "{s: \"🤷🏽‍♂️\", p: ?p, o: 1}");
+
+  AD_EXPECT_THROW_WITH_MESSAGE_AND_TYPE(
+      SP::parseQuery(R"(SELECT * WHERE { ?s ?p '\uD800' })"),
+      HasSubstr("A high surrogate must be followed by a low surrogate."),
+      InvalidSparqlQueryException);
+
+  AD_EXPECT_THROW_WITH_MESSAGE_AND_TYPE(
+      SP::parseQuery(R"(SELECT * WHERE { ?s ?p '\U0000D800' })"),
+      HasSubstr("Surrogates should not be encoded as full code points."),
+      InvalidSparqlQueryException);
+
+  AD_EXPECT_THROW_WITH_MESSAGE_AND_TYPE(
+      SP::parseQuery(R"(SELECT * WHERE { ?s ?p '\uD800\uD800' })"),
+      HasSubstr(
+          "A high surrogate cannot be followed by another high surrogate."),
+      InvalidSparqlQueryException);
+
+  AD_EXPECT_THROW_WITH_MESSAGE_AND_TYPE(
+      SP::parseQuery(R"(SELECT * WHERE { ?s ?p '\U0000DFFD' })"),
+      HasSubstr("Surrogates should not be encoded as full code points."),
+      InvalidSparqlQueryException);
+
+  AD_EXPECT_THROW_WITH_MESSAGE_AND_TYPE(
+      SP::parseQuery(R"(SELECT * WHERE { ?s ?p '\uDFFD' })"),
+      HasSubstr("A low surrogate cannot be the first surrogate."),
+      InvalidSparqlQueryException);
+
+  AD_EXPECT_THROW_WITH_MESSAGE_AND_TYPE(
+      SP::parseQuery(R"(SELECT * WHERE { ?s ?p '\uD800\u0020' })"),
+      HasSubstr("A high surrogate cannot be followed by a regular code point."),
+      InvalidSparqlQueryException);
+
+  // Note: We don't allow mixing escaped and unescape surrogates, that's just
+  // weird and the C++ compiler rightfully won't compile strings like these:
+  // SELECT * WHERE { ?s ?p '\\uD83C\uDFFD' }
+  // SELECT * WHERE { ?s ?p '\uD83C\\uDFFD' }
+
+  // So writing unit tests for these cases is not possible without creating
+  // semi-invalid UTF-8 strings.
+}

From f42bdf1d519c2a27ea669b5732742e28577ac8df Mon Sep 17 00:00:00 2001
From: RobinTF <83676088+RobinTF@users.noreply.github.com>
Date: Thu, 6 Feb 2025 12:15:17 +0100
Subject: [PATCH 07/13] Use regular ints

---
 src/parser/SparqlParserHelpers.cpp | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/src/parser/SparqlParserHelpers.cpp b/src/parser/SparqlParserHelpers.cpp
index 82d2b3c608..b9ecae205d 100644
--- a/src/parser/SparqlParserHelpers.cpp
+++ b/src/parser/SparqlParserHelpers.cpp
@@ -44,7 +44,7 @@ std::string ParserAndVisitor::unescapeUnicodeSequences(std::string input) {
                               input.size()};
   std::string output;
   size_t lastPos = 0;
-  UChar32 highSurrogate = '\0';
+  UChar32 highSurrogate = 0;
 
   auto throwError = [](bool condition, const std::string& message) {
     if (!condition) {
@@ -75,20 +75,20 @@ std::string ParserAndVisitor::unescapeUnicodeSequences(std::string input) {
       throwError(!isFullCodePoint,
                  "Surrogates should not be encoded as full code points.");
       throwError(
-          highSurrogate == '\0',
+          highSurrogate == 0,
           "A high surrogate cannot be followed by another high surrogate.");
       highSurrogate = codePoint;
       continue;
     } else if (U16_IS_TRAIL(codePoint)) {
       throwError(!isFullCodePoint,
                  "Surrogates should not be encoded as full code points.");
-      throwError(highSurrogate != '\0',
+      throwError(highSurrogate != 0,
                  "A low surrogate cannot be the first surrogate.");
       codePoint = U16_GET_SUPPLEMENTARY(highSurrogate, codePoint);
-      highSurrogate = '\0';
+      highSurrogate = 0;
     } else {
       throwError(
-          highSurrogate == '\0',
+          highSurrogate == 0,
           "A high surrogate cannot be followed by a regular code point.");
     }
 
@@ -96,7 +96,7 @@ std::string ParserAndVisitor::unescapeUnicodeSequences(std::string input) {
     helper.toUTF8String(output);
   }
 
-  throwError(highSurrogate == '\0',
+  throwError(highSurrogate == 0,
              "A high surrogate must be followed by a low surrogate.");
 
   output += input.substr(lastPos);

From 947ad6a22be5e72c37b1fa3fa390bb157fa3a65e Mon Sep 17 00:00:00 2001
From: RobinTF <83676088+RobinTF@users.noreply.github.com>
Date: Thu, 6 Feb 2025 12:18:58 +0100
Subject: [PATCH 08/13] Don't escape twice

---
 src/parser/SparqlParserHelpers.cpp | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/src/parser/SparqlParserHelpers.cpp b/src/parser/SparqlParserHelpers.cpp
index b9ecae205d..b6e7b7c96d 100644
--- a/src/parser/SparqlParserHelpers.cpp
+++ b/src/parser/SparqlParserHelpers.cpp
@@ -33,8 +33,7 @@ ParserAndVisitor::ParserAndVisitor(
 ParserAndVisitor::ParserAndVisitor(
     std::string input, SparqlQleverVisitor::PrefixMap prefixes,
     SparqlQleverVisitor::DisableSomeChecksOnlyForTesting disableSomeChecks)
-    : ParserAndVisitor{unescapeUnicodeSequences(std::move(input)),
-                       disableSomeChecks} {
+    : ParserAndVisitor{std::move(input), disableSomeChecks} {
   visitor_.setPrefixMapManually(std::move(prefixes));
 }
 

From f1c8834f06e7b9b0d32250da039e00f4edeb2e41 Mon Sep 17 00:00:00 2001
From: RobinTF <83676088+RobinTF@users.noreply.github.com>
Date: Thu, 6 Feb 2025 15:08:48 +0100
Subject: [PATCH 09/13] Address sonarcloud issues

---
 src/parser/SparqlParserHelpers.cpp | 14 +++++++++-----
 1 file changed, 9 insertions(+), 5 deletions(-)

diff --git a/src/parser/SparqlParserHelpers.cpp b/src/parser/SparqlParserHelpers.cpp
index b6e7b7c96d..614d2d42be 100644
--- a/src/parser/SparqlParserHelpers.cpp
+++ b/src/parser/SparqlParserHelpers.cpp
@@ -39,7 +39,7 @@ ParserAndVisitor::ParserAndVisitor(
 
 // _____________________________________________________________________________
 std::string ParserAndVisitor::unescapeUnicodeSequences(std::string input) {
-  std::u8string_view utf8View{reinterpret_cast<char8_t*>(input.data()),
+  std::u8string_view utf8View{std::bit_cast<char8_t*>(input.data()),
                               input.size()};
   std::string output;
   size_t lastPos = 0;
@@ -52,6 +52,8 @@ std::string ParserAndVisitor::unescapeUnicodeSequences(std::string input) {
     }
   };
 
+  constexpr size_t prefixLength = std::string_view{"\\U"}.size();
+
   for (auto match :
        ctre::search_all<R"(\\U[0-9A-Fa-f]{8}|\\u[0-9A-Fa-f]{4})">(utf8View)) {
     output += input.substr(lastPos, match.data() - (utf8View.data() + lastPos));
@@ -60,14 +62,16 @@ std::string ParserAndVisitor::unescapeUnicodeSequences(std::string input) {
     auto hexValue = match.to_view();
 
     UChar32 codePoint;
-    const char* startPointer = reinterpret_cast<const char*>(hexValue.data());
-    auto result = std::from_chars(
-        startPointer + 2, startPointer + hexValue.size(), codePoint, 16);
+    auto* startPointer = std::bit_cast<const char*>(hexValue.data());
+    auto result =
+        std::from_chars(startPointer + prefixLength,
+                        startPointer + hexValue.size(), codePoint, 16);
     AD_CORRECTNESS_CHECK(result.ec == std::errc{});
 
     bool isFullCodePoint = hexValue.size() == 10;
     AD_CORRECTNESS_CHECK(
-        hexValue.size() == 10 || hexValue.size() == 6,
+        hexValue.size() == 8 + prefixLength ||
+            hexValue.size() == 4 + prefixLength,
         "Unicode escape sequences must be either 8 or 4 characters long.");
 
     if (U16_IS_LEAD(codePoint)) {

From 8a9f7309c57472169dbf6eeb2c2835b4eac75a69 Mon Sep 17 00:00:00 2001
From: RobinTF <83676088+RobinTF@users.noreply.github.com>
Date: Thu, 6 Feb 2025 17:33:00 +0100
Subject: [PATCH 10/13] Simplify code and avoid redundant copies

---
 src/parser/SparqlParserHelpers.cpp | 27 +++++++++++----------------
 1 file changed, 11 insertions(+), 16 deletions(-)

diff --git a/src/parser/SparqlParserHelpers.cpp b/src/parser/SparqlParserHelpers.cpp
index 614d2d42be..51f0ceb696 100644
--- a/src/parser/SparqlParserHelpers.cpp
+++ b/src/parser/SparqlParserHelpers.cpp
@@ -39,39 +39,34 @@ ParserAndVisitor::ParserAndVisitor(
 
 // _____________________________________________________________________________
 std::string ParserAndVisitor::unescapeUnicodeSequences(std::string input) {
-  std::u8string_view utf8View{std::bit_cast<char8_t*>(input.data()),
-                              input.size()};
+  std::string_view view{input};
   std::string output;
   size_t lastPos = 0;
   UChar32 highSurrogate = 0;
 
-  auto throwError = [](bool condition, const std::string& message) {
+  auto throwError = [](bool condition, std::string_view message) {
     if (!condition) {
       throw InvalidSparqlQueryException{
           absl::StrCat("Error in unicode escape sequence. ", message)};
     }
   };
 
-  constexpr size_t prefixLength = std::string_view{"\\U"}.size();
-
   for (auto match :
-       ctre::search_all<R"(\\U[0-9A-Fa-f]{8}|\\u[0-9A-Fa-f]{4})">(utf8View)) {
-    output += input.substr(lastPos, match.data() - (utf8View.data() + lastPos));
-    lastPos = match.data() + match.size() - utf8View.data();
+       ctre::search_all<R"(\\U[0-9A-Fa-f]{8}|\\u[0-9A-Fa-f]{4})">(view)) {
+    output += view.substr(lastPos, match.data() - (view.data() + lastPos));
+    lastPos = match.data() + match.size() - view.data();
 
     auto hexValue = match.to_view();
+    hexValue.remove_prefix(std::string_view{"\\U"}.size());
 
     UChar32 codePoint;
-    auto* startPointer = std::bit_cast<const char*>(hexValue.data());
-    auto result =
-        std::from_chars(startPointer + prefixLength,
-                        startPointer + hexValue.size(), codePoint, 16);
+    auto result = std::from_chars(
+        hexValue.data(), hexValue.data() + hexValue.size(), codePoint, 16);
     AD_CORRECTNESS_CHECK(result.ec == std::errc{});
 
-    bool isFullCodePoint = hexValue.size() == 10;
+    bool isFullCodePoint = hexValue.size() == 8;
     AD_CORRECTNESS_CHECK(
-        hexValue.size() == 8 + prefixLength ||
-            hexValue.size() == 4 + prefixLength,
+        hexValue.size() == 8 || hexValue.size() == 4,
         "Unicode escape sequences must be either 8 or 4 characters long.");
 
     if (U16_IS_LEAD(codePoint)) {
@@ -102,7 +97,7 @@ std::string ParserAndVisitor::unescapeUnicodeSequences(std::string input) {
   throwError(highSurrogate == 0,
              "A high surrogate must be followed by a low surrogate.");
 
-  output += input.substr(lastPos);
+  output += view.substr(lastPos);
   return output;
 }
 }  // namespace sparqlParserHelpers

From b44d77cc323d9ed7259d7171031fe0eb3a07cad9 Mon Sep 17 00:00:00 2001
From: RobinTF <83676088+RobinTF@users.noreply.github.com>
Date: Thu, 6 Feb 2025 17:49:26 +0100
Subject: [PATCH 11/13] Fix bugs and optimize code

---
 src/parser/SparqlParserHelpers.cpp | 27 +++++++++++++++++++++++----
 1 file changed, 23 insertions(+), 4 deletions(-)

diff --git a/src/parser/SparqlParserHelpers.cpp b/src/parser/SparqlParserHelpers.cpp
index 51f0ceb696..541725ec39 100644
--- a/src/parser/SparqlParserHelpers.cpp
+++ b/src/parser/SparqlParserHelpers.cpp
@@ -41,6 +41,7 @@ ParserAndVisitor::ParserAndVisitor(
 std::string ParserAndVisitor::unescapeUnicodeSequences(std::string input) {
   std::string_view view{input};
   std::string output;
+  bool noEscapeSequenceFound = true;
   size_t lastPos = 0;
   UChar32 highSurrogate = 0;
 
@@ -51,9 +52,20 @@ std::string ParserAndVisitor::unescapeUnicodeSequences(std::string input) {
     }
   };
 
-  for (auto match :
+  for (const auto& match :
        ctre::search_all<R"(\\U[0-9A-Fa-f]{8}|\\u[0-9A-Fa-f]{4})">(view)) {
-    output += view.substr(lastPos, match.data() - (view.data() + lastPos));
+    if (noEscapeSequenceFound) {
+      output.reserve(input.size());
+      noEscapeSequenceFound = false;
+    }
+    auto inBetweenPart =
+        view.substr(lastPos, match.data() - (view.data() + lastPos));
+
+    throwError(
+        inBetweenPart.empty() || highSurrogate == 0,
+        "A high surrogate must be directly followed by a low surrogate.");
+
+    output += inBetweenPart;
     lastPos = match.data() + match.size() - view.data();
 
     auto hexValue = match.to_view();
@@ -63,12 +75,14 @@ std::string ParserAndVisitor::unescapeUnicodeSequences(std::string input) {
     auto result = std::from_chars(
         hexValue.data(), hexValue.data() + hexValue.size(), codePoint, 16);
     AD_CORRECTNESS_CHECK(result.ec == std::errc{});
-
-    bool isFullCodePoint = hexValue.size() == 8;
     AD_CORRECTNESS_CHECK(
         hexValue.size() == 8 || hexValue.size() == 4,
         "Unicode escape sequences must be either 8 or 4 characters long.");
 
+    bool isFullCodePoint = hexValue.size() == 8;
+
+    // See https://symbl.cc/en/unicode/blocks/high-surrogates/ for more
+    // information.
     if (U16_IS_LEAD(codePoint)) {
       throwError(!isFullCodePoint,
                  "Surrogates should not be encoded as full code points.");
@@ -94,6 +108,11 @@ std::string ParserAndVisitor::unescapeUnicodeSequences(std::string input) {
     helper.toUTF8String(output);
   }
 
+  // Avoid redundant copy if no escape sequences were found.
+  if (noEscapeSequenceFound) {
+    return input;
+  }
+
   throwError(highSurrogate == 0,
              "A high surrogate must be followed by a low surrogate.");
 

From 24d66202ed1d450e5c34436d6be09c7ee51e84c4 Mon Sep 17 00:00:00 2001
From: RobinTF <83676088+RobinTF@users.noreply.github.com>
Date: Thu, 6 Feb 2025 18:07:24 +0100
Subject: [PATCH 12/13] Add additional unit tests

---
 test/SparqlParserTest.cpp | 24 ++++++++++++++++++++++++
 1 file changed, 24 insertions(+)

diff --git a/test/SparqlParserTest.cpp b/test/SparqlParserTest.cpp
index 6b23526578..c232f2186a 100644
--- a/test/SparqlParserTest.cpp
+++ b/test/SparqlParserTest.cpp
@@ -23,6 +23,7 @@ auto lit = ad_utility::testing::tripleComponentLiteral;
 auto iri = ad_utility::testing::iri;
 }  // namespace
 
+// _____________________________________________________________________________
 TEST(ParserTest, testParse) {
   {
     auto pq = SparqlParser::parseQuery("SELECT ?x WHERE {?x ?y ?z}");
@@ -698,6 +699,7 @@ TEST(ParserTest, testParse) {
   }
 }
 
+// _____________________________________________________________________________
 TEST(ParserTest, testFilterWithoutDot) {
   ParsedQuery pq = SparqlParser::parseQuery(
       "PREFIX fb: <http://rdf.freebase.com/ns/>\n"
@@ -726,6 +728,7 @@ TEST(ParserTest, testFilterWithoutDot) {
   ASSERT_EQ("(?1 != fb:m.018mts)", filters[2].expression_.getDescriptor());
 }
 
+// _____________________________________________________________________________
 TEST(ParserTest, testExpandPrefixes) {
   ParsedQuery pq = SparqlParser::parseQuery(
       "PREFIX : <http://rdf.myprefix.com/>\n"
@@ -754,6 +757,7 @@ TEST(ParserTest, testExpandPrefixes) {
   ASSERT_EQ(0, pq._limitOffset._offset);
 }
 
+// _____________________________________________________________________________
 TEST(ParserTest, testLiterals) {
   ParsedQuery pq = SparqlParser::parseQuery(
       "PREFIX xsd: <http://www.w3.org/2001/XMLSchema#> SELECT * WHERE { "
@@ -772,6 +776,7 @@ TEST(ParserTest, testLiterals) {
   ASSERT_EQ(DateYearOrDuration{Date(2000, 1, 1, -1)}, c._triples[1].o_);
 }
 
+// _____________________________________________________________________________
 TEST(ParserTest, testSolutionModifiers) {
   {
     ParsedQuery pq =
@@ -993,6 +998,7 @@ TEST(ParserTest, testSolutionModifiers) {
   }
 }
 
+// _____________________________________________________________________________
 TEST(ParserTest, testGroupByAndAlias) {
   ParsedQuery pq = SparqlParser::parseQuery(
       "SELECT (COUNT(?a) as ?count) WHERE { ?b <rel> ?a } GROUP BY ?b");
@@ -1008,6 +1014,7 @@ TEST(ParserTest, testGroupByAndAlias) {
   EXPECT_THAT(pq, m::GroupByVariables({Var{"?b"}}));
 }
 
+// _____________________________________________________________________________
 TEST(ParserTest, Bind) {
   ParsedQuery pq =
       SparqlParser::parseQuery("SELECT ?a WHERE { BIND (10 - 5 as ?a) . }");
@@ -1020,6 +1027,7 @@ TEST(ParserTest, Bind) {
   ASSERT_EQ(bind._expression.getDescriptor(), "10 - 5");
 }
 
+// _____________________________________________________________________________
 TEST(ParserTest, Order) {
   {
     ParsedQuery pq =
@@ -1101,6 +1109,7 @@ TEST(ParserTest, Order) {
    */
 }
 
+// _____________________________________________________________________________
 TEST(ParserTest, Group) {
   {
     ParsedQuery pq = SparqlParser::parseQuery(
@@ -1165,6 +1174,7 @@ TEST(ParserTest, Group) {
   }
 }
 
+// _____________________________________________________________________________
 TEST(ParserTest, LanguageFilterPostProcessing) {
   {
     ParsedQuery q = SparqlParser::parseQuery(
@@ -1247,6 +1257,7 @@ TEST(ParserTest, LanguageFilterPostProcessing) {
   }
 }
 
+// _____________________________________________________________________________
 namespace {
 std::string getFirstTriple(const ParsedQuery& q) {
   return q._rootGraphPattern._graphPatterns.at(0)
@@ -1256,6 +1267,7 @@ std::string getFirstTriple(const ParsedQuery& q) {
 }
 }  // namespace
 
+// _____________________________________________________________________________
 TEST(ParserTest, HandlesBasicUnicodeEscapeSequences) {
   ParsedQuery q1 = SparqlParser::parseQuery(
       R"(SELECT * WHERE { ?s <http://a.example/p1> '\u0080\u07FF\u0800\u0FFF\u1000\uCFFF\uD000\uD7FF\uE000\uFFFD\U00010000\U0003FFFD\U00040000\U000FFFFD\U00100000\U0010FFFD'})");
@@ -1277,8 +1289,14 @@ TEST(ParserTest, HandlesBasicUnicodeEscapeSequences) {
       R"(SELECT * WHERE { <http://example.com/\U0001F937\U0001F3FD\u200D\U00002642\ufe0F> ?p\u00201. })");
   EXPECT_EQ(getFirstTriple(q4),
             "{s: <http://example.com/🤷🏽‍♂️>, p: ?p, o: 1}");
+
+  // Ensure we don't double-unescape, \u sequences are not allowed in literals
+  EXPECT_THROW(
+      SparqlParser::parseQuery(R"(SELECT * WHERE { "\u005Cu2764" ?p 1. })"),
+      InvalidSparqlQueryException);
 }
 
+// _____________________________________________________________________________
 TEST(ParserTest, HandlesSurrogatesCorrectly) {
   using SP = SparqlParser;
   using ::testing::HasSubstr;
@@ -1286,6 +1304,12 @@ TEST(ParserTest, HandlesSurrogatesCorrectly) {
       R"(SELECT * WHERE { "\uD83E\udD37\uD83C\uDFFD\u200D\u2642\uFE0F" ?p 1. })");
   EXPECT_EQ(getFirstTriple(q), "{s: \"🤷🏽‍♂️\", p: ?p, o: 1}");
 
+  AD_EXPECT_THROW_WITH_MESSAGE_AND_TYPE(
+      SP::parseQuery(R"(SELECT * WHERE { ?s ?p '\uD83C \uDFFD' })"),
+      HasSubstr(
+          "A high surrogate must be directly followed by a low surrogate."),
+      InvalidSparqlQueryException);
+
   AD_EXPECT_THROW_WITH_MESSAGE_AND_TYPE(
       SP::parseQuery(R"(SELECT * WHERE { ?s ?p '\uD800' })"),
       HasSubstr("A high surrogate must be followed by a low surrogate."),

From 777224de8c38a47e06818d1acb752c08250ade1d Mon Sep 17 00:00:00 2001
From: RobinTF <83676088+RobinTF@users.noreply.github.com>
Date: Thu, 6 Feb 2025 18:18:59 +0100
Subject: [PATCH 13/13] Ignore URLs in codespell

---
 .codespellrc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.codespellrc b/.codespellrc
index a1e8aa7c0f..0132eac027 100644
--- a/.codespellrc
+++ b/.codespellrc
@@ -3,6 +3,6 @@
 skip = .git*,.codespellrc,*.pdf,generated
 check-hidden = true
 # Ignore mixedCase variables, lines with latin, lines with codespell-ignore pragma, etc
-ignore-regex = \b([A-Z]*[a-z]+[A-Z][a-zA-Z]*)\b|.*(Lorem ipsum|eleifend|feugait|codespell-ignore).*
+ignore-regex = \b([A-Z]*[a-z]+[A-Z][a-zA-Z]*)\b|.*(Lorem ipsum|eleifend|feugait|codespell-ignore).*|https?://\S+
 # alph - is used frequently in tests, just ignore altogether
 ignore-words-list = ser,alph,inbetween,interm