From aa550575c990869a44bc469490c0371e77243b48 Mon Sep 17 00:00:00 2001
From: Johannes Kalmbach <joka921@users.noreply.github.com>
Date: Tue, 4 Feb 2025 03:09:53 +0100
Subject: [PATCH] Account for `FILTER`s when considering greedy query planning
 (#1705)

Since #1442, QLever switches to greedy query planning for large connected components. A connected component is considered large when the number of connected subgraphs is above the threshold determined by the runtime parameter `query-planning-budget`.

So far, `FILTER`s were simply ignored when counting the number of subgraphs. However, `FILTER`s can add significant complexity to the standard query planning because for each subplan, our query planner considers either adding all applicable `FILTER`s to it or none of them. As a result, for certain queries with a medium-sized component but a significant number of `FILTER`s, the query planning complexity was underestimated, so the query was not planned greedily and the standard query planning took very long.

This is now fixed by replacing, for the purpose of query planning, each `FILTER` by a dummy `VALUES` clause which uses the set of distinct variables from the `FILTER`. A `FILTER` that has many variables in common with other triples will then increase the subgraph count substantially. If multiple `FILTER`s have the same set of distinct variables, the dummy `VALUES` clause is added only once (because our query planner either adds all applicable `FILTER`s at a certain point or none of them). Note that this trick overestimates the true query planning complexity. That is, the worst that can happen now is that with many `FILTER`s, we switch to greedy planning even though standard query planning would have still been feasible.
---
 src/engine/QueryPlanner.cpp | 39 +++++++++++++++++++++++++++++++++++--
 src/engine/QueryPlanner.h   |  7 +++++--
 2 files changed, 42 insertions(+), 4 deletions(-)
diff --git a/src/engine/QueryPlanner.cpp b/src/engine/QueryPlanner.cpp
index 43caa71f02..a8bf59d1c6 100644
--- a/src/engine/QueryPlanner.cpp
+++ b/src/engine/QueryPlanner.cpp
@@ -1343,7 +1343,8 @@ QueryPlanner::runDynamicProgrammingOnConnectedComponent(
 
 // _____________________________________________________________________________
 size_t QueryPlanner::countSubgraphs(
-    std::vector<const QueryPlanner::SubtreePlan*> graph, size_t budget) {
+    std::vector<const QueryPlanner::SubtreePlan*> graph,
+    const std::vector<SparqlFilter>& filters, size_t budget) {
   // Remove duplicate plans from `graph`.
   auto getId = [](const SubtreePlan* v) { return v->_idsOfIncludedNodes; };
   ql::ranges::sort(graph, ql::ranges::less{}, getId);
@@ -1354,6 +1355,36 @@ size_t QueryPlanner::countSubgraphs(
   graph.erase(uniqueIter.begin(), graph.end());
 #endif
 
+  // We also have to consider the `filters`. To make life easy, we temporarily
+  // create simple `SubtreePlans` for them which just have the correct
+  // variables. We only create one subtree plan for each set of variables that
+  // is contained in the `filters`, because this will bring the estimate of this
+  // function closer to the actual behavior of the DP query planner (it always
+  // applies either all possible filters at once, or none of them).
+  std::vector<QueryPlanner::SubtreePlan> dummyPlansForFilter;
+  ad_utility::HashSet<ad_utility::HashSet<Variable>>
+      deduplicatedFilterVariables;
+  for (const auto& filter : filters) {
+    const auto& vars = filter.expression_.containedVariables();
+    ad_utility::HashSet<Variable> varSet;
+    // We use a `VALUES` clause as the dummy because this operation is the
+    // easiest to set up for a number of given variables.
+    parsedQuery::SparqlValues values;
+    for (auto* var : vars) {
+      values._variables.push_back(*var);
+      varSet.insert(*var);
+    }
+    if (deduplicatedFilterVariables.insert(std::move(varSet)).second) {
+      dummyPlansForFilter.push_back(
+          makeSubtreePlan<Values>(_qec, std::move(values)));
+    }
+  }
+
+  const size_t numPlansWithoutFilters = graph.size();
+  for (const auto& filterPlan : dummyPlansForFilter) {
+    graph.push_back(&filterPlan);
+  }
+
   // Qlever currently limits the number of triples etc. per group to be <= 64
   // anyway, so we can simply assert here.
   AD_CORRECTNESS_CHECK(graph.size() <= 64,
@@ -1367,7 +1398,11 @@ size_t QueryPlanner::countSubgraphs(
   for (size_t i = 0; i < graph.size(); ++i) {
     countConnectedSubgraphs::Node v{0};
     for (size_t k = 0; k < graph.size(); ++k) {
+      // Don't connect nodes to themselves, don't connect filters with other
+      // filters, otherwise connect `i` and `k` if they have at least one
+      // variable in common.
       if ((k != i) &&
+          (k < numPlansWithoutFilters || i < numPlansWithoutFilters) &&
           !QueryPlanner::getJoinColumns(*graph.at(k), *graph.at(i)).empty()) {
         v.neighbors_ |= (1ULL << k);
       }
@@ -1477,7 +1512,7 @@ vector<vector<QueryPlanner::SubtreePlan>> QueryPlanner::fillDpTab(
       g.push_back(&plan);
     }
     const size_t budget = RuntimeParameters().get<"query-planning-budget">();
-    bool useGreedyPlanning = countSubgraphs(g, budget) > budget;
+    bool useGreedyPlanning = countSubgraphs(g, filters, budget) > budget;
     if (useGreedyPlanning) {
       LOG(INFO)
           << "Using the greedy query planner for a large connected component"
diff --git a/src/engine/QueryPlanner.h b/src/engine/QueryPlanner.h
index b51523baed..5b4a1c67f3 100644
--- a/src/engine/QueryPlanner.h
+++ b/src/engine/QueryPlanner.h
@@ -488,8 +488,11 @@ class QueryPlanner {
   // if the number of subgraphs is `> budget`. This is used to analyze the
   // complexity of the query graph and to choose between the DP and the greedy
   // query planner see above.
-  static size_t countSubgraphs(std::vector<const SubtreePlan*> graph,
-                               size_t budget);
+  // Note: We also need the added filters, because they behave like additional
+  // graph nodes wrt the performance of the DP based query planner.
+  size_t countSubgraphs(std::vector<const SubtreePlan*> graph,
+                        const std::vector<SparqlFilter>& filters,
+                        size_t budget);
 
   // Creates a SubtreePlan for the given text leaf node in the triple graph.
   // While doing this the TextLimitMetaObjects are created and updated according