diff --git a/src/engine/Bind.cpp b/src/engine/Bind.cpp index 96e5d5edf6..bdf9132f6a 100644 --- a/src/engine/Bind.cpp +++ b/src/engine/Bind.cpp @@ -5,12 +5,22 @@ #include "Bind.h" #include "engine/CallFixedSize.h" +#include "engine/ExistsJoin.h" #include "engine/QueryExecutionTree.h" #include "engine/sparqlExpressions/SparqlExpression.h" #include "engine/sparqlExpressions/SparqlExpressionGenerators.h" #include "util/ChunkedForLoop.h" #include "util/Exception.h" +// _____________________________________________________________________________ +Bind::Bind(QueryExecutionContext* qec, + std::shared_ptr subtree, parsedQuery::Bind b) + : Operation(qec), _subtree(std::move(subtree)), _bind(std::move(b)) { + _subtree = ExistsJoin::addExistsJoinsToSubtree( + _bind._expression, std::move(_subtree), getExecutionContext(), + cancellationHandle_); +} + // BIND adds exactly one new column size_t Bind::getResultWidth() const { return _subtree->getResultWidth() + 1; } diff --git a/src/engine/Bind.h b/src/engine/Bind.h index 0abd5b2cec..5613f8cd6f 100644 --- a/src/engine/Bind.h +++ b/src/engine/Bind.h @@ -8,14 +8,14 @@ #include "engine/sparqlExpressions/SparqlExpressionPimpl.h" #include "parser/ParsedQuery.h" -/// BIND operation, currently only supports a very limited subset of expressions +// BIND operation. 
class Bind : public Operation { public: static constexpr size_t CHUNK_SIZE = 10'000; + // ____________________________________________________________________________ Bind(QueryExecutionContext* qec, std::shared_ptr subtree, - parsedQuery::Bind b) - : Operation(qec), _subtree(std::move(subtree)), _bind(std::move(b)) {} + parsedQuery::Bind b); private: std::shared_ptr _subtree; diff --git a/src/engine/CMakeLists.txt b/src/engine/CMakeLists.txt index 98faba8743..7e8cbbc953 100644 --- a/src/engine/CMakeLists.txt +++ b/src/engine/CMakeLists.txt @@ -15,5 +15,5 @@ add_library(engine TextLimit.cpp LazyGroupBy.cpp GroupByHashMapOptimization.cpp SpatialJoin.cpp CountConnectedSubgraphs.cpp SpatialJoinAlgorithms.cpp PathSearch.cpp ExecuteUpdate.cpp Describe.cpp GraphStoreProtocol.cpp - QueryExecutionContext.cpp) + QueryExecutionContext.cpp ExistsJoin.cpp) qlever_target_link_libraries(engine util index parser sparqlExpressions http SortPerformanceEstimator Boost::iostreams s2) diff --git a/src/engine/ExistsJoin.cpp b/src/engine/ExistsJoin.cpp new file mode 100644 index 0000000000..902f551ddb --- /dev/null +++ b/src/engine/ExistsJoin.cpp @@ -0,0 +1,207 @@ +// Copyright 2025, University of Freiburg +// Chair of Algorithms and Data Structures +// Author: Johannes Kalmbach + +#include "engine/ExistsJoin.h" + +#include "CallFixedSize.h" +#include "engine/QueryPlanner.h" +#include "engine/sparqlExpressions/ExistsExpression.h" +#include "engine/sparqlExpressions/SparqlExpression.h" +#include "util/JoinAlgorithms/JoinAlgorithms.h" + +// _____________________________________________________________________________ +ExistsJoin::ExistsJoin(QueryExecutionContext* qec, + std::shared_ptr left, + std::shared_ptr right, + Variable existsVariable) + : Operation{qec}, + left_{std::move(left)}, + right_{std::move(right)}, + joinColumns_{QueryExecutionTree::getJoinColumns(*left_, *right_)}, + existsVariable_{std::move(existsVariable)} { + // Make sure that the left and right input are sorted on 
the join columns. + std::tie(left_, right_) = QueryExecutionTree::createSortedTrees( + std::move(left_), std::move(right_), joinColumns_); +} + +// _____________________________________________________________________________ +string ExistsJoin::getCacheKeyImpl() const { + return absl::StrCat("EXISTS JOIN left: ", left_->getCacheKey(), + " right: ", right_->getCacheKey()); +} + +// _____________________________________________________________________________ +string ExistsJoin::getDescriptor() const { return "Exists Join"; } + +// ____________________________________________________________________________ +VariableToColumnMap ExistsJoin::computeVariableToColumnMap() const { + auto res = left_->getVariableColumns(); + AD_CONTRACT_CHECK( + !res.contains(existsVariable_), + "The target variable of an EXISTS join must be a new variable"); + res[existsVariable_] = makeAlwaysDefinedColumn(getResultWidth() - 1); + return res; +} + +// ____________________________________________________________________________ +size_t ExistsJoin::getResultWidth() const { + // We add one column to the input. + return left_->getResultWidth() + 1; +} + +// ____________________________________________________________________________ +vector ExistsJoin::resultSortedOn() const { + // We add one column to `left_`, but do not change the order of the rows. + return left_->resultSortedOn(); +} + +// ____________________________________________________________________________ +float ExistsJoin::getMultiplicity(size_t col) { + // The multiplicities of all columns except the last one are the same as in + // `left_`. + if (col < getResultWidth() - 1) { + return left_->getMultiplicity(col); + } + // For the added (Boolean) column we take a dummy value, assuming that it + // will not be used for subsequent joins or other operations that make use of + // the multiplicities. 
+ return 1; +} + +// ____________________________________________________________________________ +uint64_t ExistsJoin::getSizeEstimateBeforeLimit() { + return left_->getSizeEstimate(); +} + +// ____________________________________________________________________________ +size_t ExistsJoin::getCostEstimate() { + // The implementation is a linear zipper join. + return left_->getCostEstimate() + right_->getCostEstimate() + + left_->getSizeEstimate() + right_->getSizeEstimate(); +} + +// ____________________________________________________________________________ +ProtoResult ExistsJoin::computeResult([[maybe_unused]] bool requestLaziness) { + auto leftRes = left_->getResult(); + auto rightRes = right_->getResult(); + const auto& left = leftRes->idTable(); + const auto& right = rightRes->idTable(); + + // We reuse the generic `zipperJoinWithUndef` function, which has two + // callbacks: one for each matching pair of rows from `left` and `right`, and + // one for rows in the left input that have no matching counterpart in the + // right input. The first callback can be a noop, and the second callback + // gives us exactly those rows, where the value in the to-be-added result + // column should be `false`. + + // Extract the join columns from both inputs to make the following code + // easier. + ad_utility::JoinColumnMapping joinColumnData{joinColumns_, left.numColumns(), + right.numColumns()}; + IdTableView<0> joinColumnsLeft = + left.asColumnSubsetView(joinColumnData.jcsLeft()); + IdTableView<0> joinColumnsRight = + right.asColumnSubsetView(joinColumnData.jcsRight()); + checkCancellation(); + + // Compute `isCheap`, which is true iff there are no UNDEF values in the join + // columns (in which case we can use a simpler and cheaper join algorithm). + // + // TODO This is the most common case. There are many other cases + // where the generic `zipperJoinWithUndef` can be optimized. This is work for + // a future PR. 
+ size_t numJoinColumns = joinColumnsLeft.numColumns(); + AD_CORRECTNESS_CHECK(numJoinColumns == joinColumnsRight.numColumns()); + bool isCheap = ql::ranges::none_of( + ad_utility::integerRange(numJoinColumns), [&](const auto& col) { + return (ql::ranges::any_of(joinColumnsRight.getColumn(col), + &Id::isUndefined)) || + (ql::ranges::any_of(joinColumnsLeft.getColumn(col), + &Id::isUndefined)); + }); + + // Nothing to do for the actual matches. + auto noopRowAdder = ad_utility::noop; + + // Store the indices of rows for which the value of the `EXISTS` (in the added + // Boolean column) should be `false`. + std::vector> notExistsIndices{ + allocator()}; + // Helper lambda for computing the exists join with `callFixedSize`, which + // makes the number of join columns a template parameter. + auto runForNumJoinCols = [&notExistsIndices, isCheap, &noopRowAdder, + &colsLeftDynamic = joinColumnsLeft, + &colsRightDynamic = joinColumnsRight, + this]() { + // The `actionForNotExisting` callback gets iterators as input, but should + // output indices, hence the pointer arithmetic. + auto joinColumnsLeft = colsLeftDynamic.asStaticView(); + auto joinColumnsRight = colsRightDynamic.asStaticView(); + auto actionForNotExisting = + [&notExistsIndices, begin = joinColumnsLeft.begin()]( + const auto& itLeft) { notExistsIndices.push_back(itLeft - begin); }; + + // Run `zipperJoinWithUndef` with the described callbacks and the mentioned + // optimization in case we know that there are no UNDEF values in the join + // columns. 
+ auto checkCancellationLambda = [this] { checkCancellation(); }; + auto runZipperJoin = [&](auto findUndef) { + [[maybe_unused]] auto numOutOfOrder = ad_utility::zipperJoinWithUndef( + joinColumnsLeft, joinColumnsRight, + ql::ranges::lexicographical_compare, noopRowAdder, findUndef, + findUndef, actionForNotExisting, checkCancellationLambda); + }; + if (isCheap) { + runZipperJoin(ad_utility::noop); + } else { + runZipperJoin(ad_utility::findSmallerUndefRanges); + } + }; + ad_utility::callFixedSize(numJoinColumns, runForNumJoinCols); + + // Add the result column from the computed `notExistsIndices` (which tell us + // where the value should be `false`). + IdTable result = left.clone(); + result.addEmptyColumn(); + decltype(auto) existsCol = result.getColumn(getResultWidth() - 1); + ql::ranges::fill(existsCol, Id::makeFromBool(true)); + for (size_t notExistsIndex : notExistsIndices) { + existsCol[notExistsIndex] = Id::makeFromBool(false); + } + + // The added column only contains Boolean values, and adds no new words to the + // local vocabulary, so we can simply copy the local vocab from `leftRes`. + return {std::move(result), resultSortedOn(), leftRes->getCopyOfLocalVocab()}; +} + +// _____________________________________________________________________________ +std::shared_ptr ExistsJoin::addExistsJoinsToSubtree( + const sparqlExpression::SparqlExpressionPimpl& expression, + std::shared_ptr subtree, QueryExecutionContext* qec, + const ad_utility::SharedCancellationHandle& cancellationHandle) { + // Extract all `EXISTS` functions from the given `expression`. + std::vector existsExpressions; + expression.getPimpl()->getExistsExpressions(existsExpressions); + + // For each `EXISTS` function, add the corresponding `ExistsJoin`. + for (auto* expr : existsExpressions) { + const auto& exists = + dynamic_cast(*expr); + // If we have already considered this `EXISTS` (which we can detect by its + // variable), skip it. 
This can happen because some `FILTER`s (which may + // contain `EXISTS` functions) are applied multiple times (for example, + // when there are OPTIONAL joins in the query). + if (subtree->isVariableCovered(exists.variable())) { + continue; + } + + QueryPlanner qp{qec, cancellationHandle}; + auto pq = exists.argument(); + auto tree = + std::make_shared(qp.createExecutionTree(pq)); + subtree = ad_utility::makeExecutionTree( + qec, std::move(subtree), std::move(tree), exists.variable()); + } + return subtree; +} diff --git a/src/engine/ExistsJoin.h b/src/engine/ExistsJoin.h new file mode 100644 index 0000000000..43dbbe074f --- /dev/null +++ b/src/engine/ExistsJoin.h @@ -0,0 +1,82 @@ +// Copyright 2025, University of Freiburg +// Chair of Algorithms and Data Structures +// Author: Johannes Kalmbach + +#pragma once + +#include "engine/Operation.h" +#include "engine/QueryExecutionTree.h" + +// The implementation of an "EXISTS join", which we use to realize the semantics +// of the SPARQL `EXISTS` function. The join takes two subtrees as input, and +// returns the left subtree with an additional boolean column that is `true` iff +// at least one matching row is contained in the right subtree. +class ExistsJoin : public Operation { + private: + // The left and right child. + std::shared_ptr left_; + std::shared_ptr right_; + std::vector> joinColumns_; + + // The variable of the added (Boolean) result column. + Variable existsVariable_; + + public: + // Constructor. The `existsVariable` (the variable for the added column) must + // not yet be bound in `left`. + ExistsJoin(QueryExecutionContext* qec, + std::shared_ptr left, + std::shared_ptr right, + Variable existsVariable); + + // Extract all `ExistsExpression`s from the given `expression`. For each + // `ExistsExpression`, add an `ExistsJoin`. The left side of the first + // `ExistsJoin` is the input `subtree`. The left side of subsequent + // `ExistsJoin`s is the previous `ExistsJoin`. 
The right side of each + // `ExistsJoin` is the argument of the respective `ExistsExpression`. When + // there are no `ExistsExpression`s, return the input `subtree` unchanged. + // + // The returned subtree will contain one additional column for each + // `ExistsExpression`, which contains the result of the respective + // `ExistsJoin`. The `ExistsExpression` just reads the values of this column. + // The main work is done by the `ExistsJoin`. + // + // This function should be called in the constructor of each `Operation`, + // where an `EXISTS` expression can occur. For example, in the constructor of + // `BIND` and `FILTER`. + static std::shared_ptr addExistsJoinsToSubtree( + const sparqlExpression::SparqlExpressionPimpl& expression, + std::shared_ptr subtree, QueryExecutionContext* qec, + const ad_utility::SharedCancellationHandle& cancellationHandle); + + // All following functions are inherited from `Operation`, see there for + // comments. + protected: + string getCacheKeyImpl() const override; + + public: + string getDescriptor() const override; + + size_t getResultWidth() const override; + + vector resultSortedOn() const override; + + bool knownEmptyResult() override { return left_->knownEmptyResult(); } + + float getMultiplicity(size_t col) override; + + private: + uint64_t getSizeEstimateBeforeLimit() override; + + public: + size_t getCostEstimate() override; + + vector getChildren() override { + return {left_.get(), right_.get()}; + } + + private: + ProtoResult computeResult([[maybe_unused]] bool requestLaziness) override; + + VariableToColumnMap computeVariableToColumnMap() const override; +}; diff --git a/src/engine/Filter.cpp b/src/engine/Filter.cpp index fe5c6ad3e0..da868520b8 100644 --- a/src/engine/Filter.cpp +++ b/src/engine/Filter.cpp @@ -10,6 +10,7 @@ #include "backports/algorithm.h" #include "engine/CallFixedSize.h" +#include "engine/ExistsJoin.h" #include "engine/QueryExecutionTree.h" #include "engine/sparqlExpressions/SparqlExpression.h" 
#include "engine/sparqlExpressions/SparqlExpressionGenerators.h" @@ -28,6 +29,9 @@ Filter::Filter(QueryExecutionContext* qec, : Operation(qec), _subtree(std::move(subtree)), _expression{std::move(expression)} { + _subtree = ExistsJoin::addExistsJoinsToSubtree( + _expression, std::move(_subtree), getExecutionContext(), + cancellationHandle_); setPrefilterExpressionForChildren(); } diff --git a/src/engine/GroupBy.cpp b/src/engine/GroupBy.cpp index 5205e4723e..2370a91c29 100644 --- a/src/engine/GroupBy.cpp +++ b/src/engine/GroupBy.cpp @@ -1,14 +1,14 @@ -// Copyright 2018, University of Freiburg, -// Chair of Algorithms and Data Structures. -// Author: -// 2018 Florian Kramer (florian.kramer@mail.uni-freiburg.de) -// 2020- Johannes Kalmbach (kalmbach@informatik.uni-freiburg.de) +// Copyright 2018 - 2025, University of Freiburg +// Chair of Algorithms and Data Structures +// Authors: Florian Kramer [2018 - 2020] +// Johannes Kalmbach #include "engine/GroupBy.h" #include #include "engine/CallFixedSize.h" +#include "engine/ExistsJoin.h" #include "engine/IndexScan.h" #include "engine/Join.h" #include "engine/LazyGroupBy.h" @@ -52,6 +52,14 @@ GroupBy::GroupBy(QueryExecutionContext* qec, vector groupByVariables, ql::ranges::sort(_groupByVariables, std::less<>{}, &Variable::name); auto sortColumns = computeSortColumns(subtree.get()); + + // Aliases are like `BIND`s, which may contain `EXISTS` expressions. + for (const auto& alias : _aliases) { + subtree = ExistsJoin::addExistsJoinsToSubtree( + alias._expression, std::move(subtree), getExecutionContext(), + cancellationHandle_); + } + _subtree = QueryExecutionTree::createSortedTree(std::move(subtree), sortColumns); } @@ -1526,7 +1534,6 @@ Result GroupBy::computeGroupByForHashMapOptimization( // NOTE: If the input blocks have very similar or even identical non-empty // local vocabs, no deduplication is performed. localVocab.mergeWith(std::span{&inputLocalVocab, 1}); - // Setup the `EvaluationContext` for this input block. 
sparqlExpression::EvaluationContext evaluationContext( *getExecutionContext(), _subtree->getVariableColumns(), inputTable, diff --git a/src/engine/MultiColumnJoin.cpp b/src/engine/MultiColumnJoin.cpp index 75852fb69a..a831c4cd55 100644 --- a/src/engine/MultiColumnJoin.cpp +++ b/src/engine/MultiColumnJoin.cpp @@ -1,6 +1,7 @@ -// Copyright 2018, University of Freiburg, -// Chair of Algorithms and Data Structures. -// Author: Florian Kramer (florian.kramer@netpun.uni-freiburg.de) +// Copyright 2018 - 2025, University of Freiburg +// Chair of Algorithms and Data Structures +// Authors: Florian Kramer [2018 - 2020] +// Johannes Kalmbach #include "MultiColumnJoin.h" @@ -237,21 +238,16 @@ void MultiColumnJoin::computeMultiColumnJoin( rowAdder.addRow(itLeft - beginLeft, itRight - beginRight); }; - auto findUndef = [](const auto& row, auto begin, auto end, - bool& resultMightBeUnsorted) { - return ad_utility::findSmallerUndefRanges(row, begin, end, - resultMightBeUnsorted); - }; - - // `isCheap` is true iff there are no UNDEF values in the join columns. In - // this case we can use a much cheaper algorithm. - // TODO There are many other cases where a cheaper implementation can - // be chosen, but we leave those for another PR, this is the most common case. - namespace stdr = ql::ranges; - bool isCheap = stdr::none_of(joinColumns, [&](const auto& jcs) { + // Compute `isCheap`, which is true iff there are no UNDEF values in the join + // columns (in which case we can use a simpler and cheaper join algorithm). + // + // TODO This is the most common case. There are many other cases + // where the generic `zipperJoinWithUndef` can be optimized. We will leave those + // for a later PR. 
+ bool isCheap = ql::ranges::none_of(joinColumns, [&](const auto& jcs) { auto [leftCol, rightCol] = jcs; - return (stdr::any_of(right.getColumn(rightCol), &Id::isUndefined)) || - (stdr::any_of(left.getColumn(leftCol), &Id::isUndefined)); + return (ql::ranges::any_of(right.getColumn(rightCol), &Id::isUndefined)) || + (ql::ranges::any_of(left.getColumn(leftCol), &Id::isUndefined)); }); auto checkCancellationLambda = [this] { checkCancellation(); }; @@ -265,8 +261,10 @@ void MultiColumnJoin::computeMultiColumnJoin( } else { return ad_utility::zipperJoinWithUndef( leftJoinColumns, rightJoinColumns, - ql::ranges::lexicographical_compare, addRow, findUndef, findUndef, - ad_utility::noop, checkCancellationLambda); + ql::ranges::lexicographical_compare, addRow, + ad_utility::findSmallerUndefRanges, + ad_utility::findSmallerUndefRanges, ad_utility::noop, + checkCancellationLambda); } }(); *result = std::move(rowAdder).resultTable(); diff --git a/src/engine/sparqlExpressions/ExistsExpression.h b/src/engine/sparqlExpressions/ExistsExpression.h new file mode 100644 index 0000000000..afaa026344 --- /dev/null +++ b/src/engine/sparqlExpressions/ExistsExpression.h @@ -0,0 +1,70 @@ +// Copyright 2025, University of Freiburg +// Chair of Algorithms and Data Structures +// Author: Johannes Kalmbach + +#pragma once + +#include + +#include "engine/sparqlExpressions/SparqlExpression.h" +#include "parser/ParsedQuery.h" + +// The `SparqlExpression` for `EXISTS`. The implementation is straightforward +// because it only reads the value computed by the special `ExistsJoin` +// operation, where the actual work is done (see the comments there). +namespace sparqlExpression { +class ExistsExpression : public SparqlExpression { + private: + // The argument of the `EXISTS`, which is a group graph pattern. This is set + // during parsing and is used by the `ExistsJoin` operation. 
+ ParsedQuery argument_; + + // Each `ExistsExpression` has a unique index and a unique variable name that + // is used to communicate the result computed by the `ExistsJoin` to this + // `ExistsExpression`. + static inline std::atomic indexCounter_ = 0; + size_t index_ = ++indexCounter_; + Variable variable_{absl::StrCat("?ql_internal_exists_", index_)}; + + public: + explicit ExistsExpression(ParsedQuery query) : argument_{std::move(query)} {} + const auto& argument() const { return argument_; } + const auto& variable() const { return variable_; } + + // To evaluate, just return the variable of the column computed by the + // `ExistsJoin`. + ExpressionResult evaluate(EvaluationContext* context) const override { + AD_CONTRACT_CHECK(context->_variableToColumnMap.contains(variable_)); + return variable_; + } + + // Return the cache key, which in the normal case depends on the column index + // of the variable computed by the `ExistsJoin`. + // + // There is a special case, where the corresponding `ExistsJoin` has not + // been set up yet (because the query planning is not yet complete). Since we + // cannot cache incomplete operations, we return a random cache key in this + // case. + [[nodiscard]] string getCacheKey( + const VariableToColumnMap& varColMap) const override { + if (varColMap.contains(variable_)) { + return absl::StrCat("ExistsExpression col# ", + varColMap.at(variable_).columnIndex_); + } else { + // This means that the necessary `ExistsJoin` hasn't been set up yet. For + // example, this can happen if `getCacheKey` is called during query + // planning (which is done to avoid redundant evaluation in the case of + // identical subtrees in the query plan). + return absl::StrCat("Uninitialized Exists: ", + ad_utility::FastRandomIntGenerator{}()); + } + } + + // This is the one expression, where this function should return `true`. + // Used to extract `EXISTS` expressions from a general expression tree. 
+ bool isExistsExpression() const override { return true; } + + private: + std::span childrenImpl() override { return {}; } +}; +} // namespace sparqlExpression diff --git a/src/engine/sparqlExpressions/SparqlExpression.cpp b/src/engine/sparqlExpressions/SparqlExpression.cpp index b5ec3aa0f7..099933020f 100644 --- a/src/engine/sparqlExpressions/SparqlExpression.cpp +++ b/src/engine/sparqlExpressions/SparqlExpression.cpp @@ -180,4 +180,18 @@ bool SparqlExpression::isInsideAggregate() const { } return isInsideAggregate_; } + +// ________________________________________________________________ +bool SparqlExpression::isExistsExpression() const { return false; } + +// ________________________________________________________________ +void SparqlExpression::getExistsExpressions( + std::vector& result) const { + if (isExistsExpression()) { + result.push_back(this); + } + for (auto& child : children()) { + child->getExistsExpressions(result); + } +} } // namespace sparqlExpression diff --git a/src/engine/sparqlExpressions/SparqlExpression.h b/src/engine/sparqlExpressions/SparqlExpression.h index 1378f10520..f033f27edc 100644 --- a/src/engine/sparqlExpressions/SparqlExpression.h +++ b/src/engine/sparqlExpressions/SparqlExpression.h @@ -123,6 +123,16 @@ class SparqlExpression { // implementation returns `false`. virtual bool isStrExpression() const; + // Returns true iff this expression is an EXISTS(...) expression. Default + // implementation returns `false`. + virtual bool isExistsExpression() const; + + // Return non-null pointers to all `EXISTS` expressions in expression tree. + // The result is passed in as a reference to simplify the recursive + // implementation. 
+ virtual void getExistsExpressions( + std::vector& result) const final; + // __________________________________________________________________________ virtual ~SparqlExpression() = default; diff --git a/src/parser/sparqlParser/SparqlQleverVisitor.cpp b/src/parser/sparqlParser/SparqlQleverVisitor.cpp index f1355c1b04..b691d2a55b 100644 --- a/src/parser/sparqlParser/SparqlQleverVisitor.cpp +++ b/src/parser/sparqlParser/SparqlQleverVisitor.cpp @@ -1,9 +1,8 @@ -// Copyright 2021 - 2024, University of Freiburg +// Copyright 2021 - 2025, University of Freiburg // Chair of Algorithms and Data Structures -// Authors: -// 2021 - Hannah Bast -// 2022 Julian Mundhahs -// 2022 - Johannes Kalmbach +// Authors: Julian Mundhahs +// Hannah Bast +// Johannes Kalmbach #include "parser/sparqlParser/SparqlQleverVisitor.h" @@ -14,6 +13,7 @@ #include "absl/time/time.h" #include "engine/sparqlExpressions/CountStarExpression.h" +#include "engine/sparqlExpressions/ExistsExpression.h" #include "engine/sparqlExpressions/GroupConcatExpression.h" #include "engine/sparqlExpressions/LiteralExpression.h" #include "engine/sparqlExpressions/NaryExpression.h" @@ -323,6 +323,7 @@ ParsedQuery Visitor::visit(Parser::ConstructQueryContext* ctx) { ParsedQuery query; query.datasetClauses_ = parsedQuery::DatasetClauses::fromClauses( visitVector(ctx->datasetClause())); + activeDatasetClauses_ = query.datasetClauses_; if (ctx->constructTemplate()) { query._clause = visit(ctx->constructTemplate()) .value_or(parsedQuery::ConstructClause{}); @@ -366,9 +367,9 @@ ParsedQuery Visitor::visit(Parser::DescribeQueryContext* ctx) { } // Parse the FROM and FROM NAMED clauses. - auto datasetClauses = parsedQuery::DatasetClauses::fromClauses( + activeDatasetClauses_ = parsedQuery::DatasetClauses::fromClauses( visitVector(ctx->datasetClause())); - describeClause.datasetClauses_ = datasetClauses; + describeClause.datasetClauses_ = activeDatasetClauses_; // Parse the WHERE clause and construct a SELECT query from it. 
For `DESCRIBE // *`, add each visible variable as a resource to describe. @@ -399,7 +400,7 @@ ParsedQuery Visitor::visit(Parser::DescribeQueryContext* ctx) { parsedQuery_.addSolutionModifiers(visit(ctx->solutionModifier())); parsedQuery_._rootGraphPattern._graphPatterns.emplace_back( std::move(describeClause)); - parsedQuery_.datasetClauses_ = datasetClauses; + parsedQuery_.datasetClauses_ = activeDatasetClauses_; auto constructClause = ParsedQuery::ConstructClause{}; using G = GraphTerm; using V = Variable; @@ -415,6 +416,7 @@ ParsedQuery Visitor::visit(Parser::AskQueryContext* ctx) { parsedQuery_._clause = ParsedQuery::AskClause{}; parsedQuery_.datasetClauses_ = parsedQuery::DatasetClauses::fromClauses( visitVector(ctx->datasetClause())); + activeDatasetClauses_ = parsedQuery_.datasetClauses_; visitWhereClause(ctx->whereClause(), parsedQuery_); // NOTE: It can make sense to have solution modifiers with an ASK query, for // example, a GROUP BY with a HAVING. @@ -660,6 +662,8 @@ ParsedQuery Visitor::visit(Parser::ModifyContext* ctx) { }; AD_CORRECTNESS_CHECK(visibleVariables_.empty()); auto graphPattern = visit(ctx->groupGraphPattern()); + parsedQuery_.datasetClauses_ = + parsedQuery::DatasetClauses::fromClauses(visitVector(ctx->usingClause())); parsedQuery_._rootGraphPattern = std::move(graphPattern); parsedQuery_.registerVariablesVisibleInQueryBody(visibleVariables_); visibleVariables_.clear(); @@ -670,8 +674,6 @@ ParsedQuery Visitor::visit(Parser::ModifyContext* ctx) { checkTriples(op.toDelete_); visitIf(&op.with_, ctx->iri()); parsedQuery_._clause = parsedQuery::UpdateClause{op}; - parsedQuery_.datasetClauses_ = - parsedQuery::DatasetClauses::fromClauses(visitVector(ctx->usingClause())); return parsedQuery_; } @@ -1263,6 +1265,7 @@ ParsedQuery Visitor::visit(Parser::SelectQueryContext* ctx) { parsedQuery_._clause = visit(ctx->selectClause()); parsedQuery_.datasetClauses_ = parsedQuery::DatasetClauses::fromClauses( visitVector(ctx->datasetClause())); + 
activeDatasetClauses_ = parsedQuery_.datasetClauses_; visitWhereClause(ctx->whereClause(), parsedQuery_); parsedQuery_.addSolutionModifiers(visit(ctx->solutionModifier())); return parsedQuery_; @@ -2320,6 +2323,10 @@ ExpressionPtr Visitor::visit([[maybe_unused]] Parser::BuiltInCallContext* ctx) { return visit(ctx->substringExpression()); } else if (ctx->strReplaceExpression()) { return visit(ctx->strReplaceExpression()); + } else if (ctx->existsFunc()) { + return visit(ctx->existsFunc()); + } else if (ctx->notExistsFunc()) { + return visit(ctx->notExistsFunc()); } // Get the function name and the arguments. Note that we do not have to check // the number of arguments like for `processIriFunctionCall`, since the number @@ -2514,14 +2521,48 @@ SparqlExpression::Ptr Visitor::visit(Parser::StrReplaceExpressionContext* ctx) { std::move(children.at(2))); } +// ____________________________________________________________________________ +ExpressionPtr Visitor::visitExists(Parser::GroupGraphPatternContext* pattern, + bool negate) { + // The argument of `EXISTS` is a `GroupGraphPattern` that is independent from + // the rest of the query (except for the `FROM` and `FROM NAMED` clauses, + // which also apply to the argument of `EXISTS`). We therefore have to back up + // and restore all global state when parsing `EXISTS`. + auto queryBackup = std::exchange(parsedQuery_, ParsedQuery{}); + auto visibleVariablesBackup = std::move(visibleVariables_); + visibleVariables_.clear(); + + // Parse the argument of `EXISTS`. + auto group = visit(pattern); + ParsedQuery argumentOfExists = + std::exchange(parsedQuery_, std::move(queryBackup)); + argumentOfExists.selectClause().setAsterisk(); + argumentOfExists._rootGraphPattern = std::move(group); + + // The argument of `EXISTS` inherits the `FROM` and `FROM NAMED` clauses from + // the outer query. 
+ argumentOfExists.datasetClauses_ = activeDatasetClauses_; + visibleVariables_ = std::move(visibleVariablesBackup); + auto exists = std::make_unique( + std::move(argumentOfExists)); + + // Handle `NOT EXISTS` (which is syntactically distinct from `! EXISTS`) by + // simply negating the `ExistsExpression`. + if (negate) { + return sparqlExpression::makeUnaryNegateExpression(std::move(exists)); + } else { + return exists; + } +} + // ____________________________________________________________________________________ -void Visitor::visit(const Parser::ExistsFuncContext* ctx) { - reportNotSupported(ctx, "The EXISTS function is"); +ExpressionPtr Visitor::visit(Parser::ExistsFuncContext* ctx) { + return visitExists(ctx->groupGraphPattern(), false); } // ____________________________________________________________________________________ -void Visitor::visit(const Parser::NotExistsFuncContext* ctx) { - reportNotSupported(ctx, "The NOT EXISTS function is"); +ExpressionPtr Visitor::visit(Parser::NotExistsFuncContext* ctx) { + return visitExists(ctx->groupGraphPattern(), true); } // ____________________________________________________________________________________ diff --git a/src/parser/sparqlParser/SparqlQleverVisitor.h b/src/parser/sparqlParser/SparqlQleverVisitor.h index b9a29a6fbc..412f2677f6 100644 --- a/src/parser/sparqlParser/SparqlQleverVisitor.h +++ b/src/parser/sparqlParser/SparqlQleverVisitor.h @@ -79,8 +79,18 @@ class SparqlQleverVisitor { // query. This may contain duplicates. A variable is added via // `addVisibleVariable`. std::vector visibleVariables_{}; + + // The `FROM` and `FROM NAMED` clauses of the query that is currently + // being parsed. Those are inherited by certain constructs, which are + // otherwise independent (in particular, `EXISTS` and `DESCRIBE`). + ParsedQuery::DatasetClauses activeDatasetClauses_; + + // The map from prefixes to their full IRIs. PrefixMap prefixMap_{}; + + // The `BASE` IRI of the query if any. 
ad_utility::triple_component::Iri baseIri_{}; + // We need to remember the prologue (prefix declarations) when we encounter it // because we need it when we encounter a SERVICE query. When there is no // prologue, this string simply remains empty. @@ -113,6 +123,11 @@ class SparqlQleverVisitor { isInsideConstructTriples_ = true; } + void setActiveDatasetClausesForTesting( + ParsedQuery::DatasetClauses datasetClauses) { + activeDatasetClauses_ = std::move(datasetClauses); + } + // ___________________________________________________________________________ ParsedQuery visit(Parser::QueryOrUpdateContext* ctx); @@ -446,9 +461,12 @@ class SparqlQleverVisitor { ExpressionPtr visit(Parser::StrReplaceExpressionContext* ctx); - [[noreturn]] static void visit(const Parser::ExistsFuncContext* ctx); + ExpressionPtr visitExists(Parser::GroupGraphPatternContext* pattern, + bool negate); + + ExpressionPtr visit(Parser::ExistsFuncContext* ctx); - [[noreturn]] static void visit(const Parser::NotExistsFuncContext* ctx); + ExpressionPtr visit(Parser::NotExistsFuncContext* ctx); ExpressionPtr visit(Parser::AggregateContext* ctx); diff --git a/src/util/JoinAlgorithms/FindUndefRanges.h b/src/util/JoinAlgorithms/FindUndefRanges.h index 5edd0b95b2..f8cfdc9dfb 100644 --- a/src/util/JoinAlgorithms/FindUndefRanges.h +++ b/src/util/JoinAlgorithms/FindUndefRanges.h @@ -166,34 +166,35 @@ CPP_template(typename It)(requires std::random_access_iterator) // // have additional information about the input (most notably which of the join // columns contain no UNDEF at all) and therefore a more specialized routine // should be chosen. 
-CPP_template(typename It)( - requires std::random_access_iterator< - It>) auto findSmallerUndefRanges(const auto& row, It begin, It end, - bool& resultMightBeUnsorted) - -> cppcoro::generator { - size_t numLastUndefined = 0; - assert(row.size() > 0); - auto it = ql::ranges::rbegin(row); - auto rend = ql::ranges::rend(row); - for (; it < rend; ++it) { - if (*it != Id::makeUndefined()) { - break; +struct FindSmallerUndefRanges { + CPP_template(typename It)(requires std::random_access_iterator) auto + operator()(const auto& row, It begin, It end, + bool& resultMightBeUnsorted) const -> cppcoro::generator { + size_t numLastUndefined = 0; + assert(row.size() > 0); + auto it = ql::ranges::rbegin(row); + auto rend = ql::ranges::rend(row); + for (; it < rend; ++it) { + if (*it != Id::makeUndefined()) { + break; + } + ++numLastUndefined; } - ++numLastUndefined; - } - for (; it < rend; ++it) { - if (*it == Id::makeUndefined()) { - return findSmallerUndefRangesArbitrary(row, begin, end, - resultMightBeUnsorted); + for (; it < rend; ++it) { + if (*it == Id::makeUndefined()) { + return findSmallerUndefRangesArbitrary(row, begin, end, + resultMightBeUnsorted); + } + } + if (numLastUndefined == 0) { + return findSmallerUndefRangesForRowsWithoutUndef(row, begin, end, + resultMightBeUnsorted); + } else { + return findSmallerUndefRangesForRowsWithUndefInLastColumns( + row, numLastUndefined, begin, end, resultMightBeUnsorted); } } - if (numLastUndefined == 0) { - return findSmallerUndefRangesForRowsWithoutUndef(row, begin, end, - resultMightBeUnsorted); - } else { - return findSmallerUndefRangesForRowsWithUndefInLastColumns( - row, numLastUndefined, begin, end, resultMightBeUnsorted); - } -} +}; +constexpr FindSmallerUndefRanges findSmallerUndefRanges; } // namespace ad_utility diff --git a/test/ExceptionTest.cpp b/test/ExceptionTest.cpp index eaf0d0504d..4cc649ebc0 100644 --- a/test/ExceptionTest.cpp +++ b/test/ExceptionTest.cpp @@ -1,6 +1,6 @@ -// Copyright 2023, University of 
Freiburg, -// Chair of Algorithms and Data Structures. -// Author: Johannes Kalmbach +// Copyright 2023 - 2025, University of Freiburg +// Chair of Algorithms and Data Structures. +// Author: Johannes Kalmbach #include #include diff --git a/test/QueryPlannerTest.cpp b/test/QueryPlannerTest.cpp index 90462f3cc3..e2135f80c5 100644 --- a/test/QueryPlannerTest.cpp +++ b/test/QueryPlannerTest.cpp @@ -1,7 +1,7 @@ -// Copyright 2015 - 2024, University of Freiburg +// Copyright 2015 - 2025, University of Freiburg // Chair of Algorithms and Data Structures // Authors: Björn Buchhold [2015 - 2017] -// Johannes Kalmbach [2018 - 2024] +// Johannes Kalmbach #include @@ -2906,10 +2906,78 @@ TEST(QueryPlanner, Describe) { } // ____________________________________________________________________________ -TEST(QueryPlanner, GroupByRedundanteParensAndVariables) { +TEST(QueryPlanner, GroupByRedundantParensAndVariables) { auto matcher = h::GroupBy({Variable{"?x"}}, {}, h::IndexScanFromStrings("?x", "?y", "?z")); h::expect("SELECT ?x { ?x ?y ?z} GROUP BY (?x)", matcher); h::expect("SELECT ?x { ?x ?y ?z} GROUP BY ?x ?x", matcher); h::expect("SELECT ?x { ?x ?y ?z} GROUP BY ?x ?x (?x)", matcher); } + +// ____________________________________________________________________________ +TEST(QueryPlanner, Exists) { + auto xyz = h::IndexScanFromStrings("?x", "?y", "?z"); + auto abc = h::IndexScanFromStrings("?a", "?b", "?c"); + auto def = h::IndexScanFromStrings("?d", "?e", "?f"); + auto ghi = h::IndexScanFromStrings("?g", "?h", "?i"); + using V = Variable; + + // Simple tests for EXISTS with FILTER, BIND, and GROUP BY. 
+ h::expect("SELECT * { ?x ?y ?z FILTER EXISTS {?a ?b ?c} }", + h::Filter("EXISTS {?a ?b ?c}", h::ExistsJoin(xyz, abc))); + h::expect("SELECT * { ?x ?y ?z BIND(EXISTS {?a ?b ?c} as ?bound) }", + h::Bind(h::ExistsJoin(xyz, abc), "EXISTS {?a ?b ?c}", + Variable("?bound"))); + h::expect( + "SELECT ?x (SAMPLE(EXISTS{?a ?b ?c}) as ?s) { ?x ?y ?z } GROUP BY ?x", + h::GroupBy({V{"?x"}}, {"(SAMPLE(EXISTS{?a ?b ?c}) as ?s)"}, + h::ExistsJoin(xyz, abc))); + + // Similar tests, but with multiple EXISTS clauses + auto existsAbcDef = h::ExistsJoin(h::ExistsJoin(xyz, abc), def); + h::expect( + "SELECT * { ?x ?y ?z FILTER (EXISTS {?a ?b ?c} || EXISTS {?d ?e ?f})}", + h::Filter("EXISTS {?a ?b ?c} || EXISTS {?d ?e ?f}", existsAbcDef)); + ; + h::expect( + "SELECT * { ?x ?y ?z BIND(EXISTS {?a ?b ?c} || EXISTS {?d ?e ?f} as " + "?bound)}", + h::Bind(existsAbcDef, "EXISTS {?a ?b ?c} || EXISTS {?d ?e ?f}", + Variable("?bound"))); + + h::expect( + "SELECT ?x (SAMPLE(EXISTS {?a ?b ?c} || EXISTS {?d ?e ?f}) as ?s) " + "(SAMPLE(EXISTS{?g ?h ?i}) as ?t) { ?x ?y ?z } GROUP BY ?x", + h::GroupBy({V{"?x"}}, + {"(SAMPLE(EXISTS {?a ?b ?c} || EXISTS {?d ?e ?f}) as ?s)", + "(SAMPLE(EXISTS{?g ?h ?i}) as ?t)"}, + h::ExistsJoin(existsAbcDef, ghi))); + + // Test the interaction of FROM with EXISTS. + using H = ad_utility::HashSet; + auto xyzg = h::IndexScanFromStrings("?x", "?y", "?z", {}, H{""}); + auto abcg = h::IndexScanFromStrings("?a", "?b", "?c", {}, H{""}); + + // Various uses of FILTER EXISTS. 
+ auto existsJoin = h::ExistsJoin(xyzg, abcg); + auto filter = h::Filter("EXISTS {?a ?b ?c}", existsJoin); + h::expect("SELECT * FROM { ?x ?y ?z FILTER EXISTS {?a ?b ?c}}", filter); + h::expect("ASK FROM { ?x ?y ?z FILTER EXISTS {?a ?b ?c}}", filter); + h::expect( + "CONSTRUCT { } FROM { ?x ?y ?z FILTER EXISTS {?a ?b ?c}}", + filter); + h::expect("Describe ?x FROM { ?x ?y ?z FILTER EXISTS {?a ?b ?c}}", + h::Describe(::testing::_, filter)); + + // Test the interaction of FROM NAMES with EXISTS + auto varG = std::vector{Variable{"?g"}}; + std::vector graphCol{ADDITIONAL_COLUMN_GRAPH_ID}; + auto uvcg = + h::IndexScanFromStrings("?u", "?v", "?c", {}, H{""}, varG, graphCol); + existsJoin = h::ExistsJoin(xyzg, h::UnorderedJoins(abcg, uvcg)); + filter = h::Filter("EXISTS {?a ?b ?c. GRAPH ?g { ?u ?v ?c}}", existsJoin); + h::expect( + "SELECT * FROM FROM NAMED { ?x ?y ?z FILTER EXISTS {?a ?b ?c. " + "GRAPH ?g { ?u ?v ?c}}}", + filter); +} diff --git a/test/QueryPlannerTestHelpers.h b/test/QueryPlannerTestHelpers.h index c300bf0d5f..f53f30c5bb 100644 --- a/test/QueryPlannerTestHelpers.h +++ b/test/QueryPlannerTestHelpers.h @@ -15,6 +15,7 @@ #include "engine/CartesianProductJoin.h" #include "engine/CountAvailablePredicates.h" #include "engine/Describe.h" +#include "engine/ExistsJoin.h" #include "engine/Filter.h" #include "engine/GroupBy.h" #include "engine/IndexScan.h" @@ -405,6 +406,12 @@ inline QetMatcher Describe( AD_PROPERTY(::Describe, getDescribe, describeMatcher))); } +// Match an `ExistsJoin` +inline QetMatcher ExistsJoin(const QetMatcher& leftChild, + const QetMatcher& rightChild) { + return RootOperation<::ExistsJoin>(AllOf(children(leftChild, rightChild))); +} + // inline QetMatcher QetWithWarnings( const std::vector& warningSubstrings, diff --git a/test/SparqlAntlrParserTest.cpp b/test/SparqlAntlrParserTest.cpp index 1d9f11a710..8fa0bb0287 100644 --- a/test/SparqlAntlrParserTest.cpp +++ b/test/SparqlAntlrParserTest.cpp @@ -1,9 +1,10 @@ -// Copyright 2021 - 2024, 
University of Freiburg +// Copyright 2021 - 2025, University of Freiburg // Chair of Algorithms and Data Structures // Authors: Johannes Kalmbach // Julian Mundhahs // Hannah Bast +#include #include #include @@ -14,6 +15,7 @@ #include "./SparqlExpressionTestHelpers.h" #include "./util/GTestHelpers.h" #include "./util/TripleComponentTestHelpers.h" +#include "QueryPlannerTestHelpers.h" #include "SparqlAntlrParserTestHelpers.h" #include "engine/sparqlExpressions/CountStarExpression.h" #include "engine/sparqlExpressions/GroupConcatExpression.h" @@ -50,9 +52,11 @@ const ad_utility::HashMap defaultPrefixMap{ template auto parse = [](const string& input, SparqlQleverVisitor::PrefixMap prefixes = {}, + ParsedQuery::DatasetClauses clauses = {}, SparqlQleverVisitor::DisableSomeChecksOnlyForTesting disableSomeChecks = SparqlQleverVisitor::DisableSomeChecksOnlyForTesting::False) { ParserAndVisitor p{input, std::move(prefixes), disableSomeChecks}; + p.visitor_.setActiveDatasetClausesForTesting(std::move(clauses)); if (testInsideConstructTemplate) { p.visitor_.setParseModeToInsideConstructTemplateForTesting(); } @@ -109,7 +113,21 @@ struct ExpectCompleteParse { EXPECT_NO_THROW({ return expectCompleteParse( parse( - input, std::move(prefixMap), disableSomeChecks), + input, std::move(prefixMap), {}, disableSomeChecks), + matcher, l); + }); + }; + + auto operator()(const string& input, + const testing::Matcher& matcher, + ParsedQuery::DatasetClauses activeDatasetClauses, + ad_utility::source_location l = + ad_utility::source_location::current()) const { + auto tr = generateLocationTrace(l, "successful parsing was expected here"); + EXPECT_NO_THROW({ + return expectCompleteParse( + parse( + input, {}, std::move(activeDatasetClauses), disableSomeChecks), matcher, l); }); }; @@ -134,7 +152,7 @@ struct ExpectParseFails { ad_utility::source_location l = ad_utility::source_location::current()) { auto trace = generateLocationTrace(l); AD_EXPECT_THROW_WITH_MESSAGE( - parse(input, 
std::move(prefixMap), disableSomeChecks), + parse(input, std::move(prefixMap), {}, disableSomeChecks), messageMatcher); } }; @@ -1907,6 +1925,68 @@ TEST(SparqlParser, binaryStringExpressions) { expectBuiltInCall("STRBEFORE(?x, ?y)", makeMatcher(&makeStrBeforeExpression)); } +// Matchers for EXISTS and NOT EXISTS functions. +namespace existsTestHelpers { +using namespace sparqlExpression; +using namespace ::testing; + +// Match an EXISTS function +auto existsMatcher(Matcher pattern) { + return Pointee(WhenDynamicCastTo( + AD_PROPERTY(ExistsExpression, argument, pattern))); +} +// Match a NOT EXISTS function +auto notExistsMatcher(Matcher pattern) { + return builtInCallTestHelpers::matchNaryWithChildrenMatchers( + &makeUnaryNegateExpression, existsMatcher(pattern)); +} +} // namespace existsTestHelpers + +// _____________________________________________________________________________ +TEST(SparqlParser, Exists) { + using namespace existsTestHelpers; + auto expectBuiltInCall = ExpectCompleteParse<&Parser::builtInCall>{}; + + // A matcher that matches the query `SELECT * { ?x ?foo }`, where the + // FROM and FROM NAMED clauses can be specified as arguments. + using Graphs = ScanSpecificationAsTripleComponent::Graphs; + auto selectABarFooMatcher = [](Graphs defaultGraphs = std::nullopt, + Graphs namedGraphs = std::nullopt) { + return testing::AllOf(m::SelectQuery( + m::AsteriskSelect(), + m::GraphPattern(m::Triples({{Var{"?a"}, "", Var{"?foo"}}})), + defaultGraphs, namedGraphs)); + }; + + expectBuiltInCall("EXISTS {?a ?foo}", + existsMatcher(selectABarFooMatcher())); + expectBuiltInCall("NOT EXISTS {?a ?foo}", + notExistsMatcher(selectABarFooMatcher())); + + Graphs defaultGraphs{ad_utility::HashSet{iri("")}}; + Graphs namedGraphs{ad_utility::HashSet{iri("")}}; + + // Now run the same tests, but with non-empty dataset clauses, that have to be + // propagated to the `ParsedQuery` stored inside the `ExistsExpression`. 
+ ParsedQuery::DatasetClauses datasetClauses; + datasetClauses.defaultGraphs_ = defaultGraphs; + datasetClauses.namedGraphs_ = namedGraphs; + datasetClauses.defaultGraphs_.value().insert(iri("")); + expectBuiltInCall("EXISTS {?a ?foo}", + existsMatcher(selectABarFooMatcher())); + expectBuiltInCall("NOT EXISTS {?a ?foo}", + notExistsMatcher(selectABarFooMatcher())); + + expectBuiltInCall( + "EXISTS {?a ?foo}", + existsMatcher(selectABarFooMatcher(defaultGraphs, namedGraphs)), + datasetClauses); + expectBuiltInCall( + "NOT EXISTS {?a ?foo}", + notExistsMatcher(selectABarFooMatcher(defaultGraphs, namedGraphs)), + datasetClauses); +} + namespace aggregateTestHelpers { using namespace sparqlExpression; diff --git a/test/SparqlExpressionTest.cpp b/test/SparqlExpressionTest.cpp index ecc0f44846..d8bdb947a8 100644 --- a/test/SparqlExpressionTest.cpp +++ b/test/SparqlExpressionTest.cpp @@ -3,6 +3,7 @@ // Authors: Johannes Kalmbach // Hannah Bast +#include #include #include @@ -1489,3 +1490,21 @@ TEST(SingleUseExpression, simpleMembersForTestCoverage) { EXPECT_ANY_THROW(expression.getUnaggregatedVariables()); EXPECT_ANY_THROW(expression.getCacheKey({})); } + +// This just tests basic functionality of the `ExistsExpression` class. Since +// the actual implementation of the `EXISTS` operator is done in the +// `ExistsJoin` class, most of the testing happens in +// `test/engine/ExistsJoinTest.cpp`. 
+TEST(ExistsExpression, basicFunctionality) { + ExistsExpression exists{ParsedQuery{}}; + auto var = exists.variable(); + TestContext context; + EXPECT_ANY_THROW(exists.evaluate(&context.context)); + using namespace testing; + EXPECT_THAT(exists.getCacheKey(context.varToColMap), + HasSubstr("Uninitialized Exists")); + context.varToColMap[var] = makeAlwaysDefinedColumn(437); + EXPECT_THAT(exists.evaluate(&context.context), VariantWith(var)); + EXPECT_THAT(exists.getCacheKey(context.varToColMap), + HasSubstr("ExistsExpression col# 437")); +} diff --git a/test/engine/CMakeLists.txt b/test/engine/CMakeLists.txt index fef9ffed39..41b2b463ad 100644 --- a/test/engine/CMakeLists.txt +++ b/test/engine/CMakeLists.txt @@ -12,3 +12,4 @@ addLinkAndDiscoverTest(BindTest engine) addLinkAndRunAsSingleTest(SpatialJoinAlgorithmsTest engine) addLinkAndDiscoverTestSerial(QueryExecutionTreeTest engine) addLinkAndDiscoverTestSerial(DescribeTest engine) +addLinkAndDiscoverTestSerial(ExistsJoinTest engine) diff --git a/test/engine/ExistsJoinTest.cpp b/test/engine/ExistsJoinTest.cpp new file mode 100644 index 0000000000..e16d9b3ba7 --- /dev/null +++ b/test/engine/ExistsJoinTest.cpp @@ -0,0 +1,135 @@ +// Copyright 2024 - 2025, University of Freiburg +// Chair of Algorithms and Data Structures +// Author: Johannes Kalmbach + +#include + +#include "../util/GTestHelpers.h" +#include "../util/IdTableHelpers.h" +#include "../util/IndexTestHelpers.h" +#include "engine/ExistsJoin.h" +#include "engine/IndexScan.h" +#include "engine/NeutralElementOperation.h" +#include "engine/QueryExecutionTree.h" + +using namespace ad_utility::testing; + +namespace { + +// Helper function that computes an `ExistsJoin` of the given `left` and +// `right` and checks that the result columns is equal to `expectedAsBool`. +// The first `numJoinColumns` columns of both `leftInput` and `rightInput` are +// used as join columns. 
+// +void testExistsFromIdTable(IdTable left, IdTable right, + std::vector expectedAsBool, + size_t numJoinColumns) { + AD_CORRECTNESS_CHECK(left.numRows() == expectedAsBool.size()); + AD_CORRECTNESS_CHECK(left.numColumns() >= numJoinColumns); + AD_CORRECTNESS_CHECK(right.numColumns() >= numJoinColumns); + + // Randomly permute the columns of the `input` and return the permutation that + // was applied + auto permuteColumns = [](auto& table) { + auto colsView = ad_utility::integerRange(table.numColumns()); + std::vector permutation; + ql::ranges::copy(colsView, std::back_inserter(permutation)); + table.setColumnSubset(permutation); + return permutation; + }; + // Permute the columns. + auto leftPermutation = permuteColumns(left); + auto rightPermutation = permuteColumns(right); + + // We have to make the deep copy of `left` for the expected result at exactly + // this point: The permutation of the columns (above) also affects the + // expected result, while the permutation of the rows (which will be applied + // below) doesn't affect it, as the `ExistsJoin` internally sorts its inputs. + IdTable expected = left.clone(); + + // Randomly shuffle the inputs, to ensure that the `existsJoin` correctly + // pre-sorts its inputs. + ad_utility::randomShuffle(left.begin(), left.end()); + ad_utility::randomShuffle(right.begin(), right.end()); + + auto qec = getQec(); + using V = Variable; + using Vars = std::vector>; + + // Helper lambda `makeChild` that turns a `VectorTable` input into a + // `QueryExecutionTree` with a `ValuesForTesting` operation. 
+ auto joinCol = [](size_t i) { return V{absl::StrCat("?joinCol_", i)}; }; + auto nonJoinCol = [i = 0]() mutable { + return V{absl::StrCat("?nonJoinCol_", i++)}; + }; + + auto makeChild = [&](const IdTable& input, const auto& columnPermutation) { + Vars vars; + for (auto colIdx : columnPermutation) { + if (colIdx < numJoinColumns) { + vars.push_back(joinCol(colIdx)); + } else { + vars.push_back(nonJoinCol()); + } + } + return ad_utility::makeExecutionTree(qec, input.clone(), + vars); + }; + + // Compute the `ExistsJoin` and check the result. + auto exists = ExistsJoin{qec, makeChild(left, leftPermutation), + makeChild(right, rightPermutation), V{"?exists"}}; + EXPECT_EQ(exists.getResultWidth(), left.numColumns() + 1); + auto res = exists.computeResultOnlyForTesting(); + const auto& table = res.idTable(); + ASSERT_EQ(table.numRows(), left.size()); + expected.addEmptyColumn(); + ql::ranges::transform(expectedAsBool, expected.getColumn(2).begin(), + &Id::makeFromBool); + EXPECT_THAT(table, matchesIdTable(expected)); +} + +// Same as the function above, but conveniently takes `VectorTable`s instead of +// `IdTable`s. +void testExists(const VectorTable& leftInput, const VectorTable& rightInput, + std::vector expectedAsBool, size_t numJoinColumns) { + auto left = makeIdTableFromVector(leftInput); + auto right = makeIdTableFromVector(rightInput); + testExistsFromIdTable(std::move(left), std::move(right), + std::move(expectedAsBool), numJoinColumns); +} + +} // namespace + +TEST(Exists, computeResult) { + // Single join column. + testExists({{3, 6}, {4, 7}, {5, 8}}, {{3, 15}, {3, 19}, {5, 37}}, + {true, false, true}, 1); + + // Single join column with one UNDEF (which always matches). + auto U = Id::makeUndefined(); + testExists({{U, 13}, {3, 6}, {4, 7}, {5, 8}}, {{3, 15}, {3, 19}, {5, 37}}, + {true, true, false, true}, 1); + testExists({{3, 6}, {4, 7}, {5, 8}}, {{U, 15}}, {true, true, true}, 1); + + // Two join columns. 
+ testExists({{3, 6}, {4, 7}, {5, 8}}, {{3, 15}, {3, 19}, {5, 37}}, + {false, false, false}, 2); + testExists({{3, 6}, {4, 7}, {5, 8}}, + {{3, 6, 11}, {3, 19, 7}, {4, 8, 0}, {5, 8, 37}}, + {true, false, true}, 2); + + // Two join columns with UNDEFs in each column. + testExists({{2, 2}, {3, U}, {4, 8}, {5, 8}}, + {{U, 8}, {3, 15}, {3, 19}, {5, U}, {5, 37}}, + {false, true, true, true}, 2); + testExists({{U, U}}, {{13, 17}}, {true}, 2); + testExists({{13, 17}, {25, 38}}, {{U, U}}, {true, true}, 2); + + // Empty inputs + auto alloc = ad_utility::testing::makeAllocator(); + testExistsFromIdTable(IdTable(2, alloc), + makeIdTableFromVector({{U, U}, {3, 7}}), {}, 1); + testExistsFromIdTable(makeIdTableFromVector({{U, U}, {3, 7}}), + IdTable(2, alloc), {false, false}, 2); +}