diff --git a/src/global/RuntimeParameters.h b/src/global/RuntimeParameters.h index 8e60725ffe..cce7e321b9 100644 --- a/src/global/RuntimeParameters.h +++ b/src/global/RuntimeParameters.h @@ -53,7 +53,12 @@ inline auto& RuntimeParameters() { Bool<"throw-on-unbound-variables">{false}, // Control up until which size lazy results should be cached. Caching // does cause significant overhead for this case. - MemorySizeParameter<"lazy-result-max-cache-size">{5_MB}}; + MemorySizeParameter<"lazy-result-max-cache-size">{5_MB}, + // When the result of an index scan is smaller than a single block, then + // its size estimate will be the size of the block divided by this + // value. + SizeT<"small-index-scan-size-estimate-divisor">{5}, + }; }(); return params; } diff --git a/src/index/CompressedRelation.cpp b/src/index/CompressedRelation.cpp index fc306e892f..c11081f66f 100644 --- a/src/index/CompressedRelation.cpp +++ b/src/index/CompressedRelation.cpp @@ -29,6 +29,35 @@ static auto getBeginAndEnd(auto& range) { return std::pair{ql::ranges::begin(range), ql::ranges::end(range)}; } +// Return true iff the `triple` is contained in the `scanSpec`. For example, the +// triple ` 42 0 3 ` is contained in the specs `U U U`, `42 U U` and `42 0 U` , +// but not in `42 2 U` where `U` means "scan for all possible values". 
+static auto isTripleInSpecification =
+    [](const ScanSpecification& scanSpec,
+       const CompressedBlockMetadata::PermutedTriple& triple) {
+      enum struct M { GuaranteedMatch, Mismatch, MustCheckNextElement };
+      auto checkElement = [](const auto& optId, Id id) {
+        if (!optId.has_value()) {
+          return M::GuaranteedMatch;
+        } else if (optId.value() != id) {
+          return M::Mismatch;
+        } else {
+          return M::MustCheckNextElement;
+        }
+      };
+      auto result = checkElement(scanSpec.col0Id(), triple.col0Id_);
+      if (result == M::MustCheckNextElement) {
+        result = checkElement(scanSpec.col1Id(), triple.col1Id_);
+      }
+      if (result == M::MustCheckNextElement) {
+        result = checkElement(scanSpec.col2Id(), triple.col2Id_);
+      }
+      // The case `result == M::MustCheckNextElement` can happen in the unlikely
+      // case that there only is a single triple in the block, which is scanned
+      // for explicitly.
+      return result != M::Mismatch;
+    };
+
 // modify the `block` according to the `limitOffset`. Also modify the
 // `limitOffset` to reflect the parts of the LIMIT and OFFSET that have been
 // performed by pruning this `block`.
@@ -631,21 +660,53 @@ std::pair CompressedRelationReader::getResultSizeImpl(
   // a part of these blocks is actually part of the result,
   // set up a lambda which allows us to read these blocks, and returns
   // the size of the result.
+  size_t numResults = 0;
+  // Determine the total size of the result.
+  // First accumulate the complete blocks in the "middle"
+  std::size_t inserted = 0;
+  std::size_t deleted = 0;
+
   auto readSizeOfPossiblyIncompleteBlock = [&](const auto& block) {
-    return readPossiblyIncompleteBlock(scanSpec, config, block, std::nullopt,
-                                       locatedTriplesPerBlock)
-        .numRows();
+    if (exactSize) {
+      numResults +=
+          readPossiblyIncompleteBlock(scanSpec, config, block, std::nullopt,
+                                      locatedTriplesPerBlock)
+              .numRows();
+    } else {
+      // If the first and last triple of the block match, then we know that the
+      // whole block belongs to the result.
+      bool isComplete = isTripleInSpecification(scanSpec, block.firstTriple_) &&
+                        isTripleInSpecification(scanSpec, block.lastTriple_);
+      // The divisor is a user-settable runtime parameter. Clamp it to at least
+      // 1, s.t. a setting of 0 cannot cause a division by zero below.
+      size_t divisor = 1;
+      if (!isComplete) {
+        divisor = std::max<size_t>(
+            1, RuntimeParameters()
+                   .get<"small-index-scan-size-estimate-divisor">());
+      }
+      const auto [ins, del] =
+          locatedTriplesPerBlock.numTriples(block.blockIndex_);
+      // Scale a count down by `divisor`, but never round a nonzero count down
+      // to zero. The explicit `size_t` template argument keeps the call
+      // well-formed on platforms where `size_t` is not `unsigned long`.
+      auto trunc = [divisor](size_t num) {
+        return std::max(std::min<size_t>(num, 1), num / divisor);
+      };
+      inserted += trunc(ins);
+      deleted += trunc(del);
+      numResults += trunc(block.numRows_);
+    }
   };
 
-  size_t numResults = 0;
   // The first and the last block might be incomplete, compute
   // and store the partial results from them.
   if (beginBlock < endBlock) {
-    numResults += readSizeOfPossiblyIncompleteBlock(*beginBlock);
+    readSizeOfPossiblyIncompleteBlock(*beginBlock);
     ++beginBlock;
   }
   if (beginBlock < endBlock) {
-    numResults += readSizeOfPossiblyIncompleteBlock(*(endBlock - 1));
+    readSizeOfPossiblyIncompleteBlock(*(endBlock - 1));
     --endBlock;
   }
 
@@ -653,10 +714,6 @@ std::pair CompressedRelationReader::getResultSizeImpl(
     return {numResults, numResults};
   }
 
-  // Determine the total size of the result.
-  // First accumulate the complete blocks in the "middle"
-  std::size_t inserted = 0;
-  std::size_t deleted = 0;
   ql::ranges::for_each(
       ql::ranges::subrange{beginBlock, endBlock}, [&](const auto& block) {
         const auto [ins, del] =
@@ -666,8 +723,8 @@ std::pair CompressedRelationReader::getResultSizeImpl(
           deleted += del;
           numResults += block.numRows_;
         } else {
-          // TODO We could cache the exact size as soon as we have
-          // merged the block once since the last update.
+          // TODO We could cache the exact size as soon as we
+          // have merged the block once since the last update.
           auto b = readAndDecompressBlock(block, config);
           numResults += b.has_value() ? b.value().block_.numRows() : 0u;
         }
@@ -1366,10 +1423,10 @@ auto CompressedRelationWriter::createPermutationPair(
         // relation as its overhead is far too high for small relations.
relation.swapColumns(c1Idx, c2Idx); - // We only need to sort by the columns of the triple + the graph column, - // not the additional payload. Note: We could also use - // `compareWithoutLocalVocab` to compare the IDs cheaper, but this sort - // is far from being a performance bottleneck. + // We only need to sort by the columns of the triple + the graph + // column, not the additional payload. Note: We could also use + // `compareWithoutLocalVocab` to compare the IDs cheaper, but this + // sort is far from being a performance bottleneck. auto compare = [](const auto& a, const auto& b) { return std::tie(a[0], a[1], a[2], a[3]) < std::tie(b[0], b[1], b[2], b[3]); diff --git a/test/engine/IndexScanTest.cpp b/test/engine/IndexScanTest.cpp index 62f01647a0..395a21261a 100644 --- a/test/engine/IndexScanTest.cpp +++ b/test/engine/IndexScanTest.cpp @@ -498,7 +498,8 @@ TEST(IndexScan, getResultSizeOfScan) { SparqlTripleSimple scanTriple{I::fromIriref(""), I::fromIriref("

"), I::fromIriref("")}; IndexScan scan{qec, Permutation::Enum::POS, scanTriple}; - EXPECT_EQ(scan.getSizeEstimate(), 0); + EXPECT_EQ(scan.getSizeEstimate(), 1); + EXPECT_EQ(scan.getExactSize(), 0); } { SparqlTripleSimple scanTriple{I::fromIriref(""), I::fromIriref("

"), diff --git a/test/engine/SpatialJoinTest.cpp b/test/engine/SpatialJoinTest.cpp index ed49d54e86..766801ee3b 100644 --- a/test/engine/SpatialJoinTest.cpp +++ b/test/engine/SpatialJoinTest.cpp @@ -926,22 +926,22 @@ class SpatialJoinMultiplicityAndSizeEstimateTest spatialJoin = static_cast(spJoin2.get()); auto varColsMap = spatialJoin->getExternallyVisibleVariableColumns(); - assertMultiplicity(subj1.getVariable(), 9.8, spatialJoin, varColsMap); - assertMultiplicity(obj1.getVariable(), 7.0, spatialJoin, varColsMap); - assertMultiplicity(subj2.getVariable(), 9.8, spatialJoin, varColsMap); - assertMultiplicity(obj2.getVariable(), 7.0, spatialJoin, varColsMap); + assertMultiplicity(subj1.getVariable(), 4.2, spatialJoin, varColsMap); + assertMultiplicity(obj1.getVariable(), 3.0, spatialJoin, varColsMap); + assertMultiplicity(subj2.getVariable(), 4.2, spatialJoin, varColsMap); + assertMultiplicity(obj2.getVariable(), 3.0, spatialJoin, varColsMap); ASSERT_TRUE( spatialJoin->onlyForTestingGetDistanceVariable().has_value()); assertMultiplicity(Variable{"?distanceForTesting"}, 1, spatialJoin, varColsMap); } else { - ASSERT_EQ(leftChild->getSizeEstimate(), 7); - ASSERT_EQ(rightChild->getSizeEstimate(), 7); + auto leftEstimate = leftChild->getSizeEstimate(); + auto rightEstimate = rightChild->getSizeEstimate(); auto spJoin1 = spatialJoin->addChild(firstChild, firstVariable); spatialJoin = static_cast(spJoin1.get()); auto spJoin2 = spatialJoin->addChild(secondChild, secondVariable); spatialJoin = static_cast(spJoin2.get()); - ASSERT_LE(spatialJoin->getSizeEstimate(), 49); + ASSERT_LE(spatialJoin->getSizeEstimate(), leftEstimate * rightEstimate); } } }