Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Avoid reading and decompressing index blocks during query planning #1725

Merged
Merged
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 6 additions & 1 deletion src/global/RuntimeParameters.h
Original file line number Diff line number Diff line change
Expand Up @@ -53,7 +53,12 @@ inline auto& RuntimeParameters() {
Bool<"throw-on-unbound-variables">{false},
// Control up until which size lazy results should be cached. Caching
// does cause significant overhead for this case.
MemorySizeParameter<"lazy-result-max-cache-size">{5_MB}};
MemorySizeParameter<"lazy-result-max-cache-size">{5_MB},
// When the result of an index scan is smaller than a single block, then
// its size estimate will be the size of the block divided by this
// value.
SizeT<"small-index-scan-size-estimate-divisor">{5},
};
}();
return params;
}
Expand Down
79 changes: 63 additions & 16 deletions src/index/CompressedRelation.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,34 @@ static auto getBeginAndEnd(auto& range) {
return std::pair{ql::ranges::begin(range), ql::ranges::end(range)};
}

// Decide whether `triple` matches the `scanSpec`. The columns are compared in
// the order col0, col1, col2. An unset column in the specification (written
// `U` below) accepts every value, and the first unset column ends the
// comparison with a match. Example: the triple `42 0 3` matches the specs
// `U U U`, `42 U U` and `42 0 U`, but it does not match `42 2 U`.
static auto isTripleInSpecification =
    [](const ScanSpecification& scanSpec,
       const CompressedBlockMetadata::PermutedTriple& triple) {
      // TODO<joka921> Make this a free function, make this simpler
      // Verdict for a single column: `true` or `false` if this column already
      // settles the answer, `std::nullopt` if the column is fixed in the
      // specification and equal to the triple's value, so that the next
      // column has to be inspected.
      auto matchColumn = [](const auto& specId,
                            Id tripleId) -> std::optional<bool> {
        if (!specId.has_value()) {
          return true;
        }
        if (specId.value() != tripleId) {
          return false;
        }
        return std::nullopt;
      };

      if (auto verdict = matchColumn(scanSpec.col0Id(), triple.col0Id_);
          verdict.has_value()) {
        return verdict.value();
      }
      if (auto verdict = matchColumn(scanSpec.col1Id(), triple.col1Id_);
          verdict.has_value()) {
        return verdict.value();
      }
      if (auto verdict = matchColumn(scanSpec.col2Id(), triple.col2Id_);
          verdict.has_value()) {
        return verdict.value();
      }
      // All three columns are fixed in the specification and each of them is
      // equal to the corresponding column of the triple.
      return true;
    };

// modify the `block` according to the `limitOffset`. Also modify the
// `limitOffset` to reflect the parts of the LIMIT and OFFSET that have been
// performed by pruning this `block`.
Expand Down Expand Up @@ -631,32 +659,51 @@ std::pair<size_t, size_t> CompressedRelationReader::getResultSizeImpl(
// a part of these blocks is actually part of the result,
// set up a lambda which allows us to read these blocks, and returns
// the size of the result.
size_t numResults = 0;
// Determine the total size of the result.
// First accumulate the complete blocks in the "middle"
std::size_t inserted = 0;
std::size_t deleted = 0;

auto readSizeOfPossiblyIncompleteBlock = [&](const auto& block) {
return readPossiblyIncompleteBlock(scanSpec, config, block, std::nullopt,
locatedTriplesPerBlock)
.numRows();
if (exactSize) {
numResults +=
readPossiblyIncompleteBlock(scanSpec, config, block, std::nullopt,
locatedTriplesPerBlock)
.numRows();
} else {
bool isComplete = isTripleInSpecification(scanSpec, block.firstTriple_) &&
isTripleInSpecification(scanSpec, block.lastTriple_);
size_t divisor =
isComplete ? 1
: RuntimeParameters()
.get<"small-index-scan-size-estimate-divisor">();
const auto [ins, del] =
locatedTriplesPerBlock.numTriples(block.blockIndex_);
// Scale the count `num` down by `divisor`. A nonzero count never drops to
// zero (the result is at least 1), and a count of zero stays zero.
// `size_t{1}` is used instead of `1ul` so that both arguments of `std::min`
// have the same type on platforms where `size_t` is not `unsigned long`.
auto trunc = [divisor](size_t num) {
  // Treat a divisor of 0 like 1, otherwise the division below would divide
  // by zero. NOTE(review): presumably the divisor parameter can be changed
  // by the user at runtime; confirm whether 0 is rejected elsewhere.
  const size_t safeDivisor = std::max(divisor, size_t{1});
  return std::max(std::min(num, size_t{1}), num / safeDivisor);
};
inserted += trunc(ins);
deleted += trunc(del);
numResults += trunc(block.numRows_);
}
};

size_t numResults = 0;
// The first and the last block might be incomplete, compute
// and store the partial results from them.
if (beginBlock < endBlock) {
numResults += readSizeOfPossiblyIncompleteBlock(*beginBlock);
readSizeOfPossiblyIncompleteBlock(*beginBlock);
++beginBlock;
}
if (beginBlock < endBlock) {
numResults += readSizeOfPossiblyIncompleteBlock(*(endBlock - 1));
readSizeOfPossiblyIncompleteBlock(*(endBlock - 1));
--endBlock;
}

if (beginBlock == endBlock) {
return {numResults, numResults};
}

// Determine the total size of the result.
// First accumulate the complete blocks in the "middle"
std::size_t inserted = 0;
std::size_t deleted = 0;
ql::ranges::for_each(
ql::ranges::subrange{beginBlock, endBlock}, [&](const auto& block) {
const auto [ins, del] =
Expand All @@ -666,8 +713,8 @@ std::pair<size_t, size_t> CompressedRelationReader::getResultSizeImpl(
deleted += del;
numResults += block.numRows_;
} else {
// TODO<joka921> We could cache the exact size as soon as we have
// merged the block once since the last update.
// TODO<joka921> We could cache the exact size as soon as we
// have merged the block once since the last update.
auto b = readAndDecompressBlock(block, config);
numResults += b.has_value() ? b.value().block_.numRows() : 0u;
}
Expand Down Expand Up @@ -1366,10 +1413,10 @@ auto CompressedRelationWriter::createPermutationPair(
// relation as its overhead is far too high for small relations.
relation.swapColumns(c1Idx, c2Idx);

// We only need to sort by the columns of the triple + the graph column,
// not the additional payload. Note: We could also use
// `compareWithoutLocalVocab` to compare the IDs cheaper, but this sort
// is far from being a performance bottleneck.
// We only need to sort by the columns of the triple + the graph
// column, not the additional payload. Note: We could also use
// `compareWithoutLocalVocab` to compare the IDs cheaper, but this
// sort is far from being a performance bottleneck.
auto compare = [](const auto& a, const auto& b) {
return std::tie(a[0], a[1], a[2], a[3]) <
std::tie(b[0], b[1], b[2], b[3]);
Expand Down
3 changes: 2 additions & 1 deletion test/engine/IndexScanTest.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -498,7 +498,8 @@ TEST(IndexScan, getResultSizeOfScan) {
SparqlTripleSimple scanTriple{I::fromIriref("<x2>"), I::fromIriref("<p>"),
I::fromIriref("<s1>")};
IndexScan scan{qec, Permutation::Enum::POS, scanTriple};
EXPECT_EQ(scan.getSizeEstimate(), 0);
EXPECT_EQ(scan.getSizeEstimate(), 1);
EXPECT_EQ(scan.getExactSize(), 0);
}
{
SparqlTripleSimple scanTriple{I::fromIriref("<x>"), I::fromIriref("<p>"),
Expand Down
14 changes: 7 additions & 7 deletions test/engine/SpatialJoinTest.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -926,22 +926,22 @@ class SpatialJoinMultiplicityAndSizeEstimateTest
spatialJoin = static_cast<SpatialJoin*>(spJoin2.get());
auto varColsMap = spatialJoin->getExternallyVisibleVariableColumns();

assertMultiplicity(subj1.getVariable(), 9.8, spatialJoin, varColsMap);
assertMultiplicity(obj1.getVariable(), 7.0, spatialJoin, varColsMap);
assertMultiplicity(subj2.getVariable(), 9.8, spatialJoin, varColsMap);
assertMultiplicity(obj2.getVariable(), 7.0, spatialJoin, varColsMap);
assertMultiplicity(subj1.getVariable(), 4.2, spatialJoin, varColsMap);
assertMultiplicity(obj1.getVariable(), 3.0, spatialJoin, varColsMap);
assertMultiplicity(subj2.getVariable(), 4.2, spatialJoin, varColsMap);
assertMultiplicity(obj2.getVariable(), 3.0, spatialJoin, varColsMap);
ASSERT_TRUE(
spatialJoin->onlyForTestingGetDistanceVariable().has_value());
assertMultiplicity(Variable{"?distanceForTesting"}, 1, spatialJoin,
varColsMap);
} else {
ASSERT_EQ(leftChild->getSizeEstimate(), 7);
ASSERT_EQ(rightChild->getSizeEstimate(), 7);
auto leftEstimate = leftChild->getSizeEstimate();
auto rightEstimate = rightChild->getSizeEstimate();
auto spJoin1 = spatialJoin->addChild(firstChild, firstVariable);
spatialJoin = static_cast<SpatialJoin*>(spJoin1.get());
auto spJoin2 = spatialJoin->addChild(secondChild, secondVariable);
spatialJoin = static_cast<SpatialJoin*>(spJoin2.get());
ASSERT_LE(spatialJoin->getSizeEstimate(), 49);
ASSERT_LE(spatialJoin->getSizeEstimate(), leftEstimate * rightEstimate);
}
}
}
Expand Down
Loading