Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

SOLR-17151 - stop processing components once we have exceeded a query limit #2403

Closed
wants to merge 2 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -122,13 +122,13 @@ public ResponseBuilder(
* public static final String LOCAL_SHARD = "local"; public static final String DOC_QUERY = "dq";
* *
*/
public static int STAGE_START = 0;
public static final int STAGE_START = 0;

public static int STAGE_PARSE_QUERY = 1000;
public static int STAGE_TOP_GROUPS = 1500;
public static int STAGE_EXECUTE_QUERY = 2000;
public static int STAGE_GET_FIELDS = 3000;
public static int STAGE_DONE = Integer.MAX_VALUE;
public static final int STAGE_PARSE_QUERY = 1000;
public static final int STAGE_TOP_GROUPS = 1500;
public static final int STAGE_EXECUTE_QUERY = 2000;
public static final int STAGE_GET_FIELDS = 3000;
public static final int STAGE_DONE = Integer.MAX_VALUE;

public int stage; // What stage is this current request at?

Expand Down
182 changes: 132 additions & 50 deletions solr/core/src/java/org/apache/solr/handler/component/SearchHandler.java
Original file line number Diff line number Diff line change
Expand Up @@ -20,8 +20,16 @@
import static org.apache.solr.common.params.CommonParams.FAILURE;
import static org.apache.solr.common.params.CommonParams.PATH;
import static org.apache.solr.common.params.CommonParams.STATUS;
import static org.apache.solr.handler.component.ResponseBuilder.STAGE_DONE;
import static org.apache.solr.handler.component.ResponseBuilder.STAGE_EXECUTE_QUERY;
import static org.apache.solr.handler.component.ResponseBuilder.STAGE_GET_FIELDS;
import static org.apache.solr.handler.component.ResponseBuilder.STAGE_PARSE_QUERY;
import static org.apache.solr.handler.component.ResponseBuilder.STAGE_START;
import static org.apache.solr.handler.component.ResponseBuilder.STAGE_TOP_GROUPS;
import static org.apache.solr.request.SolrRequestInfo.getQueryLimits;

import com.codahale.metrics.Counter;
import java.io.IOException;
import java.io.PrintWriter;
import java.io.StringWriter;
import java.lang.invoke.MethodHandles;
Expand All @@ -33,6 +41,7 @@
import java.util.Set;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.atomic.AtomicLong;
import java.util.stream.Collectors;
import org.apache.lucene.index.ExitableDirectoryReader;
import org.apache.lucene.search.TotalHits;
import org.apache.solr.client.solrj.SolrRequest.SolrRequestType;
Expand Down Expand Up @@ -422,18 +431,74 @@ public void handleRequestBody(SolrQueryRequest req, SolrQueryResponse rsp) throw
return; // Circuit breaker tripped, return immediately
}

processComponents(req, rsp, rb, timer, components);

// SOLR-5550: still provide shards.info if requested even for a short-circuited distrib request
if (!rb.isDistrib
&& req.getParams().getBool(ShardParams.SHARDS_INFO, false)
&& rb.shortCircuitedURL != null) {
NamedList<Object> shardInfo = new SimpleOrderedMap<>();
SimpleOrderedMap<Object> nl = new SimpleOrderedMap<>();
if (rsp.getException() != null) {
Throwable cause = rsp.getException();
if (cause instanceof SolrServerException) {
cause = ((SolrServerException) cause).getRootCause();
} else {
if (cause.getCause() != null) {
cause = cause.getCause();
}
}
nl.add("error", cause.toString());
if (!core.getCoreContainer().hideStackTrace()) {
StringWriter trace = new StringWriter();
cause.printStackTrace(new PrintWriter(trace));
nl.add("trace", trace.toString());
}
} else if (rb.getResults() != null) {
nl.add("numFound", rb.getResults().docList.matches());
nl.add(
"numFoundExact",
rb.getResults().docList.hitCountRelation() == TotalHits.Relation.EQUAL_TO);
nl.add("maxScore", rb.getResults().docList.maxScore());
}
nl.add("shardAddress", rb.shortCircuitedURL);
nl.add("time", req.getRequestTimer().getTime()); // elapsed time of this request so far

int pos = rb.shortCircuitedURL.indexOf("://");
String shardInfoName =
pos != -1 ? rb.shortCircuitedURL.substring(pos + 3) : rb.shortCircuitedURL;
shardInfo.add(shardInfoName, nl);
rsp.getValues().add(ShardParams.SHARDS_INFO, shardInfo);
}
}

private void processComponents(
SolrQueryRequest req,
SolrQueryResponse rsp,
ResponseBuilder rb,
RTimerTree timer,
List<SearchComponent> components)
throws IOException {
// creates a ShardHandler object only if it's needed
final ShardHandler shardHandler1 = getAndPrepShardHandler(req, rb);

if (timer == null) {
// non-debugging prepare phase
for (SearchComponent c : components) {
if (checkLimitsBefore(c, "prepare", rb.req, rb.rsp, components)) {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Nice! that's very useful info for troubleshooting.

shortCircuitedResults(req, rb);
return;
}
c.prepare(rb);
}
} else {
// debugging prepare phase
RTimerTree subt = timer.sub("prepare");
for (SearchComponent c : components) {
if (checkLimitsBefore(c, "prepare debug", rb.req, rb.rsp, components)) {
shortCircuitedResults(req, rb);
return;
}
rb.setTimer(subt.sub(c.getName()));
c.prepare(rb);
rb.getTimer().stop();
Expand Down Expand Up @@ -463,12 +528,20 @@ public void handleRequestBody(SolrQueryRequest req, SolrQueryResponse rsp) throw
if (!rb.isDebug()) {
// Process
for (SearchComponent c : components) {
if (checkLimitsBefore(c, "process", rb.req, rb.rsp, components)) {
shortCircuitedResults(req, rb);
return;
}
c.process(rb);
}
} else {
// Process
RTimerTree subt = timer.sub("process");
for (SearchComponent c : components) {
if (checkLimitsBefore(c, "process debug", rb.req, rb.rsp, components)) {
shortCircuitedResults(req, rb);
return;
}
rb.setTimer(subt.sub(c.getName()));
c.process(rb);
rb.getTimer().stop();
Expand All @@ -482,21 +555,7 @@ public void handleRequestBody(SolrQueryRequest req, SolrQueryResponse rsp) throw
}
} catch (ExitableDirectoryReader.ExitingReaderException ex) {
log.warn("Query: {}; ", req.getParamString(), ex);
if (rb.rsp.getResponse() == null) {
rb.rsp.addResponse(new SolrDocumentList());

// If a cursorMark was passed, and we didn't progress, set
// the nextCursorMark to the same position
String cursorStr = rb.req.getParams().get(CursorMarkParams.CURSOR_MARK_PARAM);
if (null != cursorStr) {
rb.rsp.add(CursorMarkParams.CURSOR_MARK_NEXT, cursorStr);
}
}
if (rb.isDebug()) {
NamedList<Object> debug = new NamedList<>();
debug.add("explain", new NamedList<>());
rb.rsp.add("debug", debug);
}
shortCircuitedResults(req, rb);
rb.rsp.setPartialResults();
}
} else {
Expand Down Expand Up @@ -610,6 +669,10 @@ public void handleRequestBody(SolrQueryRequest req, SolrQueryResponse rsp) throw
}

for (SearchComponent c : components) {
if (checkLimitsBefore(
c, "finish stage:" + stageInEnglish(nextStage), rb.req, rb.rsp, components)) {
return;
}
c.finishStage(rb);
}

Expand All @@ -621,46 +684,65 @@ public void handleRequestBody(SolrQueryRequest req, SolrQueryResponse rsp) throw
rsp.addToLog(ThreadCpuTimer.CPU_TIME, totalShardCpuTime);
}
}
}

// SOLR-5550: still provide shards.info if requested even for a short circuited distrib request
if (!rb.isDistrib
&& req.getParams().getBool(ShardParams.SHARDS_INFO, false)
&& rb.shortCircuitedURL != null) {
NamedList<Object> shardInfo = new SimpleOrderedMap<>();
SimpleOrderedMap<Object> nl = new SimpleOrderedMap<>();
if (rsp.getException() != null) {
Throwable cause = rsp.getException();
if (cause instanceof SolrServerException) {
cause = ((SolrServerException) cause).getRootCause();
} else {
if (cause.getCause() != null) {
cause = cause.getCause();
}
}
nl.add("error", cause.toString());
if (!core.getCoreContainer().hideStackTrace()) {
StringWriter trace = new StringWriter();
cause.printStackTrace(new PrintWriter(trace));
nl.add("trace", trace.toString());
}
} else if (rb.getResults() != null) {
nl.add("numFound", rb.getResults().docList.matches());
nl.add(
"numFoundExact",
rb.getResults().docList.hitCountRelation() == TotalHits.Relation.EQUAL_TO);
nl.add("maxScore", rb.getResults().docList.maxScore());
}
nl.add("shardAddress", rb.shortCircuitedURL);
nl.add("time", req.getRequestTimer().getTime()); // elapsed time of this request so far
private static String stageInEnglish(int nextStage) {
// This should probably be a enum, but that change should be its own ticket.
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

+1 for enum (in a separate PR)

switch (nextStage) {
case STAGE_START:
return "START";
case STAGE_PARSE_QUERY:
return "PARSE_QUERY";
case STAGE_TOP_GROUPS:
return "TOP_GROUPS";
case STAGE_EXECUTE_QUERY:
return "EXECUTE_QUERY";
case STAGE_GET_FIELDS:
return "GET_FIELDS";
// nobody wants to think it was DONE and canceled after it completed...
case STAGE_DONE:
return "FINISHING";
default:
throw new SolrException(
SolrException.ErrorCode.SERVER_ERROR, "Unrecognized stage:" + nextStage);
}
}

int pos = rb.shortCircuitedURL.indexOf("://");
String shardInfoName =
pos != -1 ? rb.shortCircuitedURL.substring(pos + 3) : rb.shortCircuitedURL;
shardInfo.add(shardInfoName, nl);
rsp.getValues().add(ShardParams.SHARDS_INFO, shardInfo);
/**
 * Fills in the minimal response structure expected by clients when processing stops early:
 * an empty document list if no response section exists yet, an echoed cursor mark (so cursor
 * clients see no progress rather than a missing key), and an empty debug/explain section when
 * debugging was requested.
 *
 * @param req the current request (kept for signature compatibility with callers)
 * @param rb the response builder whose {@code rsp} is populated
 */
private static void shortCircuitedResults(SolrQueryRequest req, ResponseBuilder rb) {
  SolrQueryResponse response = rb.rsp;
  if (response.getResponse() == null) {
    response.addResponse(new SolrDocumentList());

    // If a cursorMark was passed, and we didn't progress, set
    // the nextCursorMark to the same position
    String cursorMark = rb.req.getParams().get(CursorMarkParams.CURSOR_MARK_PARAM);
    if (cursorMark != null) {
      response.add(CursorMarkParams.CURSOR_MARK_NEXT, cursorMark);
    }
  }
  if (rb.isDebug()) {
    NamedList<Object> debug = new NamedList<>();
    debug.add("explain", new NamedList<>());
    response.add("debug", debug);
  }
}

private static boolean checkLimitsBefore(
SearchComponent c,
String when,
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Hmm, this name looks confusing ... maybe stageName or stageLabel ?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

stage is what we use for the finishing steps... maybe phase?

SolrQueryRequest req,
SolrQueryResponse resp,
List<SearchComponent> components) {

return getQueryLimits(req, resp)
.maybeExitWithPartialResults(
() -> {
gus-asf marked this conversation as resolved.
Show resolved Hide resolved
List<String> names =
components.stream().map(SearchComponent::getName).collect(Collectors.toList());
return "[" + when + "] Limit(s) exceeded prior to " + c.getName() + " in " + names;
});
}

private long computeShardCpuTime(List<ShardResponse> responses) {
long totalShardCpuTime = 0;
for (ShardResponse response : responses) {
Expand Down
11 changes: 7 additions & 4 deletions solr/core/src/java/org/apache/solr/request/SolrRequestInfo.java
Original file line number Diff line number Diff line change
Expand Up @@ -245,10 +245,13 @@ private void initQueryLimits() {
/**
 * Returns the {@link QueryLimits} for this request, creating and caching them in the request
 * context on first use. Returns {@code QueryLimits.NONE} when this info carries no
 * request/response pair (e.g. outside of normal request processing).
 */
public QueryLimits getLimits() {
  // make sure the ThreadCpuTime is always initialized
  getThreadCpuTimer();
  return req == null || rsp == null ? QueryLimits.NONE : getQueryLimits(req, rsp);
}

/**
 * Fetches the {@code QueryLimits} cached in the request context, lazily creating them on first
 * call. Also makes sure a {@code ThreadCpuTimer} is present in the context first (mirroring
 * {@link #getLimits()}), so CPU-based limits have a timer to read from.
 *
 * @param request the request whose context caches the limits
 * @param response the response the limits may mark as partial
 * @return the (possibly newly created) limits for this request
 */
public static QueryLimits getQueryLimits(SolrQueryRequest request, SolrQueryResponse response) {
  request.getContext().computeIfAbsent(CPU_TIMER_KEY, k -> new ThreadCpuTimer());
  Object limits =
      request.getContext().computeIfAbsent(LIMITS_KEY, k -> new QueryLimits(request, response));
  return (QueryLimits) limits;
}

/**
Expand Down
17 changes: 14 additions & 3 deletions solr/core/src/java/org/apache/solr/search/QueryLimits.java
Original file line number Diff line number Diff line change
Expand Up @@ -16,11 +16,13 @@
*/
package org.apache.solr.search;

import static org.apache.solr.response.SolrQueryResponse.RESPONSE_HEADER_PARTIAL_RESULTS_DETAILS_KEY;
import static org.apache.solr.search.CpuAllowedLimit.hasCpuLimit;
import static org.apache.solr.search.TimeAllowedLimit.hasTimeLimit;

import java.util.ArrayList;
import java.util.List;
import java.util.function.Supplier;
import org.apache.lucene.index.QueryTimeout;
import org.apache.solr.common.params.CommonParams;
import org.apache.solr.request.SolrQueryRequest;
Expand Down Expand Up @@ -108,22 +110,31 @@ public String formatExceptionMessage(String label) {
* @throws QueryLimitsExceededException if {@link CommonParams#PARTIAL_RESULTS} request parameter
* is false and limits have been reached.
*/
public boolean maybeExitWithPartialResults(String label) throws QueryLimitsExceededException {
public boolean maybeExitWithPartialResults(Supplier<String> label)
throws QueryLimitsExceededException {
if (isLimitsEnabled() && shouldExit()) {
if (allowPartialResults) {
if (rsp != null) {
rsp.setPartialResults();
rsp.addPartialResponseDetail(formatExceptionMessage(label));
if (rsp.getResponseHeader().get(RESPONSE_HEADER_PARTIAL_RESULTS_DETAILS_KEY) == null) {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This loses information about the exceeded limits inside some components, when we accept partial results (ie. exception is not thrown immediately). Why wouldn't duplicate keys be ok? NamedList is essentially a multi-map so multiple keys of the same value are explicitly allowed.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Well typically before I did this we would get the initial limit violation, and then at least one component violation which isn't really interesting because the first one is where we really stopped. Also there are tests that check the contents of partialResultsDetail using JSONTestUtil.match() which doesn't seem to have a syntax for handling duplicate keys (or if it does I can't figure out what it is), and also if someone out there has been using a Jackson Object Mapper or similar to turn the header into an object (or JSON.parse() in javascript where they'll loose one anyway) things could go sideways if they suddenly encounter either a duplicate key or an array there... I can provide a rant about the fact we allow/use duplicate keys in the first place, how we should be referring to our APIs as noggit APIs not JSON APIs because we accept illegal JSON... etc... If you're interested :)

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Hmm, ok... then maybe we should add some processing similar to computeShardCpuTime to aggregate multiple details from shard responses into a single value?

// don't want to add duplicate keys. Although technically legal, there's a strong risk
// that clients won't anticipate it and break.
rsp.addPartialResponseDetail(formatExceptionMessage(label.get()));
}
}
return true;
} else {
throw new QueryLimitsExceededException(formatExceptionMessage(label));
throw new QueryLimitsExceededException(formatExceptionMessage(label.get()));
}
} else {
return false;
}
}

public boolean maybeExitWithPartialResults(String label) throws QueryLimitsExceededException {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think we should add some javadoc here that explains why we have two different methods for doing essentially the same work, and when to prefer one over the other.

return maybeExitWithPartialResults(() -> label);
}

/**
* Method to diagnose limit exceeded. Note that while this should always list the exceeded limit,
* it may also nominate additional limits that have been exceeded since the actual check that
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,7 @@
*/
public class ExitableDirectoryReaderTest extends SolrTestCaseJ4 {

static int NUM_DOCS = 100;
static final int NUM_DOCS = 100;
static final String assertionString = "/response/numFound==" + NUM_DOCS;
static final String failureAssertionString = "/responseHeader/partialResults==true]";
static final String longTimeout = "10000";
Expand Down Expand Up @@ -70,9 +70,12 @@ public void testPrefixQuery() throws Exception {

// this time we should get a query cache hit and hopefully no exception? this may change in the
// future if time checks are put into other places.
assertJQ(req("q", q, "timeAllowed", "1", "sleep", sleep), assertionString);
// 2024-4-15: it did change..., and now this fails with 1 or 2 ms and passes with 3ms... I see
// no way this won't be terribly brittle. Maybe TestInjection of some sort to bring this back?
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Definitely the answer :) please see how it's used in TestQueryLimits.

// assertJQ(req("q", q, "timeAllowed", "2", "sleep", sleep), assertionString);

// now do the same for the filter cache
// 2024-4-15: this still passes probably because *:* is so fast, but it still worries me
assertJQ(req("q", "*:*", "fq", q, "timeAllowed", "1", "sleep", sleep), failureAssertionString);

// make sure that the result succeeds this time, and that a bad filter wasn't cached
Expand Down
Loading