-
Notifications
You must be signed in to change notification settings - Fork 692
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
SOLR-17151 - stop processing components once we have exceeded a query limit #2403
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -20,8 +20,16 @@ | |
import static org.apache.solr.common.params.CommonParams.FAILURE; | ||
import static org.apache.solr.common.params.CommonParams.PATH; | ||
import static org.apache.solr.common.params.CommonParams.STATUS; | ||
import static org.apache.solr.handler.component.ResponseBuilder.STAGE_DONE; | ||
import static org.apache.solr.handler.component.ResponseBuilder.STAGE_EXECUTE_QUERY; | ||
import static org.apache.solr.handler.component.ResponseBuilder.STAGE_GET_FIELDS; | ||
import static org.apache.solr.handler.component.ResponseBuilder.STAGE_PARSE_QUERY; | ||
import static org.apache.solr.handler.component.ResponseBuilder.STAGE_START; | ||
import static org.apache.solr.handler.component.ResponseBuilder.STAGE_TOP_GROUPS; | ||
import static org.apache.solr.request.SolrRequestInfo.getQueryLimits; | ||
|
||
import com.codahale.metrics.Counter; | ||
import java.io.IOException; | ||
import java.io.PrintWriter; | ||
import java.io.StringWriter; | ||
import java.lang.invoke.MethodHandles; | ||
|
@@ -33,6 +41,7 @@ | |
import java.util.Set; | ||
import java.util.concurrent.ConcurrentHashMap; | ||
import java.util.concurrent.atomic.AtomicLong; | ||
import java.util.stream.Collectors; | ||
import org.apache.lucene.index.ExitableDirectoryReader; | ||
import org.apache.lucene.search.TotalHits; | ||
import org.apache.solr.client.solrj.SolrRequest.SolrRequestType; | ||
|
@@ -422,18 +431,74 @@ public void handleRequestBody(SolrQueryRequest req, SolrQueryResponse rsp) throw | |
return; // Circuit breaker tripped, return immediately | ||
} | ||
|
||
processComponents(req, rsp, rb, timer, components); | ||
|
||
// SOLR-5550: still provide shards.info if requested even for a short-circuited distrib request | ||
if (!rb.isDistrib | ||
&& req.getParams().getBool(ShardParams.SHARDS_INFO, false) | ||
&& rb.shortCircuitedURL != null) { | ||
NamedList<Object> shardInfo = new SimpleOrderedMap<>(); | ||
SimpleOrderedMap<Object> nl = new SimpleOrderedMap<>(); | ||
if (rsp.getException() != null) { | ||
Throwable cause = rsp.getException(); | ||
if (cause instanceof SolrServerException) { | ||
cause = ((SolrServerException) cause).getRootCause(); | ||
} else { | ||
if (cause.getCause() != null) { | ||
cause = cause.getCause(); | ||
} | ||
} | ||
nl.add("error", cause.toString()); | ||
if (!core.getCoreContainer().hideStackTrace()) { | ||
StringWriter trace = new StringWriter(); | ||
cause.printStackTrace(new PrintWriter(trace)); | ||
nl.add("trace", trace.toString()); | ||
} | ||
} else if (rb.getResults() != null) { | ||
nl.add("numFound", rb.getResults().docList.matches()); | ||
nl.add( | ||
"numFoundExact", | ||
rb.getResults().docList.hitCountRelation() == TotalHits.Relation.EQUAL_TO); | ||
nl.add("maxScore", rb.getResults().docList.maxScore()); | ||
} | ||
nl.add("shardAddress", rb.shortCircuitedURL); | ||
nl.add("time", req.getRequestTimer().getTime()); // elapsed time of this request so far | ||
|
||
int pos = rb.shortCircuitedURL.indexOf("://"); | ||
String shardInfoName = | ||
pos != -1 ? rb.shortCircuitedURL.substring(pos + 3) : rb.shortCircuitedURL; | ||
shardInfo.add(shardInfoName, nl); | ||
rsp.getValues().add(ShardParams.SHARDS_INFO, shardInfo); | ||
} | ||
} | ||
|
||
private void processComponents( | ||
SolrQueryRequest req, | ||
SolrQueryResponse rsp, | ||
ResponseBuilder rb, | ||
RTimerTree timer, | ||
List<SearchComponent> components) | ||
throws IOException { | ||
// creates a ShardHandler object only if it's needed | ||
final ShardHandler shardHandler1 = getAndPrepShardHandler(req, rb); | ||
|
||
if (timer == null) { | ||
// non-debugging prepare phase | ||
for (SearchComponent c : components) { | ||
if (checkLimitsBefore(c, "prepare", rb.req, rb.rsp, components)) { | ||
shortCircuitedResults(req, rb); | ||
return; | ||
} | ||
c.prepare(rb); | ||
} | ||
} else { | ||
// debugging prepare phase | ||
RTimerTree subt = timer.sub("prepare"); | ||
for (SearchComponent c : components) { | ||
if (checkLimitsBefore(c, "prepare debug", rb.req, rb.rsp, components)) { | ||
shortCircuitedResults(req, rb); | ||
return; | ||
} | ||
rb.setTimer(subt.sub(c.getName())); | ||
c.prepare(rb); | ||
rb.getTimer().stop(); | ||
|
@@ -463,12 +528,20 @@ public void handleRequestBody(SolrQueryRequest req, SolrQueryResponse rsp) throw | |
if (!rb.isDebug()) { | ||
// Process | ||
for (SearchComponent c : components) { | ||
if (checkLimitsBefore(c, "process", rb.req, rb.rsp, components)) { | ||
shortCircuitedResults(req, rb); | ||
return; | ||
} | ||
c.process(rb); | ||
} | ||
} else { | ||
// Process | ||
RTimerTree subt = timer.sub("process"); | ||
for (SearchComponent c : components) { | ||
if (checkLimitsBefore(c, "process debug", rb.req, rb.rsp, components)) { | ||
shortCircuitedResults(req, rb); | ||
return; | ||
} | ||
rb.setTimer(subt.sub(c.getName())); | ||
c.process(rb); | ||
rb.getTimer().stop(); | ||
|
@@ -482,21 +555,7 @@ public void handleRequestBody(SolrQueryRequest req, SolrQueryResponse rsp) throw | |
} | ||
} catch (ExitableDirectoryReader.ExitingReaderException ex) { | ||
log.warn("Query: {}; ", req.getParamString(), ex); | ||
if (rb.rsp.getResponse() == null) { | ||
rb.rsp.addResponse(new SolrDocumentList()); | ||
|
||
// If a cursorMark was passed, and we didn't progress, set | ||
// the nextCursorMark to the same position | ||
String cursorStr = rb.req.getParams().get(CursorMarkParams.CURSOR_MARK_PARAM); | ||
if (null != cursorStr) { | ||
rb.rsp.add(CursorMarkParams.CURSOR_MARK_NEXT, cursorStr); | ||
} | ||
} | ||
if (rb.isDebug()) { | ||
NamedList<Object> debug = new NamedList<>(); | ||
debug.add("explain", new NamedList<>()); | ||
rb.rsp.add("debug", debug); | ||
} | ||
shortCircuitedResults(req, rb); | ||
rb.rsp.setPartialResults(); | ||
} | ||
} else { | ||
|
@@ -610,6 +669,10 @@ public void handleRequestBody(SolrQueryRequest req, SolrQueryResponse rsp) throw | |
} | ||
|
||
for (SearchComponent c : components) { | ||
if (checkLimitsBefore( | ||
c, "finish stage:" + stageInEnglish(nextStage), rb.req, rb.rsp, components)) { | ||
return; | ||
} | ||
c.finishStage(rb); | ||
} | ||
|
||
|
@@ -621,46 +684,65 @@ public void handleRequestBody(SolrQueryRequest req, SolrQueryResponse rsp) throw | |
rsp.addToLog(ThreadCpuTimer.CPU_TIME, totalShardCpuTime); | ||
} | ||
} | ||
} | ||
|
||
// SOLR-5550: still provide shards.info if requested even for a short circuited distrib request | ||
if (!rb.isDistrib | ||
&& req.getParams().getBool(ShardParams.SHARDS_INFO, false) | ||
&& rb.shortCircuitedURL != null) { | ||
NamedList<Object> shardInfo = new SimpleOrderedMap<>(); | ||
SimpleOrderedMap<Object> nl = new SimpleOrderedMap<>(); | ||
if (rsp.getException() != null) { | ||
Throwable cause = rsp.getException(); | ||
if (cause instanceof SolrServerException) { | ||
cause = ((SolrServerException) cause).getRootCause(); | ||
} else { | ||
if (cause.getCause() != null) { | ||
cause = cause.getCause(); | ||
} | ||
} | ||
nl.add("error", cause.toString()); | ||
if (!core.getCoreContainer().hideStackTrace()) { | ||
StringWriter trace = new StringWriter(); | ||
cause.printStackTrace(new PrintWriter(trace)); | ||
nl.add("trace", trace.toString()); | ||
} | ||
} else if (rb.getResults() != null) { | ||
nl.add("numFound", rb.getResults().docList.matches()); | ||
nl.add( | ||
"numFoundExact", | ||
rb.getResults().docList.hitCountRelation() == TotalHits.Relation.EQUAL_TO); | ||
nl.add("maxScore", rb.getResults().docList.maxScore()); | ||
} | ||
nl.add("shardAddress", rb.shortCircuitedURL); | ||
nl.add("time", req.getRequestTimer().getTime()); // elapsed time of this request so far | ||
private static String stageInEnglish(int nextStage) { | ||
// This should probably be a enum, but that change should be its own ticket. | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. +1 for enum (in a separate PR) |
||
switch (nextStage) { | ||
case STAGE_START: | ||
return "START"; | ||
case STAGE_PARSE_QUERY: | ||
return "PARSE_QUERY"; | ||
case STAGE_TOP_GROUPS: | ||
return "TOP_GROUPS"; | ||
case STAGE_EXECUTE_QUERY: | ||
return "EXECUTE_QUERY"; | ||
case STAGE_GET_FIELDS: | ||
return "GET_FIELDS"; | ||
// nobody wants to think it was DONE and canceled after it completed... | ||
case STAGE_DONE: | ||
return "FINISHING"; | ||
default: | ||
throw new SolrException( | ||
SolrException.ErrorCode.SERVER_ERROR, "Unrecognized stage:" + nextStage); | ||
} | ||
} | ||
|
||
int pos = rb.shortCircuitedURL.indexOf("://"); | ||
String shardInfoName = | ||
pos != -1 ? rb.shortCircuitedURL.substring(pos + 3) : rb.shortCircuitedURL; | ||
shardInfo.add(shardInfoName, nl); | ||
rsp.getValues().add(ShardParams.SHARDS_INFO, shardInfo); | ||
private static void shortCircuitedResults(SolrQueryRequest req, ResponseBuilder rb) { | ||
|
||
if (rb.rsp.getResponse() == null) { | ||
rb.rsp.addResponse(new SolrDocumentList()); | ||
|
||
// If a cursorMark was passed, and we didn't progress, set | ||
// the nextCursorMark to the same position | ||
String cursorStr = rb.req.getParams().get(CursorMarkParams.CURSOR_MARK_PARAM); | ||
if (null != cursorStr) { | ||
rb.rsp.add(CursorMarkParams.CURSOR_MARK_NEXT, cursorStr); | ||
} | ||
} | ||
if (rb.isDebug()) { | ||
NamedList<Object> debug = new NamedList<>(); | ||
debug.add("explain", new NamedList<>()); | ||
rb.rsp.add("debug", debug); | ||
} | ||
} | ||
|
||
private static boolean checkLimitsBefore( | ||
SearchComponent c, | ||
String when, | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Hmm, this name looks confusing ... maybe There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. stage is what we use for the finishing steps... maybe phase? |
||
SolrQueryRequest req, | ||
SolrQueryResponse resp, | ||
List<SearchComponent> components) { | ||
|
||
return getQueryLimits(req, resp) | ||
.maybeExitWithPartialResults( | ||
() -> { | ||
gus-asf marked this conversation as resolved.
Show resolved
Hide resolved
|
||
List<String> names = | ||
components.stream().map(SearchComponent::getName).collect(Collectors.toList()); | ||
return "[" + when + "] Limit(s) exceeded prior to " + c.getName() + " in " + names; | ||
}); | ||
} | ||
|
||
private long computeShardCpuTime(List<ShardResponse> responses) { | ||
long totalShardCpuTime = 0; | ||
for (ShardResponse response : responses) { | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -16,11 +16,13 @@ | |
*/ | ||
package org.apache.solr.search; | ||
|
||
import static org.apache.solr.response.SolrQueryResponse.RESPONSE_HEADER_PARTIAL_RESULTS_DETAILS_KEY; | ||
import static org.apache.solr.search.CpuAllowedLimit.hasCpuLimit; | ||
import static org.apache.solr.search.TimeAllowedLimit.hasTimeLimit; | ||
|
||
import java.util.ArrayList; | ||
import java.util.List; | ||
import java.util.function.Supplier; | ||
import org.apache.lucene.index.QueryTimeout; | ||
import org.apache.solr.common.params.CommonParams; | ||
import org.apache.solr.request.SolrQueryRequest; | ||
|
@@ -108,22 +110,31 @@ public String formatExceptionMessage(String label) { | |
* @throws QueryLimitsExceededException if {@link CommonParams#PARTIAL_RESULTS} request parameter | ||
* is false and limits have been reached. | ||
*/ | ||
public boolean maybeExitWithPartialResults(String label) throws QueryLimitsExceededException { | ||
public boolean maybeExitWithPartialResults(Supplier<String> label) | ||
throws QueryLimitsExceededException { | ||
if (isLimitsEnabled() && shouldExit()) { | ||
if (allowPartialResults) { | ||
if (rsp != null) { | ||
rsp.setPartialResults(); | ||
rsp.addPartialResponseDetail(formatExceptionMessage(label)); | ||
if (rsp.getResponseHeader().get(RESPONSE_HEADER_PARTIAL_RESULTS_DETAILS_KEY) == null) { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This loses information about the exceeded limits inside some components, when we accept partial results (ie. exception is not thrown immediately). Why wouldn't duplicate keys be ok? NamedList is essentially a multi-map so multiple keys of the same value are explicitly allowed. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Well typically before I did this we would get the initial limit violation, and then at least one component violation which isn't really interesting because the first one is where we really stopped. Also there are tests that check the contents of partialResultsDetail using There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Hmm, ok... then maybe we should add some processing similar to |
||
// don't want to add duplicate keys. Although technically legal, there's a strong risk | ||
// that clients won't anticipate it and break. | ||
rsp.addPartialResponseDetail(formatExceptionMessage(label.get())); | ||
} | ||
} | ||
return true; | ||
} else { | ||
throw new QueryLimitsExceededException(formatExceptionMessage(label)); | ||
throw new QueryLimitsExceededException(formatExceptionMessage(label.get())); | ||
} | ||
} else { | ||
return false; | ||
} | ||
} | ||
|
||
public boolean maybeExitWithPartialResults(String label) throws QueryLimitsExceededException { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I think we should add some javadoc here that explains why we have two different methods for doing essentially the same work, and when to prefer one over the other. |
||
return maybeExitWithPartialResults(() -> label); | ||
} | ||
|
||
/** | ||
* Method to diagnose limit exceeded. Note that while this should always list the exceeded limit, | ||
* it may also nominate additional limits that have been exceeded since the actual check that | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -32,7 +32,7 @@ | |
*/ | ||
public class ExitableDirectoryReaderTest extends SolrTestCaseJ4 { | ||
|
||
static int NUM_DOCS = 100; | ||
static final int NUM_DOCS = 100; | ||
static final String assertionString = "/response/numFound==" + NUM_DOCS; | ||
static final String failureAssertionString = "/responseHeader/partialResults==true]"; | ||
static final String longTimeout = "10000"; | ||
|
@@ -70,9 +70,12 @@ public void testPrefixQuery() throws Exception { | |
|
||
// this time we should get a query cache hit and hopefully no exception? this may change in the | ||
// future if time checks are put into other places. | ||
assertJQ(req("q", q, "timeAllowed", "1", "sleep", sleep), assertionString); | ||
// 2024-4-15: it did change..., and now this fails with 1 or 2 ms and passes with 3ms... I see | ||
// no way this won't be terribly brittle. Maybe TestInjection of some sort to bring this back? | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Definitely the answer :) please see how it's used in |
||
// assertJQ(req("q", q, "timeAllowed", "2", "sleep", sleep), assertionString); | ||
|
||
// now do the same for the filter cache | ||
// 2024-4-15: this still passes probably because *:* is so fast, but it still worries me | ||
assertJQ(req("q", "*:*", "fq", q, "timeAllowed", "1", "sleep", sleep), failureAssertionString); | ||
|
||
// make sure that the result succeeds this time, and that a bad filter wasn't cached | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Nice! that's very useful info for troubleshooting.