Skip to content

Commit

Permalink
winsys/amdgpu: Limit usage of query_reset_state2
Browse files Browse the repository at this point in the history
Following the discussion on the kernel mailing list [1], we gain nothing
from using amdgpu_cs_query_reset_state2 to detect whether a reset has
occurred, and it does not handle soft recovery.

We already learn about the context loss, and its cause, from the error code
returned when we submit a command stream.

Instead, use amdgpu_cs_query_reset_state2 only to determine whether a reset
we already know about has completed.

[1]: https://lists.freedesktop.org/archives/amd-gfx/2024-January/103337.html

Signed-off-by: Joshua Ashton <[email protected]>

Reviewed-by: André Almeida <[email protected]>
Reviewed-by: Pierre-Eric Pelloux-Prayer <[email protected]>
Reviewed-by: Marek Olšák <[email protected]>
  • Loading branch information
misyltoad committed Jan 18, 2024
1 parent 08f7fb3 commit 706ac1d
Showing 1 changed file with 47 additions and 69 deletions.
116 changes: 47 additions & 69 deletions src/gallium/winsys/amdgpu/drm/amdgpu_cs.c
Original file line number Diff line number Diff line change
Expand Up @@ -468,89 +468,67 @@ amdgpu_ctx_query_reset_status(struct radeon_winsys_ctx *rwctx, bool full_reset_o
bool *needs_reset, bool *reset_completed)
{
struct amdgpu_ctx *ctx = (struct amdgpu_ctx*)rwctx;
int r;

if (needs_reset)
*needs_reset = false;
if (reset_completed)
*reset_completed = false;

/* Return a failure due to a GPU hang. */
if (ctx->ws->info.drm_minor >= 24) {
uint64_t flags;

if (full_reset_only && ctx->sw_status == PIPE_NO_RESET) {
/* If the caller is only interested in full reset (= wants to ignore soft
* recoveries), we can use the rejected cs count as a quick first check.
*/
return PIPE_NO_RESET;
}
uint64_t flags;

r = amdgpu_cs_query_reset_state2(ctx->ctx, &flags);
if (r) {
fprintf(stderr, "amdgpu: amdgpu_cs_query_reset_state2 failed. (%i)\n", r);
return PIPE_NO_RESET;
}
if (full_reset_only && ctx->sw_status == PIPE_NO_RESET) {
/* If the caller is only interested in full reset (= wants to ignore soft
* recoveries), we can use the rejected cs count as a quick first check.
*/
return PIPE_NO_RESET;
}

if (flags & AMDGPU_CTX_QUERY2_FLAGS_RESET) {
if (reset_completed) {
/* The ARB_robustness spec says:
*
* If a reset status other than NO_ERROR is returned and subsequent
* calls return NO_ERROR, the context reset was encountered and
* completed. If a reset status is repeatedly returned, the context may
* be in the process of resetting.
*
* Starting with drm_minor >= 54 amdgpu reports if the reset is complete,
* so don't do anything special. On older kernels, submit a no-op cs. If it
* succeeds then assume the reset is complete.
*/
if (!(flags & AMDGPU_CTX_QUERY2_FLAGS_RESET_IN_PROGRESS))
*reset_completed = true;

if (ctx->ws->info.drm_minor < 54 && ctx->ws->info.has_graphics)
*reset_completed = amdgpu_submit_gfx_nop(ctx) == 0;
/*
* ctx->sw_status is updated on alloc/ioctl failures.
*
* We only rely on amdgpu_cs_query_reset_state2 to tell us
* that the context reset is complete.
*/
if (ctx->sw_status != PIPE_NO_RESET) {
int r = amdgpu_cs_query_reset_state2(ctx->ctx, &flags);
if (!r) {
if (flags & AMDGPU_CTX_QUERY2_FLAGS_RESET) {
if (reset_completed) {
/* The ARB_robustness spec says:
*
* If a reset status other than NO_ERROR is returned and subsequent
* calls return NO_ERROR, the context reset was encountered and
* completed. If a reset status is repeatedly returned, the context may
* be in the process of resetting.
*
* Starting with drm_minor >= 54 amdgpu reports if the reset is complete,
* so don't do anything special. On older kernels, submit a no-op cs. If it
* succeeds then assume the reset is complete.
*/
if (!(flags & AMDGPU_CTX_QUERY2_FLAGS_RESET_IN_PROGRESS))
*reset_completed = true;

if (ctx->ws->info.drm_minor < 54 && ctx->ws->info.has_graphics)
*reset_completed = amdgpu_submit_gfx_nop(ctx) == 0;
}
}

if (needs_reset)
*needs_reset = flags & AMDGPU_CTX_QUERY2_FLAGS_VRAMLOST;
if (flags & AMDGPU_CTX_QUERY2_FLAGS_GUILTY)
return PIPE_GUILTY_CONTEXT_RESET;
else
return PIPE_INNOCENT_CONTEXT_RESET;
}
} else {
uint32_t result, hangs;

r = amdgpu_cs_query_reset_state(ctx->ctx, &result, &hangs);
if (r) {
fprintf(stderr, "amdgpu: amdgpu_cs_query_reset_state failed. (%i)\n", r);
return PIPE_NO_RESET;
}

if (needs_reset)
*needs_reset = true;
switch (result) {
case AMDGPU_CTX_GUILTY_RESET:
return PIPE_GUILTY_CONTEXT_RESET;
case AMDGPU_CTX_INNOCENT_RESET:
return PIPE_INNOCENT_CONTEXT_RESET;
case AMDGPU_CTX_UNKNOWN_RESET:
return PIPE_UNKNOWN_CONTEXT_RESET;
} else {
fprintf(stderr, "amdgpu: amdgpu_cs_query_reset_state2 failed. (%i)\n", r);
}
}

/* Return a failure due to SW issues. */
if (ctx->sw_status != PIPE_NO_RESET) {
/* Return a failure due to SW issues. */
if (needs_reset)
*needs_reset = true;
return ctx->sw_status;
}

if (needs_reset)
*needs_reset = false;
return PIPE_NO_RESET;
}


/* COMMAND SUBMISSION */

static bool amdgpu_cs_has_user_fence(struct amdgpu_cs_context *cs)
Expand Down Expand Up @@ -1784,19 +1762,19 @@ static void amdgpu_cs_submit_ib(void *job, void *gdata, int thread_index)
if (unlikely(r)) {
if (r == -ECANCELED) {
amdgpu_ctx_set_sw_reset_status((struct radeon_winsys_ctx*)acs->ctx, PIPE_INNOCENT_CONTEXT_RESET,
"amdgpu: The CS has cancelled because the context is lost. This context is innocent.\n");
"amdgpu: The CS has cancelled because the context is lost. This context is innocent.\n");
} else if (r == -ENODATA) {
amdgpu_ctx_set_sw_reset_status((struct radeon_winsys_ctx*)acs->ctx, PIPE_GUILTY_CONTEXT_RESET,
"amdgpu: The CS has cancelled because the context is lost. This context is guilty of a soft recovery.\n");
"amdgpu: The CS has cancelled because the context is lost. This context is guilty of a soft recovery.\n");
} else if (r == -ETIME) {
amdgpu_ctx_set_sw_reset_status((struct radeon_winsys_ctx*)acs->ctx, PIPE_GUILTY_CONTEXT_RESET,
"amdgpu: The CS has cancelled because the context is lost. This context is guilty of a hard recovery.\n");
"amdgpu: The CS has cancelled because the context is lost. This context is guilty of a hard recovery.\n");
} else {
amdgpu_ctx_set_sw_reset_status((struct radeon_winsys_ctx*)acs->ctx,
PIPE_UNKNOWN_CONTEXT_RESET,
"amdgpu: The CS has been rejected, "
"see dmesg for more information (%i).\n",
r);
PIPE_UNKNOWN_CONTEXT_RESET,
"amdgpu: The CS has been rejected, "
"see dmesg for more information (%i).\n",
r);
}
}

Expand Down

0 comments on commit 706ac1d

Please sign in to comment.