From 503e280d9fd099fdecec3fca5a91c01bfe812c19 Mon Sep 17 00:00:00 2001 From: Asahi Lina Date: Tue, 24 Sep 2024 00:08:43 +0900 Subject: [PATCH 1/7] drm/asahi: Workqueue: Add more debug Signed-off-by: Asahi Lina --- drivers/gpu/drm/asahi/workqueue.rs | 21 +++++++++++++++++---- 1 file changed, 17 insertions(+), 4 deletions(-) diff --git a/drivers/gpu/drm/asahi/workqueue.rs b/drivers/gpu/drm/asahi/workqueue.rs index d88e38642e9e8a..6010feba79135c 100644 --- a/drivers/gpu/drm/asahi/workqueue.rs +++ b/drivers/gpu/drm/asahi/workqueue.rs @@ -372,7 +372,7 @@ impl Job::ver { Some(work.get_fence()) } else { pr_err!( - "WorkQueue: Cannot submit, but queue is empty? {} > {}, {} > {} (pend={} ls={:#x?} lc={:#x?})\n", + "WorkQueue: Cannot submit, but queue is empty? {} > {}, {} > {} (pend={} ls={:#x?} lc={:#x?}) ev={:#x?}\n", inner.free_slots(), self.event_count, inner.free_space(), @@ -380,6 +380,7 @@ impl Job::ver { inner.pending.len(), inner.last_submitted, inner.last_completed, + inner.event.as_ref().map(|a| a.1), ); None } @@ -510,7 +511,7 @@ impl Drop for Job::ver { if self.committed && !self.submitted { let pipe_type = inner.pipe_type; let event = inner.event.as_mut().expect("Job lost its event"); - mod_pr_debug!( + pr_info!( "WorkQueue({:?}): Roll back {} events (slot {} val {:#x?}) and {} commands\n", pipe_type, self.event_count, @@ -546,8 +547,8 @@ impl<'a> Drop for JobSubmission::ver<'a> { let pipe_type = inner.pipe_type; let event = inner.event.as_mut().expect("JobSubmission lost its event"); - mod_pr_debug!( - "WorkQueue({:?}): Roll back {} events (slot {} val {:#x?}) and {} commands\n", + pr_info!( + "WorkQueue({:?}): JobSubmission: Roll back {} events (slot {} val {:#x?}) and {} commands\n", pipe_type, self.event_count, event.0.slot(), @@ -771,6 +772,18 @@ impl WorkQueue for WorkQueue::ver { Some(event) => event.0.current(), }; + if let Some(lc) = inner.last_completed { + if value < lc { + pr_err!( + "WorkQueue: event rolled back? 
cur {:#x?}, lc {:#x?}, ls {:#x?}", + value, + inner.last_completed, + inner.last_submitted + ); + } + } else { + pr_crit!("WorkQueue: signal() called with no last_completed.\n"); + } inner.last_completed = Some(value); mod_pr_debug!( From c08e3efad7fabe9ab89ec8cee7a72e5c70309b43 Mon Sep 17 00:00:00 2001 From: Asahi Lina Date: Tue, 24 Sep 2024 00:32:54 +0900 Subject: [PATCH 2/7] drm/asahi: Fix event tracking when JobSubmission is dropped Signed-off-by: Asahi Lina --- drivers/gpu/drm/asahi/workqueue.rs | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/gpu/drm/asahi/workqueue.rs b/drivers/gpu/drm/asahi/workqueue.rs index 6010feba79135c..ab77fa12ad3f55 100644 --- a/drivers/gpu/drm/asahi/workqueue.rs +++ b/drivers/gpu/drm/asahi/workqueue.rs @@ -556,8 +556,10 @@ impl<'a> Drop for JobSubmission::ver<'a> { self.command_count ); event.1.sub(self.event_count as u32); + let val = event.1; inner.commit_seq -= self.command_count as u64; inner.event_seq -= self.event_count as u64; + inner.last_submitted = Some(val); mod_pr_debug!("WorkQueue({:?}): Dropped JobSubmission\n", inner.pipe_type); } } From 28ad75b26a7c09aeabfc11b3a934d11868141766 Mon Sep 17 00:00:00 2001 From: Asahi Lina Date: Tue, 24 Sep 2024 03:27:57 +0900 Subject: [PATCH 3/7] fixup! 
drm/asahi: run rustfmt --- drivers/gpu/drm/asahi/file.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/asahi/file.rs b/drivers/gpu/drm/asahi/file.rs index 5a5f90bd18a35e..5baf5f54631a9a 100644 --- a/drivers/gpu/drm/asahi/file.rs +++ b/drivers/gpu/drm/asahi/file.rs @@ -383,7 +383,7 @@ impl File { ualloc, ualloc_priv, vm, - kernel_range, + kernel_range, _dummy_mapping: dummy_mapping, }, GFP_KERNEL, From e6690004fe5b747e455a757f9cea972df37b1d33 Mon Sep 17 00:00:00 2001 From: Asahi Lina Date: Tue, 24 Sep 2024 03:28:40 +0900 Subject: [PATCH 4/7] drm/asahi: gpu: Show unknown field in timeouts Signed-off-by: Asahi Lina --- drivers/gpu/drm/asahi/channel.rs | 5 ++--- drivers/gpu/drm/asahi/gpu.rs | 5 +++-- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/drivers/gpu/drm/asahi/channel.rs b/drivers/gpu/drm/asahi/channel.rs index e46c17f98a146f..709fb3d1dbe128 100644 --- a/drivers/gpu/drm/asahi/channel.rs +++ b/drivers/gpu/drm/asahi/channel.rs @@ -353,10 +353,10 @@ impl EventChannel::ver { }, EventMsg::Timeout { counter, + unk_8, event_slot, - .. } => match self.gpu.as_ref() { - Some(gpu) => gpu.handle_timeout(counter, event_slot), + Some(gpu) => gpu.handle_timeout(counter, event_slot, unk_8), None => { dev_crit!(self.dev, "EventChannel: No GPU manager available!\n") } @@ -374,7 +374,6 @@ impl EventChannel::ver { vm_slot, buffer_slot, counter, - .. } => match self.gpu.as_ref() { Some(gpu) => { self.buf_mgr.grow(buffer_slot); diff --git a/drivers/gpu/drm/asahi/gpu.rs b/drivers/gpu/drm/asahi/gpu.rs index 0d489ef4ce7b61..06f95d7c2c9699 100644 --- a/drivers/gpu/drm/asahi/gpu.rs +++ b/drivers/gpu/drm/asahi/gpu.rs @@ -253,7 +253,7 @@ pub(crate) trait GpuManager: Send + Sync { /// TODO: Does this actually work? fn flush_fw_cache(&self) -> Result; /// Handle a GPU work timeout event. 
- fn handle_timeout(&self, counter: u32, event_slot: i32); + fn handle_timeout(&self, counter: u32, event_slot: i32, unk: u32); /// Handle a GPU fault event. fn handle_fault(&self); /// Acknowledge a Buffer grow op. @@ -1294,7 +1294,7 @@ impl GpuManager for GpuManager::ver { &self.ids } - fn handle_timeout(&self, counter: u32, event_slot: i32) { + fn handle_timeout(&self, counter: u32, event_slot: i32, unk: u32) { dev_err!(self.dev, " (\\________/) \n"); dev_err!(self.dev, " | | \n"); dev_err!(self.dev, "'.| \\ , / |.'\n"); @@ -1304,6 +1304,7 @@ impl GpuManager for GpuManager::ver { dev_err!(self.dev, "** GPU timeout nya~!!!!! **\n"); dev_err!(self.dev, " Event slot: {}\n", event_slot); dev_err!(self.dev, " Timeout count: {}\n", counter); + dev_err!(self.dev, " Unk: {}\n", unk); // If we have fault info, consider it a fault. let error = match self.get_fault_info() { From ef96bc48972d827d5bf3f857f1255db0e098fd94 Mon Sep 17 00:00:00 2001 From: Asahi Lina Date: Tue, 24 Sep 2024 03:30:36 +0900 Subject: [PATCH 5/7] fixup! 
drm/asahi: Add the Asahi driver for Apple AGX GPUs --- drivers/gpu/drm/asahi/fw/channels.rs | 7 ++++--- drivers/gpu/drm/asahi/fw/initdata.rs | 14 +++++++------- drivers/gpu/drm/asahi/fw/workqueue.rs | 4 ++-- drivers/gpu/drm/asahi/gpu.rs | 6 ++++++ drivers/gpu/drm/asahi/initdata.rs | 10 +++++----- drivers/gpu/drm/asahi/workqueue.rs | 4 ++-- 6 files changed, 26 insertions(+), 19 deletions(-) diff --git a/drivers/gpu/drm/asahi/fw/channels.rs b/drivers/gpu/drm/asahi/fw/channels.rs index 85bfc1cec0a255..f48020c75be8bc 100644 --- a/drivers/gpu/drm/asahi/fw/channels.rs +++ b/drivers/gpu/drm/asahi/fw/channels.rs @@ -173,13 +173,16 @@ pub(crate) enum DeviceControlMsg { Unk0a(Array), Unk0b(Array), Unk0c(Array), + #[ver(V >= V13_3)] + Unk0d(Array), GrowTVBAck { unk_4: u32, buffer_slot: u32, vm_slot: u32, counter: u32, subpipe: u32, - __pad: Pad<{ DEVICECONTROL_SZ::ver - 0x14 }>, + halt_count: U64, + __pad: Pad<{ DEVICECONTROL_SZ::ver - 0x1c }>, }, Unk0e(Array), Unk0f(Array), @@ -190,8 +193,6 @@ pub(crate) enum DeviceControlMsg { Unk14(Array), // Init? 
Unk15(Array), // Enable something Unk16(Array), // Disable something - #[ver(V >= V13_3)] - Unk17(Array), DestroyContext { unk_4: u32, ctx_23: u8, diff --git a/drivers/gpu/drm/asahi/fw/initdata.rs b/drivers/gpu/drm/asahi/fw/initdata.rs index 4c53846e81607a..d81a9b6b9df044 100644 --- a/drivers/gpu/drm/asahi/fw/initdata.rs +++ b/drivers/gpu/drm/asahi/fw/initdata.rs @@ -30,8 +30,8 @@ pub(crate) mod raw { #[derive(Debug, Default)] #[repr(C)] pub(crate) struct FwStatusFlags { - pub(crate) halt_count: AtomicU32, - __pad0: Pad<0xc>, + pub(crate) halt_count: AtomicU64, + __pad0: Pad<0x8>, pub(crate) halted: AtomicU32, __pad1: Pad<0xc>, pub(crate) resume: AtomicU32, @@ -1159,9 +1159,9 @@ pub(crate) mod raw { pub(crate) unk_10e88: Array<0x188, u8>, pub(crate) idle_ts: U64, pub(crate) idle_unk: U64, - pub(crate) unk_11020: u32, - pub(crate) unk_11024: u32, - pub(crate) unk_11028: u32, + pub(crate) progress_check_interval_3d: u32, + pub(crate) progress_check_interval_ta: u32, + pub(crate) progress_check_interval_cl: u32, #[ver(V >= V13_0B4)] pub(crate) unk_1102c_0: u32, @@ -1202,10 +1202,10 @@ pub(crate) mod raw { #[ver(V >= V13_3)] pub(crate) unk_118e0_9c_x: Array<0x8, u8>, - pub(crate) unk_118e0: u32, + pub(crate) cl_context_switch_timeout_ms: u32, #[ver(V >= V13_0B4)] - pub(crate) unk_118e4_0: u32, + pub(crate) cl_kill_timeout_ms: u32, pub(crate) cdm_context_store_latency_threshold: u32, pub(crate) unk_118e8: u32, diff --git a/drivers/gpu/drm/asahi/fw/workqueue.rs b/drivers/gpu/drm/asahi/fw/workqueue.rs index 8ad8bb1b0eee22..9ffa55e7c5a741 100644 --- a/drivers/gpu/drm/asahi/fw/workqueue.rs +++ b/drivers/gpu/drm/asahi/fw/workqueue.rs @@ -132,6 +132,8 @@ pub(crate) mod raw { pub(crate) unk_58: U64, pub(crate) busy: AtomicU32, pub(crate) __pad: Pad<0x20>, + #[ver(V >= V13_2 && G < G14X)] + pub(crate) unk_84_0: u32, pub(crate) unk_84_state: AtomicU32, pub(crate) unk_88: u32, pub(crate) unk_8c: u32, @@ -139,8 +141,6 @@ pub(crate) mod raw { pub(crate) unk_94: u32, pub(crate) 
pending: AtomicU32, pub(crate) unk_9c: u32, - #[ver(V >= V13_2 && G < G14X)] - pub(crate) unk_a0_0: u32, pub(crate) gpu_context: GpuPointer<'a, super::GpuContextData>, pub(crate) unk_a8: U64, #[ver(V >= V13_2 && G < G14X)] diff --git a/drivers/gpu/drm/asahi/gpu.rs b/drivers/gpu/drm/asahi/gpu.rs index 06f95d7c2c9699..87065d1dc97a70 100644 --- a/drivers/gpu/drm/asahi/gpu.rs +++ b/drivers/gpu/drm/asahi/gpu.rs @@ -1332,12 +1332,18 @@ impl GpuManager for GpuManager::ver { } fn ack_grow(&self, buffer_slot: u32, vm_slot: u32, counter: u32) { + let halt_count = self + .initdata + .fw_status + .with(|raw, _inner| raw.flags.halt_count.load(Ordering::Relaxed)); + let dc = fw::channels::DeviceControlMsg::ver::GrowTVBAck { unk_4: 1, buffer_slot, vm_slot, counter, subpipe: 0, // TODO + halt_count: U64(halt_count), __pad: Default::default(), }; diff --git a/drivers/gpu/drm/asahi/initdata.rs b/drivers/gpu/drm/asahi/initdata.rs index a3fd7a87ab79de..d8573af9aec860 100644 --- a/drivers/gpu/drm/asahi/initdata.rs +++ b/drivers/gpu/drm/asahi/initdata.rs @@ -704,9 +704,9 @@ impl<'a> InitDataBuilder::ver<'a> { unk_903c: 0, fault_control: *crate::fault_control.read(), do_init: 1, - unk_11020: 40, - unk_11024: 10, - unk_11028: 250, + progress_check_interval_3d: 40, + progress_check_interval_ta: 10, + progress_check_interval_cl: 250, #[ver(V >= V13_0B4)] unk_1102c_0: 1, #[ver(V >= V13_0B4)] @@ -718,9 +718,9 @@ impl<'a> InitDataBuilder::ver<'a> { idle_off_delay_ms: AtomicU32::new(pwr.idle_off_delay_ms), fender_idle_off_delay_ms: pwr.fender_idle_off_delay_ms, fw_early_wake_timeout_ms: pwr.fw_early_wake_timeout_ms, - unk_118e0: 40, + cl_context_switch_timeout_ms: 40, #[ver(V >= V13_0B4)] - unk_118e4_0: 50, + cl_kill_timeout_ms: 50, #[ver(V >= V13_0B4)] unk_11edc: 0, #[ver(V >= V13_0B4)] diff --git a/drivers/gpu/drm/asahi/workqueue.rs b/drivers/gpu/drm/asahi/workqueue.rs index ab77fa12ad3f55..348b09668b63ea 100644 --- a/drivers/gpu/drm/asahi/workqueue.rs +++ b/drivers/gpu/drm/asahi/workqueue.rs 
@@ -636,6 +636,8 @@ impl WorkQueue::ver { unk_58: Default::default(), busy: Default::default(), __pad: Default::default(), + #[ver(V >= V13_2 && G < G14X)] + unk_84_0: 0, unk_84_state: Default::default(), unk_88: 0, unk_8c: 0, @@ -643,8 +645,6 @@ impl WorkQueue::ver { unk_94: 0, pending: Default::default(), unk_9c: 0, - #[ver(V >= V13_2 && G < G14X)] - unk_a0_0: 0, gpu_context: inner.gpu_context.gpu_pointer(), unk_a8: Default::default(), #[ver(V >= V13_2 && G < G14X)] From 20398d9f824e17f3a8615b79a816b92fa1249b76 Mon Sep 17 00:00:00 2001 From: Asahi Lina Date: Tue, 24 Sep 2024 03:32:28 +0900 Subject: [PATCH 6/7] fixup! drm/asahi: Add the Asahi driver UAPI --- include/uapi/drm/asahi_drm.h | 1 + 1 file changed, 1 insertion(+) diff --git a/include/uapi/drm/asahi_drm.h b/include/uapi/drm/asahi_drm.h index af2e8b5801b53e..2ac3857c0cfa27 100644 --- a/include/uapi/drm/asahi_drm.h +++ b/include/uapi/drm/asahi_drm.h @@ -544,6 +544,7 @@ enum drm_asahi_status { DRM_ASAHI_STATUS_FAULT, DRM_ASAHI_STATUS_KILLED, DRM_ASAHI_STATUS_NO_DEVICE, + DRM_ASAHI_STATUS_CHANNEL_ERROR, }; enum drm_asahi_fault { From c881c4075b2f0b7d1c5b15e87eb52f0aafa86a36 Mon Sep 17 00:00:00 2001 From: Asahi Lina Date: Tue, 24 Sep 2024 03:32:47 +0900 Subject: [PATCH 7/7] drm/asahi: Handle channel errors Signed-off-by: Asahi Lina --- drivers/gpu/drm/asahi/channel.rs | 25 ++++++++ drivers/gpu/drm/asahi/debug.rs | 1 + drivers/gpu/drm/asahi/event.rs | 10 ++++ drivers/gpu/drm/asahi/fw/channels.rs | 34 +++++++++-- drivers/gpu/drm/asahi/fw/workqueue.rs | 2 +- drivers/gpu/drm/asahi/gpu.rs | 86 ++++++++++++++++++++++++++- drivers/gpu/drm/asahi/workqueue.rs | 57 +++++++++++++++--- 7 files changed, 201 insertions(+), 14 deletions(-) diff --git a/drivers/gpu/drm/asahi/channel.rs b/drivers/gpu/drm/asahi/channel.rs index 709fb3d1dbe128..990f6469de52b1 100644 --- a/drivers/gpu/drm/asahi/channel.rs +++ b/drivers/gpu/drm/asahi/channel.rs @@ -383,6 +383,31 @@ impl EventChannel::ver { dev_crit!(self.dev, "EventChannel: No GPU 
manager available!\n") } }, + EventMsg::ChannelError { + error_type, + pipe_type, + event_slot, + event_value, + } => match self.gpu.as_ref() { + Some(gpu) => { + let error_type = match error_type { + 0 => ChannelErrorType::MemoryError, + 1 => ChannelErrorType::DMKill, + 2 => ChannelErrorType::Aborted, + 3 => ChannelErrorType::Unk3, + a => ChannelErrorType::Unknown(a), + }; + gpu.handle_channel_error( + error_type, + pipe_type, + event_slot, + event_value, + ); + } + None => { + dev_crit!(self.dev, "EventChannel: No GPU manager available!\n") + } + }, msg => { dev_crit!(self.dev, "Unknown event message: {:?}\n", msg); } diff --git a/drivers/gpu/drm/asahi/debug.rs b/drivers/gpu/drm/asahi/debug.rs index ab8490bd536bcc..e4b06d3853d87d 100644 --- a/drivers/gpu/drm/asahi/debug.rs +++ b/drivers/gpu/drm/asahi/debug.rs @@ -66,6 +66,7 @@ pub(crate) enum DebugFlags { Debug6 = 54, Debug7 = 55, + VerboseFaults = 61, AllowUnknownOverrides = 62, OopsOnGpuCrash = 63, } diff --git a/drivers/gpu/drm/asahi/event.rs b/drivers/gpu/drm/asahi/event.rs index 9e17ca0e1d7a26..563b62d9fe28a4 100644 --- a/drivers/gpu/drm/asahi/event.rs +++ b/drivers/gpu/drm/asahi/event.rs @@ -216,6 +216,16 @@ impl EventManager { } } + /// Returns a reference to the workqueue owning an event. + pub(crate) fn get_owner( + &self, + slot: u32, + ) -> Option> { + self.alloc + .with_inner(|inner| inner.owners[slot as usize].as_ref().cloned()) + .map(|a| a.clone()) + } + /// Fail all commands, used when the GPU crashes. 
pub(crate) fn fail_all(&self, error: workqueue::WorkError) { let mut owners: Vec> = Vec::new(); diff --git a/drivers/gpu/drm/asahi/fw/channels.rs b/drivers/gpu/drm/asahi/fw/channels.rs index f48020c75be8bc..cf1f1ec4eddd77 100644 --- a/drivers/gpu/drm/asahi/fw/channels.rs +++ b/drivers/gpu/drm/asahi/fw/channels.rs @@ -184,8 +184,16 @@ pub(crate) enum DeviceControlMsg { halt_count: U64, __pad: Pad<{ DEVICECONTROL_SZ::ver - 0x1c }>, }, - Unk0e(Array), - Unk0f(Array), + RecoverChannel { + pipe_type: u32, + work_queue: GpuWeakPointer, + event_value: u32, + __pad: Pad<{ DEVICECONTROL_SZ::ver - 0x10 }>, + }, + IdlePowerOff { + val: u32, + __pad: Pad<{ DEVICECONTROL_SZ::ver - 0x4 }>, + }, Unk10(Array), Unk11(Array), Unk12(Array), @@ -236,6 +244,17 @@ pub(crate) struct FwCtlMsg { pub(crate) const EVENT_SZ: usize = 0x34; +#[derive(Debug, Copy, Clone, PartialEq, Eq)] +#[repr(C, u32)] +#[allow(dead_code)] +pub(crate) enum ChannelErrorType { + MemoryError, + DMKill, + Aborted, + Unk3, + Unknown(u32), +} + #[derive(Debug, Copy, Clone)] #[repr(C, u32)] #[allow(dead_code)] @@ -258,12 +277,19 @@ pub(crate) enum EventMsg { vm_slot: u32, buffer_slot: u32, counter: u32, - }, // Max discriminant: 0x7 + }, + ChannelError { + error_type: u32, + pipe_type: u32, + event_slot: u32, + event_value: u32, + }, + // Max discriminant: 0x8 } static_assert!(core::mem::size_of::() == 4 + EVENT_SZ); -pub(crate) const EVENT_MAX: u32 = 0x7; +pub(crate) const EVENT_MAX: u32 = 0x8; #[derive(Copy, Clone)] #[repr(C)] diff --git a/drivers/gpu/drm/asahi/fw/workqueue.rs b/drivers/gpu/drm/asahi/fw/workqueue.rs index 9ffa55e7c5a741..1b55d8cb6ca273 100644 --- a/drivers/gpu/drm/asahi/fw/workqueue.rs +++ b/drivers/gpu/drm/asahi/fw/workqueue.rs @@ -135,7 +135,7 @@ pub(crate) mod raw { #[ver(V >= V13_2 && G < G14X)] pub(crate) unk_84_0: u32, pub(crate) unk_84_state: AtomicU32, - pub(crate) unk_88: u32, + pub(crate) error_count: AtomicU32, pub(crate) unk_8c: u32, pub(crate) unk_90: u32, pub(crate) unk_94: u32, diff 
--git a/drivers/gpu/drm/asahi/gpu.rs b/drivers/gpu/drm/asahi/gpu.rs index 87065d1dc97a70..ca27ce9c0f4c9d 100644 --- a/drivers/gpu/drm/asahi/gpu.rs +++ b/drivers/gpu/drm/asahi/gpu.rs @@ -35,7 +35,7 @@ use kernel::{ use crate::alloc::Allocator; use crate::debug::*; use crate::driver::{AsahiDevRef, AsahiDevice}; -use crate::fw::channels::PipeType; +use crate::fw::channels::{ChannelErrorType, PipeType}; use crate::fw::types::{U32, U64}; use crate::{ alloc, buffer, channel, event, fw, gem, hw, initdata, mem, mmu, queue, regs, workqueue, @@ -256,6 +256,14 @@ pub(crate) trait GpuManager: Send + Sync { fn handle_timeout(&self, counter: u32, event_slot: i32, unk: u32); /// Handle a GPU fault event. fn handle_fault(&self); + /// Handle a channel error event. + fn handle_channel_error( + &self, + error_type: ChannelErrorType, + pipe_type: u32, + event_slot: u32, + event_value: u32, + ); /// Acknowledge a Buffer grow op. fn ack_grow(&self, buffer_slot: u32, vm_slot: u32, counter: u32); /// Wait for the GPU to become idle and power off. 
@@ -1331,6 +1339,82 @@ impl GpuManager for GpuManager::ver { self.recover(); } + fn handle_channel_error( + &self, + error_type: ChannelErrorType, + pipe_type: u32, + event_slot: u32, + event_value: u32, + ) { + dev_err!(self.dev, " (\\________/) \n"); + dev_err!(self.dev, " | | \n"); + dev_err!(self.dev, "'.| \\ , / |.'\n"); + dev_err!(self.dev, "--| / (( \\ |--\n"); + dev_err!(self.dev, ".'| _-_- |'.\n"); + dev_err!(self.dev, " |________| \n"); + dev_err!(self.dev, "GPU channel error nya~!!!!!\n"); + dev_err!(self.dev, " Error type: {:?}\n", error_type); + dev_err!(self.dev, " Pipe type: {}\n", pipe_type); + dev_err!(self.dev, " Event slot: {}\n", event_slot); + dev_err!(self.dev, " Event value: {:#x?}\n", event_value); + + self.event_manager.mark_error( + event_slot, + event_value, + workqueue::WorkError::ChannelError(error_type), + ); + + let wq = match self.event_manager.get_owner(event_slot) { + Some(wq) => wq, + None => { + dev_err!(self.dev, "Workqueue not found for this event slot!\n"); + return; + } + }; + + let wq = match wq.as_any().downcast_ref::() { + Some(wq) => wq, + None => { + dev_crit!(self.dev, "GpuManager mismatched with WorkQueue!\n"); + return; + } + }; + + if debug_enabled(DebugFlags::VerboseFaults) { + wq.dump_info(); + } + + let dc = fw::channels::DeviceControlMsg::ver::RecoverChannel { + pipe_type, + work_queue: wq.info_pointer(), + event_value, + __pad: Default::default(), + }; + + mod_dev_dbg!(self.dev, "Recover Channel command: {:?}\n", &dc); + let mut txch = self.tx_channels.lock(); + + let token = txch.device_control.send(&dc); + { + let mut guard = self.rtkit.lock(); + let rtk = guard.as_mut().unwrap(); + if rtk + .send_message(EP_DOORBELL, MSG_TX_DOORBELL | DOORBELL_DEVCTRL) + .is_err() + { + dev_err!(self.dev, "Failed to send Recover Channel command\n"); + } + } + + if txch.device_control.wait_for(token).is_err() { + dev_err!(self.dev, "Timed out waiting for Recover Channel command\n"); + } + + if 
debug_enabled(DebugFlags::VerboseFaults) { + wq.dump_info(); + } + } + fn ack_grow(&self, buffer_slot: u32, vm_slot: u32, counter: u32) { let halt_count = self .initdata diff --git a/drivers/gpu/drm/asahi/workqueue.rs b/drivers/gpu/drm/asahi/workqueue.rs index 348b09668b63ea..5e4084e3d0dd0f 100644 --- a/drivers/gpu/drm/asahi/workqueue.rs +++ b/drivers/gpu/drm/asahi/workqueue.rs @@ -14,13 +14,14 @@ //! up its associated event. use crate::debug::*; -use crate::fw::channels::PipeType; +use crate::fw::channels::{ChannelErrorType, PipeType}; use crate::fw::types::*; use crate::fw::workqueue::*; use crate::no_debug; use crate::object::OpaqueGpuObject; use crate::regs::FaultReason; use crate::{channel, driver, event, fw, gpu, object, regs}; +use core::any::Any; use core::num::NonZeroU64; use core::sync::atomic::Ordering; use kernel::{ @@ -48,6 +49,8 @@ pub(crate) enum WorkError { Fault(regs::FaultInfo), /// Work failed due to an error caused by other concurrent GPU work. Killed, + /// Channel error + ChannelError(ChannelErrorType), /// The GPU crashed. NoDevice, /// Unknown reason. 
@@ -79,6 +82,9 @@ impl From for uapi::drm_asahi_result_info { status: match a { WorkError::Timeout => uapi::drm_asahi_status_DRM_ASAHI_STATUS_TIMEOUT, WorkError::Killed => uapi::drm_asahi_status_DRM_ASAHI_STATUS_KILLED, + WorkError::ChannelError(_) => { + uapi::drm_asahi_status_DRM_ASAHI_STATUS_CHANNEL_ERROR + } WorkError::NoDevice => uapi::drm_asahi_status_DRM_ASAHI_STATUS_NO_DEVICE, _ => uapi::drm_asahi_status_DRM_ASAHI_STATUS_UNKNOWN_ERROR, }, @@ -97,6 +103,7 @@ impl From for kernel::error::Error { WorkError::Unknown => ENODATA, WorkError::Killed => ECANCELED, WorkError::NoDevice => ENODEV, + WorkError::ChannelError(_) => EIO, } } } @@ -601,20 +608,26 @@ impl WorkQueue::ver { size: u32, ) -> Result> { let gpu_buf = alloc.private.array_empty_tagged(0x2c18, b"GPBF")?; - let shared = &mut alloc.shared; + let mut state = alloc.shared.new_default::()?; + let ring = alloc.shared.array_empty(size as usize)?; let inner = WorkQueueInner::ver { dev: dev.into(), event_manager, - info: alloc.private.new_init( + // Use shared (coherent) state with verbose faults so we can dump state correctly + info: if debug_enabled(DebugFlags::VerboseFaults) { + &mut alloc.shared + } else { + &mut alloc.private + } + .new_init( try_init!(QueueInfo::ver { state: { - let mut s = shared.new_default::()?; - s.with_mut(|raw, _inner| { + state.with_mut(|raw, _inner| { raw.rb_size = size; }); - s + state }, - ring: shared.array_empty(size as usize)?, + ring, gpu_buf, notifier_list: notifier_list, gpu_context: gpu_context, @@ -639,7 +652,7 @@ impl WorkQueue::ver { #[ver(V >= V13_2 && G < G14X)] unk_84_0: 0, unk_84_state: Default::default(), - unk_88: 0, + error_count: Default::default(), unk_8c: 0, unk_90: 0, unk_94: 0, @@ -744,11 +757,35 @@ impl WorkQueue::ver { pub(crate) fn pipe_type(&self) -> PipeType { self.inner.lock().pipe_type } + + pub(crate) fn dump_info(&self) { + pr_info!("WorkQueue @ {:?}:\n", self.info_pointer); + self.inner.lock().info.with(|raw, _inner| { + pr_info!(" GPU rptr1: 
{:#x}\n", raw.gpu_rptr1.load(Ordering::Relaxed)); + pr_info!(" GPU rptr2: {:#x}\n", raw.gpu_rptr2.load(Ordering::Relaxed)); + pr_info!(" GPU rptr3: {:#x}\n", raw.gpu_rptr3.load(Ordering::Relaxed)); + pr_info!(" Event ID: {:#x}\n", raw.event_id.load(Ordering::Relaxed)); + pr_info!(" Busy: {:#x}\n", raw.busy.load(Ordering::Relaxed)); + pr_info!(" Unk 84: {:#x}\n", raw.unk_84_state.load(Ordering::Relaxed)); + pr_info!( + " Error count: {:#x}\n", + raw.error_count.load(Ordering::Relaxed) + ); + pr_info!(" Pending: {:#x}\n", raw.pending.load(Ordering::Relaxed)); + }); + } + + pub(crate) fn info_pointer(&self) -> GpuWeakPointer { + self.info_pointer + } } /// Trait used to erase the version-specific type of WorkQueues, to avoid leaking /// version-specificity into the event module. pub(crate) trait WorkQueue { + /// Cast as an Any type. + fn as_any(&self) -> &dyn Any; + fn signal(&self) -> bool; fn mark_error(&self, value: event::EventValue, error: WorkError); fn fail_all(&self, error: WorkError); @@ -756,6 +793,10 @@ pub(crate) trait WorkQueue { #[versions(AGX)] impl WorkQueue for WorkQueue::ver { + fn as_any(&self) -> &dyn Any { + self + } + /// Signal a workqueue that some work was completed. /// /// This will check the event stamp value to find out exactly how many commands were processed.