From ab53b43a1ba47353b0a005779df98cc201dcdd86 Mon Sep 17 00:00:00 2001 From: stevenewald Date: Thu, 7 Nov 2024 13:29:28 -0600 Subject: [PATCH] Add FPU support in scheduler --- include/scheduler.hpp | 16 ++++---- src/scheduler.cpp | 88 ++++++++++++++++++++++++++++--------------- 2 files changed, 65 insertions(+), 39 deletions(-) diff --git a/include/scheduler.hpp b/include/scheduler.hpp index cd669dc9..1e104ac4 100644 --- a/include/scheduler.hpp +++ b/include/scheduler.hpp @@ -13,10 +13,9 @@ namespace edge { extern "C" { void PendSV_Handler(); void SVC_Handler(); -void restore(); } -struct saved_registers { +struct stack_registers { unsigned R0{}; unsigned R1{}; unsigned R2{}; @@ -24,11 +23,11 @@ struct saved_registers { unsigned R12{}; unsigned LR{}; unsigned RETURN_ADDR{}; - unsigned FLAG{}; + unsigned CTRL{0x01000000}; + unsigned FP_REGS[16]{}; + unsigned FPSCR{}; - saved_registers(unsigned return_addr, unsigned flag = 0x01000000) : - RETURN_ADDR(return_addr), FLAG(flag) - {} + stack_registers(unsigned return_addr) : RETURN_ADDR(return_addr) {} }; class Scheduler { @@ -41,7 +40,7 @@ class Scheduler { // ===== DO NOT REARRANGE THESE ===== etl::array stack{}; - saved_registers first_stack_frame; + stack_registers first_stack_frame; // ================================== public: @@ -51,7 +50,7 @@ class Scheduler { // This is useful if we want to adjust the ratio of driver to process runtime uint8_t consecutive_quantums_to_run; - Task(const saved_registers& initial_stack_frame, uint8_t initial_priority) : + Task(const stack_registers& initial_stack_frame, uint8_t initial_priority) : first_stack_frame(initial_stack_frame), consecutive_quantums_to_run(initial_priority) {} @@ -74,7 +73,6 @@ class Scheduler { friend void PendSV_Handler(void); friend void SVC_Handler(void); - friend void restore(void); }; extern Scheduler scheduler; diff --git a/src/scheduler.cpp b/src/scheduler.cpp index 94fc2c68..df46c177 100644 --- a/src/scheduler.cpp +++ b/src/scheduler.cpp @@ -26,7 
+26,7 @@ void Scheduler::start_scheduler() void Scheduler::add_task(void (*function)(void), uint8_t priority) { task_stack.emplace_back( - saved_registers{reinterpret_cast(function)}, priority + stack_registers{reinterpret_cast(function)}, priority ); } @@ -61,8 +61,9 @@ __attribute__((naked, used)) void PendSV_Handler() asm volatile("CPSID I"); if (--scheduler.slices_remaining == 0) { asm volatile("mrs r0,psp\n" - "sub r0,#32\n" - "stm r0!,{r4-r11}"); + "sub r0,#96\n" + "stm r0!,{r4-r11}\n" + "vstm r0!, {s16-s31}"); // This function will dirty registers. That's okay scheduler.task_stack[scheduler.current_task_index].stack_ptr_loc = @@ -81,8 +82,9 @@ __attribute__((naked, used)) void PendSV_Handler() )); asm volatile("mrs r0,psp\n" - "sub r0,#32\n" - "ldm r0!,{r4-r11}\n"); + "sub r0,#96\n" + "ldm r0!,{r4-r11}\n" + "vldm r0!, {s16-s31}"); } asm volatile("CPSIE I"); @@ -90,7 +92,8 @@ __attribute__((naked, used)) void PendSV_Handler() // Always want to call drivers on context switch drivers::do_async_work(); - asm volatile("ldr r0,=0xfffffffd\n" + // Return in thumb/process mode and restore using extended stack frame + asm volatile("ldr r0,=0xffffffed\n" "bx r0"); } @@ -101,24 +104,38 @@ __attribute__((used)) void SysTick_Handler() } // Runs in userspace after async callback has finished +// I don't think there's any way to make this cleaner lol __attribute__((used, naked)) void restore_regs() { - asm volatile("pop {r0, r1, r2, r3, r12, lr}\npop {pc}"); -} - -// Returns old flag -unsigned move_registers_up(saved_registers* registers) -{ - auto old_flag = registers->FLAG; - registers->FLAG = registers->RETURN_ADDR; - registers->RETURN_ADDR = registers->LR; - registers->LR = registers->R12; - registers->R12 = registers->R3; - registers->R3 = registers->R2; - registers->R2 = registers->R1; - registers->R1 = registers->R0; - registers->R0 = 0; - return old_flag; + // Load fpscr first so we can avoid dirtying r0 after it's popped + asm volatile("ldr r0, [sp, #96]\n" + "vmsr 
fpscr, r0\n"); + + // Pop regs as usual + asm volatile("pop {r0, r1, r2, r3, r12, lr}"); + + // Skip PC and RETPSR. PC will be loaded last + // RETPSR should be ignored because we already popped when returning from exception + asm volatile("add sp, #8"); + + // Pop caller-saved FP registers + asm volatile("vpop {s0-s15}"); + + // *sigh* + // We need to account for whether sp is 4- or 8-byte aligned + // diagram: https://shorturl.at/85lyY + asm volatile("push {r0}\n" + "mrs r0, psp\n" + "tst r0, #0x4\n" + "pop {r0}\n" + "ite eq\n" + "ADDEQ SP, #8\n" + "ADDNE SP, #4"); + + // Skip FPSCR (already loaded) + asm volatile("add sp, #4"); + + asm volatile("ldr pc, [sp, #-84]"); } void Scheduler::yield_current_task() @@ -135,21 +152,32 @@ void Scheduler::yield_current_task() // This stack frame, originally created by the exception handler, will be popped // by restore() - t.stack_ptr_loc = reinterpret_cast(__get_PSP()); - auto stored_registers = reinterpret_cast(t.stack_ptr_loc); - stored_registers->RETURN_ADDR++; - - unsigned old_flag = move_registers_up(stored_registers); + t.stack_ptr_loc = (unsigned*)__get_PSP(); + auto stored_registers = reinterpret_cast(t.stack_ptr_loc); + + // Account for stack pointer alignment + // diagram: https://shorturl.at/85lyY + bool eight_byte_aligned = ((unsigned)t.stack_ptr_loc) & 0x7; + if (eight_byte_aligned) { + t.stack_ptr_loc -= 1; + } + else { + t.stack_ptr_loc -= 2; + } // "Push" registers, create a fake stack frame // This will be popped by the exception handler - t.stack_ptr_loc -= 7; - auto new_registers = reinterpret_cast(t.stack_ptr_loc); + t.stack_ptr_loc -= (sizeof(stack_registers) / sizeof(unsigned)); + auto new_registers = reinterpret_cast(t.stack_ptr_loc); new_registers->R0 = static_cast(arg1); new_registers->R1 = static_cast(arg2); + + // Return to our restore_regs function so we can pop the caller-saved registers of + // the saved/previous execution path new_registers->LR = reinterpret_cast(&restore_regs); 
new_registers->RETURN_ADDR = reinterpret_cast(callback_address); - new_registers->FLAG = old_flag; + + new_registers->CTRL = stored_registers->CTRL; __set_PSP(reinterpret_cast(t.stack_ptr_loc)); }