From c0621d07604863d945d6fd9a597b610f22a97e7e Mon Sep 17 00:00:00 2001 From: wheremyfoodat <44909372+wheremyfoodat@users.noreply.github.com> Date: Wed, 3 Jan 2024 00:39:36 +0200 Subject: [PATCH] WIP arm64 shader recompiler --- .gitmodules | 3 + CMakeLists.txt | 5 + include/PICA/dynapica/shader_rec.hpp | 4 +- .../dynapica/shader_rec_emitter_arm64.hpp | 129 ++++++++++ .../dynapica/shader_rec_emitter_arm64.cpp | 239 ++++++++++++++++++ .../PICA/dynapica/shader_rec_emitter_x64.cpp | 6 +- third_party/oaknut | 1 + 7 files changed, 383 insertions(+), 4 deletions(-) create mode 100644 include/PICA/dynapica/shader_rec_emitter_arm64.hpp create mode 100644 src/core/PICA/dynapica/shader_rec_emitter_arm64.cpp create mode 160000 third_party/oaknut diff --git a/.gitmodules b/.gitmodules index 428ca1d13..3735d0cb1 100644 --- a/.gitmodules +++ b/.gitmodules @@ -46,3 +46,6 @@ [submodule "third_party/zep"] path = third_party/zep url = https://github.com/Panda3DS-emu/zep +[submodule "third_party/oaknut"] + path = third_party/oaknut + url = https://github.com/merryhime/oaknut diff --git a/CMakeLists.txt b/CMakeLists.txt index 732ec793e..b78fe0ad1 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -128,6 +128,9 @@ endif() # Check for arm64 if (CMAKE_SYSTEM_PROCESSOR STREQUAL "aarch64" OR CMAKE_SYSTEM_PROCESSOR STREQUAL "arm64") set(HOST_ARM64 TRUE) + add_subdirectory(third_party/oaknut) # Add Oaknut submodule for arm64 JITs + include_directories(third_party/oaknut/include) + add_compile_definitions(PANDA3DS_DYNAPICA_SUPPORTED) add_compile_definitions(PANDA3DS_ARM64_HOST) else() set(HOST_ARM64 FALSE) @@ -172,6 +175,7 @@ set(SERVICE_SOURCE_FILES src/core/services/service_manager.cpp src/core/services set(PICA_SOURCE_FILES src/core/PICA/gpu.cpp src/core/PICA/regs.cpp src/core/PICA/shader_unit.cpp src/core/PICA/shader_interpreter.cpp src/core/PICA/dynapica/shader_rec.cpp src/core/PICA/dynapica/shader_rec_emitter_x64.cpp src/core/PICA/pica_hash.cpp + 
src/core/PICA/dynapica/shader_rec_emitter_arm64.cpp ) set(LOADER_SOURCE_FILES src/core/loader/elf.cpp src/core/loader/ncsd.cpp src/core/loader/ncch.cpp src/core/loader/3dsx.cpp src/core/loader/lz77.cpp) @@ -236,6 +240,7 @@ set(HEADER_FILES include/emulator.hpp include/helpers.hpp include/termcolor.hpp include/services/news_u.hpp include/applets/software_keyboard.hpp include/applets/applet_manager.hpp include/fs/archive_user_save_data.hpp include/services/amiibo_device.hpp include/services/nfc_types.hpp include/swap.hpp include/services/csnd.hpp include/services/nwm_uds.hpp include/fs/archive_system_save_data.hpp include/lua_manager.hpp include/memory_mapped_file.hpp include/hydra_icon.hpp + include/PICA/dynapica/shader_rec_emitter_arm64.hpp ) cmrc_add_resource_library( diff --git a/include/PICA/dynapica/shader_rec.hpp b/include/PICA/dynapica/shader_rec.hpp index e8b6afede..2dabc128c 100644 --- a/include/PICA/dynapica/shader_rec.hpp +++ b/include/PICA/dynapica/shader_rec.hpp @@ -1,13 +1,15 @@ #pragma once #include "PICA/shader.hpp" -#if defined(PANDA3DS_DYNAPICA_SUPPORTED) && defined(PANDA3DS_X64_HOST) +#if defined(PANDA3DS_DYNAPICA_SUPPORTED) && (defined(PANDA3DS_X64_HOST) || defined(PANDA3DS_ARM64_HOST)) #define PANDA3DS_SHADER_JIT_SUPPORTED #include #include #ifdef PANDA3DS_X64_HOST #include "shader_rec_emitter_x64.hpp" +#elif defined(PANDA3DS_ARM64_HOST) +#include "shader_rec_emitter_arm64.hpp" #endif #endif diff --git a/include/PICA/dynapica/shader_rec_emitter_arm64.hpp b/include/PICA/dynapica/shader_rec_emitter_arm64.hpp new file mode 100644 index 000000000..4dd211741 --- /dev/null +++ b/include/PICA/dynapica/shader_rec_emitter_arm64.hpp @@ -0,0 +1,129 @@ +#pragma once + +// Only do anything if we're on an arm64 target with JIT support enabled +#if defined(PANDA3DS_DYNAPICA_SUPPORTED) && defined(PANDA3DS_ARM64_HOST) +#include +#include +#include + +#include "PICA/shader.hpp" +#include "helpers.hpp" +#include "logger.hpp" + +class ShaderEmitter : private 
oaknut::CodeBlock, public oaknut::CodeGenerator { + static constexpr size_t executableMemorySize = PICAShader::maxInstructionCount * 96; // How much executable memory to alloc for each shader + // Allocate some extra space as padding for security purposes in the extremely unlikely occasion we manage to overflow the above size + static constexpr size_t allocSize = executableMemorySize + 0x1000; + + // If the swizzle field is this value then the swizzle pattern is .xyzw so we don't need a shuffle + static constexpr uint noSwizzle = 0x1B; + + using f24 = Floats::f24; + using vec4f = std::array; + + // An array of labels (incl pointers) to each compiled (to arm64) PICA instruction + std::array instructionLabels; + // A vector of PCs that can potentially return based on the state of the PICA callstack. + // Filled before compiling a shader by scanning the code for call instructions + std::vector returnPCs; + + // Vector value of (1.0, 1.0, 1.0, 1.0) for SLT(i)/SGE(i) + oaknut::Label onesVector; + + u32 recompilerPC = 0; // PC the recompiler is currently recompiling @ + u32 loopLevel = 0; // The current loop nesting level (0 = not in a loop) + + // Shows whether the loaded shader has any log2 and exp2 instructions + bool codeHasLog2 = false; + bool codeHasExp2 = false; + + oaknut::Label log2Func, exp2Func; + oaknut::Label emitLog2Func(); + oaknut::Label emitExp2Func(); + + // Compile all instructions from [current recompiler PC, end) + void compileUntil(const PICAShader& shaderUnit, u32 endPC); + // Compile the instruction at the current recompiler PC + void compileInstruction(const PICAShader& shaderUnit); + + bool isCall(u32 instruction) { + const u32 opcode = instruction >> 26; + return (opcode == ShaderOpcodes::CALL) || (opcode == ShaderOpcodes::CALLC) || (opcode == ShaderOpcodes::CALLU); + } + + // Scan the shader code for call instructions to fill up the returnPCs vector before starting compilation + // We also scan for log2/exp2 instructions to see whether to emit the relevant functions + 
void scanCode(const PICAShader& shaderUnit); + + // Load register with number "src" indexed by index "idx" into the arm64 register "dest" + template + void loadRegister(oaknut::QReg dest, const PICAShader& shader, u32 src, u32 idx, u32 operandDescriptor); + void storeRegister(oaknut::QReg source, const PICAShader& shader, u32 dest, u32 operandDescriptor); + + const vec4f& getSourceRef(const PICAShader& shader, u32 src); + const vec4f& getDestRef(const PICAShader& shader, u32 dest); + + // Check the value of the cmp register for instructions like ifc and callc + // Result is returned in the zero flag. If the comparison is true then zero == 1, else zero == 0 + void checkCmpRegister(const PICAShader& shader, u32 instruction); + + // Check the value of the bool uniform for instructions like ifu and callu + // Result is returned in the zero flag. If the comparison is true then zero == 0, else zero == 1 (Opposite of checkCmpRegister) + void checkBoolUniform(const PICAShader& shader, u32 instruction); + + // Instruction recompilation functions + void recADD(const PICAShader& shader, u32 instruction); + void recCALL(const PICAShader& shader, u32 instruction); + void recCALLC(const PICAShader& shader, u32 instruction); + void recCALLU(const PICAShader& shader, u32 instruction); + void recCMP(const PICAShader& shader, u32 instruction); + void recDP3(const PICAShader& shader, u32 instruction); + void recDP4(const PICAShader& shader, u32 instruction); + void recDPH(const PICAShader& shader, u32 instruction); + void recEMIT(const PICAShader& shader, u32 instruction); + void recEND(const PICAShader& shader, u32 instruction); + void recEX2(const PICAShader& shader, u32 instruction); + void recFLR(const PICAShader& shader, u32 instruction); + void recIFC(const PICAShader& shader, u32 instruction); + void recIFU(const PICAShader& shader, u32 instruction); + void recJMPC(const PICAShader& shader, u32 instruction); + void recJMPU(const PICAShader& shader, u32 instruction); + void 
recLG2(const PICAShader& shader, u32 instruction); + void recLOOP(const PICAShader& shader, u32 instruction); + void recMAD(const PICAShader& shader, u32 instruction); + void recMAX(const PICAShader& shader, u32 instruction); + void recMIN(const PICAShader& shader, u32 instruction); + void recMOVA(const PICAShader& shader, u32 instruction); + void recMOV(const PICAShader& shader, u32 instruction); + void recMUL(const PICAShader& shader, u32 instruction); + void recRCP(const PICAShader& shader, u32 instruction); + void recRSQ(const PICAShader& shader, u32 instruction); + void recSETEMIT(const PICAShader& shader, u32 instruction); + void recSGE(const PICAShader& shader, u32 instruction); + void recSLT(const PICAShader& shader, u32 instruction); + + MAKE_LOG_FUNCTION(log, shaderJITLogger) + + public: + // Callback type used for instructions + using InstructionCallback = const void (*)(PICAShader& shaderUnit); + // Callback type used for the JIT prologue. This is what the caller will call + using PrologueCallback = const void (*)(PICAShader& shaderUnit, InstructionCallback cb); + + PrologueCallback prologueCb = nullptr; + + // Initialize our emitter with "allocSize" bytes of memory allocated for the code buffer + ShaderEmitter() : oaknut::CodeBlock(allocSize), oaknut::CodeGenerator(oaknut::CodeBlock::ptr()) {} + + // PC must be a valid entrypoint here. It doesn't have that much overhead in this case, so we use std::array<>::at() to assert it does + InstructionCallback getInstructionCallback(u32 pc) { + // Cast away the constness because casting to a function pointer is hard otherwise. 
Legal as long as we don't write to *ptr + uint8_t* ptr = instructionLabels.at(pc).ptr(); + return reinterpret_cast(ptr); + } + + PrologueCallback getPrologueCallback() { return prologueCb; } + void compile(const PICAShader& shaderUnit); +}; + +#endif // arm64 recompiler check \ No newline at end of file diff --git a/src/core/PICA/dynapica/shader_rec_emitter_arm64.cpp b/src/core/PICA/dynapica/shader_rec_emitter_arm64.cpp new file mode 100644 index 000000000..750adc81a --- /dev/null +++ b/src/core/PICA/dynapica/shader_rec_emitter_arm64.cpp @@ -0,0 +1,239 @@ +#if defined(PANDA3DS_DYNAPICA_SUPPORTED) && defined(PANDA3DS_ARM64_HOST) +#include "PICA/dynapica/shader_rec_emitter_arm64.hpp" + +#include + +using namespace Helpers; +using namespace oaknut; +using namespace oaknut::util; + +// Similar to the x64 recompiler, we use an odd internal ABI, which abuses the fact that we'll very rarely be calling C++ functions +// So to avoid pushing and popping, we'll be making use of volatile registers as much as possible +static constexpr QReg scratch1 = Q0; +static constexpr QReg scratch2 = Q1; +static constexpr QReg src1_vec = Q2; +static constexpr QReg src2_vec = Q3; +static constexpr QReg src3_vec = Q4; + +static constexpr XReg statePointer = X9; + +void ShaderEmitter::compile(const PICAShader& shaderUnit) { + // Scan the code for call, exp2, log2, etc instructions which need some special care + // The exp2 and log2 helper functions are not emitted yet, so panic if the corresponding instructions are present + scanCode(shaderUnit); + if (codeHasExp2) Helpers::panic("arm64 shader JIT: Code has exp2"); + if (codeHasLog2) Helpers::panic("arm64 shader JIT: Code has log2"); + + align(16); + // Compile every instruction in the shader + // This sounds horrible but the PICA instruction memory is tiny, and most of the time it's padded with nops that compile to nothing + recompilerPC = 0; + loopLevel = 0; + compileUntil(shaderUnit, PICAShader::maxInstructionCount); +} + +void ShaderEmitter::scanCode(const 
PICAShader& shaderUnit) { + returnPCs.clear(); + + for (u32 i = 0; i < PICAShader::maxInstructionCount; i++) { + const u32 instruction = shaderUnit.loadedShader[i]; + const u32 opcode = instruction >> 26; + + if (isCall(instruction)) { + const u32 num = instruction & 0xff; + const u32 dest = getBits<10, 12>(instruction); + const u32 returnPC = num + dest; // Add them to get the return PC + + returnPCs.push_back(returnPC); + } else if (opcode == ShaderOpcodes::EX2) { + codeHasExp2 = true; + } else if (opcode == ShaderOpcodes::LG2) { + codeHasLog2 = true; + } + } + + // Sort return PCs so they can be binary searched + std::sort(returnPCs.begin(), returnPCs.end()); +} + +void ShaderEmitter::compileUntil(const PICAShader& shaderUnit, u32 end) { + while (recompilerPC < end) { + compileInstruction(shaderUnit); + } +} + +void ShaderEmitter::compileInstruction(const PICAShader& shaderUnit) { + // Write current location to label for this instruction + l(instructionLabels[recompilerPC]); + + // See if PC is a possible return PC and emit the proper code if so + if (std::binary_search(returnPCs.begin(), returnPCs.end(), recompilerPC)) { + Helpers::panic("Unimplemented return address for call instruction"); + } + + // Fetch instruction and inc PC + const u32 instruction = shaderUnit.loadedShader[recompilerPC++]; + const u32 opcode = instruction >> 26; + + switch (opcode) { + // case ShaderOpcodes::ADD: recADD(shaderUnit, instruction); break; + // case ShaderOpcodes::CALL: recCALL(shaderUnit, instruction); break; + // case ShaderOpcodes::CALLC: recCALLC(shaderUnit, instruction); break; + // case ShaderOpcodes::CALLU: recCALLU(shaderUnit, instruction); break; + // case ShaderOpcodes::CMP1: + // case ShaderOpcodes::CMP2: recCMP(shaderUnit, instruction); break; + // case ShaderOpcodes::DP3: recDP3(shaderUnit, instruction); break; + // case ShaderOpcodes::DP4: recDP4(shaderUnit, instruction); break; + // case ShaderOpcodes::DPH: + // case ShaderOpcodes::DPHI: recDPH(shaderUnit, 
instruction); break; + // case ShaderOpcodes::END: recEND(shaderUnit, instruction); break; + // case ShaderOpcodes::EX2: recEX2(shaderUnit, instruction); break; + // case ShaderOpcodes::FLR: recFLR(shaderUnit, instruction); break; + // case ShaderOpcodes::IFC: recIFC(shaderUnit, instruction); break; + // case ShaderOpcodes::IFU: recIFU(shaderUnit, instruction); break; + // case ShaderOpcodes::JMPC: recJMPC(shaderUnit, instruction); break; + // case ShaderOpcodes::JMPU: recJMPU(shaderUnit, instruction); break; + // case ShaderOpcodes::LG2: recLG2(shaderUnit, instruction); break; + // case ShaderOpcodes::LOOP: recLOOP(shaderUnit, instruction); break; + case ShaderOpcodes::MOV: recMOV(shaderUnit, instruction); break; + // case ShaderOpcodes::MOVA: recMOVA(shaderUnit, instruction); break; + // case ShaderOpcodes::MAX: recMAX(shaderUnit, instruction); break; + // case ShaderOpcodes::MIN: recMIN(shaderUnit, instruction); break; + // case ShaderOpcodes::MUL: recMUL(shaderUnit, instruction); break; + case ShaderOpcodes::NOP: + break; + // case ShaderOpcodes::RCP: recRCP(shaderUnit, instruction); break; + // case ShaderOpcodes::RSQ: recRSQ(shaderUnit, instruction); break; + + // Unimplemented opcodes that don't seem to actually be used but exist in the binary + // EMIT/SETEMIT are used in geometry shaders, however are sometimes found in vertex shaders? 
+ // case ShaderOpcodes::EMIT: + // case ShaderOpcodes::SETEMIT: + // log("[ShaderJIT] Unknown PICA opcode: %02X\n", opcode); + // emitPrintLog(shaderUnit); + // break; + + // case ShaderOpcodes::BREAK: + // case ShaderOpcodes::BREAKC: Helpers::warn("[Shader JIT] Unimplemented BREAK(C) instruction!"); break; + + // We consider both MAD and MADI to be the same instruction and decode which one we actually have in recMAD + // case 0x30: + // case 0x31: + // case 0x32: + // case 0x33: + // case 0x34: + // case 0x35: + // case 0x36: + // case 0x37: + // case 0x38: + // case 0x39: + // case 0x3A: + // case 0x3B: + // case 0x3C: + // case 0x3D: + // case 0x3E: + // case 0x3F: recMAD(shaderUnit, instruction); break; + + // case ShaderOpcodes::SLT: + // case ShaderOpcodes::SLTI: recSLT(shaderUnit, instruction); break; + + // case ShaderOpcodes::SGE: + // case ShaderOpcodes::SGEI: recSGE(shaderUnit, instruction); break; + + default: Helpers::panic("Shader JIT: Unimplemented PICA opcode %X", opcode); + } +} + +const ShaderEmitter::vec4f& ShaderEmitter::getSourceRef(const PICAShader& shader, u32 src) { + if (src < 0x10) + return shader.inputs[src]; + else if (src < 0x20) + return shader.tempRegisters[src - 0x10]; + else if (src <= 0x7f) + return shader.floatUniforms[src - 0x20]; + else { + Helpers::warn("[Shader JIT] Unimplemented source value: %X\n", src); + return shader.dummy; + } +} + +const ShaderEmitter::vec4f& ShaderEmitter::getDestRef(const PICAShader& shader, u32 dest) { + if (dest < 0x10) { + return shader.outputs[dest]; + } else if (dest < 0x20) { + return shader.tempRegisters[dest - 0x10]; + } + Helpers::panic("[Shader JIT] Unimplemented dest: %X", dest); +} + +// See shader.hpp header for docs on how the swizzle and negate works +template +void ShaderEmitter::loadRegister(QReg dest, const PICAShader& shader, u32 src, u32 index, u32 operandDescriptor) { + u32 compSwizzle; // Component swizzle pattern for the register + bool negate; // If true, negate all lanes of 
the register + + if constexpr (sourceIndex == 1) { // SRC1 + negate = (getBit<4>(operandDescriptor)) != 0; + compSwizzle = getBits<5, 8>(operandDescriptor); + } else if constexpr (sourceIndex == 2) { // SRC2 + negate = (getBit<13>(operandDescriptor)) != 0; + compSwizzle = getBits<14, 8>(operandDescriptor); + } else if constexpr (sourceIndex == 3) { // SRC3 + negate = (getBit<22>(operandDescriptor)) != 0; + compSwizzle = getBits<23, 8>(operandDescriptor); + } + + switch (index) { + case 0: + [[likely]] { // Keep src as is, no need to offset it + const vec4f& srcRef = getSourceRef(shader, src); + const uintptr_t offset = uintptr_t(&srcRef) - uintptr_t(&shader); // Calculate offset of register from start of the state struct + + LDR(dest, statePointer, offset); + switch (compSwizzle) { + case noSwizzle: break; // .xyzw + case 0x0: DUP(dest.S4(), dest.Selem()[0]); break; // .xxxx + case 0x55: DUP(dest.S4(), dest.Selem()[1]); break; // .yyyy + case 0xAA: DUP(dest.S4(), dest.Selem()[2]); break; // .zzzz + case 0xFF: DUP(dest.S4(), dest.Selem()[3]); break; // .wwww + default: Helpers::panic("Unimplemented swizzle pattern for loading"); + } + + // Negate the register if necessary + if (negate) { + FNEG(dest.S4(), dest.S4()); + } + return; // Return. 
Rest of the function handles indexing which is not used if index == 0 + } + + default: Helpers::panic("[ShaderJIT]: Unimplemented source index type %d", index); + } + + Helpers::panic("Unimplemented indexed register load"); +} + +void ShaderEmitter::storeRegister(QReg source, const PICAShader& shader, u32 dest, u32 operandDescriptor) { + const vec4f& destRef = getDestRef(shader, dest); + const uintptr_t offset = uintptr_t(&destRef) - uintptr_t(&shader); // Calculate offset of register from start of the state struct + + // Mask of which lanes to write + u32 writeMask = operandDescriptor & 0xf; + if (writeMask == 0xf) { // No lanes are masked, just use STR + STR(source, statePointer, offset); + } else { + LDR(scratch1, statePointer, offset); // Load current value of the destination register + Helpers::panic("Unimplemented: Storing to register with blending"); + } +} + +void ShaderEmitter::recMOV(const PICAShader& shader, u32 instruction) { + const u32 operandDescriptor = shader.operandDescriptors[instruction & 0x7f]; + const u32 src = getBits<12, 7>(instruction); + const u32 idx = getBits<19, 2>(instruction); + const u32 dest = getBits<21, 5>(instruction); + + loadRegister<1>(src1_vec, shader, src, idx, operandDescriptor); // Load source 1 into src1_vec + storeRegister(src1_vec, shader, dest, operandDescriptor); +} + +#endif \ No newline at end of file diff --git a/src/core/PICA/dynapica/shader_rec_emitter_x64.cpp b/src/core/PICA/dynapica/shader_rec_emitter_x64.cpp index 046c78133..f3dae7b76 100644 --- a/src/core/PICA/dynapica/shader_rec_emitter_x64.cpp +++ b/src/core/PICA/dynapica/shader_rec_emitter_x64.cpp @@ -342,10 +342,10 @@ void ShaderEmitter::storeRegister(Xmm source, const PICAShader& shader, u32 dest } else if (std::popcount(writeMask) == 1) { // Only 1 register needs to be written back. 
This can be done with a simple shift right + movss int bit = std::countr_zero(writeMask); // Get which PICA register needs to be written to (0 = w, 1 = z, etc) size_t index = 3 - bit; - const uintptr_t lane_offset = offset + index * sizeof(float); + const uintptr_t laneOffset = offset + index * sizeof(float); if (index == 0) { // Bottom lane, no need to shift - movss(dword[statePointer + lane_offset], source); + movss(dword[statePointer + laneOffset], source); } else { // Shift right by 32 * index, then write bottom lane if (haveAVX) { vpsrldq(scratch1, source, index * sizeof(float)); @@ -353,7 +353,7 @@ void ShaderEmitter::storeRegister(Xmm source, const PICAShader& shader, u32 dest movaps(scratch1, source); psrldq(scratch1, index * sizeof(float)); } - movss(dword[statePointer + lane_offset], scratch1); + movss(dword[statePointer + laneOffset], scratch1); } } else if (haveSSE4_1) { // Bit reverse the write mask because that is what blendps expects diff --git a/third_party/oaknut b/third_party/oaknut new file mode 160000 index 000000000..1d51f5512 --- /dev/null +++ b/third_party/oaknut @@ -0,0 +1 @@ +Subproject commit 1d51f551294897ab4c8001c5259c8c5dee7e2a85