From c04d8415b33f58099fec94bfbb9b9e69c9bcbbd7 Mon Sep 17 00:00:00 2001 From: JosJuice Date: Wed, 29 May 2024 21:37:58 +0200 Subject: [PATCH 1/3] Jit: Fix "skip redundant flushes" for skipped instructions Normally when an instruction is skipped (for instance due to branch merging or the BLR optimization), the registers that would have been flushed at the end of that instruction will instead be flushed at the end of the next instruction, which is maybe not perfect, but usually good enough. However, since the addition of the "skip redundant flushes" logic in fd511a689f, Dolphin has accidentally skipped flushing those registers, which creates unnecessary register pressure. This commit restores the old behavior. --- Source/Core/Core/PowerPC/Jit64/Jit.cpp | 10 ++++++++-- Source/Core/Core/PowerPC/JitArm64/Jit.cpp | 14 +++++++++++--- 2 files changed, 19 insertions(+), 5 deletions(-) diff --git a/Source/Core/Core/PowerPC/Jit64/Jit.cpp b/Source/Core/Core/PowerPC/Jit64/Jit.cpp index b09279d45850..1b1d190e86d5 100644 --- a/Source/Core/Core/PowerPC/Jit64/Jit.cpp +++ b/Source/Core/Core/PowerPC/Jit64/Jit.cpp @@ -957,6 +957,9 @@ bool Jit64::DoJit(u32 em_address, JitBlock* b, u32 nextPC) IntializeSpeculativeConstants(); } + BitSet32 previous_gpr_in_use{}; + BitSet32 previous_fpr_in_use{}; + // Translate instructions for (u32 i = 0; i < code_block.m_num_instructions; i++) { @@ -1164,8 +1167,11 @@ bool Jit64::DoJit(u32 em_address, JitBlock* b, u32 nextPC) gpr.Discard(op.gprDiscardable); fpr.Discard(op.fprDiscardable); } - gpr.Flush(~op.gprInUse & (op.regsIn | op.regsOut)); - fpr.Flush(~op.fprInUse & (op.fregsIn | op.GetFregsOut())); + gpr.Flush(~op.gprInUse & previous_gpr_in_use); + fpr.Flush(~op.fprInUse & previous_fpr_in_use); + + previous_gpr_in_use = op.gprInUse; + previous_fpr_in_use = op.fprInUse; if (opinfo->flags & FL_LOADSTORE) ++js.numLoadStoreInst; diff --git a/Source/Core/Core/PowerPC/JitArm64/Jit.cpp b/Source/Core/Core/PowerPC/JitArm64/Jit.cpp index 51925d0d4802..c9f9bf826ff3 100644 --- a/Source/Core/Core/PowerPC/JitArm64/Jit.cpp +++ b/Source/Core/Core/PowerPC/JitArm64/Jit.cpp @@ -1171,6 +1171,10 @@ bool JitArm64::DoJit(u32 em_address, JitBlock* b, u32 nextPC) IntializeSpeculativeConstants(); } + BitSet32 previous_gpr_in_use{}; + BitSet32 previous_fpr_in_use{}; + BitSet8 previous_cr_in_use{}; + // Translate instructions for (u32 i = 0; i < code_block.m_num_instructions; i++) { @@ -1354,9 +1358,13 @@ bool JitArm64::DoJit(u32 em_address, JitBlock* b, u32 nextPC) fpr.DiscardRegisters(op.fprDiscardable); gpr.DiscardCRRegisters(op.crDiscardable); } - gpr.StoreRegisters(~op.gprInUse & (op.regsIn | op.regsOut)); - fpr.StoreRegisters(~op.fprInUse & (op.fregsIn | op.GetFregsOut())); - gpr.StoreCRRegisters(~op.crInUse & (op.crIn | op.crOut)); + gpr.StoreRegisters(~op.gprInUse & previous_gpr_in_use); + fpr.StoreRegisters(~op.fprInUse & previous_fpr_in_use); + gpr.StoreCRRegisters(~op.crInUse & previous_cr_in_use); + + previous_gpr_in_use = op.gprInUse; + previous_fpr_in_use = op.fprInUse; + previous_cr_in_use = op.crInUse; if (opinfo->flags & FL_LOADSTORE) ++js.numLoadStoreInst; From 7dddc39068ed1dab654501ddd62682893ecea748 Mon Sep 17 00:00:00 2001 From: JosJuice Date: Wed, 29 May 2024 22:07:15 +0200 Subject: [PATCH 2/3] Jit: Further improve flushing for skipped instructions Now we no longer have to wait until we've compiled the next instruction after a skipped instruction before we can flush registers. This commit is easier to read using "Ignore whitespace". 
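As an illustration (not part of the sources): below is a minimal compilable sketch of the loop shape this patch and the previous one produce. BitSet32, CodeOp, RegCache and JitState are hypothetical reductions of Dolphin's real classes; only the names gprInUse, previous_gpr_in_use and skipInstructions mirror the diffs.

#include <cstdint>
#include <cstdio>

// Hypothetical, simplified stand-ins for Dolphin's classes; only the names
// gprInUse, previous_gpr_in_use and skipInstructions mirror the patches.
using BitSet32 = std::uint32_t;  // bit n set = GPR n

struct CodeOp
{
  BitSet32 gprInUse;  // registers still needed at/after this instruction
};

struct RegCache
{
  // Store the given guest registers back to ppcState and free the host regs.
  void Flush(BitSet32 regs)
  {
    if (regs != 0)
      std::printf("flushing mask %#x\n", static_cast<unsigned>(regs));
  }
};

struct JitState
{
  int skipInstructions = 0;  // instructions consumed by merging
};

void CompileBlock(JitState& js, RegCache& gpr, const CodeOp* ops, int count)
{
  for (BitSet32 previous_gpr_in_use = 0; count != 0; ops++, count--)
  {
    const CodeOp& op = *ops;

    if (js.skipInstructions != 0)
      js.skipInstructions--;  // merged into an earlier instruction: emit nothing
    else
    {
      // ... emit host code for op; merging may set js.skipInstructions ...
    }

    // Patch 1: flush whatever was live after the previous instruction but is
    // dead now. Patch 2: do this for skipped instructions too, instead of
    // jumping over them with "i += js.skipInstructions" at the loop bottom.
    gpr.Flush(~op.gprInUse & previous_gpr_in_use);
    previous_gpr_in_use = op.gprInUse;
  }
}

int main()
{
  RegCache gpr;
  JitState js;
  // r3 and r4 are live after op 0; r4 dies at op 1, so op 1 prints mask 0x10.
  const CodeOp ops[] = {{0b11000}, {0b01000}, {0b01000}};
  CompileBlock(js, gpr, ops, 3);
}

Under the old "i += js.skipInstructions" at the bottom of the loop, a register that died at a skipped instruction stayed bound until the next compiled instruction executed the flush; with this shape it is flushed in the same iteration.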
--- Source/Core/Core/PowerPC/Jit64/Jit.cpp | 339 +++++++++++----------- Source/Core/Core/PowerPC/JitArm64/Jit.cpp | 312 ++++++++++---------- 2 files changed, 331 insertions(+), 320 deletions(-) diff --git a/Source/Core/Core/PowerPC/Jit64/Jit.cpp b/Source/Core/Core/PowerPC/Jit64/Jit.cpp index 1b1d190e86d5..6ee3c46720e2 100644 --- a/Source/Core/Core/PowerPC/Jit64/Jit.cpp +++ b/Source/Core/Core/PowerPC/Jit64/Jit.cpp @@ -979,207 +979,216 @@ bool Jit64::DoJit(u32 em_address, JitBlock* b, u32 nextPC) js.isLastInstruction = true; } - if (i != 0) + if (js.skipInstructions != 0) { - // Gather pipe writes using a non-immediate address are discovered by profiling. - const u32 prev_address = m_code_buffer[i - 1].address; - bool gatherPipeIntCheck = js.fifoWriteAddresses.contains(prev_address); - - // Gather pipe writes using an immediate address are explicitly tracked. - if (jo.optimizeGatherPipe && - (js.fifoBytesSinceCheck >= GPFifo::GATHER_PIPE_SIZE || js.mustCheckFifo)) - { - js.fifoBytesSinceCheck = 0; - js.mustCheckFifo = false; - BitSet32 registersInUse = CallerSavedRegistersInUse(); - ABI_PushRegistersAndAdjustStack(registersInUse, 0); - ABI_CallFunctionP(GPFifo::FastCheckGatherPipe, &m_system.GetGPFifo()); - ABI_PopRegistersAndAdjustStack(registersInUse, 0); - gatherPipeIntCheck = true; - } - - // Gather pipe writes can generate an exception; add an exception check. - // TODO: This doesn't really match hardware; the CP interrupt is - // asynchronous. - if (gatherPipeIntCheck) + js.skipInstructions--; + } + else + { + if (i != 0) { - TEST(32, PPCSTATE(Exceptions), Imm32(EXCEPTION_EXTERNAL_INT)); - FixupBranch extException = J_CC(CC_NZ, Jump::Near); - - SwitchToFarCode(); - SetJumpTarget(extException); - TEST(32, PPCSTATE(msr), Imm32(0x0008000)); - FixupBranch noExtIntEnable = J_CC(CC_Z, Jump::Near); - MOV(64, R(RSCRATCH), ImmPtr(&m_system.GetProcessorInterface().m_interrupt_cause)); - TEST(32, MatR(RSCRATCH), - Imm32(ProcessorInterface::INT_CAUSE_CP | ProcessorInterface::INT_CAUSE_PE_TOKEN | - ProcessorInterface::INT_CAUSE_PE_FINISH)); - FixupBranch noCPInt = J_CC(CC_Z, Jump::Near); + // Gather pipe writes using a non-immediate address are discovered by profiling. + const u32 prev_address = m_code_buffer[i - 1].address; + bool gatherPipeIntCheck = js.fifoWriteAddresses.contains(prev_address); + // Gather pipe writes using an immediate address are explicitly tracked. + if (jo.optimizeGatherPipe && + (js.fifoBytesSinceCheck >= GPFifo::GATHER_PIPE_SIZE || js.mustCheckFifo)) { - RCForkGuard gpr_guard = gpr.Fork(); - RCForkGuard fpr_guard = fpr.Fork(); - - gpr.Flush(); - fpr.Flush(); + js.fifoBytesSinceCheck = 0; + js.mustCheckFifo = false; + BitSet32 registersInUse = CallerSavedRegistersInUse(); + ABI_PushRegistersAndAdjustStack(registersInUse, 0); + ABI_CallFunctionP(GPFifo::FastCheckGatherPipe, &m_system.GetGPFifo()); + ABI_PopRegistersAndAdjustStack(registersInUse, 0); + gatherPipeIntCheck = true; + } - MOV(32, PPCSTATE(pc), Imm32(op.address)); - WriteExternalExceptionExit(); + // Gather pipe writes can generate an exception; add an exception check. + // TODO: This doesn't really match hardware; the CP interrupt is + // asynchronous. 
+ if (gatherPipeIntCheck) + { + TEST(32, PPCSTATE(Exceptions), Imm32(EXCEPTION_EXTERNAL_INT)); + FixupBranch extException = J_CC(CC_NZ, Jump::Near); + + SwitchToFarCode(); + SetJumpTarget(extException); + TEST(32, PPCSTATE(msr), Imm32(0x0008000)); + FixupBranch noExtIntEnable = J_CC(CC_Z, Jump::Near); + MOV(64, R(RSCRATCH), ImmPtr(&m_system.GetProcessorInterface().m_interrupt_cause)); + TEST(32, MatR(RSCRATCH), + Imm32(ProcessorInterface::INT_CAUSE_CP | ProcessorInterface::INT_CAUSE_PE_TOKEN | + ProcessorInterface::INT_CAUSE_PE_FINISH)); + FixupBranch noCPInt = J_CC(CC_Z, Jump::Near); + + { + RCForkGuard gpr_guard = gpr.Fork(); + RCForkGuard fpr_guard = fpr.Fork(); + + gpr.Flush(); + fpr.Flush(); + + MOV(32, PPCSTATE(pc), Imm32(op.address)); + WriteExternalExceptionExit(); + } + SwitchToNearCode(); + SetJumpTarget(noCPInt); + SetJumpTarget(noExtIntEnable); } - SwitchToNearCode(); - SetJumpTarget(noCPInt); - SetJumpTarget(noExtIntEnable); } - } - if (HandleFunctionHooking(op.address)) - break; + if (HandleFunctionHooking(op.address)) + break; - if (op.skip) - { - if (IsDebuggingEnabled()) - { - // The only thing that currently sets op.skip is the BLR following optimization. - // If any non-branch instruction starts setting that too, this will need to be changed. - ASSERT(op.inst.hex == 0x4e800020); - WriteBranchWatch(op.address, op.branchTo, op.inst, RSCRATCH, RSCRATCH2, - CallerSavedRegistersInUse()); - } - } - else - { - auto& cpu = m_system.GetCPU(); - auto& power_pc = m_system.GetPowerPC(); - if (IsDebuggingEnabled() && power_pc.GetBreakPoints().IsAddressBreakPoint(op.address) && - !cpu.IsStepping()) + if (op.skip) { - gpr.Flush(); - fpr.Flush(); - - MOV(32, PPCSTATE(pc), Imm32(op.address)); - ABI_PushRegistersAndAdjustStack({}, 0); - ABI_CallFunctionP(PowerPC::CheckAndHandleBreakPointsFromJIT, &power_pc); - ABI_PopRegistersAndAdjustStack({}, 0); - MOV(64, R(RSCRATCH), ImmPtr(cpu.GetStatePtr())); - CMP(32, MatR(RSCRATCH), Imm32(Common::ToUnderlying(CPU::State::Running))); - FixupBranch noBreakpoint = J_CC(CC_E); - - Cleanup(); - MOV(32, PPCSTATE(npc), Imm32(op.address)); - SUB(32, PPCSTATE(downcount), Imm32(js.downcountAmount)); - JMP(asm_routines.dispatcher_exit, Jump::Near); - - SetJumpTarget(noBreakpoint); + if (IsDebuggingEnabled()) + { + // The only thing that currently sets op.skip is the BLR following optimization. + // If any non-branch instruction starts setting that too, this will need to be changed. + ASSERT(op.inst.hex == 0x4e800020); + WriteBranchWatch(op.address, op.branchTo, op.inst, RSCRATCH, RSCRATCH2, + CallerSavedRegistersInUse()); + } } - - if ((opinfo->flags & FL_USE_FPU) && !js.firstFPInstructionFound) + else { - // This instruction uses FPU - needs to add FP exception bailout - TEST(32, PPCSTATE(msr), Imm32(1 << 13)); // Test FP enabled bit - FixupBranch b1 = J_CC(CC_Z, Jump::Near); - - SwitchToFarCode(); - SetJumpTarget(b1); + auto& cpu = m_system.GetCPU(); + auto& power_pc = m_system.GetPowerPC(); + if (IsDebuggingEnabled() && power_pc.GetBreakPoints().IsAddressBreakPoint(op.address) && + !cpu.IsStepping()) { - RCForkGuard gpr_guard = gpr.Fork(); - RCForkGuard fpr_guard = fpr.Fork(); - gpr.Flush(); fpr.Flush(); - // If a FPU exception occurs, the exception handler will read - // from PC. Update PC with the latest value in case that happens. 
MOV(32, PPCSTATE(pc), Imm32(op.address)); - OR(32, PPCSTATE(Exceptions), Imm32(EXCEPTION_FPU_UNAVAILABLE)); - WriteExceptionExit(); + ABI_PushRegistersAndAdjustStack({}, 0); + ABI_CallFunctionP(PowerPC::CheckAndHandleBreakPointsFromJIT, &power_pc); + ABI_PopRegistersAndAdjustStack({}, 0); + MOV(64, R(RSCRATCH), ImmPtr(cpu.GetStatePtr())); + CMP(32, MatR(RSCRATCH), Imm32(Common::ToUnderlying(CPU::State::Running))); + FixupBranch noBreakpoint = J_CC(CC_E); + + Cleanup(); + MOV(32, PPCSTATE(npc), Imm32(op.address)); + SUB(32, PPCSTATE(downcount), Imm32(js.downcountAmount)); + JMP(asm_routines.dispatcher_exit, Jump::Near); + + SetJumpTarget(noBreakpoint); } - SwitchToNearCode(); - - js.firstFPInstructionFound = true; - } - if (bJITRegisterCacheOff) - { - gpr.Flush(); - fpr.Flush(); - } - else - { - // If we have an input register that is going to be used again, load it pre-emptively, - // even if the instruction doesn't strictly need it in a register, to avoid redundant - // loads later. Of course, don't do this if we're already out of registers. - // As a bit of a heuristic, make sure we have at least one register left over for the - // output, which needs to be bound in the actual instruction compilation. - // TODO: make this smarter in the case that we're actually register-starved, i.e. - // prioritize the more important registers. - gpr.PreloadRegisters(op.regsIn & op.gprInUse & ~op.gprDiscardable); - fpr.PreloadRegisters(op.fregsIn & op.fprInXmm & ~op.fprDiscardable); - } - - CompileInstruction(op); - - js.fpr_is_store_safe = op.fprIsStoreSafeAfterInst; - - if (jo.memcheck && (opinfo->flags & FL_LOADSTORE)) - { - // If we have a fastmem loadstore, we can omit the exception check and let fastmem handle - // it. - FixupBranch memException; - ASSERT_MSG(DYNA_REC, !(js.fastmemLoadStore && js.fixupExceptionHandler), - "Fastmem loadstores shouldn't have exception handler fixups (PC={:x})!", - op.address); - if (!js.fastmemLoadStore && !js.fixupExceptionHandler) + if ((opinfo->flags & FL_USE_FPU) && !js.firstFPInstructionFound) { - TEST(32, PPCSTATE(Exceptions), Imm32(EXCEPTION_DSI)); - memException = J_CC(CC_NZ, Jump::Near); + // This instruction uses FPU - needs to add FP exception bailout + TEST(32, PPCSTATE(msr), Imm32(1 << 13)); // Test FP enabled bit + FixupBranch b1 = J_CC(CC_Z, Jump::Near); + + SwitchToFarCode(); + SetJumpTarget(b1); + { + RCForkGuard gpr_guard = gpr.Fork(); + RCForkGuard fpr_guard = fpr.Fork(); + + gpr.Flush(); + fpr.Flush(); + + // If a FPU exception occurs, the exception handler will read + // from PC. Update PC with the latest value in case that happens. + MOV(32, PPCSTATE(pc), Imm32(op.address)); + OR(32, PPCSTATE(Exceptions), Imm32(EXCEPTION_FPU_UNAVAILABLE)); + WriteExceptionExit(); + } + SwitchToNearCode(); + + js.firstFPInstructionFound = true; } - SwitchToFarCode(); - if (!js.fastmemLoadStore) + if (bJITRegisterCacheOff) { - m_exception_handler_at_loc[js.fastmemLoadStore] = nullptr; - SetJumpTarget(js.fixupExceptionHandler ? js.exceptionHandler : memException); + gpr.Flush(); + fpr.Flush(); } else { - m_exception_handler_at_loc[js.fastmemLoadStore] = GetWritableCodePtr(); + // If we have an input register that is going to be used again, load it pre-emptively, + // even if the instruction doesn't strictly need it in a register, to avoid redundant + // loads later. Of course, don't do this if we're already out of registers. 
+ // As a bit of a heuristic, make sure we have at least one register left over for the + // output, which needs to be bound in the actual instruction compilation. + // TODO: make this smarter in the case that we're actually register-starved, i.e. + // prioritize the more important registers. + gpr.PreloadRegisters(op.regsIn & op.gprInUse & ~op.gprDiscardable); + fpr.PreloadRegisters(op.fregsIn & op.fprInXmm & ~op.fprDiscardable); } - RCForkGuard gpr_guard = gpr.Fork(); - RCForkGuard fpr_guard = fpr.Fork(); + CompileInstruction(op); - gpr.Revert(); - fpr.Revert(); - gpr.Flush(); - fpr.Flush(); + js.fpr_is_store_safe = op.fprIsStoreSafeAfterInst; - MOV(32, PPCSTATE(pc), Imm32(op.address)); - WriteExceptionExit(); - SwitchToNearCode(); - } + if (jo.memcheck && (opinfo->flags & FL_LOADSTORE)) + { + // If we have a fastmem loadstore, we can omit the exception check and let fastmem handle + // it. + FixupBranch memException; + ASSERT_MSG(DYNA_REC, !(js.fastmemLoadStore && js.fixupExceptionHandler), + "Fastmem loadstores shouldn't have exception handler fixups (PC={:x})!", + op.address); + if (!js.fastmemLoadStore && !js.fixupExceptionHandler) + { + TEST(32, PPCSTATE(Exceptions), Imm32(EXCEPTION_DSI)); + memException = J_CC(CC_NZ, Jump::Near); + } + + SwitchToFarCode(); + if (!js.fastmemLoadStore) + { + m_exception_handler_at_loc[js.fastmemLoadStore] = nullptr; + SetJumpTarget(js.fixupExceptionHandler ? js.exceptionHandler : memException); + } + else + { + m_exception_handler_at_loc[js.fastmemLoadStore] = GetWritableCodePtr(); + } - gpr.Commit(); - fpr.Commit(); + RCForkGuard gpr_guard = gpr.Fork(); + RCForkGuard fpr_guard = fpr.Fork(); - // If we have a register that will never be used again, discard or flush it. - if (!bJITRegisterCacheOff) - { - gpr.Discard(op.gprDiscardable); - fpr.Discard(op.fprDiscardable); - } - gpr.Flush(~op.gprInUse & previous_gpr_in_use); - fpr.Flush(~op.fprInUse & previous_fpr_in_use); + gpr.Revert(); + fpr.Revert(); + gpr.Flush(); + fpr.Flush(); + + MOV(32, PPCSTATE(pc), Imm32(op.address)); + WriteExceptionExit(); + SwitchToNearCode(); + } - previous_gpr_in_use = op.gprInUse; - previous_fpr_in_use = op.fprInUse; + gpr.Commit(); + fpr.Commit(); - if (opinfo->flags & FL_LOADSTORE) - ++js.numLoadStoreInst; + if (opinfo->flags & FL_LOADSTORE) + ++js.numLoadStoreInst; - if (opinfo->flags & FL_USE_FPU) - ++js.numFloatingPointInst; + if (opinfo->flags & FL_USE_FPU) + ++js.numFloatingPointInst; + } } + js.fpr_is_store_safe = op.fprIsStoreSafeAfterInst; + + // If we have a register that will never be used again, discard or flush it. 
+ if (!bJITRegisterCacheOff) + { + gpr.Discard(op.gprDiscardable); + fpr.Discard(op.fprDiscardable); + } + gpr.Flush(~op.gprInUse & previous_gpr_in_use); + fpr.Flush(~op.fprInUse & previous_fpr_in_use); + + previous_gpr_in_use = op.gprInUse; + previous_fpr_in_use = op.fprInUse; + #if defined(_DEBUG) || defined(DEBUGFAST) if (!gpr.SanityCheck() || !fpr.SanityCheck()) { @@ -1187,8 +1196,6 @@ bool Jit64::DoJit(u32 em_address, JitBlock* b, u32 nextPC) NOTICE_LOG_FMT(DYNA_REC, "Unflushed register: {}", ppc_inst); } #endif - i += js.skipInstructions; - js.skipInstructions = 0; } if (code_block.m_broken) diff --git a/Source/Core/Core/PowerPC/JitArm64/Jit.cpp b/Source/Core/Core/PowerPC/JitArm64/Jit.cpp index c9f9bf826ff3..4e92b516e5d9 100644 --- a/Source/Core/Core/PowerPC/JitArm64/Jit.cpp +++ b/Source/Core/Core/PowerPC/JitArm64/Jit.cpp @@ -1188,193 +1188,197 @@ bool JitArm64::DoJit(u32 em_address, JitBlock* b, u32 nextPC) js.downcountAmount += opinfo->num_cycles; js.isLastInstruction = i == (code_block.m_num_instructions - 1); - // Skip calling UpdateLastUsed for lmw/stmw - it usually hurts more than it helps - if (op.inst.OPCD != 46 && op.inst.OPCD != 47) - gpr.UpdateLastUsed(op.regsIn | op.regsOut); - - BitSet32 fpr_used = op.fregsIn; - if (op.fregOut >= 0) - fpr_used[op.fregOut] = true; - fpr.UpdateLastUsed(fpr_used); - - if (i != 0) - { - // Gather pipe writes using a non-immediate address are discovered by profiling. - const u32 prev_address = m_code_buffer[i - 1].address; - bool gatherPipeIntCheck = js.fifoWriteAddresses.contains(prev_address); - - if (jo.optimizeGatherPipe && - (js.fifoBytesSinceCheck >= GPFifo::GATHER_PIPE_SIZE || js.mustCheckFifo)) - { - js.fifoBytesSinceCheck = 0; - js.mustCheckFifo = false; - - gpr.Lock(ARM64Reg::W30); - BitSet32 regs_in_use = gpr.GetCallerSavedUsed(); - BitSet32 fprs_in_use = fpr.GetCallerSavedUsed(); - regs_in_use[DecodeReg(ARM64Reg::W30)] = 0; - - ABI_PushRegisters(regs_in_use); - m_float_emit.ABI_PushRegisters(fprs_in_use, ARM64Reg::X30); - ABI_CallFunction(&GPFifo::FastCheckGatherPipe, &m_system.GetGPFifo()); - m_float_emit.ABI_PopRegisters(fprs_in_use, ARM64Reg::X30); - ABI_PopRegisters(regs_in_use); - - gpr.Unlock(ARM64Reg::W30); - gatherPipeIntCheck = true; - } - // Gather pipe writes can generate an exception; add an exception check. - // TODO: This doesn't really match hardware; the CP interrupt is - // asynchronous. 
- if (jo.optimizeGatherPipe && gatherPipeIntCheck) - { - auto WA = gpr.GetScopedReg(); - ARM64Reg XA = EncodeRegTo64(WA); - - LDR(IndexType::Unsigned, WA, PPC_REG, PPCSTATE_OFF(Exceptions)); - FixupBranch no_ext_exception = TBZ(WA, MathUtil::IntLog2(EXCEPTION_EXTERNAL_INT)); - FixupBranch exception = B(); - SwitchToFarCode(); - const u8* done_here = GetCodePtr(); - FixupBranch exit = B(); - SetJumpTarget(exception); - LDR(IndexType::Unsigned, WA, PPC_REG, PPCSTATE_OFF(msr)); - TBZ(WA, 15, done_here); // MSR.EE - LDR(IndexType::Unsigned, WA, XA, - MOVPage2R(XA, &m_system.GetProcessorInterface().m_interrupt_cause)); - constexpr u32 cause_mask = ProcessorInterface::INT_CAUSE_CP | - ProcessorInterface::INT_CAUSE_PE_TOKEN | - ProcessorInterface::INT_CAUSE_PE_FINISH; - TST(WA, LogicalImm(cause_mask, GPRSize::B32)); - B(CC_EQ, done_here); - - gpr.Flush(FlushMode::MaintainState, WA); - fpr.Flush(FlushMode::MaintainState, ARM64Reg::INVALID_REG); - WriteExceptionExit(js.compilerPC, true, true); - SwitchToNearCode(); - SetJumpTarget(no_ext_exception); - SetJumpTarget(exit); - } - } - - if (HandleFunctionHooking(op.address)) - break; - - if (op.skip) + if (js.skipInstructions != 0) { - if (IsDebuggingEnabled()) - { - // The only thing that currently sets op.skip is the BLR following optimization. - // If any non-branch instruction starts setting that too, this will need to be changed. - ASSERT(op.inst.hex == 0x4e800020); - const auto bw_reg_a = gpr.GetScopedReg(), bw_reg_b = gpr.GetScopedReg(); - const BitSet32 gpr_caller_save = - gpr.GetCallerSavedUsed() & ~BitSet32{DecodeReg(bw_reg_a), DecodeReg(bw_reg_b)}; - WriteBranchWatch(op.address, op.branchTo, op.inst, bw_reg_a, bw_reg_b, - gpr_caller_save, fpr.GetCallerSavedUsed()); - } + js.skipInstructions--; } else { - if (IsDebuggingEnabled() && !cpu.IsStepping() && - m_system.GetPowerPC().GetBreakPoints().IsAddressBreakPoint(op.address)) - { - FlushCarry(); - gpr.Flush(FlushMode::All, ARM64Reg::INVALID_REG); - fpr.Flush(FlushMode::All, ARM64Reg::INVALID_REG); + // Skip calling UpdateLastUsed for lmw/stmw - it usually hurts more than it helps + if (op.inst.OPCD != 46 && op.inst.OPCD != 47) + gpr.UpdateLastUsed(op.regsIn | op.regsOut); - static_assert(PPCSTATE_OFF(pc) <= 252); - static_assert(PPCSTATE_OFF(pc) + 4 == PPCSTATE_OFF(npc)); + BitSet32 fpr_used = op.fregsIn; + if (op.fregOut >= 0) + fpr_used[op.fregOut] = true; + fpr.UpdateLastUsed(fpr_used); - MOVI2R(DISPATCHER_PC, op.address); - STP(IndexType::Signed, DISPATCHER_PC, DISPATCHER_PC, PPC_REG, PPCSTATE_OFF(pc)); - ABI_CallFunction(&PowerPC::CheckAndHandleBreakPointsFromJIT, &m_system.GetPowerPC()); - - LDR(IndexType::Unsigned, ARM64Reg::W0, ARM64Reg::X0, - MOVPage2R(ARM64Reg::X0, cpu.GetStatePtr())); - static_assert(Common::ToUnderlying(CPU::State::Running) == 0); - FixupBranch no_breakpoint = CBZ(ARM64Reg::W0); + if (i != 0) + { + // Gather pipe writes using a non-immediate address are discovered by profiling. 
+ const u32 prev_address = m_code_buffer[i - 1].address; + bool gatherPipeIntCheck = js.fifoWriteAddresses.contains(prev_address); - Cleanup(); - if (IsProfilingEnabled()) + if (jo.optimizeGatherPipe && + (js.fifoBytesSinceCheck >= GPFifo::GATHER_PIPE_SIZE || js.mustCheckFifo)) { - ABI_CallFunction(&JitBlock::ProfileData::EndProfiling, b->profile_data.get(), - js.downcountAmount); + js.fifoBytesSinceCheck = 0; + js.mustCheckFifo = false; + + gpr.Lock(ARM64Reg::W30); + BitSet32 regs_in_use = gpr.GetCallerSavedUsed(); + BitSet32 fprs_in_use = fpr.GetCallerSavedUsed(); + regs_in_use[DecodeReg(ARM64Reg::W30)] = 0; + + ABI_PushRegisters(regs_in_use); + m_float_emit.ABI_PushRegisters(fprs_in_use, ARM64Reg::X30); + ABI_CallFunction(&GPFifo::FastCheckGatherPipe, &m_system.GetGPFifo()); + m_float_emit.ABI_PopRegisters(fprs_in_use, ARM64Reg::X30); + ABI_PopRegisters(regs_in_use); + + gpr.Unlock(ARM64Reg::W30); + gatherPipeIntCheck = true; } - DoDownCount(); - B(dispatcher_exit); - - SetJumpTarget(no_breakpoint); - } - - if ((opinfo->flags & FL_USE_FPU) && !js.firstFPInstructionFound) - { - FixupBranch b1; - // This instruction uses FPU - needs to add FP exception bailout + // Gather pipe writes can generate an exception; add an exception check. + // TODO: This doesn't really match hardware; the CP interrupt is + // asynchronous. + if (jo.optimizeGatherPipe && gatherPipeIntCheck) { auto WA = gpr.GetScopedReg(); - LDR(IndexType::Unsigned, WA, PPC_REG, PPCSTATE_OFF(msr)); - b1 = TBNZ(WA, 13); // Test FP enabled bit + ARM64Reg XA = EncodeRegTo64(WA); - FixupBranch far_addr = B(); + LDR(IndexType::Unsigned, WA, PPC_REG, PPCSTATE_OFF(Exceptions)); + FixupBranch no_ext_exception = TBZ(WA, MathUtil::IntLog2(EXCEPTION_EXTERNAL_INT)); + FixupBranch exception = B(); SwitchToFarCode(); - SetJumpTarget(far_addr); + const u8* done_here = GetCodePtr(); + FixupBranch exit = B(); + SetJumpTarget(exception); + LDR(IndexType::Unsigned, WA, PPC_REG, PPCSTATE_OFF(msr)); + TBZ(WA, 15, done_here); // MSR.EE + LDR(IndexType::Unsigned, WA, XA, + MOVPage2R(XA, &m_system.GetProcessorInterface().m_interrupt_cause)); + constexpr u32 cause_mask = ProcessorInterface::INT_CAUSE_CP | + ProcessorInterface::INT_CAUSE_PE_TOKEN | + ProcessorInterface::INT_CAUSE_PE_FINISH; + TST(WA, LogicalImm(cause_mask, GPRSize::B32)); + B(CC_EQ, done_here); gpr.Flush(FlushMode::MaintainState, WA); fpr.Flush(FlushMode::MaintainState, ARM64Reg::INVALID_REG); + WriteExceptionExit(js.compilerPC, true, true); + SwitchToNearCode(); + SetJumpTarget(no_ext_exception); + SetJumpTarget(exit); + } + } - LDR(IndexType::Unsigned, WA, PPC_REG, PPCSTATE_OFF(Exceptions)); - ORR(WA, WA, LogicalImm(EXCEPTION_FPU_UNAVAILABLE, GPRSize::B32)); - STR(IndexType::Unsigned, WA, PPC_REG, PPCSTATE_OFF(Exceptions)); + if (HandleFunctionHooking(op.address)) + break; + + if (op.skip) + { + if (IsDebuggingEnabled()) + { + // The only thing that currently sets op.skip is the BLR following optimization. + // If any non-branch instruction starts setting that too, this will need to be changed. 
+ ASSERT(op.inst.hex == 0x4e800020); + const auto bw_reg_a = gpr.GetScopedReg(), bw_reg_b = gpr.GetScopedReg(); + const BitSet32 gpr_caller_save = + gpr.GetCallerSavedUsed() & ~BitSet32{DecodeReg(bw_reg_a), DecodeReg(bw_reg_b)}; + WriteBranchWatch(op.address, op.branchTo, op.inst, bw_reg_a, bw_reg_b, + gpr_caller_save, fpr.GetCallerSavedUsed()); + } + } + else + { + if (IsDebuggingEnabled() && !cpu.IsStepping() && + m_system.GetPowerPC().GetBreakPoints().IsAddressBreakPoint(op.address)) + { + FlushCarry(); + gpr.Flush(FlushMode::All, ARM64Reg::INVALID_REG); + fpr.Flush(FlushMode::All, ARM64Reg::INVALID_REG); + + static_assert(PPCSTATE_OFF(pc) <= 252); + static_assert(PPCSTATE_OFF(pc) + 4 == PPCSTATE_OFF(npc)); + + MOVI2R(DISPATCHER_PC, op.address); + STP(IndexType::Signed, DISPATCHER_PC, DISPATCHER_PC, PPC_REG, PPCSTATE_OFF(pc)); + ABI_CallFunction(&PowerPC::CheckAndHandleBreakPointsFromJIT, &m_system.GetPowerPC()); + + LDR(IndexType::Unsigned, ARM64Reg::W0, ARM64Reg::X0, + MOVPage2R(ARM64Reg::X0, cpu.GetStatePtr())); + static_assert(Common::ToUnderlying(CPU::State::Running) == 0); + FixupBranch no_breakpoint = CBZ(ARM64Reg::W0); + + Cleanup(); + if (IsProfilingEnabled()) + { + ABI_CallFunction(&JitBlock::ProfileData::EndProfiling, b->profile_data.get(), + js.downcountAmount); + } + DoDownCount(); + B(dispatcher_exit); + + SetJumpTarget(no_breakpoint); } - WriteExceptionExit(js.compilerPC, false, true); + if ((opinfo->flags & FL_USE_FPU) && !js.firstFPInstructionFound) + { + FixupBranch b1; + // This instruction uses FPU - needs to add FP exception bailout + { + auto WA = gpr.GetScopedReg(); + LDR(IndexType::Unsigned, WA, PPC_REG, PPCSTATE_OFF(msr)); + b1 = TBNZ(WA, 13); // Test FP enabled bit - SwitchToNearCode(); + FixupBranch far_addr = B(); + SwitchToFarCode(); + SetJumpTarget(far_addr); - SetJumpTarget(b1); + gpr.Flush(FlushMode::MaintainState, WA); + fpr.Flush(FlushMode::MaintainState, ARM64Reg::INVALID_REG); - js.firstFPInstructionFound = true; - } + LDR(IndexType::Unsigned, WA, PPC_REG, PPCSTATE_OFF(Exceptions)); + ORR(WA, WA, LogicalImm(EXCEPTION_FPU_UNAVAILABLE, GPRSize::B32)); + STR(IndexType::Unsigned, WA, PPC_REG, PPCSTATE_OFF(Exceptions)); + } - if (bJITRegisterCacheOff) - { - FlushCarry(); - gpr.Flush(FlushMode::All, ARM64Reg::INVALID_REG); - fpr.Flush(FlushMode::All, ARM64Reg::INVALID_REG); - } + WriteExceptionExit(js.compilerPC, false, true); - CompileInstruction(op); + SwitchToNearCode(); - js.fpr_is_store_safe = op.fprIsStoreSafeAfterInst; + SetJumpTarget(b1); - if (!CanMergeNextInstructions(1) || js.op[1].opinfo->type != ::OpType::Integer) - FlushCarry(); + js.firstFPInstructionFound = true; + } - // If we have a register that will never be used again, discard or flush it. 
- if (!bJITRegisterCacheOff) - { - gpr.DiscardRegisters(op.gprDiscardable); - fpr.DiscardRegisters(op.fprDiscardable); - gpr.DiscardCRRegisters(op.crDiscardable); + if (bJITRegisterCacheOff) + { + FlushCarry(); + gpr.Flush(FlushMode::All, ARM64Reg::INVALID_REG); + fpr.Flush(FlushMode::All, ARM64Reg::INVALID_REG); + } + + CompileInstruction(op); + + if (opinfo->flags & FL_LOADSTORE) + ++js.numLoadStoreInst; + + if (opinfo->flags & FL_USE_FPU) + ++js.numFloatingPointInst; } - gpr.StoreRegisters(~op.gprInUse & previous_gpr_in_use); - fpr.StoreRegisters(~op.fprInUse & previous_fpr_in_use); - gpr.StoreCRRegisters(~op.crInUse & previous_cr_in_use); + } - previous_gpr_in_use = op.gprInUse; - previous_fpr_in_use = op.fprInUse; - previous_cr_in_use = op.crInUse; + js.fpr_is_store_safe = op.fprIsStoreSafeAfterInst; - if (opinfo->flags & FL_LOADSTORE) - ++js.numLoadStoreInst; + if (!CanMergeNextInstructions(1) || js.op[1].opinfo->type != ::OpType::Integer) + FlushCarry(); - if (opinfo->flags & FL_USE_FPU) - ++js.numFloatingPointInst; + // If we have a register that will never be used again, discard or flush it. + if (!bJITRegisterCacheOff) + { + gpr.DiscardRegisters(op.gprDiscardable); + fpr.DiscardRegisters(op.fprDiscardable); + gpr.DiscardCRRegisters(op.crDiscardable); } + gpr.StoreRegisters(~op.gprInUse & previous_gpr_in_use); + fpr.StoreRegisters(~op.fprInUse & previous_fpr_in_use); + gpr.StoreCRRegisters(~op.crInUse & previous_cr_in_use); - i += js.skipInstructions; - js.skipInstructions = 0; + previous_gpr_in_use = op.gprInUse; + previous_fpr_in_use = op.fprInUse; + previous_cr_in_use = op.crInUse; } if (code_block.m_broken) From a5a40de3cc15bb1b83835a58993838be217cb00d Mon Sep 17 00:00:00 2001 From: JosJuice Date: Wed, 29 May 2024 22:14:25 +0200 Subject: [PATCH 3/3] Jit: Don't skip incrementing numLoadStoreInst/numFloatingPointInst This was technically wrong, but it only affected merged dcbt+dcbst, and I doubt any games care about it. --- Source/Core/Core/PowerPC/Jit64/Jit.cpp | 12 ++++++------ Source/Core/Core/PowerPC/JitArm64/Jit.cpp | 12 ++++++------ 2 files changed, 12 insertions(+), 12 deletions(-) diff --git a/Source/Core/Core/PowerPC/Jit64/Jit.cpp b/Source/Core/Core/PowerPC/Jit64/Jit.cpp index 6ee3c46720e2..d9fd7cec8165 100644 --- a/Source/Core/Core/PowerPC/Jit64/Jit.cpp +++ b/Source/Core/Core/PowerPC/Jit64/Jit.cpp @@ -1166,15 +1166,15 @@ bool Jit64::DoJit(u32 em_address, JitBlock* b, u32 nextPC) gpr.Commit(); fpr.Commit(); - - if (opinfo->flags & FL_LOADSTORE) - ++js.numLoadStoreInst; - - if (opinfo->flags & FL_USE_FPU) - ++js.numFloatingPointInst; } } + if (opinfo->flags & FL_LOADSTORE) + ++js.numLoadStoreInst; + + if (opinfo->flags & FL_USE_FPU) + ++js.numFloatingPointInst; + js.fpr_is_store_safe = op.fprIsStoreSafeAfterInst; // If we have a register that will never be used again, discard or flush it. 
diff --git a/Source/Core/Core/PowerPC/JitArm64/Jit.cpp b/Source/Core/Core/PowerPC/JitArm64/Jit.cpp index 4e92b516e5d9..90bb81195ed7 100644 --- a/Source/Core/Core/PowerPC/JitArm64/Jit.cpp +++ b/Source/Core/Core/PowerPC/JitArm64/Jit.cpp @@ -1351,15 +1351,15 @@ bool JitArm64::DoJit(u32 em_address, JitBlock* b, u32 nextPC) } CompileInstruction(op); - - if (opinfo->flags & FL_LOADSTORE) - ++js.numLoadStoreInst; - - if (opinfo->flags & FL_USE_FPU) - ++js.numFloatingPointInst; } } + if (opinfo->flags & FL_LOADSTORE) + ++js.numLoadStoreInst; + + if (opinfo->flags & FL_USE_FPU) + ++js.numFloatingPointInst; + js.fpr_is_store_safe = op.fprIsStoreSafeAfterInst; if (!CanMergeNextInstructions(1) || js.op[1].opinfo->type != ::OpType::Integer)
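For reference, the same simplified model (hypothetical stand-in types, not Dolphin's real interfaces; only FL_LOADSTORE, FL_USE_FPU and the two counter names mirror the patch) showing where this third patch moves the statistics counters:

#include <cstdint>

// Hypothetical stand-ins again; only the flag and counter names are real.
constexpr std::uint32_t FL_LOADSTORE = 1u << 0;
constexpr std::uint32_t FL_USE_FPU = 1u << 1;

struct CodeOp
{
  std::uint32_t flags;  // opinfo->flags in the real code
};

struct JitState
{
  int skipInstructions = 0;
  int numLoadStoreInst = 0;
  int numFloatingPointInst = 0;
};

void CompileBlock(JitState& js, const CodeOp* ops, int count)
{
  for (int i = 0; i < count; i++)
  {
    const CodeOp& op = ops[i];

    if (js.skipInstructions != 0)
      js.skipInstructions--;  // e.g. a dcbst merged into the preceding dcbt
    else
    {
      // ... emit code; merging a pair sets js.skipInstructions = 1 ...
    }

    // Hoisted out of the else branch by this patch: an instruction that was
    // merged away still counts toward the per-block statistics.
    if (op.flags & FL_LOADSTORE)
      ++js.numLoadStoreInst;
    if (op.flags & FL_USE_FPU)
      ++js.numFloatingPointInst;
  }
}

Since skipped instructions already pass through the loop body after the second patch, hoisting the increments out of the else branch is all that is needed for a merged dcbt+dcbst pair to count both instructions.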