From 7417efe6006f8b01458e3d78184363276d6c0a65 Mon Sep 17 00:00:00 2001 From: JosJuice Date: Sun, 29 Dec 2024 23:06:53 +0100 Subject: [PATCH] JitArm64: Use GetRegWithPreference for EmitBackpatchRoutine addr I'm adding a new function to the register cache called GetRegWithPreference. If the passed-in register is unlocked, it gets locked. Otherwise, GetReg is called. The function also has a GetScopedRegWithPreference variant. Then, I'm making JitArm64 call this function when allocating an address register for use with EmitBackpatchRoutine. This way, when register pressure is low we can use the optimal register, and when register pressure is high (but not completely full) we can sacrifice a bit of farcode size for not having to evict a register from the register cache. --- .../PowerPC/JitArm64/JitArm64_LoadStore.cpp | 43 ++++++++++--------- .../JitArm64/JitArm64_LoadStoreFloating.cpp | 16 +++---- .../JitArm64/JitArm64_LoadStorePaired.cpp | 26 +++++------ .../PowerPC/JitArm64/JitArm64_RegCache.cpp | 25 +++++++++++ .../Core/PowerPC/JitArm64/JitArm64_RegCache.h | 11 ++++- 5 files changed, 78 insertions(+), 43 deletions(-) diff --git a/Source/Core/Core/PowerPC/JitArm64/JitArm64_LoadStore.cpp b/Source/Core/Core/PowerPC/JitArm64/JitArm64_LoadStore.cpp index 3f872600c6ce..44681cdf14e5 100644 --- a/Source/Core/Core/PowerPC/JitArm64/JitArm64_LoadStore.cpp +++ b/Source/Core/Core/PowerPC/JitArm64/JitArm64_LoadStore.cpp @@ -30,10 +30,12 @@ using namespace Arm64Gen; void JitArm64::SafeLoadToReg(u32 dest, s32 addr, s32 offsetReg, u32 flags, s32 offset, bool update) { // We want to make sure to not get LR as a temp register - gpr.Lock(ARM64Reg::W1, ARM64Reg::W30); + gpr.Lock(ARM64Reg::W30); if (jo.memcheck) gpr.Lock(ARM64Reg::W0); + const Arm64RegCache::ScopedARM64Reg addr_reg = gpr.GetScopedRegWithPreference(ARM64Reg::W1); + gpr.BindToRegister(dest, dest == (u32)addr || dest == (u32)offsetReg, false); ARM64Reg dest_reg = gpr.R(dest); ARM64Reg up_reg = ARM64Reg::INVALID_REG; @@ -45,7 
+47,6 @@ void JitArm64::SafeLoadToReg(u32 dest, s32 addr, s32 offsetReg, u32 flags, s32 o if (offsetReg != -1 && !gpr.IsImm(offsetReg)) off_reg = gpr.R(offsetReg); - ARM64Reg addr_reg = ARM64Reg::W1; u32 imm_addr = 0; bool is_immediate = false; @@ -124,7 +125,7 @@ void JitArm64::SafeLoadToReg(u32 dest, s32 addr, s32 offsetReg, u32 flags, s32 o BitSet32 scratch_gprs; BitSet32 scratch_fprs; if (!update || early_update) - scratch_gprs[DecodeReg(ARM64Reg::W1)] = true; + scratch_gprs[DecodeReg(addr_reg)] = true; if (jo.memcheck) scratch_gprs[DecodeReg(ARM64Reg::W0)] = true; @@ -141,7 +142,7 @@ void JitArm64::SafeLoadToReg(u32 dest, s32 addr, s32 offsetReg, u32 flags, s32 o } else if (mmio_address) { - scratch_gprs[DecodeReg(ARM64Reg::W1)] = true; + scratch_gprs[DecodeReg(addr_reg)] = true; scratch_gprs[DecodeReg(ARM64Reg::W30)] = true; scratch_gprs[DecodeReg(dest_reg)] = true; MMIOLoadToReg(m_system, m_system.GetMemory().GetMMIOMapping(), this, &m_float_emit, @@ -166,7 +167,7 @@ void JitArm64::SafeLoadToReg(u32 dest, s32 addr, s32 offsetReg, u32 flags, s32 o MOV(gpr.R(addr), addr_reg); } - gpr.Unlock(ARM64Reg::W1, ARM64Reg::W30); + gpr.Unlock(ARM64Reg::W30); if (jo.memcheck) gpr.Unlock(ARM64Reg::W0); } @@ -175,7 +176,9 @@ void JitArm64::SafeStoreFromReg(s32 dest, u32 value, s32 regOffset, u32 flags, s bool update) { // We want to make sure to not get LR as a temp register - gpr.Lock(ARM64Reg::W2, ARM64Reg::W30); + gpr.Lock(ARM64Reg::W30); + + const Arm64RegCache::ScopedARM64Reg addr_reg = gpr.GetScopedRegWithPreference(ARM64Reg::W2); // Don't materialize zero. ARM64Reg RS = gpr.IsImm(value, 0) ? 
ARM64Reg::WZR : gpr.R(value); @@ -188,8 +191,6 @@ void JitArm64::SafeStoreFromReg(s32 dest, u32 value, s32 regOffset, u32 flags, s if (dest != -1 && !gpr.IsImm(dest)) reg_dest = gpr.R(dest); - ARM64Reg addr_reg = ARM64Reg::W2; - u32 imm_addr = 0; bool is_immediate = false; @@ -268,7 +269,7 @@ void JitArm64::SafeStoreFromReg(s32 dest, u32 value, s32 regOffset, u32 flags, s BitSet32 scratch_gprs; BitSet32 scratch_fprs; if (!update || early_update) - scratch_gprs[DecodeReg(ARM64Reg::W2)] = true; + scratch_gprs[DecodeReg(addr_reg)] = true; u32 access_size = BackPatchInfo::GetFlagSize(flags); u32 mmio_address = 0; @@ -309,7 +310,7 @@ void JitArm64::SafeStoreFromReg(s32 dest, u32 value, s32 regOffset, u32 flags, s } else if (mmio_address) { - scratch_gprs[DecodeReg(ARM64Reg::W2)] = true; + scratch_gprs[DecodeReg(addr_reg)] = true; scratch_gprs[DecodeReg(ARM64Reg::W30)] = true; scratch_gprs[DecodeReg(RS)] = 0; MMIOWriteRegToAddr(m_system, m_system.GetMemory().GetMMIOMapping(), this, &m_float_emit, @@ -330,7 +331,7 @@ void JitArm64::SafeStoreFromReg(s32 dest, u32 value, s32 regOffset, u32 flags, s MOV(gpr.R(dest), addr_reg); } - gpr.Unlock(ARM64Reg::W2, ARM64Reg::W30); + gpr.Unlock(ARM64Reg::W30); } FixupBranch JitArm64::BATAddressLookup(ARM64Reg addr_out, ARM64Reg addr_in, ARM64Reg tmp, @@ -512,13 +513,13 @@ void JitArm64::lmw(UGeckoInstruction inst) u32 a = inst.RA, d = inst.RD; s32 offset = inst.SIMM_16; - gpr.Lock(ARM64Reg::W1, ARM64Reg::W30); + gpr.Lock(ARM64Reg::W30); if (jo.memcheck) gpr.Lock(ARM64Reg::W0); // MMU games make use of a >= d despite this being invalid according to the PEM. // If a >= d occurs, we must make sure to not re-read rA after starting doing the loads. 
- ARM64Reg addr_reg = ARM64Reg::W1; + const Arm64RegCache::ScopedARM64Reg addr_reg = gpr.GetScopedRegWithPreference(ARM64Reg::W1); Arm64RegCache::ScopedARM64Reg addr_base_reg; bool a_is_addr_base_reg = false; if (!a) @@ -634,7 +635,7 @@ void JitArm64::lmw(UGeckoInstruction inst) } } - gpr.Unlock(ARM64Reg::W1, ARM64Reg::W30); + gpr.Unlock(ARM64Reg::W30); if (jo.memcheck) gpr.Unlock(ARM64Reg::W0); } @@ -647,9 +648,9 @@ void JitArm64::stmw(UGeckoInstruction inst) u32 a = inst.RA, s = inst.RS; s32 offset = inst.SIMM_16; - gpr.Lock(ARM64Reg::W2, ARM64Reg::W30); + gpr.Lock(ARM64Reg::W30); - ARM64Reg addr_reg = ARM64Reg::W2; + const Arm64RegCache::ScopedARM64Reg addr_reg = gpr.GetScopedRegWithPreference(ARM64Reg::W2); Arm64RegCache::ScopedARM64Reg addr_base_reg; bool a_is_addr_base_reg = false; if (!a) @@ -767,7 +768,7 @@ void JitArm64::stmw(UGeckoInstruction inst) } } - gpr.Unlock(ARM64Reg::W2, ARM64Reg::W30); + gpr.Unlock(ARM64Reg::W30); } void JitArm64::dcbx(UGeckoInstruction inst) @@ -987,11 +988,11 @@ void JitArm64::dcbz(UGeckoInstruction inst) int a = inst.RA, b = inst.RB; - gpr.Lock(ARM64Reg::W1, ARM64Reg::W30); + gpr.Lock(ARM64Reg::W30); - Common::ScopeGuard register_guard([&] { gpr.Unlock(ARM64Reg::W1, ARM64Reg::W30); }); + Common::ScopeGuard register_guard([&] { gpr.Unlock(ARM64Reg::W30); }); - constexpr ARM64Reg addr_reg = ARM64Reg::W1; + const Arm64RegCache::ScopedARM64Reg addr_reg = gpr.GetScopedRegWithPreference(ARM64Reg::W1); constexpr ARM64Reg temp_reg = ARM64Reg::W30; // HACK: Don't clear any memory in the [0x8000'0000, 0x8000'8000) region. 
@@ -1055,7 +1056,7 @@ void JitArm64::dcbz(UGeckoInstruction inst) BitSet32 scratch_gprs; BitSet32 scratch_fprs; - scratch_gprs[DecodeReg(ARM64Reg::W1)] = true; + scratch_gprs[DecodeReg(addr_reg)] = true; EmitBackpatchRoutine(BackPatchInfo::FLAG_ZERO_256, MemAccessMode::Auto, ARM64Reg::W1, addr_reg, scratch_gprs, scratch_fprs); diff --git a/Source/Core/Core/PowerPC/JitArm64/JitArm64_LoadStoreFloating.cpp b/Source/Core/Core/PowerPC/JitArm64/JitArm64_LoadStoreFloating.cpp index ff7d37df96a5..18e86a318533 100644 --- a/Source/Core/Core/PowerPC/JitArm64/JitArm64_LoadStoreFloating.cpp +++ b/Source/Core/Core/PowerPC/JitArm64/JitArm64_LoadStoreFloating.cpp @@ -77,12 +77,12 @@ void JitArm64::lfXX(UGeckoInstruction inst) const RegType type = (flags & BackPatchInfo::FLAG_SIZE_64) != 0 ? RegType::LowerPair : RegType::DuplicatedSingle; - gpr.Lock(ARM64Reg::W1, ARM64Reg::W30); + gpr.Lock(ARM64Reg::W30); if (jo.memcheck) gpr.Lock(ARM64Reg::W0); + const Arm64RegCache::ScopedARM64Reg addr_reg = gpr.GetScopedRegWithPreference(ARM64Reg::W1); const ARM64Reg VD = fpr.RW(inst.FD, type, false); - ARM64Reg addr_reg = ARM64Reg::W1; if (update) { @@ -164,7 +164,7 @@ void JitArm64::lfXX(UGeckoInstruction inst) BitSet32 scratch_gprs; BitSet32 scratch_fprs; if (!update || early_update) - scratch_gprs[DecodeReg(ARM64Reg::W1)] = true; + scratch_gprs[DecodeReg(addr_reg)] = true; if (jo.memcheck) scratch_gprs[DecodeReg(ARM64Reg::W0)] = true; @@ -187,7 +187,7 @@ void JitArm64::lfXX(UGeckoInstruction inst) MOV(gpr.R(a), addr_reg); } - gpr.Unlock(ARM64Reg::W1, ARM64Reg::W30); + gpr.Unlock(ARM64Reg::W30); if (jo.memcheck) gpr.Unlock(ARM64Reg::W0); } @@ -270,9 +270,9 @@ void JitArm64::stfXX(UGeckoInstruction inst) V0 = std::move(single_reg); } - gpr.Lock(ARM64Reg::W2, ARM64Reg::W30); + gpr.Lock(ARM64Reg::W30); - ARM64Reg addr_reg = ARM64Reg::W2; + const Arm64RegCache::ScopedARM64Reg addr_reg = gpr.GetScopedRegWithPreference(ARM64Reg::W2); if (update) { @@ -358,7 +358,7 @@ void 
JitArm64::stfXX(UGeckoInstruction inst) BitSet32 scratch_gprs; BitSet32 scratch_fprs; if (!update || early_update) - scratch_gprs[DecodeReg(ARM64Reg::W2)] = true; + scratch_gprs[DecodeReg(addr_reg)] = true; if (is_immediate) { @@ -409,5 +409,5 @@ void JitArm64::stfXX(UGeckoInstruction inst) MOV(gpr.R(a), addr_reg); } - gpr.Unlock(ARM64Reg::W2, ARM64Reg::W30); + gpr.Unlock(ARM64Reg::W30); } diff --git a/Source/Core/Core/PowerPC/JitArm64/JitArm64_LoadStorePaired.cpp b/Source/Core/Core/PowerPC/JitArm64/JitArm64_LoadStorePaired.cpp index b225c83eab6d..803d7a5dae67 100644 --- a/Source/Core/Core/PowerPC/JitArm64/JitArm64_LoadStorePaired.cpp +++ b/Source/Core/Core/PowerPC/JitArm64/JitArm64_LoadStorePaired.cpp @@ -38,10 +38,10 @@ void JitArm64::psq_lXX(UGeckoInstruction inst) const int i = indexed ? inst.Ix : inst.I; const int w = indexed ? inst.Wx : inst.W; - gpr.Lock(ARM64Reg::W1, ARM64Reg::W30); + gpr.Lock(ARM64Reg::W30); if (!js.assumeNoPairedQuantize) { - gpr.Lock(ARM64Reg::W0, ARM64Reg::W2, ARM64Reg::W3); + gpr.Lock(ARM64Reg::W0, ARM64Reg::W1, ARM64Reg::W2, ARM64Reg::W3); fpr.Lock(ARM64Reg::Q0, ARM64Reg::Q1); } else if (jo.memcheck) @@ -50,7 +50,8 @@ void JitArm64::psq_lXX(UGeckoInstruction inst) } constexpr ARM64Reg type_reg = ARM64Reg::W0; - constexpr ARM64Reg addr_reg = ARM64Reg::W1; + const auto addr_reg = js.assumeNoPairedQuantize ? 
gpr.GetScopedRegWithPreference(ARM64Reg::W1) : + Arm64RegCache::ScopedARM64Reg(ARM64Reg::W1); constexpr ARM64Reg scale_reg = ARM64Reg::W2; ARM64Reg VS = fpr.RW(inst.RS, RegType::Single, false); @@ -82,7 +83,7 @@ void JitArm64::psq_lXX(UGeckoInstruction inst) BitSet32 scratch_fprs; if (!update || early_update) - scratch_gprs[DecodeReg(ARM64Reg::W1)] = true; + scratch_gprs[DecodeReg(addr_reg)] = true; if (jo.memcheck) scratch_gprs[DecodeReg(ARM64Reg::W0)] = true; @@ -127,10 +128,10 @@ void JitArm64::psq_lXX(UGeckoInstruction inst) MOV(gpr.R(inst.RA), addr_reg); } - gpr.Unlock(ARM64Reg::W1, ARM64Reg::W30); + gpr.Unlock(ARM64Reg::W30); if (!js.assumeNoPairedQuantize) { - gpr.Unlock(ARM64Reg::W0, ARM64Reg::W2, ARM64Reg::W3); + gpr.Unlock(ARM64Reg::W0, ARM64Reg::W1, ARM64Reg::W2, ARM64Reg::W3); fpr.Unlock(ARM64Reg::Q0, ARM64Reg::Q1); } else if (jo.memcheck) @@ -197,17 +198,18 @@ void JitArm64::psq_stXX(UGeckoInstruction inst) } } - gpr.Lock(ARM64Reg::W2, ARM64Reg::W30); + gpr.Lock(ARM64Reg::W30); if (!js.assumeNoPairedQuantize) { - gpr.Lock(ARM64Reg::W0, ARM64Reg::W1); + gpr.Lock(ARM64Reg::W0, ARM64Reg::W1, ARM64Reg::W2); if (!jo.fastmem) gpr.Lock(ARM64Reg::W3); } constexpr ARM64Reg type_reg = ARM64Reg::W0; constexpr ARM64Reg scale_reg = ARM64Reg::W1; - constexpr ARM64Reg addr_reg = ARM64Reg::W2; + const auto addr_reg = js.assumeNoPairedQuantize ? 
gpr.GetScopedRegWithPreference(ARM64Reg::W2) : + Arm64RegCache::ScopedARM64Reg(ARM64Reg::W2); if (inst.RA || update) // Always uses the register on update { @@ -237,7 +239,7 @@ void JitArm64::psq_stXX(UGeckoInstruction inst) BitSet32 scratch_fprs; if (!update || early_update) - scratch_gprs[DecodeReg(ARM64Reg::W2)] = true; + scratch_gprs[DecodeReg(addr_reg)] = true; u32 flags = BackPatchInfo::FLAG_STORE | BackPatchInfo::FLAG_FLOAT | BackPatchInfo::FLAG_SIZE_32; if (!w) @@ -269,10 +271,10 @@ void JitArm64::psq_stXX(UGeckoInstruction inst) MOV(gpr.R(inst.RA), addr_reg); } - gpr.Unlock(ARM64Reg::W2, ARM64Reg::W30); + gpr.Unlock(ARM64Reg::W30); if (!js.assumeNoPairedQuantize) { - gpr.Unlock(ARM64Reg::W0, ARM64Reg::W1); + gpr.Unlock(ARM64Reg::W0, ARM64Reg::W1, ARM64Reg::W2); if (!jo.fastmem) gpr.Unlock(ARM64Reg::W3); fpr.Unlock(ARM64Reg::Q0, ARM64Reg::Q1); diff --git a/Source/Core/Core/PowerPC/JitArm64/JitArm64_RegCache.cpp b/Source/Core/Core/PowerPC/JitArm64/JitArm64_RegCache.cpp index 465d0257afa4..60dde40c6175 100644 --- a/Source/Core/Core/PowerPC/JitArm64/JitArm64_RegCache.cpp +++ b/Source/Core/Core/PowerPC/JitArm64/JitArm64_RegCache.cpp @@ -5,6 +5,7 @@ #include <algorithm> #include <cstddef> +#include <ranges> #include <vector> #include "Common/Assert.h" @@ -63,6 +64,30 @@ ARM64Reg Arm64RegCache::GetReg() return ARM64Reg::INVALID_REG; } +ARM64Reg Arm64RegCache::GetRegWithPreference(Arm64Gen::ARM64Reg preferred) +{ + // In practice, the preferred register tends to be towards the end of m_host_registers, + // so we scan through m_host_registers backwards + for (auto& it : m_host_registers | std::views::reverse) + { + if (it.GetReg() == preferred) + { + if (it.IsLocked()) + { + return GetReg(); + } + else + { + it.Lock(); + return it.GetReg(); + } + } + } + ASSERT_MSG(DYNA_REC, false, "Preferred register {:#x} is not in register cache", + static_cast<int>(preferred)); + return ARM64Reg::INVALID_REG; +} + void Arm64RegCache::UpdateLastUsed(BitSet32 regs_used) { for (size_t i = 0; i < m_guest_registers.size(); 
++i) diff --git a/Source/Core/Core/PowerPC/JitArm64/JitArm64_RegCache.h b/Source/Core/Core/PowerPC/JitArm64/JitArm64_RegCache.h index b98e17053186..72a2c0af5964 100644 --- a/Source/Core/Core/PowerPC/JitArm64/JitArm64_RegCache.h +++ b/Source/Core/Core/PowerPC/JitArm64/JitArm64_RegCache.h @@ -183,13 +183,16 @@ class Arm64RegCache // Returns a temporary register for use // Requires unlocking after done Arm64Gen::ARM64Reg GetReg(); + Arm64Gen::ARM64Reg GetRegWithPreference(Arm64Gen::ARM64Reg preferred); class ScopedARM64Reg { public: inline ScopedARM64Reg() = default; ScopedARM64Reg(const ScopedARM64Reg&) = delete; - explicit inline ScopedARM64Reg(Arm64RegCache& cache) : m_reg(cache.GetReg()), m_gpr(&cache) {} + inline ScopedARM64Reg(Arm64RegCache& cache, Arm64Gen::ARM64Reg reg) : m_reg(reg), m_gpr(&cache) + { + } inline ScopedARM64Reg(Arm64Gen::ARM64Reg reg) : m_reg(reg) {} inline ScopedARM64Reg(ScopedARM64Reg&& scoped_reg) { *this = std::move(scoped_reg); } inline ~ScopedARM64Reg() { Unlock(); } @@ -235,7 +238,11 @@ class Arm64RegCache // Returns a temporary register // Unlocking is implicitly handled through RAII - inline ScopedARM64Reg GetScopedReg() { return ScopedARM64Reg(*this); } + inline ScopedARM64Reg GetScopedReg() { return ScopedARM64Reg(*this, GetReg()); } + inline ScopedARM64Reg GetScopedRegWithPreference(Arm64Gen::ARM64Reg preferred) + { + return ScopedARM64Reg(*this, GetRegWithPreference(preferred)); + } void UpdateLastUsed(BitSet32 regs_used);