diff --git a/FEXCore/Source/Interface/Core/JIT/Arm64/MemoryOps.cpp b/FEXCore/Source/Interface/Core/JIT/Arm64/MemoryOps.cpp
index 40472ac20a..a46bc9de29 100644
--- a/FEXCore/Source/Interface/Core/JIT/Arm64/MemoryOps.cpp
+++ b/FEXCore/Source/Interface/Core/JIT/Arm64/MemoryOps.cpp
@@ -1784,16 +1784,59 @@ DEF_OP(MemSet) {
     }
   };
 
+  const auto SubRegSize =
+    Size == 1 ? ARMEmitter::SubRegSize::i8Bit :
+    Size == 2 ? ARMEmitter::SubRegSize::i16Bit :
+    Size == 4 ? ARMEmitter::SubRegSize::i32Bit :
+    Size == 8 ? ARMEmitter::SubRegSize::i64Bit : ARMEmitter::SubRegSize::i8Bit;
+
   auto EmitMemset = [&](int32_t Direction) {
     const int32_t OpSize = Size;
     const int32_t SizeDirection = Size * Direction;
 
-    ARMEmitter::BackwardLabel AgainInternal{};
-    ARMEmitter::SingleUseForwardLabel DoneInternal{};
+    ARMEmitter::BiDirectionalLabel AgainInternal{};
+    ARMEmitter::ForwardLabel DoneInternal{};
 
     // Early exit if zero count.
     cbz(ARMEmitter::Size::i64Bit, TMP1, &DoneInternal);
 
+    if (!Op->IsAtomic) {
+      ARMEmitter::ForwardLabel AgainInternal256Exit{};
+      ARMEmitter::BackwardLabel AgainInternal256{};
+      ARMEmitter::ForwardLabel AgainInternal128Exit{};
+      ARMEmitter::BackwardLabel AgainInternal128{};
+
+      // Fall back to the byte-by-byte loop if the destination is not 4-byte aligned.
+      and_(ARMEmitter::Size::i64Bit, TMP4, TMP2, 0x3);
+      cbnz(ARMEmitter::Size::i64Bit, TMP4, &AgainInternal);
+
+      // Fill VTMP2 with the set pattern.
+      dup(SubRegSize, VTMP2.Q(), Value);
+
+      Bind(&AgainInternal256);
+      // Keep the counter one copy ahead, so that underflow can be used to detect when to fall back
+      // to the copy-unit-sized loop for the last chunk.
+      sub(ARMEmitter::Size::i64Bit, TMP1, TMP1, 64 / Size);
+      tbnz(TMP1, 63, &AgainInternal256Exit);
+      stp(VTMP2.Q(), VTMP2.Q(), TMP2, 32 * Direction);
+      stp(VTMP2.Q(), VTMP2.Q(), TMP2, 32 * Direction);
+      b(&AgainInternal256);
+
+      Bind(&AgainInternal256Exit);
+      add(ARMEmitter::Size::i64Bit, TMP1, TMP1, 64 / Size);
+      cbz(ARMEmitter::Size::i64Bit, TMP1, &DoneInternal);
+
+      Bind(&AgainInternal128);
+      sub(ARMEmitter::Size::i64Bit, TMP1, TMP1, 32 / Size);
+      tbnz(TMP1, 63, &AgainInternal128Exit);
+      stp(VTMP2.Q(), VTMP2.Q(), TMP2, 32 * Direction);
+      b(&AgainInternal128);
+
+      Bind(&AgainInternal128Exit);
+      add(ARMEmitter::Size::i64Bit, TMP1, TMP1, 32 / Size);
+      cbz(ARMEmitter::Size::i64Bit, TMP1, &DoneInternal);
+    }
+
     Bind(&AgainInternal);
     if (Op->IsAtomic) {
       MemStoreTSO(Value, OpSize, SizeDirection);
@@ -1943,6 +1986,10 @@ DEF_OP(MemCpy) {
        ldr(TMP4, TMP3, Size);
        str(TMP4, TMP2, Size);
        break;
+      case 32:
+        ldp(VTMP1.Q(), VTMP2.Q(), TMP3, Size);
+        stp(VTMP1.Q(), VTMP2.Q(), TMP2, Size);
+        break;
      default:
        LOGMAN_MSG_A_FMT("Unhandled {} size: {}", __func__, Size);
        break;
@@ -2049,11 +2096,46 @@ DEF_OP(MemCpy) {
     const int32_t OpSize = Size;
     const int32_t SizeDirection = Size * Direction;
 
-    ARMEmitter::BackwardLabel AgainInternal{};
-    ARMEmitter::SingleUseForwardLabel DoneInternal{};
-
-    // Early exit if zero count.
-    cbz(ARMEmitter::Size::i64Bit, TMP1, &DoneInternal);
+    ARMEmitter::BiDirectionalLabel AgainInternal{};
+    ARMEmitter::ForwardLabel DoneInternal{};
+
+    if (!Op->IsAtomic) {
+      ARMEmitter::ForwardLabel AgainInternal256Exit{};
+      ARMEmitter::ForwardLabel AgainInternal128Exit{};
+      ARMEmitter::BackwardLabel AgainInternal128{};
+      ARMEmitter::BackwardLabel AgainInternal256{};
+
+      // Early exit if zero count.
+      cbz(ARMEmitter::Size::i64Bit, TMP1, &DoneInternal);
+      orr(ARMEmitter::Size::i64Bit, TMP4, TMP2, TMP3);
+
+      // Fall back to the byte-by-byte loop if either the source or destination is not 4-byte aligned.
+      and_(ARMEmitter::Size::i64Bit, TMP4, TMP4, 0x3);
+      cbnz(ARMEmitter::Size::i64Bit, TMP4, &AgainInternal);
+
+      Bind(&AgainInternal256);
+      // Keep the counter one copy ahead, so that underflow can be used to detect when to fall back
+      // to the copy-unit-sized loop for the last chunk.
+      sub(ARMEmitter::Size::i64Bit, TMP1, TMP1, 64 / Size);
+      tbnz(TMP1, 63, &AgainInternal256Exit);
+      MemCpy(32, 32 * Direction);
+      MemCpy(32, 32 * Direction);
+      b(&AgainInternal256);
+
+      Bind(&AgainInternal256Exit);
+      add(ARMEmitter::Size::i64Bit, TMP1, TMP1, 64 / Size);
+      cbz(ARMEmitter::Size::i64Bit, TMP1, &DoneInternal);
+
+      Bind(&AgainInternal128);
+      sub(ARMEmitter::Size::i64Bit, TMP1, TMP1, 32 / Size);
+      tbnz(TMP1, 63, &AgainInternal128Exit);
+      MemCpy(32, 32 * Direction);
+      b(&AgainInternal128);
+
+      Bind(&AgainInternal128Exit);
+      add(ARMEmitter::Size::i64Bit, TMP1, TMP1, 32 / Size);
+      cbz(ARMEmitter::Size::i64Bit, TMP1, &DoneInternal);
+    }
 
     Bind(&AgainInternal);
     if (Op->IsAtomic) {
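For reviewers tracing the emitted control flow: both fast paths keep the remaining-element counter one block ahead of the stores, so the tbnz on bit 63 acts as a single-instruction underflow test that decides when to drop from 64-byte blocks to 32-byte blocks and finally to the per-element loop. The following is a minimal host-side C++ sketch of that structure; it is not part of the patch, the names are illustrative only, and it omits the alignment check and the atomic/TSO case that gate the fast path in the emitted code.

#include <cstdint>

// Hypothetical illustration of the block-ahead counter scheme; not FEX code.
// T models the element size selected by the IR op (1/2/4/8 bytes).
template <typename T>
void MemsetFastPathSketch(T* Dst, T Value, int64_t Elements) {
  constexpr int64_t Per64 = 64 / sizeof(T); // elements covered by one 64-byte block
  constexpr int64_t Per32 = 32 / sizeof(T); // elements covered by one 32-byte block
  int64_t Remaining = Elements;

  // 64-byte blocks: subtract first; a negative result means less than a full block
  // remains, so restore the counter and fall through (the tbnz-on-bit-63 check).
  while ((Remaining -= Per64) >= 0) {
    for (int64_t i = 0; i < Per64; ++i) {
      *Dst++ = Value;
    }
  }
  Remaining += Per64;

  // 32-byte blocks for the mid-sized tail.
  while ((Remaining -= Per32) >= 0) {
    for (int64_t i = 0; i < Per32; ++i) {
      *Dst++ = Value;
    }
  }
  Remaining += Per32;

  // Per-element loop for whatever is left, corresponding to the AgainInternal loop.
  while (Remaining-- > 0) {
    *Dst++ = Value;
  }
}

Subtracting before the store and testing the sign bit avoids a separate compare against the block size on every iteration, and the add after each exit label restores the counter so the smaller-block loops see the true remainder.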