FEXCore: Add non-atomic Memcpy and Memset IR fast paths
When TSO is disabled, vector LDP/STP can be used for a two-instruction 32-byte
memory copy, which is significantly faster than the current byte-by-byte copy.
Performing two such copies directly after one another also marginally increases
copy speed for all sizes >= 64.
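For illustration, the 32-byte copy unit referred to above is a vector load-pair followed by a store-pair of two 128-bit Q registers, mirroring the emitter calls added in the diff below (temporary register choice as used there; the assembly in the comments is illustrative):

ldp<ARMEmitter::IndexType::POST>(VTMP1.Q(), VTMP2.Q(), TMP3, 32); // ldp q0, q1, [src], #32
stp<ARMEmitter::IndexType::POST>(VTMP1.Q(), VTMP2.Q(), TMP2, 32); // stp q0, q1, [dst], #32

With post-indexed addressing each pair also advances its pointer by 32 bytes, so a 32-byte copy plus both pointer updates costs exactly two instructions.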
bylaws committed Feb 29, 2024
1 parent 009ae55 commit 99071b2
Showing 1 changed file with 89 additions and 7 deletions.
96 changes: 89 additions & 7 deletions FEXCore/Source/Interface/Core/JIT/Arm64/MemoryOps.cpp
@@ -1784,16 +1784,59 @@ DEF_OP(MemSet) {
}
};

const auto SubRegSize =
Size == 1 ? ARMEmitter::SubRegSize::i8Bit :
Size == 2 ? ARMEmitter::SubRegSize::i16Bit :
Size == 4 ? ARMEmitter::SubRegSize::i32Bit :
Size == 8 ? ARMEmitter::SubRegSize::i64Bit : ARMEmitter::SubRegSize::i8Bit;

auto EmitMemset = [&](int32_t Direction) {
const int32_t OpSize = Size;
const int32_t SizeDirection = Size * Direction;

ARMEmitter::BackwardLabel AgainInternal{};
ARMEmitter::SingleUseForwardLabel DoneInternal{};
ARMEmitter::BiDirectionalLabel AgainInternal{};
ARMEmitter::ForwardLabel DoneInternal{};

// Early exit if zero count.
cbz(ARMEmitter::Size::i64Bit, TMP1, &DoneInternal);

if (!Op->IsAtomic) {
ARMEmitter::ForwardLabel AgainInternal256Exit{};
ARMEmitter::BackwardLabel AgainInternal256{};
ARMEmitter::ForwardLabel AgainInternal128Exit{};
ARMEmitter::BackwardLabel AgainInternal128{};

// Fallback to byte by byte loop if not 4 byte aligned
and_(ARMEmitter::Size::i64Bit, TMP4, TMP2, 0x3);
cbnz(ARMEmitter::Size::i64Bit, TMP4, &AgainInternal);

// Fill VTMP2 with the set pattern
dup(SubRegSize, VTMP2.Q(), Value);

Bind(&AgainInternal256);
// Keep the counter one copy ahead, so that underflow can be used to detect when to fallback
// to the copy unit size copy loop for the last chunk.
sub(ARMEmitter::Size::i64Bit, TMP1, TMP1, 64 / Size);
tbnz(TMP1, 63, &AgainInternal256Exit);
stp<ARMEmitter::IndexType::POST>(VTMP2.Q(), VTMP2.Q(), TMP2, 32 * Direction);
stp<ARMEmitter::IndexType::POST>(VTMP2.Q(), VTMP2.Q(), TMP2, 32 * Direction);
b(&AgainInternal256);

Bind(&AgainInternal256Exit);
add(ARMEmitter::Size::i64Bit, TMP1, TMP1, 64 / Size);
cbz(ARMEmitter::Size::i64Bit, TMP1, &DoneInternal);

Bind(&AgainInternal128);
sub(ARMEmitter::Size::i64Bit, TMP1, TMP1, 32 / Size);
tbnz(TMP1, 63, &AgainInternal128Exit);
stp<ARMEmitter::IndexType::POST>(VTMP2.Q(), VTMP2.Q(), TMP2, 32 * Direction);
b(&AgainInternal128);

Bind(&AgainInternal128Exit);
add(ARMEmitter::Size::i64Bit, TMP1, TMP1, 32 / Size);
cbz(ARMEmitter::Size::i64Bit, TMP1, &DoneInternal);
}

Bind(&AgainInternal);
if (Op->IsAtomic) {
MemStoreTSO(Value, OpSize, SizeDirection);
@@ -1943,6 +1986,10 @@ DEF_OP(MemCpy) {
ldr<ARMEmitter::IndexType::POST>(TMP4, TMP3, Size);
str<ARMEmitter::IndexType::POST>(TMP4, TMP2, Size);
break;
case 32:
ldp<ARMEmitter::IndexType::POST>(VTMP1.Q(), VTMP2.Q(), TMP3, Size);
stp<ARMEmitter::IndexType::POST>(VTMP1.Q(), VTMP2.Q(), TMP2, Size);
break;
default:
LOGMAN_MSG_A_FMT("Unhandled {} size: {}", __func__, Size);
break;
@@ -2049,11 +2096,46 @@ DEF_OP(MemCpy) {
const int32_t OpSize = Size;
const int32_t SizeDirection = Size * Direction;

ARMEmitter::BackwardLabel AgainInternal{};
ARMEmitter::SingleUseForwardLabel DoneInternal{};

// Early exit if zero count.
cbz(ARMEmitter::Size::i64Bit, TMP1, &DoneInternal);
ARMEmitter::BiDirectionalLabel AgainInternal{};
ARMEmitter::ForwardLabel DoneInternal{};

if (!Op->IsAtomic) {
ARMEmitter::ForwardLabel AgainInternal256Exit{};
ARMEmitter::ForwardLabel AgainInternal128Exit{};
ARMEmitter::BackwardLabel AgainInternal128{};
ARMEmitter::BackwardLabel AgainInternal256{};

// Early exit if zero count.
cbz(ARMEmitter::Size::i64Bit, TMP1, &DoneInternal);
orr(ARMEmitter::Size::i64Bit, TMP4, TMP2, TMP3);

// Fallback to byte by byte loop if either of start/end are not 4 byte aligned
and_(ARMEmitter::Size::i64Bit, TMP4, TMP4, 0x3);
cbnz(ARMEmitter::Size::i64Bit, TMP4, &AgainInternal);

Bind(&AgainInternal256);
// Keep the counter one copy ahead, so that underflow can be used to detect when to fallback
// to the copy unit size copy loop for the last chunk.
sub(ARMEmitter::Size::i64Bit, TMP1, TMP1, 64 / Size);
tbnz(TMP1, 63, &AgainInternal256Exit);
MemCpy(32, 32 * Direction);
MemCpy(32, 32 * Direction);
b(&AgainInternal256);

Bind(&AgainInternal256Exit);
add(ARMEmitter::Size::i64Bit, TMP1, TMP1, 64 / Size);
cbz(ARMEmitter::Size::i64Bit, TMP1, &DoneInternal);

Bind(&AgainInternal128);
sub(ARMEmitter::Size::i64Bit, TMP1, TMP1, 32 / Size);
tbnz(TMP1, 63, &AgainInternal128Exit);
MemCpy(32, 32 * Direction);
b(&AgainInternal128);

Bind(&AgainInternal128Exit);
add(ARMEmitter::Size::i64Bit, TMP1, TMP1, 32 / Size);
cbz(ARMEmitter::Size::i64Bit, TMP1, &DoneInternal);
}

Bind(&AgainInternal);
if (Op->IsAtomic) {
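As a reading aid, here is a rough, hypothetical C++ model of the loop structure the non-atomic fast paths above follow; the real code emits AArch64 through the ARMEmitter rather than calling std::memcpy, the backward-copy direction is omitted, and the function and parameter names are invented for this sketch. The point it illustrates is the counter-one-chunk-ahead trick from the comments above: the element count is decremented before each chunk is copied, so a negative result (bit 63 set, checked with tbnz) signals the fallback to the next smaller copy unit without ever over-copying.

#include <cstdint>
#include <cstring>

// Hypothetical model only: Elements stands for the element count held in TMP1,
// ElementSize for the IR op's element size (1/2/4/8 bytes).
void NonAtomicCopyModel(uint8_t* Dst, const uint8_t* Src, int64_t Elements, int64_t ElementSize) {
  if (Elements == 0) {
    return; // cbz early exit
  }

  // Fast path only when both pointers are 4-byte aligned, as in the diff.
  if (((reinterpret_cast<uintptr_t>(Dst) | reinterpret_cast<uintptr_t>(Src)) & 0x3) == 0) {
    // 64-byte loop: two back-to-back 32-byte vector copies per iteration.
    // The counter is kept one chunk ahead; going negative means the remaining
    // data no longer fills a whole chunk.
    Elements -= 64 / ElementSize;
    while (Elements >= 0) {                            // tbnz on bit 63 in the emitted code
      std::memcpy(Dst, Src, 32); Dst += 32; Src += 32; // vector ldp/stp pair #1
      std::memcpy(Dst, Src, 32); Dst += 32; Src += 32; // vector ldp/stp pair #2
      Elements -= 64 / ElementSize;
    }
    Elements += 64 / ElementSize;                      // undo the look-ahead
    if (Elements == 0) {
      return;
    }

    // 32-byte loop for what remains, using the same look-ahead trick.
    Elements -= 32 / ElementSize;
    while (Elements >= 0) {
      std::memcpy(Dst, Src, 32); Dst += 32; Src += 32;
      Elements -= 32 / ElementSize;
    }
    Elements += 32 / ElementSize;
    if (Elements == 0) {
      return;
    }
  }

  // Element-sized tail loop: the pre-existing slow path handles the rest.
  for (; Elements != 0; --Elements) {
    std::memcpy(Dst, Src, ElementSize);
    Dst += ElementSize;
    Src += ElementSize;
  }
}

The MemSet fast path is structured the same way, except that only the destination pointer's alignment is checked and the 32-byte store pair comes from a Q register filled with dup from the set pattern.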
