Skip to content

Commit

Permalink
Merge pull request #3492 from Sonicadvance1/implement_prefetch
Browse files Browse the repository at this point in the history
OpcodeDispatcher: Implement support for the various prefetch instructions
  • Loading branch information
alyssarosenzweig authored Mar 18, 2024
2 parents ba3029b + 6757a80 commit 2a9fcc6
Show file tree
Hide file tree
Showing 9 changed files with 297 additions and 62 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -3762,7 +3762,12 @@ public:
}
else {
if (MemSrc.MetaType.ImmType.Index == ARMEmitter::IndexType::OFFSET) {
prfm(prfop, MemSrc.rn, MemSrc.MetaType.ImmType.Imm);
if ((MemSrc.MetaType.ImmType.Imm & 0b111) || MemSrc.MetaType.ImmType.Imm < 0) {
prfum<IndexType::OFFSET>(prfop, MemSrc.rn, MemSrc.MetaType.ImmType.Imm);
}
else {
prfm(prfop, MemSrc.rn, MemSrc.MetaType.ImmType.Imm);
}
}
else {
LOGMAN_MSG_A_FMT("Unexpected loadstore index type");
Expand Down
41 changes: 41 additions & 0 deletions FEXCore/Source/Interface/Core/JIT/Arm64/MemoryOps.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2427,6 +2427,47 @@ DEF_OP(CacheLineZero) {
}
}

DEF_OP(Prefetch) {
  // Prefetch operation lookup table. The index is a bitfield:
  //   bit 0    - Stream (0 = KEEP, 1 = STRM)
  //   bits 2:1 - CacheLevel - 1
  //   bit 3    - ForStore (0 = PLD, 1 = PST)
  constexpr static std::array<ARMEmitter::Prefetch, 14> PrefetchType = {
    // Loads: L1, L2, L3.
    ARMEmitter::Prefetch::PLDL1KEEP,
    ARMEmitter::Prefetch::PLDL1STRM,
    ARMEmitter::Prefetch::PLDL2KEEP,
    ARMEmitter::Prefetch::PLDL2STRM,
    ARMEmitter::Prefetch::PLDL3KEEP,
    ARMEmitter::Prefetch::PLDL3STRM,

    // Padding for the two load slots (0b0'11'0 and 0b0'11'1) that would encode
    // a cache level of four, so that the store entries begin at index 8.
    ARMEmitter::Prefetch::PLDL1STRM,
    ARMEmitter::Prefetch::PLDL1STRM,

    // Stores: L1, L2, L3.
    ARMEmitter::Prefetch::PSTL1KEEP,
    ARMEmitter::Prefetch::PSTL1STRM,
    ARMEmitter::Prefetch::PSTL2KEEP,
    ARMEmitter::Prefetch::PSTL2STRM,
    ARMEmitter::Prefetch::PSTL3KEEP,
    ARMEmitter::Prefetch::PSTL3STRM,
  };

  auto Op = IROp->C<IR::IROp_Prefetch>();
  const auto MemReg = GetReg(Op->Addr.ID());

  // The access size is only ever handled as 8 bytes, even though the prefetch
  // is accessed as a cacheline.
  const auto MemSrc = GenerateMemOperand(8, MemReg, Op->Offset, Op->OffsetType, Op->OffsetScale);

  // Assemble the table index from its three bitfield components.
  const auto StreamBit = Op->Stream ? 1 : 0;
  const auto LevelBits = (Op->CacheLevel - 1) << 1;
  const auto StoreBit = Op->ForStore ? 1U << 3 : 0;
  const size_t LUT = StreamBit | LevelBits | StoreBit;

  prfm(PrefetchType[LUT], MemSrc);
}

#undef DEF_OP
}

39 changes: 34 additions & 5 deletions FEXCore/Source/Interface/Core/OpcodeDispatcher.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -5408,6 +5408,12 @@ void OpDispatchBuilder::CLZeroOp(OpcodeArgs) {
_CacheLineZero(DestMem);
}

// Opcode handler shared by the x86 prefetch instructions. The template
// arguments select store-vs-load intent, streaming hint and target cache level,
// and are forwarded unchanged to the IR-emitting Prefetch overload.
template<bool ForStore, bool Stream, uint8_t Level>
void OpDispatchBuilder::Prefetch(OpcodeArgs) {
  // LoadData = false: request the source operand without loading its data.
  Prefetch(ForStore, Stream, Level,
           LoadSource(GPRClass, Op, Op->Src[0], Op->Flags, {.LoadData = false}));
}

void OpDispatchBuilder::RDTSCPOp(OpcodeArgs) {
// RDTSCP is slightly different than RDTSC
// IA32_TSC_AUX is returned in RCX
Expand Down Expand Up @@ -6623,13 +6629,36 @@ constexpr uint16_t PF_F2 = 3;
{OPD(FEXCore::X86Tables::TYPE_GROUP_15, PF_66, 7), 1, &OpDispatchBuilder::CLFLUSHOPT},

// GROUP 16
{OPD(FEXCore::X86Tables::TYPE_GROUP_16, PF_NONE, 0), 8, &OpDispatchBuilder::NOPOp},
{OPD(FEXCore::X86Tables::TYPE_GROUP_16, PF_F3, 0), 8, &OpDispatchBuilder::NOPOp},
{OPD(FEXCore::X86Tables::TYPE_GROUP_16, PF_66, 0), 8, &OpDispatchBuilder::NOPOp},
{OPD(FEXCore::X86Tables::TYPE_GROUP_16, PF_F2, 0), 8, &OpDispatchBuilder::NOPOp},
{OPD(FEXCore::X86Tables::TYPE_GROUP_16, PF_NONE, 0), 1, &OpDispatchBuilder::Prefetch<false, true, 1>},
{OPD(FEXCore::X86Tables::TYPE_GROUP_16, PF_NONE, 1), 1, &OpDispatchBuilder::Prefetch<false, false, 1>},
{OPD(FEXCore::X86Tables::TYPE_GROUP_16, PF_NONE, 2), 1, &OpDispatchBuilder::Prefetch<false, false, 2>},
{OPD(FEXCore::X86Tables::TYPE_GROUP_16, PF_NONE, 3), 1, &OpDispatchBuilder::Prefetch<false, false, 3>},
{OPD(FEXCore::X86Tables::TYPE_GROUP_16, PF_NONE, 4), 4, &OpDispatchBuilder::NOPOp},

{OPD(FEXCore::X86Tables::TYPE_GROUP_16, PF_F3, 0), 1, &OpDispatchBuilder::Prefetch<false, true, 1>},
{OPD(FEXCore::X86Tables::TYPE_GROUP_16, PF_F3, 1), 1, &OpDispatchBuilder::Prefetch<false, false, 1>},
{OPD(FEXCore::X86Tables::TYPE_GROUP_16, PF_F3, 2), 1, &OpDispatchBuilder::Prefetch<false, false, 2>},
{OPD(FEXCore::X86Tables::TYPE_GROUP_16, PF_F3, 3), 1, &OpDispatchBuilder::Prefetch<false, false, 3>},
{OPD(FEXCore::X86Tables::TYPE_GROUP_16, PF_F3, 4), 4, &OpDispatchBuilder::NOPOp},

{OPD(FEXCore::X86Tables::TYPE_GROUP_16, PF_66, 0), 1, &OpDispatchBuilder::Prefetch<false, true, 1>},
{OPD(FEXCore::X86Tables::TYPE_GROUP_16, PF_66, 1), 1, &OpDispatchBuilder::Prefetch<false, false, 1>},
{OPD(FEXCore::X86Tables::TYPE_GROUP_16, PF_66, 2), 1, &OpDispatchBuilder::Prefetch<false, false, 2>},
{OPD(FEXCore::X86Tables::TYPE_GROUP_16, PF_66, 3), 1, &OpDispatchBuilder::Prefetch<false, false, 3>},
{OPD(FEXCore::X86Tables::TYPE_GROUP_16, PF_66, 4), 4, &OpDispatchBuilder::NOPOp},

{OPD(FEXCore::X86Tables::TYPE_GROUP_16, PF_F2, 0), 1, &OpDispatchBuilder::Prefetch<false, true, 1>},
{OPD(FEXCore::X86Tables::TYPE_GROUP_16, PF_F2, 1), 1, &OpDispatchBuilder::Prefetch<false, false, 1>},
{OPD(FEXCore::X86Tables::TYPE_GROUP_16, PF_F2, 2), 1, &OpDispatchBuilder::Prefetch<false, false, 2>},
{OPD(FEXCore::X86Tables::TYPE_GROUP_16, PF_F2, 3), 1, &OpDispatchBuilder::Prefetch<false, false, 3>},
{OPD(FEXCore::X86Tables::TYPE_GROUP_16, PF_F2, 4), 4, &OpDispatchBuilder::NOPOp},

// GROUP P
{OPD(FEXCore::X86Tables::TYPE_GROUP_P, PF_NONE, 0), 8, &OpDispatchBuilder::NOPOp},
{OPD(FEXCore::X86Tables::TYPE_GROUP_P, PF_NONE, 0), 1, &OpDispatchBuilder::Prefetch<false, false, 1>},
{OPD(FEXCore::X86Tables::TYPE_GROUP_P, PF_NONE, 1), 1, &OpDispatchBuilder::Prefetch<true, false, 1>},
{OPD(FEXCore::X86Tables::TYPE_GROUP_P, PF_NONE, 2), 1, &OpDispatchBuilder::Prefetch<true, false, 1>},
{OPD(FEXCore::X86Tables::TYPE_GROUP_P, PF_NONE, 3), 5, &OpDispatchBuilder::NOPOp},

{OPD(FEXCore::X86Tables::TYPE_GROUP_P, PF_F3, 0), 8, &OpDispatchBuilder::NOPOp},
{OPD(FEXCore::X86Tables::TYPE_GROUP_P, PF_66, 0), 8, &OpDispatchBuilder::NOPOp},
{OPD(FEXCore::X86Tables::TYPE_GROUP_P, PF_F2, 0), 8, &OpDispatchBuilder::NOPOp},
Expand Down
7 changes: 7 additions & 0 deletions FEXCore/Source/Interface/Core/OpcodeDispatcher.h
Original file line number Diff line number Diff line change
Expand Up @@ -869,6 +869,9 @@ friend class FEXCore::IR::PassManager;
void RDTSCPOp(OpcodeArgs);
void RDPIDOp(OpcodeArgs);

// Opcode handler for the prefetch instructions. ForStore, Stream and Level are
// fixed per opcode-table entry and forwarded to the Prefetch IR helper.
template<bool ForStore, bool Stream, uint8_t Level>
void Prefetch(OpcodeArgs);

void PSADBW(OpcodeArgs);

OrderedNode *BitwiseAtLeastTwo(OrderedNode *A, OrderedNode *B, OrderedNode *C);
Expand Down Expand Up @@ -2172,6 +2175,10 @@ friend class FEXCore::IR::PassManager;
return _LoadMem(Class, Size, ssa0, Invalid(), Align, MEM_OFFSET_SXTX, 1);
}

// Emits a Prefetch IR op for the address held in ssa0, with no offset node and
// the default SXTX offset type at scale 1.
OrderedNode* Prefetch(bool ForStore, bool Stream, uint8_t CacheLevel, OrderedNode *ssa0) {
  constexpr uint8_t DefaultOffsetScale = 1;
  return _Prefetch(ForStore, Stream, CacheLevel, ssa0, Invalid(), MEM_OFFSET_SXTX, DefaultOffsetScale);
}

void InstallHostSpecificOpcodeHandlers();

///< Segment telemetry tracking
Expand Down
9 changes: 9 additions & 0 deletions FEXCore/Source/Interface/IR/IR.json
Original file line number Diff line number Diff line change
Expand Up @@ -603,6 +603,15 @@
"Ensures the memory operations are globally visible"
],
"HasSideEffects": true
},
"Prefetch i1:$ForStore, i1:$Stream, i8:$CacheLevel, GPR:$Addr, GPR:$Offset, MemOffsetType:$OffsetType, u8:$OffsetScale": {
"Desc": ["Does a cacheline prefetch operation"
],
"EmitValidation": [
"_CacheLevel > 0 && _CacheLevel < 4"
],
"HasSideEffects": true,
"DestSize": "8"
}
},
"Atomic": {
Expand Down
65 changes: 65 additions & 0 deletions FEXCore/Source/Interface/IR/Passes/ConstProp.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -98,6 +98,27 @@ using MemExtendedAddrResult =
static std::optional<MemExtendedAddrResult>
MemExtendedAddressing(IREmitter *IREmit, uint8_t AccessSize,
IROp_Header *AddressHeader) {
// Try to optimize: AddShift Base, LSHL(Offset, Scale)
//
// AddShift's operands are (Base, Offset): the address is Src1 + (Src2 << ShiftAmount).
// The returned tuple is consumed as (OffsetType, OffsetScale, Addr, Offset), so the
// unshifted base (Src1) must be returned as Addr and the scaled operand (Src2) as
// Offset. Returning them the other way around applies the scale to the base
// register and yields the wrong address whenever Scale != 1.
if (AddressHeader->Op == OP_ADDSHIFT) {
  auto AddShift = AddressHeader->C<IROp_AddShift>();
  if (AddShift->Shift == IR::ShiftType::LSL) {
    auto Scale = 1U << AddShift->ShiftAmount;
    // Either the shift can be folded into the memory op's scaled-index form,
    // or there is no shift at all and a plain register offset suffices.
    if (IsMemoryScale(Scale, AccessSize) || Scale == 1) {
      return std::make_optional(
        std::make_tuple(MEM_OFFSET_SXTX, (uint8_t)Scale,
                        IREmit->UnwrapNode(AddShift->Src1),
                        IREmit->UnwrapNode(AddShift->Src2)));
    }
  }

  // Non-LSL shift, or a scale the memory operand cannot encode.
  return std::nullopt;
}

LOGMAN_THROW_A_FMT(AddressHeader->Op == OP_ADD, "Invalid address Op");
auto Src0Header = IREmit->GetOpHeader(AddressHeader->Args[0]);
if (Src0Header->Size == 8) {
Expand Down Expand Up @@ -633,6 +654,34 @@ bool ConstProp::ConstantPropagation(IREmitter *IREmit, const IRListView& Current
break;
}

// Fold an Add/AddShift address computation into the prefetch's own
// base + offset addressing when MemExtendedAddressing can express it.
case OP_PREFETCH: {
auto Op = IROp->CW<IR::IROp_Prefetch>();
auto AddressHeader = IREmit->GetOpHeader(Op->Addr);

// Only address producers that MemExtendedAddressing understands.
const bool SupportedOp =
AddressHeader->Op == OP_ADD ||
AddressHeader->Op == OP_ADDSHIFT;

// The address op's size must match the guest mode: 8 bytes in 64-bit mode, 4 bytes otherwise.
if (SupportedOp &&
((Is64BitMode && AddressHeader->Size == 8) ||
(!Is64BitMode && AddressHeader->Size == 4))) {
auto MaybeMemAddr =
MemExtendedAddressing(IREmit, IROp->Size, AddressHeader);
if (!MaybeMemAddr) {
// Nothing foldable; leave the prefetch untouched.
break;
}
auto [OffsetType, OffsetScale, Arg0, Arg1] = *MaybeMemAddr;

// Rewrite the prefetch in place to use the folded addressing mode.
Op->OffsetType = OffsetType;
Op->OffsetScale = OffsetScale;
IREmit->ReplaceNodeArgument(CodeNode, Op->Addr_Index, Arg0); // Addr
IREmit->ReplaceNodeArgument(CodeNode, Op->Offset_Index, Arg1); // Offset

Changed = true;
}
break;
}

case OP_ADD: {
auto Op = IROp->C<IR::IROp_Add>();
uint64_t Constant1{};
Expand Down Expand Up @@ -1351,6 +1400,22 @@ bool ConstProp::ConstantInlining(IREmitter *IREmit, const IRListView& CurrentIR)
break;
}

// Replace a constant prefetch offset with an inline constant when it passes IsImmMemory.
case OP_PREFETCH:
{
auto Op = IROp->CW<IR::IROp_Prefetch>();

uint64_t Constant2{};
// Only SXTX offsets whose value is a known constant are candidates.
if (Op->OffsetType == MEM_OFFSET_SXTX && IREmit->IsValueConstant(Op->Offset, &Constant2)) {
// IsImmMemory decides whether the constant is usable for this op's size.
if (IsImmMemory(Constant2, IROp->Size)) {
// Position the write cursor at the offset node before creating the inline constant.
IREmit->SetWriteCursor(CurrentIR.GetNode(Op->Offset));

IREmit->ReplaceNodeArgument(CodeNode, Op->Offset_Index, CreateInlineConstant(IREmit, Constant2));

Changed = true;
}
}
break;
}
default:
break;
}
Expand Down
65 changes: 65 additions & 0 deletions unittests/InstructionCountCI/FEXOpt/AddressingLimitations.json
Original file line number Diff line number Diff line change
Expand Up @@ -855,6 +855,71 @@
"mov w20, w20",
"ldr d16, [x20]"
]
},
"prefetch [rcx - 257]": {
"ExpectedInstructionCount": 2,
"ExpectedArm64ASM": [
"sub x20, x5, #0x101 (257)",
"prfm pldl1keep, [x20]"
]
},
"prefetch [rcx - 256]": {
"ExpectedInstructionCount": 2,
"ExpectedArm64ASM": [
"sub x20, x5, #0x100 (256)",
"prfm pldl1keep, [x20]"
]
},
"prefetch [rcx + 255]": {
"ExpectedInstructionCount": 1,
"ExpectedArm64ASM": [
"prfum pldl1keep, [x5, #255]"
]
},
"prefetch [rcx + 256]": {
"ExpectedInstructionCount": 1,
"ExpectedArm64ASM": [
"prfm pldl1keep, [x5, #256]"
]
},
"prefetch [rcx + 32760]": {
"ExpectedInstructionCount": 1,
"ExpectedArm64ASM": [
"prfm pldl1keep, [x5, #32760]"
]
},
"prefetch [rcx + 32761]": {
"ExpectedInstructionCount": 2,
"ExpectedArm64ASM": [
"mov w20, #0x7ff9",
"prfm pldl1keep, [x5, x20, sxtx]"
]
},
"prefetch [rax + rcx*1]": {
"ExpectedInstructionCount": 1,
"ExpectedArm64ASM": [
"prfm pldl1keep, [x4, x5, sxtx]"
]
},
"prefetch [rax + rcx*2]": {
"ExpectedInstructionCount": 2,
"ExpectedArm64ASM": [
"add x20, x4, x5, lsl #1",
"prfm pldl1keep, [x20]"
]
},
"prefetch [rax + rcx*4]": {
"ExpectedInstructionCount": 2,
"ExpectedArm64ASM": [
"add x20, x4, x5, lsl #2",
"prfm pldl1keep, [x20]"
]
},
"prefetch [rax + rcx*8]": {
"ExpectedInstructionCount": 1,
"ExpectedArm64ASM": [
"prfm pldl1keep, [x5, x4, sxtx #3]"
]
}
}
}
Loading

0 comments on commit 2a9fcc6

Please sign in to comment.