Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Use scalar integer code to calculate PF #3545

Merged
merged 4 commits into from
Apr 2, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 10 additions & 0 deletions FEXCore/Source/Interface/Core/JIT/Arm64/ALUOps.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -769,6 +769,16 @@ DEF_OP(XorShift) {
eor(EmitSize, GetReg(Node), GetReg(Op->Src1.ID()), GetReg(Op->Src2.ID()), ConvertIRShiftType(Op->Shift), Op->ShiftAmount);
}

// Emits an AArch64 EON (exclusive-OR-NOT) whose second operand is a shifted
// register, implementing the XornShift IR op. Only 4-byte and 8-byte
// operation sizes are accepted.
DEF_OP(XornShift) {
  const auto Op = IROp->C<IR::IROp_XornShift>();
  const uint8_t OpSize = IROp->Size;
  LOGMAN_THROW_AA_FMT(OpSize == 4 || OpSize == 8, "Unsupported {} size: {}", __func__, OpSize);

  // Pick the register width matching the IR operation size.
  auto EmitSize = ARMEmitter::Size::i32Bit;
  if (OpSize == 8) {
    EmitSize = ARMEmitter::Size::i64Bit;
  }

  const auto Dst = GetReg(Node);
  const auto Lhs = GetReg(Op->Src1.ID());
  const auto Rhs = GetReg(Op->Src2.ID());

  // The shift type and amount come straight from the IR op and apply to Rhs.
  eon(EmitSize, Dst, Lhs, Rhs, ConvertIRShiftType(Op->Shift), Op->ShiftAmount);
}

DEF_OP(Lshl) {
auto Op = IROp->C<IR::IROp_Lshl>();
const uint8_t OpSize = IROp->Size;
Expand Down
14 changes: 5 additions & 9 deletions FEXCore/Source/Interface/Core/OpcodeDispatcher.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -775,24 +775,20 @@ void OpDispatchBuilder::CALLAbsoluteOp(OpcodeArgs) {
_ExitFunction(JMPPCOffset); // If we get here then leave the function now
}

OrderedNode *OpDispatchBuilder::SelectBit(OrderedNode *Cmp, bool TrueIsNonzero, IR::OpSize ResultSize, OrderedNode *TrueValue, OrderedNode *FalseValue) {
OrderedNode *OpDispatchBuilder::SelectBit(OrderedNode *Cmp, IR::OpSize ResultSize, OrderedNode *TrueValue, OrderedNode *FalseValue) {
uint64_t TrueConst, FalseConst;
if (IsValueConstant(WrapNode(TrueValue), &TrueConst) &&
IsValueConstant(WrapNode(FalseValue), &FalseConst) &&
TrueConst == 1 &&
FalseConst == 0) {

if (!TrueIsNonzero)
Cmp = _Not(OpSize::i32Bit, Cmp);

return _And(ResultSize, Cmp, _Constant(1));
}

SaveNZCV();
_TestNZ(OpSize::i32Bit, Cmp, _Constant(1));
return _NZCVSelect(ResultSize,
TrueIsNonzero ? CondClassType{COND_NEQ} : CondClassType{COND_EQ},
TrueValue, FalseValue);
return _NZCVSelect(ResultSize, CondClassType{COND_NEQ},
TrueValue, FalseValue);
}

std::pair<bool, CondClassType> OpDispatchBuilder::DecodeNZCVCondition(uint8_t OP) const {
Expand Down Expand Up @@ -857,10 +853,10 @@ OrderedNode *OpDispatchBuilder::SelectCC(uint8_t OP, IR::OpSize ResultSize, Orde
}
case 0xA: { // JP - Jump if PF == 1
// Raw value contains inverted PF in bottom bit
return SelectBit(LoadPFRaw(), false, ResultSize, TrueValue, FalseValue);
return SelectBit(LoadPFRaw(true), ResultSize, TrueValue, FalseValue);
}
case 0xB: { // JNP - Jump if PF == 0
return SelectBit(LoadPFRaw(), true, ResultSize, TrueValue, FalseValue);
return SelectBit(LoadPFRaw(false), ResultSize, TrueValue, FalseValue);
}
default:
LOGMAN_MSG_A_FMT("Unknown CC Op: 0x{:x}\n", OP);
Expand Down
4 changes: 2 additions & 2 deletions FEXCore/Source/Interface/Core/OpcodeDispatcher.h
Original file line number Diff line number Diff line change
Expand Up @@ -1627,7 +1627,7 @@ friend class FEXCore::IR::PassManager;
}

std::pair<bool, CondClassType> DecodeNZCVCondition(uint8_t OP) const;
OrderedNode *SelectBit(OrderedNode *Cmp, bool Invert, IR::OpSize ResultSize, OrderedNode *TrueValue, OrderedNode *FalseValue);
OrderedNode *SelectBit(OrderedNode *Cmp, IR::OpSize ResultSize, OrderedNode *TrueValue, OrderedNode *FalseValue);
OrderedNode *SelectCC(uint8_t OP, IR::OpSize ResultSize, OrderedNode *TrueValue, OrderedNode *FalseValue);

/**
Expand Down Expand Up @@ -1761,7 +1761,7 @@ friend class FEXCore::IR::PassManager;
/**
* @name These functions are used by the deferred flag handling while it is calculating and storing flags into RFLAGs.
* @{ */
OrderedNode *LoadPFRaw();
OrderedNode *LoadPFRaw(bool Invert);
OrderedNode *LoadAF();
void FixupAF();
void SetAFAndFixup(OrderedNode *AF);
Expand Down
19 changes: 11 additions & 8 deletions FEXCore/Source/Interface/Core/OpcodeDispatcher/Flags.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -179,7 +179,7 @@ OrderedNode *OpDispatchBuilder::GetPackedRFLAG(uint32_t FlagsMask) {
// instead.
if (FlagsMask & (1 << FEXCore::X86State::RFLAG_PF_RAW_LOC)) {
// Set every bit except the bottommost.
auto OnesInvPF = _Or(OpSize::i64Bit, LoadPFRaw(), _Constant(~1ull));
auto OnesInvPF = _Or(OpSize::i64Bit, LoadPFRaw(false), _Constant(~1ull));

// Rotate the bottom bit to the appropriate location for PF, so we get
// something like 111P1111. Then invert that to get 000p0000. Then OR that
Expand Down Expand Up @@ -237,18 +237,21 @@ void OpDispatchBuilder::CalculateOF(uint8_t SrcSize, OrderedNode *Res, OrderedNo
SetRFLAG<FEXCore::X86State::RFLAG_OF_RAW_LOC>(Anded, SrcSize * 8 - 1, true);
}

OrderedNode *OpDispatchBuilder::LoadPFRaw() {
OrderedNode *OpDispatchBuilder::LoadPFRaw(bool Invert) {
// Read the stored byte. This is the original result (up to 64 bits); its
// parity still needs to be calculated.
auto Result = GetRFLAG(FEXCore::X86State::RFLAG_PF_RAW_LOC);

// Cast the input to a 32-bit FPR. Logically we only need 8-bit, but that would
// generate an unwanted ubfx instruction. VPopcount will ignore the upper bits anyway.
auto InputFPR = _VCastFromGPR(4, 4, Result);
// Cascade XORs to fold the parity of the bottom 8 bits down into the bottom bit.
Result = _XorShift(OpSize::i32Bit, Result, Result, ShiftType::LSR, 4);
Result = _XorShift(OpSize::i32Bit, Result, Result, ShiftType::LSR, 2);

// Calculate the popcount.
auto Count = _VPopcount(1, 1, InputFPR);
return _VExtractToGPR(8, 1, Count, 0);
if (Invert)
Result = _XornShift(OpSize::i32Bit, Result, Result, ShiftType::LSR, 1);
else
Result = _XorShift(OpSize::i32Bit, Result, Result, ShiftType::LSR, 1);

return Result;
}

OrderedNode *OpDispatchBuilder::LoadAF() {
Expand Down
7 changes: 7 additions & 0 deletions FEXCore/Source/Interface/IR/IR.json
Original file line number Diff line number Diff line change
Expand Up @@ -1153,6 +1153,13 @@
"Size == FEXCore::IR::OpSize::i32Bit || Size == FEXCore::IR::OpSize::i64Bit"
]
},
"GPR = XornShift OpSize:#Size, GPR:$Src1, GPR:$Src2, ShiftType:$Shift{ShiftType::LSL}, u8:$ShiftAmount{0}": {
"Desc": [ "Integer binary exclusive or not with shifted register"],
"DestSize": "Size",
"EmitValidation": [
"Size == FEXCore::IR::OpSize::i32Bit || Size == FEXCore::IR::OpSize::i64Bit"
]
},
"GPR = And OpSize:#Size, GPR:$Src1, GPR:$Src2": {
"Desc": ["Integer binary and"
],
Expand Down
9 changes: 4 additions & 5 deletions unittests/InstructionCountCI/FlagM/FlagOpts.json
Original file line number Diff line number Diff line change
Expand Up @@ -267,18 +267,17 @@
]
},
"AND use only PF": {
"ExpectedInstructionCount": 9,
"ExpectedInstructionCount": 8,
"x86Insts": [
"and eax, ebx",
"setp cl",
"test cl, cl"
],
"ExpectedArm64ASM": [
"and w4, w4, w7",
"fmov s2, w4",
"cnt v2.16b, v2.16b",
"umov w20, v2.b[0]",
"mvn w20, w20",
"eor w20, w4, w4, lsr #4",
"eor w20, w20, w20, lsr #2",
"eon w20, w20, w20, lsr #1",
"and x20, x20, #0x1",
"bfxil x5, x20, #0, #8",
"mov x26, x5",
Expand Down
18 changes: 9 additions & 9 deletions unittests/InstructionCountCI/FlagM/Primary.json
Original file line number Diff line number Diff line change
Expand Up @@ -1746,9 +1746,9 @@
"orr x20, x20, x21, lsl #20",
"ldrb w21, [x28, #725]",
"orr x20, x20, x21, lsl #21",
"fmov s2, w26",
"cnt v2.16b, v2.16b",
"umov w21, v2.b[0]",
"eor w21, w26, w26, lsr #4",
"eor w21, w21, w21, lsr #2",
"eor w21, w21, w21, lsr #1",
"orr x21, x21, #0xfffffffffffffffe",
"orn x20, x20, x21, ror #62",
"mrs x21, nzcv",
Expand Down Expand Up @@ -1791,9 +1791,9 @@
"orr x20, x20, x21, lsl #20",
"ldrb w21, [x28, #725]",
"orr x20, x20, x21, lsl #21",
"fmov s2, w26",
"cnt v2.16b, v2.16b",
"umov w21, v2.b[0]",
"eor w21, w26, w26, lsr #4",
"eor w21, w21, w21, lsr #2",
"eor w21, w21, w21, lsr #1",
"orr x21, x21, #0xfffffffffffffffe",
"orn x20, x20, x21, ror #62",
"mrs x21, nzcv",
Expand Down Expand Up @@ -1866,9 +1866,9 @@
"eor w21, w27, w26",
"ubfx w21, w21, #4, #1",
"orr x20, x20, x21, lsl #4",
"fmov s2, w26",
"cnt v2.16b, v2.16b",
"umov w21, v2.b[0]",
"eor w21, w26, w26, lsr #4",
"eor w21, w21, w21, lsr #2",
"eor w21, w21, w21, lsr #1",
"orr x21, x21, #0xfffffffffffffffe",
"orn x20, x20, x21, ror #62",
"mrs x21, nzcv",
Expand Down
57 changes: 28 additions & 29 deletions unittests/InstructionCountCI/FlagM/Secondary.json
Original file line number Diff line number Diff line change
Expand Up @@ -265,12 +265,12 @@
"ExpectedInstructionCount": 8,
"Comment": "0x0f 0x4a",
"ExpectedArm64ASM": [
"fmov s2, w26",
"cnt v2.16b, v2.16b",
"umov w20, v2.b[0]",
"eor w20, w26, w26, lsr #4",
"eor w20, w20, w20, lsr #2",
"eon w20, w20, w20, lsr #1",
"mrs x21, nzcv",
"tst w20, #0x1",
"csel w20, w7, w4, eq",
"csel w20, w7, w4, ne",
"bfxil x4, x20, #0, #16",
"msr nzcv, x21"
]
Expand All @@ -279,35 +279,35 @@
"ExpectedInstructionCount": 7,
"Comment": "0x0f 0x4a",
"ExpectedArm64ASM": [
"fmov s2, w26",
"cnt v2.16b, v2.16b",
"umov w20, v2.b[0]",
"eor w20, w26, w26, lsr #4",
"eor w20, w20, w20, lsr #2",
"eon w20, w20, w20, lsr #1",
"mrs x21, nzcv",
"tst w20, #0x1",
"csel w4, w7, w4, eq",
"csel w4, w7, w4, ne",
"msr nzcv, x21"
]
},
"cmovpe rax, rbx": {
"ExpectedInstructionCount": 7,
"Comment": "0x0f 0x4a",
"ExpectedArm64ASM": [
"fmov s2, w26",
"cnt v2.16b, v2.16b",
"umov w20, v2.b[0]",
"eor w20, w26, w26, lsr #4",
"eor w20, w20, w20, lsr #2",
"eon w20, w20, w20, lsr #1",
"mrs x21, nzcv",
"tst w20, #0x1",
"csel x4, x7, x4, eq",
"csel x4, x7, x4, ne",
"msr nzcv, x21"
]
},
"cmovnp ax, bx": {
"ExpectedInstructionCount": 8,
"Comment": "0x0f 0x4b",
"ExpectedArm64ASM": [
"fmov s2, w26",
"cnt v2.16b, v2.16b",
"umov w20, v2.b[0]",
"eor w20, w26, w26, lsr #4",
"eor w20, w20, w20, lsr #2",
"eor w20, w20, w20, lsr #1",
"mrs x21, nzcv",
"tst w20, #0x1",
"csel w20, w7, w4, ne",
Expand All @@ -319,9 +319,9 @@
"ExpectedInstructionCount": 7,
"Comment": "0x0f 0x4b",
"ExpectedArm64ASM": [
"fmov s2, w26",
"cnt v2.16b, v2.16b",
"umov w20, v2.b[0]",
"eor w20, w26, w26, lsr #4",
"eor w20, w20, w20, lsr #2",
"eor w20, w20, w20, lsr #1",
"mrs x21, nzcv",
"tst w20, #0x1",
"csel w4, w7, w4, ne",
Expand All @@ -332,9 +332,9 @@
"ExpectedInstructionCount": 7,
"Comment": "0x0f 0x4b",
"ExpectedArm64ASM": [
"fmov s2, w26",
"cnt v2.16b, v2.16b",
"umov w20, v2.b[0]",
"eor w20, w26, w26, lsr #4",
"eor w20, w20, w20, lsr #2",
"eor w20, w20, w20, lsr #1",
"mrs x21, nzcv",
"tst w20, #0x1",
"csel x4, x7, x4, ne",
Expand Down Expand Up @@ -513,13 +513,12 @@
]
},
"setpe al": {
"ExpectedInstructionCount": 6,
"ExpectedInstructionCount": 5,
"Comment": "0x0f 0x9a",
"ExpectedArm64ASM": [
"fmov s2, w26",
"cnt v2.16b, v2.16b",
"umov w20, v2.b[0]",
"mvn w20, w20",
"eor w20, w26, w26, lsr #4",
"eor w20, w20, w20, lsr #2",
"eon w20, w20, w20, lsr #1",
"and x20, x20, #0x1",
"bfxil x4, x20, #0, #8"
]
Expand All @@ -528,9 +527,9 @@
"ExpectedInstructionCount": 5,
"Comment": "0x0f 0x9b",
"ExpectedArm64ASM": [
"fmov s2, w26",
"cnt v2.16b, v2.16b",
"umov w20, v2.b[0]",
"eor w20, w26, w26, lsr #4",
"eor w20, w20, w20, lsr #2",
"eor w20, w20, w20, lsr #1",
"and x20, x20, #0x1",
"bfxil x4, x20, #0, #8"
]
Expand Down
Loading
Loading