FEX-Emu · alyssarosenzweig · Apr 2, 2024 · Apr 1, 2024 · Apr 1, 2024 · Apr 1, 2024
diff --git a/FEXCore/Source/Interface/Core/JIT/Arm64/ALUOps.cpp b/FEXCore/Source/Interface/Core/JIT/Arm64/ALUOps.cpp
@@ -769,6 +769,16 @@ DEF_OP(XorShift) {
   eor(EmitSize, GetReg(Node), GetReg(Op->Src1.ID()), GetReg(Op->Src2.ID()), ConvertIRShiftType(Op->Shift), Op->ShiftAmount);
 }
 
+DEF_OP(XornShift) { // Lowers the XornShift IR op to one `eon` (exclusive-OR-NOT, shifted register) instruction.
+  auto Op = IROp->C<IR::IROp_XornShift>();
+  const uint8_t OpSize = IROp->Size;
+  // Only 4- and 8-byte operations are accepted; the check below reports any other size as unsupported.
+  LOGMAN_THROW_AA_FMT(OpSize == 4 || OpSize == 8, "Unsupported {} size: {}", __func__, OpSize);
+  const auto EmitSize = OpSize == 8 ? ARMEmitter::Size::i64Bit : ARMEmitter::Size::i32Bit;
+  // Destination is Node's register; presumably computes Src1 ^ ~(Src2 shifted), per A64 EON operand order -- TODO confirm against the emitter.
+  eon(EmitSize, GetReg(Node), GetReg(Op->Src1.ID()), GetReg(Op->Src2.ID()), ConvertIRShiftType(Op->Shift), Op->ShiftAmount);
+}
+
 DEF_OP(Lshl) {
   auto Op = IROp->C<IR::IROp_Lshl>();
   const uint8_t OpSize = IROp->Size;

diff --git a/FEXCore/Source/Interface/Core/OpcodeDispatcher.cpp b/FEXCore/Source/Interface/Core/OpcodeDispatcher.cpp
@@ -775,24 +775,20 @@ void OpDispatchBuilder::CALLAbsoluteOp(OpcodeArgs) {
   _ExitFunction(JMPPCOffset); // If we get here then leave the function now
 }
 
-OrderedNode *OpDispatchBuilder::SelectBit(OrderedNode *Cmp, bool TrueIsNonzero, IR::OpSize ResultSize, OrderedNode *TrueValue, OrderedNode *FalseValue) {
+OrderedNode *OpDispatchBuilder::SelectBit(OrderedNode *Cmp, IR::OpSize ResultSize, OrderedNode *TrueValue, OrderedNode *FalseValue) { // Yields TrueValue when bit 0 of Cmp is set, otherwise FalseValue.
   uint64_t TrueConst, FalseConst;
   if (IsValueConstant(WrapNode(TrueValue), &TrueConst) &&
       IsValueConstant(WrapNode(FalseValue), &FalseConst) &&
       TrueConst == 1 &&
       FalseConst == 0) {
 
-      if (!TrueIsNonzero)
-        Cmp = _Not(OpSize::i32Bit, Cmp);
-
       return _And(ResultSize, Cmp, _Constant(1));
   }
 
   SaveNZCV();
   _TestNZ(OpSize::i32Bit, Cmp, _Constant(1));
-  return _NZCVSelect(ResultSize,
-                 TrueIsNonzero ? CondClassType{COND_NEQ} : CondClassType{COND_EQ},
-                 TrueValue, FalseValue);
+  return _NZCVSelect(ResultSize, CondClassType{COND_NEQ}, // NEQ after the bit-0 test above: TrueValue is chosen when bit 0 of Cmp is set.
+                     TrueValue, FalseValue);
 }
 
 std::pair<bool, CondClassType> OpDispatchBuilder::DecodeNZCVCondition(uint8_t OP) const {
@@ -857,10 +853,10 @@ OrderedNode *OpDispatchBuilder::SelectCC(uint8_t OP, IR::OpSize ResultSize, Orde
     }
     case 0xA: { // JP - Jump if PF == 1
       // Raw value contains inverted PF in bottom bit
-      return SelectBit(LoadPFRaw(), false, ResultSize, TrueValue, FalseValue);
+      return SelectBit(LoadPFRaw(true), ResultSize, TrueValue, FalseValue);
     }
     case 0xB: { // JNP - Jump if PF == 0
-      return SelectBit(LoadPFRaw(), true, ResultSize, TrueValue, FalseValue);
+      return SelectBit(LoadPFRaw(false), ResultSize, TrueValue, FalseValue);
     }
     default:
       LOGMAN_MSG_A_FMT("Unknown CC Op: 0x{:x}\n", OP);

diff --git a/FEXCore/Source/Interface/Core/OpcodeDispatcher.h b/FEXCore/Source/Interface/Core/OpcodeDispatcher.h
@@ -1627,7 +1627,7 @@ friend class FEXCore::IR::PassManager;
   }
 
   std::pair<bool, CondClassType> DecodeNZCVCondition(uint8_t OP) const;
-  OrderedNode *SelectBit(OrderedNode *Cmp, bool Invert, IR::OpSize ResultSize, OrderedNode *TrueValue, OrderedNode *FalseValue);
+  OrderedNode *SelectBit(OrderedNode *Cmp, IR::OpSize ResultSize, OrderedNode *TrueValue, OrderedNode *FalseValue);
   OrderedNode *SelectCC(uint8_t OP, IR::OpSize ResultSize, OrderedNode *TrueValue, OrderedNode *FalseValue);
 
   /**
@@ -1761,7 +1761,7 @@ friend class FEXCore::IR::PassManager;
   /**
    * @name These functions are used by the deferred flag handling while it is calculating and storing flags in to RFLAGs.
    * @{ */
-  OrderedNode *LoadPFRaw();
+  OrderedNode *LoadPFRaw(bool Invert);
   OrderedNode *LoadAF();
   void FixupAF();
   void SetAFAndFixup(OrderedNode *AF);

diff --git a/FEXCore/Source/Interface/Core/OpcodeDispatcher/Flags.cpp b/FEXCore/Source/Interface/Core/OpcodeDispatcher/Flags.cpp
@@ -179,7 +179,7 @@ OrderedNode *OpDispatchBuilder::GetPackedRFLAG(uint32_t FlagsMask) {
   // instead.
   if (FlagsMask & (1 << FEXCore::X86State::RFLAG_PF_RAW_LOC)) {
     // Set every bit except the bottommost.
-    auto OnesInvPF = _Or(OpSize::i64Bit, LoadPFRaw(), _Constant(~1ull));
+    auto OnesInvPF = _Or(OpSize::i64Bit, LoadPFRaw(false), _Constant(~1ull));
 
     // Rotate the bottom bit to the appropriate location for PF, so we get
     // something like 111P1111. Then invert that to get 000p0000. Then OR that
@@ -237,18 +237,21 @@ void OpDispatchBuilder::CalculateOF(uint8_t SrcSize, OrderedNode *Res, OrderedNo
   SetRFLAG<FEXCore::X86State::RFLAG_OF_RAW_LOC>(Anded, SrcSize * 8 - 1, true);
 }
 
-OrderedNode *OpDispatchBuilder::LoadPFRaw() {
+OrderedNode *OpDispatchBuilder::LoadPFRaw(bool Invert) { // Returns a value whose bit 0 is the XOR of the low 8 bits of the stored PF source; Invert complements that bit.
   // Read the stored byte. This is the original result (up to 64-bits), it needs
   // parity calculated.
   auto Result = GetRFLAG(FEXCore::X86State::RFLAG_PF_RAW_LOC);
 
-  // Cast the input to a 32-bit FPR. Logically we only need 8-bit, but that would
-  // generate unwanted an ubfx instruction. VPopcount will ignore the upper bits anyway.
-  auto InputFPR = _VCastFromGPR(4, 4, Result);
+  // Fold the low 8 bits onto bit 0 by XORing with right-shifted copies (by 4, 2, then 1).
+  Result = _XorShift(OpSize::i32Bit, Result, Result, ShiftType::LSR, 4); // bit i = b[i] ^ b[i+4] for i in 0..3
+  Result = _XorShift(OpSize::i32Bit, Result, Result, ShiftType::LSR, 2); // bits 0 and 1 each hold the XOR of four input bits
 
-  // Calculate the popcount.
-  auto Count = _VPopcount(1, 1, InputFPR);
-  return _VExtractToGPR(8, 1, Count, 0);
+  if (Invert) // Final fold step; the Xorn form makes bit 0 come out complemented at no extra instruction.
+    Result = _XornShift(OpSize::i32Bit, Result, Result, ShiftType::LSR, 1);
+  else
+    Result = _XorShift(OpSize::i32Bit, Result, Result, ShiftType::LSR, 1);
+  // Bits above bit 0 are not cleared here; callers are expected to mask or test only bit 0.
+  return Result;
 }
 
 OrderedNode *OpDispatchBuilder::LoadAF() {

diff --git a/FEXCore/Source/Interface/IR/IR.json b/FEXCore/Source/Interface/IR/IR.json
@@ -1153,6 +1153,13 @@
           "Size == FEXCore::IR::OpSize::i32Bit || Size == FEXCore::IR::OpSize::i64Bit"
         ]
       },
+      "GPR = XornShift OpSize:#Size, GPR:$Src1, GPR:$Src2, ShiftType:$Shift{ShiftType::LSL}, u8:$ShiftAmount{0}": {
+        "Desc": [ "Integer binary exclusive or not with shifted register"],
+        "DestSize": "Size",
+        "EmitValidation": [
+          "Size == FEXCore::IR::OpSize::i32Bit || Size == FEXCore::IR::OpSize::i64Bit"
+        ]
+      },
       "GPR = And OpSize:#Size, GPR:$Src1, GPR:$Src2": {
         "Desc": ["Integer binary and"
                 ],

diff --git a/unittests/InstructionCountCI/FlagM/FlagOpts.json b/unittests/InstructionCountCI/FlagM/FlagOpts.json
@@ -267,18 +267,17 @@
       ]
     },
     "AND use only PF": {
-      "ExpectedInstructionCount": 9,
+      "ExpectedInstructionCount": 8,
       "x86Insts": [
         "and eax, ebx",
         "setp cl",
         "test cl, cl"
       ],
       "ExpectedArm64ASM": [
         "and w4, w4, w7",
-        "fmov s2, w4",
-        "cnt v2.16b, v2.16b",
-        "umov w20, v2.b[0]",
-        "mvn w20, w20",
+        "eor w20, w4, w4, lsr #4",
+        "eor w20, w20, w20, lsr #2",
+        "eon w20, w20, w20, lsr #1",
         "and x20, x20, #0x1",
         "bfxil x5, x20, #0, #8",
         "mov x26, x5",

diff --git a/unittests/InstructionCountCI/FlagM/Primary.json b/unittests/InstructionCountCI/FlagM/Primary.json
@@ -1746,9 +1746,9 @@
         "orr x20, x20, x21, lsl #20",
         "ldrb w21, [x28, #725]",
         "orr x20, x20, x21, lsl #21",
-        "fmov s2, w26",
-        "cnt v2.16b, v2.16b",
-        "umov w21, v2.b[0]",
+        "eor w21, w26, w26, lsr #4",
+        "eor w21, w21, w21, lsr #2",
+        "eor w21, w21, w21, lsr #1",
         "orr x21, x21, #0xfffffffffffffffe",
         "orn x20, x20, x21, ror #62",
         "mrs x21, nzcv",
@@ -1791,9 +1791,9 @@
         "orr x20, x20, x21, lsl #20",
         "ldrb w21, [x28, #725]",
         "orr x20, x20, x21, lsl #21",
-        "fmov s2, w26",
-        "cnt v2.16b, v2.16b",
-        "umov w21, v2.b[0]",
+        "eor w21, w26, w26, lsr #4",
+        "eor w21, w21, w21, lsr #2",
+        "eor w21, w21, w21, lsr #1",
         "orr x21, x21, #0xfffffffffffffffe",
         "orn x20, x20, x21, ror #62",
         "mrs x21, nzcv",
@@ -1866,9 +1866,9 @@
         "eor w21, w27, w26",
         "ubfx w21, w21, #4, #1",
         "orr x20, x20, x21, lsl #4",
-        "fmov s2, w26",
-        "cnt v2.16b, v2.16b",
-        "umov w21, v2.b[0]",
+        "eor w21, w26, w26, lsr #4",
+        "eor w21, w21, w21, lsr #2",
+        "eor w21, w21, w21, lsr #1",
         "orr x21, x21, #0xfffffffffffffffe",
         "orn x20, x20, x21, ror #62",
         "mrs x21, nzcv",

diff --git a/unittests/InstructionCountCI/FlagM/Secondary.json b/unittests/InstructionCountCI/FlagM/Secondary.json
@@ -265,12 +265,12 @@
       "ExpectedInstructionCount": 8,
       "Comment": "0x0f 0x4a",
       "ExpectedArm64ASM": [
-        "fmov s2, w26",
-        "cnt v2.16b, v2.16b",
-        "umov w20, v2.b[0]",
+        "eor w20, w26, w26, lsr #4",
+        "eor w20, w20, w20, lsr #2",
+        "eon w20, w20, w20, lsr #1",
         "mrs x21, nzcv",
         "tst w20, #0x1",
-        "csel w20, w7, w4, eq",
+        "csel w20, w7, w4, ne",
         "bfxil x4, x20, #0, #16",
         "msr nzcv, x21"
       ]
@@ -279,35 +279,35 @@
       "ExpectedInstructionCount": 7,
       "Comment": "0x0f 0x4a",
       "ExpectedArm64ASM": [
-        "fmov s2, w26",
-        "cnt v2.16b, v2.16b",
-        "umov w20, v2.b[0]",
+        "eor w20, w26, w26, lsr #4",
+        "eor w20, w20, w20, lsr #2",
+        "eon w20, w20, w20, lsr #1",
         "mrs x21, nzcv",
         "tst w20, #0x1",
-        "csel w4, w7, w4, eq",
+        "csel w4, w7, w4, ne",
         "msr nzcv, x21"
       ]
     },
     "cmovpe rax, rbx": {
       "ExpectedInstructionCount": 7,
       "Comment": "0x0f 0x4a",
       "ExpectedArm64ASM": [
-        "fmov s2, w26",
-        "cnt v2.16b, v2.16b",
-        "umov w20, v2.b[0]",
+        "eor w20, w26, w26, lsr #4",
+        "eor w20, w20, w20, lsr #2",
+        "eon w20, w20, w20, lsr #1",
         "mrs x21, nzcv",
         "tst w20, #0x1",
-        "csel x4, x7, x4, eq",
+        "csel x4, x7, x4, ne",
         "msr nzcv, x21"
       ]
     },
     "cmovnp ax, bx": {
       "ExpectedInstructionCount": 8,
       "Comment": "0x0f 0x4b",
       "ExpectedArm64ASM": [
-        "fmov s2, w26",
-        "cnt v2.16b, v2.16b",
-        "umov w20, v2.b[0]",
+        "eor w20, w26, w26, lsr #4",
+        "eor w20, w20, w20, lsr #2",
+        "eor w20, w20, w20, lsr #1",
         "mrs x21, nzcv",
         "tst w20, #0x1",
         "csel w20, w7, w4, ne",
@@ -319,9 +319,9 @@
       "ExpectedInstructionCount": 7,
       "Comment": "0x0f 0x4b",
       "ExpectedArm64ASM": [
-        "fmov s2, w26",
-        "cnt v2.16b, v2.16b",
-        "umov w20, v2.b[0]",
+        "eor w20, w26, w26, lsr #4",
+        "eor w20, w20, w20, lsr #2",
+        "eor w20, w20, w20, lsr #1",
         "mrs x21, nzcv",
         "tst w20, #0x1",
         "csel w4, w7, w4, ne",
@@ -332,9 +332,9 @@
       "ExpectedInstructionCount": 7,
       "Comment": "0x0f 0x4b",
       "ExpectedArm64ASM": [
-        "fmov s2, w26",
-        "cnt v2.16b, v2.16b",
-        "umov w20, v2.b[0]",
+        "eor w20, w26, w26, lsr #4",
+        "eor w20, w20, w20, lsr #2",
+        "eor w20, w20, w20, lsr #1",
         "mrs x21, nzcv",
         "tst w20, #0x1",
         "csel x4, x7, x4, ne",
@@ -513,13 +513,12 @@
       ]
     },
     "setpe al": {
-      "ExpectedInstructionCount": 6,
+      "ExpectedInstructionCount": 5,
       "Comment": "0x0f 0x9a",
       "ExpectedArm64ASM": [
-        "fmov s2, w26",
-        "cnt v2.16b, v2.16b",
-        "umov w20, v2.b[0]",
-        "mvn w20, w20",
+        "eor w20, w26, w26, lsr #4",
+        "eor w20, w20, w20, lsr #2",
+        "eon w20, w20, w20, lsr #1",
         "and x20, x20, #0x1",
         "bfxil x4, x20, #0, #8"
       ]
@@ -528,9 +527,9 @@
       "ExpectedInstructionCount": 5,
       "Comment": "0x0f 0x9b",
       "ExpectedArm64ASM": [
-        "fmov s2, w26",
-        "cnt v2.16b, v2.16b",
-        "umov w20, v2.b[0]",
+        "eor w20, w26, w26, lsr #4",
+        "eor w20, w20, w20, lsr #2",
+        "eor w20, w20, w20, lsr #1",
         "and x20, x20, #0x1",
         "bfxil x4, x20, #0, #8"
       ]