From a70ea30c0239c168e460dbf96a192dd0e2cdc529 Mon Sep 17 00:00:00 2001 From: Alyssa Rosenzweig Date: Sun, 31 Mar 2024 17:50:57 -0400 Subject: [PATCH 1/8] IR: add CondSubNZCV (ccmp) instruction Signed-off-by: Alyssa Rosenzweig --- .../Source/Interface/Core/JIT/Arm64/ALUOps.cpp | 18 ++++++++++++++++++ FEXCore/Source/Interface/IR/IR.json | 8 ++++++++ .../Source/Interface/IR/Passes/ConstProp.cpp | 1 + .../RedundantFlagCalculationElimination.cpp | 1 + 4 files changed, 28 insertions(+) diff --git a/FEXCore/Source/Interface/Core/JIT/Arm64/ALUOps.cpp b/FEXCore/Source/Interface/Core/JIT/Arm64/ALUOps.cpp index 129d05a4f4..8466533166 100644 --- a/FEXCore/Source/Interface/Core/JIT/Arm64/ALUOps.cpp +++ b/FEXCore/Source/Interface/Core/JIT/Arm64/ALUOps.cpp @@ -402,6 +402,24 @@ DEF_OP(CondAddNZCV) { } } +DEF_OP(CondSubNZCV) { + auto Op = IROp->C(); + const auto OpSize = IROp->Size; + + LOGMAN_THROW_AA_FMT(OpSize == IR::i32Bit || OpSize == IR::i64Bit, "Unsupported {} size: {}", __func__, OpSize); + const auto EmitSize = OpSize == IR::i64Bit ? 
ARMEmitter::Size::i64Bit : ARMEmitter::Size::i32Bit; + + ARMEmitter::StatusFlags Flags = (ARMEmitter::StatusFlags)Op->FalseNZCV; + uint64_t Const = 0; + auto Src1 = GetZeroableReg(Op->Src1); + + if (IsInlineConstant(Op->Src2, &Const)) { + ccmp(EmitSize, Src1, Const, Flags, MapSelectCC(Op->Cond)); + } else { + ccmp(EmitSize, Src1, GetReg(Op->Src2.ID()), Flags, MapSelectCC(Op->Cond)); + } +} + DEF_OP(Neg) { auto Op = IROp->C(); const uint8_t OpSize = IROp->Size; diff --git a/FEXCore/Source/Interface/IR/IR.json b/FEXCore/Source/Interface/IR/IR.json index abf9b8586d..0f40cdb12f 100644 --- a/FEXCore/Source/Interface/IR/IR.json +++ b/FEXCore/Source/Interface/IR/IR.json @@ -1035,6 +1035,14 @@ "Size == FEXCore::IR::OpSize::i32Bit || Size == FEXCore::IR::OpSize::i64Bit" ] }, + "CondSubNZCV OpSize:#Size, GPR:$Src1, GPR:$Src2, CondClass:$Cond, u8:$FalseNZCV": { + "Desc": ["If condition is true, set NZCV per difference of GPRs, else force NZCV to a constant."], + "HasSideEffects": true, + "DestSize": "Size", + "EmitValidation": [ + "Size == FEXCore::IR::OpSize::i32Bit || Size == FEXCore::IR::OpSize::i64Bit" + ] + }, "GPR = AdcWithFlags OpSize:#Size, GPR:$Src1, GPR:$Src2": { "Desc": ["Adds and set NZCV for the sum of two GPRs and carry-in given as NZCV"], "HasSideEffects": true, diff --git a/FEXCore/Source/Interface/IR/Passes/ConstProp.cpp b/FEXCore/Source/Interface/IR/Passes/ConstProp.cpp index 192ae6520c..60464625da 100644 --- a/FEXCore/Source/Interface/IR/Passes/ConstProp.cpp +++ b/FEXCore/Source/Interface/IR/Passes/ConstProp.cpp @@ -1139,6 +1139,7 @@ bool ConstProp::ConstantInlining(IREmitter *IREmit, const IRListView& CurrentIR) break; } case OP_CONDADDNZCV: + case OP_CONDSUBNZCV: { auto Op = IROp->C(); diff --git a/FEXCore/Source/Interface/IR/Passes/RedundantFlagCalculationElimination.cpp b/FEXCore/Source/Interface/IR/Passes/RedundantFlagCalculationElimination.cpp index 2df6270d8d..ac57749dfa 100644 --- 
a/FEXCore/Source/Interface/IR/Passes/RedundantFlagCalculationElimination.cpp +++ b/FEXCore/Source/Interface/IR/Passes/RedundantFlagCalculationElimination.cpp @@ -228,6 +228,7 @@ DeadFlagCalculationEliminination::Classify(IROp_Header *IROp) return {.Read = FlagsForCondClassType(Op->Cond)}; } + case OP_CONDSUBNZCV: case OP_CONDADDNZCV: { auto Op = IROp->CW(); return { From 1a1545da0f19eb24b2a4b2af2ab7daab549add0f Mon Sep 17 00:00:00 2001 From: Alyssa Rosenzweig Date: Sun, 31 Mar 2024 17:51:13 -0400 Subject: [PATCH 2/8] OpcodeDispatcher: rework rep cmp 1. pull flag calculation out of the loop body for perf 2. fully rotate the inner loop to save an instruction per iteration 3. hoist the rcx=0 jump to avoid computing df when rcx=0 Signed-off-by: Alyssa Rosenzweig --- .../Interface/Core/OpcodeDispatcher.cpp | 69 +++++++++++-------- 1 file changed, 41 insertions(+), 28 deletions(-) diff --git a/FEXCore/Source/Interface/Core/OpcodeDispatcher.cpp b/FEXCore/Source/Interface/Core/OpcodeDispatcher.cpp index 2214f4c923..f3ad7d1dbc 100644 --- a/FEXCore/Source/Interface/Core/OpcodeDispatcher.cpp +++ b/FEXCore/Source/Interface/Core/OpcodeDispatcher.cpp @@ -3617,26 +3617,24 @@ void OpDispatchBuilder::CMPSOp(OpcodeArgs) { bool REPE = Op->Flags & FEXCore::X86Tables::DecodeFlags::FLAG_REP_PREFIX; - // read DF once - auto PtrDir = LoadDir(Size); - - auto JumpStart = Jump(); - // Make sure to start a new block after ending this one - auto LoopStart = CreateNewCodeBlockAfter(GetCurrentBlock()); - SetJumpTarget(JumpStart, LoopStart); - SetCurrentCodeBlock(LoopStart); - StartNewBlock(); - + // If rcx = 0, skip the whole loop. OrderedNode *Counter = LoadGPRRegister(X86State::REG_RCX); + auto OuterJump = CondJump(Counter, {COND_EQ}); + IRPair InnerJump; - // Can we end the block? 
- auto CondJump_ = CondJump(Counter, {COND_EQ}); - IRPair InternalCondJump; + // read DF once, outside the loop + auto BeforeLoop = CreateNewCodeBlockAfter(GetCurrentBlock()); + SetFalseJumpTarget(OuterJump, BeforeLoop); + SetCurrentCodeBlock(BeforeLoop); + StartNewBlock(); + auto PtrDir = LoadDir(Size); + auto JumpIntoLoop = Jump(); - auto LoopTail = CreateNewCodeBlockAfter(LoopStart); - SetFalseJumpTarget(CondJump_, LoopTail); - SetCurrentCodeBlock(LoopTail); + // Setup for the loop + auto LoopHeader = CreateNewCodeBlockAfter(GetCurrentBlock()); + SetCurrentCodeBlock(LoopHeader); StartNewBlock(); + SetJumpTarget(JumpIntoLoop, LoopHeader); // Working loop { @@ -3651,15 +3649,14 @@ void OpDispatchBuilder::CMPSOp(OpcodeArgs) { auto Src1 = _LoadMemAutoTSO(GPRClass, Size, Dest_RDI, Size); auto Src2 = _LoadMem(GPRClass, Size, Dest_RSI, Size); - GenerateFlags_SUB(Op, Src2, Src1); - - // Calculate flags early. - CalculateDeferredFlags(); + // We'll calculate PF/AF after the loop, so use them as temporaries here. + _StoreRegister(Src1, false, offsetof(FEXCore::Core::CPUState, pf_raw), GPRClass, GPRFixedClass, CTX->GetGPRSize()); + _StoreRegister(Src2, false, offsetof(FEXCore::Core::CPUState, af_raw), GPRClass, GPRFixedClass, CTX->GetGPRSize()); OrderedNode *TailCounter = LoadGPRRegister(X86State::REG_RCX); // Decrement counter - TailCounter = _Sub(OpSize::i64Bit, TailCounter, _Constant(1)); + TailCounter = _SubWithFlags(OpSize::i64Bit, TailCounter, _Constant(1)); // Store the counter since we don't have phis StoreGPRRegister(X86State::REG_RCX, TailCounter); @@ -3672,21 +3669,37 @@ void OpDispatchBuilder::CMPSOp(OpcodeArgs) { Dest_RSI = _Add(OpSize::i64Bit, Dest_RSI, PtrDir); StoreGPRRegister(X86State::REG_RSI, Dest_RSI); - CalculateDeferredFlags(); - InternalCondJump = CondJumpNZCV({REPE ? COND_EQ : COND_NEQ}); + // If TailCounter != 0, compare sources. + // If TailCounter == 0, set ZF iff that would break. + _CondSubNZCV(OpSize::i64Bit, Src2, Src1, {COND_NEQ}, REPE ? 
0 : (1 << 2) /* Z */); + CachedNZCV = nullptr; + NZCVDirty = false; + InnerJump = CondJumpNZCV({REPE ? COND_EQ : COND_NEQ}); // Jump back to the start if we have more work to do - SetTrueJumpTarget(InternalCondJump, LoopStart); + SetTrueJumpTarget(InnerJump, LoopHeader); } // Make sure to start a new block after ending this one - auto LoopEnd = CreateNewCodeBlockAfter(LoopTail); - SetTrueJumpTarget(CondJump_, LoopEnd); - - SetFalseJumpTarget(InternalCondJump, LoopEnd); + auto LoopEnd = CreateNewCodeBlockAfter(GetCurrentBlock()); + SetFalseJumpTarget(InnerJump, LoopEnd); SetCurrentCodeBlock(LoopEnd); StartNewBlock(); + { + // Grab the sources from the last iteration so we can set flags. + auto Src1 = _LoadRegister(false, offsetof(FEXCore::Core::CPUState, pf_raw), GPRClass, GPRFixedClass, CTX->GetGPRSize()); + auto Src2 = _LoadRegister(false, offsetof(FEXCore::Core::CPUState, af_raw), GPRClass, GPRFixedClass, CTX->GetGPRSize()); + GenerateFlags_SUB(Op, Src2, Src1); + CalculateDeferredFlags(); + } + auto Jump_ = Jump(); + + auto Exit = CreateNewCodeBlockAfter(LoopEnd); + SetJumpTarget(Jump_, Exit); + SetTrueJumpTarget(OuterJump, Exit); + SetCurrentCodeBlock(Exit); + StartNewBlock(); } } From 784cdd7b6b055cfa25a93caff5d1eb7a53749e09 Mon Sep 17 00:00:00 2001 From: Alyssa Rosenzweig Date: Sun, 31 Mar 2024 17:51:47 -0400 Subject: [PATCH 3/8] InstCountCI: Update Signed-off-by: Alyssa Rosenzweig --- .../InstructionCountCI/FlagM/Primary.json | 176 ++++++++------- unittests/InstructionCountCI/Primary.json | 208 ++++++++++-------- 2 files changed, 208 insertions(+), 176 deletions(-) diff --git a/unittests/InstructionCountCI/FlagM/Primary.json b/unittests/InstructionCountCI/FlagM/Primary.json index 743376c771..b187d5b094 100644 --- a/unittests/InstructionCountCI/FlagM/Primary.json +++ b/unittests/InstructionCountCI/FlagM/Primary.json @@ -1950,153 +1950,169 @@ ] }, "repz cmpsb": { - "ExpectedInstructionCount": 13, + "ExpectedInstructionCount": 15, "Comment": "0xa6", 
"ExpectedArm64ASM": [ + "cbz x5, #+0x3c", "ldrsb x20, [x28, #714]", - "cbz x5, #+0x30", - "ldrb w21, [x11]", - "ldrb w22, [x10]", - "eor w27, w22, w21", - "lsl w0, w22, #24", - "cmp w0, w21, lsl #24", - "sub w26, w22, w21", - "cfinv", - "sub x5, x5, #0x1 (1)", + "ldrb w26, [x11]", + "ldrb w27, [x10]", + "subs x5, x5, #0x1 (1)", "add x11, x11, x20", "add x10, x10, x20", - "b.eq #-0x2c" + "ccmp x27, x26, #nzcv, ne", + "b.eq #-0x18", + "mov x20, x27", + "eor w27, w20, w26", + "lsl w0, w20, #24", + "cmp w0, w26, lsl #24", + "sub w26, w20, w26", + "cfinv" ] }, "repz cmpsw": { - "ExpectedInstructionCount": 14, + "ExpectedInstructionCount": 16, "Comment": "0xa7", "ExpectedArm64ASM": [ + "cbz x5, #+0x40", "ldrsb x20, [x28, #714]", "lsl x20, x20, #1", - "cbz x5, #+0x30", - "ldrh w21, [x11]", - "ldrh w22, [x10]", - "eor w27, w22, w21", - "lsl w0, w22, #16", - "cmp w0, w21, lsl #16", - "sub w26, w22, w21", - "cfinv", - "sub x5, x5, #0x1 (1)", + "ldrh w26, [x11]", + "ldrh w27, [x10]", + "subs x5, x5, #0x1 (1)", "add x11, x11, x20", "add x10, x10, x20", - "b.eq #-0x2c" + "ccmp x27, x26, #nzcv, ne", + "b.eq #-0x18", + "mov x20, x27", + "eor w27, w20, w26", + "lsl w0, w20, #16", + "cmp w0, w26, lsl #16", + "sub w26, w20, w26", + "cfinv" ] }, "repz cmpsd": { - "ExpectedInstructionCount": 12, + "ExpectedInstructionCount": 14, "Comment": "0xa7", "ExpectedArm64ASM": [ + "cbz x5, #+0x38", "ldrsb x20, [x28, #714]", "lsl x20, x20, #2", - "cbz x5, #+0x28", - "ldr w21, [x11]", - "ldr w22, [x10]", - "eor w27, w22, w21", - "subs w26, w22, w21", - "cfinv", - "sub x5, x5, #0x1 (1)", + "ldr w26, [x11]", + "ldr w27, [x10]", + "subs x5, x5, #0x1 (1)", "add x11, x11, x20", "add x10, x10, x20", - "b.eq #-0x24" + "ccmp x27, x26, #nzcv, ne", + "b.eq #-0x18", + "mov x20, x27", + "eor w27, w20, w26", + "subs w26, w20, w26", + "cfinv" ] }, "repz cmpsq": { - "ExpectedInstructionCount": 12, + "ExpectedInstructionCount": 14, "Comment": "0xa7", "ExpectedArm64ASM": [ + "cbz x5, #+0x38", "ldrsb x20, [x28, 
#714]", "lsl x20, x20, #3", - "cbz x5, #+0x28", - "ldr x21, [x11]", - "ldr x22, [x10]", - "eor w27, w22, w21", - "subs x26, x22, x21", - "cfinv", - "sub x5, x5, #0x1 (1)", + "ldr x26, [x11]", + "ldr x27, [x10]", + "subs x5, x5, #0x1 (1)", "add x11, x11, x20", "add x10, x10, x20", - "b.eq #-0x24" + "ccmp x27, x26, #nzcv, ne", + "b.eq #-0x18", + "mov x20, x27", + "eor w27, w20, w26", + "subs x26, x20, x26", + "cfinv" ] }, "repnz cmpsb": { - "ExpectedInstructionCount": 13, + "ExpectedInstructionCount": 15, "Comment": "0xa6", "ExpectedArm64ASM": [ + "cbz x5, #+0x3c", "ldrsb x20, [x28, #714]", - "cbz x5, #+0x30", - "ldrb w21, [x11]", - "ldrb w22, [x10]", - "eor w27, w22, w21", - "lsl w0, w22, #24", - "cmp w0, w21, lsl #24", - "sub w26, w22, w21", - "cfinv", - "sub x5, x5, #0x1 (1)", + "ldrb w26, [x11]", + "ldrb w27, [x10]", + "subs x5, x5, #0x1 (1)", "add x11, x11, x20", "add x10, x10, x20", - "b.ne #-0x2c" + "ccmp x27, x26, #nZcv, ne", + "b.ne #-0x18", + "mov x20, x27", + "eor w27, w20, w26", + "lsl w0, w20, #24", + "cmp w0, w26, lsl #24", + "sub w26, w20, w26", + "cfinv" ] }, "repnz cmpsw": { - "ExpectedInstructionCount": 14, + "ExpectedInstructionCount": 16, "Comment": "0xa7", "ExpectedArm64ASM": [ + "cbz x5, #+0x40", "ldrsb x20, [x28, #714]", "lsl x20, x20, #1", - "cbz x5, #+0x30", - "ldrh w21, [x11]", - "ldrh w22, [x10]", - "eor w27, w22, w21", - "lsl w0, w22, #16", - "cmp w0, w21, lsl #16", - "sub w26, w22, w21", - "cfinv", - "sub x5, x5, #0x1 (1)", + "ldrh w26, [x11]", + "ldrh w27, [x10]", + "subs x5, x5, #0x1 (1)", "add x11, x11, x20", "add x10, x10, x20", - "b.ne #-0x2c" + "ccmp x27, x26, #nZcv, ne", + "b.ne #-0x18", + "mov x20, x27", + "eor w27, w20, w26", + "lsl w0, w20, #16", + "cmp w0, w26, lsl #16", + "sub w26, w20, w26", + "cfinv" ] }, "repnz cmpsd": { - "ExpectedInstructionCount": 12, + "ExpectedInstructionCount": 14, "Comment": "0xa7", "ExpectedArm64ASM": [ + "cbz x5, #+0x38", "ldrsb x20, [x28, #714]", "lsl x20, x20, #2", - "cbz x5, #+0x28", - "ldr w21, 
[x11]", - "ldr w22, [x10]", - "eor w27, w22, w21", - "subs w26, w22, w21", - "cfinv", - "sub x5, x5, #0x1 (1)", + "ldr w26, [x11]", + "ldr w27, [x10]", + "subs x5, x5, #0x1 (1)", "add x11, x11, x20", "add x10, x10, x20", - "b.ne #-0x24" + "ccmp x27, x26, #nZcv, ne", + "b.ne #-0x18", + "mov x20, x27", + "eor w27, w20, w26", + "subs w26, w20, w26", + "cfinv" ] }, "repnz cmpsq": { - "ExpectedInstructionCount": 12, + "ExpectedInstructionCount": 14, "Comment": "0xa7", "ExpectedArm64ASM": [ + "cbz x5, #+0x38", "ldrsb x20, [x28, #714]", "lsl x20, x20, #3", - "cbz x5, #+0x28", - "ldr x21, [x11]", - "ldr x22, [x10]", - "eor w27, w22, w21", - "subs x26, x22, x21", - "cfinv", - "sub x5, x5, #0x1 (1)", + "ldr x26, [x11]", + "ldr x27, [x10]", + "subs x5, x5, #0x1 (1)", "add x11, x11, x20", "add x10, x10, x20", - "b.ne #-0x24" + "ccmp x27, x26, #nZcv, ne", + "b.ne #-0x18", + "mov x20, x27", + "eor w27, w20, w26", + "subs x26, x20, x26", + "cfinv" ] }, "test al, 1": { diff --git a/unittests/InstructionCountCI/Primary.json b/unittests/InstructionCountCI/Primary.json index 5ee00256f5..26e1fac028 100644 --- a/unittests/InstructionCountCI/Primary.json +++ b/unittests/InstructionCountCI/Primary.json @@ -3295,169 +3295,185 @@ ] }, "repz cmpsb": { - "ExpectedInstructionCount": 15, + "ExpectedInstructionCount": 17, "Comment": "0xa6", "ExpectedArm64ASM": [ + "cbz x5, #+0x44", "ldrsb x20, [x28, #714]", - "cbz x5, #+0x38", - "ldrb w21, [x11]", - "ldrb w22, [x10]", - "eor w27, w22, w21", - "lsl w0, w22, #24", - "cmp w0, w21, lsl #24", - "sub w26, w22, w21", - "mrs x21, nzcv", - "eor w21, w21, #0x20000000", - "msr nzcv, x21", - "sub x5, x5, #0x1 (1)", + "ldrb w26, [x11]", + "ldrb w27, [x10]", + "subs x5, x5, #0x1 (1)", "add x11, x11, x20", "add x10, x10, x20", - "b.eq #-0x34" + "ccmp x27, x26, #nzcv, ne", + "b.eq #-0x18", + "mov x20, x27", + "eor w27, w20, w26", + "lsl w0, w20, #24", + "cmp w0, w26, lsl #24", + "sub w26, w20, w26", + "mrs x20, nzcv", + "eor w20, w20, #0x20000000", + "msr 
nzcv, x20" ] }, "repz cmpsw": { - "ExpectedInstructionCount": 16, + "ExpectedInstructionCount": 18, "Comment": "0xa7", "ExpectedArm64ASM": [ + "cbz x5, #+0x48", "ldrsb x20, [x28, #714]", "lsl x20, x20, #1", - "cbz x5, #+0x38", - "ldrh w21, [x11]", - "ldrh w22, [x10]", - "eor w27, w22, w21", - "lsl w0, w22, #16", - "cmp w0, w21, lsl #16", - "sub w26, w22, w21", - "mrs x21, nzcv", - "eor w21, w21, #0x20000000", - "msr nzcv, x21", - "sub x5, x5, #0x1 (1)", + "ldrh w26, [x11]", + "ldrh w27, [x10]", + "subs x5, x5, #0x1 (1)", "add x11, x11, x20", "add x10, x10, x20", - "b.eq #-0x34" + "ccmp x27, x26, #nzcv, ne", + "b.eq #-0x18", + "mov x20, x27", + "eor w27, w20, w26", + "lsl w0, w20, #16", + "cmp w0, w26, lsl #16", + "sub w26, w20, w26", + "mrs x20, nzcv", + "eor w20, w20, #0x20000000", + "msr nzcv, x20" ] }, "repz cmpsd": { - "ExpectedInstructionCount": 14, + "ExpectedInstructionCount": 16, "Comment": "0xa7", "ExpectedArm64ASM": [ + "cbz x5, #+0x40", "ldrsb x20, [x28, #714]", "lsl x20, x20, #2", - "cbz x5, #+0x30", - "ldr w21, [x11]", - "ldr w22, [x10]", - "eor w27, w22, w21", - "subs w26, w22, w21", - "mrs x21, nzcv", - "eor w21, w21, #0x20000000", - "msr nzcv, x21", - "sub x5, x5, #0x1 (1)", + "ldr w26, [x11]", + "ldr w27, [x10]", + "subs x5, x5, #0x1 (1)", "add x11, x11, x20", "add x10, x10, x20", - "b.eq #-0x2c" + "ccmp x27, x26, #nzcv, ne", + "b.eq #-0x18", + "mov x20, x27", + "eor w27, w20, w26", + "subs w26, w20, w26", + "mrs x20, nzcv", + "eor w20, w20, #0x20000000", + "msr nzcv, x20" ] }, "repz cmpsq": { - "ExpectedInstructionCount": 14, + "ExpectedInstructionCount": 16, "Comment": "0xa7", "ExpectedArm64ASM": [ + "cbz x5, #+0x40", "ldrsb x20, [x28, #714]", "lsl x20, x20, #3", - "cbz x5, #+0x30", - "ldr x21, [x11]", - "ldr x22, [x10]", - "eor w27, w22, w21", - "subs x26, x22, x21", - "mrs x21, nzcv", - "eor w21, w21, #0x20000000", - "msr nzcv, x21", - "sub x5, x5, #0x1 (1)", + "ldr x26, [x11]", + "ldr x27, [x10]", + "subs x5, x5, #0x1 (1)", "add x11, x11, 
x20", "add x10, x10, x20", - "b.eq #-0x2c" + "ccmp x27, x26, #nzcv, ne", + "b.eq #-0x18", + "mov x20, x27", + "eor w27, w20, w26", + "subs x26, x20, x26", + "mrs x20, nzcv", + "eor w20, w20, #0x20000000", + "msr nzcv, x20" ] }, "repnz cmpsb": { - "ExpectedInstructionCount": 15, + "ExpectedInstructionCount": 17, "Comment": "0xa6", "ExpectedArm64ASM": [ + "cbz x5, #+0x44", "ldrsb x20, [x28, #714]", - "cbz x5, #+0x38", - "ldrb w21, [x11]", - "ldrb w22, [x10]", - "eor w27, w22, w21", - "lsl w0, w22, #24", - "cmp w0, w21, lsl #24", - "sub w26, w22, w21", - "mrs x21, nzcv", - "eor w21, w21, #0x20000000", - "msr nzcv, x21", - "sub x5, x5, #0x1 (1)", + "ldrb w26, [x11]", + "ldrb w27, [x10]", + "subs x5, x5, #0x1 (1)", "add x11, x11, x20", "add x10, x10, x20", - "b.ne #-0x34" + "ccmp x27, x26, #nZcv, ne", + "b.ne #-0x18", + "mov x20, x27", + "eor w27, w20, w26", + "lsl w0, w20, #24", + "cmp w0, w26, lsl #24", + "sub w26, w20, w26", + "mrs x20, nzcv", + "eor w20, w20, #0x20000000", + "msr nzcv, x20" ] }, "repnz cmpsw": { - "ExpectedInstructionCount": 16, + "ExpectedInstructionCount": 18, "Comment": "0xa7", "ExpectedArm64ASM": [ + "cbz x5, #+0x48", "ldrsb x20, [x28, #714]", "lsl x20, x20, #1", - "cbz x5, #+0x38", - "ldrh w21, [x11]", - "ldrh w22, [x10]", - "eor w27, w22, w21", - "lsl w0, w22, #16", - "cmp w0, w21, lsl #16", - "sub w26, w22, w21", - "mrs x21, nzcv", - "eor w21, w21, #0x20000000", - "msr nzcv, x21", - "sub x5, x5, #0x1 (1)", + "ldrh w26, [x11]", + "ldrh w27, [x10]", + "subs x5, x5, #0x1 (1)", "add x11, x11, x20", "add x10, x10, x20", - "b.ne #-0x34" + "ccmp x27, x26, #nZcv, ne", + "b.ne #-0x18", + "mov x20, x27", + "eor w27, w20, w26", + "lsl w0, w20, #16", + "cmp w0, w26, lsl #16", + "sub w26, w20, w26", + "mrs x20, nzcv", + "eor w20, w20, #0x20000000", + "msr nzcv, x20" ] }, "repnz cmpsd": { - "ExpectedInstructionCount": 14, + "ExpectedInstructionCount": 16, "Comment": "0xa7", "ExpectedArm64ASM": [ + "cbz x5, #+0x40", "ldrsb x20, [x28, #714]", "lsl x20, x20, 
#2", - "cbz x5, #+0x30", - "ldr w21, [x11]", - "ldr w22, [x10]", - "eor w27, w22, w21", - "subs w26, w22, w21", - "mrs x21, nzcv", - "eor w21, w21, #0x20000000", - "msr nzcv, x21", - "sub x5, x5, #0x1 (1)", + "ldr w26, [x11]", + "ldr w27, [x10]", + "subs x5, x5, #0x1 (1)", "add x11, x11, x20", "add x10, x10, x20", - "b.ne #-0x2c" + "ccmp x27, x26, #nZcv, ne", + "b.ne #-0x18", + "mov x20, x27", + "eor w27, w20, w26", + "subs w26, w20, w26", + "mrs x20, nzcv", + "eor w20, w20, #0x20000000", + "msr nzcv, x20" ] }, "repnz cmpsq": { - "ExpectedInstructionCount": 14, + "ExpectedInstructionCount": 16, "Comment": "0xa7", "ExpectedArm64ASM": [ + "cbz x5, #+0x40", "ldrsb x20, [x28, #714]", "lsl x20, x20, #3", - "cbz x5, #+0x30", - "ldr x21, [x11]", - "ldr x22, [x10]", - "eor w27, w22, w21", - "subs x26, x22, x21", - "mrs x21, nzcv", - "eor w21, w21, #0x20000000", - "msr nzcv, x21", - "sub x5, x5, #0x1 (1)", + "ldr x26, [x11]", + "ldr x27, [x10]", + "subs x5, x5, #0x1 (1)", "add x11, x11, x20", "add x10, x10, x20", - "b.ne #-0x2c" + "ccmp x27, x26, #nZcv, ne", + "b.ne #-0x18", + "mov x20, x27", + "eor w27, w20, w26", + "subs x26, x20, x26", + "mrs x20, nzcv", + "eor w20, w20, #0x20000000", + "msr nzcv, x20" ] }, "test al, 1": { From 3f66173bc77d67b9e8f11cb5e260bc9c0e6637e8 Mon Sep 17 00:00:00 2001 From: Alyssa Rosenzweig Date: Sun, 31 Mar 2024 20:28:56 -0400 Subject: [PATCH 4/8] OpcodeDispatcher: add ForeachDirection helper Signed-off-by: Alyssa Rosenzweig --- .../Source/Interface/Core/OpcodeDispatcher.h | 26 +++++++++++++++++++ 1 file changed, 26 insertions(+) diff --git a/FEXCore/Source/Interface/Core/OpcodeDispatcher.h b/FEXCore/Source/Interface/Core/OpcodeDispatcher.h index 9c58e50725..6313220b3e 100644 --- a/FEXCore/Source/Interface/Core/OpcodeDispatcher.h +++ b/FEXCore/Source/Interface/Core/OpcodeDispatcher.h @@ -228,6 +228,32 @@ friend class FEXCore::IR::PassManager; return CanHaveSideEffects; } + template<typename F> + void ForeachDirection(F&& Routine) {
+ // Zero is the value DF is compared against in the branch below. + auto Zero = _Constant(0); + + // One block per direction, plus a shared exit block that both jump to. + auto ForwardBlock = CreateNewCodeBlockAfter(GetCurrentBlock()); + auto BackwardBlock = CreateNewCodeBlockAfter(ForwardBlock); + auto ExitBlock = CreateNewCodeBlockAfter(BackwardBlock); + + auto DF = GetRFLAG(X86State::RFLAG_DF_RAW_LOC); + CondJump(DF, Zero, ForwardBlock, BackwardBlock, {COND_EQ}); + + for (auto D = 0; D < 2; ++D) { + SetCurrentCodeBlock(D ? BackwardBlock : ForwardBlock); + StartNewBlock(); + { + Routine(D ? -1 : 1); + Jump(ExitBlock); + } + } + + SetCurrentCodeBlock(ExitBlock); + StartNewBlock(); + } + OpDispatchBuilder(FEXCore::Context::ContextImpl *ctx); OpDispatchBuilder(FEXCore::Utils::IntrusivePooledAllocator &Allocator); From 5d79d4eb501f9f1e7e65c39ed9a1bdfeca33e7df Mon Sep 17 00:00:00 2001 From: Alyssa Rosenzweig Date: Sun, 31 Mar 2024 20:29:16 -0400 Subject: [PATCH 5/8] OpcodeDispatcher: use ForeachDirection for CMPS eliminates xblock liveness Signed-off-by: Alyssa Rosenzweig --- .../Interface/Core/OpcodeDispatcher.cpp | 98 ++++++++++--------- 1 file changed, 50 insertions(+), 48 deletions(-) diff --git a/FEXCore/Source/Interface/Core/OpcodeDispatcher.cpp b/FEXCore/Source/Interface/Core/OpcodeDispatcher.cpp index f3ad7d1dbc..0b0d2748ac 100644 --- a/FEXCore/Source/Interface/Core/OpcodeDispatcher.cpp +++ b/FEXCore/Source/Interface/Core/OpcodeDispatcher.cpp @@ -3620,72 +3620,74 @@ void OpDispatchBuilder::CMPSOp(OpcodeArgs) { // If rcx = 0, skip the whole loop. 
OrderedNode *Counter = LoadGPRRegister(X86State::REG_RCX); auto OuterJump = CondJump(Counter, {COND_EQ}); - IRPair InnerJump; - // read DF once, outside the loop auto BeforeLoop = CreateNewCodeBlockAfter(GetCurrentBlock()); SetFalseJumpTarget(OuterJump, BeforeLoop); SetCurrentCodeBlock(BeforeLoop); StartNewBlock(); - auto PtrDir = LoadDir(Size); - auto JumpIntoLoop = Jump(); - // Setup for the loop - auto LoopHeader = CreateNewCodeBlockAfter(GetCurrentBlock()); - SetCurrentCodeBlock(LoopHeader); - StartNewBlock(); - SetJumpTarget(JumpIntoLoop, LoopHeader); + ForeachDirection([this, Op, Size, REPE](int PtrDir) { + IRPair InnerJump; + auto JumpIntoLoop = Jump(); - // Working loop - { - OrderedNode *Dest_RSI = LoadGPRRegister(X86State::REG_RSI); - OrderedNode *Dest_RDI = LoadGPRRegister(X86State::REG_RDI); + // Setup for the loop + auto LoopHeader = CreateNewCodeBlockAfter(GetCurrentBlock()); + SetCurrentCodeBlock(LoopHeader); + StartNewBlock(); + SetJumpTarget(JumpIntoLoop, LoopHeader); - // Only ES prefix - Dest_RDI = AppendSegmentOffset(Dest_RDI, 0, FEXCore::X86Tables::DecodeFlags::FLAG_ES_PREFIX, true); - // Default DS prefix - Dest_RSI = AppendSegmentOffset(Dest_RSI, Op->Flags, FEXCore::X86Tables::DecodeFlags::FLAG_DS_PREFIX); + // Working loop + { + OrderedNode *Dest_RSI = LoadGPRRegister(X86State::REG_RSI); + OrderedNode *Dest_RDI = LoadGPRRegister(X86State::REG_RDI); - auto Src1 = _LoadMemAutoTSO(GPRClass, Size, Dest_RDI, Size); - auto Src2 = _LoadMem(GPRClass, Size, Dest_RSI, Size); + // Only ES prefix + Dest_RDI = AppendSegmentOffset(Dest_RDI, 0, FEXCore::X86Tables::DecodeFlags::FLAG_ES_PREFIX, true); + // Default DS prefix + Dest_RSI = AppendSegmentOffset(Dest_RSI, Op->Flags, FEXCore::X86Tables::DecodeFlags::FLAG_DS_PREFIX); - // We'll calculate PF/AF after the loop, so use them as temporaries here. 
- _StoreRegister(Src1, false, offsetof(FEXCore::Core::CPUState, pf_raw), GPRClass, GPRFixedClass, CTX->GetGPRSize()); - _StoreRegister(Src2, false, offsetof(FEXCore::Core::CPUState, af_raw), GPRClass, GPRFixedClass, CTX->GetGPRSize()); + auto Src1 = _LoadMemAutoTSO(GPRClass, Size, Dest_RDI, Size); + auto Src2 = _LoadMem(GPRClass, Size, Dest_RSI, Size); - OrderedNode *TailCounter = LoadGPRRegister(X86State::REG_RCX); + // We'll calculate PF/AF after the loop, so use them as temporaries here. + _StoreRegister(Src1, false, offsetof(FEXCore::Core::CPUState, pf_raw), GPRClass, GPRFixedClass, CTX->GetGPRSize()); + _StoreRegister(Src2, false, offsetof(FEXCore::Core::CPUState, af_raw), GPRClass, GPRFixedClass, CTX->GetGPRSize()); - // Decrement counter - TailCounter = _SubWithFlags(OpSize::i64Bit, TailCounter, _Constant(1)); + OrderedNode *TailCounter = LoadGPRRegister(X86State::REG_RCX); - // Store the counter since we don't have phis - StoreGPRRegister(X86State::REG_RCX, TailCounter); + // Decrement counter + TailCounter = _SubWithFlags(OpSize::i64Bit, TailCounter, _Constant(1)); - // Offset the pointer - Dest_RDI = _Add(OpSize::i64Bit, Dest_RDI, PtrDir); - StoreGPRRegister(X86State::REG_RDI, Dest_RDI); + // Store the counter since we don't have phis + StoreGPRRegister(X86State::REG_RCX, TailCounter); - // Offset second pointer - Dest_RSI = _Add(OpSize::i64Bit, Dest_RSI, PtrDir); - StoreGPRRegister(X86State::REG_RSI, Dest_RSI); + // Offset the pointer + Dest_RDI = _Add(OpSize::i64Bit, Dest_RDI, _Constant(PtrDir * Size)); + StoreGPRRegister(X86State::REG_RDI, Dest_RDI); - // If TailCounter != 0, compare sources. - // If TailCounter == 0, set ZF iff that would break. - _CondSubNZCV(OpSize::i64Bit, Src2, Src1, {COND_NEQ}, REPE ? 0 : (1 << 2) /* Z */); - CachedNZCV = nullptr; - NZCVDirty = false; - InnerJump = CondJumpNZCV({REPE ? 
COND_EQ : COND_NEQ}); + // Offset second pointer + Dest_RSI = _Add(OpSize::i64Bit, Dest_RSI, _Constant(PtrDir * Size)); + StoreGPRRegister(X86State::REG_RSI, Dest_RSI); - // Jump back to the start if we have more work to do - SetTrueJumpTarget(InnerJump, LoopHeader); - } + // If TailCounter != 0, compare sources. + // If TailCounter == 0, set ZF iff that would break. + _CondSubNZCV(OpSize::i64Bit, Src2, Src1, {COND_NEQ}, REPE ? 0 : (1 << 2) /* Z */); + CachedNZCV = nullptr; + NZCVDirty = false; + InnerJump = CondJumpNZCV({REPE ? COND_EQ : COND_NEQ}); - // Make sure to start a new block after ending this one - auto LoopEnd = CreateNewCodeBlockAfter(GetCurrentBlock()); - SetFalseJumpTarget(InnerJump, LoopEnd); + // Jump back to the start if we have more work to do + SetTrueJumpTarget(InnerJump, LoopHeader); + } - SetCurrentCodeBlock(LoopEnd); - StartNewBlock(); + // Make sure to start a new block after ending this one + auto LoopEnd = CreateNewCodeBlockAfter(GetCurrentBlock()); + SetFalseJumpTarget(InnerJump, LoopEnd); + SetCurrentCodeBlock(LoopEnd); + StartNewBlock(); + }); + + // Make sure to start a new block after ending this one { // Grab the sources from the last iteration so we can set flags. 
auto Src1 = _LoadRegister(false, offsetof(FEXCore::Core::CPUState, pf_raw), GPRClass, GPRFixedClass, CTX->GetGPRSize()); @@ -3695,7 +3697,7 @@ void OpDispatchBuilder::CMPSOp(OpcodeArgs) { } auto Jump_ = Jump(); - auto Exit = CreateNewCodeBlockAfter(LoopEnd); + auto Exit = CreateNewCodeBlockAfter(GetCurrentBlock()); SetJumpTarget(Jump_, Exit); SetTrueJumpTarget(OuterJump, Exit); SetCurrentCodeBlock(Exit); From 5c7f2934de6a0c230bbd572915f64abb3378f21f Mon Sep 17 00:00:00 2001 From: Alyssa Rosenzweig Date: Sun, 31 Mar 2024 20:29:29 -0400 Subject: [PATCH 6/8] OpcodeDispatcher: use ForeachDirection for lods eliminates xblock live Signed-off-by: Alyssa Rosenzweig --- .../Interface/Core/OpcodeDispatcher.cpp | 91 +++++++++---------- 1 file changed, 45 insertions(+), 46 deletions(-) diff --git a/FEXCore/Source/Interface/Core/OpcodeDispatcher.cpp b/FEXCore/Source/Interface/Core/OpcodeDispatcher.cpp index 0b0d2748ac..07b85e0e3c 100644 --- a/FEXCore/Source/Interface/Core/OpcodeDispatcher.cpp +++ b/FEXCore/Source/Interface/Core/OpcodeDispatcher.cpp @@ -3731,65 +3731,64 @@ void OpDispatchBuilder::LODSOp(OpcodeArgs) { // Calculate flags early. 
because end of block CalculateDeferredFlags(); - // XXX: Theoretically LODS could be optimized to - // RSI += {-}(RCX * Size) - // RAX = [RSI - Size] - // But this might violate the case of an application scanning pages for read permission and catching the fault - // May or may not matter - - // Read DF once - auto PtrDir = LoadDir(Size); - - auto JumpStart = Jump(); - // Make sure to start a new block after ending this one - auto LoopStart = CreateNewCodeBlockAfter(GetCurrentBlock()); - SetJumpTarget(JumpStart, LoopStart); - SetCurrentCodeBlock(LoopStart); - StartNewBlock(); + ForeachDirection([this, Op, Size](int PtrDir) { + // XXX: Theoretically LODS could be optimized to + // RSI += {-}(RCX * Size) + // RAX = [RSI - Size] + // But this might violate the case of an application scanning pages for read permission and catching the fault + // May or may not matter + + auto JumpStart = Jump(); + // Make sure to start a new block after ending this one + auto LoopStart = CreateNewCodeBlockAfter(GetCurrentBlock()); + SetJumpTarget(JumpStart, LoopStart); + SetCurrentCodeBlock(LoopStart); + StartNewBlock(); - OrderedNode *Counter = LoadGPRRegister(X86State::REG_RCX); + OrderedNode *Counter = LoadGPRRegister(X86State::REG_RCX); - // Can we end the block? + // Can we end the block? 
- // We leave if RCX = 0 - auto CondJump_ = CondJump(Counter, {COND_EQ}); + // We leave if RCX = 0 + auto CondJump_ = CondJump(Counter, {COND_EQ}); - auto LoopTail = CreateNewCodeBlockAfter(LoopStart); - SetFalseJumpTarget(CondJump_, LoopTail); - SetCurrentCodeBlock(LoopTail); - StartNewBlock(); + auto LoopTail = CreateNewCodeBlockAfter(LoopStart); + SetFalseJumpTarget(CondJump_, LoopTail); + SetCurrentCodeBlock(LoopTail); + StartNewBlock(); - // Working loop - { - OrderedNode *Dest_RSI = LoadGPRRegister(X86State::REG_RSI); + // Working loop + { + OrderedNode *Dest_RSI = LoadGPRRegister(X86State::REG_RSI); - Dest_RSI = AppendSegmentOffset(Dest_RSI, Op->Flags, FEXCore::X86Tables::DecodeFlags::FLAG_DS_PREFIX); + Dest_RSI = AppendSegmentOffset(Dest_RSI, Op->Flags, FEXCore::X86Tables::DecodeFlags::FLAG_DS_PREFIX); - auto Src = _LoadMemAutoTSO(GPRClass, Size, Dest_RSI, Size); + auto Src = _LoadMemAutoTSO(GPRClass, Size, Dest_RSI, Size); - StoreResult(GPRClass, Op, Src, -1); + StoreResult(GPRClass, Op, Src, -1); - OrderedNode *TailCounter = LoadGPRRegister(X86State::REG_RCX); - OrderedNode *TailDest_RSI = LoadGPRRegister(X86State::REG_RSI); + OrderedNode *TailCounter = LoadGPRRegister(X86State::REG_RCX); + OrderedNode *TailDest_RSI = LoadGPRRegister(X86State::REG_RSI); - // Decrement counter - TailCounter = _Sub(OpSize::i64Bit, TailCounter, _Constant(1)); + // Decrement counter + TailCounter = _Sub(OpSize::i64Bit, TailCounter, _Constant(1)); - // Store the counter since we don't have phis - StoreGPRRegister(X86State::REG_RCX, TailCounter); + // Store the counter since we don't have phis + StoreGPRRegister(X86State::REG_RCX, TailCounter); - // Offset the pointer - TailDest_RSI = _Add(OpSize::i64Bit, TailDest_RSI, PtrDir); - StoreGPRRegister(X86State::REG_RSI, TailDest_RSI); + // Offset the pointer + TailDest_RSI = _Add(OpSize::i64Bit, TailDest_RSI, _Constant(PtrDir * Size)); + StoreGPRRegister(X86State::REG_RSI, TailDest_RSI); - // Jump back to the start, we have more 
work to do - Jump(LoopStart); - } - // Make sure to start a new block after ending this one - auto LoopEnd = CreateNewCodeBlockAfter(LoopTail); - SetTrueJumpTarget(CondJump_, LoopEnd); - SetCurrentCodeBlock(LoopEnd); - StartNewBlock(); + // Jump back to the start, we have more work to do + Jump(LoopStart); + } + // Make sure to start a new block after ending this one + auto LoopEnd = CreateNewCodeBlockAfter(LoopTail); + SetTrueJumpTarget(CondJump_, LoopEnd); + SetCurrentCodeBlock(LoopEnd); + StartNewBlock(); + }); } } From 7b1bb159faa9a38813b19e51d83ca2ab8162f138 Mon Sep 17 00:00:00 2001 From: Alyssa Rosenzweig Date: Sun, 31 Mar 2024 20:30:58 -0400 Subject: [PATCH 7/8] OpcodeDispatcher: use ForeachDirection for scas Signed-off-by: Alyssa Rosenzweig --- .../Interface/Core/OpcodeDispatcher.cpp | 95 +++++++++---------- 1 file changed, 47 insertions(+), 48 deletions(-) diff --git a/FEXCore/Source/Interface/Core/OpcodeDispatcher.cpp b/FEXCore/Source/Interface/Core/OpcodeDispatcher.cpp index 07b85e0e3c..c3a6895906 100644 --- a/FEXCore/Source/Interface/Core/OpcodeDispatcher.cpp +++ b/FEXCore/Source/Interface/Core/OpcodeDispatcher.cpp @@ -3819,71 +3819,70 @@ void OpDispatchBuilder::SCASOp(OpcodeArgs) { // Calculate flags early. 
because end of block CalculateDeferredFlags(); - bool REPE = Op->Flags & FEXCore::X86Tables::DecodeFlags::FLAG_REP_PREFIX; - - // read DF once - auto PtrDir = LoadDir(Size); + ForeachDirection([this, Op, Size](int Dir){ + bool REPE = Op->Flags & FEXCore::X86Tables::DecodeFlags::FLAG_REP_PREFIX; - auto JumpStart = Jump(); - // Make sure to start a new block after ending this one - auto LoopStart = CreateNewCodeBlockAfter(GetCurrentBlock()); - SetJumpTarget(JumpStart, LoopStart); - SetCurrentCodeBlock(LoopStart); - StartNewBlock(); + auto JumpStart = Jump(); + // Make sure to start a new block after ending this one + auto LoopStart = CreateNewCodeBlockAfter(GetCurrentBlock()); + SetJumpTarget(JumpStart, LoopStart); + SetCurrentCodeBlock(LoopStart); + StartNewBlock(); - OrderedNode *Counter = LoadGPRRegister(X86State::REG_RCX); + OrderedNode *Counter = LoadGPRRegister(X86State::REG_RCX); - // Can we end the block? - // We leave if RCX = 0 - auto CondJump_ = CondJump(Counter, {COND_EQ}); - IRPair InternalCondJump; + // Can we end the block? 
+ // We leave if RCX = 0 + auto CondJump_ = CondJump(Counter, {COND_EQ}); + IRPair InternalCondJump; - auto LoopTail = CreateNewCodeBlockAfter(LoopStart); - SetFalseJumpTarget(CondJump_, LoopTail); - SetCurrentCodeBlock(LoopTail); - StartNewBlock(); + auto LoopTail = CreateNewCodeBlockAfter(LoopStart); + SetFalseJumpTarget(CondJump_, LoopTail); + SetCurrentCodeBlock(LoopTail); + StartNewBlock(); - // Working loop - { - OrderedNode *Dest_RDI = LoadGPRRegister(X86State::REG_RDI); + // Working loop + { + OrderedNode *Dest_RDI = LoadGPRRegister(X86State::REG_RDI); - Dest_RDI = AppendSegmentOffset(Dest_RDI, 0, FEXCore::X86Tables::DecodeFlags::FLAG_ES_PREFIX, true); + Dest_RDI = AppendSegmentOffset(Dest_RDI, 0, FEXCore::X86Tables::DecodeFlags::FLAG_ES_PREFIX, true); - auto Src1 = LoadSource(GPRClass, Op, Op->Src[0], Op->Flags, {.AllowUpperGarbage = true}); - auto Src2 = _LoadMemAutoTSO(GPRClass, Size, Dest_RDI, Size); + auto Src1 = LoadSource(GPRClass, Op, Op->Src[0], Op->Flags, {.AllowUpperGarbage = true}); + auto Src2 = _LoadMemAutoTSO(GPRClass, Size, Dest_RDI, Size); - GenerateFlags_SUB(Op, Src1, Src2); + GenerateFlags_SUB(Op, Src1, Src2); - // Calculate flags early. - CalculateDeferredFlags(); + // Calculate flags early. 
+ CalculateDeferredFlags(); - OrderedNode *TailCounter = LoadGPRRegister(X86State::REG_RCX); - OrderedNode *TailDest_RDI = LoadGPRRegister(X86State::REG_RDI); + OrderedNode *TailCounter = LoadGPRRegister(X86State::REG_RCX); + OrderedNode *TailDest_RDI = LoadGPRRegister(X86State::REG_RDI); - // Decrement counter - TailCounter = _Sub(OpSize::i64Bit, TailCounter, _Constant(1)); + // Decrement counter + TailCounter = _Sub(OpSize::i64Bit, TailCounter, _Constant(1)); - // Store the counter since we don't have phis - StoreGPRRegister(X86State::REG_RCX, TailCounter); + // Store the counter since we don't have phis + StoreGPRRegister(X86State::REG_RCX, TailCounter); - // Offset the pointer - TailDest_RDI = _Add(OpSize::i64Bit, TailDest_RDI, PtrDir); - StoreGPRRegister(X86State::REG_RDI, TailDest_RDI); + // Offset the pointer + TailDest_RDI = _Add(OpSize::i64Bit, TailDest_RDI, _Constant(Dir * Size)); + StoreGPRRegister(X86State::REG_RDI, TailDest_RDI); - CalculateDeferredFlags(); - InternalCondJump = CondJumpNZCV({REPE ? COND_EQ : COND_NEQ}); + CalculateDeferredFlags(); + InternalCondJump = CondJumpNZCV({REPE ? 
COND_EQ : COND_NEQ}); - // Jump back to the start if we have more work to do - SetTrueJumpTarget(InternalCondJump, LoopStart); - } - // Make sure to start a new block after ending this one - auto LoopEnd = CreateNewCodeBlockAfter(LoopTail); - SetTrueJumpTarget(CondJump_, LoopEnd); + // Jump back to the start if we have more work to do + SetTrueJumpTarget(InternalCondJump, LoopStart); + } + // Make sure to start a new block after ending this one + auto LoopEnd = CreateNewCodeBlockAfter(LoopTail); + SetTrueJumpTarget(CondJump_, LoopEnd); - SetFalseJumpTarget(InternalCondJump, LoopEnd); + SetFalseJumpTarget(InternalCondJump, LoopEnd); - SetCurrentCodeBlock(LoopEnd); - StartNewBlock(); + SetCurrentCodeBlock(LoopEnd); + StartNewBlock(); + }); } } From ad0dd34412d253a408b7cf32e0b08af48dc038ff Mon Sep 17 00:00:00 2001 From: Alyssa Rosenzweig Date: Sun, 31 Mar 2024 20:29:48 -0400 Subject: [PATCH 8/8] InstCountCI: Update Signed-off-by: Alyssa Rosenzweig --- .../InstructionCountCI/FlagM/Primary.json | 356 +++++++++---- unittests/InstructionCountCI/Primary.json | 485 +++++++++++++----- 2 files changed, 626 insertions(+), 215 deletions(-) diff --git a/unittests/InstructionCountCI/FlagM/Primary.json b/unittests/InstructionCountCI/FlagM/Primary.json index b187d5b094..3fa39e3ba8 100644 --- a/unittests/InstructionCountCI/FlagM/Primary.json +++ b/unittests/InstructionCountCI/FlagM/Primary.json @@ -1950,16 +1950,27 @@ ] }, "repz cmpsb": { - "ExpectedInstructionCount": 15, + "ExpectedInstructionCount": 26, "Comment": "0xa6", "ExpectedArm64ASM": [ - "cbz x5, #+0x3c", + "cbz x5, #+0x68", "ldrsb x20, [x28, #714]", + "lsr x20, x20, #63", + "cbz x20, #+0x8", + "b #+0x24", "ldrb w26, [x11]", "ldrb w27, [x10]", "subs x5, x5, #0x1 (1)", - "add x11, x11, x20", - "add x10, x10, x20", + "add x11, x11, #0x1 (1)", + "add x10, x10, #0x1 (1)", + "ccmp x27, x26, #nzcv, ne", + "b.eq #-0x18", + "b #+0x20", + "ldrb w26, [x11]", + "ldrb w27, [x10]", + "subs x5, x5, #0x1 (1)", + "sub x11, x11, #0x1 (1)", 
+ "sub x10, x10, #0x1 (1)", "ccmp x27, x26, #nzcv, ne", "b.eq #-0x18", "mov x20, x27", @@ -1971,17 +1982,27 @@ ] }, "repz cmpsw": { - "ExpectedInstructionCount": 16, + "ExpectedInstructionCount": 26, "Comment": "0xa7", "ExpectedArm64ASM": [ - "cbz x5, #+0x40", + "cbz x5, #+0x68", "ldrsb x20, [x28, #714]", - "lsl x20, x20, #1", + "lsr x20, x20, #63", + "cbz x20, #+0x8", + "b #+0x24", "ldrh w26, [x11]", "ldrh w27, [x10]", "subs x5, x5, #0x1 (1)", - "add x11, x11, x20", - "add x10, x10, x20", + "add x11, x11, #0x2 (2)", + "add x10, x10, #0x2 (2)", + "ccmp x27, x26, #nzcv, ne", + "b.eq #-0x18", + "b #+0x20", + "ldrh w26, [x11]", + "ldrh w27, [x10]", + "subs x5, x5, #0x1 (1)", + "sub x11, x11, #0x2 (2)", + "sub x10, x10, #0x2 (2)", "ccmp x27, x26, #nzcv, ne", "b.eq #-0x18", "mov x20, x27", @@ -1993,17 +2014,27 @@ ] }, "repz cmpsd": { - "ExpectedInstructionCount": 14, + "ExpectedInstructionCount": 24, "Comment": "0xa7", "ExpectedArm64ASM": [ - "cbz x5, #+0x38", + "cbz x5, #+0x60", "ldrsb x20, [x28, #714]", - "lsl x20, x20, #2", + "lsr x20, x20, #63", + "cbz x20, #+0x8", + "b #+0x24", + "ldr w26, [x11]", + "ldr w27, [x10]", + "subs x5, x5, #0x1 (1)", + "add x11, x11, #0x4 (4)", + "add x10, x10, #0x4 (4)", + "ccmp x27, x26, #nzcv, ne", + "b.eq #-0x18", + "b #+0x20", "ldr w26, [x11]", "ldr w27, [x10]", "subs x5, x5, #0x1 (1)", - "add x11, x11, x20", - "add x10, x10, x20", + "sub x11, x11, #0x4 (4)", + "sub x10, x10, #0x4 (4)", "ccmp x27, x26, #nzcv, ne", "b.eq #-0x18", "mov x20, x27", @@ -2013,17 +2044,27 @@ ] }, "repz cmpsq": { - "ExpectedInstructionCount": 14, + "ExpectedInstructionCount": 24, "Comment": "0xa7", "ExpectedArm64ASM": [ - "cbz x5, #+0x38", + "cbz x5, #+0x60", "ldrsb x20, [x28, #714]", - "lsl x20, x20, #3", + "lsr x20, x20, #63", + "cbz x20, #+0x8", + "b #+0x24", "ldr x26, [x11]", "ldr x27, [x10]", "subs x5, x5, #0x1 (1)", - "add x11, x11, x20", - "add x10, x10, x20", + "add x11, x11, #0x8 (8)", + "add x10, x10, #0x8 (8)", + "ccmp x27, x26, #nzcv, ne", + 
"b.eq #-0x18", + "b #+0x20", + "ldr x26, [x11]", + "ldr x27, [x10]", + "subs x5, x5, #0x1 (1)", + "sub x11, x11, #0x8 (8)", + "sub x10, x10, #0x8 (8)", "ccmp x27, x26, #nzcv, ne", "b.eq #-0x18", "mov x20, x27", @@ -2033,16 +2074,27 @@ ] }, "repnz cmpsb": { - "ExpectedInstructionCount": 15, + "ExpectedInstructionCount": 26, "Comment": "0xa6", "ExpectedArm64ASM": [ - "cbz x5, #+0x3c", + "cbz x5, #+0x68", "ldrsb x20, [x28, #714]", + "lsr x20, x20, #63", + "cbz x20, #+0x8", + "b #+0x24", "ldrb w26, [x11]", "ldrb w27, [x10]", "subs x5, x5, #0x1 (1)", - "add x11, x11, x20", - "add x10, x10, x20", + "add x11, x11, #0x1 (1)", + "add x10, x10, #0x1 (1)", + "ccmp x27, x26, #nZcv, ne", + "b.ne #-0x18", + "b #+0x20", + "ldrb w26, [x11]", + "ldrb w27, [x10]", + "subs x5, x5, #0x1 (1)", + "sub x11, x11, #0x1 (1)", + "sub x10, x10, #0x1 (1)", "ccmp x27, x26, #nZcv, ne", "b.ne #-0x18", "mov x20, x27", @@ -2054,17 +2106,27 @@ ] }, "repnz cmpsw": { - "ExpectedInstructionCount": 16, + "ExpectedInstructionCount": 26, "Comment": "0xa7", "ExpectedArm64ASM": [ - "cbz x5, #+0x40", + "cbz x5, #+0x68", "ldrsb x20, [x28, #714]", - "lsl x20, x20, #1", + "lsr x20, x20, #63", + "cbz x20, #+0x8", + "b #+0x24", + "ldrh w26, [x11]", + "ldrh w27, [x10]", + "subs x5, x5, #0x1 (1)", + "add x11, x11, #0x2 (2)", + "add x10, x10, #0x2 (2)", + "ccmp x27, x26, #nZcv, ne", + "b.ne #-0x18", + "b #+0x20", "ldrh w26, [x11]", "ldrh w27, [x10]", "subs x5, x5, #0x1 (1)", - "add x11, x11, x20", - "add x10, x10, x20", + "sub x11, x11, #0x2 (2)", + "sub x10, x10, #0x2 (2)", "ccmp x27, x26, #nZcv, ne", "b.ne #-0x18", "mov x20, x27", @@ -2076,17 +2138,27 @@ ] }, "repnz cmpsd": { - "ExpectedInstructionCount": 14, + "ExpectedInstructionCount": 24, "Comment": "0xa7", "ExpectedArm64ASM": [ - "cbz x5, #+0x38", + "cbz x5, #+0x60", "ldrsb x20, [x28, #714]", - "lsl x20, x20, #2", + "lsr x20, x20, #63", + "cbz x20, #+0x8", + "b #+0x24", "ldr w26, [x11]", "ldr w27, [x10]", "subs x5, x5, #0x1 (1)", - "add x11, x11, x20", - "add 
x10, x10, x20", + "add x11, x11, #0x4 (4)", + "add x10, x10, #0x4 (4)", + "ccmp x27, x26, #nZcv, ne", + "b.ne #-0x18", + "b #+0x20", + "ldr w26, [x11]", + "ldr w27, [x10]", + "subs x5, x5, #0x1 (1)", + "sub x11, x11, #0x4 (4)", + "sub x10, x10, #0x4 (4)", "ccmp x27, x26, #nZcv, ne", "b.ne #-0x18", "mov x20, x27", @@ -2096,17 +2168,27 @@ ] }, "repnz cmpsq": { - "ExpectedInstructionCount": 14, + "ExpectedInstructionCount": 24, "Comment": "0xa7", "ExpectedArm64ASM": [ - "cbz x5, #+0x38", + "cbz x5, #+0x60", "ldrsb x20, [x28, #714]", - "lsl x20, x20, #3", + "lsr x20, x20, #63", + "cbz x20, #+0x8", + "b #+0x24", "ldr x26, [x11]", "ldr x27, [x10]", "subs x5, x5, #0x1 (1)", - "add x11, x11, x20", - "add x10, x10, x20", + "add x11, x11, #0x8 (8)", + "add x10, x10, #0x8 (8)", + "ccmp x27, x26, #nZcv, ne", + "b.ne #-0x18", + "b #+0x20", + "ldr x26, [x11]", + "ldr x27, [x10]", + "subs x5, x5, #0x1 (1)", + "sub x11, x11, #0x8 (8)", + "sub x10, x10, #0x8 (8)", "ccmp x27, x26, #nZcv, ne", "b.ne #-0x18", "mov x20, x27", @@ -2228,136 +2310,234 @@ ] }, "repz scasb": { - "ExpectedInstructionCount": 11, + "ExpectedInstructionCount": 25, "Comment": "0xae", "ExpectedArm64ASM": [ "ldrsb x20, [x28, #714]", + "lsr x20, x20, #63", + "cbz x20, #+0x8", + "b #+0x30", + "cbz x5, #+0x28", + "ldrb w20, [x11]", + "eor w27, w4, w20", + "lsl w0, w4, #24", + "cmp w0, w20, lsl #24", + "sub w26, w4, w20", + "cfinv", + "sub x5, x5, #0x1 (1)", + "add x11, x11, #0x1 (1)", + "b.eq #-0x24", + "b #+0x2c", "cbz x5, #+0x28", - "ldrb w21, [x11]", - "eor w27, w4, w21", + "ldrb w20, [x11]", + "eor w27, w4, w20", "lsl w0, w4, #24", - "cmp w0, w21, lsl #24", - "sub w26, w4, w21", + "cmp w0, w20, lsl #24", + "sub w26, w4, w20", "cfinv", "sub x5, x5, #0x1 (1)", - "add x11, x11, x20", + "sub x11, x11, #0x1 (1)", "b.eq #-0x24" ] }, "repz scasw": { - "ExpectedInstructionCount": 12, + "ExpectedInstructionCount": 25, "Comment": "0xaf", "ExpectedArm64ASM": [ "ldrsb x20, [x28, #714]", - "lsl x20, x20, #1", + "lsr x20, x20, 
#63", + "cbz x20, #+0x8", + "b #+0x30", "cbz x5, #+0x28", - "ldrh w21, [x11]", - "eor w27, w4, w21", + "ldrh w20, [x11]", + "eor w27, w4, w20", + "lsl w0, w4, #16", + "cmp w0, w20, lsl #16", + "sub w26, w4, w20", + "cfinv", + "sub x5, x5, #0x1 (1)", + "add x11, x11, #0x2 (2)", + "b.eq #-0x24", + "b #+0x2c", + "cbz x5, #+0x28", + "ldrh w20, [x11]", + "eor w27, w4, w20", "lsl w0, w4, #16", - "cmp w0, w21, lsl #16", - "sub w26, w4, w21", + "cmp w0, w20, lsl #16", + "sub w26, w4, w20", "cfinv", "sub x5, x5, #0x1 (1)", - "add x11, x11, x20", + "sub x11, x11, #0x2 (2)", "b.eq #-0x24" ] }, "repz scasd": { - "ExpectedInstructionCount": 10, + "ExpectedInstructionCount": 21, "Comment": "0xaf", "ExpectedArm64ASM": [ "ldrsb x20, [x28, #714]", - "lsl x20, x20, #2", + "lsr x20, x20, #63", + "cbz x20, #+0x8", + "b #+0x28", "cbz x5, #+0x20", - "ldr w21, [x11]", - "eor w27, w4, w21", - "subs w26, w4, w21", + "ldr w20, [x11]", + "eor w27, w4, w20", + "subs w26, w4, w20", "cfinv", "sub x5, x5, #0x1 (1)", - "add x11, x11, x20", + "add x11, x11, #0x4 (4)", + "b.eq #-0x1c", + "b #+0x24", + "cbz x5, #+0x20", + "ldr w20, [x11]", + "eor w27, w4, w20", + "subs w26, w4, w20", + "cfinv", + "sub x5, x5, #0x1 (1)", + "sub x11, x11, #0x4 (4)", "b.eq #-0x1c" ] }, "repz scasq": { - "ExpectedInstructionCount": 10, + "ExpectedInstructionCount": 21, "Comment": "0xaf", "ExpectedArm64ASM": [ "ldrsb x20, [x28, #714]", - "lsl x20, x20, #3", + "lsr x20, x20, #63", + "cbz x20, #+0x8", + "b #+0x28", + "cbz x5, #+0x20", + "ldr x20, [x11]", + "eor w27, w4, w20", + "subs x26, x4, x20", + "cfinv", + "sub x5, x5, #0x1 (1)", + "add x11, x11, #0x8 (8)", + "b.eq #-0x1c", + "b #+0x24", "cbz x5, #+0x20", - "ldr x21, [x11]", - "eor w27, w4, w21", - "subs x26, x4, x21", + "ldr x20, [x11]", + "eor w27, w4, w20", + "subs x26, x4, x20", "cfinv", "sub x5, x5, #0x1 (1)", - "add x11, x11, x20", + "sub x11, x11, #0x8 (8)", "b.eq #-0x1c" ] }, "repnz scasb": { - "ExpectedInstructionCount": 11, + "ExpectedInstructionCount": 25, 
"Comment": "0xae", "ExpectedArm64ASM": [ "ldrsb x20, [x28, #714]", + "lsr x20, x20, #63", + "cbz x20, #+0x8", + "b #+0x30", "cbz x5, #+0x28", - "ldrb w21, [x11]", - "eor w27, w4, w21", + "ldrb w20, [x11]", + "eor w27, w4, w20", + "lsl w0, w4, #24", + "cmp w0, w20, lsl #24", + "sub w26, w4, w20", + "cfinv", + "sub x5, x5, #0x1 (1)", + "add x11, x11, #0x1 (1)", + "b.ne #-0x24", + "b #+0x2c", + "cbz x5, #+0x28", + "ldrb w20, [x11]", + "eor w27, w4, w20", "lsl w0, w4, #24", - "cmp w0, w21, lsl #24", - "sub w26, w4, w21", + "cmp w0, w20, lsl #24", + "sub w26, w4, w20", "cfinv", "sub x5, x5, #0x1 (1)", - "add x11, x11, x20", + "sub x11, x11, #0x1 (1)", "b.ne #-0x24" ] }, "repnz scasw": { - "ExpectedInstructionCount": 12, + "ExpectedInstructionCount": 25, "Comment": "0xaf", "ExpectedArm64ASM": [ "ldrsb x20, [x28, #714]", - "lsl x20, x20, #1", + "lsr x20, x20, #63", + "cbz x20, #+0x8", + "b #+0x30", "cbz x5, #+0x28", - "ldrh w21, [x11]", - "eor w27, w4, w21", + "ldrh w20, [x11]", + "eor w27, w4, w20", "lsl w0, w4, #16", - "cmp w0, w21, lsl #16", - "sub w26, w4, w21", + "cmp w0, w20, lsl #16", + "sub w26, w4, w20", "cfinv", "sub x5, x5, #0x1 (1)", - "add x11, x11, x20", + "add x11, x11, #0x2 (2)", + "b.ne #-0x24", + "b #+0x2c", + "cbz x5, #+0x28", + "ldrh w20, [x11]", + "eor w27, w4, w20", + "lsl w0, w4, #16", + "cmp w0, w20, lsl #16", + "sub w26, w4, w20", + "cfinv", + "sub x5, x5, #0x1 (1)", + "sub x11, x11, #0x2 (2)", "b.ne #-0x24" ] }, "repnz scasd": { - "ExpectedInstructionCount": 10, + "ExpectedInstructionCount": 21, "Comment": "0xaf", "ExpectedArm64ASM": [ "ldrsb x20, [x28, #714]", - "lsl x20, x20, #2", + "lsr x20, x20, #63", + "cbz x20, #+0x8", + "b #+0x28", + "cbz x5, #+0x20", + "ldr w20, [x11]", + "eor w27, w4, w20", + "subs w26, w4, w20", + "cfinv", + "sub x5, x5, #0x1 (1)", + "add x11, x11, #0x4 (4)", + "b.ne #-0x1c", + "b #+0x24", "cbz x5, #+0x20", - "ldr w21, [x11]", - "eor w27, w4, w21", - "subs w26, w4, w21", + "ldr w20, [x11]", + "eor w27, w4, w20", + "subs 
w26, w4, w20", "cfinv", "sub x5, x5, #0x1 (1)", - "add x11, x11, x20", + "sub x11, x11, #0x4 (4)", "b.ne #-0x1c" ] }, "repnz scasq": { - "ExpectedInstructionCount": 10, + "ExpectedInstructionCount": 21, "Comment": "0xaf", "ExpectedArm64ASM": [ "ldrsb x20, [x28, #714]", - "lsl x20, x20, #3", + "lsr x20, x20, #63", + "cbz x20, #+0x8", + "b #+0x28", "cbz x5, #+0x20", - "ldr x21, [x11]", - "eor w27, w4, w21", - "subs x26, x4, x21", + "ldr x20, [x11]", + "eor w27, w4, w20", + "subs x26, x4, x20", + "cfinv", + "sub x5, x5, #0x1 (1)", + "add x11, x11, #0x8 (8)", + "b.ne #-0x1c", + "b #+0x24", + "cbz x5, #+0x20", + "ldr x20, [x11]", + "eor w27, w4, w20", + "subs x26, x4, x20", "cfinv", "sub x5, x5, #0x1 (1)", - "add x11, x11, x20", + "sub x11, x11, #0x8 (8)", "b.ne #-0x1c" ] }, diff --git a/unittests/InstructionCountCI/Primary.json b/unittests/InstructionCountCI/Primary.json index 26e1fac028..7590652008 100644 --- a/unittests/InstructionCountCI/Primary.json +++ b/unittests/InstructionCountCI/Primary.json @@ -3295,16 +3295,27 @@ ] }, "repz cmpsb": { - "ExpectedInstructionCount": 17, + "ExpectedInstructionCount": 28, "Comment": "0xa6", "ExpectedArm64ASM": [ - "cbz x5, #+0x44", + "cbz x5, #+0x70", "ldrsb x20, [x28, #714]", + "lsr x20, x20, #63", + "cbz x20, #+0x8", + "b #+0x24", "ldrb w26, [x11]", "ldrb w27, [x10]", "subs x5, x5, #0x1 (1)", - "add x11, x11, x20", - "add x10, x10, x20", + "add x11, x11, #0x1 (1)", + "add x10, x10, #0x1 (1)", + "ccmp x27, x26, #nzcv, ne", + "b.eq #-0x18", + "b #+0x20", + "ldrb w26, [x11]", + "ldrb w27, [x10]", + "subs x5, x5, #0x1 (1)", + "sub x11, x11, #0x1 (1)", + "sub x10, x10, #0x1 (1)", "ccmp x27, x26, #nzcv, ne", "b.eq #-0x18", "mov x20, x27", @@ -3318,17 +3329,27 @@ ] }, "repz cmpsw": { - "ExpectedInstructionCount": 18, + "ExpectedInstructionCount": 28, "Comment": "0xa7", "ExpectedArm64ASM": [ - "cbz x5, #+0x48", + "cbz x5, #+0x70", "ldrsb x20, [x28, #714]", - "lsl x20, x20, #1", + "lsr x20, x20, #63", + "cbz x20, #+0x8", + "b #+0x24", 
"ldrh w26, [x11]", "ldrh w27, [x10]", "subs x5, x5, #0x1 (1)", - "add x11, x11, x20", - "add x10, x10, x20", + "add x11, x11, #0x2 (2)", + "add x10, x10, #0x2 (2)", + "ccmp x27, x26, #nzcv, ne", + "b.eq #-0x18", + "b #+0x20", + "ldrh w26, [x11]", + "ldrh w27, [x10]", + "subs x5, x5, #0x1 (1)", + "sub x11, x11, #0x2 (2)", + "sub x10, x10, #0x2 (2)", "ccmp x27, x26, #nzcv, ne", "b.eq #-0x18", "mov x20, x27", @@ -3342,17 +3363,27 @@ ] }, "repz cmpsd": { - "ExpectedInstructionCount": 16, + "ExpectedInstructionCount": 26, "Comment": "0xa7", "ExpectedArm64ASM": [ - "cbz x5, #+0x40", + "cbz x5, #+0x68", "ldrsb x20, [x28, #714]", - "lsl x20, x20, #2", + "lsr x20, x20, #63", + "cbz x20, #+0x8", + "b #+0x24", "ldr w26, [x11]", "ldr w27, [x10]", "subs x5, x5, #0x1 (1)", - "add x11, x11, x20", - "add x10, x10, x20", + "add x11, x11, #0x4 (4)", + "add x10, x10, #0x4 (4)", + "ccmp x27, x26, #nzcv, ne", + "b.eq #-0x18", + "b #+0x20", + "ldr w26, [x11]", + "ldr w27, [x10]", + "subs x5, x5, #0x1 (1)", + "sub x11, x11, #0x4 (4)", + "sub x10, x10, #0x4 (4)", "ccmp x27, x26, #nzcv, ne", "b.eq #-0x18", "mov x20, x27", @@ -3364,17 +3395,27 @@ ] }, "repz cmpsq": { - "ExpectedInstructionCount": 16, + "ExpectedInstructionCount": 26, "Comment": "0xa7", "ExpectedArm64ASM": [ - "cbz x5, #+0x40", + "cbz x5, #+0x68", "ldrsb x20, [x28, #714]", - "lsl x20, x20, #3", + "lsr x20, x20, #63", + "cbz x20, #+0x8", + "b #+0x24", "ldr x26, [x11]", "ldr x27, [x10]", "subs x5, x5, #0x1 (1)", - "add x11, x11, x20", - "add x10, x10, x20", + "add x11, x11, #0x8 (8)", + "add x10, x10, #0x8 (8)", + "ccmp x27, x26, #nzcv, ne", + "b.eq #-0x18", + "b #+0x20", + "ldr x26, [x11]", + "ldr x27, [x10]", + "subs x5, x5, #0x1 (1)", + "sub x11, x11, #0x8 (8)", + "sub x10, x10, #0x8 (8)", "ccmp x27, x26, #nzcv, ne", "b.eq #-0x18", "mov x20, x27", @@ -3386,16 +3427,27 @@ ] }, "repnz cmpsb": { - "ExpectedInstructionCount": 17, + "ExpectedInstructionCount": 28, "Comment": "0xa6", "ExpectedArm64ASM": [ - "cbz x5, #+0x44", + 
"cbz x5, #+0x70", "ldrsb x20, [x28, #714]", + "lsr x20, x20, #63", + "cbz x20, #+0x8", + "b #+0x24", "ldrb w26, [x11]", "ldrb w27, [x10]", "subs x5, x5, #0x1 (1)", - "add x11, x11, x20", - "add x10, x10, x20", + "add x11, x11, #0x1 (1)", + "add x10, x10, #0x1 (1)", + "ccmp x27, x26, #nZcv, ne", + "b.ne #-0x18", + "b #+0x20", + "ldrb w26, [x11]", + "ldrb w27, [x10]", + "subs x5, x5, #0x1 (1)", + "sub x11, x11, #0x1 (1)", + "sub x10, x10, #0x1 (1)", "ccmp x27, x26, #nZcv, ne", "b.ne #-0x18", "mov x20, x27", @@ -3409,17 +3461,27 @@ ] }, "repnz cmpsw": { - "ExpectedInstructionCount": 18, + "ExpectedInstructionCount": 28, "Comment": "0xa7", "ExpectedArm64ASM": [ - "cbz x5, #+0x48", + "cbz x5, #+0x70", "ldrsb x20, [x28, #714]", - "lsl x20, x20, #1", + "lsr x20, x20, #63", + "cbz x20, #+0x8", + "b #+0x24", "ldrh w26, [x11]", "ldrh w27, [x10]", "subs x5, x5, #0x1 (1)", - "add x11, x11, x20", - "add x10, x10, x20", + "add x11, x11, #0x2 (2)", + "add x10, x10, #0x2 (2)", + "ccmp x27, x26, #nZcv, ne", + "b.ne #-0x18", + "b #+0x20", + "ldrh w26, [x11]", + "ldrh w27, [x10]", + "subs x5, x5, #0x1 (1)", + "sub x11, x11, #0x2 (2)", + "sub x10, x10, #0x2 (2)", "ccmp x27, x26, #nZcv, ne", "b.ne #-0x18", "mov x20, x27", @@ -3433,17 +3495,27 @@ ] }, "repnz cmpsd": { - "ExpectedInstructionCount": 16, + "ExpectedInstructionCount": 26, "Comment": "0xa7", "ExpectedArm64ASM": [ - "cbz x5, #+0x40", + "cbz x5, #+0x68", "ldrsb x20, [x28, #714]", - "lsl x20, x20, #2", + "lsr x20, x20, #63", + "cbz x20, #+0x8", + "b #+0x24", "ldr w26, [x11]", "ldr w27, [x10]", "subs x5, x5, #0x1 (1)", - "add x11, x11, x20", - "add x10, x10, x20", + "add x11, x11, #0x4 (4)", + "add x10, x10, #0x4 (4)", + "ccmp x27, x26, #nZcv, ne", + "b.ne #-0x18", + "b #+0x20", + "ldr w26, [x11]", + "ldr w27, [x10]", + "subs x5, x5, #0x1 (1)", + "sub x11, x11, #0x4 (4)", + "sub x10, x10, #0x4 (4)", "ccmp x27, x26, #nZcv, ne", "b.ne #-0x18", "mov x20, x27", @@ -3455,17 +3527,27 @@ ] }, "repnz cmpsq": { - 
"ExpectedInstructionCount": 16, + "ExpectedInstructionCount": 26, "Comment": "0xa7", "ExpectedArm64ASM": [ - "cbz x5, #+0x40", + "cbz x5, #+0x68", "ldrsb x20, [x28, #714]", - "lsl x20, x20, #3", + "lsr x20, x20, #63", + "cbz x20, #+0x8", + "b #+0x24", "ldr x26, [x11]", "ldr x27, [x10]", "subs x5, x5, #0x1 (1)", - "add x11, x11, x20", - "add x10, x10, x20", + "add x11, x11, #0x8 (8)", + "add x10, x10, #0x8 (8)", + "ccmp x27, x26, #nZcv, ne", + "b.ne #-0x18", + "b #+0x20", + "ldr x26, [x11]", + "ldr x27, [x10]", + "subs x5, x5, #0x1 (1)", + "sub x11, x11, #0x8 (8)", + "sub x10, x10, #0x8 (8)", "ccmp x27, x26, #nZcv, ne", "b.ne #-0x18", "mov x20, x27", @@ -3858,55 +3940,90 @@ ] }, "rep lodsb": { - "ExpectedInstructionCount": 7, + "ExpectedInstructionCount": 17, "Comment": "0xac", "ExpectedArm64ASM": [ "ldrsb x20, [x28, #714]", + "lsr x20, x20, #63", + "cbz x20, #+0x8", + "b #+0x20", "cbz x5, #+0x18", - "ldrb w21, [x10]", - "bfxil x4, x21, #0, #8", + "ldrb w20, [x10]", + "bfxil x4, x20, #0, #8", "sub x5, x5, #0x1 (1)", - "add x10, x10, x20", + "add x10, x10, #0x1 (1)", + "b #-0x14", + "b #+0x1c", + "cbz x5, #+0x18", + "ldrb w20, [x10]", + "bfxil x4, x20, #0, #8", + "sub x5, x5, #0x1 (1)", + "sub x10, x10, #0x1 (1)", "b #-0x14" ] }, "rep lodsw": { - "ExpectedInstructionCount": 8, + "ExpectedInstructionCount": 17, "Comment": "0xad", "ExpectedArm64ASM": [ "ldrsb x20, [x28, #714]", - "lsl x20, x20, #1", + "lsr x20, x20, #63", + "cbz x20, #+0x8", + "b #+0x20", "cbz x5, #+0x18", - "ldrh w21, [x10]", - "bfxil x4, x21, #0, #16", + "ldrh w20, [x10]", + "bfxil x4, x20, #0, #16", "sub x5, x5, #0x1 (1)", - "add x10, x10, x20", + "add x10, x10, #0x2 (2)", + "b #-0x14", + "b #+0x1c", + "cbz x5, #+0x18", + "ldrh w20, [x10]", + "bfxil x4, x20, #0, #16", + "sub x5, x5, #0x1 (1)", + "sub x10, x10, #0x2 (2)", "b #-0x14" ] }, "rep lodsd": { - "ExpectedInstructionCount": 7, + "ExpectedInstructionCount": 15, "Comment": "0xad", "ExpectedArm64ASM": [ "ldrsb x20, [x28, #714]", - "lsl x20, x20, 
#2", + "lsr x20, x20, #63", + "cbz x20, #+0x8", + "b #+0x1c", "cbz x5, #+0x14", "ldr w4, [x10]", "sub x5, x5, #0x1 (1)", - "add x10, x10, x20", + "add x10, x10, #0x4 (4)", + "b #-0x10", + "b #+0x18", + "cbz x5, #+0x14", + "ldr w4, [x10]", + "sub x5, x5, #0x1 (1)", + "sub x10, x10, #0x4 (4)", "b #-0x10" ] }, "rep lodsq": { - "ExpectedInstructionCount": 7, + "ExpectedInstructionCount": 15, "Comment": "0xad", "ExpectedArm64ASM": [ "ldrsb x20, [x28, #714]", - "lsl x20, x20, #3", + "lsr x20, x20, #63", + "cbz x20, #+0x8", + "b #+0x1c", "cbz x5, #+0x14", "ldr x4, [x10]", "sub x5, x5, #0x1 (1)", - "add x10, x10, x20", + "add x10, x10, #0x8 (8)", + "b #-0x10", + "b #+0x18", + "cbz x5, #+0x14", + "ldr x4, [x10]", + "sub x5, x5, #0x1 (1)", + "sub x10, x10, #0x8 (8)", "b #-0x10" ] }, @@ -3971,152 +4088,266 @@ ] }, "repz scasb": { - "ExpectedInstructionCount": 13, + "ExpectedInstructionCount": 29, "Comment": "0xae", "ExpectedArm64ASM": [ "ldrsb x20, [x28, #714]", + "lsr x20, x20, #63", + "cbz x20, #+0x8", + "b #+0x38", + "cbz x5, #+0x30", + "ldrb w20, [x11]", + "eor w27, w4, w20", + "lsl w0, w4, #24", + "cmp w0, w20, lsl #24", + "sub w26, w4, w20", + "mrs x20, nzcv", + "eor w20, w20, #0x20000000", + "msr nzcv, x20", + "sub x5, x5, #0x1 (1)", + "add x11, x11, #0x1 (1)", + "b.eq #-0x2c", + "b #+0x34", "cbz x5, #+0x30", - "ldrb w21, [x11]", - "eor w27, w4, w21", + "ldrb w20, [x11]", + "eor w27, w4, w20", "lsl w0, w4, #24", - "cmp w0, w21, lsl #24", - "sub w26, w4, w21", - "mrs x21, nzcv", - "eor w21, w21, #0x20000000", - "msr nzcv, x21", + "cmp w0, w20, lsl #24", + "sub w26, w4, w20", + "mrs x20, nzcv", + "eor w20, w20, #0x20000000", + "msr nzcv, x20", "sub x5, x5, #0x1 (1)", - "add x11, x11, x20", + "sub x11, x11, #0x1 (1)", "b.eq #-0x2c" ] }, "repz scasw": { - "ExpectedInstructionCount": 14, + "ExpectedInstructionCount": 29, "Comment": "0xaf", "ExpectedArm64ASM": [ "ldrsb x20, [x28, #714]", - "lsl x20, x20, #1", + "lsr x20, x20, #63", + "cbz x20, #+0x8", + "b #+0x38", "cbz x5, 
#+0x30", - "ldrh w21, [x11]", - "eor w27, w4, w21", + "ldrh w20, [x11]", + "eor w27, w4, w20", "lsl w0, w4, #16", - "cmp w0, w21, lsl #16", - "sub w26, w4, w21", - "mrs x21, nzcv", - "eor w21, w21, #0x20000000", - "msr nzcv, x21", + "cmp w0, w20, lsl #16", + "sub w26, w4, w20", + "mrs x20, nzcv", + "eor w20, w20, #0x20000000", + "msr nzcv, x20", "sub x5, x5, #0x1 (1)", - "add x11, x11, x20", + "add x11, x11, #0x2 (2)", + "b.eq #-0x2c", + "b #+0x34", + "cbz x5, #+0x30", + "ldrh w20, [x11]", + "eor w27, w4, w20", + "lsl w0, w4, #16", + "cmp w0, w20, lsl #16", + "sub w26, w4, w20", + "mrs x20, nzcv", + "eor w20, w20, #0x20000000", + "msr nzcv, x20", + "sub x5, x5, #0x1 (1)", + "sub x11, x11, #0x2 (2)", "b.eq #-0x2c" ] }, "repz scasd": { - "ExpectedInstructionCount": 12, + "ExpectedInstructionCount": 25, "Comment": "0xaf", "ExpectedArm64ASM": [ "ldrsb x20, [x28, #714]", - "lsl x20, x20, #2", + "lsr x20, x20, #63", + "cbz x20, #+0x8", + "b #+0x30", "cbz x5, #+0x28", - "ldr w21, [x11]", - "eor w27, w4, w21", - "subs w26, w4, w21", - "mrs x21, nzcv", - "eor w21, w21, #0x20000000", - "msr nzcv, x21", + "ldr w20, [x11]", + "eor w27, w4, w20", + "subs w26, w4, w20", + "mrs x20, nzcv", + "eor w20, w20, #0x20000000", + "msr nzcv, x20", + "sub x5, x5, #0x1 (1)", + "add x11, x11, #0x4 (4)", + "b.eq #-0x24", + "b #+0x2c", + "cbz x5, #+0x28", + "ldr w20, [x11]", + "eor w27, w4, w20", + "subs w26, w4, w20", + "mrs x20, nzcv", + "eor w20, w20, #0x20000000", + "msr nzcv, x20", "sub x5, x5, #0x1 (1)", - "add x11, x11, x20", + "sub x11, x11, #0x4 (4)", "b.eq #-0x24" ] }, "repz scasq": { - "ExpectedInstructionCount": 12, + "ExpectedInstructionCount": 25, "Comment": "0xaf", "ExpectedArm64ASM": [ "ldrsb x20, [x28, #714]", - "lsl x20, x20, #3", + "lsr x20, x20, #63", + "cbz x20, #+0x8", + "b #+0x30", "cbz x5, #+0x28", - "ldr x21, [x11]", - "eor w27, w4, w21", - "subs x26, x4, x21", - "mrs x21, nzcv", - "eor w21, w21, #0x20000000", - "msr nzcv, x21", + "ldr x20, [x11]", + "eor w27, w4, 
w20", + "subs x26, x4, x20", + "mrs x20, nzcv", + "eor w20, w20, #0x20000000", + "msr nzcv, x20", + "sub x5, x5, #0x1 (1)", + "add x11, x11, #0x8 (8)", + "b.eq #-0x24", + "b #+0x2c", + "cbz x5, #+0x28", + "ldr x20, [x11]", + "eor w27, w4, w20", + "subs x26, x4, x20", + "mrs x20, nzcv", + "eor w20, w20, #0x20000000", + "msr nzcv, x20", "sub x5, x5, #0x1 (1)", - "add x11, x11, x20", + "sub x11, x11, #0x8 (8)", "b.eq #-0x24" ] }, "repnz scasb": { - "ExpectedInstructionCount": 13, + "ExpectedInstructionCount": 29, "Comment": "0xae", "ExpectedArm64ASM": [ "ldrsb x20, [x28, #714]", + "lsr x20, x20, #63", + "cbz x20, #+0x8", + "b #+0x38", "cbz x5, #+0x30", - "ldrb w21, [x11]", - "eor w27, w4, w21", + "ldrb w20, [x11]", + "eor w27, w4, w20", "lsl w0, w4, #24", - "cmp w0, w21, lsl #24", - "sub w26, w4, w21", - "mrs x21, nzcv", - "eor w21, w21, #0x20000000", - "msr nzcv, x21", + "cmp w0, w20, lsl #24", + "sub w26, w4, w20", + "mrs x20, nzcv", + "eor w20, w20, #0x20000000", + "msr nzcv, x20", "sub x5, x5, #0x1 (1)", - "add x11, x11, x20", + "add x11, x11, #0x1 (1)", + "b.ne #-0x2c", + "b #+0x34", + "cbz x5, #+0x30", + "ldrb w20, [x11]", + "eor w27, w4, w20", + "lsl w0, w4, #24", + "cmp w0, w20, lsl #24", + "sub w26, w4, w20", + "mrs x20, nzcv", + "eor w20, w20, #0x20000000", + "msr nzcv, x20", + "sub x5, x5, #0x1 (1)", + "sub x11, x11, #0x1 (1)", "b.ne #-0x2c" ] }, "repnz scasw": { - "ExpectedInstructionCount": 14, + "ExpectedInstructionCount": 29, "Comment": "0xaf", "ExpectedArm64ASM": [ "ldrsb x20, [x28, #714]", - "lsl x20, x20, #1", + "lsr x20, x20, #63", + "cbz x20, #+0x8", + "b #+0x38", "cbz x5, #+0x30", - "ldrh w21, [x11]", - "eor w27, w4, w21", + "ldrh w20, [x11]", + "eor w27, w4, w20", "lsl w0, w4, #16", - "cmp w0, w21, lsl #16", - "sub w26, w4, w21", - "mrs x21, nzcv", - "eor w21, w21, #0x20000000", - "msr nzcv, x21", + "cmp w0, w20, lsl #16", + "sub w26, w4, w20", + "mrs x20, nzcv", + "eor w20, w20, #0x20000000", + "msr nzcv, x20", + "sub x5, x5, #0x1 (1)", + "add 
x11, x11, #0x2 (2)", + "b.ne #-0x2c", + "b #+0x34", + "cbz x5, #+0x30", + "ldrh w20, [x11]", + "eor w27, w4, w20", + "lsl w0, w4, #16", + "cmp w0, w20, lsl #16", + "sub w26, w4, w20", + "mrs x20, nzcv", + "eor w20, w20, #0x20000000", + "msr nzcv, x20", "sub x5, x5, #0x1 (1)", - "add x11, x11, x20", + "sub x11, x11, #0x2 (2)", "b.ne #-0x2c" ] }, "repnz scasd": { - "ExpectedInstructionCount": 12, + "ExpectedInstructionCount": 25, "Comment": "0xaf", "ExpectedArm64ASM": [ "ldrsb x20, [x28, #714]", - "lsl x20, x20, #2", + "lsr x20, x20, #63", + "cbz x20, #+0x8", + "b #+0x30", "cbz x5, #+0x28", - "ldr w21, [x11]", - "eor w27, w4, w21", - "subs w26, w4, w21", - "mrs x21, nzcv", - "eor w21, w21, #0x20000000", - "msr nzcv, x21", + "ldr w20, [x11]", + "eor w27, w4, w20", + "subs w26, w4, w20", + "mrs x20, nzcv", + "eor w20, w20, #0x20000000", + "msr nzcv, x20", + "sub x5, x5, #0x1 (1)", + "add x11, x11, #0x4 (4)", + "b.ne #-0x24", + "b #+0x2c", + "cbz x5, #+0x28", + "ldr w20, [x11]", + "eor w27, w4, w20", + "subs w26, w4, w20", + "mrs x20, nzcv", + "eor w20, w20, #0x20000000", + "msr nzcv, x20", "sub x5, x5, #0x1 (1)", - "add x11, x11, x20", + "sub x11, x11, #0x4 (4)", "b.ne #-0x24" ] }, "repnz scasq": { - "ExpectedInstructionCount": 12, + "ExpectedInstructionCount": 25, "Comment": "0xaf", "ExpectedArm64ASM": [ "ldrsb x20, [x28, #714]", - "lsl x20, x20, #3", + "lsr x20, x20, #63", + "cbz x20, #+0x8", + "b #+0x30", "cbz x5, #+0x28", - "ldr x21, [x11]", - "eor w27, w4, w21", - "subs x26, x4, x21", - "mrs x21, nzcv", - "eor w21, w21, #0x20000000", - "msr nzcv, x21", + "ldr x20, [x11]", + "eor w27, w4, w20", + "subs x26, x4, x20", + "mrs x20, nzcv", + "eor w20, w20, #0x20000000", + "msr nzcv, x20", + "sub x5, x5, #0x1 (1)", + "add x11, x11, #0x8 (8)", + "b.ne #-0x24", + "b #+0x2c", + "cbz x5, #+0x28", + "ldr x20, [x11]", + "eor w27, w4, w20", + "subs x26, x4, x20", + "mrs x20, nzcv", + "eor w20, w20, #0x20000000", + "msr nzcv, x20", "sub x5, x5, #0x1 (1)", - "add x11, x11, x20", 
+ "sub x11, x11, #0x8 (8)", "b.ne #-0x24" ] },