From 94fecb9dad89de881efced965fefb858e63283aa Mon Sep 17 00:00:00 2001 From: Billy Laws Date: Wed, 20 Mar 2024 20:54:09 +0000 Subject: [PATCH 1/3] FEXCore: Remove needless alignment checks for the mem{cpy,set} fastpath --- FEXCore/Source/Interface/Core/JIT/Arm64/MemoryOps.cpp | 8 -------- 1 file changed, 8 deletions(-) diff --git a/FEXCore/Source/Interface/Core/JIT/Arm64/MemoryOps.cpp b/FEXCore/Source/Interface/Core/JIT/Arm64/MemoryOps.cpp index 207646b044..8a9ab72223 100644 --- a/FEXCore/Source/Interface/Core/JIT/Arm64/MemoryOps.cpp +++ b/FEXCore/Source/Interface/Core/JIT/Arm64/MemoryOps.cpp @@ -1814,10 +1814,6 @@ DEF_OP(MemSet) { ARMEmitter::ForwardLabel AgainInternal128Exit{}; ARMEmitter::BackwardLabel AgainInternal128{}; - // Fallback to byte by byte loop if not 4 byte aligned - and_(ARMEmitter::Size::i64Bit, TMP4, TMP2, 0x3); - cbnz(ARMEmitter::Size::i64Bit, TMP4, &AgainInternal); - if (Direction == -1) { sub(ARMEmitter::Size::i64Bit, TMP2, TMP2, 32 - Size); } @@ -2132,10 +2128,6 @@ DEF_OP(MemCpy) { ARMEmitter::BackwardLabel AgainInternal128{}; ARMEmitter::BackwardLabel AgainInternal256{}; - // Fallback to byte by byte loop if either of start/end are not 4 byte aligned - orr(ARMEmitter::Size::i64Bit, TMP4, TMP2, TMP3); - and_(ARMEmitter::Size::i64Bit, TMP4, TMP4, 0x3); - cbnz(ARMEmitter::Size::i64Bit, TMP4, &AgainInternal); if (Direction == -1) { sub(ARMEmitter::Size::i64Bit, TMP2, TMP2, 32 - Size); From d490cb1b7961ee7c6986ba32140e51c232217e61 Mon Sep 17 00:00:00 2001 From: Billy Laws Date: Wed, 20 Mar 2024 20:54:19 +0000 Subject: [PATCH 2/3] FEXCore: Fallback to the memcpy slow path for overlaps within 32 bytes Take e.g. a forward rep movsb copy from addr 0 to 1, the expected behaviour since this is a bytewise copy is: before: aaabbbb... after: aaaaaaa... but by copying in 32-byte chunks we end up with: after: aaaabbbb... due to the self overwrites not occurring within a single 32-byte copy. 
--- FEXCore/Source/Interface/Core/JIT/Arm64/MemoryOps.cpp | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/FEXCore/Source/Interface/Core/JIT/Arm64/MemoryOps.cpp b/FEXCore/Source/Interface/Core/JIT/Arm64/MemoryOps.cpp index 8a9ab72223..1453d931ad 100644 --- a/FEXCore/Source/Interface/Core/JIT/Arm64/MemoryOps.cpp +++ b/FEXCore/Source/Interface/Core/JIT/Arm64/MemoryOps.cpp @@ -2123,11 +2123,18 @@ DEF_OP(MemCpy) { cbz(ARMEmitter::Size::i64Bit, TMP1, &DoneInternal); if (!Op->IsAtomic) { + ARMEmitter::ForwardLabel AbsPos{}; ARMEmitter::ForwardLabel AgainInternal256Exit{}; ARMEmitter::ForwardLabel AgainInternal128Exit{}; ARMEmitter::BackwardLabel AgainInternal128{}; ARMEmitter::BackwardLabel AgainInternal256{}; + sub(ARMEmitter::Size::i64Bit, TMP4, TMP2, TMP3); + tbz(TMP4, 63, &AbsPos); + neg(ARMEmitter::Size::i64Bit, TMP4, TMP4); + Bind(&AbsPos); + sub(ARMEmitter::Size::i64Bit, TMP4, TMP4, 32); + tbnz(TMP4, 63, &AgainInternal); if (Direction == -1) { sub(ARMEmitter::Size::i64Bit, TMP2, TMP2, 32 - Size); From 12fb26f9c0f74ebe43b485a2aed52c12a262fcdf Mon Sep 17 00:00:00 2001 From: Billy Laws Date: Wed, 20 Mar 2024 21:08:59 +0000 Subject: [PATCH 3/3] Update InstCountCI --- .../InstructionCountCI/FEXOpt/MultiInst.json | 144 ++++++++-------- unittests/InstructionCountCI/Primary.json | 160 +++++++++--------- 2 files changed, 152 insertions(+), 152 deletions(-) diff --git a/unittests/InstructionCountCI/FEXOpt/MultiInst.json b/unittests/InstructionCountCI/FEXOpt/MultiInst.json index bcf564da92..0d57852521 100644 --- a/unittests/InstructionCountCI/FEXOpt/MultiInst.json +++ b/unittests/InstructionCountCI/FEXOpt/MultiInst.json @@ -216,7 +216,7 @@ ] }, "positive rep movsb": { - "ExpectedInstructionCount": 42, + "ExpectedInstructionCount": 44, "Comment": [ "When direction flag is a compile time constant we can optimize", "loads and stores can turn in to post-increment when known" @@ -232,10 +232,12 @@ "mov x0, x5", "mov x1, x11", "mov x2, x10", - "cbz x0, #+0x70", - "orr x3, 
x1, x2", - "and x3, x3, #0x3", - "cbnz x3, #+0x54", + "cbz x0, #+0x78", + "sub x3, x1, x2", + "tbz x3, #63, #+0x8", + "neg x3, x3", + "sub x3, x3, #0x20 (32)", + "tbnz x3, #63, #+0x54", "sub x0, x0, #0x20 (32)", "tbnz x0, #63, #+0x44", "sub x0, x0, #0x20 (32)", @@ -271,7 +273,7 @@ ] }, "positive rep movsw": { - "ExpectedInstructionCount": 42, + "ExpectedInstructionCount": 44, "Comment": [ "When direction flag is a compile time constant we can optimize", "loads and stores can turn in to post-increment when known" @@ -287,10 +289,12 @@ "mov x0, x5", "mov x1, x11", "mov x2, x10", - "cbz x0, #+0x70", - "orr x3, x1, x2", - "and x3, x3, #0x3", - "cbnz x3, #+0x54", + "cbz x0, #+0x78", + "sub x3, x1, x2", + "tbz x3, #63, #+0x8", + "neg x3, x3", + "sub x3, x3, #0x20 (32)", + "tbnz x3, #63, #+0x54", "sub x0, x0, #0x10 (16)", "tbnz x0, #63, #+0x44", "sub x0, x0, #0x10 (16)", @@ -326,7 +330,7 @@ ] }, "positive rep movsd": { - "ExpectedInstructionCount": 42, + "ExpectedInstructionCount": 44, "Comment": [ "When direction flag is a compile time constant we can optimize", "loads and stores can turn in to post-increment when known" @@ -342,10 +346,12 @@ "mov x0, x5", "mov x1, x11", "mov x2, x10", - "cbz x0, #+0x70", - "orr x3, x1, x2", - "and x3, x3, #0x3", - "cbnz x3, #+0x54", + "cbz x0, #+0x78", + "sub x3, x1, x2", + "tbz x3, #63, #+0x8", + "neg x3, x3", + "sub x3, x3, #0x20 (32)", + "tbnz x3, #63, #+0x54", "sub x0, x0, #0x8 (8)", "tbnz x0, #63, #+0x44", "sub x0, x0, #0x8 (8)", @@ -381,7 +387,7 @@ ] }, "positive rep movsq": { - "ExpectedInstructionCount": 42, + "ExpectedInstructionCount": 44, "Comment": [ "When direction flag is a compile time constant we can optimize", "loads and stores can turn in to post-increment when known" @@ -397,10 +403,12 @@ "mov x0, x5", "mov x1, x11", "mov x2, x10", - "cbz x0, #+0x70", - "orr x3, x1, x2", - "and x3, x3, #0x3", - "cbnz x3, #+0x54", + "cbz x0, #+0x78", + "sub x3, x1, x2", + "tbz x3, #63, #+0x8", + "neg x3, x3", + "sub x3, x3, #0x20 
(32)", + "tbnz x3, #63, #+0x54", "sub x0, x0, #0x4 (4)", "tbnz x0, #63, #+0x44", "sub x0, x0, #0x4 (4)", @@ -436,7 +444,7 @@ ] }, "negative rep movsb": { - "ExpectedInstructionCount": 45, + "ExpectedInstructionCount": 47, "Comment": [ "When direction flag is a compile time constant we can optimize", "loads and stores can turn in to post-increment when known" @@ -451,10 +459,12 @@ "mov x0, x5", "mov x1, x11", "mov x2, x10", - "cbz x0, #+0x80", - "orr x3, x1, x2", - "and x3, x3, #0x3", - "cbnz x3, #+0x64", + "cbz x0, #+0x88", + "sub x3, x1, x2", + "tbz x3, #63, #+0x8", + "neg x3, x3", + "sub x3, x3, #0x20 (32)", + "tbnz x3, #63, #+0x64", "sub x1, x1, #0x1f (31)", "sub x2, x2, #0x1f (31)", "sub x0, x0, #0x20 (32)", @@ -494,7 +504,7 @@ ] }, "negative rep movsw": { - "ExpectedInstructionCount": 45, + "ExpectedInstructionCount": 47, "Comment": [ "When direction flag is a compile time constant we can optimize", "loads and stores can turn in to post-increment when known" @@ -509,10 +519,12 @@ "mov x0, x5", "mov x1, x11", "mov x2, x10", - "cbz x0, #+0x80", - "orr x3, x1, x2", - "and x3, x3, #0x3", - "cbnz x3, #+0x64", + "cbz x0, #+0x88", + "sub x3, x1, x2", + "tbz x3, #63, #+0x8", + "neg x3, x3", + "sub x3, x3, #0x20 (32)", + "tbnz x3, #63, #+0x64", "sub x1, x1, #0x1e (30)", "sub x2, x2, #0x1e (30)", "sub x0, x0, #0x10 (16)", @@ -552,7 +564,7 @@ ] }, "negative rep movsd": { - "ExpectedInstructionCount": 45, + "ExpectedInstructionCount": 47, "Comment": [ "When direction flag is a compile time constant we can optimize", "loads and stores can turn in to post-increment when known" @@ -567,10 +579,12 @@ "mov x0, x5", "mov x1, x11", "mov x2, x10", - "cbz x0, #+0x80", - "orr x3, x1, x2", - "and x3, x3, #0x3", - "cbnz x3, #+0x64", + "cbz x0, #+0x88", + "sub x3, x1, x2", + "tbz x3, #63, #+0x8", + "neg x3, x3", + "sub x3, x3, #0x20 (32)", + "tbnz x3, #63, #+0x64", "sub x1, x1, #0x1c (28)", "sub x2, x2, #0x1c (28)", "sub x0, x0, #0x8 (8)", @@ -610,7 +624,7 @@ ] }, "negative rep 
movsq": { - "ExpectedInstructionCount": 45, + "ExpectedInstructionCount": 47, "Comment": [ "When direction flag is a compile time constant we can optimize", "loads and stores can turn in to post-increment when known" @@ -625,10 +639,12 @@ "mov x0, x5", "mov x1, x11", "mov x2, x10", - "cbz x0, #+0x80", - "orr x3, x1, x2", - "and x3, x3, #0x3", - "cbnz x3, #+0x64", + "cbz x0, #+0x88", + "sub x3, x1, x2", + "tbz x3, #63, #+0x8", + "neg x3, x3", + "sub x3, x3, #0x20 (32)", + "tbnz x3, #63, #+0x64", "sub x1, x1, #0x18 (24)", "sub x2, x2, #0x18 (24)", "sub x0, x0, #0x4 (4)", @@ -668,7 +684,7 @@ ] }, "positive rep stosb": { - "ExpectedInstructionCount": 32, + "ExpectedInstructionCount": 30, "Comment": [ "When direction flag is a compile time constant we can optimize", "loads and stores can turn in to post-increment when known" @@ -684,9 +700,7 @@ "uxtb w21, w4", "mov x0, x5", "mov x1, x11", - "cbz x0, #+0x60", - "and x3, x1, #0x3", - "cbnz x3, #+0x4c", + "cbz x0, #+0x58", "sub x0, x0, #0x20 (32)", "tbnz x0, #63, #+0x3c", "dup v1.16b, w21", @@ -713,7 +727,7 @@ ] }, "positive rep stosw": { - "ExpectedInstructionCount": 32, + "ExpectedInstructionCount": 30, "Comment": [ "When direction flag is a compile time constant we can optimize", "loads and stores can turn in to post-increment when known" @@ -729,9 +743,7 @@ "uxth w21, w4", "mov x0, x5", "mov x1, x11", - "cbz x0, #+0x60", - "and x3, x1, #0x3", - "cbnz x3, #+0x4c", + "cbz x0, #+0x58", "sub x0, x0, #0x10 (16)", "tbnz x0, #63, #+0x3c", "dup v1.8h, w21", @@ -758,7 +770,7 @@ ] }, "positive rep stosd": { - "ExpectedInstructionCount": 32, + "ExpectedInstructionCount": 30, "Comment": [ "When direction flag is a compile time constant we can optimize", "loads and stores can turn in to post-increment when known" @@ -774,9 +786,7 @@ "mov w21, w4", "mov x0, x5", "mov x1, x11", - "cbz x0, #+0x60", - "and x3, x1, #0x3", - "cbnz x3, #+0x4c", + "cbz x0, #+0x58", "sub x0, x0, #0x8 (8)", "tbnz x0, #63, #+0x3c", "dup v1.4s, w21", @@ -803,7 
+813,7 @@ ] }, "positive rep stosq": { - "ExpectedInstructionCount": 31, + "ExpectedInstructionCount": 29, "Comment": [ "When direction flag is a compile time constant we can optimize", "loads and stores can turn in to post-increment when known" @@ -818,9 +828,7 @@ "strb w21, [x28, #714]", "mov x0, x5", "mov x1, x11", - "cbz x0, #+0x60", - "and x3, x1, #0x3", - "cbnz x3, #+0x4c", + "cbz x0, #+0x58", "sub x0, x0, #0x4 (4)", "tbnz x0, #63, #+0x3c", "dup v1.2d, x4", @@ -847,7 +855,7 @@ ] }, "negative rep stosb": { - "ExpectedInstructionCount": 33, + "ExpectedInstructionCount": 31, "Comment": [ "When direction flag is a compile time constant we can optimize", "loads and stores can turn in to post-increment when known" @@ -862,9 +870,7 @@ "uxtb w20, w4", "mov x0, x5", "mov x1, x11", - "cbz x0, #+0x68", - "and x3, x1, #0x3", - "cbnz x3, #+0x54", + "cbz x0, #+0x60", "sub x1, x1, #0x1f (31)", "sub x0, x0, #0x20 (32)", "tbnz x0, #63, #+0x3c", @@ -893,7 +899,7 @@ ] }, "negative rep stosw": { - "ExpectedInstructionCount": 33, + "ExpectedInstructionCount": 31, "Comment": [ "When direction flag is a compile time constant we can optimize", "loads and stores can turn in to post-increment when known" @@ -908,9 +914,7 @@ "uxth w20, w4", "mov x0, x5", "mov x1, x11", - "cbz x0, #+0x68", - "and x3, x1, #0x3", - "cbnz x3, #+0x54", + "cbz x0, #+0x60", "sub x1, x1, #0x1e (30)", "sub x0, x0, #0x10 (16)", "tbnz x0, #63, #+0x3c", @@ -939,7 +943,7 @@ ] }, "negative rep stosd": { - "ExpectedInstructionCount": 33, + "ExpectedInstructionCount": 31, "Comment": [ "When direction flag is a compile time constant we can optimize", "loads and stores can turn in to post-increment when known" @@ -954,9 +958,7 @@ "mov w20, w4", "mov x0, x5", "mov x1, x11", - "cbz x0, #+0x68", - "and x3, x1, #0x3", - "cbnz x3, #+0x54", + "cbz x0, #+0x60", "sub x1, x1, #0x1c (28)", "sub x0, x0, #0x8 (8)", "tbnz x0, #63, #+0x3c", @@ -985,7 +987,7 @@ ] }, "negative rep stosq": { - "ExpectedInstructionCount": 32, + 
"ExpectedInstructionCount": 30, "Comment": [ "When direction flag is a compile time constant we can optimize", "loads and stores can turn in to post-increment when known" @@ -999,9 +1001,7 @@ "strb w20, [x28, #714]", "mov x0, x5", "mov x1, x11", - "cbz x0, #+0x68", - "and x3, x1, #0x3", - "cbnz x3, #+0x54", + "cbz x0, #+0x60", "sub x1, x1, #0x18 (24)", "sub x0, x0, #0x4 (4)", "tbnz x0, #63, #+0x3c", diff --git a/unittests/InstructionCountCI/Primary.json b/unittests/InstructionCountCI/Primary.json index 8cd7ecdefc..5ee00256f5 100644 --- a/unittests/InstructionCountCI/Primary.json +++ b/unittests/InstructionCountCI/Primary.json @@ -2860,18 +2860,20 @@ ] }, "rep movsb": { - "ExpectedInstructionCount": 79, + "ExpectedInstructionCount": 83, "Comment": "0xa4", "ExpectedArm64ASM": [ "ldrsb x20, [x28, #714]", "mov x0, x5", "mov x1, x11", "mov x2, x10", - "tbnz w20, #1, #+0x8c", - "cbz x0, #+0x70", - "orr x3, x1, x2", - "and x3, x3, #0x3", - "cbnz x3, #+0x54", + "tbnz w20, #1, #+0x94", + "cbz x0, #+0x78", + "sub x3, x1, x2", + "tbz x3, #63, #+0x8", + "neg x3, x3", + "sub x3, x3, #0x20 (32)", + "tbnz x3, #63, #+0x54", "sub x0, x0, #0x20 (32)", "tbnz x0, #63, #+0x44", "sub x0, x0, #0x20 (32)", @@ -2901,11 +2903,13 @@ "mov x2, x5", "add x20, x0, x2", "add x21, x1, x2", - "b #+0x98", - "cbz x0, #+0x80", - "orr x3, x1, x2", - "and x3, x3, #0x3", - "cbnz x3, #+0x64", + "b #+0xa0", + "cbz x0, #+0x88", + "sub x3, x1, x2", + "tbz x3, #63, #+0x8", + "neg x3, x3", + "sub x3, x3, #0x20 (32)", + "tbnz x3, #63, #+0x64", "sub x1, x1, #0x1f (31)", "sub x2, x2, #0x1f (31)", "sub x0, x0, #0x20 (32)", @@ -2945,18 +2949,20 @@ ] }, "rep movsw": { - "ExpectedInstructionCount": 79, + "ExpectedInstructionCount": 83, "Comment": "0xa5", "ExpectedArm64ASM": [ "ldrsb x20, [x28, #714]", "mov x0, x5", "mov x1, x11", "mov x2, x10", - "tbnz w20, #1, #+0x8c", - "cbz x0, #+0x70", - "orr x3, x1, x2", - "and x3, x3, #0x3", - "cbnz x3, #+0x54", + "tbnz w20, #1, #+0x94", + "cbz x0, #+0x78", + "sub x3, x1, x2", 
+ "tbz x3, #63, #+0x8", + "neg x3, x3", + "sub x3, x3, #0x20 (32)", + "tbnz x3, #63, #+0x54", "sub x0, x0, #0x10 (16)", "tbnz x0, #63, #+0x44", "sub x0, x0, #0x10 (16)", @@ -2986,11 +2992,13 @@ "mov x2, x5", "add x20, x0, x2, lsl #1", "add x21, x1, x2, lsl #1", - "b #+0x98", - "cbz x0, #+0x80", - "orr x3, x1, x2", - "and x3, x3, #0x3", - "cbnz x3, #+0x64", + "b #+0xa0", + "cbz x0, #+0x88", + "sub x3, x1, x2", + "tbz x3, #63, #+0x8", + "neg x3, x3", + "sub x3, x3, #0x20 (32)", + "tbnz x3, #63, #+0x64", "sub x1, x1, #0x1e (30)", "sub x2, x2, #0x1e (30)", "sub x0, x0, #0x10 (16)", @@ -3030,18 +3038,20 @@ ] }, "rep movsd": { - "ExpectedInstructionCount": 79, + "ExpectedInstructionCount": 83, "Comment": "0xa5", "ExpectedArm64ASM": [ "ldrsb x20, [x28, #714]", "mov x0, x5", "mov x1, x11", "mov x2, x10", - "tbnz w20, #1, #+0x8c", - "cbz x0, #+0x70", - "orr x3, x1, x2", - "and x3, x3, #0x3", - "cbnz x3, #+0x54", + "tbnz w20, #1, #+0x94", + "cbz x0, #+0x78", + "sub x3, x1, x2", + "tbz x3, #63, #+0x8", + "neg x3, x3", + "sub x3, x3, #0x20 (32)", + "tbnz x3, #63, #+0x54", "sub x0, x0, #0x8 (8)", "tbnz x0, #63, #+0x44", "sub x0, x0, #0x8 (8)", @@ -3071,11 +3081,13 @@ "mov x2, x5", "add x20, x0, x2, lsl #2", "add x21, x1, x2, lsl #2", - "b #+0x98", - "cbz x0, #+0x80", - "orr x3, x1, x2", - "and x3, x3, #0x3", - "cbnz x3, #+0x64", + "b #+0xa0", + "cbz x0, #+0x88", + "sub x3, x1, x2", + "tbz x3, #63, #+0x8", + "neg x3, x3", + "sub x3, x3, #0x20 (32)", + "tbnz x3, #63, #+0x64", "sub x1, x1, #0x1c (28)", "sub x2, x2, #0x1c (28)", "sub x0, x0, #0x8 (8)", @@ -3115,18 +3127,20 @@ ] }, "rep movsq": { - "ExpectedInstructionCount": 79, + "ExpectedInstructionCount": 83, "Comment": "0xa5", "ExpectedArm64ASM": [ "ldrsb x20, [x28, #714]", "mov x0, x5", "mov x1, x11", "mov x2, x10", - "tbnz w20, #1, #+0x8c", - "cbz x0, #+0x70", - "orr x3, x1, x2", - "and x3, x3, #0x3", - "cbnz x3, #+0x54", + "tbnz w20, #1, #+0x94", + "cbz x0, #+0x78", + "sub x3, x1, x2", + "tbz x3, #63, #+0x8", + "neg x3, x3", 
+ "sub x3, x3, #0x20 (32)", + "tbnz x3, #63, #+0x54", "sub x0, x0, #0x4 (4)", "tbnz x0, #63, #+0x44", "sub x0, x0, #0x4 (4)", @@ -3156,11 +3170,13 @@ "mov x2, x5", "add x20, x0, x2, lsl #3", "add x21, x1, x2, lsl #3", - "b #+0x98", - "cbz x0, #+0x80", - "orr x3, x1, x2", - "and x3, x3, #0x3", - "cbnz x3, #+0x64", + "b #+0xa0", + "cbz x0, #+0x88", + "sub x3, x1, x2", + "tbz x3, #63, #+0x8", + "neg x3, x3", + "sub x3, x3, #0x20 (32)", + "tbnz x3, #63, #+0x64", "sub x1, x1, #0x18 (24)", "sub x2, x2, #0x18 (24)", "sub x0, x0, #0x4 (4)", @@ -3541,17 +3557,15 @@ ] }, "rep stosb": { - "ExpectedInstructionCount": 59, + "ExpectedInstructionCount": 55, "Comment": "0xaa", "ExpectedArm64ASM": [ "uxtb w20, w4", "ldrsb x21, [x28, #714]", "mov x0, x5", "mov x1, x11", - "tbnz w21, #1, #+0x6c", - "cbz x0, #+0x60", - "and x3, x1, #0x3", - "cbnz x3, #+0x4c", + "tbnz w21, #1, #+0x64", + "cbz x0, #+0x58", "sub x0, x0, #0x20 (32)", "tbnz x0, #63, #+0x3c", "dup v1.16b, w20", @@ -3574,10 +3588,8 @@ "sub x0, x0, #0x1 (1)", "cbnz x0, #-0x8", "add x11, x11, x5", - "b #+0x70", - "cbz x0, #+0x68", - "and x3, x1, #0x3", - "cbnz x3, #+0x54", + "b #+0x68", + "cbz x0, #+0x60", "sub x1, x1, #0x1f (31)", "sub x0, x0, #0x20 (32)", "tbnz x0, #63, #+0x3c", @@ -3606,17 +3618,15 @@ ] }, "rep stosw": { - "ExpectedInstructionCount": 59, + "ExpectedInstructionCount": 55, "Comment": "0xab", "ExpectedArm64ASM": [ "uxth w20, w4", "ldrsb x21, [x28, #714]", "mov x0, x5", "mov x1, x11", - "tbnz w21, #1, #+0x6c", - "cbz x0, #+0x60", - "and x3, x1, #0x3", - "cbnz x3, #+0x4c", + "tbnz w21, #1, #+0x64", + "cbz x0, #+0x58", "sub x0, x0, #0x10 (16)", "tbnz x0, #63, #+0x3c", "dup v1.8h, w20", @@ -3639,10 +3649,8 @@ "sub x0, x0, #0x1 (1)", "cbnz x0, #-0x8", "add x11, x11, x5, lsl #1", - "b #+0x70", - "cbz x0, #+0x68", - "and x3, x1, #0x3", - "cbnz x3, #+0x54", + "b #+0x68", + "cbz x0, #+0x60", "sub x1, x1, #0x1e (30)", "sub x0, x0, #0x10 (16)", "tbnz x0, #63, #+0x3c", @@ -3671,17 +3679,15 @@ ] }, "rep stosd": { - 
"ExpectedInstructionCount": 59, + "ExpectedInstructionCount": 55, "Comment": "0xab", "ExpectedArm64ASM": [ "mov w20, w4", "ldrsb x21, [x28, #714]", "mov x0, x5", "mov x1, x11", - "tbnz w21, #1, #+0x6c", - "cbz x0, #+0x60", - "and x3, x1, #0x3", - "cbnz x3, #+0x4c", + "tbnz w21, #1, #+0x64", + "cbz x0, #+0x58", "sub x0, x0, #0x8 (8)", "tbnz x0, #63, #+0x3c", "dup v1.4s, w20", @@ -3704,10 +3710,8 @@ "sub x0, x0, #0x1 (1)", "cbnz x0, #-0x8", "add x11, x11, x5, lsl #2", - "b #+0x70", - "cbz x0, #+0x68", - "and x3, x1, #0x3", - "cbnz x3, #+0x54", + "b #+0x68", + "cbz x0, #+0x60", "sub x1, x1, #0x1c (28)", "sub x0, x0, #0x8 (8)", "tbnz x0, #63, #+0x3c", @@ -3736,7 +3740,7 @@ ] }, "rep stosq": { - "ExpectedInstructionCount": 58, + "ExpectedInstructionCount": 54, "Comment": [ "Unrolling the loop for faster memset can be done.", "Taking advantage of ARM MOPs instructions can be done", @@ -3746,10 +3750,8 @@ "ldrsb x20, [x28, #714]", "mov x0, x5", "mov x1, x11", - "tbnz w20, #1, #+0x6c", - "cbz x0, #+0x60", - "and x3, x1, #0x3", - "cbnz x3, #+0x4c", + "tbnz w20, #1, #+0x64", + "cbz x0, #+0x58", "sub x0, x0, #0x4 (4)", "tbnz x0, #63, #+0x3c", "dup v1.2d, x4", @@ -3772,10 +3774,8 @@ "sub x0, x0, #0x1 (1)", "cbnz x0, #-0x8", "add x11, x11, x5, lsl #3", - "b #+0x70", - "cbz x0, #+0x68", - "and x3, x1, #0x3", - "cbnz x3, #+0x54", + "b #+0x68", + "cbz x0, #+0x60", "sub x1, x1, #0x18 (24)", "sub x0, x0, #0x4 (4)", "tbnz x0, #63, #+0x3c",