diff --git a/unittests/InstructionCountCI/FEXOpt/MultiInst.json b/unittests/InstructionCountCI/FEXOpt/MultiInst.json index d9da698cd6..2cac041893 100644 --- a/unittests/InstructionCountCI/FEXOpt/MultiInst.json +++ b/unittests/InstructionCountCI/FEXOpt/MultiInst.json @@ -210,7 +210,7 @@ ] }, "positive rep movsb": { - "ExpectedInstructionCount": 18, + "ExpectedInstructionCount": 37, "Comment": [ "When direction flag is a compile time constant we can optimize", "loads and stores can turn in to post-increment when known" @@ -225,6 +225,25 @@ "mov x0, x5", "mov x1, x11", "mov x2, x10", + "cbz x0, #+0x60", + "orr x3, x1, x2", + "and x3, x3, #0x3", + "cbnz x3, #+0x44", + "sub x0, x0, #0x40 (64)", + "tbnz x0, #63, #+0x18", + "ldp q0, q1, [x2], #32", + "stp q0, q1, [x1], #32", + "ldp q0, q1, [x2], #32", + "stp q0, q1, [x1], #32", + "b #-0x18", + "add x0, x0, #0x40 (64)", + "cbz x0, #+0x30", + "sub x0, x0, #0x20 (32)", + "tbnz x0, #63, #+0x10", + "ldp q0, q1, [x2], #32", + "stp q0, q1, [x1], #32", + "b #-0x10", + "add x0, x0, #0x20 (32)", "cbz x0, #+0x14", "ldrb w3, [x2], #1", "strb w3, [x1], #1", @@ -241,7 +260,7 @@ ] }, "positive rep movsw": { - "ExpectedInstructionCount": 18, + "ExpectedInstructionCount": 37, "Comment": [ "When direction flag is a compile time constant we can optimize", "loads and stores can turn in to post-increment when known" @@ -256,6 +275,25 @@ "mov x0, x5", "mov x1, x11", "mov x2, x10", + "cbz x0, #+0x60", + "orr x3, x1, x2", + "and x3, x3, #0x3", + "cbnz x3, #+0x44", + "sub x0, x0, #0x20 (32)", + "tbnz x0, #63, #+0x18", + "ldp q0, q1, [x2], #32", + "stp q0, q1, [x1], #32", + "ldp q0, q1, [x2], #32", + "stp q0, q1, [x1], #32", + "b #-0x18", + "add x0, x0, #0x20 (32)", + "cbz x0, #+0x30", + "sub x0, x0, #0x10 (16)", + "tbnz x0, #63, #+0x10", + "ldp q0, q1, [x2], #32", + "stp q0, q1, [x1], #32", + "b #-0x10", + "add x0, x0, #0x10 (16)", "cbz x0, #+0x14", "ldrh w3, [x2], #2", "strh w3, [x1], #2", @@ -272,7 +310,7 @@ ] }, "positive rep movsd": { - "ExpectedInstructionCount": 18, + "ExpectedInstructionCount": 37, "Comment": [ "When direction flag is a compile time constant we can optimize", "loads and stores can turn in to post-increment when known" @@ -287,6 +325,25 @@ "mov x0, x5", "mov x1, x11", "mov x2, x10", + "cbz x0, #+0x60", + "orr x3, x1, x2", + "and x3, x3, #0x3", + "cbnz x3, #+0x44", + "sub x0, x0, #0x10 (16)", + "tbnz x0, #63, #+0x18", + "ldp q0, q1, [x2], #32", + "stp q0, q1, [x1], #32", + "ldp q0, q1, [x2], #32", + "stp q0, q1, [x1], #32", + "b #-0x18", + "add x0, x0, #0x10 (16)", + "cbz x0, #+0x30", + "sub x0, x0, #0x8 (8)", + "tbnz x0, #63, #+0x10", + "ldp q0, q1, [x2], #32", + "stp q0, q1, [x1], #32", + "b #-0x10", + "add x0, x0, #0x8 (8)", "cbz x0, #+0x14", "ldr w3, [x2], #4", "str w3, [x1], #4", @@ -303,7 +360,7 @@ ] }, "positive rep movsq": { - "ExpectedInstructionCount": 18, + "ExpectedInstructionCount": 37, "Comment": [ "When direction flag is a compile time constant we can optimize", "loads and stores can turn in to post-increment when known" @@ -318,6 +375,25 @@ "mov x0, x5", "mov x1, x11", "mov x2, x10", + "cbz x0, #+0x60", + "orr x3, x1, x2", + "and x3, x3, #0x3", + "cbnz x3, #+0x44", + "sub x0, x0, #0x8 (8)", + "tbnz x0, #63, #+0x18", + "ldp q0, q1, [x2], #32", + "stp q0, q1, [x1], #32", + "ldp q0, q1, [x2], #32", + "stp q0, q1, [x1], #32", + "b #-0x18", + "add x0, x0, #0x8 (8)", + "cbz x0, #+0x30", + "sub x0, x0, #0x4 (4)", + "tbnz x0, #63, #+0x10", + "ldp q0, q1, [x2], #32", + "stp q0, q1, [x1], #32", + "b #-0x10", + "add x0, x0, #0x4 (4)", "cbz x0, #+0x14", "ldr x3, [x2], #8", "str x3, [x1], #8", @@ -334,7 +410,7 @@ ] }, "negative rep movsb": { - "ExpectedInstructionCount": 18, + "ExpectedInstructionCount": 37, "Comment": [ "When direction flag is a compile time constant we can optimize", "loads and stores can turn in to post-increment when known" @@ -349,6 +425,25 @@ "mov x0, x5", "mov x1, x11", "mov x2, x10", + "cbz x0, #+0x60", + "orr x3, x1, x2", + "and x3, x3, #0x3", + "cbnz x3, #+0x44", + "sub x0, x0, #0x40 (64)", + "tbnz x0, #63, #+0x18", + "ldp q0, q1, [x2], #-32", + "stp q0, q1, [x1], #-32", + "ldp q0, q1, [x2], #-32", + "stp q0, q1, [x1], #-32", + "b #-0x18", + "add x0, x0, #0x40 (64)", + "cbz x0, #+0x30", + "sub x0, x0, #0x20 (32)", + "tbnz x0, #63, #+0x10", + "ldp q0, q1, [x2], #-32", + "stp q0, q1, [x1], #-32", + "b #-0x10", + "add x0, x0, #0x20 (32)", "cbz x0, #+0x14", "ldrb w3, [x2], #-1", "strb w3, [x1], #-1", @@ -365,7 +460,7 @@ ] }, "negative rep movsw": { - "ExpectedInstructionCount": 18, + "ExpectedInstructionCount": 37, "Comment": [ "When direction flag is a compile time constant we can optimize", "loads and stores can turn in to post-increment when known" @@ -380,6 +475,25 @@ "mov x0, x5", "mov x1, x11", "mov x2, x10", + "cbz x0, #+0x60", + "orr x3, x1, x2", + "and x3, x3, #0x3", + "cbnz x3, #+0x44", + "sub x0, x0, #0x20 (32)", + "tbnz x0, #63, #+0x18", + "ldp q0, q1, [x2], #-32", + "stp q0, q1, [x1], #-32", + "ldp q0, q1, [x2], #-32", + "stp q0, q1, [x1], #-32", + "b #-0x18", + "add x0, x0, #0x20 (32)", + "cbz x0, #+0x30", + "sub x0, x0, #0x10 (16)", + "tbnz x0, #63, #+0x10", + "ldp q0, q1, [x2], #-32", + "stp q0, q1, [x1], #-32", + "b #-0x10", + "add x0, x0, #0x10 (16)", "cbz x0, #+0x14", "ldrh w3, [x2], #-2", "strh w3, [x1], #-2", @@ -396,7 +510,7 @@ ] }, "negative rep movsd": { - "ExpectedInstructionCount": 18, + "ExpectedInstructionCount": 37, "Comment": [ "When direction flag is a compile time constant we can optimize", "loads and stores can turn in to post-increment when known" @@ -411,6 +525,25 @@ "mov x0, x5", "mov x1, x11", "mov x2, x10", + "cbz x0, #+0x60", + "orr x3, x1, x2", + "and x3, x3, #0x3", + "cbnz x3, #+0x44", + "sub x0, x0, #0x10 (16)", + "tbnz x0, #63, #+0x18", + "ldp q0, q1, [x2], #-32", + "stp q0, q1, [x1], #-32", + "ldp q0, q1, [x2], #-32", + "stp q0, q1, [x1], #-32", + "b #-0x18", + "add x0, x0, #0x10 (16)", + "cbz x0, #+0x30", + "sub x0, x0, #0x8 (8)", + "tbnz x0, #63, #+0x10", + "ldp q0, q1, [x2], #-32", + "stp q0, q1, [x1], #-32", + "b #-0x10", + "add x0, x0, #0x8 (8)", "cbz x0, #+0x14", "ldr w3, [x2], #-4", "str w3, [x1], #-4", @@ -427,7 +560,7 @@ ] }, "negative rep movsq": { - "ExpectedInstructionCount": 18, + "ExpectedInstructionCount": 37, "Comment": [ "When direction flag is a compile time constant we can optimize", "loads and stores can turn in to post-increment when known" @@ -442,6 +575,25 @@ "mov x0, x5", "mov x1, x11", "mov x2, x10", + "cbz x0, #+0x60", + "orr x3, x1, x2", + "and x3, x3, #0x3", + "cbnz x3, #+0x44", + "sub x0, x0, #0x8 (8)", + "tbnz x0, #63, #+0x18", + "ldp q0, q1, [x2], #-32", + "stp q0, q1, [x1], #-32", + "ldp q0, q1, [x2], #-32", + "stp q0, q1, [x1], #-32", + "b #-0x18", + "add x0, x0, #0x8 (8)", + "cbz x0, #+0x30", + "sub x0, x0, #0x4 (4)", + "tbnz x0, #63, #+0x10", + "ldp q0, q1, [x2], #-32", + "stp q0, q1, [x1], #-32", + "b #-0x10", + "add x0, x0, #0x4 (4)", "cbz x0, #+0x14", "ldr x3, [x2], #-8", "str x3, [x1], #-8", @@ -458,7 +610,7 @@ ] }, "positive rep stosb": { - "ExpectedInstructionCount": 11, + "ExpectedInstructionCount": 27, "Comment": [ "When direction flag is a compile time constant we can optimize", "loads and stores can turn in to post-increment when known" @@ -473,6 +625,22 @@ "uxtb w21, w4", "mov x0, x5", "mov x1, x11", + "cbz x0, #+0x50", + "and x3, x1, #0x3", + "cbnz x3, #+0x3c", + "dup v1.16b, w21", + "sub x0, x0, #0x40 (64)", + "tbnz x0, #63, #+0x10", + "stp q1, q1, [x1], #32", + "stp q1, q1, [x1], #32", + "b #-0x10", + "add x0, x0, #0x40 (64)", + "cbz x0, #+0x28", + "sub x0, x0, #0x20 (32)", + "tbnz x0, #63, #+0xc", + "stp q1, q1, [x1], #32", + "b #-0xc", + "add x0, x0, #0x20 (32)", "cbz x0, #+0x10", "strb w21, [x1], #1", "sub x0, x0, #0x1 (1)", @@ -482,7 +650,7 @@ ] }, "positive rep stosw": { - "ExpectedInstructionCount": 11, + "ExpectedInstructionCount": 27, "Comment": [ "When direction flag is a compile time constant we can optimize", "loads and stores can turn in to post-increment when known" @@ -497,6 +665,22 @@ "uxth w21, w4", "mov x0, x5", "mov x1, x11", + "cbz x0, #+0x50", + "and x3, x1, #0x3", + "cbnz x3, #+0x3c", + "dup v1.8h, w21", + "sub x0, x0, #0x20 (32)", + "tbnz x0, #63, #+0x10", + "stp q1, q1, [x1], #32", + "stp q1, q1, [x1], #32", + "b #-0x10", + "add x0, x0, #0x20 (32)", + "cbz x0, #+0x28", + "sub x0, x0, #0x10 (16)", + "tbnz x0, #63, #+0xc", + "stp q1, q1, [x1], #32", + "b #-0xc", + "add x0, x0, #0x10 (16)", "cbz x0, #+0x10", "strh w21, [x1], #2", "sub x0, x0, #0x1 (1)", @@ -506,7 +690,7 @@ ] }, "positive rep stosd": { - "ExpectedInstructionCount": 11, + "ExpectedInstructionCount": 27, "Comment": [ "When direction flag is a compile time constant we can optimize", "loads and stores can turn in to post-increment when known" @@ -521,6 +705,22 @@ "mov w21, w4", "mov x0, x5", "mov x1, x11", + "cbz x0, #+0x50", + "and x3, x1, #0x3", + "cbnz x3, #+0x3c", + "dup v1.4s, w21", + "sub x0, x0, #0x10 (16)", + "tbnz x0, #63, #+0x10", + "stp q1, q1, [x1], #32", + "stp q1, q1, [x1], #32", + "b #-0x10", + "add x0, x0, #0x10 (16)", + "cbz x0, #+0x28", + "sub x0, x0, #0x8 (8)", + "tbnz x0, #63, #+0xc", + "stp q1, q1, [x1], #32", + "b #-0xc", + "add x0, x0, #0x8 (8)", "cbz x0, #+0x10", "str w21, [x1], #4", "sub x0, x0, #0x1 (1)", @@ -530,7 +730,7 @@ ] }, "positive rep stosq": { - "ExpectedInstructionCount": 10, + "ExpectedInstructionCount": 26, "Comment": [ "When direction flag is a compile time constant we can optimize", "loads and stores can turn in to post-increment when known" @@ -544,6 +744,22 @@ "strb w20, [x28, #714]", "mov x0, x5", "mov x1, x11", + "cbz x0, #+0x50", + "and x3, x1, #0x3", + "cbnz x3, #+0x3c", + "dup v1.2d, x4", + "sub x0, x0, #0x8 (8)", + "tbnz x0, #63, #+0x10", + "stp q1, q1, [x1], #32", + "stp q1, q1, [x1], #32", + "b #-0x10", + "add x0, x0, #0x8 (8)", + "cbz x0, #+0x28", + "sub x0, x0, #0x4 (4)", + "tbnz x0, #63, #+0xc", + "stp q1, q1, [x1], #32", + "b #-0xc", + "add x0, x0, #0x4 (4)", "cbz x0, #+0x10", "str x4, [x1], #8", "sub x0, x0, #0x1 (1)", @@ -553,7 +769,7 @@ ] }, "negative rep stosb": { - "ExpectedInstructionCount": 11, + "ExpectedInstructionCount": 27, "Comment": [ "When direction flag is a compile time constant we can optimize", "loads and stores can turn in to post-increment when known" @@ -568,6 +784,22 @@ "uxtb w20, w4", "mov x0, x5", "mov x1, x11", + "cbz x0, #+0x50", + "and x3, x1, #0x3", + "cbnz x3, #+0x3c", + "dup v1.16b, w20", + "sub x0, x0, #0x40 (64)", + "tbnz x0, #63, #+0x10", + "stp q1, q1, [x1], #-32", + "stp q1, q1, [x1], #-32", + "b #-0x10", + "add x0, x0, #0x40 (64)", + "cbz x0, #+0x28", + "sub x0, x0, #0x20 (32)", + "tbnz x0, #63, #+0xc", + "stp q1, q1, [x1], #-32", + "b #-0xc", + "add x0, x0, #0x20 (32)", "cbz x0, #+0x10", "strb w20, [x1], #-1", "sub x0, x0, #0x1 (1)", @@ -577,7 +809,7 @@ ] }, "negative rep stosw": { - "ExpectedInstructionCount": 11, + "ExpectedInstructionCount": 27, "Comment": [ "When direction flag is a compile time constant we can optimize", "loads and stores can turn in to post-increment when known" @@ -592,6 +824,22 @@ "uxth w20, w4", "mov x0, x5", "mov x1, x11", + "cbz x0, #+0x50", + "and x3, x1, #0x3", + "cbnz x3, #+0x3c", + "dup v1.8h, w20", + "sub x0, x0, #0x20 (32)", + "tbnz x0, #63, #+0x10", + "stp q1, q1, [x1], #-32", + "stp q1, q1, [x1], #-32", + "b #-0x10", + "add x0, x0, #0x20 (32)", + "cbz x0, #+0x28", + "sub x0, x0, #0x10 (16)", + "tbnz x0, #63, #+0xc", + "stp q1, q1, [x1], #-32", + "b #-0xc", + "add x0, x0, #0x10 (16)", "cbz x0, #+0x10", "strh w20, [x1], #-2", "sub x0, x0, #0x1 (1)", @@ -601,7 +849,7 @@ ] }, "negative rep stosd": { - "ExpectedInstructionCount": 11, + "ExpectedInstructionCount": 27, "Comment": [ "When direction flag is a compile time constant we can optimize", "loads and stores can turn in to post-increment when known" @@ -616,6 +864,22 @@ "mov w20, w4", "mov x0, x5", "mov x1, x11", + "cbz x0, #+0x50", + "and x3, x1, #0x3", + "cbnz x3, #+0x3c", + "dup v1.4s, w20", + "sub x0, x0, #0x10 (16)", + "tbnz x0, #63, #+0x10", + "stp q1, q1, [x1], #-32", + "stp q1, q1, [x1], #-32", + "b #-0x10", + "add x0, x0, #0x10 (16)", + "cbz x0, #+0x28", + "sub x0, x0, #0x8 (8)", + "tbnz x0, #63, #+0xc", + "stp q1, q1, [x1], #-32", + "b #-0xc", + "add x0, x0, #0x8 (8)", "cbz x0, #+0x10", "str w20, [x1], #-4", "sub x0, x0, #0x1 (1)", @@ -625,7 +889,7 @@ ] }, "negative rep stosq": { - "ExpectedInstructionCount": 10, + "ExpectedInstructionCount": 26, "Comment": [ "When direction flag is a compile time constant we can optimize", "loads and stores can turn in to post-increment when known" @@ -639,6 +903,22 @@ "strb w20, [x28, #714]", "mov x0, x5", "mov x1, x11", + "cbz x0, #+0x50", + "and x3, x1, #0x3", + "cbnz x3, #+0x3c", + "dup v1.2d, x4", + "sub x0, x0, #0x8 (8)", + "tbnz x0, #63, #+0x10", + "stp q1, q1, [x1], #-32", + "stp q1, q1, [x1], #-32", + "b #-0x10", + "add x0, x0, #0x8 (8)", + "cbz x0, #+0x28", + "sub x0, x0, #0x4 (4)", + "tbnz x0, #63, #+0xc", + "stp q1, q1, [x1], #-32", + "b #-0xc", + "add x0, x0, #0x4 (4)", "cbz x0, #+0x10", "str x4, [x1], #-8", "sub x0, x0, #0x1 (1)", diff --git a/unittests/InstructionCountCI/Primary.json b/unittests/InstructionCountCI/Primary.json index a7f8fad12c..37a6c1f5f6 100644 --- a/unittests/InstructionCountCI/Primary.json +++ b/unittests/InstructionCountCI/Primary.json @@ -2861,14 +2861,33 @@ ] }, "rep movsb": { - "ExpectedInstructionCount": 29, + "ExpectedInstructionCount": 67, "Comment": "0xa4", "ExpectedArm64ASM": [ "ldrb w20, [x28, #714]", "mov x0, x5", "mov x1, x11", "mov x2, x10", - "cbnz x20, #+0x30", + "cbnz x20, #+0x7c", + "cbz x0, #+0x60", + "orr x3, x1, x2", + "and x3, x3, #0x3", + "cbnz x3, #+0x44", + "sub x0, x0, #0x40 (64)", + "tbnz x0, #63, #+0x18", + "ldp q0, q1, [x2], #32", + "stp q0, q1, [x1], #32", + "ldp q0, q1, [x2], #32", + "stp q0, q1, [x1], #32", + "b #-0x18", + "add x0, x0, #0x40 (64)", + "cbz x0, #+0x30", + "sub x0, x0, #0x20 (32)", + "tbnz x0, #63, #+0x10", + "ldp q0, q1, [x2], #32", + "stp q0, q1, [x1], #32", + "b #-0x10", + "add x0, x0, #0x20 (32)", "cbz x0, #+0x14", "ldrb w3, [x2], #1", "strb w3, [x1], #1", @@ -2879,7 +2898,26 @@ "mov x2, x5", "add x20, x0, x2", "add x21, x1, x2", - "b #+0x2c", + "b #+0x78", + "cbz x0, #+0x60", + "orr x3, x1, x2", + "and x3, x3, #0x3", + "cbnz x3, #+0x44", + "sub x0, x0, #0x40 (64)", + "tbnz x0, #63, #+0x18", + "ldp q0, q1, [x2], #-32", + "stp q0, q1, [x1], #-32", + "ldp q0, q1, [x2], #-32", + "stp q0, q1, [x1], #-32", + "b #-0x18", + "add x0, x0, #0x40 (64)", + "cbz x0, #+0x30", + "sub x0, x0, #0x20 (32)", + "tbnz x0, #63, #+0x10", + "ldp q0, q1, [x2], #-32", + "stp q0, q1, [x1], #-32", + "b #-0x10", + "add x0, x0, #0x20 (32)", "cbz x0, #+0x14", "ldrb w3, [x2], #-1", "strb w3, [x1], #-1", @@ -2896,14 +2934,33 @@ ] }, "rep movsw": { - "ExpectedInstructionCount": 29, + "ExpectedInstructionCount": 67, "Comment": "0xa5", "ExpectedArm64ASM": [ "ldrb w20, [x28, #714]", "mov x0, x5", "mov x1, x11", "mov x2, x10", - "cbnz x20, #+0x30", + "cbnz x20, #+0x7c", + "cbz x0, #+0x60", + "orr x3, x1, x2", + "and x3, x3, #0x3", + "cbnz x3, #+0x44", + "sub x0, x0, #0x20 (32)", + "tbnz x0, #63, #+0x18", + "ldp q0, q1, [x2], #32", + "stp q0, q1, [x1], #32", + "ldp q0, q1, [x2], #32", + "stp q0, q1, [x1], #32", + "b #-0x18", + "add x0, x0, #0x20 (32)", + "cbz x0, #+0x30", + "sub x0, x0, #0x10 (16)", + "tbnz x0, #63, #+0x10", + "ldp q0, q1, [x2], #32", + "stp q0, q1, [x1], #32", + "b #-0x10", + "add x0, x0, #0x10 (16)", "cbz x0, #+0x14", "ldrh w3, [x2], #2", "strh w3, [x1], #2", @@ -2914,7 +2971,26 @@ "mov x2, x5", "add x20, x0, x2, lsl #1", "add x21, x1, x2, lsl #1", - "b #+0x2c", + "b #+0x78", + "cbz x0, #+0x60", + "orr x3, x1, x2", + "and x3, x3, #0x3", + "cbnz x3, #+0x44", + "sub x0, x0, #0x20 (32)", + "tbnz x0, #63, #+0x18", + "ldp q0, q1, [x2], #-32", + "stp q0, q1, [x1], #-32", + "ldp q0, q1, [x2], #-32", + "stp q0, q1, [x1], #-32", + "b #-0x18", + "add x0, x0, #0x20 (32)", + "cbz x0, #+0x30", + "sub x0, x0, #0x10 (16)", + "tbnz x0, #63, #+0x10", + "ldp q0, q1, [x2], #-32", + "stp q0, q1, [x1], #-32", + "b #-0x10", + "add x0, x0, #0x10 (16)", "cbz x0, #+0x14", "ldrh w3, [x2], #-2", "strh w3, [x1], #-2", @@ -2931,14 +3007,33 @@ ] }, "rep movsd": { - "ExpectedInstructionCount": 29, + "ExpectedInstructionCount": 67, "Comment": "0xa5", "ExpectedArm64ASM": [ "ldrb w20, [x28, #714]", "mov x0, x5", "mov x1, x11", "mov x2, x10", - "cbnz x20, #+0x30", + "cbnz x20, #+0x7c", + "cbz x0, #+0x60", + "orr x3, x1, x2", + "and x3, x3, #0x3", + "cbnz x3, #+0x44", + "sub x0, x0, #0x10 (16)", + "tbnz x0, #63, #+0x18", + "ldp q0, q1, [x2], #32", + "stp q0, q1, [x1], #32", + "ldp q0, q1, [x2], #32", + "stp q0, q1, [x1], #32", + "b #-0x18", + "add x0, x0, #0x10 (16)", + "cbz x0, #+0x30", + "sub x0, x0, #0x8 (8)", + "tbnz x0, #63, #+0x10", + "ldp q0, q1, [x2], #32", + "stp q0, q1, [x1], #32", + "b #-0x10", + "add x0, x0, #0x8 (8)", "cbz x0, #+0x14", "ldr w3, [x2], #4", "str w3, [x1], #4", @@ -2949,7 +3044,26 @@ "mov x2, x5", "add x20, x0, x2, lsl #2", "add x21, x1, x2, lsl #2", - "b #+0x2c", + "b #+0x78", + "cbz x0, #+0x60", + "orr x3, x1, x2", + "and x3, x3, #0x3", + "cbnz x3, #+0x44", + "sub x0, x0, #0x10 (16)", + "tbnz x0, #63, #+0x18", + "ldp q0, q1, [x2], #-32", + "stp q0, q1, [x1], #-32", + "ldp q0, q1, [x2], #-32", + "stp q0, q1, [x1], #-32", + "b #-0x18", + "add x0, x0, #0x10 (16)", + "cbz x0, #+0x30", + "sub x0, x0, #0x8 (8)", + "tbnz x0, #63, #+0x10", + "ldp q0, q1, [x2], #-32", + "stp q0, q1, [x1], #-32", + "b #-0x10", + "add x0, x0, #0x8 (8)", "cbz x0, #+0x14", "ldr w3, [x2], #-4", "str w3, [x1], #-4", @@ -2966,14 +3080,33 @@ ] }, "rep movsq": { - "ExpectedInstructionCount": 29, + "ExpectedInstructionCount": 67, "Comment": "0xa5", "ExpectedArm64ASM": [ "ldrb w20, [x28, #714]", "mov x0, x5", "mov x1, x11", "mov x2, x10", - "cbnz x20, #+0x30", + "cbnz x20, #+0x7c", + "cbz x0, #+0x60", + "orr x3, x1, x2", + "and x3, x3, #0x3", + "cbnz x3, #+0x44", + "sub x0, x0, #0x8 (8)", + "tbnz x0, #63, #+0x18", + "ldp q0, q1, [x2], #32", + "stp q0, q1, [x1], #32", + "ldp q0, q1, [x2], #32", + "stp q0, q1, [x1], #32", + "b #-0x18", + "add x0, x0, #0x8 (8)", + "cbz x0, #+0x30", + "sub x0, x0, #0x4 (4)", + "tbnz x0, #63, #+0x10", + "ldp q0, q1, [x2], #32", + "stp q0, q1, [x1], #32", + "b #-0x10", + "add x0, x0, #0x4 (4)", "cbz x0, #+0x14", "ldr x3, [x2], #8", "str x3, [x1], #8", @@ -2984,7 +3117,26 @@ "mov x2, x5", "add x20, x0, x2, lsl #3", "add x21, x1, x2, lsl #3", - "b #+0x2c", + "b #+0x78", + "cbz x0, #+0x60", + "orr x3, x1, x2", + "and x3, x3, #0x3", + "cbnz x3, #+0x44", + "sub x0, x0, #0x8 (8)", + "tbnz x0, #63, #+0x18", + "ldp q0, q1, [x2], #-32", + "stp q0, q1, [x1], #-32", + "ldp q0, q1, [x2], #-32", + "stp q0, q1, [x1], #-32", + "b #-0x18", + "add x0, x0, #0x8 (8)", + "cbz x0, #+0x30", + "sub x0, x0, #0x4 (4)", + "tbnz x0, #63, #+0x10", + "ldp q0, q1, [x2], #-32", + "stp q0, q1, [x1], #-32", + "b #-0x10", + "add x0, x0, #0x4 (4)", "cbz x0, #+0x14", "ldr x3, [x2], #-8", "str x3, [x1], #-8", @@ -3368,20 +3520,52 @@ ] }, "rep stosb": { - "ExpectedInstructionCount": 17, + "ExpectedInstructionCount": 49, "Comment": "0xaa", "ExpectedArm64ASM": [ "uxtb w20, w4", "ldrb w21, [x28, #714]", "mov x0, x5", "mov x1, x11", - "cbnz x21, #+0x1c", + "cbnz x21, #+0x5c", + "cbz x0, #+0x50", + "and x3, x1, #0x3", + "cbnz x3, #+0x3c", + "dup v1.16b, w20", + "sub x0, x0, #0x40 (64)", + "tbnz x0, #63, #+0x10", + "stp q1, q1, [x1], #32", + "stp q1, q1, [x1], #32", + "b #-0x10", + "add x0, x0, #0x40 (64)", + "cbz x0, #+0x28", + "sub x0, x0, #0x20 (32)", + "tbnz x0, #63, #+0xc", + "stp q1, q1, [x1], #32", + "b #-0xc", + "add x0, x0, #0x20 (32)", "cbz x0, #+0x10", "strb w20, [x1], #1", "sub x0, x0, #0x1 (1)", "cbnz x0, #-0x8", "add x11, x11, x5", - "b #+0x18", + "b #+0x58", + "cbz x0, #+0x50", + "and x3, x1, #0x3", + "cbnz x3, #+0x3c", + "dup v1.16b, w20", + "sub x0, x0, #0x40 (64)", + "tbnz x0, #63, #+0x10", + "stp q1, q1, [x1], #-32", + "stp q1, q1, [x1], #-32", + "b #-0x10", + "add x0, x0, #0x40 (64)", + "cbz x0, #+0x28", + "sub x0, x0, #0x20 (32)", + "tbnz x0, #63, #+0xc", + "stp q1, q1, [x1], #-32", + "b #-0xc", + "add x0, x0, #0x20 (32)", "cbz x0, #+0x10", "strb w20, [x1], #-1", "sub x0, x0, #0x1 (1)", @@ -3391,20 +3575,52 @@ ] }, "rep stosw": { - "ExpectedInstructionCount": 17, + "ExpectedInstructionCount": 49, "Comment": "0xab", "ExpectedArm64ASM": [ "uxth w20, w4", "ldrb w21, [x28, #714]", "mov x0, x5", "mov x1, x11", - "cbnz x21, #+0x1c", + "cbnz x21, #+0x5c", + "cbz x0, #+0x50", + "and x3, x1, #0x3", + "cbnz x3, #+0x3c", + "dup v1.8h, w20", + "sub x0, x0, #0x20 (32)", + "tbnz x0, #63, #+0x10", + "stp q1, q1, [x1], #32", + "stp q1, q1, [x1], #32", + "b #-0x10", + "add x0, x0, #0x20 (32)", + "cbz x0, #+0x28", + "sub x0, x0, #0x10 (16)", + "tbnz x0, #63, #+0xc", + "stp q1, q1, [x1], #32", + "b #-0xc", + "add x0, x0, #0x10 (16)", "cbz x0, #+0x10", "strh w20, [x1], #2", "sub x0, x0, #0x1 (1)", "cbnz x0, #-0x8", "add x11, x11, x5, lsl #1", - "b #+0x18", + "b #+0x58", + "cbz x0, #+0x50", + "and x3, x1, #0x3", + "cbnz x3, #+0x3c", + "dup v1.8h, w20", + "sub x0, x0, #0x20 (32)", + "tbnz x0, #63, #+0x10", + "stp q1, q1, [x1], #-32", + "stp q1, q1, [x1], #-32", + "b #-0x10", + "add x0, x0, #0x20 (32)", + "cbz x0, #+0x28", + "sub x0, x0, #0x10 (16)", + "tbnz x0, #63, #+0xc", + "stp q1, q1, [x1], #-32", + "b #-0xc", + "add x0, x0, #0x10 (16)", "cbz x0, #+0x10", "strh w20, [x1], #-2", "sub x0, x0, #0x1 (1)", @@ -3414,20 +3630,52 @@ ] }, "rep stosd": { - "ExpectedInstructionCount": 17, + "ExpectedInstructionCount": 49, "Comment": "0xab", "ExpectedArm64ASM": [ "mov w20, w4", "ldrb w21, [x28, #714]", "mov x0, x5", "mov x1, x11", - "cbnz x21, #+0x1c", + "cbnz x21, #+0x5c", + "cbz x0, #+0x50", + "and x3, x1, #0x3", + "cbnz x3, #+0x3c", + "dup v1.4s, w20", + "sub x0, x0, #0x10 (16)", + "tbnz x0, #63, #+0x10", + "stp q1, q1, [x1], #32", + "stp q1, q1, [x1], #32", + "b #-0x10", + "add x0, x0, #0x10 (16)", + "cbz x0, #+0x28", + "sub x0, x0, #0x8 (8)", + "tbnz x0, #63, #+0xc", + "stp q1, q1, [x1], #32", + "b #-0xc", + "add x0, x0, #0x8 (8)", "cbz x0, #+0x10", "str w20, [x1], #4", "sub x0, x0, #0x1 (1)", "cbnz x0, #-0x8", "add x11, x11, x5, lsl #2", - "b #+0x18", + "b #+0x58", + "cbz x0, #+0x50", + "and x3, x1, #0x3", + "cbnz x3, #+0x3c", + "dup v1.4s, w20", + "sub x0, x0, #0x10 (16)", + "tbnz x0, #63, #+0x10", + "stp q1, q1, [x1], #-32", + "stp q1, q1, [x1], #-32", + "b #-0x10", + "add x0, x0, #0x10 (16)", + "cbz x0, #+0x28", + "sub x0, x0, #0x8 (8)", + "tbnz x0, #63, #+0xc", + "stp q1, q1, [x1], #-32", + "b #-0xc", + "add x0, x0, #0x8 (8)", "cbz x0, #+0x10", "str w20, [x1], #-4", "sub x0, x0, #0x1 (1)", @@ -3437,7 +3685,7 @@ ] }, "rep stosq": { - "ExpectedInstructionCount": 16, + "ExpectedInstructionCount": 48, "Comment": [ "Unrolling the loop for faster memset can be done.", "Taking advantage of ARM MOPs instructions can be done", @@ -3447,13 +3695,45 @@ "ldrb w20, [x28, #714]", "mov x0, x5", "mov x1, x11", - "cbnz x20, #+0x1c", + "cbnz x20, #+0x5c", + "cbz x0, #+0x50", + "and x3, x1, #0x3", + "cbnz x3, #+0x3c", + "dup v1.2d, x4", + "sub x0, x0, #0x8 (8)", + "tbnz x0, #63, #+0x10", + "stp q1, q1, [x1], #32", + "stp q1, q1, [x1], #32", + "b #-0x10", + "add x0, x0, #0x8 (8)", + "cbz x0, #+0x28", + "sub x0, x0, #0x4 (4)", + "tbnz x0, #63, #+0xc", + "stp q1, q1, [x1], #32", + "b #-0xc", + "add x0, x0, #0x4 (4)", "cbz x0, #+0x10", "str x4, [x1], #8", "sub x0, x0, #0x1 (1)", "cbnz x0, #-0x8", "add x11, x11, x5, lsl #3", - "b #+0x18", + "b #+0x58", + "cbz x0, #+0x50", + "and x3, x1, #0x3", + "cbnz x3, #+0x3c", + "dup v1.2d, x4", + "sub x0, x0, #0x8 (8)", + "tbnz x0, #63, #+0x10", + "stp q1, q1, [x1], #-32", + "stp q1, q1, [x1], #-32", + "b #-0x10", + "add x0, x0, #0x8 (8)", + "cbz x0, #+0x28", + "sub x0, x0, #0x4 (4)", + "tbnz x0, #63, #+0xc", + "stp q1, q1, [x1], #-32", + "b #-0xc", + "add x0, x0, #0x4 (4)", "cbz x0, #+0x10", "str x4, [x1], #-8", "sub x0, x0, #0x1 (1)",