From c513b9685d0aa005f3f3b8fdc27c6a42f7b26bae Mon Sep 17 00:00:00 2001 From: Alyssa Rosenzweig Date: Fri, 29 Mar 2024 09:53:07 -0400 Subject: [PATCH 1/2] OpcodeDispatcher: eliminate crossblock liveness in xsave/xrstor Signed-off-by: Alyssa Rosenzweig --- .../Source/Interface/Core/OpcodeDispatcher.h | 1 + .../Core/OpcodeDispatcher/Vector.cpp | 38 +++++++++---------- 2 files changed, 19 insertions(+), 20 deletions(-) diff --git a/FEXCore/Source/Interface/Core/OpcodeDispatcher.h b/FEXCore/Source/Interface/Core/OpcodeDispatcher.h index d9f8fb08b4..9c58e50725 100644 --- a/FEXCore/Source/Interface/Core/OpcodeDispatcher.h +++ b/FEXCore/Source/Interface/Core/OpcodeDispatcher.h @@ -809,6 +809,7 @@ friend class FEXCore::IR::PassManager; void FXSaveOp(OpcodeArgs); void FXRStoreOp(OpcodeArgs); + OrderedNode *XSaveBase(X86Tables::DecodedOp Op); void XSaveOp(OpcodeArgs); void PAlignrOp(OpcodeArgs); diff --git a/FEXCore/Source/Interface/Core/OpcodeDispatcher/Vector.cpp b/FEXCore/Source/Interface/Core/OpcodeDispatcher/Vector.cpp index 3ee2d75159..c72cef15c5 100644 --- a/FEXCore/Source/Interface/Core/OpcodeDispatcher/Vector.cpp +++ b/FEXCore/Source/Interface/Core/OpcodeDispatcher/Vector.cpp @@ -3001,16 +3001,15 @@ void OpDispatchBuilder::XSaveOp(OpcodeArgs) { XSaveOpImpl(Op); } -void OpDispatchBuilder::XSaveOpImpl(OpcodeArgs) { - const auto XSaveBase = [this, Op] { - OrderedNode *Mem = LoadSource(GPRClass, Op, Op->Dest, Op->Flags, {.LoadData = false}); - return AppendSegmentOffset(Mem, Op->Flags); - }; +OrderedNode *OpDispatchBuilder::XSaveBase(X86Tables::DecodedOp Op) { + OrderedNode *Mem = LoadSource(GPRClass, Op, Op->Dest, Op->Flags, {.LoadData = false}); + return AppendSegmentOffset(Mem, Op->Flags); +} +void OpDispatchBuilder::XSaveOpImpl(OpcodeArgs) { // NOTE: Mask should be EAX and EDX concatenated, but we only need to test // for features that are in the lower 32 bits, so EAX only is sufficient. OrderedNode *Mask = LoadGPRRegister(X86State::REG_RAX); - OrderedNode *Base = XSaveBase(); const auto OpSize = IR::SizeToOpSize(CTX->GetGPRSize()); const auto StoreIfFlagSet = [&](uint32_t BitIndex, auto fn, uint32_t FieldSize = 1){ @@ -3034,25 +3033,26 @@ void OpDispatchBuilder::XSaveOpImpl(OpcodeArgs) { // x87 { - StoreIfFlagSet(0, [this, Op, Base] { SaveX87State(Op, Base); }); + StoreIfFlagSet(0, [this, Op] { SaveX87State(Op, XSaveBase(Op)); }); } // SSE { - StoreIfFlagSet(1, [this, Base] { SaveSSEState(Base); }); + StoreIfFlagSet(1, [this, Op] { SaveSSEState(XSaveBase(Op)); }); } // AVX if (CTX->HostFeatures.SupportsAVX) { - StoreIfFlagSet(2, [this, Base] { SaveAVXState(Base); }); + StoreIfFlagSet(2, [this, Op] { SaveAVXState(XSaveBase(Op)); }); } // We need to save MXCSR and MXCSR_MASK if either SSE or AVX are requested to be saved { - StoreIfFlagSet(1, [this, Base] { SaveMXCSRState(Base); }, 2); + StoreIfFlagSet(1, [this, Op] { SaveMXCSRState(XSaveBase(Op)); }, 2); } // Update XSTATE_BV region of the XSAVE header { + OrderedNode *Base = XSaveBase(Op); OrderedNode *HeaderOffset = _Add(OpSize, Base, _Constant(512)); // NOTE: We currently only support the first 3 bits (x87, SSE, and AVX) @@ -3210,14 +3210,11 @@ void OpDispatchBuilder::FXRStoreOp(OpcodeArgs) { void OpDispatchBuilder::XRstorOpImpl(OpcodeArgs) { const auto OpSize = IR::SizeToOpSize(CTX->GetGPRSize()); - const auto XSaveBase = [this, Op] { - OrderedNode *Mem = LoadSource(GPRClass, Op, Op->Dest, Op->Flags, {.LoadData = false}); - return AppendSegmentOffset(Mem, Op->Flags); - }; - // Set up base address for the XSAVE region to restore from, and also read the // XSTATE_BV bit flags out of the XSTATE header. - OrderedNode *Base = XSaveBase(); + // + // Note: we rematerialize Base in each block to avoid crossblock liveness. + OrderedNode *Base = XSaveBase(Op); OrderedNode *Mask = _LoadMem(GPRClass, 8, _Add(OpSize, Base, _Constant(512)), 8); // If a bit in our XSTATE_BV is set, then we restore from that region of the XSAVE area, @@ -3253,27 +3250,28 @@ void OpDispatchBuilder::XRstorOpImpl(OpcodeArgs) { // x87 { RestoreIfFlagSetOrDefault(0, - [this, Base] { RestoreX87State(Base); }, + [this, Op] { RestoreX87State(XSaveBase(Op)); }, [this, Op] { DefaultX87State(Op); }); } // SSE { RestoreIfFlagSetOrDefault(1, - [this, Base] { RestoreSSEState(Base); }, + [this, Op] { RestoreSSEState(XSaveBase(Op)); }, [this] { DefaultSSEState(); }); } // AVX if (CTX->HostFeatures.SupportsAVX) { RestoreIfFlagSetOrDefault(2, - [this, Base] { RestoreAVXState(Base); }, + [this, Op] { RestoreAVXState(XSaveBase(Op)); }, [this] { DefaultAVXState(); }); } { // We need to restore the MXCSR if either SSE or AVX are requested to be saved RestoreIfFlagSetOrDefault(1, - [this, Base, OpSize] { + [this, Op, OpSize] { + OrderedNode *Base = XSaveBase(Op); OrderedNode *MXCSRLocation = _Add(OpSize, Base, _Constant(24)); OrderedNode *MXCSR = _LoadMem(GPRClass, 4, MXCSRLocation, 4); RestoreMXCSRState(MXCSR); From 1d96631af7f45a589cdddb2971110b0063f8bd9e Mon Sep 17 00:00:00 2001 From: Alyssa Rosenzweig Date: Fri, 29 Mar 2024 09:58:03 -0400 Subject: [PATCH 2/2] InstCountCI: Update Signed-off-by: Alyssa Rosenzweig --- .../FlagM/SecondaryGroup.json | 232 +++++++++--------- .../InstructionCountCI/SecondaryGroup.json | 232 +++++++++--------- 2 files changed, 230 insertions(+), 234 deletions(-) diff --git a/unittests/InstructionCountCI/FlagM/SecondaryGroup.json b/unittests/InstructionCountCI/FlagM/SecondaryGroup.json index 6d1e2e9639..76f03aa253 100644 --- a/unittests/InstructionCountCI/FlagM/SecondaryGroup.json +++ b/unittests/InstructionCountCI/FlagM/SecondaryGroup.json @@ -1407,80 +1407,79 @@ ] }, "xsave [rax]": { - "ExpectedInstructionCount": 71, + "ExpectedInstructionCount": 70, "Comment": "GROUP15 0x0F 0xAE /4", "ExpectedArm64ASM": [ "mov x20, x4", - "mov x21, x4", - "ubfx x22, x20, #0, #1", - "cbnz x22, #+0x8", + "ubfx x21, x20, #0, #1", + "cbnz x21, #+0x8", "b #+0x84", - "ldrh w22, [x28, #1024]", - "strh w22, [x21]", - "mov w22, #0x0", - "ldrb w23, [x28, #747]", - "bfi x22, x23, #11, #3", - "ldrb w23, [x28, #744]", - "ldrb w24, [x28, #745]", - "ldrb w25, [x28, #746]", - "ldrb w30, [x28, #750]", - "orr x22, x22, x23, lsl #8", - "orr x22, x22, x24, lsl #9", - "orr x22, x22, x25, lsl #10", - "orr x22, x22, x30, lsl #14", - "strh w22, [x21, #2]", - "ldrb w22, [x28, #1026]", - "strb w22, [x21, #4]", + "ldrh w21, [x28, #1024]", + "strh w21, [x4]", + "mov w21, #0x0", + "ldrb w22, [x28, #747]", + "bfi x21, x22, #11, #3", + "ldrb w22, [x28, #744]", + "ldrb w23, [x28, #745]", + "ldrb w24, [x28, #746]", + "ldrb w25, [x28, #750]", + "orr x21, x21, x22, lsl #8", + "orr x21, x21, x23, lsl #9", + "orr x21, x21, x24, lsl #10", + "orr x21, x21, x25, lsl #14", + "strh w21, [x4, #2]", + "ldrb w21, [x28, #1026]", + "strb w21, [x4, #4]", "ldr q2, [x28, #768]", - "str q2, [x21, #32]", + "str q2, [x4, #32]", "ldr q2, [x28, #784]", - "str q2, [x21, #48]", + "str q2, [x4, #48]", "ldr q2, [x28, #800]", - "str q2, [x21, #64]", + "str q2, [x4, #64]", "ldr q2, [x28, #816]", - "str q2, [x21, #80]", + "str q2, [x4, #80]", "ldr q2, [x28, #832]", - "str q2, [x21, #96]", + "str q2, [x4, #96]", "ldr q2, [x28, #848]", - "str q2, [x21, #112]", + "str q2, [x4, #112]", "ldr q2, [x28, #864]", - "str q2, [x21, #128]", + "str q2, [x4, #128]", "ldr q2, [x28, #880]", - "str q2, [x21, #144]", - "ubfx x22, x20, #1, #1", - "cbnz x22, #+0x8", + "str q2, [x4, #144]", + "ubfx x21, x20, #1, #1", + "cbnz x21, #+0x8", "b #+0x44", - "str q16, [x21, #160]", - "str q17, [x21, #176]", - "str q18, [x21, #192]", - "str q19, [x21, #208]", - "str q20, [x21, #224]", - "str q21, [x21, #240]", - "str q22, [x21, #256]", - "str q23, [x21, #272]", - "str q24, [x21, #288]", - "str q25, [x21, #304]", - "str q26, [x21, #320]", - "str q27, [x21, #336]", - "str q28, [x21, #352]", - "str q29, [x21, #368]", - "str q30, [x21, #384]", - "str q31, [x21, #400]", - "ubfx x22, x20, #1, #2", - "cbnz x22, #+0x8", + "str q16, [x4, #160]", + "str q17, [x4, #176]", + "str q18, [x4, #192]", + "str q19, [x4, #208]", + "str q20, [x4, #224]", + "str q21, [x4, #240]", + "str q22, [x4, #256]", + "str q23, [x4, #272]", + "str q24, [x4, #288]", + "str q25, [x4, #304]", + "str q26, [x4, #320]", + "str q27, [x4, #336]", + "str q28, [x4, #352]", + "str q29, [x4, #368]", + "str q30, [x4, #384]", + "str q31, [x4, #400]", + "ubfx x21, x20, #1, #2", + "cbnz x21, #+0x8", "b #+0x2c", - "mov w22, #0x1f80", - "mrs x23, fpcr", - "ubfx x23, x23, #22, #3", - "rbit w0, w23", - "bfi x23, x0, #30, #2", - "bfi w22, w23, #13, #3", - "add x23, x21, #0x18 (24)", - "str w22, [x21, #24]", - "mov w22, #0xffff", - "str w22, [x23, #4]", + "mov w21, #0x1f80", + "mrs x22, fpcr", + "ubfx x22, x22, #22, #3", + "rbit w0, w22", + "bfi x22, x0, #30, #2", + "bfi w21, w22, #13, #3", + "add x22, x4, #0x18 (24)", + "str w21, [x4, #24]", + "mov w21, #0xffff", + "str w21, [x22, #4]", "ubfx x20, x20, #0, #3", - "str x20, [x21, #512]" + "str x20, [x4, #512]" ] }, "lfence": { @@ -1491,55 +1490,54 @@ ] }, "xrstor [rax]": { - "ExpectedInstructionCount": 104, + "ExpectedInstructionCount": 103, "Comment": "GROUP15 0x0F 0xAE /5", "ExpectedArm64ASM": [ - "mov x20, x4", - "ldr x21, [x20, #512]", - "ubfx x22, x21, #0, #1", - "cbnz x22, #+0x8", + "ldr x20, [x4, #512]", + "ubfx x21, x20, #0, #1", + "cbnz x21, #+0x8", "b #+0x84", - "ldrh w22, [x20]", - "strh w22, [x28, #1024]", - "ldrh w22, [x20, #2]", - "ubfx w23, w22, #11, #3", - "strb w23, [x28, #747]", - "ubfx w23, w22, #8, #1", - "ubfx w24, w22, #9, #1", - "ubfx w25, w22, #10, #1", - "ubfx w22, w22, #14, #1", - "strb w23, [x28, #744]", - "strb w24, [x28, #745]", - "strb w25, [x28, #746]", - "strb w22, [x28, #750]", - "ldrb w22, [x20, #4]", - "strb w22, [x28, #1026]", - "ldr q2, [x20, #32]", + "ldrh w21, [x4]", + "strh w21, [x28, #1024]", + "ldrh w21, [x4, #2]", + "ubfx w22, w21, #11, #3", + "strb w22, [x28, #747]", + "ubfx w22, w21, #8, #1", + "ubfx w23, w21, #9, #1", + "ubfx w24, w21, #10, #1", + "ubfx w21, w21, #14, #1", + "strb w22, [x28, #744]", + "strb w23, [x28, #745]", + "strb w24, [x28, #746]", + "strb w21, [x28, #750]", + "ldrb w21, [x4, #4]", + "strb w21, [x28, #1026]", + "ldr q2, [x4, #32]", "str q2, [x28, #768]", - "ldr q2, [x20, #48]", + "ldr q2, [x4, #48]", "str q2, [x28, #784]", - "ldr q2, [x20, #64]", + "ldr q2, [x4, #64]", "str q2, [x28, #800]", - "ldr q2, [x20, #80]", + "ldr q2, [x4, #80]", "str q2, [x28, #816]", - "ldr q2, [x20, #96]", + "ldr q2, [x4, #96]", "str q2, [x28, #832]", - "ldr q2, [x20, #112]", + "ldr q2, [x4, #112]", "str q2, [x28, #848]", - "ldr q2, [x20, #128]", + "ldr q2, [x4, #128]", "str q2, [x28, #864]", - "ldr q2, [x20, #144]", + "ldr q2, [x4, #144]", "str q2, [x28, #880]", "b #+0x4c", - "mov w22, #0x0", - "mov w23, #0x37f", - "strh w23, [x28, #1024]", - "strb w22, [x28, #747]", - "strb w22, [x28, #744]", - "strb w22, [x28, #745]", - "strb w22, [x28, #746]", - "strb w22, [x28, #750]", - "strb w22, [x28, #1026]", + "mov w21, #0x0", + "mov w22, #0x37f", + "strh w22, [x28, #1024]", + "strb w21, [x28, #747]", + "strb w21, [x28, #744]", + "strb w21, [x28, #745]", + "strb w21, [x28, #746]", + "strb w21, [x28, #750]", + "strb w21, [x28, #1026]", "movi v2.2d, #0x0", "str q2, [x28, #768]", "str q2, [x28, #784]", @@ -1549,25 +1547,25 @@ "str q2, [x28, #848]", "str q2, [x28, #864]", "str q2, [x28, #880]", - "ubfx x22, x21, #1, #1", - "cbnz x22, #+0x8", + "ubfx x21, x20, #1, #1", + "cbnz x21, #+0x8", "b #+0x48", - "ldr q16, [x20, #160]", - "ldr q17, [x20, #176]", - "ldr q18, [x20, #192]", - "ldr q19, [x20, #208]", - "ldr q20, [x20, #224]", - "ldr q21, [x20, #240]", - "ldr q22, [x20, #256]", - "ldr q23, [x20, #272]", - "ldr q24, [x20, #288]", - "ldr q25, [x20, #304]", - "ldr q26, [x20, #320]", - "ldr q27, [x20, #336]", - "ldr q28, [x20, #352]", - "ldr q29, [x20, #368]", - "ldr q30, [x20, #384]", - "ldr q31, [x20, #400]", + "ldr q16, [x4, #160]", + "ldr q17, [x4, #176]", + "ldr q18, [x4, #192]", + "ldr q19, [x4, #208]", + "ldr q20, [x4, #224]", + "ldr q21, [x4, #240]", + "ldr q22, [x4, #256]", + "ldr q23, [x4, #272]", + "ldr q24, [x4, #288]", + "ldr q25, [x4, #304]", + "ldr q26, [x4, #320]", + "ldr q27, [x4, #336]", + "ldr q28, [x4, #352]", + "ldr q29, [x4, #368]", + "ldr q30, [x4, #384]", + "ldr q31, [x4, #400]", "b #+0x44", "movi v16.2d, #0x0", "mov v17.16b, v16.16b", @@ -1585,10 +1583,10 @@ "mov v29.16b, v16.16b", "mov v30.16b, v16.16b", "mov v31.16b, v16.16b", - "ubfx x21, x21, #1, #2", - "cbnz x21, #+0x8", + "ubfx x20, x20, #1, #2", + "cbnz x20, #+0x8", "b #+0x2c", - "ldr w20, [x20, #24]", + "ldr w20, [x4, #24]", "ubfx w20, w20, #13, #3", "rbit w1, w20", "lsr w1, w1, #30", diff --git a/unittests/InstructionCountCI/SecondaryGroup.json b/unittests/InstructionCountCI/SecondaryGroup.json index 4e5cbc5a74..825100170a 100644 --- a/unittests/InstructionCountCI/SecondaryGroup.json +++ b/unittests/InstructionCountCI/SecondaryGroup.json @@ -1587,80 +1587,79 @@ ] }, "xsave [rax]": { - "ExpectedInstructionCount": 71, + "ExpectedInstructionCount": 70, "Comment": "GROUP15 0x0F 0xAE /4", "ExpectedArm64ASM": [ "mov x20, x4", - "mov x21, x4", - "ubfx x22, x20, #0, #1", - "cbnz x22, #+0x8", + "ubfx x21, x20, #0, #1", + "cbnz x21, #+0x8", "b #+0x84", - "ldrh w22, [x28, #1024]", - "strh w22, [x21]", - "mov w22, #0x0", - "ldrb w23, [x28, #747]", - "bfi x22, x23, #11, #3", - "ldrb w23, [x28, #744]", - "ldrb w24, [x28, #745]", - "ldrb w25, [x28, #746]", - "ldrb w30, [x28, #750]", - "orr x22, x22, x23, lsl #8", - "orr x22, x22, x24, lsl #9", - "orr x22, x22, x25, lsl #10", - "orr x22, x22, x30, lsl #14", - "strh w22, [x21, #2]", - "ldrb w22, [x28, #1026]", - "strb w22, [x21, #4]", + "ldrh w21, [x28, #1024]", + "strh w21, [x4]", + "mov w21, #0x0", + "ldrb w22, [x28, #747]", + "bfi x21, x22, #11, #3", + "ldrb w22, [x28, #744]", + "ldrb w23, [x28, #745]", + "ldrb w24, [x28, #746]", + "ldrb w25, [x28, #750]", + "orr x21, x21, x22, lsl #8", + "orr x21, x21, x23, lsl #9", + "orr x21, x21, x24, lsl #10", + "orr x21, x21, x25, lsl #14", + "strh w21, [x4, #2]", + "ldrb w21, [x28, #1026]", + "strb w21, [x4, #4]", "ldr q2, [x28, #768]", - "str q2, [x21, #32]", + "str q2, [x4, #32]", "ldr q2, [x28, #784]", - "str q2, [x21, #48]", + "str q2, [x4, #48]", "ldr q2, [x28, #800]", - "str q2, [x21, #64]", + "str q2, [x4, #64]", "ldr q2, [x28, #816]", - "str q2, [x21, #80]", + "str q2, [x4, #80]", "ldr q2, [x28, #832]", - "str q2, [x21, #96]", + "str q2, [x4, #96]", "ldr q2, [x28, #848]", - "str q2, [x21, #112]", + "str q2, [x4, #112]", "ldr q2, [x28, #864]", - "str q2, [x21, #128]", + "str q2, [x4, #128]", "ldr q2, [x28, #880]", - "str q2, [x21, #144]", - "ubfx x22, x20, #1, #1", - "cbnz x22, #+0x8", + "str q2, [x4, #144]", + "ubfx x21, x20, #1, #1", + "cbnz x21, #+0x8", "b #+0x44", - "str q16, [x21, #160]", - "str q17, [x21, #176]", - "str q18, [x21, #192]", - "str q19, [x21, #208]", - "str q20, [x21, #224]", - "str q21, [x21, #240]", - "str q22, [x21, #256]", - "str q23, [x21, #272]", - "str q24, [x21, #288]", - "str q25, [x21, #304]", - "str q26, [x21, #320]", - "str q27, [x21, #336]", - "str q28, [x21, #352]", - "str q29, [x21, #368]", - "str q30, [x21, #384]", - "str q31, [x21, #400]", - "ubfx x22, x20, #1, #2", - "cbnz x22, #+0x8", + "str q16, [x4, #160]", + "str q17, [x4, #176]", + "str q18, [x4, #192]", + "str q19, [x4, #208]", + "str q20, [x4, #224]", + "str q21, [x4, #240]", + "str q22, [x4, #256]", + "str q23, [x4, #272]", + "str q24, [x4, #288]", + "str q25, [x4, #304]", + "str q26, [x4, #320]", + "str q27, [x4, #336]", + "str q28, [x4, #352]", + "str q29, [x4, #368]", + "str q30, [x4, #384]", + "str q31, [x4, #400]", + "ubfx x21, x20, #1, #2", + "cbnz x21, #+0x8", "b #+0x2c", - "mov w22, #0x1f80", - "mrs x23, fpcr", - "ubfx x23, x23, #22, #3", - "rbit w0, w23", - "bfi x23, x0, #30, #2", - "bfi w22, w23, #13, #3", - "add x23, x21, #0x18 (24)", - "str w22, [x21, #24]", - "mov w22, #0xffff", - "str w22, [x23, #4]", + "mov w21, #0x1f80", + "mrs x22, fpcr", + "ubfx x22, x22, #22, #3", + "rbit w0, w22", + "bfi x22, x0, #30, #2", + "bfi w21, w22, #13, #3", + "add x22, x4, #0x18 (24)", + "str w21, [x4, #24]", + "mov w21, #0xffff", + "str w21, [x22, #4]", "ubfx x20, x20, #0, #3", - "str x20, [x21, #512]" + "str x20, [x4, #512]" ] }, "lfence": { @@ -1671,55 +1670,54 @@ ] }, "xrstor [rax]": { - "ExpectedInstructionCount": 104, + "ExpectedInstructionCount": 103, "Comment": "GROUP15 0x0F 0xAE /5", "ExpectedArm64ASM": [ - "mov x20, x4", - "ldr x21, [x20, #512]", - "ubfx x22, x21, #0, #1", - "cbnz x22, #+0x8", + "ldr x20, [x4, #512]", + "ubfx x21, x20, #0, #1", + "cbnz x21, #+0x8", "b #+0x84", - "ldrh w22, [x20]", - "strh w22, [x28, #1024]", - "ldrh w22, [x20, #2]", - "ubfx w23, w22, #11, #3", - "strb w23, [x28, #747]", - "ubfx w23, w22, #8, #1", - "ubfx w24, w22, #9, #1", - "ubfx w25, w22, #10, #1", - "ubfx w22, w22, #14, #1", - "strb w23, [x28, #744]", - "strb w24, [x28, #745]", - "strb w25, [x28, #746]", - "strb w22, [x28, #750]", - "ldrb w22, [x20, #4]", - "strb w22, [x28, #1026]", - "ldr q2, [x20, #32]", + "ldrh w21, [x4]", + "strh w21, [x28, #1024]", + "ldrh w21, [x4, #2]", + "ubfx w22, w21, #11, #3", + "strb w22, [x28, #747]", + "ubfx w22, w21, #8, #1", + "ubfx w23, w21, #9, #1", + "ubfx w24, w21, #10, #1", + "ubfx w21, w21, #14, #1", + "strb w22, [x28, #744]", + "strb w23, [x28, #745]", + "strb w24, [x28, #746]", + "strb w21, [x28, #750]", + "ldrb w21, [x4, #4]", + "strb w21, [x28, #1026]", + "ldr q2, [x4, #32]", "str q2, [x28, #768]", - "ldr q2, [x20, #48]", + "ldr q2, [x4, #48]", "str q2, [x28, #784]", - "ldr q2, [x20, #64]", + "ldr q2, [x4, #64]", "str q2, [x28, #800]", - "ldr q2, [x20, #80]", + "ldr q2, [x4, #80]", "str q2, [x28, #816]", - "ldr q2, [x20, #96]", + "ldr q2, [x4, #96]", "str q2, [x28, #832]", - "ldr q2, [x20, #112]", + "ldr q2, [x4, #112]", "str q2, [x28, #848]", - "ldr q2, [x20, #128]", + "ldr q2, [x4, #128]", "str q2, [x28, #864]", - "ldr q2, [x20, #144]", + "ldr q2, [x4, #144]", "str q2, [x28, #880]", "b #+0x4c", - "mov w22, #0x0", - "mov w23, #0x37f", - "strh w23, [x28, #1024]", - "strb w22, [x28, #747]", - "strb w22, [x28, #744]", - "strb w22, [x28, #745]", - "strb w22, [x28, #746]", - "strb w22, [x28, #750]", - "strb w22, [x28, #1026]", + "mov w21, #0x0", + "mov w22, #0x37f", + "strh w22, [x28, #1024]", + "strb w21, [x28, #747]", + "strb w21, [x28, #744]", + "strb w21, [x28, #745]", + "strb w21, [x28, #746]", + "strb w21, [x28, #750]", + "strb w21, [x28, #1026]", "movi v2.2d, #0x0", "str q2, [x28, #768]", "str q2, [x28, #784]", @@ -1729,25 +1727,25 @@ "str q2, [x28, #848]", "str q2, [x28, #864]", "str q2, [x28, #880]", - "ubfx x22, x21, #1, #1", - "cbnz x22, #+0x8", + "ubfx x21, x20, #1, #1", + "cbnz x21, #+0x8", "b #+0x48", - "ldr q16, [x20, #160]", - "ldr q17, [x20, #176]", - "ldr q18, [x20, #192]", - "ldr q19, [x20, #208]", - "ldr q20, [x20, #224]", - "ldr q21, [x20, #240]", - "ldr q22, [x20, #256]", - "ldr q23, [x20, #272]", - "ldr q24, [x20, #288]", - "ldr q25, [x20, #304]", - "ldr q26, [x20, #320]", - "ldr q27, [x20, #336]", - "ldr q28, [x20, #352]", - "ldr q29, [x20, #368]", - "ldr q30, [x20, #384]", - "ldr q31, [x20, #400]", + "ldr q16, [x4, #160]", + "ldr q17, [x4, #176]", + "ldr q18, [x4, #192]", + "ldr q19, [x4, #208]", + "ldr q20, [x4, #224]", + "ldr q21, [x4, #240]", + "ldr q22, [x4, #256]", + "ldr q23, [x4, #272]", + "ldr q24, [x4, #288]", + "ldr q25, [x4, #304]", + "ldr q26, [x4, #320]", + "ldr q27, [x4, #336]", + "ldr q28, [x4, #352]", + "ldr q29, [x4, #368]", + "ldr q30, [x4, #384]", + "ldr q31, [x4, #400]", "b #+0x44", "movi v16.2d, #0x0", "mov v17.16b, v16.16b", @@ -1765,10 +1763,10 @@ "mov v29.16b, v16.16b", "mov v30.16b, v16.16b", "mov v31.16b, v16.16b", - "ubfx x21, x21, #1, #2", - "cbnz x21, #+0x8", + "ubfx x20, x20, #1, #2", + "cbnz x20, #+0x8", "b #+0x2c", - "ldr w20, [x20, #24]", + "ldr w20, [x4, #24]", "ubfx w20, w20, #13, #3", "rbit w1, w20", "lsr w1, w1, #30",