From f366964a77c71b1a28cba88ca3c3f3b5cd7d3904 Mon Sep 17 00:00:00 2001
From: Sun Yimin
Date: Wed, 28 Aug 2024 13:08:26 +0800
Subject: [PATCH] internal/subtle: s390x, check if VLM/VSTM issue

---
Review notes (free text in this section is ignored when the patch is applied):

What the hunk does
  * Raises the vector-path threshold in xorBytes from 16 to 64 bytes.
  * loop_64 handles 64 bytes per iteration: four 16-byte VL loads from
    (R2)(R5), four from (R3)(R5), four VX, four VST to (R1)(R5); then R5 is
    advanced by 64 and R4 reduced by 64.
  * tail then handles one optional 32-byte step and one optional 16-byte
    step before falling into the pre-existing less-than-8 handling.
  * R4 is n (loaded from n+24(FP)); R5 is the running byte offset, starting
    at 0. R1, R2 and R3 are set up outside this hunk.

Fix applied during review
  * In the 32-byte tail step the second operand was loaded into V4/V5, but
    the XORs read V2/V3 and the stores write V2/V3. V2/V3 are not loaded on
    that path (they hold loop_64 leftovers, or nothing written by this
    function when n < 64), so wrong bytes were stored. The two loads from
    (R3) now target V2/V3, matching the VX/VST lines that follow.

NOTE(review): this text was recovered from a copy flattened onto one line.
Line breaks and indentation are reconstructed. The hunk header declares
19 old / 53 new lines but only 18 / 52 could be recovered (one context line
and the trailer seem to be missing). TODO: confirm against the original
patch before applying.

 internal/subtle/xor_s390x.s | 48 +++++++++++++++++++++++++++++++------
 1 file changed, 41 insertions(+), 7 deletions(-)

diff --git a/internal/subtle/xor_s390x.s b/internal/subtle/xor_s390x.s
index 7c5acab..007193f 100644
--- a/internal/subtle/xor_s390x.s
+++ b/internal/subtle/xor_s390x.s
@@ -10,19 +10,53 @@ TEXT ·xorBytes(SB),NOSPLIT,$0-32
 	MOVD n+24(FP), R4
 	MOVD $0, R5
-	CMPBLT R4, $16, tail
+	CMPBLT R4, $64, tail
 
-loop16b:
+loop_64:
 	VL 0(R2)(R5*1), V0
-	VL 0(R3)(R5*1), V1
-	VX V0, V1, V2
+	VL 16(R2)(R5*1), V1
+	VL 32(R2)(R5*1), V2
+	VL 48(R2)(R5*1), V3
+	VL 0(R3)(R5*1), V4
+	VL 16(R3)(R5*1), V5
+	VL 32(R3)(R5*1), V6
+	VL 48(R3)(R5*1), V7
+	VX V0, V4, V4
+	VX V1, V5, V5
+	VX V2, V6, V6
+	VX V3, V7, V7
+	VST V4, 0(R1)(R5*1)
+	VST V5, 16(R1)(R5*1)
+	VST V6, 32(R1)(R5*1)
+	VST V7, 48(R1)(R5*1)
+	LAY 64(R5), R5
+	SUB $64, R4
+	CMPBGE R4, $64, loop_64
+
+tail:
+	CMPBEQ R4, $0, done
+	CMPBLT R4, $32, less_than32
+	VL 0(R2)(R5*1), V0
+	VL 16(R2)(R5*1), V1
+	VL 0(R3)(R5*1), V2
+	VL 16(R3)(R5*1), V3
+	VX V0, V2, V2
+	VX V1, V3, V3
 	VST V2, 0(R1)(R5*1)
+	VST V3, 16(R1)(R5*1)
+	LAY 32(R5), R5
+	SUB $32, R4
+
+less_than32:
+	CMPBLT R4, $16, less_than16
+	VL 0(R2)(R5*1), V0
+	VL 0(R3)(R5*1), V1
+	VX V0, V1, V1
+	VST V1, 0(R1)(R5*1)
 	LAY 16(R5), R5
 	SUB $16, R4
-	CMPBGE R4, $16, loop16b
 
-tail:
-	CMPBEQ R4, $0, done
+less_than16:
 	CMPBLT R4, $8, less_than8
 	MOVD 0(R2)(R5*1), R7
 	MOVD 0(R3)(R5*1), R8