From c0055c1fb4cc99894451ae23fb9e724cc2bc9942 Mon Sep 17 00:00:00 2001 From: Sun Yimin Date: Wed, 28 Aug 2024 08:38:36 +0800 Subject: [PATCH] internal/subtle: s390x use VLM/VSTM --- internal/subtle/xor_s390x.s | 29 ++++++++++++++++++++++++++--- 1 file changed, 26 insertions(+), 3 deletions(-) diff --git a/internal/subtle/xor_s390x.s b/internal/subtle/xor_s390x.s index 764a810..c6fa982 100644 --- a/internal/subtle/xor_s390x.s +++ b/internal/subtle/xor_s390x.s @@ -14,16 +14,39 @@ TEXT ·xorBytes(SB),NOSPLIT,$0-32 MOVD n+24(FP), R4 MOVD $0, R5 - CMPBLT R4, $16, less_than16 + CMPBLT R4, $64, less_than64 + +loop64b: + VLM (R2)(R5*1), V0, V3 + VLM (R3)(R5*1), V4, V7 + VX V0, V4, V4 + VX V1, V5, V5 + VX V2, V6, V6 + VX V3, V7, V7 + VSTM V4, V7, (R1)(R5*1) + LAY 64(R5), R5 + SUB $64, R4 + CMPBGE R4, $64, loop64b -loop16b: +less_than64: + CMPBEQ R4, $0, done // quick end + CMPBLT R4, $32, less_than32 + VLM (R2)(R5*1), V0, V1 + VLM (R3)(R5*1), V2, V3 + VX V0, V2, V2 + VX V1, V3, V3 + VSTM V2, V3, 0(R1)(R5*1) + LAY 32(R5), R5 + SUB $32, R4 + +less_than32: + CMPBLT R4, $16, less_than16 VL 0(R2)(R5*1), V0 VL 0(R3)(R5*1), V1 VX V0, V1, V2 VST V2, 0(R1)(R5*1) LAY 16(R5), R5 SUB $16, R4 - CMPBGE R4, $16, loop16b less_than16: CMPBLT R4, $8, tail