diff --git a/internal/subtle/xor_amd64.s b/internal/subtle/xor_amd64.s
index 009e9c44..f567f9bf 100644
--- a/internal/subtle/xor_amd64.s
+++ b/internal/subtle/xor_amd64.s
@@ -24,6 +24,7 @@ non_avx2:
 aligned:
 	MOVQ $0, AX // position in slices
 
+	PCALIGN $16
 loop16b:
 	MOVOU (SI)(AX*1), X0 // XOR 16byte forwards.
 	MOVOU (CX)(AX*1), X1
@@ -34,6 +35,7 @@ loop16b:
 	JNE loop16b
 	RET
 
+	PCALIGN $16
 loop_1b:
 	SUBQ $1, DX // XOR 1byte backwards.
 	MOVB (SI)(DX*1), DI
@@ -62,34 +64,34 @@ ret:
 	RET
 
 avx2:
-	TESTQ $31, DX  // AND 31 & len, if not zero jump to avx2_not_aligned.
+	TESTQ $31, DX // AND 31 & len, if not zero jump to avx2_not_aligned.
 	JNZ avx2_not_aligned
 
-avx2_aligned:
+avx2_aligned: // input length = 16*n, where n is greater or equal 2.
 	TESTQ $16, DX // AND 16 & len, if zero jump to loop32b_start.
 	JE loop32b_start
 	SUBQ $16, DX // XOR 16bytes backwards.
-	MOVOU (SI)(DX*1), X0
-	MOVOU (CX)(DX*1), X1
-	PXOR X1, X0
-	MOVOU X0, (BX)(DX*1)
-	CMPQ DX, $0 // if len is 0, ret.
-	JE avx2_ret
+	VMOVDQU (SI)(DX*1), X0
+	VPXOR (CX)(DX*1), X0, X0
+	VMOVDQU X0, (BX)(DX*1)
 
 loop32b_start:
 	MOVQ $0, AX // position in slices
 
+	PCALIGN $32
 loop32b:
 	VMOVDQU (SI)(AX*1), Y0 // XOR 32byte forwards.
-	VMOVDQU (CX)(AX*1), Y1
-	VPXOR Y0, Y1, Y0
+	VPXOR (CX)(AX*1), Y0, Y0
 	VMOVDQU Y0, (BX)(AX*1)
 	ADDQ $32, AX
 	CMPQ DX, AX
 	JNE loop32b
+
+avx2_ret:
 	VZEROUPPER
 	RET
 
+	PCALIGN $16
 avx2_loop_1b:
 	SUBQ $1, DX // XOR 1byte backwards.
 	MOVB (SI)(DX*1), DI
@@ -98,25 +100,17 @@ avx2_loop_1b:
 	MOVB DI, (BX)(DX*1)
 	TESTQ $7, DX // AND 7 & len, if not zero jump to avx2_loop_1b.
 	JNZ avx2_loop_1b
-	CMPQ DX, $0 // if len is 0, ret.
-	JE avx2_ret
 	TESTQ $15, DX // AND 15 & len, if zero jump to aligned.
 	JZ avx2_aligned
 
 avx2_not_aligned:
 	TESTQ $7, DX // AND $7 & len, if not zero jump to avx2_loop_1b.
 	JNE avx2_loop_1b
-	TESTQ $8, DX // AND $8 & len, if zero jump to avx2_16b.
-	JE avx2_16b
+	TESTQ $8, DX // AND $8 & len, if zero jump to avx2_aligned.
+	JE avx2_aligned
 	SUBQ $8, DX // XOR 8bytes backwards.
 	MOVQ (SI)(DX*1), DI
 	MOVQ (CX)(DX*1), AX
 	XORQ AX, DI
 	MOVQ DI, (BX)(DX*1)
-avx2_16b:
-	CMPQ DX, $16 // if len is greater or equal 16 here, it must be aligned.
-	JGE avx2_aligned
-
-avx2_ret:
-	VZEROUPPER
-	RET
+	JMP avx2_aligned
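
For review context: the hunks above change the AVX2 path of this amd64 XOR routine in three ways visible in the diff itself. Separate load-then-XOR pairs (MOVOU/PXOR, VMOVDQU/VPXOR) are folded into single VPXOR instructions with a memory operand, the avx2_ret: epilogue moves to the loop fall-through so the aligned path no longer needs its CMPQ DX, $0 early-out, and PCALIGN directives align the hot loop entry points on 16- or 32-byte boundaries.

The exported Go wrapper around this assembly is not shown in the hunks, so the sketch below uses a hypothetical standalone function, xorBytesRef, purely to pin down the contract the routine must preserve (dst[i] = x[i] ^ y[i] for i in [0, n)) and to exercise the length classes the assembly dispatches on. It is a reference harness, not the package's API.

	// Standalone pure-Go reference for the patched routine.
	// xorBytesRef is a hypothetical name; the diff does not show
	// the exported wrapper that calls the assembly.
	package main

	import (
		"bytes"
		"crypto/rand"
		"fmt"
	)

	// xorBytesRef computes dst[i] = x[i] ^ y[i] for i in [0, n),
	// which the amd64 assembly implements in 32/16/8/1-byte chunks.
	func xorBytesRef(dst, x, y []byte, n int) {
		for i := 0; i < n; i++ {
			dst[i] = x[i] ^ y[i]
		}
	}

	func main() {
		// Cover the tails and chunk sizes the assembly branches on:
		// 1-byte loop, 8-byte step, 16-byte step, 32-byte main loop.
		for _, n := range []int{1, 7, 8, 15, 16, 31, 32, 96, 1000} {
			x := make([]byte, n)
			y := make([]byte, n)
			rand.Read(x)
			rand.Read(y)

			dst := make([]byte, n)
			xorBytesRef(dst, x, y, n)

			// XOR is self-inverse: (x ^ y) ^ y == x.
			back := make([]byte, n)
			xorBytesRef(back, dst, y, n)
			if !bytes.Equal(back, x) {
				fmt.Println("mismatch at n =", n)
			}
		}
		fmt.Println("ok")
	}

A harness along these lines, run against the real wrapper built with the old and new assembly, is one way to confirm that folding the loads into VPXOR and relocating avx2_ret: did not change behavior for any residue class of the length.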