// Patch summary (sm4/cbc_amd64.s, sm4/gcm_amd64.s, sm4/xts_amd64.s).
//
// Every hunk below replaces an instruction of the form
//     PXOR <mem>, <xmm>
// with a two-instruction sequence that first loads the 16-byte memory
// operand into a scratch register and then XORs register-to-register:
//     MOVOU/MOVUPS <mem>, <scratch>
//     PXOR         <scratch>, <xmm>
// Offsets, base registers and destination registers are carried over
// one-for-one from the removed lines.
//
//   - cbc_amd64.s: the scratch register is XWTMP0. The cbcSm4Octets hunk loads
//     with MOVOU; the remaining cbc hunks load with MOVUPS.
//   - gcm_amd64.s: the scratch register is T0, loaded with MOVOU
//     (gcmSm4EncOctetsLoop, gcmSm4EncNibbles, gcmSm4DecOctetsLoop).
//   - xts_amd64.s: the sseLoad4Blocks / sseStore4Blocks / sseLoad8Blocks /
//     sseStore8Blocks macros use T0 as the scratch register, loaded with MOVOU.
//
// NOTE(review): presumably the motivation is that the legacy-SSE encoding of
// PXOR with a memory source requires a 16-byte-aligned address, while
// MOVOU/MOVUPS do not -- confirm against the commit message.
// NOTE(review): the scratch register is overwritten at every rewritten site.
// In the cbc and gcm hunks the same register is passed to the SM4_* macro
// invoked just before; for the xts macros, verify that T0 is not live at any
// expansion site, since the macros now clobber it.
diff --git a/sm4/cbc_amd64.s b/sm4/cbc_amd64.s index ca1618b6..e256caa8 100644 --- a/sm4/cbc_amd64.s +++ b/sm4/cbc_amd64.s @@ -151,15 +151,23 @@ cbcSm4Octets: MOVOU 112(DX), XWORD7 SM4_8BLOCKS(AX, XWORD, YWORD, XWTMP0, XWTMP1, XWORD0, XWORD1, XWORD2, XWORD3, XWORD4, XWORD5, XWORD6, XWORD7) - - PXOR -16(DX), XWORD0 - PXOR 0(DX), XWORD1 - PXOR 16(DX), XWORD2 - PXOR 32(DX), XWORD3 - PXOR 48(DX), XWORD4 - PXOR 64(DX), XWORD5 - PXOR 80(DX), XWORD6 - PXOR 96(DX), XWORD7 + + MOVOU -16(DX), XWTMP0 + PXOR XWTMP0, XWORD0 + MOVOU 0(DX), XWTMP0 + PXOR XWTMP0, XWORD1 + MOVOU 16(DX), XWTMP0 + PXOR XWTMP0, XWORD2 + MOVOU 32(DX), XWTMP0 + PXOR XWTMP0, XWORD3 + MOVOU 48(DX), XWTMP0 + PXOR XWTMP0, XWORD4 + MOVOU 64(DX), XWTMP0 + PXOR XWTMP0, XWORD5 + MOVOU 80(DX), XWTMP0 + PXOR XWTMP0, XWORD6 + MOVOU 96(DX), XWTMP0 + PXOR XWTMP0, XWORD7 MOVOU XWORD0, 0(BX) MOVOU XWORD1, 16(BX) @@ -186,10 +194,14 @@ cbcSm4Nibbles: SM4_4BLOCKS(AX, XWORD, YWORD, XWTMP0, XWTMP1, XWORD0, XWORD1, XWORD2, XWORD3) - PXOR -16(DX), XWORD0 - PXOR 0(DX), XWORD1 - PXOR 16(DX), XWORD2 - PXOR 32(DX), XWORD3 + MOVUPS -16(DX), XWTMP0 + PXOR XWTMP0, XWORD0 + MOVUPS 0(DX), XWTMP0 + PXOR XWTMP0, XWORD1 + MOVUPS 16(DX), XWTMP0 + PXOR XWTMP0, XWORD2 + MOVUPS 32(DX), XWTMP0 + PXOR XWTMP0, XWORD3 MOVUPS XWORD0, 0(BX) MOVUPS XWORD1, 16(BX) @@ -213,10 +225,14 @@ cbCSm4Single: SM4_4BLOCKS(AX, XWORD, YWORD, XWTMP0, XWTMP1, XWORD0, XWORD1, XWORD2, XWORD3) - PXOR 0(SI), XWORD0 - PXOR -64(DX), XWORD1 - PXOR -48(DX), XWORD2 - PXOR -32(DX), XWORD3 + MOVUPS 0(SI), XWTMP0 + PXOR XWTMP0, XWORD0 + MOVUPS -64(DX), XWTMP0 + PXOR XWTMP0, XWORD1 + MOVUPS -48(DX), XWTMP0 + PXOR XWTMP0, XWORD2 + MOVUPS -32(DX), XWTMP0 + PXOR XWTMP0, XWORD3 MOVUPS XWORD0, -64(BX) MOVUPS XWORD1, -48(BX) @@ -230,7 +246,8 @@ cbcSm4Single16: SM4_SINGLE_BLOCK(AX, XWORD, YWORD, XWTMP0, XWTMP1, XWORD0, XWORD1, XWORD2, XWORD3) - PXOR 0(SI), XWORD0 + MOVUPS 0(SI), XWTMP0 + PXOR XWTMP0, XWORD0 MOVUPS XWORD0, -16(BX) @@ -242,8 +259,10 @@ cbcSm4Single32: SM4_4BLOCKS(AX, 
XWORD, YWORD, XWTMP0, XWTMP1, XWORD0, XWORD1, XWORD2, XWORD3) - PXOR 0(SI), XWORD0 - PXOR -32(DX), XWORD1 + MOVUPS 0(SI), XWTMP0 + PXOR XWTMP0, XWORD0 + MOVUPS -32(DX), XWTMP0 + PXOR XWTMP0, XWORD1 MOVUPS XWORD0, -32(BX) MOVUPS XWORD1, -16(BX) @@ -257,9 +276,12 @@ cbcSm4Single48: SM4_4BLOCKS(AX, XWORD, YWORD, XWTMP0, XWTMP1, XWORD0, XWORD1, XWORD2, XWORD3) - PXOR 0(SI), XWORD0 - PXOR -48(DX), XWORD1 - PXOR -32(DX), XWORD2 + MOVUPS 0(SI), XWTMP0 + PXOR XWTMP0, XWORD0 + MOVUPS -48(DX), XWTMP0 + PXOR XWTMP0, XWORD1 + MOVUPS -32(DX), XWTMP0 + PXOR XWTMP0, XWORD2 MOVUPS XWORD0, -48(BX) MOVUPS XWORD1, -32(BX) diff --git a/sm4/gcm_amd64.s b/sm4/gcm_amd64.s index 67e46a14..f46d163a 100644 --- a/sm4/gcm_amd64.s +++ b/sm4/gcm_amd64.s @@ -677,14 +677,22 @@ gcmSm4EncOctetsLoop: SM4_8BLOCKS_WO_BS(rk, ACC1, T0, T1, T2, B0, B1, B2, B3, B4, B5, B6, B7) - PXOR (16*0)(ptx), B0 - PXOR (16*1)(ptx), B1 - PXOR (16*2)(ptx), B2 - PXOR (16*3)(ptx), B3 - PXOR (16*4)(ptx), B4 - PXOR (16*5)(ptx), B5 - PXOR (16*6)(ptx), B6 - PXOR (16*7)(ptx), B7 + MOVOU (16*0)(ptx), T0 + PXOR T0, B0 + MOVOU (16*1)(ptx), T0 + PXOR T0, B1 + MOVOU (16*2)(ptx), T0 + PXOR T0, B2 + MOVOU (16*3)(ptx), T0 + PXOR T0, B3 + MOVOU (16*4)(ptx), T0 + PXOR T0, B4 + MOVOU (16*5)(ptx), T0 + PXOR T0, B5 + MOVOU (16*6)(ptx), T0 + PXOR T0, B6 + MOVOU (16*7)(ptx), T0 + PXOR T0, B7 MOVOU B0, (16*0)(ctx) PSHUFB BSWAP, B0 @@ -765,10 +773,14 @@ gcmSm4EncNibbles: MOVOU (8*16 + 3*16)(SP), B3 SM4_4BLOCKS_WO_BS(AX, B4, T0, T1, T2, B0, B1, B2, B3) - PXOR (16*0)(ptx), B0 - PXOR (16*1)(ptx), B1 - PXOR (16*2)(ptx), B2 - PXOR (16*3)(ptx), B3 + MOVOU (16*0)(ptx), T0 + PXOR T0, B0 + MOVOU (16*1)(ptx), T0 + PXOR T0, B1 + MOVOU (16*2)(ptx), T0 + PXOR T0, B2 + MOVOU (16*3)(ptx), T0 + PXOR T0, B3 MOVOU B0, (16*0)(ctx) MOVOU B1, (16*1)(ctx) @@ -1683,14 +1695,22 @@ gcmSm4DecOctetsLoop: SM4_8BLOCKS_WO_BS(rk, ACC1, T0, T1, T2, B0, B1, B2, B3, B4, B5, B6, B7) - PXOR (16*0)(ctx), B0 - PXOR (16*1)(ctx), B1 - PXOR (16*2)(ctx), B2 - PXOR (16*3)(ctx), B3 - 
PXOR (16*4)(ctx), B4 - PXOR (16*5)(ctx), B5 - PXOR (16*6)(ctx), B6 - PXOR (16*7)(ctx), B7 + MOVOU (16*0)(ctx), T0 + PXOR T0, B0 + MOVOU (16*1)(ctx), T0 + PXOR T0, B1 + MOVOU (16*2)(ctx), T0 + PXOR T0, B2 + MOVOU (16*3)(ctx), T0 + PXOR T0, B3 + MOVOU (16*4)(ctx), T0 + PXOR T0, B4 + MOVOU (16*5)(ctx), T0 + PXOR T0, B5 + MOVOU (16*6)(ctx), T0 + PXOR T0, B6 + MOVOU (16*7)(ctx), T0 + PXOR T0, B7 MOVOU B0, (16*0)(ptx) MOVOU B1, (16*1)(ptx) diff --git a/sm4/xts_amd64.s b/sm4/xts_amd64.s index 6f848ffc..8f1e5825 100644 --- a/sm4/xts_amd64.s +++ b/sm4/xts_amd64.s @@ -210,44 +210,60 @@ GLOBL gbGcmPoly<>(SB), (NOPTR+RODATA), $16 #define sseLoad4Blocks \ MOVOU (16*0)(DX), B0; \ - PXOR (16*0)(SP), B0; \ + MOVOU (16*0)(SP), T0; \ + PXOR T0, B0; \ MOVOU (16*1)(DX), B1; \ - PXOR (16*1)(SP), B1; \ + MOVOU (16*1)(SP), T0; \ + PXOR T0, B1; \ MOVOU (16*2)(DX), B2; \ - PXOR (16*2)(SP), B2; \ + MOVOU (16*2)(SP), T0; \ + PXOR T0, B2; \ MOVOU (16*3)(DX), B3; \ - PXOR (16*3)(SP), B3 + MOVOU (16*3)(SP), T0; \ + PXOR T0, B3 #define sseStore4Blocks \ - PXOR (16*0)(SP), B0; \ + MOVOU (16*0)(SP), T0; \ + PXOR T0, B0; \ MOVOU B0, (16*0)(CX); \ - PXOR (16*1)(SP), B1; \ + MOVOU (16*1)(SP), T0; \ + PXOR T0, B1; \ MOVOU B1, (16*1)(CX); \ - PXOR (16*2)(SP), B2; \ + MOVOU (16*2)(SP), T0; \ + PXOR T0, B2; \ MOVOU B2, (16*2)(CX); \ - PXOR (16*3)(SP), B3; \ + MOVOU (16*3)(SP), T0; \ + PXOR T0, B3; \ MOVOU B3, (16*3)(CX) #define sseLoad8Blocks \ sseLoad4Blocks; \ MOVOU (16*4)(DX), B4; \ - PXOR (16*4)(SP), B4; \ + MOVOU (16*4)(SP), T0; \ + PXOR T0, B4; \ MOVOU (16*5)(DX), B5; \ - PXOR (16*5)(SP), B5; \ + MOVOU (16*5)(SP), T0; \ + PXOR T0, B5; \ MOVOU (16*6)(DX), B6; \ - PXOR (16*6)(SP), B6; \ + MOVOU (16*6)(SP), T0; \ + PXOR T0, B6; \ MOVOU (16*7)(DX), B7; \ - PXOR (16*7)(SP), B7 + MOVOU (16*7)(SP), T0; \ + PXOR T0, B7 #define sseStore8Blocks \ sseStore4Blocks; \ - PXOR (16*4)(SP), B4; \ + MOVOU (16*4)(SP), T0; \ + PXOR T0, B4; \ MOVOU B4, (16*4)(CX); \ - PXOR (16*5)(SP), B5; \ + MOVOU (16*5)(SP), T0; \ + 
PXOR T0, B5; \ MOVOU B5, (16*5)(CX); \ - PXOR (16*6)(SP), B6; \ + MOVOU (16*6)(SP), T0; \ + PXOR T0, B6; \ MOVOU B6, (16*6)(CX); \ - PXOR (16*7)(SP), B7; \ + MOVOU (16*7)(SP), T0; \ + PXOR T0, B7; \ MOVOU B7, (16*7)(CX) #define avxLoad4Blocks \