diff --git a/sm4/aesni_macros_amd64.s b/sm4/aesni_macros_amd64.s
index cd0f39b6..4e0e6cc9 100644
--- a/sm4/aesni_macros_amd64.s
+++ b/sm4/aesni_macros_amd64.s
@@ -20,7 +20,7 @@ DATA inverse_shift_rows<>+0x10(SB)/8, $0x0B0E0104070A0D00
 DATA inverse_shift_rows<>+0x18(SB)/8, $0x0306090C0F020508
 GLOBL inverse_shift_rows<>(SB), 8, $32
 
-// Affine transform 1 (low and high hibbles)
+// Affine transform 1 (low and high nibbles)
 DATA m1_low<>+0x00(SB)/8, $0x0A7FC3B6D5A01C69
 DATA m1_low<>+0x08(SB)/8, $0x3045F98CEF9A2653
 DATA m1_low<>+0x10(SB)/8, $0x0A7FC3B6D5A01C69
@@ -33,7 +33,7 @@ DATA m1_high<>+0x10(SB)/8, $0xC35BF46CAF379800
 DATA m1_high<>+0x18(SB)/8, $0x68F05FC7049C33AB
 GLOBL m1_high<>(SB), 8, $32
 
-// Affine transform 2 (low and high hibbles)
+// Affine transform 2 (low and high nibbles)
 DATA m2_low<>+0x00(SB)/8, $0x9A950A05FEF16E61
 DATA m2_low<>+0x08(SB)/8, $0x0E019E916A65FAF5
 DATA m2_low<>+0x10(SB)/8, $0x9A950A05FEF16E61
@@ -270,10 +270,10 @@ GLOBL fk_mask<>(SB), 8, $16
 SM4_4BLOCKS_4ROUNDS(rk128, x, y, z, t0, t1, t2, t3); \
 MOVOU (7*16)(RK), rk128; \
 SM4_4BLOCKS_4ROUNDS(rk128, x, y, z, t0, t1, t2, t3); \
- SSE_TRANSPOSE_MATRIX(t0, t1, t2, t3, x, y); \
- PSHUFB bswap_mask<>(SB), t3; \
- PSHUFB bswap_mask<>(SB), t2; \
- PSHUFB bswap_mask<>(SB), t1; \
+ SSE_TRANSPOSE_MATRIX(t0, t1, t2, t3, x, y); \
+ PSHUFB bswap_mask<>(SB), t3; \
+ PSHUFB bswap_mask<>(SB), t2; \
+ PSHUFB bswap_mask<>(SB), t1; \
 PSHUFB bswap_mask<>(SB), t0
 
 #define SM4_8BLOCKS_4ROUNDS(rk128, x, y, z, t0, t1, t2, t3, t4, t5, t6, t7) \
@@ -306,33 +306,33 @@ GLOBL fk_mask<>(SB), 8, $16
 SM4_8BLOCKS_WO_BS(RK, rk128, x, y, z, t0, t1, t2, t3, t4, t5, t6, t7)
 
 #define SM4_8BLOCKS_WO_BS(RK, rk128, x, y, z, t0, t1, t2, t3, t4, t5, t6, t7) \
- SSE_TRANSPOSE_MATRIX(t0, t1, t2, t3, x, y); \
- SSE_TRANSPOSE_MATRIX(t4, t5, t6, t7, x, y); \
- MOVOU (0*16)(RK), rk128; \
- SM4_8BLOCKS_4ROUNDS(rk128, x, y, z, t0, t1, t2, t3, t4, t5, t6, t7); \
- MOVOU (1*16)(RK), rk128; \
- SM4_8BLOCKS_4ROUNDS(rk128, x, y, z, t0, t1, t2, t3, t4, t5, t6, t7); \
- MOVOU (2*16)(RK), rk128; \
- SM4_8BLOCKS_4ROUNDS(rk128, x, y, z, t0, t1, t2, t3, t4, t5, t6, t7); \
- MOVOU (3*16)(RK), rk128; \
- SM4_8BLOCKS_4ROUNDS(rk128, x, y, z, t0, t1, t2, t3, t4, t5, t6, t7); \
- MOVOU (4*16)(RK), rk128; \
- SM4_8BLOCKS_4ROUNDS(rk128, x, y, z, t0, t1, t2, t3, t4, t5, t6, t7); \
- MOVOU (5*16)(RK), rk128; \
- SM4_8BLOCKS_4ROUNDS(rk128, x, y, z, t0, t1, t2, t3, t4, t5, t6, t7); \
- MOVOU (6*16)(RK), rk128; \
- SM4_8BLOCKS_4ROUNDS(rk128, x, y, z, t0, t1, t2, t3, t4, t5, t6, t7); \
- MOVOU (7*16)(RK), rk128; \
- SM4_8BLOCKS_4ROUNDS(rk128, x, y, z, t0, t1, t2, t3, t4, t5, t6, t7); \
- SSE_TRANSPOSE_MATRIX(t0, t1, t2, t3, x, y); \
- SSE_TRANSPOSE_MATRIX(t4, t5, t6, t7, x, y); \
- PSHUFB bswap_mask<>(SB), t3; \
- PSHUFB bswap_mask<>(SB), t2; \
- PSHUFB bswap_mask<>(SB), t1; \
- PSHUFB bswap_mask<>(SB), t0; \
- PSHUFB bswap_mask<>(SB), t7; \
- PSHUFB bswap_mask<>(SB), t6; \
- PSHUFB bswap_mask<>(SB), t5; \
+ SSE_TRANSPOSE_MATRIX(t0, t1, t2, t3, x, y); \
+ SSE_TRANSPOSE_MATRIX(t4, t5, t6, t7, x, y); \
+ MOVOU (0*16)(RK), rk128; \
+ SM4_8BLOCKS_4ROUNDS(rk128, x, y, z, t0, t1, t2, t3, t4, t5, t6, t7); \
+ MOVOU (1*16)(RK), rk128; \
+ SM4_8BLOCKS_4ROUNDS(rk128, x, y, z, t0, t1, t2, t3, t4, t5, t6, t7); \
+ MOVOU (2*16)(RK), rk128; \
+ SM4_8BLOCKS_4ROUNDS(rk128, x, y, z, t0, t1, t2, t3, t4, t5, t6, t7); \
+ MOVOU (3*16)(RK), rk128; \
+ SM4_8BLOCKS_4ROUNDS(rk128, x, y, z, t0, t1, t2, t3, t4, t5, t6, t7); \
+ MOVOU (4*16)(RK), rk128; \
+ SM4_8BLOCKS_4ROUNDS(rk128, x, y, z, t0, t1, t2, t3, t4, t5, t6, t7); \
+ MOVOU (5*16)(RK), rk128; \
+ SM4_8BLOCKS_4ROUNDS(rk128, x, y, z, t0, t1, t2, t3, t4, t5, t6, t7); \
+ MOVOU (6*16)(RK), rk128; \
+ SM4_8BLOCKS_4ROUNDS(rk128, x, y, z, t0, t1, t2, t3, t4, t5, t6, t7); \
+ MOVOU (7*16)(RK), rk128; \
+ SM4_8BLOCKS_4ROUNDS(rk128, x, y, z, t0, t1, t2, t3, t4, t5, t6, t7); \
+ SSE_TRANSPOSE_MATRIX(t0, t1, t2, t3, x, y); \
+ SSE_TRANSPOSE_MATRIX(t4, t5, t6, t7, x, y); \
+ PSHUFB bswap_mask<>(SB), t3; \
+ PSHUFB bswap_mask<>(SB), t2; \
+ PSHUFB bswap_mask<>(SB), t1; \
+ PSHUFB bswap_mask<>(SB), t0; \
+ PSHUFB bswap_mask<>(SB), t7; \
+ PSHUFB bswap_mask<>(SB), t6; \
+ PSHUFB bswap_mask<>(SB), t5; \
 PSHUFB bswap_mask<>(SB), t4
 
 // SM4 sbox function, AVX version
@@ -366,7 +366,7 @@ GLOBL fk_mask<>(SB), 8, $16
 // - y: 128 bits temp register
 // - tmp: 128 bits temp register
 #define AVX_SM4_TAO_L1(x, y, tmp) \
- AVX_SM4_SBOX(x, y, tmp); \
+ AVX_SM4_SBOX(x, y, tmp); \
 VPSHUFB r08_mask<>(SB), x, y; \ // y = x <<< 8
 VPSHUFB r08_mask<>(SB), y, tmp; \ // tmp = x <<< 16
 VPXOR x, y, y; \ // y = x ^ (x <<< 8)
@@ -408,7 +408,7 @@ GLOBL fk_mask<>(SB), 8, $16
 // - t3: 128 bits register for data
 #define AVX_SM4_ROUND(index, RK, IND, x, y, tmp, t0, t1, t2, t3) \
 MOVL (index * 4)(RK)(IND*1), x; \
- VPSHUFD $0, x, x; \
+ VPSHUFD $0, x, x; \ // Use VBROADCASTSS ?
 VPXOR t1, x, x; \
 VPXOR t2, x, x; \
 VPXOR t3, x, x; \
@@ -434,15 +434,15 @@ GLOBL fk_mask<>(SB), 8, $16
 SM4_ONE_ROUND_AVX(x, y, z, t3, t0, t1, t2); \
 
 #define AVX_SM4_4BLOCKS(RK, rk128, x, y, z, t0, t1, t2, t3) \
- VPSHUFB flip_mask<>(SB), t0, t0 \
- VPSHUFB flip_mask<>(SB), t1, t1 \
- VPSHUFB flip_mask<>(SB), t2, t2 \
- VPSHUFB flip_mask<>(SB), t3, t3 \
- ; \
+ VPSHUFB flip_mask<>(SB), t0, t0 \
+ VPSHUFB flip_mask<>(SB), t1, t1 \
+ VPSHUFB flip_mask<>(SB), t2, t2 \
+ VPSHUFB flip_mask<>(SB), t3, t3 \
+ ; \
 AVX_SM4_4BLOCKS_WO_BS(RK, rk128, x, y, z, t0, t1, t2, t3)
 
 #define AVX_SM4_4BLOCKS_WO_BS(RK, rk128, x, y, z, t0, t1, t2, t3) \
- TRANSPOSE_MATRIX(t0, t1, t2, t3, x, y) \
+ TRANSPOSE_MATRIX(t0, t1, t2, t3, x, y) \
 VMOVDQU (0*16)(RK), rk128; \
 SM4_4BLOCKS_4ROUNDS_AVX(rk128, x, y, z, t0, t1, t2, t3); \
 VMOVDQU (1*16)(RK), rk128; \
@@ -493,31 +493,31 @@ GLOBL fk_mask<>(SB), 8, $16
 VPSHUFB flip_mask<>(SB), t5, t5 \
 VPSHUFB flip_mask<>(SB), t6, t6 \
 VPSHUFB flip_mask<>(SB), t7, t7 \
- ; \
+ ; \
 AVX_SM4_8BLOCKS_WO_BS(RK, rk128, x, y, z, t0, t1, t2, t3, t4, t5, t6, t7)
 
 #define AVX_SM4_8BLOCKS_WO_BS(RK, rk128, x, y, z, t0, t1, t2, t3, t4, t5, t6, t7) \
- TRANSPOSE_MATRIX(t0, t1, t2, t3, x, y) \
- TRANSPOSE_MATRIX(t4, t5, t6, t7, x, y) \
- VMOVDQU (0*16)(RK), rk128; \
- SM4_8BLOCKS_4ROUNDS_AVX(rk128, x, y, z, t0, t1, t2, t3, t4, t5, t6, t7); \
- VMOVDQU (1*16)(RK), rk128; \
- SM4_8BLOCKS_4ROUNDS_AVX(rk128, x, y, z, t0, t1, t2, t3, t4, t5, t6, t7); \
- VMOVDQU (2*16)(RK), rk128; \
- SM4_8BLOCKS_4ROUNDS_AVX(rk128, x, y, z, t0, t1, t2, t3, t4, t5, t6, t7); \
- VMOVDQU (3*16)(RK), rk128; \
- SM4_8BLOCKS_4ROUNDS_AVX(rk128, x, y, z, t0, t1, t2, t3, t4, t5, t6, t7); \
- VMOVDQU (4*16)(RK), rk128; \
- SM4_8BLOCKS_4ROUNDS_AVX(rk128, x, y, z, t0, t1, t2, t3, t4, t5, t6, t7); \
- VMOVDQU (5*16)(RK), rk128; \
- SM4_8BLOCKS_4ROUNDS_AVX(rk128, x, y, z, t0, t1, t2, t3, t4, t5, t6, t7); \
- VMOVDQU (6*16)(RK), rk128; \
- SM4_8BLOCKS_4ROUNDS_AVX(rk128, x, y, z, t0, t1, t2, t3, t4, t5, t6, t7); \
- VMOVDQU (7*16)(RK), rk128; \
- SM4_8BLOCKS_4ROUNDS_AVX(rk128, x, y, z, t0, t1, t2, t3, t4, t5, t6, t7); \
+ TRANSPOSE_MATRIX(t0, t1, t2, t3, x, y) \
+ TRANSPOSE_MATRIX(t4, t5, t6, t7, x, y) \
+ VMOVDQU (0*16)(RK), rk128; \
+ SM4_8BLOCKS_4ROUNDS_AVX(rk128, x, y, z, t0, t1, t2, t3, t4, t5, t6, t7); \
+ VMOVDQU (1*16)(RK), rk128; \
+ SM4_8BLOCKS_4ROUNDS_AVX(rk128, x, y, z, t0, t1, t2, t3, t4, t5, t6, t7); \
+ VMOVDQU (2*16)(RK), rk128; \
+ SM4_8BLOCKS_4ROUNDS_AVX(rk128, x, y, z, t0, t1, t2, t3, t4, t5, t6, t7); \
+ VMOVDQU (3*16)(RK), rk128; \
+ SM4_8BLOCKS_4ROUNDS_AVX(rk128, x, y, z, t0, t1, t2, t3, t4, t5, t6, t7); \
+ VMOVDQU (4*16)(RK), rk128; \
+ SM4_8BLOCKS_4ROUNDS_AVX(rk128, x, y, z, t0, t1, t2, t3, t4, t5, t6, t7); \
+ VMOVDQU (5*16)(RK), rk128; \
+ SM4_8BLOCKS_4ROUNDS_AVX(rk128, x, y, z, t0, t1, t2, t3, t4, t5, t6, t7); \
+ VMOVDQU (6*16)(RK), rk128; \
+ SM4_8BLOCKS_4ROUNDS_AVX(rk128, x, y, z, t0, t1, t2, t3, t4, t5, t6, t7); \
+ VMOVDQU (7*16)(RK), rk128; \
+ SM4_8BLOCKS_4ROUNDS_AVX(rk128, x, y, z, t0, t1, t2, t3, t4, t5, t6, t7); \
 ; \ // Transpose matrix 4 x 4 32bits word
- TRANSPOSE_MATRIX(t0, t1, t2, t3, x, y) \
- TRANSPOSE_MATRIX(t4, t5, t6, t7, x, y) \
+ TRANSPOSE_MATRIX(t0, t1, t2, t3, x, y) \
+ TRANSPOSE_MATRIX(t4, t5, t6, t7, x, y) \
 VPSHUFB bswap_mask<>(SB), t0, t0 \
 VPSHUFB bswap_mask<>(SB), t1, t1 \
 VPSHUFB bswap_mask<>(SB), t2, t2 \
@@ -538,24 +538,24 @@ GLOBL fk_mask<>(SB), 8, $16
 // - yNibbleMask: 256 bits register stored nibble mask, should be loaded earlier.
 #define AVX2_SM4_SBOX(x, y, z, xw, yw, xNibbleMask, yNibbleMask) \
 VPAND yNibbleMask, x, z; \
- VMOVDQU m1_low<>(SB), y; \
+ VMOVDQU m1_low<>(SB), y; \
 VPSHUFB z, y, y; \
 VPSRLQ $4, x, x; \
 VPAND yNibbleMask, x, x; \
- VMOVDQU m1_high<>(SB), z; \
+ VMOVDQU m1_high<>(SB), z; \
 VPSHUFB x, z, x; \
 VPXOR y, x, x; \
- VPSHUFB inverse_shift_rows<>(SB), x, x; \
+ VPSHUFB inverse_shift_rows<>(SB), x, x; \
 VEXTRACTI128 $1, x, yw \
 VAESENCLAST xNibbleMask, xw, xw; \
 VAESENCLAST xNibbleMask, yw, yw; \
 VINSERTI128 $1, yw, x, x; \
 VPANDN yNibbleMask, x, z; \
- VMOVDQU m2_low<>(SB), y; \
+ VMOVDQU m2_low<>(SB), y; \
 VPSHUFB z, y, y; \
 VPSRLQ $4, x, x; \
 VPAND yNibbleMask, x, x; \
- VMOVDQU m2_high<>(SB), z; \
+ VMOVDQU m2_high<>(SB), z; \
 VPSHUFB x, z, x; \
 VPXOR y, x, x
 
@@ -570,11 +570,11 @@ GLOBL fk_mask<>(SB), 8, $16
 // - yNibbleMask: 256 bits register stored nibble mask, should be loaded earlier.
 #define AVX2_SM4_TAO_L1(x, y, z, xw, yw, xNibbleMask, yNibbleMask) \
 AVX2_SM4_SBOX(x, y, z, xw, yw, xNibbleMask, yNibbleMask); \
- VPSHUFB r08_mask<>(SB), x, y; \ // y = x <<< 8
- VPSHUFB r08_mask<>(SB), y, z; \ // z = x <<< 16
+ VPSHUFB r08_mask<>(SB), x, y; \ // y = x <<< 8
+ VPSHUFB r08_mask<>(SB), y, z; \ // z = x <<< 16
 VPXOR x, y, y; \ // y = x ^ (x <<< 8)
 VPXOR z, y, y; \ // y = x ^ (x <<< 8) ^ (x <<< 16)
- VPSHUFB r08_mask<>(SB), z, z; \ // z = x <<< 24
+ VPSHUFB r08_mask<>(SB), z, z; \ // z = x <<< 24
 VPXOR x, z, x; \ // x = x ^ (x <<< 24)
 VPSLLD $2, y, z; \
 VPSRLD $30, y, y; \
@@ -611,10 +611,10 @@ GLOBL fk_mask<>(SB), 8, $16
 // - t3: 256 bits register for data
 #define AVX2_SM4_ROUND2(index, RK, x, y, xw, yw, tmp, t0, t1, t2, t3) \
 VPBROADCASTD (index * 4)(RK), x; \
- VPXOR t1, x, x; \
- VPXOR t2, x, x; \
- VPXOR t3, x, x; \
- AVX2_SM4_TAO_L1(x, y, tmp, xw, yw, X_NIBBLE_MASK, NIBBLE_MASK); \
+ VPXOR t1, x, x; \
+ VPXOR t2, x, x; \
+ VPXOR t3, x, x; \
+ AVX2_SM4_TAO_L1(x, y, tmp, xw, yw, X_NIBBLE_MASK, NIBBLE_MASK); \
 VPXOR x, t0, t0
 
 // SM4 round function, AVX version, handle 128 bits
@@ -675,6 +675,8 @@ GLOBL fk_mask<>(SB), 8, $16
 // - index: round key index immediate number
 // - x: 256 bits temp register, MUST use XDWORD!
 // - y: 256 bits temp register, MUST use YDWORD!
+// - tmp: 256 bits temp register
+// - tmp1: 256 bits temp register
 // - t0: 256 bits register for data as result
 // - t1: 256 bits register for data
 // - t2: 256 bits register for data
@@ -694,16 +696,16 @@ GLOBL fk_mask<>(SB), 8, $16
 VPXOR x, t4, t4; \
 
 #define AVX2_SM4_16BLOCKS(RK, x, y, xw, yw, tmp, tmp1, t0, t1, t2, t3, t4, t5, t6, t7) \
- AVX2_SM4_16BLOCKS_ROUND(0, RK, x, y, xw, yw, tmp, tmp1, t0, t1, t2, t3, t4, t5, t6, t7); \
- AVX2_SM4_16BLOCKS_ROUND(1, RK, x, y, xw, yw, tmp, tmp1, t1, t2, t3, t0, t5, t6, t7, t4); \
- AVX2_SM4_16BLOCKS_ROUND(2, RK, x, y, xw, yw, tmp, tmp1, t2, t3, t0, t1, t6, t7, t4, t5); \
- AVX2_SM4_16BLOCKS_ROUND(3, RK, x, y, xw, yw, tmp, tmp1, t3, t0, t1, t2, t7, t4, t5, t6); \
- AVX2_SM4_16BLOCKS_ROUND(4, RK, x, y, xw, yw, tmp, tmp1, t0, t1, t2, t3, t4, t5, t6, t7); \
- AVX2_SM4_16BLOCKS_ROUND(5, RK, x, y, xw, yw, tmp, tmp1, t1, t2, t3, t0, t5, t6, t7, t4); \
- AVX2_SM4_16BLOCKS_ROUND(6, RK, x, y, xw, yw, tmp, tmp1, t2, t3, t0, t1, t6, t7, t4, t5); \
- AVX2_SM4_16BLOCKS_ROUND(7, RK, x, y, xw, yw, tmp, tmp1, t3, t0, t1, t2, t7, t4, t5, t6); \
- AVX2_SM4_16BLOCKS_ROUND(8, RK, x, y, xw, yw, tmp, tmp1, t0, t1, t2, t3, t4, t5, t6, t7); \
- AVX2_SM4_16BLOCKS_ROUND(9, RK, x, y, xw, yw, tmp, tmp1, t1, t2, t3, t0, t5, t6, t7, t4); \
+ AVX2_SM4_16BLOCKS_ROUND(0, RK, x, y, xw, yw, tmp, tmp1, t0, t1, t2, t3, t4, t5, t6, t7); \
+ AVX2_SM4_16BLOCKS_ROUND(1, RK, x, y, xw, yw, tmp, tmp1, t1, t2, t3, t0, t5, t6, t7, t4); \
+ AVX2_SM4_16BLOCKS_ROUND(2, RK, x, y, xw, yw, tmp, tmp1, t2, t3, t0, t1, t6, t7, t4, t5); \
+ AVX2_SM4_16BLOCKS_ROUND(3, RK, x, y, xw, yw, tmp, tmp1, t3, t0, t1, t2, t7, t4, t5, t6); \
+ AVX2_SM4_16BLOCKS_ROUND(4, RK, x, y, xw, yw, tmp, tmp1, t0, t1, t2, t3, t4, t5, t6, t7); \
+ AVX2_SM4_16BLOCKS_ROUND(5, RK, x, y, xw, yw, tmp, tmp1, t1, t2, t3, t0, t5, t6, t7, t4); \
+ AVX2_SM4_16BLOCKS_ROUND(6, RK, x, y, xw, yw, tmp, tmp1, t2, t3, t0, t1, t6, t7, t4, t5); \
+ AVX2_SM4_16BLOCKS_ROUND(7, RK, x, y, xw, yw, tmp, tmp1, t3, t0, t1, t2, t7, t4, t5, t6); \
+ AVX2_SM4_16BLOCKS_ROUND(8, RK, x, y, xw, yw, tmp, tmp1, t0, t1, t2, t3, t4, t5, t6, t7); \
+ AVX2_SM4_16BLOCKS_ROUND(9, RK, x, y, xw, yw, tmp, tmp1, t1, t2, t3, t0, t5, t6, t7, t4); \
 AVX2_SM4_16BLOCKS_ROUND(10, RK, x, y, xw, yw, tmp, tmp1, t2, t3, t0, t1, t6, t7, t4, t5); \
 AVX2_SM4_16BLOCKS_ROUND(11, RK, x, y, xw, yw, tmp, tmp1, t3, t0, t1, t2, t7, t4, t5, t6); \
 AVX2_SM4_16BLOCKS_ROUND(12, RK, x, y, xw, yw, tmp, tmp1, t0, t1, t2, t3, t4, t5, t6, t7); \
diff --git a/sm4/cbc_amd64.s b/sm4/cbc_amd64.s
index 5c605925..875a24fd 100644
--- a/sm4/cbc_amd64.s
+++ b/sm4/cbc_amd64.s
@@ -11,7 +11,6 @@
 #define t3 X5
 #define XTMP6 X6
-#define IV X8
 
 #include "aesni_macros_amd64.s"
 
@@ -27,15 +26,14 @@ TEXT ·encryptBlocksChain(SB),NOSPLIT,$0
 MOVQ src_len+40(FP), ptxLen
 MOVQ iv+56(FP), SI
- MOVUPS (SI), IV
+ MOVOU (SI), t0
 
 loopSrc:
 CMPQ ptxLen, $16
 JB done_sm4
 SUBQ $16, ptxLen
 
- MOVOU (ptx), t0
- PXOR IV, t0
+ PXOR (ptx), t0
 
 PSHUFB flip_mask<>(SB), t0
 PSHUFD $1, t0, t1
@@ -60,7 +58,6 @@ loopRound:
 PALIGNR $4, t1, t0
 PSHUFB flip_mask<>(SB), t0
 
- MOVOU t0, IV
 MOVOU t0, (ctx)
 
 LEAQ 16(ptx), ptx
@@ -69,7 +66,7 @@ loopRound:
 JMP loopSrc
 
 done_sm4:
- MOVUPS IV, (SI)
+ MOVOU t0, (SI)
 RET
 
 #undef ctx
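Note on the cbc_amd64.s hunks: the dedicated IV register (X8) is dropped because t0 now carries the IV on entry, the current ciphertext block inside the loop, and the final ciphertext block that is stored back through SI as the next IV. Below is a minimal pure-Go sketch of that CBC chaining pattern, for illustration only; encryptBlock is a hypothetical stand-in for the SM4 round macros and is not part of the patch.

package main

import "fmt"

const blockSize = 16

// encryptBlock is a hypothetical stand-in for the per-block SM4 encryption
// performed by the assembly round macros; a fixed XOR keeps the sketch runnable.
func encryptBlock(dst, src []byte) {
	for i := range src {
		dst[i] = src[i] ^ 0x5a
	}
}

// encryptBlocksChain mirrors the loop structure of the amd64 code: chain holds
// the IV on entry and the latest ciphertext block afterwards, and is written
// back to iv at the end so the caller can continue the chain.
func encryptBlocksChain(dst, src, iv []byte) {
	chain := make([]byte, blockSize)
	copy(chain, iv) // MOVOU (SI), t0

	for len(src) >= blockSize {
		for i := 0; i < blockSize; i++ {
			chain[i] ^= src[i] // PXOR (ptx), t0
		}
		encryptBlock(chain, chain)   // SM4 rounds on t0
		copy(dst[:blockSize], chain) // MOVOU t0, (ctx)
		src = src[blockSize:]
		dst = dst[blockSize:]
	}

	copy(iv, chain) // MOVOU t0, (SI): last ciphertext block becomes the next IV
}

func main() {
	iv := make([]byte, blockSize)
	src := make([]byte, 2*blockSize)
	dst := make([]byte, len(src))
	encryptBlocksChain(dst, src, iv)
	fmt.Printf("ct: %x\niv: %x\n", dst, iv)
}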