Skip to content

Commit

Permalink
sm4: stop using PXOR with m128 memory operands directly
Browse files Browse the repository at this point in the history
  • Loading branch information
emmansun authored Nov 1, 2023
1 parent 2f16366 commit 8f5e603
Show file tree
Hide file tree
Showing 3 changed files with 117 additions and 59 deletions.
68 changes: 45 additions & 23 deletions sm4/cbc_amd64.s
Original file line number Diff line number Diff line change
Expand Up @@ -151,15 +151,23 @@ cbcSm4Octets:
MOVOU 112(DX), XWORD7

SM4_8BLOCKS(AX, XWORD, YWORD, XWTMP0, XWTMP1, XWORD0, XWORD1, XWORD2, XWORD3, XWORD4, XWORD5, XWORD6, XWORD7)

PXOR -16(DX), XWORD0
PXOR 0(DX), XWORD1
PXOR 16(DX), XWORD2
PXOR 32(DX), XWORD3
PXOR 48(DX), XWORD4
PXOR 64(DX), XWORD5
PXOR 80(DX), XWORD6
PXOR 96(DX), XWORD7

MOVOU -16(DX), XWTMP0
PXOR XWTMP0, XWORD0
MOVOU 0(DX), XWTMP0
PXOR XWTMP0, XWORD1
MOVOU 16(DX), XWTMP0
PXOR XWTMP0, XWORD2
MOVOU 32(DX), XWTMP0
PXOR XWTMP0, XWORD3
MOVOU 48(DX), XWTMP0
PXOR XWTMP0, XWORD4
MOVOU 64(DX), XWTMP0
PXOR XWTMP0, XWORD5
MOVOU 80(DX), XWTMP0
PXOR XWTMP0, XWORD6
MOVOU 96(DX), XWTMP0
PXOR XWTMP0, XWORD7

MOVOU XWORD0, 0(BX)
MOVOU XWORD1, 16(BX)
Expand All @@ -186,10 +194,14 @@ cbcSm4Nibbles:

SM4_4BLOCKS(AX, XWORD, YWORD, XWTMP0, XWTMP1, XWORD0, XWORD1, XWORD2, XWORD3)

PXOR -16(DX), XWORD0
PXOR 0(DX), XWORD1
PXOR 16(DX), XWORD2
PXOR 32(DX), XWORD3
MOVUPS -16(DX), XWTMP0
PXOR XWTMP0, XWORD0
MOVUPS 0(DX), XWTMP0
PXOR XWTMP0, XWORD1
MOVUPS 16(DX), XWTMP0
PXOR XWTMP0, XWORD2
MOVUPS 32(DX), XWTMP0
PXOR XWTMP0, XWORD3

MOVUPS XWORD0, 0(BX)
MOVUPS XWORD1, 16(BX)
Expand All @@ -213,10 +225,14 @@ cbCSm4Single:

SM4_4BLOCKS(AX, XWORD, YWORD, XWTMP0, XWTMP1, XWORD0, XWORD1, XWORD2, XWORD3)

PXOR 0(SI), XWORD0
PXOR -64(DX), XWORD1
PXOR -48(DX), XWORD2
PXOR -32(DX), XWORD3
MOVUPS 0(SI), XWTMP0
PXOR XWTMP0, XWORD0
MOVUPS -64(DX), XWTMP0
PXOR XWTMP0, XWORD1
MOVUPS -48(DX), XWTMP0
PXOR XWTMP0, XWORD2
MOVUPS -32(DX), XWTMP0
PXOR XWTMP0, XWORD3

MOVUPS XWORD0, -64(BX)
MOVUPS XWORD1, -48(BX)
Expand All @@ -230,7 +246,8 @@ cbcSm4Single16:

SM4_SINGLE_BLOCK(AX, XWORD, YWORD, XWTMP0, XWTMP1, XWORD0, XWORD1, XWORD2, XWORD3)

PXOR 0(SI), XWORD0
MOVUPS 0(SI), XWTMP0
PXOR XWTMP0, XWORD0

MOVUPS XWORD0, -16(BX)

Expand All @@ -242,8 +259,10 @@ cbcSm4Single32:

SM4_4BLOCKS(AX, XWORD, YWORD, XWTMP0, XWTMP1, XWORD0, XWORD1, XWORD2, XWORD3)

PXOR 0(SI), XWORD0
PXOR -32(DX), XWORD1
MOVUPS 0(SI), XWTMP0
PXOR XWTMP0, XWORD0
MOVUPS -32(DX), XWTMP0
PXOR XWTMP0, XWORD1

MOVUPS XWORD0, -32(BX)
MOVUPS XWORD1, -16(BX)
Expand All @@ -257,9 +276,12 @@ cbcSm4Single48:

SM4_4BLOCKS(AX, XWORD, YWORD, XWTMP0, XWTMP1, XWORD0, XWORD1, XWORD2, XWORD3)

PXOR 0(SI), XWORD0
PXOR -48(DX), XWORD1
PXOR -32(DX), XWORD2
MOVUPS 0(SI), XWTMP0
PXOR XWTMP0, XWORD0
MOVUPS -48(DX), XWTMP0
PXOR XWTMP0, XWORD1
MOVUPS -32(DX), XWTMP0
PXOR XWTMP0, XWORD2

MOVUPS XWORD0, -48(BX)
MOVUPS XWORD1, -32(BX)
Expand Down
60 changes: 40 additions & 20 deletions sm4/gcm_amd64.s
Original file line number Diff line number Diff line change
Expand Up @@ -677,14 +677,22 @@ gcmSm4EncOctetsLoop:

SM4_8BLOCKS_WO_BS(rk, ACC1, T0, T1, T2, B0, B1, B2, B3, B4, B5, B6, B7)

PXOR (16*0)(ptx), B0
PXOR (16*1)(ptx), B1
PXOR (16*2)(ptx), B2
PXOR (16*3)(ptx), B3
PXOR (16*4)(ptx), B4
PXOR (16*5)(ptx), B5
PXOR (16*6)(ptx), B6
PXOR (16*7)(ptx), B7
MOVOU (16*0)(ptx), T0
PXOR T0, B0
MOVOU (16*1)(ptx), T0
PXOR T0, B1
MOVOU (16*2)(ptx), T0
PXOR T0, B2
MOVOU (16*3)(ptx), T0
PXOR T0, B3
MOVOU (16*4)(ptx), T0
PXOR T0, B4
MOVOU (16*5)(ptx), T0
PXOR T0, B5
MOVOU (16*6)(ptx), T0
PXOR T0, B6
MOVOU (16*7)(ptx), T0
PXOR T0, B7

MOVOU B0, (16*0)(ctx)
PSHUFB BSWAP, B0
Expand Down Expand Up @@ -765,10 +773,14 @@ gcmSm4EncNibbles:
MOVOU (8*16 + 3*16)(SP), B3

SM4_4BLOCKS_WO_BS(AX, B4, T0, T1, T2, B0, B1, B2, B3)
PXOR (16*0)(ptx), B0
PXOR (16*1)(ptx), B1
PXOR (16*2)(ptx), B2
PXOR (16*3)(ptx), B3
MOVOU (16*0)(ptx), T0
PXOR T0, B0
MOVOU (16*1)(ptx), T0
PXOR T0, B1
MOVOU (16*2)(ptx), T0
PXOR T0, B2
MOVOU (16*3)(ptx), T0
PXOR T0, B3

MOVOU B0, (16*0)(ctx)
MOVOU B1, (16*1)(ctx)
Expand Down Expand Up @@ -1683,14 +1695,22 @@ gcmSm4DecOctetsLoop:

SM4_8BLOCKS_WO_BS(rk, ACC1, T0, T1, T2, B0, B1, B2, B3, B4, B5, B6, B7)

PXOR (16*0)(ctx), B0
PXOR (16*1)(ctx), B1
PXOR (16*2)(ctx), B2
PXOR (16*3)(ctx), B3
PXOR (16*4)(ctx), B4
PXOR (16*5)(ctx), B5
PXOR (16*6)(ctx), B6
PXOR (16*7)(ctx), B7
MOVOU (16*0)(ctx), T0
PXOR T0, B0
MOVOU (16*1)(ctx), T0
PXOR T0, B1
MOVOU (16*2)(ctx), T0
PXOR T0, B2
MOVOU (16*3)(ctx), T0
PXOR T0, B3
MOVOU (16*4)(ctx), T0
PXOR T0, B4
MOVOU (16*5)(ctx), T0
PXOR T0, B5
MOVOU (16*6)(ctx), T0
PXOR T0, B6
MOVOU (16*7)(ctx), T0
PXOR T0, B7

MOVOU B0, (16*0)(ptx)
MOVOU B1, (16*1)(ptx)
Expand Down
48 changes: 32 additions & 16 deletions sm4/xts_amd64.s
Original file line number Diff line number Diff line change
Expand Up @@ -210,44 +210,60 @@ GLOBL gbGcmPoly<>(SB), (NOPTR+RODATA), $16

#define sseLoad4Blocks \
MOVOU (16*0)(DX), B0; \
PXOR (16*0)(SP), B0; \
MOVOU (16*0)(SP), T0; \
PXOR T0, B0; \
MOVOU (16*1)(DX), B1; \
PXOR (16*1)(SP), B1; \
MOVOU (16*1)(SP), T0; \
PXOR T0, B1; \
MOVOU (16*2)(DX), B2; \
PXOR (16*2)(SP), B2; \
MOVOU (16*2)(SP), T0; \
PXOR T0, B2; \
MOVOU (16*3)(DX), B3; \
PXOR (16*3)(SP), B3
MOVOU (16*3)(SP), T0; \
PXOR T0, B3

#define sseStore4Blocks \
PXOR (16*0)(SP), B0; \
MOVOU (16*0)(SP), T0; \
PXOR T0, B0; \
MOVOU B0, (16*0)(CX); \
PXOR (16*1)(SP), B1; \
MOVOU (16*1)(SP), T0; \
PXOR T0, B1; \
MOVOU B1, (16*1)(CX); \
PXOR (16*2)(SP), B2; \
MOVOU (16*2)(SP), T0; \
PXOR T0, B2; \
MOVOU B2, (16*2)(CX); \
PXOR (16*3)(SP), B3; \
MOVOU (16*3)(SP), T0; \
PXOR T0, B3; \
MOVOU B3, (16*3)(CX)

#define sseLoad8Blocks \
sseLoad4Blocks; \
MOVOU (16*4)(DX), B4; \
PXOR (16*4)(SP), B4; \
MOVOU (16*4)(SP), T0; \
PXOR T0, B4; \
MOVOU (16*5)(DX), B5; \
PXOR (16*5)(SP), B5; \
MOVOU (16*5)(SP), T0; \
PXOR T0, B5; \
MOVOU (16*6)(DX), B6; \
PXOR (16*6)(SP), B6; \
MOVOU (16*6)(SP), T0; \
PXOR T0, B6; \
MOVOU (16*7)(DX), B7; \
PXOR (16*7)(SP), B7
MOVOU (16*7)(SP), T0; \
PXOR T0, B7

#define sseStore8Blocks \
sseStore4Blocks; \
PXOR (16*4)(SP), B4; \
MOVOU (16*4)(SP), T0; \
PXOR T0, B4; \
MOVOU B4, (16*4)(CX); \
PXOR (16*5)(SP), B5; \
MOVOU (16*5)(SP), T0; \
PXOR T0, B5; \
MOVOU B5, (16*5)(CX); \
PXOR (16*6)(SP), B6; \
MOVOU (16*6)(SP), T0; \
PXOR T0, B6; \
MOVOU B6, (16*6)(CX); \
PXOR (16*7)(SP), B7; \
MOVOU (16*7)(SP), T0; \
PXOR T0, B7; \
MOVOU B7, (16*7)(CX)

#define avxLoad4Blocks \
Expand Down

1 comment on commit 8f5e603

@emmansun
Copy link
Owner Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Please sign in to comment.