Skip to content

Commit

Permalink
sm3: s390x block, init
Browse files Browse the repository at this point in the history
  • Loading branch information
emmansun authored Sep 10, 2024
1 parent 8cb3565 commit 8c2dd95
Show file tree
Hide file tree
Showing 3 changed files with 348 additions and 1 deletion.
2 changes: 1 addition & 1 deletion sm3/sm3block_generic.go
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
// Use of this source code is governed by a MIT-style
// license that can be found in the LICENSE file.

//go:build purego || !(amd64 || arm64 || ppc64 || ppc64le)
//go:build purego || !(amd64 || arm64 || ppc64 || ppc64le || s390x)

package sm3

Expand Down
10 changes: 10 additions & 0 deletions sm3/sm3block_s390x.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
// Copyright 2024 Sun Yimin. All rights reserved.
// Use of this source code is governed by a MIT-style
// license that can be found in the LICENSE file.

//go:build !purego

package sm3

// block folds every complete 64-byte block of p into the eight 32-bit
// state words at the start of dig. It has no Go body: the implementation
// is the assembly routine ·block in sm3block_s390x.s, which rounds len(p)
// down to a multiple of 64 and ignores any trailing bytes.
//
//go:noescape
func block(dig *digest, p []byte)
337 changes: 337 additions & 0 deletions sm3/sm3block_s390x.s
Original file line number Diff line number Diff line change
@@ -0,0 +1,337 @@
// Copyright 2024 Sun Yimin. All rights reserved.
// Use of this source code is governed by a MIT-style
// license that can be found in the LICENSE file.

//go:build !purego

#include "textflag.h"
#include "sm3_const_asm.s"

// Eight 32-bit working variables of the compression function. ·block loads
// them from the digest, runs 64 rounds on them and stores them back.
#define a R1
#define b R2
#define c R3
#define d R4
#define e R5
#define f R6
#define g R7
#define h R8

// Pointer and length registers used by ·block.
// CTX = digest pointer, INP = current input position,
// LEN = input length rounded down to whole blocks, END = INP + LEN.
#define CTX R9
#define INP R10
#define LEN R11
#define END R12

// Scratch registers used inside the round macros.
// NOTE: y0, y1 and y2 are the same physical registers as CTX, LEN and END,
// so those three values do not survive a round. ·block keeps END in a stack
// slot and reloads CTX from the argument frame before storing the result.
#define y0 R9
#define y1 R11
#define y2 R12

// Sliding window of the sixteen most recent message words, four 32-bit
// words per vector register.
#define XWORD0 V0
#define XWORD1 V1
#define XWORD2 V2
#define XWORD3 V3

// Temporaries clobbered by MESSAGE_SCHEDULE.
#define XTMP0 V4
#define XTMP1 V5
#define XTMP2 V6
#define XTMP3 V7
#define XTMP4 V8

// XFER holds the XOR of two adjacent XWORD registers before it is spilled to
// the stack; the round macros read it back as w'.
#define XFER V9

// SS12 computes the two per-round intermediates from a, e and a constant:
//   ss1 = ((a <<< 12) + e + const) <<< 7
//   ss2 = ss1 XOR (a <<< 12)
// a and e are only read; ss1 and ss2 are overwritten.
// NOTE(review): const is added as-is, so it is presumably the already
// rotated round constant from sm3_const_asm.s -- confirm against that file.
#define SS12(a, e, const, ss1, ss2) \
RLL $12, a, ss2; \ // ss2 = a <<< 12
ADD $const, e, ss1; \ // ss1 = e + const
ADD ss2, ss1; \ // ss1 = (a <<< 12) + e + const
RLL $7, ss1; \ // ss1 = SS1
XOR ss1, ss2 // ss2 = SS1 XOR (a <<< 12) = SS2

// P0 computes out = tt2 XOR (tt2 <<< 9) XOR (tt2 <<< 17).
// tt2 is only read; tmp is clobbered.
#define P0(tt2, tmp, out) \
RLL $9, tt2, tmp; \ // tmp = tt2 <<< 9
RLL $17, tt2, out; \ // out = tt2 <<< 17
XOR tmp, out; \ // out = (tt2 <<< 9) XOR (tt2 <<< 17)
XOR tt2, out // out = P0(tt2)

// For rounds [0 - 16)
// addr1 for w, addr2 for w'
//
// One compression round using the XOR forms of FF and GG.
// On exit:
//   h = tt1      (the next round's a)
//   d = P0(tt2)  (the next round's e)
//   b = b <<< 9  (the next round's c)
//   f = f <<< 19 (the next round's g)
// a, c, e and g are only read. Callers express the register renaming by
// rotating the a..h argument list one position for every following round.
// Clobbers y0, y1 and y2.
#define DO_ROUND_N_0(addr1, addr2, const, a, b, c, d, e, f, g, h) \
; \
SS12(a, e, const, y2, y0); \ // y2 = SS1, y0 = SS2
MOVWZ addr1, y1; \ // y1 = W
ADD y1, y2; \ // y2 = SS1 + W
ADD h, y2; \ // y2 = h + SS1 + W
MOVWZ addr2, y1; \ // y1 = W'
ADD y1, y0; \ // y0 = SS2 + W'
ADD d, y0; \ // y0 = d + SS2 + W'
; \
XOR a, b, h; \ // h is free here: it was already folded into y2
XOR c, h; \ // h = a XOR b XOR c = FF(a, b, c)
ADD y0, h; \ // h = FF(a, b, c) + d + SS2 + W' = tt1
; \
XOR e, f, y1; \
XOR g, y1; \ // y1 = e XOR f XOR g = GG(e, f, g)
ADD y1, y2; \ // y2 = GG(e, f, g) + h + SS1 + W = tt2
; \
RLL $9, b; \ // b = b <<< 9
RLL $19, f; \ // f = f <<< 19
; \
P0(y2, y0, d) // d = P0(tt2)

// For rounds [16 - 64)
// addr1 for w, addr2 for w'
//
// One compression round using the majority form of FF and the select form
// of GG. Register contract is the same as DO_ROUND_N_0:
//   h = tt1, d = P0(tt2), b = b <<< 9, f = f <<< 19 on exit;
//   a, c, e and g are only read; y0, y1 and y2 are clobbered.
#define DO_ROUND_N_1(addr1, addr2, const, a, b, c, d, e, f, g, h) \
; \
SS12(a, e, const, y2, y0); \ // y2 = SS1, y0 = SS2
MOVWZ addr1, y1; \ // y1 = W
ADD y1, y2; \ // y2 = SS1 + W
ADD h, y2; \ // y2 = h + SS1 + W
MOVWZ addr2, y1; \ // y1 = W'
ADD y1, y0; \ // y0 = SS2 + W'
ADD d, y0; \ // y0 = d + SS2 + W'
; \
OR a, b, y1; \ // y1 = a OR b
AND a, b, h; \ // h = a AND b (h was already folded into y2)
AND c, y1; \ // y1 = (a OR b) AND c
OR y1, h; \ // h = (a AND b) OR (a AND c) OR (b AND c)
ADD y0, h; \ // h = FF(a, b, c) + d + SS2 + W' = tt1
; \
XOR f, g, y1; \ // y1 = f XOR g
AND e, y1; \ // y1 = (f XOR g) AND e
XOR g, y1; \ // y1 = GG2(e, f, g) = (e AND f) OR (NOT e AND g)
ADD y1, y2; \ // y2 = GG2(e, f, g) + h + SS1 + W = tt2
; \
RLL $9, b; \ // b = b <<< 9
RLL $19, f; \ // f = f <<< 19
; \
P0(y2, y0, d) // d = P0(tt2)

// PROLD rotates every 32-bit element of vector s left by the constant n and
// writes the result to r: r = s <<< n. s is left unchanged when r differs
// from s.
#define PROLD(s, r, n) \
VERLLF $n, s, r

// MESSAGE_SCHEDULE expands the message by four words.
//
// On entry the four arguments hold the sixteen previous words W[-16..-1] in
// order (first argument = W[-16..-13], last argument = W[-4..-1]). On exit the
// first argument holds the four new words {w[0], w[1], w[2], w[3]}, each
// computed as
//   w = P1(W[-16] ^ W[-9] ^ (W[-3] <<< 15)) ^ (W[-13] <<< 7) ^ W[-6]
//   P1(x) = x ^ (x <<< 15) ^ (x <<< 23)
// The other three arguments are only read. XTMP0..XTMP4 are clobbered.
//
// w[3] depends on w[0], so the expansion runs twice: the first pass yields
// w[0], which is spliced into the W[-3] vector for the second pass.
//
// The parameter names shadow the file-level XWORD0..XWORD3 aliases; callers
// rotate the argument order to slide the window.
#define MESSAGE_SCHEDULE(XWORD0, XWORD1, XWORD2, XWORD3) \
VSLDB $12, XWORD0, XWORD1, XTMP0; \ // XTMP0 = W[-13] = {w3, w4, w5, w6}
PROLD(XTMP0, XTMP1, 7); \ // XTMP1 = W[-13] rol 7
VSLDB $8, XWORD2, XWORD3, XTMP0; \ // XTMP0 = W[-6] = {w10, w11, w12, w13}
VX XTMP0, XTMP1, XTMP0; \ // XTMP0 = W[-6] xor (W[-13] rol 7)
; \ // Prepare P1 parameters
VSLDB $12, XWORD1, XWORD2, XTMP1; \ // XTMP1 = W[-9] = {w7, w8, w9, w10}
VX XTMP1, XWORD0, XTMP1; \ // XTMP1 = W[-9] xor W[-16]
VSLDB $4, XWORD3, XWORD2, XTMP3; \ // XTMP3 = {w13, w14, w15, w8}: last lane is a placeholder, not a real W[-3]
PROLD(XTMP3, XTMP2, 15); \ // XTMP2 = W[-3] rol 15
VX XTMP1, XTMP2, XTMP2; \ // XTMP2 = W[-9] ^ W[-16] ^ (W[-3] rol 15), last lane not meaningful
; \ // P1
PROLD(XTMP2, XTMP4, 15); \ // XTMP4 = XTMP2 rol 15
PROLD(XTMP4, XTMP3, 8); \ // XTMP3 = XTMP2 rol 23
VX XTMP2, XTMP4, XTMP4; \ // XTMP4 = XTMP2 XOR (XTMP2 rol 15)
VX XTMP4, XTMP3, XTMP4; \ // XTMP4 = XTMP2 XOR (XTMP2 rol 15) XOR (XTMP2 rol 23)
; \ // First-pass result; only lane 0 is consumed below
VX XTMP4, XTMP0, XTMP2; \ // XTMP2 = {w[0], ...}
; \ // Prepare P1 parameters
VSLDB $4, XWORD3, XTMP2, XTMP3; \ // XTMP3 = W[-3] = {w13, w14, w15, w0}
PROLD(XTMP3, XTMP4, 15); \ // XTMP4 = W[-3] rol 15
VX XTMP1, XTMP4, XTMP4; \ // XTMP4 = W[-9] ^ W[-16] ^ (W[-3] rol 15), all four lanes valid
; \ // P1
PROLD(XTMP4, XTMP3, 15); \ // XTMP3 = XTMP4 rol 15
PROLD(XTMP3, XTMP1, 8); \ // XTMP1 = XTMP4 rol 23
VX XTMP4, XTMP3, XTMP3; \ // XTMP3 = XTMP4 XOR (XTMP4 rol 15)
VX XTMP3, XTMP1, XTMP1; \ // XTMP1 = XTMP4 XOR (XTMP4 rol 15) XOR (XTMP4 rol 23)
; \ // 4 words message schedule result
VX XTMP1, XTMP0, XWORD0; \ // XWORD0 = {w[0], w[1], w[2], w[3]}

// func block(dig *digest, p []byte)
//
// Compresses every complete 64-byte block of p into the eight 32-bit state
// words at the start of dig. len(p) is rounded down to a multiple of 64;
// when that leaves nothing to process the digest is not touched at all.
//
// Stack frame (72 bytes):
//   tmp-8(SP)             saved END (y2 aliases END and is clobbered by every round)
//   tmp-40(SP)..tmp-12(SP) state a..h as it was at the start of the current block
//   tmp-56(SP)            16 bytes: w  for the next four rounds
//   tmp-72(SP)            16 bytes: w' for the next four rounds
//
// Register notes: y0/y1/y2 share registers with CTX/LEN/END, so CTX is
// reloaded from the argument frame before the final store and END is
// reloaded from its stack slot before the loop test.
TEXT ·block(SB),NOSPLIT,$72-32
	MOVD dig+0(FP), CTX
	MOVD p_base+8(FP), INP
	MOVD p_len+16(FP), LEN
	AND $-64, LEN                // whole blocks only
	LAY (INP)(LEN*1), END        // END = INP + LEN

	// Nothing to do: skip the final store as well, because a..h have not
	// been loaded yet and storing them would overwrite the digest.
	CMPBEQ INP, END, done
	MOVD END, tmp-8(SP)          // backup END
	LMY 0(CTX), a, h             // a..h = dig state (8 x 32-bit)

loop:
	STMY a, h, tmp-40(SP)        // backup state
	VLM (INP), XWORD0, XWORD3    // XWORD0..XWORD3 = 16 message words

schedule_compress: // for w0 - w47
	// Do 4 rounds and scheduling
	VST XWORD0, tmp-56(SP)
	VX XWORD0, XWORD1, XFER
	VST XFER, tmp-72(SP)
	DO_ROUND_N_0(tmp-56(SP), tmp-72(SP), T0, a, b, c, d, e, f, g, h)
	DO_ROUND_N_0(tmp-52(SP), tmp-68(SP), T1, h, a, b, c, d, e, f, g)
	MESSAGE_SCHEDULE(XWORD0, XWORD1, XWORD2, XWORD3)
	DO_ROUND_N_0(tmp-48(SP), tmp-64(SP), T2, g, h, a, b, c, d, e, f)
	DO_ROUND_N_0(tmp-44(SP), tmp-60(SP), T3, f, g, h, a, b, c, d, e)

	// Do 4 rounds and scheduling
	VST XWORD1, tmp-56(SP)
	VX XWORD1, XWORD2, XFER
	VST XFER, tmp-72(SP)
	DO_ROUND_N_0(tmp-56(SP), tmp-72(SP), T4, e, f, g, h, a, b, c, d)
	DO_ROUND_N_0(tmp-52(SP), tmp-68(SP), T5, d, e, f, g, h, a, b, c)
	MESSAGE_SCHEDULE(XWORD1, XWORD2, XWORD3, XWORD0)
	DO_ROUND_N_0(tmp-48(SP), tmp-64(SP), T6, c, d, e, f, g, h, a, b)
	DO_ROUND_N_0(tmp-44(SP), tmp-60(SP), T7, b, c, d, e, f, g, h, a)

	// Do 4 rounds and scheduling
	VST XWORD2, tmp-56(SP)
	VX XWORD2, XWORD3, XFER
	VST XFER, tmp-72(SP)
	DO_ROUND_N_0(tmp-56(SP), tmp-72(SP), T8, a, b, c, d, e, f, g, h)
	DO_ROUND_N_0(tmp-52(SP), tmp-68(SP), T9, h, a, b, c, d, e, f, g)
	MESSAGE_SCHEDULE(XWORD2, XWORD3, XWORD0, XWORD1)
	DO_ROUND_N_0(tmp-48(SP), tmp-64(SP), T10, g, h, a, b, c, d, e, f)
	DO_ROUND_N_0(tmp-44(SP), tmp-60(SP), T11, f, g, h, a, b, c, d, e)

	// Do 4 rounds and scheduling
	VST XWORD3, tmp-56(SP)
	VX XWORD3, XWORD0, XFER
	VST XFER, tmp-72(SP)
	DO_ROUND_N_0(tmp-56(SP), tmp-72(SP), T12, e, f, g, h, a, b, c, d)
	DO_ROUND_N_0(tmp-52(SP), tmp-68(SP), T13, d, e, f, g, h, a, b, c)
	MESSAGE_SCHEDULE(XWORD3, XWORD0, XWORD1, XWORD2)
	DO_ROUND_N_0(tmp-48(SP), tmp-64(SP), T14, c, d, e, f, g, h, a, b)
	DO_ROUND_N_0(tmp-44(SP), tmp-60(SP), T15, b, c, d, e, f, g, h, a)

	// Do 4 rounds and scheduling
	VST XWORD0, tmp-56(SP)
	VX XWORD0, XWORD1, XFER
	VST XFER, tmp-72(SP)
	DO_ROUND_N_1(tmp-56(SP), tmp-72(SP), T16, a, b, c, d, e, f, g, h)
	DO_ROUND_N_1(tmp-52(SP), tmp-68(SP), T17, h, a, b, c, d, e, f, g)
	MESSAGE_SCHEDULE(XWORD0, XWORD1, XWORD2, XWORD3)
	DO_ROUND_N_1(tmp-48(SP), tmp-64(SP), T18, g, h, a, b, c, d, e, f)
	DO_ROUND_N_1(tmp-44(SP), tmp-60(SP), T19, f, g, h, a, b, c, d, e)

	// Do 4 rounds and scheduling
	VST XWORD1, tmp-56(SP)
	VX XWORD1, XWORD2, XFER
	VST XFER, tmp-72(SP)
	DO_ROUND_N_1(tmp-56(SP), tmp-72(SP), T20, e, f, g, h, a, b, c, d)
	DO_ROUND_N_1(tmp-52(SP), tmp-68(SP), T21, d, e, f, g, h, a, b, c)
	MESSAGE_SCHEDULE(XWORD1, XWORD2, XWORD3, XWORD0)
	DO_ROUND_N_1(tmp-48(SP), tmp-64(SP), T22, c, d, e, f, g, h, a, b)
	DO_ROUND_N_1(tmp-44(SP), tmp-60(SP), T23, b, c, d, e, f, g, h, a)

	// Do 4 rounds and scheduling
	VST XWORD2, tmp-56(SP)
	VX XWORD2, XWORD3, XFER
	VST XFER, tmp-72(SP)
	DO_ROUND_N_1(tmp-56(SP), tmp-72(SP), T24, a, b, c, d, e, f, g, h)
	DO_ROUND_N_1(tmp-52(SP), tmp-68(SP), T25, h, a, b, c, d, e, f, g)
	MESSAGE_SCHEDULE(XWORD2, XWORD3, XWORD0, XWORD1)
	DO_ROUND_N_1(tmp-48(SP), tmp-64(SP), T26, g, h, a, b, c, d, e, f)
	DO_ROUND_N_1(tmp-44(SP), tmp-60(SP), T27, f, g, h, a, b, c, d, e)

	// Do 4 rounds and scheduling
	VST XWORD3, tmp-56(SP)
	VX XWORD3, XWORD0, XFER
	VST XFER, tmp-72(SP)
	DO_ROUND_N_1(tmp-56(SP), tmp-72(SP), T28, e, f, g, h, a, b, c, d)
	DO_ROUND_N_1(tmp-52(SP), tmp-68(SP), T29, d, e, f, g, h, a, b, c)
	MESSAGE_SCHEDULE(XWORD3, XWORD0, XWORD1, XWORD2)
	DO_ROUND_N_1(tmp-48(SP), tmp-64(SP), T30, c, d, e, f, g, h, a, b)
	DO_ROUND_N_1(tmp-44(SP), tmp-60(SP), T31, b, c, d, e, f, g, h, a)

	// Do 4 rounds and scheduling
	VST XWORD0, tmp-56(SP)
	VX XWORD0, XWORD1, XFER
	VST XFER, tmp-72(SP)
	DO_ROUND_N_1(tmp-56(SP), tmp-72(SP), T32, a, b, c, d, e, f, g, h)
	DO_ROUND_N_1(tmp-52(SP), tmp-68(SP), T33, h, a, b, c, d, e, f, g)
	MESSAGE_SCHEDULE(XWORD0, XWORD1, XWORD2, XWORD3)
	DO_ROUND_N_1(tmp-48(SP), tmp-64(SP), T34, g, h, a, b, c, d, e, f)
	DO_ROUND_N_1(tmp-44(SP), tmp-60(SP), T35, f, g, h, a, b, c, d, e)

	// Do 4 rounds and scheduling
	VST XWORD1, tmp-56(SP)
	VX XWORD1, XWORD2, XFER
	VST XFER, tmp-72(SP)
	DO_ROUND_N_1(tmp-56(SP), tmp-72(SP), T36, e, f, g, h, a, b, c, d)
	DO_ROUND_N_1(tmp-52(SP), tmp-68(SP), T37, d, e, f, g, h, a, b, c)
	MESSAGE_SCHEDULE(XWORD1, XWORD2, XWORD3, XWORD0)
	DO_ROUND_N_1(tmp-48(SP), tmp-64(SP), T38, c, d, e, f, g, h, a, b)
	DO_ROUND_N_1(tmp-44(SP), tmp-60(SP), T39, b, c, d, e, f, g, h, a)

	// Do 4 rounds and scheduling
	VST XWORD2, tmp-56(SP)
	VX XWORD2, XWORD3, XFER
	VST XFER, tmp-72(SP)
	DO_ROUND_N_1(tmp-56(SP), tmp-72(SP), T40, a, b, c, d, e, f, g, h)
	DO_ROUND_N_1(tmp-52(SP), tmp-68(SP), T41, h, a, b, c, d, e, f, g)
	MESSAGE_SCHEDULE(XWORD2, XWORD3, XWORD0, XWORD1)
	DO_ROUND_N_1(tmp-48(SP), tmp-64(SP), T42, g, h, a, b, c, d, e, f)
	DO_ROUND_N_1(tmp-44(SP), tmp-60(SP), T43, f, g, h, a, b, c, d, e)

	// Do 4 rounds and scheduling
	VST XWORD3, tmp-56(SP)
	VX XWORD3, XWORD0, XFER
	VST XFER, tmp-72(SP)
	DO_ROUND_N_1(tmp-56(SP), tmp-72(SP), T44, e, f, g, h, a, b, c, d)
	DO_ROUND_N_1(tmp-52(SP), tmp-68(SP), T45, d, e, f, g, h, a, b, c)
	MESSAGE_SCHEDULE(XWORD3, XWORD0, XWORD1, XWORD2)
	DO_ROUND_N_1(tmp-48(SP), tmp-64(SP), T46, c, d, e, f, g, h, a, b)
	DO_ROUND_N_1(tmp-44(SP), tmp-60(SP), T47, b, c, d, e, f, g, h, a)

	// Last 16 rounds (w48 - w63). XWORD0..XWORD3 already hold w48..w63, so
	// only one more schedule step is needed: it produces w64..w67, which are
	// used solely to form w' for rounds 60 - 63.
	// Do 4 rounds
	VST XWORD0, tmp-56(SP)
	VX XWORD0, XWORD1, XFER
	VST XFER, tmp-72(SP)
	DO_ROUND_N_1(tmp-56(SP), tmp-72(SP), T48, a, b, c, d, e, f, g, h)
	DO_ROUND_N_1(tmp-52(SP), tmp-68(SP), T49, h, a, b, c, d, e, f, g)
	DO_ROUND_N_1(tmp-48(SP), tmp-64(SP), T50, g, h, a, b, c, d, e, f)
	DO_ROUND_N_1(tmp-44(SP), tmp-60(SP), T51, f, g, h, a, b, c, d, e)

	// Do 4 rounds
	VST XWORD1, tmp-56(SP)
	VX XWORD1, XWORD2, XFER
	VST XFER, tmp-72(SP)
	DO_ROUND_N_1(tmp-56(SP), tmp-72(SP), T52, e, f, g, h, a, b, c, d)
	DO_ROUND_N_1(tmp-52(SP), tmp-68(SP), T53, d, e, f, g, h, a, b, c)
	DO_ROUND_N_1(tmp-48(SP), tmp-64(SP), T54, c, d, e, f, g, h, a, b)
	DO_ROUND_N_1(tmp-44(SP), tmp-60(SP), T55, b, c, d, e, f, g, h, a)

	// Do 4 rounds; XWORD0 becomes w64..w67 for the final w' below
	VST XWORD2, tmp-56(SP)
	VX XWORD2, XWORD3, XFER
	VST XFER, tmp-72(SP)
	MESSAGE_SCHEDULE(XWORD0, XWORD1, XWORD2, XWORD3)
	DO_ROUND_N_1(tmp-56(SP), tmp-72(SP), T56, a, b, c, d, e, f, g, h)
	DO_ROUND_N_1(tmp-52(SP), tmp-68(SP), T57, h, a, b, c, d, e, f, g)
	DO_ROUND_N_1(tmp-48(SP), tmp-64(SP), T58, g, h, a, b, c, d, e, f)
	DO_ROUND_N_1(tmp-44(SP), tmp-60(SP), T59, f, g, h, a, b, c, d, e)

	// Do 4 rounds
	VST XWORD3, tmp-56(SP)
	VX XWORD3, XWORD0, XFER
	VST XFER, tmp-72(SP)
	DO_ROUND_N_1(tmp-56(SP), tmp-72(SP), T60, e, f, g, h, a, b, c, d)
	DO_ROUND_N_1(tmp-52(SP), tmp-68(SP), T61, d, e, f, g, h, a, b, c)
	DO_ROUND_N_1(tmp-48(SP), tmp-64(SP), T62, c, d, e, f, g, h, a, b)
	DO_ROUND_N_1(tmp-44(SP), tmp-60(SP), T63, b, c, d, e, f, g, h, a)

	// Feed-forward: XOR the state saved at the top of the loop into the
	// round output. END is free to use as scratch here; its real value is
	// reloaded from the stack right after.
	MOVWZ tmp-40(SP), END
	XOR END, a
	MOVWZ tmp-36(SP), END
	XOR END, b
	MOVWZ tmp-32(SP), END
	XOR END, c
	MOVWZ tmp-28(SP), END
	XOR END, d
	MOVWZ tmp-24(SP), END
	XOR END, e
	MOVWZ tmp-20(SP), END
	XOR END, f
	MOVWZ tmp-16(SP), END
	XOR END, g
	MOVWZ tmp-12(SP), END
	XOR END, h

	LA 64(INP), INP              // advance to the next block
	MOVD tmp-8(SP), END          // restore END (clobbered as y2 / scratch)
	CMPUBLT INP, END, loop       // addresses: compare unsigned

	MOVD dig+0(FP), CTX          // CTX was clobbered as y0
	STMY a, h, 0(CTX)            // write the updated state back to dig

done:
	RET

1 comment on commit 8c2dd95

@emmansun
Copy link
Owner Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Please sign in to comment.