Skip to content

Commit

Permalink
Prevent memory allocation for maskRounds array
Browse files: browse the repository at this point in the history
  • Loading branch information
fwessels committed Apr 24, 2020
1 parent 50624ea commit 12d4541
Show file tree
Hide file tree
Showing 4 changed files with 43 additions and 34 deletions.
38 changes: 20 additions & 18 deletions block_amd64.go
Original file line number Diff line number Diff line change
Expand Up @@ -101,41 +101,41 @@ func init() {
}

// Interface function to assembly code
func blockMd5_x16(s *digest16, input [16][]byte, bases [2][]byte, half bool) {
func (s *md5Server) blockMd5_x16(d *digest16, input [16][]byte, half bool) {
if hasAVX512 {
blockMd5_avx512(s, input)
blockMd5_avx512(d, input, &s.maskRounds16)
} else {
s8a, s8b := digest8{}, digest8{}
for i := range s8a.v0 {
d8a, d8b := digest8{}, digest8{}
for i := range d8a.v0 {
j := i + 8
s8a.v0[i], s8a.v1[i], s8a.v2[i], s8a.v3[i] = s.v0[i], s.v1[i], s.v2[i], s.v3[i]
s8b.v0[i], s8b.v1[i], s8b.v2[i], s8b.v3[i] = s.v0[j], s.v1[j], s.v2[j], s.v3[j]
d8a.v0[i], d8a.v1[i], d8a.v2[i], d8a.v3[i] = d.v0[i], d.v1[i], d.v2[i], d.v3[i]
d8b.v0[i], d8b.v1[i], d8b.v2[i], d8b.v3[i] = d.v0[j], d.v1[j], d.v2[j], d.v3[j]
}

i8 := [2][8][]byte{}
for i := range i8[0] {
i8[0][i], i8[1][i] = input[i], input[8+i]
}
if half {
blockMd5_avx2(&s8a, i8[0], bases[0])
blockMd5_avx2(&d8a, i8[0], s.bases[0], &s.maskRounds8a)
} else {
wg := sync.WaitGroup{}
wg.Add(2)
go func() { blockMd5_avx2(&s8a, i8[0], bases[0]); wg.Done() }()
go func() { blockMd5_avx2(&s8b, i8[1], bases[1]); wg.Done() }()
go func() { blockMd5_avx2(&d8a, i8[0], s.bases[0], &s.maskRounds8a); wg.Done() }()
go func() { blockMd5_avx2(&d8b, i8[1], s.bases[1], &s.maskRounds8b); wg.Done() }()
wg.Wait()
}

for i := range s8a.v0 {
for i := range d8a.v0 {
j := i + 8
s.v0[i], s.v1[i], s.v2[i], s.v3[i] = s8a.v0[i], s8a.v1[i], s8a.v2[i], s8a.v3[i]
s.v0[j], s.v1[j], s.v2[j], s.v3[j] = s8b.v0[i], s8b.v1[i], s8b.v2[i], s8b.v3[i]
d.v0[i], d.v1[i], d.v2[i], d.v3[i] = d8a.v0[i], d8a.v1[i], d8a.v2[i], d8a.v3[i]
d.v0[j], d.v1[j], d.v2[j], d.v3[j] = d8b.v0[i], d8b.v1[i], d8b.v2[i], d8b.v3[i]
}
}
}

// Interface function to AVX512 assembly code
func blockMd5_avx512(s *digest16, input [16][]byte) {
func blockMd5_avx512(s *digest16, input [16][]byte, maskRounds *[16]maskRounds) {

// Sanity check to make sure we're not passing in more data than internalBlockSize
{
Expand All @@ -156,9 +156,10 @@ func blockMd5_avx512(s *digest16, input [16][]byte) {

sdup := *s // create copy of initial states to receive intermediate updates

maskRounds := generateMaskAndRounds16(input)
rounds := generateMaskAndRounds16(input, maskRounds)

for _, m := range maskRounds {
for r := 0; r < rounds; r++ {
m := maskRounds[r]

block16(&sdup.v0[0], &ptrs[0], m.mask, int(64*m.rounds))

Expand All @@ -172,7 +173,7 @@ func blockMd5_avx512(s *digest16, input [16][]byte) {
}

// Interface function to AVX2 assembly code
func blockMd5_avx2(s *digest8, input [8][]byte, base []byte) {
func blockMd5_avx2(s *digest8, input [8][]byte, base []byte, maskRounds *[8]maskRounds) {

// Sanity check to make sure we're not passing in more data than internalBlockSize
{
Expand All @@ -191,9 +192,10 @@ func blockMd5_avx2(s *digest8, input [8][]byte, base []byte) {

sdup := *s // create copy of initial states to receive intermediate updates

maskRounds := generateMaskAndRounds8(input)
rounds := generateMaskAndRounds8(input, maskRounds)

for _, m := range maskRounds {
for r := 0; r < rounds; r++ {
m := maskRounds[r]
var cache cache8 // stack storage for block8 tmp state
block8(&sdup.v0[0], uintptr(unsafe.Pointer(&(base[0]))), &bufs[0], &cache[0], int(64*m.rounds))

Expand Down
15 changes: 9 additions & 6 deletions md5-server_amd64.go
Original file line number Diff line number Diff line change
Expand Up @@ -49,11 +49,14 @@ type lanesInfo [Lanes]blockInput

// md5Server - Type to implement parallel handling of MD5 invocations
type md5Server struct {
uidCounter uint64
cycle chan uint64 // client with uid has update.
newInput chan newClient // Add new client.
digests map[uint64][Size]byte // Map of uids to (interim) digest results
bases [2][]byte // base memory (only for non-AVX512 mode)
uidCounter uint64
cycle chan uint64 // client with uid has update.
newInput chan newClient // Add new client.
digests map[uint64][Size]byte // Map of uids to (interim) digest results
maskRounds16 [16]maskRounds // Pre-allocated static array for max 16 rounds
maskRounds8a [8]maskRounds // Pre-allocated static array for max 8 rounds (1st AVX2 core)
maskRounds8b [8]maskRounds // Pre-allocated static array for max 8 rounds (2nd AVX2 core)
bases [2][]byte // base memory (only for non-AVX512 mode)
}

// NewServer - Create new object for parallel processing handling
Expand Down Expand Up @@ -263,7 +266,7 @@ func (s *md5Server) blocks(lanes []blockInput) {
// Collect active digests...
state := s.getDigests(lanes)
// Process all lanes...
blockMd5_x16(&state, inputs, s.bases, len(lanes) <= 8)
s.blockMd5_x16(&state, inputs, len(lanes) <= 8)

for i, lane := range lanes {
uid := lane.uid
Expand Down
16 changes: 7 additions & 9 deletions md5-util_amd64.go
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ type maskRounds struct {
rounds uint64
}

func generateMaskAndRounds8(input [8][]byte) (mr []maskRounds) {
func generateMaskAndRounds8(input [8][]byte, mr *[8]maskRounds) (rounds int) {
// Sort on blocks length small to large
var sorted [8]lane
for c, inpt := range input {
Expand All @@ -31,22 +31,20 @@ func generateMaskAndRounds8(input [8][]byte) (mr []maskRounds) {
// Create mask array including 'rounds' (of processing blocks of 64 bytes) between masks
m, round := uint64(0xff), uint64(0)

// TODO: reuse this slice...
mr = make([]maskRounds, 0, 8)
for _, s := range sorted {
if s.len > 0 {
if uint64(s.len)>>6 > round {
mr = append(mr, maskRounds{m, (uint64(s.len) >> 6) - round})
mr[rounds] = maskRounds{m, (uint64(s.len) >> 6) - round}
rounds++
}
round = uint64(s.len) >> 6
}
m = m & ^(1 << uint(s.pos))
}

return
}

func generateMaskAndRounds16(input [16][]byte) (mr []maskRounds) {
func generateMaskAndRounds16(input [16][]byte, mr *[16]maskRounds) (rounds int) {

// Sort on blocks length small to large
var sorted [16]lane
Expand All @@ -57,16 +55,16 @@ func generateMaskAndRounds16(input [16][]byte) (mr []maskRounds) {

// Create mask array including 'rounds' (of processing blocks of 64 bytes) between masks
m, round := uint64(0xffff), uint64(0)
mr = make([]maskRounds, 0, 16)

for _, s := range sorted {
if s.len > 0 {
if uint64(s.len)>>6 > round {
mr = append(mr, maskRounds{m, (uint64(s.len) >> 6) - round})
mr[rounds] = maskRounds{m, (uint64(s.len) >> 6) - round}
rounds++
}
round = uint64(s.len) >> 6
}
m = m & ^(1 << uint(s.pos))
}

return
}
8 changes: 7 additions & 1 deletion md5-util_amd64_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -38,13 +38,19 @@ var goldenMask = []maskTest{

func TestGenerateMaskAndRounds(t *testing.T) {
input := [8][]byte{}
maskRound := [8]maskRounds{}
for gcase, g := range goldenMask {
for i, l := range g.in {
buf := make([]byte, l)
input[i] = buf[:]
}

mr := generateMaskAndRounds8(input)
rounds := generateMaskAndRounds8(input, &maskRound)

mr := make([]maskRounds, 0, 8)
for r := 0; r < rounds; r++ {
mr = append(mr, maskRound[r])
}

if !reflect.DeepEqual(mr, g.out) {
t.Fatalf("case %d: got %04x\n want %04x", gcase, mr, g.out)
Expand Down

0 comments on commit 12d4541

Please sign in to comment.