Skip to content

Commit

Permalink
Poly1305: inline functions and unroll loops.
Browse files Browse the repository at this point in the history
Makes it up to 4x faster.
  • Loading branch information
dchest committed Sep 9, 2014
1 parent 1ea7e43 commit 819624d
Show file tree
Hide file tree
Showing 2 changed files with 235 additions and 46 deletions.
279 changes: 234 additions & 45 deletions nacl-fast.js
Original file line number Diff line number Diff line change
Expand Up @@ -466,9 +466,6 @@ function crypto_stream_xor(c,cpos,m,mpos,d,n,k) {
* https://github.com/floodyberry/poly1305-donna
*/

function U8TO16(p, i) { return (p[i] & 0xff) | ((p[i+1] & 0xff) << 8); }
function U16TO8(p, i, v) { p[i] = (v >>> 0) & 0xff; p[i+1] = (v >>> 8) & 0xff; }

var poly1305 = function(key) {
this.buffer = new Uint8Array(16);
this.r = new Uint16Array(10);
Expand All @@ -477,63 +474,240 @@ var poly1305 = function(key) {
this.leftover = 0;
this.fin = 0;

var i, t0, t1, t2, t3, t4, t5, t6, t7;
var t0, t1, t2, t3, t4, t5, t6, t7;

t0 = U8TO16(key, 0); this.r[0] = ( t0 ) & 0x1fff;
t1 = U8TO16(key, 2); this.r[1] = ((t0 >>> 13) | (t1 << 3)) & 0x1fff;
t2 = U8TO16(key, 4); this.r[2] = ((t1 >>> 10) | (t2 << 6)) & 0x1f03;
t3 = U8TO16(key, 6); this.r[3] = ((t2 >>> 7) | (t3 << 9)) & 0x1fff;
t4 = U8TO16(key, 8); this.r[4] = ((t3 >>> 4) | (t4 << 12)) & 0x00ff;
t0 = key[ 0] & 0xff | (key[ 1] & 0xff) << 8; this.r[0] = ( t0 ) & 0x1fff;
t1 = key[ 2] & 0xff | (key[ 3] & 0xff) << 8; this.r[1] = ((t0 >>> 13) | (t1 << 3)) & 0x1fff;
t2 = key[ 4] & 0xff | (key[ 5] & 0xff) << 8; this.r[2] = ((t1 >>> 10) | (t2 << 6)) & 0x1f03;
t3 = key[ 6] & 0xff | (key[ 7] & 0xff) << 8; this.r[3] = ((t2 >>> 7) | (t3 << 9)) & 0x1fff;
t4 = key[ 8] & 0xff | (key[ 9] & 0xff) << 8; this.r[4] = ((t3 >>> 4) | (t4 << 12)) & 0x00ff;
this.r[5] = ((t4 >>> 1)) & 0x1ffe;
t5 = U8TO16(key,10); this.r[6] = ((t4 >>> 14) | (t5 << 2)) & 0x1fff;
t6 = U8TO16(key,12); this.r[7] = ((t5 >>> 11) | (t6 << 5)) & 0x1f81;
t7 = U8TO16(key,14); this.r[8] = ((t6 >>> 8) | (t7 << 8)) & 0x1fff;
t5 = key[10] & 0xff | (key[11] & 0xff) << 8; this.r[6] = ((t4 >>> 14) | (t5 << 2)) & 0x1fff;
t6 = key[12] & 0xff | (key[13] & 0xff) << 8; this.r[7] = ((t5 >>> 11) | (t6 << 5)) & 0x1f81;
t7 = key[14] & 0xff | (key[15] & 0xff) << 8; this.r[8] = ((t6 >>> 8) | (t7 << 8)) & 0x1fff;
this.r[9] = ((t7 >>> 5)) & 0x007f;

for (i = 0; i < 8; i++) this.pad[i] = U8TO16(key, 16 + (2 * i));
this.pad[0] = key[16] & 0xff | (key[17] & 0xff) << 8;
this.pad[1] = key[18] & 0xff | (key[19] & 0xff) << 8;
this.pad[2] = key[20] & 0xff | (key[21] & 0xff) << 8;
this.pad[3] = key[22] & 0xff | (key[23] & 0xff) << 8;
this.pad[4] = key[24] & 0xff | (key[25] & 0xff) << 8;
this.pad[5] = key[26] & 0xff | (key[27] & 0xff) << 8;
this.pad[6] = key[28] & 0xff | (key[29] & 0xff) << 8;
this.pad[7] = key[30] & 0xff | (key[31] & 0xff) << 8;
};

poly1305.prototype.blocks = function(m, mpos, bytes) {
var hibit = this.fin ? 0 : (1 << 11);
var t0, t1, t2, t3, t4, t5, t6, t7;
var d = new Uint32Array(10);
var c, i, j;
var t0, t1, t2, t3, t4, t5, t6, t7, c;
var d0, d1, d2, d3, d4, d5, d6, d7, d8, d9;

var h0 = this.h[0],
h1 = this.h[1],
h2 = this.h[2],
h3 = this.h[3],
h4 = this.h[4],
h5 = this.h[5],
h6 = this.h[6],
h7 = this.h[7],
h8 = this.h[8],
h9 = this.h[9];

var r0 = this.r[0],
r1 = this.r[1],
r2 = this.r[2],
r3 = this.r[3],
r4 = this.r[4],
r5 = this.r[5],
r6 = this.r[6],
r7 = this.r[7],
r8 = this.r[8],
r9 = this.r[9];

while (bytes >= 16) {
t0 = U8TO16(m, mpos+0); this.h[0] += ( t0 ) & 0x1fff;
t1 = U8TO16(m, mpos+2); this.h[1] += ((t0 >>> 13) | (t1 << 3)) & 0x1fff;
t2 = U8TO16(m, mpos+4); this.h[2] += ((t1 >>> 10) | (t2 << 6)) & 0x1fff;
t3 = U8TO16(m, mpos+6); this.h[3] += ((t2 >>> 7) | (t3 << 9)) & 0x1fff;
t4 = U8TO16(m, mpos+8); this.h[4] += ((t3 >>> 4) | (t4 << 12)) & 0x1fff;
this.h[5] += ((t4 >>> 1)) & 0x1fff;
t5 = U8TO16(m, mpos+10); this.h[6] += ((t4 >>> 14) | (t5 << 2)) & 0x1fff;
t6 = U8TO16(m, mpos+12); this.h[7] += ((t5 >>> 11) | (t6 << 5)) & 0x1fff;
t7 = U8TO16(m, mpos+14); this.h[8] += ((t6 >>> 8) | (t7 << 8)) & 0x1fff;
this.h[9] += ((t7 >>> 5)) | hibit;

for (i = 0, c = 0; i < 10; i++) {
d[i] = c;
for (j = 0; j < 10; j++) {
d[i] += this.h[j] * ((j <= i) ? this.r[i - j] : (5 * this.r[i + 10 - j]));
if (j === 4) {
c = (d[i] >>> 13);
d[i] &= 0x1fff;
}
}
c += (d[i] >>> 13);
d[i] &= 0x1fff;
}
t0 = m[mpos+ 0] & 0xff | (m[mpos+ 1] & 0xff) << 8; h0 += ( t0 ) & 0x1fff;
t1 = m[mpos+ 2] & 0xff | (m[mpos+ 3] & 0xff) << 8; h1 += ((t0 >>> 13) | (t1 << 3)) & 0x1fff;
t2 = m[mpos+ 4] & 0xff | (m[mpos+ 5] & 0xff) << 8; h2 += ((t1 >>> 10) | (t2 << 6)) & 0x1fff;
t3 = m[mpos+ 6] & 0xff | (m[mpos+ 7] & 0xff) << 8; h3 += ((t2 >>> 7) | (t3 << 9)) & 0x1fff;
t4 = m[mpos+ 8] & 0xff | (m[mpos+ 9] & 0xff) << 8; h4 += ((t3 >>> 4) | (t4 << 12)) & 0x1fff;
h5 += ((t4 >>> 1)) & 0x1fff;
t5 = m[mpos+10] & 0xff | (m[mpos+11] & 0xff) << 8; h6 += ((t4 >>> 14) | (t5 << 2)) & 0x1fff;
t6 = m[mpos+12] & 0xff | (m[mpos+13] & 0xff) << 8; h7 += ((t5 >>> 11) | (t6 << 5)) & 0x1fff;
t7 = m[mpos+14] & 0xff | (m[mpos+15] & 0xff) << 8; h8 += ((t6 >>> 8) | (t7 << 8)) & 0x1fff;
h9 += ((t7 >>> 5)) | hibit;

c = 0;

d0 = c;
d0 += h0 * r0;
d0 += h1 * (5 * r9);
d0 += h2 * (5 * r8);
d0 += h3 * (5 * r7);
d0 += h4 * (5 * r6);
c = (d0 >>> 13); d0 &= 0x1fff;
d0 += h5 * (5 * r5);
d0 += h6 * (5 * r4);
d0 += h7 * (5 * r3);
d0 += h8 * (5 * r2);
d0 += h9 * (5 * r1);
c += (d0 >>> 13); d0 &= 0x1fff;

d1 = c;
d1 += h0 * r1;
d1 += h1 * r0;
d1 += h2 * (5 * r9);
d1 += h3 * (5 * r8);
d1 += h4 * (5 * r7);
c = (d1 >>> 13); d1 &= 0x1fff;
d1 += h5 * (5 * r6);
d1 += h6 * (5 * r5);
d1 += h7 * (5 * r4);
d1 += h8 * (5 * r3);
d1 += h9 * (5 * r2);
c += (d1 >>> 13); d1 &= 0x1fff;

d2 = c;
d2 += h0 * r2;
d2 += h1 * r1;
d2 += h2 * r0;
d2 += h3 * (5 * r9);
d2 += h4 * (5 * r8);
c = (d2 >>> 13); d2 &= 0x1fff;
d2 += h5 * (5 * r7);
d2 += h6 * (5 * r6);
d2 += h7 * (5 * r5);
d2 += h8 * (5 * r4);
d2 += h9 * (5 * r3);
c += (d2 >>> 13); d2 &= 0x1fff;

d3 = c;
d3 += h0 * r3;
d3 += h1 * r2;
d3 += h2 * r1;
d3 += h3 * r0;
d3 += h4 * (5 * r9);
c = (d3 >>> 13); d3 &= 0x1fff;
d3 += h5 * (5 * r8);
d3 += h6 * (5 * r7);
d3 += h7 * (5 * r6);
d3 += h8 * (5 * r5);
d3 += h9 * (5 * r4);
c += (d3 >>> 13); d3 &= 0x1fff;

d4 = c;
d4 += h0 * r4;
d4 += h1 * r3;
d4 += h2 * r2;
d4 += h3 * r1;
d4 += h4 * r0;
c = (d4 >>> 13); d4 &= 0x1fff;
d4 += h5 * (5 * r9);
d4 += h6 * (5 * r8);
d4 += h7 * (5 * r7);
d4 += h8 * (5 * r6);
d4 += h9 * (5 * r5);
c += (d4 >>> 13); d4 &= 0x1fff;

d5 = c;
d5 += h0 * r5;
d5 += h1 * r4;
d5 += h2 * r3;
d5 += h3 * r2;
d5 += h4 * r1;
c = (d5 >>> 13); d5 &= 0x1fff;
d5 += h5 * r0;
d5 += h6 * (5 * r9);
d5 += h7 * (5 * r8);
d5 += h8 * (5 * r7);
d5 += h9 * (5 * r6);
c += (d5 >>> 13); d5 &= 0x1fff;

d6 = c;
d6 += h0 * r6;
d6 += h1 * r5;
d6 += h2 * r4;
d6 += h3 * r3;
d6 += h4 * r2;
c = (d6 >>> 13); d6 &= 0x1fff;
d6 += h5 * r1;
d6 += h6 * r0;
d6 += h7 * (5 * r9);
d6 += h8 * (5 * r8);
d6 += h9 * (5 * r7);
c += (d6 >>> 13); d6 &= 0x1fff;

d7 = c;
d7 += h0 * r7;
d7 += h1 * r6;
d7 += h2 * r5;
d7 += h3 * r4;
d7 += h4 * r3;
c = (d7 >>> 13); d7 &= 0x1fff;
d7 += h5 * r2;
d7 += h6 * r1;
d7 += h7 * r0;
d7 += h8 * (5 * r9);
d7 += h9 * (5 * r8);
c += (d7 >>> 13); d7 &= 0x1fff;

d8 = c;
d8 += h0 * r8;
d8 += h1 * r7;
d8 += h2 * r6;
d8 += h3 * r5;
d8 += h4 * r4;
c = (d8 >>> 13); d8 &= 0x1fff;
d8 += h5 * r3;
d8 += h6 * r2;
d8 += h7 * r1;
d8 += h8 * r0;
d8 += h9 * (5 * r9);
c += (d8 >>> 13); d8 &= 0x1fff;

d9 = c;
d9 += h0 * r9;
d9 += h1 * r8;
d9 += h2 * r7;
d9 += h3 * r6;
d9 += h4 * r5;
c = (d9 >>> 13); d9 &= 0x1fff;
d9 += h5 * r4;
d9 += h6 * r3;
d9 += h7 * r2;
d9 += h8 * r1;
d9 += h9 * r0;
c += (d9 >>> 13); d9 &= 0x1fff;

c = (((c << 2) + c)) | 0;
c = (c + d[0]) | 0;
d[0] = c & 0x1fff;
c = (c + d0) | 0;
d0 = c & 0x1fff;
c = (c >>> 13);
d[1] += c;

for (i = 0; i < 10; i++) this.h[i] = d[i];
d1 += c;

h0 = d0;
h1 = d1;
h2 = d2;
h3 = d3;
h4 = d4;
h5 = d5;
h6 = d6;
h7 = d7;
h8 = d8;
h9 = d9;

mpos += 16;
bytes -= 16;
}
this.h[0] = h0;
this.h[1] = h1;
this.h[2] = h2;
this.h[3] = h3;
this.h[4] = h4;
this.h[5] = h5;
this.h[6] = h6;
this.h[7] = h7;
this.h[8] = h8;
this.h[9] = h9;
};

poly1305.prototype.finish = function(mac, macpos) {
Expand Down Expand Up @@ -594,7 +768,22 @@ poly1305.prototype.finish = function(mac, macpos) {
this.h[i] = f & 0xffff;
}

for (i = 0; i < 8; i++) U16TO8(mac, macpos + i*2, this.h[i]);
mac[macpos+ 0] = (this.h[0] >>> 0) & 0xff;
mac[macpos+ 1] = (this.h[0] >>> 8) & 0xff;
mac[macpos+ 2] = (this.h[1] >>> 0) & 0xff;
mac[macpos+ 3] = (this.h[1] >>> 8) & 0xff;
mac[macpos+ 4] = (this.h[2] >>> 0) & 0xff;
mac[macpos+ 5] = (this.h[2] >>> 8) & 0xff;
mac[macpos+ 6] = (this.h[3] >>> 0) & 0xff;
mac[macpos+ 7] = (this.h[3] >>> 8) & 0xff;
mac[macpos+ 8] = (this.h[4] >>> 0) & 0xff;
mac[macpos+ 9] = (this.h[4] >>> 8) & 0xff;
mac[macpos+10] = (this.h[5] >>> 0) & 0xff;
mac[macpos+11] = (this.h[5] >>> 8) & 0xff;
mac[macpos+12] = (this.h[6] >>> 0) & 0xff;
mac[macpos+13] = (this.h[6] >>> 8) & 0xff;
mac[macpos+14] = (this.h[7] >>> 0) & 0xff;
mac[macpos+15] = (this.h[7] >>> 8) & 0xff;
};

poly1305.prototype.update = function(m, mpos, bytes) {
Expand Down
Loading

0 comments on commit 819624d

Please sign in to comment.