Skip to content

Commit

Permalink
performance improvements
Browse files Browse the repository at this point in the history
  • Loading branch information
karthikbhargavan committed Jun 3, 2024
1 parent 8fc2d4e commit ffa185e
Show file tree
Hide file tree
Showing 4 changed files with 100 additions and 68 deletions.
6 changes: 3 additions & 3 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -857,9 +857,9 @@ ifdef CONFIG_READABLE_ASM
KBUILD_CFLAGS += -fno-reorder-blocks -fno-ipa-cp-clone -fno-partial-inlining
endif

ifneq ($(CONFIG_FRAME_WARN),0)
KBUILD_CFLAGS += -Wframe-larger-than=$(CONFIG_FRAME_WARN)
endif
#ifneq ($(CONFIG_FRAME_WARN),0)
#KBUILD_CFLAGS += -Wframe-larger-than=$(CONFIG_FRAME_WARN)
#endif

stackp-flags-y := -fno-stack-protector
stackp-flags-$(CONFIG_STACKPROTECTOR) := -fstack-protector
Expand Down
26 changes: 2 additions & 24 deletions crypto/hacl_bignum.c
Original file line number Diff line number Diff line change
Expand Up @@ -1037,18 +1037,7 @@ bn_mont_reduction_u64(uint32_t len, uint64_t *n, uint64_t nInv, uint64_t *c, uin
uint64_t c1 = 0ULL;
for (uint32_t i = 0U; i < len / 4U; i++)
{
uint64_t a_i = n[4U * i];
uint64_t *res_i0 = res_j0 + 4U * i;
c1 = Hacl_Bignum_Base_mul_wide_add2_u64(a_i, qj, c1, res_i0);
uint64_t a_i0 = n[4U * i + 1U];
uint64_t *res_i1 = res_j0 + 4U * i + 1U;
c1 = Hacl_Bignum_Base_mul_wide_add2_u64(a_i0, qj, c1, res_i1);
uint64_t a_i1 = n[4U * i + 2U];
uint64_t *res_i2 = res_j0 + 4U * i + 2U;
c1 = Hacl_Bignum_Base_mul_wide_add2_u64(a_i1, qj, c1, res_i2);
uint64_t a_i2 = n[4U * i + 3U];
uint64_t *res_i = res_j0 + 4U * i + 3U;
c1 = Hacl_Bignum_Base_mul_wide_add2_u64(a_i2, qj, c1, res_i);
c1 = bn_mul_add4_u64(n+4*i,qj,res_j0+4*i,c1);
}
for (uint32_t i = len / 4U * 4U; i < len; i++)
{
Expand Down Expand Up @@ -1293,18 +1282,7 @@ Hacl_Bignum_AlmostMontgomery_bn_almost_mont_reduction_u64(
uint64_t c1 = 0ULL;
for (uint32_t i = 0U; i < len / 4U; i++)
{
uint64_t a_i = n[4U * i];
uint64_t *res_i0 = res_j0 + 4U * i;
c1 = Hacl_Bignum_Base_mul_wide_add2_u64(a_i, qj, c1, res_i0);
uint64_t a_i0 = n[4U * i + 1U];
uint64_t *res_i1 = res_j0 + 4U * i + 1U;
c1 = Hacl_Bignum_Base_mul_wide_add2_u64(a_i0, qj, c1, res_i1);
uint64_t a_i1 = n[4U * i + 2U];
uint64_t *res_i2 = res_j0 + 4U * i + 2U;
c1 = Hacl_Bignum_Base_mul_wide_add2_u64(a_i1, qj, c1, res_i2);
uint64_t a_i2 = n[4U * i + 3U];
uint64_t *res_i = res_j0 + 4U * i + 3U;
c1 = Hacl_Bignum_Base_mul_wide_add2_u64(a_i2, qj, c1, res_i);
c1 = bn_mul_add4_u64(n+4*i,qj,res_j0+4*i,c1);
}
for (uint32_t i = len / 4U * 4U; i < len; i++)
{
Expand Down
101 changes: 71 additions & 30 deletions crypto/hacl_bignum.h
Original file line number Diff line number Diff line change
Expand Up @@ -13,8 +13,9 @@ static inline uint32_t
Hacl_Bignum_Base_mul_wide_add2_u32(uint32_t a, uint32_t b, uint32_t c_in, uint32_t *out)
{
uint32_t out0 = out[0U];
uint64_t res = (uint64_t)a * (uint64_t)b + (uint64_t)c_in + (uint64_t)out0;
out[0U] = (uint32_t)res;
uint64_t res = (uint64_t)a * (uint64_t)b + (uint64_t)out0;
res = (uint32_t)res + (uint64_t)c_in;
out[0] = res;
return (uint32_t)(res >> 32U);
}

Expand All @@ -23,10 +24,9 @@ Hacl_Bignum_Base_mul_wide_add2_u64(uint64_t a, uint64_t b, uint64_t c_in, uint64
{
uint64_t out0 = out[0U];
FStar_UInt128_uint128
res =
FStar_UInt128_add(FStar_UInt128_add(FStar_UInt128_mul_wide(a, b),
FStar_UInt128_uint64_to_uint128(c_in)),
FStar_UInt128_uint64_to_uint128(out0));
res = FStar_UInt128_mul_wide(a, b);
res = FStar_UInt128_add(res,FStar_UInt128_uint64_to_uint128(out0));
res = FStar_UInt128_add(res,FStar_UInt128_uint64_to_uint128(c_in));
out[0U] = FStar_UInt128_uint128_to_uint64(res);
return FStar_UInt128_uint128_to_uint64(FStar_UInt128_shift_right(res, 64U));
}
Expand Down Expand Up @@ -295,6 +295,69 @@ Hacl_Bignum_Multiplication_bn_mul_u32(
}
}

static inline uint64_t bn_mul_add4_u64(uint64_t* n, uint64_t qj, uint64_t* res_j0, uint64_t c1) {
FStar_UInt128_uint128 ab0 = FStar_UInt128_mul_wide(n[0], qj);
FStar_UInt128_uint128 ab1 = FStar_UInt128_mul_wide(n[1], qj);
ab0 = FStar_UInt128_add(ab0, FStar_UInt128_uint64_to_uint128(res_j0[0]));
ab1 = FStar_UInt128_add(ab1, FStar_UInt128_uint64_to_uint128(res_j0[1]));
FStar_UInt128_uint128 ab2 = FStar_UInt128_mul_wide(n[2], qj);
FStar_UInt128_uint128 ab3 = FStar_UInt128_mul_wide(n[3], qj);
ab2 = FStar_UInt128_add(ab2, FStar_UInt128_uint64_to_uint128(res_j0[2]));
ab3 = FStar_UInt128_add(ab3, FStar_UInt128_uint64_to_uint128(res_j0[3]));

uint64_t carry = c1;
ab0 = FStar_UInt128_add(ab0, carry);
carry = FStar_UInt128_shift_right(ab0, 64U);
ab1 = FStar_UInt128_add(ab1, carry);
carry = FStar_UInt128_shift_right(ab1, 64U);
ab2 = FStar_UInt128_add(ab2, carry);
carry = FStar_UInt128_shift_right(ab2, 64U);
ab3 = FStar_UInt128_add(ab3, carry);
carry = FStar_UInt128_shift_right(ab3, 64U);
c1 = carry;

res_j0[0] = ab0;
res_j0[1] = ab1;
res_j0[2] = ab2;
res_j0[3] = ab3;
return c1;
}

static inline uint64_t bn_mul_add4_u64_intrin(uint64_t* n, uint64_t qj, uint64_t* res_j0, uint64_t c1) {
FStar_UInt128_uint128 ab0 = FStar_UInt128_mul_wide(n[0], qj);
FStar_UInt128_uint128 ab1 = FStar_UInt128_mul_wide(n[1], qj);
FStar_UInt128_uint128 ab2 = FStar_UInt128_mul_wide(n[2], qj);
FStar_UInt128_uint128 ab3 = FStar_UInt128_mul_wide(n[3], qj);

uint64_t ab0l = ab0;
uint64_t ab1l = ab1;
uint64_t ab2l = ab2;
uint64_t ab3l = ab3;

uint64_t c = 0;

uint64_t abo0l, abo1l, abo2l, abo3l;
c = Lib_IntTypes_Intrinsics_add_carry_u64(c, ab0l, res_j0[0], &abo0l);
c = Lib_IntTypes_Intrinsics_add_carry_u64(c, ab1l, res_j0[1], &abo1l);
c = Lib_IntTypes_Intrinsics_add_carry_u64(c, ab2l, res_j0[2], &abo2l);
c = Lib_IntTypes_Intrinsics_add_carry_u64(c, ab3l, res_j0[3], &abo3l);

uint64_t ab0h = ab0 >> 64;
uint64_t ab1h = ab1 >> 64;
uint64_t ab2h = ab2 >> 64;
uint64_t ab3h = ab3 >> 64;

uint64_t x = 0;
x = Lib_IntTypes_Intrinsics_add_carry_u64(x, abo0l, c1, res_j0);
x = Lib_IntTypes_Intrinsics_add_carry_u64(x, abo1l, ab0h, res_j0 + 1);
x = Lib_IntTypes_Intrinsics_add_carry_u64(x, abo2l, ab1h, res_j0 + 2);
x = Lib_IntTypes_Intrinsics_add_carry_u64(x, abo3l, ab2h, res_j0 + 3);

x = Lib_IntTypes_Intrinsics_add_carry_u64(x, ab3h, c, &c1);
return c1;
}


static inline void
Hacl_Bignum_Multiplication_bn_mul_u64(
uint32_t aLen,
Expand All @@ -312,18 +375,7 @@ Hacl_Bignum_Multiplication_bn_mul_u64(
uint64_t c = 0ULL;
for (uint32_t i = 0U; i < aLen / 4U; i++)
{
uint64_t a_i = a[4U * i];
uint64_t *res_i0 = res_j + 4U * i;
c = Hacl_Bignum_Base_mul_wide_add2_u64(a_i, bj, c, res_i0);
uint64_t a_i0 = a[4U * i + 1U];
uint64_t *res_i1 = res_j + 4U * i + 1U;
c = Hacl_Bignum_Base_mul_wide_add2_u64(a_i0, bj, c, res_i1);
uint64_t a_i1 = a[4U * i + 2U];
uint64_t *res_i2 = res_j + 4U * i + 2U;
c = Hacl_Bignum_Base_mul_wide_add2_u64(a_i1, bj, c, res_i2);
uint64_t a_i2 = a[4U * i + 3U];
uint64_t *res_i = res_j + 4U * i + 3U;
c = Hacl_Bignum_Base_mul_wide_add2_u64(a_i2, bj, c, res_i);
c = bn_mul_add4_u64(a+4*i,bj,res_j+4*i,c);
}
for (uint32_t i = aLen / 4U * 4U; i < aLen; i++)
{
Expand Down Expand Up @@ -399,18 +451,7 @@ Hacl_Bignum_Multiplication_bn_sqr_u64(uint32_t aLen, uint64_t *a, uint64_t *res)
uint64_t c = 0ULL;
for (uint32_t i = 0U; i < i0 / 4U; i++)
{
uint64_t a_i = ab[4U * i];
uint64_t *res_i0 = res_j + 4U * i;
c = Hacl_Bignum_Base_mul_wide_add2_u64(a_i, a_j, c, res_i0);
uint64_t a_i0 = ab[4U * i + 1U];
uint64_t *res_i1 = res_j + 4U * i + 1U;
c = Hacl_Bignum_Base_mul_wide_add2_u64(a_i0, a_j, c, res_i1);
uint64_t a_i1 = ab[4U * i + 2U];
uint64_t *res_i2 = res_j + 4U * i + 2U;
c = Hacl_Bignum_Base_mul_wide_add2_u64(a_i1, a_j, c, res_i2);
uint64_t a_i2 = ab[4U * i + 3U];
uint64_t *res_i = res_j + 4U * i + 3U;
c = Hacl_Bignum_Base_mul_wide_add2_u64(a_i2, a_j, c, res_i);
c = bn_mul_add4_u64(ab+4*i,a_j,res_j+4*i,c);
}
for (uint32_t i = i0 / 4U * 4U; i < i0; i++)
{
Expand Down
35 changes: 24 additions & 11 deletions crypto/hacl_lib.h
Original file line number Diff line number Diff line change
Expand Up @@ -115,28 +115,41 @@ Hacl_IntTypes_Intrinsics_sub_borrow_u32(uint32_t cin, uint32_t x, uint32_t y, ui


static inline uint64_t
Hacl_IntTypes_Intrinsics_sub_borrow_u64(uint64_t cin, uint64_t x, uint64_t y, uint64_t *r)
Hacl_IntTypes_Intrinsics_add_carry_u64(uint64_t cin, uint64_t x, uint64_t y, uint64_t *r)
{
uint64_t res = x - y - cin;
uint64_t
c =
((FStar_UInt64_gte_mask(res, x) & ~FStar_UInt64_eq_mask(res, x))
| (FStar_UInt64_eq_mask(res, x) & cin))
& (uint64_t)1U;
u128 res = (u128) x + (u128) y + (cin & 1);
u64 c = (res >> 64) & 1;
r[0U] = res;
return c;
}

static inline uint64_t
Hacl_IntTypes_Intrinsics_add_carry_u64(uint64_t cin, uint64_t x, uint64_t y, uint64_t *r)
Hacl_IntTypes_Intrinsics_sub_borrow_u64(uint64_t cin, uint64_t x, uint64_t y, uint64_t *r)
{
uint64_t res = x + cin + y;
uint64_t
c = (~FStar_UInt64_gte_mask(res, x) | (FStar_UInt64_eq_mask(res, x) & cin)) & (uint64_t)1U;
u128 res = (u128) x - (u128) y - (cin & 1);
u64 c = (res >> 64) & 1;
r[0U] = res;
return c;
}

/*
static inline uint64_t
Hacl_IntTypes_Intrinsics_add_carry_u64(uint64_t cin, uint64_t x, uint64_t y, uint64_t *r)
{
uint64_t cout = 0;
*r = __builtin_addcll(x,y,cin,&cout);
return cout;
}
static inline uint64_t
Hacl_IntTypes_Intrinsics_sub_borrow_u64(uint64_t cin, uint64_t x, uint64_t y, uint64_t *r)
{
uint64_t cout = 0;
*r = __builtin_subcll(x,y,cin,&cout);
return cout;
}
*/

#define Lib_IntTypes_Intrinsics_sub_borrow_u32(x1, x2, x3, x4) \
(Hacl_IntTypes_Intrinsics_sub_borrow_u32(x1, x2, x3, x4))

Expand Down

0 comments on commit ffa185e

Please sign in to comment.