Skip to content

Commit

Permalink
v2
Browse files Browse the repository at this point in the history
  • Loading branch information
monkins1010 committed Feb 3, 2019
1 parent 1c31b30 commit f5e0be8
Show file tree
Hide file tree
Showing 4 changed files with 28 additions and 41 deletions.
14 changes: 7 additions & 7 deletions verus/haraka.c
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ Optimized Implementations for Haraka256 and Haraka512

#include <stdio.h>
#include "haraka.h"

#include <stdint.h>
/* 40-entry table of 128-bit constants. The AES4-style macros in haraka.h
 * index it as rc[rci + k], and haraka.h declares it `extern u128 rc[40]`.
 * NOTE(review): no initializer here; presumably filled by load_constants()
 * before first use — confirm. */
u128 rc[40];
/* Table of the same length, explicitly zero-initialized. The AES4_zero macro
 * in haraka.h reads rc0[rci] where the other AES4 macros read rc[rci]. */
u128 rc0[40] = {0};

Expand Down Expand Up @@ -434,7 +434,6 @@ void haraka512_keyed(unsigned char *out, const unsigned char *in, const u128 *rc
s[3] = LOAD(in + 48);

AES4(s[0], s[1], s[2], s[3], 0);

MIX4(s[0], s[1], s[2], s[3]);

AES4(s[0], s[1], s[2], s[3], 8);
Expand All @@ -444,17 +443,18 @@ void haraka512_keyed(unsigned char *out, const unsigned char *in, const u128 *rc
MIX4(s[0], s[1], s[2], s[3]);

AES4(s[0], s[1], s[2], s[3], 24);
MIX4(s[0], s[1], s[2], s[3]);
MIX4_LAST(s[0], s[1], s[2], s[3]);

AES4_LAST(s[0], s[1], s[2], s[3], 32);

AES4(s[0], s[1], s[2], s[3], 32);
MIX4LAST(s[0], s[1], s[2], s[3]);

// s[0] = _mm_xor_si128(s[0], LOAD(in));
// s[1] = _mm_xor_si128(s[1], LOAD(in + 16));
// s[2] = _mm_xor_si128(s[2], LOAD(in + 32));
s[3] = _mm_xor_si128(s[3], LOAD(in + 48));
// s[3] = _mm_xor_si128(s[0], LOAD(in + 48));
((uint32_t*)&out[0])[7] = ((uint32_t*)&s[0])[10] ^ ((uint32_t*)&in[52])[0];

TRUNCSTORE(out, s[0], s[1], s[2], s[3]);
//TRUNCSTORE(out, s[0],s[1], s[2], s[3]);
}

void haraka512_4x(unsigned char *out, const unsigned char *in) {
Expand Down
34 changes: 10 additions & 24 deletions verus/haraka.h
Original file line number Diff line number Diff line change
Expand Up @@ -175,15 +175,10 @@ extern u128 rc[40];
s2 = _mm_aesenc_si128(s2, rc[rci + 6]); \
s3 = _mm_aesenc_si128(s3, rc[rci + 7]); \

#define AES4LAST(s0, s1, s2, s3, rci) \
s0 = _mm_aesenc_si128(s0, rc[rci]); \
s1 = _mm_aesenc_si128(s1, rc[rci + 1]); \
#define AES4_LAST(s0, s1, s2, s3, rci) \
s2 = _mm_aesenc_si128(s2, rc[rci + 2]); \
s3 = _mm_aesenc_si128(s3, rc[rci + 3]); \
s0 = _mm_aesenc_si128(s0, rc[rci + 4]); \
s1 = _mm_aesenc_si128(s1, rc[rci + 5]); \
s2 = _mm_aesenc_si128(s2, rc[rci + 6]); \
s3 = _mm_aesenc_si128(s3, rc[rci + 7]); \
s2 = _mm_aesenc_si128(s2, rc[rci + 6]);


#define AES4_zero(s0, s1, s2, s3, rci) \
s0 = _mm_aesenc_si128(s0, rc0[rci]); \
Expand Down Expand Up @@ -220,24 +215,15 @@ extern u128 rc[40];
s2 = _mm_unpackhi_epi32(s1, tmp); \
s1 = _mm_unpacklo_epi32(s1, tmp);

#define MIX4LAST(s0, s1, s2, s3) \
s0 = _mm_unpackhi_epi32(s0, s1); \
s2 = _mm_unpackhi_epi32(s2, s3); \
s3 = _mm_unpacklo_epi32(s0, s2);

#define MIX4_LASTBUT1(s0, s1, s2, s3) \
#define MIX4_LAST(s0, s1, s2, s3) \
tmp = _mm_unpacklo_epi32(s0, s1); \
s1 = _mm_unpacklo_epi32(s2, s3); \
s2 = _mm_unpackhi_epi32(s1, tmp);

#define AES4_LAST(s3, rci) \
_mm_aesenc_si128(s3, rc[rci + 2]); \
_mm_aesenc_si128(s3, rc[rci + 6]); \

#define TRUNCSTORE(out, s0, s1, s2, s3) \
*(u64*)(out) = *(((u64*)&s0 + 1)); \
*(u64*)(out + 8) = *(((u64*)&s1 + 1)); \
*(u64*)(out + 16) = *(((u64*)&s2 + 0)); \
s2 = _mm_unpackhi_epi32(s1, tmp);

/*
 * TRUNCSTORE(out, s0, s1, s2, s3)
 *
 * Writes one u64 word taken from each of the four state values to `out`:
 *   out +  0  <- u64 word 1 of s0
 *   out +  8  <- u64 word 1 of s1
 *   out + 16  <- u64 word 0 of s2
 *   out + 24  <- u64 word 0 of s3
 * "Word k" means ((u64*)&sN)[k], so every sN argument must be an addressable
 * lvalue (the macro takes its address).
 *
 * The expansion is four separate statements and is not wrapped in
 * do { } while (0); do not use it as the unbraced body of an if/else/loop.
 *
 * NOTE(review): assumes u64 is an 8-byte type and `out` is a byte pointer, so
 * the four stores are contiguous and cover 32 bytes — confirm against the
 * typedefs and callers. The stores go through casted u64 pointers, so `out`
 * is also presumably expected to be suitably aligned — verify.
 */
#define TRUNCSTORE(out, s0, s1, s2, s3) \
*(u64*)(out) = *(((u64*)&s0 + 1)); \
*(u64*)(out + 8) = *(((u64*)&s1 + 1)); \
*(u64*)(out + 16) = *(((u64*)&s2 + 0)); \
*(u64*)(out + 24) = *(((u64*)&s3 + 0));

void load_constants();
Expand Down
13 changes: 7 additions & 6 deletions verus/verus_clhash.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@

#include <assert.h>
#include <string.h>
#include "immintrin.h"
//#include <intrin.h>
//#include "cpu_verushash.hpp"

#ifdef _WIN32
Expand Down Expand Up @@ -76,7 +76,7 @@ static inline uint64_t precompReduction64( __m128i A) {
}

// verus intermediate hash extra
static __m128i __verusclmulwithoutreduction64alignedrepeat(__m128i *randomsource, const __m128i buf[4], uint64_t keyMask, uint32_t *fixrand, uint32_t *fixrandex)
static __m128i __verusclmulwithoutreduction64alignedrepeat(__m128i * __restrict randomsource, const __m128i * __restrict buf, uint64_t keyMask, uint32_t * __restrict fixrand, uint32_t * __restrict fixrandex)
{
__m128i const *pbuf;

Expand All @@ -101,6 +101,7 @@ static __m128i __verusclmulwithoutreduction64alignedrepeat(__m128i *randomsource
// The random buffer must hold at least 32 16-byte (__m128i) elements after the keyMask for this
// algorithm to work. We take the value from the last element inside the keyMask + 2, as that will never
// be XORed into the accumulator before it has first been hashed with other values.

for (uint64_t i = 0; i < 32; i++)
{

Expand Down Expand Up @@ -402,7 +403,7 @@ static __m128i __verusclmulwithoutreduction64alignedrepeat(__m128i *randomsource

// Hashes exactly 64 bytes by doing a carryless multiplication and reduction of the repeated 64-byte sequence 16 times,
// returning a 64-bit hash value.
uint64_t verusclhash(void * random, const unsigned char buf[64], uint64_t keyMask, uint32_t *fixrand, uint32_t *fixrandex) {
uint64_t verusclhash(void * random, const unsigned char buf[64], uint64_t keyMask, uint32_t * __restrict fixrand, uint32_t * __restrict fixrandex) {
const __m128i lazy = _mm_cvtsi32_si128( 0x00010000);
__m128i acc = __verusclmulwithoutreduction64alignedrepeat((__m128i *)random, (const __m128i *)buf, keyMask, fixrand, fixrandex);
acc = _mm_xor_si128(acc, lazy);
Expand Down Expand Up @@ -430,9 +431,9 @@ inline void haraka512_keyed_local(unsigned char *out, const unsigned char *in, c

AES4(s[0], s[1], s[2], s[3], 24);

MIX4_LASTBUT1(s[0], s[1], s[2], s[3]);
//MIX4_LASTBUT1(s[0], s[1], s[2], s[3]);

AES4_LAST(s[2], 32);
// AES4_LAST(s[2], 32);

// MIX4(s[0], s[1], s[2], s[3]);

Expand All @@ -441,7 +442,7 @@ inline void haraka512_keyed_local(unsigned char *out, const unsigned char *in, c

// s[0] = _mm_xor_si128(s[0], LOAD(in));
// s[1] = _mm_xor_si128(s[1], LOAD(in + 16));
s[2] = _mm_xor_si128(s[2], LOAD(in + 46));
// s[2] = _mm_xor_si128(s[2], LOAD(in + 46));
// s[3] = _mm_xor_si128(s[3], LOAD(in + 48));

// TRUNCSTORE(out, s[0], s[1], s[2], s[3]);
Expand Down
8 changes: 4 additions & 4 deletions verus/verusscan.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -129,7 +129,7 @@ extern "C" void VerusHashHalf(void *result2, unsigned char *data, size_t len)


extern "C" void Verus2hash(unsigned char *hash, unsigned char *curBuf, uint32_t nonce,
u128 *data_key, uint8_t *gpu_init, uint32_t *fixrand, uint32_t *fixrandex, u128 *data_key_master)
u128 * __restrict data_key, uint8_t *gpu_init, uint32_t * __restrict fixrand, uint32_t * __restrict fixrandex, u128 * __restrict data_key_master)
{
uint64_t mask = VERUS_KEY_SIZE128; //552
if (!gpu_init[0]) {
Expand Down Expand Up @@ -203,7 +203,7 @@ extern "C" int scanhash_verus(int thr_id, struct work *work, uint32_t max_nonce,
*hashes_done = nonce_buf + throughput;
Verus2hash((unsigned char *)vhash, (unsigned char *)blockhash_half, nonce_buf, data_key, &gpuinit, fixrand, fixrandex, data_key_master);

if (vhash[7] <= Htarg && fulltest(vhash, ptarget))
if (vhash[7] <= Htarg )
{
*((uint32_t *)full_data + 368) = nonce_buf;
work->valid_nonces++;
Expand Down Expand Up @@ -235,8 +235,8 @@ extern "C" int scanhash_verus(int thr_id, struct work *work, uint32_t max_nonce,
solps = (double)nonce_buf / secs;

pdata[NONCE_OFT] = endiandata[NONCE_OFT] + 1;
free(data_key);
free(data_key_master);
//free(data_key);
//free(data_key_master);
return work->valid_nonces;
}

Expand Down

0 comments on commit f5e0be8

Please sign in to comment.