Skip to content

Commit

Permalink
Update version
Browse files Browse the repository at this point in the history
  • Loading branch information
monkins1010 committed May 20, 2023
1 parent 1bd9340 commit 7c739a0
Show file tree
Hide file tree
Showing 3 changed files with 5 additions and 346 deletions.
2 changes: 1 addition & 1 deletion configure.ac
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
AC_INIT([ccminer_CPU], [3.7.1], [], [ccminer], [http://github.com/monkins1010/ccminer])
AC_INIT([ccminer_CPU], [3.8.0], [], [ccminer], [http://github.com/monkins1010/ccminer])

AC_PREREQ([2.59c])
AC_CANONICAL_SYSTEM
Expand Down
338 changes: 0 additions & 338 deletions verus/verus_clhash.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -57,336 +57,6 @@ uint64_t precompReduction64(__m128i A) {
return _mm_cvtsi128_si64(precompReduction64_si128(A));
}

// Verus intermediate hash (v2.1 variant) that also records which key entries it touched.
//
// Runs 32 mixing rounds. In each round the low 64 bits of the accumulator act as a
// "selector" that picks:
//   - two entries of the key (prand / prandex), indexed via keyMask,
//   - a starting block inside a 4-entry working copy of buf,
//   - one of eight mixing routines (selector & 0x1c).
// Every routine folds key and buffer material into the accumulator and mutates the
// selected key entries in place.
//
// Before a round mutates anything, the selected entries' current contents are copied
// to g_prand[i] / g_prandex[i] and their indices to fixrand[i] / fixrandex[i].
//
// Parameters:
//   randomsource  key buffer; accessed with aligned 128-bit loads/stores, so it must be
//                 16-byte aligned. Entry 513 seeds the accumulator, and rounds may read up
//                 to 8 consecutive entries starting at a selected index.
//   buf           four 128-bit input blocks.
//   keyMask       mask applied to the selector to obtain a key index, in units of
//                 __m128i entries (no byte-to-entry shift is applied here).
//   fixrand       out: 32 indices of the prand entry selected in each round.
//   fixrandex     out: 32 indices of the prandex entry selected in each round.
//   g_prand       out: 32 saved pre-round values of the prand entries.
//   g_prandex     out: 32 saved pre-round values of the prandex entries.
//
// Returns the 128-bit accumulator after the final round.
__m128i __verusclmulwithoutreduction64alignedrepeatv2_1(__m128i *randomsource, const __m128i buf[4], uint64_t keyMask,
    uint32_t *fixrand, uint32_t *fixrandex, u128 *g_prand, u128 *g_prandex)
{
    const __m128i *pbuf;
    // Working copy of the input: the first two blocks are pre-mixed with the last two.
    const __m128i pbuf_copy[4] = { _mm_xor_si128(buf[0], buf[2]), _mm_xor_si128(buf[1], buf[3]), buf[2], buf[3] };

    // keyMask is used as-is; callers are expected to pass it already expressed in
    // __m128i entries rather than bytes.

    __m128i acc = _mm_load_si128(randomsource + 513);

    // the random buffer must have at least 32 16 byte dwords after the keymask to work with this
    // algorithm. we take the value from the last element inside the keyMask + 2, as that will never
    // be used to xor into the accumulator before it is hashed with other values first

    for (uint32_t i = 0; i < 32; i++)
    {
        const uint64_t selector = _mm_cvtsi128_si64(acc);

        uint32_t prand_idx = (selector >> 5) & keyMask;
        uint32_t prandex_idx = (selector >> 32) & keyMask;
        // get two random locations in the key, which will be mutated and swapped
        __m128i *prand = randomsource + prand_idx;
        __m128i *prandex = randomsource + prandex_idx;

        // select random start and order of pbuf processing
        pbuf = pbuf_copy + (selector & 3);

        // Record the untouched key entries and their indices before this round mutates them.
        _mm_store_si128(&g_prand[i], prand[0]);
        _mm_store_si128(&g_prandex[i], prandex[0]);
        fixrand[i] = prand_idx;
        fixrandex[i] = prandex_idx;

        // Note on pbuf[(selector & 1) ? -1 : 1]: pbuf points at pbuf_copy[selector & 3], so an
        // odd selector (index 1 or 3) steps back one block and an even selector (index 0 or 2)
        // steps forward one block; the access always stays inside pbuf_copy.
        switch (selector & 0x1c)
        {
        case 0:
        {
            const __m128i temp1 = _mm_load_si128(prandex);
            const __m128i temp2 = pbuf[(selector & 1) ? -1 : 1];
            const __m128i add1 = _mm_xor_si128(temp1, temp2);
            const __m128i clprod1 = _mm_clmulepi64_si128(add1, add1, 0x10);
            acc = _mm_xor_si128(clprod1, acc);

            const __m128i tempa1 = _mm_mulhrs_epi16(acc, temp1);

            // Read prand before overwriting it with the value derived from prandex.
            const __m128i temp12 = _mm_load_si128(prand);
            *prand = _mm_xor_si128(tempa1, temp1);

            const __m128i temp22 = _mm_load_si128(pbuf);
            const __m128i add12 = _mm_xor_si128(temp12, temp22);
            const __m128i clprod12 = _mm_clmulepi64_si128(add12, add12, 0x10);
            acc = _mm_xor_si128(clprod12, acc);

            const __m128i tempb1 = _mm_mulhrs_epi16(acc, temp12);
            *prandex = _mm_xor_si128(tempb1, temp12);

            break;
        }
        case 4:
        {
            const __m128i temp1 = _mm_load_si128(prand);
            const __m128i temp2 = _mm_load_si128(pbuf);
            const __m128i add1 = _mm_xor_si128(temp1, temp2);
            const __m128i clprod1 = _mm_clmulepi64_si128(add1, add1, 0x10);
            acc = _mm_xor_si128(clprod1, acc);
            __m128i clprod2 = _mm_clmulepi64_si128(temp2, temp2, 0x10);
            acc = _mm_xor_si128(clprod2, acc);

            const __m128i tempa1 = _mm_mulhrs_epi16(acc, temp1);

            // Read prandex before overwriting it with the value derived from prand.
            // The first mutated value goes to prandex and the second to prand, mirroring
            // the swap done by the other cases. (Storing both to prand would make the first
            // store dead and leave prandex unmodified.)
            const __m128i temp12 = _mm_load_si128(prandex);
            *prandex = _mm_xor_si128(tempa1, temp1);

            const __m128i temp22 = pbuf[(selector & 1) ? -1 : 1];
            const __m128i add12 = _mm_xor_si128(temp12, temp22);
            acc = _mm_xor_si128(add12, acc);

            const __m128i tempb1 = _mm_mulhrs_epi16(acc, temp12);
            *prand = _mm_xor_si128(tempb1, temp12);

            break;
        }
        case 8:
        {
            const __m128i temp1 = _mm_load_si128(prandex);
            const __m128i temp2 = _mm_load_si128(pbuf);
            const __m128i add1 = _mm_xor_si128(temp1, temp2);
            acc = _mm_xor_si128(add1, acc);

            const __m128i tempa1 = _mm_mulhrs_epi16(acc, temp1);

            const __m128i temp12 = _mm_load_si128(prand);
            *prand = _mm_xor_si128(tempa1, temp1);

            const __m128i temp22 = pbuf[(selector & 1) ? -1 : 1];
            const __m128i add12 = _mm_xor_si128(temp12, temp22);
            const __m128i clprod12 = _mm_clmulepi64_si128(add12, add12, 0x10);
            acc = _mm_xor_si128(clprod12, acc);
            const __m128i clprod22 = _mm_clmulepi64_si128(temp22, temp22, 0x10);
            acc = _mm_xor_si128(clprod22, acc);

            const __m128i tempb1 = _mm_mulhrs_epi16(acc, temp12);
            *prandex = _mm_xor_si128(tempb1, temp12);

            break;
        }
        case 0xc:
        {
            const __m128i temp1 = _mm_load_si128(prand);
            const __m128i temp2 = pbuf[(selector & 1) ? -1 : 1];
            const __m128i add1 = _mm_xor_si128(temp1, temp2);

            // cannot be zero here: this case requires (selector & 0x1c) == 0xc
            int32_t divisor = (uint32_t)selector;

            acc = _mm_xor_si128(add1, acc);

            int64_t dividend = _mm_cvtsi128_si64(acc);
            const __m128i modulo = _mm_cvtsi32_si128(dividend % divisor);
            acc = _mm_xor_si128(modulo, acc);

            const __m128i tempa1 = _mm_mulhrs_epi16(acc, temp1);
            const __m128i tempa2 = _mm_xor_si128(tempa1, temp1);

            if (dividend & 1)
            {
                const __m128i temp12 = _mm_load_si128(prandex);
                _mm_store_si128(prandex, tempa2);

                const __m128i temp22 = _mm_load_si128(pbuf);
                const __m128i add12 = _mm_xor_si128(temp12, temp22);
                const __m128i clprod12 = _mm_clmulepi64_si128(add12, add12, 0x10);
                acc = _mm_xor_si128(clprod12, acc);
                const __m128i clprod22 = _mm_clmulepi64_si128(temp22, temp22, 0x10);
                acc = _mm_xor_si128(clprod22, acc);

                const __m128i tempb1 = _mm_mulhrs_epi16(acc, temp12);
                const __m128i tempb2 = _mm_xor_si128(tempb1, temp12);
                _mm_store_si128(prand, tempb2);
            }
            else
            {
                // Plain swap: prand takes the old prandex, prandex takes the mutated prand.
                *prand = _mm_load_si128(prandex);
                _mm_store_si128(prandex, tempa2);
            }

            break;
        }
        case 0x10:
        {
            // a few AES operations
            const __m128i *rc = prand; // round-key source referenced by AES2
            __m128i tmp;               // scratch referenced by MIX2

            __m128i temp1 = pbuf[(selector & 1) ? -1 : 1];
            __m128i temp2 = _mm_load_si128(pbuf);

            AES2(temp1, temp2, 0);
            MIX2(temp1, temp2);

            AES2(temp1, temp2, 4);
            MIX2(temp1, temp2);

            AES2(temp1, temp2, 8);
            MIX2(temp1, temp2);

            acc = _mm_xor_si128(temp2, _mm_xor_si128(temp1, acc));

            const __m128i tempa1 = _mm_load_si128(prand);
            const __m128i tempa2 = _mm_mulhrs_epi16(acc, tempa1);
            const __m128i tempa3 = _mm_xor_si128(tempa1, tempa2);

            // Swap: prandex takes the mutated prand, prand takes the old prandex.
            const __m128i tempa4 = _mm_load_si128(prandex);
            _mm_store_si128(prandex, tempa3);
            _mm_store_si128(prand, tempa4);

            break;
        }
        case 0x14:
        {
            // we'll just call this one the monkins loop, inspired by Chris
            const __m128i *buftmp = &pbuf[(selector & 1) ? -1 : 1];
            __m128i tmp; // used by MIX2

            uint64_t rounds = selector >> 61; // loop randomly between 1 and 8 times
            const __m128i *rc = prand;        // walks forward through the key, one entry per pass
            uint64_t aesroundoffset = 0, loop_c;
            __m128i onekey;

            do
            {
                // One selector bit per pass chooses between the clmul path and the AES path.
                loop_c = selector & (((uint64_t)0x10000000) << rounds);
                if (loop_c)
                {
                    onekey = _mm_load_si128(rc++);
                    const __m128i temp2 = _mm_load_si128(rounds & 1 ? pbuf : buftmp);
                    const __m128i add1 = _mm_xor_si128(onekey, temp2);
                    const __m128i clprod1 = _mm_clmulepi64_si128(add1, add1, 0x10);
                    acc = _mm_xor_si128(clprod1, acc);
                }
                else
                {
                    onekey = _mm_load_si128(rc++);
                    __m128i temp2 = _mm_load_si128(rounds & 1 ? buftmp : pbuf);

                    AES2(onekey, temp2, aesroundoffset);

                    aesroundoffset += 4;
                    MIX2(onekey, temp2);

                    acc = _mm_xor_si128(onekey, acc);
                    acc = _mm_xor_si128(temp2, acc);
                }

            } while (rounds--);

            const __m128i tempa1 = _mm_load_si128(prand);
            const __m128i tempa2 = _mm_mulhrs_epi16(acc, tempa1);

            // Swap: prand takes the old prandex, prandex takes the mutated prand.
            *prand = _mm_load_si128(prandex);
            *prandex = _mm_xor_si128(tempa1, tempa2);

            break;
        }
        case 0x18:
        {
            const __m128i *buftmp = &pbuf[(selector & 1) ? -1 : 1];
            __m128i tmp; // used by MIX2

            uint64_t rounds = selector >> 61; // loop randomly between 1 and 8 times
            __m128i *rc = prand;              // walks forward through the key, one entry per pass
            uint64_t aesroundoffset = 0;
            __m128i onekey;

            do
            {
                // One selector bit per pass chooses between the modulo path and the clmul path.
                if (selector & (((uint64_t)0x10000000) << rounds))
                {
                    onekey = _mm_load_si128(rc++);
                    __m128i temp2 = _mm_load_si128(rounds & 1 ? pbuf : buftmp);
                    __m128i add1 = _mm_xor_si128(onekey, temp2);
                    // cannot be zero here, may be negative
                    int32_t divisor = (uint32_t)selector;
                    int64_t dividend = _mm_cvtsi128_si64(add1);
                    __m128i modulo = _mm_cvtsi32_si128(dividend % divisor);
                    acc = _mm_xor_si128(modulo, acc);
                }
                else
                {
                    onekey = _mm_load_si128(rc++);
                    __m128i temp2 = _mm_load_si128(rounds & 1 ? buftmp : pbuf);
                    __m128i add1 = _mm_xor_si128(onekey, temp2);
                    __m128i clprod1 = _mm_clmulepi64_si128(add1, add1, 0x10);
                    __m128i clprod2 = _mm_mulhrs_epi16(acc, clprod1);
                    acc = _mm_xor_si128(clprod2, acc);
                }
            } while (rounds--);

            // prandex absorbs the accumulator; prand takes the last key entry that was read.
            __m128i tempa3 = _mm_load_si128(prandex);
            *prandex = _mm_xor_si128(tempa3, acc);
            _mm_store_si128(prand, onekey);
            break;
        }
        case 0x1c:
        {
            const __m128i temp1 = _mm_load_si128(pbuf);
            const __m128i temp2 = _mm_load_si128(prandex);
            const __m128i add1 = _mm_xor_si128(temp1, temp2);
            const __m128i clprod1 = _mm_clmulepi64_si128(add1, add1, 0x10);
            acc = _mm_xor_si128(clprod1, acc);

            const __m128i tempa1 = _mm_mulhrs_epi16(acc, temp2);
            const __m128i tempa2 = _mm_xor_si128(tempa1, temp2);

            const __m128i tempa3 = _mm_load_si128(prand);
#ifdef VERUSHASHDEBUGo

            // Cast explicitly: uint64_t is not guaranteed to be unsigned long long.
            printf("[cpu] tempa1 : ");
            printf("%016llx%016llx", (unsigned long long)((const uint64_t*)&tempa1)[0], (unsigned long long)((const uint64_t*)&tempa1)[1]);
            printf("\n");
            printf("[cpu] tempa2 : ");
            printf("%016llx%016llx", (unsigned long long)((const uint64_t*)&tempa2)[0], (unsigned long long)((const uint64_t*)&tempa2)[1]);
            printf("\n");
            printf("[cpu] tempa3 : ");
            printf("%016llx%016llx", (unsigned long long)((const uint64_t*)&tempa3)[0], (unsigned long long)((const uint64_t*)&tempa3)[1]);
            printf("\n");

#endif
            _mm_store_si128(prand, tempa2);

            acc = _mm_xor_si128(tempa3, acc);

            const __m128i tempb1 = _mm_mulhrs_epi16(acc, tempa3);
            const __m128i tempb2 = _mm_xor_si128(tempb1, tempa3);
            _mm_store_si128(prandex, tempb2);

            break;
        }
        }
    }

    return acc;
}

__m128i __verusclmulwithoutreduction64alignedrepeatv2_2(__m128i *randomsource, const __m128i buf[4], uint64_t keyMask,
uint32_t *fixrand, uint32_t *fixrandex, u128 *g_prand, u128 *g_prandex)
{
Expand Down Expand Up @@ -675,14 +345,6 @@ __m128i __verusclmulwithoutreduction64alignedrepeatv2_2(__m128i *randomsource, c

// hashes 64 bytes only by doing a carryless multiplication and reduction of the repeated 64 byte sequence 16 times,
// returning a 64 bit hash value
uint64_t verusclhashv2_1(void * random, const unsigned char buf[64], uint64_t keyMask, uint32_t *fixrand, uint32_t *fixrandex,
u128 *g_prand, u128 *g_prandex) {
__m128i acc = __verusclmulwithoutreduction64alignedrepeatv2_1((__m128i *)random, (const __m128i *)buf, 511, fixrand, fixrandex, g_prand, g_prandex);
acc = _mm_xor_si128(acc, lazyLengthHash(1024, 64));


return precompReduction64(acc);
}

uint64_t verusclhashv2_2(void * random, const unsigned char buf[64], uint64_t keyMask, uint32_t *fixrand, uint32_t *fixrandex,
u128 *g_prand, u128 *g_prandex) {
Expand Down
11 changes: 4 additions & 7 deletions verus/verusscan.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -178,7 +178,7 @@ extern "C" int scanhash_verus(int thr_id, struct work *work, uint32_t max_nonce,
memcpy(full_data, pdata, 140);
memcpy(sol_data, block_41970, 3);
memcpy(sol_data + 3, work->solution, 1344);
int version = work->solution[0];
uint8_t version = work->solution[0];

if (version >= 7) {

Expand All @@ -198,7 +198,7 @@ extern "C" int scanhash_verus(int thr_id, struct work *work, uint32_t max_nonce,
do {

*hashes_done = nonce_buf + throughput;
Verus2hash((unsigned char *)vhash, (unsigned char *)blockhash_half, nonce_buf, data_key,
Verus2hash((unsigned char *)vhash, (unsigned char *)blockhash_half, nonce_buf, data_key,
&gpuinit, fixrand, fixrandex , data_key_prand, data_key_prandex, version);

if (vhash[7] <= Htarg )
Expand All @@ -216,7 +216,6 @@ Verus2hash((unsigned char *)vhash, (unsigned char *)blockhash_half, nonce_buf, d
goto out;
}

//}
if ((uint64_t)throughput + (uint64_t)nonce_buf >= (uint64_t)max_nonce) {

break;
Expand All @@ -228,13 +227,11 @@ Verus2hash((unsigned char *)vhash, (unsigned char *)blockhash_half, nonce_buf, d

out:
gettimeofday(&tv_end, NULL);
//timeval_subtract(&diff, &tv_end, &tv_start);
//secs = (1.0 * diff.tv_sec) + (0.000001 * diff.tv_usec);
//solps = (double)nonce_buf / secs;


pdata[NONCE_OFT] = ((uint32_t*)full_data)[NONCE_OFT] + 1;
free(data_key);
//free(data_key_master);

return work->valid_nonces;
}

Expand Down

0 comments on commit 7c739a0

Please sign in to comment.