Add optimizations for AVX-capable processors
This commit is contained in:
parent
94cb469bbf
commit
18033f6a04
5 changed files with 848 additions and 381 deletions
25
configure.ac
25
configure.ac
|
@ -23,15 +23,15 @@ AC_CHECK_HEADERS(syslog.h)
|
||||||
AC_FUNC_ALLOCA
|
AC_FUNC_ALLOCA
|
||||||
|
|
||||||
case $target in
|
case $target in
|
||||||
x86_64-*)
|
i*86-*-*)
|
||||||
have_x86_64=true
|
have_x86=true
|
||||||
;;
|
;;
|
||||||
*)
|
x86_64-*-*)
|
||||||
have_x86_64=false
|
have_x86=true
|
||||||
|
have_x86_64=true
|
||||||
;;
|
;;
|
||||||
esac
|
esac
|
||||||
|
|
||||||
have_win32=false
|
|
||||||
PTHREAD_FLAGS="-pthread"
|
PTHREAD_FLAGS="-pthread"
|
||||||
WS2_LIBS=""
|
WS2_LIBS=""
|
||||||
|
|
||||||
|
@ -43,6 +43,18 @@ case $target in
|
||||||
;;
|
;;
|
||||||
esac
|
esac
|
||||||
|
|
||||||
|
if test x$have_x86 = xtrue
|
||||||
|
then
|
||||||
|
AC_MSG_CHECKING(whether we can compile AVX code)
|
||||||
|
AC_COMPILE_IFELSE([AC_LANG_PROGRAM(,[asm ("vmovdqa %ymm0, %ymm1");])],
|
||||||
|
AC_DEFINE(USE_AVX, 1, [Define to 1 if AVX assembly is available.])
|
||||||
|
AC_MSG_RESULT(yes)
|
||||||
|
,
|
||||||
|
AC_MSG_RESULT(no)
|
||||||
|
AC_MSG_WARN([The assembler does not support the AVX instruction set.])
|
||||||
|
)
|
||||||
|
fi
|
||||||
|
|
||||||
AC_CHECK_LIB(jansson, json_loads, request_jansson=false, request_jansson=true)
|
AC_CHECK_LIB(jansson, json_loads, request_jansson=false, request_jansson=true)
|
||||||
AC_CHECK_LIB([pthread], [pthread_create], PTHREAD_LIBS="-lpthread",
|
AC_CHECK_LIB([pthread], [pthread_create], PTHREAD_LIBS="-lpthread",
|
||||||
AC_CHECK_LIB([pthreadGC2], [pthread_create], PTHREAD_LIBS="-lpthreadGC2",
|
AC_CHECK_LIB([pthreadGC2], [pthread_create], PTHREAD_LIBS="-lpthreadGC2",
|
||||||
|
@ -52,7 +64,8 @@ AC_CHECK_LIB([pthread], [pthread_create], PTHREAD_LIBS="-lpthread",
|
||||||
|
|
||||||
AM_CONDITIONAL([WANT_JANSSON], [test x$request_jansson = xtrue])
|
AM_CONDITIONAL([WANT_JANSSON], [test x$request_jansson = xtrue])
|
||||||
AM_CONDITIONAL([HAVE_WINDOWS], [test x$have_win32 = xtrue])
|
AM_CONDITIONAL([HAVE_WINDOWS], [test x$have_win32 = xtrue])
|
||||||
AM_CONDITIONAL([HAVE_x86_64], [test x$have_x86_64 = xtrue])
|
AM_CONDITIONAL([ARCH_x86], [test x$have_x86 = xtrue])
|
||||||
|
AM_CONDITIONAL([ARCH_x86_64], [test x$have_x86_64 = xtrue])
|
||||||
|
|
||||||
if test x$request_jansson = xtrue
|
if test x$request_jansson = xtrue
|
||||||
then
|
then
|
||||||
|
|
|
@ -508,7 +508,7 @@ static void *miner_thread(void *userdata)
|
||||||
unsigned long hashes_done;
|
unsigned long hashes_done;
|
||||||
struct timeval tv_start, tv_end, diff;
|
struct timeval tv_start, tv_end, diff;
|
||||||
int64_t max64;
|
int64_t max64;
|
||||||
bool rc;
|
int rc;
|
||||||
|
|
||||||
/* obtain new work from internal workio thread */
|
/* obtain new work from internal workio thread */
|
||||||
pthread_mutex_lock(&g_work_lock);
|
pthread_mutex_lock(&g_work_lock);
|
||||||
|
|
963
scrypt-x64.S
963
scrypt-x64.S
File diff suppressed because it is too large
Load diff
60
scrypt-x86.S
60
scrypt-x86.S
|
@ -24,6 +24,8 @@
|
||||||
* SUCH DAMAGE.
|
* SUCH DAMAGE.
|
||||||
*/
|
*/
|
||||||
|
|
||||||
|
#include "cpuminer-config.h"
|
||||||
|
|
||||||
#if defined(__linux__) && defined(__ELF__)
|
#if defined(__linux__) && defined(__ELF__)
|
||||||
.section .note.GNU-stack,"",%progbits
|
.section .note.GNU-stack,"",%progbits
|
||||||
#endif
|
#endif
|
||||||
|
@ -65,7 +67,7 @@
|
||||||
movl %edx, \do+56(\dest)
|
movl %edx, \do+56(\dest)
|
||||||
.endm
|
.endm
|
||||||
|
|
||||||
.macro gen_salsa8_core_quadround
|
.macro salsa8_core_gen_quadround
|
||||||
movl 52(%esp), %ecx
|
movl 52(%esp), %ecx
|
||||||
movl 4(%esp), %edx
|
movl 4(%esp), %edx
|
||||||
movl 20(%esp), %ebx
|
movl 20(%esp), %ebx
|
||||||
|
@ -387,9 +389,9 @@
|
||||||
|
|
||||||
.text
|
.text
|
||||||
.p2align 5
|
.p2align 5
|
||||||
gen_salsa8_core:
|
salsa8_core_gen:
|
||||||
gen_salsa8_core_quadround
|
salsa8_core_gen_quadround
|
||||||
gen_salsa8_core_quadround
|
salsa8_core_gen_quadround
|
||||||
ret
|
ret
|
||||||
|
|
||||||
|
|
||||||
|
@ -408,9 +410,9 @@ _scrypt_core:
|
||||||
movl $1, %eax
|
movl $1, %eax
|
||||||
cpuid
|
cpuid
|
||||||
andl $0x04000000, %edx
|
andl $0x04000000, %edx
|
||||||
jnz xmm_scrypt_core
|
jnz scrypt_core_sse2
|
||||||
|
|
||||||
gen_scrypt_core:
|
scrypt_core_gen:
|
||||||
movl 20(%esp), %edi
|
movl 20(%esp), %edi
|
||||||
movl 24(%esp), %esi
|
movl 24(%esp), %esi
|
||||||
subl $72, %esp
|
subl $72, %esp
|
||||||
|
@ -452,7 +454,7 @@ gen_scrypt_core:
|
||||||
.endm
|
.endm
|
||||||
|
|
||||||
leal 131072(%esi), %ecx
|
leal 131072(%esi), %ecx
|
||||||
gen_scrypt_core_loop1:
|
scrypt_core_gen_loop1:
|
||||||
movl %esi, 64(%esp)
|
movl %esi, 64(%esp)
|
||||||
movl %ecx, 68(%esp)
|
movl %ecx, 68(%esp)
|
||||||
|
|
||||||
|
@ -473,7 +475,7 @@ gen_scrypt_core_loop1:
|
||||||
scrypt_core_macro1a 56, 120
|
scrypt_core_macro1a 56, 120
|
||||||
scrypt_core_macro1a 60, 124
|
scrypt_core_macro1a 60, 124
|
||||||
|
|
||||||
call gen_salsa8_core
|
call salsa8_core_gen
|
||||||
|
|
||||||
movl 92(%esp), %edi
|
movl 92(%esp), %edi
|
||||||
scrypt_core_macro2 0, 64
|
scrypt_core_macro2 0, 64
|
||||||
|
@ -493,7 +495,7 @@ gen_scrypt_core_loop1:
|
||||||
scrypt_core_macro2 56, 120
|
scrypt_core_macro2 56, 120
|
||||||
scrypt_core_macro2 60, 124
|
scrypt_core_macro2 60, 124
|
||||||
|
|
||||||
call gen_salsa8_core
|
call salsa8_core_gen
|
||||||
|
|
||||||
movl 92(%esp), %edi
|
movl 92(%esp), %edi
|
||||||
scrypt_core_macro3 0, 64
|
scrypt_core_macro3 0, 64
|
||||||
|
@ -517,11 +519,11 @@ gen_scrypt_core_loop1:
|
||||||
movl 68(%esp), %ecx
|
movl 68(%esp), %ecx
|
||||||
addl $128, %esi
|
addl $128, %esi
|
||||||
cmpl %ecx, %esi
|
cmpl %ecx, %esi
|
||||||
jne gen_scrypt_core_loop1
|
jne scrypt_core_gen_loop1
|
||||||
|
|
||||||
movl 96(%esp), %esi
|
movl 96(%esp), %esi
|
||||||
movl $1024, %ecx
|
movl $1024, %ecx
|
||||||
gen_scrypt_core_loop2:
|
scrypt_core_gen_loop2:
|
||||||
movl %ecx, 68(%esp)
|
movl %ecx, 68(%esp)
|
||||||
|
|
||||||
movl 64(%edi), %edx
|
movl 64(%edi), %edx
|
||||||
|
@ -545,7 +547,7 @@ gen_scrypt_core_loop2:
|
||||||
scrypt_core_macro1b 56, 120
|
scrypt_core_macro1b 56, 120
|
||||||
scrypt_core_macro1b 60, 124
|
scrypt_core_macro1b 60, 124
|
||||||
|
|
||||||
call gen_salsa8_core
|
call salsa8_core_gen
|
||||||
|
|
||||||
movl 92(%esp), %edi
|
movl 92(%esp), %edi
|
||||||
scrypt_core_macro2 0, 64
|
scrypt_core_macro2 0, 64
|
||||||
|
@ -565,7 +567,7 @@ gen_scrypt_core_loop2:
|
||||||
scrypt_core_macro2 56, 120
|
scrypt_core_macro2 56, 120
|
||||||
scrypt_core_macro2 60, 124
|
scrypt_core_macro2 60, 124
|
||||||
|
|
||||||
call gen_salsa8_core
|
call salsa8_core_gen
|
||||||
|
|
||||||
movl 92(%esp), %edi
|
movl 92(%esp), %edi
|
||||||
movl 96(%esp), %esi
|
movl 96(%esp), %esi
|
||||||
|
@ -588,7 +590,7 @@ gen_scrypt_core_loop2:
|
||||||
|
|
||||||
movl 68(%esp), %ecx
|
movl 68(%esp), %ecx
|
||||||
subl $1, %ecx
|
subl $1, %ecx
|
||||||
ja gen_scrypt_core_loop2
|
ja scrypt_core_gen_loop2
|
||||||
|
|
||||||
addl $72, %esp
|
addl $72, %esp
|
||||||
popl %esi
|
popl %esi
|
||||||
|
@ -598,7 +600,7 @@ gen_scrypt_core_loop2:
|
||||||
ret
|
ret
|
||||||
|
|
||||||
|
|
||||||
.macro xmm_salsa8_core_doubleround
|
.macro salsa8_core_sse2_doubleround
|
||||||
movdqa %xmm1, %xmm4
|
movdqa %xmm1, %xmm4
|
||||||
paddd %xmm0, %xmm4
|
paddd %xmm0, %xmm4
|
||||||
movdqa %xmm4, %xmm5
|
movdqa %xmm4, %xmm5
|
||||||
|
@ -670,15 +672,15 @@ gen_scrypt_core_loop2:
|
||||||
pxor %xmm5, %xmm0
|
pxor %xmm5, %xmm0
|
||||||
.endm
|
.endm
|
||||||
|
|
||||||
.macro xmm_salsa8_core
|
.macro salsa8_core_sse2
|
||||||
xmm_salsa8_core_doubleround
|
salsa8_core_sse2_doubleround
|
||||||
xmm_salsa8_core_doubleround
|
salsa8_core_sse2_doubleround
|
||||||
xmm_salsa8_core_doubleround
|
salsa8_core_sse2_doubleround
|
||||||
xmm_salsa8_core_doubleround
|
salsa8_core_sse2_doubleround
|
||||||
.endm
|
.endm
|
||||||
|
|
||||||
.p2align 5
|
.p2align 5
|
||||||
xmm_scrypt_core:
|
scrypt_core_sse2:
|
||||||
movl 20(%esp), %edi
|
movl 20(%esp), %edi
|
||||||
movl 24(%esp), %esi
|
movl 24(%esp), %esi
|
||||||
movl %esp, %ebp
|
movl %esp, %ebp
|
||||||
|
@ -693,7 +695,7 @@ xmm_scrypt_core:
|
||||||
|
|
||||||
movl %esi, %edx
|
movl %esi, %edx
|
||||||
leal 131072(%esi), %ecx
|
leal 131072(%esi), %ecx
|
||||||
xmm_scrypt_core_loop1:
|
scrypt_core_sse2_loop1:
|
||||||
movdqa 0(%esp), %xmm0
|
movdqa 0(%esp), %xmm0
|
||||||
movdqa 16(%esp), %xmm1
|
movdqa 16(%esp), %xmm1
|
||||||
movdqa 32(%esp), %xmm2
|
movdqa 32(%esp), %xmm2
|
||||||
|
@ -713,7 +715,7 @@ xmm_scrypt_core_loop1:
|
||||||
movdqa %xmm6, 96(%edx)
|
movdqa %xmm6, 96(%edx)
|
||||||
movdqa %xmm7, 112(%edx)
|
movdqa %xmm7, 112(%edx)
|
||||||
|
|
||||||
xmm_salsa8_core
|
salsa8_core_sse2
|
||||||
paddd 0(%edx), %xmm0
|
paddd 0(%edx), %xmm0
|
||||||
paddd 16(%edx), %xmm1
|
paddd 16(%edx), %xmm1
|
||||||
paddd 32(%edx), %xmm2
|
paddd 32(%edx), %xmm2
|
||||||
|
@ -731,7 +733,7 @@ xmm_scrypt_core_loop1:
|
||||||
movdqa %xmm1, 80(%esp)
|
movdqa %xmm1, 80(%esp)
|
||||||
movdqa %xmm2, %xmm6
|
movdqa %xmm2, %xmm6
|
||||||
movdqa %xmm3, %xmm7
|
movdqa %xmm3, %xmm7
|
||||||
xmm_salsa8_core
|
salsa8_core_sse2
|
||||||
paddd 64(%esp), %xmm0
|
paddd 64(%esp), %xmm0
|
||||||
paddd 80(%esp), %xmm1
|
paddd 80(%esp), %xmm1
|
||||||
paddd %xmm2, %xmm6
|
paddd %xmm2, %xmm6
|
||||||
|
@ -741,13 +743,13 @@ xmm_scrypt_core_loop1:
|
||||||
|
|
||||||
addl $128, %edx
|
addl $128, %edx
|
||||||
cmpl %ecx, %edx
|
cmpl %ecx, %edx
|
||||||
jne xmm_scrypt_core_loop1
|
jne scrypt_core_sse2_loop1
|
||||||
|
|
||||||
movdqa 64(%esp), %xmm4
|
movdqa 64(%esp), %xmm4
|
||||||
movdqa 80(%esp), %xmm5
|
movdqa 80(%esp), %xmm5
|
||||||
|
|
||||||
movl $1024, %ecx
|
movl $1024, %ecx
|
||||||
xmm_scrypt_core_loop2:
|
scrypt_core_sse2_loop2:
|
||||||
movdqa 0(%esp), %xmm0
|
movdqa 0(%esp), %xmm0
|
||||||
movdqa 16(%esp), %xmm1
|
movdqa 16(%esp), %xmm1
|
||||||
movdqa 32(%esp), %xmm2
|
movdqa 32(%esp), %xmm2
|
||||||
|
@ -768,7 +770,7 @@ xmm_scrypt_core_loop2:
|
||||||
movdqa %xmm1, 16(%esp)
|
movdqa %xmm1, 16(%esp)
|
||||||
movdqa %xmm2, 32(%esp)
|
movdqa %xmm2, 32(%esp)
|
||||||
movdqa %xmm3, 48(%esp)
|
movdqa %xmm3, 48(%esp)
|
||||||
xmm_salsa8_core
|
salsa8_core_sse2
|
||||||
paddd 0(%esp), %xmm0
|
paddd 0(%esp), %xmm0
|
||||||
paddd 16(%esp), %xmm1
|
paddd 16(%esp), %xmm1
|
||||||
paddd 32(%esp), %xmm2
|
paddd 32(%esp), %xmm2
|
||||||
|
@ -790,7 +792,7 @@ xmm_scrypt_core_loop2:
|
||||||
movdqa %xmm1, 80(%esp)
|
movdqa %xmm1, 80(%esp)
|
||||||
movdqa %xmm2, %xmm6
|
movdqa %xmm2, %xmm6
|
||||||
movdqa %xmm3, %xmm7
|
movdqa %xmm3, %xmm7
|
||||||
xmm_salsa8_core
|
salsa8_core_sse2
|
||||||
paddd 64(%esp), %xmm0
|
paddd 64(%esp), %xmm0
|
||||||
paddd 80(%esp), %xmm1
|
paddd 80(%esp), %xmm1
|
||||||
paddd %xmm2, %xmm6
|
paddd %xmm2, %xmm6
|
||||||
|
@ -801,7 +803,7 @@ xmm_scrypt_core_loop2:
|
||||||
movdqa %xmm1, 80(%esp)
|
movdqa %xmm1, 80(%esp)
|
||||||
|
|
||||||
subl $1, %ecx
|
subl $1, %ecx
|
||||||
ja xmm_scrypt_core_loop2
|
ja scrypt_core_sse2_loop2
|
||||||
|
|
||||||
movdqa %xmm6, 96(%esp)
|
movdqa %xmm6, 96(%esp)
|
||||||
movdqa %xmm7, 112(%esp)
|
movdqa %xmm7, 112(%esp)
|
||||||
|
|
179
scrypt.c
179
scrypt.c
|
@ -403,24 +403,18 @@ static inline void PBKDF2_SHA256_128_32_4way(uint32_t *tstate,
|
||||||
|
|
||||||
#if defined(__x86_64__)
|
#if defined(__x86_64__)
|
||||||
|
|
||||||
#define SCRYPT_3WAY
|
#define SCRYPT_MAX_WAYS 3
|
||||||
#define SCRYPT_BUFFER_SIZE (3 * 131072 + 63)
|
|
||||||
|
|
||||||
int scrypt_best_throughput();
|
int scrypt_best_throughput();
|
||||||
void scrypt_core(uint32_t *X, uint32_t *V);
|
void scrypt_core(uint32_t *X, uint32_t *V);
|
||||||
void scrypt_core_2way(uint32_t *X, uint32_t *Y, uint32_t *V);
|
void scrypt_core_2way(uint32_t *X, uint32_t *V);
|
||||||
void scrypt_core_3way(uint32_t *X, uint32_t *Y, uint32_t *Z, uint32_t *V);
|
void scrypt_core_3way(uint32_t *X, uint32_t *V);
|
||||||
|
|
||||||
#elif defined(__i386__)
|
#elif defined(__i386__)
|
||||||
|
|
||||||
#define SCRYPT_BUFFER_SIZE (131072 + 63)
|
|
||||||
|
|
||||||
void scrypt_core(uint32_t *X, uint32_t *V);
|
void scrypt_core(uint32_t *X, uint32_t *V);
|
||||||
|
|
||||||
#else
|
#else
|
||||||
|
|
||||||
#define SCRYPT_BUFFER_SIZE (131072 + 63)
|
|
||||||
|
|
||||||
static inline void salsa20_8(uint32_t B[16], const uint32_t Bx[16])
|
static inline void salsa20_8(uint32_t B[16], const uint32_t Bx[16])
|
||||||
{
|
{
|
||||||
uint32_t x00,x01,x02,x03,x04,x05,x06,x07,x08,x09,x10,x11,x12,x13,x14,x15;
|
uint32_t x00,x01,x02,x03,x04,x05,x06,x07,x08,x09,x10,x11,x12,x13,x14,x15;
|
||||||
|
@ -512,6 +506,13 @@ static inline void scrypt_core(uint32_t *X, uint32_t *V)
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
#ifndef SCRYPT_MAX_WAYS
|
||||||
|
#define SCRYPT_MAX_WAYS 1
|
||||||
|
#define scrypt_best_throughput() 1
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#define SCRYPT_BUFFER_SIZE (SCRYPT_MAX_WAYS * 131072 + 63)
|
||||||
|
|
||||||
unsigned char *scrypt_buffer_alloc()
|
unsigned char *scrypt_buffer_alloc()
|
||||||
{
|
{
|
||||||
return malloc(SCRYPT_BUFFER_SIZE);
|
return malloc(SCRYPT_BUFFER_SIZE);
|
||||||
|
@ -533,38 +534,35 @@ static void scrypt_1024_1_1_256_sp(const uint32_t *input, uint32_t *output,
|
||||||
return PBKDF2_SHA256_128_32(tstate, ostate, X, output);
|
return PBKDF2_SHA256_128_32(tstate, ostate, X, output);
|
||||||
}
|
}
|
||||||
|
|
||||||
#ifdef SCRYPT_3WAY
|
#if SCRYPT_MAX_WAYS >= 2
|
||||||
|
static void scrypt_1024_1_1_256_sp_2way(const uint32_t *input,
|
||||||
static void scrypt_1024_1_1_256_sp_2way(const uint32_t *input1,
|
uint32_t *output, unsigned char *scratchpad)
|
||||||
const uint32_t *input2, uint32_t *output1, uint32_t *output2,
|
|
||||||
unsigned char *scratchpad)
|
|
||||||
{
|
{
|
||||||
uint32_t tstate1[8], tstate2[8];
|
uint32_t tstate1[8], tstate2[8];
|
||||||
uint32_t ostate1[8], ostate2[8];
|
uint32_t ostate1[8], ostate2[8];
|
||||||
uint32_t *V;
|
uint32_t *V;
|
||||||
uint32_t X[32], Y[32];
|
uint32_t X[2 * 32], *Y = X + 32;
|
||||||
|
|
||||||
V = (uint32_t *)(((uintptr_t)(scratchpad) + 63) & ~ (uintptr_t)(63));
|
V = (uint32_t *)(((uintptr_t)(scratchpad) + 63) & ~ (uintptr_t)(63));
|
||||||
|
|
||||||
HMAC_SHA256_80_init(input1, tstate1, ostate1);
|
HMAC_SHA256_80_init(input, tstate1, ostate1);
|
||||||
HMAC_SHA256_80_init(input2, tstate2, ostate2);
|
HMAC_SHA256_80_init(input + 20, tstate2, ostate2);
|
||||||
PBKDF2_SHA256_80_128(tstate1, ostate1, input1, X);
|
PBKDF2_SHA256_80_128(tstate1, ostate1, input, X);
|
||||||
PBKDF2_SHA256_80_128(tstate2, ostate2, input2, Y);
|
PBKDF2_SHA256_80_128(tstate2, ostate2, input + 20, Y);
|
||||||
|
|
||||||
scrypt_core_2way(X, Y, V);
|
scrypt_core_2way(X, V);
|
||||||
|
|
||||||
PBKDF2_SHA256_128_32(tstate1, ostate1, X, output1);
|
PBKDF2_SHA256_128_32(tstate1, ostate1, X, output);
|
||||||
PBKDF2_SHA256_128_32(tstate2, ostate2, Y, output2);
|
PBKDF2_SHA256_128_32(tstate2, ostate2, Y, output + 8);
|
||||||
}
|
}
|
||||||
|
#endif /* SCRYPT_MAX_WAYS >= 2 */
|
||||||
|
|
||||||
static void scrypt_1024_1_1_256_sp_3way(
|
#if SCRYPT_MAX_WAYS >= 3
|
||||||
const uint32_t *input1, const uint32_t *input2, const uint32_t *input3,
|
static void scrypt_1024_1_1_256_sp_3way(const uint32_t *input,
|
||||||
uint32_t *output1, uint32_t *output2, uint32_t *output3,
|
uint32_t *output, unsigned char *scratchpad)
|
||||||
unsigned char *scratchpad)
|
|
||||||
{
|
{
|
||||||
#ifdef SHA256_4WAY
|
|
||||||
uint32_t tstate[4 * 8], ostate[4 * 8];
|
uint32_t tstate[4 * 8], ostate[4 * 8];
|
||||||
uint32_t input[4 * 20], output[4 * 32];
|
uint32_t X[3 * 32];
|
||||||
uint32_t X[32], Y[32], Z[32];
|
|
||||||
uint32_t W[4 * 32];
|
uint32_t W[4 * 32];
|
||||||
uint32_t *V;
|
uint32_t *V;
|
||||||
int i;
|
int i;
|
||||||
|
@ -572,53 +570,31 @@ static void scrypt_1024_1_1_256_sp_3way(
|
||||||
V = (uint32_t *)(((uintptr_t)(scratchpad) + 63) & ~ (uintptr_t)(63));
|
V = (uint32_t *)(((uintptr_t)(scratchpad) + 63) & ~ (uintptr_t)(63));
|
||||||
|
|
||||||
for (i = 0; i < 20; i++) {
|
for (i = 0; i < 20; i++) {
|
||||||
input[4 * i + 0] = input1[i];
|
W[4 * i + 0] = input[i];
|
||||||
input[4 * i + 1] = input2[i];
|
W[4 * i + 1] = input[i + 20];
|
||||||
input[4 * i + 2] = input3[i];
|
W[4 * i + 2] = input[i + 40];
|
||||||
}
|
}
|
||||||
HMAC_SHA256_80_init_4way(input, tstate, ostate);
|
HMAC_SHA256_80_init_4way(W, tstate, ostate);
|
||||||
PBKDF2_SHA256_80_128_4way(tstate, ostate, input, W);
|
PBKDF2_SHA256_80_128_4way(tstate, ostate, W, W);
|
||||||
for (i = 0; i < 32; i++) {
|
for (i = 0; i < 32; i++) {
|
||||||
X[i] = W[4 * i + 0];
|
X[0 * 32 + i] = W[4 * i + 0];
|
||||||
Y[i] = W[4 * i + 1];
|
X[1 * 32 + i] = W[4 * i + 1];
|
||||||
Z[i] = W[4 * i + 2];
|
X[2 * 32 + i] = W[4 * i + 2];
|
||||||
}
|
}
|
||||||
scrypt_core_3way(X, Y, Z, V);
|
scrypt_core_3way(X, V);
|
||||||
for (i = 0; i < 32; i++) {
|
for (i = 0; i < 32; i++) {
|
||||||
W[4 * i + 0] = X[i];
|
W[4 * i + 0] = X[0 * 32 + i];
|
||||||
W[4 * i + 1] = Y[i];
|
W[4 * i + 1] = X[1 * 32 + i];
|
||||||
W[4 * i + 2] = Z[i];
|
W[4 * i + 2] = X[2 * 32 + i];
|
||||||
}
|
}
|
||||||
PBKDF2_SHA256_128_32_4way(tstate, ostate, W, output);
|
PBKDF2_SHA256_128_32_4way(tstate, ostate, W, W);
|
||||||
for (i = 0; i < 8; i++) {
|
for (i = 0; i < 8; i++) {
|
||||||
output1[i] = output[4 * i + 0];
|
output[i] = W[4 * i + 0];
|
||||||
output2[i] = output[4 * i + 1];
|
output[i + 8] = W[4 * i + 1];
|
||||||
output3[i] = output[4 * i + 2];
|
output[i + 16] = W[4 * i + 2];
|
||||||
}
|
}
|
||||||
#else
|
|
||||||
uint32_t tstate1[8], tstate2[8], tstate3[8];
|
|
||||||
uint32_t ostate1[8], ostate2[8], ostate3[8];
|
|
||||||
uint32_t X[32], Y[32], Z[32];
|
|
||||||
uint32_t *V;
|
|
||||||
|
|
||||||
V = (uint32_t *)(((uintptr_t)(scratchpad) + 63) & ~ (uintptr_t)(63));
|
|
||||||
|
|
||||||
HMAC_SHA256_80_init(input1, tstate1, ostate1);
|
|
||||||
HMAC_SHA256_80_init(input2, tstate2, ostate2);
|
|
||||||
HMAC_SHA256_80_init(input3, tstate3, ostate3);
|
|
||||||
PBKDF2_SHA256_80_128(tstate1, ostate1, input1, X);
|
|
||||||
PBKDF2_SHA256_80_128(tstate2, ostate2, input2, Y);
|
|
||||||
PBKDF2_SHA256_80_128(tstate3, ostate3, input3, Z);
|
|
||||||
|
|
||||||
scrypt_core_3way(X, Y, Z, V);
|
|
||||||
|
|
||||||
PBKDF2_SHA256_128_32(tstate1, ostate1, X, output1);
|
|
||||||
PBKDF2_SHA256_128_32(tstate2, ostate2, Y, output2);
|
|
||||||
PBKDF2_SHA256_128_32(tstate3, ostate3, Z, output3);
|
|
||||||
#endif /* SHA256_4WAY*/
|
|
||||||
}
|
}
|
||||||
|
#endif /* SCRYPT_MAX_WAYS >= 3 */
|
||||||
#endif /* SCRYPT_3WAY */
|
|
||||||
|
|
||||||
__attribute__ ((noinline)) static int confirm_hash(const uint32_t *hash,
|
__attribute__ ((noinline)) static int confirm_hash(const uint32_t *hash,
|
||||||
const uint32_t *target)
|
const uint32_t *target)
|
||||||
|
@ -638,63 +614,46 @@ int scanhash_scrypt(int thr_id, unsigned char *pdata,
|
||||||
unsigned char *scratchbuf, const unsigned char *ptarget,
|
unsigned char *scratchbuf, const unsigned char *ptarget,
|
||||||
uint32_t max_nonce, uint32_t *next_nonce, unsigned long *hashes_done)
|
uint32_t max_nonce, uint32_t *next_nonce, unsigned long *hashes_done)
|
||||||
{
|
{
|
||||||
uint32_t data[20], hash[8];
|
uint32_t data[SCRYPT_MAX_WAYS * 20], hash[SCRYPT_MAX_WAYS * 8];
|
||||||
#ifdef SCRYPT_3WAY
|
|
||||||
uint32_t data2[20], hash2[8];
|
|
||||||
uint32_t data3[20], hash3[8];
|
|
||||||
int throughput;
|
|
||||||
#endif
|
|
||||||
unsigned long first_nonce = *next_nonce;
|
unsigned long first_nonce = *next_nonce;
|
||||||
uint32_t n = *next_nonce;
|
uint32_t n = *next_nonce;
|
||||||
uint32_t Htarg = le32dec(&((const uint32_t *)ptarget)[7]);
|
uint32_t Htarg = le32dec(&((const uint32_t *)ptarget)[7]);
|
||||||
|
const int throughput = scrypt_best_throughput();
|
||||||
int i;
|
int i;
|
||||||
|
|
||||||
for (i = 0; i < 19; i++)
|
for (i = 0; i < 19; i++)
|
||||||
data[i] = be32dec(&((const uint32_t *)pdata)[i]);
|
data[i] = be32dec(&((const uint32_t *)pdata)[i]);
|
||||||
#ifdef SCRYPT_3WAY
|
for (i = 1; i < throughput; i++)
|
||||||
memcpy(data2, data, 80);
|
memcpy(data + i * 20, data, 80);
|
||||||
memcpy(data3, data, 80);
|
|
||||||
throughput = scrypt_best_throughput();
|
|
||||||
#endif
|
|
||||||
|
|
||||||
do {
|
do {
|
||||||
data[19] = n++;
|
for (i = 0; i < throughput; i++)
|
||||||
#ifdef SCRYPT_3WAY
|
data[i * 20 + 19] = n++;
|
||||||
if (throughput >= 2 && n <= max_nonce) {
|
|
||||||
data2[19] = n++;
|
#if SCRYPT_MAX_WAYS >= 3
|
||||||
if (throughput >= 3 && n <= max_nonce) {
|
if (throughput == 3)
|
||||||
data3[19] = n++;
|
scrypt_1024_1_1_256_sp_3way(data, hash, scratchbuf);
|
||||||
scrypt_1024_1_1_256_sp_3way(data, data2, data3, hash, hash2, hash3, scratchbuf);
|
else
|
||||||
if (hash3[7] <= Htarg && confirm_hash(hash3, (uint32_t *)ptarget)) {
|
|
||||||
be32enc(&((uint32_t *)pdata)[19], data3[19]);
|
|
||||||
*next_nonce = n;
|
|
||||||
*hashes_done = n - first_nonce;
|
|
||||||
return true;
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
scrypt_1024_1_1_256_sp_2way(data, data2, hash, hash2, scratchbuf);
|
|
||||||
}
|
|
||||||
if (hash2[7] <= Htarg && confirm_hash(hash2, (uint32_t *)ptarget)) {
|
|
||||||
be32enc(&((uint32_t *)pdata)[19], data2[19]);
|
|
||||||
*next_nonce = n;
|
|
||||||
*hashes_done = n - first_nonce;
|
|
||||||
return true;
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
scrypt_1024_1_1_256_sp(data, hash, scratchbuf);
|
|
||||||
}
|
|
||||||
#else
|
|
||||||
scrypt_1024_1_1_256_sp(data, hash, scratchbuf);
|
|
||||||
#endif
|
#endif
|
||||||
if (hash[7] <= Htarg && confirm_hash(hash, (uint32_t *)ptarget)) {
|
#if SCRYPT_MAX_WAYS >= 2
|
||||||
be32enc(&((uint32_t *)pdata)[19], data[19]);
|
if (throughput == 2)
|
||||||
|
scrypt_1024_1_1_256_sp_2way(data, hash, scratchbuf);
|
||||||
|
else
|
||||||
|
#endif
|
||||||
|
scrypt_1024_1_1_256_sp(data, hash, scratchbuf);
|
||||||
|
|
||||||
|
for (i = 0; i < throughput; i++) {
|
||||||
|
if (unlikely(hash[i * 8 + 7] <= Htarg)
|
||||||
|
&& likely(confirm_hash(hash + i * 8, (uint32_t *)ptarget))) {
|
||||||
|
be32enc(&((uint32_t *)pdata)[19], data[i * 20 + 19]);
|
||||||
*next_nonce = n;
|
*next_nonce = n;
|
||||||
*hashes_done = n - first_nonce;
|
*hashes_done = n - first_nonce;
|
||||||
return true;
|
return 1;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
} while (n <= max_nonce && !work_restart[thr_id].restart);
|
} while (n <= max_nonce && !work_restart[thr_id].restart);
|
||||||
|
|
||||||
*next_nonce = n;
|
*next_nonce = n;
|
||||||
*hashes_done = n - first_nonce;
|
*hashes_done = n - first_nonce;
|
||||||
return false;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
Loading…
Reference in a new issue