Add optimizations for AVX-capable processors

pooler 2012-03-05 19:32:52 +01:00
parent 94cb469bbf
commit 18033f6a04
5 changed files with 848 additions and 381 deletions

configure.ac

@@ -23,15 +23,15 @@ AC_CHECK_HEADERS(syslog.h)
 AC_FUNC_ALLOCA
 
 case $target in
-  x86_64-*)
-    have_x86_64=true
+  i*86-*-*)
+    have_x86=true
     ;;
-  *)
-    have_x86_64=false
+  x86_64-*-*)
+    have_x86=true
+    have_x86_64=true
     ;;
 esac
 
 have_win32=false
 PTHREAD_FLAGS="-pthread"
 WS2_LIBS=""
@@ -43,6 +43,18 @@ case $target in
     ;;
 esac
 
+if test x$have_x86 = xtrue
+then
+  AC_MSG_CHECKING(whether we can compile AVX code)
+  AC_COMPILE_IFELSE([AC_LANG_PROGRAM(,[asm ("vmovdqa %ymm0, %ymm1");])],
+    AC_DEFINE(USE_AVX, 1, [Define to 1 if AVX assembly is available.])
+    AC_MSG_RESULT(yes)
+  ,
+    AC_MSG_RESULT(no)
+    AC_MSG_WARN([The assembler does not support the AVX instruction set.])
+  )
+fi
+
 AC_CHECK_LIB(jansson, json_loads, request_jansson=false, request_jansson=true)
 AC_CHECK_LIB([pthread], [pthread_create], PTHREAD_LIBS="-lpthread",
   AC_CHECK_LIB([pthreadGC2], [pthread_create], PTHREAD_LIBS="-lpthreadGC2",
@@ -52,7 +64,8 @@ AC_CHECK_LIB([pthread], [pthread_create], PTHREAD_LIBS="-lpthread",
 AM_CONDITIONAL([WANT_JANSSON], [test x$request_jansson = xtrue])
 AM_CONDITIONAL([HAVE_WINDOWS], [test x$have_win32 = xtrue])
-AM_CONDITIONAL([HAVE_x86_64], [test x$have_x86_64 = xtrue])
+AM_CONDITIONAL([ARCH_x86], [test x$have_x86 = xtrue])
+AM_CONDITIONAL([ARCH_x86_64], [test x$have_x86_64 = xtrue])
 
 if test x$request_jansson = xtrue
 then
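Note that the configure probe above only verifies that the assembler accepts an AVX instruction (vmovdqa on %ymm registers); it says nothing about the CPU the binary eventually runs on. For context, a minimal run-time AVX check could be written with GCC's <cpuid.h> like this (illustrative sketch, not part of this commit):

#include <cpuid.h>
#include <stdio.h>

static int cpu_has_avx(void)
{
	unsigned int eax, ebx, ecx, edx;
	if (!__get_cpuid(1, &eax, &ebx, &ecx, &edx))
		return 0;
	/* ECX bit 28: AVX; bit 27: OSXSAVE, i.e. the OS saves YMM state.
	 * A complete check would also inspect XCR0 via xgetbv. */
	return (ecx & (1u << 28)) && (ecx & (1u << 27));
}

int main(void)
{
	printf("AVX usable: %s\n", cpu_has_avx() ? "yes" : "no");
	return 0;
}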

cpu-miner.c

@@ -508,7 +508,7 @@ static void *miner_thread(void *userdata)
 		unsigned long hashes_done;
 		struct timeval tv_start, tv_end, diff;
 		int64_t max64;
-		bool rc;
+		int rc;
 
 		/* obtain new work from internal workio thread */
 		pthread_mutex_lock(&g_work_lock);
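Here rc holds the result of scanhash_scrypt, which is declared to return int; the last hunk of scrypt.c below changes its stray return true/false to return 1/0, and the variable's type now matches.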

File diff suppressed because it is too large.

scrypt-x86.S

@@ -24,6 +24,8 @@
  * SUCH DAMAGE.
  */
 
 #include "cpuminer-config.h"
 
+#if defined(__linux__) && defined(__ELF__)
+	.section .note.GNU-stack,"",%progbits
+#endif
@@ -65,7 +67,7 @@
 	movl %edx, \do+56(\dest)
 .endm
 
-.macro gen_salsa8_core_quadround
+.macro salsa8_core_gen_quadround
 	movl 52(%esp), %ecx
 	movl 4(%esp), %edx
 	movl 20(%esp), %ebx
@@ -387,9 +389,9 @@
 	.text
 	.p2align 5
-gen_salsa8_core:
-	gen_salsa8_core_quadround
-	gen_salsa8_core_quadround
+salsa8_core_gen:
+	salsa8_core_gen_quadround
+	salsa8_core_gen_quadround
 	ret
@@ -408,9 +410,9 @@ _scrypt_core:
 	movl $1, %eax
 	cpuid
 	andl $0x04000000, %edx
-	jnz xmm_scrypt_core
+	jnz scrypt_core_sse2
 
-gen_scrypt_core:
+scrypt_core_gen:
 	movl 20(%esp), %edi
 	movl 24(%esp), %esi
 	subl $72, %esp
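For reference: this dispatch executes CPUID leaf 1 and tests bit 26 of EDX (the 0x04000000 mask), which is the SSE2 feature flag, jumping to the SSE2 core when it is set and falling through to the generic core otherwise. In C the same test would look roughly like this (illustrative sketch only):

#include <stdint.h>
#include <cpuid.h>

void scrypt_core_gen(uint32_t *X, uint32_t *V);   /* generic x86 path */
void scrypt_core_sse2(uint32_t *X, uint32_t *V);  /* SSE2 path */

void scrypt_core(uint32_t *X, uint32_t *V)
{
	unsigned int eax, ebx, ecx, edx;
	if (__get_cpuid(1, &eax, &ebx, &ecx, &edx) && (edx & 0x04000000))
		scrypt_core_sse2(X, V);  /* EDX bit 26: SSE2 */
	else
		scrypt_core_gen(X, V);
}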
@@ -452,7 +454,7 @@
 .endm
 
 	leal 131072(%esi), %ecx
-gen_scrypt_core_loop1:
+scrypt_core_gen_loop1:
 	movl %esi, 64(%esp)
 	movl %ecx, 68(%esp)
@@ -473,7 +475,7 @@ gen_scrypt_core_loop1:
 	scrypt_core_macro1a 56, 120
 	scrypt_core_macro1a 60, 124
 
-	call gen_salsa8_core
+	call salsa8_core_gen
 	movl 92(%esp), %edi
 
 	scrypt_core_macro2 0, 64
@@ -493,7 +495,7 @@ gen_scrypt_core_loop1:
 	scrypt_core_macro2 56, 120
 	scrypt_core_macro2 60, 124
 
-	call gen_salsa8_core
+	call salsa8_core_gen
 	movl 92(%esp), %edi
 
 	scrypt_core_macro3 0, 64
@@ -517,11 +519,11 @@ gen_scrypt_core_loop1:
 	movl 68(%esp), %ecx
 	addl $128, %esi
 	cmpl %ecx, %esi
-	jne gen_scrypt_core_loop1
+	jne scrypt_core_gen_loop1
 
 	movl 96(%esp), %esi
 	movl $1024, %ecx
-gen_scrypt_core_loop2:
+scrypt_core_gen_loop2:
 	movl %ecx, 68(%esp)
 
 	movl 64(%edi), %edx
@@ -545,7 +547,7 @@ gen_scrypt_core_loop2:
 	scrypt_core_macro1b 56, 120
 	scrypt_core_macro1b 60, 124
 
-	call gen_salsa8_core
+	call salsa8_core_gen
 	movl 92(%esp), %edi
 
 	scrypt_core_macro2 0, 64
@@ -565,7 +567,7 @@ gen_scrypt_core_loop2:
 	scrypt_core_macro2 56, 120
 	scrypt_core_macro2 60, 124
 
-	call gen_salsa8_core
+	call salsa8_core_gen
 	movl 92(%esp), %edi
 	movl 96(%esp), %esi
@@ -588,7 +590,7 @@ gen_scrypt_core_loop2:
 	movl 68(%esp), %ecx
 	subl $1, %ecx
-	ja gen_scrypt_core_loop2
+	ja scrypt_core_gen_loop2
 
 	addl $72, %esp
 	popl %esi
@@ -598,7 +600,7 @@ gen_scrypt_core_loop2:
 	ret
 
-.macro xmm_salsa8_core_doubleround
+.macro salsa8_core_sse2_doubleround
 	movdqa %xmm1, %xmm4
 	paddd %xmm0, %xmm4
 	movdqa %xmm4, %xmm5
@@ -670,15 +672,15 @@ gen_scrypt_core_loop2:
 	pxor %xmm5, %xmm0
 .endm
 
-.macro xmm_salsa8_core
-	xmm_salsa8_core_doubleround
-	xmm_salsa8_core_doubleround
-	xmm_salsa8_core_doubleround
-	xmm_salsa8_core_doubleround
+.macro salsa8_core_sse2
+	salsa8_core_sse2_doubleround
+	salsa8_core_sse2_doubleround
+	salsa8_core_sse2_doubleround
+	salsa8_core_sse2_doubleround
 .endm
 
 	.p2align 5
-xmm_scrypt_core:
+scrypt_core_sse2:
 	movl 20(%esp), %edi
 	movl 24(%esp), %esi
 	movl %esp, %ebp
@@ -693,7 +695,7 @@ xmm_scrypt_core:
 	movl %esi, %edx
 	leal 131072(%esi), %ecx
-xmm_scrypt_core_loop1:
+scrypt_core_sse2_loop1:
 	movdqa 0(%esp), %xmm0
 	movdqa 16(%esp), %xmm1
 	movdqa 32(%esp), %xmm2
@@ -713,7 +715,7 @@ xmm_scrypt_core_loop1:
 	movdqa %xmm6, 96(%edx)
 	movdqa %xmm7, 112(%edx)
 
-	xmm_salsa8_core
+	salsa8_core_sse2
 	paddd 0(%edx), %xmm0
 	paddd 16(%edx), %xmm1
 	paddd 32(%edx), %xmm2
@@ -731,7 +733,7 @@ xmm_scrypt_core_loop1:
 	movdqa %xmm1, 80(%esp)
 	movdqa %xmm2, %xmm6
 	movdqa %xmm3, %xmm7
-	xmm_salsa8_core
+	salsa8_core_sse2
 	paddd 64(%esp), %xmm0
 	paddd 80(%esp), %xmm1
 	paddd %xmm2, %xmm6
@@ -741,13 +743,13 @@ xmm_scrypt_core_loop1:
 	addl $128, %edx
 	cmpl %ecx, %edx
-	jne xmm_scrypt_core_loop1
+	jne scrypt_core_sse2_loop1
 
 	movdqa 64(%esp), %xmm4
 	movdqa 80(%esp), %xmm5
 
 	movl $1024, %ecx
-xmm_scrypt_core_loop2:
+scrypt_core_sse2_loop2:
 	movdqa 0(%esp), %xmm0
 	movdqa 16(%esp), %xmm1
 	movdqa 32(%esp), %xmm2
@@ -768,7 +770,7 @@ xmm_scrypt_core_loop2:
 	movdqa %xmm1, 16(%esp)
 	movdqa %xmm2, 32(%esp)
 	movdqa %xmm3, 48(%esp)
-	xmm_salsa8_core
+	salsa8_core_sse2
 	paddd 0(%esp), %xmm0
 	paddd 16(%esp), %xmm1
 	paddd 32(%esp), %xmm2
@@ -790,7 +792,7 @@ xmm_scrypt_core_loop2:
 	movdqa %xmm1, 80(%esp)
 	movdqa %xmm2, %xmm6
 	movdqa %xmm3, %xmm7
-	xmm_salsa8_core
+	salsa8_core_sse2
 	paddd 64(%esp), %xmm0
 	paddd 80(%esp), %xmm1
 	paddd %xmm2, %xmm6
@@ -801,7 +803,7 @@ xmm_scrypt_core_loop2:
 	movdqa %xmm1, 80(%esp)
 
 	subl $1, %ecx
-	ja xmm_scrypt_core_loop2
+	ja scrypt_core_sse2_loop2
 
 	movdqa %xmm6, 96(%esp)
 	movdqa %xmm7, 112(%esp)
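As the salsa8_core_sse2 macro spells out, the Salsa20/8 core used by scrypt is exactly four double rounds. For readers cross-checking the assembly, this is the scalar C form of one double round (standard Salsa20, equivalent to what the reference salsa20_8 in scrypt.c computes):

#include <stdint.h>

#define ROTL(a, b) (((a) << (b)) | ((a) >> (32 - (b))))

/* One Salsa20 double round: a column round followed by a row round. */
static void salsa20_doubleround(uint32_t x[16])
{
	/* column round: quarter rounds down each column */
	x[ 4] ^= ROTL(x[ 0] + x[12],  7);  x[ 8] ^= ROTL(x[ 4] + x[ 0],  9);
	x[12] ^= ROTL(x[ 8] + x[ 4], 13);  x[ 0] ^= ROTL(x[12] + x[ 8], 18);
	x[ 9] ^= ROTL(x[ 5] + x[ 1],  7);  x[13] ^= ROTL(x[ 9] + x[ 5],  9);
	x[ 1] ^= ROTL(x[13] + x[ 9], 13);  x[ 5] ^= ROTL(x[ 1] + x[13], 18);
	x[14] ^= ROTL(x[10] + x[ 6],  7);  x[ 2] ^= ROTL(x[14] + x[10],  9);
	x[ 6] ^= ROTL(x[ 2] + x[14], 13);  x[10] ^= ROTL(x[ 6] + x[ 2], 18);
	x[ 3] ^= ROTL(x[15] + x[11],  7);  x[ 7] ^= ROTL(x[ 3] + x[15],  9);
	x[11] ^= ROTL(x[ 7] + x[ 3], 13);  x[15] ^= ROTL(x[11] + x[ 7], 18);
	/* row round: quarter rounds along each row */
	x[ 1] ^= ROTL(x[ 0] + x[ 3],  7);  x[ 2] ^= ROTL(x[ 1] + x[ 0],  9);
	x[ 3] ^= ROTL(x[ 2] + x[ 1], 13);  x[ 0] ^= ROTL(x[ 3] + x[ 2], 18);
	x[ 6] ^= ROTL(x[ 5] + x[ 4],  7);  x[ 7] ^= ROTL(x[ 6] + x[ 5],  9);
	x[ 4] ^= ROTL(x[ 7] + x[ 6], 13);  x[ 5] ^= ROTL(x[ 4] + x[ 7], 18);
	x[11] ^= ROTL(x[10] + x[ 9],  7);  x[ 8] ^= ROTL(x[11] + x[10],  9);
	x[ 9] ^= ROTL(x[ 8] + x[11], 13);  x[10] ^= ROTL(x[ 9] + x[ 8], 18);
	x[12] ^= ROTL(x[15] + x[14],  7);  x[13] ^= ROTL(x[12] + x[15],  9);
	x[14] ^= ROTL(x[13] + x[12], 13);  x[15] ^= ROTL(x[14] + x[13], 18);
}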

scrypt.c

@@ -403,24 +403,18 @@ static inline void PBKDF2_SHA256_128_32_4way(uint32_t *tstate,
 
 #if defined(__x86_64__)
 
-#define SCRYPT_3WAY
-#define SCRYPT_BUFFER_SIZE (3 * 131072 + 63)
+#define SCRYPT_MAX_WAYS 3
 
 int scrypt_best_throughput();
 void scrypt_core(uint32_t *X, uint32_t *V);
-void scrypt_core_2way(uint32_t *X, uint32_t *Y, uint32_t *V);
-void scrypt_core_3way(uint32_t *X, uint32_t *Y, uint32_t *Z, uint32_t *V);
+void scrypt_core_2way(uint32_t *X, uint32_t *V);
+void scrypt_core_3way(uint32_t *X, uint32_t *V);
 
 #elif defined(__i386__)
 
-#define SCRYPT_BUFFER_SIZE (131072 + 63)
-
 void scrypt_core(uint32_t *X, uint32_t *V);
 
 #else
 
-#define SCRYPT_BUFFER_SIZE (131072 + 63)
-
 static inline void salsa20_8(uint32_t B[16], const uint32_t Bx[16])
 {
 	uint32_t x00,x01,x02,x03,x04,x05,x06,x07,x08,x09,x10,x11,x12,x13,x14,x15;
@@ -512,6 +506,13 @@ static inline void scrypt_core(uint32_t *X, uint32_t *V)
 
 #endif
 
+#ifndef SCRYPT_MAX_WAYS
+#define SCRYPT_MAX_WAYS 1
+#define scrypt_best_throughput() 1
+#endif
+
+#define SCRYPT_BUFFER_SIZE (SCRYPT_MAX_WAYS * 131072 + 63)
+
 unsigned char *scrypt_buffer_alloc()
 {
 	return malloc(SCRYPT_BUFFER_SIZE);
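The 63 extra bytes in SCRYPT_BUFFER_SIZE exist so that the scratchpad, whose base address malloc does not align that strictly, can be rounded up to a 64-byte cache-line boundary; the worker functions do exactly that with the ((uintptr_t)scratchpad + 63) & ~63 idiom seen in the hunks below. A minimal self-contained illustration (assuming the 3-way x86-64 configuration):

#include <stdint.h>
#include <stdlib.h>

#define SCRYPT_MAX_WAYS 3
#define SCRYPT_BUFFER_SIZE (SCRYPT_MAX_WAYS * 131072 + 63)

int main(void)
{
	unsigned char *scratchpad = malloc(SCRYPT_BUFFER_SIZE);
	if (!scratchpad)
		return 1;
	/* Round up to the next multiple of 64; the 63 bytes of slack
	 * guarantee SCRYPT_MAX_WAYS * 128 KiB still fit after rounding. */
	uint32_t *V = (uint32_t *)(((uintptr_t)scratchpad + 63) & ~(uintptr_t)63);
	(void)V;
	free(scratchpad);
	return 0;
}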
@@ -533,38 +534,35 @@ static void scrypt_1024_1_1_256_sp(const uint32_t *input, uint32_t *output,
 	return PBKDF2_SHA256_128_32(tstate, ostate, X, output);
 }
 
-#ifdef SCRYPT_3WAY
-static void scrypt_1024_1_1_256_sp_2way(const uint32_t *input1,
-	const uint32_t *input2, uint32_t *output1, uint32_t *output2,
-	unsigned char *scratchpad)
+#if SCRYPT_MAX_WAYS >= 2
+static void scrypt_1024_1_1_256_sp_2way(const uint32_t *input,
+	uint32_t *output, unsigned char *scratchpad)
 {
 	uint32_t tstate1[8], tstate2[8];
 	uint32_t ostate1[8], ostate2[8];
 	uint32_t *V;
-	uint32_t X[32], Y[32];
+	uint32_t X[2 * 32], *Y = X + 32;
 
 	V = (uint32_t *)(((uintptr_t)(scratchpad) + 63) & ~ (uintptr_t)(63));
 
-	HMAC_SHA256_80_init(input1, tstate1, ostate1);
-	HMAC_SHA256_80_init(input2, tstate2, ostate2);
-	PBKDF2_SHA256_80_128(tstate1, ostate1, input1, X);
-	PBKDF2_SHA256_80_128(tstate2, ostate2, input2, Y);
+	HMAC_SHA256_80_init(input, tstate1, ostate1);
+	HMAC_SHA256_80_init(input + 20, tstate2, ostate2);
+	PBKDF2_SHA256_80_128(tstate1, ostate1, input, X);
+	PBKDF2_SHA256_80_128(tstate2, ostate2, input + 20, Y);
 
-	scrypt_core_2way(X, Y, V);
+	scrypt_core_2way(X, V);
 
-	PBKDF2_SHA256_128_32(tstate1, ostate1, X, output1);
-	PBKDF2_SHA256_128_32(tstate2, ostate2, Y, output2);
+	PBKDF2_SHA256_128_32(tstate1, ostate1, X, output);
+	PBKDF2_SHA256_128_32(tstate2, ostate2, Y, output + 8);
 }
+#endif /* SCRYPT_MAX_WAYS >= 2 */
 
-static void scrypt_1024_1_1_256_sp_3way(
-	const uint32_t *input1, const uint32_t *input2, const uint32_t *input3,
-	uint32_t *output1, uint32_t *output2, uint32_t *output3,
-	unsigned char *scratchpad)
+#if SCRYPT_MAX_WAYS >= 3
+static void scrypt_1024_1_1_256_sp_3way(const uint32_t *input,
+	uint32_t *output, unsigned char *scratchpad)
 {
 #ifdef SHA256_4WAY
 	uint32_t tstate[4 * 8], ostate[4 * 8];
-	uint32_t input[4 * 20], output[4 * 32];
-	uint32_t X[32], Y[32], Z[32];
+	uint32_t X[3 * 32];
 	uint32_t W[4 * 32];
 	uint32_t *V;
 	int i;
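A note on the new calling convention visible above: instead of separate X/Y/Z buffers and input1/input2/input3 pointers, each helper now receives one packed buffer per role; lane k's 32-word scrypt state sits at X + k * 32, its 20-word block header at input + k * 20, and its 8-word hash at output + k * 8. Hypothetical accessors to make the layout explicit (not in the source):

#include <stdint.h>

static inline uint32_t *way_state(uint32_t *X, int k)                 { return X + k * 32; }
static inline const uint32_t *way_input(const uint32_t *input, int k) { return input + k * 20; }
static inline uint32_t *way_output(uint32_t *output, int k)           { return output + k * 8; }

Keeping the per-lane states contiguous is what lets scrypt_core_2way and scrypt_core_3way take a single pointer.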
@@ -572,53 +570,31 @@ static void scrypt_1024_1_1_256_sp_3way(
 	V = (uint32_t *)(((uintptr_t)(scratchpad) + 63) & ~ (uintptr_t)(63));
 
 	for (i = 0; i < 20; i++) {
-		input[4 * i + 0] = input1[i];
-		input[4 * i + 1] = input2[i];
-		input[4 * i + 2] = input3[i];
+		W[4 * i + 0] = input[i];
+		W[4 * i + 1] = input[i + 20];
+		W[4 * i + 2] = input[i + 40];
 	}
-	HMAC_SHA256_80_init_4way(input, tstate, ostate);
-	PBKDF2_SHA256_80_128_4way(tstate, ostate, input, W);
+	HMAC_SHA256_80_init_4way(W, tstate, ostate);
+	PBKDF2_SHA256_80_128_4way(tstate, ostate, W, W);
 	for (i = 0; i < 32; i++) {
-		X[i] = W[4 * i + 0];
-		Y[i] = W[4 * i + 1];
-		Z[i] = W[4 * i + 2];
+		X[0 * 32 + i] = W[4 * i + 0];
+		X[1 * 32 + i] = W[4 * i + 1];
+		X[2 * 32 + i] = W[4 * i + 2];
 	}
-	scrypt_core_3way(X, Y, Z, V);
+	scrypt_core_3way(X, V);
 	for (i = 0; i < 32; i++) {
-		W[4 * i + 0] = X[i];
-		W[4 * i + 1] = Y[i];
-		W[4 * i + 2] = Z[i];
+		W[4 * i + 0] = X[0 * 32 + i];
+		W[4 * i + 1] = X[1 * 32 + i];
+		W[4 * i + 2] = X[2 * 32 + i];
 	}
-	PBKDF2_SHA256_128_32_4way(tstate, ostate, W, output);
+	PBKDF2_SHA256_128_32_4way(tstate, ostate, W, W);
 	for (i = 0; i < 8; i++) {
-		output1[i] = output[4 * i + 0];
-		output2[i] = output[4 * i + 1];
-		output3[i] = output[4 * i + 2];
+		output[i] = W[4 * i + 0];
+		output[i + 8] = W[4 * i + 1];
+		output[i + 16] = W[4 * i + 2];
 	}
-#else
-	uint32_t tstate1[8], tstate2[8], tstate3[8];
-	uint32_t ostate1[8], ostate2[8], ostate3[8];
-	uint32_t X[32], Y[32], Z[32];
-	uint32_t *V;
-
-	V = (uint32_t *)(((uintptr_t)(scratchpad) + 63) & ~ (uintptr_t)(63));
-
-	HMAC_SHA256_80_init(input1, tstate1, ostate1);
-	HMAC_SHA256_80_init(input2, tstate2, ostate2);
-	HMAC_SHA256_80_init(input3, tstate3, ostate3);
-	PBKDF2_SHA256_80_128(tstate1, ostate1, input1, X);
-	PBKDF2_SHA256_80_128(tstate2, ostate2, input2, Y);
-	PBKDF2_SHA256_80_128(tstate3, ostate3, input3, Z);
-	scrypt_core_3way(X, Y, Z, V);
-	PBKDF2_SHA256_128_32(tstate1, ostate1, X, output1);
-	PBKDF2_SHA256_128_32(tstate2, ostate2, Y, output2);
-	PBKDF2_SHA256_128_32(tstate3, ostate3, Z, output3);
 #endif /* SHA256_4WAY*/
 }
-#endif /* SCRYPT_3WAY */
+#endif /* SCRYPT_MAX_WAYS >= 3 */
 
 __attribute__ ((noinline)) static int confirm_hash(const uint32_t *hash,
 	const uint32_t *target)
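The 4-way SHA-256 helpers consume word-interleaved data, word i of lane j living at W[4 * i + j], which is why the loops above transpose between the packed X layout and W before and after each hashing step. A standalone sketch of that transposition (hypothetical helper, not from this source):

#include <stdint.h>
#include <string.h>

/* Pack `lanes` sequential n-word buffers into 4-way interleaved form:
 * src[j * n + i]  ->  dst[4 * i + j].  Unused lanes are zeroed. */
static void interleave_4way(uint32_t *dst, const uint32_t *src, int lanes, int n)
{
	int i, j;
	memset(dst, 0, 4 * n * sizeof(uint32_t));
	for (i = 0; i < n; i++)
		for (j = 0; j < lanes; j++)
			dst[4 * i + j] = src[j * n + i];
}

With three scrypt lanes, the fourth SIMD lane simply goes unused.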
@@ -638,63 +614,46 @@ int scanhash_scrypt(int thr_id, unsigned char *pdata,
 	unsigned char *scratchbuf, const unsigned char *ptarget,
 	uint32_t max_nonce, uint32_t *next_nonce, unsigned long *hashes_done)
 {
-	uint32_t data[20], hash[8];
-#ifdef SCRYPT_3WAY
-	uint32_t data2[20], hash2[8];
-	uint32_t data3[20], hash3[8];
-	int throughput;
-#endif
+	uint32_t data[SCRYPT_MAX_WAYS * 20], hash[SCRYPT_MAX_WAYS * 8];
 	unsigned long first_nonce = *next_nonce;
 	uint32_t n = *next_nonce;
 	uint32_t Htarg = le32dec(&((const uint32_t *)ptarget)[7]);
+	const int throughput = scrypt_best_throughput();
 	int i;
 
 	for (i = 0; i < 19; i++)
 		data[i] = be32dec(&((const uint32_t *)pdata)[i]);
-#ifdef SCRYPT_3WAY
-	memcpy(data2, data, 80);
-	memcpy(data3, data, 80);
-	throughput = scrypt_best_throughput();
-#endif
+	for (i = 1; i < throughput; i++)
+		memcpy(data + i * 20, data, 80);
 
 	do {
-		data[19] = n++;
-#ifdef SCRYPT_3WAY
-		if (throughput >= 2 && n <= max_nonce) {
-			data2[19] = n++;
-			if (throughput >= 3 && n <= max_nonce) {
-				data3[19] = n++;
-				scrypt_1024_1_1_256_sp_3way(data, data2, data3, hash, hash2, hash3, scratchbuf);
-				if (hash3[7] <= Htarg && confirm_hash(hash3, (uint32_t *)ptarget)) {
-					be32enc(&((uint32_t *)pdata)[19], data3[19]);
-					*next_nonce = n;
-					*hashes_done = n - first_nonce;
-					return true;
-				}
-			} else {
-				scrypt_1024_1_1_256_sp_2way(data, data2, hash, hash2, scratchbuf);
-			}
-			if (hash2[7] <= Htarg && confirm_hash(hash2, (uint32_t *)ptarget)) {
-				be32enc(&((uint32_t *)pdata)[19], data2[19]);
-				*next_nonce = n;
-				*hashes_done = n - first_nonce;
-				return true;
-			}
-		} else {
-			scrypt_1024_1_1_256_sp(data, hash, scratchbuf);
-		}
-#else
-		scrypt_1024_1_1_256_sp(data, hash, scratchbuf);
+		for (i = 0; i < throughput; i++)
+			data[i * 20 + 19] = n++;
+
+#if SCRYPT_MAX_WAYS >= 3
+		if (throughput == 3)
+			scrypt_1024_1_1_256_sp_3way(data, hash, scratchbuf);
+		else
 #endif
-		if (hash[7] <= Htarg && confirm_hash(hash, (uint32_t *)ptarget)) {
-			be32enc(&((uint32_t *)pdata)[19], data[19]);
+#if SCRYPT_MAX_WAYS >= 2
+		if (throughput == 2)
+			scrypt_1024_1_1_256_sp_2way(data, hash, scratchbuf);
+		else
+#endif
+			scrypt_1024_1_1_256_sp(data, hash, scratchbuf);
+
+		for (i = 0; i < throughput; i++) {
+			if (unlikely(hash[i * 8 + 7] <= Htarg)
+			    && likely(confirm_hash(hash + i * 8, (uint32_t *)ptarget))) {
+				be32enc(&((uint32_t *)pdata)[19], data[i * 20 + 19]);
 				*next_nonce = n;
 				*hashes_done = n - first_nonce;
-				return true;
+				return 1;
 			}
 		}
 	} while (n <= max_nonce && !work_restart[thr_id].restart);
 
 	*next_nonce = n;
 	*hashes_done = n - first_nonce;
-	return false;
+	return 0;
 }
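In the rewritten scan loop, unlikely() and likely() are the usual __builtin_expect wrappers (cpuminer presumably defines equivalents in miner.h), telling the compiler that a share-quality hash is rare so the common path stays straight-line:

/* Typical definitions of the hints used above (illustrative). */
#define likely(expr)   (__builtin_expect(!!(expr), 1))
#define unlikely(expr) (__builtin_expect(!!(expr), 0))

The cheap first test compares only word 7 of each lane's hash, its most significant 32 bits when read as a 256-bit little-endian integer, against Htarg, the corresponding word of the target; confirm_hash verifies the full target only in the rare case that the quick test passes.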