Drop unused 2-way scrypt

This commit is contained in:
pooler 2012-03-23 16:35:21 +01:00
parent 8af4ed77e6
commit e52982ab7f
5 changed files with 82 additions and 512 deletions

View file

@ -113,7 +113,8 @@ void sha256_init(uint32_t *state);
void sha256_transform(uint32_t *state, const uint32_t *block, int swap); void sha256_transform(uint32_t *state, const uint32_t *block, int swap);
#if defined(__x86_64__) #if defined(__x86_64__)
#define SHA256_4WAY 1 #define HAVE_SHA256_4WAY 1
int sha256_use_4way();
void sha256_init_4way(uint32_t *state); void sha256_init_4way(uint32_t *state);
void sha256_transform_4way(uint32_t *state, const uint32_t *block, int swap); void sha256_transform_4way(uint32_t *state, const uint32_t *block, int swap);
#endif #endif

View file

@ -903,413 +903,6 @@ scrypt_core_xmm_loop2:
ret ret
/*
 * salsa8_core_2way_xmm_doubleround
 *
 * One double round (per the macro name) applied to two independent states
 * at once, with the instructions of both streams interleaved line by line:
 *   stream A: state x0..x3 in %xmm0-%xmm3,  scratch %xmm4/%xmm5
 *   stream B: state x0..x3 in %xmm8-%xmm11, scratch %xmm6/%xmm7
 *
 * Every step has the form  xT ^= rotl32(xA + xB, n)  on each 32-bit lane.
 * The rotate is built from pslld $n and psrld $(32-n) of the same sum,
 * with both shifted copies XORed into the target register.
 * The pshufd instructions ($0x93, $0x4e, $0x39) rotate the four 32-bit
 * lanes of a state register between steps.
 *
 * Takes no macro arguments. Clobbers %xmm4-%xmm7.
 */
.macro salsa8_core_2way_xmm_doubleround
/* First half, step 1: x3 ^= rotl32(x1 + x0, 7) */
movdqa %xmm1, %xmm4
movdqa %xmm9, %xmm6
paddd %xmm0, %xmm4
paddd %xmm8, %xmm6
movdqa %xmm4, %xmm5
movdqa %xmm6, %xmm7
pslld $7, %xmm4
pslld $7, %xmm6
psrld $25, %xmm5
psrld $25, %xmm7
pxor %xmm4, %xmm3
pxor %xmm6, %xmm11
pxor %xmm5, %xmm3
pxor %xmm7, %xmm11
/* Step 2: x2 ^= rotl32(x0 + x3, 9) */
movdqa %xmm0, %xmm4
movdqa %xmm8, %xmm6
paddd %xmm3, %xmm4
paddd %xmm11, %xmm6
movdqa %xmm4, %xmm5
movdqa %xmm6, %xmm7
pslld $9, %xmm4
pslld $9, %xmm6
psrld $23, %xmm5
psrld $23, %xmm7
pxor %xmm4, %xmm2
pxor %xmm6, %xmm10
/*
 * Copy x3 for the next sum before pshufd rotates its lanes; the second
 * XOR of step 2 (the psrld half) is completed right after the shuffle.
 */
movdqa %xmm3, %xmm4
movdqa %xmm11, %xmm6
pshufd $0x93, %xmm3, %xmm3
pshufd $0x93, %xmm11, %xmm11
pxor %xmm5, %xmm2
pxor %xmm7, %xmm10
/* Step 3: x1 ^= rotl32(x3 + x2, 13), using the pre-shuffle copy of x3 */
paddd %xmm2, %xmm4
paddd %xmm10, %xmm6
movdqa %xmm4, %xmm5
movdqa %xmm6, %xmm7
pslld $13, %xmm4
pslld $13, %xmm6
psrld $19, %xmm5
psrld $19, %xmm7
pxor %xmm4, %xmm1
pxor %xmm6, %xmm9
/*
 * Copy x2 for the next sum, then swap its 64-bit halves ($0x4e); the
 * second XOR of step 3 is completed right after the shuffle.
 */
movdqa %xmm2, %xmm4
movdqa %xmm10, %xmm6
pshufd $0x4e, %xmm2, %xmm2
pshufd $0x4e, %xmm10, %xmm10
pxor %xmm5, %xmm1
pxor %xmm7, %xmm9
/* Step 4: x0 ^= rotl32(x2 + x1, 18), using the pre-shuffle copy of x2 */
paddd %xmm1, %xmm4
paddd %xmm9, %xmm6
movdqa %xmm4, %xmm5
movdqa %xmm6, %xmm7
pslld $18, %xmm4
pslld $18, %xmm6
psrld $14, %xmm5
psrld $14, %xmm7
pxor %xmm4, %xmm0
pxor %xmm6, %xmm8
/* x1 is no longer needed unshuffled in this half: rotate its lanes */
pshufd $0x39, %xmm1, %xmm1
pshufd $0x39, %xmm9, %xmm9
pxor %xmm5, %xmm0
pxor %xmm7, %xmm8
/* Second half, step 1: x1 ^= rotl32(x3 + x0, 7) */
movdqa %xmm3, %xmm4
movdqa %xmm11, %xmm6
paddd %xmm0, %xmm4
paddd %xmm8, %xmm6
movdqa %xmm4, %xmm5
movdqa %xmm6, %xmm7
pslld $7, %xmm4
pslld $7, %xmm6
psrld $25, %xmm5
psrld $25, %xmm7
pxor %xmm4, %xmm1
pxor %xmm6, %xmm9
pxor %xmm5, %xmm1
pxor %xmm7, %xmm9
/* Step 2: x2 ^= rotl32(x0 + x1, 9) */
movdqa %xmm0, %xmm4
movdqa %xmm8, %xmm6
paddd %xmm1, %xmm4
paddd %xmm9, %xmm6
movdqa %xmm4, %xmm5
movdqa %xmm6, %xmm7
pslld $9, %xmm4
pslld $9, %xmm6
psrld $23, %xmm5
psrld $23, %xmm7
pxor %xmm4, %xmm2
pxor %xmm6, %xmm10
/* Copy x1 for the next sum before its lanes are rotated ($0x93) */
movdqa %xmm1, %xmm4
movdqa %xmm9, %xmm6
pshufd $0x93, %xmm1, %xmm1
pshufd $0x93, %xmm9, %xmm9
pxor %xmm5, %xmm2
pxor %xmm7, %xmm10
/* Step 3: x3 ^= rotl32(x1 + x2, 13), using the pre-shuffle copy of x1 */
paddd %xmm2, %xmm4
paddd %xmm10, %xmm6
movdqa %xmm4, %xmm5
movdqa %xmm6, %xmm7
pslld $13, %xmm4
pslld $13, %xmm6
psrld $19, %xmm5
psrld $19, %xmm7
pxor %xmm4, %xmm3
pxor %xmm6, %xmm11
/* Copy x2 for the next sum, then swap its 64-bit halves ($0x4e) */
movdqa %xmm2, %xmm4
movdqa %xmm10, %xmm6
pshufd $0x4e, %xmm2, %xmm2
pshufd $0x4e, %xmm10, %xmm10
pxor %xmm5, %xmm3
pxor %xmm7, %xmm11
/* Step 4: x0 ^= rotl32(x2 + x3, 18), using the pre-shuffle copy of x2 */
paddd %xmm3, %xmm4
paddd %xmm11, %xmm6
movdqa %xmm4, %xmm5
movdqa %xmm6, %xmm7
pslld $18, %xmm4
pslld $18, %xmm6
psrld $14, %xmm5
psrld $14, %xmm7
pxor %xmm4, %xmm0
pxor %xmm6, %xmm8
/* Final lane rotation of x3 ($0x39) before the last XOR into x0 */
pshufd $0x39, %xmm3, %xmm3
pshufd $0x39, %xmm11, %xmm11
pxor %xmm5, %xmm0
pxor %xmm7, %xmm8
.endm
/*
 * salsa8_core_2way_xmm
 *
 * Expands salsa8_core_2way_xmm_doubleround four times back to back
 * (four double rounds = eight rounds). Operates on whatever registers the
 * double-round macro uses; takes no macro arguments.
 */
.macro salsa8_core_2way_xmm
.rept 4
salsa8_core_2way_xmm_doubleround
.endr
.endm
/*
 * void scrypt_core_2way(uint32_t *X, uint32_t *V)
 *
 * Processes two independent 128-byte states ("way A" and "way B") stored
 * back to back in the 256-byte buffer X, using V as scratch memory, and
 * writes the result back to X.
 *
 * ABI: System V AMD64 by default (X in %rdi, V in %rsi). When WIN64 is
 * defined, the Microsoft x64 arguments (%rcx, %rdx) are moved into
 * %rdi/%rsi and the callee-saved %xmm6-%xmm15, %rdi and %rsi are preserved.
 *
 * Memory:
 *   - V is written for 262144 bytes (1024 entries of 256 bytes) by the
 *     first loop and read back by the second. It is accessed with
 *     movdqa / pxor memory operands, so it must be 16-byte aligned.
 *   - 264 bytes of stack hold the working copy of X. With %rsp % 16 == 8
 *     at entry, the pushes and subtractions below leave %rsp 16-byte
 *     aligned, which the movdqa accesses to (%rsp) require.
 *
 * Register roles:
 *   %rbp   loop 1: write pointer into V; loop 2: byte offset of way A's
 *          V entry
 *   %rbx   loop 2: byte offset of way B's half of its V entry
 *   %rcx   loop 1: end pointer (V + 262144); loop 2: iteration counter
 *   %xmm12-%xmm15  bytes 192..255 of the working state (way B's second
 *          half), kept in registers across all iterations
 * Clobbers: %rax is untouched; %rcx, %xmm0-%xmm5 and flags are clobbered
 * (plus %xmm6-%xmm15 in the non-WIN64 build, where they are not saved).
 */
.text
.p2align 6
/* Exported with and without a leading underscore; presumably for targets
 * whose C symbols carry the prefix. */
.globl scrypt_core_2way
.globl _scrypt_core_2way
scrypt_core_2way:
_scrypt_core_2way:
/* %rbx and %rbp are callee-saved and used as scratch below */
pushq %rbx
pushq %rbp
#if defined(WIN64)
/* Save the Win64 callee-saved vector registers; offsets 8+16k are
 * 16-byte aligned here because %rsp % 16 == 8 after the two pushes. */
subq $176, %rsp
movdqa %xmm6, 8(%rsp)
movdqa %xmm7, 24(%rsp)
movdqa %xmm8, 40(%rsp)
movdqa %xmm9, 56(%rsp)
movdqa %xmm10, 72(%rsp)
movdqa %xmm11, 88(%rsp)
movdqa %xmm12, 104(%rsp)
movdqa %xmm13, 120(%rsp)
movdqa %xmm14, 136(%rsp)
movdqa %xmm15, 152(%rsp)
/* %rdi/%rsi are callee-saved on Win64; remap the arguments so the rest
 * of the function can use the System V register assignment. */
pushq %rdi
pushq %rsi
movq %rcx, %rdi
movq %rdx, %rsi
#endif
/* Working copy of X on the stack (256 bytes used, 264 reserved so that
 * %rsp ends up 16-byte aligned). */
subq $264, %rsp
/* scrypt_shuffle is defined outside this block; presumably it copies one
 * 64-byte chunk from X to the stack while reordering its words into the
 * layout the SIMD rounds expect - TODO confirm against the macro. */
scrypt_shuffle %rdi, 0, %rsp, 0
scrypt_shuffle %rdi, 64, %rsp, 64
scrypt_shuffle %rdi, 128, %rsp, 128
scrypt_shuffle %rdi, 192, %rsp, 192
/* Way B's second half lives in %xmm12-%xmm15 from here on */
movdqa 192(%rsp), %xmm12
movdqa 208(%rsp), %xmm13
movdqa 224(%rsp), %xmm14
movdqa 240(%rsp), %xmm15
/* Loop 1 fills V: %rbp walks from V to V + 262144 in 256-byte steps,
 * i.e. 1024 iterations. */
movq %rsi, %rbp
leaq 262144(%rsi), %rcx
scrypt_core_2way_loop1:
/* Load way A (bytes 0..127) and way B's first half (bytes 128..191) */
movdqa 0(%rsp), %xmm0
movdqa 16(%rsp), %xmm1
movdqa 32(%rsp), %xmm2
movdqa 48(%rsp), %xmm3
movdqa 64(%rsp), %xmm4
movdqa 80(%rsp), %xmm5
movdqa 96(%rsp), %xmm6
movdqa 112(%rsp), %xmm7
movdqa 128(%rsp), %xmm8
movdqa 144(%rsp), %xmm9
movdqa 160(%rsp), %xmm10
movdqa 176(%rsp), %xmm11
/* Way A: first half ^= second half; the V entry receives the XORed
 * first half followed by the unmodified second half. */
pxor %xmm4, %xmm0
pxor %xmm5, %xmm1
pxor %xmm6, %xmm2
pxor %xmm7, %xmm3
movdqa %xmm0, 0(%rbp)
movdqa %xmm1, 16(%rbp)
movdqa %xmm2, 32(%rbp)
movdqa %xmm3, 48(%rbp)
movdqa %xmm4, 64(%rbp)
movdqa %xmm5, 80(%rbp)
movdqa %xmm6, 96(%rbp)
movdqa %xmm7, 112(%rbp)
/* Way B: same, stored in bytes 128..255 of the V entry */
pxor %xmm12, %xmm8
pxor %xmm13, %xmm9
pxor %xmm14, %xmm10
pxor %xmm15, %xmm11
movdqa %xmm8, 128(%rbp)
movdqa %xmm9, 144(%rbp)
movdqa %xmm10, 160(%rbp)
movdqa %xmm11, 176(%rbp)
movdqa %xmm12, 192(%rbp)
movdqa %xmm13, 208(%rbp)
movdqa %xmm14, 224(%rbp)
movdqa %xmm15, 240(%rbp)
salsa8_core_2way_xmm
/* Add the pre-round input (just saved in V) to the round output; the
 * sums become the new first halves. */
paddd 0(%rbp), %xmm0
paddd 16(%rbp), %xmm1
paddd 32(%rbp), %xmm2
paddd 48(%rbp), %xmm3
paddd 128(%rbp), %xmm8
paddd 144(%rbp), %xmm9
paddd 160(%rbp), %xmm10
paddd 176(%rbp), %xmm11
movdqa %xmm0, 0(%rsp)
movdqa %xmm1, 16(%rsp)
movdqa %xmm2, 32(%rsp)
movdqa %xmm3, 48(%rsp)
movdqa %xmm8, 128(%rsp)
movdqa %xmm9, 144(%rsp)
movdqa %xmm10, 160(%rsp)
movdqa %xmm11, 176(%rsp)
/* XOR the new first halves with the old second halves to form the input
 * of the second round pass. */
pxor 64(%rsp), %xmm0
pxor 80(%rsp), %xmm1
pxor 96(%rsp), %xmm2
pxor 112(%rsp), %xmm3
pxor %xmm12, %xmm8
pxor %xmm13, %xmm9
pxor %xmm14, %xmm10
pxor %xmm15, %xmm11
/* Keep the pre-round input for the add-back: way A on the stack,
 * way B in %xmm12-%xmm15. */
movdqa %xmm0, 64(%rsp)
movdqa %xmm1, 80(%rsp)
movdqa %xmm2, 96(%rsp)
movdqa %xmm3, 112(%rsp)
movdqa %xmm8, %xmm12
movdqa %xmm9, %xmm13
movdqa %xmm10, %xmm14
movdqa %xmm11, %xmm15
salsa8_core_2way_xmm
/* Add-back; the sums are the new second halves (way A stored to the
 * stack, way B left in %xmm12-%xmm15). */
paddd 64(%rsp), %xmm0
paddd 80(%rsp), %xmm1
paddd 96(%rsp), %xmm2
paddd 112(%rsp), %xmm3
paddd %xmm8, %xmm12
paddd %xmm9, %xmm13
paddd %xmm10, %xmm14
paddd %xmm11, %xmm15
movdqa %xmm0, 64(%rsp)
movdqa %xmm1, 80(%rsp)
movdqa %xmm2, 96(%rsp)
movdqa %xmm3, 112(%rsp)
/* Advance to the next 256-byte V entry; stop at V + 262144 */
addq $256, %rbp
cmpq %rcx, %rbp
jne scrypt_core_2way_loop1
/* Loop 2: 1024 iterations that mix V entries back into the state */
movq $1024, %rcx
scrypt_core_2way_loop2:
movdqa 0(%rsp), %xmm0
movdqa 16(%rsp), %xmm1
movdqa 32(%rsp), %xmm2
movdqa 48(%rsp), %xmm3
movdqa 64(%rsp), %xmm4
movdqa 80(%rsp), %xmm5
movdqa 96(%rsp), %xmm6
movdqa 112(%rsp), %xmm7
movdqa 128(%rsp), %xmm8
movdqa 144(%rsp), %xmm9
movdqa 160(%rsp), %xmm10
movdqa 176(%rsp), %xmm11
/* Way A's V entry: (low dword of its second half & 1023) * 256 bytes.
 * Writing %ebp zero-extends into %rbp, so it is safe as an index. */
movd %xmm4, %ebp
andl $1023, %ebp
shll $8, %ebp
pxor 0(%rsi, %rbp), %xmm0
pxor 16(%rsi, %rbp), %xmm1
pxor 32(%rsi, %rbp), %xmm2
pxor 48(%rsi, %rbp), %xmm3
/* Way B's V entry is selected the same way from %xmm12; +128 skips to
 * way B's half of the 256-byte entry. */
movd %xmm12, %ebx
andl $1023, %ebx
shll $8, %ebx
addl $128, %ebx
pxor 0(%rsi, %rbx), %xmm8
pxor 16(%rsi, %rbx), %xmm9
pxor 32(%rsi, %rbx), %xmm10
pxor 48(%rsi, %rbx), %xmm11
/* Fold in each way's own second half, then save the round input on the
 * stack for the add-back. */
pxor %xmm4, %xmm0
pxor %xmm5, %xmm1
pxor %xmm6, %xmm2
pxor %xmm7, %xmm3
pxor %xmm12, %xmm8
pxor %xmm13, %xmm9
pxor %xmm14, %xmm10
pxor %xmm15, %xmm11
movdqa %xmm0, 0(%rsp)
movdqa %xmm1, 16(%rsp)
movdqa %xmm2, 32(%rsp)
movdqa %xmm3, 48(%rsp)
movdqa %xmm8, 128(%rsp)
movdqa %xmm9, 144(%rsp)
movdqa %xmm10, 160(%rsp)
movdqa %xmm11, 176(%rsp)
salsa8_core_2way_xmm
/* Add-back of the saved round input; results are the new first halves */
paddd 0(%rsp), %xmm0
paddd 16(%rsp), %xmm1
paddd 32(%rsp), %xmm2
paddd 48(%rsp), %xmm3
paddd 128(%rsp), %xmm8
paddd 144(%rsp), %xmm9
paddd 160(%rsp), %xmm10
paddd 176(%rsp), %xmm11
movdqa %xmm0, 0(%rsp)
movdqa %xmm1, 16(%rsp)
movdqa %xmm2, 32(%rsp)
movdqa %xmm3, 48(%rsp)
movdqa %xmm8, 128(%rsp)
movdqa %xmm9, 144(%rsp)
movdqa %xmm10, 160(%rsp)
movdqa %xmm11, 176(%rsp)
/* Second pass: XOR in bytes 64..127 of the same V entries, then each
 * way's own second half. */
pxor 64(%rsi, %rbp), %xmm0
pxor 80(%rsi, %rbp), %xmm1
pxor 96(%rsi, %rbp), %xmm2
pxor 112(%rsi, %rbp), %xmm3
pxor 64(%rsi, %rbx), %xmm8
pxor 80(%rsi, %rbx), %xmm9
pxor 96(%rsi, %rbx), %xmm10
pxor 112(%rsi, %rbx), %xmm11
pxor 64(%rsp), %xmm0
pxor 80(%rsp), %xmm1
pxor 96(%rsp), %xmm2
pxor 112(%rsp), %xmm3
pxor %xmm12, %xmm8
pxor %xmm13, %xmm9
pxor %xmm14, %xmm10
pxor %xmm15, %xmm11
/* Save the round input for the add-back (way A: stack, way B: regs) */
movdqa %xmm0, 64(%rsp)
movdqa %xmm1, 80(%rsp)
movdqa %xmm2, 96(%rsp)
movdqa %xmm3, 112(%rsp)
movdqa %xmm8, %xmm12
movdqa %xmm9, %xmm13
movdqa %xmm10, %xmm14
movdqa %xmm11, %xmm15
salsa8_core_2way_xmm
paddd 64(%rsp), %xmm0
paddd 80(%rsp), %xmm1
paddd 96(%rsp), %xmm2
paddd 112(%rsp), %xmm3
paddd %xmm8, %xmm12
paddd %xmm9, %xmm13
paddd %xmm10, %xmm14
paddd %xmm11, %xmm15
movdqa %xmm0, 64(%rsp)
movdqa %xmm1, 80(%rsp)
movdqa %xmm2, 96(%rsp)
movdqa %xmm3, 112(%rsp)
/* subq sets ZF when the counter hits zero and never borrows here, so
 * ja (CF=0 and ZF=0) loops while %rcx is still nonzero. */
subq $1, %rcx
ja scrypt_core_2way_loop2
/* Spill way B's second half so the whole state is on the stack, then
 * write it back to X through scrypt_shuffle (stack -> X direction). */
movdqa %xmm12, 192(%rsp)
movdqa %xmm13, 208(%rsp)
movdqa %xmm14, 224(%rsp)
movdqa %xmm15, 240(%rsp)
scrypt_shuffle %rsp, 0, %rdi, 0
scrypt_shuffle %rsp, 64, %rdi, 64
scrypt_shuffle %rsp, 128, %rdi, 128
scrypt_shuffle %rsp, 192, %rdi, 192
addq $264, %rsp
#if defined(WIN64)
/* Undo the Win64 prologue in reverse order */
popq %rsi
popq %rdi
movdqa 8(%rsp), %xmm6
movdqa 24(%rsp), %xmm7
movdqa 40(%rsp), %xmm8
movdqa 56(%rsp), %xmm9
movdqa 72(%rsp), %xmm10
movdqa 88(%rsp), %xmm11
movdqa 104(%rsp), %xmm12
movdqa 120(%rsp), %xmm13
movdqa 136(%rsp), %xmm14
movdqa 152(%rsp), %xmm15
addq $176, %rsp
#endif
popq %rbp
popq %rbx
ret
#if defined(USE_AVX) #if defined(USE_AVX)
.macro salsa8_core_3way_avx_doubleround .macro salsa8_core_3way_avx_doubleround
vpaddd %xmm0, %xmm1, %xmm4 vpaddd %xmm0, %xmm1, %xmm4

View file

@ -119,7 +119,7 @@ static inline void PBKDF2_SHA256_128_32(uint32_t *tstate, uint32_t *ostate,
} }
#ifdef SHA256_4WAY #ifdef HAVE_SHA256_4WAY
static const uint32_t keypad_4way[4 * 12] = { static const uint32_t keypad_4way[4 * 12] = {
0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000,
@ -253,15 +253,15 @@ static inline void PBKDF2_SHA256_128_32_4way(uint32_t *tstate,
output[i] = swab32(ostate[i]); output[i] = swab32(ostate[i]);
} }
#endif /* SHA256_4WAY */ #endif /* HAVE_SHA256_4WAY */
#if defined(__x86_64__) #if defined(__x86_64__)
#define SCRYPT_MAX_WAYS 3 #define SCRYPT_MAX_WAYS 3
#define HAVE_SCRYPT_3WAY 1
int scrypt_best_throughput(); int scrypt_best_throughput();
void scrypt_core(uint32_t *X, uint32_t *V); void scrypt_core(uint32_t *X, uint32_t *V);
void scrypt_core_2way(uint32_t *X, uint32_t *V);
void scrypt_core_3way(uint32_t *X, uint32_t *V); void scrypt_core_3way(uint32_t *X, uint32_t *V);
#elif defined(__i386__) #elif defined(__i386__)
@ -370,7 +370,7 @@ unsigned char *scrypt_buffer_alloc()
return malloc(SCRYPT_BUFFER_SIZE); return malloc(SCRYPT_BUFFER_SIZE);
} }
static void scrypt_1024_1_1_256_sp(const uint32_t *input, uint32_t *output, static void scrypt_1024_1_1_256(const uint32_t *input, uint32_t *output,
uint32_t *midstate, unsigned char *scratchpad) uint32_t *midstate, unsigned char *scratchpad)
{ {
uint32_t tstate[8], ostate[8]; uint32_t tstate[8], ostate[8];
@ -388,33 +388,8 @@ static void scrypt_1024_1_1_256_sp(const uint32_t *input, uint32_t *output,
PBKDF2_SHA256_128_32(tstate, ostate, X, output); PBKDF2_SHA256_128_32(tstate, ostate, X, output);
} }
#if SCRYPT_MAX_WAYS >= 2 #ifdef HAVE_SCRYPT_3WAY
static void scrypt_1024_1_1_256_sp_2way(const uint32_t *input, static void scrypt_1024_1_1_256_3way(const uint32_t *input,
uint32_t *output, uint32_t *midstate, unsigned char *scratchpad)
{
uint32_t tstate1[8], tstate2[8];
uint32_t ostate1[8], ostate2[8];
uint32_t X[2 * 32], *Y = X + 32;
uint32_t *V;
V = (uint32_t *)(((uintptr_t)(scratchpad) + 63) & ~ (uintptr_t)(63));
memcpy(tstate1, midstate, 32);
memcpy(tstate2, midstate, 32);
HMAC_SHA256_80_init(input, tstate1, ostate1);
HMAC_SHA256_80_init(input + 20, tstate2, ostate2);
PBKDF2_SHA256_80_128(tstate1, ostate1, input, X);
PBKDF2_SHA256_80_128(tstate2, ostate2, input + 20, Y);
scrypt_core_2way(X, V);
PBKDF2_SHA256_128_32(tstate1, ostate1, X, output);
PBKDF2_SHA256_128_32(tstate2, ostate2, Y, output + 8);
}
#endif /* SCRYPT_MAX_WAYS >= 2 */
#if SCRYPT_MAX_WAYS >= 3
static void scrypt_1024_1_1_256_sp_3way(const uint32_t *input,
uint32_t *output, uint32_t *midstate, unsigned char *scratchpad) uint32_t *output, uint32_t *midstate, unsigned char *scratchpad)
{ {
uint32_t tstate[4 * 8] __attribute__((aligned(128))); uint32_t tstate[4 * 8] __attribute__((aligned(128)));
@ -456,7 +431,7 @@ static void scrypt_1024_1_1_256_sp_3way(const uint32_t *input,
output[i + 16] = W[4 * i + 2]; output[i + 16] = W[4 * i + 2];
} }
} }
#endif /* SCRYPT_MAX_WAYS >= 3 */ #endif /* HAVE_SCRYPT_3WAY */
int scanhash_scrypt(int thr_id, uint32_t *pdata, int scanhash_scrypt(int thr_id, uint32_t *pdata,
unsigned char *scratchbuf, const uint32_t *ptarget, unsigned char *scratchbuf, const uint32_t *ptarget,
@ -469,6 +444,10 @@ int scanhash_scrypt(int thr_id, uint32_t *pdata,
const int throughput = scrypt_best_throughput(); const int throughput = scrypt_best_throughput();
int i; int i;
#ifdef HAVE_SHA256_4WAY
sha256_use_4way();
#endif
for (i = 0; i < throughput; i++) for (i = 0; i < throughput; i++)
memcpy(data + i * 20, pdata, 80); memcpy(data + i * 20, pdata, 80);
@ -479,17 +458,12 @@ int scanhash_scrypt(int thr_id, uint32_t *pdata,
for (i = 0; i < throughput; i++) for (i = 0; i < throughput; i++)
data[i * 20 + 19] = ++n; data[i * 20 + 19] = ++n;
#if SCRYPT_MAX_WAYS >= 3 #ifdef HAVE_SCRYPT_3WAY
if (throughput == 3) if (throughput == 3)
scrypt_1024_1_1_256_sp_3way(data, hash, midstate, scratchbuf); scrypt_1024_1_1_256_3way(data, hash, midstate, scratchbuf);
else else
#endif #endif
#if SCRYPT_MAX_WAYS >= 2 scrypt_1024_1_1_256(data, hash, midstate, scratchbuf);
if (throughput == 2)
scrypt_1024_1_1_256_sp_2way(data, hash, midstate, scratchbuf);
else
#endif
scrypt_1024_1_1_256_sp(data, hash, midstate, scratchbuf);
for (i = 0; i < throughput; i++) { for (i = 0; i < throughput; i++) {
if (hash[i * 8 + 7] <= Htarg && fulltest(hash + i * 8, ptarget)) { if (hash[i * 8 + 7] <= Htarg && fulltest(hash + i * 8, ptarget)) {

View file

@ -881,9 +881,6 @@ sha256_transform_4way_core_addr:
.globl _sha256_transform_4way .globl _sha256_transform_4way
sha256_transform_4way: sha256_transform_4way:
_sha256_transform_4way: _sha256_transform_4way:
movq sha256_transform_4way_core_addr(%rip), %rax
testq %rax, %rax
jz sha256_transform_4way_init
#if defined(WIN64) #if defined(WIN64)
pushq %rdi pushq %rdi
subq $96, %rsp subq $96, %rsp
@ -903,20 +900,8 @@ _sha256_transform_4way:
andq $-128, %rsp andq $-128, %rsp
testq %rdx, %rdx testq %rdx, %rdx
jz sha256_transform_4way_block_copy jnz sha256_transform_4way_swap
p2bswap_rsi_rsp 0
p2bswap_rsi_rsp 2
p2bswap_rsi_rsp 4
p2bswap_rsi_rsp 6
p2bswap_rsi_rsp 8
p2bswap_rsi_rsp 10
p2bswap_rsi_rsp 12
p2bswap_rsi_rsp 14
jmp *%rax
.p2align 6
sha256_transform_4way_block_copy:
movdqu 0*16(%rsi), %xmm0 movdqu 0*16(%rsi), %xmm0
movdqu 1*16(%rsi), %xmm1 movdqu 1*16(%rsi), %xmm1
movdqu 2*16(%rsi), %xmm2 movdqu 2*16(%rsi), %xmm2
@ -949,11 +934,19 @@ sha256_transform_4way_block_copy:
movdqa %xmm5, 13*16(%rsp) movdqa %xmm5, 13*16(%rsp)
movdqa %xmm6, 14*16(%rsp) movdqa %xmm6, 14*16(%rsp)
movdqa %xmm7, 15*16(%rsp) movdqa %xmm7, 15*16(%rsp)
jmp *%rax jmp *sha256_transform_4way_core_addr(%rip)
sha256_transform_4way_init: .p2align 6
call sha2_4way_init sha256_transform_4way_swap:
jmp sha256_transform_4way p2bswap_rsi_rsp 0
p2bswap_rsi_rsp 2
p2bswap_rsi_rsp 4
p2bswap_rsi_rsp 6
p2bswap_rsi_rsp 8
p2bswap_rsi_rsp 10
p2bswap_rsi_rsp 12
p2bswap_rsi_rsp 14
jmp *sha256_transform_4way_core_addr(%rip)
.p2align 6 .p2align 6
sha256_transform_4way_finish: sha256_transform_4way_finish:
@ -1009,14 +1002,7 @@ sha256d_4way_addr:
.globl _sha256d_4way .globl _sha256d_4way
sha256d_4way: sha256d_4way:
_sha256d_4way: _sha256d_4way:
movq sha256d_4way_addr(%rip), %rax jmp *sha256d_4way_addr(%rip)
testq %rax, %rax
jz sha256d_4way_init
jmp *%rax
sha256d_4way_init:
call sha2_4way_init
jmp sha256d_4way
.p2align 6 .p2align 6
@ -1366,8 +1352,12 @@ sha256d_4way_xop:
#endif /* USE_XOP */ #endif /* USE_XOP */
.text
.p2align 6 .p2align 6
sha2_4way_init: .globl sha256_use_4way
.globl _sha256_use_4way
sha256_use_4way:
_sha256_use_4way:
pushq %rbx pushq %rbx
pushq %rcx pushq %rcx
pushq %rdx pushq %rdx
@ -1414,6 +1404,7 @@ sha2_4way_init_done:
popq %rdx popq %rdx
popq %rcx popq %rcx
popq %rbx popq %rbx
movl $1, %eax
ret ret
#endif #endif

77
sha2.c
View file

@ -164,15 +164,12 @@ void sha256_transform(uint32_t *state, const uint32_t *block, int swap)
state[i] += S[i]; state[i] += S[i];
} }
#if defined(__x86_64__) #ifdef HAVE_SHA256_4WAY
#define SHA256D_MAX_WAYS 4
#define SHA256D_WAYS 4
void sha256d_4way(uint32_t *hash, uint32_t *data, const uint32_t *midstate); void sha256d_4way(uint32_t *hash, uint32_t *data, const uint32_t *midstate);
#else #else
#define SHA256D_MAX_WAYS 1
#define SHA256D_WAYS 1 #endif
static const uint32_t sha256d_hash1[16] = { static const uint32_t sha256d_hash1[16] = {
0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000,
@ -340,49 +337,63 @@ static inline void sha256d(uint32_t *hash, uint32_t *W,
hash[i] += sha256_h[i]; hash[i] += sha256_h[i];
} }
#endif
int scanhash_sha256d(int thr_id, uint32_t *pdata, const uint32_t *ptarget, int scanhash_sha256d(int thr_id, uint32_t *pdata, const uint32_t *ptarget,
uint32_t max_nonce, unsigned long *hashes_done) uint32_t max_nonce, unsigned long *hashes_done)
{ {
uint32_t data[SHA256D_WAYS * 64] __attribute__((aligned(128))); uint32_t data[SHA256D_MAX_WAYS * 64] __attribute__((aligned(128)));
uint32_t hash[SHA256D_WAYS * 8] __attribute__((aligned(32))); uint32_t hash[SHA256D_MAX_WAYS * 8] __attribute__((aligned(32)));
uint32_t midstate[SHA256D_WAYS * 8] __attribute__((aligned(32))); uint32_t midstate[SHA256D_MAX_WAYS * 8] __attribute__((aligned(32)));
uint32_t tmp[8];
uint32_t n = pdata[19] - 1; uint32_t n = pdata[19] - 1;
const uint32_t Htarg = ptarget[7]; const uint32_t Htarg = ptarget[7];
#ifdef HAVE_SHA256_4WAY
const int ways = sha256_use_4way() ? 4 : 1;
#else
const int ways = 1;
#endif
int i, j; int i, j;
for (i = 15; i >= 0; i--) for (i = 15; i >= 0; i--)
for (j = 0; j < SHA256D_WAYS; j++) for (j = 0; j < ways; j++)
data[i * SHA256D_WAYS + j] = pdata[16 + i]; data[i * ways + j] = pdata[16 + i];
sha256_init(midstate); sha256_init(midstate);
sha256_transform(midstate, pdata, 0); sha256_transform(midstate, pdata, 0);
for (i = 7; i >= 0; i--) for (i = 7; i >= 0; i--)
for (j = 0; j < SHA256D_WAYS; j++) for (j = 0; j < ways; j++)
midstate[i * SHA256D_WAYS + j] = midstate[i]; midstate[i * ways + j] = midstate[i];
do { #ifdef HAVE_SHA256_4WAY
for (i = 0; i < SHA256D_WAYS; i++) if (ways == 4)
data[SHA256D_WAYS * 3 + i] = ++n; do {
for (i = 0; i < 4; i++)
data[4 * 3 + i] = ++n;
#if SHA256D_WAYS == 4 sha256d_4way(hash, data, midstate);
sha256d_4way(hash, data, midstate);
#else
sha256d(hash, data, midstate);
#endif
for (i = 0; i < SHA256D_WAYS; i++) { for (i = 0; i < 4; i++) {
if (hash[SHA256D_WAYS * 7 + i] <= Htarg) { if (hash[4 * 7 + i] <= Htarg) {
for (j = 0; j < 8; j++) uint32_t tmp[8];
tmp[j] = hash[SHA256D_WAYS * j + i]; for (j = 0; j < 8; j++)
if (fulltest(tmp, ptarget)) { tmp[j] = hash[4 * j + i];
*hashes_done = n - pdata[19] + 1; if (fulltest(tmp, ptarget)) {
pdata[19] = data[SHA256D_WAYS * 3 + i]; *hashes_done = n - pdata[19] + 1;
return 1; pdata[19] = data[4 * 3 + i];
return 1;
}
} }
} }
} while (n < max_nonce && !work_restart[thr_id].restart);
else
#endif
do {
data[3 + i] = ++n;
sha256d(hash, data, midstate);
if (hash[7 + i] <= Htarg) {
if (fulltest(hash, ptarget)) {
*hashes_done = n - pdata[19] + 1;
pdata[19] = data[3 + i];
return 1;
}
} }
} while (n < max_nonce && !work_restart[thr_id].restart); } while (n < max_nonce && !work_restart[thr_id].restart);