Add support for scrypt(N, 1, 1)

pooler 2014-05-25 17:21:36 +02:00
parent a988337f52
commit be1b725270
7 changed files with 235 additions and 159 deletions
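The new variant is selected with -a scrypt:N, where N must be a power of two greater than 1; the scratch buffer, the C reference core and all of the assembly cores now take N as a parameter instead of assuming 1024. The main practical cost of raising N is memory: each hashing lane stores N blocks of 128 bytes. A minimal sketch of that sizing, modeled on the new scrypt_buffer_alloc() below (the helper name and the explicit ways parameter are illustrative, not part of this commit):

    #include <stddef.h>

    /* Illustrative only: mirrors scrypt_buffer_alloc(N), which allocates
     * N * SCRYPT_MAX_WAYS * 128 bytes plus 63 bytes of alignment slack. */
    static size_t scrypt_scratch_bytes(int N, int ways)
    {
        return (size_t)N * ways * 128 + 63;
    }
    /* N = 1024 -> 128 KiB per lane; N = 1048576 -> 128 MiB per lane */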


@ -100,7 +100,7 @@ struct workio_cmd {
} u;
};
enum sha256_algos {
enum algos {
ALGO_SCRYPT, /* scrypt(1024,1,1) */
ALGO_SHA256D, /* SHA-256d */
};
@ -128,7 +128,8 @@ static int opt_fail_pause = 30;
int opt_timeout = 0;
static int opt_scantime = 5;
static const bool opt_time = true;
static enum sha256_algos opt_algo = ALGO_SCRYPT;
static enum algos opt_algo = ALGO_SCRYPT;
static int opt_scrypt_n = 1024;
static int opt_n_threads;
static int num_processors;
static char *rpc_url;
@ -170,6 +171,7 @@ Usage: " PROGRAM_NAME " [OPTIONS]\n\
Options:\n\
-a, --algo=ALGO specify the algorithm to use\n\
scrypt scrypt(1024, 1, 1) (default)\n\
scrypt:N scrypt(N, 1, 1)\n\
sha256d SHA-256d\n\
-o, --url=URL URL of mining server\n\
-O, --userpass=U:P username:password pair for mining server\n\
@ -1080,9 +1082,13 @@ static void *miner_thread(void *userdata)
affine_to_cpu(thr_id, thr_id % num_processors);
}
if (opt_algo == ALGO_SCRYPT)
{
scratchbuf = scrypt_buffer_alloc();
if (opt_algo == ALGO_SCRYPT) {
scratchbuf = scrypt_buffer_alloc(opt_scrypt_n);
if (!scratchbuf) {
applog(LOG_ERR, "scrypt buffer allocation failed");
pthread_mutex_lock(&applog_lock);
exit(1);
}
}
while (1) {
@ -1133,8 +1139,16 @@ static void *miner_thread(void *userdata)
max64 = g_work_time + (have_longpoll ? LP_SCANTIME : opt_scantime)
- time(NULL);
max64 *= thr_hashrates[thr_id];
if (max64 <= 0)
max64 = opt_algo == ALGO_SCRYPT ? 0xfffLL : 0x1fffffLL;
if (max64 <= 0) {
switch (opt_algo) {
case ALGO_SCRYPT:
max64 = opt_scrypt_n < 16 ? 0x3ffff : 0x3fffff / opt_scrypt_n;
break;
case ALGO_SHA256D:
max64 = 0x1fffff;
break;
}
}
if (work.data[19] + max64 > end_nonce)
max_nonce = end_nonce;
else
@ -1147,7 +1161,7 @@ static void *miner_thread(void *userdata)
switch (opt_algo) {
case ALGO_SCRYPT:
rc = scanhash_scrypt(thr_id, work.data, scratchbuf, work.target,
max_nonce, &hashes_done);
max_nonce, &hashes_done, opt_scrypt_n);
break;
case ALGO_SHA256D:
@ -1471,10 +1485,21 @@ static void parse_arg(int key, char *arg, char *pname)
switch(key) {
case 'a':
for (i = 0; i < ARRAY_SIZE(algo_names); i++) {
if (algo_names[i] &&
!strcmp(arg, algo_names[i])) {
opt_algo = i;
break;
v = strlen(algo_names[i]);
if (!strncmp(arg, algo_names[i], v)) {
if (arg[v] == '\0') {
opt_algo = i;
break;
}
if (arg[v] == ':' && i == ALGO_SCRYPT) {
char *ep;
v = strtol(arg+v+1, &ep, 10);
if (*ep || v & (v-1) || v < 2)
continue;
opt_algo = i;
opt_scrypt_n = v;
break;
}
}
}
if (i == ARRAY_SIZE(algo_names)) {
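The parser above accepts scrypt:N only when the suffix parses cleanly as a power of two no smaller than 2; the core later indexes the scratchpad with the mask N - 1, so any other value would be unsound. A standalone sketch of the same check (the function name is hypothetical):

    #include <stdlib.h>

    /* Re-statement of the scrypt:N validation done in parse_arg():
     * reject trailing garbage, values below 2 and non-powers-of-two. */
    static int parse_scrypt_n(const char *s, int *n_out)
    {
        char *ep;
        long n = strtol(s, &ep, 10);
        if (*ep != '\0' || n < 2 || (n & (n - 1)) != 0)
            return 0;
        *n_out = (int)n;
        return 1;
    }

The work-size heuristic in miner_thread() scales with N as well: for the default N = 1024, 0x3fffff / 1024 works out to 0xfff, matching the fixed cap used before this change.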


@ -154,10 +154,10 @@ void sha256_transform_8way(uint32_t *state, const uint32_t *block, int swap);
extern int scanhash_sha256d(int thr_id, uint32_t *pdata,
const uint32_t *ptarget, uint32_t max_nonce, unsigned long *hashes_done);
extern unsigned char *scrypt_buffer_alloc();
extern unsigned char *scrypt_buffer_alloc(int N);
extern int scanhash_scrypt(int thr_id, uint32_t *pdata,
unsigned char *scratchbuf, const uint32_t *ptarget,
uint32_t max_nonce, unsigned long *hashes_done);
uint32_t max_nonce, unsigned long *hashes_done, int N);
struct thr_info {
int id;


@ -72,6 +72,9 @@ Possible values are:
.B scrypt
scrypt(1024, 1, 1) (used by Litecoin)
.TP
.B scrypt:\fIN\fR
scrypt(\fIN\fR, 1, 1) (\fIN\fR must be a power of 2 greater than 1)
.TP
.B sha256d
SHA-256d (used by Bitcoin)
.RE
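For example, passing -a scrypt:2048 on the command line selects scrypt(2048, 1, 1); a value of N that is not a power of two, or is smaller than 2, is rejected when the option is parsed.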


@ -1,5 +1,5 @@
/*
* Copyright 2012 pooler@litecoinpool.org
* Copyright 2012, 2014 pooler@litecoinpool.org
*
* This program is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License as published by the Free
@ -472,14 +472,16 @@ scrypt_core:
_scrypt_core:
stmfd sp!, {r4-r11, lr}
mov r12, sp
sub sp, sp, #21*4
sub sp, sp, #22*4
bic sp, sp, #63
str r12, [sp, #20*4]
str r2, [sp, #21*4]
scrypt_shuffle
ldr r2, [sp, #21*4]
str r0, [sp, #16*4]
add r12, r1, #1024*32*4
add r12, r1, r2, lsl #7
str r12, [sp, #18*4]
scrypt_core_loop1:
add lr, r0, #16*4
@ -517,12 +519,14 @@ scrypt_core_loop1:
cmp r1, r12
bne scrypt_core_loop1
ldr r12, [sp, #21*4]
ldr r4, [r0, #16*4]
sub r1, r1, #1024*32*4
sub r2, r12, #1
str r2, [sp, #21*4]
sub r1, r1, r12, lsl #7
str r1, [sp, #17*4]
mov r4, r4, lsl #32-10
mov r12, #1024
add r1, r1, r4, lsr #32-10-7
and r4, r4, r2
add r1, r1, r4, lsl #7
scrypt_core_loop2:
add r2, r0, #16*4
add r3, r1, #16*4
@ -553,9 +557,10 @@ scrypt_core_loop2:
mov r1, sp
ldr r3, [sp, #17*4]
add r0, r0, #16*4
ldr r2, [sp, #21*4]
scrypt_core_macro3_x4
mov r4, r4, lsl #32-10
add r3, r3, r4, lsr #32-10-7
and r4, r4, r2
add r3, r3, r4, lsl #7
str r3, [sp, #19*4]
#ifdef __ARM_ARCH_5E_OR_6_OR_7__
pld [r3, #16*4]
@ -794,10 +799,11 @@ _scrypt_core_3way:
mov r12, sp
sub sp, sp, #24*16
bic sp, sp, #63
str r12, [sp, #4*16+3*4]
str r2, [sp, #4*16+3*4]
str r12, [sp, #4*16+4*4]
mov r2, r0
vldmia r2!, {q8-q15}
mov r3, r0
vldmia r3!, {q8-q15}
vmov.u64 q0, #0xffffffff
vmov.u32 q1, q8
vmov.u32 q2, q12
@ -809,7 +815,7 @@ _scrypt_core_3way:
vbif.u32 q14, q15, q0
vbif.u32 q11, q1, q0
vbif.u32 q15, q2, q0
vldmia r2!, {q0-q7}
vldmia r3!, {q0-q7}
vswp.u32 d17, d21
vswp.u32 d25, d29
vswp.u32 d18, d22
@ -826,7 +832,7 @@ _scrypt_core_3way:
vbif.u32 q6, q7, q8
vbif.u32 q3, q9, q8
vbif.u32 q7, q10, q8
vldmia r2, {q8-q15}
vldmia r3, {q8-q15}
vswp.u32 d1, d5
vswp.u32 d9, d13
vswp.u32 d2, d6
@ -852,7 +858,7 @@ _scrypt_core_3way:
add lr, sp, #128
vldmia lr, {q0-q7}
add r2, r1, #1024*32*4
add r2, r1, r2, lsl #7
str r0, [sp, #4*16+0*4]
str r2, [sp, #4*16+2*4]
scrypt_core_3way_loop1:
@ -863,12 +869,13 @@ scrypt_core_3way_loop1:
scrypt_core_macro1a_x4
scrypt_core_macro1a_x4
scrypt_core_macro1a_x4
ldr r2, [sp, #4*16+3*4]
scrypt_core_macro1a_x4
sub r1, r1, #4*16
add r1, r1, #1024*32*4
add r1, r1, r2, lsl #7
vstmia r1, {q0-q7}
add r3, r1, #1024*32*4
add r3, r1, r2, lsl #7
vstmia r3, {q8-q15}
add lr, sp, #128
@ -957,20 +964,22 @@ scrypt_core_3way_loop1:
cmp r1, r2
bne scrypt_core_3way_loop1
ldr r2, [sp, #4*16+3*4]
add r5, sp, #256+4*16
vstmia r5, {q12-q15}
sub r1, r1, #1024*32*4
sub r1, r1, r2, lsl #7
str r1, [sp, #4*16+1*4]
mov r2, #1024
scrypt_core_3way_loop2:
str r2, [sp, #4*16+2*4]
ldr r0, [sp, #4*16+0*4]
ldr r1, [sp, #4*16+1*4]
ldr r2, [sp, #4*16+3*4]
ldr r4, [r0, #16*4]
mov r4, r4, lsl #32-10
add r1, r1, r4, lsr #32-10-7
sub r2, r2, #1
and r4, r4, r2
add r1, r1, r4, lsl #7
add r2, r0, #16*4
add r3, r1, #16*4
mov r12, sp
@ -980,29 +989,31 @@ scrypt_core_3way_loop2:
scrypt_core_macro1b_x4
ldr r1, [sp, #4*16+1*4]
add r1, r1, #1024*32*4
add r3, r1, #1024*32*4
ldr r2, [sp, #4*16+3*4]
add r1, r1, r2, lsl #7
add r3, r1, r2, lsl #7
sub r2, r2, #1
vmov r6, r7, d8
mov r6, r6, lsl #32-10
add r6, r1, r6, lsr #32-10-7
and r6, r6, r2
add r6, r1, r6, lsl #7
vmov r7, r8, d24
add lr, sp, #128
vldmia lr, {q0-q3}
pld [r6]
pld [r6, #8*4]
pld [r6, #8*4]
pld [r6, #16*4]
pld [r6, #24*4]
pld [r6, #24*4]
vldmia r6, {q8-q15}
mov r7, r7, lsl #32-10
add r7, r3, r7, lsr #32-10-7
and r7, r7, r2
add r7, r3, r7, lsl #7
veor.u32 q8, q8, q0
veor.u32 q9, q9, q1
veor.u32 q10, q10, q2
veor.u32 q11, q11, q3
pld [r7]
pld [r7, #8*4]
pld [r7, #8*4]
pld [r7, #16*4]
pld [r7, #24*4]
pld [r7, #24*4]
veor.u32 q12, q12, q4
veor.u32 q13, q13, q5
veor.u32 q14, q14, q6
@ -1079,15 +1090,17 @@ scrypt_core_3way_loop2:
ldr r0, [sp, #4*16+0*4]
ldr r3, [sp, #4*16+1*4]
ldr r2, [sp, #4*16+3*4]
mov r1, sp
add r0, r0, #16*4
sub r2, r2, #1
scrypt_core_macro3_x4
mov r4, r4, lsl #32-10
add r3, r3, r4, lsr #32-10-7
and r4, r4, r2
add r3, r3, r4, lsl #7
pld [r3, #16*4]
pld [r3]
pld [r3, #24*4]
pld [r3, #8*4]
pld [r3, #24*4]
pld [r3, #8*4]
scrypt_core_macro3_x6
scrypt_core_macro3_x6
@ -1164,7 +1177,7 @@ scrypt_core_3way_loop2:
vswp.u32 d26, d30
vstmia r0, {q8-q15}
ldr sp, [sp, #4*16+3*4]
ldr sp, [sp, #4*16+4*4]
vpop {q4-q7}
ldmfd sp!, {r4-r11, pc}
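As in the other backends touched by this commit, the hard-coded 1024 disappears from the ARM code: the loop bound and the scratchpad end pointer are derived from the N argument (kept in r2 and spilled to the stack), and the random read in the second loop masks the nonce-derived word with N - 1 instead of shifting out ten bits. A C model of that lookup, illustrative only since the real code keeps everything in registers:

    #include <stdint.h>
    #include <stddef.h>

    /* Mirrors the "and r4, r4, r2 / add r1, r1, r4, lsl #7" sequence:
     * each stored block is 32 words (128 bytes), and N is a power of
     * two, so N - 1 is a valid index mask. */
    static uint32_t *select_block(uint32_t *V, const uint32_t *X, uint32_t N)
    {
        return V + (size_t)(X[16] & (N - 1)) * 32;
    }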


@ -1,5 +1,5 @@
/*
* Copyright 2011-2013 pooler@litecoinpool.org
* Copyright 2011-2014 pooler@litecoinpool.org
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
@ -113,38 +113,38 @@ scrypt_best_throughput_exit:
.macro scrypt_shuffle src, so, dest, do
movl \so+60(\src), %r8d
movl \so+44(\src), %r9d
movl \so+28(\src), %r10d
movl \so+12(\src), %r11d
movl %r8d, \do+12(\dest)
movl %r9d, \do+28(\dest)
movl %r10d, \do+44(\dest)
movl %r11d, \do+60(\dest)
movl \so+40(\src), %r8d
movl \so+8(\src), %r9d
movl \so+48(\src), %r10d
movl \so+16(\src), %r11d
movl %r8d, \do+8(\dest)
movl %r9d, \do+40(\dest)
movl %r10d, \do+16(\dest)
movl %r11d, \do+48(\dest)
movl \so+20(\src), %r8d
movl \so+4(\src), %r9d
movl \so+52(\src), %r10d
movl \so+36(\src), %r11d
movl %r8d, \do+4(\dest)
movl %r9d, \do+20(\dest)
movl %r10d, \do+36(\dest)
movl %r11d, \do+52(\dest)
movl \so+0(\src), %r8d
movl \so+24(\src), %r9d
movl \so+32(\src), %r10d
movl \so+56(\src), %r11d
movl %r8d, \do+0(\dest)
movl %r9d, \do+24(\dest)
movl %r10d, \do+32(\dest)
movl %r11d, \do+56(\dest)
movl \so+60(\src), %eax
movl \so+44(\src), %ebx
movl \so+28(\src), %ecx
movl \so+12(\src), %edx
movl %eax, \do+12(\dest)
movl %ebx, \do+28(\dest)
movl %ecx, \do+44(\dest)
movl %edx, \do+60(\dest)
movl \so+40(\src), %eax
movl \so+8(\src), %ebx
movl \so+48(\src), %ecx
movl \so+16(\src), %edx
movl %eax, \do+8(\dest)
movl %ebx, \do+40(\dest)
movl %ecx, \do+16(\dest)
movl %edx, \do+48(\dest)
movl \so+20(\src), %eax
movl \so+4(\src), %ebx
movl \so+52(\src), %ecx
movl \so+36(\src), %edx
movl %eax, \do+4(\dest)
movl %ebx, \do+20(\dest)
movl %ecx, \do+36(\dest)
movl %edx, \do+52(\dest)
movl \so+0(\src), %eax
movl \so+24(\src), %ebx
movl \so+32(\src), %ecx
movl \so+56(\src), %edx
movl %eax, \do+0(\dest)
movl %ebx, \do+24(\dest)
movl %ecx, \do+32(\dest)
movl %edx, \do+56(\dest)
.endm
@ -384,6 +384,8 @@ _scrypt_core:
pushq %rsi
movq %rcx, %rdi
movq %rdx, %rsi
#else
movq %rdx, %r8
#endif
.macro scrypt_core_cleanup
@ -432,7 +434,10 @@ scrypt_core_gen:
movdqa 96(%rdi), %xmm14
movdqa 112(%rdi), %xmm15
leaq 131072(%rsi), %rcx
movq %r8, %rcx
shlq $7, %rcx
addq %rsi, %rcx
movq %r8, 96(%rsp)
movq %rdi, 104(%rsp)
movq %rsi, 112(%rsp)
movq %rcx, 120(%rsp)
@ -481,11 +486,14 @@ scrypt_core_gen_loop1:
cmpq %rcx, %rsi
jne scrypt_core_gen_loop1
movq $1024, %rcx
movq 96(%rsp), %r8
movq %r8, %rcx
subl $1, %r8d
movq %r8, 96(%rsp)
movd %xmm12, %edx
scrypt_core_gen_loop2:
movq 112(%rsp), %rsi
andl $1023, %edx
andl %r8d, %edx
shll $7, %edx
addq %rsi, %rdx
movdqa 0(%rdx), %xmm0
@ -529,6 +537,7 @@ scrypt_core_gen_loop2:
movdqa %xmm14, 32(%rsp)
movdqa %xmm15, 48(%rsp)
call salsa8_core_gen
movq 96(%rsp), %r8
movq 128(%rsp), %rcx
addl 0(%rsp), %edx
paddd %xmm0, %xmm12
@ -691,7 +700,9 @@ scrypt_core_xmm:
punpckhqdq %xmm0, %xmm13
movq %rsi, %rdx
leaq 131072(%rsi), %rcx
movq %r8, %rcx
shlq $7, %rcx
addq %rsi, %rcx
scrypt_core_xmm_loop1:
pxor %xmm12, %xmm8
pxor %xmm13, %xmm9
@ -734,10 +745,11 @@ scrypt_core_xmm_loop1:
cmpq %rcx, %rdx
jne scrypt_core_xmm_loop1
movq $1024, %rcx
movq %r8, %rcx
subl $1, %r8d
scrypt_core_xmm_loop2:
movd %xmm12, %edx
andl $1023, %edx
andl %r8d, %edx
shll $7, %edx
pxor 0(%rsi, %rdx), %xmm8
pxor 16(%rsi, %rdx), %xmm9
@ -1019,6 +1031,8 @@ _scrypt_core_3way:
pushq %rsi
movq %rcx, %rdi
movq %rdx, %rsi
#else
movq %rdx, %r8
#endif
subq $392, %rsp
@ -1088,7 +1102,9 @@ scrypt_core_3way_avx:
movdqa 256+112(%rsp), %xmm15
movq %rsi, %rbx
leaq 3*131072(%rsi), %rax
leaq (%r8, %r8, 2), %rax
shlq $7, %rax
addq %rsi, %rax
scrypt_core_3way_avx_loop1:
movdqa %xmm0, 64(%rbx)
movdqa %xmm1, 80(%rbx)
@ -1208,7 +1224,8 @@ scrypt_core_3way_avx_loop1:
movdqa %xmm14, 256+96(%rsp)
movdqa %xmm15, 256+112(%rsp)
movq $1024, %rcx
movq %r8, %rcx
subq $1, %r8
scrypt_core_3way_avx_loop2:
movd %xmm0, %ebp
movd %xmm8, %ebx
@ -1225,13 +1242,13 @@ scrypt_core_3way_avx_loop2:
pxor 256+16(%rsp), %xmm13
pxor 256+32(%rsp), %xmm14
pxor 256+48(%rsp), %xmm15
andl $1023, %ebp
andl %r8d, %ebp
leaq (%rbp, %rbp, 2), %rbp
shll $7, %ebp
andl $1023, %ebx
andl %r8d, %ebx
leaq 1(%rbx, %rbx, 2), %rbx
shll $7, %ebx
andl $1023, %eax
andl %r8d, %eax
leaq 2(%rax, %rax, 2), %rax
shll $7, %eax
pxor 0(%rsi, %rbp), %xmm0
@ -1491,7 +1508,9 @@ scrypt_core_3way_xop:
movdqa 256+112(%rsp), %xmm15
movq %rsi, %rbx
leaq 3*131072(%rsi), %rax
leaq (%r8, %r8, 2), %rax
shlq $7, %rax
addq %rsi, %rax
scrypt_core_3way_xop_loop1:
movdqa %xmm0, 64(%rbx)
movdqa %xmm1, 80(%rbx)
@ -1611,7 +1630,8 @@ scrypt_core_3way_xop_loop1:
movdqa %xmm14, 256+96(%rsp)
movdqa %xmm15, 256+112(%rsp)
movq $1024, %rcx
movq %r8, %rcx
subq $1, %r8
scrypt_core_3way_xop_loop2:
movd %xmm0, %ebp
movd %xmm8, %ebx
@ -1628,13 +1648,13 @@ scrypt_core_3way_xop_loop2:
pxor 256+16(%rsp), %xmm13
pxor 256+32(%rsp), %xmm14
pxor 256+48(%rsp), %xmm15
andl $1023, %ebp
andl %r8d, %ebp
leaq (%rbp, %rbp, 2), %rbp
shll $7, %ebp
andl $1023, %ebx
andl %r8d, %ebx
leaq 1(%rbx, %rbx, 2), %rbx
shll $7, %ebx
andl $1023, %eax
andl %r8d, %eax
leaq 2(%rax, %rax, 2), %rax
shll $7, %eax
pxor 0(%rsi, %rbp), %xmm0
@ -1991,7 +2011,9 @@ scrypt_core_3way_xmm:
movdqa 256+112(%rsp), %xmm15
movq %rsi, %rbx
leaq 3*131072(%rsi), %rax
leaq (%r8, %r8, 2), %rax
shlq $7, %rax
addq %rsi, %rax
scrypt_core_3way_xmm_loop1:
movdqa %xmm0, 64(%rbx)
movdqa %xmm1, 80(%rbx)
@ -2111,7 +2133,8 @@ scrypt_core_3way_xmm_loop1:
movdqa %xmm14, 256+96(%rsp)
movdqa %xmm15, 256+112(%rsp)
movq $1024, %rcx
movq %r8, %rcx
subq $1, %r8
scrypt_core_3way_xmm_loop2:
movd %xmm0, %ebp
movd %xmm8, %ebx
@ -2128,13 +2151,13 @@ scrypt_core_3way_xmm_loop2:
pxor 256+16(%rsp), %xmm13
pxor 256+32(%rsp), %xmm14
pxor 256+48(%rsp), %xmm15
andl $1023, %ebp
andl %r8d, %ebp
leaq (%rbp, %rbp, 2), %rbp
shll $7, %ebp
andl $1023, %ebx
andl %r8d, %ebx
leaq 1(%rbx, %rbx, 2), %rbx
shll $7, %ebx
andl $1023, %eax
andl %r8d, %eax
leaq 2(%rax, %rax, 2), %rax
shll $7, %eax
pxor 0(%rsi, %rbp), %xmm0
@ -2445,6 +2468,8 @@ _scrypt_core_6way:
pushq %rsi
movq %rcx, %rdi
movq %rdx, %rsi
#else
movq %rdx, %r8
#endif
movq %rsp, %rdx
subq $768, %rsp
@ -2539,7 +2564,9 @@ scrypt_core_6way_avx2:
vmovdqa 2*256+7*32(%rsp), %ymm15
movq %rsi, %rbx
leaq 6*131072(%rsi), %rax
leaq (%r8, %r8, 2), %rax
shlq $8, %rax
addq %rsi, %rax
scrypt_core_6way_avx2_loop1:
vmovdqa %ymm0, 0*256+4*32(%rbx)
vmovdqa %ymm1, 0*256+5*32(%rbx)
@ -2659,7 +2686,8 @@ scrypt_core_6way_avx2_loop1:
vmovdqa %ymm14, 2*256+6*32(%rsp)
vmovdqa %ymm15, 2*256+7*32(%rsp)
movq $1024, %rcx
movq %r8, %rcx
leaq -1(%r8), %r11
scrypt_core_6way_avx2_loop2:
vmovd %xmm0, %ebp
vmovd %xmm8, %ebx
@ -2682,22 +2710,22 @@ scrypt_core_6way_avx2_loop2:
vpxor 2*256+1*32(%rsp), %ymm13, %ymm13
vpxor 2*256+2*32(%rsp), %ymm14, %ymm14
vpxor 2*256+3*32(%rsp), %ymm15, %ymm15
andl $1023, %ebp
andl %r11d, %ebp
leaq 0(%rbp, %rbp, 2), %rbp
shll $8, %ebp
andl $1023, %ebx
andl %r11d, %ebx
leaq 1(%rbx, %rbx, 2), %rbx
shll $8, %ebx
andl $1023, %eax
andl %r11d, %eax
leaq 2(%rax, %rax, 2), %rax
shll $8, %eax
andl $1023, %r8d
andl %r11d, %r8d
leaq 0(%r8, %r8, 2), %r8
shll $8, %r8d
andl $1023, %r9d
andl %r11d, %r9d
leaq 1(%r9, %r9, 2), %r9
shll $8, %r9d
andl $1023, %r10d
andl %r11d, %r10d
leaq 2(%r10, %r10, 2), %r10
shll $8, %r10d
vmovdqa 0*32(%rsi, %rbp), %xmm4
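The x86-64 cores follow the same pattern: N arrives as the third argument and is normalized into r8 (the new #else branch covers the convention where it comes in via rdx), loop counters are loaded from r8 instead of the constant 1024, and the second-loop indices are masked with N - 1. The AVX2 6-way core copies N - 1 into r11 first, since r8 itself is reused there for per-lane offsets.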


@ -1,5 +1,5 @@
/*
* Copyright 2011-2012 pooler@litecoinpool.org
* Copyright 2011-2012, 2014 pooler@litecoinpool.org
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
@ -415,6 +415,7 @@ _scrypt_core:
scrypt_core_gen:
movl 20(%esp), %edi
movl 24(%esp), %esi
movl 28(%esp), %ecx
subl $72, %esp
.macro scrypt_core_macro1a p, q
@ -453,7 +454,8 @@ scrypt_core_gen:
movl %eax, \q(%edi)
.endm
leal 131072(%esi), %ecx
shll $7, %ecx
addl %esi, %ecx
scrypt_core_gen_loop1:
movl %esi, 64(%esp)
movl %ecx, 68(%esp)
@ -522,12 +524,15 @@ scrypt_core_gen_loop1:
jne scrypt_core_gen_loop1
movl 96(%esp), %esi
movl $1024, %ecx
movl 100(%esp), %ecx
movl %ecx, %eax
subl $1, %eax
movl %eax, 100(%esp)
scrypt_core_gen_loop2:
movl %ecx, 68(%esp)
movl 64(%edi), %edx
andl $1023, %edx
andl 100(%esp), %edx
shll $7, %edx
scrypt_core_macro1b 0, 64
@ -694,7 +699,9 @@ scrypt_core_sse2:
movdqa 112(%esp), %xmm7
movl %esi, %edx
leal 131072(%esi), %ecx
movl 28(%ebp), %ecx
shll $7, %ecx
addl %esi, %ecx
scrypt_core_sse2_loop1:
movdqa 0(%esp), %xmm0
movdqa 16(%esp), %xmm1
@ -748,14 +755,16 @@ scrypt_core_sse2_loop1:
movdqa 64(%esp), %xmm4
movdqa 80(%esp), %xmm5
movl $1024, %ecx
movl 28(%ebp), %ecx
movl %ecx, %eax
subl $1, %eax
scrypt_core_sse2_loop2:
movd %xmm4, %edx
movdqa 0(%esp), %xmm0
movdqa 16(%esp), %xmm1
movdqa 32(%esp), %xmm2
movdqa 48(%esp), %xmm3
andl $1023, %edx
andl %eax, %edx
shll $7, %edx
pxor 0(%esi, %edx), %xmm0
pxor 16(%esi, %edx), %xmm1
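The 32-bit x86 backend reads N from its stack argument instead: the generic path loads 28(%esp) into ecx for the first loop and keeps N - 1 at 100(%esp) as the second-loop mask, while the SSE2 path reloads 28(%ebp) and keeps N - 1 in eax.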


@ -1,5 +1,5 @@
/*
* Copyright 2009 Colin Percival, 2011 ArtForz, 2011-2013 pooler
* Copyright 2009 Colin Percival, 2011 ArtForz, 2011-2014 pooler
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
@ -383,30 +383,30 @@ static inline void PBKDF2_SHA256_128_32_8way(uint32_t *tstate,
#define SCRYPT_MAX_WAYS 12
#define HAVE_SCRYPT_3WAY 1
int scrypt_best_throughput();
void scrypt_core(uint32_t *X, uint32_t *V);
void scrypt_core_3way(uint32_t *X, uint32_t *V);
void scrypt_core(uint32_t *X, uint32_t *V, int N);
void scrypt_core_3way(uint32_t *X, uint32_t *V, int N);
#if defined(USE_AVX2)
#undef SCRYPT_MAX_WAYS
#define SCRYPT_MAX_WAYS 24
#define HAVE_SCRYPT_6WAY 1
void scrypt_core_6way(uint32_t *X, uint32_t *V);
void scrypt_core_6way(uint32_t *X, uint32_t *V, int N);
#endif
#elif defined(USE_ASM) && defined(__i386__)
#define SCRYPT_MAX_WAYS 4
#define scrypt_best_throughput() 1
void scrypt_core(uint32_t *X, uint32_t *V);
void scrypt_core(uint32_t *X, uint32_t *V, int N);
#elif defined(USE_ASM) && defined(__arm__) && defined(__APCS_32__)
void scrypt_core(uint32_t *X, uint32_t *V);
void scrypt_core(uint32_t *X, uint32_t *V, int N);
#if defined(__ARM_NEON__)
#undef HAVE_SHA256_4WAY
#define SCRYPT_MAX_WAYS 3
#define HAVE_SCRYPT_3WAY 1
#define scrypt_best_throughput() 3
void scrypt_core_3way(uint32_t *X, uint32_t *V);
void scrypt_core_3way(uint32_t *X, uint32_t *V, int N);
#endif
#else
@ -479,17 +479,17 @@ static inline void xor_salsa8(uint32_t B[16], const uint32_t Bx[16])
B[15] += x15;
}
static inline void scrypt_core(uint32_t *X, uint32_t *V)
static inline void scrypt_core(uint32_t *X, uint32_t *V, int N)
{
uint32_t i, j, k;
for (i = 0; i < 1024; i++) {
for (i = 0; i < N; i++) {
memcpy(&V[i * 32], X, 128);
xor_salsa8(&X[0], &X[16]);
xor_salsa8(&X[16], &X[0]);
}
for (i = 0; i < 1024; i++) {
j = 32 * (X[16] & 1023);
for (i = 0; i < N; i++) {
j = 32 * (X[16] & (N - 1));
for (k = 0; k < 32; k++)
X[k] ^= V[j + k];
xor_salsa8(&X[0], &X[16]);
@ -504,15 +504,13 @@ static inline void scrypt_core(uint32_t *X, uint32_t *V)
#define scrypt_best_throughput() 1
#endif
#define SCRYPT_BUFFER_SIZE (SCRYPT_MAX_WAYS * 131072 + 63)
unsigned char *scrypt_buffer_alloc()
unsigned char *scrypt_buffer_alloc(int N)
{
return malloc(SCRYPT_BUFFER_SIZE);
return malloc((size_t)N * SCRYPT_MAX_WAYS * 128 + 63);
}
static void scrypt_1024_1_1_256(const uint32_t *input, uint32_t *output,
uint32_t *midstate, unsigned char *scratchpad)
uint32_t *midstate, unsigned char *scratchpad, int N)
{
uint32_t tstate[8], ostate[8];
uint32_t X[32];
@ -524,14 +522,14 @@ static void scrypt_1024_1_1_256(const uint32_t *input, uint32_t *output,
HMAC_SHA256_80_init(input, tstate, ostate);
PBKDF2_SHA256_80_128(tstate, ostate, input, X);
scrypt_core(X, V);
scrypt_core(X, V, N);
PBKDF2_SHA256_128_32(tstate, ostate, X, output);
}
#ifdef HAVE_SHA256_4WAY
static void scrypt_1024_1_1_256_4way(const uint32_t *input,
uint32_t *output, uint32_t *midstate, unsigned char *scratchpad)
uint32_t *output, uint32_t *midstate, unsigned char *scratchpad, int N)
{
uint32_t tstate[4 * 8] __attribute__((aligned(128)));
uint32_t ostate[4 * 8] __attribute__((aligned(128)));
@ -553,10 +551,10 @@ static void scrypt_1024_1_1_256_4way(const uint32_t *input,
for (i = 0; i < 32; i++)
for (k = 0; k < 4; k++)
X[k * 32 + i] = W[4 * i + k];
scrypt_core(X + 0 * 32, V);
scrypt_core(X + 1 * 32, V);
scrypt_core(X + 2 * 32, V);
scrypt_core(X + 3 * 32, V);
scrypt_core(X + 0 * 32, V, N);
scrypt_core(X + 1 * 32, V, N);
scrypt_core(X + 2 * 32, V, N);
scrypt_core(X + 3 * 32, V, N);
for (i = 0; i < 32; i++)
for (k = 0; k < 4; k++)
W[4 * i + k] = X[k * 32 + i];
@ -570,7 +568,7 @@ static void scrypt_1024_1_1_256_4way(const uint32_t *input,
#ifdef HAVE_SCRYPT_3WAY
static void scrypt_1024_1_1_256_3way(const uint32_t *input,
uint32_t *output, uint32_t *midstate, unsigned char *scratchpad)
uint32_t *output, uint32_t *midstate, unsigned char *scratchpad, int N)
{
uint32_t tstate[3 * 8], ostate[3 * 8];
uint32_t X[3 * 32] __attribute__((aligned(64)));
@ -588,7 +586,7 @@ static void scrypt_1024_1_1_256_3way(const uint32_t *input,
PBKDF2_SHA256_80_128(tstate + 8, ostate + 8, input + 20, X + 32);
PBKDF2_SHA256_80_128(tstate + 16, ostate + 16, input + 40, X + 64);
scrypt_core_3way(X, V);
scrypt_core_3way(X, V, N);
PBKDF2_SHA256_128_32(tstate + 0, ostate + 0, X + 0, output + 0);
PBKDF2_SHA256_128_32(tstate + 8, ostate + 8, X + 32, output + 8);
@ -597,7 +595,7 @@ static void scrypt_1024_1_1_256_3way(const uint32_t *input,
#ifdef HAVE_SHA256_4WAY
static void scrypt_1024_1_1_256_12way(const uint32_t *input,
uint32_t *output, uint32_t *midstate, unsigned char *scratchpad)
uint32_t *output, uint32_t *midstate, unsigned char *scratchpad, int N)
{
uint32_t tstate[12 * 8] __attribute__((aligned(128)));
uint32_t ostate[12 * 8] __attribute__((aligned(128)));
@ -626,10 +624,10 @@ static void scrypt_1024_1_1_256_12way(const uint32_t *input,
for (i = 0; i < 32; i++)
for (k = 0; k < 4; k++)
X[128 * j + k * 32 + i] = W[128 * j + 4 * i + k];
scrypt_core_3way(X + 0 * 96, V);
scrypt_core_3way(X + 1 * 96, V);
scrypt_core_3way(X + 2 * 96, V);
scrypt_core_3way(X + 3 * 96, V);
scrypt_core_3way(X + 0 * 96, V, N);
scrypt_core_3way(X + 1 * 96, V, N);
scrypt_core_3way(X + 2 * 96, V, N);
scrypt_core_3way(X + 3 * 96, V, N);
for (j = 0; j < 3; j++)
for (i = 0; i < 32; i++)
for (k = 0; k < 4; k++)
@ -648,7 +646,7 @@ static void scrypt_1024_1_1_256_12way(const uint32_t *input,
#ifdef HAVE_SCRYPT_6WAY
static void scrypt_1024_1_1_256_24way(const uint32_t *input,
uint32_t *output, uint32_t *midstate, unsigned char *scratchpad)
uint32_t *output, uint32_t *midstate, unsigned char *scratchpad, int N)
{
uint32_t tstate[24 * 8] __attribute__((aligned(128)));
uint32_t ostate[24 * 8] __attribute__((aligned(128)));
@ -677,10 +675,10 @@ static void scrypt_1024_1_1_256_24way(const uint32_t *input,
for (i = 0; i < 32; i++)
for (k = 0; k < 8; k++)
X[8 * 32 * j + k * 32 + i] = W[8 * 32 * j + 8 * i + k];
scrypt_core_6way(X + 0 * 32, V);
scrypt_core_6way(X + 6 * 32, V);
scrypt_core_6way(X + 12 * 32, V);
scrypt_core_6way(X + 18 * 32, V);
scrypt_core_6way(X + 0 * 32, V, N);
scrypt_core_6way(X + 6 * 32, V, N);
scrypt_core_6way(X + 12 * 32, V, N);
scrypt_core_6way(X + 18 * 32, V, N);
for (j = 0; j < 3; j++)
for (i = 0; i < 32; i++)
for (k = 0; k < 8; k++)
@ -697,7 +695,7 @@ static void scrypt_1024_1_1_256_24way(const uint32_t *input,
int scanhash_scrypt(int thr_id, uint32_t *pdata,
unsigned char *scratchbuf, const uint32_t *ptarget,
uint32_t max_nonce, unsigned long *hashes_done)
uint32_t max_nonce, unsigned long *hashes_done, int N)
{
uint32_t data[SCRYPT_MAX_WAYS * 20], hash[SCRYPT_MAX_WAYS * 8];
uint32_t midstate[8];
@ -723,25 +721,25 @@ int scanhash_scrypt(int thr_id, uint32_t *pdata,
#if defined(HAVE_SHA256_4WAY)
if (throughput == 4)
scrypt_1024_1_1_256_4way(data, hash, midstate, scratchbuf);
scrypt_1024_1_1_256_4way(data, hash, midstate, scratchbuf, N);
else
#endif
#if defined(HAVE_SCRYPT_3WAY) && defined(HAVE_SHA256_4WAY)
if (throughput == 12)
scrypt_1024_1_1_256_12way(data, hash, midstate, scratchbuf);
scrypt_1024_1_1_256_12way(data, hash, midstate, scratchbuf, N);
else
#endif
#if defined(HAVE_SCRYPT_6WAY)
if (throughput == 24)
scrypt_1024_1_1_256_24way(data, hash, midstate, scratchbuf);
scrypt_1024_1_1_256_24way(data, hash, midstate, scratchbuf, N);
else
#endif
#if defined(HAVE_SCRYPT_3WAY)
if (throughput == 3)
scrypt_1024_1_1_256_3way(data, hash, midstate, scratchbuf);
scrypt_1024_1_1_256_3way(data, hash, midstate, scratchbuf, N);
else
#endif
scrypt_1024_1_1_256(data, hash, midstate, scratchbuf);
scrypt_1024_1_1_256(data, hash, midstate, scratchbuf, N);
for (i = 0; i < throughput; i++) {
if (hash[i * 8 + 7] <= Htarg && fulltest(hash + i * 8, ptarget)) {