diff --git a/cpu-miner.c b/cpu-miner.c
index 45a6424..80b2095 100644
--- a/cpu-miner.c
+++ b/cpu-miner.c
@@ -100,7 +100,7 @@ struct workio_cmd {
 	} u;
 };
 
-enum sha256_algos {
+enum algos {
 	ALGO_SCRYPT,		/* scrypt(1024,1,1) */
 	ALGO_SHA256D,		/* SHA-256d */
 };
@@ -128,7 +128,8 @@ static int opt_fail_pause = 30;
 int opt_timeout = 0;
 static int opt_scantime = 5;
 static const bool opt_time = true;
-static enum sha256_algos opt_algo = ALGO_SCRYPT;
+static enum algos opt_algo = ALGO_SCRYPT;
+static int opt_scrypt_n = 1024;
 static int opt_n_threads;
 static int num_processors;
 static char *rpc_url;
@@ -170,6 +171,7 @@ Usage: " PROGRAM_NAME " [OPTIONS]\n\
 Options:\n\
   -a, --algo=ALGO       specify the algorithm to use\n\
                           scrypt    scrypt(1024, 1, 1) (default)\n\
+                          scrypt:N  scrypt(N, 1, 1)\n\
                           sha256d   SHA-256d\n\
   -o, --url=URL         URL of mining server\n\
   -O, --userpass=U:P    username:password pair for mining server\n\
@@ -1080,9 +1082,13 @@ static void *miner_thread(void *userdata)
 		affine_to_cpu(thr_id, thr_id % num_processors);
 	}
 
-	if (opt_algo == ALGO_SCRYPT)
-	{
-		scratchbuf = scrypt_buffer_alloc();
+	if (opt_algo == ALGO_SCRYPT) {
+		scratchbuf = scrypt_buffer_alloc(opt_scrypt_n);
+		if (!scratchbuf) {
+			applog(LOG_ERR, "scrypt buffer allocation failed");
+			pthread_mutex_lock(&applog_lock);
+			exit(1);
+		}
 	}
 
 	while (1) {
@@ -1133,8 +1139,16 @@ static void *miner_thread(void *userdata)
 			max64 = g_work_time + (have_longpoll ? LP_SCANTIME : opt_scantime)
 			      - time(NULL);
 		max64 *= thr_hashrates[thr_id];
-		if (max64 <= 0)
-			max64 = opt_algo == ALGO_SCRYPT ? 0xfffLL : 0x1fffffLL;
+		if (max64 <= 0) {
+			switch (opt_algo) {
+			case ALGO_SCRYPT:
+				max64 = opt_scrypt_n < 16 ? 0x3ffff : 0x3fffff / opt_scrypt_n;
+				break;
+			case ALGO_SHA256D:
+				max64 = 0x1fffff;
+				break;
+			}
+		}
 		if (work.data[19] + max64 > end_nonce)
 			max_nonce = end_nonce;
 		else
@@ -1147,7 +1161,7 @@ static void *miner_thread(void *userdata)
 		switch (opt_algo) {
 		case ALGO_SCRYPT:
 			rc = scanhash_scrypt(thr_id, work.data, scratchbuf, work.target,
-			                     max_nonce, &hashes_done);
+			                     max_nonce, &hashes_done, opt_scrypt_n);
 			break;
 
 		case ALGO_SHA256D:
@@ -1471,10 +1485,21 @@ static void parse_arg(int key, char *arg, char *pname)
 	switch(key) {
 	case 'a':
 		for (i = 0; i < ARRAY_SIZE(algo_names); i++) {
-			if (algo_names[i] &&
-			    !strcmp(arg, algo_names[i])) {
-				opt_algo = i;
-				break;
+			v = strlen(algo_names[i]);
+			if (!strncmp(arg, algo_names[i], v)) {
+				if (arg[v] == '\0') {
+					opt_algo = i;
+					break;
+				}
+				if (arg[v] == ':' && i == ALGO_SCRYPT) {
+					char *ep;
+					v = strtol(arg+v+1, &ep, 10);
+					if (*ep || v & (v-1) || v < 2)
+						continue;
+					opt_algo = i;
+					opt_scrypt_n = v;
+					break;
+				}
 			}
 		}
 		if (i == ARRAY_SIZE(algo_names)) {
diff --git a/miner.h b/miner.h
index 2ed3653..cc5adbd 100644
--- a/miner.h
+++ b/miner.h
@@ -154,10 +154,10 @@ void sha256_transform_8way(uint32_t *state, const uint32_t *block, int swap);
 extern int scanhash_sha256d(int thr_id, uint32_t *pdata,
 	const uint32_t *ptarget, uint32_t max_nonce, unsigned long *hashes_done);
 
-extern unsigned char *scrypt_buffer_alloc();
+extern unsigned char *scrypt_buffer_alloc(int N);
 extern int scanhash_scrypt(int thr_id, uint32_t *pdata,
 	unsigned char *scratchbuf, const uint32_t *ptarget,
-	uint32_t max_nonce, unsigned long *hashes_done);
+	uint32_t max_nonce, unsigned long *hashes_done, int N);
 
 struct thr_info {
 	int id;
diff --git a/minerd.1 b/minerd.1
index 8c8d34f..3a712d5 100644
--- a/minerd.1
+++ b/minerd.1
@@ -72,6 +72,9 @@ Possible values are:
 .B scrypt
 scrypt(1024, 1, 1) (used by Litecoin)
 .TP
+.B scrypt:\fIN\fR
+scrypt(\fIN\fR, 1, 1) (\fIN\fR must be a power of 2 greater than 1)
+.TP
 .B sha256d
 SHA-256d (used by Bitcoin)
 .RE
diff --git a/scrypt-arm.S b/scrypt-arm.S
index 983202a..5be3b0e 100644
--- a/scrypt-arm.S
+++ b/scrypt-arm.S
@@ -1,5 +1,5 @@
 /*
- * Copyright 2012 pooler@litecoinpool.org
+ * Copyright 2012, 2014 pooler@litecoinpool.org
  *
  * This program is free software; you can redistribute it and/or modify it
  * under the terms of the GNU General Public License as published by the Free
@@ -472,14 +472,16 @@ scrypt_core:
 _scrypt_core:
 	stmfd	sp!, {r4-r11, lr}
 	mov	r12, sp
-	sub	sp, sp, #21*4
+	sub	sp, sp, #22*4
 	bic	sp, sp, #63
 	str	r12, [sp, #20*4]
+	str	r2, [sp, #21*4]
 
 	scrypt_shuffle
 
+	ldr	r2, [sp, #21*4]
 	str	r0, [sp, #16*4]
-	add	r12, r1, #1024*32*4
+	add	r12, r1, r2, lsl #7
 	str	r12, [sp, #18*4]
 scrypt_core_loop1:
 	add	lr, r0, #16*4
@@ -517,12 +519,14 @@ scrypt_core_loop1:
 	cmp	r1, r12
 	bne	scrypt_core_loop1
 
+	ldr	r12, [sp, #21*4]
 	ldr	r4, [r0, #16*4]
-	sub	r1, r1, #1024*32*4
+	sub	r2, r12, #1
+	str	r2, [sp, #21*4]
+	sub	r1, r1, r12, lsl #7
 	str	r1, [sp, #17*4]
-	mov	r4, r4, lsl #32-10
-	mov	r12, #1024
-	add	r1, r1, r4, lsr #32-10-7
+	and	r4, r4, r2
+	add	r1, r1, r4, lsl #7
 scrypt_core_loop2:
 	add	r2, r0, #16*4
 	add	r3, r1, #16*4
@@ -553,9 +557,10 @@ scrypt_core_loop2:
 	mov	r1, sp
 	ldr	r3, [sp, #17*4]
 	add	r0, r0, #16*4
+	ldr	r2, [sp, #21*4]
 	scrypt_core_macro3_x4
-	mov	r4, r4, lsl #32-10
-	add	r3, r3, r4, lsr #32-10-7
+	and	r4, r4, r2
+	add	r3, r3, r4, lsl #7
 	str	r3, [sp, #19*4]
 #ifdef __ARM_ARCH_5E_OR_6_OR_7__
 	pld	[r3, #16*4]
@@ -794,10 +799,11 @@ _scrypt_core_3way:
 	mov	r12, sp
 	sub	sp, sp, #24*16
 	bic	sp, sp, #63
-	str	r12, [sp, #4*16+3*4]
+	str	r2, [sp, #4*16+3*4]
+	str	r12, [sp, #4*16+4*4]
 
-	mov	r2, r0
-	vldmia	r2!, {q8-q15}
+	mov	r3, r0
+	vldmia	r3!, {q8-q15}
 	vmov.u64	q0, #0xffffffff
 	vmov.u32	q1, q8
 	vmov.u32	q2, q12
@@ -809,7 +815,7 @@ _scrypt_core_3way:
 	vbif.u32	q14, q15, q0
 	vbif.u32	q11, q1, q0
 	vbif.u32	q15, q2, q0
-	vldmia	r2!, {q0-q7}
+	vldmia	r3!, {q0-q7}
 	vswp.u32	d17, d21
 	vswp.u32	d25, d29
 	vswp.u32	d18, d22
@@ -826,7 +832,7 @@ _scrypt_core_3way:
 	vbif.u32	q6, q7, q8
 	vbif.u32	q3, q9, q8
 	vbif.u32	q7, q10, q8
-	vldmia	r2, {q8-q15}
+	vldmia	r3, {q8-q15}
 	vswp.u32	d1, d5
 	vswp.u32	d9, d13
 	vswp.u32	d2, d6
@@ -852,7 +858,7 @@ _scrypt_core_3way:
 	add	lr, sp, #128
 	vldmia	lr, {q0-q7}
 
-	add	r2, r1, #1024*32*4
+	add	r2, r1, r2, lsl #7
 	str	r0, [sp, #4*16+0*4]
 	str	r2, [sp, #4*16+2*4]
 scrypt_core_3way_loop1:
@@ -863,12 +869,13 @@ scrypt_core_3way_loop1:
 	scrypt_core_macro1a_x4
 	scrypt_core_macro1a_x4
 	scrypt_core_macro1a_x4
+	ldr	r2, [sp, #4*16+3*4]
 	scrypt_core_macro1a_x4
 	sub	r1, r1, #4*16
-	add	r1, r1, #1024*32*4
+	add	r1, r1, r2, lsl #7
 	vstmia	r1, {q0-q7}
-	add	r3, r1, #1024*32*4
+	add	r3, r1, r2, lsl #7
 	vstmia	r3, {q8-q15}
 
 	add	lr, sp, #128
 	vldmia	lr, {q0-q7}
@@ -957,20 +964,22 @@ scrypt_core_3way_loop1:
 	cmp	r1, r2
 	bne	scrypt_core_3way_loop1
 
+	ldr	r2, [sp, #4*16+3*4]
 	add	r5, sp, #256+4*16
 	vstmia	r5, {q12-q15}
 
-	sub	r1, r1, #1024*32*4
+	sub	r1, r1, r2, lsl #7
 	str	r1, [sp, #4*16+1*4]
-	mov	r2, #1024
 scrypt_core_3way_loop2:
 	str	r2, [sp, #4*16+2*4]
 
 	ldr	r0, [sp, #4*16+0*4]
 	ldr	r1, [sp, #4*16+1*4]
+	ldr	r2, [sp, #4*16+3*4]
 	ldr	r4, [r0, #16*4]
-	mov	r4, r4, lsl #32-10
-	add	r1, r1, r4, lsr #32-10-7
+	sub	r2, r2, #1
+	and	r4, r4, r2
+	add	r1, r1, r4, lsl #7
 	add	r2, r0, #16*4
 	add	r3, r1, #16*4
 	mov	r12, sp
@@ -980,29 +989,31 @@ scrypt_core_3way_loop2:
 	scrypt_core_macro1b_x4
 
 	ldr	r1, [sp, #4*16+1*4]
-	add	r1, r1, #1024*32*4
-	add	r3, r1, #1024*32*4
+	ldr	r2, [sp, #4*16+3*4]
+	add	r1, r1, r2, lsl #7
+	add	r3, r1, r2, lsl #7
+	sub	r2, r2, #1
 	vmov	r6, r7, d8
-	mov	r6, r6, lsl #32-10
-	add	r6, r1, r6, lsr #32-10-7
+	and	r6, r6, r2
+	add	r6, r1, r6, lsl #7
 	vmov	r7, r8, d24
 	add	lr, sp, #128
 	vldmia	lr, {q0-q3}
 	pld	[r6]
-	pld [r6, #8*4]
+	pld	[r6, #8*4]
 	pld	[r6, #16*4]
-	pld [r6, #24*4]
+	pld	[r6, #24*4]
 	vldmia	r6, {q8-q15}
-	mov	r7, r7, lsl #32-10
-	add	r7, r3, r7, lsr #32-10-7
+	and	r7, r7, r2
+	add	r7, r3, r7, lsl #7
 	veor.u32	q8, q8, q0
 	veor.u32	q9, q9, q1
 	veor.u32	q10, q10, q2
 	veor.u32	q11, q11, q3
 	pld	[r7]
-	pld [r7, #8*4]
+	pld	[r7, #8*4]
 	pld	[r7, #16*4]
-	pld [r7, #24*4]
+	pld	[r7, #24*4]
 	veor.u32	q12, q12, q4
 	veor.u32	q13, q13, q5
 	veor.u32	q14, q14, q6
@@ -1079,15 +1090,17 @@ scrypt_core_3way_loop2:
 
 	ldr	r0, [sp, #4*16+0*4]
 	ldr	r3, [sp, #4*16+1*4]
+	ldr	r2, [sp, #4*16+3*4]
 	mov	r1, sp
 	add	r0, r0, #16*4
+	sub	r2, r2, #1
 	scrypt_core_macro3_x4
-	mov	r4, r4, lsl #32-10
-	add	r3, r3, r4, lsr #32-10-7
+	and	r4, r4, r2
+	add	r3, r3, r4, lsl #7
 	pld	[r3, #16*4]
 	pld	[r3]
-	pld [r3, #24*4]
-	pld [r3, #8*4]
+	pld	[r3, #24*4]
+	pld	[r3, #8*4]
 
 	scrypt_core_macro3_x6
 	scrypt_core_macro3_x6
@@ -1164,7 +1177,7 @@ scrypt_core_3way_loop2:
 
 	vswp.u32	d26, d30
 	vstmia	r0, {q8-q15}
 
-	ldr	sp, [sp, #4*16+3*4]
+	ldr	sp, [sp, #4*16+4*4]
 	vpop	{q4-q7}
 	ldmfd	sp!, {r4-r11, pc}
diff --git a/scrypt-x64.S b/scrypt-x64.S
index b9f3358..f9185d4 100644
--- a/scrypt-x64.S
+++ b/scrypt-x64.S
@@ -1,5 +1,5 @@
 /*
- * Copyright 2011-2013 pooler@litecoinpool.org
+ * Copyright 2011-2014 pooler@litecoinpool.org
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
@@ -113,38 +113,38 @@ scrypt_best_throughput_exit:
 
 
 .macro scrypt_shuffle src, so, dest, do
-	movl	\so+60(\src), %r8d
-	movl	\so+44(\src), %r9d
-	movl	\so+28(\src), %r10d
-	movl	\so+12(\src), %r11d
-	movl	%r8d, \do+12(\dest)
-	movl	%r9d, \do+28(\dest)
-	movl	%r10d, \do+44(\dest)
-	movl	%r11d, \do+60(\dest)
-	movl	\so+40(\src), %r8d
-	movl	\so+8(\src), %r9d
-	movl	\so+48(\src), %r10d
-	movl	\so+16(\src), %r11d
-	movl	%r8d, \do+8(\dest)
-	movl	%r9d, \do+40(\dest)
-	movl	%r10d, \do+16(\dest)
-	movl	%r11d, \do+48(\dest)
-	movl	\so+20(\src), %r8d
-	movl	\so+4(\src), %r9d
-	movl	\so+52(\src), %r10d
-	movl	\so+36(\src), %r11d
-	movl	%r8d, \do+4(\dest)
-	movl	%r9d, \do+20(\dest)
-	movl	%r10d, \do+36(\dest)
-	movl	%r11d, \do+52(\dest)
-	movl	\so+0(\src), %r8d
-	movl	\so+24(\src), %r9d
-	movl	\so+32(\src), %r10d
-	movl	\so+56(\src), %r11d
-	movl	%r8d, \do+0(\dest)
-	movl	%r9d, \do+24(\dest)
-	movl	%r10d, \do+32(\dest)
-	movl	%r11d, \do+56(\dest)
+	movl	\so+60(\src), %eax
+	movl	\so+44(\src), %ebx
+	movl	\so+28(\src), %ecx
+	movl	\so+12(\src), %edx
+	movl	%eax, \do+12(\dest)
+	movl	%ebx, \do+28(\dest)
+	movl	%ecx, \do+44(\dest)
+	movl	%edx, \do+60(\dest)
+	movl	\so+40(\src), %eax
+	movl	\so+8(\src), %ebx
+	movl	\so+48(\src), %ecx
+	movl	\so+16(\src), %edx
+	movl	%eax, \do+8(\dest)
+	movl	%ebx, \do+40(\dest)
+	movl	%ecx, \do+16(\dest)
+	movl	%edx, \do+48(\dest)
+	movl	\so+20(\src), %eax
+	movl	\so+4(\src), %ebx
+	movl	\so+52(\src), %ecx
+	movl	\so+36(\src), %edx
+	movl	%eax, \do+4(\dest)
+	movl	%ebx, \do+20(\dest)
+	movl	%ecx, \do+36(\dest)
+	movl	%edx, \do+52(\dest)
+	movl	\so+0(\src), %eax
+	movl	\so+24(\src), %ebx
+	movl	\so+32(\src), %ecx
+	movl	\so+56(\src), %edx
+	movl	%eax, \do+0(\dest)
+	movl	%ebx, \do+24(\dest)
+	movl	%ecx, \do+32(\dest)
+	movl	%edx, \do+56(\dest)
 .endm
 
 
@@ -384,6 +384,8 @@ _scrypt_core:
 	pushq	%rsi
 	movq	%rcx, %rdi
 	movq	%rdx, %rsi
+#else
+	movq	%rdx, %r8
 #endif
 
 .macro scrypt_core_cleanup
@@ -432,7 +434,10 @@ scrypt_core_gen:
 	movdqa	96(%rdi), %xmm14
 	movdqa	112(%rdi), %xmm15
 
-	leaq	131072(%rsi), %rcx
+	movq	%r8, %rcx
+	shlq	$7, %rcx
+	addq	%rsi, %rcx
+	movq	%r8, 96(%rsp)
 	movq	%rdi, 104(%rsp)
 	movq	%rsi, 112(%rsp)
 	movq	%rcx, 120(%rsp)
@@ -481,11 +486,14 @@ scrypt_core_gen_loop1:
 	cmpq	%rcx, %rsi
 	jne scrypt_core_gen_loop1
 
-	movq	$1024, %rcx
+	movq	96(%rsp), %r8
+	movq	%r8, %rcx
+	subl	$1, %r8d
+	movq	%r8, 96(%rsp)
 	movd	%xmm12, %edx
 scrypt_core_gen_loop2:
 	movq	112(%rsp), %rsi
-	andl	$1023, %edx
+	andl	%r8d, %edx
 	shll	$7, %edx
 	addq	%rsi, %rdx
 	movdqa	0(%rdx), %xmm0
@@ -529,6 +537,7 @@ scrypt_core_gen_loop2:
 	movdqa	%xmm14, 32(%rsp)
 	movdqa	%xmm15, 48(%rsp)
 	call salsa8_core_gen
+	movq	96(%rsp), %r8
 	movq	128(%rsp), %rcx
 	addl	0(%rsp), %edx
 	paddd	%xmm0, %xmm12
@@ -691,7 +700,9 @@ scrypt_core_xmm:
 	punpckhqdq	%xmm0, %xmm13
 
 	movq	%rsi, %rdx
-	leaq	131072(%rsi), %rcx
+	movq	%r8, %rcx
+	shlq	$7, %rcx
+	addq	%rsi, %rcx
 scrypt_core_xmm_loop1:
 	pxor	%xmm12, %xmm8
 	pxor	%xmm13, %xmm9
@@ -734,10 +745,11 @@ scrypt_core_xmm_loop1:
 	cmpq	%rcx, %rdx
 	jne scrypt_core_xmm_loop1
 
-	movq	$1024, %rcx
+	movq	%r8, %rcx
+	subl	$1, %r8d
 scrypt_core_xmm_loop2:
 	movd	%xmm12, %edx
-	andl	$1023, %edx
+	andl	%r8d, %edx
 	shll	$7, %edx
 	pxor	0(%rsi, %rdx), %xmm8
 	pxor	16(%rsi, %rdx), %xmm9
@@ -1019,6 +1031,8 @@ _scrypt_core_3way:
 	pushq	%rsi
 	movq	%rcx, %rdi
 	movq	%rdx, %rsi
+#else
+	movq	%rdx, %r8
 #endif
 
 	subq	$392, %rsp
@@ -1088,7 +1102,9 @@ scrypt_core_3way_avx:
 	movdqa	256+112(%rsp), %xmm15
 
 	movq	%rsi, %rbx
-	leaq	3*131072(%rsi), %rax
+	leaq	(%r8, %r8, 2), %rax
+	shlq	$7, %rax
+	addq	%rsi, %rax
 scrypt_core_3way_avx_loop1:
 	movdqa	%xmm0, 64(%rbx)
 	movdqa	%xmm1, 80(%rbx)
@@ -1208,7 +1224,8 @@ scrypt_core_3way_avx_loop1:
 	movdqa	%xmm14, 256+96(%rsp)
 	movdqa	%xmm15, 256+112(%rsp)
 
-	movq	$1024, %rcx
+	movq	%r8, %rcx
+	subq	$1, %r8
 scrypt_core_3way_avx_loop2:
 	movd	%xmm0, %ebp
 	movd	%xmm8, %ebx
@@ -1225,13 +1242,13 @@ scrypt_core_3way_avx_loop2:
 	pxor	256+16(%rsp), %xmm13
 	pxor	256+32(%rsp), %xmm14
 	pxor	256+48(%rsp), %xmm15
-	andl	$1023, %ebp
+	andl	%r8d, %ebp
 	leaq	(%rbp, %rbp, 2), %rbp
 	shll	$7, %ebp
-	andl	$1023, %ebx
+	andl	%r8d, %ebx
 	leaq	1(%rbx, %rbx, 2), %rbx
 	shll	$7, %ebx
-	andl	$1023, %eax
+	andl	%r8d, %eax
 	leaq	2(%rax, %rax, 2), %rax
 	shll	$7, %eax
 	pxor	0(%rsi, %rbp), %xmm0
@@ -1491,7 +1508,9 @@ scrypt_core_3way_xop:
 	movdqa	256+112(%rsp), %xmm15
 
 	movq	%rsi, %rbx
-	leaq	3*131072(%rsi), %rax
+	leaq	(%r8, %r8, 2), %rax
+	shlq	$7, %rax
+	addq	%rsi, %rax
 scrypt_core_3way_xop_loop1:
 	movdqa	%xmm0, 64(%rbx)
 	movdqa	%xmm1, 80(%rbx)
@@ -1611,7 +1630,8 @@ scrypt_core_3way_xop_loop1:
 	movdqa	%xmm14, 256+96(%rsp)
 	movdqa	%xmm15, 256+112(%rsp)
 
-	movq	$1024, %rcx
+	movq	%r8, %rcx
+	subq	$1, %r8
 scrypt_core_3way_xop_loop2:
 	movd	%xmm0, %ebp
 	movd	%xmm8, %ebx
@@ -1628,13 +1648,13 @@ scrypt_core_3way_xop_loop2:
 	pxor	256+16(%rsp), %xmm13
 	pxor	256+32(%rsp), %xmm14
 	pxor	256+48(%rsp), %xmm15
-	andl	$1023, %ebp
+	andl	%r8d, %ebp
 	leaq	(%rbp, %rbp, 2), %rbp
 	shll	$7, %ebp
-	andl	$1023, %ebx
+	andl	%r8d, %ebx
 	leaq	1(%rbx, %rbx, 2), %rbx
 	shll	$7, %ebx
-	andl	$1023, %eax
+	andl	%r8d, %eax
 	leaq	2(%rax, %rax, 2), %rax
 	shll	$7, %eax
 	pxor	0(%rsi, %rbp), %xmm0
@@ -1991,7 +2011,9 @@ scrypt_core_3way_xmm:
 	movdqa	256+112(%rsp), %xmm15
 
 	movq	%rsi, %rbx
-	leaq	3*131072(%rsi), %rax
+	leaq	(%r8, %r8, 2), %rax
+	shlq	$7, %rax
+	addq	%rsi, %rax
 scrypt_core_3way_xmm_loop1:
 	movdqa	%xmm0, 64(%rbx)
 	movdqa	%xmm1, 80(%rbx)
@@ -2111,7 +2133,8 @@ scrypt_core_3way_xmm_loop1:
 	movdqa	%xmm14, 256+96(%rsp)
 	movdqa	%xmm15, 256+112(%rsp)
 
-	movq	$1024, %rcx
+	movq	%r8, %rcx
+	subq	$1, %r8
 scrypt_core_3way_xmm_loop2:
 	movd	%xmm0, %ebp
 	movd	%xmm8, %ebx
@@ -2128,13 +2151,13 @@ scrypt_core_3way_xmm_loop2:
 	pxor	256+16(%rsp), %xmm13
 	pxor	256+32(%rsp), %xmm14
 	pxor	256+48(%rsp), %xmm15
-	andl	$1023, %ebp
+	andl	%r8d, %ebp
 	leaq	(%rbp, %rbp, 2), %rbp
 	shll	$7, %ebp
-	andl	$1023, %ebx
+	andl	%r8d, %ebx
 	leaq	1(%rbx, %rbx, 2), %rbx
 	shll	$7, %ebx
-	andl	$1023, %eax
+	andl	%r8d, %eax
 	leaq	2(%rax, %rax, 2), %rax
 	shll	$7, %eax
 	pxor	0(%rsi, %rbp), %xmm0
@@ -2445,6 +2468,8 @@ _scrypt_core_6way:
 	pushq	%rsi
 	movq	%rcx, %rdi
 	movq	%rdx, %rsi
+#else
+	movq	%rdx, %r8
 #endif
 	movq	%rsp, %rdx
 	subq	$768, %rsp
@@ -2539,7 +2564,9 @@ scrypt_core_6way_avx2:
 	vmovdqa	2*256+7*32(%rsp), %ymm15
 
 	movq	%rsi, %rbx
-	leaq	6*131072(%rsi), %rax
+	leaq	(%r8, %r8, 2), %rax
+	shlq	$8, %rax
+	addq	%rsi, %rax
 scrypt_core_6way_avx2_loop1:
 	vmovdqa	%ymm0, 0*256+4*32(%rbx)
 	vmovdqa	%ymm1, 0*256+5*32(%rbx)
@@ -2659,7 +2686,8 @@ scrypt_core_6way_avx2_loop1:
 	vmovdqa	%ymm14, 2*256+6*32(%rsp)
 	vmovdqa	%ymm15, 2*256+7*32(%rsp)
 
-	movq	$1024, %rcx
+	movq	%r8, %rcx
+	leaq	-1(%r8), %r11
 scrypt_core_6way_avx2_loop2:
 	vmovd	%xmm0, %ebp
 	vmovd	%xmm8, %ebx
@@ -2682,22 +2710,22 @@ scrypt_core_6way_avx2_loop2:
 	vpxor	2*256+1*32(%rsp), %ymm13, %ymm13
 	vpxor	2*256+2*32(%rsp), %ymm14, %ymm14
 	vpxor	2*256+3*32(%rsp), %ymm15, %ymm15
-	andl	$1023, %ebp
+	andl	%r11d, %ebp
 	leaq	0(%rbp, %rbp, 2), %rbp
 	shll	$8, %ebp
-	andl	$1023, %ebx
+	andl	%r11d, %ebx
 	leaq	1(%rbx, %rbx, 2), %rbx
 	shll	$8, %ebx
-	andl	$1023, %eax
+	andl	%r11d, %eax
 	leaq	2(%rax, %rax, 2), %rax
 	shll	$8, %eax
-	andl	$1023, %r8d
+	andl	%r11d, %r8d
 	leaq	0(%r8, %r8, 2), %r8
 	shll	$8, %r8d
-	andl	$1023, %r9d
+	andl	%r11d, %r9d
 	leaq	1(%r9, %r9, 2), %r9
 	shll	$8, %r9d
-	andl	$1023, %r10d
+	andl	%r11d, %r10d
 	leaq	2(%r10, %r10, 2), %r10
 	shll	$8, %r10d
 	vmovdqa	0*32(%rsi, %rbp), %xmm4
diff --git a/scrypt-x86.S b/scrypt-x86.S
index 7e12ceb..5ab7eda 100644
--- a/scrypt-x86.S
+++ b/scrypt-x86.S
@@ -1,5 +1,5 @@
 /*
- * Copyright 2011-2012 pooler@litecoinpool.org
+ * Copyright 2011-2012, 2014 pooler@litecoinpool.org
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
@@ -415,6 +415,7 @@ _scrypt_core:
 scrypt_core_gen:
 	movl	20(%esp), %edi
 	movl	24(%esp), %esi
+	movl	28(%esp), %ecx
 	subl	$72, %esp
 
 .macro scrypt_core_macro1a p, q
@@ -453,7 +454,8 @@ scrypt_core_gen:
 	movl	%eax, \q(%edi)
 .endm
 
-	leal	131072(%esi), %ecx
+	shll	$7, %ecx
+	addl	%esi, %ecx
 scrypt_core_gen_loop1:
 	movl	%esi, 64(%esp)
 	movl	%ecx, 68(%esp)
@@ -522,12 +524,15 @@ scrypt_core_gen_loop1:
 	jne scrypt_core_gen_loop1
 
 	movl	96(%esp), %esi
-	movl	$1024, %ecx
+	movl	100(%esp), %ecx
+	movl	%ecx, %eax
+	subl	$1, %eax
+	movl	%eax, 100(%esp)
 scrypt_core_gen_loop2:
 	movl	%ecx, 68(%esp)
 
 	movl	64(%edi), %edx
-	andl	$1023, %edx
+	andl	100(%esp), %edx
 	shll	$7, %edx
 
 	scrypt_core_macro1b	0, 64
@@ -694,7 +699,9 @@ scrypt_core_sse2:
 	movdqa	112(%esp), %xmm7
 
 	movl	%esi, %edx
-	leal	131072(%esi), %ecx
+	movl	28(%ebp), %ecx
+	shll	$7, %ecx
+	addl	%esi, %ecx
 scrypt_core_sse2_loop1:
 	movdqa	0(%esp), %xmm0
 	movdqa	16(%esp), %xmm1
@@ -748,14 +755,16 @@ scrypt_core_sse2_loop1:
 	movdqa	64(%esp), %xmm4
 	movdqa	80(%esp), %xmm5
 
-	movl	$1024, %ecx
+	movl	28(%ebp), %ecx
+	movl	%ecx, %eax
+	subl	$1, %eax
 scrypt_core_sse2_loop2:
 	movd	%xmm4, %edx
 	movdqa	0(%esp), %xmm0
 	movdqa	16(%esp), %xmm1
 	movdqa	32(%esp), %xmm2
 	movdqa	48(%esp), %xmm3
-	andl	$1023, %edx
+	andl	%eax, %edx
 	shll	$7, %edx
 	pxor	0(%esi, %edx), %xmm0
 	pxor	16(%esi, %edx), %xmm1
diff --git a/scrypt.c b/scrypt.c
index 702551b..f75123e 100644
--- a/scrypt.c
+++ b/scrypt.c
@@ -1,5 +1,5 @@
 /*
- * Copyright 2009 Colin Percival, 2011 ArtForz, 2011-2013 pooler
+ * Copyright 2009 Colin Percival, 2011 ArtForz, 2011-2014 pooler
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
@@ -383,30 +383,30 @@ static inline void PBKDF2_SHA256_128_32_8way(uint32_t *tstate,
 #define SCRYPT_MAX_WAYS 12
 #define HAVE_SCRYPT_3WAY 1
 int scrypt_best_throughput();
-void scrypt_core(uint32_t *X, uint32_t *V);
-void scrypt_core_3way(uint32_t *X, uint32_t *V);
+void scrypt_core(uint32_t *X, uint32_t *V, int N);
+void scrypt_core_3way(uint32_t *X, uint32_t *V, int N);
 #if defined(USE_AVX2)
 #undef SCRYPT_MAX_WAYS
 #define SCRYPT_MAX_WAYS 24
 #define HAVE_SCRYPT_6WAY 1
-void scrypt_core_6way(uint32_t *X, uint32_t *V);
+void scrypt_core_6way(uint32_t *X, uint32_t *V, int N);
 #endif
 
 #elif defined(USE_ASM) && defined(__i386__)
 
 #define SCRYPT_MAX_WAYS 4
 #define scrypt_best_throughput() 1
-void scrypt_core(uint32_t *X, uint32_t *V);
+void scrypt_core(uint32_t *X, uint32_t *V, int N);
 
 #elif defined(USE_ASM) && defined(__arm__) && defined(__APCS_32__)
 
-void scrypt_core(uint32_t *X, uint32_t *V);
+void scrypt_core(uint32_t *X, uint32_t *V, int N);
 #if defined(__ARM_NEON__)
 #undef HAVE_SHA256_4WAY
 #define SCRYPT_MAX_WAYS 3
 #define HAVE_SCRYPT_3WAY 1
 #define scrypt_best_throughput() 3
-void scrypt_core_3way(uint32_t *X, uint32_t *V);
+void scrypt_core_3way(uint32_t *X, uint32_t *V, int N);
 #endif
 
 #else
@@ -479,17 +479,17 @@ static inline void xor_salsa8(uint32_t B[16], const uint32_t Bx[16])
 	B[15] += x15;
 }
 
-static inline void scrypt_core(uint32_t *X, uint32_t *V)
+static inline void scrypt_core(uint32_t *X, uint32_t *V, int N)
 {
 	uint32_t i, j, k;
 
-	for (i = 0; i < 1024; i++) {
+	for (i = 0; i < N; i++) {
 		memcpy(&V[i * 32], X, 128);
 		xor_salsa8(&X[0], &X[16]);
 		xor_salsa8(&X[16], &X[0]);
 	}
-	for (i = 0; i < 1024; i++) {
-		j = 32 * (X[16] & 1023);
+	for (i = 0; i < N; i++) {
+		j = 32 * (X[16] & (N - 1));
 		for (k = 0; k < 32; k++)
 			X[k] ^= V[j + k];
 		xor_salsa8(&X[0], &X[16]);
@@ -504,15 +504,13 @@ static inline void scrypt_core(uint32_t *X, uint32_t *V)
 #define scrypt_best_throughput() 1
 #endif
 
-#define SCRYPT_BUFFER_SIZE (SCRYPT_MAX_WAYS * 131072 + 63)
-
-unsigned char *scrypt_buffer_alloc()
+unsigned char *scrypt_buffer_alloc(int N)
 {
-	return malloc(SCRYPT_BUFFER_SIZE);
+	return malloc((size_t)N * SCRYPT_MAX_WAYS * 128 + 63);
 }
 
 static void scrypt_1024_1_1_256(const uint32_t *input, uint32_t *output,
-	uint32_t *midstate, unsigned char *scratchpad)
+	uint32_t *midstate, unsigned char *scratchpad, int N)
 {
 	uint32_t tstate[8], ostate[8];
 	uint32_t X[32];
@@ -524,14 +522,14 @@ static void scrypt_1024_1_1_256(const uint32_t *input, uint32_t *output,
 	HMAC_SHA256_80_init(input, tstate, ostate);
 	PBKDF2_SHA256_80_128(tstate, ostate, input, X);
 
-	scrypt_core(X, V);
+	scrypt_core(X, V, N);
 
 	PBKDF2_SHA256_128_32(tstate, ostate, X, output);
 }
 
 #ifdef HAVE_SHA256_4WAY
 static void scrypt_1024_1_1_256_4way(const uint32_t *input,
-	uint32_t *output, uint32_t *midstate, unsigned char *scratchpad)
+	uint32_t *output, uint32_t *midstate, unsigned char *scratchpad, int N)
 {
 	uint32_t tstate[4 * 8] __attribute__((aligned(128)));
 	uint32_t ostate[4 * 8] __attribute__((aligned(128)));
@@ -553,10 +551,10 @@ static void scrypt_1024_1_1_256_4way(const uint32_t *input,
 	for (i = 0; i < 32; i++)
 		for (k = 0; k < 4; k++)
 			X[k * 32 + i] = W[4 * i + k];
-	scrypt_core(X + 0 * 32, V);
-	scrypt_core(X + 1 * 32, V);
-	scrypt_core(X + 2 * 32, V);
-	scrypt_core(X + 3 * 32, V);
+	scrypt_core(X + 0 * 32, V, N);
+	scrypt_core(X + 1 * 32, V, N);
+	scrypt_core(X + 2 * 32, V, N);
+	scrypt_core(X + 3 * 32, V, N);
 	for (i = 0; i < 32; i++)
 		for (k = 0; k < 4; k++)
 			W[4 * i + k] = X[k * 32 + i];
@@ -570,7 +568,7 @@ static void scrypt_1024_1_1_256_4way(const uint32_t *input,
 
 #ifdef HAVE_SCRYPT_3WAY
 static void scrypt_1024_1_1_256_3way(const uint32_t *input,
-	uint32_t *output, uint32_t *midstate, unsigned char *scratchpad)
+	uint32_t *output, uint32_t *midstate, unsigned char *scratchpad, int N)
 {
 	uint32_t tstate[3 * 8], ostate[3 * 8];
 	uint32_t X[3 * 32] __attribute__((aligned(64)));
@@ -588,7 +586,7 @@ static void scrypt_1024_1_1_256_3way(const uint32_t *input,
 	PBKDF2_SHA256_80_128(tstate + 8, ostate + 8, input + 20, X + 32);
 	PBKDF2_SHA256_80_128(tstate + 16, ostate + 16, input + 40, X + 64);
 
-	scrypt_core_3way(X, V);
+	scrypt_core_3way(X, V, N);
 
 	PBKDF2_SHA256_128_32(tstate + 0, ostate + 0, X + 0, output + 0);
 	PBKDF2_SHA256_128_32(tstate + 8, ostate + 8, X + 32, output + 8);
@@ -597,7 +595,7 @@ static void scrypt_1024_1_1_256_3way(const uint32_t *input,
 
 #ifdef HAVE_SHA256_4WAY
 static void scrypt_1024_1_1_256_12way(const uint32_t *input,
-	uint32_t *output, uint32_t *midstate, unsigned char *scratchpad)
+	uint32_t *output, uint32_t *midstate, unsigned char *scratchpad, int N)
 {
 	uint32_t tstate[12 * 8] __attribute__((aligned(128)));
 	uint32_t ostate[12 * 8] __attribute__((aligned(128)));
@@ -626,10 +624,10 @@ static void scrypt_1024_1_1_256_12way(const uint32_t *input,
 		for (i = 0; i < 32; i++)
 			for (k = 0; k < 4; k++)
 				X[128 * j + k * 32 + i] = W[128 * j + 4 * i + k];
-	scrypt_core_3way(X + 0 * 96, V);
-	scrypt_core_3way(X + 1 * 96, V);
-	scrypt_core_3way(X + 2 * 96, V);
-	scrypt_core_3way(X + 3 * 96, V);
+	scrypt_core_3way(X + 0 * 96, V, N);
+	scrypt_core_3way(X + 1 * 96, V, N);
+	scrypt_core_3way(X + 2 * 96, V, N);
+	scrypt_core_3way(X + 3 * 96, V, N);
 	for (j = 0; j < 3; j++)
 		for (i = 0; i < 32; i++)
 			for (k = 0; k < 4; k++)
@@ -648,7 +646,7 @@ static void scrypt_1024_1_1_256_12way(const uint32_t *input,
 
 #ifdef HAVE_SCRYPT_6WAY
 static void scrypt_1024_1_1_256_24way(const uint32_t *input,
-	uint32_t *output, uint32_t *midstate, unsigned char *scratchpad)
+	uint32_t *output, uint32_t *midstate, unsigned char *scratchpad, int N)
 {
 	uint32_t tstate[24 * 8] __attribute__((aligned(128)));
 	uint32_t ostate[24 * 8] __attribute__((aligned(128)));
@@ -677,10 +675,10 @@ static void scrypt_1024_1_1_256_24way(const uint32_t *input,
 		for (i = 0; i < 32; i++)
 			for (k = 0; k < 8; k++)
 				X[8 * 32 * j + k * 32 + i] = W[8 * 32 * j + 8 * i + k];
-	scrypt_core_6way(X + 0 * 32, V);
-	scrypt_core_6way(X + 6 * 32, V);
-	scrypt_core_6way(X + 12 * 32, V);
-	scrypt_core_6way(X + 18 * 32, V);
+	scrypt_core_6way(X + 0 * 32, V, N);
+	scrypt_core_6way(X + 6 * 32, V, N);
+	scrypt_core_6way(X + 12 * 32, V, N);
+	scrypt_core_6way(X + 18 * 32, V, N);
 	for (j = 0; j < 3; j++)
 		for (i = 0; i < 32; i++)
 			for (k = 0; k < 8; k++)
@@ -697,7 +695,7 @@ static void scrypt_1024_1_1_256_24way(const uint32_t *input,
 
 int scanhash_scrypt(int thr_id, uint32_t *pdata,
 	unsigned char *scratchbuf, const uint32_t *ptarget,
-	uint32_t max_nonce, unsigned long *hashes_done)
+	uint32_t max_nonce, unsigned long *hashes_done, int N)
 {
 	uint32_t data[SCRYPT_MAX_WAYS * 20], hash[SCRYPT_MAX_WAYS * 8];
 	uint32_t midstate[8];
@@ -723,25 +721,25 @@ int scanhash_scrypt(int thr_id, uint32_t *pdata,
 
 #if defined(HAVE_SHA256_4WAY)
 		if (throughput == 4)
-			scrypt_1024_1_1_256_4way(data, hash, midstate, scratchbuf);
+			scrypt_1024_1_1_256_4way(data, hash, midstate, scratchbuf, N);
 		else
 #endif
 #if defined(HAVE_SCRYPT_3WAY) && defined(HAVE_SHA256_4WAY)
 		if (throughput == 12)
-			scrypt_1024_1_1_256_12way(data, hash, midstate, scratchbuf);
+			scrypt_1024_1_1_256_12way(data, hash, midstate, scratchbuf, N);
 		else
#endif
 #if defined(HAVE_SCRYPT_6WAY)
 		if (throughput == 24)
-			scrypt_1024_1_1_256_24way(data, hash, midstate, scratchbuf);
+			scrypt_1024_1_1_256_24way(data, hash, midstate, scratchbuf, N);
 		else
 #endif
 #if defined(HAVE_SCRYPT_3WAY)
 		if (throughput == 3)
-			scrypt_1024_1_1_256_3way(data, hash, midstate, scratchbuf);
+			scrypt_1024_1_1_256_3way(data, hash, midstate, scratchbuf, N);
 		else
 #endif
-		scrypt_1024_1_1_256(data, hash, midstate, scratchbuf);
+		scrypt_1024_1_1_256(data, hash, midstate, scratchbuf, N);
 
 		for (i = 0; i < throughput; i++) {
 			if (hash[i * 8 + 7] <= Htarg && fulltest(hash + i * 8, ptarget)) {