Add support for scrypt(N, 1, 1)

pooler 2014-05-25 17:21:36 +02:00
parent a988337f52
commit be1b725270
7 changed files with 235 additions and 159 deletions
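The new variant is selected with -a scrypt:N, where N must be a power of two greater than 1; the scratch buffer, the C reference core and all of the assembly cores now take N as a parameter instead of assuming 1024. The main practical cost of raising N is memory: each hashing lane stores N blocks of 128 bytes. A minimal sketch of that sizing, modeled on the new scrypt_buffer_alloc() below (the helper name and the explicit ways parameter are illustrative, not part of this commit):

    #include <stddef.h>

    /* Illustrative only: mirrors scrypt_buffer_alloc(N), which allocates
     * N * SCRYPT_MAX_WAYS * 128 bytes plus 63 bytes of alignment slack. */
    static size_t scrypt_scratch_bytes(int N, int ways)
    {
        return (size_t)N * ways * 128 + 63;
    }
    /* N = 1024 -> 128 KiB per lane; N = 1048576 -> 128 MiB per lane */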


@ -100,7 +100,7 @@ struct workio_cmd {
} u;
};
enum sha256_algos {
enum algos {
ALGO_SCRYPT, /* scrypt(1024,1,1) */
ALGO_SHA256D, /* SHA-256d */
};
@ -128,7 +128,8 @@ static int opt_fail_pause = 30;
int opt_timeout = 0;
static int opt_scantime = 5;
static const bool opt_time = true;
static enum sha256_algos opt_algo = ALGO_SCRYPT;
static enum algos opt_algo = ALGO_SCRYPT;
static int opt_scrypt_n = 1024;
static int opt_n_threads;
static int num_processors;
static char *rpc_url;
@ -170,6 +171,7 @@ Usage: " PROGRAM_NAME " [OPTIONS]\n\
Options:\n\
-a, --algo=ALGO specify the algorithm to use\n\
scrypt scrypt(1024, 1, 1) (default)\n\
scrypt:N scrypt(N, 1, 1)\n\
sha256d SHA-256d\n\
-o, --url=URL URL of mining server\n\
-O, --userpass=U:P username:password pair for mining server\n\
@ -1080,9 +1082,13 @@ static void *miner_thread(void *userdata)
affine_to_cpu(thr_id, thr_id % num_processors);
}
if (opt_algo == ALGO_SCRYPT)
{
scratchbuf = scrypt_buffer_alloc();
if (opt_algo == ALGO_SCRYPT) {
scratchbuf = scrypt_buffer_alloc(opt_scrypt_n);
if (!scratchbuf) {
applog(LOG_ERR, "scrypt buffer allocation failed");
pthread_mutex_lock(&applog_lock);
exit(1);
}
}
while (1) {
@ -1133,8 +1139,16 @@ static void *miner_thread(void *userdata)
max64 = g_work_time + (have_longpoll ? LP_SCANTIME : opt_scantime)
- time(NULL);
max64 *= thr_hashrates[thr_id];
if (max64 <= 0)
max64 = opt_algo == ALGO_SCRYPT ? 0xfffLL : 0x1fffffLL;
if (max64 <= 0) {
switch (opt_algo) {
case ALGO_SCRYPT:
max64 = opt_scrypt_n < 16 ? 0x3ffff : 0x3fffff / opt_scrypt_n;
break;
case ALGO_SHA256D:
max64 = 0x1fffff;
break;
}
}
if (work.data[19] + max64 > end_nonce)
max_nonce = end_nonce;
else
@ -1147,7 +1161,7 @@ static void *miner_thread(void *userdata)
switch (opt_algo) {
case ALGO_SCRYPT:
rc = scanhash_scrypt(thr_id, work.data, scratchbuf, work.target,
max_nonce, &hashes_done);
max_nonce, &hashes_done, opt_scrypt_n);
break;
case ALGO_SHA256D:
@ -1471,10 +1485,21 @@ static void parse_arg(int key, char *arg, char *pname)
switch(key) {
case 'a':
for (i = 0; i < ARRAY_SIZE(algo_names); i++) {
if (algo_names[i] &&
!strcmp(arg, algo_names[i])) {
opt_algo = i;
break;
v = strlen(algo_names[i]);
if (!strncmp(arg, algo_names[i], v)) {
if (arg[v] == '\0') {
opt_algo = i;
break;
}
if (arg[v] == ':' && i == ALGO_SCRYPT) {
char *ep;
v = strtol(arg+v+1, &ep, 10);
if (*ep || v & (v-1) || v < 2)
continue;
opt_algo = i;
opt_scrypt_n = v;
break;
}
}
}
if (i == ARRAY_SIZE(algo_names)) {
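The parser above accepts scrypt:N only when the suffix parses cleanly as a power of two no smaller than 2; the core later indexes the scratchpad with the mask N - 1, so any other value would be unsound. A standalone sketch of the same check (the function name is hypothetical):

    #include <stdlib.h>

    /* Re-statement of the scrypt:N validation done in parse_arg():
     * reject trailing garbage, values below 2 and non-powers-of-two. */
    static int parse_scrypt_n(const char *s, int *n_out)
    {
        char *ep;
        long n = strtol(s, &ep, 10);
        if (*ep != '\0' || n < 2 || (n & (n - 1)) != 0)
            return 0;
        *n_out = (int)n;
        return 1;
    }

The work-size heuristic in miner_thread() scales with N as well: for the default N = 1024, 0x3fffff / 1024 works out to 0xfff, matching the fixed cap used before this change.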


@ -154,10 +154,10 @@ void sha256_transform_8way(uint32_t *state, const uint32_t *block, int swap);
extern int scanhash_sha256d(int thr_id, uint32_t *pdata,
const uint32_t *ptarget, uint32_t max_nonce, unsigned long *hashes_done);
extern unsigned char *scrypt_buffer_alloc();
extern unsigned char *scrypt_buffer_alloc(int N);
extern int scanhash_scrypt(int thr_id, uint32_t *pdata,
unsigned char *scratchbuf, const uint32_t *ptarget,
uint32_t max_nonce, unsigned long *hashes_done);
uint32_t max_nonce, unsigned long *hashes_done, int N);
struct thr_info {
int id;


@ -72,6 +72,9 @@ Possible values are:
.B scrypt
scrypt(1024, 1, 1) (used by Litecoin)
.TP
.B scrypt:\fIN\fR
scrypt(\fIN\fR, 1, 1) (\fIN\fR must be a power of 2 greater than 1)
.TP
.B sha256d
SHA-256d (used by Bitcoin)
.RE
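For example, passing -a scrypt:2048 on the command line selects scrypt(2048, 1, 1); a value of N that is not a power of two, or is smaller than 2, is rejected when the option is parsed.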


@ -1,5 +1,5 @@
/*
* Copyright 2012 pooler@litecoinpool.org
* Copyright 2012, 2014 pooler@litecoinpool.org
*
* This program is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License as published by the Free
@ -472,14 +472,16 @@ scrypt_core:
_scrypt_core:
stmfd sp!, {r4-r11, lr}
mov r12, sp
sub sp, sp, #21*4
sub sp, sp, #22*4
bic sp, sp, #63
str r12, [sp, #20*4]
str r2, [sp, #21*4]
scrypt_shuffle
ldr r2, [sp, #21*4]
str r0, [sp, #16*4]
add r12, r1, #1024*32*4
add r12, r1, r2, lsl #7
str r12, [sp, #18*4]
scrypt_core_loop1:
add lr, r0, #16*4
@ -517,12 +519,14 @@ scrypt_core_loop1:
cmp r1, r12
bne scrypt_core_loop1
ldr r12, [sp, #21*4]
ldr r4, [r0, #16*4]
sub r1, r1, #1024*32*4
sub r2, r12, #1
str r2, [sp, #21*4]
sub r1, r1, r12, lsl #7
str r1, [sp, #17*4]
mov r4, r4, lsl #32-10
mov r12, #1024
add r1, r1, r4, lsr #32-10-7
and r4, r4, r2
add r1, r1, r4, lsl #7
scrypt_core_loop2:
add r2, r0, #16*4
add r3, r1, #16*4
@ -553,9 +557,10 @@ scrypt_core_loop2:
mov r1, sp
ldr r3, [sp, #17*4]
add r0, r0, #16*4
ldr r2, [sp, #21*4]
scrypt_core_macro3_x4
mov r4, r4, lsl #32-10
add r3, r3, r4, lsr #32-10-7
and r4, r4, r2
add r3, r3, r4, lsl #7
str r3, [sp, #19*4]
#ifdef __ARM_ARCH_5E_OR_6_OR_7__
pld [r3, #16*4]
@ -794,10 +799,11 @@ _scrypt_core_3way:
mov r12, sp
sub sp, sp, #24*16
bic sp, sp, #63
str r12, [sp, #4*16+3*4]
str r2, [sp, #4*16+3*4]
str r12, [sp, #4*16+4*4]
mov r2, r0
vldmia r2!, {q8-q15}
mov r3, r0
vldmia r3!, {q8-q15}
vmov.u64 q0, #0xffffffff
vmov.u32 q1, q8
vmov.u32 q2, q12
@ -809,7 +815,7 @@ _scrypt_core_3way:
vbif.u32 q14, q15, q0
vbif.u32 q11, q1, q0
vbif.u32 q15, q2, q0
vldmia r2!, {q0-q7}
vldmia r3!, {q0-q7}
vswp.u32 d17, d21
vswp.u32 d25, d29
vswp.u32 d18, d22
@ -826,7 +832,7 @@ _scrypt_core_3way:
vbif.u32 q6, q7, q8
vbif.u32 q3, q9, q8
vbif.u32 q7, q10, q8
vldmia r2, {q8-q15}
vldmia r3, {q8-q15}
vswp.u32 d1, d5
vswp.u32 d9, d13
vswp.u32 d2, d6
@ -852,7 +858,7 @@ _scrypt_core_3way:
add lr, sp, #128
vldmia lr, {q0-q7}
add r2, r1, #1024*32*4
add r2, r1, r2, lsl #7
str r0, [sp, #4*16+0*4]
str r2, [sp, #4*16+2*4]
scrypt_core_3way_loop1:
@ -863,12 +869,13 @@ scrypt_core_3way_loop1:
scrypt_core_macro1a_x4
scrypt_core_macro1a_x4
scrypt_core_macro1a_x4
ldr r2, [sp, #4*16+3*4]
scrypt_core_macro1a_x4
sub r1, r1, #4*16
add r1, r1, #1024*32*4
add r1, r1, r2, lsl #7
vstmia r1, {q0-q7}
add r3, r1, #1024*32*4
add r3, r1, r2, lsl #7
vstmia r3, {q8-q15}
add lr, sp, #128
@ -957,20 +964,22 @@ scrypt_core_3way_loop1:
cmp r1, r2
bne scrypt_core_3way_loop1
ldr r2, [sp, #4*16+3*4]
add r5, sp, #256+4*16
vstmia r5, {q12-q15}
sub r1, r1, #1024*32*4
sub r1, r1, r2, lsl #7
str r1, [sp, #4*16+1*4]
mov r2, #1024
scrypt_core_3way_loop2:
str r2, [sp, #4*16+2*4]
ldr r0, [sp, #4*16+0*4]
ldr r1, [sp, #4*16+1*4]
ldr r2, [sp, #4*16+3*4]
ldr r4, [r0, #16*4]
mov r4, r4, lsl #32-10
add r1, r1, r4, lsr #32-10-7
sub r2, r2, #1
and r4, r4, r2
add r1, r1, r4, lsl #7
add r2, r0, #16*4
add r3, r1, #16*4
mov r12, sp
@ -980,29 +989,31 @@ scrypt_core_3way_loop2:
scrypt_core_macro1b_x4
ldr r1, [sp, #4*16+1*4]
add r1, r1, #1024*32*4
add r3, r1, #1024*32*4
ldr r2, [sp, #4*16+3*4]
add r1, r1, r2, lsl #7
add r3, r1, r2, lsl #7
sub r2, r2, #1
vmov r6, r7, d8
mov r6, r6, lsl #32-10
add r6, r1, r6, lsr #32-10-7
and r6, r6, r2
add r6, r1, r6, lsl #7
vmov r7, r8, d24
add lr, sp, #128
vldmia lr, {q0-q3}
pld [r6]
pld [r6, #8*4]
pld [r6, #8*4]
pld [r6, #16*4]
pld [r6, #24*4]
pld [r6, #24*4]
vldmia r6, {q8-q15}
mov r7, r7, lsl #32-10
add r7, r3, r7, lsr #32-10-7
and r7, r7, r2
add r7, r3, r7, lsl #7
veor.u32 q8, q8, q0
veor.u32 q9, q9, q1
veor.u32 q10, q10, q2
veor.u32 q11, q11, q3
pld [r7]
pld [r7, #8*4]
pld [r7, #8*4]
pld [r7, #16*4]
pld [r7, #24*4]
pld [r7, #24*4]
veor.u32 q12, q12, q4
veor.u32 q13, q13, q5
veor.u32 q14, q14, q6
@ -1079,15 +1090,17 @@ scrypt_core_3way_loop2:
ldr r0, [sp, #4*16+0*4]
ldr r3, [sp, #4*16+1*4]
ldr r2, [sp, #4*16+3*4]
mov r1, sp
add r0, r0, #16*4
sub r2, r2, #1
scrypt_core_macro3_x4
mov r4, r4, lsl #32-10
add r3, r3, r4, lsr #32-10-7
and r4, r4, r2
add r3, r3, r4, lsl #7
pld [r3, #16*4]
pld [r3]
pld [r3, #24*4]
pld [r3, #8*4]
pld [r3, #24*4]
pld [r3, #8*4]
scrypt_core_macro3_x6
scrypt_core_macro3_x6
@ -1164,7 +1177,7 @@ scrypt_core_3way_loop2:
vswp.u32 d26, d30
vstmia r0, {q8-q15}
ldr sp, [sp, #4*16+3*4]
ldr sp, [sp, #4*16+4*4]
vpop {q4-q7}
ldmfd sp!, {r4-r11, pc}
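As in the other backends touched by this commit, the hard-coded 1024 disappears from the ARM code: the loop bound and the scratchpad end pointer are derived from the N argument (kept in r2 and spilled to the stack), and the random read in the second loop masks the nonce-derived word with N - 1 instead of shifting out ten bits. A C model of that lookup, illustrative only since the real code keeps everything in registers:

    #include <stdint.h>
    #include <stddef.h>

    /* Mirrors the "and r4, r4, r2 / add r1, r1, r4, lsl #7" sequence:
     * each stored block is 32 words (128 bytes), and N is a power of
     * two, so N - 1 is a valid index mask. */
    static uint32_t *select_block(uint32_t *V, const uint32_t *X, uint32_t N)
    {
        return V + (size_t)(X[16] & (N - 1)) * 32;
    }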


@ -1,5 +1,5 @@
/*
* Copyright 2011-2013 pooler@litecoinpool.org
* Copyright 2011-2014 pooler@litecoinpool.org
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
@ -113,38 +113,38 @@ scrypt_best_throughput_exit:
.macro scrypt_shuffle src, so, dest, do
movl \so+60(\src), %r8d
movl \so+44(\src), %r9d
movl \so+28(\src), %r10d
movl \so+12(\src), %r11d
movl %r8d, \do+12(\dest)
movl %r9d, \do+28(\dest)
movl %r10d, \do+44(\dest)
movl %r11d, \do+60(\dest)
movl \so+40(\src), %r8d
movl \so+8(\src), %r9d
movl \so+48(\src), %r10d
movl \so+16(\src), %r11d
movl %r8d, \do+8(\dest)
movl %r9d, \do+40(\dest)
movl %r10d, \do+16(\dest)
movl %r11d, \do+48(\dest)
movl \so+20(\src), %r8d
movl \so+4(\src), %r9d
movl \so+52(\src), %r10d
movl \so+36(\src), %r11d
movl %r8d, \do+4(\dest)
movl %r9d, \do+20(\dest)
movl %r10d, \do+36(\dest)
movl %r11d, \do+52(\dest)
movl \so+0(\src), %r8d
movl \so+24(\src), %r9d
movl \so+32(\src), %r10d
movl \so+56(\src), %r11d
movl %r8d, \do+0(\dest)
movl %r9d, \do+24(\dest)
movl %r10d, \do+32(\dest)
movl %r11d, \do+56(\dest)
movl \so+60(\src), %eax
movl \so+44(\src), %ebx
movl \so+28(\src), %ecx
movl \so+12(\src), %edx
movl %eax, \do+12(\dest)
movl %ebx, \do+28(\dest)
movl %ecx, \do+44(\dest)
movl %edx, \do+60(\dest)
movl \so+40(\src), %eax
movl \so+8(\src), %ebx
movl \so+48(\src), %ecx
movl \so+16(\src), %edx
movl %eax, \do+8(\dest)
movl %ebx, \do+40(\dest)
movl %ecx, \do+16(\dest)
movl %edx, \do+48(\dest)
movl \so+20(\src), %eax
movl \so+4(\src), %ebx
movl \so+52(\src), %ecx
movl \so+36(\src), %edx
movl %eax, \do+4(\dest)
movl %ebx, \do+20(\dest)
movl %ecx, \do+36(\dest)
movl %edx, \do+52(\dest)
movl \so+0(\src), %eax
movl \so+24(\src), %ebx
movl \so+32(\src), %ecx
movl \so+56(\src), %edx
movl %eax, \do+0(\dest)
movl %ebx, \do+24(\dest)
movl %ecx, \do+32(\dest)
movl %edx, \do+56(\dest)
.endm
@ -384,6 +384,8 @@ _scrypt_core:
pushq %rsi
movq %rcx, %rdi
movq %rdx, %rsi
#else
movq %rdx, %r8
#endif
.macro scrypt_core_cleanup
@ -432,7 +434,10 @@ scrypt_core_gen:
movdqa 96(%rdi), %xmm14
movdqa 112(%rdi), %xmm15
leaq 131072(%rsi), %rcx
movq %r8, %rcx
shlq $7, %rcx
addq %rsi, %rcx
movq %r8, 96(%rsp)
movq %rdi, 104(%rsp)
movq %rsi, 112(%rsp)
movq %rcx, 120(%rsp)
@ -481,11 +486,14 @@ scrypt_core_gen_loop1:
cmpq %rcx, %rsi
jne scrypt_core_gen_loop1
movq $1024, %rcx
movq 96(%rsp), %r8
movq %r8, %rcx
subl $1, %r8d
movq %r8, 96(%rsp)
movd %xmm12, %edx
scrypt_core_gen_loop2:
movq 112(%rsp), %rsi
andl $1023, %edx
andl %r8d, %edx
shll $7, %edx
addq %rsi, %rdx
movdqa 0(%rdx), %xmm0
@ -529,6 +537,7 @@ scrypt_core_gen_loop2:
movdqa %xmm14, 32(%rsp)
movdqa %xmm15, 48(%rsp)
call salsa8_core_gen
movq 96(%rsp), %r8
movq 128(%rsp), %rcx
addl 0(%rsp), %edx
paddd %xmm0, %xmm12
@ -691,7 +700,9 @@ scrypt_core_xmm:
punpckhqdq %xmm0, %xmm13
movq %rsi, %rdx
leaq 131072(%rsi), %rcx
movq %r8, %rcx
shlq $7, %rcx
addq %rsi, %rcx
scrypt_core_xmm_loop1:
pxor %xmm12, %xmm8
pxor %xmm13, %xmm9
@ -734,10 +745,11 @@ scrypt_core_xmm_loop1:
cmpq %rcx, %rdx
jne scrypt_core_xmm_loop1
movq $1024, %rcx
movq %r8, %rcx
subl $1, %r8d
scrypt_core_xmm_loop2:
movd %xmm12, %edx
andl $1023, %edx
andl %r8d, %edx
shll $7, %edx
pxor 0(%rsi, %rdx), %xmm8
pxor 16(%rsi, %rdx), %xmm9
@ -1019,6 +1031,8 @@ _scrypt_core_3way:
pushq %rsi
movq %rcx, %rdi
movq %rdx, %rsi
#else
movq %rdx, %r8
#endif
subq $392, %rsp
@ -1088,7 +1102,9 @@ scrypt_core_3way_avx:
movdqa 256+112(%rsp), %xmm15
movq %rsi, %rbx
leaq 3*131072(%rsi), %rax
leaq (%r8, %r8, 2), %rax
shlq $7, %rax
addq %rsi, %rax
scrypt_core_3way_avx_loop1:
movdqa %xmm0, 64(%rbx)
movdqa %xmm1, 80(%rbx)
@ -1208,7 +1224,8 @@ scrypt_core_3way_avx_loop1:
movdqa %xmm14, 256+96(%rsp)
movdqa %xmm15, 256+112(%rsp)
movq $1024, %rcx
movq %r8, %rcx
subq $1, %r8
scrypt_core_3way_avx_loop2:
movd %xmm0, %ebp
movd %xmm8, %ebx
@ -1225,13 +1242,13 @@ scrypt_core_3way_avx_loop2:
pxor 256+16(%rsp), %xmm13
pxor 256+32(%rsp), %xmm14
pxor 256+48(%rsp), %xmm15
andl $1023, %ebp
andl %r8d, %ebp
leaq (%rbp, %rbp, 2), %rbp
shll $7, %ebp
andl $1023, %ebx
andl %r8d, %ebx
leaq 1(%rbx, %rbx, 2), %rbx
shll $7, %ebx
andl $1023, %eax
andl %r8d, %eax
leaq 2(%rax, %rax, 2), %rax
shll $7, %eax
pxor 0(%rsi, %rbp), %xmm0
@ -1491,7 +1508,9 @@ scrypt_core_3way_xop:
movdqa 256+112(%rsp), %xmm15
movq %rsi, %rbx
leaq 3*131072(%rsi), %rax
leaq (%r8, %r8, 2), %rax
shlq $7, %rax
addq %rsi, %rax
scrypt_core_3way_xop_loop1:
movdqa %xmm0, 64(%rbx)
movdqa %xmm1, 80(%rbx)
@ -1611,7 +1630,8 @@ scrypt_core_3way_xop_loop1:
movdqa %xmm14, 256+96(%rsp)
movdqa %xmm15, 256+112(%rsp)
movq $1024, %rcx
movq %r8, %rcx
subq $1, %r8
scrypt_core_3way_xop_loop2:
movd %xmm0, %ebp
movd %xmm8, %ebx
@ -1628,13 +1648,13 @@ scrypt_core_3way_xop_loop2:
pxor 256+16(%rsp), %xmm13
pxor 256+32(%rsp), %xmm14
pxor 256+48(%rsp), %xmm15
andl $1023, %ebp
andl %r8d, %ebp
leaq (%rbp, %rbp, 2), %rbp
shll $7, %ebp
andl $1023, %ebx
andl %r8d, %ebx
leaq 1(%rbx, %rbx, 2), %rbx
shll $7, %ebx
andl $1023, %eax
andl %r8d, %eax
leaq 2(%rax, %rax, 2), %rax
shll $7, %eax
pxor 0(%rsi, %rbp), %xmm0
@ -1991,7 +2011,9 @@ scrypt_core_3way_xmm:
movdqa 256+112(%rsp), %xmm15
movq %rsi, %rbx
leaq 3*131072(%rsi), %rax
leaq (%r8, %r8, 2), %rax
shlq $7, %rax
addq %rsi, %rax
scrypt_core_3way_xmm_loop1:
movdqa %xmm0, 64(%rbx)
movdqa %xmm1, 80(%rbx)
@ -2111,7 +2133,8 @@ scrypt_core_3way_xmm_loop1:
movdqa %xmm14, 256+96(%rsp)
movdqa %xmm15, 256+112(%rsp)
movq $1024, %rcx
movq %r8, %rcx
subq $1, %r8
scrypt_core_3way_xmm_loop2:
movd %xmm0, %ebp
movd %xmm8, %ebx
@ -2128,13 +2151,13 @@ scrypt_core_3way_xmm_loop2:
pxor 256+16(%rsp), %xmm13
pxor 256+32(%rsp), %xmm14
pxor 256+48(%rsp), %xmm15
andl $1023, %ebp
andl %r8d, %ebp
leaq (%rbp, %rbp, 2), %rbp
shll $7, %ebp
andl $1023, %ebx
andl %r8d, %ebx
leaq 1(%rbx, %rbx, 2), %rbx
shll $7, %ebx
andl $1023, %eax
andl %r8d, %eax
leaq 2(%rax, %rax, 2), %rax
shll $7, %eax
pxor 0(%rsi, %rbp), %xmm0
@ -2445,6 +2468,8 @@ _scrypt_core_6way:
pushq %rsi
movq %rcx, %rdi
movq %rdx, %rsi
#else
movq %rdx, %r8
#endif
movq %rsp, %rdx
subq $768, %rsp
@ -2539,7 +2564,9 @@ scrypt_core_6way_avx2:
vmovdqa 2*256+7*32(%rsp), %ymm15
movq %rsi, %rbx
leaq 6*131072(%rsi), %rax
leaq (%r8, %r8, 2), %rax
shlq $8, %rax
addq %rsi, %rax
scrypt_core_6way_avx2_loop1:
vmovdqa %ymm0, 0*256+4*32(%rbx)
vmovdqa %ymm1, 0*256+5*32(%rbx)
@ -2659,7 +2686,8 @@ scrypt_core_6way_avx2_loop1:
vmovdqa %ymm14, 2*256+6*32(%rsp)
vmovdqa %ymm15, 2*256+7*32(%rsp)
movq $1024, %rcx
movq %r8, %rcx
leaq -1(%r8), %r11
scrypt_core_6way_avx2_loop2:
vmovd %xmm0, %ebp
vmovd %xmm8, %ebx
@ -2682,22 +2710,22 @@ scrypt_core_6way_avx2_loop2:
vpxor 2*256+1*32(%rsp), %ymm13, %ymm13
vpxor 2*256+2*32(%rsp), %ymm14, %ymm14
vpxor 2*256+3*32(%rsp), %ymm15, %ymm15
andl $1023, %ebp
andl %r11d, %ebp
leaq 0(%rbp, %rbp, 2), %rbp
shll $8, %ebp
andl $1023, %ebx
andl %r11d, %ebx
leaq 1(%rbx, %rbx, 2), %rbx
shll $8, %ebx
andl $1023, %eax
andl %r11d, %eax
leaq 2(%rax, %rax, 2), %rax
shll $8, %eax
andl $1023, %r8d
andl %r11d, %r8d
leaq 0(%r8, %r8, 2), %r8
shll $8, %r8d
andl $1023, %r9d
andl %r11d, %r9d
leaq 1(%r9, %r9, 2), %r9
shll $8, %r9d
andl $1023, %r10d
andl %r11d, %r10d
leaq 2(%r10, %r10, 2), %r10
shll $8, %r10d
vmovdqa 0*32(%rsi, %rbp), %xmm4
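The x86-64 cores follow the same pattern: N arrives as the third argument and is normalized into r8 (the new #else branch covers the convention where it comes in via rdx), loop counters are loaded from r8 instead of the constant 1024, and the second-loop indices are masked with N - 1. The AVX2 6-way core copies N - 1 into r11 first, since r8 itself is reused there for per-lane offsets.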


@ -1,5 +1,5 @@
/*
* Copyright 2011-2012 pooler@litecoinpool.org
* Copyright 2011-2012, 2014 pooler@litecoinpool.org
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
@ -415,6 +415,7 @@ _scrypt_core:
scrypt_core_gen:
movl 20(%esp), %edi
movl 24(%esp), %esi
movl 28(%esp), %ecx
subl $72, %esp
.macro scrypt_core_macro1a p, q
@ -453,7 +454,8 @@ scrypt_core_gen:
movl %eax, \q(%edi)
.endm
leal 131072(%esi), %ecx
shll $7, %ecx
addl %esi, %ecx
scrypt_core_gen_loop1:
movl %esi, 64(%esp)
movl %ecx, 68(%esp)
@ -522,12 +524,15 @@ scrypt_core_gen_loop1:
jne scrypt_core_gen_loop1
movl 96(%esp), %esi
movl $1024, %ecx
movl 100(%esp), %ecx
movl %ecx, %eax
subl $1, %eax
movl %eax, 100(%esp)
scrypt_core_gen_loop2:
movl %ecx, 68(%esp)
movl 64(%edi), %edx
andl $1023, %edx
andl 100(%esp), %edx
shll $7, %edx
scrypt_core_macro1b 0, 64
@ -694,7 +699,9 @@ scrypt_core_sse2:
movdqa 112(%esp), %xmm7
movl %esi, %edx
leal 131072(%esi), %ecx
movl 28(%ebp), %ecx
shll $7, %ecx
addl %esi, %ecx
scrypt_core_sse2_loop1:
movdqa 0(%esp), %xmm0
movdqa 16(%esp), %xmm1
@ -748,14 +755,16 @@ scrypt_core_sse2_loop1:
movdqa 64(%esp), %xmm4
movdqa 80(%esp), %xmm5
movl $1024, %ecx
movl 28(%ebp), %ecx
movl %ecx, %eax
subl $1, %eax
scrypt_core_sse2_loop2:
movd %xmm4, %edx
movdqa 0(%esp), %xmm0
movdqa 16(%esp), %xmm1
movdqa 32(%esp), %xmm2
movdqa 48(%esp), %xmm3
andl $1023, %edx
andl %eax, %edx
shll $7, %edx
pxor 0(%esi, %edx), %xmm0
pxor 16(%esi, %edx), %xmm1
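The 32-bit x86 backend reads N from its stack argument instead: the generic path loads 28(%esp) into ecx for the first loop and keeps N - 1 at 100(%esp) as the second-loop mask, while the SSE2 path reloads 28(%ebp) and keeps N - 1 in eax.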


@ -1,5 +1,5 @@
/*
* Copyright 2009 Colin Percival, 2011 ArtForz, 2011-2013 pooler
* Copyright 2009 Colin Percival, 2011 ArtForz, 2011-2014 pooler
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
@ -383,30 +383,30 @@ static inline void PBKDF2_SHA256_128_32_8way(uint32_t *tstate,
#define SCRYPT_MAX_WAYS 12
#define HAVE_SCRYPT_3WAY 1
int scrypt_best_throughput();
void scrypt_core(uint32_t *X, uint32_t *V);
void scrypt_core_3way(uint32_t *X, uint32_t *V);
void scrypt_core(uint32_t *X, uint32_t *V, int N);
void scrypt_core_3way(uint32_t *X, uint32_t *V, int N);
#if defined(USE_AVX2)
#undef SCRYPT_MAX_WAYS
#define SCRYPT_MAX_WAYS 24
#define HAVE_SCRYPT_6WAY 1
void scrypt_core_6way(uint32_t *X, uint32_t *V);
void scrypt_core_6way(uint32_t *X, uint32_t *V, int N);
#endif
#elif defined(USE_ASM) && defined(__i386__)
#define SCRYPT_MAX_WAYS 4
#define scrypt_best_throughput() 1
void scrypt_core(uint32_t *X, uint32_t *V);
void scrypt_core(uint32_t *X, uint32_t *V, int N);
#elif defined(USE_ASM) && defined(__arm__) && defined(__APCS_32__)
void scrypt_core(uint32_t *X, uint32_t *V);
void scrypt_core(uint32_t *X, uint32_t *V, int N);
#if defined(__ARM_NEON__)
#undef HAVE_SHA256_4WAY
#define SCRYPT_MAX_WAYS 3
#define HAVE_SCRYPT_3WAY 1
#define scrypt_best_throughput() 3
void scrypt_core_3way(uint32_t *X, uint32_t *V);
void scrypt_core_3way(uint32_t *X, uint32_t *V, int N);
#endif
#else
@ -479,17 +479,17 @@ static inline void xor_salsa8(uint32_t B[16], const uint32_t Bx[16])
B[15] += x15;
}
static inline void scrypt_core(uint32_t *X, uint32_t *V)
static inline void scrypt_core(uint32_t *X, uint32_t *V, int N)
{
uint32_t i, j, k;
for (i = 0; i < 1024; i++) {
for (i = 0; i < N; i++) {
memcpy(&V[i * 32], X, 128);
xor_salsa8(&X[0], &X[16]);
xor_salsa8(&X[16], &X[0]);
}
for (i = 0; i < 1024; i++) {
j = 32 * (X[16] & 1023);
for (i = 0; i < N; i++) {
j = 32 * (X[16] & (N - 1));
for (k = 0; k < 32; k++)
X[k] ^= V[j + k];
xor_salsa8(&X[0], &X[16]);
@ -504,15 +504,13 @@ static inline void scrypt_core(uint32_t *X, uint32_t *V)
#define scrypt_best_throughput() 1
#endif
#define SCRYPT_BUFFER_SIZE (SCRYPT_MAX_WAYS * 131072 + 63)
unsigned char *scrypt_buffer_alloc()
unsigned char *scrypt_buffer_alloc(int N)
{
return malloc(SCRYPT_BUFFER_SIZE);
return malloc((size_t)N * SCRYPT_MAX_WAYS * 128 + 63);
}
static void scrypt_1024_1_1_256(const uint32_t *input, uint32_t *output,
uint32_t *midstate, unsigned char *scratchpad)
uint32_t *midstate, unsigned char *scratchpad, int N)
{
uint32_t tstate[8], ostate[8];
uint32_t X[32];
@ -524,14 +522,14 @@ static void scrypt_1024_1_1_256(const uint32_t *input, uint32_t *output,
HMAC_SHA256_80_init(input, tstate, ostate);
PBKDF2_SHA256_80_128(tstate, ostate, input, X);
scrypt_core(X, V);
scrypt_core(X, V, N);
PBKDF2_SHA256_128_32(tstate, ostate, X, output);
}
#ifdef HAVE_SHA256_4WAY
static void scrypt_1024_1_1_256_4way(const uint32_t *input,
uint32_t *output, uint32_t *midstate, unsigned char *scratchpad)
uint32_t *output, uint32_t *midstate, unsigned char *scratchpad, int N)
{
uint32_t tstate[4 * 8] __attribute__((aligned(128)));
uint32_t ostate[4 * 8] __attribute__((aligned(128)));
@ -553,10 +551,10 @@ static void scrypt_1024_1_1_256_4way(const uint32_t *input,
for (i = 0; i < 32; i++)
for (k = 0; k < 4; k++)
X[k * 32 + i] = W[4 * i + k];
scrypt_core(X + 0 * 32, V);
scrypt_core(X + 1 * 32, V);
scrypt_core(X + 2 * 32, V);
scrypt_core(X + 3 * 32, V);
scrypt_core(X + 0 * 32, V, N);
scrypt_core(X + 1 * 32, V, N);
scrypt_core(X + 2 * 32, V, N);
scrypt_core(X + 3 * 32, V, N);
for (i = 0; i < 32; i++)
for (k = 0; k < 4; k++)
W[4 * i + k] = X[k * 32 + i];
@ -570,7 +568,7 @@ static void scrypt_1024_1_1_256_4way(const uint32_t *input,
#ifdef HAVE_SCRYPT_3WAY
static void scrypt_1024_1_1_256_3way(const uint32_t *input,
uint32_t *output, uint32_t *midstate, unsigned char *scratchpad)
uint32_t *output, uint32_t *midstate, unsigned char *scratchpad, int N)
{
uint32_t tstate[3 * 8], ostate[3 * 8];
uint32_t X[3 * 32] __attribute__((aligned(64)));
@ -588,7 +586,7 @@ static void scrypt_1024_1_1_256_3way(const uint32_t *input,
PBKDF2_SHA256_80_128(tstate + 8, ostate + 8, input + 20, X + 32);
PBKDF2_SHA256_80_128(tstate + 16, ostate + 16, input + 40, X + 64);
scrypt_core_3way(X, V);
scrypt_core_3way(X, V, N);
PBKDF2_SHA256_128_32(tstate + 0, ostate + 0, X + 0, output + 0);
PBKDF2_SHA256_128_32(tstate + 8, ostate + 8, X + 32, output + 8);
@ -597,7 +595,7 @@ static void scrypt_1024_1_1_256_3way(const uint32_t *input,
#ifdef HAVE_SHA256_4WAY
static void scrypt_1024_1_1_256_12way(const uint32_t *input,
uint32_t *output, uint32_t *midstate, unsigned char *scratchpad)
uint32_t *output, uint32_t *midstate, unsigned char *scratchpad, int N)
{
uint32_t tstate[12 * 8] __attribute__((aligned(128)));
uint32_t ostate[12 * 8] __attribute__((aligned(128)));
@ -626,10 +624,10 @@ static void scrypt_1024_1_1_256_12way(const uint32_t *input,
for (i = 0; i < 32; i++)
for (k = 0; k < 4; k++)
X[128 * j + k * 32 + i] = W[128 * j + 4 * i + k];
scrypt_core_3way(X + 0 * 96, V);
scrypt_core_3way(X + 1 * 96, V);
scrypt_core_3way(X + 2 * 96, V);
scrypt_core_3way(X + 3 * 96, V);
scrypt_core_3way(X + 0 * 96, V, N);
scrypt_core_3way(X + 1 * 96, V, N);
scrypt_core_3way(X + 2 * 96, V, N);
scrypt_core_3way(X + 3 * 96, V, N);
for (j = 0; j < 3; j++)
for (i = 0; i < 32; i++)
for (k = 0; k < 4; k++)
@ -648,7 +646,7 @@ static void scrypt_1024_1_1_256_12way(const uint32_t *input,
#ifdef HAVE_SCRYPT_6WAY
static void scrypt_1024_1_1_256_24way(const uint32_t *input,
uint32_t *output, uint32_t *midstate, unsigned char *scratchpad)
uint32_t *output, uint32_t *midstate, unsigned char *scratchpad, int N)
{
uint32_t tstate[24 * 8] __attribute__((aligned(128)));
uint32_t ostate[24 * 8] __attribute__((aligned(128)));
@ -677,10 +675,10 @@ static void scrypt_1024_1_1_256_24way(const uint32_t *input,
for (i = 0; i < 32; i++)
for (k = 0; k < 8; k++)
X[8 * 32 * j + k * 32 + i] = W[8 * 32 * j + 8 * i + k];
scrypt_core_6way(X + 0 * 32, V);
scrypt_core_6way(X + 6 * 32, V);
scrypt_core_6way(X + 12 * 32, V);
scrypt_core_6way(X + 18 * 32, V);
scrypt_core_6way(X + 0 * 32, V, N);
scrypt_core_6way(X + 6 * 32, V, N);
scrypt_core_6way(X + 12 * 32, V, N);
scrypt_core_6way(X + 18 * 32, V, N);
for (j = 0; j < 3; j++)
for (i = 0; i < 32; i++)
for (k = 0; k < 8; k++)
@ -697,7 +695,7 @@ static void scrypt_1024_1_1_256_24way(const uint32_t *input,
int scanhash_scrypt(int thr_id, uint32_t *pdata,
unsigned char *scratchbuf, const uint32_t *ptarget,
uint32_t max_nonce, unsigned long *hashes_done)
uint32_t max_nonce, unsigned long *hashes_done, int N)
{
uint32_t data[SCRYPT_MAX_WAYS * 20], hash[SCRYPT_MAX_WAYS * 8];
uint32_t midstate[8];
@ -723,25 +721,25 @@ int scanhash_scrypt(int thr_id, uint32_t *pdata,
#if defined(HAVE_SHA256_4WAY)
if (throughput == 4)
scrypt_1024_1_1_256_4way(data, hash, midstate, scratchbuf);
scrypt_1024_1_1_256_4way(data, hash, midstate, scratchbuf, N);
else
#endif
#if defined(HAVE_SCRYPT_3WAY) && defined(HAVE_SHA256_4WAY)
if (throughput == 12)
scrypt_1024_1_1_256_12way(data, hash, midstate, scratchbuf);
scrypt_1024_1_1_256_12way(data, hash, midstate, scratchbuf, N);
else
#endif
#if defined(HAVE_SCRYPT_6WAY)
if (throughput == 24)
scrypt_1024_1_1_256_24way(data, hash, midstate, scratchbuf);
scrypt_1024_1_1_256_24way(data, hash, midstate, scratchbuf, N);
else
#endif
#if defined(HAVE_SCRYPT_3WAY)
if (throughput == 3)
scrypt_1024_1_1_256_3way(data, hash, midstate, scratchbuf);
scrypt_1024_1_1_256_3way(data, hash, midstate, scratchbuf, N);
else
#endif
scrypt_1024_1_1_256(data, hash, midstate, scratchbuf);
scrypt_1024_1_1_256(data, hash, midstate, scratchbuf, N);
for (i = 0; i < throughput; i++) {
if (hash[i * 8 + 7] <= Htarg && fulltest(hash + i * 8, ptarget)) {