Some more optimization

This commit is contained in:
pooler 2011-12-24 12:22:06 +01:00
parent 963efb9546
commit 36225b4206
6 changed files with 974 additions and 284 deletions

View file

@ -551,7 +551,7 @@ static void *miner_thread(void *userdata)
if (opt_algo == ALGO_SCRYPT) if (opt_algo == ALGO_SCRYPT)
{ {
scratchbuf = malloc(131583); scratchbuf = scrypt_buffer_alloc();
max_nonce = 0xffff; max_nonce = 0xffff;
} }
@ -955,7 +955,7 @@ int main(int argc, char *argv[])
} }
applog(LOG_INFO, "%d miner threads started, " applog(LOG_INFO, "%d miner threads started, "
"using SHA256 '%s' algorithm.", "using '%s' algorithm.",
opt_n_threads, opt_n_threads,
algo_names[opt_algo]); algo_names[opt_algo]);

View file

@ -127,6 +127,7 @@ extern json_t *json_rpc_call(CURL *curl, const char *url, const char *userpass,
extern char *bin2hex(const unsigned char *p, size_t len); extern char *bin2hex(const unsigned char *p, size_t len);
extern bool hex2bin(unsigned char *p, const char *hexstr, size_t len); extern bool hex2bin(unsigned char *p, const char *hexstr, size_t len);
/*
 * Returns the scratch buffer used by the scrypt miner thread; miner_thread()
 * assigns the result to its scratchbuf in place of the former fixed-size
 * malloc().
 * NOTE(review): size, alignment and ownership (who frees it, and whether NULL
 * can be returned) are decided by the definition, which is not visible here.
 *
 * Declared with (void) so that this is a true prototype: in C an empty
 * parameter list leaves the argument list unchecked.
 */
extern unsigned char *scrypt_buffer_alloc(void);
extern int scanhash_scrypt(int, unsigned char *pdata, unsigned char *scratchbuf, extern int scanhash_scrypt(int, unsigned char *pdata, unsigned char *scratchbuf,
const unsigned char *ptarget, const unsigned char *ptarget,
uint32_t max_nonce, unsigned long *nHashesDone); uint32_t max_nonce, unsigned long *nHashesDone);

View file

@ -24,7 +24,7 @@
#if defined(__x86_64__) #if defined(__x86_64__)
.macro x64_gen_salsa8_core_doubleround .macro gen_salsa8_core_doubleround
movq 72(%rsp), %r15 movq 72(%rsp), %r15
leaq (%r14, %rdx), %rbp leaq (%r14, %rdx), %rbp
roll $7, %ebp roll $7, %ebp
@ -137,7 +137,7 @@
.text .text
.align 32 .align 32
x64_gen_salsa8_core: gen_salsa8_core:
# 0: %rdx, %rdi, %rcx, %rsi # 0: %rdx, %rdi, %rcx, %rsi
movq 8(%rsp), %rdi movq 8(%rsp), %rdi
movq %rdi, %rdx movq %rdi, %rdx
@ -170,52 +170,52 @@ x64_gen_salsa8_core:
shrq $32, %r15 shrq $32, %r15
movq %r15, 88(%rsp) movq %r15, 88(%rsp)
x64_gen_salsa8_core_doubleround gen_salsa8_core_doubleround
x64_gen_salsa8_core_doubleround gen_salsa8_core_doubleround
x64_gen_salsa8_core_doubleround gen_salsa8_core_doubleround
x64_gen_salsa8_core_doubleround gen_salsa8_core_doubleround
movl %edx, %edx movl %edx, %edx
shlq $32, %rdi shlq $32, %rdi
addq %rdi, %rdx addq %rdi, %rdx
movq %rdx, %xmm0 movd %rdx, %xmm0
movl %ecx, %ecx movl %ecx, %ecx
shlq $32, %rsi shlq $32, %rsi
addq %rsi, %rcx addq %rsi, %rcx
movq %rcx, %xmm4 movd %rcx, %xmm4
movq 72(%rsp), %rdi movq 72(%rsp), %rdi
movl %r9d, %r9d movl %r9d, %r9d
shlq $32, %rdi shlq $32, %rdi
addq %rdi, %r9 addq %rdi, %r9
movq %r9, %xmm1 movd %r9, %xmm1
movl %eax, %eax movl %eax, %eax
shlq $32, %r8 shlq $32, %r8
addq %r8, %rax addq %r8, %rax
movq %rax, %xmm5 movd %rax, %xmm5
movl %r11d, %r11d movl %r11d, %r11d
shlq $32, %r10 shlq $32, %r10
addq %r10, %r11 addq %r10, %r11
movq %r11, %xmm2 movd %r11, %xmm2
movl 48(%rsp), %r8d movl 48(%rsp), %r8d
shlq $32, %r12 shlq $32, %r12
addq %r12, %r8 addq %r12, %r8
movq %r8, %xmm6 movd %r8, %xmm6
movl %r14d, %r14d movl %r14d, %r14d
shlq $32, %r13 shlq $32, %r13
addq %r13, %r14 addq %r13, %r14
movq %r14, %xmm3 movd %r14, %xmm3
movq 88(%rsp), %rdi movq 88(%rsp), %rdi
movl %ebx, %ebx movl %ebx, %ebx
shlq $32, %rdi shlq $32, %rdi
addq %rdi, %rbx addq %rdi, %rbx
movq %rbx, %xmm7 movd %rbx, %xmm7
punpcklqdq %xmm4, %xmm0 punpcklqdq %xmm4, %xmm0
punpcklqdq %xmm5, %xmm1 punpcklqdq %xmm5, %xmm1
@ -236,10 +236,10 @@ x64_gen_salsa8_core:
.text .text
.align 32 .align 32
.globl x64_scrypt_core .globl scrypt_core
.globl _x64_scrypt_core .globl _scrypt_core
x64_scrypt_core: scrypt_core:
_x64_scrypt_core: _scrypt_core:
pushq %rbx pushq %rbx
pushq %rbp pushq %rbp
pushq %r12 pushq %r12
@ -264,7 +264,7 @@ _x64_scrypt_core:
movq %rdx, %rsi movq %rdx, %rsi
#endif #endif
.macro x64_scrypt_core_cleanup .macro scrypt_core_cleanup
#if defined(WIN64) #if defined(WIN64)
popq %rsi popq %rsi
popq %rdi popq %rdi
@ -292,13 +292,13 @@ _x64_scrypt_core:
xorl %eax, %eax xorl %eax, %eax
cpuid cpuid
cmpl $0x6c65746e, %ecx cmpl $0x6c65746e, %ecx
jne x64_gen_scrypt_core jne gen_scrypt_core
cmpl $0x49656e69, %edx cmpl $0x49656e69, %edx
jne x64_gen_scrypt_core jne gen_scrypt_core
cmpl $0x756e6547, %ebx cmpl $0x756e6547, %ebx
je x64_xmm_scrypt_core je xmm_scrypt_core
x64_gen_scrypt_core: gen_scrypt_core:
subq $136, %rsp subq $136, %rsp
movdqa 0(%rdi), %xmm8 movdqa 0(%rdi), %xmm8
movdqa 16(%rdi), %xmm9 movdqa 16(%rdi), %xmm9
@ -313,7 +313,7 @@ x64_gen_scrypt_core:
movq %rdi, 104(%rsp) movq %rdi, 104(%rsp)
movq %rsi, 112(%rsp) movq %rsi, 112(%rsp)
movq %rcx, 120(%rsp) movq %rcx, 120(%rsp)
x64_gen_scrypt_core_loop1: gen_scrypt_core_loop1:
movdqa %xmm8, 0(%rsi) movdqa %xmm8, 0(%rsi)
movdqa %xmm9, 16(%rsi) movdqa %xmm9, 16(%rsi)
movdqa %xmm10, 32(%rsi) movdqa %xmm10, 32(%rsi)
@ -332,7 +332,7 @@ x64_gen_scrypt_core_loop1:
movdqa %xmm10, 32(%rsp) movdqa %xmm10, 32(%rsp)
movdqa %xmm11, 48(%rsp) movdqa %xmm11, 48(%rsp)
movq %rsi, 128(%rsp) movq %rsi, 128(%rsp)
call x64_gen_salsa8_core call gen_salsa8_core
paddd %xmm0, %xmm8 paddd %xmm0, %xmm8
paddd %xmm1, %xmm9 paddd %xmm1, %xmm9
paddd %xmm2, %xmm10 paddd %xmm2, %xmm10
@ -346,7 +346,7 @@ x64_gen_scrypt_core_loop1:
movdqa %xmm13, 16(%rsp) movdqa %xmm13, 16(%rsp)
movdqa %xmm14, 32(%rsp) movdqa %xmm14, 32(%rsp)
movdqa %xmm15, 48(%rsp) movdqa %xmm15, 48(%rsp)
call x64_gen_salsa8_core call gen_salsa8_core
movq 128(%rsp), %rsi movq 128(%rsp), %rsi
paddd %xmm0, %xmm12 paddd %xmm0, %xmm12
paddd %xmm1, %xmm13 paddd %xmm1, %xmm13
@ -356,10 +356,10 @@ x64_gen_scrypt_core_loop1:
addq $128, %rsi addq $128, %rsi
movq 120(%rsp), %rcx movq 120(%rsp), %rcx
cmpq %rcx, %rsi cmpq %rcx, %rsi
jne x64_gen_scrypt_core_loop1 jne gen_scrypt_core_loop1
movq $1024, %rcx movq $1024, %rcx
x64_gen_scrypt_core_loop2: gen_scrypt_core_loop2:
movq 112(%rsp), %rsi movq 112(%rsp), %rsi
movd %xmm12, %edx movd %xmm12, %edx
andl $1023, %edx andl $1023, %edx
@ -390,7 +390,7 @@ x64_gen_scrypt_core_loop2:
movdqa %xmm10, 32(%rsp) movdqa %xmm10, 32(%rsp)
movdqa %xmm11, 48(%rsp) movdqa %xmm11, 48(%rsp)
movq %rcx, 128(%rsp) movq %rcx, 128(%rsp)
call x64_gen_salsa8_core call gen_salsa8_core
paddd %xmm0, %xmm8 paddd %xmm0, %xmm8
paddd %xmm1, %xmm9 paddd %xmm1, %xmm9
paddd %xmm2, %xmm10 paddd %xmm2, %xmm10
@ -404,7 +404,7 @@ x64_gen_scrypt_core_loop2:
movdqa %xmm13, 16(%rsp) movdqa %xmm13, 16(%rsp)
movdqa %xmm14, 32(%rsp) movdqa %xmm14, 32(%rsp)
movdqa %xmm15, 48(%rsp) movdqa %xmm15, 48(%rsp)
call x64_gen_salsa8_core call gen_salsa8_core
movq 128(%rsp), %rcx movq 128(%rsp), %rcx
paddd %xmm0, %xmm12 paddd %xmm0, %xmm12
paddd %xmm1, %xmm13 paddd %xmm1, %xmm13
@ -412,7 +412,7 @@ x64_gen_scrypt_core_loop2:
paddd %xmm3, %xmm15 paddd %xmm3, %xmm15
subq $1, %rcx subq $1, %rcx
ja x64_gen_scrypt_core_loop2 ja gen_scrypt_core_loop2
movq 104(%rsp), %rdi movq 104(%rsp), %rdi
movdqa %xmm8, 0(%rdi) movdqa %xmm8, 0(%rdi)
@ -425,11 +425,11 @@ x64_gen_scrypt_core_loop2:
movdqa %xmm15, 112(%rdi) movdqa %xmm15, 112(%rdi)
addq $136, %rsp addq $136, %rsp
x64_scrypt_core_cleanup scrypt_core_cleanup
ret ret
.macro x64_xmm_salsa8_core_doubleround .macro xmm_salsa8_core_doubleround
paddd %xmm0, %xmm4 paddd %xmm0, %xmm4
movdqa %xmm0, %xmm5 movdqa %xmm0, %xmm5
movdqa %xmm4, %xmm6 movdqa %xmm4, %xmm6
@ -495,16 +495,16 @@ x64_gen_scrypt_core_loop2:
pxor %xmm6, %xmm0 pxor %xmm6, %xmm0
.endm .endm
.macro x64_xmm_salsa8_core .macro xmm_salsa8_core
movdqa %xmm1, %xmm4 movdqa %xmm1, %xmm4
x64_xmm_salsa8_core_doubleround xmm_salsa8_core_doubleround
x64_xmm_salsa8_core_doubleround xmm_salsa8_core_doubleround
x64_xmm_salsa8_core_doubleround xmm_salsa8_core_doubleround
x64_xmm_salsa8_core_doubleround xmm_salsa8_core_doubleround
.endm .endm
.align 32 .align 32
x64_xmm_scrypt_core: xmm_scrypt_core:
# shuffle 1st block into %xmm8-%xmm11 # shuffle 1st block into %xmm8-%xmm11
movl 60(%rdi), %edx movl 60(%rdi), %edx
movl 44(%rdi), %ecx movl 44(%rdi), %ecx
@ -623,7 +623,7 @@ x64_xmm_scrypt_core:
movq %rsi, %rdx movq %rsi, %rdx
leaq 131072(%rsi), %rcx leaq 131072(%rsi), %rcx
x64_xmm_scrypt_core_loop1: xmm_scrypt_core_loop1:
movdqa %xmm8, 0(%rdx) movdqa %xmm8, 0(%rdx)
movdqa %xmm9, 16(%rdx) movdqa %xmm9, 16(%rdx)
movdqa %xmm10, 32(%rdx) movdqa %xmm10, 32(%rdx)
@ -641,7 +641,7 @@ x64_xmm_scrypt_core_loop1:
movdqa %xmm9, %xmm1 movdqa %xmm9, %xmm1
movdqa %xmm10, %xmm2 movdqa %xmm10, %xmm2
movdqa %xmm11, %xmm3 movdqa %xmm11, %xmm3
x64_xmm_salsa8_core xmm_salsa8_core
paddd %xmm0, %xmm8 paddd %xmm0, %xmm8
paddd %xmm1, %xmm9 paddd %xmm1, %xmm9
paddd %xmm2, %xmm10 paddd %xmm2, %xmm10
@ -655,7 +655,7 @@ x64_xmm_scrypt_core_loop1:
movdqa %xmm13, %xmm1 movdqa %xmm13, %xmm1
movdqa %xmm14, %xmm2 movdqa %xmm14, %xmm2
movdqa %xmm15, %xmm3 movdqa %xmm15, %xmm3
x64_xmm_salsa8_core xmm_salsa8_core
paddd %xmm0, %xmm12 paddd %xmm0, %xmm12
paddd %xmm1, %xmm13 paddd %xmm1, %xmm13
paddd %xmm2, %xmm14 paddd %xmm2, %xmm14
@ -663,10 +663,10 @@ x64_xmm_scrypt_core_loop1:
addq $128, %rdx addq $128, %rdx
cmpq %rcx, %rdx cmpq %rcx, %rdx
jne x64_xmm_scrypt_core_loop1 jne xmm_scrypt_core_loop1
movq $1024, %rcx movq $1024, %rcx
x64_xmm_scrypt_core_loop2: xmm_scrypt_core_loop2:
movd %xmm12, %edx movd %xmm12, %edx
andl $1023, %edx andl $1023, %edx
shll $7, %edx shll $7, %edx
@ -695,7 +695,7 @@ x64_xmm_scrypt_core_loop2:
movdqa %xmm9, %xmm1 movdqa %xmm9, %xmm1
movdqa %xmm10, %xmm2 movdqa %xmm10, %xmm2
movdqa %xmm11, %xmm3 movdqa %xmm11, %xmm3
x64_xmm_salsa8_core xmm_salsa8_core
paddd %xmm0, %xmm8 paddd %xmm0, %xmm8
paddd %xmm1, %xmm9 paddd %xmm1, %xmm9
paddd %xmm2, %xmm10 paddd %xmm2, %xmm10
@ -709,14 +709,14 @@ x64_xmm_scrypt_core_loop2:
movdqa %xmm13, %xmm1 movdqa %xmm13, %xmm1
movdqa %xmm14, %xmm2 movdqa %xmm14, %xmm2
movdqa %xmm15, %xmm3 movdqa %xmm15, %xmm3
x64_xmm_salsa8_core xmm_salsa8_core
paddd %xmm0, %xmm12 paddd %xmm0, %xmm12
paddd %xmm1, %xmm13 paddd %xmm1, %xmm13
paddd %xmm2, %xmm14 paddd %xmm2, %xmm14
paddd %xmm3, %xmm15 paddd %xmm3, %xmm15
subq $1, %rcx subq $1, %rcx
ja x64_xmm_scrypt_core_loop2 ja xmm_scrypt_core_loop2
# re-shuffle 1st block back # re-shuffle 1st block back
movd %xmm8, %eax movd %xmm8, %eax
@ -810,7 +810,653 @@ x64_xmm_scrypt_core_loop2:
movl %ebx, 92(%rdi) movl %ebx, 92(%rdi)
movl %eax, 76(%rdi) movl %eax, 76(%rdi)
x64_scrypt_core_cleanup scrypt_core_cleanup
ret
# prefer_dual_scrypt: takes no arguments and returns 1 in %eax when the
# CPUID leaf 0 vendor string reads GenuineIntel, 0 for any other vendor.
# The name suggests callers use the answer to choose the dual code path;
# the call site is not visible here.
.text
.align 32
.globl prefer_dual_scrypt
.globl _prefer_dual_scrypt
prefer_dual_scrypt:
_prefer_dual_scrypt:
	pushq	%rbx			# cpuid overwrites %ebx; keep the incoming %rbx
	xorl	%eax, %eax		# select leaf 0 (vendor identification)
	cpuid
	xorl	%eax, %eax		# result defaults to 0
	cmpl	$0x756e6547, %ebx	# vendor bytes 0-3:  Genu
	jne	prefer_dual_scrypt_false
	cmpl	$0x49656e69, %edx	# vendor bytes 4-7:  ineI
	jne	prefer_dual_scrypt_false
	cmpl	$0x6c65746e, %ecx	# vendor bytes 8-11: ntel
	jne	prefer_dual_scrypt_false
	movl	$1, %eax		# all twelve bytes matched
prefer_dual_scrypt_false:
	popq	%rbx
	ret
# xmm_dual_salsa8_core_doubleround: advances two independent states by one
# double round of the Salsa core named by the macro. Each state is four
# registers of four 32-bit words:
#   state A: %xmm0-%xmm3,  temporaries %xmm4-%xmm6
#   state B: %xmm8-%xmm11, temporaries %xmm12-%xmm14
# Entry requirement: %xmm4 is a copy of %xmm1 and %xmm12 is a copy of %xmm9.
# The macro restores that condition before it ends, so it can be invoked
# back to back without reloading.
# Every instruction for state A is immediately followed by the same
# instruction for state B, so the two dependency chains never touch.
# A rotate-left by r is built from pslld r on one copy and psrld (32 - r)
# on another copy, with both results xored into the destination.
.macro xmm_dual_salsa8_core_doubleround
# --- first half ---
# step 1: xmm3 ^= rotl(xmm1 + xmm0, 7); xmm5 keeps a copy of xmm0
paddd %xmm0, %xmm4
paddd %xmm8, %xmm12
movdqa %xmm0, %xmm5
movdqa %xmm8, %xmm13
movdqa %xmm4, %xmm6
movdqa %xmm12, %xmm14
pslld $7, %xmm4
pslld $7, %xmm12
psrld $25, %xmm6
psrld $25, %xmm14
pxor %xmm4, %xmm3
pxor %xmm12, %xmm11
pxor %xmm6, %xmm3
pxor %xmm14, %xmm11
# step 2: xmm2 ^= rotl(xmm0 + xmm3, 9); xmm4 keeps the unshuffled xmm3,
# then the lanes of xmm3 are rotated for the second half
paddd %xmm3, %xmm5
paddd %xmm11, %xmm13
movdqa %xmm3, %xmm4
movdqa %xmm11, %xmm12
movdqa %xmm5, %xmm6
movdqa %xmm13, %xmm14
pslld $9, %xmm5
pslld $9, %xmm13
psrld $23, %xmm6
psrld $23, %xmm14
pxor %xmm5, %xmm2
pxor %xmm13, %xmm10
pshufd $0x93, %xmm3, %xmm3
pshufd $0x93, %xmm11, %xmm11
pxor %xmm6, %xmm2
pxor %xmm14, %xmm10
# step 3: xmm1 ^= rotl(xmm3 + xmm2, 13); xmm5 keeps the unshuffled xmm2,
# then the lanes of xmm2 are rotated
paddd %xmm2, %xmm4
paddd %xmm10, %xmm12
movdqa %xmm2, %xmm5
movdqa %xmm10, %xmm13
movdqa %xmm4, %xmm6
movdqa %xmm12, %xmm14
pslld $13, %xmm4
pslld $13, %xmm12
psrld $19, %xmm6
psrld $19, %xmm14
pxor %xmm4, %xmm1
pxor %xmm12, %xmm9
pshufd $0x4e, %xmm2, %xmm2
pshufd $0x4e, %xmm10, %xmm10
pxor %xmm6, %xmm1
pxor %xmm14, %xmm9
# step 4: xmm0 ^= rotl(xmm2 + xmm1, 18); xmm4 is seeded with the shuffled
# xmm3 for the second half, then the lanes of xmm1 are rotated
paddd %xmm1, %xmm5
paddd %xmm9, %xmm13
movdqa %xmm3, %xmm4
movdqa %xmm11, %xmm12
movdqa %xmm5, %xmm6
movdqa %xmm13, %xmm14
pslld $18, %xmm5
pslld $18, %xmm13
psrld $14, %xmm6
psrld $14, %xmm14
pxor %xmm5, %xmm0
pxor %xmm13, %xmm8
pshufd $0x39, %xmm1, %xmm1
pshufd $0x39, %xmm9, %xmm9
pxor %xmm6, %xmm0
pxor %xmm14, %xmm8
# --- second half: same four steps with the roles of xmm1 and xmm3 swapped ---
# step 5: xmm1 ^= rotl(xmm3 + xmm0, 7)
paddd %xmm0, %xmm4
paddd %xmm8, %xmm12
movdqa %xmm0, %xmm5
movdqa %xmm8, %xmm13
movdqa %xmm4, %xmm6
movdqa %xmm12, %xmm14
pslld $7, %xmm4
pslld $7, %xmm12
psrld $25, %xmm6
psrld $25, %xmm14
pxor %xmm4, %xmm1
pxor %xmm12, %xmm9
pxor %xmm6, %xmm1
pxor %xmm14, %xmm9
# step 6: xmm2 ^= rotl(xmm0 + xmm1, 9), then rotate the lanes of xmm1
paddd %xmm1, %xmm5
paddd %xmm9, %xmm13
movdqa %xmm1, %xmm4
movdqa %xmm9, %xmm12
movdqa %xmm5, %xmm6
movdqa %xmm13, %xmm14
pslld $9, %xmm5
pslld $9, %xmm13
psrld $23, %xmm6
psrld $23, %xmm14
pxor %xmm5, %xmm2
pxor %xmm13, %xmm10
pshufd $0x93, %xmm1, %xmm1
pshufd $0x93, %xmm9, %xmm9
pxor %xmm6, %xmm2
pxor %xmm14, %xmm10
# step 7: xmm3 ^= rotl(xmm1 + xmm2, 13), then rotate the lanes of xmm2
paddd %xmm2, %xmm4
paddd %xmm10, %xmm12
movdqa %xmm2, %xmm5
movdqa %xmm10, %xmm13
movdqa %xmm4, %xmm6
movdqa %xmm12, %xmm14
pslld $13, %xmm4
pslld $13, %xmm12
psrld $19, %xmm6
psrld $19, %xmm14
pxor %xmm4, %xmm3
pxor %xmm12, %xmm11
pshufd $0x4e, %xmm2, %xmm2
pshufd $0x4e, %xmm10, %xmm10
pxor %xmm6, %xmm3
pxor %xmm14, %xmm11
# step 8: xmm0 ^= rotl(xmm2 + xmm3, 18); xmm4 and xmm12 are reloaded from
# the shuffled xmm1 and xmm9, which re-establishes the entry requirement,
# then the lanes of xmm3 are rotated
paddd %xmm3, %xmm5
paddd %xmm11, %xmm13
movdqa %xmm1, %xmm4
movdqa %xmm9, %xmm12
movdqa %xmm5, %xmm6
movdqa %xmm13, %xmm14
pslld $18, %xmm5
pslld $18, %xmm13
psrld $14, %xmm6
psrld $14, %xmm14
pxor %xmm5, %xmm0
pxor %xmm13, %xmm8
pshufd $0x39, %xmm3, %xmm3
pshufd $0x39, %xmm11, %xmm11
pxor %xmm6, %xmm0
pxor %xmm14, %xmm8
.endm
# xmm_dual_salsa8_core: applies four double rounds (eight rounds in total)
# to two states at once, state A in %xmm0-%xmm3 and state B in
# %xmm8-%xmm11. The double-round macro expects %xmm4 and %xmm12 to hold
# copies of %xmm1 and %xmm9 on entry and re-creates them itself on exit,
# so the copies are made only once here.
.macro xmm_dual_salsa8_core
	movdqa	%xmm9, %xmm12
	movdqa	%xmm1, %xmm4
	.rept 4
	xmm_dual_salsa8_core_doubleround
	.endr
.endm
# dual_scrypt_core: runs the scrypt core over two 128-byte states in one
# pass, using xmm_dual_salsa8_core so that both Salsa computations share a
# single instruction stream.
# Register arguments, after the WIN64 remapping below (on other targets the
# values are read from these registers directly):
#   %rdi  first 128-byte state, updated in place
#   %rsi  second 128-byte state, updated in place
#   %rdx  scratchpad; the fill loop writes 262144 bytes to it, as 1024
#         entries of 256 bytes (128 from the first state, 128 from the second)
# All three are accessed with movdqa, so each must be 16-byte aligned.
# %rbx and %rbp are saved and restored; %rax and %rcx are overwritten.
# %xmm0-%xmm15 are overwritten, except that WIN64 builds restore
# %xmm6-%xmm15 before returning.
.text
.align 32
.globl dual_scrypt_core
.globl _dual_scrypt_core
dual_scrypt_core:
_dual_scrypt_core:
pushq %rbx
pushq %rbp
#if defined(WIN64)
# Reserve 176 bytes and save %xmm6-%xmm15 in them, save %rdi and %rsi, then
# move the arguments from %rcx, %rdx, %r8 into %rdi, %rsi, %rdx so the
# remainder of the routine is identical on every target.
subq $176, %rsp
movdqa %xmm6, 8(%rsp)
movdqa %xmm7, 24(%rsp)
movdqa %xmm8, 40(%rsp)
movdqa %xmm9, 56(%rsp)
movdqa %xmm10, 72(%rsp)
movdqa %xmm11, 88(%rsp)
movdqa %xmm12, 104(%rsp)
movdqa %xmm13, 120(%rsp)
movdqa %xmm14, 136(%rsp)
movdqa %xmm15, 152(%rsp)
pushq %rdi
pushq %rsi
movq %rcx, %rdi
movq %rdx, %rsi
movq %r8, %rdx
#endif
# Word shuffle applied to each of the four 64-byte blocks. Within a block
# it swaps the 32-bit words at byte offsets 12/60, 28/44, 8/40, 16/48, 4/20
# and 36/52. Because it consists only of pairwise swaps, running the same
# sequence a second time restores the original order; the epilogue does so.
# shuffle 1st block
movl 60(%rdi), %ebp
movl 44(%rdi), %ecx
movl 28(%rdi), %ebx
movl 12(%rdi), %eax
movl %ebp, 12(%rdi)
movl %ecx, 28(%rdi)
movl %ebx, 44(%rdi)
movl %eax, 60(%rdi)
movl 40(%rdi), %ecx
movl 8(%rdi), %eax
movl 48(%rdi), %ebp
movl 16(%rdi), %ebx
movl %ecx, 8(%rdi)
movl %eax, 40(%rdi)
movl %ebp, 16(%rdi)
movl %ebx, 48(%rdi)
movl 20(%rdi), %ebx
movl 4(%rdi), %eax
movl 52(%rdi), %ebp
movl 36(%rdi), %ecx
movl %ebx, 4(%rdi)
movl %eax, 20(%rdi)
movl %ebp, 36(%rdi)
movl %ecx, 52(%rdi)
# shuffle 2nd block
movl 124(%rdi), %ebp
movl 108(%rdi), %ecx
movl 92(%rdi), %ebx
movl 76(%rdi), %eax
movl %ebp, 76(%rdi)
movl %ecx, 92(%rdi)
movl %ebx, 108(%rdi)
movl %eax, 124(%rdi)
movl 104(%rdi), %ecx
movl 72(%rdi), %eax
movl 112(%rdi), %ebp
movl 80(%rdi), %ebx
movl %ecx, 72(%rdi)
movl %eax, 104(%rdi)
movl %ebp, 80(%rdi)
movl %ebx, 112(%rdi)
movl 84(%rdi), %ebx
movl 68(%rdi), %eax
movl 116(%rdi), %ebp
movl 100(%rdi), %ecx
movl %ebx, 68(%rdi)
movl %eax, 84(%rdi)
movl %ebp, 100(%rdi)
movl %ecx, 116(%rdi)
# shuffle 3rd block
movl 60(%rsi), %ebp
movl 44(%rsi), %ecx
movl 28(%rsi), %ebx
movl 12(%rsi), %eax
movl %ebp, 12(%rsi)
movl %ecx, 28(%rsi)
movl %ebx, 44(%rsi)
movl %eax, 60(%rsi)
movl 40(%rsi), %ecx
movl 8(%rsi), %eax
movl 48(%rsi), %ebp
movl 16(%rsi), %ebx
movl %ecx, 8(%rsi)
movl %eax, 40(%rsi)
movl %ebp, 16(%rsi)
movl %ebx, 48(%rsi)
movl 20(%rsi), %ebx
movl 4(%rsi), %eax
movl 52(%rsi), %ebp
movl 36(%rsi), %ecx
movl %ebx, 4(%rsi)
movl %eax, 20(%rsi)
movl %ebp, 36(%rsi)
movl %ecx, 52(%rsi)
# shuffle 4th block
movl 124(%rsi), %ebp
movl 108(%rsi), %ecx
movl 92(%rsi), %ebx
movl 76(%rsi), %eax
movl %ebp, 76(%rsi)
movl %ecx, 92(%rsi)
movl %ebx, 108(%rsi)
movl %eax, 124(%rsi)
movl 104(%rsi), %ecx
movl 72(%rsi), %eax
movl 112(%rsi), %ebp
movl 80(%rsi), %ebx
movl %ecx, 72(%rsi)
movl %eax, 104(%rsi)
movl %ebp, 80(%rsi)
movl %ebx, 112(%rsi)
movl 84(%rsi), %ebx
movl 68(%rsi), %eax
movl 116(%rsi), %ebp
movl 100(%rsi), %ecx
movl %ebx, 68(%rsi)
movl %eax, 84(%rsi)
movl %ebp, 100(%rsi)
movl %ecx, 116(%rsi)
# Fill loop setup: %rbp walks the scratchpad from its start, %rcx marks the
# end 262144 bytes later. Each pass advances %rbp by 256, giving 1024 passes.
movq %rdx, %rbp
leaq 262144(%rdx), %rcx
.align 8
dual_scrypt_core_loop1:
# Load the first state into xmm0-xmm7 and the second into xmm8-xmm15.
movdqa 0(%rdi), %xmm0
movdqa 16(%rdi), %xmm1
movdqa 32(%rdi), %xmm2
movdqa 48(%rdi), %xmm3
movdqa 64(%rdi), %xmm4
movdqa 80(%rdi), %xmm5
movdqa 96(%rdi), %xmm6
movdqa 112(%rdi), %xmm7
movdqa 0(%rsi), %xmm8
movdqa 16(%rsi), %xmm9
movdqa 32(%rsi), %xmm10
movdqa 48(%rsi), %xmm11
movdqa 64(%rsi), %xmm12
movdqa 80(%rsi), %xmm13
movdqa 96(%rsi), %xmm14
movdqa 112(%rsi), %xmm15
# Record both states in the current scratchpad entry: first state at
# offsets 0-127, second state at offsets 128-255.
movdqa %xmm0, 0(%rbp)
movdqa %xmm1, 16(%rbp)
movdqa %xmm2, 32(%rbp)
movdqa %xmm3, 48(%rbp)
movdqa %xmm4, 64(%rbp)
movdqa %xmm5, 80(%rbp)
movdqa %xmm6, 96(%rbp)
movdqa %xmm7, 112(%rbp)
movdqa %xmm8, 128(%rbp)
movdqa %xmm9, 144(%rbp)
movdqa %xmm10, 160(%rbp)
movdqa %xmm11, 176(%rbp)
movdqa %xmm12, 192(%rbp)
movdqa %xmm13, 208(%rbp)
movdqa %xmm14, 224(%rbp)
movdqa %xmm15, 240(%rbp)
# Low 64 bytes ^= high 64 bytes, for each state.
pxor %xmm4, %xmm0
pxor %xmm5, %xmm1
pxor %xmm6, %xmm2
pxor %xmm7, %xmm3
pxor %xmm12, %xmm8
pxor %xmm13, %xmm9
pxor %xmm14, %xmm10
pxor %xmm15, %xmm11
# Keep the Salsa input in memory; it is added back after the rounds.
movdqa %xmm0, 0(%rdi)
movdqa %xmm1, 16(%rdi)
movdqa %xmm2, 32(%rdi)
movdqa %xmm3, 48(%rdi)
movdqa %xmm8, 0(%rsi)
movdqa %xmm9, 16(%rsi)
movdqa %xmm10, 32(%rsi)
movdqa %xmm11, 48(%rsi)
xmm_dual_salsa8_core
# Add the saved input to the round output; this is the new low half.
paddd 0(%rdi), %xmm0
paddd 16(%rdi), %xmm1
paddd 32(%rdi), %xmm2
paddd 48(%rdi), %xmm3
paddd 0(%rsi), %xmm8
paddd 16(%rsi), %xmm9
paddd 32(%rsi), %xmm10
paddd 48(%rsi), %xmm11
movdqa %xmm0, 0(%rdi)
movdqa %xmm1, 16(%rdi)
movdqa %xmm2, 32(%rdi)
movdqa %xmm3, 48(%rdi)
movdqa %xmm8, 0(%rsi)
movdqa %xmm9, 16(%rsi)
movdqa %xmm10, 32(%rsi)
movdqa %xmm11, 48(%rsi)
# Xor the new low half with the high half still in memory and store the
# result there as the input of the second Salsa pass.
pxor 64(%rdi), %xmm0
pxor 80(%rdi), %xmm1
pxor 96(%rdi), %xmm2
pxor 112(%rdi), %xmm3
pxor 64(%rsi), %xmm8
pxor 80(%rsi), %xmm9
pxor 96(%rsi), %xmm10
pxor 112(%rsi), %xmm11
movdqa %xmm0, 64(%rdi)
movdqa %xmm1, 80(%rdi)
movdqa %xmm2, 96(%rdi)
movdqa %xmm3, 112(%rdi)
movdqa %xmm8, 64(%rsi)
movdqa %xmm9, 80(%rsi)
movdqa %xmm10, 96(%rsi)
movdqa %xmm11, 112(%rsi)
xmm_dual_salsa8_core
# Add the saved input to the round output; this is the new high half.
paddd 64(%rdi), %xmm0
paddd 80(%rdi), %xmm1
paddd 96(%rdi), %xmm2
paddd 112(%rdi), %xmm3
paddd 64(%rsi), %xmm8
paddd 80(%rsi), %xmm9
paddd 96(%rsi), %xmm10
paddd 112(%rsi), %xmm11
movdqa %xmm0, 64(%rdi)
movdqa %xmm1, 80(%rdi)
movdqa %xmm2, 96(%rdi)
movdqa %xmm3, 112(%rdi)
movdqa %xmm8, 64(%rsi)
movdqa %xmm9, 80(%rsi)
movdqa %xmm10, 96(%rsi)
movdqa %xmm11, 112(%rsi)
# Step to the next 256-byte scratchpad entry until the end mark is reached.
addq $256, %rbp
cmpq %rcx, %rbp
jne dual_scrypt_core_loop1
# Mixing loop: %rcx counts 1024 passes down to zero.
movq $1024, %rcx
.align 8
dual_scrypt_core_loop2:
# First state: the 32-bit word at offset 64 selects the entry. Masking
# with 1023 and shifting left by 8 gives its byte offset (index * 256);
# writing %ebp also clears the upper half of %rbp.
movl 64(%rdi), %ebp
andl $1023, %ebp
shll $8, %ebp
movdqa 0(%rdx, %rbp), %xmm0
movdqa 16(%rdx, %rbp), %xmm1
movdqa 32(%rdx, %rbp), %xmm2
movdqa 48(%rdx, %rbp), %xmm3
movdqa 64(%rdx, %rbp), %xmm4
movdqa 80(%rdx, %rbp), %xmm5
movdqa 96(%rdx, %rbp), %xmm6
movdqa 112(%rdx, %rbp), %xmm7
# Second state: the index is derived the same way from its own word at
# offset 64, and its data is in the second 128 bytes of the selected entry.
movl 64(%rsi), %ebp
andl $1023, %ebp
shll $8, %ebp
addl $128, %ebp
movdqa 0(%rdx, %rbp), %xmm8
movdqa 16(%rdx, %rbp), %xmm9
movdqa 32(%rdx, %rbp), %xmm10
movdqa 48(%rdx, %rbp), %xmm11
movdqa 64(%rdx, %rbp), %xmm12
movdqa 80(%rdx, %rbp), %xmm13
movdqa 96(%rdx, %rbp), %xmm14
movdqa 112(%rdx, %rbp), %xmm15
# Xor the current states into the loaded scratchpad entries.
pxor 0(%rdi), %xmm0
pxor 16(%rdi), %xmm1
pxor 32(%rdi), %xmm2
pxor 48(%rdi), %xmm3
pxor 64(%rdi), %xmm4
pxor 80(%rdi), %xmm5
pxor 96(%rdi), %xmm6
pxor 112(%rdi), %xmm7
pxor 0(%rsi), %xmm8
pxor 16(%rsi), %xmm9
pxor 32(%rsi), %xmm10
pxor 48(%rsi), %xmm11
pxor 64(%rsi), %xmm12
pxor 80(%rsi), %xmm13
pxor 96(%rsi), %xmm14
pxor 112(%rsi), %xmm15
# Low 64 bytes ^= high 64 bytes, for each state.
pxor %xmm4, %xmm0
pxor %xmm5, %xmm1
pxor %xmm6, %xmm2
pxor %xmm7, %xmm3
pxor %xmm12, %xmm8
pxor %xmm13, %xmm9
pxor %xmm14, %xmm10
pxor %xmm15, %xmm11
# Store the Salsa input (low half) and the xored high half back to memory.
movdqa %xmm0, 0(%rdi)
movdqa %xmm1, 16(%rdi)
movdqa %xmm2, 32(%rdi)
movdqa %xmm3, 48(%rdi)
movdqa %xmm4, 64(%rdi)
movdqa %xmm5, 80(%rdi)
movdqa %xmm6, 96(%rdi)
movdqa %xmm7, 112(%rdi)
movdqa %xmm8, 0(%rsi)
movdqa %xmm9, 16(%rsi)
movdqa %xmm10, 32(%rsi)
movdqa %xmm11, 48(%rsi)
movdqa %xmm12, 64(%rsi)
movdqa %xmm13, 80(%rsi)
movdqa %xmm14, 96(%rsi)
movdqa %xmm15, 112(%rsi)
xmm_dual_salsa8_core
# Add the saved input to the round output; this is the new low half.
paddd 0(%rdi), %xmm0
paddd 16(%rdi), %xmm1
paddd 32(%rdi), %xmm2
paddd 48(%rdi), %xmm3
paddd 0(%rsi), %xmm8
paddd 16(%rsi), %xmm9
paddd 32(%rsi), %xmm10
paddd 48(%rsi), %xmm11
movdqa %xmm0, 0(%rdi)
movdqa %xmm1, 16(%rdi)
movdqa %xmm2, 32(%rdi)
movdqa %xmm3, 48(%rdi)
movdqa %xmm8, 0(%rsi)
movdqa %xmm9, 16(%rsi)
movdqa %xmm10, 32(%rsi)
movdqa %xmm11, 48(%rsi)
# Xor the new low half with the high half in memory and store the result
# there as the input of the second Salsa pass.
pxor 64(%rdi), %xmm0
pxor 80(%rdi), %xmm1
pxor 96(%rdi), %xmm2
pxor 112(%rdi), %xmm3
pxor 64(%rsi), %xmm8
pxor 80(%rsi), %xmm9
pxor 96(%rsi), %xmm10
pxor 112(%rsi), %xmm11
movdqa %xmm0, 64(%rdi)
movdqa %xmm1, 80(%rdi)
movdqa %xmm2, 96(%rdi)
movdqa %xmm3, 112(%rdi)
movdqa %xmm8, 64(%rsi)
movdqa %xmm9, 80(%rsi)
movdqa %xmm10, 96(%rsi)
movdqa %xmm11, 112(%rsi)
xmm_dual_salsa8_core
# Add the saved input to the round output; this is the new high half.
paddd 64(%rdi), %xmm0
paddd 80(%rdi), %xmm1
paddd 96(%rdi), %xmm2
paddd 112(%rdi), %xmm3
paddd 64(%rsi), %xmm8
paddd 80(%rsi), %xmm9
paddd 96(%rsi), %xmm10
paddd 112(%rsi), %xmm11
movdqa %xmm0, 64(%rdi)
movdqa %xmm1, 80(%rdi)
movdqa %xmm2, 96(%rdi)
movdqa %xmm3, 112(%rdi)
movdqa %xmm8, 64(%rsi)
movdqa %xmm9, 80(%rsi)
movdqa %xmm10, 96(%rsi)
movdqa %xmm11, 112(%rsi)
# Count down; ja repeats the loop while the result is still above zero.
subq $1, %rcx
ja dual_scrypt_core_loop2
# Undo the word shuffle: the same swap sequence as in the prologue puts
# every block back into its original word order.
# shuffle 1st block
movl 60(%rdi), %ebp
movl 44(%rdi), %ecx
movl 28(%rdi), %ebx
movl 12(%rdi), %eax
movl %ebp, 12(%rdi)
movl %ecx, 28(%rdi)
movl %ebx, 44(%rdi)
movl %eax, 60(%rdi)
movl 40(%rdi), %ecx
movl 8(%rdi), %eax
movl 48(%rdi), %ebp
movl 16(%rdi), %ebx
movl %ecx, 8(%rdi)
movl %eax, 40(%rdi)
movl %ebp, 16(%rdi)
movl %ebx, 48(%rdi)
movl 20(%rdi), %ebx
movl 4(%rdi), %eax
movl 52(%rdi), %ebp
movl 36(%rdi), %ecx
movl %ebx, 4(%rdi)
movl %eax, 20(%rdi)
movl %ebp, 36(%rdi)
movl %ecx, 52(%rdi)
# shuffle 2nd block
movl 124(%rdi), %ebp
movl 108(%rdi), %ecx
movl 92(%rdi), %ebx
movl 76(%rdi), %eax
movl %ebp, 76(%rdi)
movl %ecx, 92(%rdi)
movl %ebx, 108(%rdi)
movl %eax, 124(%rdi)
movl 104(%rdi), %ecx
movl 72(%rdi), %eax
movl 112(%rdi), %ebp
movl 80(%rdi), %ebx
movl %ecx, 72(%rdi)
movl %eax, 104(%rdi)
movl %ebp, 80(%rdi)
movl %ebx, 112(%rdi)
movl 84(%rdi), %ebx
movl 68(%rdi), %eax
movl 116(%rdi), %ebp
movl 100(%rdi), %ecx
movl %ebx, 68(%rdi)
movl %eax, 84(%rdi)
movl %ebp, 100(%rdi)
movl %ecx, 116(%rdi)
# shuffle 3rd block
movl 60(%rsi), %ebp
movl 44(%rsi), %ecx
movl 28(%rsi), %ebx
movl 12(%rsi), %eax
movl %ebp, 12(%rsi)
movl %ecx, 28(%rsi)
movl %ebx, 44(%rsi)
movl %eax, 60(%rsi)
movl 40(%rsi), %ecx
movl 8(%rsi), %eax
movl 48(%rsi), %ebp
movl 16(%rsi), %ebx
movl %ecx, 8(%rsi)
movl %eax, 40(%rsi)
movl %ebp, 16(%rsi)
movl %ebx, 48(%rsi)
movl 20(%rsi), %ebx
movl 4(%rsi), %eax
movl 52(%rsi), %ebp
movl 36(%rsi), %ecx
movl %ebx, 4(%rsi)
movl %eax, 20(%rsi)
movl %ebp, 36(%rsi)
movl %ecx, 52(%rsi)
# shuffle 4th block
movl 124(%rsi), %ebp
movl 108(%rsi), %ecx
movl 92(%rsi), %ebx
movl 76(%rsi), %eax
movl %ebp, 76(%rsi)
movl %ecx, 92(%rsi)
movl %ebx, 108(%rsi)
movl %eax, 124(%rsi)
movl 104(%rsi), %ecx
movl 72(%rsi), %eax
movl 112(%rsi), %ebp
movl 80(%rsi), %ebx
movl %ecx, 72(%rsi)
movl %eax, 104(%rsi)
movl %ebp, 80(%rsi)
movl %ebx, 112(%rsi)
movl 84(%rsi), %ebx
movl 68(%rsi), %eax
movl 116(%rsi), %ebp
movl 100(%rsi), %ecx
movl %ebx, 68(%rsi)
movl %eax, 84(%rsi)
movl %ebp, 100(%rsi)
movl %ecx, 116(%rsi)
#if defined(WIN64)
# Mirror of the prologue: restore %rsi, %rdi and %xmm6-%xmm15, then release
# the 176-byte save area.
popq %rsi
popq %rdi
movdqa 8(%rsp), %xmm6
movdqa 24(%rsp), %xmm7
movdqa 40(%rsp), %xmm8
movdqa 56(%rsp), %xmm9
movdqa 72(%rsp), %xmm10
movdqa 88(%rsp), %xmm11
movdqa 104(%rsp), %xmm12
movdqa 120(%rsp), %xmm13
movdqa 136(%rsp), %xmm14
movdqa 152(%rsp), %xmm15
addq $176, %rsp
#endif
popq %rbp
popq %rbx
ret ret
#endif #endif

View file

@ -24,7 +24,7 @@
#if defined(__i386__) #if defined(__i386__)
.macro x86_gen_salsa8_core_quadround .macro gen_salsa8_core_quadround
movl 52(%esp), %ecx movl 52(%esp), %ecx
movl 4(%esp), %edx movl 4(%esp), %edx
movl 20(%esp), %ebx movl 20(%esp), %ebx
@ -346,18 +346,18 @@
.text .text
.align 32 .align 32
x86_gen_salsa8_core: gen_salsa8_core:
x86_gen_salsa8_core_quadround gen_salsa8_core_quadround
x86_gen_salsa8_core_quadround gen_salsa8_core_quadround
ret ret
.text .text
.align 32 .align 32
.globl x86_scrypt_core .globl scrypt_core
.globl _x86_scrypt_core .globl _scrypt_core
x86_scrypt_core: scrypt_core:
_x86_scrypt_core: _scrypt_core:
pushl %ebx pushl %ebx
pushl %ebp pushl %ebp
pushl %edi pushl %edi
@ -367,14 +367,14 @@ _x86_scrypt_core:
movl $1, %eax movl $1, %eax
cpuid cpuid
andl $0x04000000, %edx andl $0x04000000, %edx
jnz x86_xmm_scrypt_core jnz xmm_scrypt_core
x86_gen_scrypt_core: gen_scrypt_core:
movl 20(%esp), %edi movl 20(%esp), %edi
movl 24(%esp), %esi movl 24(%esp), %esi
subl $72, %esp subl $72, %esp
.macro x86_scrypt_core_macro1a p, q .macro scrypt_core_macro1a p, q
movl \p(%edi), %eax movl \p(%edi), %eax
movl \q(%edi), %edx movl \q(%edi), %edx
movl %eax, \p(%esi) movl %eax, \p(%esi)
@ -384,7 +384,7 @@ x86_gen_scrypt_core:
movl %eax, \p(%esp) movl %eax, \p(%esp)
.endm .endm
.macro x86_scrypt_core_macro1b p, q .macro scrypt_core_macro1b p, q
movl \p(%edi), %eax movl \p(%edi), %eax
xorl \p(%esi, %edx), %eax xorl \p(%esi, %edx), %eax
movl \q(%edi), %ebx movl \q(%edi), %ebx
@ -395,7 +395,7 @@ x86_gen_scrypt_core:
movl %eax, \p(%esp) movl %eax, \p(%esp)
.endm .endm
.macro x86_scrypt_core_macro2 p, q .macro scrypt_core_macro2 p, q
movl \p(%esp), %eax movl \p(%esp), %eax
addl \p(%edi), %eax addl \p(%edi), %eax
movl %eax, \p(%edi) movl %eax, \p(%edi)
@ -404,150 +404,150 @@ x86_gen_scrypt_core:
movl %eax, \p(%esp) movl %eax, \p(%esp)
.endm .endm
.macro x86_scrypt_core_macro3 p, q .macro scrypt_core_macro3 p, q
movl \p(%esp), %eax movl \p(%esp), %eax
addl \q(%edi), %eax addl \q(%edi), %eax
movl %eax, \q(%edi) movl %eax, \q(%edi)
.endm .endm
leal 131072(%esi), %ecx leal 131072(%esi), %ecx
x86_gen_scrypt_core_loop1: gen_scrypt_core_loop1:
movl %esi, 64(%esp) movl %esi, 64(%esp)
movl %ecx, 68(%esp) movl %ecx, 68(%esp)
x86_scrypt_core_macro1a 0, 64 scrypt_core_macro1a 0, 64
x86_scrypt_core_macro1a 4, 68 scrypt_core_macro1a 4, 68
x86_scrypt_core_macro1a 8, 72 scrypt_core_macro1a 8, 72
x86_scrypt_core_macro1a 12, 76 scrypt_core_macro1a 12, 76
x86_scrypt_core_macro1a 16, 80 scrypt_core_macro1a 16, 80
x86_scrypt_core_macro1a 20, 84 scrypt_core_macro1a 20, 84
x86_scrypt_core_macro1a 24, 88 scrypt_core_macro1a 24, 88
x86_scrypt_core_macro1a 28, 92 scrypt_core_macro1a 28, 92
x86_scrypt_core_macro1a 32, 96 scrypt_core_macro1a 32, 96
x86_scrypt_core_macro1a 36, 100 scrypt_core_macro1a 36, 100
x86_scrypt_core_macro1a 40, 104 scrypt_core_macro1a 40, 104
x86_scrypt_core_macro1a 44, 108 scrypt_core_macro1a 44, 108
x86_scrypt_core_macro1a 48, 112 scrypt_core_macro1a 48, 112
x86_scrypt_core_macro1a 52, 116 scrypt_core_macro1a 52, 116
x86_scrypt_core_macro1a 56, 120 scrypt_core_macro1a 56, 120
x86_scrypt_core_macro1a 60, 124 scrypt_core_macro1a 60, 124
call x86_gen_salsa8_core call gen_salsa8_core
movl 92(%esp), %edi movl 92(%esp), %edi
x86_scrypt_core_macro2 0, 64 scrypt_core_macro2 0, 64
x86_scrypt_core_macro2 4, 68 scrypt_core_macro2 4, 68
x86_scrypt_core_macro2 8, 72 scrypt_core_macro2 8, 72
x86_scrypt_core_macro2 12, 76 scrypt_core_macro2 12, 76
x86_scrypt_core_macro2 16, 80 scrypt_core_macro2 16, 80
x86_scrypt_core_macro2 20, 84 scrypt_core_macro2 20, 84
x86_scrypt_core_macro2 24, 88 scrypt_core_macro2 24, 88
x86_scrypt_core_macro2 28, 92 scrypt_core_macro2 28, 92
x86_scrypt_core_macro2 32, 96 scrypt_core_macro2 32, 96
x86_scrypt_core_macro2 36, 100 scrypt_core_macro2 36, 100
x86_scrypt_core_macro2 40, 104 scrypt_core_macro2 40, 104
x86_scrypt_core_macro2 44, 108 scrypt_core_macro2 44, 108
x86_scrypt_core_macro2 48, 112 scrypt_core_macro2 48, 112
x86_scrypt_core_macro2 52, 116 scrypt_core_macro2 52, 116
x86_scrypt_core_macro2 56, 120 scrypt_core_macro2 56, 120
x86_scrypt_core_macro2 60, 124 scrypt_core_macro2 60, 124
call x86_gen_salsa8_core call gen_salsa8_core
movl 92(%esp), %edi movl 92(%esp), %edi
x86_scrypt_core_macro3 0, 64 scrypt_core_macro3 0, 64
x86_scrypt_core_macro3 4, 68 scrypt_core_macro3 4, 68
x86_scrypt_core_macro3 8, 72 scrypt_core_macro3 8, 72
x86_scrypt_core_macro3 12, 76 scrypt_core_macro3 12, 76
x86_scrypt_core_macro3 16, 80 scrypt_core_macro3 16, 80
x86_scrypt_core_macro3 20, 84 scrypt_core_macro3 20, 84
x86_scrypt_core_macro3 24, 88 scrypt_core_macro3 24, 88
x86_scrypt_core_macro3 28, 92 scrypt_core_macro3 28, 92
x86_scrypt_core_macro3 32, 96 scrypt_core_macro3 32, 96
x86_scrypt_core_macro3 36, 100 scrypt_core_macro3 36, 100
x86_scrypt_core_macro3 40, 104 scrypt_core_macro3 40, 104
x86_scrypt_core_macro3 44, 108 scrypt_core_macro3 44, 108
x86_scrypt_core_macro3 48, 112 scrypt_core_macro3 48, 112
x86_scrypt_core_macro3 52, 116 scrypt_core_macro3 52, 116
x86_scrypt_core_macro3 56, 120 scrypt_core_macro3 56, 120
x86_scrypt_core_macro3 60, 124 scrypt_core_macro3 60, 124
movl 64(%esp), %esi movl 64(%esp), %esi
movl 68(%esp), %ecx movl 68(%esp), %ecx
addl $128, %esi addl $128, %esi
cmpl %ecx, %esi cmpl %ecx, %esi
jne x86_gen_scrypt_core_loop1 jne gen_scrypt_core_loop1
movl 96(%esp), %esi movl 96(%esp), %esi
movl $1024, %ecx movl $1024, %ecx
x86_gen_scrypt_core_loop2: gen_scrypt_core_loop2:
movl %ecx, 68(%esp) movl %ecx, 68(%esp)
movl 64(%edi), %edx movl 64(%edi), %edx
andl $1023, %edx andl $1023, %edx
shll $7, %edx shll $7, %edx
x86_scrypt_core_macro1b 0, 64 scrypt_core_macro1b 0, 64
x86_scrypt_core_macro1b 4, 68 scrypt_core_macro1b 4, 68
x86_scrypt_core_macro1b 8, 72 scrypt_core_macro1b 8, 72
x86_scrypt_core_macro1b 12, 76 scrypt_core_macro1b 12, 76
x86_scrypt_core_macro1b 16, 80 scrypt_core_macro1b 16, 80
x86_scrypt_core_macro1b 20, 84 scrypt_core_macro1b 20, 84
x86_scrypt_core_macro1b 24, 88 scrypt_core_macro1b 24, 88
x86_scrypt_core_macro1b 28, 92 scrypt_core_macro1b 28, 92
x86_scrypt_core_macro1b 32, 96 scrypt_core_macro1b 32, 96
x86_scrypt_core_macro1b 36, 100 scrypt_core_macro1b 36, 100
x86_scrypt_core_macro1b 40, 104 scrypt_core_macro1b 40, 104
x86_scrypt_core_macro1b 44, 108 scrypt_core_macro1b 44, 108
x86_scrypt_core_macro1b 48, 112 scrypt_core_macro1b 48, 112
x86_scrypt_core_macro1b 52, 116 scrypt_core_macro1b 52, 116
x86_scrypt_core_macro1b 56, 120 scrypt_core_macro1b 56, 120
x86_scrypt_core_macro1b 60, 124 scrypt_core_macro1b 60, 124
call x86_gen_salsa8_core call gen_salsa8_core
movl 92(%esp), %edi movl 92(%esp), %edi
x86_scrypt_core_macro2 0, 64 scrypt_core_macro2 0, 64
x86_scrypt_core_macro2 4, 68 scrypt_core_macro2 4, 68
x86_scrypt_core_macro2 8, 72 scrypt_core_macro2 8, 72
x86_scrypt_core_macro2 12, 76 scrypt_core_macro2 12, 76
x86_scrypt_core_macro2 16, 80 scrypt_core_macro2 16, 80
x86_scrypt_core_macro2 20, 84 scrypt_core_macro2 20, 84
x86_scrypt_core_macro2 24, 88 scrypt_core_macro2 24, 88
x86_scrypt_core_macro2 28, 92 scrypt_core_macro2 28, 92
x86_scrypt_core_macro2 32, 96 scrypt_core_macro2 32, 96
x86_scrypt_core_macro2 36, 100 scrypt_core_macro2 36, 100
x86_scrypt_core_macro2 40, 104 scrypt_core_macro2 40, 104
x86_scrypt_core_macro2 44, 108 scrypt_core_macro2 44, 108
x86_scrypt_core_macro2 48, 112 scrypt_core_macro2 48, 112
x86_scrypt_core_macro2 52, 116 scrypt_core_macro2 52, 116
x86_scrypt_core_macro2 56, 120 scrypt_core_macro2 56, 120
x86_scrypt_core_macro2 60, 124 scrypt_core_macro2 60, 124
call x86_gen_salsa8_core call gen_salsa8_core
movl 92(%esp), %edi movl 92(%esp), %edi
movl 96(%esp), %esi movl 96(%esp), %esi
x86_scrypt_core_macro3 0, 64 scrypt_core_macro3 0, 64
x86_scrypt_core_macro3 4, 68 scrypt_core_macro3 4, 68
x86_scrypt_core_macro3 8, 72 scrypt_core_macro3 8, 72
x86_scrypt_core_macro3 12, 76 scrypt_core_macro3 12, 76
x86_scrypt_core_macro3 16, 80 scrypt_core_macro3 16, 80
x86_scrypt_core_macro3 20, 84 scrypt_core_macro3 20, 84
x86_scrypt_core_macro3 24, 88 scrypt_core_macro3 24, 88
x86_scrypt_core_macro3 28, 92 scrypt_core_macro3 28, 92
x86_scrypt_core_macro3 32, 96 scrypt_core_macro3 32, 96
x86_scrypt_core_macro3 36, 100 scrypt_core_macro3 36, 100
x86_scrypt_core_macro3 40, 104 scrypt_core_macro3 40, 104
x86_scrypt_core_macro3 44, 108 scrypt_core_macro3 44, 108
x86_scrypt_core_macro3 48, 112 scrypt_core_macro3 48, 112
x86_scrypt_core_macro3 52, 116 scrypt_core_macro3 52, 116
x86_scrypt_core_macro3 56, 120 scrypt_core_macro3 56, 120
x86_scrypt_core_macro3 60, 124 scrypt_core_macro3 60, 124
movl 68(%esp), %ecx movl 68(%esp), %ecx
subl $1, %ecx subl $1, %ecx
ja x86_gen_scrypt_core_loop2 ja gen_scrypt_core_loop2
addl $72, %esp addl $72, %esp
popl %esi popl %esi
@ -557,7 +557,7 @@ x86_gen_scrypt_core_loop2:
ret ret
.macro x86_xmm_salsa8_core_doubleround .macro xmm_salsa8_core_doubleround
paddd %xmm0, %xmm4 paddd %xmm0, %xmm4
movdqa %xmm0, %xmm5 movdqa %xmm0, %xmm5
movdqa %xmm4, %xmm6 movdqa %xmm4, %xmm6
@ -624,16 +624,16 @@ x86_gen_scrypt_core_loop2:
pxor %xmm6, %xmm0 pxor %xmm6, %xmm0
.endm .endm
.macro x86_xmm_salsa8_core .macro xmm_salsa8_core
movdqa %xmm1, %xmm4 movdqa %xmm1, %xmm4
x86_xmm_salsa8_core_doubleround xmm_salsa8_core_doubleround
x86_xmm_salsa8_core_doubleround xmm_salsa8_core_doubleround
x86_xmm_salsa8_core_doubleround xmm_salsa8_core_doubleround
x86_xmm_salsa8_core_doubleround xmm_salsa8_core_doubleround
.endm .endm
.align 32 .align 32
x86_xmm_scrypt_core: xmm_scrypt_core:
movl 20(%esp), %edi movl 20(%esp), %edi
movl 24(%esp), %esi movl 24(%esp), %esi
movl %esp, %ebp movl %esp, %ebp
@ -710,7 +710,7 @@ x86_xmm_scrypt_core:
movl %esi, %edx movl %esi, %edx
leal 131072(%esi), %ecx leal 131072(%esi), %ecx
x86_xmm_scrypt_core_loop1: xmm_scrypt_core_loop1:
movdqa 0(%esp), %xmm0 movdqa 0(%esp), %xmm0
movdqa 16(%esp), %xmm1 movdqa 16(%esp), %xmm1
movdqa 32(%esp), %xmm2 movdqa 32(%esp), %xmm2
@ -736,7 +736,7 @@ x86_xmm_scrypt_core_loop1:
movdqa %xmm1, 16(%esp) movdqa %xmm1, 16(%esp)
movdqa %xmm2, 32(%esp) movdqa %xmm2, 32(%esp)
movdqa %xmm3, 48(%esp) movdqa %xmm3, 48(%esp)
x86_xmm_salsa8_core xmm_salsa8_core
paddd 0(%esp), %xmm0 paddd 0(%esp), %xmm0
paddd 16(%esp), %xmm1 paddd 16(%esp), %xmm1
paddd 32(%esp), %xmm2 paddd 32(%esp), %xmm2
@ -754,7 +754,7 @@ x86_xmm_scrypt_core_loop1:
movdqa %xmm1, 80(%esp) movdqa %xmm1, 80(%esp)
movdqa %xmm2, 96(%esp) movdqa %xmm2, 96(%esp)
movdqa %xmm3, 112(%esp) movdqa %xmm3, 112(%esp)
x86_xmm_salsa8_core xmm_salsa8_core
paddd 64(%esp), %xmm0 paddd 64(%esp), %xmm0
paddd 80(%esp), %xmm1 paddd 80(%esp), %xmm1
paddd 96(%esp), %xmm2 paddd 96(%esp), %xmm2
@ -766,10 +766,10 @@ x86_xmm_scrypt_core_loop1:
addl $128, %edx addl $128, %edx
cmpl %ecx, %edx cmpl %ecx, %edx
jne x86_xmm_scrypt_core_loop1 jne xmm_scrypt_core_loop1
movl $1024, %ecx movl $1024, %ecx
x86_xmm_scrypt_core_loop2: xmm_scrypt_core_loop2:
movdqa 0(%esp), %xmm0 movdqa 0(%esp), %xmm0
movdqa 16(%esp), %xmm1 movdqa 16(%esp), %xmm1
movdqa 32(%esp), %xmm2 movdqa 32(%esp), %xmm2
@ -802,7 +802,7 @@ x86_xmm_scrypt_core_loop2:
movdqa %xmm1, 16(%esp) movdqa %xmm1, 16(%esp)
movdqa %xmm2, 32(%esp) movdqa %xmm2, 32(%esp)
movdqa %xmm3, 48(%esp) movdqa %xmm3, 48(%esp)
x86_xmm_salsa8_core xmm_salsa8_core
paddd 0(%esp), %xmm0 paddd 0(%esp), %xmm0
paddd 16(%esp), %xmm1 paddd 16(%esp), %xmm1
paddd 32(%esp), %xmm2 paddd 32(%esp), %xmm2
@ -820,7 +820,7 @@ x86_xmm_scrypt_core_loop2:
movdqa %xmm1, 80(%esp) movdqa %xmm1, 80(%esp)
movdqa %xmm2, 96(%esp) movdqa %xmm2, 96(%esp)
movdqa %xmm3, 112(%esp) movdqa %xmm3, 112(%esp)
x86_xmm_salsa8_core xmm_salsa8_core
paddd 64(%esp), %xmm0 paddd 64(%esp), %xmm0
paddd 80(%esp), %xmm1 paddd 80(%esp), %xmm1
paddd 96(%esp), %xmm2 paddd 96(%esp), %xmm2
@ -831,7 +831,7 @@ x86_xmm_scrypt_core_loop2:
movdqa %xmm3, 112(%esp) movdqa %xmm3, 112(%esp)
subl $1, %ecx subl $1, %ecx
ja x86_xmm_scrypt_core_loop2 ja xmm_scrypt_core_loop2
# re-shuffle 1st block back # re-shuffle 1st block back
movl 60(%esp), %edx movl 60(%esp), %edx

241
scrypt.c
View file

@ -193,79 +193,13 @@ SHA256_InitState(uint32_t * state)
static const uint32_t passwdpad[12] = {0x00000080, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0x80020000}; static const uint32_t passwdpad[12] = {0x00000080, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0x80020000};
static const uint32_t outerpad[8] = {0x80000000, 0, 0, 0, 0, 0, 0, 0x00000300}; static const uint32_t outerpad[8] = {0x80000000, 0, 0, 0, 0, 0, 0, 0x00000300};
/**
* PBKDF2_SHA256(passwd, passwdlen, salt, saltlen, c, buf, dkLen):
* Compute PBKDF2(passwd, salt, c, dkLen) using HMAC-SHA256 as the PRF, and
* write the output to buf. The value dkLen must be at most 32 * (2^32 - 1).
*/
static inline void static inline void
PBKDF2_SHA256_80_128(const uint32_t * passwd, uint32_t * buf) PBKDF2_SHA256_80_128_init(const uint32_t *passwd, uint32_t tstate[8], uint32_t ostate[8])
{ {
SHA256_CTX PShictx, PShoctx;
uint32_t tstate[8];
uint32_t ihash[8]; uint32_t ihash[8];
uint32_t i;
uint32_t pad[16]; uint32_t pad[16];
static const uint32_t innerpad[11] = {0x00000080, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0xa0040000};
/* If Klen > 64, the key is really SHA256(K). */
SHA256_InitState(tstate);
SHA256_Transform(tstate, passwd, 1);
memcpy(pad, passwd+16, 16);
memcpy(pad+4, passwdpad, 48);
SHA256_Transform(tstate, pad, 1);
memcpy(ihash, tstate, 32);
SHA256_InitState(PShictx.state);
for (i = 0; i < 8; i++)
pad[i] = ihash[i] ^ 0x36363636;
for (; i < 16; i++)
pad[i] = 0x36363636;
SHA256_Transform(PShictx.state, pad, 0);
SHA256_Transform(PShictx.state, passwd, 1);
be32enc_vect(PShictx.buf, passwd+16, 4);
be32enc_vect(PShictx.buf+5, innerpad, 11);
SHA256_InitState(PShoctx.state);
for (i = 0; i < 8; i++)
pad[i] = ihash[i] ^ 0x5c5c5c5c;
for (; i < 16; i++)
pad[i] = 0x5c5c5c5c;
SHA256_Transform(PShoctx.state, pad, 0);
memcpy(PShoctx.buf+8, outerpad, 32);
/* Iterate through the blocks. */
for (i = 0; i < 4; i++) {
uint32_t istate[8];
uint32_t ostate[8];
memcpy(istate, PShictx.state, 32);
PShictx.buf[4] = i + 1;
SHA256_Transform(istate, PShictx.buf, 0);
memcpy(PShoctx.buf, istate, 32);
memcpy(ostate, PShoctx.state, 32);
SHA256_Transform(ostate, PShoctx.buf, 0);
be32enc_vect(buf+i*8, ostate, 8);
}
}
static inline uint32_t
PBKDF2_SHA256_80_128_32(const uint32_t * passwd, const uint32_t * salt)
{
uint32_t tstate[8];
uint32_t ostate[8];
uint32_t ihash[8];
uint32_t i; uint32_t i;
/* Compute HMAC state after processing P and S. */
uint32_t pad[16];
static const uint32_t ihash_finalblk[16] = {0x00000001,0x80000000,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0x00000620};
/* If Klen > 64, the key is really SHA256(K). */
SHA256_InitState(tstate); SHA256_InitState(tstate);
SHA256_Transform(tstate, passwd, 1); SHA256_Transform(tstate, passwd, 1);
memcpy(pad, passwd+16, 16); memcpy(pad, passwd+16, 16);
@ -286,16 +220,63 @@ PBKDF2_SHA256_80_128_32(const uint32_t * passwd, const uint32_t * salt)
for (; i < 16; i++) for (; i < 16; i++)
pad[i] = 0x36363636; pad[i] = 0x36363636;
SHA256_Transform(tstate, pad, 0); SHA256_Transform(tstate, pad, 0);
}
/**
* PBKDF2_SHA256(passwd, passwdlen, salt, saltlen, c, buf, dkLen):
* Compute PBKDF2(passwd, salt, c, dkLen) using HMAC-SHA256 as the PRF, and
* write the output to buf. The value dkLen must be at most 32 * (2^32 - 1).
*/
static inline void
PBKDF2_SHA256_80_128(const uint32_t *tstate, const uint32_t *ostate, const uint32_t *passwd, uint32_t *buf)
{
static const uint32_t innerpad[11] = {0x00000080, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0xa0040000};
SHA256_CTX PShictx, PShoctx;
uint32_t i;
/* If Klen > 64, the key is really SHA256(K). */
memcpy(PShictx.state, tstate, 32);
memcpy(PShoctx.state, ostate, 32);
memcpy(PShoctx.buf+8, outerpad, 32);
SHA256_Transform(PShictx.state, passwd, 1);
be32enc_vect(PShictx.buf, passwd+16, 4);
be32enc_vect(PShictx.buf+5, innerpad, 11);
/* Iterate through the blocks. */
for (i = 0; i < 4; i++) {
uint32_t ist[8];
uint32_t ost[8];
memcpy(ist, PShictx.state, 32);
PShictx.buf[4] = i + 1;
SHA256_Transform(ist, PShictx.buf, 0);
memcpy(PShoctx.buf, ist, 32);
memcpy(ost, PShoctx.state, 32);
SHA256_Transform(ost, PShoctx.buf, 0);
be32enc_vect(buf+i*8, ost, 8);
}
}
static inline void
PBKDF2_SHA256_80_128_32(uint32_t *tstate, uint32_t *ostate, const uint32_t *passwd, const uint32_t *salt, uint32_t *output)
{
static const uint32_t ihash_finalblk[16] = {0x00000001,0x80000000,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0x00000620};
uint32_t pad[16];
uint32_t i;
SHA256_Transform(tstate, salt, 1); SHA256_Transform(tstate, salt, 1);
SHA256_Transform(tstate, salt+16, 1); SHA256_Transform(tstate, salt+16, 1);
SHA256_Transform(tstate, ihash_finalblk, 0); SHA256_Transform(tstate, ihash_finalblk, 0);
memcpy(pad, tstate, 32); memcpy(pad, tstate, 32);
memcpy(pad+8, outerpad, 32); memcpy(pad+8, outerpad, 32);
/* Feed the inner hash to the outer SHA256 operation. */
SHA256_Transform(ostate, pad, 0); SHA256_Transform(ostate, pad, 0);
/* Finish the outer SHA256 operation. */
return byteswap(ostate[7]); for (i = 0; i < 8; i++)
output[i] = byteswap(ostate[i]);
} }
@ -358,34 +339,33 @@ salsa20_8(uint32_t B[16], const uint32_t Bx[16])
B[15] += x15; B[15] += x15;
} }
#if defined(__x86_64__)
void x64_scrypt_core(uint32_t *B, uint32_t *V);
#elif defined(__i386__)
void x86_scrypt_core(uint32_t *B, uint32_t *V);
#endif
/* cpu and memory intensive function to transform a 80 byte buffer into a 32 byte output #if defined(__x86_64__)
scratchpad size needs to be at least 63 + (128 * r * p) + (256 * r + 64) + (128 * r * N) bytes
*/ #define DUAL_SCRYPT
static uint32_t scrypt_1024_1_1_256_sp(const uint32_t* input, char* scratchpad) #define SCRYPT_BUFFER_SIZE (2 * 131072 + 63)
int prefer_dual_scrypt();
void scrypt_core(uint32_t *X, uint32_t *V);
void dual_scrypt_core(uint32_t *X, uint32_t *Y, uint32_t *V);
#elif defined(__i386__)
#define SCRYPT_BUFFER_SIZE (131072 + 63)
void scrypt_core(uint32_t *X, uint32_t *V);
#else
#define SCRYPT_BUFFER_SIZE (131072 + 63)
static inline void scrypt_core(uint32_t *X, uint32_t *V)
{ {
uint32_t * V;
uint32_t X[32];
uint32_t i; uint32_t i;
uint32_t j; uint32_t j;
uint32_t k; uint32_t k;
uint64_t *p1, *p2; uint64_t *p1, *p2;
p1 = (uint64_t *)X; p1 = (uint64_t *)X;
V = (uint32_t *)(((uintptr_t)(scratchpad) + 63) & ~ (uintptr_t)(63));
PBKDF2_SHA256_80_128(input, X);
#if defined(__x86_64__)
x64_scrypt_core(X, V);
#elif defined(__i386__)
x86_scrypt_core(X, V);
#else
for (i = 0; i < 1024; i += 2) { for (i = 0; i < 1024; i += 2) {
memcpy(&V[i * 32], X, 128); memcpy(&V[i * 32], X, 128);
@ -414,32 +394,93 @@ static uint32_t scrypt_1024_1_1_256_sp(const uint32_t* input, char* scratchpad)
salsa20_8(&X[0], &X[16]); salsa20_8(&X[0], &X[16]);
salsa20_8(&X[16], &X[0]); salsa20_8(&X[16], &X[0]);
} }
}
#endif #endif
return PBKDF2_SHA256_80_128_32(input, X); unsigned char *scrypt_buffer_alloc() {
return malloc(SCRYPT_BUFFER_SIZE);
} }
/* cpu and memory intensive function to transform a 80 byte buffer into a 32 byte output
   r = 1, p = 1, N = 1024

   input      - 20 words (80 bytes) to hash; only read.
   scratchpad - working memory for scrypt_core. It may have any alignment:
                V is rounded up to the next 64-byte boundary inside it, which
                can consume up to 63 bytes. The buffer is expected to come
                from scrypt_buffer_alloc(), i.e. to be SCRYPT_BUFFER_SIZE
                bytes long.
   res        - receives the 8-word (32 byte) result.
*/
static void scrypt_1024_1_1_256_sp(const uint32_t* input, unsigned char *scratchpad, uint32_t *res)
{
	uint32_t tstate[8], ostate[8];
	uint32_t *V;
	uint32_t X[32];

	/* Align the scratch area to 64 bytes. */
	V = (uint32_t *)(((uintptr_t)(scratchpad) + 63) & ~ (uintptr_t)(63));

	/* Set up the two hash states once; they are reused by the final step. */
	PBKDF2_SHA256_80_128_init(input, tstate, ostate);
	/* Expand the input into the 32-word block X. */
	PBKDF2_SHA256_80_128(tstate, ostate, input, X);

	/* Memory-hard mixing of X using V as scratch space. */
	scrypt_core(X, V);

	/*
	 * Called as a plain statement: this function returns void, and C does
	 * not allow `return <expression>;` in a void function.
	 */
	PBKDF2_SHA256_80_128_32(tstate, ostate, input, X, res);
}
#ifdef DUAL_SCRYPT
/*
 * Two-lane variant of scrypt_1024_1_1_256_sp: hashes input1 and input2 in one
 * go, sharing a single scratchpad, and writes the 8-word results to res1 and
 * res2 respectively.  The scratch area handed to dual_scrypt_core is the
 * scratchpad rounded up to the next 64-byte boundary.
 */
static void dual_scrypt_1024_1_1_256_sp(const uint32_t *input1, const uint32_t *input2, unsigned char *scratchpad, uint32_t *res1, uint32_t *res2)
{
	uint32_t inner_a[8], outer_a[8];
	uint32_t inner_b[8], outer_b[8];
	uint32_t lane_a[32], lane_b[32];
	const uintptr_t align_mask = 63;
	uint32_t *scratch = (uint32_t *)(((uintptr_t)scratchpad + align_mask) & ~align_mask);

	/* First lane: set up the hash states and expand the input. */
	PBKDF2_SHA256_80_128_init(input1, inner_a, outer_a);
	PBKDF2_SHA256_80_128(inner_a, outer_a, input1, lane_a);

	/* Second lane: same steps on independent state. */
	PBKDF2_SHA256_80_128_init(input2, inner_b, outer_b);
	PBKDF2_SHA256_80_128(inner_b, outer_b, input2, lane_b);

	/* Mix both lanes together over the shared scratch area. */
	dual_scrypt_core(lane_a, lane_b, scratch);

	/* Compress each lane into its output hash. */
	PBKDF2_SHA256_80_128_32(inner_a, outer_a, input1, lane_a, res1);
	PBKDF2_SHA256_80_128_32(inner_b, outer_b, input2, lane_b, res2);
}
#endif
int scanhash_scrypt(int thr_id, unsigned char *pdata, unsigned char *scratchbuf, int scanhash_scrypt(int thr_id, unsigned char *pdata, unsigned char *scratchbuf,
const unsigned char *ptarget, const unsigned char *ptarget,
uint32_t max_nonce, unsigned long *hashes_done) uint32_t max_nonce, unsigned long *hashes_done)
{ {
uint32_t data[20]; uint32_t data[20], hash[8];
uint32_t tmp_hash7; #ifdef DUAL_SCRYPT
uint32_t data2[20], hash2[8];
int use_dual;
#endif
uint32_t n = 0; uint32_t n = 0;
uint32_t Htarg = ((const uint32_t *)ptarget)[7]; uint32_t Htarg = ((const uint32_t *)ptarget)[7];
int i;
work_restart[thr_id].restart = 0; work_restart[thr_id].restart = 0;
be32enc_vect(data, (const uint32_t *)pdata, 19); be32enc_vect(data, (const uint32_t *)pdata, 19);
#ifdef DUAL_SCRYPT
memcpy(data2, data, 80);
use_dual = prefer_dual_scrypt();
#endif
while (1) { while (1) {
n++; data[19] = n++;
data[19] = n; #ifdef DUAL_SCRYPT
tmp_hash7 = scrypt_1024_1_1_256_sp(data, scratchbuf); if (use_dual) {
data2[19] = n++;
dual_scrypt_1024_1_1_256_sp(data, data2, scratchbuf, hash, hash2);
if (hash2[7] <= Htarg) {
((uint32_t *)pdata)[19] = byteswap(data2[19]);
*hashes_done = n;
return true;
}
} else {
scrypt_1024_1_1_256_sp(data, scratchbuf, hash);
}
#else
scrypt_1024_1_1_256_sp(data, scratchbuf, hash);
#endif
if (tmp_hash7 <= Htarg) { if (hash[7] <= Htarg) {
((uint32_t *)pdata)[19] = byteswap(n); ((uint32_t *)pdata)[19] = byteswap(data[19]);
*hashes_done = n; *hashes_done = n;
return true; return true;
} }

2
util.c
View file

@ -94,7 +94,9 @@ void applog(int prio, const char *fmt, ...)
tm.tm_min, tm.tm_min,
tm.tm_sec, tm.tm_sec,
fmt); fmt);
pthread_mutex_lock(&time_lock);
vfprintf(stderr, f, ap); /* atomic write to stderr */ vfprintf(stderr, f, ap); /* atomic write to stderr */
pthread_mutex_unlock(&time_lock);
} }
va_end(ap); va_end(ap);
} }