From 36225b42064b346d64f00c1278347a21852e9cfd Mon Sep 17 00:00:00 2001 From: pooler Date: Sat, 24 Dec 2011 12:22:06 +0100 Subject: [PATCH] Some more optimization --- cpu-miner.c | 4 +- miner.h | 1 + scrypt-x64.S | 742 +++++++++++++++++++++++++++++++++++++++++++++++---- scrypt-x86.S | 266 +++++++++--------- scrypt.c | 243 ++++++++++------- util.c | 2 + 6 files changed, 974 insertions(+), 284 deletions(-) diff --git a/cpu-miner.c b/cpu-miner.c index 6e1c4e4..9d2ae91 100644 --- a/cpu-miner.c +++ b/cpu-miner.c @@ -551,7 +551,7 @@ static void *miner_thread(void *userdata) if (opt_algo == ALGO_SCRYPT) { - scratchbuf = malloc(131583); + scratchbuf = scrypt_buffer_alloc(); max_nonce = 0xffff; } @@ -955,7 +955,7 @@ int main(int argc, char *argv[]) } applog(LOG_INFO, "%d miner threads started, " - "using SHA256 '%s' algorithm.", + "using '%s' algorithm.", opt_n_threads, algo_names[opt_algo]); diff --git a/miner.h b/miner.h index 07423e1..df6d01e 100644 --- a/miner.h +++ b/miner.h @@ -127,6 +127,7 @@ extern json_t *json_rpc_call(CURL *curl, const char *url, const char *userpass, extern char *bin2hex(const unsigned char *p, size_t len); extern bool hex2bin(unsigned char *p, const char *hexstr, size_t len); +extern unsigned char *scrypt_buffer_alloc(); extern int scanhash_scrypt(int, unsigned char *pdata, unsigned char *scratchbuf, const unsigned char *ptarget, uint32_t max_nonce, unsigned long *nHashesDone); diff --git a/scrypt-x64.S b/scrypt-x64.S index 04af376..a3267c1 100644 --- a/scrypt-x64.S +++ b/scrypt-x64.S @@ -24,7 +24,7 @@ #if defined(__x86_64__) -.macro x64_gen_salsa8_core_doubleround +.macro gen_salsa8_core_doubleround movq 72(%rsp), %r15 leaq (%r14, %rdx), %rbp roll $7, %ebp @@ -137,7 +137,7 @@ .text .align 32 -x64_gen_salsa8_core: +gen_salsa8_core: # 0: %rdx, %rdi, %rcx, %rsi movq 8(%rsp), %rdi movq %rdi, %rdx @@ -170,52 +170,52 @@ x64_gen_salsa8_core: shrq $32, %r15 movq %r15, 88(%rsp) - x64_gen_salsa8_core_doubleround - x64_gen_salsa8_core_doubleround - x64_gen_salsa8_core_doubleround - x64_gen_salsa8_core_doubleround + gen_salsa8_core_doubleround + gen_salsa8_core_doubleround + gen_salsa8_core_doubleround + gen_salsa8_core_doubleround movl %edx, %edx shlq $32, %rdi addq %rdi, %rdx - movq %rdx, %xmm0 + movd %rdx, %xmm0 movl %ecx, %ecx shlq $32, %rsi addq %rsi, %rcx - movq %rcx, %xmm4 + movd %rcx, %xmm4 movq 72(%rsp), %rdi movl %r9d, %r9d shlq $32, %rdi addq %rdi, %r9 - movq %r9, %xmm1 + movd %r9, %xmm1 movl %eax, %eax shlq $32, %r8 addq %r8, %rax - movq %rax, %xmm5 + movd %rax, %xmm5 movl %r11d, %r11d shlq $32, %r10 addq %r10, %r11 - movq %r11, %xmm2 + movd %r11, %xmm2 movl 48(%rsp), %r8d shlq $32, %r12 addq %r12, %r8 - movq %r8, %xmm6 + movd %r8, %xmm6 movl %r14d, %r14d shlq $32, %r13 addq %r13, %r14 - movq %r14, %xmm3 + movd %r14, %xmm3 movq 88(%rsp), %rdi movl %ebx, %ebx shlq $32, %rdi addq %rdi, %rbx - movq %rbx, %xmm7 + movd %rbx, %xmm7 punpcklqdq %xmm4, %xmm0 punpcklqdq %xmm5, %xmm1 @@ -236,10 +236,10 @@ x64_gen_salsa8_core: .text .align 32 - .globl x64_scrypt_core - .globl _x64_scrypt_core -x64_scrypt_core: -_x64_scrypt_core: + .globl scrypt_core + .globl _scrypt_core +scrypt_core: +_scrypt_core: pushq %rbx pushq %rbp pushq %r12 @@ -264,7 +264,7 @@ _x64_scrypt_core: movq %rdx, %rsi #endif -.macro x64_scrypt_core_cleanup +.macro scrypt_core_cleanup #if defined(WIN64) popq %rsi popq %rdi @@ -292,13 +292,13 @@ _x64_scrypt_core: xorl %eax, %eax cpuid cmpl $0x6c65746e, %ecx - jne x64_gen_scrypt_core + jne gen_scrypt_core cmpl $0x49656e69, %edx - jne x64_gen_scrypt_core + jne 
gen_scrypt_core cmpl $0x756e6547, %ebx - je x64_xmm_scrypt_core + je xmm_scrypt_core -x64_gen_scrypt_core: +gen_scrypt_core: subq $136, %rsp movdqa 0(%rdi), %xmm8 movdqa 16(%rdi), %xmm9 @@ -313,7 +313,7 @@ x64_gen_scrypt_core: movq %rdi, 104(%rsp) movq %rsi, 112(%rsp) movq %rcx, 120(%rsp) -x64_gen_scrypt_core_loop1: +gen_scrypt_core_loop1: movdqa %xmm8, 0(%rsi) movdqa %xmm9, 16(%rsi) movdqa %xmm10, 32(%rsi) @@ -332,7 +332,7 @@ x64_gen_scrypt_core_loop1: movdqa %xmm10, 32(%rsp) movdqa %xmm11, 48(%rsp) movq %rsi, 128(%rsp) - call x64_gen_salsa8_core + call gen_salsa8_core paddd %xmm0, %xmm8 paddd %xmm1, %xmm9 paddd %xmm2, %xmm10 @@ -346,7 +346,7 @@ x64_gen_scrypt_core_loop1: movdqa %xmm13, 16(%rsp) movdqa %xmm14, 32(%rsp) movdqa %xmm15, 48(%rsp) - call x64_gen_salsa8_core + call gen_salsa8_core movq 128(%rsp), %rsi paddd %xmm0, %xmm12 paddd %xmm1, %xmm13 @@ -356,10 +356,10 @@ x64_gen_scrypt_core_loop1: addq $128, %rsi movq 120(%rsp), %rcx cmpq %rcx, %rsi - jne x64_gen_scrypt_core_loop1 + jne gen_scrypt_core_loop1 movq $1024, %rcx -x64_gen_scrypt_core_loop2: +gen_scrypt_core_loop2: movq 112(%rsp), %rsi movd %xmm12, %edx andl $1023, %edx @@ -390,7 +390,7 @@ x64_gen_scrypt_core_loop2: movdqa %xmm10, 32(%rsp) movdqa %xmm11, 48(%rsp) movq %rcx, 128(%rsp) - call x64_gen_salsa8_core + call gen_salsa8_core paddd %xmm0, %xmm8 paddd %xmm1, %xmm9 paddd %xmm2, %xmm10 @@ -404,7 +404,7 @@ x64_gen_scrypt_core_loop2: movdqa %xmm13, 16(%rsp) movdqa %xmm14, 32(%rsp) movdqa %xmm15, 48(%rsp) - call x64_gen_salsa8_core + call gen_salsa8_core movq 128(%rsp), %rcx paddd %xmm0, %xmm12 paddd %xmm1, %xmm13 @@ -412,7 +412,7 @@ x64_gen_scrypt_core_loop2: paddd %xmm3, %xmm15 subq $1, %rcx - ja x64_gen_scrypt_core_loop2 + ja gen_scrypt_core_loop2 movq 104(%rsp), %rdi movdqa %xmm8, 0(%rdi) @@ -425,11 +425,11 @@ x64_gen_scrypt_core_loop2: movdqa %xmm15, 112(%rdi) addq $136, %rsp - x64_scrypt_core_cleanup + scrypt_core_cleanup ret -.macro x64_xmm_salsa8_core_doubleround +.macro xmm_salsa8_core_doubleround paddd %xmm0, %xmm4 movdqa %xmm0, %xmm5 movdqa %xmm4, %xmm6 @@ -495,16 +495,16 @@ x64_gen_scrypt_core_loop2: pxor %xmm6, %xmm0 .endm -.macro x64_xmm_salsa8_core +.macro xmm_salsa8_core movdqa %xmm1, %xmm4 - x64_xmm_salsa8_core_doubleround - x64_xmm_salsa8_core_doubleround - x64_xmm_salsa8_core_doubleround - x64_xmm_salsa8_core_doubleround + xmm_salsa8_core_doubleround + xmm_salsa8_core_doubleround + xmm_salsa8_core_doubleround + xmm_salsa8_core_doubleround .endm .align 32 -x64_xmm_scrypt_core: +xmm_scrypt_core: # shuffle 1st block into %xmm8-%xmm11 movl 60(%rdi), %edx movl 44(%rdi), %ecx @@ -623,7 +623,7 @@ x64_xmm_scrypt_core: movq %rsi, %rdx leaq 131072(%rsi), %rcx -x64_xmm_scrypt_core_loop1: +xmm_scrypt_core_loop1: movdqa %xmm8, 0(%rdx) movdqa %xmm9, 16(%rdx) movdqa %xmm10, 32(%rdx) @@ -641,7 +641,7 @@ x64_xmm_scrypt_core_loop1: movdqa %xmm9, %xmm1 movdqa %xmm10, %xmm2 movdqa %xmm11, %xmm3 - x64_xmm_salsa8_core + xmm_salsa8_core paddd %xmm0, %xmm8 paddd %xmm1, %xmm9 paddd %xmm2, %xmm10 @@ -655,7 +655,7 @@ x64_xmm_scrypt_core_loop1: movdqa %xmm13, %xmm1 movdqa %xmm14, %xmm2 movdqa %xmm15, %xmm3 - x64_xmm_salsa8_core + xmm_salsa8_core paddd %xmm0, %xmm12 paddd %xmm1, %xmm13 paddd %xmm2, %xmm14 @@ -663,10 +663,10 @@ x64_xmm_scrypt_core_loop1: addq $128, %rdx cmpq %rcx, %rdx - jne x64_xmm_scrypt_core_loop1 + jne xmm_scrypt_core_loop1 movq $1024, %rcx -x64_xmm_scrypt_core_loop2: +xmm_scrypt_core_loop2: movd %xmm12, %edx andl $1023, %edx shll $7, %edx @@ -695,7 +695,7 @@ x64_xmm_scrypt_core_loop2: movdqa %xmm9, %xmm1 movdqa 
%xmm10, %xmm2 movdqa %xmm11, %xmm3 - x64_xmm_salsa8_core + xmm_salsa8_core paddd %xmm0, %xmm8 paddd %xmm1, %xmm9 paddd %xmm2, %xmm10 @@ -709,14 +709,14 @@ x64_xmm_scrypt_core_loop2: movdqa %xmm13, %xmm1 movdqa %xmm14, %xmm2 movdqa %xmm15, %xmm3 - x64_xmm_salsa8_core + xmm_salsa8_core paddd %xmm0, %xmm12 paddd %xmm1, %xmm13 paddd %xmm2, %xmm14 paddd %xmm3, %xmm15 subq $1, %rcx - ja x64_xmm_scrypt_core_loop2 + ja xmm_scrypt_core_loop2 # re-shuffle 1st block back movd %xmm8, %eax @@ -810,7 +810,653 @@ x64_xmm_scrypt_core_loop2: movl %ebx, 92(%rdi) movl %eax, 76(%rdi) - x64_scrypt_core_cleanup + scrypt_core_cleanup + ret + + + .text + .align 32 + .globl prefer_dual_scrypt + .globl _prefer_dual_scrypt +prefer_dual_scrypt: +_prefer_dual_scrypt: + pushq %rbx + xorq %rax, %rax + cpuid + xorq %rax, %rax + cmpl $0x6c65746e, %ecx + jne prefer_dual_scrypt_false + cmpl $0x49656e69, %edx + jne prefer_dual_scrypt_false + cmpl $0x756e6547, %ebx + jne prefer_dual_scrypt_false + incl %eax +prefer_dual_scrypt_false: + popq %rbx + ret + + +.macro xmm_dual_salsa8_core_doubleround + paddd %xmm0, %xmm4 + paddd %xmm8, %xmm12 + movdqa %xmm0, %xmm5 + movdqa %xmm8, %xmm13 + movdqa %xmm4, %xmm6 + movdqa %xmm12, %xmm14 + pslld $7, %xmm4 + pslld $7, %xmm12 + psrld $25, %xmm6 + psrld $25, %xmm14 + pxor %xmm4, %xmm3 + pxor %xmm12, %xmm11 + pxor %xmm6, %xmm3 + pxor %xmm14, %xmm11 + paddd %xmm3, %xmm5 + paddd %xmm11, %xmm13 + movdqa %xmm3, %xmm4 + movdqa %xmm11, %xmm12 + movdqa %xmm5, %xmm6 + movdqa %xmm13, %xmm14 + pslld $9, %xmm5 + pslld $9, %xmm13 + psrld $23, %xmm6 + psrld $23, %xmm14 + pxor %xmm5, %xmm2 + pxor %xmm13, %xmm10 + pshufd $0x93, %xmm3, %xmm3 + pshufd $0x93, %xmm11, %xmm11 + pxor %xmm6, %xmm2 + pxor %xmm14, %xmm10 + paddd %xmm2, %xmm4 + paddd %xmm10, %xmm12 + movdqa %xmm2, %xmm5 + movdqa %xmm10, %xmm13 + movdqa %xmm4, %xmm6 + movdqa %xmm12, %xmm14 + pslld $13, %xmm4 + pslld $13, %xmm12 + psrld $19, %xmm6 + psrld $19, %xmm14 + pxor %xmm4, %xmm1 + pxor %xmm12, %xmm9 + pshufd $0x4e, %xmm2, %xmm2 + pshufd $0x4e, %xmm10, %xmm10 + pxor %xmm6, %xmm1 + pxor %xmm14, %xmm9 + paddd %xmm1, %xmm5 + paddd %xmm9, %xmm13 + movdqa %xmm3, %xmm4 + movdqa %xmm11, %xmm12 + movdqa %xmm5, %xmm6 + movdqa %xmm13, %xmm14 + pslld $18, %xmm5 + pslld $18, %xmm13 + psrld $14, %xmm6 + psrld $14, %xmm14 + pxor %xmm5, %xmm0 + pxor %xmm13, %xmm8 + pshufd $0x39, %xmm1, %xmm1 + pshufd $0x39, %xmm9, %xmm9 + pxor %xmm6, %xmm0 + pxor %xmm14, %xmm8 + + paddd %xmm0, %xmm4 + paddd %xmm8, %xmm12 + movdqa %xmm0, %xmm5 + movdqa %xmm8, %xmm13 + movdqa %xmm4, %xmm6 + movdqa %xmm12, %xmm14 + pslld $7, %xmm4 + pslld $7, %xmm12 + psrld $25, %xmm6 + psrld $25, %xmm14 + pxor %xmm4, %xmm1 + pxor %xmm12, %xmm9 + pxor %xmm6, %xmm1 + pxor %xmm14, %xmm9 + paddd %xmm1, %xmm5 + paddd %xmm9, %xmm13 + movdqa %xmm1, %xmm4 + movdqa %xmm9, %xmm12 + movdqa %xmm5, %xmm6 + movdqa %xmm13, %xmm14 + pslld $9, %xmm5 + pslld $9, %xmm13 + psrld $23, %xmm6 + psrld $23, %xmm14 + pxor %xmm5, %xmm2 + pxor %xmm13, %xmm10 + pshufd $0x93, %xmm1, %xmm1 + pshufd $0x93, %xmm9, %xmm9 + pxor %xmm6, %xmm2 + pxor %xmm14, %xmm10 + paddd %xmm2, %xmm4 + paddd %xmm10, %xmm12 + movdqa %xmm2, %xmm5 + movdqa %xmm10, %xmm13 + movdqa %xmm4, %xmm6 + movdqa %xmm12, %xmm14 + pslld $13, %xmm4 + pslld $13, %xmm12 + psrld $19, %xmm6 + psrld $19, %xmm14 + pxor %xmm4, %xmm3 + pxor %xmm12, %xmm11 + pshufd $0x4e, %xmm2, %xmm2 + pshufd $0x4e, %xmm10, %xmm10 + pxor %xmm6, %xmm3 + pxor %xmm14, %xmm11 + paddd %xmm3, %xmm5 + paddd %xmm11, %xmm13 + movdqa %xmm1, %xmm4 + movdqa %xmm9, %xmm12 + movdqa %xmm5, %xmm6 + 
movdqa %xmm13, %xmm14 + pslld $18, %xmm5 + pslld $18, %xmm13 + psrld $14, %xmm6 + psrld $14, %xmm14 + pxor %xmm5, %xmm0 + pxor %xmm13, %xmm8 + pshufd $0x39, %xmm3, %xmm3 + pshufd $0x39, %xmm11, %xmm11 + pxor %xmm6, %xmm0 + pxor %xmm14, %xmm8 +.endm + +.macro xmm_dual_salsa8_core + movdqa %xmm1, %xmm4 + movdqa %xmm9, %xmm12 + xmm_dual_salsa8_core_doubleround + xmm_dual_salsa8_core_doubleround + xmm_dual_salsa8_core_doubleround + xmm_dual_salsa8_core_doubleround +.endm + + + .text + .align 32 + .globl dual_scrypt_core + .globl _dual_scrypt_core +dual_scrypt_core: +_dual_scrypt_core: + pushq %rbx + pushq %rbp +#if defined(WIN64) + subq $176, %rsp + movdqa %xmm6, 8(%rsp) + movdqa %xmm7, 24(%rsp) + movdqa %xmm8, 40(%rsp) + movdqa %xmm9, 56(%rsp) + movdqa %xmm10, 72(%rsp) + movdqa %xmm11, 88(%rsp) + movdqa %xmm12, 104(%rsp) + movdqa %xmm13, 120(%rsp) + movdqa %xmm14, 136(%rsp) + movdqa %xmm15, 152(%rsp) + pushq %rdi + pushq %rsi + movq %rcx, %rdi + movq %rdx, %rsi + movq %r8, %rdx +#endif + + # shuffle 1st block + movl 60(%rdi), %ebp + movl 44(%rdi), %ecx + movl 28(%rdi), %ebx + movl 12(%rdi), %eax + movl %ebp, 12(%rdi) + movl %ecx, 28(%rdi) + movl %ebx, 44(%rdi) + movl %eax, 60(%rdi) + movl 40(%rdi), %ecx + movl 8(%rdi), %eax + movl 48(%rdi), %ebp + movl 16(%rdi), %ebx + movl %ecx, 8(%rdi) + movl %eax, 40(%rdi) + movl %ebp, 16(%rdi) + movl %ebx, 48(%rdi) + movl 20(%rdi), %ebx + movl 4(%rdi), %eax + movl 52(%rdi), %ebp + movl 36(%rdi), %ecx + movl %ebx, 4(%rdi) + movl %eax, 20(%rdi) + movl %ebp, 36(%rdi) + movl %ecx, 52(%rdi) + + # shuffle 2nd block + movl 124(%rdi), %ebp + movl 108(%rdi), %ecx + movl 92(%rdi), %ebx + movl 76(%rdi), %eax + movl %ebp, 76(%rdi) + movl %ecx, 92(%rdi) + movl %ebx, 108(%rdi) + movl %eax, 124(%rdi) + movl 104(%rdi), %ecx + movl 72(%rdi), %eax + movl 112(%rdi), %ebp + movl 80(%rdi), %ebx + movl %ecx, 72(%rdi) + movl %eax, 104(%rdi) + movl %ebp, 80(%rdi) + movl %ebx, 112(%rdi) + movl 84(%rdi), %ebx + movl 68(%rdi), %eax + movl 116(%rdi), %ebp + movl 100(%rdi), %ecx + movl %ebx, 68(%rdi) + movl %eax, 84(%rdi) + movl %ebp, 100(%rdi) + movl %ecx, 116(%rdi) + + # shuffle 3rd block + movl 60(%rsi), %ebp + movl 44(%rsi), %ecx + movl 28(%rsi), %ebx + movl 12(%rsi), %eax + movl %ebp, 12(%rsi) + movl %ecx, 28(%rsi) + movl %ebx, 44(%rsi) + movl %eax, 60(%rsi) + movl 40(%rsi), %ecx + movl 8(%rsi), %eax + movl 48(%rsi), %ebp + movl 16(%rsi), %ebx + movl %ecx, 8(%rsi) + movl %eax, 40(%rsi) + movl %ebp, 16(%rsi) + movl %ebx, 48(%rsi) + movl 20(%rsi), %ebx + movl 4(%rsi), %eax + movl 52(%rsi), %ebp + movl 36(%rsi), %ecx + movl %ebx, 4(%rsi) + movl %eax, 20(%rsi) + movl %ebp, 36(%rsi) + movl %ecx, 52(%rsi) + + # shuffle 4th block + movl 124(%rsi), %ebp + movl 108(%rsi), %ecx + movl 92(%rsi), %ebx + movl 76(%rsi), %eax + movl %ebp, 76(%rsi) + movl %ecx, 92(%rsi) + movl %ebx, 108(%rsi) + movl %eax, 124(%rsi) + movl 104(%rsi), %ecx + movl 72(%rsi), %eax + movl 112(%rsi), %ebp + movl 80(%rsi), %ebx + movl %ecx, 72(%rsi) + movl %eax, 104(%rsi) + movl %ebp, 80(%rsi) + movl %ebx, 112(%rsi) + movl 84(%rsi), %ebx + movl 68(%rsi), %eax + movl 116(%rsi), %ebp + movl 100(%rsi), %ecx + movl %ebx, 68(%rsi) + movl %eax, 84(%rsi) + movl %ebp, 100(%rsi) + movl %ecx, 116(%rsi) + + movq %rdx, %rbp + leaq 262144(%rdx), %rcx + .align 8 +dual_scrypt_core_loop1: + movdqa 0(%rdi), %xmm0 + movdqa 16(%rdi), %xmm1 + movdqa 32(%rdi), %xmm2 + movdqa 48(%rdi), %xmm3 + movdqa 64(%rdi), %xmm4 + movdqa 80(%rdi), %xmm5 + movdqa 96(%rdi), %xmm6 + movdqa 112(%rdi), %xmm7 + movdqa 0(%rsi), %xmm8 + movdqa 16(%rsi), %xmm9 + 
movdqa 32(%rsi), %xmm10 + movdqa 48(%rsi), %xmm11 + movdqa 64(%rsi), %xmm12 + movdqa 80(%rsi), %xmm13 + movdqa 96(%rsi), %xmm14 + movdqa 112(%rsi), %xmm15 + movdqa %xmm0, 0(%rbp) + movdqa %xmm1, 16(%rbp) + movdqa %xmm2, 32(%rbp) + movdqa %xmm3, 48(%rbp) + movdqa %xmm4, 64(%rbp) + movdqa %xmm5, 80(%rbp) + movdqa %xmm6, 96(%rbp) + movdqa %xmm7, 112(%rbp) + movdqa %xmm8, 128(%rbp) + movdqa %xmm9, 144(%rbp) + movdqa %xmm10, 160(%rbp) + movdqa %xmm11, 176(%rbp) + movdqa %xmm12, 192(%rbp) + movdqa %xmm13, 208(%rbp) + movdqa %xmm14, 224(%rbp) + movdqa %xmm15, 240(%rbp) + + pxor %xmm4, %xmm0 + pxor %xmm5, %xmm1 + pxor %xmm6, %xmm2 + pxor %xmm7, %xmm3 + pxor %xmm12, %xmm8 + pxor %xmm13, %xmm9 + pxor %xmm14, %xmm10 + pxor %xmm15, %xmm11 + movdqa %xmm0, 0(%rdi) + movdqa %xmm1, 16(%rdi) + movdqa %xmm2, 32(%rdi) + movdqa %xmm3, 48(%rdi) + movdqa %xmm8, 0(%rsi) + movdqa %xmm9, 16(%rsi) + movdqa %xmm10, 32(%rsi) + movdqa %xmm11, 48(%rsi) + xmm_dual_salsa8_core + paddd 0(%rdi), %xmm0 + paddd 16(%rdi), %xmm1 + paddd 32(%rdi), %xmm2 + paddd 48(%rdi), %xmm3 + paddd 0(%rsi), %xmm8 + paddd 16(%rsi), %xmm9 + paddd 32(%rsi), %xmm10 + paddd 48(%rsi), %xmm11 + movdqa %xmm0, 0(%rdi) + movdqa %xmm1, 16(%rdi) + movdqa %xmm2, 32(%rdi) + movdqa %xmm3, 48(%rdi) + movdqa %xmm8, 0(%rsi) + movdqa %xmm9, 16(%rsi) + movdqa %xmm10, 32(%rsi) + movdqa %xmm11, 48(%rsi) + + pxor 64(%rdi), %xmm0 + pxor 80(%rdi), %xmm1 + pxor 96(%rdi), %xmm2 + pxor 112(%rdi), %xmm3 + pxor 64(%rsi), %xmm8 + pxor 80(%rsi), %xmm9 + pxor 96(%rsi), %xmm10 + pxor 112(%rsi), %xmm11 + movdqa %xmm0, 64(%rdi) + movdqa %xmm1, 80(%rdi) + movdqa %xmm2, 96(%rdi) + movdqa %xmm3, 112(%rdi) + movdqa %xmm8, 64(%rsi) + movdqa %xmm9, 80(%rsi) + movdqa %xmm10, 96(%rsi) + movdqa %xmm11, 112(%rsi) + xmm_dual_salsa8_core + paddd 64(%rdi), %xmm0 + paddd 80(%rdi), %xmm1 + paddd 96(%rdi), %xmm2 + paddd 112(%rdi), %xmm3 + paddd 64(%rsi), %xmm8 + paddd 80(%rsi), %xmm9 + paddd 96(%rsi), %xmm10 + paddd 112(%rsi), %xmm11 + movdqa %xmm0, 64(%rdi) + movdqa %xmm1, 80(%rdi) + movdqa %xmm2, 96(%rdi) + movdqa %xmm3, 112(%rdi) + movdqa %xmm8, 64(%rsi) + movdqa %xmm9, 80(%rsi) + movdqa %xmm10, 96(%rsi) + movdqa %xmm11, 112(%rsi) + + addq $256, %rbp + cmpq %rcx, %rbp + jne dual_scrypt_core_loop1 + + movq $1024, %rcx + .align 8 +dual_scrypt_core_loop2: + movl 64(%rdi), %ebp + andl $1023, %ebp + shll $8, %ebp + movdqa 0(%rdx, %rbp), %xmm0 + movdqa 16(%rdx, %rbp), %xmm1 + movdqa 32(%rdx, %rbp), %xmm2 + movdqa 48(%rdx, %rbp), %xmm3 + movdqa 64(%rdx, %rbp), %xmm4 + movdqa 80(%rdx, %rbp), %xmm5 + movdqa 96(%rdx, %rbp), %xmm6 + movdqa 112(%rdx, %rbp), %xmm7 + movl 64(%rsi), %ebp + andl $1023, %ebp + shll $8, %ebp + addl $128, %ebp + movdqa 0(%rdx, %rbp), %xmm8 + movdqa 16(%rdx, %rbp), %xmm9 + movdqa 32(%rdx, %rbp), %xmm10 + movdqa 48(%rdx, %rbp), %xmm11 + movdqa 64(%rdx, %rbp), %xmm12 + movdqa 80(%rdx, %rbp), %xmm13 + movdqa 96(%rdx, %rbp), %xmm14 + movdqa 112(%rdx, %rbp), %xmm15 + pxor 0(%rdi), %xmm0 + pxor 16(%rdi), %xmm1 + pxor 32(%rdi), %xmm2 + pxor 48(%rdi), %xmm3 + pxor 64(%rdi), %xmm4 + pxor 80(%rdi), %xmm5 + pxor 96(%rdi), %xmm6 + pxor 112(%rdi), %xmm7 + pxor 0(%rsi), %xmm8 + pxor 16(%rsi), %xmm9 + pxor 32(%rsi), %xmm10 + pxor 48(%rsi), %xmm11 + pxor 64(%rsi), %xmm12 + pxor 80(%rsi), %xmm13 + pxor 96(%rsi), %xmm14 + pxor 112(%rsi), %xmm15 + + pxor %xmm4, %xmm0 + pxor %xmm5, %xmm1 + pxor %xmm6, %xmm2 + pxor %xmm7, %xmm3 + pxor %xmm12, %xmm8 + pxor %xmm13, %xmm9 + pxor %xmm14, %xmm10 + pxor %xmm15, %xmm11 + movdqa %xmm0, 0(%rdi) + movdqa %xmm1, 16(%rdi) + movdqa %xmm2, 32(%rdi) + movdqa 
%xmm3, 48(%rdi) + movdqa %xmm4, 64(%rdi) + movdqa %xmm5, 80(%rdi) + movdqa %xmm6, 96(%rdi) + movdqa %xmm7, 112(%rdi) + movdqa %xmm8, 0(%rsi) + movdqa %xmm9, 16(%rsi) + movdqa %xmm10, 32(%rsi) + movdqa %xmm11, 48(%rsi) + movdqa %xmm12, 64(%rsi) + movdqa %xmm13, 80(%rsi) + movdqa %xmm14, 96(%rsi) + movdqa %xmm15, 112(%rsi) + xmm_dual_salsa8_core + paddd 0(%rdi), %xmm0 + paddd 16(%rdi), %xmm1 + paddd 32(%rdi), %xmm2 + paddd 48(%rdi), %xmm3 + paddd 0(%rsi), %xmm8 + paddd 16(%rsi), %xmm9 + paddd 32(%rsi), %xmm10 + paddd 48(%rsi), %xmm11 + movdqa %xmm0, 0(%rdi) + movdqa %xmm1, 16(%rdi) + movdqa %xmm2, 32(%rdi) + movdqa %xmm3, 48(%rdi) + movdqa %xmm8, 0(%rsi) + movdqa %xmm9, 16(%rsi) + movdqa %xmm10, 32(%rsi) + movdqa %xmm11, 48(%rsi) + + pxor 64(%rdi), %xmm0 + pxor 80(%rdi), %xmm1 + pxor 96(%rdi), %xmm2 + pxor 112(%rdi), %xmm3 + pxor 64(%rsi), %xmm8 + pxor 80(%rsi), %xmm9 + pxor 96(%rsi), %xmm10 + pxor 112(%rsi), %xmm11 + movdqa %xmm0, 64(%rdi) + movdqa %xmm1, 80(%rdi) + movdqa %xmm2, 96(%rdi) + movdqa %xmm3, 112(%rdi) + movdqa %xmm8, 64(%rsi) + movdqa %xmm9, 80(%rsi) + movdqa %xmm10, 96(%rsi) + movdqa %xmm11, 112(%rsi) + xmm_dual_salsa8_core + paddd 64(%rdi), %xmm0 + paddd 80(%rdi), %xmm1 + paddd 96(%rdi), %xmm2 + paddd 112(%rdi), %xmm3 + paddd 64(%rsi), %xmm8 + paddd 80(%rsi), %xmm9 + paddd 96(%rsi), %xmm10 + paddd 112(%rsi), %xmm11 + movdqa %xmm0, 64(%rdi) + movdqa %xmm1, 80(%rdi) + movdqa %xmm2, 96(%rdi) + movdqa %xmm3, 112(%rdi) + movdqa %xmm8, 64(%rsi) + movdqa %xmm9, 80(%rsi) + movdqa %xmm10, 96(%rsi) + movdqa %xmm11, 112(%rsi) + + subq $1, %rcx + ja dual_scrypt_core_loop2 + + # shuffle 1st block + movl 60(%rdi), %ebp + movl 44(%rdi), %ecx + movl 28(%rdi), %ebx + movl 12(%rdi), %eax + movl %ebp, 12(%rdi) + movl %ecx, 28(%rdi) + movl %ebx, 44(%rdi) + movl %eax, 60(%rdi) + movl 40(%rdi), %ecx + movl 8(%rdi), %eax + movl 48(%rdi), %ebp + movl 16(%rdi), %ebx + movl %ecx, 8(%rdi) + movl %eax, 40(%rdi) + movl %ebp, 16(%rdi) + movl %ebx, 48(%rdi) + movl 20(%rdi), %ebx + movl 4(%rdi), %eax + movl 52(%rdi), %ebp + movl 36(%rdi), %ecx + movl %ebx, 4(%rdi) + movl %eax, 20(%rdi) + movl %ebp, 36(%rdi) + movl %ecx, 52(%rdi) + + # shuffle 2nd block + movl 124(%rdi), %ebp + movl 108(%rdi), %ecx + movl 92(%rdi), %ebx + movl 76(%rdi), %eax + movl %ebp, 76(%rdi) + movl %ecx, 92(%rdi) + movl %ebx, 108(%rdi) + movl %eax, 124(%rdi) + movl 104(%rdi), %ecx + movl 72(%rdi), %eax + movl 112(%rdi), %ebp + movl 80(%rdi), %ebx + movl %ecx, 72(%rdi) + movl %eax, 104(%rdi) + movl %ebp, 80(%rdi) + movl %ebx, 112(%rdi) + movl 84(%rdi), %ebx + movl 68(%rdi), %eax + movl 116(%rdi), %ebp + movl 100(%rdi), %ecx + movl %ebx, 68(%rdi) + movl %eax, 84(%rdi) + movl %ebp, 100(%rdi) + movl %ecx, 116(%rdi) + + # shuffle 3rd block + movl 60(%rsi), %ebp + movl 44(%rsi), %ecx + movl 28(%rsi), %ebx + movl 12(%rsi), %eax + movl %ebp, 12(%rsi) + movl %ecx, 28(%rsi) + movl %ebx, 44(%rsi) + movl %eax, 60(%rsi) + movl 40(%rsi), %ecx + movl 8(%rsi), %eax + movl 48(%rsi), %ebp + movl 16(%rsi), %ebx + movl %ecx, 8(%rsi) + movl %eax, 40(%rsi) + movl %ebp, 16(%rsi) + movl %ebx, 48(%rsi) + movl 20(%rsi), %ebx + movl 4(%rsi), %eax + movl 52(%rsi), %ebp + movl 36(%rsi), %ecx + movl %ebx, 4(%rsi) + movl %eax, 20(%rsi) + movl %ebp, 36(%rsi) + movl %ecx, 52(%rsi) + + # shuffle 4th block + movl 124(%rsi), %ebp + movl 108(%rsi), %ecx + movl 92(%rsi), %ebx + movl 76(%rsi), %eax + movl %ebp, 76(%rsi) + movl %ecx, 92(%rsi) + movl %ebx, 108(%rsi) + movl %eax, 124(%rsi) + movl 104(%rsi), %ecx + movl 72(%rsi), %eax + movl 112(%rsi), %ebp + movl 80(%rsi), %ebx 
+ movl %ecx, 72(%rsi) + movl %eax, 104(%rsi) + movl %ebp, 80(%rsi) + movl %ebx, 112(%rsi) + movl 84(%rsi), %ebx + movl 68(%rsi), %eax + movl 116(%rsi), %ebp + movl 100(%rsi), %ecx + movl %ebx, 68(%rsi) + movl %eax, 84(%rsi) + movl %ebp, 100(%rsi) + movl %ecx, 116(%rsi) + +#if defined(WIN64) + popq %rsi + popq %rdi + movdqa 8(%rsp), %xmm6 + movdqa 24(%rsp), %xmm7 + movdqa 40(%rsp), %xmm8 + movdqa 56(%rsp), %xmm9 + movdqa 72(%rsp), %xmm10 + movdqa 88(%rsp), %xmm11 + movdqa 104(%rsp), %xmm12 + movdqa 120(%rsp), %xmm13 + movdqa 136(%rsp), %xmm14 + movdqa 152(%rsp), %xmm15 + addq $176, %rsp +#endif + popq %rbp + popq %rbx ret #endif diff --git a/scrypt-x86.S b/scrypt-x86.S index 52560c5..c3113a7 100644 --- a/scrypt-x86.S +++ b/scrypt-x86.S @@ -24,7 +24,7 @@ #if defined(__i386__) -.macro x86_gen_salsa8_core_quadround +.macro gen_salsa8_core_quadround movl 52(%esp), %ecx movl 4(%esp), %edx movl 20(%esp), %ebx @@ -346,18 +346,18 @@ .text .align 32 -x86_gen_salsa8_core: - x86_gen_salsa8_core_quadround - x86_gen_salsa8_core_quadround +gen_salsa8_core: + gen_salsa8_core_quadround + gen_salsa8_core_quadround ret .text .align 32 - .globl x86_scrypt_core - .globl _x86_scrypt_core -x86_scrypt_core: -_x86_scrypt_core: + .globl scrypt_core + .globl _scrypt_core +scrypt_core: +_scrypt_core: pushl %ebx pushl %ebp pushl %edi @@ -367,14 +367,14 @@ _x86_scrypt_core: movl $1, %eax cpuid andl $0x04000000, %edx - jnz x86_xmm_scrypt_core + jnz xmm_scrypt_core -x86_gen_scrypt_core: +gen_scrypt_core: movl 20(%esp), %edi movl 24(%esp), %esi subl $72, %esp -.macro x86_scrypt_core_macro1a p, q +.macro scrypt_core_macro1a p, q movl \p(%edi), %eax movl \q(%edi), %edx movl %eax, \p(%esi) @@ -384,7 +384,7 @@ x86_gen_scrypt_core: movl %eax, \p(%esp) .endm -.macro x86_scrypt_core_macro1b p, q +.macro scrypt_core_macro1b p, q movl \p(%edi), %eax xorl \p(%esi, %edx), %eax movl \q(%edi), %ebx @@ -395,7 +395,7 @@ x86_gen_scrypt_core: movl %eax, \p(%esp) .endm -.macro x86_scrypt_core_macro2 p, q +.macro scrypt_core_macro2 p, q movl \p(%esp), %eax addl \p(%edi), %eax movl %eax, \p(%edi) @@ -404,150 +404,150 @@ x86_gen_scrypt_core: movl %eax, \p(%esp) .endm -.macro x86_scrypt_core_macro3 p, q +.macro scrypt_core_macro3 p, q movl \p(%esp), %eax addl \q(%edi), %eax movl %eax, \q(%edi) .endm leal 131072(%esi), %ecx -x86_gen_scrypt_core_loop1: +gen_scrypt_core_loop1: movl %esi, 64(%esp) movl %ecx, 68(%esp) - x86_scrypt_core_macro1a 0, 64 - x86_scrypt_core_macro1a 4, 68 - x86_scrypt_core_macro1a 8, 72 - x86_scrypt_core_macro1a 12, 76 - x86_scrypt_core_macro1a 16, 80 - x86_scrypt_core_macro1a 20, 84 - x86_scrypt_core_macro1a 24, 88 - x86_scrypt_core_macro1a 28, 92 - x86_scrypt_core_macro1a 32, 96 - x86_scrypt_core_macro1a 36, 100 - x86_scrypt_core_macro1a 40, 104 - x86_scrypt_core_macro1a 44, 108 - x86_scrypt_core_macro1a 48, 112 - x86_scrypt_core_macro1a 52, 116 - x86_scrypt_core_macro1a 56, 120 - x86_scrypt_core_macro1a 60, 124 + scrypt_core_macro1a 0, 64 + scrypt_core_macro1a 4, 68 + scrypt_core_macro1a 8, 72 + scrypt_core_macro1a 12, 76 + scrypt_core_macro1a 16, 80 + scrypt_core_macro1a 20, 84 + scrypt_core_macro1a 24, 88 + scrypt_core_macro1a 28, 92 + scrypt_core_macro1a 32, 96 + scrypt_core_macro1a 36, 100 + scrypt_core_macro1a 40, 104 + scrypt_core_macro1a 44, 108 + scrypt_core_macro1a 48, 112 + scrypt_core_macro1a 52, 116 + scrypt_core_macro1a 56, 120 + scrypt_core_macro1a 60, 124 - call x86_gen_salsa8_core + call gen_salsa8_core movl 92(%esp), %edi - x86_scrypt_core_macro2 0, 64 - x86_scrypt_core_macro2 4, 68 - 
x86_scrypt_core_macro2 8, 72 - x86_scrypt_core_macro2 12, 76 - x86_scrypt_core_macro2 16, 80 - x86_scrypt_core_macro2 20, 84 - x86_scrypt_core_macro2 24, 88 - x86_scrypt_core_macro2 28, 92 - x86_scrypt_core_macro2 32, 96 - x86_scrypt_core_macro2 36, 100 - x86_scrypt_core_macro2 40, 104 - x86_scrypt_core_macro2 44, 108 - x86_scrypt_core_macro2 48, 112 - x86_scrypt_core_macro2 52, 116 - x86_scrypt_core_macro2 56, 120 - x86_scrypt_core_macro2 60, 124 + scrypt_core_macro2 0, 64 + scrypt_core_macro2 4, 68 + scrypt_core_macro2 8, 72 + scrypt_core_macro2 12, 76 + scrypt_core_macro2 16, 80 + scrypt_core_macro2 20, 84 + scrypt_core_macro2 24, 88 + scrypt_core_macro2 28, 92 + scrypt_core_macro2 32, 96 + scrypt_core_macro2 36, 100 + scrypt_core_macro2 40, 104 + scrypt_core_macro2 44, 108 + scrypt_core_macro2 48, 112 + scrypt_core_macro2 52, 116 + scrypt_core_macro2 56, 120 + scrypt_core_macro2 60, 124 - call x86_gen_salsa8_core + call gen_salsa8_core movl 92(%esp), %edi - x86_scrypt_core_macro3 0, 64 - x86_scrypt_core_macro3 4, 68 - x86_scrypt_core_macro3 8, 72 - x86_scrypt_core_macro3 12, 76 - x86_scrypt_core_macro3 16, 80 - x86_scrypt_core_macro3 20, 84 - x86_scrypt_core_macro3 24, 88 - x86_scrypt_core_macro3 28, 92 - x86_scrypt_core_macro3 32, 96 - x86_scrypt_core_macro3 36, 100 - x86_scrypt_core_macro3 40, 104 - x86_scrypt_core_macro3 44, 108 - x86_scrypt_core_macro3 48, 112 - x86_scrypt_core_macro3 52, 116 - x86_scrypt_core_macro3 56, 120 - x86_scrypt_core_macro3 60, 124 + scrypt_core_macro3 0, 64 + scrypt_core_macro3 4, 68 + scrypt_core_macro3 8, 72 + scrypt_core_macro3 12, 76 + scrypt_core_macro3 16, 80 + scrypt_core_macro3 20, 84 + scrypt_core_macro3 24, 88 + scrypt_core_macro3 28, 92 + scrypt_core_macro3 32, 96 + scrypt_core_macro3 36, 100 + scrypt_core_macro3 40, 104 + scrypt_core_macro3 44, 108 + scrypt_core_macro3 48, 112 + scrypt_core_macro3 52, 116 + scrypt_core_macro3 56, 120 + scrypt_core_macro3 60, 124 movl 64(%esp), %esi movl 68(%esp), %ecx addl $128, %esi cmpl %ecx, %esi - jne x86_gen_scrypt_core_loop1 + jne gen_scrypt_core_loop1 movl 96(%esp), %esi movl $1024, %ecx -x86_gen_scrypt_core_loop2: +gen_scrypt_core_loop2: movl %ecx, 68(%esp) movl 64(%edi), %edx andl $1023, %edx shll $7, %edx - x86_scrypt_core_macro1b 0, 64 - x86_scrypt_core_macro1b 4, 68 - x86_scrypt_core_macro1b 8, 72 - x86_scrypt_core_macro1b 12, 76 - x86_scrypt_core_macro1b 16, 80 - x86_scrypt_core_macro1b 20, 84 - x86_scrypt_core_macro1b 24, 88 - x86_scrypt_core_macro1b 28, 92 - x86_scrypt_core_macro1b 32, 96 - x86_scrypt_core_macro1b 36, 100 - x86_scrypt_core_macro1b 40, 104 - x86_scrypt_core_macro1b 44, 108 - x86_scrypt_core_macro1b 48, 112 - x86_scrypt_core_macro1b 52, 116 - x86_scrypt_core_macro1b 56, 120 - x86_scrypt_core_macro1b 60, 124 + scrypt_core_macro1b 0, 64 + scrypt_core_macro1b 4, 68 + scrypt_core_macro1b 8, 72 + scrypt_core_macro1b 12, 76 + scrypt_core_macro1b 16, 80 + scrypt_core_macro1b 20, 84 + scrypt_core_macro1b 24, 88 + scrypt_core_macro1b 28, 92 + scrypt_core_macro1b 32, 96 + scrypt_core_macro1b 36, 100 + scrypt_core_macro1b 40, 104 + scrypt_core_macro1b 44, 108 + scrypt_core_macro1b 48, 112 + scrypt_core_macro1b 52, 116 + scrypt_core_macro1b 56, 120 + scrypt_core_macro1b 60, 124 - call x86_gen_salsa8_core + call gen_salsa8_core movl 92(%esp), %edi - x86_scrypt_core_macro2 0, 64 - x86_scrypt_core_macro2 4, 68 - x86_scrypt_core_macro2 8, 72 - x86_scrypt_core_macro2 12, 76 - x86_scrypt_core_macro2 16, 80 - x86_scrypt_core_macro2 20, 84 - x86_scrypt_core_macro2 24, 88 - x86_scrypt_core_macro2 28, 
92 - x86_scrypt_core_macro2 32, 96 - x86_scrypt_core_macro2 36, 100 - x86_scrypt_core_macro2 40, 104 - x86_scrypt_core_macro2 44, 108 - x86_scrypt_core_macro2 48, 112 - x86_scrypt_core_macro2 52, 116 - x86_scrypt_core_macro2 56, 120 - x86_scrypt_core_macro2 60, 124 + scrypt_core_macro2 0, 64 + scrypt_core_macro2 4, 68 + scrypt_core_macro2 8, 72 + scrypt_core_macro2 12, 76 + scrypt_core_macro2 16, 80 + scrypt_core_macro2 20, 84 + scrypt_core_macro2 24, 88 + scrypt_core_macro2 28, 92 + scrypt_core_macro2 32, 96 + scrypt_core_macro2 36, 100 + scrypt_core_macro2 40, 104 + scrypt_core_macro2 44, 108 + scrypt_core_macro2 48, 112 + scrypt_core_macro2 52, 116 + scrypt_core_macro2 56, 120 + scrypt_core_macro2 60, 124 - call x86_gen_salsa8_core + call gen_salsa8_core movl 92(%esp), %edi movl 96(%esp), %esi - x86_scrypt_core_macro3 0, 64 - x86_scrypt_core_macro3 4, 68 - x86_scrypt_core_macro3 8, 72 - x86_scrypt_core_macro3 12, 76 - x86_scrypt_core_macro3 16, 80 - x86_scrypt_core_macro3 20, 84 - x86_scrypt_core_macro3 24, 88 - x86_scrypt_core_macro3 28, 92 - x86_scrypt_core_macro3 32, 96 - x86_scrypt_core_macro3 36, 100 - x86_scrypt_core_macro3 40, 104 - x86_scrypt_core_macro3 44, 108 - x86_scrypt_core_macro3 48, 112 - x86_scrypt_core_macro3 52, 116 - x86_scrypt_core_macro3 56, 120 - x86_scrypt_core_macro3 60, 124 + scrypt_core_macro3 0, 64 + scrypt_core_macro3 4, 68 + scrypt_core_macro3 8, 72 + scrypt_core_macro3 12, 76 + scrypt_core_macro3 16, 80 + scrypt_core_macro3 20, 84 + scrypt_core_macro3 24, 88 + scrypt_core_macro3 28, 92 + scrypt_core_macro3 32, 96 + scrypt_core_macro3 36, 100 + scrypt_core_macro3 40, 104 + scrypt_core_macro3 44, 108 + scrypt_core_macro3 48, 112 + scrypt_core_macro3 52, 116 + scrypt_core_macro3 56, 120 + scrypt_core_macro3 60, 124 movl 68(%esp), %ecx subl $1, %ecx - ja x86_gen_scrypt_core_loop2 + ja gen_scrypt_core_loop2 addl $72, %esp popl %esi @@ -557,7 +557,7 @@ x86_gen_scrypt_core_loop2: ret -.macro x86_xmm_salsa8_core_doubleround +.macro xmm_salsa8_core_doubleround paddd %xmm0, %xmm4 movdqa %xmm0, %xmm5 movdqa %xmm4, %xmm6 @@ -624,16 +624,16 @@ x86_gen_scrypt_core_loop2: pxor %xmm6, %xmm0 .endm -.macro x86_xmm_salsa8_core +.macro xmm_salsa8_core movdqa %xmm1, %xmm4 - x86_xmm_salsa8_core_doubleround - x86_xmm_salsa8_core_doubleround - x86_xmm_salsa8_core_doubleround - x86_xmm_salsa8_core_doubleround + xmm_salsa8_core_doubleround + xmm_salsa8_core_doubleround + xmm_salsa8_core_doubleround + xmm_salsa8_core_doubleround .endm .align 32 -x86_xmm_scrypt_core: +xmm_scrypt_core: movl 20(%esp), %edi movl 24(%esp), %esi movl %esp, %ebp @@ -710,7 +710,7 @@ x86_xmm_scrypt_core: movl %esi, %edx leal 131072(%esi), %ecx -x86_xmm_scrypt_core_loop1: +xmm_scrypt_core_loop1: movdqa 0(%esp), %xmm0 movdqa 16(%esp), %xmm1 movdqa 32(%esp), %xmm2 @@ -736,7 +736,7 @@ x86_xmm_scrypt_core_loop1: movdqa %xmm1, 16(%esp) movdqa %xmm2, 32(%esp) movdqa %xmm3, 48(%esp) - x86_xmm_salsa8_core + xmm_salsa8_core paddd 0(%esp), %xmm0 paddd 16(%esp), %xmm1 paddd 32(%esp), %xmm2 @@ -754,7 +754,7 @@ x86_xmm_scrypt_core_loop1: movdqa %xmm1, 80(%esp) movdqa %xmm2, 96(%esp) movdqa %xmm3, 112(%esp) - x86_xmm_salsa8_core + xmm_salsa8_core paddd 64(%esp), %xmm0 paddd 80(%esp), %xmm1 paddd 96(%esp), %xmm2 @@ -766,10 +766,10 @@ x86_xmm_scrypt_core_loop1: addl $128, %edx cmpl %ecx, %edx - jne x86_xmm_scrypt_core_loop1 + jne xmm_scrypt_core_loop1 movl $1024, %ecx -x86_xmm_scrypt_core_loop2: +xmm_scrypt_core_loop2: movdqa 0(%esp), %xmm0 movdqa 16(%esp), %xmm1 movdqa 32(%esp), %xmm2 @@ -802,7 +802,7 @@ 
x86_xmm_scrypt_core_loop2: movdqa %xmm1, 16(%esp) movdqa %xmm2, 32(%esp) movdqa %xmm3, 48(%esp) - x86_xmm_salsa8_core + xmm_salsa8_core paddd 0(%esp), %xmm0 paddd 16(%esp), %xmm1 paddd 32(%esp), %xmm2 @@ -820,7 +820,7 @@ x86_xmm_scrypt_core_loop2: movdqa %xmm1, 80(%esp) movdqa %xmm2, 96(%esp) movdqa %xmm3, 112(%esp) - x86_xmm_salsa8_core + xmm_salsa8_core paddd 64(%esp), %xmm0 paddd 80(%esp), %xmm1 paddd 96(%esp), %xmm2 @@ -831,7 +831,7 @@ x86_xmm_scrypt_core_loop2: movdqa %xmm3, 112(%esp) subl $1, %ecx - ja x86_xmm_scrypt_core_loop2 + ja xmm_scrypt_core_loop2 # re-shuffle 1st block back movl 60(%esp), %edx diff --git a/scrypt.c b/scrypt.c index b8eb9e9..8cbdf6e 100644 --- a/scrypt.c +++ b/scrypt.c @@ -193,79 +193,13 @@ SHA256_InitState(uint32_t * state) static const uint32_t passwdpad[12] = {0x00000080, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0x80020000}; static const uint32_t outerpad[8] = {0x80000000, 0, 0, 0, 0, 0, 0, 0x00000300}; -/** - * PBKDF2_SHA256(passwd, passwdlen, salt, saltlen, c, buf, dkLen): - * Compute PBKDF2(passwd, salt, c, dkLen) using HMAC-SHA256 as the PRF, and - * write the output to buf. The value dkLen must be at most 32 * (2^32 - 1). - */ static inline void -PBKDF2_SHA256_80_128(const uint32_t * passwd, uint32_t * buf) +PBKDF2_SHA256_80_128_init(const uint32_t *passwd, uint32_t tstate[8], uint32_t ostate[8]) { - SHA256_CTX PShictx, PShoctx; - uint32_t tstate[8]; uint32_t ihash[8]; - uint32_t i; uint32_t pad[16]; - - static const uint32_t innerpad[11] = {0x00000080, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0xa0040000}; - - /* If Klen > 64, the key is really SHA256(K). */ - SHA256_InitState(tstate); - SHA256_Transform(tstate, passwd, 1); - memcpy(pad, passwd+16, 16); - memcpy(pad+4, passwdpad, 48); - SHA256_Transform(tstate, pad, 1); - memcpy(ihash, tstate, 32); - - SHA256_InitState(PShictx.state); - for (i = 0; i < 8; i++) - pad[i] = ihash[i] ^ 0x36363636; - for (; i < 16; i++) - pad[i] = 0x36363636; - SHA256_Transform(PShictx.state, pad, 0); - SHA256_Transform(PShictx.state, passwd, 1); - be32enc_vect(PShictx.buf, passwd+16, 4); - be32enc_vect(PShictx.buf+5, innerpad, 11); - - SHA256_InitState(PShoctx.state); - for (i = 0; i < 8; i++) - pad[i] = ihash[i] ^ 0x5c5c5c5c; - for (; i < 16; i++) - pad[i] = 0x5c5c5c5c; - SHA256_Transform(PShoctx.state, pad, 0); - memcpy(PShoctx.buf+8, outerpad, 32); - - /* Iterate through the blocks. */ - for (i = 0; i < 4; i++) { - uint32_t istate[8]; - uint32_t ostate[8]; - - memcpy(istate, PShictx.state, 32); - PShictx.buf[4] = i + 1; - SHA256_Transform(istate, PShictx.buf, 0); - memcpy(PShoctx.buf, istate, 32); - - memcpy(ostate, PShoctx.state, 32); - SHA256_Transform(ostate, PShoctx.buf, 0); - be32enc_vect(buf+i*8, ostate, 8); - } -} - - -static inline uint32_t -PBKDF2_SHA256_80_128_32(const uint32_t * passwd, const uint32_t * salt) -{ - uint32_t tstate[8]; - uint32_t ostate[8]; - uint32_t ihash[8]; uint32_t i; - /* Compute HMAC state after processing P and S. */ - uint32_t pad[16]; - - static const uint32_t ihash_finalblk[16] = {0x00000001,0x80000000,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0x00000620}; - - /* If Klen > 64, the key is really SHA256(K). 
*/ SHA256_InitState(tstate); SHA256_Transform(tstate, passwd, 1); memcpy(pad, passwd+16, 16); @@ -286,16 +220,63 @@ PBKDF2_SHA256_80_128_32(const uint32_t * passwd, const uint32_t * salt) for (; i < 16; i++) pad[i] = 0x36363636; SHA256_Transform(tstate, pad, 0); +} + +/** + * PBKDF2_SHA256(passwd, passwdlen, salt, saltlen, c, buf, dkLen): + * Compute PBKDF2(passwd, salt, c, dkLen) using HMAC-SHA256 as the PRF, and + * write the output to buf. The value dkLen must be at most 32 * (2^32 - 1). + */ +static inline void +PBKDF2_SHA256_80_128(const uint32_t *tstate, const uint32_t *ostate, const uint32_t *passwd, uint32_t *buf) +{ + static const uint32_t innerpad[11] = {0x00000080, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0xa0040000}; + SHA256_CTX PShictx, PShoctx; + uint32_t i; + + /* If Klen > 64, the key is really SHA256(K). */ + memcpy(PShictx.state, tstate, 32); + memcpy(PShoctx.state, ostate, 32); + + memcpy(PShoctx.buf+8, outerpad, 32); + + SHA256_Transform(PShictx.state, passwd, 1); + be32enc_vect(PShictx.buf, passwd+16, 4); + be32enc_vect(PShictx.buf+5, innerpad, 11); + + /* Iterate through the blocks. */ + for (i = 0; i < 4; i++) { + uint32_t ist[8]; + uint32_t ost[8]; + + memcpy(ist, PShictx.state, 32); + PShictx.buf[4] = i + 1; + SHA256_Transform(ist, PShictx.buf, 0); + memcpy(PShoctx.buf, ist, 32); + + memcpy(ost, PShoctx.state, 32); + SHA256_Transform(ost, PShoctx.buf, 0); + be32enc_vect(buf+i*8, ost, 8); + } +} + +static inline void +PBKDF2_SHA256_80_128_32(uint32_t *tstate, uint32_t *ostate, const uint32_t *passwd, const uint32_t *salt, uint32_t *output) +{ + static const uint32_t ihash_finalblk[16] = {0x00000001,0x80000000,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0x00000620}; + uint32_t pad[16]; + uint32_t i; + SHA256_Transform(tstate, salt, 1); SHA256_Transform(tstate, salt+16, 1); SHA256_Transform(tstate, ihash_finalblk, 0); memcpy(pad, tstate, 32); memcpy(pad+8, outerpad, 32); - /* Feed the inner hash to the outer SHA256 operation. */ SHA256_Transform(ostate, pad, 0); - /* Finish the outer SHA256 operation. 
*/ - return byteswap(ostate[7]); + + for (i = 0; i < 8; i++) + output[i] = byteswap(ostate[i]); } @@ -358,34 +339,33 @@ salsa20_8(uint32_t B[16], const uint32_t Bx[16]) B[15] += x15; } -#if defined(__x86_64__) -void x64_scrypt_core(uint32_t *B, uint32_t *V); -#elif defined(__i386__) -void x86_scrypt_core(uint32_t *B, uint32_t *V); -#endif -/* cpu and memory intensive function to transform a 80 byte buffer into a 32 byte output - scratchpad size needs to be at least 63 + (128 * r * p) + (256 * r + 64) + (128 * r * N) bytes - */ -static uint32_t scrypt_1024_1_1_256_sp(const uint32_t* input, char* scratchpad) +#if defined(__x86_64__) + +#define DUAL_SCRYPT +#define SCRYPT_BUFFER_SIZE (2 * 131072 + 63) + +int prefer_dual_scrypt(); +void scrypt_core(uint32_t *X, uint32_t *V); +void dual_scrypt_core(uint32_t *X, uint32_t *Y, uint32_t *V); + +#elif defined(__i386__) + +#define SCRYPT_BUFFER_SIZE (131072 + 63) + +void scrypt_core(uint32_t *X, uint32_t *V); + +#else + +#define SCRYPT_BUFFER_SIZE (131072 + 63) + +static inline void scrypt_core(uint32_t *X, uint32_t *V) { - uint32_t * V; - uint32_t X[32]; uint32_t i; uint32_t j; uint32_t k; uint64_t *p1, *p2; - p1 = (uint64_t *)X; - V = (uint32_t *)(((uintptr_t)(scratchpad) + 63) & ~ (uintptr_t)(63)); - - PBKDF2_SHA256_80_128(input, X); - -#if defined(__x86_64__) - x64_scrypt_core(X, V); -#elif defined(__i386__) - x86_scrypt_core(X, V); -#else for (i = 0; i < 1024; i += 2) { memcpy(&V[i * 32], X, 128); @@ -414,32 +394,93 @@ static uint32_t scrypt_1024_1_1_256_sp(const uint32_t* input, char* scratchpad) salsa20_8(&X[0], &X[16]); salsa20_8(&X[16], &X[0]); } +} + #endif - return PBKDF2_SHA256_80_128_32(input, X); +unsigned char *scrypt_buffer_alloc() { + return malloc(SCRYPT_BUFFER_SIZE); } +/* cpu and memory intensive function to transform a 80 byte buffer into a 32 byte output + scratchpad size needs to be at least 63 + (128 * r * p) + (256 * r + 64) + (128 * r * N) bytes + r = 1, p = 1, N = 1024 + */ +static void scrypt_1024_1_1_256_sp(const uint32_t* input, unsigned char *scratchpad, uint32_t *res) +{ + uint32_t tstate[8], ostate[8]; + uint32_t *V; + uint32_t X[32]; + V = (uint32_t *)(((uintptr_t)(scratchpad) + 63) & ~ (uintptr_t)(63)); + + PBKDF2_SHA256_80_128_init(input, tstate, ostate); + PBKDF2_SHA256_80_128(tstate, ostate, input, X); + + scrypt_core(X, V); + + return PBKDF2_SHA256_80_128_32(tstate, ostate, input, X, res); +} + +#ifdef DUAL_SCRYPT +static void dual_scrypt_1024_1_1_256_sp(const uint32_t *input1, const uint32_t *input2, unsigned char *scratchpad, uint32_t *res1, uint32_t *res2) +{ + uint32_t tstate1[8], tstate2[8], ostate1[8], ostate2[8]; + uint32_t *V; + uint32_t X[32], Y[32]; + V = (uint32_t *)(((uintptr_t)(scratchpad) + 63) & ~ (uintptr_t)(63)); + + PBKDF2_SHA256_80_128_init(input1, tstate1, ostate1); + PBKDF2_SHA256_80_128_init(input2, tstate2, ostate2); + PBKDF2_SHA256_80_128(tstate1, ostate1, input1, X); + PBKDF2_SHA256_80_128(tstate2, ostate2, input2, Y); + + dual_scrypt_core(X, Y, V); + + PBKDF2_SHA256_80_128_32(tstate1, ostate1, input1, X, res1); + PBKDF2_SHA256_80_128_32(tstate2, ostate2, input2, Y, res2); +} +#endif + int scanhash_scrypt(int thr_id, unsigned char *pdata, unsigned char *scratchbuf, const unsigned char *ptarget, uint32_t max_nonce, unsigned long *hashes_done) { - uint32_t data[20]; - uint32_t tmp_hash7; + uint32_t data[20], hash[8]; +#ifdef DUAL_SCRYPT + uint32_t data2[20], hash2[8]; + int use_dual; +#endif uint32_t n = 0; uint32_t Htarg = ((const uint32_t *)ptarget)[7]; - int i; 
work_restart[thr_id].restart = 0; be32enc_vect(data, (const uint32_t *)pdata, 19); +#ifdef DUAL_SCRYPT + memcpy(data2, data, 80); + use_dual = prefer_dual_scrypt(); +#endif - while(1) { - n++; - data[19] = n; - tmp_hash7 = scrypt_1024_1_1_256_sp(data, scratchbuf); + while (1) { + data[19] = n++; +#ifdef DUAL_SCRYPT + if (use_dual) { + data2[19] = n++; + dual_scrypt_1024_1_1_256_sp(data, data2, scratchbuf, hash, hash2); + if (hash2[7] <= Htarg) { + ((uint32_t *)pdata)[19] = byteswap(data2[19]); + *hashes_done = n; + return true; + } + } else { + scrypt_1024_1_1_256_sp(data, scratchbuf, hash); + } +#else + scrypt_1024_1_1_256_sp(data, scratchbuf, hash); +#endif - if (tmp_hash7 <= Htarg) { - ((uint32_t *)pdata)[19] = byteswap(n); + if (hash[7] <= Htarg) { + ((uint32_t *)pdata)[19] = byteswap(data[19]); *hashes_done = n; return true; } diff --git a/util.c b/util.c index db1e6af..23598e0 100644 --- a/util.c +++ b/util.c @@ -94,7 +94,9 @@ void applog(int prio, const char *fmt, ...) tm.tm_min, tm.tm_sec, fmt); + pthread_mutex_lock(&time_lock); vfprintf(stderr, f, ap); /* atomic write to stderr */ + pthread_mutex_unlock(&time_lock); } va_end(ap); }
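
A note on the new scratch-buffer sizing, for readers comparing it with the malloc(131583) it replaces: with N = 1024, r = 1, p = 1 the formula quoted in the scrypt.c comment works out to 63 + (128*1*1) + (256*1 + 64) + (128*1*1024) = 63 + 128 + 320 + 131072 = 131583 bytes, which is exactly the old magic number in cpu-miner.c. The cores themselves only index the 64-byte-aligned V array of 128 * 1024 = 131072 bytes, so scrypt_buffer_alloc() now allocates SCRYPT_BUFFER_SIZE = 131072 + 63 on the scalar and single-hash paths, and 2 * 131072 + 63 for the x86-64 dual-hash core, which interleaves two scratchpads in one allocation.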
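
The bulk of the new x86-64 code is the interleaved dual_scrypt_core, whose xmm_dual_salsa8_core_doubleround macro runs two independent Salsa20/8 double rounds side by side in xmm0-xmm7 and xmm8-xmm15. As a reference point, here is a minimal C sketch of the double round being vectorized (illustrative code, not taken from the patch): the rotate amounts 7, 9, 13 and 18 correspond to the pslld/psrld pairs 7/25, 9/23, 13/19 and 18/14 in the assembly, and the "# shuffle ... block" sections exist to put each 64-byte block into the word order this SIMD formulation needs, then restore it afterwards.

/* Illustrative sketch of the Salsa20/8 double round the SSE2 macros
 * implement; names here are hypothetical and not part of the patch. */
#include <stdint.h>

#define R(a, b) (((a) << (b)) | ((a) >> (32 - (b))))

static void salsa8_doubleround(uint32_t x[16])
{
	/* column round */
	x[ 4] ^= R(x[ 0] + x[12],  7);  x[ 8] ^= R(x[ 4] + x[ 0],  9);
	x[12] ^= R(x[ 8] + x[ 4], 13);  x[ 0] ^= R(x[12] + x[ 8], 18);
	x[ 9] ^= R(x[ 5] + x[ 1],  7);  x[13] ^= R(x[ 9] + x[ 5],  9);
	x[ 1] ^= R(x[13] + x[ 9], 13);  x[ 5] ^= R(x[ 1] + x[13], 18);
	x[14] ^= R(x[10] + x[ 6],  7);  x[ 2] ^= R(x[14] + x[10],  9);
	x[ 6] ^= R(x[ 2] + x[14], 13);  x[10] ^= R(x[ 6] + x[ 2], 18);
	x[ 3] ^= R(x[15] + x[11],  7);  x[ 7] ^= R(x[ 3] + x[15],  9);
	x[11] ^= R(x[ 7] + x[ 3], 13);  x[15] ^= R(x[11] + x[ 7], 18);
	/* row round */
	x[ 1] ^= R(x[ 0] + x[ 3],  7);  x[ 2] ^= R(x[ 1] + x[ 0],  9);
	x[ 3] ^= R(x[ 2] + x[ 1], 13);  x[ 0] ^= R(x[ 3] + x[ 2], 18);
	x[ 6] ^= R(x[ 5] + x[ 4],  7);  x[ 7] ^= R(x[ 6] + x[ 5],  9);
	x[ 4] ^= R(x[ 7] + x[ 6], 13);  x[ 5] ^= R(x[ 4] + x[ 7], 18);
	x[11] ^= R(x[10] + x[ 9],  7);  x[ 8] ^= R(x[11] + x[10],  9);
	x[ 9] ^= R(x[ 8] + x[11], 13);  x[10] ^= R(x[ 9] + x[ 8], 18);
	x[12] ^= R(x[15] + x[14],  7);  x[13] ^= R(x[12] + x[15],  9);
	x[14] ^= R(x[13] + x[12], 13);  x[15] ^= R(x[14] + x[13], 18);
}

/* Salsa20/8 as scrypt's BlockMix uses it: xor in the other half-block,
 * run four double rounds (eight rounds total), then add the pre-round
 * state back in -- the paddd instructions after each xmm_salsa8_core
 * invocation perform that feed-forward step. */
static void salsa20_8_xor(uint32_t B[16], const uint32_t Bx[16])
{
	uint32_t x[16];
	int i;

	for (i = 0; i < 16; i++)
		x[i] = (B[i] ^= Bx[i]);
	for (i = 0; i < 4; i++)
		salsa8_doubleround(x);
	for (i = 0; i < 16; i++)
		B[i] += x[i];
}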
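
Both the x86-64 scrypt_core (to pick between its two code paths) and the new prefer_dual_scrypt test CPUID leaf 0 against the constants 0x756e6547, 0x49656e69 and 0x6c65746e, which are the little-endian EBX, EDX and ECX words of the vendor string "GenuineIntel". A rough C equivalent of that check, sketched with the GCC/Clang <cpuid.h> helper rather than the hand-written cpuid sequence in the patch:

#include <stdio.h>
#include <cpuid.h>  /* GCC/Clang built-in header; x86 targets only */

/* Illustrative re-implementation of the vendor test, not the patch's code. */
static int is_genuine_intel(void)
{
	unsigned int eax, ebx, ecx, edx;

	if (!__get_cpuid(0, &eax, &ebx, &ecx, &edx))
		return 0;
	return ebx == 0x756e6547    /* "Genu" */
	    && edx == 0x49656e69    /* "ineI" */
	    && ecx == 0x6c65746e;   /* "ntel" */
}

int main(void)
{
	printf("dual scrypt preferred: %s\n", is_genuine_intel() ? "yes" : "no");
	return 0;
}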