Some more optimization
This commit is contained in:
parent
963efb9546
commit
36225b4206
6 changed files with 974 additions and 284 deletions
|
@ -551,7 +551,7 @@ static void *miner_thread(void *userdata)
|
|||
|
||||
if (opt_algo == ALGO_SCRYPT)
|
||||
{
|
||||
scratchbuf = malloc(131583);
|
||||
scratchbuf = scrypt_buffer_alloc();
|
||||
max_nonce = 0xffff;
|
||||
}
|
||||
|
||||
|
@ -955,7 +955,7 @@ int main(int argc, char *argv[])
|
|||
}
|
||||
|
||||
applog(LOG_INFO, "%d miner threads started, "
|
||||
"using SHA256 '%s' algorithm.",
|
||||
"using '%s' algorithm.",
|
||||
opt_n_threads,
|
||||
algo_names[opt_algo]);
|
||||
|
||||
|
|
1
miner.h
1
miner.h
|
@ -127,6 +127,7 @@ extern json_t *json_rpc_call(CURL *curl, const char *url, const char *userpass,
|
|||
extern char *bin2hex(const unsigned char *p, size_t len);
|
||||
extern bool hex2bin(unsigned char *p, const char *hexstr, size_t len);
|
||||
|
||||
extern unsigned char *scrypt_buffer_alloc();
|
||||
extern int scanhash_scrypt(int, unsigned char *pdata, unsigned char *scratchbuf,
|
||||
const unsigned char *ptarget,
|
||||
uint32_t max_nonce, unsigned long *nHashesDone);
|
||||
|
|
742
scrypt-x64.S
742
scrypt-x64.S
|
@ -24,7 +24,7 @@
|
|||
|
||||
#if defined(__x86_64__)
|
||||
|
||||
.macro x64_gen_salsa8_core_doubleround
|
||||
.macro gen_salsa8_core_doubleround
|
||||
movq 72(%rsp), %r15
|
||||
leaq (%r14, %rdx), %rbp
|
||||
roll $7, %ebp
|
||||
|
@ -137,7 +137,7 @@
|
|||
|
||||
.text
|
||||
.align 32
|
||||
x64_gen_salsa8_core:
|
||||
gen_salsa8_core:
|
||||
# 0: %rdx, %rdi, %rcx, %rsi
|
||||
movq 8(%rsp), %rdi
|
||||
movq %rdi, %rdx
|
||||
|
@ -170,52 +170,52 @@ x64_gen_salsa8_core:
|
|||
shrq $32, %r15
|
||||
movq %r15, 88(%rsp)
|
||||
|
||||
x64_gen_salsa8_core_doubleround
|
||||
x64_gen_salsa8_core_doubleround
|
||||
x64_gen_salsa8_core_doubleround
|
||||
x64_gen_salsa8_core_doubleround
|
||||
gen_salsa8_core_doubleround
|
||||
gen_salsa8_core_doubleround
|
||||
gen_salsa8_core_doubleround
|
||||
gen_salsa8_core_doubleround
|
||||
|
||||
movl %edx, %edx
|
||||
shlq $32, %rdi
|
||||
addq %rdi, %rdx
|
||||
movq %rdx, %xmm0
|
||||
movd %rdx, %xmm0
|
||||
|
||||
movl %ecx, %ecx
|
||||
shlq $32, %rsi
|
||||
addq %rsi, %rcx
|
||||
movq %rcx, %xmm4
|
||||
movd %rcx, %xmm4
|
||||
|
||||
movq 72(%rsp), %rdi
|
||||
movl %r9d, %r9d
|
||||
shlq $32, %rdi
|
||||
addq %rdi, %r9
|
||||
movq %r9, %xmm1
|
||||
movd %r9, %xmm1
|
||||
|
||||
movl %eax, %eax
|
||||
shlq $32, %r8
|
||||
addq %r8, %rax
|
||||
movq %rax, %xmm5
|
||||
movd %rax, %xmm5
|
||||
|
||||
movl %r11d, %r11d
|
||||
shlq $32, %r10
|
||||
addq %r10, %r11
|
||||
movq %r11, %xmm2
|
||||
movd %r11, %xmm2
|
||||
|
||||
movl 48(%rsp), %r8d
|
||||
shlq $32, %r12
|
||||
addq %r12, %r8
|
||||
movq %r8, %xmm6
|
||||
movd %r8, %xmm6
|
||||
|
||||
movl %r14d, %r14d
|
||||
shlq $32, %r13
|
||||
addq %r13, %r14
|
||||
movq %r14, %xmm3
|
||||
movd %r14, %xmm3
|
||||
|
||||
movq 88(%rsp), %rdi
|
||||
movl %ebx, %ebx
|
||||
shlq $32, %rdi
|
||||
addq %rdi, %rbx
|
||||
movq %rbx, %xmm7
|
||||
movd %rbx, %xmm7
|
||||
|
||||
punpcklqdq %xmm4, %xmm0
|
||||
punpcklqdq %xmm5, %xmm1
|
||||
|
@ -236,10 +236,10 @@ x64_gen_salsa8_core:
|
|||
|
||||
.text
|
||||
.align 32
|
||||
.globl x64_scrypt_core
|
||||
.globl _x64_scrypt_core
|
||||
x64_scrypt_core:
|
||||
_x64_scrypt_core:
|
||||
.globl scrypt_core
|
||||
.globl _scrypt_core
|
||||
scrypt_core:
|
||||
_scrypt_core:
|
||||
pushq %rbx
|
||||
pushq %rbp
|
||||
pushq %r12
|
||||
|
@ -264,7 +264,7 @@ _x64_scrypt_core:
|
|||
movq %rdx, %rsi
|
||||
#endif
|
||||
|
||||
.macro x64_scrypt_core_cleanup
|
||||
.macro scrypt_core_cleanup
|
||||
#if defined(WIN64)
|
||||
popq %rsi
|
||||
popq %rdi
|
||||
|
@ -292,13 +292,13 @@ _x64_scrypt_core:
|
|||
xorl %eax, %eax
|
||||
cpuid
|
||||
cmpl $0x6c65746e, %ecx
|
||||
jne x64_gen_scrypt_core
|
||||
jne gen_scrypt_core
|
||||
cmpl $0x49656e69, %edx
|
||||
jne x64_gen_scrypt_core
|
||||
jne gen_scrypt_core
|
||||
cmpl $0x756e6547, %ebx
|
||||
je x64_xmm_scrypt_core
|
||||
je xmm_scrypt_core
|
||||
|
||||
x64_gen_scrypt_core:
|
||||
gen_scrypt_core:
|
||||
subq $136, %rsp
|
||||
movdqa 0(%rdi), %xmm8
|
||||
movdqa 16(%rdi), %xmm9
|
||||
|
@ -313,7 +313,7 @@ x64_gen_scrypt_core:
|
|||
movq %rdi, 104(%rsp)
|
||||
movq %rsi, 112(%rsp)
|
||||
movq %rcx, 120(%rsp)
|
||||
x64_gen_scrypt_core_loop1:
|
||||
gen_scrypt_core_loop1:
|
||||
movdqa %xmm8, 0(%rsi)
|
||||
movdqa %xmm9, 16(%rsi)
|
||||
movdqa %xmm10, 32(%rsi)
|
||||
|
@ -332,7 +332,7 @@ x64_gen_scrypt_core_loop1:
|
|||
movdqa %xmm10, 32(%rsp)
|
||||
movdqa %xmm11, 48(%rsp)
|
||||
movq %rsi, 128(%rsp)
|
||||
call x64_gen_salsa8_core
|
||||
call gen_salsa8_core
|
||||
paddd %xmm0, %xmm8
|
||||
paddd %xmm1, %xmm9
|
||||
paddd %xmm2, %xmm10
|
||||
|
@ -346,7 +346,7 @@ x64_gen_scrypt_core_loop1:
|
|||
movdqa %xmm13, 16(%rsp)
|
||||
movdqa %xmm14, 32(%rsp)
|
||||
movdqa %xmm15, 48(%rsp)
|
||||
call x64_gen_salsa8_core
|
||||
call gen_salsa8_core
|
||||
movq 128(%rsp), %rsi
|
||||
paddd %xmm0, %xmm12
|
||||
paddd %xmm1, %xmm13
|
||||
|
@ -356,10 +356,10 @@ x64_gen_scrypt_core_loop1:
|
|||
addq $128, %rsi
|
||||
movq 120(%rsp), %rcx
|
||||
cmpq %rcx, %rsi
|
||||
jne x64_gen_scrypt_core_loop1
|
||||
jne gen_scrypt_core_loop1
|
||||
|
||||
movq $1024, %rcx
|
||||
x64_gen_scrypt_core_loop2:
|
||||
gen_scrypt_core_loop2:
|
||||
movq 112(%rsp), %rsi
|
||||
movd %xmm12, %edx
|
||||
andl $1023, %edx
|
||||
|
@ -390,7 +390,7 @@ x64_gen_scrypt_core_loop2:
|
|||
movdqa %xmm10, 32(%rsp)
|
||||
movdqa %xmm11, 48(%rsp)
|
||||
movq %rcx, 128(%rsp)
|
||||
call x64_gen_salsa8_core
|
||||
call gen_salsa8_core
|
||||
paddd %xmm0, %xmm8
|
||||
paddd %xmm1, %xmm9
|
||||
paddd %xmm2, %xmm10
|
||||
|
@ -404,7 +404,7 @@ x64_gen_scrypt_core_loop2:
|
|||
movdqa %xmm13, 16(%rsp)
|
||||
movdqa %xmm14, 32(%rsp)
|
||||
movdqa %xmm15, 48(%rsp)
|
||||
call x64_gen_salsa8_core
|
||||
call gen_salsa8_core
|
||||
movq 128(%rsp), %rcx
|
||||
paddd %xmm0, %xmm12
|
||||
paddd %xmm1, %xmm13
|
||||
|
@ -412,7 +412,7 @@ x64_gen_scrypt_core_loop2:
|
|||
paddd %xmm3, %xmm15
|
||||
|
||||
subq $1, %rcx
|
||||
ja x64_gen_scrypt_core_loop2
|
||||
ja gen_scrypt_core_loop2
|
||||
|
||||
movq 104(%rsp), %rdi
|
||||
movdqa %xmm8, 0(%rdi)
|
||||
|
@ -425,11 +425,11 @@ x64_gen_scrypt_core_loop2:
|
|||
movdqa %xmm15, 112(%rdi)
|
||||
|
||||
addq $136, %rsp
|
||||
x64_scrypt_core_cleanup
|
||||
scrypt_core_cleanup
|
||||
ret
|
||||
|
||||
|
||||
.macro x64_xmm_salsa8_core_doubleround
|
||||
.macro xmm_salsa8_core_doubleround
|
||||
paddd %xmm0, %xmm4
|
||||
movdqa %xmm0, %xmm5
|
||||
movdqa %xmm4, %xmm6
|
||||
|
@ -495,16 +495,16 @@ x64_gen_scrypt_core_loop2:
|
|||
pxor %xmm6, %xmm0
|
||||
.endm
|
||||
|
||||
.macro x64_xmm_salsa8_core
|
||||
.macro xmm_salsa8_core
|
||||
movdqa %xmm1, %xmm4
|
||||
x64_xmm_salsa8_core_doubleround
|
||||
x64_xmm_salsa8_core_doubleround
|
||||
x64_xmm_salsa8_core_doubleround
|
||||
x64_xmm_salsa8_core_doubleround
|
||||
xmm_salsa8_core_doubleround
|
||||
xmm_salsa8_core_doubleround
|
||||
xmm_salsa8_core_doubleround
|
||||
xmm_salsa8_core_doubleround
|
||||
.endm
|
||||
|
||||
.align 32
|
||||
x64_xmm_scrypt_core:
|
||||
xmm_scrypt_core:
|
||||
# shuffle 1st block into %xmm8-%xmm11
|
||||
movl 60(%rdi), %edx
|
||||
movl 44(%rdi), %ecx
|
||||
|
@ -623,7 +623,7 @@ x64_xmm_scrypt_core:
|
|||
|
||||
movq %rsi, %rdx
|
||||
leaq 131072(%rsi), %rcx
|
||||
x64_xmm_scrypt_core_loop1:
|
||||
xmm_scrypt_core_loop1:
|
||||
movdqa %xmm8, 0(%rdx)
|
||||
movdqa %xmm9, 16(%rdx)
|
||||
movdqa %xmm10, 32(%rdx)
|
||||
|
@ -641,7 +641,7 @@ x64_xmm_scrypt_core_loop1:
|
|||
movdqa %xmm9, %xmm1
|
||||
movdqa %xmm10, %xmm2
|
||||
movdqa %xmm11, %xmm3
|
||||
x64_xmm_salsa8_core
|
||||
xmm_salsa8_core
|
||||
paddd %xmm0, %xmm8
|
||||
paddd %xmm1, %xmm9
|
||||
paddd %xmm2, %xmm10
|
||||
|
@ -655,7 +655,7 @@ x64_xmm_scrypt_core_loop1:
|
|||
movdqa %xmm13, %xmm1
|
||||
movdqa %xmm14, %xmm2
|
||||
movdqa %xmm15, %xmm3
|
||||
x64_xmm_salsa8_core
|
||||
xmm_salsa8_core
|
||||
paddd %xmm0, %xmm12
|
||||
paddd %xmm1, %xmm13
|
||||
paddd %xmm2, %xmm14
|
||||
|
@ -663,10 +663,10 @@ x64_xmm_scrypt_core_loop1:
|
|||
|
||||
addq $128, %rdx
|
||||
cmpq %rcx, %rdx
|
||||
jne x64_xmm_scrypt_core_loop1
|
||||
jne xmm_scrypt_core_loop1
|
||||
|
||||
movq $1024, %rcx
|
||||
x64_xmm_scrypt_core_loop2:
|
||||
xmm_scrypt_core_loop2:
|
||||
movd %xmm12, %edx
|
||||
andl $1023, %edx
|
||||
shll $7, %edx
|
||||
|
@ -695,7 +695,7 @@ x64_xmm_scrypt_core_loop2:
|
|||
movdqa %xmm9, %xmm1
|
||||
movdqa %xmm10, %xmm2
|
||||
movdqa %xmm11, %xmm3
|
||||
x64_xmm_salsa8_core
|
||||
xmm_salsa8_core
|
||||
paddd %xmm0, %xmm8
|
||||
paddd %xmm1, %xmm9
|
||||
paddd %xmm2, %xmm10
|
||||
|
@ -709,14 +709,14 @@ x64_xmm_scrypt_core_loop2:
|
|||
movdqa %xmm13, %xmm1
|
||||
movdqa %xmm14, %xmm2
|
||||
movdqa %xmm15, %xmm3
|
||||
x64_xmm_salsa8_core
|
||||
xmm_salsa8_core
|
||||
paddd %xmm0, %xmm12
|
||||
paddd %xmm1, %xmm13
|
||||
paddd %xmm2, %xmm14
|
||||
paddd %xmm3, %xmm15
|
||||
|
||||
subq $1, %rcx
|
||||
ja x64_xmm_scrypt_core_loop2
|
||||
ja xmm_scrypt_core_loop2
|
||||
|
||||
# re-shuffle 1st block back
|
||||
movd %xmm8, %eax
|
||||
|
@ -810,7 +810,653 @@ x64_xmm_scrypt_core_loop2:
|
|||
movl %ebx, 92(%rdi)
|
||||
movl %eax, 76(%rdi)
|
||||
|
||||
x64_scrypt_core_cleanup
|
||||
scrypt_core_cleanup
|
||||
ret
|
||||
|
||||
|
||||
/*
 * prefer_dual_scrypt -- takes no arguments and returns a flag in %eax:
 * 1 when CPUID leaf 0 reports %ebx/%edx/%ecx equal to
 * 0x756e6547/0x49656e69/0x6c65746e (the ASCII bytes "Genu", "ineI", "ntel"),
 * 0 on any mismatch.  The same three constants are tested by the dispatcher
 * at the top of scrypt_core in this file.
 */
.text
|
||||
.align 32
|
||||
.globl prefer_dual_scrypt
|
||||
.globl _prefer_dual_scrypt
|
||||
prefer_dual_scrypt:
|
||||
_prefer_dual_scrypt:
|
||||
/* cpuid overwrites %ebx, so the caller's %rbx is kept on the stack */
pushq %rbx
|
||||
/* %eax = 0 selects CPUID leaf 0 */
xorq %rax, %rax
|
||||
cpuid
|
||||
/* clear the return value; it stays 0 on every mismatch path below */
xorq %rax, %rax
|
||||
cmpl $0x6c65746e, %ecx
|
||||
jne prefer_dual_scrypt_false
|
||||
cmpl $0x49656e69, %edx
|
||||
jne prefer_dual_scrypt_false
|
||||
cmpl $0x756e6547, %ebx
|
||||
jne prefer_dual_scrypt_false
|
||||
/* all three registers matched: return 1 */
incl %eax
|
||||
prefer_dual_scrypt_false:
|
||||
popq %rbx
|
||||
ret
|
||||
|
||||
|
||||
/*
 * xmm_dual_salsa8_core_doubleround -- one double round applied to two
 * independent states in lock step.
 *
 *   state A: xmm0-xmm3,  temporaries xmm4-xmm6
 *   state B: xmm8-xmm11, temporaries xmm12-xmm14
 *
 * Every instruction for state A is immediately followed by the same
 * instruction on the corresponding state-B register (register number + 8).
 *
 * On entry xmm4 must hold a copy of xmm1 and xmm12 a copy of xmm9; the macro
 * leaves them in that condition again on exit, so invocations can be chained.
 *
 * Each step adds two rows, then XORs both (sum << k) and (sum >> (32 - k))
 * into a third row, i.e. XORs in the sum rotated left by k, for
 * k = 7, 9, 13, 18.  The pshufd instructions rotate the 32-bit lanes of a row
 * between steps.
 * NOTE(review): judging by the name this is a Salsa20 double round on a
 * pre-shuffled state layout -- not verified against a reference here.
 */
.macro xmm_dual_salsa8_core_doubleround
|
||||
/* first half: updates xmm3, xmm2, xmm1, xmm0 (B: xmm11, xmm10, xmm9, xmm8) */
paddd %xmm0, %xmm4
|
||||
paddd %xmm8, %xmm12
|
||||
movdqa %xmm0, %xmm5
|
||||
movdqa %xmm8, %xmm13
|
||||
movdqa %xmm4, %xmm6
|
||||
movdqa %xmm12, %xmm14
|
||||
pslld $7, %xmm4
|
||||
pslld $7, %xmm12
|
||||
psrld $25, %xmm6
|
||||
psrld $25, %xmm14
|
||||
pxor %xmm4, %xmm3
|
||||
pxor %xmm12, %xmm11
|
||||
pxor %xmm6, %xmm3
|
||||
pxor %xmm14, %xmm11
|
||||
paddd %xmm3, %xmm5
|
||||
paddd %xmm11, %xmm13
|
||||
movdqa %xmm3, %xmm4
|
||||
movdqa %xmm11, %xmm12
|
||||
movdqa %xmm5, %xmm6
|
||||
movdqa %xmm13, %xmm14
|
||||
pslld $9, %xmm5
|
||||
pslld $9, %xmm13
|
||||
psrld $23, %xmm6
|
||||
psrld $23, %xmm14
|
||||
pxor %xmm5, %xmm2
|
||||
pxor %xmm13, %xmm10
|
||||
pshufd $0x93, %xmm3, %xmm3
|
||||
pshufd $0x93, %xmm11, %xmm11
|
||||
pxor %xmm6, %xmm2
|
||||
pxor %xmm14, %xmm10
|
||||
paddd %xmm2, %xmm4
|
||||
paddd %xmm10, %xmm12
|
||||
movdqa %xmm2, %xmm5
|
||||
movdqa %xmm10, %xmm13
|
||||
movdqa %xmm4, %xmm6
|
||||
movdqa %xmm12, %xmm14
|
||||
pslld $13, %xmm4
|
||||
pslld $13, %xmm12
|
||||
psrld $19, %xmm6
|
||||
psrld $19, %xmm14
|
||||
pxor %xmm4, %xmm1
|
||||
pxor %xmm12, %xmm9
|
||||
pshufd $0x4e, %xmm2, %xmm2
|
||||
pshufd $0x4e, %xmm10, %xmm10
|
||||
pxor %xmm6, %xmm1
|
||||
pxor %xmm14, %xmm9
|
||||
paddd %xmm1, %xmm5
|
||||
paddd %xmm9, %xmm13
|
||||
movdqa %xmm3, %xmm4
|
||||
movdqa %xmm11, %xmm12
|
||||
movdqa %xmm5, %xmm6
|
||||
movdqa %xmm13, %xmm14
|
||||
pslld $18, %xmm5
|
||||
pslld $18, %xmm13
|
||||
psrld $14, %xmm6
|
||||
psrld $14, %xmm14
|
||||
pxor %xmm5, %xmm0
|
||||
pxor %xmm13, %xmm8
|
||||
pshufd $0x39, %xmm1, %xmm1
|
||||
pshufd $0x39, %xmm9, %xmm9
|
||||
pxor %xmm6, %xmm0
|
||||
pxor %xmm14, %xmm8
|
||||
|
||||
/* second half: same four steps, now updating xmm1, xmm2, xmm3, xmm0
   (B: xmm9, xmm10, xmm11, xmm8) */
paddd %xmm0, %xmm4
|
||||
paddd %xmm8, %xmm12
|
||||
movdqa %xmm0, %xmm5
|
||||
movdqa %xmm8, %xmm13
|
||||
movdqa %xmm4, %xmm6
|
||||
movdqa %xmm12, %xmm14
|
||||
pslld $7, %xmm4
|
||||
pslld $7, %xmm12
|
||||
psrld $25, %xmm6
|
||||
psrld $25, %xmm14
|
||||
pxor %xmm4, %xmm1
|
||||
pxor %xmm12, %xmm9
|
||||
pxor %xmm6, %xmm1
|
||||
pxor %xmm14, %xmm9
|
||||
paddd %xmm1, %xmm5
|
||||
paddd %xmm9, %xmm13
|
||||
movdqa %xmm1, %xmm4
|
||||
movdqa %xmm9, %xmm12
|
||||
movdqa %xmm5, %xmm6
|
||||
movdqa %xmm13, %xmm14
|
||||
pslld $9, %xmm5
|
||||
pslld $9, %xmm13
|
||||
psrld $23, %xmm6
|
||||
psrld $23, %xmm14
|
||||
pxor %xmm5, %xmm2
|
||||
pxor %xmm13, %xmm10
|
||||
pshufd $0x93, %xmm1, %xmm1
|
||||
pshufd $0x93, %xmm9, %xmm9
|
||||
pxor %xmm6, %xmm2
|
||||
pxor %xmm14, %xmm10
|
||||
paddd %xmm2, %xmm4
|
||||
paddd %xmm10, %xmm12
|
||||
movdqa %xmm2, %xmm5
|
||||
movdqa %xmm10, %xmm13
|
||||
movdqa %xmm4, %xmm6
|
||||
movdqa %xmm12, %xmm14
|
||||
pslld $13, %xmm4
|
||||
pslld $13, %xmm12
|
||||
psrld $19, %xmm6
|
||||
psrld $19, %xmm14
|
||||
pxor %xmm4, %xmm3
|
||||
pxor %xmm12, %xmm11
|
||||
pshufd $0x4e, %xmm2, %xmm2
|
||||
pshufd $0x4e, %xmm10, %xmm10
|
||||
pxor %xmm6, %xmm3
|
||||
pxor %xmm14, %xmm11
|
||||
paddd %xmm3, %xmm5
|
||||
paddd %xmm11, %xmm13
|
||||
/* re-seed the temporaries so the next invocation again starts from
   xmm4 = xmm1 and xmm12 = xmm9 */
movdqa %xmm1, %xmm4
|
||||
movdqa %xmm9, %xmm12
|
||||
movdqa %xmm5, %xmm6
|
||||
movdqa %xmm13, %xmm14
|
||||
pslld $18, %xmm5
|
||||
pslld $18, %xmm13
|
||||
psrld $14, %xmm6
|
||||
psrld $14, %xmm14
|
||||
pxor %xmm5, %xmm0
|
||||
pxor %xmm13, %xmm8
|
||||
pshufd $0x39, %xmm3, %xmm3
|
||||
pshufd $0x39, %xmm11, %xmm11
|
||||
pxor %xmm6, %xmm0
|
||||
pxor %xmm14, %xmm8
|
||||
.endm
|
||||
|
||||
/*
 * xmm_dual_salsa8_core -- expands to four invocations of
 * xmm_dual_salsa8_core_doubleround, transforming state A (xmm0-xmm3) and
 * state B (xmm8-xmm11) in place.
 *
 * The two movdqa instructions seed xmm4 with xmm1 and xmm12 with xmm9, which
 * is what the first paddd pair of the double round adds xmm0 / xmm8 to.
 * The expanded double rounds also overwrite xmm5, xmm6, xmm13 and xmm14;
 * xmm7 and xmm15 are not referenced.
 */
.macro xmm_dual_salsa8_core
|
||||
movdqa %xmm1, %xmm4
|
||||
movdqa %xmm9, %xmm12
|
||||
xmm_dual_salsa8_core_doubleround
|
||||
xmm_dual_salsa8_core_doubleround
|
||||
xmm_dual_salsa8_core_doubleround
|
||||
xmm_dual_salsa8_core_doubleround
|
||||
.endm
|
||||
|
||||
|
||||
/*
 * dual_scrypt_core -- runs the same two-loop mixing procedure on two
 * independent 128-byte states at once, using xmm_dual_salsa8_core.
 *
 *   %rdi  first 128-byte state, updated in place
 *   %rsi  second 128-byte state, updated in place
 *   %rdx  scratch area; the first loop writes 262144 bytes to it as
 *         1024 slots of 256 bytes (state 1 snapshot, then state 2 snapshot)
 *
 * When WIN64 is defined the three arguments are taken from %rcx, %rdx and
 * %r8 instead, and xmm6-xmm15, %rdi and %rsi are saved and restored.
 *
 * All three buffers are accessed with movdqa (aligned 16-byte accesses).
 * NOTE(review): callers presumably must supply 16-byte aligned pointers --
 * confirm against the C side.
 */
.text
|
||||
.align 32
|
||||
.globl dual_scrypt_core
|
||||
.globl _dual_scrypt_core
|
||||
dual_scrypt_core:
|
||||
_dual_scrypt_core:
|
||||
pushq %rbx
|
||||
pushq %rbp
|
||||
#if defined(WIN64)
|
||||
/* save xmm6-xmm15, then %rdi/%rsi, and move the arguments from
   %rcx/%rdx/%r8 into %rdi/%rsi/%rdx as used by the rest of the routine */
subq $176, %rsp
|
||||
movdqa %xmm6, 8(%rsp)
|
||||
movdqa %xmm7, 24(%rsp)
|
||||
movdqa %xmm8, 40(%rsp)
|
||||
movdqa %xmm9, 56(%rsp)
|
||||
movdqa %xmm10, 72(%rsp)
|
||||
movdqa %xmm11, 88(%rsp)
|
||||
movdqa %xmm12, 104(%rsp)
|
||||
movdqa %xmm13, 120(%rsp)
|
||||
movdqa %xmm14, 136(%rsp)
|
||||
movdqa %xmm15, 152(%rsp)
|
||||
pushq %rdi
|
||||
pushq %rsi
|
||||
movq %rcx, %rdi
|
||||
movq %rdx, %rsi
|
||||
movq %r8, %rdx
|
||||
#endif
|
||||
|
||||
/* Entry shuffle.  Within each 64-byte half of each state the 32-bit words
   at these byte offsets are exchanged pairwise: 12<->60, 28<->44, 8<->40,
   16<->48, 4<->20, 36<->52.  Offsets 0, 24, 32 and 56 are left in place.
   "1st"/"2nd" block = the two halves at (%rdi); "3rd"/"4th" = the two
   halves at (%rsi). */
# shuffle 1st block
|
||||
movl 60(%rdi), %ebp
|
||||
movl 44(%rdi), %ecx
|
||||
movl 28(%rdi), %ebx
|
||||
movl 12(%rdi), %eax
|
||||
movl %ebp, 12(%rdi)
|
||||
movl %ecx, 28(%rdi)
|
||||
movl %ebx, 44(%rdi)
|
||||
movl %eax, 60(%rdi)
|
||||
movl 40(%rdi), %ecx
|
||||
movl 8(%rdi), %eax
|
||||
movl 48(%rdi), %ebp
|
||||
movl 16(%rdi), %ebx
|
||||
movl %ecx, 8(%rdi)
|
||||
movl %eax, 40(%rdi)
|
||||
movl %ebp, 16(%rdi)
|
||||
movl %ebx, 48(%rdi)
|
||||
movl 20(%rdi), %ebx
|
||||
movl 4(%rdi), %eax
|
||||
movl 52(%rdi), %ebp
|
||||
movl 36(%rdi), %ecx
|
||||
movl %ebx, 4(%rdi)
|
||||
movl %eax, 20(%rdi)
|
||||
movl %ebp, 36(%rdi)
|
||||
movl %ecx, 52(%rdi)
|
||||
|
||||
# shuffle 2nd block
|
||||
movl 124(%rdi), %ebp
|
||||
movl 108(%rdi), %ecx
|
||||
movl 92(%rdi), %ebx
|
||||
movl 76(%rdi), %eax
|
||||
movl %ebp, 76(%rdi)
|
||||
movl %ecx, 92(%rdi)
|
||||
movl %ebx, 108(%rdi)
|
||||
movl %eax, 124(%rdi)
|
||||
movl 104(%rdi), %ecx
|
||||
movl 72(%rdi), %eax
|
||||
movl 112(%rdi), %ebp
|
||||
movl 80(%rdi), %ebx
|
||||
movl %ecx, 72(%rdi)
|
||||
movl %eax, 104(%rdi)
|
||||
movl %ebp, 80(%rdi)
|
||||
movl %ebx, 112(%rdi)
|
||||
movl 84(%rdi), %ebx
|
||||
movl 68(%rdi), %eax
|
||||
movl 116(%rdi), %ebp
|
||||
movl 100(%rdi), %ecx
|
||||
movl %ebx, 68(%rdi)
|
||||
movl %eax, 84(%rdi)
|
||||
movl %ebp, 100(%rdi)
|
||||
movl %ecx, 116(%rdi)
|
||||
|
||||
# shuffle 3rd block
|
||||
movl 60(%rsi), %ebp
|
||||
movl 44(%rsi), %ecx
|
||||
movl 28(%rsi), %ebx
|
||||
movl 12(%rsi), %eax
|
||||
movl %ebp, 12(%rsi)
|
||||
movl %ecx, 28(%rsi)
|
||||
movl %ebx, 44(%rsi)
|
||||
movl %eax, 60(%rsi)
|
||||
movl 40(%rsi), %ecx
|
||||
movl 8(%rsi), %eax
|
||||
movl 48(%rsi), %ebp
|
||||
movl 16(%rsi), %ebx
|
||||
movl %ecx, 8(%rsi)
|
||||
movl %eax, 40(%rsi)
|
||||
movl %ebp, 16(%rsi)
|
||||
movl %ebx, 48(%rsi)
|
||||
movl 20(%rsi), %ebx
|
||||
movl 4(%rsi), %eax
|
||||
movl 52(%rsi), %ebp
|
||||
movl 36(%rsi), %ecx
|
||||
movl %ebx, 4(%rsi)
|
||||
movl %eax, 20(%rsi)
|
||||
movl %ebp, 36(%rsi)
|
||||
movl %ecx, 52(%rsi)
|
||||
|
||||
# shuffle 4th block
|
||||
movl 124(%rsi), %ebp
|
||||
movl 108(%rsi), %ecx
|
||||
movl 92(%rsi), %ebx
|
||||
movl 76(%rsi), %eax
|
||||
movl %ebp, 76(%rsi)
|
||||
movl %ecx, 92(%rsi)
|
||||
movl %ebx, 108(%rsi)
|
||||
movl %eax, 124(%rsi)
|
||||
movl 104(%rsi), %ecx
|
||||
movl 72(%rsi), %eax
|
||||
movl 112(%rsi), %ebp
|
||||
movl 80(%rsi), %ebx
|
||||
movl %ecx, 72(%rsi)
|
||||
movl %eax, 104(%rsi)
|
||||
movl %ebp, 80(%rsi)
|
||||
movl %ebx, 112(%rsi)
|
||||
movl 84(%rsi), %ebx
|
||||
movl 68(%rsi), %eax
|
||||
movl 116(%rsi), %ebp
|
||||
movl 100(%rsi), %ecx
|
||||
movl %ebx, 68(%rsi)
|
||||
movl %eax, 84(%rsi)
|
||||
movl %ebp, 100(%rsi)
|
||||
movl %ecx, 116(%rsi)
|
||||
|
||||
/* Loop 1: %rbp walks the scratch area from %rdx to %rdx + 262144 in
   256-byte steps (1024 iterations); %rcx holds the end address. */
movq %rdx, %rbp
|
||||
leaq 262144(%rdx), %rcx
|
||||
.align 8
|
||||
dual_scrypt_core_loop1:
|
||||
/* load both states: state 1 -> xmm0-xmm7, state 2 -> xmm8-xmm15 */
movdqa 0(%rdi), %xmm0
|
||||
movdqa 16(%rdi), %xmm1
|
||||
movdqa 32(%rdi), %xmm2
|
||||
movdqa 48(%rdi), %xmm3
|
||||
movdqa 64(%rdi), %xmm4
|
||||
movdqa 80(%rdi), %xmm5
|
||||
movdqa 96(%rdi), %xmm6
|
||||
movdqa 112(%rdi), %xmm7
|
||||
movdqa 0(%rsi), %xmm8
|
||||
movdqa 16(%rsi), %xmm9
|
||||
movdqa 32(%rsi), %xmm10
|
||||
movdqa 48(%rsi), %xmm11
|
||||
movdqa 64(%rsi), %xmm12
|
||||
movdqa 80(%rsi), %xmm13
|
||||
movdqa 96(%rsi), %xmm14
|
||||
movdqa 112(%rsi), %xmm15
|
||||
/* snapshot both states into the current 256-byte scratch slot */
movdqa %xmm0, 0(%rbp)
|
||||
movdqa %xmm1, 16(%rbp)
|
||||
movdqa %xmm2, 32(%rbp)
|
||||
movdqa %xmm3, 48(%rbp)
|
||||
movdqa %xmm4, 64(%rbp)
|
||||
movdqa %xmm5, 80(%rbp)
|
||||
movdqa %xmm6, 96(%rbp)
|
||||
movdqa %xmm7, 112(%rbp)
|
||||
movdqa %xmm8, 128(%rbp)
|
||||
movdqa %xmm9, 144(%rbp)
|
||||
movdqa %xmm10, 160(%rbp)
|
||||
movdqa %xmm11, 176(%rbp)
|
||||
movdqa %xmm12, 192(%rbp)
|
||||
movdqa %xmm13, 208(%rbp)
|
||||
movdqa %xmm14, 224(%rbp)
|
||||
movdqa %xmm15, 240(%rbp)
|
||||
|
||||
/* first half ^= second half; the result is stored back so it can be
   added to the core output afterwards (the core overwrites xmm4-xmm6
   and xmm12-xmm14) */
pxor %xmm4, %xmm0
|
||||
pxor %xmm5, %xmm1
|
||||
pxor %xmm6, %xmm2
|
||||
pxor %xmm7, %xmm3
|
||||
pxor %xmm12, %xmm8
|
||||
pxor %xmm13, %xmm9
|
||||
pxor %xmm14, %xmm10
|
||||
pxor %xmm15, %xmm11
|
||||
movdqa %xmm0, 0(%rdi)
|
||||
movdqa %xmm1, 16(%rdi)
|
||||
movdqa %xmm2, 32(%rdi)
|
||||
movdqa %xmm3, 48(%rdi)
|
||||
movdqa %xmm8, 0(%rsi)
|
||||
movdqa %xmm9, 16(%rsi)
|
||||
movdqa %xmm10, 32(%rsi)
|
||||
movdqa %xmm11, 48(%rsi)
|
||||
xmm_dual_salsa8_core
|
||||
/* feed-forward: add the stored core input, write the new first half */
paddd 0(%rdi), %xmm0
|
||||
paddd 16(%rdi), %xmm1
|
||||
paddd 32(%rdi), %xmm2
|
||||
paddd 48(%rdi), %xmm3
|
||||
paddd 0(%rsi), %xmm8
|
||||
paddd 16(%rsi), %xmm9
|
||||
paddd 32(%rsi), %xmm10
|
||||
paddd 48(%rsi), %xmm11
|
||||
movdqa %xmm0, 0(%rdi)
|
||||
movdqa %xmm1, 16(%rdi)
|
||||
movdqa %xmm2, 32(%rdi)
|
||||
movdqa %xmm3, 48(%rdi)
|
||||
movdqa %xmm8, 0(%rsi)
|
||||
movdqa %xmm9, 16(%rsi)
|
||||
movdqa %xmm10, 32(%rsi)
|
||||
movdqa %xmm11, 48(%rsi)
|
||||
|
||||
/* second half: XOR the new first half with the second half read back
   from memory, run the core, add the core input, store */
pxor 64(%rdi), %xmm0
|
||||
pxor 80(%rdi), %xmm1
|
||||
pxor 96(%rdi), %xmm2
|
||||
pxor 112(%rdi), %xmm3
|
||||
pxor 64(%rsi), %xmm8
|
||||
pxor 80(%rsi), %xmm9
|
||||
pxor 96(%rsi), %xmm10
|
||||
pxor 112(%rsi), %xmm11
|
||||
movdqa %xmm0, 64(%rdi)
|
||||
movdqa %xmm1, 80(%rdi)
|
||||
movdqa %xmm2, 96(%rdi)
|
||||
movdqa %xmm3, 112(%rdi)
|
||||
movdqa %xmm8, 64(%rsi)
|
||||
movdqa %xmm9, 80(%rsi)
|
||||
movdqa %xmm10, 96(%rsi)
|
||||
movdqa %xmm11, 112(%rsi)
|
||||
xmm_dual_salsa8_core
|
||||
paddd 64(%rdi), %xmm0
|
||||
paddd 80(%rdi), %xmm1
|
||||
paddd 96(%rdi), %xmm2
|
||||
paddd 112(%rdi), %xmm3
|
||||
paddd 64(%rsi), %xmm8
|
||||
paddd 80(%rsi), %xmm9
|
||||
paddd 96(%rsi), %xmm10
|
||||
paddd 112(%rsi), %xmm11
|
||||
movdqa %xmm0, 64(%rdi)
|
||||
movdqa %xmm1, 80(%rdi)
|
||||
movdqa %xmm2, 96(%rdi)
|
||||
movdqa %xmm3, 112(%rdi)
|
||||
movdqa %xmm8, 64(%rsi)
|
||||
movdqa %xmm9, 80(%rsi)
|
||||
movdqa %xmm10, 96(%rsi)
|
||||
movdqa %xmm11, 112(%rsi)
|
||||
|
||||
addq $256, %rbp
|
||||
cmpq %rcx, %rbp
|
||||
jne dual_scrypt_core_loop1
|
||||
|
||||
/* Loop 2: 1024 iterations counted down in %rcx.  Each state picks its own
   scratch slot from the 32-bit word at offset 64 of that state. */
movq $1024, %rcx
|
||||
.align 8
|
||||
dual_scrypt_core_loop2:
|
||||
/* state 1: slot = word & 1023, byte offset = slot * 256 */
movl 64(%rdi), %ebp
|
||||
andl $1023, %ebp
|
||||
shll $8, %ebp
|
||||
movdqa 0(%rdx, %rbp), %xmm0
|
||||
movdqa 16(%rdx, %rbp), %xmm1
|
||||
movdqa 32(%rdx, %rbp), %xmm2
|
||||
movdqa 48(%rdx, %rbp), %xmm3
|
||||
movdqa 64(%rdx, %rbp), %xmm4
|
||||
movdqa 80(%rdx, %rbp), %xmm5
|
||||
movdqa 96(%rdx, %rbp), %xmm6
|
||||
movdqa 112(%rdx, %rbp), %xmm7
|
||||
/* state 2: same computation, plus 128 to reach the second snapshot
   stored in the slot */
movl 64(%rsi), %ebp
|
||||
andl $1023, %ebp
|
||||
shll $8, %ebp
|
||||
addl $128, %ebp
|
||||
movdqa 0(%rdx, %rbp), %xmm8
|
||||
movdqa 16(%rdx, %rbp), %xmm9
|
||||
movdqa 32(%rdx, %rbp), %xmm10
|
||||
movdqa 48(%rdx, %rbp), %xmm11
|
||||
movdqa 64(%rdx, %rbp), %xmm12
|
||||
movdqa 80(%rdx, %rbp), %xmm13
|
||||
movdqa 96(%rdx, %rbp), %xmm14
|
||||
movdqa 112(%rdx, %rbp), %xmm15
|
||||
/* XOR the fetched snapshots with the current states */
pxor 0(%rdi), %xmm0
|
||||
pxor 16(%rdi), %xmm1
|
||||
pxor 32(%rdi), %xmm2
|
||||
pxor 48(%rdi), %xmm3
|
||||
pxor 64(%rdi), %xmm4
|
||||
pxor 80(%rdi), %xmm5
|
||||
pxor 96(%rdi), %xmm6
|
||||
pxor 112(%rdi), %xmm7
|
||||
pxor 0(%rsi), %xmm8
|
||||
pxor 16(%rsi), %xmm9
|
||||
pxor 32(%rsi), %xmm10
|
||||
pxor 48(%rsi), %xmm11
|
||||
pxor 64(%rsi), %xmm12
|
||||
pxor 80(%rsi), %xmm13
|
||||
pxor 96(%rsi), %xmm14
|
||||
pxor 112(%rsi), %xmm15
|
||||
|
||||
/* from here on the same two-half mixing as in loop 1; all 128 bytes of
   each state are written back first because the second halves changed */
pxor %xmm4, %xmm0
|
||||
pxor %xmm5, %xmm1
|
||||
pxor %xmm6, %xmm2
|
||||
pxor %xmm7, %xmm3
|
||||
pxor %xmm12, %xmm8
|
||||
pxor %xmm13, %xmm9
|
||||
pxor %xmm14, %xmm10
|
||||
pxor %xmm15, %xmm11
|
||||
movdqa %xmm0, 0(%rdi)
|
||||
movdqa %xmm1, 16(%rdi)
|
||||
movdqa %xmm2, 32(%rdi)
|
||||
movdqa %xmm3, 48(%rdi)
|
||||
movdqa %xmm4, 64(%rdi)
|
||||
movdqa %xmm5, 80(%rdi)
|
||||
movdqa %xmm6, 96(%rdi)
|
||||
movdqa %xmm7, 112(%rdi)
|
||||
movdqa %xmm8, 0(%rsi)
|
||||
movdqa %xmm9, 16(%rsi)
|
||||
movdqa %xmm10, 32(%rsi)
|
||||
movdqa %xmm11, 48(%rsi)
|
||||
movdqa %xmm12, 64(%rsi)
|
||||
movdqa %xmm13, 80(%rsi)
|
||||
movdqa %xmm14, 96(%rsi)
|
||||
movdqa %xmm15, 112(%rsi)
|
||||
xmm_dual_salsa8_core
|
||||
paddd 0(%rdi), %xmm0
|
||||
paddd 16(%rdi), %xmm1
|
||||
paddd 32(%rdi), %xmm2
|
||||
paddd 48(%rdi), %xmm3
|
||||
paddd 0(%rsi), %xmm8
|
||||
paddd 16(%rsi), %xmm9
|
||||
paddd 32(%rsi), %xmm10
|
||||
paddd 48(%rsi), %xmm11
|
||||
movdqa %xmm0, 0(%rdi)
|
||||
movdqa %xmm1, 16(%rdi)
|
||||
movdqa %xmm2, 32(%rdi)
|
||||
movdqa %xmm3, 48(%rdi)
|
||||
movdqa %xmm8, 0(%rsi)
|
||||
movdqa %xmm9, 16(%rsi)
|
||||
movdqa %xmm10, 32(%rsi)
|
||||
movdqa %xmm11, 48(%rsi)
|
||||
|
||||
pxor 64(%rdi), %xmm0
|
||||
pxor 80(%rdi), %xmm1
|
||||
pxor 96(%rdi), %xmm2
|
||||
pxor 112(%rdi), %xmm3
|
||||
pxor 64(%rsi), %xmm8
|
||||
pxor 80(%rsi), %xmm9
|
||||
pxor 96(%rsi), %xmm10
|
||||
pxor 112(%rsi), %xmm11
|
||||
movdqa %xmm0, 64(%rdi)
|
||||
movdqa %xmm1, 80(%rdi)
|
||||
movdqa %xmm2, 96(%rdi)
|
||||
movdqa %xmm3, 112(%rdi)
|
||||
movdqa %xmm8, 64(%rsi)
|
||||
movdqa %xmm9, 80(%rsi)
|
||||
movdqa %xmm10, 96(%rsi)
|
||||
movdqa %xmm11, 112(%rsi)
|
||||
xmm_dual_salsa8_core
|
||||
paddd 64(%rdi), %xmm0
|
||||
paddd 80(%rdi), %xmm1
|
||||
paddd 96(%rdi), %xmm2
|
||||
paddd 112(%rdi), %xmm3
|
||||
paddd 64(%rsi), %xmm8
|
||||
paddd 80(%rsi), %xmm9
|
||||
paddd 96(%rsi), %xmm10
|
||||
paddd 112(%rsi), %xmm11
|
||||
movdqa %xmm0, 64(%rdi)
|
||||
movdqa %xmm1, 80(%rdi)
|
||||
movdqa %xmm2, 96(%rdi)
|
||||
movdqa %xmm3, 112(%rdi)
|
||||
movdqa %xmm8, 64(%rsi)
|
||||
movdqa %xmm9, 80(%rsi)
|
||||
movdqa %xmm10, 96(%rsi)
|
||||
movdqa %xmm11, 112(%rsi)
|
||||
|
||||
/* ja is taken while the decremented counter is non-zero, so the loop
   body runs exactly 1024 times */
subq $1, %rcx
|
||||
ja dual_scrypt_core_loop2
|
||||
|
||||
/* Exit shuffle.  These are the same pairwise word exchanges performed on
   entry, so applying them again restores the original word order in both
   states. */
# shuffle 1st block
|
||||
movl 60(%rdi), %ebp
|
||||
movl 44(%rdi), %ecx
|
||||
movl 28(%rdi), %ebx
|
||||
movl 12(%rdi), %eax
|
||||
movl %ebp, 12(%rdi)
|
||||
movl %ecx, 28(%rdi)
|
||||
movl %ebx, 44(%rdi)
|
||||
movl %eax, 60(%rdi)
|
||||
movl 40(%rdi), %ecx
|
||||
movl 8(%rdi), %eax
|
||||
movl 48(%rdi), %ebp
|
||||
movl 16(%rdi), %ebx
|
||||
movl %ecx, 8(%rdi)
|
||||
movl %eax, 40(%rdi)
|
||||
movl %ebp, 16(%rdi)
|
||||
movl %ebx, 48(%rdi)
|
||||
movl 20(%rdi), %ebx
|
||||
movl 4(%rdi), %eax
|
||||
movl 52(%rdi), %ebp
|
||||
movl 36(%rdi), %ecx
|
||||
movl %ebx, 4(%rdi)
|
||||
movl %eax, 20(%rdi)
|
||||
movl %ebp, 36(%rdi)
|
||||
movl %ecx, 52(%rdi)
|
||||
|
||||
# shuffle 2nd block
|
||||
movl 124(%rdi), %ebp
|
||||
movl 108(%rdi), %ecx
|
||||
movl 92(%rdi), %ebx
|
||||
movl 76(%rdi), %eax
|
||||
movl %ebp, 76(%rdi)
|
||||
movl %ecx, 92(%rdi)
|
||||
movl %ebx, 108(%rdi)
|
||||
movl %eax, 124(%rdi)
|
||||
movl 104(%rdi), %ecx
|
||||
movl 72(%rdi), %eax
|
||||
movl 112(%rdi), %ebp
|
||||
movl 80(%rdi), %ebx
|
||||
movl %ecx, 72(%rdi)
|
||||
movl %eax, 104(%rdi)
|
||||
movl %ebp, 80(%rdi)
|
||||
movl %ebx, 112(%rdi)
|
||||
movl 84(%rdi), %ebx
|
||||
movl 68(%rdi), %eax
|
||||
movl 116(%rdi), %ebp
|
||||
movl 100(%rdi), %ecx
|
||||
movl %ebx, 68(%rdi)
|
||||
movl %eax, 84(%rdi)
|
||||
movl %ebp, 100(%rdi)
|
||||
movl %ecx, 116(%rdi)
|
||||
|
||||
# shuffle 3rd block
|
||||
movl 60(%rsi), %ebp
|
||||
movl 44(%rsi), %ecx
|
||||
movl 28(%rsi), %ebx
|
||||
movl 12(%rsi), %eax
|
||||
movl %ebp, 12(%rsi)
|
||||
movl %ecx, 28(%rsi)
|
||||
movl %ebx, 44(%rsi)
|
||||
movl %eax, 60(%rsi)
|
||||
movl 40(%rsi), %ecx
|
||||
movl 8(%rsi), %eax
|
||||
movl 48(%rsi), %ebp
|
||||
movl 16(%rsi), %ebx
|
||||
movl %ecx, 8(%rsi)
|
||||
movl %eax, 40(%rsi)
|
||||
movl %ebp, 16(%rsi)
|
||||
movl %ebx, 48(%rsi)
|
||||
movl 20(%rsi), %ebx
|
||||
movl 4(%rsi), %eax
|
||||
movl 52(%rsi), %ebp
|
||||
movl 36(%rsi), %ecx
|
||||
movl %ebx, 4(%rsi)
|
||||
movl %eax, 20(%rsi)
|
||||
movl %ebp, 36(%rsi)
|
||||
movl %ecx, 52(%rsi)
|
||||
|
||||
# shuffle 4th block
|
||||
movl 124(%rsi), %ebp
|
||||
movl 108(%rsi), %ecx
|
||||
movl 92(%rsi), %ebx
|
||||
movl 76(%rsi), %eax
|
||||
movl %ebp, 76(%rsi)
|
||||
movl %ecx, 92(%rsi)
|
||||
movl %ebx, 108(%rsi)
|
||||
movl %eax, 124(%rsi)
|
||||
movl 104(%rsi), %ecx
|
||||
movl 72(%rsi), %eax
|
||||
movl 112(%rsi), %ebp
|
||||
movl 80(%rsi), %ebx
|
||||
movl %ecx, 72(%rsi)
|
||||
movl %eax, 104(%rsi)
|
||||
movl %ebp, 80(%rsi)
|
||||
movl %ebx, 112(%rsi)
|
||||
movl 84(%rsi), %ebx
|
||||
movl 68(%rsi), %eax
|
||||
movl 116(%rsi), %ebp
|
||||
movl 100(%rsi), %ecx
|
||||
movl %ebx, 68(%rsi)
|
||||
movl %eax, 84(%rsi)
|
||||
movl %ebp, 100(%rsi)
|
||||
movl %ecx, 116(%rsi)
|
||||
|
||||
#if defined(WIN64)
|
||||
/* undo the WIN64 prologue in reverse order: %rsi, %rdi, then xmm6-xmm15 */
popq %rsi
|
||||
popq %rdi
|
||||
movdqa 8(%rsp), %xmm6
|
||||
movdqa 24(%rsp), %xmm7
|
||||
movdqa 40(%rsp), %xmm8
|
||||
movdqa 56(%rsp), %xmm9
|
||||
movdqa 72(%rsp), %xmm10
|
||||
movdqa 88(%rsp), %xmm11
|
||||
movdqa 104(%rsp), %xmm12
|
||||
movdqa 120(%rsp), %xmm13
|
||||
movdqa 136(%rsp), %xmm14
|
||||
movdqa 152(%rsp), %xmm15
|
||||
addq $176, %rsp
|
||||
#endif
|
||||
popq %rbp
|
||||
popq %rbx
|
||||
ret
|
||||
|
||||
#endif
|
||||
|
|
266
scrypt-x86.S
266
scrypt-x86.S
|
@ -24,7 +24,7 @@
|
|||
|
||||
#if defined(__i386__)
|
||||
|
||||
.macro x86_gen_salsa8_core_quadround
|
||||
.macro gen_salsa8_core_quadround
|
||||
movl 52(%esp), %ecx
|
||||
movl 4(%esp), %edx
|
||||
movl 20(%esp), %ebx
|
||||
|
@ -346,18 +346,18 @@
|
|||
|
||||
.text
|
||||
.align 32
|
||||
x86_gen_salsa8_core:
|
||||
x86_gen_salsa8_core_quadround
|
||||
x86_gen_salsa8_core_quadround
|
||||
gen_salsa8_core:
|
||||
gen_salsa8_core_quadround
|
||||
gen_salsa8_core_quadround
|
||||
ret
|
||||
|
||||
|
||||
.text
|
||||
.align 32
|
||||
.globl x86_scrypt_core
|
||||
.globl _x86_scrypt_core
|
||||
x86_scrypt_core:
|
||||
_x86_scrypt_core:
|
||||
.globl scrypt_core
|
||||
.globl _scrypt_core
|
||||
scrypt_core:
|
||||
_scrypt_core:
|
||||
pushl %ebx
|
||||
pushl %ebp
|
||||
pushl %edi
|
||||
|
@ -367,14 +367,14 @@ _x86_scrypt_core:
|
|||
movl $1, %eax
|
||||
cpuid
|
||||
andl $0x04000000, %edx
|
||||
jnz x86_xmm_scrypt_core
|
||||
jnz xmm_scrypt_core
|
||||
|
||||
x86_gen_scrypt_core:
|
||||
gen_scrypt_core:
|
||||
movl 20(%esp), %edi
|
||||
movl 24(%esp), %esi
|
||||
subl $72, %esp
|
||||
|
||||
.macro x86_scrypt_core_macro1a p, q
|
||||
.macro scrypt_core_macro1a p, q
|
||||
movl \p(%edi), %eax
|
||||
movl \q(%edi), %edx
|
||||
movl %eax, \p(%esi)
|
||||
|
@ -384,7 +384,7 @@ x86_gen_scrypt_core:
|
|||
movl %eax, \p(%esp)
|
||||
.endm
|
||||
|
||||
.macro x86_scrypt_core_macro1b p, q
|
||||
.macro scrypt_core_macro1b p, q
|
||||
movl \p(%edi), %eax
|
||||
xorl \p(%esi, %edx), %eax
|
||||
movl \q(%edi), %ebx
|
||||
|
@ -395,7 +395,7 @@ x86_gen_scrypt_core:
|
|||
movl %eax, \p(%esp)
|
||||
.endm
|
||||
|
||||
.macro x86_scrypt_core_macro2 p, q
|
||||
.macro scrypt_core_macro2 p, q
|
||||
movl \p(%esp), %eax
|
||||
addl \p(%edi), %eax
|
||||
movl %eax, \p(%edi)
|
||||
|
@ -404,150 +404,150 @@ x86_gen_scrypt_core:
|
|||
movl %eax, \p(%esp)
|
||||
.endm
|
||||
|
||||
.macro x86_scrypt_core_macro3 p, q
|
||||
.macro scrypt_core_macro3 p, q
|
||||
movl \p(%esp), %eax
|
||||
addl \q(%edi), %eax
|
||||
movl %eax, \q(%edi)
|
||||
.endm
|
||||
|
||||
leal 131072(%esi), %ecx
|
||||
x86_gen_scrypt_core_loop1:
|
||||
gen_scrypt_core_loop1:
|
||||
movl %esi, 64(%esp)
|
||||
movl %ecx, 68(%esp)
|
||||
|
||||
x86_scrypt_core_macro1a 0, 64
|
||||
x86_scrypt_core_macro1a 4, 68
|
||||
x86_scrypt_core_macro1a 8, 72
|
||||
x86_scrypt_core_macro1a 12, 76
|
||||
x86_scrypt_core_macro1a 16, 80
|
||||
x86_scrypt_core_macro1a 20, 84
|
||||
x86_scrypt_core_macro1a 24, 88
|
||||
x86_scrypt_core_macro1a 28, 92
|
||||
x86_scrypt_core_macro1a 32, 96
|
||||
x86_scrypt_core_macro1a 36, 100
|
||||
x86_scrypt_core_macro1a 40, 104
|
||||
x86_scrypt_core_macro1a 44, 108
|
||||
x86_scrypt_core_macro1a 48, 112
|
||||
x86_scrypt_core_macro1a 52, 116
|
||||
x86_scrypt_core_macro1a 56, 120
|
||||
x86_scrypt_core_macro1a 60, 124
|
||||
scrypt_core_macro1a 0, 64
|
||||
scrypt_core_macro1a 4, 68
|
||||
scrypt_core_macro1a 8, 72
|
||||
scrypt_core_macro1a 12, 76
|
||||
scrypt_core_macro1a 16, 80
|
||||
scrypt_core_macro1a 20, 84
|
||||
scrypt_core_macro1a 24, 88
|
||||
scrypt_core_macro1a 28, 92
|
||||
scrypt_core_macro1a 32, 96
|
||||
scrypt_core_macro1a 36, 100
|
||||
scrypt_core_macro1a 40, 104
|
||||
scrypt_core_macro1a 44, 108
|
||||
scrypt_core_macro1a 48, 112
|
||||
scrypt_core_macro1a 52, 116
|
||||
scrypt_core_macro1a 56, 120
|
||||
scrypt_core_macro1a 60, 124
|
||||
|
||||
call x86_gen_salsa8_core
|
||||
call gen_salsa8_core
|
||||
|
||||
movl 92(%esp), %edi
|
||||
x86_scrypt_core_macro2 0, 64
|
||||
x86_scrypt_core_macro2 4, 68
|
||||
x86_scrypt_core_macro2 8, 72
|
||||
x86_scrypt_core_macro2 12, 76
|
||||
x86_scrypt_core_macro2 16, 80
|
||||
x86_scrypt_core_macro2 20, 84
|
||||
x86_scrypt_core_macro2 24, 88
|
||||
x86_scrypt_core_macro2 28, 92
|
||||
x86_scrypt_core_macro2 32, 96
|
||||
x86_scrypt_core_macro2 36, 100
|
||||
x86_scrypt_core_macro2 40, 104
|
||||
x86_scrypt_core_macro2 44, 108
|
||||
x86_scrypt_core_macro2 48, 112
|
||||
x86_scrypt_core_macro2 52, 116
|
||||
x86_scrypt_core_macro2 56, 120
|
||||
x86_scrypt_core_macro2 60, 124
|
||||
scrypt_core_macro2 0, 64
|
||||
scrypt_core_macro2 4, 68
|
||||
scrypt_core_macro2 8, 72
|
||||
scrypt_core_macro2 12, 76
|
||||
scrypt_core_macro2 16, 80
|
||||
scrypt_core_macro2 20, 84
|
||||
scrypt_core_macro2 24, 88
|
||||
scrypt_core_macro2 28, 92
|
||||
scrypt_core_macro2 32, 96
|
||||
scrypt_core_macro2 36, 100
|
||||
scrypt_core_macro2 40, 104
|
||||
scrypt_core_macro2 44, 108
|
||||
scrypt_core_macro2 48, 112
|
||||
scrypt_core_macro2 52, 116
|
||||
scrypt_core_macro2 56, 120
|
||||
scrypt_core_macro2 60, 124
|
||||
|
||||
call x86_gen_salsa8_core
|
||||
call gen_salsa8_core
|
||||
|
||||
movl 92(%esp), %edi
|
||||
x86_scrypt_core_macro3 0, 64
|
||||
x86_scrypt_core_macro3 4, 68
|
||||
x86_scrypt_core_macro3 8, 72
|
||||
x86_scrypt_core_macro3 12, 76
|
||||
x86_scrypt_core_macro3 16, 80
|
||||
x86_scrypt_core_macro3 20, 84
|
||||
x86_scrypt_core_macro3 24, 88
|
||||
x86_scrypt_core_macro3 28, 92
|
||||
x86_scrypt_core_macro3 32, 96
|
||||
x86_scrypt_core_macro3 36, 100
|
||||
x86_scrypt_core_macro3 40, 104
|
||||
x86_scrypt_core_macro3 44, 108
|
||||
x86_scrypt_core_macro3 48, 112
|
||||
x86_scrypt_core_macro3 52, 116
|
||||
x86_scrypt_core_macro3 56, 120
|
||||
x86_scrypt_core_macro3 60, 124
|
||||
scrypt_core_macro3 0, 64
|
||||
scrypt_core_macro3 4, 68
|
||||
scrypt_core_macro3 8, 72
|
||||
scrypt_core_macro3 12, 76
|
||||
scrypt_core_macro3 16, 80
|
||||
scrypt_core_macro3 20, 84
|
||||
scrypt_core_macro3 24, 88
|
||||
scrypt_core_macro3 28, 92
|
||||
scrypt_core_macro3 32, 96
|
||||
scrypt_core_macro3 36, 100
|
||||
scrypt_core_macro3 40, 104
|
||||
scrypt_core_macro3 44, 108
|
||||
scrypt_core_macro3 48, 112
|
||||
scrypt_core_macro3 52, 116
|
||||
scrypt_core_macro3 56, 120
|
||||
scrypt_core_macro3 60, 124
|
||||
|
||||
movl 64(%esp), %esi
|
||||
movl 68(%esp), %ecx
|
||||
addl $128, %esi
|
||||
cmpl %ecx, %esi
|
||||
jne x86_gen_scrypt_core_loop1
|
||||
jne gen_scrypt_core_loop1
|
||||
|
||||
movl 96(%esp), %esi
|
||||
movl $1024, %ecx
|
||||
x86_gen_scrypt_core_loop2:
|
||||
gen_scrypt_core_loop2:
|
||||
movl %ecx, 68(%esp)
|
||||
|
||||
movl 64(%edi), %edx
|
||||
andl $1023, %edx
|
||||
shll $7, %edx
|
||||
|
||||
x86_scrypt_core_macro1b 0, 64
|
||||
x86_scrypt_core_macro1b 4, 68
|
||||
x86_scrypt_core_macro1b 8, 72
|
||||
x86_scrypt_core_macro1b 12, 76
|
||||
x86_scrypt_core_macro1b 16, 80
|
||||
x86_scrypt_core_macro1b 20, 84
|
||||
x86_scrypt_core_macro1b 24, 88
|
||||
x86_scrypt_core_macro1b 28, 92
|
||||
x86_scrypt_core_macro1b 32, 96
|
||||
x86_scrypt_core_macro1b 36, 100
|
||||
x86_scrypt_core_macro1b 40, 104
|
||||
x86_scrypt_core_macro1b 44, 108
|
||||
x86_scrypt_core_macro1b 48, 112
|
||||
x86_scrypt_core_macro1b 52, 116
|
||||
x86_scrypt_core_macro1b 56, 120
|
||||
x86_scrypt_core_macro1b 60, 124
|
||||
scrypt_core_macro1b 0, 64
|
||||
scrypt_core_macro1b 4, 68
|
||||
scrypt_core_macro1b 8, 72
|
||||
scrypt_core_macro1b 12, 76
|
||||
scrypt_core_macro1b 16, 80
|
||||
scrypt_core_macro1b 20, 84
|
||||
scrypt_core_macro1b 24, 88
|
||||
scrypt_core_macro1b 28, 92
|
||||
scrypt_core_macro1b 32, 96
|
||||
scrypt_core_macro1b 36, 100
|
||||
scrypt_core_macro1b 40, 104
|
||||
scrypt_core_macro1b 44, 108
|
||||
scrypt_core_macro1b 48, 112
|
||||
scrypt_core_macro1b 52, 116
|
||||
scrypt_core_macro1b 56, 120
|
||||
scrypt_core_macro1b 60, 124
|
||||
|
||||
call x86_gen_salsa8_core
|
||||
call gen_salsa8_core
|
||||
|
||||
movl 92(%esp), %edi
|
||||
x86_scrypt_core_macro2 0, 64
|
||||
x86_scrypt_core_macro2 4, 68
|
||||
x86_scrypt_core_macro2 8, 72
|
||||
x86_scrypt_core_macro2 12, 76
|
||||
x86_scrypt_core_macro2 16, 80
|
||||
x86_scrypt_core_macro2 20, 84
|
||||
x86_scrypt_core_macro2 24, 88
|
||||
x86_scrypt_core_macro2 28, 92
|
||||
x86_scrypt_core_macro2 32, 96
|
||||
x86_scrypt_core_macro2 36, 100
|
||||
x86_scrypt_core_macro2 40, 104
|
||||
x86_scrypt_core_macro2 44, 108
|
||||
x86_scrypt_core_macro2 48, 112
|
||||
x86_scrypt_core_macro2 52, 116
|
||||
x86_scrypt_core_macro2 56, 120
|
||||
x86_scrypt_core_macro2 60, 124
|
||||
scrypt_core_macro2 0, 64
|
||||
scrypt_core_macro2 4, 68
|
||||
scrypt_core_macro2 8, 72
|
||||
scrypt_core_macro2 12, 76
|
||||
scrypt_core_macro2 16, 80
|
||||
scrypt_core_macro2 20, 84
|
||||
scrypt_core_macro2 24, 88
|
||||
scrypt_core_macro2 28, 92
|
||||
scrypt_core_macro2 32, 96
|
||||
scrypt_core_macro2 36, 100
|
||||
scrypt_core_macro2 40, 104
|
||||
scrypt_core_macro2 44, 108
|
||||
scrypt_core_macro2 48, 112
|
||||
scrypt_core_macro2 52, 116
|
||||
scrypt_core_macro2 56, 120
|
||||
scrypt_core_macro2 60, 124
|
||||
|
||||
call x86_gen_salsa8_core
|
||||
call gen_salsa8_core
|
||||
|
||||
movl 92(%esp), %edi
|
||||
movl 96(%esp), %esi
|
||||
x86_scrypt_core_macro3 0, 64
|
||||
x86_scrypt_core_macro3 4, 68
|
||||
x86_scrypt_core_macro3 8, 72
|
||||
x86_scrypt_core_macro3 12, 76
|
||||
x86_scrypt_core_macro3 16, 80
|
||||
x86_scrypt_core_macro3 20, 84
|
||||
x86_scrypt_core_macro3 24, 88
|
||||
x86_scrypt_core_macro3 28, 92
|
||||
x86_scrypt_core_macro3 32, 96
|
||||
x86_scrypt_core_macro3 36, 100
|
||||
x86_scrypt_core_macro3 40, 104
|
||||
x86_scrypt_core_macro3 44, 108
|
||||
x86_scrypt_core_macro3 48, 112
|
||||
x86_scrypt_core_macro3 52, 116
|
||||
x86_scrypt_core_macro3 56, 120
|
||||
x86_scrypt_core_macro3 60, 124
|
||||
scrypt_core_macro3 0, 64
|
||||
scrypt_core_macro3 4, 68
|
||||
scrypt_core_macro3 8, 72
|
||||
scrypt_core_macro3 12, 76
|
||||
scrypt_core_macro3 16, 80
|
||||
scrypt_core_macro3 20, 84
|
||||
scrypt_core_macro3 24, 88
|
||||
scrypt_core_macro3 28, 92
|
||||
scrypt_core_macro3 32, 96
|
||||
scrypt_core_macro3 36, 100
|
||||
scrypt_core_macro3 40, 104
|
||||
scrypt_core_macro3 44, 108
|
||||
scrypt_core_macro3 48, 112
|
||||
scrypt_core_macro3 52, 116
|
||||
scrypt_core_macro3 56, 120
|
||||
scrypt_core_macro3 60, 124
|
||||
|
||||
movl 68(%esp), %ecx
|
||||
subl $1, %ecx
|
||||
ja x86_gen_scrypt_core_loop2
|
||||
ja gen_scrypt_core_loop2
|
||||
|
||||
addl $72, %esp
|
||||
popl %esi
|
||||
|
@ -557,7 +557,7 @@ x86_gen_scrypt_core_loop2:
|
|||
ret
|
||||
|
||||
|
||||
.macro x86_xmm_salsa8_core_doubleround
|
||||
.macro xmm_salsa8_core_doubleround
|
||||
paddd %xmm0, %xmm4
|
||||
movdqa %xmm0, %xmm5
|
||||
movdqa %xmm4, %xmm6
|
||||
|
@ -624,16 +624,16 @@ x86_gen_scrypt_core_loop2:
|
|||
pxor %xmm6, %xmm0
|
||||
.endm
|
||||
|
||||
.macro x86_xmm_salsa8_core
|
||||
.macro xmm_salsa8_core
|
||||
movdqa %xmm1, %xmm4
|
||||
x86_xmm_salsa8_core_doubleround
|
||||
x86_xmm_salsa8_core_doubleround
|
||||
x86_xmm_salsa8_core_doubleround
|
||||
x86_xmm_salsa8_core_doubleround
|
||||
xmm_salsa8_core_doubleround
|
||||
xmm_salsa8_core_doubleround
|
||||
xmm_salsa8_core_doubleround
|
||||
xmm_salsa8_core_doubleround
|
||||
.endm
|
||||
|
||||
.align 32
|
||||
x86_xmm_scrypt_core:
|
||||
xmm_scrypt_core:
|
||||
movl 20(%esp), %edi
|
||||
movl 24(%esp), %esi
|
||||
movl %esp, %ebp
|
||||
|
@ -710,7 +710,7 @@ x86_xmm_scrypt_core:
|
|||
|
||||
movl %esi, %edx
|
||||
leal 131072(%esi), %ecx
|
||||
x86_xmm_scrypt_core_loop1:
|
||||
xmm_scrypt_core_loop1:
|
||||
movdqa 0(%esp), %xmm0
|
||||
movdqa 16(%esp), %xmm1
|
||||
movdqa 32(%esp), %xmm2
|
||||
|
@ -736,7 +736,7 @@ x86_xmm_scrypt_core_loop1:
|
|||
movdqa %xmm1, 16(%esp)
|
||||
movdqa %xmm2, 32(%esp)
|
||||
movdqa %xmm3, 48(%esp)
|
||||
x86_xmm_salsa8_core
|
||||
xmm_salsa8_core
|
||||
paddd 0(%esp), %xmm0
|
||||
paddd 16(%esp), %xmm1
|
||||
paddd 32(%esp), %xmm2
|
||||
|
@ -754,7 +754,7 @@ x86_xmm_scrypt_core_loop1:
|
|||
movdqa %xmm1, 80(%esp)
|
||||
movdqa %xmm2, 96(%esp)
|
||||
movdqa %xmm3, 112(%esp)
|
||||
x86_xmm_salsa8_core
|
||||
xmm_salsa8_core
|
||||
paddd 64(%esp), %xmm0
|
||||
paddd 80(%esp), %xmm1
|
||||
paddd 96(%esp), %xmm2
|
||||
|
@ -766,10 +766,10 @@ x86_xmm_scrypt_core_loop1:
|
|||
|
||||
addl $128, %edx
|
||||
cmpl %ecx, %edx
|
||||
jne x86_xmm_scrypt_core_loop1
|
||||
jne xmm_scrypt_core_loop1
|
||||
|
||||
movl $1024, %ecx
|
||||
x86_xmm_scrypt_core_loop2:
|
||||
xmm_scrypt_core_loop2:
|
||||
movdqa 0(%esp), %xmm0
|
||||
movdqa 16(%esp), %xmm1
|
||||
movdqa 32(%esp), %xmm2
|
||||
|
@ -802,7 +802,7 @@ x86_xmm_scrypt_core_loop2:
|
|||
movdqa %xmm1, 16(%esp)
|
||||
movdqa %xmm2, 32(%esp)
|
||||
movdqa %xmm3, 48(%esp)
|
||||
x86_xmm_salsa8_core
|
||||
xmm_salsa8_core
|
||||
paddd 0(%esp), %xmm0
|
||||
paddd 16(%esp), %xmm1
|
||||
paddd 32(%esp), %xmm2
|
||||
|
@ -820,7 +820,7 @@ x86_xmm_scrypt_core_loop2:
|
|||
movdqa %xmm1, 80(%esp)
|
||||
movdqa %xmm2, 96(%esp)
|
||||
movdqa %xmm3, 112(%esp)
|
||||
x86_xmm_salsa8_core
|
||||
xmm_salsa8_core
|
||||
paddd 64(%esp), %xmm0
|
||||
paddd 80(%esp), %xmm1
|
||||
paddd 96(%esp), %xmm2
|
||||
|
@ -831,7 +831,7 @@ x86_xmm_scrypt_core_loop2:
|
|||
movdqa %xmm3, 112(%esp)
|
||||
|
||||
subl $1, %ecx
|
||||
ja x86_xmm_scrypt_core_loop2
|
||||
ja xmm_scrypt_core_loop2
|
||||
|
||||
# re-shuffle 1st block back
|
||||
movl 60(%esp), %edx
|
||||
|
|
241
scrypt.c
241
scrypt.c
|
@ -193,79 +193,13 @@ SHA256_InitState(uint32_t * state)
|
|||
static const uint32_t passwdpad[12] = {0x00000080, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0x80020000};
|
||||
static const uint32_t outerpad[8] = {0x80000000, 0, 0, 0, 0, 0, 0, 0x00000300};
|
||||
|
||||
/**
|
||||
* PBKDF2_SHA256(passwd, passwdlen, salt, saltlen, c, buf, dkLen):
|
||||
* Compute PBKDF2(passwd, salt, c, dkLen) using HMAC-SHA256 as the PRF, and
|
||||
* write the output to buf. The value dkLen must be at most 32 * (2^32 - 1).
|
||||
*/
|
||||
static inline void
|
||||
PBKDF2_SHA256_80_128(const uint32_t * passwd, uint32_t * buf)
|
||||
PBKDF2_SHA256_80_128_init(const uint32_t *passwd, uint32_t tstate[8], uint32_t ostate[8])
|
||||
{
|
||||
SHA256_CTX PShictx, PShoctx;
|
||||
uint32_t tstate[8];
|
||||
uint32_t ihash[8];
|
||||
uint32_t i;
|
||||
uint32_t pad[16];
|
||||
|
||||
static const uint32_t innerpad[11] = {0x00000080, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0xa0040000};
|
||||
|
||||
/* If Klen > 64, the key is really SHA256(K). */
|
||||
SHA256_InitState(tstate);
|
||||
SHA256_Transform(tstate, passwd, 1);
|
||||
memcpy(pad, passwd+16, 16);
|
||||
memcpy(pad+4, passwdpad, 48);
|
||||
SHA256_Transform(tstate, pad, 1);
|
||||
memcpy(ihash, tstate, 32);
|
||||
|
||||
SHA256_InitState(PShictx.state);
|
||||
for (i = 0; i < 8; i++)
|
||||
pad[i] = ihash[i] ^ 0x36363636;
|
||||
for (; i < 16; i++)
|
||||
pad[i] = 0x36363636;
|
||||
SHA256_Transform(PShictx.state, pad, 0);
|
||||
SHA256_Transform(PShictx.state, passwd, 1);
|
||||
be32enc_vect(PShictx.buf, passwd+16, 4);
|
||||
be32enc_vect(PShictx.buf+5, innerpad, 11);
|
||||
|
||||
SHA256_InitState(PShoctx.state);
|
||||
for (i = 0; i < 8; i++)
|
||||
pad[i] = ihash[i] ^ 0x5c5c5c5c;
|
||||
for (; i < 16; i++)
|
||||
pad[i] = 0x5c5c5c5c;
|
||||
SHA256_Transform(PShoctx.state, pad, 0);
|
||||
memcpy(PShoctx.buf+8, outerpad, 32);
|
||||
|
||||
/* Iterate through the blocks. */
|
||||
for (i = 0; i < 4; i++) {
|
||||
uint32_t istate[8];
|
||||
uint32_t ostate[8];
|
||||
|
||||
memcpy(istate, PShictx.state, 32);
|
||||
PShictx.buf[4] = i + 1;
|
||||
SHA256_Transform(istate, PShictx.buf, 0);
|
||||
memcpy(PShoctx.buf, istate, 32);
|
||||
|
||||
memcpy(ostate, PShoctx.state, 32);
|
||||
SHA256_Transform(ostate, PShoctx.buf, 0);
|
||||
be32enc_vect(buf+i*8, ostate, 8);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
static inline uint32_t
|
||||
PBKDF2_SHA256_80_128_32(const uint32_t * passwd, const uint32_t * salt)
|
||||
{
|
||||
uint32_t tstate[8];
|
||||
uint32_t ostate[8];
|
||||
uint32_t ihash[8];
|
||||
uint32_t i;
|
||||
|
||||
/* Compute HMAC state after processing P and S. */
|
||||
uint32_t pad[16];
|
||||
|
||||
static const uint32_t ihash_finalblk[16] = {0x00000001,0x80000000,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0x00000620};
|
||||
|
||||
/* If Klen > 64, the key is really SHA256(K). */
|
||||
SHA256_InitState(tstate);
|
||||
SHA256_Transform(tstate, passwd, 1);
|
||||
memcpy(pad, passwd+16, 16);
|
||||
|
@ -286,16 +220,63 @@ PBKDF2_SHA256_80_128_32(const uint32_t * passwd, const uint32_t * salt)
|
|||
for (; i < 16; i++)
|
||||
pad[i] = 0x36363636;
|
||||
SHA256_Transform(tstate, pad, 0);
|
||||
}
|
||||
|
||||
/**
|
||||
* PBKDF2_SHA256(passwd, passwdlen, salt, saltlen, c, buf, dkLen):
|
||||
* Compute PBKDF2(passwd, salt, c, dkLen) using HMAC-SHA256 as the PRF, and
|
||||
* write the output to buf. The value dkLen must be at most 32 * (2^32 - 1).
|
||||
*/
|
||||
static inline void
|
||||
PBKDF2_SHA256_80_128(const uint32_t *tstate, const uint32_t *ostate, const uint32_t *passwd, uint32_t *buf)
|
||||
{
|
||||
static const uint32_t innerpad[11] = {0x00000080, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0xa0040000};
|
||||
SHA256_CTX PShictx, PShoctx;
|
||||
uint32_t i;
|
||||
|
||||
/* If Klen > 64, the key is really SHA256(K). */
|
||||
memcpy(PShictx.state, tstate, 32);
|
||||
memcpy(PShoctx.state, ostate, 32);
|
||||
|
||||
memcpy(PShoctx.buf+8, outerpad, 32);
|
||||
|
||||
SHA256_Transform(PShictx.state, passwd, 1);
|
||||
be32enc_vect(PShictx.buf, passwd+16, 4);
|
||||
be32enc_vect(PShictx.buf+5, innerpad, 11);
|
||||
|
||||
/* Iterate through the blocks. */
|
||||
for (i = 0; i < 4; i++) {
|
||||
uint32_t ist[8];
|
||||
uint32_t ost[8];
|
||||
|
||||
memcpy(ist, PShictx.state, 32);
|
||||
PShictx.buf[4] = i + 1;
|
||||
SHA256_Transform(ist, PShictx.buf, 0);
|
||||
memcpy(PShoctx.buf, ist, 32);
|
||||
|
||||
memcpy(ost, PShoctx.state, 32);
|
||||
SHA256_Transform(ost, PShoctx.buf, 0);
|
||||
be32enc_vect(buf+i*8, ost, 8);
|
||||
}
|
||||
}
|
||||
|
||||
static inline void
|
||||
PBKDF2_SHA256_80_128_32(uint32_t *tstate, uint32_t *ostate, const uint32_t *passwd, const uint32_t *salt, uint32_t *output)
|
||||
{
|
||||
static const uint32_t ihash_finalblk[16] = {0x00000001,0x80000000,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0x00000620};
|
||||
uint32_t pad[16];
|
||||
uint32_t i;
|
||||
|
||||
SHA256_Transform(tstate, salt, 1);
|
||||
SHA256_Transform(tstate, salt+16, 1);
|
||||
SHA256_Transform(tstate, ihash_finalblk, 0);
|
||||
memcpy(pad, tstate, 32);
|
||||
memcpy(pad+8, outerpad, 32);
|
||||
|
||||
/* Feed the inner hash to the outer SHA256 operation. */
|
||||
SHA256_Transform(ostate, pad, 0);
|
||||
/* Finish the outer SHA256 operation. */
|
||||
return byteswap(ostate[7]);
|
||||
|
||||
for (i = 0; i < 8; i++)
|
||||
output[i] = byteswap(ostate[i]);
|
||||
}
|
||||
|
||||
|
||||
|
@ -358,34 +339,33 @@ salsa20_8(uint32_t B[16], const uint32_t Bx[16])
|
|||
B[15] += x15;
|
||||
}
|
||||
|
||||
#if defined(__x86_64__)
|
||||
void x64_scrypt_core(uint32_t *B, uint32_t *V);
|
||||
#elif defined(__i386__)
|
||||
void x86_scrypt_core(uint32_t *B, uint32_t *V);
|
||||
#endif
|
||||
|
||||
/* cpu and memory intensive function to transform a 80 byte buffer into a 32 byte output
|
||||
scratchpad size needs to be at least 63 + (128 * r * p) + (256 * r + 64) + (128 * r * N) bytes
|
||||
*/
|
||||
static uint32_t scrypt_1024_1_1_256_sp(const uint32_t* input, char* scratchpad)
|
||||
#if defined(__x86_64__)
|
||||
|
||||
#define DUAL_SCRYPT
|
||||
#define SCRYPT_BUFFER_SIZE (2 * 131072 + 63)
|
||||
|
||||
int prefer_dual_scrypt();
|
||||
void scrypt_core(uint32_t *X, uint32_t *V);
|
||||
void dual_scrypt_core(uint32_t *X, uint32_t *Y, uint32_t *V);
|
||||
|
||||
#elif defined(__i386__)
|
||||
|
||||
#define SCRYPT_BUFFER_SIZE (131072 + 63)
|
||||
|
||||
void scrypt_core(uint32_t *X, uint32_t *V);
|
||||
|
||||
#else
|
||||
|
||||
#define SCRYPT_BUFFER_SIZE (131072 + 63)
|
||||
|
||||
static inline void scrypt_core(uint32_t *X, uint32_t *V)
|
||||
{
|
||||
uint32_t * V;
|
||||
uint32_t X[32];
|
||||
uint32_t i;
|
||||
uint32_t j;
|
||||
uint32_t k;
|
||||
uint64_t *p1, *p2;
|
||||
|
||||
p1 = (uint64_t *)X;
|
||||
V = (uint32_t *)(((uintptr_t)(scratchpad) + 63) & ~ (uintptr_t)(63));
|
||||
|
||||
PBKDF2_SHA256_80_128(input, X);
|
||||
|
||||
#if defined(__x86_64__)
|
||||
x64_scrypt_core(X, V);
|
||||
#elif defined(__i386__)
|
||||
x86_scrypt_core(X, V);
|
||||
#else
|
||||
for (i = 0; i < 1024; i += 2) {
|
||||
memcpy(&V[i * 32], X, 128);
|
||||
|
||||
|
@ -414,32 +394,93 @@ static uint32_t scrypt_1024_1_1_256_sp(const uint32_t* input, char* scratchpad)
|
|||
salsa20_8(&X[0], &X[16]);
|
||||
salsa20_8(&X[16], &X[0]);
|
||||
}
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
return PBKDF2_SHA256_80_128_32(input, X);
|
||||
unsigned char *scrypt_buffer_alloc() {
|
||||
return malloc(SCRYPT_BUFFER_SIZE);
|
||||
}
|
||||
|
||||
/* cpu and memory intensive function to transform a 80 byte buffer into a 32 byte output
   (scrypt with r = 1, p = 1, N = 1024).

   input      the 80-byte message as 20 words
   scratchpad working memory for scrypt_core(); it is rounded up to the next
              64-byte boundary here, so it must be at least
              63 + (128 * r * N) = 63 + 131072 bytes
   res        receives the 8 result words

   The result is delivered only through res; the function itself is void.
 */
static void scrypt_1024_1_1_256_sp(const uint32_t* input, unsigned char *scratchpad, uint32_t *res)
{
	uint32_t tstate[8], ostate[8];
	uint32_t *V;
	uint32_t X[32];

	/* Align the working area to 64 bytes; this may skip up to 63 bytes. */
	V = (uint32_t *)(((uintptr_t)(scratchpad) + 63) & ~ (uintptr_t)(63));

	PBKDF2_SHA256_80_128_init(input, tstate, ostate);
	PBKDF2_SHA256_80_128(tstate, ostate, input, X);

	scrypt_core(X, V);

	/* Plain call: returning a void expression from a void function is not valid C. */
	PBKDF2_SHA256_80_128_32(tstate, ostate, input, X, res);
}
|
||||
|
||||
#ifdef DUAL_SCRYPT
|
||||
/*
 * Two-input variant of scrypt_1024_1_1_256_sp(): hashes input1 and input2
 * (80 bytes / 20 words each) with a single dual_scrypt_core() call and
 * writes the 8-word results to res1 and res2 respectively.
 *
 * scratchpad is rounded up to the next 64-byte boundary before use.
 * NOTE(review): presumably it must be the 2 * 131072 + 63 byte buffer that
 * SCRYPT_BUFFER_SIZE describes when DUAL_SCRYPT is defined -- confirm
 * against dual_scrypt_core.
 */
static void dual_scrypt_1024_1_1_256_sp(const uint32_t *input1, const uint32_t *input2, unsigned char *scratchpad, uint32_t *res1, uint32_t *res2)
{
	uint32_t tstate1[8], tstate2[8], ostate1[8], ostate2[8];
	uint32_t X[32], Y[32];
	const uintptr_t align_mask = 63;
	uint32_t *V = (uint32_t *)(((uintptr_t)scratchpad + align_mask) & ~align_mask);

	/* First lane: derive X from input1. */
	PBKDF2_SHA256_80_128_init(input1, tstate1, ostate1);
	PBKDF2_SHA256_80_128(tstate1, ostate1, input1, X);

	/* Second lane: derive Y from input2. */
	PBKDF2_SHA256_80_128_init(input2, tstate2, ostate2);
	PBKDF2_SHA256_80_128(tstate2, ostate2, input2, Y);

	/* Mix both lanes together in one call. */
	dual_scrypt_core(X, Y, V);

	/* Final PBKDF2 pass per lane. */
	PBKDF2_SHA256_80_128_32(tstate1, ostate1, input1, X, res1);
	PBKDF2_SHA256_80_128_32(tstate2, ostate2, input2, Y, res2);
}
|
||||
#endif
|
||||
|
||||
int scanhash_scrypt(int thr_id, unsigned char *pdata, unsigned char *scratchbuf,
|
||||
const unsigned char *ptarget,
|
||||
uint32_t max_nonce, unsigned long *hashes_done)
|
||||
{
|
||||
uint32_t data[20];
|
||||
uint32_t tmp_hash7;
|
||||
uint32_t data[20], hash[8];
|
||||
#ifdef DUAL_SCRYPT
|
||||
uint32_t data2[20], hash2[8];
|
||||
int use_dual;
|
||||
#endif
|
||||
uint32_t n = 0;
|
||||
uint32_t Htarg = ((const uint32_t *)ptarget)[7];
|
||||
int i;
|
||||
|
||||
work_restart[thr_id].restart = 0;
|
||||
|
||||
be32enc_vect(data, (const uint32_t *)pdata, 19);
|
||||
#ifdef DUAL_SCRYPT
|
||||
memcpy(data2, data, 80);
|
||||
use_dual = prefer_dual_scrypt();
|
||||
#endif
|
||||
|
||||
while (1) {
|
||||
n++;
|
||||
data[19] = n;
|
||||
tmp_hash7 = scrypt_1024_1_1_256_sp(data, scratchbuf);
|
||||
data[19] = n++;
|
||||
#ifdef DUAL_SCRYPT
|
||||
if (use_dual) {
|
||||
data2[19] = n++;
|
||||
dual_scrypt_1024_1_1_256_sp(data, data2, scratchbuf, hash, hash2);
|
||||
if (hash2[7] <= Htarg) {
|
||||
((uint32_t *)pdata)[19] = byteswap(data2[19]);
|
||||
*hashes_done = n;
|
||||
return true;
|
||||
}
|
||||
} else {
|
||||
scrypt_1024_1_1_256_sp(data, scratchbuf, hash);
|
||||
}
|
||||
#else
|
||||
scrypt_1024_1_1_256_sp(data, scratchbuf, hash);
|
||||
#endif
|
||||
|
||||
if (tmp_hash7 <= Htarg) {
|
||||
((uint32_t *)pdata)[19] = byteswap(n);
|
||||
if (hash[7] <= Htarg) {
|
||||
((uint32_t *)pdata)[19] = byteswap(data[19]);
|
||||
*hashes_done = n;
|
||||
return true;
|
||||
}
|
||||
|
|
2
util.c
2
util.c
|
@ -94,7 +94,9 @@ void applog(int prio, const char *fmt, ...)
|
|||
tm.tm_min,
|
||||
tm.tm_sec,
|
||||
fmt);
|
||||
pthread_mutex_lock(&time_lock);
|
||||
vfprintf(stderr, f, ap); /* atomic write to stderr */
|
||||
pthread_mutex_unlock(&time_lock);
|
||||
}
|
||||
va_end(ap);
|
||||
}
|
||||
|
|
Loading…
Reference in a new issue