Add AVX2-enabled functions for x86-64
parent 44d4815b01
commit e878267239
8 changed files with 1995 additions and 78 deletions
README | 4

@@ -42,7 +42,7 @@ Architecture-specific notes:
 To use NEON instructions, add "-mfpu=neon" to CFLAGS.
 x86: The miner checks for SSE2 instructions support at runtime,
 and uses them if they are available.
-x86-64: The miner can take advantage of AVX and XOP instructions,
+x86-64: The miner can take advantage of AVX, AVX2 and XOP instructions,
 but only if both the CPU and the operating system support them.
 * Linux supports AVX starting from kernel version 2.6.30.
 * FreeBSD supports AVX starting with 9.1-RELEASE.
@@ -50,7 +50,7 @@ Architecture-specific notes:
 * Windows supports AVX starting from Windows 7 SP1 and
   Windows Server 2008 R2 SP1.
 The configure script outputs a warning if the assembler
-cannot compile AVX or XOP instructions. In that case, the miner
+doesn't support some instruction sets. In that case, the miner
 can still be built, but unavailable optimizations are left off.
 
 Usage instructions: Run "minerd --help" to see options.
@@ -77,6 +77,14 @@ then
   AC_MSG_RESULT(no)
   AC_MSG_WARN([The assembler does not support the XOP instruction set.])
 )
+AC_MSG_CHECKING(whether we can compile AVX2 code)
+AC_COMPILE_IFELSE([AC_LANG_PROGRAM(,[asm ("vpaddd %ymm0, %ymm1, %ymm2");])],
+  AC_DEFINE(USE_AVX2, 1, [Define to 1 if AVX2 assembly is available.])
+  AC_MSG_RESULT(yes)
+,
+  AC_MSG_RESULT(no)
+  AC_MSG_WARN([The assembler does not support the AVX2 instruction set.])
+)
 ,
 AC_MSG_RESULT(no)
 AC_MSG_WARN([The assembler does not support the AVX instruction set.])
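Note: the AC_COMPILE_IFELSE probe added here only checks that the assembler accepts a single AVX2 instruction; it compiles something roughly equivalent to the following test program (a sketch of what AC_LANG_PROGRAM expands to, not the literal generated source):

/* Approximate test program behind the AVX2 configure check: if the
 * toolchain's assembler accepts vpaddd on %ymm registers, the build
 * defines USE_AVX2. */
int main(void)
{
	asm ("vpaddd %ymm0, %ymm1, %ymm2");
	return 0;
}

Whether the CPU and OS can actually run AVX2 code is decided separately at runtime, as the scrypt-x64.S change below shows.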
@@ -668,7 +668,7 @@ static void *miner_thread(void *userdata)
 	int thr_id = mythr->id;
 	struct work work;
 	uint32_t max_nonce;
-	uint32_t end_nonce = 0xffffffffU / opt_n_threads * (thr_id + 1) - 0x10;
+	uint32_t end_nonce = 0xffffffffU / opt_n_threads * (thr_id + 1) - 0x20;
 	unsigned char *scratchbuf = NULL;
 	char s[16];
 	int i;
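The only functional change in this hunk is a larger safety margin at the top of each thread's nonce slice (0x10 to 0x20), presumably to leave room for the wider vectorized batches added by this commit. A hypothetical illustration of the partitioning arithmetic (the thread count and the start-of-slice formula are assumptions made for the example):

#include <stdint.h>
#include <stdio.h>

/* Hypothetical illustration of per-thread nonce partitioning: each of
 * opt_n_threads threads scans its own slice of the 32-bit nonce space,
 * stopping 0x20 nonces short of the next slice. */
int main(void)
{
	int opt_n_threads = 4;	/* example value */
	int thr_id;

	for (thr_id = 0; thr_id < opt_n_threads; thr_id++) {
		uint32_t start = 0xffffffffU / opt_n_threads * thr_id;	/* assumed */
		uint32_t end = 0xffffffffU / opt_n_threads * (thr_id + 1) - 0x20;
		printf("thread %d: 0x%08x .. 0x%08x\n", thr_id, start, end);
	}
	return 0;
}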
miner.h | 7

@@ -141,6 +141,13 @@ void sha256_init_4way(uint32_t *state);
 void sha256_transform_4way(uint32_t *state, const uint32_t *block, int swap);
 #endif
 
+#if defined(__x86_64__) && defined(USE_AVX2)
+#define HAVE_SHA256_8WAY 1
+int sha256_use_8way();
+void sha256_init_8way(uint32_t *state);
+void sha256_transform_8way(uint32_t *state, const uint32_t *block, int swap);
+#endif
+
 extern int scanhash_sha256d(int thr_id, uint32_t *pdata,
 	const uint32_t *ptarget, uint32_t max_nonce, unsigned long *hashes_done);
 
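A note on the data layout behind these declarations: the 8-way routines keep eight independent SHA-256 states interleaved, with word i of lane k stored at index i * 8 + k (the same convention the existing 4-way code uses with a stride of 4), and the 16-word message blocks follow the same layout. A small sketch of broadcasting one scalar state into that layout; broadcast_state_8way is an illustrative name, not part of the patch:

#include <stdint.h>

/* Illustrative only: pack one scalar SHA-256 state into the interleaved
 * 8-lane layout expected by sha256_transform_8way (word i of lane k at
 * index i * 8 + k). */
static void broadcast_state_8way(uint32_t dst[8 * 8], const uint32_t src[8])
{
	int i, k;
	for (i = 0; i < 8; i++)
		for (k = 0; k < 8; k++)
			dst[i * 8 + k] = src[i];
}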
scrypt-x64.S | 639

@@ -1,5 +1,5 @@
 /*
- * Copyright 2011-2012 pooler@litecoinpool.org
+ * Copyright 2011-2013 pooler@litecoinpool.org
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
@@ -39,6 +39,30 @@
 scrypt_best_throughput:
 _scrypt_best_throughput:
 	pushq	%rbx
+#if defined(USE_AVX2)
+	/* Check for AVX and OSXSAVE support */
+	movl	$1, %eax
+	cpuid
+	andl	$0x18000000, %ecx
+	cmpl	$0x18000000, %ecx
+	jne	scrypt_best_throughput_no_avx2
+	/* Check for AVX2 support */
+	movl	$7, %eax
+	xorl	%ecx, %ecx
+	cpuid
+	andl	$0x00000020, %ebx
+	cmpl	$0x00000020, %ebx
+	jne	scrypt_best_throughput_no_avx2
+	/* Check for XMM and YMM state support */
+	xorl	%ecx, %ecx
+	xgetbv
+	andl	$0x00000006, %eax
+	cmpl	$0x00000006, %eax
+	jne	scrypt_best_throughput_no_avx2
+	movl	$6, %eax
+	jmp	scrypt_best_throughput_exit
+scrypt_best_throughput_no_avx2:
+#endif
 	/* Check for AuthenticAMD */
 	xorq	%rax, %rax
 	cpuid
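For readers more comfortable with C than with the cpuid sequence above: the same three-stage gate (AVX and OSXSAVE in CPUID.1:ECX, AVX2 in CPUID.7.0:EBX bit 5, XMM/YMM state enabled in XCR0) can be sketched with GCC's <cpuid.h>. This is an illustration only; the patch performs the check in assembly, and cpu_has_avx2 is a made-up name:

#include <cpuid.h>
#include <stdint.h>

/* Illustrative C version of the AVX2 runtime check: CPUID.1:ECX must
 * report AVX+OSXSAVE, CPUID.7.0:EBX bit 5 must report AVX2, and XCR0
 * must show that the OS saves XMM and YMM state. */
static int cpu_has_avx2(void)
{
	unsigned int eax, ebx, ecx, edx;
	uint32_t xcr0_lo, xcr0_hi;

	if (!__get_cpuid(1, &eax, &ebx, &ecx, &edx))
		return 0;
	if ((ecx & 0x18000000) != 0x18000000)	/* AVX + OSXSAVE */
		return 0;
	if (__get_cpuid_max(0, 0) < 7)
		return 0;
	__cpuid_count(7, 0, eax, ebx, ecx, edx);
	if (!(ebx & 0x00000020))		/* AVX2 */
		return 0;
	asm volatile ("xgetbv" : "=a" (xcr0_lo), "=d" (xcr0_hi) : "c" (0));
	return (xcr0_lo & 0x6) == 0x6;		/* XMM and YMM state saved by OS */
}

Either form answers the same question: AVX2 code may only be dispatched when both the CPU and the operating system advertise YMM support, which is exactly the condition the README change above describes.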
@ -2239,4 +2263,617 @@ scrypt_core_3way_xmm_loop2:
|
||||||
scrypt_core_3way_cleanup
|
scrypt_core_3way_cleanup
|
||||||
ret
|
ret
|
||||||
|
|
||||||
|
|
||||||
|
#if defined(USE_AVX2)
|
||||||
|
|
||||||
|
.macro salsa8_core_6way_avx2_doubleround
|
||||||
|
vpaddd %ymm0, %ymm1, %ymm4
|
||||||
|
vpaddd %ymm8, %ymm9, %ymm6
|
||||||
|
vpaddd %ymm12, %ymm13, %ymm7
|
||||||
|
vpslld $7, %ymm4, %ymm5
|
||||||
|
vpsrld $25, %ymm4, %ymm4
|
||||||
|
vpxor %ymm5, %ymm3, %ymm3
|
||||||
|
vpxor %ymm4, %ymm3, %ymm3
|
||||||
|
vpslld $7, %ymm6, %ymm5
|
||||||
|
vpsrld $25, %ymm6, %ymm6
|
||||||
|
vpxor %ymm5, %ymm11, %ymm11
|
||||||
|
vpxor %ymm6, %ymm11, %ymm11
|
||||||
|
vpslld $7, %ymm7, %ymm5
|
||||||
|
vpsrld $25, %ymm7, %ymm7
|
||||||
|
vpxor %ymm5, %ymm15, %ymm15
|
||||||
|
vpxor %ymm7, %ymm15, %ymm15
|
||||||
|
|
||||||
|
vpaddd %ymm3, %ymm0, %ymm4
|
||||||
|
vpaddd %ymm11, %ymm8, %ymm6
|
||||||
|
vpaddd %ymm15, %ymm12, %ymm7
|
||||||
|
vpslld $9, %ymm4, %ymm5
|
||||||
|
vpsrld $23, %ymm4, %ymm4
|
||||||
|
vpxor %ymm5, %ymm2, %ymm2
|
||||||
|
vpxor %ymm4, %ymm2, %ymm2
|
||||||
|
vpslld $9, %ymm6, %ymm5
|
||||||
|
vpsrld $23, %ymm6, %ymm6
|
||||||
|
vpxor %ymm5, %ymm10, %ymm10
|
||||||
|
vpxor %ymm6, %ymm10, %ymm10
|
||||||
|
vpslld $9, %ymm7, %ymm5
|
||||||
|
vpsrld $23, %ymm7, %ymm7
|
||||||
|
vpxor %ymm5, %ymm14, %ymm14
|
||||||
|
vpxor %ymm7, %ymm14, %ymm14
|
||||||
|
|
||||||
|
vpaddd %ymm2, %ymm3, %ymm4
|
||||||
|
vpaddd %ymm10, %ymm11, %ymm6
|
||||||
|
vpaddd %ymm14, %ymm15, %ymm7
|
||||||
|
vpslld $13, %ymm4, %ymm5
|
||||||
|
vpsrld $19, %ymm4, %ymm4
|
||||||
|
vpshufd $0x93, %ymm3, %ymm3
|
||||||
|
vpshufd $0x93, %ymm11, %ymm11
|
||||||
|
vpshufd $0x93, %ymm15, %ymm15
|
||||||
|
vpxor %ymm5, %ymm1, %ymm1
|
||||||
|
vpxor %ymm4, %ymm1, %ymm1
|
||||||
|
vpslld $13, %ymm6, %ymm5
|
||||||
|
vpsrld $19, %ymm6, %ymm6
|
||||||
|
vpxor %ymm5, %ymm9, %ymm9
|
||||||
|
vpxor %ymm6, %ymm9, %ymm9
|
||||||
|
vpslld $13, %ymm7, %ymm5
|
||||||
|
vpsrld $19, %ymm7, %ymm7
|
||||||
|
vpxor %ymm5, %ymm13, %ymm13
|
||||||
|
vpxor %ymm7, %ymm13, %ymm13
|
||||||
|
|
||||||
|
vpaddd %ymm1, %ymm2, %ymm4
|
||||||
|
vpaddd %ymm9, %ymm10, %ymm6
|
||||||
|
vpaddd %ymm13, %ymm14, %ymm7
|
||||||
|
vpslld $18, %ymm4, %ymm5
|
||||||
|
vpsrld $14, %ymm4, %ymm4
|
||||||
|
vpshufd $0x4e, %ymm2, %ymm2
|
||||||
|
vpshufd $0x4e, %ymm10, %ymm10
|
||||||
|
vpshufd $0x4e, %ymm14, %ymm14
|
||||||
|
vpxor %ymm5, %ymm0, %ymm0
|
||||||
|
vpxor %ymm4, %ymm0, %ymm0
|
||||||
|
vpslld $18, %ymm6, %ymm5
|
||||||
|
vpsrld $14, %ymm6, %ymm6
|
||||||
|
vpxor %ymm5, %ymm8, %ymm8
|
||||||
|
vpxor %ymm6, %ymm8, %ymm8
|
||||||
|
vpslld $18, %ymm7, %ymm5
|
||||||
|
vpsrld $14, %ymm7, %ymm7
|
||||||
|
vpxor %ymm5, %ymm12, %ymm12
|
||||||
|
vpxor %ymm7, %ymm12, %ymm12
|
||||||
|
|
||||||
|
vpaddd %ymm0, %ymm3, %ymm4
|
||||||
|
vpaddd %ymm8, %ymm11, %ymm6
|
||||||
|
vpaddd %ymm12, %ymm15, %ymm7
|
||||||
|
vpslld $7, %ymm4, %ymm5
|
||||||
|
vpsrld $25, %ymm4, %ymm4
|
||||||
|
vpshufd $0x39, %ymm1, %ymm1
|
||||||
|
vpxor %ymm5, %ymm1, %ymm1
|
||||||
|
vpxor %ymm4, %ymm1, %ymm1
|
||||||
|
vpslld $7, %ymm6, %ymm5
|
||||||
|
vpsrld $25, %ymm6, %ymm6
|
||||||
|
vpshufd $0x39, %ymm9, %ymm9
|
||||||
|
vpxor %ymm5, %ymm9, %ymm9
|
||||||
|
vpxor %ymm6, %ymm9, %ymm9
|
||||||
|
vpslld $7, %ymm7, %ymm5
|
||||||
|
vpsrld $25, %ymm7, %ymm7
|
||||||
|
vpshufd $0x39, %ymm13, %ymm13
|
||||||
|
vpxor %ymm5, %ymm13, %ymm13
|
||||||
|
vpxor %ymm7, %ymm13, %ymm13
|
||||||
|
|
||||||
|
vpaddd %ymm1, %ymm0, %ymm4
|
||||||
|
vpaddd %ymm9, %ymm8, %ymm6
|
||||||
|
vpaddd %ymm13, %ymm12, %ymm7
|
||||||
|
vpslld $9, %ymm4, %ymm5
|
||||||
|
vpsrld $23, %ymm4, %ymm4
|
||||||
|
vpxor %ymm5, %ymm2, %ymm2
|
||||||
|
vpxor %ymm4, %ymm2, %ymm2
|
||||||
|
vpslld $9, %ymm6, %ymm5
|
||||||
|
vpsrld $23, %ymm6, %ymm6
|
||||||
|
vpxor %ymm5, %ymm10, %ymm10
|
||||||
|
vpxor %ymm6, %ymm10, %ymm10
|
||||||
|
vpslld $9, %ymm7, %ymm5
|
||||||
|
vpsrld $23, %ymm7, %ymm7
|
||||||
|
vpxor %ymm5, %ymm14, %ymm14
|
||||||
|
vpxor %ymm7, %ymm14, %ymm14
|
||||||
|
|
||||||
|
vpaddd %ymm2, %ymm1, %ymm4
|
||||||
|
vpaddd %ymm10, %ymm9, %ymm6
|
||||||
|
vpaddd %ymm14, %ymm13, %ymm7
|
||||||
|
vpslld $13, %ymm4, %ymm5
|
||||||
|
vpsrld $19, %ymm4, %ymm4
|
||||||
|
vpshufd $0x93, %ymm1, %ymm1
|
||||||
|
vpshufd $0x93, %ymm9, %ymm9
|
||||||
|
vpshufd $0x93, %ymm13, %ymm13
|
||||||
|
vpxor %ymm5, %ymm3, %ymm3
|
||||||
|
vpxor %ymm4, %ymm3, %ymm3
|
||||||
|
vpslld $13, %ymm6, %ymm5
|
||||||
|
vpsrld $19, %ymm6, %ymm6
|
||||||
|
vpxor %ymm5, %ymm11, %ymm11
|
||||||
|
vpxor %ymm6, %ymm11, %ymm11
|
||||||
|
vpslld $13, %ymm7, %ymm5
|
||||||
|
vpsrld $19, %ymm7, %ymm7
|
||||||
|
vpxor %ymm5, %ymm15, %ymm15
|
||||||
|
vpxor %ymm7, %ymm15, %ymm15
|
||||||
|
|
||||||
|
vpaddd %ymm3, %ymm2, %ymm4
|
||||||
|
vpaddd %ymm11, %ymm10, %ymm6
|
||||||
|
vpaddd %ymm15, %ymm14, %ymm7
|
||||||
|
vpslld $18, %ymm4, %ymm5
|
||||||
|
vpsrld $14, %ymm4, %ymm4
|
||||||
|
vpshufd $0x4e, %ymm2, %ymm2
|
||||||
|
vpshufd $0x4e, %ymm10, %ymm10
|
||||||
|
vpxor %ymm5, %ymm0, %ymm0
|
||||||
|
vpxor %ymm4, %ymm0, %ymm0
|
||||||
|
vpslld $18, %ymm6, %ymm5
|
||||||
|
vpsrld $14, %ymm6, %ymm6
|
||||||
|
vpshufd $0x4e, %ymm14, %ymm14
|
||||||
|
vpshufd $0x39, %ymm11, %ymm11
|
||||||
|
vpxor %ymm5, %ymm8, %ymm8
|
||||||
|
vpxor %ymm6, %ymm8, %ymm8
|
||||||
|
vpslld $18, %ymm7, %ymm5
|
||||||
|
vpsrld $14, %ymm7, %ymm7
|
||||||
|
vpshufd $0x39, %ymm3, %ymm3
|
||||||
|
vpshufd $0x39, %ymm15, %ymm15
|
||||||
|
vpxor %ymm5, %ymm12, %ymm12
|
||||||
|
vpxor %ymm7, %ymm12, %ymm12
|
||||||
|
.endm
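Each vpaddd/vpslld/vpsrld/vpxor group in the macro above is one Salsa20 quarter-round step of the form x ^= rotl32(a + b, s), evaluated on eight 32-bit words per YMM register and on three register groups (six scrypt lanes) at a time. A hedged intrinsics sketch of the first such step (shift 7, mirroring the vpslld $7 / vpsrld $25 pair); names are illustrative, the patch itself uses hand-written assembly:

#include <immintrin.h>

/* One Salsa20 quarter-round step on eight 32-bit words:
 * x3 ^= rotl32(x0 + x1, 7).  Compile with -mavx2. */
static inline __m256i salsa_step7(__m256i x3, __m256i x0, __m256i x1)
{
	__m256i t = _mm256_add_epi32(x0, x1);
	__m256i r = _mm256_or_si256(_mm256_slli_epi32(t, 7),
	                            _mm256_srli_epi32(t, 25));
	return _mm256_xor_si256(x3, r);
}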
|
||||||
|
|
||||||
|
.macro salsa8_core_6way_avx2
|
||||||
|
salsa8_core_6way_avx2_doubleround
|
||||||
|
salsa8_core_6way_avx2_doubleround
|
||||||
|
salsa8_core_6way_avx2_doubleround
|
||||||
|
salsa8_core_6way_avx2_doubleround
|
||||||
|
.endm
|
||||||
|
|
||||||
|
.text
|
||||||
|
.p2align 6
|
||||||
|
.globl scrypt_core_6way
|
||||||
|
.globl _scrypt_core_6way
|
||||||
|
scrypt_core_6way:
|
||||||
|
_scrypt_core_6way:
|
||||||
|
pushq %rbx
|
||||||
|
pushq %rbp
|
||||||
|
#if defined(WIN64)
|
||||||
|
subq $176, %rsp
|
||||||
|
vmovdqa %xmm6, 8(%rsp)
|
||||||
|
vmovdqa %xmm7, 24(%rsp)
|
||||||
|
vmovdqa %xmm8, 40(%rsp)
|
||||||
|
vmovdqa %xmm9, 56(%rsp)
|
||||||
|
vmovdqa %xmm10, 72(%rsp)
|
||||||
|
vmovdqa %xmm11, 88(%rsp)
|
||||||
|
vmovdqa %xmm12, 104(%rsp)
|
||||||
|
vmovdqa %xmm13, 120(%rsp)
|
||||||
|
vmovdqa %xmm14, 136(%rsp)
|
||||||
|
vmovdqa %xmm15, 152(%rsp)
|
||||||
|
pushq %rdi
|
||||||
|
pushq %rsi
|
||||||
|
movq %rcx, %rdi
|
||||||
|
movq %rdx, %rsi
|
||||||
|
#endif
|
||||||
|
movq %rsp, %rdx
|
||||||
|
subq $768, %rsp
|
||||||
|
andq $-128, %rsp
|
||||||
|
|
||||||
|
.macro scrypt_core_6way_cleanup
|
||||||
|
movq %rdx, %rsp
|
||||||
|
#if defined(WIN64)
|
||||||
|
popq %rsi
|
||||||
|
popq %rdi
|
||||||
|
vmovdqa 8(%rsp), %xmm6
|
||||||
|
vmovdqa 24(%rsp), %xmm7
|
||||||
|
vmovdqa 40(%rsp), %xmm8
|
||||||
|
vmovdqa 56(%rsp), %xmm9
|
||||||
|
vmovdqa 72(%rsp), %xmm10
|
||||||
|
vmovdqa 88(%rsp), %xmm11
|
||||||
|
vmovdqa 104(%rsp), %xmm12
|
||||||
|
vmovdqa 120(%rsp), %xmm13
|
||||||
|
vmovdqa 136(%rsp), %xmm14
|
||||||
|
vmovdqa 152(%rsp), %xmm15
|
||||||
|
addq $176, %rsp
|
||||||
|
#endif
|
||||||
|
popq %rbp
|
||||||
|
popq %rbx
|
||||||
|
.endm
|
||||||
|
|
||||||
|
.macro scrypt_shuffle_pack2 src, so, dest, do
|
||||||
|
vmovdqa \so+0*16(\src), %xmm0
|
||||||
|
vmovdqa \so+1*16(\src), %xmm1
|
||||||
|
vmovdqa \so+2*16(\src), %xmm2
|
||||||
|
vmovdqa \so+3*16(\src), %xmm3
|
||||||
|
vinserti128 $1, \so+128+0*16(\src), %ymm0, %ymm0
|
||||||
|
vinserti128 $1, \so+128+1*16(\src), %ymm1, %ymm1
|
||||||
|
vinserti128 $1, \so+128+2*16(\src), %ymm2, %ymm2
|
||||||
|
vinserti128 $1, \so+128+3*16(\src), %ymm3, %ymm3
|
||||||
|
vpblendd $0x33, %ymm0, %ymm2, %ymm4
|
||||||
|
vpblendd $0xcc, %ymm1, %ymm3, %ymm5
|
||||||
|
vpblendd $0x33, %ymm2, %ymm0, %ymm6
|
||||||
|
vpblendd $0xcc, %ymm3, %ymm1, %ymm7
|
||||||
|
vpblendd $0x55, %ymm7, %ymm6, %ymm3
|
||||||
|
vpblendd $0x55, %ymm6, %ymm5, %ymm2
|
||||||
|
vpblendd $0x55, %ymm5, %ymm4, %ymm1
|
||||||
|
vpblendd $0x55, %ymm4, %ymm7, %ymm0
|
||||||
|
vmovdqa %ymm0, \do+0*32(\dest)
|
||||||
|
vmovdqa %ymm1, \do+1*32(\dest)
|
||||||
|
vmovdqa %ymm2, \do+2*32(\dest)
|
||||||
|
vmovdqa %ymm3, \do+3*32(\dest)
|
||||||
|
.endm
|
||||||
|
|
||||||
|
.macro scrypt_shuffle_unpack2 src, so, dest, do
|
||||||
|
vmovdqa \so+0*32(\src), %ymm0
|
||||||
|
vmovdqa \so+1*32(\src), %ymm1
|
||||||
|
vmovdqa \so+2*32(\src), %ymm2
|
||||||
|
vmovdqa \so+3*32(\src), %ymm3
|
||||||
|
vpblendd $0x33, %ymm0, %ymm2, %ymm4
|
||||||
|
vpblendd $0xcc, %ymm1, %ymm3, %ymm5
|
||||||
|
vpblendd $0x33, %ymm2, %ymm0, %ymm6
|
||||||
|
vpblendd $0xcc, %ymm3, %ymm1, %ymm7
|
||||||
|
vpblendd $0x55, %ymm7, %ymm6, %ymm3
|
||||||
|
vpblendd $0x55, %ymm6, %ymm5, %ymm2
|
||||||
|
vpblendd $0x55, %ymm5, %ymm4, %ymm1
|
||||||
|
vpblendd $0x55, %ymm4, %ymm7, %ymm0
|
||||||
|
vmovdqa %xmm0, \do+0*16(\dest)
|
||||||
|
vmovdqa %xmm1, \do+1*16(\dest)
|
||||||
|
vmovdqa %xmm2, \do+2*16(\dest)
|
||||||
|
vmovdqa %xmm3, \do+3*16(\dest)
|
||||||
|
vextracti128 $1, %ymm0, \do+128+0*16(\dest)
|
||||||
|
vextracti128 $1, %ymm1, \do+128+1*16(\dest)
|
||||||
|
vextracti128 $1, %ymm2, \do+128+2*16(\dest)
|
||||||
|
vextracti128 $1, %ymm3, \do+128+3*16(\dest)
|
||||||
|
.endm
|
||||||
|
|
||||||
|
scrypt_core_6way_avx2:
|
||||||
|
scrypt_shuffle_pack2 %rdi, 0*256+0, %rsp, 0*128
|
||||||
|
scrypt_shuffle_pack2 %rdi, 0*256+64, %rsp, 1*128
|
||||||
|
scrypt_shuffle_pack2 %rdi, 1*256+0, %rsp, 2*128
|
||||||
|
scrypt_shuffle_pack2 %rdi, 1*256+64, %rsp, 3*128
|
||||||
|
scrypt_shuffle_pack2 %rdi, 2*256+0, %rsp, 4*128
|
||||||
|
scrypt_shuffle_pack2 %rdi, 2*256+64, %rsp, 5*128
|
||||||
|
|
||||||
|
vmovdqa 0*256+4*32(%rsp), %ymm0
|
||||||
|
vmovdqa 0*256+5*32(%rsp), %ymm1
|
||||||
|
vmovdqa 0*256+6*32(%rsp), %ymm2
|
||||||
|
vmovdqa 0*256+7*32(%rsp), %ymm3
|
||||||
|
vmovdqa 1*256+4*32(%rsp), %ymm8
|
||||||
|
vmovdqa 1*256+5*32(%rsp), %ymm9
|
||||||
|
vmovdqa 1*256+6*32(%rsp), %ymm10
|
||||||
|
vmovdqa 1*256+7*32(%rsp), %ymm11
|
||||||
|
vmovdqa 2*256+4*32(%rsp), %ymm12
|
||||||
|
vmovdqa 2*256+5*32(%rsp), %ymm13
|
||||||
|
vmovdqa 2*256+6*32(%rsp), %ymm14
|
||||||
|
vmovdqa 2*256+7*32(%rsp), %ymm15
|
||||||
|
|
||||||
|
movq %rsi, %rbx
|
||||||
|
leaq 6*131072(%rsi), %rax
|
||||||
|
scrypt_core_6way_avx2_loop1:
|
||||||
|
vmovdqa %ymm0, 0*256+4*32(%rbx)
|
||||||
|
vmovdqa %ymm1, 0*256+5*32(%rbx)
|
||||||
|
vmovdqa %ymm2, 0*256+6*32(%rbx)
|
||||||
|
vmovdqa %ymm3, 0*256+7*32(%rbx)
|
||||||
|
vpxor 0*256+0*32(%rsp), %ymm0, %ymm0
|
||||||
|
vpxor 0*256+1*32(%rsp), %ymm1, %ymm1
|
||||||
|
vpxor 0*256+2*32(%rsp), %ymm2, %ymm2
|
||||||
|
vpxor 0*256+3*32(%rsp), %ymm3, %ymm3
|
||||||
|
vmovdqa %ymm8, 1*256+4*32(%rbx)
|
||||||
|
vmovdqa %ymm9, 1*256+5*32(%rbx)
|
||||||
|
vmovdqa %ymm10, 1*256+6*32(%rbx)
|
||||||
|
vmovdqa %ymm11, 1*256+7*32(%rbx)
|
||||||
|
vpxor 1*256+0*32(%rsp), %ymm8, %ymm8
|
||||||
|
vpxor 1*256+1*32(%rsp), %ymm9, %ymm9
|
||||||
|
vpxor 1*256+2*32(%rsp), %ymm10, %ymm10
|
||||||
|
vpxor 1*256+3*32(%rsp), %ymm11, %ymm11
|
||||||
|
vmovdqa %ymm12, 2*256+4*32(%rbx)
|
||||||
|
vmovdqa %ymm13, 2*256+5*32(%rbx)
|
||||||
|
vmovdqa %ymm14, 2*256+6*32(%rbx)
|
||||||
|
vmovdqa %ymm15, 2*256+7*32(%rbx)
|
||||||
|
vpxor 2*256+0*32(%rsp), %ymm12, %ymm12
|
||||||
|
vpxor 2*256+1*32(%rsp), %ymm13, %ymm13
|
||||||
|
vpxor 2*256+2*32(%rsp), %ymm14, %ymm14
|
||||||
|
vpxor 2*256+3*32(%rsp), %ymm15, %ymm15
|
||||||
|
vmovdqa %ymm0, 0*256+0*32(%rbx)
|
||||||
|
vmovdqa %ymm1, 0*256+1*32(%rbx)
|
||||||
|
vmovdqa %ymm2, 0*256+2*32(%rbx)
|
||||||
|
vmovdqa %ymm3, 0*256+3*32(%rbx)
|
||||||
|
vmovdqa %ymm8, 1*256+0*32(%rbx)
|
||||||
|
vmovdqa %ymm9, 1*256+1*32(%rbx)
|
||||||
|
vmovdqa %ymm10, 1*256+2*32(%rbx)
|
||||||
|
vmovdqa %ymm11, 1*256+3*32(%rbx)
|
||||||
|
vmovdqa %ymm12, 2*256+0*32(%rbx)
|
||||||
|
vmovdqa %ymm13, 2*256+1*32(%rbx)
|
||||||
|
vmovdqa %ymm14, 2*256+2*32(%rbx)
|
||||||
|
vmovdqa %ymm15, 2*256+3*32(%rbx)
|
||||||
|
|
||||||
|
salsa8_core_6way_avx2
|
||||||
|
vpaddd 0*256+0*32(%rbx), %ymm0, %ymm0
|
||||||
|
vpaddd 0*256+1*32(%rbx), %ymm1, %ymm1
|
||||||
|
vpaddd 0*256+2*32(%rbx), %ymm2, %ymm2
|
||||||
|
vpaddd 0*256+3*32(%rbx), %ymm3, %ymm3
|
||||||
|
vpaddd 1*256+0*32(%rbx), %ymm8, %ymm8
|
||||||
|
vpaddd 1*256+1*32(%rbx), %ymm9, %ymm9
|
||||||
|
vpaddd 1*256+2*32(%rbx), %ymm10, %ymm10
|
||||||
|
vpaddd 1*256+3*32(%rbx), %ymm11, %ymm11
|
||||||
|
vpaddd 2*256+0*32(%rbx), %ymm12, %ymm12
|
||||||
|
vpaddd 2*256+1*32(%rbx), %ymm13, %ymm13
|
||||||
|
vpaddd 2*256+2*32(%rbx), %ymm14, %ymm14
|
||||||
|
vpaddd 2*256+3*32(%rbx), %ymm15, %ymm15
|
||||||
|
vmovdqa %ymm0, 0*256+0*32(%rsp)
|
||||||
|
vmovdqa %ymm1, 0*256+1*32(%rsp)
|
||||||
|
vmovdqa %ymm2, 0*256+2*32(%rsp)
|
||||||
|
vmovdqa %ymm3, 0*256+3*32(%rsp)
|
||||||
|
vmovdqa %ymm8, 1*256+0*32(%rsp)
|
||||||
|
vmovdqa %ymm9, 1*256+1*32(%rsp)
|
||||||
|
vmovdqa %ymm10, 1*256+2*32(%rsp)
|
||||||
|
vmovdqa %ymm11, 1*256+3*32(%rsp)
|
||||||
|
vmovdqa %ymm12, 2*256+0*32(%rsp)
|
||||||
|
vmovdqa %ymm13, 2*256+1*32(%rsp)
|
||||||
|
vmovdqa %ymm14, 2*256+2*32(%rsp)
|
||||||
|
vmovdqa %ymm15, 2*256+3*32(%rsp)
|
||||||
|
|
||||||
|
vpxor 0*256+4*32(%rbx), %ymm0, %ymm0
|
||||||
|
vpxor 0*256+5*32(%rbx), %ymm1, %ymm1
|
||||||
|
vpxor 0*256+6*32(%rbx), %ymm2, %ymm2
|
||||||
|
vpxor 0*256+7*32(%rbx), %ymm3, %ymm3
|
||||||
|
vpxor 1*256+4*32(%rbx), %ymm8, %ymm8
|
||||||
|
vpxor 1*256+5*32(%rbx), %ymm9, %ymm9
|
||||||
|
vpxor 1*256+6*32(%rbx), %ymm10, %ymm10
|
||||||
|
vpxor 1*256+7*32(%rbx), %ymm11, %ymm11
|
||||||
|
vpxor 2*256+4*32(%rbx), %ymm12, %ymm12
|
||||||
|
vpxor 2*256+5*32(%rbx), %ymm13, %ymm13
|
||||||
|
vpxor 2*256+6*32(%rbx), %ymm14, %ymm14
|
||||||
|
vpxor 2*256+7*32(%rbx), %ymm15, %ymm15
|
||||||
|
vmovdqa %ymm0, 0*256+4*32(%rsp)
|
||||||
|
vmovdqa %ymm1, 0*256+5*32(%rsp)
|
||||||
|
vmovdqa %ymm2, 0*256+6*32(%rsp)
|
||||||
|
vmovdqa %ymm3, 0*256+7*32(%rsp)
|
||||||
|
vmovdqa %ymm8, 1*256+4*32(%rsp)
|
||||||
|
vmovdqa %ymm9, 1*256+5*32(%rsp)
|
||||||
|
vmovdqa %ymm10, 1*256+6*32(%rsp)
|
||||||
|
vmovdqa %ymm11, 1*256+7*32(%rsp)
|
||||||
|
vmovdqa %ymm12, 2*256+4*32(%rsp)
|
||||||
|
vmovdqa %ymm13, 2*256+5*32(%rsp)
|
||||||
|
vmovdqa %ymm14, 2*256+6*32(%rsp)
|
||||||
|
vmovdqa %ymm15, 2*256+7*32(%rsp)
|
||||||
|
salsa8_core_6way_avx2
|
||||||
|
vpaddd 0*256+4*32(%rsp), %ymm0, %ymm0
|
||||||
|
vpaddd 0*256+5*32(%rsp), %ymm1, %ymm1
|
||||||
|
vpaddd 0*256+6*32(%rsp), %ymm2, %ymm2
|
||||||
|
vpaddd 0*256+7*32(%rsp), %ymm3, %ymm3
|
||||||
|
vpaddd 1*256+4*32(%rsp), %ymm8, %ymm8
|
||||||
|
vpaddd 1*256+5*32(%rsp), %ymm9, %ymm9
|
||||||
|
vpaddd 1*256+6*32(%rsp), %ymm10, %ymm10
|
||||||
|
vpaddd 1*256+7*32(%rsp), %ymm11, %ymm11
|
||||||
|
vpaddd 2*256+4*32(%rsp), %ymm12, %ymm12
|
||||||
|
vpaddd 2*256+5*32(%rsp), %ymm13, %ymm13
|
||||||
|
vpaddd 2*256+6*32(%rsp), %ymm14, %ymm14
|
||||||
|
vpaddd 2*256+7*32(%rsp), %ymm15, %ymm15
|
||||||
|
|
||||||
|
addq $6*128, %rbx
|
||||||
|
cmpq %rax, %rbx
|
||||||
|
jne scrypt_core_6way_avx2_loop1
|
||||||
|
|
||||||
|
vmovdqa %ymm0, 0*256+4*32(%rsp)
|
||||||
|
vmovdqa %ymm1, 0*256+5*32(%rsp)
|
||||||
|
vmovdqa %ymm2, 0*256+6*32(%rsp)
|
||||||
|
vmovdqa %ymm3, 0*256+7*32(%rsp)
|
||||||
|
vmovdqa %ymm8, 1*256+4*32(%rsp)
|
||||||
|
vmovdqa %ymm9, 1*256+5*32(%rsp)
|
||||||
|
vmovdqa %ymm10, 1*256+6*32(%rsp)
|
||||||
|
vmovdqa %ymm11, 1*256+7*32(%rsp)
|
||||||
|
vmovdqa %ymm12, 2*256+4*32(%rsp)
|
||||||
|
vmovdqa %ymm13, 2*256+5*32(%rsp)
|
||||||
|
vmovdqa %ymm14, 2*256+6*32(%rsp)
|
||||||
|
vmovdqa %ymm15, 2*256+7*32(%rsp)
|
||||||
|
|
||||||
|
movq $1024, %rcx
|
||||||
|
scrypt_core_6way_avx2_loop2:
|
||||||
|
vmovd %xmm0, %ebp
|
||||||
|
vmovd %xmm8, %ebx
|
||||||
|
vmovd %xmm12, %eax
|
||||||
|
vextracti128 $1, %ymm0, %xmm4
|
||||||
|
vextracti128 $1, %ymm8, %xmm5
|
||||||
|
vextracti128 $1, %ymm12, %xmm6
|
||||||
|
vmovd %xmm4, %r8d
|
||||||
|
vmovd %xmm5, %r9d
|
||||||
|
vmovd %xmm6, %r10d
|
||||||
|
vpxor 0*256+0*32(%rsp), %ymm0, %ymm0
|
||||||
|
vpxor 0*256+1*32(%rsp), %ymm1, %ymm1
|
||||||
|
vpxor 0*256+2*32(%rsp), %ymm2, %ymm2
|
||||||
|
vpxor 0*256+3*32(%rsp), %ymm3, %ymm3
|
||||||
|
vpxor 1*256+0*32(%rsp), %ymm8, %ymm8
|
||||||
|
vpxor 1*256+1*32(%rsp), %ymm9, %ymm9
|
||||||
|
vpxor 1*256+2*32(%rsp), %ymm10, %ymm10
|
||||||
|
vpxor 1*256+3*32(%rsp), %ymm11, %ymm11
|
||||||
|
vpxor 2*256+0*32(%rsp), %ymm12, %ymm12
|
||||||
|
vpxor 2*256+1*32(%rsp), %ymm13, %ymm13
|
||||||
|
vpxor 2*256+2*32(%rsp), %ymm14, %ymm14
|
||||||
|
vpxor 2*256+3*32(%rsp), %ymm15, %ymm15
|
||||||
|
andl $1023, %ebp
|
||||||
|
leaq 0(%rbp, %rbp, 2), %rbp
|
||||||
|
shll $8, %ebp
|
||||||
|
andl $1023, %ebx
|
||||||
|
leaq 1(%rbx, %rbx, 2), %rbx
|
||||||
|
shll $8, %ebx
|
||||||
|
andl $1023, %eax
|
||||||
|
leaq 2(%rax, %rax, 2), %rax
|
||||||
|
shll $8, %eax
|
||||||
|
andl $1023, %r8d
|
||||||
|
leaq 0(%r8, %r8, 2), %r8
|
||||||
|
shll $8, %r8d
|
||||||
|
andl $1023, %r9d
|
||||||
|
leaq 1(%r9, %r9, 2), %r9
|
||||||
|
shll $8, %r9d
|
||||||
|
andl $1023, %r10d
|
||||||
|
leaq 2(%r10, %r10, 2), %r10
|
||||||
|
shll $8, %r10d
|
||||||
|
vmovdqa 0*32(%rsi, %rbp), %xmm4
|
||||||
|
vinserti128 $1, 0*32+16(%rsi, %r8), %ymm4, %ymm4
|
||||||
|
vmovdqa 1*32(%rsi, %rbp), %xmm5
|
||||||
|
vinserti128 $1, 1*32+16(%rsi, %r8), %ymm5, %ymm5
|
||||||
|
vmovdqa 2*32(%rsi, %rbp), %xmm6
|
||||||
|
vinserti128 $1, 2*32+16(%rsi, %r8), %ymm6, %ymm6
|
||||||
|
vmovdqa 3*32(%rsi, %rbp), %xmm7
|
||||||
|
vinserti128 $1, 3*32+16(%rsi, %r8), %ymm7, %ymm7
|
||||||
|
vpxor %ymm4, %ymm0, %ymm0
|
||||||
|
vpxor %ymm5, %ymm1, %ymm1
|
||||||
|
vpxor %ymm6, %ymm2, %ymm2
|
||||||
|
vpxor %ymm7, %ymm3, %ymm3
|
||||||
|
vmovdqa 0*32(%rsi, %rbx), %xmm4
|
||||||
|
vinserti128 $1, 0*32+16(%rsi, %r9), %ymm4, %ymm4
|
||||||
|
vmovdqa 1*32(%rsi, %rbx), %xmm5
|
||||||
|
vinserti128 $1, 1*32+16(%rsi, %r9), %ymm5, %ymm5
|
||||||
|
vmovdqa 2*32(%rsi, %rbx), %xmm6
|
||||||
|
vinserti128 $1, 2*32+16(%rsi, %r9), %ymm6, %ymm6
|
||||||
|
vmovdqa 3*32(%rsi, %rbx), %xmm7
|
||||||
|
vinserti128 $1, 3*32+16(%rsi, %r9), %ymm7, %ymm7
|
||||||
|
vpxor %ymm4, %ymm8, %ymm8
|
||||||
|
vpxor %ymm5, %ymm9, %ymm9
|
||||||
|
vpxor %ymm6, %ymm10, %ymm10
|
||||||
|
vpxor %ymm7, %ymm11, %ymm11
|
||||||
|
vmovdqa 0*32(%rsi, %rax), %xmm4
|
||||||
|
vinserti128 $1, 0*32+16(%rsi, %r10), %ymm4, %ymm4
|
||||||
|
vmovdqa 1*32(%rsi, %rax), %xmm5
|
||||||
|
vinserti128 $1, 1*32+16(%rsi, %r10), %ymm5, %ymm5
|
||||||
|
vmovdqa 2*32(%rsi, %rax), %xmm6
|
||||||
|
vinserti128 $1, 2*32+16(%rsi, %r10), %ymm6, %ymm6
|
||||||
|
vmovdqa 3*32(%rsi, %rax), %xmm7
|
||||||
|
vinserti128 $1, 3*32+16(%rsi, %r10), %ymm7, %ymm7
|
||||||
|
vpxor %ymm4, %ymm12, %ymm12
|
||||||
|
vpxor %ymm5, %ymm13, %ymm13
|
||||||
|
vpxor %ymm6, %ymm14, %ymm14
|
||||||
|
vpxor %ymm7, %ymm15, %ymm15
|
||||||
|
|
||||||
|
vmovdqa %ymm0, 0*256+0*32(%rsp)
|
||||||
|
vmovdqa %ymm1, 0*256+1*32(%rsp)
|
||||||
|
vmovdqa %ymm2, 0*256+2*32(%rsp)
|
||||||
|
vmovdqa %ymm3, 0*256+3*32(%rsp)
|
||||||
|
vmovdqa %ymm8, 1*256+0*32(%rsp)
|
||||||
|
vmovdqa %ymm9, 1*256+1*32(%rsp)
|
||||||
|
vmovdqa %ymm10, 1*256+2*32(%rsp)
|
||||||
|
vmovdqa %ymm11, 1*256+3*32(%rsp)
|
||||||
|
vmovdqa %ymm12, 2*256+0*32(%rsp)
|
||||||
|
vmovdqa %ymm13, 2*256+1*32(%rsp)
|
||||||
|
vmovdqa %ymm14, 2*256+2*32(%rsp)
|
||||||
|
vmovdqa %ymm15, 2*256+3*32(%rsp)
|
||||||
|
salsa8_core_6way_avx2
|
||||||
|
vpaddd 0*256+0*32(%rsp), %ymm0, %ymm0
|
||||||
|
vpaddd 0*256+1*32(%rsp), %ymm1, %ymm1
|
||||||
|
vpaddd 0*256+2*32(%rsp), %ymm2, %ymm2
|
||||||
|
vpaddd 0*256+3*32(%rsp), %ymm3, %ymm3
|
||||||
|
vpaddd 1*256+0*32(%rsp), %ymm8, %ymm8
|
||||||
|
vpaddd 1*256+1*32(%rsp), %ymm9, %ymm9
|
||||||
|
vpaddd 1*256+2*32(%rsp), %ymm10, %ymm10
|
||||||
|
vpaddd 1*256+3*32(%rsp), %ymm11, %ymm11
|
||||||
|
vpaddd 2*256+0*32(%rsp), %ymm12, %ymm12
|
||||||
|
vpaddd 2*256+1*32(%rsp), %ymm13, %ymm13
|
||||||
|
vpaddd 2*256+2*32(%rsp), %ymm14, %ymm14
|
||||||
|
vpaddd 2*256+3*32(%rsp), %ymm15, %ymm15
|
||||||
|
vmovdqa %ymm0, 0*256+0*32(%rsp)
|
||||||
|
vmovdqa %ymm1, 0*256+1*32(%rsp)
|
||||||
|
vmovdqa %ymm2, 0*256+2*32(%rsp)
|
||||||
|
vmovdqa %ymm3, 0*256+3*32(%rsp)
|
||||||
|
vmovdqa %ymm8, 1*256+0*32(%rsp)
|
||||||
|
vmovdqa %ymm9, 1*256+1*32(%rsp)
|
||||||
|
vmovdqa %ymm10, 1*256+2*32(%rsp)
|
||||||
|
vmovdqa %ymm11, 1*256+3*32(%rsp)
|
||||||
|
vmovdqa %ymm12, 2*256+0*32(%rsp)
|
||||||
|
vmovdqa %ymm13, 2*256+1*32(%rsp)
|
||||||
|
vmovdqa %ymm14, 2*256+2*32(%rsp)
|
||||||
|
vmovdqa %ymm15, 2*256+3*32(%rsp)
|
||||||
|
|
||||||
|
vmovdqa 4*32(%rsi, %rbp), %xmm4
|
||||||
|
vinserti128 $1, 4*32+16(%rsi, %r8), %ymm4, %ymm4
|
||||||
|
vmovdqa 5*32(%rsi, %rbp), %xmm5
|
||||||
|
vinserti128 $1, 5*32+16(%rsi, %r8), %ymm5, %ymm5
|
||||||
|
vmovdqa 6*32(%rsi, %rbp), %xmm6
|
||||||
|
vinserti128 $1, 6*32+16(%rsi, %r8), %ymm6, %ymm6
|
||||||
|
vmovdqa 7*32(%rsi, %rbp), %xmm7
|
||||||
|
vinserti128 $1, 7*32+16(%rsi, %r8), %ymm7, %ymm7
|
||||||
|
vpxor %ymm4, %ymm0, %ymm0
|
||||||
|
vpxor %ymm5, %ymm1, %ymm1
|
||||||
|
vpxor %ymm6, %ymm2, %ymm2
|
||||||
|
vpxor %ymm7, %ymm3, %ymm3
|
||||||
|
vmovdqa 4*32(%rsi, %rbx), %xmm4
|
||||||
|
vinserti128 $1, 4*32+16(%rsi, %r9), %ymm4, %ymm4
|
||||||
|
vmovdqa 5*32(%rsi, %rbx), %xmm5
|
||||||
|
vinserti128 $1, 5*32+16(%rsi, %r9), %ymm5, %ymm5
|
||||||
|
vmovdqa 6*32(%rsi, %rbx), %xmm6
|
||||||
|
vinserti128 $1, 6*32+16(%rsi, %r9), %ymm6, %ymm6
|
||||||
|
vmovdqa 7*32(%rsi, %rbx), %xmm7
|
||||||
|
vinserti128 $1, 7*32+16(%rsi, %r9), %ymm7, %ymm7
|
||||||
|
vpxor %ymm4, %ymm8, %ymm8
|
||||||
|
vpxor %ymm5, %ymm9, %ymm9
|
||||||
|
vpxor %ymm6, %ymm10, %ymm10
|
||||||
|
vpxor %ymm7, %ymm11, %ymm11
|
||||||
|
vmovdqa 4*32(%rsi, %rax), %xmm4
|
||||||
|
vinserti128 $1, 4*32+16(%rsi, %r10), %ymm4, %ymm4
|
||||||
|
vmovdqa 5*32(%rsi, %rax), %xmm5
|
||||||
|
vinserti128 $1, 5*32+16(%rsi, %r10), %ymm5, %ymm5
|
||||||
|
vmovdqa 6*32(%rsi, %rax), %xmm6
|
||||||
|
vinserti128 $1, 6*32+16(%rsi, %r10), %ymm6, %ymm6
|
||||||
|
vmovdqa 7*32(%rsi, %rax), %xmm7
|
||||||
|
vinserti128 $1, 7*32+16(%rsi, %r10), %ymm7, %ymm7
|
||||||
|
vpxor %ymm4, %ymm12, %ymm12
|
||||||
|
vpxor %ymm5, %ymm13, %ymm13
|
||||||
|
vpxor %ymm6, %ymm14, %ymm14
|
||||||
|
vpxor %ymm7, %ymm15, %ymm15
|
||||||
|
vpxor 0*256+4*32(%rsp), %ymm0, %ymm0
|
||||||
|
vpxor 0*256+5*32(%rsp), %ymm1, %ymm1
|
||||||
|
vpxor 0*256+6*32(%rsp), %ymm2, %ymm2
|
||||||
|
vpxor 0*256+7*32(%rsp), %ymm3, %ymm3
|
||||||
|
vpxor 1*256+4*32(%rsp), %ymm8, %ymm8
|
||||||
|
vpxor 1*256+5*32(%rsp), %ymm9, %ymm9
|
||||||
|
vpxor 1*256+6*32(%rsp), %ymm10, %ymm10
|
||||||
|
vpxor 1*256+7*32(%rsp), %ymm11, %ymm11
|
||||||
|
vpxor 2*256+4*32(%rsp), %ymm12, %ymm12
|
||||||
|
vpxor 2*256+5*32(%rsp), %ymm13, %ymm13
|
||||||
|
vpxor 2*256+6*32(%rsp), %ymm14, %ymm14
|
||||||
|
vpxor 2*256+7*32(%rsp), %ymm15, %ymm15
|
||||||
|
vmovdqa %ymm0, 0*256+4*32(%rsp)
|
||||||
|
vmovdqa %ymm1, 0*256+5*32(%rsp)
|
||||||
|
vmovdqa %ymm2, 0*256+6*32(%rsp)
|
||||||
|
vmovdqa %ymm3, 0*256+7*32(%rsp)
|
||||||
|
vmovdqa %ymm8, 1*256+4*32(%rsp)
|
||||||
|
vmovdqa %ymm9, 1*256+5*32(%rsp)
|
||||||
|
vmovdqa %ymm10, 1*256+6*32(%rsp)
|
||||||
|
vmovdqa %ymm11, 1*256+7*32(%rsp)
|
||||||
|
vmovdqa %ymm12, 2*256+4*32(%rsp)
|
||||||
|
vmovdqa %ymm13, 2*256+5*32(%rsp)
|
||||||
|
vmovdqa %ymm14, 2*256+6*32(%rsp)
|
||||||
|
vmovdqa %ymm15, 2*256+7*32(%rsp)
|
||||||
|
salsa8_core_6way_avx2
|
||||||
|
vpaddd 0*256+4*32(%rsp), %ymm0, %ymm0
|
||||||
|
vpaddd 0*256+5*32(%rsp), %ymm1, %ymm1
|
||||||
|
vpaddd 0*256+6*32(%rsp), %ymm2, %ymm2
|
||||||
|
vpaddd 0*256+7*32(%rsp), %ymm3, %ymm3
|
||||||
|
vpaddd 1*256+4*32(%rsp), %ymm8, %ymm8
|
||||||
|
vpaddd 1*256+5*32(%rsp), %ymm9, %ymm9
|
||||||
|
vpaddd 1*256+6*32(%rsp), %ymm10, %ymm10
|
||||||
|
vpaddd 1*256+7*32(%rsp), %ymm11, %ymm11
|
||||||
|
vpaddd 2*256+4*32(%rsp), %ymm12, %ymm12
|
||||||
|
vpaddd 2*256+5*32(%rsp), %ymm13, %ymm13
|
||||||
|
vpaddd 2*256+6*32(%rsp), %ymm14, %ymm14
|
||||||
|
vpaddd 2*256+7*32(%rsp), %ymm15, %ymm15
|
||||||
|
vmovdqa %ymm0, 0*256+4*32(%rsp)
|
||||||
|
vmovdqa %ymm1, 0*256+5*32(%rsp)
|
||||||
|
vmovdqa %ymm2, 0*256+6*32(%rsp)
|
||||||
|
vmovdqa %ymm3, 0*256+7*32(%rsp)
|
||||||
|
vmovdqa %ymm8, 1*256+4*32(%rsp)
|
||||||
|
vmovdqa %ymm9, 1*256+5*32(%rsp)
|
||||||
|
vmovdqa %ymm10, 1*256+6*32(%rsp)
|
||||||
|
vmovdqa %ymm11, 1*256+7*32(%rsp)
|
||||||
|
vmovdqa %ymm12, 2*256+4*32(%rsp)
|
||||||
|
vmovdqa %ymm13, 2*256+5*32(%rsp)
|
||||||
|
vmovdqa %ymm14, 2*256+6*32(%rsp)
|
||||||
|
vmovdqa %ymm15, 2*256+7*32(%rsp)
|
||||||
|
|
||||||
|
subq $1, %rcx
|
||||||
|
ja scrypt_core_6way_avx2_loop2
|
||||||
|
|
||||||
|
scrypt_shuffle_unpack2 %rsp, 0*128, %rdi, 0*256+0
|
||||||
|
scrypt_shuffle_unpack2 %rsp, 1*128, %rdi, 0*256+64
|
||||||
|
scrypt_shuffle_unpack2 %rsp, 2*128, %rdi, 1*256+0
|
||||||
|
scrypt_shuffle_unpack2 %rsp, 3*128, %rdi, 1*256+64
|
||||||
|
scrypt_shuffle_unpack2 %rsp, 4*128, %rdi, 2*256+0
|
||||||
|
scrypt_shuffle_unpack2 %rsp, 5*128, %rdi, 2*256+64
|
||||||
|
|
||||||
|
scrypt_core_6way_cleanup
|
||||||
|
ret
|
||||||
|
|
||||||
|
#endif /* USE_AVX2 */
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
|
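For orientation: scrypt_core_6way implements, on six interleaved 128-byte lanes, the same two-pass memory-hard loop as the existing scalar scrypt_core: fill V[0..1023] while block-mixing X, then make 1024 data-dependent passes indexed by the first word of X's second half (the "andl $1023" computations above). A scalar sketch under those assumptions; salsa20_8() and the two-half layout of X are assumed helpers/conventions for illustration, not code from this patch:

#include <stdint.h>
#include <string.h>

void salsa20_8(uint32_t B[16]);	/* assumed helper */

/* Scalar sketch of the scrypt (N = 1024, r = 1) core that the 6-way
 * AVX2 code vectorizes.  X holds two 16-word Salsa blocks; V is the
 * 1024 * 128-byte scratchpad. */
static void scrypt_core_sketch(uint32_t X[32], uint32_t V[1024 * 32])
{
	uint32_t i, j, k;

	for (i = 0; i < 1024; i++) {
		memcpy(&V[i * 32], X, 128);	/* V[i] = X */
		for (k = 0; k < 16; k++)	/* block mix: X0 ^= X1 */
			X[k] ^= X[16 + k];
		salsa20_8(X);
		for (k = 0; k < 16; k++)	/* X1 ^= new X0 */
			X[16 + k] ^= X[k];
		salsa20_8(X + 16);
	}
	for (i = 0; i < 1024; i++) {
		j = X[16] & 1023;		/* index from first word of X1 */
		for (k = 0; k < 32; k++)	/* X ^= V[j] */
			X[k] ^= V[j * 32 + k];
		for (k = 0; k < 16; k++)
			X[k] ^= X[16 + k];
		salsa20_8(X);
		for (k = 0; k < 16; k++)
			X[16 + k] ^= X[k];
		salsa20_8(X + 16);
	}
}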
scrypt.c | 293

@@ -1,5 +1,5 @@
 /*
- * Copyright 2009 Colin Percival, 2011 ArtForz, 2011-2012 pooler
+ * Copyright 2009 Colin Percival, 2011 ArtForz, 2011-2013 pooler
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
@ -256,6 +256,128 @@ static inline void PBKDF2_SHA256_128_32_4way(uint32_t *tstate,
|
||||||
#endif /* HAVE_SHA256_4WAY */
|
#endif /* HAVE_SHA256_4WAY */
|
||||||
|
|
||||||
|
|
||||||
|
#ifdef HAVE_SHA256_8WAY
|
||||||
|
|
||||||
|
static const uint32_t finalblk_8way[8 * 16] __attribute__((aligned(32))) = {
|
||||||
|
0x00000001, 0x00000001, 0x00000001, 0x00000001, 0x00000001, 0x00000001, 0x00000001, 0x00000001,
|
||||||
|
0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000,
|
||||||
|
0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000,
|
||||||
|
0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000,
|
||||||
|
0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000,
|
||||||
|
0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000,
|
||||||
|
0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000,
|
||||||
|
0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000,
|
||||||
|
0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000,
|
||||||
|
0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000,
|
||||||
|
0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000,
|
||||||
|
0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000,
|
||||||
|
0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000,
|
||||||
|
0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000,
|
||||||
|
0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000,
|
||||||
|
0x00000620, 0x00000620, 0x00000620, 0x00000620, 0x00000620, 0x00000620, 0x00000620, 0x00000620
|
||||||
|
};
|
||||||
|
|
||||||
|
static inline void HMAC_SHA256_80_init_8way(const uint32_t *key,
|
||||||
|
uint32_t *tstate, uint32_t *ostate)
|
||||||
|
{
|
||||||
|
uint32_t ihash[8 * 8] __attribute__((aligned(32)));
|
||||||
|
uint32_t pad[8 * 16] __attribute__((aligned(32)));
|
||||||
|
int i;
|
||||||
|
|
||||||
|
/* tstate is assumed to contain the midstate of key */
|
||||||
|
memcpy(pad, key + 8 * 16, 8 * 16);
|
||||||
|
for (i = 0; i < 8; i++)
|
||||||
|
pad[8 * 4 + i] = 0x80000000;
|
||||||
|
memset(pad + 8 * 5, 0x00, 8 * 40);
|
||||||
|
for (i = 0; i < 8; i++)
|
||||||
|
pad[8 * 15 + i] = 0x00000280;
|
||||||
|
sha256_transform_8way(tstate, pad, 0);
|
||||||
|
memcpy(ihash, tstate, 8 * 32);
|
||||||
|
|
||||||
|
sha256_init_8way(ostate);
|
||||||
|
for (i = 0; i < 8 * 8; i++)
|
||||||
|
pad[i] = ihash[i] ^ 0x5c5c5c5c;
|
||||||
|
for (; i < 8 * 16; i++)
|
||||||
|
pad[i] = 0x5c5c5c5c;
|
||||||
|
sha256_transform_8way(ostate, pad, 0);
|
||||||
|
|
||||||
|
sha256_init_8way(tstate);
|
||||||
|
for (i = 0; i < 8 * 8; i++)
|
||||||
|
pad[i] = ihash[i] ^ 0x36363636;
|
||||||
|
for (; i < 8 * 16; i++)
|
||||||
|
pad[i] = 0x36363636;
|
||||||
|
sha256_transform_8way(tstate, pad, 0);
|
||||||
|
}
|
||||||
|
|
||||||
|
static inline void PBKDF2_SHA256_80_128_8way(const uint32_t *tstate,
|
||||||
|
const uint32_t *ostate, const uint32_t *salt, uint32_t *output)
|
||||||
|
{
|
||||||
|
uint32_t istate[8 * 8] __attribute__((aligned(32)));
|
||||||
|
uint32_t ostate2[8 * 8] __attribute__((aligned(32)));
|
||||||
|
uint32_t ibuf[8 * 16] __attribute__((aligned(32)));
|
||||||
|
uint32_t obuf[8 * 16] __attribute__((aligned(32)));
|
||||||
|
int i, j;
|
||||||
|
|
||||||
|
memcpy(istate, tstate, 8 * 32);
|
||||||
|
sha256_transform_8way(istate, salt, 0);
|
||||||
|
|
||||||
|
memcpy(ibuf, salt + 8 * 16, 8 * 16);
|
||||||
|
for (i = 0; i < 8; i++)
|
||||||
|
ibuf[8 * 5 + i] = 0x80000000;
|
||||||
|
memset(ibuf + 8 * 6, 0x00, 8 * 36);
|
||||||
|
for (i = 0; i < 8; i++)
|
||||||
|
ibuf[8 * 15 + i] = 0x000004a0;
|
||||||
|
|
||||||
|
for (i = 0; i < 8; i++)
|
||||||
|
obuf[8 * 8 + i] = 0x80000000;
|
||||||
|
memset(obuf + 8 * 9, 0x00, 8 * 24);
|
||||||
|
for (i = 0; i < 8; i++)
|
||||||
|
obuf[8 * 15 + i] = 0x00000300;
|
||||||
|
|
||||||
|
for (i = 0; i < 4; i++) {
|
||||||
|
memcpy(obuf, istate, 8 * 32);
|
||||||
|
ibuf[8 * 4 + 0] = i + 1;
|
||||||
|
ibuf[8 * 4 + 1] = i + 1;
|
||||||
|
ibuf[8 * 4 + 2] = i + 1;
|
||||||
|
ibuf[8 * 4 + 3] = i + 1;
|
||||||
|
ibuf[8 * 4 + 4] = i + 1;
|
||||||
|
ibuf[8 * 4 + 5] = i + 1;
|
||||||
|
ibuf[8 * 4 + 6] = i + 1;
|
||||||
|
ibuf[8 * 4 + 7] = i + 1;
|
||||||
|
sha256_transform_8way(obuf, ibuf, 0);
|
||||||
|
|
||||||
|
memcpy(ostate2, ostate, 8 * 32);
|
||||||
|
sha256_transform_8way(ostate2, obuf, 0);
|
||||||
|
for (j = 0; j < 8 * 8; j++)
|
||||||
|
output[8 * 8 * i + j] = swab32(ostate2[j]);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
static inline void PBKDF2_SHA256_128_32_8way(uint32_t *tstate,
|
||||||
|
uint32_t *ostate, const uint32_t *salt, uint32_t *output)
|
||||||
|
{
|
||||||
|
uint32_t buf[8 * 16] __attribute__((aligned(32)));
|
||||||
|
int i;
|
||||||
|
|
||||||
|
sha256_transform_8way(tstate, salt, 1);
|
||||||
|
sha256_transform_8way(tstate, salt + 8 * 16, 1);
|
||||||
|
sha256_transform_8way(tstate, finalblk_8way, 0);
|
||||||
|
|
||||||
|
memcpy(buf, tstate, 8 * 32);
|
||||||
|
for (i = 0; i < 8; i++)
|
||||||
|
buf[8 * 8 + i] = 0x80000000;
|
||||||
|
memset(buf + 8 * 9, 0x00, 8 * 24);
|
||||||
|
for (i = 0; i < 8; i++)
|
||||||
|
buf[8 * 15 + i] = 0x00000300;
|
||||||
|
sha256_transform_8way(ostate, buf, 0);
|
||||||
|
|
||||||
|
for (i = 0; i < 8 * 8; i++)
|
||||||
|
output[i] = swab32(ostate[i]);
|
||||||
|
}
|
||||||
|
|
||||||
|
#endif /* HAVE_SHA256_8WAY */
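HMAC_SHA256_80_init_8way in the hunk above performs the standard HMAC-SHA256 key setup (finish hashing the 80-byte header as the key, then absorb key^ipad with 0x36 bytes and key^opad with 0x5c bytes) for eight headers at once in interleaved form. A scalar, single-lane sketch of the same setup, assuming the miner's scalar sha256_init()/sha256_transform() primitives; the function name here is illustrative:

#include <stdint.h>
#include <string.h>

void sha256_init(uint32_t *state);					/* existing scalar primitive */
void sha256_transform(uint32_t *state, const uint32_t *block, int swap);	/* existing scalar primitive */

/* Scalar sketch of the HMAC-SHA256 key setup done per lane above.
 * tstate starts as the midstate of the first 64 header bytes. */
static void hmac_sha256_80_init_sketch(const uint32_t *key,
	uint32_t tstate[8], uint32_t ostate[8])
{
	uint32_t ihash[8], pad[16];
	int i;

	/* Finish hashing the key: last 16 bytes, padding, length 0x280 bits. */
	memcpy(pad, key + 16, 16);
	pad[4] = 0x80000000;
	memset(pad + 5, 0x00, 40);
	pad[15] = 0x00000280;
	sha256_transform(tstate, pad, 0);
	memcpy(ihash, tstate, 32);

	/* ostate absorbs hashed-key ^ opad (0x5c), tstate hashed-key ^ ipad (0x36). */
	sha256_init(ostate);
	for (i = 0; i < 8; i++)
		pad[i] = ihash[i] ^ 0x5c5c5c5c;
	for (; i < 16; i++)
		pad[i] = 0x5c5c5c5c;
	sha256_transform(ostate, pad, 0);

	sha256_init(tstate);
	for (i = 0; i < 8; i++)
		pad[i] = ihash[i] ^ 0x36363636;
	for (; i < 16; i++)
		pad[i] = 0x36363636;
	sha256_transform(tstate, pad, 0);
}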
|
||||||
|
|
||||||
|
|
||||||
#if defined(__x86_64__)
|
#if defined(__x86_64__)
|
||||||
|
|
||||||
#define SCRYPT_MAX_WAYS 12
|
#define SCRYPT_MAX_WAYS 12
|
||||||
|
@@ -263,6 +385,12 @@ static inline void PBKDF2_SHA256_128_32_4way(uint32_t *tstate,
 int scrypt_best_throughput();
 void scrypt_core(uint32_t *X, uint32_t *V);
 void scrypt_core_3way(uint32_t *X, uint32_t *V);
+#if defined(USE_AVX2)
+#undef SCRYPT_MAX_WAYS
+#define SCRYPT_MAX_WAYS 24
+#define HAVE_SCRYPT_6WAY 1
+void scrypt_core_6way(uint32_t *X, uint32_t *V);
+#endif
 
 #elif defined(__i386__)
 
@@ -410,47 +538,32 @@ static void scrypt_1024_1_1_256_4way(const uint32_t *input,
 	uint32_t W[4 * 32] __attribute__((aligned(128)));
 	uint32_t X[4 * 32] __attribute__((aligned(128)));
 	uint32_t *V;
-	int i;
+	int i, k;
 	
 	V = (uint32_t *)(((uintptr_t)(scratchpad) + 63) & ~ (uintptr_t)(63));
 	
-	for (i = 0; i < 20; i++) {
-		W[4 * i + 0] = input[0 * 20 + i];
-		W[4 * i + 1] = input[1 * 20 + i];
-		W[4 * i + 2] = input[2 * 20 + i];
-		W[4 * i + 3] = input[3 * 20 + i];
-	}
-	for (i = 0; i < 8; i++) {
-		tstate[4 * i + 0] = midstate[i];
-		tstate[4 * i + 1] = midstate[i];
-		tstate[4 * i + 2] = midstate[i];
-		tstate[4 * i + 3] = midstate[i];
-	}
+	for (i = 0; i < 20; i++)
+		for (k = 0; k < 4; k++)
+			W[4 * i + k] = input[k * 20 + i];
+	for (i = 0; i < 8; i++)
+		for (k = 0; k < 4; k++)
+			tstate[4 * i + k] = midstate[i];
 	HMAC_SHA256_80_init_4way(W, tstate, ostate);
 	PBKDF2_SHA256_80_128_4way(tstate, ostate, W, W);
-	for (i = 0; i < 32; i++) {
-		X[0 * 32 + i] = W[4 * i + 0];
-		X[1 * 32 + i] = W[4 * i + 1];
-		X[2 * 32 + i] = W[4 * i + 2];
-		X[3 * 32 + i] = W[4 * i + 3];
-	}
+	for (i = 0; i < 32; i++)
+		for (k = 0; k < 4; k++)
+			X[k * 32 + i] = W[4 * i + k];
 	scrypt_core(X + 0 * 32, V);
 	scrypt_core(X + 1 * 32, V);
 	scrypt_core(X + 2 * 32, V);
 	scrypt_core(X + 3 * 32, V);
-	for (i = 0; i < 32; i++) {
-		W[4 * i + 0] = X[0 * 32 + i];
-		W[4 * i + 1] = X[1 * 32 + i];
-		W[4 * i + 2] = X[2 * 32 + i];
-		W[4 * i + 3] = X[3 * 32 + i];
-	}
+	for (i = 0; i < 32; i++)
+		for (k = 0; k < 4; k++)
+			W[4 * i + k] = X[k * 32 + i];
 	PBKDF2_SHA256_128_32_4way(tstate, ostate, W, W);
-	for (i = 0; i < 8; i++) {
-		output[0 * 8 + i] = W[4 * i + 0];
-		output[1 * 8 + i] = W[4 * i + 1];
-		output[2 * 8 + i] = W[4 * i + 2];
-		output[3 * 8 + i] = W[4 * i + 3];
-	}
+	for (i = 0; i < 8; i++)
+		for (k = 0; k < 4; k++)
+			output[k * 8 + i] = W[4 * i + k];
 }
 #endif /* HAVE_SHA256_4WAY */
 
@ -491,68 +604,97 @@ static void scrypt_1024_1_1_256_12way(const uint32_t *input,
|
||||||
uint32_t W[12 * 32] __attribute__((aligned(128)));
|
uint32_t W[12 * 32] __attribute__((aligned(128)));
|
||||||
uint32_t X[12 * 32] __attribute__((aligned(128)));
|
uint32_t X[12 * 32] __attribute__((aligned(128)));
|
||||||
uint32_t *V;
|
uint32_t *V;
|
||||||
int i, j;
|
int i, j, k;
|
||||||
|
|
||||||
V = (uint32_t *)(((uintptr_t)(scratchpad) + 63) & ~ (uintptr_t)(63));
|
V = (uint32_t *)(((uintptr_t)(scratchpad) + 63) & ~ (uintptr_t)(63));
|
||||||
|
|
||||||
for (j = 0; j < 3; j++) {
|
for (j = 0; j < 3; j++)
|
||||||
for (i = 0; i < 20; i++) {
|
for (i = 0; i < 20; i++)
|
||||||
W[128 * j + 4 * i + 0] = input[80 * j + 0 * 20 + i];
|
for (k = 0; k < 4; k++)
|
||||||
W[128 * j + 4 * i + 1] = input[80 * j + 1 * 20 + i];
|
W[128 * j + 4 * i + k] = input[80 * j + k * 20 + i];
|
||||||
W[128 * j + 4 * i + 2] = input[80 * j + 2 * 20 + i];
|
for (j = 0; j < 3; j++)
|
||||||
W[128 * j + 4 * i + 3] = input[80 * j + 3 * 20 + i];
|
for (i = 0; i < 8; i++)
|
||||||
}
|
for (k = 0; k < 4; k++)
|
||||||
}
|
tstate[32 * j + 4 * i + k] = midstate[i];
|
||||||
for (j = 0; j < 3; j++) {
|
|
||||||
for (i = 0; i < 8; i++) {
|
|
||||||
tstate[32 * j + 4 * i + 0] = midstate[i];
|
|
||||||
tstate[32 * j + 4 * i + 1] = midstate[i];
|
|
||||||
tstate[32 * j + 4 * i + 2] = midstate[i];
|
|
||||||
tstate[32 * j + 4 * i + 3] = midstate[i];
|
|
||||||
}
|
|
||||||
}
|
|
||||||
HMAC_SHA256_80_init_4way(W + 0, tstate + 0, ostate + 0);
|
HMAC_SHA256_80_init_4way(W + 0, tstate + 0, ostate + 0);
|
||||||
HMAC_SHA256_80_init_4way(W + 128, tstate + 32, ostate + 32);
|
HMAC_SHA256_80_init_4way(W + 128, tstate + 32, ostate + 32);
|
||||||
HMAC_SHA256_80_init_4way(W + 256, tstate + 64, ostate + 64);
|
HMAC_SHA256_80_init_4way(W + 256, tstate + 64, ostate + 64);
|
||||||
PBKDF2_SHA256_80_128_4way(tstate + 0, ostate + 0, W + 0, W + 0);
|
PBKDF2_SHA256_80_128_4way(tstate + 0, ostate + 0, W + 0, W + 0);
|
||||||
PBKDF2_SHA256_80_128_4way(tstate + 32, ostate + 32, W + 128, W + 128);
|
PBKDF2_SHA256_80_128_4way(tstate + 32, ostate + 32, W + 128, W + 128);
|
||||||
PBKDF2_SHA256_80_128_4way(tstate + 64, ostate + 64, W + 256, W + 256);
|
PBKDF2_SHA256_80_128_4way(tstate + 64, ostate + 64, W + 256, W + 256);
|
||||||
for (j = 0; j < 3; j++) {
|
for (j = 0; j < 3; j++)
|
||||||
for (i = 0; i < 32; i++) {
|
for (i = 0; i < 32; i++)
|
||||||
X[128 * j + 0 * 32 + i] = W[128 * j + 4 * i + 0];
|
for (k = 0; k < 4; k++)
|
||||||
X[128 * j + 1 * 32 + i] = W[128 * j + 4 * i + 1];
|
X[128 * j + k * 32 + i] = W[128 * j + 4 * i + k];
|
||||||
X[128 * j + 2 * 32 + i] = W[128 * j + 4 * i + 2];
|
|
||||||
X[128 * j + 3 * 32 + i] = W[128 * j + 4 * i + 3];
|
|
||||||
}
|
|
||||||
}
|
|
||||||
scrypt_core_3way(X + 0 * 96, V);
|
scrypt_core_3way(X + 0 * 96, V);
|
||||||
scrypt_core_3way(X + 1 * 96, V);
|
scrypt_core_3way(X + 1 * 96, V);
|
||||||
scrypt_core_3way(X + 2 * 96, V);
|
scrypt_core_3way(X + 2 * 96, V);
|
||||||
scrypt_core_3way(X + 3 * 96, V);
|
scrypt_core_3way(X + 3 * 96, V);
|
||||||
for (j = 0; j < 3; j++) {
|
for (j = 0; j < 3; j++)
|
||||||
for (i = 0; i < 32; i++) {
|
for (i = 0; i < 32; i++)
|
||||||
W[128 * j + 4 * i + 0] = X[128 * j + 0 * 32 + i];
|
for (k = 0; k < 4; k++)
|
||||||
W[128 * j + 4 * i + 1] = X[128 * j + 1 * 32 + i];
|
W[128 * j + 4 * i + k] = X[128 * j + k * 32 + i];
|
||||||
W[128 * j + 4 * i + 2] = X[128 * j + 2 * 32 + i];
|
|
||||||
W[128 * j + 4 * i + 3] = X[128 * j + 3 * 32 + i];
|
|
||||||
}
|
|
||||||
}
|
|
||||||
PBKDF2_SHA256_128_32_4way(tstate + 0, ostate + 0, W + 0, W + 0);
|
PBKDF2_SHA256_128_32_4way(tstate + 0, ostate + 0, W + 0, W + 0);
|
||||||
PBKDF2_SHA256_128_32_4way(tstate + 32, ostate + 32, W + 128, W + 128);
|
PBKDF2_SHA256_128_32_4way(tstate + 32, ostate + 32, W + 128, W + 128);
|
||||||
PBKDF2_SHA256_128_32_4way(tstate + 64, ostate + 64, W + 256, W + 256);
|
PBKDF2_SHA256_128_32_4way(tstate + 64, ostate + 64, W + 256, W + 256);
|
||||||
for (j = 0; j < 3; j++) {
|
for (j = 0; j < 3; j++)
|
||||||
for (i = 0; i < 8; i++) {
|
for (i = 0; i < 8; i++)
|
||||||
output[32 * j + 0 * 8 + i] = W[128 * j + 4 * i + 0];
|
for (k = 0; k < 4; k++)
|
||||||
output[32 * j + 1 * 8 + i] = W[128 * j + 4 * i + 1];
|
output[32 * j + k * 8 + i] = W[128 * j + 4 * i + k];
|
||||||
output[32 * j + 2 * 8 + i] = W[128 * j + 4 * i + 2];
|
|
||||||
output[32 * j + 3 * 8 + i] = W[128 * j + 4 * i + 3];
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
#endif /* HAVE_SHA256_4WAY */
|
#endif /* HAVE_SHA256_4WAY */
|
||||||
|
|
||||||
#endif /* HAVE_SCRYPT_3WAY */
|
#endif /* HAVE_SCRYPT_3WAY */
|
||||||
|
|
||||||
|
#ifdef HAVE_SCRYPT_6WAY
|
||||||
|
static void scrypt_1024_1_1_256_24way(const uint32_t *input,
|
||||||
|
uint32_t *output, uint32_t *midstate, unsigned char *scratchpad)
|
||||||
|
{
|
||||||
|
uint32_t tstate[24 * 8] __attribute__((aligned(128)));
|
||||||
|
uint32_t ostate[24 * 8] __attribute__((aligned(128)));
|
||||||
|
uint32_t W[24 * 32] __attribute__((aligned(128)));
|
||||||
|
uint32_t X[24 * 32] __attribute__((aligned(128)));
|
||||||
|
uint32_t *V;
|
||||||
|
int i, j, k;
|
||||||
|
|
||||||
|
V = (uint32_t *)(((uintptr_t)(scratchpad) + 63) & ~ (uintptr_t)(63));
|
||||||
|
|
||||||
|
for (j = 0; j < 3; j++)
|
||||||
|
for (i = 0; i < 20; i++)
|
||||||
|
for (k = 0; k < 8; k++)
|
||||||
|
W[8 * 32 * j + 8 * i + k] = input[8 * 20 * j + k * 20 + i];
|
||||||
|
for (j = 0; j < 3; j++)
|
||||||
|
for (i = 0; i < 8; i++)
|
||||||
|
for (k = 0; k < 8; k++)
|
||||||
|
tstate[8 * 8 * j + 8 * i + k] = midstate[i];
|
||||||
|
HMAC_SHA256_80_init_8way(W + 0, tstate + 0, ostate + 0);
|
||||||
|
HMAC_SHA256_80_init_8way(W + 256, tstate + 64, ostate + 64);
|
||||||
|
HMAC_SHA256_80_init_8way(W + 512, tstate + 128, ostate + 128);
|
||||||
|
PBKDF2_SHA256_80_128_8way(tstate + 0, ostate + 0, W + 0, W + 0);
|
||||||
|
PBKDF2_SHA256_80_128_8way(tstate + 64, ostate + 64, W + 256, W + 256);
|
||||||
|
PBKDF2_SHA256_80_128_8way(tstate + 128, ostate + 128, W + 512, W + 512);
|
||||||
|
for (j = 0; j < 3; j++)
|
||||||
|
for (i = 0; i < 32; i++)
|
||||||
|
for (k = 0; k < 8; k++)
|
||||||
|
X[8 * 32 * j + k * 32 + i] = W[8 * 32 * j + 8 * i + k];
|
||||||
|
scrypt_core_6way(X + 0 * 32, V);
|
||||||
|
scrypt_core_6way(X + 6 * 32, V);
|
||||||
|
scrypt_core_6way(X + 12 * 32, V);
|
||||||
|
scrypt_core_6way(X + 18 * 32, V);
|
||||||
|
for (j = 0; j < 3; j++)
|
||||||
|
for (i = 0; i < 32; i++)
|
||||||
|
for (k = 0; k < 8; k++)
|
||||||
|
W[8 * 32 * j + 8 * i + k] = X[8 * 32 * j + k * 32 + i];
|
||||||
|
PBKDF2_SHA256_128_32_8way(tstate + 0, ostate + 0, W + 0, W + 0);
|
||||||
|
PBKDF2_SHA256_128_32_8way(tstate + 64, ostate + 64, W + 256, W + 256);
|
||||||
|
PBKDF2_SHA256_128_32_8way(tstate + 128, ostate + 128, W + 512, W + 512);
|
||||||
|
for (j = 0; j < 3; j++)
|
||||||
|
for (i = 0; i < 8; i++)
|
||||||
|
for (k = 0; k < 8; k++)
|
||||||
|
output[8 * 8 * j + k * 8 + i] = W[8 * 32 * j + 8 * i + k];
|
||||||
|
}
|
||||||
|
#endif /* HAVE_SCRYPT_6WAY */
|
||||||
|
|
||||||
int scanhash_scrypt(int thr_id, uint32_t *pdata,
|
int scanhash_scrypt(int thr_id, uint32_t *pdata,
|
||||||
unsigned char *scratchbuf, const uint32_t *ptarget,
|
unsigned char *scratchbuf, const uint32_t *ptarget,
|
||||||
uint32_t max_nonce, unsigned long *hashes_done)
|
uint32_t max_nonce, unsigned long *hashes_done)
|
||||||
|
@@ -589,6 +731,11 @@ int scanhash_scrypt(int thr_id, uint32_t *pdata,
 		scrypt_1024_1_1_256_12way(data, hash, midstate, scratchbuf);
 	else
 #endif
+#if defined(HAVE_SCRYPT_6WAY)
+	if (throughput == 24)
+		scrypt_1024_1_1_256_24way(data, hash, midstate, scratchbuf);
+	else
+#endif
 #if defined(HAVE_SCRYPT_3WAY)
 	if (throughput == 3)
 		scrypt_1024_1_1_256_3way(data, hash, midstate, scratchbuf);
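A note on the throughput values compared in this dispatch chain: scrypt_best_throughput() (patched above to return 6 when AVX2 is usable) is multiplied by the SHA-256 vector width before this code runs, which is how 3 becomes 12 and 6 becomes 24. A rough sketch of that selection, with the x4 step treated as an assumption inferred from the 3-way/12-way pairing rather than quoted from the file:

/* Assumption-based sketch of how the batch size tested above is chosen. */
int scrypt_best_throughput();	/* assembly routine patched in scrypt-x64.S */
int sha256_use_4way();		/* existing runtime check */

static int choose_scrypt_throughput(void)
{
	int throughput = scrypt_best_throughput();	/* 6 on AVX2-capable CPUs */
#ifdef HAVE_SHA256_4WAY
	if (sha256_use_4way())
		throughput *= 4;			/* 3 -> 12, 6 -> 24 */
#endif
	return throughput;
}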
sha2-x64.S | 1056
(diff not shown here: the file diff is too large to display)
sha2.c | 64
|
@ -522,6 +522,65 @@ static inline int scanhash_sha256d_4way(int thr_id, uint32_t *pdata,
|
||||||
|
|
||||||
#endif /* HAVE_SHA256_4WAY */
|
#endif /* HAVE_SHA256_4WAY */
|
||||||
|
|
||||||
|
#ifdef HAVE_SHA256_8WAY
|
||||||
|
|
||||||
|
void sha256d_ms_8way(uint32_t *hash, uint32_t *data,
|
||||||
|
const uint32_t *midstate, const uint32_t *prehash);
|
||||||
|
|
||||||
|
static inline int scanhash_sha256d_8way(int thr_id, uint32_t *pdata,
|
||||||
|
const uint32_t *ptarget, uint32_t max_nonce, unsigned long *hashes_done)
|
||||||
|
{
|
||||||
|
uint32_t data[8 * 64] __attribute__((aligned(128)));
|
||||||
|
uint32_t hash[8 * 8] __attribute__((aligned(32)));
|
||||||
|
uint32_t midstate[8 * 8] __attribute__((aligned(32)));
|
||||||
|
uint32_t prehash[8 * 8] __attribute__((aligned(32)));
|
||||||
|
uint32_t n = pdata[19] - 1;
|
||||||
|
const uint32_t first_nonce = pdata[19];
|
||||||
|
const uint32_t Htarg = ptarget[7];
|
||||||
|
int i, j;
|
||||||
|
|
||||||
|
memcpy(data, pdata + 16, 64);
|
||||||
|
sha256d_preextend(data);
|
||||||
|
for (i = 31; i >= 0; i--)
|
||||||
|
for (j = 0; j < 8; j++)
|
||||||
|
data[i * 8 + j] = data[i];
|
||||||
|
|
||||||
|
sha256_init(midstate);
|
||||||
|
sha256_transform(midstate, pdata, 0);
|
||||||
|
memcpy(prehash, midstate, 32);
|
||||||
|
sha256d_prehash(prehash, pdata + 16);
|
||||||
|
for (i = 7; i >= 0; i--) {
|
||||||
|
for (j = 0; j < 8; j++) {
|
||||||
|
midstate[i * 8 + j] = midstate[i];
|
||||||
|
prehash[i * 8 + j] = prehash[i];
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
do {
|
||||||
|
for (i = 0; i < 8; i++)
|
||||||
|
data[8 * 3 + i] = ++n;
|
||||||
|
|
||||||
|
sha256d_ms_8way(hash, data, midstate, prehash);
|
||||||
|
|
||||||
|
for (i = 0; i < 8; i++) {
|
||||||
|
if (swab32(hash[8 * 7 + i]) <= Htarg) {
|
||||||
|
pdata[19] = data[8 * 3 + i];
|
||||||
|
sha256d_80_swap(hash, pdata);
|
||||||
|
if (fulltest(hash, ptarget)) {
|
||||||
|
*hashes_done = n - first_nonce + 1;
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
} while (n < max_nonce && !work_restart[thr_id].restart);
|
||||||
|
|
||||||
|
*hashes_done = n - first_nonce + 1;
|
||||||
|
pdata[19] = n;
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
#endif /* HAVE_SHA256_8WAY */
|
||||||
|
|
||||||
int scanhash_sha256d(int thr_id, uint32_t *pdata, const uint32_t *ptarget,
|
int scanhash_sha256d(int thr_id, uint32_t *pdata, const uint32_t *ptarget,
|
||||||
uint32_t max_nonce, unsigned long *hashes_done)
|
uint32_t max_nonce, unsigned long *hashes_done)
|
||||||
{
|
{
|
||||||
|
@@ -533,6 +592,11 @@ int scanhash_sha256d(int thr_id, uint32_t *pdata, const uint32_t *ptarget,
 	const uint32_t first_nonce = pdata[19];
 	const uint32_t Htarg = ptarget[7];
 	
+#ifdef HAVE_SHA256_8WAY
+	if (sha256_use_8way())
+		return scanhash_sha256d_8way(thr_id, pdata, ptarget,
+			max_nonce, hashes_done);
+#endif
 #ifdef HAVE_SHA256_4WAY
 	if (sha256_use_4way())
 		return scanhash_sha256d_4way(thr_id, pdata, ptarget,