Add AVX2-enabled functions for x86-64

pooler 2013-07-05 18:25:34 +02:00
parent 44d4815b01
commit e878267239
8 changed files with 1995 additions and 78 deletions

README (4 lines changed)

@@ -42,7 +42,7 @@ Architecture-specific notes:
        To use NEON instructions, add "-mfpu=neon" to CFLAGS.
x86:    The miner checks for SSE2 instructions support at runtime,
        and uses them if they are available.
-x86-64: The miner can take advantage of AVX and XOP instructions,
+x86-64: The miner can take advantage of AVX, AVX2 and XOP instructions,
        but only if both the CPU and the operating system support them.
        * Linux supports AVX starting from kernel version 2.6.30.
        * FreeBSD supports AVX starting with 9.1-RELEASE.

@@ -50,7 +50,7 @@ Architecture-specific notes:
        * Windows supports AVX starting from Windows 7 SP1 and
          Windows Server 2008 R2 SP1.
The configure script outputs a warning if the assembler
-cannot compile AVX or XOP instructions. In that case, the miner
+doesn't support some instruction sets. In that case, the miner
can still be built, but unavailable optimizations are left off.

Usage instructions: Run "minerd --help" to see options.

configure.ac

@@ -77,6 +77,14 @@ then
AC_MSG_RESULT(no)
AC_MSG_WARN([The assembler does not support the XOP instruction set.])
)
AC_MSG_CHECKING(whether we can compile AVX2 code)
AC_COMPILE_IFELSE([AC_LANG_PROGRAM(,[asm ("vpaddd %ymm0, %ymm1, %ymm2");])],
AC_DEFINE(USE_AVX2, 1, [Define to 1 if AVX2 assembly is available.])
AC_MSG_RESULT(yes)
,
AC_MSG_RESULT(no)
AC_MSG_WARN([The assembler does not support the AVX2 instruction set.])
)
,
AC_MSG_RESULT(no)
AC_MSG_WARN([The assembler does not support the AVX instruction set.])
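For reference, the probe added above only asks whether the assembler accepts a single representative AVX2 instruction; it is roughly equivalent to compiling the following stand-alone file (a sketch, not part of the commit). If it assembles, AC_DEFINE puts USE_AVX2 into the generated config header and the AVX2 code paths below are built in.

/* avx2-probe.c -- approximately what the AC_COMPILE_IFELSE test compiles;
 * the file name is illustrative only */
int main(void)
{
    asm ("vpaddd %ymm0, %ymm1, %ymm2");  /* one representative AVX2 instruction */
    return 0;
}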

cpu-miner.c

@@ -668,7 +668,7 @@ static void *miner_thread(void *userdata)
int thr_id = mythr->id;
struct work work;
uint32_t max_nonce;
-uint32_t end_nonce = 0xffffffffU / opt_n_threads * (thr_id + 1) - 0x10;
+uint32_t end_nonce = 0xffffffffU / opt_n_threads * (thr_id + 1) - 0x20;
unsigned char *scratchbuf = NULL;
char s[16];
int i;

miner.h

@@ -141,6 +141,13 @@ void sha256_init_4way(uint32_t *state);
void sha256_transform_4way(uint32_t *state, const uint32_t *block, int swap);
#endif
#if defined(__x86_64__) && defined(USE_AVX2)
#define HAVE_SHA256_8WAY 1
int sha256_use_8way();
void sha256_init_8way(uint32_t *state);
void sha256_transform_8way(uint32_t *state, const uint32_t *block, int swap);
#endif
extern int scanhash_sha256d(int thr_id, uint32_t *pdata,
const uint32_t *ptarget, uint32_t max_nonce, unsigned long *hashes_done);
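The 8-way interface mirrors the existing 4-way one, but with eight interleaved lanes. The sketch below shows how a caller would lay out data, inferred from the scrypt.c code later in this commit; the helper name and the reading of the swap flag are assumptions, not part of the commit.

#include <stdint.h>

void sha256_init_8way(uint32_t *state);
void sha256_transform_8way(uint32_t *state, const uint32_t *block, int swap);

/* Hash one 64-byte block for 8 independent lanes: word i of lane k is
 * stored at index 8 * i + k, as in HMAC_SHA256_80_init_8way below. */
static void sha256_8way_one_block(uint32_t state[8 * 8],
    const uint32_t block[8][16])
{
    uint32_t interleaved[8 * 16] __attribute__((aligned(32)));
    int i, k;

    for (i = 0; i < 16; i++)
        for (k = 0; k < 8; k++)
            interleaved[8 * i + k] = block[k][i];
    sha256_init_8way(state);
    /* swap appears to request byte-swapping of the input words, as in the
     * 4-way variant (assumption); 0 = use the words as given */
    sha256_transform_8way(state, interleaved, 0);
}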

scrypt-x64.S

@@ -1,5 +1,5 @@
/*
- * Copyright 2011-2012 pooler@litecoinpool.org
+ * Copyright 2011-2013 pooler@litecoinpool.org
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
@@ -39,6 +39,30 @@
scrypt_best_throughput:
_scrypt_best_throughput:
pushq %rbx
#if defined(USE_AVX2)
/* Check for AVX and OSXSAVE support */
movl $1, %eax
cpuid
andl $0x18000000, %ecx
cmpl $0x18000000, %ecx
jne scrypt_best_throughput_no_avx2
/* Check for AVX2 support */
movl $7, %eax
xorl %ecx, %ecx
cpuid
andl $0x00000020, %ebx
cmpl $0x00000020, %ebx
jne scrypt_best_throughput_no_avx2
/* Check for XMM and YMM state support */
xorl %ecx, %ecx
xgetbv
andl $0x00000006, %eax
cmpl $0x00000006, %eax
jne scrypt_best_throughput_no_avx2
movl $6, %eax
jmp scrypt_best_throughput_exit
scrypt_best_throughput_no_avx2:
#endif
/* Check for AuthenticAMD */
xorq %rax, %rax
cpuid
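The new cpuid/xgetbv sequence above gates the 6-way path on three conditions: the CPU advertises AVX and OSXSAVE, the operating system has enabled XMM/YMM state saving in XCR0, and the CPU advertises AVX2. A hedged C equivalent of the same check (bit positions are from the Intel SDM; the helper name is illustrative, not part of the commit):

#include <cpuid.h>
#include <stdint.h>

static int os_and_cpu_support_avx2(void)
{
    unsigned int eax, ebx, ecx, edx;
    uint32_t xcr0_lo, xcr0_hi;

    /* CPUID leaf 1: ECX bit 27 = OSXSAVE, bit 28 = AVX */
    if (!__get_cpuid(1, &eax, &ebx, &ecx, &edx))
        return 0;
    if ((ecx & 0x18000000) != 0x18000000)
        return 0;
    /* XGETBV with ECX = 0: XCR0 bits 1 and 2 = XMM and YMM state enabled by the OS */
    asm volatile ("xgetbv" : "=a" (xcr0_lo), "=d" (xcr0_hi) : "c" (0));
    if ((xcr0_lo & 0x6) != 0x6)
        return 0;
    /* CPUID leaf 7, subleaf 0: EBX bit 5 = AVX2 */
    __cpuid_count(7, 0, eax, ebx, ecx, edx);
    return (ebx & 0x20) != 0;
}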
@@ -2239,4 +2263,617 @@ scrypt_core_3way_xmm_loop2:
scrypt_core_3way_cleanup
ret
#if defined(USE_AVX2)
.macro salsa8_core_6way_avx2_doubleround
vpaddd %ymm0, %ymm1, %ymm4
vpaddd %ymm8, %ymm9, %ymm6
vpaddd %ymm12, %ymm13, %ymm7
vpslld $7, %ymm4, %ymm5
vpsrld $25, %ymm4, %ymm4
vpxor %ymm5, %ymm3, %ymm3
vpxor %ymm4, %ymm3, %ymm3
vpslld $7, %ymm6, %ymm5
vpsrld $25, %ymm6, %ymm6
vpxor %ymm5, %ymm11, %ymm11
vpxor %ymm6, %ymm11, %ymm11
vpslld $7, %ymm7, %ymm5
vpsrld $25, %ymm7, %ymm7
vpxor %ymm5, %ymm15, %ymm15
vpxor %ymm7, %ymm15, %ymm15
vpaddd %ymm3, %ymm0, %ymm4
vpaddd %ymm11, %ymm8, %ymm6
vpaddd %ymm15, %ymm12, %ymm7
vpslld $9, %ymm4, %ymm5
vpsrld $23, %ymm4, %ymm4
vpxor %ymm5, %ymm2, %ymm2
vpxor %ymm4, %ymm2, %ymm2
vpslld $9, %ymm6, %ymm5
vpsrld $23, %ymm6, %ymm6
vpxor %ymm5, %ymm10, %ymm10
vpxor %ymm6, %ymm10, %ymm10
vpslld $9, %ymm7, %ymm5
vpsrld $23, %ymm7, %ymm7
vpxor %ymm5, %ymm14, %ymm14
vpxor %ymm7, %ymm14, %ymm14
vpaddd %ymm2, %ymm3, %ymm4
vpaddd %ymm10, %ymm11, %ymm6
vpaddd %ymm14, %ymm15, %ymm7
vpslld $13, %ymm4, %ymm5
vpsrld $19, %ymm4, %ymm4
vpshufd $0x93, %ymm3, %ymm3
vpshufd $0x93, %ymm11, %ymm11
vpshufd $0x93, %ymm15, %ymm15
vpxor %ymm5, %ymm1, %ymm1
vpxor %ymm4, %ymm1, %ymm1
vpslld $13, %ymm6, %ymm5
vpsrld $19, %ymm6, %ymm6
vpxor %ymm5, %ymm9, %ymm9
vpxor %ymm6, %ymm9, %ymm9
vpslld $13, %ymm7, %ymm5
vpsrld $19, %ymm7, %ymm7
vpxor %ymm5, %ymm13, %ymm13
vpxor %ymm7, %ymm13, %ymm13
vpaddd %ymm1, %ymm2, %ymm4
vpaddd %ymm9, %ymm10, %ymm6
vpaddd %ymm13, %ymm14, %ymm7
vpslld $18, %ymm4, %ymm5
vpsrld $14, %ymm4, %ymm4
vpshufd $0x4e, %ymm2, %ymm2
vpshufd $0x4e, %ymm10, %ymm10
vpshufd $0x4e, %ymm14, %ymm14
vpxor %ymm5, %ymm0, %ymm0
vpxor %ymm4, %ymm0, %ymm0
vpslld $18, %ymm6, %ymm5
vpsrld $14, %ymm6, %ymm6
vpxor %ymm5, %ymm8, %ymm8
vpxor %ymm6, %ymm8, %ymm8
vpslld $18, %ymm7, %ymm5
vpsrld $14, %ymm7, %ymm7
vpxor %ymm5, %ymm12, %ymm12
vpxor %ymm7, %ymm12, %ymm12
vpaddd %ymm0, %ymm3, %ymm4
vpaddd %ymm8, %ymm11, %ymm6
vpaddd %ymm12, %ymm15, %ymm7
vpslld $7, %ymm4, %ymm5
vpsrld $25, %ymm4, %ymm4
vpshufd $0x39, %ymm1, %ymm1
vpxor %ymm5, %ymm1, %ymm1
vpxor %ymm4, %ymm1, %ymm1
vpslld $7, %ymm6, %ymm5
vpsrld $25, %ymm6, %ymm6
vpshufd $0x39, %ymm9, %ymm9
vpxor %ymm5, %ymm9, %ymm9
vpxor %ymm6, %ymm9, %ymm9
vpslld $7, %ymm7, %ymm5
vpsrld $25, %ymm7, %ymm7
vpshufd $0x39, %ymm13, %ymm13
vpxor %ymm5, %ymm13, %ymm13
vpxor %ymm7, %ymm13, %ymm13
vpaddd %ymm1, %ymm0, %ymm4
vpaddd %ymm9, %ymm8, %ymm6
vpaddd %ymm13, %ymm12, %ymm7
vpslld $9, %ymm4, %ymm5
vpsrld $23, %ymm4, %ymm4
vpxor %ymm5, %ymm2, %ymm2
vpxor %ymm4, %ymm2, %ymm2
vpslld $9, %ymm6, %ymm5
vpsrld $23, %ymm6, %ymm6
vpxor %ymm5, %ymm10, %ymm10
vpxor %ymm6, %ymm10, %ymm10
vpslld $9, %ymm7, %ymm5
vpsrld $23, %ymm7, %ymm7
vpxor %ymm5, %ymm14, %ymm14
vpxor %ymm7, %ymm14, %ymm14
vpaddd %ymm2, %ymm1, %ymm4
vpaddd %ymm10, %ymm9, %ymm6
vpaddd %ymm14, %ymm13, %ymm7
vpslld $13, %ymm4, %ymm5
vpsrld $19, %ymm4, %ymm4
vpshufd $0x93, %ymm1, %ymm1
vpshufd $0x93, %ymm9, %ymm9
vpshufd $0x93, %ymm13, %ymm13
vpxor %ymm5, %ymm3, %ymm3
vpxor %ymm4, %ymm3, %ymm3
vpslld $13, %ymm6, %ymm5
vpsrld $19, %ymm6, %ymm6
vpxor %ymm5, %ymm11, %ymm11
vpxor %ymm6, %ymm11, %ymm11
vpslld $13, %ymm7, %ymm5
vpsrld $19, %ymm7, %ymm7
vpxor %ymm5, %ymm15, %ymm15
vpxor %ymm7, %ymm15, %ymm15
vpaddd %ymm3, %ymm2, %ymm4
vpaddd %ymm11, %ymm10, %ymm6
vpaddd %ymm15, %ymm14, %ymm7
vpslld $18, %ymm4, %ymm5
vpsrld $14, %ymm4, %ymm4
vpshufd $0x4e, %ymm2, %ymm2
vpshufd $0x4e, %ymm10, %ymm10
vpxor %ymm5, %ymm0, %ymm0
vpxor %ymm4, %ymm0, %ymm0
vpslld $18, %ymm6, %ymm5
vpsrld $14, %ymm6, %ymm6
vpshufd $0x4e, %ymm14, %ymm14
vpshufd $0x39, %ymm11, %ymm11
vpxor %ymm5, %ymm8, %ymm8
vpxor %ymm6, %ymm8, %ymm8
vpslld $18, %ymm7, %ymm5
vpsrld $14, %ymm7, %ymm7
vpshufd $0x39, %ymm3, %ymm3
vpshufd $0x39, %ymm15, %ymm15
vpxor %ymm5, %ymm12, %ymm12
vpxor %ymm7, %ymm12, %ymm12
.endm
.macro salsa8_core_6way_avx2
salsa8_core_6way_avx2_doubleround
salsa8_core_6way_avx2_doubleround
salsa8_core_6way_avx2_doubleround
salsa8_core_6way_avx2_doubleround
.endm
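The macro above performs four of these doublerounds, i.e. Salsa20/8, on six independent 16-word states at once; the states appear to be kept two per group of four YMM registers (one in each 128-bit lane), in the shuffled layout produced by the pack macros further down. For reference, a scalar C version of one Salsa20 doubleround that the shuffled, vectorized form computes (not part of this commit):

#include <stdint.h>

#define R(a, b) (((a) << (b)) | ((a) >> (32 - (b))))

static void salsa20_doubleround(uint32_t x[16])
{
    /* column round */
    x[ 4] ^= R(x[ 0] + x[12],  7);  x[ 8] ^= R(x[ 4] + x[ 0],  9);
    x[12] ^= R(x[ 8] + x[ 4], 13);  x[ 0] ^= R(x[12] + x[ 8], 18);
    x[ 9] ^= R(x[ 5] + x[ 1],  7);  x[13] ^= R(x[ 9] + x[ 5],  9);
    x[ 1] ^= R(x[13] + x[ 9], 13);  x[ 5] ^= R(x[ 1] + x[13], 18);
    x[14] ^= R(x[10] + x[ 6],  7);  x[ 2] ^= R(x[14] + x[10],  9);
    x[ 6] ^= R(x[ 2] + x[14], 13);  x[10] ^= R(x[ 6] + x[ 2], 18);
    x[ 3] ^= R(x[15] + x[11],  7);  x[ 7] ^= R(x[ 3] + x[15],  9);
    x[11] ^= R(x[ 7] + x[ 3], 13);  x[15] ^= R(x[11] + x[ 7], 18);
    /* row round */
    x[ 1] ^= R(x[ 0] + x[ 3],  7);  x[ 2] ^= R(x[ 1] + x[ 0],  9);
    x[ 3] ^= R(x[ 2] + x[ 1], 13);  x[ 0] ^= R(x[ 3] + x[ 2], 18);
    x[ 6] ^= R(x[ 5] + x[ 4],  7);  x[ 7] ^= R(x[ 6] + x[ 5],  9);
    x[ 4] ^= R(x[ 7] + x[ 6], 13);  x[ 5] ^= R(x[ 4] + x[ 7], 18);
    x[11] ^= R(x[10] + x[ 9],  7);  x[ 8] ^= R(x[11] + x[10],  9);
    x[ 9] ^= R(x[ 8] + x[11], 13);  x[10] ^= R(x[ 9] + x[ 8], 18);
    x[12] ^= R(x[15] + x[14],  7);  x[13] ^= R(x[12] + x[15],  9);
    x[14] ^= R(x[13] + x[12], 13);  x[15] ^= R(x[14] + x[13], 18);
}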
.text
.p2align 6
.globl scrypt_core_6way
.globl _scrypt_core_6way
scrypt_core_6way:
_scrypt_core_6way:
pushq %rbx
pushq %rbp
#if defined(WIN64)
subq $176, %rsp
vmovdqa %xmm6, 8(%rsp)
vmovdqa %xmm7, 24(%rsp)
vmovdqa %xmm8, 40(%rsp)
vmovdqa %xmm9, 56(%rsp)
vmovdqa %xmm10, 72(%rsp)
vmovdqa %xmm11, 88(%rsp)
vmovdqa %xmm12, 104(%rsp)
vmovdqa %xmm13, 120(%rsp)
vmovdqa %xmm14, 136(%rsp)
vmovdqa %xmm15, 152(%rsp)
pushq %rdi
pushq %rsi
movq %rcx, %rdi
movq %rdx, %rsi
#endif
movq %rsp, %rdx
subq $768, %rsp
andq $-128, %rsp
.macro scrypt_core_6way_cleanup
movq %rdx, %rsp
#if defined(WIN64)
popq %rsi
popq %rdi
vmovdqa 8(%rsp), %xmm6
vmovdqa 24(%rsp), %xmm7
vmovdqa 40(%rsp), %xmm8
vmovdqa 56(%rsp), %xmm9
vmovdqa 72(%rsp), %xmm10
vmovdqa 88(%rsp), %xmm11
vmovdqa 104(%rsp), %xmm12
vmovdqa 120(%rsp), %xmm13
vmovdqa 136(%rsp), %xmm14
vmovdqa 152(%rsp), %xmm15
addq $176, %rsp
#endif
popq %rbp
popq %rbx
.endm
.macro scrypt_shuffle_pack2 src, so, dest, do
vmovdqa \so+0*16(\src), %xmm0
vmovdqa \so+1*16(\src), %xmm1
vmovdqa \so+2*16(\src), %xmm2
vmovdqa \so+3*16(\src), %xmm3
vinserti128 $1, \so+128+0*16(\src), %ymm0, %ymm0
vinserti128 $1, \so+128+1*16(\src), %ymm1, %ymm1
vinserti128 $1, \so+128+2*16(\src), %ymm2, %ymm2
vinserti128 $1, \so+128+3*16(\src), %ymm3, %ymm3
vpblendd $0x33, %ymm0, %ymm2, %ymm4
vpblendd $0xcc, %ymm1, %ymm3, %ymm5
vpblendd $0x33, %ymm2, %ymm0, %ymm6
vpblendd $0xcc, %ymm3, %ymm1, %ymm7
vpblendd $0x55, %ymm7, %ymm6, %ymm3
vpblendd $0x55, %ymm6, %ymm5, %ymm2
vpblendd $0x55, %ymm5, %ymm4, %ymm1
vpblendd $0x55, %ymm4, %ymm7, %ymm0
vmovdqa %ymm0, \do+0*32(\dest)
vmovdqa %ymm1, \do+1*32(\dest)
vmovdqa %ymm2, \do+2*32(\dest)
vmovdqa %ymm3, \do+3*32(\dest)
.endm
.macro scrypt_shuffle_unpack2 src, so, dest, do
vmovdqa \so+0*32(\src), %ymm0
vmovdqa \so+1*32(\src), %ymm1
vmovdqa \so+2*32(\src), %ymm2
vmovdqa \so+3*32(\src), %ymm3
vpblendd $0x33, %ymm0, %ymm2, %ymm4
vpblendd $0xcc, %ymm1, %ymm3, %ymm5
vpblendd $0x33, %ymm2, %ymm0, %ymm6
vpblendd $0xcc, %ymm3, %ymm1, %ymm7
vpblendd $0x55, %ymm7, %ymm6, %ymm3
vpblendd $0x55, %ymm6, %ymm5, %ymm2
vpblendd $0x55, %ymm5, %ymm4, %ymm1
vpblendd $0x55, %ymm4, %ymm7, %ymm0
vmovdqa %xmm0, \do+0*16(\dest)
vmovdqa %xmm1, \do+1*16(\dest)
vmovdqa %xmm2, \do+2*16(\dest)
vmovdqa %xmm3, \do+3*16(\dest)
vextracti128 $1, %ymm0, \do+128+0*16(\dest)
vextracti128 $1, %ymm1, \do+128+1*16(\dest)
vextracti128 $1, %ymm2, \do+128+2*16(\dest)
vextracti128 $1, %ymm3, \do+128+3*16(\dest)
.endm
scrypt_core_6way_avx2:
scrypt_shuffle_pack2 %rdi, 0*256+0, %rsp, 0*128
scrypt_shuffle_pack2 %rdi, 0*256+64, %rsp, 1*128
scrypt_shuffle_pack2 %rdi, 1*256+0, %rsp, 2*128
scrypt_shuffle_pack2 %rdi, 1*256+64, %rsp, 3*128
scrypt_shuffle_pack2 %rdi, 2*256+0, %rsp, 4*128
scrypt_shuffle_pack2 %rdi, 2*256+64, %rsp, 5*128
vmovdqa 0*256+4*32(%rsp), %ymm0
vmovdqa 0*256+5*32(%rsp), %ymm1
vmovdqa 0*256+6*32(%rsp), %ymm2
vmovdqa 0*256+7*32(%rsp), %ymm3
vmovdqa 1*256+4*32(%rsp), %ymm8
vmovdqa 1*256+5*32(%rsp), %ymm9
vmovdqa 1*256+6*32(%rsp), %ymm10
vmovdqa 1*256+7*32(%rsp), %ymm11
vmovdqa 2*256+4*32(%rsp), %ymm12
vmovdqa 2*256+5*32(%rsp), %ymm13
vmovdqa 2*256+6*32(%rsp), %ymm14
vmovdqa 2*256+7*32(%rsp), %ymm15
movq %rsi, %rbx
leaq 6*131072(%rsi), %rax
scrypt_core_6way_avx2_loop1:
vmovdqa %ymm0, 0*256+4*32(%rbx)
vmovdqa %ymm1, 0*256+5*32(%rbx)
vmovdqa %ymm2, 0*256+6*32(%rbx)
vmovdqa %ymm3, 0*256+7*32(%rbx)
vpxor 0*256+0*32(%rsp), %ymm0, %ymm0
vpxor 0*256+1*32(%rsp), %ymm1, %ymm1
vpxor 0*256+2*32(%rsp), %ymm2, %ymm2
vpxor 0*256+3*32(%rsp), %ymm3, %ymm3
vmovdqa %ymm8, 1*256+4*32(%rbx)
vmovdqa %ymm9, 1*256+5*32(%rbx)
vmovdqa %ymm10, 1*256+6*32(%rbx)
vmovdqa %ymm11, 1*256+7*32(%rbx)
vpxor 1*256+0*32(%rsp), %ymm8, %ymm8
vpxor 1*256+1*32(%rsp), %ymm9, %ymm9
vpxor 1*256+2*32(%rsp), %ymm10, %ymm10
vpxor 1*256+3*32(%rsp), %ymm11, %ymm11
vmovdqa %ymm12, 2*256+4*32(%rbx)
vmovdqa %ymm13, 2*256+5*32(%rbx)
vmovdqa %ymm14, 2*256+6*32(%rbx)
vmovdqa %ymm15, 2*256+7*32(%rbx)
vpxor 2*256+0*32(%rsp), %ymm12, %ymm12
vpxor 2*256+1*32(%rsp), %ymm13, %ymm13
vpxor 2*256+2*32(%rsp), %ymm14, %ymm14
vpxor 2*256+3*32(%rsp), %ymm15, %ymm15
vmovdqa %ymm0, 0*256+0*32(%rbx)
vmovdqa %ymm1, 0*256+1*32(%rbx)
vmovdqa %ymm2, 0*256+2*32(%rbx)
vmovdqa %ymm3, 0*256+3*32(%rbx)
vmovdqa %ymm8, 1*256+0*32(%rbx)
vmovdqa %ymm9, 1*256+1*32(%rbx)
vmovdqa %ymm10, 1*256+2*32(%rbx)
vmovdqa %ymm11, 1*256+3*32(%rbx)
vmovdqa %ymm12, 2*256+0*32(%rbx)
vmovdqa %ymm13, 2*256+1*32(%rbx)
vmovdqa %ymm14, 2*256+2*32(%rbx)
vmovdqa %ymm15, 2*256+3*32(%rbx)
salsa8_core_6way_avx2
vpaddd 0*256+0*32(%rbx), %ymm0, %ymm0
vpaddd 0*256+1*32(%rbx), %ymm1, %ymm1
vpaddd 0*256+2*32(%rbx), %ymm2, %ymm2
vpaddd 0*256+3*32(%rbx), %ymm3, %ymm3
vpaddd 1*256+0*32(%rbx), %ymm8, %ymm8
vpaddd 1*256+1*32(%rbx), %ymm9, %ymm9
vpaddd 1*256+2*32(%rbx), %ymm10, %ymm10
vpaddd 1*256+3*32(%rbx), %ymm11, %ymm11
vpaddd 2*256+0*32(%rbx), %ymm12, %ymm12
vpaddd 2*256+1*32(%rbx), %ymm13, %ymm13
vpaddd 2*256+2*32(%rbx), %ymm14, %ymm14
vpaddd 2*256+3*32(%rbx), %ymm15, %ymm15
vmovdqa %ymm0, 0*256+0*32(%rsp)
vmovdqa %ymm1, 0*256+1*32(%rsp)
vmovdqa %ymm2, 0*256+2*32(%rsp)
vmovdqa %ymm3, 0*256+3*32(%rsp)
vmovdqa %ymm8, 1*256+0*32(%rsp)
vmovdqa %ymm9, 1*256+1*32(%rsp)
vmovdqa %ymm10, 1*256+2*32(%rsp)
vmovdqa %ymm11, 1*256+3*32(%rsp)
vmovdqa %ymm12, 2*256+0*32(%rsp)
vmovdqa %ymm13, 2*256+1*32(%rsp)
vmovdqa %ymm14, 2*256+2*32(%rsp)
vmovdqa %ymm15, 2*256+3*32(%rsp)
vpxor 0*256+4*32(%rbx), %ymm0, %ymm0
vpxor 0*256+5*32(%rbx), %ymm1, %ymm1
vpxor 0*256+6*32(%rbx), %ymm2, %ymm2
vpxor 0*256+7*32(%rbx), %ymm3, %ymm3
vpxor 1*256+4*32(%rbx), %ymm8, %ymm8
vpxor 1*256+5*32(%rbx), %ymm9, %ymm9
vpxor 1*256+6*32(%rbx), %ymm10, %ymm10
vpxor 1*256+7*32(%rbx), %ymm11, %ymm11
vpxor 2*256+4*32(%rbx), %ymm12, %ymm12
vpxor 2*256+5*32(%rbx), %ymm13, %ymm13
vpxor 2*256+6*32(%rbx), %ymm14, %ymm14
vpxor 2*256+7*32(%rbx), %ymm15, %ymm15
vmovdqa %ymm0, 0*256+4*32(%rsp)
vmovdqa %ymm1, 0*256+5*32(%rsp)
vmovdqa %ymm2, 0*256+6*32(%rsp)
vmovdqa %ymm3, 0*256+7*32(%rsp)
vmovdqa %ymm8, 1*256+4*32(%rsp)
vmovdqa %ymm9, 1*256+5*32(%rsp)
vmovdqa %ymm10, 1*256+6*32(%rsp)
vmovdqa %ymm11, 1*256+7*32(%rsp)
vmovdqa %ymm12, 2*256+4*32(%rsp)
vmovdqa %ymm13, 2*256+5*32(%rsp)
vmovdqa %ymm14, 2*256+6*32(%rsp)
vmovdqa %ymm15, 2*256+7*32(%rsp)
salsa8_core_6way_avx2
vpaddd 0*256+4*32(%rsp), %ymm0, %ymm0
vpaddd 0*256+5*32(%rsp), %ymm1, %ymm1
vpaddd 0*256+6*32(%rsp), %ymm2, %ymm2
vpaddd 0*256+7*32(%rsp), %ymm3, %ymm3
vpaddd 1*256+4*32(%rsp), %ymm8, %ymm8
vpaddd 1*256+5*32(%rsp), %ymm9, %ymm9
vpaddd 1*256+6*32(%rsp), %ymm10, %ymm10
vpaddd 1*256+7*32(%rsp), %ymm11, %ymm11
vpaddd 2*256+4*32(%rsp), %ymm12, %ymm12
vpaddd 2*256+5*32(%rsp), %ymm13, %ymm13
vpaddd 2*256+6*32(%rsp), %ymm14, %ymm14
vpaddd 2*256+7*32(%rsp), %ymm15, %ymm15
addq $6*128, %rbx
cmpq %rax, %rbx
jne scrypt_core_6way_avx2_loop1
vmovdqa %ymm0, 0*256+4*32(%rsp)
vmovdqa %ymm1, 0*256+5*32(%rsp)
vmovdqa %ymm2, 0*256+6*32(%rsp)
vmovdqa %ymm3, 0*256+7*32(%rsp)
vmovdqa %ymm8, 1*256+4*32(%rsp)
vmovdqa %ymm9, 1*256+5*32(%rsp)
vmovdqa %ymm10, 1*256+6*32(%rsp)
vmovdqa %ymm11, 1*256+7*32(%rsp)
vmovdqa %ymm12, 2*256+4*32(%rsp)
vmovdqa %ymm13, 2*256+5*32(%rsp)
vmovdqa %ymm14, 2*256+6*32(%rsp)
vmovdqa %ymm15, 2*256+7*32(%rsp)
movq $1024, %rcx
scrypt_core_6way_avx2_loop2:
vmovd %xmm0, %ebp
vmovd %xmm8, %ebx
vmovd %xmm12, %eax
vextracti128 $1, %ymm0, %xmm4
vextracti128 $1, %ymm8, %xmm5
vextracti128 $1, %ymm12, %xmm6
vmovd %xmm4, %r8d
vmovd %xmm5, %r9d
vmovd %xmm6, %r10d
vpxor 0*256+0*32(%rsp), %ymm0, %ymm0
vpxor 0*256+1*32(%rsp), %ymm1, %ymm1
vpxor 0*256+2*32(%rsp), %ymm2, %ymm2
vpxor 0*256+3*32(%rsp), %ymm3, %ymm3
vpxor 1*256+0*32(%rsp), %ymm8, %ymm8
vpxor 1*256+1*32(%rsp), %ymm9, %ymm9
vpxor 1*256+2*32(%rsp), %ymm10, %ymm10
vpxor 1*256+3*32(%rsp), %ymm11, %ymm11
vpxor 2*256+0*32(%rsp), %ymm12, %ymm12
vpxor 2*256+1*32(%rsp), %ymm13, %ymm13
vpxor 2*256+2*32(%rsp), %ymm14, %ymm14
vpxor 2*256+3*32(%rsp), %ymm15, %ymm15
andl $1023, %ebp
leaq 0(%rbp, %rbp, 2), %rbp
shll $8, %ebp
andl $1023, %ebx
leaq 1(%rbx, %rbx, 2), %rbx
shll $8, %ebx
andl $1023, %eax
leaq 2(%rax, %rax, 2), %rax
shll $8, %eax
andl $1023, %r8d
leaq 0(%r8, %r8, 2), %r8
shll $8, %r8d
andl $1023, %r9d
leaq 1(%r9, %r9, 2), %r9
shll $8, %r9d
andl $1023, %r10d
leaq 2(%r10, %r10, 2), %r10
shll $8, %r10d
vmovdqa 0*32(%rsi, %rbp), %xmm4
vinserti128 $1, 0*32+16(%rsi, %r8), %ymm4, %ymm4
vmovdqa 1*32(%rsi, %rbp), %xmm5
vinserti128 $1, 1*32+16(%rsi, %r8), %ymm5, %ymm5
vmovdqa 2*32(%rsi, %rbp), %xmm6
vinserti128 $1, 2*32+16(%rsi, %r8), %ymm6, %ymm6
vmovdqa 3*32(%rsi, %rbp), %xmm7
vinserti128 $1, 3*32+16(%rsi, %r8), %ymm7, %ymm7
vpxor %ymm4, %ymm0, %ymm0
vpxor %ymm5, %ymm1, %ymm1
vpxor %ymm6, %ymm2, %ymm2
vpxor %ymm7, %ymm3, %ymm3
vmovdqa 0*32(%rsi, %rbx), %xmm4
vinserti128 $1, 0*32+16(%rsi, %r9), %ymm4, %ymm4
vmovdqa 1*32(%rsi, %rbx), %xmm5
vinserti128 $1, 1*32+16(%rsi, %r9), %ymm5, %ymm5
vmovdqa 2*32(%rsi, %rbx), %xmm6
vinserti128 $1, 2*32+16(%rsi, %r9), %ymm6, %ymm6
vmovdqa 3*32(%rsi, %rbx), %xmm7
vinserti128 $1, 3*32+16(%rsi, %r9), %ymm7, %ymm7
vpxor %ymm4, %ymm8, %ymm8
vpxor %ymm5, %ymm9, %ymm9
vpxor %ymm6, %ymm10, %ymm10
vpxor %ymm7, %ymm11, %ymm11
vmovdqa 0*32(%rsi, %rax), %xmm4
vinserti128 $1, 0*32+16(%rsi, %r10), %ymm4, %ymm4
vmovdqa 1*32(%rsi, %rax), %xmm5
vinserti128 $1, 1*32+16(%rsi, %r10), %ymm5, %ymm5
vmovdqa 2*32(%rsi, %rax), %xmm6
vinserti128 $1, 2*32+16(%rsi, %r10), %ymm6, %ymm6
vmovdqa 3*32(%rsi, %rax), %xmm7
vinserti128 $1, 3*32+16(%rsi, %r10), %ymm7, %ymm7
vpxor %ymm4, %ymm12, %ymm12
vpxor %ymm5, %ymm13, %ymm13
vpxor %ymm6, %ymm14, %ymm14
vpxor %ymm7, %ymm15, %ymm15
vmovdqa %ymm0, 0*256+0*32(%rsp)
vmovdqa %ymm1, 0*256+1*32(%rsp)
vmovdqa %ymm2, 0*256+2*32(%rsp)
vmovdqa %ymm3, 0*256+3*32(%rsp)
vmovdqa %ymm8, 1*256+0*32(%rsp)
vmovdqa %ymm9, 1*256+1*32(%rsp)
vmovdqa %ymm10, 1*256+2*32(%rsp)
vmovdqa %ymm11, 1*256+3*32(%rsp)
vmovdqa %ymm12, 2*256+0*32(%rsp)
vmovdqa %ymm13, 2*256+1*32(%rsp)
vmovdqa %ymm14, 2*256+2*32(%rsp)
vmovdqa %ymm15, 2*256+3*32(%rsp)
salsa8_core_6way_avx2
vpaddd 0*256+0*32(%rsp), %ymm0, %ymm0
vpaddd 0*256+1*32(%rsp), %ymm1, %ymm1
vpaddd 0*256+2*32(%rsp), %ymm2, %ymm2
vpaddd 0*256+3*32(%rsp), %ymm3, %ymm3
vpaddd 1*256+0*32(%rsp), %ymm8, %ymm8
vpaddd 1*256+1*32(%rsp), %ymm9, %ymm9
vpaddd 1*256+2*32(%rsp), %ymm10, %ymm10
vpaddd 1*256+3*32(%rsp), %ymm11, %ymm11
vpaddd 2*256+0*32(%rsp), %ymm12, %ymm12
vpaddd 2*256+1*32(%rsp), %ymm13, %ymm13
vpaddd 2*256+2*32(%rsp), %ymm14, %ymm14
vpaddd 2*256+3*32(%rsp), %ymm15, %ymm15
vmovdqa %ymm0, 0*256+0*32(%rsp)
vmovdqa %ymm1, 0*256+1*32(%rsp)
vmovdqa %ymm2, 0*256+2*32(%rsp)
vmovdqa %ymm3, 0*256+3*32(%rsp)
vmovdqa %ymm8, 1*256+0*32(%rsp)
vmovdqa %ymm9, 1*256+1*32(%rsp)
vmovdqa %ymm10, 1*256+2*32(%rsp)
vmovdqa %ymm11, 1*256+3*32(%rsp)
vmovdqa %ymm12, 2*256+0*32(%rsp)
vmovdqa %ymm13, 2*256+1*32(%rsp)
vmovdqa %ymm14, 2*256+2*32(%rsp)
vmovdqa %ymm15, 2*256+3*32(%rsp)
vmovdqa 4*32(%rsi, %rbp), %xmm4
vinserti128 $1, 4*32+16(%rsi, %r8), %ymm4, %ymm4
vmovdqa 5*32(%rsi, %rbp), %xmm5
vinserti128 $1, 5*32+16(%rsi, %r8), %ymm5, %ymm5
vmovdqa 6*32(%rsi, %rbp), %xmm6
vinserti128 $1, 6*32+16(%rsi, %r8), %ymm6, %ymm6
vmovdqa 7*32(%rsi, %rbp), %xmm7
vinserti128 $1, 7*32+16(%rsi, %r8), %ymm7, %ymm7
vpxor %ymm4, %ymm0, %ymm0
vpxor %ymm5, %ymm1, %ymm1
vpxor %ymm6, %ymm2, %ymm2
vpxor %ymm7, %ymm3, %ymm3
vmovdqa 4*32(%rsi, %rbx), %xmm4
vinserti128 $1, 4*32+16(%rsi, %r9), %ymm4, %ymm4
vmovdqa 5*32(%rsi, %rbx), %xmm5
vinserti128 $1, 5*32+16(%rsi, %r9), %ymm5, %ymm5
vmovdqa 6*32(%rsi, %rbx), %xmm6
vinserti128 $1, 6*32+16(%rsi, %r9), %ymm6, %ymm6
vmovdqa 7*32(%rsi, %rbx), %xmm7
vinserti128 $1, 7*32+16(%rsi, %r9), %ymm7, %ymm7
vpxor %ymm4, %ymm8, %ymm8
vpxor %ymm5, %ymm9, %ymm9
vpxor %ymm6, %ymm10, %ymm10
vpxor %ymm7, %ymm11, %ymm11
vmovdqa 4*32(%rsi, %rax), %xmm4
vinserti128 $1, 4*32+16(%rsi, %r10), %ymm4, %ymm4
vmovdqa 5*32(%rsi, %rax), %xmm5
vinserti128 $1, 5*32+16(%rsi, %r10), %ymm5, %ymm5
vmovdqa 6*32(%rsi, %rax), %xmm6
vinserti128 $1, 6*32+16(%rsi, %r10), %ymm6, %ymm6
vmovdqa 7*32(%rsi, %rax), %xmm7
vinserti128 $1, 7*32+16(%rsi, %r10), %ymm7, %ymm7
vpxor %ymm4, %ymm12, %ymm12
vpxor %ymm5, %ymm13, %ymm13
vpxor %ymm6, %ymm14, %ymm14
vpxor %ymm7, %ymm15, %ymm15
vpxor 0*256+4*32(%rsp), %ymm0, %ymm0
vpxor 0*256+5*32(%rsp), %ymm1, %ymm1
vpxor 0*256+6*32(%rsp), %ymm2, %ymm2
vpxor 0*256+7*32(%rsp), %ymm3, %ymm3
vpxor 1*256+4*32(%rsp), %ymm8, %ymm8
vpxor 1*256+5*32(%rsp), %ymm9, %ymm9
vpxor 1*256+6*32(%rsp), %ymm10, %ymm10
vpxor 1*256+7*32(%rsp), %ymm11, %ymm11
vpxor 2*256+4*32(%rsp), %ymm12, %ymm12
vpxor 2*256+5*32(%rsp), %ymm13, %ymm13
vpxor 2*256+6*32(%rsp), %ymm14, %ymm14
vpxor 2*256+7*32(%rsp), %ymm15, %ymm15
vmovdqa %ymm0, 0*256+4*32(%rsp)
vmovdqa %ymm1, 0*256+5*32(%rsp)
vmovdqa %ymm2, 0*256+6*32(%rsp)
vmovdqa %ymm3, 0*256+7*32(%rsp)
vmovdqa %ymm8, 1*256+4*32(%rsp)
vmovdqa %ymm9, 1*256+5*32(%rsp)
vmovdqa %ymm10, 1*256+6*32(%rsp)
vmovdqa %ymm11, 1*256+7*32(%rsp)
vmovdqa %ymm12, 2*256+4*32(%rsp)
vmovdqa %ymm13, 2*256+5*32(%rsp)
vmovdqa %ymm14, 2*256+6*32(%rsp)
vmovdqa %ymm15, 2*256+7*32(%rsp)
salsa8_core_6way_avx2
vpaddd 0*256+4*32(%rsp), %ymm0, %ymm0
vpaddd 0*256+5*32(%rsp), %ymm1, %ymm1
vpaddd 0*256+6*32(%rsp), %ymm2, %ymm2
vpaddd 0*256+7*32(%rsp), %ymm3, %ymm3
vpaddd 1*256+4*32(%rsp), %ymm8, %ymm8
vpaddd 1*256+5*32(%rsp), %ymm9, %ymm9
vpaddd 1*256+6*32(%rsp), %ymm10, %ymm10
vpaddd 1*256+7*32(%rsp), %ymm11, %ymm11
vpaddd 2*256+4*32(%rsp), %ymm12, %ymm12
vpaddd 2*256+5*32(%rsp), %ymm13, %ymm13
vpaddd 2*256+6*32(%rsp), %ymm14, %ymm14
vpaddd 2*256+7*32(%rsp), %ymm15, %ymm15
vmovdqa %ymm0, 0*256+4*32(%rsp)
vmovdqa %ymm1, 0*256+5*32(%rsp)
vmovdqa %ymm2, 0*256+6*32(%rsp)
vmovdqa %ymm3, 0*256+7*32(%rsp)
vmovdqa %ymm8, 1*256+4*32(%rsp)
vmovdqa %ymm9, 1*256+5*32(%rsp)
vmovdqa %ymm10, 1*256+6*32(%rsp)
vmovdqa %ymm11, 1*256+7*32(%rsp)
vmovdqa %ymm12, 2*256+4*32(%rsp)
vmovdqa %ymm13, 2*256+5*32(%rsp)
vmovdqa %ymm14, 2*256+6*32(%rsp)
vmovdqa %ymm15, 2*256+7*32(%rsp)
subq $1, %rcx
ja scrypt_core_6way_avx2_loop2
scrypt_shuffle_unpack2 %rsp, 0*128, %rdi, 0*256+0
scrypt_shuffle_unpack2 %rsp, 1*128, %rdi, 0*256+64
scrypt_shuffle_unpack2 %rsp, 2*128, %rdi, 1*256+0
scrypt_shuffle_unpack2 %rsp, 3*128, %rdi, 1*256+64
scrypt_shuffle_unpack2 %rsp, 4*128, %rdi, 2*256+0
scrypt_shuffle_unpack2 %rsp, 5*128, %rdi, 2*256+64
scrypt_core_6way_cleanup
ret
#endif /* USE_AVX2 */
#endif

scrypt.c (293 lines changed)

@@ -1,5 +1,5 @@
/*
- * Copyright 2009 Colin Percival, 2011 ArtForz, 2011-2012 pooler
+ * Copyright 2009 Colin Percival, 2011 ArtForz, 2011-2013 pooler
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
@@ -256,6 +256,128 @@ static inline void PBKDF2_SHA256_128_32_4way(uint32_t *tstate,
#endif /* HAVE_SHA256_4WAY */
#ifdef HAVE_SHA256_8WAY
static const uint32_t finalblk_8way[8 * 16] __attribute__((aligned(32))) = {
0x00000001, 0x00000001, 0x00000001, 0x00000001, 0x00000001, 0x00000001, 0x00000001, 0x00000001,
0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000,
0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000,
0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000,
0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000,
0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000,
0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000,
0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000,
0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000,
0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000,
0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000,
0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000,
0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000,
0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000,
0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000,
0x00000620, 0x00000620, 0x00000620, 0x00000620, 0x00000620, 0x00000620, 0x00000620, 0x00000620
};
static inline void HMAC_SHA256_80_init_8way(const uint32_t *key,
uint32_t *tstate, uint32_t *ostate)
{
uint32_t ihash[8 * 8] __attribute__((aligned(32)));
uint32_t pad[8 * 16] __attribute__((aligned(32)));
int i;
/* tstate is assumed to contain the midstate of key */
memcpy(pad, key + 8 * 16, 8 * 16);
for (i = 0; i < 8; i++)
pad[8 * 4 + i] = 0x80000000;
memset(pad + 8 * 5, 0x00, 8 * 40);
for (i = 0; i < 8; i++)
pad[8 * 15 + i] = 0x00000280;
sha256_transform_8way(tstate, pad, 0);
memcpy(ihash, tstate, 8 * 32);
sha256_init_8way(ostate);
for (i = 0; i < 8 * 8; i++)
pad[i] = ihash[i] ^ 0x5c5c5c5c;
for (; i < 8 * 16; i++)
pad[i] = 0x5c5c5c5c;
sha256_transform_8way(ostate, pad, 0);
sha256_init_8way(tstate);
for (i = 0; i < 8 * 8; i++)
pad[i] = ihash[i] ^ 0x36363636;
for (; i < 8 * 16; i++)
pad[i] = 0x36363636;
sha256_transform_8way(tstate, pad, 0);
}
static inline void PBKDF2_SHA256_80_128_8way(const uint32_t *tstate,
const uint32_t *ostate, const uint32_t *salt, uint32_t *output)
{
uint32_t istate[8 * 8] __attribute__((aligned(32)));
uint32_t ostate2[8 * 8] __attribute__((aligned(32)));
uint32_t ibuf[8 * 16] __attribute__((aligned(32)));
uint32_t obuf[8 * 16] __attribute__((aligned(32)));
int i, j;
memcpy(istate, tstate, 8 * 32);
sha256_transform_8way(istate, salt, 0);
memcpy(ibuf, salt + 8 * 16, 8 * 16);
for (i = 0; i < 8; i++)
ibuf[8 * 5 + i] = 0x80000000;
memset(ibuf + 8 * 6, 0x00, 8 * 36);
for (i = 0; i < 8; i++)
ibuf[8 * 15 + i] = 0x000004a0;
for (i = 0; i < 8; i++)
obuf[8 * 8 + i] = 0x80000000;
memset(obuf + 8 * 9, 0x00, 8 * 24);
for (i = 0; i < 8; i++)
obuf[8 * 15 + i] = 0x00000300;
for (i = 0; i < 4; i++) {
memcpy(obuf, istate, 8 * 32);
ibuf[8 * 4 + 0] = i + 1;
ibuf[8 * 4 + 1] = i + 1;
ibuf[8 * 4 + 2] = i + 1;
ibuf[8 * 4 + 3] = i + 1;
ibuf[8 * 4 + 4] = i + 1;
ibuf[8 * 4 + 5] = i + 1;
ibuf[8 * 4 + 6] = i + 1;
ibuf[8 * 4 + 7] = i + 1;
sha256_transform_8way(obuf, ibuf, 0);
memcpy(ostate2, ostate, 8 * 32);
sha256_transform_8way(ostate2, obuf, 0);
for (j = 0; j < 8 * 8; j++)
output[8 * 8 * i + j] = swab32(ostate2[j]);
}
}
static inline void PBKDF2_SHA256_128_32_8way(uint32_t *tstate,
uint32_t *ostate, const uint32_t *salt, uint32_t *output)
{
uint32_t buf[8 * 16] __attribute__((aligned(32)));
int i;
sha256_transform_8way(tstate, salt, 1);
sha256_transform_8way(tstate, salt + 8 * 16, 1);
sha256_transform_8way(tstate, finalblk_8way, 0);
memcpy(buf, tstate, 8 * 32);
for (i = 0; i < 8; i++)
buf[8 * 8 + i] = 0x80000000;
memset(buf + 8 * 9, 0x00, 8 * 24);
for (i = 0; i < 8; i++)
buf[8 * 15 + i] = 0x00000300;
sha256_transform_8way(ostate, buf, 0);
for (i = 0; i < 8 * 8; i++)
output[i] = swab32(ostate[i]);
}
#endif /* HAVE_SHA256_8WAY */
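The magic words written into the pad/ibuf/obuf blocks above are ordinary SHA-256 padding: a 0x80000000 word starts the padding and the last word of each block carries the total message length in bits. A small sketch of where the four length values come from (the names are illustrative only):

/* SHA-256 length fields used by the 8-way HMAC/PBKDF2 helpers above,
 * assuming standard Merkle-Damgard padding (length in bits in the last word) */
enum {
    HMAC_80_INIT_BITS  = (64 + 16) * 8,      /* 0x00000280: ipad block + last 16 key bytes */
    PBKDF2_80_128_BITS = (64 + 80 + 4) * 8,  /* 0x000004a0: ipad block + 80-byte data + block index */
    HMAC_OUTER_BITS    = (64 + 32) * 8,      /* 0x00000300: opad block + 32-byte inner hash */
    PBKDF2_128_32_BITS = (64 + 128 + 4) * 8  /* 0x00000620: ipad block + 128-byte salt + block index */
};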
#if defined(__x86_64__)
#define SCRYPT_MAX_WAYS 12

@@ -263,6 +385,12 @@ static inline void PBKDF2_SHA256_128_32_4way(uint32_t *tstate,
int scrypt_best_throughput();
void scrypt_core(uint32_t *X, uint32_t *V);
void scrypt_core_3way(uint32_t *X, uint32_t *V);
#if defined(USE_AVX2)
#undef SCRYPT_MAX_WAYS
#define SCRYPT_MAX_WAYS 24
#define HAVE_SCRYPT_6WAY 1
void scrypt_core_6way(uint32_t *X, uint32_t *V);
#endif
#elif defined(__i386__)
@@ -410,47 +538,32 @@ static void scrypt_1024_1_1_256_4way(const uint32_t *input,
uint32_t W[4 * 32] __attribute__((aligned(128)));
uint32_t X[4 * 32] __attribute__((aligned(128)));
uint32_t *V;
-int i;
+int i, k;

V = (uint32_t *)(((uintptr_t)(scratchpad) + 63) & ~ (uintptr_t)(63));

-for (i = 0; i < 20; i++) {
-    W[4 * i + 0] = input[0 * 20 + i];
-    W[4 * i + 1] = input[1 * 20 + i];
-    W[4 * i + 2] = input[2 * 20 + i];
-    W[4 * i + 3] = input[3 * 20 + i];
-}
-for (i = 0; i < 8; i++) {
-    tstate[4 * i + 0] = midstate[i];
-    tstate[4 * i + 1] = midstate[i];
-    tstate[4 * i + 2] = midstate[i];
-    tstate[4 * i + 3] = midstate[i];
-}
+for (i = 0; i < 20; i++)
+    for (k = 0; k < 4; k++)
+        W[4 * i + k] = input[k * 20 + i];
+for (i = 0; i < 8; i++)
+    for (k = 0; k < 4; k++)
+        tstate[4 * i + k] = midstate[i];

HMAC_SHA256_80_init_4way(W, tstate, ostate);
PBKDF2_SHA256_80_128_4way(tstate, ostate, W, W);

-for (i = 0; i < 32; i++) {
-    X[0 * 32 + i] = W[4 * i + 0];
-    X[1 * 32 + i] = W[4 * i + 1];
-    X[2 * 32 + i] = W[4 * i + 2];
-    X[3 * 32 + i] = W[4 * i + 3];
-}
+for (i = 0; i < 32; i++)
+    for (k = 0; k < 4; k++)
+        X[k * 32 + i] = W[4 * i + k];

scrypt_core(X + 0 * 32, V);
scrypt_core(X + 1 * 32, V);
scrypt_core(X + 2 * 32, V);
scrypt_core(X + 3 * 32, V);

-for (i = 0; i < 32; i++) {
-    W[4 * i + 0] = X[0 * 32 + i];
-    W[4 * i + 1] = X[1 * 32 + i];
-    W[4 * i + 2] = X[2 * 32 + i];
-    W[4 * i + 3] = X[3 * 32 + i];
-}
+for (i = 0; i < 32; i++)
+    for (k = 0; k < 4; k++)
+        W[4 * i + k] = X[k * 32 + i];

PBKDF2_SHA256_128_32_4way(tstate, ostate, W, W);

-for (i = 0; i < 8; i++) {
-    output[0 * 8 + i] = W[4 * i + 0];
-    output[1 * 8 + i] = W[4 * i + 1];
-    output[2 * 8 + i] = W[4 * i + 2];
-    output[3 * 8 + i] = W[4 * i + 3];
-}
+for (i = 0; i < 8; i++)
+    for (k = 0; k < 4; k++)
+        output[k * 8 + i] = W[4 * i + k];
}

#endif /* HAVE_SHA256_4WAY */
@@ -491,68 +604,97 @@ static void scrypt_1024_1_1_256_12way(const uint32_t *input,
uint32_t W[12 * 32] __attribute__((aligned(128)));
uint32_t X[12 * 32] __attribute__((aligned(128)));
uint32_t *V;
-int i, j;
+int i, j, k;

V = (uint32_t *)(((uintptr_t)(scratchpad) + 63) & ~ (uintptr_t)(63));

-for (j = 0; j < 3; j++) {
-    for (i = 0; i < 20; i++) {
-        W[128 * j + 4 * i + 0] = input[80 * j + 0 * 20 + i];
-        W[128 * j + 4 * i + 1] = input[80 * j + 1 * 20 + i];
-        W[128 * j + 4 * i + 2] = input[80 * j + 2 * 20 + i];
-        W[128 * j + 4 * i + 3] = input[80 * j + 3 * 20 + i];
-    }
-}
-for (j = 0; j < 3; j++) {
-    for (i = 0; i < 8; i++) {
-        tstate[32 * j + 4 * i + 0] = midstate[i];
-        tstate[32 * j + 4 * i + 1] = midstate[i];
-        tstate[32 * j + 4 * i + 2] = midstate[i];
-        tstate[32 * j + 4 * i + 3] = midstate[i];
-    }
-}
+for (j = 0; j < 3; j++)
+    for (i = 0; i < 20; i++)
+        for (k = 0; k < 4; k++)
+            W[128 * j + 4 * i + k] = input[80 * j + k * 20 + i];
+for (j = 0; j < 3; j++)
+    for (i = 0; i < 8; i++)
+        for (k = 0; k < 4; k++)
+            tstate[32 * j + 4 * i + k] = midstate[i];

HMAC_SHA256_80_init_4way(W + 0, tstate + 0, ostate + 0);
HMAC_SHA256_80_init_4way(W + 128, tstate + 32, ostate + 32);
HMAC_SHA256_80_init_4way(W + 256, tstate + 64, ostate + 64);
PBKDF2_SHA256_80_128_4way(tstate + 0, ostate + 0, W + 0, W + 0);
PBKDF2_SHA256_80_128_4way(tstate + 32, ostate + 32, W + 128, W + 128);
PBKDF2_SHA256_80_128_4way(tstate + 64, ostate + 64, W + 256, W + 256);

-for (j = 0; j < 3; j++) {
-    for (i = 0; i < 32; i++) {
-        X[128 * j + 0 * 32 + i] = W[128 * j + 4 * i + 0];
-        X[128 * j + 1 * 32 + i] = W[128 * j + 4 * i + 1];
-        X[128 * j + 2 * 32 + i] = W[128 * j + 4 * i + 2];
-        X[128 * j + 3 * 32 + i] = W[128 * j + 4 * i + 3];
-    }
-}
+for (j = 0; j < 3; j++)
+    for (i = 0; i < 32; i++)
+        for (k = 0; k < 4; k++)
+            X[128 * j + k * 32 + i] = W[128 * j + 4 * i + k];

scrypt_core_3way(X + 0 * 96, V);
scrypt_core_3way(X + 1 * 96, V);
scrypt_core_3way(X + 2 * 96, V);
scrypt_core_3way(X + 3 * 96, V);

-for (j = 0; j < 3; j++) {
-    for (i = 0; i < 32; i++) {
-        W[128 * j + 4 * i + 0] = X[128 * j + 0 * 32 + i];
-        W[128 * j + 4 * i + 1] = X[128 * j + 1 * 32 + i];
-        W[128 * j + 4 * i + 2] = X[128 * j + 2 * 32 + i];
-        W[128 * j + 4 * i + 3] = X[128 * j + 3 * 32 + i];
-    }
-}
+for (j = 0; j < 3; j++)
+    for (i = 0; i < 32; i++)
+        for (k = 0; k < 4; k++)
+            W[128 * j + 4 * i + k] = X[128 * j + k * 32 + i];

PBKDF2_SHA256_128_32_4way(tstate + 0, ostate + 0, W + 0, W + 0);
PBKDF2_SHA256_128_32_4way(tstate + 32, ostate + 32, W + 128, W + 128);
PBKDF2_SHA256_128_32_4way(tstate + 64, ostate + 64, W + 256, W + 256);

-for (j = 0; j < 3; j++) {
-    for (i = 0; i < 8; i++) {
-        output[32 * j + 0 * 8 + i] = W[128 * j + 4 * i + 0];
-        output[32 * j + 1 * 8 + i] = W[128 * j + 4 * i + 1];
-        output[32 * j + 2 * 8 + i] = W[128 * j + 4 * i + 2];
-        output[32 * j + 3 * 8 + i] = W[128 * j + 4 * i + 3];
-    }
-}
+for (j = 0; j < 3; j++)
+    for (i = 0; i < 8; i++)
+        for (k = 0; k < 4; k++)
+            output[32 * j + k * 8 + i] = W[128 * j + 4 * i + k];
}

#endif /* HAVE_SHA256_4WAY */
#endif /* HAVE_SCRYPT_3WAY */
#ifdef HAVE_SCRYPT_6WAY
static void scrypt_1024_1_1_256_24way(const uint32_t *input,
uint32_t *output, uint32_t *midstate, unsigned char *scratchpad)
{
uint32_t tstate[24 * 8] __attribute__((aligned(128)));
uint32_t ostate[24 * 8] __attribute__((aligned(128)));
uint32_t W[24 * 32] __attribute__((aligned(128)));
uint32_t X[24 * 32] __attribute__((aligned(128)));
uint32_t *V;
int i, j, k;
V = (uint32_t *)(((uintptr_t)(scratchpad) + 63) & ~ (uintptr_t)(63));
for (j = 0; j < 3; j++)
for (i = 0; i < 20; i++)
for (k = 0; k < 8; k++)
W[8 * 32 * j + 8 * i + k] = input[8 * 20 * j + k * 20 + i];
for (j = 0; j < 3; j++)
for (i = 0; i < 8; i++)
for (k = 0; k < 8; k++)
tstate[8 * 8 * j + 8 * i + k] = midstate[i];
HMAC_SHA256_80_init_8way(W + 0, tstate + 0, ostate + 0);
HMAC_SHA256_80_init_8way(W + 256, tstate + 64, ostate + 64);
HMAC_SHA256_80_init_8way(W + 512, tstate + 128, ostate + 128);
PBKDF2_SHA256_80_128_8way(tstate + 0, ostate + 0, W + 0, W + 0);
PBKDF2_SHA256_80_128_8way(tstate + 64, ostate + 64, W + 256, W + 256);
PBKDF2_SHA256_80_128_8way(tstate + 128, ostate + 128, W + 512, W + 512);
for (j = 0; j < 3; j++)
for (i = 0; i < 32; i++)
for (k = 0; k < 8; k++)
X[8 * 32 * j + k * 32 + i] = W[8 * 32 * j + 8 * i + k];
scrypt_core_6way(X + 0 * 32, V);
scrypt_core_6way(X + 6 * 32, V);
scrypt_core_6way(X + 12 * 32, V);
scrypt_core_6way(X + 18 * 32, V);
for (j = 0; j < 3; j++)
for (i = 0; i < 32; i++)
for (k = 0; k < 8; k++)
W[8 * 32 * j + 8 * i + k] = X[8 * 32 * j + k * 32 + i];
PBKDF2_SHA256_128_32_8way(tstate + 0, ostate + 0, W + 0, W + 0);
PBKDF2_SHA256_128_32_8way(tstate + 64, ostate + 64, W + 256, W + 256);
PBKDF2_SHA256_128_32_8way(tstate + 128, ostate + 128, W + 512, W + 512);
for (j = 0; j < 3; j++)
for (i = 0; i < 8; i++)
for (k = 0; k < 8; k++)
output[8 * 8 * j + k * 8 + i] = W[8 * 32 * j + 8 * i + k];
}
#endif /* HAVE_SCRYPT_6WAY */
int scanhash_scrypt(int thr_id, uint32_t *pdata,
unsigned char *scratchbuf, const uint32_t *ptarget,
uint32_t max_nonce, unsigned long *hashes_done)

@@ -589,6 +731,11 @@ int scanhash_scrypt(int thr_id, uint32_t *pdata,
scrypt_1024_1_1_256_12way(data, hash, midstate, scratchbuf);
else
#endif
#if defined(HAVE_SCRYPT_6WAY)
if (throughput == 24)
scrypt_1024_1_1_256_24way(data, hash, midstate, scratchbuf);
else
#endif
#if defined(HAVE_SCRYPT_3WAY)
if (throughput == 3)
scrypt_1024_1_1_256_3way(data, hash, midstate, scratchbuf);

sha2-x64.S (1056 lines changed)

File diff suppressed because it is too large.

sha2.c (64 lines changed)

@@ -522,6 +522,65 @@ static inline int scanhash_sha256d_4way(int thr_id, uint32_t *pdata,
#endif /* HAVE_SHA256_4WAY */
#ifdef HAVE_SHA256_8WAY
void sha256d_ms_8way(uint32_t *hash, uint32_t *data,
const uint32_t *midstate, const uint32_t *prehash);
static inline int scanhash_sha256d_8way(int thr_id, uint32_t *pdata,
const uint32_t *ptarget, uint32_t max_nonce, unsigned long *hashes_done)
{
uint32_t data[8 * 64] __attribute__((aligned(128)));
uint32_t hash[8 * 8] __attribute__((aligned(32)));
uint32_t midstate[8 * 8] __attribute__((aligned(32)));
uint32_t prehash[8 * 8] __attribute__((aligned(32)));
uint32_t n = pdata[19] - 1;
const uint32_t first_nonce = pdata[19];
const uint32_t Htarg = ptarget[7];
int i, j;
memcpy(data, pdata + 16, 64);
sha256d_preextend(data);
for (i = 31; i >= 0; i--)
for (j = 0; j < 8; j++)
data[i * 8 + j] = data[i];
sha256_init(midstate);
sha256_transform(midstate, pdata, 0);
memcpy(prehash, midstate, 32);
sha256d_prehash(prehash, pdata + 16);
for (i = 7; i >= 0; i--) {
for (j = 0; j < 8; j++) {
midstate[i * 8 + j] = midstate[i];
prehash[i * 8 + j] = prehash[i];
}
}
do {
for (i = 0; i < 8; i++)
data[8 * 3 + i] = ++n;
sha256d_ms_8way(hash, data, midstate, prehash);
for (i = 0; i < 8; i++) {
if (swab32(hash[8 * 7 + i]) <= Htarg) {
pdata[19] = data[8 * 3 + i];
sha256d_80_swap(hash, pdata);
if (fulltest(hash, ptarget)) {
*hashes_done = n - first_nonce + 1;
return 1;
}
}
}
} while (n < max_nonce && !work_restart[thr_id].restart);
*hashes_done = n - first_nonce + 1;
pdata[19] = n;
return 0;
}
#endif /* HAVE_SHA256_8WAY */
int scanhash_sha256d(int thr_id, uint32_t *pdata, const uint32_t *ptarget,
uint32_t max_nonce, unsigned long *hashes_done)
{

@@ -533,6 +592,11 @@ int scanhash_sha256d(int thr_id, uint32_t *pdata, const uint32_t *ptarget,
const uint32_t first_nonce = pdata[19];
const uint32_t Htarg = ptarget[7];
#ifdef HAVE_SHA256_8WAY
if (sha256_use_8way())
return scanhash_sha256d_8way(thr_id, pdata, ptarget,
max_nonce, hashes_done);
#endif
#ifdef HAVE_SHA256_4WAY
if (sha256_use_4way())
return scanhash_sha256d_4way(thr_id, pdata, ptarget,