Add 4-way SHA-256 implementation for x86-64

This commit is contained in:
pooler 2012-02-28 18:10:26 +01:00
parent 1f15a1f672
commit e560d53b76
3 changed files with 769 additions and 227 deletions

38
miner.h
View file

@ -119,17 +119,49 @@ static inline void swap256(void *dest_p, const void *src_p)
dest[7] = src[0];
}
/*
 * Read four bytes starting at pp and assemble them into a 32-bit value,
 * treating the first byte as the most significant one (big-endian).
 * Works on unaligned addresses because only byte loads are performed.
 */
static inline uint32_t be32dec(const void *pp)
{
	const uint8_t *bytes = (const uint8_t *)pp;
	uint32_t value = 0;
	int k;

	/* Shift in one byte at a time, most significant byte first. */
	for (k = 0; k < 4; k++)
		value = (value << 8) | (uint32_t)bytes[k];

	return value;
}
/*
 * Store the 32-bit value x into the four bytes at pp in big-endian order
 * (most significant byte first).  Only byte stores are used, so pp does
 * not need to be aligned.
 */
static inline void be32enc(void *pp, uint32_t x)
{
	uint8_t *out = (uint8_t *)pp;
	int k;

	/* Peel off the low byte each iteration and fill the buffer backwards. */
	for (k = 3; k >= 0; k--) {
		out[k] = (uint8_t)(x & 0xff);
		x >>= 8;
	}
}
/*
 * Read four bytes starting at pp and assemble them into a 32-bit value,
 * treating the first byte as the least significant one (little-endian).
 * Works on unaligned addresses because only byte loads are performed.
 */
static inline uint32_t le32dec(const void *pp)
{
	const uint8_t *bytes = (const uint8_t *)pp;
	uint32_t value = 0;
	int k;

	/* Walk from the last (most significant) byte down to the first. */
	for (k = 3; k >= 0; k--)
		value = (value << 8) | (uint32_t)bytes[k];

	return value;
}
/*
 * Store the 32-bit value x into the four bytes at pp in little-endian
 * order (least significant byte first).  Only byte stores are used, so pp
 * does not need to be aligned.
 */
static inline void le32enc(void *pp, uint32_t x)
{
	uint8_t *out = (uint8_t *)pp;
	int k;

	/* Emit the low byte first, then shift the next byte into place. */
	for (k = 0; k < 4; k++) {
		out[k] = (uint8_t)(x & 0xff);
		x >>= 8;
	}
}
extern bool opt_debug;
extern bool opt_protocol;
extern const uint32_t sha256_init_state[];
extern json_t *json_rpc_call(CURL *curl, const char *url, const char *userpass,
const char *rpc_req, bool, bool, int *);
extern char *bin2hex(const unsigned char *p, size_t len);
extern bool hex2bin(unsigned char *p, const char *hexstr, size_t len);
extern unsigned char *scrypt_buffer_alloc();
extern int scanhash_scrypt(int, unsigned char *pdata, unsigned char *scratchbuf,
const unsigned char *ptarget,
extern int scanhash_scrypt(int thr_id, unsigned char *pdata,
unsigned char *scratchbuf, const unsigned char *ptarget,
uint32_t max_nonce, uint32_t *next_nonce, unsigned long *hashes_done);
extern int

View file

@ -27,6 +27,394 @@
#endif
#if defined(__x86_64__)
/*
 * sha256_4h: the eight SHA-256 initial hash words, one table row per word.
 * Every word is repeated four times so that one 16-byte load fills all
 * four 32-bit lanes of an XMM register with the same value.
 * The table is 64-byte aligned, so each 16-byte row satisfies the
 * alignment requirement of the movdqa loads in SHA256_InitState_4way.
 */
.data
.p2align 6
sha256_4h:
.long 0x6a09e667, 0x6a09e667, 0x6a09e667, 0x6a09e667
.long 0xbb67ae85, 0xbb67ae85, 0xbb67ae85, 0xbb67ae85
.long 0x3c6ef372, 0x3c6ef372, 0x3c6ef372, 0x3c6ef372
.long 0xa54ff53a, 0xa54ff53a, 0xa54ff53a, 0xa54ff53a
.long 0x510e527f, 0x510e527f, 0x510e527f, 0x510e527f
.long 0x9b05688c, 0x9b05688c, 0x9b05688c, 0x9b05688c
.long 0x1f83d9ab, 0x1f83d9ab, 0x1f83d9ab, 0x1f83d9ab
.long 0x5be0cd19, 0x5be0cd19, 0x5be0cd19, 0x5be0cd19
/*
 * sha256_4k: the 64 SHA-256 round constants, one table row per round.
 * Every constant is repeated four times (one copy per 32-bit lane), so
 * row i lives at byte offset 16*i.  The main loop of
 * SHA256_Transform_4way adds one row per round with paddd, which needs
 * the 16-byte alignment guaranteed by the 64-byte table alignment.
 */
.data
.p2align 6
sha256_4k:
.long 0x428a2f98, 0x428a2f98, 0x428a2f98, 0x428a2f98
.long 0x71374491, 0x71374491, 0x71374491, 0x71374491
.long 0xb5c0fbcf, 0xb5c0fbcf, 0xb5c0fbcf, 0xb5c0fbcf
.long 0xe9b5dba5, 0xe9b5dba5, 0xe9b5dba5, 0xe9b5dba5
.long 0x3956c25b, 0x3956c25b, 0x3956c25b, 0x3956c25b
.long 0x59f111f1, 0x59f111f1, 0x59f111f1, 0x59f111f1
.long 0x923f82a4, 0x923f82a4, 0x923f82a4, 0x923f82a4
.long 0xab1c5ed5, 0xab1c5ed5, 0xab1c5ed5, 0xab1c5ed5
.long 0xd807aa98, 0xd807aa98, 0xd807aa98, 0xd807aa98
.long 0x12835b01, 0x12835b01, 0x12835b01, 0x12835b01
.long 0x243185be, 0x243185be, 0x243185be, 0x243185be
.long 0x550c7dc3, 0x550c7dc3, 0x550c7dc3, 0x550c7dc3
.long 0x72be5d74, 0x72be5d74, 0x72be5d74, 0x72be5d74
.long 0x80deb1fe, 0x80deb1fe, 0x80deb1fe, 0x80deb1fe
.long 0x9bdc06a7, 0x9bdc06a7, 0x9bdc06a7, 0x9bdc06a7
.long 0xc19bf174, 0xc19bf174, 0xc19bf174, 0xc19bf174
.long 0xe49b69c1, 0xe49b69c1, 0xe49b69c1, 0xe49b69c1
.long 0xefbe4786, 0xefbe4786, 0xefbe4786, 0xefbe4786
.long 0x0fc19dc6, 0x0fc19dc6, 0x0fc19dc6, 0x0fc19dc6
.long 0x240ca1cc, 0x240ca1cc, 0x240ca1cc, 0x240ca1cc
.long 0x2de92c6f, 0x2de92c6f, 0x2de92c6f, 0x2de92c6f
.long 0x4a7484aa, 0x4a7484aa, 0x4a7484aa, 0x4a7484aa
.long 0x5cb0a9dc, 0x5cb0a9dc, 0x5cb0a9dc, 0x5cb0a9dc
.long 0x76f988da, 0x76f988da, 0x76f988da, 0x76f988da
.long 0x983e5152, 0x983e5152, 0x983e5152, 0x983e5152
.long 0xa831c66d, 0xa831c66d, 0xa831c66d, 0xa831c66d
.long 0xb00327c8, 0xb00327c8, 0xb00327c8, 0xb00327c8
.long 0xbf597fc7, 0xbf597fc7, 0xbf597fc7, 0xbf597fc7
.long 0xc6e00bf3, 0xc6e00bf3, 0xc6e00bf3, 0xc6e00bf3
.long 0xd5a79147, 0xd5a79147, 0xd5a79147, 0xd5a79147
.long 0x06ca6351, 0x06ca6351, 0x06ca6351, 0x06ca6351
.long 0x14292967, 0x14292967, 0x14292967, 0x14292967
.long 0x27b70a85, 0x27b70a85, 0x27b70a85, 0x27b70a85
.long 0x2e1b2138, 0x2e1b2138, 0x2e1b2138, 0x2e1b2138
.long 0x4d2c6dfc, 0x4d2c6dfc, 0x4d2c6dfc, 0x4d2c6dfc
.long 0x53380d13, 0x53380d13, 0x53380d13, 0x53380d13
.long 0x650a7354, 0x650a7354, 0x650a7354, 0x650a7354
.long 0x766a0abb, 0x766a0abb, 0x766a0abb, 0x766a0abb
.long 0x81c2c92e, 0x81c2c92e, 0x81c2c92e, 0x81c2c92e
.long 0x92722c85, 0x92722c85, 0x92722c85, 0x92722c85
.long 0xa2bfe8a1, 0xa2bfe8a1, 0xa2bfe8a1, 0xa2bfe8a1
.long 0xa81a664b, 0xa81a664b, 0xa81a664b, 0xa81a664b
.long 0xc24b8b70, 0xc24b8b70, 0xc24b8b70, 0xc24b8b70
.long 0xc76c51a3, 0xc76c51a3, 0xc76c51a3, 0xc76c51a3
.long 0xd192e819, 0xd192e819, 0xd192e819, 0xd192e819
.long 0xd6990624, 0xd6990624, 0xd6990624, 0xd6990624
.long 0xf40e3585, 0xf40e3585, 0xf40e3585, 0xf40e3585
.long 0x106aa070, 0x106aa070, 0x106aa070, 0x106aa070
.long 0x19a4c116, 0x19a4c116, 0x19a4c116, 0x19a4c116
.long 0x1e376c08, 0x1e376c08, 0x1e376c08, 0x1e376c08
.long 0x2748774c, 0x2748774c, 0x2748774c, 0x2748774c
.long 0x34b0bcb5, 0x34b0bcb5, 0x34b0bcb5, 0x34b0bcb5
.long 0x391c0cb3, 0x391c0cb3, 0x391c0cb3, 0x391c0cb3
.long 0x4ed8aa4a, 0x4ed8aa4a, 0x4ed8aa4a, 0x4ed8aa4a
.long 0x5b9cca4f, 0x5b9cca4f, 0x5b9cca4f, 0x5b9cca4f
.long 0x682e6ff3, 0x682e6ff3, 0x682e6ff3, 0x682e6ff3
.long 0x748f82ee, 0x748f82ee, 0x748f82ee, 0x748f82ee
.long 0x78a5636f, 0x78a5636f, 0x78a5636f, 0x78a5636f
.long 0x84c87814, 0x84c87814, 0x84c87814, 0x84c87814
.long 0x8cc70208, 0x8cc70208, 0x8cc70208, 0x8cc70208
.long 0x90befffa, 0x90befffa, 0x90befffa, 0x90befffa
.long 0xa4506ceb, 0xa4506ceb, 0xa4506ceb, 0xa4506ceb
.long 0xbef9a3f7, 0xbef9a3f7, 0xbef9a3f7, 0xbef9a3f7
.long 0xc67178f2, 0xc67178f2, 0xc67178f2, 0xc67178f2
/*
 * void SHA256_InitState_4way(uint32_t *state);
 *
 * Fill the 128-byte buffer at state with the sha256_4h table, i.e. eight
 * rows of four identical 32-bit words (one copy of the initial SHA-256
 * hash word per lane).
 *
 * In:       %rdi = state (System V) or %rcx = state (WIN64).
 * Clobbers: %xmm0-%xmm3.  On WIN64 %rdi is saved and restored because it
 *           is callee-saved there.
 *
 * The table is addressed relative to %rip so that the object can be
 * linked into position-independent and Mach-O x86-64 images, which reject
 * 32-bit absolute data references.  The stores use movdqu because the
 * caller's buffer is not required to be 16-byte aligned.
 */
	.text
	.p2align 5
	.globl SHA256_InitState_4way
	.globl _SHA256_InitState_4way
SHA256_InitState_4way:
_SHA256_InitState_4way:
#if defined(WIN64)
	pushq	%rdi
	movq	%rcx, %rdi
#endif
	/* Rows 0-3 (hash words 0..3). */
	movdqa	sha256_4h+0(%rip), %xmm0
	movdqa	sha256_4h+16(%rip), %xmm1
	movdqa	sha256_4h+32(%rip), %xmm2
	movdqa	sha256_4h+48(%rip), %xmm3
	movdqu	%xmm0, 0(%rdi)
	movdqu	%xmm1, 16(%rdi)
	movdqu	%xmm2, 32(%rdi)
	movdqu	%xmm3, 48(%rdi)
	/* Rows 4-7 (hash words 4..7). */
	movdqa	sha256_4h+64(%rip), %xmm0
	movdqa	sha256_4h+80(%rip), %xmm1
	movdqa	sha256_4h+96(%rip), %xmm2
	movdqa	sha256_4h+112(%rip), %xmm3
	movdqu	%xmm0, 64(%rdi)
	movdqu	%xmm1, 80(%rdi)
	movdqu	%xmm2, 96(%rdi)
	movdqu	%xmm3, 112(%rdi)
#if defined(WIN64)
	popq	%rdi
#endif
	ret
/*
 * p2bswap_rsi_rsp i
 *
 * Load the two 16-byte vectors at rows i and i+1 of the buffer at %rsi
 * (unaligned loads), reverse the byte order inside every 32-bit word, and
 * store the results to rows i and i+1 of the buffer at %rsp (aligned
 * stores, so %rsp must be 16-byte aligned).
 * Clobbers %xmm0-%xmm3.
 */
.macro p2bswap_rsi_rsp i
movdqu \i*16(%rsi), %xmm0
movdqu (\i+1)*16(%rsi), %xmm2
/* Shuffle control 0xb1 swaps the two 16-bit halves of each 32-bit word. */
pshuflw $0xb1, %xmm0, %xmm0
pshuflw $0xb1, %xmm2, %xmm2
pshufhw $0xb1, %xmm0, %xmm0
pshufhw $0xb1, %xmm2, %xmm2
/* Swap the two bytes inside each 16-bit half: (x >> 8) ^ (x << 8). */
movdqa %xmm0, %xmm1
movdqa %xmm2, %xmm3
psrlw $8, %xmm1
psrlw $8, %xmm3
psllw $8, %xmm0
psllw $8, %xmm2
pxor %xmm1, %xmm0
pxor %xmm3, %xmm2
movdqa %xmm0, \i*16(%rsp)
movdqa %xmm2, (\i+1)*16(%rsp)
.endm
/*
 * void SHA256_Transform_4way(uint32_t *state, const uint32_t *block,
 *                            int swap);
 *
 * SHA-256 compression function applied to four independent lanes at once.
 * Both buffers are laid out as rows of four 32-bit words (one word per
 * lane): state is 8 rows (128 bytes), block is 16 rows (256 bytes).
 * If swap is non-zero the bytes of every 32-bit message word are reversed
 * before use; otherwise the block is used as is.
 *
 * In:  System V: %rdi = state, %rsi = block, %edx = swap.
 *      WIN64:    %rcx = state, %rdx = block, %r8d = swap (moved into the
 *                System V registers in the prologue; %rdi, %rsi and
 *                %xmm6-%xmm11 are saved and restored).
 * Clobbers: %rax, %rcx, %xmm0-%xmm11 (plus flags).
 *
 * Stack: 1032 bytes = 64 rows * 16 bytes of message schedule plus 8 bytes
 * that bring %rsp back to 16-byte alignment for the movdqa accesses.
 * state and block are accessed with movdqu and need no alignment.
 */
	.text
	.p2align 5
	.globl SHA256_Transform_4way
	.globl _SHA256_Transform_4way
SHA256_Transform_4way:
_SHA256_Transform_4way:
#if defined(WIN64)
	pushq	%rdi
	subq	$96, %rsp
	movdqa	%xmm6, 0(%rsp)
	movdqa	%xmm7, 16(%rsp)
	movdqa	%xmm8, 32(%rsp)
	movdqa	%xmm9, 48(%rsp)
	movdqa	%xmm10, 64(%rsp)
	movdqa	%xmm11, 80(%rsp)
	pushq	%rsi
	movq	%rcx, %rdi
	movq	%rdx, %rsi
	movq	%r8, %rdx
#endif
	subq	$1032, %rsp

	/*
	 * swap is a 32-bit int: only the low half of the register is
	 * defined by the calling convention, so test %edx, not %rdx.
	 */
	testl	%edx, %edx
	jz sha256_transform_4way_block_copy

	/* swap != 0: copy the 16 message rows with per-word byte reversal. */
	p2bswap_rsi_rsp 0
	p2bswap_rsi_rsp 2
	p2bswap_rsi_rsp 4
	p2bswap_rsi_rsp 6
	p2bswap_rsi_rsp 8
	p2bswap_rsi_rsp 10
	p2bswap_rsi_rsp 12
	p2bswap_rsi_rsp 14
	jmp sha256_transform_4way_extend

	/* swap == 0: plain copy of the 16 message rows to the stack. */
sha256_transform_4way_block_copy:
	movdqu	0*16(%rsi), %xmm0
	movdqu	1*16(%rsi), %xmm1
	movdqu	2*16(%rsi), %xmm2
	movdqu	3*16(%rsi), %xmm3
	movdqu	4*16(%rsi), %xmm4
	movdqu	5*16(%rsi), %xmm5
	movdqu	6*16(%rsi), %xmm6
	movdqu	7*16(%rsi), %xmm7
	movdqa	%xmm0, 0*16(%rsp)
	movdqa	%xmm1, 1*16(%rsp)
	movdqa	%xmm2, 2*16(%rsp)
	movdqa	%xmm3, 3*16(%rsp)
	movdqa	%xmm4, 4*16(%rsp)
	movdqa	%xmm5, 5*16(%rsp)
	movdqa	%xmm6, 6*16(%rsp)
	movdqa	%xmm7, 7*16(%rsp)
	movdqu	8*16(%rsi), %xmm0
	movdqu	9*16(%rsi), %xmm1
	movdqu	10*16(%rsi), %xmm2
	movdqu	11*16(%rsi), %xmm3
	movdqu	12*16(%rsi), %xmm4
	movdqu	13*16(%rsi), %xmm5
	movdqu	14*16(%rsi), %xmm6
	movdqu	15*16(%rsi), %xmm7
	movdqa	%xmm0, 8*16(%rsp)
	movdqa	%xmm1, 9*16(%rsp)
	movdqa	%xmm2, 10*16(%rsp)
	movdqa	%xmm3, 11*16(%rsp)
	movdqa	%xmm4, 12*16(%rsp)
	movdqa	%xmm5, 13*16(%rsp)
	movdqa	%xmm6, 14*16(%rsp)
	movdqa	%xmm7, 15*16(%rsp)

	/*
	 * Message schedule: extend rows 0..15 to rows 16..63, two rows per
	 * iteration.  %rcx points at row i, %rax at the end (row 64).
	 *   W[i] = s1(W[i-2]) + W[i-7] + s0(W[i-15]) + W[i-16]
	 * with s0(x) = ROTR7 ^ ROTR18 ^ SHR3 and s1(x) = ROTR17 ^ ROTR19 ^
	 * SHR10, each rotation built from a right shift and a left shift.
	 */
sha256_transform_4way_extend:
	leaq	256(%rsp), %rcx
	leaq	48*16(%rcx), %rax
sha256_transform_4way_extend_loop:
	/* s0 of W[i-15] (xmm0) and of W[i-14] (xmm4). */
	movdqa	-15*16(%rcx), %xmm0
	movdqa	-14*16(%rcx), %xmm4
	movdqa	%xmm0, %xmm2
	movdqa	%xmm4, %xmm6
	psrld	$3, %xmm0
	psrld	$3, %xmm4
	movdqa	%xmm0, %xmm1
	movdqa	%xmm4, %xmm5
	pslld	$14, %xmm2
	pslld	$14, %xmm6
	psrld	$4, %xmm1
	psrld	$4, %xmm5
	pxor	%xmm1, %xmm0
	pxor	%xmm5, %xmm4
	psrld	$11, %xmm1
	psrld	$11, %xmm5
	pxor	%xmm2, %xmm0
	pxor	%xmm6, %xmm4
	pslld	$11, %xmm2
	pslld	$11, %xmm6
	pxor	%xmm1, %xmm0
	pxor	%xmm5, %xmm4
	pxor	%xmm2, %xmm0
	pxor	%xmm6, %xmm4

	/* Add W[i-16] / W[i-15]; start s1 of W[i-2] (xmm3), W[i-1] (xmm7). */
	movdqa	-2*16(%rcx), %xmm3
	movdqa	-1*16(%rcx), %xmm7
	paddd	-16*16(%rcx), %xmm0
	paddd	-15*16(%rcx), %xmm4

	movdqa	%xmm3, %xmm2
	movdqa	%xmm7, %xmm6
	psrld	$10, %xmm3
	psrld	$10, %xmm7
	movdqa	%xmm3, %xmm1
	movdqa	%xmm7, %xmm5

	paddd	-7*16(%rcx), %xmm0

	pslld	$13, %xmm2
	pslld	$13, %xmm6
	psrld	$7, %xmm1
	psrld	$7, %xmm5

	paddd	-6*16(%rcx), %xmm4

	pxor	%xmm1, %xmm3
	pxor	%xmm5, %xmm7
	psrld	$2, %xmm1
	psrld	$2, %xmm5
	pxor	%xmm2, %xmm3
	pxor	%xmm6, %xmm7
	pslld	$2, %xmm2
	pslld	$2, %xmm6
	pxor	%xmm1, %xmm3
	pxor	%xmm5, %xmm7
	pxor	%xmm2, %xmm3
	pxor	%xmm6, %xmm7

	paddd	%xmm3, %xmm0
	paddd	%xmm7, %xmm4
	movdqa	%xmm0, (%rcx)
	movdqa	%xmm4, 16(%rcx)
	addq	$2*16, %rcx
	cmpq	%rcx, %rax
	jne sha256_transform_4way_extend_loop

	/*
	 * Load the working variables.  Register assignment at the top of
	 * every round:
	 *   a = xmm7, b = xmm5, c = xmm4, d = xmm3,
	 *   e = xmm0, f = xmm8, g = xmm9, h = xmm10.
	 */
	movdqu	0(%rdi), %xmm7
	movdqu	16(%rdi), %xmm5
	movdqu	32(%rdi), %xmm4
	movdqu	48(%rdi), %xmm3
	movdqu	64(%rdi), %xmm0
	movdqu	80(%rdi), %xmm8
	movdqu	96(%rdi), %xmm9
	movdqu	112(%rdi), %xmm10

	/*
	 * %rcx is free after the extend loop; use it as the base of the
	 * round-constant table.  RIP-relative addressing keeps the code
	 * linkable into position-independent and Mach-O x86-64 images.
	 */
	leaq	sha256_4k(%rip), %rcx
	xorq	%rax, %rax
sha256_transform_4way_main_loop:
	/* xmm6 = W[i] + K[i] + h */
	movdqa	(%rsp, %rax), %xmm6
	paddd	(%rcx, %rax), %xmm6
	paddd	%xmm10, %xmm6

	/* Ch(e, f, g) = (~e & g) ^ (e & f); also rotate g->h, f->g, e->f. */
	movdqa	%xmm0, %xmm1
	movdqa	%xmm9, %xmm2
	pandn	%xmm2, %xmm1

	movdqa	%xmm2, %xmm10
	movdqa	%xmm8, %xmm2
	movdqa	%xmm2, %xmm9

	pand	%xmm0, %xmm2
	pxor	%xmm2, %xmm1
	movdqa	%xmm0, %xmm8

	paddd	%xmm1, %xmm6

	/* Sigma1(e) = ROTR6 ^ ROTR11 ^ ROTR25, accumulated in xmm0. */
	movdqa	%xmm0, %xmm1
	psrld	$6, %xmm0
	movdqa	%xmm0, %xmm2
	pslld	$7, %xmm1
	psrld	$5, %xmm2
	pxor	%xmm1, %xmm0
	pxor	%xmm2, %xmm0
	pslld	$14, %xmm1
	psrld	$14, %xmm2
	pxor	%xmm1, %xmm0
	pxor	%xmm2, %xmm0
	pslld	$5, %xmm1
	pxor	%xmm1, %xmm0
	paddd	%xmm0, %xmm6

	/* New e = d + T1 (xmm6 now holds T1). */
	movdqa	%xmm3, %xmm0
	paddd	%xmm6, %xmm0

	/* Maj(a, b, c) = (a & b) ^ (a & c) ^ (b & c); rotate c->d, b->c, a->b. */
	movdqa	%xmm5, %xmm1
	movdqa	%xmm4, %xmm3
	movdqa	%xmm4, %xmm2
	pand	%xmm5, %xmm2
	pand	%xmm7, %xmm4
	pand	%xmm7, %xmm1
	pxor	%xmm4, %xmm1
	movdqa	%xmm5, %xmm4
	movdqa	%xmm7, %xmm5
	pxor	%xmm2, %xmm1
	paddd	%xmm1, %xmm6

	/* Sigma0(a) = ROTR2 ^ ROTR13 ^ ROTR22, accumulated in xmm7. */
	movdqa	%xmm7, %xmm2
	psrld	$2, %xmm7
	movdqa	%xmm7, %xmm1
	pslld	$10, %xmm2
	psrld	$11, %xmm1
	pxor	%xmm2, %xmm7
	pxor	%xmm1, %xmm7
	pslld	$9, %xmm2
	psrld	$9, %xmm1
	pxor	%xmm2, %xmm7
	pxor	%xmm1, %xmm7
	pslld	$11, %xmm2
	pxor	%xmm2, %xmm7
	/* New a = T1 + Maj + Sigma0. */
	paddd	%xmm6, %xmm7

	addq	$16, %rax
	cmpq	$16*64, %rax
	jne sha256_transform_4way_main_loop

	/* Add the previous state to the working variables and store back. */
	movdqu	0(%rdi), %xmm2
	movdqu	16(%rdi), %xmm6
	movdqu	32(%rdi), %xmm11
	movdqu	48(%rdi), %xmm1
	paddd	%xmm2, %xmm7
	paddd	%xmm6, %xmm5
	paddd	%xmm11, %xmm4
	paddd	%xmm1, %xmm3
	movdqu	64(%rdi), %xmm2
	movdqu	80(%rdi), %xmm6
	movdqu	96(%rdi), %xmm11
	movdqu	112(%rdi), %xmm1
	paddd	%xmm2, %xmm0
	paddd	%xmm6, %xmm8
	paddd	%xmm11, %xmm9
	paddd	%xmm1, %xmm10

	movdqu	%xmm7, 0(%rdi)
	movdqu	%xmm5, 16(%rdi)
	movdqu	%xmm4, 32(%rdi)
	movdqu	%xmm3, 48(%rdi)
	movdqu	%xmm0, 64(%rdi)
	movdqu	%xmm8, 80(%rdi)
	movdqu	%xmm9, 96(%rdi)
	movdqu	%xmm10, 112(%rdi)

	addq	$1032, %rsp
#if defined(WIN64)
	popq	%rsi
	movdqa	0(%rsp), %xmm6
	movdqa	16(%rsp), %xmm7
	movdqa	32(%rsp), %xmm8
	movdqa	48(%rsp), %xmm9
	movdqa	64(%rsp), %xmm10
	movdqa	80(%rsp), %xmm11
	addq	$96, %rsp
	popq	%rdi
#endif
	ret
.macro scrypt_shuffle src, so, dest, do
movl \so+60(\src), %r8d
@ -187,7 +575,7 @@
.endm
.text
.align 32
.p2align 5
gen_salsa8_core:
# 0: %rdx, %rdi, %rcx, %rsi
movq 8(%rsp), %rdi
@ -286,7 +674,7 @@ gen_salsa8_core:
.text
.align 32
.p2align 5
.globl scrypt_core
.globl _scrypt_core
scrypt_core:
@ -559,7 +947,7 @@ gen_scrypt_core_loop2:
xmm_salsa8_core_doubleround
.endm
.align 32
.p2align 5
xmm_scrypt_core:
# shuffle 1st block into %xmm8-%xmm11
movl 60(%rdi), %edx
@ -871,7 +1259,7 @@ xmm_scrypt_core_loop2:
.text
.align 32
.p2align 5
.globl scrypt_best_throughput
.globl _scrypt_best_throughput
scrypt_best_throughput:
@ -1040,7 +1428,7 @@ scrypt_best_throughput_exit:
.text
.align 32
.p2align 5
.globl scrypt_core_2way
.globl _scrypt_core_2way
scrypt_core_2way:
@ -1509,7 +1897,7 @@ scrypt_core_2way_loop2:
.endm
.text
.align 32
.p2align 5
.globl scrypt_core_3way
.globl _scrypt_core_3way
scrypt_core_3way:
@ -1694,7 +2082,7 @@ scrypt_core_3way_loop1:
jne scrypt_core_3way_loop1
movq $1024, %r8
.align 16
.p2align 4
scrypt_core_3way_loop2:
movl 64(%rsp), %ebp
andl $1023, %ebp

554
scrypt.c
View file

@ -1,5 +1,5 @@
/*-
* Copyright 2009 Colin Percival, 2011 ArtForz, 2011 pooler
* Copyright 2009 Colin Percival, 2011 ArtForz, 2011-2012 pooler
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
@ -34,58 +34,30 @@
#include <stdint.h>
#include <string.h>
#define byteswap(x) ((((x) << 24) & 0xff000000u) | (((x) << 8) & 0x00ff0000u) | (((x) >> 8) & 0x0000ff00u) | (((x) >> 24) & 0x000000ffu))
#define byteswap(x) ((((x) << 24) & 0xff000000u) | (((x) << 8) & 0x00ff0000u) \
| (((x) >> 8) & 0x0000ff00u) | (((x) >> 24) & 0x000000ffu))
static inline void
byteswap_vec(uint32_t *dest, const uint32_t *src, uint32_t len)
static inline void byteswap_vec(uint32_t *dest, const uint32_t *src, int len)
{
uint32_t i;
int i;
for (i = 0; i < len; i++)
dest[i] = byteswap(src[i]);
}
static inline uint32_t be32dec(const void *pp)
static inline void SHA256_InitState(uint32_t *state)
{
const uint8_t *p = (uint8_t const *)pp;
return ((uint32_t)(p[3]) + ((uint32_t)(p[2]) << 8) +
((uint32_t)(p[1]) << 16) + ((uint32_t)(p[0]) << 24));
/* Magic initialization constants */
state[0] = 0x6A09E667;
state[1] = 0xBB67AE85;
state[2] = 0x3C6EF372;
state[3] = 0xA54FF53A;
state[4] = 0x510E527F;
state[5] = 0x9B05688C;
state[6] = 0x1F83D9AB;
state[7] = 0x5BE0CD19;
}
static inline void be32enc(void *pp, uint32_t x)
{
uint8_t * p = (uint8_t *)pp;
p[3] = x & 0xff;
p[2] = (x >> 8) & 0xff;
p[1] = (x >> 16) & 0xff;
p[0] = (x >> 24) & 0xff;
}
static inline uint32_t le32dec(const void *pp)
{
const uint8_t *p = (uint8_t const *)pp;
return ((uint32_t)(p[0]) + ((uint32_t)(p[1]) << 8) +
((uint32_t)(p[2]) << 16) + ((uint32_t)(p[3]) << 24));
}
static inline void le32enc(void *pp, uint32_t x)
{
uint8_t * p = (uint8_t *)pp;
p[0] = x & 0xff;
p[1] = (x >> 8) & 0xff;
p[2] = (x >> 16) & 0xff;
p[3] = (x >> 24) & 0xff;
}
typedef struct SHA256Context {
uint32_t state[8];
uint32_t buf[16];
} SHA256_CTX;
/* Elementary functions used by SHA256 */
#define Ch(x, y, z) ((x & (y ^ z)) ^ z)
#define Maj(x, y, z) ((x & (y | z)) | (y & z))
@ -115,8 +87,7 @@ typedef struct SHA256Context {
* SHA256 block compression function. The 256-bit state is transformed via
* the 512-bit input block to produce a new state.
*/
static void
SHA256_Transform(uint32_t * state, const uint32_t block[16], int swap)
static void SHA256_Transform(uint32_t *state, const uint32_t *block, int swap)
{
uint32_t W[64];
uint32_t S[8];
@ -124,12 +95,12 @@ SHA256_Transform(uint32_t * state, const uint32_t block[16], int swap)
int i;
/* 1. Prepare message schedule W. */
if(swap)
if (swap)
byteswap_vec(W, block, 16);
else
memcpy(W, block, 64);
for (i = 16; i < 64; i += 2) {
W[i] = s1(W[i - 2]) + W[i - 7] + s0(W[i - 15]) + W[i - 16];
W[i] = s1(W[i - 2]) + W[i - 7] + s0(W[i - 15]) + W[i - 16];
W[i+1] = s1(W[i - 1]) + W[i - 6] + s0(W[i - 14]) + W[i - 15];
}
@ -207,34 +178,37 @@ SHA256_Transform(uint32_t * state, const uint32_t block[16], int swap)
state[i] += S[i];
}
static inline void
SHA256_InitState(uint32_t * state)
{
/* Magic initialization constants */
state[0] = 0x6A09E667;
state[1] = 0xBB67AE85;
state[2] = 0x3C6EF372;
state[3] = 0xA54FF53A;
state[4] = 0x510E527F;
state[5] = 0x9B05688C;
state[6] = 0x1F83D9AB;
state[7] = 0x5BE0CD19;
}
/*
 * On x86-64 a four-lane SHA-256 is available; SHA256_4WAY enables the code
 * paths that use it.  Both routines operate on word-interleaved buffers:
 * word i of lane j is stored at index 4 * i + j, so `state` holds 4 * 8
 * words and `block` holds 4 * 16 words.  `swap` selects whether the bytes
 * of every 32-bit message word are reversed before hashing (non-zero) or
 * the block is used as is (zero).
 */
#if defined(__x86_64__)
#define SHA256_4WAY
void SHA256_Transform_4way(uint32_t *state, const uint32_t *block, int swap);
void SHA256_InitState_4way(uint32_t *state);
#endif
static const uint32_t passwdpad[12] = {0x00000080, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0x80020000};
static const uint32_t outerpad[8] = {0x80000000, 0, 0, 0, 0, 0, 0, 0x00000300};
static inline void
PBKDF2_SHA256_80_128_init(const uint32_t *passwd, uint32_t tstate[8], uint32_t ostate[8])
/*
 * Constant tails for the fixed-length SHA-256 blocks hashed in this file.
 * NOTE(review): the interpretations below follow from the values and from
 * how the tables are copied into blocks; confirm against the callers.
 *
 * keypad: 12 words appended after the last 4 words of the 80-byte key.
 * The block is hashed with swap = 1, so the words are stored pre-swapped:
 * 0x00000080 becomes the 0x80 padding marker and 0x80020000 becomes
 * 0x00000280 (640 bits = 80 bytes).
 */
static const uint32_t keypad[12] = {
0x00000080, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0x80020000
};
/*
 * innerpad: 11 words placed after 4 salt words and the block counter,
 * hashed without swapping.  0x4a0 = 1184 bits = 64 + 80 + 4 bytes.
 */
static const uint32_t innerpad[11] = {
0x80000000, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0x000004a0
};
/*
 * outerpad: 8 words placed after a 32-byte inner digest.
 * 0x300 = 768 bits = 64 + 32 bytes.
 */
static const uint32_t outerpad[8] = {
0x80000000, 0, 0, 0, 0, 0, 0, 0x00000300
};
/*
 * finalblk: a complete block holding the counter value 1 followed by
 * padding.  0x620 = 1568 bits = 64 + 128 + 4 bytes.
 */
static const uint32_t finalblk[16] = {
0x00000001, 0x80000000, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0x00000620
};
static inline void HMAC_SHA256_80_init(const uint32_t *key,
uint32_t *tstate, uint32_t *ostate)
{
uint32_t ihash[8];
uint32_t pad[16];
uint32_t i;
int i;
SHA256_InitState(tstate);
SHA256_Transform(tstate, passwd, 1);
memcpy(pad, passwd+16, 16);
memcpy(pad+4, passwdpad, 48);
SHA256_Transform(tstate, key, 1);
memcpy(pad, key + 16, 16);
memcpy(pad + 4, keypad, 48);
SHA256_Transform(tstate, pad, 1);
memcpy(ihash, tstate, 32);
@ -253,120 +227,179 @@ PBKDF2_SHA256_80_128_init(const uint32_t *passwd, uint32_t tstate[8], uint32_t o
SHA256_Transform(tstate, pad, 0);
}
/**
* PBKDF2_SHA256(passwd, passwdlen, salt, saltlen, c, buf, dkLen):
* Compute PBKDF2(passwd, salt, c, dkLen) using HMAC-SHA256 as the PRF, and
* write the output to buf.
*/
static inline void
PBKDF2_SHA256_80_128(const uint32_t *tstate, const uint32_t *ostate, const uint32_t *passwd, uint32_t *buf)
static inline void PBKDF2_SHA256_80_128(const uint32_t *tstate,
const uint32_t *ostate, const uint32_t *salt, uint32_t *output)
{
static const uint32_t innerpad[11] = {0x00000080, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0xa0040000};
SHA256_CTX PShictx, PShoctx;
uint32_t i;
uint32_t istate[8], ostate2[8];
uint32_t ibuf[16], obuf[16];
int i;
/* If Klen > 64, the key is really SHA256(K). */
memcpy(PShictx.state, tstate, 32);
memcpy(PShoctx.state, ostate, 32);
memcpy(istate, tstate, 32);
SHA256_Transform(istate, salt, 1);
memcpy(PShoctx.buf+8, outerpad, 32);
byteswap_vec(ibuf, salt + 16, 4);
memcpy(ibuf + 5, innerpad, 44);
memcpy(obuf + 8, outerpad, 32);
SHA256_Transform(PShictx.state, passwd, 1);
byteswap_vec(PShictx.buf, passwd+16, 4);
byteswap_vec(PShictx.buf+5, innerpad, 11);
/* Iterate through the blocks. */
for (i = 0; i < 4; i++) {
uint32_t ist[8];
uint32_t ost[8];
memcpy(obuf, istate, 32);
ibuf[4] = i + 1;
SHA256_Transform(obuf, ibuf, 0);
memcpy(ist, PShictx.state, 32);
PShictx.buf[4] = i + 1;
SHA256_Transform(ist, PShictx.buf, 0);
memcpy(PShoctx.buf, ist, 32);
memcpy(ost, PShoctx.state, 32);
SHA256_Transform(ost, PShoctx.buf, 0);
byteswap_vec(buf+i*8, ost, 8);
memcpy(ostate2, ostate, 32);
SHA256_Transform(ostate2, obuf, 0);
byteswap_vec(output + 8 * i, ostate2, 8);
}
}
static inline void
PBKDF2_SHA256_80_128_32(uint32_t *tstate, uint32_t *ostate, const uint32_t *passwd, const uint32_t *salt, uint32_t *output)
static inline void PBKDF2_SHA256_128_32(uint32_t *tstate, uint32_t *ostate,
const uint32_t *salt, uint32_t *output)
{
static const uint32_t ihash_finalblk[16] = {0x00000001,0x80000000,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0x00000620};
uint32_t pad[16];
uint32_t buf[16];
SHA256_Transform(tstate, salt, 1);
SHA256_Transform(tstate, salt+16, 1);
SHA256_Transform(tstate, ihash_finalblk, 0);
memcpy(pad, tstate, 32);
memcpy(pad+8, outerpad, 32);
SHA256_Transform(tstate, salt + 16, 1);
SHA256_Transform(tstate, finalblk, 0);
memcpy(buf, tstate, 32);
memcpy(buf + 8, outerpad, 32);
SHA256_Transform(ostate, pad, 0);
SHA256_Transform(ostate, buf, 0);
byteswap_vec(output, ostate, 8);
}
/**
* salsa20_8(B):
* Apply the salsa20/8 core to the provided block.
*/
static inline void
salsa20_8(uint32_t B[16], const uint32_t Bx[16])
#ifdef SHA256_4WAY
/*
 * Four-lane form of the 12-word key padding: one row per word, with the
 * same value in each of the four lanes (0x00000080, ten zero rows,
 * 0x80020000).
 */
static const uint32_t keypad_4way[4 * 12] = {
0x00000080, 0x00000080, 0x00000080, 0x00000080,
0x00000000, 0x00000000, 0x00000000, 0x00000000,
0x00000000, 0x00000000, 0x00000000, 0x00000000,
0x00000000, 0x00000000, 0x00000000, 0x00000000,
0x00000000, 0x00000000, 0x00000000, 0x00000000,
0x00000000, 0x00000000, 0x00000000, 0x00000000,
0x00000000, 0x00000000, 0x00000000, 0x00000000,
0x00000000, 0x00000000, 0x00000000, 0x00000000,
0x00000000, 0x00000000, 0x00000000, 0x00000000,
0x00000000, 0x00000000, 0x00000000, 0x00000000,
0x00000000, 0x00000000, 0x00000000, 0x00000000,
0x80020000, 0x80020000, 0x80020000, 0x80020000
};
/*
 * Four-lane form of the 11-word inner-hash padding: one row per word,
 * with the same value in each of the four lanes (0x80000000, nine zero
 * rows, 0x000004a0).
 */
static const uint32_t innerpad_4way[4 * 11] = {
0x80000000, 0x80000000, 0x80000000, 0x80000000,
0x00000000, 0x00000000, 0x00000000, 0x00000000,
0x00000000, 0x00000000, 0x00000000, 0x00000000,
0x00000000, 0x00000000, 0x00000000, 0x00000000,
0x00000000, 0x00000000, 0x00000000, 0x00000000,
0x00000000, 0x00000000, 0x00000000, 0x00000000,
0x00000000, 0x00000000, 0x00000000, 0x00000000,
0x00000000, 0x00000000, 0x00000000, 0x00000000,
0x00000000, 0x00000000, 0x00000000, 0x00000000,
0x00000000, 0x00000000, 0x00000000, 0x00000000,
0x000004a0, 0x000004a0, 0x000004a0, 0x000004a0
};
/*
 * Four-lane form of the 8-word outer-hash padding: one row per word, with
 * the same value in each of the four lanes (0x80000000, six zero rows,
 * 0x00000300).
 */
static const uint32_t outerpad_4way[4 * 8] = {
0x80000000, 0x80000000, 0x80000000, 0x80000000,
0x00000000, 0x00000000, 0x00000000, 0x00000000,
0x00000000, 0x00000000, 0x00000000, 0x00000000,
0x00000000, 0x00000000, 0x00000000, 0x00000000,
0x00000000, 0x00000000, 0x00000000, 0x00000000,
0x00000000, 0x00000000, 0x00000000, 0x00000000,
0x00000000, 0x00000000, 0x00000000, 0x00000000,
0x00000300, 0x00000300, 0x00000300, 0x00000300
};
/*
 * Four-lane form of the 16-word final block: one row per word, with the
 * same value in each of the four lanes (0x00000001, 0x80000000, thirteen
 * zero rows, 0x00000620).
 */
static const uint32_t finalblk_4way[4 * 16] = {
0x00000001, 0x00000001, 0x00000001, 0x00000001,
0x80000000, 0x80000000, 0x80000000, 0x80000000,
0x00000000, 0x00000000, 0x00000000, 0x00000000,
0x00000000, 0x00000000, 0x00000000, 0x00000000,
0x00000000, 0x00000000, 0x00000000, 0x00000000,
0x00000000, 0x00000000, 0x00000000, 0x00000000,
0x00000000, 0x00000000, 0x00000000, 0x00000000,
0x00000000, 0x00000000, 0x00000000, 0x00000000,
0x00000000, 0x00000000, 0x00000000, 0x00000000,
0x00000000, 0x00000000, 0x00000000, 0x00000000,
0x00000000, 0x00000000, 0x00000000, 0x00000000,
0x00000000, 0x00000000, 0x00000000, 0x00000000,
0x00000000, 0x00000000, 0x00000000, 0x00000000,
0x00000000, 0x00000000, 0x00000000, 0x00000000,
0x00000000, 0x00000000, 0x00000000, 0x00000000,
0x00000620, 0x00000620, 0x00000620, 0x00000620
};
static inline void HMAC_SHA256_80_init_4way(const uint32_t *key,
uint32_t *tstate, uint32_t *ostate)
{
uint32_t x00,x01,x02,x03,x04,x05,x06,x07,x08,x09,x10,x11,x12,x13,x14,x15;
size_t i;
uint32_t ihash[4 * 8];
uint32_t pad[4 * 16];
int i;
x00 = (B[ 0] ^= Bx[ 0]);
x01 = (B[ 1] ^= Bx[ 1]);
x02 = (B[ 2] ^= Bx[ 2]);
x03 = (B[ 3] ^= Bx[ 3]);
x04 = (B[ 4] ^= Bx[ 4]);
x05 = (B[ 5] ^= Bx[ 5]);
x06 = (B[ 6] ^= Bx[ 6]);
x07 = (B[ 7] ^= Bx[ 7]);
x08 = (B[ 8] ^= Bx[ 8]);
x09 = (B[ 9] ^= Bx[ 9]);
x10 = (B[10] ^= Bx[10]);
x11 = (B[11] ^= Bx[11]);
x12 = (B[12] ^= Bx[12]);
x13 = (B[13] ^= Bx[13]);
x14 = (B[14] ^= Bx[14]);
x15 = (B[15] ^= Bx[15]);
for (i = 0; i < 8; i += 2) {
#define R(a,b) (((a) << (b)) | ((a) >> (32 - (b))))
/* Operate on columns. */
x04 ^= R(x00+x12, 7); x09 ^= R(x05+x01, 7); x14 ^= R(x10+x06, 7); x03 ^= R(x15+x11, 7);
x08 ^= R(x04+x00, 9); x13 ^= R(x09+x05, 9); x02 ^= R(x14+x10, 9); x07 ^= R(x03+x15, 9);
x12 ^= R(x08+x04,13); x01 ^= R(x13+x09,13); x06 ^= R(x02+x14,13); x11 ^= R(x07+x03,13);
x00 ^= R(x12+x08,18); x05 ^= R(x01+x13,18); x10 ^= R(x06+x02,18); x15 ^= R(x11+x07,18);
SHA256_InitState_4way(tstate);
SHA256_Transform_4way(tstate, key, 1);
memcpy(pad, key + 4 * 16, 4 * 16);
memcpy(pad + 4 * 4, keypad_4way, 4 * 48);
SHA256_Transform_4way(tstate, pad, 1);
memcpy(ihash, tstate, 4 * 32);
/* Operate on rows. */
x01 ^= R(x00+x03, 7); x06 ^= R(x05+x04, 7); x11 ^= R(x10+x09, 7); x12 ^= R(x15+x14, 7);
x02 ^= R(x01+x00, 9); x07 ^= R(x06+x05, 9); x08 ^= R(x11+x10, 9); x13 ^= R(x12+x15, 9);
x03 ^= R(x02+x01,13); x04 ^= R(x07+x06,13); x09 ^= R(x08+x11,13); x14 ^= R(x13+x12,13);
x00 ^= R(x03+x02,18); x05 ^= R(x04+x07,18); x10 ^= R(x09+x08,18); x15 ^= R(x14+x13,18);
#undef R
}
B[ 0] += x00;
B[ 1] += x01;
B[ 2] += x02;
B[ 3] += x03;
B[ 4] += x04;
B[ 5] += x05;
B[ 6] += x06;
B[ 7] += x07;
B[ 8] += x08;
B[ 9] += x09;
B[10] += x10;
B[11] += x11;
B[12] += x12;
B[13] += x13;
B[14] += x14;
B[15] += x15;
SHA256_InitState_4way(ostate);
for (i = 0; i < 4 * 8; i++)
pad[i] = ihash[i] ^ 0x5c5c5c5c;
for (; i < 4 * 16; i++)
pad[i] = 0x5c5c5c5c;
SHA256_Transform_4way(ostate, pad, 0);
SHA256_InitState_4way(tstate);
for (i = 0; i < 4 * 8; i++)
pad[i] = ihash[i] ^ 0x36363636;
for (; i < 4 * 16; i++)
pad[i] = 0x36363636;
SHA256_Transform_4way(tstate, pad, 0);
}
/*
 * Four-lane PBKDF2 step producing 4 * 32 output words (128 bytes per
 * lane) from an 80-byte-per-lane salt.
 *
 * tstate / ostate: starting inner and outer hash states (4 * 8 words
 *                  each); both are only read.
 * salt:            4 * 20 word-interleaved input words.
 * output:          receives 4 * 32 word-interleaved, byte-swapped words.
 *
 * All buffers use the interleaved layout: word i of lane j is at index
 * 4 * i + j.
 */
static inline void PBKDF2_SHA256_80_128_4way(const uint32_t *tstate,
	const uint32_t *ostate, const uint32_t *salt, uint32_t *output)
{
	uint32_t inner_mid[4 * 8], outer_work[4 * 8];
	uint32_t inner_blk[4 * 16], outer_blk[4 * 16];
	int blk, lane;

	/* Absorb the first 64 salt bytes of every lane once, up front. */
	memcpy(inner_mid, tstate, sizeof(inner_mid));
	SHA256_Transform_4way(inner_mid, salt, 1);

	/* Constant parts of the second inner block and of the outer block. */
	byteswap_vec(inner_blk, salt + 4 * 16, 4 * 4);
	memcpy(inner_blk + 4 * 5, innerpad_4way, 4 * 44);
	memcpy(outer_blk + 4 * 8, outerpad_4way, 4 * 32);

	for (blk = 1; blk <= 4; blk++) {
		/* Row 4 of the inner block carries the block counter. */
		for (lane = 0; lane < 4; lane++)
			inner_blk[4 * 4 + lane] = blk;

		/* Inner hash: result lands in the first 8 rows of outer_blk. */
		memcpy(outer_blk, inner_mid, sizeof(inner_mid));
		SHA256_Transform_4way(outer_blk, inner_blk, 0);

		/* Outer hash over the inner digest plus padding. */
		memcpy(outer_work, ostate, sizeof(outer_work));
		SHA256_Transform_4way(outer_work, outer_blk, 0);

		byteswap_vec(output + 4 * 8 * (blk - 1), outer_work, 4 * 8);
	}
}
/*
 * Four-lane PBKDF2 step producing 4 * 8 output words (32 bytes per lane)
 * from a 128-byte-per-lane salt.
 *
 * tstate / ostate: inner and outer hash states (4 * 8 words each); both
 *                  are updated in place.
 * salt:            4 * 32 word-interleaved input words.
 * output:          receives the byte-swapped contents of ostate.
 */
static inline void PBKDF2_SHA256_128_32_4way(uint32_t *tstate,
	uint32_t *ostate, const uint32_t *salt, uint32_t *output)
{
	uint32_t outer_blk[4 * 16];
	int half;

	/* Inner hash: two salt blocks followed by the constant final block. */
	for (half = 0; half < 2; half++)
		SHA256_Transform_4way(tstate, salt + half * 4 * 16, 1);
	SHA256_Transform_4way(tstate, finalblk_4way, 0);

	/* Outer hash over the inner digest plus padding. */
	memcpy(outer_blk, tstate, 4 * 32);
	memcpy(outer_blk + 4 * 8, outerpad_4way, 4 * 32);
	SHA256_Transform_4way(ostate, outer_blk, 0);

	byteswap_vec(output, ostate, 4 * 8);
}
#endif /* SHA256_4WAY */
#if defined(__x86_64__)
@ -388,38 +421,90 @@ void scrypt_core(uint32_t *X, uint32_t *V);
#define SCRYPT_BUFFER_SIZE (131072 + 63)
/*
 * XOR the 16-word block Bx into B, run eight Salsa20 rounds (four double
 * rounds) over the result, and add the scrambled words back into B.
 * B is updated in place; Bx is only read.
 */
static inline void salsa20_8(uint32_t B[16], const uint32_t Bx[16])
{
	uint32_t x[16];
	int w, round;

	/* B ^= Bx, keeping a working copy of the mixed block in x. */
	for (w = 0; w < 16; w++)
		x[w] = (B[w] ^= Bx[w]);

#define SALSA_ROTL(v, c) (((v) << (c)) | ((v) >> (32 - (c))))
/* One quarter round; the four calls of a half round touch disjoint words. */
#define SALSA_QR(a, b, c, d) do { \
		x[b] ^= SALSA_ROTL(x[a] + x[d], 7); \
		x[c] ^= SALSA_ROTL(x[b] + x[a], 9); \
		x[d] ^= SALSA_ROTL(x[c] + x[b], 13); \
		x[a] ^= SALSA_ROTL(x[d] + x[c], 18); \
	} while (0)

	for (round = 0; round < 8; round += 2) {
		/* Column round. */
		SALSA_QR( 0,  4,  8, 12);
		SALSA_QR( 5,  9, 13,  1);
		SALSA_QR(10, 14,  2,  6);
		SALSA_QR(15,  3,  7, 11);

		/* Row round. */
		SALSA_QR( 0,  1,  2,  3);
		SALSA_QR( 5,  6,  7,  4);
		SALSA_QR(10, 11,  8,  9);
		SALSA_QR(15, 12, 13, 14);
	}

#undef SALSA_QR
#undef SALSA_ROTL

	/* Feed-forward: add the scrambled words to the XORed input. */
	for (w = 0; w < 16; w++)
		B[w] += x[w];
}
static inline void scrypt_core(uint32_t *X, uint32_t *V)
{
uint32_t i;
uint32_t j;
uint32_t k;
uint32_t i, j, k;
uint64_t *p1, *p2;
p1 = (uint64_t *)X;
for (i = 0; i < 1024; i += 2) {
for (i = 0; i < 1024; i++) {
memcpy(&V[i * 32], X, 128);
salsa20_8(&X[0], &X[16]);
salsa20_8(&X[16], &X[0]);
memcpy(&V[(i + 1) * 32], X, 128);
salsa20_8(&X[0], &X[16]);
salsa20_8(&X[16], &X[0]);
}
for (i = 0; i < 1024; i += 2) {
for (i = 0; i < 1024; i++) {
j = X[16] & 1023;
p2 = (uint64_t *)(&V[j * 32]);
for(k = 0; k < 16; k++)
for (k = 0; k < 16; k++)
p1[k] ^= p2[k];
salsa20_8(&X[0], &X[16]);
salsa20_8(&X[16], &X[0]);
j = X[16] & 1023;
p2 = (uint64_t *)(&V[j * 32]);
for(k = 0; k < 16; k++)
p1[k] ^= p2[k];
salsa20_8(&X[0], &X[16]);
salsa20_8(&X[16], &X[0]);
}
@ -427,33 +512,32 @@ static inline void scrypt_core(uint32_t *X, uint32_t *V)
#endif
unsigned char *scrypt_buffer_alloc() {
unsigned char *scrypt_buffer_alloc()
{
return malloc(SCRYPT_BUFFER_SIZE);
}
/* CPU- and memory-intensive function that transforms an 80-byte buffer into a 32-byte output;
scratchpad size needs to be at least 63 + (128 * r * p) + (256 * r + 64) + (128 * r * N) bytes
r = 1, p = 1, N = 1024
*/
static void scrypt_1024_1_1_256_sp(const uint32_t* input, uint32_t *output, unsigned char *scratchpad)
static void scrypt_1024_1_1_256_sp(const uint32_t *input, uint32_t *output,
unsigned char *scratchpad)
{
uint32_t tstate[8], ostate[8];
uint32_t *V;
uint32_t X[32];
V = (uint32_t *)(((uintptr_t)(scratchpad) + 63) & ~ (uintptr_t)(63));
PBKDF2_SHA256_80_128_init(input, tstate, ostate);
HMAC_SHA256_80_init(input, tstate, ostate);
PBKDF2_SHA256_80_128(tstate, ostate, input, X);
scrypt_core(X, V);
return PBKDF2_SHA256_80_128_32(tstate, ostate, input, X, output);
return PBKDF2_SHA256_128_32(tstate, ostate, X, output);
}
#ifdef SCRYPT_3WAY
static void scrypt_1024_1_1_256_sp_2way(const uint32_t *input1, const uint32_t *input2,
uint32_t *output1, uint32_t *output2, unsigned char *scratchpad)
static void scrypt_1024_1_1_256_sp_2way(const uint32_t *input1,
const uint32_t *input2, uint32_t *output1, uint32_t *output2,
unsigned char *scratchpad)
{
uint32_t tstate1[8], tstate2[8];
uint32_t ostate1[8], ostate2[8];
@ -461,47 +545,86 @@ static void scrypt_1024_1_1_256_sp_2way(const uint32_t *input1, const uint32_t *
uint32_t X[32], Y[32];
V = (uint32_t *)(((uintptr_t)(scratchpad) + 63) & ~ (uintptr_t)(63));
PBKDF2_SHA256_80_128_init(input1, tstate1, ostate1);
PBKDF2_SHA256_80_128_init(input2, tstate2, ostate2);
HMAC_SHA256_80_init(input1, tstate1, ostate1);
HMAC_SHA256_80_init(input2, tstate2, ostate2);
PBKDF2_SHA256_80_128(tstate1, ostate1, input1, X);
PBKDF2_SHA256_80_128(tstate2, ostate2, input2, Y);
scrypt_core_2way(X, Y, V);
PBKDF2_SHA256_80_128_32(tstate1, ostate1, input1, X, output1);
PBKDF2_SHA256_80_128_32(tstate2, ostate2, input2, Y, output2);
PBKDF2_SHA256_128_32(tstate1, ostate1, X, output1);
PBKDF2_SHA256_128_32(tstate2, ostate2, Y, output2);
}
static void scrypt_1024_1_1_256_sp_3way(const uint32_t *input1, const uint32_t *input2, const uint32_t *input3,
uint32_t *output1, uint32_t *output2, uint32_t *output3, unsigned char *scratchpad)
static void scrypt_1024_1_1_256_sp_3way(
const uint32_t *input1, const uint32_t *input2, const uint32_t *input3,
uint32_t *output1, uint32_t *output2, uint32_t *output3,
unsigned char *scratchpad)
{
uint32_t tstate1[8], tstate2[8], tstate3[8];
uint32_t ostate1[8], ostate2[8], ostate3[8];
uint32_t *V;
#ifdef SHA256_4WAY
uint32_t tstate[4 * 8], ostate[4 * 8];
uint32_t input[4 * 20], output[4 * 32];
uint32_t X[32], Y[32], Z[32];
uint32_t W[4 * 32];
uint32_t *V;
int i;
V = (uint32_t *)(((uintptr_t)(scratchpad) + 63) & ~ (uintptr_t)(63));
PBKDF2_SHA256_80_128_init(input1, tstate1, ostate1);
PBKDF2_SHA256_80_128_init(input2, tstate2, ostate2);
PBKDF2_SHA256_80_128_init(input3, tstate3, ostate3);
for (i = 0; i < 20; i++) {
input[4 * i + 0] = input1[i];
input[4 * i + 1] = input2[i];
input[4 * i + 2] = input3[i];
}
HMAC_SHA256_80_init_4way(input, tstate, ostate);
PBKDF2_SHA256_80_128_4way(tstate, ostate, input, W);
for (i = 0; i < 32; i++) {
X[i] = W[4 * i + 0];
Y[i] = W[4 * i + 1];
Z[i] = W[4 * i + 2];
}
scrypt_core_3way(X, Y, Z, V);
for (i = 0; i < 32; i++) {
W[4 * i + 0] = X[i];
W[4 * i + 1] = Y[i];
W[4 * i + 2] = Z[i];
}
PBKDF2_SHA256_128_32_4way(tstate, ostate, W, output);
for (i = 0; i < 8; i++) {
output1[i] = output[4 * i + 0];
output2[i] = output[4 * i + 1];
output3[i] = output[4 * i + 2];
}
#else
uint32_t tstate1[8], tstate2[8], tstate3[8];
uint32_t ostate1[8], ostate2[8], ostate3[8];
uint32_t X[32], Y[32], Z[32];
uint32_t *V;
V = (uint32_t *)(((uintptr_t)(scratchpad) + 63) & ~ (uintptr_t)(63));
HMAC_SHA256_80_init(input1, tstate1, ostate1);
HMAC_SHA256_80_init(input2, tstate2, ostate2);
HMAC_SHA256_80_init(input3, tstate3, ostate3);
PBKDF2_SHA256_80_128(tstate1, ostate1, input1, X);
PBKDF2_SHA256_80_128(tstate2, ostate2, input2, Y);
PBKDF2_SHA256_80_128(tstate3, ostate3, input3, Z);
scrypt_core_3way(X, Y, Z, V);
PBKDF2_SHA256_80_128_32(tstate1, ostate1, input1, X, output1);
PBKDF2_SHA256_80_128_32(tstate2, ostate2, input2, Y, output2);
PBKDF2_SHA256_80_128_32(tstate3, ostate3, input3, Z, output3);
PBKDF2_SHA256_128_32(tstate1, ostate1, X, output1);
PBKDF2_SHA256_128_32(tstate2, ostate2, Y, output2);
PBKDF2_SHA256_128_32(tstate3, ostate3, Z, output3);
#endif /* SHA256_4WAY*/
}
#endif
#endif /* SCRYPT_3WAY */
__attribute__ ((noinline)) static int test_lower_hash(const uint32_t *hash,
__attribute__ ((noinline)) static int confirm_hash(const uint32_t *hash,
const uint32_t *target)
{
int i;
for (i = 6; i >= 0; i--) {
for (i = 7; i >= 0; i--) {
uint32_t t = le32dec(&target[i]);
if (hash[i] > t)
return 0;
@ -511,8 +634,8 @@ __attribute__ ((noinline)) static int test_lower_hash(const uint32_t *hash,
return 1;
}
int scanhash_scrypt(int thr_id, unsigned char *pdata, unsigned char *scratchbuf,
const unsigned char *ptarget,
int scanhash_scrypt(int thr_id, unsigned char *pdata,
unsigned char *scratchbuf, const unsigned char *ptarget,
uint32_t max_nonce, uint32_t *next_nonce, unsigned long *hashes_done)
{
uint32_t data[20], hash[8];
@ -542,7 +665,7 @@ int scanhash_scrypt(int thr_id, unsigned char *pdata, unsigned char *scratchbuf,
if (throughput >= 3 && n <= max_nonce) {
data3[19] = n++;
scrypt_1024_1_1_256_sp_3way(data, data2, data3, hash, hash2, hash3, scratchbuf);
if (hash3[7] < Htarg || (hash3[7] == Htarg && test_lower_hash(hash3, (uint32_t *)ptarget))) {
if (hash3[7] <= Htarg && confirm_hash(hash3, (uint32_t *)ptarget)) {
be32enc(&((uint32_t *)pdata)[19], data3[19]);
*next_nonce = n;
*hashes_done = n - first_nonce;
@ -551,7 +674,7 @@ int scanhash_scrypt(int thr_id, unsigned char *pdata, unsigned char *scratchbuf,
} else {
scrypt_1024_1_1_256_sp_2way(data, data2, hash, hash2, scratchbuf);
}
if (hash2[7] < Htarg || (hash2[7] == Htarg && test_lower_hash(hash2, (uint32_t *)ptarget))) {
if (hash2[7] <= Htarg && confirm_hash(hash2, (uint32_t *)ptarget)) {
be32enc(&((uint32_t *)pdata)[19], data2[19]);
*next_nonce = n;
*hashes_done = n - first_nonce;
@ -563,7 +686,7 @@ int scanhash_scrypt(int thr_id, unsigned char *pdata, unsigned char *scratchbuf,
#else
scrypt_1024_1_1_256_sp(data, hash, scratchbuf);
#endif
if (hash[7] < Htarg || (hash[7] == Htarg && test_lower_hash(hash, (uint32_t *)ptarget))) {
if (hash[7] <= Htarg && confirm_hash(hash, (uint32_t *)ptarget)) {
be32enc(&((uint32_t *)pdata)[19], data[19]);
*next_nonce = n;
*hashes_done = n - first_nonce;
@ -575,4 +698,3 @@ int scanhash_scrypt(int thr_id, unsigned char *pdata, unsigned char *scratchbuf,
*hashes_done = n - first_nonce;
return false;
}