Add 4-way SHA-256 implementation for x86-64
This commit is contained in:
parent
1f15a1f672
commit
e560d53b76
3 changed files with 769 additions and 227 deletions
38
miner.h
38
miner.h
|
@ -119,17 +119,49 @@ static inline void swap256(void *dest_p, const void *src_p)
|
|||
dest[7] = src[0];
|
||||
}
|
||||
|
||||
/*
 * be32dec: decode a 32-bit unsigned integer stored big-endian at pp.
 *
 * pp points at four bytes; pp[0] is the most significant byte and pp[3]
 * the least significant.  The value is assembled one byte at a time, so
 * the result does not depend on the byte order of the host and pp does
 * not need to be 4-byte aligned.
 */
static inline uint32_t be32dec(const void *pp)
{
	const uint8_t *p = (uint8_t const *)pp;

	/* Each byte is widened to 32 bits before shifting so no bits are lost. */
	return ((uint32_t)(p[3]) + ((uint32_t)(p[2]) << 8) +
	    ((uint32_t)(p[1]) << 16) + ((uint32_t)(p[0]) << 24));
}
|
||||
|
||||
/*
 * be32enc: store the 32-bit value x at pp in big-endian byte order.
 *
 * Writes exactly four bytes: pp[0] receives the most significant byte of
 * x and pp[3] the least significant.  Bytes are written individually, so
 * pp does not need to be 4-byte aligned.
 */
static inline void be32enc(void *pp, uint32_t x)
{
	uint8_t *p = (uint8_t *)pp;

	p[3] = x & 0xff;
	p[2] = (x >> 8) & 0xff;
	p[1] = (x >> 16) & 0xff;
	p[0] = (x >> 24) & 0xff;
}
|
||||
|
||||
/*
 * le32dec: decode a 32-bit unsigned integer stored little-endian at pp.
 *
 * pp points at four bytes; pp[0] is the least significant byte and pp[3]
 * the most significant.  The value is assembled one byte at a time, so
 * the result does not depend on the byte order of the host and pp does
 * not need to be 4-byte aligned.
 */
static inline uint32_t le32dec(const void *pp)
{
	const uint8_t *p = (uint8_t const *)pp;

	/* Each byte is widened to 32 bits before shifting so no bits are lost. */
	return ((uint32_t)(p[0]) + ((uint32_t)(p[1]) << 8) +
	    ((uint32_t)(p[2]) << 16) + ((uint32_t)(p[3]) << 24));
}
|
||||
|
||||
/*
 * le32enc: store the 32-bit value x at pp in little-endian byte order.
 *
 * Writes exactly four bytes: pp[0] receives the least significant byte of
 * x and pp[3] the most significant.  Bytes are written individually, so
 * pp does not need to be 4-byte aligned.
 */
static inline void le32enc(void *pp, uint32_t x)
{
	uint8_t *p = (uint8_t *)pp;

	p[0] = x & 0xff;
	p[1] = (x >> 8) & 0xff;
	p[2] = (x >> 16) & 0xff;
	p[3] = (x >> 24) & 0xff;
}
|
||||
|
||||
extern bool opt_debug;
|
||||
extern bool opt_protocol;
|
||||
extern const uint32_t sha256_init_state[];
|
||||
|
||||
extern json_t *json_rpc_call(CURL *curl, const char *url, const char *userpass,
|
||||
const char *rpc_req, bool, bool, int *);
|
||||
extern char *bin2hex(const unsigned char *p, size_t len);
|
||||
extern bool hex2bin(unsigned char *p, const char *hexstr, size_t len);
|
||||
|
||||
extern unsigned char *scrypt_buffer_alloc();
|
||||
extern int scanhash_scrypt(int, unsigned char *pdata, unsigned char *scratchbuf,
|
||||
const unsigned char *ptarget,
|
||||
extern int scanhash_scrypt(int thr_id, unsigned char *pdata,
|
||||
unsigned char *scratchbuf, const unsigned char *ptarget,
|
||||
uint32_t max_nonce, uint32_t *next_nonce, unsigned long *hashes_done);
|
||||
|
||||
extern int
|
||||
|
|
402
scrypt-x64.S
402
scrypt-x64.S
|
@ -27,6 +27,394 @@
|
|||
#endif
|
||||
|
||||
#if defined(__x86_64__)
|
||||
.data
|
||||
.p2align 6
|
||||
sha256_4h:
|
||||
.long 0x6a09e667, 0x6a09e667, 0x6a09e667, 0x6a09e667
|
||||
.long 0xbb67ae85, 0xbb67ae85, 0xbb67ae85, 0xbb67ae85
|
||||
.long 0x3c6ef372, 0x3c6ef372, 0x3c6ef372, 0x3c6ef372
|
||||
.long 0xa54ff53a, 0xa54ff53a, 0xa54ff53a, 0xa54ff53a
|
||||
.long 0x510e527f, 0x510e527f, 0x510e527f, 0x510e527f
|
||||
.long 0x9b05688c, 0x9b05688c, 0x9b05688c, 0x9b05688c
|
||||
.long 0x1f83d9ab, 0x1f83d9ab, 0x1f83d9ab, 0x1f83d9ab
|
||||
.long 0x5be0cd19, 0x5be0cd19, 0x5be0cd19, 0x5be0cd19
|
||||
|
||||
.data
|
||||
.p2align 6
|
||||
sha256_4k:
|
||||
.long 0x428a2f98, 0x428a2f98, 0x428a2f98, 0x428a2f98
|
||||
.long 0x71374491, 0x71374491, 0x71374491, 0x71374491
|
||||
.long 0xb5c0fbcf, 0xb5c0fbcf, 0xb5c0fbcf, 0xb5c0fbcf
|
||||
.long 0xe9b5dba5, 0xe9b5dba5, 0xe9b5dba5, 0xe9b5dba5
|
||||
.long 0x3956c25b, 0x3956c25b, 0x3956c25b, 0x3956c25b
|
||||
.long 0x59f111f1, 0x59f111f1, 0x59f111f1, 0x59f111f1
|
||||
.long 0x923f82a4, 0x923f82a4, 0x923f82a4, 0x923f82a4
|
||||
.long 0xab1c5ed5, 0xab1c5ed5, 0xab1c5ed5, 0xab1c5ed5
|
||||
.long 0xd807aa98, 0xd807aa98, 0xd807aa98, 0xd807aa98
|
||||
.long 0x12835b01, 0x12835b01, 0x12835b01, 0x12835b01
|
||||
.long 0x243185be, 0x243185be, 0x243185be, 0x243185be
|
||||
.long 0x550c7dc3, 0x550c7dc3, 0x550c7dc3, 0x550c7dc3
|
||||
.long 0x72be5d74, 0x72be5d74, 0x72be5d74, 0x72be5d74
|
||||
.long 0x80deb1fe, 0x80deb1fe, 0x80deb1fe, 0x80deb1fe
|
||||
.long 0x9bdc06a7, 0x9bdc06a7, 0x9bdc06a7, 0x9bdc06a7
|
||||
.long 0xc19bf174, 0xc19bf174, 0xc19bf174, 0xc19bf174
|
||||
.long 0xe49b69c1, 0xe49b69c1, 0xe49b69c1, 0xe49b69c1
|
||||
.long 0xefbe4786, 0xefbe4786, 0xefbe4786, 0xefbe4786
|
||||
.long 0x0fc19dc6, 0x0fc19dc6, 0x0fc19dc6, 0x0fc19dc6
|
||||
.long 0x240ca1cc, 0x240ca1cc, 0x240ca1cc, 0x240ca1cc
|
||||
.long 0x2de92c6f, 0x2de92c6f, 0x2de92c6f, 0x2de92c6f
|
||||
.long 0x4a7484aa, 0x4a7484aa, 0x4a7484aa, 0x4a7484aa
|
||||
.long 0x5cb0a9dc, 0x5cb0a9dc, 0x5cb0a9dc, 0x5cb0a9dc
|
||||
.long 0x76f988da, 0x76f988da, 0x76f988da, 0x76f988da
|
||||
.long 0x983e5152, 0x983e5152, 0x983e5152, 0x983e5152
|
||||
.long 0xa831c66d, 0xa831c66d, 0xa831c66d, 0xa831c66d
|
||||
.long 0xb00327c8, 0xb00327c8, 0xb00327c8, 0xb00327c8
|
||||
.long 0xbf597fc7, 0xbf597fc7, 0xbf597fc7, 0xbf597fc7
|
||||
.long 0xc6e00bf3, 0xc6e00bf3, 0xc6e00bf3, 0xc6e00bf3
|
||||
.long 0xd5a79147, 0xd5a79147, 0xd5a79147, 0xd5a79147
|
||||
.long 0x06ca6351, 0x06ca6351, 0x06ca6351, 0x06ca6351
|
||||
.long 0x14292967, 0x14292967, 0x14292967, 0x14292967
|
||||
.long 0x27b70a85, 0x27b70a85, 0x27b70a85, 0x27b70a85
|
||||
.long 0x2e1b2138, 0x2e1b2138, 0x2e1b2138, 0x2e1b2138
|
||||
.long 0x4d2c6dfc, 0x4d2c6dfc, 0x4d2c6dfc, 0x4d2c6dfc
|
||||
.long 0x53380d13, 0x53380d13, 0x53380d13, 0x53380d13
|
||||
.long 0x650a7354, 0x650a7354, 0x650a7354, 0x650a7354
|
||||
.long 0x766a0abb, 0x766a0abb, 0x766a0abb, 0x766a0abb
|
||||
.long 0x81c2c92e, 0x81c2c92e, 0x81c2c92e, 0x81c2c92e
|
||||
.long 0x92722c85, 0x92722c85, 0x92722c85, 0x92722c85
|
||||
.long 0xa2bfe8a1, 0xa2bfe8a1, 0xa2bfe8a1, 0xa2bfe8a1
|
||||
.long 0xa81a664b, 0xa81a664b, 0xa81a664b, 0xa81a664b
|
||||
.long 0xc24b8b70, 0xc24b8b70, 0xc24b8b70, 0xc24b8b70
|
||||
.long 0xc76c51a3, 0xc76c51a3, 0xc76c51a3, 0xc76c51a3
|
||||
.long 0xd192e819, 0xd192e819, 0xd192e819, 0xd192e819
|
||||
.long 0xd6990624, 0xd6990624, 0xd6990624, 0xd6990624
|
||||
.long 0xf40e3585, 0xf40e3585, 0xf40e3585, 0xf40e3585
|
||||
.long 0x106aa070, 0x106aa070, 0x106aa070, 0x106aa070
|
||||
.long 0x19a4c116, 0x19a4c116, 0x19a4c116, 0x19a4c116
|
||||
.long 0x1e376c08, 0x1e376c08, 0x1e376c08, 0x1e376c08
|
||||
.long 0x2748774c, 0x2748774c, 0x2748774c, 0x2748774c
|
||||
.long 0x34b0bcb5, 0x34b0bcb5, 0x34b0bcb5, 0x34b0bcb5
|
||||
.long 0x391c0cb3, 0x391c0cb3, 0x391c0cb3, 0x391c0cb3
|
||||
.long 0x4ed8aa4a, 0x4ed8aa4a, 0x4ed8aa4a, 0x4ed8aa4a
|
||||
.long 0x5b9cca4f, 0x5b9cca4f, 0x5b9cca4f, 0x5b9cca4f
|
||||
.long 0x682e6ff3, 0x682e6ff3, 0x682e6ff3, 0x682e6ff3
|
||||
.long 0x748f82ee, 0x748f82ee, 0x748f82ee, 0x748f82ee
|
||||
.long 0x78a5636f, 0x78a5636f, 0x78a5636f, 0x78a5636f
|
||||
.long 0x84c87814, 0x84c87814, 0x84c87814, 0x84c87814
|
||||
.long 0x8cc70208, 0x8cc70208, 0x8cc70208, 0x8cc70208
|
||||
.long 0x90befffa, 0x90befffa, 0x90befffa, 0x90befffa
|
||||
.long 0xa4506ceb, 0xa4506ceb, 0xa4506ceb, 0xa4506ceb
|
||||
.long 0xbef9a3f7, 0xbef9a3f7, 0xbef9a3f7, 0xbef9a3f7
|
||||
.long 0xc67178f2, 0xc67178f2, 0xc67178f2, 0xc67178f2
|
||||
|
||||
.text
|
||||
.p2align 5
|
||||
.globl SHA256_InitState_4way
|
||||
.globl _SHA256_InitState_4way
|
||||
SHA256_InitState_4way:
|
||||
_SHA256_InitState_4way:
|
||||
#if defined(WIN64)
|
||||
pushq %rdi
|
||||
movq %rcx, %rdi
|
||||
#endif
|
||||
movdqa sha256_4h+0, %xmm0
|
||||
movdqa sha256_4h+16, %xmm1
|
||||
movdqa sha256_4h+32, %xmm2
|
||||
movdqa sha256_4h+48, %xmm3
|
||||
movdqu %xmm0, 0(%rdi)
|
||||
movdqu %xmm1, 16(%rdi)
|
||||
movdqu %xmm2, 32(%rdi)
|
||||
movdqu %xmm3, 48(%rdi)
|
||||
movdqa sha256_4h+64, %xmm0
|
||||
movdqa sha256_4h+80, %xmm1
|
||||
movdqa sha256_4h+96, %xmm2
|
||||
movdqa sha256_4h+112, %xmm3
|
||||
movdqu %xmm0, 64(%rdi)
|
||||
movdqu %xmm1, 80(%rdi)
|
||||
movdqu %xmm2, 96(%rdi)
|
||||
movdqu %xmm3, 112(%rdi)
|
||||
#if defined(WIN64)
|
||||
popq %rdi
|
||||
#endif
|
||||
ret
|
||||
|
||||
/*
 * p2bswap_rsi_rsp i
 *
 * Loads the 16-byte vectors at index i and i+1 from the buffer at %rsi,
 * reverses the byte order inside every 32-bit element, and stores the
 * results to slots i and i+1 of the buffer at %rsp.
 *
 * The swap is done in two steps:
 *   1. pshuflw/pshufhw with $0xb1 exchange the two 16-bit halves of each
 *      32-bit element;
 *   2. psrlw/psllw by 8 combined with pxor exchange the two bytes of each
 *      16-bit half.
 *
 * Loads are unaligned (movdqu); stores use movdqa, so %rsp must be
 * 16-byte aligned.  Clobbers %xmm0-%xmm3.
 */
.macro p2bswap_rsi_rsp i
	movdqu	\i*16(%rsi), %xmm0
	movdqu	(\i+1)*16(%rsi), %xmm2
	pshuflw	$0xb1, %xmm0, %xmm0
	pshuflw	$0xb1, %xmm2, %xmm2
	pshufhw	$0xb1, %xmm0, %xmm0
	pshufhw	$0xb1, %xmm2, %xmm2
	movdqa	%xmm0, %xmm1
	movdqa	%xmm2, %xmm3
	psrlw	$8, %xmm1
	psrlw	$8, %xmm3
	psllw	$8, %xmm0
	psllw	$8, %xmm2
	pxor	%xmm1, %xmm0
	pxor	%xmm3, %xmm2
	movdqa	%xmm0, \i*16(%rsp)
	movdqa	%xmm2, (\i+1)*16(%rsp)
.endm
|
||||
|
||||
.text
|
||||
.p2align 5
|
||||
.globl SHA256_Transform_4way
|
||||
.globl _SHA256_Transform_4way
|
||||
SHA256_Transform_4way:
|
||||
_SHA256_Transform_4way:
|
||||
#if defined(WIN64)
|
||||
pushq %rdi
|
||||
subq $96, %rsp
|
||||
movdqa %xmm6, 0(%rsp)
|
||||
movdqa %xmm7, 16(%rsp)
|
||||
movdqa %xmm8, 32(%rsp)
|
||||
movdqa %xmm9, 48(%rsp)
|
||||
movdqa %xmm10, 64(%rsp)
|
||||
movdqa %xmm11, 80(%rsp)
|
||||
pushq %rsi
|
||||
movq %rcx, %rdi
|
||||
movq %rdx, %rsi
|
||||
movq %r8, %rdx
|
||||
#endif
|
||||
subq $1032, %rsp
|
||||
|
||||
testq %rdx, %rdx
|
||||
jz sha256_transform_4way_block_copy
|
||||
|
||||
p2bswap_rsi_rsp 0
|
||||
p2bswap_rsi_rsp 2
|
||||
p2bswap_rsi_rsp 4
|
||||
p2bswap_rsi_rsp 6
|
||||
p2bswap_rsi_rsp 8
|
||||
p2bswap_rsi_rsp 10
|
||||
p2bswap_rsi_rsp 12
|
||||
p2bswap_rsi_rsp 14
|
||||
jmp sha256_transform_4way_extend
|
||||
|
||||
sha256_transform_4way_block_copy:
|
||||
movdqu 0*16(%rsi), %xmm0
|
||||
movdqu 1*16(%rsi), %xmm1
|
||||
movdqu 2*16(%rsi), %xmm2
|
||||
movdqu 3*16(%rsi), %xmm3
|
||||
movdqu 4*16(%rsi), %xmm4
|
||||
movdqu 5*16(%rsi), %xmm5
|
||||
movdqu 6*16(%rsi), %xmm6
|
||||
movdqu 7*16(%rsi), %xmm7
|
||||
movdqa %xmm0, 0*16(%rsp)
|
||||
movdqa %xmm1, 1*16(%rsp)
|
||||
movdqa %xmm2, 2*16(%rsp)
|
||||
movdqa %xmm3, 3*16(%rsp)
|
||||
movdqa %xmm4, 4*16(%rsp)
|
||||
movdqa %xmm5, 5*16(%rsp)
|
||||
movdqa %xmm6, 6*16(%rsp)
|
||||
movdqa %xmm7, 7*16(%rsp)
|
||||
movdqu 8*16(%rsi), %xmm0
|
||||
movdqu 9*16(%rsi), %xmm1
|
||||
movdqu 10*16(%rsi), %xmm2
|
||||
movdqu 11*16(%rsi), %xmm3
|
||||
movdqu 12*16(%rsi), %xmm4
|
||||
movdqu 13*16(%rsi), %xmm5
|
||||
movdqu 14*16(%rsi), %xmm6
|
||||
movdqu 15*16(%rsi), %xmm7
|
||||
movdqa %xmm0, 8*16(%rsp)
|
||||
movdqa %xmm1, 9*16(%rsp)
|
||||
movdqa %xmm2, 10*16(%rsp)
|
||||
movdqa %xmm3, 11*16(%rsp)
|
||||
movdqa %xmm4, 12*16(%rsp)
|
||||
movdqa %xmm5, 13*16(%rsp)
|
||||
movdqa %xmm6, 14*16(%rsp)
|
||||
movdqa %xmm7, 15*16(%rsp)
|
||||
|
||||
sha256_transform_4way_extend:
|
||||
leaq 256(%rsp), %rcx
|
||||
leaq 48*16(%rcx), %rax
|
||||
sha256_transform_4way_extend_loop:
|
||||
movdqa -15*16(%rcx), %xmm0
|
||||
movdqa -14*16(%rcx), %xmm4
|
||||
movdqa %xmm0, %xmm2
|
||||
movdqa %xmm4, %xmm6
|
||||
psrld $3, %xmm0
|
||||
psrld $3, %xmm4
|
||||
movdqa %xmm0, %xmm1
|
||||
movdqa %xmm4, %xmm5
|
||||
pslld $14, %xmm2
|
||||
pslld $14, %xmm6
|
||||
psrld $4, %xmm1
|
||||
psrld $4, %xmm5
|
||||
pxor %xmm1, %xmm0
|
||||
pxor %xmm5, %xmm4
|
||||
psrld $11, %xmm1
|
||||
psrld $11, %xmm5
|
||||
pxor %xmm2, %xmm0
|
||||
pxor %xmm6, %xmm4
|
||||
pslld $11, %xmm2
|
||||
pslld $11, %xmm6
|
||||
pxor %xmm1, %xmm0
|
||||
pxor %xmm5, %xmm4
|
||||
pxor %xmm2, %xmm0
|
||||
pxor %xmm6, %xmm4
|
||||
|
||||
movdqa -2*16(%rcx), %xmm3
|
||||
movdqa -1*16(%rcx), %xmm7
|
||||
paddd -16*16(%rcx), %xmm0
|
||||
paddd -15*16(%rcx), %xmm4
|
||||
|
||||
movdqa %xmm3, %xmm2
|
||||
movdqa %xmm7, %xmm6
|
||||
psrld $10, %xmm3
|
||||
psrld $10, %xmm7
|
||||
movdqa %xmm3, %xmm1
|
||||
movdqa %xmm7, %xmm5
|
||||
|
||||
paddd -7*16(%rcx), %xmm0
|
||||
|
||||
pslld $13, %xmm2
|
||||
pslld $13, %xmm6
|
||||
psrld $7, %xmm1
|
||||
psrld $7, %xmm5
|
||||
|
||||
paddd -6*16(%rcx), %xmm4
|
||||
|
||||
pxor %xmm1, %xmm3
|
||||
pxor %xmm5, %xmm7
|
||||
psrld $2, %xmm1
|
||||
psrld $2, %xmm5
|
||||
pxor %xmm2, %xmm3
|
||||
pxor %xmm6, %xmm7
|
||||
pslld $2, %xmm2
|
||||
pslld $2, %xmm6
|
||||
pxor %xmm1, %xmm3
|
||||
pxor %xmm5, %xmm7
|
||||
pxor %xmm2, %xmm3
|
||||
pxor %xmm6, %xmm7
|
||||
|
||||
paddd %xmm3, %xmm0
|
||||
paddd %xmm7, %xmm4
|
||||
movdqa %xmm0, (%rcx)
|
||||
movdqa %xmm4, 16(%rcx)
|
||||
addq $2*16, %rcx
|
||||
cmpq %rcx, %rax
|
||||
jne sha256_transform_4way_extend_loop
|
||||
|
||||
movdqu 0(%rdi), %xmm7
|
||||
movdqu 16(%rdi), %xmm5
|
||||
movdqu 32(%rdi), %xmm4
|
||||
movdqu 48(%rdi), %xmm3
|
||||
movdqu 64(%rdi), %xmm0
|
||||
movdqu 80(%rdi), %xmm8
|
||||
movdqu 96(%rdi), %xmm9
|
||||
movdqu 112(%rdi), %xmm10
|
||||
|
||||
xorq %rax, %rax
|
||||
sha256_transform_4way_main_loop:
|
||||
movdqa (%rsp, %rax), %xmm6
|
||||
paddd sha256_4k(%rax), %xmm6
|
||||
paddd %xmm10, %xmm6
|
||||
|
||||
movdqa %xmm0, %xmm1
|
||||
movdqa %xmm9, %xmm2
|
||||
pandn %xmm2, %xmm1
|
||||
|
||||
movdqa %xmm2, %xmm10
|
||||
movdqa %xmm8, %xmm2
|
||||
movdqa %xmm2, %xmm9
|
||||
|
||||
pand %xmm0, %xmm2
|
||||
pxor %xmm2, %xmm1
|
||||
movdqa %xmm0, %xmm8
|
||||
|
||||
paddd %xmm1, %xmm6
|
||||
|
||||
movdqa %xmm0, %xmm1
|
||||
psrld $6, %xmm0
|
||||
movdqa %xmm0, %xmm2
|
||||
pslld $7, %xmm1
|
||||
psrld $5, %xmm2
|
||||
pxor %xmm1, %xmm0
|
||||
pxor %xmm2, %xmm0
|
||||
pslld $14, %xmm1
|
||||
psrld $14, %xmm2
|
||||
pxor %xmm1, %xmm0
|
||||
pxor %xmm2, %xmm0
|
||||
pslld $5, %xmm1
|
||||
pxor %xmm1, %xmm0
|
||||
paddd %xmm0, %xmm6
|
||||
|
||||
movdqa %xmm3, %xmm0
|
||||
paddd %xmm6, %xmm0
|
||||
|
||||
movdqa %xmm5, %xmm1
|
||||
movdqa %xmm4, %xmm3
|
||||
movdqa %xmm4, %xmm2
|
||||
pand %xmm5, %xmm2
|
||||
pand %xmm7, %xmm4
|
||||
pand %xmm7, %xmm1
|
||||
pxor %xmm4, %xmm1
|
||||
movdqa %xmm5, %xmm4
|
||||
movdqa %xmm7, %xmm5
|
||||
pxor %xmm2, %xmm1
|
||||
paddd %xmm1, %xmm6
|
||||
|
||||
movdqa %xmm7, %xmm2
|
||||
psrld $2, %xmm7
|
||||
movdqa %xmm7, %xmm1
|
||||
pslld $10, %xmm2
|
||||
psrld $11, %xmm1
|
||||
pxor %xmm2, %xmm7
|
||||
pxor %xmm1, %xmm7
|
||||
pslld $9, %xmm2
|
||||
psrld $9, %xmm1
|
||||
pxor %xmm2, %xmm7
|
||||
pxor %xmm1, %xmm7
|
||||
pslld $11, %xmm2
|
||||
pxor %xmm2, %xmm7
|
||||
paddd %xmm6, %xmm7
|
||||
|
||||
addq $16, %rax
|
||||
cmpq $16*64, %rax
|
||||
jne sha256_transform_4way_main_loop
|
||||
|
||||
movdqu 0(%rdi), %xmm2
|
||||
movdqu 16(%rdi), %xmm6
|
||||
movdqu 32(%rdi), %xmm11
|
||||
movdqu 48(%rdi), %xmm1
|
||||
paddd %xmm2, %xmm7
|
||||
paddd %xmm6, %xmm5
|
||||
paddd %xmm11, %xmm4
|
||||
paddd %xmm1, %xmm3
|
||||
movdqu 64(%rdi), %xmm2
|
||||
movdqu 80(%rdi), %xmm6
|
||||
movdqu 96(%rdi), %xmm11
|
||||
movdqu 112(%rdi), %xmm1
|
||||
paddd %xmm2, %xmm0
|
||||
paddd %xmm6, %xmm8
|
||||
paddd %xmm11, %xmm9
|
||||
paddd %xmm1, %xmm10
|
||||
|
||||
movdqu %xmm7, 0(%rdi)
|
||||
movdqu %xmm5, 16(%rdi)
|
||||
movdqu %xmm4, 32(%rdi)
|
||||
movdqu %xmm3, 48(%rdi)
|
||||
movdqu %xmm0, 64(%rdi)
|
||||
movdqu %xmm8, 80(%rdi)
|
||||
movdqu %xmm9, 96(%rdi)
|
||||
movdqu %xmm10, 112(%rdi)
|
||||
|
||||
addq $1032, %rsp
|
||||
#if defined(WIN64)
|
||||
popq %rsi
|
||||
movdqa 0(%rsp), %xmm6
|
||||
movdqa 16(%rsp), %xmm7
|
||||
movdqa 32(%rsp), %xmm8
|
||||
movdqa 48(%rsp), %xmm9
|
||||
movdqa 64(%rsp), %xmm10
|
||||
movdqa 80(%rsp), %xmm11
|
||||
addq $96, %rsp
|
||||
popq %rdi
|
||||
#endif
|
||||
ret
|
||||
|
||||
|
||||
.macro scrypt_shuffle src, so, dest, do
|
||||
movl \so+60(\src), %r8d
|
||||
|
@ -187,7 +575,7 @@
|
|||
.endm
|
||||
|
||||
.text
|
||||
.align 32
|
||||
.p2align 5
|
||||
gen_salsa8_core:
|
||||
# 0: %rdx, %rdi, %rcx, %rsi
|
||||
movq 8(%rsp), %rdi
|
||||
|
@ -286,7 +674,7 @@ gen_salsa8_core:
|
|||
|
||||
|
||||
.text
|
||||
.align 32
|
||||
.p2align 5
|
||||
.globl scrypt_core
|
||||
.globl _scrypt_core
|
||||
scrypt_core:
|
||||
|
@ -559,7 +947,7 @@ gen_scrypt_core_loop2:
|
|||
xmm_salsa8_core_doubleround
|
||||
.endm
|
||||
|
||||
.align 32
|
||||
.p2align 5
|
||||
xmm_scrypt_core:
|
||||
# shuffle 1st block into %xmm8-%xmm11
|
||||
movl 60(%rdi), %edx
|
||||
|
@ -871,7 +1259,7 @@ xmm_scrypt_core_loop2:
|
|||
|
||||
|
||||
.text
|
||||
.align 32
|
||||
.p2align 5
|
||||
.globl scrypt_best_throughput
|
||||
.globl _scrypt_best_throughput
|
||||
scrypt_best_throughput:
|
||||
|
@ -1040,7 +1428,7 @@ scrypt_best_throughput_exit:
|
|||
|
||||
|
||||
.text
|
||||
.align 32
|
||||
.p2align 5
|
||||
.globl scrypt_core_2way
|
||||
.globl _scrypt_core_2way
|
||||
scrypt_core_2way:
|
||||
|
@ -1509,7 +1897,7 @@ scrypt_core_2way_loop2:
|
|||
.endm
|
||||
|
||||
.text
|
||||
.align 32
|
||||
.p2align 5
|
||||
.globl scrypt_core_3way
|
||||
.globl _scrypt_core_3way
|
||||
scrypt_core_3way:
|
||||
|
@ -1694,7 +2082,7 @@ scrypt_core_3way_loop1:
|
|||
jne scrypt_core_3way_loop1
|
||||
|
||||
movq $1024, %r8
|
||||
.align 16
|
||||
.p2align 4
|
||||
scrypt_core_3way_loop2:
|
||||
movl 64(%rsp), %ebp
|
||||
andl $1023, %ebp
|
||||
|
|
556
scrypt.c
556
scrypt.c
|
@ -1,5 +1,5 @@
|
|||
/*-
|
||||
* Copyright 2009 Colin Percival, 2011 ArtForz, 2011 pooler
|
||||
* Copyright 2009 Colin Percival, 2011 ArtForz, 2011-2012 pooler
|
||||
* All rights reserved.
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
|
@ -34,58 +34,30 @@
|
|||
#include <stdint.h>
|
||||
#include <string.h>
|
||||
|
||||
#define byteswap(x) ((((x) << 24) & 0xff000000u) | (((x) << 8) & 0x00ff0000u) | (((x) >> 8) & 0x0000ff00u) | (((x) >> 24) & 0x000000ffu))
|
||||
#define byteswap(x) ((((x) << 24) & 0xff000000u) | (((x) << 8) & 0x00ff0000u) \
|
||||
| (((x) >> 8) & 0x0000ff00u) | (((x) >> 24) & 0x000000ffu))
|
||||
|
||||
static inline void
|
||||
byteswap_vec(uint32_t *dest, const uint32_t *src, uint32_t len)
|
||||
static inline void byteswap_vec(uint32_t *dest, const uint32_t *src, int len)
|
||||
{
|
||||
uint32_t i;
|
||||
|
||||
int i;
|
||||
for (i = 0; i < len; i++)
|
||||
dest[i] = byteswap(src[i]);
|
||||
}
|
||||
|
||||
static inline uint32_t be32dec(const void *pp)
|
||||
|
||||
static inline void SHA256_InitState(uint32_t *state)
|
||||
{
|
||||
const uint8_t *p = (uint8_t const *)pp;
|
||||
|
||||
return ((uint32_t)(p[3]) + ((uint32_t)(p[2]) << 8) +
|
||||
((uint32_t)(p[1]) << 16) + ((uint32_t)(p[0]) << 24));
|
||||
/* Magic initialization constants */
|
||||
state[0] = 0x6A09E667;
|
||||
state[1] = 0xBB67AE85;
|
||||
state[2] = 0x3C6EF372;
|
||||
state[3] = 0xA54FF53A;
|
||||
state[4] = 0x510E527F;
|
||||
state[5] = 0x9B05688C;
|
||||
state[6] = 0x1F83D9AB;
|
||||
state[7] = 0x5BE0CD19;
|
||||
}
|
||||
|
||||
static inline void be32enc(void *pp, uint32_t x)
|
||||
{
|
||||
uint8_t * p = (uint8_t *)pp;
|
||||
|
||||
p[3] = x & 0xff;
|
||||
p[2] = (x >> 8) & 0xff;
|
||||
p[1] = (x >> 16) & 0xff;
|
||||
p[0] = (x >> 24) & 0xff;
|
||||
}
|
||||
|
||||
static inline uint32_t le32dec(const void *pp)
|
||||
{
|
||||
const uint8_t *p = (uint8_t const *)pp;
|
||||
|
||||
return ((uint32_t)(p[0]) + ((uint32_t)(p[1]) << 8) +
|
||||
((uint32_t)(p[2]) << 16) + ((uint32_t)(p[3]) << 24));
|
||||
}
|
||||
|
||||
static inline void le32enc(void *pp, uint32_t x)
|
||||
{
|
||||
uint8_t * p = (uint8_t *)pp;
|
||||
|
||||
p[0] = x & 0xff;
|
||||
p[1] = (x >> 8) & 0xff;
|
||||
p[2] = (x >> 16) & 0xff;
|
||||
p[3] = (x >> 24) & 0xff;
|
||||
}
|
||||
|
||||
typedef struct SHA256Context {
|
||||
uint32_t state[8];
|
||||
uint32_t buf[16];
|
||||
} SHA256_CTX;
|
||||
|
||||
/* Elementary functions used by SHA256 */
|
||||
#define Ch(x, y, z) ((x & (y ^ z)) ^ z)
|
||||
#define Maj(x, y, z) ((x & (y | z)) | (y & z))
|
||||
|
@ -115,8 +87,7 @@ typedef struct SHA256Context {
|
|||
* SHA256 block compression function. The 256-bit state is transformed via
|
||||
* the 512-bit input block to produce a new state.
|
||||
*/
|
||||
static void
|
||||
SHA256_Transform(uint32_t * state, const uint32_t block[16], int swap)
|
||||
static void SHA256_Transform(uint32_t *state, const uint32_t *block, int swap)
|
||||
{
|
||||
uint32_t W[64];
|
||||
uint32_t S[8];
|
||||
|
@ -124,12 +95,12 @@ SHA256_Transform(uint32_t * state, const uint32_t block[16], int swap)
|
|||
int i;
|
||||
|
||||
/* 1. Prepare message schedule W. */
|
||||
if(swap)
|
||||
if (swap)
|
||||
byteswap_vec(W, block, 16);
|
||||
else
|
||||
memcpy(W, block, 64);
|
||||
for (i = 16; i < 64; i += 2) {
|
||||
W[i] = s1(W[i - 2]) + W[i - 7] + s0(W[i - 15]) + W[i - 16];
|
||||
W[i] = s1(W[i - 2]) + W[i - 7] + s0(W[i - 15]) + W[i - 16];
|
||||
W[i+1] = s1(W[i - 1]) + W[i - 6] + s0(W[i - 14]) + W[i - 15];
|
||||
}
|
||||
|
||||
|
@ -207,34 +178,37 @@ SHA256_Transform(uint32_t * state, const uint32_t block[16], int swap)
|
|||
state[i] += S[i];
|
||||
}
|
||||
|
||||
static inline void
|
||||
SHA256_InitState(uint32_t * state)
|
||||
{
|
||||
/* Magic initialization constants */
|
||||
state[0] = 0x6A09E667;
|
||||
state[1] = 0xBB67AE85;
|
||||
state[2] = 0x3C6EF372;
|
||||
state[3] = 0xA54FF53A;
|
||||
state[4] = 0x510E527F;
|
||||
state[5] = 0x9B05688C;
|
||||
state[6] = 0x1F83D9AB;
|
||||
state[7] = 0x5BE0CD19;
|
||||
}
|
||||
#if defined(__x86_64__)
|
||||
#define SHA256_4WAY
|
||||
void SHA256_Transform_4way(uint32_t *state, const uint32_t *block, int swap);
|
||||
void SHA256_InitState_4way(uint32_t *state);
|
||||
#endif
|
||||
|
||||
static const uint32_t passwdpad[12] = {0x00000080, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0x80020000};
|
||||
static const uint32_t outerpad[8] = {0x80000000, 0, 0, 0, 0, 0, 0, 0x00000300};
|
||||
|
||||
static inline void
|
||||
PBKDF2_SHA256_80_128_init(const uint32_t *passwd, uint32_t tstate[8], uint32_t ostate[8])
|
||||
static const uint32_t keypad[12] = {
|
||||
0x00000080, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0x80020000
|
||||
};
|
||||
static const uint32_t innerpad[11] = {
|
||||
0x80000000, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0x000004a0
|
||||
};
|
||||
static const uint32_t outerpad[8] = {
|
||||
0x80000000, 0, 0, 0, 0, 0, 0, 0x00000300
|
||||
};
|
||||
static const uint32_t finalblk[16] = {
|
||||
0x00000001, 0x80000000, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0x00000620
|
||||
};
|
||||
|
||||
static inline void HMAC_SHA256_80_init(const uint32_t *key,
|
||||
uint32_t *tstate, uint32_t *ostate)
|
||||
{
|
||||
uint32_t ihash[8];
|
||||
uint32_t pad[16];
|
||||
uint32_t i;
|
||||
int i;
|
||||
|
||||
SHA256_InitState(tstate);
|
||||
SHA256_Transform(tstate, passwd, 1);
|
||||
memcpy(pad, passwd+16, 16);
|
||||
memcpy(pad+4, passwdpad, 48);
|
||||
SHA256_Transform(tstate, key, 1);
|
||||
memcpy(pad, key + 16, 16);
|
||||
memcpy(pad + 4, keypad, 48);
|
||||
SHA256_Transform(tstate, pad, 1);
|
||||
memcpy(ihash, tstate, 32);
|
||||
|
||||
|
@ -253,120 +227,179 @@ PBKDF2_SHA256_80_128_init(const uint32_t *passwd, uint32_t tstate[8], uint32_t o
|
|||
SHA256_Transform(tstate, pad, 0);
|
||||
}
|
||||
|
||||
/**
|
||||
* PBKDF2_SHA256(passwd, passwdlen, salt, saltlen, c, buf, dkLen):
|
||||
* Compute PBKDF2(passwd, salt, c, dkLen) using HMAC-SHA256 as the PRF, and
|
||||
* write the output to buf.
|
||||
*/
|
||||
static inline void
|
||||
PBKDF2_SHA256_80_128(const uint32_t *tstate, const uint32_t *ostate, const uint32_t *passwd, uint32_t *buf)
|
||||
static inline void PBKDF2_SHA256_80_128(const uint32_t *tstate,
|
||||
const uint32_t *ostate, const uint32_t *salt, uint32_t *output)
|
||||
{
|
||||
static const uint32_t innerpad[11] = {0x00000080, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0xa0040000};
|
||||
SHA256_CTX PShictx, PShoctx;
|
||||
uint32_t i;
|
||||
|
||||
/* If Klen > 64, the key is really SHA256(K). */
|
||||
memcpy(PShictx.state, tstate, 32);
|
||||
memcpy(PShoctx.state, ostate, 32);
|
||||
|
||||
memcpy(PShoctx.buf+8, outerpad, 32);
|
||||
uint32_t istate[8], ostate2[8];
|
||||
uint32_t ibuf[16], obuf[16];
|
||||
int i;
|
||||
|
||||
SHA256_Transform(PShictx.state, passwd, 1);
|
||||
byteswap_vec(PShictx.buf, passwd+16, 4);
|
||||
byteswap_vec(PShictx.buf+5, innerpad, 11);
|
||||
memcpy(istate, tstate, 32);
|
||||
SHA256_Transform(istate, salt, 1);
|
||||
|
||||
byteswap_vec(ibuf, salt + 16, 4);
|
||||
memcpy(ibuf + 5, innerpad, 44);
|
||||
memcpy(obuf + 8, outerpad, 32);
|
||||
|
||||
/* Iterate through the blocks. */
|
||||
for (i = 0; i < 4; i++) {
|
||||
uint32_t ist[8];
|
||||
uint32_t ost[8];
|
||||
|
||||
memcpy(ist, PShictx.state, 32);
|
||||
PShictx.buf[4] = i + 1;
|
||||
SHA256_Transform(ist, PShictx.buf, 0);
|
||||
memcpy(PShoctx.buf, ist, 32);
|
||||
memcpy(obuf, istate, 32);
|
||||
ibuf[4] = i + 1;
|
||||
SHA256_Transform(obuf, ibuf, 0);
|
||||
|
||||
memcpy(ost, PShoctx.state, 32);
|
||||
SHA256_Transform(ost, PShoctx.buf, 0);
|
||||
byteswap_vec(buf+i*8, ost, 8);
|
||||
memcpy(ostate2, ostate, 32);
|
||||
SHA256_Transform(ostate2, obuf, 0);
|
||||
byteswap_vec(output + 8 * i, ostate2, 8);
|
||||
}
|
||||
}
|
||||
|
||||
static inline void
|
||||
PBKDF2_SHA256_80_128_32(uint32_t *tstate, uint32_t *ostate, const uint32_t *passwd, const uint32_t *salt, uint32_t *output)
|
||||
static inline void PBKDF2_SHA256_128_32(uint32_t *tstate, uint32_t *ostate,
|
||||
const uint32_t *salt, uint32_t *output)
|
||||
{
|
||||
static const uint32_t ihash_finalblk[16] = {0x00000001,0x80000000,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0x00000620};
|
||||
uint32_t pad[16];
|
||||
uint32_t buf[16];
|
||||
|
||||
SHA256_Transform(tstate, salt, 1);
|
||||
SHA256_Transform(tstate, salt+16, 1);
|
||||
SHA256_Transform(tstate, ihash_finalblk, 0);
|
||||
memcpy(pad, tstate, 32);
|
||||
memcpy(pad+8, outerpad, 32);
|
||||
SHA256_Transform(tstate, salt + 16, 1);
|
||||
SHA256_Transform(tstate, finalblk, 0);
|
||||
memcpy(buf, tstate, 32);
|
||||
memcpy(buf + 8, outerpad, 32);
|
||||
|
||||
SHA256_Transform(ostate, pad, 0);
|
||||
SHA256_Transform(ostate, buf, 0);
|
||||
byteswap_vec(output, ostate, 8);
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* salsa20_8(B):
|
||||
* Apply the salsa20/8 core to the provided block.
|
||||
*/
|
||||
static inline void
|
||||
salsa20_8(uint32_t B[16], const uint32_t Bx[16])
|
||||
#ifdef SHA256_4WAY
|
||||
|
||||
static const uint32_t keypad_4way[4 * 12] = {
|
||||
0x00000080, 0x00000080, 0x00000080, 0x00000080,
|
||||
0x00000000, 0x00000000, 0x00000000, 0x00000000,
|
||||
0x00000000, 0x00000000, 0x00000000, 0x00000000,
|
||||
0x00000000, 0x00000000, 0x00000000, 0x00000000,
|
||||
0x00000000, 0x00000000, 0x00000000, 0x00000000,
|
||||
0x00000000, 0x00000000, 0x00000000, 0x00000000,
|
||||
0x00000000, 0x00000000, 0x00000000, 0x00000000,
|
||||
0x00000000, 0x00000000, 0x00000000, 0x00000000,
|
||||
0x00000000, 0x00000000, 0x00000000, 0x00000000,
|
||||
0x00000000, 0x00000000, 0x00000000, 0x00000000,
|
||||
0x00000000, 0x00000000, 0x00000000, 0x00000000,
|
||||
0x80020000, 0x80020000, 0x80020000, 0x80020000
|
||||
};
|
||||
static const uint32_t innerpad_4way[4 * 11] = {
|
||||
0x80000000, 0x80000000, 0x80000000, 0x80000000,
|
||||
0x00000000, 0x00000000, 0x00000000, 0x00000000,
|
||||
0x00000000, 0x00000000, 0x00000000, 0x00000000,
|
||||
0x00000000, 0x00000000, 0x00000000, 0x00000000,
|
||||
0x00000000, 0x00000000, 0x00000000, 0x00000000,
|
||||
0x00000000, 0x00000000, 0x00000000, 0x00000000,
|
||||
0x00000000, 0x00000000, 0x00000000, 0x00000000,
|
||||
0x00000000, 0x00000000, 0x00000000, 0x00000000,
|
||||
0x00000000, 0x00000000, 0x00000000, 0x00000000,
|
||||
0x00000000, 0x00000000, 0x00000000, 0x00000000,
|
||||
0x000004a0, 0x000004a0, 0x000004a0, 0x000004a0
|
||||
};
|
||||
static const uint32_t outerpad_4way[4 * 8] = {
|
||||
0x80000000, 0x80000000, 0x80000000, 0x80000000,
|
||||
0x00000000, 0x00000000, 0x00000000, 0x00000000,
|
||||
0x00000000, 0x00000000, 0x00000000, 0x00000000,
|
||||
0x00000000, 0x00000000, 0x00000000, 0x00000000,
|
||||
0x00000000, 0x00000000, 0x00000000, 0x00000000,
|
||||
0x00000000, 0x00000000, 0x00000000, 0x00000000,
|
||||
0x00000000, 0x00000000, 0x00000000, 0x00000000,
|
||||
0x00000300, 0x00000300, 0x00000300, 0x00000300
|
||||
};
|
||||
static const uint32_t finalblk_4way[4 * 16] = {
|
||||
0x00000001, 0x00000001, 0x00000001, 0x00000001,
|
||||
0x80000000, 0x80000000, 0x80000000, 0x80000000,
|
||||
0x00000000, 0x00000000, 0x00000000, 0x00000000,
|
||||
0x00000000, 0x00000000, 0x00000000, 0x00000000,
|
||||
0x00000000, 0x00000000, 0x00000000, 0x00000000,
|
||||
0x00000000, 0x00000000, 0x00000000, 0x00000000,
|
||||
0x00000000, 0x00000000, 0x00000000, 0x00000000,
|
||||
0x00000000, 0x00000000, 0x00000000, 0x00000000,
|
||||
0x00000000, 0x00000000, 0x00000000, 0x00000000,
|
||||
0x00000000, 0x00000000, 0x00000000, 0x00000000,
|
||||
0x00000000, 0x00000000, 0x00000000, 0x00000000,
|
||||
0x00000000, 0x00000000, 0x00000000, 0x00000000,
|
||||
0x00000000, 0x00000000, 0x00000000, 0x00000000,
|
||||
0x00000000, 0x00000000, 0x00000000, 0x00000000,
|
||||
0x00000000, 0x00000000, 0x00000000, 0x00000000,
|
||||
0x00000620, 0x00000620, 0x00000620, 0x00000620
|
||||
};
|
||||
|
||||
static inline void HMAC_SHA256_80_init_4way(const uint32_t *key,
|
||||
uint32_t *tstate, uint32_t *ostate)
|
||||
{
|
||||
uint32_t x00,x01,x02,x03,x04,x05,x06,x07,x08,x09,x10,x11,x12,x13,x14,x15;
|
||||
size_t i;
|
||||
uint32_t ihash[4 * 8];
|
||||
uint32_t pad[4 * 16];
|
||||
int i;
|
||||
|
||||
x00 = (B[ 0] ^= Bx[ 0]);
|
||||
x01 = (B[ 1] ^= Bx[ 1]);
|
||||
x02 = (B[ 2] ^= Bx[ 2]);
|
||||
x03 = (B[ 3] ^= Bx[ 3]);
|
||||
x04 = (B[ 4] ^= Bx[ 4]);
|
||||
x05 = (B[ 5] ^= Bx[ 5]);
|
||||
x06 = (B[ 6] ^= Bx[ 6]);
|
||||
x07 = (B[ 7] ^= Bx[ 7]);
|
||||
x08 = (B[ 8] ^= Bx[ 8]);
|
||||
x09 = (B[ 9] ^= Bx[ 9]);
|
||||
x10 = (B[10] ^= Bx[10]);
|
||||
x11 = (B[11] ^= Bx[11]);
|
||||
x12 = (B[12] ^= Bx[12]);
|
||||
x13 = (B[13] ^= Bx[13]);
|
||||
x14 = (B[14] ^= Bx[14]);
|
||||
x15 = (B[15] ^= Bx[15]);
|
||||
for (i = 0; i < 8; i += 2) {
|
||||
#define R(a,b) (((a) << (b)) | ((a) >> (32 - (b))))
|
||||
/* Operate on columns. */
|
||||
x04 ^= R(x00+x12, 7); x09 ^= R(x05+x01, 7); x14 ^= R(x10+x06, 7); x03 ^= R(x15+x11, 7);
|
||||
x08 ^= R(x04+x00, 9); x13 ^= R(x09+x05, 9); x02 ^= R(x14+x10, 9); x07 ^= R(x03+x15, 9);
|
||||
x12 ^= R(x08+x04,13); x01 ^= R(x13+x09,13); x06 ^= R(x02+x14,13); x11 ^= R(x07+x03,13);
|
||||
x00 ^= R(x12+x08,18); x05 ^= R(x01+x13,18); x10 ^= R(x06+x02,18); x15 ^= R(x11+x07,18);
|
||||
SHA256_InitState_4way(tstate);
|
||||
SHA256_Transform_4way(tstate, key, 1);
|
||||
memcpy(pad, key + 4 * 16, 4 * 16);
|
||||
memcpy(pad + 4 * 4, keypad_4way, 4 * 48);
|
||||
SHA256_Transform_4way(tstate, pad, 1);
|
||||
memcpy(ihash, tstate, 4 * 32);
|
||||
|
||||
/* Operate on rows. */
|
||||
x01 ^= R(x00+x03, 7); x06 ^= R(x05+x04, 7); x11 ^= R(x10+x09, 7); x12 ^= R(x15+x14, 7);
|
||||
x02 ^= R(x01+x00, 9); x07 ^= R(x06+x05, 9); x08 ^= R(x11+x10, 9); x13 ^= R(x12+x15, 9);
|
||||
x03 ^= R(x02+x01,13); x04 ^= R(x07+x06,13); x09 ^= R(x08+x11,13); x14 ^= R(x13+x12,13);
|
||||
x00 ^= R(x03+x02,18); x05 ^= R(x04+x07,18); x10 ^= R(x09+x08,18); x15 ^= R(x14+x13,18);
|
||||
#undef R
|
||||
}
|
||||
B[ 0] += x00;
|
||||
B[ 1] += x01;
|
||||
B[ 2] += x02;
|
||||
B[ 3] += x03;
|
||||
B[ 4] += x04;
|
||||
B[ 5] += x05;
|
||||
B[ 6] += x06;
|
||||
B[ 7] += x07;
|
||||
B[ 8] += x08;
|
||||
B[ 9] += x09;
|
||||
B[10] += x10;
|
||||
B[11] += x11;
|
||||
B[12] += x12;
|
||||
B[13] += x13;
|
||||
B[14] += x14;
|
||||
B[15] += x15;
|
||||
SHA256_InitState_4way(ostate);
|
||||
for (i = 0; i < 4 * 8; i++)
|
||||
pad[i] = ihash[i] ^ 0x5c5c5c5c;
|
||||
for (; i < 4 * 16; i++)
|
||||
pad[i] = 0x5c5c5c5c;
|
||||
SHA256_Transform_4way(ostate, pad, 0);
|
||||
|
||||
SHA256_InitState_4way(tstate);
|
||||
for (i = 0; i < 4 * 8; i++)
|
||||
pad[i] = ihash[i] ^ 0x36363636;
|
||||
for (; i < 4 * 16; i++)
|
||||
pad[i] = 0x36363636;
|
||||
SHA256_Transform_4way(tstate, pad, 0);
|
||||
}
|
||||
|
||||
static inline void PBKDF2_SHA256_80_128_4way(const uint32_t *tstate,
|
||||
const uint32_t *ostate, const uint32_t *salt, uint32_t *output)
|
||||
{
|
||||
uint32_t istate[4 * 8], ostate2[4 * 8];
|
||||
uint32_t ibuf[4 * 16], obuf[4 * 16];
|
||||
int i;
|
||||
|
||||
memcpy(istate, tstate, 4 * 32);
|
||||
SHA256_Transform_4way(istate, salt, 1);
|
||||
|
||||
byteswap_vec(ibuf, salt + 4 * 16, 4 * 4);
|
||||
memcpy(ibuf + 4 * 5, innerpad_4way, 4 * 44);
|
||||
memcpy(obuf + 4 * 8, outerpad_4way, 4 * 32);
|
||||
|
||||
for (i = 0; i < 4; i++) {
|
||||
memcpy(obuf, istate, 4 * 32);
|
||||
ibuf[4 * 4 + 0] = i + 1;
|
||||
ibuf[4 * 4 + 1] = i + 1;
|
||||
ibuf[4 * 4 + 2] = i + 1;
|
||||
ibuf[4 * 4 + 3] = i + 1;
|
||||
SHA256_Transform_4way(obuf, ibuf, 0);
|
||||
|
||||
memcpy(ostate2, ostate, 4 * 32);
|
||||
SHA256_Transform_4way(ostate2, obuf, 0);
|
||||
byteswap_vec(output + 4 * 8 * i, ostate2, 4 * 8);
|
||||
}
|
||||
}
|
||||
|
||||
static inline void PBKDF2_SHA256_128_32_4way(uint32_t *tstate,
|
||||
uint32_t *ostate, const uint32_t *salt, uint32_t *output)
|
||||
{
|
||||
uint32_t buf[4 * 16];
|
||||
|
||||
SHA256_Transform_4way(tstate, salt, 1);
|
||||
SHA256_Transform_4way(tstate, salt + 4 * 16, 1);
|
||||
SHA256_Transform_4way(tstate, finalblk_4way, 0);
|
||||
memcpy(buf, tstate, 4 * 32);
|
||||
memcpy(buf + 4 * 8, outerpad_4way, 4 * 32);
|
||||
|
||||
SHA256_Transform_4way(ostate, buf, 0);
|
||||
byteswap_vec(output, ostate, 4 * 8);
|
||||
}
|
||||
|
||||
#endif /* SHA256_4WAY */
|
||||
|
||||
|
||||
#if defined(__x86_64__)
|
||||
|
||||
|
@ -388,38 +421,90 @@ void scrypt_core(uint32_t *X, uint32_t *V);
|
|||
|
||||
#define SCRYPT_BUFFER_SIZE (131072 + 63)
|
||||
|
||||
/*
 * XORs Bx into B, runs the 8-round Salsa20 core (four column/row double
 * rounds) on the result, and adds the core output back into B word by word.
 *
 *   B   16 words, updated in place
 *   Bx  16 words, read only
 */
static inline void salsa20_8(uint32_t B[16], const uint32_t Bx[16])
{
	uint32_t x00,x01,x02,x03,x04,x05,x06,x07,x08,x09,x10,x11,x12,x13,x14,x15;
	int i;

	/* B keeps the XORed input; the xNN copies are the working state. */
	x00 = (B[ 0] ^= Bx[ 0]);
	x01 = (B[ 1] ^= Bx[ 1]);
	x02 = (B[ 2] ^= Bx[ 2]);
	x03 = (B[ 3] ^= Bx[ 3]);
	x04 = (B[ 4] ^= Bx[ 4]);
	x05 = (B[ 5] ^= Bx[ 5]);
	x06 = (B[ 6] ^= Bx[ 6]);
	x07 = (B[ 7] ^= Bx[ 7]);
	x08 = (B[ 8] ^= Bx[ 8]);
	x09 = (B[ 9] ^= Bx[ 9]);
	x10 = (B[10] ^= Bx[10]);
	x11 = (B[11] ^= Bx[11]);
	x12 = (B[12] ^= Bx[12]);
	x13 = (B[13] ^= Bx[13]);
	x14 = (B[14] ^= Bx[14]);
	x15 = (B[15] ^= Bx[15]);

/* 32-bit rotate left; b is always one of 7, 9, 13, 18 here. */
#define R(a, b) (((a) << (b)) | ((a) >> (32 - (b))))
	for (i = 0; i < 8; i += 2) {
		/* Operate on columns. */
		x04 ^= R(x00+x12, 7);	x09 ^= R(x05+x01, 7);
		x14 ^= R(x10+x06, 7);	x03 ^= R(x15+x11, 7);

		x08 ^= R(x04+x00, 9);	x13 ^= R(x09+x05, 9);
		x02 ^= R(x14+x10, 9);	x07 ^= R(x03+x15, 9);

		x12 ^= R(x08+x04,13);	x01 ^= R(x13+x09,13);
		x06 ^= R(x02+x14,13);	x11 ^= R(x07+x03,13);

		x00 ^= R(x12+x08,18);	x05 ^= R(x01+x13,18);
		x10 ^= R(x06+x02,18);	x15 ^= R(x11+x07,18);

		/* Operate on rows. */
		x01 ^= R(x00+x03, 7);	x06 ^= R(x05+x04, 7);
		x11 ^= R(x10+x09, 7);	x12 ^= R(x15+x14, 7);

		x02 ^= R(x01+x00, 9);	x07 ^= R(x06+x05, 9);
		x08 ^= R(x11+x10, 9);	x13 ^= R(x12+x15, 9);

		x03 ^= R(x02+x01,13);	x04 ^= R(x07+x06,13);
		x09 ^= R(x08+x11,13);	x14 ^= R(x13+x12,13);

		x00 ^= R(x03+x02,18);	x05 ^= R(x04+x07,18);
		x10 ^= R(x09+x08,18);	x15 ^= R(x14+x13,18);
	}
#undef R

	/* Feed-forward: add the permuted state onto the XORed input. */
	B[ 0] += x00;
	B[ 1] += x01;
	B[ 2] += x02;
	B[ 3] += x03;
	B[ 4] += x04;
	B[ 5] += x05;
	B[ 6] += x06;
	B[ 7] += x07;
	B[ 8] += x08;
	B[ 9] += x09;
	B[10] += x10;
	B[11] += x11;
	B[12] += x12;
	B[13] += x13;
	B[14] += x14;
	B[15] += x15;
}
|
||||
|
||||
/*
 * Generic scrypt mixing core for 1024 iterations over 128-byte blocks.
 *
 *   X  32 words, the working block; updated in place
 *   V  scratch area of 1024 * 32 words, fully overwritten by the first loop
 *
 * The first loop stores each successive X into V and advances X with two
 * salsa20_8 calls. The second loop picks entry (X[16] & 1023) of V, XORs it
 * into X and advances X the same way.
 */
static inline void scrypt_core(uint32_t *X, uint32_t *V)
{
	uint32_t i, j, k;

	for (i = 0; i < 1024; i++) {
		memcpy(&V[i * 32], X, 128);
		salsa20_8(&X[0], &X[16]);
		salsa20_8(&X[16], &X[0]);
	}
	for (i = 0; i < 1024; i++) {
		j = X[16] & 1023;
		/*
		 * XOR word by word: X and V are uint32_t objects, so reading
		 * them through uint64_t pointers would break the effective
		 * type rules and could be misaligned. The result is the same.
		 */
		for (k = 0; k < 32; k++)
			X[k] ^= V[j * 32 + k];
		salsa20_8(&X[0], &X[16]);
		salsa20_8(&X[16], &X[0]);
	}
}
|
||||
|
@ -427,33 +512,32 @@ static inline void scrypt_core(uint32_t *X, uint32_t *V)
|
|||
|
||||
#endif
|
||||
|
||||
unsigned char *scrypt_buffer_alloc() {
|
||||
unsigned char *scrypt_buffer_alloc()
|
||||
{
|
||||
return malloc(SCRYPT_BUFFER_SIZE);
|
||||
}
|
||||
|
||||
/*
 * CPU- and memory-intensive function that turns the 80-byte input into a
 * 32-byte output (r = 1, p = 1, N = 1024).
 *
 * scratchpad must be at least 128 * 1024 + 63 bytes: scrypt_core() stores
 * 1024 entries of 128 bytes in V, and up to 63 bytes are skipped to round
 * the start of V up to a 64-byte boundary.
 */
static void scrypt_1024_1_1_256_sp(const uint32_t *input, uint32_t *output,
	unsigned char *scratchpad)
{
	uint32_t tstate[8], ostate[8];
	uint32_t X[32];
	uint32_t *V;

	V = (uint32_t *)(((uintptr_t)(scratchpad) + 63) & ~ (uintptr_t)(63));

	HMAC_SHA256_80_init(input, tstate, ostate);
	PBKDF2_SHA256_80_128(tstate, ostate, input, X);

	scrypt_core(X, V);

	/* Plain call: a void function must not return an expression. */
	PBKDF2_SHA256_128_32(tstate, ostate, X, output);
}
|
||||
|
||||
#ifdef SCRYPT_3WAY
|
||||
|
||||
/*
 * Two-input variant of scrypt_1024_1_1_256_sp: hashes input1 and input2
 * into output1 and output2, running the memory-hard step for both through
 * a single scrypt_core_2way() call.
 *
 * Each input supplies 20 words and each output receives 8 words, judging by
 * the callers' buffers (NOTE(review): confirm against the PBKDF2 helpers).
 * scratchpad is rounded up to a 64-byte boundary before use; the size it
 * must have is dictated by scrypt_core_2way(), which is not visible here.
 */
static void scrypt_1024_1_1_256_sp_2way(const uint32_t *input1,
	const uint32_t *input2, uint32_t *output1, uint32_t *output2,
	unsigned char *scratchpad)
{
	uint32_t tstate1[8], tstate2[8];
	uint32_t ostate1[8], ostate2[8];
	uint32_t X[32], Y[32];
	uint32_t *V;

	V = (uint32_t *)(((uintptr_t)(scratchpad) + 63) & ~ (uintptr_t)(63));

	HMAC_SHA256_80_init(input1, tstate1, ostate1);
	HMAC_SHA256_80_init(input2, tstate2, ostate2);
	PBKDF2_SHA256_80_128(tstate1, ostate1, input1, X);
	PBKDF2_SHA256_80_128(tstate2, ostate2, input2, Y);

	scrypt_core_2way(X, Y, V);

	PBKDF2_SHA256_128_32(tstate1, ostate1, X, output1);
	PBKDF2_SHA256_128_32(tstate2, ostate2, Y, output2);
}
|
||||
|
||||
/*
 * Three-input variant of scrypt_1024_1_1_256_sp: hashes input1..input3 into
 * output1..output3 with one scrypt_core_3way() call for the memory-hard
 * step. 20 words are read from each input and 8 words written to each
 * output (see the copy loops in the 4-way path).
 *
 * With SHA256_4WAY defined, the three inputs are interleaved word by word
 * into lanes 0..2 of four-lane buffers so that the PBKDF2 stages run through
 * the 4-way SHA-256 code; lane 3 is zero-filled and its results are ignored.
 * Without it, the scalar helpers are called once per input.
 *
 * scratchpad is rounded up to a 64-byte boundary before use; the size it
 * must have is dictated by scrypt_core_3way(), which is not visible here.
 */
static void scrypt_1024_1_1_256_sp_3way(
	const uint32_t *input1, const uint32_t *input2, const uint32_t *input3,
	uint32_t *output1, uint32_t *output2, uint32_t *output3,
	unsigned char *scratchpad)
{
#ifdef SHA256_4WAY
	uint32_t tstate[4 * 8], ostate[4 * 8];
	uint32_t input[4 * 20], output[4 * 32];
	uint32_t X[32], Y[32], Z[32];
	uint32_t W[4 * 32];
	uint32_t *V;
	int i;

	V = (uint32_t *)(((uintptr_t)(scratchpad) + 63) & ~ (uintptr_t)(63));

	/* Interleave the inputs into lanes 0..2. */
	for (i = 0; i < 20; i++) {
		input[4 * i + 0] = input1[i];
		input[4 * i + 1] = input2[i];
		input[4 * i + 2] = input3[i];
		/* Unused lane: keep it defined so no indeterminate words are hashed. */
		input[4 * i + 3] = 0;
	}
	HMAC_SHA256_80_init_4way(input, tstate, ostate);
	PBKDF2_SHA256_80_128_4way(tstate, ostate, input, W);

	/* The scrypt core works on separate per-input blocks. */
	for (i = 0; i < 32; i++) {
		X[i] = W[4 * i + 0];
		Y[i] = W[4 * i + 1];
		Z[i] = W[4 * i + 2];
	}
	scrypt_core_3way(X, Y, Z, V);

	/* Lane 3 of W still holds the first PBKDF2 stage's output for the spare lane. */
	for (i = 0; i < 32; i++) {
		W[4 * i + 0] = X[i];
		W[4 * i + 1] = Y[i];
		W[4 * i + 2] = Z[i];
	}
	PBKDF2_SHA256_128_32_4way(tstate, ostate, W, output);

	for (i = 0; i < 8; i++) {
		output1[i] = output[4 * i + 0];
		output2[i] = output[4 * i + 1];
		output3[i] = output[4 * i + 2];
	}
#else
	uint32_t tstate1[8], tstate2[8], tstate3[8];
	uint32_t ostate1[8], ostate2[8], ostate3[8];
	uint32_t X[32], Y[32], Z[32];
	uint32_t *V;

	V = (uint32_t *)(((uintptr_t)(scratchpad) + 63) & ~ (uintptr_t)(63));

	HMAC_SHA256_80_init(input1, tstate1, ostate1);
	HMAC_SHA256_80_init(input2, tstate2, ostate2);
	HMAC_SHA256_80_init(input3, tstate3, ostate3);
	PBKDF2_SHA256_80_128(tstate1, ostate1, input1, X);
	PBKDF2_SHA256_80_128(tstate2, ostate2, input2, Y);
	PBKDF2_SHA256_80_128(tstate3, ostate3, input3, Z);

	scrypt_core_3way(X, Y, Z, V);

	PBKDF2_SHA256_128_32(tstate1, ostate1, X, output1);
	PBKDF2_SHA256_128_32(tstate2, ostate2, Y, output2);
	PBKDF2_SHA256_128_32(tstate3, ostate3, Z, output3);
#endif /* SHA256_4WAY */
}
|
||||
|
||||
#endif
|
||||
#endif /* SCRYPT_3WAY */
|
||||
|
||||
__attribute__ ((noinline)) static int test_lower_hash(const uint32_t *hash,
|
||||
__attribute__ ((noinline)) static int confirm_hash(const uint32_t *hash,
|
||||
const uint32_t *target)
|
||||
{
|
||||
int i;
|
||||
for (i = 6; i >= 0; i--) {
|
||||
for (i = 7; i >= 0; i--) {
|
||||
uint32_t t = le32dec(&target[i]);
|
||||
if (hash[i] > t)
|
||||
return 0;
|
||||
|
@ -511,8 +634,8 @@ __attribute__ ((noinline)) static int test_lower_hash(const uint32_t *hash,
|
|||
return 1;
|
||||
}
|
||||
|
||||
int scanhash_scrypt(int thr_id, unsigned char *pdata, unsigned char *scratchbuf,
|
||||
const unsigned char *ptarget,
|
||||
int scanhash_scrypt(int thr_id, unsigned char *pdata,
|
||||
unsigned char *scratchbuf, const unsigned char *ptarget,
|
||||
uint32_t max_nonce, uint32_t *next_nonce, unsigned long *hashes_done)
|
||||
{
|
||||
uint32_t data[20], hash[8];
|
||||
|
@ -542,7 +665,7 @@ int scanhash_scrypt(int thr_id, unsigned char *pdata, unsigned char *scratchbuf,
|
|||
if (throughput >= 3 && n <= max_nonce) {
|
||||
data3[19] = n++;
|
||||
scrypt_1024_1_1_256_sp_3way(data, data2, data3, hash, hash2, hash3, scratchbuf);
|
||||
if (hash3[7] < Htarg || (hash3[7] == Htarg && test_lower_hash(hash3, (uint32_t *)ptarget))) {
|
||||
if (hash3[7] <= Htarg && confirm_hash(hash3, (uint32_t *)ptarget)) {
|
||||
be32enc(&((uint32_t *)pdata)[19], data3[19]);
|
||||
*next_nonce = n;
|
||||
*hashes_done = n - first_nonce;
|
||||
|
@ -551,7 +674,7 @@ int scanhash_scrypt(int thr_id, unsigned char *pdata, unsigned char *scratchbuf,
|
|||
} else {
|
||||
scrypt_1024_1_1_256_sp_2way(data, data2, hash, hash2, scratchbuf);
|
||||
}
|
||||
if (hash2[7] < Htarg || (hash2[7] == Htarg && test_lower_hash(hash2, (uint32_t *)ptarget))) {
|
||||
if (hash2[7] <= Htarg && confirm_hash(hash2, (uint32_t *)ptarget)) {
|
||||
be32enc(&((uint32_t *)pdata)[19], data2[19]);
|
||||
*next_nonce = n;
|
||||
*hashes_done = n - first_nonce;
|
||||
|
@ -563,7 +686,7 @@ int scanhash_scrypt(int thr_id, unsigned char *pdata, unsigned char *scratchbuf,
|
|||
#else
|
||||
scrypt_1024_1_1_256_sp(data, hash, scratchbuf);
|
||||
#endif
|
||||
if (hash[7] < Htarg || (hash[7] == Htarg && test_lower_hash(hash, (uint32_t *)ptarget))) {
|
||||
if (hash[7] <= Htarg && confirm_hash(hash, (uint32_t *)ptarget)) {
|
||||
be32enc(&((uint32_t *)pdata)[19], data[19]);
|
||||
*next_nonce = n;
|
||||
*hashes_done = n - first_nonce;
|
||||
|
@ -575,4 +698,3 @@ int scanhash_scrypt(int thr_id, unsigned char *pdata, unsigned char *scratchbuf,
|
|||
*hashes_done = n - first_nonce;
|
||||
return false;
|
||||
}
|
||||
|
||||
|
|
Loading…
Reference in a new issue