From e560d53b764cffefc6d1b5d8a40e89bcffec96eb Mon Sep 17 00:00:00 2001 From: pooler Date: Tue, 28 Feb 2012 18:10:26 +0100 Subject: [PATCH] Add 4-way SHA-256 implementation for x86-64 --- miner.h | 38 +++- scrypt-x64.S | 402 ++++++++++++++++++++++++++++++++++++- scrypt.c | 556 +++++++++++++++++++++++++++++++-------------------- 3 files changed, 769 insertions(+), 227 deletions(-) diff --git a/miner.h b/miner.h index c67a0c9..d6fa160 100644 --- a/miner.h +++ b/miner.h @@ -119,17 +119,49 @@ static inline void swap256(void *dest_p, const void *src_p) dest[7] = src[0]; } +static inline uint32_t be32dec(const void *pp) +{ + const uint8_t *p = (uint8_t const *)pp; + return ((uint32_t)(p[3]) + ((uint32_t)(p[2]) << 8) + + ((uint32_t)(p[1]) << 16) + ((uint32_t)(p[0]) << 24)); +} + +static inline void be32enc(void *pp, uint32_t x) +{ + uint8_t *p = (uint8_t *)pp; + p[3] = x & 0xff; + p[2] = (x >> 8) & 0xff; + p[1] = (x >> 16) & 0xff; + p[0] = (x >> 24) & 0xff; +} + +static inline uint32_t le32dec(const void *pp) +{ + const uint8_t *p = (uint8_t const *)pp; + return ((uint32_t)(p[0]) + ((uint32_t)(p[1]) << 8) + + ((uint32_t)(p[2]) << 16) + ((uint32_t)(p[3]) << 24)); +} + +static inline void le32enc(void *pp, uint32_t x) +{ + uint8_t *p = (uint8_t *)pp; + p[0] = x & 0xff; + p[1] = (x >> 8) & 0xff; + p[2] = (x >> 16) & 0xff; + p[3] = (x >> 24) & 0xff; +} + extern bool opt_debug; extern bool opt_protocol; -extern const uint32_t sha256_init_state[]; + extern json_t *json_rpc_call(CURL *curl, const char *url, const char *userpass, const char *rpc_req, bool, bool, int *); extern char *bin2hex(const unsigned char *p, size_t len); extern bool hex2bin(unsigned char *p, const char *hexstr, size_t len); extern unsigned char *scrypt_buffer_alloc(); -extern int scanhash_scrypt(int, unsigned char *pdata, unsigned char *scratchbuf, - const unsigned char *ptarget, +extern int scanhash_scrypt(int thr_id, unsigned char *pdata, + unsigned char *scratchbuf, const unsigned char *ptarget, uint32_t max_nonce, uint32_t *next_nonce, unsigned long *hashes_done); extern int diff --git a/scrypt-x64.S b/scrypt-x64.S index 5e57bb8..6d3f497 100644 --- a/scrypt-x64.S +++ b/scrypt-x64.S @@ -27,6 +27,394 @@ #endif #if defined(__x86_64__) + .data + .p2align 6 +sha256_4h: + .long 0x6a09e667, 0x6a09e667, 0x6a09e667, 0x6a09e667 + .long 0xbb67ae85, 0xbb67ae85, 0xbb67ae85, 0xbb67ae85 + .long 0x3c6ef372, 0x3c6ef372, 0x3c6ef372, 0x3c6ef372 + .long 0xa54ff53a, 0xa54ff53a, 0xa54ff53a, 0xa54ff53a + .long 0x510e527f, 0x510e527f, 0x510e527f, 0x510e527f + .long 0x9b05688c, 0x9b05688c, 0x9b05688c, 0x9b05688c + .long 0x1f83d9ab, 0x1f83d9ab, 0x1f83d9ab, 0x1f83d9ab + .long 0x5be0cd19, 0x5be0cd19, 0x5be0cd19, 0x5be0cd19 + + .data + .p2align 6 +sha256_4k: + .long 0x428a2f98, 0x428a2f98, 0x428a2f98, 0x428a2f98 + .long 0x71374491, 0x71374491, 0x71374491, 0x71374491 + .long 0xb5c0fbcf, 0xb5c0fbcf, 0xb5c0fbcf, 0xb5c0fbcf + .long 0xe9b5dba5, 0xe9b5dba5, 0xe9b5dba5, 0xe9b5dba5 + .long 0x3956c25b, 0x3956c25b, 0x3956c25b, 0x3956c25b + .long 0x59f111f1, 0x59f111f1, 0x59f111f1, 0x59f111f1 + .long 0x923f82a4, 0x923f82a4, 0x923f82a4, 0x923f82a4 + .long 0xab1c5ed5, 0xab1c5ed5, 0xab1c5ed5, 0xab1c5ed5 + .long 0xd807aa98, 0xd807aa98, 0xd807aa98, 0xd807aa98 + .long 0x12835b01, 0x12835b01, 0x12835b01, 0x12835b01 + .long 0x243185be, 0x243185be, 0x243185be, 0x243185be + .long 0x550c7dc3, 0x550c7dc3, 0x550c7dc3, 0x550c7dc3 + .long 0x72be5d74, 0x72be5d74, 0x72be5d74, 0x72be5d74 + .long 0x80deb1fe, 0x80deb1fe, 0x80deb1fe, 0x80deb1fe + .long 0x9bdc06a7, 0x9bdc06a7, 0x9bdc06a7, 0x9bdc06a7 + .long 0xc19bf174, 0xc19bf174, 0xc19bf174, 0xc19bf174 + .long 0xe49b69c1, 0xe49b69c1, 0xe49b69c1, 0xe49b69c1 + .long 0xefbe4786, 0xefbe4786, 0xefbe4786, 0xefbe4786 + .long 0x0fc19dc6, 0x0fc19dc6, 0x0fc19dc6, 0x0fc19dc6 + .long 0x240ca1cc, 0x240ca1cc, 0x240ca1cc, 0x240ca1cc + .long 0x2de92c6f, 0x2de92c6f, 0x2de92c6f, 0x2de92c6f + .long 0x4a7484aa, 0x4a7484aa, 0x4a7484aa, 0x4a7484aa + .long 0x5cb0a9dc, 0x5cb0a9dc, 0x5cb0a9dc, 0x5cb0a9dc + .long 0x76f988da, 0x76f988da, 0x76f988da, 0x76f988da + .long 0x983e5152, 0x983e5152, 0x983e5152, 0x983e5152 + .long 0xa831c66d, 0xa831c66d, 0xa831c66d, 0xa831c66d + .long 0xb00327c8, 0xb00327c8, 0xb00327c8, 0xb00327c8 + .long 0xbf597fc7, 0xbf597fc7, 0xbf597fc7, 0xbf597fc7 + .long 0xc6e00bf3, 0xc6e00bf3, 0xc6e00bf3, 0xc6e00bf3 + .long 0xd5a79147, 0xd5a79147, 0xd5a79147, 0xd5a79147 + .long 0x06ca6351, 0x06ca6351, 0x06ca6351, 0x06ca6351 + .long 0x14292967, 0x14292967, 0x14292967, 0x14292967 + .long 0x27b70a85, 0x27b70a85, 0x27b70a85, 0x27b70a85 + .long 0x2e1b2138, 0x2e1b2138, 0x2e1b2138, 0x2e1b2138 + .long 0x4d2c6dfc, 0x4d2c6dfc, 0x4d2c6dfc, 0x4d2c6dfc + .long 0x53380d13, 0x53380d13, 0x53380d13, 0x53380d13 + .long 0x650a7354, 0x650a7354, 0x650a7354, 0x650a7354 + .long 0x766a0abb, 0x766a0abb, 0x766a0abb, 0x766a0abb + .long 0x81c2c92e, 0x81c2c92e, 0x81c2c92e, 0x81c2c92e + .long 0x92722c85, 0x92722c85, 0x92722c85, 0x92722c85 + .long 0xa2bfe8a1, 0xa2bfe8a1, 0xa2bfe8a1, 0xa2bfe8a1 + .long 0xa81a664b, 0xa81a664b, 0xa81a664b, 0xa81a664b + .long 0xc24b8b70, 0xc24b8b70, 0xc24b8b70, 0xc24b8b70 + .long 0xc76c51a3, 0xc76c51a3, 0xc76c51a3, 0xc76c51a3 + .long 0xd192e819, 0xd192e819, 0xd192e819, 0xd192e819 + .long 0xd6990624, 0xd6990624, 0xd6990624, 0xd6990624 + .long 0xf40e3585, 0xf40e3585, 0xf40e3585, 0xf40e3585 + .long 0x106aa070, 0x106aa070, 0x106aa070, 0x106aa070 + .long 0x19a4c116, 0x19a4c116, 0x19a4c116, 0x19a4c116 + .long 0x1e376c08, 0x1e376c08, 0x1e376c08, 0x1e376c08 + .long 0x2748774c, 0x2748774c, 0x2748774c, 0x2748774c + .long 0x34b0bcb5, 0x34b0bcb5, 0x34b0bcb5, 0x34b0bcb5 + .long 0x391c0cb3, 0x391c0cb3, 0x391c0cb3, 0x391c0cb3 + .long 0x4ed8aa4a, 0x4ed8aa4a, 0x4ed8aa4a, 0x4ed8aa4a + .long 0x5b9cca4f, 0x5b9cca4f, 0x5b9cca4f, 0x5b9cca4f + .long 0x682e6ff3, 0x682e6ff3, 0x682e6ff3, 0x682e6ff3 + .long 0x748f82ee, 0x748f82ee, 0x748f82ee, 0x748f82ee + .long 0x78a5636f, 0x78a5636f, 0x78a5636f, 0x78a5636f + .long 0x84c87814, 0x84c87814, 0x84c87814, 0x84c87814 + .long 0x8cc70208, 0x8cc70208, 0x8cc70208, 0x8cc70208 + .long 0x90befffa, 0x90befffa, 0x90befffa, 0x90befffa + .long 0xa4506ceb, 0xa4506ceb, 0xa4506ceb, 0xa4506ceb + .long 0xbef9a3f7, 0xbef9a3f7, 0xbef9a3f7, 0xbef9a3f7 + .long 0xc67178f2, 0xc67178f2, 0xc67178f2, 0xc67178f2 + + .text + .p2align 5 + .globl SHA256_InitState_4way + .globl _SHA256_InitState_4way +SHA256_InitState_4way: +_SHA256_InitState_4way: +#if defined(WIN64) + pushq %rdi + movq %rcx, %rdi +#endif + movdqa sha256_4h+0, %xmm0 + movdqa sha256_4h+16, %xmm1 + movdqa sha256_4h+32, %xmm2 + movdqa sha256_4h+48, %xmm3 + movdqu %xmm0, 0(%rdi) + movdqu %xmm1, 16(%rdi) + movdqu %xmm2, 32(%rdi) + movdqu %xmm3, 48(%rdi) + movdqa sha256_4h+64, %xmm0 + movdqa sha256_4h+80, %xmm1 + movdqa sha256_4h+96, %xmm2 + movdqa sha256_4h+112, %xmm3 + movdqu %xmm0, 64(%rdi) + movdqu %xmm1, 80(%rdi) + movdqu %xmm2, 96(%rdi) + movdqu %xmm3, 112(%rdi) +#if defined(WIN64) + popq %rdi +#endif + ret + +.macro p2bswap_rsi_rsp i + movdqu \i*16(%rsi), %xmm0 + movdqu (\i+1)*16(%rsi), %xmm2 + pshuflw $0xb1, %xmm0, %xmm0 + pshuflw $0xb1, %xmm2, %xmm2 + pshufhw $0xb1, %xmm0, %xmm0 + pshufhw $0xb1, %xmm2, %xmm2 + movdqa %xmm0, %xmm1 + movdqa %xmm2, %xmm3 + psrlw $8, %xmm1 + psrlw $8, %xmm3 + psllw $8, %xmm0 + psllw $8, %xmm2 + pxor %xmm1, %xmm0 + pxor %xmm3, %xmm2 + movdqa %xmm0, \i*16(%rsp) + movdqa %xmm2, (\i+1)*16(%rsp) +.endm + + .text + .p2align 5 + .globl SHA256_Transform_4way + .globl _SHA256_Transform_4way +SHA256_Transform_4way: +_SHA256_Transform_4way: +#if defined(WIN64) + pushq %rdi + subq $96, %rsp + movdqa %xmm6, 0(%rsp) + movdqa %xmm7, 16(%rsp) + movdqa %xmm8, 32(%rsp) + movdqa %xmm9, 48(%rsp) + movdqa %xmm10, 64(%rsp) + movdqa %xmm11, 80(%rsp) + pushq %rsi + movq %rcx, %rdi + movq %rdx, %rsi + movq %r8, %rdx +#endif + subq $1032, %rsp + + testq %rdx, %rdx + jz sha256_transform_4way_block_copy + + p2bswap_rsi_rsp 0 + p2bswap_rsi_rsp 2 + p2bswap_rsi_rsp 4 + p2bswap_rsi_rsp 6 + p2bswap_rsi_rsp 8 + p2bswap_rsi_rsp 10 + p2bswap_rsi_rsp 12 + p2bswap_rsi_rsp 14 + jmp sha256_transform_4way_extend + +sha256_transform_4way_block_copy: + movdqu 0*16(%rsi), %xmm0 + movdqu 1*16(%rsi), %xmm1 + movdqu 2*16(%rsi), %xmm2 + movdqu 3*16(%rsi), %xmm3 + movdqu 4*16(%rsi), %xmm4 + movdqu 5*16(%rsi), %xmm5 + movdqu 6*16(%rsi), %xmm6 + movdqu 7*16(%rsi), %xmm7 + movdqa %xmm0, 0*16(%rsp) + movdqa %xmm1, 1*16(%rsp) + movdqa %xmm2, 2*16(%rsp) + movdqa %xmm3, 3*16(%rsp) + movdqa %xmm4, 4*16(%rsp) + movdqa %xmm5, 5*16(%rsp) + movdqa %xmm6, 6*16(%rsp) + movdqa %xmm7, 7*16(%rsp) + movdqu 8*16(%rsi), %xmm0 + movdqu 9*16(%rsi), %xmm1 + movdqu 10*16(%rsi), %xmm2 + movdqu 11*16(%rsi), %xmm3 + movdqu 12*16(%rsi), %xmm4 + movdqu 13*16(%rsi), %xmm5 + movdqu 14*16(%rsi), %xmm6 + movdqu 15*16(%rsi), %xmm7 + movdqa %xmm0, 8*16(%rsp) + movdqa %xmm1, 9*16(%rsp) + movdqa %xmm2, 10*16(%rsp) + movdqa %xmm3, 11*16(%rsp) + movdqa %xmm4, 12*16(%rsp) + movdqa %xmm5, 13*16(%rsp) + movdqa %xmm6, 14*16(%rsp) + movdqa %xmm7, 15*16(%rsp) + +sha256_transform_4way_extend: + leaq 256(%rsp), %rcx + leaq 48*16(%rcx), %rax +sha256_transform_4way_extend_loop: + movdqa -15*16(%rcx), %xmm0 + movdqa -14*16(%rcx), %xmm4 + movdqa %xmm0, %xmm2 + movdqa %xmm4, %xmm6 + psrld $3, %xmm0 + psrld $3, %xmm4 + movdqa %xmm0, %xmm1 + movdqa %xmm4, %xmm5 + pslld $14, %xmm2 + pslld $14, %xmm6 + psrld $4, %xmm1 + psrld $4, %xmm5 + pxor %xmm1, %xmm0 + pxor %xmm5, %xmm4 + psrld $11, %xmm1 + psrld $11, %xmm5 + pxor %xmm2, %xmm0 + pxor %xmm6, %xmm4 + pslld $11, %xmm2 + pslld $11, %xmm6 + pxor %xmm1, %xmm0 + pxor %xmm5, %xmm4 + pxor %xmm2, %xmm0 + pxor %xmm6, %xmm4 + + movdqa -2*16(%rcx), %xmm3 + movdqa -1*16(%rcx), %xmm7 + paddd -16*16(%rcx), %xmm0 + paddd -15*16(%rcx), %xmm4 + + movdqa %xmm3, %xmm2 + movdqa %xmm7, %xmm6 + psrld $10, %xmm3 + psrld $10, %xmm7 + movdqa %xmm3, %xmm1 + movdqa %xmm7, %xmm5 + + paddd -7*16(%rcx), %xmm0 + + pslld $13, %xmm2 + pslld $13, %xmm6 + psrld $7, %xmm1 + psrld $7, %xmm5 + + paddd -6*16(%rcx), %xmm4 + + pxor %xmm1, %xmm3 + pxor %xmm5, %xmm7 + psrld $2, %xmm1 + psrld $2, %xmm5 + pxor %xmm2, %xmm3 + pxor %xmm6, %xmm7 + pslld $2, %xmm2 + pslld $2, %xmm6 + pxor %xmm1, %xmm3 + pxor %xmm5, %xmm7 + pxor %xmm2, %xmm3 + pxor %xmm6, %xmm7 + + paddd %xmm3, %xmm0 + paddd %xmm7, %xmm4 + movdqa %xmm0, (%rcx) + movdqa %xmm4, 16(%rcx) + addq $2*16, %rcx + cmpq %rcx, %rax + jne sha256_transform_4way_extend_loop + + movdqu 0(%rdi), %xmm7 + movdqu 16(%rdi), %xmm5 + movdqu 32(%rdi), %xmm4 + movdqu 48(%rdi), %xmm3 + movdqu 64(%rdi), %xmm0 + movdqu 80(%rdi), %xmm8 + movdqu 96(%rdi), %xmm9 + movdqu 112(%rdi), %xmm10 + + xorq %rax, %rax +sha256_transform_4way_main_loop: + movdqa (%rsp, %rax), %xmm6 + paddd sha256_4k(%rax), %xmm6 + paddd %xmm10, %xmm6 + + movdqa %xmm0, %xmm1 + movdqa %xmm9, %xmm2 + pandn %xmm2, %xmm1 + + movdqa %xmm2, %xmm10 + movdqa %xmm8, %xmm2 + movdqa %xmm2, %xmm9 + + pand %xmm0, %xmm2 + pxor %xmm2, %xmm1 + movdqa %xmm0, %xmm8 + + paddd %xmm1, %xmm6 + + movdqa %xmm0, %xmm1 + psrld $6, %xmm0 + movdqa %xmm0, %xmm2 + pslld $7, %xmm1 + psrld $5, %xmm2 + pxor %xmm1, %xmm0 + pxor %xmm2, %xmm0 + pslld $14, %xmm1 + psrld $14, %xmm2 + pxor %xmm1, %xmm0 + pxor %xmm2, %xmm0 + pslld $5, %xmm1 + pxor %xmm1, %xmm0 + paddd %xmm0, %xmm6 + + movdqa %xmm3, %xmm0 + paddd %xmm6, %xmm0 + + movdqa %xmm5, %xmm1 + movdqa %xmm4, %xmm3 + movdqa %xmm4, %xmm2 + pand %xmm5, %xmm2 + pand %xmm7, %xmm4 + pand %xmm7, %xmm1 + pxor %xmm4, %xmm1 + movdqa %xmm5, %xmm4 + movdqa %xmm7, %xmm5 + pxor %xmm2, %xmm1 + paddd %xmm1, %xmm6 + + movdqa %xmm7, %xmm2 + psrld $2, %xmm7 + movdqa %xmm7, %xmm1 + pslld $10, %xmm2 + psrld $11, %xmm1 + pxor %xmm2, %xmm7 + pxor %xmm1, %xmm7 + pslld $9, %xmm2 + psrld $9, %xmm1 + pxor %xmm2, %xmm7 + pxor %xmm1, %xmm7 + pslld $11, %xmm2 + pxor %xmm2, %xmm7 + paddd %xmm6, %xmm7 + + addq $16, %rax + cmpq $16*64, %rax + jne sha256_transform_4way_main_loop + + movdqu 0(%rdi), %xmm2 + movdqu 16(%rdi), %xmm6 + movdqu 32(%rdi), %xmm11 + movdqu 48(%rdi), %xmm1 + paddd %xmm2, %xmm7 + paddd %xmm6, %xmm5 + paddd %xmm11, %xmm4 + paddd %xmm1, %xmm3 + movdqu 64(%rdi), %xmm2 + movdqu 80(%rdi), %xmm6 + movdqu 96(%rdi), %xmm11 + movdqu 112(%rdi), %xmm1 + paddd %xmm2, %xmm0 + paddd %xmm6, %xmm8 + paddd %xmm11, %xmm9 + paddd %xmm1, %xmm10 + + movdqu %xmm7, 0(%rdi) + movdqu %xmm5, 16(%rdi) + movdqu %xmm4, 32(%rdi) + movdqu %xmm3, 48(%rdi) + movdqu %xmm0, 64(%rdi) + movdqu %xmm8, 80(%rdi) + movdqu %xmm9, 96(%rdi) + movdqu %xmm10, 112(%rdi) + + addq $1032, %rsp +#if defined(WIN64) + popq %rsi + movdqa 0(%rsp), %xmm6 + movdqa 16(%rsp), %xmm7 + movdqa 32(%rsp), %xmm8 + movdqa 48(%rsp), %xmm9 + movdqa 64(%rsp), %xmm10 + movdqa 80(%rsp), %xmm11 + addq $96, %rsp + popq %rdi +#endif + ret + .macro scrypt_shuffle src, so, dest, do movl \so+60(\src), %r8d @@ -187,7 +575,7 @@ .endm .text - .align 32 + .p2align 5 gen_salsa8_core: # 0: %rdx, %rdi, %rcx, %rsi movq 8(%rsp), %rdi @@ -286,7 +674,7 @@ gen_salsa8_core: .text - .align 32 + .p2align 5 .globl scrypt_core .globl _scrypt_core scrypt_core: @@ -559,7 +947,7 @@ gen_scrypt_core_loop2: xmm_salsa8_core_doubleround .endm - .align 32 + .p2align 5 xmm_scrypt_core: # shuffle 1st block into %xmm8-%xmm11 movl 60(%rdi), %edx @@ -871,7 +1259,7 @@ xmm_scrypt_core_loop2: .text - .align 32 + .p2align 5 .globl scrypt_best_throughput .globl _scrypt_best_throughput scrypt_best_throughput: @@ -1040,7 +1428,7 @@ scrypt_best_throughput_exit: .text - .align 32 + .p2align 5 .globl scrypt_core_2way .globl _scrypt_core_2way scrypt_core_2way: @@ -1509,7 +1897,7 @@ scrypt_core_2way_loop2: .endm .text - .align 32 + .p2align 5 .globl scrypt_core_3way .globl _scrypt_core_3way scrypt_core_3way: @@ -1694,7 +2082,7 @@ scrypt_core_3way_loop1: jne scrypt_core_3way_loop1 movq $1024, %r8 - .align 16 + .p2align 4 scrypt_core_3way_loop2: movl 64(%rsp), %ebp andl $1023, %ebp diff --git a/scrypt.c b/scrypt.c index 093ee72..413e4d3 100644 --- a/scrypt.c +++ b/scrypt.c @@ -1,5 +1,5 @@ /*- - * Copyright 2009 Colin Percival, 2011 ArtForz, 2011 pooler + * Copyright 2009 Colin Percival, 2011 ArtForz, 2011-2012 pooler * All rights reserved. * * Redistribution and use in source and binary forms, with or without @@ -34,58 +34,30 @@ #include #include -#define byteswap(x) ((((x) << 24) & 0xff000000u) | (((x) << 8) & 0x00ff0000u) | (((x) >> 8) & 0x0000ff00u) | (((x) >> 24) & 0x000000ffu)) +#define byteswap(x) ((((x) << 24) & 0xff000000u) | (((x) << 8) & 0x00ff0000u) \ + | (((x) >> 8) & 0x0000ff00u) | (((x) >> 24) & 0x000000ffu)) -static inline void -byteswap_vec(uint32_t *dest, const uint32_t *src, uint32_t len) +static inline void byteswap_vec(uint32_t *dest, const uint32_t *src, int len) { - uint32_t i; - + int i; for (i = 0; i < len; i++) dest[i] = byteswap(src[i]); } -static inline uint32_t be32dec(const void *pp) + +static inline void SHA256_InitState(uint32_t *state) { - const uint8_t *p = (uint8_t const *)pp; - - return ((uint32_t)(p[3]) + ((uint32_t)(p[2]) << 8) + - ((uint32_t)(p[1]) << 16) + ((uint32_t)(p[0]) << 24)); + /* Magic initialization constants */ + state[0] = 0x6A09E667; + state[1] = 0xBB67AE85; + state[2] = 0x3C6EF372; + state[3] = 0xA54FF53A; + state[4] = 0x510E527F; + state[5] = 0x9B05688C; + state[6] = 0x1F83D9AB; + state[7] = 0x5BE0CD19; } -static inline void be32enc(void *pp, uint32_t x) -{ - uint8_t * p = (uint8_t *)pp; - - p[3] = x & 0xff; - p[2] = (x >> 8) & 0xff; - p[1] = (x >> 16) & 0xff; - p[0] = (x >> 24) & 0xff; -} - -static inline uint32_t le32dec(const void *pp) -{ - const uint8_t *p = (uint8_t const *)pp; - - return ((uint32_t)(p[0]) + ((uint32_t)(p[1]) << 8) + - ((uint32_t)(p[2]) << 16) + ((uint32_t)(p[3]) << 24)); -} - -static inline void le32enc(void *pp, uint32_t x) -{ - uint8_t * p = (uint8_t *)pp; - - p[0] = x & 0xff; - p[1] = (x >> 8) & 0xff; - p[2] = (x >> 16) & 0xff; - p[3] = (x >> 24) & 0xff; -} - -typedef struct SHA256Context { - uint32_t state[8]; - uint32_t buf[16]; -} SHA256_CTX; - /* Elementary functions used by SHA256 */ #define Ch(x, y, z) ((x & (y ^ z)) ^ z) #define Maj(x, y, z) ((x & (y | z)) | (y & z)) @@ -115,8 +87,7 @@ typedef struct SHA256Context { * SHA256 block compression function. The 256-bit state is transformed via * the 512-bit input block to produce a new state. */ -static void -SHA256_Transform(uint32_t * state, const uint32_t block[16], int swap) +static void SHA256_Transform(uint32_t *state, const uint32_t *block, int swap) { uint32_t W[64]; uint32_t S[8]; @@ -124,12 +95,12 @@ SHA256_Transform(uint32_t * state, const uint32_t block[16], int swap) int i; /* 1. Prepare message schedule W. */ - if(swap) + if (swap) byteswap_vec(W, block, 16); else memcpy(W, block, 64); for (i = 16; i < 64; i += 2) { - W[i] = s1(W[i - 2]) + W[i - 7] + s0(W[i - 15]) + W[i - 16]; + W[i] = s1(W[i - 2]) + W[i - 7] + s0(W[i - 15]) + W[i - 16]; W[i+1] = s1(W[i - 1]) + W[i - 6] + s0(W[i - 14]) + W[i - 15]; } @@ -207,34 +178,37 @@ SHA256_Transform(uint32_t * state, const uint32_t block[16], int swap) state[i] += S[i]; } -static inline void -SHA256_InitState(uint32_t * state) -{ - /* Magic initialization constants */ - state[0] = 0x6A09E667; - state[1] = 0xBB67AE85; - state[2] = 0x3C6EF372; - state[3] = 0xA54FF53A; - state[4] = 0x510E527F; - state[5] = 0x9B05688C; - state[6] = 0x1F83D9AB; - state[7] = 0x5BE0CD19; -} +#if defined(__x86_64__) +#define SHA256_4WAY +void SHA256_Transform_4way(uint32_t *state, const uint32_t *block, int swap); +void SHA256_InitState_4way(uint32_t *state); +#endif -static const uint32_t passwdpad[12] = {0x00000080, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0x80020000}; -static const uint32_t outerpad[8] = {0x80000000, 0, 0, 0, 0, 0, 0, 0x00000300}; -static inline void -PBKDF2_SHA256_80_128_init(const uint32_t *passwd, uint32_t tstate[8], uint32_t ostate[8]) +static const uint32_t keypad[12] = { + 0x00000080, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0x80020000 +}; +static const uint32_t innerpad[11] = { + 0x80000000, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0x000004a0 +}; +static const uint32_t outerpad[8] = { + 0x80000000, 0, 0, 0, 0, 0, 0, 0x00000300 +}; +static const uint32_t finalblk[16] = { + 0x00000001, 0x80000000, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0x00000620 +}; + +static inline void HMAC_SHA256_80_init(const uint32_t *key, + uint32_t *tstate, uint32_t *ostate) { uint32_t ihash[8]; uint32_t pad[16]; - uint32_t i; + int i; SHA256_InitState(tstate); - SHA256_Transform(tstate, passwd, 1); - memcpy(pad, passwd+16, 16); - memcpy(pad+4, passwdpad, 48); + SHA256_Transform(tstate, key, 1); + memcpy(pad, key + 16, 16); + memcpy(pad + 4, keypad, 48); SHA256_Transform(tstate, pad, 1); memcpy(ihash, tstate, 32); @@ -253,120 +227,179 @@ PBKDF2_SHA256_80_128_init(const uint32_t *passwd, uint32_t tstate[8], uint32_t o SHA256_Transform(tstate, pad, 0); } -/** - * PBKDF2_SHA256(passwd, passwdlen, salt, saltlen, c, buf, dkLen): - * Compute PBKDF2(passwd, salt, c, dkLen) using HMAC-SHA256 as the PRF, and - * write the output to buf. - */ -static inline void -PBKDF2_SHA256_80_128(const uint32_t *tstate, const uint32_t *ostate, const uint32_t *passwd, uint32_t *buf) +static inline void PBKDF2_SHA256_80_128(const uint32_t *tstate, + const uint32_t *ostate, const uint32_t *salt, uint32_t *output) { - static const uint32_t innerpad[11] = {0x00000080, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0xa0040000}; - SHA256_CTX PShictx, PShoctx; - uint32_t i; - - /* If Klen > 64, the key is really SHA256(K). */ - memcpy(PShictx.state, tstate, 32); - memcpy(PShoctx.state, ostate, 32); - - memcpy(PShoctx.buf+8, outerpad, 32); + uint32_t istate[8], ostate2[8]; + uint32_t ibuf[16], obuf[16]; + int i; - SHA256_Transform(PShictx.state, passwd, 1); - byteswap_vec(PShictx.buf, passwd+16, 4); - byteswap_vec(PShictx.buf+5, innerpad, 11); + memcpy(istate, tstate, 32); + SHA256_Transform(istate, salt, 1); + + byteswap_vec(ibuf, salt + 16, 4); + memcpy(ibuf + 5, innerpad, 44); + memcpy(obuf + 8, outerpad, 32); - /* Iterate through the blocks. */ for (i = 0; i < 4; i++) { - uint32_t ist[8]; - uint32_t ost[8]; - - memcpy(ist, PShictx.state, 32); - PShictx.buf[4] = i + 1; - SHA256_Transform(ist, PShictx.buf, 0); - memcpy(PShoctx.buf, ist, 32); + memcpy(obuf, istate, 32); + ibuf[4] = i + 1; + SHA256_Transform(obuf, ibuf, 0); - memcpy(ost, PShoctx.state, 32); - SHA256_Transform(ost, PShoctx.buf, 0); - byteswap_vec(buf+i*8, ost, 8); + memcpy(ostate2, ostate, 32); + SHA256_Transform(ostate2, obuf, 0); + byteswap_vec(output + 8 * i, ostate2, 8); } } -static inline void -PBKDF2_SHA256_80_128_32(uint32_t *tstate, uint32_t *ostate, const uint32_t *passwd, const uint32_t *salt, uint32_t *output) +static inline void PBKDF2_SHA256_128_32(uint32_t *tstate, uint32_t *ostate, + const uint32_t *salt, uint32_t *output) { - static const uint32_t ihash_finalblk[16] = {0x00000001,0x80000000,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0x00000620}; - uint32_t pad[16]; + uint32_t buf[16]; SHA256_Transform(tstate, salt, 1); - SHA256_Transform(tstate, salt+16, 1); - SHA256_Transform(tstate, ihash_finalblk, 0); - memcpy(pad, tstate, 32); - memcpy(pad+8, outerpad, 32); + SHA256_Transform(tstate, salt + 16, 1); + SHA256_Transform(tstate, finalblk, 0); + memcpy(buf, tstate, 32); + memcpy(buf + 8, outerpad, 32); - SHA256_Transform(ostate, pad, 0); + SHA256_Transform(ostate, buf, 0); byteswap_vec(output, ostate, 8); } -/** - * salsa20_8(B): - * Apply the salsa20/8 core to the provided block. - */ -static inline void -salsa20_8(uint32_t B[16], const uint32_t Bx[16]) +#ifdef SHA256_4WAY + +static const uint32_t keypad_4way[4 * 12] = { + 0x00000080, 0x00000080, 0x00000080, 0x00000080, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x80020000, 0x80020000, 0x80020000, 0x80020000 +}; +static const uint32_t innerpad_4way[4 * 11] = { + 0x80000000, 0x80000000, 0x80000000, 0x80000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x000004a0, 0x000004a0, 0x000004a0, 0x000004a0 +}; +static const uint32_t outerpad_4way[4 * 8] = { + 0x80000000, 0x80000000, 0x80000000, 0x80000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000300, 0x00000300, 0x00000300, 0x00000300 +}; +static const uint32_t finalblk_4way[4 * 16] = { + 0x00000001, 0x00000001, 0x00000001, 0x00000001, + 0x80000000, 0x80000000, 0x80000000, 0x80000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000620, 0x00000620, 0x00000620, 0x00000620 +}; + +static inline void HMAC_SHA256_80_init_4way(const uint32_t *key, + uint32_t *tstate, uint32_t *ostate) { - uint32_t x00,x01,x02,x03,x04,x05,x06,x07,x08,x09,x10,x11,x12,x13,x14,x15; - size_t i; + uint32_t ihash[4 * 8]; + uint32_t pad[4 * 16]; + int i; - x00 = (B[ 0] ^= Bx[ 0]); - x01 = (B[ 1] ^= Bx[ 1]); - x02 = (B[ 2] ^= Bx[ 2]); - x03 = (B[ 3] ^= Bx[ 3]); - x04 = (B[ 4] ^= Bx[ 4]); - x05 = (B[ 5] ^= Bx[ 5]); - x06 = (B[ 6] ^= Bx[ 6]); - x07 = (B[ 7] ^= Bx[ 7]); - x08 = (B[ 8] ^= Bx[ 8]); - x09 = (B[ 9] ^= Bx[ 9]); - x10 = (B[10] ^= Bx[10]); - x11 = (B[11] ^= Bx[11]); - x12 = (B[12] ^= Bx[12]); - x13 = (B[13] ^= Bx[13]); - x14 = (B[14] ^= Bx[14]); - x15 = (B[15] ^= Bx[15]); - for (i = 0; i < 8; i += 2) { -#define R(a,b) (((a) << (b)) | ((a) >> (32 - (b)))) - /* Operate on columns. */ - x04 ^= R(x00+x12, 7); x09 ^= R(x05+x01, 7); x14 ^= R(x10+x06, 7); x03 ^= R(x15+x11, 7); - x08 ^= R(x04+x00, 9); x13 ^= R(x09+x05, 9); x02 ^= R(x14+x10, 9); x07 ^= R(x03+x15, 9); - x12 ^= R(x08+x04,13); x01 ^= R(x13+x09,13); x06 ^= R(x02+x14,13); x11 ^= R(x07+x03,13); - x00 ^= R(x12+x08,18); x05 ^= R(x01+x13,18); x10 ^= R(x06+x02,18); x15 ^= R(x11+x07,18); + SHA256_InitState_4way(tstate); + SHA256_Transform_4way(tstate, key, 1); + memcpy(pad, key + 4 * 16, 4 * 16); + memcpy(pad + 4 * 4, keypad_4way, 4 * 48); + SHA256_Transform_4way(tstate, pad, 1); + memcpy(ihash, tstate, 4 * 32); - /* Operate on rows. */ - x01 ^= R(x00+x03, 7); x06 ^= R(x05+x04, 7); x11 ^= R(x10+x09, 7); x12 ^= R(x15+x14, 7); - x02 ^= R(x01+x00, 9); x07 ^= R(x06+x05, 9); x08 ^= R(x11+x10, 9); x13 ^= R(x12+x15, 9); - x03 ^= R(x02+x01,13); x04 ^= R(x07+x06,13); x09 ^= R(x08+x11,13); x14 ^= R(x13+x12,13); - x00 ^= R(x03+x02,18); x05 ^= R(x04+x07,18); x10 ^= R(x09+x08,18); x15 ^= R(x14+x13,18); -#undef R - } - B[ 0] += x00; - B[ 1] += x01; - B[ 2] += x02; - B[ 3] += x03; - B[ 4] += x04; - B[ 5] += x05; - B[ 6] += x06; - B[ 7] += x07; - B[ 8] += x08; - B[ 9] += x09; - B[10] += x10; - B[11] += x11; - B[12] += x12; - B[13] += x13; - B[14] += x14; - B[15] += x15; + SHA256_InitState_4way(ostate); + for (i = 0; i < 4 * 8; i++) + pad[i] = ihash[i] ^ 0x5c5c5c5c; + for (; i < 4 * 16; i++) + pad[i] = 0x5c5c5c5c; + SHA256_Transform_4way(ostate, pad, 0); + + SHA256_InitState_4way(tstate); + for (i = 0; i < 4 * 8; i++) + pad[i] = ihash[i] ^ 0x36363636; + for (; i < 4 * 16; i++) + pad[i] = 0x36363636; + SHA256_Transform_4way(tstate, pad, 0); } +static inline void PBKDF2_SHA256_80_128_4way(const uint32_t *tstate, + const uint32_t *ostate, const uint32_t *salt, uint32_t *output) +{ + uint32_t istate[4 * 8], ostate2[4 * 8]; + uint32_t ibuf[4 * 16], obuf[4 * 16]; + int i; + + memcpy(istate, tstate, 4 * 32); + SHA256_Transform_4way(istate, salt, 1); + + byteswap_vec(ibuf, salt + 4 * 16, 4 * 4); + memcpy(ibuf + 4 * 5, innerpad_4way, 4 * 44); + memcpy(obuf + 4 * 8, outerpad_4way, 4 * 32); + + for (i = 0; i < 4; i++) { + memcpy(obuf, istate, 4 * 32); + ibuf[4 * 4 + 0] = i + 1; + ibuf[4 * 4 + 1] = i + 1; + ibuf[4 * 4 + 2] = i + 1; + ibuf[4 * 4 + 3] = i + 1; + SHA256_Transform_4way(obuf, ibuf, 0); + + memcpy(ostate2, ostate, 4 * 32); + SHA256_Transform_4way(ostate2, obuf, 0); + byteswap_vec(output + 4 * 8 * i, ostate2, 4 * 8); + } +} + +static inline void PBKDF2_SHA256_128_32_4way(uint32_t *tstate, + uint32_t *ostate, const uint32_t *salt, uint32_t *output) +{ + uint32_t buf[4 * 16]; + + SHA256_Transform_4way(tstate, salt, 1); + SHA256_Transform_4way(tstate, salt + 4 * 16, 1); + SHA256_Transform_4way(tstate, finalblk_4way, 0); + memcpy(buf, tstate, 4 * 32); + memcpy(buf + 4 * 8, outerpad_4way, 4 * 32); + + SHA256_Transform_4way(ostate, buf, 0); + byteswap_vec(output, ostate, 4 * 8); +} + +#endif /* SHA256_4WAY */ + #if defined(__x86_64__) @@ -388,38 +421,90 @@ void scrypt_core(uint32_t *X, uint32_t *V); #define SCRYPT_BUFFER_SIZE (131072 + 63) +static inline void salsa20_8(uint32_t B[16], const uint32_t Bx[16]) +{ + uint32_t x00,x01,x02,x03,x04,x05,x06,x07,x08,x09,x10,x11,x12,x13,x14,x15; + int i; + + x00 = (B[ 0] ^= Bx[ 0]); + x01 = (B[ 1] ^= Bx[ 1]); + x02 = (B[ 2] ^= Bx[ 2]); + x03 = (B[ 3] ^= Bx[ 3]); + x04 = (B[ 4] ^= Bx[ 4]); + x05 = (B[ 5] ^= Bx[ 5]); + x06 = (B[ 6] ^= Bx[ 6]); + x07 = (B[ 7] ^= Bx[ 7]); + x08 = (B[ 8] ^= Bx[ 8]); + x09 = (B[ 9] ^= Bx[ 9]); + x10 = (B[10] ^= Bx[10]); + x11 = (B[11] ^= Bx[11]); + x12 = (B[12] ^= Bx[12]); + x13 = (B[13] ^= Bx[13]); + x14 = (B[14] ^= Bx[14]); + x15 = (B[15] ^= Bx[15]); + for (i = 0; i < 8; i += 2) { +#define R(a, b) (((a) << (b)) | ((a) >> (32 - (b)))) + /* Operate on columns. */ + x04 ^= R(x00+x12, 7); x09 ^= R(x05+x01, 7); + x14 ^= R(x10+x06, 7); x03 ^= R(x15+x11, 7); + + x08 ^= R(x04+x00, 9); x13 ^= R(x09+x05, 9); + x02 ^= R(x14+x10, 9); x07 ^= R(x03+x15, 9); + + x12 ^= R(x08+x04,13); x01 ^= R(x13+x09,13); + x06 ^= R(x02+x14,13); x11 ^= R(x07+x03,13); + + x00 ^= R(x12+x08,18); x05 ^= R(x01+x13,18); + x10 ^= R(x06+x02,18); x15 ^= R(x11+x07,18); + + /* Operate on rows. */ + x01 ^= R(x00+x03, 7); x06 ^= R(x05+x04, 7); + x11 ^= R(x10+x09, 7); x12 ^= R(x15+x14, 7); + + x02 ^= R(x01+x00, 9); x07 ^= R(x06+x05, 9); + x08 ^= R(x11+x10, 9); x13 ^= R(x12+x15, 9); + + x03 ^= R(x02+x01,13); x04 ^= R(x07+x06,13); + x09 ^= R(x08+x11,13); x14 ^= R(x13+x12,13); + + x00 ^= R(x03+x02,18); x05 ^= R(x04+x07,18); + x10 ^= R(x09+x08,18); x15 ^= R(x14+x13,18); +#undef R + } + B[ 0] += x00; + B[ 1] += x01; + B[ 2] += x02; + B[ 3] += x03; + B[ 4] += x04; + B[ 5] += x05; + B[ 6] += x06; + B[ 7] += x07; + B[ 8] += x08; + B[ 9] += x09; + B[10] += x10; + B[11] += x11; + B[12] += x12; + B[13] += x13; + B[14] += x14; + B[15] += x15; +} + static inline void scrypt_core(uint32_t *X, uint32_t *V) { - uint32_t i; - uint32_t j; - uint32_t k; + uint32_t i, j, k; uint64_t *p1, *p2; + p1 = (uint64_t *)X; - for (i = 0; i < 1024; i += 2) { + for (i = 0; i < 1024; i++) { memcpy(&V[i * 32], X, 128); - - salsa20_8(&X[0], &X[16]); - salsa20_8(&X[16], &X[0]); - - memcpy(&V[(i + 1) * 32], X, 128); - salsa20_8(&X[0], &X[16]); salsa20_8(&X[16], &X[0]); } - for (i = 0; i < 1024; i += 2) { + for (i = 0; i < 1024; i++) { j = X[16] & 1023; p2 = (uint64_t *)(&V[j * 32]); - for(k = 0; k < 16; k++) + for (k = 0; k < 16; k++) p1[k] ^= p2[k]; - - salsa20_8(&X[0], &X[16]); - salsa20_8(&X[16], &X[0]); - - j = X[16] & 1023; - p2 = (uint64_t *)(&V[j * 32]); - for(k = 0; k < 16; k++) - p1[k] ^= p2[k]; - salsa20_8(&X[0], &X[16]); salsa20_8(&X[16], &X[0]); } @@ -427,33 +512,32 @@ static inline void scrypt_core(uint32_t *X, uint32_t *V) #endif -unsigned char *scrypt_buffer_alloc() { +unsigned char *scrypt_buffer_alloc() +{ return malloc(SCRYPT_BUFFER_SIZE); } -/* cpu and memory intensive function to transform a 80 byte buffer into a 32 byte output - scratchpad size needs to be at least 63 + (128 * r * p) + (256 * r + 64) + (128 * r * N) bytes - r = 1, p = 1, N = 1024 - */ -static void scrypt_1024_1_1_256_sp(const uint32_t* input, uint32_t *output, unsigned char *scratchpad) +static void scrypt_1024_1_1_256_sp(const uint32_t *input, uint32_t *output, + unsigned char *scratchpad) { uint32_t tstate[8], ostate[8]; uint32_t *V; uint32_t X[32]; V = (uint32_t *)(((uintptr_t)(scratchpad) + 63) & ~ (uintptr_t)(63)); - PBKDF2_SHA256_80_128_init(input, tstate, ostate); + HMAC_SHA256_80_init(input, tstate, ostate); PBKDF2_SHA256_80_128(tstate, ostate, input, X); scrypt_core(X, V); - return PBKDF2_SHA256_80_128_32(tstate, ostate, input, X, output); + return PBKDF2_SHA256_128_32(tstate, ostate, X, output); } #ifdef SCRYPT_3WAY -static void scrypt_1024_1_1_256_sp_2way(const uint32_t *input1, const uint32_t *input2, - uint32_t *output1, uint32_t *output2, unsigned char *scratchpad) +static void scrypt_1024_1_1_256_sp_2way(const uint32_t *input1, + const uint32_t *input2, uint32_t *output1, uint32_t *output2, + unsigned char *scratchpad) { uint32_t tstate1[8], tstate2[8]; uint32_t ostate1[8], ostate2[8]; @@ -461,47 +545,86 @@ static void scrypt_1024_1_1_256_sp_2way(const uint32_t *input1, const uint32_t * uint32_t X[32], Y[32]; V = (uint32_t *)(((uintptr_t)(scratchpad) + 63) & ~ (uintptr_t)(63)); - PBKDF2_SHA256_80_128_init(input1, tstate1, ostate1); - PBKDF2_SHA256_80_128_init(input2, tstate2, ostate2); + HMAC_SHA256_80_init(input1, tstate1, ostate1); + HMAC_SHA256_80_init(input2, tstate2, ostate2); PBKDF2_SHA256_80_128(tstate1, ostate1, input1, X); PBKDF2_SHA256_80_128(tstate2, ostate2, input2, Y); scrypt_core_2way(X, Y, V); - PBKDF2_SHA256_80_128_32(tstate1, ostate1, input1, X, output1); - PBKDF2_SHA256_80_128_32(tstate2, ostate2, input2, Y, output2); + PBKDF2_SHA256_128_32(tstate1, ostate1, X, output1); + PBKDF2_SHA256_128_32(tstate2, ostate2, Y, output2); } -static void scrypt_1024_1_1_256_sp_3way(const uint32_t *input1, const uint32_t *input2, const uint32_t *input3, - uint32_t *output1, uint32_t *output2, uint32_t *output3, unsigned char *scratchpad) +static void scrypt_1024_1_1_256_sp_3way( + const uint32_t *input1, const uint32_t *input2, const uint32_t *input3, + uint32_t *output1, uint32_t *output2, uint32_t *output3, + unsigned char *scratchpad) { - uint32_t tstate1[8], tstate2[8], tstate3[8]; - uint32_t ostate1[8], ostate2[8], ostate3[8]; - uint32_t *V; +#ifdef SHA256_4WAY + uint32_t tstate[4 * 8], ostate[4 * 8]; + uint32_t input[4 * 20], output[4 * 32]; uint32_t X[32], Y[32], Z[32]; + uint32_t W[4 * 32]; + uint32_t *V; + int i; + V = (uint32_t *)(((uintptr_t)(scratchpad) + 63) & ~ (uintptr_t)(63)); - PBKDF2_SHA256_80_128_init(input1, tstate1, ostate1); - PBKDF2_SHA256_80_128_init(input2, tstate2, ostate2); - PBKDF2_SHA256_80_128_init(input3, tstate3, ostate3); + for (i = 0; i < 20; i++) { + input[4 * i + 0] = input1[i]; + input[4 * i + 1] = input2[i]; + input[4 * i + 2] = input3[i]; + } + HMAC_SHA256_80_init_4way(input, tstate, ostate); + PBKDF2_SHA256_80_128_4way(tstate, ostate, input, W); + for (i = 0; i < 32; i++) { + X[i] = W[4 * i + 0]; + Y[i] = W[4 * i + 1]; + Z[i] = W[4 * i + 2]; + } + scrypt_core_3way(X, Y, Z, V); + for (i = 0; i < 32; i++) { + W[4 * i + 0] = X[i]; + W[4 * i + 1] = Y[i]; + W[4 * i + 2] = Z[i]; + } + PBKDF2_SHA256_128_32_4way(tstate, ostate, W, output); + for (i = 0; i < 8; i++) { + output1[i] = output[4 * i + 0]; + output2[i] = output[4 * i + 1]; + output3[i] = output[4 * i + 2]; + } +#else + uint32_t tstate1[8], tstate2[8], tstate3[8]; + uint32_t ostate1[8], ostate2[8], ostate3[8]; + uint32_t X[32], Y[32], Z[32]; + uint32_t *V; + + V = (uint32_t *)(((uintptr_t)(scratchpad) + 63) & ~ (uintptr_t)(63)); + + HMAC_SHA256_80_init(input1, tstate1, ostate1); + HMAC_SHA256_80_init(input2, tstate2, ostate2); + HMAC_SHA256_80_init(input3, tstate3, ostate3); PBKDF2_SHA256_80_128(tstate1, ostate1, input1, X); PBKDF2_SHA256_80_128(tstate2, ostate2, input2, Y); PBKDF2_SHA256_80_128(tstate3, ostate3, input3, Z); scrypt_core_3way(X, Y, Z, V); - PBKDF2_SHA256_80_128_32(tstate1, ostate1, input1, X, output1); - PBKDF2_SHA256_80_128_32(tstate2, ostate2, input2, Y, output2); - PBKDF2_SHA256_80_128_32(tstate3, ostate3, input3, Z, output3); + PBKDF2_SHA256_128_32(tstate1, ostate1, X, output1); + PBKDF2_SHA256_128_32(tstate2, ostate2, Y, output2); + PBKDF2_SHA256_128_32(tstate3, ostate3, Z, output3); +#endif /* SHA256_4WAY*/ } -#endif +#endif /* SCRYPT_3WAY */ -__attribute__ ((noinline)) static int test_lower_hash(const uint32_t *hash, +__attribute__ ((noinline)) static int confirm_hash(const uint32_t *hash, const uint32_t *target) { int i; - for (i = 6; i >= 0; i--) { + for (i = 7; i >= 0; i--) { uint32_t t = le32dec(&target[i]); if (hash[i] > t) return 0; @@ -511,8 +634,8 @@ __attribute__ ((noinline)) static int test_lower_hash(const uint32_t *hash, return 1; } -int scanhash_scrypt(int thr_id, unsigned char *pdata, unsigned char *scratchbuf, - const unsigned char *ptarget, +int scanhash_scrypt(int thr_id, unsigned char *pdata, + unsigned char *scratchbuf, const unsigned char *ptarget, uint32_t max_nonce, uint32_t *next_nonce, unsigned long *hashes_done) { uint32_t data[20], hash[8]; @@ -542,7 +665,7 @@ int scanhash_scrypt(int thr_id, unsigned char *pdata, unsigned char *scratchbuf, if (throughput >= 3 && n <= max_nonce) { data3[19] = n++; scrypt_1024_1_1_256_sp_3way(data, data2, data3, hash, hash2, hash3, scratchbuf); - if (hash3[7] < Htarg || (hash3[7] == Htarg && test_lower_hash(hash3, (uint32_t *)ptarget))) { + if (hash3[7] <= Htarg && confirm_hash(hash3, (uint32_t *)ptarget)) { be32enc(&((uint32_t *)pdata)[19], data3[19]); *next_nonce = n; *hashes_done = n - first_nonce; @@ -551,7 +674,7 @@ int scanhash_scrypt(int thr_id, unsigned char *pdata, unsigned char *scratchbuf, } else { scrypt_1024_1_1_256_sp_2way(data, data2, hash, hash2, scratchbuf); } - if (hash2[7] < Htarg || (hash2[7] == Htarg && test_lower_hash(hash2, (uint32_t *)ptarget))) { + if (hash2[7] <= Htarg && confirm_hash(hash2, (uint32_t *)ptarget)) { be32enc(&((uint32_t *)pdata)[19], data2[19]); *next_nonce = n; *hashes_done = n - first_nonce; @@ -563,7 +686,7 @@ int scanhash_scrypt(int thr_id, unsigned char *pdata, unsigned char *scratchbuf, #else scrypt_1024_1_1_256_sp(data, hash, scratchbuf); #endif - if (hash[7] < Htarg || (hash[7] == Htarg && test_lower_hash(hash, (uint32_t *)ptarget))) { + if (hash[7] <= Htarg && confirm_hash(hash, (uint32_t *)ptarget)) { be32enc(&((uint32_t *)pdata)[19], data[19]); *next_nonce = n; *hashes_done = n - first_nonce; @@ -575,4 +698,3 @@ int scanhash_scrypt(int thr_id, unsigned char *pdata, unsigned char *scratchbuf, *hashes_done = n - first_nonce; return false; } -