diff --git a/cpu-miner.c b/cpu-miner.c index 032a04a..2fea8e2 100644 --- a/cpu-miner.c +++ b/cpu-miner.c @@ -84,10 +84,12 @@ struct workio_cmd { enum sha256_algos { ALGO_SCRYPT, /* scrypt(1024,1,1) */ + ALGO_SHA256D, /* SHA-256d */ }; static const char *algo_names[] = { [ALGO_SCRYPT] = "scrypt", + [ALGO_SHA256D] = "sha256d", }; bool opt_debug = false; @@ -125,6 +127,9 @@ double *thr_hashrates; static char const usage[] = "\ Usage: " PROGRAM_NAME " [OPTIONS]\n\ Options:\n\ + -a, --algo=ALGO specify the algorithm to use\n\ + scrypt scrypt(1024, 1, 1) (default)\n\ + sha256d SHA-256d\n\ -o, --url=URL URL of mining server (default: " DEF_RPC_URL ")\n\ -O, --userpass=U:P username:password pair for mining server\n\ -u, --user=USERNAME username for mining server\n\ @@ -223,7 +228,7 @@ static bool work_decode(const json_t *val, struct work *work) } for (i = 0; i < ARRAY_SIZE(work->data); i++) - work->data[i] = be32dec(work->data + i); + work->data[i] = le32dec(work->data + i); for (i = 0; i < ARRAY_SIZE(work->target); i++) work->target[i] = le32dec(work->target + i); @@ -251,7 +256,7 @@ static bool submit_upstream_work(CURL *curl, struct work *work) /* build hex string */ for (i = 0; i < ARRAY_SIZE(work->data); i++) - be32enc(work->data + i, work->data[i]); + le32enc(work->data + i, work->data[i]); hexstr = bin2hex((unsigned char *)work->data, sizeof(work->data)); if (unlikely(!hexstr)) { applog(LOG_ERR, "submit_upstream_work OOM"); @@ -544,7 +549,7 @@ static void *miner_thread(void *userdata) - time(NULL); max64 *= thr_hashrates[thr_id]; if (max64 <= 0) - max64 = 0xfffLL; + max64 = opt_algo == ALGO_SCRYPT ? 0xfffLL : 0xfffffLL; if (work.data[19] + max64 > end_nonce) max_nonce = end_nonce; else @@ -560,6 +565,11 @@ static void *miner_thread(void *userdata) max_nonce, &hashes_done); break; + case ALGO_SHA256D: + rc = scanhash_sha256d(thr_id, work.data, work.target, + max_nonce, &hashes_done); + break; + default: /* should never happen */ goto out; diff --git a/miner.h b/miner.h index d0de194..2307729 100644 --- a/miner.h +++ b/miner.h @@ -113,11 +113,14 @@ void sha256_init(uint32_t *state); void sha256_transform(uint32_t *state, const uint32_t *block, int swap); #if defined(__x86_64__) -#define SHA256_4WAY +#define SHA256_4WAY 1 void sha256_init_4way(uint32_t *state); void sha256_transform_4way(uint32_t *state, const uint32_t *block, int swap); #endif +extern int scanhash_sha256d(int thr_id, uint32_t *pdata, + const uint32_t *ptarget, uint32_t max_nonce, unsigned long *hashes_done); + extern unsigned char *scrypt_buffer_alloc(); extern int scanhash_scrypt(int thr_id, uint32_t *pdata, unsigned char *scratchbuf, const uint32_t *ptarget, diff --git a/scrypt.c b/scrypt.c index 0830737..3ae53ae 100644 --- a/scrypt.c +++ b/scrypt.c @@ -35,7 +35,7 @@ #include static const uint32_t keypad[12] = { - 0x00000080, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0x80020000 + 0x80000000, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0x00000280 }; static const uint32_t innerpad[11] = { 0x80000000, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0x000004a0 @@ -57,7 +57,7 @@ static inline void HMAC_SHA256_80_init(const uint32_t *key, /* tstate is assumed to contain the midstate of key */ memcpy(pad, key + 16, 16); memcpy(pad + 4, keypad, 48); - sha256_transform(tstate, pad, 1); + sha256_transform(tstate, pad, 0); memcpy(ihash, tstate, 32); sha256_init(ostate); @@ -83,10 +83,9 @@ static inline void PBKDF2_SHA256_80_128(const uint32_t *tstate, int i, j; memcpy(istate, tstate, 32); - sha256_transform(istate, salt, 1); + sha256_transform(istate, salt, 0); - for (i = 0; i < 
4; i++) - ibuf[i] = swab32(salt[16 + i]); + memcpy(ibuf, salt + 16, 16); memcpy(ibuf + 5, innerpad, 44); memcpy(obuf + 8, outerpad, 32); @@ -123,7 +122,7 @@ static inline void PBKDF2_SHA256_128_32(uint32_t *tstate, uint32_t *ostate, #ifdef SHA256_4WAY static const uint32_t keypad_4way[4 * 12] = { - 0x00000080, 0x00000080, 0x00000080, 0x00000080, + 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, @@ -134,7 +133,7 @@ static const uint32_t keypad_4way[4 * 12] = { 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, - 0x80020000, 0x80020000, 0x80020000, 0x80020000 + 0x00000280, 0x00000280, 0x00000280, 0x00000280 }; static const uint32_t innerpad_4way[4 * 11] = { 0x80000000, 0x80000000, 0x80000000, 0x80000000, @@ -159,7 +158,7 @@ static const uint32_t outerpad_4way[4 * 8] = { 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000300, 0x00000300, 0x00000300, 0x00000300 }; -static const uint32_t finalblk_4way[4 * 16] = { +static const uint32_t finalblk_4way[4 * 16] __attribute__((aligned(16))) = { 0x00000001, 0x00000001, 0x00000001, 0x00000001, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, @@ -181,14 +180,14 @@ static const uint32_t finalblk_4way[4 * 16] = { static inline void HMAC_SHA256_80_init_4way(const uint32_t *key, uint32_t *tstate, uint32_t *ostate) { - uint32_t ihash[4 * 8]; - uint32_t pad[4 * 16]; + uint32_t ihash[4 * 8] __attribute__((aligned(16))); + uint32_t pad[4 * 16] __attribute__((aligned(16))); int i; /* tstate is assumed to contain the midstate of key */ memcpy(pad, key + 4 * 16, 4 * 16); memcpy(pad + 4 * 4, keypad_4way, 4 * 48); - sha256_transform_4way(tstate, pad, 1); + sha256_transform_4way(tstate, pad, 0); memcpy(ihash, tstate, 4 * 32); sha256_init_4way(ostate); @@ -209,15 +208,16 @@ static inline void HMAC_SHA256_80_init_4way(const uint32_t *key, static inline void PBKDF2_SHA256_80_128_4way(const uint32_t *tstate, const uint32_t *ostate, const uint32_t *salt, uint32_t *output) { - uint32_t istate[4 * 8], ostate2[4 * 8]; - uint32_t ibuf[4 * 16], obuf[4 * 16]; + uint32_t istate[4 * 8] __attribute__((aligned(16))); + uint32_t ostate2[4 * 8] __attribute__((aligned(16))); + uint32_t ibuf[4 * 16] __attribute__((aligned(16))); + uint32_t obuf[4 * 16] __attribute__((aligned(16))); int i, j; memcpy(istate, tstate, 4 * 32); - sha256_transform_4way(istate, salt, 1); + sha256_transform_4way(istate, salt, 0); - for (i = 0; i < 4 * 4; i++) - ibuf[i] = swab32(salt[4 * 16 + i]); + memcpy(ibuf, salt + 4 * 16, 4 * 16); memcpy(ibuf + 4 * 5, innerpad_4way, 4 * 44); memcpy(obuf + 4 * 8, outerpad_4way, 4 * 32); @@ -239,7 +239,7 @@ static inline void PBKDF2_SHA256_80_128_4way(const uint32_t *tstate, static inline void PBKDF2_SHA256_128_32_4way(uint32_t *tstate, uint32_t *ostate, const uint32_t *salt, uint32_t *output) { - uint32_t buf[4 * 16]; + uint32_t buf[4 * 16] __attribute__((aligned(16))); int i; sha256_transform_4way(tstate, salt, 1); @@ -270,7 +270,7 @@ void scrypt_core(uint32_t *X, uint32_t *V); #else -static inline void salsa20_8(uint32_t B[16], const uint32_t Bx[16]) +static inline void xor_salsa8(uint32_t B[16], const uint32_t Bx[16]) { uint32_t x00,x01,x02,x03,x04,x05,x06,x07,x08,x09,x10,x11,x12,x13,x14,x15; int i; @@ -341,21 +341,18 @@ static inline void salsa20_8(uint32_t B[16], const uint32_t Bx[16]) 
static inline void scrypt_core(uint32_t *X, uint32_t *V) { uint32_t i, j, k; - uint64_t *p1, *p2; - p1 = (uint64_t *)X; for (i = 0; i < 1024; i++) { memcpy(&V[i * 32], X, 128); - salsa20_8(&X[0], &X[16]); - salsa20_8(&X[16], &X[0]); + xor_salsa8(&X[0], &X[16]); + xor_salsa8(&X[16], &X[0]); } for (i = 0; i < 1024; i++) { - j = X[16] & 1023; - p2 = (uint64_t *)(&V[j * 32]); - for (k = 0; k < 16; k++) - p1[k] ^= p2[k]; - salsa20_8(&X[0], &X[16]); - salsa20_8(&X[16], &X[0]); + j = 32 * (X[16] & 1023); + for (k = 0; k < 32; k++) + X[k] ^= V[j + k]; + xor_salsa8(&X[0], &X[16]); + xor_salsa8(&X[16], &X[0]); } } @@ -377,8 +374,9 @@ static void scrypt_1024_1_1_256_sp(const uint32_t *input, uint32_t *output, uint32_t *midstate, unsigned char *scratchpad) { uint32_t tstate[8], ostate[8]; - uint32_t *V; uint32_t X[32]; + uint32_t *V; + V = (uint32_t *)(((uintptr_t)(scratchpad) + 63) & ~ (uintptr_t)(63)); memcpy(tstate, midstate, 32); @@ -396,8 +394,8 @@ static void scrypt_1024_1_1_256_sp_2way(const uint32_t *input, { uint32_t tstate1[8], tstate2[8]; uint32_t ostate1[8], ostate2[8]; - uint32_t *V; uint32_t X[2 * 32], *Y = X + 32; + uint32_t *V; V = (uint32_t *)(((uintptr_t)(scratchpad) + 63) & ~ (uintptr_t)(63)); @@ -419,9 +417,10 @@ static void scrypt_1024_1_1_256_sp_2way(const uint32_t *input, static void scrypt_1024_1_1_256_sp_3way(const uint32_t *input, uint32_t *output, uint32_t *midstate, unsigned char *scratchpad) { - uint32_t tstate[4 * 8], ostate[4 * 8]; - uint32_t X[3 * 32]; - uint32_t W[4 * 32]; + uint32_t tstate[4 * 8] __attribute__((aligned(128))); + uint32_t ostate[4 * 8] __attribute__((aligned(128))); + uint32_t W[4 * 32] __attribute__((aligned(128))); + uint32_t X[3 * 32] __attribute__((aligned(128))); uint32_t *V; int i; @@ -474,7 +473,7 @@ int scanhash_scrypt(int thr_id, uint32_t *pdata, memcpy(data + i * 20, pdata, 80); sha256_init(midstate); - sha256_transform(midstate, data, 1); + sha256_transform(midstate, data, 0); do { for (i = 0; i < throughput; i++) diff --git a/sha2-x64.S b/sha2-x64.S index 449b27c..5dbc73d 100644 --- a/sha2-x64.S +++ b/sha2-x64.S @@ -16,7 +16,7 @@ #if defined(__x86_64__) .data - .p2align 6 + .p2align 7 sha256_4h: .long 0x6a09e667, 0x6a09e667, 0x6a09e667, 0x6a09e667 .long 0xbb67ae85, 0xbb67ae85, 0xbb67ae85, 0xbb67ae85 @@ -28,7 +28,7 @@ sha256_4h: .long 0x5be0cd19, 0x5be0cd19, 0x5be0cd19, 0x5be0cd19 .data - .p2align 6 + .p2align 7 sha256_4k: .long 0x428a2f98, 0x428a2f98, 0x428a2f98, 0x428a2f98 .long 0x71374491, 0x71374491, 0x71374491, 0x71374491 @@ -126,6 +126,537 @@ _sha256_init_4way: #endif ret + +.macro sha256_sse2_extend_round i + movdqa (\i-15)*16(%rcx), %xmm0 + movdqa (\i-14)*16(%rcx), %xmm4 + movdqa %xmm0, %xmm2 + movdqa %xmm4, %xmm6 + psrld $3, %xmm0 + psrld $3, %xmm4 + movdqa %xmm0, %xmm1 + movdqa %xmm4, %xmm5 + pslld $14, %xmm2 + pslld $14, %xmm6 + psrld $4, %xmm1 + psrld $4, %xmm5 + pxor %xmm1, %xmm0 + pxor %xmm5, %xmm4 + psrld $11, %xmm1 + psrld $11, %xmm5 + pxor %xmm2, %xmm0 + pxor %xmm6, %xmm4 + pslld $11, %xmm2 + pslld $11, %xmm6 + pxor %xmm1, %xmm0 + pxor %xmm5, %xmm4 + pxor %xmm2, %xmm0 + pxor %xmm6, %xmm4 + + movdqa (\i-2)*16(%rcx), %xmm3 + movdqa (\i-1)*16(%rcx), %xmm7 + paddd (\i-16)*16(%rcx), %xmm0 + paddd (\i-15)*16(%rcx), %xmm4 + + movdqa %xmm3, %xmm2 + movdqa %xmm7, %xmm6 + psrld $10, %xmm3 + psrld $10, %xmm7 + movdqa %xmm3, %xmm1 + movdqa %xmm7, %xmm5 + + paddd (\i-7)*16(%rcx), %xmm0 + + pslld $13, %xmm2 + pslld $13, %xmm6 + psrld $7, %xmm1 + psrld $7, %xmm5 + + paddd (\i-6)*16(%rcx), %xmm4 + + pxor %xmm1, %xmm3 + pxor %xmm5, %xmm7 + 
psrld $2, %xmm1 + psrld $2, %xmm5 + pxor %xmm2, %xmm3 + pxor %xmm6, %xmm7 + pslld $2, %xmm2 + pslld $2, %xmm6 + pxor %xmm1, %xmm3 + pxor %xmm5, %xmm7 + pxor %xmm2, %xmm3 + pxor %xmm6, %xmm7 + + paddd %xmm3, %xmm0 + paddd %xmm7, %xmm4 + movdqa %xmm0, \i*16(%rcx) + movdqa %xmm4, (\i+1)*16(%rcx) +.endm + + .text + .p2align 6 +sha256_sse2_extend_loop: + sha256_sse2_extend_round 0 + sha256_sse2_extend_round 2 + sha256_sse2_extend_round 4 + sha256_sse2_extend_round 6 + sha256_sse2_extend_round 8 + sha256_sse2_extend_round 10 + sha256_sse2_extend_round 12 + sha256_sse2_extend_round 14 + sha256_sse2_extend_round 16 + sha256_sse2_extend_round 18 + sha256_sse2_extend_round 20 + sha256_sse2_extend_round 22 + sha256_sse2_extend_round 24 + sha256_sse2_extend_round 26 + sha256_sse2_extend_round 28 + sha256_sse2_extend_round 30 + sha256_sse2_extend_round 32 + sha256_sse2_extend_round 34 + sha256_sse2_extend_round 36 + sha256_sse2_extend_round 38 + sha256_sse2_extend_round 40 + sha256_sse2_extend_round 42 + sha256_sse2_extend_round 44 + sha256_sse2_extend_round 46 + ret + +.macro sha256_sse2_main_round i + movdqa 16*\i(%rax), %xmm6 + paddd 16*\i(%rcx), %xmm6 + paddd %xmm10, %xmm6 + + movdqa %xmm0, %xmm1 + movdqa %xmm9, %xmm2 + pandn %xmm2, %xmm1 + + movdqa %xmm2, %xmm10 + movdqa %xmm8, %xmm2 + movdqa %xmm2, %xmm9 + + pand %xmm0, %xmm2 + pxor %xmm2, %xmm1 + movdqa %xmm0, %xmm8 + + paddd %xmm1, %xmm6 + + movdqa %xmm0, %xmm1 + psrld $6, %xmm0 + movdqa %xmm0, %xmm2 + pslld $7, %xmm1 + psrld $5, %xmm2 + pxor %xmm1, %xmm0 + pxor %xmm2, %xmm0 + pslld $14, %xmm1 + psrld $14, %xmm2 + pxor %xmm1, %xmm0 + pxor %xmm2, %xmm0 + pslld $5, %xmm1 + pxor %xmm1, %xmm0 + paddd %xmm0, %xmm6 + + movdqa %xmm3, %xmm0 + paddd %xmm6, %xmm0 + + movdqa %xmm5, %xmm1 + movdqa %xmm4, %xmm3 + movdqa %xmm4, %xmm2 + pand %xmm5, %xmm2 + pand %xmm7, %xmm4 + pand %xmm7, %xmm1 + pxor %xmm4, %xmm1 + movdqa %xmm5, %xmm4 + movdqa %xmm7, %xmm5 + pxor %xmm2, %xmm1 + paddd %xmm1, %xmm6 + + movdqa %xmm7, %xmm2 + psrld $2, %xmm7 + movdqa %xmm7, %xmm1 + pslld $10, %xmm2 + psrld $11, %xmm1 + pxor %xmm2, %xmm7 + pxor %xmm1, %xmm7 + pslld $9, %xmm2 + psrld $9, %xmm1 + pxor %xmm2, %xmm7 + pxor %xmm1, %xmm7 + pslld $11, %xmm2 + pxor %xmm2, %xmm7 + paddd %xmm6, %xmm7 +.endm + + .text + .p2align 6 +sha256_sse2_main_loop: + sha256_sse2_main_round 0 + sha256_sse2_main_round 1 + sha256_sse2_main_round 2 + sha256_sse2_main_round 3 + sha256_sse2_main_round 4 + sha256_sse2_main_round 5 + sha256_sse2_main_round 6 + sha256_sse2_main_round 7 + sha256_sse2_main_round 8 + sha256_sse2_main_round 9 + sha256_sse2_main_round 10 + sha256_sse2_main_round 11 + sha256_sse2_main_round 12 + sha256_sse2_main_round 13 + sha256_sse2_main_round 14 + sha256_sse2_main_round 15 + sha256_sse2_main_round 16 + sha256_sse2_main_round 17 + sha256_sse2_main_round 18 + sha256_sse2_main_round 19 + sha256_sse2_main_round 20 + sha256_sse2_main_round 21 + sha256_sse2_main_round 22 + sha256_sse2_main_round 23 + sha256_sse2_main_round 24 + sha256_sse2_main_round 25 + sha256_sse2_main_round 26 + sha256_sse2_main_round 27 + sha256_sse2_main_round 28 + sha256_sse2_main_round 29 + sha256_sse2_main_round 30 + sha256_sse2_main_round 31 + sha256_sse2_main_round 32 + sha256_sse2_main_round 33 + sha256_sse2_main_round 34 + sha256_sse2_main_round 35 + sha256_sse2_main_round 36 + sha256_sse2_main_round 37 + sha256_sse2_main_round 38 + sha256_sse2_main_round 39 + sha256_sse2_main_round 40 + sha256_sse2_main_round 41 + sha256_sse2_main_round 42 + sha256_sse2_main_round 43 + sha256_sse2_main_round 44 + 
sha256_sse2_main_round 45 + sha256_sse2_main_round 46 + sha256_sse2_main_round 47 + sha256_sse2_main_round 48 + sha256_sse2_main_round 49 + sha256_sse2_main_round 50 + sha256_sse2_main_round 51 + sha256_sse2_main_round 52 + sha256_sse2_main_round 53 + sha256_sse2_main_round 54 + sha256_sse2_main_round 55 + sha256_sse2_main_round 56 + sha256_sse2_main_round 57 + sha256_sse2_main_round 58 + sha256_sse2_main_round 59 + sha256_sse2_main_round 60 + sha256_sse2_main_round 61 + sha256_sse2_main_round 62 + sha256_sse2_main_round 63 + ret + + +#if defined(USE_AVX) + +.macro sha256_avx_extend_round i + movdqa (\i-15)*16(%rcx), %xmm0 + movdqa (\i-14)*16(%rcx), %xmm4 + vpslld $14, %xmm0, %xmm2 + vpslld $14, %xmm4, %xmm6 + psrld $3, %xmm0 + psrld $3, %xmm4 + vpsrld $4, %xmm0, %xmm1 + vpsrld $4, %xmm4, %xmm5 + pxor %xmm1, %xmm0 + pxor %xmm5, %xmm4 + psrld $11, %xmm1 + psrld $11, %xmm5 + pxor %xmm2, %xmm0 + pxor %xmm6, %xmm4 + pslld $11, %xmm2 + pslld $11, %xmm6 + pxor %xmm1, %xmm0 + pxor %xmm5, %xmm4 + pxor %xmm2, %xmm0 + pxor %xmm6, %xmm4 + + movdqa (\i-2)*16(%rcx), %xmm3 + movdqa (\i-1)*16(%rcx), %xmm7 + paddd (\i-16)*16(%rcx), %xmm0 + paddd (\i-15)*16(%rcx), %xmm4 + + vpslld $13, %xmm3, %xmm2 + vpslld $13, %xmm7, %xmm6 + psrld $10, %xmm3 + psrld $10, %xmm7 + + paddd (\i-7)*16(%rcx), %xmm0 + + vpsrld $7, %xmm3, %xmm1 + vpsrld $7, %xmm7, %xmm5 + + paddd (\i-6)*16(%rcx), %xmm4 + + pxor %xmm1, %xmm3 + pxor %xmm5, %xmm7 + psrld $2, %xmm1 + psrld $2, %xmm5 + pxor %xmm2, %xmm3 + pxor %xmm6, %xmm7 + pslld $2, %xmm2 + pslld $2, %xmm6 + pxor %xmm1, %xmm3 + pxor %xmm5, %xmm7 + pxor %xmm2, %xmm3 + pxor %xmm6, %xmm7 + + paddd %xmm3, %xmm0 + paddd %xmm7, %xmm4 + movdqa %xmm0, \i*16(%rcx) + movdqa %xmm4, (\i+1)*16(%rcx) +.endm + + .text + .p2align 6 +sha256_avx_extend_loop: + sha256_avx_extend_round 0 + sha256_avx_extend_round 2 + sha256_avx_extend_round 4 + sha256_avx_extend_round 6 + sha256_avx_extend_round 8 + sha256_avx_extend_round 10 + sha256_avx_extend_round 12 + sha256_avx_extend_round 14 + sha256_avx_extend_round 16 + sha256_avx_extend_round 18 + sha256_avx_extend_round 20 + sha256_avx_extend_round 22 + sha256_avx_extend_round 24 + sha256_avx_extend_round 26 + sha256_avx_extend_round 28 + sha256_avx_extend_round 30 + sha256_avx_extend_round 32 + sha256_avx_extend_round 34 + sha256_avx_extend_round 36 + sha256_avx_extend_round 38 + sha256_avx_extend_round 40 + sha256_avx_extend_round 42 + sha256_avx_extend_round 44 + sha256_avx_extend_round 46 + ret + +.macro sha256_avx_main_round i, r0, r1, r2, r3, r4, r5, r6, r7 + vpaddd 16*(\i)(%rax), \r0, %xmm6 + paddd 16*(\i)(%rcx), %xmm6 + + vpandn \r1, \r3, %xmm1 + vpand \r3, \r2, %xmm2 + pxor %xmm2, %xmm1 + paddd %xmm1, %xmm6 + + vpslld $7, \r3, %xmm1 + vpsrld $6, \r3, \r0 + vpsrld $5, \r0, %xmm2 + pxor %xmm1, \r0 + pxor %xmm2, \r0 + pslld $14, %xmm1 + psrld $14, %xmm2 + pxor %xmm1, \r0 + pxor %xmm2, \r0 + pslld $5, %xmm1 + pxor %xmm1, \r0 + paddd \r0, %xmm6 + + vpaddd %xmm6, \r4, \r0 + + vpand \r6, \r5, %xmm2 + vpand \r7, \r5, \r4 + vpand \r7, \r6, %xmm1 + pxor \r4, %xmm1 + pxor %xmm2, %xmm1 + paddd %xmm1, %xmm6 + + vpslld $10, \r7, %xmm2 + vpsrld $2, \r7, \r4 + vpsrld $11, \r4, %xmm1 + pxor %xmm2, \r4 + pxor %xmm1, \r4 + pslld $9, %xmm2 + psrld $9, %xmm1 + pxor %xmm2, \r4 + pxor %xmm1, \r4 + pslld $11, %xmm2 + pxor %xmm2, \r4 + paddd %xmm6, \r4 +.endm + +.macro sha256_avx_main_quadround i + sha256_avx_main_round \i+0, %xmm10, %xmm9, %xmm8, %xmm0, %xmm3, %xmm4, %xmm5, %xmm7 + sha256_avx_main_round \i+1, %xmm9, %xmm8, %xmm0, %xmm10, %xmm4, %xmm5, %xmm7, %xmm3 + 
sha256_avx_main_round \i+2, %xmm8, %xmm0, %xmm10, %xmm9, %xmm5, %xmm7, %xmm3, %xmm4 + sha256_avx_main_round \i+3, %xmm0, %xmm10, %xmm9, %xmm8, %xmm7, %xmm3, %xmm4, %xmm5 +.endm + + .text + .p2align 6 +sha256_avx_main_loop: + sha256_avx_main_quadround 0 + sha256_avx_main_quadround 4 + sha256_avx_main_quadround 8 + sha256_avx_main_quadround 12 + sha256_avx_main_quadround 16 + sha256_avx_main_quadround 20 + sha256_avx_main_quadround 24 + sha256_avx_main_quadround 28 + sha256_avx_main_quadround 32 + sha256_avx_main_quadround 36 + sha256_avx_main_quadround 40 + sha256_avx_main_quadround 44 + sha256_avx_main_quadround 48 + sha256_avx_main_quadround 52 + sha256_avx_main_quadround 56 + sha256_avx_main_quadround 60 + ret + +#endif /* USE_AVX */ + + +#if defined(USE_XOP) + +.macro sha256_xop_extend_round i + vmovdqa (\i-15)*16(%rcx), %xmm0 + vmovdqa (\i-14)*16(%rcx), %xmm4 + vprotd $25, %xmm0, %xmm1 + vprotd $25, %xmm4, %xmm5 + vprotd $14, %xmm0, %xmm2 + vprotd $14, %xmm4, %xmm6 + vpxor %xmm1, %xmm2, %xmm2 + vpxor %xmm5, %xmm6, %xmm6 + vpsrld $3, %xmm0, %xmm0 + vpsrld $3, %xmm4, %xmm4 + vpxor %xmm2, %xmm0, %xmm0 + vpxor %xmm6, %xmm4, %xmm4 + + vmovdqa (\i-2)*16(%rcx), %xmm3 + vmovdqa (\i-1)*16(%rcx), %xmm7 + vpaddd (\i-16)*16(%rcx), %xmm0, %xmm0 + vpaddd (\i-15)*16(%rcx), %xmm4, %xmm4 + + vprotd $15, %xmm3, %xmm1 + vprotd $15, %xmm7, %xmm5 + vprotd $13, %xmm3, %xmm2 + vprotd $13, %xmm7, %xmm6 + vpxor %xmm1, %xmm2, %xmm2 + vpxor %xmm5, %xmm6, %xmm6 + + vpaddd (\i-7)*16(%rcx), %xmm0, %xmm0 + vpaddd (\i-6)*16(%rcx), %xmm4, %xmm4 + + vpsrld $10, %xmm3, %xmm3 + vpsrld $10, %xmm7, %xmm7 + vpxor %xmm2, %xmm3, %xmm3 + vpxor %xmm6, %xmm7, %xmm7 + + vpaddd %xmm3, %xmm0, %xmm0 + vpaddd %xmm7, %xmm4, %xmm4 + vmovdqa %xmm0, \i*16(%rcx) + vmovdqa %xmm4, (\i+1)*16(%rcx) +.endm + + .text + .p2align 6 +sha256_xop_extend_loop: + sha256_xop_extend_round 0 + sha256_xop_extend_round 2 + sha256_xop_extend_round 4 + sha256_xop_extend_round 6 + sha256_xop_extend_round 8 + sha256_xop_extend_round 10 + sha256_xop_extend_round 12 + sha256_xop_extend_round 14 + sha256_xop_extend_round 16 + sha256_xop_extend_round 18 + sha256_xop_extend_round 20 + sha256_xop_extend_round 22 + sha256_xop_extend_round 24 + sha256_xop_extend_round 26 + sha256_xop_extend_round 28 + sha256_xop_extend_round 30 + sha256_xop_extend_round 32 + sha256_xop_extend_round 34 + sha256_xop_extend_round 36 + sha256_xop_extend_round 38 + sha256_xop_extend_round 40 + sha256_xop_extend_round 42 + sha256_xop_extend_round 44 + sha256_xop_extend_round 46 + ret + +.macro sha256_xop_main_round i, r0, r1, r2, r3, r4, r5, r6, r7 + vpaddd 16*(\i)(%rax), \r0, %xmm6 + vpaddd 16*(\i)(%rcx), %xmm6, %xmm6 + + vpandn \r1, \r3, %xmm1 + vpand \r3, \r2, %xmm2 + vpxor %xmm2, %xmm1, %xmm1 + vpaddd %xmm1, %xmm6, %xmm6 + + vprotd $26, \r3, %xmm1 + vprotd $21, \r3, %xmm2 + vpxor %xmm1, %xmm2, %xmm2 + vprotd $7, \r3, \r0 + vpxor %xmm2, \r0, \r0 + vpaddd \r0, %xmm6, %xmm6 + + vpaddd %xmm6, \r4, \r0 + + vpand \r6, \r5, %xmm2 + vpand \r7, \r5, \r4 + vpand \r7, \r6, %xmm1 + vpxor \r4, %xmm1, %xmm1 + vpxor %xmm2, %xmm1, %xmm1 + vpaddd %xmm1, %xmm6, %xmm6 + + vprotd $30, \r7, %xmm1 + vprotd $19, \r7, %xmm2 + vpxor %xmm1, %xmm2, %xmm2 + vprotd $10, \r7, \r4 + vpxor %xmm2, \r4, \r4 + vpaddd %xmm6, \r4, \r4 +.endm + +.macro sha256_xop_main_quadround i + sha256_xop_main_round \i+0, %xmm10, %xmm9, %xmm8, %xmm0, %xmm3, %xmm4, %xmm5, %xmm7 + sha256_xop_main_round \i+1, %xmm9, %xmm8, %xmm0, %xmm10, %xmm4, %xmm5, %xmm7, %xmm3 + sha256_xop_main_round \i+2, %xmm8, %xmm0, %xmm10, %xmm9, %xmm5, %xmm7, 
%xmm3, %xmm4 + sha256_xop_main_round \i+3, %xmm0, %xmm10, %xmm9, %xmm8, %xmm7, %xmm3, %xmm4, %xmm5 +.endm + + .text + .p2align 6 +sha256_xop_main_loop: + sha256_xop_main_quadround 0 + sha256_xop_main_quadround 4 + sha256_xop_main_quadround 8 + sha256_xop_main_quadround 12 + sha256_xop_main_quadround 16 + sha256_xop_main_quadround 20 + sha256_xop_main_quadround 24 + sha256_xop_main_quadround 28 + sha256_xop_main_quadround 32 + sha256_xop_main_quadround 36 + sha256_xop_main_quadround 40 + sha256_xop_main_quadround 44 + sha256_xop_main_quadround 48 + sha256_xop_main_quadround 52 + sha256_xop_main_quadround 56 + sha256_xop_main_quadround 60 + ret + +#endif /* USE_XOP */ + + .macro p2bswap_rsi_rsp i movdqu \i*16(%rsi), %xmm0 movdqu (\i+1)*16(%rsi), %xmm2 @@ -165,7 +696,9 @@ _sha256_transform_4way: movq %rdx, %rsi movq %r8, %rdx #endif + movq %rsp, %r8 subq $1032, %rsp + andq $-128, %rsp testq %rdx, %rdx jz sha256_transform_4way_block_copy @@ -391,7 +924,7 @@ sha256_transform_4way_main_loop: movdqu %xmm9, 96(%rdi) movdqu %xmm10, 112(%rdi) - addq $1032, %rsp + movq %r8, %rsp #if defined(WIN64) popq %rsi movdqa 0(%rsp), %xmm6 @@ -404,5 +937,414 @@ sha256_transform_4way_main_loop: popq %rdi #endif ret + + + .data + .p2align 3 +sha256d_4way_addr: + .quad 0x0 + + .text + .p2align 6 + .globl sha256d_4way + .globl _sha256d_4way +sha256d_4way: +_sha256d_4way: + movq sha256d_4way_addr(%rip), %rax + testq %rax, %rax + jz sha256d_4way_set + jmp *%rax + +sha256d_4way_set: + pushq %rbx + pushq %rcx + pushq %rdx + +#if defined(USE_AVX) + # Check for AVX and OSXSAVE support + movl $1, %eax + cpuid + andl $0x18000000, %ecx + cmpl $0x18000000, %ecx + jne sha256d_4way_set_sse2 + # Check for XMM and YMM state support + xorl %ecx, %ecx + xgetbv + andl $0x00000006, %eax + cmpl $0x00000006, %eax + jne sha256d_4way_set_sse2 +#if defined(USE_XOP) + # Check for XOP support + movl $0x80000001, %eax + cpuid + andl $0x00000800, %ecx + jz sha256d_4way_set_avx + +sha256d_4way_set_xop: + leaq sha256d_4way_xop(%rip), %rax + jmp sha256d_4way_set_done +#endif /* USE_XOP */ + +sha256d_4way_set_avx: + leaq sha256d_4way_avx(%rip), %rax + jmp sha256d_4way_set_done +#endif /* USE_AVX */ + +sha256d_4way_set_sse2: + leaq sha256d_4way_sse2(%rip), %rax + +sha256d_4way_set_done: + movq %rax, sha256d_4way_addr(%rip) + popq %rdx + popq %rcx + popq %rbx + jmp *%rax + + + .p2align 6 +sha256d_4way_sse2: +#if defined(WIN64) + pushq %rdi + subq $80, %rsp + movdqa %xmm6, 0(%rsp) + movdqa %xmm7, 16(%rsp) + movdqa %xmm8, 32(%rsp) + movdqa %xmm9, 48(%rsp) + movdqa %xmm10, 64(%rsp) + pushq %rsi + movq %rcx, %rdi + movq %rdx, %rsi + movq %r8, %rdx +#endif + subq $1032, %rsp + + leaq 256(%rsi), %rcx + call sha256_sse2_extend_loop + + movdqa 0(%rdx), %xmm7 + movdqa 16(%rdx), %xmm5 + movdqa 32(%rdx), %xmm4 + movdqa 48(%rdx), %xmm3 + movdqa 64(%rdx), %xmm0 + movdqa 80(%rdx), %xmm8 + movdqa 96(%rdx), %xmm9 + movdqa 112(%rdx), %xmm10 + + movq %rsi, %rax + leaq sha256_4k(%rip), %rcx + call sha256_sse2_main_loop + + paddd 0(%rdx), %xmm7 + paddd 16(%rdx), %xmm5 + paddd 32(%rdx), %xmm4 + paddd 48(%rdx), %xmm3 + paddd 64(%rdx), %xmm0 + paddd 80(%rdx), %xmm8 + paddd 96(%rdx), %xmm9 + paddd 112(%rdx), %xmm10 + + movdqa %xmm7, 0(%rsp) + movdqa %xmm5, 16(%rsp) + movdqa %xmm4, 32(%rsp) + movdqa %xmm3, 48(%rsp) + movdqa %xmm0, 64(%rsp) + movdqa %xmm8, 80(%rsp) + movdqa %xmm9, 96(%rsp) + movdqa %xmm10, 112(%rsp) + + pxor %xmm0, %xmm0 + movq $0x8000000000000100, %rax + movd %rax, %xmm1 + pshufd $0x55, %xmm1, %xmm2 + pshufd $0x00, %xmm1, %xmm1 + movdqa %xmm2, 128(%rsp) + 
movdqa %xmm0, 144(%rsp) + movdqa %xmm0, 160(%rsp) + movdqa %xmm0, 176(%rsp) + movdqa %xmm0, 192(%rsp) + movdqa %xmm0, 208(%rsp) + movdqa %xmm0, 224(%rsp) + movdqa %xmm1, 240(%rsp) + + leaq 256(%rsp), %rcx + call sha256_sse2_extend_loop + + movdqa sha256_4h+0(%rip), %xmm7 + movdqa sha256_4h+16(%rip), %xmm5 + movdqa sha256_4h+32(%rip), %xmm4 + movdqa sha256_4h+48(%rip), %xmm3 + movdqa sha256_4h+64(%rip), %xmm0 + movdqa sha256_4h+80(%rip), %xmm8 + movdqa sha256_4h+96(%rip), %xmm9 + movdqa sha256_4h+112(%rip), %xmm10 + + movq %rsp, %rax + leaq sha256_4k(%rip), %rcx + call sha256_sse2_main_loop + + paddd sha256_4h+0(%rip), %xmm7 + paddd sha256_4h+16(%rip), %xmm5 + paddd sha256_4h+32(%rip), %xmm4 + paddd sha256_4h+48(%rip), %xmm3 + paddd sha256_4h+64(%rip), %xmm0 + paddd sha256_4h+80(%rip), %xmm8 + paddd sha256_4h+96(%rip), %xmm9 + paddd sha256_4h+112(%rip), %xmm10 + + movdqa %xmm7, 0(%rdi) + movdqa %xmm5, 16(%rdi) + movdqa %xmm4, 32(%rdi) + movdqa %xmm3, 48(%rdi) + movdqa %xmm0, 64(%rdi) + movdqa %xmm8, 80(%rdi) + movdqa %xmm9, 96(%rdi) + movdqa %xmm10, 112(%rdi) + + addq $1032, %rsp +#if defined(WIN64) + popq %rsi + movdqa 0(%rsp), %xmm6 + movdqa 16(%rsp), %xmm7 + movdqa 32(%rsp), %xmm8 + movdqa 48(%rsp), %xmm9 + movdqa 64(%rsp), %xmm10 + addq $80, %rsp + popq %rdi +#endif + ret + + +#if defined(USE_AVX) + + .p2align 6 +sha256d_4way_avx: +#if defined(WIN64) + pushq %rdi + subq $80, %rsp + movdqa %xmm6, 0(%rsp) + movdqa %xmm7, 16(%rsp) + movdqa %xmm8, 32(%rsp) + movdqa %xmm9, 48(%rsp) + movdqa %xmm10, 64(%rsp) + pushq %rsi + movq %rcx, %rdi + movq %rdx, %rsi + movq %r8, %rdx +#endif + subq $1032, %rsp + + leaq 256(%rsi), %rcx + call sha256_avx_extend_loop + + movdqa 0(%rdx), %xmm7 + movdqa 16(%rdx), %xmm5 + movdqa 32(%rdx), %xmm4 + movdqa 48(%rdx), %xmm3 + movdqa 64(%rdx), %xmm0 + movdqa 80(%rdx), %xmm8 + movdqa 96(%rdx), %xmm9 + movdqa 112(%rdx), %xmm10 + + movq %rsi, %rax + leaq sha256_4k(%rip), %rcx + call sha256_avx_main_loop + + paddd 0(%rdx), %xmm7 + paddd 16(%rdx), %xmm5 + paddd 32(%rdx), %xmm4 + paddd 48(%rdx), %xmm3 + paddd 64(%rdx), %xmm0 + paddd 80(%rdx), %xmm8 + paddd 96(%rdx), %xmm9 + paddd 112(%rdx), %xmm10 + + movdqa %xmm7, 0(%rsp) + movdqa %xmm5, 16(%rsp) + movdqa %xmm4, 32(%rsp) + movdqa %xmm3, 48(%rsp) + movdqa %xmm0, 64(%rsp) + movdqa %xmm8, 80(%rsp) + movdqa %xmm9, 96(%rsp) + movdqa %xmm10, 112(%rsp) + + pxor %xmm0, %xmm0 + movq $0x8000000000000100, %rax + movd %rax, %xmm1 + pshufd $0x55, %xmm1, %xmm2 + pshufd $0x00, %xmm1, %xmm1 + movdqa %xmm2, 128(%rsp) + movdqa %xmm0, 144(%rsp) + movdqa %xmm0, 160(%rsp) + movdqa %xmm0, 176(%rsp) + movdqa %xmm0, 192(%rsp) + movdqa %xmm0, 208(%rsp) + movdqa %xmm0, 224(%rsp) + movdqa %xmm1, 240(%rsp) + + leaq 256(%rsp), %rcx + call sha256_avx_extend_loop + + movdqa sha256_4h+0(%rip), %xmm7 + movdqa sha256_4h+16(%rip), %xmm5 + movdqa sha256_4h+32(%rip), %xmm4 + movdqa sha256_4h+48(%rip), %xmm3 + movdqa sha256_4h+64(%rip), %xmm0 + movdqa sha256_4h+80(%rip), %xmm8 + movdqa sha256_4h+96(%rip), %xmm9 + movdqa sha256_4h+112(%rip), %xmm10 + + movq %rsp, %rax + leaq sha256_4k(%rip), %rcx + call sha256_avx_main_loop + + paddd sha256_4h+0(%rip), %xmm7 + paddd sha256_4h+16(%rip), %xmm5 + paddd sha256_4h+32(%rip), %xmm4 + paddd sha256_4h+48(%rip), %xmm3 + paddd sha256_4h+64(%rip), %xmm0 + paddd sha256_4h+80(%rip), %xmm8 + paddd sha256_4h+96(%rip), %xmm9 + paddd sha256_4h+112(%rip), %xmm10 + + movdqa %xmm7, 0(%rdi) + movdqa %xmm5, 16(%rdi) + movdqa %xmm4, 32(%rdi) + movdqa %xmm3, 48(%rdi) + movdqa %xmm0, 64(%rdi) + movdqa %xmm8, 80(%rdi) + movdqa %xmm9, 
96(%rdi) + movdqa %xmm10, 112(%rdi) + + addq $1032, %rsp +#if defined(WIN64) + popq %rsi + movdqa 0(%rsp), %xmm6 + movdqa 16(%rsp), %xmm7 + movdqa 32(%rsp), %xmm8 + movdqa 48(%rsp), %xmm9 + movdqa 64(%rsp), %xmm10 + addq $80, %rsp + popq %rdi +#endif + ret + +#endif /* USE_AVX */ + + +#if defined(USE_XOP) + + .p2align 6 +sha256d_4way_xop: +#if defined(WIN64) + pushq %rdi + subq $80, %rsp + movdqa %xmm6, 0(%rsp) + movdqa %xmm7, 16(%rsp) + movdqa %xmm8, 32(%rsp) + movdqa %xmm9, 48(%rsp) + movdqa %xmm10, 64(%rsp) + pushq %rsi + movq %rcx, %rdi + movq %rdx, %rsi + movq %r8, %rdx +#endif + subq $1032, %rsp + + leaq 256(%rsi), %rcx + call sha256_xop_extend_loop + + movdqa 0(%rdx), %xmm7 + movdqa 16(%rdx), %xmm5 + movdqa 32(%rdx), %xmm4 + movdqa 48(%rdx), %xmm3 + movdqa 64(%rdx), %xmm0 + movdqa 80(%rdx), %xmm8 + movdqa 96(%rdx), %xmm9 + movdqa 112(%rdx), %xmm10 + + movq %rsi, %rax + leaq sha256_4k(%rip), %rcx + call sha256_xop_main_loop + + paddd 0(%rdx), %xmm7 + paddd 16(%rdx), %xmm5 + paddd 32(%rdx), %xmm4 + paddd 48(%rdx), %xmm3 + paddd 64(%rdx), %xmm0 + paddd 80(%rdx), %xmm8 + paddd 96(%rdx), %xmm9 + paddd 112(%rdx), %xmm10 + + movdqa %xmm7, 0(%rsp) + movdqa %xmm5, 16(%rsp) + movdqa %xmm4, 32(%rsp) + movdqa %xmm3, 48(%rsp) + movdqa %xmm0, 64(%rsp) + movdqa %xmm8, 80(%rsp) + movdqa %xmm9, 96(%rsp) + movdqa %xmm10, 112(%rsp) + + pxor %xmm0, %xmm0 + movq $0x8000000000000100, %rax + movd %rax, %xmm1 + pshufd $0x55, %xmm1, %xmm2 + pshufd $0x00, %xmm1, %xmm1 + movdqa %xmm2, 128(%rsp) + movdqa %xmm0, 144(%rsp) + movdqa %xmm0, 160(%rsp) + movdqa %xmm0, 176(%rsp) + movdqa %xmm0, 192(%rsp) + movdqa %xmm0, 208(%rsp) + movdqa %xmm0, 224(%rsp) + movdqa %xmm1, 240(%rsp) + + leaq 256(%rsp), %rcx + call sha256_xop_extend_loop + + movdqa sha256_4h+0(%rip), %xmm7 + movdqa sha256_4h+16(%rip), %xmm5 + movdqa sha256_4h+32(%rip), %xmm4 + movdqa sha256_4h+48(%rip), %xmm3 + movdqa sha256_4h+64(%rip), %xmm0 + movdqa sha256_4h+80(%rip), %xmm8 + movdqa sha256_4h+96(%rip), %xmm9 + movdqa sha256_4h+112(%rip), %xmm10 + + movq %rsp, %rax + leaq sha256_4k(%rip), %rcx + call sha256_xop_main_loop + + paddd sha256_4h+0(%rip), %xmm7 + paddd sha256_4h+16(%rip), %xmm5 + paddd sha256_4h+32(%rip), %xmm4 + paddd sha256_4h+48(%rip), %xmm3 + paddd sha256_4h+64(%rip), %xmm0 + paddd sha256_4h+80(%rip), %xmm8 + paddd sha256_4h+96(%rip), %xmm9 + paddd sha256_4h+112(%rip), %xmm10 + + movdqa %xmm7, 0(%rdi) + movdqa %xmm5, 16(%rdi) + movdqa %xmm4, 32(%rdi) + movdqa %xmm3, 48(%rdi) + movdqa %xmm0, 64(%rdi) + movdqa %xmm8, 80(%rdi) + movdqa %xmm9, 96(%rdi) + movdqa %xmm10, 112(%rdi) + + addq $1032, %rsp +#if defined(WIN64) + popq %rsi + movdqa 0(%rsp), %xmm6 + movdqa 16(%rsp), %xmm7 + movdqa 32(%rsp), %xmm8 + movdqa 48(%rsp), %xmm9 + movdqa 64(%rsp), %xmm10 + addq $80, %rsp + popq %rdi +#endif + ret + +#endif /* USE_XOP */ #endif diff --git a/sha2.c b/sha2.c index 78368b7..144bacf 100644 --- a/sha2.c +++ b/sha2.c @@ -13,43 +13,60 @@ #include #include +static const uint32_t sha256_h[8] = { + 0x6a09e667, 0xbb67ae85, 0x3c6ef372, 0xa54ff53a, + 0x510e527f, 0x9b05688c, 0x1f83d9ab, 0x5be0cd19 +}; + +static const uint32_t sha256_k[64] = { + 0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5, + 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5, + 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3, + 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174, + 0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc, + 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da, + 0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7, + 0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967, + 0x27b70a85, 
0x2e1b2138, 0x4d2c6dfc, 0x53380d13, + 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85, + 0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3, + 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070, + 0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5, + 0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3, + 0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208, + 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2 +}; + void sha256_init(uint32_t *state) { - /* Magic initialization constants */ - state[0] = 0x6a09e667; - state[1] = 0xbb67ae85; - state[2] = 0x3C6ef372; - state[3] = 0xa54ff53a; - state[4] = 0x510e527f; - state[5] = 0x9b05688c; - state[6] = 0x1f83d9Ab; - state[7] = 0x5be0cd19; + memcpy(state, sha256_h, 32); } /* Elementary functions used by SHA256 */ #define Ch(x, y, z) ((x & (y ^ z)) ^ z) #define Maj(x, y, z) ((x & (y | z)) | (y & z)) -#define SHR(x, n) (x >> n) #define ROTR(x, n) ((x >> n) | (x << (32 - n))) #define S0(x) (ROTR(x, 2) ^ ROTR(x, 13) ^ ROTR(x, 22)) #define S1(x) (ROTR(x, 6) ^ ROTR(x, 11) ^ ROTR(x, 25)) -#define s0(x) (ROTR(x, 7) ^ ROTR(x, 18) ^ SHR(x, 3)) -#define s1(x) (ROTR(x, 17) ^ ROTR(x, 19) ^ SHR(x, 10)) +#define s0(x) (ROTR(x, 7) ^ ROTR(x, 18) ^ (x >> 3)) +#define s1(x) (ROTR(x, 17) ^ ROTR(x, 19) ^ (x >> 10)) /* SHA256 round function */ #define RND(a, b, c, d, e, f, g, h, k) \ - t0 = h + S1(e) + Ch(e, f, g) + k; \ - t1 = S0(a) + Maj(a, b, c); \ - d += t0; \ - h = t0 + t1; + do { \ + t0 = h + S1(e) + Ch(e, f, g) + k; \ + t1 = S0(a) + Maj(a, b, c); \ + d += t0; \ + h = t0 + t1; \ + } while (0) /* Adjusted round function for rotating state */ -#define RNDr(S, W, i, k) \ +#define RNDr(S, W, i) \ RND(S[(64 - i) % 8], S[(65 - i) % 8], \ S[(66 - i) % 8], S[(67 - i) % 8], \ S[(68 - i) % 8], S[(69 - i) % 8], \ S[(70 - i) % 8], S[(71 - i) % 8], \ - W[i] + k) + W[i] + sha256_k[i]) /* * SHA256 block compression function. The 256-bit state is transformed via @@ -77,72 +94,299 @@ void sha256_transform(uint32_t *state, const uint32_t *block, int swap) memcpy(S, state, 32); /* 3. Mix. 
*/ - RNDr(S, W, 0, 0x428a2f98); - RNDr(S, W, 1, 0x71374491); - RNDr(S, W, 2, 0xb5c0fbcf); - RNDr(S, W, 3, 0xe9b5dba5); - RNDr(S, W, 4, 0x3956c25b); - RNDr(S, W, 5, 0x59f111f1); - RNDr(S, W, 6, 0x923f82a4); - RNDr(S, W, 7, 0xab1c5ed5); - RNDr(S, W, 8, 0xd807aa98); - RNDr(S, W, 9, 0x12835b01); - RNDr(S, W, 10, 0x243185be); - RNDr(S, W, 11, 0x550c7dc3); - RNDr(S, W, 12, 0x72be5d74); - RNDr(S, W, 13, 0x80deb1fe); - RNDr(S, W, 14, 0x9bdc06a7); - RNDr(S, W, 15, 0xc19bf174); - RNDr(S, W, 16, 0xe49b69c1); - RNDr(S, W, 17, 0xefbe4786); - RNDr(S, W, 18, 0x0fc19dc6); - RNDr(S, W, 19, 0x240ca1cc); - RNDr(S, W, 20, 0x2de92c6f); - RNDr(S, W, 21, 0x4a7484aa); - RNDr(S, W, 22, 0x5cb0a9dc); - RNDr(S, W, 23, 0x76f988da); - RNDr(S, W, 24, 0x983e5152); - RNDr(S, W, 25, 0xa831c66d); - RNDr(S, W, 26, 0xb00327c8); - RNDr(S, W, 27, 0xbf597fc7); - RNDr(S, W, 28, 0xc6e00bf3); - RNDr(S, W, 29, 0xd5a79147); - RNDr(S, W, 30, 0x06ca6351); - RNDr(S, W, 31, 0x14292967); - RNDr(S, W, 32, 0x27b70a85); - RNDr(S, W, 33, 0x2e1b2138); - RNDr(S, W, 34, 0x4d2c6dfc); - RNDr(S, W, 35, 0x53380d13); - RNDr(S, W, 36, 0x650a7354); - RNDr(S, W, 37, 0x766a0abb); - RNDr(S, W, 38, 0x81c2c92e); - RNDr(S, W, 39, 0x92722c85); - RNDr(S, W, 40, 0xa2bfe8a1); - RNDr(S, W, 41, 0xa81a664b); - RNDr(S, W, 42, 0xc24b8b70); - RNDr(S, W, 43, 0xc76c51a3); - RNDr(S, W, 44, 0xd192e819); - RNDr(S, W, 45, 0xd6990624); - RNDr(S, W, 46, 0xf40e3585); - RNDr(S, W, 47, 0x106aa070); - RNDr(S, W, 48, 0x19a4c116); - RNDr(S, W, 49, 0x1e376c08); - RNDr(S, W, 50, 0x2748774c); - RNDr(S, W, 51, 0x34b0bcb5); - RNDr(S, W, 52, 0x391c0cb3); - RNDr(S, W, 53, 0x4ed8aa4a); - RNDr(S, W, 54, 0x5b9cca4f); - RNDr(S, W, 55, 0x682e6ff3); - RNDr(S, W, 56, 0x748f82ee); - RNDr(S, W, 57, 0x78a5636f); - RNDr(S, W, 58, 0x84c87814); - RNDr(S, W, 59, 0x8cc70208); - RNDr(S, W, 60, 0x90befffa); - RNDr(S, W, 61, 0xa4506ceb); - RNDr(S, W, 62, 0xbef9a3f7); - RNDr(S, W, 63, 0xc67178f2); + RNDr(S, W, 0); + RNDr(S, W, 1); + RNDr(S, W, 2); + RNDr(S, W, 3); + RNDr(S, W, 4); + RNDr(S, W, 5); + RNDr(S, W, 6); + RNDr(S, W, 7); + RNDr(S, W, 8); + RNDr(S, W, 9); + RNDr(S, W, 10); + RNDr(S, W, 11); + RNDr(S, W, 12); + RNDr(S, W, 13); + RNDr(S, W, 14); + RNDr(S, W, 15); + RNDr(S, W, 16); + RNDr(S, W, 17); + RNDr(S, W, 18); + RNDr(S, W, 19); + RNDr(S, W, 20); + RNDr(S, W, 21); + RNDr(S, W, 22); + RNDr(S, W, 23); + RNDr(S, W, 24); + RNDr(S, W, 25); + RNDr(S, W, 26); + RNDr(S, W, 27); + RNDr(S, W, 28); + RNDr(S, W, 29); + RNDr(S, W, 30); + RNDr(S, W, 31); + RNDr(S, W, 32); + RNDr(S, W, 33); + RNDr(S, W, 34); + RNDr(S, W, 35); + RNDr(S, W, 36); + RNDr(S, W, 37); + RNDr(S, W, 38); + RNDr(S, W, 39); + RNDr(S, W, 40); + RNDr(S, W, 41); + RNDr(S, W, 42); + RNDr(S, W, 43); + RNDr(S, W, 44); + RNDr(S, W, 45); + RNDr(S, W, 46); + RNDr(S, W, 47); + RNDr(S, W, 48); + RNDr(S, W, 49); + RNDr(S, W, 50); + RNDr(S, W, 51); + RNDr(S, W, 52); + RNDr(S, W, 53); + RNDr(S, W, 54); + RNDr(S, W, 55); + RNDr(S, W, 56); + RNDr(S, W, 57); + RNDr(S, W, 58); + RNDr(S, W, 59); + RNDr(S, W, 60); + RNDr(S, W, 61); + RNDr(S, W, 62); + RNDr(S, W, 63); /* 4. 
Mix local working variables into global state */ for (i = 0; i < 8; i++) state[i] += S[i]; } + +#if defined(__x86_64__) + +#define SHA256D_WAYS 4 + +void sha256d_4way(uint32_t *hash, uint32_t *data, const uint32_t *midstate); + +#else + +#define SHA256D_WAYS 1 + +static const uint32_t sha256d_hash1[16] = { + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x80000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000100 +}; + +static inline void sha256d(uint32_t *hash, uint32_t *W, + const uint32_t *midstate) +{ + uint32_t S[64]; + uint32_t t0, t1; + int i; + + for (i = 16; i < 64; i += 2) { + W[i] = s1(W[i - 2]) + W[i - 7] + s0(W[i - 15]) + W[i - 16]; + W[i+1] = s1(W[i - 1]) + W[i - 6] + s0(W[i - 14]) + W[i - 15]; + } + + memcpy(S, midstate, 32); + + RNDr(S, W, 0); + RNDr(S, W, 1); + RNDr(S, W, 2); + RNDr(S, W, 3); + RNDr(S, W, 4); + RNDr(S, W, 5); + RNDr(S, W, 6); + RNDr(S, W, 7); + RNDr(S, W, 8); + RNDr(S, W, 9); + RNDr(S, W, 10); + RNDr(S, W, 11); + RNDr(S, W, 12); + RNDr(S, W, 13); + RNDr(S, W, 14); + RNDr(S, W, 15); + RNDr(S, W, 16); + RNDr(S, W, 17); + RNDr(S, W, 18); + RNDr(S, W, 19); + RNDr(S, W, 20); + RNDr(S, W, 21); + RNDr(S, W, 22); + RNDr(S, W, 23); + RNDr(S, W, 24); + RNDr(S, W, 25); + RNDr(S, W, 26); + RNDr(S, W, 27); + RNDr(S, W, 28); + RNDr(S, W, 29); + RNDr(S, W, 30); + RNDr(S, W, 31); + RNDr(S, W, 32); + RNDr(S, W, 33); + RNDr(S, W, 34); + RNDr(S, W, 35); + RNDr(S, W, 36); + RNDr(S, W, 37); + RNDr(S, W, 38); + RNDr(S, W, 39); + RNDr(S, W, 40); + RNDr(S, W, 41); + RNDr(S, W, 42); + RNDr(S, W, 43); + RNDr(S, W, 44); + RNDr(S, W, 45); + RNDr(S, W, 46); + RNDr(S, W, 47); + RNDr(S, W, 48); + RNDr(S, W, 49); + RNDr(S, W, 50); + RNDr(S, W, 51); + RNDr(S, W, 52); + RNDr(S, W, 53); + RNDr(S, W, 54); + RNDr(S, W, 55); + RNDr(S, W, 56); + RNDr(S, W, 57); + RNDr(S, W, 58); + RNDr(S, W, 59); + RNDr(S, W, 60); + RNDr(S, W, 61); + RNDr(S, W, 62); + RNDr(S, W, 63); + + for (i = 0; i < 8; i++) + S[i] += midstate[i]; + + memcpy(S + 8, sha256d_hash1 + 8, 32); + for (i = 16; i < 64; i += 2) { + S[i] = s1(S[i - 2]) + S[i - 7] + s0(S[i - 15]) + S[i - 16]; + S[i+1] = s1(S[i - 1]) + S[i - 6] + s0(S[i - 14]) + S[i - 15]; + } + + sha256_init(hash); + + RNDr(hash, S, 0); + RNDr(hash, S, 1); + RNDr(hash, S, 2); + RNDr(hash, S, 3); + RNDr(hash, S, 4); + RNDr(hash, S, 5); + RNDr(hash, S, 6); + RNDr(hash, S, 7); + RNDr(hash, S, 8); + RNDr(hash, S, 9); + RNDr(hash, S, 10); + RNDr(hash, S, 11); + RNDr(hash, S, 12); + RNDr(hash, S, 13); + RNDr(hash, S, 14); + RNDr(hash, S, 15); + RNDr(hash, S, 16); + RNDr(hash, S, 17); + RNDr(hash, S, 18); + RNDr(hash, S, 19); + RNDr(hash, S, 20); + RNDr(hash, S, 21); + RNDr(hash, S, 22); + RNDr(hash, S, 23); + RNDr(hash, S, 24); + RNDr(hash, S, 25); + RNDr(hash, S, 26); + RNDr(hash, S, 27); + RNDr(hash, S, 28); + RNDr(hash, S, 29); + RNDr(hash, S, 30); + RNDr(hash, S, 31); + RNDr(hash, S, 32); + RNDr(hash, S, 33); + RNDr(hash, S, 34); + RNDr(hash, S, 35); + RNDr(hash, S, 36); + RNDr(hash, S, 37); + RNDr(hash, S, 38); + RNDr(hash, S, 39); + RNDr(hash, S, 40); + RNDr(hash, S, 41); + RNDr(hash, S, 42); + RNDr(hash, S, 43); + RNDr(hash, S, 44); + RNDr(hash, S, 45); + RNDr(hash, S, 46); + RNDr(hash, S, 47); + RNDr(hash, S, 48); + RNDr(hash, S, 49); + RNDr(hash, S, 50); + RNDr(hash, S, 51); + RNDr(hash, S, 52); + RNDr(hash, S, 53); + RNDr(hash, S, 54); + RNDr(hash, S, 55); + RNDr(hash, S, 56); + RNDr(hash, S, 57); + RNDr(hash, S, 58); + RNDr(hash, S, 59); + RNDr(hash, S, 60); + RNDr(hash, S, 
61); + RNDr(hash, S, 62); + RNDr(hash, S, 63); + + for (i = 0; i < 8; i++) + hash[i] += sha256_h[i]; +} + +#endif + +int scanhash_sha256d(int thr_id, uint32_t *pdata, const uint32_t *ptarget, + uint32_t max_nonce, unsigned long *hashes_done) +{ + uint32_t data[SHA256D_WAYS * 64] __attribute__((aligned(128))); + uint32_t hash[SHA256D_WAYS * 8] __attribute__((aligned(32))); + uint32_t midstate[SHA256D_WAYS * 8] __attribute__((aligned(32))); + uint32_t tmp[8]; + uint32_t n = pdata[19] - 1; + const uint32_t Htarg = ptarget[7]; + int i, j; + + for (i = 15; i >= 0; i--) + for (j = 0; j < SHA256D_WAYS; j++) + data[i * SHA256D_WAYS + j] = pdata[16 + i]; + + sha256_init(midstate); + sha256_transform(midstate, pdata, 0); + for (i = 7; i >= 0; i--) + for (j = 0; j < SHA256D_WAYS; j++) + midstate[i * SHA256D_WAYS + j] = midstate[i]; + + do { + for (i = 0; i < SHA256D_WAYS; i++) + data[SHA256D_WAYS * 3 + i] = ++n; + +#if SHA256D_WAYS == 4 + sha256d_4way(hash, data, midstate); +#else + sha256d(hash, data, midstate); +#endif + + for (i = 0; i < SHA256D_WAYS; i++) { + if (hash[SHA256D_WAYS * 7 + i] <= Htarg) { + for (j = 0; j < 8; j++) + tmp[j] = hash[SHA256D_WAYS * j + i]; + if (fulltest(tmp, ptarget)) { + *hashes_done = n - pdata[19] + 1; + pdata[19] = data[SHA256D_WAYS * 3 + i]; + return 1; + } + } + } + } while (n < max_nonce && !work_restart[thr_id].restart); + + *hashes_done = n - pdata[19] + 1; + pdata[19] = n; + return 0; +}
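
Note (not part of the patch): the sketch below is a hypothetical standalone cross-check of the SHA-256d primitive that the new scanhash_sha256d searches over, assuming OpenSSL's libcrypto is available. It just computes SHA-256(SHA-256(header)) over an 80-byte block header; the miner's inner loop performs the same double hash per nonce (4-way SIMD on x86-64, scalar sha256d() otherwise) and then compares word 7 of the digest against ptarget[7] before calling fulltest(). File name and the all-zero header are placeholders.

/*
 * Hypothetical cross-check (not part of this patch): double SHA-256 of an
 * 80-byte block header via OpenSSL, for comparison against the output of
 * the scalar sha256d() fallback added in sha2.c.
 * Build with: cc sha256d_check.c -lcrypto
 */
#include <stdio.h>
#include <string.h>
#include <openssl/sha.h>

int main(void)
{
	unsigned char header[80];               /* raw 80-byte block header */
	unsigned char h1[SHA256_DIGEST_LENGTH];
	unsigned char h2[SHA256_DIGEST_LENGTH];
	int i;

	memset(header, 0, sizeof(header));      /* placeholder: all-zero header */

	SHA256(header, sizeof(header), h1);     /* first SHA-256 pass */
	SHA256(h1, sizeof(h1), h2);             /* second pass: SHA-256d */

	for (i = 0; i < SHA256_DIGEST_LENGTH; i++)
		printf("%02x", h2[i]);
	putchar('\n');
	return 0;
}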