Reimplement SHA-256d

This commit is contained in:
pooler 2012-03-21 23:07:56 +01:00
parent b961766f4d
commit ed3d1c94f9
5 changed files with 1321 additions and 123 deletions

View file

@ -84,10 +84,12 @@ struct workio_cmd {
enum sha256_algos { enum sha256_algos {
ALGO_SCRYPT, /* scrypt(1024,1,1) */ ALGO_SCRYPT, /* scrypt(1024,1,1) */
ALGO_SHA256D, /* SHA-256d */
}; };
static const char *algo_names[] = { static const char *algo_names[] = {
[ALGO_SCRYPT] = "scrypt", [ALGO_SCRYPT] = "scrypt",
[ALGO_SHA256D] = "sha256d",
}; };
bool opt_debug = false; bool opt_debug = false;
@ -125,6 +127,9 @@ double *thr_hashrates;
static char const usage[] = "\ static char const usage[] = "\
Usage: " PROGRAM_NAME " [OPTIONS]\n\ Usage: " PROGRAM_NAME " [OPTIONS]\n\
Options:\n\ Options:\n\
-a, --algo=ALGO specify the algorithm to use\n\
scrypt scrypt(1024, 1, 1) (default)\n\
sha256d SHA-256d\n\
-o, --url=URL URL of mining server (default: " DEF_RPC_URL ")\n\ -o, --url=URL URL of mining server (default: " DEF_RPC_URL ")\n\
-O, --userpass=U:P username:password pair for mining server\n\ -O, --userpass=U:P username:password pair for mining server\n\
-u, --user=USERNAME username for mining server\n\ -u, --user=USERNAME username for mining server\n\
@ -223,7 +228,7 @@ static bool work_decode(const json_t *val, struct work *work)
} }
for (i = 0; i < ARRAY_SIZE(work->data); i++) for (i = 0; i < ARRAY_SIZE(work->data); i++)
work->data[i] = be32dec(work->data + i); work->data[i] = le32dec(work->data + i);
for (i = 0; i < ARRAY_SIZE(work->target); i++) for (i = 0; i < ARRAY_SIZE(work->target); i++)
work->target[i] = le32dec(work->target + i); work->target[i] = le32dec(work->target + i);
@ -251,7 +256,7 @@ static bool submit_upstream_work(CURL *curl, struct work *work)
/* build hex string */ /* build hex string */
for (i = 0; i < ARRAY_SIZE(work->data); i++) for (i = 0; i < ARRAY_SIZE(work->data); i++)
be32enc(work->data + i, work->data[i]); le32enc(work->data + i, work->data[i]);
hexstr = bin2hex((unsigned char *)work->data, sizeof(work->data)); hexstr = bin2hex((unsigned char *)work->data, sizeof(work->data));
if (unlikely(!hexstr)) { if (unlikely(!hexstr)) {
applog(LOG_ERR, "submit_upstream_work OOM"); applog(LOG_ERR, "submit_upstream_work OOM");
@ -544,7 +549,7 @@ static void *miner_thread(void *userdata)
- time(NULL); - time(NULL);
max64 *= thr_hashrates[thr_id]; max64 *= thr_hashrates[thr_id];
if (max64 <= 0) if (max64 <= 0)
max64 = 0xfffLL; max64 = opt_algo == ALGO_SCRYPT ? 0xfffLL : 0xfffffLL;
if (work.data[19] + max64 > end_nonce) if (work.data[19] + max64 > end_nonce)
max_nonce = end_nonce; max_nonce = end_nonce;
else else
@ -560,6 +565,11 @@ static void *miner_thread(void *userdata)
max_nonce, &hashes_done); max_nonce, &hashes_done);
break; break;
case ALGO_SHA256D:
rc = scanhash_sha256d(thr_id, work.data, work.target,
max_nonce, &hashes_done);
break;
default: default:
/* should never happen */ /* should never happen */
goto out; goto out;

View file

@ -113,11 +113,14 @@ void sha256_init(uint32_t *state);
void sha256_transform(uint32_t *state, const uint32_t *block, int swap); void sha256_transform(uint32_t *state, const uint32_t *block, int swap);
#if defined(__x86_64__) #if defined(__x86_64__)
#define SHA256_4WAY #define SHA256_4WAY 1
void sha256_init_4way(uint32_t *state); void sha256_init_4way(uint32_t *state);
void sha256_transform_4way(uint32_t *state, const uint32_t *block, int swap); void sha256_transform_4way(uint32_t *state, const uint32_t *block, int swap);
#endif #endif
extern int scanhash_sha256d(int thr_id, uint32_t *pdata,
const uint32_t *ptarget, uint32_t max_nonce, unsigned long *hashes_done);
extern unsigned char *scrypt_buffer_alloc(); extern unsigned char *scrypt_buffer_alloc();
extern int scanhash_scrypt(int thr_id, uint32_t *pdata, extern int scanhash_scrypt(int thr_id, uint32_t *pdata,
unsigned char *scratchbuf, const uint32_t *ptarget, unsigned char *scratchbuf, const uint32_t *ptarget,

View file

@ -35,7 +35,7 @@
#include <string.h> #include <string.h>
static const uint32_t keypad[12] = { static const uint32_t keypad[12] = {
0x00000080, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0x80020000 0x80000000, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0x00000280
}; };
static const uint32_t innerpad[11] = { static const uint32_t innerpad[11] = {
0x80000000, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0x000004a0 0x80000000, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0x000004a0
@ -57,7 +57,7 @@ static inline void HMAC_SHA256_80_init(const uint32_t *key,
/* tstate is assumed to contain the midstate of key */ /* tstate is assumed to contain the midstate of key */
memcpy(pad, key + 16, 16); memcpy(pad, key + 16, 16);
memcpy(pad + 4, keypad, 48); memcpy(pad + 4, keypad, 48);
sha256_transform(tstate, pad, 1); sha256_transform(tstate, pad, 0);
memcpy(ihash, tstate, 32); memcpy(ihash, tstate, 32);
sha256_init(ostate); sha256_init(ostate);
@ -83,10 +83,9 @@ static inline void PBKDF2_SHA256_80_128(const uint32_t *tstate,
int i, j; int i, j;
memcpy(istate, tstate, 32); memcpy(istate, tstate, 32);
sha256_transform(istate, salt, 1); sha256_transform(istate, salt, 0);
for (i = 0; i < 4; i++) memcpy(ibuf, salt + 16, 16);
ibuf[i] = swab32(salt[16 + i]);
memcpy(ibuf + 5, innerpad, 44); memcpy(ibuf + 5, innerpad, 44);
memcpy(obuf + 8, outerpad, 32); memcpy(obuf + 8, outerpad, 32);
@ -123,7 +122,7 @@ static inline void PBKDF2_SHA256_128_32(uint32_t *tstate, uint32_t *ostate,
#ifdef SHA256_4WAY #ifdef SHA256_4WAY
static const uint32_t keypad_4way[4 * 12] = { static const uint32_t keypad_4way[4 * 12] = {
0x00000080, 0x00000080, 0x00000080, 0x00000080, 0x80000000, 0x80000000, 0x80000000, 0x80000000,
0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000,
0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000,
0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000,
@ -134,7 +133,7 @@ static const uint32_t keypad_4way[4 * 12] = {
0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000,
0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000,
0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000,
0x80020000, 0x80020000, 0x80020000, 0x80020000 0x00000280, 0x00000280, 0x00000280, 0x00000280
}; };
static const uint32_t innerpad_4way[4 * 11] = { static const uint32_t innerpad_4way[4 * 11] = {
0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000,
@ -159,7 +158,7 @@ static const uint32_t outerpad_4way[4 * 8] = {
0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000,
0x00000300, 0x00000300, 0x00000300, 0x00000300 0x00000300, 0x00000300, 0x00000300, 0x00000300
}; };
static const uint32_t finalblk_4way[4 * 16] = { static const uint32_t finalblk_4way[4 * 16] __attribute__((aligned(16))) = {
0x00000001, 0x00000001, 0x00000001, 0x00000001, 0x00000001, 0x00000001, 0x00000001, 0x00000001,
0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000,
0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000,
@ -181,14 +180,14 @@ static const uint32_t finalblk_4way[4 * 16] = {
static inline void HMAC_SHA256_80_init_4way(const uint32_t *key, static inline void HMAC_SHA256_80_init_4way(const uint32_t *key,
uint32_t *tstate, uint32_t *ostate) uint32_t *tstate, uint32_t *ostate)
{ {
uint32_t ihash[4 * 8]; uint32_t ihash[4 * 8] __attribute__((aligned(16)));
uint32_t pad[4 * 16]; uint32_t pad[4 * 16] __attribute__((aligned(16)));
int i; int i;
/* tstate is assumed to contain the midstate of key */ /* tstate is assumed to contain the midstate of key */
memcpy(pad, key + 4 * 16, 4 * 16); memcpy(pad, key + 4 * 16, 4 * 16);
memcpy(pad + 4 * 4, keypad_4way, 4 * 48); memcpy(pad + 4 * 4, keypad_4way, 4 * 48);
sha256_transform_4way(tstate, pad, 1); sha256_transform_4way(tstate, pad, 0);
memcpy(ihash, tstate, 4 * 32); memcpy(ihash, tstate, 4 * 32);
sha256_init_4way(ostate); sha256_init_4way(ostate);
@ -209,15 +208,16 @@ static inline void HMAC_SHA256_80_init_4way(const uint32_t *key,
static inline void PBKDF2_SHA256_80_128_4way(const uint32_t *tstate, static inline void PBKDF2_SHA256_80_128_4way(const uint32_t *tstate,
const uint32_t *ostate, const uint32_t *salt, uint32_t *output) const uint32_t *ostate, const uint32_t *salt, uint32_t *output)
{ {
uint32_t istate[4 * 8], ostate2[4 * 8]; uint32_t istate[4 * 8] __attribute__((aligned(16)));
uint32_t ibuf[4 * 16], obuf[4 * 16]; uint32_t ostate2[4 * 8] __attribute__((aligned(16)));
uint32_t ibuf[4 * 16] __attribute__((aligned(16)));
uint32_t obuf[4 * 16] __attribute__((aligned(16)));
int i, j; int i, j;
memcpy(istate, tstate, 4 * 32); memcpy(istate, tstate, 4 * 32);
sha256_transform_4way(istate, salt, 1); sha256_transform_4way(istate, salt, 0);
for (i = 0; i < 4 * 4; i++) memcpy(ibuf, salt + 4 * 16, 4 * 16);
ibuf[i] = swab32(salt[4 * 16 + i]);
memcpy(ibuf + 4 * 5, innerpad_4way, 4 * 44); memcpy(ibuf + 4 * 5, innerpad_4way, 4 * 44);
memcpy(obuf + 4 * 8, outerpad_4way, 4 * 32); memcpy(obuf + 4 * 8, outerpad_4way, 4 * 32);
@ -239,7 +239,7 @@ static inline void PBKDF2_SHA256_80_128_4way(const uint32_t *tstate,
static inline void PBKDF2_SHA256_128_32_4way(uint32_t *tstate, static inline void PBKDF2_SHA256_128_32_4way(uint32_t *tstate,
uint32_t *ostate, const uint32_t *salt, uint32_t *output) uint32_t *ostate, const uint32_t *salt, uint32_t *output)
{ {
uint32_t buf[4 * 16]; uint32_t buf[4 * 16] __attribute__((aligned(16)));
int i; int i;
sha256_transform_4way(tstate, salt, 1); sha256_transform_4way(tstate, salt, 1);
@ -270,7 +270,7 @@ void scrypt_core(uint32_t *X, uint32_t *V);
#else #else
static inline void salsa20_8(uint32_t B[16], const uint32_t Bx[16]) static inline void xor_salsa8(uint32_t B[16], const uint32_t Bx[16])
{ {
uint32_t x00,x01,x02,x03,x04,x05,x06,x07,x08,x09,x10,x11,x12,x13,x14,x15; uint32_t x00,x01,x02,x03,x04,x05,x06,x07,x08,x09,x10,x11,x12,x13,x14,x15;
int i; int i;
@ -341,21 +341,18 @@ static inline void salsa20_8(uint32_t B[16], const uint32_t Bx[16])
static inline void scrypt_core(uint32_t *X, uint32_t *V) static inline void scrypt_core(uint32_t *X, uint32_t *V)
{ {
uint32_t i, j, k; uint32_t i, j, k;
uint64_t *p1, *p2;
p1 = (uint64_t *)X;
for (i = 0; i < 1024; i++) { for (i = 0; i < 1024; i++) {
memcpy(&V[i * 32], X, 128); memcpy(&V[i * 32], X, 128);
salsa20_8(&X[0], &X[16]); xor_salsa8(&X[0], &X[16]);
salsa20_8(&X[16], &X[0]); xor_salsa8(&X[16], &X[0]);
} }
for (i = 0; i < 1024; i++) { for (i = 0; i < 1024; i++) {
j = X[16] & 1023; j = 32 * (X[16] & 1023);
p2 = (uint64_t *)(&V[j * 32]); for (k = 0; k < 32; k++)
for (k = 0; k < 16; k++) X[k] ^= V[j + k];
p1[k] ^= p2[k]; xor_salsa8(&X[0], &X[16]);
salsa20_8(&X[0], &X[16]); xor_salsa8(&X[16], &X[0]);
salsa20_8(&X[16], &X[0]);
} }
} }
@ -377,8 +374,9 @@ static void scrypt_1024_1_1_256_sp(const uint32_t *input, uint32_t *output,
uint32_t *midstate, unsigned char *scratchpad) uint32_t *midstate, unsigned char *scratchpad)
{ {
uint32_t tstate[8], ostate[8]; uint32_t tstate[8], ostate[8];
uint32_t *V;
uint32_t X[32]; uint32_t X[32];
uint32_t *V;
V = (uint32_t *)(((uintptr_t)(scratchpad) + 63) & ~ (uintptr_t)(63)); V = (uint32_t *)(((uintptr_t)(scratchpad) + 63) & ~ (uintptr_t)(63));
memcpy(tstate, midstate, 32); memcpy(tstate, midstate, 32);
@ -396,8 +394,8 @@ static void scrypt_1024_1_1_256_sp_2way(const uint32_t *input,
{ {
uint32_t tstate1[8], tstate2[8]; uint32_t tstate1[8], tstate2[8];
uint32_t ostate1[8], ostate2[8]; uint32_t ostate1[8], ostate2[8];
uint32_t *V;
uint32_t X[2 * 32], *Y = X + 32; uint32_t X[2 * 32], *Y = X + 32;
uint32_t *V;
V = (uint32_t *)(((uintptr_t)(scratchpad) + 63) & ~ (uintptr_t)(63)); V = (uint32_t *)(((uintptr_t)(scratchpad) + 63) & ~ (uintptr_t)(63));
@ -419,9 +417,10 @@ static void scrypt_1024_1_1_256_sp_2way(const uint32_t *input,
static void scrypt_1024_1_1_256_sp_3way(const uint32_t *input, static void scrypt_1024_1_1_256_sp_3way(const uint32_t *input,
uint32_t *output, uint32_t *midstate, unsigned char *scratchpad) uint32_t *output, uint32_t *midstate, unsigned char *scratchpad)
{ {
uint32_t tstate[4 * 8], ostate[4 * 8]; uint32_t tstate[4 * 8] __attribute__((aligned(128)));
uint32_t X[3 * 32]; uint32_t ostate[4 * 8] __attribute__((aligned(128)));
uint32_t W[4 * 32]; uint32_t W[4 * 32] __attribute__((aligned(128)));
uint32_t X[3 * 32] __attribute__((aligned(128)));
uint32_t *V; uint32_t *V;
int i; int i;
@ -474,7 +473,7 @@ int scanhash_scrypt(int thr_id, uint32_t *pdata,
memcpy(data + i * 20, pdata, 80); memcpy(data + i * 20, pdata, 80);
sha256_init(midstate); sha256_init(midstate);
sha256_transform(midstate, data, 1); sha256_transform(midstate, data, 0);
do { do {
for (i = 0; i < throughput; i++) for (i = 0; i < throughput; i++)

View file

@ -16,7 +16,7 @@
#if defined(__x86_64__) #if defined(__x86_64__)
.data .data
.p2align 6 .p2align 7
sha256_4h: sha256_4h:
.long 0x6a09e667, 0x6a09e667, 0x6a09e667, 0x6a09e667 .long 0x6a09e667, 0x6a09e667, 0x6a09e667, 0x6a09e667
.long 0xbb67ae85, 0xbb67ae85, 0xbb67ae85, 0xbb67ae85 .long 0xbb67ae85, 0xbb67ae85, 0xbb67ae85, 0xbb67ae85
@ -28,7 +28,7 @@ sha256_4h:
.long 0x5be0cd19, 0x5be0cd19, 0x5be0cd19, 0x5be0cd19 .long 0x5be0cd19, 0x5be0cd19, 0x5be0cd19, 0x5be0cd19
.data .data
.p2align 6 .p2align 7
sha256_4k: sha256_4k:
.long 0x428a2f98, 0x428a2f98, 0x428a2f98, 0x428a2f98 .long 0x428a2f98, 0x428a2f98, 0x428a2f98, 0x428a2f98
.long 0x71374491, 0x71374491, 0x71374491, 0x71374491 .long 0x71374491, 0x71374491, 0x71374491, 0x71374491
@ -126,6 +126,537 @@ _sha256_init_4way:
#endif #endif
ret ret
/*
 * sha256_sse2_extend_round i
 *
 * Message-schedule expansion for two adjacent slots, i and i+1, of a table
 * addressed through %rcx.  Every slot is 16 bytes, handled as four packed
 * 32-bit lanes.  For each slot n the macro stores
 *
 *     W[n] = W[n-16] + s0(W[n-15]) + W[n-7] + s1(W[n-2])
 *     s0(x) = ror(x, 7)  ^ ror(x, 18) ^ (x >> 3)
 *     s1(x) = ror(x, 17) ^ ror(x, 19) ^ (x >> 10)
 *
 * Each rotation is assembled from one right shift and one left shift that
 * are XORed together.  The two slots are computed in lock-step: slot i uses
 * %xmm0-%xmm3 and slot i+1 uses %xmm4-%xmm7, so all of %xmm0-%xmm7 are
 * overwritten.  All memory accesses are movdqa/paddd, which require the
 * table to be 16-byte aligned.
 */
.macro sha256_sse2_extend_round i
/* Load W[i-15] / W[i-14] and keep a copy for the left-shift halves. */
movdqa (\i-15)*16(%rcx), %xmm0
movdqa (\i-14)*16(%rcx), %xmm4
movdqa %xmm0, %xmm2
movdqa %xmm4, %xmm6
/* s0: start with x >> 3, then derive x >> 7 and x << 14 from it. */
psrld $3, %xmm0
psrld $3, %xmm4
movdqa %xmm0, %xmm1
movdqa %xmm4, %xmm5
pslld $14, %xmm2
pslld $14, %xmm6
psrld $4, %xmm1
psrld $4, %xmm5
pxor %xmm1, %xmm0
pxor %xmm5, %xmm4
/* x >> 7 becomes x >> 18; x << 14 becomes x << 25. */
psrld $11, %xmm1
psrld $11, %xmm5
pxor %xmm2, %xmm0
pxor %xmm6, %xmm4
pslld $11, %xmm2
pslld $11, %xmm6
pxor %xmm1, %xmm0
pxor %xmm5, %xmm4
pxor %xmm2, %xmm0
pxor %xmm6, %xmm4
/* Load W[i-2] / W[i-1] for s1 and start accumulating the sum. */
movdqa (\i-2)*16(%rcx), %xmm3
movdqa (\i-1)*16(%rcx), %xmm7
paddd (\i-16)*16(%rcx), %xmm0
paddd (\i-15)*16(%rcx), %xmm4
movdqa %xmm3, %xmm2
movdqa %xmm7, %xmm6
/* s1: start with x >> 10, then derive x >> 17 and x << 13 from it. */
psrld $10, %xmm3
psrld $10, %xmm7
movdqa %xmm3, %xmm1
movdqa %xmm7, %xmm5
paddd (\i-7)*16(%rcx), %xmm0
pslld $13, %xmm2
pslld $13, %xmm6
psrld $7, %xmm1
psrld $7, %xmm5
paddd (\i-6)*16(%rcx), %xmm4
pxor %xmm1, %xmm3
pxor %xmm5, %xmm7
/* x >> 17 becomes x >> 19; x << 13 becomes x << 15. */
psrld $2, %xmm1
psrld $2, %xmm5
pxor %xmm2, %xmm3
pxor %xmm6, %xmm7
pslld $2, %xmm2
pslld $2, %xmm6
pxor %xmm1, %xmm3
pxor %xmm5, %xmm7
pxor %xmm2, %xmm3
pxor %xmm6, %xmm7
/* Add s1 and write both finished slots back. */
paddd %xmm3, %xmm0
paddd %xmm7, %xmm4
movdqa %xmm0, \i*16(%rcx)
movdqa %xmm4, (\i+1)*16(%rcx)
.endm
/*
 * sha256_sse2_extend_loop
 *
 * Fully unrolled message-schedule expansion (SSE2).  Expects %rcx to point
 * at the first slot to be produced; the macro reads back as far as 16 slots
 * (256 bytes) before %rcx, so those must already hold the input words.
 * The 24 expansions of the two-slot macro fill slots 0..47 relative to
 * %rcx.  Overwrites %xmm0-%xmm7.  Code is aligned to a 64-byte boundary.
 */
.text
.p2align 6
sha256_sse2_extend_loop:
sha256_sse2_extend_round 0
sha256_sse2_extend_round 2
sha256_sse2_extend_round 4
sha256_sse2_extend_round 6
sha256_sse2_extend_round 8
sha256_sse2_extend_round 10
sha256_sse2_extend_round 12
sha256_sse2_extend_round 14
sha256_sse2_extend_round 16
sha256_sse2_extend_round 18
sha256_sse2_extend_round 20
sha256_sse2_extend_round 22
sha256_sse2_extend_round 24
sha256_sse2_extend_round 26
sha256_sse2_extend_round 28
sha256_sse2_extend_round 30
sha256_sse2_extend_round 32
sha256_sse2_extend_round 34
sha256_sse2_extend_round 36
sha256_sse2_extend_round 38
sha256_sse2_extend_round 40
sha256_sse2_extend_round 42
sha256_sse2_extend_round 44
sha256_sse2_extend_round 46
ret
/*
 * sha256_sse2_main_round i
 *
 * One SHA-256 compression round on four packed 32-bit lanes.
 * Slot i of the table at %rax and slot i of the table at %rcx are added
 * into the round (the callers in this file pass the message schedule in
 * %rax and sha256_4k in %rcx).
 *
 * Working variables on entry and on exit:
 *     a = %xmm7   b = %xmm5   c = %xmm4   d = %xmm3
 *     e = %xmm0   f = %xmm8   g = %xmm9   h = %xmm10
 * Scratch: %xmm1, %xmm2, %xmm6.
 *
 * The a..h shift of the round is done with register moves interleaved
 * with the arithmetic, so the instruction order must not be changed.
 */
.macro sha256_sse2_main_round i
/* %xmm6 = table_rax[i] + table_rcx[i] + h */
movdqa 16*\i(%rax), %xmm6
paddd 16*\i(%rcx), %xmm6
paddd %xmm10, %xmm6
/* Ch = (~e & g) ^ (e & f); meanwhile g -> h, f -> g, e -> f. */
movdqa %xmm0, %xmm1
movdqa %xmm9, %xmm2
pandn %xmm2, %xmm1
movdqa %xmm2, %xmm10
movdqa %xmm8, %xmm2
movdqa %xmm2, %xmm9
pand %xmm0, %xmm2
pxor %xmm2, %xmm1
movdqa %xmm0, %xmm8
paddd %xmm1, %xmm6
/* ror(e,6) ^ ror(e,11) ^ ror(e,25), built in %xmm0 from shift pairs. */
movdqa %xmm0, %xmm1
psrld $6, %xmm0
movdqa %xmm0, %xmm2
pslld $7, %xmm1
psrld $5, %xmm2
pxor %xmm1, %xmm0
pxor %xmm2, %xmm0
pslld $14, %xmm1
psrld $14, %xmm2
pxor %xmm1, %xmm0
pxor %xmm2, %xmm0
pslld $5, %xmm1
pxor %xmm1, %xmm0
paddd %xmm0, %xmm6
/* New e = d + accumulated sum. */
movdqa %xmm3, %xmm0
paddd %xmm6, %xmm0
/* Maj = (a & b) ^ (a & c) ^ (b & c); meanwhile c -> d, b -> c, a -> b. */
movdqa %xmm5, %xmm1
movdqa %xmm4, %xmm3
movdqa %xmm4, %xmm2
pand %xmm5, %xmm2
pand %xmm7, %xmm4
pand %xmm7, %xmm1
pxor %xmm4, %xmm1
movdqa %xmm5, %xmm4
movdqa %xmm7, %xmm5
pxor %xmm2, %xmm1
paddd %xmm1, %xmm6
/* ror(a,2) ^ ror(a,13) ^ ror(a,22), built in %xmm7 from shift pairs. */
movdqa %xmm7, %xmm2
psrld $2, %xmm7
movdqa %xmm7, %xmm1
pslld $10, %xmm2
psrld $11, %xmm1
pxor %xmm2, %xmm7
pxor %xmm1, %xmm7
pslld $9, %xmm2
psrld $9, %xmm1
pxor %xmm2, %xmm7
pxor %xmm1, %xmm7
pslld $11, %xmm2
pxor %xmm2, %xmm7
/* New a = rotation term + accumulated sum. */
paddd %xmm6, %xmm7
.endm
/*
 * sha256_sse2_main_loop
 *
 * All 64 compression rounds, fully unrolled (SSE2).  Inputs: %rax and %rcx
 * point at two 64-slot tables of 16-byte entries that are summed into each
 * round; the working state is held in %xmm7, %xmm5, %xmm4, %xmm3, %xmm0,
 * %xmm8, %xmm9, %xmm10 (a..h) and is left in the same registers on return.
 * %xmm1, %xmm2 and %xmm6 are scratch.  The caller adds the previous state
 * afterwards; this routine does not.
 */
.text
.p2align 6
sha256_sse2_main_loop:
sha256_sse2_main_round 0
sha256_sse2_main_round 1
sha256_sse2_main_round 2
sha256_sse2_main_round 3
sha256_sse2_main_round 4
sha256_sse2_main_round 5
sha256_sse2_main_round 6
sha256_sse2_main_round 7
sha256_sse2_main_round 8
sha256_sse2_main_round 9
sha256_sse2_main_round 10
sha256_sse2_main_round 11
sha256_sse2_main_round 12
sha256_sse2_main_round 13
sha256_sse2_main_round 14
sha256_sse2_main_round 15
sha256_sse2_main_round 16
sha256_sse2_main_round 17
sha256_sse2_main_round 18
sha256_sse2_main_round 19
sha256_sse2_main_round 20
sha256_sse2_main_round 21
sha256_sse2_main_round 22
sha256_sse2_main_round 23
sha256_sse2_main_round 24
sha256_sse2_main_round 25
sha256_sse2_main_round 26
sha256_sse2_main_round 27
sha256_sse2_main_round 28
sha256_sse2_main_round 29
sha256_sse2_main_round 30
sha256_sse2_main_round 31
sha256_sse2_main_round 32
sha256_sse2_main_round 33
sha256_sse2_main_round 34
sha256_sse2_main_round 35
sha256_sse2_main_round 36
sha256_sse2_main_round 37
sha256_sse2_main_round 38
sha256_sse2_main_round 39
sha256_sse2_main_round 40
sha256_sse2_main_round 41
sha256_sse2_main_round 42
sha256_sse2_main_round 43
sha256_sse2_main_round 44
sha256_sse2_main_round 45
sha256_sse2_main_round 46
sha256_sse2_main_round 47
sha256_sse2_main_round 48
sha256_sse2_main_round 49
sha256_sse2_main_round 50
sha256_sse2_main_round 51
sha256_sse2_main_round 52
sha256_sse2_main_round 53
sha256_sse2_main_round 54
sha256_sse2_main_round 55
sha256_sse2_main_round 56
sha256_sse2_main_round 57
sha256_sse2_main_round 58
sha256_sse2_main_round 59
sha256_sse2_main_round 60
sha256_sse2_main_round 61
sha256_sse2_main_round 62
sha256_sse2_main_round 63
ret
#if defined(USE_AVX)
/*
 * sha256_avx_extend_round i
 *
 * Same computation as sha256_sse2_extend_round: produces schedule slots
 * i and i+1 (16 bytes each, four 32-bit lanes) relative to %rcx as
 *
 *     W[n] = W[n-16] + s0(W[n-15]) + W[n-7] + s1(W[n-2])
 *     s0(x) = ror(x, 7)  ^ ror(x, 18) ^ (x >> 3)
 *     s1(x) = ror(x, 17) ^ ror(x, 19) ^ (x >> 10)
 *
 * The difference is that the three-operand vpslld/vpsrld forms write the
 * shifted value to a separate register, which removes the register copies
 * the SSE2 version needs.  Overwrites %xmm0-%xmm7.
 *
 * NOTE(review): legacy-SSE and VEX encodings are mixed in this macro;
 * whether that costs anything on the targeted CPUs is not established here.
 */
.macro sha256_avx_extend_round i
movdqa (\i-15)*16(%rcx), %xmm0
movdqa (\i-14)*16(%rcx), %xmm4
/* s0: x << 14 taken from the unshifted value, then x >> 3 in place. */
vpslld $14, %xmm0, %xmm2
vpslld $14, %xmm4, %xmm6
psrld $3, %xmm0
psrld $3, %xmm4
/* (x >> 3) >> 4 = x >> 7 */
vpsrld $4, %xmm0, %xmm1
vpsrld $4, %xmm4, %xmm5
pxor %xmm1, %xmm0
pxor %xmm5, %xmm4
/* x >> 7 becomes x >> 18; x << 14 becomes x << 25. */
psrld $11, %xmm1
psrld $11, %xmm5
pxor %xmm2, %xmm0
pxor %xmm6, %xmm4
pslld $11, %xmm2
pslld $11, %xmm6
pxor %xmm1, %xmm0
pxor %xmm5, %xmm4
pxor %xmm2, %xmm0
pxor %xmm6, %xmm4
/* Load W[i-2] / W[i-1] for s1 and start accumulating the sum. */
movdqa (\i-2)*16(%rcx), %xmm3
movdqa (\i-1)*16(%rcx), %xmm7
paddd (\i-16)*16(%rcx), %xmm0
paddd (\i-15)*16(%rcx), %xmm4
/* s1: x << 13 taken from the unshifted value, then x >> 10 in place. */
vpslld $13, %xmm3, %xmm2
vpslld $13, %xmm7, %xmm6
psrld $10, %xmm3
psrld $10, %xmm7
paddd (\i-7)*16(%rcx), %xmm0
/* (x >> 10) >> 7 = x >> 17 */
vpsrld $7, %xmm3, %xmm1
vpsrld $7, %xmm7, %xmm5
paddd (\i-6)*16(%rcx), %xmm4
pxor %xmm1, %xmm3
pxor %xmm5, %xmm7
/* x >> 17 becomes x >> 19; x << 13 becomes x << 15. */
psrld $2, %xmm1
psrld $2, %xmm5
pxor %xmm2, %xmm3
pxor %xmm6, %xmm7
pslld $2, %xmm2
pslld $2, %xmm6
pxor %xmm1, %xmm3
pxor %xmm5, %xmm7
pxor %xmm2, %xmm3
pxor %xmm6, %xmm7
/* Add s1 and write both finished slots back. */
paddd %xmm3, %xmm0
paddd %xmm7, %xmm4
movdqa %xmm0, \i*16(%rcx)
movdqa %xmm4, (\i+1)*16(%rcx)
.endm
/*
 * sha256_avx_extend_loop
 *
 * Fully unrolled message-schedule expansion (AVX variant).  Expects %rcx
 * to point at the first slot to be produced, with the 16 preceding 16-byte
 * slots already filled; writes slots 0..47 relative to %rcx.
 * Overwrites %xmm0-%xmm7.
 */
.text
.p2align 6
sha256_avx_extend_loop:
sha256_avx_extend_round 0
sha256_avx_extend_round 2
sha256_avx_extend_round 4
sha256_avx_extend_round 6
sha256_avx_extend_round 8
sha256_avx_extend_round 10
sha256_avx_extend_round 12
sha256_avx_extend_round 14
sha256_avx_extend_round 16
sha256_avx_extend_round 18
sha256_avx_extend_round 20
sha256_avx_extend_round 22
sha256_avx_extend_round 24
sha256_avx_extend_round 26
sha256_avx_extend_round 28
sha256_avx_extend_round 30
sha256_avx_extend_round 32
sha256_avx_extend_round 34
sha256_avx_extend_round 36
sha256_avx_extend_round 38
sha256_avx_extend_round 40
sha256_avx_extend_round 42
sha256_avx_extend_round 44
sha256_avx_extend_round 46
ret
/*
 * sha256_avx_main_round i, r0, r1, r2, r3, r4, r5, r6, r7
 *
 * One SHA-256 compression round on four packed 32-bit lanes, with the
 * working variables passed in as register names instead of being fixed:
 *     r0 = h   r1 = g   r2 = f   r3 = e
 *     r4 = d   r5 = c   r6 = b   r7 = a
 * Slot i of the tables at %rax and %rcx is added into the round.
 *
 * On exit r0 holds the new e and r4 holds the new a; r1-r3 and r5-r7 are
 * left untouched.  The caller realises the a..h shift by permuting the
 * register arguments on the next invocation, so no register moves are
 * needed.  Scratch: %xmm1, %xmm2, %xmm6.
 */
.macro sha256_avx_main_round i, r0, r1, r2, r3, r4, r5, r6, r7
/* %xmm6 = table_rax[i] + h + table_rcx[i] */
vpaddd 16*(\i)(%rax), \r0, %xmm6
paddd 16*(\i)(%rcx), %xmm6
/* Ch = (~e & g) ^ (e & f) */
vpandn \r1, \r3, %xmm1
vpand \r3, \r2, %xmm2
pxor %xmm2, %xmm1
paddd %xmm1, %xmm6
/* ror(e,6) ^ ror(e,11) ^ ror(e,25), built in r0 (h is no longer needed). */
vpslld $7, \r3, %xmm1
vpsrld $6, \r3, \r0
vpsrld $5, \r0, %xmm2
pxor %xmm1, \r0
pxor %xmm2, \r0
pslld $14, %xmm1
psrld $14, %xmm2
pxor %xmm1, \r0
pxor %xmm2, \r0
pslld $5, %xmm1
pxor %xmm1, \r0
paddd \r0, %xmm6
/* New e = d + accumulated sum, stored in r0. */
vpaddd %xmm6, \r4, \r0
/* Maj = (b & c) ^ (a & c) ^ (a & b); r4 (d) is reused as a temporary. */
vpand \r6, \r5, %xmm2
vpand \r7, \r5, \r4
vpand \r7, \r6, %xmm1
pxor \r4, %xmm1
pxor %xmm2, %xmm1
paddd %xmm1, %xmm6
/* ror(a,2) ^ ror(a,13) ^ ror(a,22), built in r4. */
vpslld $10, \r7, %xmm2
vpsrld $2, \r7, \r4
vpsrld $11, \r4, %xmm1
pxor %xmm2, \r4
pxor %xmm1, \r4
pslld $9, %xmm2
psrld $9, %xmm1
pxor %xmm2, \r4
pxor %xmm1, \r4
pslld $11, %xmm2
pxor %xmm2, \r4
/* New a = rotation term + accumulated sum, stored in r4. */
paddd %xmm6, \r4
.endm
/*
 * sha256_avx_main_quadround i
 *
 * Rounds i..i+3.  Each invocation rotates the two register groups
 * (%xmm10, %xmm9, %xmm8, %xmm0) and (%xmm3, %xmm4, %xmm5, %xmm7) by one
 * position, following where the previous round left the new e and new a.
 * After four rounds the assignment is back to the starting one:
 * a = %xmm7, b = %xmm5, c = %xmm4, d = %xmm3,
 * e = %xmm0, f = %xmm8, g = %xmm9, h = %xmm10.
 */
.macro sha256_avx_main_quadround i
sha256_avx_main_round \i+0, %xmm10, %xmm9, %xmm8, %xmm0, %xmm3, %xmm4, %xmm5, %xmm7
sha256_avx_main_round \i+1, %xmm9, %xmm8, %xmm0, %xmm10, %xmm4, %xmm5, %xmm7, %xmm3
sha256_avx_main_round \i+2, %xmm8, %xmm0, %xmm10, %xmm9, %xmm5, %xmm7, %xmm3, %xmm4
sha256_avx_main_round \i+3, %xmm0, %xmm10, %xmm9, %xmm8, %xmm7, %xmm3, %xmm4, %xmm5
.endm
/*
 * sha256_avx_main_loop
 *
 * All 64 compression rounds as 16 unrolled quad-rounds (AVX variant).
 * Same register contract as sha256_sse2_main_loop: %rax and %rcx point at
 * the two 64-slot tables, the state a..h lives in %xmm7, %xmm5, %xmm4,
 * %xmm3, %xmm0, %xmm8, %xmm9, %xmm10 on entry and on return, and
 * %xmm1, %xmm2, %xmm6 are scratch.
 */
.text
.p2align 6
sha256_avx_main_loop:
sha256_avx_main_quadround 0
sha256_avx_main_quadround 4
sha256_avx_main_quadround 8
sha256_avx_main_quadround 12
sha256_avx_main_quadround 16
sha256_avx_main_quadround 20
sha256_avx_main_quadround 24
sha256_avx_main_quadround 28
sha256_avx_main_quadround 32
sha256_avx_main_quadround 36
sha256_avx_main_quadround 40
sha256_avx_main_quadround 44
sha256_avx_main_quadround 48
sha256_avx_main_quadround 52
sha256_avx_main_quadround 56
sha256_avx_main_quadround 60
ret
#endif /* USE_AVX */
#if defined(USE_XOP)
/*
 * sha256_xop_extend_round i
 *
 * XOP variant of the two-slot message-schedule expansion: produces slots
 * i and i+1 (16 bytes each, four 32-bit lanes) relative to %rcx as
 *
 *     W[n] = W[n-16] + s0(W[n-15]) + W[n-7] + s1(W[n-2])
 *     s0(x) = ror(x, 7)  ^ ror(x, 18) ^ (x >> 3)
 *     s1(x) = ror(x, 17) ^ ror(x, 19) ^ (x >> 10)
 *
 * vprotd rotates left by its immediate, so a right rotation by k is
 * written as vprotd $(32-k).  Overwrites %xmm0-%xmm7.
 */
.macro sha256_xop_extend_round i
vmovdqa (\i-15)*16(%rcx), %xmm0
vmovdqa (\i-14)*16(%rcx), %xmm4
/* rol 25 = ror 7, rol 14 = ror 18 */
vprotd $25, %xmm0, %xmm1
vprotd $25, %xmm4, %xmm5
vprotd $14, %xmm0, %xmm2
vprotd $14, %xmm4, %xmm6
vpxor %xmm1, %xmm2, %xmm2
vpxor %xmm5, %xmm6, %xmm6
vpsrld $3, %xmm0, %xmm0
vpsrld $3, %xmm4, %xmm4
vpxor %xmm2, %xmm0, %xmm0
vpxor %xmm6, %xmm4, %xmm4
vmovdqa (\i-2)*16(%rcx), %xmm3
vmovdqa (\i-1)*16(%rcx), %xmm7
vpaddd (\i-16)*16(%rcx), %xmm0, %xmm0
vpaddd (\i-15)*16(%rcx), %xmm4, %xmm4
/* rol 15 = ror 17, rol 13 = ror 19 */
vprotd $15, %xmm3, %xmm1
vprotd $15, %xmm7, %xmm5
vprotd $13, %xmm3, %xmm2
vprotd $13, %xmm7, %xmm6
vpxor %xmm1, %xmm2, %xmm2
vpxor %xmm5, %xmm6, %xmm6
vpaddd (\i-7)*16(%rcx), %xmm0, %xmm0
vpaddd (\i-6)*16(%rcx), %xmm4, %xmm4
vpsrld $10, %xmm3, %xmm3
vpsrld $10, %xmm7, %xmm7
vpxor %xmm2, %xmm3, %xmm3
vpxor %xmm6, %xmm7, %xmm7
/* Add s1 and write both finished slots back. */
vpaddd %xmm3, %xmm0, %xmm0
vpaddd %xmm7, %xmm4, %xmm4
vmovdqa %xmm0, \i*16(%rcx)
vmovdqa %xmm4, (\i+1)*16(%rcx)
.endm
/*
 * sha256_xop_extend_loop
 *
 * Fully unrolled message-schedule expansion (XOP variant).  Expects %rcx
 * to point at the first slot to be produced, with the 16 preceding 16-byte
 * slots already filled; writes slots 0..47 relative to %rcx.
 * Overwrites %xmm0-%xmm7.
 */
.text
.p2align 6
sha256_xop_extend_loop:
sha256_xop_extend_round 0
sha256_xop_extend_round 2
sha256_xop_extend_round 4
sha256_xop_extend_round 6
sha256_xop_extend_round 8
sha256_xop_extend_round 10
sha256_xop_extend_round 12
sha256_xop_extend_round 14
sha256_xop_extend_round 16
sha256_xop_extend_round 18
sha256_xop_extend_round 20
sha256_xop_extend_round 22
sha256_xop_extend_round 24
sha256_xop_extend_round 26
sha256_xop_extend_round 28
sha256_xop_extend_round 30
sha256_xop_extend_round 32
sha256_xop_extend_round 34
sha256_xop_extend_round 36
sha256_xop_extend_round 38
sha256_xop_extend_round 40
sha256_xop_extend_round 42
sha256_xop_extend_round 44
sha256_xop_extend_round 46
ret
/*
 * sha256_xop_main_round i, r0, r1, r2, r3, r4, r5, r6, r7
 *
 * One SHA-256 compression round on four packed 32-bit lanes (XOP variant).
 * Register arguments carry the working variables:
 *     r0 = h   r1 = g   r2 = f   r3 = e
 *     r4 = d   r5 = c   r6 = b   r7 = a
 * Slot i of the tables at %rax and %rcx is added into the round.
 *
 * On exit r0 holds the new e and r4 holds the new a; the other register
 * arguments are unchanged.  Rotations use vprotd (rotate left), so a right
 * rotation by k appears as vprotd $(32-k).  Scratch: %xmm1, %xmm2, %xmm6.
 */
.macro sha256_xop_main_round i, r0, r1, r2, r3, r4, r5, r6, r7
/* %xmm6 = table_rax[i] + h + table_rcx[i] */
vpaddd 16*(\i)(%rax), \r0, %xmm6
vpaddd 16*(\i)(%rcx), %xmm6, %xmm6
/* Ch = (~e & g) ^ (e & f) */
vpandn \r1, \r3, %xmm1
vpand \r3, \r2, %xmm2
vpxor %xmm2, %xmm1, %xmm1
vpaddd %xmm1, %xmm6, %xmm6
/* ror(e,6) ^ ror(e,11) ^ ror(e,25), result in r0 */
vprotd $26, \r3, %xmm1
vprotd $21, \r3, %xmm2
vpxor %xmm1, %xmm2, %xmm2
vprotd $7, \r3, \r0
vpxor %xmm2, \r0, \r0
vpaddd \r0, %xmm6, %xmm6
/* New e = d + accumulated sum, stored in r0. */
vpaddd %xmm6, \r4, \r0
/* Maj = (b & c) ^ (a & c) ^ (a & b); r4 (d) is reused as a temporary. */
vpand \r6, \r5, %xmm2
vpand \r7, \r5, \r4
vpand \r7, \r6, %xmm1
vpxor \r4, %xmm1, %xmm1
vpxor %xmm2, %xmm1, %xmm1
vpaddd %xmm1, %xmm6, %xmm6
/* ror(a,2) ^ ror(a,13) ^ ror(a,22), result in r4 */
vprotd $30, \r7, %xmm1
vprotd $19, \r7, %xmm2
vpxor %xmm1, %xmm2, %xmm2
vprotd $10, \r7, \r4
vpxor %xmm2, \r4, \r4
/* New a = rotation term + accumulated sum, stored in r4. */
vpaddd %xmm6, \r4, \r4
.endm
/*
 * sha256_xop_main_quadround i
 *
 * Rounds i..i+3.  The register arguments are rotated by one position per
 * round within the groups (%xmm10, %xmm9, %xmm8, %xmm0) and
 * (%xmm3, %xmm4, %xmm5, %xmm7), matching where each round leaves the new
 * e and new a.  After four rounds the assignment is back to
 * a = %xmm7, b = %xmm5, c = %xmm4, d = %xmm3,
 * e = %xmm0, f = %xmm8, g = %xmm9, h = %xmm10.
 */
.macro sha256_xop_main_quadround i
sha256_xop_main_round \i+0, %xmm10, %xmm9, %xmm8, %xmm0, %xmm3, %xmm4, %xmm5, %xmm7
sha256_xop_main_round \i+1, %xmm9, %xmm8, %xmm0, %xmm10, %xmm4, %xmm5, %xmm7, %xmm3
sha256_xop_main_round \i+2, %xmm8, %xmm0, %xmm10, %xmm9, %xmm5, %xmm7, %xmm3, %xmm4
sha256_xop_main_round \i+3, %xmm0, %xmm10, %xmm9, %xmm8, %xmm7, %xmm3, %xmm4, %xmm5
.endm
/*
 * sha256_xop_main_loop
 *
 * All 64 compression rounds as 16 unrolled quad-rounds (XOP variant).
 * Same register contract as the SSE2 and AVX main loops: %rax and %rcx
 * point at the two 64-slot tables, the state a..h lives in %xmm7, %xmm5,
 * %xmm4, %xmm3, %xmm0, %xmm8, %xmm9, %xmm10 on entry and on return, and
 * %xmm1, %xmm2, %xmm6 are scratch.
 */
.text
.p2align 6
sha256_xop_main_loop:
sha256_xop_main_quadround 0
sha256_xop_main_quadround 4
sha256_xop_main_quadround 8
sha256_xop_main_quadround 12
sha256_xop_main_quadround 16
sha256_xop_main_quadround 20
sha256_xop_main_quadround 24
sha256_xop_main_quadround 28
sha256_xop_main_quadround 32
sha256_xop_main_quadround 36
sha256_xop_main_quadround 40
sha256_xop_main_quadround 44
sha256_xop_main_quadround 48
sha256_xop_main_quadround 52
sha256_xop_main_quadround 56
sha256_xop_main_quadround 60
ret
#endif /* USE_XOP */
.macro p2bswap_rsi_rsp i .macro p2bswap_rsi_rsp i
movdqu \i*16(%rsi), %xmm0 movdqu \i*16(%rsi), %xmm0
movdqu (\i+1)*16(%rsi), %xmm2 movdqu (\i+1)*16(%rsi), %xmm2
@ -165,7 +696,9 @@ _sha256_transform_4way:
movq %rdx, %rsi movq %rdx, %rsi
movq %r8, %rdx movq %r8, %rdx
#endif #endif
movq %rsp, %r8
subq $1032, %rsp subq $1032, %rsp
andq $-128, %rsp
testq %rdx, %rdx testq %rdx, %rdx
jz sha256_transform_4way_block_copy jz sha256_transform_4way_block_copy
@ -391,7 +924,7 @@ sha256_transform_4way_main_loop:
movdqu %xmm9, 96(%rdi) movdqu %xmm9, 96(%rdi)
movdqu %xmm10, 112(%rdi) movdqu %xmm10, 112(%rdi)
addq $1032, %rsp movq %r8, %rsp
#if defined(WIN64) #if defined(WIN64)
popq %rsi popq %rsi
movdqa 0(%rsp), %xmm6 movdqa 0(%rsp), %xmm6
@ -404,5 +937,414 @@ sha256_transform_4way_main_loop:
popq %rdi popq %rdi
#endif #endif
ret ret
/*
 * sha256d_4way_addr caches the address of the implementation selected by
 * sha256d_4way.  Zero means "not selected yet".
 */
.data
.p2align 3
sha256d_4way_addr:
.quad 0x0
/*
 * sha256d_4way / _sha256d_4way
 *
 * Public entry point (exported with and without a leading underscore).
 * It is a dispatcher: if an implementation address is already cached it
 * jumps straight to it; otherwise it probes the CPU once, caches the
 * choice and then jumps.  Because it jumps instead of calling, the chosen
 * implementation sees the caller's arguments and return address unchanged.
 *
 * Selection order: XOP (if built with USE_XOP), then AVX (if built with
 * USE_AVX), otherwise SSE2.
 *
 * NOTE(review): the cache is read and written without any synchronisation
 * visible here; confirm that concurrent first calls are acceptable.
 */
.text
.p2align 6
.globl sha256d_4way
.globl _sha256d_4way
sha256d_4way:
_sha256d_4way:
movq sha256d_4way_addr(%rip), %rax
testq %rax, %rax
jz sha256d_4way_set
jmp *%rax
sha256d_4way_set:
/*
 * cpuid overwrites %eax, %ebx, %ecx and %edx.  %rcx and %rdx are saved
 * because the implementations read arguments from them (%rcx/%rdx in the
 * WIN64 prologue, %rdx otherwise); %rbx is saved and restored as well.
 */
pushq %rbx
pushq %rcx
pushq %rdx
#if defined(USE_AVX)
# Check for AVX and OSXSAVE support
/* Leaf 1: both bits of mask 0x18000000 in %ecx must be set. */
movl $1, %eax
cpuid
andl $0x18000000, %ecx
cmpl $0x18000000, %ecx
jne sha256d_4way_set_sse2
# Check for XMM and YMM state support
/* xgetbv with %ecx = 0: both bits of mask 0x6 in %eax must be set. */
xorl %ecx, %ecx
xgetbv
andl $0x00000006, %eax
cmpl $0x00000006, %eax
jne sha256d_4way_set_sse2
#if defined(USE_XOP)
# Check for XOP support
/* Leaf 0x80000001: bit 0x800 in %ecx; fall back to AVX when clear. */
movl $0x80000001, %eax
cpuid
andl $0x00000800, %ecx
jz sha256d_4way_set_avx
sha256d_4way_set_xop:
leaq sha256d_4way_xop(%rip), %rax
jmp sha256d_4way_set_done
#endif /* USE_XOP */
sha256d_4way_set_avx:
leaq sha256d_4way_avx(%rip), %rax
jmp sha256d_4way_set_done
#endif /* USE_AVX */
sha256d_4way_set_sse2:
leaq sha256d_4way_sse2(%rip), %rax
sha256d_4way_set_done:
/* Remember the choice, restore the saved registers, tail-jump. */
movq %rax, sha256d_4way_addr(%rip)
popq %rdx
popq %rcx
popq %rbx
jmp *%rax
/*
 * sha256d_4way_sse2
 *
 * SSE2 implementation behind sha256d_4way: two chained SHA-256
 * compressions over four interleaved lanes.
 *
 * Arguments (after the WIN64 prologue has moved %rcx/%rdx/%r8 into the
 * same registers):
 *     %rdi  output: 8 slots of 16 bytes are written here
 *     %rsi  message schedule of the first compression; slots 0..15 are
 *           read and slots 16..63 are computed and written in place
 *     %rdx  starting state of the first compression, 8 slots of 16 bytes
 * All three are accessed with movdqa and therefore must be 16-byte
 * aligned.  Presumably the C prototype is (hash, data, midstate) - TODO
 * confirm against the header.
 *
 * Steps:
 *   1. Expand the schedule at %rsi and run 64 rounds starting from the
 *      state at %rdx, then add that state back in.
 *   2. Lay out the 8-word result on the stack as a new block: slot 8 =
 *      0x80000000, slots 9..14 = 0, slot 15 = 0x00000100.
 *   3. Expand and run 64 rounds on that block starting from sha256_4h,
 *      add sha256_4h back in, and store the result to %rdi.
 *
 * The 1032-byte frame holds the 64-slot (1024-byte) second schedule; the
 * remaining 8 bytes appear to be there to bring %rsp to a 16-byte boundary
 * for the movdqa stores, assuming the caller honoured the ABI stack
 * alignment.
 */
.p2align 6
sha256d_4way_sse2:
/* WIN64: preserve %rdi, %rsi and %xmm6-%xmm10, then remap arguments. */
#if defined(WIN64)
pushq %rdi
subq $80, %rsp
movdqa %xmm6, 0(%rsp)
movdqa %xmm7, 16(%rsp)
movdqa %xmm8, 32(%rsp)
movdqa %xmm9, 48(%rsp)
movdqa %xmm10, 64(%rsp)
pushq %rsi
movq %rcx, %rdi
movq %rdx, %rsi
movq %r8, %rdx
#endif
subq $1032, %rsp
/* First compression: extend slots 16..63 of the schedule at %rsi. */
leaq 256(%rsi), %rcx
call sha256_sse2_extend_loop
/* Load the starting state a..h from %rdx. */
movdqa 0(%rdx), %xmm7
movdqa 16(%rdx), %xmm5
movdqa 32(%rdx), %xmm4
movdqa 48(%rdx), %xmm3
movdqa 64(%rdx), %xmm0
movdqa 80(%rdx), %xmm8
movdqa 96(%rdx), %xmm9
movdqa 112(%rdx), %xmm10
movq %rsi, %rax
leaq sha256_4k(%rip), %rcx
call sha256_sse2_main_loop
/* Feed-forward: add the starting state to the round output. */
paddd 0(%rdx), %xmm7
paddd 16(%rdx), %xmm5
paddd 32(%rdx), %xmm4
paddd 48(%rdx), %xmm3
paddd 64(%rdx), %xmm0
paddd 80(%rdx), %xmm8
paddd 96(%rdx), %xmm9
paddd 112(%rdx), %xmm10
/* The first hash becomes slots 0..7 of the second block on the stack. */
movdqa %xmm7, 0(%rsp)
movdqa %xmm5, 16(%rsp)
movdqa %xmm4, 32(%rsp)
movdqa %xmm3, 48(%rsp)
movdqa %xmm0, 64(%rsp)
movdqa %xmm8, 80(%rsp)
movdqa %xmm9, 96(%rsp)
movdqa %xmm10, 112(%rsp)
/*
 * Padding: broadcast the high dword (0x80000000) into %xmm2 for slot 8 and
 * the low dword (0x00000100) into %xmm1 for slot 15; slots 9..14 are zero.
 */
pxor %xmm0, %xmm0
movq $0x8000000000000100, %rax
movd %rax, %xmm1
pshufd $0x55, %xmm1, %xmm2
pshufd $0x00, %xmm1, %xmm1
movdqa %xmm2, 128(%rsp)
movdqa %xmm0, 144(%rsp)
movdqa %xmm0, 160(%rsp)
movdqa %xmm0, 176(%rsp)
movdqa %xmm0, 192(%rsp)
movdqa %xmm0, 208(%rsp)
movdqa %xmm0, 224(%rsp)
movdqa %xmm1, 240(%rsp)
/* Second compression: extend the stack schedule, start from sha256_4h. */
leaq 256(%rsp), %rcx
call sha256_sse2_extend_loop
movdqa sha256_4h+0(%rip), %xmm7
movdqa sha256_4h+16(%rip), %xmm5
movdqa sha256_4h+32(%rip), %xmm4
movdqa sha256_4h+48(%rip), %xmm3
movdqa sha256_4h+64(%rip), %xmm0
movdqa sha256_4h+80(%rip), %xmm8
movdqa sha256_4h+96(%rip), %xmm9
movdqa sha256_4h+112(%rip), %xmm10
movq %rsp, %rax
leaq sha256_4k(%rip), %rcx
call sha256_sse2_main_loop
/* Feed-forward with sha256_4h, then write the final hash to %rdi. */
paddd sha256_4h+0(%rip), %xmm7
paddd sha256_4h+16(%rip), %xmm5
paddd sha256_4h+32(%rip), %xmm4
paddd sha256_4h+48(%rip), %xmm3
paddd sha256_4h+64(%rip), %xmm0
paddd sha256_4h+80(%rip), %xmm8
paddd sha256_4h+96(%rip), %xmm9
paddd sha256_4h+112(%rip), %xmm10
movdqa %xmm7, 0(%rdi)
movdqa %xmm5, 16(%rdi)
movdqa %xmm4, 32(%rdi)
movdqa %xmm3, 48(%rdi)
movdqa %xmm0, 64(%rdi)
movdqa %xmm8, 80(%rdi)
movdqa %xmm9, 96(%rdi)
movdqa %xmm10, 112(%rdi)
addq $1032, %rsp
/* WIN64: restore the registers saved in the prologue. */
#if defined(WIN64)
popq %rsi
movdqa 0(%rsp), %xmm6
movdqa 16(%rsp), %xmm7
movdqa 32(%rsp), %xmm8
movdqa 48(%rsp), %xmm9
movdqa 64(%rsp), %xmm10
addq $80, %rsp
popq %rdi
#endif
ret
#if defined(USE_AVX)
/*
 * sha256d_4way_avx
 *
 * AVX implementation behind sha256d_4way.  Identical in structure to
 * sha256d_4way_sse2; only the extend/main loops it calls differ.
 *
 * Arguments (after the WIN64 prologue has remapped %rcx/%rdx/%r8):
 *     %rdi  output: 8 slots of 16 bytes are written here
 *     %rsi  message schedule of the first compression; slots 0..15 are
 *           read and slots 16..63 are computed and written in place
 *     %rdx  starting state of the first compression, 8 slots of 16 bytes
 * All three are accessed with movdqa and therefore must be 16-byte
 * aligned.
 *
 * The first compression starts from the state at %rdx; its result is
 * padded on the stack (slot 8 = 0x80000000, slots 9..14 = 0, slot 15 =
 * 0x00000100) and compressed again starting from sha256_4h.
 */
.p2align 6
sha256d_4way_avx:
/* WIN64: preserve %rdi, %rsi and %xmm6-%xmm10, then remap arguments. */
#if defined(WIN64)
pushq %rdi
subq $80, %rsp
movdqa %xmm6, 0(%rsp)
movdqa %xmm7, 16(%rsp)
movdqa %xmm8, 32(%rsp)
movdqa %xmm9, 48(%rsp)
movdqa %xmm10, 64(%rsp)
pushq %rsi
movq %rcx, %rdi
movq %rdx, %rsi
movq %r8, %rdx
#endif
/* 1032 bytes: room for the 64-slot schedule of the second compression. */
subq $1032, %rsp
/* First compression: extend slots 16..63 of the schedule at %rsi. */
leaq 256(%rsi), %rcx
call sha256_avx_extend_loop
/* Load the starting state a..h from %rdx. */
movdqa 0(%rdx), %xmm7
movdqa 16(%rdx), %xmm5
movdqa 32(%rdx), %xmm4
movdqa 48(%rdx), %xmm3
movdqa 64(%rdx), %xmm0
movdqa 80(%rdx), %xmm8
movdqa 96(%rdx), %xmm9
movdqa 112(%rdx), %xmm10
movq %rsi, %rax
leaq sha256_4k(%rip), %rcx
call sha256_avx_main_loop
/* Feed-forward: add the starting state to the round output. */
paddd 0(%rdx), %xmm7
paddd 16(%rdx), %xmm5
paddd 32(%rdx), %xmm4
paddd 48(%rdx), %xmm3
paddd 64(%rdx), %xmm0
paddd 80(%rdx), %xmm8
paddd 96(%rdx), %xmm9
paddd 112(%rdx), %xmm10
/* The first hash becomes slots 0..7 of the second block on the stack. */
movdqa %xmm7, 0(%rsp)
movdqa %xmm5, 16(%rsp)
movdqa %xmm4, 32(%rsp)
movdqa %xmm3, 48(%rsp)
movdqa %xmm0, 64(%rsp)
movdqa %xmm8, 80(%rsp)
movdqa %xmm9, 96(%rsp)
movdqa %xmm10, 112(%rsp)
/*
 * Padding: broadcast the high dword (0x80000000) into %xmm2 for slot 8 and
 * the low dword (0x00000100) into %xmm1 for slot 15; slots 9..14 are zero.
 */
pxor %xmm0, %xmm0
movq $0x8000000000000100, %rax
movd %rax, %xmm1
pshufd $0x55, %xmm1, %xmm2
pshufd $0x00, %xmm1, %xmm1
movdqa %xmm2, 128(%rsp)
movdqa %xmm0, 144(%rsp)
movdqa %xmm0, 160(%rsp)
movdqa %xmm0, 176(%rsp)
movdqa %xmm0, 192(%rsp)
movdqa %xmm0, 208(%rsp)
movdqa %xmm0, 224(%rsp)
movdqa %xmm1, 240(%rsp)
/* Second compression: extend the stack schedule, start from sha256_4h. */
leaq 256(%rsp), %rcx
call sha256_avx_extend_loop
movdqa sha256_4h+0(%rip), %xmm7
movdqa sha256_4h+16(%rip), %xmm5
movdqa sha256_4h+32(%rip), %xmm4
movdqa sha256_4h+48(%rip), %xmm3
movdqa sha256_4h+64(%rip), %xmm0
movdqa sha256_4h+80(%rip), %xmm8
movdqa sha256_4h+96(%rip), %xmm9
movdqa sha256_4h+112(%rip), %xmm10
movq %rsp, %rax
leaq sha256_4k(%rip), %rcx
call sha256_avx_main_loop
/* Feed-forward with sha256_4h, then write the final hash to %rdi. */
paddd sha256_4h+0(%rip), %xmm7
paddd sha256_4h+16(%rip), %xmm5
paddd sha256_4h+32(%rip), %xmm4
paddd sha256_4h+48(%rip), %xmm3
paddd sha256_4h+64(%rip), %xmm0
paddd sha256_4h+80(%rip), %xmm8
paddd sha256_4h+96(%rip), %xmm9
paddd sha256_4h+112(%rip), %xmm10
movdqa %xmm7, 0(%rdi)
movdqa %xmm5, 16(%rdi)
movdqa %xmm4, 32(%rdi)
movdqa %xmm3, 48(%rdi)
movdqa %xmm0, 64(%rdi)
movdqa %xmm8, 80(%rdi)
movdqa %xmm9, 96(%rdi)
movdqa %xmm10, 112(%rdi)
addq $1032, %rsp
/* WIN64: restore the registers saved in the prologue. */
#if defined(WIN64)
popq %rsi
movdqa 0(%rsp), %xmm6
movdqa 16(%rsp), %xmm7
movdqa 32(%rsp), %xmm8
movdqa 48(%rsp), %xmm9
movdqa 64(%rsp), %xmm10
addq $80, %rsp
popq %rdi
#endif
ret
#endif /* USE_AVX */
#if defined(USE_XOP)
/*
 * sha256d_4way_xop -- double SHA-256 over four interleaved lanes, built on
 * the sha256_xop_extend_loop / sha256_xop_main_loop helpers.
 *
 * Register roles once the optional WIN64 shuffle below has run:
 *   %rdi  destination; eight 16-byte vectors are stored there at the end
 *   %rsi  message buffer for the first hash; 256(%rsi) is handed to the
 *         extend loop and %rsi itself to the main loop
 *   %rdx  input state; eight 16-byte vectors are loaded from it and added
 *         back after the first main loop
 *
 * NOTE(review): the helper contracts are inferred from the call sites only.
 * The extend loop appears to take %rcx = address of schedule word 16, and
 * the main loop appears to take %rax = schedule, %rcx = constant table, with
 * the state carried in xmm7, xmm5, xmm4, xmm3, xmm0, xmm8, xmm9, xmm10.
 * Confirm against their definitions.
 */
.p2align 6
sha256d_4way_xop:
#if defined(WIN64)
/*
 * Save %rdi, %rsi and xmm6-xmm10 (all restored before returning), then move
 * the three arguments from %rcx/%rdx/%r8 into %rdi/%rsi/%rdx.
 */
pushq %rdi
subq $80, %rsp
movdqa %xmm6, 0(%rsp)
movdqa %xmm7, 16(%rsp)
movdqa %xmm8, 32(%rsp)
movdqa %xmm9, 48(%rsp)
movdqa %xmm10, 64(%rsp)
pushq %rsi
movq %rcx, %rdi
movq %rdx, %rsi
movq %r8, %rdx
#endif
/*
 * Scratch area for the second hash: 64 words x 16 bytes = 1024 bytes.
 * NOTE(review): the extra 8 bytes presumably restore 16-byte alignment of
 * %rsp for the movdqa stores below -- confirm against the ABI in use.
 */
subq $1032, %rsp
/* First hash: extend the caller's schedule starting at word 16. */
leaq 256(%rsi), %rcx
call sha256_xop_extend_loop
/* Load the starting state from the buffer at %rdx. */
movdqa 0(%rdx), %xmm7
movdqa 16(%rdx), %xmm5
movdqa 32(%rdx), %xmm4
movdqa 48(%rdx), %xmm3
movdqa 64(%rdx), %xmm0
movdqa 80(%rdx), %xmm8
movdqa 96(%rdx), %xmm9
movdqa 112(%rdx), %xmm10
movq %rsi, %rax
leaq sha256_4k(%rip), %rcx
call sha256_xop_main_loop
/* Add the starting state back in to finish the first hash. */
paddd 0(%rdx), %xmm7
paddd 16(%rdx), %xmm5
paddd 32(%rdx), %xmm4
paddd 48(%rdx), %xmm3
paddd 64(%rdx), %xmm0
paddd 80(%rdx), %xmm8
paddd 96(%rdx), %xmm9
paddd 112(%rdx), %xmm10
/* The first hash becomes words 0..7 of the second message block. */
movdqa %xmm7, 0(%rsp)
movdqa %xmm5, 16(%rsp)
movdqa %xmm4, 32(%rsp)
movdqa %xmm3, 48(%rsp)
movdqa %xmm0, 64(%rsp)
movdqa %xmm8, 80(%rsp)
movdqa %xmm9, 96(%rsp)
movdqa %xmm10, 112(%rsp)
/*
 * Build words 8..15 of the second block in every lane:
 * word 8 = 0x80000000, words 9..14 = 0, word 15 = 0x00000100.
 * The 64-bit immediate holds 0x00000100 in its low dword and 0x80000000 in
 * its high dword; pshufd $0x55 broadcasts dword 1 and pshufd $0x00
 * broadcasts dword 0.
 */
pxor %xmm0, %xmm0
movq $0x8000000000000100, %rax
movd %rax, %xmm1
pshufd $0x55, %xmm1, %xmm2
pshufd $0x00, %xmm1, %xmm1
movdqa %xmm2, 128(%rsp)
movdqa %xmm0, 144(%rsp)
movdqa %xmm0, 160(%rsp)
movdqa %xmm0, 176(%rsp)
movdqa %xmm0, 192(%rsp)
movdqa %xmm0, 208(%rsp)
movdqa %xmm0, 224(%rsp)
movdqa %xmm1, 240(%rsp)
/* Second hash: extend the on-stack schedule starting at word 16. */
leaq 256(%rsp), %rcx
call sha256_xop_extend_loop
/* The second hash starts from the state stored in sha256_4h. */
movdqa sha256_4h+0(%rip), %xmm7
movdqa sha256_4h+16(%rip), %xmm5
movdqa sha256_4h+32(%rip), %xmm4
movdqa sha256_4h+48(%rip), %xmm3
movdqa sha256_4h+64(%rip), %xmm0
movdqa sha256_4h+80(%rip), %xmm8
movdqa sha256_4h+96(%rip), %xmm9
movdqa sha256_4h+112(%rip), %xmm10
movq %rsp, %rax
leaq sha256_4k(%rip), %rcx
call sha256_xop_main_loop
/* Add sha256_4h back in to finish the second hash. */
paddd sha256_4h+0(%rip), %xmm7
paddd sha256_4h+16(%rip), %xmm5
paddd sha256_4h+32(%rip), %xmm4
paddd sha256_4h+48(%rip), %xmm3
paddd sha256_4h+64(%rip), %xmm0
paddd sha256_4h+80(%rip), %xmm8
paddd sha256_4h+96(%rip), %xmm9
paddd sha256_4h+112(%rip), %xmm10
/* Store the eight result vectors to the destination buffer. */
movdqa %xmm7, 0(%rdi)
movdqa %xmm5, 16(%rdi)
movdqa %xmm4, 32(%rdi)
movdqa %xmm3, 48(%rdi)
movdqa %xmm0, 64(%rdi)
movdqa %xmm8, 80(%rdi)
movdqa %xmm9, 96(%rdi)
movdqa %xmm10, 112(%rdi)
/* Release the scratch area. */
addq $1032, %rsp
#if defined(WIN64)
/* Undo the WIN64 prologue in reverse order. */
popq %rsi
movdqa 0(%rsp), %xmm6
movdqa 16(%rsp), %xmm7
movdqa 32(%rsp), %xmm8
movdqa 48(%rsp), %xmm9
movdqa 64(%rsp), %xmm10
addq $80, %rsp
popq %rdi
#endif
ret
#endif /* USE_XOP */
#endif #endif

408
sha2.c
View file

@ -13,43 +13,60 @@
#include <string.h> #include <string.h>
#include <stdint.h> #include <stdint.h>
/*
 * SHA-256 initial hash value. sha256_init() copies these eight words into
 * the caller's state.
 */
static const uint32_t sha256_h[8] = {
0x6a09e667, 0xbb67ae85, 0x3c6ef372, 0xa54ff53a,
0x510e527f, 0x9b05688c, 0x1f83d9ab, 0x5be0cd19
};
/*
 * SHA-256 round constants, one per round. The RNDr() macro adds
 * sha256_k[i] to message word W[i] for round i.
 */
static const uint32_t sha256_k[64] = {
0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5,
0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5,
0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3,
0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174,
0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc,
0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da,
0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7,
0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967,
0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13,
0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85,
0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3,
0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070,
0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5,
0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3,
0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208,
0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2
};
void sha256_init(uint32_t *state) void sha256_init(uint32_t *state)
{ {
/* Magic initialization constants */ memcpy(state, sha256_h, 32);
state[0] = 0x6a09e667;
state[1] = 0xbb67ae85;
state[2] = 0x3C6ef372;
state[3] = 0xa54ff53a;
state[4] = 0x510e527f;
state[5] = 0x9b05688c;
state[6] = 0x1f83d9Ab;
state[7] = 0x5be0cd19;
} }
/* Elementary functions used by SHA256 */ /* Elementary functions used by SHA256 */
#define Ch(x, y, z) ((x & (y ^ z)) ^ z) #define Ch(x, y, z) ((x & (y ^ z)) ^ z)
#define Maj(x, y, z) ((x & (y | z)) | (y & z)) #define Maj(x, y, z) ((x & (y | z)) | (y & z))
#define SHR(x, n) (x >> n)
#define ROTR(x, n) ((x >> n) | (x << (32 - n))) #define ROTR(x, n) ((x >> n) | (x << (32 - n)))
#define S0(x) (ROTR(x, 2) ^ ROTR(x, 13) ^ ROTR(x, 22)) #define S0(x) (ROTR(x, 2) ^ ROTR(x, 13) ^ ROTR(x, 22))
#define S1(x) (ROTR(x, 6) ^ ROTR(x, 11) ^ ROTR(x, 25)) #define S1(x) (ROTR(x, 6) ^ ROTR(x, 11) ^ ROTR(x, 25))
#define s0(x) (ROTR(x, 7) ^ ROTR(x, 18) ^ SHR(x, 3)) #define s0(x) (ROTR(x, 7) ^ ROTR(x, 18) ^ (x >> 3))
#define s1(x) (ROTR(x, 17) ^ ROTR(x, 19) ^ SHR(x, 10)) #define s1(x) (ROTR(x, 17) ^ ROTR(x, 19) ^ (x >> 10))
/* SHA256 round function */ /* SHA256 round function */
#define RND(a, b, c, d, e, f, g, h, k) \ #define RND(a, b, c, d, e, f, g, h, k) \
t0 = h + S1(e) + Ch(e, f, g) + k; \ do { \
t1 = S0(a) + Maj(a, b, c); \ t0 = h + S1(e) + Ch(e, f, g) + k; \
d += t0; \ t1 = S0(a) + Maj(a, b, c); \
h = t0 + t1; d += t0; \
h = t0 + t1; \
} while (0)
/* Adjusted round function for rotating state */ /* Adjusted round function for rotating state */
#define RNDr(S, W, i, k) \ #define RNDr(S, W, i) \
RND(S[(64 - i) % 8], S[(65 - i) % 8], \ RND(S[(64 - i) % 8], S[(65 - i) % 8], \
S[(66 - i) % 8], S[(67 - i) % 8], \ S[(66 - i) % 8], S[(67 - i) % 8], \
S[(68 - i) % 8], S[(69 - i) % 8], \ S[(68 - i) % 8], S[(69 - i) % 8], \
S[(70 - i) % 8], S[(71 - i) % 8], \ S[(70 - i) % 8], S[(71 - i) % 8], \
W[i] + k) W[i] + sha256_k[i])
/* /*
* SHA256 block compression function. The 256-bit state is transformed via * SHA256 block compression function. The 256-bit state is transformed via
@ -77,72 +94,299 @@ void sha256_transform(uint32_t *state, const uint32_t *block, int swap)
memcpy(S, state, 32); memcpy(S, state, 32);
/* 3. Mix. */ /* 3. Mix. */
RNDr(S, W, 0, 0x428a2f98); RNDr(S, W, 0);
RNDr(S, W, 1, 0x71374491); RNDr(S, W, 1);
RNDr(S, W, 2, 0xb5c0fbcf); RNDr(S, W, 2);
RNDr(S, W, 3, 0xe9b5dba5); RNDr(S, W, 3);
RNDr(S, W, 4, 0x3956c25b); RNDr(S, W, 4);
RNDr(S, W, 5, 0x59f111f1); RNDr(S, W, 5);
RNDr(S, W, 6, 0x923f82a4); RNDr(S, W, 6);
RNDr(S, W, 7, 0xab1c5ed5); RNDr(S, W, 7);
RNDr(S, W, 8, 0xd807aa98); RNDr(S, W, 8);
RNDr(S, W, 9, 0x12835b01); RNDr(S, W, 9);
RNDr(S, W, 10, 0x243185be); RNDr(S, W, 10);
RNDr(S, W, 11, 0x550c7dc3); RNDr(S, W, 11);
RNDr(S, W, 12, 0x72be5d74); RNDr(S, W, 12);
RNDr(S, W, 13, 0x80deb1fe); RNDr(S, W, 13);
RNDr(S, W, 14, 0x9bdc06a7); RNDr(S, W, 14);
RNDr(S, W, 15, 0xc19bf174); RNDr(S, W, 15);
RNDr(S, W, 16, 0xe49b69c1); RNDr(S, W, 16);
RNDr(S, W, 17, 0xefbe4786); RNDr(S, W, 17);
RNDr(S, W, 18, 0x0fc19dc6); RNDr(S, W, 18);
RNDr(S, W, 19, 0x240ca1cc); RNDr(S, W, 19);
RNDr(S, W, 20, 0x2de92c6f); RNDr(S, W, 20);
RNDr(S, W, 21, 0x4a7484aa); RNDr(S, W, 21);
RNDr(S, W, 22, 0x5cb0a9dc); RNDr(S, W, 22);
RNDr(S, W, 23, 0x76f988da); RNDr(S, W, 23);
RNDr(S, W, 24, 0x983e5152); RNDr(S, W, 24);
RNDr(S, W, 25, 0xa831c66d); RNDr(S, W, 25);
RNDr(S, W, 26, 0xb00327c8); RNDr(S, W, 26);
RNDr(S, W, 27, 0xbf597fc7); RNDr(S, W, 27);
RNDr(S, W, 28, 0xc6e00bf3); RNDr(S, W, 28);
RNDr(S, W, 29, 0xd5a79147); RNDr(S, W, 29);
RNDr(S, W, 30, 0x06ca6351); RNDr(S, W, 30);
RNDr(S, W, 31, 0x14292967); RNDr(S, W, 31);
RNDr(S, W, 32, 0x27b70a85); RNDr(S, W, 32);
RNDr(S, W, 33, 0x2e1b2138); RNDr(S, W, 33);
RNDr(S, W, 34, 0x4d2c6dfc); RNDr(S, W, 34);
RNDr(S, W, 35, 0x53380d13); RNDr(S, W, 35);
RNDr(S, W, 36, 0x650a7354); RNDr(S, W, 36);
RNDr(S, W, 37, 0x766a0abb); RNDr(S, W, 37);
RNDr(S, W, 38, 0x81c2c92e); RNDr(S, W, 38);
RNDr(S, W, 39, 0x92722c85); RNDr(S, W, 39);
RNDr(S, W, 40, 0xa2bfe8a1); RNDr(S, W, 40);
RNDr(S, W, 41, 0xa81a664b); RNDr(S, W, 41);
RNDr(S, W, 42, 0xc24b8b70); RNDr(S, W, 42);
RNDr(S, W, 43, 0xc76c51a3); RNDr(S, W, 43);
RNDr(S, W, 44, 0xd192e819); RNDr(S, W, 44);
RNDr(S, W, 45, 0xd6990624); RNDr(S, W, 45);
RNDr(S, W, 46, 0xf40e3585); RNDr(S, W, 46);
RNDr(S, W, 47, 0x106aa070); RNDr(S, W, 47);
RNDr(S, W, 48, 0x19a4c116); RNDr(S, W, 48);
RNDr(S, W, 49, 0x1e376c08); RNDr(S, W, 49);
RNDr(S, W, 50, 0x2748774c); RNDr(S, W, 50);
RNDr(S, W, 51, 0x34b0bcb5); RNDr(S, W, 51);
RNDr(S, W, 52, 0x391c0cb3); RNDr(S, W, 52);
RNDr(S, W, 53, 0x4ed8aa4a); RNDr(S, W, 53);
RNDr(S, W, 54, 0x5b9cca4f); RNDr(S, W, 54);
RNDr(S, W, 55, 0x682e6ff3); RNDr(S, W, 55);
RNDr(S, W, 56, 0x748f82ee); RNDr(S, W, 56);
RNDr(S, W, 57, 0x78a5636f); RNDr(S, W, 57);
RNDr(S, W, 58, 0x84c87814); RNDr(S, W, 58);
RNDr(S, W, 59, 0x8cc70208); RNDr(S, W, 59);
RNDr(S, W, 60, 0x90befffa); RNDr(S, W, 60);
RNDr(S, W, 61, 0xa4506ceb); RNDr(S, W, 61);
RNDr(S, W, 62, 0xbef9a3f7); RNDr(S, W, 62);
RNDr(S, W, 63, 0xc67178f2); RNDr(S, W, 63);
/* 4. Mix local working variables into global state */ /* 4. Mix local working variables into global state */
for (i = 0; i < 8; i++) for (i = 0; i < 8; i++)
state[i] += S[i]; state[i] += S[i];
} }
#if defined(__x86_64__)
/* Number of nonces hashed per call; buffers hold this many interleaved lanes. */
#define SHA256D_WAYS 4
/*
 * Four-lane SHA-256d, defined outside this file. scanhash_sha256d() passes
 * buffers in which word w of lane l sits at index w * SHA256D_WAYS + l.
 * NOTE(review): presumably implemented by the assembly routines
 * (sha256d_4way_avx / sha256d_4way_xop or similar) -- confirm.
 */
void sha256d_4way(uint32_t *hash, uint32_t *data, const uint32_t *midstate);
#else
#define SHA256D_WAYS 1
/*
 * Template for the second SHA-256 message block. sha256d() reads only
 * words 8..15: the 0x80000000 padding marker, six zero words, and
 * 0x00000100 (256, the bit length of the 32-byte first hash). Words 0..7
 * are placeholders for that first hash.
 */
static const uint32_t sha256d_hash1[16] = {
0x00000000, 0x00000000, 0x00000000, 0x00000000,
0x00000000, 0x00000000, 0x00000000, 0x00000000,
0x80000000, 0x00000000, 0x00000000, 0x00000000,
0x00000000, 0x00000000, 0x00000000, 0x00000100
};
/*
 * Eight consecutive SHA-256 rounds, base .. base + 7. The round number is
 * kept a compile-time constant so RNDr() can fold its state rotation.
 * Relies on t0/t1 being declared in the enclosing scope, as RNDr() does.
 */
#define SHA256D_RND8(st, msg, base) \
	do { \
		RNDr(st, msg, ((base) + 0)); \
		RNDr(st, msg, ((base) + 1)); \
		RNDr(st, msg, ((base) + 2)); \
		RNDr(st, msg, ((base) + 3)); \
		RNDr(st, msg, ((base) + 4)); \
		RNDr(st, msg, ((base) + 5)); \
		RNDr(st, msg, ((base) + 6)); \
		RNDr(st, msg, ((base) + 7)); \
	} while (0)

/* All 64 SHA-256 rounds over state st with expanded message msg. */
#define SHA256D_RND64(st, msg) \
	do { \
		SHA256D_RND8(st, msg, 0); \
		SHA256D_RND8(st, msg, 8); \
		SHA256D_RND8(st, msg, 16); \
		SHA256D_RND8(st, msg, 24); \
		SHA256D_RND8(st, msg, 32); \
		SHA256D_RND8(st, msg, 40); \
		SHA256D_RND8(st, msg, 48); \
		SHA256D_RND8(st, msg, 56); \
	} while (0)

/* Expand message words 16..63 in place from words 0..15, two per pass. */
static inline void sha256d_extend(uint32_t *w)
{
	int t;

	for (t = 16; t < 64; t += 2) {
		w[t] = s1(w[t - 2]) + w[t - 7] + s0(w[t - 15]) + w[t - 16];
		w[t + 1] = s1(w[t - 1]) + w[t - 6] + s0(w[t - 14]) + w[t - 15];
	}
}

/*
 * Single-lane SHA-256d.
 *
 * hash      receives the eight words of the final digest state
 * W         64-word buffer; words 0..15 hold the message block to compress
 *           on top of midstate, words 16..63 are overwritten with the
 *           expanded schedule
 * midstate  eight-word state the first compression starts from
 */
static inline void sha256d(uint32_t *hash, uint32_t *W,
	const uint32_t *midstate)
{
	uint32_t buf[64];
	uint32_t t0, t1;
	int k;

	/* First hash: compress W starting from midstate. */
	sha256d_extend(W);
	memcpy(buf, midstate, 32);
	SHA256D_RND64(buf, W);
	for (k = 0; k < 8; k++)
		buf[k] += midstate[k];

	/*
	 * Second hash: buf[0..7] now holds the first digest; append the fixed
	 * padding words and reuse buf as the message schedule.
	 */
	memcpy(buf + 8, sha256d_hash1 + 8, 32);
	sha256d_extend(buf);
	sha256_init(hash);
	SHA256D_RND64(hash, buf);
	for (k = 0; k < 8; k++)
		hash[k] += sha256_h[k];
}

#undef SHA256D_RND64
#undef SHA256D_RND8
#endif
/*
 * Scan nonces for a SHA-256d hash that satisfies the target.
 *
 * thr_id       index into work_restart[]; scanning stops when that entry's
 *              restart flag is set
 * pdata        header words: pdata[0..15] are compressed once into a
 *              midstate, pdata[16..31] form the second block, whose word 3
 *              (pdata[19]) is the nonce. On return pdata[19] holds the
 *              accepted nonce, or the last nonce tried.
 * ptarget      eight-word target; ptarget[7] serves as a cheap pre-filter
 *              before fulltest() is called
 * max_nonce    scanning stops once the nonce counter reaches this value
 * hashes_done  receives the number of nonces tried
 *
 * Returns 1 if fulltest() accepted a hash, 0 otherwise.
 */
int scanhash_sha256d(int thr_id, uint32_t *pdata, const uint32_t *ptarget,
	uint32_t max_nonce, unsigned long *hashes_done)
{
	uint32_t data[SHA256D_WAYS * 64] __attribute__((aligned(128)));
	uint32_t hash[SHA256D_WAYS * 8] __attribute__((aligned(32)));
	uint32_t midstate[SHA256D_WAYS * 8] __attribute__((aligned(32)));
	uint32_t candidate[8];
	const uint32_t top_target = ptarget[7];
	uint32_t nonce = pdata[19] - 1;
	int word, lane;

	/* Replicate the second block into every lane (word-major layout). */
	for (word = 0; word < 16; word++) {
		const uint32_t value = pdata[16 + word];

		for (lane = 0; lane < SHA256D_WAYS; lane++)
			data[word * SHA256D_WAYS + lane] = value;
	}

	/* Hash the first block once; it is the same for every nonce. */
	sha256_init(midstate);
	sha256_transform(midstate, pdata, 0);

	/*
	 * Spread the midstate across the lanes in place. Walk from the top
	 * word down so no word is overwritten before it has been read.
	 */
	word = 8;
	while (word-- > 0) {
		const uint32_t value = midstate[word];

		for (lane = 0; lane < SHA256D_WAYS; lane++)
			midstate[word * SHA256D_WAYS + lane] = value;
	}

	for (;;) {
		/* Give each lane the next nonce. */
		for (lane = 0; lane < SHA256D_WAYS; lane++)
			data[SHA256D_WAYS * 3 + lane] = ++nonce;

#if SHA256D_WAYS == 4
		sha256d_4way(hash, data, midstate);
#else
		sha256d(hash, data, midstate);
#endif

		for (lane = 0; lane < SHA256D_WAYS; lane++) {
			/* Quick reject on the top hash word. */
			if (hash[SHA256D_WAYS * 7 + lane] > top_target)
				continue;

			/* De-interleave this lane's hash for the full check. */
			for (word = 0; word < 8; word++)
				candidate[word] = hash[SHA256D_WAYS * word + lane];
			if (!fulltest(candidate, ptarget))
				continue;

			*hashes_done = nonce - pdata[19] + 1;
			pdata[19] = data[SHA256D_WAYS * 3 + lane];
			return 1;
		}

		if (nonce >= max_nonce || work_restart[thr_id].restart)
			break;
	}

	*hashes_done = nonce - pdata[19] + 1;
	pdata[19] = nonce;
	return 0;
}