Precompute the first few SHA-256d rounds

pooler 2012-03-24 01:27:23 +01:00
parent e52982ab7f
commit 18a34a72ab
2 changed files with 161 additions and 91 deletions
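The optimization rests on one observation: when scanning nonces, the only word of the hashed data that changes is W[3], the nonce in the second 64-byte block of the block header (pdata[19]). Everything in the second transform that does not depend on W[3] can therefore be computed once per work unit instead of once per nonce. W[16] = s1(W[14]) + W[9] + s0(W[1]) + W[0] touches only words 0, 1, 9 and 14, so it and W[17] are fully nonce-independent; W[18] through W[31] reduce to partial sums missing only their nonce-derived terms; and rounds 0-2 consume only W[0..2], so the state after round 2 (the "prehash") is constant per work unit. The following standalone check demonstrates that this split reproduces the ordinary message schedule. It is a sketch with arbitrary input words, not code from this commit:

#include <assert.h>
#include <stdint.h>
#include <string.h>

#define ROTR(x, n) (((x) >> (n)) | ((x) << (32 - (n))))
#define s0(x) (ROTR(x, 7) ^ ROTR(x, 18) ^ ((x) >> 3))
#define s1(x) (ROTR(x, 17) ^ ROTR(x, 19) ^ ((x) >> 10))

int main(void)
{
    uint32_t W[32], V[32];
    int i;

    for (i = 0; i < 16; i++)
        W[i] = 0x9e3779b9u * (i + 1);           /* arbitrary fixed words */
    W[3] = 0xdeadbeefu;                         /* the nonce slot */

    /* Reference: the ordinary schedule extension. */
    memcpy(V, W, 16 * sizeof(uint32_t));
    for (i = 16; i < 32; i++)
        V[i] = s1(V[i-2]) + V[i-7] + s0(V[i-15]) + V[i-16];

    /* Once per work unit: every term that cannot reach W[3].
       W[16] and W[17] are complete, the rest are partial sums. */
    W[16] = s1(W[14]) + W[9]  + s0(W[1]) + W[0];
    W[17] = s1(W[15]) + W[10] + s0(W[2]) + W[1];
    W[18] = s1(W[16]) + W[11] + W[2];           /* missing s0(W[3]) */
    W[19] = s1(W[17]) + W[12] + s0(W[4]);       /* missing W[3] */
    for (i = 20; i < 25; i++)                   /* missing s1(W[i-2]) */
        W[i] = W[i-7] + s0(W[i-15]) + W[i-16];
    for (i = 25; i < 32; i++)                   /* missing s1(W[i-2]) + W[i-7] */
        W[i] = s0(W[i-15]) + W[i-16];

    /* Once per nonce: add back only the nonce-dependent terms. */
    W[18] += s0(W[3]);
    W[19] += W[3];
    for (i = 20; i < 25; i++)
        W[i] += s1(W[i-2]);
    for (i = 25; i < 32; i++)
        W[i] += s1(W[i-2]) + W[i-7];

    for (i = 16; i < 32; i++)
        assert(W[i] == V[i]);                   /* identical schedules */
    return 0;
}

sha256d_preextend in sha2.c below is the unrolled form of the "once per work unit" stage, and the fixup block at the top of sha256d() is the "once per nonce" stage.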

sha2-x64.S

@@ -128,8 +128,8 @@ _sha256_init_4way:
.macro sha256_sse2_extend_round i
- movdqa (\i-15)*16(%rcx), %xmm0
- movdqa (\i-14)*16(%rcx), %xmm4
+ movdqa (\i-15)*16(%rax), %xmm0
+ movdqa (\i-14)*16(%rax), %xmm4
movdqa %xmm0, %xmm2
movdqa %xmm4, %xmm6
psrld $3, %xmm0
@@ -153,10 +153,10 @@ _sha256_init_4way:
pxor %xmm2, %xmm0
pxor %xmm6, %xmm4
- movdqa (\i-2)*16(%rcx), %xmm3
- movdqa (\i-1)*16(%rcx), %xmm7
- paddd (\i-16)*16(%rcx), %xmm0
- paddd (\i-15)*16(%rcx), %xmm4
+ movdqa (\i-2)*16(%rax), %xmm3
+ movdqa (\i-1)*16(%rax), %xmm7
+ paddd (\i-16)*16(%rax), %xmm0
+ paddd (\i-15)*16(%rax), %xmm4
movdqa %xmm3, %xmm2
movdqa %xmm7, %xmm6
@@ -165,14 +165,14 @@ _sha256_init_4way:
movdqa %xmm3, %xmm1
movdqa %xmm7, %xmm5
- paddd (\i-7)*16(%rcx), %xmm0
+ paddd (\i-7)*16(%rax), %xmm0
pslld $13, %xmm2
pslld $13, %xmm6
psrld $7, %xmm1
psrld $7, %xmm5
- paddd (\i-6)*16(%rcx), %xmm4
+ paddd (\i-6)*16(%rax), %xmm4
pxor %xmm1, %xmm3
pxor %xmm5, %xmm7
@@ -189,14 +189,15 @@ _sha256_init_4way:
paddd %xmm3, %xmm0
paddd %xmm7, %xmm4
- movdqa %xmm0, \i*16(%rcx)
- movdqa %xmm4, (\i+1)*16(%rcx)
+ movdqa %xmm0, \i*16(%rax)
+ movdqa %xmm4, (\i+1)*16(%rax)
.endm
.text
.p2align 6
sha256_sse2_extend_loop:
sha256_sse2_extend_round 0
+ sha256_sse2_extend_loop_pre:
sha256_sse2_extend_round 2
sha256_sse2_extend_round 4
sha256_sse2_extend_round 6
@@ -293,6 +294,7 @@ sha256_sse2_main_loop:
sha256_sse2_main_round 0
sha256_sse2_main_round 1
sha256_sse2_main_round 2
+ sha256_sse2_main_loop_pre:
sha256_sse2_main_round 3
sha256_sse2_main_round 4
sha256_sse2_main_round 5
@@ -360,8 +362,8 @@ sha256_sse2_main_loop:
#if defined(USE_AVX)
.macro sha256_avx_extend_round i
- vmovdqa (\i-15)*16(%rcx), %xmm0
- vmovdqa (\i-14)*16(%rcx), %xmm4
+ vmovdqa (\i-15)*16(%rax), %xmm0
+ vmovdqa (\i-14)*16(%rax), %xmm4
vpslld $14, %xmm0, %xmm2
vpslld $14, %xmm4, %xmm6
vpsrld $3, %xmm0, %xmm0
@@ -381,22 +383,22 @@ sha256_sse2_main_loop:
vpxor %xmm2, %xmm0, %xmm0
vpxor %xmm6, %xmm4, %xmm4
- vmovdqa (\i-2)*16(%rcx), %xmm3
- vmovdqa (\i-1)*16(%rcx), %xmm7
- vpaddd (\i-16)*16(%rcx), %xmm0, %xmm0
- vpaddd (\i-15)*16(%rcx), %xmm4, %xmm4
+ vmovdqa (\i-2)*16(%rax), %xmm3
+ vmovdqa (\i-1)*16(%rax), %xmm7
+ vpaddd (\i-16)*16(%rax), %xmm0, %xmm0
+ vpaddd (\i-15)*16(%rax), %xmm4, %xmm4
vpslld $13, %xmm3, %xmm2
vpslld $13, %xmm7, %xmm6
vpsrld $10, %xmm3, %xmm3
vpsrld $10, %xmm7, %xmm7
- vpaddd (\i-7)*16(%rcx), %xmm0, %xmm0
+ vpaddd (\i-7)*16(%rax), %xmm0, %xmm0
vpsrld $7, %xmm3, %xmm1
vpsrld $7, %xmm7, %xmm5
- vpaddd (\i-6)*16(%rcx), %xmm4, %xmm4
+ vpaddd (\i-6)*16(%rax), %xmm4, %xmm4
vpxor %xmm1, %xmm3, %xmm3
vpxor %xmm5, %xmm7, %xmm7
@@ -413,14 +415,15 @@ sha256_sse2_main_loop:
vpaddd %xmm3, %xmm0, %xmm0
vpaddd %xmm7, %xmm4, %xmm4
- vmovdqa %xmm0, \i*16(%rcx)
- vmovdqa %xmm4, (\i+1)*16(%rcx)
+ vmovdqa %xmm0, \i*16(%rax)
+ vmovdqa %xmm4, (\i+1)*16(%rax)
.endm
.text
.p2align 6
sha256_avx_extend_loop:
sha256_avx_extend_round 0
+ sha256_avx_extend_loop_pre:
sha256_avx_extend_round 2
sha256_avx_extend_round 4
sha256_avx_extend_round 6
@@ -501,7 +504,11 @@ sha256_avx_extend_loop:
.text
.p2align 6
sha256_avx_main_loop:
- sha256_avx_main_quadround 0
+ sha256_avx_main_round 0, %xmm10, %xmm9, %xmm8, %xmm0, %xmm3, %xmm4, %xmm5, %xmm7
+ sha256_avx_main_round 1, %xmm9, %xmm8, %xmm0, %xmm10, %xmm4, %xmm5, %xmm7, %xmm3
+ sha256_avx_main_round 2, %xmm8, %xmm0, %xmm10, %xmm9, %xmm5, %xmm7, %xmm3, %xmm4
+ sha256_avx_main_loop_pre:
+ sha256_avx_main_round 3, %xmm0, %xmm10, %xmm9, %xmm8, %xmm7, %xmm3, %xmm4, %xmm5
sha256_avx_main_quadround 4
sha256_avx_main_quadround 8
sha256_avx_main_quadround 12
@@ -525,8 +532,8 @@ sha256_avx_main_loop:
#if defined(USE_XOP)
.macro sha256_xop_extend_round i
- vmovdqa (\i-15)*16(%rcx), %xmm0
- vmovdqa (\i-14)*16(%rcx), %xmm4
+ vmovdqa (\i-15)*16(%rax), %xmm0
+ vmovdqa (\i-14)*16(%rax), %xmm4
vprotd $25, %xmm0, %xmm1
vprotd $25, %xmm4, %xmm5
vprotd $14, %xmm0, %xmm2
@@ -538,10 +545,10 @@ sha256_avx_main_loop:
vpxor %xmm2, %xmm0, %xmm0
vpxor %xmm6, %xmm4, %xmm4
- vmovdqa (\i-2)*16(%rcx), %xmm3
- vmovdqa (\i-1)*16(%rcx), %xmm7
- vpaddd (\i-16)*16(%rcx), %xmm0, %xmm0
- vpaddd (\i-15)*16(%rcx), %xmm4, %xmm4
+ vmovdqa (\i-2)*16(%rax), %xmm3
+ vmovdqa (\i-1)*16(%rax), %xmm7
+ vpaddd (\i-16)*16(%rax), %xmm0, %xmm0
+ vpaddd (\i-15)*16(%rax), %xmm4, %xmm4
vprotd $15, %xmm3, %xmm1
vprotd $15, %xmm7, %xmm5
@@ -550,8 +557,8 @@ sha256_avx_main_loop:
vpxor %xmm1, %xmm2, %xmm2
vpxor %xmm5, %xmm6, %xmm6
- vpaddd (\i-7)*16(%rcx), %xmm0, %xmm0
- vpaddd (\i-6)*16(%rcx), %xmm4, %xmm4
+ vpaddd (\i-7)*16(%rax), %xmm0, %xmm0
+ vpaddd (\i-6)*16(%rax), %xmm4, %xmm4
vpsrld $10, %xmm3, %xmm3
vpsrld $10, %xmm7, %xmm7
@@ -560,14 +567,15 @@ sha256_avx_main_loop:
vpaddd %xmm3, %xmm0, %xmm0
vpaddd %xmm7, %xmm4, %xmm4
- vmovdqa %xmm0, \i*16(%rcx)
- vmovdqa %xmm4, (\i+1)*16(%rcx)
+ vmovdqa %xmm0, \i*16(%rax)
+ vmovdqa %xmm4, (\i+1)*16(%rax)
.endm
.text
.p2align 6
sha256_xop_extend_loop:
sha256_xop_extend_round 0
+ sha256_xop_extend_loop_pre:
sha256_xop_extend_round 2
sha256_xop_extend_round 4
sha256_xop_extend_round 6
@@ -636,7 +644,11 @@ sha256_xop_extend_loop:
.text
.p2align 6
sha256_xop_main_loop:
- sha256_xop_main_quadround 0
+ sha256_xop_main_round 0, %xmm10, %xmm9, %xmm8, %xmm0, %xmm3, %xmm4, %xmm5, %xmm7
+ sha256_xop_main_round 1, %xmm9, %xmm8, %xmm0, %xmm10, %xmm4, %xmm5, %xmm7, %xmm3
+ sha256_xop_main_round 2, %xmm8, %xmm0, %xmm10, %xmm9, %xmm5, %xmm7, %xmm3, %xmm4
+ sha256_xop_main_loop_pre:
+ sha256_xop_main_round 3, %xmm0, %xmm10, %xmm9, %xmm8, %xmm7, %xmm3, %xmm4, %xmm5
sha256_xop_main_quadround 4
sha256_xop_main_quadround 8
sha256_xop_main_quadround 12
@@ -810,11 +822,12 @@ sha256_transform_4way_sse2_main_loop:
jne sha256_transform_4way_sse2_main_loop
jmp sha256_transform_4way_finish
#if defined(USE_AVX)
.text
.p2align 6
sha256_transform_4way_core_avx:
- leaq 256(%rsp), %rcx
+ leaq 256(%rsp), %rax
call sha256_avx_extend_loop
movdqu 0(%rdi), %xmm7
movdqu 16(%rdi), %xmm5
@@ -830,11 +843,12 @@ sha256_transform_4way_core_avx:
jmp sha256_transform_4way_finish
#endif /* USE_AVX */
#if defined(USE_XOP)
.text
.p2align 6
sha256_transform_4way_core_xop:
- leaq 256(%rsp), %rcx
+ leaq 256(%rsp), %rax
call sha256_xop_extend_loop
movdqu 0(%rdi), %xmm7
movdqu 16(%rdi), %xmm5
@@ -1019,24 +1033,25 @@ sha256d_4way_sse2:
movq %rcx, %rdi
movq %rdx, %rsi
movq %r8, %rdx
+ movq %r9, %rcx
#endif
subq $1032, %rsp
- leaq 256(%rsi), %rcx
- call sha256_sse2_extend_loop
+ leaq 256(%rsi), %rax
+ call sha256_sse2_extend_loop_pre
- movdqa 0(%rdx), %xmm7
- movdqa 16(%rdx), %xmm5
- movdqa 32(%rdx), %xmm4
- movdqa 48(%rdx), %xmm3
- movdqa 64(%rdx), %xmm0
- movdqa 80(%rdx), %xmm8
- movdqa 96(%rdx), %xmm9
- movdqa 112(%rdx), %xmm10
+ movdqa 0(%rcx), %xmm3
+ movdqa 16(%rcx), %xmm0
+ movdqa 32(%rcx), %xmm8
+ movdqa 48(%rcx), %xmm9
+ movdqa 64(%rcx), %xmm10
+ movdqa 80(%rcx), %xmm7
+ movdqa 96(%rcx), %xmm5
+ movdqa 112(%rcx), %xmm4
movq %rsi, %rax
leaq sha256_4k(%rip), %rcx
- call sha256_sse2_main_loop
+ call sha256_sse2_main_loop_pre
paddd 0(%rdx), %xmm7
paddd 16(%rdx), %xmm5
@@ -1070,7 +1085,7 @@ sha256d_4way_sse2:
movdqa %xmm0, 224(%rsp)
movdqa %xmm1, 240(%rsp)
- leaq 256(%rsp), %rcx
+ leaq 256(%rsp), %rax
call sha256_sse2_extend_loop
movdqa sha256_4h+0(%rip), %xmm7
@@ -1134,24 +1149,25 @@ sha256d_4way_avx:
movq %rcx, %rdi
movq %rdx, %rsi
movq %r8, %rdx
+ movq %r9, %rcx
#endif
subq $1032, %rsp
- leaq 256(%rsi), %rcx
- call sha256_avx_extend_loop
+ leaq 256(%rsi), %rax
+ call sha256_avx_extend_loop_pre
- movdqa 0(%rdx), %xmm7
- movdqa 16(%rdx), %xmm5
- movdqa 32(%rdx), %xmm4
- movdqa 48(%rdx), %xmm3
- movdqa 64(%rdx), %xmm0
- movdqa 80(%rdx), %xmm8
- movdqa 96(%rdx), %xmm9
- movdqa 112(%rdx), %xmm10
+ movdqa 0(%rcx), %xmm7
+ movdqa 16(%rcx), %xmm8
+ movdqa 32(%rcx), %xmm9
+ movdqa 48(%rcx), %xmm10
+ movdqa 64(%rcx), %xmm0
+ movdqa 80(%rcx), %xmm5
+ movdqa 96(%rcx), %xmm4
+ movdqa 112(%rcx), %xmm3
movq %rsi, %rax
leaq sha256_4k(%rip), %rcx
- call sha256_avx_main_loop
+ call sha256_avx_main_loop_pre
paddd 0(%rdx), %xmm7
paddd 16(%rdx), %xmm5
@@ -1185,7 +1201,7 @@ sha256d_4way_avx:
movdqa %xmm0, 224(%rsp)
movdqa %xmm1, 240(%rsp)
- leaq 256(%rsp), %rcx
+ leaq 256(%rsp), %rax
call sha256_avx_extend_loop
movdqa sha256_4h+0(%rip), %xmm7
@@ -1251,24 +1267,25 @@ sha256d_4way_xop:
movq %rcx, %rdi
movq %rdx, %rsi
movq %r8, %rdx
+ movq %r9, %rcx
#endif
subq $1032, %rsp
- leaq 256(%rsi), %rcx
- call sha256_xop_extend_loop
+ leaq 256(%rsi), %rax
+ call sha256_xop_extend_loop_pre
- movdqa 0(%rdx), %xmm7
- movdqa 16(%rdx), %xmm5
- movdqa 32(%rdx), %xmm4
- movdqa 48(%rdx), %xmm3
- movdqa 64(%rdx), %xmm0
- movdqa 80(%rdx), %xmm8
- movdqa 96(%rdx), %xmm9
- movdqa 112(%rdx), %xmm10
+ movdqa 0(%rcx), %xmm7
+ movdqa 16(%rcx), %xmm8
+ movdqa 32(%rcx), %xmm9
+ movdqa 48(%rcx), %xmm10
+ movdqa 64(%rcx), %xmm0
+ movdqa 80(%rcx), %xmm5
+ movdqa 96(%rcx), %xmm4
+ movdqa 112(%rcx), %xmm3
movq %rsi, %rax
leaq sha256_4k(%rip), %rcx
- call sha256_xop_main_loop
+ call sha256_xop_main_loop_pre
paddd 0(%rdx), %xmm7
paddd 16(%rdx), %xmm5
@@ -1302,7 +1319,7 @@ sha256d_4way_xop:
movdqa %xmm0, 224(%rsp)
movdqa %xmm1, 240(%rsp)
- leaq 256(%rsp), %rcx
+ leaq 256(%rsp), %rax
call sha256_xop_extend_loop
movdqa sha256_4h+0(%rip), %xmm7
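The assembly-side changes repeat one pattern across the SSE2, AVX, and XOP paths. New `_pre` entry points reuse the existing round bodies: `*_extend_loop_pre` enters after extend round 0, because W[16] and W[17] arrive precomputed in the input, and `*_main_loop_pre` enters after main rounds 0-2, which are covered by the precomputed prehash state. The W pointer also moves from %rcx to %rax throughout, because the 4-way kernels gain a fourth argument that lands in %rcx; the prehash state is then loaded from %rcx with the xmm register assignment permuted to match how the eight SHA-256 working variables have rotated after three rounds. A short summary of the calling convention involved (the register mapping is standard ABI fact, not something added by this commit):

/* New 4-way prototype, as declared in the sha2.c hunk below: */
void sha256d_4way(uint32_t *hash, uint32_t *data,
                  const uint32_t *midstate, const uint32_t *prehash);
/*
 * SysV x86-64: hash=%rdi, data=%rsi, midstate=%rdx, prehash=%rcx
 * Win64:       hash=%rcx, data=%rdx, midstate=%r8,  prehash=%r9
 *
 * prehash occupying %rcx is why the W pointer moves to %rax, and why
 * the Win64 preamble gains "movq %r9, %rcx".
 */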

sha2.c

@@ -164,12 +164,6 @@ void sha256_transform(uint32_t *state, const uint32_t *block, int swap)
state[i] += S[i];
}
- #ifdef HAVE_SHA256_4WAY
- #define SHA256D_MAX_WAYS 4
- void sha256d_4way(uint32_t *hash, uint32_t *data, const uint32_t *midstate);
- #else
- #define SHA256D_MAX_WAYS 1
- #endif
static const uint32_t sha256d_hash1[16] = {
0x00000000, 0x00000000, 0x00000000, 0x00000000,
@@ -178,23 +172,64 @@ static const uint32_t sha256d_hash1[16] = {
0x00000000, 0x00000000, 0x00000000, 0x00000100
};
+ static inline void sha256d_preextend(uint32_t *W)
+ {
+ W[16] = s1(W[14]) + W[ 9] + s0(W[ 1]) + W[ 0];
+ W[17] = s1(W[15]) + W[10] + s0(W[ 2]) + W[ 1];
+ W[18] = s1(W[16]) + W[11] + W[ 2];
+ W[19] = s1(W[17]) + W[12] + s0(W[ 4]);
+ W[20] = W[13] + s0(W[ 5]) + W[ 4];
+ W[21] = W[14] + s0(W[ 6]) + W[ 5];
+ W[22] = W[15] + s0(W[ 7]) + W[ 6];
+ W[23] = W[16] + s0(W[ 8]) + W[ 7];
+ W[24] = W[17] + s0(W[ 9]) + W[ 8];
+ W[25] = s0(W[10]) + W[ 9];
+ W[26] = s0(W[11]) + W[10];
+ W[27] = s0(W[12]) + W[11];
+ W[28] = s0(W[13]) + W[12];
+ W[29] = s0(W[14]) + W[13];
+ W[30] = s0(W[15]) + W[14];
+ W[31] = s0(W[16]) + W[15];
+ }
+ static inline void sha256d_prehash(uint32_t *S, const uint32_t *W)
+ {
+ uint32_t t0, t1;
+ RNDr(S, W, 0);
+ RNDr(S, W, 1);
+ RNDr(S, W, 2);
+ }
static inline void sha256d(uint32_t *hash, uint32_t *W,
- const uint32_t *midstate)
+ const uint32_t *midstate, const uint32_t *prehash)
{
uint32_t S[64];
+ uint32_t E[14];
uint32_t t0, t1;
int i;
- for (i = 16; i < 64; i += 2) {
+ memcpy(E, W + 18, sizeof(E));
+ W[18] += s0(W[3]);
+ W[19] += W[3];
+ W[20] += s1(W[18]);
+ W[21] += s1(W[19]);
+ W[22] += s1(W[20]);
+ W[23] += s1(W[21]);
+ W[24] += s1(W[22]);
+ W[25] += s1(W[23]) + W[18];
+ W[26] += s1(W[24]) + W[19];
+ W[27] += s1(W[25]) + W[20];
+ W[28] += s1(W[26]) + W[21];
+ W[29] += s1(W[27]) + W[22];
+ W[30] += s1(W[28]) + W[23];
+ W[31] += s1(W[29]) + W[24];
+ for (i = 32; i < 64; i += 2) {
W[i] = s1(W[i - 2]) + W[i - 7] + s0(W[i - 15]) + W[i - 16];
W[i+1] = s1(W[i - 1]) + W[i - 6] + s0(W[i - 14]) + W[i - 15];
}
- memcpy(S, midstate, 32);
- RNDr(S, W, 0);
- RNDr(S, W, 1);
- RNDr(S, W, 2);
+ memcpy(S, prehash, 32);
RNDr(S, W, 3);
RNDr(S, W, 4);
RNDr(S, W, 5);
@@ -260,6 +295,8 @@ static inline void sha256d(uint32_t *hash, uint32_t *W,
for (i = 0; i < 8; i++)
S[i] += midstate[i];
+ memcpy(W + 18, E, sizeof(E));
memcpy(S + 8, sha256d_hash1 + 8, 32);
for (i = 16; i < 64; i += 2) {
S[i] = s1(S[i - 2]) + S[i - 7] + s0(S[i - 15]) + S[i - 16];
@@ -337,12 +374,21 @@ static inline void sha256d(uint32_t *hash, uint32_t *W,
hash[i] += sha256_h[i];
}
+ #ifdef HAVE_SHA256_4WAY
+ #define SHA256D_MAX_WAYS 4
+ void sha256d_4way(uint32_t *hash, uint32_t *data,
+ const uint32_t *midstate, const uint32_t *prehash);
+ #else
+ #define SHA256D_MAX_WAYS 1
+ #endif
int scanhash_sha256d(int thr_id, uint32_t *pdata, const uint32_t *ptarget,
uint32_t max_nonce, unsigned long *hashes_done)
{
uint32_t data[SHA256D_MAX_WAYS * 64] __attribute__((aligned(128)));
uint32_t hash[SHA256D_MAX_WAYS * 8] __attribute__((aligned(32)));
uint32_t midstate[SHA256D_MAX_WAYS * 8] __attribute__((aligned(32)));
+ uint32_t prehash[SHA256D_MAX_WAYS * 8] __attribute__((aligned(32)));
uint32_t n = pdata[19] - 1;
const uint32_t Htarg = ptarget[7];
#ifdef HAVE_SHA256_4WAY
@@ -352,15 +398,22 @@ int scanhash_sha256d(int thr_id, uint32_t *pdata, const uint32_t *ptarget,
#endif
int i, j;
- for (i = 15; i >= 0; i--)
+ memcpy(data, pdata + 16, 64);
+ sha256d_preextend(data);
+ for (i = 31; i >= 0; i--)
for (j = 0; j < ways; j++)
- data[i * ways + j] = pdata[16 + i];
+ data[i * ways + j] = data[i];
sha256_init(midstate);
sha256_transform(midstate, pdata, 0);
- for (i = 7; i >= 0; i--)
- for (j = 0; j < ways; j++)
+ memcpy(prehash, midstate, 32);
+ sha256d_prehash(prehash, pdata + 16);
+ for (i = 7; i >= 0; i--) {
+ for (j = 0; j < ways; j++) {
midstate[i * ways + j] = midstate[i];
+ prehash[i * ways + j] = prehash[i];
+ }
+ }
#ifdef HAVE_SHA256_4WAY
if (ways == 4)
@@ -368,7 +421,7 @@ int scanhash_sha256d(int thr_id, uint32_t *pdata, const uint32_t *ptarget,
for (i = 0; i < 4; i++)
data[4 * 3 + i] = ++n;
- sha256d_4way(hash, data, midstate);
+ sha256d_4way(hash, data, midstate, prehash);
for (i = 0; i < 4; i++) {
if (hash[4 * 7 + i] <= Htarg) {
@@ -386,12 +439,12 @@ int scanhash_sha256d(int thr_id, uint32_t *pdata, const uint32_t *ptarget,
else
#endif
do {
- data[3 + i] = ++n;
- sha256d(hash, data, midstate);
- if (hash[7 + i] <= Htarg) {
+ data[3] = ++n;
+ sha256d(hash, data, midstate, prehash);
+ if (hash[7] <= Htarg) {
if (fulltest(hash, ptarget)) {
*hashes_done = n - pdata[19] + 1;
- pdata[19] = data[3 + i];
+ pdata[19] = data[3];
return 1;
}
}
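Taken together, the per-nonce work in the scalar path shrinks to writing the nonce, patching the nonce-dependent schedule terms, and running the rounds from round 3 onward. One subtlety in sha256d(): W[18] through W[31] hold partial sums shared by every nonce, so they are stashed in E before the in-place fixup and restored on the way out, leaving the array clean for the next iteration. A condensed view of the resulting scan loop, paraphrased from the hunks above (a sketch, not a complete program):

memcpy(data, pdata + 16, 64);          /* second block of the header   */
sha256d_preextend(data);               /* once: nonce-free W terms     */
memcpy(prehash, midstate, 32);
sha256d_prehash(prehash, pdata + 16);  /* once: rounds 0, 1, 2         */
do {
    data[3] = ++n;                     /* only the nonce changes       */
    /* sha256d() stashes W[18..31] in E, adds the nonce-dependent
       terms, runs the transform from round 3 and the prehash state,
       then restores W[18..31] from E for the next nonce. */
    sha256d(hash, data, midstate, prehash);
} while (n < max_nonce);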