From 18a34a72ab1fa112a8370b788a27dd6078b991c3 Mon Sep 17 00:00:00 2001
From: pooler
Date: Sat, 24 Mar 2012 01:27:23 +0100
Subject: [PATCH] Precompute the first few SHA-256d rounds

---
 sha2-x64.S | 157 +++++++++++++++++++++++++++++------------------------
 sha2.c     |  95 +++++++++++++++++++++++++-------
 2 files changed, 161 insertions(+), 91 deletions(-)

diff --git a/sha2-x64.S b/sha2-x64.S
index 07441a1..c4f3655 100644
--- a/sha2-x64.S
+++ b/sha2-x64.S
@@ -128,8 +128,8 @@ _sha256_init_4way:
 
 
 .macro sha256_sse2_extend_round i
-	movdqa (\i-15)*16(%rcx), %xmm0
-	movdqa (\i-14)*16(%rcx), %xmm4
+	movdqa (\i-15)*16(%rax), %xmm0
+	movdqa (\i-14)*16(%rax), %xmm4
 	movdqa %xmm0, %xmm2
 	movdqa %xmm4, %xmm6
 	psrld $3, %xmm0
@@ -153,10 +153,10 @@ _sha256_init_4way:
 	pxor %xmm2, %xmm0
 	pxor %xmm6, %xmm4
 
-	movdqa (\i-2)*16(%rcx), %xmm3
-	movdqa (\i-1)*16(%rcx), %xmm7
-	paddd (\i-16)*16(%rcx), %xmm0
-	paddd (\i-15)*16(%rcx), %xmm4
+	movdqa (\i-2)*16(%rax), %xmm3
+	movdqa (\i-1)*16(%rax), %xmm7
+	paddd (\i-16)*16(%rax), %xmm0
+	paddd (\i-15)*16(%rax), %xmm4
 
 	movdqa %xmm3, %xmm2
 	movdqa %xmm7, %xmm6
@@ -165,14 +165,14 @@ _sha256_init_4way:
 	movdqa %xmm3, %xmm1
 	movdqa %xmm7, %xmm5
 
-	paddd (\i-7)*16(%rcx), %xmm0
+	paddd (\i-7)*16(%rax), %xmm0
 
 	pslld $13, %xmm2
 	pslld $13, %xmm6
 	psrld $7, %xmm1
 	psrld $7, %xmm5
 
-	paddd (\i-6)*16(%rcx), %xmm4
+	paddd (\i-6)*16(%rax), %xmm4
 
 	pxor %xmm1, %xmm3
 	pxor %xmm5, %xmm7
@@ -189,14 +189,15 @@ _sha256_init_4way:
 	paddd %xmm3, %xmm0
 	paddd %xmm7, %xmm4
 
-	movdqa %xmm0, \i*16(%rcx)
-	movdqa %xmm4, (\i+1)*16(%rcx)
+	movdqa %xmm0, \i*16(%rax)
+	movdqa %xmm4, (\i+1)*16(%rax)
 .endm
 
 .text
 	.p2align 6
 sha256_sse2_extend_loop:
 	sha256_sse2_extend_round 0
+sha256_sse2_extend_loop_pre:
 	sha256_sse2_extend_round 2
 	sha256_sse2_extend_round 4
 	sha256_sse2_extend_round 6
@@ -293,6 +294,7 @@ sha256_sse2_main_loop:
 	sha256_sse2_main_round 0
 	sha256_sse2_main_round 1
 	sha256_sse2_main_round 2
+sha256_sse2_main_loop_pre:
 	sha256_sse2_main_round 3
 	sha256_sse2_main_round 4
 	sha256_sse2_main_round 5
@@ -360,8 +362,8 @@ sha256_sse2_main_loop:
 #if defined(USE_AVX)
 
 .macro sha256_avx_extend_round i
-	vmovdqa (\i-15)*16(%rcx), %xmm0
-	vmovdqa (\i-14)*16(%rcx), %xmm4
+	vmovdqa (\i-15)*16(%rax), %xmm0
+	vmovdqa (\i-14)*16(%rax), %xmm4
 	vpslld $14, %xmm0, %xmm2
 	vpslld $14, %xmm4, %xmm6
 	vpsrld $3, %xmm0, %xmm0
@@ -381,22 +383,22 @@ sha256_sse2_main_loop:
 	vpxor %xmm2, %xmm0, %xmm0
 	vpxor %xmm6, %xmm4, %xmm4
 
-	vmovdqa (\i-2)*16(%rcx), %xmm3
-	vmovdqa (\i-1)*16(%rcx), %xmm7
-	vpaddd (\i-16)*16(%rcx), %xmm0, %xmm0
-	vpaddd (\i-15)*16(%rcx), %xmm4, %xmm4
+	vmovdqa (\i-2)*16(%rax), %xmm3
+	vmovdqa (\i-1)*16(%rax), %xmm7
+	vpaddd (\i-16)*16(%rax), %xmm0, %xmm0
+	vpaddd (\i-15)*16(%rax), %xmm4, %xmm4
 
 	vpslld $13, %xmm3, %xmm2
 	vpslld $13, %xmm7, %xmm6
 	vpsrld $10, %xmm3, %xmm3
 	vpsrld $10, %xmm7, %xmm7
 
-	vpaddd (\i-7)*16(%rcx), %xmm0, %xmm0
+	vpaddd (\i-7)*16(%rax), %xmm0, %xmm0
 
 	vpsrld $7, %xmm3, %xmm1
 	vpsrld $7, %xmm7, %xmm5
 
-	vpaddd (\i-6)*16(%rcx), %xmm4, %xmm4
+	vpaddd (\i-6)*16(%rax), %xmm4, %xmm4
 
 	vpxor %xmm1, %xmm3, %xmm3
 	vpxor %xmm5, %xmm7, %xmm7
@@ -413,14 +415,15 @@ sha256_sse2_main_loop:
 	vpaddd %xmm3, %xmm0, %xmm0
 	vpaddd %xmm7, %xmm4, %xmm4
 
-	vmovdqa %xmm0, \i*16(%rcx)
-	vmovdqa %xmm4, (\i+1)*16(%rcx)
+	vmovdqa %xmm0, \i*16(%rax)
+	vmovdqa %xmm4, (\i+1)*16(%rax)
 .endm
 
 .text
 	.p2align 6
 sha256_avx_extend_loop:
 	sha256_avx_extend_round 0
+sha256_avx_extend_loop_pre:
 	sha256_avx_extend_round 2
 	sha256_avx_extend_round 4
 	sha256_avx_extend_round 6
@@ -501,7 +504,11 @@ sha256_avx_extend_loop:
 .text
 	.p2align 6
 sha256_avx_main_loop:
-	sha256_avx_main_quadround 0
+	sha256_avx_main_round 0, %xmm10, %xmm9, %xmm8, %xmm0, %xmm3, %xmm4, %xmm5, %xmm7
+	sha256_avx_main_round 1, %xmm9, %xmm8, %xmm0, %xmm10, %xmm4, %xmm5, %xmm7, %xmm3
+	sha256_avx_main_round 2, %xmm8, %xmm0, %xmm10, %xmm9, %xmm5, %xmm7, %xmm3, %xmm4
+sha256_avx_main_loop_pre:
+	sha256_avx_main_round 3, %xmm0, %xmm10, %xmm9, %xmm8, %xmm7, %xmm3, %xmm4, %xmm5
 	sha256_avx_main_quadround 4
 	sha256_avx_main_quadround 8
 	sha256_avx_main_quadround 12
@@ -525,8 +532,8 @@ sha256_avx_main_loop:
 #if defined(USE_XOP)
 
 .macro sha256_xop_extend_round i
-	vmovdqa (\i-15)*16(%rcx), %xmm0
-	vmovdqa (\i-14)*16(%rcx), %xmm4
+	vmovdqa (\i-15)*16(%rax), %xmm0
+	vmovdqa (\i-14)*16(%rax), %xmm4
 	vprotd $25, %xmm0, %xmm1
 	vprotd $25, %xmm4, %xmm5
 	vprotd $14, %xmm0, %xmm2
@@ -538,10 +545,10 @@ sha256_avx_main_loop:
 	vpxor %xmm2, %xmm0, %xmm0
 	vpxor %xmm6, %xmm4, %xmm4
 
-	vmovdqa (\i-2)*16(%rcx), %xmm3
-	vmovdqa (\i-1)*16(%rcx), %xmm7
-	vpaddd (\i-16)*16(%rcx), %xmm0, %xmm0
-	vpaddd (\i-15)*16(%rcx), %xmm4, %xmm4
+	vmovdqa (\i-2)*16(%rax), %xmm3
+	vmovdqa (\i-1)*16(%rax), %xmm7
+	vpaddd (\i-16)*16(%rax), %xmm0, %xmm0
+	vpaddd (\i-15)*16(%rax), %xmm4, %xmm4
 
 	vprotd $15, %xmm3, %xmm1
 	vprotd $15, %xmm7, %xmm5
@@ -550,8 +557,8 @@ sha256_avx_main_loop:
 	vpxor %xmm1, %xmm2, %xmm2
 	vpxor %xmm5, %xmm6, %xmm6
 
-	vpaddd (\i-7)*16(%rcx), %xmm0, %xmm0
-	vpaddd (\i-6)*16(%rcx), %xmm4, %xmm4
+	vpaddd (\i-7)*16(%rax), %xmm0, %xmm0
+	vpaddd (\i-6)*16(%rax), %xmm4, %xmm4
 
 	vpsrld $10, %xmm3, %xmm3
 	vpsrld $10, %xmm7, %xmm7
@@ -560,14 +567,15 @@ sha256_avx_main_loop:
 	vpaddd %xmm3, %xmm0, %xmm0
 	vpaddd %xmm7, %xmm4, %xmm4
 
-	vmovdqa %xmm0, \i*16(%rcx)
-	vmovdqa %xmm4, (\i+1)*16(%rcx)
+	vmovdqa %xmm0, \i*16(%rax)
+	vmovdqa %xmm4, (\i+1)*16(%rax)
 .endm
 
 .text
 	.p2align 6
 sha256_xop_extend_loop:
 	sha256_xop_extend_round 0
+sha256_xop_extend_loop_pre:
 	sha256_xop_extend_round 2
 	sha256_xop_extend_round 4
 	sha256_xop_extend_round 6
@@ -636,7 +644,11 @@ sha256_xop_extend_loop:
 .text
 	.p2align 6
 sha256_xop_main_loop:
-	sha256_xop_main_quadround 0
+	sha256_xop_main_round 0, %xmm10, %xmm9, %xmm8, %xmm0, %xmm3, %xmm4, %xmm5, %xmm7
+	sha256_xop_main_round 1, %xmm9, %xmm8, %xmm0, %xmm10, %xmm4, %xmm5, %xmm7, %xmm3
+	sha256_xop_main_round 2, %xmm8, %xmm0, %xmm10, %xmm9, %xmm5, %xmm7, %xmm3, %xmm4
+sha256_xop_main_loop_pre:
+	sha256_xop_main_round 3, %xmm0, %xmm10, %xmm9, %xmm8, %xmm7, %xmm3, %xmm4, %xmm5
 	sha256_xop_main_quadround 4
 	sha256_xop_main_quadround 8
 	sha256_xop_main_quadround 12
@@ -810,11 +822,12 @@ sha256_transform_4way_sse2_main_loop:
 	jne sha256_transform_4way_sse2_main_loop
 	jmp sha256_transform_4way_finish
 
+
 #if defined(USE_AVX)
 .text
 	.p2align 6
 sha256_transform_4way_core_avx:
-	leaq 256(%rsp), %rcx
+	leaq 256(%rsp), %rax
 	call sha256_avx_extend_loop
 	movdqu 0(%rdi), %xmm7
 	movdqu 16(%rdi), %xmm5
@@ -830,11 +843,12 @@ sha256_transform_4way_core_avx:
 	jmp sha256_transform_4way_finish
 #endif /* USE_AVX */
 
+
 #if defined(USE_XOP)
 .text
 	.p2align 6
 sha256_transform_4way_core_xop:
-	leaq 256(%rsp), %rcx
+	leaq 256(%rsp), %rax
 	call sha256_xop_extend_loop
 	movdqu 0(%rdi), %xmm7
 	movdqu 16(%rdi), %xmm5
@@ -1019,24 +1033,25 @@ sha256d_4way_sse2:
 	movq %rcx, %rdi
 	movq %rdx, %rsi
 	movq %r8, %rdx
+	movq %r9, %rcx
 #endif
 	subq $1032, %rsp
 
-	leaq 256(%rsi), %rcx
-	call sha256_sse2_extend_loop
+	leaq 256(%rsi), %rax
+	call sha256_sse2_extend_loop_pre
 
-	movdqa 0(%rdx), %xmm7
-	movdqa 16(%rdx), %xmm5
-	movdqa 32(%rdx), %xmm4
-	movdqa 48(%rdx), %xmm3
-	movdqa 64(%rdx), %xmm0
-	movdqa 80(%rdx), %xmm8
-	movdqa 96(%rdx), %xmm9
-	movdqa 112(%rdx), %xmm10
+	movdqa 0(%rcx), %xmm3
+	movdqa 16(%rcx), %xmm0
+	movdqa 32(%rcx), %xmm8
+	movdqa 48(%rcx), %xmm9
+	movdqa 64(%rcx), %xmm10
+	movdqa 80(%rcx), %xmm7
+	movdqa 96(%rcx), %xmm5
+	movdqa 112(%rcx), %xmm4
 
 	movq %rsi, %rax
 	leaq sha256_4k(%rip), %rcx
-	call sha256_sse2_main_loop
+	call sha256_sse2_main_loop_pre
 
 	paddd 0(%rdx), %xmm7
 	paddd 16(%rdx), %xmm5
@@ -1070,7 +1085,7 @@ sha256d_4way_sse2:
 	movdqa %xmm0, 224(%rsp)
 	movdqa %xmm1, 240(%rsp)
 
-	leaq 256(%rsp), %rcx
+	leaq 256(%rsp), %rax
 	call sha256_sse2_extend_loop
 
 	movdqa sha256_4h+0(%rip), %xmm7
@@ -1134,24 +1149,25 @@ sha256d_4way_avx:
 	movq %rcx, %rdi
 	movq %rdx, %rsi
 	movq %r8, %rdx
+	movq %r9, %rcx
 #endif
 	subq $1032, %rsp
 
-	leaq 256(%rsi), %rcx
-	call sha256_avx_extend_loop
+	leaq 256(%rsi), %rax
+	call sha256_avx_extend_loop_pre
 
-	movdqa 0(%rdx), %xmm7
-	movdqa 16(%rdx), %xmm5
-	movdqa 32(%rdx), %xmm4
-	movdqa 48(%rdx), %xmm3
-	movdqa 64(%rdx), %xmm0
-	movdqa 80(%rdx), %xmm8
-	movdqa 96(%rdx), %xmm9
-	movdqa 112(%rdx), %xmm10
+	movdqa 0(%rcx), %xmm7
+	movdqa 16(%rcx), %xmm8
+	movdqa 32(%rcx), %xmm9
+	movdqa 48(%rcx), %xmm10
+	movdqa 64(%rcx), %xmm0
+	movdqa 80(%rcx), %xmm5
+	movdqa 96(%rcx), %xmm4
+	movdqa 112(%rcx), %xmm3
 
 	movq %rsi, %rax
 	leaq sha256_4k(%rip), %rcx
-	call sha256_avx_main_loop
+	call sha256_avx_main_loop_pre
 
 	paddd 0(%rdx), %xmm7
 	paddd 16(%rdx), %xmm5
@@ -1185,7 +1201,7 @@ sha256d_4way_avx:
 	movdqa %xmm0, 224(%rsp)
 	movdqa %xmm1, 240(%rsp)
 
-	leaq 256(%rsp), %rcx
+	leaq 256(%rsp), %rax
 	call sha256_avx_extend_loop
 
 	movdqa sha256_4h+0(%rip), %xmm7
@@ -1251,24 +1267,25 @@ sha256d_4way_xop:
 	movq %rcx, %rdi
 	movq %rdx, %rsi
 	movq %r8, %rdx
+	movq %r9, %rcx
 #endif
 	subq $1032, %rsp
 
-	leaq 256(%rsi), %rcx
-	call sha256_xop_extend_loop
+	leaq 256(%rsi), %rax
+	call sha256_xop_extend_loop_pre
 
-	movdqa 0(%rdx), %xmm7
-	movdqa 16(%rdx), %xmm5
-	movdqa 32(%rdx), %xmm4
-	movdqa 48(%rdx), %xmm3
-	movdqa 64(%rdx), %xmm0
-	movdqa 80(%rdx), %xmm8
-	movdqa 96(%rdx), %xmm9
-	movdqa 112(%rdx), %xmm10
+	movdqa 0(%rcx), %xmm7
+	movdqa 16(%rcx), %xmm8
+	movdqa 32(%rcx), %xmm9
+	movdqa 48(%rcx), %xmm10
+	movdqa 64(%rcx), %xmm0
+	movdqa 80(%rcx), %xmm5
+	movdqa 96(%rcx), %xmm4
+	movdqa 112(%rcx), %xmm3
 
 	movq %rsi, %rax
 	leaq sha256_4k(%rip), %rcx
-	call sha256_xop_main_loop
+	call sha256_xop_main_loop_pre
 
 	paddd 0(%rdx), %xmm7
 	paddd 16(%rdx), %xmm5
@@ -1302,7 +1319,7 @@ sha256d_4way_xop:
 	movdqa %xmm0, 224(%rsp)
 	movdqa %xmm1, 240(%rsp)
 
-	leaq 256(%rsp), %rcx
+	leaq 256(%rsp), %rax
 	call sha256_xop_extend_loop
 
 	movdqa sha256_4h+0(%rip), %xmm7
diff --git a/sha2.c b/sha2.c
index fd0a896..21500e3 100644
--- a/sha2.c
+++ b/sha2.c
@@ -164,12 +164,6 @@ void sha256_transform(uint32_t *state, const uint32_t *block, int swap)
 		state[i] += S[i];
 }
 
-#ifdef HAVE_SHA256_4WAY
-#define SHA256D_MAX_WAYS 4
-void sha256d_4way(uint32_t *hash, uint32_t *data, const uint32_t *midstate);
-#else
-#define SHA256D_MAX_WAYS 1
-#endif
 
 static const uint32_t sha256d_hash1[16] = {
 	0x00000000, 0x00000000, 0x00000000, 0x00000000,
@@ -178,23 +172,64 @@ static const uint32_t sha256d_hash1[16] = {
 	0x00000000, 0x00000000, 0x00000000, 0x00000100
 };
 
+static inline void sha256d_preextend(uint32_t *W)
+{
+	W[16] = s1(W[14]) + W[ 9] + s0(W[ 1]) + W[ 0];
+	W[17] = s1(W[15]) + W[10] + s0(W[ 2]) + W[ 1];
+	W[18] = s1(W[16]) + W[11] + W[ 2];
+	W[19] = s1(W[17]) + W[12] + s0(W[ 4]);
+	W[20] = W[13] + s0(W[ 5]) + W[ 4];
+	W[21] = W[14] + s0(W[ 6]) + W[ 5];
+	W[22] = W[15] + s0(W[ 7]) + W[ 6];
+	W[23] = W[16] + s0(W[ 8]) + W[ 7];
+	W[24] = W[17] + s0(W[ 9]) + W[ 8];
+	W[25] = s0(W[10]) + W[ 9];
+	W[26] = s0(W[11]) + W[10];
+	W[27] = s0(W[12]) + W[11];
+	W[28] = s0(W[13]) + W[12];
+	W[29] = s0(W[14]) + W[13];
+	W[30] = s0(W[15]) + W[14];
+	W[31] = s0(W[16]) + W[15];
+}
+
+static inline void sha256d_prehash(uint32_t *S, const uint32_t *W)
+{
+	uint32_t t0, t1;
+	RNDr(S, W, 0);
+	RNDr(S, W, 1);
+	RNDr(S, W, 2);
+}
+
 static inline void sha256d(uint32_t *hash, uint32_t *W,
-	const uint32_t *midstate)
+	const uint32_t *midstate, const uint32_t *prehash)
 {
 	uint32_t S[64];
+	uint32_t E[14];
 	uint32_t t0, t1;
 	int i;
 
-	for (i = 16; i < 64; i += 2) {
+	memcpy(E, W + 18, sizeof(E));
+	W[18] += s0(W[3]);
+	W[19] += W[3];
+	W[20] += s1(W[18]);
+	W[21] += s1(W[19]);
+	W[22] += s1(W[20]);
+	W[23] += s1(W[21]);
+	W[24] += s1(W[22]);
+	W[25] += s1(W[23]) + W[18];
+	W[26] += s1(W[24]) + W[19];
+	W[27] += s1(W[25]) + W[20];
+	W[28] += s1(W[26]) + W[21];
+	W[29] += s1(W[27]) + W[22];
+	W[30] += s1(W[28]) + W[23];
+	W[31] += s1(W[29]) + W[24];
+	for (i = 32; i < 64; i += 2) {
 		W[i] = s1(W[i - 2]) + W[i - 7] + s0(W[i - 15]) + W[i - 16];
 		W[i+1] = s1(W[i - 1]) + W[i - 6] + s0(W[i - 14]) + W[i - 15];
 	}
 
-	memcpy(S, midstate, 32);
+	memcpy(S, prehash, 32);
 
-	RNDr(S, W, 0);
-	RNDr(S, W, 1);
-	RNDr(S, W, 2);
 	RNDr(S, W, 3);
 	RNDr(S, W, 4);
 	RNDr(S, W, 5);
@@ -260,6 +295,8 @@ static inline void sha256d(uint32_t *hash, uint32_t *W,
 	for (i = 0; i < 8; i++)
 		S[i] += midstate[i];
 
+	memcpy(W + 18, E, sizeof(E));
+
 	memcpy(S + 8, sha256d_hash1 + 8, 32);
 	for (i = 16; i < 64; i += 2) {
 		S[i] = s1(S[i - 2]) + S[i - 7] + s0(S[i - 15]) + S[i - 16];
@@ -337,12 +374,21 @@ static inline void sha256d(uint32_t *hash, uint32_t *W,
 		hash[i] += sha256_h[i];
 }
 
+#ifdef HAVE_SHA256_4WAY
+#define SHA256D_MAX_WAYS 4
+void sha256d_4way(uint32_t *hash, uint32_t *data,
+	const uint32_t *midstate, const uint32_t *prehash);
+#else
+#define SHA256D_MAX_WAYS 1
+#endif
+
 int scanhash_sha256d(int thr_id, uint32_t *pdata, const uint32_t *ptarget,
 	uint32_t max_nonce, unsigned long *hashes_done)
 {
 	uint32_t data[SHA256D_MAX_WAYS * 64] __attribute__((aligned(128)));
 	uint32_t hash[SHA256D_MAX_WAYS * 8] __attribute__((aligned(32)));
 	uint32_t midstate[SHA256D_MAX_WAYS * 8] __attribute__((aligned(32)));
+	uint32_t prehash[SHA256D_MAX_WAYS * 8] __attribute__((aligned(32)));
 	uint32_t n = pdata[19] - 1;
 	const uint32_t Htarg = ptarget[7];
 #ifdef HAVE_SHA256_4WAY
@@ -352,15 +398,22 @@ int scanhash_sha256d(int thr_id, uint32_t *pdata, const uint32_t *ptarget,
 #endif
 	int i, j;
 
-	for (i = 15; i >= 0; i--)
+	memcpy(data, pdata + 16, 64);
+	sha256d_preextend(data);
+	for (i = 31; i >= 0; i--)
 		for (j = 0; j < ways; j++)
-			data[i * ways + j] = pdata[16 + i];
+			data[i * ways + j] = data[i];
 
 	sha256_init(midstate);
 	sha256_transform(midstate, pdata, 0);
-	for (i = 7; i >= 0; i--)
-		for (j = 0; j < ways; j++)
+	memcpy(prehash, midstate, 32);
+	sha256d_prehash(prehash, pdata + 16);
+	for (i = 7; i >= 0; i--) {
+		for (j = 0; j < ways; j++) {
 			midstate[i * ways + j] = midstate[i];
+			prehash[i * ways + j] = prehash[i];
+		}
+	}
 
 #ifdef HAVE_SHA256_4WAY
 	if (ways == 4)
@@ -368,7 +421,7 @@ int scanhash_sha256d(int thr_id, uint32_t *pdata, const uint32_t *ptarget,
 			for (i = 0; i < 4; i++)
 				data[4 * 3 + i] = ++n;
 
-			sha256d_4way(hash, data, midstate);
+			sha256d_4way(hash, data, midstate, prehash);
 
 			for (i = 0; i < 4; i++) {
 				if (hash[4 * 7 + i] <= Htarg) {
@@ -386,12 +439,12 @@ int scanhash_sha256d(int thr_id, uint32_t *pdata, const uint32_t *ptarget,
 	else
 #endif
 	do {
-		data[3 + i] = ++n;
-		sha256d(hash, data, midstate);
-		if (hash[7 + i] <= Htarg) {
+		data[3] = ++n;
+		sha256d(hash, data, midstate, prehash);
+		if (hash[7] <= Htarg) {
 			if (fulltest(hash, ptarget)) {
 				*hashes_done = n - pdata[19] + 1;
-				pdata[19] = data[3 + i];
+				pdata[19] = data[3];
 				return 1;
 			}
 		}
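
Note (illustrative, not part of the patch): during nonce scanning only W[3] of the second 64-byte block changes, so two pieces of work can be hoisted out of the per-nonce path. First, rounds 0-2 of the second transform depend only on W[0..2], so they are run once per work item into prehash (sha256d_prehash() in C; the new *_extend_loop_pre / *_main_loop_pre entry points let the assembly resume at round 3). Second, the terms of W[16..31] that do not involve W[3] are folded in once by sha256d_preextend(), and sha256d() only adds the W[3]-dependent terms per nonce. The standalone harness below is a rough self-check of that message-schedule split, assuming the standard SHA-256 sigma definitions used in sha2.c; the helper names extend_ref/preextend/fixup and the test data are invented for illustration only.

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

#define ROTR(x, n) (((x) >> (n)) | ((x) << (32 - (n))))
#define s0(x) (ROTR(x, 7) ^ ROTR(x, 18) ^ ((x) >> 3))
#define s1(x) (ROTR(x, 17) ^ ROTR(x, 19) ^ ((x) >> 10))

/* Reference: full expansion of W[16..31] from W[0..15]. */
static void extend_ref(uint32_t *W)
{
	int i;
	for (i = 16; i < 32; i++)
		W[i] = s1(W[i - 2]) + W[i - 7] + s0(W[i - 15]) + W[i - 16];
}

/* Nonce-independent part, mirroring sha256d_preextend(). */
static void preextend(uint32_t *W)
{
	W[16] = s1(W[14]) + W[ 9] + s0(W[ 1]) + W[ 0];
	W[17] = s1(W[15]) + W[10] + s0(W[ 2]) + W[ 1];
	W[18] = s1(W[16]) + W[11] + W[ 2];
	W[19] = s1(W[17]) + W[12] + s0(W[ 4]);
	W[20] = W[13] + s0(W[ 5]) + W[ 4];
	W[21] = W[14] + s0(W[ 6]) + W[ 5];
	W[22] = W[15] + s0(W[ 7]) + W[ 6];
	W[23] = W[16] + s0(W[ 8]) + W[ 7];
	W[24] = W[17] + s0(W[ 9]) + W[ 8];
	W[25] = s0(W[10]) + W[ 9];
	W[26] = s0(W[11]) + W[10];
	W[27] = s0(W[12]) + W[11];
	W[28] = s0(W[13]) + W[12];
	W[29] = s0(W[14]) + W[13];
	W[30] = s0(W[15]) + W[14];
	W[31] = s0(W[16]) + W[15];
}

/* Per-nonce fixups, mirroring the W[] updates at the top of sha256d(). */
static void fixup(uint32_t *W)
{
	W[18] += s0(W[3]);
	W[19] += W[3];
	W[20] += s1(W[18]);
	W[21] += s1(W[19]);
	W[22] += s1(W[20]);
	W[23] += s1(W[21]);
	W[24] += s1(W[22]);
	W[25] += s1(W[23]) + W[18];
	W[26] += s1(W[24]) + W[19];
	W[27] += s1(W[25]) + W[20];
	W[28] += s1(W[26]) + W[21];
	W[29] += s1(W[27]) + W[22];
	W[30] += s1(W[28]) + W[23];
	W[31] += s1(W[29]) + W[24];
}

int main(void)
{
	uint32_t A[32], B[32];
	int i;

	/* Arbitrary test block; W[3] plays the role of the nonce. */
	for (i = 0; i < 16; i++)
		A[i] = B[i] = (uint32_t)rand() * 2654435761u;

	extend_ref(A);  /* full schedule */
	preextend(B);   /* once per work item */
	fixup(B);       /* once per nonce */

	for (i = 16; i < 32; i++)
		if (A[i] != B[i]) {
			printf("mismatch at W[%d]\n", i);
			return 1;
		}
	printf("precomputed schedule matches reference\n");
	return 0;
}

Running this sketch should report that the two schedules agree. The 14-word E[] buffer added to the real sha256d() serves a related purpose: it saves W[18..31] before the per-nonce fixups and restores them afterwards, so the values produced by sha256d_preextend() survive into the next nonce, which reuses the same data buffer.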