diff --git a/Makefile.am b/Makefile.am
index b435083..5a6fd25 100644
--- a/Makefile.am
+++ b/Makefile.am
@@ -14,17 +14,8 @@ INCLUDES = $(PTHREAD_FLAGS) -fno-strict-aliasing $(JANSSON_INCLUDES)
 bin_PROGRAMS = minerd
 
 minerd_SOURCES = elist.h miner.h compat.h \
-		  cpu-miner.c util.c scrypt.c \
-		  sha256_generic.c sha256_4way.c sha256_via.c \
-		  sha256_cryptopp.c sha256_sse2_amd64.c
+		  cpu-miner.c util.c scrypt.c
 minerd_LDFLAGS = $(PTHREAD_FLAGS)
 minerd_LDADD = @LIBCURL@ @JANSSON_LIBS@ @PTHREAD_LIBS@
 minerd_CPPFLAGS = @LIBCURL_CPPFLAGS@
-
-if HAVE_x86_64
-if HAS_YASM
-SUBDIRS += x86_64
-minerd_LDADD += x86_64/libx8664.a
-AM_CFLAGS = -DHAS_YASM
-endif
-endif
diff --git a/configure.ac b/configure.ac
index 1747aef..3b0733f 100644
--- a/configure.ac
+++ b/configure.ac
@@ -54,42 +54,6 @@ else
 	JANSSON_LIBS=-ljansson
 fi
 
-dnl Find YASM
-has_yasm=false
-AC_PATH_PROG([YASM],[yasm],[false])
-if test "x$YASM" != "xfalse" ; then
-  AC_MSG_CHECKING([if yasm version is greater than 1.0.1])
-  yasmver=`yasm --version | head -1 | cut -d\ -f2`
-  yamajor=`echo $yasmver | cut -d. -f1`
-  yaminor=`echo $yasmver | cut -d. -f2`
-  yamini=`echo $yasmver | cut -d. -f3`
-  if test "$yamajor" -ge "1" ; then
-    if test "$yamajor" -eq "1" ; then
-      if test "$yaminor" -ge "0" ; then
-        if test "$yaminor" -eq "0"; then
-          if test "$yamini" -ge "1"; then
-            has_yasm=true
-          fi
-        else
-          has_yasm=true
-        fi
-      fi
-    fi
-  else
-    has_yasm=false
-  fi
-  if test "x$has_yasm" = "xtrue" ; then
-    AC_MSG_RESULT([yes])
-  else
-    AC_MSG_RESULT([no])
-  fi
-fi
-if test "x$has_yasm" = "xfalse" ; then
-  AC_MSG_NOTICE([yasm is required for the sse2_64 algorithm. It will be skipped.])
-fi
-
-AM_CONDITIONAL([HAS_YASM], [test x$has_yasm = xtrue])
-
 PKG_PROG_PKG_CONFIG()
 
 LIBCURL_CHECK_CONFIG(, 7.10.1, ,
@@ -103,7 +67,6 @@ AC_CONFIG_FILES([
 	Makefile
 	compat/Makefile
 	compat/jansson/Makefile
-	x86_64/Makefile
 	])
 AC_OUTPUT
diff --git a/cpu-miner.c b/cpu-miner.c
index 819d202..133183a 100644
--- a/cpu-miner.c
+++ b/cpu-miner.c
@@ -81,30 +81,10 @@ struct workio_cmd {
 };
 
 enum sha256_algos {
-	ALGO_C,			/* plain C */
-	ALGO_4WAY,		/* parallel SSE2 */
-	ALGO_VIA,		/* VIA padlock */
-	ALGO_CRYPTOPP,		/* Crypto++ (C) */
-	ALGO_CRYPTOPP_ASM32,	/* Crypto++ 32-bit assembly */
-	ALGO_SSE2_64,		/* SSE2 for x86_64 */
 	ALGO_SCRYPT,		/* scrypt(1024,1,1) */
 };
 
 static const char *algo_names[] = {
-	[ALGO_C]		= "c",
-#ifdef WANT_SSE2_4WAY
-	[ALGO_4WAY]		= "4way",
-#endif
-#ifdef WANT_VIA_PADLOCK
-	[ALGO_VIA]		= "via",
-#endif
-	[ALGO_CRYPTOPP]		= "cryptopp",
-#ifdef WANT_CRYPTOPP_ASM32
-	[ALGO_CRYPTOPP_ASM32]	= "cryptopp_asm32",
-#endif
-#ifdef WANT_X8664_SSE2
-	[ALGO_SSE2_64]		= "sse2_64",
-#endif
 	[ALGO_SCRYPT]		= "scrypt",
 };
@@ -119,11 +99,7 @@ static int opt_fail_pause = 30;
 int opt_scantime = 5;
 static json_t *opt_config;
 static const bool opt_time = true;
-#ifdef WANT_X8664_SSE2
 static enum sha256_algos opt_algo = ALGO_SCRYPT;
-#else
-static enum sha256_algos opt_algo = ALGO_SCRYPT;
-#endif
 static int opt_n_threads;
 static int num_processors;
 static char *rpc_url;
@@ -578,56 +554,6 @@ static void *miner_thread(void *userdata)
 
 		/* scan nonces for a proof-of-work hash */
 		switch (opt_algo) {
-		case ALGO_C:
-			rc = scanhash_c(thr_id, work.midstate, work.data + 64,
-					work.hash1, work.hash, work.target,
-					max_nonce, &hashes_done);
-			break;
-
-#ifdef WANT_X8664_SSE2
-		case ALGO_SSE2_64: {
-			unsigned int rc5 =
-				scanhash_sse2_64(thr_id, work.midstate, work.data + 64,
-						 work.hash1, work.hash,
-						 work.target,
-						 max_nonce, &hashes_done);
-			rc = (rc5 == -1) ? false : true;
-			}
-			break;
-#endif
-
-#ifdef WANT_SSE2_4WAY
-		case ALGO_4WAY: {
-			unsigned int rc4 =
-				ScanHash_4WaySSE2(thr_id, work.midstate, work.data + 64,
-						  work.hash1, work.hash,
-						  work.target,
-						  max_nonce, &hashes_done);
-			rc = (rc4 == -1) ? false : true;
-			}
-			break;
-#endif
-
-#ifdef WANT_VIA_PADLOCK
-		case ALGO_VIA:
-			rc = scanhash_via(thr_id, work.data, work.target,
-					  max_nonce, &hashes_done);
-			break;
-#endif
-		case ALGO_CRYPTOPP:
-			rc = scanhash_cryptopp(thr_id, work.midstate, work.data + 64,
-					       work.hash1, work.hash, work.target,
-					       max_nonce, &hashes_done);
-			break;
-
-#ifdef WANT_CRYPTOPP_ASM32
-		case ALGO_CRYPTOPP_ASM32:
-			rc = scanhash_asm32(thr_id, work.midstate, work.data + 64,
-					    work.hash1, work.hash, work.target,
-					    max_nonce, &hashes_done);
-			break;
-#endif
-
 		case ALGO_SCRYPT:
 			rc = scanhash_scrypt(thr_id, work.data, scratchbuf, work.target,
 					     max_nonce, &hashes_done);
diff --git a/miner.h b/miner.h
index a5bdfff..5d46209 100644
--- a/miner.h
+++ b/miner.h
@@ -37,18 +37,6 @@ void *alloca (size_t);
 #endif
 
-#ifdef __SSE2__
-#define WANT_SSE2_4WAY 1
-#endif
-
-#if defined(__i386__) || defined(__x86_64__)
-#define WANT_VIA_PADLOCK 1
-#endif
-
-#if defined(__x86_64__) && defined(__SSE2__) && defined(HAS_YASM)
-#define WANT_X8664_SSE2 1
-#endif
-
 #if ((__GNUC__ > 4) || (__GNUC__ == 4 && __GNUC_MINOR__ >= 3))
 #define WANT_BUILTIN_BSWAP
 #else
@@ -97,10 +85,6 @@ enum {
 #define likely(expr) (expr)
 #endif
 
-#if defined(__i386__)
-#define WANT_CRYPTOPP_ASM32
-#endif
-
 #ifndef ARRAY_SIZE
 #define ARRAY_SIZE(arr) (sizeof(arr) / sizeof((arr)[0]))
 #endif
@@ -143,36 +127,6 @@ extern json_t *json_rpc_call(CURL *curl, const char *url, const char *userpass,
 extern char *bin2hex(const unsigned char *p, size_t len);
 extern bool hex2bin(unsigned char *p, const char *hexstr, size_t len);
 
-extern unsigned int ScanHash_4WaySSE2(int, const unsigned char *pmidstate,
-	unsigned char *pdata, unsigned char *phash1, unsigned char *phash,
-	const unsigned char *ptarget,
-	uint32_t max_nonce, unsigned long *nHashesDone);
-
-extern unsigned int scanhash_sse2_amd64(int, const unsigned char *pmidstate,
-	unsigned char *pdata, unsigned char *phash1, unsigned char *phash,
-	const unsigned char *ptarget,
-	uint32_t max_nonce, unsigned long *nHashesDone);
-
-extern bool scanhash_via(int, unsigned char *data_inout,
-	const unsigned char *target,
-	uint32_t max_nonce, unsigned long *hashes_done);
-
-extern bool scanhash_c(int, const unsigned char *midstate, unsigned char *data,
-	unsigned char *hash1, unsigned char *hash,
-	const unsigned char *target,
-	uint32_t max_nonce, unsigned long *hashes_done);
-extern bool scanhash_cryptopp(int, const unsigned char *midstate,unsigned char *data,
-	unsigned char *hash1, unsigned char *hash,
-	const unsigned char *target,
-	uint32_t max_nonce, unsigned long *hashes_done);
-extern bool scanhash_asm32(int, const unsigned char *midstate,unsigned char *data,
-	unsigned char *hash1, unsigned char *hash,
-	const unsigned char *target,
-	uint32_t max_nonce, unsigned long *hashes_done);
-extern int scanhash_sse2_64(int, const unsigned char *pmidstate, unsigned char *pdata,
-	unsigned char *phash1, unsigned char *phash,
-	const unsigned char *ptarget,
-	uint32_t max_nonce, unsigned long *nHashesDone);
 extern int scanhash_scrypt(int, unsigned char *pdata, unsigned char *scratchbuf,
 	const unsigned char *ptarget,
 	uint32_t max_nonce, unsigned long *nHashesDone);
diff --git a/sha256_4way.c b/sha256_4way.c deleted file mode 100644 index 82dd6ca..0000000 --- a/sha256_4way.c +++ /dev/null @@
-1,488 +0,0 @@ -// Copyright (c) 2010 Satoshi Nakamoto -// Distributed under the MIT/X11 software license, see the accompanying -// file license.txt or http://www.opensource.org/licenses/mit-license.php. - -// tcatm's 4-way 128-bit SSE2 SHA-256 - -#include "cpuminer-config.h" -#include "miner.h" - -#ifdef WANT_SSE2_4WAY - -#include -#include - -#include -#include -#include - -#define NPAR 32 - -static void DoubleBlockSHA256(const void* pin, void* pout, const void* pinit, unsigned int hash[8][NPAR], const void* init2); - -static const unsigned int sha256_consts[] = { - 0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5, /* 0 */ - 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5, - 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3, /* 8 */ - 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174, - 0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc, /* 16 */ - 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da, - 0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7, /* 24 */ - 0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967, - 0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13, /* 32 */ - 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85, - 0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3, /* 40 */ - 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070, - 0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5, /* 48 */ - 0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3, - 0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208, /* 56 */ - 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2 -}; - - -static inline __m128i Ch(const __m128i b, const __m128i c, const __m128i d) { - return _mm_xor_si128(_mm_and_si128(b,c),_mm_andnot_si128(b,d)); -} - -static inline __m128i Maj(const __m128i b, const __m128i c, const __m128i d) { - return _mm_xor_si128(_mm_xor_si128(_mm_and_si128(b,c),_mm_and_si128(b,d)),_mm_and_si128(c,d)); -} - -static __attribute__((always_inline)) __m128i ROTR(__m128i x, const int n) { - return _mm_or_si128(_mm_srli_epi32(x, n),_mm_slli_epi32(x, 32 - n)); -} - -static __attribute__((always_inline)) __m128i SHR(__m128i x, const int n) { - return _mm_srli_epi32(x, n); -} - -/* SHA256 Functions */ -#define BIGSIGMA0_256(x) (_mm_xor_si128(_mm_xor_si128(ROTR((x), 2),ROTR((x), 13)),ROTR((x), 22))) -#define BIGSIGMA1_256(x) (_mm_xor_si128(_mm_xor_si128(ROTR((x), 6),ROTR((x), 11)),ROTR((x), 25))) - - -#define SIGMA0_256(x) (_mm_xor_si128(_mm_xor_si128(ROTR((x), 7),ROTR((x), 18)), SHR((x), 3 ))) -#define SIGMA1_256(x) (_mm_xor_si128(_mm_xor_si128(ROTR((x),17),ROTR((x), 19)), SHR((x), 10))) - -static inline unsigned int store32(const __m128i x, int i) { - union { unsigned int ret[4]; __m128i x; } box; - box.x = x; - return box.ret[i]; -} - -static inline void store_epi32(const __m128i x, unsigned int *x0, unsigned int *x1, unsigned int *x2, unsigned int *x3) { - union { unsigned int ret[4]; __m128i x; } box; - box.x = x; - *x0 = box.ret[3]; *x1 = box.ret[2]; *x2 = box.ret[1]; *x3 = box.ret[0]; -} - -#define add4(x0, x1, x2, x3) _mm_add_epi32(_mm_add_epi32(x0, x1),_mm_add_epi32( x2,x3)) -#define add5(x0, x1, x2, x3, x4) _mm_add_epi32(add4(x0, x1, x2, x3), x4) - -#define SHA256ROUND(a, b, c, d, e, f, g, h, i, w) \ - T1 = add5(h, BIGSIGMA1_256(e), Ch(e, f, g), _mm_set1_epi32(sha256_consts[i]), w); \ -d = _mm_add_epi32(d, T1); \ -h = _mm_add_epi32(T1, _mm_add_epi32(BIGSIGMA0_256(a), Maj(a, b, c))); - -static inline void dumpreg(__m128i x, char *msg) { - union { unsigned int ret[4]; __m128i x; } box; - box.x = x ; - printf("%s %08x %08x %08x %08x\n", msg, box.ret[0], box.ret[1], box.ret[2], box.ret[3]); -} - -#if 1 -#define dumpstate(i) printf("%s: %08x %08x 
%08x %08x %08x %08x %08x %08x %08x\n", \ - __func__, store32(w0, i), store32(a, i), store32(b, i), store32(c, i), store32(d, i), store32(e, i), store32(f, i), store32(g, i), store32(h, i)); -#else -#define dumpstate() -#endif - -static const unsigned int pSHA256InitState[8] = -{0x6a09e667, 0xbb67ae85, 0x3c6ef372, 0xa54ff53a, 0x510e527f, 0x9b05688c, 0x1f83d9ab, 0x5be0cd19}; - - -unsigned int ScanHash_4WaySSE2(int thr_id, const unsigned char *pmidstate, - unsigned char *pdata, - unsigned char *phash1, unsigned char *phash, - const unsigned char *ptarget, - uint32_t max_nonce, unsigned long *nHashesDone) -{ - unsigned int *nNonce_p = (unsigned int*)(pdata + 12); - unsigned int nonce = 0; - - work_restart[thr_id].restart = 0; - - for (;;) - { - unsigned int thash[9][NPAR] __attribute__((aligned(128))); - int j; - - nonce += NPAR; - *nNonce_p = nonce; - - DoubleBlockSHA256(pdata, phash1, pmidstate, thash, pSHA256InitState); - - for (j = 0; j < NPAR; j++) - { - if (unlikely(thash[7][j] == 0)) - { - int i; - - for (i = 0; i < 32/4; i++) - ((unsigned int*)phash)[i] = thash[i][j]; - - if (fulltest(phash, ptarget)) { - *nHashesDone = nonce; - *nNonce_p = nonce + j; - return nonce + j; - } - } - } - - if ((nonce >= max_nonce) || work_restart[thr_id].restart) - { - *nHashesDone = nonce; - return -1; - } - } -} - - -static void DoubleBlockSHA256(const void* pin, void* pad, const void *pre, unsigned int thash[9][NPAR], const void *init) -{ - unsigned int* In = (unsigned int*)pin; - unsigned int* Pad = (unsigned int*)pad; - unsigned int* hPre = (unsigned int*)pre; - unsigned int* hInit = (unsigned int*)init; - unsigned int /* i, j, */ k; - - /* vectors used in calculation */ - __m128i w0, w1, w2, w3, w4, w5, w6, w7; - __m128i w8, w9, w10, w11, w12, w13, w14, w15; - __m128i T1; - __m128i a, b, c, d, e, f, g, h; - __m128i nonce, preNonce; - - /* nonce offset for vector */ - __m128i offset = _mm_set_epi32(0x00000003, 0x00000002, 0x00000001, 0x00000000); - - - preNonce = _mm_add_epi32(_mm_set1_epi32(In[3]), offset); - - for(k = 0; k -#include -#include -#include -#include -#include "miner.h" - -typedef uint32_t word32; - -static word32 rotrFixed(word32 word, unsigned int shift) -{ - return (word >> shift) | (word << (32 - shift)); -} - -#define blk0(i) (W[i] = data[i]) - -static const word32 SHA256_K[64] = { - 0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5, - 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5, - 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3, - 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174, - 0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc, - 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da, - 0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7, - 0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967, - 0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13, - 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85, - 0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3, - 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070, - 0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5, - 0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3, - 0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208, - 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2 -}; - -#define blk2(i) (W[i&15]+=s1(W[(i-2)&15])+W[(i-7)&15]+s0(W[(i-15)&15])) - -#define Ch(x,y,z) (z^(x&(y^z))) -#define Maj(x,y,z) (y^((x^y)&(y^z))) - -#define a(i) T[(0-i)&7] -#define b(i) T[(1-i)&7] -#define c(i) T[(2-i)&7] -#define d(i) T[(3-i)&7] -#define e(i) T[(4-i)&7] -#define f(i) T[(5-i)&7] -#define g(i) T[(6-i)&7] -#define h(i) T[(7-i)&7] - -#define R(i) 
h(i)+=S1(e(i))+Ch(e(i),f(i),g(i))+SHA256_K[i+j]+(j?blk2(i):blk0(i));\ - d(i)+=h(i);h(i)+=S0(a(i))+Maj(a(i),b(i),c(i)) - -// for SHA256 -#define S0(x) (rotrFixed(x,2)^rotrFixed(x,13)^rotrFixed(x,22)) -#define S1(x) (rotrFixed(x,6)^rotrFixed(x,11)^rotrFixed(x,25)) -#define s0(x) (rotrFixed(x,7)^rotrFixed(x,18)^(x>>3)) -#define s1(x) (rotrFixed(x,17)^rotrFixed(x,19)^(x>>10)) - -static void SHA256_Transform(word32 *state, const word32 *data) -{ - word32 W[16] = { }; - word32 T[8]; - unsigned int j; - - /* Copy context->state[] to working vars */ - memcpy(T, state, sizeof(T)); - /* 64 operations, partially loop unrolled */ - for (j=0; j<64; j+=16) - { - R( 0); R( 1); R( 2); R( 3); - R( 4); R( 5); R( 6); R( 7); - R( 8); R( 9); R(10); R(11); - R(12); R(13); R(14); R(15); - } - /* Add the working vars back into context.state[] */ - state[0] += a(0); - state[1] += b(0); - state[2] += c(0); - state[3] += d(0); - state[4] += e(0); - state[5] += f(0); - state[6] += g(0); - state[7] += h(0); -} - -static void runhash(void *state, const void *input, const void *init) -{ - memcpy(state, init, 32); - SHA256_Transform(state, input); -} - -/* suspiciously similar to ScanHash* from bitcoin */ -bool scanhash_cryptopp(int thr_id, const unsigned char *midstate, - unsigned char *data, - unsigned char *hash1, unsigned char *hash, - const unsigned char *target, - uint32_t max_nonce, unsigned long *hashes_done) -{ - uint32_t *hash32 = (uint32_t *) hash; - uint32_t *nonce = (uint32_t *)(data + 12); - uint32_t n = 0; - unsigned long stat_ctr = 0; - - work_restart[thr_id].restart = 0; - - while (1) { - n++; - *nonce = n; - - runhash(hash1, data, midstate); - runhash(hash, hash1, sha256_init_state); - - stat_ctr++; - - if (unlikely((hash32[7] == 0) && fulltest(hash, target))) { - *hashes_done = stat_ctr; - return true; - } - - if ((n >= max_nonce) || work_restart[thr_id].restart) { - *hashes_done = stat_ctr; - return false; - } - } -} - -#if defined(WANT_CRYPTOPP_ASM32) - -#define CRYPTOPP_FASTCALL -#define CRYPTOPP_BOOL_X86 1 -#define CRYPTOPP_BOOL_X64 0 -#define CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE 0 - -#ifdef CRYPTOPP_GENERATE_X64_MASM - #define AS1(x) x*newline* - #define AS2(x, y) x, y*newline* - #define AS3(x, y, z) x, y, z*newline* - #define ASS(x, y, a, b, c, d) x, y, a*64+b*16+c*4+d*newline* - #define ASL(x) label##x:*newline* - #define ASJ(x, y, z) x label##y*newline* - #define ASC(x, y) x label##y*newline* - #define AS_HEX(y) 0##y##h -#elif defined(_MSC_VER) || defined(__BORLANDC__) - #define CRYPTOPP_MS_STYLE_INLINE_ASSEMBLY - #define AS1(x) __asm {x} - #define AS2(x, y) __asm {x, y} - #define AS3(x, y, z) __asm {x, y, z} - #define ASS(x, y, a, b, c, d) __asm {x, y, (a)*64+(b)*16+(c)*4+(d)} - #define ASL(x) __asm {label##x:} - #define ASJ(x, y, z) __asm {x label##y} - #define ASC(x, y) __asm {x label##y} - #define CRYPTOPP_NAKED __declspec(naked) - #define AS_HEX(y) 0x##y -#else - #define CRYPTOPP_GNU_STYLE_INLINE_ASSEMBLY - // define these in two steps to allow arguments to be expanded - #define GNU_AS1(x) #x ";" - #define GNU_AS2(x, y) #x ", " #y ";" - #define GNU_AS3(x, y, z) #x ", " #y ", " #z ";" - #define GNU_ASL(x) "\n" #x ":" - #define GNU_ASJ(x, y, z) #x " " #y #z ";" - #define AS1(x) GNU_AS1(x) - #define AS2(x, y) GNU_AS2(x, y) - #define AS3(x, y, z) GNU_AS3(x, y, z) - #define ASS(x, y, a, b, c, d) #x ", " #y ", " #a "*64+" #b "*16+" #c "*4+" #d ";" - #define ASL(x) GNU_ASL(x) - #define ASJ(x, y, z) GNU_ASJ(x, y, z) - #define ASC(x, y) #x " " #y ";" - #define CRYPTOPP_NAKED - #define AS_HEX(y) 0x##y 
-#endif - -#define IF0(y) -#define IF1(y) y - -#ifdef CRYPTOPP_GENERATE_X64_MASM -#define ASM_MOD(x, y) ((x) MOD (y)) -#define XMMWORD_PTR XMMWORD PTR -#else -// GNU assembler doesn't seem to have mod operator -#define ASM_MOD(x, y) ((x)-((x)/(y))*(y)) -// GAS 2.15 doesn't support XMMWORD PTR. it seems necessary only for MASM -#define XMMWORD_PTR -#endif - -#if CRYPTOPP_BOOL_X86 - #define AS_REG_1 ecx - #define AS_REG_2 edx - #define AS_REG_3 esi - #define AS_REG_4 edi - #define AS_REG_5 eax - #define AS_REG_6 ebx - #define AS_REG_7 ebp - #define AS_REG_1d ecx - #define AS_REG_2d edx - #define AS_REG_3d esi - #define AS_REG_4d edi - #define AS_REG_5d eax - #define AS_REG_6d ebx - #define AS_REG_7d ebp - #define WORD_SZ 4 - #define WORD_REG(x) e##x - #define WORD_PTR DWORD PTR - #define AS_PUSH_IF86(x) AS1(push e##x) - #define AS_POP_IF86(x) AS1(pop e##x) - #define AS_JCXZ jecxz -#elif CRYPTOPP_BOOL_X64 - #ifdef CRYPTOPP_GENERATE_X64_MASM - #define AS_REG_1 rcx - #define AS_REG_2 rdx - #define AS_REG_3 r8 - #define AS_REG_4 r9 - #define AS_REG_5 rax - #define AS_REG_6 r10 - #define AS_REG_7 r11 - #define AS_REG_1d ecx - #define AS_REG_2d edx - #define AS_REG_3d r8d - #define AS_REG_4d r9d - #define AS_REG_5d eax - #define AS_REG_6d r10d - #define AS_REG_7d r11d - #else - #define AS_REG_1 rdi - #define AS_REG_2 rsi - #define AS_REG_3 rdx - #define AS_REG_4 rcx - #define AS_REG_5 r8 - #define AS_REG_6 r9 - #define AS_REG_7 r10 - #define AS_REG_1d edi - #define AS_REG_2d esi - #define AS_REG_3d edx - #define AS_REG_4d ecx - #define AS_REG_5d r8d - #define AS_REG_6d r9d - #define AS_REG_7d r10d - #endif - #define WORD_SZ 8 - #define WORD_REG(x) r##x - #define WORD_PTR QWORD PTR - #define AS_PUSH_IF86(x) - #define AS_POP_IF86(x) - #define AS_JCXZ jrcxz -#endif - -static void CRYPTOPP_FASTCALL X86_SHA256_HashBlocks(word32 *state, const word32 *data, size_t len -#if defined(_MSC_VER) && (_MSC_VER == 1200) - , ... 
// VC60 workaround: prevent VC 6 from inlining this function -#endif - ) -{ -#if defined(_MSC_VER) && (_MSC_VER == 1200) - AS2(mov ecx, [state]) - AS2(mov edx, [data]) -#endif - - #define LOCALS_SIZE 8*4 + 16*4 + 4*WORD_SZ - #define H(i) [BASE+ASM_MOD(1024+7-(i),8)*4] - #define G(i) H(i+1) - #define F(i) H(i+2) - #define E(i) H(i+3) - #define D(i) H(i+4) - #define C(i) H(i+5) - #define B(i) H(i+6) - #define A(i) H(i+7) - #define Wt(i) BASE+8*4+ASM_MOD(1024+15-(i),16)*4 - #define Wt_2(i) Wt((i)-2) - #define Wt_15(i) Wt((i)-15) - #define Wt_7(i) Wt((i)-7) - #define K_END [BASE+8*4+16*4+0*WORD_SZ] - #define STATE_SAVE [BASE+8*4+16*4+1*WORD_SZ] - #define DATA_SAVE [BASE+8*4+16*4+2*WORD_SZ] - #define DATA_END [BASE+8*4+16*4+3*WORD_SZ] - #define Kt(i) WORD_REG(si)+(i)*4 -#if CRYPTOPP_BOOL_X86 - #define BASE esp+4 -#elif defined(__GNUC__) - #define BASE r8 -#else - #define BASE rsp -#endif - -#define RA0(i, edx, edi) \ - AS2( add edx, [Kt(i)] )\ - AS2( add edx, [Wt(i)] )\ - AS2( add edx, H(i) )\ - -#define RA1(i, edx, edi) - -#define RB0(i, edx, edi) - -#define RB1(i, edx, edi) \ - AS2( mov AS_REG_7d, [Wt_2(i)] )\ - AS2( mov edi, [Wt_15(i)])\ - AS2( mov ebx, AS_REG_7d )\ - AS2( shr AS_REG_7d, 10 )\ - AS2( ror ebx, 17 )\ - AS2( xor AS_REG_7d, ebx )\ - AS2( ror ebx, 2 )\ - AS2( xor ebx, AS_REG_7d )/* s1(W_t-2) */\ - AS2( add ebx, [Wt_7(i)])\ - AS2( mov AS_REG_7d, edi )\ - AS2( shr AS_REG_7d, 3 )\ - AS2( ror edi, 7 )\ - AS2( add ebx, [Wt(i)])/* s1(W_t-2) + W_t-7 + W_t-16 */\ - AS2( xor AS_REG_7d, edi )\ - AS2( add edx, [Kt(i)])\ - AS2( ror edi, 11 )\ - AS2( add edx, H(i) )\ - AS2( xor AS_REG_7d, edi )/* s0(W_t-15) */\ - AS2( add AS_REG_7d, ebx )/* W_t = s1(W_t-2) + W_t-7 + s0(W_t-15) W_t-16*/\ - AS2( mov [Wt(i)], AS_REG_7d)\ - AS2( add edx, AS_REG_7d )\ - -#define ROUND(i, r, eax, ecx, edi, edx)\ - /* in: edi = E */\ - /* unused: eax, ecx, temp: ebx, AS_REG_7d, out: edx = T1 */\ - AS2( mov edx, F(i) )\ - AS2( xor edx, G(i) )\ - AS2( and edx, edi )\ - AS2( xor edx, G(i) )/* Ch(E,F,G) = (G^(E&(F^G))) */\ - AS2( mov AS_REG_7d, edi )\ - AS2( ror edi, 6 )\ - AS2( ror AS_REG_7d, 25 )\ - RA##r(i, edx, edi )/* H + Wt + Kt + Ch(E,F,G) */\ - AS2( xor AS_REG_7d, edi )\ - AS2( ror edi, 5 )\ - AS2( xor AS_REG_7d, edi )/* S1(E) */\ - AS2( add edx, AS_REG_7d )/* T1 = S1(E) + Ch(E,F,G) + H + Wt + Kt */\ - RB##r(i, edx, edi )/* H + Wt + Kt + Ch(E,F,G) */\ - /* in: ecx = A, eax = B^C, edx = T1 */\ - /* unused: edx, temp: ebx, AS_REG_7d, out: eax = A, ecx = B^C, edx = E */\ - AS2( mov ebx, ecx )\ - AS2( xor ecx, B(i) )/* A^B */\ - AS2( and eax, ecx )\ - AS2( xor eax, B(i) )/* Maj(A,B,C) = B^((A^B)&(B^C) */\ - AS2( mov AS_REG_7d, ebx )\ - AS2( ror ebx, 2 )\ - AS2( add eax, edx )/* T1 + Maj(A,B,C) */\ - AS2( add edx, D(i) )\ - AS2( mov D(i), edx )\ - AS2( ror AS_REG_7d, 22 )\ - AS2( xor AS_REG_7d, ebx )\ - AS2( ror ebx, 11 )\ - AS2( xor AS_REG_7d, ebx )\ - AS2( add eax, AS_REG_7d )/* T1 + S0(A) + Maj(A,B,C) */\ - AS2( mov H(i), eax )\ - -#define SWAP_COPY(i) \ - AS2( mov WORD_REG(bx), [WORD_REG(dx)+i*WORD_SZ])\ - AS1( bswap WORD_REG(bx))\ - AS2( mov [Wt(i*(1+CRYPTOPP_BOOL_X64)+CRYPTOPP_BOOL_X64)], WORD_REG(bx)) - -#if defined(__GNUC__) - #if CRYPTOPP_BOOL_X64 - FixedSizeAlignedSecBlock workspace; - #endif - __asm__ __volatile__ - ( - #if CRYPTOPP_BOOL_X64 - "lea %4, %%r8;" - #endif - ".intel_syntax noprefix;" -#elif defined(CRYPTOPP_GENERATE_X64_MASM) - ALIGN 8 - X86_SHA256_HashBlocks PROC FRAME - rex_push_reg rsi - push_reg rdi - push_reg rbx - push_reg rbp - alloc_stack(LOCALS_SIZE+8) - .endprolog - mov rdi, r8 - lea 
rsi, [?SHA256_K@CryptoPP@@3QBIB + 48*4] -#endif - -#if CRYPTOPP_BOOL_X86 - #ifndef __GNUC__ - AS2( mov edi, [len]) - AS2( lea WORD_REG(si), [SHA256_K+48*4]) - #endif - #if !defined(_MSC_VER) || (_MSC_VER < 1400) - AS_PUSH_IF86(bx) - #endif - - AS_PUSH_IF86(bp) - AS2( mov ebx, esp) - AS2( and esp, -16) - AS2( sub WORD_REG(sp), LOCALS_SIZE) - AS_PUSH_IF86(bx) -#endif - AS2( mov STATE_SAVE, WORD_REG(cx)) - AS2( mov DATA_SAVE, WORD_REG(dx)) - AS2( lea WORD_REG(ax), [WORD_REG(di) + WORD_REG(dx)]) - AS2( mov DATA_END, WORD_REG(ax)) - AS2( mov K_END, WORD_REG(si)) - -#if CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE -#if CRYPTOPP_BOOL_X86 - AS2( test edi, 1) - ASJ( jnz, 2, f) - AS1( dec DWORD PTR K_END) -#endif - AS2( movdqa xmm0, XMMWORD_PTR [WORD_REG(cx)+0*16]) - AS2( movdqa xmm1, XMMWORD_PTR [WORD_REG(cx)+1*16]) -#endif - -#if CRYPTOPP_BOOL_X86 -#if CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE - ASJ( jmp, 0, f) -#endif - ASL(2) // non-SSE2 - AS2( mov esi, ecx) - AS2( lea edi, A(0)) - AS2( mov ecx, 8) - AS1( rep movsd) - AS2( mov esi, K_END) - ASJ( jmp, 3, f) -#endif - -#if CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE - ASL(0) - AS2( movdqa E(0), xmm1) - AS2( movdqa A(0), xmm0) -#endif -#if CRYPTOPP_BOOL_X86 - ASL(3) -#endif - AS2( sub WORD_REG(si), 48*4) - SWAP_COPY(0) SWAP_COPY(1) SWAP_COPY(2) SWAP_COPY(3) - SWAP_COPY(4) SWAP_COPY(5) SWAP_COPY(6) SWAP_COPY(7) -#if CRYPTOPP_BOOL_X86 - SWAP_COPY(8) SWAP_COPY(9) SWAP_COPY(10) SWAP_COPY(11) - SWAP_COPY(12) SWAP_COPY(13) SWAP_COPY(14) SWAP_COPY(15) -#endif - AS2( mov edi, E(0)) // E - AS2( mov eax, B(0)) // B - AS2( xor eax, C(0)) // B^C - AS2( mov ecx, A(0)) // A - - ROUND(0, 0, eax, ecx, edi, edx) - ROUND(1, 0, ecx, eax, edx, edi) - ROUND(2, 0, eax, ecx, edi, edx) - ROUND(3, 0, ecx, eax, edx, edi) - ROUND(4, 0, eax, ecx, edi, edx) - ROUND(5, 0, ecx, eax, edx, edi) - ROUND(6, 0, eax, ecx, edi, edx) - ROUND(7, 0, ecx, eax, edx, edi) - ROUND(8, 0, eax, ecx, edi, edx) - ROUND(9, 0, ecx, eax, edx, edi) - ROUND(10, 0, eax, ecx, edi, edx) - ROUND(11, 0, ecx, eax, edx, edi) - ROUND(12, 0, eax, ecx, edi, edx) - ROUND(13, 0, ecx, eax, edx, edi) - ROUND(14, 0, eax, ecx, edi, edx) - ROUND(15, 0, ecx, eax, edx, edi) - - ASL(1) - AS2(add WORD_REG(si), 4*16) - ROUND(0, 1, eax, ecx, edi, edx) - ROUND(1, 1, ecx, eax, edx, edi) - ROUND(2, 1, eax, ecx, edi, edx) - ROUND(3, 1, ecx, eax, edx, edi) - ROUND(4, 1, eax, ecx, edi, edx) - ROUND(5, 1, ecx, eax, edx, edi) - ROUND(6, 1, eax, ecx, edi, edx) - ROUND(7, 1, ecx, eax, edx, edi) - ROUND(8, 1, eax, ecx, edi, edx) - ROUND(9, 1, ecx, eax, edx, edi) - ROUND(10, 1, eax, ecx, edi, edx) - ROUND(11, 1, ecx, eax, edx, edi) - ROUND(12, 1, eax, ecx, edi, edx) - ROUND(13, 1, ecx, eax, edx, edi) - ROUND(14, 1, eax, ecx, edi, edx) - ROUND(15, 1, ecx, eax, edx, edi) - AS2( cmp WORD_REG(si), K_END) - ASJ( jb, 1, b) - - AS2( mov WORD_REG(dx), DATA_SAVE) - AS2( add WORD_REG(dx), 64) - AS2( mov AS_REG_7, STATE_SAVE) - AS2( mov DATA_SAVE, WORD_REG(dx)) - -#if CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE -#if CRYPTOPP_BOOL_X86 - AS2( test DWORD PTR K_END, 1) - ASJ( jz, 4, f) -#endif - AS2( movdqa xmm1, XMMWORD_PTR [AS_REG_7+1*16]) - AS2( movdqa xmm0, XMMWORD_PTR [AS_REG_7+0*16]) - AS2( paddd xmm1, E(0)) - AS2( paddd xmm0, A(0)) - AS2( movdqa [AS_REG_7+1*16], xmm1) - AS2( movdqa [AS_REG_7+0*16], xmm0) - AS2( cmp WORD_REG(dx), DATA_END) - ASJ( jb, 0, b) -#endif - -#if CRYPTOPP_BOOL_X86 -#if CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE - ASJ( jmp, 5, f) - ASL(4) // non-SSE2 -#endif - AS2( add [AS_REG_7+0*4], ecx) // A - AS2( add [AS_REG_7+4*4], edi) // E - AS2( mov eax, B(0)) - AS2( mov ebx, C(0)) 
- AS2( mov ecx, D(0)) - AS2( add [AS_REG_7+1*4], eax) - AS2( add [AS_REG_7+2*4], ebx) - AS2( add [AS_REG_7+3*4], ecx) - AS2( mov eax, F(0)) - AS2( mov ebx, G(0)) - AS2( mov ecx, H(0)) - AS2( add [AS_REG_7+5*4], eax) - AS2( add [AS_REG_7+6*4], ebx) - AS2( add [AS_REG_7+7*4], ecx) - AS2( mov ecx, AS_REG_7d) - AS2( cmp WORD_REG(dx), DATA_END) - ASJ( jb, 2, b) -#if CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE - ASL(5) -#endif -#endif - - AS_POP_IF86(sp) - AS_POP_IF86(bp) - #if !defined(_MSC_VER) || (_MSC_VER < 1400) - AS_POP_IF86(bx) - #endif - -#ifdef CRYPTOPP_GENERATE_X64_MASM - add rsp, LOCALS_SIZE+8 - pop rbp - pop rbx - pop rdi - pop rsi - ret - X86_SHA256_HashBlocks ENDP -#endif - -#ifdef __GNUC__ - ".att_syntax prefix;" - : - : "c" (state), "d" (data), "S" (SHA256_K+48), "D" (len) - #if CRYPTOPP_BOOL_X64 - , "m" (workspace[0]) - #endif - : "memory", "cc", "%eax" - #if CRYPTOPP_BOOL_X64 - , "%rbx", "%r8", "%r10" - #endif - ); -#endif -} - -static inline bool HasSSE2(void) { return false; } - -static void SHA256_Transform32(word32 *state, const word32 *data) -{ - word32 W[16]; - int i; - - for (i = 0; i < 16; i++) - W[i] = swab32(((word32 *)(data))[i]); - - X86_SHA256_HashBlocks(state, W, 16 * 4); -} - -static void runhash32(void *state, const void *input, const void *init) -{ - memcpy(state, init, 32); - SHA256_Transform32(state, input); -} - -/* suspiciously similar to ScanHash* from bitcoin */ -bool scanhash_asm32(int thr_id, const unsigned char *midstate, - unsigned char *data, - unsigned char *hash1, unsigned char *hash, - const unsigned char *target, - uint32_t max_nonce, unsigned long *hashes_done) -{ - uint32_t *hash32 = (uint32_t *) hash; - uint32_t *nonce = (uint32_t *)(data + 12); - uint32_t n = 0; - unsigned long stat_ctr = 0; - - work_restart[thr_id].restart = 0; - - while (1) { - n++; - *nonce = n; - - runhash32(hash1, data, midstate); - runhash32(hash, hash1, sha256_init_state); - - stat_ctr++; - - if (unlikely((hash32[7] == 0) && fulltest(hash, target))) { - fulltest(hash, target); - - *hashes_done = stat_ctr; - return true; - } - - if ((n >= max_nonce) || work_restart[thr_id].restart) { - *hashes_done = stat_ctr; - return false; - } - } -} - -#endif // #if defined(WANT_CRYPTOPP_ASM32) diff --git a/sha256_generic.c b/sha256_generic.c deleted file mode 100644 index 789b20e..0000000 --- a/sha256_generic.c +++ /dev/null @@ -1,274 +0,0 @@ -/* - * Cryptographic API. - * - * SHA-256, as specified in - * http://csrc.nist.gov/groups/STM/cavp/documents/shs/sha256-384-512.pdf - * - * SHA-256 code by Jean-Luc Cooke . - * - * Copyright (c) Jean-Luc Cooke - * Copyright (c) Andrew McDonald - * Copyright (c) 2002 James Morris - * SHA224 Support Copyright 2007 Intel Corporation - * - * This program is free software; you can redistribute it and/or modify it - * under the terms of the GNU General Public License as published by the Free - * Software Foundation; either version 2 of the License, or (at your option) - * any later version. 
- * - */ - -#include "cpuminer-config.h" - -#include -#include -#include -#include -#include "miner.h" - -typedef uint32_t u32; -typedef uint8_t u8; - -static inline u32 ror32(u32 word, unsigned int shift) -{ - return (word >> shift) | (word << (32 - shift)); -} - -static inline u32 Ch(u32 x, u32 y, u32 z) -{ - return z ^ (x & (y ^ z)); -} - -static inline u32 Maj(u32 x, u32 y, u32 z) -{ - return (x & y) | (z & (x | y)); -} - -#define e0(x) (ror32(x, 2) ^ ror32(x,13) ^ ror32(x,22)) -#define e1(x) (ror32(x, 6) ^ ror32(x,11) ^ ror32(x,25)) -#define s0(x) (ror32(x, 7) ^ ror32(x,18) ^ (x >> 3)) -#define s1(x) (ror32(x,17) ^ ror32(x,19) ^ (x >> 10)) - -static inline void LOAD_OP(int I, u32 *W, const u8 *input) -{ - /* byteswap is commented out, because bitcoin input - * is already big-endian - */ - W[I] = /* ntohl */ ( ((u32*)(input))[I] ); -} - -static inline void BLEND_OP(int I, u32 *W) -{ - W[I] = s1(W[I-2]) + W[I-7] + s0(W[I-15]) + W[I-16]; -} - -static void sha256_transform(u32 *state, const u8 *input) -{ - u32 a, b, c, d, e, f, g, h, t1, t2; - u32 W[64]; - int i; - - /* load the input */ - for (i = 0; i < 16; i++) - LOAD_OP(i, W, input); - - /* now blend */ - for (i = 16; i < 64; i++) - BLEND_OP(i, W); - - /* load the state into our registers */ - a=state[0]; b=state[1]; c=state[2]; d=state[3]; - e=state[4]; f=state[5]; g=state[6]; h=state[7]; - - /* now iterate */ - t1 = h + e1(e) + Ch(e,f,g) + 0x428a2f98 + W[ 0]; - t2 = e0(a) + Maj(a,b,c); d+=t1; h=t1+t2; - t1 = g + e1(d) + Ch(d,e,f) + 0x71374491 + W[ 1]; - t2 = e0(h) + Maj(h,a,b); c+=t1; g=t1+t2; - t1 = f + e1(c) + Ch(c,d,e) + 0xb5c0fbcf + W[ 2]; - t2 = e0(g) + Maj(g,h,a); b+=t1; f=t1+t2; - t1 = e + e1(b) + Ch(b,c,d) + 0xe9b5dba5 + W[ 3]; - t2 = e0(f) + Maj(f,g,h); a+=t1; e=t1+t2; - t1 = d + e1(a) + Ch(a,b,c) + 0x3956c25b + W[ 4]; - t2 = e0(e) + Maj(e,f,g); h+=t1; d=t1+t2; - t1 = c + e1(h) + Ch(h,a,b) + 0x59f111f1 + W[ 5]; - t2 = e0(d) + Maj(d,e,f); g+=t1; c=t1+t2; - t1 = b + e1(g) + Ch(g,h,a) + 0x923f82a4 + W[ 6]; - t2 = e0(c) + Maj(c,d,e); f+=t1; b=t1+t2; - t1 = a + e1(f) + Ch(f,g,h) + 0xab1c5ed5 + W[ 7]; - t2 = e0(b) + Maj(b,c,d); e+=t1; a=t1+t2; - - t1 = h + e1(e) + Ch(e,f,g) + 0xd807aa98 + W[ 8]; - t2 = e0(a) + Maj(a,b,c); d+=t1; h=t1+t2; - t1 = g + e1(d) + Ch(d,e,f) + 0x12835b01 + W[ 9]; - t2 = e0(h) + Maj(h,a,b); c+=t1; g=t1+t2; - t1 = f + e1(c) + Ch(c,d,e) + 0x243185be + W[10]; - t2 = e0(g) + Maj(g,h,a); b+=t1; f=t1+t2; - t1 = e + e1(b) + Ch(b,c,d) + 0x550c7dc3 + W[11]; - t2 = e0(f) + Maj(f,g,h); a+=t1; e=t1+t2; - t1 = d + e1(a) + Ch(a,b,c) + 0x72be5d74 + W[12]; - t2 = e0(e) + Maj(e,f,g); h+=t1; d=t1+t2; - t1 = c + e1(h) + Ch(h,a,b) + 0x80deb1fe + W[13]; - t2 = e0(d) + Maj(d,e,f); g+=t1; c=t1+t2; - t1 = b + e1(g) + Ch(g,h,a) + 0x9bdc06a7 + W[14]; - t2 = e0(c) + Maj(c,d,e); f+=t1; b=t1+t2; - t1 = a + e1(f) + Ch(f,g,h) + 0xc19bf174 + W[15]; - t2 = e0(b) + Maj(b,c,d); e+=t1; a=t1+t2; - - t1 = h + e1(e) + Ch(e,f,g) + 0xe49b69c1 + W[16]; - t2 = e0(a) + Maj(a,b,c); d+=t1; h=t1+t2; - t1 = g + e1(d) + Ch(d,e,f) + 0xefbe4786 + W[17]; - t2 = e0(h) + Maj(h,a,b); c+=t1; g=t1+t2; - t1 = f + e1(c) + Ch(c,d,e) + 0x0fc19dc6 + W[18]; - t2 = e0(g) + Maj(g,h,a); b+=t1; f=t1+t2; - t1 = e + e1(b) + Ch(b,c,d) + 0x240ca1cc + W[19]; - t2 = e0(f) + Maj(f,g,h); a+=t1; e=t1+t2; - t1 = d + e1(a) + Ch(a,b,c) + 0x2de92c6f + W[20]; - t2 = e0(e) + Maj(e,f,g); h+=t1; d=t1+t2; - t1 = c + e1(h) + Ch(h,a,b) + 0x4a7484aa + W[21]; - t2 = e0(d) + Maj(d,e,f); g+=t1; c=t1+t2; - t1 = b + e1(g) + Ch(g,h,a) + 0x5cb0a9dc + W[22]; - t2 = e0(c) + Maj(c,d,e); f+=t1; b=t1+t2; 
- t1 = a + e1(f) + Ch(f,g,h) + 0x76f988da + W[23]; - t2 = e0(b) + Maj(b,c,d); e+=t1; a=t1+t2; - - t1 = h + e1(e) + Ch(e,f,g) + 0x983e5152 + W[24]; - t2 = e0(a) + Maj(a,b,c); d+=t1; h=t1+t2; - t1 = g + e1(d) + Ch(d,e,f) + 0xa831c66d + W[25]; - t2 = e0(h) + Maj(h,a,b); c+=t1; g=t1+t2; - t1 = f + e1(c) + Ch(c,d,e) + 0xb00327c8 + W[26]; - t2 = e0(g) + Maj(g,h,a); b+=t1; f=t1+t2; - t1 = e + e1(b) + Ch(b,c,d) + 0xbf597fc7 + W[27]; - t2 = e0(f) + Maj(f,g,h); a+=t1; e=t1+t2; - t1 = d + e1(a) + Ch(a,b,c) + 0xc6e00bf3 + W[28]; - t2 = e0(e) + Maj(e,f,g); h+=t1; d=t1+t2; - t1 = c + e1(h) + Ch(h,a,b) + 0xd5a79147 + W[29]; - t2 = e0(d) + Maj(d,e,f); g+=t1; c=t1+t2; - t1 = b + e1(g) + Ch(g,h,a) + 0x06ca6351 + W[30]; - t2 = e0(c) + Maj(c,d,e); f+=t1; b=t1+t2; - t1 = a + e1(f) + Ch(f,g,h) + 0x14292967 + W[31]; - t2 = e0(b) + Maj(b,c,d); e+=t1; a=t1+t2; - - t1 = h + e1(e) + Ch(e,f,g) + 0x27b70a85 + W[32]; - t2 = e0(a) + Maj(a,b,c); d+=t1; h=t1+t2; - t1 = g + e1(d) + Ch(d,e,f) + 0x2e1b2138 + W[33]; - t2 = e0(h) + Maj(h,a,b); c+=t1; g=t1+t2; - t1 = f + e1(c) + Ch(c,d,e) + 0x4d2c6dfc + W[34]; - t2 = e0(g) + Maj(g,h,a); b+=t1; f=t1+t2; - t1 = e + e1(b) + Ch(b,c,d) + 0x53380d13 + W[35]; - t2 = e0(f) + Maj(f,g,h); a+=t1; e=t1+t2; - t1 = d + e1(a) + Ch(a,b,c) + 0x650a7354 + W[36]; - t2 = e0(e) + Maj(e,f,g); h+=t1; d=t1+t2; - t1 = c + e1(h) + Ch(h,a,b) + 0x766a0abb + W[37]; - t2 = e0(d) + Maj(d,e,f); g+=t1; c=t1+t2; - t1 = b + e1(g) + Ch(g,h,a) + 0x81c2c92e + W[38]; - t2 = e0(c) + Maj(c,d,e); f+=t1; b=t1+t2; - t1 = a + e1(f) + Ch(f,g,h) + 0x92722c85 + W[39]; - t2 = e0(b) + Maj(b,c,d); e+=t1; a=t1+t2; - - t1 = h + e1(e) + Ch(e,f,g) + 0xa2bfe8a1 + W[40]; - t2 = e0(a) + Maj(a,b,c); d+=t1; h=t1+t2; - t1 = g + e1(d) + Ch(d,e,f) + 0xa81a664b + W[41]; - t2 = e0(h) + Maj(h,a,b); c+=t1; g=t1+t2; - t1 = f + e1(c) + Ch(c,d,e) + 0xc24b8b70 + W[42]; - t2 = e0(g) + Maj(g,h,a); b+=t1; f=t1+t2; - t1 = e + e1(b) + Ch(b,c,d) + 0xc76c51a3 + W[43]; - t2 = e0(f) + Maj(f,g,h); a+=t1; e=t1+t2; - t1 = d + e1(a) + Ch(a,b,c) + 0xd192e819 + W[44]; - t2 = e0(e) + Maj(e,f,g); h+=t1; d=t1+t2; - t1 = c + e1(h) + Ch(h,a,b) + 0xd6990624 + W[45]; - t2 = e0(d) + Maj(d,e,f); g+=t1; c=t1+t2; - t1 = b + e1(g) + Ch(g,h,a) + 0xf40e3585 + W[46]; - t2 = e0(c) + Maj(c,d,e); f+=t1; b=t1+t2; - t1 = a + e1(f) + Ch(f,g,h) + 0x106aa070 + W[47]; - t2 = e0(b) + Maj(b,c,d); e+=t1; a=t1+t2; - - t1 = h + e1(e) + Ch(e,f,g) + 0x19a4c116 + W[48]; - t2 = e0(a) + Maj(a,b,c); d+=t1; h=t1+t2; - t1 = g + e1(d) + Ch(d,e,f) + 0x1e376c08 + W[49]; - t2 = e0(h) + Maj(h,a,b); c+=t1; g=t1+t2; - t1 = f + e1(c) + Ch(c,d,e) + 0x2748774c + W[50]; - t2 = e0(g) + Maj(g,h,a); b+=t1; f=t1+t2; - t1 = e + e1(b) + Ch(b,c,d) + 0x34b0bcb5 + W[51]; - t2 = e0(f) + Maj(f,g,h); a+=t1; e=t1+t2; - t1 = d + e1(a) + Ch(a,b,c) + 0x391c0cb3 + W[52]; - t2 = e0(e) + Maj(e,f,g); h+=t1; d=t1+t2; - t1 = c + e1(h) + Ch(h,a,b) + 0x4ed8aa4a + W[53]; - t2 = e0(d) + Maj(d,e,f); g+=t1; c=t1+t2; - t1 = b + e1(g) + Ch(g,h,a) + 0x5b9cca4f + W[54]; - t2 = e0(c) + Maj(c,d,e); f+=t1; b=t1+t2; - t1 = a + e1(f) + Ch(f,g,h) + 0x682e6ff3 + W[55]; - t2 = e0(b) + Maj(b,c,d); e+=t1; a=t1+t2; - - t1 = h + e1(e) + Ch(e,f,g) + 0x748f82ee + W[56]; - t2 = e0(a) + Maj(a,b,c); d+=t1; h=t1+t2; - t1 = g + e1(d) + Ch(d,e,f) + 0x78a5636f + W[57]; - t2 = e0(h) + Maj(h,a,b); c+=t1; g=t1+t2; - t1 = f + e1(c) + Ch(c,d,e) + 0x84c87814 + W[58]; - t2 = e0(g) + Maj(g,h,a); b+=t1; f=t1+t2; - t1 = e + e1(b) + Ch(b,c,d) + 0x8cc70208 + W[59]; - t2 = e0(f) + Maj(f,g,h); a+=t1; e=t1+t2; - t1 = d + e1(a) + Ch(a,b,c) + 0x90befffa + W[60]; - t2 = e0(e) + 
Maj(e,f,g); h+=t1; d=t1+t2; - t1 = c + e1(h) + Ch(h,a,b) + 0xa4506ceb + W[61]; - t2 = e0(d) + Maj(d,e,f); g+=t1; c=t1+t2; - t1 = b + e1(g) + Ch(g,h,a) + 0xbef9a3f7 + W[62]; - t2 = e0(c) + Maj(c,d,e); f+=t1; b=t1+t2; - t1 = a + e1(f) + Ch(f,g,h) + 0xc67178f2 + W[63]; - t2 = e0(b) + Maj(b,c,d); e+=t1; a=t1+t2; - - state[0] += a; state[1] += b; state[2] += c; state[3] += d; - state[4] += e; state[5] += f; state[6] += g; state[7] += h; - -#if 0 - /* clear any sensitive info... */ - a = b = c = d = e = f = g = h = t1 = t2 = 0; - memset(W, 0, 64 * sizeof(u32)); -#endif -} - -static void runhash(void *state, const void *input, const void *init) -{ - memcpy(state, init, 32); - sha256_transform(state, input); -} - -const uint32_t sha256_init_state[8] = { - 0x6a09e667, 0xbb67ae85, 0x3c6ef372, 0xa54ff53a, - 0x510e527f, 0x9b05688c, 0x1f83d9ab, 0x5be0cd19 -}; - -/* suspiciously similar to ScanHash* from bitcoin */ -bool scanhash_c(int thr_id, const unsigned char *midstate, unsigned char *data, - unsigned char *hash1, unsigned char *hash, - const unsigned char *target, - uint32_t max_nonce, unsigned long *hashes_done) -{ - uint32_t *hash32 = (uint32_t *) hash; - uint32_t *nonce = (uint32_t *)(data + 12); - uint32_t n = 0; - unsigned long stat_ctr = 0; - - work_restart[thr_id].restart = 0; - - while (1) { - n++; - *nonce = n; - - runhash(hash1, data, midstate); - runhash(hash, hash1, sha256_init_state); - - stat_ctr++; - - if (unlikely((hash32[7] == 0) && fulltest(hash, target))) { - *hashes_done = stat_ctr; - return true; - } - - if ((n >= max_nonce) || work_restart[thr_id].restart) { - *hashes_done = stat_ctr; - return false; - } - } -} - diff --git a/sha256_sse2_amd64.c b/sha256_sse2_amd64.c deleted file mode 100644 index 3aa154c..0000000 --- a/sha256_sse2_amd64.c +++ /dev/null @@ -1,133 +0,0 @@ -/* - * SHA-256 driver for ASM routine for x86_64 on Linux - * Copyright (c) Mark Crichton - * - * This program is free software; you can redistribute it and/or modify it - * under the terms of the GNU General Public License as published by the Free - * Software Foundation; either version 2 of the License, or (at your option) - * any later version. 
- * - */ - -#include "cpuminer-config.h" - -#include "miner.h" - -#ifdef WANT_X8664_SSE2 - -#include -#include - -#include -#include -#include - -extern void CalcSha256_x64(__m128i *res, __m128i *data, uint32_t init[8]); - -uint32_t g_sha256_k[] = { - 0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5, /* 0 */ - 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5, - 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3, /* 8 */ - 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174, - 0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc, /* 16 */ - 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da, - 0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7, /* 24 */ - 0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967, - 0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13, /* 32 */ - 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85, - 0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3, /* 40 */ - 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070, - 0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5, /* 48 */ - 0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3, - 0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208, /* 56 */ - 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2 -}; - - -uint32_t g_sha256_hinit[8] = -{0x6a09e667, 0xbb67ae85, 0x3c6ef372, 0xa54ff53a, 0x510e527f, 0x9b05688c, 0x1f83d9ab, 0x5be0cd19}; - -__m128i g_4sha256_k[64]; - -int scanhash_sse2_64(int thr_id, const unsigned char *pmidstate, - unsigned char *pdata, - unsigned char *phash1, unsigned char *phash, - const unsigned char *ptarget, - uint32_t max_nonce, unsigned long *nHashesDone) -{ - uint32_t *nNonce_p = (uint32_t *)(pdata + 12); - uint32_t nonce = 0; - uint32_t m_midstate[8], m_w[16], m_w1[16]; - __m128i m_4w[64], m_4hash[64], m_4hash1[64]; - __m128i offset; - int i; - - work_restart[thr_id].restart = 0; - - /* For debugging */ - union { - __m128i m; - uint32_t i[4]; - } mi; - - /* Message expansion */ - memcpy(m_midstate, pmidstate, sizeof(m_midstate)); - memcpy(m_w, pdata, sizeof(m_w)); /* The 2nd half of the data */ - memcpy(m_w1, phash1, sizeof(m_w1)); - memset(m_4hash, 0, sizeof(m_4hash)); - - /* Transmongrify */ - for (i = 0; i < 16; i++) - m_4w[i] = _mm_set1_epi32(m_w[i]); - - for (i = 0; i < 16; i++) - m_4hash1[i] = _mm_set1_epi32(m_w1[i]); - - for (i = 0; i < 64; i++) - g_4sha256_k[i] = _mm_set1_epi32(g_sha256_k[i]); - - offset = _mm_set_epi32(0x3, 0x2, 0x1, 0x0); - - for (;;) - { - int j; - - m_4w[3] = _mm_add_epi32(offset, _mm_set1_epi32(nonce)); - - /* Some optimization can be done here W.R.T. precalculating some hash */ - CalcSha256_x64(m_4hash1, m_4w, m_midstate); - CalcSha256_x64(m_4hash, m_4hash1, g_sha256_hinit); - - for (j = 0; j < 4; j++) { - mi.m = m_4hash[7]; - if (unlikely(mi.i[j] == 0)) - break; - } - - /* If j = true, we found a hit...so check it */ - /* Use the C version for a check... 
*/ - if (unlikely(j != 4)) { - for (i = 0; i < 8; i++) { - mi.m = m_4hash[i]; - *(uint32_t *)&(phash)[i*4] = mi.i[j]; - } - - if (fulltest(phash, ptarget)) { - *nHashesDone = nonce; - *nNonce_p = nonce + j; - return nonce + j; - } - } - - nonce += 4; - - if (unlikely((nonce >= max_nonce) || work_restart[thr_id].restart)) - { - *nHashesDone = nonce; - return -1; - } - } -} - -#endif /* WANT_X8664_SSE2 */ - diff --git a/sha256_via.c b/sha256_via.c deleted file mode 100644 index 1f0596c..0000000 --- a/sha256_via.c +++ /dev/null @@ -1,85 +0,0 @@ - -#include "cpuminer-config.h" - -#include -#include -#include -#include -#include -#include "miner.h" - -#ifdef WANT_VIA_PADLOCK - -static void via_sha256(void *hash, void *buf, unsigned len) -{ - unsigned stat = 0; - asm volatile(".byte 0xf3, 0x0f, 0xa6, 0xd0" - :"+S"(buf), "+a"(stat) - :"c"(len), "D" (hash) - :"memory"); -} - -bool scanhash_via(int thr_id, unsigned char *data_inout, - const unsigned char *target, - uint32_t max_nonce, unsigned long *hashes_done) -{ - unsigned char data[128] __attribute__((aligned(128))); - unsigned char tmp_hash[32] __attribute__((aligned(128))); - unsigned char tmp_hash1[32] __attribute__((aligned(128))); - uint32_t *data32 = (uint32_t *) data; - uint32_t *hash32 = (uint32_t *) tmp_hash; - uint32_t *nonce = (uint32_t *)(data + 64 + 12); - uint32_t n = 0; - unsigned long stat_ctr = 0; - int i; - - work_restart[thr_id].restart = 0; - - /* bitcoin gives us big endian input, but via wants LE, - * so we reverse the swapping bitcoin has already done (extra work) - * in order to permit the hardware to swap everything - * back to BE again (extra work). - */ - for (i = 0; i < 128/4; i++) - data32[i] = swab32(((uint32_t *)data_inout)[i]); - - while (1) { - n++; - *nonce = n; - - /* first SHA256 transform */ - memcpy(tmp_hash1, sha256_init_state, 32); - via_sha256(tmp_hash1, data, 80); /* or maybe 128? 
*/ - - for (i = 0; i < 32/4; i++) - ((uint32_t *)tmp_hash1)[i] = - swab32(((uint32_t *)tmp_hash1)[i]); - - /* second SHA256 transform */ - memcpy(tmp_hash, sha256_init_state, 32); - via_sha256(tmp_hash, tmp_hash1, 32); - - stat_ctr++; - - if (unlikely((hash32[7] == 0) && fulltest(tmp_hash, target))) { - /* swap nonce'd data back into original storage area; - * TODO: only swap back the nonce, rather than all data - */ - for (i = 0; i < 128/4; i++) { - uint32_t *dout32 = (uint32_t *) data_inout; - dout32[i] = swab32(data32[i]); - } - - *hashes_done = stat_ctr; - return true; - } - - if ((n >= max_nonce) || work_restart[thr_id].restart) { - *hashes_done = stat_ctr; - return false; - } - } -} - -#endif /* WANT_VIA_PADLOCK */ - diff --git a/x86_64/.gitignore b/x86_64/.gitignore deleted file mode 100644 index a966652..0000000 --- a/x86_64/.gitignore +++ /dev/null @@ -1 +0,0 @@ -libx8664.a diff --git a/x86_64/Makefile.am b/x86_64/Makefile.am deleted file mode 100644 index c74ddd2..0000000 --- a/x86_64/Makefile.am +++ /dev/null @@ -1,8 +0,0 @@ -noinst_LIBRARIES = libx8664.a - -SUFFIXES = .asm - -libx8664_a_SOURCES = sha256_xmm_amd64.asm - -.asm.o: - $(YASM) -f elf64 $< diff --git a/x86_64/sha256_xmm_amd64.asm b/x86_64/sha256_xmm_amd64.asm deleted file mode 100644 index 4fa0ea9..0000000 --- a/x86_64/sha256_xmm_amd64.asm +++ /dev/null @@ -1,219 +0,0 @@ -;; SHA-256 for X86-64 for Linux, based off of: - -; (c) Ufasoft 2011 http://ufasoft.com mailto:support@ufasoft.com -; Version 2011 -; This software is Public Domain - -; SHA-256 CPU SSE cruncher for Bitcoin Miner - -ALIGN 32 -BITS 64 - -%define hash rdi -%define data rsi -%define init rdx - -extern g_4sha256_k - -global CalcSha256_x64 -; CalcSha256 hash(rdi), data(rsi), init(rdx) -CalcSha256_x64: - - push rbx - -LAB_NEXT_NONCE: - mov r11, data -; mov rax, pnonce -; mov eax, [rax] -; mov [rbx+3*16], eax -; inc eax -; mov [rbx+3*16+4], eax -; inc eax -; mov [rbx+3*16+8], eax -; inc eax -; mov [rbx+3*16+12], eax - - mov rcx, 64*4 ;rcx is # of SHA-2 rounds - mov rax, 16*4 ;rax is where we expand to - -LAB_SHA: - push rcx - lea rcx, qword [r11+rcx*4] - lea r11, qword [r11+rax*4] -LAB_CALC: - movdqa xmm0, [r11-15*16] - movdqa xmm2, xmm0 ; (Rotr32(w_15, 7) ^ Rotr32(w_15, 18) ^ (w_15 >> 3)) - psrld xmm0, 3 - movdqa xmm1, xmm0 - pslld xmm2, 14 - psrld xmm1, 4 - pxor xmm0, xmm1 - pxor xmm0, xmm2 - pslld xmm2, 11 - psrld xmm1, 11 - pxor xmm0, xmm1 - pxor xmm0, xmm2 - - paddd xmm0, [r11-16*16] - - movdqa xmm3, [r11-2*16] - movdqa xmm2, xmm3 ; (Rotr32(w_2, 17) ^ Rotr32(w_2, 19) ^ (w_2 >> 10)) - psrld xmm3, 10 - movdqa xmm1, xmm3 - pslld xmm2, 13 - psrld xmm1, 7 - pxor xmm3, xmm1 - pxor xmm3, xmm2 - pslld xmm2, 2 - psrld xmm1, 2 - pxor xmm3, xmm1 - pxor xmm3, xmm2 - paddd xmm0, xmm3 - - paddd xmm0, [r11-7*16] - movdqa [r11], xmm0 - add r11, 16 - cmp r11, rcx - jb LAB_CALC - pop rcx - - mov rax, 0 - -; Load the init values of the message into the hash. 
- - movd xmm0, dword [rdx+4*4] ; xmm0 == e - pshufd xmm0, xmm0, 0 - movd xmm3, dword [rdx+3*4] ; xmm3 == d - pshufd xmm3, xmm3, 0 - movd xmm4, dword [rdx+2*4] ; xmm4 == c - pshufd xmm4, xmm4, 0 - movd xmm5, dword [rdx+1*4] ; xmm5 == b - pshufd xmm5, xmm5, 0 - movd xmm7, dword [rdx+0*4] ; xmm7 == a - pshufd xmm7, xmm7, 0 - movd xmm8, dword [rdx+5*4] ; xmm8 == f - pshufd xmm8, xmm8, 0 - movd xmm9, dword [rdx+6*4] ; xmm9 == g - pshufd xmm9, xmm9, 0 - movd xmm10, dword [rdx+7*4] ; xmm10 == h - pshufd xmm10, xmm10, 0 - -LAB_LOOP: - -;; T t1 = h + (Rotr32(e, 6) ^ Rotr32(e, 11) ^ Rotr32(e, 25)) + ((e & f) ^ AndNot(e, g)) + Expand32(g_sha256_k[j]) + w[j] - - movdqa xmm6, [rsi+rax*4] - paddd xmm6, g_4sha256_k[rax*4] - add rax, 4 - - paddd xmm6, xmm10 ; +h - - movdqa xmm1, xmm0 - movdqa xmm2, xmm9 - pandn xmm1, xmm2 ; ~e & g - - movdqa xmm10, xmm2 ; h = g - movdqa xmm2, xmm8 ; f - movdqa xmm9, xmm2 ; g = f - - pand xmm2, xmm0 ; e & f - pxor xmm1, xmm2 ; (e & f) ^ (~e & g) - movdqa xmm8, xmm0 ; f = e - - paddd xmm6, xmm1 ; Ch + h + w[i] + k[i] - - movdqa xmm1, xmm0 - psrld xmm0, 6 - movdqa xmm2, xmm0 - pslld xmm1, 7 - psrld xmm2, 5 - pxor xmm0, xmm1 - pxor xmm0, xmm2 - pslld xmm1, 14 - psrld xmm2, 14 - pxor xmm0, xmm1 - pxor xmm0, xmm2 - pslld xmm1, 5 - pxor xmm0, xmm1 ; Rotr32(e, 6) ^ Rotr32(e, 11) ^ Rotr32(e, 25) - paddd xmm6, xmm0 ; xmm6 = t1 - - movdqa xmm0, xmm3 ; d - paddd xmm0, xmm6 ; e = d+t1 - - movdqa xmm1, xmm5 ; =b - movdqa xmm3, xmm4 ; d = c - movdqa xmm2, xmm4 ; c - pand xmm2, xmm5 ; b & c - pand xmm4, xmm7 ; a & c - pand xmm1, xmm7 ; a & b - pxor xmm1, xmm4 - movdqa xmm4, xmm5 ; c = b - movdqa xmm5, xmm7 ; b = a - pxor xmm1, xmm2 ; (a & c) ^ (a & d) ^ (c & d) - paddd xmm6, xmm1 ; t1 + ((a & c) ^ (a & d) ^ (c & d)) - - movdqa xmm2, xmm7 - psrld xmm7, 2 - movdqa xmm1, xmm7 - pslld xmm2, 10 - psrld xmm1, 11 - pxor xmm7, xmm2 - pxor xmm7, xmm1 - pslld xmm2, 9 - psrld xmm1, 9 - pxor xmm7, xmm2 - pxor xmm7, xmm1 - pslld xmm2, 11 - pxor xmm7, xmm2 - paddd xmm7, xmm6 ; a = t1 + (Rotr32(a, 2) ^ Rotr32(a, 13) ^ Rotr32(a, 22)) + ((a & c) ^ (a & d) ^ (c & d)); - - cmp rax, rcx - jb LAB_LOOP - -; Finished the 64 rounds, calculate hash and save - - movd xmm1, dword [rdx+0*4] - pshufd xmm1, xmm1, 0 - paddd xmm7, xmm1 - - movd xmm1, dword [rdx+1*4] - pshufd xmm1, xmm1, 0 - paddd xmm5, xmm1 - - movd xmm1, dword [rdx+2*4] - pshufd xmm1, xmm1, 0 - paddd xmm4, xmm1 - - movd xmm1, dword [rdx+3*4] - pshufd xmm1, xmm1, 0 - paddd xmm3, xmm1 - - movd xmm1, dword [rdx+4*4] - pshufd xmm1, xmm1, 0 - paddd xmm0, xmm1 - - movd xmm1, dword [rdx+5*4] - pshufd xmm1, xmm1, 0 - paddd xmm8, xmm1 - - movd xmm1, dword [rdx+6*4] - pshufd xmm1, xmm1, 0 - paddd xmm9, xmm1 - - movd xmm1, dword [rdx+7*4] - pshufd xmm1, xmm1, 0 - paddd xmm10, xmm1 - -debug_me: - movdqa [rdi+0*16], xmm7 - movdqa [rdi+1*16], xmm5 - movdqa [rdi+2*16], xmm4 - movdqa [rdi+3*16], xmm3 - movdqa [rdi+4*16], xmm0 - movdqa [rdi+5*16], xmm8 - movdqa [rdi+6*16], xmm9 - movdqa [rdi+7*16], xmm10 - -LAB_RET: - pop rbx - ret
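
Note on the surviving code path: every scanhash_* routine deleted above, and the scanhash_scrypt() that miner_thread() still dispatches to, follows broadly the same driver contract — bump the nonce at offset 12 of the second 64-byte header block, hash, cheaply test the top word of the result, and stop on max_nonce or a work restart. The sketch below restates that contract in isolation; full_test_stub, restart_requested, and scanhash_generic are illustrative stand-ins for fulltest(), work_restart[] and the real per-algorithm routines, not code from this tree.

/*
 * Minimal sketch of the shared scanhash_* driver loop (illustrative only).
 */
#include <stdbool.h>
#include <stdint.h>

/* Stand-in for fulltest(): the real helper compares the whole 256-bit hash
 * against the target; here only the top word is re-checked for brevity. */
static bool full_test_stub(const unsigned char *hash, const unsigned char *target)
{
	return ((const uint32_t *)hash)[7] <= ((const uint32_t *)target)[7];
}

/* Stand-in for work_restart[thr_id].restart. */
static volatile bool restart_requested;

/* Generic shape of a scanhash_* routine.  hash_fn is assumed to produce the
 * 32-byte double-SHA-256 result for the 80-byte header, given the midstate of
 * the first 64 bytes and the second 64-byte block (with the nonce in place). */
static bool scanhash_generic(const unsigned char *midstate, unsigned char *data,
			     unsigned char *hash, const unsigned char *target,
			     uint32_t max_nonce, unsigned long *hashes_done,
			     void (*hash_fn)(unsigned char *out,
					     const unsigned char *midstate,
					     const unsigned char *data))
{
	uint32_t *hash32 = (uint32_t *)hash;
	uint32_t *nonce = (uint32_t *)(data + 12);	/* nonce offset in block 2 */
	uint32_t n = 0;
	unsigned long stat_ctr = 0;

	while (1) {
		*nonce = ++n;
		hash_fn(hash, midstate, data);
		stat_ctr++;

		/* Cheap pre-filter: a candidate must have its last hash word zero. */
		if (hash32[7] == 0 && full_test_stub(hash, target)) {
			*hashes_done = stat_ctr;
			return true;
		}
		if (n >= max_nonce || restart_requested) {
			*hashes_done = stat_ctr;
			return false;
		}
	}
}

scanhash_scrypt() keeps the same max_nonce / *hashes_done / work-restart behaviour, but takes the raw 80-byte header plus a scratch buffer instead of a midstate, as the remaining ALGO_SCRYPT case in miner_thread() shows.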