diff --git a/cpu-miner.c b/cpu-miner.c index 032a04a..2fea8e2 100644 --- a/cpu-miner.c +++ b/cpu-miner.c @@ -84,10 +84,12 @@ struct workio_cmd { enum sha256_algos { ALGO_SCRYPT, /* scrypt(1024,1,1) */ + ALGO_SHA256D, /* SHA-256d */ }; static const char *algo_names[] = { [ALGO_SCRYPT] = "scrypt", + [ALGO_SHA256D] = "sha256d", }; bool opt_debug = false; @@ -125,6 +127,9 @@ double *thr_hashrates; static char const usage[] = "\ Usage: " PROGRAM_NAME " [OPTIONS]\n\ Options:\n\ + -a, --algo=ALGO specify the algorithm to use\n\ + scrypt scrypt(1024, 1, 1) (default)\n\ + sha256d SHA-256d\n\ -o, --url=URL URL of mining server (default: " DEF_RPC_URL ")\n\ -O, --userpass=U:P username:password pair for mining server\n\ -u, --user=USERNAME username for mining server\n\ @@ -223,7 +228,7 @@ static bool work_decode(const json_t *val, struct work *work) } for (i = 0; i < ARRAY_SIZE(work->data); i++) - work->data[i] = be32dec(work->data + i); + work->data[i] = le32dec(work->data + i); for (i = 0; i < ARRAY_SIZE(work->target); i++) work->target[i] = le32dec(work->target + i); @@ -251,7 +256,7 @@ static bool submit_upstream_work(CURL *curl, struct work *work) /* build hex string */ for (i = 0; i < ARRAY_SIZE(work->data); i++) - be32enc(work->data + i, work->data[i]); + le32enc(work->data + i, work->data[i]); hexstr = bin2hex((unsigned char *)work->data, sizeof(work->data)); if (unlikely(!hexstr)) { applog(LOG_ERR, "submit_upstream_work OOM"); @@ -544,7 +549,7 @@ static void *miner_thread(void *userdata) - time(NULL); max64 *= thr_hashrates[thr_id]; if (max64 <= 0) - max64 = 0xfffLL; + max64 = opt_algo == ALGO_SCRYPT ? 0xfffLL : 0xfffffLL; if (work.data[19] + max64 > end_nonce) max_nonce = end_nonce; else @@ -560,6 +565,11 @@ static void *miner_thread(void *userdata) max_nonce, &hashes_done); break; + case ALGO_SHA256D: + rc = scanhash_sha256d(thr_id, work.data, work.target, + max_nonce, &hashes_done); + break; + default: /* should never happen */ goto out; diff --git a/miner.h b/miner.h index d0de194..2307729 100644 --- a/miner.h +++ b/miner.h @@ -113,11 +113,14 @@ void sha256_init(uint32_t *state); void sha256_transform(uint32_t *state, const uint32_t *block, int swap); #if defined(__x86_64__) -#define SHA256_4WAY +#define SHA256_4WAY 1 void sha256_init_4way(uint32_t *state); void sha256_transform_4way(uint32_t *state, const uint32_t *block, int swap); #endif +extern int scanhash_sha256d(int thr_id, uint32_t *pdata, + const uint32_t *ptarget, uint32_t max_nonce, unsigned long *hashes_done); + extern unsigned char *scrypt_buffer_alloc(); extern int scanhash_scrypt(int thr_id, uint32_t *pdata, unsigned char *scratchbuf, const uint32_t *ptarget, diff --git a/scrypt.c b/scrypt.c index 0830737..3ae53ae 100644 --- a/scrypt.c +++ b/scrypt.c @@ -35,7 +35,7 @@ #include static const uint32_t keypad[12] = { - 0x00000080, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0x80020000 + 0x80000000, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0x00000280 }; static const uint32_t innerpad[11] = { 0x80000000, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0x000004a0 @@ -57,7 +57,7 @@ static inline void HMAC_SHA256_80_init(const uint32_t *key, /* tstate is assumed to contain the midstate of key */ memcpy(pad, key + 16, 16); memcpy(pad + 4, keypad, 48); - sha256_transform(tstate, pad, 1); + sha256_transform(tstate, pad, 0); memcpy(ihash, tstate, 32); sha256_init(ostate); @@ -83,10 +83,9 @@ static inline void PBKDF2_SHA256_80_128(const uint32_t *tstate, int i, j; memcpy(istate, tstate, 32); - sha256_transform(istate, salt, 1); + sha256_transform(istate, salt, 0); - for (i = 0; i < 
4; i++) - ibuf[i] = swab32(salt[16 + i]); + memcpy(ibuf, salt + 16, 16); memcpy(ibuf + 5, innerpad, 44); memcpy(obuf + 8, outerpad, 32); @@ -123,7 +122,7 @@ static inline void PBKDF2_SHA256_128_32(uint32_t *tstate, uint32_t *ostate, #ifdef SHA256_4WAY static const uint32_t keypad_4way[4 * 12] = { - 0x00000080, 0x00000080, 0x00000080, 0x00000080, + 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, @@ -134,7 +133,7 @@ static const uint32_t keypad_4way[4 * 12] = { 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, - 0x80020000, 0x80020000, 0x80020000, 0x80020000 + 0x00000280, 0x00000280, 0x00000280, 0x00000280 }; static const uint32_t innerpad_4way[4 * 11] = { 0x80000000, 0x80000000, 0x80000000, 0x80000000, @@ -159,7 +158,7 @@ static const uint32_t outerpad_4way[4 * 8] = { 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000300, 0x00000300, 0x00000300, 0x00000300 }; -static const uint32_t finalblk_4way[4 * 16] = { +static const uint32_t finalblk_4way[4 * 16] __attribute__((aligned(16))) = { 0x00000001, 0x00000001, 0x00000001, 0x00000001, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, @@ -181,14 +180,14 @@ static const uint32_t finalblk_4way[4 * 16] = { static inline void HMAC_SHA256_80_init_4way(const uint32_t *key, uint32_t *tstate, uint32_t *ostate) { - uint32_t ihash[4 * 8]; - uint32_t pad[4 * 16]; + uint32_t ihash[4 * 8] __attribute__((aligned(16))); + uint32_t pad[4 * 16] __attribute__((aligned(16))); int i; /* tstate is assumed to contain the midstate of key */ memcpy(pad, key + 4 * 16, 4 * 16); memcpy(pad + 4 * 4, keypad_4way, 4 * 48); - sha256_transform_4way(tstate, pad, 1); + sha256_transform_4way(tstate, pad, 0); memcpy(ihash, tstate, 4 * 32); sha256_init_4way(ostate); @@ -209,15 +208,16 @@ static inline void HMAC_SHA256_80_init_4way(const uint32_t *key, static inline void PBKDF2_SHA256_80_128_4way(const uint32_t *tstate, const uint32_t *ostate, const uint32_t *salt, uint32_t *output) { - uint32_t istate[4 * 8], ostate2[4 * 8]; - uint32_t ibuf[4 * 16], obuf[4 * 16]; + uint32_t istate[4 * 8] __attribute__((aligned(16))); + uint32_t ostate2[4 * 8] __attribute__((aligned(16))); + uint32_t ibuf[4 * 16] __attribute__((aligned(16))); + uint32_t obuf[4 * 16] __attribute__((aligned(16))); int i, j; memcpy(istate, tstate, 4 * 32); - sha256_transform_4way(istate, salt, 1); + sha256_transform_4way(istate, salt, 0); - for (i = 0; i < 4 * 4; i++) - ibuf[i] = swab32(salt[4 * 16 + i]); + memcpy(ibuf, salt + 4 * 16, 4 * 16); memcpy(ibuf + 4 * 5, innerpad_4way, 4 * 44); memcpy(obuf + 4 * 8, outerpad_4way, 4 * 32); @@ -239,7 +239,7 @@ static inline void PBKDF2_SHA256_80_128_4way(const uint32_t *tstate, static inline void PBKDF2_SHA256_128_32_4way(uint32_t *tstate, uint32_t *ostate, const uint32_t *salt, uint32_t *output) { - uint32_t buf[4 * 16]; + uint32_t buf[4 * 16] __attribute__((aligned(16))); int i; sha256_transform_4way(tstate, salt, 1); @@ -270,7 +270,7 @@ void scrypt_core(uint32_t *X, uint32_t *V); #else -static inline void salsa20_8(uint32_t B[16], const uint32_t Bx[16]) +static inline void xor_salsa8(uint32_t B[16], const uint32_t Bx[16]) { uint32_t x00,x01,x02,x03,x04,x05,x06,x07,x08,x09,x10,x11,x12,x13,x14,x15; int i; @@ -341,21 +341,18 @@ static inline void salsa20_8(uint32_t B[16], const uint32_t Bx[16]) 
static inline void scrypt_core(uint32_t *X, uint32_t *V) { uint32_t i, j, k; - uint64_t *p1, *p2; - p1 = (uint64_t *)X; for (i = 0; i < 1024; i++) { memcpy(&V[i * 32], X, 128); - salsa20_8(&X[0], &X[16]); - salsa20_8(&X[16], &X[0]); + xor_salsa8(&X[0], &X[16]); + xor_salsa8(&X[16], &X[0]); } for (i = 0; i < 1024; i++) { - j = X[16] & 1023; - p2 = (uint64_t *)(&V[j * 32]); - for (k = 0; k < 16; k++) - p1[k] ^= p2[k]; - salsa20_8(&X[0], &X[16]); - salsa20_8(&X[16], &X[0]); + j = 32 * (X[16] & 1023); + for (k = 0; k < 32; k++) + X[k] ^= V[j + k]; + xor_salsa8(&X[0], &X[16]); + xor_salsa8(&X[16], &X[0]); } } @@ -377,8 +374,9 @@ static void scrypt_1024_1_1_256_sp(const uint32_t *input, uint32_t *output, uint32_t *midstate, unsigned char *scratchpad) { uint32_t tstate[8], ostate[8]; - uint32_t *V; uint32_t X[32]; + uint32_t *V; + V = (uint32_t *)(((uintptr_t)(scratchpad) + 63) & ~ (uintptr_t)(63)); memcpy(tstate, midstate, 32); @@ -396,8 +394,8 @@ static void scrypt_1024_1_1_256_sp_2way(const uint32_t *input, { uint32_t tstate1[8], tstate2[8]; uint32_t ostate1[8], ostate2[8]; - uint32_t *V; uint32_t X[2 * 32], *Y = X + 32; + uint32_t *V; V = (uint32_t *)(((uintptr_t)(scratchpad) + 63) & ~ (uintptr_t)(63)); @@ -419,9 +417,10 @@ static void scrypt_1024_1_1_256_sp_2way(const uint32_t *input, static void scrypt_1024_1_1_256_sp_3way(const uint32_t *input, uint32_t *output, uint32_t *midstate, unsigned char *scratchpad) { - uint32_t tstate[4 * 8], ostate[4 * 8]; - uint32_t X[3 * 32]; - uint32_t W[4 * 32]; + uint32_t tstate[4 * 8] __attribute__((aligned(128))); + uint32_t ostate[4 * 8] __attribute__((aligned(128))); + uint32_t W[4 * 32] __attribute__((aligned(128))); + uint32_t X[3 * 32] __attribute__((aligned(128))); uint32_t *V; int i; @@ -474,7 +473,7 @@ int scanhash_scrypt(int thr_id, uint32_t *pdata, memcpy(data + i * 20, pdata, 80); sha256_init(midstate); - sha256_transform(midstate, data, 1); + sha256_transform(midstate, data, 0); do { for (i = 0; i < throughput; i++) diff --git a/sha2-x64.S b/sha2-x64.S index 449b27c..5dbc73d 100644 --- a/sha2-x64.S +++ b/sha2-x64.S @@ -16,7 +16,7 @@ #if defined(__x86_64__) .data - .p2align 6 + .p2align 7 sha256_4h: .long 0x6a09e667, 0x6a09e667, 0x6a09e667, 0x6a09e667 .long 0xbb67ae85, 0xbb67ae85, 0xbb67ae85, 0xbb67ae85 @@ -28,7 +28,7 @@ sha256_4h: .long 0x5be0cd19, 0x5be0cd19, 0x5be0cd19, 0x5be0cd19 .data - .p2align 6 + .p2align 7 sha256_4k: .long 0x428a2f98, 0x428a2f98, 0x428a2f98, 0x428a2f98 .long 0x71374491, 0x71374491, 0x71374491, 0x71374491 @@ -126,6 +126,537 @@ _sha256_init_4way: #endif ret + +.macro sha256_sse2_extend_round i + movdqa (\i-15)*16(%rcx), %xmm0 + movdqa (\i-14)*16(%rcx), %xmm4 + movdqa %xmm0, %xmm2 + movdqa %xmm4, %xmm6 + psrld $3, %xmm0 + psrld $3, %xmm4 + movdqa %xmm0, %xmm1 + movdqa %xmm4, %xmm5 + pslld $14, %xmm2 + pslld $14, %xmm6 + psrld $4, %xmm1 + psrld $4, %xmm5 + pxor %xmm1, %xmm0 + pxor %xmm5, %xmm4 + psrld $11, %xmm1 + psrld $11, %xmm5 + pxor %xmm2, %xmm0 + pxor %xmm6, %xmm4 + pslld $11, %xmm2 + pslld $11, %xmm6 + pxor %xmm1, %xmm0 + pxor %xmm5, %xmm4 + pxor %xmm2, %xmm0 + pxor %xmm6, %xmm4 + + movdqa (\i-2)*16(%rcx), %xmm3 + movdqa (\i-1)*16(%rcx), %xmm7 + paddd (\i-16)*16(%rcx), %xmm0 + paddd (\i-15)*16(%rcx), %xmm4 + + movdqa %xmm3, %xmm2 + movdqa %xmm7, %xmm6 + psrld $10, %xmm3 + psrld $10, %xmm7 + movdqa %xmm3, %xmm1 + movdqa %xmm7, %xmm5 + + paddd (\i-7)*16(%rcx), %xmm0 + + pslld $13, %xmm2 + pslld $13, %xmm6 + psrld $7, %xmm1 + psrld $7, %xmm5 + + paddd (\i-6)*16(%rcx), %xmm4 + + pxor %xmm1, %xmm3 + pxor %xmm5, %xmm7 + 
psrld $2, %xmm1 + psrld $2, %xmm5 + pxor %xmm2, %xmm3 + pxor %xmm6, %xmm7 + pslld $2, %xmm2 + pslld $2, %xmm6 + pxor %xmm1, %xmm3 + pxor %xmm5, %xmm7 + pxor %xmm2, %xmm3 + pxor %xmm6, %xmm7 + + paddd %xmm3, %xmm0 + paddd %xmm7, %xmm4 + movdqa %xmm0, \i*16(%rcx) + movdqa %xmm4, (\i+1)*16(%rcx) +.endm + + .text + .p2align 6 +sha256_sse2_extend_loop: + sha256_sse2_extend_round 0 + sha256_sse2_extend_round 2 + sha256_sse2_extend_round 4 + sha256_sse2_extend_round 6 + sha256_sse2_extend_round 8 + sha256_sse2_extend_round 10 + sha256_sse2_extend_round 12 + sha256_sse2_extend_round 14 + sha256_sse2_extend_round 16 + sha256_sse2_extend_round 18 + sha256_sse2_extend_round 20 + sha256_sse2_extend_round 22 + sha256_sse2_extend_round 24 + sha256_sse2_extend_round 26 + sha256_sse2_extend_round 28 + sha256_sse2_extend_round 30 + sha256_sse2_extend_round 32 + sha256_sse2_extend_round 34 + sha256_sse2_extend_round 36 + sha256_sse2_extend_round 38 + sha256_sse2_extend_round 40 + sha256_sse2_extend_round 42 + sha256_sse2_extend_round 44 + sha256_sse2_extend_round 46 + ret + +.macro sha256_sse2_main_round i + movdqa 16*\i(%rax), %xmm6 + paddd 16*\i(%rcx), %xmm6 + paddd %xmm10, %xmm6 + + movdqa %xmm0, %xmm1 + movdqa %xmm9, %xmm2 + pandn %xmm2, %xmm1 + + movdqa %xmm2, %xmm10 + movdqa %xmm8, %xmm2 + movdqa %xmm2, %xmm9 + + pand %xmm0, %xmm2 + pxor %xmm2, %xmm1 + movdqa %xmm0, %xmm8 + + paddd %xmm1, %xmm6 + + movdqa %xmm0, %xmm1 + psrld $6, %xmm0 + movdqa %xmm0, %xmm2 + pslld $7, %xmm1 + psrld $5, %xmm2 + pxor %xmm1, %xmm0 + pxor %xmm2, %xmm0 + pslld $14, %xmm1 + psrld $14, %xmm2 + pxor %xmm1, %xmm0 + pxor %xmm2, %xmm0 + pslld $5, %xmm1 + pxor %xmm1, %xmm0 + paddd %xmm0, %xmm6 + + movdqa %xmm3, %xmm0 + paddd %xmm6, %xmm0 + + movdqa %xmm5, %xmm1 + movdqa %xmm4, %xmm3 + movdqa %xmm4, %xmm2 + pand %xmm5, %xmm2 + pand %xmm7, %xmm4 + pand %xmm7, %xmm1 + pxor %xmm4, %xmm1 + movdqa %xmm5, %xmm4 + movdqa %xmm7, %xmm5 + pxor %xmm2, %xmm1 + paddd %xmm1, %xmm6 + + movdqa %xmm7, %xmm2 + psrld $2, %xmm7 + movdqa %xmm7, %xmm1 + pslld $10, %xmm2 + psrld $11, %xmm1 + pxor %xmm2, %xmm7 + pxor %xmm1, %xmm7 + pslld $9, %xmm2 + psrld $9, %xmm1 + pxor %xmm2, %xmm7 + pxor %xmm1, %xmm7 + pslld $11, %xmm2 + pxor %xmm2, %xmm7 + paddd %xmm6, %xmm7 +.endm + + .text + .p2align 6 +sha256_sse2_main_loop: + sha256_sse2_main_round 0 + sha256_sse2_main_round 1 + sha256_sse2_main_round 2 + sha256_sse2_main_round 3 + sha256_sse2_main_round 4 + sha256_sse2_main_round 5 + sha256_sse2_main_round 6 + sha256_sse2_main_round 7 + sha256_sse2_main_round 8 + sha256_sse2_main_round 9 + sha256_sse2_main_round 10 + sha256_sse2_main_round 11 + sha256_sse2_main_round 12 + sha256_sse2_main_round 13 + sha256_sse2_main_round 14 + sha256_sse2_main_round 15 + sha256_sse2_main_round 16 + sha256_sse2_main_round 17 + sha256_sse2_main_round 18 + sha256_sse2_main_round 19 + sha256_sse2_main_round 20 + sha256_sse2_main_round 21 + sha256_sse2_main_round 22 + sha256_sse2_main_round 23 + sha256_sse2_main_round 24 + sha256_sse2_main_round 25 + sha256_sse2_main_round 26 + sha256_sse2_main_round 27 + sha256_sse2_main_round 28 + sha256_sse2_main_round 29 + sha256_sse2_main_round 30 + sha256_sse2_main_round 31 + sha256_sse2_main_round 32 + sha256_sse2_main_round 33 + sha256_sse2_main_round 34 + sha256_sse2_main_round 35 + sha256_sse2_main_round 36 + sha256_sse2_main_round 37 + sha256_sse2_main_round 38 + sha256_sse2_main_round 39 + sha256_sse2_main_round 40 + sha256_sse2_main_round 41 + sha256_sse2_main_round 42 + sha256_sse2_main_round 43 + sha256_sse2_main_round 44 + 
sha256_sse2_main_round 45 + sha256_sse2_main_round 46 + sha256_sse2_main_round 47 + sha256_sse2_main_round 48 + sha256_sse2_main_round 49 + sha256_sse2_main_round 50 + sha256_sse2_main_round 51 + sha256_sse2_main_round 52 + sha256_sse2_main_round 53 + sha256_sse2_main_round 54 + sha256_sse2_main_round 55 + sha256_sse2_main_round 56 + sha256_sse2_main_round 57 + sha256_sse2_main_round 58 + sha256_sse2_main_round 59 + sha256_sse2_main_round 60 + sha256_sse2_main_round 61 + sha256_sse2_main_round 62 + sha256_sse2_main_round 63 + ret + + +#if defined(USE_AVX) + +.macro sha256_avx_extend_round i + movdqa (\i-15)*16(%rcx), %xmm0 + movdqa (\i-14)*16(%rcx), %xmm4 + vpslld $14, %xmm0, %xmm2 + vpslld $14, %xmm4, %xmm6 + psrld $3, %xmm0 + psrld $3, %xmm4 + vpsrld $4, %xmm0, %xmm1 + vpsrld $4, %xmm4, %xmm5 + pxor %xmm1, %xmm0 + pxor %xmm5, %xmm4 + psrld $11, %xmm1 + psrld $11, %xmm5 + pxor %xmm2, %xmm0 + pxor %xmm6, %xmm4 + pslld $11, %xmm2 + pslld $11, %xmm6 + pxor %xmm1, %xmm0 + pxor %xmm5, %xmm4 + pxor %xmm2, %xmm0 + pxor %xmm6, %xmm4 + + movdqa (\i-2)*16(%rcx), %xmm3 + movdqa (\i-1)*16(%rcx), %xmm7 + paddd (\i-16)*16(%rcx), %xmm0 + paddd (\i-15)*16(%rcx), %xmm4 + + vpslld $13, %xmm3, %xmm2 + vpslld $13, %xmm7, %xmm6 + psrld $10, %xmm3 + psrld $10, %xmm7 + + paddd (\i-7)*16(%rcx), %xmm0 + + vpsrld $7, %xmm3, %xmm1 + vpsrld $7, %xmm7, %xmm5 + + paddd (\i-6)*16(%rcx), %xmm4 + + pxor %xmm1, %xmm3 + pxor %xmm5, %xmm7 + psrld $2, %xmm1 + psrld $2, %xmm5 + pxor %xmm2, %xmm3 + pxor %xmm6, %xmm7 + pslld $2, %xmm2 + pslld $2, %xmm6 + pxor %xmm1, %xmm3 + pxor %xmm5, %xmm7 + pxor %xmm2, %xmm3 + pxor %xmm6, %xmm7 + + paddd %xmm3, %xmm0 + paddd %xmm7, %xmm4 + movdqa %xmm0, \i*16(%rcx) + movdqa %xmm4, (\i+1)*16(%rcx) +.endm + + .text + .p2align 6 +sha256_avx_extend_loop: + sha256_avx_extend_round 0 + sha256_avx_extend_round 2 + sha256_avx_extend_round 4 + sha256_avx_extend_round 6 + sha256_avx_extend_round 8 + sha256_avx_extend_round 10 + sha256_avx_extend_round 12 + sha256_avx_extend_round 14 + sha256_avx_extend_round 16 + sha256_avx_extend_round 18 + sha256_avx_extend_round 20 + sha256_avx_extend_round 22 + sha256_avx_extend_round 24 + sha256_avx_extend_round 26 + sha256_avx_extend_round 28 + sha256_avx_extend_round 30 + sha256_avx_extend_round 32 + sha256_avx_extend_round 34 + sha256_avx_extend_round 36 + sha256_avx_extend_round 38 + sha256_avx_extend_round 40 + sha256_avx_extend_round 42 + sha256_avx_extend_round 44 + sha256_avx_extend_round 46 + ret + +.macro sha256_avx_main_round i, r0, r1, r2, r3, r4, r5, r6, r7 + vpaddd 16*(\i)(%rax), \r0, %xmm6 + paddd 16*(\i)(%rcx), %xmm6 + + vpandn \r1, \r3, %xmm1 + vpand \r3, \r2, %xmm2 + pxor %xmm2, %xmm1 + paddd %xmm1, %xmm6 + + vpslld $7, \r3, %xmm1 + vpsrld $6, \r3, \r0 + vpsrld $5, \r0, %xmm2 + pxor %xmm1, \r0 + pxor %xmm2, \r0 + pslld $14, %xmm1 + psrld $14, %xmm2 + pxor %xmm1, \r0 + pxor %xmm2, \r0 + pslld $5, %xmm1 + pxor %xmm1, \r0 + paddd \r0, %xmm6 + + vpaddd %xmm6, \r4, \r0 + + vpand \r6, \r5, %xmm2 + vpand \r7, \r5, \r4 + vpand \r7, \r6, %xmm1 + pxor \r4, %xmm1 + pxor %xmm2, %xmm1 + paddd %xmm1, %xmm6 + + vpslld $10, \r7, %xmm2 + vpsrld $2, \r7, \r4 + vpsrld $11, \r4, %xmm1 + pxor %xmm2, \r4 + pxor %xmm1, \r4 + pslld $9, %xmm2 + psrld $9, %xmm1 + pxor %xmm2, \r4 + pxor %xmm1, \r4 + pslld $11, %xmm2 + pxor %xmm2, \r4 + paddd %xmm6, \r4 +.endm + +.macro sha256_avx_main_quadround i + sha256_avx_main_round \i+0, %xmm10, %xmm9, %xmm8, %xmm0, %xmm3, %xmm4, %xmm5, %xmm7 + sha256_avx_main_round \i+1, %xmm9, %xmm8, %xmm0, %xmm10, %xmm4, %xmm5, %xmm7, %xmm3 + 
sha256_avx_main_round \i+2, %xmm8, %xmm0, %xmm10, %xmm9, %xmm5, %xmm7, %xmm3, %xmm4 + sha256_avx_main_round \i+3, %xmm0, %xmm10, %xmm9, %xmm8, %xmm7, %xmm3, %xmm4, %xmm5 +.endm + + .text + .p2align 6 +sha256_avx_main_loop: + sha256_avx_main_quadround 0 + sha256_avx_main_quadround 4 + sha256_avx_main_quadround 8 + sha256_avx_main_quadround 12 + sha256_avx_main_quadround 16 + sha256_avx_main_quadround 20 + sha256_avx_main_quadround 24 + sha256_avx_main_quadround 28 + sha256_avx_main_quadround 32 + sha256_avx_main_quadround 36 + sha256_avx_main_quadround 40 + sha256_avx_main_quadround 44 + sha256_avx_main_quadround 48 + sha256_avx_main_quadround 52 + sha256_avx_main_quadround 56 + sha256_avx_main_quadround 60 + ret + +#endif /* USE_AVX */ + + +#if defined(USE_XOP) + +.macro sha256_xop_extend_round i + vmovdqa (\i-15)*16(%rcx), %xmm0 + vmovdqa (\i-14)*16(%rcx), %xmm4 + vprotd $25, %xmm0, %xmm1 + vprotd $25, %xmm4, %xmm5 + vprotd $14, %xmm0, %xmm2 + vprotd $14, %xmm4, %xmm6 + vpxor %xmm1, %xmm2, %xmm2 + vpxor %xmm5, %xmm6, %xmm6 + vpsrld $3, %xmm0, %xmm0 + vpsrld $3, %xmm4, %xmm4 + vpxor %xmm2, %xmm0, %xmm0 + vpxor %xmm6, %xmm4, %xmm4 + + vmovdqa (\i-2)*16(%rcx), %xmm3 + vmovdqa (\i-1)*16(%rcx), %xmm7 + vpaddd (\i-16)*16(%rcx), %xmm0, %xmm0 + vpaddd (\i-15)*16(%rcx), %xmm4, %xmm4 + + vprotd $15, %xmm3, %xmm1 + vprotd $15, %xmm7, %xmm5 + vprotd $13, %xmm3, %xmm2 + vprotd $13, %xmm7, %xmm6 + vpxor %xmm1, %xmm2, %xmm2 + vpxor %xmm5, %xmm6, %xmm6 + + vpaddd (\i-7)*16(%rcx), %xmm0, %xmm0 + vpaddd (\i-6)*16(%rcx), %xmm4, %xmm4 + + vpsrld $10, %xmm3, %xmm3 + vpsrld $10, %xmm7, %xmm7 + vpxor %xmm2, %xmm3, %xmm3 + vpxor %xmm6, %xmm7, %xmm7 + + vpaddd %xmm3, %xmm0, %xmm0 + vpaddd %xmm7, %xmm4, %xmm4 + vmovdqa %xmm0, \i*16(%rcx) + vmovdqa %xmm4, (\i+1)*16(%rcx) +.endm + + .text + .p2align 6 +sha256_xop_extend_loop: + sha256_xop_extend_round 0 + sha256_xop_extend_round 2 + sha256_xop_extend_round 4 + sha256_xop_extend_round 6 + sha256_xop_extend_round 8 + sha256_xop_extend_round 10 + sha256_xop_extend_round 12 + sha256_xop_extend_round 14 + sha256_xop_extend_round 16 + sha256_xop_extend_round 18 + sha256_xop_extend_round 20 + sha256_xop_extend_round 22 + sha256_xop_extend_round 24 + sha256_xop_extend_round 26 + sha256_xop_extend_round 28 + sha256_xop_extend_round 30 + sha256_xop_extend_round 32 + sha256_xop_extend_round 34 + sha256_xop_extend_round 36 + sha256_xop_extend_round 38 + sha256_xop_extend_round 40 + sha256_xop_extend_round 42 + sha256_xop_extend_round 44 + sha256_xop_extend_round 46 + ret + +.macro sha256_xop_main_round i, r0, r1, r2, r3, r4, r5, r6, r7 + vpaddd 16*(\i)(%rax), \r0, %xmm6 + vpaddd 16*(\i)(%rcx), %xmm6, %xmm6 + + vpandn \r1, \r3, %xmm1 + vpand \r3, \r2, %xmm2 + vpxor %xmm2, %xmm1, %xmm1 + vpaddd %xmm1, %xmm6, %xmm6 + + vprotd $26, \r3, %xmm1 + vprotd $21, \r3, %xmm2 + vpxor %xmm1, %xmm2, %xmm2 + vprotd $7, \r3, \r0 + vpxor %xmm2, \r0, \r0 + vpaddd \r0, %xmm6, %xmm6 + + vpaddd %xmm6, \r4, \r0 + + vpand \r6, \r5, %xmm2 + vpand \r7, \r5, \r4 + vpand \r7, \r6, %xmm1 + vpxor \r4, %xmm1, %xmm1 + vpxor %xmm2, %xmm1, %xmm1 + vpaddd %xmm1, %xmm6, %xmm6 + + vprotd $30, \r7, %xmm1 + vprotd $19, \r7, %xmm2 + vpxor %xmm1, %xmm2, %xmm2 + vprotd $10, \r7, \r4 + vpxor %xmm2, \r4, \r4 + vpaddd %xmm6, \r4, \r4 +.endm + +.macro sha256_xop_main_quadround i + sha256_xop_main_round \i+0, %xmm10, %xmm9, %xmm8, %xmm0, %xmm3, %xmm4, %xmm5, %xmm7 + sha256_xop_main_round \i+1, %xmm9, %xmm8, %xmm0, %xmm10, %xmm4, %xmm5, %xmm7, %xmm3 + sha256_xop_main_round \i+2, %xmm8, %xmm0, %xmm10, %xmm9, %xmm5, %xmm7, 
%xmm3, %xmm4 + sha256_xop_main_round \i+3, %xmm0, %xmm10, %xmm9, %xmm8, %xmm7, %xmm3, %xmm4, %xmm5 +.endm + + .text + .p2align 6 +sha256_xop_main_loop: + sha256_xop_main_quadround 0 + sha256_xop_main_quadround 4 + sha256_xop_main_quadround 8 + sha256_xop_main_quadround 12 + sha256_xop_main_quadround 16 + sha256_xop_main_quadround 20 + sha256_xop_main_quadround 24 + sha256_xop_main_quadround 28 + sha256_xop_main_quadround 32 + sha256_xop_main_quadround 36 + sha256_xop_main_quadround 40 + sha256_xop_main_quadround 44 + sha256_xop_main_quadround 48 + sha256_xop_main_quadround 52 + sha256_xop_main_quadround 56 + sha256_xop_main_quadround 60 + ret + +#endif /* USE_XOP */ + + .macro p2bswap_rsi_rsp i movdqu \i*16(%rsi), %xmm0 movdqu (\i+1)*16(%rsi), %xmm2 @@ -165,7 +696,9 @@ _sha256_transform_4way: movq %rdx, %rsi movq %r8, %rdx #endif + movq %rsp, %r8 subq $1032, %rsp + andq $-128, %rsp testq %rdx, %rdx jz sha256_transform_4way_block_copy @@ -391,7 +924,7 @@ sha256_transform_4way_main_loop: movdqu %xmm9, 96(%rdi) movdqu %xmm10, 112(%rdi) - addq $1032, %rsp + movq %r8, %rsp #if defined(WIN64) popq %rsi movdqa 0(%rsp), %xmm6 @@ -404,5 +937,414 @@ sha256_transform_4way_main_loop: popq %rdi #endif ret + + + .data + .p2align 3 +sha256d_4way_addr: + .quad 0x0 + + .text + .p2align 6 + .globl sha256d_4way + .globl _sha256d_4way +sha256d_4way: +_sha256d_4way: + movq sha256d_4way_addr(%rip), %rax + testq %rax, %rax + jz sha256d_4way_set + jmp *%rax + +sha256d_4way_set: + pushq %rbx + pushq %rcx + pushq %rdx + +#if defined(USE_AVX) + # Check for AVX and OSXSAVE support + movl $1, %eax + cpuid + andl $0x18000000, %ecx + cmpl $0x18000000, %ecx + jne sha256d_4way_set_sse2 + # Check for XMM and YMM state support + xorl %ecx, %ecx + xgetbv + andl $0x00000006, %eax + cmpl $0x00000006, %eax + jne sha256d_4way_set_sse2 +#if defined(USE_XOP) + # Check for XOP support + movl $0x80000001, %eax + cpuid + andl $0x00000800, %ecx + jz sha256d_4way_set_avx + +sha256d_4way_set_xop: + leaq sha256d_4way_xop(%rip), %rax + jmp sha256d_4way_set_done +#endif /* USE_XOP */ + +sha256d_4way_set_avx: + leaq sha256d_4way_avx(%rip), %rax + jmp sha256d_4way_set_done +#endif /* USE_AVX */ + +sha256d_4way_set_sse2: + leaq sha256d_4way_sse2(%rip), %rax + +sha256d_4way_set_done: + movq %rax, sha256d_4way_addr(%rip) + popq %rdx + popq %rcx + popq %rbx + jmp *%rax + + + .p2align 6 +sha256d_4way_sse2: +#if defined(WIN64) + pushq %rdi + subq $80, %rsp + movdqa %xmm6, 0(%rsp) + movdqa %xmm7, 16(%rsp) + movdqa %xmm8, 32(%rsp) + movdqa %xmm9, 48(%rsp) + movdqa %xmm10, 64(%rsp) + pushq %rsi + movq %rcx, %rdi + movq %rdx, %rsi + movq %r8, %rdx +#endif + subq $1032, %rsp + + leaq 256(%rsi), %rcx + call sha256_sse2_extend_loop + + movdqa 0(%rdx), %xmm7 + movdqa 16(%rdx), %xmm5 + movdqa 32(%rdx), %xmm4 + movdqa 48(%rdx), %xmm3 + movdqa 64(%rdx), %xmm0 + movdqa 80(%rdx), %xmm8 + movdqa 96(%rdx), %xmm9 + movdqa 112(%rdx), %xmm10 + + movq %rsi, %rax + leaq sha256_4k(%rip), %rcx + call sha256_sse2_main_loop + + paddd 0(%rdx), %xmm7 + paddd 16(%rdx), %xmm5 + paddd 32(%rdx), %xmm4 + paddd 48(%rdx), %xmm3 + paddd 64(%rdx), %xmm0 + paddd 80(%rdx), %xmm8 + paddd 96(%rdx), %xmm9 + paddd 112(%rdx), %xmm10 + + movdqa %xmm7, 0(%rsp) + movdqa %xmm5, 16(%rsp) + movdqa %xmm4, 32(%rsp) + movdqa %xmm3, 48(%rsp) + movdqa %xmm0, 64(%rsp) + movdqa %xmm8, 80(%rsp) + movdqa %xmm9, 96(%rsp) + movdqa %xmm10, 112(%rsp) + + pxor %xmm0, %xmm0 + movq $0x8000000000000100, %rax + movd %rax, %xmm1 + pshufd $0x55, %xmm1, %xmm2 + pshufd $0x00, %xmm1, %xmm1 + movdqa %xmm2, 128(%rsp) + 
movdqa %xmm0, 144(%rsp) + movdqa %xmm0, 160(%rsp) + movdqa %xmm0, 176(%rsp) + movdqa %xmm0, 192(%rsp) + movdqa %xmm0, 208(%rsp) + movdqa %xmm0, 224(%rsp) + movdqa %xmm1, 240(%rsp) + + leaq 256(%rsp), %rcx + call sha256_sse2_extend_loop + + movdqa sha256_4h+0(%rip), %xmm7 + movdqa sha256_4h+16(%rip), %xmm5 + movdqa sha256_4h+32(%rip), %xmm4 + movdqa sha256_4h+48(%rip), %xmm3 + movdqa sha256_4h+64(%rip), %xmm0 + movdqa sha256_4h+80(%rip), %xmm8 + movdqa sha256_4h+96(%rip), %xmm9 + movdqa sha256_4h+112(%rip), %xmm10 + + movq %rsp, %rax + leaq sha256_4k(%rip), %rcx + call sha256_sse2_main_loop + + paddd sha256_4h+0(%rip), %xmm7 + paddd sha256_4h+16(%rip), %xmm5 + paddd sha256_4h+32(%rip), %xmm4 + paddd sha256_4h+48(%rip), %xmm3 + paddd sha256_4h+64(%rip), %xmm0 + paddd sha256_4h+80(%rip), %xmm8 + paddd sha256_4h+96(%rip), %xmm9 + paddd sha256_4h+112(%rip), %xmm10 + + movdqa %xmm7, 0(%rdi) + movdqa %xmm5, 16(%rdi) + movdqa %xmm4, 32(%rdi) + movdqa %xmm3, 48(%rdi) + movdqa %xmm0, 64(%rdi) + movdqa %xmm8, 80(%rdi) + movdqa %xmm9, 96(%rdi) + movdqa %xmm10, 112(%rdi) + + addq $1032, %rsp +#if defined(WIN64) + popq %rsi + movdqa 0(%rsp), %xmm6 + movdqa 16(%rsp), %xmm7 + movdqa 32(%rsp), %xmm8 + movdqa 48(%rsp), %xmm9 + movdqa 64(%rsp), %xmm10 + addq $80, %rsp + popq %rdi +#endif + ret + + +#if defined(USE_AVX) + + .p2align 6 +sha256d_4way_avx: +#if defined(WIN64) + pushq %rdi + subq $80, %rsp + movdqa %xmm6, 0(%rsp) + movdqa %xmm7, 16(%rsp) + movdqa %xmm8, 32(%rsp) + movdqa %xmm9, 48(%rsp) + movdqa %xmm10, 64(%rsp) + pushq %rsi + movq %rcx, %rdi + movq %rdx, %rsi + movq %r8, %rdx +#endif + subq $1032, %rsp + + leaq 256(%rsi), %rcx + call sha256_avx_extend_loop + + movdqa 0(%rdx), %xmm7 + movdqa 16(%rdx), %xmm5 + movdqa 32(%rdx), %xmm4 + movdqa 48(%rdx), %xmm3 + movdqa 64(%rdx), %xmm0 + movdqa 80(%rdx), %xmm8 + movdqa 96(%rdx), %xmm9 + movdqa 112(%rdx), %xmm10 + + movq %rsi, %rax + leaq sha256_4k(%rip), %rcx + call sha256_avx_main_loop + + paddd 0(%rdx), %xmm7 + paddd 16(%rdx), %xmm5 + paddd 32(%rdx), %xmm4 + paddd 48(%rdx), %xmm3 + paddd 64(%rdx), %xmm0 + paddd 80(%rdx), %xmm8 + paddd 96(%rdx), %xmm9 + paddd 112(%rdx), %xmm10 + + movdqa %xmm7, 0(%rsp) + movdqa %xmm5, 16(%rsp) + movdqa %xmm4, 32(%rsp) + movdqa %xmm3, 48(%rsp) + movdqa %xmm0, 64(%rsp) + movdqa %xmm8, 80(%rsp) + movdqa %xmm9, 96(%rsp) + movdqa %xmm10, 112(%rsp) + + pxor %xmm0, %xmm0 + movq $0x8000000000000100, %rax + movd %rax, %xmm1 + pshufd $0x55, %xmm1, %xmm2 + pshufd $0x00, %xmm1, %xmm1 + movdqa %xmm2, 128(%rsp) + movdqa %xmm0, 144(%rsp) + movdqa %xmm0, 160(%rsp) + movdqa %xmm0, 176(%rsp) + movdqa %xmm0, 192(%rsp) + movdqa %xmm0, 208(%rsp) + movdqa %xmm0, 224(%rsp) + movdqa %xmm1, 240(%rsp) + + leaq 256(%rsp), %rcx + call sha256_avx_extend_loop + + movdqa sha256_4h+0(%rip), %xmm7 + movdqa sha256_4h+16(%rip), %xmm5 + movdqa sha256_4h+32(%rip), %xmm4 + movdqa sha256_4h+48(%rip), %xmm3 + movdqa sha256_4h+64(%rip), %xmm0 + movdqa sha256_4h+80(%rip), %xmm8 + movdqa sha256_4h+96(%rip), %xmm9 + movdqa sha256_4h+112(%rip), %xmm10 + + movq %rsp, %rax + leaq sha256_4k(%rip), %rcx + call sha256_avx_main_loop + + paddd sha256_4h+0(%rip), %xmm7 + paddd sha256_4h+16(%rip), %xmm5 + paddd sha256_4h+32(%rip), %xmm4 + paddd sha256_4h+48(%rip), %xmm3 + paddd sha256_4h+64(%rip), %xmm0 + paddd sha256_4h+80(%rip), %xmm8 + paddd sha256_4h+96(%rip), %xmm9 + paddd sha256_4h+112(%rip), %xmm10 + + movdqa %xmm7, 0(%rdi) + movdqa %xmm5, 16(%rdi) + movdqa %xmm4, 32(%rdi) + movdqa %xmm3, 48(%rdi) + movdqa %xmm0, 64(%rdi) + movdqa %xmm8, 80(%rdi) + movdqa %xmm9, 
96(%rdi) + movdqa %xmm10, 112(%rdi) + + addq $1032, %rsp +#if defined(WIN64) + popq %rsi + movdqa 0(%rsp), %xmm6 + movdqa 16(%rsp), %xmm7 + movdqa 32(%rsp), %xmm8 + movdqa 48(%rsp), %xmm9 + movdqa 64(%rsp), %xmm10 + addq $80, %rsp + popq %rdi +#endif + ret + +#endif /* USE_AVX */ + + +#if defined(USE_XOP) + + .p2align 6 +sha256d_4way_xop: +#if defined(WIN64) + pushq %rdi + subq $80, %rsp + movdqa %xmm6, 0(%rsp) + movdqa %xmm7, 16(%rsp) + movdqa %xmm8, 32(%rsp) + movdqa %xmm9, 48(%rsp) + movdqa %xmm10, 64(%rsp) + pushq %rsi + movq %rcx, %rdi + movq %rdx, %rsi + movq %r8, %rdx +#endif + subq $1032, %rsp + + leaq 256(%rsi), %rcx + call sha256_xop_extend_loop + + movdqa 0(%rdx), %xmm7 + movdqa 16(%rdx), %xmm5 + movdqa 32(%rdx), %xmm4 + movdqa 48(%rdx), %xmm3 + movdqa 64(%rdx), %xmm0 + movdqa 80(%rdx), %xmm8 + movdqa 96(%rdx), %xmm9 + movdqa 112(%rdx), %xmm10 + + movq %rsi, %rax + leaq sha256_4k(%rip), %rcx + call sha256_xop_main_loop + + paddd 0(%rdx), %xmm7 + paddd 16(%rdx), %xmm5 + paddd 32(%rdx), %xmm4 + paddd 48(%rdx), %xmm3 + paddd 64(%rdx), %xmm0 + paddd 80(%rdx), %xmm8 + paddd 96(%rdx), %xmm9 + paddd 112(%rdx), %xmm10 + + movdqa %xmm7, 0(%rsp) + movdqa %xmm5, 16(%rsp) + movdqa %xmm4, 32(%rsp) + movdqa %xmm3, 48(%rsp) + movdqa %xmm0, 64(%rsp) + movdqa %xmm8, 80(%rsp) + movdqa %xmm9, 96(%rsp) + movdqa %xmm10, 112(%rsp) + + pxor %xmm0, %xmm0 + movq $0x8000000000000100, %rax + movd %rax, %xmm1 + pshufd $0x55, %xmm1, %xmm2 + pshufd $0x00, %xmm1, %xmm1 + movdqa %xmm2, 128(%rsp) + movdqa %xmm0, 144(%rsp) + movdqa %xmm0, 160(%rsp) + movdqa %xmm0, 176(%rsp) + movdqa %xmm0, 192(%rsp) + movdqa %xmm0, 208(%rsp) + movdqa %xmm0, 224(%rsp) + movdqa %xmm1, 240(%rsp) + + leaq 256(%rsp), %rcx + call sha256_xop_extend_loop + + movdqa sha256_4h+0(%rip), %xmm7 + movdqa sha256_4h+16(%rip), %xmm5 + movdqa sha256_4h+32(%rip), %xmm4 + movdqa sha256_4h+48(%rip), %xmm3 + movdqa sha256_4h+64(%rip), %xmm0 + movdqa sha256_4h+80(%rip), %xmm8 + movdqa sha256_4h+96(%rip), %xmm9 + movdqa sha256_4h+112(%rip), %xmm10 + + movq %rsp, %rax + leaq sha256_4k(%rip), %rcx + call sha256_xop_main_loop + + paddd sha256_4h+0(%rip), %xmm7 + paddd sha256_4h+16(%rip), %xmm5 + paddd sha256_4h+32(%rip), %xmm4 + paddd sha256_4h+48(%rip), %xmm3 + paddd sha256_4h+64(%rip), %xmm0 + paddd sha256_4h+80(%rip), %xmm8 + paddd sha256_4h+96(%rip), %xmm9 + paddd sha256_4h+112(%rip), %xmm10 + + movdqa %xmm7, 0(%rdi) + movdqa %xmm5, 16(%rdi) + movdqa %xmm4, 32(%rdi) + movdqa %xmm3, 48(%rdi) + movdqa %xmm0, 64(%rdi) + movdqa %xmm8, 80(%rdi) + movdqa %xmm9, 96(%rdi) + movdqa %xmm10, 112(%rdi) + + addq $1032, %rsp +#if defined(WIN64) + popq %rsi + movdqa 0(%rsp), %xmm6 + movdqa 16(%rsp), %xmm7 + movdqa 32(%rsp), %xmm8 + movdqa 48(%rsp), %xmm9 + movdqa 64(%rsp), %xmm10 + addq $80, %rsp + popq %rdi +#endif + ret + +#endif /* USE_XOP */ #endif diff --git a/sha2.c b/sha2.c index 78368b7..144bacf 100644 --- a/sha2.c +++ b/sha2.c @@ -13,43 +13,60 @@ #include #include +static const uint32_t sha256_h[8] = { + 0x6a09e667, 0xbb67ae85, 0x3c6ef372, 0xa54ff53a, + 0x510e527f, 0x9b05688c, 0x1f83d9ab, 0x5be0cd19 +}; + +static const uint32_t sha256_k[64] = { + 0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5, + 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5, + 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3, + 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174, + 0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc, + 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da, + 0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7, + 0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967, + 0x27b70a85, 
0x2e1b2138, 0x4d2c6dfc, 0x53380d13, + 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85, + 0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3, + 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070, + 0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5, + 0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3, + 0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208, + 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2 +}; + void sha256_init(uint32_t *state) { - /* Magic initialization constants */ - state[0] = 0x6a09e667; - state[1] = 0xbb67ae85; - state[2] = 0x3C6ef372; - state[3] = 0xa54ff53a; - state[4] = 0x510e527f; - state[5] = 0x9b05688c; - state[6] = 0x1f83d9Ab; - state[7] = 0x5be0cd19; + memcpy(state, sha256_h, 32); } /* Elementary functions used by SHA256 */ #define Ch(x, y, z) ((x & (y ^ z)) ^ z) #define Maj(x, y, z) ((x & (y | z)) | (y & z)) -#define SHR(x, n) (x >> n) #define ROTR(x, n) ((x >> n) | (x << (32 - n))) #define S0(x) (ROTR(x, 2) ^ ROTR(x, 13) ^ ROTR(x, 22)) #define S1(x) (ROTR(x, 6) ^ ROTR(x, 11) ^ ROTR(x, 25)) -#define s0(x) (ROTR(x, 7) ^ ROTR(x, 18) ^ SHR(x, 3)) -#define s1(x) (ROTR(x, 17) ^ ROTR(x, 19) ^ SHR(x, 10)) +#define s0(x) (ROTR(x, 7) ^ ROTR(x, 18) ^ (x >> 3)) +#define s1(x) (ROTR(x, 17) ^ ROTR(x, 19) ^ (x >> 10)) /* SHA256 round function */ #define RND(a, b, c, d, e, f, g, h, k) \ - t0 = h + S1(e) + Ch(e, f, g) + k; \ - t1 = S0(a) + Maj(a, b, c); \ - d += t0; \ - h = t0 + t1; + do { \ + t0 = h + S1(e) + Ch(e, f, g) + k; \ + t1 = S0(a) + Maj(a, b, c); \ + d += t0; \ + h = t0 + t1; \ + } while (0) /* Adjusted round function for rotating state */ -#define RNDr(S, W, i, k) \ +#define RNDr(S, W, i) \ RND(S[(64 - i) % 8], S[(65 - i) % 8], \ S[(66 - i) % 8], S[(67 - i) % 8], \ S[(68 - i) % 8], S[(69 - i) % 8], \ S[(70 - i) % 8], S[(71 - i) % 8], \ - W[i] + k) + W[i] + sha256_k[i]) /* * SHA256 block compression function. The 256-bit state is transformed via @@ -77,72 +94,299 @@ void sha256_transform(uint32_t *state, const uint32_t *block, int swap) memcpy(S, state, 32); /* 3. Mix. 
*/ - RNDr(S, W, 0, 0x428a2f98); - RNDr(S, W, 1, 0x71374491); - RNDr(S, W, 2, 0xb5c0fbcf); - RNDr(S, W, 3, 0xe9b5dba5); - RNDr(S, W, 4, 0x3956c25b); - RNDr(S, W, 5, 0x59f111f1); - RNDr(S, W, 6, 0x923f82a4); - RNDr(S, W, 7, 0xab1c5ed5); - RNDr(S, W, 8, 0xd807aa98); - RNDr(S, W, 9, 0x12835b01); - RNDr(S, W, 10, 0x243185be); - RNDr(S, W, 11, 0x550c7dc3); - RNDr(S, W, 12, 0x72be5d74); - RNDr(S, W, 13, 0x80deb1fe); - RNDr(S, W, 14, 0x9bdc06a7); - RNDr(S, W, 15, 0xc19bf174); - RNDr(S, W, 16, 0xe49b69c1); - RNDr(S, W, 17, 0xefbe4786); - RNDr(S, W, 18, 0x0fc19dc6); - RNDr(S, W, 19, 0x240ca1cc); - RNDr(S, W, 20, 0x2de92c6f); - RNDr(S, W, 21, 0x4a7484aa); - RNDr(S, W, 22, 0x5cb0a9dc); - RNDr(S, W, 23, 0x76f988da); - RNDr(S, W, 24, 0x983e5152); - RNDr(S, W, 25, 0xa831c66d); - RNDr(S, W, 26, 0xb00327c8); - RNDr(S, W, 27, 0xbf597fc7); - RNDr(S, W, 28, 0xc6e00bf3); - RNDr(S, W, 29, 0xd5a79147); - RNDr(S, W, 30, 0x06ca6351); - RNDr(S, W, 31, 0x14292967); - RNDr(S, W, 32, 0x27b70a85); - RNDr(S, W, 33, 0x2e1b2138); - RNDr(S, W, 34, 0x4d2c6dfc); - RNDr(S, W, 35, 0x53380d13); - RNDr(S, W, 36, 0x650a7354); - RNDr(S, W, 37, 0x766a0abb); - RNDr(S, W, 38, 0x81c2c92e); - RNDr(S, W, 39, 0x92722c85); - RNDr(S, W, 40, 0xa2bfe8a1); - RNDr(S, W, 41, 0xa81a664b); - RNDr(S, W, 42, 0xc24b8b70); - RNDr(S, W, 43, 0xc76c51a3); - RNDr(S, W, 44, 0xd192e819); - RNDr(S, W, 45, 0xd6990624); - RNDr(S, W, 46, 0xf40e3585); - RNDr(S, W, 47, 0x106aa070); - RNDr(S, W, 48, 0x19a4c116); - RNDr(S, W, 49, 0x1e376c08); - RNDr(S, W, 50, 0x2748774c); - RNDr(S, W, 51, 0x34b0bcb5); - RNDr(S, W, 52, 0x391c0cb3); - RNDr(S, W, 53, 0x4ed8aa4a); - RNDr(S, W, 54, 0x5b9cca4f); - RNDr(S, W, 55, 0x682e6ff3); - RNDr(S, W, 56, 0x748f82ee); - RNDr(S, W, 57, 0x78a5636f); - RNDr(S, W, 58, 0x84c87814); - RNDr(S, W, 59, 0x8cc70208); - RNDr(S, W, 60, 0x90befffa); - RNDr(S, W, 61, 0xa4506ceb); - RNDr(S, W, 62, 0xbef9a3f7); - RNDr(S, W, 63, 0xc67178f2); + RNDr(S, W, 0); + RNDr(S, W, 1); + RNDr(S, W, 2); + RNDr(S, W, 3); + RNDr(S, W, 4); + RNDr(S, W, 5); + RNDr(S, W, 6); + RNDr(S, W, 7); + RNDr(S, W, 8); + RNDr(S, W, 9); + RNDr(S, W, 10); + RNDr(S, W, 11); + RNDr(S, W, 12); + RNDr(S, W, 13); + RNDr(S, W, 14); + RNDr(S, W, 15); + RNDr(S, W, 16); + RNDr(S, W, 17); + RNDr(S, W, 18); + RNDr(S, W, 19); + RNDr(S, W, 20); + RNDr(S, W, 21); + RNDr(S, W, 22); + RNDr(S, W, 23); + RNDr(S, W, 24); + RNDr(S, W, 25); + RNDr(S, W, 26); + RNDr(S, W, 27); + RNDr(S, W, 28); + RNDr(S, W, 29); + RNDr(S, W, 30); + RNDr(S, W, 31); + RNDr(S, W, 32); + RNDr(S, W, 33); + RNDr(S, W, 34); + RNDr(S, W, 35); + RNDr(S, W, 36); + RNDr(S, W, 37); + RNDr(S, W, 38); + RNDr(S, W, 39); + RNDr(S, W, 40); + RNDr(S, W, 41); + RNDr(S, W, 42); + RNDr(S, W, 43); + RNDr(S, W, 44); + RNDr(S, W, 45); + RNDr(S, W, 46); + RNDr(S, W, 47); + RNDr(S, W, 48); + RNDr(S, W, 49); + RNDr(S, W, 50); + RNDr(S, W, 51); + RNDr(S, W, 52); + RNDr(S, W, 53); + RNDr(S, W, 54); + RNDr(S, W, 55); + RNDr(S, W, 56); + RNDr(S, W, 57); + RNDr(S, W, 58); + RNDr(S, W, 59); + RNDr(S, W, 60); + RNDr(S, W, 61); + RNDr(S, W, 62); + RNDr(S, W, 63); /* 4. 
Mix local working variables into global state */ for (i = 0; i < 8; i++) state[i] += S[i]; } + +#if defined(__x86_64__) + +#define SHA256D_WAYS 4 + +void sha256d_4way(uint32_t *hash, uint32_t *data, const uint32_t *midstate); + +#else + +#define SHA256D_WAYS 1 + +static const uint32_t sha256d_hash1[16] = { + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x80000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000100 +}; + +static inline void sha256d(uint32_t *hash, uint32_t *W, + const uint32_t *midstate) +{ + uint32_t S[64]; + uint32_t t0, t1; + int i; + + for (i = 16; i < 64; i += 2) { + W[i] = s1(W[i - 2]) + W[i - 7] + s0(W[i - 15]) + W[i - 16]; + W[i+1] = s1(W[i - 1]) + W[i - 6] + s0(W[i - 14]) + W[i - 15]; + } + + memcpy(S, midstate, 32); + + RNDr(S, W, 0); + RNDr(S, W, 1); + RNDr(S, W, 2); + RNDr(S, W, 3); + RNDr(S, W, 4); + RNDr(S, W, 5); + RNDr(S, W, 6); + RNDr(S, W, 7); + RNDr(S, W, 8); + RNDr(S, W, 9); + RNDr(S, W, 10); + RNDr(S, W, 11); + RNDr(S, W, 12); + RNDr(S, W, 13); + RNDr(S, W, 14); + RNDr(S, W, 15); + RNDr(S, W, 16); + RNDr(S, W, 17); + RNDr(S, W, 18); + RNDr(S, W, 19); + RNDr(S, W, 20); + RNDr(S, W, 21); + RNDr(S, W, 22); + RNDr(S, W, 23); + RNDr(S, W, 24); + RNDr(S, W, 25); + RNDr(S, W, 26); + RNDr(S, W, 27); + RNDr(S, W, 28); + RNDr(S, W, 29); + RNDr(S, W, 30); + RNDr(S, W, 31); + RNDr(S, W, 32); + RNDr(S, W, 33); + RNDr(S, W, 34); + RNDr(S, W, 35); + RNDr(S, W, 36); + RNDr(S, W, 37); + RNDr(S, W, 38); + RNDr(S, W, 39); + RNDr(S, W, 40); + RNDr(S, W, 41); + RNDr(S, W, 42); + RNDr(S, W, 43); + RNDr(S, W, 44); + RNDr(S, W, 45); + RNDr(S, W, 46); + RNDr(S, W, 47); + RNDr(S, W, 48); + RNDr(S, W, 49); + RNDr(S, W, 50); + RNDr(S, W, 51); + RNDr(S, W, 52); + RNDr(S, W, 53); + RNDr(S, W, 54); + RNDr(S, W, 55); + RNDr(S, W, 56); + RNDr(S, W, 57); + RNDr(S, W, 58); + RNDr(S, W, 59); + RNDr(S, W, 60); + RNDr(S, W, 61); + RNDr(S, W, 62); + RNDr(S, W, 63); + + for (i = 0; i < 8; i++) + S[i] += midstate[i]; + + memcpy(S + 8, sha256d_hash1 + 8, 32); + for (i = 16; i < 64; i += 2) { + S[i] = s1(S[i - 2]) + S[i - 7] + s0(S[i - 15]) + S[i - 16]; + S[i+1] = s1(S[i - 1]) + S[i - 6] + s0(S[i - 14]) + S[i - 15]; + } + + sha256_init(hash); + + RNDr(hash, S, 0); + RNDr(hash, S, 1); + RNDr(hash, S, 2); + RNDr(hash, S, 3); + RNDr(hash, S, 4); + RNDr(hash, S, 5); + RNDr(hash, S, 6); + RNDr(hash, S, 7); + RNDr(hash, S, 8); + RNDr(hash, S, 9); + RNDr(hash, S, 10); + RNDr(hash, S, 11); + RNDr(hash, S, 12); + RNDr(hash, S, 13); + RNDr(hash, S, 14); + RNDr(hash, S, 15); + RNDr(hash, S, 16); + RNDr(hash, S, 17); + RNDr(hash, S, 18); + RNDr(hash, S, 19); + RNDr(hash, S, 20); + RNDr(hash, S, 21); + RNDr(hash, S, 22); + RNDr(hash, S, 23); + RNDr(hash, S, 24); + RNDr(hash, S, 25); + RNDr(hash, S, 26); + RNDr(hash, S, 27); + RNDr(hash, S, 28); + RNDr(hash, S, 29); + RNDr(hash, S, 30); + RNDr(hash, S, 31); + RNDr(hash, S, 32); + RNDr(hash, S, 33); + RNDr(hash, S, 34); + RNDr(hash, S, 35); + RNDr(hash, S, 36); + RNDr(hash, S, 37); + RNDr(hash, S, 38); + RNDr(hash, S, 39); + RNDr(hash, S, 40); + RNDr(hash, S, 41); + RNDr(hash, S, 42); + RNDr(hash, S, 43); + RNDr(hash, S, 44); + RNDr(hash, S, 45); + RNDr(hash, S, 46); + RNDr(hash, S, 47); + RNDr(hash, S, 48); + RNDr(hash, S, 49); + RNDr(hash, S, 50); + RNDr(hash, S, 51); + RNDr(hash, S, 52); + RNDr(hash, S, 53); + RNDr(hash, S, 54); + RNDr(hash, S, 55); + RNDr(hash, S, 56); + RNDr(hash, S, 57); + RNDr(hash, S, 58); + RNDr(hash, S, 59); + RNDr(hash, S, 60); + RNDr(hash, S, 
61); + RNDr(hash, S, 62); + RNDr(hash, S, 63); + + for (i = 0; i < 8; i++) + hash[i] += sha256_h[i]; +} + +#endif + +int scanhash_sha256d(int thr_id, uint32_t *pdata, const uint32_t *ptarget, + uint32_t max_nonce, unsigned long *hashes_done) +{ + uint32_t data[SHA256D_WAYS * 64] __attribute__((aligned(128))); + uint32_t hash[SHA256D_WAYS * 8] __attribute__((aligned(32))); + uint32_t midstate[SHA256D_WAYS * 8] __attribute__((aligned(32))); + uint32_t tmp[8]; + uint32_t n = pdata[19] - 1; + const uint32_t Htarg = ptarget[7]; + int i, j; + + for (i = 15; i >= 0; i--) + for (j = 0; j < SHA256D_WAYS; j++) + data[i * SHA256D_WAYS + j] = pdata[16 + i]; + + sha256_init(midstate); + sha256_transform(midstate, pdata, 0); + for (i = 7; i >= 0; i--) + for (j = 0; j < SHA256D_WAYS; j++) + midstate[i * SHA256D_WAYS + j] = midstate[i]; + + do { + for (i = 0; i < SHA256D_WAYS; i++) + data[SHA256D_WAYS * 3 + i] = ++n; + +#if SHA256D_WAYS == 4 + sha256d_4way(hash, data, midstate); +#else + sha256d(hash, data, midstate); +#endif + + for (i = 0; i < SHA256D_WAYS; i++) { + if (hash[SHA256D_WAYS * 7 + i] <= Htarg) { + for (j = 0; j < 8; j++) + tmp[j] = hash[SHA256D_WAYS * j + i]; + if (fulltest(tmp, ptarget)) { + *hashes_done = n - pdata[19] + 1; + pdata[19] = data[SHA256D_WAYS * 3 + i]; + return 1; + } + } + } + } while (n < max_nonce && !work_restart[thr_id].restart); + + *hashes_done = n - pdata[19] + 1; + pdata[19] = n; + return 0; +}
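
Note (not part of the patch): the sketch below is a hypothetical standalone cross-check of the SHA-256d primitive that the new scanhash_sha256d searches over, assuming OpenSSL's libcrypto is available. It just computes SHA-256(SHA-256(header)) over an 80-byte block header; the miner's inner loop performs the same double hash per nonce (4-way SIMD on x86-64, scalar sha256d() otherwise) and then compares word 7 of the digest against ptarget[7] before calling fulltest(). File name and the all-zero header are placeholders.

/*
 * Hypothetical cross-check (not part of this patch): double SHA-256 of an
 * 80-byte block header via OpenSSL, for comparison against the output of
 * the scalar sha256d() fallback added in sha2.c.
 * Build with: cc sha256d_check.c -lcrypto
 */
#include <stdio.h>
#include <string.h>
#include <openssl/sha.h>

int main(void)
{
	unsigned char header[80];               /* raw 80-byte block header */
	unsigned char h1[SHA256_DIGEST_LENGTH];
	unsigned char h2[SHA256_DIGEST_LENGTH];
	int i;

	memset(header, 0, sizeof(header));      /* placeholder: all-zero header */

	SHA256(header, sizeof(header), h1);     /* first SHA-256 pass */
	SHA256(h1, sizeof(h1), h2);             /* second pass: SHA-256d */

	for (i = 0; i < SHA256_DIGEST_LENGTH; i++)
		printf("%02x", h2[i]);
	putchar('\n');
	return 0;
}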