Skip the last three rounds of SHA-256d

This commit is contained in:
pooler 2012-03-25 15:43:49 +02:00
parent 18a34a72ab
commit 9fd497db5e
2 changed files with 466 additions and 312 deletions

View file

@ -95,6 +95,7 @@ sha256_4k:
.long 0xbef9a3f7, 0xbef9a3f7, 0xbef9a3f7, 0xbef9a3f7 .long 0xbef9a3f7, 0xbef9a3f7, 0xbef9a3f7, 0xbef9a3f7
.long 0xc67178f2, 0xc67178f2, 0xc67178f2, 0xc67178f2 .long 0xc67178f2, 0xc67178f2, 0xc67178f2, 0xc67178f2
.text .text
.p2align 6 .p2align 6
.globl sha256_init_4way .globl sha256_init_4way
@ -128,6 +129,40 @@ _sha256_init_4way:
/*
 * sha256_sse2_extend_round i
 *
 * Computes a single message-schedule entry on packed 32-bit lanes:
 *   W[i] = s1(W[i-2]) + W[i-7] + s0(W[i-15]) + W[i-16]
 * where entry k is the 16-byte vector at k*16(%rax) and, as implemented
 * by the shift sequences below,
 *   s0(x) = (x >> 3)  ^ ror(x, 7)  ^ ror(x, 18)
 *   s1(y) = (y >> 10) ^ ror(y, 17) ^ ror(y, 19)
 * SSE2 has no packed rotate, so each rotate is built from a right shift
 * and a left shift whose results are XORed together.
 *
 * In:      %rax = base address of the W array (movdqa requires 16-byte
 *          alignment of every accessed slot)
 * Out:     W[i] stored at i*16(%rax); the same value is left in %xmm0
 * Clobber: %xmm0, %xmm1, %xmm2, %xmm3
 * EFLAGS is not modified (only SSE2 data instructions are used).
 *
 * NOTE(review): the .macro header below appears twice on one line in this
 * rendering; it is kept verbatim here.
 */
.macro sha256_sse2_extend_round i .macro sha256_sse2_extend_round i
/* x = W[i-15]; keep a second copy in xmm2 for the left-shift halves */
movdqa (\i-15)*16(%rax), %xmm0
movdqa %xmm0, %xmm2
/* xmm0 = x >> 3 */
psrld $3, %xmm0
movdqa %xmm0, %xmm1
/* xmm2 = x << 14 (upper part of ror 18) */
pslld $14, %xmm2
/* xmm1 = (x >> 3) >> 4 = x >> 7 (lower part of ror 7) */
psrld $4, %xmm1
pxor %xmm1, %xmm0
pxor %xmm2, %xmm0
/* xmm1 = x >> 18, xmm2 = x << 25: the remaining halves of both rotates */
psrld $11, %xmm1
pslld $11, %xmm2
/* xmm0 = s0(x) */
pxor %xmm1, %xmm0
pxor %xmm2, %xmm0
/* y = W[i-2]; copy in xmm2 for the left-shift halves */
movdqa (\i-2)*16(%rax), %xmm3
movdqa %xmm3, %xmm2
/* accumulate the two plain terms: xmm0 = s0(x) + W[i-16] + W[i-7] */
paddd (\i-16)*16(%rax), %xmm0
paddd (\i-7)*16(%rax), %xmm0
/* xmm3 = y >> 10 */
psrld $10, %xmm3
movdqa %xmm3, %xmm1
/* xmm2 = y << 13 (upper part of ror 19) */
pslld $13, %xmm2
/* xmm1 = (y >> 10) >> 7 = y >> 17 (lower part of ror 17) */
psrld $7, %xmm1
pxor %xmm1, %xmm3
pxor %xmm2, %xmm3
/* xmm1 = y >> 19, xmm2 = y << 15: the remaining halves of both rotates */
psrld $2, %xmm1
pslld $2, %xmm2
/* xmm3 = s1(y) */
pxor %xmm1, %xmm3
pxor %xmm2, %xmm3
/* W[i] = s0(x) + W[i-16] + W[i-7] + s1(y) */
paddd %xmm3, %xmm0
movdqa %xmm0, \i*16(%rax)
.endm
.macro sha256_sse2_extend_doubleround i
movdqa (\i-15)*16(%rax), %xmm0 movdqa (\i-15)*16(%rax), %xmm0
movdqa (\i-14)*16(%rax), %xmm4 movdqa (\i-14)*16(%rax), %xmm4
movdqa %xmm0, %xmm2 movdqa %xmm0, %xmm2
@ -193,36 +228,6 @@ _sha256_init_4way:
movdqa %xmm4, (\i+1)*16(%rax) movdqa %xmm4, (\i+1)*16(%rax)
.endm .endm
.text
.p2align 6
sha256_sse2_extend_loop:
sha256_sse2_extend_round 0
sha256_sse2_extend_loop_pre:
sha256_sse2_extend_round 2
sha256_sse2_extend_round 4
sha256_sse2_extend_round 6
sha256_sse2_extend_round 8
sha256_sse2_extend_round 10
sha256_sse2_extend_round 12
sha256_sse2_extend_round 14
sha256_sse2_extend_round 16
sha256_sse2_extend_round 18
sha256_sse2_extend_round 20
sha256_sse2_extend_round 22
sha256_sse2_extend_round 24
sha256_sse2_extend_round 26
sha256_sse2_extend_round 28
sha256_sse2_extend_round 30
sha256_sse2_extend_round 32
sha256_sse2_extend_round 34
sha256_sse2_extend_round 36
sha256_sse2_extend_round 38
sha256_sse2_extend_round 40
sha256_sse2_extend_round 42
sha256_sse2_extend_round 44
sha256_sse2_extend_round 46
ret
.macro sha256_sse2_main_round i .macro sha256_sse2_main_round i
movdqa 16*\i(%rax), %xmm6 movdqa 16*\i(%rax), %xmm6
paddd 16*\i(%rcx), %xmm6 paddd 16*\i(%rcx), %xmm6
@ -288,80 +293,39 @@ sha256_sse2_extend_loop_pre:
paddd %xmm6, %xmm7 paddd %xmm6, %xmm7
.endm .endm
.text
.p2align 6
sha256_sse2_main_loop:
sha256_sse2_main_round 0
sha256_sse2_main_round 1
sha256_sse2_main_round 2
sha256_sse2_main_loop_pre:
sha256_sse2_main_round 3
sha256_sse2_main_round 4
sha256_sse2_main_round 5
sha256_sse2_main_round 6
sha256_sse2_main_round 7
sha256_sse2_main_round 8
sha256_sse2_main_round 9
sha256_sse2_main_round 10
sha256_sse2_main_round 11
sha256_sse2_main_round 12
sha256_sse2_main_round 13
sha256_sse2_main_round 14
sha256_sse2_main_round 15
sha256_sse2_main_round 16
sha256_sse2_main_round 17
sha256_sse2_main_round 18
sha256_sse2_main_round 19
sha256_sse2_main_round 20
sha256_sse2_main_round 21
sha256_sse2_main_round 22
sha256_sse2_main_round 23
sha256_sse2_main_round 24
sha256_sse2_main_round 25
sha256_sse2_main_round 26
sha256_sse2_main_round 27
sha256_sse2_main_round 28
sha256_sse2_main_round 29
sha256_sse2_main_round 30
sha256_sse2_main_round 31
sha256_sse2_main_round 32
sha256_sse2_main_round 33
sha256_sse2_main_round 34
sha256_sse2_main_round 35
sha256_sse2_main_round 36
sha256_sse2_main_round 37
sha256_sse2_main_round 38
sha256_sse2_main_round 39
sha256_sse2_main_round 40
sha256_sse2_main_round 41
sha256_sse2_main_round 42
sha256_sse2_main_round 43
sha256_sse2_main_round 44
sha256_sse2_main_round 45
sha256_sse2_main_round 46
sha256_sse2_main_round 47
sha256_sse2_main_round 48
sha256_sse2_main_round 49
sha256_sse2_main_round 50
sha256_sse2_main_round 51
sha256_sse2_main_round 52
sha256_sse2_main_round 53
sha256_sse2_main_round 54
sha256_sse2_main_round 55
sha256_sse2_main_round 56
sha256_sse2_main_round 57
sha256_sse2_main_round 58
sha256_sse2_main_round 59
sha256_sse2_main_round 60
sha256_sse2_main_round 61
sha256_sse2_main_round 62
sha256_sse2_main_round 63
ret
#if defined(USE_AVX) #if defined(USE_AVX)
/*
 * sha256_avx_extend_round i
 *
 * AVX (three-operand, 128-bit) form of the single message-schedule step:
 *   W[i] = s1(W[i-2]) + W[i-7] + s0(W[i-15]) + W[i-16]
 * where entry k is the 16-byte vector at k*16(%rax) and, as implemented
 * by the shift sequences below,
 *   s0(x) = (x >> 3)  ^ ror(x, 7)  ^ ror(x, 18)
 *   s1(y) = (y >> 10) ^ ror(y, 17) ^ ror(y, 19)
 * Each rotate is assembled from one right shift and one left shift.
 *
 * In:      %rax = base address of the W array (vmovdqa requires 16-byte
 *          alignment of every accessed slot)
 * Out:     W[i] stored at i*16(%rax); the same value is left in %xmm0
 * Clobber: %xmm0, %xmm1, %xmm2, %xmm3
 * EFLAGS is not modified (only vector data instructions are used).
 *
 * NOTE(review): the .macro header below appears twice on one line in this
 * rendering; it is kept verbatim here.
 */
.macro sha256_avx_extend_round i .macro sha256_avx_extend_round i
/* x = W[i-15] */
vmovdqa (\i-15)*16(%rax), %xmm0
/* xmm2 = x << 14 (upper part of ror 18), taken before xmm0 is overwritten */
vpslld $14, %xmm0, %xmm2
/* xmm0 = x >> 3 */
vpsrld $3, %xmm0, %xmm0
/* xmm1 = (x >> 3) >> 4 = x >> 7 (lower part of ror 7) */
vpsrld $4, %xmm0, %xmm1
vpxor %xmm1, %xmm0, %xmm0
vpxor %xmm2, %xmm0, %xmm0
/* xmm1 = x >> 18, xmm2 = x << 25: the remaining halves of both rotates */
vpsrld $11, %xmm1, %xmm1
vpslld $11, %xmm2, %xmm2
/* xmm0 = s0(x) */
vpxor %xmm1, %xmm0, %xmm0
vpxor %xmm2, %xmm0, %xmm0
/* y = W[i-2] */
vmovdqa (\i-2)*16(%rax), %xmm3
/* accumulate the two plain terms: xmm0 = s0(x) + W[i-16] + W[i-7] */
vpaddd (\i-16)*16(%rax), %xmm0, %xmm0
vpaddd (\i-7)*16(%rax), %xmm0, %xmm0
/* xmm2 = y << 13 (upper part of ror 19), taken before xmm3 is overwritten */
vpslld $13, %xmm3, %xmm2
/* xmm3 = y >> 10 */
vpsrld $10, %xmm3, %xmm3
/* xmm1 = (y >> 10) >> 7 = y >> 17 (lower part of ror 17) */
vpsrld $7, %xmm3, %xmm1
vpxor %xmm1, %xmm3, %xmm3
vpxor %xmm2, %xmm3, %xmm3
/* xmm1 = y >> 19, xmm2 = y << 15: the remaining halves of both rotates */
vpsrld $2, %xmm1, %xmm1
vpslld $2, %xmm2, %xmm2
/* xmm3 = s1(y) */
vpxor %xmm1, %xmm3, %xmm3
vpxor %xmm2, %xmm3, %xmm3
/* W[i] = s0(x) + W[i-16] + W[i-7] + s1(y) */
vpaddd %xmm3, %xmm0, %xmm0
vmovdqa %xmm0, \i*16(%rax)
.endm
.macro sha256_avx_extend_doubleround i
vmovdqa (\i-15)*16(%rax), %xmm0 vmovdqa (\i-15)*16(%rax), %xmm0
vmovdqa (\i-14)*16(%rax), %xmm4 vmovdqa (\i-14)*16(%rax), %xmm4
vpslld $14, %xmm0, %xmm2 vpslld $14, %xmm0, %xmm2
@ -419,36 +383,6 @@ sha256_sse2_main_loop_pre:
vmovdqa %xmm4, (\i+1)*16(%rax) vmovdqa %xmm4, (\i+1)*16(%rax)
.endm .endm
.text
.p2align 6
sha256_avx_extend_loop:
sha256_avx_extend_round 0
sha256_avx_extend_loop_pre:
sha256_avx_extend_round 2
sha256_avx_extend_round 4
sha256_avx_extend_round 6
sha256_avx_extend_round 8
sha256_avx_extend_round 10
sha256_avx_extend_round 12
sha256_avx_extend_round 14
sha256_avx_extend_round 16
sha256_avx_extend_round 18
sha256_avx_extend_round 20
sha256_avx_extend_round 22
sha256_avx_extend_round 24
sha256_avx_extend_round 26
sha256_avx_extend_round 28
sha256_avx_extend_round 30
sha256_avx_extend_round 32
sha256_avx_extend_round 34
sha256_avx_extend_round 36
sha256_avx_extend_round 38
sha256_avx_extend_round 40
sha256_avx_extend_round 42
sha256_avx_extend_round 44
sha256_avx_extend_round 46
ret
.macro sha256_avx_main_round i, r0, r1, r2, r3, r4, r5, r6, r7 .macro sha256_avx_main_round i, r0, r1, r2, r3, r4, r5, r6, r7
vpaddd 16*(\i)(%rax), \r0, %xmm6 vpaddd 16*(\i)(%rax), \r0, %xmm6
vpaddd 16*(\i)(%rcx), %xmm6, %xmm6 vpaddd 16*(\i)(%rcx), %xmm6, %xmm6
@ -501,37 +435,33 @@ sha256_avx_extend_loop_pre:
sha256_avx_main_round \i+3, %xmm0, %xmm10, %xmm9, %xmm8, %xmm7, %xmm3, %xmm4, %xmm5 sha256_avx_main_round \i+3, %xmm0, %xmm10, %xmm9, %xmm8, %xmm7, %xmm3, %xmm4, %xmm5
.endm .endm
.text
.p2align 6
sha256_avx_main_loop:
sha256_avx_main_round 0, %xmm10, %xmm9, %xmm8, %xmm0, %xmm3, %xmm4, %xmm5, %xmm7
sha256_avx_main_round 1, %xmm9, %xmm8, %xmm0, %xmm10, %xmm4, %xmm5, %xmm7, %xmm3
sha256_avx_main_round 2, %xmm8, %xmm0, %xmm10, %xmm9, %xmm5, %xmm7, %xmm3, %xmm4
sha256_avx_main_loop_pre:
sha256_avx_main_round 3, %xmm0, %xmm10, %xmm9, %xmm8, %xmm7, %xmm3, %xmm4, %xmm5
sha256_avx_main_quadround 4
sha256_avx_main_quadround 8
sha256_avx_main_quadround 12
sha256_avx_main_quadround 16
sha256_avx_main_quadround 20
sha256_avx_main_quadround 24
sha256_avx_main_quadround 28
sha256_avx_main_quadround 32
sha256_avx_main_quadround 36
sha256_avx_main_quadround 40
sha256_avx_main_quadround 44
sha256_avx_main_quadround 48
sha256_avx_main_quadround 52
sha256_avx_main_quadround 56
sha256_avx_main_quadround 60
ret
#endif /* USE_AVX */ #endif /* USE_AVX */
#if defined(USE_XOP) #if defined(USE_XOP)
/*
 * sha256_xop_extend_round i
 *
 * XOP form of the single message-schedule step:
 *   W[i] = s1(W[i-2]) + W[i-7] + s0(W[i-15]) + W[i-16]
 * where entry k is the 16-byte vector at k*16(%rax).
 * vprotd rotates each 32-bit lane LEFT by the immediate, so the counts
 * used below correspond to these right rotates:
 *   $25 -> ror 7,  $14 -> ror 18   (s0(x) = ror7  ^ ror18 ^ (x >> 3))
 *   $15 -> ror 17, $13 -> ror 19   (s1(y) = ror17 ^ ror19 ^ (y >> 10))
 *
 * In:      %rax = base address of the W array (vmovdqa requires 16-byte
 *          alignment of every accessed slot)
 * Out:     W[i] stored at i*16(%rax); the same value is left in %xmm0
 * Clobber: %xmm0, %xmm1, %xmm2, %xmm3
 * EFLAGS is not modified (only vector data instructions are used).
 *
 * NOTE(review): the .macro header below appears twice on one line in this
 * rendering; it is kept verbatim here.
 */
.macro sha256_xop_extend_round i .macro sha256_xop_extend_round i
/* x = W[i-15] */
vmovdqa (\i-15)*16(%rax), %xmm0
/* xmm1 = ror(x, 7), xmm2 = ror(x, 18) */
vprotd $25, %xmm0, %xmm1
vprotd $14, %xmm0, %xmm2
/* xmm0 = x >> 3 */
vpsrld $3, %xmm0, %xmm0
/* xmm0 = s0(x) */
vpxor %xmm1, %xmm2, %xmm2
vpxor %xmm2, %xmm0, %xmm0
/* y = W[i-2] */
vmovdqa (\i-2)*16(%rax), %xmm3
/* accumulate the two plain terms: xmm0 = s0(x) + W[i-16] + W[i-7] */
vpaddd (\i-16)*16(%rax), %xmm0, %xmm0
vpaddd (\i-7)*16(%rax), %xmm0, %xmm0
/* xmm1 = ror(y, 17), xmm2 = ror(y, 19) */
vprotd $15, %xmm3, %xmm1
vprotd $13, %xmm3, %xmm2
/* xmm3 = y >> 10 */
vpsrld $10, %xmm3, %xmm3
/* xmm3 = s1(y) */
vpxor %xmm1, %xmm2, %xmm2
vpxor %xmm2, %xmm3, %xmm3
/* W[i] = s0(x) + W[i-16] + W[i-7] + s1(y) */
vpaddd %xmm3, %xmm0, %xmm0
vmovdqa %xmm0, \i*16(%rax)
.endm
.macro sha256_xop_extend_doubleround i
vmovdqa (\i-15)*16(%rax), %xmm0 vmovdqa (\i-15)*16(%rax), %xmm0
vmovdqa (\i-14)*16(%rax), %xmm4 vmovdqa (\i-14)*16(%rax), %xmm4
vprotd $25, %xmm0, %xmm1 vprotd $25, %xmm0, %xmm1
@ -571,36 +501,6 @@ sha256_avx_main_loop_pre:
vmovdqa %xmm4, (\i+1)*16(%rax) vmovdqa %xmm4, (\i+1)*16(%rax)
.endm .endm
.text
.p2align 6
sha256_xop_extend_loop:
sha256_xop_extend_round 0
sha256_xop_extend_loop_pre:
sha256_xop_extend_round 2
sha256_xop_extend_round 4
sha256_xop_extend_round 6
sha256_xop_extend_round 8
sha256_xop_extend_round 10
sha256_xop_extend_round 12
sha256_xop_extend_round 14
sha256_xop_extend_round 16
sha256_xop_extend_round 18
sha256_xop_extend_round 20
sha256_xop_extend_round 22
sha256_xop_extend_round 24
sha256_xop_extend_round 26
sha256_xop_extend_round 28
sha256_xop_extend_round 30
sha256_xop_extend_round 32
sha256_xop_extend_round 34
sha256_xop_extend_round 36
sha256_xop_extend_round 38
sha256_xop_extend_round 40
sha256_xop_extend_round 42
sha256_xop_extend_round 44
sha256_xop_extend_round 46
ret
.macro sha256_xop_main_round i, r0, r1, r2, r3, r4, r5, r6, r7 .macro sha256_xop_main_round i, r0, r1, r2, r3, r4, r5, r6, r7
vpaddd 16*(\i)(%rax), \r0, %xmm6 vpaddd 16*(\i)(%rax), \r0, %xmm6
vpaddd 16*(\i)(%rcx), %xmm6, %xmm6 vpaddd 16*(\i)(%rcx), %xmm6, %xmm6
@ -641,31 +541,6 @@ sha256_xop_extend_loop_pre:
sha256_xop_main_round \i+3, %xmm0, %xmm10, %xmm9, %xmm8, %xmm7, %xmm3, %xmm4, %xmm5 sha256_xop_main_round \i+3, %xmm0, %xmm10, %xmm9, %xmm8, %xmm7, %xmm3, %xmm4, %xmm5
.endm .endm
.text
.p2align 6
sha256_xop_main_loop:
sha256_xop_main_round 0, %xmm10, %xmm9, %xmm8, %xmm0, %xmm3, %xmm4, %xmm5, %xmm7
sha256_xop_main_round 1, %xmm9, %xmm8, %xmm0, %xmm10, %xmm4, %xmm5, %xmm7, %xmm3
sha256_xop_main_round 2, %xmm8, %xmm0, %xmm10, %xmm9, %xmm5, %xmm7, %xmm3, %xmm4
sha256_xop_main_loop_pre:
sha256_xop_main_round 3, %xmm0, %xmm10, %xmm9, %xmm8, %xmm7, %xmm3, %xmm4, %xmm5
sha256_xop_main_quadround 4
sha256_xop_main_quadround 8
sha256_xop_main_quadround 12
sha256_xop_main_quadround 16
sha256_xop_main_quadround 20
sha256_xop_main_quadround 24
sha256_xop_main_quadround 28
sha256_xop_main_quadround 32
sha256_xop_main_quadround 36
sha256_xop_main_quadround 40
sha256_xop_main_quadround 44
sha256_xop_main_quadround 48
sha256_xop_main_quadround 52
sha256_xop_main_quadround 56
sha256_xop_main_quadround 60
ret
#endif /* USE_XOP */ #endif /* USE_XOP */
@ -828,7 +703,30 @@ sha256_transform_4way_sse2_main_loop:
.p2align 6 .p2align 6
sha256_transform_4way_core_avx: sha256_transform_4way_core_avx:
leaq 256(%rsp), %rax leaq 256(%rsp), %rax
call sha256_avx_extend_loop sha256_avx_extend_doubleround 0
sha256_avx_extend_doubleround 2
sha256_avx_extend_doubleround 4
sha256_avx_extend_doubleround 6
sha256_avx_extend_doubleround 8
sha256_avx_extend_doubleround 10
sha256_avx_extend_doubleround 12
sha256_avx_extend_doubleround 14
sha256_avx_extend_doubleround 16
sha256_avx_extend_doubleround 18
sha256_avx_extend_doubleround 20
sha256_avx_extend_doubleround 22
sha256_avx_extend_doubleround 24
sha256_avx_extend_doubleround 26
sha256_avx_extend_doubleround 28
sha256_avx_extend_doubleround 30
sha256_avx_extend_doubleround 32
sha256_avx_extend_doubleround 34
sha256_avx_extend_doubleround 36
sha256_avx_extend_doubleround 38
sha256_avx_extend_doubleround 40
sha256_avx_extend_doubleround 42
sha256_avx_extend_doubleround 44
sha256_avx_extend_doubleround 46
movdqu 0(%rdi), %xmm7 movdqu 0(%rdi), %xmm7
movdqu 16(%rdi), %xmm5 movdqu 16(%rdi), %xmm5
movdqu 32(%rdi), %xmm4 movdqu 32(%rdi), %xmm4
@ -839,7 +737,22 @@ sha256_transform_4way_core_avx:
movdqu 112(%rdi), %xmm10 movdqu 112(%rdi), %xmm10
movq %rsp, %rax movq %rsp, %rax
leaq sha256_4k(%rip), %rcx leaq sha256_4k(%rip), %rcx
call sha256_avx_main_loop sha256_avx_main_quadround 0
sha256_avx_main_quadround 4
sha256_avx_main_quadround 8
sha256_avx_main_quadround 12
sha256_avx_main_quadround 16
sha256_avx_main_quadround 20
sha256_avx_main_quadround 24
sha256_avx_main_quadround 28
sha256_avx_main_quadround 32
sha256_avx_main_quadround 36
sha256_avx_main_quadround 40
sha256_avx_main_quadround 44
sha256_avx_main_quadround 48
sha256_avx_main_quadround 52
sha256_avx_main_quadround 56
sha256_avx_main_quadround 60
jmp sha256_transform_4way_finish jmp sha256_transform_4way_finish
#endif /* USE_AVX */ #endif /* USE_AVX */
@ -849,7 +762,30 @@ sha256_transform_4way_core_avx:
.p2align 6 .p2align 6
sha256_transform_4way_core_xop: sha256_transform_4way_core_xop:
leaq 256(%rsp), %rax leaq 256(%rsp), %rax
call sha256_xop_extend_loop sha256_xop_extend_doubleround 0
sha256_xop_extend_doubleround 2
sha256_xop_extend_doubleround 4
sha256_xop_extend_doubleround 6
sha256_xop_extend_doubleround 8
sha256_xop_extend_doubleround 10
sha256_xop_extend_doubleround 12
sha256_xop_extend_doubleround 14
sha256_xop_extend_doubleround 16
sha256_xop_extend_doubleround 18
sha256_xop_extend_doubleround 20
sha256_xop_extend_doubleround 22
sha256_xop_extend_doubleround 24
sha256_xop_extend_doubleround 26
sha256_xop_extend_doubleround 28
sha256_xop_extend_doubleround 30
sha256_xop_extend_doubleround 32
sha256_xop_extend_doubleround 34
sha256_xop_extend_doubleround 36
sha256_xop_extend_doubleround 38
sha256_xop_extend_doubleround 40
sha256_xop_extend_doubleround 42
sha256_xop_extend_doubleround 44
sha256_xop_extend_doubleround 46
movdqu 0(%rdi), %xmm7 movdqu 0(%rdi), %xmm7
movdqu 16(%rdi), %xmm5 movdqu 16(%rdi), %xmm5
movdqu 32(%rdi), %xmm4 movdqu 32(%rdi), %xmm4
@ -860,7 +796,22 @@ sha256_transform_4way_core_xop:
movdqu 112(%rdi), %xmm10 movdqu 112(%rdi), %xmm10
movq %rsp, %rax movq %rsp, %rax
leaq sha256_4k(%rip), %rcx leaq sha256_4k(%rip), %rcx
call sha256_xop_main_loop sha256_xop_main_quadround 0
sha256_xop_main_quadround 4
sha256_xop_main_quadround 8
sha256_xop_main_quadround 12
sha256_xop_main_quadround 16
sha256_xop_main_quadround 20
sha256_xop_main_quadround 24
sha256_xop_main_quadround 28
sha256_xop_main_quadround 32
sha256_xop_main_quadround 36
sha256_xop_main_quadround 40
sha256_xop_main_quadround 44
sha256_xop_main_quadround 48
sha256_xop_main_quadround 52
sha256_xop_main_quadround 56
sha256_xop_main_quadround 60
jmp sha256_transform_4way_finish jmp sha256_transform_4way_finish
#endif /* USE_XOP */ #endif /* USE_XOP */
@ -1007,20 +958,20 @@ sha256_transform_4way_finish:
.data .data
.p2align 3 .p2align 3
sha256d_4way_addr: sha256d_ms_4way_addr:
.quad 0x0 .quad 0x0
.text .text
.p2align 6 .p2align 6
.globl sha256d_4way .globl sha256d_ms_4way
.globl _sha256d_4way .globl _sha256d_ms_4way
sha256d_4way: sha256d_ms_4way:
_sha256d_4way: _sha256d_ms_4way:
jmp *sha256d_4way_addr(%rip) jmp *sha256d_ms_4way_addr(%rip)
.p2align 6 .p2align 6
sha256d_4way_sse2: sha256d_ms_4way_sse2:
#if defined(WIN64) #if defined(WIN64)
pushq %rdi pushq %rdi
subq $80, %rsp subq $80, %rsp
@ -1038,7 +989,35 @@ sha256d_4way_sse2:
subq $1032, %rsp subq $1032, %rsp
leaq 256(%rsi), %rax leaq 256(%rsi), %rax
call sha256_sse2_extend_loop_pre jmp sha256d_ms_4way_sse2_extend_loop1
sha256d_ms_4way_sse2_extend_loop2:
sha256_sse2_extend_doubleround 0
sha256d_ms_4way_sse2_extend_loop1:
sha256_sse2_extend_doubleround 2
sha256_sse2_extend_doubleround 4
sha256_sse2_extend_doubleround 6
sha256_sse2_extend_doubleround 8
sha256_sse2_extend_doubleround 10
sha256_sse2_extend_doubleround 12
sha256_sse2_extend_doubleround 14
sha256_sse2_extend_doubleround 16
sha256_sse2_extend_doubleround 18
sha256_sse2_extend_doubleround 20
sha256_sse2_extend_doubleround 22
sha256_sse2_extend_doubleround 24
sha256_sse2_extend_doubleround 26
sha256_sse2_extend_doubleround 28
sha256_sse2_extend_doubleround 30
sha256_sse2_extend_doubleround 32
sha256_sse2_extend_doubleround 34
sha256_sse2_extend_doubleround 36
sha256_sse2_extend_doubleround 38
sha256_sse2_extend_doubleround 40
sha256_sse2_extend_doubleround 42
jz sha256d_ms_4way_sse2_extend_coda2
sha256_sse2_extend_doubleround 44
sha256_sse2_extend_doubleround 46
movdqa 0(%rcx), %xmm3 movdqa 0(%rcx), %xmm3
movdqa 16(%rcx), %xmm0 movdqa 16(%rcx), %xmm0
@ -1051,7 +1030,75 @@ sha256d_4way_sse2:
movq %rsi, %rax movq %rsi, %rax
leaq sha256_4k(%rip), %rcx leaq sha256_4k(%rip), %rcx
call sha256_sse2_main_loop_pre jmp sha256d_ms_4way_sse2_main_loop1
sha256d_ms_4way_sse2_main_loop2:
sha256_sse2_main_round 0
sha256_sse2_main_round 1
sha256_sse2_main_round 2
sha256d_ms_4way_sse2_main_loop1:
sha256_sse2_main_round 3
sha256_sse2_main_round 4
sha256_sse2_main_round 5
sha256_sse2_main_round 6
sha256_sse2_main_round 7
sha256_sse2_main_round 8
sha256_sse2_main_round 9
sha256_sse2_main_round 10
sha256_sse2_main_round 11
sha256_sse2_main_round 12
sha256_sse2_main_round 13
sha256_sse2_main_round 14
sha256_sse2_main_round 15
sha256_sse2_main_round 16
sha256_sse2_main_round 17
sha256_sse2_main_round 18
sha256_sse2_main_round 19
sha256_sse2_main_round 20
sha256_sse2_main_round 21
sha256_sse2_main_round 22
sha256_sse2_main_round 23
sha256_sse2_main_round 24
sha256_sse2_main_round 25
sha256_sse2_main_round 26
sha256_sse2_main_round 27
sha256_sse2_main_round 28
sha256_sse2_main_round 29
sha256_sse2_main_round 30
sha256_sse2_main_round 31
sha256_sse2_main_round 32
sha256_sse2_main_round 33
sha256_sse2_main_round 34
sha256_sse2_main_round 35
sha256_sse2_main_round 36
sha256_sse2_main_round 37
sha256_sse2_main_round 38
sha256_sse2_main_round 39
sha256_sse2_main_round 40
sha256_sse2_main_round 41
sha256_sse2_main_round 42
sha256_sse2_main_round 43
sha256_sse2_main_round 44
sha256_sse2_main_round 45
sha256_sse2_main_round 46
sha256_sse2_main_round 47
sha256_sse2_main_round 48
sha256_sse2_main_round 49
sha256_sse2_main_round 50
sha256_sse2_main_round 51
sha256_sse2_main_round 52
sha256_sse2_main_round 53
sha256_sse2_main_round 54
sha256_sse2_main_round 55
sha256_sse2_main_round 56
sha256_sse2_main_round 57
sha256_sse2_main_round 58
sha256_sse2_main_round 59
sha256_sse2_main_round 60
jz sha256d_ms_4way_sse2_finish
sha256_sse2_main_round 61
sha256_sse2_main_round 62
sha256_sse2_main_round 63
paddd 0(%rdx), %xmm7 paddd 0(%rdx), %xmm7
paddd 16(%rdx), %xmm5 paddd 16(%rdx), %xmm5
@ -1086,7 +1133,11 @@ sha256d_4way_sse2:
movdqa %xmm1, 240(%rsp) movdqa %xmm1, 240(%rsp)
leaq 256(%rsp), %rax leaq 256(%rsp), %rax
call sha256_sse2_extend_loop cmpq %rax, %rax
jmp sha256d_ms_4way_sse2_extend_loop2
sha256d_ms_4way_sse2_extend_coda2:
sha256_sse2_extend_round 44
movdqa sha256_4h+0(%rip), %xmm7 movdqa sha256_4h+0(%rip), %xmm7
movdqa sha256_4h+16(%rip), %xmm5 movdqa sha256_4h+16(%rip), %xmm5
@ -1099,25 +1150,11 @@ sha256d_4way_sse2:
movq %rsp, %rax movq %rsp, %rax
leaq sha256_4k(%rip), %rcx leaq sha256_4k(%rip), %rcx
call sha256_sse2_main_loop jmp sha256d_ms_4way_sse2_main_loop2
paddd sha256_4h+0(%rip), %xmm7 sha256d_ms_4way_sse2_finish:
paddd sha256_4h+16(%rip), %xmm5 paddd sha256_4h+112(%rip), %xmm0
paddd sha256_4h+32(%rip), %xmm4 movdqa %xmm0, 112(%rdi)
paddd sha256_4h+48(%rip), %xmm3
paddd sha256_4h+64(%rip), %xmm0
paddd sha256_4h+80(%rip), %xmm8
paddd sha256_4h+96(%rip), %xmm9
paddd sha256_4h+112(%rip), %xmm10
movdqa %xmm7, 0(%rdi)
movdqa %xmm5, 16(%rdi)
movdqa %xmm4, 32(%rdi)
movdqa %xmm3, 48(%rdi)
movdqa %xmm0, 64(%rdi)
movdqa %xmm8, 80(%rdi)
movdqa %xmm9, 96(%rdi)
movdqa %xmm10, 112(%rdi)
addq $1032, %rsp addq $1032, %rsp
#if defined(WIN64) #if defined(WIN64)
@ -1136,7 +1173,7 @@ sha256d_4way_sse2:
#if defined(USE_AVX) #if defined(USE_AVX)
.p2align 6 .p2align 6
sha256d_4way_avx: sha256d_ms_4way_avx:
#if defined(WIN64) #if defined(WIN64)
pushq %rdi pushq %rdi
subq $80, %rsp subq $80, %rsp
@ -1154,7 +1191,35 @@ sha256d_4way_avx:
subq $1032, %rsp subq $1032, %rsp
leaq 256(%rsi), %rax leaq 256(%rsi), %rax
call sha256_avx_extend_loop_pre jmp sha256d_ms_4way_avx_extend_loop1
sha256d_ms_4way_avx_extend_loop2:
sha256_avx_extend_doubleround 0
sha256d_ms_4way_avx_extend_loop1:
sha256_avx_extend_doubleround 2
sha256_avx_extend_doubleround 4
sha256_avx_extend_doubleround 6
sha256_avx_extend_doubleround 8
sha256_avx_extend_doubleround 10
sha256_avx_extend_doubleround 12
sha256_avx_extend_doubleround 14
sha256_avx_extend_doubleround 16
sha256_avx_extend_doubleround 18
sha256_avx_extend_doubleround 20
sha256_avx_extend_doubleround 22
sha256_avx_extend_doubleround 24
sha256_avx_extend_doubleround 26
sha256_avx_extend_doubleround 28
sha256_avx_extend_doubleround 30
sha256_avx_extend_doubleround 32
sha256_avx_extend_doubleround 34
sha256_avx_extend_doubleround 36
sha256_avx_extend_doubleround 38
sha256_avx_extend_doubleround 40
sha256_avx_extend_doubleround 42
jz sha256d_ms_4way_avx_extend_coda2
sha256_avx_extend_doubleround 44
sha256_avx_extend_doubleround 46
movdqa 0(%rcx), %xmm7 movdqa 0(%rcx), %xmm7
movdqa 16(%rcx), %xmm8 movdqa 16(%rcx), %xmm8
@ -1167,7 +1232,33 @@ sha256d_4way_avx:
movq %rsi, %rax movq %rsi, %rax
leaq sha256_4k(%rip), %rcx leaq sha256_4k(%rip), %rcx
call sha256_avx_main_loop_pre jmp sha256d_ms_4way_avx_main_loop1
sha256d_ms_4way_avx_main_loop2:
sha256_avx_main_round 0, %xmm10, %xmm9, %xmm8, %xmm0, %xmm3, %xmm4, %xmm5, %xmm7
sha256_avx_main_round 1, %xmm9, %xmm8, %xmm0, %xmm10, %xmm4, %xmm5, %xmm7, %xmm3
sha256_avx_main_round 2, %xmm8, %xmm0, %xmm10, %xmm9, %xmm5, %xmm7, %xmm3, %xmm4
sha256d_ms_4way_avx_main_loop1:
sha256_avx_main_round 3, %xmm0, %xmm10, %xmm9, %xmm8, %xmm7, %xmm3, %xmm4, %xmm5
sha256_avx_main_quadround 4
sha256_avx_main_quadround 8
sha256_avx_main_quadround 12
sha256_avx_main_quadround 16
sha256_avx_main_quadround 20
sha256_avx_main_quadround 24
sha256_avx_main_quadround 28
sha256_avx_main_quadround 32
sha256_avx_main_quadround 36
sha256_avx_main_quadround 40
sha256_avx_main_quadround 44
sha256_avx_main_quadround 48
sha256_avx_main_quadround 52
sha256_avx_main_quadround 56
sha256_avx_main_round 60, %xmm10, %xmm9, %xmm8, %xmm0, %xmm3, %xmm4, %xmm5, %xmm7
jz sha256d_ms_4way_avx_finish
sha256_avx_main_round 61, %xmm9, %xmm8, %xmm0, %xmm10, %xmm4, %xmm5, %xmm7, %xmm3
sha256_avx_main_round 62, %xmm8, %xmm0, %xmm10, %xmm9, %xmm5, %xmm7, %xmm3, %xmm4
sha256_avx_main_round 63, %xmm0, %xmm10, %xmm9, %xmm8, %xmm7, %xmm3, %xmm4, %xmm5
paddd 0(%rdx), %xmm7 paddd 0(%rdx), %xmm7
paddd 16(%rdx), %xmm5 paddd 16(%rdx), %xmm5
@ -1202,7 +1293,11 @@ sha256d_4way_avx:
movdqa %xmm1, 240(%rsp) movdqa %xmm1, 240(%rsp)
leaq 256(%rsp), %rax leaq 256(%rsp), %rax
call sha256_avx_extend_loop cmpq %rax, %rax
jmp sha256d_ms_4way_avx_extend_loop2
sha256d_ms_4way_avx_extend_coda2:
sha256_avx_extend_round 44
movdqa sha256_4h+0(%rip), %xmm7 movdqa sha256_4h+0(%rip), %xmm7
movdqa sha256_4h+16(%rip), %xmm5 movdqa sha256_4h+16(%rip), %xmm5
@ -1215,24 +1310,10 @@ sha256d_4way_avx:
movq %rsp, %rax movq %rsp, %rax
leaq sha256_4k(%rip), %rcx leaq sha256_4k(%rip), %rcx
call sha256_avx_main_loop jmp sha256d_ms_4way_avx_main_loop2
paddd sha256_4h+0(%rip), %xmm7 sha256d_ms_4way_avx_finish:
paddd sha256_4h+16(%rip), %xmm5
paddd sha256_4h+32(%rip), %xmm4
paddd sha256_4h+48(%rip), %xmm3
paddd sha256_4h+64(%rip), %xmm0
paddd sha256_4h+80(%rip), %xmm8
paddd sha256_4h+96(%rip), %xmm9
paddd sha256_4h+112(%rip), %xmm10 paddd sha256_4h+112(%rip), %xmm10
movdqa %xmm7, 0(%rdi)
movdqa %xmm5, 16(%rdi)
movdqa %xmm4, 32(%rdi)
movdqa %xmm3, 48(%rdi)
movdqa %xmm0, 64(%rdi)
movdqa %xmm8, 80(%rdi)
movdqa %xmm9, 96(%rdi)
movdqa %xmm10, 112(%rdi) movdqa %xmm10, 112(%rdi)
addq $1032, %rsp addq $1032, %rsp
@ -1254,7 +1335,7 @@ sha256d_4way_avx:
#if defined(USE_XOP) #if defined(USE_XOP)
.p2align 6 .p2align 6
sha256d_4way_xop: sha256d_ms_4way_xop:
#if defined(WIN64) #if defined(WIN64)
pushq %rdi pushq %rdi
subq $80, %rsp subq $80, %rsp
@ -1272,7 +1353,35 @@ sha256d_4way_xop:
subq $1032, %rsp subq $1032, %rsp
leaq 256(%rsi), %rax leaq 256(%rsi), %rax
call sha256_xop_extend_loop_pre jmp sha256d_ms_4way_xop_extend_loop1
sha256d_ms_4way_xop_extend_loop2:
sha256_xop_extend_doubleround 0
sha256d_ms_4way_xop_extend_loop1:
sha256_xop_extend_doubleround 2
sha256_xop_extend_doubleround 4
sha256_xop_extend_doubleround 6
sha256_xop_extend_doubleround 8
sha256_xop_extend_doubleround 10
sha256_xop_extend_doubleround 12
sha256_xop_extend_doubleround 14
sha256_xop_extend_doubleround 16
sha256_xop_extend_doubleround 18
sha256_xop_extend_doubleround 20
sha256_xop_extend_doubleround 22
sha256_xop_extend_doubleround 24
sha256_xop_extend_doubleround 26
sha256_xop_extend_doubleround 28
sha256_xop_extend_doubleround 30
sha256_xop_extend_doubleround 32
sha256_xop_extend_doubleround 34
sha256_xop_extend_doubleround 36
sha256_xop_extend_doubleround 38
sha256_xop_extend_doubleround 40
sha256_xop_extend_doubleround 42
jz sha256d_ms_4way_xop_extend_coda2
sha256_xop_extend_doubleround 44
sha256_xop_extend_doubleround 46
movdqa 0(%rcx), %xmm7 movdqa 0(%rcx), %xmm7
movdqa 16(%rcx), %xmm8 movdqa 16(%rcx), %xmm8
@ -1285,7 +1394,33 @@ sha256d_4way_xop:
movq %rsi, %rax movq %rsi, %rax
leaq sha256_4k(%rip), %rcx leaq sha256_4k(%rip), %rcx
call sha256_xop_main_loop_pre jmp sha256d_ms_4way_xop_main_loop1
sha256d_ms_4way_xop_main_loop2:
sha256_xop_main_round 0, %xmm10, %xmm9, %xmm8, %xmm0, %xmm3, %xmm4, %xmm5, %xmm7
sha256_xop_main_round 1, %xmm9, %xmm8, %xmm0, %xmm10, %xmm4, %xmm5, %xmm7, %xmm3
sha256_xop_main_round 2, %xmm8, %xmm0, %xmm10, %xmm9, %xmm5, %xmm7, %xmm3, %xmm4
sha256d_ms_4way_xop_main_loop1:
sha256_xop_main_round 3, %xmm0, %xmm10, %xmm9, %xmm8, %xmm7, %xmm3, %xmm4, %xmm5
sha256_xop_main_quadround 4
sha256_xop_main_quadround 8
sha256_xop_main_quadround 12
sha256_xop_main_quadround 16
sha256_xop_main_quadround 20
sha256_xop_main_quadround 24
sha256_xop_main_quadround 28
sha256_xop_main_quadround 32
sha256_xop_main_quadround 36
sha256_xop_main_quadround 40
sha256_xop_main_quadround 44
sha256_xop_main_quadround 48
sha256_xop_main_quadround 52
sha256_xop_main_quadround 56
sha256_xop_main_round 60, %xmm10, %xmm9, %xmm8, %xmm0, %xmm3, %xmm4, %xmm5, %xmm7
jz sha256d_ms_4way_xop_finish
sha256_xop_main_round 61, %xmm9, %xmm8, %xmm0, %xmm10, %xmm4, %xmm5, %xmm7, %xmm3
sha256_xop_main_round 62, %xmm8, %xmm0, %xmm10, %xmm9, %xmm5, %xmm7, %xmm3, %xmm4
sha256_xop_main_round 63, %xmm0, %xmm10, %xmm9, %xmm8, %xmm7, %xmm3, %xmm4, %xmm5
paddd 0(%rdx), %xmm7 paddd 0(%rdx), %xmm7
paddd 16(%rdx), %xmm5 paddd 16(%rdx), %xmm5
@ -1320,7 +1455,11 @@ sha256d_4way_xop:
movdqa %xmm1, 240(%rsp) movdqa %xmm1, 240(%rsp)
leaq 256(%rsp), %rax leaq 256(%rsp), %rax
call sha256_xop_extend_loop cmpq %rax, %rax
jmp sha256d_ms_4way_xop_extend_loop2
sha256d_ms_4way_xop_extend_coda2:
sha256_xop_extend_round 44
movdqa sha256_4h+0(%rip), %xmm7 movdqa sha256_4h+0(%rip), %xmm7
movdqa sha256_4h+16(%rip), %xmm5 movdqa sha256_4h+16(%rip), %xmm5
@ -1333,24 +1472,10 @@ sha256d_4way_xop:
movq %rsp, %rax movq %rsp, %rax
leaq sha256_4k(%rip), %rcx leaq sha256_4k(%rip), %rcx
call sha256_xop_main_loop jmp sha256d_ms_4way_xop_main_loop2
paddd sha256_4h+0(%rip), %xmm7 sha256d_ms_4way_xop_finish:
paddd sha256_4h+16(%rip), %xmm5
paddd sha256_4h+32(%rip), %xmm4
paddd sha256_4h+48(%rip), %xmm3
paddd sha256_4h+64(%rip), %xmm0
paddd sha256_4h+80(%rip), %xmm8
paddd sha256_4h+96(%rip), %xmm9
paddd sha256_4h+112(%rip), %xmm10 paddd sha256_4h+112(%rip), %xmm10
movdqa %xmm7, 0(%rdi)
movdqa %xmm5, 16(%rdi)
movdqa %xmm4, 32(%rdi)
movdqa %xmm3, 48(%rdi)
movdqa %xmm0, 64(%rdi)
movdqa %xmm8, 80(%rdi)
movdqa %xmm9, 96(%rdi)
movdqa %xmm10, 112(%rdi) movdqa %xmm10, 112(%rdi)
addq $1032, %rsp addq $1032, %rsp
@ -1400,23 +1525,23 @@ _sha256_use_4way:
jz sha2_4way_init_avx jz sha2_4way_init_avx
sha2_4way_init_xop: sha2_4way_init_xop:
leaq sha256d_4way_xop(%rip), %rax leaq sha256d_ms_4way_xop(%rip), %rax
leaq sha256_transform_4way_core_xop(%rip), %rdx leaq sha256_transform_4way_core_xop(%rip), %rdx
jmp sha2_4way_init_done jmp sha2_4way_init_done
#endif /* USE_XOP */ #endif /* USE_XOP */
sha2_4way_init_avx: sha2_4way_init_avx:
leaq sha256d_4way_avx(%rip), %rax leaq sha256d_ms_4way_avx(%rip), %rax
leaq sha256_transform_4way_core_avx(%rip), %rdx leaq sha256_transform_4way_core_avx(%rip), %rdx
jmp sha2_4way_init_done jmp sha2_4way_init_done
#endif /* USE_AVX */ #endif /* USE_AVX */
sha2_4way_init_sse2: sha2_4way_init_sse2:
leaq sha256d_4way_sse2(%rip), %rax leaq sha256d_ms_4way_sse2(%rip), %rax
leaq sha256_transform_4way_core_sse2(%rip), %rdx leaq sha256_transform_4way_core_sse2(%rip), %rdx
sha2_4way_init_done: sha2_4way_init_done:
movq %rax, sha256d_4way_addr(%rip) movq %rax, sha256d_ms_4way_addr(%rip)
movq %rdx, sha256_transform_4way_core_addr(%rip) movq %rdx, sha256_transform_4way_core_addr(%rip)
popq %rdx popq %rdx
popq %rcx popq %rcx

75
sha2.c
View file

@ -172,6 +172,18 @@ static const uint32_t sha256d_hash1[16] = {
0x00000000, 0x00000000, 0x00000000, 0x00000100 0x00000000, 0x00000000, 0x00000000, 0x00000100
}; };
/*
 * Straightforward double SHA-256 built from the generic primitives.
 *
 * hash: output state; it is re-initialised with sha256_init() and then
 *       updated by one sha256_transform() over the first-pass result.
 * data: input words; two consecutive transforms are run, starting at
 *       data and at data + 16 (presumably one 16-word block each, so
 *       data[0..31] must be readable -- confirm against sha256_transform).
 *
 * The third argument of sha256_transform() is passed as 0 throughout;
 * its meaning is defined outside this block.
 */
static void sha256d(uint32_t *hash, uint32_t *data)
{
	uint32_t inner[16];
	uint32_t *second_block = data + 16;

	/* First pass: fold both input blocks into a fresh state. */
	sha256_init(inner);
	sha256_transform(inner, data, 0);
	sha256_transform(inner, second_block, 0);

	/*
	 * Turn the first-pass state into the block for the second pass by
	 * filling words 8..15 from the matching words of sha256d_hash1
	 * (presumably the fixed padding for a 32-byte message).
	 */
	memcpy(&inner[8], &sha256d_hash1[8], 8 * sizeof(uint32_t));

	/* Second pass: hash that single block into the caller's buffer. */
	sha256_init(hash);
	sha256_transform(hash, inner, 0);
}
static inline void sha256d_preextend(uint32_t *W) static inline void sha256d_preextend(uint32_t *W)
{ {
W[16] = s1(W[14]) + W[ 9] + s0(W[ 1]) + W[ 0]; W[16] = s1(W[14]) + W[ 9] + s0(W[ 1]) + W[ 0];
@ -200,7 +212,7 @@ static inline void sha256d_prehash(uint32_t *S, const uint32_t *W)
RNDr(S, W, 2); RNDr(S, W, 2);
} }
static inline void sha256d(uint32_t *hash, uint32_t *W, static inline void sha256d_ms(uint32_t *hash, uint32_t *W,
const uint32_t *midstate, const uint32_t *prehash) const uint32_t *midstate, const uint32_t *prehash)
{ {
uint32_t S[64]; uint32_t S[64];
@ -298,10 +310,27 @@ static inline void sha256d(uint32_t *hash, uint32_t *W,
memcpy(W + 18, E, sizeof(E)); memcpy(W + 18, E, sizeof(E));
memcpy(S + 8, sha256d_hash1 + 8, 32); memcpy(S + 8, sha256d_hash1 + 8, 32);
for (i = 16; i < 64; i += 2) { S[16] = s1(sha256d_hash1[14]) + sha256d_hash1[ 9] + s0(S[ 1]) + S[ 0];
S[17] = s1(sha256d_hash1[15]) + sha256d_hash1[10] + s0(S[ 2]) + S[ 1];
S[18] = s1(S[16]) + sha256d_hash1[11] + s0(S[ 3]) + S[ 2];
S[19] = s1(S[17]) + sha256d_hash1[12] + s0(S[ 4]) + S[ 3];
S[20] = s1(S[18]) + sha256d_hash1[13] + s0(S[ 5]) + S[ 4];
S[21] = s1(S[19]) + sha256d_hash1[14] + s0(S[ 6]) + S[ 5];
S[22] = s1(S[20]) + sha256d_hash1[15] + s0(S[ 7]) + S[ 6];
S[23] = s1(S[21]) + S[16] + s0(sha256d_hash1[ 8]) + S[ 7];
S[24] = s1(S[22]) + S[17] + s0(sha256d_hash1[ 9]) + sha256d_hash1[ 8];
S[25] = s1(S[23]) + S[18] + s0(sha256d_hash1[10]) + sha256d_hash1[ 9];
S[26] = s1(S[24]) + S[19] + s0(sha256d_hash1[11]) + sha256d_hash1[10];
S[27] = s1(S[25]) + S[20] + s0(sha256d_hash1[12]) + sha256d_hash1[11];
S[28] = s1(S[26]) + S[21] + s0(sha256d_hash1[13]) + sha256d_hash1[12];
S[29] = s1(S[27]) + S[22] + s0(sha256d_hash1[14]) + sha256d_hash1[13];
S[30] = s1(S[28]) + S[23] + s0(sha256d_hash1[15]) + sha256d_hash1[14];
S[31] = s1(S[29]) + S[24] + s0(S[16]) + sha256d_hash1[15];
for (i = 32; i < 60; i += 2) {
S[i] = s1(S[i - 2]) + S[i - 7] + s0(S[i - 15]) + S[i - 16]; S[i] = s1(S[i - 2]) + S[i - 7] + s0(S[i - 15]) + S[i - 16];
S[i+1] = s1(S[i - 1]) + S[i - 6] + s0(S[i - 14]) + S[i - 15]; S[i+1] = s1(S[i - 1]) + S[i - 6] + s0(S[i - 14]) + S[i - 15];
} }
S[60] = s1(S[58]) + S[53] + s0(S[45]) + S[44];
sha256_init(hash); sha256_init(hash);
@ -362,21 +391,21 @@ static inline void sha256d(uint32_t *hash, uint32_t *W,
RNDr(hash, S, 54); RNDr(hash, S, 54);
RNDr(hash, S, 55); RNDr(hash, S, 55);
RNDr(hash, S, 56); RNDr(hash, S, 56);
RNDr(hash, S, 57);
RNDr(hash, S, 58);
RNDr(hash, S, 59);
RNDr(hash, S, 60);
RNDr(hash, S, 61);
RNDr(hash, S, 62);
RNDr(hash, S, 63);
for (i = 0; i < 8; i++) hash[2] += hash[6] + S1(hash[3]) + Ch(hash[3], hash[4], hash[5])
hash[i] += sha256_h[i]; + S[57] + sha256_k[57];
hash[1] += hash[5] + S1(hash[2]) + Ch(hash[2], hash[3], hash[4])
+ S[58] + sha256_k[58];
hash[0] += hash[4] + S1(hash[1]) + Ch(hash[1], hash[2], hash[3])
+ S[59] + sha256_k[59];
hash[7] += hash[3] + S1(hash[0]) + Ch(hash[0], hash[1], hash[2])
+ S[60] + sha256_k[60]
+ sha256_h[7];
} }
#ifdef HAVE_SHA256_4WAY #ifdef HAVE_SHA256_4WAY
#define SHA256D_MAX_WAYS 4 #define SHA256D_MAX_WAYS 4
void sha256d_4way(uint32_t *hash, uint32_t *data, void sha256d_ms_4way(uint32_t *hash, uint32_t *data,
const uint32_t *midstate, const uint32_t *prehash); const uint32_t *midstate, const uint32_t *prehash);
#else #else
#define SHA256D_MAX_WAYS 1 #define SHA256D_MAX_WAYS 1
@ -390,6 +419,7 @@ int scanhash_sha256d(int thr_id, uint32_t *pdata, const uint32_t *ptarget,
uint32_t midstate[SHA256D_MAX_WAYS * 8] __attribute__((aligned(32))); uint32_t midstate[SHA256D_MAX_WAYS * 8] __attribute__((aligned(32)));
uint32_t prehash[SHA256D_MAX_WAYS * 8] __attribute__((aligned(32))); uint32_t prehash[SHA256D_MAX_WAYS * 8] __attribute__((aligned(32)));
uint32_t n = pdata[19] - 1; uint32_t n = pdata[19] - 1;
const uint32_t first_nonce = pdata[19];
const uint32_t Htarg = ptarget[7]; const uint32_t Htarg = ptarget[7];
#ifdef HAVE_SHA256_4WAY #ifdef HAVE_SHA256_4WAY
const int ways = sha256_use_4way() ? 4 : 1; const int ways = sha256_use_4way() ? 4 : 1;
@ -421,16 +451,14 @@ int scanhash_sha256d(int thr_id, uint32_t *pdata, const uint32_t *ptarget,
for (i = 0; i < 4; i++) for (i = 0; i < 4; i++)
data[4 * 3 + i] = ++n; data[4 * 3 + i] = ++n;
sha256d_4way(hash, data, midstate, prehash); sha256d_ms_4way(hash, data, midstate, prehash);
for (i = 0; i < 4; i++) { for (i = 0; i < 4; i++) {
if (hash[4 * 7 + i] <= Htarg) { if (hash[4 * 7 + i] <= Htarg) {
uint32_t tmp[8]; pdata[19] = data[4 * 3 + i];
for (j = 0; j < 8; j++) sha256d(hash, pdata);
tmp[j] = hash[4 * j + i]; if (fulltest(hash, ptarget)) {
if (fulltest(tmp, ptarget)) { *hashes_done = n - first_nonce + 1;
*hashes_done = n - pdata[19] + 1;
pdata[19] = data[4 * 3 + i];
return 1; return 1;
} }
} }
@ -440,17 +468,18 @@ int scanhash_sha256d(int thr_id, uint32_t *pdata, const uint32_t *ptarget,
#endif #endif
do { do {
data[3] = ++n; data[3] = ++n;
sha256d(hash, data, midstate, prehash); sha256d_ms(hash, data, midstate, prehash);
if (hash[7] <= Htarg) { if (hash[7] <= Htarg) {
pdata[19] = data[3];
sha256d(hash, pdata);
if (fulltest(hash, ptarget)) { if (fulltest(hash, ptarget)) {
*hashes_done = n - pdata[19] + 1; *hashes_done = n - first_nonce + 1;
pdata[19] = data[3];
return 1; return 1;
} }
} }
} while (n < max_nonce && !work_restart[thr_id].restart); } while (n < max_nonce && !work_restart[thr_id].restart);
*hashes_done = n - pdata[19] + 1; *hashes_done = n - first_nonce + 1;
pdata[19] = n; pdata[19] = n;
return 0; return 0;
} }