Skip the last three rounds of SHA-256d

This commit is contained in:
parent 18a34a72ab
commit 9fd497db5e

2 changed files with 466 additions and 312 deletions
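When scanning nonces, scanhash_sha256d accepts or rejects a candidate by comparing only word 7 of the double hash against the target (ptarget[7], the Htarg filter). In SHA-256 that word is already determined at round 60: each round rotates the working variables (h <- g <- f <- e, ...), so the value the final feed-forward adds into output word 7 is the e produced by round 60, and rounds 61-63 influence only the other seven output words. The specialized scanning paths are accordingly renamed to sha256d_ms / sha256d_ms_4way; they end the message schedule at W[60], keep for rounds 57-60 only the updates that feed word 7, and drop rounds 61-63 entirely. A new full sha256d helper recomputes the complete hash for the rare nonces that pass the hash[7] filter, before fulltest. In the assembly versions the shared round bodies are entered twice; the second entry forces ZF with cmpq %rax, %rax so the jz instructions peel off into the coda/finish paths, relying on the fact that the vector instructions in between leave EFLAGS untouched.

A minimal standalone check of the data-flow argument follows. This is a sketch: the round constants and message words are stubbed with arbitrary values, since only the register rotation matters; it is not a real SHA-256 computation.

```c
#include <assert.h>
#include <stdint.h>
#include <stdio.h>

#define ROTR(x, n)   (((x) >> (n)) | ((x) << (32 - (n))))
#define S1(x)        (ROTR(x, 6) ^ ROTR(x, 11) ^ ROTR(x, 25))
#define S0(x)        (ROTR(x, 2) ^ ROTR(x, 13) ^ ROTR(x, 22))
#define Ch(x, y, z)  (((x) & (y)) ^ (~(x) & (z)))
#define Maj(x, y, z) (((x) & (y)) ^ ((x) & (z)) ^ ((y) & (z)))

int main(void)
{
	/* Arbitrary starting state; K[t] + W[t] stubbed as a constant plus t,
	 * because the property shown here depends only on the rotation. */
	uint32_t a = 0x6a09e667, b = 0xbb67ae85, c = 0x3c6ef372, d = 0xa54ff53a;
	uint32_t e = 0x510e527f, f = 0x9b05688c, g = 0x1f83d9ab, h = 0x5be0cd19;
	uint32_t H7 = 0x5be0cd19, h7_early = 0;

	for (int t = 0; t < 64; t++) {
		uint32_t T1 = h + S1(e) + Ch(e, f, g) + 0x428a2f98u + (uint32_t)t;
		uint32_t T2 = S0(a) + Maj(a, b, c);
		h = g; g = f; f = e; e = d + T1;
		d = c; c = b; b = a; a = T1 + T2;
		if (t == 60)                /* e now holds what h will be after   */
			h7_early = H7 + e;  /* round 63: rounds 61-63 never touch it */
	}
	assert(h7_early == H7 + h);         /* output word 7 was final at round 60 */
	printf("word 7 fixed at round 60: OK\n");
	return 0;
}
```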
sha2-x64.S (697 changed lines)
@@ -95,6 +95,7 @@ sha256_4k:
 	.long 0xbef9a3f7, 0xbef9a3f7, 0xbef9a3f7, 0xbef9a3f7
 	.long 0xc67178f2, 0xc67178f2, 0xc67178f2, 0xc67178f2
 
+
 	.text
 	.p2align 6
 	.globl sha256_init_4way
@@ -128,6 +129,40 @@ _sha256_init_4way:
 
 
 .macro sha256_sse2_extend_round i
+	movdqa (\i-15)*16(%rax), %xmm0
+	movdqa %xmm0, %xmm2
+	psrld $3, %xmm0
+	movdqa %xmm0, %xmm1
+	pslld $14, %xmm2
+	psrld $4, %xmm1
+	pxor %xmm1, %xmm0
+	pxor %xmm2, %xmm0
+	psrld $11, %xmm1
+	pslld $11, %xmm2
+	pxor %xmm1, %xmm0
+	pxor %xmm2, %xmm0
+
+	movdqa (\i-2)*16(%rax), %xmm3
+	movdqa %xmm3, %xmm2
+
+	paddd (\i-16)*16(%rax), %xmm0
+	paddd (\i-7)*16(%rax), %xmm0
+
+	psrld $10, %xmm3
+	movdqa %xmm3, %xmm1
+	pslld $13, %xmm2
+	psrld $7, %xmm1
+	pxor %xmm1, %xmm3
+	pxor %xmm2, %xmm3
+	psrld $2, %xmm1
+	pslld $2, %xmm2
+	pxor %xmm1, %xmm3
+	pxor %xmm2, %xmm3
+	paddd %xmm3, %xmm0
+	movdqa %xmm0, \i*16(%rax)
+.endm
+
+.macro sha256_sse2_extend_doubleround i
 	movdqa (\i-15)*16(%rax), %xmm0
 	movdqa (\i-14)*16(%rax), %xmm4
 	movdqa %xmm0, %xmm2
@@ -193,36 +228,6 @@ _sha256_init_4way:
 	movdqa %xmm4, (\i+1)*16(%rax)
 .endm
 
-	.text
-	.p2align 6
-sha256_sse2_extend_loop:
-	sha256_sse2_extend_round 0
-sha256_sse2_extend_loop_pre:
-	sha256_sse2_extend_round 2
-	sha256_sse2_extend_round 4
-	sha256_sse2_extend_round 6
-	sha256_sse2_extend_round 8
-	sha256_sse2_extend_round 10
-	sha256_sse2_extend_round 12
-	sha256_sse2_extend_round 14
-	sha256_sse2_extend_round 16
-	sha256_sse2_extend_round 18
-	sha256_sse2_extend_round 20
-	sha256_sse2_extend_round 22
-	sha256_sse2_extend_round 24
-	sha256_sse2_extend_round 26
-	sha256_sse2_extend_round 28
-	sha256_sse2_extend_round 30
-	sha256_sse2_extend_round 32
-	sha256_sse2_extend_round 34
-	sha256_sse2_extend_round 36
-	sha256_sse2_extend_round 38
-	sha256_sse2_extend_round 40
-	sha256_sse2_extend_round 42
-	sha256_sse2_extend_round 44
-	sha256_sse2_extend_round 46
-	ret
-
 .macro sha256_sse2_main_round i
 	movdqa 16*\i(%rax), %xmm6
 	paddd 16*\i(%rcx), %xmm6
@@ -288,80 +293,39 @@ sha256_sse2_extend_loop_pre:
 	paddd %xmm6, %xmm7
 .endm
 
-	.text
-	.p2align 6
-sha256_sse2_main_loop:
-	sha256_sse2_main_round 0
-	sha256_sse2_main_round 1
-	sha256_sse2_main_round 2
-sha256_sse2_main_loop_pre:
-	sha256_sse2_main_round 3
-	sha256_sse2_main_round 4
-	sha256_sse2_main_round 5
-	sha256_sse2_main_round 6
-	sha256_sse2_main_round 7
-	sha256_sse2_main_round 8
-	sha256_sse2_main_round 9
-	sha256_sse2_main_round 10
-	sha256_sse2_main_round 11
-	sha256_sse2_main_round 12
-	sha256_sse2_main_round 13
-	sha256_sse2_main_round 14
-	sha256_sse2_main_round 15
-	sha256_sse2_main_round 16
-	sha256_sse2_main_round 17
-	sha256_sse2_main_round 18
-	sha256_sse2_main_round 19
-	sha256_sse2_main_round 20
-	sha256_sse2_main_round 21
-	sha256_sse2_main_round 22
-	sha256_sse2_main_round 23
-	sha256_sse2_main_round 24
-	sha256_sse2_main_round 25
-	sha256_sse2_main_round 26
-	sha256_sse2_main_round 27
-	sha256_sse2_main_round 28
-	sha256_sse2_main_round 29
-	sha256_sse2_main_round 30
-	sha256_sse2_main_round 31
-	sha256_sse2_main_round 32
-	sha256_sse2_main_round 33
-	sha256_sse2_main_round 34
-	sha256_sse2_main_round 35
-	sha256_sse2_main_round 36
-	sha256_sse2_main_round 37
-	sha256_sse2_main_round 38
-	sha256_sse2_main_round 39
-	sha256_sse2_main_round 40
-	sha256_sse2_main_round 41
-	sha256_sse2_main_round 42
-	sha256_sse2_main_round 43
-	sha256_sse2_main_round 44
-	sha256_sse2_main_round 45
-	sha256_sse2_main_round 46
-	sha256_sse2_main_round 47
-	sha256_sse2_main_round 48
-	sha256_sse2_main_round 49
-	sha256_sse2_main_round 50
-	sha256_sse2_main_round 51
-	sha256_sse2_main_round 52
-	sha256_sse2_main_round 53
-	sha256_sse2_main_round 54
-	sha256_sse2_main_round 55
-	sha256_sse2_main_round 56
-	sha256_sse2_main_round 57
-	sha256_sse2_main_round 58
-	sha256_sse2_main_round 59
-	sha256_sse2_main_round 60
-	sha256_sse2_main_round 61
-	sha256_sse2_main_round 62
-	sha256_sse2_main_round 63
-	ret
-
 
 #if defined(USE_AVX)
 
 .macro sha256_avx_extend_round i
+	vmovdqa (\i-15)*16(%rax), %xmm0
+	vpslld $14, %xmm0, %xmm2
+	vpsrld $3, %xmm0, %xmm0
+	vpsrld $4, %xmm0, %xmm1
+	vpxor %xmm1, %xmm0, %xmm0
+	vpxor %xmm2, %xmm0, %xmm0
+	vpsrld $11, %xmm1, %xmm1
+	vpslld $11, %xmm2, %xmm2
+	vpxor %xmm1, %xmm0, %xmm0
+	vpxor %xmm2, %xmm0, %xmm0
+
+	vmovdqa (\i-2)*16(%rax), %xmm3
+	vpaddd (\i-16)*16(%rax), %xmm0, %xmm0
+	vpaddd (\i-7)*16(%rax), %xmm0, %xmm0
+
+	vpslld $13, %xmm3, %xmm2
+	vpsrld $10, %xmm3, %xmm3
+	vpsrld $7, %xmm3, %xmm1
+	vpxor %xmm1, %xmm3, %xmm3
+	vpxor %xmm2, %xmm3, %xmm3
+	vpsrld $2, %xmm1, %xmm1
+	vpslld $2, %xmm2, %xmm2
+	vpxor %xmm1, %xmm3, %xmm3
+	vpxor %xmm2, %xmm3, %xmm3
+	vpaddd %xmm3, %xmm0, %xmm0
+	vmovdqa %xmm0, \i*16(%rax)
+.endm
+
+.macro sha256_avx_extend_doubleround i
 	vmovdqa (\i-15)*16(%rax), %xmm0
 	vmovdqa (\i-14)*16(%rax), %xmm4
 	vpslld $14, %xmm0, %xmm2
@@ -419,36 +383,6 @@ sha256_sse2_main_loop_pre:
 	vmovdqa %xmm4, (\i+1)*16(%rax)
 .endm
 
-	.text
-	.p2align 6
-sha256_avx_extend_loop:
-	sha256_avx_extend_round 0
-sha256_avx_extend_loop_pre:
-	sha256_avx_extend_round 2
-	sha256_avx_extend_round 4
-	sha256_avx_extend_round 6
-	sha256_avx_extend_round 8
-	sha256_avx_extend_round 10
-	sha256_avx_extend_round 12
-	sha256_avx_extend_round 14
-	sha256_avx_extend_round 16
-	sha256_avx_extend_round 18
-	sha256_avx_extend_round 20
-	sha256_avx_extend_round 22
-	sha256_avx_extend_round 24
-	sha256_avx_extend_round 26
-	sha256_avx_extend_round 28
-	sha256_avx_extend_round 30
-	sha256_avx_extend_round 32
-	sha256_avx_extend_round 34
-	sha256_avx_extend_round 36
-	sha256_avx_extend_round 38
-	sha256_avx_extend_round 40
-	sha256_avx_extend_round 42
-	sha256_avx_extend_round 44
-	sha256_avx_extend_round 46
-	ret
-
 .macro sha256_avx_main_round i, r0, r1, r2, r3, r4, r5, r6, r7
 	vpaddd 16*(\i)(%rax), \r0, %xmm6
 	vpaddd 16*(\i)(%rcx), %xmm6, %xmm6
@@ -501,37 +435,33 @@ sha256_avx_extend_loop_pre:
 	sha256_avx_main_round \i+3, %xmm0, %xmm10, %xmm9, %xmm8, %xmm7, %xmm3, %xmm4, %xmm5
 .endm
 
-	.text
-	.p2align 6
-sha256_avx_main_loop:
-	sha256_avx_main_round 0, %xmm10, %xmm9, %xmm8, %xmm0, %xmm3, %xmm4, %xmm5, %xmm7
-	sha256_avx_main_round 1, %xmm9, %xmm8, %xmm0, %xmm10, %xmm4, %xmm5, %xmm7, %xmm3
-	sha256_avx_main_round 2, %xmm8, %xmm0, %xmm10, %xmm9, %xmm5, %xmm7, %xmm3, %xmm4
-sha256_avx_main_loop_pre:
-	sha256_avx_main_round 3, %xmm0, %xmm10, %xmm9, %xmm8, %xmm7, %xmm3, %xmm4, %xmm5
-	sha256_avx_main_quadround 4
-	sha256_avx_main_quadround 8
-	sha256_avx_main_quadround 12
-	sha256_avx_main_quadround 16
-	sha256_avx_main_quadround 20
-	sha256_avx_main_quadround 24
-	sha256_avx_main_quadround 28
-	sha256_avx_main_quadround 32
-	sha256_avx_main_quadround 36
-	sha256_avx_main_quadround 40
-	sha256_avx_main_quadround 44
-	sha256_avx_main_quadround 48
-	sha256_avx_main_quadround 52
-	sha256_avx_main_quadround 56
-	sha256_avx_main_quadround 60
-	ret
-
 #endif /* USE_AVX */
 
 
 #if defined(USE_XOP)
 
 .macro sha256_xop_extend_round i
+	vmovdqa (\i-15)*16(%rax), %xmm0
+	vprotd $25, %xmm0, %xmm1
+	vprotd $14, %xmm0, %xmm2
+	vpsrld $3, %xmm0, %xmm0
+	vpxor %xmm1, %xmm2, %xmm2
+	vpxor %xmm2, %xmm0, %xmm0
+
+	vmovdqa (\i-2)*16(%rax), %xmm3
+	vpaddd (\i-16)*16(%rax), %xmm0, %xmm0
+	vpaddd (\i-7)*16(%rax), %xmm0, %xmm0
+
+	vprotd $15, %xmm3, %xmm1
+	vprotd $13, %xmm3, %xmm2
+	vpsrld $10, %xmm3, %xmm3
+	vpxor %xmm1, %xmm2, %xmm2
+	vpxor %xmm2, %xmm3, %xmm3
+	vpaddd %xmm3, %xmm0, %xmm0
+	vmovdqa %xmm0, \i*16(%rax)
+.endm
+
+.macro sha256_xop_extend_doubleround i
 	vmovdqa (\i-15)*16(%rax), %xmm0
 	vmovdqa (\i-14)*16(%rax), %xmm4
 	vprotd $25, %xmm0, %xmm1
@@ -571,36 +501,6 @@ sha256_avx_main_loop_pre:
 	vmovdqa %xmm4, (\i+1)*16(%rax)
 .endm
 
-	.text
-	.p2align 6
-sha256_xop_extend_loop:
-	sha256_xop_extend_round 0
-sha256_xop_extend_loop_pre:
-	sha256_xop_extend_round 2
-	sha256_xop_extend_round 4
-	sha256_xop_extend_round 6
-	sha256_xop_extend_round 8
-	sha256_xop_extend_round 10
-	sha256_xop_extend_round 12
-	sha256_xop_extend_round 14
-	sha256_xop_extend_round 16
-	sha256_xop_extend_round 18
-	sha256_xop_extend_round 20
-	sha256_xop_extend_round 22
-	sha256_xop_extend_round 24
-	sha256_xop_extend_round 26
-	sha256_xop_extend_round 28
-	sha256_xop_extend_round 30
-	sha256_xop_extend_round 32
-	sha256_xop_extend_round 34
-	sha256_xop_extend_round 36
-	sha256_xop_extend_round 38
-	sha256_xop_extend_round 40
-	sha256_xop_extend_round 42
-	sha256_xop_extend_round 44
-	sha256_xop_extend_round 46
-	ret
-
 .macro sha256_xop_main_round i, r0, r1, r2, r3, r4, r5, r6, r7
 	vpaddd 16*(\i)(%rax), \r0, %xmm6
 	vpaddd 16*(\i)(%rcx), %xmm6, %xmm6
@@ -641,31 +541,6 @@ sha256_xop_extend_loop_pre:
 	sha256_xop_main_round \i+3, %xmm0, %xmm10, %xmm9, %xmm8, %xmm7, %xmm3, %xmm4, %xmm5
 .endm
 
-	.text
-	.p2align 6
-sha256_xop_main_loop:
-	sha256_xop_main_round 0, %xmm10, %xmm9, %xmm8, %xmm0, %xmm3, %xmm4, %xmm5, %xmm7
-	sha256_xop_main_round 1, %xmm9, %xmm8, %xmm0, %xmm10, %xmm4, %xmm5, %xmm7, %xmm3
-	sha256_xop_main_round 2, %xmm8, %xmm0, %xmm10, %xmm9, %xmm5, %xmm7, %xmm3, %xmm4
-sha256_xop_main_loop_pre:
-	sha256_xop_main_round 3, %xmm0, %xmm10, %xmm9, %xmm8, %xmm7, %xmm3, %xmm4, %xmm5
-	sha256_xop_main_quadround 4
-	sha256_xop_main_quadround 8
-	sha256_xop_main_quadround 12
-	sha256_xop_main_quadround 16
-	sha256_xop_main_quadround 20
-	sha256_xop_main_quadround 24
-	sha256_xop_main_quadround 28
-	sha256_xop_main_quadround 32
-	sha256_xop_main_quadround 36
-	sha256_xop_main_quadround 40
-	sha256_xop_main_quadround 44
-	sha256_xop_main_quadround 48
-	sha256_xop_main_quadround 52
-	sha256_xop_main_quadround 56
-	sha256_xop_main_quadround 60
-	ret
-
 #endif /* USE_XOP */
 
 
@@ -828,7 +703,30 @@ sha256_transform_4way_sse2_main_loop:
 	.p2align 6
 sha256_transform_4way_core_avx:
 	leaq 256(%rsp), %rax
-	call sha256_avx_extend_loop
+	sha256_avx_extend_doubleround 0
+	sha256_avx_extend_doubleround 2
+	sha256_avx_extend_doubleround 4
+	sha256_avx_extend_doubleround 6
+	sha256_avx_extend_doubleround 8
+	sha256_avx_extend_doubleround 10
+	sha256_avx_extend_doubleround 12
+	sha256_avx_extend_doubleround 14
+	sha256_avx_extend_doubleround 16
+	sha256_avx_extend_doubleround 18
+	sha256_avx_extend_doubleround 20
+	sha256_avx_extend_doubleround 22
+	sha256_avx_extend_doubleround 24
+	sha256_avx_extend_doubleround 26
+	sha256_avx_extend_doubleround 28
+	sha256_avx_extend_doubleround 30
+	sha256_avx_extend_doubleround 32
+	sha256_avx_extend_doubleround 34
+	sha256_avx_extend_doubleround 36
+	sha256_avx_extend_doubleround 38
+	sha256_avx_extend_doubleround 40
+	sha256_avx_extend_doubleround 42
+	sha256_avx_extend_doubleround 44
+	sha256_avx_extend_doubleround 46
 	movdqu 0(%rdi), %xmm7
 	movdqu 16(%rdi), %xmm5
 	movdqu 32(%rdi), %xmm4
@@ -839,7 +737,22 @@ sha256_transform_4way_core_avx:
 	movdqu 112(%rdi), %xmm10
 	movq %rsp, %rax
 	leaq sha256_4k(%rip), %rcx
-	call sha256_avx_main_loop
+	sha256_avx_main_quadround 0
+	sha256_avx_main_quadround 4
+	sha256_avx_main_quadround 8
+	sha256_avx_main_quadround 12
+	sha256_avx_main_quadround 16
+	sha256_avx_main_quadround 20
+	sha256_avx_main_quadround 24
+	sha256_avx_main_quadround 28
+	sha256_avx_main_quadround 32
+	sha256_avx_main_quadround 36
+	sha256_avx_main_quadround 40
+	sha256_avx_main_quadround 44
+	sha256_avx_main_quadround 48
+	sha256_avx_main_quadround 52
+	sha256_avx_main_quadround 56
+	sha256_avx_main_quadround 60
 	jmp sha256_transform_4way_finish
 #endif /* USE_AVX */
 
@@ -849,7 +762,30 @@ sha256_transform_4way_core_avx:
 	.p2align 6
 sha256_transform_4way_core_xop:
 	leaq 256(%rsp), %rax
-	call sha256_xop_extend_loop
+	sha256_xop_extend_doubleround 0
+	sha256_xop_extend_doubleround 2
+	sha256_xop_extend_doubleround 4
+	sha256_xop_extend_doubleround 6
+	sha256_xop_extend_doubleround 8
+	sha256_xop_extend_doubleround 10
+	sha256_xop_extend_doubleround 12
+	sha256_xop_extend_doubleround 14
+	sha256_xop_extend_doubleround 16
+	sha256_xop_extend_doubleround 18
+	sha256_xop_extend_doubleround 20
+	sha256_xop_extend_doubleround 22
+	sha256_xop_extend_doubleround 24
+	sha256_xop_extend_doubleround 26
+	sha256_xop_extend_doubleround 28
+	sha256_xop_extend_doubleround 30
+	sha256_xop_extend_doubleround 32
+	sha256_xop_extend_doubleround 34
+	sha256_xop_extend_doubleround 36
+	sha256_xop_extend_doubleround 38
+	sha256_xop_extend_doubleround 40
+	sha256_xop_extend_doubleround 42
+	sha256_xop_extend_doubleround 44
+	sha256_xop_extend_doubleround 46
 	movdqu 0(%rdi), %xmm7
 	movdqu 16(%rdi), %xmm5
 	movdqu 32(%rdi), %xmm4
@@ -860,7 +796,22 @@ sha256_transform_4way_core_xop:
 	movdqu 112(%rdi), %xmm10
 	movq %rsp, %rax
 	leaq sha256_4k(%rip), %rcx
-	call sha256_xop_main_loop
+	sha256_xop_main_quadround 0
+	sha256_xop_main_quadround 4
+	sha256_xop_main_quadround 8
+	sha256_xop_main_quadround 12
+	sha256_xop_main_quadround 16
+	sha256_xop_main_quadround 20
+	sha256_xop_main_quadround 24
+	sha256_xop_main_quadround 28
+	sha256_xop_main_quadround 32
+	sha256_xop_main_quadround 36
+	sha256_xop_main_quadround 40
+	sha256_xop_main_quadround 44
+	sha256_xop_main_quadround 48
+	sha256_xop_main_quadround 52
+	sha256_xop_main_quadround 56
+	sha256_xop_main_quadround 60
 	jmp sha256_transform_4way_finish
 #endif /* USE_XOP */
 
@@ -1007,20 +958,20 @@ sha256_transform_4way_finish:
 
 	.data
 	.p2align 3
-sha256d_4way_addr:
+sha256d_ms_4way_addr:
 	.quad 0x0
 
 	.text
 	.p2align 6
-	.globl sha256d_4way
-	.globl _sha256d_4way
-sha256d_4way:
-_sha256d_4way:
-	jmp *sha256d_4way_addr(%rip)
+	.globl sha256d_ms_4way
+	.globl _sha256d_ms_4way
+sha256d_ms_4way:
+_sha256d_ms_4way:
+	jmp *sha256d_ms_4way_addr(%rip)
 
 
 	.p2align 6
-sha256d_4way_sse2:
+sha256d_ms_4way_sse2:
 #if defined(WIN64)
 	pushq %rdi
 	subq $80, %rsp
@@ -1038,7 +989,35 @@ sha256d_4way_sse2:
 	subq $1032, %rsp
 
 	leaq 256(%rsi), %rax
-	call sha256_sse2_extend_loop_pre
+	jmp sha256d_ms_4way_sse2_extend_loop1
+
+sha256d_ms_4way_sse2_extend_loop2:
+	sha256_sse2_extend_doubleround 0
+sha256d_ms_4way_sse2_extend_loop1:
+	sha256_sse2_extend_doubleround 2
+	sha256_sse2_extend_doubleround 4
+	sha256_sse2_extend_doubleround 6
+	sha256_sse2_extend_doubleround 8
+	sha256_sse2_extend_doubleround 10
+	sha256_sse2_extend_doubleround 12
+	sha256_sse2_extend_doubleround 14
+	sha256_sse2_extend_doubleround 16
+	sha256_sse2_extend_doubleround 18
+	sha256_sse2_extend_doubleround 20
+	sha256_sse2_extend_doubleround 22
+	sha256_sse2_extend_doubleround 24
+	sha256_sse2_extend_doubleround 26
+	sha256_sse2_extend_doubleround 28
+	sha256_sse2_extend_doubleround 30
+	sha256_sse2_extend_doubleround 32
+	sha256_sse2_extend_doubleround 34
+	sha256_sse2_extend_doubleround 36
+	sha256_sse2_extend_doubleround 38
+	sha256_sse2_extend_doubleround 40
+	sha256_sse2_extend_doubleround 42
+	jz sha256d_ms_4way_sse2_extend_coda2
+	sha256_sse2_extend_doubleround 44
+	sha256_sse2_extend_doubleround 46
 
 	movdqa 0(%rcx), %xmm3
 	movdqa 16(%rcx), %xmm0
@@ -1051,7 +1030,75 @@ sha256d_4way_sse2:
 
 	movq %rsi, %rax
 	leaq sha256_4k(%rip), %rcx
-	call sha256_sse2_main_loop_pre
+	jmp sha256d_ms_4way_sse2_main_loop1
+
+sha256d_ms_4way_sse2_main_loop2:
+	sha256_sse2_main_round 0
+	sha256_sse2_main_round 1
+	sha256_sse2_main_round 2
+sha256d_ms_4way_sse2_main_loop1:
+	sha256_sse2_main_round 3
+	sha256_sse2_main_round 4
+	sha256_sse2_main_round 5
+	sha256_sse2_main_round 6
+	sha256_sse2_main_round 7
+	sha256_sse2_main_round 8
+	sha256_sse2_main_round 9
+	sha256_sse2_main_round 10
+	sha256_sse2_main_round 11
+	sha256_sse2_main_round 12
+	sha256_sse2_main_round 13
+	sha256_sse2_main_round 14
+	sha256_sse2_main_round 15
+	sha256_sse2_main_round 16
+	sha256_sse2_main_round 17
+	sha256_sse2_main_round 18
+	sha256_sse2_main_round 19
+	sha256_sse2_main_round 20
+	sha256_sse2_main_round 21
+	sha256_sse2_main_round 22
+	sha256_sse2_main_round 23
+	sha256_sse2_main_round 24
+	sha256_sse2_main_round 25
+	sha256_sse2_main_round 26
+	sha256_sse2_main_round 27
+	sha256_sse2_main_round 28
+	sha256_sse2_main_round 29
+	sha256_sse2_main_round 30
+	sha256_sse2_main_round 31
+	sha256_sse2_main_round 32
+	sha256_sse2_main_round 33
+	sha256_sse2_main_round 34
+	sha256_sse2_main_round 35
+	sha256_sse2_main_round 36
+	sha256_sse2_main_round 37
+	sha256_sse2_main_round 38
+	sha256_sse2_main_round 39
+	sha256_sse2_main_round 40
+	sha256_sse2_main_round 41
+	sha256_sse2_main_round 42
+	sha256_sse2_main_round 43
+	sha256_sse2_main_round 44
+	sha256_sse2_main_round 45
+	sha256_sse2_main_round 46
+	sha256_sse2_main_round 47
+	sha256_sse2_main_round 48
+	sha256_sse2_main_round 49
+	sha256_sse2_main_round 50
+	sha256_sse2_main_round 51
+	sha256_sse2_main_round 52
+	sha256_sse2_main_round 53
+	sha256_sse2_main_round 54
+	sha256_sse2_main_round 55
+	sha256_sse2_main_round 56
+	sha256_sse2_main_round 57
+	sha256_sse2_main_round 58
+	sha256_sse2_main_round 59
+	sha256_sse2_main_round 60
+	jz sha256d_ms_4way_sse2_finish
+	sha256_sse2_main_round 61
+	sha256_sse2_main_round 62
+	sha256_sse2_main_round 63
 
 	paddd 0(%rdx), %xmm7
 	paddd 16(%rdx), %xmm5
@@ -1086,7 +1133,11 @@ sha256d_4way_sse2:
 	movdqa %xmm1, 240(%rsp)
 
 	leaq 256(%rsp), %rax
-	call sha256_sse2_extend_loop
+	cmpq %rax, %rax
+	jmp sha256d_ms_4way_sse2_extend_loop2
+
+sha256d_ms_4way_sse2_extend_coda2:
+	sha256_sse2_extend_round 44
 
 	movdqa sha256_4h+0(%rip), %xmm7
 	movdqa sha256_4h+16(%rip), %xmm5
@@ -1099,25 +1150,11 @@ sha256d_4way_sse2:
 
 	movq %rsp, %rax
 	leaq sha256_4k(%rip), %rcx
-	call sha256_sse2_main_loop
+	jmp sha256d_ms_4way_sse2_main_loop2
 
-	paddd sha256_4h+0(%rip), %xmm7
-	paddd sha256_4h+16(%rip), %xmm5
-	paddd sha256_4h+32(%rip), %xmm4
-	paddd sha256_4h+48(%rip), %xmm3
-	paddd sha256_4h+64(%rip), %xmm0
-	paddd sha256_4h+80(%rip), %xmm8
-	paddd sha256_4h+96(%rip), %xmm9
-	paddd sha256_4h+112(%rip), %xmm10
-
-	movdqa %xmm7, 0(%rdi)
-	movdqa %xmm5, 16(%rdi)
-	movdqa %xmm4, 32(%rdi)
-	movdqa %xmm3, 48(%rdi)
-	movdqa %xmm0, 64(%rdi)
-	movdqa %xmm8, 80(%rdi)
-	movdqa %xmm9, 96(%rdi)
-	movdqa %xmm10, 112(%rdi)
+sha256d_ms_4way_sse2_finish:
+	paddd sha256_4h+112(%rip), %xmm0
+	movdqa %xmm0, 112(%rdi)
 
 	addq $1032, %rsp
 #if defined(WIN64)
@@ -1136,7 +1173,7 @@ sha256d_4way_sse2:
 #if defined(USE_AVX)
 
 	.p2align 6
-sha256d_4way_avx:
+sha256d_ms_4way_avx:
 #if defined(WIN64)
 	pushq %rdi
 	subq $80, %rsp
@@ -1154,7 +1191,35 @@ sha256d_4way_avx:
 	subq $1032, %rsp
 
 	leaq 256(%rsi), %rax
-	call sha256_avx_extend_loop_pre
+	jmp sha256d_ms_4way_avx_extend_loop1
+
+sha256d_ms_4way_avx_extend_loop2:
+	sha256_avx_extend_doubleround 0
+sha256d_ms_4way_avx_extend_loop1:
+	sha256_avx_extend_doubleround 2
+	sha256_avx_extend_doubleround 4
+	sha256_avx_extend_doubleround 6
+	sha256_avx_extend_doubleround 8
+	sha256_avx_extend_doubleround 10
+	sha256_avx_extend_doubleround 12
+	sha256_avx_extend_doubleround 14
+	sha256_avx_extend_doubleround 16
+	sha256_avx_extend_doubleround 18
+	sha256_avx_extend_doubleround 20
+	sha256_avx_extend_doubleround 22
+	sha256_avx_extend_doubleround 24
+	sha256_avx_extend_doubleround 26
+	sha256_avx_extend_doubleround 28
+	sha256_avx_extend_doubleround 30
+	sha256_avx_extend_doubleround 32
+	sha256_avx_extend_doubleround 34
+	sha256_avx_extend_doubleround 36
+	sha256_avx_extend_doubleround 38
+	sha256_avx_extend_doubleround 40
+	sha256_avx_extend_doubleround 42
+	jz sha256d_ms_4way_avx_extend_coda2
+	sha256_avx_extend_doubleround 44
+	sha256_avx_extend_doubleround 46
 
 	movdqa 0(%rcx), %xmm7
 	movdqa 16(%rcx), %xmm8
@@ -1167,7 +1232,33 @@ sha256d_4way_avx:
 
 	movq %rsi, %rax
 	leaq sha256_4k(%rip), %rcx
-	call sha256_avx_main_loop_pre
+	jmp sha256d_ms_4way_avx_main_loop1
+
+sha256d_ms_4way_avx_main_loop2:
+	sha256_avx_main_round 0, %xmm10, %xmm9, %xmm8, %xmm0, %xmm3, %xmm4, %xmm5, %xmm7
+	sha256_avx_main_round 1, %xmm9, %xmm8, %xmm0, %xmm10, %xmm4, %xmm5, %xmm7, %xmm3
+	sha256_avx_main_round 2, %xmm8, %xmm0, %xmm10, %xmm9, %xmm5, %xmm7, %xmm3, %xmm4
+sha256d_ms_4way_avx_main_loop1:
+	sha256_avx_main_round 3, %xmm0, %xmm10, %xmm9, %xmm8, %xmm7, %xmm3, %xmm4, %xmm5
+	sha256_avx_main_quadround 4
+	sha256_avx_main_quadround 8
+	sha256_avx_main_quadround 12
+	sha256_avx_main_quadround 16
+	sha256_avx_main_quadround 20
+	sha256_avx_main_quadround 24
+	sha256_avx_main_quadround 28
+	sha256_avx_main_quadround 32
+	sha256_avx_main_quadround 36
+	sha256_avx_main_quadround 40
+	sha256_avx_main_quadround 44
+	sha256_avx_main_quadround 48
+	sha256_avx_main_quadround 52
+	sha256_avx_main_quadround 56
+	sha256_avx_main_round 60, %xmm10, %xmm9, %xmm8, %xmm0, %xmm3, %xmm4, %xmm5, %xmm7
+	jz sha256d_ms_4way_avx_finish
+	sha256_avx_main_round 61, %xmm9, %xmm8, %xmm0, %xmm10, %xmm4, %xmm5, %xmm7, %xmm3
+	sha256_avx_main_round 62, %xmm8, %xmm0, %xmm10, %xmm9, %xmm5, %xmm7, %xmm3, %xmm4
+	sha256_avx_main_round 63, %xmm0, %xmm10, %xmm9, %xmm8, %xmm7, %xmm3, %xmm4, %xmm5
 
 	paddd 0(%rdx), %xmm7
 	paddd 16(%rdx), %xmm5
@@ -1202,7 +1293,11 @@ sha256d_4way_avx:
 	movdqa %xmm1, 240(%rsp)
 
 	leaq 256(%rsp), %rax
-	call sha256_avx_extend_loop
+	cmpq %rax, %rax
+	jmp sha256d_ms_4way_avx_extend_loop2
+
+sha256d_ms_4way_avx_extend_coda2:
+	sha256_avx_extend_round 44
 
 	movdqa sha256_4h+0(%rip), %xmm7
 	movdqa sha256_4h+16(%rip), %xmm5
@@ -1215,24 +1310,10 @@ sha256d_4way_avx:
 
 	movq %rsp, %rax
 	leaq sha256_4k(%rip), %rcx
-	call sha256_avx_main_loop
+	jmp sha256d_ms_4way_avx_main_loop2
 
-	paddd sha256_4h+0(%rip), %xmm7
-	paddd sha256_4h+16(%rip), %xmm5
-	paddd sha256_4h+32(%rip), %xmm4
-	paddd sha256_4h+48(%rip), %xmm3
-	paddd sha256_4h+64(%rip), %xmm0
-	paddd sha256_4h+80(%rip), %xmm8
-	paddd sha256_4h+96(%rip), %xmm9
+sha256d_ms_4way_avx_finish:
 	paddd sha256_4h+112(%rip), %xmm10
-
-	movdqa %xmm7, 0(%rdi)
-	movdqa %xmm5, 16(%rdi)
-	movdqa %xmm4, 32(%rdi)
-	movdqa %xmm3, 48(%rdi)
-	movdqa %xmm0, 64(%rdi)
-	movdqa %xmm8, 80(%rdi)
-	movdqa %xmm9, 96(%rdi)
 	movdqa %xmm10, 112(%rdi)
 
 	addq $1032, %rsp
@@ -1254,7 +1335,7 @@ sha256d_4way_avx:
 #if defined(USE_XOP)
 
 	.p2align 6
-sha256d_4way_xop:
+sha256d_ms_4way_xop:
 #if defined(WIN64)
 	pushq %rdi
 	subq $80, %rsp
@@ -1272,7 +1353,35 @@ sha256d_4way_xop:
 	subq $1032, %rsp
 
 	leaq 256(%rsi), %rax
-	call sha256_xop_extend_loop_pre
+	jmp sha256d_ms_4way_xop_extend_loop1
+
+sha256d_ms_4way_xop_extend_loop2:
+	sha256_xop_extend_doubleround 0
+sha256d_ms_4way_xop_extend_loop1:
+	sha256_xop_extend_doubleround 2
+	sha256_xop_extend_doubleround 4
+	sha256_xop_extend_doubleround 6
+	sha256_xop_extend_doubleround 8
+	sha256_xop_extend_doubleround 10
+	sha256_xop_extend_doubleround 12
+	sha256_xop_extend_doubleround 14
+	sha256_xop_extend_doubleround 16
+	sha256_xop_extend_doubleround 18
+	sha256_xop_extend_doubleround 20
+	sha256_xop_extend_doubleround 22
+	sha256_xop_extend_doubleround 24
+	sha256_xop_extend_doubleround 26
+	sha256_xop_extend_doubleround 28
+	sha256_xop_extend_doubleround 30
+	sha256_xop_extend_doubleround 32
+	sha256_xop_extend_doubleround 34
+	sha256_xop_extend_doubleround 36
+	sha256_xop_extend_doubleround 38
+	sha256_xop_extend_doubleround 40
+	sha256_xop_extend_doubleround 42
+	jz sha256d_ms_4way_xop_extend_coda2
+	sha256_xop_extend_doubleround 44
+	sha256_xop_extend_doubleround 46
 
 	movdqa 0(%rcx), %xmm7
 	movdqa 16(%rcx), %xmm8
@@ -1285,7 +1394,33 @@ sha256d_4way_xop:
 
 	movq %rsi, %rax
 	leaq sha256_4k(%rip), %rcx
-	call sha256_xop_main_loop_pre
+	jmp sha256d_ms_4way_xop_main_loop1
+
+sha256d_ms_4way_xop_main_loop2:
+	sha256_xop_main_round 0, %xmm10, %xmm9, %xmm8, %xmm0, %xmm3, %xmm4, %xmm5, %xmm7
+	sha256_xop_main_round 1, %xmm9, %xmm8, %xmm0, %xmm10, %xmm4, %xmm5, %xmm7, %xmm3
+	sha256_xop_main_round 2, %xmm8, %xmm0, %xmm10, %xmm9, %xmm5, %xmm7, %xmm3, %xmm4
+sha256d_ms_4way_xop_main_loop1:
+	sha256_xop_main_round 3, %xmm0, %xmm10, %xmm9, %xmm8, %xmm7, %xmm3, %xmm4, %xmm5
+	sha256_xop_main_quadround 4
+	sha256_xop_main_quadround 8
+	sha256_xop_main_quadround 12
+	sha256_xop_main_quadround 16
+	sha256_xop_main_quadround 20
+	sha256_xop_main_quadround 24
+	sha256_xop_main_quadround 28
+	sha256_xop_main_quadround 32
+	sha256_xop_main_quadround 36
+	sha256_xop_main_quadround 40
+	sha256_xop_main_quadround 44
+	sha256_xop_main_quadround 48
+	sha256_xop_main_quadround 52
+	sha256_xop_main_quadround 56
+	sha256_xop_main_round 60, %xmm10, %xmm9, %xmm8, %xmm0, %xmm3, %xmm4, %xmm5, %xmm7
+	jz sha256d_ms_4way_xop_finish
+	sha256_xop_main_round 61, %xmm9, %xmm8, %xmm0, %xmm10, %xmm4, %xmm5, %xmm7, %xmm3
+	sha256_xop_main_round 62, %xmm8, %xmm0, %xmm10, %xmm9, %xmm5, %xmm7, %xmm3, %xmm4
+	sha256_xop_main_round 63, %xmm0, %xmm10, %xmm9, %xmm8, %xmm7, %xmm3, %xmm4, %xmm5
 
 	paddd 0(%rdx), %xmm7
 	paddd 16(%rdx), %xmm5
@@ -1320,7 +1455,11 @@ sha256d_4way_xop:
 	movdqa %xmm1, 240(%rsp)
 
 	leaq 256(%rsp), %rax
-	call sha256_xop_extend_loop
+	cmpq %rax, %rax
+	jmp sha256d_ms_4way_xop_extend_loop2
+
+sha256d_ms_4way_xop_extend_coda2:
+	sha256_xop_extend_round 44
 
 	movdqa sha256_4h+0(%rip), %xmm7
 	movdqa sha256_4h+16(%rip), %xmm5
@@ -1333,24 +1472,10 @@ sha256d_4way_xop:
 
 	movq %rsp, %rax
 	leaq sha256_4k(%rip), %rcx
-	call sha256_xop_main_loop
+	jmp sha256d_ms_4way_xop_main_loop2
 
-	paddd sha256_4h+0(%rip), %xmm7
-	paddd sha256_4h+16(%rip), %xmm5
-	paddd sha256_4h+32(%rip), %xmm4
-	paddd sha256_4h+48(%rip), %xmm3
-	paddd sha256_4h+64(%rip), %xmm0
-	paddd sha256_4h+80(%rip), %xmm8
-	paddd sha256_4h+96(%rip), %xmm9
+sha256d_ms_4way_xop_finish:
 	paddd sha256_4h+112(%rip), %xmm10
-
-	movdqa %xmm7, 0(%rdi)
-	movdqa %xmm5, 16(%rdi)
-	movdqa %xmm4, 32(%rdi)
-	movdqa %xmm3, 48(%rdi)
-	movdqa %xmm0, 64(%rdi)
-	movdqa %xmm8, 80(%rdi)
-	movdqa %xmm9, 96(%rdi)
 	movdqa %xmm10, 112(%rdi)
 
 	addq $1032, %rsp
@@ -1400,23 +1525,23 @@ _sha256_use_4way:
 	jz sha2_4way_init_avx
 
 sha2_4way_init_xop:
-	leaq sha256d_4way_xop(%rip), %rax
+	leaq sha256d_ms_4way_xop(%rip), %rax
 	leaq sha256_transform_4way_core_xop(%rip), %rdx
 	jmp sha2_4way_init_done
 #endif /* USE_XOP */
 
sha2_4way_init_avx:
-	leaq sha256d_4way_avx(%rip), %rax
+	leaq sha256d_ms_4way_avx(%rip), %rax
 	leaq sha256_transform_4way_core_avx(%rip), %rdx
 	jmp sha2_4way_init_done
 #endif /* USE_AVX */
 
 sha2_4way_init_sse2:
-	leaq sha256d_4way_sse2(%rip), %rax
+	leaq sha256d_ms_4way_sse2(%rip), %rax
 	leaq sha256_transform_4way_core_sse2(%rip), %rdx
 
 sha2_4way_init_done:
-	movq %rax, sha256d_4way_addr(%rip)
+	movq %rax, sha256d_ms_4way_addr(%rip)
 	movq %rdx, sha256_transform_4way_core_addr(%rip)
 	popq %rdx
 	popq %rcx
sha2.c (75 changed lines)
@@ -172,6 +172,18 @@ static const uint32_t sha256d_hash1[16] = {
 	0x00000000, 0x00000000, 0x00000000, 0x00000100
 };
 
+static void sha256d(uint32_t *hash, uint32_t *data)
+{
+	uint32_t S[16];
+
+	sha256_init(S);
+	sha256_transform(S, data, 0);
+	sha256_transform(S, data + 16, 0);
+	memcpy(S + 8, sha256d_hash1 + 8, 32);
+	sha256_init(hash);
+	sha256_transform(hash, S, 0);
+}
+
 static inline void sha256d_preextend(uint32_t *W)
 {
 	W[16] = s1(W[14]) + W[ 9] + s0(W[ 1]) + W[ 0];
@@ -200,7 +212,7 @@ static inline void sha256d_prehash(uint32_t *S, const uint32_t *W)
 	RNDr(S, W, 2);
 }
 
-static inline void sha256d(uint32_t *hash, uint32_t *W,
+static inline void sha256d_ms(uint32_t *hash, uint32_t *W,
 	const uint32_t *midstate, const uint32_t *prehash)
 {
 	uint32_t S[64];
@@ -298,10 +310,27 @@ static inline void sha256d(uint32_t *hash, uint32_t *W,
 	memcpy(W + 18, E, sizeof(E));
 
 	memcpy(S + 8, sha256d_hash1 + 8, 32);
-	for (i = 16; i < 64; i += 2) {
+	S[16] = s1(sha256d_hash1[14]) + sha256d_hash1[ 9] + s0(S[ 1]) + S[ 0];
+	S[17] = s1(sha256d_hash1[15]) + sha256d_hash1[10] + s0(S[ 2]) + S[ 1];
+	S[18] = s1(S[16]) + sha256d_hash1[11] + s0(S[ 3]) + S[ 2];
+	S[19] = s1(S[17]) + sha256d_hash1[12] + s0(S[ 4]) + S[ 3];
+	S[20] = s1(S[18]) + sha256d_hash1[13] + s0(S[ 5]) + S[ 4];
+	S[21] = s1(S[19]) + sha256d_hash1[14] + s0(S[ 6]) + S[ 5];
+	S[22] = s1(S[20]) + sha256d_hash1[15] + s0(S[ 7]) + S[ 6];
+	S[23] = s1(S[21]) + S[16] + s0(sha256d_hash1[ 8]) + S[ 7];
+	S[24] = s1(S[22]) + S[17] + s0(sha256d_hash1[ 9]) + sha256d_hash1[ 8];
+	S[25] = s1(S[23]) + S[18] + s0(sha256d_hash1[10]) + sha256d_hash1[ 9];
+	S[26] = s1(S[24]) + S[19] + s0(sha256d_hash1[11]) + sha256d_hash1[10];
+	S[27] = s1(S[25]) + S[20] + s0(sha256d_hash1[12]) + sha256d_hash1[11];
+	S[28] = s1(S[26]) + S[21] + s0(sha256d_hash1[13]) + sha256d_hash1[12];
+	S[29] = s1(S[27]) + S[22] + s0(sha256d_hash1[14]) + sha256d_hash1[13];
+	S[30] = s1(S[28]) + S[23] + s0(sha256d_hash1[15]) + sha256d_hash1[14];
+	S[31] = s1(S[29]) + S[24] + s0(S[16]) + sha256d_hash1[15];
+	for (i = 32; i < 60; i += 2) {
 		S[i] = s1(S[i - 2]) + S[i - 7] + s0(S[i - 15]) + S[i - 16];
 		S[i+1] = s1(S[i - 1]) + S[i - 6] + s0(S[i - 14]) + S[i - 15];
 	}
+	S[60] = s1(S[58]) + S[53] + s0(S[45]) + S[44];
 
 	sha256_init(hash);
 
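A note on the bounds in the hunk above: in sha256d_ms the second block of the second transform is the first hash (S[0..7], variable) followed by the fixed sha256d_hash1 padding, so words 8-15 are compile-time constants and the sixteen unrolled steps S[16]..S[31] can reference sha256d_hash1[] directly, letting the compiler fold the s0/s1 of constant words. The schedule then needs nothing past S[60]: with rounds 61-63 skipped, W[61..63] are never consumed.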
@@ -362,21 +391,21 @@ static inline void sha256d(uint32_t *hash, uint32_t *W,
 	RNDr(hash, S, 54);
 	RNDr(hash, S, 55);
 	RNDr(hash, S, 56);
-	RNDr(hash, S, 57);
-	RNDr(hash, S, 58);
-	RNDr(hash, S, 59);
-	RNDr(hash, S, 60);
-	RNDr(hash, S, 61);
-	RNDr(hash, S, 62);
-	RNDr(hash, S, 63);
 
-	for (i = 0; i < 8; i++)
-		hash[i] += sha256_h[i];
+	hash[2] += hash[6] + S1(hash[3]) + Ch(hash[3], hash[4], hash[5])
+	         + S[57] + sha256_k[57];
+	hash[1] += hash[5] + S1(hash[2]) + Ch(hash[2], hash[3], hash[4])
+	         + S[58] + sha256_k[58];
+	hash[0] += hash[4] + S1(hash[1]) + Ch(hash[1], hash[2], hash[3])
+	         + S[59] + sha256_k[59];
+	hash[7] += hash[3] + S1(hash[0]) + Ch(hash[0], hash[1], hash[2])
+	         + S[60] + sha256_k[60]
+	         + sha256_h[7];
 }
 
 #ifdef HAVE_SHA256_4WAY
 #define SHA256D_MAX_WAYS 4
-void sha256d_4way(uint32_t *hash, uint32_t *data,
+void sha256d_ms_4way(uint32_t *hash, uint32_t *data,
 	const uint32_t *midstate, const uint32_t *prehash);
 #else
 #define SHA256D_MAX_WAYS 1
@@ -390,6 +419,7 @@ int scanhash_sha256d(int thr_id, uint32_t *pdata, const uint32_t *ptarget,
 	uint32_t midstate[SHA256D_MAX_WAYS * 8] __attribute__((aligned(32)));
 	uint32_t prehash[SHA256D_MAX_WAYS * 8] __attribute__((aligned(32)));
 	uint32_t n = pdata[19] - 1;
+	const uint32_t first_nonce = pdata[19];
 	const uint32_t Htarg = ptarget[7];
 #ifdef HAVE_SHA256_4WAY
 	const int ways = sha256_use_4way() ? 4 : 1;
@@ -421,16 +451,14 @@ int scanhash_sha256d(int thr_id, uint32_t *pdata, const uint32_t *ptarget,
 			for (i = 0; i < 4; i++)
 				data[4 * 3 + i] = ++n;
 
-			sha256d_4way(hash, data, midstate, prehash);
+			sha256d_ms_4way(hash, data, midstate, prehash);
 
 			for (i = 0; i < 4; i++) {
 				if (hash[4 * 7 + i] <= Htarg) {
-					uint32_t tmp[8];
-					for (j = 0; j < 8; j++)
-						tmp[j] = hash[4 * j + i];
-					if (fulltest(tmp, ptarget)) {
-						*hashes_done = n - pdata[19] + 1;
-						pdata[19] = data[4 * 3 + i];
+					pdata[19] = data[4 * 3 + i];
+					sha256d(hash, pdata);
+					if (fulltest(hash, ptarget)) {
+						*hashes_done = n - first_nonce + 1;
 						return 1;
 					}
 				}
@@ -440,17 +468,18 @@ int scanhash_sha256d(int thr_id, uint32_t *pdata, const uint32_t *ptarget,
 #endif
 	do {
 		data[3] = ++n;
-		sha256d(hash, data, midstate, prehash);
+		sha256d_ms(hash, data, midstate, prehash);
 		if (hash[7] <= Htarg) {
+			pdata[19] = data[3];
+			sha256d(hash, pdata);
 			if (fulltest(hash, ptarget)) {
-				*hashes_done = n - pdata[19] + 1;
-				pdata[19] = data[3];
+				*hashes_done = n - first_nonce + 1;
 				return 1;
 			}
 		}
 	} while (n < max_nonce && !work_restart[thr_id].restart);
 
-	*hashes_done = n - pdata[19] + 1;
+	*hashes_done = n - first_nonce + 1;
 	pdata[19] = n;
 	return 0;
 }
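The scanning loops now follow a filter-then-verify pattern: the truncated sha256d_ms / sha256d_ms_4way produce an exact hash[7] while leaving the other seven output words incomplete, and only nonces whose hash[7] passes the cheap Htarg comparison are rehashed in full before fulltest. A condensed sketch of that flow (illustration only: declarations are abbreviated from sha2.c, where these functions are actually static, and all setup is elided):

```c
#include <stdint.h>

/* Abbreviated from sha2.c, shown here only to make the sketch self-describing. */
void sha256d(uint32_t *hash, uint32_t *data);             /* full double SHA-256 */
void sha256d_ms(uint32_t *hash, uint32_t *W,
                const uint32_t *midstate, const uint32_t *prehash);
int fulltest(const uint32_t *hash, const uint32_t *target);

int scan_sketch(uint32_t *pdata, const uint32_t *ptarget, uint32_t max_nonce,
                uint32_t *data, uint32_t *midstate, uint32_t *prehash)
{
	uint32_t hash[8];
	const uint32_t Htarg = ptarget[7];
	uint32_t n = pdata[19] - 1;

	do {
		data[3] = ++n;
		sha256d_ms(hash, data, midstate, prehash); /* only hash[7] is exact */
		if (hash[7] <= Htarg) {                    /* cheap top-word filter */
			pdata[19] = data[3];
			sha256d(hash, pdata);              /* full hash for the candidate */
			if (fulltest(hash, ptarget))
				return 1;                  /* share found */
		}
	} while (n < max_nonce);
	pdata[19] = n;
	return 0;
}
```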