Precompute the first few SHA-256d rounds

parent e52982ab7f
commit 18a34a72ab

 sha2-x64.S | 157
 sha2.c     |  95
 2 changed files with 161 additions and 91 deletions
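In sha256d scanning, only word 3 of the second 16-word block (the nonce) changes between hashes. Rounds 0-2 of the second transform read only W[0..2], and most of the terms feeding the extended words W[16..31] do not involve W[3] either, so both can be hoisted out of the scan loop and computed once per work unit (sha256d_prehash and sha256d_preextend in the sha2.c hunks below); the per-nonce path then starts at round 3 and patches only the W[3]-dependent terms back in. The standalone harness below is a sketch that checks this pre-extension identity: ROTR/s0/s1 are the standard FIPS 180-4 small sigmas (matching the macros already in sha2.c), and the header words and nonce are made-up test values, not taken from the commit.

/* Illustrative harness: verify that preextend + per-nonce patch equals the
 * full message-schedule extension of W[16..31].  Not part of the commit. */
#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define ROTR(x, n) (((x) >> (n)) | ((x) << (32 - (n))))
#define s0(x) (ROTR(x, 7) ^ ROTR(x, 18) ^ ((x) >> 3))
#define s1(x) (ROTR(x, 17) ^ ROTR(x, 19) ^ ((x) >> 10))

int main(void)
{
	uint32_t W[32], V[32], i, nonce = 0xdeadbeef;

	for (i = 0; i < 16; i++)
		W[i] = 0x01000193 * (i + 1);	/* arbitrary header words */

	/* Reference: full extension with the nonce already in place. */
	memcpy(V, W, sizeof(V));
	V[3] = nonce;
	for (i = 16; i < 32; i++)
		V[i] = s1(V[i - 2]) + V[i - 7] + s0(V[i - 15]) + V[i - 16];

	/* Once per work unit: every term not involving W[3]
	 * (this is what sha256d_preextend computes). */
	W[16] = s1(W[14]) + W[ 9] + s0(W[ 1]) + W[ 0];
	W[17] = s1(W[15]) + W[10] + s0(W[ 2]) + W[ 1];
	W[18] = s1(W[16]) + W[11] + W[ 2];
	W[19] = s1(W[17]) + W[12] + s0(W[ 4]);
	W[20] = W[13] + s0(W[ 5]) + W[ 4];
	W[21] = W[14] + s0(W[ 6]) + W[ 5];
	W[22] = W[15] + s0(W[ 7]) + W[ 6];
	W[23] = W[16] + s0(W[ 8]) + W[ 7];
	W[24] = W[17] + s0(W[ 9]) + W[ 8];
	W[25] = s0(W[10]) + W[ 9];
	W[26] = s0(W[11]) + W[10];
	W[27] = s0(W[12]) + W[11];
	W[28] = s0(W[13]) + W[12];
	W[29] = s0(W[14]) + W[13];
	W[30] = s0(W[15]) + W[14];
	W[31] = s0(W[16]) + W[15];

	/* Once per nonce: patch the deferred terms, as in the new sha256d(). */
	W[3] = nonce;
	W[18] += s0(W[3]);
	W[19] += W[3];
	for (i = 20; i < 25; i++)
		W[i] += s1(W[i - 2]);
	for (i = 25; i < 32; i++)
		W[i] += s1(W[i - 2]) + W[i - 7];

	printf("match: %d\n", memcmp(W + 16, V + 16, 64) == 0);
	return 0;
}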
--- a/sha2-x64.S
+++ b/sha2-x64.S
@@ -128,8 +128,8 @@ _sha256_init_4way:
 
 
 .macro sha256_sse2_extend_round i
-	movdqa (\i-15)*16(%rcx), %xmm0
-	movdqa (\i-14)*16(%rcx), %xmm4
+	movdqa (\i-15)*16(%rax), %xmm0
+	movdqa (\i-14)*16(%rax), %xmm4
 	movdqa %xmm0, %xmm2
 	movdqa %xmm4, %xmm6
 	psrld $3, %xmm0
@@ -153,10 +153,10 @@ _sha256_init_4way:
 	pxor %xmm2, %xmm0
 	pxor %xmm6, %xmm4
 
-	movdqa (\i-2)*16(%rcx), %xmm3
-	movdqa (\i-1)*16(%rcx), %xmm7
-	paddd (\i-16)*16(%rcx), %xmm0
-	paddd (\i-15)*16(%rcx), %xmm4
+	movdqa (\i-2)*16(%rax), %xmm3
+	movdqa (\i-1)*16(%rax), %xmm7
+	paddd (\i-16)*16(%rax), %xmm0
+	paddd (\i-15)*16(%rax), %xmm4
 
 	movdqa %xmm3, %xmm2
 	movdqa %xmm7, %xmm6
@@ -165,14 +165,14 @@ _sha256_init_4way:
 	movdqa %xmm3, %xmm1
 	movdqa %xmm7, %xmm5
 
-	paddd (\i-7)*16(%rcx), %xmm0
+	paddd (\i-7)*16(%rax), %xmm0
 
 	pslld $13, %xmm2
 	pslld $13, %xmm6
 	psrld $7, %xmm1
 	psrld $7, %xmm5
 
-	paddd (\i-6)*16(%rcx), %xmm4
+	paddd (\i-6)*16(%rax), %xmm4
 
 	pxor %xmm1, %xmm3
 	pxor %xmm5, %xmm7
@@ -189,14 +189,15 @@ _sha256_init_4way:
 
 	paddd %xmm3, %xmm0
 	paddd %xmm7, %xmm4
-	movdqa %xmm0, \i*16(%rcx)
-	movdqa %xmm4, (\i+1)*16(%rcx)
+	movdqa %xmm0, \i*16(%rax)
+	movdqa %xmm4, (\i+1)*16(%rax)
 .endm
 
 .text
 .p2align 6
 sha256_sse2_extend_loop:
 	sha256_sse2_extend_round 0
+sha256_sse2_extend_loop_pre:
 	sha256_sse2_extend_round 2
 	sha256_sse2_extend_round 4
 	sha256_sse2_extend_round 6
@@ -293,6 +294,7 @@ sha256_sse2_main_loop:
 	sha256_sse2_main_round 0
 	sha256_sse2_main_round 1
 	sha256_sse2_main_round 2
+sha256_sse2_main_loop_pre:
 	sha256_sse2_main_round 3
 	sha256_sse2_main_round 4
 	sha256_sse2_main_round 5
@@ -360,8 +362,8 @@ sha256_sse2_main_loop:
 #if defined(USE_AVX)
 
 .macro sha256_avx_extend_round i
-	vmovdqa (\i-15)*16(%rcx), %xmm0
-	vmovdqa (\i-14)*16(%rcx), %xmm4
+	vmovdqa (\i-15)*16(%rax), %xmm0
+	vmovdqa (\i-14)*16(%rax), %xmm4
 	vpslld $14, %xmm0, %xmm2
 	vpslld $14, %xmm4, %xmm6
 	vpsrld $3, %xmm0, %xmm0
@@ -381,22 +383,22 @@ sha256_sse2_main_loop:
 	vpxor %xmm2, %xmm0, %xmm0
 	vpxor %xmm6, %xmm4, %xmm4
 
-	vmovdqa (\i-2)*16(%rcx), %xmm3
-	vmovdqa (\i-1)*16(%rcx), %xmm7
-	vpaddd (\i-16)*16(%rcx), %xmm0, %xmm0
-	vpaddd (\i-15)*16(%rcx), %xmm4, %xmm4
+	vmovdqa (\i-2)*16(%rax), %xmm3
+	vmovdqa (\i-1)*16(%rax), %xmm7
+	vpaddd (\i-16)*16(%rax), %xmm0, %xmm0
+	vpaddd (\i-15)*16(%rax), %xmm4, %xmm4
 
 	vpslld $13, %xmm3, %xmm2
 	vpslld $13, %xmm7, %xmm6
 	vpsrld $10, %xmm3, %xmm3
 	vpsrld $10, %xmm7, %xmm7
 
-	vpaddd (\i-7)*16(%rcx), %xmm0, %xmm0
+	vpaddd (\i-7)*16(%rax), %xmm0, %xmm0
 
 	vpsrld $7, %xmm3, %xmm1
 	vpsrld $7, %xmm7, %xmm5
 
-	vpaddd (\i-6)*16(%rcx), %xmm4, %xmm4
+	vpaddd (\i-6)*16(%rax), %xmm4, %xmm4
 
 	vpxor %xmm1, %xmm3, %xmm3
 	vpxor %xmm5, %xmm7, %xmm7
@@ -413,14 +415,15 @@ sha256_sse2_main_loop:
 
 	vpaddd %xmm3, %xmm0, %xmm0
 	vpaddd %xmm7, %xmm4, %xmm4
-	vmovdqa %xmm0, \i*16(%rcx)
-	vmovdqa %xmm4, (\i+1)*16(%rcx)
+	vmovdqa %xmm0, \i*16(%rax)
+	vmovdqa %xmm4, (\i+1)*16(%rax)
 .endm
 
 .text
 .p2align 6
 sha256_avx_extend_loop:
 	sha256_avx_extend_round 0
+sha256_avx_extend_loop_pre:
 	sha256_avx_extend_round 2
 	sha256_avx_extend_round 4
 	sha256_avx_extend_round 6
@@ -501,7 +504,11 @@ sha256_avx_extend_loop:
 .text
 .p2align 6
 sha256_avx_main_loop:
-	sha256_avx_main_quadround 0
+	sha256_avx_main_round 0, %xmm10, %xmm9, %xmm8, %xmm0, %xmm3, %xmm4, %xmm5, %xmm7
+	sha256_avx_main_round 1, %xmm9, %xmm8, %xmm0, %xmm10, %xmm4, %xmm5, %xmm7, %xmm3
+	sha256_avx_main_round 2, %xmm8, %xmm0, %xmm10, %xmm9, %xmm5, %xmm7, %xmm3, %xmm4
+sha256_avx_main_loop_pre:
+	sha256_avx_main_round 3, %xmm0, %xmm10, %xmm9, %xmm8, %xmm7, %xmm3, %xmm4, %xmm5
 	sha256_avx_main_quadround 4
 	sha256_avx_main_quadround 8
 	sha256_avx_main_quadround 12
@@ -525,8 +532,8 @@ sha256_avx_main_loop:
 #if defined(USE_XOP)
 
 .macro sha256_xop_extend_round i
-	vmovdqa (\i-15)*16(%rcx), %xmm0
-	vmovdqa (\i-14)*16(%rcx), %xmm4
+	vmovdqa (\i-15)*16(%rax), %xmm0
+	vmovdqa (\i-14)*16(%rax), %xmm4
 	vprotd $25, %xmm0, %xmm1
 	vprotd $25, %xmm4, %xmm5
 	vprotd $14, %xmm0, %xmm2
@@ -538,10 +545,10 @@ sha256_avx_main_loop:
 	vpxor %xmm2, %xmm0, %xmm0
 	vpxor %xmm6, %xmm4, %xmm4
 
-	vmovdqa (\i-2)*16(%rcx), %xmm3
-	vmovdqa (\i-1)*16(%rcx), %xmm7
-	vpaddd (\i-16)*16(%rcx), %xmm0, %xmm0
-	vpaddd (\i-15)*16(%rcx), %xmm4, %xmm4
+	vmovdqa (\i-2)*16(%rax), %xmm3
+	vmovdqa (\i-1)*16(%rax), %xmm7
+	vpaddd (\i-16)*16(%rax), %xmm0, %xmm0
+	vpaddd (\i-15)*16(%rax), %xmm4, %xmm4
 
 	vprotd $15, %xmm3, %xmm1
 	vprotd $15, %xmm7, %xmm5
@@ -550,8 +557,8 @@ sha256_avx_main_loop:
 	vpxor %xmm1, %xmm2, %xmm2
 	vpxor %xmm5, %xmm6, %xmm6
 
-	vpaddd (\i-7)*16(%rcx), %xmm0, %xmm0
-	vpaddd (\i-6)*16(%rcx), %xmm4, %xmm4
+	vpaddd (\i-7)*16(%rax), %xmm0, %xmm0
+	vpaddd (\i-6)*16(%rax), %xmm4, %xmm4
 
 	vpsrld $10, %xmm3, %xmm3
 	vpsrld $10, %xmm7, %xmm7
@@ -560,14 +567,15 @@ sha256_avx_main_loop:
 
 	vpaddd %xmm3, %xmm0, %xmm0
 	vpaddd %xmm7, %xmm4, %xmm4
-	vmovdqa %xmm0, \i*16(%rcx)
-	vmovdqa %xmm4, (\i+1)*16(%rcx)
+	vmovdqa %xmm0, \i*16(%rax)
+	vmovdqa %xmm4, (\i+1)*16(%rax)
 .endm
 
 .text
 .p2align 6
 sha256_xop_extend_loop:
 	sha256_xop_extend_round 0
+sha256_xop_extend_loop_pre:
 	sha256_xop_extend_round 2
 	sha256_xop_extend_round 4
 	sha256_xop_extend_round 6
@@ -636,7 +644,11 @@ sha256_xop_extend_loop:
 .text
 .p2align 6
 sha256_xop_main_loop:
-	sha256_xop_main_quadround 0
+	sha256_xop_main_round 0, %xmm10, %xmm9, %xmm8, %xmm0, %xmm3, %xmm4, %xmm5, %xmm7
+	sha256_xop_main_round 1, %xmm9, %xmm8, %xmm0, %xmm10, %xmm4, %xmm5, %xmm7, %xmm3
+	sha256_xop_main_round 2, %xmm8, %xmm0, %xmm10, %xmm9, %xmm5, %xmm7, %xmm3, %xmm4
+sha256_xop_main_loop_pre:
+	sha256_xop_main_round 3, %xmm0, %xmm10, %xmm9, %xmm8, %xmm7, %xmm3, %xmm4, %xmm5
 	sha256_xop_main_quadround 4
 	sha256_xop_main_quadround 8
 	sha256_xop_main_quadround 12
@@ -810,11 +822,12 @@ sha256_transform_4way_sse2_main_loop:
 	jne sha256_transform_4way_sse2_main_loop
 	jmp sha256_transform_4way_finish
 
+
 #if defined(USE_AVX)
 .text
 .p2align 6
 sha256_transform_4way_core_avx:
-	leaq 256(%rsp), %rcx
+	leaq 256(%rsp), %rax
 	call sha256_avx_extend_loop
 	movdqu 0(%rdi), %xmm7
 	movdqu 16(%rdi), %xmm5
@@ -830,11 +843,12 @@ sha256_transform_4way_core_avx:
 	jmp sha256_transform_4way_finish
 #endif /* USE_AVX */
 
+
 #if defined(USE_XOP)
 .text
 .p2align 6
 sha256_transform_4way_core_xop:
-	leaq 256(%rsp), %rcx
+	leaq 256(%rsp), %rax
 	call sha256_xop_extend_loop
 	movdqu 0(%rdi), %xmm7
 	movdqu 16(%rdi), %xmm5
@@ -1019,24 +1033,25 @@ sha256d_4way_sse2:
 	movq %rcx, %rdi
 	movq %rdx, %rsi
 	movq %r8, %rdx
+	movq %r9, %rcx
 #endif
 	subq $1032, %rsp
 
-	leaq 256(%rsi), %rcx
-	call sha256_sse2_extend_loop
+	leaq 256(%rsi), %rax
+	call sha256_sse2_extend_loop_pre
 
-	movdqa 0(%rdx), %xmm7
-	movdqa 16(%rdx), %xmm5
-	movdqa 32(%rdx), %xmm4
-	movdqa 48(%rdx), %xmm3
-	movdqa 64(%rdx), %xmm0
-	movdqa 80(%rdx), %xmm8
-	movdqa 96(%rdx), %xmm9
-	movdqa 112(%rdx), %xmm10
+	movdqa 0(%rcx), %xmm3
+	movdqa 16(%rcx), %xmm0
+	movdqa 32(%rcx), %xmm8
+	movdqa 48(%rcx), %xmm9
+	movdqa 64(%rcx), %xmm10
+	movdqa 80(%rcx), %xmm7
+	movdqa 96(%rcx), %xmm5
+	movdqa 112(%rcx), %xmm4
 
 	movq %rsi, %rax
 	leaq sha256_4k(%rip), %rcx
-	call sha256_sse2_main_loop
+	call sha256_sse2_main_loop_pre
 
 	paddd 0(%rdx), %xmm7
 	paddd 16(%rdx), %xmm5
@@ -1070,7 +1085,7 @@ sha256d_4way_sse2:
 	movdqa %xmm0, 224(%rsp)
 	movdqa %xmm1, 240(%rsp)
 
-	leaq 256(%rsp), %rcx
+	leaq 256(%rsp), %rax
 	call sha256_sse2_extend_loop
 
 	movdqa sha256_4h+0(%rip), %xmm7
@@ -1134,24 +1149,25 @@ sha256d_4way_avx:
 	movq %rcx, %rdi
 	movq %rdx, %rsi
 	movq %r8, %rdx
+	movq %r9, %rcx
 #endif
 	subq $1032, %rsp
 
-	leaq 256(%rsi), %rcx
-	call sha256_avx_extend_loop
+	leaq 256(%rsi), %rax
+	call sha256_avx_extend_loop_pre
 
-	movdqa 0(%rdx), %xmm7
-	movdqa 16(%rdx), %xmm5
-	movdqa 32(%rdx), %xmm4
-	movdqa 48(%rdx), %xmm3
-	movdqa 64(%rdx), %xmm0
-	movdqa 80(%rdx), %xmm8
-	movdqa 96(%rdx), %xmm9
-	movdqa 112(%rdx), %xmm10
+	movdqa 0(%rcx), %xmm7
+	movdqa 16(%rcx), %xmm8
+	movdqa 32(%rcx), %xmm9
+	movdqa 48(%rcx), %xmm10
+	movdqa 64(%rcx), %xmm0
+	movdqa 80(%rcx), %xmm5
+	movdqa 96(%rcx), %xmm4
+	movdqa 112(%rcx), %xmm3
 
 	movq %rsi, %rax
 	leaq sha256_4k(%rip), %rcx
-	call sha256_avx_main_loop
+	call sha256_avx_main_loop_pre
 
 	paddd 0(%rdx), %xmm7
 	paddd 16(%rdx), %xmm5
@@ -1185,7 +1201,7 @@ sha256d_4way_avx:
 	movdqa %xmm0, 224(%rsp)
 	movdqa %xmm1, 240(%rsp)
 
-	leaq 256(%rsp), %rcx
+	leaq 256(%rsp), %rax
 	call sha256_avx_extend_loop
 
 	movdqa sha256_4h+0(%rip), %xmm7
@@ -1251,24 +1267,25 @@ sha256d_4way_xop:
 	movq %rcx, %rdi
 	movq %rdx, %rsi
 	movq %r8, %rdx
+	movq %r9, %rcx
 #endif
 	subq $1032, %rsp
 
-	leaq 256(%rsi), %rcx
-	call sha256_xop_extend_loop
+	leaq 256(%rsi), %rax
+	call sha256_xop_extend_loop_pre
 
-	movdqa 0(%rdx), %xmm7
-	movdqa 16(%rdx), %xmm5
-	movdqa 32(%rdx), %xmm4
-	movdqa 48(%rdx), %xmm3
-	movdqa 64(%rdx), %xmm0
-	movdqa 80(%rdx), %xmm8
-	movdqa 96(%rdx), %xmm9
-	movdqa 112(%rdx), %xmm10
+	movdqa 0(%rcx), %xmm7
+	movdqa 16(%rcx), %xmm8
+	movdqa 32(%rcx), %xmm9
+	movdqa 48(%rcx), %xmm10
+	movdqa 64(%rcx), %xmm0
+	movdqa 80(%rcx), %xmm5
+	movdqa 96(%rcx), %xmm4
+	movdqa 112(%rcx), %xmm3
 
 	movq %rsi, %rax
 	leaq sha256_4k(%rip), %rcx
-	call sha256_xop_main_loop
+	call sha256_xop_main_loop_pre
 
 	paddd 0(%rdx), %xmm7
 	paddd 16(%rdx), %xmm5
@@ -1302,7 +1319,7 @@ sha256d_4way_xop:
 	movdqa %xmm0, 224(%rsp)
 	movdqa %xmm1, 240(%rsp)
 
-	leaq 256(%rsp), %rcx
+	leaq 256(%rsp), %rax
 	call sha256_xop_extend_loop
 
 	movdqa sha256_4h+0(%rip), %xmm7
--- a/sha2.c
+++ b/sha2.c
@@ -164,12 +164,6 @@ void sha256_transform(uint32_t *state, const uint32_t *block, int swap)
 		state[i] += S[i];
 }
 
-#ifdef HAVE_SHA256_4WAY
-#define SHA256D_MAX_WAYS 4
-void sha256d_4way(uint32_t *hash, uint32_t *data, const uint32_t *midstate);
-#else
-#define SHA256D_MAX_WAYS 1
-#endif
 
 static const uint32_t sha256d_hash1[16] = {
 	0x00000000, 0x00000000, 0x00000000, 0x00000000,
@@ -178,23 +172,64 @@ static const uint32_t sha256d_hash1[16] = {
 	0x00000000, 0x00000000, 0x00000000, 0x00000100
 };
 
+static inline void sha256d_preextend(uint32_t *W)
+{
+	W[16] = s1(W[14]) + W[ 9] + s0(W[ 1]) + W[ 0];
+	W[17] = s1(W[15]) + W[10] + s0(W[ 2]) + W[ 1];
+	W[18] = s1(W[16]) + W[11] + W[ 2];
+	W[19] = s1(W[17]) + W[12] + s0(W[ 4]);
+	W[20] = W[13] + s0(W[ 5]) + W[ 4];
+	W[21] = W[14] + s0(W[ 6]) + W[ 5];
+	W[22] = W[15] + s0(W[ 7]) + W[ 6];
+	W[23] = W[16] + s0(W[ 8]) + W[ 7];
+	W[24] = W[17] + s0(W[ 9]) + W[ 8];
+	W[25] = s0(W[10]) + W[ 9];
+	W[26] = s0(W[11]) + W[10];
+	W[27] = s0(W[12]) + W[11];
+	W[28] = s0(W[13]) + W[12];
+	W[29] = s0(W[14]) + W[13];
+	W[30] = s0(W[15]) + W[14];
+	W[31] = s0(W[16]) + W[15];
+}
+
+static inline void sha256d_prehash(uint32_t *S, const uint32_t *W)
+{
+	uint32_t t0, t1;
+	RNDr(S, W, 0);
+	RNDr(S, W, 1);
+	RNDr(S, W, 2);
+}
+
 static inline void sha256d(uint32_t *hash, uint32_t *W,
-	const uint32_t *midstate)
+	const uint32_t *midstate, const uint32_t *prehash)
 {
 	uint32_t S[64];
+	uint32_t E[14];
 	uint32_t t0, t1;
 	int i;
 
-	for (i = 16; i < 64; i += 2) {
+	memcpy(E, W + 18, sizeof(E));
+	W[18] += s0(W[3]);
+	W[19] += W[3];
+	W[20] += s1(W[18]);
+	W[21] += s1(W[19]);
+	W[22] += s1(W[20]);
+	W[23] += s1(W[21]);
+	W[24] += s1(W[22]);
+	W[25] += s1(W[23]) + W[18];
+	W[26] += s1(W[24]) + W[19];
+	W[27] += s1(W[25]) + W[20];
+	W[28] += s1(W[26]) + W[21];
+	W[29] += s1(W[27]) + W[22];
+	W[30] += s1(W[28]) + W[23];
+	W[31] += s1(W[29]) + W[24];
+	for (i = 32; i < 64; i += 2) {
 		W[i] = s1(W[i - 2]) + W[i - 7] + s0(W[i - 15]) + W[i - 16];
 		W[i+1] = s1(W[i - 1]) + W[i - 6] + s0(W[i - 14]) + W[i - 15];
 	}
 
-	memcpy(S, midstate, 32);
+	memcpy(S, prehash, 32);
 
-	RNDr(S, W, 0);
-	RNDr(S, W, 1);
-	RNDr(S, W, 2);
 	RNDr(S, W, 3);
 	RNDr(S, W, 4);
 	RNDr(S, W, 5);
@@ -260,6 +295,8 @@ static inline void sha256d(uint32_t *hash, uint32_t *W,
 	for (i = 0; i < 8; i++)
 		S[i] += midstate[i];
 
+	memcpy(W + 18, E, sizeof(E));
+
 	memcpy(S + 8, sha256d_hash1 + 8, 32);
 	for (i = 16; i < 64; i += 2) {
 		S[i] = s1(S[i - 2]) + S[i - 7] + s0(S[i - 15]) + S[i - 16];
@@ -337,12 +374,21 @@ static inline void sha256d(uint32_t *hash, uint32_t *W,
 		hash[i] += sha256_h[i];
 }
 
+#ifdef HAVE_SHA256_4WAY
+#define SHA256D_MAX_WAYS 4
+void sha256d_4way(uint32_t *hash, uint32_t *data,
+	const uint32_t *midstate, const uint32_t *prehash);
+#else
+#define SHA256D_MAX_WAYS 1
+#endif
+
 int scanhash_sha256d(int thr_id, uint32_t *pdata, const uint32_t *ptarget,
 	uint32_t max_nonce, unsigned long *hashes_done)
 {
 	uint32_t data[SHA256D_MAX_WAYS * 64] __attribute__((aligned(128)));
 	uint32_t hash[SHA256D_MAX_WAYS * 8] __attribute__((aligned(32)));
 	uint32_t midstate[SHA256D_MAX_WAYS * 8] __attribute__((aligned(32)));
+	uint32_t prehash[SHA256D_MAX_WAYS * 8] __attribute__((aligned(32)));
 	uint32_t n = pdata[19] - 1;
 	const uint32_t Htarg = ptarget[7];
 #ifdef HAVE_SHA256_4WAY
@@ -352,15 +398,22 @@ int scanhash_sha256d(int thr_id, uint32_t *pdata, const uint32_t *ptarget,
 #endif
 	int i, j;
 
-	for (i = 15; i >= 0; i--)
+	memcpy(data, pdata + 16, 64);
+	sha256d_preextend(data);
+	for (i = 31; i >= 0; i--)
 		for (j = 0; j < ways; j++)
-			data[i * ways + j] = pdata[16 + i];
+			data[i * ways + j] = data[i];
 
 	sha256_init(midstate);
 	sha256_transform(midstate, pdata, 0);
-	for (i = 7; i >= 0; i--)
-		for (j = 0; j < ways; j++)
+	memcpy(prehash, midstate, 32);
+	sha256d_prehash(prehash, pdata + 16);
+	for (i = 7; i >= 0; i--) {
+		for (j = 0; j < ways; j++) {
 			midstate[i * ways + j] = midstate[i];
+			prehash[i * ways + j] = prehash[i];
+		}
+	}
 
 #ifdef HAVE_SHA256_4WAY
 	if (ways == 4)
@@ -368,7 +421,7 @@ int scanhash_sha256d(int thr_id, uint32_t *pdata, const uint32_t *ptarget,
 		for (i = 0; i < 4; i++)
 			data[4 * 3 + i] = ++n;
 
-		sha256d_4way(hash, data, midstate);
+		sha256d_4way(hash, data, midstate, prehash);
 
 		for (i = 0; i < 4; i++) {
 			if (hash[4 * 7 + i] <= Htarg) {
@@ -386,12 +439,12 @@ int scanhash_sha256d(int thr_id, uint32_t *pdata, const uint32_t *ptarget,
 	else
 #endif
 	do {
-		data[3 + i] = ++n;
-		sha256d(hash, data, midstate);
-		if (hash[7 + i] <= Htarg) {
+		data[3] = ++n;
+		sha256d(hash, data, midstate, prehash);
+		if (hash[7] <= Htarg) {
 			if (fulltest(hash, ptarget)) {
 				*hashes_done = n - pdata[19] + 1;
-				pdata[19] = data[3 + i];
+				pdata[19] = data[3];
 				return 1;
 			}
 		}