diff --git a/sha2-x64.S b/sha2-x64.S index c4f3655..3c83762 100644 --- a/sha2-x64.S +++ b/sha2-x64.S @@ -95,6 +95,7 @@ sha256_4k: .long 0xbef9a3f7, 0xbef9a3f7, 0xbef9a3f7, 0xbef9a3f7 .long 0xc67178f2, 0xc67178f2, 0xc67178f2, 0xc67178f2 + .text .p2align 6 .globl sha256_init_4way @@ -125,9 +126,43 @@ _sha256_init_4way: popq %rdi #endif ret - + .macro sha256_sse2_extend_round i + movdqa (\i-15)*16(%rax), %xmm0 + movdqa %xmm0, %xmm2 + psrld $3, %xmm0 + movdqa %xmm0, %xmm1 + pslld $14, %xmm2 + psrld $4, %xmm1 + pxor %xmm1, %xmm0 + pxor %xmm2, %xmm0 + psrld $11, %xmm1 + pslld $11, %xmm2 + pxor %xmm1, %xmm0 + pxor %xmm2, %xmm0 + + movdqa (\i-2)*16(%rax), %xmm3 + movdqa %xmm3, %xmm2 + + paddd (\i-16)*16(%rax), %xmm0 + paddd (\i-7)*16(%rax), %xmm0 + + psrld $10, %xmm3 + movdqa %xmm3, %xmm1 + pslld $13, %xmm2 + psrld $7, %xmm1 + pxor %xmm1, %xmm3 + pxor %xmm2, %xmm3 + psrld $2, %xmm1 + pslld $2, %xmm2 + pxor %xmm1, %xmm3 + pxor %xmm2, %xmm3 + paddd %xmm3, %xmm0 + movdqa %xmm0, \i*16(%rax) +.endm + +.macro sha256_sse2_extend_doubleround i movdqa (\i-15)*16(%rax), %xmm0 movdqa (\i-14)*16(%rax), %xmm4 movdqa %xmm0, %xmm2 @@ -193,36 +228,6 @@ _sha256_init_4way: movdqa %xmm4, (\i+1)*16(%rax) .endm - .text - .p2align 6 -sha256_sse2_extend_loop: - sha256_sse2_extend_round 0 -sha256_sse2_extend_loop_pre: - sha256_sse2_extend_round 2 - sha256_sse2_extend_round 4 - sha256_sse2_extend_round 6 - sha256_sse2_extend_round 8 - sha256_sse2_extend_round 10 - sha256_sse2_extend_round 12 - sha256_sse2_extend_round 14 - sha256_sse2_extend_round 16 - sha256_sse2_extend_round 18 - sha256_sse2_extend_round 20 - sha256_sse2_extend_round 22 - sha256_sse2_extend_round 24 - sha256_sse2_extend_round 26 - sha256_sse2_extend_round 28 - sha256_sse2_extend_round 30 - sha256_sse2_extend_round 32 - sha256_sse2_extend_round 34 - sha256_sse2_extend_round 36 - sha256_sse2_extend_round 38 - sha256_sse2_extend_round 40 - sha256_sse2_extend_round 42 - sha256_sse2_extend_round 44 - sha256_sse2_extend_round 46 - ret - .macro sha256_sse2_main_round i movdqa 16*\i(%rax), %xmm6 paddd 16*\i(%rcx), %xmm6 @@ -288,80 +293,39 @@ sha256_sse2_extend_loop_pre: paddd %xmm6, %xmm7 .endm - .text - .p2align 6 -sha256_sse2_main_loop: - sha256_sse2_main_round 0 - sha256_sse2_main_round 1 - sha256_sse2_main_round 2 -sha256_sse2_main_loop_pre: - sha256_sse2_main_round 3 - sha256_sse2_main_round 4 - sha256_sse2_main_round 5 - sha256_sse2_main_round 6 - sha256_sse2_main_round 7 - sha256_sse2_main_round 8 - sha256_sse2_main_round 9 - sha256_sse2_main_round 10 - sha256_sse2_main_round 11 - sha256_sse2_main_round 12 - sha256_sse2_main_round 13 - sha256_sse2_main_round 14 - sha256_sse2_main_round 15 - sha256_sse2_main_round 16 - sha256_sse2_main_round 17 - sha256_sse2_main_round 18 - sha256_sse2_main_round 19 - sha256_sse2_main_round 20 - sha256_sse2_main_round 21 - sha256_sse2_main_round 22 - sha256_sse2_main_round 23 - sha256_sse2_main_round 24 - sha256_sse2_main_round 25 - sha256_sse2_main_round 26 - sha256_sse2_main_round 27 - sha256_sse2_main_round 28 - sha256_sse2_main_round 29 - sha256_sse2_main_round 30 - sha256_sse2_main_round 31 - sha256_sse2_main_round 32 - sha256_sse2_main_round 33 - sha256_sse2_main_round 34 - sha256_sse2_main_round 35 - sha256_sse2_main_round 36 - sha256_sse2_main_round 37 - sha256_sse2_main_round 38 - sha256_sse2_main_round 39 - sha256_sse2_main_round 40 - sha256_sse2_main_round 41 - sha256_sse2_main_round 42 - sha256_sse2_main_round 43 - sha256_sse2_main_round 44 - sha256_sse2_main_round 45 - sha256_sse2_main_round 46 - sha256_sse2_main_round 47 - sha256_sse2_main_round 48 - sha256_sse2_main_round 49 - sha256_sse2_main_round 50 - sha256_sse2_main_round 51 - sha256_sse2_main_round 52 - sha256_sse2_main_round 53 - sha256_sse2_main_round 54 - sha256_sse2_main_round 55 - sha256_sse2_main_round 56 - sha256_sse2_main_round 57 - sha256_sse2_main_round 58 - sha256_sse2_main_round 59 - sha256_sse2_main_round 60 - sha256_sse2_main_round 61 - sha256_sse2_main_round 62 - sha256_sse2_main_round 63 - ret - -#if defined(USE_AVX) +#if defined(USE_AVX) .macro sha256_avx_extend_round i + vmovdqa (\i-15)*16(%rax), %xmm0 + vpslld $14, %xmm0, %xmm2 + vpsrld $3, %xmm0, %xmm0 + vpsrld $4, %xmm0, %xmm1 + vpxor %xmm1, %xmm0, %xmm0 + vpxor %xmm2, %xmm0, %xmm0 + vpsrld $11, %xmm1, %xmm1 + vpslld $11, %xmm2, %xmm2 + vpxor %xmm1, %xmm0, %xmm0 + vpxor %xmm2, %xmm0, %xmm0 + + vmovdqa (\i-2)*16(%rax), %xmm3 + vpaddd (\i-16)*16(%rax), %xmm0, %xmm0 + vpaddd (\i-7)*16(%rax), %xmm0, %xmm0 + + vpslld $13, %xmm3, %xmm2 + vpsrld $10, %xmm3, %xmm3 + vpsrld $7, %xmm3, %xmm1 + vpxor %xmm1, %xmm3, %xmm3 + vpxor %xmm2, %xmm3, %xmm3 + vpsrld $2, %xmm1, %xmm1 + vpslld $2, %xmm2, %xmm2 + vpxor %xmm1, %xmm3, %xmm3 + vpxor %xmm2, %xmm3, %xmm3 + vpaddd %xmm3, %xmm0, %xmm0 + vmovdqa %xmm0, \i*16(%rax) +.endm + +.macro sha256_avx_extend_doubleround i vmovdqa (\i-15)*16(%rax), %xmm0 vmovdqa (\i-14)*16(%rax), %xmm4 vpslld $14, %xmm0, %xmm2 @@ -419,36 +383,6 @@ sha256_sse2_main_loop_pre: vmovdqa %xmm4, (\i+1)*16(%rax) .endm - .text - .p2align 6 -sha256_avx_extend_loop: - sha256_avx_extend_round 0 -sha256_avx_extend_loop_pre: - sha256_avx_extend_round 2 - sha256_avx_extend_round 4 - sha256_avx_extend_round 6 - sha256_avx_extend_round 8 - sha256_avx_extend_round 10 - sha256_avx_extend_round 12 - sha256_avx_extend_round 14 - sha256_avx_extend_round 16 - sha256_avx_extend_round 18 - sha256_avx_extend_round 20 - sha256_avx_extend_round 22 - sha256_avx_extend_round 24 - sha256_avx_extend_round 26 - sha256_avx_extend_round 28 - sha256_avx_extend_round 30 - sha256_avx_extend_round 32 - sha256_avx_extend_round 34 - sha256_avx_extend_round 36 - sha256_avx_extend_round 38 - sha256_avx_extend_round 40 - sha256_avx_extend_round 42 - sha256_avx_extend_round 44 - sha256_avx_extend_round 46 - ret - .macro sha256_avx_main_round i, r0, r1, r2, r3, r4, r5, r6, r7 vpaddd 16*(\i)(%rax), \r0, %xmm6 vpaddd 16*(\i)(%rcx), %xmm6, %xmm6 @@ -501,37 +435,33 @@ sha256_avx_extend_loop_pre: sha256_avx_main_round \i+3, %xmm0, %xmm10, %xmm9, %xmm8, %xmm7, %xmm3, %xmm4, %xmm5 .endm - .text - .p2align 6 -sha256_avx_main_loop: - sha256_avx_main_round 0, %xmm10, %xmm9, %xmm8, %xmm0, %xmm3, %xmm4, %xmm5, %xmm7 - sha256_avx_main_round 1, %xmm9, %xmm8, %xmm0, %xmm10, %xmm4, %xmm5, %xmm7, %xmm3 - sha256_avx_main_round 2, %xmm8, %xmm0, %xmm10, %xmm9, %xmm5, %xmm7, %xmm3, %xmm4 -sha256_avx_main_loop_pre: - sha256_avx_main_round 3, %xmm0, %xmm10, %xmm9, %xmm8, %xmm7, %xmm3, %xmm4, %xmm5 - sha256_avx_main_quadround 4 - sha256_avx_main_quadround 8 - sha256_avx_main_quadround 12 - sha256_avx_main_quadround 16 - sha256_avx_main_quadround 20 - sha256_avx_main_quadround 24 - sha256_avx_main_quadround 28 - sha256_avx_main_quadround 32 - sha256_avx_main_quadround 36 - sha256_avx_main_quadround 40 - sha256_avx_main_quadround 44 - sha256_avx_main_quadround 48 - sha256_avx_main_quadround 52 - sha256_avx_main_quadround 56 - sha256_avx_main_quadround 60 - ret - #endif /* USE_AVX */ #if defined(USE_XOP) .macro sha256_xop_extend_round i + vmovdqa (\i-15)*16(%rax), %xmm0 + vprotd $25, %xmm0, %xmm1 + vprotd $14, %xmm0, %xmm2 + vpsrld $3, %xmm0, %xmm0 + vpxor %xmm1, %xmm2, %xmm2 + vpxor %xmm2, %xmm0, %xmm0 + + vmovdqa (\i-2)*16(%rax), %xmm3 + vpaddd (\i-16)*16(%rax), %xmm0, %xmm0 + vpaddd (\i-7)*16(%rax), %xmm0, %xmm0 + + vprotd $15, %xmm3, %xmm1 + vprotd $13, %xmm3, %xmm2 + vpsrld $10, %xmm3, %xmm3 + vpxor %xmm1, %xmm2, %xmm2 + vpxor %xmm2, %xmm3, %xmm3 + vpaddd %xmm3, %xmm0, %xmm0 + vmovdqa %xmm0, \i*16(%rax) +.endm + +.macro sha256_xop_extend_doubleround i vmovdqa (\i-15)*16(%rax), %xmm0 vmovdqa (\i-14)*16(%rax), %xmm4 vprotd $25, %xmm0, %xmm1 @@ -570,36 +500,6 @@ sha256_avx_main_loop_pre: vmovdqa %xmm0, \i*16(%rax) vmovdqa %xmm4, (\i+1)*16(%rax) .endm - - .text - .p2align 6 -sha256_xop_extend_loop: - sha256_xop_extend_round 0 -sha256_xop_extend_loop_pre: - sha256_xop_extend_round 2 - sha256_xop_extend_round 4 - sha256_xop_extend_round 6 - sha256_xop_extend_round 8 - sha256_xop_extend_round 10 - sha256_xop_extend_round 12 - sha256_xop_extend_round 14 - sha256_xop_extend_round 16 - sha256_xop_extend_round 18 - sha256_xop_extend_round 20 - sha256_xop_extend_round 22 - sha256_xop_extend_round 24 - sha256_xop_extend_round 26 - sha256_xop_extend_round 28 - sha256_xop_extend_round 30 - sha256_xop_extend_round 32 - sha256_xop_extend_round 34 - sha256_xop_extend_round 36 - sha256_xop_extend_round 38 - sha256_xop_extend_round 40 - sha256_xop_extend_round 42 - sha256_xop_extend_round 44 - sha256_xop_extend_round 46 - ret .macro sha256_xop_main_round i, r0, r1, r2, r3, r4, r5, r6, r7 vpaddd 16*(\i)(%rax), \r0, %xmm6 @@ -641,31 +541,6 @@ sha256_xop_extend_loop_pre: sha256_xop_main_round \i+3, %xmm0, %xmm10, %xmm9, %xmm8, %xmm7, %xmm3, %xmm4, %xmm5 .endm - .text - .p2align 6 -sha256_xop_main_loop: - sha256_xop_main_round 0, %xmm10, %xmm9, %xmm8, %xmm0, %xmm3, %xmm4, %xmm5, %xmm7 - sha256_xop_main_round 1, %xmm9, %xmm8, %xmm0, %xmm10, %xmm4, %xmm5, %xmm7, %xmm3 - sha256_xop_main_round 2, %xmm8, %xmm0, %xmm10, %xmm9, %xmm5, %xmm7, %xmm3, %xmm4 -sha256_xop_main_loop_pre: - sha256_xop_main_round 3, %xmm0, %xmm10, %xmm9, %xmm8, %xmm7, %xmm3, %xmm4, %xmm5 - sha256_xop_main_quadround 4 - sha256_xop_main_quadround 8 - sha256_xop_main_quadround 12 - sha256_xop_main_quadround 16 - sha256_xop_main_quadround 20 - sha256_xop_main_quadround 24 - sha256_xop_main_quadround 28 - sha256_xop_main_quadround 32 - sha256_xop_main_quadround 36 - sha256_xop_main_quadround 40 - sha256_xop_main_quadround 44 - sha256_xop_main_quadround 48 - sha256_xop_main_quadround 52 - sha256_xop_main_quadround 56 - sha256_xop_main_quadround 60 - ret - #endif /* USE_XOP */ @@ -828,7 +703,30 @@ sha256_transform_4way_sse2_main_loop: .p2align 6 sha256_transform_4way_core_avx: leaq 256(%rsp), %rax - call sha256_avx_extend_loop + sha256_avx_extend_doubleround 0 + sha256_avx_extend_doubleround 2 + sha256_avx_extend_doubleround 4 + sha256_avx_extend_doubleround 6 + sha256_avx_extend_doubleround 8 + sha256_avx_extend_doubleround 10 + sha256_avx_extend_doubleround 12 + sha256_avx_extend_doubleround 14 + sha256_avx_extend_doubleround 16 + sha256_avx_extend_doubleround 18 + sha256_avx_extend_doubleround 20 + sha256_avx_extend_doubleround 22 + sha256_avx_extend_doubleround 24 + sha256_avx_extend_doubleround 26 + sha256_avx_extend_doubleround 28 + sha256_avx_extend_doubleround 30 + sha256_avx_extend_doubleround 32 + sha256_avx_extend_doubleround 34 + sha256_avx_extend_doubleround 36 + sha256_avx_extend_doubleround 38 + sha256_avx_extend_doubleround 40 + sha256_avx_extend_doubleround 42 + sha256_avx_extend_doubleround 44 + sha256_avx_extend_doubleround 46 movdqu 0(%rdi), %xmm7 movdqu 16(%rdi), %xmm5 movdqu 32(%rdi), %xmm4 @@ -839,7 +737,22 @@ sha256_transform_4way_core_avx: movdqu 112(%rdi), %xmm10 movq %rsp, %rax leaq sha256_4k(%rip), %rcx - call sha256_avx_main_loop + sha256_avx_main_quadround 0 + sha256_avx_main_quadround 4 + sha256_avx_main_quadround 8 + sha256_avx_main_quadround 12 + sha256_avx_main_quadround 16 + sha256_avx_main_quadround 20 + sha256_avx_main_quadround 24 + sha256_avx_main_quadround 28 + sha256_avx_main_quadround 32 + sha256_avx_main_quadround 36 + sha256_avx_main_quadround 40 + sha256_avx_main_quadround 44 + sha256_avx_main_quadround 48 + sha256_avx_main_quadround 52 + sha256_avx_main_quadround 56 + sha256_avx_main_quadround 60 jmp sha256_transform_4way_finish #endif /* USE_AVX */ @@ -849,7 +762,30 @@ sha256_transform_4way_core_avx: .p2align 6 sha256_transform_4way_core_xop: leaq 256(%rsp), %rax - call sha256_xop_extend_loop + sha256_xop_extend_doubleround 0 + sha256_xop_extend_doubleround 2 + sha256_xop_extend_doubleround 4 + sha256_xop_extend_doubleround 6 + sha256_xop_extend_doubleround 8 + sha256_xop_extend_doubleround 10 + sha256_xop_extend_doubleround 12 + sha256_xop_extend_doubleround 14 + sha256_xop_extend_doubleround 16 + sha256_xop_extend_doubleround 18 + sha256_xop_extend_doubleround 20 + sha256_xop_extend_doubleround 22 + sha256_xop_extend_doubleround 24 + sha256_xop_extend_doubleround 26 + sha256_xop_extend_doubleround 28 + sha256_xop_extend_doubleround 30 + sha256_xop_extend_doubleround 32 + sha256_xop_extend_doubleround 34 + sha256_xop_extend_doubleround 36 + sha256_xop_extend_doubleround 38 + sha256_xop_extend_doubleround 40 + sha256_xop_extend_doubleround 42 + sha256_xop_extend_doubleround 44 + sha256_xop_extend_doubleround 46 movdqu 0(%rdi), %xmm7 movdqu 16(%rdi), %xmm5 movdqu 32(%rdi), %xmm4 @@ -860,7 +796,22 @@ sha256_transform_4way_core_xop: movdqu 112(%rdi), %xmm10 movq %rsp, %rax leaq sha256_4k(%rip), %rcx - call sha256_xop_main_loop + sha256_xop_main_quadround 0 + sha256_xop_main_quadround 4 + sha256_xop_main_quadround 8 + sha256_xop_main_quadround 12 + sha256_xop_main_quadround 16 + sha256_xop_main_quadround 20 + sha256_xop_main_quadround 24 + sha256_xop_main_quadround 28 + sha256_xop_main_quadround 32 + sha256_xop_main_quadround 36 + sha256_xop_main_quadround 40 + sha256_xop_main_quadround 44 + sha256_xop_main_quadround 48 + sha256_xop_main_quadround 52 + sha256_xop_main_quadround 56 + sha256_xop_main_quadround 60 jmp sha256_transform_4way_finish #endif /* USE_XOP */ @@ -1007,20 +958,20 @@ sha256_transform_4way_finish: .data .p2align 3 -sha256d_4way_addr: +sha256d_ms_4way_addr: .quad 0x0 .text .p2align 6 - .globl sha256d_4way - .globl _sha256d_4way -sha256d_4way: -_sha256d_4way: - jmp *sha256d_4way_addr(%rip) + .globl sha256d_ms_4way + .globl _sha256d_ms_4way +sha256d_ms_4way: +_sha256d_ms_4way: + jmp *sha256d_ms_4way_addr(%rip) .p2align 6 -sha256d_4way_sse2: +sha256d_ms_4way_sse2: #if defined(WIN64) pushq %rdi subq $80, %rsp @@ -1038,7 +989,35 @@ sha256d_4way_sse2: subq $1032, %rsp leaq 256(%rsi), %rax - call sha256_sse2_extend_loop_pre + jmp sha256d_ms_4way_sse2_extend_loop1 + +sha256d_ms_4way_sse2_extend_loop2: + sha256_sse2_extend_doubleround 0 +sha256d_ms_4way_sse2_extend_loop1: + sha256_sse2_extend_doubleround 2 + sha256_sse2_extend_doubleround 4 + sha256_sse2_extend_doubleround 6 + sha256_sse2_extend_doubleround 8 + sha256_sse2_extend_doubleround 10 + sha256_sse2_extend_doubleround 12 + sha256_sse2_extend_doubleround 14 + sha256_sse2_extend_doubleround 16 + sha256_sse2_extend_doubleround 18 + sha256_sse2_extend_doubleround 20 + sha256_sse2_extend_doubleround 22 + sha256_sse2_extend_doubleround 24 + sha256_sse2_extend_doubleround 26 + sha256_sse2_extend_doubleround 28 + sha256_sse2_extend_doubleround 30 + sha256_sse2_extend_doubleround 32 + sha256_sse2_extend_doubleround 34 + sha256_sse2_extend_doubleround 36 + sha256_sse2_extend_doubleround 38 + sha256_sse2_extend_doubleround 40 + sha256_sse2_extend_doubleround 42 + jz sha256d_ms_4way_sse2_extend_coda2 + sha256_sse2_extend_doubleround 44 + sha256_sse2_extend_doubleround 46 movdqa 0(%rcx), %xmm3 movdqa 16(%rcx), %xmm0 @@ -1051,7 +1030,75 @@ sha256d_4way_sse2: movq %rsi, %rax leaq sha256_4k(%rip), %rcx - call sha256_sse2_main_loop_pre + jmp sha256d_ms_4way_sse2_main_loop1 + +sha256d_ms_4way_sse2_main_loop2: + sha256_sse2_main_round 0 + sha256_sse2_main_round 1 + sha256_sse2_main_round 2 +sha256d_ms_4way_sse2_main_loop1: + sha256_sse2_main_round 3 + sha256_sse2_main_round 4 + sha256_sse2_main_round 5 + sha256_sse2_main_round 6 + sha256_sse2_main_round 7 + sha256_sse2_main_round 8 + sha256_sse2_main_round 9 + sha256_sse2_main_round 10 + sha256_sse2_main_round 11 + sha256_sse2_main_round 12 + sha256_sse2_main_round 13 + sha256_sse2_main_round 14 + sha256_sse2_main_round 15 + sha256_sse2_main_round 16 + sha256_sse2_main_round 17 + sha256_sse2_main_round 18 + sha256_sse2_main_round 19 + sha256_sse2_main_round 20 + sha256_sse2_main_round 21 + sha256_sse2_main_round 22 + sha256_sse2_main_round 23 + sha256_sse2_main_round 24 + sha256_sse2_main_round 25 + sha256_sse2_main_round 26 + sha256_sse2_main_round 27 + sha256_sse2_main_round 28 + sha256_sse2_main_round 29 + sha256_sse2_main_round 30 + sha256_sse2_main_round 31 + sha256_sse2_main_round 32 + sha256_sse2_main_round 33 + sha256_sse2_main_round 34 + sha256_sse2_main_round 35 + sha256_sse2_main_round 36 + sha256_sse2_main_round 37 + sha256_sse2_main_round 38 + sha256_sse2_main_round 39 + sha256_sse2_main_round 40 + sha256_sse2_main_round 41 + sha256_sse2_main_round 42 + sha256_sse2_main_round 43 + sha256_sse2_main_round 44 + sha256_sse2_main_round 45 + sha256_sse2_main_round 46 + sha256_sse2_main_round 47 + sha256_sse2_main_round 48 + sha256_sse2_main_round 49 + sha256_sse2_main_round 50 + sha256_sse2_main_round 51 + sha256_sse2_main_round 52 + sha256_sse2_main_round 53 + sha256_sse2_main_round 54 + sha256_sse2_main_round 55 + sha256_sse2_main_round 56 + sha256_sse2_main_round 57 + sha256_sse2_main_round 58 + sha256_sse2_main_round 59 + sha256_sse2_main_round 60 + jz sha256d_ms_4way_sse2_finish + sha256_sse2_main_round 61 + sha256_sse2_main_round 62 + sha256_sse2_main_round 63 paddd 0(%rdx), %xmm7 paddd 16(%rdx), %xmm5 @@ -1086,7 +1133,11 @@ sha256d_4way_sse2: movdqa %xmm1, 240(%rsp) leaq 256(%rsp), %rax - call sha256_sse2_extend_loop + cmpq %rax, %rax + jmp sha256d_ms_4way_sse2_extend_loop2 + +sha256d_ms_4way_sse2_extend_coda2: + sha256_sse2_extend_round 44 movdqa sha256_4h+0(%rip), %xmm7 movdqa sha256_4h+16(%rip), %xmm5 @@ -1099,25 +1150,11 @@ sha256d_4way_sse2: movq %rsp, %rax leaq sha256_4k(%rip), %rcx - call sha256_sse2_main_loop + jmp sha256d_ms_4way_sse2_main_loop2 - paddd sha256_4h+0(%rip), %xmm7 - paddd sha256_4h+16(%rip), %xmm5 - paddd sha256_4h+32(%rip), %xmm4 - paddd sha256_4h+48(%rip), %xmm3 - paddd sha256_4h+64(%rip), %xmm0 - paddd sha256_4h+80(%rip), %xmm8 - paddd sha256_4h+96(%rip), %xmm9 - paddd sha256_4h+112(%rip), %xmm10 - - movdqa %xmm7, 0(%rdi) - movdqa %xmm5, 16(%rdi) - movdqa %xmm4, 32(%rdi) - movdqa %xmm3, 48(%rdi) - movdqa %xmm0, 64(%rdi) - movdqa %xmm8, 80(%rdi) - movdqa %xmm9, 96(%rdi) - movdqa %xmm10, 112(%rdi) +sha256d_ms_4way_sse2_finish: + paddd sha256_4h+112(%rip), %xmm0 + movdqa %xmm0, 112(%rdi) addq $1032, %rsp #if defined(WIN64) @@ -1136,7 +1173,7 @@ sha256d_4way_sse2: #if defined(USE_AVX) .p2align 6 -sha256d_4way_avx: +sha256d_ms_4way_avx: #if defined(WIN64) pushq %rdi subq $80, %rsp @@ -1154,7 +1191,35 @@ sha256d_4way_avx: subq $1032, %rsp leaq 256(%rsi), %rax - call sha256_avx_extend_loop_pre + jmp sha256d_ms_4way_avx_extend_loop1 + +sha256d_ms_4way_avx_extend_loop2: + sha256_avx_extend_doubleround 0 +sha256d_ms_4way_avx_extend_loop1: + sha256_avx_extend_doubleround 2 + sha256_avx_extend_doubleround 4 + sha256_avx_extend_doubleround 6 + sha256_avx_extend_doubleround 8 + sha256_avx_extend_doubleround 10 + sha256_avx_extend_doubleround 12 + sha256_avx_extend_doubleround 14 + sha256_avx_extend_doubleround 16 + sha256_avx_extend_doubleround 18 + sha256_avx_extend_doubleround 20 + sha256_avx_extend_doubleround 22 + sha256_avx_extend_doubleround 24 + sha256_avx_extend_doubleround 26 + sha256_avx_extend_doubleround 28 + sha256_avx_extend_doubleround 30 + sha256_avx_extend_doubleround 32 + sha256_avx_extend_doubleround 34 + sha256_avx_extend_doubleround 36 + sha256_avx_extend_doubleround 38 + sha256_avx_extend_doubleround 40 + sha256_avx_extend_doubleround 42 + jz sha256d_ms_4way_avx_extend_coda2 + sha256_avx_extend_doubleround 44 + sha256_avx_extend_doubleround 46 movdqa 0(%rcx), %xmm7 movdqa 16(%rcx), %xmm8 @@ -1167,7 +1232,33 @@ sha256d_4way_avx: movq %rsi, %rax leaq sha256_4k(%rip), %rcx - call sha256_avx_main_loop_pre + jmp sha256d_ms_4way_avx_main_loop1 + +sha256d_ms_4way_avx_main_loop2: + sha256_avx_main_round 0, %xmm10, %xmm9, %xmm8, %xmm0, %xmm3, %xmm4, %xmm5, %xmm7 + sha256_avx_main_round 1, %xmm9, %xmm8, %xmm0, %xmm10, %xmm4, %xmm5, %xmm7, %xmm3 + sha256_avx_main_round 2, %xmm8, %xmm0, %xmm10, %xmm9, %xmm5, %xmm7, %xmm3, %xmm4 +sha256d_ms_4way_avx_main_loop1: + sha256_avx_main_round 3, %xmm0, %xmm10, %xmm9, %xmm8, %xmm7, %xmm3, %xmm4, %xmm5 + sha256_avx_main_quadround 4 + sha256_avx_main_quadround 8 + sha256_avx_main_quadround 12 + sha256_avx_main_quadround 16 + sha256_avx_main_quadround 20 + sha256_avx_main_quadround 24 + sha256_avx_main_quadround 28 + sha256_avx_main_quadround 32 + sha256_avx_main_quadround 36 + sha256_avx_main_quadround 40 + sha256_avx_main_quadround 44 + sha256_avx_main_quadround 48 + sha256_avx_main_quadround 52 + sha256_avx_main_quadround 56 + sha256_avx_main_round 60, %xmm10, %xmm9, %xmm8, %xmm0, %xmm3, %xmm4, %xmm5, %xmm7 + jz sha256d_ms_4way_avx_finish + sha256_avx_main_round 61, %xmm9, %xmm8, %xmm0, %xmm10, %xmm4, %xmm5, %xmm7, %xmm3 + sha256_avx_main_round 62, %xmm8, %xmm0, %xmm10, %xmm9, %xmm5, %xmm7, %xmm3, %xmm4 + sha256_avx_main_round 63, %xmm0, %xmm10, %xmm9, %xmm8, %xmm7, %xmm3, %xmm4, %xmm5 paddd 0(%rdx), %xmm7 paddd 16(%rdx), %xmm5 @@ -1202,7 +1293,11 @@ sha256d_4way_avx: movdqa %xmm1, 240(%rsp) leaq 256(%rsp), %rax - call sha256_avx_extend_loop + cmpq %rax, %rax + jmp sha256d_ms_4way_avx_extend_loop2 + +sha256d_ms_4way_avx_extend_coda2: + sha256_avx_extend_round 44 movdqa sha256_4h+0(%rip), %xmm7 movdqa sha256_4h+16(%rip), %xmm5 @@ -1215,24 +1310,10 @@ sha256d_4way_avx: movq %rsp, %rax leaq sha256_4k(%rip), %rcx - call sha256_avx_main_loop + jmp sha256d_ms_4way_avx_main_loop2 - paddd sha256_4h+0(%rip), %xmm7 - paddd sha256_4h+16(%rip), %xmm5 - paddd sha256_4h+32(%rip), %xmm4 - paddd sha256_4h+48(%rip), %xmm3 - paddd sha256_4h+64(%rip), %xmm0 - paddd sha256_4h+80(%rip), %xmm8 - paddd sha256_4h+96(%rip), %xmm9 +sha256d_ms_4way_avx_finish: paddd sha256_4h+112(%rip), %xmm10 - - movdqa %xmm7, 0(%rdi) - movdqa %xmm5, 16(%rdi) - movdqa %xmm4, 32(%rdi) - movdqa %xmm3, 48(%rdi) - movdqa %xmm0, 64(%rdi) - movdqa %xmm8, 80(%rdi) - movdqa %xmm9, 96(%rdi) movdqa %xmm10, 112(%rdi) addq $1032, %rsp @@ -1254,7 +1335,7 @@ sha256d_4way_avx: #if defined(USE_XOP) .p2align 6 -sha256d_4way_xop: +sha256d_ms_4way_xop: #if defined(WIN64) pushq %rdi subq $80, %rsp @@ -1272,7 +1353,35 @@ sha256d_4way_xop: subq $1032, %rsp leaq 256(%rsi), %rax - call sha256_xop_extend_loop_pre + jmp sha256d_ms_4way_xop_extend_loop1 + +sha256d_ms_4way_xop_extend_loop2: + sha256_xop_extend_doubleround 0 +sha256d_ms_4way_xop_extend_loop1: + sha256_xop_extend_doubleround 2 + sha256_xop_extend_doubleround 4 + sha256_xop_extend_doubleround 6 + sha256_xop_extend_doubleround 8 + sha256_xop_extend_doubleround 10 + sha256_xop_extend_doubleround 12 + sha256_xop_extend_doubleround 14 + sha256_xop_extend_doubleround 16 + sha256_xop_extend_doubleround 18 + sha256_xop_extend_doubleround 20 + sha256_xop_extend_doubleround 22 + sha256_xop_extend_doubleround 24 + sha256_xop_extend_doubleround 26 + sha256_xop_extend_doubleround 28 + sha256_xop_extend_doubleround 30 + sha256_xop_extend_doubleround 32 + sha256_xop_extend_doubleround 34 + sha256_xop_extend_doubleround 36 + sha256_xop_extend_doubleround 38 + sha256_xop_extend_doubleround 40 + sha256_xop_extend_doubleround 42 + jz sha256d_ms_4way_xop_extend_coda2 + sha256_xop_extend_doubleround 44 + sha256_xop_extend_doubleround 46 movdqa 0(%rcx), %xmm7 movdqa 16(%rcx), %xmm8 @@ -1285,7 +1394,33 @@ sha256d_4way_xop: movq %rsi, %rax leaq sha256_4k(%rip), %rcx - call sha256_xop_main_loop_pre + jmp sha256d_ms_4way_xop_main_loop1 + +sha256d_ms_4way_xop_main_loop2: + sha256_xop_main_round 0, %xmm10, %xmm9, %xmm8, %xmm0, %xmm3, %xmm4, %xmm5, %xmm7 + sha256_xop_main_round 1, %xmm9, %xmm8, %xmm0, %xmm10, %xmm4, %xmm5, %xmm7, %xmm3 + sha256_xop_main_round 2, %xmm8, %xmm0, %xmm10, %xmm9, %xmm5, %xmm7, %xmm3, %xmm4 +sha256d_ms_4way_xop_main_loop1: + sha256_xop_main_round 3, %xmm0, %xmm10, %xmm9, %xmm8, %xmm7, %xmm3, %xmm4, %xmm5 + sha256_xop_main_quadround 4 + sha256_xop_main_quadround 8 + sha256_xop_main_quadround 12 + sha256_xop_main_quadround 16 + sha256_xop_main_quadround 20 + sha256_xop_main_quadround 24 + sha256_xop_main_quadround 28 + sha256_xop_main_quadround 32 + sha256_xop_main_quadround 36 + sha256_xop_main_quadround 40 + sha256_xop_main_quadround 44 + sha256_xop_main_quadround 48 + sha256_xop_main_quadround 52 + sha256_xop_main_quadround 56 + sha256_xop_main_round 60, %xmm10, %xmm9, %xmm8, %xmm0, %xmm3, %xmm4, %xmm5, %xmm7 + jz sha256d_ms_4way_xop_finish + sha256_xop_main_round 61, %xmm9, %xmm8, %xmm0, %xmm10, %xmm4, %xmm5, %xmm7, %xmm3 + sha256_xop_main_round 62, %xmm8, %xmm0, %xmm10, %xmm9, %xmm5, %xmm7, %xmm3, %xmm4 + sha256_xop_main_round 63, %xmm0, %xmm10, %xmm9, %xmm8, %xmm7, %xmm3, %xmm4, %xmm5 paddd 0(%rdx), %xmm7 paddd 16(%rdx), %xmm5 @@ -1320,7 +1455,11 @@ sha256d_4way_xop: movdqa %xmm1, 240(%rsp) leaq 256(%rsp), %rax - call sha256_xop_extend_loop + cmpq %rax, %rax + jmp sha256d_ms_4way_xop_extend_loop2 + +sha256d_ms_4way_xop_extend_coda2: + sha256_xop_extend_round 44 movdqa sha256_4h+0(%rip), %xmm7 movdqa sha256_4h+16(%rip), %xmm5 @@ -1333,24 +1472,10 @@ sha256d_4way_xop: movq %rsp, %rax leaq sha256_4k(%rip), %rcx - call sha256_xop_main_loop + jmp sha256d_ms_4way_xop_main_loop2 - paddd sha256_4h+0(%rip), %xmm7 - paddd sha256_4h+16(%rip), %xmm5 - paddd sha256_4h+32(%rip), %xmm4 - paddd sha256_4h+48(%rip), %xmm3 - paddd sha256_4h+64(%rip), %xmm0 - paddd sha256_4h+80(%rip), %xmm8 - paddd sha256_4h+96(%rip), %xmm9 +sha256d_ms_4way_xop_finish: paddd sha256_4h+112(%rip), %xmm10 - - movdqa %xmm7, 0(%rdi) - movdqa %xmm5, 16(%rdi) - movdqa %xmm4, 32(%rdi) - movdqa %xmm3, 48(%rdi) - movdqa %xmm0, 64(%rdi) - movdqa %xmm8, 80(%rdi) - movdqa %xmm9, 96(%rdi) movdqa %xmm10, 112(%rdi) addq $1032, %rsp @@ -1400,23 +1525,23 @@ _sha256_use_4way: jz sha2_4way_init_avx sha2_4way_init_xop: - leaq sha256d_4way_xop(%rip), %rax + leaq sha256d_ms_4way_xop(%rip), %rax leaq sha256_transform_4way_core_xop(%rip), %rdx jmp sha2_4way_init_done #endif /* USE_XOP */ sha2_4way_init_avx: - leaq sha256d_4way_avx(%rip), %rax + leaq sha256d_ms_4way_avx(%rip), %rax leaq sha256_transform_4way_core_avx(%rip), %rdx jmp sha2_4way_init_done #endif /* USE_AVX */ sha2_4way_init_sse2: - leaq sha256d_4way_sse2(%rip), %rax + leaq sha256d_ms_4way_sse2(%rip), %rax leaq sha256_transform_4way_core_sse2(%rip), %rdx sha2_4way_init_done: - movq %rax, sha256d_4way_addr(%rip) + movq %rax, sha256d_ms_4way_addr(%rip) movq %rdx, sha256_transform_4way_core_addr(%rip) popq %rdx popq %rcx diff --git a/sha2.c b/sha2.c index 21500e3..112fb52 100644 --- a/sha2.c +++ b/sha2.c @@ -172,6 +172,18 @@ static const uint32_t sha256d_hash1[16] = { 0x00000000, 0x00000000, 0x00000000, 0x00000100 }; +static void sha256d(uint32_t *hash, uint32_t *data) +{ + uint32_t S[16]; + + sha256_init(S); + sha256_transform(S, data, 0); + sha256_transform(S, data + 16, 0); + memcpy(S + 8, sha256d_hash1 + 8, 32); + sha256_init(hash); + sha256_transform(hash, S, 0); +} + static inline void sha256d_preextend(uint32_t *W) { W[16] = s1(W[14]) + W[ 9] + s0(W[ 1]) + W[ 0]; @@ -200,7 +212,7 @@ static inline void sha256d_prehash(uint32_t *S, const uint32_t *W) RNDr(S, W, 2); } -static inline void sha256d(uint32_t *hash, uint32_t *W, +static inline void sha256d_ms(uint32_t *hash, uint32_t *W, const uint32_t *midstate, const uint32_t *prehash) { uint32_t S[64]; @@ -298,10 +310,27 @@ static inline void sha256d(uint32_t *hash, uint32_t *W, memcpy(W + 18, E, sizeof(E)); memcpy(S + 8, sha256d_hash1 + 8, 32); - for (i = 16; i < 64; i += 2) { + S[16] = s1(sha256d_hash1[14]) + sha256d_hash1[ 9] + s0(S[ 1]) + S[ 0]; + S[17] = s1(sha256d_hash1[15]) + sha256d_hash1[10] + s0(S[ 2]) + S[ 1]; + S[18] = s1(S[16]) + sha256d_hash1[11] + s0(S[ 3]) + S[ 2]; + S[19] = s1(S[17]) + sha256d_hash1[12] + s0(S[ 4]) + S[ 3]; + S[20] = s1(S[18]) + sha256d_hash1[13] + s0(S[ 5]) + S[ 4]; + S[21] = s1(S[19]) + sha256d_hash1[14] + s0(S[ 6]) + S[ 5]; + S[22] = s1(S[20]) + sha256d_hash1[15] + s0(S[ 7]) + S[ 6]; + S[23] = s1(S[21]) + S[16] + s0(sha256d_hash1[ 8]) + S[ 7]; + S[24] = s1(S[22]) + S[17] + s0(sha256d_hash1[ 9]) + sha256d_hash1[ 8]; + S[25] = s1(S[23]) + S[18] + s0(sha256d_hash1[10]) + sha256d_hash1[ 9]; + S[26] = s1(S[24]) + S[19] + s0(sha256d_hash1[11]) + sha256d_hash1[10]; + S[27] = s1(S[25]) + S[20] + s0(sha256d_hash1[12]) + sha256d_hash1[11]; + S[28] = s1(S[26]) + S[21] + s0(sha256d_hash1[13]) + sha256d_hash1[12]; + S[29] = s1(S[27]) + S[22] + s0(sha256d_hash1[14]) + sha256d_hash1[13]; + S[30] = s1(S[28]) + S[23] + s0(sha256d_hash1[15]) + sha256d_hash1[14]; + S[31] = s1(S[29]) + S[24] + s0(S[16]) + sha256d_hash1[15]; + for (i = 32; i < 60; i += 2) { S[i] = s1(S[i - 2]) + S[i - 7] + s0(S[i - 15]) + S[i - 16]; S[i+1] = s1(S[i - 1]) + S[i - 6] + s0(S[i - 14]) + S[i - 15]; } + S[60] = s1(S[58]) + S[53] + s0(S[45]) + S[44]; sha256_init(hash); @@ -362,21 +391,21 @@ static inline void sha256d(uint32_t *hash, uint32_t *W, RNDr(hash, S, 54); RNDr(hash, S, 55); RNDr(hash, S, 56); - RNDr(hash, S, 57); - RNDr(hash, S, 58); - RNDr(hash, S, 59); - RNDr(hash, S, 60); - RNDr(hash, S, 61); - RNDr(hash, S, 62); - RNDr(hash, S, 63); - - for (i = 0; i < 8; i++) - hash[i] += sha256_h[i]; + + hash[2] += hash[6] + S1(hash[3]) + Ch(hash[3], hash[4], hash[5]) + + S[57] + sha256_k[57]; + hash[1] += hash[5] + S1(hash[2]) + Ch(hash[2], hash[3], hash[4]) + + S[58] + sha256_k[58]; + hash[0] += hash[4] + S1(hash[1]) + Ch(hash[1], hash[2], hash[3]) + + S[59] + sha256_k[59]; + hash[7] += hash[3] + S1(hash[0]) + Ch(hash[0], hash[1], hash[2]) + + S[60] + sha256_k[60] + + sha256_h[7]; } #ifdef HAVE_SHA256_4WAY #define SHA256D_MAX_WAYS 4 -void sha256d_4way(uint32_t *hash, uint32_t *data, +void sha256d_ms_4way(uint32_t *hash, uint32_t *data, const uint32_t *midstate, const uint32_t *prehash); #else #define SHA256D_MAX_WAYS 1 @@ -390,6 +419,7 @@ int scanhash_sha256d(int thr_id, uint32_t *pdata, const uint32_t *ptarget, uint32_t midstate[SHA256D_MAX_WAYS * 8] __attribute__((aligned(32))); uint32_t prehash[SHA256D_MAX_WAYS * 8] __attribute__((aligned(32))); uint32_t n = pdata[19] - 1; + const uint32_t first_nonce = pdata[19]; const uint32_t Htarg = ptarget[7]; #ifdef HAVE_SHA256_4WAY const int ways = sha256_use_4way() ? 4 : 1; @@ -421,16 +451,14 @@ int scanhash_sha256d(int thr_id, uint32_t *pdata, const uint32_t *ptarget, for (i = 0; i < 4; i++) data[4 * 3 + i] = ++n; - sha256d_4way(hash, data, midstate, prehash); + sha256d_ms_4way(hash, data, midstate, prehash); for (i = 0; i < 4; i++) { if (hash[4 * 7 + i] <= Htarg) { - uint32_t tmp[8]; - for (j = 0; j < 8; j++) - tmp[j] = hash[4 * j + i]; - if (fulltest(tmp, ptarget)) { - *hashes_done = n - pdata[19] + 1; - pdata[19] = data[4 * 3 + i]; + pdata[19] = data[4 * 3 + i]; + sha256d(hash, pdata); + if (fulltest(hash, ptarget)) { + *hashes_done = n - first_nonce + 1; return 1; } } @@ -440,17 +468,18 @@ int scanhash_sha256d(int thr_id, uint32_t *pdata, const uint32_t *ptarget, #endif do { data[3] = ++n; - sha256d(hash, data, midstate, prehash); + sha256d_ms(hash, data, midstate, prehash); if (hash[7] <= Htarg) { + pdata[19] = data[3]; + sha256d(hash, pdata); if (fulltest(hash, ptarget)) { - *hashes_done = n - pdata[19] + 1; - pdata[19] = data[3]; + *hashes_done = n - first_nonce + 1; return 1; } } } while (n < max_nonce && !work_restart[thr_id].restart); - *hashes_done = n - pdata[19] + 1; + *hashes_done = n - first_nonce + 1; pdata[19] = n; return 0; }