Reduce register usage in 4-way SHA-256
parent 7ca71eb324
commit 80c762b0da
2 changed files with 120 additions and 152 deletions
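For context, a scalar C sketch of one SHA-256 round may help when reading the diff below. It is illustrative only and not taken from the patch; rotr32 and the s/ki/wi names are hypothetical. The point it shows: per round only e (and the running sums) must stay hot, while f, g and h are just one-, two- and three-round-old copies of e. The patch exploits this by rotating those three words through fixed stack slots (0, 16 and 32 bytes above %rsp in the new layout) instead of dedicating %xmm8–%xmm10 to them, which in turn lets the WIN64 prologue save only the callee-saved %xmm6 and %xmm7.

#include <stdint.h>

/* Illustrative helper: 32-bit rotate right. */
static inline uint32_t rotr32(uint32_t x, unsigned n)
{
    return (x >> n) | (x << (32 - n));
}

/* One scalar SHA-256 round; the SSE2 macros in the patch do the same
 * computation on four interleaved lanes per 128-bit register. */
static void sha256_round(uint32_t s[8], uint32_t ki, uint32_t wi)
{
    uint32_t a = s[0], b = s[1], c = s[2], d = s[3];
    uint32_t e = s[4], f = s[5], g = s[6], h = s[7];

    uint32_t ch   = (e & f) ^ (~e & g);                          /* pand / pandn / pxor */
    uint32_t sig1 = rotr32(e, 6) ^ rotr32(e, 11) ^ rotr32(e, 25); /* psrld / pslld / pxor chain */
    uint32_t t1   = h + sig1 + ch + ki + wi;

    uint32_t maj  = (a & b) ^ (a & c) ^ (b & c);
    uint32_t sig0 = rotr32(a, 2) ^ rotr32(a, 13) ^ rotr32(a, 22);
    uint32_t t2   = sig0 + maj;

    /* Working-state rotation: h, g, f are merely delayed copies of e,
     * so they can live in memory slots that shift down one per round. */
    s[7] = g; s[6] = f; s[5] = e; s[4] = d + t1;
    s[3] = c; s[2] = b; s[1] = a; s[0] = t1 + t2;
}

The new sha256_sse2_main_quadround macro in the diff simply expands four consecutive rounds, so replacing the long run of per-round invocations with quadround invocations does not change the emitted instruction stream.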
252 sha2-x64.S
@@ -233,21 +233,21 @@ _sha256_init_4way:
.endm

.macro sha256_sse2_main_round i
- movdqa 16*\i(%rax), %xmm6
- paddd 16*\i(%rcx), %xmm6
- paddd %xmm10, %xmm6
+ movdqa 16*(\i)(%rax), %xmm6
+ paddd 16*(\i)(%rcx), %xmm6
+ paddd 32(%rsp), %xmm6

movdqa %xmm0, %xmm1
- movdqa %xmm9, %xmm2
+ movdqa 16(%rsp), %xmm2
pandn %xmm2, %xmm1

- movdqa %xmm2, %xmm10
- movdqa %xmm8, %xmm2
- movdqa %xmm2, %xmm9
+ movdqa %xmm2, 32(%rsp)
+ movdqa 0(%rsp), %xmm2
+ movdqa %xmm2, 16(%rsp)

pand %xmm0, %xmm2
pxor %xmm2, %xmm1
- movdqa %xmm0, %xmm8
+ movdqa %xmm0, 0(%rsp)

paddd %xmm1, %xmm6
@@ -297,6 +297,13 @@ _sha256_init_4way:
paddd %xmm6, %xmm7
.endm

+ .macro sha256_sse2_main_quadround i
+ sha256_sse2_main_round \i+0
+ sha256_sse2_main_round \i+1
+ sha256_sse2_main_round \i+2
+ sha256_sse2_main_round \i+3
+ .endm
+

#if defined(USE_AVX)
@@ -969,19 +976,16 @@ _sha256d_ms_4way:
sha256d_ms_4way_sse2:
#if defined(WIN64)
pushq %rdi
- subq $80, %rsp
+ subq $32, %rsp
movdqa %xmm6, 0(%rsp)
movdqa %xmm7, 16(%rsp)
- movdqa %xmm8, 32(%rsp)
- movdqa %xmm9, 48(%rsp)
- movdqa %xmm10, 64(%rsp)
pushq %rsi
movq %rcx, %rdi
movq %rdx, %rsi
movq %r8, %rdx
movq %r9, %rcx
#endif
- subq $1032, %rsp
+ subq $8+67*16, %rsp

leaq 256(%rsi), %rax
@@ -989,8 +993,8 @@ sha256d_ms_4way_sse2_extend_loop1:
movdqa 3*16(%rsi), %xmm0
movdqa 2*16(%rax), %xmm3
movdqa 3*16(%rax), %xmm7
- movdqa %xmm3, 2*16(%rsp)
- movdqa %xmm7, 3*16(%rsp)
+ movdqa %xmm3, 5*16(%rsp)
+ movdqa %xmm7, 6*16(%rsp)
movdqa %xmm0, %xmm2
paddd %xmm0, %xmm7
psrld $3, %xmm0
@@ -1008,7 +1012,7 @@ sha256d_ms_4way_sse2_extend_loop1:
movdqa %xmm7, 3*16(%rax)

movdqa 4*16(%rax), %xmm0
- movdqa %xmm0, 4*16(%rsp)
+ movdqa %xmm0, 7*16(%rsp)
movdqa %xmm3, %xmm2
movdqa %xmm7, %xmm6
psrld $10, %xmm3
@@ -1037,8 +1041,8 @@ sha256d_ms_4way_sse2_extend_loop1:

movdqa 6*16(%rax), %xmm0
movdqa 7*16(%rax), %xmm4
- movdqa %xmm0, 6*16(%rsp)
- movdqa %xmm4, 7*16(%rsp)
+ movdqa %xmm0, 9*16(%rsp)
+ movdqa %xmm4, 10*16(%rsp)
movdqa %xmm3, %xmm2
movdqa %xmm7, %xmm6
psrld $10, %xmm3
@@ -1068,7 +1072,7 @@ sha256d_ms_4way_sse2_extend_loop1:

movdqa 8*16(%rax), %xmm0
movdqa 2*16(%rax), %xmm4
- movdqa %xmm0, 8*16(%rsp)
+ movdqa %xmm0, 11*16(%rsp)
movdqa %xmm3, %xmm2
movdqa %xmm7, %xmm6
psrld $10, %xmm3
@@ -1152,8 +1156,8 @@ sha256d_ms_4way_sse2_extend_loop1:

movdqa 14*16(%rax), %xmm0
movdqa 15*16(%rax), %xmm4
- movdqa %xmm0, 14*16(%rsp)
- movdqa %xmm4, 15*16(%rsp)
+ movdqa %xmm0, 17*16(%rsp)
+ movdqa %xmm4, 18*16(%rsp)
movdqa %xmm3, %xmm2
movdqa %xmm7, %xmm6
psrld $10, %xmm3
@@ -1204,12 +1208,15 @@ sha256d_ms_4way_sse2_extend_loop2:

movdqa 0(%rcx), %xmm3
movdqa 16(%rcx), %xmm0
- movdqa 32(%rcx), %xmm8
- movdqa 48(%rcx), %xmm9
- movdqa 64(%rcx), %xmm10
+ movdqa 32(%rcx), %xmm1
+ movdqa 48(%rcx), %xmm2
+ movdqa 64(%rcx), %xmm6
movdqa 80(%rcx), %xmm7
movdqa 96(%rcx), %xmm5
movdqa 112(%rcx), %xmm4
+ movdqa %xmm1, 0(%rsp)
+ movdqa %xmm2, 16(%rsp)
+ movdqa %xmm6, 32(%rsp)

movq %rsi, %rax
leaq sha256_4k(%rip), %rcx
@@ -1221,118 +1228,79 @@ sha256d_ms_4way_sse2_main_loop2:
sha256_sse2_main_round 2
sha256d_ms_4way_sse2_main_loop1:
sha256_sse2_main_round 3
- sha256_sse2_main_round 4
- sha256_sse2_main_round 5
- sha256_sse2_main_round 6
- sha256_sse2_main_round 7
- sha256_sse2_main_round 8
- sha256_sse2_main_round 9
- sha256_sse2_main_round 10
- sha256_sse2_main_round 11
- sha256_sse2_main_round 12
- sha256_sse2_main_round 13
- sha256_sse2_main_round 14
- sha256_sse2_main_round 15
- sha256_sse2_main_round 16
- sha256_sse2_main_round 17
- sha256_sse2_main_round 18
- sha256_sse2_main_round 19
- sha256_sse2_main_round 20
- sha256_sse2_main_round 21
- sha256_sse2_main_round 22
- sha256_sse2_main_round 23
- sha256_sse2_main_round 24
- sha256_sse2_main_round 25
- sha256_sse2_main_round 26
- sha256_sse2_main_round 27
- sha256_sse2_main_round 28
- sha256_sse2_main_round 29
- sha256_sse2_main_round 30
- sha256_sse2_main_round 31
- sha256_sse2_main_round 32
- sha256_sse2_main_round 33
- sha256_sse2_main_round 34
- sha256_sse2_main_round 35
- sha256_sse2_main_round 36
- sha256_sse2_main_round 37
- sha256_sse2_main_round 38
- sha256_sse2_main_round 39
- sha256_sse2_main_round 40
- sha256_sse2_main_round 41
- sha256_sse2_main_round 42
- sha256_sse2_main_round 43
- sha256_sse2_main_round 44
- sha256_sse2_main_round 45
- sha256_sse2_main_round 46
- sha256_sse2_main_round 47
- sha256_sse2_main_round 48
- sha256_sse2_main_round 49
- sha256_sse2_main_round 50
- sha256_sse2_main_round 51
- sha256_sse2_main_round 52
- sha256_sse2_main_round 53
- sha256_sse2_main_round 54
- sha256_sse2_main_round 55
+ sha256_sse2_main_quadround 4
+ sha256_sse2_main_quadround 8
+ sha256_sse2_main_quadround 12
+ sha256_sse2_main_quadround 16
+ sha256_sse2_main_quadround 20
+ sha256_sse2_main_quadround 24
+ sha256_sse2_main_quadround 28
+ sha256_sse2_main_quadround 32
+ sha256_sse2_main_quadround 36
+ sha256_sse2_main_quadround 40
+ sha256_sse2_main_quadround 44
+ sha256_sse2_main_quadround 48
+ sha256_sse2_main_quadround 52
sha256_sse2_main_round 56
jz sha256d_ms_4way_sse2_finish
sha256_sse2_main_round 57
sha256_sse2_main_round 58
sha256_sse2_main_round 59
- sha256_sse2_main_round 60
- sha256_sse2_main_round 61
- sha256_sse2_main_round 62
- sha256_sse2_main_round 63
+ sha256_sse2_main_quadround 60

- movdqa 2*16(%rsp), %xmm1
- movdqa 3*16(%rsp), %xmm2
- movdqa 4*16(%rsp), %xmm6
+ movdqa 5*16(%rsp), %xmm1
+ movdqa 6*16(%rsp), %xmm2
+ movdqa 7*16(%rsp), %xmm6
movdqa %xmm1, 18*16(%rsi)
movdqa %xmm2, 19*16(%rsi)
movdqa %xmm6, 20*16(%rsi)
- movdqa 6*16(%rsp), %xmm1
- movdqa 7*16(%rsp), %xmm2
- movdqa 8*16(%rsp), %xmm6
+ movdqa 9*16(%rsp), %xmm1
+ movdqa 10*16(%rsp), %xmm2
+ movdqa 11*16(%rsp), %xmm6
movdqa %xmm1, 22*16(%rsi)
movdqa %xmm2, 23*16(%rsi)
movdqa %xmm6, 24*16(%rsi)
- movdqa 14*16(%rsp), %xmm1
- movdqa 15*16(%rsp), %xmm2
+ movdqa 17*16(%rsp), %xmm1
+ movdqa 18*16(%rsp), %xmm2
movdqa %xmm1, 30*16(%rsi)
movdqa %xmm2, 31*16(%rsi)

+ movdqa 0(%rsp), %xmm1
+ movdqa 16(%rsp), %xmm2
+ movdqa 32(%rsp), %xmm6
paddd 0(%rdx), %xmm7
paddd 16(%rdx), %xmm5
paddd 32(%rdx), %xmm4
paddd 48(%rdx), %xmm3
paddd 64(%rdx), %xmm0
- paddd 80(%rdx), %xmm8
- paddd 96(%rdx), %xmm9
- paddd 112(%rdx), %xmm10
+ paddd 80(%rdx), %xmm1
+ paddd 96(%rdx), %xmm2
+ paddd 112(%rdx), %xmm6

- movdqa %xmm7, 0(%rsp)
- movdqa %xmm5, 16(%rsp)
- movdqa %xmm4, 32(%rsp)
- movdqa %xmm3, 48(%rsp)
- movdqa %xmm0, 64(%rsp)
- movdqa %xmm8, 80(%rsp)
- movdqa %xmm9, 96(%rsp)
- movdqa %xmm10, 112(%rsp)
+ movdqa %xmm7, 48+0(%rsp)
+ movdqa %xmm5, 48+16(%rsp)
+ movdqa %xmm4, 48+32(%rsp)
+ movdqa %xmm3, 48+48(%rsp)
+ movdqa %xmm0, 48+64(%rsp)
+ movdqa %xmm1, 48+80(%rsp)
+ movdqa %xmm2, 48+96(%rsp)
+ movdqa %xmm6, 48+112(%rsp)

pxor %xmm0, %xmm0
movq $0x8000000000000100, %rax
movd %rax, %xmm1
pshufd $0x55, %xmm1, %xmm2
pshufd $0x00, %xmm1, %xmm1
- movdqa %xmm2, 128(%rsp)
- movdqa %xmm0, 144(%rsp)
- movdqa %xmm0, 160(%rsp)
- movdqa %xmm0, 176(%rsp)
- movdqa %xmm0, 192(%rsp)
- movdqa %xmm0, 208(%rsp)
- movdqa %xmm0, 224(%rsp)
- movdqa %xmm1, 240(%rsp)
+ movdqa %xmm2, 48+128(%rsp)
+ movdqa %xmm0, 48+144(%rsp)
+ movdqa %xmm0, 48+160(%rsp)
+ movdqa %xmm0, 48+176(%rsp)
+ movdqa %xmm0, 48+192(%rsp)
+ movdqa %xmm0, 48+208(%rsp)
+ movdqa %xmm0, 48+224(%rsp)
+ movdqa %xmm1, 48+240(%rsp)

- leaq 256(%rsp), %rax
+ leaq 19*16(%rsp), %rax
cmpq %rax, %rax

movdqa -15*16(%rax), %xmm0
@@ -1550,61 +1518,63 @@ sha256d_ms_4way_sse2_extend_coda2:
movdqa sha256_4h+32(%rip), %xmm4
movdqa sha256_4h+48(%rip), %xmm3
movdqa sha256_4h+64(%rip), %xmm0
- movdqa sha256_4h+80(%rip), %xmm8
- movdqa sha256_4h+96(%rip), %xmm9
- movdqa sha256_4h+112(%rip), %xmm10
+ movdqa sha256_4h+80(%rip), %xmm1
+ movdqa sha256_4h+96(%rip), %xmm2
+ movdqa sha256_4h+112(%rip), %xmm6
+ movdqa %xmm1, 0(%rsp)
+ movdqa %xmm2, 16(%rsp)
+ movdqa %xmm6, 32(%rsp)

- movq %rsp, %rax
+ leaq 48(%rsp), %rax
leaq sha256_4k(%rip), %rcx
jmp sha256d_ms_4way_sse2_main_loop2

- .macro sha256_sse2_main_round_red i, r0, r1, r2, r3, r4
+ .macro sha256_sse2_main_round_red i, r7
movdqa 16*\i(%rax), %xmm6
paddd 16*\i(%rcx), %xmm6
- paddd \r0, %xmm6
- movdqa \r3, %xmm1
- movdqa \r1, %xmm2
+ paddd 32(%rsp), %xmm6
+ movdqa %xmm0, %xmm1
+ movdqa 16(%rsp), %xmm2
+ paddd \r7, %xmm6
pandn %xmm2, %xmm1
- movdqa \r2, %xmm2
- pand \r3, %xmm2
+ movdqa %xmm2, 32(%rsp)
+ movdqa 0(%rsp), %xmm2
+ movdqa %xmm2, 16(%rsp)
+ pand %xmm0, %xmm2
pxor %xmm2, %xmm1
- movdqa \r3, \r0
+ movdqa %xmm0, 0(%rsp)
paddd %xmm1, %xmm6
- movdqa \r3, %xmm1
- psrld $6, \r0
- movdqa \r0, %xmm2
+ movdqa %xmm0, %xmm1
+ psrld $6, %xmm0
+ movdqa %xmm0, %xmm2
pslld $7, %xmm1
psrld $5, %xmm2
- pxor %xmm1, \r0
- pxor %xmm2, \r0
+ pxor %xmm1, %xmm0
+ pxor %xmm2, %xmm0
pslld $14, %xmm1
psrld $14, %xmm2
- pxor %xmm1, \r0
- pxor %xmm2, \r0
+ pxor %xmm1, %xmm0
+ pxor %xmm2, %xmm0
pslld $5, %xmm1
- pxor %xmm1, \r0
- paddd %xmm6, \r0
- paddd \r4, \r0
+ pxor %xmm1, %xmm0
+ paddd %xmm6, %xmm0
.endm

sha256d_ms_4way_sse2_finish:
- sha256_sse2_main_round_red 57, %xmm10, %xmm9, %xmm8, %xmm0, %xmm3
- sha256_sse2_main_round_red 58, %xmm9, %xmm8, %xmm0, %xmm10, %xmm4
- sha256_sse2_main_round_red 59, %xmm8, %xmm0, %xmm10, %xmm9, %xmm5
- sha256_sse2_main_round_red 60, %xmm0, %xmm10, %xmm9, %xmm8, %xmm7
+ sha256_sse2_main_round_red 57, %xmm3
+ sha256_sse2_main_round_red 58, %xmm4
+ sha256_sse2_main_round_red 59, %xmm5
+ sha256_sse2_main_round_red 60, %xmm7

paddd sha256_4h+112(%rip), %xmm0
movdqa %xmm0, 112(%rdi)

- addq $1032, %rsp
+ addq $8+67*16, %rsp
#if defined(WIN64)
popq %rsi
movdqa 0(%rsp), %xmm6
movdqa 16(%rsp), %xmm7
- movdqa 32(%rsp), %xmm8
- movdqa 48(%rsp), %xmm9
- movdqa 64(%rsp), %xmm10
- addq $80, %rsp
+ addq $32, %rsp
popq %rdi
#endif
ret
@@ -2596,13 +2566,13 @@ _sha256_use_4way:
cpuid
andl $0x18000000, %ecx
cmpl $0x18000000, %ecx
- jne sha256_use_4way_sse2
+ jne sha256_use_4way_base
# Check for XMM and YMM state support
xorl %ecx, %ecx
xgetbv
andl $0x00000006, %eax
cmpl $0x00000006, %eax
- jne sha256_use_4way_sse2
+ jne sha256_use_4way_base
#if defined(USE_XOP)
# Check for XOP support
movl $0x80000001, %eax
@@ -2622,7 +2592,7 @@ sha256_use_4way_avx:
jmp sha256_use_4way_done
#endif /* USE_AVX */

- sha256_use_4way_sse2:
+ sha256_use_4way_base:
leaq sha256d_ms_4way_sse2(%rip), %rcx
leaq sha256_transform_4way_core_sse2(%rip), %rdx
20 sha2-x86.S
@@ -115,25 +115,23 @@ sha256d_4preext2_30:
.globl _sha256_init_4way
sha256_init_4way:
_sha256_init_4way:
- pushl %edi
- movl 8(%esp), %edi
+ movl 8(%esp), %edx
movdqa sha256_4h+0, %xmm0
movdqa sha256_4h+16, %xmm1
movdqa sha256_4h+32, %xmm2
movdqa sha256_4h+48, %xmm3
- movdqu %xmm0, 0(%edi)
- movdqu %xmm1, 16(%edi)
- movdqu %xmm2, 32(%edi)
- movdqu %xmm3, 48(%edi)
+ movdqu %xmm0, 0(%edx)
+ movdqu %xmm1, 16(%edx)
+ movdqu %xmm2, 32(%edx)
+ movdqu %xmm3, 48(%edx)
movdqa sha256_4h+64, %xmm0
movdqa sha256_4h+80, %xmm1
movdqa sha256_4h+96, %xmm2
movdqa sha256_4h+112, %xmm3
- movdqu %xmm0, 64(%edi)
- movdqu %xmm1, 80(%edi)
- movdqu %xmm2, 96(%edi)
- movdqu %xmm3, 112(%edi)
- popl %edi
+ movdqu %xmm0, 64(%edx)
+ movdqu %xmm1, 80(%edx)
+ movdqu %xmm2, 96(%edx)
+ movdqu %xmm3, 112(%edx)
ret