Reduce register usage in 4-way SHA-256

pooler 2012-04-01 19:39:01 +02:00
parent 7ca71eb324
commit 80c762b0da
2 changed files with 120 additions and 152 deletions
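
Reading the new sha256_sse2_main_round macro, the SHA-256 working variables f, g and h, previously held in %xmm8-%xmm10, now live in three 16-byte spill slots at 0(%rsp), 16(%rsp) and 32(%rsp) while e stays in %xmm0; that accounts for the frame growing from $1032 to $8+67*16 bytes (three extra 16-byte slots) and for the WIN64 prologue now saving only %xmm6/%xmm7 (32 bytes instead of 80). For orientation, below is a scalar sketch of the round those slot moves implement. It follows the textbook FIPS 180-4 round; every name in it is chosen here for illustration and is not taken from this source.

#include <stdint.h>

/* Scalar sketch of one SHA-256 round (FIPS 180-4 names). The SSE2 macro in the
 * diff performs the same computation on four message lanes at once, with
 * %xmm0 holding e and the stack slots 0/16/32(%rsp) holding f/g/h. */
static inline uint32_t ror32(uint32_t x, int n)
{
    return (x >> n) | (x << (32 - n));
}

static void sha256_round(uint32_t s[8], uint32_t k, uint32_t w)
{
    uint32_t a = s[0], b = s[1], c = s[2], d = s[3];
    uint32_t e = s[4], f = s[5], g = s[6], h = s[7];

    uint32_t t1 = h + (ror32(e, 6) ^ ror32(e, 11) ^ ror32(e, 25))  /* Sigma1(e) */
                    + ((e & f) ^ (~e & g))                         /* Ch(e,f,g) */
                    + k + w;
    uint32_t t2 = (ror32(a, 2) ^ ror32(a, 13) ^ ror32(a, 22))      /* Sigma0(a) */
                + ((a & b) ^ (a & c) ^ (b & c));                   /* Maj(a,b,c) */

    /* The working-variable rotation is what the macro's slot shuffling does:
     * g becomes the next round's h, f becomes g, e becomes f. */
    s[7] = g;  s[6] = f;  s[5] = e;  s[4] = d + t1;
    s[3] = c;  s[2] = b;  s[1] = a;  s[0] = t1 + t2;
}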

@@ -233,21 +233,21 @@ _sha256_init_4way:
.endm
.macro sha256_sse2_main_round i
movdqa 16*\i(%rax), %xmm6
paddd 16*\i(%rcx), %xmm6
paddd %xmm10, %xmm6
movdqa 16*(\i)(%rax), %xmm6
paddd 16*(\i)(%rcx), %xmm6
paddd 32(%rsp), %xmm6
movdqa %xmm0, %xmm1
movdqa %xmm9, %xmm2
movdqa 16(%rsp), %xmm2
pandn %xmm2, %xmm1
movdqa %xmm2, %xmm10
movdqa %xmm8, %xmm2
movdqa %xmm2, %xmm9
movdqa %xmm2, 32(%rsp)
movdqa 0(%rsp), %xmm2
movdqa %xmm2, 16(%rsp)
pand %xmm0, %xmm2
pxor %xmm2, %xmm1
movdqa %xmm0, %xmm8
movdqa %xmm0, 0(%rsp)
paddd %xmm1, %xmm6
@@ -297,6 +297,13 @@ _sha256_init_4way:
paddd %xmm6, %xmm7
.endm
.macro sha256_sse2_main_quadround i
sha256_sse2_main_round \i+0
sha256_sse2_main_round \i+1
sha256_sse2_main_round \i+2
sha256_sse2_main_round \i+3
.endm
#if defined(USE_AVX)
@@ -969,19 +976,16 @@ _sha256d_ms_4way:
sha256d_ms_4way_sse2:
#if defined(WIN64)
pushq %rdi
subq $80, %rsp
subq $32, %rsp
movdqa %xmm6, 0(%rsp)
movdqa %xmm7, 16(%rsp)
movdqa %xmm8, 32(%rsp)
movdqa %xmm9, 48(%rsp)
movdqa %xmm10, 64(%rsp)
pushq %rsi
movq %rcx, %rdi
movq %rdx, %rsi
movq %r8, %rdx
movq %r9, %rcx
#endif
subq $1032, %rsp
subq $8+67*16, %rsp
leaq 256(%rsi), %rax
@@ -989,8 +993,8 @@ sha256d_ms_4way_sse2_extend_loop1:
movdqa 3*16(%rsi), %xmm0
movdqa 2*16(%rax), %xmm3
movdqa 3*16(%rax), %xmm7
movdqa %xmm3, 2*16(%rsp)
movdqa %xmm7, 3*16(%rsp)
movdqa %xmm3, 5*16(%rsp)
movdqa %xmm7, 6*16(%rsp)
movdqa %xmm0, %xmm2
paddd %xmm0, %xmm7
psrld $3, %xmm0
@@ -1008,7 +1012,7 @@ sha256d_ms_4way_sse2_extend_loop1:
movdqa %xmm7, 3*16(%rax)
movdqa 4*16(%rax), %xmm0
movdqa %xmm0, 4*16(%rsp)
movdqa %xmm0, 7*16(%rsp)
movdqa %xmm3, %xmm2
movdqa %xmm7, %xmm6
psrld $10, %xmm3
@@ -1037,8 +1041,8 @@ sha256d_ms_4way_sse2_extend_loop1:
movdqa 6*16(%rax), %xmm0
movdqa 7*16(%rax), %xmm4
movdqa %xmm0, 6*16(%rsp)
movdqa %xmm4, 7*16(%rsp)
movdqa %xmm0, 9*16(%rsp)
movdqa %xmm4, 10*16(%rsp)
movdqa %xmm3, %xmm2
movdqa %xmm7, %xmm6
psrld $10, %xmm3
@@ -1068,7 +1072,7 @@ sha256d_ms_4way_sse2_extend_loop1:
movdqa 8*16(%rax), %xmm0
movdqa 2*16(%rax), %xmm4
movdqa %xmm0, 8*16(%rsp)
movdqa %xmm0, 11*16(%rsp)
movdqa %xmm3, %xmm2
movdqa %xmm7, %xmm6
psrld $10, %xmm3
@@ -1152,8 +1156,8 @@ sha256d_ms_4way_sse2_extend_loop1:
movdqa 14*16(%rax), %xmm0
movdqa 15*16(%rax), %xmm4
movdqa %xmm0, 14*16(%rsp)
movdqa %xmm4, 15*16(%rsp)
movdqa %xmm0, 17*16(%rsp)
movdqa %xmm4, 18*16(%rsp)
movdqa %xmm3, %xmm2
movdqa %xmm7, %xmm6
psrld $10, %xmm3
@@ -1204,12 +1208,15 @@ sha256d_ms_4way_sse2_extend_loop2:
movdqa 0(%rcx), %xmm3
movdqa 16(%rcx), %xmm0
movdqa 32(%rcx), %xmm8
movdqa 48(%rcx), %xmm9
movdqa 64(%rcx), %xmm10
movdqa 32(%rcx), %xmm1
movdqa 48(%rcx), %xmm2
movdqa 64(%rcx), %xmm6
movdqa 80(%rcx), %xmm7
movdqa 96(%rcx), %xmm5
movdqa 112(%rcx), %xmm4
movdqa %xmm1, 0(%rsp)
movdqa %xmm2, 16(%rsp)
movdqa %xmm6, 32(%rsp)
movq %rsi, %rax
leaq sha256_4k(%rip), %rcx
@@ -1221,118 +1228,79 @@ sha256d_ms_4way_sse2_main_loop2:
sha256_sse2_main_round 2
sha256d_ms_4way_sse2_main_loop1:
sha256_sse2_main_round 3
sha256_sse2_main_round 4
sha256_sse2_main_round 5
sha256_sse2_main_round 6
sha256_sse2_main_round 7
sha256_sse2_main_round 8
sha256_sse2_main_round 9
sha256_sse2_main_round 10
sha256_sse2_main_round 11
sha256_sse2_main_round 12
sha256_sse2_main_round 13
sha256_sse2_main_round 14
sha256_sse2_main_round 15
sha256_sse2_main_round 16
sha256_sse2_main_round 17
sha256_sse2_main_round 18
sha256_sse2_main_round 19
sha256_sse2_main_round 20
sha256_sse2_main_round 21
sha256_sse2_main_round 22
sha256_sse2_main_round 23
sha256_sse2_main_round 24
sha256_sse2_main_round 25
sha256_sse2_main_round 26
sha256_sse2_main_round 27
sha256_sse2_main_round 28
sha256_sse2_main_round 29
sha256_sse2_main_round 30
sha256_sse2_main_round 31
sha256_sse2_main_round 32
sha256_sse2_main_round 33
sha256_sse2_main_round 34
sha256_sse2_main_round 35
sha256_sse2_main_round 36
sha256_sse2_main_round 37
sha256_sse2_main_round 38
sha256_sse2_main_round 39
sha256_sse2_main_round 40
sha256_sse2_main_round 41
sha256_sse2_main_round 42
sha256_sse2_main_round 43
sha256_sse2_main_round 44
sha256_sse2_main_round 45
sha256_sse2_main_round 46
sha256_sse2_main_round 47
sha256_sse2_main_round 48
sha256_sse2_main_round 49
sha256_sse2_main_round 50
sha256_sse2_main_round 51
sha256_sse2_main_round 52
sha256_sse2_main_round 53
sha256_sse2_main_round 54
sha256_sse2_main_round 55
sha256_sse2_main_quadround 4
sha256_sse2_main_quadround 8
sha256_sse2_main_quadround 12
sha256_sse2_main_quadround 16
sha256_sse2_main_quadround 20
sha256_sse2_main_quadround 24
sha256_sse2_main_quadround 28
sha256_sse2_main_quadround 32
sha256_sse2_main_quadround 36
sha256_sse2_main_quadround 40
sha256_sse2_main_quadround 44
sha256_sse2_main_quadround 48
sha256_sse2_main_quadround 52
sha256_sse2_main_round 56
jz sha256d_ms_4way_sse2_finish
sha256_sse2_main_round 57
sha256_sse2_main_round 58
sha256_sse2_main_round 59
sha256_sse2_main_round 60
sha256_sse2_main_round 61
sha256_sse2_main_round 62
sha256_sse2_main_round 63
sha256_sse2_main_quadround 60
movdqa 2*16(%rsp), %xmm1
movdqa 3*16(%rsp), %xmm2
movdqa 4*16(%rsp), %xmm6
movdqa 5*16(%rsp), %xmm1
movdqa 6*16(%rsp), %xmm2
movdqa 7*16(%rsp), %xmm6
movdqa %xmm1, 18*16(%rsi)
movdqa %xmm2, 19*16(%rsi)
movdqa %xmm6, 20*16(%rsi)
movdqa 6*16(%rsp), %xmm1
movdqa 7*16(%rsp), %xmm2
movdqa 8*16(%rsp), %xmm6
movdqa 9*16(%rsp), %xmm1
movdqa 10*16(%rsp), %xmm2
movdqa 11*16(%rsp), %xmm6
movdqa %xmm1, 22*16(%rsi)
movdqa %xmm2, 23*16(%rsi)
movdqa %xmm6, 24*16(%rsi)
movdqa 14*16(%rsp), %xmm1
movdqa 15*16(%rsp), %xmm2
movdqa 17*16(%rsp), %xmm1
movdqa 18*16(%rsp), %xmm2
movdqa %xmm1, 30*16(%rsi)
movdqa %xmm2, 31*16(%rsi)
movdqa 0(%rsp), %xmm1
movdqa 16(%rsp), %xmm2
movdqa 32(%rsp), %xmm6
paddd 0(%rdx), %xmm7
paddd 16(%rdx), %xmm5
paddd 32(%rdx), %xmm4
paddd 48(%rdx), %xmm3
paddd 64(%rdx), %xmm0
paddd 80(%rdx), %xmm8
paddd 96(%rdx), %xmm9
paddd 112(%rdx), %xmm10
paddd 80(%rdx), %xmm1
paddd 96(%rdx), %xmm2
paddd 112(%rdx), %xmm6
movdqa %xmm7, 0(%rsp)
movdqa %xmm5, 16(%rsp)
movdqa %xmm4, 32(%rsp)
movdqa %xmm3, 48(%rsp)
movdqa %xmm0, 64(%rsp)
movdqa %xmm8, 80(%rsp)
movdqa %xmm9, 96(%rsp)
movdqa %xmm10, 112(%rsp)
movdqa %xmm7, 48+0(%rsp)
movdqa %xmm5, 48+16(%rsp)
movdqa %xmm4, 48+32(%rsp)
movdqa %xmm3, 48+48(%rsp)
movdqa %xmm0, 48+64(%rsp)
movdqa %xmm1, 48+80(%rsp)
movdqa %xmm2, 48+96(%rsp)
movdqa %xmm6, 48+112(%rsp)
pxor %xmm0, %xmm0
movq $0x8000000000000100, %rax
movd %rax, %xmm1
pshufd $0x55, %xmm1, %xmm2
pshufd $0x00, %xmm1, %xmm1
movdqa %xmm2, 128(%rsp)
movdqa %xmm0, 144(%rsp)
movdqa %xmm0, 160(%rsp)
movdqa %xmm0, 176(%rsp)
movdqa %xmm0, 192(%rsp)
movdqa %xmm0, 208(%rsp)
movdqa %xmm0, 224(%rsp)
movdqa %xmm1, 240(%rsp)
movdqa %xmm2, 48+128(%rsp)
movdqa %xmm0, 48+144(%rsp)
movdqa %xmm0, 48+160(%rsp)
movdqa %xmm0, 48+176(%rsp)
movdqa %xmm0, 48+192(%rsp)
movdqa %xmm0, 48+208(%rsp)
movdqa %xmm0, 48+224(%rsp)
movdqa %xmm1, 48+240(%rsp)
leaq 256(%rsp), %rax
leaq 19*16(%rsp), %rax
cmpq %rax, %rax
movdqa -15*16(%rax), %xmm0
@@ -1550,61 +1518,63 @@ sha256d_ms_4way_sse2_extend_coda2:
movdqa sha256_4h+32(%rip), %xmm4
movdqa sha256_4h+48(%rip), %xmm3
movdqa sha256_4h+64(%rip), %xmm0
movdqa sha256_4h+80(%rip), %xmm8
movdqa sha256_4h+96(%rip), %xmm9
movdqa sha256_4h+112(%rip), %xmm10
movdqa sha256_4h+80(%rip), %xmm1
movdqa sha256_4h+96(%rip), %xmm2
movdqa sha256_4h+112(%rip), %xmm6
movdqa %xmm1, 0(%rsp)
movdqa %xmm2, 16(%rsp)
movdqa %xmm6, 32(%rsp)
movq %rsp, %rax
leaq 48(%rsp), %rax
leaq sha256_4k(%rip), %rcx
jmp sha256d_ms_4way_sse2_main_loop2
.macro sha256_sse2_main_round_red i, r0, r1, r2, r3, r4
.macro sha256_sse2_main_round_red i, r7
movdqa 16*\i(%rax), %xmm6
paddd 16*\i(%rcx), %xmm6
paddd \r0, %xmm6
movdqa \r3, %xmm1
movdqa \r1, %xmm2
paddd 32(%rsp), %xmm6
movdqa %xmm0, %xmm1
movdqa 16(%rsp), %xmm2
paddd \r7, %xmm6
pandn %xmm2, %xmm1
movdqa \r2, %xmm2
pand \r3, %xmm2
movdqa %xmm2, 32(%rsp)
movdqa 0(%rsp), %xmm2
movdqa %xmm2, 16(%rsp)
pand %xmm0, %xmm2
pxor %xmm2, %xmm1
movdqa \r3, \r0
movdqa %xmm0, 0(%rsp)
paddd %xmm1, %xmm6
movdqa \r3, %xmm1
psrld $6, \r0
movdqa \r0, %xmm2
movdqa %xmm0, %xmm1
psrld $6, %xmm0
movdqa %xmm0, %xmm2
pslld $7, %xmm1
psrld $5, %xmm2
pxor %xmm1, \r0
pxor %xmm2, \r0
pxor %xmm1, %xmm0
pxor %xmm2, %xmm0
pslld $14, %xmm1
psrld $14, %xmm2
pxor %xmm1, \r0
pxor %xmm2, \r0
pxor %xmm1, %xmm0
pxor %xmm2, %xmm0
pslld $5, %xmm1
pxor %xmm1, \r0
paddd %xmm6, \r0
paddd \r4, \r0
pxor %xmm1, %xmm0
paddd %xmm6, %xmm0
.endm
sha256d_ms_4way_sse2_finish:
sha256_sse2_main_round_red 57, %xmm10, %xmm9, %xmm8, %xmm0, %xmm3
sha256_sse2_main_round_red 58, %xmm9, %xmm8, %xmm0, %xmm10, %xmm4
sha256_sse2_main_round_red 59, %xmm8, %xmm0, %xmm10, %xmm9, %xmm5
sha256_sse2_main_round_red 60, %xmm0, %xmm10, %xmm9, %xmm8, %xmm7
sha256_sse2_main_round_red 57, %xmm3
sha256_sse2_main_round_red 58, %xmm4
sha256_sse2_main_round_red 59, %xmm5
sha256_sse2_main_round_red 60, %xmm7
paddd sha256_4h+112(%rip), %xmm0
movdqa %xmm0, 112(%rdi)
addq $1032, %rsp
addq $8+67*16, %rsp
#if defined(WIN64)
popq %rsi
movdqa 0(%rsp), %xmm6
movdqa 16(%rsp), %xmm7
movdqa 32(%rsp), %xmm8
movdqa 48(%rsp), %xmm9
movdqa 64(%rsp), %xmm10
addq $80, %rsp
addq $32, %rsp
popq %rdi
#endif
ret
@@ -2596,13 +2566,13 @@ _sha256_use_4way:
cpuid
andl $0x18000000, %ecx
cmpl $0x18000000, %ecx
jne sha256_use_4way_sse2
jne sha256_use_4way_base
# Check for XMM and YMM state support
xorl %ecx, %ecx
xgetbv
andl $0x00000006, %eax
cmpl $0x00000006, %eax
jne sha256_use_4way_sse2
jne sha256_use_4way_base
#if defined(USE_XOP)
# Check for XOP support
movl $0x80000001, %eax
@@ -2622,7 +2592,7 @@ sha256_use_4way_avx:
jmp sha256_use_4way_done
#endif /* USE_AVX */
sha256_use_4way_sse2:
sha256_use_4way_base:
leaq sha256d_ms_4way_sse2(%rip), %rcx
leaq sha256_transform_4way_core_sse2(%rip), %rdx

@@ -115,25 +115,23 @@ sha256d_4preext2_30:
.globl _sha256_init_4way
sha256_init_4way:
_sha256_init_4way:
pushl %edi
movl 8(%esp), %edi
movl 8(%esp), %edx
movdqa sha256_4h+0, %xmm0
movdqa sha256_4h+16, %xmm1
movdqa sha256_4h+32, %xmm2
movdqa sha256_4h+48, %xmm3
movdqu %xmm0, 0(%edi)
movdqu %xmm1, 16(%edi)
movdqu %xmm2, 32(%edi)
movdqu %xmm3, 48(%edi)
movdqu %xmm0, 0(%edx)
movdqu %xmm1, 16(%edx)
movdqu %xmm2, 32(%edx)
movdqu %xmm3, 48(%edx)
movdqa sha256_4h+64, %xmm0
movdqa sha256_4h+80, %xmm1
movdqa sha256_4h+96, %xmm2
movdqa sha256_4h+112, %xmm3
movdqu %xmm0, 64(%edi)
movdqu %xmm1, 80(%edi)
movdqu %xmm2, 96(%edi)
movdqu %xmm3, 112(%edi)
popl %edi
movdqu %xmm0, 64(%edx)
movdqu %xmm1, 80(%edx)
movdqu %xmm2, 96(%edx)
movdqu %xmm3, 112(%edx)
ret
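
The 32-bit _sha256_init_4way hunk above makes the same kind of change in miniature: the output pointer is now held in the caller-saved %edx instead of the callee-saved %edi, so the pushl/popl %edi pair disappears. What the function stores appears to be the standard initial SHA-256 state replicated across four lanes; below is a reference sketch in C, with the layout inferred from the 16-byte-per-word stores above and every name chosen here for illustration rather than taken from this source.

#include <stdint.h>

/* Reference sketch of the 4-way init, assuming the layout implied by the
 * stores above: eight 16-byte rows, each holding one SHA-256 state word
 * replicated across four 32-bit lanes.  The constants are the FIPS 180-4
 * initial hash values. */
static const uint32_t sha256_h[8] = {
    0x6a09e667, 0xbb67ae85, 0x3c6ef372, 0xa54ff53a,
    0x510e527f, 0x9b05688c, 0x1f83d9ab, 0x5be0cd19
};

static void sha256_init_4way_ref(uint32_t state[8 * 4])
{
    for (int i = 0; i < 8; i++)             /* one row per state word...      */
        for (int lane = 0; lane < 4; lane++)
            state[4 * i + lane] = sha256_h[i];  /* ...broadcast to four lanes */
}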